From f2c27dafa2950510ba767cd59937ddf5d1974937 Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Mon, 7 Nov 2016 12:13:04 -0800 Subject: [PATCH] HalfFloat neon armv7 fix for destination pointer. Improved unittests detect differences in arm64 rounding. TEST=util/android/test_runner.py gtest -s libyuv_unittest -t 7200 --verbose --release --gtest_filter=*Half* -a "--libyuv_width=640 --libyuv_height=360" BUG=libyuv:560 R=wangcheng@google.com Review URL: https://codereview.chromium.org/2478313004 . --- README.chromium | 2 +- include/libyuv/version.h | 2 +- source/row_neon.cc | 16 ++++---- source/row_neon64.cc | 3 +- unit_test/planar_test.cc | 80 ++++++++++++++++++++++++++++++---------- 5 files changed, 73 insertions(+), 30 deletions(-) diff --git a/README.chromium b/README.chromium index aed82de2b..aaadf1e8d 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1633 +Version: 1634 License: BSD License File: LICENSE diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 00e11d709..1c6414a86 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1633 +#define LIBYUV_VERSION 1634 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/row_neon.cc b/source/row_neon.cc index 9385b275d..c31fdcd6f 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -2742,16 +2742,16 @@ void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float, int width) { MEMACCESS(0) "vld1.8 {q1}, [%0]! 
\n" // load 8 shorts "subs %2, %2, #8 \n" // 8 pixels per loop - "vmovl.u8 q2, d2 \n" // 8 int's - "vmovl.u8 q3, d3 \n" + "vmovl.u16 q2, d2 \n" // 8 int's + "vmovl.u16 q3, d3 \n" "vcvt.f32.u32 q2, q2 \n" // 8 floats "vcvt.f32.u32 q3, q3 \n" "vmul.f32 q2, q2, q0 \n" // adjust exponent "vmul.f32 q3, q3, q0 \n" - "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat + "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat "vqshrn.u32 d3, q3, #13 \n" MEMACCESS(1) - "vst1.8 {q1}, [%0]! \n" + "vst1.8 {q1}, [%1]! \n" "bgt 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 @@ -2770,16 +2770,16 @@ void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width) { MEMACCESS(0) "vld1.8 {q1}, [%0]! \n" // load 8 shorts "subs %2, %2, #8 \n" // 8 pixels per loop - "vmovl.u8 q2, d2 \n" // 8 int's - "vmovl.u8 q3, d3 \n" + "vmovl.u16 q2, d2 \n" // 8 int's + "vmovl.u16 q3, d3 \n" "vcvt.f32.u32 q2, q2 \n" // 8 floats "vcvt.f32.u32 q3, q3 \n" "vmul.f32 q2, q2, q0 \n" // adjust exponent "vmul.f32 q3, q3, q0 \n" - "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat + "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat "vqshrn.u32 d3, q3, #13 \n" MEMACCESS(1) - "vst1.8 {q1}, [%0]! \n" + "vst1.8 {q1}, [%1]! \n" "bgt 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 diff --git a/source/row_neon64.cc b/source/row_neon64.cc index 3ec6bab8c..4ed4e61de 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -2711,6 +2711,7 @@ void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, ); } +// Caveat - rounds float to half float whereas scaling version truncates. 
void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float, int width) { asm volatile ( "1: \n" @@ -2721,7 +2722,7 @@ void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float, int width) { "uxtl2 v3.4s, v1.8h \n" "scvtf v2.4s, v2.4s \n" // 8 floats "scvtf v3.4s, v3.4s \n" - "fcvtn v1.4h, v2.4s \n" // 8 floatsgit + "fcvtn v1.4h, v2.4s \n" // 8 half floats "fcvtn2 v1.8h, v3.4s \n" MEMACCESS(1) "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc index c017c26a3..5a840898d 100644 --- a/unit_test/planar_test.cc +++ b/unit_test/planar_test.cc @@ -2120,26 +2120,61 @@ int TestHalfFloatPlane(int benchmark_width, int benchmark_height, } opt_time = (get_time() - opt_time) / benchmark_iterations; - int diff = 0; - for (i = 0; i < y_plane_size; ++i) { - diff = dst_c[i] - dst_opt[i]; - if (diff) break; + int max_diff = 0; + for (i = 0; i < y_plane_size / 2; ++i) { + int abs_diff = + abs(static_cast(reinterpret_cast(dst_c)[i]) - + static_cast(reinterpret_cast(dst_opt)[i])); + if (abs_diff > max_diff) { + max_diff = abs_diff; + } } free_aligned_buffer_page_end(orig_y); - return diff; + return max_diff; } +#if defined(__arm__) +static void EnableFlushDenormalToZero(void) { + uint32_t cw; + __asm__ __volatile__ ( + "vmrs %0, fpscr \n" + "orr %0, %0, #0x1000000 \n" + "vmsr fpscr, %0 \n" + : "=r"(cw) :: "memory"); +} +#endif + // 5 bit exponent with bias of 15 will underflow to a denormal if scale causes // exponent to be less than 0. 15 - log2(65536) = -1/ This shouldnt normally // happen since scale is 1/(1<