From f2c27dafa2950510ba767cd59937ddf5d1974937 Mon Sep 17 00:00:00 2001
From: Frank Barchard <fbarchard@google.com>
Date: Mon, 7 Nov 2016 12:13:04 -0800
Subject: [PATCH] HalfFloat neon armv7 fix for destination pointer.

Improved unittests detect differences in ARM64 rounding.

TEST=util/android/test_runner.py gtest -s libyuv_unittest -t 7200 --verbose --release --gtest_filter=*Half* -a "--libyuv_width=640 --libyuv_height=360"
BUG=libyuv:560
R=wangcheng@google.com

Review URL: https://codereview.chromium.org/2478313004 .
---
 README.chromium          |  2 +-
 include/libyuv/version.h |  2 +-
 source/row_neon.cc       | 16 ++++----
 source/row_neon64.cc     |  3 +-
 unit_test/planar_test.cc | 80 ++++++++++++++++++++++++++++++----------
 5 files changed, 73 insertions(+), 30 deletions(-)

diff --git a/README.chromium b/README.chromium
index aed82de2b..aaadf1e8d 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1633
+Version: 1634
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 00e11d709..1c6414a86 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 1633
+#define LIBYUV_VERSION 1634
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_
diff --git a/source/row_neon.cc b/source/row_neon.cc
index 9385b275d..c31fdcd6f 100644
--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@@ -2742,16 +2742,16 @@ void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float, int width) {
     MEMACCESS(0)
     "vld1.8     {q1}, [%0]!                    \n"  // load 8 shorts
     "subs       %2, %2, #8                     \n"  // 8 pixels per loop
-    "vmovl.u8   q2, d2                         \n"  // 8 int's
-    "vmovl.u8   q3, d3                         \n"
+    "vmovl.u16  q2, d2                         \n"  // 8 int's
+    "vmovl.u16  q3, d3                         \n"
     "vcvt.f32.u32  q2, q2                      \n"  // 8 floats
     "vcvt.f32.u32  q3, q3                      \n"
     "vmul.f32   q2, q2, q0                     \n"  // adjust exponent
     "vmul.f32   q3, q3, q0                     \n"
-    "vqshrn.u32 d2, q2, #13                    \n"   // isolate halffloat
+    "vqshrn.u32 d2, q2, #13                    \n"  // isolate halffloat
     "vqshrn.u32 d3, q3, #13                    \n"
     MEMACCESS(1)
-    "vst1.8     {q1}, [%0]!                    \n"
+    "vst1.8     {q1}, [%1]!                    \n"
     "bgt        1b                             \n"
   : "+r"(src),    // %0
     "+r"(dst),    // %1
@@ -2770,16 +2770,16 @@ void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width) {
     MEMACCESS(0)
     "vld1.8     {q1}, [%0]!                    \n"  // load 8 shorts
     "subs       %2, %2, #8                     \n"  // 8 pixels per loop
-    "vmovl.u8   q2, d2                         \n"  // 8 int's
-    "vmovl.u8   q3, d3                         \n"
+    "vmovl.u16  q2, d2                         \n"  // 8 int's
+    "vmovl.u16  q3, d3                         \n"
     "vcvt.f32.u32  q2, q2                      \n"  // 8 floats
     "vcvt.f32.u32  q3, q3                      \n"
     "vmul.f32   q2, q2, q0                     \n"  // adjust exponent
     "vmul.f32   q3, q3, q0                     \n"
-    "vqshrn.u32 d2, q2, #13                    \n"   // isolate halffloat
+    "vqshrn.u32 d2, q2, #13                    \n"  // isolate halffloat
     "vqshrn.u32 d3, q3, #13                    \n"
     MEMACCESS(1)
-    "vst1.8     {q1}, [%0]!                    \n"
+    "vst1.8     {q1}, [%1]!                    \n"
     "bgt        1b                             \n"
   : "+r"(src),    // %0
     "+r"(dst),    // %1
diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index 3ec6bab8c..4ed4e61de 100644
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -2711,6 +2711,7 @@ void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
   );
 }
 
+// Caveat - rounds float to half float whereas scaling version truncates.
 void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float, int width) {
   asm volatile (
   "1:                                          \n"
@@ -2721,7 +2722,7 @@ void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float, int width) {
     "uxtl2      v3.4s, v1.8h                   \n"
     "scvtf      v2.4s, v2.4s                   \n"  // 8 floats
     "scvtf      v3.4s, v3.4s                   \n"
-    "fcvtn      v1.4h, v2.4s                   \n"  // 8 floatsgit
+    "fcvtn      v1.4h, v2.4s                   \n"  // 8 half floats
     "fcvtn2     v1.8h, v3.4s                   \n"
    MEMACCESS(1)
     "st1        {v1.16b}, [%1], #16            \n"  // store 8 shorts
diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc
index c017c26a3..5a840898d 100644
--- a/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
@@ -2120,26 +2120,61 @@ int TestHalfFloatPlane(int benchmark_width, int benchmark_height,
   }
   opt_time = (get_time() - opt_time) / benchmark_iterations;
 
-  int diff = 0;
-  for (i = 0; i < y_plane_size; ++i) {
-    diff = dst_c[i] - dst_opt[i];
-    if (diff) break;
+  int max_diff = 0;
+  for (i = 0; i < y_plane_size / 2; ++i) {
+    int abs_diff =
+        abs(static_cast<int>(reinterpret_cast<uint16*>(dst_c)[i]) -
+            static_cast<int>(reinterpret_cast<uint16*>(dst_opt)[i]));
+    if (abs_diff > max_diff) {
+      max_diff = abs_diff;
+    }
   }
 
   free_aligned_buffer_page_end(orig_y);
-  return diff;
+  return max_diff;
 }
 
+#if defined(__arm__)
+static void EnableFlushDenormalToZero(void) {
+  uint32_t cw;
+  __asm__ __volatile__ (
+    "vmrs   %0, fpscr         \n"
+    "orr    %0, %0, #0x1000000        \n"
+    "vmsr   fpscr, %0         \n"
+    : "=r"(cw) :: "memory");
+}
+#endif
+
 // 5 bit exponent with bias of 15 will underflow to a denormal if scale causes
 // exponent to be less than 0.  15 - log2(65536) = -1/  This shouldnt normally
 // happen since scale is 1/(1<<bits) where bits is 9, 10 or 12.
-#define MAXHALFDIFF 0
+
 TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_16bit_denormal) {
+// 32 bit arm rounding on denormal case is off by 1 compared to C.
+#if defined(__arm__)
+  EnableFlushDenormalToZero();
+#endif
   int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
                                 benchmark_iterations_,
                                 disable_cpu_flags_, benchmark_cpu_info_,
                                 1.0f / 65536.0f, 65535);
-  EXPECT_LE(diff, MAXHALFDIFF);
+  EXPECT_EQ(0, diff);
+}
+
+TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_16bit_One) {
+  int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
+                                benchmark_iterations_,
+                                disable_cpu_flags_, benchmark_cpu_info_,
+                                1.0f, 65535);
+  EXPECT_LE(diff, 1);
+}
+
+TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_16bit_Opt) {
+  int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
+                                benchmark_iterations_,
+                                disable_cpu_flags_, benchmark_cpu_info_,
+                                1.0f / 4096.0f, 65535);
+  EXPECT_EQ(0, diff);
 }
 
 TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_10bit_Opt) {
@@ -2147,7 +2182,7 @@ TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_10bit_Opt) {
                                 benchmark_iterations_,
                                 disable_cpu_flags_, benchmark_cpu_info_,
                                 1.0f / 1024.0f, 1023);
-  EXPECT_LE(diff, MAXHALFDIFF);
+  EXPECT_EQ(0, diff);
 }
 
 TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_9bit_Opt) {
@@ -2155,7 +2190,7 @@ TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_9bit_Opt) {
                                 benchmark_iterations_,
                                 disable_cpu_flags_, benchmark_cpu_info_,
                                 1.0f / 512.0f, 511);
-  EXPECT_LE(diff, MAXHALFDIFF);
+  EXPECT_EQ(0, diff);
 }
 
 TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_Opt) {
@@ -2163,15 +2198,7 @@ TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_Opt) {
                                 benchmark_iterations_,
                                 disable_cpu_flags_, benchmark_cpu_info_,
                                 1.0f / 4096.0f, 4095);
-  EXPECT_LE(diff, MAXHALFDIFF);
-}
-
-TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_One) {
-  int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
-                                benchmark_iterations_,
-                                disable_cpu_flags_, benchmark_cpu_info_,
-                                1.0f, 4095);
-  EXPECT_LE(diff, MAXHALFDIFF);
+  EXPECT_EQ(0, diff);
 }
 
 TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_Offby1) {
@@ -2179,9 +2206,24 @@ TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_Offby1) {
                                 benchmark_iterations_,
                                 disable_cpu_flags_, benchmark_cpu_info_,
                                 1.0f / 4095.0f, 4095);
-  EXPECT_LE(diff, MAXHALFDIFF);
+  EXPECT_EQ(0, diff);
 }
 
+TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_One) {
+  int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
+                                benchmark_iterations_,
+                                disable_cpu_flags_, benchmark_cpu_info_,
+                                1.0f, 2047);
+  EXPECT_EQ(0, diff);
+}
+
+TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_12bit_One) {
+  int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
+                                benchmark_iterations_,
+                                disable_cpu_flags_, benchmark_cpu_info_,
+                                1.0f, 4095);
+  EXPECT_LE(diff, 1);
+}
 
 TEST_F(LibYUVPlanarTest, TestARGBLumaColorTable) {
   SIMD_ALIGNED(uint8 orig_pixels[1280][4]);