From 451af5e922e026c266d25abc92e7519acfc9a4c5 Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Fri, 21 Oct 2016 14:30:03 -0700 Subject: [PATCH] scale by 1 for neon implemented void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float, int width) { asm volatile ( "1: \n" MEMACCESS(0) "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts "subs %w2, %w2, #8 \n" // 8 pixels per loop "uxtl v2.4s, v1.4h \n" // 8 int's "uxtl2 v1.4s, v1.8h \n" "scvtf v2.4s, v2.4s \n" // 8 floats "scvtf v1.4s, v1.4s \n" "fcvtn v4.4h, v2.4s \n" // 8 floatsgit "fcvtn2 v4.8h, v1.4s \n" MEMACCESS(1) "st1 {v4.16b}, [%1], #16 \n" // store 8 shorts "b.gt 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 : : "cc", "memory", "v1", "v2", "v4" ); } void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width) { asm volatile ( "1: \n" MEMACCESS(0) "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts "subs %w2, %w2, #8 \n" // 8 pixels per loop "uxtl v2.4s, v1.4h \n" // 8 int's "uxtl2 v1.4s, v1.8h \n" "scvtf v2.4s, v2.4s \n" // 8 floats "scvtf v1.4s, v1.4s \n" "fmul v2.4s, v2.4s, %3.s[0] \n" // adjust exponent "fmul v1.4s, v1.4s, %3.s[0] \n" "uqshrn v4.4h, v2.4s, #13 \n" // isolate halffloat "uqshrn2 v4.8h, v1.4s, #13 \n" MEMACCESS(1) "st1 {v4.16b}, [%1], #16 \n" // store 8 shorts "b.gt 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 : "w"(scale * 1.9259299444e-34f) // %3 : "cc", "memory", "v1", "v2", "v4" ); } TEST=LibYUVPlanarTest.TestHalfFloatPlane_One BUG=libyuv:560 R=hubbe@chromium.org Review URL: https://codereview.chromium.org/2430313008 . 
--- README.chromium | 2 +- include/libyuv/row.h | 6 +++++ include/libyuv/version.h | 2 +- source/planar_functions.cc | 10 ++++--- source/row_any.cc | 4 ++- source/row_gcc.cc | 30 +++++++++++++++++++++ source/row_neon64.cc | 49 +++++++++++++++++++++++++++++++++ unit_test/planar_test.cc | 55 +++++++++++++++++++++++++++++--------- 8 files changed, 138 insertions(+), 20 deletions(-) diff --git a/README.chromium b/README.chromium index 6e66021d9..c14ba8429 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1629 +Version: 1630 License: BSD License File: LICENSE diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 96861befb..601e05acc 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -1959,9 +1959,15 @@ void HalfFloatRow_Any_AVX2(const uint16* src, uint16* dst, float scale, void HalfFloatRow_F16C(const uint16* src, uint16* dst, float scale, int width); void HalfFloatRow_Any_F16C(const uint16* src, uint16* dst, float scale, int width); +void HalfFloat1Row_F16C(const uint16* src, uint16* dst, float scale, int width); +void HalfFloat1Row_Any_F16C(const uint16* src, uint16* dst, float scale, + int width); void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width); void HalfFloatRow_Any_NEON(const uint16* src, uint16* dst, float scale, int width); +void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float scale, int width); +void HalfFloat1Row_Any_NEON(const uint16* src, uint16* dst, float scale, + int width); void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width, const uint8* luma, uint32 lumacoeff); diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 75406bd7f..17a9c6660 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1629 +#define LIBYUV_VERSION 1630 #endif // 
INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/planar_functions.cc b/source/planar_functions.cc index 143ae869d..7a10a69f7 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -2579,17 +2579,19 @@ int HalfFloatPlane(const uint16* src_y, int src_stride_y, #endif #if defined(HAS_HALFFLOATROW_F16C) if (TestCpuFlag(kCpuHasAVX2) && TestCpuFlag(kCpuHasF16C)) { - HalfFloatRow = HalfFloatRow_Any_F16C; + HalfFloatRow = (scale == 1.0f) ? + HalfFloat1Row_Any_F16C : HalfFloatRow_Any_F16C; if (IS_ALIGNED(width, 16)) { - HalfFloatRow = HalfFloatRow_F16C; + HalfFloatRow = (scale == 1.0f) ? HalfFloat1Row_F16C : HalfFloatRow_F16C; } } #endif #if defined(HAS_HALFFLOATROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - HalfFloatRow = HalfFloatRow_Any_NEON; + HalfFloatRow = (scale == 1.0f) ? + HalfFloat1Row_Any_NEON : HalfFloatRow_Any_NEON; if (IS_ALIGNED(width, 8)) { - HalfFloatRow = HalfFloatRow_NEON; + HalfFloatRow = (scale == 1.0f) ? HalfFloat1Row_NEON : HalfFloatRow_NEON; } } #endif diff --git a/source/row_any.cc b/source/row_any.cc index ec0aa21d7..07e606c6e 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -577,16 +577,18 @@ ANY11P(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, const uint8*, 4, 4, 3) } #ifdef HAS_HALFFLOATROW_SSE2 -ANY11P16(HalfFloatRow_Any_SSE2, HalfFloatRow_SSE2, float, 1, 1, 15) +ANY11P16(HalfFloatRow_Any_SSE2, HalfFloatRow_SSE2, float, 1, 1, 7) #endif #ifdef HAS_HALFFLOATROW_AVX2 ANY11P16(HalfFloatRow_Any_AVX2, HalfFloatRow_AVX2, float, 1, 1, 15) #endif #ifdef HAS_HALFFLOATROW_F16C ANY11P16(HalfFloatRow_Any_F16C, HalfFloatRow_F16C, float, 1, 1, 15) +ANY11P16(HalfFloat1Row_Any_F16C, HalfFloat1Row_F16C, float, 1, 1, 15) #endif #ifdef HAS_HALFFLOATROW_NEON ANY11P16(HalfFloatRow_Any_NEON, HalfFloatRow_NEON, float, 1, 1, 7) +ANY11P16(HalfFloat1Row_Any_NEON, HalfFloat1Row_NEON, float, 1, 1, 7) #endif #undef ANY11P16 diff --git a/source/row_gcc.cc b/source/row_gcc.cc index 8020108d0..bc15c7719 100644 --- a/source/row_gcc.cc +++ 
b/source/row_gcc.cc @@ -5410,6 +5410,36 @@ void HalfFloatRow_F16C(const uint16* src, uint16* dst, float scale, int width) { } #endif // HAS_HALFFLOATROW_F16C +#ifdef HAS_HALFFLOATROW_F16C +void HalfFloat1Row_F16C(const uint16* src, uint16* dst, float, int width) { + asm volatile ( + // 16 pixel loop. + LABELALIGN + "1: \n" + "vpmovzxwd " MEMACCESS(0) ",%%ymm2 \n" // 16 shorts -> 16 ints + "vpmovzxwd " MEMACCESS2(0x10,0) ",%%ymm3 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "vcvtdq2ps %%ymm2,%%ymm2 \n" + "vcvtdq2ps %%ymm3,%%ymm3 \n" + "vcvtps2ph $3, %%ymm2, %%xmm2 \n" + "vcvtps2ph $3, %%ymm3, %%xmm3 \n" + "vmovdqu %%xmm2," MEMACCESS(1) " \n" + "vmovdqu %%xmm3," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", + "xmm2", "xmm3" + ); +} +#endif // HAS_HALFFLOATROW_F16C + #ifdef HAS_ARGBCOLORTABLEROW_X86 // Tranform ARGB pixels with color table. void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, diff --git a/source/row_neon64.cc b/source/row_neon64.cc index 3d122680e..9508d4656 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -2711,6 +2711,55 @@ void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, ); } + +void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float, int width) { + asm volatile ( + "1: \n" + MEMACCESS(0) + "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts + "subs %w2, %w2, #8 \n" // 8 pixels per loop + "uxtl v2.4s, v1.4h \n" // 8 int's + "uxtl2 v1.4s, v1.8h \n" + "scvtf v2.4s, v2.4s \n" // 8 floats + "scvtf v1.4s, v1.4s \n" + "fcvtn v4.4h, v2.4s \n" // 8 half floats + "fcvtn2 v4.8h, v1.4s \n" + MEMACCESS(1) + "st1 {v4.16b}, [%1], #16 \n" // store 8 shorts + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v1", "v2", "v4" + ); +} + +void HalfFloatRow_NEON2(const uint16* src, uint16* dst, float scale, int width) { + asm volatile ( + 
"1: \n" + MEMACCESS(0) + "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts + "subs %w2, %w2, #8 \n" // 8 pixels per loop + "uxtl v2.4s, v1.4h \n" // 8 int's + "uxtl2 v1.4s, v1.8h \n" + "scvtf v2.4s, v2.4s \n" // 8 floats + "scvtf v1.4s, v1.4s \n" + "fmul v2.4s, v2.4s, %3.s[0] \n" // adjust exponent + "fmul v1.4s, v1.4s, %3.s[0] \n" + "uqshrn v4.4h, v2.4s, #13 \n" // isolate halffloat + "uqshrn2 v4.8h, v1.4s, #13 \n" + MEMACCESS(1) + "st1 {v4.16b}, [%1], #16 \n" // store 8 shorts + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "w"(scale * 1.9259299444e-34f) // %3 + : "cc", "memory", "v1", "v2", "v4" + ); +} + void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width) { asm volatile ( "1: \n" diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc index a2eb1faac..c017c26a3 100644 --- a/unit_test/planar_test.cc +++ b/unit_test/planar_test.cc @@ -2084,17 +2084,22 @@ TEST_F(LibYUVPlanarTest, TestARGBPolynomial) { int TestHalfFloatPlane(int benchmark_width, int benchmark_height, int benchmark_iterations, int disable_cpu_flags, int benchmark_cpu_info, - float scale) { + float scale, int mask) { int i, j; const int y_plane_size = benchmark_width * benchmark_height * 2; - align_buffer_page_end(orig_y, y_plane_size); - align_buffer_page_end(dst_c, y_plane_size); - align_buffer_page_end(dst_opt, y_plane_size); + align_buffer_page_end(orig_y, y_plane_size * 3); + uint8* dst_opt = orig_y + y_plane_size; + uint8* dst_c = orig_y + y_plane_size * 2; + MemRandomize(orig_y, y_plane_size); memset(dst_c, 0, y_plane_size); memset(dst_opt, 1, y_plane_size); + for (i = 0; i < y_plane_size / 2; ++i) { + reinterpret_cast(orig_y)[i] = static_cast(i & mask); + } + // Disable all optimizations. 
MaskCpuFlags(disable_cpu_flags); double c_time = get_time(); @@ -2122,38 +2127,62 @@ int TestHalfFloatPlane(int benchmark_width, int benchmark_height, } free_aligned_buffer_page_end(orig_y); - free_aligned_buffer_page_end(dst_c); - free_aligned_buffer_page_end(dst_opt); return diff; } // 5 bit exponent with bias of 15 will underflow to a denormal if scale causes // exponent to be less than 0. 15 - log2(65536) = -1. This shouldn't normally // happen since scale is 1/(1<