scale by 1 for neon implemented

void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float, int width) { asm volatile ( "1: \n" MEMACCESS(0) "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts "subs %w2, %w2, #8 \n" // 8 pixels per loop "uxtl v2.4s, v1.4h \n" // 8 int's "uxtl2 v1.4s, v1.8h \n" "scvtf v2.4s, v2.4s \n" // 8 floats "scvtf v1.4s, v1.4s \n" "fcvtn v4.4h, v2.4s \n" // 8 half floats "fcvtn2 v4.8h, v1.4s \n" MEMACCESS(1) "st1 {v4.16b}, [%1], #16 \n" // store 8 shorts "b.gt 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 : : "cc", "memory", "v1", "v2", "v4" ); } void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width) { asm volatile ( "1: \n" MEMACCESS(0) "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts "subs %w2, %w2, #8 \n" // 8 pixels per loop "uxtl v2.4s, v1.4h \n" // 8 int's "uxtl2 v1.4s, v1.8h \n" "scvtf v2.4s, v2.4s \n" // 8 floats "scvtf v1.4s, v1.4s \n" "fmul v2.4s, v2.4s, %3.s[0] \n" // adjust exponent "fmul v1.4s, v1.4s, %3.s[0] \n" "uqshrn v4.4h, v2.4s, #13 \n" // isolate halffloat "uqshrn2 v4.8h, v1.4s, #13 \n" MEMACCESS(1) "st1 {v4.16b}, [%1], #16 \n" // store 8 shorts "b.gt 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 : "w"(scale * 1.9259299444e-34f) // %3 : "cc", "memory", "v1", "v2", "v4" ); } TEST=LibYUVPlanarTest.TestHalfFloatPlane_One BUG=libyuv:560 R=hubbe@chromium.org Review URL: https://codereview.chromium.org/2430313008 .
2026-01-01 03:12:16 +08:00 · 2016-10-21 14:30:03 -07:00 · 2016-10-21 14:30:03 -07:00 · 451af5e922
commit 451af5e922
parent 550cf829fb
8 changed files with 138 additions and 20 deletions
--- a/README.chromium
+++ b/README.chromium
@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1629
+Version: 1630
 License: BSD
 License File: LICENSE

--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@ -1959,9 +1959,15 @@ void HalfFloatRow_Any_AVX2(const uint16* src, uint16* dst, float scale,
 void HalfFloatRow_F16C(const uint16* src, uint16* dst, float scale, int width);
 void HalfFloatRow_Any_F16C(const uint16* src, uint16* dst, float scale,
                           int width);
+void HalfFloat1Row_F16C(const uint16* src, uint16* dst, float scale, int width);
+void HalfFloat1Row_Any_F16C(const uint16* src, uint16* dst, float scale,
+                            int width);
 void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width);
 void HalfFloatRow_Any_NEON(const uint16* src, uint16* dst, float scale,
                           int width);
+void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float scale, int width);
+void HalfFloat1Row_Any_NEON(const uint16* src, uint16* dst, float scale,
+                            int width);

 void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width,
                             const uint8* luma, uint32 lumacoeff);
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_

-#define LIBYUV_VERSION 1629
+#define LIBYUV_VERSION 1630

 #endif  // INCLUDE_LIBYUV_VERSION_H_
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@ -2579,17 +2579,19 @@ int HalfFloatPlane(const uint16* src_y, int src_stride_y,
 #endif
 #if defined(HAS_HALFFLOATROW_F16C)
  if (TestCpuFlag(kCpuHasAVX2) && TestCpuFlag(kCpuHasF16C)) {
-    HalfFloatRow = HalfFloatRow_Any_F16C;
+    HalfFloatRow = (scale == 1.0f) ?
+      HalfFloat1Row_Any_F16C : HalfFloatRow_Any_F16C;
    if (IS_ALIGNED(width, 16)) {
-      HalfFloatRow = HalfFloatRow_F16C;
+      HalfFloatRow = (scale == 1.0f) ? HalfFloat1Row_F16C : HalfFloatRow_F16C;
    }
  }
 #endif
 #if defined(HAS_HALFFLOATROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
-    HalfFloatRow = HalfFloatRow_Any_NEON;
+    HalfFloatRow = (scale == 1.0f) ?
+      HalfFloat1Row_Any_NEON : HalfFloatRow_Any_NEON;
    if (IS_ALIGNED(width, 8)) {
-      HalfFloatRow = HalfFloatRow_NEON;
+      HalfFloatRow = (scale == 1.0f) ? HalfFloat1Row_NEON : HalfFloatRow_NEON;
    }
  }
 #endif
--- a/source/row_any.cc
+++ b/source/row_any.cc
@ -577,16 +577,18 @@ ANY11P(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, const uint8*, 4, 4, 3)
    }

 #ifdef HAS_HALFFLOATROW_SSE2
-ANY11P16(HalfFloatRow_Any_SSE2, HalfFloatRow_SSE2, float, 1, 1, 15)
+ANY11P16(HalfFloatRow_Any_SSE2, HalfFloatRow_SSE2, float, 1, 1, 7)
 #endif
 #ifdef HAS_HALFFLOATROW_AVX2
 ANY11P16(HalfFloatRow_Any_AVX2, HalfFloatRow_AVX2, float, 1, 1, 15)
 #endif
 #ifdef HAS_HALFFLOATROW_F16C
 ANY11P16(HalfFloatRow_Any_F16C, HalfFloatRow_F16C, float, 1, 1, 15)
+ANY11P16(HalfFloat1Row_Any_F16C, HalfFloat1Row_F16C, float, 1, 1, 15)
 #endif
 #ifdef HAS_HALFFLOATROW_NEON
 ANY11P16(HalfFloatRow_Any_NEON, HalfFloatRow_NEON, float, 1, 1, 7)
+ANY11P16(HalfFloat1Row_Any_NEON, HalfFloat1Row_NEON, float, 1, 1, 7)
 #endif
 #undef ANY11P16

--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@ -5410,6 +5410,36 @@ void HalfFloatRow_F16C(const uint16* src, uint16* dst, float scale, int width) {
 }
 #endif  // HAS_HALFFLOATROW_F16C

+#ifdef HAS_HALFFLOATROW_F16C
+void HalfFloat1Row_F16C(const uint16* src, uint16* dst, float, int width) {
+  asm volatile (  // the float (scale) argument is unnamed and never used here
+    // 16 pixel loop: zero-extend uint16 to int32, convert to float, then to half float.
+    LABELALIGN
+  "1:                                          \n"
+    "vpmovzxwd   " MEMACCESS(0) ",%%ymm2       \n"  // 16 shorts -> 16 ints
+    "vpmovzxwd   " MEMACCESS2(0x10,0) ",%%ymm3 \n"  // second half, at offset 0x10 from src
+    "lea         " MEMLEA(0x20,0) ",%0         \n"  // src advances by 0x20 bytes
+    "vcvtdq2ps   %%ymm2,%%ymm2                 \n"  // 16 ints -> 16 floats
+    "vcvtdq2ps   %%ymm3,%%ymm3                 \n"
+    "vcvtps2ph   $3, %%ymm2, %%xmm2            \n"  // 16 floats -> 16 half floats
+    "vcvtps2ph   $3, %%ymm3, %%xmm3            \n"  // NOTE(review): imm $3 should select truncation - confirm
+    "vmovdqu     %%xmm2," MEMACCESS(1) "       \n"  // store 16 half floats
+    "vmovdqu     %%xmm3," MEMACCESS2(0x10,1) " \n"
+    "lea         " MEMLEA(0x20,1) ",%1         \n"  // dst advances by 0x20 bytes
+    "sub         $0x10,%2                      \n"  // width -= 16
+    "jg          1b                            \n"  // loop while width > 0
+
+    "vzeroupper                                \n"  // issued once, after the loop exits
+  : "+r"(src),   // %0
+    "+r"(dst),   // %1
+    "+r"(width)  // %2
+  :
+  : "memory", "cc",
+    "xmm2", "xmm3"
+  );
+}
+#endif  // HAS_HALFFLOATROW_F16C
+
 #ifdef HAS_ARGBCOLORTABLEROW_X86
 // Transform ARGB pixels with color table.
 void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@ -2711,6 +2711,55 @@ void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
  );
 }

+
+void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float, int width) {
+  asm volatile (  // uint16 -> half float without scaling; the float argument is unnamed and unused
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v1.16b}, [%0], #16            \n"  // load 8 shorts, src += 16 bytes
+    "subs       %w2, %w2, #8                   \n"  // 8 pixels per loop; sets flags for b.gt
+    "uxtl       v2.4s, v1.4h                   \n"  // low 4 shorts zero-extended to ints
+    "uxtl2      v1.4s, v1.8h                   \n"  // high 4 shorts zero-extended to ints
+    "scvtf      v2.4s, v2.4s                   \n"  // 8 ints -> 8 floats
+    "scvtf      v1.4s, v1.4s                   \n"
+    "fcvtn      v4.4h, v2.4s                   \n"  // 8 floats -> 8 half floats
+    "fcvtn2     v4.8h, v1.4s                   \n"  // NOTE(review): rounding mode is not set here; confirm it matches the C and F16C paths
+   MEMACCESS(1)
+    "st1        {v4.16b}, [%1], #16            \n"  // store 8 shorts, dst += 16 bytes
+    "b.gt       1b                             \n"  // repeat while width > 0
+  : "+r"(src),    // %0
+    "+r"(dst),    // %1
+    "+r"(width)   // %2
+  :
+  : "cc", "memory", "v1", "v2", "v4"
+  );
+}
+
+void HalfFloatRow_NEON2(const uint16* src, uint16* dst, float scale, int width) {
+  asm volatile (  // NOTE(review): no caller or declaration of HalfFloatRow_NEON2 is visible in this change - confirm it is needed
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v1.16b}, [%0], #16            \n"  // load 8 shorts, src += 16 bytes
+    "subs       %w2, %w2, #8                   \n"  // 8 pixels per loop; sets flags for b.gt
+    "uxtl       v2.4s, v1.4h                   \n"  // low 4 shorts zero-extended to ints
+    "uxtl2      v1.4s, v1.8h                   \n"  // high 4 shorts zero-extended to ints
+    "scvtf      v2.4s, v2.4s                   \n"  // 8 ints -> 8 floats
+    "scvtf      v1.4s, v1.4s                   \n"
+    "fmul       v2.4s, v2.4s, %3.s[0]          \n"  // multiply by lane 0 of %3 (scale with exponent adjustment)
+    "fmul       v1.4s, v1.4s, %3.s[0]          \n"
+    "uqshrn     v4.4h, v2.4s, #13              \n"  // shift float bits right by 13 and narrow to isolate halffloat
+    "uqshrn2    v4.8h, v1.4s, #13              \n"
+   MEMACCESS(1)
+    "st1        {v4.16b}, [%1], #16            \n"  // store 8 shorts, dst += 16 bytes
+    "b.gt       1b                             \n"  // repeat while width > 0
+  : "+r"(src),    // %0
+    "+r"(dst),    // %1
+    "+r"(width)   // %2
+  : "w"(scale * 1.9259299444e-34f)    // %3: scale times ~2^-112, which adjusts the float exponent for the shift above
+  : "cc", "memory", "v1", "v2", "v4"
+  );
+}
+
 void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width) {
  asm volatile (
  "1:                                          \n"
--- a/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
@ -2084,17 +2084,22 @@ TEST_F(LibYUVPlanarTest, TestARGBPolynomial) {
 int TestHalfFloatPlane(int benchmark_width, int benchmark_height,
                       int benchmark_iterations,
                       int disable_cpu_flags, int benchmark_cpu_info,
-                       float scale) {
+                       float scale, int mask) {
  int i, j;
  const int y_plane_size = benchmark_width * benchmark_height * 2;

-  align_buffer_page_end(orig_y, y_plane_size);
-  align_buffer_page_end(dst_c, y_plane_size);
-  align_buffer_page_end(dst_opt, y_plane_size);
+  align_buffer_page_end(orig_y, y_plane_size * 3);
+  uint8* dst_opt = orig_y + y_plane_size;
+  uint8* dst_c = orig_y + y_plane_size * 2;
+
  MemRandomize(orig_y, y_plane_size);
  memset(dst_c, 0, y_plane_size);
  memset(dst_opt, 1, y_plane_size);

+  for (i = 0; i < y_plane_size / 2; ++i) {
+    reinterpret_cast<uint16*>(orig_y)[i] = static_cast<uint16>(i & mask);
+  }
+
  // Disable all optimizations.
  MaskCpuFlags(disable_cpu_flags);
  double c_time = get_time();
@ -2122,38 +2127,62 @@ int TestHalfFloatPlane(int benchmark_width, int benchmark_height,
  }

  free_aligned_buffer_page_end(orig_y);
-  free_aligned_buffer_page_end(dst_c);
-  free_aligned_buffer_page_end(dst_opt);
  return diff;
 }

 // 5 bit exponent with bias of 15 will underflow to a denormal if scale causes
 // exponent to be less than 0.  15 - log2(65536) = -1.  This shouldn't normally
 // happen since scale is 1/(1<<bits) where bits is 9, 10 or 12.
-TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_denormal) {
+#define MAXHALFDIFF 0
+TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_16bit_denormal) {
  int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
                                benchmark_iterations_,
                                disable_cpu_flags_, benchmark_cpu_info_,
-                                1.0f / 65536.0f);
-  EXPECT_EQ(diff, 0);
+                                1.0f / 65536.0f, 65535);
+  EXPECT_LE(diff, MAXHALFDIFF);
+}
+
+TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_10bit_Opt) {
+  int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
+                                benchmark_iterations_,
+                                disable_cpu_flags_, benchmark_cpu_info_,
+                                1.0f / 1024.0f, 1023);  // scale, then mask limiting inputs to 0..1023
+  EXPECT_LE(diff, MAXHALFDIFF);  // MAXHALFDIFF is 0, so any positive diff fails
+}
+
+TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_9bit_Opt) {
+  int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
+                                benchmark_iterations_,
+                                disable_cpu_flags_, benchmark_cpu_info_,
+                                1.0f / 512.0f, 511);  // scale, then mask limiting inputs to 0..511
+  EXPECT_LE(diff, MAXHALFDIFF);  // MAXHALFDIFF is 0, so any positive diff fails
+}

 TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_Opt) {
  int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
                                benchmark_iterations_,
                                disable_cpu_flags_, benchmark_cpu_info_,
-                                1.0f / 4096.0f);
-  EXPECT_EQ(diff, 0);
+                                1.0f / 4096.0f, 4095);
+  EXPECT_LE(diff, MAXHALFDIFF);
+}
+
+TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_One) {
+  int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
+                                benchmark_iterations_,
+                                disable_cpu_flags_, benchmark_cpu_info_,
+                                1.0f, 4095);  // scale of exactly 1.0f, then mask limiting inputs to 0..4095
+  EXPECT_LE(diff, MAXHALFDIFF);  // MAXHALFDIFF is 0, so any positive diff fails
+}

 TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_Offby1) {
  int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
                                benchmark_iterations_,
                                disable_cpu_flags_, benchmark_cpu_info_,
-                                1.0f / 1023.0f);
-  EXPECT_EQ(diff, 0);
+                                1.0f / 4095.0f, 4095);
+  EXPECT_LE(diff, MAXHALFDIFF);
 }

+
 TEST_F(LibYUVPlanarTest, TestARGBLumaColorTable) {
  SIMD_ALIGNED(uint8 orig_pixels[1280][4]);
  SIMD_ALIGNED(uint8 dst_pixels_opt[1280][4]);