From 7fc932ddd306c11493a27b65fdd042ae15be79bf Mon Sep 17 00:00:00 2001
From: Frank Barchard <fbarchard@google.com>
Date: Thu, 29 Sep 2016 15:06:30 -0700
Subject: [PATCH] Add low level support for 12 bit 420, 422 and 444 YUV video
 frame conversion.

BUG=libyuv:560,chromium:445071
TEST=untested
R=hubbe@chromium.org

Review URL: https://codereview.chromium.org/2371293002 .
---
 include/libyuv/planar_functions.h |  8 ++++++
 include/libyuv/row.h              |  8 ++++--
 source/planar_functions.cc        | 46 +++++++++++++++++++++++++++++++
 source/row_any.cc                 | 22 +++++++++++++++
 source/row_common.cc              | 19 +++++++++++++
 source/row_gcc.cc                 | 33 ++++++++++++++++++++++
 source/row_win.cc                 | 23 ++++++++--------
 unit_test/planar_test.cc          | 40 +++++++++++++++++++++++++++
 8 files changed, 185 insertions(+), 14 deletions(-)

diff --git a/include/libyuv/planar_functions.h b/include/libyuv/planar_functions.h
index 9b0f994b0..1b57b2926 100644
--- a/include/libyuv/planar_functions.h
+++ b/include/libyuv/planar_functions.h
@@ -281,6 +281,14 @@ int ARGBPolynomial(const uint8* src_argb, int src_stride_argb,
                    const float* poly,
                    int width, int height);
 
+// Convert plane of 16 bit shorts to half floats.  Strides are in bytes.
+// Source values are multiplied by scale before storing as half float.
+LIBYUV_API
+int HalfFloatPlane(const uint16* src_y, int src_stride_y,
+                   uint16* dst_y, int dst_stride_y,
+                   float scale,
+                   int width, int height);
+
 // Quantize a rectangle of ARGB. Alpha unaffected.
 // scale is a 16 bit fractional fixed point scaler between 0 and 65535.
 // interval_size should be a value between 1 and 255.
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 227156a19..0b4eec92f 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -231,6 +231,7 @@ extern "C" {
 #define HAS_YUY2TOUV422ROW_AVX2
 #define HAS_YUY2TOUVROW_AVX2
 #define HAS_YUY2TOYROW_AVX2
+#define HAS_HALFFLOATROW_AVX2
 
 // Effects:
 #define HAS_ARGBADDROW_AVX2
@@ -252,7 +253,6 @@ extern "C" {
 #define HAS_ARGBTORGB565ROW_AVX2
 #define HAS_J400TOARGBROW_AVX2
 #define HAS_RGB565TOARGBROW_AVX2
-#define HAS_SHORTTOF16ROW_AVX2
 #endif
 
 // The following are also available on x64 Visual C.
@@ -1934,8 +1934,10 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb,
                             int width);
 
 // Scale and convert to half float.
-void ShortToF16Row_C(const uint16* src, int16* dst, float scale, int width);
-void ShortToF16Row_AVX2(const uint16* src, int16* dst, float scale, int width);
+void HalfFloatRow_C(const uint16* src, uint16* dst, float scale, int width);
+void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width);
+void HalfFloatRow_Any_AVX2(const uint16* src, uint16* dst, float scale,
+                           int width);
 
 void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width,
                              const uint8* luma, uint32 lumacoeff);
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index b919e9615..20e9c66c0 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -83,6 +83,7 @@ void CopyPlane(const uint8* src_y, int src_stride_y,
 }
 
 // TODO(fbarchard): Consider support for negative height.
+// TODO(fbarchard): Consider stride measured in bytes.
 LIBYUV_API
 void CopyPlane_16(const uint16* src_y, int src_stride_y,
                   uint16* dst_y, int dst_stride_y,
@@ -2441,6 +2442,51 @@ int ARGBPolynomial(const uint8* src_argb, int src_stride_argb,
   return 0;
 }
 
+// Convert plane of 16 bit shorts to half floats.  Strides are in bytes.
+// Source values are multiplied by scale before storing as half float.
+LIBYUV_API
+int HalfFloatPlane(const uint16* src_y, int src_stride_y,
+                   uint16* dst_y, int dst_stride_y,
+                   float scale,
+                   int width, int height) {
+  int y;
+  void (*HalfFloatRow)(const uint16* src, uint16* dst, float scale, int width) =
+      HalfFloatRow_C;
+  if (!src_y || !dst_y  || width <= 0 || height == 0) {
+    return -1;  // Invalid argument.
+  }
+  src_stride_y >>= 1;  // Byte stride to stride in uint16 elements.
+  dst_stride_y >>= 1;
+  // Negative height means invert the image: read source rows bottom-up.
+  if (height < 0) {
+    height = -height;
+    src_y  = src_y  + (height - 1) * src_stride_y;
+    src_stride_y = -src_stride_y;
+  }
+  // Coalesce rows: contiguous planes are processed as one long row.
+  if (src_stride_y == width &&
+      dst_stride_y == width) {
+    width *= height;
+    height = 1;
+    src_stride_y = dst_stride_y = 0;
+  }
+#if defined(HAS_HALFFLOATROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    HalfFloatRow = HalfFloatRow_Any_AVX2;  // Handles any width.
+    if (IS_ALIGNED(width, 16)) {
+      HalfFloatRow = HalfFloatRow_AVX2;  // Width is a multiple of 16.
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    HalfFloatRow(src_y, dst_y, scale, width);
+    src_y += src_stride_y;
+    dst_y += dst_stride_y;
+  }
+  return 0;  // Success.
+}
+
 // Apply a lumacolortable to each ARGB pixel.
 LIBYUV_API
 int ARGBLumaColorTable(const uint8* src_argb, int src_stride_argb,
diff --git a/source/row_any.cc b/source/row_any.cc
index 28b6758fc..0a978e987 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -546,6 +546,28 @@ ANY11P(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, const uint8*, 4, 4, 3)
 #endif
 #undef ANY11P
 
+// Any 1 to 1 with parameter and shorts.  SBPP and BPP measure in shorts.
+#define ANY11P16(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK)                        \
+    void NAMEANY(const uint16* src_ptr, uint16* dst_ptr,                       \
+                 T shuffler, int width) {                                      \
+      SIMD_ALIGNED(uint16 temp[32 * 2]);  /* 32 shorts in, 32 shorts out */    \
+      memset(temp, 0, sizeof(temp));  /* for msan */                           \
+      int r = width & MASK;                                                    \
+      int n = width & ~MASK;                                                   \
+      if (n > 0) {                                                             \
+        ANY_SIMD(src_ptr, dst_ptr, shuffler, n);                               \
+      }                                                                        \
+      memcpy(temp, src_ptr + n * SBPP, r * SBPP * sizeof(uint16));             \
+      ANY_SIMD(temp, temp + 32, shuffler, MASK + 1);                           \
+      memcpy(dst_ptr + n * BPP, temp + 32, r * BPP * sizeof(uint16));          \
+    }
+
+#ifdef HAS_HALFFLOATROW_AVX2
+ANY11P16(HalfFloatRow_Any_AVX2, HalfFloatRow_AVX2, float, 1, 1, 15)
+#endif
+#undef ANY11P16
+
+
 // Any 1 to 1 with yuvconstants
 #define ANY11C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK)                    \
     void NAMEANY(const uint8* src_ptr, uint8* dst_ptr,                         \
diff --git a/source/row_common.cc b/source/row_common.cc
index 099ab600d..e194e6cd1 100644
--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -2333,6 +2333,25 @@ void ARGBPolynomialRow_C(const uint8* src_argb,
   }
 }
 
+// Samples assumed to be unsigned in low 9, 10 or 12 bits.  Scale factor
+// adjusts the source integer range to the half float range desired.
+//
+// The magic constant is 2^-112. Multiplying by it is the same as subtracting
+// 112 from the exponent, which is the difference in exponent bias between
+// 32-bit and 16-bit floats. After that, shifting the float's bits right by 13
+// leaves the low exponent bits and the high mantissa bits (truncated) in the
+// low 16 bits. A union is used for the bit reinterpretation instead of a
+// pointer cast, which would violate strict aliasing.
+void HalfFloatRow_C(const uint16* src, uint16* dst, float scale, int width) {
+  int i;
+  float mult = 1.9259299444e-34f * scale;
+  for (i = 0; i < width; ++i) {
+    union { float f; uint32 u; } value;
+    value.f = src[i] * mult;
+    dst[i] = (uint16)(value.u >> 13);
+  }
+}
+
 void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width,
                              const uint8* luma, uint32 lumacoeff) {
   uint32 bc = lumacoeff & 0xff;
diff --git a/source/row_gcc.cc b/source/row_gcc.cc
index 1ac7ef1aa..e4b4c5c1b 100644
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -5366,6 +5366,39 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb,
 }
 #endif  // HAS_ARGBPOLYNOMIALROW_AVX2
 
+#ifdef HAS_HALFFLOATROW_AVX2
+// Processes 16 shorts per loop; width must be a positive multiple of 16.
+void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
+  asm volatile (
+    "vbroadcastss  %3, %%ymm4                  \n"  // scale in all 8 lanes
+    // 16 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "vpmovzxwd   " MEMACCESS(0) ",%%ymm0       \n"  // 8 shorts -> 8 ints
+    "vpmovzxwd   " MEMACCESS2(0x10,0) ",%%ymm1 \n"  // 8 more
+    "lea         " MEMLEA(0x20,0) ",%0         \n"
+    "vcvtdq2ps   %%ymm0,%%ymm0                 \n"  // ints -> floats
+    "vcvtdq2ps   %%ymm1,%%ymm1                 \n"
+    "vmulps      %%ymm0,%%ymm4,%%ymm0          \n"  // apply scale
+    "vmulps      %%ymm1,%%ymm4,%%ymm1          \n"
+    "vcvtps2ph   $3, %%ymm0, %%xmm0            \n"  // to half, truncate
+    "vcvtps2ph   $3, %%ymm1, %%xmm1            \n"
+    "vmovdqu     %%xmm0," MEMACCESS(1) "       \n"
+    "vmovdqu     %%xmm1," MEMACCESS2(0x10,1) " \n"
+    "lea         " MEMLEA(0x20,1) ",%1         \n"
+    "sub         $0x10,%2                      \n"
+    "jg          1b                            \n"
+    "vzeroupper                                \n"
+  : "+r"(src),   // %0
+    "+r"(dst),   // %1
+    "+r"(width)  // %2
+  : "x"(scale)   // %3
+  : "memory", "cc",
+    "xmm0", "xmm1", "xmm4"  // every vector register written above
+  );
+}
+#endif  // HAS_HALFFLOATROW_AVX2
+
 #ifdef HAS_ARGBCOLORTABLEROW_X86
 // Tranform ARGB pixels with color table.
 void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
diff --git a/source/row_win.cc b/source/row_win.cc
index d54f05e29..baf6c940a 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -6095,13 +6095,9 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb,
 }
 #endif  // HAS_ARGBPOLYNOMIALROW_AVX2
 
-// Samples assumed to be unsigned in low 9, 10 or 12 bits.  Scale factor
-// adjust the sample range to 0 to 1 using a float multiply.
-// e.g. 9 bit scale is 1.0f / 512.0f
-// e.g. 10 bit scale is 1.0f / 1024.0f
-#ifdef HAS_SHORTTOHALFFLOAT_AVX2
+#ifdef HAS_HALFFLOATROW_AVX2
 __declspec(naked)
-void ShortToF16Row_AVX2(const uint16* src, int16* dst, float scale, int width) {
+void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
   __asm {
     mov        eax, [esp + 4]      /* src */
     mov        edx, [esp + 8]      /* dst */
@@ -6111,19 +6107,24 @@ void ShortToF16Row_AVX2(const uint16* src, int16* dst, float scale, int width) {
     // 8 pixel loop.
  convertloop:
     vpmovzxwd   ymm0, xmmword ptr [eax]  // 8 shorts -> 8 ints
-    lea         eax, [eax + 16]
+    vpmovzxwd   ymm1, xmmword ptr [eax + 16]  // 8 more shorts
+    lea         eax, [eax + 32]
     vcvtdq2ps   ymm0, ymm0        // convert 8 ints to floats
+    vcvtdq2ps   ymm1, ymm1
     vmulps      ymm0, ymm0, ymm4  // scale to normalized range 0 to 1
-    vcvtps2ph   xmm0, ymm0, 0     // float conver to 8 half floats round even
+    vmulps      ymm1, ymm1, ymm4
+    vcvtps2ph   xmm0, ymm0, 3     // float convert to 8 half floats truncate
+    vcvtps2ph   xmm1, ymm1, 3
     vmovdqu     [edx], xmm0
-    lea         edx, [edx + 16]
-    sub         ecx, 8
+    vmovdqu     [edx + 16], xmm1
+    lea         edx, [edx + 32]
+    sub         ecx, 16
     jg          convertloop
     vzeroupper
     ret
   }
 }
-#endif  // HAS_SHORTTOHALFFLOAT_AVX2
+#endif  // HAS_HALFFLOATROW_AVX2
 
 #ifdef HAS_ARGBCOLORTABLEROW_X86
 // Tranform ARGB pixels with color table.
diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc
index d30d6b2e1..722074f73 100644
--- a/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
@@ -2081,6 +2081,46 @@ TEST_F(LibYUVPlanarTest, TestARGBPolynomial) {
   }
 }
 
+TEST_F(LibYUVPlanarTest, TestHalfFloatPlane) {
+  int i, j;
+  const int y_plane_size = benchmark_width_ * benchmark_height_ * 2;
+
+  align_buffer_page_end(orig_y, y_plane_size);
+  align_buffer_page_end(dst_c, y_plane_size);
+  align_buffer_page_end(dst_opt, y_plane_size);
+  MemRandomize(orig_y, y_plane_size);
+  memset(dst_c, 0, y_plane_size);
+  memset(dst_opt, 1, y_plane_size);  // Differs from dst_c to catch no-ops.
+
+  // Disable all optimizations to time and capture the C reference output.
+  MaskCpuFlags(disable_cpu_flags_);
+  double c_time = get_time();
+  for (j = 0; j < benchmark_iterations_; j++) {
+    HalfFloatPlane((uint16*)orig_y, benchmark_width_ * 2,
+                   (uint16*)dst_c, benchmark_width_ * 2,
+                   1.0f / 4096.0f, benchmark_width_, benchmark_height_);
+  }
+  c_time = (get_time() - c_time) / benchmark_iterations_;  // Per-call time.
+
+  // Enable optimizations and rerun with the same input and scale.
+  MaskCpuFlags(benchmark_cpu_info_);
+  double opt_time = get_time();
+  for (j = 0; j < benchmark_iterations_; j++) {
+    HalfFloatPlane((uint16*)orig_y, benchmark_width_ * 2,
+                   (uint16*)dst_opt, benchmark_width_ * 2,
+                   1.0f / 4096.0f, benchmark_width_, benchmark_height_);
+  }
+  opt_time = (get_time() - opt_time) / benchmark_iterations_;
+
+  for (i = 0; i < y_plane_size; ++i) {
+    EXPECT_EQ(dst_c[i], dst_opt[i]);  // Outputs must match exactly.
+  }
+
+  free_aligned_buffer_page_end(orig_y);
+  free_aligned_buffer_page_end(dst_c);
+  free_aligned_buffer_page_end(dst_opt);
+}
+
 TEST_F(LibYUVPlanarTest, TestARGBLumaColorTable) {
   SIMD_ALIGNED(uint8 orig_pixels[1280][4]);
   SIMD_ALIGNED(uint8 dst_pixels_opt[1280][4]);