From 49d1e3b0363b28b8fbfacb680b42aa8d8db09ace Mon Sep 17 00:00:00 2001
From: Frank Barchard
Date: Thu, 9 Nov 2017 21:09:42 -0800
Subject: [PATCH] MultiplyRow_16_AVX2 for converting 10 bit YUV

When converting from lsb 10 bit formats to msb, the values need to be
shifted to the top 10 bits.  Using a multiply allows the different
numbers of bits to be copied:
// 128 = 9 bits
// 64 = 10 bits
// 16 = 12 bits
// 1 = 16 bits

Bug: libyuv:751
Test: LibYUVPlanarTest.MultiplyRow_16_Opt
Change-Id: I9cf226053a164baa14155215cb175065b1c4f169
Reviewed-on: https://chromium-review.googlesource.com/762951
Reviewed-by: richard winterton
Reviewed-by: Frank Barchard
Commit-Queue: Frank Barchard
---
 include/libyuv/row.h     | 10 ++++++++++
 source/row_common.cc     | 10 ++++++++++
 source/row_gcc.cc        | 36 +++++++++++++++++++++++++++++++++++-
 unit_test/planar_test.cc | 38 ++++++++++++++++++++++++++++++++++++++
 4 files changed, 93 insertions(+), 1 deletion(-)

diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 507509519..fc5caba4c 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -278,6 +278,7 @@ extern "C" {
     (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) && \
     (defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
 #define HAS_MERGEUVROW_16_AVX2
+#define HAS_MULTIPLYROW_16_AVX2
 #endif
 
 // The following are available on Neon platforms:
@@ -1532,6 +1533,15 @@ void MergeUVRow_16_AVX2(const uint16* src_u,
                         int scale,
                         int width);
 
+void MultiplyRow_16_AVX2(const uint16* src_y,
+                         uint16* dst_y,
+                         int scale,
+                         int width);
+void MultiplyRow_16_C(const uint16* src_y,
+                      uint16* dst_y,
+                      int scale,
+                      int width);
+
 void CopyRow_SSE2(const uint8* src, uint8* dst, int count);
 void CopyRow_AVX(const uint8* src, uint8* dst, int count);
 void CopyRow_ERMS(const uint8* src, uint8* dst, int count);
diff --git a/source/row_common.cc b/source/row_common.cc
index 8612665e5..6ffc4febb 100644
--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -1817,6 +1817,16 @@ void MergeUVRow_16_C(const uint16* src_u,
   }
 }
 
+void MultiplyRow_16_C(const uint16* src_y,
+                      uint16* dst_y,
+                      int scale,
+                      int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    dst_y[x] = src_y[x] * scale;
+  }
+}
+
 void CopyRow_C(const uint8* src, uint8* dst, int count) {
   memcpy(dst, src, count);
 }
diff --git a/source/row_gcc.cc b/source/row_gcc.cc
index ecb77983e..ca220a22b 100644
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -2758,7 +2758,6 @@ void MergeUVRow_SSE2(const uint8* src_u,
 // 64 = 10 bits
 // 16 = 12 bits
 // 1 = 16 bits
-
 #ifdef HAS_MERGEUVROW_16_AVX2
 void MergeUVRow_16_AVX2(const uint16* src_u,
                         const uint16* src_v,
@@ -2801,6 +2800,41 @@ void MergeUVRow_16_AVX2(const uint16* src_u,
 }
 #endif  // HAS_MERGEUVROW_AVX2
 
+
+#ifdef HAS_MULTIPLYROW_16_AVX2
+void MultiplyRow_16_AVX2(const uint16* src_y,
+                         uint16* dst_y,
+                         int scale,
+                         int width) {
+  // clang-format off
+  asm volatile (
+    "vmovd        %3,%%xmm3                    \n"
+    "vpunpcklwd   %%xmm3,%%xmm3,%%xmm3         \n"
+    "vbroadcastss %%xmm3,%%ymm3                \n"
+    "sub          %0,%1                        \n"
+
+    // 16 pixels per loop.
+    LABELALIGN
+    "1:                                        \n"
+    "vmovdqu      (%0),%%ymm0                  \n"
+    "vmovdqu      0x20(%0),%%ymm1              \n"
+    "vpmullw      %%ymm3,%%ymm0,%%ymm0         \n"
+    "vpmullw      %%ymm3,%%ymm1,%%ymm1         \n"
+    "vmovdqu      %%ymm0,(%0,%1)               \n"
+    "vmovdqu      %%ymm1,0x20(%0,%1)           \n"
+    "add          $0x40,%0                     \n"
+    "sub          $0x20,%2                     \n"
+    "jg           1b                           \n"
+    "vzeroupper                                \n"
+  : "+r"(src_y),   // %0
+    "+r"(dst_y),   // %1
+    "+r"(width)    // %2
+  : "r"(scale)     // %3
+  : "memory", "cc", "xmm0", "xmm1", "xmm3");
+  // clang-format on
+}
+#endif  // HAS_MULTIPLYROW_16_AVX2
+
 #ifdef HAS_SPLITRGBROW_SSSE3
 
 // Shuffle table for converting RGB to Planar.
diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc
index 34414c9fc..f9e6f8abb 100644
--- a/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
@@ -2661,6 +2661,44 @@ TEST_F(LibYUVPlanarTest, MergeUVRow_16_Opt) {
 }
 #endif
 
+// TODO(fbarchard): improve test for platforms and cpu detect
+#ifdef HAS_MULTIPLYROW_16_AVX2
+TEST_F(LibYUVPlanarTest, MultiplyRow_16_Opt) {
+  const int kPixels = benchmark_width_ * benchmark_height_;
+  align_buffer_page_end(src_pixels_y, kPixels * 2);
+  align_buffer_page_end(dst_pixels_y_opt, kPixels * 2);
+  align_buffer_page_end(dst_pixels_y_c, kPixels * 2);
+
+  MemRandomize(src_pixels_y, kPixels * 2);
+  memset(dst_pixels_y_opt, 0, kPixels * 2);
+  memset(dst_pixels_y_c, 1, kPixels * 2);
+
+  MultiplyRow_16_C(reinterpret_cast<const uint16*>(src_pixels_y),
+                   reinterpret_cast<uint16*>(dst_pixels_y_c), 64, kPixels);
+
+  int has_avx2 = TestCpuFlag(kCpuHasAVX2);
+  for (int i = 0; i < benchmark_iterations_; ++i) {
+    if (has_avx2) {
+      MultiplyRow_16_AVX2(reinterpret_cast<const uint16*>(src_pixels_y),
+                          reinterpret_cast<uint16*>(dst_pixels_y_opt), 64,
+                          kPixels);
+    } else {
+      MultiplyRow_16_C(reinterpret_cast<const uint16*>(src_pixels_y),
+                       reinterpret_cast<uint16*>(dst_pixels_y_opt), 64,
+                       kPixels);
+    }
+  }
+
+  for (int i = 0; i < kPixels * 2; ++i) {
+    EXPECT_EQ(dst_pixels_y_opt[i], dst_pixels_y_c[i]);
+  }
+
+  free_aligned_buffer_page_end(src_pixels_y);
+  free_aligned_buffer_page_end(dst_pixels_y_opt);
+  free_aligned_buffer_page_end(dst_pixels_y_c);
+}
+#endif
+
 float TestScaleMaxSamples(int benchmark_width,
                           int benchmark_height,
                           int benchmark_iterations,
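
The scale factors in the commit message work because multiplying by a power of two is a
left shift: scale 64 moves 10-bit lsb-aligned samples into the top 10 bits of a 16-bit
word (128 does the same for 9-bit, 16 for 12-bit, 1 leaves 16-bit data unchanged). Below
is a minimal standalone sketch of that conversion, not taken from the patch itself; it
uses plain scalar C with uint16_t instead of libyuv's uint16 typedef so it compiles on
its own, and the helper name MultiplyRow16Ref is made up for illustration.

  #include <assert.h>
  #include <stdint.h>

  /* Scalar reference: dst = src * scale, mirroring MultiplyRow_16_C above.      */
  /* scale 128 = 9-bit input, 64 = 10-bit, 16 = 12-bit, 1 = 16-bit passthrough.  */
  static void MultiplyRow16Ref(const uint16_t* src, uint16_t* dst,
                               int scale, int width) {
    for (int x = 0; x < width; ++x) {
      dst[x] = (uint16_t)(src[x] * scale);
    }
  }

  int main(void) {
    uint16_t src = 0x03FF;  /* largest 10-bit value, lsb aligned */
    uint16_t dst = 0;
    MultiplyRow16Ref(&src, &dst, 64, 1);
    assert(dst == 0xFFC0);  /* 0x03FF << 6: value now occupies the top 10 bits */
    return 0;
  }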