mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2026-01-01 03:12:16 +08:00
MultiplyRow_16_AVX2 for converting 10 bit YUV
When converting from lsb 10 bit formats to msb, the values need to be shifted to the top 10 bits. Using a multiply allows the different numbers of bits to be copied:
  128 = 9 bits
  64 = 10 bits
  16 = 12 bits
  1 = 16 bits

Bug: libyuv:751
Test: LibYUVPlanarTest.MultiplyRow_16_Opt
Change-Id: I9cf226053a164baa14155215cb175065b1c4f169
Reviewed-on: https://chromium-review.googlesource.com/762951
Reviewed-by: richard winterton <rrwinterton@gmail.com>
Reviewed-by: Frank Barchard <fbarchard@google.com>
Commit-Queue: Frank Barchard <fbarchard@google.com>
This commit is contained in:
parent
2f58d126b9
commit
49d1e3b036
@ -278,6 +278,7 @@ extern "C" {
|
||||
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) && \
|
||||
(defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
|
||||
#define HAS_MERGEUVROW_16_AVX2
|
||||
#define HAS_MULTIPLYROW_16_AVX2
|
||||
#endif
|
||||
|
||||
// The following are available on Neon platforms:
|
||||
@ -1532,6 +1533,15 @@ void MergeUVRow_16_AVX2(const uint16* src_u,
|
||||
int scale,
|
||||
int width);
|
||||
|
||||
void MultiplyRow_16_AVX2(const uint16* src_y,
|
||||
uint16* dst_y,
|
||||
int scale,
|
||||
int width);
|
||||
void MultiplyRow_16_C(const uint16* src_y,
|
||||
uint16* dst_y,
|
||||
int scale,
|
||||
int width);
|
||||
|
||||
void CopyRow_SSE2(const uint8* src, uint8* dst, int count);
|
||||
void CopyRow_AVX(const uint8* src, uint8* dst, int count);
|
||||
void CopyRow_ERMS(const uint8* src, uint8* dst, int count);
|
||||
|
||||
@ -1817,6 +1817,16 @@ void MergeUVRow_16_C(const uint16* src_u,
|
||||
}
|
||||
}
|
||||
|
||||
// Multiply each 16 bit pixel by a scale factor; C reference version.
// Used to shift lsb-justified 10/12 bit samples up to msb justification:
// scale 128 = 9 bits, 64 = 10 bits, 16 = 12 bits, 1 = 16 bits.
// The product is truncated to 16 bits on store, matching the SIMD paths.
void MultiplyRow_16_C(const uint16* src_y,
                      uint16* dst_y,
                      int scale,
                      int width) {
  const uint16* end = src_y + width;
  while (src_y < end) {
    *dst_y++ = (uint16)(*src_y++ * scale);
  }
}
|
||||
|
||||
void CopyRow_C(const uint8* src, uint8* dst, int count) {
|
||||
memcpy(dst, src, count);
|
||||
}
|
||||
|
||||
@ -2758,7 +2758,6 @@ void MergeUVRow_SSE2(const uint8* src_u,
|
||||
// 64 = 10 bits
|
||||
// 16 = 12 bits
|
||||
// 1 = 16 bits
|
||||
|
||||
#ifdef HAS_MERGEUVROW_16_AVX2
|
||||
void MergeUVRow_16_AVX2(const uint16* src_u,
|
||||
const uint16* src_v,
|
||||
@ -2801,6 +2800,41 @@ void MergeUVRow_16_AVX2(const uint16* src_u,
|
||||
}
|
||||
#endif // HAS_MERGEUVROW_AVX2
|
||||
|
||||
|
||||
#ifdef HAS_MULTIPLYROW_16_AVX2
|
||||
void MultiplyRow_16_AVX2(const uint16* src_y,
|
||||
uint16* dst_y,
|
||||
int scale,
|
||||
int width) {
|
||||
// clang-format off
|
||||
asm volatile (
|
||||
"vmovd %3,%%xmm3 \n"
|
||||
"vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
|
||||
"vbroadcastss %%xmm3,%%ymm3 \n"
|
||||
"sub %0,%1 \n"
|
||||
|
||||
// 16 pixels per loop.
|
||||
LABELALIGN
|
||||
"1: \n"
|
||||
"vmovdqu (%0),%%ymm0 \n"
|
||||
"vmovdqu 0x20(%0),%%ymm1 \n"
|
||||
"vpmullw %%ymm3,%%ymm0,%%ymm0 \n"
|
||||
"vpmullw %%ymm3,%%ymm1,%%ymm1 \n"
|
||||
"vmovdqu %%ymm0,(%0,%1) \n"
|
||||
"vmovdqu %%ymm1,0x20(%0,%1) \n"
|
||||
"add $0x40,%0 \n"
|
||||
"sub $0x20,%2 \n"
|
||||
"jg 1b \n"
|
||||
"vzeroupper \n"
|
||||
: "+r"(src_y), // %0
|
||||
"+r"(dst_y), // %1
|
||||
"+r"(width) // %2
|
||||
: "r"(scale) // %3
|
||||
: "memory", "cc", "xmm0", "xmm1", "xmm3");
|
||||
// clang-format on
|
||||
}
|
||||
#endif // HAS_MULTIPLYROW_16_AVX2
|
||||
|
||||
#ifdef HAS_SPLITRGBROW_SSSE3
|
||||
|
||||
// Shuffle table for converting RGB to Planar.
|
||||
|
||||
@ -2661,6 +2661,44 @@ TEST_F(LibYUVPlanarTest, MergeUVRow_16_Opt) {
|
||||
}
|
||||
#endif
|
||||
|
||||
// TODO(fbarchard): improve test for platforms and cpu detect
|
||||
#ifdef HAS_MULTIPLYROW_16_AVX2
// Check MultiplyRow_16_AVX2 against the C reference and benchmark it.
// Scale 64 shifts lsb-justified 10 bit samples up to msb justification.
TEST_F(LibYUVPlanarTest, MultiplyRow_16_Opt) {
  const int kPixels = benchmark_width_ * benchmark_height_;
  align_buffer_page_end(src_pixels_y, kPixels * 2);
  align_buffer_page_end(dst_pixels_y_opt, kPixels * 2);
  align_buffer_page_end(dst_pixels_y_c, kPixels * 2);

  MemRandomize(src_pixels_y, kPixels * 2);
  // Prefill the two destinations differently so an untouched buffer
  // cannot compare equal by accident.
  memset(dst_pixels_y_opt, 0, kPixels * 2);
  memset(dst_pixels_y_c, 1, kPixels * 2);

  const uint16* src = reinterpret_cast<const uint16*>(src_pixels_y);
  uint16* dst_c = reinterpret_cast<uint16*>(dst_pixels_y_c);
  uint16* dst_opt = reinterpret_cast<uint16*>(dst_pixels_y_opt);

  MultiplyRow_16_C(src, dst_c, 64, kPixels);

  const int has_avx2 = TestCpuFlag(kCpuHasAVX2);
  for (int i = 0; i < benchmark_iterations_; ++i) {
    if (has_avx2) {
      MultiplyRow_16_AVX2(src, dst_opt, 64, kPixels);
    } else {
      MultiplyRow_16_C(src, dst_opt, 64, kPixels);
    }
  }

  for (int i = 0; i < kPixels * 2; ++i) {
    EXPECT_EQ(dst_pixels_y_opt[i], dst_pixels_y_c[i]);
  }

  free_aligned_buffer_page_end(src_pixels_y);
  free_aligned_buffer_page_end(dst_pixels_y_opt);
  free_aligned_buffer_page_end(dst_pixels_y_c);
}
#endif
|
||||
|
||||
float TestScaleMaxSamples(int benchmark_width,
|
||||
int benchmark_height,
|
||||
int benchmark_iterations,
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user