mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-07 01:06:46 +08:00
MergeUV10Row_AVX2 use multiply to handle different bit depths
Instead of a hardcoded shift, use a multiply by a parameter. The scale selects the source bit depth: 128 = 9 bits, 64 = 10 bits, 16 = 12 bits, 1 = 16 bits. Bug: libyuv:751. Test: LibYUVPlanarTest.MergeUV10Row_Opt. Change-Id: Id925edfdbf91243370c90641b50eb8e7625ec329. Reviewed-on: https://chromium-review.googlesource.com/762523. Reviewed-by: richard winterton <rrwinterton@gmail.com>. Commit-Queue: Frank Barchard <fbarchard@google.com>
This commit is contained in:
parent
e26b0a7e0e
commit
2f58d126b9
@ -277,7 +277,7 @@ extern "C" {
|
|||||||
#if !defined(LIBYUV_DISABLE_X86) && \
|
#if !defined(LIBYUV_DISABLE_X86) && \
|
||||||
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) && \
|
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) && \
|
||||||
(defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
|
(defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
|
||||||
#define HAS_MERGEUV10ROW_AVX2
|
#define HAS_MERGEUVROW_16_AVX2
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// The following are available on Neon platforms:
|
// The following are available on Neon platforms:
|
||||||
@ -1521,13 +1521,15 @@ void MergeRGBRow_Any_NEON(const uint8* src_r,
|
|||||||
uint8* dst_rgb,
|
uint8* dst_rgb,
|
||||||
int width);
|
int width);
|
||||||
|
|
||||||
void MergeUV10Row_C(const uint16* src_u,
|
void MergeUVRow_16_C(const uint16* src_u,
|
||||||
const uint16* src_v,
|
const uint16* src_v,
|
||||||
uint16* dst_uv,
|
uint16* dst_uv,
|
||||||
|
int scale, /* 64 for 10 bit */
|
||||||
int width);
|
int width);
|
||||||
void MergeUV10Row_AVX2(const uint16* src_u,
|
void MergeUVRow_16_AVX2(const uint16* src_u,
|
||||||
const uint16* src_v,
|
const uint16* src_v,
|
||||||
uint16* dst_uv,
|
uint16* dst_uv,
|
||||||
|
int scale,
|
||||||
int width);
|
int width);
|
||||||
|
|
||||||
void CopyRow_SSE2(const uint8* src, uint8* dst, int count);
|
void CopyRow_SSE2(const uint8* src, uint8* dst, int count);
|
||||||
|
|||||||
@ -1798,21 +1798,22 @@ void MergeRGBRow_C(const uint8* src_r,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void MergeUV10Row_C(const uint16* src_u,
|
void MergeUVRow_16_C(const uint16* src_u,
|
||||||
const uint16* src_v,
|
const uint16* src_v,
|
||||||
uint16* dst_uv,
|
uint16* dst_uv,
|
||||||
|
int scale,
|
||||||
int width) {
|
int width) {
|
||||||
int x;
|
int x;
|
||||||
for (x = 0; x < width - 1; x += 2) {
|
for (x = 0; x < width - 1; x += 2) {
|
||||||
dst_uv[0] = src_u[x] << 6;
|
dst_uv[0] = src_u[x] * scale;
|
||||||
dst_uv[1] = src_v[x] << 6;
|
dst_uv[1] = src_v[x] * scale;
|
||||||
dst_uv[2] = src_u[x + 1] << 6;
|
dst_uv[2] = src_u[x + 1] * scale;
|
||||||
dst_uv[3] = src_v[x + 1] << 6;
|
dst_uv[3] = src_v[x + 1] * scale;
|
||||||
dst_uv += 4;
|
dst_uv += 4;
|
||||||
}
|
}
|
||||||
if (width & 1) {
|
if (width & 1) {
|
||||||
dst_uv[0] = src_u[width - 1] << 6;
|
dst_uv[0] = src_u[width - 1] * scale;
|
||||||
dst_uv[1] = src_v[width - 1] << 6;
|
dst_uv[1] = src_v[width - 1] * scale;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -2753,13 +2753,23 @@ void MergeUVRow_SSE2(const uint8* src_u,
|
|||||||
}
|
}
|
||||||
#endif // HAS_MERGEUVROW_SSE2
|
#endif // HAS_MERGEUVROW_SSE2
|
||||||
|
|
||||||
#ifdef HAS_MERGEUV10ROW_AVX2
|
// Use scale to convert lsb formats to msb, depending how many bits there are:
|
||||||
void MergeUV10Row_AVX2(const uint16* src_u,
|
// 128 = 9 bits
|
||||||
|
// 64 = 10 bits
|
||||||
|
// 16 = 12 bits
|
||||||
|
// 1 = 16 bits
|
||||||
|
|
||||||
|
#ifdef HAS_MERGEUVROW_16_AVX2
|
||||||
|
void MergeUVRow_16_AVX2(const uint16* src_u,
|
||||||
const uint16* src_v,
|
const uint16* src_v,
|
||||||
uint16* dst_uv,
|
uint16* dst_uv,
|
||||||
|
int scale,
|
||||||
int width) {
|
int width) {
|
||||||
// clang-format off
|
// clang-format off
|
||||||
asm volatile (
|
asm volatile (
|
||||||
|
"vmovd %4,%%xmm3 \n"
|
||||||
|
"vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
|
||||||
|
"vbroadcastss %%xmm3,%%ymm3 \n"
|
||||||
"sub %0,%1 \n"
|
"sub %0,%1 \n"
|
||||||
|
|
||||||
// 16 pixels per loop.
|
// 16 pixels per loop.
|
||||||
@ -2768,8 +2778,9 @@ void MergeUV10Row_AVX2(const uint16* src_u,
|
|||||||
"vmovdqu (%0),%%ymm0 \n"
|
"vmovdqu (%0),%%ymm0 \n"
|
||||||
"vmovdqu (%0,%1,1),%%ymm1 \n"
|
"vmovdqu (%0,%1,1),%%ymm1 \n"
|
||||||
"add $0x20,%0 \n"
|
"add $0x20,%0 \n"
|
||||||
"vpsllw $0x6,%%ymm0,%%ymm0 \n"
|
|
||||||
"vpsllw $0x6,%%ymm1,%%ymm1 \n"
|
"vpmullw %%ymm3,%%ymm0,%%ymm0 \n"
|
||||||
|
"vpmullw %%ymm3,%%ymm1,%%ymm1 \n"
|
||||||
"vpunpcklwd %%ymm1,%%ymm0,%%ymm2 \n" // mutates
|
"vpunpcklwd %%ymm1,%%ymm0,%%ymm2 \n" // mutates
|
||||||
"vpunpckhwd %%ymm1,%%ymm0,%%ymm0 \n"
|
"vpunpckhwd %%ymm1,%%ymm0,%%ymm0 \n"
|
||||||
"vextractf128 $0x0,%%ymm2,(%2) \n"
|
"vextractf128 $0x0,%%ymm2,(%2) \n"
|
||||||
@ -2784,8 +2795,8 @@ void MergeUV10Row_AVX2(const uint16* src_u,
|
|||||||
"+r"(src_v), // %1
|
"+r"(src_v), // %1
|
||||||
"+r"(dst_uv), // %2
|
"+r"(dst_uv), // %2
|
||||||
"+r"(width) // %3
|
"+r"(width) // %3
|
||||||
:
|
: "r"(scale) // %4
|
||||||
: "memory", "cc", "xmm0", "xmm1", "xmm2");
|
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
|
||||||
// clang-format on
|
// clang-format on
|
||||||
}
|
}
|
||||||
#endif // HAS_MERGEUVROW_AVX2
|
#endif // HAS_MERGEUVROW_AVX2
|
||||||
|
|||||||
@ -2618,8 +2618,8 @@ TEST_F(LibYUVPlanarTest, SplitRGBPlane_Opt) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// TODO(fbarchard): improve test for platforms and cpu detect
|
// TODO(fbarchard): improve test for platforms and cpu detect
|
||||||
#ifdef HAS_MERGEUV10ROW_AVX2
|
#ifdef HAS_MERGEUVROW_16_AVX2
|
||||||
TEST_F(LibYUVPlanarTest, MergeUV10Row_Opt) {
|
TEST_F(LibYUVPlanarTest, MergeUVRow_16_Opt) {
|
||||||
const int kPixels = benchmark_width_ * benchmark_height_;
|
const int kPixels = benchmark_width_ * benchmark_height_;
|
||||||
align_buffer_page_end(src_pixels_u, kPixels * 2);
|
align_buffer_page_end(src_pixels_u, kPixels * 2);
|
||||||
align_buffer_page_end(src_pixels_v, kPixels * 2);
|
align_buffer_page_end(src_pixels_v, kPixels * 2);
|
||||||
@ -2631,20 +2631,22 @@ TEST_F(LibYUVPlanarTest, MergeUV10Row_Opt) {
|
|||||||
memset(dst_pixels_uv_opt, 0, kPixels * 2 * 2);
|
memset(dst_pixels_uv_opt, 0, kPixels * 2 * 2);
|
||||||
memset(dst_pixels_uv_c, 1, kPixels * 2 * 2);
|
memset(dst_pixels_uv_c, 1, kPixels * 2 * 2);
|
||||||
|
|
||||||
MergeUV10Row_C(reinterpret_cast<const uint16*>(src_pixels_u),
|
MergeUVRow_16_C(reinterpret_cast<const uint16*>(src_pixels_u),
|
||||||
reinterpret_cast<const uint16*>(src_pixels_v),
|
reinterpret_cast<const uint16*>(src_pixels_v),
|
||||||
reinterpret_cast<uint16*>(dst_pixels_uv_c), kPixels);
|
reinterpret_cast<uint16*>(dst_pixels_uv_c), 64, kPixels);
|
||||||
|
|
||||||
int has_avx2 = TestCpuFlag(kCpuHasAVX2);
|
int has_avx2 = TestCpuFlag(kCpuHasAVX2);
|
||||||
for (int i = 0; i < benchmark_iterations_; ++i) {
|
for (int i = 0; i < benchmark_iterations_; ++i) {
|
||||||
if (has_avx2) {
|
if (has_avx2) {
|
||||||
MergeUV10Row_AVX2(reinterpret_cast<const uint16*>(src_pixels_u),
|
MergeUVRow_16_AVX2(reinterpret_cast<const uint16*>(src_pixels_u),
|
||||||
reinterpret_cast<const uint16*>(src_pixels_v),
|
reinterpret_cast<const uint16*>(src_pixels_v),
|
||||||
reinterpret_cast<uint16*>(dst_pixels_uv_opt), kPixels);
|
reinterpret_cast<uint16*>(dst_pixels_uv_opt), 64,
|
||||||
|
kPixels);
|
||||||
} else {
|
} else {
|
||||||
MergeUV10Row_C(reinterpret_cast<const uint16*>(src_pixels_u),
|
MergeUVRow_16_C(reinterpret_cast<const uint16*>(src_pixels_u),
|
||||||
reinterpret_cast<const uint16*>(src_pixels_v),
|
reinterpret_cast<const uint16*>(src_pixels_v),
|
||||||
reinterpret_cast<uint16*>(dst_pixels_uv_opt), kPixels);
|
reinterpret_cast<uint16*>(dst_pixels_uv_opt), 64,
|
||||||
|
kPixels);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user