MergeUV10Row_AVX2 for converting H010 to P010

H010 is a 10 bit planar format with the significant bits in the low
bits of each 16 bit sample. P010 is a 10 bit biplanar format with the
significant bits in the high bits. This function interleaves the U and
V channels and shifts each value into the upper bits.
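
For reference, the per-sample transform is just a 6-bit left shift. A
minimal scalar sketch, using libyuv's uint16 typedef (the function name
is illustrative, not part of the library):

  // A 10-bit sample in the low bits of a uint16 (H010) moves to the
  // high bits (P010): 0x03FF << 6 == 0xFFC0.
  uint16 h010_to_p010(uint16 s) {
    return (uint16)(s << 6);
  }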

Bug: libyuv:751
Test: LibYUVPlanarTest.MergeUV10Row_Opt
Change-Id: I4a0bac0ef1ff95aa1b8d68261ec8e8e86f2d1fbf
Reviewed-on: https://chromium-review.googlesource.com/752692
Reviewed-by: Cheng Wang <wangcheng@google.com>
Reviewed-by: Frank Barchard <fbarchard@google.com>
Commit-Queue: Frank Barchard <fbarchard@google.com>
Author: Frank Barchard <fbarchard@google.com>
Date:   2017-11-02 19:52:44 -07:00
parent  75ec56b55a
commit  a0c32b9e49
4 changed files with 120 additions and 1 deletion

include/libyuv/row.h

@@ -271,7 +271,7 @@ extern "C" {
#define HAS_I422TOARGBROW_SSSE3
#endif
-// The following are available forr gcc/clang x86 platforms:
+// The following are available for gcc/clang x86 platforms:
// TODO(fbarchard): Port to Visual C
#if !defined(LIBYUV_DISABLE_X86) && \
    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
@@ -279,6 +279,14 @@ extern "C" {
#define HAS_SPLITRGBROW_SSSE3
#endif
// The following are available for AVX2 gcc/clang x86 platforms:
// TODO(fbarchard): Port to Visual C
#if !defined(LIBYUV_DISABLE_X86) && \
    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) && \
    (defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
#define HAS_MERGEUV10ROW_AVX2
#endif
// The following are available on Neon platforms:
#if !defined(LIBYUV_DISABLE_NEON) && \
    (defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON))
@@ -1523,6 +1531,15 @@ void MergeRGBRow_Any_NEON(const uint8* src_r,
                          uint8* dst_rgb,
                          int width);
void MergeUV10Row_C(const uint16* src_u,
                    const uint16* src_v,
                    uint16* dst_uv,
                    int width);
void MergeUV10Row_AVX2(const uint16* src_u,
                       const uint16* src_v,
                       uint16* dst_uv,
                       int width);
void CopyRow_SSE2(const uint8* src, uint8* dst, int count);
void CopyRow_AVX(const uint8* src, uint8* dst, int count);
void CopyRow_ERMS(const uint8* src, uint8* dst, int count);
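
This commit only adds and tests the row functions; nothing is wired
into a converter yet. A caller would likely follow libyuv's usual
runtime-dispatch pattern, sketched here under that assumption (the
function pointer is illustrative; with no _Any_ fallback in this
commit, the AVX2 row requires width to be a multiple of 16):

  void (*MergeUV10Row)(const uint16* src_u, const uint16* src_v,
                       uint16* dst_uv, int width) = MergeUV10Row_C;
  #if defined(HAS_MERGEUV10ROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 16)) {
    MergeUV10Row = MergeUV10Row_AVX2;
  }
  #endif
  // Then, per row: MergeUV10Row(src_u, src_v, dst_uv, width);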

source/row_common.cc

@@ -1798,6 +1798,24 @@ void MergeRGBRow_C(const uint8* src_r,
  }
}

void MergeUV10Row_C(const uint16* src_u,
                    const uint16* src_v,
                    uint16* dst_uv,
                    int width) {
  int x;
  for (x = 0; x < width - 1; x += 2) {
    dst_uv[0] = src_u[x] << 6;
    dst_uv[1] = src_v[x] << 6;
    dst_uv[2] = src_u[x + 1] << 6;
    dst_uv[3] = src_v[x + 1] << 6;
    dst_uv += 4;
  }
  if (width & 1) {
    dst_uv[0] = src_u[width - 1] << 6;
    dst_uv[1] = src_v[width - 1] << 6;
  }
}

void CopyRow_C(const uint8* src, uint8* dst, int count) {
  memcpy(dst, src, count);
}
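
A quick worked example of the reference row on an odd width of 3
pixels (values chosen for illustration):

  const uint16 u[3] = {0x001, 0x200, 0x3FF};
  const uint16 v[3] = {0x100, 0x080, 0x001};
  uint16 uv[6];
  MergeUV10Row_C(u, v, uv, 3);
  // uv == {0x0040, 0x4000, 0x8000, 0x2000, 0xFFC0, 0x0040}: U and V
  // interleaved, each value shifted into the top 10 bits; the last
  // pixel is written by the width & 1 tail.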

source/row_gcc.cc

@@ -2753,6 +2753,48 @@ void MergeUVRow_SSE2(const uint8* src_u,
}
#endif  // HAS_MERGEUVROW_SSE2
#ifdef HAS_MERGEUV10ROW_AVX2
void MergeUV10Row_AVX2(const uint16* src_u,
                       const uint16* src_v,
                       uint16* dst_uv,
                       int width) {
  asm volatile (
    // Compute the V-minus-U offset once so one pointer indexes both planes.
    "sub          %0,%1                        \n"
    LABELALIGN
    "1:                                        \n"
    "vmovdqu      (%0),%%ymm0                  \n"  // load 16 U samples
    "vmovdqu      (%0,%1,1),%%ymm1             \n"  // load 16 V samples
    "add          $0x20,%0                     \n"
    "vpsllw       $0x6,%%ymm0,%%ymm0           \n"  // shift into high 10 bits
    "vpsllw       $0x6,%%ymm1,%%ymm1           \n"
    // "vpermq    $0xd8,%%ymm0,%%ymm0          \n"
    // "vpermq    $0xd8,%%ymm1,%%ymm1          \n"
    // Interleave U and V words; unpack operates within each 128-bit lane.
    "vpunpcklwd   %%ymm1,%%ymm0,%%ymm2         \n"
    "vpunpckhwd   %%ymm1,%%ymm0,%%ymm0         \n"
    // "vmovdqu   %%ymm2,(%2)                  \n"
    // "vmovdqu   %%ymm0,0x20(%2)              \n"
    // Store lane by lane to restore linear pixel order.
    "vextractf128 $0x0,%%ymm2,(%2)             \n"
    "vextractf128 $0x0,%%ymm0,0x10(%2)         \n"
    "vextractf128 $0x1,%%ymm2,0x20(%2)         \n"
    "vextractf128 $0x1,%%ymm0,0x30(%2)         \n"
    "add          $0x40,%2                     \n"
    "sub          $0x10,%3                     \n"  // 16 pixels per iteration
    "jg           1b                           \n"
    "vzeroupper                                \n"
  : "+r"(src_u),   // %0
    "+r"(src_v),   // %1
    "+r"(dst_uv),  // %2
    "+r"(width)    // %3
  :
  : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif  // HAS_MERGEUV10ROW_AVX2
#ifdef HAS_SPLITRGBROW_SSSE3
// Shuffle table for converting RGB to Planar.

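vpunpcklwd/vpunpckhwd interleave within each 128-bit lane, so the
interleaved pixels come out lane-scrambled; the four vextractf128
stores write the lanes back in linear order. The commented-out
vpermq/vmovdqu lines show the alternative: pre-permute the qwords,
then use two full-width stores. A rough intrinsics rendering of one
loop iteration, offered as a readability sketch rather than the
shipped code (merge_uv10_16px is an illustrative name):

  #include <immintrin.h>
  #include <stdint.h>

  // One iteration: 16 U and 16 V samples in, 32 interleaved samples out.
  static void merge_uv10_16px(const uint16_t* u, const uint16_t* v,
                              uint16_t* uv) {
    __m256i u0 = _mm256_slli_epi16(
        _mm256_loadu_si256((const __m256i*)u), 6);  // shift to high bits
    __m256i v0 = _mm256_slli_epi16(
        _mm256_loadu_si256((const __m256i*)v), 6);
    // Per-lane interleave: lo = u0v0..u3v3 | u8v8..u11v11,
    //                      hi = u4v4..u7v7 | u12v12..u15v15.
    __m256i lo = _mm256_unpacklo_epi16(u0, v0);
    __m256i hi = _mm256_unpackhi_epi16(u0, v0);
    // Store lane by lane to restore linear pixel order, mirroring the
    // four vextractf128 stores in the asm above.
    _mm_storeu_si128((__m128i*)(uv + 0), _mm256_castsi256_si128(lo));
    _mm_storeu_si128((__m128i*)(uv + 8), _mm256_castsi256_si128(hi));
    _mm_storeu_si128((__m128i*)(uv + 16), _mm256_extracti128_si256(lo, 1));
    _mm_storeu_si128((__m128i*)(uv + 24), _mm256_extracti128_si256(hi, 1));
  }
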
unit_test/planar_test.cc

@@ -2617,6 +2617,48 @@ TEST_F(LibYUVPlanarTest, SplitRGBPlane_Opt) {
  free_aligned_buffer_page_end(dst_pixels_c);
}
// TODO(fbarchard): improve test for platforms and cpu detect
#ifdef HAS_MERGEUV10ROW_AVX2
TEST_F(LibYUVPlanarTest, MergeUV10Row_Opt) {
  const int kPixels = benchmark_width_ * benchmark_height_;
  align_buffer_page_end(src_pixels_u, kPixels * 2);
  align_buffer_page_end(src_pixels_v, kPixels * 2);
  align_buffer_page_end(dst_pixels_uv_opt, kPixels * 2 * 2);
  align_buffer_page_end(dst_pixels_uv_c, kPixels * 2 * 2);
  MemRandomize(src_pixels_u, kPixels * 2);
  MemRandomize(src_pixels_v, kPixels * 2);
  // Prefill the two outputs with different values so an untouched
  // buffer cannot pass the comparison below.
  memset(dst_pixels_uv_opt, 0, kPixels * 2 * 2);
  memset(dst_pixels_uv_c, 1, kPixels * 2 * 2);
  MergeUV10Row_C(reinterpret_cast<const uint16*>(src_pixels_u),
                 reinterpret_cast<const uint16*>(src_pixels_v),
                 reinterpret_cast<uint16*>(dst_pixels_uv_c), kPixels);
  int has_avx2 = TestCpuFlag(kCpuHasAVX2);
  for (int i = 0; i < benchmark_iterations_; ++i) {
    if (has_avx2) {
      MergeUV10Row_AVX2(reinterpret_cast<const uint16*>(src_pixels_u),
                        reinterpret_cast<const uint16*>(src_pixels_v),
                        reinterpret_cast<uint16*>(dst_pixels_uv_opt), kPixels);
    } else {
      MergeUV10Row_C(reinterpret_cast<const uint16*>(src_pixels_u),
                     reinterpret_cast<const uint16*>(src_pixels_v),
                     reinterpret_cast<uint16*>(dst_pixels_uv_opt), kPixels);
    }
  }
  for (int i = 0; i < kPixels * 2 * 2; ++i) {
    EXPECT_EQ(dst_pixels_uv_opt[i], dst_pixels_uv_c[i]);
  }
  free_aligned_buffer_page_end(src_pixels_u);
  free_aligned_buffer_page_end(src_pixels_v);
  free_aligned_buffer_page_end(dst_pixels_uv_opt);
  free_aligned_buffer_page_end(dst_pixels_uv_c);
}
#endif  // HAS_MERGEUV10ROW_AVX2
float TestScaleMaxSamples(int benchmark_width,
int benchmark_height,
int benchmark_iterations,
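
The Test: trailer in the commit message names the gtest case covering
both rows. Assuming a standard libyuv build where the tests compile
into the libyuv_unittest binary (the path depends on the build setup),
it can be run in isolation with:

  libyuv_unittest --gtest_filter=LibYUVPlanarTest.MergeUV10Row_Opt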