Mirror of https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-07 09:16:48 +08:00
Fixes for SplitUVPlane_16 and MergeUVPlane_16
Planar functions pass depth instead of a scale factor. Row functions pass
shift instead of depth. Add asserts to C. The AVX shift instruction expects
a single shift value in XMM. NEON passes shift as an input (not an output).
Split NEON reimplemented as a left shift on shorts by a negative amount to
achieve a right shift. Add planar unit tests.

Bug: libyuv:888
Change-Id: I8fe62d3d777effc5321c361cd595c58b7f93807e
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2782086
Reviewed-by: richard winterton <rrwinterton@gmail.com>
Reviewed-by: Mirko Bonadei <mbonadei@chromium.org>
This commit is contained in:
parent d8f1bfc981
commit 312c02a5aa
README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1782
+Version: 1783
 License: BSD
 License File: LICENSE
 
include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 1782
+#define LIBYUV_VERSION 1783
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_
source/convert.cc
@@ -400,7 +400,7 @@ int I210ToI010(const uint16_t* src_y,
 }
 
 // Any I[420]1[02] to P[420]1[02] format with mirroring.
-static int Ix1xToPx1x(const uint16_t* src_y,
+static int IxxxToPxxx(const uint16_t* src_y,
                       int src_stride_y,
                       const uint16_t* src_u,
                       int src_stride_u,
@@ -441,7 +441,7 @@ int I010ToP010(const uint16_t* src_y,
                int dst_stride_uv,
                int width,
                int height) {
-  return Ix1xToPx1x(src_y, src_stride_y, src_u, src_stride_u, src_v,
+  return IxxxToPxxx(src_y, src_stride_y, src_u, src_stride_u, src_v,
                     src_stride_v, dst_y, dst_stride_y, dst_uv, dst_stride_uv,
                     width, height, 1, 1, 10);
 }
@@ -459,7 +459,7 @@ int I210ToP210(const uint16_t* src_y,
                int dst_stride_uv,
                int width,
                int height) {
-  return Ix1xToPx1x(src_y, src_stride_y, src_u, src_stride_u, src_v,
+  return IxxxToPxxx(src_y, src_stride_y, src_u, src_stride_u, src_v,
                     src_stride_v, dst_y, dst_stride_y, dst_uv, dst_stride_uv,
                     width, height, 1, 0, 10);
 }
@@ -477,7 +477,7 @@ int I012ToP012(const uint16_t* src_y,
                int dst_stride_uv,
                int width,
                int height) {
-  return Ix1xToPx1x(src_y, src_stride_y, src_u, src_stride_u, src_v,
+  return IxxxToPxxx(src_y, src_stride_y, src_u, src_stride_u, src_v,
                     src_stride_v, dst_y, dst_stride_y, dst_uv, dst_stride_uv,
                     width, height, 1, 1, 12);
 }
@@ -495,7 +495,7 @@ int I212ToP212(const uint16_t* src_y,
                int dst_stride_uv,
                int width,
                int height) {
-  return Ix1xToPx1x(src_y, src_stride_y, src_u, src_stride_u, src_v,
+  return IxxxToPxxx(src_y, src_stride_y, src_u, src_stride_u, src_v,
                     src_stride_v, dst_y, dst_stride_y, dst_uv, dst_stride_uv,
                     width, height, 1, 0, 12);
 }
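A note on the call sites (my reading of the diff, not text from the commit): the three trailing arguments select the chroma geometry and the bit depth, which lines up with the four converters above. Field names here are hypothetical:

    /* Mapping implied by the IxxxToPxxx call sites (names are mine). */
    struct Px1xLayout {
      int subsample_x;  /* chroma width shift: 1 = half width   */
      int subsample_y;  /* chroma height shift: 1 = half height */
      int depth;        /* significant bits per sample          */
    };
    static const struct Px1xLayout kI010ToP010 = {1, 1, 10};  /* 4:2:0 10-bit */
    static const struct Px1xLayout kI210ToP210 = {1, 0, 10};  /* 4:2:2 10-bit */
    static const struct Px1xLayout kI012ToP012 = {1, 1, 12};  /* 4:2:0 12-bit */
    static const struct Px1xLayout kI212ToP212 = {1, 0, 12};  /* 4:2:2 12-bit */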
source/planar_functions.cc
@@ -10,6 +10,7 @@
 
 #include "libyuv/planar_functions.h"
 
+#include <assert.h>
 #include <string.h>  // for memset()
 
 #include "libyuv/cpu_id.h"
@@ -563,9 +564,9 @@ void SplitUVPlane_16(const uint16_t* src_uv,
                      int height,
                      int depth) {
   int y;
-  int scale = 1 << depth;
-  void (*SplitUVRow)(const uint16_t* src_uv, uint16_t* dst_u, uint16_t* dst_v,
-                     int scale, int width) = SplitUVRow_16_C;
+  void (*SplitUVRow_16)(const uint16_t* src_uv, uint16_t* dst_u,
+                        uint16_t* dst_v, int depth, int width) =
+      SplitUVRow_16_C;
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
@@ -583,24 +584,24 @@ void SplitUVPlane_16(const uint16_t* src_uv,
   }
 #if defined(HAS_SPLITUVROW_16_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
-    SplitUVRow = SplitUVRow_16_Any_AVX2;
+    SplitUVRow_16 = SplitUVRow_16_Any_AVX2;
     if (IS_ALIGNED(width, 16)) {
-      SplitUVRow = SplitUVRow_16_AVX2;
+      SplitUVRow_16 = SplitUVRow_16_AVX2;
     }
   }
 #endif
 #if defined(HAS_SPLITUVROW_16_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
-    SplitUVRow = SplitUVRow_16_Any_NEON;
+    SplitUVRow_16 = SplitUVRow_16_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
-      SplitUVRow = SplitUVRow_16_NEON;
+      SplitUVRow_16 = SplitUVRow_16_NEON;
     }
   }
 #endif
 
   for (y = 0; y < height; ++y) {
     // Copy a row of UV.
-    SplitUVRow(src_uv, dst_u, dst_v, scale, width);
+    SplitUVRow_16(src_uv, dst_u, dst_v, depth, width);
     dst_u += dst_stride_u;
     dst_v += dst_stride_v;
     src_uv += src_stride_uv;
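The wrapper above is libyuv's standard dispatch shape: start from the C row function, upgrade to the SIMD _Any_ variant when the CPU flag is set, upgrade again to the aligned variant when width is a multiple of the vector width, then iterate rows. A stripped-down sketch of that idiom (names and signature are illustrative, not the library's API):

    #include <stdint.h>

    typedef void (*SplitRowFn)(const uint16_t* src_uv, uint16_t* dst_u,
                               uint16_t* dst_v, int depth, int width);

    /* Pick the fastest usable row function, then walk the plane. */
    static void split_plane(const uint16_t* src_uv, int src_stride,
                            uint16_t* dst_u, int u_stride,
                            uint16_t* dst_v, int v_stride,
                            int width, int height, int depth,
                            SplitRowFn c_row, SplitRowFn simd_any,
                            SplitRowFn simd_aligned, int lanes, int has_simd) {
      SplitRowFn row = c_row;          /* always-correct fallback */
      if (has_simd) {
        row = simd_any;                /* handles ragged widths */
        if (width % lanes == 0) {
          row = simd_aligned;          /* full vector path */
        }
      }
      for (int y = 0; y < height; ++y) {
        row(src_uv, dst_u, dst_v, depth, width);
        src_uv += src_stride;
        dst_u += u_stride;
        dst_v += v_stride;
      }
    }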
source/planar_functions.cc (continued)
@@ -618,9 +619,11 @@ void MergeUVPlane_16(const uint16_t* src_u,
                      int height,
                      int depth) {
   int y;
-  int scale = 1 << (16 - depth);
-  void (*MergeUVRow)(const uint16_t* src_u, const uint16_t* src_v,
-                     uint16_t* dst_uv, int scale, int width) = MergeUVRow_16_C;
+  void (*MergeUVRow_16)(const uint16_t* src_u, const uint16_t* src_v,
+                        uint16_t* dst_uv, int depth, int width) =
+      MergeUVRow_16_C;
+  assert(depth >= 8);
+  assert(depth <= 16);
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
@@ -636,24 +639,24 @@ void MergeUVPlane_16(const uint16_t* src_u,
   }
 #if defined(HAS_MERGEUVROW_16_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
-    MergeUVRow = MergeUVRow_16_Any_AVX2;
+    MergeUVRow_16 = MergeUVRow_16_Any_AVX2;
     if (IS_ALIGNED(width, 16)) {
-      MergeUVRow = MergeUVRow_16_AVX2;
+      MergeUVRow_16 = MergeUVRow_16_AVX2;
     }
   }
 #endif
 #if defined(HAS_MERGEUVROW_16_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
-    MergeUVRow = MergeUVRow_16_Any_NEON;
+    MergeUVRow_16 = MergeUVRow_16_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
-      MergeUVRow = MergeUVRow_16_NEON;
+      MergeUVRow_16 = MergeUVRow_16_NEON;
     }
   }
 #endif
 
   for (y = 0; y < height; ++y) {
     // Merge a row of U and V into a row of UV.
-    MergeUVRow(src_u, src_v, dst_uv, scale, width);
+    MergeUVRow_16(src_u, src_v, dst_uv, depth, width);
     src_u += src_stride_u;
     src_v += src_stride_v;
     dst_uv += dst_stride_uv;
@@ -671,7 +674,7 @@ void ConvertToMSBPlane_16(const uint16_t* src_y,
                           int depth) {
   int y;
   int scale = 1 << (16 - depth);
-  void (*MultiplyRow)(const uint16_t* src_y, uint16_t* dst_y, int scale,
+  void (*MultiplyRow_16)(const uint16_t* src_y, uint16_t* dst_y, int scale,
                      int width) = MultiplyRow_16_C;
   // Negative height means invert the image.
   if (height < 0) {
@@ -688,23 +691,23 @@ void ConvertToMSBPlane_16(const uint16_t* src_y,
 
 #if defined(HAS_MULTIPLYROW_16_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
-    MultiplyRow = MultiplyRow_16_Any_AVX2;
+    MultiplyRow_16 = MultiplyRow_16_Any_AVX2;
     if (IS_ALIGNED(width, 32)) {
-      MultiplyRow = MultiplyRow_16_AVX2;
+      MultiplyRow_16 = MultiplyRow_16_AVX2;
     }
   }
 #endif
 #if defined(HAS_MULTIPLYROW_16_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
-    MultiplyRow = MultiplyRow_16_Any_NEON;
+    MultiplyRow_16 = MultiplyRow_16_Any_NEON;
     if (IS_ALIGNED(width, 16)) {
-      MultiplyRow = MultiplyRow_16_NEON;
+      MultiplyRow_16 = MultiplyRow_16_NEON;
     }
   }
 #endif
 
   for (y = 0; y < height; ++y) {
-    MultiplyRow(src_y, dst_y, scale, width);
+    MultiplyRow_16(src_y, dst_y, scale, width);
     src_y += src_stride_y;
     dst_y += dst_stride_y;
   }
source/row_common.cc
@@ -10,6 +10,7 @@
 
 #include "libyuv/row.h"
 
+#include <assert.h>
 #include <stdio.h>
 #include <string.h>  // For memcpy and memset.
 
@@ -3045,6 +3046,8 @@ void MergeUVRow_16_C(const uint16_t* src_u,
                      int depth,
                      int width) {
   int shift = 16 - depth;
+  assert(depth >= 8);
+  assert(depth <= 16);
   int x;
   for (x = 0; x < width; ++x) {
     dst_uv[0] = src_u[x] << shift;
@@ -3061,6 +3064,8 @@ void SplitUVRow_16_C(const uint16_t* src_uv,
                      int width) {
   int shift = 16 - depth;
   int x;
+  assert(depth >= 8);
+  assert(depth <= 16);
   for (x = 0; x < width; ++x) {
     dst_u[x] = src_uv[0] >> shift;
     dst_v[x] = src_uv[1] >> shift;
@@ -3098,6 +3103,9 @@ void Convert16To8Row_C(const uint16_t* src_y,
                        int scale,
                        int width) {
   int x;
+  assert(scale >= 256);
+  assert(scale <= 32768);
+
   for (x = 0; x < width; ++x) {
     dst_y[x] = clamp255((src_y[x] * scale) >> 16);
   }
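The new scale asserts in Convert16To8Row_C bound the supported source depths at 16 and 9 bits. A quick worked check of the fixed-point mapping (numbers mine, derived from the code above):

    /* (src * scale) >> 16 maps a depth-bit value onto 8 bits when
       scale = 1 << (24 - depth):
         depth 16: scale 256   -> (65535 * 256)   >> 16 = 255
         depth 10: scale 16384 -> (1023 * 16384)  >> 16 = 255
         depth  9: scale 32768 -> (511 * 32768)   >> 16 = 255
       hence assert(scale >= 256) and assert(scale <= 32768). */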
source/row_gcc.cc
@@ -4728,8 +4728,6 @@ void MergeUVRow_16_AVX2(const uint16_t* src_u,
   // clang-format off
   asm volatile (
     "vmovd      %4,%%xmm3                      \n"
-    "vpunpcklwd %%xmm3,%%xmm3,%%xmm3           \n"
-    "vbroadcastss %%xmm3,%%xmm3                \n"
     "sub        %0,%1                          \n"
 
   // 16 pixels per loop.
@@ -4761,7 +4759,7 @@ void MergeUVRow_16_AVX2(const uint16_t* src_u,
 }
 #endif  // HAS_MERGEUVROW_AVX2
 
-#ifdef HAS_MERGEUVROW_16_AVX2
+#ifdef HAS_SPLITUVROW_16_AVX2
 const uvec8 kSplitUVShuffle16 = {0, 1, 4, 5, 8, 9, 12, 13,
                                  2, 3, 6, 7, 10, 11, 14, 15};
 void SplitUVRow_16_AVX2(const uint16_t* src_uv,
@@ -4773,8 +4771,6 @@ void SplitUVRow_16_AVX2(const uint16_t* src_uv,
   // clang-format off
   asm volatile (
     "vmovd      %4,%%xmm3                      \n"
-    "vpunpcklwd %%xmm3,%%xmm3,%%xmm3           \n"
-    "vbroadcastss %%xmm3,%%xmm3                \n"
     "vbroadcastf128 %5,%%ymm4                  \n"
     "sub        %1,%2                          \n"
 
@@ -4802,14 +4798,13 @@ void SplitUVRow_16_AVX2(const uint16_t* src_uv,
       : "+r"(src_uv),  // %0
         "+r"(dst_u),   // %1
         "+r"(dst_v),   // %2
-        "+r"(width),   // %3
-        "+r"(depth)    // %4
-      :
+        "+r"(width)    // %3
+      : "r"(depth),    // %4
         "m"(kSplitUVShuffle16)  // %5
       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
   // clang-format on
 }
-#endif  // HAS_MERGEUVROW_AVX2
+#endif  // HAS_SPLITUVROW_16_AVX2
 
 // Use scale to convert lsb formats to msb, depending how many bits there are:
 // 128 = 9 bits
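The deleted vpunpcklwd/vbroadcastss pairs are the commit message's "AVX shift instruction expects a single shift value in XMM": vpsllw/vpsrlw take one count from the low 64 bits of an XMM register, so broadcasting the count across lanes was wasted work. The same idea with intrinsics, assuming AVX2 (a sketch, not the commit's asm):

    #include <immintrin.h>

    /* vpsrlw: every uint16 lane is shifted right by the single count
       held in the low quadword of an XMM register. */
    static __m256i shift_right_u16(__m256i v, int shift) {
      __m128i count = _mm_cvtsi32_si128(shift);  /* count in low 32 bits */
      return _mm256_srl_epi16(v, count);
    }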
source/row_neon.cc
@@ -3270,32 +3270,22 @@ void SplitUVRow_16_NEON(const uint16_t* src_uv,
                         uint16_t* dst_v,
                         int depth,
                         int width) {
+  int shift = depth - 16;  // Negative for right shift.
   asm volatile(
-      "vdup.32    q0, %3                         \n"
+      "vdup.16    q2, %4                         \n"
       "1:                                        \n"
-      "vld2.16    {q1, q2}, [%0]!                \n"  // load 8 UV
-      "vmovl.u16  q3, d2                         \n"
-      "vmovl.u16  q4, d3                         \n"
-      "vshl.u32   q3, q3, q0                     \n"
-      "vshl.u32   q4, q4, q0                     \n"
-      "vmovn.u32  d2, q3                         \n"
-      "vmovn.u32  d3, q4                         \n"
-      "vmovl.u16  q3, d4                         \n"
-      "vmovl.u16  q4, d5                         \n"
-      "vshl.u32   q3, q3, q0                     \n"
-      "vshl.u32   q4, q4, q0                     \n"
-      "vmovn.u32  d4, q3                         \n"
-      "vmovn.u32  d5, q4                         \n"
-      "subs       %4, %4, #8                     \n"  // 8 src pixels per loop
-      "vst1.16    {q1}, [%1]!                    \n"  // store 8 U pixels
-      "vst1.16    {q2}, [%2]!                    \n"  // store 8 V pixels
+      "vld2.16    {q0, q1}, [%0]!                \n"  // load 8 UV
+      "vshl.u16   q0, q0, q2                     \n"
+      "vshl.u16   q1, q1, q2                     \n"
+      "subs       %3, %3, #8                     \n"  // 8 src pixels per loop
+      "vst1.16    {q0}, [%1]!                    \n"  // store 8 U pixels
+      "vst1.16    {q1}, [%2]!                    \n"  // store 8 V pixels
       "bgt        1b                             \n"
       : "+r"(src_uv),  // %0
         "+r"(dst_u),   // %1
        "+r"(dst_v),   // %2
-        "+r"(depth),   // %3
-        "+r"(width)    // %4
-      :
+        "+r"(width)    // %3
+      : "r"(shift)     // %4
       : "cc", "memory", "q0", "q1", "q2", "q3", "q4");
 }
 
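32-bit NEON has no variable right-shift instruction: vshl shifts each lane left by a signed per-lane count, and a negative count shifts right. That is why the rewrite computes shift = depth - 16 (negative) and collapses the old widen-to-32-bit/shift/narrow sequence into one vshl.u16 per register. The same trick in intrinsics (illustrative sketch, not the commit's code):

    #include <arm_neon.h>

    /* Right-shift eight uint16 lanes via vshl with a negative count:
       for MSB-justified depth-bit samples, depth - 16 < 0, so this
       shifts right by 16 - depth. */
    static uint16x8_t shift_to_lsb(uint16x8_t v, int depth) {
      int16x8_t count = vdupq_n_s16((int16_t)(depth - 16));
      return vshlq_u16(v, count);
    }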
source/row_neon.cc (continued)
@@ -3306,21 +3296,20 @@ void MergeUVRow_16_NEON(const uint16_t* src_u,
                         int width) {
   int shift = 16 - depth;
   asm volatile(
-      "vdup.16    q2, %3                         \n"
+      "vdup.16    q2, %4                         \n"
       "1:                                        \n"
       "vld1.16    {q0}, [%0]!                    \n"  // load 8 U
       "vld1.16    {q1}, [%1]!                    \n"  // load 8 V
       "vshl.u16   q0, q0, q2                     \n"
       "vshl.u16   q1, q1, q2                     \n"
-      "subs       %4, %4, #8                     \n"  // 8 src pixels per loop
+      "subs       %3, %3, #8                     \n"  // 8 src pixels per loop
       "vst2.16    {q0, q1}, [%2]!                \n"  // store 8 UV pixels
       "bgt        1b                             \n"
       : "+r"(src_u),   // %0
         "+r"(src_v),   // %1
         "+r"(dst_uv),  // %2
-        "+r"(shift),   // %3
-        "+r"(width)    // %4
-      :
+        "+r"(width)    // %3
+      : "r"(shift)     // %4
       : "cc", "memory", "q0", "q1", "q2");
 }
 
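The operand change in both NEON routines is the commit message's "NEON passes shift as an input (not an output)": shift was previously declared "+r" (read-write output) even though the asm only reads it, which also told the compiler the register could be clobbered. Moving it to the input list states the contract precisely. A minimal sketch of the distinction, assuming 32-bit ARM GCC inline asm (not libyuv code):

    /* r is modified in place -> "+r" output; n is only read -> "r" input. */
    static int shl_var(int r, int n) {
      asm volatile("lsl %0, %0, %1" : "+r"(r) : "r"(n));
      return r;
    }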
(Diff for one additional file suppressed because it is too large.)
unit_test/planar_test.cc
@@ -2605,6 +2605,64 @@ TEST_F(LibYUVPlanarTest, MergeUVPlane_Opt) {
   free_aligned_buffer_page_end(dst_pixels_c);
 }
 
+// 16 bit channel split and merge
+TEST_F(LibYUVPlanarTest, MergeUVPlane_16_Opt) {
+  // Round count up to multiple of 16
+  const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15;
+  align_buffer_page_end(src_pixels, kPixels * 2 * 2);
+  align_buffer_page_end(tmp_pixels_u_c, kPixels * 2);
+  align_buffer_page_end(tmp_pixels_v_c, kPixels * 2);
+  align_buffer_page_end(tmp_pixels_u_opt, kPixels * 2);
+  align_buffer_page_end(tmp_pixels_v_opt, kPixels * 2);
+  align_buffer_page_end(dst_pixels_opt, kPixels * 2 * 2);
+  align_buffer_page_end(dst_pixels_c, kPixels * 2 * 2);
+  MemRandomize(src_pixels, kPixels * 2 * 2);
+  MemRandomize(tmp_pixels_u_c, kPixels * 2);
+  MemRandomize(tmp_pixels_v_c, kPixels * 2);
+  MemRandomize(tmp_pixels_u_opt, kPixels * 2);
+  MemRandomize(tmp_pixels_v_opt, kPixels * 2);
+  MemRandomize(dst_pixels_opt, kPixels * 2 * 2);
+  MemRandomize(dst_pixels_c, kPixels * 2 * 2);
+
+  MaskCpuFlags(disable_cpu_flags_);
+  SplitUVPlane_16((const uint16_t*)src_pixels, benchmark_width_ * 2,
+                  (uint16_t*)tmp_pixels_u_c, benchmark_width_,
+                  (uint16_t*)tmp_pixels_v_c, benchmark_width_, benchmark_width_,
+                  benchmark_height_, 12);
+  MergeUVPlane_16((const uint16_t*)tmp_pixels_u_c, benchmark_width_,
+                  (const uint16_t*)tmp_pixels_v_c, benchmark_width_,
+                  (uint16_t*)dst_pixels_c, benchmark_width_ * 2,
+                  benchmark_width_, benchmark_height_, 12);
+  MaskCpuFlags(benchmark_cpu_info_);
+
+  SplitUVPlane_16((const uint16_t*)src_pixels, benchmark_width_ * 2,
+                  (uint16_t*)tmp_pixels_u_opt, benchmark_width_,
+                  (uint16_t*)tmp_pixels_v_opt, benchmark_width_,
+                  benchmark_width_, benchmark_height_, 12);
+
+  for (int i = 0; i < benchmark_iterations_; ++i) {
+    MergeUVPlane_16((const uint16_t*)tmp_pixels_u_opt, benchmark_width_,
+                    (const uint16_t*)tmp_pixels_v_opt, benchmark_width_,
+                    (uint16_t*)dst_pixels_opt, benchmark_width_ * 2,
+                    benchmark_width_, benchmark_height_, 12);
+  }
+
+  for (int i = 0; i < kPixels * 2; ++i) {
+    EXPECT_EQ(tmp_pixels_u_c[i], tmp_pixels_u_opt[i]);
+    EXPECT_EQ(tmp_pixels_v_c[i], tmp_pixels_v_opt[i]);
+  }
+  for (int i = 0; i < kPixels * 2 * 2; ++i) {
+    EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+  }
+  free_aligned_buffer_page_end(src_pixels);
+  free_aligned_buffer_page_end(tmp_pixels_u_c);
+  free_aligned_buffer_page_end(tmp_pixels_v_c);
+  free_aligned_buffer_page_end(tmp_pixels_u_opt);
+  free_aligned_buffer_page_end(tmp_pixels_v_opt);
+  free_aligned_buffer_page_end(dst_pixels_opt);
+  free_aligned_buffer_page_end(dst_pixels_c);
+}
+
 TEST_F(LibYUVPlanarTest, SplitUVPlane_Opt) {
   // Round count up to multiple of 16
   const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15;
@@ -2649,6 +2707,46 @@ TEST_F(LibYUVPlanarTest, SplitUVPlane_Opt) {
   free_aligned_buffer_page_end(dst_pixels_c);
 }
 
+// 16 bit channel split
+TEST_F(LibYUVPlanarTest, SplitUVPlane_16_Opt) {
+  // Round count up to multiple of 16
+  const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15;
+  align_buffer_page_end(src_pixels, kPixels * 2 * 2);
+  align_buffer_page_end(dst_pixels_u_c, kPixels * 2);
+  align_buffer_page_end(dst_pixels_v_c, kPixels * 2);
+  align_buffer_page_end(dst_pixels_u_opt, kPixels * 2);
+  align_buffer_page_end(dst_pixels_v_opt, kPixels * 2);
+  MemRandomize(src_pixels, kPixels * 2 * 2);
+  MemRandomize(dst_pixels_u_c, kPixels * 2);
+  MemRandomize(dst_pixels_v_c, kPixels * 2);
+  MemRandomize(dst_pixels_u_opt, kPixels * 2);
+  MemRandomize(dst_pixels_v_opt, kPixels * 2);
+
+  MaskCpuFlags(disable_cpu_flags_);
+  SplitUVPlane_16((const uint16_t*)src_pixels, benchmark_width_ * 2,
+                  (uint16_t*)dst_pixels_u_c, benchmark_width_,
+                  (uint16_t*)dst_pixels_v_c, benchmark_width_, benchmark_width_,
+                  benchmark_height_, 10);
+  MaskCpuFlags(benchmark_cpu_info_);
+
+  for (int i = 0; i < benchmark_iterations_; ++i) {
+    SplitUVPlane_16((const uint16_t*)src_pixels, benchmark_width_ * 2,
+                    (uint16_t*)dst_pixels_u_opt, benchmark_width_,
+                    (uint16_t*)dst_pixels_v_opt, benchmark_width_,
+                    benchmark_width_, benchmark_height_, 10);
+  }
+
+  for (int i = 0; i < kPixels * 2; ++i) {
+    EXPECT_EQ(dst_pixels_u_c[i], dst_pixels_u_opt[i]);
+    EXPECT_EQ(dst_pixels_v_c[i], dst_pixels_v_opt[i]);
+  }
+  free_aligned_buffer_page_end(src_pixels);
+  free_aligned_buffer_page_end(dst_pixels_u_c);
+  free_aligned_buffer_page_end(dst_pixels_v_c);
+  free_aligned_buffer_page_end(dst_pixels_u_opt);
+  free_aligned_buffer_page_end(dst_pixels_v_opt);
+}
+
 TEST_F(LibYUVPlanarTest, SwapUVPlane_Opt) {
   // Round count up to multiple of 16
   const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15;
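The added tests exercise a split/merge round trip at depths 12 and 10 and require the C and SIMD paths to agree bit-for-bit. For callers, the round trip looks roughly like this (hedged sketch; contiguous planes, strides in uint16 elements, matching how the tests invoke the two functions):

    #include <stdint.h>
    #include "libyuv/planar_functions.h"

    /* Split a 12-bit interleaved UV plane, then merge it back. */
    void roundtrip_uv_16(const uint16_t* uv, uint16_t* u, uint16_t* v,
                         uint16_t* uv_out, int width, int height) {
      SplitUVPlane_16(uv, width * 2, u, width, v, width, width, height, 12);
      MergeUVPlane_16(u, width, v, width, uv_out, width * 2, width, height, 12);
    }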