Fixes for SplitUVPlane_16 and MergeUVPlane_16

Planar functions pass depth instead of a scale factor.
Row functions pass a shift to the assembly instead of a depth; asserts added to the C versions (a minimal sketch of the convention follows below).
The AVX shift instructions expect a single shift count in an XMM register, not a per-lane broadcast.
NEON passes the shift as an input operand (not an output).
The NEON split row is reimplemented as a left shift on 16-bit shorts by a negative amount to achieve a right shift.
Add planar unit tests.
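
A minimal C sketch of the corrected convention (illustrative only, not the
library code; the _ref names are hypothetical). The planar functions pass a
bit depth, and the reference rows derive the shift from it:

#include <assert.h>
#include <stdint.h>

// Split converts MSB-justified interleaved UV (e.g. P010) into LSB-justified
// planes (e.g. I010) by shifting right; Merge is the inverse left shift.
static void SplitUVRow_16_ref(const uint16_t* src_uv, uint16_t* dst_u,
                              uint16_t* dst_v, int depth, int width) {
  int shift = 16 - depth;  // depth 10 -> shift 6
  assert(depth >= 8 && depth <= 16);
  for (int x = 0; x < width; ++x) {
    dst_u[x] = src_uv[2 * x + 0] >> shift;
    dst_v[x] = src_uv[2 * x + 1] >> shift;
  }
}

static void MergeUVRow_16_ref(const uint16_t* src_u, const uint16_t* src_v,
                              uint16_t* dst_uv, int depth, int width) {
  int shift = 16 - depth;
  assert(depth >= 8 && depth <= 16);
  for (int x = 0; x < width; ++x) {
    dst_uv[2 * x + 0] = (uint16_t)(src_u[x] << shift);
    dst_uv[2 * x + 1] = (uint16_t)(src_v[x] << shift);
  }
}

Depths 10 and 12 (shifts 6 and 4) are the cases exercised by the new tests.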

Bug: libyuv:888
Change-Id: I8fe62d3d777effc5321c361cd595c58b7f93807e
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2782086
Reviewed-by: richard winterton <rrwinterton@gmail.com>
Reviewed-by: Mirko Bonadei <mbonadei@chromium.org>
Frank Barchard 2021-03-24 13:45:04 -07:00 committed by Frank Barchard
parent d8f1bfc981
commit 312c02a5aa
10 changed files with 311 additions and 230 deletions

View File

@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1782
Version: 1783
License: BSD
License File: LICENSE

View File

@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1782
#define LIBYUV_VERSION 1783
#endif // INCLUDE_LIBYUV_VERSION_H_

View File

@ -400,7 +400,7 @@ int I210ToI010(const uint16_t* src_y,
}
// Any I[420]1[02] to P[420]1[02] format with mirroring.
static int Ix1xToPx1x(const uint16_t* src_y,
static int IxxxToPxxx(const uint16_t* src_y,
int src_stride_y,
const uint16_t* src_u,
int src_stride_u,
@ -441,7 +441,7 @@ int I010ToP010(const uint16_t* src_y,
int dst_stride_uv,
int width,
int height) {
return Ix1xToPx1x(src_y, src_stride_y, src_u, src_stride_u, src_v,
return IxxxToPxxx(src_y, src_stride_y, src_u, src_stride_u, src_v,
src_stride_v, dst_y, dst_stride_y, dst_uv, dst_stride_uv,
width, height, 1, 1, 10);
}
@ -459,7 +459,7 @@ int I210ToP210(const uint16_t* src_y,
int dst_stride_uv,
int width,
int height) {
return Ix1xToPx1x(src_y, src_stride_y, src_u, src_stride_u, src_v,
return IxxxToPxxx(src_y, src_stride_y, src_u, src_stride_u, src_v,
src_stride_v, dst_y, dst_stride_y, dst_uv, dst_stride_uv,
width, height, 1, 0, 10);
}
@ -477,7 +477,7 @@ int I012ToP012(const uint16_t* src_y,
int dst_stride_uv,
int width,
int height) {
return Ix1xToPx1x(src_y, src_stride_y, src_u, src_stride_u, src_v,
return IxxxToPxxx(src_y, src_stride_y, src_u, src_stride_u, src_v,
src_stride_v, dst_y, dst_stride_y, dst_uv, dst_stride_uv,
width, height, 1, 1, 12);
}
@ -495,7 +495,7 @@ int I212ToP212(const uint16_t* src_y,
int dst_stride_uv,
int width,
int height) {
return Ix1xToPx1x(src_y, src_stride_y, src_u, src_stride_u, src_v,
return IxxxToPxxx(src_y, src_stride_y, src_u, src_stride_u, src_v,
src_stride_v, dst_y, dst_stride_y, dst_uv, dst_stride_uv,
width, height, 1, 0, 12);
}

View File

@ -10,6 +10,7 @@
#include "libyuv/planar_functions.h"
#include <assert.h>
#include <string.h> // for memset()
#include "libyuv/cpu_id.h"
@ -563,9 +564,9 @@ void SplitUVPlane_16(const uint16_t* src_uv,
int height,
int depth) {
int y;
int scale = 1 << depth;
void (*SplitUVRow)(const uint16_t* src_uv, uint16_t* dst_u, uint16_t* dst_v,
int scale, int width) = SplitUVRow_16_C;
void (*SplitUVRow_16)(const uint16_t* src_uv, uint16_t* dst_u,
uint16_t* dst_v, int depth, int width) =
SplitUVRow_16_C;
// Negative height means invert the image.
if (height < 0) {
height = -height;
@ -583,24 +584,24 @@ void SplitUVPlane_16(const uint16_t* src_uv,
}
#if defined(HAS_SPLITUVROW_16_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
SplitUVRow = SplitUVRow_16_Any_AVX2;
SplitUVRow_16 = SplitUVRow_16_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
SplitUVRow = SplitUVRow_16_AVX2;
SplitUVRow_16 = SplitUVRow_16_AVX2;
}
}
#endif
#if defined(HAS_SPLITUVROW_16_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
SplitUVRow = SplitUVRow_16_Any_NEON;
SplitUVRow_16 = SplitUVRow_16_Any_NEON;
if (IS_ALIGNED(width, 8)) {
SplitUVRow = SplitUVRow_16_NEON;
SplitUVRow_16 = SplitUVRow_16_NEON;
}
}
#endif
for (y = 0; y < height; ++y) {
// Copy a row of UV.
SplitUVRow(src_uv, dst_u, dst_v, scale, width);
SplitUVRow_16(src_uv, dst_u, dst_v, depth, width);
dst_u += dst_stride_u;
dst_v += dst_stride_v;
src_uv += src_stride_uv;
@ -618,9 +619,11 @@ void MergeUVPlane_16(const uint16_t* src_u,
int height,
int depth) {
int y;
int scale = 1 << (16 - depth);
void (*MergeUVRow)(const uint16_t* src_u, const uint16_t* src_v,
uint16_t* dst_uv, int scale, int width) = MergeUVRow_16_C;
void (*MergeUVRow_16)(const uint16_t* src_u, const uint16_t* src_v,
uint16_t* dst_uv, int depth, int width) =
MergeUVRow_16_C;
assert(depth >= 8);
assert(depth <= 16);
// Negative height means invert the image.
if (height < 0) {
height = -height;
@ -636,24 +639,24 @@ void MergeUVPlane_16(const uint16_t* src_u,
}
#if defined(HAS_MERGEUVROW_16_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
MergeUVRow = MergeUVRow_16_Any_AVX2;
MergeUVRow_16 = MergeUVRow_16_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
MergeUVRow = MergeUVRow_16_AVX2;
MergeUVRow_16 = MergeUVRow_16_AVX2;
}
}
#endif
#if defined(HAS_MERGEUVROW_16_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
MergeUVRow = MergeUVRow_16_Any_NEON;
MergeUVRow_16 = MergeUVRow_16_Any_NEON;
if (IS_ALIGNED(width, 8)) {
MergeUVRow = MergeUVRow_16_NEON;
MergeUVRow_16 = MergeUVRow_16_NEON;
}
}
#endif
for (y = 0; y < height; ++y) {
// Merge a row of U and V into a row of UV.
MergeUVRow(src_u, src_v, dst_uv, scale, width);
MergeUVRow_16(src_u, src_v, dst_uv, depth, width);
src_u += src_stride_u;
src_v += src_stride_v;
dst_uv += dst_stride_uv;
@ -671,7 +674,7 @@ void ConvertToMSBPlane_16(const uint16_t* src_y,
int depth) {
int y;
int scale = 1 << (16 - depth);
void (*MultiplyRow)(const uint16_t* src_y, uint16_t* dst_y, int scale,
void (*MultiplyRow_16)(const uint16_t* src_y, uint16_t* dst_y, int scale,
int width) = MultiplyRow_16_C;
// Negative height means invert the image.
if (height < 0) {
@ -688,23 +691,23 @@ void ConvertToMSBPlane_16(const uint16_t* src_y,
#if defined(HAS_MULTIPLYROW_16_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
MultiplyRow = MultiplyRow_16_Any_AVX2;
MultiplyRow_16 = MultiplyRow_16_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
MultiplyRow = MultiplyRow_16_AVX2;
MultiplyRow_16 = MultiplyRow_16_AVX2;
}
}
#endif
#if defined(HAS_MULTIPLYROW_16_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
MultiplyRow = MultiplyRow_16_Any_NEON;
MultiplyRow_16 = MultiplyRow_16_Any_NEON;
if (IS_ALIGNED(width, 16)) {
MultiplyRow = MultiplyRow_16_NEON;
MultiplyRow_16 = MultiplyRow_16_NEON;
}
}
#endif
for (y = 0; y < height; ++y) {
MultiplyRow(src_y, dst_y, scale, width);
MultiplyRow_16(src_y, dst_y, scale, width);
src_y += src_stride_y;
dst_y += dst_stride_y;
}
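
For contrast, ConvertToMSBPlane_16 still takes a scale of 1 << (16 - depth)
rather than a depth, since its row kernel is a plain multiply; a rough worked
example (illustrative, not library code):

  int depth = 10;
  int scale = 1 << (16 - depth);           // 64
  uint16_t lsb = 1023;                     // max 10-bit sample
  uint16_t msb = (uint16_t)(lsb * scale);  // 65472 == 1023 << 6, MSB-justified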

View File

@ -10,6 +10,7 @@
#include "libyuv/row.h"
#include <assert.h>
#include <stdio.h>
#include <string.h> // For memcpy and memset.
@ -3045,6 +3046,8 @@ void MergeUVRow_16_C(const uint16_t* src_u,
int depth,
int width) {
int shift = 16 - depth;
assert(depth >= 8);
assert(depth <= 16);
int x;
for (x = 0; x < width; ++x) {
dst_uv[0] = src_u[x] << shift;
@ -3061,6 +3064,8 @@ void SplitUVRow_16_C(const uint16_t* src_uv,
int width) {
int shift = 16 - depth;
int x;
assert(depth >= 8);
assert(depth <= 16);
for (x = 0; x < width; ++x) {
dst_u[x] = src_uv[0] >> shift;
dst_v[x] = src_uv[1] >> shift;
@ -3098,6 +3103,9 @@ void Convert16To8Row_C(const uint16_t* src_y,
int scale,
int width) {
int x;
assert(scale >= 256);
assert(scale <= 32768);
for (x = 0; x < width; ++x) {
dst_y[x] = clamp255((src_y[x] * scale) >> 16);
}
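
The new asserts bound scale to the useful range for 9- to 16-bit sources;
choosing scale near 1 << (24 - depth) maps the largest depth-bit sample onto
255 after the >> 16. Illustrative arithmetic for the endpoints (not library
code):

  depth 16: scale = 256   -> (65535 * 256)   >> 16 = 255
  depth 10: scale = 16384 -> (1023 * 16384)  >> 16 = 255
  depth  9: scale = 32768 -> (511 * 32768)   >> 16 = 255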

View File

@ -4728,8 +4728,6 @@ void MergeUVRow_16_AVX2(const uint16_t* src_u,
// clang-format off
asm volatile (
"vmovd %4,%%xmm3 \n"
"vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
"vbroadcastss %%xmm3,%%xmm3 \n"
"sub %0,%1 \n"
// 16 pixels per loop.
@ -4761,7 +4759,7 @@ void MergeUVRow_16_AVX2(const uint16_t* src_u,
}
#endif // HAS_MERGEUVROW_AVX2
#ifdef HAS_MERGEUVROW_16_AVX2
#ifdef HAS_SPLITUVROW_16_AVX2
const uvec8 kSplitUVShuffle16 = {0, 1, 4, 5, 8, 9, 12, 13,
2, 3, 6, 7, 10, 11, 14, 15};
void SplitUVRow_16_AVX2(const uint16_t* src_uv,
@ -4773,8 +4771,6 @@ void SplitUVRow_16_AVX2(const uint16_t* src_uv,
// clang-format off
asm volatile (
"vmovd %4,%%xmm3 \n"
"vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
"vbroadcastss %%xmm3,%%xmm3 \n"
"vbroadcastf128 %5,%%ymm4 \n"
"sub %1,%2 \n"
@ -4802,14 +4798,13 @@ void SplitUVRow_16_AVX2(const uint16_t* src_uv,
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(width), // %3
"+r"(depth) // %4
:
"+r"(width) // %3
: "r"(depth), // %4
"m"(kSplitUVShuffle16) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
// clang-format on
}
#endif // HAS_MERGEUVROW_AVX2
#endif // HAS_SPLITUVROW_16_AVX2
// Use scale to convert lsb formats to msb, depending how many bits there are:
// 128 = 9 bits
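
Context for the removed vpunpcklwd / vbroadcastss pair: the vector shift
instructions used here (vpsllw / vpsrlw with a register operand) read one
shift count from the low 64 bits of the XMM register, so broadcasting the
count across lanes leaves extra copies in that 64-bit field and it reads back
as an oversized count. A rough intrinsics sketch of the intended pattern
(illustrative only, not the library asm; the helper name is hypothetical):

#include <immintrin.h>
#include <stdint.h>

// Shift 16 lanes of 16-bit samples right by (16 - depth) using a single
// shift count held in the low bits of an XMM register.
static inline __m256i ShiftRightBy16MinusDepth(__m256i v, int depth) {
  __m128i count = _mm_cvtsi32_si128(16 - depth);  // one count, no broadcast
  return _mm256_srl_epi16(v, count);
}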

View File

@ -3270,32 +3270,22 @@ void SplitUVRow_16_NEON(const uint16_t* src_uv,
uint16_t* dst_v,
int depth,
int width) {
int shift = depth - 16; // Negative for right shift.
asm volatile(
"vdup.32 q0, %3 \n"
"vdup.16 q2, %4 \n"
"1: \n"
"vld2.16 {q1, q2}, [%0]! \n" // load 8 UV
"vmovl.u16 q3, d2 \n"
"vmovl.u16 q4, d3 \n"
"vshl.u32 q3, q3, q0 \n"
"vshl.u32 q4, q4, q0 \n"
"vmovn.u32 d2, q3 \n"
"vmovn.u32 d3, q4 \n"
"vmovl.u16 q3, d4 \n"
"vmovl.u16 q4, d5 \n"
"vshl.u32 q3, q3, q0 \n"
"vshl.u32 q4, q4, q0 \n"
"vmovn.u32 d4, q3 \n"
"vmovn.u32 d5, q4 \n"
"subs %4, %4, #8 \n" // 8 src pixels per loop
"vst1.16 {q1}, [%1]! \n" // store 8 U pixels
"vst1.16 {q2}, [%2]! \n" // store 8 V pixels
"vld2.16 {q0, q1}, [%0]! \n" // load 8 UV
"vshl.u16 q0, q0, q2 \n"
"vshl.u16 q1, q1, q2 \n"
"subs %3, %3, #8 \n" // 8 src pixels per loop
"vst1.16 {q0}, [%1]! \n" // store 8 U pixels
"vst1.16 {q1}, [%2]! \n" // store 8 V pixels
"bgt 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(depth), // %3
"+r"(width) // %4
:
"+r"(width) // %3
: "r"(shift) // %4
: "cc", "memory", "q0", "q1", "q2", "q3", "q4");
}
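
The rewritten split row leans on VSHL taking a per-lane signed shift count, so
a negative count shifts right and the widen-to-u32 / narrow round trip is no
longer needed. A rough NEON-intrinsics equivalent (illustrative only, not the
library code; the helper name is hypothetical):

#include <arm_neon.h>
#include <stdint.h>

// De-interleave 8 UV pairs and shift right by (16 - depth) by left-shifting
// with a negative amount.
static inline void SplitUV8_16_sketch(const uint16_t* src_uv, uint16_t* dst_u,
                                      uint16_t* dst_v, int depth) {
  int16x8_t shift = vdupq_n_s16(depth - 16);  // negative => right shift
  uint16x8x2_t uv = vld2q_u16(src_uv);        // de-interleave U and V
  vst1q_u16(dst_u, vshlq_u16(uv.val[0], shift));
  vst1q_u16(dst_v, vshlq_u16(uv.val[1], shift));
}
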
@ -3306,21 +3296,20 @@ void MergeUVRow_16_NEON(const uint16_t* src_u,
int width) {
int shift = 16 - depth;
asm volatile(
"vdup.16 q2, %3 \n"
"vdup.16 q2, %4 \n"
"1: \n"
"vld1.16 {q0}, [%0]! \n" // load 8 U
"vld1.16 {q1}, [%1]! \n" // load 8 V
"vshl.u16 q0, q0, q2 \n"
"vshl.u16 q1, q1, q2 \n"
"subs %4, %4, #8 \n" // 8 src pixels per loop
"subs %3, %3, #8 \n" // 8 src pixels per loop
"vst2.16 {q0, q1}, [%2]! \n" // store 8 UV pixels
"bgt 1b \n"
: "+r"(src_u), // %0
"+r"(src_v), // %1
"+r"(dst_uv), // %2
"+r"(shift), // %3
"+r"(width) // %4
:
"+r"(width) // %3
: "r"(shift) // %4
: "cc", "memory", "q0", "q1", "q2");
}

File diff suppressed because it is too large

View File

@ -2605,6 +2605,64 @@ TEST_F(LibYUVPlanarTest, MergeUVPlane_Opt) {
free_aligned_buffer_page_end(dst_pixels_c);
}
// 16 bit channel split and merge
TEST_F(LibYUVPlanarTest, MergeUVPlane_16_Opt) {
// Round count up to multiple of 16
const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15;
align_buffer_page_end(src_pixels, kPixels * 2 * 2);
align_buffer_page_end(tmp_pixels_u_c, kPixels * 2);
align_buffer_page_end(tmp_pixels_v_c, kPixels * 2);
align_buffer_page_end(tmp_pixels_u_opt, kPixels * 2);
align_buffer_page_end(tmp_pixels_v_opt, kPixels * 2);
align_buffer_page_end(dst_pixels_opt, kPixels * 2 * 2);
align_buffer_page_end(dst_pixels_c, kPixels * 2 * 2);
MemRandomize(src_pixels, kPixels * 2 * 2);
MemRandomize(tmp_pixels_u_c, kPixels * 2);
MemRandomize(tmp_pixels_v_c, kPixels * 2);
MemRandomize(tmp_pixels_u_opt, kPixels * 2);
MemRandomize(tmp_pixels_v_opt, kPixels * 2);
MemRandomize(dst_pixels_opt, kPixels * 2 * 2);
MemRandomize(dst_pixels_c, kPixels * 2 * 2);
MaskCpuFlags(disable_cpu_flags_);
SplitUVPlane_16((const uint16_t*)src_pixels, benchmark_width_ * 2,
(uint16_t*)tmp_pixels_u_c, benchmark_width_,
(uint16_t*)tmp_pixels_v_c, benchmark_width_, benchmark_width_,
benchmark_height_, 12);
MergeUVPlane_16((const uint16_t*)tmp_pixels_u_c, benchmark_width_,
(const uint16_t*)tmp_pixels_v_c, benchmark_width_,
(uint16_t*)dst_pixels_c, benchmark_width_ * 2,
benchmark_width_, benchmark_height_, 12);
MaskCpuFlags(benchmark_cpu_info_);
SplitUVPlane_16((const uint16_t*)src_pixels, benchmark_width_ * 2,
(uint16_t*)tmp_pixels_u_opt, benchmark_width_,
(uint16_t*)tmp_pixels_v_opt, benchmark_width_,
benchmark_width_, benchmark_height_, 12);
for (int i = 0; i < benchmark_iterations_; ++i) {
MergeUVPlane_16((const uint16_t*)tmp_pixels_u_opt, benchmark_width_,
(const uint16_t*)tmp_pixels_v_opt, benchmark_width_,
(uint16_t*)dst_pixels_opt, benchmark_width_ * 2,
benchmark_width_, benchmark_height_, 12);
}
for (int i = 0; i < kPixels * 2; ++i) {
EXPECT_EQ(tmp_pixels_u_c[i], tmp_pixels_u_opt[i]);
EXPECT_EQ(tmp_pixels_v_c[i], tmp_pixels_v_opt[i]);
}
for (int i = 0; i < kPixels * 2 * 2; ++i) {
EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
}
free_aligned_buffer_page_end(src_pixels);
free_aligned_buffer_page_end(tmp_pixels_u_c);
free_aligned_buffer_page_end(tmp_pixels_v_c);
free_aligned_buffer_page_end(tmp_pixels_u_opt);
free_aligned_buffer_page_end(tmp_pixels_v_opt);
free_aligned_buffer_page_end(dst_pixels_opt);
free_aligned_buffer_page_end(dst_pixels_c);
}
TEST_F(LibYUVPlanarTest, SplitUVPlane_Opt) {
// Round count up to multiple of 16
const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15;
@ -2649,6 +2707,46 @@ TEST_F(LibYUVPlanarTest, SplitUVPlane_Opt) {
free_aligned_buffer_page_end(dst_pixels_c);
}
// 16 bit channel split
TEST_F(LibYUVPlanarTest, SplitUVPlane_16_Opt) {
// Round count up to multiple of 16
const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15;
align_buffer_page_end(src_pixels, kPixels * 2 * 2);
align_buffer_page_end(dst_pixels_u_c, kPixels * 2);
align_buffer_page_end(dst_pixels_v_c, kPixels * 2);
align_buffer_page_end(dst_pixels_u_opt, kPixels * 2);
align_buffer_page_end(dst_pixels_v_opt, kPixels * 2);
MemRandomize(src_pixels, kPixels * 2 * 2);
MemRandomize(dst_pixels_u_c, kPixels * 2);
MemRandomize(dst_pixels_v_c, kPixels * 2);
MemRandomize(dst_pixels_u_opt, kPixels * 2);
MemRandomize(dst_pixels_v_opt, kPixels * 2);
MaskCpuFlags(disable_cpu_flags_);
SplitUVPlane_16((const uint16_t*)src_pixels, benchmark_width_ * 2,
(uint16_t*)dst_pixels_u_c, benchmark_width_,
(uint16_t*)dst_pixels_v_c, benchmark_width_, benchmark_width_,
benchmark_height_, 10);
MaskCpuFlags(benchmark_cpu_info_);
for (int i = 0; i < benchmark_iterations_; ++i) {
SplitUVPlane_16((const uint16_t*)src_pixels, benchmark_width_ * 2,
(uint16_t*)dst_pixels_u_opt, benchmark_width_,
(uint16_t*)dst_pixels_v_opt, benchmark_width_,
benchmark_width_, benchmark_height_, 10);
}
for (int i = 0; i < kPixels * 2; ++i) {
EXPECT_EQ(dst_pixels_u_c[i], dst_pixels_u_opt[i]);
EXPECT_EQ(dst_pixels_v_c[i], dst_pixels_v_opt[i]);
}
free_aligned_buffer_page_end(src_pixels);
free_aligned_buffer_page_end(dst_pixels_u_c);
free_aligned_buffer_page_end(dst_pixels_v_c);
free_aligned_buffer_page_end(dst_pixels_u_opt);
free_aligned_buffer_page_end(dst_pixels_v_opt);
}
TEST_F(LibYUVPlanarTest, SwapUVPlane_Opt) {
// Round count up to multiple of 16
const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15;