ARGBToI420: make the C version match SIMD

Bug: libyuv:447
Change-Id: Iafb28cf635b355837caf41c26baee665642f4f95
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2181779
Reviewed-by: richard winterton <rrwinterton@gmail.com>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
Frank Barchard 2020-05-05 11:16:59 -07:00 committed by Commit Bot
parent 7a61759f78
commit 0b8bb60f2e
6 changed files with 115 additions and 48 deletions

View File

@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1751
Version: 1752
License: BSD
License File: LICENSE

View File

@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1751
#define LIBYUV_VERSION 1752
#endif // INCLUDE_LIBYUV_VERSION_H_

View File

@ -579,6 +579,15 @@ int NV21ToNV12(const uint8_t* src_y,
if (dst_y) {
CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
halfheight = (height + 1) >> 1;
src_vu = src_vu + (halfheight - 1) * src_stride_vu;
src_stride_vu = -src_stride_vu;
}
SwapUVPlane(src_vu, src_stride_vu, dst_uv, dst_stride_uv, halfwidth,
halfheight);
return 0;

View File

@ -27,6 +27,12 @@ extern "C" {
#define LIBYUV_RGB7 1
#endif
#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \
defined(_M_IX86)
#define LIBYUV_ARGBTOUV_PAVGB 1
#define LIBYUV_RGBTOU_TRUNCATE 1
#endif
// llvm x86 is poor at ternary operator, so use branchless min/max.
#define USE_BRANCHLESS 1
@ -420,14 +426,36 @@ static __inline int RGBToY(uint8_t r, uint8_t g, uint8_t b) {
}
#endif
#ifdef LIBYUV_RGBTOU_TRUNCATE
// x86 path: bias of 0x8000 means the >> 8 truncates the fraction,
// presumably to match the SIMD (SSE/AVX) result — see LIBYUV_RGBTOU_TRUNCATE.
static __inline int RGBToU(uint8_t r, uint8_t g, uint8_t b) {
  const int u = 112 * b - 74 * g - 38 * r;
  return (u + 0x8000) >> 8;
}
static __inline int RGBToV(uint8_t r, uint8_t g, uint8_t b) {
  const int v = 112 * r - 94 * g - 18 * b;
  return (v + 0x8000) >> 8;
}
#else
// Default path: bias of 0x8080 = 0x8000 (chroma offset of 128 after >> 8)
// plus 0x80 so the shift rounds to nearest.
static __inline int RGBToU(uint8_t r, uint8_t g, uint8_t b) {
  const int u = 112 * b - 74 * g - 38 * r;
  return (u + 0x8080) >> 8;
}
static __inline int RGBToV(uint8_t r, uint8_t g, uint8_t b) {
  const int v = 112 * r - 94 * g - 18 * b;
  return (v + 0x8080) >> 8;
}
#endif
#if !defined(LIBYUV_ARGBTOUV_PAVGB)
// Chroma helpers for the non-PAVGB path.  Inputs are 2x pixel values (a sum
// of two samples), so the coefficients are half those of RGBToU/RGBToV:
// 112/2, 74/2, 38/2 for U and 112/2, 94/2, 18/2 for V.
static __inline int RGB2xToU(uint16_t r, uint16_t g, uint16_t b) {
  const int u2x = 56 * b - 37 * g - 19 * r;
  return (u2x + 0x8080) >> 8;
}
static __inline int RGB2xToV(uint16_t r, uint16_t g, uint16_t b) {
  const int v2x = 56 * r - 47 * g - 9 * b;
  return (v2x + 0x8080) >> 8;
}
#endif
#define AVGB(a, b) (((a) + (b) + 1) >> 1)
// ARGBToY_C and ARGBToUV_C
// Intel version mimics SSE/AVX, which does 2 pavgb operations
#if LIBYUV_ARGBTOUV_PAVGB
#define MAKEROWY(NAME, R, G, B, BPP) \
void NAME##ToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \
int x; \
@ -442,15 +470,12 @@ static __inline int RGBToV(uint8_t r, uint8_t g, uint8_t b) {
const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb; \
int x; \
for (x = 0; x < width - 1; x += 2) { \
uint8_t ab = (src_rgb0[B] + src_rgb0[B + BPP] + src_rgb1[B] + \
src_rgb1[B + BPP]) >> \
2; \
uint8_t ag = (src_rgb0[G] + src_rgb0[G + BPP] + src_rgb1[G] + \
src_rgb1[G + BPP]) >> \
2; \
uint8_t ar = (src_rgb0[R] + src_rgb0[R + BPP] + src_rgb1[R] + \
src_rgb1[R + BPP]) >> \
2; \
uint8_t ab = AVGB(AVGB(src_rgb0[B], src_rgb1[B]), \
AVGB(src_rgb0[B + BPP], src_rgb1[B + BPP])); \
uint8_t ag = AVGB(AVGB(src_rgb0[G], src_rgb1[G]), \
AVGB(src_rgb0[G + BPP], src_rgb1[G + BPP])); \
uint8_t ar = AVGB(AVGB(src_rgb0[R], src_rgb1[R]), \
AVGB(src_rgb0[R + BPP], src_rgb1[R + BPP])); \
dst_u[0] = RGBToU(ar, ag, ab); \
dst_v[0] = RGBToV(ar, ag, ab); \
src_rgb0 += BPP * 2; \
@ -459,13 +484,54 @@ static __inline int RGBToV(uint8_t r, uint8_t g, uint8_t b) {
dst_v += 1; \
} \
if (width & 1) { \
uint8_t ab = (src_rgb0[B] + src_rgb1[B]) >> 1; \
uint8_t ag = (src_rgb0[G] + src_rgb1[G]) >> 1; \
uint8_t ar = (src_rgb0[R] + src_rgb1[R]) >> 1; \
uint8_t ab = AVGB(src_rgb0[B], src_rgb1[B]); \
uint8_t ag = AVGB(src_rgb0[G], src_rgb1[G]); \
uint8_t ar = AVGB(src_rgb0[R], src_rgb1[R]); \
dst_u[0] = RGBToU(ar, ag, ab); \
dst_v[0] = RGBToV(ar, ag, ab); \
} \
}
#else
// ARM version does sum / 2 then multiply by 2x smaller coefficients
#define MAKEROWY(NAME, R, G, B, BPP) \
void NAME##ToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \
int x; \
for (x = 0; x < width; ++x) { \
dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]); \
src_argb0 += BPP; \
dst_y += 1; \
} \
} \
void NAME##ToUVRow_C(const uint8_t* src_rgb0, int src_stride_rgb, \
uint8_t* dst_u, uint8_t* dst_v, int width) { \
const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb; \
int x; \
for (x = 0; x < width - 1; x += 2) { \
uint16_t ab = (src_rgb0[B] + src_rgb0[B + BPP] + src_rgb1[B] + \
src_rgb1[B + BPP] + 1) >> \
1; \
uint16_t ag = (src_rgb0[G] + src_rgb0[G + BPP] + src_rgb1[G] + \
src_rgb1[G + BPP] + 1) >> \
1; \
uint16_t ar = (src_rgb0[R] + src_rgb0[R + BPP] + src_rgb1[R] + \
src_rgb1[R + BPP] + 1) >> \
1; \
dst_u[0] = RGB2xToU(ar, ag, ab); \
dst_v[0] = RGB2xToV(ar, ag, ab); \
src_rgb0 += BPP * 2; \
src_rgb1 += BPP * 2; \
dst_u += 1; \
dst_v += 1; \
} \
if (width & 1) { \
uint16_t ab = (src_rgb0[B] + src_rgb1[B]); \
uint16_t ag = (src_rgb0[G] + src_rgb1[G]); \
uint16_t ar = (src_rgb0[R] + src_rgb1[R]); \
dst_u[0] = RGB2xToU(ar, ag, ab); \
dst_v[0] = RGB2xToV(ar, ag, ab); \
} \
}
#endif
MAKEROWY(ARGB, 2, 1, 0, 4)
MAKEROWY(BGRA, 1, 2, 3, 4)
@ -519,8 +585,6 @@ static __inline int RGBToVJ(uint8_t r, uint8_t g, uint8_t b) {
return (127 * r - 107 * g - 20 * b + 0x8080) >> 8;
}
#define AVGB(a, b) (((a) + (b) + 1) >> 1)
// ARGBToYJ_C and ARGBToUVJ_C
#define MAKEROWYJ(NAME, R, G, B, BPP) \
void NAME##ToYJRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \

View File

@ -32,8 +32,9 @@
#endif
#if defined(__arm__) || defined(__aarch64__)
// arm version subsamples by summing 4 pixels then multiplying by matrix with
// 4x smaller coefficients which are rounded to nearest integer.
// arm version subsamples by summing 4 pixels, rounding divide by 2, then
// multiplying by matrix with 2x smaller coefficients which are rounded
// to nearest integer.
#define ARM_YUV_ERROR 4
#else
#define ARM_YUV_ERROR 0
@ -246,7 +247,7 @@ TESTPLANARTOP(H420, uint8_t, 1, 2, 2, H010, uint16_t, 2, 2, 2)
} \
} \
} \
EXPECT_LE(max_diff, 3); \
EXPECT_LE(max_diff, 0); \
for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
int abs_diff = abs( \
@ -1008,30 +1009,28 @@ TESTBIPLANARTOB(NV21, 2, 2, YUV24, RAW, 3, 2)
TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
benchmark_width_, DIFF, _Opt, +, 0)
TESTATOPLANAR(ABGR, 4, 1, I420, 2, 2, 4)
TESTATOPLANAR(ARGB, 4, 1, I420, 2, 2, 4)
TESTATOPLANAR(ARGB, 4, 1, I422, 2, 1, 2)
TESTATOPLANAR(ARGB, 4, 1, I444, 1, 1, 2)
TESTATOPLANAR(ABGR, 4, 1, I420, 2, 2, 0)
TESTATOPLANAR(ARGB, 4, 1, I420, 2, 2, 0)
TESTATOPLANAR(ARGB, 4, 1, I422, 2, 1, 0)
TESTATOPLANAR(ARGB, 4, 1, I444, 1, 1, 0)
TESTATOPLANAR(ARGB, 4, 1, J420, 2, 2, ARM_YUV_ERROR)
TESTATOPLANAR(ARGB, 4, 1, J422, 2, 1, ARM_YUV_ERROR)
#ifdef INTEL_TEST
TESTATOPLANAR(ARGB1555, 2, 1, I420, 2, 2, 15)
TESTATOPLANAR(ARGB4444, 2, 1, I420, 2, 2, 17)
#endif
TESTATOPLANAR(BGRA, 4, 1, I420, 2, 2, 4)
TESTATOPLANAR(I400, 1, 1, I420, 2, 2, 2)
TESTATOPLANAR(J400, 1, 1, J420, 2, 2, 2)
TESTATOPLANAR(RAW, 3, 1, I420, 2, 2, 4)
TESTATOPLANAR(RGB24, 3, 1, I420, 2, 2, 4)
TESTATOPLANAR(RGB24, 3, 1, J420, 2, 2, ARM_YUV_ERROR)
#ifdef INTEL_TEST
TESTATOPLANAR(RGB565, 2, 1, I420, 2, 2, 5)
#endif
TESTATOPLANAR(RGBA, 4, 1, I420, 2, 2, 4)
TESTATOPLANAR(UYVY, 2, 1, I420, 2, 2, 2)
TESTATOPLANAR(UYVY, 2, 1, I422, 2, 1, 2)
TESTATOPLANAR(YUY2, 2, 1, I420, 2, 2, 2)
TESTATOPLANAR(YUY2, 2, 1, I422, 2, 1, 2)
TESTATOPLANAR(BGRA, 4, 1, I420, 2, 2, 0)
TESTATOPLANAR(I400, 1, 1, I420, 2, 2, 0)
TESTATOPLANAR(J400, 1, 1, J420, 2, 2, 0)
TESTATOPLANAR(RAW, 3, 1, I420, 2, 2, 0)
TESTATOPLANAR(RGB24, 3, 1, I420, 2, 2, 0)
TESTATOPLANAR(RGB24, 3, 1, J420, 2, 2, ARM_YUV_ERROR)
TESTATOPLANAR(RGBA, 4, 1, I420, 2, 2, 0)
TESTATOPLANAR(UYVY, 2, 1, I420, 2, 2, 0)
TESTATOPLANAR(UYVY, 2, 1, I422, 2, 1, 0)
TESTATOPLANAR(YUY2, 2, 1, I420, 2, 2, 0)
TESTATOPLANAR(YUY2, 2, 1, I422, 2, 1, 0)
#define TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, \
SUBSAMP_Y, W1280, N, NEG, OFF) \
@ -1072,7 +1071,7 @@ TESTATOPLANAR(YUY2, 2, 1, I422, 2, 1, 2)
} \
} \
} \
EXPECT_LE(max_diff, 4); \
EXPECT_LE(max_diff, 0); \
for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
for (int j = 0; j < kStrideUV * 2; ++j) { \
int abs_diff = \
@ -1083,7 +1082,7 @@ TESTATOPLANAR(YUY2, 2, 1, I422, 2, 1, 2)
} \
} \
} \
EXPECT_LE(max_diff, 4); \
EXPECT_LE(max_diff, 0); \
free_aligned_buffer_page_end(dst_y_c); \
free_aligned_buffer_page_end(dst_uv_c); \
free_aligned_buffer_page_end(dst_y_opt); \

View File

@ -3512,7 +3512,6 @@ TEST_F(LibYUVPlanarTest, TestGaussPlane_F32) {
}
TEST_F(LibYUVPlanarTest, HalfMergeUVPlane_Opt) {
// Round count up to multiple of 16
int dst_width = (benchmark_width_ + 1) / 2;
int dst_height = (benchmark_height_ + 1) / 2;
align_buffer_page_end(src_pixels_u, benchmark_width_ * benchmark_height_);
@ -3529,15 +3528,11 @@ TEST_F(LibYUVPlanarTest, HalfMergeUVPlane_Opt) {
MemRandomize(dst_pixels_uv_opt, dst_width * 2 * dst_height);
MemRandomize(dst_pixels_uv_c, dst_width * 2 * dst_height);
ScalePlane(src_pixels_u, benchmark_width_, benchmark_width_,
benchmark_height_,
tmp_pixels_u, dst_width, dst_width, dst_height, kFilterBilinear);
ScalePlane(src_pixels_v, benchmark_width_, benchmark_width_,
benchmark_height_, tmp_pixels_v, dst_width, dst_width, dst_height,
kFilterBilinear);
MergeUVPlane(tmp_pixels_u, dst_width, tmp_pixels_v, dst_width,
dst_pixels_uv_c, dst_width * 2, dst_width, dst_height);
MaskCpuFlags(disable_cpu_flags_);
HalfMergeUVPlane(src_pixels_u, benchmark_width_, src_pixels_v,
benchmark_width_, dst_pixels_uv_c, dst_width * 2,
benchmark_width_, benchmark_height_);
MaskCpuFlags(benchmark_cpu_info_);
for (int i = 0; i < benchmark_iterations_; ++i) {
HalfMergeUVPlane(src_pixels_u, benchmark_width_, src_pixels_v,