UVScale down by 2 fix for C and optimize for NEON

- update cpu_id to use "re" for fopen ("e" sets O_CLOEXEC) so the descriptor is not leaked to a child process if another thread forks and execs while the file is open.
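For background on the "e" flag: on glibc (2.7+) and modern BSDs, fopen mode "e" atomically sets O_CLOEXEC on the underlying descriptor, closing the race where another thread forks and execs between fopen and fclose. A minimal standalone sketch of the pattern (illustrative code and function name, not the cpu_id.cc source):

#include <stdio.h>

// Count lines in /proc/cpuinfo with close-on-exec set on the stream.
// The "e" mode flag is not part of ISO C; platforms without it may
// reject or ignore the extra character.
static int CountCpuinfoLines(void) {
  FILE* f = fopen("/proc/cpuinfo", "re");
  if (!f) {
    return -1;  // e.g. a sandboxed process with no /proc access.
  }
  char line[512];
  int count = 0;
  while (fgets(line, sizeof(line), f)) {
    ++count;
  }
  fclose(f);
  return count;
}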

Bug: libyuv:958
Change-Id: I1af9de68fce12e440e1226fc8070634ccb1bf090
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/4417176
Reviewed-by: Wan-Teh Chang <wtc@google.com>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Frank Barchard 2023-04-12 15:32:23 -07:00 committed by libyuv LUCI CQ
parent ee3e71c7ce
commit 68659d0d68
10 changed files with 149 additions and 86 deletions

README.chromium

@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1865
+Version: 1866
 License: BSD
 License File: LICENSE

include/libyuv/scale_row.h

@@ -133,6 +133,8 @@ extern "C" {
 #define HAS_SCALEROWDOWN34_NEON
 #define HAS_SCALEROWDOWN38_NEON
 #define HAS_SCALEROWDOWN4_NEON
+#define HAS_SCALEUVROWDOWN2_NEON
+#define HAS_SCALEUVROWDOWN2LINEAR_NEON
 #define HAS_SCALEUVROWDOWN2BOX_NEON
 #define HAS_SCALEUVROWDOWNEVEN_NEON
 #define HAS_SCALEROWUP2_LINEAR_NEON

include/libyuv/version.h

@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 1865
+#define LIBYUV_VERSION 1866
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_

source/cpu_id.cc

@@ -137,7 +137,7 @@ static int GetXCR0() {
 // For Arm, but public to allow testing on any CPU
 LIBYUV_API SAFEBUFFERS int ArmCpuCaps(const char* cpuinfo_name) {
   char cpuinfo_line[512];
-  FILE* f = fopen(cpuinfo_name, "r");
+  FILE* f = fopen(cpuinfo_name, "re");
   if (!f) {
     // Assume Neon if /proc/cpuinfo is unavailable.
     // This will occur for Chrome sandbox for Pepper or Render process.
@@ -166,7 +166,7 @@ LIBYUV_API SAFEBUFFERS int ArmCpuCaps(const char* cpuinfo_name) {
 LIBYUV_API SAFEBUFFERS int MipsCpuCaps(const char* cpuinfo_name) {
   char cpuinfo_line[512];
   int flag = 0x0;
-  FILE* f = fopen(cpuinfo_name, "r");
+  FILE* f = fopen(cpuinfo_name, "re");
   if (!f) {
     // Assume nothing if /proc/cpuinfo is unavailable.
     // This will occur for Chrome sandbox for Pepper or Render process.
@@ -194,7 +194,7 @@ LIBYUV_API SAFEBUFFERS int MipsCpuCaps(const char* cpuinfo_name) {
 LIBYUV_API SAFEBUFFERS int RiscvCpuCaps(const char* cpuinfo_name) {
   char cpuinfo_line[512];
   int flag = 0x0;
-  FILE* f = fopen(cpuinfo_name, "r");
+  FILE* f = fopen(cpuinfo_name, "re");
   if (!f) {
     // Assume nothing if /proc/cpuinfo is unavailable.
     // This will occur for Chrome sandbox for Pepper or Render process.

source/scale_any.cc

@@ -128,6 +128,22 @@ SDODD(ScaleRowDown2Box_Odd_NEON,
       1,
       15)
 #endif
+#ifdef HAS_SCALEUVROWDOWN2_NEON
+SDANY(ScaleUVRowDown2_Any_NEON,
+      ScaleUVRowDown2_NEON,
+      ScaleUVRowDown2_C,
+      2,
+      2,
+      7)
+#endif
+#ifdef HAS_SCALEUVROWDOWN2LINEAR_NEON
+SDANY(ScaleUVRowDown2Linear_Any_NEON,
+      ScaleUVRowDown2Linear_NEON,
+      ScaleUVRowDown2Linear_C,
+      2,
+      2,
+      7)
+#endif
 #ifdef HAS_SCALEUVROWDOWN2BOX_NEON
 SDANY(ScaleUVRowDown2Box_Any_NEON,
       ScaleUVRowDown2Box_NEON,

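SDANY wraps a SIMD row kernel so it can handle any width: the NEON kernel covers the largest multiple-of-8 prefix (MASK = 7) and the C kernel finishes the remainder. The arguments above are (any-name, SIMD kernel, C fallback, scale factor, bytes per pixel, mask). A simplified sketch of what the generated wrapper does, under those conventions (the real macro handles the tail through a small temp buffer instead of calling C on the source directly, and the _Sketch name is hypothetical):

#include <stddef.h>
#include <stdint.h>

extern void ScaleUVRowDown2_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride,
                                 uint8_t* dst, int dst_width);
extern void ScaleUVRowDown2_C(const uint8_t* src_uv, ptrdiff_t src_stride,
                              uint8_t* dst_uv, int dst_width);

// Illustrative expansion of SDANY(ScaleUVRowDown2_Any_NEON, ..., 2, 2, 7):
// NEON handles dst_width rounded down to a multiple of 8, C does the rest.
void ScaleUVRowDown2_Any_NEON_Sketch(const uint8_t* src_ptr,
                                     ptrdiff_t src_stride,
                                     uint8_t* dst_ptr,
                                     int dst_width) {
  int n = dst_width & ~7;  // Largest multiple of 8 <= dst_width.
  if (n > 0) {
    ScaleUVRowDown2_NEON(src_ptr, src_stride, dst_ptr, n);
  }
  // FACTOR = 2 source pixels per output pixel, BPP = 2 bytes per UV pixel:
  // each output pixel consumed 2 * 2 = 4 source bytes and wrote 2.
  ScaleUVRowDown2_C(src_ptr + n * 4, src_stride, dst_ptr + n * 2,
                    dst_width - n);
}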
source/scale_common.cc

@@ -1280,18 +1280,13 @@ void ScaleUVRowDown2_C(const uint8_t* src_uv,
                        ptrdiff_t src_stride,
                        uint8_t* dst_uv,
                        int dst_width) {
-  const uint16_t* src = (const uint16_t*)(src_uv);
-  uint16_t* dst = (uint16_t*)(dst_uv);
   int x;
   (void)src_stride;
-  for (x = 0; x < dst_width - 1; x += 2) {
-    dst[0] = src[1];
-    dst[1] = src[3];
-    src += 2;
-    dst += 2;
-  }
-  if (dst_width & 1) {
-    dst[0] = src[1];
+  for (x = 0; x < dst_width; ++x) {
+    dst_uv[0] = src_uv[2];  // Store the 2nd UV
+    dst_uv[1] = src_uv[3];
+    src_uv += 4;
+    dst_uv += 2;
   }
 }

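The rewrite drops the uint16_t reinterpretation of the byte buffer (a cast that is only safe where unaligned, type-punned loads are tolerated) in favor of plain byte indexing, and folds the odd-width special case into a single loop. A tiny self-contained check of the selection behavior, written against the new loop (hypothetical test, not part of the commit):

#include <assert.h>
#include <stdint.h>

// Same selection as the rewritten loop: keep the 2nd of every 2 UV pairs.
static void ScaleUVRowDown2_Sketch(const uint8_t* src_uv, uint8_t* dst_uv,
                                   int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    dst_uv[0] = src_uv[2];  // U of the 2nd source pair.
    dst_uv[1] = src_uv[3];  // V of the 2nd source pair.
    src_uv += 4;            // Consume 2 source UV pairs (4 bytes).
    dst_uv += 2;            // Produce 1 output UV pair (2 bytes).
  }
}

int main(void) {
  const uint8_t src[8] = {10, 11, 20, 21, 30, 31, 40, 41};  // 4 UV pairs.
  uint8_t dst[4] = {0};
  ScaleUVRowDown2_Sketch(src, dst, 2);
  assert(dst[0] == 20 && dst[1] == 21);  // 2nd pair kept.
  assert(dst[2] == 40 && dst[3] == 41);  // 4th pair kept.
  return 0;
}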
source/scale_neon.cc

@@ -1428,6 +1428,45 @@ void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
 
 #undef LOAD2_DATA32_LANE
 
+void ScaleUVRowDown2_NEON(const uint8_t* src_ptr,
+                          ptrdiff_t src_stride,
+                          uint8_t* dst,
+                          int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "1:                                        \n"
+      "vld2.16    {d0, d2}, [%0]!                \n"  // load 8 UV pixels.
+      "vld2.16    {d1, d3}, [%0]!                \n"  // load next 8 UV
+      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+      "vst1.16    {q1}, [%1]!                    \n"  // store 8 UV
+      "bgt        1b                             \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst),       // %1
+        "+r"(dst_width)  // %2
+      :
+      : "memory", "cc", "q0", "q1");
+}
+
+void ScaleUVRowDown2Linear_NEON(const uint8_t* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8_t* dst,
+                                int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "1:                                        \n"
+      "vld2.16    {d0, d2}, [%0]!                \n"  // load 8 UV pixels.
+      "vld2.16    {d1, d3}, [%0]!                \n"  // load next 8 UV
+      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+      "vrhadd.u8  q0, q0, q1                     \n"  // rounding half add
+      "vst1.16    {q0}, [%1]!                    \n"  // store 8 UV
+      "bgt        1b                             \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst),       // %1
+        "+r"(dst_width)  // %2
+      :
+      : "memory", "cc", "q0", "q1");
+}
+
 void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr,
                              ptrdiff_t src_stride,
                              uint8_t* dst,

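Both kernels (and their arm64 twins in the next file) lean on one de-interleave trick: each 2-byte UV pair is treated as a single 16-bit lane, so vld2.16 (ld2 on arm64) splits even-indexed pairs into q0/v0 and odd-indexed pairs into q1/v1. Storing only the odd register implements kFilterNone, keeping the 2nd of every two pairs exactly like ScaleUVRowDown2_C, and a bytewise rounding half add of the two registers implements kFilterLinear. A scalar equivalent of the linear kernel, for reference (illustrative code, assuming the register layout just described):

#include <stdint.h>

// Rounding half add on bytes, the scalar analogue of vrhadd.u8 / urhadd.
static inline uint8_t RoundingHalfAdd(uint8_t a, uint8_t b) {
  return (uint8_t)((a + b + 1) >> 1);
}

// Scalar equivalent of ScaleUVRowDown2Linear_NEON: average each pair of
// adjacent UV pixels (even lanes in "q0", odd lanes in "q1") into one output.
static void ScaleUVRowDown2Linear_Scalar(const uint8_t* src_uv,
                                         uint8_t* dst_uv,
                                         int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    dst_uv[0] = RoundingHalfAdd(src_uv[0], src_uv[2]);  // U of pairs 0 and 1.
    dst_uv[1] = RoundingHalfAdd(src_uv[1], src_uv[3]);  // V of pairs 0 and 1.
    src_uv += 4;  // Two source UV pairs consumed.
    dst_uv += 2;  // One output UV pair produced.
  }
}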
source/scale_neon64.cc

@@ -1568,6 +1568,45 @@ void ScaleRowUp2_16_NEON(const uint16_t* src_ptr,
   );
 }
 
+void ScaleUVRowDown2_NEON(const uint8_t* src_ptr,
+                          ptrdiff_t src_stride,
+                          uint8_t* dst,
+                          int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "1:                                        \n"
+      "ld2        {v0.8h,v1.8h}, [%0], #32       \n"  // load 16 UV
+      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+      "prfm       pldl1keep, [%0, 448]           \n"  // prefetch 7 lines ahead
+      "st1        {v1.8h}, [%1], #16             \n"  // store 8 UV
+      "b.gt       1b                             \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst),       // %1
+        "+r"(dst_width)  // %2
+      :
+      : "memory", "cc", "v0", "v1");
+}
+
+void ScaleUVRowDown2Linear_NEON(const uint8_t* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8_t* dst,
+                                int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "1:                                        \n"
+      "ld2        {v0.8h,v1.8h}, [%0], #32       \n"  // load 16 UV
+      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+      "urhadd     v0.16b, v0.16b, v1.16b         \n"  // rounding half add
+      "prfm       pldl1keep, [%0, 448]           \n"  // prefetch 7 lines ahead
+      "st1        {v0.8h}, [%1], #16             \n"  // store 8 UV
+      "b.gt       1b                             \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst),       // %1
+        "+r"(dst_width)  // %2
+      :
+      : "memory", "cc", "v0", "v1");
+}
+
 void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr,
                              ptrdiff_t src_stride,
                              uint8_t* dst,

source/scale_uv.cc

@@ -112,6 +112,22 @@ static void ScaleUVDown2(int src_width,
     }
   }
 #endif
+#if defined(HAS_SCALEUVROWDOWN2_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ScaleUVRowDown2 =
+        filtering == kFilterNone
+            ? ScaleUVRowDown2_Any_NEON
+            : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_Any_NEON
+                                          : ScaleUVRowDown2Box_Any_NEON);
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleUVRowDown2 =
+          filtering == kFilterNone
+              ? ScaleUVRowDown2_NEON
+              : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_NEON
+                                            : ScaleUVRowDown2Box_NEON);
+    }
+  }
+#endif
 
 // This code is not enabled. Only box filter is available at this time.
 #if defined(HAS_SCALEUVROWDOWN2_SSSE3)
@@ -130,23 +146,7 @@ static void ScaleUVDown2(int src_width,
     }
   }
 #endif
-// This code is not enabled. Only box filter is available at this time.
-#if defined(HAS_SCALEUVROWDOWN2_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ScaleUVRowDown2 =
-        filtering == kFilterNone
-            ? ScaleUVRowDown2_Any_NEON
-            : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_Any_NEON
-                                          : ScaleUVRowDown2Box_Any_NEON);
-    if (IS_ALIGNED(dst_width, 8)) {
-      ScaleUVRowDown2 =
-          filtering == kFilterNone
-              ? ScaleUVRowDown2_NEON
-              : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_NEON
-                                            : ScaleUVRowDown2Box_NEON);
-    }
-  }
-#endif
 #if defined(HAS_SCALEUVROWDOWN2_MSA)
   if (TestCpuFlag(kCpuHasMSA)) {
     ScaleUVRowDown2 =

unit_test/scale_uv_test.cc

@@ -39,55 +39,35 @@ static int UVTestFilter(int src_width,
     return 0;
   }
 
-  int i, j;
-  const int b = 0;  // 128 to test for padding/stride.
-  int64_t src_uv_plane_size =
-      (Abs(src_width) + b * 2) * (Abs(src_height) + b * 2) * 2LL;
-  int src_stride_uv = (b * 2 + Abs(src_width)) * 2;
+  int i;
+  int64_t src_uv_plane_size = Abs(src_width) * Abs(src_height) * 2LL;
+  int src_stride_uv = Abs(src_width) * 2;
+  int64_t dst_uv_plane_size = dst_width * dst_height * 2LL;
+  int dst_stride_uv = dst_width * 2;
 
   align_buffer_page_end(src_uv, src_uv_plane_size);
-  if (!src_uv) {
+  align_buffer_page_end(dst_uv_c, dst_uv_plane_size);
+  align_buffer_page_end(dst_uv_opt, dst_uv_plane_size);
+  if (!src_uv || !dst_uv_c || !dst_uv_opt) {
     printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
     return 0;
   }
   MemRandomize(src_uv, src_uv_plane_size);
-
-  int64_t dst_uv_plane_size = (dst_width + b * 2) * (dst_height + b * 2) * 2LL;
-  int dst_stride_uv = (b * 2 + dst_width) * 2;
-
-  align_buffer_page_end(dst_uv_c, dst_uv_plane_size);
-  align_buffer_page_end(dst_uv_opt, dst_uv_plane_size);
-  if (!dst_uv_c || !dst_uv_opt) {
-    printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
-    return 0;
-  }
   memset(dst_uv_c, 2, dst_uv_plane_size);
   memset(dst_uv_opt, 3, dst_uv_plane_size);
 
-  // Warm up both versions for consistent benchmarks.
-  MaskCpuFlags(disable_cpu_flags);  // Disable all CPU optimization.
-  UVScale(src_uv + (src_stride_uv * b) + b * 2, src_stride_uv, src_width,
-          src_height, dst_uv_c + (dst_stride_uv * b) + b * 2, dst_stride_uv,
-          dst_width, dst_height, f);
-  MaskCpuFlags(benchmark_cpu_info);  // Enable all CPU optimization.
-  UVScale(src_uv + (src_stride_uv * b) + b * 2, src_stride_uv, src_width,
-          src_height, dst_uv_opt + (dst_stride_uv * b) + b * 2, dst_stride_uv,
-          dst_width, dst_height, f);
-
-  memset(dst_uv_opt, 123, dst_uv_plane_size);
-
   MaskCpuFlags(disable_cpu_flags);  // Disable all CPU optimization.
   double c_time = get_time();
-  UVScale(src_uv + (src_stride_uv * b) + b * 2, src_stride_uv, src_width,
-          src_height, dst_uv_c + (dst_stride_uv * b) + b * 2, dst_stride_uv,
-          dst_width, dst_height, f);
+  UVScale(src_uv, src_stride_uv, src_width, src_height,
+          dst_uv_c, dst_stride_uv, dst_width, dst_height, f);
   c_time = (get_time() - c_time);
 
   MaskCpuFlags(benchmark_cpu_info);  // Enable all CPU optimization.
   double opt_time = get_time();
   for (i = 0; i < benchmark_iterations; ++i) {
-    UVScale(src_uv + (src_stride_uv * b) + b * 2, src_stride_uv, src_width,
-            src_height, dst_uv_opt + (dst_stride_uv * b) + b * 2, dst_stride_uv,
-            dst_width, dst_height, f);
+    UVScale(src_uv, src_stride_uv, src_width, src_height,
+            dst_uv_opt, dst_stride_uv, dst_width, dst_height, f);
   }
   opt_time = (get_time() - opt_time) / benchmark_iterations;
@@ -95,20 +75,13 @@ static int UVTestFilter(int src_width,
   printf("filter %d - %8d us C - %8d us OPT\n", f,
          static_cast<int>(c_time * 1e6), static_cast<int>(opt_time * 1e6));
 
-  // C version may be a little off from the optimized. Order of
-  // operations may introduce rounding somewhere. So do a difference
-  // of the buffers and look to see that the max difference isn't
-  // over 2.
   int max_diff = 0;
-  for (i = b; i < (dst_height + b); ++i) {
-    for (j = b * 2; j < (dst_width + b) * 2; ++j) {
-      int abs_diff = Abs(dst_uv_c[(i * dst_stride_uv) + j] -
-                         dst_uv_opt[(i * dst_stride_uv) + j]);
-      if (abs_diff > max_diff) {
-        max_diff = abs_diff;
-      }
-    }
+  for (i = 0; i < dst_uv_plane_size; ++i) {
+    int abs_diff = Abs(dst_uv_c[i] - dst_uv_opt[i]);
+    if (abs_diff > max_diff) {
+      max_diff = abs_diff;
+    }
   }
 
   free_aligned_buffer_page_end(dst_uv_c);
   free_aligned_buffer_page_end(dst_uv_opt);
@@ -121,28 +94,27 @@ static int UVTestFilter(int src_width,
 #define DX(x, nom, denom) static_cast<int>((Abs(x) / nom) * nom)
 #define SX(x, nom, denom) static_cast<int>((x / nom) * denom)
 
-#define TEST_FACTOR1(name, filter, nom, denom, max_diff) \
+#define TEST_FACTOR1(name, filter, nom, denom) \
   TEST_F(LibYUVScaleTest, UVScaleDownBy##name##_##filter) { \
     int diff = UVTestFilter( \
         SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
         DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
         kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \
         benchmark_cpu_info_); \
-    EXPECT_LE(diff, max_diff); \
+    EXPECT_EQ(0, diff); \
   }
 
 #if defined(ENABLE_FULL_TESTS)
-// Test a scale factor with all 4 filters. Expect unfiltered to be exact, but
-// filtering is different fixed point implementations for SSSE3, Neon and C.
+// Test a scale factor with all 4 filters. Expect exact for SIMD vs C.
 #define TEST_FACTOR(name, nom, denom) \
-  TEST_FACTOR1(name, None, nom, denom, 0) \
-  TEST_FACTOR1(name, Linear, nom, denom, 3) \
-  TEST_FACTOR1(name, Bilinear, nom, denom, 3) \
-  TEST_FACTOR1(name, Box, nom, denom, 3)
+  TEST_FACTOR1(name, None, nom, denom) \
+  TEST_FACTOR1(name, Linear, nom, denom) \
+  TEST_FACTOR1(name, Bilinear, nom, denom) \
+  TEST_FACTOR1(name, Box, nom, denom)
 #else
 // Test a scale factor with Bilinear.
 #define TEST_FACTOR(name, nom, denom) \
-  TEST_FACTOR1(name, Bilinear, nom, denom, 3)
+  TEST_FACTOR1(name, Bilinear, nom, denom)
 #endif
 
 TEST_FACTOR(2, 1, 2)
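For reference, TEST_FACTOR(2, 1, 2) instantiates one googletest case per filter under ENABLE_FULL_TESTS; hand-expanding the None case with the macros above gives (illustrative expansion, SX/DX left as written):

// Hand expansion of TEST_FACTOR(2, 1, 2) for the None filter; the macro
// also emits Linear, Bilinear and Box variants of the same shape.
TEST_F(LibYUVScaleTest, UVScaleDownBy2_None) {
  int diff = UVTestFilter(
      SX(benchmark_width_, 1, 2), SX(benchmark_height_, 1, 2),
      DX(benchmark_width_, 1, 2), DX(benchmark_height_, 1, 2),
      kFilterNone, benchmark_iterations_, disable_cpu_flags_,
      benchmark_cpu_info_);
  EXPECT_EQ(0, diff);
}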