[AArch64] Add SVE2 and SME implementations for Convert8To8Row

SVE can make use of the UMULH instruction to avoid needing separate
widening multiply and narrowing steps for the scale application.

Reduction in runtime for Convert8To8Row_SVE2 observed compared to the
existing Neon implementation:

        Cortex-A510: -13.2%
        Cortex-A520: -16.4%
        Cortex-A710: -37.1%
        Cortex-A715: -38.5%
        Cortex-A720: -38.4%
          Cortex-X2: -33.2%
          Cortex-X3: -31.8%
          Cortex-X4: -31.8%
        Cortex-X925: -13.9%

Change-Id: I17c0cb81661c5fbce786b47cdf481549cfdcbfc7
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6207692
Reviewed-by: Wan-Teh Chang <wtc@google.com>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
George Steed 2025-01-28 10:21:16 +00:00 committed by Frank Barchard
parent eacb08c83e
commit c4a0c8d34a
5 changed files with 92 additions and 0 deletions

View File

@@ -553,6 +553,7 @@ extern "C" {
#define HAS_AYUVTOUVROW_SVE2
#define HAS_AYUVTOVUROW_SVE2
#define HAS_BGRATOUVROW_SVE2
#define HAS_CONVERT8TO8ROW_SVE2
#define HAS_DIVIDEROW_16_SVE2
#define HAS_HALFFLOATROW_SVE2
#define HAS_I210ALPHATOARGBROW_SVE2
@@ -595,6 +596,7 @@ extern "C" {
defined(__aarch64__)
#define HAS_ARGBMULTIPLYROW_SME
#define HAS_CONVERT16TO8ROW_SME
#define HAS_CONVERT8TO8ROW_SME
#define HAS_COPYROW_SME
#define HAS_I210ALPHATOARGBROW_SME
#define HAS_I210TOAR30ROW_SME
@@ -3658,6 +3660,16 @@ void Convert8To8Row_Any_NEON(const uint8_t* src_ptr,
int scale,
int bias,
int width);
void Convert8To8Row_SVE2(const uint8_t* src_y,
uint8_t* dst_y,
int scale,
int bias,
int width);
void Convert8To8Row_SME(const uint8_t* src_y,
uint8_t* dst_y,
int scale,
int bias,
int width);
void Convert8To8Row_AVX2(const uint8_t* src_y,
uint8_t* dst_y,
int scale,

View File

@@ -1725,6 +1725,60 @@ static inline void I212ToARGBRow_SVE_SC(const uint16_t* src_y,
: "cc", "memory", YUVTORGB_SVE_REGS);
}
#define CONVERT8TO8_SVE \
"ld1b {z0.b}, p0/z, [%[src]] \n" \
"ld1b {z1.b}, p1/z, [%[src], #1, mul vl] \n" \
"incb %[src], all, mul #2 \n" \
"subs %w[width], %w[width], %w[vl], lsl #1 \n" \
"umulh z0.b, z0.b, z2.b \n" \
"umulh z1.b, z1.b, z2.b \n" \
"prfm pldl1keep, [%[src], 448] \n" \
"add z0.b, z0.b, z3.b \n" \
"add z1.b, z1.b, z3.b \n" \
"st1b {z0.b}, p0, [%[dst]] \n" \
"st1b {z1.b}, p1, [%[dst], #1, mul vl] \n" \
"incb %[dst], all, mul #2 \n"
static inline void Convert8To8Row_SVE_SC(const uint8_t* src_y,
uint8_t* dst_y,
int scale,
int bias,
int width) STREAMING_COMPATIBLE {
uint64_t vl;
asm volatile(
"dup z2.b, %w[scale] \n"
"dup z3.b, %w[bias] \n"
"cntb %[vl] \n"
"subs %w[width], %w[width], %w[vl], lsl #1 \n"
"b.lt 2f \n"
// Run bulk of computation with all-true predicates to avoid predicate
// generation overhead.
"ptrue p0.b \n"
"ptrue p1.b \n"
"1: \n" //
CONVERT8TO8_SVE
"b.ge 1b \n"
"2: \n"
"adds %w[width], %w[width], %w[vl], lsl #1 \n"
"b.eq 99f \n"
// Calculate predicates for the final iteration to deal with the tail.
"whilelt p0.b, wzr, %w2 \n"
"whilelt p1.b, %w[vl], %w2 \n" //
CONVERT8TO8_SVE
"99: \n"
: [src] "+r"(src_y), // %[src]
[dst] "+r"(dst_y), // %[dst]
[width] "+r"(width), // %[width]
[vl] "=&r"(vl) // %[vl]
: [scale] "r"(scale), // %[scale]
[bias] "r"(bias) // %[bias]
: "cc", "memory", "z0", "z1", "z2", "z3", "p0", "p1");
}
#endif // !defined(LIBYUV_DISABLE_SVE) && defined(__aarch64__)
#ifdef __cplusplus

View File

@@ -271,6 +271,16 @@ void Convert8To8Plane(const uint8_t* src_y,
}
}
#endif
#if defined(HAS_CONVERT8TO8ROW_SVE2)
if (TestCpuFlag(kCpuHasSVE2)) {
Convert8To8Row = Convert8To8Row_SVE2;
}
#endif
#if defined(HAS_CONVERT8TO8ROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
Convert8To8Row = Convert8To8Row_SME;
}
#endif
#if defined(HAS_CONVERT8TO8ROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
Convert8To8Row = Convert8To8Row_Any_AVX2;

View File

@@ -1075,6 +1075,14 @@ __arm_locally_streaming void InterpolateRow_16To8_SME(uint8_t* dst_ptr,
: "cc", "memory", "z0", "z1", "z2", "z3", "z4", "z31", "p0");
}
__arm_locally_streaming void Convert8To8Row_SME(const uint8_t* src_y,
uint8_t* dst_y,
int scale,
int bias,
int width) {
Convert8To8Row_SVE_SC(src_y, dst_y, scale, bias, width);
}
#endif // !defined(LIBYUV_DISABLE_SME) && defined(CLANG_HAS_SME) &&
// defined(__aarch64__)

View File

@@ -1237,6 +1237,14 @@ void I212ToARGBRow_SVE2(const uint16_t* src_y,
I212ToARGBRow_SVE_SC(src_y, src_u, src_v, dst_argb, yuvconstants, width);
}
void Convert8To8Row_SVE2(const uint8_t* src_y,
uint8_t* dst_y,
int scale,
int bias,
int width) {
Convert8To8Row_SVE_SC(src_y, dst_y, scale, bias, width);
}
#endif // !defined(LIBYUV_DISABLE_SVE) && defined(__aarch64__)
#ifdef __cplusplus