From c4a0c8d34aa28cec37fd4ac9ea6fd2a1eb187c10 Mon Sep 17 00:00:00 2001
From: George Steed
Date: Tue, 28 Jan 2025 10:21:16 +0000
Subject: [PATCH] [AArch64] Add SVE2 and SME implementations for Convert8To8Row

SVE can make use of the UMULH instruction to avoid needing separate
widening multiply and narrowing steps for the scale application.
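As an illustration (not part of the patch), the per-pixel operation being
vectorized is dst = ((src * scale) >> 8) + bias, as can be read off the
UMULH/ADD sequence below. A minimal scalar sketch, with a hypothetical
helper name, showing what UMULH on byte lanes fuses:

  #include <stdint.h>

  // Scalar model of one row (hypothetical helper, for illustration only).
  static void Convert8To8Row_ScalarModel(const uint8_t* src_y, uint8_t* dst_y,
                                         int scale, int bias, int width) {
    for (int x = 0; x < width; ++x) {
      // UMULH z0.b, z0.b, z2.b returns the high byte of the 8x8->16 bit
      // unsigned product, i.e. (src * scale) >> 8, in one instruction, so
      // no separate widening multiply (UMULL) and narrowing shift (SHRN)
      // are needed as in the Neon version.
      uint8_t scaled = (uint8_t)(((uint32_t)src_y[x] * (uint8_t)scale) >> 8);
      // ADD z0.b, z0.b, z3.b then applies the bias (modulo 256; the byte
      // vector ADD wraps rather than saturates).
      dst_y[x] = (uint8_t)(scaled + (uint8_t)bias);
    }
  }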
+ "whilelt p0.b, wzr, %w2 \n" + "whilelt p1.b, %w[vl], %w2 \n" // + CONVERT8TO8_SVE + + "99: \n" + : [src] "+r"(src_y), // %[src] + [dst] "+r"(dst_y), // %[dst] + [width] "+r"(width), // %[width] + [vl] "=&r"(vl) // %[vl] + : [scale] "r"(scale), // %[scale] + [bias] "r"(bias) // %[bias] + : "cc", "memory", "z0", "z1", "z2", "z3", "p0", "p1"); +} + #endif // !defined(LIBYUV_DISABLE_SVE) && defined(__aarch64__) #ifdef __cplusplus diff --git a/source/planar_functions.cc b/source/planar_functions.cc index ca0bfea90..c2d4b67a4 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -271,6 +271,16 @@ void Convert8To8Plane(const uint8_t* src_y, } } #endif +#if defined(HAS_CONVERT8TO8ROW_SVE2) + if (TestCpuFlag(kCpuHasSVE2)) { + Convert8To8Row = Convert8To8Row_SVE2; + } +#endif +#if defined(HAS_CONVERT8TO8ROW_SME) + if (TestCpuFlag(kCpuHasSME)) { + Convert8To8Row = Convert8To8Row_SME; + } +#endif #if defined(HAS_CONVERT8TO8ROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { Convert8To8Row = Convert8To8Row_Any_AVX2; diff --git a/source/row_sme.cc b/source/row_sme.cc index 561b7f73d..1cbc42f3e 100644 --- a/source/row_sme.cc +++ b/source/row_sme.cc @@ -1075,6 +1075,14 @@ __arm_locally_streaming void InterpolateRow_16To8_SME(uint8_t* dst_ptr, : "cc", "memory", "z0", "z1", "z2", "z3", "z4", "z31", "p0"); } +__arm_locally_streaming void Convert8To8Row_SME(const uint8_t* src_y, + uint8_t* dst_y, + int scale, + int bias, + int width) { + Convert8To8Row_SVE_SC(src_y, dst_y, scale, bias, width); +} + #endif // !defined(LIBYUV_DISABLE_SME) && defined(CLANG_HAS_SME) && // defined(__aarch64__) diff --git a/source/row_sve.cc b/source/row_sve.cc index 8076c9ebc..0bab8e16f 100644 --- a/source/row_sve.cc +++ b/source/row_sve.cc @@ -1237,6 +1237,14 @@ void I212ToARGBRow_SVE2(const uint16_t* src_y, I212ToARGBRow_SVE_SC(src_y, src_u, src_v, dst_argb, yuvconstants, width); } +void Convert8To8Row_SVE2(const uint8_t* src_y, + uint8_t* dst_y, + int scale, + int bias, + int width) { + Convert8To8Row_SVE_SC(src_y, dst_y, scale, bias, width); +} + #endif // !defined(LIBYUV_DISABLE_SVE) && defined(__aarch64__) #ifdef __cplusplus