From 949cb623bf904c5e7a7c060ab0ae609574870fb3 Mon Sep 17 00:00:00 2001
From: George Steed <george.steed@arm.com>
Date: Sun, 23 Mar 2025 10:13:57 +0000
Subject: [PATCH] Add SVE2 and SME implementations of I444ToRGB24Row

Move the READYUV444_SVE_2X and I444TORGB_SVE_2X macros to row_sve.h so
they are usable in both SVE2 and SME implementations, and use them to
add new I444ToRGB24Row implementations for SVE2 and SME. We need to use
the unrolled versions here to use the ST3B interleaving store
instructions, since there is no partial vector version of this store
instruction.

Reduction in time taken observed for the new SVE2 implementation,
compared to the existing Neon implementation:

Cortex-A510: -57.6%
Cortex-A520: -38.1%
Cortex-A710: -15.5%
Cortex-A715:  -9.2%
Cortex-A720:  -9.2%
  Cortex-X2: -25.8%
  Cortex-X3: -26.2%
  Cortex-X4: -23.2%
Cortex-X925: -17.8%

Change-Id: I6acd0b798a35e5352d4fad664769f12d3d938ed7
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6530646
Reviewed-by: Justin Green <greenjustin@google.com>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
---
 include/libyuv/row.h     | 14 +++++++
 include/libyuv/row_sve.h | 86 ++++++++++++++++++++++++++++++++++++++++
 source/convert_argb.cc   | 30 ++++++++++++++
 source/row_sme.cc        | 49 +++++------------------
 source/row_sve.cc        |  9 +++++
 5 files changed, 149 insertions(+), 39 deletions(-)

diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 46cec2723..e83773f11 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -597,6 +597,7 @@ extern "C" {
 #define HAS_I422TORGBAROW_SVE2
 #define HAS_I444ALPHATOARGBROW_SVE2
 #define HAS_I444TOARGBROW_SVE2
+#define HAS_I444TORGB24ROW_SVE2
 #define HAS_NV12TOARGBROW_SVE2
 #define HAS_NV12TORGB24ROW_SVE2
 #define HAS_NV21TOARGBROW_SVE2
@@ -639,6 +640,7 @@ extern "C" {
 #define HAS_I422TORGBAROW_SME
 #define HAS_I444ALPHATOARGBROW_SME
 #define HAS_I444TOARGBROW_SME
+#define HAS_I444TORGB24ROW_SME
 #define HAS_INTERPOLATEROW_16_SME
 #define HAS_INTERPOLATEROW_16TO8_SME
 #define HAS_INTERPOLATEROW_SME
@@ -1217,6 +1219,18 @@ void I444ToRGB24Row_NEON(const uint8_t* src_y,
                          uint8_t* dst_rgb24,
                          const struct YuvConstants* yuvconstants,
                          int width);
+void I444ToRGB24Row_SVE2(const uint8_t* src_y,
+                         const uint8_t* src_u,
+                         const uint8_t* src_v,
+                         uint8_t* dst_rgb24,
+                         const struct YuvConstants* yuvconstants,
+                         int width);
+void I444ToRGB24Row_SME(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_rgb24,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
 void I210ToARGBRow_NEON(const uint16_t* src_y,
                         const uint16_t* src_u,
                         const uint16_t* src_v,
diff --git a/include/libyuv/row_sve.h b/include/libyuv/row_sve.h
index 82103419c..5c0c210df 100644
--- a/include/libyuv/row_sve.h
+++ b/include/libyuv/row_sve.h
@@ -51,6 +51,21 @@ extern "C" {
   "trn1       z0.b, z0.b, z0.b               \n" \
   "prfm       pldl1keep, [%[src_v], 448]     \n"
 
+// Read twice as much data from YUV, putting the even elements from the Y data
+// in z0.h and odd elements in z1.h.
+#define READYUV444_SVE_2X                        \
+  "ld1b       {z0.b}, p1/z, [%[src_y]]       \n" \
+  "ld1b       {z2.b}, p1/z, [%[src_u]]       \n" \
+  "ld1b       {z3.b}, p1/z, [%[src_v]]       \n" \
+  "incb       %[src_y]                       \n" \
+  "incb       %[src_u]                       \n" \
+  "incb       %[src_v]                       \n" \
+  "prfm       pldl1keep, [%[src_y], 448]     \n" \
+  "prfm       pldl1keep, [%[src_u], 128]     \n" \
+  "prfm       pldl1keep, [%[src_v], 128]     \n" \
+  "trn2       z1.b, z0.b, z0.b               \n" \
+  "trn1       z0.b, z0.b, z0.b               \n"
+
 #define READYUV400_SVE                           \
   "ld1b       {z0.h}, p1/z, [%[src_y]]       \n" \
   "inch       %[src_y]                       \n" \
@@ -193,6 +208,30 @@ extern "C" {
   "uqsub      z16.h, z16.h, z25.h            \n" /* B */  \
   "uqsub      z18.h, z18.h, z27.h            \n" /* R */
 
+#define I444TORGB_SVE_2X                                  \
+  "umulh      z0.h, z24.h, z0.h              \n" /* Y0 */ \
+  "umulh      z1.h, z24.h, z1.h              \n" /* Y1 */ \
+  "umullb     z6.h, z30.b, z2.b              \n"          \
+  "umullt     z7.h, z30.b, z2.b              \n"          \
+  "umullb     z4.h, z28.b, z2.b              \n" /* DB */ \
+  "umullt     z2.h, z28.b, z2.b              \n" /* DB */ \
+  "umlalb     z6.h, z31.b, z3.b              \n" /* DG */ \
+  "umlalt     z7.h, z31.b, z3.b              \n" /* DG */ \
+  "umullb     z5.h, z29.b, z3.b              \n" /* DR */ \
+  "umullt     z3.h, z29.b, z3.b              \n" /* DR */ \
+  "add        z17.h, z0.h, z26.h             \n" /* G */  \
+  "add        z21.h, z1.h, z26.h             \n" /* G */  \
+  "add        z16.h, z0.h, z4.h              \n" /* B */  \
+  "add        z20.h, z1.h, z2.h              \n" /* B */  \
+  "add        z18.h, z0.h, z5.h              \n" /* R */  \
+  "add        z22.h, z1.h, z3.h              \n" /* R */  \
+  "uqsub      z17.h, z17.h, z6.h             \n" /* G */  \
+  "uqsub      z21.h, z21.h, z7.h             \n" /* G */  \
+  "uqsub      z16.h, z16.h, z25.h            \n" /* B */  \
+  "uqsub      z20.h, z20.h, z25.h            \n" /* B */  \
+  "uqsub      z18.h, z18.h, z27.h            \n" /* R */  \
+  "uqsub      z22.h, z22.h, z27.h            \n" /* R */
+
 // Like I4XXTORGB_SVE but U/V components are stored in even/odd .b lanes of z1
 // rather than widened .h elements of z1/z2.
 #define NVTORGB_SVE                                       \
@@ -318,6 +357,53 @@ extern "C" {
       "z20", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30",   \
       "z31", "p0", "p1", "p2", "p3"
 
+static inline void I444ToRGB24Row_SVE_SC(
+    const uint8_t* src_y,
+    const uint8_t* src_u,
+    const uint8_t* src_v,
+    uint8_t* dst_rgb24,
+    const struct YuvConstants* yuvconstants,
+    int width) STREAMING_COMPATIBLE {
+  uint64_t vl;
+  asm volatile(
+      "cntb     %[vl]                                     \n"
+      "ptrue    p0.b                                      \n"  //
+      YUVTORGB_SVE_SETUP
+      "subs     %w[width], %w[width], %w[vl]              \n"
+      "b.lt     2f                                        \n"
+
+      // Run bulk of computation with an all-true predicate to avoid predicate
+      // generation overhead.
+      "ptrue    p1.b                                      \n"
+      "1:                                                 \n"  //
+      READYUV444_SVE_2X I444TORGB_SVE_2X RGBTOARGB8_SVE_2X
+      "subs     %w[width], %w[width], %w[vl]              \n"
+      "st3b     {z16.b, z17.b, z18.b}, p1, [%[dst_rgb24]] \n"
+      "incb     %[dst_rgb24], all, mul #3                 \n"
+      "b.ge     1b                                        \n"
+
+      "2:                                                 \n"
+      "adds     %w[width], %w[width], %w[vl]              \n"
+      "b.eq     99f                                       \n"
+
+      // Calculate a predicate for the final iteration to deal with the tail.
+      "cnth     %[vl]                                     \n"
+      "whilelt  p1.b, wzr, %w[width]                      \n"  //
+      READYUV444_SVE_2X I444TORGB_SVE_2X RGBTOARGB8_SVE_2X
+      "st3b     {z16.b, z17.b, z18.b}, p1, [%[dst_rgb24]] \n"
+
+      "99:                                                \n"
+      : [src_y] "+r"(src_y),                               // %[src_y]
+        [src_u] "+r"(src_u),                               // %[src_u]
+        [src_v] "+r"(src_v),                               // %[src_v]
+        [dst_rgb24] "+r"(dst_rgb24),                       // %[dst_rgb24]
+        [width] "+r"(width),                               // %[width]
+        [vl] "=&r"(vl)                                     // %[vl]
+      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
+        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
+      : "cc", "memory", YUVTORGB_SVE_REGS);
+}
+
 static inline void I400ToARGBRow_SVE_SC(const uint8_t* src_y,
                                         uint8_t* dst_argb,
                                         const struct YuvConstants* yuvconstants,
diff --git a/source/convert_argb.cc b/source/convert_argb.cc
index 41997fe3b..e9346f3e8 100644
--- a/source/convert_argb.cc
+++ b/source/convert_argb.cc
@@ -871,6 +871,16 @@ int I444ToRGB24Matrix(const uint8_t* src_y,
     }
   }
 #endif
+#if defined(HAS_I444TORGB24ROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    I444ToRGB24Row = I444ToRGB24Row_SVE2;
+  }
+#endif
+#if defined(HAS_I444TORGB24ROW_SME)
+  if (TestCpuFlag(kCpuHasSME)) {
+    I444ToRGB24Row = I444ToRGB24Row_SME;
+  }
+#endif
 #if defined(HAS_I444TORGB24ROW_MSA)
   if (TestCpuFlag(kCpuHasMSA)) {
     I444ToRGB24Row = I444ToRGB24Row_Any_MSA;
@@ -7127,6 +7137,16 @@ static int I420ToRGB24MatrixBilinear(const uint8_t* src_y,
     }
   }
 #endif
+#if defined(HAS_I444TORGB24ROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    I444ToRGB24Row = I444ToRGB24Row_SVE2;
+  }
+#endif
+#if defined(HAS_I444TORGB24ROW_SME)
+  if (TestCpuFlag(kCpuHasSME)) {
+    I444ToRGB24Row = I444ToRGB24Row_SME;
+  }
+#endif
 #if defined(HAS_I444TORGB24ROW_MSA)
   if (TestCpuFlag(kCpuHasMSA)) {
     I444ToRGB24Row = I444ToRGB24Row_Any_MSA;
@@ -8952,6 +8972,16 @@ static int I422ToRGB24MatrixLinear(const uint8_t* src_y,
     }
   }
 #endif
+#if defined(HAS_I444TORGB24ROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    I444ToRGB24Row = I444ToRGB24Row_SVE2;
+  }
+#endif
+#if defined(HAS_I444TORGB24ROW_SME)
+  if (TestCpuFlag(kCpuHasSME)) {
+    I444ToRGB24Row = I444ToRGB24Row_SME;
+  }
+#endif
 #if defined(HAS_I444TORGB24ROW_RVV)
   if (TestCpuFlag(kCpuHasRVV)) {
     I444ToRGB24Row = I444ToRGB24Row_RVV;
diff --git a/source/row_sme.cc b/source/row_sme.cc
index 1cbc42f3e..c6917bf3c 100644
--- a/source/row_sme.cc
+++ b/source/row_sme.cc
@@ -19,45 +19,6 @@ extern "C" {
 #if !defined(LIBYUV_DISABLE_SME) && defined(CLANG_HAS_SME) && \
     defined(__aarch64__)
 
-// Read twice as much data from YUV, putting the even elements from the Y data
-// in z0.h and odd elements in z1.h.
-#define READYUV444_SVE_2X                        \
-  "ld1b       {z0.b}, p1/z, [%[src_y]]       \n" \
-  "ld1b       {z2.b}, p1/z, [%[src_u]]       \n" \
-  "ld1b       {z3.b}, p1/z, [%[src_v]]       \n" \
-  "incb       %[src_y]                       \n" \
-  "incb       %[src_u]                       \n" \
-  "incb       %[src_v]                       \n" \
-  "prfm       pldl1keep, [%[src_y], 448]     \n" \
-  "prfm       pldl1keep, [%[src_u], 128]     \n" \
-  "prfm       pldl1keep, [%[src_v], 128]     \n" \
-  "trn2       z1.b, z0.b, z0.b               \n" \
-  "trn1       z0.b, z0.b, z0.b               \n"
-
-#define I444TORGB_SVE_2X                                  \
-  "umulh      z0.h, z24.h, z0.h              \n" /* Y0 */ \
-  "umulh      z1.h, z24.h, z1.h              \n" /* Y1 */ \
-  "umullb     z6.h, z30.b, z2.b              \n"          \
-  "umullt     z7.h, z30.b, z2.b              \n"          \
-  "umullb     z4.h, z28.b, z2.b              \n" /* DB */ \
-  "umullt     z2.h, z28.b, z2.b              \n" /* DB */ \
-  "umlalb     z6.h, z31.b, z3.b              \n" /* DG */ \
-  "umlalt     z7.h, z31.b, z3.b              \n" /* DG */ \
-  "umullb     z5.h, z29.b, z3.b              \n" /* DR */ \
-  "umullt     z3.h, z29.b, z3.b              \n" /* DR */ \
-  "add        z17.h, z0.h, z26.h             \n" /* G */  \
-  "add        z21.h, z1.h, z26.h             \n" /* G */  \
-  "add        z16.h, z0.h, z4.h              \n" /* B */  \
-  "add        z20.h, z1.h, z2.h              \n" /* B */  \
-  "add        z18.h, z0.h, z5.h              \n" /* R */  \
-  "add        z22.h, z1.h, z3.h              \n" /* R */  \
-  "uqsub      z17.h, z17.h, z6.h             \n" /* G */  \
-  "uqsub      z21.h, z21.h, z7.h             \n" /* G */  \
-  "uqsub      z16.h, z16.h, z25.h            \n" /* B */  \
-  "uqsub      z20.h, z20.h, z25.h            \n" /* B */  \
-  "uqsub      z18.h, z18.h, z27.h            \n" /* R */  \
-  "uqsub      z22.h, z22.h, z27.h            \n" /* R */
-
 #define RGBTOARGB8_SVE_2X                                 \
   /* Inputs: B: z16.h,  G: z17.h,  R: z18.h,  A: z19.b */ \
   "uqshrnb     z16.b, z16.h, #6     \n" /* B0 */          \
@@ -115,6 +76,16 @@ __arm_locally_streaming void I444ToARGBRow_SME(
       : "cc", "memory", YUVTORGB_SVE_REGS);
 }
 
+__arm_locally_streaming void I444ToRGB24Row_SME(
+    const uint8_t* src_y,
+    const uint8_t* src_u,
+    const uint8_t* src_v,
+    uint8_t* dst_rgb24,
+    const struct YuvConstants* yuvconstants,
+    int width) {
+  I444ToRGB24Row_SVE_SC(src_y, src_u, src_v, dst_rgb24, yuvconstants, width);
+}
+
 __arm_locally_streaming void I400ToARGBRow_SME(
     const uint8_t* src_y,
     uint8_t* dst_argb,
diff --git a/source/row_sve.cc b/source/row_sve.cc
index ba89b163a..474c7950c 100644
--- a/source/row_sve.cc
+++ b/source/row_sve.cc
@@ -82,6 +82,15 @@ void I444ToARGBRow_SVE2(const uint8_t* src_y,
       : "cc", "memory", YUVTORGB_SVE_REGS);
 }
 
+void I444ToRGB24Row_SVE2(const uint8_t* src_y,
+                         const uint8_t* src_u,
+                         const uint8_t* src_v,
+                         uint8_t* dst_rgb24,
+                         const struct YuvConstants* yuvconstants,
+                         int width) {
+  I444ToRGB24Row_SVE_SC(src_y, src_u, src_v, dst_rgb24, yuvconstants, width);
+}
+
 void I400ToARGBRow_SVE2(const uint8_t* src_y,
                         uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,