From 96bbdb53ed6b5bdf2e940f6068016a78afcc7852 Mon Sep 17 00:00:00 2001
From: George Steed <george.steed@arm.com>
Date: Tue, 23 Apr 2024 08:22:57 +0100
Subject: [PATCH] [AArch64] Add SVE2 implementation of I422ToRGBARow

This is almost identical to the existing I422ToARGBRow_SVE2 kernel; we
just need to interleave differently for the output.

The RGBA format actually saves us an instruction compared to ARGB: since
there is no need to merge in the alpha component, we can just replace
the odd elements of the alpha vector itself during the narrowing.

Also rename some existing macros to make more sense when distinguishing
between ARGB and RGBA.

Reductions in runtime observed compared to the existing Neon code:

Cortex-A510: -27.0%
Cortex-A720:  -5.3%
  Cortex-X2: -14.7%

Bug: libyuv:973
Change-Id: I1e12ff608ee49c25b918097007e16d87b39cb067
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5593797
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
---
 include/libyuv/row.h   |  7 ++++
 source/convert_argb.cc | 10 +++++
 source/row_sve.cc      | 92 +++++++++++++++++++++++++++++++++---------
 3 files changed, 90 insertions(+), 19 deletions(-)

diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 6a7d30c0d..5625a9f25 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -585,6 +585,7 @@ extern "C" {
 #define HAS_BGRATOUVROW_SVE2
 #define HAS_I422ALPHATOARGBROW_SVE2
 #define HAS_I422TOARGBROW_SVE2
+#define HAS_I422TORGBAROW_SVE2
 #define HAS_I444ALPHATOARGBROW_SVE2
 #define HAS_I444TOARGBROW_SVE2
 #define HAS_RGBATOUVROW_SVE2
@@ -1154,6 +1155,12 @@ void I422ToRGBARow_NEON(const uint8_t* src_y,
                         uint8_t* dst_rgba,
                         const struct YuvConstants* yuvconstants,
                         int width);
+void I422ToRGBARow_SVE2(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_rgba,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
 void I422ToRGB24Row_NEON(const uint8_t* src_y,
                          const uint8_t* src_u,
                          const uint8_t* src_v,
diff --git a/source/convert_argb.cc b/source/convert_argb.cc
index 11c7f3cc4..2d6aec160 100644
--- a/source/convert_argb.cc
+++ b/source/convert_argb.cc
@@ -4906,6 +4906,11 @@ int I422ToRGBAMatrix(const uint8_t* src_y,
     }
   }
 #endif
+#if defined(HAS_I422TORGBAROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    I422ToRGBARow = I422ToRGBARow_SVE2;
+  }
+#endif
 #if defined(HAS_I422TORGBAROW_MSA)
   if (TestCpuFlag(kCpuHasMSA)) {
     I422ToRGBARow = I422ToRGBARow_Any_MSA;
@@ -5134,6 +5139,11 @@ int I420ToRGBAMatrix(const uint8_t* src_y,
     }
   }
 #endif
+#if defined(HAS_I422TORGBAROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    I422ToRGBARow = I422ToRGBARow_SVE2;
+  }
+#endif
 #if defined(HAS_I422TORGBAROW_MSA)
   if (TestCpuFlag(kCpuHasMSA)) {
     I422ToRGBARow = I422ToRGBARow_Any_MSA;
diff --git a/source/row_sve.cc b/source/row_sve.cc
index 4ef937d1c..79376a51a 100644
--- a/source/row_sve.cc
+++ b/source/row_sve.cc
@@ -66,17 +66,27 @@ extern "C" {
   "uqsub      z16.h, z16.h, z25.h            \n" /* B */  \
   "uqsub      z18.h, z18.h, z27.h            \n" /* R */
 
-// Convert from 2.14 fixed point RGB to 8 bit RGBA, interleaving as BG and RA
+// Convert from 2.14 fixed point RGB to 8 bit ARGB, interleaving as BG and RA
 // pairs to allow us to use ST2 for storing rather than ST4.
-#define RGBTORGBA8_SVE                  \
-  "uqshrnb     z16.b, z16.h, #6     \n" \
-  "uqshrnb     z18.b, z18.h, #6     \n" \
-  "uqshrnt     z16.b, z17.h, #6     \n" \
-  "trn1        z17.b, z18.b, z19.b  \n"
+#define RGBTOARGB8_SVE                                    \
+  /* Inputs: B: z16.h,  G: z17.h,  R: z18.h,  A: z19.b */ \
+  "uqshrnb     z16.b, z16.h, #6     \n" /* B0 */          \
+  "uqshrnb     z18.b, z18.h, #6     \n" /* R0 */          \
+  "uqshrnt     z16.b, z17.h, #6     \n" /* z16 = BG */    \
+  "trn1        z17.b, z18.b, z19.b  \n" /* z17 = RA */
+
+// Convert from 2.14 fixed point RGB to 8 bit RGBA, interleaving as AB and GR
+// pairs to allow us to use ST2 for storing rather than ST4.
+#define RGBTORGBA8_SVE                                    \
+  /* Inputs: B: z16.h,  G: z17.h,  R: z18.h,  A: z19.b */ \
+  "uqshrnt     z19.b, z16.h, #6     \n" /* z19 = AB */    \
+  "uqshrnb     z20.b, z17.h, #6     \n" /* G0 */          \
+  "uqshrnt     z20.b, z18.h, #6     \n" /* z20 = GR */
 
 #define YUVTORGB_SVE_REGS                                                     \
   "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z16", "z17", "z18", "z19", \
-      "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "p0", "p1"
+      "z20", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "p0",    \
+      "p1"
 
 void I444ToARGBRow_SVE2(const uint8_t* src_y,
                         const uint8_t* src_u,
@@ -95,7 +105,7 @@ void I444ToARGBRow_SVE2(const uint8_t* src_y,
       // generation overhead.
       "ptrue    p1.h                                    \n"
       "1:                                               \n" READYUV444_SVE
-          I4XXTORGB_SVE RGBTORGBA8_SVE
+          I4XXTORGB_SVE RGBTOARGB8_SVE
       "subs     %w[width], %w[width], %w[vl]            \n"
       "st2h     {z16.h, z17.h}, p1, [%[dst_argb]]       \n"
       "add      %[dst_argb], %[dst_argb], %[vl], lsl #2 \n"
@@ -107,7 +117,7 @@ void I444ToARGBRow_SVE2(const uint8_t* src_y,
 
       // Calculate a predicate for the final iteration to deal with the tail.
       "whilelt  p1.h, wzr, %w[width]                    \n" READYUV444_SVE
-          I4XXTORGB_SVE RGBTORGBA8_SVE
+          I4XXTORGB_SVE RGBTOARGB8_SVE
       "st2h     {z16.h, z17.h}, p1, [%[dst_argb]]       \n"
 
       "99:                                              \n"
@@ -139,7 +149,7 @@ void I422ToARGBRow_SVE2(const uint8_t* src_y,
       // generation overhead.
       "ptrue    p1.h                                    \n"
       "1:                                               \n" READYUV422_SVE
-          I4XXTORGB_SVE RGBTORGBA8_SVE
+          I4XXTORGB_SVE RGBTOARGB8_SVE
       "subs     %w[width], %w[width], %w[vl]            \n"
       "st2h     {z16.h, z17.h}, p1, [%[dst_argb]]       \n"
       "add      %[dst_argb], %[dst_argb], %[vl], lsl #2 \n"
@@ -151,7 +161,7 @@ void I422ToARGBRow_SVE2(const uint8_t* src_y,
 
       // Calculate a predicate for the final iteration to deal with the tail.
       "whilelt  p1.h, wzr, %w[width]                    \n" READYUV422_SVE
-          I4XXTORGB_SVE RGBTORGBA8_SVE
+          I4XXTORGB_SVE RGBTOARGB8_SVE
       "st2h     {z16.h, z17.h}, p1, [%[dst_argb]]       \n"
 
       "99:                                              \n"
@@ -166,6 +176,50 @@ void I422ToARGBRow_SVE2(const uint8_t* src_y,
       : "cc", "memory", YUVTORGB_SVE_REGS);
 }
 
+void I422ToRGBARow_SVE2(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width) {  // Converts one I422 row to RGBA.
+  uint64_t vl;  // Written by CNTH below; the per-iteration step for width.
+  asm("cnth     %[vl]                                   \n"
+      "ptrue    p0.b                                    \n" YUVTORGB_SVE_SETUP
+      "dup      z19.b, #255                             \n"  // A = 255
+      "subs     %w[width], %w[width], %w[vl]            \n"
+      "b.le     2f                                      \n"
+
+      // Bulk loop: runs while more than vl pixels remain, using an all-true
+      // predicate to avoid predicate generation overhead.
+      "ptrue    p1.h                                    \n"
+      "1:                                               \n"  //
+      READYUV422_SVE I4XXTORGB_SVE RGBTORGBA8_SVE
+      "subs     %w[width], %w[width], %w[vl]            \n"
+      "st2h     {z19.h, z20.h}, p1, [%[dst_argb]]       \n"  // AB, GR pairs
+      "add      %[dst_argb], %[dst_argb], %[vl], lsl #2 \n"  // += 4 * vl
+      "b.gt     1b                                      \n"
+
+      // Tail: restore the remaining count, then predicate the final vector.
+      "2:                                               \n"
+      "adds    %w[width], %w[width], %w[vl]             \n"
+      "b.eq    99f                                      \n"  // Nothing left.
+
+      "whilelt  p1.h, wzr, %w[width]                    \n"  //
+      READYUV422_SVE I4XXTORGB_SVE RGBTORGBA8_SVE
+      "st2h     {z19.h, z20.h}, p1, [%[dst_argb]]       \n"
+
+      "99:                                              \n"
+      : [src_y] "+r"(src_y),                               // %[src_y]
+        [src_u] "+r"(src_u),                               // %[src_u]
+        [src_v] "+r"(src_v),                               // %[src_v]
+        [dst_argb] "+r"(dst_argb),                         // %[dst_argb]
+        [width] "+r"(width),                               // %[width]
+        [vl] "=&r"(vl)                                     // %[vl]
+      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
+        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
+      : "cc", "memory", YUVTORGB_SVE_REGS);
+}
+
 void I444AlphaToARGBRow_SVE2(const uint8_t* src_y,
                              const uint8_t* src_u,
                              const uint8_t* src_v,
@@ -183,9 +237,9 @@ void I444AlphaToARGBRow_SVE2(const uint8_t* src_y,
       // generation overhead.
       "ptrue    p1.h                                    \n"
       "1:                                               \n" READYUV444_SVE
-      "ld1b     {z19.h}, p1/z, [%[src_a]]               \n"  // A
-      "add      %[src_a], %[src_a], %[vl]               \n" I4XXTORGB_SVE
-          RGBTORGBA8_SVE
+      "ld1b     {z19.h}, p1/z, [%[src_a]]               \n"  // A
+      "add      %[src_a], %[src_a], %[vl]               \n"  //
+      I4XXTORGB_SVE RGBTOARGB8_SVE
       "subs     %w[width], %w[width], %w[vl]            \n"
       "st2h     {z16.h, z17.h}, p1, [%[dst_argb]]       \n"
       "add      %[dst_argb], %[dst_argb], %[vl], lsl #2 \n"
@@ -198,7 +252,7 @@ void I444AlphaToARGBRow_SVE2(const uint8_t* src_y,
       // Calculate a predicate for the final iteration to deal with the tail.
       "whilelt  p1.h, wzr, %w[width]                    \n" READYUV444_SVE
       "ld1b     {z19.h}, p1/z, [%[src_a]]               \n"  // A
-      I4XXTORGB_SVE RGBTORGBA8_SVE
+      I4XXTORGB_SVE RGBTOARGB8_SVE
       "st2h     {z16.h, z17.h}, p1, [%[dst_argb]]       \n"
 
       "99:                                              \n"
@@ -231,9 +285,9 @@ void I422AlphaToARGBRow_SVE2(const uint8_t* src_y,
       // generation overhead.
       "ptrue    p1.h                                    \n"
       "1:                                               \n" READYUV422_SVE
-      "ld1b     {z19.h}, p1/z, [%[src_a]]               \n"  // A
-      "add      %[src_a], %[src_a], %[vl]               \n" I4XXTORGB_SVE
-          RGBTORGBA8_SVE
+      "ld1b     {z19.h}, p1/z, [%[src_a]]               \n"  // A
+      "add      %[src_a], %[src_a], %[vl]               \n"  //
+      I4XXTORGB_SVE RGBTOARGB8_SVE
       "subs     %w[width], %w[width], %w[vl]            \n"
       "st2h     {z16.h, z17.h}, p1, [%[dst_argb]]       \n"
       "add      %[dst_argb], %[dst_argb], %[vl], lsl #2 \n"
@@ -246,7 +300,7 @@ void I422AlphaToARGBRow_SVE2(const uint8_t* src_y,
       // Calculate a predicate for the final iteration to deal with the tail.
       "whilelt  p1.h, wzr, %w[width]                    \n" READYUV422_SVE
       "ld1b     {z19.h}, p1/z, [%[src_a]]               \n"  // A
-      I4XXTORGB_SVE RGBTORGBA8_SVE
+      I4XXTORGB_SVE RGBTOARGB8_SVE
       "st2h     {z16.h, z17.h}, p1, [%[dst_argb]]       \n"
 
       "99:                                              \n"