[AArch64] Add SVE2 implementation of I422ToRGBARow

This is almost identical to the existing I422ToARGBRow_SVE2 kernel; we just need to interleave differently for the output. The RGBA format actually saves us an instruction compared to ARGB since there is no need to merge in the alpha component: we can just replace the odd elements of the alpha vector itself during the narrowing. Also rename some existing macros to make more sense when distinguishing between ARGB and RGBA. Reductions in runtime observed compared to the existing Neon code: Cortex-A510: -27.0%, Cortex-A720: -5.3%, Cortex-X2: -14.7%. Bug: libyuv:973 Change-Id: I1e12ff608ee49c25b918097007e16d87b39cb067 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5593797 Commit-Queue: Frank Barchard <fbarchard@chromium.org> Reviewed-by: Frank Barchard <fbarchard@chromium.org>
2026-01-01 03:12:16 +08:00 · 2024-04-23 08:22:57 +01:00 · 2024-04-23 08:22:57 +01:00 · 96bbdb53ed
commit 96bbdb53ed
parent 004352ba16
3 changed files with 90 additions and 19 deletions
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@ -585,6 +585,7 @@ extern "C" {
 #define HAS_BGRATOUVROW_SVE2
 #define HAS_I422ALPHATOARGBROW_SVE2
 #define HAS_I422TOARGBROW_SVE2
+#define HAS_I422TORGBAROW_SVE2
 #define HAS_I444ALPHATOARGBROW_SVE2
 #define HAS_I444TOARGBROW_SVE2
 #define HAS_RGBATOUVROW_SVE2
@ -1154,6 +1155,12 @@ void I422ToRGBARow_NEON(const uint8_t* src_y,
                        uint8_t* dst_rgba,
                        const struct YuvConstants* yuvconstants,
                        int width);
+void I422ToRGBARow_SVE2(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_rgba,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
 void I422ToRGB24Row_NEON(const uint8_t* src_y,
                         const uint8_t* src_u,
                         const uint8_t* src_v,
--- a/source/convert_argb.cc
+++ b/source/convert_argb.cc
@ -4906,6 +4906,11 @@ int I422ToRGBAMatrix(const uint8_t* src_y,
    }
  }
 #endif
+#if defined(HAS_I422TORGBAROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    I422ToRGBARow = I422ToRGBARow_SVE2;
+  }
+#endif
 #if defined(HAS_I422TORGBAROW_MSA)
  if (TestCpuFlag(kCpuHasMSA)) {
    I422ToRGBARow = I422ToRGBARow_Any_MSA;
@ -5134,6 +5139,11 @@ int I420ToRGBAMatrix(const uint8_t* src_y,
    }
  }
 #endif
+#if defined(HAS_I422TORGBAROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    I422ToRGBARow = I422ToRGBARow_SVE2;
+  }
+#endif
 #if defined(HAS_I422TORGBAROW_MSA)
  if (TestCpuFlag(kCpuHasMSA)) {
    I422ToRGBARow = I422ToRGBARow_Any_MSA;
--- a/source/row_sve.cc
+++ b/source/row_sve.cc
@ -66,17 +66,27 @@ extern "C" {
  "uqsub      z16.h, z16.h, z25.h            \n" /* B */  \
  "uqsub      z18.h, z18.h, z27.h            \n" /* R */

-// Convert from 2.14 fixed point RGB to 8 bit RGBA, interleaving as BG and RA
+// Convert from 2.14 fixed point RGB to 8 bit ARGB, interleaving as BG and RA
 // pairs to allow us to use ST2 for storing rather than ST4.
-#define RGBTORGBA8_SVE                  \
-  "uqshrnb     z16.b, z16.h, #6     \n" \
-  "uqshrnb     z18.b, z18.h, #6     \n" \
-  "uqshrnt     z16.b, z17.h, #6     \n" \
-  "trn1        z17.b, z18.b, z19.b  \n"
+#define RGBTOARGB8_SVE                                    \
+  /* Inputs: B: z16.h,  G: z17.h,  R: z18.h,  A: z19.b */ \
+  "uqshrnb     z16.b, z16.h, #6     \n" /* z16: B0 */     \
+  "uqshrnb     z18.b, z18.h, #6     \n" /* z18: R0 */     \
+  "uqshrnt     z16.b, z17.h, #6     \n" /* z16: BG */     \
+  "trn1        z17.b, z18.b, z19.b  \n" /* z17: RA */
+
+// Convert from 2.14 fixed point RGB to 8 bit RGBA, interleaving as AB and GR
+// pairs to allow us to use ST2 for storing rather than ST4.
+#define RGBTORGBA8_SVE                                    \
+  /* Inputs: B: z16.h,  G: z17.h,  R: z18.h,  A: z19.b */ \
+  "uqshrnt     z19.b, z16.h, #6     \n" /* z19: AB */     \
+  "uqshrnb     z20.b, z17.h, #6     \n" /* z20: G0 */     \
+  "uqshrnt     z20.b, z18.h, #6     \n" /* z20: GR */

 #define YUVTORGB_SVE_REGS                                                     \
  "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z16", "z17", "z18", "z19", \
-      "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "p0", "p1"
+      "z20", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "p0",    \
+      "p1"

 void I444ToARGBRow_SVE2(const uint8_t* src_y,
                        const uint8_t* src_u,
@ -95,7 +105,7 @@ void I444ToARGBRow_SVE2(const uint8_t* src_y,
      // generation overhead.
      "ptrue    p1.h                                    \n"
      "1:                                               \n" READYUV444_SVE
-          I4XXTORGB_SVE RGBTORGBA8_SVE
+          I4XXTORGB_SVE RGBTOARGB8_SVE
      "subs     %w[width], %w[width], %w[vl]            \n"
      "st2h     {z16.h, z17.h}, p1, [%[dst_argb]]       \n"
      "add      %[dst_argb], %[dst_argb], %[vl], lsl #2 \n"
@ -107,7 +117,7 @@ void I444ToARGBRow_SVE2(const uint8_t* src_y,

      // Calculate a predicate for the final iteration to deal with the tail.
      "whilelt  p1.h, wzr, %w[width]                    \n" READYUV444_SVE
-          I4XXTORGB_SVE RGBTORGBA8_SVE
+          I4XXTORGB_SVE RGBTOARGB8_SVE
      "st2h     {z16.h, z17.h}, p1, [%[dst_argb]]       \n"

      "99:                                              \n"
@ -139,7 +149,7 @@ void I422ToARGBRow_SVE2(const uint8_t* src_y,
      // generation overhead.
      "ptrue    p1.h                                    \n"
      "1:                                               \n" READYUV422_SVE
-          I4XXTORGB_SVE RGBTORGBA8_SVE
+          I4XXTORGB_SVE RGBTOARGB8_SVE
      "subs     %w[width], %w[width], %w[vl]            \n"
      "st2h     {z16.h, z17.h}, p1, [%[dst_argb]]       \n"
      "add      %[dst_argb], %[dst_argb], %[vl], lsl #2 \n"
@ -151,7 +161,7 @@ void I422ToARGBRow_SVE2(const uint8_t* src_y,

      // Calculate a predicate for the final iteration to deal with the tail.
      "whilelt  p1.h, wzr, %w[width]                    \n" READYUV422_SVE
-          I4XXTORGB_SVE RGBTORGBA8_SVE
+          I4XXTORGB_SVE RGBTOARGB8_SVE
      "st2h     {z16.h, z17.h}, p1, [%[dst_argb]]       \n"

      "99:                                              \n"
@ -166,6 +176,50 @@ void I422ToARGBRow_SVE2(const uint8_t* src_y,
      : "cc", "memory", YUVTORGB_SVE_REGS);
 }

+void I422ToRGBARow_SVE2(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_argb,  // NOTE(review): named dst_rgba in the row.h declaration.
+                        const struct YuvConstants* yuvconstants,
+                        int width) {  // Writes `width` pixels, 4 bytes each, to dst_argb.
+  uint64_t vl;  // Per-iteration pixel step; written by CNTH below.
+  asm("cnth     %[vl]                                   \n"  // vl = 16-bit lanes per vector
+      "ptrue    p0.b                                    \n" YUVTORGB_SVE_SETUP
+      "dup      z19.b, #255                             \n"  // A: every byte of z19 = 255
+      "subs     %w[width], %w[width], %w[vl]            \n"  // width -= vl (biased count)
+      "b.le     2f                                      \n"  // width <= vl: tail only
+
+      // Run bulk of computation with an all-true predicate to avoid predicate
+      // generation overhead.
+      "ptrue    p1.h                                    \n"
+      "1:                                               \n"  // Main loop: vl pixels per pass.
+      READYUV422_SVE I4XXTORGB_SVE RGBTORGBA8_SVE  // Leaves AB in z19, GR in z20.
+      "subs     %w[width], %w[width], %w[vl]            \n"
+      "st2h     {z19.h, z20.h}, p1, [%[dst_argb]]       \n"  // Interleaved AB, GR per pixel.
+      "add      %[dst_argb], %[dst_argb], %[vl], lsl #2 \n"  // dst_argb += vl * 4 bytes
+      "b.gt     1b                                      \n"  // Uses flags from the subs above.
+
+      // Calculate a predicate for the final iteration to deal with the tail.
+      "2:                                               \n"
+      "adds    %w[width], %w[width], %w[vl]             \n"  // Undo the bias: remaining pixels.
+      "b.eq    99f                                      \n"  // Nothing left to convert.
+
+      "whilelt  p1.h, wzr, %w[width]                    \n"  // Lane i active iff i < width.
+      READYUV422_SVE I4XXTORGB_SVE RGBTORGBA8_SVE
+      "st2h     {z19.h, z20.h}, p1, [%[dst_argb]]       \n"  // Store only the active lanes.
+
+      "99:                                              \n"
+      : [src_y] "+r"(src_y),                               // %[src_y]
+        [src_u] "+r"(src_u),                               // %[src_u]
+        [src_v] "+r"(src_v),                               // %[src_v]
+        [dst_argb] "+r"(dst_argb),                         // %[dst_argb]
+        [width] "+r"(width),                               // %[width]
+        [vl] "=&r"(vl)                                     // %[vl]
+      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
+        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
+      : "cc", "memory", YUVTORGB_SVE_REGS);
+}
+
 void I444AlphaToARGBRow_SVE2(const uint8_t* src_y,
                             const uint8_t* src_u,
                             const uint8_t* src_v,
@ -183,9 +237,9 @@ void I444AlphaToARGBRow_SVE2(const uint8_t* src_y,
      // generation overhead.
      "ptrue    p1.h                                    \n"
      "1:                                               \n" READYUV444_SVE
-      "ld1b     {z19.h}, p1/z, [%[src_a]]               \n"  // A
-      "add      %[src_a], %[src_a], %[vl]               \n" I4XXTORGB_SVE
-          RGBTORGBA8_SVE
+      "ld1b     {z19.h}, p1/z, [%[src_a]]               \n"
+      "add      %[src_a], %[src_a], %[vl]               \n"  // A
+      I4XXTORGB_SVE RGBTOARGB8_SVE
      "subs     %w[width], %w[width], %w[vl]            \n"
      "st2h     {z16.h, z17.h}, p1, [%[dst_argb]]       \n"
      "add      %[dst_argb], %[dst_argb], %[vl], lsl #2 \n"
@ -198,7 +252,7 @@ void I444AlphaToARGBRow_SVE2(const uint8_t* src_y,
      // Calculate a predicate for the final iteration to deal with the tail.
      "whilelt  p1.h, wzr, %w[width]                    \n" READYUV444_SVE
      "ld1b     {z19.h}, p1/z, [%[src_a]]               \n"  // A
-      I4XXTORGB_SVE RGBTORGBA8_SVE
+      I4XXTORGB_SVE RGBTOARGB8_SVE
      "st2h     {z16.h, z17.h}, p1, [%[dst_argb]]       \n"

      "99:                                              \n"
@ -231,9 +285,9 @@ void I422AlphaToARGBRow_SVE2(const uint8_t* src_y,
      // generation overhead.
      "ptrue    p1.h                                    \n"
      "1:                                               \n" READYUV422_SVE
-      "ld1b     {z19.h}, p1/z, [%[src_a]]               \n"  // A
-      "add      %[src_a], %[src_a], %[vl]               \n" I4XXTORGB_SVE
-          RGBTORGBA8_SVE
+      "ld1b     {z19.h}, p1/z, [%[src_a]]               \n"
+      "add      %[src_a], %[src_a], %[vl]               \n"  // A
+      I4XXTORGB_SVE RGBTOARGB8_SVE
      "subs     %w[width], %w[width], %w[vl]            \n"
      "st2h     {z16.h, z17.h}, p1, [%[dst_argb]]       \n"
      "add      %[dst_argb], %[dst_argb], %[vl], lsl #2 \n"
@ -246,7 +300,7 @@ void I422AlphaToARGBRow_SVE2(const uint8_t* src_y,
      // Calculate a predicate for the final iteration to deal with the tail.
      "whilelt  p1.h, wzr, %w[width]                    \n" READYUV422_SVE
      "ld1b     {z19.h}, p1/z, [%[src_a]]               \n"  // A
-      I4XXTORGB_SVE RGBTORGBA8_SVE
+      I4XXTORGB_SVE RGBTOARGB8_SVE
      "st2h     {z16.h, z17.h}, p1, [%[dst_argb]]       \n"

      "99:                                              \n"