[AArch64] Add SVE2 implementation of I400ToARGBRow

This is mostly a copy of the I422ToARGBRow_SVE2 implementation, but we can pre-calculate the UV component results before the loop body. Unlike in the Neon version of the code we can make use of MOVPRFX and USQADD to avoid needing to apply the bias separately from the UV coefficient multiply additions. Reduction in runtime observed compared to the existing Neon code: Cortex-A510: -26.1% Cortex-A520: -5.9% Cortex-A715: -49.5% Cortex-A720: -49.4% Cortex-X2: -22.5% Cortex-X3: -23.5% Cortex-X4: -21.6% Bug: libyuv:973 Change-Id: Ib9fc52bd53a1c6a1aac8bd865ab88539aca098ea Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5598767 Reviewed-by: Justin Green <greenjustin@google.com> Reviewed-by: Frank Barchard <fbarchard@chromium.org>
2026-06-16 08:56:09 +08:00 · 2024-04-23 15:13:47 +01:00 · 2024-04-23 15:13:47 +01:00 · cd4113f4e8
commit cd4113f4e8
parent 34abe98fe2
3 changed files with 70 additions and 0 deletions
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@ -585,6 +585,7 @@ extern "C" {
 #define HAS_AYUVTOUVROW_SVE2
 #define HAS_AYUVTOVUROW_SVE2
 #define HAS_BGRATOUVROW_SVE2
+#define HAS_I400TOARGBROW_SVE2
 #define HAS_I422ALPHATOARGBROW_SVE2
 #define HAS_I422TOARGBROW_SVE2
 #define HAS_I422TORGBAROW_SVE2
@ -4743,6 +4744,10 @@ void I400ToARGBRow_NEON(const uint8_t* src_y,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width);
+void I400ToARGBRow_SVE2(const uint8_t* src_y,
+                        uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
 void I400ToARGBRow_MSA(const uint8_t* src_y,
                       uint8_t* dst_argb,
                       const struct YuvConstants* yuvconstants,
--- a/source/convert_argb.cc
+++ b/source/convert_argb.cc
@ -2980,6 +2980,11 @@ int I400ToARGBMatrix(const uint8_t* src_y,
    }
  }
 #endif
+#if defined(HAS_I400TOARGBROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    I400ToARGBRow = I400ToARGBRow_SVE2;
+  }
+#endif
 #if defined(HAS_I400TOARGBROW_MSA)
  if (TestCpuFlag(kCpuHasMSA)) {
    I400ToARGBRow = I400ToARGBRow_Any_MSA;
--- a/source/row_sve.cc
+++ b/source/row_sve.cc
@ -43,6 +43,12 @@ extern "C" {
  "trn1       z1.h, z1.h, z1.h               \n" \
  "trn1       z2.h, z2.h, z2.h               \n"

+#define READYUV400_SVE                           \
+  "ld1b       {z0.h}, p1/z, [%[src_y]]       \n" \
+  "inch       %[src_y]                       \n" \
+  "prfm       pldl1keep, [%[src_y], 448]     \n" \
+  "trn1       z0.b, z0.b, z0.b               \n"
+
 // We need a different predicate for the UV component to handle the tail.
 // If there is a single element remaining then we want to load one Y element
 // but two UV elements.
@ -80,6 +86,14 @@ extern "C" {
  "uqsub      z16.h, z16.h, z25.h            \n" /* B */  \
  "uqsub      z18.h, z18.h, z27.h            \n" /* R */

+#define I400TORGB_SVE                                    \
+  "umulh      z18.h, z24.h, z0.h             \n" /* Y */ \
+  "movprfx    z16, z18                       \n"         \
+  "usqadd     z16.h, p0/m, z16.h, z4.h       \n" /* B */ \
+  "movprfx    z17, z18                       \n"         \
+  "usqadd     z17.h, p0/m, z17.h, z6.h       \n" /* G */ \
+  "usqadd     z18.h, p0/m, z18.h, z5.h       \n" /* R */
+
 // Convert from 2.14 fixed point RGB to 8 bit ARGB, interleaving as BG and RA
 // pairs to allow us to use ST2 for storing rather than ST4.
 #define RGBTOARGB8_SVE                                    \
@ -146,6 +160,52 @@ void I444ToARGBRow_SVE2(const uint8_t* src_y,
      : "cc", "memory", YUVTORGB_SVE_REGS);
 }

+void I400ToARGBRow_SVE2(const uint8_t* src_y,
+                        uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width) {
+  uint64_t vl;
+  asm("cnth     %[vl]                                   \n"
+      "ptrue    p0.b                                    \n"
+      "dup      z19.b, #255                             \n"  // A
+      YUVTORGB_SVE_SETUP
+      "cmp      %w[width], %w[vl]                       \n"
+      "mov      z1.h, #128                              \n"  // U/V
+      "mul      z6.h, z30.h, z1.h                       \n"
+      "mul      z4.h, z28.h, z1.h                       \n"  // DB
+      "mul      z5.h, z29.h, z1.h                       \n"  // DR
+      "mla      z6.h, p0/m, z31.h, z1.h                 \n"  // DG
+      "sub      z4.h, z4.h, z25.h                       \n"
+      "sub      z5.h, z5.h, z27.h                       \n"
+      "sub      z6.h, z26.h, z6.h                       \n"
+      "b.le     2f                                      \n"
+
+      // Run bulk of computation with an all-true predicate to avoid predicate
+      // generation overhead.
+      "ptrue    p1.h                                    \n"
+      "sub      %w[width], %w[width], %w[vl]            \n"
+      "1:                                               \n"  //
+      READYUV400_SVE I400TORGB_SVE RGBTOARGB8_SVE
+      "subs     %w[width], %w[width], %w[vl]            \n"
+      "st2h     {z16.h, z17.h}, p1, [%[dst_argb]]       \n"
+      "add      %[dst_argb], %[dst_argb], %[vl], lsl #2 \n"
+      "b.gt     1b                                      \n"
+      "add      %w[width], %w[width], %w[vl]            \n"
+
+      // Calculate a predicate for the final iteration to deal with the tail.
+      "2:                                               \n"
+      "whilelt  p1.h, wzr, %w[width]                    \n"  //
+      READYUV400_SVE I400TORGB_SVE RGBTOARGB8_SVE
+      "st2h     {z16.h, z17.h}, p1, [%[dst_argb]]       \n"
+      : [src_y] "+r"(src_y),                               // %[src_y]
+        [dst_argb] "+r"(dst_argb),                         // %[dst_argb]
+        [width] "+r"(width),                               // %[width]
+        [vl] "=&r"(vl)                                     // %[vl]
+      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
+        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
+      : "cc", "memory", YUVTORGB_SVE_REGS);
+}
+
 void I422ToARGBRow_SVE2(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,