[AArch64] Add SVE2 implementations for RAWTo{ARGB,RGBA}Row

We can construct particular predicates to load only up to 3/4 of a full vector, allowing us to use TBL to shuffle elements into the correct place rather than needing to rely on more expensive LD3 or ST4 instructions.

Reduction in runtimes observed compared to the existing Neon implementation:

            | RAWToARGBRow | RAWToRGBARow
Cortex-A510 |       -32.4% |       -31.9%
Cortex-A720 |       -15.7% |       -15.6%
  Cortex-X2 |       -24.6% |       -24.4%

Bug: libyuv:973
Change-Id: I271c625d97bab3b0e08ac1e9d7fcf7d18f3d6894
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5631542
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Justin Green <greenjustin@google.com>
2026-01-01 03:12:16 +08:00 · 2024-04-25 14:51:56 +01:00 · 2024-04-25 14:51:56 +01:00 · c613c3f102
commit c613c3f102
parent d1ec694ad3
3 changed files with 79 additions and 0 deletions
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@ -600,6 +600,8 @@ extern "C" {
 #define HAS_I444TOARGBROW_SVE2
 #define HAS_NV12TOARGBROW_SVE2
 #define HAS_NV21TOARGBROW_SVE2
+#define HAS_RAWTOARGBROW_SVE2
+#define HAS_RAWTORGBAROW_SVE2
 #define HAS_RGBATOUVROW_SVE2
 #define HAS_UYVYTOARGBROW_SVE2
 #define HAS_YUY2TOARGBROW_SVE2
@ -3510,7 +3512,9 @@ void RGB24ToARGBRow_LASX(const uint8_t* src_rgb24,
                         int width);
 void RGB24ToARGBRow_RVV(const uint8_t* src_rgb24, uint8_t* dst_argb, int width);
 void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width);
+void RAWToARGBRow_SVE2(const uint8_t* src_raw, uint8_t* dst_argb, int width);
 void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width);
+void RAWToRGBARow_SVE2(const uint8_t* src_raw, uint8_t* dst_rgba, int width);
 void RAWToARGBRow_MSA(const uint8_t* src_raw, uint8_t* dst_argb, int width);
 void RAWToARGBRow_LSX(const uint8_t* src_raw, uint8_t* dst_argb, int width);
 void RAWToARGBRow_LASX(const uint8_t* src_raw, uint8_t* dst_argb, int width);
--- a/source/convert_argb.cc
+++ b/source/convert_argb.cc
@ -3555,6 +3555,11 @@ int RAWToARGB(const uint8_t* src_raw,
    }
  }
 #endif
+#if defined(HAS_RAWTOARGBROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    RAWToARGBRow = RAWToARGBRow_SVE2;
+  }
+#endif
 #if defined(HAS_RAWTOARGBROW_MSA)
  if (TestCpuFlag(kCpuHasMSA)) {
    RAWToARGBRow = RAWToARGBRow_Any_MSA;
@ -3635,6 +3640,11 @@ int RAWToRGBA(const uint8_t* src_raw,
    }
  }
 #endif
+#if defined(HAS_RAWTORGBAROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    RAWToRGBARow = RAWToRGBARow_SVE2;
+  }
+#endif
 #if defined(HAS_RAWTORGBAROW_RVV)
  if (TestCpuFlag(kCpuHasRVV)) {
    RAWToRGBARow = RAWToRGBARow_RVV;
--- a/source/row_sve.cc
+++ b/source/row_sve.cc
@ -1113,6 +1113,71 @@ void UYVYToARGBRow_SVE2(const uint8_t* src_uyvy,
      : "cc", "memory", YUVTORGB_SVE_REGS, "p2");
 }

+// Convert a row of packed 3-byte RAW pixels into packed 4-byte pixels.
+//
+// Every 32-bit output pixel is produced by a byte-wise TBL shuffle of the
+// loaded source bytes, followed by an OR with `alpha`:
+//   src_raw   - source row, 3 bytes per pixel.
+//   dst_wxyz  - destination row, 4 bytes per pixel.
+//   width     - number of pixels to convert.
+//   idx_start - the four TBL byte indices for the first output pixel, packed
+//               into one 32-bit value.
+//   idx_step  - value added to idx_start for each subsequent 32-bit lane.
+//   alpha     - 32-bit value ORed into every output pixel.
+//
+// Full vectors are handled in the main loop with fixed predicates; any
+// remaining pixels are handled by a single predicated tail iteration.
+static inline void RAWToWXYZRow_SVE2(const uint8_t* src_raw,
+                                     uint8_t* dst_wxyz,
+                                     int width,
+                                     uint32_t idx_start,
+                                     uint32_t idx_step,
+                                     uint32_t alpha) {
+  // Both values are used with the %x (64-bit register) operand modifier
+  // below, so they are held in 64-bit variables: the upper half of a register
+  // holding a 32-bit operand is not guaranteed to be zero.
+  uint64_t vl;
+  // Number of 32-bit lanes per vector, i.e. output pixels per iteration.
+  asm("cntw %x0" : "=r"(vl));
+  // Number of source bytes consumed per full iteration (3 bytes per pixel).
+  uint64_t vl_mul3 = vl * 3;
+  asm volatile(
+      // z31 = per-lane TBL indices, z30 = per-lane alpha constant.
+      "index   z31.s, %w[idx_start], %w[idx_step]        \n"
+      "dup     z30.s, %w[alpha]                          \n"
+      // Skip the main loop if there is less than one full vector of pixels.
+      "subs     %w[width], %w[width], %w[vl]             \n"
+      "b.lt     2f                                       \n"
+
+      // Run bulk of computation with the same predicates to avoid predicate
+      // generation overhead. We set up p1 to only load 3/4 of a vector.
+      "ptrue   p0.s                                      \n"
+      "whilelt p1.b, wzr, %w[vl_mul3]                    \n"
+      "1:                                                \n"
+      "ld1b    {z0.b}, p1/z, [%[src]]                    \n"
+      "add     %[src], %[src], %x[vl_mul3]               \n"
+      "tbl     z0.b, {z0.b}, z31.b                       \n"
+      "subs    %w[width], %w[width], %w[vl]              \n"
+      "orr     z0.d, z0.d, z30.d                         \n"
+      "st1w    {z0.s}, p0, [%[dst]]                      \n"
+      "incb    %[dst]                                    \n"
+      "b.ge    1b                                        \n"
+
+      "2:                                                \n"
+      // Undo the last subtraction to recover the number of leftover pixels;
+      // nothing more to do if it is zero.
+      "adds     %w[width], %w[width], %w[vl]             \n"
+      "b.eq     99f                                      \n"
+
+      // Calculate a pair of predicates for the final iteration to deal with
+      // the tail: width * 3 source bytes (p1) and width output pixels (p0).
+      "add     %w[vl_mul3], %w[width], %w[width], lsl #1 \n"
+      "whilelt p0.s, wzr, %w[width]                      \n"
+      "whilelt p1.b, wzr, %w[vl_mul3]                    \n"
+      "ld1b    {z0.b}, p1/z, [%[src]]                    \n"
+      "tbl     z0.b, {z0.b}, z31.b                       \n"
+      "orr     z0.d, z0.d, z30.d                         \n"
+      "st1w    {z0.s}, p0, [%[dst]]                      \n"
+
+      "99:                                               \n"
+      : [src] "+r"(src_raw),         // %[src]
+        [dst] "+r"(dst_wxyz),        // %[dst]
+        [width] "+r"(width),         // %[width]
+        [vl_mul3] "+r"(vl_mul3)      // %[vl_mul3]
+      : [idx_start] "r"(idx_start),  // %[idx_start]
+        [idx_step] "r"(idx_step),    // %[idx_step]
+        [alpha] "r"(alpha),          // %[alpha]
+        [vl] "r"(vl)                 // %[vl]
+      : "cc", "memory", "z0", "z30", "z31", "p0", "p1");
+}
+
+// Convert a row of RAW pixels to ARGB by forwarding to RAWToWXYZRow_SVE2.
+// The low three index bytes select source bytes 2, 1, 0 of each pixel and the
+// top byte of every output pixel receives 0xff from the alpha constant.
+void RAWToARGBRow_SVE2(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
+  const uint32_t kTblIdxStart = 0xff000102U;
+  const uint32_t kTblIdxStep = 0x00030303U;
+  const uint32_t kAlpha = 0xff000000U;
+  RAWToWXYZRow_SVE2(src_raw, dst_argb, width, kTblIdxStart, kTblIdxStep,
+                    kAlpha);
+}
+
+// Convert a row of RAW pixels to RGBA by forwarding to RAWToWXYZRow_SVE2.
+// The top three index bytes select source bytes 0, 1, 2 of each pixel and the
+// low byte of every output pixel receives 0xff from the alpha constant.
+void RAWToRGBARow_SVE2(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
+  const uint32_t kTblIdxStart = 0x000102ffU;
+  const uint32_t kTblIdxStep = 0x03030300U;
+  const uint32_t kAlpha = 0x000000ffU;
+  RAWToWXYZRow_SVE2(src_raw, dst_rgba, width, kTblIdxStart, kTblIdxStep,
+                    kAlpha);
+}
+
 #endif  // !defined(LIBYUV_DISABLE_SVE) && defined(__aarch64__)

 #ifdef __cplusplus