From c613c3f1024d6d59b30f3816fc717a4f074b532e Mon Sep 17 00:00:00 2001
From: George Steed
Date: Thu, 25 Apr 2024 14:51:56 +0100
Subject: [PATCH] [AArch64] Add SVE2 implementations for RAWTo{ARGB,RGBA}Row

We can construct particular predicates to load only up to 3/4 of a full
vector, allowing us to use TBL to shuffle elements into the correct place
rather than needing to rely on more expensive LD3 or ST4 instructions.

Reduction in runtimes observed compared to the existing Neon
implementation:

            | RAWToARGBRow | RAWToRGBARow
Cortex-A510 |       -32.4% |       -31.9%
Cortex-A720 |       -15.7% |       -15.6%
Cortex-X2   |       -24.6% |       -24.4%

Bug: libyuv:973
Change-Id: I271c625d97bab3b0e08ac1e9d7fcf7d18f3d6894
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5631542
Reviewed-by: Frank Barchard
Reviewed-by: Justin Green
---
 include/libyuv/row.h   |  4 +++
 source/convert_argb.cc | 10 +++++++
 source/row_sve.cc      | 65 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 79 insertions(+)

diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 86b024321..8e3c6cb0d 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -600,6 +600,8 @@ extern "C" {
 #define HAS_I444TOARGBROW_SVE2
 #define HAS_NV12TOARGBROW_SVE2
 #define HAS_NV21TOARGBROW_SVE2
+#define HAS_RAWTOARGBROW_SVE2
+#define HAS_RAWTORGBAROW_SVE2
 #define HAS_RGBATOUVROW_SVE2
 #define HAS_UYVYTOARGBROW_SVE2
 #define HAS_YUY2TOARGBROW_SVE2
@@ -3510,7 +3512,9 @@ void RGB24ToARGBRow_LASX(const uint8_t* src_rgb24,
                          int width);
 void RGB24ToARGBRow_RVV(const uint8_t* src_rgb24, uint8_t* dst_argb, int width);
 void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width);
+void RAWToARGBRow_SVE2(const uint8_t* src_raw, uint8_t* dst_argb, int width);
 void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width);
+void RAWToRGBARow_SVE2(const uint8_t* src_raw, uint8_t* dst_rgba, int width);
 void RAWToARGBRow_MSA(const uint8_t* src_raw, uint8_t* dst_argb, int width);
 void RAWToARGBRow_LSX(const uint8_t* src_raw, uint8_t* dst_argb, int width);
 void RAWToARGBRow_LASX(const uint8_t* src_raw, uint8_t* dst_argb, int width);
diff --git a/source/convert_argb.cc b/source/convert_argb.cc
index 11948726a..73f004b81 100644
--- a/source/convert_argb.cc
+++ b/source/convert_argb.cc
@@ -3555,6 +3555,11 @@ int RAWToARGB(const uint8_t* src_raw,
     }
   }
 #endif
+#if defined(HAS_RAWTOARGBROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    RAWToARGBRow = RAWToARGBRow_SVE2;
+  }
+#endif
 #if defined(HAS_RAWTOARGBROW_MSA)
   if (TestCpuFlag(kCpuHasMSA)) {
     RAWToARGBRow = RAWToARGBRow_Any_MSA;
@@ -3635,6 +3640,11 @@ int RAWToRGBA(const uint8_t* src_raw,
     }
   }
 #endif
+#if defined(HAS_RAWTORGBAROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    RAWToRGBARow = RAWToRGBARow_SVE2;
+  }
+#endif
 #if defined(HAS_RAWTORGBAROW_RVV)
   if (TestCpuFlag(kCpuHasRVV)) {
     RAWToRGBARow = RAWToRGBARow_RVV;
diff --git a/source/row_sve.cc b/source/row_sve.cc
index 89a86d53b..66e1d17df 100644
--- a/source/row_sve.cc
+++ b/source/row_sve.cc
@@ -1113,6 +1113,71 @@ void UYVYToARGBRow_SVE2(const uint8_t* src_uyvy,
       : "cc", "memory", YUVTORGB_SVE_REGS, "p2");
 }
 
+static inline void RAWToWXYZRow_SVE2(const uint8_t* src_raw,
+                                     uint8_t* dst_wxyz,
+                                     int width,
+                                     uint32_t idx_start,
+                                     uint32_t idx_step,
+                                     uint32_t alpha) {
+  uint32_t vl;
+  asm("cntw %x0" : "=r"(vl));
+  uint32_t vl_mul3 = vl * 3;
+  asm volatile(
+      "index    z31.s, %w[idx_start], %w[idx_step]        \n"
+      "dup      z30.s, %w[alpha]                          \n"
+      "subs     %w[width], %w[width], %w[vl]              \n"
+      "b.lt     2f                                        \n"
+
+      // Run bulk of computation with the same predicates to avoid predicate
+      // generation overhead. We set up p1 to only load 3/4 of a vector.
+      "ptrue    p0.s                                      \n"
+      "whilelt  p1.b, wzr, %w[vl_mul3]                    \n"
+      "1:                                                 \n"
+      "ld1b     {z0.b}, p1/z, [%[src]]                    \n"
+      "add      %[src], %[src], %x[vl_mul3]               \n"
+      "tbl      z0.b, {z0.b}, z31.b                       \n"
+      "subs     %w[width], %w[width], %w[vl]              \n"
+      "orr      z0.d, z0.d, z30.d                         \n"
+      "st1w     {z0.s}, p0, [%[dst]]                      \n"
+      "incb     %[dst]                                    \n"
+      "b.ge     1b                                        \n"
+
+      "2:                                                 \n"
+      "adds     %w[width], %w[width], %w[vl]              \n"
+      "b.eq     99f                                       \n"
+
+      // Calculate a pair of predicates for the final iteration to deal with
+      // the tail.
+      "add      %w[vl_mul3], %w[width], %w[width], lsl #1 \n"
+      "whilelt  p0.s, wzr, %w[width]                      \n"
+      "whilelt  p1.b, wzr, %w[vl_mul3]                    \n"
+      "ld1b     {z0.b}, p1/z, [%[src]]                    \n"
+      "tbl      z0.b, {z0.b}, z31.b                       \n"
+      "orr      z0.d, z0.d, z30.d                         \n"
+      "st1w     {z0.s}, p0, [%[dst]]                      \n"
+
+      "99:                                                \n"
+      : [src] "+r"(src_raw),                    // %[src]
+        [dst] "+r"(dst_wxyz),                   // %[dst]
+        [width] "+r"(width),                    // %[width]
+        [vl_mul3] "+r"(vl_mul3)                 // %[vl_mul3]
+      : [idx_start] "r"(idx_start),             // %[idx_start]
+        [idx_step] "r"(idx_step),               // %[idx_step]
+        [alpha] "r"(alpha),                     // %[alpha]
+        [vl] "r"(vl)                            // %[vl]
+      : "cc", "memory", "z0", "z30", "z31", "p0", "p1");
+}
+
+void RAWToARGBRow_SVE2(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
+  RAWToWXYZRow_SVE2(src_raw, dst_argb, width, 0xff000102U, 0x00030303U,
+                    0xff000000U);
+}
+
+void RAWToRGBARow_SVE2(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
+  RAWToWXYZRow_SVE2(src_raw, dst_rgba, width, 0x000102ffU, 0x03030300U,
+                    0x000000ffU);
+}
+
 #endif  // !defined(LIBYUV_DISABLE_SVE) && defined(__aarch64__)
 
 #ifdef __cplusplus
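
For reference, here is a scalar sketch of what the TBL + ORR sequence above
computes per pixel in the ARGB case. It assumes libyuv's byte orderings (RAW
is r, g, b in memory; ARGB is b, g, r, a), and the helper name
RAWToARGBRow_Scalar is illustrative only, not part of the patch:

  #include <stdint.h>

  // Scalar model of the SVE2 path: each RAW pixel (r, g, b in memory) is
  // shuffled into libyuv ARGB byte order (b, g, r, a) and the alpha byte is
  // forced to 0xff. This mirrors TBL, where the 0xff index selects a byte
  // that is either out of range or never loaded (hence zero), followed by
  // ORR with the alpha constant.
  static void RAWToARGBRow_Scalar(const uint8_t* src_raw, uint8_t* dst_argb,
                                  int width) {
    for (int i = 0; i < width; ++i) {
      const uint8_t r = src_raw[3 * i + 0];
      const uint8_t g = src_raw[3 * i + 1];
      const uint8_t b = src_raw[3 * i + 2];
      dst_argb[4 * i + 0] = b;     // TBL byte index 0x02
      dst_argb[4 * i + 1] = g;     // TBL byte index 0x01
      dst_argb[4 * i + 2] = r;     // TBL byte index 0x00
      dst_argb[4 * i + 3] = 0xff;  // zeroed by TBL, then ORR with alpha
    }
  }

Read this way, idx_start 0xff000102 packs the byte indices (2, 1, 0,
out-of-range) for the first pixel's output word, idx_step 0x00030303 advances
the three in-range indices by one RAW pixel (3 bytes) per 32-bit lane, and the
ORR with 0xff000000 fills the zeroed alpha byte. The RGBA variant uses the
same scheme with the alpha byte in the lowest position of each output word.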