[AArch64] Add SVE2 implementation of RAWToRGB24Row

There is no nice way of forming the TBL permute indices here since we are operating on sets of three bytes at a time, so instead load the appropriate indices from a static array. We can make use of SVE predication to ensure we are operating on a multiple of three bytes for the load/store instructions rather than needing to make use of more expensive LD3 or ST3 instructions. Reduction in runtime observed compared to the existing Neon implementation: Cortex-A510: -39.2% Cortex-A720: -34.5% Cortex-X2: -31.0% Bug: libyuv:973 Change-Id: I68560bde7a529e5cec150b0e9d3ffe4341038fb8 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5631543 Reviewed-by: Justin Green <greenjustin@google.com> Reviewed-by: Frank Barchard <fbarchard@chromium.org>
2025-12-07 01:06:46 +08:00 · 2024-04-25 21:18:19 +01:00 · 2024-04-25 21:18:19 +01:00 · 11ff6067a5
commit 11ff6067a5
parent c613c3f102
3 changed files with 70 additions and 0 deletions
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@ -601,6 +601,7 @@ extern "C" {
 #define HAS_NV12TOARGBROW_SVE2
 #define HAS_NV21TOARGBROW_SVE2
 #define HAS_RAWTOARGBROW_SVE2
+#define HAS_RAWTORGB24ROW_SVE2
 #define HAS_RAWTORGBAROW_SVE2
 #define HAS_RGBATOUVROW_SVE2
 #define HAS_UYVYTOARGBROW_SVE2
@ -3521,6 +3522,7 @@ void RAWToARGBRow_LASX(const uint8_t* src_raw, uint8_t* dst_argb, int width);
 void RAWToARGBRow_RVV(const uint8_t* src_raw, uint8_t* dst_argb, int width);
 void RAWToRGBARow_RVV(const uint8_t* src_raw, uint8_t* dst_rgba, int width);
 void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width);
+void RAWToRGB24Row_SVE2(const uint8_t* src_raw, uint8_t* dst_rgb24, int width);
 void RAWToRGB24Row_MSA(const uint8_t* src_raw, uint8_t* dst_rgb24, int width);
 void RAWToRGB24Row_LSX(const uint8_t* src_raw, uint8_t* dst_rgb24, int width);
 void RAWToRGB24Row_RVV(const uint8_t* src_raw, uint8_t* dst_rgb24, int width);
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@ -3358,6 +3358,11 @@ int RAWToRGB24(const uint8_t* src_raw,
    }
  }
 #endif
+#if defined(HAS_RAWTORGB24ROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    RAWToRGB24Row = RAWToRGB24Row_SVE2;
+  }
+#endif
 #if defined(HAS_RAWTORGB24ROW_MSA)
  if (TestCpuFlag(kCpuHasMSA)) {
    RAWToRGB24Row = RAWToRGB24Row_Any_MSA;
--- a/source/row_sve.cc
+++ b/source/row_sve.cc
@ -1178,6 +1178,69 @@ void RAWToRGBARow_SVE2(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
                    0x000000ffU);
 }

+static const uint8_t kRAWToRGB24Indices[] = {
+    2,   1,   0,   5,   4,   3,   8,   7,   6,   11,  10,  9,   14,  13,  12,
+    17,  16,  15,  20,  19,  18,  23,  22,  21,  26,  25,  24,  29,  28,  27,
+    32,  31,  30,  35,  34,  33,  38,  37,  36,  41,  40,  39,  44,  43,  42,
+    47,  46,  45,  50,  49,  48,  53,  52,  51,  56,  55,  54,  59,  58,  57,
+    62,  61,  60,  65,  64,  63,  68,  67,  66,  71,  70,  69,  74,  73,  72,
+    77,  76,  75,  80,  79,  78,  83,  82,  81,  86,  85,  84,  89,  88,  87,
+    92,  91,  90,  95,  94,  93,  98,  97,  96,  101, 100, 99,  104, 103, 102,
+    107, 106, 105, 110, 109, 108, 113, 112, 111, 116, 115, 114, 119, 118, 117,
+    122, 121, 120, 125, 124, 123, 128, 127, 126, 131, 130, 129, 134, 133, 132,
+    137, 136, 135, 140, 139, 138, 143, 142, 141, 146, 145, 144, 149, 148, 147,
+    152, 151, 150, 155, 154, 153, 158, 157, 156, 161, 160, 159, 164, 163, 162,
+    167, 166, 165, 170, 169, 168, 173, 172, 171, 176, 175, 174, 179, 178, 177,
+    182, 181, 180, 185, 184, 183, 188, 187, 186, 191, 190, 189, 194, 193, 192,
+    197, 196, 195, 200, 199, 198, 203, 202, 201, 206, 205, 204, 209, 208, 207,
+    212, 211, 210, 215, 214, 213, 218, 217, 216, 221, 220, 219, 224, 223, 222,
+    227, 226, 225, 230, 229, 228, 233, 232, 231, 236, 235, 234, 239, 238, 237,
+    242, 241, 240, 245, 244, 243, 248, 247, 246, 251, 250, 249, 254, 253, 252};
+
+void RAWToRGB24Row_SVE2(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
+  // width is in elements, convert to bytes.
+  width *= 3;
+  // we use the mul3 predicate pattern throughout to use the largest multiple
+  // of three number of lanes, for instance with a vector length of 16 bytes
+  // only the first 15 bytes will be used for load/store instructions.
+  uint32_t vl;
+  asm volatile(
+      "cntb    %x[vl], mul3                              \n"
+      "ptrue   p0.b, mul3                                \n"
+      "ld1b    {z31.b}, p0/z, [%[kIndices]]              \n"
+      "subs    %w[width], %w[width], %w[vl]              \n"
+      "b.lt    2f                                        \n"
+
+      // Run bulk of computation with the same predicate to avoid predicate
+      // generation overhead.
+      "1:                                                \n"
+      "ld1b    {z0.b}, p0/z, [%[src]]                    \n"
+      "add     %[src], %[src], %x[vl]                    \n"
+      "tbl     z0.b, {z0.b}, z31.b                       \n"
+      "subs    %w[width], %w[width], %w[vl]              \n"
+      "st1b    {z0.b}, p0, [%[dst]]                      \n"
+      "add     %[dst], %[dst], %x[vl]                    \n"
+      "b.ge    1b                                        \n"
+
+      "2:                                                \n"
+      "adds    %w[width], %w[width], %w[vl]              \n"
+      "b.eq    99f                                       \n"
+
+      // Calculate a predicate for the final iteration to deal with the tail.
+      "whilelt p0.b, wzr, %w[width]                      \n"
+      "ld1b    {z0.b}, p0/z, [%[src]]                    \n"
+      "tbl     z0.b, {z0.b}, z31.b                       \n"
+      "st1b    {z0.b}, p0, [%[dst]]                      \n"
+
+      "99:                                               \n"
+      : [src] "+r"(src_raw),                // %[src]
+        [dst] "+r"(dst_rgb24),              // %[dst]
+        [width] "+r"(width),                // %[width]
+        [vl] "=&r"(vl)                      // %[vl]
+      : [kIndices] "r"(kRAWToRGB24Indices)  // %[kIndices]
+      : "cc", "memory", "z0", "z31", "p0");
+}
+
 #endif  // !defined(LIBYUV_DISABLE_SVE) && defined(__aarch64__)

 #ifdef __cplusplus