[AArch64] Unroll {RAW,RGB24}To{ARGB,RGBA}Row_SVE2

Unrolling gives a nice improvement on the little cores and even a small improvement on the big cores thanks to avoiding the loop control overhead.

Observed performance improvement relative to the existing SVE2 code:

                     | Cortex-A510 | Cortex-A720 | Cortex-X2
RAWToARGBRow_SVE2    |      -28.4% |      -10.1% |     -3.5%
RAWToRGBARow_SVE2    |      -28.5% |      -10.1% |     -4.4%
RGB24ToARGBRow_SVE2  |      -28.5% |      -10.4% |     -5.5%

Bug: libyuv:973
Change-Id: I7aa03fdaa1a24ecfdd13418647a02e5effe8333f
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5725174
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
2026-01-01 03:12:16 +08:00 · 2024-05-03 13:56:29 +01:00 · 2024-05-03 13:56:29 +01:00 · 42d33341d3
commit 42d33341d3
parent 4ad050b5ec
1 changed files with 31 additions and 19 deletions
--- a/source/row_sve.cc
+++ b/source/row_sve.cc
@ -1164,11 +1164,12 @@ static inline void RAWToWXYZRow_SVE2(const uint8_t* src_raw,
  uint32_t vl;
  asm("cntw %x0" : "=r"(vl));
  uint32_t vl_mul3 = vl * 3;
+  uint32_t rem_mul3;
  asm volatile(
      "index   z31.s, %w[idx_start], %w[idx_step]        \n"
      "dup     z30.s, %w[alpha]                          \n"
-      "subs     %w[width], %w[width], %w[vl]             \n"
-      "b.lt     2f                                       \n"
+      "subs    %w[width], %w[width], %w[vl], lsl #1      \n"
+      "b.lt    2f                                        \n"

      // Run bulk of computation with the same predicates to avoid predicate
      // generation overhead. We set up p1 to only load 3/4 of a vector.
@ -1177,37 +1178,48 @@ static inline void RAWToWXYZRow_SVE2(const uint8_t* src_raw,
      "1:                                                \n"
      "ld1b    {z0.b}, p1/z, [%[src]]                    \n"
      "add     %[src], %[src], %x[vl_mul3]               \n"
+      "ld1b    {z1.b}, p1/z, [%[src]]                    \n"
+      "add     %[src], %[src], %x[vl_mul3]               \n"
+      "tbl     z0.b, {z0.b}, z31.b                       \n"
+      "tbl     z1.b, {z1.b}, z31.b                       \n"
+      "subs    %w[width], %w[width], %w[vl], lsl #1      \n"
+      "orr     z0.d, z0.d, z30.d                         \n"
+      "orr     z1.d, z1.d, z30.d                         \n"
+      "st1w    {z0.s}, p0, [%[dst]]                      \n"
+      "st1w    {z1.s}, p0, [%[dst], #1, mul vl]          \n"
+      "incb    %[dst], all, mul #2                       \n"
+      "b.ge    1b                                        \n"
+
+      "2:                                                \n"
+      "adds     %w[width], %w[width], %w[vl], lsl #1     \n"
+      "b.eq     99f                                      \n"
+
+      // Calculate a pair of predicates for the final iteration to deal with
+      // the tail.
+      "3:                                                \n"
+      "add     %w[rem_mul3], %w[width], %w[width], lsl #1 \n"
+      "whilelt p0.s, wzr, %w[width]                      \n"
+      "whilelt p1.b, wzr, %w[rem_mul3]                    \n"
+      "ld1b    {z0.b}, p1/z, [%[src]]                    \n"
+      "add     %[src], %[src], %x[vl_mul3]               \n"
      "tbl     z0.b, {z0.b}, z31.b                       \n"
      "subs    %w[width], %w[width], %w[vl]              \n"
      "orr     z0.d, z0.d, z30.d                         \n"
      "st1w    {z0.s}, p0, [%[dst]]                      \n"
      "incb    %[dst]                                    \n"
-      "b.ge    1b                                        \n"
-
-      "2:                                                \n"
-      "adds     %w[width], %w[width], %w[vl]             \n"
-      "b.eq     99f                                      \n"
-
-      // Calculate a pair of predicates for the final iteration to deal with
-      // the tail.
-      "add     %w[vl_mul3], %w[width], %w[width], lsl #1 \n"
-      "whilelt p0.s, wzr, %w[width]                      \n"
-      "whilelt p1.b, wzr, %w[vl_mul3]                    \n"
-      "ld1b    {z0.b}, p1/z, [%[src]]                    \n"
-      "tbl     z0.b, {z0.b}, z31.b                       \n"
-      "orr     z0.d, z0.d, z30.d                         \n"
-      "st1w    {z0.s}, p0, [%[dst]]                      \n"
+      "b.gt    3b                                        \n"

      "99:                                               \n"
      : [src] "+r"(src_raw),         // %[src]
        [dst] "+r"(dst_wxyz),        // %[dst]
        [width] "+r"(width),         // %[width]
-        [vl_mul3] "+r"(vl_mul3)      // %[vl_mul3]
+        [vl_mul3] "+r"(vl_mul3),     // %[vl_mul3]
+        [rem_mul3] "=&r"(rem_mul3)   // %[rem_mul3]
      : [idx_start] "r"(idx_start),  // %[idx_start]
        [idx_step] "r"(idx_step),    // %[idx_step]
        [alpha] "r"(alpha),          // %[alpha]
        [vl] "r"(vl)                 // %[vl]
-      : "cc", "memory", "z0", "z30", "z31", "p0", "p1");
+      : "cc", "memory", "z0", "z1", "z30", "z31", "p0", "p1");
 }

 void RAWToARGBRow_SVE2(const uint8_t* src_raw, uint8_t* dst_argb, int width) {