[AArch64] Unroll I422ToAR30Row_{SVE2,SME}

The existing STOREAR30_SVE macro works fine for out of order cores, however for in-order cores the number of dependent vector instructions laid out consecutively impacts performance. We can improve this by unrolling the loop to process two sets of vectors at a time, allowing little cores to process two independent streams of vector instructions at the same time to improve performance. Using one set of ZIP instructions at the end allows us to (a) avoid ST4 which we know is slow on some micro-architectures, and (b) enable the use of predication and avoid the need for separate "any" kernels. Reduction in run times of I422ToAR30Row_SVE2 observed compared to the previous SVE2 implementation: Cortex-A510: -37.7% Cortex-A520: -38.8% Cortex-A710: -14.8% Cortex-A715: -17.1% Cortex-A720: -16.9% Cortex-X2: -10.3% Cortex-X3: -6.7% Cortex-X4: -9.4% Cortex-X925: -7.1% Change-Id: I160fb41300d2d08fce2e6eb92181324fd723a02d Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6632916 Reviewed-by: Frank Barchard <fbarchard@chromium.org> Reviewed-by: Justin Green <greenjustin@google.com>
2026-04-30 19:09:18 +08:00 · 2025-06-04 10:47:48 +01:00 · 2025-06-04 10:47:48 +01:00 · 867bdc51ed
commit 867bdc51ed
parent 843cda7e7b
1 changed files with 38 additions and 9 deletions
--- a/include/libyuv/row_sve.h
+++ b/include/libyuv/row_sve.h
@ -342,6 +342,7 @@ extern "C" {
 // rather than needing a pair of shifts to saturate and then insert into the
 // correct position in the lane.
 #define STOREAR30_SVE                                                    \
+  /* Inputs: B: z16.h,  G: z17.h,  R: z18.h */                           \
  "uqshl    z16.h, p0/m, z16.h, #2            \n" /* bbbbbbbbbbxxxxxx */ \
  "uqshl    z17.h, p0/m, z17.h, #2            \n" /* ggggggggggxxxxxx */ \
  "umin     z18.h, p0/m, z18.h, z23.h         \n" /* 00rrrrrrrrrrxxxx */ \
@ -352,6 +353,31 @@ extern "C" {
  "st2h     {z17.h, z18.h}, p1, [%[dst_ar30]] \n"                        \
  "incb     %[dst_ar30], all, mul #2          \n"

+#define STOREAR30_SVE_2X                                                 \
+  /* Inputs: B: z16.h,  G: z17.h,  R: z18.h */                           \
+  /*         B: z20.h,  G: z21.h,  R: z22.h */                           \
+  "uqshl    z16.h, p0/m, z16.h, #2            \n" /* bbbbbbbbbbxxxxxx */ \
+  "uqshl    z20.h, p0/m, z20.h, #2            \n" /* bbbbbbbbbbxxxxxx */ \
+  "uqshl    z17.h, p0/m, z17.h, #2            \n" /* ggggggggggxxxxxx */ \
+  "uqshl    z21.h, p0/m, z21.h, #2            \n" /* ggggggggggxxxxxx */ \
+  "umin     z18.h, p0/m, z18.h, z23.h         \n" /* 00rrrrrrrrrrxxxx */ \
+  "umin     z22.h, p0/m, z22.h, z23.h         \n" /* 00rrrrrrrrrrxxxx */ \
+  "orr      z18.h, z18.h, #0xc000             \n" /* 11rrrrrrrrrrxxxx */ \
+  "orr      z22.h, z22.h, #0xc000             \n" /* 11rrrrrrrrrrxxxx */ \
+  "sri      z18.h, z17.h, #12                 \n" /* 11rrrrrrrrrrgggg */ \
+  "sri      z22.h, z21.h, #12                 \n" /* 11rrrrrrrrrrgggg */ \
+  "lsl      z17.h, z17.h, #4                  \n" /* ggggggxxxxxx0000 */ \
+  "lsl      z19.h, z21.h, #4                  \n" /* ggggggxxxxxx0000 */ \
+  "sri      z17.h, z16.h, #6                  \n" /* ggggggbbbbbbbbbb */ \
+  "sri      z19.h, z20.h, #6                  \n" /* ggggggbbbbbbbbbb */ \
+  "zip2     z16.h, z17.h, z19.h               \n"                        \
+  "zip1     z21.h, z17.h, z19.h               \n"                        \
+  "zip2     z17.h, z18.h, z22.h               \n"                        \
+  "zip1     z22.h, z18.h, z22.h               \n"                        \
+  "st2h     {z21.h, z22.h}, p2, [%[dst_ar30]] \n"                        \
+  "st2h     {z16.h, z17.h}, p3, [%[dst_ar30], #2, mul vl] \n"            \
+  "incb     %[dst_ar30], all, mul #4          \n"
+
 #define YUVTORGB_SVE_REGS                                                     \
  "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z16", "z17", "z18", "z19", \
      "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29",   \
@ -776,32 +802,35 @@ static inline void I422ToAR30Row_SVE_SC(const uint8_t* src_y,
                                        const struct YuvConstants* yuvconstants,
                                        int width) STREAMING_COMPATIBLE {
  uint64_t vl;
-  // The limit is used for saturating the 2.14 red channel in STOREAR30_SVE.
+  // The limit is used for saturating the 2.14 red channel in STOREAR30_SVE_2X.
  const uint16_t limit = 0x3ff0;
  asm volatile(
      "cnth     %[vl]                                   \n"
      "ptrue    p0.b                                    \n"  //
      YUVTORGB_SVE_SETUP
-      "dup      z19.b, #255                             \n"  // Alpha
      "dup      z23.h, %w[limit]                        \n"
-      "subs     %w[width], %w[width], %w[vl]            \n"
+      "subs     %w[width], %w[width], %w[vl], lsl #1    \n"
      "b.le     2f                                      \n"

      // Run bulk of computation with an all-true predicate to avoid predicate
      // generation overhead.
-      "ptrue    p1.h                                    \n"
+      "ptrue    p1.b                                    \n"
+      "ptrue    p2.b                                    \n"
+      "ptrue    p3.b                                    \n"
      "1:                                               \n"  //
-      READYUV422_SVE I4XXTORGB_SVE STOREAR30_SVE
-      "subs     %w[width], %w[width], %w[vl]            \n"
+      READYUV422_SVE_2X I422TORGB_SVE_2X STOREAR30_SVE_2X
+      "subs     %w[width], %w[width], %w[vl], lsl #1    \n"
      "b.gt     1b                                      \n"

      // Calculate a predicate for the final iteration to deal with the tail.
      "2:                                               \n"
-      "adds    %w[width], %w[width], %w[vl]             \n"
+      "adds    %w[width], %w[width], %w[vl], lsl #1     \n"
      "b.eq    99f                                      \n"

-      "whilelt  p1.h, wzr, %w[width]                    \n"  //
-      READYUV422_SVE I4XXTORGB_SVE STOREAR30_SVE
+      "whilelt  p1.b, wzr, %w[width]                    \n"
+      "whilelt  p2.h, wzr, %w[width]                    \n"
+      "whilelt  p3.h, %w[vl], %w[width]                 \n"  //
+      READYUV422_SVE_2X I422TORGB_SVE_2X STOREAR30_SVE_2X

      "99:                                              \n"
      : [src_y] "+r"(src_y),                                // %[src_y]