[AArch64] Improve ARGBToUVRow_SVE2 and related kernels

This commit reworks the implementation of ARGBToUVMatrixRow_SVE2, using
an approach similar to that recently used in
61bdaee13a701d2b52c6dc943ccc5c888077a591.

In particular we can rework these SVE2 implementations to use 8-bit
dot-product instructions instead of 16-bit, allowing us to process more
data in a single vector.

To ensure that the input values fit in 8 bits, negate the UV constant
arrays passed to the kernel and undo the now-unnecessary flipping of the
middle two component values.

This commit mostly reverses the performance inversion where the Neon
I8MM implementation was previously faster than the SVE2 implementation.
The reduction in runtime observed compared to the existing Neon I8MM
implementation is now:

Cortex-A510:  +5.6% (!)
Cortex-A520:  -3.0%
Cortex-A710: -12.6%
Cortex-A715: -10.9%
Cortex-A720: -10.8%
  Cortex-X2:  -3.8%
  Cortex-X3: -10.3%
  Cortex-X4:  -9.5%
Cortex-X925:  -6.7%

Change-Id: I30253976dc8e3651cfb5fd39b63a6763975d41e3
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6640990
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Justin Green <greenjustin@google.com>
This commit is contained in:
George Steed 2025-03-26 19:44:24 +00:00 committed by Frank Barchard
parent 1b2f6cdbe8
commit 3d66e94fb5

View File

@ -217,9 +217,7 @@ void NV21ToRGB24Row_SVE2(const uint8_t* src_y,
NV21ToRGB24Row_SVE_SC(src_y, src_vu, dst_rgb24, yuvconstants, width);
}
// Dot-product constants are stored as four-tuples with the two innermost
// elements flipped to account for the interleaving nature of the widening
// addition instructions.
// SVE constants are stored negated such that we can store 128 in int8_t.
// RGB to BT601 coefficients
// UB 0.875 coefficient = 112
@ -229,25 +227,24 @@ void NV21ToRGB24Row_SVE2(const uint8_t* src_y,
// VG -0.7344 coefficient = -94
// VR 0.875 coefficient = 112
// SVE constants are not negated
static const int16_t kARGBToUVCoefficients[] = {
// UB, -UR, -UG, 0, -VB, VR, -VG, 0
112, -38, -74, 0, -18, 112, -94, 0,
static const int8_t kARGBToUVCoefficients[] = {
// -UB, -UG, -UR, 0, -VB, -VG, -VR, 0
-112, 74, 38, 0, 18, 94, -112, 0,
};
static const int16_t kRGBAToUVCoefficients[] = {
// 0, -UG, UB, -UR, 0, -VG, -VB, VR
0, -74, 112, -38, 0, -94, -18, 112,
static const int8_t kABGRToUVCoefficients[] = {
// -UR, -UG, -UB, 0, -VR, -VG, -VB, 0
38, 74, -112, 0, -112, 94, 18, 0,
};
static const int16_t kBGRAToUVCoefficients[] = {
// 0, -UG, -UR, UB, 0, -VG, VR, -VB
0, -74, -38, 112, 0, -94, 112, -18,
static const int8_t kBGRAToUVCoefficients[] = {
// 0, -UR, -UG, -UB, 0, -VR, -VG, -VB
0, 38, 74, -112, 0, -112, 94, 18,
};
static const int16_t kABGRToUVCoefficients[] = {
// -UR, UB, -UG, 0, VR, -VB, -VG, 0
-38, 112, -74, 0, 112, -18, -94, 0,
static const int8_t kRGBAToUVCoefficients[] = {
// 0, -UB, -UG, -UR, 0, -VB, -VG, -VR
0, -112, 74, 38, 0, 18, 94, -112,
};
// RGB to JPEG coefficients
@ -258,169 +255,138 @@ static const int16_t kABGRToUVCoefficients[] = {
// VG -0.41869 coefficient = -107
// VR 0.500 coefficient = 128
static const int16_t kARGBToUVJCoefficients[] = {
// UB, -UR, -UG, 0, -VB, VR, -VG, 0
128, -43, -85, 0, -21, 128, -107, 0,
static const int8_t kARGBToUVJCoefficients[] = {
// -UB, -UG, -UR, 0, -VB, -VG, -VR, 0
-128, 85, 43, 0, 21, 107, -128, 0,
};
static const int16_t kABGRToUVJCoefficients[] = {
// -UR, UB, -UG, 0, VR, -VB, -VG, 0
-43, 128, -85, 0, 128, -21, -107, 0,
static const int8_t kABGRToUVJCoefficients[] = {
// -UR, -UG, -UB, 0, -VR, -VG, -VB, 0
43, 85, -128, 0, -128, 107, 21, 0,
};
#define ABCDTOUVMATRIX_SVE \
"ld1d {z0.d}, p1/z, [%[src0]] \n" /* ABCD(bgra) */ \
"ld1d {z1.d}, p2/z, [%[src0], #1, mul vl] \n" /* EFGH(bgra) */ \
"ld1d {z2.d}, p3/z, [%[src0], #2, mul vl] \n" /* IJKL(bgra) */ \
"ld1d {z3.d}, p4/z, [%[src0], #3, mul vl] \n" /* MNOP(bgra) */ \
"ld1d {z4.d}, p1/z, [%[src1]] \n" /* ABCD(bgra) */ \
"ld1d {z5.d}, p2/z, [%[src1], #1, mul vl] \n" /* EFGH(bgra) */ \
"ld1d {z6.d}, p3/z, [%[src1], #2, mul vl] \n" /* IJKL(bgra) */ \
"ld1d {z7.d}, p4/z, [%[src1], #3, mul vl] \n" /* MNOP(bgra) */ \
"incb %[src0], all, mul #4 \n" \
"incb %[src1], all, mul #4 \n" \
\
"uaddlb z16.h, z0.b, z4.b \n" /* ABCD(br) */ \
"uaddlb z18.h, z1.b, z5.b \n" /* EFGH(br) */ \
"uaddlb z20.h, z2.b, z6.b \n" /* IJKL(br) */ \
"uaddlb z22.h, z3.b, z7.b \n" /* MNOP(br) */ \
"uaddlt z17.h, z0.b, z4.b \n" /* ABCD(ga) */ \
"uaddlt z19.h, z1.b, z5.b \n" /* EFGH(ga) */ \
"uaddlt z21.h, z2.b, z6.b \n" /* IJKL(ga) */ \
"uaddlt z23.h, z3.b, z7.b \n" /* MNOP(ga) */ \
\
/* Use ADDP on 32-bit elements to add adjacent pairs of 9-bit unsigned */ \
"addp z16.s, p0/m, z16.s, z18.s \n" /* ABEFCDGH(br) */ \
"addp z17.s, p0/m, z17.s, z19.s \n" /* ABEFCDGH(ga) */ \
"addp z20.s, p0/m, z20.s, z22.s \n" /* IJMNKLOP(br) */ \
"addp z21.s, p0/m, z21.s, z23.s \n" /* IJMNKLOP(ga) */ \
\
"rshrnb z0.b, z16.h, #2 \n" /* ABEFCDGH(b0r0) */ \
"rshrnb z1.b, z20.h, #2 \n" /* IJMNKLOP(b0r0) */ \
"rshrnt z0.b, z17.h, #2 \n" /* ABEFCDGH(bgra) */ \
"rshrnt z1.b, z21.h, #2 \n" /* IJMNKLOP(bgra) */ \
\
"tbl z0.s, {z0.s}, z27.s \n" /* ABCDEFGH */ \
"tbl z1.s, {z1.s}, z27.s \n" /* IJKLMNOP */ \
\
"subs %w[width], %w[width], %w[vl], lsl #2 \n" /* VL per loop */ \
\
"movi v16.8h, #0 \n" \
"movi v17.8h, #0 \n" \
"movi v20.8h, #0 \n" \
"movi v21.8h, #0 \n" \
\
"usdot z16.s, z0.b, z24.b \n" \
"usdot z17.s, z1.b, z24.b \n" \
"usdot z20.s, z0.b, z25.b \n" \
"usdot z21.s, z1.b, z25.b \n" \
\
"subhnb z16.b, z26.h, z16.h \n" /* U */ \
"subhnb z20.b, z26.h, z20.h \n" /* V */ \
"subhnb z17.b, z26.h, z17.h \n" /* U */ \
"subhnb z21.b, z26.h, z21.h \n" /* V */ \
\
"uzp1 z16.h, z16.h, z17.h \n" \
"uzp1 z20.h, z20.h, z21.h \n" \
\
"st1b {z16.h}, p5, [%[dst_u]] \n" /* U */ \
"st1b {z20.h}, p5, [%[dst_v]] \n" /* V */ \
"inch %[dst_u] \n" \
"inch %[dst_v] \n"
static void ARGBToUVMatrixRow_SVE2(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width,
const int16_t* uvconstants) {
const int8_t* uvconstants) {
const uint8_t* src_argb_1 = src_argb + src_stride_argb;
uint64_t vl;
asm("cntd %x0" : "=r"(vl));
// Width is a multiple of two here, so halve it.
width >>= 1;
asm volatile(
"ptrue p0.b \n"
"ld1rd {z24.d}, p0/z, [%[uvconstants]] \n"
"ld1rd {z25.d}, p0/z, [%[uvconstants], #8] \n"
"mov z26.h, #0x8000 \n" // 128.0 (0x8000)
"cntb %[vl] \n"
"subs %w[width], %w[width], %w[vl] \n"
"b.lt 2f \n"
"ptrue p0.b \n"
"ld1rw {z24.s}, p0/z, [%[uvconstants]] \n"
"ld1rw {z25.s}, p0/z, [%[uvconstants], #4] \n"
"mov z26.h, #0x8000 \n" // 128.0 (0x8000)
// Process 4x vectors from each input row per iteration.
// Cannot use predication here due to unrolling.
"1: \n" // e.g.
"ld1b {z0.b}, p0/z, [%[src0], #0, mul vl] \n" // bgrabgra
"ld1b {z4.b}, p0/z, [%[src1], #0, mul vl] \n" // bgrabgra
"ld1b {z1.b}, p0/z, [%[src0], #1, mul vl] \n" // bgrabgra
"ld1b {z5.b}, p0/z, [%[src1], #1, mul vl] \n" // bgrabgra
"ld1b {z2.b}, p0/z, [%[src0], #2, mul vl] \n" // bgrabgra
"ld1b {z6.b}, p0/z, [%[src1], #2, mul vl] \n" // bgrabgra
"ld1b {z3.b}, p0/z, [%[src0], #3, mul vl] \n" // bgrabgra
"ld1b {z7.b}, p0/z, [%[src1], #3, mul vl] \n" // bgrabgra
"incb %[src0], all, mul #4 \n"
"incb %[src1], all, mul #4 \n"
// Generate some TBL indices to undo the interleaving from ADDP.
"index z0.s, #0, #1 \n"
"index z1.s, #1, #1 \n"
"uzp1 z27.s, z0.s, z1.s \n"
"uaddlb z16.h, z0.b, z4.b \n" // brbrbrbr
"uaddlt z17.h, z0.b, z4.b \n" // gagagaga
"uaddlb z18.h, z1.b, z5.b \n" // brbrbrbr
"uaddlt z19.h, z1.b, z5.b \n" // gagagaga
"uaddlb z20.h, z2.b, z6.b \n" // brbrbrbr
"uaddlt z21.h, z2.b, z6.b \n" // gagagaga
"uaddlb z22.h, z3.b, z7.b \n" // brbrbrbr
"uaddlt z23.h, z3.b, z7.b \n" // gagagaga
"subs %w[width], %w[width], %w[vl], lsl #2 \n"
"b.lt 2f \n"
"trn1 z0.s, z16.s, z17.s \n" // brgabgra
"trn2 z1.s, z16.s, z17.s \n" // brgabgra
"trn1 z2.s, z18.s, z19.s \n" // brgabgra
"trn2 z3.s, z18.s, z19.s \n" // brgabgra
"trn1 z4.s, z20.s, z21.s \n" // brgabgra
"trn2 z5.s, z20.s, z21.s \n" // brgabgra
"trn1 z6.s, z22.s, z23.s \n" // brgabgra
"trn2 z7.s, z22.s, z23.s \n" // brgabgra
"ptrue p1.d \n"
"ptrue p2.d \n"
"ptrue p3.d \n"
"ptrue p4.d \n"
"ptrue p5.h \n"
"1: \n" //
ABCDTOUVMATRIX_SVE
"b.gt 1b \n"
"subs %w[width], %w[width], %w[vl] \n" // 4*VL per loop
"2: \n"
"adds %w[width], %w[width], %w[vl], lsl #2 \n"
"b.eq 99f \n"
"add z0.h, p0/m, z0.h, z1.h \n" // brgabrga
"add z2.h, p0/m, z2.h, z3.h \n" // brgabrga
"add z4.h, p0/m, z4.h, z5.h \n" // brgabrga
"add z6.h, p0/m, z6.h, z7.h \n" // brgabrga
"3: \n"
"whilelt p1.d, wzr, %w[width] \n"
"whilelt p2.d, %w[vl], %w[width] \n"
"whilelt p3.d, %w[vl2], %w[width] \n"
"whilelt p4.d, %w[vl3], %w[width] \n"
"whilelt p5.h, wzr, %w[width] \n" //
ABCDTOUVMATRIX_SVE
"b.gt 3b \n"
"urshr z0.h, p0/m, z0.h, #2 \n" // brgabrga
"urshr z2.h, p0/m, z2.h, #2 \n" // brgabrga
"urshr z4.h, p0/m, z4.h, #2 \n" // brgabrga
"urshr z6.h, p0/m, z6.h, #2 \n" // brgabrga
"movi v16.8h, #0 \n"
"movi v17.8h, #0 \n"
"movi v18.8h, #0 \n"
"movi v19.8h, #0 \n"
"movi v20.8h, #0 \n"
"movi v21.8h, #0 \n"
"movi v22.8h, #0 \n"
"movi v23.8h, #0 \n"
"sdot z16.d, z0.h, z24.h \n" // UUxxxxxx
"sdot z17.d, z2.h, z24.h \n" // UUxxxxxx
"sdot z18.d, z4.h, z24.h \n" // UUxxxxxx
"sdot z19.d, z6.h, z24.h \n" // UUxxxxxx
"sdot z20.d, z0.h, z25.h \n" // VVxxxxxx
"sdot z21.d, z2.h, z25.h \n" // VVxxxxxx
"sdot z22.d, z4.h, z25.h \n" // VVxxxxxx
"sdot z23.d, z6.h, z25.h \n" // VVxxxxxx
"uzp1 z16.s, z16.s, z17.s \n" // UUxx
"uzp1 z18.s, z18.s, z19.s \n" // UUxx
"uzp1 z20.s, z20.s, z21.s \n" // VVxx
"uzp1 z22.s, z22.s, z23.s \n" // VVxx
"uzp1 z16.h, z16.h, z18.h \n" // UU
"uzp1 z20.h, z20.h, z22.h \n" // VV
"addhnb z16.b, z16.h, z26.h \n" // U
"addhnb z20.b, z20.h, z26.h \n" // V
"st1b {z16.h}, p0, [%[dst_u]] \n" // U
"st1b {z20.h}, p0, [%[dst_v]] \n" // V
"inch %[dst_u] \n"
"inch %[dst_v] \n"
"b.ge 1b \n"
"2: \n"
"adds %w[width], %w[width], %w[vl] \n" // VL per loop
"b.le 99f \n"
// Process remaining pixels from each input row.
// Use predication to do one vector from each input array, so may loop up
// to three iterations.
"cntw %x[vl] \n"
"3: \n"
"whilelt p1.s, wzr, %w[width] \n"
"ld1d {z0.d}, p1/z, [%[src0]] \n" // bgrabgra
"ld1d {z4.d}, p1/z, [%[src1]] \n" // bgrabgra
"incb %[src0] \n"
"incb %[src1] \n"
"uaddlb z16.h, z0.b, z4.b \n" // brbrbrbr
"uaddlt z17.h, z0.b, z4.b \n" // gagagaga
"trn1 z0.s, z16.s, z17.s \n" // brgabgra
"trn2 z1.s, z16.s, z17.s \n" // brgabgra
"add z0.h, p0/m, z0.h, z1.h \n" // brgabrga
"urshr z0.h, p0/m, z0.h, #2 \n" // brgabrga
"subs %w[width], %w[width], %w[vl] \n" // VL per loop
"movi v16.8h, #0 \n"
"movi v20.8h, #0 \n"
"sdot z16.d, z0.h, z24.h \n"
"sdot z20.d, z0.h, z25.h \n"
"addhnb z16.b, z16.h, z26.h \n" // U
"addhnb z20.b, z20.h, z26.h \n" // V
"st1b {z16.d}, p0, [%[dst_u]] \n" // U
"st1b {z20.d}, p0, [%[dst_v]] \n" // V
"incd %[dst_u] \n"
"incd %[dst_v] \n"
"b.gt 3b \n"
"99: \n"
: [src0] "+r"(src_argb), // %[src0]
[src1] "+r"(src_argb_1), // %[src1]
[dst_u] "+r"(dst_u), // %[dst_u]
[dst_v] "+r"(dst_v), // %[dst_v]
[width] "+r"(width), // %[width]
[vl] "=&r"(vl) // %[vl]
: [uvconstants] "r"(uvconstants)
"99: \n"
: [src0] "+r"(src_argb), // %[src0]
[src1] "+r"(src_argb_1), // %[src1]
[dst_u] "+r"(dst_u), // %[dst_u]
[dst_v] "+r"(dst_v), // %[dst_v]
[width] "+r"(width) // %[width]
: [uvconstants] "r"(uvconstants), // %[uvconstants]
[vl] "r"(vl), // %[vl]
[vl2] "r"(vl * 2), // %[vl2]
[vl3] "r"(vl * 3) // %[vl3]
: "cc", "memory", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z16",
"z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26",
"p0");
"z27", "p0", "p1", "p2", "p3", "p4", "p5");
}
void ARGBToUVRow_SVE2(const uint8_t* src_argb,