[AArch64] Use full vectors in ARGB4444To{Y,UV}Row_NEON

The existing ARGB4444TORGB macro only makes use of 64-bit wide vectors rather than the full 128 bits available, so unroll it to allow us to process more data per instruction.

For ARGB4444ToUVRow_NEON we already have enough data available each iteration to make use of full vectors, but for ARGB4444ToYRow_NEON we also need to adjust the "any" kernel to allow us to process 16 elements per iteration.

Reduction in runtimes observed compared to the existing Neon kernels:

             | ARGB4444ToUVRow | ARGB4444ToYRow
Cortex-A55   |          -27.8% |         -34.6%
Cortex-A510  |          -37.0% |         -44.4%
Cortex-A76   |          -40.2% |         -22.0%
Cortex-A720  |          -33.4% |         -35.5%
Cortex-X1    |          -34.1% |         -19.7%
Cortex-X2    |          -32.1% |         -26.3%

Bug: libyuv:976
Change-Id: I08f6286bab0ebf5e24d5d5803f8c45ec6ba776ee
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5631541
Reviewed-by: Justin Green <greenjustin@google.com>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
2025-12-07 17:26:49 +08:00 · 2024-04-18 08:58:29 +01:00 · 2024-04-18 08:58:29 +01:00 · c1fe5663f5
commit c1fe5663f5
parent 5bac99fe09
2 changed files with 53 additions and 58 deletions
--- a/source/row_any.cc
+++ b/source/row_any.cc
@ -1311,8 +1311,12 @@ ANY11(ARGB1555ToYRow_Any_LSX, ARGB1555ToYRow_LSX, 0, 2, 1, 15)
 ANY11(ARGB1555ToYRow_Any_LASX, ARGB1555ToYRow_LASX, 0, 2, 1, 31)
 #endif
 #ifdef HAS_ARGB4444TOYROW_NEON
 #ifdef __aarch64__
 ANY11(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 0, 2, 1, 15)
 #else
 ANY11(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 0, 2, 1, 7)
 #endif
 #endif
 #ifdef HAS_YUY2TOYROW_NEON
 ANY11(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 1, 4, 1, 15)
 #endif
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@ -2134,14 +2134,15 @@ void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
  "sri        v1.16b, v1.16b, #4  \n" /* AAAAAAAA_GGGGGGGG */ \
  "sri        v0.16b, v0.16b, #4  \n" /* RRRRRRRR_BBBBBBBB */
-#define ARGB4444TORGB                                \
+#define ARGB4444TORGB                                    \
-  /* Input: v0.8h = xxxxRRRRGGGGBBBB */              \
+  /* Input: v0.8h = xxxxRRRRGGGGBBBB */                  \
-  "xtn        v1.8b, v0.8h        \n" /* GGGGBBBB */ \
+  "uzp1       v1.16b, v0.16b, v3.16b  \n" /* GGGGBBBB */ \
-  "shrn       v2.8b, v0.8h, #4    \n" /* RRRRxxxx */ \
+  "shrn       v2.8b, v0.8h, #4        \n" /* RRRRxxxx */ \
-  "shl        v0.8b, v1.8b, #4    \n" /* BBBB0000 */ \
+  "shl        v0.16b, v1.16b, #4      \n" /* BBBB0000 */ \
-  "sri        v1.8b, v1.8b, #4    \n" /* GGGGGGGG */ \
+  "shrn2      v2.16b, v3.8h, #4       \n" /* RRRRxxxx */ \
-  "sri        v2.8b, v2.8b, #4    \n" /* RRRRRRRR */ \
+  "sri        v1.16b, v1.16b, #4      \n" /* GGGGGGGG */ \
-  "sri        v0.8b, v0.8b, #4    \n" /* BBBBBBBB */
+  "sri        v2.16b, v2.16b, #4      \n" /* RRRRRRRR */ \
  "sri        v0.16b, v0.16b, #4      \n" /* BBBBBBBB */
 void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
                            uint8_t* dst_argb,
@ -3331,46 +3332,32 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
                          uint8_t* dst_v,
                          int width) {
  const uint8_t* src_argb4444_1 = src_argb4444 + src_stride_argb4444;
-  asm volatile (
+  asm volatile(
      RGBTOUV_SETUP_REG  // sets v20-v25
-      "1:                                        \n"
+      "1:                                    \n"
-      "ld1         {v0.16b}, [%0], #16           \n"  // load 8 ARGB4444 pixels.
+      "ldp         q0, q3, [%0], #32         \n"  // load 16 ARGB4444 pixels.
      ARGB4444TORGB
-      "uaddlp      v16.4h, v0.8b                 \n"  // B 8 bytes -> 4 shorts.
+      "uaddlp      v16.8h, v0.16b            \n"  // B 16 bytes -> 8 shorts.
-      "prfm        pldl1keep, [%0, 448]          \n"
+      "prfm        pldl1keep, [%0, 448]      \n"
-      "uaddlp      v17.4h, v1.8b                 \n"  // G 8 bytes -> 4 shorts.
+      "uaddlp      v17.8h, v1.16b            \n"  // G 16 bytes -> 8 shorts.
-      "uaddlp      v18.4h, v2.8b                 \n"  // R 8 bytes -> 4 shorts.
+      "uaddlp      v18.8h, v2.16b            \n"  // R 16 bytes -> 8 shorts.
-      "ld1         {v0.16b}, [%0], #16           \n"  // next 8 ARGB4444 pixels.
+
      "ldp         q0, q3, [%1], #32         \n"  // load 16 ARGB4444 pixels.
      ARGB4444TORGB
-      "uaddlp      v26.4h, v0.8b                 \n"  // B 8 bytes -> 4 shorts.
+      "uadalp      v16.8h, v0.16b            \n"  // B 16 bytes -> 8 shorts.
-      "uaddlp      v27.4h, v1.8b                 \n"  // G 8 bytes -> 4 shorts.
+      "prfm        pldl1keep, [%1, 448]      \n"
-      "uaddlp      v28.4h, v2.8b                 \n"  // R 8 bytes -> 4 shorts.
+      "uadalp      v17.8h, v1.16b            \n"  // G 16 bytes -> 8 shorts.
      "uadalp      v18.8h, v2.16b            \n"  // R 16 bytes -> 8 shorts.
-      "ld1         {v0.16b}, [%1], #16           \n"  // load 8 ARGB4444 pixels.
+      "urshr       v0.8h, v16.8h, #1         \n"  // 2x average
-      ARGB4444TORGB
+      "urshr       v1.8h, v17.8h, #1         \n"
-      "uadalp      v16.4h, v0.8b                 \n"  // B 8 bytes -> 4 shorts.
+      "urshr       v2.8h, v18.8h, #1         \n"
      "prfm        pldl1keep, [%1, 448]          \n"
      "uadalp      v17.4h, v1.8b                 \n"  // G 8 bytes -> 4 shorts.
      "uadalp      v18.4h, v2.8b                 \n"  // R 8 bytes -> 4 shorts.
      "ld1         {v0.16b}, [%1], #16           \n"  // next 8 ARGB4444 pixels.
      ARGB4444TORGB
      "uadalp      v26.4h, v0.8b                 \n"  // B 8 bytes -> 4 shorts.
      "uadalp      v27.4h, v1.8b                 \n"  // G 8 bytes -> 4 shorts.
      "uadalp      v28.4h, v2.8b                 \n"  // R 8 bytes -> 4 shorts.
-      "ins         v16.D[1], v26.D[0]            \n"
+      "subs        %w4, %w4, #16             \n"  // 16 processed per loop.
      "ins         v17.D[1], v27.D[0]            \n"
      "ins         v18.D[1], v28.D[0]            \n"
      "urshr       v0.8h, v16.8h, #1             \n"  // 2x average
      "urshr       v1.8h, v17.8h, #1             \n"
      "urshr       v2.8h, v18.8h, #1             \n"
      "subs        %w4, %w4, #16                 \n"  // 16 processed per loop.
      RGBTOUV(v0.8h, v1.8h, v2.8h)
-      "st1         {v0.8b}, [%2], #8             \n"  // store 8 pixels U.
+      "st1         {v0.8b}, [%2], #8         \n"  // store 8 pixels U.
-      "st1         {v1.8b}, [%3], #8             \n"  // store 8 pixels V.
+      "st1         {v1.8b}, [%3], #8         \n"  // store 8 pixels V.
-      "b.gt        1b                            \n"
+      "b.gt        1b                        \n"
      : "+r"(src_argb4444),    // %0
        "+r"(src_argb4444_1),  // %1
        "+r"(dst_u),           // %2
@ -3445,23 +3432,27 @@ void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
 void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
                         uint8_t* dst_y,
                         int width) {
-  asm volatile (
+  asm volatile(
-      "movi        v24.8b, #25                   \n"  // B * 0.1016 coefficient
+      "movi        v24.16b, #25              \n"  // B * 0.1016 coefficient
-      "movi        v25.8b, #129                  \n"  // G * 0.5078 coefficient
+      "movi        v25.16b, #129             \n"  // G * 0.5078 coefficient
-      "movi        v26.8b, #66                   \n"  // R * 0.2578 coefficient
+      "movi        v26.16b, #66              \n"  // R * 0.2578 coefficient
-      "movi        v27.8b, #16                   \n"  // Add 16 constant
+      "movi        v27.16b, #16              \n"  // Add 16 constant
-      "1:                                        \n"
+      "1:                                    \n"
-      "ld1         {v0.16b}, [%0], #16           \n"  // load 8 ARGB4444 pixels.
+      "ldp         q0, q3, [%0], #32         \n"  // load 16 ARGB4444 pixels.
-      "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
+      "subs        %w2, %w2, #16             \n"  // 16 processed per loop.
      ARGB4444TORGB
-      "umull       v3.8h, v0.8b, v24.8b          \n"  // B
+      "umull       v16.8h, v0.8b, v24.8b     \n"  // B
-      "prfm        pldl1keep, [%0, 448]          \n"
+      "umull2      v17.8h, v0.16b, v24.16b   \n"  // B
-      "umlal       v3.8h, v1.8b, v25.8b          \n"  // G
+      "prfm        pldl1keep, [%0, 448]      \n"
-      "umlal       v3.8h, v2.8b, v26.8b          \n"  // R
+      "umlal       v16.8h, v1.8b, v25.8b     \n"  // G
-      "uqrshrn     v0.8b, v3.8h, #8              \n"  // 16 bit to 8 bit Y
+      "umlal2      v17.8h, v1.16b, v25.16b   \n"  // G
-      "uqadd       v0.8b, v0.8b, v27.8b          \n"
+      "umlal       v16.8h, v2.8b, v26.8b     \n"  // R
-      "st1         {v0.8b}, [%1], #8             \n"  // store 8 pixels Y.
+      "umlal2      v17.8h, v2.16b, v26.16b   \n"  // R
-      "b.gt        1b                            \n"
+      "uqrshrn     v0.8b, v16.8h, #8         \n"  // 16 bit to 8 bit Y
      "uqrshrn2    v0.16b, v17.8h, #8        \n"  // 16 bit to 8 bit Y
      "uqadd       v0.16b, v0.16b, v27.16b   \n"
      "str         q0, [%1], #16             \n"  // store 16 pixels Y.
      "b.gt        1b                        \n"
      : "+r"(src_argb4444),  // %0
        "+r"(dst_y),         // %1
        "+r"(width)          // %2