[AArch64] Use full vectors in ARGB4444To{Y,UV}Row_NEON

The existing ARGB4444TORGB macro only makes use of 64-bit-wide vectors rather than the full 128 bits available, so unroll it to allow us to process more data per instruction.

For ARGB4444ToUVRow_NEON we already have enough data available each iteration to make use of full vectors, but for ARGB4444ToYRow_NEON we also need to adjust the "any" kernel to allow us to process 16 elements per iteration.

Reduction in runtimes observed compared to the existing Neon kernels:

            | ARGB4444ToUVRow | ARGB4444ToYRow
Cortex-A55  |          -27.8% |         -34.6%
Cortex-A510 |          -37.0% |         -44.4%
Cortex-A76  |          -40.2% |         -22.0%
Cortex-A720 |          -33.4% |         -35.5%
Cortex-X1   |          -34.1% |         -19.7%
Cortex-X2   |          -32.1% |         -26.3%

Bug: libyuv:976
Change-Id: I08f6286bab0ebf5e24d5d5803f8c45ec6ba776ee
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5631541
Reviewed-by: Justin Green <greenjustin@google.com>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
2025-12-06 16:56:55 +08:00 · 2024-04-18 08:58:29 +01:00 · 2024-04-18 08:58:29 +01:00 · c1fe5663f5
commit c1fe5663f5
parent 5bac99fe09
2 changed files with 53 additions and 58 deletions
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -1311,8 +1311,12 @@ ANY11(ARGB1555ToYRow_Any_LSX, ARGB1555ToYRow_LSX, 0, 2, 1, 15)
 ANY11(ARGB1555ToYRow_Any_LASX, ARGB1555ToYRow_LASX, 0, 2, 1, 31)
 #endif
 #ifdef HAS_ARGB4444TOYROW_NEON
+#ifdef __aarch64__
+ANY11(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 0, 2, 1, 15)
+#else
 ANY11(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 0, 2, 1, 7)
 #endif
+#endif
 #ifdef HAS_YUY2TOYROW_NEON
 ANY11(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 1, 4, 1, 15)
 #endif
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -2136,12 +2136,13 @@ void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,

 #define ARGB4444TORGB                                    \
  /* Input: v0.8h = xxxxRRRRGGGGBBBB */                  \
-  "xtn        v1.8b, v0.8h        \n" /* GGGGBBBB */ \
+  "uzp1       v1.16b, v0.16b, v3.16b  \n" /* GGGGBBBB */ \
  "shrn       v2.8b, v0.8h, #4        \n" /* RRRRxxxx */ \
-  "shl        v0.8b, v1.8b, #4    \n" /* BBBB0000 */ \
-  "sri        v1.8b, v1.8b, #4    \n" /* GGGGGGGG */ \
-  "sri        v2.8b, v2.8b, #4    \n" /* RRRRRRRR */ \
-  "sri        v0.8b, v0.8b, #4    \n" /* BBBBBBBB */
+  "shl        v0.16b, v1.16b, #4      \n" /* BBBB0000 */ \
+  "shrn2      v2.16b, v3.8h, #4       \n" /* RRRRxxxx */ \
+  "sri        v1.16b, v1.16b, #4      \n" /* GGGGGGGG */ \
+  "sri        v2.16b, v2.16b, #4      \n" /* RRRRRRRR */ \
+  "sri        v0.16b, v0.16b, #4      \n" /* BBBBBBBB */

 void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
                            uint8_t* dst_argb,
@@ -3331,36 +3332,22 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
                          uint8_t* dst_v,
                          int width) {
  const uint8_t* src_argb4444_1 = src_argb4444 + src_stride_argb4444;
-  asm volatile (
+  asm volatile(
      RGBTOUV_SETUP_REG  // sets v20-v25
      "1:                                    \n"
-      "ld1         {v0.16b}, [%0], #16           \n"  // load 8 ARGB4444 pixels.
+      "ldp         q0, q3, [%0], #32         \n"  // load 16 ARGB4444 pixels.
      ARGB4444TORGB
-      "uaddlp      v16.4h, v0.8b                 \n"  // B 8 bytes -> 4 shorts.
+      "uaddlp      v16.8h, v0.16b            \n"  // B 16 bytes -> 8 shorts.
      "prfm        pldl1keep, [%0, 448]      \n"
-      "uaddlp      v17.4h, v1.8b                 \n"  // G 8 bytes -> 4 shorts.
-      "uaddlp      v18.4h, v2.8b                 \n"  // R 8 bytes -> 4 shorts.
-      "ld1         {v0.16b}, [%0], #16           \n"  // next 8 ARGB4444 pixels.
-      ARGB4444TORGB
-      "uaddlp      v26.4h, v0.8b                 \n"  // B 8 bytes -> 4 shorts.
-      "uaddlp      v27.4h, v1.8b                 \n"  // G 8 bytes -> 4 shorts.
-      "uaddlp      v28.4h, v2.8b                 \n"  // R 8 bytes -> 4 shorts.
+      "uaddlp      v17.8h, v1.16b            \n"  // G 16 bytes -> 8 shorts.
+      "uaddlp      v18.8h, v2.16b            \n"  // R 16 bytes -> 8 shorts.

-      "ld1         {v0.16b}, [%1], #16           \n"  // load 8 ARGB4444 pixels.
+      "ldp         q0, q3, [%1], #32         \n"  // load 16 ARGB4444 pixels.
      ARGB4444TORGB
-      "uadalp      v16.4h, v0.8b                 \n"  // B 8 bytes -> 4 shorts.
+      "uadalp      v16.8h, v0.16b            \n"  // B 16 bytes -> 8 shorts.
      "prfm        pldl1keep, [%1, 448]      \n"
-      "uadalp      v17.4h, v1.8b                 \n"  // G 8 bytes -> 4 shorts.
-      "uadalp      v18.4h, v2.8b                 \n"  // R 8 bytes -> 4 shorts.
-      "ld1         {v0.16b}, [%1], #16           \n"  // next 8 ARGB4444 pixels.
-      ARGB4444TORGB
-      "uadalp      v26.4h, v0.8b                 \n"  // B 8 bytes -> 4 shorts.
-      "uadalp      v27.4h, v1.8b                 \n"  // G 8 bytes -> 4 shorts.
-      "uadalp      v28.4h, v2.8b                 \n"  // R 8 bytes -> 4 shorts.
-
-      "ins         v16.D[1], v26.D[0]            \n"
-      "ins         v17.D[1], v27.D[0]            \n"
-      "ins         v18.D[1], v28.D[0]            \n"
+      "uadalp      v17.8h, v1.16b            \n"  // G 16 bytes -> 8 shorts.
+      "uadalp      v18.8h, v2.16b            \n"  // R 16 bytes -> 8 shorts.

      "urshr       v0.8h, v16.8h, #1         \n"  // 2x average
      "urshr       v1.8h, v17.8h, #1         \n"
@@ -3445,22 +3432,26 @@ void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
 void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
                         uint8_t* dst_y,
                         int width) {
-  asm volatile (
-      "movi        v24.8b, #25                   \n"  // B * 0.1016 coefficient
-      "movi        v25.8b, #129                  \n"  // G * 0.5078 coefficient
-      "movi        v26.8b, #66                   \n"  // R * 0.2578 coefficient
-      "movi        v27.8b, #16                   \n"  // Add 16 constant
+  asm volatile(
+      "movi        v24.16b, #25              \n"  // B * 0.1016 coefficient
+      "movi        v25.16b, #129             \n"  // G * 0.5078 coefficient
+      "movi        v26.16b, #66              \n"  // R * 0.2578 coefficient
+      "movi        v27.16b, #16              \n"  // Add 16 constant
      "1:                                    \n"
-      "ld1         {v0.16b}, [%0], #16           \n"  // load 8 ARGB4444 pixels.
-      "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
+      "ldp         q0, q3, [%0], #32         \n"  // load 16 ARGB4444 pixels.
+      "subs        %w2, %w2, #16             \n"  // 16 processed per loop.
      ARGB4444TORGB
-      "umull       v3.8h, v0.8b, v24.8b          \n"  // B
+      "umull       v16.8h, v0.8b, v24.8b     \n"  // B
+      "umull2      v17.8h, v0.16b, v24.16b   \n"  // B
      "prfm        pldl1keep, [%0, 448]      \n"
-      "umlal       v3.8h, v1.8b, v25.8b          \n"  // G
-      "umlal       v3.8h, v2.8b, v26.8b          \n"  // R
-      "uqrshrn     v0.8b, v3.8h, #8              \n"  // 16 bit to 8 bit Y
-      "uqadd       v0.8b, v0.8b, v27.8b          \n"
-      "st1         {v0.8b}, [%1], #8             \n"  // store 8 pixels Y.
+      "umlal       v16.8h, v1.8b, v25.8b     \n"  // G
+      "umlal2      v17.8h, v1.16b, v25.16b   \n"  // G
+      "umlal       v16.8h, v2.8b, v26.8b     \n"  // R
+      "umlal2      v17.8h, v2.16b, v26.16b   \n"  // R
+      "uqrshrn     v0.8b, v16.8h, #8         \n"  // 16 bit to 8 bit Y
+      "uqrshrn2    v0.16b, v17.8h, #8        \n"  // 16 bit to 8 bit Y
+      "uqadd       v0.16b, v0.16b, v27.16b   \n"
+      "str         q0, [%1], #16             \n"  // store 16 pixels Y.
      "b.gt        1b                        \n"
      : "+r"(src_argb4444),  // %0
        "+r"(dst_y),         // %1