[AArch64] Use full Neon vectors in ARGB4444ToARGBRow_NEON

The existing Neon code narrows the input 16-bit packed data to 8-bit
elements and separates the color channels, causing us to only process
half a Neon vector per instruction for the channel widening from 4-bit
color data to 8 bits.

We can note that the processing being done is identical for all color
channels and therefore we can keep them partially interleaved during the
widening step. This allows us to use full Neon vectors for the whole
loop body.

Reductions in runtimes observed for ARGB4444ToARGBRow_NEON:

 Cortex-A55: -30.7%
Cortex-A510: -44.3%
 Cortex-A76: -51.6%
  Cortex-X2: -54.2%

Bug: libyuv:976
Change-Id: I9d9cda7e16eb07619c6d7f1de2e6b8c0fb6d64cf
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5594389
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
2026-01-01 03:12:16 +08:00 · 2024-03-23 20:02:41 +00:00 · 2024-03-23 20:02:41 +00:00 · dff7bad43d
commit dff7bad43d
parent 7633c818ec
1 changed files with 7 additions and 12 deletions
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -1911,16 +1911,11 @@ void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
  );
 }

-#define ARGB4444TOARGB                               \
-  /* Input: v0.8h = AAAARRRRGGGGBBBB */              \
-  "xtn        v1.8b, v0.8h        \n" /* GGGGBBBB */ \
-  "shrn       v2.8b, v0.8h, #4    \n" /* RRRRxxxx */ \
-  "shrn       v3.8b, v0.8h, #8    \n" /* AAAAxxxx */ \
-  "shl        v0.8b, v1.8b, #4    \n" /* BBBB0000 */ \
-  "sri        v1.8b, v1.8b, #4    \n" /* GGGGGGGG */ \
-  "sri        v2.8b, v2.8b, #4    \n" /* RRRRRRRR */ \
-  "sri        v3.8b, v3.8b, #4    \n" /* AAAAAAAA */ \
-  "sri        v0.8b, v0.8b, #4    \n" /* BBBBBBBB */
+#define ARGB4444TOARGB                                        \
+  /* Input: v1.8h = AAAARRRR_GGGGBBBB */                      \
+  "shl        v0.16b, v1.16b, #4  \n" /* RRRR0000_BBBB0000 */ \
+  "sri        v1.16b, v1.16b, #4  \n" /* AAAAAAAA_GGGGGGGG */ \
+  "sri        v0.16b, v0.16b, #4  \n" /* RRRRRRRR_BBBBBBBB */

 #define ARGB4444TORGB                                \
  /* Input: v0.8h = xxxxRRRRGGGGBBBB */              \
@@ -1936,10 +1931,10 @@ void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
                            int width) {
  asm volatile(
      "1:                                        \n"
-      "ld1         {v0.16b}, [%0], #16           \n"  // load 8 ARGB4444 pixels.
+      "ld1         {v1.16b}, [%0], #16           \n"  // load 8 ARGB4444 pixels.
      "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
      "prfm        pldl1keep, [%0, 448]          \n" ARGB4444TOARGB
-      "st4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB
+      "st2         {v0.16b, v1.16b}, [%1], #32   \n"  // store 8 ARGB.
      "b.gt        1b                            \n"
      : "+r"(src_argb4444),  // %0
        "+r"(dst_argb),      // %1