From dff7bad43def35d09b6b1666b9645ec78c0cb0da Mon Sep 17 00:00:00 2001
From: George Steed <george.steed@arm.com>
Date: Sat, 23 Mar 2024 20:02:41 +0000
Subject: [PATCH] [AArch64] Use full Neon vectors in ARGB4444ToARGBRow_NEON

The existing Neon code narrows the input 16-bit packed data to 8-bit
elements and separates the color channels, causing us to only process
half a Neon vector per instruction when widening each channel from
4-bit color data to 8 bits.

We can note that the processing being done is identical for all color
channels and therefore we can keep them partially interleaved during the
widening step. This allows us to use full Neon vectors for the whole
loop body.

Reductions in runtimes observed for ARGB4444ToARGBRow_NEON:

 Cortex-A55: -30.7%
Cortex-A510: -44.3%
 Cortex-A76: -51.6%
  Cortex-X2: -54.2%

Bug: libyuv:976
Change-Id: I9d9cda7e16eb07619c6d7f1de2e6b8c0fb6d64cf
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5594389
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
---
 source/row_neon64.cc | 19 +++++++------------
 1 file changed, 7 insertions(+), 12 deletions(-)

diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index 41c386201..231ab9af3 100644
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -1911,16 +1911,11 @@ void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
   );
 }
 
-#define ARGB4444TOARGB                               \
-  /* Input: v0.8h = AAAARRRRGGGGBBBB */              \
-  "xtn        v1.8b, v0.8h        \n" /* GGGGBBBB */ \
-  "shrn       v2.8b, v0.8h, #4    \n" /* RRRRxxxx */ \
-  "shrn       v3.8b, v0.8h, #8    \n" /* AAAAxxxx */ \
-  "shl        v0.8b, v1.8b, #4    \n" /* BBBB0000 */ \
-  "sri        v1.8b, v1.8b, #4    \n" /* GGGGGGGG */ \
-  "sri        v2.8b, v2.8b, #4    \n" /* RRRRRRRR */ \
-  "sri        v3.8b, v3.8b, #4    \n" /* AAAAAAAA */ \
-  "sri        v0.8b, v0.8b, #4    \n" /* BBBBBBBB */
+#define ARGB4444TOARGB                                        \
+  /* Input: v1.8h = AAAARRRR_GGGGBBBB; out: v0=R_B, v1=A_G */ \
+  "shl        v0.16b, v1.16b, #4  \n" /* RRRR0000_BBBB0000 */ \
+  "sri        v1.16b, v1.16b, #4  \n" /* AAAAAAAA_GGGGGGGG */ \
+  "sri        v0.16b, v0.16b, #4  \n" /* RRRRRRRR_BBBBBBBB */
 
 #define ARGB4444TORGB                                \
   /* Input: v0.8h = xxxxRRRRGGGGBBBB */              \
@@ -1936,10 +1931,10 @@ void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
                             int width) {
   asm volatile(
       "1:                                        \n"
-      "ld1         {v0.16b}, [%0], #16           \n"  // load 8 ARGB4444 pixels.
+      "ld1         {v1.16b}, [%0], #16           \n"  // load 8 ARGB4444 pixels.
       "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
       "prfm        pldl1keep, [%0, 448]          \n" ARGB4444TOARGB
-      "st4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB
+      "st2         {v0.16b, v1.16b}, [%1], #32   \n"  // store 8 ARGB.
       "b.gt        1b                            \n"
       : "+r"(src_argb4444),  // %0
         "+r"(dst_argb),      // %1