From d5303f4f779a6baf66bbac8a97937e507ffaaaba Mon Sep 17 00:00:00 2001
From: George Steed <george.steed@arm.com>
Date: Mon, 22 Apr 2024 10:03:28 +0100
Subject: [PATCH] [AArch64] Unroll ARGB1555ToARGBRow_NEON to use full Neon
 vectors

Processing more data per loop iteration means that we can use the full
128-bit Neon vectors and also allows us to use e.g. UZP1 to perform XTN
+ XTN2 in a single instruction.

The early Cortex-X cores are not a fan of ST4 .16b with a
post-increment, so split out the pointer increment to a separate
instruction to avoid this bottleneck.

Reductions in runtime observed for ARGB1555ToARGBRow_NEON:

 Cortex-A55: -18.1%
Cortex-A510: -11.2%
Cortex-A520: -39.5%
 Cortex-A76: -18.0%
Cortex-A715: -34.8%
Cortex-A720: -34.8%
  Cortex-X1:  -0.9%
  Cortex-X2:  -4.6%
  Cortex-X3:  -3.6%
  Cortex-X4: -20.8%

Bug: libyuv:976
Change-Id: Iae2ac24ffdbc718cd1e05bb77191f8d1df3fcf6f
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5790975
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Justin Green <greenjustin@google.com>
---
 source/convert_argb.cc |  2 +-
 source/row_any.cc      |  2 +-
 source/row_neon64.cc   | 39 +++++++++++++++++++++------------------
 3 files changed, 23 insertions(+), 20 deletions(-)

diff --git a/source/convert_argb.cc b/source/convert_argb.cc
index dd94c1b77..a77358df8 100644
--- a/source/convert_argb.cc
+++ b/source/convert_argb.cc
@@ -3792,7 +3792,7 @@ int ARGB1555ToARGB(const uint8_t* src_argb1555,
 #if defined(HAS_ARGB1555TOARGBROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
+    if (IS_ALIGNED(width, 16)) {
       ARGB1555ToARGBRow = ARGB1555ToARGBRow_NEON;
     }
   }
diff --git a/source/row_any.cc b/source/row_any.cc
index b9eaa0a3c..d2f8a5419 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -1393,7 +1393,7 @@ ANY11(RGB565ToARGBRow_Any_LSX, RGB565ToARGBRow_LSX, 0, 2, 4, 15)
 ANY11(RGB565ToARGBRow_Any_LASX, RGB565ToARGBRow_LASX, 0, 2, 4, 31)
 #endif
 #ifdef HAS_ARGB1555TOARGBROW_NEON
-ANY11(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 0, 2, 4, 7)
+ANY11(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 0, 2, 4, 15)
 #endif
 #ifdef HAS_ARGB1555TOARGBROW_MSA
 ANY11(ARGB1555ToARGBRow_Any_MSA, ARGB1555ToARGBRow_MSA, 0, 2, 4, 15)
diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index 6d50b2760..7ad54b430 100644
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -2083,17 +2083,19 @@ void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
   );
 }
 
-#define ARGB1555TOARGB                             \
-  /* Input: ARRRRRGGGGGBBBBB */                    \
-  "xtn        v29.8b, v0.8h     \n" /* xxxBBBBB */ \
-  "shrn       v3.8b, v0.8h, #8  \n" /* Axxxxxxx */ \
-  "shrn       v2.8b, v0.8h, #7  \n" /* RRRRRxxx */ \
-  "shrn       v1.8b, v0.8h, #2  \n" /* GGGGGxxx */ \
-  "shl        v0.8b, v29.8b, #3 \n" /* BBBBB000 */ \
-  "sshr       v3.8b, v3.8b, #7  \n" /* AAAAAAAA */ \
-  "sri        v2.8b, v2.8b, #5  \n" /* RRRRRRRR */ \
-  "sri        v1.8b, v1.8b, #5  \n" /* GGGGGGGG */ \
-  "sri        v0.8b, v0.8b, #5  \n" /* BBBBBBBB */
+#define ARGB1555TOARGB                                   \
+  /* Input: ARRRRRGGGGGBBBBB */                          \
+  "shrn       v2.8b, v0.8h, #7        \n" /* RRRRRxxx */ \
+  "uzp1       v29.16b, v0.16b, v4.16b \n" /* xxxBBBBB */ \
+  "shrn       v1.8b, v0.8h, #2        \n" /* GGGGGxxx */ \
+  "uzp2       v3.16b, v0.16b, v4.16b  \n" /* Axxxxxxx */ \
+  "shrn2      v2.16b, v4.8h, #7       \n" /* RRRRRxxx */ \
+  "shl        v0.16b, v29.16b, #3     \n" /* BBBBB000 */ \
+  "shrn2      v1.16b, v4.8h, #2       \n" /* GGGGGxxx */ \
+  "sshr       v3.16b, v3.16b, #7      \n" /* AAAAAAAA */ \
+  "sri        v2.16b, v2.16b, #5      \n" /* RRRRRRRR */ \
+  "sri        v1.16b, v1.16b, #5      \n" /* GGGGGGGG */ \
+  "sri        v0.16b, v0.16b, #5      \n" /* BBBBBBBB */
 
 // RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha.
 #define RGB555TOARGB                                         \
@@ -2112,14 +2114,15 @@ void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
 void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
                             uint8_t* dst_argb,
                             int width) {
-  asm volatile (
-      "1:                                        \n"
-      "ld1         {v0.16b}, [%0], #16           \n"  // load 8 ARGB1555 pixels.
-      "prfm        pldl1keep, [%0, 448]          \n"
-      "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
+  asm volatile(
+      "1:                            \n"
+      "ldp   q0, q4, [%0], #32       \n"  // load 16 ARGB1555 pixels
+      "prfm  pldl1keep, [%0, 448]    \n"
+      "subs  %w2, %w2, #16           \n"  // 16 processed per loop
       ARGB1555TOARGB
-      "st4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB
-      "b.gt        1b                            \n"
+      "st4   {v0.16b,v1.16b,v2.16b,v3.16b}, [%1] \n"  // store 16 ARGB
+      "add   %1, %1, #64             \n"
+      "b.gt  1b                      \n"
       : "+r"(src_argb1555),  // %0
         "+r"(dst_argb),      // %1
         "+r"(width)          // %2