From 7eb552c891d3f874f9b87f1860d4c3ba65cd2c5d Mon Sep 17 00:00:00 2001
From: George Steed <george.steed@arm.com>
Date: Mon, 16 Sep 2024 16:56:18 +0100
Subject: [PATCH] [AArch64] Avoid unnecessary MOVs in
 ScaleARGBRowDownEvenBox_NEON

The existing code uses three MOV instructions through a temporary
register to swap the high half of one vector register with the low half
of another; however, this can be done with a pair of ZIP instructions
instead.

Also use a pair of RSHRN instructions rather than RSHRN followed by
RSHRN2: the two narrowing shifts then write separate registers, which
allows them to execute in parallel on little cores.

Reduction in runtime observed compared to the existing Neon
implementation:

 Cortex-A55:  -8.3%
Cortex-A510: -20.6%
Cortex-A520: -16.6%
 Cortex-A76:  -6.8%
Cortex-A715:  -6.2%
Cortex-A720:  -6.2%
  Cortex-X1: -22.0%
  Cortex-X2: -18.7%
  Cortex-X3: -21.1%
  Cortex-X4: -25.8%
Cortex-X925: -21.9%

Change-Id: I87ae133be86c3c9f850d5848ec19d9b71ebda4d9
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5872801
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
---
 source/scale_neon64.cc | 22 ++++++++++------------
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/source/scale_neon64.cc b/source/scale_neon64.cc
index 15a1fe196..2ad0c8152 100644
--- a/source/scale_neon64.cc
+++ b/source/scale_neon64.cc
@@ -1284,7 +1284,7 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb,
                                   int src_stepx,
                                   uint8_t* dst_argb,
                                   int dst_width) {
-  asm volatile (
+  asm volatile(
       "add         %1, %1, %0                    \n"
       "1:                                        \n"
       "ld1         {v0.8b}, [%0], %4             \n"  // Read 4 2x2 -> 2x1
@@ -1300,26 +1300,24 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb,
       "uaddl       v4.8h, v4.8b, v5.8b           \n"
       "uaddl       v6.8h, v6.8b, v7.8b           \n"
       "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
-      "mov         v16.d[1], v0.d[1]             \n"  // ab_cd -> ac_bd
-      "mov         v0.d[1], v2.d[0]              \n"
-      "mov         v2.d[0], v16.d[1]             \n"
-      "mov         v16.d[1], v4.d[1]             \n"  // ef_gh -> eg_fh
-      "mov         v4.d[1], v6.d[0]              \n"
-      "mov         v6.d[0], v16.d[1]             \n"
+      "zip1        v1.2d, v0.2d, v2.2d           \n"
+      "zip2        v2.2d, v0.2d, v2.2d           \n"
+      "zip1        v5.2d, v4.2d, v6.2d           \n"
+      "zip2        v6.2d, v4.2d, v6.2d           \n"
       "prfm        pldl1keep, [%1, 448]          \n"
-      "add         v0.8h, v0.8h, v2.8h           \n"  // (a+b)_(c+d)
-      "add         v4.8h, v4.8h, v6.8h           \n"  // (e+f)_(g+h)
+      "add         v0.8h, v1.8h, v2.8h           \n"  // (a+b)_(c+d)
+      "add         v4.8h, v5.8h, v6.8h           \n"  // (e+f)_(g+h)
       "rshrn       v0.8b, v0.8h, #2              \n"  // first 2 pixels.
-      "rshrn2      v0.16b, v4.8h, #2             \n"  // next 2 pixels.
+      "rshrn       v1.8b, v4.8h, #2              \n"  // next 2 pixels.
       "subs        %w3, %w3, #4                  \n"  // 4 pixels per loop.
-      "st1         {v0.16b}, [%2], #16           \n"
+      "stp         d0, d1, [%2], #16             \n"
       "b.gt        1b                            \n"
       : "+r"(src_argb),                // %0
         "+r"(src_stride),              // %1
         "+r"(dst_argb),                // %2
         "+r"(dst_width)                // %3
       : "r"((int64_t)(src_stepx * 4))  // %4
-      : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
+      : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
 }
 
 // TODO(Yang Zhang): Investigate less load instructions for