From e6469913473c4afdd8abc95fbecd3ed228002834 Mon Sep 17 00:00:00 2001
From: George Steed <george.steed@arm.com>
Date: Sun, 24 Mar 2024 06:17:55 +0000
Subject: [PATCH] [AArch64] Use LD1/ST1 rather than LD4/ST4 in ARGBAddRow_NEON

There is no need to de-interleave channels here since we are applying
the same operation across all lanes. LD4 and ST4 are known to be
significantly slower than LD1/ST1 on some micro-architectures so we
should prefer to avoid them where possible.

Reduction in runtimes observed for ARGBAddRow_NEON:

 Cortex-A55: -15.0%
Cortex-A510: -59.8%
 Cortex-A76: -54.4%
  Cortex-X2: -70.4%

Change-Id: Id04e5259d8e5e7511dad5df85cdf9759b392cb99
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5454831
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
---
 source/row_neon64.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index e06f65e56..7f25a34c5 100644
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -3810,8 +3810,8 @@ void ARGBAddRow_NEON(const uint8_t* src_argb,
   asm volatile(
       // 8 pixel loop.
       "1:                                        \n"
-      "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
-      "ld4         {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more
+      "ld1         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
+      "ld1         {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more
       "subs        %w3, %w3, #8                  \n"  // 8 processed per loop.
       "uqadd       v0.8b, v0.8b, v4.8b           \n"
       "prfm        pldl1keep, [%0, 448]          \n"
@@ -3819,7 +3819,7 @@ void ARGBAddRow_NEON(const uint8_t* src_argb,
       "prfm        pldl1keep, [%1, 448]          \n"
       "uqadd       v2.8b, v2.8b, v6.8b           \n"
       "uqadd       v3.8b, v3.8b, v7.8b           \n"
-      "st4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB
+      "st1         {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB
       "b.gt        1b                            \n"
       : "+r"(src_argb),   // %0
         "+r"(src_argb1),  // %1