From e348995a92f4de6d96e7f86beb78eca36ea8f9b3 Mon Sep 17 00:00:00 2001
From: George Steed <george.steed@arm.com>
Date: Fri, 22 Mar 2024 16:53:25 +0000
Subject: [PATCH] [AArch64] Optimize MergeXR30Row_10_NEON

By keeping intermediate data 16 bits wide we can process twice as many
pixels per iteration and use ST2 to store the final result. This appears
to be much better even on micro-architectures where ST2 is slightly
slower than ST1.

We save a couple of instructions by taking advantage of multiply-add
instructions to perform an effective shift-left and bitwise-or, since we
know the sets of nonzero bits are disjoint after the UMIN.

Reduction in runtime observed for MergeXR30Row_10_NEON:

 Cortex-A55: -34.2%
Cortex-A510: -35.6%
 Cortex-A76: -44.9%
  Cortex-X2: -48.3%

Bug: libyuv:976
Change-Id: I6e2627f9aa8e400ea82ff381ed587fcfc0d94648
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5509199
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
---
 source/row_any.cc    |  2 +-
 source/row_neon64.cc | 42 ++++++++++++++++++++++--------------------
 2 files changed, 23 insertions(+), 21 deletions(-)

diff --git a/source/row_any.cc b/source/row_any.cc
index e603d754b..459dad26a 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -528,7 +528,7 @@ ANY31PT(MergeXR30Row_10_Any_NEON,
         2,
         uint8_t,
         4,
-        3)
+        7)
 #endif
 
 #ifdef HAS_MERGEXR64ROW_AVX2
diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index 56aada737..572c082e2 100644
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -1195,32 +1195,34 @@ void MergeXR30Row_10_NEON(const uint16_t* src_r,
                           uint8_t* dst_ar30,
                           int /* depth */,
                           int width) {
+  // Neon has no "shift left and accumulate/orr", so use a multiply-add to
+  // perform the shift instead.
+  int limit = 1023;
   asm volatile(
-      "movi        v30.16b, #255                 \n"
-      "ushr        v30.4s, v30.4s, #22           \n"  // 1023
-      "1:                                        \n"
-      "ldr         d2, [%2], #8                  \n"  // B
-      "ldr         d1, [%1], #8                  \n"  // G
-      "ldr         d0, [%0], #8                  \n"  // R
-      "ushll       v2.4s, v2.4h, #0              \n"  // 000B
-      "ushll       v1.4s, v1.4h, #0              \n"  // G
-      "ushll       v0.4s, v0.4h, #0              \n"  // R
-      "umin        v2.4s, v2.4s, v30.4s          \n"
-      "umin        v1.4s, v1.4s, v30.4s          \n"
-      "umin        v0.4s, v0.4s, v30.4s          \n"
-      "sli         v2.4s, v1.4s, #10             \n"  // 00GB
-      "sli         v2.4s, v0.4s, #20             \n"  // 0RGB
-      "orr         v2.4s, #0xc0, lsl #24         \n"  // ARGB (AR30)
-      "subs        %w4, %w4, #4                  \n"
-      "str         q2, [%3], #16                 \n"
-      "b.gt        1b                            \n"
+      "dup    v5.8h, %w[limit]          \n"
+      "movi   v6.8h, #16                \n"  // 1 << 4
+      "movi   v7.8h, #4, lsl #8         \n"  // 1 << 10
+      "1:                               \n"
+      "ldr    q0, [%0], #16             \n"  // xxxxxxRrrrrrrrrr
+      "ldr    q1, [%1], #16             \n"  // xxxxxxGggggggggg
+      "ldr    q2, [%2], #16             \n"  // xxxxxxBbbbbbbbbb
+      "umin   v0.8h, v0.8h, v5.8h       \n"  // 000000Rrrrrrrrrr
+      "umin   v1.8h, v1.8h, v5.8h       \n"  // 000000Gggggggggg
+      "movi   v4.8h, #0xc0, lsl #8      \n"  // 1100000000000000
+      "umin   v3.8h, v2.8h, v5.8h       \n"  // 000000Bbbbbbbbbb
+      "mla    v4.8h, v0.8h, v6.8h       \n"  // 11Rrrrrrrrrr0000
+      "mla    v3.8h, v1.8h, v7.8h       \n"  // ggggggBbbbbbbbbb
+      "usra   v4.8h, v1.8h, #6          \n"  // 11RrrrrrrrrrGggg
+      "subs   %w4, %w4, #8              \n"
+      "st2    {v3.8h, v4.8h}, [%3], #32 \n"
+      "b.gt   1b                        \n"
       : "+r"(src_r),     // %0
         "+r"(src_g),     // %1
         "+r"(src_b),     // %2
         "+r"(dst_ar30),  // %3
         "+r"(width)      // %4
-      :
-      : "memory", "cc", "v0", "v1", "v2", "v30");
+      : [limit] "r"(limit)
+      : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
 }
 
 void MergeAR64Row_NEON(const uint16_t* src_r,