From d0c28db56cf032882d97a2b348edcbca91419068 Mon Sep 17 00:00:00 2001
From: George Steed <george.steed@arm.com>
Date: Thu, 21 Mar 2024 09:17:38 +0000
Subject: [PATCH] [AArch64] Optimize Merge{ARGB,XRGB}16To8Row_NEON

Rather than shifting the data into the low half of each lane and then
using a saturating narrowing operation, we can do the saturation as part
of a shift into the highest half of the lane and then use a simpler TRN2
instruction to extract pairs of high halves into full vectors. This also
has the nice advantage of allowing us to use ST2 rather than ST4 for
storing the result, since ST4 is known to be slow on some
micro-architectures.

Reduction in runtimes observed for the two kernels:

             | MergeARGB16To8Row_NEON | MergeXRGB16To8Row_NEON
  Cortex-A55 |                  -8.0% |                 -12.2%
 Cortex-A510 |                 -29.9% |                 -31.4%
  Cortex-A76 |                 -29.0% |                 -32.0%
   Cortex-X2 |                 -33.5% |                 -43.4%

Bug: libyuv:976
Change-Id: I9da3beedc27ab43527b3642aa6d4decf3b5b6683
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5509198
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Justin Green <greenjustin@google.com>
---
 source/row_neon64.cc | 55 +++++++++++++++++++++-----------------------
 1 file changed, 26 insertions(+), 29 deletions(-)

diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index 4259f425c..f01cc6172 100644
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -1293,33 +1293,31 @@ void MergeARGB16To8Row_NEON(const uint16_t* src_r,
                             uint8_t* dst_argb,
                             int depth,
                             int width) {
-  int shift = 8 - depth;
+  // Shift is 8 - depth, +8 so the result is in the top half of each lane.
+  int shift = 16 - depth;
   asm volatile(
-
       "dup         v31.8h, %w6                   \n"
       "1:                                        \n"
-      "ldr         q2, [%0], #16                 \n"  // R
+      "ldr         q0, [%0], #16                 \n"  // B
       "ldr         q1, [%1], #16                 \n"  // G
-      "ldr         q0, [%2], #16                 \n"  // B
+      "ldr         q2, [%2], #16                 \n"  // R
       "ldr         q3, [%3], #16                 \n"  // A
-      "ushl        v2.8h, v2.8h, v31.8h          \n"
+      "uqshl       v0.8h, v0.8h, v31.8h          \n"
       "prfm        pldl1keep, [%0, 448]          \n"
-      "ushl        v1.8h, v1.8h, v31.8h          \n"
+      "uqshl       v1.8h, v1.8h, v31.8h          \n"
       "prfm        pldl1keep, [%1, 448]          \n"
-      "ushl        v0.8h, v0.8h, v31.8h          \n"
+      "uqshl       v2.8h, v2.8h, v31.8h          \n"
       "prfm        pldl1keep, [%2, 448]          \n"
-      "ushl        v3.8h, v3.8h, v31.8h          \n"
+      "uqshl       v3.8h, v3.8h, v31.8h          \n"
       "prfm        pldl1keep, [%3, 448]          \n"
-      "uqxtn       v2.8b, v2.8h                  \n"
-      "uqxtn       v1.8b, v1.8h                  \n"
-      "uqxtn       v0.8b, v0.8h                  \n"
-      "uqxtn       v3.8b, v3.8h                  \n"
+      "trn2        v0.16b, v0.16b, v1.16b        \n"
+      "trn2        v1.16b, v2.16b, v3.16b        \n"
       "subs        %w5, %w5, #8                  \n"
-      "st4         {v0.8b, v1.8b, v2.8b, v3.8b}, [%4], #32 \n"
+      "st2         {v0.8h, v1.8h}, [%4], #32     \n"
       "b.gt        1b                            \n"
-      : "+r"(src_r),     // %0
+      : "+r"(src_b),     // %0
         "+r"(src_g),     // %1
-        "+r"(src_b),     // %2
+        "+r"(src_r),     // %2
         "+r"(src_a),     // %3
         "+r"(dst_argb),  // %4
         "+r"(width)      // %5
@@ -1333,30 +1331,29 @@ void MergeXRGB16To8Row_NEON(const uint16_t* src_r,
                             uint8_t* dst_argb,
                             int depth,
                             int width) {
-  int shift = 8 - depth;
+  // Shift is 8 - depth, +8 so the result is in the top half of each lane.
+  int shift = 16 - depth;
   asm volatile(
-
       "dup         v31.8h, %w5                   \n"
-      "movi        v3.8b, #0xff                  \n"  // A (0xff)
+      "movi        v3.16b, #0xff                 \n"  // A (0xff)
       "1:                                        \n"
-      "ldr         q2, [%0], #16                 \n"  // R
+      "ldr         q0, [%0], #16                 \n"  // B
       "ldr         q1, [%1], #16                 \n"  // G
-      "ldr         q0, [%2], #16                 \n"  // B
-      "ushl        v2.8h, v2.8h, v31.8h          \n"
+      "ldr         q2, [%2], #16                 \n"  // R
+      "uqshl       v0.8h, v0.8h, v31.8h          \n"
       "prfm        pldl1keep, [%0, 448]          \n"
-      "ushl        v1.8h, v1.8h, v31.8h          \n"
+      "uqshl       v1.8h, v1.8h, v31.8h          \n"
       "prfm        pldl1keep, [%1, 448]          \n"
-      "ushl        v0.8h, v0.8h, v31.8h          \n"
+      "uqshl       v2.8h, v2.8h, v31.8h          \n"
       "prfm        pldl1keep, [%2, 448]          \n"
-      "uqxtn       v2.8b, v2.8h                  \n"
-      "uqxtn       v1.8b, v1.8h                  \n"
-      "uqxtn       v0.8b, v0.8h                  \n"
+      "trn2        v0.16b, v0.16b, v1.16b        \n"
+      "trn2        v1.16b, v2.16b, v3.16b        \n"
       "subs        %w4, %w4, #8                  \n"
-      "st4         {v0.8b, v1.8b, v2.8b, v3.8b}, [%3], #32 \n"
+      "st2         {v0.8h, v1.8h}, [%3], #32     \n"
       "b.gt        1b                            \n"
-      : "+r"(src_r),     // %0
+      : "+r"(src_b),     // %0
         "+r"(src_g),     // %1
-        "+r"(src_b),     // %2
+        "+r"(src_r),     // %2
         "+r"(dst_argb),  // %3
         "+r"(width)      // %4
       : "r"(shift)       // %5