From 56258c125b69d1564e3c75b1b1d6ee30a959cc22 Mon Sep 17 00:00:00 2001
From: George Steed <george.steed@arm.com>
Date: Tue, 19 Mar 2024 16:41:45 +0000
Subject: [PATCH] [AArch64] Avoid redundant shift around RGB565 conversion

The existing code performs a narrowing shift right (in RGBTORGB8)
followed by a widening left shift (in ARGBTORGB565). This is redundant
since we could have simply not performed a narrowing operation to begin
with and instead done a saturating left shift to saturate against the
top of the 16-bit lanes rather than the narrowed 8-bit lanes.

To enable this we introduce new RGBTORGB8_TOP and ARGBTORGB565_FROM_TOP
macros which produce and consume values from the high half of each
16-bit lane rather than a narrowed 8-bit intermediate.

Reduction in runtime for selected kernels:

                     | Cortex-A55 | Cortex-A510 | Cortex-A76 | Cortex-X2
I422ToRGB565Row_NEON |     -10.8% |       -6.1% |     -17.2% |    -23.6%
NV12ToRGB565Row_NEON |     -11.4% |       -4.9% |     -20.4% |    -17.4%

Bug: libyuv:976
Change-Id: I3337b8f41ff62a7af1b70a56b774239bdb55d0f1
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5509197
Reviewed-by: Justin Green <greenjustin@google.com>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
---
 source/row_neon64.cc | 39 ++++++++++++++++++++++++++++++---------
 1 file changed, 30 insertions(+), 9 deletions(-)

diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index dcf901773..56aada737 100644
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -147,6 +147,13 @@ static const uvec8 kNV21InterleavedTable = {1, 1, 5, 5, 9,  9,  13, 13,
   "uqshrn     v16.8b, v16.8h, #6             \n" \
   "uqshrn     v18.8b, v18.8h, #6             \n"
 
+// Convert from 2.14 fixed point RGB to 8 bit RGB, placing the results in the
+// top half of each lane.
+#define RGBTORGB8_TOP                            \
+  "uqshl      v17.8h, v17.8h, #2             \n" \
+  "uqshl      v16.8h, v16.8h, #2             \n" \
+  "uqshl      v18.8h, v18.8h, #2             \n"
+
 #define YUVTORGB_REGS                                                         \
   "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v24", \
       "v25", "v26", "v27", "v28", "v29", "v30", "v31"
@@ -321,12 +328,24 @@ void I422ToRGB24Row_NEON(const uint8_t* src_y,
       : "cc", "memory", YUVTORGB_REGS);
 }
 
-#define ARGBTORGB565                                                        \
-  "shll       v18.8h, v18.8b, #8             \n" /* R                    */ \
-  "shll       v17.8h, v17.8b, #8             \n" /* G                    */ \
-  "shll       v16.8h, v16.8b, #8             \n" /* B                    */ \
-  "sri        v18.8h, v17.8h, #5             \n" /* RG                   */ \
-  "sri        v18.8h, v16.8h, #11            \n" /* RGB                  */
+#define ARGBTORGB565                                                \
+  /* Inputs:                                                        \
+   * v16: bbbbbxxx                                                  \
+   * v17: ggggggxx                                                  \
+   * v18: rrrrrxxx */                                               \
+  "shll       v18.8h, v18.8b, #8     \n" /* rrrrrrxx00000000     */ \
+  "shll       v17.8h, v17.8b, #8     \n" /* gggggxxx00000000     */ \
+  "shll       v16.8h, v16.8b, #8     \n" /* bbbbbbxx00000000     */ \
+  "sri        v18.8h, v17.8h, #5     \n" /* rrrrrgggggg00000     */ \
+  "sri        v18.8h, v16.8h, #11    \n" /* rrrrrggggggbbbbb     */
+
+#define ARGBTORGB565_FROM_TOP                                       \
+  /* Inputs:                                                        \
+   * v16: bbbbbxxxxxxxxxxx                                          \
+   * v17: ggggggxxxxxxxxxx                                          \
+   * v18: rrrrrxxxxxxxxxxx */                                       \
+  "sri        v18.8h, v17.8h, #5     \n" /* rrrrrgggggg00000     */ \
+  "sri        v18.8h, v16.8h, #11    \n" /* rrrrrggggggbbbbb     */
 
 void I422ToRGB565Row_NEON(const uint8_t* src_y,
                           const uint8_t* src_u,
@@ -337,7 +356,8 @@ void I422ToRGB565Row_NEON(const uint8_t* src_y,
   asm volatile(
       YUVTORGB_SETUP
       "1:                                        \n" READYUV422 I4XXTORGB
-          RGBTORGB8 "subs        %w[width], %w[width], #8      \n" ARGBTORGB565
+          RGBTORGB8_TOP
+      "subs        %w[width], %w[width], #8      \n" ARGBTORGB565_FROM_TOP
       "st1         {v18.8h}, [%[dst_rgb565]], #16 \n"  // store 8 pixels RGB565.
       "b.gt        1b                            \n"
       : [src_y] "+r"(src_y),                               // %[src_y]
@@ -579,8 +599,9 @@ void NV12ToRGB565Row_NEON(const uint8_t* src_y,
   asm volatile(
       YUVTORGB_SETUP
       "ldr         q2, [%[kNV12Table]]           \n"
-      "1:                                        \n" READNV12 NVTORGB RGBTORGB8
-      "subs        %w[width], %w[width], #8      \n" ARGBTORGB565
+      "1:                                        \n" READNV12 NVTORGB
+          RGBTORGB8_TOP
+      "subs        %w[width], %w[width], #8      \n" ARGBTORGB565_FROM_TOP
       "st1         {v18.8h}, [%[dst_rgb565]], #16 \n"  // store 8
                                                        // pixels
                                                        // RGB565.