[AArch64] Avoid redundant shift around RGB565 conversion

The existing code performs a narrowing shift right (in RGBTORGB8) followed by a widening left shift (in ARGBTORGB565). This is redundant since we could have simply not performed a narrowing operation to begin with and instead done a saturating left shift to saturate against the top of the 16-bit lanes rather than the narrowed 8-bit lanes. To enable this we introduce new RGBTORGB8_TOP and ARGBTORGB565_FROM_TOP macros which produce and consume values from the high half of each 16-bit lane rather than a narrowed 8-bit intermediate. Reduction in runtime for selected kernels: | Cortex-A55 | Cortex-A510 | Cortex-A76 | Cortex-X2 I422ToRGB565Row_NEON | -10.8% | -6.1% | -17.2% | -23.6% NV12ToRGB565Row_NEON | -11.4% | -4.9% | -20.4% | -17.4% Bug: libyuv:976 Change-Id: I3337b8f41ff62a7af1b70a56b774239bdb55d0f1 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5509197 Reviewed-by: Justin Green <greenjustin@google.com> Reviewed-by: Frank Barchard <fbarchard@chromium.org>
2026-02-06 17:59:50 +08:00 · 2024-03-19 16:41:45 +00:00 · 2024-03-19 16:41:45 +00:00 · 56258c125b
commit 56258c125b
parent c5f9583b1c
1 changed files with 30 additions and 9 deletions
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@ -147,6 +147,13 @@ static const uvec8 kNV21InterleavedTable = {1, 1, 5, 5, 9,  9,  13, 13,
  "uqshrn     v16.8b, v16.8h, #6             \n" \
  "uqshrn     v18.8b, v18.8h, #6             \n"

+// Convert from 2.14 fixed point RGB to 8 bit RGB, placing the results in the
+// top half of each lane.
+#define RGBTORGB8_TOP                            \
+  "uqshl      v17.8h, v17.8h, #2             \n" \
+  "uqshl      v16.8h, v16.8h, #2             \n" \
+  "uqshl      v18.8h, v18.8h, #2             \n"
+
 #define YUVTORGB_REGS                                                         \
  "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v24", \
      "v25", "v26", "v27", "v28", "v29", "v30", "v31"
@ -321,12 +328,24 @@ void I422ToRGB24Row_NEON(const uint8_t* src_y,
      : "cc", "memory", YUVTORGB_REGS);
 }

-#define ARGBTORGB565                                                        \
-  "shll       v18.8h, v18.8b, #8             \n" /* R                    */ \
-  "shll       v17.8h, v17.8b, #8             \n" /* G                    */ \
-  "shll       v16.8h, v16.8b, #8             \n" /* B                    */ \
-  "sri        v18.8h, v17.8h, #5             \n" /* RG                   */ \
-  "sri        v18.8h, v16.8h, #11            \n" /* RGB                  */
+#define ARGBTORGB565                                                \
+  /* Inputs:                                                        \
+   * v16: bbbbbxxx                                                  \
+   * v17: ggggggxx                                                  \
+   * v18: rrrrrxxx */                                               \
+  "shll       v18.8h, v18.8b, #8     \n" /* rrrrrrxx00000000     */ \
+  "shll       v17.8h, v17.8b, #8     \n" /* gggggxxx00000000     */ \
+  "shll       v16.8h, v16.8b, #8     \n" /* bbbbbbxx00000000     */ \
+  "sri        v18.8h, v17.8h, #5     \n" /* rrrrrgggggg00000     */ \
+  "sri        v18.8h, v16.8h, #11    \n" /* rrrrrggggggbbbbb     */
+
+#define ARGBTORGB565_FROM_TOP                                       \
+  /* Inputs:                                                        \
+   * v16: bbbbbxxxxxxxxxxx                                          \
+   * v17: ggggggxxxxxxxxxx                                          \
+   * v18: rrrrrxxxxxxxxxxx */                                       \
+  "sri        v18.8h, v17.8h, #5     \n" /* rrrrrgggggg00000     */ \
+  "sri        v18.8h, v16.8h, #11    \n" /* rrrrrggggggbbbbb     */

 void I422ToRGB565Row_NEON(const uint8_t* src_y,
                          const uint8_t* src_u,
@ -337,7 +356,8 @@ void I422ToRGB565Row_NEON(const uint8_t* src_y,
  asm volatile(
      YUVTORGB_SETUP
      "1:                                        \n" READYUV422 I4XXTORGB
-          RGBTORGB8 "subs        %w[width], %w[width], #8      \n" ARGBTORGB565
+          RGBTORGB8_TOP
+      "subs        %w[width], %w[width], #8      \n" ARGBTORGB565_FROM_TOP
      "st1         {v18.8h}, [%[dst_rgb565]], #16 \n"  // store 8 pixels RGB565.
      "b.gt        1b                            \n"
      : [src_y] "+r"(src_y),                               // %[src_y]
@ -579,8 +599,9 @@ void NV12ToRGB565Row_NEON(const uint8_t* src_y,
  asm volatile(
      YUVTORGB_SETUP
      "ldr         q2, [%[kNV12Table]]           \n"
-      "1:                                        \n" READNV12 NVTORGB RGBTORGB8
-      "subs        %w[width], %w[width], #8      \n" ARGBTORGB565
+      "1:                                        \n" READNV12 NVTORGB
+          RGBTORGB8_TOP
+      "subs        %w[width], %w[width], #8      \n" ARGBTORGB565_FROM_TOP
      "st1         {v18.8h}, [%[dst_rgb565]], #16 \n"  // store 8
                                                       // pixels
                                                       // RGB565.