[AArch64] Avoid unnecessary widening in I422ToARGB1555Row_NEON

The existing code first widens the component vectors from 8-bit to 16-bit elements to construct the final ARGB1555 result. This is unnecessary, since the inputs to the widening are themselves the result of having just been narrowed in the RGBTORGB8 macro. By making use of the new RGBTORGB8_TOP macro we can get rid of both the widening and the prior narrowing step. Also remove volatile from the asm, since it is unnecessary. Reduction in runtime observed for I422ToARGB1555Row_NEON: Cortex-A55: -7.8%, Cortex-A76: -15.0%, Cortex-A720: -20.3%, Cortex-X1: -20.2%, Cortex-X2: -20.3%. Bug: libyuv:976 Change-Id: Id031c5d4d788828297adcc2fe2c2cd8d99b45433 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5616050 Reviewed-by: Frank Barchard <fbarchard@chromium.org>
2026-01-01 03:12:16 +08:00 · 2024-04-19 15:06:35 +01:00 · 2024-04-19 15:06:35 +01:00 · 89cf221baa
commit 89cf221baa
parent e6c4b9ad2e
1 changed files with 26 additions and 17 deletions
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -566,14 +566,24 @@ void I422ToRGB565Row_NEON(const uint8_t* src_y,
      : "cc", "memory", YUVTORGB_REGS);
 }

-#define ARGBTOARGB1555                                                      \
-  "shll       v0.8h,  v19.8b, #8             \n" /* A                    */ \
-  "shll       v18.8h, v18.8b, #8             \n" /* R                    */ \
-  "shll       v17.8h, v17.8b, #8             \n" /* G                    */ \
-  "shll       v16.8h, v16.8b, #8             \n" /* B                    */ \
-  "sri        v0.8h,  v18.8h, #1             \n" /* AR                   */ \
-  "sri        v0.8h,  v17.8h, #6             \n" /* ARG                  */ \
-  "sri        v0.8h,  v16.8h, #11            \n" /* ARGB                 */
+#define ARGBTOARGB1555                                                  \
+  /* Inputs:                                                            \
+   * v16: bbbbbxxx  v17: gggggxxx  v18: rrrrrxxx  v19: axxxxxxx */      \
+  "shll       v0.8h,  v19.8b, #8             \n" /* axxxxxxx00000000 */ \
+  "shll       v18.8h, v18.8b, #8             \n" /* rrrrrxxx00000000 */ \
+  "shll       v17.8h, v17.8b, #8             \n" /* gggggxxx00000000 */ \
+  "shll       v16.8h, v16.8b, #8             \n" /* bbbbbxxx00000000 */ \
+  "sri        v0.8h,  v18.8h, #1             \n" /* arrrrrxxx0000000 */ \
+  "sri        v0.8h,  v17.8h, #6             \n" /* arrrrrgggggxxx00 */ \
+  "sri        v0.8h,  v16.8h, #11            \n" /* arrrrrgggggbbbbb */
+
+#define ARGBTOARGB1555_FROM_TOP                                         \
+  /* Inputs:                                                            \
+   * v16: bbbbbxxxxxxxxxxx  v17: gggggxxxxxxxxxxx                       \
+   * v18: rrrrrxxxxxxxxxxx  v19: axxxxxxxxxxxxxxx */                    \
+  "sri        v19.8h,  v18.8h, #1            \n" /* arrrrrxxxxxxxxxx */ \
+  "sri        v19.8h,  v17.8h, #6            \n" /* arrrrrgggggxxxxx */ \
+  "sri        v19.8h,  v16.8h, #11           \n" /* arrrrrgggggbbbbb */

 void I422ToARGB1555Row_NEON(const uint8_t* src_y,
                            const uint8_t* src_u,
@@ -581,15 +591,14 @@ void I422ToARGB1555Row_NEON(const uint8_t* src_y,
                            uint8_t* dst_argb1555,
                            const struct YuvConstants* yuvconstants,
                            int width) {
-  asm volatile(
-      YUVTORGB_SETUP
-      "movi        v19.8b, #255                  \n"
-      "1:                                        \n" READYUV422 I4XXTORGB
-          RGBTORGB8
-      "subs        %w[width], %w[width], #8      \n" ARGBTOARGB1555
-      "st1         {v0.8h}, [%[dst_argb1555]], #16 \n"  // store 8 pixels
-                                                        // RGB565.
-      "b.gt        1b                            \n"
+  asm(YUVTORGB_SETUP
+      "movi    v19.8h, #0x80, lsl #8             \n"
+      "1:                                        \n"  //
+      READYUV422 I4XXTORGB RGBTORGB8_TOP
+      "subs    %w[width], %w[width], #8          \n"  //
+      ARGBTOARGB1555_FROM_TOP
+      "st1     {v19.8h}, [%[dst_argb1555]], #16  \n"  // store 8 pixels RGB1555.
+      "b.gt    1b                                \n"
      : [src_y] "+r"(src_y),                               // %[src_y]
        [src_u] "+r"(src_u),                               // %[src_u]
        [src_v] "+r"(src_v),                               // %[src_v]