[AArch64] Improve ARGBTOARGB4444 using SRI instructions

The existing sequence to convert from 8-bit ARGB to 4-bit ARGB4444 makes
use of a lot of shifts and bit-clears before ORR'ing the pairs together.
This is unnecessary since we can do the same with the SRI instruction,
so use that instead.

Reduction in runtime for selected kernels:

                Kernel | Cortex-A55 | Cortex-A76
ARGBToARGB4444Row_NEON |     -15.3% |     -16.6%
I422ToARGB4444Row_NEON |      -2.7% |     -11.9%

Bug: libyuv:976
Change-Id: I86cd86c7adf1105558787a679272179821f31a9d
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5383443
Reviewed-by: Justin Green <greenjustin@google.com>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
George Steed 2024-03-13 05:20:06 +00:00 committed by libyuv LUCI CQ
parent b265c311b7
commit 95b0a3326c

View File

@@ -365,15 +365,11 @@ void I422ToARGB1555Row_NEON(const uint8_t* src_y,
: "cc", "memory", YUVTORGB_REGS, "v19");
}
#define ARGBTOARGB4444 \
/* Input v16.8b<=B, v17.8b<=G, v18.8b<=R, v19.8b<=A, v23.8b<=0x0f */ \
"ushr v16.8b, v16.8b, #4 \n" /* B */ \
"bic v17.8b, v17.8b, v23.8b \n" /* G */ \
"ushr v18.8b, v18.8b, #4 \n" /* R */ \
"bic v19.8b, v19.8b, v23.8b \n" /* A */ \
"orr v0.8b, v16.8b, v17.8b \n" /* BG */ \
"orr v1.8b, v18.8b, v19.8b \n" /* RA */ \
"zip1 v0.16b, v0.16b, v1.16b \n" /* BGRA */
#define ARGBTOARGB4444 \
/* Input v16.8b<=B, v17.8b<=G, v18.8b<=R, v19.8b<=A */ \
"sri v17.8b, v16.8b, #4 \n" /* BG */ \
"sri v19.8b, v18.8b, #4 \n" /* RA */ \
"zip1 v0.16b, v17.16b, v19.16b \n" /* BGRA */
void I422ToARGB4444Row_NEON(const uint8_t* src_y,
const uint8_t* src_u,
@@ -383,8 +379,6 @@ void I422ToARGB4444Row_NEON(const uint8_t* src_y,
int width) {
asm volatile(
YUVTORGB_SETUP
"movi v23.16b, #0x0f \n" // bits to clear with
// vbic.
"1: \n" READYUV422 YUVTORGB
RGBTORGB8
"subs %w[width], %w[width], #8 \n"
@@ -400,7 +394,7 @@ void I422ToARGB4444Row_NEON(const uint8_t* src_y,
[width] "+r"(width) // %[width]
: [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
[kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
: "cc", "memory", YUVTORGB_REGS, "v19", "v23");
: "cc", "memory", YUVTORGB_REGS, "v19");
}
void I400ToARGBRow_NEON(const uint8_t* src_y,
@@ -2042,8 +2036,6 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
uint8_t* dst_argb4444,
int width) {
asm volatile(
"movi v23.16b, #0x0f \n" // bits to clear with
// vbic.
"1: \n"
"ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8
// pixels
@@ -2055,7 +2047,7 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
"+r"(dst_argb4444), // %1
"+r"(width) // %2
:
: "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v23");
: "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19");
}
#if LIBYUV_USE_ST2