[AArch64] Use full Neon vectors in RGB565To{ARGB,UV,Y}Row_NEON

The existing code only makes use of half of the vector lanes in the RGB565TOARGB macro. In the RGB565To{ARGB,Y} kernels we can load more data to allow using full vectors, adjusting the "any" kernel macros to match.

For the RGB565ToUVRow kernel we already have plenty of data but currently call the macro twice as much as needed, so refactor the code to only call it once but operating with full vectors instead.

Reduction in runtimes observed for selected micro-architectures:

            | RGB565ToARGBRow | RGB565ToUVRow | RGB565ToYRow
Cortex-A53  |          -35.2% |        -28.8% |       -31.1%
Cortex-A55  |          -32.5% |        -34.4% |       -42.9%
Cortex-A510 |          -21.6% |        -27.7% |       -47.2%
Cortex-A76  |           -0.9% |        -42.0% |       -21.4%
Cortex-A720 |          -28.6% |        -37.2% |       -26.1%
Cortex-X1   |           -3.2% |        -42.3% |       -23.4%

Bug: b/42280945
Change-Id: Ib1f68e5b87cc05a1485bbe96cfef87e6ac119fc3
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5790974
Reviewed-by: Justin Green <greenjustin@google.com>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
2026-02-16 23:29:52 +08:00 · 2024-04-23 20:28:02 +01:00 · 2024-04-23 20:28:02 +01:00 · 772f0fde1c
commit 772f0fde1c
parent 2dfb84b311
4 changed files with 55 additions and 64 deletions
--- a/source/convert.cc
+++ b/source/convert.cc
@ -3565,11 +3565,9 @@ int RGB565ToI420(const uint8_t* src_rgb565,
  if (TestCpuFlag(kCpuHasNEON)) {
    RGB565ToUVRow = RGB565ToUVRow_Any_NEON;
    RGB565ToYRow = RGB565ToYRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
+    if (IS_ALIGNED(width, 16)) {
      RGB565ToYRow = RGB565ToYRow_NEON;
-      if (IS_ALIGNED(width, 16)) {
+      RGB565ToUVRow = RGB565ToUVRow_NEON;
        RGB565ToUVRow = RGB565ToUVRow_NEON;
      }
    }
  }
 // MSA version does direct RGB565 to YUV.
--- a/source/convert_argb.cc
+++ b/source/convert_argb.cc
@ -3709,7 +3709,7 @@ int RGB565ToARGB(const uint8_t* src_rgb565,
 #if defined(HAS_RGB565TOARGBROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    RGB565ToARGBRow = RGB565ToARGBRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
+    if (IS_ALIGNED(width, 16)) {
      RGB565ToARGBRow = RGB565ToARGBRow_NEON;
    }
  }
--- a/source/row_any.cc
+++ b/source/row_any.cc
@ -1283,7 +1283,7 @@ ANY11(RAWToYJRow_Any_LSX, RAWToYJRow_LSX, 0, 3, 1, 15)
 ANY11(RAWToYJRow_Any_LASX, RAWToYJRow_LASX, 0, 3, 1, 31)
 #endif
 #ifdef HAS_RGB565TOYROW_NEON
-ANY11(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 0, 2, 1, 7)
+ANY11(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 0, 2, 1, 15)
 #endif
 #ifdef HAS_RGB565TOYROW_MSA
 ANY11(RGB565ToYRow_Any_MSA, RGB565ToYRow_MSA, 0, 2, 1, 15)
@ -1381,7 +1381,7 @@ ANY11(RAWToARGBRow_Any_LSX, RAWToARGBRow_LSX, 0, 3, 4, 15)
 ANY11(RAWToARGBRow_Any_LASX, RAWToARGBRow_LASX, 0, 3, 4, 31)
 #endif
 #ifdef HAS_RGB565TOARGBROW_NEON
-ANY11(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 0, 2, 4, 7)
+ANY11(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 0, 2, 4, 15)
 #endif
 #ifdef HAS_RGB565TOARGBROW_MSA
 ANY11(RGB565ToARGBRow_Any_MSA, RGB565ToARGBRow_MSA, 0, 2, 4, 15)
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@ -2053,26 +2053,28 @@ void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
 }
 #define RGB565TOARGB                                                      \
-  /* Input: v0.8h: RRRRRGGGGGGBBBBB */                                    \
+  /* Input: v0/v4.8h: RRRRRGGGGGGBBBBB */                                 \
  "shrn       v1.8b, v0.8h, #3               \n" /* G GGGGGGxx */         \
-  "shrn       v2.8b, v0.8h, #8               \n" /* R RRRRRxxx */         \
+  "shrn2      v1.16b, v4.8h, #3              \n" /* G GGGGGGxx */         \
-  "xtn        v0.8b, v0.8h                   \n" /* B xxxBBBBB */         \
+  "uzp2       v2.16b, v0.16b, v4.16b         \n" /* R RRRRRxxx */         \
-  "sri        v1.8b, v1.8b, #6               \n" /* G GGGGGGGG, fill 2 */ \
+  "uzp1       v0.16b, v0.16b, v4.16b         \n" /* B xxxBBBBB */         \
-  "shl        v0.8b, v0.8b, #3               \n" /* B BBBBB000 */         \
+  "sri        v1.16b, v1.16b, #6             \n" /* G GGGGGGGG, fill 2 */ \
-  "sri        v2.8b, v2.8b, #5               \n" /* R RRRRRRRR, fill 3 */ \
+  "shl        v0.16b, v0.16b, #3             \n" /* B BBBBB000 */         \
-  "sri        v0.8b, v0.8b, #5               \n" /* R BBBBBBBB, fill 3 */
+  "sri        v2.16b, v2.16b, #5             \n" /* R RRRRRRRR, fill 3 */ \
  "sri        v0.16b, v0.16b, #5             \n" /* R BBBBBBBB, fill 3 */
 void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
                          uint8_t* dst_argb,
                          int width) {
-  asm volatile (
+  asm volatile(
-      "movi        v3.8b, #255                   \n"  // Alpha
+      "movi    v3.16b, #255            \n"  // Alpha
-      "1:                                        \n"
+      "1:                              \n"
-      "ld1         {v0.16b}, [%0], #16           \n"  // load 8 RGB565 pixels.
+      "ldp     q0, q4, [%0], #32       \n"  // load 16 RGB565 pixels
-      "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
+      "subs    %w2, %w2, #16           \n"  // 16 processed per loop
-      "prfm        pldl1keep, [%0, 448]          \n" RGB565TOARGB
+      "prfm    pldl1keep, [%0, 448]    \n" RGB565TOARGB
-      "st4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB
+      "st4     {v0.16b,v1.16b,v2.16b,v3.16b}, [%1] \n"  // store 16 ARGB
-      "b.gt        1b                            \n"
+      "add     %1, %1, #64             \n"
      "b.gt    1b                      \n"
      : "+r"(src_rgb565),  // %0
        "+r"(dst_argb),    // %1
        "+r"(width)        // %2
@ -3281,46 +3283,32 @@ void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
                        uint8_t* dst_v,
                        int width) {
  const uint8_t* src_rgb565_1 = src_rgb565 + src_stride_rgb565;
-  asm volatile (
+  asm volatile(
      RGBTOUV_SETUP_REG
-      "1:                                        \n"
+      "1:                                   \n"
-      "ld1         {v0.16b}, [%0], #16           \n"  // load 8 RGB565 pixels.
+      "ldp         q0, q4, [%0], #32        \n"  // load 16 RGB565 pixels.
      RGB565TOARGB
-      "uaddlp      v16.4h, v0.8b                 \n"  // B 8 bytes -> 4 shorts.
+      "uaddlp      v16.8h, v0.16b           \n"  // B 16 bytes -> 8 shorts.
-      "prfm        pldl1keep, [%0, 448]          \n"
+      "prfm        pldl1keep, [%0, 448]     \n"
-      "uaddlp      v17.4h, v1.8b                 \n"  // G 8 bytes -> 4 shorts.
+      "uaddlp      v17.8h, v1.16b           \n"  // G 16 bytes -> 8 shorts.
-      "uaddlp      v18.4h, v2.8b                 \n"  // R 8 bytes -> 4 shorts.
+      "uaddlp      v18.8h, v2.16b           \n"  // R 16 bytes -> 8 shorts.
-      "ld1         {v0.16b}, [%0], #16           \n"  // next 8 RGB565 pixels.
+
      "ldp         q0, q4, [%1], #32        \n"  // load 16 RGB565 pixels.
      RGB565TOARGB
-      "uaddlp      v26.4h, v0.8b                 \n"  // B 8 bytes -> 4 shorts.
+      "uadalp      v16.8h, v0.16b           \n"  // B 16 bytes -> 8 shorts.
-      "uaddlp      v27.4h, v1.8b                 \n"  // G 8 bytes -> 4 shorts.
+      "prfm        pldl1keep, [%1, 448]     \n"
-      "uaddlp      v28.4h, v2.8b                 \n"  // R 8 bytes -> 4 shorts.
+      "uadalp      v17.8h, v1.16b           \n"  // G 16 bytes -> 8 shorts.
      "uadalp      v18.8h, v2.16b           \n"  // R 16 bytes -> 8 shorts.
-      "ld1         {v0.16b}, [%1], #16           \n"  // load 8 RGB565 pixels.
+      "urshr       v0.8h, v16.8h, #1        \n"  // 2x average
-      RGB565TOARGB
+      "urshr       v1.8h, v17.8h, #1        \n"
-      "uadalp      v16.4h, v0.8b                 \n"  // B 8 bytes -> 4 shorts.
+      "urshr       v2.8h, v18.8h, #1        \n"
      "prfm        pldl1keep, [%1, 448]          \n"
      "uadalp      v17.4h, v1.8b                 \n"  // G 8 bytes -> 4 shorts.
      "uadalp      v18.4h, v2.8b                 \n"  // R 8 bytes -> 4 shorts.
      "ld1         {v0.16b}, [%1], #16           \n"  // next 8 RGB565 pixels.
      RGB565TOARGB
      "uadalp      v26.4h, v0.8b                 \n"  // B 8 bytes -> 4 shorts.
      "uadalp      v27.4h, v1.8b                 \n"  // G 8 bytes -> 4 shorts.
      "uadalp      v28.4h, v2.8b                 \n"  // R 8 bytes -> 4 shorts.
-      "ins         v16.D[1], v26.D[0]            \n"
+      "subs        %w4, %w4, #16            \n"  // 16 processed per loop.
      "ins         v17.D[1], v27.D[0]            \n"
      "ins         v18.D[1], v28.D[0]            \n"
      "urshr       v0.8h, v16.8h, #1             \n"  // 2x average
      "urshr       v1.8h, v17.8h, #1             \n"
      "urshr       v2.8h, v18.8h, #1             \n"
      "subs        %w4, %w4, #16                 \n"  // 16 processed per loop.
      RGBTOUV(v0.8h, v1.8h, v2.8h)
-      "st1         {v0.8b}, [%2], #8             \n"  // store 8 pixels U.
+      "st1         {v0.8b}, [%2], #8        \n"  // store 8 pixels U.
-      "st1         {v1.8b}, [%3], #8             \n"  // store 8 pixels V.
+      "st1         {v1.8b}, [%3], #8        \n"  // store 8 pixels V.
-      "b.gt        1b                            \n"
+      "b.gt        1b                       \n"
      : "+r"(src_rgb565),    // %0
        "+r"(src_rgb565_1),  // %1
        "+r"(dst_u),           // %2
@ -3423,22 +3411,27 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
 }
 void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
-  asm volatile (
+  asm volatile(
-      "movi        v24.8b, #25                   \n"  // B * 0.1016 coefficient
+      "movi        v24.16b, #25                  \n"  // B * 0.1016 coefficient
-      "movi        v25.8b, #129                  \n"  // G * 0.5078 coefficient
+      "movi        v25.16b, #129                 \n"  // G * 0.5078 coefficient
-      "movi        v26.8b, #66                   \n"  // R * 0.2578 coefficient
+      "movi        v26.16b, #66                  \n"  // R * 0.2578 coefficient
-      "movi        v27.8b, #16                   \n"  // Add 16 constant
+      "movi        v27.16b, #16                  \n"  // Add 16 constant
      "1:                                        \n"
-      "ld1         {v0.16b}, [%0], #16           \n"  // load 8 RGB565 pixels.
+      "ldp         q0, q4, [%0], #32             \n"  // load 16 RGB565 pixels.
-      "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
+      "subs        %w2, %w2, #16                 \n"  // 16 processed per loop.
      RGB565TOARGB
      "umull       v3.8h, v0.8b, v24.8b          \n"  // B
      "umull2      v4.8h, v0.16b, v24.16b        \n"  // B
      "prfm        pldl1keep, [%0, 448]          \n"
      "umlal       v3.8h, v1.8b, v25.8b          \n"  // G
      "umlal2      v4.8h, v1.16b, v25.16b        \n"  // G
      "umlal       v3.8h, v2.8b, v26.8b          \n"  // R
      "umlal2      v4.8h, v2.16b, v26.16b        \n"  // R
      "uqrshrn     v0.8b, v3.8h, #8              \n"  // 16 bit to 8 bit Y
      "uqrshrn     v1.8b, v4.8h, #8              \n"  // 16 bit to 8 bit Y
      "uqadd       v0.8b, v0.8b, v27.8b          \n"
-      "st1         {v0.8b}, [%1], #8             \n"  // store 8 pixels Y.
+      "uqadd       v1.8b, v1.8b, v27.8b          \n"
      "stp         d0, d1, [%1], #16             \n"  // store 16 pixels Y.
      "b.gt        1b                            \n"
      : "+r"(src_rgb565),  // %0
        "+r"(dst_y),       // %1