[AArch64] Optimize Merge{ARGB,XRGB}16To8Row_NEON

Rather than shifting the data into the low half of each lane and then using a saturating narrowing operation, we can do the saturation as part of a shift into the highest half of the lane and then use a simpler TRN2 instruction to extract pairs of high halves into full vectors. This also has the nice advantage of allowing us to use ST2 rather than ST4 for storing the result, since ST4 is known to be slow on some micro-architectures.

Reduction in runtimes observed for the two kernels:

             | MergeARGB16To8Row_NEON | MergeXRGB16To8Row_NEON
Cortex-A55   |                  -8.0% |                 -12.2%
Cortex-A510  |                 -29.9% |                 -31.4%
Cortex-A76   |                 -29.0% |                 -32.0%
Cortex-X2    |                 -33.5% |                 -43.4%

Bug: libyuv:976
Change-Id: I9da3beedc27ab43527b3642aa6d4decf3b5b6683
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5509198
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Justin Green <greenjustin@google.com>
2025-12-06 16:56:55 +08:00 · 2024-03-21 09:17:38 +00:00 · 2024-03-21 09:17:38 +00:00 · d0c28db56c
commit d0c28db56c
parent 4f7fd808b7
1 changed files with 26 additions and 29 deletions
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -1293,33 +1293,31 @@ void MergeARGB16To8Row_NEON(const uint16_t* src_r,
                            uint8_t* dst_argb,
                            int depth,
                            int width) {
-  int shift = 8 - depth;
+  // Shift is 8 - depth, +8 so the result is in the top half of each lane.
+  int shift = 16 - depth;
  asm volatile(
-
      "dup         v31.8h, %w6                   \n"
      "1:                                        \n"
-      "ldr         q2, [%0], #16                 \n"  // R
+      "ldr         q0, [%0], #16                 \n"  // B
      "ldr         q1, [%1], #16                 \n"  // G
-      "ldr         q0, [%2], #16                 \n"  // B
+      "ldr         q2, [%2], #16                 \n"  // R
      "ldr         q3, [%3], #16                 \n"  // A
-      "ushl        v2.8h, v2.8h, v31.8h          \n"
+      "uqshl       v0.8h, v0.8h, v31.8h          \n"
      "prfm        pldl1keep, [%0, 448]          \n"
-      "ushl        v1.8h, v1.8h, v31.8h          \n"
+      "uqshl       v1.8h, v1.8h, v31.8h          \n"
      "prfm        pldl1keep, [%1, 448]          \n"
-      "ushl        v0.8h, v0.8h, v31.8h          \n"
+      "uqshl       v2.8h, v2.8h, v31.8h          \n"
      "prfm        pldl1keep, [%2, 448]          \n"
-      "ushl        v3.8h, v3.8h, v31.8h          \n"
+      "uqshl       v3.8h, v3.8h, v31.8h          \n"
      "prfm        pldl1keep, [%3, 448]          \n"
-      "uqxtn       v2.8b, v2.8h                  \n"
-      "uqxtn       v1.8b, v1.8h                  \n"
-      "uqxtn       v0.8b, v0.8h                  \n"
-      "uqxtn       v3.8b, v3.8h                  \n"
+      "trn2        v0.16b, v0.16b, v1.16b        \n"
+      "trn2        v1.16b, v2.16b, v3.16b        \n"
      "subs        %w5, %w5, #8                  \n"
-      "st4         {v0.8b, v1.8b, v2.8b, v3.8b}, [%4], #32 \n"
+      "st2         {v0.8h, v1.8h}, [%4], #32     \n"
      "b.gt        1b                            \n"
-      : "+r"(src_r),     // %0
+      : "+r"(src_b),     // %0
        "+r"(src_g),     // %1
-        "+r"(src_b),     // %2
+        "+r"(src_r),     // %2
        "+r"(src_a),     // %3
        "+r"(dst_argb),  // %4
        "+r"(width)      // %5
@@ -1333,30 +1331,29 @@ void MergeXRGB16To8Row_NEON(const uint16_t* src_r,
                            uint8_t* dst_argb,
                            int depth,
                            int width) {
-  int shift = 8 - depth;
+  // Shift is 8 - depth, +8 so the result is in the top half of each lane.
+  int shift = 16 - depth;
  asm volatile(
-
      "dup         v31.8h, %w5                   \n"
-      "movi        v3.8b, #0xff                  \n"  // A (0xff)
+      "movi        v3.16b, #0xff                 \n"  // A (0xff)
      "1:                                        \n"
-      "ldr         q2, [%0], #16                 \n"  // R
+      "ldr         q0, [%0], #16                 \n"  // B
      "ldr         q1, [%1], #16                 \n"  // G
-      "ldr         q0, [%2], #16                 \n"  // B
-      "ushl        v2.8h, v2.8h, v31.8h          \n"
+      "ldr         q2, [%2], #16                 \n"  // R
+      "uqshl       v0.8h, v0.8h, v31.8h          \n"
      "prfm        pldl1keep, [%0, 448]          \n"
-      "ushl        v1.8h, v1.8h, v31.8h          \n"
+      "uqshl       v1.8h, v1.8h, v31.8h          \n"
      "prfm        pldl1keep, [%1, 448]          \n"
-      "ushl        v0.8h, v0.8h, v31.8h          \n"
+      "uqshl       v2.8h, v2.8h, v31.8h          \n"
      "prfm        pldl1keep, [%2, 448]          \n"
-      "uqxtn       v2.8b, v2.8h                  \n"
-      "uqxtn       v1.8b, v1.8h                  \n"
-      "uqxtn       v0.8b, v0.8h                  \n"
+      "trn2        v0.16b, v0.16b, v1.16b        \n"
+      "trn2        v1.16b, v2.16b, v3.16b        \n"
      "subs        %w4, %w4, #8                  \n"
-      "st4         {v0.8b, v1.8b, v2.8b, v3.8b}, [%3], #32 \n"
+      "st2         {v0.8h, v1.8h}, [%3], #32     \n"
      "b.gt        1b                            \n"
-      : "+r"(src_r),     // %0
+      : "+r"(src_b),     // %0
        "+r"(src_g),     // %1
-        "+r"(src_b),     // %2
+        "+r"(src_r),     // %2
        "+r"(dst_argb),  // %3
        "+r"(width)      // %4
      : "r"(shift)       // %5