From d0c28db56cf032882d97a2b348edcbca91419068 Mon Sep 17 00:00:00 2001 From: George Steed Date: Thu, 21 Mar 2024 09:17:38 +0000 Subject: [PATCH] [AArch64] Optimize Merge{ARGB,XRGB}16To8Row_NEON Rather than shifting the data into the low half of each lane and then using a saturating narrowing operation, we can do the saturation as part of a shift into the highest half of the lane and then use a simpler TRN2 instruction to extract pairs of high halves into full vectors. This also has the nice advantage of allowing us to use ST2 rather than ST4 for storing the result, since ST4 is known to be slow on some micro-architectures. Reduction in runtimes observed for the two kernels: | MergeARGB16To8Row_NEON | MergeXRGB16To8Row_NEON Cortex-A55 | -8.0% | -12.2% Cortex-A510 | -29.9% | -31.4% Cortex-A76 | -29.0% | -32.0% Cortex-X2 | -33.5% | -43.4% Bug: libyuv:976 Change-Id: I9da3beedc27ab43527b3642aa6d4decf3b5b6683 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5509198 Reviewed-by: Frank Barchard Reviewed-by: Justin Green --- source/row_neon64.cc | 55 +++++++++++++++++++++----------------------- 1 file changed, 26 insertions(+), 29 deletions(-) diff --git a/source/row_neon64.cc b/source/row_neon64.cc index 4259f425c..f01cc6172 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -1293,33 +1293,31 @@ void MergeARGB16To8Row_NEON(const uint16_t* src_r, uint8_t* dst_argb, int depth, int width) { - int shift = 8 - depth; + // Shift is 8 - depth, +8 so the result is in the top half of each lane. + int shift = 16 - depth; asm volatile( - "dup v31.8h, %w6 \n" "1: \n" - "ldr q2, [%0], #16 \n" // R + "ldr q0, [%0], #16 \n" // B "ldr q1, [%1], #16 \n" // G - "ldr q0, [%2], #16 \n" // B + "ldr q2, [%2], #16 \n" // R "ldr q3, [%3], #16 \n" // A - "ushl v2.8h, v2.8h, v31.8h \n" + "uqshl v0.8h, v0.8h, v31.8h \n" "prfm pldl1keep, [%0, 448] \n" - "ushl v1.8h, v1.8h, v31.8h \n" + "uqshl v1.8h, v1.8h, v31.8h \n" "prfm pldl1keep, [%1, 448] \n" - "ushl v0.8h, v0.8h, v31.8h \n" + "uqshl v2.8h, v2.8h, v31.8h \n" "prfm pldl1keep, [%2, 448] \n" - "ushl v3.8h, v3.8h, v31.8h \n" + "uqshl v3.8h, v3.8h, v31.8h \n" "prfm pldl1keep, [%3, 448] \n" - "uqxtn v2.8b, v2.8h \n" - "uqxtn v1.8b, v1.8h \n" - "uqxtn v0.8b, v0.8h \n" - "uqxtn v3.8b, v3.8h \n" + "trn2 v0.16b, v0.16b, v1.16b \n" + "trn2 v1.16b, v2.16b, v3.16b \n" "subs %w5, %w5, #8 \n" - "st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%4], #32 \n" + "st2 {v0.8h, v1.8h}, [%4], #32 \n" "b.gt 1b \n" - : "+r"(src_r), // %0 + : "+r"(src_b), // %0 "+r"(src_g), // %1 - "+r"(src_b), // %2 + "+r"(src_r), // %2 "+r"(src_a), // %3 "+r"(dst_argb), // %4 "+r"(width) // %5 @@ -1333,30 +1331,29 @@ void MergeXRGB16To8Row_NEON(const uint16_t* src_r, uint8_t* dst_argb, int depth, int width) { - int shift = 8 - depth; + // Shift is 8 - depth, +8 so the result is in the top half of each lane. + int shift = 16 - depth; asm volatile( - "dup v31.8h, %w5 \n" - "movi v3.8b, #0xff \n" // A (0xff) + "movi v3.16b, #0xff \n" // A (0xff) "1: \n" - "ldr q2, [%0], #16 \n" // R + "ldr q0, [%0], #16 \n" // B "ldr q1, [%1], #16 \n" // G - "ldr q0, [%2], #16 \n" // B - "ushl v2.8h, v2.8h, v31.8h \n" + "ldr q2, [%2], #16 \n" // R + "uqshl v0.8h, v0.8h, v31.8h \n" "prfm pldl1keep, [%0, 448] \n" - "ushl v1.8h, v1.8h, v31.8h \n" + "uqshl v1.8h, v1.8h, v31.8h \n" "prfm pldl1keep, [%1, 448] \n" - "ushl v0.8h, v0.8h, v31.8h \n" + "uqshl v2.8h, v2.8h, v31.8h \n" "prfm pldl1keep, [%2, 448] \n" - "uqxtn v2.8b, v2.8h \n" - "uqxtn v1.8b, v1.8h \n" - "uqxtn v0.8b, v0.8h \n" + "trn2 v0.16b, v0.16b, v1.16b \n" + "trn2 v1.16b, v2.16b, v3.16b \n" "subs %w4, %w4, #8 \n" - "st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%3], #32 \n" + "st2 {v0.8h, v1.8h}, [%3], #32 \n" "b.gt 1b \n" - : "+r"(src_r), // %0 + : "+r"(src_b), // %0 "+r"(src_g), // %1 - "+r"(src_b), // %2 + "+r"(src_r), // %2 "+r"(dst_argb), // %3 "+r"(width) // %4 : "r"(shift) // %5