[AArch64] Optimize MergeXR30Row_10_NEON

By keeping the intermediate data 16 bits wide we can process twice as
many pixels per iteration and use ST2 to store the final result. This
appears to be much better even on micro-architectures where ST2 is
slightly slower than ST1.
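
As a rough illustration (a sketch assuming <stdint.h>, not code from the
patch): each 32-bit AR30 word splits into two 16-bit halves that the
kernel builds in separate vectors, and ST2 interleaves the lo/hi pairs so
that a little-endian store reconstitutes the packed 32-bit words:

  #include <stdint.h>

  // r, g, b assumed already clamped to 10 bits.
  static inline void Ar30Halves(uint32_t r, uint32_t g, uint32_t b,
                                uint16_t* lo, uint16_t* hi) {
    uint32_t ar30 = (3u << 30) | (r << 20) | (g << 10) | b;
    *lo = (uint16_t)ar30;          // ggggggBbbbbbbbbb: B, low 6 bits of G
    *hi = (uint16_t)(ar30 >> 16);  // 11RrrrrrrrrrGggg: A, R, high 4 bits of G
  }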

We save a couple of instructions by using multiply-add instructions to
perform an effective shift-left plus bitwise-or, since we know the sets
of nonzero bits are disjoint after the UMIN.
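
A minimal scalar sketch of the identity relied on here: when the nonzero
bits of (a << k) and b do not overlap, bitwise-or equals addition, so the
shift-and-or folds into a multiply-add by the constant 1 << k, which is
what MLA computes with a constant multiplier vector:

  // Equivalent to (a << k) | b whenever the shifted a and b share no bits.
  static inline uint16_t ShiftLeftOr16(uint16_t a, uint16_t b, int k) {
    return (uint16_t)(b + a * (1u << k));
  }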

Reduction in runtime observed for MergeXR30Row_10_NEON:

 Cortex-A55: -34.2%
Cortex-A510: -35.6%
 Cortex-A76: -44.9%
  Cortex-X2: -48.3%

Bug: libyuv:976
Change-Id: I6e2627f9aa8e400ea82ff381ed587fcfc0d94648
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5509199
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Author: George Steed, 2024-03-22 16:53:25 +00:00 (committed by Frank Barchard)
parent 56258c125b
commit e348995a92
2 changed files with 23 additions and 21 deletions


@@ -528,7 +528,7 @@ ANY31PT(MergeXR30Row_10_Any_NEON,
         2,
         uint8_t,
         4,
-        3)
+        7)
 #endif
 #ifdef HAS_MERGEXR64ROW_AVX2
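
The mask argument changes from 3 to 7 because the rewritten kernel now
consumes 8 pixels per iteration instead of 4. A hypothetical sketch of how
an "Any" wrapper typically uses this mask (the names and the remainder
handling below are assumptions, not the ANY31PT macro body):

  int n = width & ~7;  // largest multiple of 8 not exceeding width
  if (n > 0) {
    MergeXR30Row_10_NEON(src_r, src_g, src_b, dst_ar30, depth, n);
  }
  // The remaining (width & 7) pixels go through a temporary buffer.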


@@ -1195,32 +1195,34 @@ void MergeXR30Row_10_NEON(const uint16_t* src_r,
                           uint8_t* dst_ar30,
                           int /* depth */,
                           int width) {
+  // Neon has no "shift left and accumulate/orr", so use a multiply-add to
+  // perform the shift instead.
+  int limit = 1023;
   asm volatile(
-      "movi        v30.16b, #255                 \n"
-      "ushr        v30.4s, v30.4s, #22           \n"  // 1023
+      "dup         v5.8h, %w[limit]              \n"
+      "movi        v6.8h, #16                    \n"  // 1 << 4
+      "movi        v7.8h, #4, lsl #8             \n"  // 1 << 10
       "1:                                        \n"
-      "ldr         d2, [%2], #8                  \n"  // B
-      "ldr         d1, [%1], #8                  \n"  // G
-      "ldr         d0, [%0], #8                  \n"  // R
-      "ushll       v2.4s, v2.4h, #0              \n"  // 000B
-      "ushll       v1.4s, v1.4h, #0              \n"  // G
-      "ushll       v0.4s, v0.4h, #0              \n"  // R
-      "umin        v2.4s, v2.4s, v30.4s          \n"
-      "umin        v1.4s, v1.4s, v30.4s          \n"
-      "umin        v0.4s, v0.4s, v30.4s          \n"
-      "sli         v2.4s, v1.4s, #10             \n"  // 00GB
-      "sli         v2.4s, v0.4s, #20             \n"  // 0RGB
-      "orr         v2.4s, #0xc0, lsl #24         \n"  // ARGB (AR30)
-      "subs        %w4, %w4, #4                  \n"
-      "str         q2, [%3], #16                 \n"
+      "ldr         q0, [%0], #16                 \n"  // xxxxxxRrrrrrrrrr
+      "ldr         q1, [%1], #16                 \n"  // xxxxxxGggggggggg
+      "ldr         q2, [%2], #16                 \n"  // xxxxxxBbbbbbbbbb
+      "umin        v0.8h, v0.8h, v5.8h           \n"  // 000000Rrrrrrrrrr
+      "umin        v1.8h, v1.8h, v5.8h           \n"  // 000000Gggggggggg
+      "movi        v4.8h, #0xc0, lsl #8          \n"  // 1100000000000000
+      "umin        v3.8h, v2.8h, v5.8h           \n"  // 000000Bbbbbbbbbb
+      "mla         v4.8h, v0.8h, v6.8h           \n"  // 11Rrrrrrrrrr0000
+      "mla         v3.8h, v1.8h, v7.8h           \n"  // ggggggBbbbbbbbbb
+      "usra        v4.8h, v1.8h, #6              \n"  // 11RrrrrrrrrrGggg
+      "subs        %w4, %w4, #8                  \n"
+      "st2         {v3.8h, v4.8h}, [%3], #32     \n"
       "b.gt        1b                            \n"
       : "+r"(src_r),     // %0
         "+r"(src_g),     // %1
         "+r"(src_b),     // %2
         "+r"(dst_ar30),  // %3
         "+r"(width)      // %4
-      :
-      : "memory", "cc", "v0", "v1", "v2", "v30");
+      : [limit] "r"(limit)
+      : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
 }
 
 void MergeAR64Row_NEON(const uint16_t* src_r,
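
For reference, a scalar model of what the rewritten kernel computes per
pixel (a sketch under the same depth-10 assumption; the function name is
illustrative, not from the patch):

  #include <stdint.h>

  static void MergeXR30Row_10_Model(const uint16_t* src_r,
                                    const uint16_t* src_g,
                                    const uint16_t* src_b,
                                    uint8_t* dst_ar30,
                                    int width) {
    uint32_t* dst = (uint32_t*)dst_ar30;
    for (int i = 0; i < width; ++i) {
      uint32_t r = src_r[i] < 1023 ? src_r[i] : 1023;  // UMIN against limit
      uint32_t g = src_g[i] < 1023 ? src_g[i] : 1023;
      uint32_t b = src_b[i] < 1023 ? src_b[i] : 1023;
      dst[i] = (3u << 30) | (r << 20) | (g << 10) | b;  // AR30: A=3, 10b each
    }
  }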