mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-06 16:56:55 +08:00
[AArch64] Optimize MergeXR30Row_10_NEON
By keeping intermediate data 16 bits wide we can compute twice as much per iteration and use ST2 to store the final result. This appears to be much better even on micro-architectures where ST2 is slightly slower than ST1. We save a couple of instructions by taking advantage of multiply-add instructions to perform an effective shift-left and bitwise-or, since we know the sets of nonzero bits are disjoint after the UMIN. Reduction in runtime observed for MergeXR30Row_10_NEON: Cortex-A55: -34.2%, Cortex-A510: -35.6%, Cortex-A76: -44.9%, Cortex-X2: -48.3%. Bug: libyuv:976; Change-Id: I6e2627f9aa8e400ea82ff381ed587fcfc0d94648; Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5509199; Reviewed-by: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
parent
56258c125b
commit
e348995a92
@ -528,7 +528,7 @@ ANY31PT(MergeXR30Row_10_Any_NEON,
|
||||
2,
|
||||
uint8_t,
|
||||
4,
|
||||
3)
|
||||
7)
|
||||
#endif
|
||||
|
||||
#ifdef HAS_MERGEXR64ROW_AVX2
|
||||
|
||||
@ -1195,32 +1195,34 @@ void MergeXR30Row_10_NEON(const uint16_t* src_r,
|
||||
uint8_t* dst_ar30,
|
||||
int /* depth */,
|
||||
int width) {
|
||||
// Neon has no "shift left and accumulate/orr", so use a multiply-add to
|
||||
// perform the shift instead.
|
||||
int limit = 1023;
|
||||
asm volatile(
|
||||
"movi v30.16b, #255 \n"
|
||||
"ushr v30.4s, v30.4s, #22 \n" // 1023
|
||||
"dup v5.8h, %w[limit] \n"
|
||||
"movi v6.8h, #16 \n" // 1 << 4
|
||||
"movi v7.8h, #4, lsl #8 \n" // 1 << 10
|
||||
"1: \n"
|
||||
"ldr d2, [%2], #8 \n" // B
|
||||
"ldr d1, [%1], #8 \n" // G
|
||||
"ldr d0, [%0], #8 \n" // R
|
||||
"ushll v2.4s, v2.4h, #0 \n" // 000B
|
||||
"ushll v1.4s, v1.4h, #0 \n" // G
|
||||
"ushll v0.4s, v0.4h, #0 \n" // R
|
||||
"umin v2.4s, v2.4s, v30.4s \n"
|
||||
"umin v1.4s, v1.4s, v30.4s \n"
|
||||
"umin v0.4s, v0.4s, v30.4s \n"
|
||||
"sli v2.4s, v1.4s, #10 \n" // 00GB
|
||||
"sli v2.4s, v0.4s, #20 \n" // 0RGB
|
||||
"orr v2.4s, #0xc0, lsl #24 \n" // ARGB (AR30)
|
||||
"subs %w4, %w4, #4 \n"
|
||||
"str q2, [%3], #16 \n"
|
||||
"ldr q0, [%0], #16 \n" // xxxxxxRrrrrrrrrr
|
||||
"ldr q1, [%1], #16 \n" // xxxxxxGggggggggg
|
||||
"ldr q2, [%2], #16 \n" // xxxxxxBbbbbbbbbb
|
||||
"umin v0.8h, v0.8h, v5.8h \n" // 000000Rrrrrrrrrr
|
||||
"umin v1.8h, v1.8h, v5.8h \n" // 000000Gggggggggg
|
||||
"movi v4.8h, #0xc0, lsl #8 \n" // 1100000000000000
|
||||
"umin v3.8h, v2.8h, v5.8h \n" // 000000Bbbbbbbbbb
|
||||
"mla v4.8h, v0.8h, v6.8h \n" // 11Rrrrrrrrrr0000
|
||||
"mla v3.8h, v1.8h, v7.8h \n" // ggggggBbbbbbbbbb
|
||||
"usra v4.8h, v1.8h, #6 \n" // 11RrrrrrrrrrGggg
|
||||
"subs %w4, %w4, #8 \n"
|
||||
"st2 {v3.8h, v4.8h}, [%3], #32 \n"
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_r), // %0
|
||||
"+r"(src_g), // %1
|
||||
"+r"(src_b), // %2
|
||||
"+r"(dst_ar30), // %3
|
||||
"+r"(width) // %4
|
||||
:
|
||||
: "memory", "cc", "v0", "v1", "v2", "v30");
|
||||
: [limit] "r"(limit)
|
||||
: "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
|
||||
}
|
||||
|
||||
void MergeAR64Row_NEON(const uint16_t* src_r,
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user