mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2026-01-01 03:12:16 +08:00
[AArch64] Use LD1/ST1 rather than LD4/ST4 in ARGBShadeRow_NEON
The use of LD4 and ST4 to de-interleave the ARGB color channels is unnecessary here, since we can instead adjust the scale multiplicand to match the interleaved layout. LD4 and ST4 are known to perform poorly on some micro-architectures, so LD1 and ST1 should be preferred here.

Reduction in runtime for ARGBShadeRow_NEON:
Cortex-A55: -19.9%
Cortex-A510: -50.8%
Cortex-A76: -36.0%
Cortex-X2: -46.4%

Bug: libyuv:976
Change-Id: I10a0e6a0a62242826d39b1e963063770f084226a
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5494093
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
parent
ce32eb773f
commit
1eae2efbc7
@ -3613,36 +3613,36 @@ void ARGBQuantizeRow_NEON(uint8_t* dst_argb,
|
||||
}
|
||||
|
||||
// Shade 8 pixels at a time by specified value.
|
||||
// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scaler register from 0 to 8.
|
||||
// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
|
||||
// sqrdmulh is a rounding instruction, so +1 if high bit of low half of
|
||||
// multiply result is set.
|
||||
void ARGBShadeRow_NEON(const uint8_t* src_argb,
|
||||
uint8_t* dst_argb,
|
||||
int width,
|
||||
uint32_t value) {
|
||||
asm volatile(
|
||||
"dup v0.4s, %w3 \n" // duplicate scale value.
|
||||
"zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb.
|
||||
"ushr v0.8h, v0.8h, #1 \n" // scale / 2.
|
||||
"dup v0.4s, %w3 \n" // duplicate scale value.
|
||||
"zip1 v0.16b, v0.16b, v0.16b \n" // v0.16b aarrggbbaarrggbb.
|
||||
"ushr v0.8h, v0.8h, #1 \n" // scale / 2.
|
||||
|
||||
// 8 pixel loop.
|
||||
"1: \n"
|
||||
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
"uxtl v4.8h, v4.8b \n" // b (0 .. 255)
|
||||
"prfm pldl1keep, [%0, 448] \n"
|
||||
"uxtl v5.8h, v5.8b \n"
|
||||
"uxtl v6.8h, v6.8b \n"
|
||||
"uxtl v7.8h, v7.8b \n"
|
||||
"sqrdmulh v4.8h, v4.8h, v0.h[0] \n" // b * scale * 2
|
||||
"sqrdmulh v5.8h, v5.8h, v0.h[1] \n" // g
|
||||
"sqrdmulh v6.8h, v6.8h, v0.h[2] \n" // r
|
||||
"sqrdmulh v7.8h, v7.8h, v0.h[3] \n" // a
|
||||
"uqxtn v4.8b, v4.8h \n"
|
||||
"uqxtn v5.8b, v5.8h \n"
|
||||
"uqxtn v6.8b, v6.8h \n"
|
||||
"uqxtn v7.8b, v7.8h \n"
|
||||
"st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB
|
||||
"b.gt 1b \n"
|
||||
"1: \n"
|
||||
"ld1 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
"uxtl v4.8h, v4.8b \n"
|
||||
"prfm pldl1keep, [%0, 448] \n"
|
||||
"uxtl v5.8h, v5.8b \n"
|
||||
"uxtl v6.8h, v6.8b \n"
|
||||
"uxtl v7.8h, v7.8b \n"
|
||||
"sqrdmulh v4.8h, v4.8h, v0.8h \n" // argb * scale * 2
|
||||
"sqrdmulh v5.8h, v5.8h, v0.8h \n"
|
||||
"sqrdmulh v6.8h, v6.8h, v0.8h \n"
|
||||
"sqrdmulh v7.8h, v7.8h, v0.8h \n"
|
||||
"uqxtn v4.8b, v4.8h \n"
|
||||
"uqxtn v5.8b, v5.8h \n"
|
||||
"uqxtn v6.8b, v6.8h \n"
|
||||
"uqxtn v7.8b, v7.8h \n"
|
||||
"st1 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_argb), // %1
|
||||
"+r"(width) // %2
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user