mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-08 01:36:47 +08:00
[AArch64] Use LD1/ST1 rather than LD4/ST4 in ARGBAddRow_NEON
There is no need to de-interleave channels here since we are applying the same operation across all lanes. LD4 and ST4 are known to be significantly slower than LD1/ST1 on some micro-architectures so we should prefer to avoid them where possible. Reduction in runtimes observed for ARGBAddRow_NEON: Cortex-A55: -15.0% Cortex-A510: -59.8% Cortex-A76: -54.4% Cortex-X2: -70.4% Change-Id: Id04e5259d8e5e7511dad5df85cdf9759b392cb99 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5454831 Reviewed-by: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
parent
d43a3bb2df
commit
e646991347
@ -3810,8 +3810,8 @@ void ARGBAddRow_NEON(const uint8_t* src_argb,
|
||||
asm volatile(
|
||||
// 8 pixel loop.
|
||||
"1: \n"
|
||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
|
||||
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
|
||||
"ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
|
||||
"ld1 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
|
||||
"subs %w3, %w3, #8 \n" // 8 processed per loop.
|
||||
"uqadd v0.8b, v0.8b, v4.8b \n"
|
||||
"prfm pldl1keep, [%0, 448] \n"
|
||||
@ -3819,7 +3819,7 @@ void ARGBAddRow_NEON(const uint8_t* src_argb,
|
||||
"prfm pldl1keep, [%1, 448] \n"
|
||||
"uqadd v2.8b, v2.8b, v6.8b \n"
|
||||
"uqadd v3.8b, v3.8b, v7.8b \n"
|
||||
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
|
||||
"st1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(src_argb1), // %1
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user