mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-06 08:46:47 +08:00
[AArch64] Improve instruction interleaving in READI212_SVE
The existing instruction arrangement is sub-optimal on little cores
since it has instructions with dependencies next to each other, so
spread them out to improve performance.
No significant change observed on bigger cores, but little cores do show
some small improvements except for the *Alpha* kernels which regress
slightly.
Runtime changes observed relative to the previous SVE implementation:
                   | Cortex-A510 | Cortex-A520
I210AlphaToARGBRow | (!) +7.0%   | (!) +6.8%
I210ToAR30Row      | -10.3%      | -9.9%
I210ToARGBRow      | -2.4%       | -2.3%
I212ToAR30Row      | -10.3%      | -9.9%
I212ToARGBRow      | -2.4%       | -2.3%
Change-Id: I626942ce02c4610cfac1ea4f8e7890653ee4324f
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6067150
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
parent
a729ba686a
commit
2c32b689e4
@ -80,35 +80,35 @@ extern "C" {
|
||||
|
||||
#define READI210_SVE \
|
||||
"ld1h {z3.h}, p1/z, [%[src_y]] \n" \
|
||||
"lsl z0.h, z3.h, #6 \n" \
|
||||
"usra z0.h, z3.h, #4 \n" \
|
||||
"ld1h {z1.s}, p1/z, [%[src_u]] \n" \
|
||||
"ld1h {z2.s}, p1/z, [%[src_v]] \n" \
|
||||
"incb %[src_y] \n" \
|
||||
"inch %[src_u] \n" \
|
||||
"inch %[src_v] \n" \
|
||||
"lsl z0.h, z3.h, #6 \n" \
|
||||
"trn1 z1.h, z1.h, z1.h \n" \
|
||||
"trn1 z2.h, z2.h, z2.h \n" \
|
||||
"prfm pldl1keep, [%[src_y], 448] \n" \
|
||||
"prfm pldl1keep, [%[src_u], 128] \n" \
|
||||
"prfm pldl1keep, [%[src_v], 128] \n" \
|
||||
"trn1 z1.h, z1.h, z1.h \n" \
|
||||
"trn1 z2.h, z2.h, z2.h \n" \
|
||||
"usra z0.h, z3.h, #4 \n" \
|
||||
"uqshrnb z1.b, z1.h, #2 \n" \
|
||||
"uqshrnb z2.b, z2.h, #2 \n"
|
||||
|
||||
#define READI212_SVE \
|
||||
"ld1h {z3.h}, p1/z, [%[src_y]] \n" \
|
||||
"lsl z0.h, z3.h, #4 \n" \
|
||||
"usra z0.h, z3.h, #8 \n" \
|
||||
"ld1h {z1.s}, p1/z, [%[src_u]] \n" \
|
||||
"ld1h {z2.s}, p1/z, [%[src_v]] \n" \
|
||||
"incb %[src_y] \n" \
|
||||
"inch %[src_u] \n" \
|
||||
"inch %[src_v] \n" \
|
||||
"lsl z0.h, z3.h, #4 \n" \
|
||||
"trn1 z1.h, z1.h, z1.h \n" \
|
||||
"trn1 z2.h, z2.h, z2.h \n" \
|
||||
"prfm pldl1keep, [%[src_y], 448] \n" \
|
||||
"prfm pldl1keep, [%[src_u], 128] \n" \
|
||||
"prfm pldl1keep, [%[src_v], 128] \n" \
|
||||
"trn1 z1.h, z1.h, z1.h \n" \
|
||||
"trn1 z2.h, z2.h, z2.h \n" \
|
||||
"usra z0.h, z3.h, #8 \n" \
|
||||
"uqshrnb z1.b, z1.h, #4 \n" \
|
||||
"uqshrnb z2.b, z2.h, #4 \n"
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user