mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-06 08:46:47 +08:00
[AArch64] Unroll ScaleRowDown4_NEON
We can use wider load/store instructions here, which is mostly an improvement across the board. Change in runtime observed compared to the existing Neon implementation (negative values are faster): Cortex-A55: +4.9% (!), Cortex-A510: -46.3%, Cortex-A520: -49.0%, Cortex-A76: -12.2%, Cortex-A715: -15.5%, Cortex-A720: -15.0%, Cortex-X1: -12.4%, Cortex-X2: -12.5%, Cortex-X3: -12.3%, Cortex-X4: +0.3%. Bug: b/42280945 Change-Id: Id8af6499c63919924c2a954dfe7765b703ce4820 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5785970 Reviewed-by: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
parent
e6297afd14
commit
2d62d8d22a
@ -279,7 +279,7 @@ static void ScalePlaneDown4(int src_width,
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
ScaleRowDown4 =
|
||||
filtering ? ScaleRowDown4Box_Any_NEON : ScaleRowDown4_Any_NEON;
|
||||
if (IS_ALIGNED(dst_width, 8)) {
|
||||
if (IS_ALIGNED(dst_width, 16)) {
|
||||
ScaleRowDown4 = filtering ? ScaleRowDown4Box_NEON : ScaleRowDown4_NEON;
|
||||
}
|
||||
}
|
||||
|
||||
@ -202,7 +202,7 @@ SDANY(ScaleRowDown4Box_Any_AVX2,
|
||||
15)
|
||||
#endif
|
||||
#ifdef HAS_SCALEROWDOWN4_NEON
|
||||
SDANY(ScaleRowDown4_Any_NEON, ScaleRowDown4_NEON, ScaleRowDown4_C, 4, 1, 7)
|
||||
SDANY(ScaleRowDown4_Any_NEON, ScaleRowDown4_NEON, ScaleRowDown4_C, 4, 1, 15)
|
||||
SDANY(ScaleRowDown4Box_Any_NEON,
|
||||
ScaleRowDown4Box_NEON,
|
||||
ScaleRowDown4Box_C,
|
||||
|
||||
@ -101,12 +101,12 @@ void ScaleRowDown4_NEON(const uint8_t* src_ptr,
|
||||
uint8_t* dst_ptr,
|
||||
int dst_width) {
|
||||
(void)src_stride;
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop
|
||||
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // src line 0
|
||||
"subs %w2, %w2, #16 \n" // 16 processed per loop
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"st1 {v2.8b}, [%1], #8 \n"
|
||||
"st1 {v2.16b}, [%1], #16 \n"
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_ptr), // %0
|
||||
"+r"(dst_ptr), // %1
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user