[AArch64] Unroll ScaleRowDown4_NEON

We can use wider load/store instructions here, which is an improvement
on most of the cores measured.

Change in runtime observed compared to the existing Neon
implementation (negative is faster):

 Cortex-A55:  +4.9% (!)
Cortex-A510: -46.3%
Cortex-A520: -49.0%
 Cortex-A76: -12.2%
Cortex-A715: -15.5%
Cortex-A720: -15.0%
  Cortex-X1: -12.4%
  Cortex-X2: -12.5%
  Cortex-X3: -12.3%
  Cortex-X4:  +0.3%

Bug: b/42280945
Change-Id: Id8af6499c63919924c2a954dfe7765b703ce4820
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5785970
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
George Steed 2024-05-15 21:50:12 +01:00 committed by Frank Barchard
parent e6297afd14
commit 2d62d8d22a
3 changed files with 6 additions and 6 deletions

View File

@ -279,7 +279,7 @@ static void ScalePlaneDown4(int src_width,
if (TestCpuFlag(kCpuHasNEON)) {
ScaleRowDown4 =
filtering ? ScaleRowDown4Box_Any_NEON : ScaleRowDown4_Any_NEON;
if (IS_ALIGNED(dst_width, 8)) {
if (IS_ALIGNED(dst_width, 16)) {
ScaleRowDown4 = filtering ? ScaleRowDown4Box_NEON : ScaleRowDown4_NEON;
}
}

View File

@ -202,7 +202,7 @@ SDANY(ScaleRowDown4Box_Any_AVX2,
15)
#endif
#ifdef HAS_SCALEROWDOWN4_NEON
SDANY(ScaleRowDown4_Any_NEON, ScaleRowDown4_NEON, ScaleRowDown4_C, 4, 1, 7)
SDANY(ScaleRowDown4_Any_NEON, ScaleRowDown4_NEON, ScaleRowDown4_C, 4, 1, 15)
SDANY(ScaleRowDown4Box_Any_NEON,
ScaleRowDown4Box_NEON,
ScaleRowDown4Box_C,

View File

@ -101,12 +101,12 @@ void ScaleRowDown4_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
(void)src_stride;
asm volatile (
asm volatile(
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
"subs %w2, %w2, #8 \n" // 8 processed per loop
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // src line 0
"subs %w2, %w2, #16 \n" // 16 processed per loop
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"st1 {v2.8b}, [%1], #8 \n"
"st1 {v2.16b}, [%1], #16 \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1