[AArch64] Unroll ScaleRowDown34_1_Box_NEON

We can make use of wider instructions for the loads and stores as well
as for the URHADD instructions. In addition, duplicating the instruction
sequence through unrolling provides a further small improvement on
little cores with limited out-of-order capability.
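
For reference, the arithmetic the kernel implements (per the comments in
the assembly below) is: average the two source rows with rounding, then
blend each group of four averaged pixels into three outputs with 3:1,
1:1 and 1:3 weights. A rough scalar sketch of that ordering (the helper
name is illustrative, and this is not the exact ScaleRowDown34_1_Box_C
fallback):

#include <stddef.h>
#include <stdint.h>

// Scalar model of the 3/4 box-filtered downscale the Neon kernel
// vectorizes: rounded average of the two rows (what URHADD computes),
// then the a0/a1/a2 blends described in the assembly comments.
static void ScaleRowDown34_1_Box_Scalar(const uint8_t* src_ptr,
                                        ptrdiff_t src_stride,
                                        uint8_t* dst_ptr,
                                        int dst_width) {
  const uint8_t* s = src_ptr;               // source row 0
  const uint8_t* t = src_ptr + src_stride;  // source row 1
  int x;
  for (x = 0; x < dst_width; x += 3) {
    uint8_t p0 = (uint8_t)((s[0] + t[0] + 1) >> 1);  // rounded row average
    uint8_t p1 = (uint8_t)((s[1] + t[1] + 1) >> 1);
    uint8_t p2 = (uint8_t)((s[2] + t[2] + 1) >> 1);
    uint8_t p3 = (uint8_t)((s[3] + t[3] + 1) >> 1);
    dst_ptr[0] = (uint8_t)((p0 * 3 + p1 + 2) >> 2);  // a0
    dst_ptr[1] = (uint8_t)((p1 + p2 + 1) >> 1);      // a1
    dst_ptr[2] = (uint8_t)((p2 + p3 * 3 + 2) >> 2);  // a2
    s += 4;
    t += 4;
    dst_ptr += 3;
  }
}

Widening the vectors from 8 to 16 bytes lets each iteration of the Neon
loop produce 48 output pixels instead of 24.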

Reduction in runtimes observed compared to the existing Neon
implementation:

 Cortex-A55: -23.5%
Cortex-A510: -35.4%
Cortex-A520: -40.5%
 Cortex-A76: -15.1%
Cortex-A715:  -6.2%
Cortex-A720:  -6.2%
  Cortex-X1: -17.9%
  Cortex-X2: -18.4%
  Cortex-X3: -18.3%
  Cortex-X4: -14.0%

Bug: b/42280945
Change-Id: I5905e026a0507870bfc580b702906d6acb4ed6f4
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5725170
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Authored by George Steed 2024-05-15 21:26:24 +01:00; committed by Frank Barchard
parent be5de19db3
commit 776a509891
2 changed files with 27 additions and 11 deletions


@@ -261,6 +261,14 @@ SDANY(ScaleRowDown34_0_Box_Any_NEON,
       4 / 3,
       1,
       23)
+#ifdef __aarch64__
+SDANY(ScaleRowDown34_1_Box_Any_NEON,
+      ScaleRowDown34_1_Box_NEON,
+      ScaleRowDown34_1_Box_C,
+      4 / 3,
+      1,
+      47)
+#else
 SDANY(ScaleRowDown34_1_Box_Any_NEON,
       ScaleRowDown34_1_Box_NEON,
       ScaleRowDown34_1_Box_C,
@@ -268,6 +276,7 @@ SDANY(ScaleRowDown34_1_Box_Any_NEON,
       1,
       23)
+#endif
 #endif
 #ifdef HAS_SCALEROWDOWN34_MSA
 SDANY(ScaleRowDown34_Any_MSA,
       ScaleRowDown34_MSA,
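
The final SDANY argument above is the width mask: it becomes 47 on
AArch64 because the unrolled kernel now produces 48 output pixels per
iteration (the non-AArch64 path keeps 24 pixels, hence 23). A minimal
sketch of what such an any-width wrapper does, assuming it runs the Neon
kernel on the largest multiple of 48 output pixels and hands the
remainder to the C fallback; this is illustrative, not the verbatim
SDANY macro expansion:

#include <stddef.h>
#include <stdint.h>

// Kernel prototypes as used by the wrapper (normally supplied by libyuv
// headers); the signatures follow the definitions in this change.
void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride,
                               uint8_t* dst_ptr, int dst_width);
void ScaleRowDown34_1_Box_C(const uint8_t* src_ptr, ptrdiff_t src_stride,
                            uint8_t* dst_ptr, int dst_width);

// Assumed behaviour of the generated ScaleRowDown34_1_Box_Any_NEON.
void ScaleRowDown34_1_Box_Any_NEON(const uint8_t* src_ptr,
                                   ptrdiff_t src_stride,
                                   uint8_t* dst_ptr,
                                   int dst_width) {
  int r = dst_width % 48;  // remainder for the C fallback (mask 47)
  int n = dst_width - r;   // portion the unrolled Neon kernel handles
  if (n > 0) {
    ScaleRowDown34_1_Box_NEON(src_ptr, src_stride, dst_ptr, n);
  }
  if (r > 0) {
    // 4/3 source pixels are consumed per destination pixel written.
    ScaleRowDown34_1_Box_C(src_ptr + n * 4 / 3, src_stride, dst_ptr + n, r);
  }
}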


@@ -237,35 +237,42 @@ void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
                                uint8_t* dst_ptr,
                                int dst_width) {
-  asm volatile (
-      "movi       v20.8b, #3                     \n"
+  asm volatile(
+      "movi       v20.16b, #3                    \n"
       "add        %3, %3, %0                     \n"
       "1:                                        \n"
-      "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32     \n"  // src line 0
-      "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32     \n"  // src line 1
-      "subs       %w2, %w2, #24                  \n"
+      "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // src line 0
+      "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%3], #64 \n"  // src line 1
+      "subs       %w2, %w2, #48                  \n"
       // average src line 0 with src line 1
-      "urhadd     v0.8b, v0.8b, v4.8b            \n"
-      "urhadd     v1.8b, v1.8b, v5.8b            \n"
-      "urhadd     v2.8b, v2.8b, v6.8b            \n"
-      "urhadd     v3.8b, v3.8b, v7.8b            \n"
+      "urhadd     v0.16b, v0.16b, v4.16b         \n"
+      "urhadd     v1.16b, v1.16b, v5.16b         \n"
+      "urhadd     v2.16b, v2.16b, v6.16b         \n"
+      "urhadd     v3.16b, v3.16b, v7.16b         \n"
       "prfm       pldl1keep, [%0, 448]           \n"  // prefetch 7 lines ahead
       // a0 = (src[0] * 3 + s[1] * 1 + 2) >> 2
       "ushll      v4.8h, v1.8b, #0               \n"
+      "ushll2     v5.8h, v1.16b, #0              \n"
       "umlal      v4.8h, v0.8b, v20.8b           \n"
+      "umlal2     v5.8h, v0.16b, v20.16b         \n"
       "uqrshrn    v0.8b, v4.8h, #2               \n"
+      "uqrshrn2   v0.16b, v5.8h, #2              \n"
       "prfm       pldl1keep, [%3, 448]           \n"
       // a1 = (src[1] * 1 + s[2] * 1 + 1) >> 1
-      "urhadd     v1.8b, v1.8b, v2.8b            \n"
+      "urhadd     v1.16b, v1.16b, v2.16b         \n"
       // a2 = (src[2] * 1 + s[3] * 3 + 2) >> 2
       "ushll      v4.8h, v2.8b, #0               \n"
+      "ushll2     v5.8h, v2.16b, #0              \n"
       "umlal      v4.8h, v3.8b, v20.8b           \n"
+      "umlal2     v5.8h, v3.16b, v20.16b         \n"
       "uqrshrn    v2.8b, v4.8h, #2               \n"
+      "uqrshrn2   v2.16b, v5.8h, #2              \n"
-      "st3        {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
+      "st3        {v0.16b,v1.16b,v2.16b}, [%1], #48 \n"
       "b.gt       1b                             \n"
       : "+r"(src_ptr),  // %0
         "+r"(dst_ptr),  // %1