[AArch64] Unroll ScaleRowUp2_Linear_NEON

Unroll the loop to produce 32 output pixels per iteration instead of 16, using
full 128-bit vectors. On little cores with limited out-of-order capability this
gives a good improvement.

Change in runtime observed compared to the existing Neon implementation
(negative means faster):

 Cortex-A55: -21.3%
Cortex-A520: -33.6%
 Cortex-A76:  +1.1%
Cortex-A715:  =0.0%
Cortex-A720:  =0.0%
  Cortex-X1: +10.4% (!)
  Cortex-X2:  -5.3%
  Cortex-X3:  -4.3%
  Cortex-X4:  -9.9%

Bug: b/42280945
Change-Id: I45b3510f13c05b19d61052e2f8e447199dbd0551
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5725169
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Authored by George Steed on 2024-05-15 21:20:11 +01:00; committed by Frank Barchard
parent 42d33341d3
commit be5de19db3
2 changed files with 22 additions and 7 deletions
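
For context on what the kernel computes: this is a 2x horizontal upscale with a
linear filter, where each output sample is 3/4 of the nearer source sample plus
1/4 of the farther one, rounded (the (3*near + far + 2) >> 2 arithmetic visible
in the asm comments below). A minimal scalar sketch of that inner-row
arithmetic follows; the function name is hypothetical and the edge handling
done by the real C reference (ScaleRowUp2_Linear_C) and the "Any" wrappers is
omitted.

#include <stdint.h>

// Illustrative scalar version of the row arithmetic (hypothetical name):
// each output is 3/4 of the nearer source sample plus 1/4 of the farther
// one, with rounding. First/last-pixel handling is omitted.
static void ScaleRowUp2_Linear_Sketch(const uint8_t* src_ptr,
                                      uint8_t* dst_ptr,
                                      int dst_width) {
  for (int x = 0; x < dst_width / 2; ++x) {
    // even output: near = src_ptr[x], far = src_ptr[x + 1]
    dst_ptr[2 * x + 0] = (uint8_t)((3 * src_ptr[x] + src_ptr[x + 1] + 2) >> 2);
    // odd output:  near = src_ptr[x + 1], far = src_ptr[x]
    dst_ptr[2 * x + 1] = (uint8_t)((src_ptr[x] + 3 * src_ptr[x + 1] + 2) >> 2);
  }
}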


@@ -749,12 +749,20 @@ SUH2LANY(ScaleRowUp2_Linear_16_Any_AVX2,
 #endif
 #ifdef HAS_SCALEROWUP2_LINEAR_NEON
+#ifdef __aarch64__
+SUH2LANY(ScaleRowUp2_Linear_Any_NEON,
+         ScaleRowUp2_Linear_NEON,
+         ScaleRowUp2_Linear_C,
+         31,
+         uint8_t)
+#else
 SUH2LANY(ScaleRowUp2_Linear_Any_NEON,
          ScaleRowUp2_Linear_NEON,
          ScaleRowUp2_Linear_C,
          15,
          uint8_t)
+#endif
 #endif
 #ifdef HAS_SCALEROWUP2_LINEAR_12_NEON
 SUH2LANY(ScaleRowUp2_Linear_12_Any_NEON,
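
In this first hunk only the "Any" wrapper registration changes: on AArch64 the
unrolled kernel now produces 32 output pixels per loop iteration, so its mask
in SUH2LANY goes from 15 to 31 and the C kernel picks up the remaining tail.
A simplified sketch of how such a mask-based wrapper dispatches (hypothetical
name; the real SUH2LANY macro also patches up the first and last output
samples, which is omitted here):

#include <stdint.h>

// Declarations as used by the SUH2LANY macro (signatures match the diff).
void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width);
void ScaleRowUp2_Linear_C(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width);

// Simplified illustration of a mask-based "any width" wrapper.
static void ScaleRowUp2_Linear_Any_Sketch(const uint8_t* src_ptr,
                                          uint8_t* dst_ptr,
                                          int dst_width) {
  const int mask = 31;        // unrolled NEON kernel writes 32 pixels per iter
  int n = dst_width & ~mask;  // widest prefix the SIMD kernel can handle
  if (n > 0) {
    ScaleRowUp2_Linear_NEON(src_ptr, dst_ptr, n);
  }
  if (n < dst_width) {        // scalar tail of fewer than 32 output pixels
    ScaleRowUp2_Linear_C(src_ptr + n / 2, dst_ptr + n, dst_width - n);
  }
}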


@@ -539,32 +539,39 @@ void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr,
                              uint8_t* dst_ptr,
                              int dst_width) {
   const uint8_t* src_temp = src_ptr + 1;
-  asm volatile (
-      "movi v31.8b, #3 \n"
+  asm volatile(
+      "movi v31.16b, #3 \n"
       "1: \n"
-      "ldr d0, [%0], #8 \n" // 01234567
-      "ldr d1, [%1], #8 \n" // 12345678
+      "ldr q0, [%0], #16 \n" // 0123456789abcdef
+      "ldr q1, [%1], #16 \n" // 123456789abcdefg
       "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
       "ushll v2.8h, v0.8b, #0 \n" // 01234567 (16b)
       "ushll v3.8h, v1.8b, #0 \n" // 12345678 (16b)
+      "ushll2 v4.8h, v0.16b, #0 \n" // 89abcdef (16b)
+      "ushll2 v5.8h, v1.16b, #0 \n" // 9abcdefg (16b)
       "umlal v2.8h, v1.8b, v31.8b \n" // 3*near+far (odd)
       "umlal v3.8h, v0.8b, v31.8b \n" // 3*near+far (even)
+      "umlal2 v4.8h, v1.16b, v31.16b \n" // 3*near+far (odd)
+      "umlal2 v5.8h, v0.16b, v31.16b \n" // 3*near+far (even)
       "rshrn v2.8b, v2.8h, #2 \n" // 3/4*near+1/4*far (odd)
       "rshrn v1.8b, v3.8h, #2 \n" // 3/4*near+1/4*far (even)
+      "rshrn2 v2.16b, v4.8h, #2 \n" // 3/4*near+1/4*far (odd)
+      "rshrn2 v1.16b, v5.8h, #2 \n" // 3/4*near+1/4*far (even)
-      "st2 {v1.8b, v2.8b}, [%2], #16 \n" // store
-      "subs %w3, %w3, #16 \n" // 8 sample -> 16 sample
+      "st2 {v1.16b, v2.16b}, [%2], #32 \n"
+      "subs %w3, %w3, #32 \n"
       "b.gt 1b \n"
       : "+r"(src_ptr), // %0
         "+r"(src_temp), // %1
         "+r"(dst_ptr), // %2
         "+r"(dst_width) // %3
       :
-      : "memory", "cc", "v0", "v1", "v2", "v3", "v31" // Clobber List
+      : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5",
+        "v31" // Clobber List
   );
 }
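
For readers who prefer intrinsics to inline assembly, here is an illustrative
C sketch (not libyuv code; names are hypothetical) of one iteration of the
unrolled loop: 16 source bytes in, 32 interleaved even/odd output bytes out.

#include <arm_neon.h>
#include <stdint.h>

// One iteration of the unrolled loop: widen, blend 3:1 with the neighbouring
// sample, narrow with rounding, and interleave even/odd outputs via st2.
static inline void ScaleRowUp2_Linear_Step(const uint8_t* src, uint8_t* dst) {
  uint8x16_t s0 = vld1q_u8(src);      // 0123456789abcdef
  uint8x16_t s1 = vld1q_u8(src + 1);  // 123456789abcdefg
  uint8x8_t three = vdup_n_u8(3);

  // 16-bit accumulators: odd = src[x] + 3*src[x+1], even = src[x+1] + 3*src[x]
  uint16x8_t odd_lo = vmlal_u8(vmovl_u8(vget_low_u8(s0)), vget_low_u8(s1), three);
  uint16x8_t even_lo = vmlal_u8(vmovl_u8(vget_low_u8(s1)), vget_low_u8(s0), three);
  uint16x8_t odd_hi = vmlal_u8(vmovl_u8(vget_high_u8(s0)), vget_high_u8(s1), three);
  uint16x8_t even_hi = vmlal_u8(vmovl_u8(vget_high_u8(s1)), vget_high_u8(s0), three);

  // Rounding narrow by 2: (3*near + far + 2) >> 2, matching rshrn/rshrn2.
  uint8x16x2_t out;
  out.val[0] = vcombine_u8(vrshrn_n_u16(even_lo, 2), vrshrn_n_u16(even_hi, 2));
  out.val[1] = vcombine_u8(vrshrn_n_u16(odd_lo, 2), vrshrn_n_u16(odd_hi, 2));
  vst2q_u8(dst, out);  // interleave: even, odd, even, odd, ...
}

The arithmetic is the same as the hand-written asm above; the asm additionally
uses post-indexed loads/stores and a prefetch.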