[AArch64] Add Neon dot-product implementation of SumSquareError

The Neon dot-product instructions perform two widening steps rather than
one, saving us the need to widen the absolute difference to 16 bits
before accumulating. Additionally, the dot-product instructions tend to
have better performance characteristics than traditional widening
multiply instructions like SMLAL used in the existing
SumSquareError_NEON code.

Observed reduction in runtimes compared to the existing Neon kernel:

 Cortex-A55:  -9.1%
Cortex-A510: -36.7%
 Cortex-A76: -37.6%
Cortex-A720: -48.8%
  Cortex-X1: -56.1%
  Cortex-X2: -42.6%

Bug: libyuv:977
Change-Id: Ie20c69040cc47a803d8e95620d31e0bf1e1dac12
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5463945
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
George Steed 2024-04-16 14:08:00 +01:00 committed by Frank Barchard
parent 9e223c3fc0
commit 53b65220da
3 changed files with 38 additions and 0 deletions

View File

@ -82,6 +82,11 @@ extern "C" {
#define HAS_HAMMINGDISTANCE_NEON
#endif
// The following are available for AArch64 Neon:
// Kernel built on the Neon dot-product (UDOT) instruction. This macro only
// marks the kernel as compiled in; the dispatch code additionally checks
// TestCpuFlag(kCpuHasNeonDotProd) at run time before selecting it.
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#define HAS_SUMSQUAREERROR_NEON_DOTPROD
#endif
#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
#define HAS_HAMMINGDISTANCE_MSA
#define HAS_SUMSQUAREERROR_MSA
@ -117,6 +122,9 @@ uint32_t SumSquareError_AVX2(const uint8_t* src_a,
uint32_t SumSquareError_NEON(const uint8_t* src_a,
const uint8_t* src_b,
int count);
// AArch64 Neon dot-product variant of SumSquareError_NEON, with the same
// signature. The implementation consumes src_a and src_b in 16-byte steps, so
// count is presumably expected to be a positive multiple of 16 - TODO confirm
// against callers.
uint32_t SumSquareError_NEON_DotProd(const uint8_t* src_a,
const uint8_t* src_b,
int count);
uint32_t SumSquareError_MSA(const uint8_t* src_a,
const uint8_t* src_b,
int count);

View File

@ -194,6 +194,11 @@ uint64_t ComputeSumSquareError(const uint8_t* src_a,
SumSquareError = SumSquareError_NEON;
}
#endif
#if defined(HAS_SUMSQUAREERROR_NEON_DOTPROD)
// Checked after the SumSquareError_NEON selection above so that, when the CPU
// reports dot-product support, this kernel replaces the plain Neon one.
if (TestCpuFlag(kCpuHasNeonDotProd)) {
SumSquareError = SumSquareError_NEON_DotProd;
}
#endif
#if defined(HAS_SUMSQUAREERROR_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
// Note only used for multiples of 16 so count is not checked.

View File

@ -86,6 +86,31 @@ uint32_t SumSquareError_NEON(const uint8_t* src_a,
return sse;
}
// Returns the sum of squared byte differences between src_a and src_b,
// computed with the AArch64 Neon dot-product (UDOT) instruction.
//
// Each loop iteration consumes 16 bytes from both inputs. The loop is
// bottom-tested ("subs" then "b.gt"), so one iteration always runs and full
// 16-byte vectors are read until count drops to zero or below; count is
// presumably a positive multiple of 16 - TODO confirm against callers.
// The running sum is kept in 32-bit lanes and the return value is 32 bits.
// NOTE(review): callers presumably bound count so the sum cannot overflow -
// confirm.
uint32_t SumSquareError_NEON_DotProd(const uint8_t* src_a,
const uint8_t* src_b,
int count) {
uint32_t sse;
asm volatile(
// v2 holds four 32-bit partial sums; start them at zero.
"movi v2.16b, #0 \n"
"1: \n"
// Load the next 16 bytes of each input, post-incrementing both pointers.
"ld1 {v0.16b}, [%0], #16 \n"
"ld1 {v1.16b}, [%1], #16 \n"
// count -= 16; this sets the flags tested by b.gt at the bottom of the loop.
"subs %w2, %w2, #16 \n"
// v0 = |a - b| for each byte pair; the result still fits in 8 bits.
"uabd v0.16b, v0.16b, v1.16b \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
// Dot product of v0 with itself: each 32-bit lane of v2 accumulates the
// squares of four byte differences, with no separate widening step.
"udot v2.4s, v0.16b, v0.16b \n"
"prfm pldl1keep, [%1, 448] \n"
"b.gt 1b \n"
// Add the four lanes together and move the total to a general register.
"addv s0, v2.4s \n"
"fmov %w3, s0 \n"
// src_a, src_b and count are advanced/decremented by the asm, hence "+r".
: "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse)
:
: "memory", "cc", "v0", "v1", "v2");
return sse;
}
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#ifdef __cplusplus