mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2026-01-01 03:12:16 +08:00
[AArch64] Add Neon dot-product implementation of SumSquareError
The Neon dot-product instructions perform two widening steps rather than one, saving us the need to widen the absolute difference to 16-bits before accumulating. Additionally, the dot-product instructions tend to have better performance characteristics than traditional widening multiply instructions like SMLAL used in the existing SumSquareError_NEON code. Observed reduction in runtimes compared to the existing Neon kernel: Cortex-A55: -9.1% Cortex-A510: -36.7% Cortex-A76: -37.6% Cortex-A720: -48.8% Cortex-X1: -56.1% Cortex-X2: -42.6% Bug: libyuv:977 Change-Id: Ie20c69040cc47a803d8e95620d31e0bf1e1dac12 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5463945 Reviewed-by: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
parent
9e223c3fc0
commit
53b65220da
@ -82,6 +82,11 @@ extern "C" {
|
||||
#define HAS_HAMMINGDISTANCE_NEON
|
||||
#endif
|
||||
|
||||
// The following are available for AArch64 Neon:
|
||||
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
|
||||
#define HAS_SUMSQUAREERROR_NEON_DOTPROD
|
||||
#endif
|
||||
|
||||
#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
|
||||
#define HAS_HAMMINGDISTANCE_MSA
|
||||
#define HAS_SUMSQUAREERROR_MSA
|
||||
@ -117,6 +122,9 @@ uint32_t SumSquareError_AVX2(const uint8_t* src_a,
|
||||
uint32_t SumSquareError_NEON(const uint8_t* src_a,
|
||||
const uint8_t* src_b,
|
||||
int count);
|
||||
uint32_t SumSquareError_NEON_DotProd(const uint8_t* src_a,
|
||||
const uint8_t* src_b,
|
||||
int count);
|
||||
uint32_t SumSquareError_MSA(const uint8_t* src_a,
|
||||
const uint8_t* src_b,
|
||||
int count);
|
||||
|
||||
@ -194,6 +194,11 @@ uint64_t ComputeSumSquareError(const uint8_t* src_a,
|
||||
SumSquareError = SumSquareError_NEON;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SUMSQUAREERROR_NEON_DOTPROD)
|
||||
if (TestCpuFlag(kCpuHasNeonDotProd)) {
|
||||
SumSquareError = SumSquareError_NEON_DotProd;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SUMSQUAREERROR_SSE2)
|
||||
if (TestCpuFlag(kCpuHasSSE2)) {
|
||||
// Note only used for multiples of 16 so count is not checked.
|
||||
|
||||
@ -86,6 +86,31 @@ uint32_t SumSquareError_NEON(const uint8_t* src_a,
|
||||
return sse;
|
||||
}
|
||||
|
||||
uint32_t SumSquareError_NEON_DotProd(const uint8_t* src_a,
|
||||
const uint8_t* src_b,
|
||||
int count) {
|
||||
uint32_t sse;
|
||||
asm volatile(
|
||||
"movi v2.16b, #0 \n"
|
||||
|
||||
"1: \n"
|
||||
"ld1 {v0.16b}, [%0], #16 \n"
|
||||
"ld1 {v1.16b}, [%1], #16 \n"
|
||||
"subs %w2, %w2, #16 \n"
|
||||
"uabd v0.16b, v0.16b, v1.16b \n"
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"udot v2.4s, v0.16b, v0.16b \n"
|
||||
"prfm pldl1keep, [%1, 448] \n"
|
||||
"b.gt 1b \n"
|
||||
|
||||
"addv s0, v2.4s \n"
|
||||
"fmov %w3, s0 \n"
|
||||
: "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse)
|
||||
:
|
||||
: "memory", "cc", "v0", "v1", "v2");
|
||||
return sse;
|
||||
}
|
||||
|
||||
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user