From 53b65220da5cdc3dce0e088cd67e08bdf0a76dd6 Mon Sep 17 00:00:00 2001 From: George Steed Date: Tue, 16 Apr 2024 14:08:00 +0100 Subject: [PATCH] [AArch64] Add Neon dot-product implementation of SumSquareError The Neon dot-product instructions perform two widening steps rather than one, saving us the need to widen the absolute difference to 16-bits before accumulating. Additionally, the dot-product instructions tend to have better performance characteristics than traditional widening multiply instructions like SMLAL used in the existing SumSquareError_NEON code. Observed reduction in runtimes compared to the existing Neon kernel: Cortex-A55: -9.1% Cortex-A510: -36.7% Cortex-A76: -37.6% Cortex-A720: -48.8% Cortex-X1: -56.1% Cortex-X2: -42.6% Bug: libyuv:977 Change-Id: Ie20c69040cc47a803d8e95620d31e0bf1e1dac12 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5463945 Reviewed-by: Frank Barchard --- include/libyuv/compare_row.h | 8 ++++++++ source/compare.cc | 5 +++++ source/compare_neon64.cc | 25 +++++++++++++++++++++++++ 3 files changed, 38 insertions(+) diff --git a/include/libyuv/compare_row.h b/include/libyuv/compare_row.h index 8293c9195..e78d742be 100644 --- a/include/libyuv/compare_row.h +++ b/include/libyuv/compare_row.h @@ -82,6 +82,11 @@ extern "C" { #define HAS_HAMMINGDISTANCE_NEON #endif +// The following are available for AArch64 Neon: +#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) +#define HAS_SUMSQUAREERROR_NEON_DOTPROD +#endif + #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) #define HAS_HAMMINGDISTANCE_MSA #define HAS_SUMSQUAREERROR_MSA @@ -117,6 +122,9 @@ uint32_t SumSquareError_AVX2(const uint8_t* src_a, uint32_t SumSquareError_NEON(const uint8_t* src_a, const uint8_t* src_b, int count); +uint32_t SumSquareError_NEON_DotProd(const uint8_t* src_a, + const uint8_t* src_b, + int count); uint32_t SumSquareError_MSA(const uint8_t* src_a, const uint8_t* src_b, int count); diff --git a/source/compare.cc 
b/source/compare.cc index 50a736bdd..e128dfabc 100644 --- a/source/compare.cc +++ b/source/compare.cc @@ -194,6 +194,11 @@ uint64_t ComputeSumSquareError(const uint8_t* src_a, SumSquareError = SumSquareError_NEON; } #endif +#if defined(HAS_SUMSQUAREERROR_NEON_DOTPROD) + if (TestCpuFlag(kCpuHasNeonDotProd)) { /* assigned after the SumSquareError_NEON selection just above, so this kernel takes precedence when the dot-product flag is set */ + SumSquareError = SumSquareError_NEON_DotProd; + } +#endif #if defined(HAS_SUMSQUAREERROR_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { // Note only used for multiples of 16 so count is not checked. diff --git a/source/compare_neon64.cc b/source/compare_neon64.cc index 1bc8d18c9..34fa3e494 100644 --- a/source/compare_neon64.cc +++ b/source/compare_neon64.cc @@ -86,6 +86,31 @@ uint32_t SumSquareError_NEON(const uint8_t* src_a, return sse; } +uint32_t SumSquareError_NEON_DotProd(const uint8_t* src_a, + const uint8_t* src_b, + int count) { /* Returns the sum of squared byte differences between src_a and src_b, consuming 16 bytes from each per loop iteration. The loop body runs once before count is tested, so this presumably expects count to be a positive multiple of 16 -- TODO confirm against callers. */ + uint32_t sse; /* receives the 32-bit total from the final fmov */ + asm volatile( + "movi v2.16b, #0 \n" /* clear v2, used below as four 32-bit accumulators */ + + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" /* load 16 bytes of src_a, post-increment pointer */ + "ld1 {v1.16b}, [%1], #16 \n" /* load 16 bytes of src_b, post-increment pointer */ + "subs %w2, %w2, #16 \n" /* count -= 16; sets the flags tested by b.gt */ + "uabd v0.16b, v0.16b, v1.16b \n" /* v0 = per-byte absolute difference */ + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "udot v2.4s, v0.16b, v0.16b \n" /* each 32-bit lane of v2 += sum of squares of its four bytes of v0 */ + "prfm pldl1keep, [%1, 448] \n" + "b.gt 1b \n" /* loop while count > 0 */ + + "addv s0, v2.4s \n" /* add the four accumulator lanes together */ + "fmov %w3, s0 \n" /* move the scalar total into sse */ + : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse) + : + : "memory", "cc", "v0", "v1", "v2"); + return sse; +} + #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) #ifdef __cplusplus