[AArch64] Unroll SumSquareError_NEON_DotProd

The kernel is only ever called with count as a multiple of 32 so it is
safe to unroll this and maintain two accumulators.

Reduction in runtime observed compared to the existing
SumSquareError_NEON_DotProd implementation:

 Cortex-A55: -28.2%
Cortex-A510: -27.6%
 Cortex-A76: -33.0%
Cortex-A720: -35.3%
  Cortex-X1: -16.9%
  Cortex-X2: -13.3%

Bug: libyuv:977
Change-Id: Iee423106c38e97cc38007d73fa80e8374dd96721
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5490048
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
George Steed 2024-04-26 09:20:12 +01:00 committed by Frank Barchard
parent f5882ed1c5
commit 6433029df7

View File

@@ -89,25 +89,30 @@ uint32_t SumSquareError_NEON(const uint8_t* src_a,
// Returns the sum of squared differences between corresponding bytes of
// src_a and src_b, computed with the AArch64 UDOT (dot product) instruction.
//
// src_a, src_b: input byte buffers, each read for `count` bytes.
// count: number of bytes to process. Must be a positive multiple of 32: each
//        iteration consumes 32 bytes from both inputs, and the remaining
//        count is only tested after a block has already been loaded.
//
// The loop is unrolled by two and keeps two independent 4x32-bit accumulators
// (v4 and v5), which are added together once the loop finishes.
uint32_t SumSquareError_NEON_DotProd(const uint8_t* src_a,
                                     const uint8_t* src_b,
                                     int count) {
  // count is guaranteed to be a multiple of 32.
  uint32_t sse;
  asm volatile(
      // Clear both accumulators.
      "movi       v4.4s, #0                      \n"
      "movi       v5.4s, #0                      \n"

      "1:                                        \n"
      // Load 32 bytes from each input and post-increment the pointers.
      "ldp        q0, q2, [%0], #32              \n"
      "ldp        q1, q3, [%1], #32              \n"
      "subs       %w2, %w2, #32                  \n"
      // Per-byte absolute differences |a - b| for both 16-byte halves.
      "uabd       v0.16b, v0.16b, v1.16b         \n"
      "uabd       v1.16b, v2.16b, v3.16b         \n"
      "prfm       pldl1keep, [%0, 448]           \n"  // prefetch 7 lines ahead
      // Dot product of each difference vector with itself accumulates the
      // squared differences into the 32-bit lanes of v4 / v5.
      "udot       v4.4s, v0.16b, v0.16b          \n"
      "udot       v5.4s, v1.16b, v1.16b          \n"
      "prfm       pldl1keep, [%1, 448]           \n"
      "b.gt       1b                             \n"

      // Combine the two accumulators, then reduce the four lanes to a scalar.
      "add        v0.4s, v4.4s, v5.4s            \n"
      "addv       s0, v0.4s                      \n"
      "fmov       %w3, s0                        \n"
      : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse)
      :
      : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5");
  return sse;
}