mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-07 17:26:49 +08:00
[AArch64] Unroll SumSquareError_NEON_DotProd
The kernel is only ever called with count as a multiple of 32, so it is
safe to unroll the loop and maintain two accumulators.

Reduction in runtime observed compared to the existing
SumSquareError_NEON_DotProd implementation:

Cortex-A55: -28.2%
Cortex-A510: -27.6%
Cortex-A76: -33.0%
Cortex-A720: -35.3%
Cortex-X1: -16.9%
Cortex-X2: -13.3%

Bug: libyuv:977
Change-Id: Iee423106c38e97cc38007d73fa80e8374dd96721
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5490048
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
parent
f5882ed1c5
commit
6433029df7
@ -89,25 +89,30 @@ uint32_t SumSquareError_NEON(const uint8_t* src_a,
|
||||
uint32_t SumSquareError_NEON_DotProd(const uint8_t* src_a,
|
||||
const uint8_t* src_b,
|
||||
int count) {
|
||||
// count is guaranteed to be a multiple of 32.
|
||||
uint32_t sse;
|
||||
asm volatile(
|
||||
"movi v2.16b, #0 \n"
|
||||
"movi v4.4s, #0 \n"
|
||||
"movi v5.4s, #0 \n"
|
||||
|
||||
"1: \n"
|
||||
"ld1 {v0.16b}, [%0], #16 \n"
|
||||
"ld1 {v1.16b}, [%1], #16 \n"
|
||||
"subs %w2, %w2, #16 \n"
|
||||
"ldp q0, q2, [%0], #32 \n"
|
||||
"ldp q1, q3, [%1], #32 \n"
|
||||
"subs %w2, %w2, #32 \n"
|
||||
"uabd v0.16b, v0.16b, v1.16b \n"
|
||||
"uabd v1.16b, v2.16b, v3.16b \n"
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"udot v2.4s, v0.16b, v0.16b \n"
|
||||
"udot v4.4s, v0.16b, v0.16b \n"
|
||||
"udot v5.4s, v1.16b, v1.16b \n"
|
||||
"prfm pldl1keep, [%1, 448] \n"
|
||||
"b.gt 1b \n"
|
||||
|
||||
"addv s0, v2.4s \n"
|
||||
"add v0.4s, v4.4s, v5.4s \n"
|
||||
"addv s0, v0.4s \n"
|
||||
"fmov %w3, s0 \n"
|
||||
: "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse)
|
||||
:
|
||||
: "memory", "cc", "v0", "v1", "v2");
|
||||
: "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5");
|
||||
return sse;
|
||||
}
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user