[AArch64] Unroll SumSquareError_NEON_DotProd

The kernel is only ever called with count as a multiple of 32, so it is safe to unroll this and maintain two accumulators.

Reduction in runtime observed compared to the existing SumSquareError_NEON_DotProd implementation:

Cortex-A55: -28.2%
Cortex-A510: -27.6%
Cortex-A76: -33.0%
Cortex-A720: -35.3%
Cortex-X1: -16.9%
Cortex-X2: -13.3%

Bug: libyuv:977
Change-Id: Iee423106c38e97cc38007d73fa80e8374dd96721
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5490048
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
2026-02-06 01:39:49 +08:00 · 2024-04-26 09:20:12 +01:00 · 2024-04-26 09:20:12 +01:00 · 6433029df7
commit 6433029df7
parent f5882ed1c5
1 changed file with 12 additions and 7 deletions
--- a/source/compare_neon64.cc
+++ b/source/compare_neon64.cc
@@ -89,25 +89,30 @@ uint32_t SumSquareError_NEON(const uint8_t* src_a,
 uint32_t SumSquareError_NEON_DotProd(const uint8_t* src_a,
                                     const uint8_t* src_b,
                                     int count) {
+  // count is guaranteed to be a multiple of 32.
  uint32_t sse;
  asm volatile(
-      "movi        v2.16b, #0                    \n"
+      "movi        v4.4s, #0                     \n"
+      "movi        v5.4s, #0                     \n"

      "1:                                        \n"
-      "ld1         {v0.16b}, [%0], #16           \n"
-      "ld1         {v1.16b}, [%1], #16           \n"
-      "subs        %w2, %w2, #16                 \n"
+      "ldp         q0, q2, [%0], #32             \n"
+      "ldp         q1, q3, [%1], #32             \n"
+      "subs        %w2, %w2, #32                 \n"
      "uabd        v0.16b, v0.16b, v1.16b        \n"
+      "uabd        v1.16b, v2.16b, v3.16b        \n"
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
-      "udot        v2.4s, v0.16b, v0.16b         \n"
+      "udot        v4.4s, v0.16b, v0.16b         \n"
+      "udot        v5.4s, v1.16b, v1.16b         \n"
      "prfm        pldl1keep, [%1, 448]          \n"
      "b.gt        1b                            \n"

-      "addv        s0, v2.4s                     \n"
+      "add         v0.4s, v4.4s, v5.4s           \n"
+      "addv        s0, v0.4s                     \n"
      "fmov        %w3, s0                       \n"
      : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse)
      :
-      : "memory", "cc", "v0", "v1", "v2");
+      : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5");
  return sse;
 }