[AArch64] Avoid partial vector stores in ScaleRowDown38_NEON

The existing code performs a pair of stores since there is no AArch64 instruction in Neon to store exactly 12 bytes from a vector register. It is guaranteed to be safe to write full vectors until the last iteration of the loop, since the extra four bytes will be overwritten by subsequent iterations. This allows us to avoid duplicating the store instruction and address arithmetic.

Change in runtime observed relative to the existing Neon implementation (negative values are a reduction, i.e. faster):

  Cortex-A55:   +2.0%
  Cortex-A510: -25.3%
  Cortex-A520: -15.1%
  Cortex-A76:  -32.2%
  Cortex-A715: -19.7%
  Cortex-A720: -19.6%
  Cortex-X1:   -31.6%
  Cortex-X2:   -27.1%
  Cortex-X3:   -25.9%
  Cortex-X4:   -24.7%
  Cortex-X925: -35.8%

Bug: b/42280945
Change-Id: I222ed662f169d82f5f472bebb1bcfe6d428ccae2
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5872843
Reviewed-by: Justin Green <greenjustin@google.com>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
2026-01-01 03:12:16 +08:00 · 2024-09-18 13:05:07 +01:00 · 2024-09-18 13:05:07 +01:00 · faade2f73f
commit faade2f73f
parent 0dce974ca0
1 changed files with 25 additions and 14 deletions
--- a/source/scale_neon64.cc
+++ b/source/scale_neon64.cc
@@ -337,20 +337,31 @@ void ScaleRowDown38_NEON(const uint8_t* src_ptr,
                         uint8_t* dst_ptr,
                         int dst_width) {
  (void)src_stride;
-  asm volatile (
-      "ld1         {v3.16b}, [%3]                \n"
-      "1:                                        \n"
-      "ld1         {v0.16b,v1.16b}, [%0], #32    \n"
-      "subs        %w2, %w2, #12                 \n"
-      "tbl         v2.16b, {v0.16b,v1.16b}, v3.16b \n"
-      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
-      "st1         {v2.8b}, [%1], #8             \n"
-      "st1         {v2.s}[2], [%1], #4           \n"
-      "b.gt        1b                            \n"
-      : "+r"(src_ptr),   // %0
-        "+r"(dst_ptr),   // %1
-        "+r"(dst_width)  // %2
-      : "r"(&kShuf38)    // %3
+  asm volatile(
+      "ld1     {v3.16b}, [%[kShuf38]]             \n"
+      "subs    %w[width], %w[width], #12          \n"
+      "b.eq    2f                                 \n"
+
+      "1:                                         \n"
+      "ldp     q0, q1, [%[src_ptr]], #32          \n"
+      "subs    %w[width], %w[width], #12          \n"
+      "tbl     v2.16b, {v0.16b, v1.16b}, v3.16b   \n"
+      "prfm    pldl1keep, [%[src_ptr], 448]       \n"  // prefetch 7 lines ahead
+      "str     q2, [%[dst_ptr]]                   \n"
+      "add     %[dst_ptr], %[dst_ptr], #12        \n"
+      "b.gt    1b                                 \n"
+
+      // Store exactly 12 bytes on the final iteration to avoid writing past
+      // the end of the array.
+      "2:                                         \n"
+      "ldp     q0, q1, [%[src_ptr]]               \n"
+      "tbl     v2.16b, {v0.16b, v1.16b}, v3.16b   \n"
+      "st1     {v2.8b}, [%[dst_ptr]], #8          \n"
+      "st1     {v2.s}[2], [%[dst_ptr]]            \n"
+      : [src_ptr] "+r"(src_ptr),  // %[src_ptr]
+        [dst_ptr] "+r"(dst_ptr),  // %[dst_ptr]
+        [width] "+r"(dst_width)   // %[width]
+      : [kShuf38] "r"(&kShuf38)   // %[kShuf38]
      : "memory", "cc", "v0", "v1", "v2", "v3");
 }