CopyRow_NEON: use ldp/stp instead of ld1/st1 for better performance.

When the cache is thrashing, ldp/stp perform better than ld1/st1 on QC820/QC821 CPUs. Performance is the same when accesses hit the cache. Bug: libyuv:738 Test: LibYUVPlanarTest.TestCopySamples_Opt (445 ms) Change-Id: Ib6a0a5d5e6a1b7ef667b9bb2edb39d681cf3614c Reviewed-on: https://chromium-review.googlesource.com/691281 Commit-Queue: Frank Barchard <fbarchard@google.com> Reviewed-by: Cheng Wang <wangcheng@google.com>
2026-01-01 03:12:16 +08:00 · 2017-09-28 16:58:18 -07:00 · 2017-09-28 16:58:18 -07:00 · 311add63c2
commit 311add63c2
parent ccd6d6fc57
1 changed files with 4 additions and 4 deletions
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -628,19 +628,19 @@ void MergeRGBRow_NEON(const uint8* src_r,
      );
 }

-// Copy multiple of 32.  vld4.8  allow unaligned and is fastest on a15.
+// Copy multiple of 32.
 void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
  asm volatile(
      "1:                                        \n"
-      "ld1        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32       \n"  // load 32
+      "ldp        q0, q1, [%0], #32              \n"
      "subs       %w2, %w2, #32                  \n"  // 32 processed per loop
-      "st1        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32       \n"  // store 32
+      "stp        q0, q1, [%1], #32              \n"
      "b.gt       1b                             \n"
      : "+r"(src),                              // %0
        "+r"(dst),                              // %1
        "+r"(count)                             // %2  // Output registers
      :                                         // Input registers
-      : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
+      : "cc", "memory", "v0", "v1"  // Clobber List
      );
 }