From 311add63c2e73cf50cc0e89cf94ebf3fc5632560 Mon Sep 17 00:00:00 2001
From: Frank Barchard
Date: Thu, 28 Sep 2017 16:58:18 -0700
Subject: [PATCH] CopyRow_NEON use ldp instead of ld1 for better performance.

Under cache thrashing circumstances, ldp/stp perform better than
ld1/st1 on QC820/QC821 CPUs. Same performance when hitting cache.

Bug: libyuv:738
Test: LibYUVPlanarTest.TestCopySamples_Opt (445 ms)
Change-Id: Ib6a0a5d5e6a1b7ef667b9bb2edb39d681cf3614c
Reviewed-on: https://chromium-review.googlesource.com/691281
Commit-Queue: Frank Barchard
Reviewed-by: Cheng Wang
---
 source/row_neon64.cc | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index 341e14553..f7cc789ce 100644
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -628,19 +628,19 @@ void MergeRGBRow_NEON(const uint8* src_r,
   );
 }

-// Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15.
+// Copy multiple of 32 bytes. The loop tests count after copying, so at least 32 bytes are always copied.
 void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
   asm volatile(
       "1: \n"
-      "ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 32
+      "ldp q0, q1, [%0], #32 \n" // load 32 bytes into q0/q1, then advance src by 32
       "subs %w2, %w2, #32 \n" // 32 processed per loop
-      "st1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 32
+      "stp q0, q1, [%1], #32 \n" // store 32 bytes from q0/q1, then advance dst by 32
       "b.gt 1b \n"
       : "+r"(src), // %0
         "+r"(dst), // %1
         "+r"(count) // %2 // Output registers
       : // Input registers
-      : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
+      : "cc", "memory", "v0", "v1" // Clobber List: q0/q1 are the 128-bit views of v0/v1
       );
 }
