mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-08 01:36:47 +08:00
CopyRow_NEON: use ldp/stp instead of ld1/st1 for better performance.
Under cache-thrashing conditions, ldp/stp perform better than ld1/st1 on QC820/QC821 CPUs; performance is the same when the data hits the cache.
Bug: libyuv:738
Test: LibYUVPlanarTest.TestCopySamples_Opt (445 ms)
Change-Id: Ib6a0a5d5e6a1b7ef667b9bb2edb39d681cf3614c
Reviewed-on: https://chromium-review.googlesource.com/691281
Commit-Queue: Frank Barchard <fbarchard@google.com>
Reviewed-by: Cheng Wang <wangcheng@google.com>
This commit is contained in:
parent
ccd6d6fc57
commit
311add63c2
@ -628,19 +628,19 @@ void MergeRGBRow_NEON(const uint8* src_r,
|
||||
);
|
||||
}
|
||||
|
||||
// Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15.
|
||||
// Copy multiple of 32.
|
||||
void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 32
|
||||
"ldp q0, q1, [%0], #32 \n"
|
||||
"subs %w2, %w2, #32 \n" // 32 processed per loop
|
||||
"st1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 32
|
||||
"stp q0, q1, [%1], #32 \n"
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src), // %0
|
||||
"+r"(dst), // %1
|
||||
"+r"(count) // %2 // Output registers
|
||||
: // Input registers
|
||||
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
|
||||
: "cc", "memory", "v0", "v1" // Clobber List
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user