mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-07 17:26:49 +08:00
[AArch64] Avoid partial vector stores in ScaleRowDown38_NEON
The existing code performs a pair of stores, since there is no AArch64 Neon instruction that stores exactly 12 bytes from a vector register.

It is guaranteed to be safe to write full vectors until the last iteration of the loop, since the extra four bytes will be overwritten by subsequent iterations. This allows us to avoid duplicating the store instruction and address arithmetic.

Change in runtime observed relative to the existing Neon implementation (negative values are reductions):

Cortex-A55: +2.0%
Cortex-A510: -25.3%
Cortex-A520: -15.1%
Cortex-A76: -32.2%
Cortex-A715: -19.7%
Cortex-A720: -19.6%
Cortex-X1: -31.6%
Cortex-X2: -27.1%
Cortex-X3: -25.9%
Cortex-X4: -24.7%
Cortex-X925: -35.8%

Bug: b/42280945
Change-Id: I222ed662f169d82f5f472bebb1bcfe6d428ccae2
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5872843
Reviewed-by: Justin Green <greenjustin@google.com>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
parent
0dce974ca0
commit
faade2f73f
@ -337,20 +337,31 @@ void ScaleRowDown38_NEON(const uint8_t* src_ptr,
|
||||
uint8_t* dst_ptr,
|
||||
int dst_width) {
|
||||
(void)src_stride;
|
||||
asm volatile (
|
||||
"ld1 {v3.16b}, [%3] \n"
|
||||
"1: \n"
|
||||
"ld1 {v0.16b,v1.16b}, [%0], #32 \n"
|
||||
"subs %w2, %w2, #12 \n"
|
||||
"tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n"
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"st1 {v2.8b}, [%1], #8 \n"
|
||||
"st1 {v2.s}[2], [%1], #4 \n"
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_ptr), // %0
|
||||
"+r"(dst_ptr), // %1
|
||||
"+r"(dst_width) // %2
|
||||
: "r"(&kShuf38) // %3
|
||||
asm volatile(
|
||||
"ld1 {v3.16b}, [%[kShuf38]] \n"
|
||||
"subs %w[width], %w[width], #12 \n"
|
||||
"b.eq 2f \n"
|
||||
|
||||
"1: \n"
|
||||
"ldp q0, q1, [%[src_ptr]], #32 \n"
|
||||
"subs %w[width], %w[width], #12 \n"
|
||||
"tbl v2.16b, {v0.16b, v1.16b}, v3.16b \n"
|
||||
"prfm pldl1keep, [%[src_ptr], 448] \n" // prefetch 7 lines ahead
|
||||
"str q2, [%[dst_ptr]] \n"
|
||||
"add %[dst_ptr], %[dst_ptr], #12 \n"
|
||||
"b.gt 1b \n"
|
||||
|
||||
// Store exactly 12 bytes on the final iteration to avoid writing past
|
||||
// the end of the array.
|
||||
"2: \n"
|
||||
"ldp q0, q1, [%[src_ptr]] \n"
|
||||
"tbl v2.16b, {v0.16b, v1.16b}, v3.16b \n"
|
||||
"st1 {v2.8b}, [%[dst_ptr]], #8 \n"
|
||||
"st1 {v2.s}[2], [%[dst_ptr]] \n"
|
||||
: [src_ptr] "+r"(src_ptr), // %[src_ptr]
|
||||
[dst_ptr] "+r"(dst_ptr), // %[dst_ptr]
|
||||
[width] "+r"(dst_width) // %[width]
|
||||
: [kShuf38] "r"(&kShuf38) // %[kShuf38]
|
||||
: "memory", "cc", "v0", "v1", "v2", "v3");
|
||||
}
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user