mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-06 08:46:47 +08:00
[AArch64] Avoid LD4/ST2 in ScaleARGBRowDown2_NEON
Use separate permute instructions to avoid using LD4/ST2, as these instructions are known to be slow on some micro-architectures.

Observed reduction in runtimes compared to the existing Neon code:
  Cortex-A55:  -12.4%
  Cortex-A510: -44.8%
  Cortex-A520: -31.1%
  Cortex-A76:  -55.3%
  Cortex-A715: -63.7%
  Cortex-A720: -62.3%
  Cortex-X1:   -79.0%
  Cortex-X2:   -78.9%
  Cortex-X3:   -79.6%
  Cortex-X4:   -59.8%

Co-authored-by: Cosmina Dunca <cosmina.dunca@arm.com>
Bug: libyuv:976
Change-Id: I33cf27ae5e16c1ce62f1f343043e6bd9fca92558
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5790971
Reviewed-by: Justin Green <greenjustin@google.com>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
parent
4620f17058
commit
00886670bb
@ -1160,21 +1160,20 @@ void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr,
|
||||
uint8_t* dst,
|
||||
int dst_width) {
|
||||
(void)src_stride;
|
||||
asm volatile (
|
||||
"1: \n"
|
||||
// load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3
|
||||
"ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n"
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop
|
||||
"mov v2.16b, v3.16b \n"
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"st2 {v1.4s,v2.4s}, [%1], #32 \n" // store 8 odd pixels
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_ptr), // %0
|
||||
"+r"(dst), // %1
|
||||
"+r"(dst_width) // %2
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[src]], #64 \n"
|
||||
"subs %w[width], %w[width], #8 \n"
|
||||
"prfm pldl1keep, [%[src], 448] \n"
|
||||
"uzp2 v0.4s, v0.4s, v1.4s \n"
|
||||
"uzp2 v1.4s, v2.4s, v3.4s \n"
|
||||
"st1 {v0.4s, v1.4s}, [%[dst]], #32 \n"
|
||||
"b.gt 1b \n"
|
||||
: [src] "+r"(src_ptr), // %[src]
|
||||
[dst] "+r"(dst), // %[dst]
|
||||
[width] "+r"(dst_width) // %[width]
|
||||
:
|
||||
: "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List
|
||||
);
|
||||
: "memory", "cc", "v0", "v1", "v2", "v3");
|
||||
}
|
||||
|
||||
void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb,
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user