[AArch64] Optimize ScaleARGBRowDownEven_NEON

Replace indexed LD1 instructions with LDRs to avoid loop-carried
dependencies on unused lanes between consecutive iterations of the loop.

Reduction in run times:

 Cortex-A55: -10.9%
Cortex-A510: -70.7%
 Cortex-A76: -56.8%

Bug: libyuv:976
Change-Id: Ia767e76002c7823177e80163ebf034e023e9a6cc
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5371771
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Justin Green <greenjustin@google.com>
This commit is contained in:
Cosmina Dunca 2024-02-12 10:08:01 +00:00 committed by Frank Barchard
parent a6135cfe0f
commit 9441ddd883

View File

@@ -1199,29 +1199,37 @@ void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
: "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"); : "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
} }
// Reads 4 pixels at a time.
// Alignment requirement: src_argb 4 byte aligned.
void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb, void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb,
ptrdiff_t src_stride, ptrdiff_t src_stride,
int src_stepx, int src_stepx,
uint8_t* dst_argb, uint8_t* dst_argb,
int dst_width) { int dst_width) {
const uint8_t* src_argb1 = src_argb + src_stepx * 4;
const uint8_t* src_argb2 = src_argb + src_stepx * 8;
const uint8_t* src_argb3 = src_argb + src_stepx * 12;
int64_t i = 0;
(void)src_stride; (void)src_stride;
asm volatile( asm volatile(
"1: \n" "1: \n"
"ld1 {v0.s}[0], [%0], %3 \n" "ldr w10, [%[src], %[i]] \n"
"ld1 {v0.s}[1], [%0], %3 \n" "ldr w11, [%[src1], %[i]] \n"
"ld1 {v0.s}[2], [%0], %3 \n" "ldr w12, [%[src2], %[i]] \n"
"ld1 {v0.s}[3], [%0], %3 \n" "ldr w13, [%[src3], %[i]] \n"
"subs %w2, %w2, #4 \n" // 4 pixels per loop. "add %[i], %[i], %[step] \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead "subs %w[width], %w[width], #4 \n"
"st1 {v0.16b}, [%1], #16 \n" "prfm pldl1keep, [%[src], 448] \n"
"b.gt 1b \n" "stp w10, w11, [%[dst]], #8 \n"
: "+r"(src_argb), // %0 "stp w12, w13, [%[dst]], #8 \n"
"+r"(dst_argb), // %1 "b.gt 1b \n"
"+r"(dst_width) // %2 : [src]"+r"(src_argb),
: "r"((int64_t)(src_stepx * 4)) // %3 [src1]"+r"(src_argb1),
: "memory", "cc", "v0"); [src2]"+r"(src_argb2),
[src3]"+r"(src_argb3),
[dst]"+r"(dst_argb),
[width]"+r"(dst_width),
[i]"+r"(i)
: [step]"r"((int64_t)(src_stepx * 16))
: "memory", "cc", "w10", "w11", "w12", "w13");
} }
// Reads 4 pixels at a time. // Reads 4 pixels at a time.