From 9441ddd8835ebbc20c6b338c666c6737e0b0ed0f Mon Sep 17 00:00:00 2001
From: Cosmina Dunca
Date: Mon, 12 Feb 2024 10:08:01 +0000
Subject: [PATCH] [AArch64] Optimize ScaleARGBRowDownEven_NEON

Replace indexed LD1 instructions with LDRs to avoid loop-carried
dependencies on unused lanes between consecutive iterations of the loop.

Reduction in run times:
 Cortex-A55: -10.9%
 Cortex-A510: -70.7%
 Cortex-A76: -56.8%

Bug: libyuv:976
Change-Id: Ia767e76002c7823177e80163ebf034e023e9a6cc
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5371771
Reviewed-by: Frank Barchard
Reviewed-by: Justin Green
---
 source/scale_neon64.cc | 40 ++++++++++++++++++++++++----------------
 1 file changed, 24 insertions(+), 16 deletions(-)

diff --git a/source/scale_neon64.cc b/source/scale_neon64.cc
index e1a377bba..06a0b40b5 100644
--- a/source/scale_neon64.cc
+++ b/source/scale_neon64.cc
@@ -1199,29 +1199,37 @@ void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
       : "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
 }
 
-// Reads 4 pixels at a time.
-// Alignment requirement: src_argb 4 byte aligned.
 void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb,
                                ptrdiff_t src_stride,
                                int src_stepx,
                                uint8_t* dst_argb,
                                int dst_width) {
+  const uint8_t* src_argb1 = src_argb + src_stepx * 4;
+  const uint8_t* src_argb2 = src_argb + src_stepx * 8;
+  const uint8_t* src_argb3 = src_argb + src_stepx * 12;
+  int64_t i = 0;
   (void)src_stride;
   asm volatile(
-      "1: \n"
-      "ld1 {v0.s}[0], [%0], %3 \n"
-      "ld1 {v0.s}[1], [%0], %3 \n"
-      "ld1 {v0.s}[2], [%0], %3 \n"
-      "ld1 {v0.s}[3], [%0], %3 \n"
-      "subs %w2, %w2, #4 \n" // 4 pixels per loop.
-      "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
-      "st1 {v0.16b}, [%1], #16 \n"
-      "b.gt 1b \n"
-      : "+r"(src_argb), // %0
-        "+r"(dst_argb), // %1
-        "+r"(dst_width) // %2
-      : "r"((int64_t)(src_stepx * 4)) // %3
-      : "memory", "cc", "v0");
+      "1: \n"
+      "ldr w10, [%[src], %[i]] \n"
+      "ldr w11, [%[src1], %[i]] \n"
+      "ldr w12, [%[src2], %[i]] \n"
+      "ldr w13, [%[src3], %[i]] \n"
+      "add %[i], %[i], %[step] \n"
+      "subs %w[width], %w[width], #4 \n"
+      "prfm pldl1keep, [%[src], 448] \n"
+      "stp w10, w11, [%[dst]], #8 \n"
+      "stp w12, w13, [%[dst]], #8 \n"
+      "b.gt 1b \n"
+      : [src]"+r"(src_argb),
+        [src1]"+r"(src_argb1),
+        [src2]"+r"(src_argb2),
+        [src3]"+r"(src_argb3),
+        [dst]"+r"(dst_argb),
+        [width]"+r"(dst_width),
+        [i]"+r"(i)
+      : [step]"r"((int64_t)(src_stepx * 16))
+      : "memory", "cc", "w10", "w11", "w12", "w13");
 }
 
 // Reads 4 pixels at a time.