From 8f039f639c44448eb16c9544b7d00dad71aa7011 Mon Sep 17 00:00:00 2001
From: George Steed
Date: Wed, 15 May 2024 21:47:21 +0100
Subject: [PATCH] [AArch64] Unroll ScaleRowDown4Box_NEON

We can use wider load/store instructions and avoid the need to waste
half of the ADDP/RSHRN vector data. The duplicated UADDLP and UADALP
instructions also provide a good improvement on little cores due to
their limited out-of-order capability.

The mask in the "any" kernel definition is already set up to handle an
unrolling of eight so no change to scale_any.cc is needed.

Reduction in runtimes observed compared to the existing Neon
implementation:

  Cortex-A55: -19.5%
  Cortex-A520: -38.3%
  Cortex-A76: -36.0%
  Cortex-A715: -18.1%
  Cortex-A720: -17.9%
  Cortex-X1: -25.4%
  Cortex-X2: -18.5%
  Cortex-X3: -8.2%
  Cortex-X4: -3.8%

Bug: b/42280945
Change-Id: Iebba5da4db5e25af4b9fa5651c7396364dedffba
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5725172
Reviewed-by: Frank Barchard
---
 source/scale_neon64.cc | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/source/scale_neon64.cc b/source/scale_neon64.cc
index 407c1e62f..d440c28c9 100644
--- a/source/scale_neon64.cc
+++ b/source/scale_neon64.cc
@@ -122,24 +122,28 @@ void ScaleRowDown4Box_NEON(const uint8_t* src_ptr,
   const uint8_t* src_ptr1 = src_ptr + src_stride;
   const uint8_t* src_ptr2 = src_ptr + src_stride * 2;
   const uint8_t* src_ptr3 = src_ptr + src_stride * 3;
-  asm volatile (
+  asm volatile(
       "1: \n"
-      "ld1 {v0.16b}, [%0], #16 \n"  // load up 16x4
-      "ld1 {v1.16b}, [%2], #16 \n"
-      "ld1 {v2.16b}, [%3], #16 \n"
-      "ld1 {v3.16b}, [%4], #16 \n"
-      "subs %w5, %w5, #4 \n"
+      "ldp q0, q4, [%0], #32 \n"  // load up 16x8
+      "ldp q1, q5, [%2], #32 \n"
+      "ldp q2, q6, [%3], #32 \n"
+      "ldp q3, q7, [%4], #32 \n"
+      "subs %w5, %w5, #8 \n"
       "uaddlp v0.8h, v0.16b \n"
+      "uaddlp v4.8h, v4.16b \n"
       "prfm pldl1keep, [%0, 448] \n"  // prefetch 7 lines ahead
       "uadalp v0.8h, v1.16b \n"
+      "uadalp v4.8h, v5.16b \n"
       "prfm pldl1keep, [%2, 448] \n"
       "uadalp v0.8h, v2.16b \n"
+      "uadalp v4.8h, v6.16b \n"
       "prfm pldl1keep, [%3, 448] \n"
       "uadalp v0.8h, v3.16b \n"
+      "uadalp v4.8h, v7.16b \n"
       "prfm pldl1keep, [%4, 448] \n"
-      "addp v0.8h, v0.8h, v0.8h \n"
+      "addp v0.8h, v0.8h, v4.8h \n"
       "rshrn v0.8b, v0.8h, #4 \n"  // divide by 16 w/rounding
-      "st1 {v0.s}[0], [%1], #4 \n"
+      "str d0, [%1], #8 \n"
       "b.gt 1b \n"
       : "+r"(src_ptr),  // %0
         "+r"(dst_ptr),  // %1
@@ -148,7 +152,7 @@ void ScaleRowDown4Box_NEON(const uint8_t* src_ptr,
         "+r"(src_ptr3),  // %4
         "+r"(dst_width)  // %5
       :
-      : "memory", "cc", "v0", "v1", "v2", "v3");
+      : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
 }
 
 // Down scale from 4 to 3 pixels. Use the neon multilane read/write