From be5de19db32f6191f2533d0fa69166e89ea60915 Mon Sep 17 00:00:00 2001
From: George Steed
Date: Wed, 15 May 2024 21:20:11 +0100
Subject: [PATCH] [AArch64] Unroll ScaleRowUp2_Linear_NEON

On little cores with limited out-of-order capability this gives a good
improvement. Observed change in runtime compared to the existing Neon
implementation (negative is a reduction in runtime):

  Cortex-A55:  -21.3%
  Cortex-A520: -33.6%
  Cortex-A76:   +1.1%
  Cortex-A715:  =0.0%
  Cortex-A720:  =0.0%
  Cortex-X1:   +10.4% (!)
  Cortex-X2:    -5.3%
  Cortex-X3:    -4.3%
  Cortex-X4:    -9.9%

Bug: b/42280945
Change-Id: I45b3510f13c05b19d61052e2f8e447199dbd0551
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5725169
Reviewed-by: Frank Barchard
---
 source/scale_any.cc    |  8 ++++++++
 source/scale_neon64.cc | 21 ++++++++++++++-------
 2 files changed, 22 insertions(+), 7 deletions(-)

diff --git a/source/scale_any.cc b/source/scale_any.cc
index f6576874a..9f64f8043 100644
--- a/source/scale_any.cc
+++ b/source/scale_any.cc
@@ -749,12 +749,20 @@ SUH2LANY(ScaleRowUp2_Linear_16_Any_AVX2,
 #endif
 
 #ifdef HAS_SCALEROWUP2_LINEAR_NEON
+#ifdef __aarch64__
+SUH2LANY(ScaleRowUp2_Linear_Any_NEON,
+         ScaleRowUp2_Linear_NEON,
+         ScaleRowUp2_Linear_C,
+         31,
+         uint8_t)
+#else
 SUH2LANY(ScaleRowUp2_Linear_Any_NEON,
          ScaleRowUp2_Linear_NEON,
          ScaleRowUp2_Linear_C,
          15,
          uint8_t)
 #endif
+#endif
 
 #ifdef HAS_SCALEROWUP2_LINEAR_12_NEON
 SUH2LANY(ScaleRowUp2_Linear_12_Any_NEON,
diff --git a/source/scale_neon64.cc b/source/scale_neon64.cc
index a7f73352c..63fa1e41e 100644
--- a/source/scale_neon64.cc
+++ b/source/scale_neon64.cc
@@ -539,32 +539,39 @@ void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr,
                              uint8_t* dst_ptr,
                              int dst_width) {
   const uint8_t* src_temp = src_ptr + 1;
-  asm volatile (
-      "movi        v31.8b, #3                    \n"
+  asm volatile(
+      "movi        v31.16b, #3                   \n"
 
       "1:                                        \n"
-      "ldr         d0, [%0], #8                  \n"  // 01234567
-      "ldr         d1, [%1], #8                  \n"  // 12345678
+      "ldr         q0, [%0], #16                 \n"  // 0123456789abcdef
+      "ldr         q1, [%1], #16                 \n"  // 123456789abcdefg
       "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
 
       "ushll       v2.8h, v0.8b, #0              \n"  // 01234567 (16b)
       "ushll       v3.8h, v1.8b, #0              \n"  // 12345678 (16b)
+      "ushll2      v4.8h, v0.16b, #0             \n"  // 89abcdef (16b)
+      "ushll2      v5.8h, v1.16b, #0             \n"  // 9abcdefg (16b)
       "umlal       v2.8h, v1.8b, v31.8b          \n"  // 3*near+far (odd)
       "umlal       v3.8h, v0.8b, v31.8b          \n"  // 3*near+far (even)
+      "umlal2      v4.8h, v1.16b, v31.16b        \n"  // 3*near+far (odd)
+      "umlal2      v5.8h, v0.16b, v31.16b        \n"  // 3*near+far (even)
 
       "rshrn       v2.8b, v2.8h, #2              \n"  // 3/4*near+1/4*far (odd)
       "rshrn       v1.8b, v3.8h, #2              \n"  // 3/4*near+1/4*far (even)
+      "rshrn2      v2.16b, v4.8h, #2             \n"  // 3/4*near+1/4*far (odd)
+      "rshrn2      v1.16b, v5.8h, #2             \n"  // 3/4*near+1/4*far (even)
 
-      "st2         {v1.8b, v2.8b}, [%2], #16     \n"  // store
-      "subs        %w3, %w3, #16                 \n"  // 8 sample -> 16 sample
+      "st2         {v1.16b, v2.16b}, [%2], #32   \n"  // store
+      "subs        %w3, %w3, #32                 \n"  // 16 sample -> 32 sample
       "b.gt        1b                            \n"
       : "+r"(src_ptr),   // %0
         "+r"(src_temp),  // %1
         "+r"(dst_ptr),   // %2
         "+r"(dst_width)  // %3
       :
-      : "memory", "cc", "v0", "v1", "v2", "v3", "v31"  // Clobber List
+      : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5",
+        "v31"  // Clobber List
   );
 }
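
Note: a scalar sketch of the filter this kernel vectorizes may help when
reading the asm. Each pair of output samples blends a "near" and a "far"
source sample as (3*near + far + 2) >> 2; the "+ 2" then ">> 2" matches
the rounding narrowing shifts (rshrn/rshrn2 #2) above. The function below
is an illustrative reference only, not ScaleRowUp2_Linear_C verbatim, and
its name is hypothetical; it assumes an even dst_width and that src_ptr
has dst_width/2 + 1 readable bytes, mirroring the src_temp = src_ptr + 1
access pattern.

  #include <stdint.h>

  // Hypothetical scalar reference for ScaleRowUp2_Linear_NEON.
  // Even outputs lean on src[x], odd outputs lean on src[x + 1].
  static void ScaleRowUp2_Linear_Ref(const uint8_t* src_ptr,
                                     uint8_t* dst_ptr,
                                     int dst_width) {
    int src_width = dst_width / 2;
    for (int x = 0; x < src_width; ++x) {
      // 3/4*near + 1/4*far, with round-to-nearest before the shift.
      dst_ptr[2 * x + 0] =
          (uint8_t)((3 * src_ptr[x] + src_ptr[x + 1] + 2) >> 2);
      dst_ptr[2 * x + 1] =
          (uint8_t)((src_ptr[x] + 3 * src_ptr[x + 1] + 2) >> 2);
    }
  }

The scale_any.cc change follows from the unroll: the NEON body now reads
16 source bytes and writes 32 destination samples per iteration, so the
SUH2LANY wrapper's mask moves from 15 to 31, leaving a tail of up to 31
samples to ScaleRowUp2_Linear_C.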