From f00c43f4d6e812b581f64edc53a655f8e2413938 Mon Sep 17 00:00:00 2001
From: George Steed
Date: Tue, 7 May 2024 13:41:47 +0100
Subject: [PATCH] [AArch64] Unroll HalfFloat{,1}Row_NEON

The existing C implementation compiled with a recent LLVM is
auto-vectorised and unrolled to process four vectors per loop iteration,
making the Neon implementation slower than the C implementation on
little cores.

To avoid this, unroll the Neon implementation to also process four
vectors per iteration.

Reduction in cycle counts observed compared to the existing Neon
implementation:

            | HalfFloat1Row_NEON | HalfFloatRow_NEON
Cortex-A510 |             -37.1% |            -40.8%
Cortex-A520 |             -32.3% |            -37.4%
Cortex-A720 |               0.0% |            -10.6%
  Cortex-X2 |               0.0% |             -7.8%
  Cortex-X4 |              +0.3% |             -6.9%

Bug: b/42280945
Change-Id: I12b474c970fc4355d75ed924c4ca6169badda2bc
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5872805
Reviewed-by: Frank Barchard
Reviewed-by: Justin Green
---
 source/planar_functions.cc |  6 ++---
 source/row_any.cc          | 11 ++++++++
 source/row_neon64.cc       | 54 ++++++++++++++++++++++++--------------
 3 files changed, 48 insertions(+), 23 deletions(-)

diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index 7c50599ad..1fc625330 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -5184,9 +5184,9 @@ int HalfFloatPlane(const uint16_t* src_y,
 #if defined(HAS_HALFFLOATROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     HalfFloatRow =
-        (scale == 1.0f) ? HalfFloat1Row_Any_NEON : HalfFloatRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      HalfFloatRow = (scale == 1.0f) ? HalfFloat1Row_NEON : HalfFloatRow_NEON;
+        scale == 1.0f ? HalfFloat1Row_Any_NEON : HalfFloatRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      HalfFloatRow = scale == 1.0f ? HalfFloat1Row_NEON : HalfFloatRow_NEON;
     }
   }
 #endif
diff --git a/source/row_any.cc b/source/row_any.cc
index d2f8a5419..e15b05aec 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -1810,6 +1810,16 @@ ANY11P16(HalfFloat1Row_Any_F16C,
          15)
 #endif
 #ifdef HAS_HALFFLOATROW_NEON
+#ifdef __aarch64__
+ANY11P16(HalfFloatRow_Any_NEON, HalfFloatRow_NEON, uint16_t, uint16_t, 2, 2, 15)
+ANY11P16(HalfFloat1Row_Any_NEON,
+         HalfFloat1Row_NEON,
+         uint16_t,
+         uint16_t,
+         2,
+         2,
+         15)
+#else
 ANY11P16(HalfFloatRow_Any_NEON, HalfFloatRow_NEON, uint16_t, uint16_t, 2, 2, 7)
 ANY11P16(HalfFloat1Row_Any_NEON,
          HalfFloat1Row_NEON,
@@ -1819,6 +1829,7 @@ ANY11P16(HalfFloat1Row_Any_NEON,
          2,
          7)
 #endif
+#endif
 #ifdef HAS_HALFFLOATROW_MSA
 ANY11P16(HalfFloatRow_Any_MSA, HalfFloatRow_MSA, uint16_t, uint16_t, 2, 2, 31)
 #endif
diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index 7ad54b430..4b1ed2c0c 100644
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -4669,50 +4669,64 @@ void HalfFloat1Row_NEON(const uint16_t* src,
                         uint16_t* dst,
                         float /*unused*/,
                         int width) {
-  asm volatile (
+  asm volatile(
       "1: \n"
-      "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts
-      "subs %w2, %w2, #8 \n" // 8 pixels per loop
-      "uxtl v2.4s, v1.4h \n" // 8 int's
+      "ldp q0, q1, [%0], #32 \n" // load 16 shorts
+      "subs %w2, %w2, #16 \n" // 16 pixels per loop
+      "uxtl v2.4s, v0.4h \n"
+      "uxtl v4.4s, v1.4h \n"
+      "uxtl2 v3.4s, v0.8h \n"
+      "uxtl2 v5.4s, v1.8h \n"
       "prfm pldl1keep, [%0, 448] \n"
-      "uxtl2 v3.4s, v1.8h \n"
-      "scvtf v2.4s, v2.4s \n" // 8 floats
+      "scvtf v2.4s, v2.4s \n"
+      "scvtf v4.4s, v4.4s \n"
       "scvtf v3.4s, v3.4s \n"
-      "fcvtn v1.4h, v2.4s \n" // 8 half floats
-      "fcvtn2 v1.8h, v3.4s \n"
-      "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts
+      "scvtf v5.4s, v5.4s \n"
+      "fcvtn v0.4h, v2.4s \n"
+      "fcvtn v1.4h, v4.4s \n"
+      "fcvtn2 v0.8h, v3.4s \n"
+      "fcvtn2 v1.8h, v5.4s \n"
+      "stp q0, q1, [%1], #32 \n" // store 16 shorts
       "b.gt 1b \n"
       : "+r"(src), // %0
         "+r"(dst), // %1
         "+r"(width) // %2
       :
-      : "cc", "memory", "v1", "v2", "v3");
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5");
 }
 
 void HalfFloatRow_NEON(const uint16_t* src,
                        uint16_t* dst,
                        float scale,
                        int width) {
-  asm volatile (
+  asm volatile(
       "1: \n"
-      "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts
-      "subs %w2, %w2, #8 \n" // 8 pixels per loop
-      "uxtl v2.4s, v1.4h \n" // 8 int's
+      "ldp q0, q1, [%0], #32 \n" // load 16 shorts
+      "subs %w2, %w2, #16 \n" // 16 pixels per loop
+      "uxtl v2.4s, v0.4h \n"
+      "uxtl v4.4s, v1.4h \n"
+      "uxtl2 v3.4s, v0.8h \n"
+      "uxtl2 v5.4s, v1.8h \n"
       "prfm pldl1keep, [%0, 448] \n"
-      "uxtl2 v3.4s, v1.8h \n"
-      "scvtf v2.4s, v2.4s \n" // 8 floats
+      "scvtf v2.4s, v2.4s \n"
+      "scvtf v4.4s, v4.4s \n"
       "scvtf v3.4s, v3.4s \n"
+      "scvtf v5.4s, v5.4s \n"
       "fmul v2.4s, v2.4s, %3.s[0] \n" // adjust exponent
+      "fmul v4.4s, v4.4s, %3.s[0] \n"
       "fmul v3.4s, v3.4s, %3.s[0] \n"
-      "uqshrn v1.4h, v2.4s, #13 \n" // isolate halffloat
-      "uqshrn2 v1.8h, v3.4s, #13 \n"
-      "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts
+      "fmul v5.4s, v5.4s, %3.s[0] \n"
+      "uqshrn v0.4h, v2.4s, #13 \n" // isolate halffloat
+      "uqshrn v1.4h, v4.4s, #13 \n" // isolate halffloat
+      "uqshrn2 v0.8h, v3.4s, #13 \n"
+      "uqshrn2 v1.8h, v5.4s, #13 \n"
+      "stp q0, q1, [%1], #32 \n" // store 16 shorts
       "b.gt 1b \n"
       : "+r"(src), // %0
         "+r"(dst), // %1
         "+r"(width) // %2
       : "w"(scale * 1.9259299444e-34f) // %3
-      : "cc", "memory", "v1", "v2", "v3");
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5");
 }
 
 void ByteToFloatRow_NEON(const uint8_t* src,