From 11c57f4f12f92bf67bd893b6ad59ef7b71b11fba Mon Sep 17 00:00:00 2001
From: George Steed
Date: Fri, 15 Nov 2024 14:36:12 +0000
Subject: [PATCH] [AArch64] Add Neon implementation of ScaleRowDown2_16_NEON

The auto-vectorized implementation unrolls to process 32 elements per
iteration, so unroll the new Neon implementation to match and avoid a
performance regression on little cores.

Performance relative to the auto-vectorized C implementation compiled
with LLVM 19:

  Cortex-A55: -35.8%
 Cortex-A510: -20.4%
 Cortex-A520: -22.1%
  Cortex-A76: -54.8%
 Cortex-A710: -44.5%
 Cortex-A715: -31.1%
 Cortex-A720: -31.4%
   Cortex-X1: -48.5%
   Cortex-X2: -47.8%
   Cortex-X3: -47.6%
   Cortex-X4: -51.1%
 Cortex-X925: -14.6%

Bug: b/42280942
Change-Id: Ib4e89ba230d554f2717052e934ca0e8a109ccc42
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6040153
Reviewed-by: Justin Green
Reviewed-by: Frank Barchard
---
 include/libyuv/scale_row.h |  4 ++++
 source/scale.cc            |  5 +++--
 source/scale_neon64.cc     | 45 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 52 insertions(+), 2 deletions(-)

diff --git a/include/libyuv/scale_row.h b/include/libyuv/scale_row.h
index bc72ca808..ce2d81c70 100644
--- a/include/libyuv/scale_row.h
+++ b/include/libyuv/scale_row.h
@@ -1428,6 +1428,10 @@ void ScaleRowDown2_NEON(const uint8_t* src_ptr,
                         ptrdiff_t src_stride,
                         uint8_t* dst,
                         int dst_width);
+void ScaleRowDown2_16_NEON(const uint16_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint16_t* dst,
+                           int dst_width);
 void ScaleRowDown2_SME(const uint8_t* src_ptr,
                        ptrdiff_t src_stride,
                        uint8_t* dst,
diff --git a/source/scale.cc b/source/scale.cc
index e469eaded..661224166 100644
--- a/source/scale.cc
+++ b/source/scale.cc
@@ -187,8 +187,9 @@ static void ScalePlaneDown2_16(int src_width,
   }
 #if defined(HAS_SCALEROWDOWN2_16_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 16) && filtering) {
-    ScaleRowDown2 = ScaleRowDown2Box_16_NEON;
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 16)) {
+
    ScaleRowDown2 =
+        filtering ? ScaleRowDown2Box_16_NEON : ScaleRowDown2_16_NEON;
   }
 #endif
 #if defined(HAS_SCALEROWDOWN2_16_SSE2)
diff --git a/source/scale_neon64.cc b/source/scale_neon64.cc
index c125c6c09..753438c7a 100644
--- a/source/scale_neon64.cc
+++ b/source/scale_neon64.cc
@@ -1354,6 +1354,51 @@ void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
 
 #undef SCALE_ARGB_FILTER_COLS_STEP_ADDR
 
+void ScaleRowDown2_16_NEON(const uint16_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint16_t* dst,
+                           int dst_width) {
+
+  (void)src_stride;
+  asm volatile(
+      "subs        %w[dst_width], %w[dst_width], #32 \n"
+      "b.lt        2f                                \n"
+
+      "1:                                            \n"
+      "ldp         q0, q1, [%[src_ptr]]              \n"
+      "ldp         q2, q3, [%[src_ptr], #32]         \n"
+      "ldp         q4, q5, [%[src_ptr], #64]         \n"
+      "ldp         q6, q7, [%[src_ptr], #96]         \n"
+      "add         %[src_ptr], %[src_ptr], #128      \n"
+      "uzp2        v0.8h, v0.8h, v1.8h               \n"
+      "uzp2        v1.8h, v2.8h, v3.8h               \n"
+      "uzp2        v2.8h, v4.8h, v5.8h               \n"
+      "uzp2        v3.8h, v6.8h, v7.8h               \n"
+      "subs        %w[dst_width], %w[dst_width], #32 \n"  // 32 elems per iteration.
+      "stp         q0, q1, [%[dst_ptr]]              \n"
+      "stp         q2, q3, [%[dst_ptr], #32]         \n"
+      "add         %[dst_ptr], %[dst_ptr], #64       \n"
+      "b.ge        1b                                \n"
+
+      "2:                                            \n"
+      "adds        %w[dst_width], %w[dst_width], #32 \n"
+      "b.eq        99f                               \n"
+
+      "ldp         q0, q1, [%[src_ptr]]              \n"
+      "ldp         q2, q3, [%[src_ptr], #32]         \n"
+      "add         %[src_ptr], %[src_ptr], #64       \n"
+      "uzp2        v0.8h, v0.8h, v1.8h               \n"
+      "uzp2        v1.8h, v2.8h, v3.8h               \n"
+      "stp         q0, q1, [%[dst_ptr]], #32         \n"
+
+      "99:                                           \n"
+      : [src_ptr]"+r"(src_ptr),
+        [dst_ptr]"+r"(dst),
+        [dst_width]"+r"(dst_width)
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
+}
+
 // Read 16x2 average down and write 8x1.
 void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr,
                               ptrdiff_t src_stride,