mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2026-02-16 23:29:52 +08:00
[AArch64] Add Neon implementation of ScaleRowDown2_16_NEON
The auto-vectorized implementation unrolls to process 32 elements per iteration, so unroll the new Neon implementation to match and avoid a performance regression on little cores.

Performance relative to the auto-vectorized C implementation compiled with LLVM 19:
Cortex-A55: -35.8%
Cortex-A510: -20.4%
Cortex-A520: -22.1%
Cortex-A76: -54.8%
Cortex-A710: -44.5%
Cortex-A715: -31.1%
Cortex-A720: -31.4%
Cortex-X1: -48.5%
Cortex-X2: -47.8%
Cortex-X3: -47.6%
Cortex-X4: -51.1%
Cortex-X925: -14.6%

Bug: b/42280942
Change-Id: Ib4e89ba230d554f2717052e934ca0e8a109ccc42
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6040153
Reviewed-by: Justin Green <greenjustin@google.com>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
parent
952d6a282f
commit
11c57f4f12
@ -1428,6 +1428,10 @@ void ScaleRowDown2_NEON(const uint8_t* src_ptr,
|
|||||||
ptrdiff_t src_stride,
|
ptrdiff_t src_stride,
|
||||||
uint8_t* dst,
|
uint8_t* dst,
|
||||||
int dst_width);
|
int dst_width);
|
||||||
|
// 16-bit counterpart of ScaleRowDown2_NEON: same parameter list, but the
// source and destination rows hold uint16_t samples instead of uint8_t.
void ScaleRowDown2_16_NEON(const uint16_t* src_ptr,
|
||||||
|
ptrdiff_t src_stride,
|
||||||
|
uint16_t* dst,
|
||||||
|
int dst_width);
|
||||||
void ScaleRowDown2_SME(const uint8_t* src_ptr,
|
void ScaleRowDown2_SME(const uint8_t* src_ptr,
|
||||||
ptrdiff_t src_stride,
|
ptrdiff_t src_stride,
|
||||||
uint8_t* dst,
|
uint8_t* dst,
|
||||||
|
|||||||
@ -187,8 +187,9 @@ static void ScalePlaneDown2_16(int src_width,
|
|||||||
}
|
}
|
||||||
|
|
||||||
#if defined(HAS_SCALEROWDOWN2_16_NEON)
|
#if defined(HAS_SCALEROWDOWN2_16_NEON)
|
||||||
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 16) && filtering) {
|
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 16)) {
|
||||||
ScaleRowDown2 = ScaleRowDown2Box_16_NEON;
|
ScaleRowDown2 =
|
||||||
|
filtering ? ScaleRowDown2Box_16_NEON : ScaleRowDown2_16_NEON;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
#if defined(HAS_SCALEROWDOWN2_16_SSE2)
|
#if defined(HAS_SCALEROWDOWN2_16_SSE2)
|
||||||
|
|||||||
@ -1354,6 +1354,51 @@ void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
|
|||||||
|
|
||||||
#undef SCALE_ARGB_FILTER_COLS_STEP_ADDR
|
#undef SCALE_ARGB_FILTER_COLS_STEP_ADDR
|
||||||
|
|
||||||
|
// Point-samples one row of 16-bit values down by 2 horizontally: every UZP2
// below keeps the odd-indexed 16-bit lanes of a register pair, so
// dst[i] = src_ptr[2 * i + 1].
//
//   src_ptr    - source row; 2 source elements are read per output element.
//   src_stride - unused (explicitly voided below); only one row is read.
//   dst        - destination row.
//   dst_width  - number of output elements to produce.
//
// The main loop writes 32 output elements per iteration; a single tail step
// writes 16 more if anything remains.
// NOTE(review): the tail always writes exactly 16 elements, so dst_width is
// expected to be a multiple of 16. The dispatch in ScalePlaneDown2_16 guards
// this with IS_ALIGNED(dst_width, 16); confirm for any other caller.
void ScaleRowDown2_16_NEON(const uint16_t* src_ptr,
|
||||||
|
ptrdiff_t src_stride,
|
||||||
|
uint16_t* dst,
|
||||||
|
int dst_width) {
|
||||||
|
|
||||||
|
(void)src_stride;
|
||||||
|
asm volatile(
|
||||||
|
// Pre-decrement the count; skip the main loop if fewer than 32 remain.
"subs %w[dst_width], %w[dst_width], #32 \n"
|
||||||
|
"b.lt 2f \n"
|
||||||
|
|
||||||
|
// Main loop: load 128 bytes (64 source elements) into q0-q7.
"1: \n"
|
||||||
|
"ldp q0, q1, [%[src_ptr]] \n"
|
||||||
|
"ldp q2, q3, [%[src_ptr], #32] \n"
|
||||||
|
"ldp q4, q5, [%[src_ptr], #64] \n"
|
||||||
|
"ldp q6, q7, [%[src_ptr], #96] \n"
|
||||||
|
"add %[src_ptr], %[src_ptr], #128 \n"
|
||||||
|
// Keep the odd-indexed 16-bit lanes of each pair; results land in v0-v3.
"uzp2 v0.8h, v0.8h, v1.8h \n"
|
||||||
|
"uzp2 v1.8h, v2.8h, v3.8h \n"
|
||||||
|
"uzp2 v2.8h, v4.8h, v5.8h \n"
|
||||||
|
"uzp2 v3.8h, v6.8h, v7.8h \n"
|
||||||
|
"subs %w[dst_width], %w[dst_width], #32 \n" // 32 elems per iteration.
|
||||||
|
// Store 64 bytes (32 output elements); stp/add leave the flags from the
// subs above intact for the b.ge below.
"stp q0, q1, [%[dst_ptr]] \n"
|
||||||
|
"stp q2, q3, [%[dst_ptr], #32] \n"
|
||||||
|
"add %[dst_ptr], %[dst_ptr], #64 \n"
|
||||||
|
"b.ge 1b \n"
|
||||||
|
|
||||||
|
// Tail: undo the pre-decrement to recover the remaining count; exit if 0.
"2: \n"
|
||||||
|
"adds %w[dst_width], %w[dst_width], #32 \n"
|
||||||
|
"b.eq 99f \n"
|
||||||
|
|
||||||
|
// Remainder step: 64 bytes in (32 source elements), 32 bytes out
// (16 output elements).
"ldp q0, q1, [%[src_ptr]] \n"
|
||||||
|
"ldp q2, q3, [%[src_ptr], #32] \n"
|
||||||
|
"add %[src_ptr], %[src_ptr], #64 \n"
|
||||||
|
"uzp2 v0.8h, v0.8h, v1.8h \n"
|
||||||
|
"uzp2 v1.8h, v2.8h, v3.8h \n"
|
||||||
|
"stp q0, q1, [%[dst_ptr]], #32 \n"
|
||||||
|
|
||||||
|
"99: \n"
|
||||||
|
// All three operands are read-write ("+r"): the asm advances both pointers
// and consumes the count.
: [src_ptr]"+r"(src_ptr),
|
||||||
|
[dst_ptr]"+r"(dst),
|
||||||
|
[dst_width]"+r"(dst_width)
|
||||||
|
:
|
||||||
|
// Clobbers: condition flags (subs/adds), memory (stores through dst_ptr),
// and the eight vector registers used as scratch.
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
|
||||||
|
}
|
||||||
|
|
||||||
// Read 16x2 average down and write 8x1.
|
// Read 16x2 average down and write 8x1.
|
||||||
void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr,
|
void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr,
|
||||||
ptrdiff_t src_stride,
|
ptrdiff_t src_stride,
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user