mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-06 08:46:47 +08:00
[AArch64] Add Neon implementation of ScaleRowDown2Linear_16
Reduction in runtime observed relative to the auto-vectorized C implementation compiled with LLVM 19:

Cortex-A55: -13.7%
Cortex-A510: -49.0%
Cortex-A520: -32.0%
Cortex-A76: -34.3%
Cortex-A710: -56.7%
Cortex-A715: -45.4%
Cortex-A720: -44.7%
Cortex-X1: -70.6%
Cortex-X2: -67.9%
Cortex-X3: -72.2%
Cortex-X4: -40.0%
Cortex-X925: -24.1%

Bug: b/42280942
Change-Id: I977899a2239e752400c9901f4d8482a76841269a
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6040154
Reviewed-by: Justin Green <greenjustin@google.com>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
parent
11c57f4f12
commit
9a9752134e
@ -1440,6 +1440,10 @@ void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint8_t* dst,
|
||||
int dst_width);
|
||||
// Neon (AArch64) row kernel: halves the width of one row of 16-bit samples,
// producing each output as the rounded average of two adjacent source samples.
// src_stride is accepted for signature compatibility but is not used by the
// implementation.
void ScaleRowDown2Linear_16_NEON(const uint16_t* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint16_t* dst,
|
||||
int dst_width);
|
||||
void ScaleRowDown2Linear_SME(const uint8_t* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint8_t* dst,
|
||||
|
||||
@ -188,8 +188,9 @@ static void ScalePlaneDown2_16(int src_width,
|
||||
|
||||
#if defined(HAS_SCALEROWDOWN2_16_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 16)) {
|
||||
ScaleRowDown2 =
|
||||
filtering ? ScaleRowDown2Box_16_NEON : ScaleRowDown2_16_NEON;
|
||||
ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_16_NEON
|
||||
: filtering == kFilterLinear ? ScaleRowDown2Linear_16_NEON
|
||||
: ScaleRowDown2Box_16_NEON;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SCALEROWDOWN2_16_SSE2)
|
||||
|
||||
@ -1399,6 +1399,28 @@ void ScaleRowDown2_16_NEON(const uint16_t* src_ptr,
|
||||
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
|
||||
}
|
||||
|
||||
// Horizontally downsamples one row of 16-bit samples by 2 using a linear
// (2x1) filter: each output is the rounded average of two adjacent source
// samples, i.e. dst[i] = (src[2 * i] + src[2 * i + 1] + 1) >> 1.
//
// src_ptr:    source row; 32 samples (64 bytes) are read per loop iteration.
// src_stride: unused, only a single source row is read.
// dst:        destination row; 16 samples (32 bytes) are written per loop
//             iteration.
// dst_width:  number of output samples. The loop body runs at least once and
//             always produces 16 outputs per pass, so this is expected to be a
//             positive multiple of 16. NOTE(review): the visible dispatch in
//             ScalePlaneDown2_16 gates on IS_ALIGNED(dst_width, 16); confirm
//             any other callers do the same.
void ScaleRowDown2Linear_16_NEON(const uint16_t* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint16_t* dst,
|
||||
int dst_width) {
|
||||
// Silence the unused-parameter warning; the linear filter needs one row only.
(void)src_stride;
|
||||
asm volatile(
|
||||
// Loop head: one pass consumes 32 source samples and emits 16 outputs.
"1: \n"
|
||||
// De-interleaving load of 16 samples: even-indexed samples land in v0,
// odd-indexed samples in v1; src_ptr is post-incremented by 32 bytes.
"ld2 {v0.8h, v1.8h}, [%[src_ptr]], #32 \n"
|
||||
// Same again for the next 16 samples, into v2 (even) and v3 (odd).
"ld2 {v2.8h, v3.8h}, [%[src_ptr]], #32 \n"
|
||||
// Count down the 16 outputs of this pass and set flags for the b.gt below.
"subs %w[dst_width], %w[dst_width], #16 \n"
|
||||
// Unsigned rounding halving add: per-lane (even + odd + 1) >> 1, computed
// without overflowing the 16-bit lanes. First 8 outputs.
"urhadd v0.8h, v0.8h, v1.8h \n"
|
||||
// Second 8 outputs.
"urhadd v1.8h, v2.8h, v3.8h \n"
|
||||
// Prefetch-for-load hint 448 bytes beyond the already advanced src_ptr.
"prfm pldl1keep, [%[src_ptr], 448] \n"
|
||||
// Store the 16 results (two 128-bit registers) and advance dst by 32 bytes.
"stp q0, q1, [%[dst_ptr]], #32 \n"
|
||||
// Repeat while the remaining output count is still greater than zero.
"b.gt 1b \n"
|
||||
// Outputs: all three operands are read and modified by the asm ("+r").
: [src_ptr] "+r"(src_ptr), // %[src_ptr]
|
||||
[dst_ptr] "+r"(dst), // %[dst_ptr]
|
||||
[dst_width] "+r"(dst_width) // %[dst_width]
|
||||
// No input-only operands.
:
|
||||
// Clobbers: memory is read and written through the pointers, subs changes
// the condition flags, and v0-v3 are the vector registers written above.
: "memory", "cc", "v0", "v1", "v2", "v3");
|
||||
}
|
||||
|
||||
// Read 16x2 average down and write 8x1.
|
||||
void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user