[AArch64] Remove redundant increments in ScaleRowDown2_16_NEON

These pointer increments were mistakenly copied from the main loop body.
However, this particular block of the code is executed at most once, so
we do not need to perform the address updates.

Also adjust formatting with clang-format to match other kernels.

Change-Id: I8214821417d5e4f455ebe8805e1a37a9728ab8d2
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6067154
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
George Steed 2024-11-29 15:27:03 +00:00 committed by Frank Barchard
parent 9144583f22
commit 233f859e3c

View File

@ -1358,45 +1358,43 @@ void ScaleRowDown2_16_NEON(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst,
int dst_width) {
(void)src_stride;
asm volatile(
"subs %w[dst_width], %w[dst_width], #32 \n"
"b.lt 2f \n"
"subs %w[dst_width], %w[dst_width], #32 \n"
"b.lt 2f \n"
"1: \n"
"ldp q0, q1, [%[src_ptr]] \n"
"ldp q2, q3, [%[src_ptr], #32] \n"
"ldp q4, q5, [%[src_ptr], #64] \n"
"ldp q6, q7, [%[src_ptr], #96] \n"
"add %[src_ptr], %[src_ptr], #128 \n"
"uzp2 v0.8h, v0.8h, v1.8h \n"
"uzp2 v1.8h, v2.8h, v3.8h \n"
"uzp2 v2.8h, v4.8h, v5.8h \n"
"uzp2 v3.8h, v6.8h, v7.8h \n"
"subs %w[dst_width], %w[dst_width], #32 \n" // 32 elems per iteration.
"stp q0, q1, [%[dst_ptr]] \n"
"stp q2, q3, [%[dst_ptr], #32] \n"
"add %[dst_ptr], %[dst_ptr], #64 \n"
"b.ge 1b \n"
"1: \n"
"ldp q0, q1, [%[src_ptr]] \n"
"ldp q2, q3, [%[src_ptr], #32] \n"
"ldp q4, q5, [%[src_ptr], #64] \n"
"ldp q6, q7, [%[src_ptr], #96] \n"
"add %[src_ptr], %[src_ptr], #128 \n"
"uzp2 v0.8h, v0.8h, v1.8h \n"
"uzp2 v1.8h, v2.8h, v3.8h \n"
"uzp2 v2.8h, v4.8h, v5.8h \n"
"uzp2 v3.8h, v6.8h, v7.8h \n"
"subs %w[dst_width], %w[dst_width], #32 \n" // 32 elems per iteration.
"stp q0, q1, [%[dst_ptr]] \n"
"stp q2, q3, [%[dst_ptr], #32] \n"
"add %[dst_ptr], %[dst_ptr], #64 \n"
"b.ge 1b \n"
"2: \n"
"adds %w[dst_width], %w[dst_width], #32 \n"
"b.eq 99f \n"
"2: \n"
"adds %w[dst_width], %w[dst_width], #32 \n"
"b.eq 99f \n"
"ldp q0, q1, [%[src_ptr]] \n"
"ldp q2, q3, [%[src_ptr], #32] \n"
"add %[src_ptr], %[src_ptr], #64 \n"
"uzp2 v0.8h, v0.8h, v1.8h \n"
"uzp2 v1.8h, v2.8h, v3.8h \n"
"stp q0, q1, [%[dst_ptr]], #32 \n"
"ldp q0, q1, [%[src_ptr]] \n"
"ldp q2, q3, [%[src_ptr], #32] \n"
"uzp2 v0.8h, v0.8h, v1.8h \n"
"uzp2 v1.8h, v2.8h, v3.8h \n"
"stp q0, q1, [%[dst_ptr]] \n"
"99: \n"
: [src_ptr]"+r"(src_ptr),
[dst_ptr]"+r"(dst),
[dst_width]"+r"(dst_width)
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
"99: \n"
: [src_ptr] "+r"(src_ptr), // %[src_ptr]
[dst_ptr] "+r"(dst), // %[dst_ptr]
[dst_width] "+r"(dst_width) // %[dst_width]
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
}
void ScaleRowDown2Linear_16_NEON(const uint16_t* src_ptr,