From 233f859e3ccad205a0e2d585f5ac09a806ba0230 Mon Sep 17 00:00:00 2001 From: George Steed Date: Fri, 29 Nov 2024 15:27:03 +0000 Subject: [PATCH] [AArch64] Remove redundant increments in ScaleRowDown2_16_NEON These were mistakenly copied from the main loop body; however, this particular block of the code is executed at most once, so we do not need to perform the address updates. Also adjust formatting with clang-format to match other kernels. Change-Id: I8214821417d5e4f455ebe8805e1a37a9728ab8d2 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6067154 Reviewed-by: Frank Barchard --- source/scale_neon64.cc | 64 ++++++++++++++++++++---------------------- 1 file changed, 31 insertions(+), 33 deletions(-) diff --git a/source/scale_neon64.cc b/source/scale_neon64.cc index 0f9806f01..69c51b1bb 100644 --- a/source/scale_neon64.cc +++ b/source/scale_neon64.cc @@ -1358,45 +1358,43 @@ void ScaleRowDown2_16_NEON(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst, int dst_width) { - (void)src_stride; asm volatile( - "subs %w[dst_width], %w[dst_width], #32 \n" - "b.lt 2f \n" - "1: \n" - "ldp q0, q1, [%[src_ptr]] \n" - "ldp q2, q3, [%[src_ptr], #32] \n" - "ldp q4, q5, [%[src_ptr], #64] \n" - "ldp q6, q7, [%[src_ptr], #96] \n" - "add %[src_ptr], %[src_ptr], #128 \n" - "uzp2 v0.8h, v0.8h, v1.8h \n" - "uzp2 v1.8h, v2.8h, v3.8h \n" - "uzp2 v2.8h, v4.8h, v5.8h \n" - "uzp2 v3.8h, v6.8h, v7.8h \n" - "subs %w[dst_width], %w[dst_width], #32 \n" // 32 elems per iteration. 
- "stp q0, q1, [%[dst_ptr]] \n" - "stp q2, q3, [%[dst_ptr], #32] \n" - "add %[dst_ptr], %[dst_ptr], #64 \n" - "b.ge 1b \n" + "1: \n" + "ldp q0, q1, [%[src_ptr]] \n" + "ldp q2, q3, [%[src_ptr], #32] \n" + "ldp q4, q5, [%[src_ptr], #64] \n" + "ldp q6, q7, [%[src_ptr], #96] \n" + "add %[src_ptr], %[src_ptr], #128 \n" + "uzp2 v0.8h, v0.8h, v1.8h \n" + "uzp2 v1.8h, v2.8h, v3.8h \n" + "uzp2 v2.8h, v4.8h, v5.8h \n" + "uzp2 v3.8h, v6.8h, v7.8h \n" + "subs %w[dst_width], %w[dst_width], #32 \n" // 32 elems per iteration. + "stp q0, q1, [%[dst_ptr]] \n" + "stp q2, q3, [%[dst_ptr], #32] \n" + "add %[dst_ptr], %[dst_ptr], #64 \n" + "b.ge 1b \n" - "2: \n" - "adds %w[dst_width], %w[dst_width], #32 \n" - "b.eq 99f \n" + "2: \n" + "adds %w[dst_width], %w[dst_width], #32 \n" + "b.eq 99f \n" - "ldp q0, q1, [%[src_ptr]] \n" - "ldp q2, q3, [%[src_ptr], #32] \n" - "add %[src_ptr], %[src_ptr], #64 \n" - "uzp2 v0.8h, v0.8h, v1.8h \n" - "uzp2 v1.8h, v2.8h, v3.8h \n" - "stp q0, q1, [%[dst_ptr]], #32 \n" + "ldp q0, q1, [%[src_ptr]] \n" + "ldp q2, q3, [%[src_ptr], #32] \n" + "uzp2 v0.8h, v0.8h, v1.8h \n" + "uzp2 v1.8h, v2.8h, v3.8h \n" + "stp q0, q1, [%[dst_ptr]] \n" - "99: \n" - : [src_ptr]"+r"(src_ptr), - [dst_ptr]"+r"(dst), - [dst_width]"+r"(dst_width) - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); + "99: \n" + : [src_ptr] "+r"(src_ptr), // %[src_ptr] + [dst_ptr] "+r"(dst), // %[dst_ptr] + [dst_width] "+r"(dst_width) // %[dst_width] + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); } void ScaleRowDown2Linear_16_NEON(const uint16_t* src_ptr,