Remove ARM NaCl macros from source

NaCl has been disabled for a while; the code will
still build for it, but only the C versions are
used, so the ARM assembly macros are dead. This
change removes the MEMACCESS() macros from the
NEON and NEON64 source.
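
For reference, the removed macro (shown in the first hunk below) expanded
to an address-masking NaCl sandbox sequence on ARM and to nothing
everywhere else, so removing it changes no generated code in non-NaCl
builds:

    // The ARM definition being deleted; outside NaCl it was a no-op.
    #if defined(__arm__) || defined(__aarch64__)
    #undef MEMACCESS
    #if defined(__native_client__)
    // Align, then clear the top bits so the access stays in the sandbox.
    #define MEMACCESS(base) ".p2align 3\nbic %" #base ", #0xc0000000\n"
    #else
    #define MEMACCESS(base)
    #endif
    #endif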

BUG=libyuv:702
TEST=try bots build for arm.
R=kjellander@chromium.org

Change-Id: Id581a5c8ff71e18cc69595e7fee9337f97c44a19
Reviewed-on: https://chromium-review.googlesource.com/528332
Reviewed-by: Cheng Wang <wangcheng@google.com>
Commit-Queue: Frank Barchard <fbarchard@google.com>
Author: Frank Barchard <fbarchard@google.com>
Date:   2017-06-09 14:49:28 -07:00 (committed by Commit Bot)
Parent: 5f94a33e0c
Commit: 6c94ad13b5
10 changed files with 3826 additions and 4834 deletions


@@ -625,15 +625,6 @@ extern const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants); // BT.709
 #op " $" #sel ",%%" #reg "," #offset "(%" #base ",%" #index "," #scale ")\n"
 #endif // defined(__native_client__) && defined(__x86_64__)
-#if defined(__arm__) || defined(__aarch64__)
-#undef MEMACCESS
-#if defined(__native_client__)
-#define MEMACCESS(base) ".p2align 3\nbic %" #base ", #0xc0000000\n"
-#else
-#define MEMACCESS(base)
-#endif
-#endif
 // Intel Code Analizer markers. Insert IACA_START IACA_END around code to be
 // measured and then run with iaca -64 libyuv_unittest.
 // IACA_ASM_START amd IACA_ASM_END are equivalents that can be used within
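
Because the non-NaCl expansion is empty, deleting MEMACCESS() at a call
site leaves the concatenated assembly string unchanged. A minimal sketch
of the pattern seen throughout the hunks below (copy16 is a hypothetical
function, not from the patch):

    #define MEMACCESS(base)  // non-NaCl expansion: empty

    // With MEMACCESS empty, this compiles to exactly the same code with
    // or without the macro lines.
    void copy16(unsigned char* src, unsigned char* dst) {
    #if defined(__arm__)
      asm volatile(MEMACCESS(0)
                   "vld1.8 {q0}, [%0]! \n"
                   MEMACCESS(1)
                   "vst1.8 {q0}, [%1]! \n"
                   : "+r"(src), "+r"(dst)
                   :
                   : "memory", "q0");
    #endif
    }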


@@ -64,9 +64,7 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
 "vmov.u8 q11, #0 \n"
 "1: \n"
-MEMACCESS(0)
 "vld1.8 {q0}, [%0]! \n"
-MEMACCESS(1)
 "vld1.8 {q1}, [%1]! \n"
 "subs %2, %2, #16 \n"
 "vsubl.u8 q2, d0, d2 \n"


@@ -59,9 +59,7 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
 "eor v19.16b, v19.16b, v19.16b \n"
 "1: \n"
-MEMACCESS(0)
 "ld1 {v0.16b}, [%0], #16 \n"
-MEMACCESS(1)
 "ld1 {v1.16b}, [%1], #16 \n"
 "subs %w2, %w2, #16 \n"
 "usubl v2.8h, v0.8b, v1.8b \n"


@@ -40,21 +40,13 @@ void TransposeWx8_NEON(const uint8* src,
 "1: \n"
 "mov %0, %1 \n"
-MEMACCESS(0)
 "vld1.8 {d0}, [%0], %2 \n"
-MEMACCESS(0)
 "vld1.8 {d1}, [%0], %2 \n"
-MEMACCESS(0)
 "vld1.8 {d2}, [%0], %2 \n"
-MEMACCESS(0)
 "vld1.8 {d3}, [%0], %2 \n"
-MEMACCESS(0)
 "vld1.8 {d4}, [%0], %2 \n"
-MEMACCESS(0)
 "vld1.8 {d5}, [%0], %2 \n"
-MEMACCESS(0)
 "vld1.8 {d6}, [%0], %2 \n"
-MEMACCESS(0)
 "vld1.8 {d7}, [%0] \n"
 "vtrn.8 d1, d0 \n"
@@ -79,21 +71,13 @@ void TransposeWx8_NEON(const uint8* src,
 "mov %0, %3 \n"
-MEMACCESS(0)
 "vst1.8 {d1}, [%0], %4 \n"
-MEMACCESS(0)
 "vst1.8 {d0}, [%0], %4 \n"
-MEMACCESS(0)
 "vst1.8 {d3}, [%0], %4 \n"
-MEMACCESS(0)
 "vst1.8 {d2}, [%0], %4 \n"
-MEMACCESS(0)
 "vst1.8 {d5}, [%0], %4 \n"
-MEMACCESS(0)
 "vst1.8 {d4}, [%0], %4 \n"
-MEMACCESS(0)
 "vst1.8 {d7}, [%0], %4 \n"
-MEMACCESS(0)
 "vst1.8 {d6}, [%0] \n"
 "add %1, #8 \n" // src += 8
@@ -115,26 +99,17 @@ void TransposeWx8_NEON(const uint8* src,
 // 4x8 block
 "mov %0, %1 \n"
-MEMACCESS(0)
 "vld1.32 {d0[0]}, [%0], %2 \n"
-MEMACCESS(0)
 "vld1.32 {d0[1]}, [%0], %2 \n"
-MEMACCESS(0)
 "vld1.32 {d1[0]}, [%0], %2 \n"
-MEMACCESS(0)
 "vld1.32 {d1[1]}, [%0], %2 \n"
-MEMACCESS(0)
 "vld1.32 {d2[0]}, [%0], %2 \n"
-MEMACCESS(0)
 "vld1.32 {d2[1]}, [%0], %2 \n"
-MEMACCESS(0)
 "vld1.32 {d3[0]}, [%0], %2 \n"
-MEMACCESS(0)
 "vld1.32 {d3[1]}, [%0] \n"
 "mov %0, %3 \n"
-MEMACCESS(6)
 "vld1.8 {q3}, [%6] \n"
 "vtbl.8 d4, {d0, d1}, d6 \n"
@@ -144,23 +119,15 @@ void TransposeWx8_NEON(const uint8* src,
 // TODO(frkoenig): Rework shuffle above to
 // write out with 4 instead of 8 writes.
-MEMACCESS(0)
 "vst1.32 {d4[0]}, [%0], %4 \n"
-MEMACCESS(0)
 "vst1.32 {d4[1]}, [%0], %4 \n"
-MEMACCESS(0)
 "vst1.32 {d5[0]}, [%0], %4 \n"
-MEMACCESS(0)
 "vst1.32 {d5[1]}, [%0] \n"
 "add %0, %3, #4 \n"
-MEMACCESS(0)
 "vst1.32 {d0[0]}, [%0], %4 \n"
-MEMACCESS(0)
 "vst1.32 {d0[1]}, [%0], %4 \n"
-MEMACCESS(0)
 "vst1.32 {d1[0]}, [%0], %4 \n"
-MEMACCESS(0)
 "vst1.32 {d1[1]}, [%0] \n"
 "add %1, #4 \n" // src += 4
@@ -176,30 +143,20 @@ void TransposeWx8_NEON(const uint8* src,
 // 2x8 block
 "2: \n"
 "mov %0, %1 \n"
-MEMACCESS(0)
 "vld1.16 {d0[0]}, [%0], %2 \n"
-MEMACCESS(0)
 "vld1.16 {d1[0]}, [%0], %2 \n"
-MEMACCESS(0)
 "vld1.16 {d0[1]}, [%0], %2 \n"
-MEMACCESS(0)
 "vld1.16 {d1[1]}, [%0], %2 \n"
-MEMACCESS(0)
 "vld1.16 {d0[2]}, [%0], %2 \n"
-MEMACCESS(0)
 "vld1.16 {d1[2]}, [%0], %2 \n"
-MEMACCESS(0)
 "vld1.16 {d0[3]}, [%0], %2 \n"
-MEMACCESS(0)
 "vld1.16 {d1[3]}, [%0] \n"
 "vtrn.8 d0, d1 \n"
 "mov %0, %3 \n"
-MEMACCESS(0)
 "vst1.64 {d0}, [%0], %4 \n"
-MEMACCESS(0)
 "vst1.64 {d1}, [%0] \n"
 "add %1, #2 \n" // src += 2
@@ -209,24 +166,15 @@ void TransposeWx8_NEON(const uint8* src,
 // 1x8 block
 "3: \n"
-MEMACCESS(1)
 "vld1.8 {d0[0]}, [%1], %2 \n"
-MEMACCESS(1)
 "vld1.8 {d0[1]}, [%1], %2 \n"
-MEMACCESS(1)
 "vld1.8 {d0[2]}, [%1], %2 \n"
-MEMACCESS(1)
 "vld1.8 {d0[3]}, [%1], %2 \n"
-MEMACCESS(1)
 "vld1.8 {d0[4]}, [%1], %2 \n"
-MEMACCESS(1)
 "vld1.8 {d0[5]}, [%1], %2 \n"
-MEMACCESS(1)
 "vld1.8 {d0[6]}, [%1], %2 \n"
-MEMACCESS(1)
 "vld1.8 {d0[7]}, [%1] \n"
-MEMACCESS(3)
 "vst1.64 {d0}, [%3] \n"
 "4: \n"
@@ -238,8 +186,7 @@ void TransposeWx8_NEON(const uint8* src,
 "+r"(dst_stride), // %4
 "+r"(width) // %5
 : "r"(&kVTbl4x4Transpose) // %6
-: "memory", "cc", "q0", "q1", "q2", "q3"
-);
+: "memory", "cc", "q0", "q1", "q2", "q3");
 }
 static uvec8 kVTbl4x4TransposeDi = {0, 8, 1, 9, 2, 10, 3, 11,
@@ -263,21 +210,13 @@ void TransposeUVWx8_NEON(const uint8* src,
 "1: \n"
 "mov %0, %1 \n"
-MEMACCESS(0)
 "vld2.8 {d0, d1}, [%0], %2 \n"
-MEMACCESS(0)
 "vld2.8 {d2, d3}, [%0], %2 \n"
-MEMACCESS(0)
 "vld2.8 {d4, d5}, [%0], %2 \n"
-MEMACCESS(0)
 "vld2.8 {d6, d7}, [%0], %2 \n"
-MEMACCESS(0)
 "vld2.8 {d16, d17}, [%0], %2 \n"
-MEMACCESS(0)
 "vld2.8 {d18, d19}, [%0], %2 \n"
-MEMACCESS(0)
 "vld2.8 {d20, d21}, [%0], %2 \n"
-MEMACCESS(0)
 "vld2.8 {d22, d23}, [%0] \n"
 "vtrn.8 q1, q0 \n"
@@ -306,40 +245,24 @@ void TransposeUVWx8_NEON(const uint8* src,
 "mov %0, %3 \n"
-MEMACCESS(0)
 "vst1.8 {d2}, [%0], %4 \n"
-MEMACCESS(0)
 "vst1.8 {d0}, [%0], %4 \n"
-MEMACCESS(0)
 "vst1.8 {d6}, [%0], %4 \n"
-MEMACCESS(0)
 "vst1.8 {d4}, [%0], %4 \n"
-MEMACCESS(0)
 "vst1.8 {d18}, [%0], %4 \n"
-MEMACCESS(0)
 "vst1.8 {d16}, [%0], %4 \n"
-MEMACCESS(0)
 "vst1.8 {d22}, [%0], %4 \n"
-MEMACCESS(0)
 "vst1.8 {d20}, [%0] \n"
 "mov %0, %5 \n"
-MEMACCESS(0)
 "vst1.8 {d3}, [%0], %6 \n"
-MEMACCESS(0)
 "vst1.8 {d1}, [%0], %6 \n"
-MEMACCESS(0)
 "vst1.8 {d7}, [%0], %6 \n"
-MEMACCESS(0)
 "vst1.8 {d5}, [%0], %6 \n"
-MEMACCESS(0)
 "vst1.8 {d19}, [%0], %6 \n"
-MEMACCESS(0)
 "vst1.8 {d17}, [%0], %6 \n"
-MEMACCESS(0)
 "vst1.8 {d23}, [%0], %6 \n"
-MEMACCESS(0)
 "vst1.8 {d21}, [%0] \n"
 "add %1, #8*2 \n" // src += 8*2
@@ -363,24 +286,15 @@ void TransposeUVWx8_NEON(const uint8* src,
 // TODO(frkoenig): Clean this up
 // 4x8 block
 "mov %0, %1 \n"
-MEMACCESS(0)
 "vld1.64 {d0}, [%0], %2 \n"
-MEMACCESS(0)
 "vld1.64 {d1}, [%0], %2 \n"
-MEMACCESS(0)
 "vld1.64 {d2}, [%0], %2 \n"
-MEMACCESS(0)
 "vld1.64 {d3}, [%0], %2 \n"
-MEMACCESS(0)
 "vld1.64 {d4}, [%0], %2 \n"
-MEMACCESS(0)
 "vld1.64 {d5}, [%0], %2 \n"
-MEMACCESS(0)
 "vld1.64 {d6}, [%0], %2 \n"
-MEMACCESS(0)
 "vld1.64 {d7}, [%0] \n"
-MEMACCESS(8)
 "vld1.8 {q15}, [%8] \n"
 "vtrn.8 q0, q1 \n"
@@ -397,49 +311,35 @@ void TransposeUVWx8_NEON(const uint8* src,
 "mov %0, %3 \n"
-MEMACCESS(0)
 "vst1.32 {d16[0]}, [%0], %4 \n"
-MEMACCESS(0)
 "vst1.32 {d16[1]}, [%0], %4 \n"
-MEMACCESS(0)
 "vst1.32 {d17[0]}, [%0], %4 \n"
-MEMACCESS(0)
 "vst1.32 {d17[1]}, [%0], %4 \n"
 "add %0, %3, #4 \n"
-MEMACCESS(0)
 "vst1.32 {d20[0]}, [%0], %4 \n"
-MEMACCESS(0)
 "vst1.32 {d20[1]}, [%0], %4 \n"
-MEMACCESS(0)
 "vst1.32 {d21[0]}, [%0], %4 \n"
-MEMACCESS(0)
 "vst1.32 {d21[1]}, [%0] \n"
 "mov %0, %5 \n"
-MEMACCESS(0)
 "vst1.32 {d18[0]}, [%0], %6 \n"
-MEMACCESS(0)
 "vst1.32 {d18[1]}, [%0], %6 \n"
-MEMACCESS(0)
 "vst1.32 {d19[0]}, [%0], %6 \n"
-MEMACCESS(0)
 "vst1.32 {d19[1]}, [%0], %6 \n"
 "add %0, %5, #4 \n"
-MEMACCESS(0)
 "vst1.32 {d22[0]}, [%0], %6 \n"
-MEMACCESS(0)
 "vst1.32 {d22[1]}, [%0], %6 \n"
-MEMACCESS(0)
 "vst1.32 {d23[0]}, [%0], %6 \n"
-MEMACCESS(0)
 "vst1.32 {d23[1]}, [%0] \n"
 "add %1, #4*2 \n" // src += 4 * 2
-"add %3, %3, %4, lsl #2 \n" // dst_a += 4 * dst_stride_a
-"add %5, %5, %6, lsl #2 \n" // dst_b += 4 * dst_stride_b
+"add %3, %3, %4, lsl #2 \n" // dst_a += 4 *
+                            // dst_stride_a
+"add %5, %5, %6, lsl #2 \n" // dst_b += 4 *
+                            // dst_stride_b
 "subs %7, #4 \n" // w -= 4
 "beq 4f \n"
@@ -451,21 +351,13 @@ void TransposeUVWx8_NEON(const uint8* src,
 // 2x8 block
 "2: \n"
 "mov %0, %1 \n"
-MEMACCESS(0)
 "vld2.16 {d0[0], d2[0]}, [%0], %2 \n"
-MEMACCESS(0)
 "vld2.16 {d1[0], d3[0]}, [%0], %2 \n"
-MEMACCESS(0)
 "vld2.16 {d0[1], d2[1]}, [%0], %2 \n"
-MEMACCESS(0)
 "vld2.16 {d1[1], d3[1]}, [%0], %2 \n"
-MEMACCESS(0)
 "vld2.16 {d0[2], d2[2]}, [%0], %2 \n"
-MEMACCESS(0)
 "vld2.16 {d1[2], d3[2]}, [%0], %2 \n"
-MEMACCESS(0)
 "vld2.16 {d0[3], d2[3]}, [%0], %2 \n"
-MEMACCESS(0)
 "vld2.16 {d1[3], d3[3]}, [%0] \n"
 "vtrn.8 d0, d1 \n"
@@ -473,46 +365,34 @@ void TransposeUVWx8_NEON(const uint8* src,
 "mov %0, %3 \n"
-MEMACCESS(0)
 "vst1.64 {d0}, [%0], %4 \n"
-MEMACCESS(0)
 "vst1.64 {d2}, [%0] \n"
 "mov %0, %5 \n"
-MEMACCESS(0)
 "vst1.64 {d1}, [%0], %6 \n"
-MEMACCESS(0)
 "vst1.64 {d3}, [%0] \n"
 "add %1, #2*2 \n" // src += 2 * 2
-"add %3, %3, %4, lsl #1 \n" // dst_a += 2 * dst_stride_a
-"add %5, %5, %6, lsl #1 \n" // dst_b += 2 * dst_stride_b
+"add %3, %3, %4, lsl #1 \n" // dst_a += 2 *
+                            // dst_stride_a
+"add %5, %5, %6, lsl #1 \n" // dst_b += 2 *
+                            // dst_stride_b
 "subs %7, #2 \n" // w -= 2
 "beq 4f \n"
 // 1x8 block
 "3: \n"
-MEMACCESS(1)
 "vld2.8 {d0[0], d1[0]}, [%1], %2 \n"
-MEMACCESS(1)
 "vld2.8 {d0[1], d1[1]}, [%1], %2 \n"
-MEMACCESS(1)
 "vld2.8 {d0[2], d1[2]}, [%1], %2 \n"
-MEMACCESS(1)
 "vld2.8 {d0[3], d1[3]}, [%1], %2 \n"
-MEMACCESS(1)
 "vld2.8 {d0[4], d1[4]}, [%1], %2 \n"
-MEMACCESS(1)
 "vld2.8 {d0[5], d1[5]}, [%1], %2 \n"
-MEMACCESS(1)
 "vld2.8 {d0[6], d1[6]}, [%1], %2 \n"
-MEMACCESS(1)
 "vld2.8 {d0[7], d1[7]}, [%1] \n"
-MEMACCESS(3)
 "vst1.64 {d0}, [%3] \n"
-MEMACCESS(5)
 "vst1.64 {d1}, [%5] \n"
 "4: \n"
@@ -526,9 +406,7 @@ void TransposeUVWx8_NEON(const uint8* src,
 "+r"(dst_stride_b), // %6
 "+r"(width) // %7
 : "r"(&kVTbl4x4TransposeDi) // %8
-: "memory", "cc",
-"q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
-);
+: "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
 }
 #endif // defined(__ARM_NEON__) && !defined(__aarch64__)
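
The TransposeWx8_NEON changes above are likewise deletion-only. For
orientation, the function transposes 8 source rows into 8-wide
destination columns, falling back to 4-, 2-, and 1-wide blocks at the
right edge; a plain C sketch of the core operation (hypothetical
reference, not from the patch):

    #include <stdint.h>

    // Write 8 rows of width w from src (stride src_stride) into w rows
    // of width 8 in dst (stride dst_stride): dst[i][j] = src[j][i].
    void TransposeWx8_C(const uint8_t* src, int src_stride,
                        uint8_t* dst, int dst_stride, int w) {
      for (int i = 0; i < w; ++i) {
        for (int j = 0; j < 8; ++j) {
          dst[i * dst_stride + j] = src[j * src_stride + i];
        }
      }
    }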


@@ -40,21 +40,13 @@ void TransposeWx8_NEON(const uint8* src,
 "1: \n"
 "mov %0, %1 \n"
-MEMACCESS(0)
 "ld1 {v0.8b}, [%0], %5 \n"
-MEMACCESS(0)
 "ld1 {v1.8b}, [%0], %5 \n"
-MEMACCESS(0)
 "ld1 {v2.8b}, [%0], %5 \n"
-MEMACCESS(0)
 "ld1 {v3.8b}, [%0], %5 \n"
-MEMACCESS(0)
 "ld1 {v4.8b}, [%0], %5 \n"
-MEMACCESS(0)
 "ld1 {v5.8b}, [%0], %5 \n"
-MEMACCESS(0)
 "ld1 {v6.8b}, [%0], %5 \n"
-MEMACCESS(0)
 "ld1 {v7.8b}, [%0] \n"
 "trn2 v16.8b, v0.8b, v1.8b \n"
@@ -86,21 +78,13 @@ void TransposeWx8_NEON(const uint8* src,
 "mov %0, %2 \n"
-MEMACCESS(0)
 "st1 {v17.8b}, [%0], %6 \n"
-MEMACCESS(0)
 "st1 {v16.8b}, [%0], %6 \n"
-MEMACCESS(0)
 "st1 {v19.8b}, [%0], %6 \n"
-MEMACCESS(0)
 "st1 {v18.8b}, [%0], %6 \n"
-MEMACCESS(0)
 "st1 {v21.8b}, [%0], %6 \n"
-MEMACCESS(0)
 "st1 {v20.8b}, [%0], %6 \n"
-MEMACCESS(0)
 "st1 {v23.8b}, [%0], %6 \n"
-MEMACCESS(0)
 "st1 {v22.8b}, [%0] \n"
 "add %1, %1, #8 \n" // src += 8
@@ -122,26 +106,17 @@ void TransposeWx8_NEON(const uint8* src,
 // 4x8 block
 "mov %0, %1 \n"
-MEMACCESS(0)
 "ld1 {v0.s}[0], [%0], %5 \n"
-MEMACCESS(0)
 "ld1 {v0.s}[1], [%0], %5 \n"
-MEMACCESS(0)
 "ld1 {v0.s}[2], [%0], %5 \n"
-MEMACCESS(0)
 "ld1 {v0.s}[3], [%0], %5 \n"
-MEMACCESS(0)
 "ld1 {v1.s}[0], [%0], %5 \n"
-MEMACCESS(0)
 "ld1 {v1.s}[1], [%0], %5 \n"
-MEMACCESS(0)
 "ld1 {v1.s}[2], [%0], %5 \n"
-MEMACCESS(0)
 "ld1 {v1.s}[3], [%0] \n"
 "mov %0, %2 \n"
-MEMACCESS(4)
 "ld1 {v2.16b}, [%4] \n"
 "tbl v3.16b, {v0.16b}, v2.16b \n"
@@ -149,23 +124,15 @@ void TransposeWx8_NEON(const uint8* src,
 // TODO(frkoenig): Rework shuffle above to
 // write out with 4 instead of 8 writes.
-MEMACCESS(0)
 "st1 {v3.s}[0], [%0], %6 \n"
-MEMACCESS(0)
 "st1 {v3.s}[1], [%0], %6 \n"
-MEMACCESS(0)
 "st1 {v3.s}[2], [%0], %6 \n"
-MEMACCESS(0)
 "st1 {v3.s}[3], [%0] \n"
 "add %0, %2, #4 \n"
-MEMACCESS(0)
 "st1 {v0.s}[0], [%0], %6 \n"
-MEMACCESS(0)
 "st1 {v0.s}[1], [%0], %6 \n"
-MEMACCESS(0)
 "st1 {v0.s}[2], [%0], %6 \n"
-MEMACCESS(0)
 "st1 {v0.s}[3], [%0] \n"
 "add %1, %1, #4 \n" // src += 4
@@ -181,21 +148,13 @@ void TransposeWx8_NEON(const uint8* src,
 // 2x8 block
 "2: \n"
 "mov %0, %1 \n"
-MEMACCESS(0)
 "ld1 {v0.h}[0], [%0], %5 \n"
-MEMACCESS(0)
 "ld1 {v1.h}[0], [%0], %5 \n"
-MEMACCESS(0)
 "ld1 {v0.h}[1], [%0], %5 \n"
-MEMACCESS(0)
 "ld1 {v1.h}[1], [%0], %5 \n"
-MEMACCESS(0)
 "ld1 {v0.h}[2], [%0], %5 \n"
-MEMACCESS(0)
 "ld1 {v1.h}[2], [%0], %5 \n"
-MEMACCESS(0)
 "ld1 {v0.h}[3], [%0], %5 \n"
-MEMACCESS(0)
 "ld1 {v1.h}[3], [%0] \n"
 "trn2 v2.8b, v0.8b, v1.8b \n"
@@ -203,9 +162,7 @@ void TransposeWx8_NEON(const uint8* src,
 "mov %0, %2 \n"
-MEMACCESS(0)
 "st1 {v3.8b}, [%0], %6 \n"
-MEMACCESS(0)
 "st1 {v2.8b}, [%0] \n"
 "add %1, %1, #2 \n" // src += 2
@@ -215,24 +172,15 @@ void TransposeWx8_NEON(const uint8* src,
 // 1x8 block
 "3: \n"
-MEMACCESS(1)
 "ld1 {v0.b}[0], [%1], %5 \n"
-MEMACCESS(1)
 "ld1 {v0.b}[1], [%1], %5 \n"
-MEMACCESS(1)
 "ld1 {v0.b}[2], [%1], %5 \n"
-MEMACCESS(1)
 "ld1 {v0.b}[3], [%1], %5 \n"
-MEMACCESS(1)
 "ld1 {v0.b}[4], [%1], %5 \n"
-MEMACCESS(1)
 "ld1 {v0.b}[5], [%1], %5 \n"
-MEMACCESS(1)
 "ld1 {v0.b}[6], [%1], %5 \n"
-MEMACCESS(1)
 "ld1 {v0.b}[7], [%1] \n"
-MEMACCESS(2)
 "st1 {v0.8b}, [%2] \n"
 "4: \n"
@@ -271,21 +219,13 @@ void TransposeUVWx8_NEON(const uint8* src,
 "1: \n"
 "mov %0, %1 \n"
-MEMACCESS(0)
 "ld1 {v0.16b}, [%0], %5 \n"
-MEMACCESS(0)
 "ld1 {v1.16b}, [%0], %5 \n"
-MEMACCESS(0)
 "ld1 {v2.16b}, [%0], %5 \n"
-MEMACCESS(0)
 "ld1 {v3.16b}, [%0], %5 \n"
-MEMACCESS(0)
 "ld1 {v4.16b}, [%0], %5 \n"
-MEMACCESS(0)
 "ld1 {v5.16b}, [%0], %5 \n"
-MEMACCESS(0)
 "ld1 {v6.16b}, [%0], %5 \n"
-MEMACCESS(0)
 "ld1 {v7.16b}, [%0] \n"
 "trn1 v16.16b, v0.16b, v1.16b \n"
@@ -317,40 +257,24 @@ void TransposeUVWx8_NEON(const uint8* src,
 "mov %0, %2 \n"
-MEMACCESS(0)
 "st1 {v16.d}[0], [%0], %6 \n"
-MEMACCESS(0)
 "st1 {v18.d}[0], [%0], %6 \n"
-MEMACCESS(0)
 "st1 {v17.d}[0], [%0], %6 \n"
-MEMACCESS(0)
 "st1 {v19.d}[0], [%0], %6 \n"
-MEMACCESS(0)
 "st1 {v16.d}[1], [%0], %6 \n"
-MEMACCESS(0)
 "st1 {v18.d}[1], [%0], %6 \n"
-MEMACCESS(0)
 "st1 {v17.d}[1], [%0], %6 \n"
-MEMACCESS(0)
 "st1 {v19.d}[1], [%0] \n"
 "mov %0, %3 \n"
-MEMACCESS(0)
 "st1 {v20.d}[0], [%0], %7 \n"
-MEMACCESS(0)
 "st1 {v22.d}[0], [%0], %7 \n"
-MEMACCESS(0)
 "st1 {v21.d}[0], [%0], %7 \n"
-MEMACCESS(0)
 "st1 {v23.d}[0], [%0], %7 \n"
-MEMACCESS(0)
 "st1 {v20.d}[1], [%0], %7 \n"
-MEMACCESS(0)
 "st1 {v22.d}[1], [%0], %7 \n"
-MEMACCESS(0)
 "st1 {v21.d}[1], [%0], %7 \n"
-MEMACCESS(0)
 "st1 {v23.d}[1], [%0] \n"
 "add %1, %1, #16 \n" // src += 8*2
@@ -374,24 +298,15 @@ void TransposeUVWx8_NEON(const uint8* src,
 // TODO(frkoenig): Clean this up
 // 4x8 block
 "mov %0, %1 \n"
-MEMACCESS(0)
 "ld1 {v0.8b}, [%0], %5 \n"
-MEMACCESS(0)
 "ld1 {v1.8b}, [%0], %5 \n"
-MEMACCESS(0)
 "ld1 {v2.8b}, [%0], %5 \n"
-MEMACCESS(0)
 "ld1 {v3.8b}, [%0], %5 \n"
-MEMACCESS(0)
 "ld1 {v4.8b}, [%0], %5 \n"
-MEMACCESS(0)
 "ld1 {v5.8b}, [%0], %5 \n"
-MEMACCESS(0)
 "ld1 {v6.8b}, [%0], %5 \n"
-MEMACCESS(0)
 "ld1 {v7.8b}, [%0] \n"
-MEMACCESS(8)
 "ld1 {v30.16b}, [%8], #16 \n"
 "ld1 {v31.16b}, [%8] \n"
@@ -402,44 +317,28 @@ void TransposeUVWx8_NEON(const uint8* src,
 "mov %0, %2 \n"
-MEMACCESS(0)
 "st1 {v16.s}[0], [%0], %6 \n"
-MEMACCESS(0)
 "st1 {v16.s}[1], [%0], %6 \n"
-MEMACCESS(0)
 "st1 {v16.s}[2], [%0], %6 \n"
-MEMACCESS(0)
 "st1 {v16.s}[3], [%0], %6 \n"
 "add %0, %2, #4 \n"
-MEMACCESS(0)
 "st1 {v18.s}[0], [%0], %6 \n"
-MEMACCESS(0)
 "st1 {v18.s}[1], [%0], %6 \n"
-MEMACCESS(0)
 "st1 {v18.s}[2], [%0], %6 \n"
-MEMACCESS(0)
 "st1 {v18.s}[3], [%0] \n"
 "mov %0, %3 \n"
-MEMACCESS(0)
 "st1 {v17.s}[0], [%0], %7 \n"
-MEMACCESS(0)
 "st1 {v17.s}[1], [%0], %7 \n"
-MEMACCESS(0)
 "st1 {v17.s}[2], [%0], %7 \n"
-MEMACCESS(0)
 "st1 {v17.s}[3], [%0], %7 \n"
 "add %0, %3, #4 \n"
-MEMACCESS(0)
 "st1 {v19.s}[0], [%0], %7 \n"
-MEMACCESS(0)
 "st1 {v19.s}[1], [%0], %7 \n"
-MEMACCESS(0)
 "st1 {v19.s}[2], [%0], %7 \n"
-MEMACCESS(0)
 "st1 {v19.s}[3], [%0] \n"
 "add %1, %1, #8 \n" // src += 4 * 2
@@ -456,21 +355,13 @@ void TransposeUVWx8_NEON(const uint8* src,
 // 2x8 block
 "2: \n"
 "mov %0, %1 \n"
-MEMACCESS(0)
 "ld2 {v0.h, v1.h}[0], [%0], %5 \n"
-MEMACCESS(0)
 "ld2 {v2.h, v3.h}[0], [%0], %5 \n"
-MEMACCESS(0)
 "ld2 {v0.h, v1.h}[1], [%0], %5 \n"
-MEMACCESS(0)
 "ld2 {v2.h, v3.h}[1], [%0], %5 \n"
-MEMACCESS(0)
 "ld2 {v0.h, v1.h}[2], [%0], %5 \n"
-MEMACCESS(0)
 "ld2 {v2.h, v3.h}[2], [%0], %5 \n"
-MEMACCESS(0)
 "ld2 {v0.h, v1.h}[3], [%0], %5 \n"
-MEMACCESS(0)
 "ld2 {v2.h, v3.h}[3], [%0] \n"
 "trn1 v4.8b, v0.8b, v2.8b \n"
@@ -480,16 +371,12 @@ void TransposeUVWx8_NEON(const uint8* src,
 "mov %0, %2 \n"
-MEMACCESS(0)
 "st1 {v4.d}[0], [%0], %6 \n"
-MEMACCESS(0)
 "st1 {v6.d}[0], [%0] \n"
 "mov %0, %3 \n"
-MEMACCESS(0)
 "st1 {v5.d}[0], [%0], %7 \n"
-MEMACCESS(0)
 "st1 {v7.d}[0], [%0] \n"
 "add %1, %1, #4 \n" // src += 2 * 2
@@ -500,26 +387,16 @@ void TransposeUVWx8_NEON(const uint8* src,
 // 1x8 block
 "3: \n"
-MEMACCESS(1)
 "ld2 {v0.b, v1.b}[0], [%1], %5 \n"
-MEMACCESS(1)
 "ld2 {v0.b, v1.b}[1], [%1], %5 \n"
-MEMACCESS(1)
 "ld2 {v0.b, v1.b}[2], [%1], %5 \n"
-MEMACCESS(1)
 "ld2 {v0.b, v1.b}[3], [%1], %5 \n"
-MEMACCESS(1)
 "ld2 {v0.b, v1.b}[4], [%1], %5 \n"
-MEMACCESS(1)
 "ld2 {v0.b, v1.b}[5], [%1], %5 \n"
-MEMACCESS(1)
 "ld2 {v0.b, v1.b}[6], [%1], %5 \n"
-MEMACCESS(1)
 "ld2 {v0.b, v1.b}[7], [%1] \n"
-MEMACCESS(2)
 "st1 {v0.d}[0], [%2] \n"
-MEMACCESS(3)
 "st1 {v1.d}[0], [%3] \n"
 "4: \n"

(File diff suppressed because it is too large.)

(File diff suppressed because it is too large.)


@@ -31,10 +31,8 @@ void ScaleRowDown2_NEON(const uint8* src_ptr,
 asm volatile(
 "1: \n"
 // load even pixels into q0, odd into q1
-MEMACCESS(0)
 "vld2.8 {q0, q1}, [%0]! \n"
 "subs %2, %2, #16 \n" // 16 processed per loop
-MEMACCESS(1)
 "vst1.8 {q1}, [%1]! \n" // store odd pixels
 "bgt 1b \n"
 : "+r"(src_ptr), // %0
@@ -53,14 +51,14 @@ void ScaleRowDown2Linear_NEON(const uint8* src_ptr,
 (void)src_stride;
 asm volatile(
 "1: \n"
-MEMACCESS(0)
-"vld1.8 {q0, q1}, [%0]! \n" // load pixels and post inc
+"vld1.8 {q0, q1}, [%0]! \n" // load pixels and post
+                            // inc
 "subs %2, %2, #16 \n" // 16 processed per loop
 "vpaddl.u8 q0, q0 \n" // add adjacent
 "vpaddl.u8 q1, q1 \n"
-"vrshrn.u16 d0, q0, #1 \n" // downshift, round and pack
+"vrshrn.u16 d0, q0, #1 \n" // downshift, round and
+                           // pack
 "vrshrn.u16 d1, q1, #1 \n"
-MEMACCESS(1)
 "vst1.8 {q0}, [%1]! \n"
 "bgt 1b \n"
 : "+r"(src_ptr), // %0
@@ -80,18 +78,17 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr,
 // change the stride to row 2 pointer
 "add %1, %0 \n"
 "1: \n"
-MEMACCESS(0)
 "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc
-MEMACCESS(1)
 "vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc
 "subs %3, %3, #16 \n" // 16 processed per loop
 "vpaddl.u8 q0, q0 \n" // row 1 add adjacent
 "vpaddl.u8 q1, q1 \n"
-"vpadal.u8 q0, q2 \n" // row 2 add adjacent + row1
+"vpadal.u8 q0, q2 \n" // row 2 add adjacent +
+                      // row1
 "vpadal.u8 q1, q3 \n"
-"vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack
+"vrshrn.u16 d0, q0, #2 \n" // downshift, round and
+                           // pack
 "vrshrn.u16 d1, q1, #2 \n"
-MEMACCESS(2)
 "vst1.8 {q0}, [%2]! \n"
 "bgt 1b \n"
 : "+r"(src_ptr), // %0
@@ -110,18 +107,15 @@ void ScaleRowDown4_NEON(const uint8* src_ptr,
 (void)src_stride;
 asm volatile(
 "1: \n"
-MEMACCESS(0)
 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
 "subs %2, %2, #8 \n" // 8 processed per loop
-MEMACCESS(1)
 "vst1.8 {d2}, [%1]! \n"
 "bgt 1b \n"
 : "+r"(src_ptr), // %0
 "+r"(dst_ptr), // %1
 "+r"(dst_width) // %2
 :
-: "q0", "q1", "memory", "cc"
-);
+: "q0", "q1", "memory", "cc");
 }
 void ScaleRowDown4Box_NEON(const uint8* src_ptr,
@@ -133,13 +127,9 @@ void ScaleRowDown4Box_NEON(const uint8* src_ptr,
 const uint8* src_ptr3 = src_ptr + src_stride * 3;
 asm volatile(
 "1: \n"
-MEMACCESS(0)
 "vld1.8 {q0}, [%0]! \n" // load up 16x4
-MEMACCESS(3)
 "vld1.8 {q1}, [%3]! \n"
-MEMACCESS(4)
 "vld1.8 {q2}, [%4]! \n"
-MEMACCESS(5)
 "vld1.8 {q3}, [%5]! \n"
 "subs %2, %2, #4 \n"
 "vpaddl.u8 q0, q0 \n"
@@ -149,7 +139,6 @@ void ScaleRowDown4Box_NEON(const uint8* src_ptr,
 "vpaddl.u16 q0, q0 \n"
 "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding
 "vmovn.u16 d0, q0 \n"
-MEMACCESS(1)
 "vst1.32 {d0[0]}, [%1]! \n"
 "bgt 1b \n"
 : "+r"(src_ptr), // %0
@@ -159,8 +148,7 @@ void ScaleRowDown4Box_NEON(const uint8* src_ptr,
 "+r"(src_ptr2), // %4
 "+r"(src_ptr3) // %5
 :
-: "q0", "q1", "q2", "q3", "memory", "cc"
-);
+: "q0", "q1", "q2", "q3", "memory", "cc");
 }
 // Down scale from 4 to 3 pixels. Use the neon multilane read/write
@@ -173,19 +161,16 @@ void ScaleRowDown34_NEON(const uint8* src_ptr,
 (void)src_stride;
 asm volatile(
 "1: \n"
-MEMACCESS(0)
 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
 "subs %2, %2, #24 \n"
 "vmov d2, d3 \n" // order d0, d1, d2
-MEMACCESS(1)
 "vst3.8 {d0, d1, d2}, [%1]! \n"
 "bgt 1b \n"
 : "+r"(src_ptr), // %0
 "+r"(dst_ptr), // %1
 "+r"(dst_width) // %2
 :
-: "d0", "d1", "d2", "d3", "memory", "cc"
-);
+: "d0", "d1", "d2", "d3", "memory", "cc");
 }
 void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
@@ -196,9 +181,7 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
 "vmov.u8 d24, #3 \n"
 "add %3, %0 \n"
 "1: \n"
-MEMACCESS(0)
 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
-MEMACCESS(3)
 "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
 "subs %2, %2, #24 \n"
@@ -235,7 +218,6 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
 "vmlal.u8 q8, d3, d24 \n"
 "vqrshrn.u16 d2, q8, #2 \n"
-MEMACCESS(1)
 "vst3.8 {d0, d1, d2}, [%1]! \n"
 "bgt 1b \n"
@@ -244,8 +226,8 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
 "+r"(dst_width), // %2
 "+r"(src_stride) // %3
 :
-: "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc"
-);
+: "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory",
+  "cc");
 }
 void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
@@ -256,9 +238,7 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
 "vmov.u8 d24, #3 \n"
 "add %3, %0 \n"
 "1: \n"
-MEMACCESS(0)
 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
-MEMACCESS(3)
 "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
 "subs %2, %2, #24 \n"
 // average src line 0 with src line 1
@@ -278,7 +258,6 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
 "vmlal.u8 q3, d3, d24 \n"
 "vqrshrn.u16 d2, q3, #2 \n"
-MEMACCESS(1)
 "vst3.8 {d0, d1, d2}, [%1]! \n"
 "bgt 1b \n"
 : "+r"(src_ptr), // %0
@@ -286,8 +265,7 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
 "+r"(dst_width), // %2
 "+r"(src_stride) // %3
 :
-: "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc"
-);
+: "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc");
 }
 #define HAS_SCALEROWDOWN38_NEON
@@ -306,25 +284,20 @@ void ScaleRowDown38_NEON(const uint8* src_ptr,
 int dst_width) {
 (void)src_stride;
 asm volatile(
-MEMACCESS(3)
 "vld1.8 {q3}, [%3] \n"
 "1: \n"
-MEMACCESS(0)
 "vld1.8 {d0, d1, d2, d3}, [%0]! \n"
 "subs %2, %2, #12 \n"
 "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n"
 "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n"
-MEMACCESS(1)
 "vst1.8 {d4}, [%1]! \n"
-MEMACCESS(1)
 "vst1.32 {d5[0]}, [%1]! \n"
 "bgt 1b \n"
 : "+r"(src_ptr), // %0
 "+r"(dst_ptr), // %1
 "+r"(dst_width) // %2
 : "r"(&kShuf38) // %3
-: "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc"
-);
+: "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc");
 }
 // 32x3 -> 12x1
@@ -335,11 +308,8 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
 const uint8* src_ptr1 = src_ptr + src_stride * 2;
 asm volatile(
-MEMACCESS(5)
 "vld1.16 {q13}, [%5] \n"
-MEMACCESS(6)
 "vld1.8 {q14}, [%6] \n"
-MEMACCESS(7)
 "vld1.8 {q15}, [%7] \n"
 "add %3, %0 \n"
 "1: \n"
@@ -348,11 +318,8 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
 // d1 = 10 50 11 51 12 52 13 53
 // d2 = 20 60 21 61 22 62 23 63
 // d3 = 30 70 31 71 32 72 33 73
-MEMACCESS(0)
 "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
-MEMACCESS(3)
 "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
-MEMACCESS(4)
 "vld4.8 {d16, d17, d18, d19}, [%4]! \n"
 "subs %2, %2, #12 \n"
@@ -430,9 +397,7 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
 "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
 "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
-MEMACCESS(1)
 "vst1.8 {d3}, [%1]! \n"
-MEMACCESS(1)
 "vst1.32 {d4[0]}, [%1]! \n"
 "bgt 1b \n"
 : "+r"(src_ptr), // %0
@@ -443,8 +408,8 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
 : "r"(&kMult38_Div6), // %5
 "r"(&kShuf38_2), // %6
 "r"(&kMult38_Div9) // %7
-: "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory", "cc"
-);
+: "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory",
+  "cc");
 }
 // 32x2 -> 12x1
@@ -453,9 +418,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
 uint8* dst_ptr,
 int dst_width) {
 asm volatile(
-MEMACCESS(4)
 "vld1.16 {q13}, [%4] \n"
-MEMACCESS(5)
 "vld1.8 {q14}, [%5] \n"
 "add %3, %0 \n"
 "1: \n"
@@ -464,9 +427,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
 // d1 = 10 50 11 51 12 52 13 53
 // d2 = 20 60 21 61 22 62 23 63
 // d3 = 30 70 31 71 32 72 33 73
-MEMACCESS(0)
 "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
-MEMACCESS(3)
 "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
 "subs %2, %2, #12 \n"
@@ -533,9 +494,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
 "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
 "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
-MEMACCESS(1)
 "vst1.8 {d3}, [%1]! \n"
-MEMACCESS(1)
 "vst1.32 {d4[0]}, [%1]! \n"
 "bgt 1b \n"
 : "+r"(src_ptr), // %0
@@ -544,8 +503,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
 "+r"(src_stride) // %3
 : "r"(&kMult38_Div6), // %4
 "r"(&kShuf38_2) // %5
-: "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"
-);
+: "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc");
 }
 void ScaleAddRows_NEON(const uint8* src_ptr,
@@ -562,13 +520,11 @@ void ScaleAddRows_NEON(const uint8* src_ptr,
 "veor q3, q3, q3 \n"
 "2: \n"
 // load 16 pixels into q0
-MEMACCESS(0)
 "vld1.8 {q0}, [%0], %3 \n"
 "vaddw.u8 q3, q3, d1 \n"
 "vaddw.u8 q2, q2, d0 \n"
 "subs r12, r12, #1 \n"
 "bgt 2b \n"
-MEMACCESS(2)
 "vst1.16 {q2, q3}, [%2]! \n" // store pixels
 "add %1, %1, #16 \n"
 "subs %4, %4, #16 \n" // 16 processed per loop
@@ -591,7 +547,6 @@ void ScaleAddRows_NEON(const uint8* src_ptr,
 "lsr %5, %3, #16 \n" \
 "add %6, %1, %5 \n" \
 "add %3, %3, %4 \n" \
-MEMACCESS(6) \
 "vld2.8 {d6["#n"], d7["#n"]}, [%6] \n"
 // clang-format on
@@ -643,7 +598,6 @@ void ScaleFilterCols_NEON(uint8* dst_ptr,
 "vadd.s16 q8, q8, q9 \n"
 "vmovn.s16 d6, q8 \n"
-MEMACCESS(0)
 "vst1.8 {d6}, [%0]! \n" // store pixels
 "vadd.s32 q1, q1, q0 \n"
 "vadd.s32 q2, q2, q0 \n"
@@ -686,9 +640,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
 "vdup.8 d4, %4 \n"
 // General purpose row blend.
 "1: \n"
-MEMACCESS(1)
 "vld1.8 {q0}, [%1]! \n"
-MEMACCESS(2)
 "vld1.8 {q1}, [%2]! \n"
 "subs %3, %3, #16 \n"
 "vmull.u8 q13, d0, d4 \n"
@@ -697,63 +649,50 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
 "vmlal.u8 q14, d3, d5 \n"
 "vrshrn.u16 d0, q13, #8 \n"
 "vrshrn.u16 d1, q14, #8 \n"
-MEMACCESS(0)
 "vst1.8 {q0}, [%0]! \n"
 "bgt 1b \n"
 "b 99f \n"
 // Blend 25 / 75.
 "25: \n"
-MEMACCESS(1)
 "vld1.8 {q0}, [%1]! \n"
-MEMACCESS(2)
 "vld1.8 {q1}, [%2]! \n"
 "subs %3, %3, #16 \n"
 "vrhadd.u8 q0, q1 \n"
 "vrhadd.u8 q0, q1 \n"
-MEMACCESS(0)
 "vst1.8 {q0}, [%0]! \n"
 "bgt 25b \n"
 "b 99f \n"
 // Blend 50 / 50.
 "50: \n"
-MEMACCESS(1)
 "vld1.8 {q0}, [%1]! \n"
-MEMACCESS(2)
 "vld1.8 {q1}, [%2]! \n"
 "subs %3, %3, #16 \n"
 "vrhadd.u8 q0, q1 \n"
-MEMACCESS(0)
 "vst1.8 {q0}, [%0]! \n"
 "bgt 50b \n"
 "b 99f \n"
 // Blend 75 / 25.
 "75: \n"
-MEMACCESS(1)
 "vld1.8 {q1}, [%1]! \n"
-MEMACCESS(2)
 "vld1.8 {q0}, [%2]! \n"
 "subs %3, %3, #16 \n"
 "vrhadd.u8 q0, q1 \n"
 "vrhadd.u8 q0, q1 \n"
-MEMACCESS(0)
 "vst1.8 {q0}, [%0]! \n"
 "bgt 75b \n"
 "b 99f \n"
 // Blend 100 / 0 - Copy row unchanged.
 "100: \n"
-MEMACCESS(1)
 "vld1.8 {q0}, [%1]! \n"
 "subs %3, %3, #16 \n"
-MEMACCESS(0)
 "vst1.8 {q0}, [%0]! \n"
 "bgt 100b \n"
 "99: \n"
-MEMACCESS(0)
 "vst1.8 {d1[7]}, [%0] \n"
 : "+r"(dst_ptr), // %0
 "+r"(src_ptr), // %1
@@ -761,8 +700,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
 "+r"(dst_width), // %3
 "+r"(source_y_fraction) // %4
 :
-: "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc"
-);
+: "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc");
 }
 void ScaleARGBRowDown2_NEON(const uint8* src_ptr,
@@ -773,14 +711,10 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr,
 asm volatile(
 "1: \n"
 // load even pixels into q0, odd into q1
-MEMACCESS(0)
 "vld2.32 {q0, q1}, [%0]! \n"
-MEMACCESS(0)
 "vld2.32 {q2, q3}, [%0]! \n"
 "subs %2, %2, #8 \n" // 8 processed per loop
-MEMACCESS(1)
 "vst1.8 {q1}, [%1]! \n" // store odd pixels
-MEMACCESS(1)
 "vst1.8 {q3}, [%1]! \n"
 "bgt 1b \n"
 : "+r"(src_ptr), // %0
@@ -798,20 +732,19 @@ void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb,
 (void)src_stride;
 asm volatile(
 "1: \n"
-MEMACCESS(0)
 "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
-MEMACCESS(0)
-"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
+"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
+                                    // pixels.
 "subs %2, %2, #8 \n" // 8 processed per loop
 "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
 "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
 "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
 "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts.
-"vrshrn.u16 d0, q0, #1 \n" // downshift, round and pack
+"vrshrn.u16 d0, q0, #1 \n" // downshift, round and
+                           // pack
 "vrshrn.u16 d1, q1, #1 \n"
 "vrshrn.u16 d2, q2, #1 \n"
 "vrshrn.u16 d3, q3, #1 \n"
-MEMACCESS(1)
 "vst4.8 {d0, d1, d2, d3}, [%1]! \n"
 "bgt 1b \n"
 : "+r"(src_argb), // %0
@@ -830,28 +763,27 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr,
 // change the stride to row 2 pointer
 "add %1, %1, %0 \n"
 "1: \n"
-MEMACCESS(0)
 "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
-MEMACCESS(0)
-"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
+"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
+                                    // pixels.
 "subs %3, %3, #8 \n" // 8 processed per loop.
 "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
 "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
 "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
 "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts.
-MEMACCESS(1)
-"vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB pixels.
-MEMACCESS(1)
-"vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB pixels.
+"vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB
+                                        // pixels.
+"vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB
+                                        // pixels.
 "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts.
 "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts.
 "vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts.
 "vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts.
-"vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack
+"vrshrn.u16 d0, q0, #2 \n" // downshift, round and
+                           // pack
 "vrshrn.u16 d1, q1, #2 \n"
 "vrshrn.u16 d2, q2, #2 \n"
 "vrshrn.u16 d3, q3, #2 \n"
-MEMACCESS(2)
 "vst4.8 {d0, d1, d2, d3}, [%2]! \n"
 "bgt 1b \n"
 : "+r"(src_ptr), // %0
@@ -859,8 +791,7 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr,
 "+r"(dst), // %2
 "+r"(dst_width) // %3
 :
-: "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
-);
+: "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
 }
 // Reads 4 pixels at a time.
@@ -874,24 +805,18 @@ void ScaleARGBRowDownEven_NEON(const uint8* src_argb,
 asm volatile(
 "mov r12, %3, lsl #2 \n"
 "1: \n"
-MEMACCESS(0)
 "vld1.32 {d0[0]}, [%0], r12 \n"
-MEMACCESS(0)
 "vld1.32 {d0[1]}, [%0], r12 \n"
-MEMACCESS(0)
 "vld1.32 {d1[0]}, [%0], r12 \n"
-MEMACCESS(0)
 "vld1.32 {d1[1]}, [%0], r12 \n"
 "subs %2, %2, #4 \n" // 4 pixels per loop.
-MEMACCESS(1)
 "vst1.8 {q0}, [%1]! \n"
 "bgt 1b \n"
 : "+r"(src_argb), // %0
 "+r"(dst_argb), // %1
 "+r"(dst_width) // %2
 : "r"(src_stepx) // %3
-: "memory", "cc", "r12", "q0"
-);
+: "memory", "cc", "r12", "q0");
 }
 // Reads 4 pixels at a time.
@@ -905,21 +830,14 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb,
 "mov r12, %4, lsl #2 \n"
 "add %1, %1, %0 \n"
 "1: \n"
-MEMACCESS(0)
-"vld1.8 {d0}, [%0], r12 \n" // Read 4 2x2 blocks -> 2x1
-MEMACCESS(1)
+"vld1.8 {d0}, [%0], r12 \n" // Read 4 2x2 blocks ->
+                            // 2x1
 "vld1.8 {d1}, [%1], r12 \n"
-MEMACCESS(0)
 "vld1.8 {d2}, [%0], r12 \n"
-MEMACCESS(1)
 "vld1.8 {d3}, [%1], r12 \n"
-MEMACCESS(0)
 "vld1.8 {d4}, [%0], r12 \n"
-MEMACCESS(1)
 "vld1.8 {d5}, [%1], r12 \n"
-MEMACCESS(0)
 "vld1.8 {d6}, [%0], r12 \n"
-MEMACCESS(1)
 "vld1.8 {d7}, [%1], r12 \n"
 "vaddl.u8 q0, d0, d1 \n"
 "vaddl.u8 q1, d2, d3 \n"
@@ -932,7 +850,6 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb,
 "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels.
 "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels.
 "subs %3, %3, #4 \n" // 4 pixels per loop.
-MEMACCESS(2)
 "vst1.8 {q0}, [%2]! \n"
 "bgt 1b \n"
 : "+r"(src_argb), // %0
@@ -940,8 +857,7 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb,
 "+r"(dst_argb), // %2
 "+r"(dst_width) // %3
 : "r"(src_stepx) // %4
-: "memory", "cc", "r12", "q0", "q1", "q2", "q3"
-);
+: "memory", "cc", "r12", "q0", "q1", "q2", "q3");
 }
 // clang-format off
@@ -951,7 +867,6 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb,
 "lsr %5, %3, #16 \n" \
 "add %6, %1, %5, lsl #2 \n" \
 "add %3, %3, %4 \n" \
-MEMACCESS(6) \
 "vld1.32 {" #dn "[" #n "]}, [%6] \n"
 // clang-format on
@@ -963,19 +878,14 @@ void ScaleARGBCols_NEON(uint8* dst_argb,
 int tmp;
 const uint8* src_tmp = src_argb;
 asm volatile(
-"1: \n"
-LOAD1_DATA32_LANE(d0, 0)
-LOAD1_DATA32_LANE(d0, 1)
-LOAD1_DATA32_LANE(d1, 0)
-LOAD1_DATA32_LANE(d1, 1)
-LOAD1_DATA32_LANE(d2, 0)
-LOAD1_DATA32_LANE(d2, 1)
-LOAD1_DATA32_LANE(d3, 0)
-LOAD1_DATA32_LANE(d3, 1)
-MEMACCESS(0)
+"1: \n" LOAD1_DATA32_LANE(
+    d0, 0) LOAD1_DATA32_LANE(d0, 1) LOAD1_DATA32_LANE(d1, 0)
+    LOAD1_DATA32_LANE(d1, 1) LOAD1_DATA32_LANE(d2, 0) LOAD1_DATA32_LANE(
+        d2, 1) LOAD1_DATA32_LANE(d3, 0) LOAD1_DATA32_LANE(d3, 1)
 "vst1.32 {q0, q1}, [%0]! \n" // store pixels
-"subs %2, %2, #8 \n" // 8 processed per loop
+"subs %2, %2, #8 \n" // 8 processed per
+                     // loop
 "bgt 1b \n"
 : "+r"(dst_argb), // %0
 "+r"(src_argb), // %1
@@ -985,8 +895,7 @@ void ScaleARGBCols_NEON(uint8* dst_argb,
 "=&r"(tmp), // %5
 "+r"(src_tmp) // %6
 :
-: "memory", "cc", "q0", "q1"
-);
+: "memory", "cc", "q0", "q1");
 }
 #undef LOAD1_DATA32_LANE
@@ -998,7 +907,6 @@ void ScaleARGBCols_NEON(uint8* dst_argb,
 "lsr %5, %3, #16 \n" \
 "add %6, %1, %5, lsl #2 \n" \
 "add %3, %3, %4 \n" \
-MEMACCESS(6) \
 "vld2.32 {" #dn1 "[" #n "], " #dn2 "[" #n "]}, [%6] \n"
 // clang-format on
@@ -1045,7 +953,6 @@ void ScaleARGBFilterCols_NEON(uint8* dst_argb,
 "vshrn.i16 d0, q11, #7 \n"
 "vshrn.i16 d1, q12, #7 \n"
-MEMACCESS(0)
 "vst1.32 {d0, d1}, [%0]! \n" // store pixels
 "vadd.s32 q8, q8, q9 \n"
 "subs %2, %2, #4 \n" // 4 processed per loop


@ -29,10 +29,8 @@ void ScaleRowDown2_NEON(const uint8* src_ptr,
asm volatile ( asm volatile (
"1: \n" "1: \n"
// load even pixels into v0, odd into v1 // load even pixels into v0, odd into v1
MEMACCESS(0)
"ld2 {v0.16b,v1.16b}, [%0], #32 \n" "ld2 {v0.16b,v1.16b}, [%0], #32 \n"
"subs %w2, %w2, #16 \n" // 16 processed per loop "subs %w2, %w2, #16 \n" // 16 processed per loop
MEMACCESS(1)
"st1 {v1.16b}, [%1], #16 \n" // store odd pixels "st1 {v1.16b}, [%1], #16 \n" // store odd pixels
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
@ -51,14 +49,12 @@ void ScaleRowDown2Linear_NEON(const uint8* src_ptr,
(void)src_stride; (void)src_stride;
asm volatile ( asm volatile (
"1: \n" "1: \n"
MEMACCESS(0)
"ld1 {v0.16b,v1.16b}, [%0], #32 \n" // load pixels and post inc "ld1 {v0.16b,v1.16b}, [%0], #32 \n" // load pixels and post inc
"subs %w2, %w2, #16 \n" // 16 processed per loop "subs %w2, %w2, #16 \n" // 16 processed per loop
"uaddlp v0.8h, v0.16b \n" // add adjacent "uaddlp v0.8h, v0.16b \n" // add adjacent
"uaddlp v1.8h, v1.16b \n" "uaddlp v1.8h, v1.16b \n"
"rshrn v0.8b, v0.8h, #1 \n" // downshift, round and pack "rshrn v0.8b, v0.8h, #1 \n" // downshift, round and pack
"rshrn2 v0.16b, v1.8h, #1 \n" "rshrn2 v0.16b, v1.8h, #1 \n"
MEMACCESS(1)
"st1 {v0.16b}, [%1], #16 \n" "st1 {v0.16b}, [%1], #16 \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
@ -78,9 +74,7 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr,
// change the stride to row 2 pointer // change the stride to row 2 pointer
"add %1, %1, %0 \n" "add %1, %1, %0 \n"
"1: \n" "1: \n"
MEMACCESS(0)
"ld1 {v0.16b, v1.16b}, [%0], #32 \n" // load row 1 and post inc "ld1 {v0.16b, v1.16b}, [%0], #32 \n" // load row 1 and post inc
MEMACCESS(1)
"ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc "ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc
"subs %w3, %w3, #16 \n" // 16 processed per loop "subs %w3, %w3, #16 \n" // 16 processed per loop
"uaddlp v0.8h, v0.16b \n" // row 1 add adjacent "uaddlp v0.8h, v0.16b \n" // row 1 add adjacent
@ -89,7 +83,6 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr,
"uadalp v1.8h, v3.16b \n" "uadalp v1.8h, v3.16b \n"
"rshrn v0.8b, v0.8h, #2 \n" // downshift, round and pack "rshrn v0.8b, v0.8h, #2 \n" // downshift, round and pack
"rshrn2 v0.16b, v1.8h, #2 \n" "rshrn2 v0.16b, v1.8h, #2 \n"
MEMACCESS(2)
"st1 {v0.16b}, [%2], #16 \n" "st1 {v0.16b}, [%2], #16 \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
@ -108,10 +101,8 @@ void ScaleRowDown4_NEON(const uint8* src_ptr,
(void)src_stride; (void)src_stride;
asm volatile ( asm volatile (
"1: \n" "1: \n"
MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
"subs %w2, %w2, #8 \n" // 8 processed per loop "subs %w2, %w2, #8 \n" // 8 processed per loop
MEMACCESS(1)
"st1 {v2.8b}, [%1], #8 \n" "st1 {v2.8b}, [%1], #8 \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
@ -131,13 +122,9 @@ void ScaleRowDown4Box_NEON(const uint8* src_ptr,
const uint8* src_ptr3 = src_ptr + src_stride * 3; const uint8* src_ptr3 = src_ptr + src_stride * 3;
asm volatile ( asm volatile (
"1: \n" "1: \n"
MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n" // load up 16x4 "ld1 {v0.16b}, [%0], #16 \n" // load up 16x4
MEMACCESS(3)
"ld1 {v1.16b}, [%2], #16 \n" "ld1 {v1.16b}, [%2], #16 \n"
MEMACCESS(4)
"ld1 {v2.16b}, [%3], #16 \n" "ld1 {v2.16b}, [%3], #16 \n"
MEMACCESS(5)
"ld1 {v3.16b}, [%4], #16 \n" "ld1 {v3.16b}, [%4], #16 \n"
"subs %w5, %w5, #4 \n" "subs %w5, %w5, #4 \n"
"uaddlp v0.8h, v0.16b \n" "uaddlp v0.8h, v0.16b \n"
@ -146,7 +133,6 @@ void ScaleRowDown4Box_NEON(const uint8* src_ptr,
"uadalp v0.8h, v3.16b \n" "uadalp v0.8h, v3.16b \n"
"addp v0.8h, v0.8h, v0.8h \n" "addp v0.8h, v0.8h, v0.8h \n"
"rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding "rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding
MEMACCESS(1)
"st1 {v0.s}[0], [%1], #4 \n" "st1 {v0.s}[0], [%1], #4 \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
@@ -170,11 +156,9 @@ void ScaleRowDown34_NEON(const uint8* src_ptr,
(void)src_stride;
asm volatile (
"1: \n"
MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
"subs %w2, %w2, #24 \n"
"orr v2.16b, v3.16b, v3.16b \n" // order v0, v1, v2
MEMACCESS(1)
"st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
@@ -193,9 +177,7 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
"movi v20.8b, #3 \n"
"add %3, %3, %0 \n"
"1: \n"
MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
MEMACCESS(3)
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
"subs %w2, %w2, #24 \n"
@@ -232,7 +214,6 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
"umlal v16.8h, v3.8b, v20.8b \n"
"uqrshrn v2.8b, v16.8h, #2 \n"
MEMACCESS(1)
"st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
"b.gt 1b \n"
@@ -254,9 +235,7 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
"movi v20.8b, #3 \n"
"add %3, %3, %0 \n"
"1: \n"
MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
MEMACCESS(3)
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
"subs %w2, %w2, #24 \n"
// average src line 0 with src line 1
@@ -278,7 +257,6 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
"umlal v4.8h, v3.8b, v20.8b \n"
"uqrshrn v2.8b, v4.8h, #2 \n"
MEMACCESS(1)
"st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
@@ -305,16 +283,12 @@ void ScaleRowDown38_NEON(const uint8* src_ptr,
int dst_width) {
(void)src_stride;
asm volatile (
MEMACCESS(3)
"ld1 {v3.16b}, [%3] \n"
"1: \n"
MEMACCESS(0)
"ld1 {v0.16b,v1.16b}, [%0], #32 \n"
"subs %w2, %w2, #12 \n"
"tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n"
MEMACCESS(1)
"st1 {v2.8b}, [%1], #8 \n"
MEMACCESS(1)
"st1 {v2.s}[2], [%1], #4 \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
@@ -334,11 +308,8 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
ptrdiff_t tmp_src_stride = src_stride;
asm volatile (
MEMACCESS(5)
"ld1 {v29.8h}, [%5] \n"
MEMACCESS(6)
"ld1 {v30.16b}, [%6] \n"
MEMACCESS(7)
"ld1 {v31.8h}, [%7] \n"
"add %2, %2, %0 \n"
"1: \n"
@@ -347,11 +318,8 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
// 10 50 11 51 12 52 13 53
// 20 60 21 61 22 62 23 63
// 30 70 31 71 32 72 33 73
MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
MEMACCESS(3)
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
MEMACCESS(4)
"ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n"
"subs %w4, %w4, #12 \n"
@@ -436,9 +404,7 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
// be adjacent
"tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n"
MEMACCESS(1)
"st1 {v3.8b}, [%1], #8 \n"
MEMACCESS(1)
"st1 {v3.s}[2], [%1], #4 \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
@@ -463,9 +429,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
// TODO(fbarchard): use src_stride directly for clang 3.5+.
ptrdiff_t tmp_src_stride = src_stride;
asm volatile (
MEMACCESS(4)
"ld1 {v30.8h}, [%4] \n"
MEMACCESS(5)
"ld1 {v31.16b}, [%5] \n"
"add %2, %2, %0 \n"
"1: \n"
@@ -474,9 +438,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
// 10 50 11 51 12 52 13 53
// 20 60 21 61 22 62 23 63
// 30 70 31 71 32 72 33 73
MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
MEMACCESS(3)
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
"subs %w3, %w3, #12 \n"
@@ -547,9 +509,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
"tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n"
MEMACCESS(1)
"st1 {v3.8b}, [%1], #8 \n"
MEMACCESS(1)
"st1 {v3.s}[2], [%1], #4 \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
@@ -577,13 +537,11 @@ void ScaleAddRows_NEON(const uint8* src_ptr,
"eor v3.16b, v3.16b, v3.16b \n"
"2: \n"
// load 16 pixels into q0
MEMACCESS(0)
"ld1 {v0.16b}, [%0], %3 \n"
"uaddw2 v3.8h, v3.8h, v0.16b \n"
"uaddw v2.8h, v2.8h, v0.8b \n"
"subs w12, w12, #1 \n"
"b.gt 2b \n"
MEMACCESS(2)
"st1 {v2.8h, v3.8h}, [%2], #32 \n" // store pixels
"add %1, %1, #16 \n"
"subs %w4, %w4, #16 \n" // 16 processed per loop
@@ -606,7 +564,6 @@ void ScaleAddRows_NEON(const uint8* src_ptr,
"lsr %5, %3, #16 \n" \
"add %6, %1, %5 \n" \
"add %3, %3, %4 \n" \
MEMACCESS(6) \
"ld2 {v4.b, v5.b}[" #n "], [%6] \n"
// clang-format on
@@ -660,7 +617,6 @@ void ScaleFilterCols_NEON(uint8* dst_ptr,
"add v4.8h, v4.8h, v6.8h \n"
"xtn v4.8b, v4.8h \n"
MEMACCESS(0)
"st1 {v4.8b}, [%0], #8 \n" // store pixels
"add v1.4s, v1.4s, v0.4s \n"
"add v2.4s, v2.4s, v0.4s \n"
@@ -703,9 +659,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
"dup v4.8b, %w5 \n"
// General purpose row blend.
"1: \n"
MEMACCESS(1)
"ld1 {v0.16b}, [%1], #16 \n"
MEMACCESS(2)
"ld1 {v1.16b}, [%2], #16 \n"
"subs %w3, %w3, #16 \n"
"umull v6.8h, v0.8b, v4.8b \n"
@@ -714,63 +668,50 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
"umlal2 v7.8h, v1.16b, v5.16b \n"
"rshrn v0.8b, v6.8h, #8 \n"
"rshrn2 v0.16b, v7.8h, #8 \n"
MEMACCESS(0)
"st1 {v0.16b}, [%0], #16 \n"
"b.gt 1b \n"
"b 99f \n"
// Blend 25 / 75.
"25: \n"
MEMACCESS(1)
"ld1 {v0.16b}, [%1], #16 \n"
MEMACCESS(2)
"ld1 {v1.16b}, [%2], #16 \n"
"subs %w3, %w3, #16 \n"
"urhadd v0.16b, v0.16b, v1.16b \n"
"urhadd v0.16b, v0.16b, v1.16b \n"
MEMACCESS(0)
"st1 {v0.16b}, [%0], #16 \n"
"b.gt 25b \n"
"b 99f \n"
// Blend 50 / 50.
"50: \n"
MEMACCESS(1)
"ld1 {v0.16b}, [%1], #16 \n"
MEMACCESS(2)
"ld1 {v1.16b}, [%2], #16 \n"
"subs %w3, %w3, #16 \n"
"urhadd v0.16b, v0.16b, v1.16b \n"
MEMACCESS(0)
"st1 {v0.16b}, [%0], #16 \n"
"b.gt 50b \n"
"b 99f \n"
// Blend 75 / 25.
"75: \n"
MEMACCESS(1)
"ld1 {v1.16b}, [%1], #16 \n"
MEMACCESS(2)
"ld1 {v0.16b}, [%2], #16 \n"
"subs %w3, %w3, #16 \n"
"urhadd v0.16b, v0.16b, v1.16b \n"
"urhadd v0.16b, v0.16b, v1.16b \n"
MEMACCESS(0)
"st1 {v0.16b}, [%0], #16 \n"
"b.gt 75b \n"
"b 99f \n"
// Blend 100 / 0 - Copy row unchanged.
"100: \n"
MEMACCESS(1)
"ld1 {v0.16b}, [%1], #16 \n"
"subs %w3, %w3, #16 \n"
MEMACCESS(0)
"st1 {v0.16b}, [%0], #16 \n"
"b.gt 100b \n"
"99: \n"
MEMACCESS(0)
"st1 {v0.b}[15], [%0] \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
@@ -791,14 +732,10 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr,
asm volatile (
"1: \n"
// load even pixels into q0, odd into q1
MEMACCESS (0)
"ld2 {v0.4s, v1.4s}, [%0], #32 \n"
MEMACCESS (0)
"ld2 {v2.4s, v3.4s}, [%0], #32 \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop
MEMACCESS (1)
"st1 {v1.16b}, [%1], #16 \n" // store odd pixels
MEMACCESS (1)
"st1 {v3.16b}, [%1], #16 \n"
"b.gt 1b \n"
: "+r" (src_ptr), // %0
@@ -816,7 +753,6 @@ void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb,
(void)src_stride;
asm volatile (
"1: \n"
MEMACCESS (0)
// load 8 ARGB pixels.
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop.
@@ -828,7 +764,6 @@ void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb,
"rshrn v1.8b, v1.8h, #1 \n"
"rshrn v2.8b, v2.8h, #1 \n"
"rshrn v3.8b, v3.8h, #1 \n"
MEMACCESS (1)
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"
"b.gt 1b \n"
: "+r"(src_argb), // %0
@@ -847,14 +782,12 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr,
// change the stride to row 2 pointer
"add %1, %1, %0 \n"
"1: \n"
MEMACCESS (0)
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 8 ARGB pixels.
"subs %w3, %w3, #8 \n" // 8 processed per loop.
"uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
"uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
"uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
"uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts.
MEMACCESS (1)
"ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8 more ARGB pixels.
"uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts.
"uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts.
@@ -864,7 +797,6 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr,
"rshrn v1.8b, v1.8h, #2 \n"
"rshrn v2.8b, v2.8h, #2 \n"
"rshrn v3.8b, v3.8h, #2 \n"
MEMACCESS (2)
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"
"b.gt 1b \n"
: "+r" (src_ptr), // %0
@@ -886,16 +818,11 @@ void ScaleARGBRowDownEven_NEON(const uint8* src_argb,
(void)src_stride;
asm volatile (
"1: \n"
MEMACCESS(0)
"ld1 {v0.s}[0], [%0], %3 \n"
MEMACCESS(0)
"ld1 {v0.s}[1], [%0], %3 \n"
MEMACCESS(0)
"ld1 {v0.s}[2], [%0], %3 \n"
MEMACCESS(0)
"ld1 {v0.s}[3], [%0], %3 \n"
"subs %w2, %w2, #4 \n" // 4 pixels per loop.
MEMACCESS(1)
"st1 {v0.16b}, [%1], #16 \n"
"b.gt 1b \n"
: "+r"(src_argb), // %0
@@ -918,21 +845,13 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb,
asm volatile (
"add %1, %1, %0 \n"
"1: \n"
MEMACCESS(0)
"ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 blocks -> 2x1
MEMACCESS(1)
"ld1 {v1.8b}, [%1], %4 \n"
MEMACCESS(0)
"ld1 {v2.8b}, [%0], %4 \n"
MEMACCESS(1)
"ld1 {v3.8b}, [%1], %4 \n"
MEMACCESS(0)
"ld1 {v4.8b}, [%0], %4 \n"
MEMACCESS(1)
"ld1 {v5.8b}, [%1], %4 \n"
MEMACCESS(0)
"ld1 {v6.8b}, [%0], %4 \n"
MEMACCESS(1)
"ld1 {v7.8b}, [%1], %4 \n"
"uaddl v0.8h, v0.8b, v1.8b \n"
"uaddl v2.8h, v2.8b, v3.8b \n"
@@ -949,7 +868,6 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb,
"rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels.
"rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels.
"subs %w3, %w3, #4 \n" // 4 pixels per loop.
MEMACCESS(2)
"st1 {v0.16b}, [%2], #16 \n"
"b.gt 1b \n"
: "+r"(src_argb), // %0
@@ -968,7 +886,6 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb,
"lsr %5, %3, #16 \n" \
"add %6, %1, %5, lsl #2 \n" \
"add %3, %3, %4 \n" \
MEMACCESS(6) \
"ld1 {" #vn ".s}[" #n "], [%6] \n"
// clang-format on
@@ -992,7 +909,6 @@ void ScaleARGBCols_NEON(uint8* dst_argb,
LOAD1_DATA32_LANE(v1, 2)
LOAD1_DATA32_LANE(v1, 3)
MEMACCESS(0)
"st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels
"subs %w2, %w2, #8 \n" // 8 processed per loop
"b.gt 1b \n"
@@ -1017,7 +933,6 @@ void ScaleARGBCols_NEON(uint8* dst_argb,
"lsr %5, %3, #16 \n" \
"add %6, %1, %5, lsl #2 \n" \
"add %3, %3, %4 \n" \
MEMACCESS(6) \
"ld2 {" #vn1 ".s, " #vn2 ".s}[" #n "], [%6] \n"
// clang-format on
@@ -1067,7 +982,6 @@ void ScaleARGBFilterCols_NEON(uint8* dst_argb,
"shrn v0.8b, v16.8h, #7 \n"
"shrn2 v0.16b, v17.8h, #7 \n"
MEMACCESS(0)
"st1 {v0.4s}, [%0], #16 \n" // store pixels
"add v5.4s, v5.4s, v6.4s \n"
"subs %w2, %w2, #4 \n" // 4 processed per loop