Remove ARM NaCl macros from source

NaCl has been disabled for a while; the code will still build for it,
but only with the C versions. This change removes the MEMACCESS()
macros from the NEON and NEON64 source.
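
For context, MEMACCESS() was the Native Client sandboxing hook: on
NaCl/ARM builds it expanded to an alignment directive plus a "bic" that
masked the base address register into the sandbox's address range, and on
every other ARM build it expanded to nothing, so deleting the invocations
leaves the generated code unchanged. A minimal sketch of the two
expansions, copied from the #if block removed below:

  /* NaCl/ARM: align the memory op and clear the top two address bits so
   * the load/store stays inside the Native Client sandbox. */
  #define MEMACCESS(base) ".p2align 3\nbic %" #base ", #0xc0000000\n"

  /* All other ARM builds: expands to nothing, so e.g. MEMACCESS(0) before
   * "vld1.8 {d0}, [%0]" emits no instructions at all. */
  #define MEMACCESS(base)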

BUG=libyuv:702
TEST=try bots build for arm.
R=kjellander@chromium.org

Change-Id: Id581a5c8ff71e18cc69595e7fee9337f97c44a19
Reviewed-on: https://chromium-review.googlesource.com/528332
Reviewed-by: Cheng Wang <wangcheng@google.com>
Commit-Queue: Frank Barchard <fbarchard@google.com>
Authored by Frank Barchard on 2017-06-09 14:49:28 -07:00, committed by Commit Bot
parent 5f94a33e0c
commit 6c94ad13b5
10 changed files with 3826 additions and 4834 deletions

include/libyuv/row.h

@@ -625,15 +625,6 @@ extern const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants); // BT.709
  #op " $" #sel ",%%" #reg "," #offset "(%" #base ",%" #index "," #scale ")\n"
  #endif // defined(__native_client__) && defined(__x86_64__)
- #if defined(__arm__) || defined(__aarch64__)
- #undef MEMACCESS
- #if defined(__native_client__)
- #define MEMACCESS(base) ".p2align 3\nbic %" #base ", #0xc0000000\n"
- #else
- #define MEMACCESS(base)
- #endif
- #endif
  // Intel Code Analizer markers. Insert IACA_START IACA_END around code to be
  // measured and then run with iaca -64 libyuv_unittest.
  // IACA_ASM_START amd IACA_ASM_END are equivalents that can be used within

source/compare_neon.cc

@@ -64,9 +64,7 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
  "vmov.u8 q11, #0 \n"
  "1: \n"
- MEMACCESS(0)
  "vld1.8 {q0}, [%0]! \n"
- MEMACCESS(1)
  "vld1.8 {q1}, [%1]! \n"
  "subs %2, %2, #16 \n"
  "vsubl.u8 q2, d0, d2 \n"

source/compare_neon64.cc

@@ -59,9 +59,7 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
  "eor v19.16b, v19.16b, v19.16b \n"
  "1: \n"
- MEMACCESS(0)
  "ld1 {v0.16b}, [%0], #16 \n"
- MEMACCESS(1)
  "ld1 {v1.16b}, [%1], #16 \n"
  "subs %w2, %w2, #16 \n"
  "usubl v2.8h, v0.8b, v1.8b \n"

source/rotate_neon.cc

@@ -30,31 +30,23 @@ void TransposeWx8_NEON(const uint8* src,
  int dst_stride,
  int width) {
  const uint8* src_temp;
- asm volatile (
+ asm volatile(
  // loops are on blocks of 8. loop will stop when
  // counter gets to or below 0. starting the counter
  // at w-8 allow for this
  "sub %5, #8 \n"
  // handle 8x8 blocks. this should be the majority of the plane
  "1: \n"
  "mov %0, %1 \n"
- MEMACCESS(0)
  "vld1.8 {d0}, [%0], %2 \n"
- MEMACCESS(0)
  "vld1.8 {d1}, [%0], %2 \n"
- MEMACCESS(0)
  "vld1.8 {d2}, [%0], %2 \n"
- MEMACCESS(0)
  "vld1.8 {d3}, [%0], %2 \n"
- MEMACCESS(0)
  "vld1.8 {d4}, [%0], %2 \n"
- MEMACCESS(0)
  "vld1.8 {d5}, [%0], %2 \n"
- MEMACCESS(0)
  "vld1.8 {d6}, [%0], %2 \n"
- MEMACCESS(0)
  "vld1.8 {d7}, [%0] \n"
  "vtrn.8 d1, d0 \n"
@@ -79,21 +71,13 @@ void TransposeWx8_NEON(const uint8* src,
  "mov %0, %3 \n"
- MEMACCESS(0)
  "vst1.8 {d1}, [%0], %4 \n"
- MEMACCESS(0)
  "vst1.8 {d0}, [%0], %4 \n"
- MEMACCESS(0)
  "vst1.8 {d3}, [%0], %4 \n"
- MEMACCESS(0)
  "vst1.8 {d2}, [%0], %4 \n"
- MEMACCESS(0)
  "vst1.8 {d5}, [%0], %4 \n"
- MEMACCESS(0)
  "vst1.8 {d4}, [%0], %4 \n"
- MEMACCESS(0)
  "vst1.8 {d7}, [%0], %4 \n"
- MEMACCESS(0)
  "vst1.8 {d6}, [%0] \n"
  "add %1, #8 \n" // src += 8
@@ -101,145 +85,108 @@ void TransposeWx8_NEON(const uint8* src,
  "subs %5, #8 \n" // w -= 8
  "bge 1b \n"
  // add 8 back to counter. if the result is 0 there are
  // no residuals.
  "adds %5, #8 \n"
  "beq 4f \n"
  // some residual, so between 1 and 7 lines left to transpose
  "cmp %5, #2 \n"
  "blt 3f \n"
  "cmp %5, #4 \n"
  "blt 2f \n"
  // 4x8 block
  "mov %0, %1 \n"
- MEMACCESS(0)
  "vld1.32 {d0[0]}, [%0], %2 \n"
- MEMACCESS(0)
  "vld1.32 {d0[1]}, [%0], %2 \n"
- MEMACCESS(0)
  "vld1.32 {d1[0]}, [%0], %2 \n"
- MEMACCESS(0)
  "vld1.32 {d1[1]}, [%0], %2 \n"
- MEMACCESS(0)
  "vld1.32 {d2[0]}, [%0], %2 \n"
- MEMACCESS(0)
  "vld1.32 {d2[1]}, [%0], %2 \n"
- MEMACCESS(0)
  "vld1.32 {d3[0]}, [%0], %2 \n"
- MEMACCESS(0)
  "vld1.32 {d3[1]}, [%0] \n"
  "mov %0, %3 \n"
- MEMACCESS(6)
  "vld1.8 {q3}, [%6] \n"
  "vtbl.8 d4, {d0, d1}, d6 \n"
  "vtbl.8 d5, {d0, d1}, d7 \n"
  "vtbl.8 d0, {d2, d3}, d6 \n"
  "vtbl.8 d1, {d2, d3}, d7 \n"
  // TODO(frkoenig): Rework shuffle above to
  // write out with 4 instead of 8 writes.
- MEMACCESS(0)
  "vst1.32 {d4[0]}, [%0], %4 \n"
- MEMACCESS(0)
  "vst1.32 {d4[1]}, [%0], %4 \n"
- MEMACCESS(0)
  "vst1.32 {d5[0]}, [%0], %4 \n"
- MEMACCESS(0)
  "vst1.32 {d5[1]}, [%0] \n"
  "add %0, %3, #4 \n"
- MEMACCESS(0)
  "vst1.32 {d0[0]}, [%0], %4 \n"
- MEMACCESS(0)
  "vst1.32 {d0[1]}, [%0], %4 \n"
- MEMACCESS(0)
  "vst1.32 {d1[0]}, [%0], %4 \n"
- MEMACCESS(0)
  "vst1.32 {d1[1]}, [%0] \n"
  "add %1, #4 \n" // src += 4
  "add %3, %3, %4, lsl #2 \n" // dst += 4 * dst_stride
  "subs %5, #4 \n" // w -= 4
  "beq 4f \n"
  // some residual, check to see if it includes a 2x8 block,
  // or less
  "cmp %5, #2 \n"
  "blt 3f \n"
  // 2x8 block
  "2: \n"
  "mov %0, %1 \n"
- MEMACCESS(0)
  "vld1.16 {d0[0]}, [%0], %2 \n"
- MEMACCESS(0)
  "vld1.16 {d1[0]}, [%0], %2 \n"
- MEMACCESS(0)
  "vld1.16 {d0[1]}, [%0], %2 \n"
- MEMACCESS(0)
  "vld1.16 {d1[1]}, [%0], %2 \n"
- MEMACCESS(0)
  "vld1.16 {d0[2]}, [%0], %2 \n"
- MEMACCESS(0)
  "vld1.16 {d1[2]}, [%0], %2 \n"
- MEMACCESS(0)
  "vld1.16 {d0[3]}, [%0], %2 \n"
- MEMACCESS(0)
  "vld1.16 {d1[3]}, [%0] \n"
  "vtrn.8 d0, d1 \n"
  "mov %0, %3 \n"
- MEMACCESS(0)
  "vst1.64 {d0}, [%0], %4 \n"
- MEMACCESS(0)
  "vst1.64 {d1}, [%0] \n"
  "add %1, #2 \n" // src += 2
  "add %3, %3, %4, lsl #1 \n" // dst += 2 * dst_stride
  "subs %5, #2 \n" // w -= 2
  "beq 4f \n"
  // 1x8 block
  "3: \n"
- MEMACCESS(1)
  "vld1.8 {d0[0]}, [%1], %2 \n"
- MEMACCESS(1)
  "vld1.8 {d0[1]}, [%1], %2 \n"
- MEMACCESS(1)
  "vld1.8 {d0[2]}, [%1], %2 \n"
- MEMACCESS(1)
  "vld1.8 {d0[3]}, [%1], %2 \n"
- MEMACCESS(1)
  "vld1.8 {d0[4]}, [%1], %2 \n"
- MEMACCESS(1)
  "vld1.8 {d0[5]}, [%1], %2 \n"
- MEMACCESS(1)
  "vld1.8 {d0[6]}, [%1], %2 \n"
- MEMACCESS(1)
  "vld1.8 {d0[7]}, [%1] \n"
- MEMACCESS(3)
  "vst1.64 {d0}, [%3] \n"
  "4: \n"
  : "=&r"(src_temp), // %0
  "+r"(src), // %1
  "+r"(src_stride), // %2
  "+r"(dst), // %3
  "+r"(dst_stride), // %4
  "+r"(width) // %5
  : "r"(&kVTbl4x4Transpose) // %6
- : "memory", "cc", "q0", "q1", "q2", "q3"
- );
+ : "memory", "cc", "q0", "q1", "q2", "q3");
  }
  static uvec8 kVTbl4x4TransposeDi = {0, 8, 1, 9, 2, 10, 3, 11,
@@ -253,31 +200,23 @@ void TransposeUVWx8_NEON(const uint8* src,
  int dst_stride_b,
  int width) {
  const uint8* src_temp;
- asm volatile (
+ asm volatile(
  // loops are on blocks of 8. loop will stop when
  // counter gets to or below 0. starting the counter
  // at w-8 allow for this
  "sub %7, #8 \n"
  // handle 8x8 blocks. this should be the majority of the plane
  "1: \n"
  "mov %0, %1 \n"
- MEMACCESS(0)
  "vld2.8 {d0, d1}, [%0], %2 \n"
- MEMACCESS(0)
  "vld2.8 {d2, d3}, [%0], %2 \n"
- MEMACCESS(0)
  "vld2.8 {d4, d5}, [%0], %2 \n"
- MEMACCESS(0)
  "vld2.8 {d6, d7}, [%0], %2 \n"
- MEMACCESS(0)
  "vld2.8 {d16, d17}, [%0], %2 \n"
- MEMACCESS(0)
  "vld2.8 {d18, d19}, [%0], %2 \n"
- MEMACCESS(0)
  "vld2.8 {d20, d21}, [%0], %2 \n"
- MEMACCESS(0)
  "vld2.8 {d22, d23}, [%0] \n"
  "vtrn.8 q1, q0 \n"
@@ -306,40 +245,24 @@ void TransposeUVWx8_NEON(const uint8* src,
  "mov %0, %3 \n"
- MEMACCESS(0)
  "vst1.8 {d2}, [%0], %4 \n"
- MEMACCESS(0)
  "vst1.8 {d0}, [%0], %4 \n"
- MEMACCESS(0)
  "vst1.8 {d6}, [%0], %4 \n"
- MEMACCESS(0)
  "vst1.8 {d4}, [%0], %4 \n"
- MEMACCESS(0)
  "vst1.8 {d18}, [%0], %4 \n"
- MEMACCESS(0)
  "vst1.8 {d16}, [%0], %4 \n"
- MEMACCESS(0)
  "vst1.8 {d22}, [%0], %4 \n"
- MEMACCESS(0)
  "vst1.8 {d20}, [%0] \n"
  "mov %0, %5 \n"
- MEMACCESS(0)
  "vst1.8 {d3}, [%0], %6 \n"
- MEMACCESS(0)
  "vst1.8 {d1}, [%0], %6 \n"
- MEMACCESS(0)
  "vst1.8 {d7}, [%0], %6 \n"
- MEMACCESS(0)
  "vst1.8 {d5}, [%0], %6 \n"
- MEMACCESS(0)
  "vst1.8 {d19}, [%0], %6 \n"
- MEMACCESS(0)
  "vst1.8 {d17}, [%0], %6 \n"
- MEMACCESS(0)
  "vst1.8 {d23}, [%0], %6 \n"
- MEMACCESS(0)
  "vst1.8 {d21}, [%0] \n"
  "add %1, #8*2 \n" // src += 8*2
@@ -348,187 +271,142 @@ void TransposeUVWx8_NEON(const uint8* src,
  "subs %7, #8 \n" // w -= 8
  "bge 1b \n"
  // add 8 back to counter. if the result is 0 there are
  // no residuals.
  "adds %7, #8 \n"
  "beq 4f \n"
  // some residual, so between 1 and 7 lines left to transpose
  "cmp %7, #2 \n"
  "blt 3f \n"
  "cmp %7, #4 \n"
  "blt 2f \n"
  // TODO(frkoenig): Clean this up
  // 4x8 block
  "mov %0, %1 \n"
- MEMACCESS(0)
  "vld1.64 {d0}, [%0], %2 \n"
- MEMACCESS(0)
  "vld1.64 {d1}, [%0], %2 \n"
- MEMACCESS(0)
  "vld1.64 {d2}, [%0], %2 \n"
- MEMACCESS(0)
  "vld1.64 {d3}, [%0], %2 \n"
- MEMACCESS(0)
  "vld1.64 {d4}, [%0], %2 \n"
- MEMACCESS(0)
  "vld1.64 {d5}, [%0], %2 \n"
- MEMACCESS(0)
  "vld1.64 {d6}, [%0], %2 \n"
- MEMACCESS(0)
  "vld1.64 {d7}, [%0] \n"
- MEMACCESS(8)
  "vld1.8 {q15}, [%8] \n"
  "vtrn.8 q0, q1 \n"
  "vtrn.8 q2, q3 \n"
  "vtbl.8 d16, {d0, d1}, d30 \n"
  "vtbl.8 d17, {d0, d1}, d31 \n"
  "vtbl.8 d18, {d2, d3}, d30 \n"
  "vtbl.8 d19, {d2, d3}, d31 \n"
  "vtbl.8 d20, {d4, d5}, d30 \n"
  "vtbl.8 d21, {d4, d5}, d31 \n"
  "vtbl.8 d22, {d6, d7}, d30 \n"
  "vtbl.8 d23, {d6, d7}, d31 \n"
  "mov %0, %3 \n"
- MEMACCESS(0)
  "vst1.32 {d16[0]}, [%0], %4 \n"
- MEMACCESS(0)
  "vst1.32 {d16[1]}, [%0], %4 \n"
- MEMACCESS(0)
  "vst1.32 {d17[0]}, [%0], %4 \n"
- MEMACCESS(0)
  "vst1.32 {d17[1]}, [%0], %4 \n"
  "add %0, %3, #4 \n"
- MEMACCESS(0)
  "vst1.32 {d20[0]}, [%0], %4 \n"
- MEMACCESS(0)
  "vst1.32 {d20[1]}, [%0], %4 \n"
- MEMACCESS(0)
  "vst1.32 {d21[0]}, [%0], %4 \n"
- MEMACCESS(0)
  "vst1.32 {d21[1]}, [%0] \n"
  "mov %0, %5 \n"
- MEMACCESS(0)
  "vst1.32 {d18[0]}, [%0], %6 \n"
- MEMACCESS(0)
  "vst1.32 {d18[1]}, [%0], %6 \n"
- MEMACCESS(0)
  "vst1.32 {d19[0]}, [%0], %6 \n"
- MEMACCESS(0)
  "vst1.32 {d19[1]}, [%0], %6 \n"
  "add %0, %5, #4 \n"
- MEMACCESS(0)
  "vst1.32 {d22[0]}, [%0], %6 \n"
- MEMACCESS(0)
  "vst1.32 {d22[1]}, [%0], %6 \n"
- MEMACCESS(0)
  "vst1.32 {d23[0]}, [%0], %6 \n"
- MEMACCESS(0)
  "vst1.32 {d23[1]}, [%0] \n"
  "add %1, #4*2 \n" // src += 4 * 2
  "add %3, %3, %4, lsl #2 \n" // dst_a += 4 * dst_stride_a
  "add %5, %5, %6, lsl #2 \n" // dst_b += 4 * dst_stride_b
  "subs %7, #4 \n" // w -= 4
  "beq 4f \n"
  // some residual, check to see if it includes a 2x8 block,
  // or less
  "cmp %7, #2 \n"
  "blt 3f \n"
  // 2x8 block
  "2: \n"
  "mov %0, %1 \n"
- MEMACCESS(0)
  "vld2.16 {d0[0], d2[0]}, [%0], %2 \n"
- MEMACCESS(0)
  "vld2.16 {d1[0], d3[0]}, [%0], %2 \n"
- MEMACCESS(0)
  "vld2.16 {d0[1], d2[1]}, [%0], %2 \n"
- MEMACCESS(0)
  "vld2.16 {d1[1], d3[1]}, [%0], %2 \n"
- MEMACCESS(0)
  "vld2.16 {d0[2], d2[2]}, [%0], %2 \n"
- MEMACCESS(0)
  "vld2.16 {d1[2], d3[2]}, [%0], %2 \n"
- MEMACCESS(0)
  "vld2.16 {d0[3], d2[3]}, [%0], %2 \n"
- MEMACCESS(0)
  "vld2.16 {d1[3], d3[3]}, [%0] \n"
  "vtrn.8 d0, d1 \n"
  "vtrn.8 d2, d3 \n"
  "mov %0, %3 \n"
- MEMACCESS(0)
  "vst1.64 {d0}, [%0], %4 \n"
- MEMACCESS(0)
  "vst1.64 {d2}, [%0] \n"
  "mov %0, %5 \n"
- MEMACCESS(0)
  "vst1.64 {d1}, [%0], %6 \n"
- MEMACCESS(0)
  "vst1.64 {d3}, [%0] \n"
  "add %1, #2*2 \n" // src += 2 * 2
  "add %3, %3, %4, lsl #1 \n" // dst_a += 2 * dst_stride_a
  "add %5, %5, %6, lsl #1 \n" // dst_b += 2 * dst_stride_b
  "subs %7, #2 \n" // w -= 2
  "beq 4f \n"
  // 1x8 block
  "3: \n"
- MEMACCESS(1)
  "vld2.8 {d0[0], d1[0]}, [%1], %2 \n"
- MEMACCESS(1)
  "vld2.8 {d0[1], d1[1]}, [%1], %2 \n"
- MEMACCESS(1)
  "vld2.8 {d0[2], d1[2]}, [%1], %2 \n"
- MEMACCESS(1)
  "vld2.8 {d0[3], d1[3]}, [%1], %2 \n"
- MEMACCESS(1)
  "vld2.8 {d0[4], d1[4]}, [%1], %2 \n"
- MEMACCESS(1)
  "vld2.8 {d0[5], d1[5]}, [%1], %2 \n"
- MEMACCESS(1)
  "vld2.8 {d0[6], d1[6]}, [%1], %2 \n"
- MEMACCESS(1)
  "vld2.8 {d0[7], d1[7]}, [%1] \n"
- MEMACCESS(3)
  "vst1.64 {d0}, [%3] \n"
- MEMACCESS(5)
  "vst1.64 {d1}, [%5] \n"
  "4: \n"
  : "=&r"(src_temp), // %0
  "+r"(src), // %1
  "+r"(src_stride), // %2
  "+r"(dst_a), // %3
  "+r"(dst_stride_a), // %4
  "+r"(dst_b), // %5
  "+r"(dst_stride_b), // %6
  "+r"(width) // %7
  : "r"(&kVTbl4x4TransposeDi) // %8
- : "memory", "cc",
- "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
- );
+ : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
  }
  #endif // defined(__ARM_NEON__) && !defined(__aarch64__)

source/rotate_neon64.cc

@@ -34,27 +34,19 @@ void TransposeWx8_NEON(const uint8* src,
  // loops are on blocks of 8. loop will stop when
  // counter gets to or below 0. starting the counter
  // at w-8 allow for this
  "sub %w3, %w3, #8 \n"
  // handle 8x8 blocks. this should be the majority of the plane
  "1: \n"
  "mov %0, %1 \n"
- MEMACCESS(0)
  "ld1 {v0.8b}, [%0], %5 \n"
- MEMACCESS(0)
  "ld1 {v1.8b}, [%0], %5 \n"
- MEMACCESS(0)
  "ld1 {v2.8b}, [%0], %5 \n"
- MEMACCESS(0)
  "ld1 {v3.8b}, [%0], %5 \n"
- MEMACCESS(0)
  "ld1 {v4.8b}, [%0], %5 \n"
- MEMACCESS(0)
  "ld1 {v5.8b}, [%0], %5 \n"
- MEMACCESS(0)
  "ld1 {v6.8b}, [%0], %5 \n"
- MEMACCESS(0)
  "ld1 {v7.8b}, [%0] \n"
  "trn2 v16.8b, v0.8b, v1.8b \n"
@@ -86,31 +78,23 @@ void TransposeWx8_NEON(const uint8* src,
  "mov %0, %2 \n"
- MEMACCESS(0)
  "st1 {v17.8b}, [%0], %6 \n"
- MEMACCESS(0)
  "st1 {v16.8b}, [%0], %6 \n"
- MEMACCESS(0)
  "st1 {v19.8b}, [%0], %6 \n"
- MEMACCESS(0)
  "st1 {v18.8b}, [%0], %6 \n"
- MEMACCESS(0)
  "st1 {v21.8b}, [%0], %6 \n"
- MEMACCESS(0)
  "st1 {v20.8b}, [%0], %6 \n"
- MEMACCESS(0)
  "st1 {v23.8b}, [%0], %6 \n"
- MEMACCESS(0)
  "st1 {v22.8b}, [%0] \n"
  "add %1, %1, #8 \n" // src += 8
  "add %2, %2, %6, lsl #3 \n" // dst += 8 * dst_stride
  "subs %w3, %w3, #8 \n" // w -= 8
  "b.ge 1b \n"
  // add 8 back to counter. if the result is 0 there are
  // no residuals.
  "adds %w3, %w3, #8 \n"
  "b.eq 4f \n"
  // some residual, so between 1 and 7 lines left to transpose
@@ -122,26 +106,17 @@ void TransposeWx8_NEON(const uint8* src,
  // 4x8 block
  "mov %0, %1 \n"
- MEMACCESS(0)
  "ld1 {v0.s}[0], [%0], %5 \n"
- MEMACCESS(0)
  "ld1 {v0.s}[1], [%0], %5 \n"
- MEMACCESS(0)
  "ld1 {v0.s}[2], [%0], %5 \n"
- MEMACCESS(0)
  "ld1 {v0.s}[3], [%0], %5 \n"
- MEMACCESS(0)
  "ld1 {v1.s}[0], [%0], %5 \n"
- MEMACCESS(0)
  "ld1 {v1.s}[1], [%0], %5 \n"
- MEMACCESS(0)
  "ld1 {v1.s}[2], [%0], %5 \n"
- MEMACCESS(0)
  "ld1 {v1.s}[3], [%0] \n"
  "mov %0, %2 \n"
- MEMACCESS(4)
  "ld1 {v2.16b}, [%4] \n"
  "tbl v3.16b, {v0.16b}, v2.16b \n"
@@ -149,53 +124,37 @@ void TransposeWx8_NEON(const uint8* src,
  // TODO(frkoenig): Rework shuffle above to
  // write out with 4 instead of 8 writes.
- MEMACCESS(0)
  "st1 {v3.s}[0], [%0], %6 \n"
- MEMACCESS(0)
  "st1 {v3.s}[1], [%0], %6 \n"
- MEMACCESS(0)
  "st1 {v3.s}[2], [%0], %6 \n"
- MEMACCESS(0)
  "st1 {v3.s}[3], [%0] \n"
  "add %0, %2, #4 \n"
- MEMACCESS(0)
  "st1 {v0.s}[0], [%0], %6 \n"
- MEMACCESS(0)
  "st1 {v0.s}[1], [%0], %6 \n"
- MEMACCESS(0)
  "st1 {v0.s}[2], [%0], %6 \n"
- MEMACCESS(0)
  "st1 {v0.s}[3], [%0] \n"
  "add %1, %1, #4 \n" // src += 4
  "add %2, %2, %6, lsl #2 \n" // dst += 4 * dst_stride
  "subs %w3, %w3, #4 \n" // w -= 4
  "b.eq 4f \n"
  // some residual, check to see if it includes a 2x8 block,
  // or less
  "cmp %w3, #2 \n"
  "b.lt 3f \n"
  // 2x8 block
  "2: \n"
  "mov %0, %1 \n"
- MEMACCESS(0)
  "ld1 {v0.h}[0], [%0], %5 \n"
- MEMACCESS(0)
  "ld1 {v1.h}[0], [%0], %5 \n"
- MEMACCESS(0)
  "ld1 {v0.h}[1], [%0], %5 \n"
- MEMACCESS(0)
  "ld1 {v1.h}[1], [%0], %5 \n"
- MEMACCESS(0)
  "ld1 {v0.h}[2], [%0], %5 \n"
- MEMACCESS(0)
  "ld1 {v1.h}[2], [%0], %5 \n"
- MEMACCESS(0)
  "ld1 {v0.h}[3], [%0], %5 \n"
- MEMACCESS(0)
  "ld1 {v1.h}[3], [%0] \n"
  "trn2 v2.8b, v0.8b, v1.8b \n"
@@ -203,36 +162,25 @@ void TransposeWx8_NEON(const uint8* src,
  "mov %0, %2 \n"
- MEMACCESS(0)
  "st1 {v3.8b}, [%0], %6 \n"
- MEMACCESS(0)
  "st1 {v2.8b}, [%0] \n"
  "add %1, %1, #2 \n" // src += 2
  "add %2, %2, %6, lsl #1 \n" // dst += 2 * dst_stride
  "subs %w3, %w3, #2 \n" // w -= 2
  "b.eq 4f \n"
  // 1x8 block
  "3: \n"
- MEMACCESS(1)
  "ld1 {v0.b}[0], [%1], %5 \n"
- MEMACCESS(1)
  "ld1 {v0.b}[1], [%1], %5 \n"
- MEMACCESS(1)
  "ld1 {v0.b}[2], [%1], %5 \n"
- MEMACCESS(1)
  "ld1 {v0.b}[3], [%1], %5 \n"
- MEMACCESS(1)
  "ld1 {v0.b}[4], [%1], %5 \n"
- MEMACCESS(1)
  "ld1 {v0.b}[5], [%1], %5 \n"
- MEMACCESS(1)
  "ld1 {v0.b}[6], [%1], %5 \n"
- MEMACCESS(1)
  "ld1 {v0.b}[7], [%1] \n"
- MEMACCESS(2)
  "st1 {v0.8b}, [%2] \n"
  "4: \n"
@@ -265,27 +213,19 @@ void TransposeUVWx8_NEON(const uint8* src,
  // loops are on blocks of 8. loop will stop when
  // counter gets to or below 0. starting the counter
  // at w-8 allow for this
  "sub %w4, %w4, #8 \n"
  // handle 8x8 blocks. this should be the majority of the plane
  "1: \n"
  "mov %0, %1 \n"
- MEMACCESS(0)
  "ld1 {v0.16b}, [%0], %5 \n"
- MEMACCESS(0)
  "ld1 {v1.16b}, [%0], %5 \n"
- MEMACCESS(0)
  "ld1 {v2.16b}, [%0], %5 \n"
- MEMACCESS(0)
  "ld1 {v3.16b}, [%0], %5 \n"
- MEMACCESS(0)
  "ld1 {v4.16b}, [%0], %5 \n"
- MEMACCESS(0)
  "ld1 {v5.16b}, [%0], %5 \n"
- MEMACCESS(0)
  "ld1 {v6.16b}, [%0], %5 \n"
- MEMACCESS(0)
  "ld1 {v7.16b}, [%0] \n"
  "trn1 v16.16b, v0.16b, v1.16b \n"
@@ -317,81 +257,56 @@ void TransposeUVWx8_NEON(const uint8* src,
  "mov %0, %2 \n"
- MEMACCESS(0)
  "st1 {v16.d}[0], [%0], %6 \n"
- MEMACCESS(0)
  "st1 {v18.d}[0], [%0], %6 \n"
- MEMACCESS(0)
  "st1 {v17.d}[0], [%0], %6 \n"
- MEMACCESS(0)
  "st1 {v19.d}[0], [%0], %6 \n"
- MEMACCESS(0)
  "st1 {v16.d}[1], [%0], %6 \n"
- MEMACCESS(0)
  "st1 {v18.d}[1], [%0], %6 \n"
- MEMACCESS(0)
  "st1 {v17.d}[1], [%0], %6 \n"
- MEMACCESS(0)
  "st1 {v19.d}[1], [%0] \n"
  "mov %0, %3 \n"
- MEMACCESS(0)
  "st1 {v20.d}[0], [%0], %7 \n"
- MEMACCESS(0)
  "st1 {v22.d}[0], [%0], %7 \n"
- MEMACCESS(0)
  "st1 {v21.d}[0], [%0], %7 \n"
- MEMACCESS(0)
  "st1 {v23.d}[0], [%0], %7 \n"
- MEMACCESS(0)
  "st1 {v20.d}[1], [%0], %7 \n"
- MEMACCESS(0)
  "st1 {v22.d}[1], [%0], %7 \n"
- MEMACCESS(0)
  "st1 {v21.d}[1], [%0], %7 \n"
- MEMACCESS(0)
  "st1 {v23.d}[1], [%0] \n"
  "add %1, %1, #16 \n" // src += 8*2
  "add %2, %2, %6, lsl #3 \n" // dst_a += 8 * dst_stride_a
  "add %3, %3, %7, lsl #3 \n" // dst_b += 8 * dst_stride_b
  "subs %w4, %w4, #8 \n" // w -= 8
  "b.ge 1b \n"
  // add 8 back to counter. if the result is 0 there are
  // no residuals.
  "adds %w4, %w4, #8 \n"
  "b.eq 4f \n"
  // some residual, so between 1 and 7 lines left to transpose
  "cmp %w4, #2 \n"
  "b.lt 3f \n"
  "cmp %w4, #4 \n"
  "b.lt 2f \n"
  // TODO(frkoenig): Clean this up
  // 4x8 block
  "mov %0, %1 \n"
- MEMACCESS(0)
  "ld1 {v0.8b}, [%0], %5 \n"
- MEMACCESS(0)
  "ld1 {v1.8b}, [%0], %5 \n"
- MEMACCESS(0)
  "ld1 {v2.8b}, [%0], %5 \n"
- MEMACCESS(0)
  "ld1 {v3.8b}, [%0], %5 \n"
- MEMACCESS(0)
  "ld1 {v4.8b}, [%0], %5 \n"
- MEMACCESS(0)
  "ld1 {v5.8b}, [%0], %5 \n"
- MEMACCESS(0)
  "ld1 {v6.8b}, [%0], %5 \n"
- MEMACCESS(0)
  "ld1 {v7.8b}, [%0] \n"
- MEMACCESS(8)
  "ld1 {v30.16b}, [%8], #16 \n"
  "ld1 {v31.16b}, [%8] \n"
@@ -402,75 +317,51 @@ void TransposeUVWx8_NEON(const uint8* src,
  "mov %0, %2 \n"
- MEMACCESS(0)
  "st1 {v16.s}[0], [%0], %6 \n"
- MEMACCESS(0)
  "st1 {v16.s}[1], [%0], %6 \n"
- MEMACCESS(0)
  "st1 {v16.s}[2], [%0], %6 \n"
- MEMACCESS(0)
  "st1 {v16.s}[3], [%0], %6 \n"
  "add %0, %2, #4 \n"
- MEMACCESS(0)
  "st1 {v18.s}[0], [%0], %6 \n"
- MEMACCESS(0)
  "st1 {v18.s}[1], [%0], %6 \n"
- MEMACCESS(0)
  "st1 {v18.s}[2], [%0], %6 \n"
- MEMACCESS(0)
  "st1 {v18.s}[3], [%0] \n"
  "mov %0, %3 \n"
- MEMACCESS(0)
  "st1 {v17.s}[0], [%0], %7 \n"
- MEMACCESS(0)
  "st1 {v17.s}[1], [%0], %7 \n"
- MEMACCESS(0)
  "st1 {v17.s}[2], [%0], %7 \n"
- MEMACCESS(0)
  "st1 {v17.s}[3], [%0], %7 \n"
  "add %0, %3, #4 \n"
- MEMACCESS(0)
  "st1 {v19.s}[0], [%0], %7 \n"
- MEMACCESS(0)
  "st1 {v19.s}[1], [%0], %7 \n"
- MEMACCESS(0)
  "st1 {v19.s}[2], [%0], %7 \n"
- MEMACCESS(0)
  "st1 {v19.s}[3], [%0] \n"
  "add %1, %1, #8 \n" // src += 4 * 2
  "add %2, %2, %6, lsl #2 \n" // dst_a += 4 * dst_stride_a
  "add %3, %3, %7, lsl #2 \n" // dst_b += 4 * dst_stride_b
  "subs %w4, %w4, #4 \n" // w -= 4
  "b.eq 4f \n"
  // some residual, check to see if it includes a 2x8 block,
  // or less
  "cmp %w4, #2 \n"
  "b.lt 3f \n"
  // 2x8 block
  "2: \n"
  "mov %0, %1 \n"
- MEMACCESS(0)
  "ld2 {v0.h, v1.h}[0], [%0], %5 \n"
- MEMACCESS(0)
  "ld2 {v2.h, v3.h}[0], [%0], %5 \n"
- MEMACCESS(0)
  "ld2 {v0.h, v1.h}[1], [%0], %5 \n"
- MEMACCESS(0)
  "ld2 {v2.h, v3.h}[1], [%0], %5 \n"
- MEMACCESS(0)
  "ld2 {v0.h, v1.h}[2], [%0], %5 \n"
- MEMACCESS(0)
  "ld2 {v2.h, v3.h}[2], [%0], %5 \n"
- MEMACCESS(0)
  "ld2 {v0.h, v1.h}[3], [%0], %5 \n"
- MEMACCESS(0)
  "ld2 {v2.h, v3.h}[3], [%0] \n"
  "trn1 v4.8b, v0.8b, v2.8b \n"
@@ -480,46 +371,32 @@ void TransposeUVWx8_NEON(const uint8* src,
  "mov %0, %2 \n"
- MEMACCESS(0)
  "st1 {v4.d}[0], [%0], %6 \n"
- MEMACCESS(0)
  "st1 {v6.d}[0], [%0] \n"
  "mov %0, %3 \n"
- MEMACCESS(0)
  "st1 {v5.d}[0], [%0], %7 \n"
- MEMACCESS(0)
  "st1 {v7.d}[0], [%0] \n"
  "add %1, %1, #4 \n" // src += 2 * 2
  "add %2, %2, %6, lsl #1 \n" // dst_a += 2 * dst_stride_a
  "add %3, %3, %7, lsl #1 \n" // dst_b += 2 * dst_stride_b
  "subs %w4, %w4, #2 \n" // w -= 2
  "b.eq 4f \n"
  // 1x8 block
  "3: \n"
- MEMACCESS(1)
  "ld2 {v0.b, v1.b}[0], [%1], %5 \n"
- MEMACCESS(1)
  "ld2 {v0.b, v1.b}[1], [%1], %5 \n"
- MEMACCESS(1)
  "ld2 {v0.b, v1.b}[2], [%1], %5 \n"
- MEMACCESS(1)
  "ld2 {v0.b, v1.b}[3], [%1], %5 \n"
- MEMACCESS(1)
  "ld2 {v0.b, v1.b}[4], [%1], %5 \n"
- MEMACCESS(1)
  "ld2 {v0.b, v1.b}[5], [%1], %5 \n"
- MEMACCESS(1)
  "ld2 {v0.b, v1.b}[6], [%1], %5 \n"
- MEMACCESS(1)
  "ld2 {v0.b, v1.b}[7], [%1] \n"
- MEMACCESS(2)
  "st1 {v0.d}[0], [%2] \n"
- MEMACCESS(3)
  "st1 {v1.d}[0], [%3] \n"
  "4: \n"

source/row_neon.cc: file diff suppressed because it is too large

source/row_neon64.cc: file diff suppressed because it is too large

source/scale_any.cc

@@ -50,7 +50,7 @@ CANY(ScaleARGBFilterCols_Any_NEON,
  #define SDANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \
  void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, \
  int dst_width) { \
  int r = (int)((unsigned int)dst_width % (MASK + 1)); /* NOLINT */ \
  int n = dst_width - r; \
  if (n > 0) { \
  SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \
@@ -65,7 +65,7 @@ CANY(ScaleARGBFilterCols_Any_NEON,
  #define SDODD(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \
  void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, \
  int dst_width) { \
  int r = (int)((unsigned int)(dst_width - 1) % (MASK + 1)); /* NOLINT */ \
  int n = (dst_width - 1) - r; \
  if (n > 0) { \
  SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \

source/scale_neon.cc: file diff suppressed because it is too large

source/scale_neon64.cc

@@ -29,10 +29,8 @@ void ScaleRowDown2_NEON(const uint8* src_ptr,
  asm volatile (
  "1: \n"
  // load even pixels into v0, odd into v1
- MEMACCESS(0)
  "ld2 {v0.16b,v1.16b}, [%0], #32 \n"
  "subs %w2, %w2, #16 \n" // 16 processed per loop
- MEMACCESS(1)
  "st1 {v1.16b}, [%1], #16 \n" // store odd pixels
  "b.gt 1b \n"
  : "+r"(src_ptr), // %0
@@ -51,14 +49,12 @@ void ScaleRowDown2Linear_NEON(const uint8* src_ptr,
  (void)src_stride;
  asm volatile (
  "1: \n"
- MEMACCESS(0)
  "ld1 {v0.16b,v1.16b}, [%0], #32 \n" // load pixels and post inc
  "subs %w2, %w2, #16 \n" // 16 processed per loop
  "uaddlp v0.8h, v0.16b \n" // add adjacent
  "uaddlp v1.8h, v1.16b \n"
  "rshrn v0.8b, v0.8h, #1 \n" // downshift, round and pack
  "rshrn2 v0.16b, v1.8h, #1 \n"
- MEMACCESS(1)
  "st1 {v0.16b}, [%1], #16 \n"
  "b.gt 1b \n"
  : "+r"(src_ptr), // %0
@@ -78,9 +74,7 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr,
  // change the stride to row 2 pointer
  "add %1, %1, %0 \n"
  "1: \n"
- MEMACCESS(0)
- "ld1 {v0.16b,v1.16b}, [%0], #32 \n" // load row 1 and post inc
+ "ld1 {v0.16b, v1.16b}, [%0], #32 \n" // load row 1 and post inc
- MEMACCESS(1)
  "ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc
  "subs %w3, %w3, #16 \n" // 16 processed per loop
  "uaddlp v0.8h, v0.16b \n" // row 1 add adjacent
@@ -89,7 +83,6 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr,
  "uadalp v1.8h, v3.16b \n"
  "rshrn v0.8b, v0.8h, #2 \n" // downshift, round and pack
  "rshrn2 v0.16b, v1.8h, #2 \n"
- MEMACCESS(2)
  "st1 {v0.16b}, [%2], #16 \n"
  "b.gt 1b \n"
  : "+r"(src_ptr), // %0
@@ -108,10 +101,8 @@ void ScaleRowDown4_NEON(const uint8* src_ptr,
  (void)src_stride;
  asm volatile (
  "1: \n"
- MEMACCESS(0)
  "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
  "subs %w2, %w2, #8 \n" // 8 processed per loop
- MEMACCESS(1)
  "st1 {v2.8b}, [%1], #8 \n"
  "b.gt 1b \n"
  : "+r"(src_ptr), // %0
@@ -131,13 +122,9 @@ void ScaleRowDown4Box_NEON(const uint8* src_ptr,
  const uint8* src_ptr3 = src_ptr + src_stride * 3;
  asm volatile (
  "1: \n"
- MEMACCESS(0)
  "ld1 {v0.16b}, [%0], #16 \n" // load up 16x4
- MEMACCESS(3)
  "ld1 {v1.16b}, [%2], #16 \n"
- MEMACCESS(4)
  "ld1 {v2.16b}, [%3], #16 \n"
- MEMACCESS(5)
  "ld1 {v3.16b}, [%4], #16 \n"
  "subs %w5, %w5, #4 \n"
  "uaddlp v0.8h, v0.16b \n"
@@ -146,7 +133,6 @@ void ScaleRowDown4Box_NEON(const uint8* src_ptr,
  "uadalp v0.8h, v3.16b \n"
  "addp v0.8h, v0.8h, v0.8h \n"
  "rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding
- MEMACCESS(1)
  "st1 {v0.s}[0], [%1], #4 \n"
  "b.gt 1b \n"
  : "+r"(src_ptr), // %0
@@ -170,11 +156,9 @@ void ScaleRowDown34_NEON(const uint8* src_ptr,
  (void)src_stride;
  asm volatile (
  "1: \n"
- MEMACCESS(0)
  "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
  "subs %w2, %w2, #24 \n"
  "orr v2.16b, v3.16b, v3.16b \n" // order v0, v1, v2
- MEMACCESS(1)
  "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
  "b.gt 1b \n"
  : "+r"(src_ptr), // %0
@@ -193,9 +177,7 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
  "movi v20.8b, #3 \n"
  "add %3, %3, %0 \n"
  "1: \n"
- MEMACCESS(0)
  "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
- MEMACCESS(3)
  "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
  "subs %w2, %w2, #24 \n"
@@ -232,8 +214,7 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
  "umlal v16.8h, v3.8b, v20.8b \n"
  "uqrshrn v2.8b, v16.8h, #2 \n"
- MEMACCESS(1)
  "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
  "b.gt 1b \n"
  : "+r"(src_ptr), // %0
@@ -254,10 +235,8 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
  "movi v20.8b, #3 \n"
  "add %3, %3, %0 \n"
  "1: \n"
- MEMACCESS(0)
  "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
- MEMACCESS(3)
  "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
  "subs %w2, %w2, #24 \n"
  // average src line 0 with src line 1
  "urhadd v0.8b, v0.8b, v4.8b \n"
@@ -278,8 +257,7 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
  "umlal v4.8h, v3.8b, v20.8b \n"
  "uqrshrn v2.8b, v4.8h, #2 \n"
- MEMACCESS(1)
  "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
  "b.gt 1b \n"
  : "+r"(src_ptr), // %0
  "+r"(dst_ptr), // %1
@@ -305,16 +283,12 @@ void ScaleRowDown38_NEON(const uint8* src_ptr,
  int dst_width) {
  (void)src_stride;
  asm volatile (
- MEMACCESS(3)
  "ld1 {v3.16b}, [%3] \n"
  "1: \n"
- MEMACCESS(0)
  "ld1 {v0.16b,v1.16b}, [%0], #32 \n"
  "subs %w2, %w2, #12 \n"
  "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n"
- MEMACCESS(1)
  "st1 {v2.8b}, [%1], #8 \n"
- MEMACCESS(1)
  "st1 {v2.s}[2], [%1], #4 \n"
  "b.gt 1b \n"
  : "+r"(src_ptr), // %0
@@ -334,11 +308,8 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
  ptrdiff_t tmp_src_stride = src_stride;
  asm volatile (
- MEMACCESS(5)
  "ld1 {v29.8h}, [%5] \n"
- MEMACCESS(6)
  "ld1 {v30.16b}, [%6] \n"
- MEMACCESS(7)
  "ld1 {v31.8h}, [%7] \n"
  "add %2, %2, %0 \n"
  "1: \n"
@@ -347,12 +318,9 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
  // 10 50 11 51 12 52 13 53
  // 20 60 21 61 22 62 23 63
  // 30 70 31 71 32 72 33 73
- MEMACCESS(0)
  "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
- MEMACCESS(3)
  "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
- MEMACCESS(4)
  "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n"
  "subs %w4, %w4, #12 \n"
  // Shuffle the input data around to get align the data
@@ -436,9 +404,7 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
  // be adjacent
  "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n"
- MEMACCESS(1)
  "st1 {v3.8b}, [%1], #8 \n"
- MEMACCESS(1)
  "st1 {v3.s}[2], [%1], #4 \n"
  "b.gt 1b \n"
  : "+r"(src_ptr), // %0
@@ -463,9 +429,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
  // TODO(fbarchard): use src_stride directly for clang 3.5+.
  ptrdiff_t tmp_src_stride = src_stride;
  asm volatile (
- MEMACCESS(4)
  "ld1 {v30.8h}, [%4] \n"
- MEMACCESS(5)
  "ld1 {v31.16b}, [%5] \n"
  "add %2, %2, %0 \n"
  "1: \n"
@@ -474,10 +438,8 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
  // 10 50 11 51 12 52 13 53
  // 20 60 21 61 22 62 23 63
  // 30 70 31 71 32 72 33 73
- MEMACCESS(0)
  "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
- MEMACCESS(3)
  "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
  "subs %w3, %w3, #12 \n"
  // Shuffle the input data around to get align the data
@@ -547,9 +509,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
  "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n"
- MEMACCESS(1)
  "st1 {v3.8b}, [%1], #8 \n"
- MEMACCESS(1)
  "st1 {v3.s}[2], [%1], #4 \n"
  "b.gt 1b \n"
  : "+r"(src_ptr), // %0
@@ -577,13 +537,11 @@ void ScaleAddRows_NEON(const uint8* src_ptr,
  "eor v3.16b, v3.16b, v3.16b \n"
  "2: \n"
  // load 16 pixels into q0
- MEMACCESS(0)
  "ld1 {v0.16b}, [%0], %3 \n"
  "uaddw2 v3.8h, v3.8h, v0.16b \n"
  "uaddw v2.8h, v2.8h, v0.8b \n"
  "subs w12, w12, #1 \n"
  "b.gt 2b \n"
- MEMACCESS(2)
  "st1 {v2.8h, v3.8h}, [%2], #32 \n" // store pixels
  "add %1, %1, #16 \n"
  "subs %w4, %w4, #16 \n" // 16 processed per loop
@@ -606,7 +564,6 @@ void ScaleAddRows_NEON(const uint8* src_ptr,
  "lsr %5, %3, #16 \n" \
  "add %6, %1, %5 \n" \
  "add %3, %3, %4 \n" \
- MEMACCESS(6) \
  "ld2 {v4.b, v5.b}[" #n "], [%6] \n"
  // clang-format on
@@ -660,7 +617,6 @@ void ScaleFilterCols_NEON(uint8* dst_ptr,
  "add v4.8h, v4.8h, v6.8h \n"
  "xtn v4.8b, v4.8h \n"
- MEMACCESS(0)
  "st1 {v4.8b}, [%0], #8 \n" // store pixels
  "add v1.4s, v1.4s, v0.4s \n"
  "add v2.4s, v2.4s, v0.4s \n"
@@ -703,9 +659,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
  "dup v4.8b, %w5 \n"
  // General purpose row blend.
  "1: \n"
- MEMACCESS(1)
  "ld1 {v0.16b}, [%1], #16 \n"
- MEMACCESS(2)
  "ld1 {v1.16b}, [%2], #16 \n"
  "subs %w3, %w3, #16 \n"
  "umull v6.8h, v0.8b, v4.8b \n"
@@ -714,63 +668,50 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
  "umlal2 v7.8h, v1.16b, v5.16b \n"
  "rshrn v0.8b, v6.8h, #8 \n"
  "rshrn2 v0.16b, v7.8h, #8 \n"
- MEMACCESS(0)
  "st1 {v0.16b}, [%0], #16 \n"
  "b.gt 1b \n"
  "b 99f \n"
  // Blend 25 / 75.
  "25: \n"
- MEMACCESS(1)
  "ld1 {v0.16b}, [%1], #16 \n"
- MEMACCESS(2)
  "ld1 {v1.16b}, [%2], #16 \n"
  "subs %w3, %w3, #16 \n"
  "urhadd v0.16b, v0.16b, v1.16b \n"
  "urhadd v0.16b, v0.16b, v1.16b \n"
- MEMACCESS(0)
  "st1 {v0.16b}, [%0], #16 \n"
  "b.gt 25b \n"
  "b 99f \n"
  // Blend 50 / 50.
  "50: \n"
- MEMACCESS(1)
  "ld1 {v0.16b}, [%1], #16 \n"
- MEMACCESS(2)
  "ld1 {v1.16b}, [%2], #16 \n"
  "subs %w3, %w3, #16 \n"
  "urhadd v0.16b, v0.16b, v1.16b \n"
- MEMACCESS(0)
  "st1 {v0.16b}, [%0], #16 \n"
  "b.gt 50b \n"
  "b 99f \n"
  // Blend 75 / 25.
  "75: \n"
- MEMACCESS(1)
  "ld1 {v1.16b}, [%1], #16 \n"
- MEMACCESS(2)
  "ld1 {v0.16b}, [%2], #16 \n"
  "subs %w3, %w3, #16 \n"
  "urhadd v0.16b, v0.16b, v1.16b \n"
  "urhadd v0.16b, v0.16b, v1.16b \n"
- MEMACCESS(0)
  "st1 {v0.16b}, [%0], #16 \n"
  "b.gt 75b \n"
  "b 99f \n"
  // Blend 100 / 0 - Copy row unchanged.
  "100: \n"
- MEMACCESS(1)
  "ld1 {v0.16b}, [%1], #16 \n"
  "subs %w3, %w3, #16 \n"
- MEMACCESS(0)
  "st1 {v0.16b}, [%0], #16 \n"
  "b.gt 100b \n"
  "99: \n"
- MEMACCESS(0)
  "st1 {v0.b}[15], [%0] \n"
  : "+r"(dst_ptr), // %0
  "+r"(src_ptr), // %1
@@ -791,14 +732,10 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr,
  asm volatile (
  "1: \n"
  // load even pixels into q0, odd into q1
- MEMACCESS (0)
  "ld2 {v0.4s, v1.4s}, [%0], #32 \n"
- MEMACCESS (0)
  "ld2 {v2.4s, v3.4s}, [%0], #32 \n"
  "subs %w2, %w2, #8 \n" // 8 processed per loop
- MEMACCESS (1)
  "st1 {v1.16b}, [%1], #16 \n" // store odd pixels
- MEMACCESS (1)
  "st1 {v3.16b}, [%1], #16 \n"
  "b.gt 1b \n"
  : "+r" (src_ptr), // %0
@@ -816,7 +753,6 @@ void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb,
  (void)src_stride;
  asm volatile (
  "1: \n"
- MEMACCESS (0)
  // load 8 ARGB pixels.
  "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"
  "subs %w2, %w2, #8 \n" // 8 processed per loop.
@@ -828,7 +764,6 @@ void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb,
  "rshrn v1.8b, v1.8h, #1 \n"
  "rshrn v2.8b, v2.8h, #1 \n"
  "rshrn v3.8b, v3.8h, #1 \n"
- MEMACCESS (1)
  "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"
  "b.gt 1b \n"
  : "+r"(src_argb), // %0
@@ -847,14 +782,12 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr,
  // change the stride to row 2 pointer
  "add %1, %1, %0 \n"
  "1: \n"
- MEMACCESS (0)
  "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 8 ARGB pixels.
  "subs %w3, %w3, #8 \n" // 8 processed per loop.
  "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
  "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
  "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
  "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts.
- MEMACCESS (1)
  "ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8 more ARGB pixels.
  "uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts.
  "uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts.
@@ -864,7 +797,6 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr,
  "rshrn v1.8b, v1.8h, #2 \n"
  "rshrn v2.8b, v2.8h, #2 \n"
  "rshrn v3.8b, v3.8h, #2 \n"
- MEMACCESS (2)
  "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"
  "b.gt 1b \n"
  : "+r" (src_ptr), // %0
@@ -886,16 +818,11 @@ void ScaleARGBRowDownEven_NEON(const uint8* src_argb,
  (void)src_stride;
  asm volatile (
  "1: \n"
- MEMACCESS(0)
  "ld1 {v0.s}[0], [%0], %3 \n"
- MEMACCESS(0)
  "ld1 {v0.s}[1], [%0], %3 \n"
- MEMACCESS(0)
  "ld1 {v0.s}[2], [%0], %3 \n"
- MEMACCESS(0)
  "ld1 {v0.s}[3], [%0], %3 \n"
  "subs %w2, %w2, #4 \n" // 4 pixels per loop.
- MEMACCESS(1)
  "st1 {v0.16b}, [%1], #16 \n"
  "b.gt 1b \n"
  : "+r"(src_argb), // %0
@@ -918,21 +845,13 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb,
  asm volatile (
  "add %1, %1, %0 \n"
  "1: \n"
- MEMACCESS(0)
  "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 blocks -> 2x1
- MEMACCESS(1)
  "ld1 {v1.8b}, [%1], %4 \n"
- MEMACCESS(0)
  "ld1 {v2.8b}, [%0], %4 \n"
- MEMACCESS(1)
  "ld1 {v3.8b}, [%1], %4 \n"
- MEMACCESS(0)
  "ld1 {v4.8b}, [%0], %4 \n"
- MEMACCESS(1)
  "ld1 {v5.8b}, [%1], %4 \n"
- MEMACCESS(0)
  "ld1 {v6.8b}, [%0], %4 \n"
- MEMACCESS(1)
  "ld1 {v7.8b}, [%1], %4 \n"
  "uaddl v0.8h, v0.8b, v1.8b \n"
  "uaddl v2.8h, v2.8b, v3.8b \n"
@@ -949,7 +868,6 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb,
  "rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels.
  "rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels.
  "subs %w3, %w3, #4 \n" // 4 pixels per loop.
- MEMACCESS(2)
  "st1 {v0.16b}, [%2], #16 \n"
  "b.gt 1b \n"
  : "+r"(src_argb), // %0
@@ -968,7 +886,6 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb,
  "lsr %5, %3, #16 \n" \
  "add %6, %1, %5, lsl #2 \n" \
  "add %3, %3, %4 \n" \
- MEMACCESS(6) \
  "ld1 {" #vn ".s}[" #n "], [%6] \n"
  // clang-format on
@@ -992,10 +909,9 @@ void ScaleARGBCols_NEON(uint8* dst_argb,
  LOAD1_DATA32_LANE(v1, 2)
  LOAD1_DATA32_LANE(v1, 3)
- MEMACCESS(0)
  "st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels
  "subs %w2, %w2, #8 \n" // 8 processed per loop
  "b.gt 1b \n"
  : "+r"(dst_argb), // %0
  "+r"(src_argb), // %1
  "+r"(dst_width), // %2
@@ -1017,7 +933,6 @@ void ScaleARGBCols_NEON(uint8* dst_argb,
  "lsr %5, %3, #16 \n" \
  "add %6, %1, %5, lsl #2 \n" \
  "add %3, %3, %4 \n" \
- MEMACCESS(6) \
  "ld2 {" #vn1 ".s, " #vn2 ".s}[" #n "], [%6] \n"
  // clang-format on
@@ -1067,7 +982,6 @@ void ScaleARGBFilterCols_NEON(uint8* dst_argb,
  "shrn v0.8b, v16.8h, #7 \n"
  "shrn2 v0.16b, v17.8h, #7 \n"
- MEMACCESS(0)
  "st1 {v0.4s}, [%0], #16 \n" // store pixels
  "add v5.4s, v5.4s, v6.4s \n"
  "subs %w2, %w2, #4 \n" // 4 processed per loop