Apply format with no code changes

Bug: None
Change-Id: I8923bacb9af7e7d4f13e210c8b3d7ea6b81568a5
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6301086
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Mirko Bonadei <mbonadei@chromium.org>
Frank Barchard, 2025-02-24 23:22:09 -08:00, committed by libyuv LUCI CQ
parent 61354d2671
commit 3a7e0ba671
12 changed files with 819 additions and 819 deletions
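The diff below is formatting-only: the project formatter (presumably clang-format) re-flows the whitespace padding inside the inline-assembly string literals, which is why additions and deletions balance at 819 each and no instruction changes. A minimal compilable sketch of the pattern every hunk touches (a hypothetical row function, not a libyuv API; assumes x86_64 GCC/Clang and a width that is a positive multiple of 16):

#include <stdint.h>

static void CopyRow_Sketch(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      // Each string literal carries column-aligned padding before the \n;
      // the formatter only moves that padding, never the instructions.
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "lea         0x10(%0),%0                   \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      : "+r"(src), "+r"(dst), "+r"(width)
      :
      : "memory", "cc", "xmm0");
}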


@@ -37,7 +37,7 @@ uint32_t HammingDistance_SSE42(const uint8_t* src_a,
      // Process 32 bytes per loop.
      LABELALIGN
      "1:          \n"
      "mov         (%0),%%rcx         \n"
      "mov         0x8(%0),%%rdx      \n"
      "xor         (%1),%%rcx         \n"
@@ -80,7 +80,7 @@ uint32_t HammingDistance_SSE42(const uint8_t* src_a,
  asm volatile(
      // Process 16 bytes per loop.
      LABELALIGN
      "1:          \n"
      "mov         (%0),%%ecx         \n"
      "mov         0x4(%0),%%edx      \n"
      "xor         (%1),%%ecx         \n"
@@ -129,7 +129,7 @@ uint32_t HammingDistance_SSSE3(const uint8_t* src_a,
      "sub         %0,%1              \n"
      LABELALIGN
      "1:          \n"
      "movdqa      (%0),%%xmm4        \n"
      "movdqa      0x10(%0), %%xmm5   \n"
      "pxor        (%0,%1), %%xmm4    \n"
@@ -188,7 +188,7 @@ uint32_t HammingDistance_AVX2(const uint8_t* src_a,
      "sub         %0,%1              \n"
      LABELALIGN
      "1:          \n"
      "vmovdqa     (%0),%%ymm4        \n"
      "vmovdqa     0x20(%0), %%ymm5   \n"
      "vpxor       (%0,%1), %%ymm4, %%ymm4 \n"
@@ -217,7 +217,7 @@ uint32_t HammingDistance_AVX2(const uint8_t* src_a,
      "vpermq      $0xaa,%%ymm0,%%ymm1 \n"
      "vpaddd      %%ymm1,%%ymm0,%%ymm0 \n"
      "vmovd       %%xmm0,%3          \n"
      "vzeroupper  \n"
      : "+r"(src_a),  // %0
        "+r"(src_b),  // %1
        "+r"(count),  // %2
@@ -239,7 +239,7 @@ uint32_t SumSquareError_SSE2(const uint8_t* src_a,
      "pxor        %%xmm5,%%xmm5      \n"
      LABELALIGN
      "1:          \n"
      "movdqu      (%0),%%xmm1        \n"
      "lea         0x10(%0),%0        \n"
      "movdqu      (%1),%%xmm2        \n"
@@ -306,7 +306,7 @@ uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) {
      "movdqa      %4,%%xmm6          \n"
      LABELALIGN
      "1:          \n"
      "movdqu      (%0),%%xmm1        \n"
      "lea         0x10(%0),%0        \n"
      "pmulld      %%xmm6,%%xmm0      \n"


@@ -31,7 +31,7 @@ uint32_t HammingDistance_NEON(const uint8_t* src_a,
  asm volatile(
      "vmov.u16    q4, #0             \n"  // accumulator
      "1:          \n"
      "vld1.8      {q0, q1}, [%0]!    \n"
      "vld1.8      {q2, q3}, [%1]!    \n"
      "veor.32     q0, q0, q2         \n"
@@ -64,7 +64,7 @@ uint32_t SumSquareError_NEON(const uint8_t* src_a,
      "vmov.u8     q9, #0             \n"
      "vmov.u8     q11, #0            \n"
      "1:          \n"
      "vld1.8      {q0}, [%0]!        \n"
      "vld1.8      {q1}, [%1]!        \n"
      "subs        %2, %2, #16        \n"


@@ -29,7 +29,7 @@ uint32_t HammingDistance_NEON(const uint8_t* src_a,
  asm volatile(
      "movi        v4.8h, #0          \n"
      "1:          \n"
      "ld1         {v0.16b, v1.16b}, [%0], #32 \n"
      "ld1         {v2.16b, v3.16b}, [%1], #32 \n"
      "eor         v0.16b, v0.16b, v2.16b \n"
@@ -61,7 +61,7 @@ uint32_t SumSquareError_NEON(const uint8_t* src_a,
      "movi        v18.16b, #0        \n"
      "movi        v19.16b, #0        \n"
      "1:          \n"
      "ld1         {v0.16b}, [%0], #16 \n"
      "ld1         {v1.16b}, [%1], #16 \n"
      "subs        %w2, %w2, #16      \n"
@@ -122,7 +122,7 @@ uint32_t HashDjb2_NEON(const uint8_t* src, int count, uint32_t seed) {
      // count is always a multiple of 16.
      // maintain two accumulators, reduce and then final sum in scalar since
      // this has better performance on little cores.
      "1:          \n"
      "ldr         q0, [%[src]], #16  \n"
      "subs        %w[count], %w[count], #16 \n"
      "tbl         v3.16b, {v0.16b}, v19.16b \n"
@@ -162,7 +162,7 @@ uint32_t HammingDistance_NEON_DotProd(const uint8_t* src_a,
      "movi        v5.4s, #0          \n"
      "movi        v6.16b, #1         \n"
      "1:          \n"
      "ldp         q0, q1, [%0], #32  \n"
      "ldp         q2, q3, [%1], #32  \n"
      "eor         v0.16b, v0.16b, v2.16b \n"
@@ -194,7 +194,7 @@ uint32_t SumSquareError_NEON_DotProd(const uint8_t* src_a,
      "movi        v4.4s, #0          \n"
      "movi        v5.4s, #0          \n"
      "1:          \n"
      "ldp         q0, q2, [%0], #32  \n"
      "ldp         q1, q3, [%1], #32  \n"
      "subs        %w2, %w2, #32      \n"


@@ -30,7 +30,7 @@ void TransposeWx8_SSSE3(const uint8_t* src,
      // Read in the data from the source pointer.
      // First round of bit swap.
      LABELALIGN
      "1:          \n"
      "movq        (%0),%%xmm0        \n"
      "movq        (%0,%3),%%xmm1     \n"
      "lea         (%0,%3,2),%0       \n"
@@ -120,7 +120,7 @@ void TransposeWx8_Fast_SSSE3(const uint8_t* src,
      // Read in the data from the source pointer.
      // First round of bit swap.
      LABELALIGN
      "1:          \n"
      "movdqu      (%0),%%xmm0        \n"
      "movdqu      (%0,%3),%%xmm1     \n"
      "lea         (%0,%3,2),%0       \n"
@@ -265,7 +265,7 @@ void TransposeUVWx8_SSE2(const uint8_t* src,
      // Read in the data from the source pointer.
      // First round of bit swap.
      LABELALIGN
      "1:          \n"
      "movdqu      (%0),%%xmm0        \n"
      "movdqu      (%0,%4),%%xmm1     \n"
      "lea         (%0,%4,2),%0       \n"
@@ -393,7 +393,7 @@ void Transpose4x4_32_SSE2(const uint8_t* src,
                          int width) {
  asm volatile(
      // Main loop transpose 4x4. Read a column, write a row.
      "1:          \n"
      "movdqu      (%0),%%xmm0        \n"  // a b c d
      "movdqu      (%0,%3),%%xmm1     \n"  // e f g h
      "lea         (%0,%3,2),%0       \n"  // src += stride * 2
@@ -449,7 +449,7 @@ void Transpose4x4_32_AVX2(const uint8_t* src,
                          int width) {
  asm volatile(
      // Main loop transpose 2 blocks of 4x4. Read a column, write a row.
      "1:          \n"
      "vmovdqu     (%0),%%xmm0        \n"  // a b c d
      "vmovdqu     (%0,%3),%%xmm1     \n"  // e f g h
      "lea         (%0,%3,2),%0       \n"  // src += stride * 2
@@ -484,7 +484,7 @@ void Transpose4x4_32_AVX2(const uint8_t* src,
      "sub         %4,%1              \n"
      "sub         $0x8,%2            \n"
      "jg          1b                 \n"
      "vzeroupper  \n"
      : "+r"(src),    // %0
        "+r"(dst),    // %1
        "+rm"(width)  // %2


@@ -33,7 +33,7 @@ void TransposeWx8_NEON(const uint8_t* src,
      // at w-8 allow for this
      "sub         %[width], #8       \n"
      "1:          \n"
      "mov         %[temp], %[src]    \n"
      "vld1.8      {d0}, [%[temp]], %[src_stride] \n"
      "vld1.8      {d1}, [%[temp]], %[src_stride] \n"
@@ -101,7 +101,7 @@ void TransposeUVWx8_NEON(const uint8_t* src,
      // at w-8 allow for this
      "sub         %[width], #8       \n"
      "1:          \n"
      "mov         %[temp], %[src]    \n"
      "vld2.8      {d0, d1}, [%[temp]], %[src_stride] \n"
      "vld2.8      {d2, d3}, [%[temp]], %[src_stride] \n"
@@ -186,7 +186,7 @@ void Transpose4x4_32_NEON(const uint8_t* src,
  uint8_t* dst3 = dst2 + dst_stride;
  asm volatile(
      // Main loop transpose 4x4. Read a column, write a row.
      "1:          \n"
      "vld4.32     {d0[0], d2[0], d4[0], d6[0]}, [%0], %9 \n"
      "vld4.32     {d0[1], d2[1], d4[1], d6[1]}, [%1], %9 \n"
      "vld4.32     {d1[0], d3[0], d5[0], d7[0]}, [%2], %9 \n"


@@ -28,7 +28,7 @@ void TransposeWx16_NEON(const uint8_t* src,
                        int width) {
  const uint8_t* src_temp;
  asm volatile(
      "1:          \n"
      "mov         %[src_temp], %[src] \n"
      "ld1         {v16.16b}, [%[src_temp]], %[src_stride] \n"
@@ -151,7 +151,7 @@ void TransposeUVWx8_NEON(const uint8_t* src,
      // at w-8 allow for this
      "sub         %w[width], %w[width], #8 \n"
      "1:          \n"
      "mov         %[temp], %[src]    \n"
      "ld1         {v0.16b}, [%[temp]], %[src_stride] \n"
      "ld1         {v1.16b}, [%[temp]], %[src_stride] \n"
@@ -241,7 +241,7 @@ void Transpose4x4_32_NEON(const uint8_t* src,
  uint8_t* dst3 = dst2 + dst_stride;
  asm volatile(
      // Main loop transpose 4x4. Read a column, write a row.
      "1:          \n"
      "ld4         {v0.s, v1.s, v2.s, v3.s}[0], [%0], %9 \n"
      "ld4         {v0.s, v1.s, v2.s, v3.s}[1], [%1], %9 \n"
      "ld4         {v0.s, v1.s, v2.s, v3.s}[2], [%2], %9 \n"

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -100,7 +100,7 @@ void ScaleRowDown2_SSSE3(const uint8_t* src_ptr,
  asm volatile(
      // 16 pixel loop.
      LABELALIGN
      "1:          \n"
      "movdqu      (%0),%%xmm0        \n"
      "movdqu      0x10(%0),%%xmm1    \n"
      "lea         0x20(%0),%0        \n"
@@ -130,7 +130,7 @@ void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr,
      "pxor        %%xmm5,%%xmm5      \n"
      LABELALIGN
      "1:          \n"
      "movdqu      (%0),%%xmm0        \n"
      "movdqu      0x10(%0),%%xmm1    \n"
      "lea         0x20(%0),%0        \n"
@@ -161,7 +161,7 @@ void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr,
      "pxor        %%xmm5,%%xmm5      \n"
      LABELALIGN
      "1:          \n"
      "movdqu      (%0),%%xmm0        \n"
      "movdqu      0x10(%0),%%xmm1    \n"
      "movdqu      0x00(%0,%3,1),%%xmm2 \n"
@@ -196,7 +196,7 @@ void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
                        int dst_width) {
  (void)src_stride;
  asm volatile(
      "1:          \n"
      "vmovdqu     (%0),%%ymm0        \n"
      "vmovdqu     0x20(%0),%%ymm1    \n"
      "lea         0x40(%0),%0        \n"
@@ -208,7 +208,7 @@ void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
      "lea         0x20(%1),%1        \n"
      "sub         $0x20,%2           \n"
      "jg          1b                 \n"
      "vzeroupper  \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
@@ -228,7 +228,7 @@ void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr,
      "vpxor       %%ymm5,%%ymm5,%%ymm5 \n"
      LABELALIGN
      "1:          \n"
      "vmovdqu     (%0),%%ymm0        \n"
      "vmovdqu     0x20(%0),%%ymm1    \n"
      "lea         0x40(%0),%0        \n"
@@ -242,7 +242,7 @@ void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr,
      "lea         0x20(%1),%1        \n"
      "sub         $0x20,%2           \n"
      "jg          1b                 \n"
      "vzeroupper  \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
@@ -261,7 +261,7 @@ void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr,
      "vpxor       %%ymm5,%%ymm5,%%ymm5 \n"
      LABELALIGN
      "1:          \n"
      "vmovdqu     (%0),%%ymm0        \n"
      "vmovdqu     0x20(%0),%%ymm1    \n"
      "vmovdqu     0x00(%0,%3,1),%%ymm2 \n"
@@ -283,7 +283,7 @@ void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr,
      "lea         0x20(%1),%1        \n"
      "sub         $0x20,%2           \n"
      "jg          1b                 \n"
      "vzeroupper  \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
@@ -303,7 +303,7 @@ void ScaleRowDown4_SSSE3(const uint8_t* src_ptr,
      "pslld       $0x10,%%xmm5       \n"
      LABELALIGN
      "1:          \n"
      "movdqu      (%0),%%xmm0        \n"
      "movdqu      0x10(%0),%%xmm1    \n"
      "lea         0x20(%0),%0        \n"
@@ -337,7 +337,7 @@ void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr,
      "lea         0x00(%4,%4,2),%3   \n"
      LABELALIGN
      "1:          \n"
      "movdqu      (%0),%%xmm0        \n"
      "movdqu      0x10(%0),%%xmm1    \n"
      "movdqu      0x00(%0,%4,1),%%xmm2 \n"
@@ -389,7 +389,7 @@ void ScaleRowDown4_AVX2(const uint8_t* src_ptr,
      "vpslld      $0x10,%%ymm5,%%ymm5 \n"
      LABELALIGN
      "1:          \n"
      "vmovdqu     (%0),%%ymm0        \n"
      "vmovdqu     0x20(%0),%%ymm1    \n"
      "lea         0x40(%0),%0        \n"
@@ -404,7 +404,7 @@ void ScaleRowDown4_AVX2(const uint8_t* src_ptr,
      "lea         0x10(%1),%1        \n"
      "sub         $0x10,%2           \n"
      "jg          1b                 \n"
      "vzeroupper  \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
@@ -423,7 +423,7 @@ void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr,
      "vpackuswb   %%ymm4,%%ymm4,%%ymm4 \n"
      LABELALIGN
      "1:          \n"
      "vmovdqu     (%0),%%ymm0        \n"
      "vmovdqu     0x20(%0),%%ymm1    \n"
      "vmovdqu     0x00(%0,%3,1),%%ymm2 \n"
@@ -457,7 +457,7 @@ void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr,
      "lea         0x10(%1),%1        \n"
      "sub         $0x10,%2           \n"
      "jg          1b                 \n"
      "vzeroupper  \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
@@ -482,7 +482,7 @@ void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
        "m"(kShuf2)  // %2
  );
  asm volatile(
      "1:          \n"
      "movdqu      (%0),%%xmm0        \n"
      "movdqu      0x10(%0),%%xmm2    \n"
      "lea         0x20(%0),%0        \n"
@@ -527,7 +527,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
        "m"(kRound34)  // %2
  );
  asm volatile(
      "1:          \n"
      "movdqu      (%0),%%xmm6        \n"
      "movdqu      0x00(%0,%3,1),%%xmm7 \n"
      "pavgb       %%xmm7,%%xmm6      \n"
@@ -592,7 +592,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
  );
  asm volatile(
      "1:          \n"
      "movdqu      (%0),%%xmm6        \n"
      "movdqu      0x00(%0,%3,1),%%xmm7 \n"
      "pavgb       %%xmm6,%%xmm7      \n"
@@ -646,7 +646,7 @@ void ScaleRowDown38_SSSE3(const uint8_t* src_ptr,
      "movdqa      %4,%%xmm5          \n"
      LABELALIGN
      "1:          \n"
      "movdqu      (%0),%%xmm0        \n"
      "movdqu      0x10(%0),%%xmm1    \n"
      "lea         0x20(%0),%0        \n"
@@ -683,7 +683,7 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
        "m"(kScaleAb2)  // %3
  );
  asm volatile(
      "1:          \n"
      "movdqu      (%0),%%xmm0        \n"
      "movdqu      0x00(%0,%3,1),%%xmm1 \n"
      "lea         0x10(%0),%0        \n"
@@ -725,7 +725,7 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
        "m"(kScaleAc33)  // %2
  );
  asm volatile(
      "1:          \n"
      "movdqu      (%0),%%xmm0        \n"
      "movdqu      0x00(%0,%3,1),%%xmm6 \n"
      "movhlps     %%xmm0,%%xmm1      \n"
@@ -789,7 +789,7 @@ void ScaleRowUp2_Linear_SSE2(const uint8_t* src_ptr,
      "psllw       $1,%%xmm6          \n"  // all 2
      LABELALIGN
      "1:          \n"
      "movq        (%0),%%xmm1        \n"  // 01234567
      "movq        1(%0),%%xmm2       \n"  // 12345678
      "movdqa      %%xmm1,%%xmm3      \n"
@@ -839,7 +839,7 @@ void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr,
                               ptrdiff_t dst_stride,
                               int dst_width) {
  asm volatile(
      "1:          \n"
      "pxor        %%xmm0,%%xmm0      \n"  // 0
      // above line
      "movq        (%0),%%xmm1        \n"  // 01234567
@@ -958,7 +958,7 @@ void ScaleRowUp2_Linear_12_SSSE3(const uint16_t* src_ptr,
      "psllw       $1,%%xmm4          \n"  // all 2
      LABELALIGN
      "1:          \n"
      "movdqu      (%0),%%xmm0        \n"  // 01234567 (16)
      "movdqu      2(%0),%%xmm1       \n"  // 12345678 (16)
@@ -1010,7 +1010,7 @@ void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t* src_ptr,
      "movdqa      %5,%%xmm6          \n"
      LABELALIGN
      "1:          \n"
      // above line
      "movdqu      (%0),%%xmm0        \n"  // 01234567 (16)
      "movdqu      2(%0),%%xmm1       \n"  // 12345678 (16)
@@ -1108,7 +1108,7 @@ void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr,
      "pslld       $1,%%xmm4          \n"  // all 2
      LABELALIGN
      "1:          \n"
      "movq        (%0),%%xmm0        \n"  // 0123 (16b)
      "movq        2(%0),%%xmm1       \n"  // 1234 (16b)
@@ -1161,7 +1161,7 @@ void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr,
      "pslld       $3,%%xmm6          \n"  // all 8
      LABELALIGN
      "1:          \n"
      "movq        (%0),%%xmm0        \n"  // 0011 (16b, 1u1v)
      "movq        4(%0),%%xmm1       \n"  // 1122 (16b, 1u1v)
      "punpcklwd   %%xmm7,%%xmm0      \n"  // 0011 (near) (32b, 1u1v)
@@ -1269,7 +1269,7 @@ void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr,
      "movdqa      %3,%%xmm3          \n"
      LABELALIGN
      "1:          \n"
      "movq        (%0),%%xmm0        \n"  // 01234567
      "movq        1(%0),%%xmm1       \n"  // 12345678
      "punpcklwd   %%xmm0,%%xmm0      \n"  // 0101232345456767
@@ -1310,7 +1310,7 @@ void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
      "movdqa      %5,%%xmm7          \n"
      LABELALIGN
      "1:          \n"
      "movq        (%0),%%xmm0        \n"  // 01234567
      "movq        1(%0),%%xmm1       \n"  // 12345678
      "punpcklwd   %%xmm0,%%xmm0      \n"  // 0101232345456767
@@ -1395,7 +1395,7 @@ void ScaleRowUp2_Linear_AVX2(const uint8_t* src_ptr,
      "vbroadcastf128 %3,%%ymm3       \n"
      LABELALIGN
      "1:          \n"
      "vmovdqu     (%0),%%xmm0        \n"  // 0123456789ABCDEF
      "vmovdqu     1(%0),%%xmm1       \n"  // 123456789ABCDEF0
      "vpermq      $0b11011000,%%ymm0,%%ymm0 \n"
@@ -1417,7 +1417,7 @@ void ScaleRowUp2_Linear_AVX2(const uint8_t* src_ptr,
      "lea         0x20(%1),%1        \n"  // 16 sample to 32 sample
      "sub         $0x20,%2           \n"
      "jg          1b                 \n"
      "vzeroupper  \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
@@ -1439,7 +1439,7 @@ void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
      "vbroadcastf128 %5,%%ymm7       \n"
      LABELALIGN
      "1:          \n"
      "vmovdqu     (%0),%%xmm0        \n"  // 0123456789ABCDEF
      "vmovdqu     1(%0),%%xmm1       \n"  // 123456789ABCDEF0
      "vpermq      $0b11011000,%%ymm0,%%ymm0 \n"
@@ -1498,7 +1498,7 @@ void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
      "lea         0x20(%1),%1        \n"  // 16 sample to 32 sample
      "sub         $0x20,%2           \n"
      "jg          1b                 \n"
      "vzeroupper  \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
@@ -1521,7 +1521,7 @@ void ScaleRowUp2_Linear_12_AVX2(const uint16_t* src_ptr,
      "vpsllw      $1,%%ymm4,%%ymm4   \n"  // all 2
      LABELALIGN
      "1:          \n"
      "vmovdqu     (%0),%%ymm0        \n"  // 0123456789ABCDEF (16b)
      "vmovdqu     2(%0),%%ymm1       \n"  // 123456789ABCDEF0 (16b)
@@ -1551,7 +1551,7 @@ void ScaleRowUp2_Linear_12_AVX2(const uint16_t* src_ptr,
      "lea         0x40(%1),%1        \n"  // 16 sample to 32 sample
      "sub         $0x20,%2           \n"
      "jg          1b                 \n"
      "vzeroupper  \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
@@ -1573,7 +1573,7 @@ void ScaleRowUp2_Bilinear_12_AVX2(const uint16_t* src_ptr,
      "vpsllw      $3,%%ymm4,%%ymm4   \n"  // all 8
      LABELALIGN
      "1:          \n"
      "vmovdqu     (%0),%%xmm0        \n"  // 01234567 (16b)
      "vmovdqu     2(%0),%%xmm1       \n"  // 12345678 (16b)
@@ -1613,7 +1613,7 @@ void ScaleRowUp2_Bilinear_12_AVX2(const uint16_t* src_ptr,
      "lea         0x20(%1),%1        \n"  // 8 sample to 16 sample
      "sub         $0x10,%2           \n"
      "jg          1b                 \n"
      "vzeroupper  \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
@@ -1634,7 +1634,7 @@ void ScaleRowUp2_Linear_16_AVX2(const uint16_t* src_ptr,
      "vpslld      $1,%%ymm4,%%ymm4   \n"  // all 2
      LABELALIGN
      "1:          \n"
      "vmovdqu     (%0),%%xmm0        \n"  // 01234567 (16b, 1u1v)
      "vmovdqu     2(%0),%%xmm1       \n"  // 12345678 (16b, 1u1v)
@@ -1663,7 +1663,7 @@ void ScaleRowUp2_Linear_16_AVX2(const uint16_t* src_ptr,
      "lea         0x20(%1),%1        \n"  // 8 pixel to 16 pixel
      "sub         $0x10,%2           \n"
      "jg          1b                 \n"
      "vzeroupper  \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
@@ -1684,7 +1684,7 @@ void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
      "vpslld      $3,%%ymm6,%%ymm6   \n"  // all 8
      LABELALIGN
      "1:          \n"
      "vmovdqu     (%0),%%xmm0        \n"  // 01234567 (16b, 1u1v)
      "vmovdqu     2(%0),%%xmm1       \n"  // 12345678 (16b, 1u1v)
@@ -1747,7 +1747,7 @@ void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
      "lea         0x20(%1),%1        \n"  // 8 pixel to 16 pixel
      "sub         $0x10,%2           \n"
      "jg          1b                 \n"
      "vzeroupper  \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
@@ -1765,7 +1765,7 @@ void ScaleAddRow_SSE2(const uint8_t* src_ptr,
      // 16 pixel loop.
      LABELALIGN
      "1:          \n"
      "movdqu      (%0),%%xmm3        \n"
      "lea         0x10(%0),%0        \n"  // src_ptr += 16
      "movdqu      (%1),%%xmm0        \n"
@@ -1795,7 +1795,7 @@ void ScaleAddRow_AVX2(const uint8_t* src_ptr,
  asm volatile("vpxor       %%ymm5,%%ymm5,%%ymm5 \n"
      LABELALIGN
      "1:          \n"
      "vmovdqu     (%0),%%ymm3        \n"
      "lea         0x20(%0),%0        \n"  // src_ptr += 32
      "vpermq      $0xd8,%%ymm3,%%ymm3 \n"
@@ -1808,7 +1808,7 @@ void ScaleAddRow_AVX2(const uint8_t* src_ptr,
      "lea         0x40(%1),%1        \n"
      "sub         $0x20,%2           \n"
      "jg          1b                 \n"
      "vzeroupper  \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(src_width)  // %2
@@ -1854,7 +1854,7 @@ void ScaleFilterCols_SSSE3(uint8_t* dst_ptr,
      "pextrw      $0x3,%%xmm2,%k4    \n"
      LABELALIGN
      "2:          \n"
      "movdqa      %%xmm2,%%xmm1      \n"
      "paddd       %%xmm3,%%xmm2      \n"
      "movzwl      0x00(%1,%3,1),%k2  \n"
@@ -1881,7 +1881,7 @@ void ScaleFilterCols_SSSE3(uint8_t* dst_ptr,
      "jge         2b                 \n"
      LABELALIGN
      "29:         \n"
      "addl        $0x1,%5            \n"
      "jl          99f                \n"
      "movzwl      0x00(%1,%3,1),%k2  \n"
@@ -1897,7 +1897,7 @@ void ScaleFilterCols_SSSE3(uint8_t* dst_ptr,
      "packuswb    %%xmm2,%%xmm2      \n"
      "movd        %%xmm2,%k2         \n"
      "mov         %b2,(%0)           \n"
      "99:         \n"
      : "+r"(dst_ptr),      // %0
        "+r"(src_ptr),      // %1
        "=&a"(temp_pixel),  // %2
@@ -1931,7 +1931,7 @@ void ScaleColsUp2_SSE2(uint8_t* dst_ptr,
  (void)x;
  (void)dx;
  asm volatile(
      "1:          \n"
      "movdqu      (%1),%%xmm0        \n"
      "lea         0x10(%1),%1        \n"
      "movdqa      %%xmm0,%%xmm1      \n"
@@ -1956,7 +1956,7 @@ void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
                            int dst_width) {
  (void)src_stride;
  asm volatile(
      "1:          \n"
      "movdqu      (%0),%%xmm0        \n"
      "movdqu      0x10(%0),%%xmm1    \n"
      "lea         0x20(%0),%0        \n"
@@ -1978,7 +1978,7 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
                                  int dst_width) {
  (void)src_stride;
  asm volatile(
      "1:          \n"
      "movdqu      (%0),%%xmm0        \n"
      "movdqu      0x10(%0),%%xmm1    \n"
      "lea         0x20(%0),%0        \n"
@@ -2002,7 +2002,7 @@ void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb,
                               uint8_t* dst_argb,
                               int dst_width) {
  asm volatile(
      "1:          \n"
      "movdqu      (%0),%%xmm0        \n"
      "movdqu      0x10(%0),%%xmm1    \n"
      "movdqu      0x00(%0,%3,1),%%xmm2 \n"
@@ -2040,7 +2040,7 @@ void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb,
      "lea         0x00(%1,%1,2),%4   \n"
      LABELALIGN
      "1:          \n"
      "movd        (%0),%%xmm0        \n"
      "movd        0x00(%0,%1,1),%%xmm1 \n"
      "punpckldq   %%xmm1,%%xmm0      \n"
@@ -2078,7 +2078,7 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb,
      "lea         0x00(%0,%5,1),%5   \n"
      LABELALIGN
      "1:          \n"
      "movq        (%0),%%xmm0        \n"
      "movhps      0x00(%0,%1,1),%%xmm0 \n"
      "movq        0x00(%0,%1,2),%%xmm1 \n"
@@ -2134,7 +2134,7 @@ void ScaleARGBCols_SSE2(uint8_t* dst_argb,
      "jl          49f                \n"
      LABELALIGN
      "40:         \n"
      "movd        0x00(%3,%0,4),%%xmm0 \n"
      "movd        0x00(%3,%1,4),%%xmm1 \n"
      "pextrw      $0x5,%%xmm2,%k0    \n"
@@ -2152,7 +2152,7 @@ void ScaleARGBCols_SSE2(uint8_t* dst_argb,
      "sub         $0x4,%4            \n"
      "jge         40b                \n"
      "49:         \n"
      "test        $0x2,%4            \n"
      "je          29f                \n"
      "movd        0x00(%3,%0,4),%%xmm0 \n"
@@ -2161,12 +2161,12 @@ void ScaleARGBCols_SSE2(uint8_t* dst_argb,
      "punpckldq   %%xmm1,%%xmm0      \n"
      "movq        %%xmm0,(%2)        \n"
      "lea         0x8(%2),%2         \n"
      "29:         \n"
      "test        $0x1,%4            \n"
      "je          99f                \n"
      "movd        0x00(%3,%0,4),%%xmm0 \n"
      "movd        %%xmm0,(%2)        \n"
      "99:         \n"
      : "=&a"(x0),       // %0
        "=&d"(x1),       // %1
        "+r"(dst_argb),  // %2
@@ -2187,7 +2187,7 @@ void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb,
  (void)x;
  (void)dx;
  asm volatile(
      "1:          \n"
      "movdqu      (%1),%%xmm0        \n"
      "lea         0x10(%1),%1        \n"
      "movdqa      %%xmm0,%%xmm1      \n"
@@ -2248,7 +2248,7 @@ void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
      "pextrw      $0x3,%%xmm2,%k4    \n"
      LABELALIGN
      "2:          \n"
      "movdqa      %%xmm2,%%xmm1      \n"
      "paddd       %%xmm3,%%xmm2      \n"
      "movq        0x00(%1,%3,4),%%xmm0 \n"
@@ -2268,7 +2268,7 @@ void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
      "jge         2b                 \n"
      LABELALIGN
      "29:         \n"
      "add         $0x1,%2            \n"
      "jl          99f                \n"
      "psrlw       $0x9,%%xmm2        \n"
@@ -2281,7 +2281,7 @@ void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
      "packuswb    %%xmm0,%%xmm0      \n"
      "movd        %%xmm0,(%0)        \n"
      LABELALIGN "99:         \n"
      : "+r"(dst_argb),  // %0
        "+r"(src_argb),  // %1
@@ -2296,7 +2296,7 @@ void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
// Divide num by div and return as 16.16 fixed point result.
int FixedDiv_X86(int num, int div) {
  asm volatile(
      "cdq         \n"
      "shld        $0x10,%%eax,%%edx  \n"
      "shl         $0x10,%%eax        \n"
      "idiv        %1                 \n"
@@ -2310,7 +2310,7 @@ int FixedDiv_X86(int num, int div) {
// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
int FixedDiv1_X86(int num, int div) {
  asm volatile(
      "cdq         \n"
      "shld        $0x10,%%eax,%%edx  \n"
      "shl         $0x10,%%eax        \n"
      "sub         $0x10001,%%eax     \n"
@@ -2350,7 +2350,7 @@ void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr,
      "movdqa      %5,%%xmm3          \n"  // merge shuffler
      LABELALIGN
      "1:          \n"
      "movdqu      (%0),%%xmm0        \n"  // 8 UV row 0
      "movdqu      0x00(%0,%3,1),%%xmm2 \n"  // 8 UV row 1
      "lea         0x10(%0),%0        \n"
@@ -2390,7 +2390,7 @@ void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr,
      "vbroadcastf128 %5,%%ymm3       \n"  // merge shuffler
      LABELALIGN
      "1:          \n"
      "vmovdqu     (%0),%%ymm0        \n"  // 16 UV row 0
      "vmovdqu     0x00(%0,%3,1),%%ymm2 \n"  // 16 UV row 1
      "lea         0x20(%0),%0        \n"
@@ -2407,7 +2407,7 @@ void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr,
      "lea         0x10(%1),%1        \n"  // 8 UV
      "sub         $0x8,%2            \n"
      "jg          1b                 \n"
      "vzeroupper  \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
@@ -2432,7 +2432,7 @@ void ScaleUVRowUp2_Linear_SSSE3(const uint8_t* src_ptr,
      "movdqa      %3,%%xmm3          \n"
      LABELALIGN
      "1:          \n"
      "movq        (%0),%%xmm0        \n"  // 00112233 (1u1v)
      "movq        2(%0),%%xmm1       \n"  // 11223344 (1u1v)
      "punpcklbw   %%xmm1,%%xmm0      \n"  // 0101121223233434 (2u2v)
@@ -2473,7 +2473,7 @@ void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
      "movdqa      %5,%%xmm7          \n"
      LABELALIGN
      "1:          \n"
      "movq        (%0),%%xmm0        \n"  // 00112233 (1u1v)
      "movq        2(%0),%%xmm1       \n"  // 11223344 (1u1v)
      "punpcklbw   %%xmm1,%%xmm0      \n"  // 0101121223233434 (2u2v)
@@ -2557,7 +2557,7 @@ void ScaleUVRowUp2_Linear_AVX2(const uint8_t* src_ptr,
      "vbroadcastf128 %3,%%ymm3       \n"
      LABELALIGN
      "1:          \n"
      "vmovdqu     (%0),%%xmm0        \n"
      "vmovdqu     2(%0),%%xmm1       \n"
      "vpermq      $0b11011000,%%ymm0,%%ymm0 \n"
@@ -2578,7 +2578,7 @@ void ScaleUVRowUp2_Linear_AVX2(const uint8_t* src_ptr,
      "lea         0x20(%1),%1        \n"  // 8 uv to 16 uv
      "sub         $0x10,%2           \n"
      "jg          1b                 \n"
      "vzeroupper  \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
@@ -2600,7 +2600,7 @@ void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
      "vbroadcastf128 %5,%%ymm7       \n"
      LABELALIGN
      "1:          \n"
      "vmovdqu     (%0),%%xmm0        \n"
      "vmovdqu     2(%0),%%xmm1       \n"
      "vpermq      $0b11011000,%%ymm0,%%ymm0 \n"
@@ -2657,7 +2657,7 @@ void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
      "lea         0x20(%1),%1        \n"  // 8 uv to 16 uv
      "sub         $0x10,%2           \n"
      "jg          1b                 \n"
      "vzeroupper  \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
@@ -2680,7 +2680,7 @@ void ScaleUVRowUp2_Linear_16_SSE41(const uint16_t* src_ptr,
      "pslld       $1,%%xmm4          \n"  // all 2
      LABELALIGN
      "1:          \n"
      "movq        (%0),%%xmm0        \n"  // 0011 (16b, 1u1v)
      "movq        4(%0),%%xmm1       \n"  // 1122 (16b, 1u1v)
@@ -2732,7 +2732,7 @@ void ScaleUVRowUp2_Bilinear_16_SSE41(const uint16_t* src_ptr,
      "pslld       $3,%%xmm6          \n"  // all 8
      LABELALIGN
      "1:          \n"
      "movq        (%0),%%xmm0        \n"  // 0011 (16b, 1u1v)
      "movq        4(%0),%%xmm1       \n"  // 1122 (16b, 1u1v)
      "punpcklwd   %%xmm7,%%xmm0      \n"  // 0011 (near) (32b, 1u1v)
@@ -2822,7 +2822,7 @@ void ScaleUVRowUp2_Linear_16_AVX2(const uint16_t* src_ptr,
      "vpslld      $1,%%ymm4,%%ymm4   \n"  // all 2
      LABELALIGN
      "1:          \n"
      "vmovdqu     (%0),%%xmm0        \n"  // 00112233 (16b, 1u1v)
      "vmovdqu     4(%0),%%xmm1       \n"  // 11223344 (16b, 1u1v)
@@ -2850,7 +2850,7 @@ void ScaleUVRowUp2_Linear_16_AVX2(const uint16_t* src_ptr,
      "lea         0x20(%1),%1        \n"  // 4 uv to 8 uv
      "sub         $0x8,%2            \n"
      "jg          1b                 \n"
      "vzeroupper  \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
@@ -2871,7 +2871,7 @@ void ScaleUVRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
      "vpslld      $3,%%ymm6,%%ymm6   \n"  // all 8
      LABELALIGN
      "1:          \n"
      "vmovdqu     (%0),%%xmm0        \n"  // 00112233 (16b, 1u1v)
      "vmovdqu     4(%0),%%xmm1       \n"  // 11223344 (16b, 1u1v)
@@ -2932,7 +2932,7 @@ void ScaleUVRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
      "lea         0x20(%1),%1        \n"  // 4 uv to 8 uv
      "sub         $0x8,%2            \n"
      "jg          1b                 \n"
      "vzeroupper  \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2


@@ -30,7 +30,7 @@ void ScaleRowDown2_NEON(const uint8_t* src_ptr,
                        int dst_width) {
  (void)src_stride;
  asm volatile(
      "1:          \n"
      // load even pixels into q0, odd into q1
      "vld2.8      {q0, q1}, [%0]!    \n"
      "subs        %2, %2, #16        \n"  // 16 processed per loop
@@ -51,7 +51,7 @@ void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr,
                              int dst_width) {
  (void)src_stride;
  asm volatile(
      "1:          \n"
      "vld2.8      {q0, q1}, [%0]!    \n"  // load 32 pixels
      "subs        %2, %2, #16        \n"  // 16 processed per loop
      "vrhadd.u8   q0, q0, q1         \n"  // rounding half add
@@ -73,7 +73,7 @@ void ScaleRowDown2Box_NEON(const uint8_t* src_ptr,
  asm volatile(
      // change the stride to row 2 pointer
      "add         %1, %0             \n"
      "1:          \n"
      "vld1.8      {q0, q1}, [%0]!    \n"  // load row 1 and post inc
      "vld1.8      {q2, q3}, [%1]!    \n"  // load row 2 and post inc
      "subs        %3, %3, #16        \n"  // 16 processed per loop
@@ -102,7 +102,7 @@ void ScaleRowDown4_NEON(const uint8_t* src_ptr,
                        int dst_width) {
  (void)src_stride;
  asm volatile(
      "1:          \n"
      "vld4.8      {d0, d1, d2, d3}, [%0]! \n"  // src line 0
      "subs        %2, %2, #8         \n"  // 8 processed per loop
      "vst1.8      {d2}, [%1]!        \n"
@@ -122,7 +122,7 @@ void ScaleRowDown4Box_NEON(const uint8_t* src_ptr,
  const uint8_t* src_ptr2 = src_ptr + src_stride * 2;
  const uint8_t* src_ptr3 = src_ptr + src_stride * 3;
  asm volatile(
      "1:          \n"
      "vld1.8      {q0}, [%0]!        \n"  // load up 16x4
      "vld1.8      {q1}, [%3]!        \n"
      "vld1.8      {q2}, [%4]!        \n"
@@ -156,7 +156,7 @@ void ScaleRowDown34_NEON(const uint8_t* src_ptr,
                         int dst_width) {
  (void)src_stride;
  asm volatile(
      "1:          \n"
      "vld4.8      {d0, d1, d2, d3}, [%0]! \n"  // src line 0
      "subs        %2, %2, #24        \n"
      "vmov        d2, d3             \n"  // order d0, d1, d2
@@ -176,7 +176,7 @@ void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr,
  asm volatile(
      "vmov.u8     d24, #3            \n"
      "add         %3, %0             \n"
      "1:          \n"
      "vld4.8      {d0, d1, d2, d3}, [%0]! \n"  // src line 0
      "vld4.8      {d4, d5, d6, d7}, [%3]! \n"  // src line 1
      "subs        %2, %2, #24        \n"
@@ -233,7 +233,7 @@ void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr,
  asm volatile(
      "vmov.u8     d24, #3            \n"
      "add         %3, %0             \n"
      "1:          \n"
      "vld4.8      {d0, d1, d2, d3}, [%0]! \n"  // src line 0
      "vld4.8      {d4, d5, d6, d7}, [%3]! \n"  // src line 1
      "subs        %2, %2, #24        \n"
@@ -284,7 +284,7 @@ void ScaleRowDown38_NEON(const uint8_t* src_ptr,
  (void)src_stride;
  asm volatile(
      "vld1.8      {q3}, [%3]         \n"
      "1:          \n"
      "vld1.8      {d0, d1, d2, d3}, [%0]! \n"
      "subs        %2, %2, #12        \n"
      "vtbl.u8     d4, {d0, d1, d2, d3}, d6 \n"
@@ -311,7 +311,7 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
      "vld1.8      {q14}, [%6]        \n"
      "vld1.8      {q15}, [%7]        \n"
      "add         %3, %0             \n"
      "1:          \n"
      // d0 = 00 40 01 41 02 42 03 43
      // d1 = 10 50 11 51 12 52 13 53
@@ -420,7 +420,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
      "vld1.16     {q13}, [%4]        \n"
      "vld1.8      {q14}, [%5]        \n"
      "add         %3, %0             \n"
      "1:          \n"
      // d0 = 00 40 01 41 02 42 03 43
      // d1 = 10 50 11 51 12 52 13 53
@@ -512,7 +512,7 @@ void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr,
  asm volatile(
      "vmov.u8     d30, #3            \n"
      "1:          \n"
      "vld1.8      {d4}, [%0]!        \n"  // 01234567
      "vld1.8      {d5}, [%3]!        \n"  // 12345678
@@ -550,7 +550,7 @@ void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
      "vmov.u16    q15, #3            \n"
      "vmov.u8     d28, #3            \n"
      "1:          \n"
      "vld1.8      {d4}, [%0]!        \n"  // 01234567
      "vld1.8      {d5}, [%5]!        \n"  // 12345678
@@ -611,7 +611,7 @@ void ScaleRowUp2_Linear_12_NEON(const uint16_t* src_ptr,
  asm volatile(
      "vmov.u16    q15, #3            \n"
      "1:          \n"
      "vld1.16     {q1}, [%0]!        \n"  // 01234567 (16b)
      "vld1.16     {q0}, [%3]!        \n"  // 12345678 (16b)
@@ -647,7 +647,7 @@ void ScaleRowUp2_Bilinear_12_NEON(const uint16_t* src_ptr,
  asm volatile(
      "vmov.u16    q15, #3            \n"
      "1:          \n"
      "vld1.16     {q0}, [%0]!        \n"  // 01234567 (16b)
      "vld1.16     {q1}, [%5]!        \n"  // 12345678 (16b)
@@ -698,7 +698,7 @@ void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
  asm volatile(
      "vmov.u16    d31, #3            \n"
      "1:          \n"
      "vld1.16     {q0}, [%0]!        \n"  // 01234567 (16b)
      "vld1.16     {q1}, [%3]!        \n"  // 12345678 (16b)
@@ -743,7 +743,7 @@ void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
      "vmov.u16    d31, #3            \n"
      "vmov.u32    q14, #3            \n"
      "1:          \n"
      "vld1.16     {d0}, [%0]!        \n"  // 0123 (16b)
      "vld1.16     {d1}, [%5]!        \n"  // 1234 (16b)
      "vmovl.u16   q2, d0             \n"  // 0123 (32b)
@@ -794,7 +794,7 @@ void ScaleUVRowUp2_Linear_NEON(const uint8_t* src_ptr,
  asm volatile(
      "vmov.u8     d30, #3            \n"
      "1:          \n"
      "vld1.8      {d4}, [%0]!        \n"  // 00112233 (1u1v)
      "vld1.8      {d5}, [%3]!        \n"  // 11223344 (1u1v)
@@ -832,7 +832,7 @@ void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
      "vmov.u16    q15, #3            \n"
      "vmov.u8     d28, #3            \n"
      "1:          \n"
      "vld1.8      {d4}, [%0]!        \n"  // 00112233 (1u1v)
      "vld1.8      {d5}, [%5]!        \n"  // 11223344 (1u1v)
@@ -893,7 +893,7 @@ void ScaleUVRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
  asm volatile(
      "vmov.u16    d30, #3            \n"
      "1:          \n"
      "vld1.16     {q0}, [%0]!        \n"  // 00112233 (1u1v, 16)
      "vld1.16     {q1}, [%3]!        \n"  // 11223344 (1u1v, 16)
@@ -939,7 +939,7 @@ void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
      "vmov.u16    d30, #3            \n"
      "vmov.u32    q14, #3            \n"
      "1:          \n"
      "vld1.8      {d0}, [%0]!        \n"  // 0011 (1u1v)
      "vld1.8      {d1}, [%5]!        \n"  // 1122 (1u1v)
      "vmovl.u16   q2, d0             \n"  // 0011 (1u1v, 32b)
@@ -989,7 +989,7 @@ void ScaleAddRow_NEON(const uint8_t* src_ptr,
                      uint16_t* dst_ptr,
                      int src_width) {
  asm volatile(
      "1:          \n"
      "vld1.16     {q1, q2}, [%1]     \n"  // load accumulator
      "vld1.8      {q0}, [%0]!        \n"  // load 16 bytes
      "vaddw.u8    q2, q2, d1         \n"  // add
@@ -1036,7 +1036,7 @@ void ScaleFilterCols_NEON(uint8_t* dst_ptr,
      // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
      "vadd.s32    q2, q1, q3         \n"
      "vshl.i32    q0, q3, #1         \n"  // 8 * dx
      "1:          \n"
      LOAD2_DATA8_LANE(0)
      LOAD2_DATA8_LANE(1)
      LOAD2_DATA8_LANE(2)
@@ -1087,7 +1087,7 @@ void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr,
                            int dst_width) {
  (void)src_stride;
  asm volatile(
      "1:          \n"
      "vld4.32     {d0, d2, d4, d6}, [%0]! \n"  // load 8 ARGB pixels.
      "vld4.32     {d1, d3, d5, d7}, [%0]! \n"  // load next 8 ARGB
      "subs        %2, %2, #8         \n"  // 8 processed per loop
@@ -1115,7 +1115,7 @@ void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb,
                                  int dst_width) {
  (void)src_stride;
  asm volatile(
      "1:          \n"
      "vld4.32     {d0, d2, d4, d6}, [%0]! \n"  // load 8 ARGB pixels.
      "vld4.32     {d1, d3, d5, d7}, [%0]! \n"  // load next 8 ARGB
      "subs        %2, %2, #8         \n"  // 8 processed per loop
@@ -1138,7 +1138,7 @@ void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
  asm volatile(
      // change the stride to row 2 pointer
      "add         %1, %1, %0         \n"
      "1:          \n"
      "vld4.8      {d0, d2, d4, d6}, [%0]! \n"  // load 8 ARGB pixels.
      "vld4.8      {d1, d3, d5, d7}, [%0]! \n"  // load next 8 ARGB
      "subs        %3, %3, #8         \n"  // 8 processed per loop.
@@ -1176,7 +1176,7 @@ void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb,
  (void)src_stride;
  asm volatile(
      "mov         r12, %3, lsl #2    \n"
      "1:          \n"
      "vld1.32     {d0[0]}, [%0], r12 \n"
      "vld1.32     {d0[1]}, [%0], r12 \n"
      "vld1.32     {d1[0]}, [%0], r12 \n"
@@ -1201,7 +1201,7 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb,
  asm volatile(
      "mov         r12, %4, lsl #2    \n"
      "add         %1, %1, %0         \n"
      "1:          \n"
      "vld1.8      {d0}, [%0], r12    \n"  // 4 2x2 blocks -> 2x1
      "vld1.8      {d1}, [%1], r12    \n"
      "vld1.8      {d2}, [%0], r12    \n"
@@ -1247,7 +1247,7 @@ void ScaleARGBCols_NEON(uint8_t* dst_argb,
  int tmp;
  const uint8_t* src_tmp = src_argb;
  asm volatile(
      "1:          \n"
      // clang-format off
      LOAD1_DATA32_LANE(d0, 0)
      LOAD1_DATA32_LANE(d0, 1)
@@ -1300,7 +1300,7 @@ void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
      "vmov.i16    q15, #0x7f         \n"  // 0x7F
      // x , x + 1 * dx, x + 2 * dx, x + 3 * dx
      "vadd.s32    q8, q1, q0         \n"
      "1:          \n"
      // d0, d1: a
      // d2, d3: b
      LOAD2_DATA32_LANE(d0, d2, 0)
@@ -1350,7 +1350,7 @@ void ScaleUVRowDown2_NEON(const uint8_t* src_ptr,
                          int dst_width) {
  (void)src_stride;
  asm volatile(
      "1:          \n"
      "vld2.16     {d0, d2}, [%0]!    \n"  // load 8 UV pixels.
      "vld2.16     {d1, d3}, [%0]!    \n"  // load next 8 UV
      "subs        %2, %2, #8         \n"  // 8 processed per loop.
@@ -1369,7 +1369,7 @@ void ScaleUVRowDown2Linear_NEON(const uint8_t* src_ptr,
                                int dst_width) {
  (void)src_stride;
  asm volatile(
      "1:          \n"
      "vld2.16     {d0, d2}, [%0]!    \n"  // load 8 UV pixels.
      "vld2.16     {d1, d3}, [%0]!    \n"  // load next 8 UV
      "subs        %2, %2, #8         \n"  // 8 processed per loop.
@@ -1390,7 +1390,7 @@ void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr,
  asm volatile(
      // change the stride to row 2 pointer
      "add         %1, %1, %0         \n"
      "1:          \n"
      "vld2.8      {d0, d2}, [%0]!    \n"  // load 8 UV pixels.
      "vld2.8      {d1, d3}, [%0]!    \n"  // load next 8 UV
      "subs        %3, %3, #8         \n"  // 8 processed per loop.
@@ -1423,7 +1423,7 @@ void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr,
  const uint8_t* src3_ptr = src_ptr + src_stepx * 6;
  (void)src_stride;
  asm volatile(
      "1:          \n"
      "vld1.16     {d0[0]}, [%0], %6  \n"
      "vld1.16     {d0[1]}, [%1], %6  \n"
      "vld1.16     {d0[2]}, [%2], %6  \n"


@@ -27,7 +27,7 @@ void ScaleRowDown2_NEON(const uint8_t* src_ptr,
                        int dst_width) {
  (void)src_stride;
  asm volatile(
      "1:          \n"
      // load even pixels into v0, odd into v1
      "ld2         {v0.16b,v1.16b}, [%0], #32 \n"
      "subs        %w2, %w2, #16      \n"  // 16 processed per loop
@@ -49,7 +49,7 @@ void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr,
                              int dst_width) {
  (void)src_stride;
  asm volatile(
      "1:          \n"
      // load even pixels into v0, odd into v1
      "ld2         {v0.16b,v1.16b}, [%0], #32 \n"
      "subs        %w2, %w2, #16      \n"  // 16 processed per loop
@@ -73,7 +73,7 @@ void ScaleRowDown2Box_NEON(const uint8_t* src_ptr,
  asm volatile(
      // change the stride to row 2 pointer
      "add         %1, %1, %0         \n"
      "1:          \n"
      "ld1         {v0.16b, v1.16b}, [%0], #32 \n"  // load row 1 and post inc
      "ld1         {v2.16b, v3.16b}, [%1], #32 \n"  // load row 2 and post inc
      "subs        %w3, %w3, #16      \n"  // 16 processed per loop
@@ -102,7 +102,7 @@ void ScaleRowDown4_NEON(const uint8_t* src_ptr,
                        int dst_width) {
  (void)src_stride;
  asm volatile(
      "1:          \n"
      "ld4         {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // src line 0
      "subs        %w2, %w2, #16      \n"  // 16 processed per loop
      "prfm        pldl1keep, [%0, 448] \n"  // prefetch 7 lines ahead
@@ -123,7 +123,7 @@ void ScaleRowDown4Box_NEON(const uint8_t* src_ptr,
  const uint8_t* src_ptr2 = src_ptr + src_stride * 2;
  const uint8_t* src_ptr3 = src_ptr + src_stride * 3;
  asm volatile(
      "1:          \n"
      "ldp         q0, q4, [%0], #32  \n"  // load up 16x8
      "ldp         q1, q5, [%2], #32  \n"
      "ldp         q2, q6, [%3], #32  \n"
@@ -175,7 +175,7 @@ void ScaleRowDown34_NEON(const uint8_t* src_ptr,
      "ld1         {v29.16b}, [%[kShuf34_0]] \n"
      "ld1         {v30.16b}, [%[kShuf34_1]] \n"
      "ld1         {v31.16b}, [%[kShuf34_2]] \n"
      "1:          \n"
      "ld1         {v0.16b,v1.16b,v2.16b,v3.16b}, [%[src_ptr]], #64 \n"
      "subs        %w[width], %w[width], #48 \n"
      "tbl         v0.16b, {v0.16b, v1.16b}, v29.16b \n"
@@ -201,7 +201,7 @@ void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr,
      "movi        v24.16b, #3        \n"
      "add         %3, %3, %0         \n"
      "1:          \n"
      "ld4         {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // src line 0
      "ld4         {v4.16b,v5.16b,v6.16b,v7.16b}, [%3], #64 \n"  // src line 1
      "subs        %w2, %w2, #48      \n"
@@ -279,7 +279,7 @@ void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr,
      "movi        v20.16b, #3        \n"
      "add         %3, %3, %0         \n"
      "1:          \n"
      "ld4         {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // src line 0
      "ld4         {v4.16b,v5.16b,v6.16b,v7.16b}, [%3], #64 \n"  // src line 1
      "subs        %w2, %w2, #48      \n"
@@ -339,7 +339,7 @@ void ScaleRowDown38_NEON(const uint8_t* src_ptr,
      "subs        %w[width], %w[width], #12 \n"
      "b.eq        2f                 \n"
      "1:          \n"
      "ldp         q0, q1, [%[src_ptr]], #32 \n"
      "subs        %w[width], %w[width], #12 \n"
      "tbl         v2.16b, {v0.16b, v1.16b}, v3.16b \n"
@@ -350,7 +350,7 @@ void ScaleRowDown38_NEON(const uint8_t* src_ptr,
      // Store exactly 12 bytes on the final iteration to avoid writing past
      // the end of the array.
      "2:          \n"
      "ldp         q0, q1, [%[src_ptr]] \n"
      "tbl         v2.16b, {v0.16b, v1.16b}, v3.16b \n"
      "st1         {v2.8b}, [%[dst_ptr]], #8 \n"
@@ -384,7 +384,7 @@ void ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
      "ld1         {v31.16b}, [%[tblArray4]]     \n"
      "ld1         {v30.16b}, [%[div996]]        \n"
      "1:                                        \n"
      "ldp         q20, q0, [%[src_ptr]], #32    \n"
      "ldp         q21, q1, [%[src_ptr1]], #32   \n"
      "ldp         q22, q2, [%[src_ptr2]], #32   \n"
@@ -451,7 +451,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
      "ld1         {v31.16b}, [%[tblArray3]]     \n"
      "ld1         {v30.8h}, [%[div664]]         \n"
      "1:                                        \n"
      "ldp         q20, q0, [%[src_ptr]], #32    \n"  // abcdefgh ...
      "ldp         q21, q1, [%[src_ptr1]], #32   \n"  // ijklmnop ...
      "subs        %w[width], %w[width], #12     \n"
@@ -500,7 +500,7 @@ void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr,
  asm volatile(
      "movi        v31.16b, #3                   \n"
      "1:                                        \n"
      "ldr         q0, [%0], #16                 \n"  // 0123456789abcdef
      "ldr         q1, [%1], #16                 \n"  // 123456789abcdefg
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
@@ -547,7 +547,7 @@ void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
      "movi        v31.8b, #3                    \n"
      "movi        v30.8h, #3                    \n"
      "1:                                        \n"
      "ldr         d0, [%0], #8                  \n"  // 01234567
      "ldr         d1, [%2], #8                  \n"  // 12345678
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
@@ -602,7 +602,7 @@ void ScaleRowUp2_Linear_12_NEON(const uint16_t* src_ptr,
  asm volatile(
      "movi        v31.8h, #3                    \n"
      "1:                                        \n"
      "ld1         {v0.8h}, [%0], #16            \n"  // 01234567 (16b)
      "ld1         {v1.8h}, [%1], #16            \n"  // 12345678 (16b)
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
@@ -639,7 +639,7 @@ void ScaleRowUp2_Bilinear_12_NEON(const uint16_t* src_ptr,
  asm volatile(
      "movi        v31.8h, #3                    \n"
      "1:                                        \n"
      "ld1         {v2.8h}, [%0], #16            \n"  // 01234567 (16b)
      "ld1         {v3.8h}, [%2], #16            \n"  // 12345678 (16b)
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
@@ -693,7 +693,7 @@ void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
  asm volatile(
      "movi        v31.8h, #3                    \n"
      "1:                                        \n"
      "ld1         {v0.8h}, [%0], #16            \n"  // 01234567 (16b)
      "ld1         {v1.8h}, [%1], #16            \n"  // 12345678 (16b)
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
@@ -739,7 +739,7 @@ void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
      "movi        v31.4h, #3                    \n"
      "movi        v30.4s, #3                    \n"
      "1:                                        \n"
      "ldr         d0, [%0], #8                  \n"  // 0123 (16b)
      "ldr         d1, [%2], #8                  \n"  // 1234 (16b)
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
@@ -793,7 +793,7 @@ void ScaleUVRowUp2_Linear_NEON(const uint8_t* src_ptr,
  asm volatile(
      "movi        v31.8b, #3                    \n"
      "1:                                        \n"
      "ldr         d0, [%0], #8                  \n"  // 00112233 (1u1v)
      "ldr         d1, [%1], #8                  \n"  // 11223344 (1u1v)
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
@@ -833,7 +833,7 @@ void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
      "movi        v31.8b, #3                    \n"
      "movi        v30.8h, #3                    \n"
      "1:                                        \n"
      "ldr         d0, [%0], #8                  \n"
      "ldr         d1, [%2], #8                  \n"
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
@@ -888,7 +888,7 @@ void ScaleUVRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
  asm volatile(
      "movi        v31.8h, #3                    \n"
      "1:                                        \n"
      "ld1         {v0.8h}, [%0], #16            \n"  // 01234567 (16b)
      "ld1         {v1.8h}, [%1], #16            \n"  // 12345678 (16b)
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
@@ -936,7 +936,7 @@ void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
      "movi        v31.4h, #3                    \n"
      "movi        v30.4s, #3                    \n"
      "1:                                        \n"
      "ldr         d0, [%0], #8                  \n"
      "ldr         d1, [%2], #8                  \n"
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
@@ -988,7 +988,7 @@ void ScaleAddRow_NEON(const uint8_t* src_ptr,
                      uint16_t* dst_ptr,
                      int src_width) {
  asm volatile(
      "1:                                        \n"
      "ld1         {v1.8h, v2.8h}, [%1]          \n"  // load accumulator
      "ld1         {v0.16b}, [%0], #16           \n"  // load 16 bytes
      "uaddw2      v2.8h, v2.8h, v0.16b          \n"  // add
@@ -1042,7 +1042,7 @@ void ScaleFilterCols_NEON(uint8_t* dst_ptr,
      "trn1        v20.8h, v1.8h, v0.8h          \n"
      "trn1        v21.8h, v2.8h, v0.8h          \n"
      "1:                                        \n" SCALE_FILTER_COLS_STEP_ADDR
      "ldr         h6, [%[tmp_ptr]]              \n" SCALE_FILTER_COLS_STEP_ADDR
      "ld1         {v6.h}[1], [%[tmp_ptr]]       \n" SCALE_FILTER_COLS_STEP_ADDR
      "ld1         {v6.h}[2], [%[tmp_ptr]]       \n" SCALE_FILTER_COLS_STEP_ADDR
@@ -1090,7 +1090,7 @@ void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr,
                            int dst_width) {
  (void)src_stride;
  asm volatile(
      "1:                                        \n"
      "ld1         {v0.4s, v1.4s, v2.4s, v3.4s}, [%[src]], #64 \n"
      "subs        %w[width], %w[width], #8      \n"
      "prfm        pldl1keep, [%[src], 448]      \n"
@@ -1112,7 +1112,7 @@ void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb,
  (void)src_stride;
  const uint8_t* src_argb1 = src_argb + 32;
  asm volatile(
      "1:                                        \n"
      "ld2         {v0.4s, v1.4s}, [%[src]]      \n"
      "add         %[src], %[src], #64           \n"
      "ld2         {v2.4s, v3.4s}, [%[src1]]     \n"
@@ -1136,7 +1136,7 @@ void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
                               int dst_width) {
  const uint8_t* src_ptr1 = src_ptr + src_stride;
  asm volatile(
      "1:                                        \n"
      "ld2         {v0.4s, v1.4s}, [%[src]], #32 \n"
      "ld2         {v20.4s, v21.4s}, [%[src1]], #32 \n"
      "uaddl       v2.8h, v0.8b, v1.8b           \n"
@@ -1167,7 +1167,7 @@ void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb,
  int64_t i = 0;
  (void)src_stride;
  asm volatile(
      "1:                                        \n"
      "ldr         w10, [%[src], %[i]]           \n"
      "ldr         w11, [%[src1], %[i]]          \n"
      "ldr         w12, [%[src2], %[i]]          \n"
@@ -1196,7 +1196,7 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb,
                                  int dst_width) {
  asm volatile(
      "add         %1, %1, %0                    \n"
      "1:                                        \n"
      "ld1         {v0.8b}, [%0], %4             \n"  // Read 4 2x2 -> 2x1
      "ld1         {v1.8b}, [%1], %4             \n"
      "ld1         {v2.8b}, [%0], %4             \n"
@@ -1248,7 +1248,7 @@ void ScaleARGBCols_NEON(uint8_t* dst_argb,
  int64_t dx64 = (int64_t)dx;  // NOLINT
  int64_t tmp64;
  asm volatile(
      "1:                                        \n"
      // clang-format off
      LOAD1_DATA32_LANE(v0, 0)
      LOAD1_DATA32_LANE(v0, 1)
@@ -1306,7 +1306,7 @@ void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
      "add         v5.4s, v1.4s, v0.4s           \n"
      "ldr         q18, [%[kIndices]]            \n"
      "1:                                        \n"  //
      SCALE_ARGB_FILTER_COLS_STEP_ADDR
      "ldr         d1, [%6]                      \n"  //
      SCALE_ARGB_FILTER_COLS_STEP_ADDR
@@ -1359,7 +1359,7 @@ void ScaleRowDown2_16_NEON(const uint16_t* src_ptr,
      "subs        %w[dst_width], %w[dst_width], #32 \n"
      "b.lt        2f                            \n"
      "1:                                        \n"
      "ldp         q0, q1, [%[src_ptr]]          \n"
      "ldp         q2, q3, [%[src_ptr], #32]     \n"
      "ldp         q4, q5, [%[src_ptr], #64]     \n"
@@ -1376,7 +1376,7 @@ void ScaleRowDown2_16_NEON(const uint16_t* src_ptr,
      "add         %[dst_ptr], %[dst_ptr], #64   \n"
      "b.ge        1b                            \n"
      "2:                                        \n"
      "adds        %w[dst_width], %w[dst_width], #32 \n"
      "b.eq        99f                           \n"
@@ -1386,7 +1386,7 @@ void ScaleRowDown2_16_NEON(const uint16_t* src_ptr,
      "uzp2        v1.8h, v2.8h, v3.8h           \n"
      "stp         q0, q1, [%[dst_ptr]]          \n"
      "99:                                       \n"
      : [src_ptr] "+r"(src_ptr),     // %[src_ptr]
        [dst_ptr] "+r"(dst),         // %[dst_ptr]
        [dst_width] "+r"(dst_width)  // %[dst_width]
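The subs/b.lt and adds/b.eq pair around this loop is the remainder idiom used throughout the file: pre-subtract one block's width, loop while the count stays non-negative, then restore it and run at most one partial block before the 99 exit label. In C terms, roughly (the uzp2 odd-sample selection is shown as the per-element work; treat it as a sketch):

    #include <stdint.h>

    // Hypothetical sketch of the subs/adds remainder skeleton: run whole
    // 32-output blocks, then handle any leftover without over-running.
    static void BlockLoop_sketch(const uint16_t* src, uint16_t* dst,
                                 int width) {
      int w = width - 32;            // subs: pre-subtract one block
      while (w >= 0) {               // b.ge 1b
        for (int i = 0; i < 32; ++i) {
          dst[i] = src[2 * i + 1];   // uzp2: keep odd-indexed samples
        }
        src += 64;
        dst += 32;
        w -= 32;
      }
      w += 32;                       // adds: restore the remainder count
      for (int i = 0; i < w; ++i) {  // a zero remainder skips to the end
        dst[i] = src[2 * i + 1];
      }
    }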
@@ -1400,7 +1400,7 @@ void ScaleRowDown2Linear_16_NEON(const uint16_t* src_ptr,
                                 int dst_width) {
  (void)src_stride;
  asm volatile(
      "1:                                        \n"
      "ld2         {v0.8h, v1.8h}, [%[src_ptr]], #32 \n"
      "ld2         {v2.8h, v3.8h}, [%[src_ptr]], #32 \n"
      "subs        %w[dst_width], %w[dst_width], #16 \n"
@@ -1424,7 +1424,7 @@ void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr,
  asm volatile(
      // change the stride to row 2 pointer
      "add         %1, %0, %1, lsl #1            \n"  // ptr + stride * 2
      "1:                                        \n"
      "ld1         {v0.8h, v1.8h}, [%0], #32     \n"  // load row 1 and post inc
      "ld1         {v2.8h, v3.8h}, [%1], #32     \n"  // load row 2 and post inc
      "subs        %w3, %w3, #8                  \n"  // 8 processed per loop
@@ -1453,7 +1453,7 @@ void ScaleUVRowDown2_NEON(const uint8_t* src_ptr,
                          int dst_width) {
  (void)src_stride;
  asm volatile(
      "1:                                        \n"
      "ld2         {v0.8h,v1.8h}, [%0], #32      \n"  // load 16 UV
      "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
@@ -1472,7 +1472,7 @@ void ScaleUVRowDown2Linear_NEON(const uint8_t* src_ptr,
                                int dst_width) {
  (void)src_stride;
  asm volatile(
      "1:                                        \n"
      "ld2         {v0.8h,v1.8h}, [%0], #32      \n"  // load 16 UV
      "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
      "urhadd      v0.16b, v0.16b, v1.16b        \n"  // rounding half add
@@ -1493,7 +1493,7 @@ void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr,
  asm volatile(
      // change the stride to row 2 pointer
      "add         %1, %1, %0                    \n"
      "1:                                        \n"
      "ld2         {v0.16b,v1.16b}, [%0], #32    \n"  // load 16 UV
      "subs        %w3, %w3, #8                  \n"  // 8 processed per loop.
      "uaddlp      v0.8h, v0.16b                 \n"  // U 16 bytes -> 8 shorts.
@@ -1526,7 +1526,7 @@ void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr,
  const uint8_t* src3_ptr = src_ptr + src_stepx * 6;
  (void)src_stride;
  asm volatile(
      "1:                                        \n"
      "ld1         {v0.h}[0], [%0], %6           \n"
      "ld1         {v1.h}[0], [%1], %6           \n"
      "ld1         {v2.h}[0], [%2], %6           \n"