Apply format with no code changes

Bug: None
Change-Id: I8923bacb9af7e7d4f13e210c8b3d7ea6b81568a5
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6301086
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Mirko Bonadei <mbonadei@chromium.org>
Author: Frank Barchard <fbarchard@chromium.org>
Date: 2025-02-24 23:22:09 -08:00
Committed-by: libyuv LUCI CQ
parent 61354d2671
commit 3a7e0ba671
12 changed files with 819 additions and 819 deletions
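
For context: every hunk below touches the same kind of line, a GNU extended-asm string literal whose trailing "\n" is padded to a fixed column, so a formatting pass can move whitespace inside the quotes without changing the generated code. A minimal, self-contained sketch of that house style (an illustrative stand-in, not libyuv code: SumBytes_X86 is hypothetical, and LABELALIGN is assumed to expand to a .p2align directive as it does in the libyuv sources):

#include <stddef.h>
#include <stdint.h>

// Assumed definition; aligns the hot-loop label, as in libyuv.
#define LABELALIGN ".p2align 5\n"

// Hypothetical kernel in the libyuv style: one instruction per string
// literal, numeric local labels ("1:" / "1b"), trailing "\n" column-aligned.
// Assumes count > 0, as the real kernels do.
static uint32_t SumBytes_X86(const uint8_t* src, size_t count) {
  uint32_t sum = 0;
#if defined(__x86_64__)
  asm volatile(
      LABELALIGN
      "1:                                        \n"
      "movzbl      (%0),%%eax                    \n"
      "add         %%eax,%2                      \n"
      "lea         1(%0),%0                      \n"
      "sub         $1,%1                         \n"
      "jg          1b                            \n"
      : "+r"(src),    // %0
        "+r"(count),  // %1
        "+r"(sum)     // %2
      :
      : "memory", "cc", "eax");
#else
  while (count-- > 0) sum += *src++;  // portable fallback
#endif
  return sum;
}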


@@ -37,7 +37,7 @@ uint32_t HammingDistance_SSE42(const uint8_t* src_a,
// Process 32 bytes per loop.
LABELALIGN
"1: \n"
"1: \n"
"mov (%0),%%rcx \n"
"mov 0x8(%0),%%rdx \n"
"xor (%1),%%rcx \n"
@@ -80,7 +80,7 @@ uint32_t HammingDistance_SSE42(const uint8_t* src_a,
asm volatile(
// Process 16 bytes per loop.
LABELALIGN
"1: \n"
"1: \n"
"mov (%0),%%ecx \n"
"mov 0x4(%0),%%edx \n"
"xor (%1),%%ecx \n"
@@ -129,7 +129,7 @@ uint32_t HammingDistance_SSSE3(const uint8_t* src_a,
"sub %0,%1 \n"
LABELALIGN
"1: \n"
"1: \n"
"movdqa (%0),%%xmm4 \n"
"movdqa 0x10(%0), %%xmm5 \n"
"pxor (%0,%1), %%xmm4 \n"
@@ -188,7 +188,7 @@ uint32_t HammingDistance_AVX2(const uint8_t* src_a,
"sub %0,%1 \n"
LABELALIGN
"1: \n"
"1: \n"
"vmovdqa (%0),%%ymm4 \n"
"vmovdqa 0x20(%0), %%ymm5 \n"
"vpxor (%0,%1), %%ymm4, %%ymm4 \n"
@@ -217,7 +217,7 @@ uint32_t HammingDistance_AVX2(const uint8_t* src_a,
"vpermq $0xaa,%%ymm0,%%ymm1 \n"
"vpaddd %%ymm1,%%ymm0,%%ymm0 \n"
"vmovd %%xmm0,%3 \n"
"vzeroupper \n"
"vzeroupper \n"
: "+r"(src_a), // %0
"+r"(src_b), // %1
"+r"(count), // %2
@@ -239,7 +239,7 @@ uint32_t SumSquareError_SSE2(const uint8_t* src_a,
"pxor %%xmm5,%%xmm5 \n"
LABELALIGN
"1: \n"
"1: \n"
"movdqu (%0),%%xmm1 \n"
"lea 0x10(%0),%0 \n"
"movdqu (%1),%%xmm2 \n"
@@ -306,7 +306,7 @@ uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) {
"movdqa %4,%%xmm6 \n"
LABELALIGN
"1: \n"
"1: \n"
"movdqu (%0),%%xmm1 \n"
"lea 0x10(%0),%0 \n"
"pmulld %%xmm6,%%xmm0 \n"


@@ -31,7 +31,7 @@ uint32_t HammingDistance_NEON(const uint8_t* src_a,
asm volatile(
"vmov.u16 q4, #0 \n" // accumulator
"1: \n"
"1: \n"
"vld1.8 {q0, q1}, [%0]! \n"
"vld1.8 {q2, q3}, [%1]! \n"
"veor.32 q0, q0, q2 \n"
@@ -64,7 +64,7 @@ uint32_t SumSquareError_NEON(const uint8_t* src_a,
"vmov.u8 q9, #0 \n"
"vmov.u8 q11, #0 \n"
"1: \n"
"1: \n"
"vld1.8 {q0}, [%0]! \n"
"vld1.8 {q1}, [%1]! \n"
"subs %2, %2, #16 \n"


@@ -29,7 +29,7 @@ uint32_t HammingDistance_NEON(const uint8_t* src_a,
asm volatile(
"movi v4.8h, #0 \n"
"1: \n"
"1: \n"
"ld1 {v0.16b, v1.16b}, [%0], #32 \n"
"ld1 {v2.16b, v3.16b}, [%1], #32 \n"
"eor v0.16b, v0.16b, v2.16b \n"
@@ -61,7 +61,7 @@ uint32_t SumSquareError_NEON(const uint8_t* src_a,
"movi v18.16b, #0 \n"
"movi v19.16b, #0 \n"
"1: \n"
"1: \n"
"ld1 {v0.16b}, [%0], #16 \n"
"ld1 {v1.16b}, [%1], #16 \n"
"subs %w2, %w2, #16 \n"
@@ -122,7 +122,7 @@ uint32_t HashDjb2_NEON(const uint8_t* src, int count, uint32_t seed) {
// count is always a multiple of 16.
// maintain two accumulators, reduce and then final sum in scalar since
// this has better performance on little cores.
"1: \n"
"1: \n"
"ldr q0, [%[src]], #16 \n"
"subs %w[count], %w[count], #16 \n"
"tbl v3.16b, {v0.16b}, v19.16b \n"
@@ -162,7 +162,7 @@ uint32_t HammingDistance_NEON_DotProd(const uint8_t* src_a,
"movi v5.4s, #0 \n"
"movi v6.16b, #1 \n"
"1: \n"
"1: \n"
"ldp q0, q1, [%0], #32 \n"
"ldp q2, q3, [%1], #32 \n"
"eor v0.16b, v0.16b, v2.16b \n"
@@ -194,7 +194,7 @@ uint32_t SumSquareError_NEON_DotProd(const uint8_t* src_a,
"movi v4.4s, #0 \n"
"movi v5.4s, #0 \n"
"1: \n"
"1: \n"
"ldp q0, q2, [%0], #32 \n"
"ldp q1, q3, [%1], #32 \n"
"subs %w2, %w2, #32 \n"

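The HashDjb2_NEON hunk above keeps the comment about two accumulators with a scalar final reduction. That strategy works because djb2 (hash = hash * 33 + byte) can be blocked: over 16 bytes, hash' = hash * 33^16 + sum(src[i] * 33^(15-i)), and the per-block sum does not depend on hash, so it can be computed in vector lanes and folded in afterwards. A scalar sketch of that identity with hypothetical names (HashDjb2_Blocked is not a libyuv function):

#include <stddef.h>
#include <stdint.h>

uint32_t HashDjb2_Blocked(const uint8_t* src, size_t count, uint32_t seed) {
  uint32_t pow33[17];  // 33^k mod 2^32, k = 0..16
  pow33[0] = 1;
  for (int k = 1; k <= 16; ++k) pow33[k] = pow33[k - 1] * 33u;
  uint32_t hash = seed;
  size_t i = 0;
  for (; i + 16 <= count; i += 16) {
    uint32_t block = 0;  // hash-independent: the part SIMD lanes compute
    for (int j = 0; j < 16; ++j) block += src[i + j] * pow33[15 - j];
    hash = hash * pow33[16] + block;
  }
  // Tail loop; not taken in libyuv, where count is a multiple of 16.
  for (; i < count; ++i) hash = hash * 33u + src[i];
  return hash;
}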

@@ -30,7 +30,7 @@ void TransposeWx8_SSSE3(const uint8_t* src,
// Read in the data from the source pointer.
// First round of bit swap.
LABELALIGN
"1: \n"
"1: \n"
"movq (%0),%%xmm0 \n"
"movq (%0,%3),%%xmm1 \n"
"lea (%0,%3,2),%0 \n"
@@ -120,7 +120,7 @@ void TransposeWx8_Fast_SSSE3(const uint8_t* src,
// Read in the data from the source pointer.
// First round of bit swap.
LABELALIGN
"1: \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu (%0,%3),%%xmm1 \n"
"lea (%0,%3,2),%0 \n"
@@ -265,7 +265,7 @@ void TransposeUVWx8_SSE2(const uint8_t* src,
// Read in the data from the source pointer.
// First round of bit swap.
LABELALIGN
"1: \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu (%0,%4),%%xmm1 \n"
"lea (%0,%4,2),%0 \n"
@@ -393,7 +393,7 @@ void Transpose4x4_32_SSE2(const uint8_t* src,
int width) {
asm volatile(
// Main loop transpose 4x4. Read a column, write a row.
"1: \n"
"1: \n"
"movdqu (%0),%%xmm0 \n" // a b c d
"movdqu (%0,%3),%%xmm1 \n" // e f g h
"lea (%0,%3,2),%0 \n" // src += stride * 2
@@ -449,7 +449,7 @@ void Transpose4x4_32_AVX2(const uint8_t* src,
int width) {
asm volatile(
// Main loop transpose 2 blocks of 4x4. Read a column, write a row.
"1: \n"
"1: \n"
"vmovdqu (%0),%%xmm0 \n" // a b c d
"vmovdqu (%0,%3),%%xmm1 \n" // e f g h
"lea (%0,%3,2),%0 \n" // src += stride * 2
@@ -484,7 +484,7 @@ void Transpose4x4_32_AVX2(const uint8_t* src,
"sub %4,%1 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
"vzeroupper \n"
"vzeroupper \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+rm"(width) // %2

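The Transpose4x4_32 hunks above describe their main loops as "read a column, write a row". A plain-C model of what one 4x4 block of 32-bit elements computes (an illustrative sketch, not the libyuv implementation; strides are in bytes, matching the surrounding code):

#include <stdint.h>
#include <string.h>

// dst[x][y] = src[y][x] for a 4x4 block of 32-bit values.
static void Transpose4x4_32_Sketch(const uint8_t* src, int src_stride,
                                   uint8_t* dst, int dst_stride) {
  for (int y = 0; y < 4; ++y) {
    for (int x = 0; x < 4; ++x) {
      uint32_t v;
      memcpy(&v, src + y * src_stride + x * 4, sizeof(v));
      memcpy(dst + x * dst_stride + y * 4, &v, sizeof(v));
    }
  }
}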

@@ -33,7 +33,7 @@ void TransposeWx8_NEON(const uint8_t* src,
// at w-8 allow for this
"sub %[width], #8 \n"
"1: \n"
"1: \n"
"mov %[temp], %[src] \n"
"vld1.8 {d0}, [%[temp]], %[src_stride] \n"
"vld1.8 {d1}, [%[temp]], %[src_stride] \n"
@@ -101,7 +101,7 @@ void TransposeUVWx8_NEON(const uint8_t* src,
// at w-8 allow for this
"sub %[width], #8 \n"
"1: \n"
"1: \n"
"mov %[temp], %[src] \n"
"vld2.8 {d0, d1}, [%[temp]], %[src_stride] \n"
"vld2.8 {d2, d3}, [%[temp]], %[src_stride] \n"
@@ -186,7 +186,7 @@ void Transpose4x4_32_NEON(const uint8_t* src,
uint8_t* dst3 = dst2 + dst_stride;
asm volatile(
// Main loop transpose 4x4. Read a column, write a row.
"1: \n"
"1: \n"
"vld4.32 {d0[0], d2[0], d4[0], d6[0]}, [%0], %9 \n"
"vld4.32 {d0[1], d2[1], d4[1], d6[1]}, [%1], %9 \n"
"vld4.32 {d1[0], d3[0], d5[0], d7[0]}, [%2], %9 \n"


@@ -28,7 +28,7 @@ void TransposeWx16_NEON(const uint8_t* src,
int width) {
const uint8_t* src_temp;
asm volatile(
"1: \n"
"1: \n"
"mov %[src_temp], %[src] \n"
"ld1 {v16.16b}, [%[src_temp]], %[src_stride] \n"
@@ -151,7 +151,7 @@ void TransposeUVWx8_NEON(const uint8_t* src,
// at w-8 allow for this
"sub %w[width], %w[width], #8 \n"
"1: \n"
"1: \n"
"mov %[temp], %[src] \n"
"ld1 {v0.16b}, [%[temp]], %[src_stride] \n"
"ld1 {v1.16b}, [%[temp]], %[src_stride] \n"
@@ -241,7 +241,7 @@ void Transpose4x4_32_NEON(const uint8_t* src,
uint8_t* dst3 = dst2 + dst_stride;
asm volatile(
// Main loop transpose 4x4. Read a column, write a row.
"1: \n"
"1: \n"
"ld4 {v0.s, v1.s, v2.s, v3.s}[0], [%0], %9 \n"
"ld4 {v0.s, v1.s, v2.s, v3.s}[1], [%1], %9 \n"
"ld4 {v0.s, v1.s, v2.s, v3.s}[2], [%2], %9 \n"

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -100,7 +100,7 @@ void ScaleRowDown2_SSSE3(const uint8_t* src_ptr,
asm volatile(
// 16 pixel loop.
LABELALIGN
"1: \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
"lea 0x20(%0),%0 \n"
@@ -130,7 +130,7 @@ void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr,
"pxor %%xmm5,%%xmm5 \n"
LABELALIGN
"1: \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
"lea 0x20(%0),%0 \n"
@@ -161,7 +161,7 @@ void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr,
"pxor %%xmm5,%%xmm5 \n"
LABELALIGN
"1: \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
"movdqu 0x00(%0,%3,1),%%xmm2 \n"
@@ -196,7 +196,7 @@ void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
"1: \n"
"1: \n"
"vmovdqu (%0),%%ymm0 \n"
"vmovdqu 0x20(%0),%%ymm1 \n"
"lea 0x40(%0),%0 \n"
@@ -208,7 +208,7 @@ void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
"lea 0x20(%1),%1 \n"
"sub $0x20,%2 \n"
"jg 1b \n"
"vzeroupper \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -228,7 +228,7 @@ void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr,
"vpxor %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
"1: \n"
"1: \n"
"vmovdqu (%0),%%ymm0 \n"
"vmovdqu 0x20(%0),%%ymm1 \n"
"lea 0x40(%0),%0 \n"
@@ -242,7 +242,7 @@ void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr,
"lea 0x20(%1),%1 \n"
"sub $0x20,%2 \n"
"jg 1b \n"
"vzeroupper \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -261,7 +261,7 @@ void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr,
"vpxor %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
"1: \n"
"1: \n"
"vmovdqu (%0),%%ymm0 \n"
"vmovdqu 0x20(%0),%%ymm1 \n"
"vmovdqu 0x00(%0,%3,1),%%ymm2 \n"
@@ -283,7 +283,7 @@ void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr,
"lea 0x20(%1),%1 \n"
"sub $0x20,%2 \n"
"jg 1b \n"
"vzeroupper \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -303,7 +303,7 @@ void ScaleRowDown4_SSSE3(const uint8_t* src_ptr,
"pslld $0x10,%%xmm5 \n"
LABELALIGN
"1: \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
"lea 0x20(%0),%0 \n"
@@ -337,7 +337,7 @@ void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr,
"lea 0x00(%4,%4,2),%3 \n"
LABELALIGN
"1: \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
"movdqu 0x00(%0,%4,1),%%xmm2 \n"
@@ -389,7 +389,7 @@ void ScaleRowDown4_AVX2(const uint8_t* src_ptr,
"vpslld $0x10,%%ymm5,%%ymm5 \n"
LABELALIGN
"1: \n"
"1: \n"
"vmovdqu (%0),%%ymm0 \n"
"vmovdqu 0x20(%0),%%ymm1 \n"
"lea 0x40(%0),%0 \n"
@@ -404,7 +404,7 @@ void ScaleRowDown4_AVX2(const uint8_t* src_ptr,
"lea 0x10(%1),%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
"vzeroupper \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -423,7 +423,7 @@ void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr,
"vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
LABELALIGN
"1: \n"
"1: \n"
"vmovdqu (%0),%%ymm0 \n"
"vmovdqu 0x20(%0),%%ymm1 \n"
"vmovdqu 0x00(%0,%3,1),%%ymm2 \n"
@@ -457,7 +457,7 @@ void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr,
"lea 0x10(%1),%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
"vzeroupper \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -482,7 +482,7 @@ void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
"m"(kShuf2) // %2
);
asm volatile(
"1: \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm2 \n"
"lea 0x20(%0),%0 \n"
@@ -527,7 +527,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
"m"(kRound34) // %2
);
asm volatile(
"1: \n"
"1: \n"
"movdqu (%0),%%xmm6 \n"
"movdqu 0x00(%0,%3,1),%%xmm7 \n"
"pavgb %%xmm7,%%xmm6 \n"
@@ -592,7 +592,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
);
asm volatile(
"1: \n"
"1: \n"
"movdqu (%0),%%xmm6 \n"
"movdqu 0x00(%0,%3,1),%%xmm7 \n"
"pavgb %%xmm6,%%xmm7 \n"
@@ -646,7 +646,7 @@ void ScaleRowDown38_SSSE3(const uint8_t* src_ptr,
"movdqa %4,%%xmm5 \n"
LABELALIGN
"1: \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
"lea 0x20(%0),%0 \n"
@@ -683,7 +683,7 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
"m"(kScaleAb2) // %3
);
asm volatile(
"1: \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x00(%0,%3,1),%%xmm1 \n"
"lea 0x10(%0),%0 \n"
@@ -725,7 +725,7 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
"m"(kScaleAc33) // %2
);
asm volatile(
"1: \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x00(%0,%3,1),%%xmm6 \n"
"movhlps %%xmm0,%%xmm1 \n"
@@ -789,7 +789,7 @@ void ScaleRowUp2_Linear_SSE2(const uint8_t* src_ptr,
"psllw $1,%%xmm6 \n" // all 2
LABELALIGN
"1: \n"
"1: \n"
"movq (%0),%%xmm1 \n" // 01234567
"movq 1(%0),%%xmm2 \n" // 12345678
"movdqa %%xmm1,%%xmm3 \n"
@@ -839,7 +839,7 @@ void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile(
"1: \n"
"1: \n"
"pxor %%xmm0,%%xmm0 \n" // 0
// above line
"movq (%0),%%xmm1 \n" // 01234567
@@ -958,7 +958,7 @@ void ScaleRowUp2_Linear_12_SSSE3(const uint16_t* src_ptr,
"psllw $1,%%xmm4 \n" // all 2
LABELALIGN
"1: \n"
"1: \n"
"movdqu (%0),%%xmm0 \n" // 01234567 (16)
"movdqu 2(%0),%%xmm1 \n" // 12345678 (16)
@@ -1010,7 +1010,7 @@ void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t* src_ptr,
"movdqa %5,%%xmm6 \n"
LABELALIGN
"1: \n"
"1: \n"
// above line
"movdqu (%0),%%xmm0 \n" // 01234567 (16)
"movdqu 2(%0),%%xmm1 \n" // 12345678 (16)
@@ -1108,7 +1108,7 @@ void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr,
"pslld $1,%%xmm4 \n" // all 2
LABELALIGN
"1: \n"
"1: \n"
"movq (%0),%%xmm0 \n" // 0123 (16b)
"movq 2(%0),%%xmm1 \n" // 1234 (16b)
@@ -1161,7 +1161,7 @@ void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr,
"pslld $3,%%xmm6 \n" // all 8
LABELALIGN
"1: \n"
"1: \n"
"movq (%0),%%xmm0 \n" // 0011 (16b, 1u1v)
"movq 4(%0),%%xmm1 \n" // 1122 (16b, 1u1v)
"punpcklwd %%xmm7,%%xmm0 \n" // 0011 (near) (32b, 1u1v)
@@ -1269,7 +1269,7 @@ void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr,
"movdqa %3,%%xmm3 \n"
LABELALIGN
"1: \n"
"1: \n"
"movq (%0),%%xmm0 \n" // 01234567
"movq 1(%0),%%xmm1 \n" // 12345678
"punpcklwd %%xmm0,%%xmm0 \n" // 0101232345456767
@@ -1310,7 +1310,7 @@ void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
"movdqa %5,%%xmm7 \n"
LABELALIGN
"1: \n"
"1: \n"
"movq (%0),%%xmm0 \n" // 01234567
"movq 1(%0),%%xmm1 \n" // 12345678
"punpcklwd %%xmm0,%%xmm0 \n" // 0101232345456767
@@ -1395,7 +1395,7 @@ void ScaleRowUp2_Linear_AVX2(const uint8_t* src_ptr,
"vbroadcastf128 %3,%%ymm3 \n"
LABELALIGN
"1: \n"
"1: \n"
"vmovdqu (%0),%%xmm0 \n" // 0123456789ABCDEF
"vmovdqu 1(%0),%%xmm1 \n" // 123456789ABCDEF0
"vpermq $0b11011000,%%ymm0,%%ymm0 \n"
@@ -1417,7 +1417,7 @@ void ScaleRowUp2_Linear_AVX2(const uint8_t* src_ptr,
"lea 0x20(%1),%1 \n" // 16 sample to 32 sample
"sub $0x20,%2 \n"
"jg 1b \n"
"vzeroupper \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -1439,7 +1439,7 @@ void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
"vbroadcastf128 %5,%%ymm7 \n"
LABELALIGN
"1: \n"
"1: \n"
"vmovdqu (%0),%%xmm0 \n" // 0123456789ABCDEF
"vmovdqu 1(%0),%%xmm1 \n" // 123456789ABCDEF0
"vpermq $0b11011000,%%ymm0,%%ymm0 \n"
@@ -1498,7 +1498,7 @@ void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
"lea 0x20(%1),%1 \n" // 16 sample to 32 sample
"sub $0x20,%2 \n"
"jg 1b \n"
"vzeroupper \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -1521,7 +1521,7 @@ void ScaleRowUp2_Linear_12_AVX2(const uint16_t* src_ptr,
"vpsllw $1,%%ymm4,%%ymm4 \n" // all 2
LABELALIGN
"1: \n"
"1: \n"
"vmovdqu (%0),%%ymm0 \n" // 0123456789ABCDEF (16b)
"vmovdqu 2(%0),%%ymm1 \n" // 123456789ABCDEF0 (16b)
@@ -1551,7 +1551,7 @@ void ScaleRowUp2_Linear_12_AVX2(const uint16_t* src_ptr,
"lea 0x40(%1),%1 \n" // 16 sample to 32 sample
"sub $0x20,%2 \n"
"jg 1b \n"
"vzeroupper \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -1573,7 +1573,7 @@ void ScaleRowUp2_Bilinear_12_AVX2(const uint16_t* src_ptr,
"vpsllw $3,%%ymm4,%%ymm4 \n" // all 8
LABELALIGN
"1: \n"
"1: \n"
"vmovdqu (%0),%%xmm0 \n" // 01234567 (16b)
"vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b)
@@ -1613,7 +1613,7 @@ void ScaleRowUp2_Bilinear_12_AVX2(const uint16_t* src_ptr,
"lea 0x20(%1),%1 \n" // 8 sample to 16 sample
"sub $0x10,%2 \n"
"jg 1b \n"
"vzeroupper \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -1634,7 +1634,7 @@ void ScaleRowUp2_Linear_16_AVX2(const uint16_t* src_ptr,
"vpslld $1,%%ymm4,%%ymm4 \n" // all 2
LABELALIGN
"1: \n"
"1: \n"
"vmovdqu (%0),%%xmm0 \n" // 01234567 (16b, 1u1v)
"vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b, 1u1v)
@@ -1663,7 +1663,7 @@ void ScaleRowUp2_Linear_16_AVX2(const uint16_t* src_ptr,
"lea 0x20(%1),%1 \n" // 8 pixel to 16 pixel
"sub $0x10,%2 \n"
"jg 1b \n"
"vzeroupper \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -1684,7 +1684,7 @@ void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
"vpslld $3,%%ymm6,%%ymm6 \n" // all 8
LABELALIGN
"1: \n"
"1: \n"
"vmovdqu (%0),%%xmm0 \n" // 01234567 (16b, 1u1v)
"vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b, 1u1v)
@@ -1747,7 +1747,7 @@ void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
"lea 0x20(%1),%1 \n" // 8 pixel to 16 pixel
"sub $0x10,%2 \n"
"jg 1b \n"
"vzeroupper \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -1765,7 +1765,7 @@ void ScaleAddRow_SSE2(const uint8_t* src_ptr,
// 16 pixel loop.
LABELALIGN
"1: \n"
"1: \n"
"movdqu (%0),%%xmm3 \n"
"lea 0x10(%0),%0 \n" // src_ptr += 16
"movdqu (%1),%%xmm0 \n"
@@ -1795,7 +1795,7 @@ void ScaleAddRow_AVX2(const uint8_t* src_ptr,
asm volatile("vpxor %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
"1: \n"
"1: \n"
"vmovdqu (%0),%%ymm3 \n"
"lea 0x20(%0),%0 \n" // src_ptr += 32
"vpermq $0xd8,%%ymm3,%%ymm3 \n"
@@ -1808,7 +1808,7 @@ void ScaleAddRow_AVX2(const uint8_t* src_ptr,
"lea 0x40(%1),%1 \n"
"sub $0x20,%2 \n"
"jg 1b \n"
"vzeroupper \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(src_width) // %2
@@ -1854,7 +1854,7 @@ void ScaleFilterCols_SSSE3(uint8_t* dst_ptr,
"pextrw $0x3,%%xmm2,%k4 \n"
LABELALIGN
"2: \n"
"2: \n"
"movdqa %%xmm2,%%xmm1 \n"
"paddd %%xmm3,%%xmm2 \n"
"movzwl 0x00(%1,%3,1),%k2 \n"
@@ -1881,7 +1881,7 @@ void ScaleFilterCols_SSSE3(uint8_t* dst_ptr,
"jge 2b \n"
LABELALIGN
"29: \n"
"29: \n"
"addl $0x1,%5 \n"
"jl 99f \n"
"movzwl 0x00(%1,%3,1),%k2 \n"
@@ -1897,7 +1897,7 @@ void ScaleFilterCols_SSSE3(uint8_t* dst_ptr,
"packuswb %%xmm2,%%xmm2 \n"
"movd %%xmm2,%k2 \n"
"mov %b2,(%0) \n"
"99: \n"
"99: \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"=&a"(temp_pixel), // %2
@@ -1931,7 +1931,7 @@ void ScaleColsUp2_SSE2(uint8_t* dst_ptr,
(void)x;
(void)dx;
asm volatile(
"1: \n"
"1: \n"
"movdqu (%1),%%xmm0 \n"
"lea 0x10(%1),%1 \n"
"movdqa %%xmm0,%%xmm1 \n"
@@ -1956,7 +1956,7 @@ void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
int dst_width) {
(void)src_stride;
asm volatile(
"1: \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
"lea 0x20(%0),%0 \n"
@@ -1978,7 +1978,7 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
int dst_width) {
(void)src_stride;
asm volatile(
"1: \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
"lea 0x20(%0),%0 \n"
@@ -2002,7 +2002,7 @@ void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb,
uint8_t* dst_argb,
int dst_width) {
asm volatile(
"1: \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
"movdqu 0x00(%0,%3,1),%%xmm2 \n"
@@ -2040,7 +2040,7 @@ void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb,
"lea 0x00(%1,%1,2),%4 \n"
LABELALIGN
"1: \n"
"1: \n"
"movd (%0),%%xmm0 \n"
"movd 0x00(%0,%1,1),%%xmm1 \n"
"punpckldq %%xmm1,%%xmm0 \n"
@@ -2078,7 +2078,7 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb,
"lea 0x00(%0,%5,1),%5 \n"
LABELALIGN
"1: \n"
"1: \n"
"movq (%0),%%xmm0 \n"
"movhps 0x00(%0,%1,1),%%xmm0 \n"
"movq 0x00(%0,%1,2),%%xmm1 \n"
@@ -2134,7 +2134,7 @@ void ScaleARGBCols_SSE2(uint8_t* dst_argb,
"jl 49f \n"
LABELALIGN
"40: \n"
"40: \n"
"movd 0x00(%3,%0,4),%%xmm0 \n"
"movd 0x00(%3,%1,4),%%xmm1 \n"
"pextrw $0x5,%%xmm2,%k0 \n"
@@ -2152,7 +2152,7 @@ void ScaleARGBCols_SSE2(uint8_t* dst_argb,
"sub $0x4,%4 \n"
"jge 40b \n"
"49: \n"
"49: \n"
"test $0x2,%4 \n"
"je 29f \n"
"movd 0x00(%3,%0,4),%%xmm0 \n"
@@ -2161,12 +2161,12 @@ void ScaleARGBCols_SSE2(uint8_t* dst_argb,
"punpckldq %%xmm1,%%xmm0 \n"
"movq %%xmm0,(%2) \n"
"lea 0x8(%2),%2 \n"
"29: \n"
"29: \n"
"test $0x1,%4 \n"
"je 99f \n"
"movd 0x00(%3,%0,4),%%xmm0 \n"
"movd %%xmm0,(%2) \n"
"99: \n"
"99: \n"
: "=&a"(x0), // %0
"=&d"(x1), // %1
"+r"(dst_argb), // %2
@@ -2187,7 +2187,7 @@ void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb,
(void)x;
(void)dx;
asm volatile(
"1: \n"
"1: \n"
"movdqu (%1),%%xmm0 \n"
"lea 0x10(%1),%1 \n"
"movdqa %%xmm0,%%xmm1 \n"
@@ -2248,7 +2248,7 @@ void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
"pextrw $0x3,%%xmm2,%k4 \n"
LABELALIGN
"2: \n"
"2: \n"
"movdqa %%xmm2,%%xmm1 \n"
"paddd %%xmm3,%%xmm2 \n"
"movq 0x00(%1,%3,4),%%xmm0 \n"
@@ -2268,7 +2268,7 @@ void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
"jge 2b \n"
LABELALIGN
"29: \n"
"29: \n"
"add $0x1,%2 \n"
"jl 99f \n"
"psrlw $0x9,%%xmm2 \n"
@@ -2281,7 +2281,7 @@ void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
"packuswb %%xmm0,%%xmm0 \n"
"movd %%xmm0,(%0) \n"
LABELALIGN "99: \n"
: "+r"(dst_argb), // %0
"+r"(src_argb), // %1
@@ -2296,7 +2296,7 @@ void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
// Divide num by div and return as 16.16 fixed point result.
int FixedDiv_X86(int num, int div) {
asm volatile(
"cdq \n"
"cdq \n"
"shld $0x10,%%eax,%%edx \n"
"shl $0x10,%%eax \n"
"idiv %1 \n"
@@ -2310,7 +2310,7 @@ int FixedDiv_X86(int num, int div) {
// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
int FixedDiv1_X86(int num, int div) {
asm volatile(
"cdq \n"
"cdq \n"
"shld $0x10,%%eax,%%edx \n"
"shl $0x10,%%eax \n"
"sub $0x10001,%%eax \n"
@@ -2350,7 +2350,7 @@ void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr,
"movdqa %5,%%xmm3 \n" // merge shuffler
LABELALIGN
"1: \n"
"1: \n"
"movdqu (%0),%%xmm0 \n" // 8 UV row 0
"movdqu 0x00(%0,%3,1),%%xmm2 \n" // 8 UV row 1
"lea 0x10(%0),%0 \n"
@@ -2390,7 +2390,7 @@ void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr,
"vbroadcastf128 %5,%%ymm3 \n" // merge shuffler
LABELALIGN
"1: \n"
"1: \n"
"vmovdqu (%0),%%ymm0 \n" // 16 UV row 0
"vmovdqu 0x00(%0,%3,1),%%ymm2 \n" // 16 UV row 1
"lea 0x20(%0),%0 \n"
@@ -2407,7 +2407,7 @@ void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr,
"lea 0x10(%1),%1 \n" // 8 UV
"sub $0x8,%2 \n"
"jg 1b \n"
"vzeroupper \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -2432,7 +2432,7 @@ void ScaleUVRowUp2_Linear_SSSE3(const uint8_t* src_ptr,
"movdqa %3,%%xmm3 \n"
LABELALIGN
"1: \n"
"1: \n"
"movq (%0),%%xmm0 \n" // 00112233 (1u1v)
"movq 2(%0),%%xmm1 \n" // 11223344 (1u1v)
"punpcklbw %%xmm1,%%xmm0 \n" // 0101121223233434 (2u2v)
@@ -2473,7 +2473,7 @@ void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
"movdqa %5,%%xmm7 \n"
LABELALIGN
"1: \n"
"1: \n"
"movq (%0),%%xmm0 \n" // 00112233 (1u1v)
"movq 2(%0),%%xmm1 \n" // 11223344 (1u1v)
"punpcklbw %%xmm1,%%xmm0 \n" // 0101121223233434 (2u2v)
@@ -2557,7 +2557,7 @@ void ScaleUVRowUp2_Linear_AVX2(const uint8_t* src_ptr,
"vbroadcastf128 %3,%%ymm3 \n"
LABELALIGN
"1: \n"
"1: \n"
"vmovdqu (%0),%%xmm0 \n"
"vmovdqu 2(%0),%%xmm1 \n"
"vpermq $0b11011000,%%ymm0,%%ymm0 \n"
@@ -2578,7 +2578,7 @@ void ScaleUVRowUp2_Linear_AVX2(const uint8_t* src_ptr,
"lea 0x20(%1),%1 \n" // 8 uv to 16 uv
"sub $0x10,%2 \n"
"jg 1b \n"
"vzeroupper \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -2600,7 +2600,7 @@ void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
"vbroadcastf128 %5,%%ymm7 \n"
LABELALIGN
"1: \n"
"1: \n"
"vmovdqu (%0),%%xmm0 \n"
"vmovdqu 2(%0),%%xmm1 \n"
"vpermq $0b11011000,%%ymm0,%%ymm0 \n"
@@ -2657,7 +2657,7 @@ void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
"lea 0x20(%1),%1 \n" // 8 uv to 16 uv
"sub $0x10,%2 \n"
"jg 1b \n"
"vzeroupper \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -2680,7 +2680,7 @@ void ScaleUVRowUp2_Linear_16_SSE41(const uint16_t* src_ptr,
"pslld $1,%%xmm4 \n" // all 2
LABELALIGN
"1: \n"
"1: \n"
"movq (%0),%%xmm0 \n" // 0011 (16b, 1u1v)
"movq 4(%0),%%xmm1 \n" // 1122 (16b, 1u1v)
@@ -2732,7 +2732,7 @@ void ScaleUVRowUp2_Bilinear_16_SSE41(const uint16_t* src_ptr,
"pslld $3,%%xmm6 \n" // all 8
LABELALIGN
"1: \n"
"1: \n"
"movq (%0),%%xmm0 \n" // 0011 (16b, 1u1v)
"movq 4(%0),%%xmm1 \n" // 1122 (16b, 1u1v)
"punpcklwd %%xmm7,%%xmm0 \n" // 0011 (near) (32b, 1u1v)
@@ -2822,7 +2822,7 @@ void ScaleUVRowUp2_Linear_16_AVX2(const uint16_t* src_ptr,
"vpslld $1,%%ymm4,%%ymm4 \n" // all 2
LABELALIGN
"1: \n"
"1: \n"
"vmovdqu (%0),%%xmm0 \n" // 00112233 (16b, 1u1v)
"vmovdqu 4(%0),%%xmm1 \n" // 11223344 (16b, 1u1v)
@@ -2850,7 +2850,7 @@ void ScaleUVRowUp2_Linear_16_AVX2(const uint16_t* src_ptr,
"lea 0x20(%1),%1 \n" // 4 uv to 8 uv
"sub $0x8,%2 \n"
"jg 1b \n"
"vzeroupper \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -2871,7 +2871,7 @@ void ScaleUVRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
"vpslld $3,%%ymm6,%%ymm6 \n" // all 8
LABELALIGN
"1: \n"
"1: \n"
"vmovdqu (%0),%%xmm0 \n" // 00112233 (16b, 1u1v)
"vmovdqu 4(%0),%%xmm1 \n" // 11223344 (16b, 1u1v)
@@ -2932,7 +2932,7 @@ void ScaleUVRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
"lea 0x20(%1),%1 \n" // 4 uv to 8 uv
"sub $0x8,%2 \n"
"jg 1b \n"
"vzeroupper \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2

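A recurring pattern in the ScaleRowUp2_Linear kernels above is a register of all 3s plus a rounding constant of 2 (the "all 2" comments): a 2x linear upsample in which each output sample is a 3:1 blend of its two nearest inputs. A scalar sketch under that reading (hypothetical name; the real kernels' edge and tail handling is omitted):

#include <stdint.h>

// dst[2*i]   = (3 * src[i] + src[i+1] + 2) >> 2
// dst[2*i+1] = (src[i] + 3 * src[i+1] + 2) >> 2
// Reads one sample past the last input used, so the caller must provide it.
static void ScaleRowUp2_Linear_Sketch(const uint8_t* src, uint8_t* dst,
                                      int dst_width) {
  for (int i = 0; i < dst_width / 2; ++i) {
    dst[2 * i + 0] = (uint8_t)((3 * src[i] + src[i + 1] + 2) >> 2);
    dst[2 * i + 1] = (uint8_t)((src[i] + 3 * src[i + 1] + 2) >> 2);
  }
}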

@@ -30,7 +30,7 @@ void ScaleRowDown2_NEON(const uint8_t* src_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
"1: \n"
"1: \n"
// load even pixels into q0, odd into q1
"vld2.8 {q0, q1}, [%0]! \n"
"subs %2, %2, #16 \n" // 16 processed per loop
@@ -51,7 +51,7 @@ void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
"1: \n"
"1: \n"
"vld2.8 {q0, q1}, [%0]! \n" // load 32 pixels
"subs %2, %2, #16 \n" // 16 processed per loop
"vrhadd.u8 q0, q0, q1 \n" // rounding half add
@@ -73,7 +73,7 @@ void ScaleRowDown2Box_NEON(const uint8_t* src_ptr,
asm volatile(
// change the stride to row 2 pointer
"add %1, %0 \n"
"1: \n"
"1: \n"
"vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc
"vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc
"subs %3, %3, #16 \n" // 16 processed per loop
@@ -102,7 +102,7 @@ void ScaleRowDown4_NEON(const uint8_t* src_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
"1: \n"
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"subs %2, %2, #8 \n" // 8 processed per loop
"vst1.8 {d2}, [%1]! \n"
@@ -122,7 +122,7 @@ void ScaleRowDown4Box_NEON(const uint8_t* src_ptr,
const uint8_t* src_ptr2 = src_ptr + src_stride * 2;
const uint8_t* src_ptr3 = src_ptr + src_stride * 3;
asm volatile(
"1: \n"
"1: \n"
"vld1.8 {q0}, [%0]! \n" // load up 16x4
"vld1.8 {q1}, [%3]! \n"
"vld1.8 {q2}, [%4]! \n"
@@ -156,7 +156,7 @@ void ScaleRowDown34_NEON(const uint8_t* src_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
"1: \n"
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"subs %2, %2, #24 \n"
"vmov d2, d3 \n" // order d0, d1, d2
@@ -176,7 +176,7 @@ void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr,
asm volatile(
"vmov.u8 d24, #3 \n"
"add %3, %0 \n"
"1: \n"
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
"subs %2, %2, #24 \n"
@@ -233,7 +233,7 @@ void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr,
asm volatile(
"vmov.u8 d24, #3 \n"
"add %3, %0 \n"
"1: \n"
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
"subs %2, %2, #24 \n"
@@ -284,7 +284,7 @@ void ScaleRowDown38_NEON(const uint8_t* src_ptr,
(void)src_stride;
asm volatile(
"vld1.8 {q3}, [%3] \n"
"1: \n"
"1: \n"
"vld1.8 {d0, d1, d2, d3}, [%0]! \n"
"subs %2, %2, #12 \n"
"vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n"
@@ -311,7 +311,7 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
"vld1.8 {q14}, [%6] \n"
"vld1.8 {q15}, [%7] \n"
"add %3, %0 \n"
"1: \n"
"1: \n"
// d0 = 00 40 01 41 02 42 03 43
// d1 = 10 50 11 51 12 52 13 53
@@ -420,7 +420,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
"vld1.16 {q13}, [%4] \n"
"vld1.8 {q14}, [%5] \n"
"add %3, %0 \n"
"1: \n"
"1: \n"
// d0 = 00 40 01 41 02 42 03 43
// d1 = 10 50 11 51 12 52 13 53
@@ -512,7 +512,7 @@ void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr,
asm volatile(
"vmov.u8 d30, #3 \n"
"1: \n"
"1: \n"
"vld1.8 {d4}, [%0]! \n" // 01234567
"vld1.8 {d5}, [%3]! \n" // 12345678
@@ -550,7 +550,7 @@ void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
"vmov.u16 q15, #3 \n"
"vmov.u8 d28, #3 \n"
"1: \n"
"1: \n"
"vld1.8 {d4}, [%0]! \n" // 01234567
"vld1.8 {d5}, [%5]! \n" // 12345678
@@ -611,7 +611,7 @@ void ScaleRowUp2_Linear_12_NEON(const uint16_t* src_ptr,
asm volatile(
"vmov.u16 q15, #3 \n"
"1: \n"
"1: \n"
"vld1.16 {q1}, [%0]! \n" // 01234567 (16b)
"vld1.16 {q0}, [%3]! \n" // 12345678 (16b)
@@ -647,7 +647,7 @@ void ScaleRowUp2_Bilinear_12_NEON(const uint16_t* src_ptr,
asm volatile(
"vmov.u16 q15, #3 \n"
"1: \n"
"1: \n"
"vld1.16 {q0}, [%0]! \n" // 01234567 (16b)
"vld1.16 {q1}, [%5]! \n" // 12345678 (16b)
@@ -698,7 +698,7 @@ void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
asm volatile(
"vmov.u16 d31, #3 \n"
"1: \n"
"1: \n"
"vld1.16 {q0}, [%0]! \n" // 01234567 (16b)
"vld1.16 {q1}, [%3]! \n" // 12345678 (16b)
@@ -743,7 +743,7 @@ void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
"vmov.u16 d31, #3 \n"
"vmov.u32 q14, #3 \n"
"1: \n"
"1: \n"
"vld1.16 {d0}, [%0]! \n" // 0123 (16b)
"vld1.16 {d1}, [%5]! \n" // 1234 (16b)
"vmovl.u16 q2, d0 \n" // 0123 (32b)
@@ -794,7 +794,7 @@ void ScaleUVRowUp2_Linear_NEON(const uint8_t* src_ptr,
asm volatile(
"vmov.u8 d30, #3 \n"
"1: \n"
"1: \n"
"vld1.8 {d4}, [%0]! \n" // 00112233 (1u1v)
"vld1.8 {d5}, [%3]! \n" // 11223344 (1u1v)
@@ -832,7 +832,7 @@ void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
"vmov.u16 q15, #3 \n"
"vmov.u8 d28, #3 \n"
"1: \n"
"1: \n"
"vld1.8 {d4}, [%0]! \n" // 00112233 (1u1v)
"vld1.8 {d5}, [%5]! \n" // 11223344 (1u1v)
@@ -893,7 +893,7 @@ void ScaleUVRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
asm volatile(
"vmov.u16 d30, #3 \n"
"1: \n"
"1: \n"
"vld1.16 {q0}, [%0]! \n" // 00112233 (1u1v, 16)
"vld1.16 {q1}, [%3]! \n" // 11223344 (1u1v, 16)
@@ -939,7 +939,7 @@ void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
"vmov.u16 d30, #3 \n"
"vmov.u32 q14, #3 \n"
"1: \n"
"1: \n"
"vld1.8 {d0}, [%0]! \n" // 0011 (1u1v)
"vld1.8 {d1}, [%5]! \n" // 1122 (1u1v)
"vmovl.u16 q2, d0 \n" // 0011 (1u1v, 32b)
@@ -989,7 +989,7 @@ void ScaleAddRow_NEON(const uint8_t* src_ptr,
uint16_t* dst_ptr,
int src_width) {
asm volatile(
"1: \n"
"1: \n"
"vld1.16 {q1, q2}, [%1] \n" // load accumulator
"vld1.8 {q0}, [%0]! \n" // load 16 bytes
"vaddw.u8 q2, q2, d1 \n" // add
@@ -1036,7 +1036,7 @@ void ScaleFilterCols_NEON(uint8_t* dst_ptr,
// x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
"vadd.s32 q2, q1, q3 \n"
"vshl.i32 q0, q3, #1 \n" // 8 * dx
"1: \n"
"1: \n"
LOAD2_DATA8_LANE(0)
LOAD2_DATA8_LANE(1)
LOAD2_DATA8_LANE(2)
@@ -1087,7 +1087,7 @@ void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
"1: \n"
"1: \n"
"vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
"vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
"subs %2, %2, #8 \n" // 8 processed per loop
@@ -1115,7 +1115,7 @@ void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb,
int dst_width) {
(void)src_stride;
asm volatile(
"1: \n"
"1: \n"
"vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
"vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
"subs %2, %2, #8 \n" // 8 processed per loop
@@ -1138,7 +1138,7 @@ void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
asm volatile(
// change the stride to row 2 pointer
"add %1, %1, %0 \n"
"1: \n"
"1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
"subs %3, %3, #8 \n" // 8 processed per loop.
@@ -1176,7 +1176,7 @@ void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb,
(void)src_stride;
asm volatile(
"mov r12, %3, lsl #2 \n"
"1: \n"
"1: \n"
"vld1.32 {d0[0]}, [%0], r12 \n"
"vld1.32 {d0[1]}, [%0], r12 \n"
"vld1.32 {d1[0]}, [%0], r12 \n"
@@ -1201,7 +1201,7 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb,
asm volatile(
"mov r12, %4, lsl #2 \n"
"add %1, %1, %0 \n"
"1: \n"
"1: \n"
"vld1.8 {d0}, [%0], r12 \n" // 4 2x2 blocks -> 2x1
"vld1.8 {d1}, [%1], r12 \n"
"vld1.8 {d2}, [%0], r12 \n"
@@ -1247,7 +1247,7 @@ void ScaleARGBCols_NEON(uint8_t* dst_argb,
int tmp;
const uint8_t* src_tmp = src_argb;
asm volatile(
"1: \n"
"1: \n"
// clang-format off
LOAD1_DATA32_LANE(d0, 0)
LOAD1_DATA32_LANE(d0, 1)
@@ -1300,7 +1300,7 @@ void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
"vmov.i16 q15, #0x7f \n" // 0x7F
// x , x + 1 * dx, x + 2 * dx, x + 3 * dx
"vadd.s32 q8, q1, q0 \n"
"1: \n"
"1: \n"
// d0, d1: a
// d2, d3: b
LOAD2_DATA32_LANE(d0, d2, 0)
@@ -1350,7 +1350,7 @@ void ScaleUVRowDown2_NEON(const uint8_t* src_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
"1: \n"
"1: \n"
"vld2.16 {d0, d2}, [%0]! \n" // load 8 UV pixels.
"vld2.16 {d1, d3}, [%0]! \n" // load next 8 UV
"subs %2, %2, #8 \n" // 8 processed per loop.
@@ -1369,7 +1369,7 @@ void ScaleUVRowDown2Linear_NEON(const uint8_t* src_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
"1: \n"
"1: \n"
"vld2.16 {d0, d2}, [%0]! \n" // load 8 UV pixels.
"vld2.16 {d1, d3}, [%0]! \n" // load next 8 UV
"subs %2, %2, #8 \n" // 8 processed per loop.
@@ -1390,7 +1390,7 @@ void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr,
asm volatile(
// change the stride to row 2 pointer
"add %1, %1, %0 \n"
"1: \n"
"1: \n"
"vld2.8 {d0, d2}, [%0]! \n" // load 8 UV pixels.
"vld2.8 {d1, d3}, [%0]! \n" // load next 8 UV
"subs %3, %3, #8 \n" // 8 processed per loop.
@@ -1423,7 +1423,7 @@ void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr,
const uint8_t* src3_ptr = src_ptr + src_stepx * 6;
(void)src_stride;
asm volatile(
"1: \n"
"1: \n"
"vld1.16 {d0[0]}, [%0], %6 \n"
"vld1.16 {d0[1]}, [%1], %6 \n"
"vld1.16 {d0[2]}, [%2], %6 \n"


@@ -27,7 +27,7 @@ void ScaleRowDown2_NEON(const uint8_t* src_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
"1: \n"
"1: \n"
// load even pixels into v0, odd into v1
"ld2 {v0.16b,v1.16b}, [%0], #32 \n"
"subs %w2, %w2, #16 \n" // 16 processed per loop
@@ -49,7 +49,7 @@ void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
"1: \n"
"1: \n"
// load even pixels into v0, odd into v1
"ld2 {v0.16b,v1.16b}, [%0], #32 \n"
"subs %w2, %w2, #16 \n" // 16 processed per loop
@@ -73,7 +73,7 @@ void ScaleRowDown2Box_NEON(const uint8_t* src_ptr,
asm volatile(
// change the stride to row 2 pointer
"add %1, %1, %0 \n"
"1: \n"
"1: \n"
"ld1 {v0.16b, v1.16b}, [%0], #32 \n" // load row 1 and post inc
"ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc
"subs %w3, %w3, #16 \n" // 16 processed per loop
@@ -102,7 +102,7 @@ void ScaleRowDown4_NEON(const uint8_t* src_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
"1: \n"
"1: \n"
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // src line 0
"subs %w2, %w2, #16 \n" // 16 processed per loop
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
@@ -123,7 +123,7 @@ void ScaleRowDown4Box_NEON(const uint8_t* src_ptr,
const uint8_t* src_ptr2 = src_ptr + src_stride * 2;
const uint8_t* src_ptr3 = src_ptr + src_stride * 3;
asm volatile(
"1: \n"
"1: \n"
"ldp q0, q4, [%0], #32 \n" // load up 16x8
"ldp q1, q5, [%2], #32 \n"
"ldp q2, q6, [%3], #32 \n"
@@ -175,7 +175,7 @@ void ScaleRowDown34_NEON(const uint8_t* src_ptr,
"ld1 {v29.16b}, [%[kShuf34_0]] \n"
"ld1 {v30.16b}, [%[kShuf34_1]] \n"
"ld1 {v31.16b}, [%[kShuf34_2]] \n"
"1: \n"
"1: \n"
"ld1 {v0.16b,v1.16b,v2.16b,v3.16b}, [%[src_ptr]], #64 \n"
"subs %w[width], %w[width], #48 \n"
"tbl v0.16b, {v0.16b, v1.16b}, v29.16b \n"
@@ -201,7 +201,7 @@ void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr,
"movi v24.16b, #3 \n"
"add %3, %3, %0 \n"
"1: \n"
"1: \n"
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // src line 0
"ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%3], #64 \n" // src line 1
"subs %w2, %w2, #48 \n"
@@ -279,7 +279,7 @@ void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr,
"movi v20.16b, #3 \n"
"add %3, %3, %0 \n"
"1: \n"
"1: \n"
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // src line 0
"ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%3], #64 \n" // src line 1
"subs %w2, %w2, #48 \n"
@@ -339,7 +339,7 @@ void ScaleRowDown38_NEON(const uint8_t* src_ptr,
"subs %w[width], %w[width], #12 \n"
"b.eq 2f \n"
"1: \n"
"1: \n"
"ldp q0, q1, [%[src_ptr]], #32 \n"
"subs %w[width], %w[width], #12 \n"
"tbl v2.16b, {v0.16b, v1.16b}, v3.16b \n"
@@ -350,7 +350,7 @@ void ScaleRowDown38_NEON(const uint8_t* src_ptr,
// Store exactly 12 bytes on the final iteration to avoid writing past
// the end of the array.
"2: \n"
"2: \n"
"ldp q0, q1, [%[src_ptr]] \n"
"tbl v2.16b, {v0.16b, v1.16b}, v3.16b \n"
"st1 {v2.8b}, [%[dst_ptr]], #8 \n"
@@ -384,7 +384,7 @@ void ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
"ld1 {v31.16b}, [%[tblArray4]] \n"
"ld1 {v30.16b}, [%[div996]] \n"
"1: \n"
"1: \n"
"ldp q20, q0, [%[src_ptr]], #32 \n"
"ldp q21, q1, [%[src_ptr1]], #32 \n"
"ldp q22, q2, [%[src_ptr2]], #32 \n"
@@ -451,7 +451,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
"ld1 {v31.16b}, [%[tblArray3]] \n"
"ld1 {v30.8h}, [%[div664]] \n"
"1: \n"
"1: \n"
"ldp q20, q0, [%[src_ptr]], #32 \n" // abcdefgh ...
"ldp q21, q1, [%[src_ptr1]], #32 \n" // ijklmnop ...
"subs %w[width], %w[width], #12 \n"
@@ -500,7 +500,7 @@ void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr,
asm volatile(
"movi v31.16b, #3 \n"
"1: \n"
"1: \n"
"ldr q0, [%0], #16 \n" // 0123456789abcdef
"ldr q1, [%1], #16 \n" // 123456789abcdefg
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
@@ -547,7 +547,7 @@ void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
"movi v31.8b, #3 \n"
"movi v30.8h, #3 \n"
"1: \n"
"1: \n"
"ldr d0, [%0], #8 \n" // 01234567
"ldr d1, [%2], #8 \n" // 12345678
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
@@ -602,7 +602,7 @@ void ScaleRowUp2_Linear_12_NEON(const uint16_t* src_ptr,
asm volatile(
"movi v31.8h, #3 \n"
"1: \n"
"1: \n"
"ld1 {v0.8h}, [%0], #16 \n" // 01234567 (16b)
"ld1 {v1.8h}, [%1], #16 \n" // 12345678 (16b)
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
@@ -639,7 +639,7 @@ void ScaleRowUp2_Bilinear_12_NEON(const uint16_t* src_ptr,
asm volatile(
"movi v31.8h, #3 \n"
"1: \n"
"1: \n"
"ld1 {v2.8h}, [%0], #16 \n" // 01234567 (16b)
"ld1 {v3.8h}, [%2], #16 \n" // 12345678 (16b)
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
@@ -693,7 +693,7 @@ void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
asm volatile(
"movi v31.8h, #3 \n"
"1: \n"
"1: \n"
"ld1 {v0.8h}, [%0], #16 \n" // 01234567 (16b)
"ld1 {v1.8h}, [%1], #16 \n" // 12345678 (16b)
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
@@ -739,7 +739,7 @@ void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
"movi v31.4h, #3 \n"
"movi v30.4s, #3 \n"
"1: \n"
"1: \n"
"ldr d0, [%0], #8 \n" // 0123 (16b)
"ldr d1, [%2], #8 \n" // 1234 (16b)
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
@@ -793,7 +793,7 @@ void ScaleUVRowUp2_Linear_NEON(const uint8_t* src_ptr,
asm volatile(
"movi v31.8b, #3 \n"
"1: \n"
"1: \n"
"ldr d0, [%0], #8 \n" // 00112233 (1u1v)
"ldr d1, [%1], #8 \n" // 11223344 (1u1v)
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
@@ -833,7 +833,7 @@ void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
"movi v31.8b, #3 \n"
"movi v30.8h, #3 \n"
"1: \n"
"1: \n"
"ldr d0, [%0], #8 \n"
"ldr d1, [%2], #8 \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
@@ -888,7 +888,7 @@ void ScaleUVRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
asm volatile(
"movi v31.8h, #3 \n"
"1: \n"
"1: \n"
"ld1 {v0.8h}, [%0], #16 \n" // 01234567 (16b)
"ld1 {v1.8h}, [%1], #16 \n" // 12345678 (16b)
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
@@ -936,7 +936,7 @@ void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
"movi v31.4h, #3 \n"
"movi v30.4s, #3 \n"
"1: \n"
"1: \n"
"ldr d0, [%0], #8 \n"
"ldr d1, [%2], #8 \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
@@ -988,7 +988,7 @@ void ScaleAddRow_NEON(const uint8_t* src_ptr,
uint16_t* dst_ptr,
int src_width) {
asm volatile(
"1: \n"
"1: \n"
"ld1 {v1.8h, v2.8h}, [%1] \n" // load accumulator
"ld1 {v0.16b}, [%0], #16 \n" // load 16 bytes
"uaddw2 v2.8h, v2.8h, v0.16b \n" // add
@@ -1042,7 +1042,7 @@ void ScaleFilterCols_NEON(uint8_t* dst_ptr,
"trn1 v20.8h, v1.8h, v0.8h \n"
"trn1 v21.8h, v2.8h, v0.8h \n"
"1: \n" SCALE_FILTER_COLS_STEP_ADDR
"1: \n" SCALE_FILTER_COLS_STEP_ADDR
"ldr h6, [%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR
"ld1 {v6.h}[1], [%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR
"ld1 {v6.h}[2], [%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR
@@ -1090,7 +1090,7 @@ void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
"1: \n"
"1: \n"
"ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[src]], #64 \n"
"subs %w[width], %w[width], #8 \n"
"prfm pldl1keep, [%[src], 448] \n"
@@ -1112,7 +1112,7 @@ void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb,
(void)src_stride;
const uint8_t* src_argb1 = src_argb + 32;
asm volatile(
"1: \n"
"1: \n"
"ld2 {v0.4s, v1.4s}, [%[src]] \n"
"add %[src], %[src], #64 \n"
"ld2 {v2.4s, v3.4s}, [%[src1]] \n"
@@ -1136,7 +1136,7 @@ void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
int dst_width) {
const uint8_t* src_ptr1 = src_ptr + src_stride;
asm volatile(
"1: \n"
"1: \n"
"ld2 {v0.4s, v1.4s}, [%[src]], #32 \n"
"ld2 {v20.4s, v21.4s}, [%[src1]], #32 \n"
"uaddl v2.8h, v0.8b, v1.8b \n"
@@ -1167,7 +1167,7 @@ void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb,
int64_t i = 0;
(void)src_stride;
asm volatile(
"1: \n"
"1: \n"
"ldr w10, [%[src], %[i]] \n"
"ldr w11, [%[src1], %[i]] \n"
"ldr w12, [%[src2], %[i]] \n"
@@ -1196,7 +1196,7 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb,
int dst_width) {
asm volatile(
"add %1, %1, %0 \n"
"1: \n"
"1: \n"
"ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 -> 2x1
"ld1 {v1.8b}, [%1], %4 \n"
"ld1 {v2.8b}, [%0], %4 \n"
@@ -1248,7 +1248,7 @@ void ScaleARGBCols_NEON(uint8_t* dst_argb,
int64_t dx64 = (int64_t)dx; // NOLINT
int64_t tmp64;
asm volatile (
"1: \n"
"1: \n"
// clang-format off
LOAD1_DATA32_LANE(v0, 0)
LOAD1_DATA32_LANE(v0, 1)
@@ -1306,7 +1306,7 @@ void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
"add v5.4s, v1.4s, v0.4s \n"
"ldr q18, [%[kIndices]] \n"
"1: \n" //
"1: \n" //
SCALE_ARGB_FILTER_COLS_STEP_ADDR
"ldr d1, [%6] \n" //
SCALE_ARGB_FILTER_COLS_STEP_ADDR
@@ -1359,7 +1359,7 @@ void ScaleRowDown2_16_NEON(const uint16_t* src_ptr,
"subs %w[dst_width], %w[dst_width], #32 \n"
"b.lt 2f \n"
"1: \n"
"1: \n"
"ldp q0, q1, [%[src_ptr]] \n"
"ldp q2, q3, [%[src_ptr], #32] \n"
"ldp q4, q5, [%[src_ptr], #64] \n"
@@ -1376,7 +1376,7 @@ void ScaleRowDown2_16_NEON(const uint16_t* src_ptr,
"add %[dst_ptr], %[dst_ptr], #64 \n"
"b.ge 1b \n"
"2: \n"
"2: \n"
"adds %w[dst_width], %w[dst_width], #32 \n"
"b.eq 99f \n"
@@ -1386,7 +1386,7 @@ void ScaleRowDown2_16_NEON(const uint16_t* src_ptr,
"uzp2 v1.8h, v2.8h, v3.8h \n"
"stp q0, q1, [%[dst_ptr]] \n"
"99: \n"
"99: \n"
: [src_ptr] "+r"(src_ptr), // %[src_ptr]
[dst_ptr] "+r"(dst), // %[dst_ptr]
[dst_width] "+r"(dst_width) // %[dst_width]
@@ -1400,7 +1400,7 @@ void ScaleRowDown2Linear_16_NEON(const uint16_t* src_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
"1: \n"
"1: \n"
"ld2 {v0.8h, v1.8h}, [%[src_ptr]], #32 \n"
"ld2 {v2.8h, v3.8h}, [%[src_ptr]], #32 \n"
"subs %w[dst_width], %w[dst_width], #16 \n"
@@ -1424,7 +1424,7 @@ void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr,
asm volatile(
// change the stride to row 2 pointer
"add %1, %0, %1, lsl #1 \n" // ptr + stide * 2
"1: \n"
"1: \n"
"ld1 {v0.8h, v1.8h}, [%0], #32 \n" // load row 1 and post inc
"ld1 {v2.8h, v3.8h}, [%1], #32 \n" // load row 2 and post inc
"subs %w3, %w3, #8 \n" // 8 processed per loop
@@ -1453,7 +1453,7 @@ void ScaleUVRowDown2_NEON(const uint8_t* src_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
"1: \n"
"1: \n"
"ld2 {v0.8h,v1.8h}, [%0], #32 \n" // load 16 UV
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
@@ -1472,7 +1472,7 @@ void ScaleUVRowDown2Linear_NEON(const uint8_t* src_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
"1: \n"
"1: \n"
"ld2 {v0.8h,v1.8h}, [%0], #32 \n" // load 16 UV
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add
@@ -1493,7 +1493,7 @@ void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr,
asm volatile(
// change the stride to row 2 pointer
"add %1, %1, %0 \n"
"1: \n"
"1: \n"
"ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 UV
"subs %w3, %w3, #8 \n" // 8 processed per loop.
"uaddlp v0.8h, v0.16b \n" // U 16 bytes -> 8 shorts.
@@ -1526,7 +1526,7 @@ void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr,
const uint8_t* src3_ptr = src_ptr + src_stepx * 6;
(void)src_stride;
asm volatile(
"1: \n"
"1: \n"
"ld1 {v0.h}[0], [%0], %6 \n"
"ld1 {v1.h}[0], [%1], %6 \n"
"ld1 {v2.h}[0], [%2], %6 \n"