From 3a7e0ba6718cde11afe461fd0d153a1a79b77ca7 Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Mon, 24 Feb 2025 23:22:09 -0800 Subject: [PATCH] Apply format with no code changes Bug: None Change-Id: I8923bacb9af7e7d4f13e210c8b3d7ea6b81568a5 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6301086 Commit-Queue: Frank Barchard Reviewed-by: Mirko Bonadei --- source/compare_gcc.cc | 14 +- source/compare_neon.cc | 4 +- source/compare_neon64.cc | 10 +- source/rotate_gcc.cc | 12 +- source/rotate_neon.cc | 6 +- source/rotate_neon64.cc | 6 +- source/row_gcc.cc | 652 +++++++++++++++++++-------------------- source/row_neon.cc | 276 ++++++++--------- source/row_neon64.cc | 354 ++++++++++----------- source/scale_gcc.cc | 156 +++++----- source/scale_neon.cc | 68 ++-- source/scale_neon64.cc | 80 ++--- 12 files changed, 819 insertions(+), 819 deletions(-) diff --git a/source/compare_gcc.cc b/source/compare_gcc.cc index 3838abd72..83237ff38 100644 --- a/source/compare_gcc.cc +++ b/source/compare_gcc.cc @@ -37,7 +37,7 @@ uint32_t HammingDistance_SSE42(const uint8_t* src_a, // Process 32 bytes per loop. LABELALIGN - "1: \n" + "1: \n" "mov (%0),%%rcx \n" "mov 0x8(%0),%%rdx \n" "xor (%1),%%rcx \n" @@ -80,7 +80,7 @@ uint32_t HammingDistance_SSE42(const uint8_t* src_a, asm volatile( // Process 16 bytes per loop. LABELALIGN - "1: \n" + "1: \n" "mov (%0),%%ecx \n" "mov 0x4(%0),%%edx \n" "xor (%1),%%ecx \n" @@ -129,7 +129,7 @@ uint32_t HammingDistance_SSSE3(const uint8_t* src_a, "sub %0,%1 \n" LABELALIGN - "1: \n" + "1: \n" "movdqa (%0),%%xmm4 \n" "movdqa 0x10(%0), %%xmm5 \n" "pxor (%0,%1), %%xmm4 \n" @@ -188,7 +188,7 @@ uint32_t HammingDistance_AVX2(const uint8_t* src_a, "sub %0,%1 \n" LABELALIGN - "1: \n" + "1: \n" "vmovdqa (%0),%%ymm4 \n" "vmovdqa 0x20(%0), %%ymm5 \n" "vpxor (%0,%1), %%ymm4, %%ymm4 \n" @@ -217,7 +217,7 @@ uint32_t HammingDistance_AVX2(const uint8_t* src_a, "vpermq $0xaa,%%ymm0,%%ymm1 \n" "vpaddd %%ymm1,%%ymm0,%%ymm0 \n" "vmovd %%xmm0,%3 \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src_a), // %0 "+r"(src_b), // %1 "+r"(count), // %2 @@ -239,7 +239,7 @@ uint32_t SumSquareError_SSE2(const uint8_t* src_a, "pxor %%xmm5,%%xmm5 \n" LABELALIGN - "1: \n" + "1: \n" "movdqu (%0),%%xmm1 \n" "lea 0x10(%0),%0 \n" "movdqu (%1),%%xmm2 \n" @@ -306,7 +306,7 @@ uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) { "movdqa %4,%%xmm6 \n" LABELALIGN - "1: \n" + "1: \n" "movdqu (%0),%%xmm1 \n" "lea 0x10(%0),%0 \n" "pmulld %%xmm6,%%xmm0 \n" diff --git a/source/compare_neon.cc b/source/compare_neon.cc index afdd60121..ee1f7b26c 100644 --- a/source/compare_neon.cc +++ b/source/compare_neon.cc @@ -31,7 +31,7 @@ uint32_t HammingDistance_NEON(const uint8_t* src_a, asm volatile( "vmov.u16 q4, #0 \n" // accumulator - "1: \n" + "1: \n" "vld1.8 {q0, q1}, [%0]! \n" "vld1.8 {q2, q3}, [%1]! \n" "veor.32 q0, q0, q2 \n" @@ -64,7 +64,7 @@ uint32_t SumSquareError_NEON(const uint8_t* src_a, "vmov.u8 q9, #0 \n" "vmov.u8 q11, #0 \n" - "1: \n" + "1: \n" "vld1.8 {q0}, [%0]! \n" "vld1.8 {q1}, [%1]! 
\n" "subs %2, %2, #16 \n" diff --git a/source/compare_neon64.cc b/source/compare_neon64.cc index 49246aaeb..756f83cb3 100644 --- a/source/compare_neon64.cc +++ b/source/compare_neon64.cc @@ -29,7 +29,7 @@ uint32_t HammingDistance_NEON(const uint8_t* src_a, asm volatile( "movi v4.8h, #0 \n" - "1: \n" + "1: \n" "ld1 {v0.16b, v1.16b}, [%0], #32 \n" "ld1 {v2.16b, v3.16b}, [%1], #32 \n" "eor v0.16b, v0.16b, v2.16b \n" @@ -61,7 +61,7 @@ uint32_t SumSquareError_NEON(const uint8_t* src_a, "movi v18.16b, #0 \n" "movi v19.16b, #0 \n" - "1: \n" + "1: \n" "ld1 {v0.16b}, [%0], #16 \n" "ld1 {v1.16b}, [%1], #16 \n" "subs %w2, %w2, #16 \n" @@ -122,7 +122,7 @@ uint32_t HashDjb2_NEON(const uint8_t* src, int count, uint32_t seed) { // count is always a multiple of 16. // maintain two accumulators, reduce and then final sum in scalar since // this has better performance on little cores. - "1: \n" + "1: \n" "ldr q0, [%[src]], #16 \n" "subs %w[count], %w[count], #16 \n" "tbl v3.16b, {v0.16b}, v19.16b \n" @@ -162,7 +162,7 @@ uint32_t HammingDistance_NEON_DotProd(const uint8_t* src_a, "movi v5.4s, #0 \n" "movi v6.16b, #1 \n" - "1: \n" + "1: \n" "ldp q0, q1, [%0], #32 \n" "ldp q2, q3, [%1], #32 \n" "eor v0.16b, v0.16b, v2.16b \n" @@ -194,7 +194,7 @@ uint32_t SumSquareError_NEON_DotProd(const uint8_t* src_a, "movi v4.4s, #0 \n" "movi v5.4s, #0 \n" - "1: \n" + "1: \n" "ldp q0, q2, [%0], #32 \n" "ldp q1, q3, [%1], #32 \n" "subs %w2, %w2, #32 \n" diff --git a/source/rotate_gcc.cc b/source/rotate_gcc.cc index fd5eee05f..e07bedfa7 100644 --- a/source/rotate_gcc.cc +++ b/source/rotate_gcc.cc @@ -30,7 +30,7 @@ void TransposeWx8_SSSE3(const uint8_t* src, // Read in the data from the source pointer. // First round of bit swap. LABELALIGN - "1: \n" + "1: \n" "movq (%0),%%xmm0 \n" "movq (%0,%3),%%xmm1 \n" "lea (%0,%3,2),%0 \n" @@ -120,7 +120,7 @@ void TransposeWx8_Fast_SSSE3(const uint8_t* src, // Read in the data from the source pointer. // First round of bit swap. LABELALIGN - "1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu (%0,%3),%%xmm1 \n" "lea (%0,%3,2),%0 \n" @@ -265,7 +265,7 @@ void TransposeUVWx8_SSE2(const uint8_t* src, // Read in the data from the source pointer. // First round of bit swap. LABELALIGN - "1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu (%0,%4),%%xmm1 \n" "lea (%0,%4,2),%0 \n" @@ -393,7 +393,7 @@ void Transpose4x4_32_SSE2(const uint8_t* src, int width) { asm volatile( // Main loop transpose 4x4. Read a column, write a row. - "1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" // a b c d "movdqu (%0,%3),%%xmm1 \n" // e f g h "lea (%0,%3,2),%0 \n" // src += stride * 2 @@ -449,7 +449,7 @@ void Transpose4x4_32_AVX2(const uint8_t* src, int width) { asm volatile( // Main loop transpose 2 blocks of 4x4. Read a column, write a row. 
- "1: \n" + "1: \n" "vmovdqu (%0),%%xmm0 \n" // a b c d "vmovdqu (%0,%3),%%xmm1 \n" // e f g h "lea (%0,%3,2),%0 \n" // src += stride * 2 @@ -484,7 +484,7 @@ void Transpose4x4_32_AVX2(const uint8_t* src, "sub %4,%1 \n" "sub $0x8,%2 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src), // %0 "+r"(dst), // %1 "+rm"(width) // %2 diff --git a/source/rotate_neon.cc b/source/rotate_neon.cc index a16ef7266..27bd2251b 100644 --- a/source/rotate_neon.cc +++ b/source/rotate_neon.cc @@ -33,7 +33,7 @@ void TransposeWx8_NEON(const uint8_t* src, // at w-8 allow for this "sub %[width], #8 \n" - "1: \n" + "1: \n" "mov %[temp], %[src] \n" "vld1.8 {d0}, [%[temp]], %[src_stride] \n" "vld1.8 {d1}, [%[temp]], %[src_stride] \n" @@ -101,7 +101,7 @@ void TransposeUVWx8_NEON(const uint8_t* src, // at w-8 allow for this "sub %[width], #8 \n" - "1: \n" + "1: \n" "mov %[temp], %[src] \n" "vld2.8 {d0, d1}, [%[temp]], %[src_stride] \n" "vld2.8 {d2, d3}, [%[temp]], %[src_stride] \n" @@ -186,7 +186,7 @@ void Transpose4x4_32_NEON(const uint8_t* src, uint8_t* dst3 = dst2 + dst_stride; asm volatile( // Main loop transpose 4x4. Read a column, write a row. - "1: \n" + "1: \n" "vld4.32 {d0[0], d2[0], d4[0], d6[0]}, [%0], %9 \n" "vld4.32 {d0[1], d2[1], d4[1], d6[1]}, [%1], %9 \n" "vld4.32 {d1[0], d3[0], d5[0], d7[0]}, [%2], %9 \n" diff --git a/source/rotate_neon64.cc b/source/rotate_neon64.cc index 4a5e181a6..e09bcb178 100644 --- a/source/rotate_neon64.cc +++ b/source/rotate_neon64.cc @@ -28,7 +28,7 @@ void TransposeWx16_NEON(const uint8_t* src, int width) { const uint8_t* src_temp; asm volatile( - "1: \n" + "1: \n" "mov %[src_temp], %[src] \n" "ld1 {v16.16b}, [%[src_temp]], %[src_stride] \n" @@ -151,7 +151,7 @@ void TransposeUVWx8_NEON(const uint8_t* src, // at w-8 allow for this "sub %w[width], %w[width], #8 \n" - "1: \n" + "1: \n" "mov %[temp], %[src] \n" "ld1 {v0.16b}, [%[temp]], %[src_stride] \n" "ld1 {v1.16b}, [%[temp]], %[src_stride] \n" @@ -241,7 +241,7 @@ void Transpose4x4_32_NEON(const uint8_t* src, uint8_t* dst3 = dst2 + dst_stride; asm volatile( // Main loop transpose 4x4. Read a column, write a row. 
- "1: \n" + "1: \n" "ld4 {v0.s, v1.s, v2.s, v3.s}[0], [%0], %9 \n" "ld4 {v0.s, v1.s, v2.s, v3.s}[1], [%1], %9 \n" "ld4 {v0.s, v1.s, v2.s, v3.s}[2], [%2], %9 \n" diff --git a/source/row_gcc.cc b/source/row_gcc.cc index b8e0b4d3e..ce8af5839 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -158,7 +158,7 @@ void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width) { "pslld $0x18,%%xmm5 \n" LABELALIGN - "1: \n" + "1: \n" "movq (%0),%%xmm0 \n" "lea 0x8(%0),%0 \n" "punpcklbw %%xmm0,%%xmm0 \n" @@ -190,7 +190,7 @@ void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24, "movdqa %3,%%xmm4 \n" LABELALIGN - "1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" "movdqu 0x20(%0),%%xmm3 \n" @@ -229,7 +229,7 @@ void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width) { "movdqa %4,%%xmm5 \n" LABELALIGN - "1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 12(%0),%%xmm1 \n" "movdqu 24(%0),%%xmm2 \n" @@ -266,7 +266,7 @@ void RAWToARGBRow_AVX2(const uint8_t* src_raw, uint8_t* dst_argb, int width) { "vbroadcastf128 %4,%%ymm5 \n" // LABELALIGN // - "1: \n" + "1: \n" "vmovdqu (%0),%%xmm0 \n" // first 12 "vinserti128 $1,12(%0),%%ymm0,%%ymm0 \n" // second 12 "vmovdqu 24(%0),%%xmm1 \n" // third 12 @@ -291,7 +291,7 @@ void RAWToARGBRow_AVX2(const uint8_t* src_raw, uint8_t* dst_argb, int width) { "lea 0x80(%1),%1 \n" "sub $0x20,%2 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src_raw), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -308,7 +308,7 @@ void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { "movdqa %3,%%xmm4 \n" LABELALIGN - "1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" "movdqu 0x20(%0),%%xmm3 \n" @@ -348,7 +348,7 @@ void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, "movdqa %5,%%xmm5 \n" LABELALIGN - "1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x4(%0),%%xmm1 \n" "movdqu 0x8(%0),%%xmm2 \n" @@ -390,7 +390,7 @@ void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { "sub %0,%1 \n" LABELALIGN - "1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" "movdqa %%xmm0,%%xmm1 \n" "movdqa %%xmm0,%%xmm2 \n" @@ -437,7 +437,7 @@ void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { "sub %0,%1 \n" LABELALIGN - "1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" "movdqa %%xmm0,%%xmm1 \n" "movdqa %%xmm0,%%xmm2 \n" @@ -481,7 +481,7 @@ void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { "sub %0,%1 \n" LABELALIGN - "1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" "movdqa %%xmm0,%%xmm2 \n" "pand %%xmm4,%%xmm0 \n" @@ -511,7 +511,7 @@ void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { asm volatile("movdqa %3,%%xmm6 \n" LABELALIGN - "1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" "movdqu 0x20(%0),%%xmm2 \n" @@ -549,7 +549,7 @@ void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) { asm volatile("movdqa %3,%%xmm6 \n" LABELALIGN - "1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" "movdqu 0x20(%0),%%xmm2 \n" @@ -593,7 +593,7 @@ void ARGBToRGB24Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { "vmovdqa %4,%%ymm7 \n" LABELALIGN - "1: \n" + "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" "vmovdqu 0x40(%0),%%ymm2 \n" @@ -621,7 +621,7 @@ void ARGBToRGB24Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { "lea 0x60(%1),%1 \n" "sub $0x20,%2 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -654,7 +654,7 @@ void 
ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width) { "vmovdqa %5,%%ymm7 \n" LABELALIGN - "1: \n" + "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" "vmovdqu 0x40(%0),%%ymm2 \n" @@ -669,7 +669,7 @@ void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width) { "lea 0x60(%1),%1 \n" "sub $0x20,%2 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -687,7 +687,7 @@ void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { "vmovdqa %4,%%ymm7 \n" LABELALIGN - "1: \n" + "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" "vmovdqu 0x40(%0),%%ymm2 \n" @@ -715,7 +715,7 @@ void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { "lea 0x60(%1),%1 \n" "sub $0x20,%2 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -737,7 +737,7 @@ void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { "pslld $0xb,%%xmm5 \n" LABELALIGN - "1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" "movdqa %%xmm0,%%xmm1 \n" "movdqa %%xmm0,%%xmm2 \n" @@ -782,7 +782,7 @@ void ARGBToRGB565DitherRow_SSE2(const uint8_t* src, "pslld $0xb,%%xmm5 \n" LABELALIGN - "1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" "paddusb %%xmm6,%%xmm0 \n" "movdqa %%xmm0,%%xmm1 \n" @@ -828,7 +828,7 @@ void ARGBToRGB565DitherRow_AVX2(const uint8_t* src, "vpslld $0xb,%%ymm3,%%ymm5 \n" LABELALIGN - "1: \n" + "1: \n" "vmovdqu (%0),%%ymm0 \n" "vpaddusb %%ymm6,%%ymm0,%%ymm0 \n" "vpsrld $0x5,%%ymm0,%%ymm2 \n" @@ -846,7 +846,7 @@ void ARGBToRGB565DitherRow_AVX2(const uint8_t* src, "lea 0x10(%1),%1 \n" "sub $0x8,%2 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -868,7 +868,7 @@ void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { "pslld $0xf,%%xmm7 \n" LABELALIGN - "1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" "movdqa %%xmm0,%%xmm1 \n" "movdqa %%xmm0,%%xmm2 \n" @@ -905,7 +905,7 @@ void ARGBToARGB4444Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { "psrlw $0x8,%%xmm3 \n" LABELALIGN - "1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" "movdqa %%xmm0,%%xmm1 \n" "pand %%xmm3,%%xmm0 \n" @@ -973,7 +973,7 @@ void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { "pshufd $0x0,%%xmm6,%%xmm6 \n" "sub %0,%1 \n" - "1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" // fetch 4 ARGB pixels "movdqa %%xmm0,%%xmm1 \n" "pshufb %%xmm2,%%xmm1 \n" // R0B0 @@ -1012,7 +1012,7 @@ void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { "pshufd $0x0,%%xmm6,%%xmm6 \n" "sub %0,%1 \n" - "1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" // fetch 4 ABGR pixels "movdqa %%xmm0,%%xmm1 \n" "pshufb %%xmm2,%%xmm1 \n" // R0B0 @@ -1048,7 +1048,7 @@ void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { "vbroadcastss %7,%%ymm6 \n" // multipler for AG "sub %0,%1 \n" - "1: \n" + "1: \n" "vmovdqu (%0),%%ymm0 \n" // fetch 8 ARGB pixels "vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0 "vpand %%ymm5,%%ymm0,%%ymm0 \n" // A0G0 @@ -1061,7 +1061,7 @@ void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { "add $0x20,%0 \n" "sub $0x8,%2 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src), // %0 "+r"(dst), // %1 @@ -1085,7 +1085,7 @@ void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { "vbroadcastss %7,%%ymm6 \n" // multipler for AG "sub %0,%1 \n" - "1: \n" + "1: \n" "vmovdqu (%0),%%ymm0 \n" // fetch 8 ABGR pixels "vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0 "vpand 
%%ymm5,%%ymm0,%%ymm0 \n" // A0G0 @@ -1098,7 +1098,7 @@ void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { "add $0x20,%0 \n" "sub $0x8,%2 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src), // %0 "+r"(dst), // %1 @@ -1124,7 +1124,7 @@ void ARGBToAR64Row_SSSE3(const uint8_t* src_argb, uint16_t* dst_ar64, int width) { asm volatile( - "1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" "movdqa %%xmm0,%%xmm1 \n" "punpcklbw %%xmm0,%%xmm0 \n" @@ -1148,7 +1148,7 @@ void ARGBToAB64Row_SSSE3(const uint8_t* src_argb, asm volatile( "movdqa %3,%%xmm2 \n" "movdqa %4,%%xmm3 \n" LABELALIGN - "1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" "movdqa %%xmm0,%%xmm1 \n" "pshufb %%xmm2,%%xmm0 \n" @@ -1171,7 +1171,7 @@ void AR64ToARGBRow_SSSE3(const uint16_t* src_ar64, uint8_t* dst_argb, int width) { asm volatile( - "1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" "psrlw $8,%%xmm0 \n" @@ -1195,7 +1195,7 @@ void AB64ToARGBRow_SSSE3(const uint16_t* src_ab64, asm volatile("movdqa %3,%%xmm2 \n" LABELALIGN - "1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" "psrlw $8,%%xmm0 \n" @@ -1219,7 +1219,7 @@ void ARGBToAR64Row_AVX2(const uint8_t* src_argb, uint16_t* dst_ar64, int width) { asm volatile( - "1: \n" + "1: \n" "vmovdqu (%0),%%ymm0 \n" "vpermq $0xd8,%%ymm0,%%ymm0 \n" "vpunpckhbw %%ymm0,%%ymm0,%%ymm1 \n" @@ -1230,7 +1230,7 @@ void ARGBToAR64Row_AVX2(const uint8_t* src_argb, "lea 0x40(%1),%1 \n" "sub $0x8,%2 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src_argb), // %0 "+r"(dst_ar64), // %1 "+r"(width) // %2 @@ -1246,7 +1246,7 @@ void ARGBToAB64Row_AVX2(const uint8_t* src_argb, asm volatile( "vbroadcastf128 %3,%%ymm2 \n" "vbroadcastf128 %4,%%ymm3 \n" LABELALIGN - "1: \n" + "1: \n" "vmovdqu (%0),%%ymm0 \n" "vpermq $0xd8,%%ymm0,%%ymm0 \n" "vpshufb %%ymm3,%%ymm0,%%ymm1 \n" @@ -1257,7 +1257,7 @@ void ARGBToAB64Row_AVX2(const uint8_t* src_argb, "lea 0x40(%1),%1 \n" "sub $0x8,%2 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src_argb), // %0 "+r"(dst_ab64), // %1 "+r"(width) // %2 @@ -1272,7 +1272,7 @@ void AR64ToARGBRow_AVX2(const uint16_t* src_ar64, uint8_t* dst_argb, int width) { asm volatile( - "1: \n" + "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" "vpsrlw $8,%%ymm0,%%ymm0 \n" @@ -1284,7 +1284,7 @@ void AR64ToARGBRow_AVX2(const uint16_t* src_ar64, "lea 0x20(%1),%1 \n" "sub $0x8,%2 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src_ar64), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -1298,7 +1298,7 @@ void AB64ToARGBRow_AVX2(const uint16_t* src_ab64, uint8_t* dst_argb, int width) { asm volatile("vbroadcastf128 %3,%%ymm2 \n" LABELALIGN - "1: \n" + "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" "vpsrlw $8,%%ymm0,%%ymm0 \n" @@ -1311,7 +1311,7 @@ void AB64ToARGBRow_AVX2(const uint16_t* src_ab64, "lea 0x20(%1),%1 \n" "sub $0x8,%2 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src_ab64), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -1479,7 +1479,7 @@ void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { "vbroadcastf128 %5,%%ymm7 \n" "vmovdqa %6,%%ymm6 \n" // LABELALIGN RGBTOY_AVX2( - ymm7) "vzeroupper \n" + ymm7) "vzeroupper \n" : "+r"(src_argb), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -1500,7 +1500,7 @@ void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) { "vbroadcastf128 %4,%%ymm5 \n" "vbroadcastf128 %5,%%ymm7 \n" "vmovdqa %6,%%ymm6 \n" LABELALIGN RGBTOY_AVX2( - ymm7) "vzeroupper \n" + ymm7) "vzeroupper \n" : "+r"(src_abgr), // %0 
"+r"(dst_y), // %1 "+r"(width) // %2 @@ -1522,7 +1522,7 @@ void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { "vmovdqa %5,%%ymm6 \n" // LABELALIGN // RGBTOY_AVX2(ymm5) // - "vzeroupper \n" + "vzeroupper \n" : "+r"(src_argb), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -1541,7 +1541,7 @@ void ABGRToYJRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) { "vbroadcastf128 %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" "vmovdqa %5,%%ymm6 \n" LABELALIGN RGBTOY_AVX2( - ymm5) "vzeroupper \n" + ymm5) "vzeroupper \n" : "+r"(src_abgr), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -1561,7 +1561,7 @@ void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width) { "vbroadcastf128 %4,%%ymm5 \n" "vmovdqa %5,%%ymm6 \n" // LABELALIGN RGBTOY_AVX2( - ymm5) "vzeroupper \n" + ymm5) "vzeroupper \n" : "+r"(src_rgba), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -1591,7 +1591,7 @@ void ARGBToUV444MatrixRow_SSSE3(const uint8_t* src_argb, "sub %1,%2 \n" LABELALIGN - "1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" "movdqu 0x20(%0),%%xmm2 \n" @@ -1655,7 +1655,7 @@ void ARGBToUV444MatrixRow_AVX2(const uint8_t* src_argb, "sub %1,%2 \n" LABELALIGN - "1: \n" + "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" "vmovdqu 0x40(%0),%%ymm2 \n" @@ -1695,7 +1695,7 @@ void ARGBToUV444MatrixRow_AVX2(const uint8_t* src_argb, "lea 0x20(%1),%1 \n" "sub $0x20,%3 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src_argb), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -1724,7 +1724,7 @@ void ARGBToUVMatrixRow_SSSE3(const uint8_t* src_argb, "sub %1,%2 \n" LABELALIGN - "1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x00(%0,%4,1),%%xmm7 \n" "pavgb %%xmm7,%%xmm0 \n" @@ -1945,7 +1945,7 @@ void ARGBToUVRow_AVX2(const uint8_t* src_argb, "sub %1,%2 \n" LABELALIGN - "1: \n" + "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" "vmovdqu 0x40(%0),%%ymm2 \n" @@ -1981,7 +1981,7 @@ void ARGBToUVRow_AVX2(const uint8_t* src_argb, "lea 0x10(%1),%1 \n" "sub $0x20,%3 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src_argb), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -2009,7 +2009,7 @@ void ABGRToUVRow_AVX2(const uint8_t* src_abgr, "sub %1,%2 \n" LABELALIGN - "1: \n" + "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" "vmovdqu 0x40(%0),%%ymm2 \n" @@ -2045,7 +2045,7 @@ void ABGRToUVRow_AVX2(const uint8_t* src_abgr, "lea 0x10(%1),%1 \n" "sub $0x20,%3 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src_abgr), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -2073,7 +2073,7 @@ void ARGBToUVJRow_AVX2(const uint8_t* src_argb, "sub %1,%2 \n" LABELALIGN - "1: \n" + "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" "vmovdqu 0x40(%0),%%ymm2 \n" @@ -2109,7 +2109,7 @@ void ARGBToUVJRow_AVX2(const uint8_t* src_argb, "lea 0x10(%1),%1 \n" "sub $0x20,%3 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src_argb), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -2138,7 +2138,7 @@ void ABGRToUVJRow_AVX2(const uint8_t* src_abgr, "sub %1,%2 \n" LABELALIGN - "1: \n" + "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" "vmovdqu 0x40(%0),%%ymm2 \n" @@ -2174,7 +2174,7 @@ void ABGRToUVJRow_AVX2(const uint8_t* src_abgr, "lea 0x10(%1),%1 \n" "sub $0x20,%3 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src_abgr), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -2579,7 +2579,7 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8_t* y_buf, "pcmpeqb %%xmm5,%%xmm5 \n" LABELALIGN - "1: \n" + "1: \n" READYUV444 
YUVTORGB(yuvconstants) STOREARGB @@ -2608,7 +2608,7 @@ void OMITFP I444AlphaToARGBRow_SSSE3(const uint8_t* y_buf, YUVTORGB_SETUP( yuvconstants) "sub %[u_buf],%[v_buf] \n" - LABELALIGN "1: \n" READYUVA444 + LABELALIGN "1: \n" READYUVA444 YUVTORGB(yuvconstants) STOREARGB "subl $0x8,%[width] \n" "jg 1b \n" @@ -2641,7 +2641,7 @@ void OMITFP I422ToRGB24Row_SSSE3(const uint8_t* y_buf, "sub %[u_buf],%[v_buf] \n" LABELALIGN - "1: \n" + "1: \n" READYUV422 YUVTORGB(yuvconstants) STORERGB24 @@ -2677,7 +2677,7 @@ void OMITFP I444ToRGB24Row_SSSE3(const uint8_t* y_buf, "sub %[u_buf],%[v_buf] \n" LABELALIGN - "1: \n" + "1: \n" READYUV444 YUVTORGB(yuvconstants) STORERGB24 @@ -2712,7 +2712,7 @@ void OMITFP I422ToARGBRow_SSSE3(const uint8_t* y_buf, "pcmpeqb %%xmm5,%%xmm5 \n" LABELALIGN - "1: \n" + "1: \n" READYUV422 YUVTORGB(yuvconstants) STOREARGB @@ -2746,7 +2746,7 @@ void OMITFP I422ToAR30Row_SSSE3(const uint8_t* y_buf, "psrlw $6,%%xmm7 \n" // 1023 for max LABELALIGN - "1: \n" + "1: \n" READYUV422 YUVTORGB16(yuvconstants) STOREAR30 @@ -2776,7 +2776,7 @@ void OMITFP I210ToARGBRow_SSSE3(const uint16_t* y_buf, "pcmpeqb %%xmm5,%%xmm5 \n" LABELALIGN - "1: \n" + "1: \n" READYUV210 YUVTORGB(yuvconstants) STOREARGB @@ -2806,7 +2806,7 @@ void OMITFP I212ToARGBRow_SSSE3(const uint16_t* y_buf, "pcmpeqb %%xmm5,%%xmm5 \n" LABELALIGN - "1: \n" + "1: \n" READYUV212 YUVTORGB(yuvconstants) STOREARGB @@ -2841,7 +2841,7 @@ void OMITFP I210ToAR30Row_SSSE3(const uint16_t* y_buf, "psrlw $6,%%xmm7 \n" // 1023 for max LABELALIGN - "1: \n" + "1: \n" READYUV210 YUVTORGB16(yuvconstants) STOREAR30 @@ -2876,7 +2876,7 @@ void OMITFP I212ToAR30Row_SSSE3(const uint16_t* y_buf, "psrlw $6,%%xmm7 \n" // 1023 for max LABELALIGN - "1: \n" + "1: \n" READYUV212 YUVTORGB16(yuvconstants) STOREAR30 @@ -2906,7 +2906,7 @@ void OMITFP I410ToARGBRow_SSSE3(const uint16_t* y_buf, "pcmpeqb %%xmm5,%%xmm5 \n" LABELALIGN - "1: \n" + "1: \n" READYUV410 YUVTORGB(yuvconstants) STOREARGB @@ -2935,7 +2935,7 @@ void OMITFP I210AlphaToARGBRow_SSSE3(const uint16_t* y_buf, YUVTORGB_SETUP( yuvconstants) "sub %[u_buf],%[v_buf] \n" - LABELALIGN "1: \n" READYUVA210 + LABELALIGN "1: \n" READYUVA210 YUVTORGB(yuvconstants) STOREARGB "subl $0x8,%[width] \n" "jg 1b \n" @@ -2968,7 +2968,7 @@ void OMITFP I410AlphaToARGBRow_SSSE3(const uint16_t* y_buf, YUVTORGB_SETUP( yuvconstants) "sub %[u_buf],%[v_buf] \n" - LABELALIGN "1: \n" READYUVA410 + LABELALIGN "1: \n" READYUVA410 YUVTORGB(yuvconstants) STOREARGB "subl $0x8,%[width] \n" "jg 1b \n" @@ -3006,7 +3006,7 @@ void OMITFP I410ToAR30Row_SSSE3(const uint16_t* y_buf, "psrlw $6,%%xmm7 \n" // 1023 for max LABELALIGN - "1: \n" + "1: \n" READYUV410 YUVTORGB16(yuvconstants) STOREAR30 @@ -3035,7 +3035,7 @@ void OMITFP I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf, YUVTORGB_SETUP( yuvconstants) "sub %[u_buf],%[v_buf] \n" - LABELALIGN "1: \n" READYUVA422 + LABELALIGN "1: \n" READYUVA422 YUVTORGB(yuvconstants) STOREARGB "subl $0x8,%[width] \n" "jg 1b \n" @@ -3064,7 +3064,7 @@ void OMITFP NV12ToARGBRow_SSSE3(const uint8_t* y_buf, YUVTORGB_SETUP( yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" - LABELALIGN "1: \n" READNV12 + LABELALIGN "1: \n" READNV12 YUVTORGB(yuvconstants) STOREARGB "sub $0x8,%[width] \n" "jg 1b \n" @@ -3086,7 +3086,7 @@ void OMITFP NV21ToARGBRow_SSSE3(const uint8_t* y_buf, YUVTORGB_SETUP( yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" - LABELALIGN "1: \n" READNV21 + LABELALIGN "1: \n" READNV21 YUVTORGB(yuvconstants) STOREARGB "sub $0x8,%[width] \n" "jg 1b \n" @@ -3109,7 +3109,7 @@ void OMITFP YUY2ToARGBRow_SSSE3(const uint8_t* 
yuy2_buf, "movdqa %[kShuffleYUY2UV],%%xmm7 \n" YUVTORGB_SETUP( yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" - LABELALIGN "1: \n" READYUY2 + LABELALIGN "1: \n" READYUY2 YUVTORGB(yuvconstants) STOREARGB "sub $0x8,%[width] \n" "jg 1b \n" @@ -3131,7 +3131,7 @@ void OMITFP UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf, "movdqa %[kShuffleUYVYUV],%%xmm7 \n" YUVTORGB_SETUP( yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" - LABELALIGN "1: \n" READUYVY + LABELALIGN "1: \n" READUYVY YUVTORGB(yuvconstants) STOREARGB "sub $0x8,%[width] \n" "jg 1b \n" @@ -3153,7 +3153,7 @@ void OMITFP P210ToARGBRow_SSSE3(const uint16_t* y_buf, YUVTORGB_SETUP( yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" - LABELALIGN "1: \n" READP210 + LABELALIGN "1: \n" READP210 YUVTORGB(yuvconstants) STOREARGB "sub $0x8,%[width] \n" "jg 1b \n" @@ -3175,7 +3175,7 @@ void OMITFP P410ToARGBRow_SSSE3(const uint16_t* y_buf, YUVTORGB_SETUP( yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" - LABELALIGN "1: \n" READP410 + LABELALIGN "1: \n" READP410 YUVTORGB(yuvconstants) STOREARGB "sub $0x8,%[width] \n" "jg 1b \n" @@ -3203,7 +3203,7 @@ void OMITFP P210ToAR30Row_SSSE3(const uint16_t* y_buf, "psrlw $6,%%xmm7 \n" // 1023 for max LABELALIGN - "1: \n" + "1: \n" READP210 YUVTORGB16(yuvconstants) STOREAR30 @@ -3234,7 +3234,7 @@ void OMITFP P410ToAR30Row_SSSE3(const uint16_t* y_buf, "psrlw $6,%%xmm7 \n" // 1023 for max LABELALIGN - "1: \n" + "1: \n" READP410 YUVTORGB16(yuvconstants) STOREAR30 @@ -3262,7 +3262,7 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, "pcmpeqb %%xmm5,%%xmm5 \n" LABELALIGN - "1: \n" + "1: \n" READYUV422 YUVTORGB(yuvconstants) STORERGBA @@ -3673,13 +3673,13 @@ void OMITFP I444ToARGBRow_AVX2(const uint8_t* y_buf, "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN - "1: \n" + "1: \n" READYUV444_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 "sub $0x10,%[width] \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] [v_buf]"+r"(v_buf), // %[v_buf] @@ -3707,14 +3707,14 @@ void OMITFP I422ToARGBRow_AVX2(const uint8_t* y_buf, "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN - "1: \n" + "1: \n" READYUV422_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 "sub $0x10,%[width] \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] [v_buf]"+r"(v_buf), // %[v_buf] @@ -3748,14 +3748,14 @@ void OMITFP I422ToARGBRow_AVX512BW(const uint8_t* y_buf, "vpbroadcastq %%xmm5,%%zmm5 \n" LABELALIGN - "1: \n" + "1: \n" READYUV422_AVX512BW YUVTORGB_AVX512BW(yuvconstants) STOREARGB_AVX512BW "sub $0x20,%[width] \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] [v_buf]"+r"(v_buf), // %[v_buf] @@ -3791,14 +3791,14 @@ void OMITFP I422ToAR30Row_AVX2(const uint8_t* y_buf, "vpsrlw $6,%%ymm7,%%ymm7 \n" LABELALIGN - "1: \n" + "1: \n" READYUV422_AVX2 YUVTORGB16_AVX2(yuvconstants) STOREAR30_AVX2 "sub $0x10,%[width] \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] [v_buf]"+r"(v_buf), // %[v_buf] @@ -3826,14 +3826,14 @@ void OMITFP I210ToARGBRow_AVX2(const uint16_t* y_buf, "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN - "1: \n" + "1: \n" READYUV210_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 "sub $0x10,%[width] \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] [v_buf]"+r"(v_buf), // %[v_buf] @@ -3861,14 +3861,14 @@ void OMITFP I212ToARGBRow_AVX2(const uint16_t* y_buf, 
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN - "1: \n" + "1: \n" READYUV212_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 "sub $0x10,%[width] \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] [v_buf]"+r"(v_buf), // %[v_buf] @@ -3901,14 +3901,14 @@ void OMITFP I210ToAR30Row_AVX2(const uint16_t* y_buf, "vpsrlw $6,%%ymm7,%%ymm7 \n" LABELALIGN - "1: \n" + "1: \n" READYUV210_AVX2 YUVTORGB16_AVX2(yuvconstants) STOREAR30_AVX2 "sub $0x10,%[width] \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] [v_buf]"+r"(v_buf), // %[v_buf] @@ -3941,14 +3941,14 @@ void OMITFP I212ToAR30Row_AVX2(const uint16_t* y_buf, "vpsrlw $6,%%ymm7,%%ymm7 \n" LABELALIGN - "1: \n" + "1: \n" READYUV212_AVX2 YUVTORGB16_AVX2(yuvconstants) STOREAR30_AVX2 "sub $0x10,%[width] \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] [v_buf]"+r"(v_buf), // %[v_buf] @@ -3976,13 +3976,13 @@ void OMITFP I410ToARGBRow_AVX2(const uint16_t* y_buf, "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN - "1: \n" + "1: \n" READYUV410_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 "sub $0x10,%[width] \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] @@ -4010,11 +4010,11 @@ void OMITFP I210AlphaToARGBRow_AVX2(const uint16_t* y_buf, YUVTORGB_SETUP_AVX2( yuvconstants) "sub %[u_buf],%[v_buf] \n" - LABELALIGN "1: \n" READYUVA210_AVX2 + LABELALIGN "1: \n" READYUVA210_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 "subl $0x10,%[width] \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : [y_buf] "+r"(y_buf), // %[y_buf] [u_buf] "+r"(u_buf), // %[u_buf] @@ -4046,11 +4046,11 @@ void OMITFP I410AlphaToARGBRow_AVX2(const uint16_t* y_buf, YUVTORGB_SETUP_AVX2( yuvconstants) "sub %[u_buf],%[v_buf] \n" - LABELALIGN "1: \n" READYUVA410_AVX2 + LABELALIGN "1: \n" READYUVA410_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 "subl $0x10,%[width] \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : [y_buf] "+r"(y_buf), // %[y_buf] [u_buf] "+r"(u_buf), // %[u_buf] @@ -4088,14 +4088,14 @@ void OMITFP I410ToAR30Row_AVX2(const uint16_t* y_buf, "vpsrlw $6,%%ymm7,%%ymm7 \n" LABELALIGN - "1: \n" + "1: \n" READYUV410_AVX2 YUVTORGB16_AVX2(yuvconstants) STOREAR30_AVX2 "sub $0x10,%[width] \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] [v_buf]"+r"(v_buf), // %[v_buf] @@ -4122,11 +4122,11 @@ void OMITFP I444AlphaToARGBRow_AVX2(const uint8_t* y_buf, YUVTORGB_SETUP_AVX2( yuvconstants) "sub %[u_buf],%[v_buf] \n" - LABELALIGN "1: \n" READYUVA444_AVX2 + LABELALIGN "1: \n" READYUVA444_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 "subl $0x10,%[width] \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : [y_buf] "+r"(y_buf), // %[y_buf] [u_buf] "+r"(u_buf), // %[u_buf] [v_buf] "+r"(v_buf), // %[v_buf] @@ -4157,11 +4157,11 @@ void OMITFP I422AlphaToARGBRow_AVX2(const uint8_t* y_buf, YUVTORGB_SETUP_AVX2( yuvconstants) "sub %[u_buf],%[v_buf] \n" - LABELALIGN "1: \n" READYUVA422_AVX2 + LABELALIGN "1: \n" READYUVA422_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 "subl $0x10,%[width] \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : [y_buf] "+r"(y_buf), // %[y_buf] [u_buf] "+r"(u_buf), // %[u_buf] [v_buf] "+r"(v_buf), // %[v_buf] @@ -4193,7 +4193,7 @@ void OMITFP I422ToRGBARow_AVX2(const uint8_t* y_buf, "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN - "1: \n" + "1: 
\n" READYUV422_AVX2 YUVTORGB_AVX2(yuvconstants) @@ -4234,11 +4234,11 @@ void OMITFP NV12ToARGBRow_AVX2(const uint8_t* y_buf, YUVTORGB_SETUP_AVX2( yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - LABELALIGN "1: \n" READNV12_AVX2 + LABELALIGN "1: \n" READNV12_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 "sub $0x10,%[width] \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : [y_buf] "+r"(y_buf), // %[y_buf] [uv_buf] "+r"(uv_buf), // %[uv_buf] [dst_argb] "+r"(dst_argb), // %[dst_argb] @@ -4261,11 +4261,11 @@ void OMITFP NV21ToARGBRow_AVX2(const uint8_t* y_buf, YUVTORGB_SETUP_AVX2( yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - LABELALIGN "1: \n" READNV21_AVX2 + LABELALIGN "1: \n" READNV21_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 "sub $0x10,%[width] \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : [y_buf] "+r"(y_buf), // %[y_buf] [vu_buf] "+r"(vu_buf), // %[vu_buf] [dst_argb] "+r"(dst_argb), // %[dst_argb] @@ -4289,11 +4289,11 @@ void OMITFP YUY2ToARGBRow_AVX2(const uint8_t* yuy2_buf, "vbroadcastf128 %[kShuffleYUY2UV],%%ymm7 \n" YUVTORGB_SETUP_AVX2( yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - LABELALIGN "1: \n" READYUY2_AVX2 + LABELALIGN "1: \n" READYUY2_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 "sub $0x10,%[width] \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : [yuy2_buf] "+r"(yuy2_buf), // %[yuy2_buf] [dst_argb] "+r"(dst_argb), // %[dst_argb] [width] "+rm"(width) // %[width] @@ -4316,11 +4316,11 @@ void OMITFP UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf, "vbroadcastf128 %[kShuffleUYVYUV],%%ymm7 \n" YUVTORGB_SETUP_AVX2( yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - LABELALIGN "1: \n" READUYVY_AVX2 + LABELALIGN "1: \n" READUYVY_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 "sub $0x10,%[width] \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : [uyvy_buf] "+r"(uyvy_buf), // %[uyvy_buf] [dst_argb] "+r"(dst_argb), // %[dst_argb] [width] "+rm"(width) // %[width] @@ -4343,11 +4343,11 @@ void OMITFP P210ToARGBRow_AVX2(const uint16_t* y_buf, YUVTORGB_SETUP_AVX2( yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - LABELALIGN "1: \n" READP210_AVX2 + LABELALIGN "1: \n" READP210_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 "sub $0x10,%[width] \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : [y_buf] "+r"(y_buf), // %[y_buf] [uv_buf] "+r"(uv_buf), // %[uv_buf] [dst_argb] "+r"(dst_argb), // %[dst_argb] @@ -4370,11 +4370,11 @@ void OMITFP P410ToARGBRow_AVX2(const uint16_t* y_buf, YUVTORGB_SETUP_AVX2( yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - LABELALIGN "1: \n" READP410_AVX2 + LABELALIGN "1: \n" READP410_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 "sub $0x10,%[width] \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : [y_buf] "+r"(y_buf), // %[y_buf] [uv_buf] "+r"(uv_buf), // %[uv_buf] [dst_argb] "+r"(dst_argb), // %[dst_argb] @@ -4403,14 +4403,14 @@ void OMITFP P210ToAR30Row_AVX2(const uint16_t* y_buf, "vpsrlw $6,%%ymm7,%%ymm7 \n" LABELALIGN - "1: \n" + "1: \n" READP210_AVX2 YUVTORGB16_AVX2(yuvconstants) STOREAR30_AVX2 "sub $0x10,%[width] \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : [y_buf]"+r"(y_buf), // %[y_buf] [uv_buf]"+r"(uv_buf), // %[uv_buf] [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] @@ -4440,14 +4440,14 @@ void OMITFP P410ToAR30Row_AVX2(const uint16_t* y_buf, "vpsrlw $6,%%ymm7,%%ymm7 \n" LABELALIGN - "1: \n" + "1: \n" READP410_AVX2 YUVTORGB16_AVX2(yuvconstants) STOREAR30_AVX2 "sub $0x10,%[width] \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : [y_buf]"+r"(y_buf), // %[y_buf] [uv_buf]"+r"(uv_buf), // %[uv_buf] 
[dst_ar30]"+r"(dst_ar30), // %[dst_ar30] @@ -4471,7 +4471,7 @@ void I400ToARGBRow_SSE2(const uint8_t* y_buf, "pslld $0x18,%%xmm4 \n" LABELALIGN - "1: \n" + "1: \n" // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164 "movq (%0),%%xmm0 \n" "lea 0x8(%0),%0 \n" @@ -4516,7 +4516,7 @@ void I400ToARGBRow_AVX2(const uint8_t* y_buf, "vpslld $0x18,%%ymm4,%%ymm4 \n" LABELALIGN - "1: \n" + "1: \n" // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164 "vmovdqu (%0),%%xmm0 \n" "lea 0x10(%0),%0 \n" @@ -4556,7 +4556,7 @@ void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) { asm volatile("movdqa %3,%%xmm5 \n" LABELALIGN - "1: \n" + "1: \n" "movdqu -0x10(%0,%2,1),%%xmm0 \n" "pshufb %%xmm5,%%xmm0 \n" "movdqu %%xmm0,(%1) \n" @@ -4577,7 +4577,7 @@ void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { asm volatile("vbroadcastf128 %3,%%ymm5 \n" LABELALIGN - "1: \n" + "1: \n" "vmovdqu -0x20(%0,%2,1),%%ymm0 \n" "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" "vpermq $0x4e,%%ymm0,%%ymm0 \n" @@ -4585,7 +4585,7 @@ void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { "lea 0x20(%1),%1 \n" "sub $0x20,%2 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(temp_width) // %2 @@ -4604,7 +4604,7 @@ void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width) { asm volatile("movdqa %3,%%xmm5 \n" LABELALIGN - "1: \n" + "1: \n" "movdqu -0x10(%0,%2,2),%%xmm0 \n" "pshufb %%xmm5,%%xmm0 \n" "movdqu %%xmm0,(%1) \n" @@ -4625,7 +4625,7 @@ void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width) { asm volatile("vbroadcastf128 %3,%%ymm5 \n" LABELALIGN - "1: \n" + "1: \n" "vmovdqu -0x20(%0,%2,2),%%ymm0 \n" "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" "vpermq $0x4e,%%ymm0,%%ymm0 \n" @@ -4633,7 +4633,7 @@ void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width) { "lea 0x20(%1),%1 \n" "sub $0x10,%2 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src_uv), // %0 "+r"(dst_uv), // %1 "+r"(temp_width) // %2 @@ -4657,7 +4657,7 @@ void MirrorSplitUVRow_SSSE3(const uint8_t* src, "sub %1,%2 \n" LABELALIGN - "1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" "lea -0x10(%0),%0 \n" "pshufb %%xmm1,%%xmm0 \n" @@ -4696,7 +4696,7 @@ void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24, "movdqa %4,%%xmm5 \n" LABELALIGN - "1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" // first 5 "movdqu 15(%0),%%xmm1 \n" // next 5 "movdqu 30(%0),%%xmm2 \n" // next 5 @@ -4729,7 +4729,7 @@ void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { asm volatile("lea -0x10(%0,%2,4),%0 \n" LABELALIGN - "1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" "pshufd $0x1b,%%xmm0,%%xmm0 \n" "lea -0x10(%0),%0 \n" @@ -4753,13 +4753,13 @@ void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { asm volatile("vmovdqu %3,%%ymm5 \n" LABELALIGN - "1: \n" + "1: \n" "vpermd -0x20(%0,%2,4),%%ymm5,%%ymm0 \n" "vmovdqu %%ymm0,(%1) \n" "lea 0x20(%1),%1 \n" "sub $0x8,%2 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(temp_width) // %2 @@ -4779,7 +4779,7 @@ void SplitUVRow_AVX2(const uint8_t* src_uv, "sub %1,%2 \n" LABELALIGN - "1: \n" + "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" "lea 0x40(%0),%0 \n" @@ -4796,7 +4796,7 @@ void SplitUVRow_AVX2(const uint8_t* src_uv, "lea 0x20(%1),%1 \n" "sub $0x20,%3 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src_uv), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -4817,7 +4817,7 @@ void SplitUVRow_SSE2(const uint8_t* src_uv, "sub %1,%2 \n" LABELALIGN - 
"1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" "lea 0x20(%0),%0 \n" @@ -4849,7 +4849,7 @@ void DetileRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { asm volatile( - "1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" "sub $0x10,%2 \n" "lea (%0,%3),%0 \n" @@ -4870,7 +4870,7 @@ void DetileRow_16_SSE2(const uint16_t* src, uint16_t* dst, int width) { asm volatile( - "1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" "lea (%0,%3,2),%0 \n" @@ -4893,14 +4893,14 @@ void DetileRow_16_AVX(const uint16_t* src, uint16_t* dst, int width) { asm volatile( - "1: \n" + "1: \n" "vmovdqu (%0),%%ymm0 \n" "lea (%0,%3,2),%0 \n" "vmovdqu %%ymm0,(%1) \n" "lea 0x20(%1),%1 \n" "sub $0x10,%2 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -4918,7 +4918,7 @@ void DetileToYUY2_SSE2(const uint8_t* src_y, uint8_t* dst_yuy2, int width) { asm volatile( - "1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" // Load 16 Y "sub $0x10,%3 \n" "lea (%0,%4),%0 \n" @@ -4958,7 +4958,7 @@ void DetileSplitUVRow_SSSE3(const uint8_t* src_uv, int width) { asm volatile( "movdqu %4,%%xmm1 \n" - "1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" "lea (%0, %5),%0 \n" "pshufb %%xmm1,%%xmm0 \n" @@ -4986,7 +4986,7 @@ void MergeUVRow_AVX512BW(const uint8_t* src_u, asm volatile("sub %0,%1 \n" LABELALIGN - "1: \n" + "1: \n" "vpmovzxbw (%0),%%zmm0 \n" "vpmovzxbw 0x00(%0,%1,1),%%zmm1 \n" "lea 0x20(%0),%0 \n" @@ -4996,7 +4996,7 @@ void MergeUVRow_AVX512BW(const uint8_t* src_u, "lea 0x40(%2),%2 \n" "sub $0x20,%3 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src_u), // %0 "+r"(src_v), // %1 "+r"(dst_uv), // %2 @@ -5014,7 +5014,7 @@ void MergeUVRow_AVX2(const uint8_t* src_u, asm volatile("sub %0,%1 \n" LABELALIGN - "1: \n" + "1: \n" "vpmovzxbw (%0),%%ymm0 \n" "vpmovzxbw 0x00(%0,%1,1),%%ymm1 \n" "lea 0x10(%0),%0 \n" @@ -5024,7 +5024,7 @@ void MergeUVRow_AVX2(const uint8_t* src_u, "lea 0x20(%2),%2 \n" "sub $0x10,%3 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src_u), // %0 "+r"(src_v), // %1 "+r"(dst_uv), // %2 @@ -5042,7 +5042,7 @@ void MergeUVRow_SSE2(const uint8_t* src_u, asm volatile("sub %0,%1 \n" LABELALIGN - "1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x00(%0,%1,1),%%xmm1 \n" "lea 0x10(%0),%0 \n" @@ -5077,7 +5077,7 @@ void MergeUVRow_16_AVX2(const uint16_t* src_u, // 8 pixels per loop. LABELALIGN - "1: \n" + "1: \n" "vpmovzxwd (%0),%%ymm0 \n" "vpmovzxwd 0x00(%0,%1,1),%%ymm1 \n" "lea 0x10(%0),%0 \n" @@ -5088,7 +5088,7 @@ void MergeUVRow_16_AVX2(const uint16_t* src_u, "lea 0x20(%2),%2 \n" "sub $0x8,%3 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src_u), // %0 "+r"(src_v), // %1 "+r"(dst_uv), // %2 @@ -5115,7 +5115,7 @@ void SplitUVRow_16_AVX2(const uint16_t* src_uv, // 16 pixels per loop. LABELALIGN - "1: \n" + "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" "add $0x40,%0 \n" @@ -5133,7 +5133,7 @@ void SplitUVRow_16_AVX2(const uint16_t* src_uv, "add $0x20,%1 \n" "sub $0x10,%3 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src_uv), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -5161,7 +5161,7 @@ void MultiplyRow_16_AVX2(const uint16_t* src_y, // 32 pixels per loop. 
LABELALIGN - "1: \n" + "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" "vpmullw %%ymm3,%%ymm0,%%ymm0 \n" @@ -5171,7 +5171,7 @@ void MultiplyRow_16_AVX2(const uint16_t* src_y, "add $0x40,%0 \n" "sub $0x20,%2 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src_y), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -5197,7 +5197,7 @@ void DivideRow_16_AVX2(const uint16_t* src_y, // 32 pixels per loop. LABELALIGN - "1: \n" + "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" "vpmulhuw %%ymm3,%%ymm0,%%ymm0 \n" @@ -5207,7 +5207,7 @@ void DivideRow_16_AVX2(const uint16_t* src_y, "add $0x40,%0 \n" "sub $0x20,%2 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src_y), // %0 "+r"(dst_y), // %1 "+r"(width), // %2 @@ -5233,7 +5233,7 @@ void Convert16To8Row_SSSE3(const uint16_t* src_y, // 32 pixels per loop. LABELALIGN - "1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" "add $0x20,%0 \n" @@ -5262,7 +5262,7 @@ void Convert16To8Row_AVX2(const uint16_t* src_y, // 32 pixels per loop. LABELALIGN - "1: \n" + "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" "add $0x40,%0 \n" @@ -5274,7 +5274,7 @@ void Convert16To8Row_AVX2(const uint16_t* src_y, "add $0x20,%1 \n" "sub $0x20,%2 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src_y), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -5292,7 +5292,7 @@ void Convert16To8Row_AVX512BW(const uint16_t* src_y, // 64 pixels per loop. LABELALIGN - "1: \n" + "1: \n" "vmovups (%0),%%zmm0 \n" "vmovups 0x40(%0),%%zmm1 \n" "add $0x80,%0 \n" @@ -5305,7 +5305,7 @@ void Convert16To8Row_AVX512BW(const uint16_t* src_y, "add $0x40,%1 \n" "sub $0x40,%2 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src_y), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -5329,7 +5329,7 @@ void Convert8To16Row_SSE2(const uint8_t* src_y, // 32 pixels per loop. LABELALIGN - "1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" "movdqa %%xmm0,%%xmm1 \n" "punpcklbw %%xmm0,%%xmm0 \n" @@ -5360,7 +5360,7 @@ void Convert8To16Row_AVX2(const uint8_t* src_y, // 32 pixels per loop. 
LABELALIGN - "1: \n" + "1: \n" "vmovdqu (%0),%%ymm0 \n" "vpermq $0xd8,%%ymm0,%%ymm0 \n" "add $0x20,%0 \n" @@ -5373,7 +5373,7 @@ void Convert8To16Row_AVX2(const uint8_t* src_y, "add $0x40,%1 \n" "sub $0x20,%2 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src_y), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -5410,7 +5410,7 @@ void SplitRGBRow_SSSE3(const uint8_t* src_rgb, uint8_t* dst_b, int width) { asm volatile( - "1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" "movdqu 0x20(%0),%%xmm2 \n" @@ -5474,7 +5474,7 @@ void SplitRGBRow_SSE41(const uint8_t* src_rgb, int width) { asm volatile( "movdqa 48(%5), %%xmm0 \n" - "1: \n" + "1: \n" "movdqu (%0),%%xmm1 \n" "movdqu 0x10(%0),%%xmm2 \n" "movdqu 0x20(%0),%%xmm3 \n" @@ -5523,7 +5523,7 @@ void SplitRGBRow_AVX2(const uint8_t* src_rgb, "vbroadcasti128 16(%5), %%ymm9 \n" "vbroadcasti128 32(%5), %%ymm10 \n" #endif - "1: \n" + "1: \n" "vmovdqu (%0),%%ymm4 \n" "vmovdqu 0x20(%0),%%ymm5 \n" "vmovdqu 0x40(%0),%%ymm6 \n" @@ -5601,7 +5601,7 @@ void MergeRGBRow_SSSE3(const uint8_t* src_r, uint8_t* dst_rgb, int width) { asm volatile( - "1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu (%1),%%xmm1 \n" "movdqu (%2),%%xmm2 \n" @@ -5661,7 +5661,7 @@ void MergeARGBRow_SSE2(const uint8_t* src_r, "sub %0,%3 \n" LABELALIGN - "1: \n" + "1: \n" "movq (%0,%2),%%xmm0 \n" // B "movq (%0),%%xmm1 \n" // R @@ -5697,7 +5697,7 @@ void MergeXRGBRow_SSE2(const uint8_t* src_r, uint8_t* dst_argb, int width) { asm volatile( - "1: \n" + "1: \n" "movq (%2),%%xmm0 \n" // B "movq (%0),%%xmm1 \n" // R @@ -5740,7 +5740,7 @@ void MergeARGBRow_AVX2(const uint8_t* src_r, "sub %0,%3 \n" LABELALIGN - "1: \n" + "1: \n" "vmovdqu (%0,%2),%%xmm0 \n" // B "vmovdqu (%0,%1),%%xmm1 \n" // R @@ -5761,7 +5761,7 @@ void MergeARGBRow_AVX2(const uint8_t* src_r, "lea 64(%4),%4 \n" "sub $0x10,%5 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src_r), // %0 "+r"(src_g), // %1 "+r"(src_b), // %2 @@ -5780,7 +5780,7 @@ void MergeXRGBRow_AVX2(const uint8_t* src_r, uint8_t* dst_argb, int width) { asm volatile( - "1: \n" + "1: \n" "vmovdqu (%2),%%xmm0 \n" // B "vpcmpeqb %%ymm1,%%ymm1,%%ymm1 \n" // A(255) @@ -5803,7 +5803,7 @@ void MergeXRGBRow_AVX2(const uint8_t* src_r, "lea 64(%3),%3 \n" "sub $0x10,%4 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src_r), // %0 "+r"(src_g), // %1 "+r"(src_b), // %2 @@ -5827,7 +5827,7 @@ void SplitARGBRow_SSE2(const uint8_t* src_argb, "sub %1,%4 \n" LABELALIGN - "1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" // 00-0F "movdqu 16(%0),%%xmm1 \n" // 10-1F @@ -5873,7 +5873,7 @@ void SplitXRGBRow_SSE2(const uint8_t* src_argb, uint8_t* dst_b, int width) { asm volatile( - "1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" // 00-0F "movdqu 16(%0),%%xmm1 \n" // 10-1F @@ -5928,7 +5928,7 @@ void SplitARGBRow_SSSE3(const uint8_t* src_argb, "sub %1,%4 \n" LABELALIGN - "1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" // 00-0F "movdqu 16(%0),%%xmm1 \n" // 10-1F @@ -5971,7 +5971,7 @@ void SplitXRGBRow_SSSE3(const uint8_t* src_argb, "movdqa %5,%%xmm3 \n" LABELALIGN - "1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" // 00-0F "movdqu 16(%0),%%xmm1 \n" // 10-1F @@ -6016,7 +6016,7 @@ void SplitARGBRow_AVX2(const uint8_t* src_argb, "vbroadcastf128 %6,%%ymm4 \n" LABELALIGN - "1: \n" + "1: \n" "vmovdqu (%0),%%xmm0 \n" // 00-0F "vmovdqu 16(%0),%%xmm1 \n" // 10-1F @@ -6036,7 +6036,7 @@ void SplitARGBRow_AVX2(const uint8_t* src_argb, "lea 16(%1),%1 \n" "subl $0x10,%5 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src_argb), // %0 "+r"(dst_r), // %1 "+r"(dst_g), // %2 @@ 
-6064,7 +6064,7 @@ void SplitXRGBRow_AVX2(const uint8_t* src_argb, "vbroadcastf128 %5,%%ymm4 \n" LABELALIGN - "1: \n" + "1: \n" "vmovdqu (%0),%%xmm0 \n" // 00-0F "vmovdqu 16(%0),%%xmm1 \n" // 10-1F @@ -6086,7 +6086,7 @@ void SplitXRGBRow_AVX2(const uint8_t* src_argb, "lea 16(%3),%3 \n" "sub $0x10,%4 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src_argb), // %0 "+r"(dst_r), // %1 "+r"(dst_g), // %2 @@ -6117,7 +6117,7 @@ void MergeXR30Row_AVX2(const uint16_t* src_r, "vmovd %5,%%xmm4 \n" LABELALIGN - "1: \n" + "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu (%0,%1),%%ymm1 \n" "vmovdqu (%0,%2),%%ymm2 \n" @@ -6145,7 +6145,7 @@ void MergeXR30Row_AVX2(const uint16_t* src_r, "lea 0x40(%3),%3 \n" "sub $0x10,%4 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src_r), // %0 "+r"(src_g), // %1 "+r"(src_b), // %2 @@ -6181,7 +6181,7 @@ void MergeAR64Row_AVX2(const uint16_t* src_r, "vbroadcastss %7,%%ymm7 \n" LABELALIGN - "1: \n" + "1: \n" "vmovdqu (%0),%%ymm0 \n" // R "vmovdqu (%0,%1),%%ymm1 \n" // G "vmovdqu (%0,%2),%%ymm2 \n" // B @@ -6214,7 +6214,7 @@ void MergeAR64Row_AVX2(const uint16_t* src_r, "lea 0x80(%4),%4 \n" "subl $0x10,%5 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src_r), // %0 "+r"(src_g), // %1 "+r"(src_b), // %2 @@ -6251,7 +6251,7 @@ void MergeXR64Row_AVX2(const uint16_t* src_r, "vbroadcastss %6,%%ymm7 \n" LABELALIGN - "1: \n" + "1: \n" "vmovdqu (%0),%%ymm0 \n" // R "vmovdqu (%0,%1),%%ymm1 \n" // G "vmovdqu (%0,%2),%%ymm2 \n" // B @@ -6281,7 +6281,7 @@ void MergeXR64Row_AVX2(const uint16_t* src_r, "lea 0x80(%3),%3 \n" "subl $0x10,%4 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src_r), // %0 "+r"(src_g), // %1 "+r"(src_b), // %2 @@ -6314,7 +6314,7 @@ void MergeARGB16To8Row_AVX2(const uint16_t* src_r, "vmovd %6,%%xmm6 \n" LABELALIGN - "1: \n" + "1: \n" "vmovdqu (%0),%%ymm0 \n" // R "vmovdqu (%0,%1),%%ymm1 \n" // G "vmovdqu (%0,%2),%%ymm2 \n" // B @@ -6337,7 +6337,7 @@ void MergeARGB16To8Row_AVX2(const uint16_t* src_r, "lea 0x40(%4),%4 \n" "subl $0x10,%5 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src_r), // %0 "+r"(src_g), // %1 "+r"(src_b), // %2 @@ -6371,7 +6371,7 @@ void MergeXRGB16To8Row_AVX2(const uint16_t* src_r, "vpsrlw $8,%%ymm3,%%ymm3 \n" // A (0xff) LABELALIGN - "1: \n" + "1: \n" "vmovdqu (%0),%%ymm0 \n" // R "vmovdqu (%0,%1),%%ymm1 \n" // G "vmovdqu (%0,%2),%%ymm2 \n" // B @@ -6392,7 +6392,7 @@ void MergeXRGB16To8Row_AVX2(const uint16_t* src_r, "lea 0x40(%3),%3 \n" "subl $0x10,%4 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src_r), // %0 "+r"(src_g), // %1 "+r"(src_b), // %2 @@ -6413,7 +6413,7 @@ void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { "jne 2f \n" LABELALIGN - "1: \n" + "1: \n" "movdqa (%0),%%xmm0 \n" "movdqa 0x10(%0),%%xmm1 \n" "lea 0x20(%0),%0 \n" @@ -6425,7 +6425,7 @@ void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { "jmp 9f \n" LABELALIGN - "2: \n" + "2: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" "lea 0x20(%0),%0 \n" @@ -6435,7 +6435,7 @@ void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { "sub $0x20,%2 \n" "jg 2b \n" - LABELALIGN "9: \n" + LABELALIGN "9: \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -6447,7 +6447,7 @@ void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { #ifdef HAS_COPYROW_AVX void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width) { asm volatile( - "1: \n" + "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" "lea 0x40(%0),%0 \n" @@ -6456,7 +6456,7 @@ void CopyRow_AVX(const 
uint8_t* src, uint8_t* dst, int width) { "lea 0x40(%1),%1 \n" "sub $0x40,%2 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -6468,7 +6468,7 @@ void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width) { #ifdef HAS_COPYROW_AVX512BW void CopyRow_AVX512BW(const uint8_t* src, uint8_t* dst, int width) { asm volatile( - "1: \n" + "1: \n" "vmovups (%0),%%zmm0 \n" "vmovups 0x40(%0),%%zmm1 \n" "lea 0x80(%0),%0 \n" @@ -6477,7 +6477,7 @@ void CopyRow_AVX512BW(const uint8_t* src, uint8_t* dst, int width) { "lea 0x80(%1),%1 \n" "sub $0x80,%2 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -6509,7 +6509,7 @@ void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { "psrld $0x8,%%xmm1 \n" LABELALIGN - "1: \n" + "1: \n" "movdqu (%0),%%xmm2 \n" "movdqu 0x10(%0),%%xmm3 \n" "lea 0x20(%0),%0 \n" @@ -6542,7 +6542,7 @@ void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { "vpsrld $0x8,%%ymm0,%%ymm0 \n" LABELALIGN - "1: \n" + "1: \n" "vmovdqu (%0),%%ymm1 \n" "vmovdqu 0x20(%0),%%ymm2 \n" "lea 0x40(%0),%0 \n" @@ -6553,7 +6553,7 @@ void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { "lea 0x40(%1),%1 \n" "sub $0x10,%2 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -6568,7 +6568,7 @@ void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb, uint8_t* dst_a, int width) { asm volatile( - "1: \n" + "1: \n" "movdqu (%0), %%xmm0 \n" "movdqu 0x10(%0), %%xmm1 \n" "lea 0x20(%0), %0 \n" @@ -6601,7 +6601,7 @@ void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb, "vbroadcastf128 %4,%%ymm5 \n" LABELALIGN - "1: \n" + "1: \n" "vmovdqu (%0), %%ymm0 \n" "vmovdqu 0x20(%0), %%ymm1 \n" "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // vpsrld $0x18, %%ymm0 @@ -6619,7 +6619,7 @@ void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb, "lea 0x20(%1),%1 \n" "sub $0x20, %2 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src_argb), // %0 "+r"(dst_a), // %1 "+rm"(width) // %2 @@ -6639,7 +6639,7 @@ void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { "psrld $0x8,%%xmm1 \n" LABELALIGN - "1: \n" + "1: \n" "movq (%0),%%xmm2 \n" "lea 0x8(%0),%0 \n" "punpcklbw %%xmm2,%%xmm2 \n" @@ -6674,7 +6674,7 @@ void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { "vpsrld $0x8,%%ymm0,%%ymm0 \n" LABELALIGN - "1: \n" + "1: \n" "vpmovzxbd (%0),%%ymm1 \n" "vpmovzxbd 0x8(%0),%%ymm2 \n" "lea 0x10(%0),%0 \n" @@ -6687,7 +6687,7 @@ void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { "lea 0x40(%1),%1 \n" "sub $0x10,%2 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -6733,7 +6733,7 @@ void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { "psrlw $0x8,%%xmm5 \n" LABELALIGN - "1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" "lea 0x20(%0),%0 \n" @@ -6756,7 +6756,7 @@ void YUY2ToNVUVRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_uv, int width) { asm volatile( - "1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" "movdqu 0x00(%0,%3,1),%%xmm2 \n" @@ -6789,7 +6789,7 @@ void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2, "sub %1,%2 \n" LABELALIGN - "1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" "movdqu 0x00(%0,%4,1),%%xmm2 \n" @@ -6828,7 +6828,7 @@ void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2, "sub %1,%2 \n" LABELALIGN - "1: \n" + "1: \n" 
"movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" "lea 0x20(%0),%0 \n" @@ -6855,7 +6855,7 @@ void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2, void UYVYToYRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { asm volatile( - "1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" "lea 0x20(%0),%0 \n" @@ -6884,7 +6884,7 @@ void UYVYToUVRow_SSE2(const uint8_t* src_uyvy, "sub %1,%2 \n" LABELALIGN - "1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" "movdqu 0x00(%0,%4,1),%%xmm2 \n" @@ -6923,7 +6923,7 @@ void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy, "sub %1,%2 \n" LABELALIGN - "1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" "lea 0x20(%0),%0 \n" @@ -6956,7 +6956,7 @@ void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { "vpsrlw $0x8,%%ymm5,%%ymm5 \n" LABELALIGN - "1: \n" + "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" "lea 0x40(%0),%0 \n" @@ -6968,7 +6968,7 @@ void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { "lea 0x20(%1),%1 \n" "sub $0x20,%2 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src_yuy2), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -6981,7 +6981,7 @@ void YUY2ToNVUVRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_uv, int width) { asm volatile( - "1: \n" + "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" "vpavgb 0x00(%0,%3,1),%%ymm0,%%ymm0 \n" @@ -6995,7 +6995,7 @@ void YUY2ToNVUVRow_AVX2(const uint8_t* src_yuy2, "lea 0x20(%1),%1 \n" "sub $0x20,%2 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src_yuy2), // %0 "+r"(dst_uv), // %1 "+r"(width) // %2 @@ -7014,7 +7014,7 @@ void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2, "sub %1,%2 \n" LABELALIGN - "1: \n" + "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" @@ -7035,7 +7035,7 @@ void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2, "lea 0x10(%1),%1 \n" "sub $0x20,%3 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src_yuy2), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -7054,7 +7054,7 @@ void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2, "sub %1,%2 \n" LABELALIGN - "1: \n" + "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" "lea 0x40(%0),%0 \n" @@ -7073,7 +7073,7 @@ void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2, "lea 0x10(%1),%1 \n" "sub $0x20,%3 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src_yuy2), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -7084,7 +7084,7 @@ void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2, void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { asm volatile( - "1: \n" + "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" "lea 0x40(%0),%0 \n" @@ -7096,7 +7096,7 @@ void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { "lea 0x20(%1),%1 \n" "sub $0x20,%2 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src_uyvy), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -7114,7 +7114,7 @@ void UYVYToUVRow_AVX2(const uint8_t* src_uyvy, "sub %1,%2 \n" LABELALIGN - "1: \n" + "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" @@ -7135,7 +7135,7 @@ void UYVYToUVRow_AVX2(const uint8_t* src_uyvy, "lea 0x10(%1),%1 \n" "sub $0x20,%3 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src_uyvy), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -7154,7 +7154,7 @@ void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy, "sub %1,%2 \n" LABELALIGN - "1: \n" + "1: \n" "vmovdqu (%0),%%ymm0 \n" 
"vmovdqu 0x20(%0),%%ymm1 \n" "lea 0x40(%0),%0 \n" @@ -7173,7 +7173,7 @@ void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy, "lea 0x10(%1),%1 \n" "sub $0x20,%3 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src_uyvy), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -7207,7 +7207,7 @@ void ARGBBlendRow_SSSE3(const uint8_t* src_argb, // 4 pixel loop. LABELALIGN - "40: \n" + "40: \n" "movdqu (%0),%%xmm3 \n" "lea 0x10(%0),%0 \n" "movdqa %%xmm3,%%xmm0 \n" @@ -7231,12 +7231,12 @@ void ARGBBlendRow_SSSE3(const uint8_t* src_argb, "sub $0x4,%3 \n" "jge 40b \n" - "49: \n" + "49: \n" "add $0x3,%3 \n" "jl 99f \n" // 1 pixel loop. - "91: \n" + "91: \n" "movd (%0),%%xmm3 \n" "lea 0x4(%0),%0 \n" "movdqa %%xmm3,%%xmm0 \n" @@ -7259,7 +7259,7 @@ void ARGBBlendRow_SSSE3(const uint8_t* src_argb, "lea 0x4(%2),%2 \n" "sub $0x1,%3 \n" "jge 91b \n" - "99: \n" + "99: \n" : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 @@ -7296,7 +7296,7 @@ void BlendPlaneRow_SSSE3(const uint8_t* src0, // 8 pixel loop. LABELALIGN - "1: \n" + "1: \n" "movq (%2),%%xmm0 \n" "punpcklbw %%xmm0,%%xmm0 \n" "pxor %%xmm5,%%xmm0 \n" @@ -7348,7 +7348,7 @@ void BlendPlaneRow_AVX2(const uint8_t* src0, // 32 pixel loop. LABELALIGN - "1: \n" + "1: \n" "vmovdqu (%2),%%ymm0 \n" "vpunpckhbw %%ymm0,%%ymm0,%%ymm3 \n" "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" @@ -7371,7 +7371,7 @@ void BlendPlaneRow_AVX2(const uint8_t* src0, "lea 0x20(%2),%2 \n" "sub $0x20,%4 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src0), // %0 "+r"(src1), // %1 "+r"(alpha), // %2 @@ -7404,7 +7404,7 @@ void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb, // 4 pixel loop. LABELALIGN - "1: \n" + "1: \n" "movdqu (%0),%%xmm6 \n" "movdqa %%xmm6,%%xmm0 \n" "movdqa %%xmm6,%%xmm1 \n" @@ -7459,7 +7459,7 @@ void ARGBAttenuateRow_AVX2(const uint8_t* src_argb, // 8 pixel loop. LABELALIGN - "1: \n" + "1: \n" "vmovdqu (%0),%%ymm6 \n" "vpunpcklbw %%ymm5,%%ymm6,%%ymm0 \n" "vpunpckhbw %%ymm5,%%ymm6,%%ymm1 \n" @@ -7478,7 +7478,7 @@ void ARGBAttenuateRow_AVX2(const uint8_t* src_argb, "lea 0x20(%0),%0 \n" "sub $0x8,%2 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -7497,7 +7497,7 @@ void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb, asm volatile( // 4 pixel loop. LABELALIGN - "1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" "movzb 0x03(%0),%3 \n" "punpcklbw %%xmm0,%%xmm0 \n" @@ -7548,7 +7548,7 @@ void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb, // 8 pixel loop. LABELALIGN - "1: \n" + "1: \n" // replace VPGATHER "movzb 0x03(%0),%3 \n" "vmovd 0x00(%4,%3,4),%%xmm0 \n" @@ -7589,7 +7589,7 @@ void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb, "lea 0x20(%0),%0 \n" "sub $0x8,%2 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width), // %2 @@ -7610,7 +7610,7 @@ void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width) { // 8 pixel loop. LABELALIGN - "1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" "psubb %%xmm5,%%xmm0 \n" @@ -7673,7 +7673,7 @@ void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) { // 8 pixel loop. LABELALIGN - "1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm6 \n" "pmaddubsw %%xmm2,%%xmm0 \n" @@ -7736,7 +7736,7 @@ void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb, // 8 pixel loop. 
LABELALIGN - "1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm7 \n" "pmaddubsw %%xmm2,%%xmm0 \n" @@ -7808,7 +7808,7 @@ void ARGBQuantizeRow_SSE2(uint8_t* dst_argb, // 4 pixel loop. LABELALIGN - "1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" "punpcklbw %%xmm5,%%xmm0 \n" "pmulhuw %%xmm2,%%xmm0 \n" @@ -7850,7 +7850,7 @@ void ARGBShadeRow_SSE2(const uint8_t* src_argb, // 4 pixel loop. LABELALIGN - "1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" "lea 0x10(%0),%0 \n" "movdqa %%xmm0,%%xmm1 \n" @@ -7883,7 +7883,7 @@ void ARGBMultiplyRow_SSE2(const uint8_t* src_argb, // 4 pixel loop. LABELALIGN - "1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" "lea 0x10(%0),%0 \n" "movdqu (%1),%%xmm2 \n" @@ -7920,7 +7920,7 @@ void ARGBMultiplyRow_AVX2(const uint8_t* src_argb, // 4 pixel loop. LABELALIGN - "1: \n" + "1: \n" "vmovdqu (%0),%%ymm1 \n" "lea 0x20(%0),%0 \n" "vmovdqu (%1),%%ymm3 \n" @@ -7936,7 +7936,7 @@ void ARGBMultiplyRow_AVX2(const uint8_t* src_argb, "lea 0x20(%2),%2 \n" "sub $0x8,%3 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 @@ -7955,7 +7955,7 @@ void ARGBAddRow_SSE2(const uint8_t* src_argb, asm volatile( // 4 pixel loop. LABELALIGN - "1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" "lea 0x10(%0),%0 \n" "movdqu (%1),%%xmm1 \n" @@ -7983,7 +7983,7 @@ void ARGBAddRow_AVX2(const uint8_t* src_argb, asm volatile( // 4 pixel loop. LABELALIGN - "1: \n" + "1: \n" "vmovdqu (%0),%%ymm0 \n" "lea 0x20(%0),%0 \n" "vpaddusb (%1),%%ymm0,%%ymm0 \n" @@ -7992,7 +7992,7 @@ void ARGBAddRow_AVX2(const uint8_t* src_argb, "lea 0x20(%2),%2 \n" "sub $0x8,%3 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 @@ -8011,7 +8011,7 @@ void ARGBSubtractRow_SSE2(const uint8_t* src_argb, asm volatile( // 4 pixel loop. LABELALIGN - "1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" "lea 0x10(%0),%0 \n" "movdqu (%1),%%xmm1 \n" @@ -8039,7 +8039,7 @@ void ARGBSubtractRow_AVX2(const uint8_t* src_argb, asm volatile( // 4 pixel loop. LABELALIGN - "1: \n" + "1: \n" "vmovdqu (%0),%%ymm0 \n" "lea 0x20(%0),%0 \n" "vpsubusb (%1),%%ymm0,%%ymm0 \n" @@ -8048,7 +8048,7 @@ void ARGBSubtractRow_AVX2(const uint8_t* src_argb, "lea 0x20(%2),%2 \n" "sub $0x8,%3 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 @@ -8076,7 +8076,7 @@ void SobelXRow_SSE2(const uint8_t* src_y0, // 8 pixel loop. LABELALIGN - "1: \n" + "1: \n" "movq (%0),%%xmm0 \n" "movq 0x2(%0),%%xmm1 \n" "punpcklbw %%xmm5,%%xmm0 \n" @@ -8129,7 +8129,7 @@ void SobelYRow_SSE2(const uint8_t* src_y0, // 8 pixel loop. LABELALIGN - "1: \n" + "1: \n" "movq (%0),%%xmm0 \n" "movq 0x00(%0,%1,1),%%xmm1 \n" "punpcklbw %%xmm5,%%xmm0 \n" @@ -8182,7 +8182,7 @@ void SobelRow_SSE2(const uint8_t* src_sobelx, // 8 pixel loop. LABELALIGN - "1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x00(%0,%1,1),%%xmm1 \n" "lea 0x10(%0),%0 \n" @@ -8229,7 +8229,7 @@ void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx, // 8 pixel loop. LABELALIGN - "1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x00(%0,%1,1),%%xmm1 \n" "lea 0x10(%0),%0 \n" @@ -8263,7 +8263,7 @@ void SobelXYRow_SSE2(const uint8_t* src_sobelx, // 8 pixel loop. LABELALIGN - "1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x00(%0,%1,1),%%xmm1 \n" "lea 0x10(%0),%0 \n" @@ -8315,7 +8315,7 @@ void ComputeCumulativeSumRow_SSE2(const uint8_t* row, // 4 pixel loop. 
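
// The Sobel rows compute 3x3 gradient taps on planar luma. SobelXRow takes
// column differences two pixels apart across three input rows with 1-2-1
// weights; SobelRow then saturating-adds the X and Y magnitudes. A scalar
// sketch (helper names and clamp layout are assumptions):
#include <stdint.h>
#include <stdlib.h>
static uint8_t Clamp255(int v) {
  return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
}
static void SobelX_Sketch(const uint8_t* y0, const uint8_t* y1,
                          const uint8_t* y2, uint8_t* dst, int width) {
  int i;
  for (i = 0; i < width; ++i) {
    int a = y0[i] - y0[i + 2];  // top row difference
    int b = y1[i] - y1[i + 2];  // middle row, weight 2
    int c = y2[i] - y2[i + 2];  // bottom row difference
    dst[i] = Clamp255(abs(a + 2 * b + c));
  }
}
// SobelRow output is then Clamp255(sobelx[i] + sobely[i]), replicated into
// B, G and R with alpha 255, matching the paddusb/punpck sequence above.
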
LABELALIGN - "40: \n" + "40: \n" "movdqu (%0),%%xmm2 \n" "lea 0x10(%0),%0 \n" "movdqa %%xmm2,%%xmm4 \n" @@ -8348,13 +8348,13 @@ void ComputeCumulativeSumRow_SSE2(const uint8_t* row, "sub $0x4,%3 \n" "jge 40b \n" - "49: \n" + "49: \n" "add $0x3,%3 \n" "jl 19f \n" // 1 pixel loop. LABELALIGN - "10: \n" + "10: \n" "movd (%0),%%xmm2 \n" "lea 0x4(%0),%0 \n" "punpcklbw %%xmm1,%%xmm2 \n" @@ -8368,7 +8368,7 @@ void ComputeCumulativeSumRow_SSE2(const uint8_t* row, "sub $0x1,%3 \n" "jge 10b \n" - "19: \n" + "19: \n" : "+r"(row), // %0 "+r"(cumsum), // %1 "+r"(previous_cumsum), // %2 @@ -8406,7 +8406,7 @@ void CumulativeSumToAverageRow_SSE2(const int32_t* topleft, // 4 pixel small loop. LABELALIGN - "4: \n" + "4: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" "movdqu 0x20(%0),%%xmm2 \n" @@ -8438,7 +8438,7 @@ void CumulativeSumToAverageRow_SSE2(const int32_t* topleft, // 4 pixel loop LABELALIGN - "40: \n" + "40: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" "movdqu 0x20(%0),%%xmm2 \n" @@ -8477,13 +8477,13 @@ void CumulativeSumToAverageRow_SSE2(const int32_t* topleft, "sub $0x4,%3 \n" "jge 40b \n" - "49: \n" + "49: \n" "add $0x3,%3 \n" "jl 19f \n" // 1 pixel loop LABELALIGN - "10: \n" + "10: \n" "movdqu (%0),%%xmm0 \n" "psubd 0x00(%0,%4,4),%%xmm0 \n" "lea 0x10(%0),%0 \n" @@ -8499,7 +8499,7 @@ void CumulativeSumToAverageRow_SSE2(const int32_t* topleft, "lea 0x4(%2),%2 \n" "sub $0x1,%3 \n" "jge 10b \n" - "19: \n" + "19: \n" : "+r"(topleft), // %0 "+r"(botleft), // %1 "+r"(dst), // %2 @@ -8542,7 +8542,7 @@ void ARGBAffineRow_SSE2(const uint8_t* src_argb, // 4 pixel loop LABELALIGN - "40: \n" + "40: \n" "cvttps2dq %%xmm2,%%xmm0 \n" // x,y float->int first 2 "cvttps2dq %%xmm3,%%xmm1 \n" // x,y float->int next 2 "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts @@ -8568,13 +8568,13 @@ void ARGBAffineRow_SSE2(const uint8_t* src_argb, "sub $0x4,%4 \n" "jge 40b \n" - "49: \n" + "49: \n" "add $0x3,%4 \n" "jl 19f \n" // 1 pixel loop LABELALIGN - "10: \n" + "10: \n" "cvttps2dq %%xmm2,%%xmm0 \n" "packssdw %%xmm0,%%xmm0 \n" "pmaddwd %%xmm5,%%xmm0 \n" @@ -8585,7 +8585,7 @@ void ARGBAffineRow_SSE2(const uint8_t* src_argb, "lea 0x04(%2),%2 \n" "sub $0x1,%4 \n" "jge 10b \n" - "19: \n" + "19: \n" : "+r"(src_argb), // %0 "+r"(src_argb_stride_temp), // %1 "+r"(dst_argb), // %2 @@ -8625,7 +8625,7 @@ void InterpolateRow_SSSE3(uint8_t* dst_ptr, // General purpose row blend. LABELALIGN - "1: \n" + "1: \n" "movdqu (%1),%%xmm0 \n" "movdqu 0x00(%1,%4,1),%%xmm2 \n" "movdqa %%xmm0,%%xmm1 \n" @@ -8650,7 +8650,7 @@ void InterpolateRow_SSSE3(uint8_t* dst_ptr, // Blend 50 / 50. LABELALIGN - "50: \n" + "50: \n" "movdqu (%1),%%xmm0 \n" "movdqu 0x00(%1,%4,1),%%xmm1 \n" "pavgb %%xmm1,%%xmm0 \n" @@ -8662,14 +8662,14 @@ void InterpolateRow_SSSE3(uint8_t* dst_ptr, // Blend 100 / 0 - Copy row unchanged. LABELALIGN - "100: \n" + "100: \n" "movdqu (%1),%%xmm0 \n" "movdqu %%xmm0,0x00(%1,%0,1) \n" "lea 0x10(%1),%1 \n" "sub $0x10,%2 \n" "jg 100b \n" - "99: \n" + "99: \n" : "+r"(dst_ptr), // %0 "+r"(src_ptr), // %1 "+rm"(width), // %2 @@ -8706,7 +8706,7 @@ void InterpolateRow_AVX2(uint8_t* dst_ptr, // General purpose row blend. LABELALIGN - "1: \n" + "1: \n" "vmovdqu (%1),%%ymm0 \n" "vmovdqu 0x00(%1,%4,1),%%ymm2 \n" "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n" @@ -8728,7 +8728,7 @@ void InterpolateRow_AVX2(uint8_t* dst_ptr, // Blend 50 / 50. 
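
// InterpolateRow blends a row with the row src_stride below it using a
// 0..256 fraction, which is why the asm keeps three paths: the general
// weighted case, label 50 for fraction 128 (a plain average), and label 100
// for fraction 0 (a copy). Scalar sketch of the general path:
#include <stdint.h>
static void Interpolate_Sketch(uint8_t* dst, const uint8_t* src,
                               const uint8_t* src_below, int width,
                               int y1_fraction) {  // 0..256
  int y0_fraction = 256 - y1_fraction;
  int x;
  for (x = 0; x < width; ++x) {
    dst[x] = (uint8_t)(
        (src[x] * y0_fraction + src_below[x] * y1_fraction + 128) >> 8);
  }
}
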
LABELALIGN - "50: \n" + "50: \n" "vmovdqu (%1),%%ymm0 \n" "vpavgb 0x00(%1,%4,1),%%ymm0,%%ymm0 \n" "vmovdqu %%ymm0,0x00(%1,%0,1) \n" @@ -8739,15 +8739,15 @@ void InterpolateRow_AVX2(uint8_t* dst_ptr, // Blend 100 / 0 - Copy row unchanged. LABELALIGN - "100: \n" + "100: \n" "vmovdqu (%1),%%ymm0 \n" "vmovdqu %%ymm0,0x00(%1,%0,1) \n" "lea 0x20(%1),%1 \n" "sub $0x20,%2 \n" "jg 100b \n" - "99: \n" - "vzeroupper \n" + "99: \n" + "vzeroupper \n" : "+r"(dst_ptr), // %0 "+r"(src_ptr), // %1 "+r"(width), // %2 @@ -8766,7 +8766,7 @@ void ARGBShuffleRow_SSSE3(const uint8_t* src_argb, asm volatile("movdqu (%3),%%xmm5 \n" LABELALIGN - "1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" "lea 0x20(%0),%0 \n" @@ -8794,7 +8794,7 @@ void ARGBShuffleRow_AVX2(const uint8_t* src_argb, asm volatile("vbroadcastf128 (%3),%%ymm5 \n" LABELALIGN - "1: \n" + "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" "lea 0x40(%0),%0 \n" @@ -8805,7 +8805,7 @@ void ARGBShuffleRow_AVX2(const uint8_t* src_argb, "lea 0x40(%1),%1 \n" "sub $0x10,%2 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -8823,7 +8823,7 @@ void I422ToYUY2Row_SSE2(const uint8_t* src_y, asm volatile("sub %1,%2 \n" LABELALIGN - "1: \n" + "1: \n" "movq (%1),%%xmm2 \n" "movq 0x00(%1,%2,1),%%xmm1 \n" "add $0x8,%1 \n" @@ -8857,7 +8857,7 @@ void I422ToUYVYRow_SSE2(const uint8_t* src_y, asm volatile("sub %1,%2 \n" LABELALIGN - "1: \n" + "1: \n" "movq (%1),%%xmm2 \n" "movq 0x00(%1,%2,1),%%xmm1 \n" "add $0x8,%1 \n" @@ -8891,7 +8891,7 @@ void I422ToYUY2Row_AVX2(const uint8_t* src_y, asm volatile("sub %1,%2 \n" LABELALIGN - "1: \n" + "1: \n" "vpmovzxbw (%1),%%ymm1 \n" "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n" "add $0x10,%1 \n" @@ -8908,7 +8908,7 @@ void I422ToYUY2Row_AVX2(const uint8_t* src_y, "lea 0x40(%3),%3 \n" "sub $0x20,%4 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 @@ -8928,7 +8928,7 @@ void I422ToUYVYRow_AVX2(const uint8_t* src_y, asm volatile("sub %1,%2 \n" LABELALIGN - "1: \n" + "1: \n" "vpmovzxbw (%1),%%ymm1 \n" "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n" "add $0x10,%1 \n" @@ -8945,7 +8945,7 @@ void I422ToUYVYRow_AVX2(const uint8_t* src_y, "lea 0x40(%3),%3 \n" "sub $0x20,%4 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 @@ -8965,7 +8965,7 @@ void ARGBPolynomialRow_SSE2(const uint8_t* src_argb, // 2 pixel loop. LABELALIGN - "1: \n" + "1: \n" "movq (%0),%%xmm0 \n" "lea 0x8(%0),%0 \n" "punpcklbw %%xmm3,%%xmm0 \n" @@ -9024,7 +9024,7 @@ void ARGBPolynomialRow_AVX2(const uint8_t* src_argb, // 2 pixel loop. LABELALIGN - "1: \n" + "1: \n" "vpmovzxbd (%0),%%ymm0 \n" // 2 ARGB pixels "lea 0x8(%0),%0 \n" "vcvtdq2ps %%ymm0,%%ymm0 \n" // X 8 floats @@ -9042,7 +9042,7 @@ void ARGBPolynomialRow_AVX2(const uint8_t* src_argb, "lea 0x8(%1),%1 \n" "sub $0x2,%2 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -9067,7 +9067,7 @@ void HalfFloatRow_SSE2(const uint16_t* src, // 16 pixel loop. LABELALIGN - "1: \n" + "1: \n" "movdqu (%0),%%xmm2 \n" // 8 shorts "add $0x10,%0 \n" "movdqa %%xmm2,%%xmm3 \n" @@ -9104,7 +9104,7 @@ void HalfFloatRow_AVX2(const uint16_t* src, // 16 pixel loop. 
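
// HalfFloatRow scales 16-bit samples and emits IEEE half-precision bit
// patterns. The SSE2/AVX2 paths use the classic rebias trick: multiply the
// float by scale * 2^-112 so its exponent lands in half-float range, then
// shift the bit pattern right by 13. A scalar sketch (assumes results stay
// in the normal half range; no inf/NaN handling):
#include <stdint.h>
#include <string.h>
static uint16_t U16ToHalf_Sketch(uint16_t v, float scale) {
  float f = (float)v * (scale * 1.9259299444e-34f);  // 1.92593e-34 == 2^-112
  uint32_t bits;
  memcpy(&bits, &f, sizeof(bits));  // type-pun without aliasing issues
  return (uint16_t)(bits >> 13);    // same shift the vector code performs
}
// Example: v = 1 with scale = 1.0f yields 0x3C00, the half-float encoding
// of 1.0, because the 2^-112 multiply rebiased the exponent into range.
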
LABELALIGN - "1: \n" + "1: \n" "vmovdqu (%0),%%ymm2 \n" // 16 shorts "add $0x20,%0 \n" "vpunpckhwd %%ymm5,%%ymm2,%%ymm3 \n" // mutates @@ -9120,7 +9120,7 @@ void HalfFloatRow_AVX2(const uint16_t* src, "sub $0x10,%2 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -9144,7 +9144,7 @@ void HalfFloatRow_F16C(const uint16_t* src, // 16 pixel loop. LABELALIGN - "1: \n" + "1: \n" "vpmovzxwd (%0),%%ymm2 \n" // 16 shorts -> 16 ints "vpmovzxwd 0x10(%0),%%ymm3 \n" "vcvtdq2ps %%ymm2,%%ymm2 \n" @@ -9158,7 +9158,7 @@ void HalfFloatRow_F16C(const uint16_t* src, "add $0x20,%0 \n" "sub $0x10,%2 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -9177,7 +9177,7 @@ void HalfFloat1Row_F16C(const uint16_t* src, uint16_t* dst, float, int width) { "sub %0,%1 \n" // 16 pixel loop. LABELALIGN - "1: \n" + "1: \n" "vpmovzxwd (%0),%%ymm2 \n" // 16 shorts -> 16 ints "vpmovzxwd 0x10(%0),%%ymm3 \n" "vcvtdq2ps %%ymm2,%%ymm2 \n" @@ -9189,7 +9189,7 @@ void HalfFloat1Row_F16C(const uint16_t* src, uint16_t* dst, float, int width) { "add $0x20,%0 \n" "sub $0x10,%2 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -9207,7 +9207,7 @@ void ARGBColorTableRow_X86(uint8_t* dst_argb, asm volatile( // 1 pixel loop. LABELALIGN - "1: \n" + "1: \n" "movzb (%0),%1 \n" "lea 0x4(%0),%0 \n" "movzb 0x00(%3,%1,4),%1 \n" @@ -9240,7 +9240,7 @@ void RGBColorTableRow_X86(uint8_t* dst_argb, asm volatile( // 1 pixel loop. LABELALIGN - "1: \n" + "1: \n" "movzb (%0),%1 \n" "lea 0x4(%0),%0 \n" "movzb 0x00(%3,%1,4),%1 \n" @@ -9279,7 +9279,7 @@ void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb, // 4 pixel loop. LABELALIGN - "1: \n" + "1: \n" "movdqu (%2),%%xmm0 \n" "pmaddubsw %%xmm3,%%xmm0 \n" "phaddw %%xmm0,%%xmm0 \n" @@ -9379,7 +9379,7 @@ void NV21ToYUV24Row_SSSE3(const uint8_t* src_y, "movdqa (%4),%%xmm4 \n" // 3 shuffler constants "movdqa 16(%4),%%xmm5 \n" "movdqa 32(%4),%%xmm6 \n" - "1: \n" + "1: \n" "movdqu (%0),%%xmm2 \n" // load 16 Y values "movdqu (%0,%1),%%xmm3 \n" // load 8 VU values "lea 16(%0),%0 \n" @@ -9418,7 +9418,7 @@ void NV21ToYUV24Row_AVX2(const uint8_t* src_y, "vbroadcastf128 16(%4),%%ymm5 \n" "vbroadcastf128 32(%4),%%ymm6 \n" - "1: \n" + "1: \n" "vmovdqu (%0),%%ymm2 \n" // load 32 Y values "vmovdqu (%0,%1),%%ymm3 \n" // load 16 VU values "lea 32(%0),%0 \n" @@ -9437,7 +9437,7 @@ void NV21ToYUV24Row_AVX2(const uint8_t* src_y, "lea 96(%2),%2 \n" "sub $32,%3 \n" // 32 pixels per loop "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src_y), // %0 "+r"(src_vu), // %1 "+r"(dst_yuv24), // %2 @@ -9465,7 +9465,7 @@ void NV21ToYUV24Row_AVX512(const uint8_t* src_y, "vmovdqa (%4),%%ymm4 \n" // 3 shuffler constants "vmovdqa 32(%4),%%ymm5 \n" "vmovdqa 64(%4),%%ymm6 \n" LABELALIGN - "1: \n" + "1: \n" "vmovdqu (%0),%%ymm2 \n" // load 32 Y values "vmovdqu (%0,%1),%%ymm3 \n" // load 16 VU values "lea 32(%0),%0 \n" @@ -9480,7 +9480,7 @@ void NV21ToYUV24Row_AVX512(const uint8_t* src_y, "lea 96(%2),%2 \n" "sub $32,%3 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src_y), // %0 "+r"(src_vu), // %1 "+r"(dst_yuv24), // %2 @@ -9502,7 +9502,7 @@ void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width) { asm volatile("movdqu %3,%%xmm5 \n" LABELALIGN - "1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" "lea 0x20(%0),%0 \n" @@ -9526,7 +9526,7 @@ void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width) { asm 
volatile("vbroadcastf128 %3,%%ymm5 \n" LABELALIGN - "1: \n" + "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" "lea 0x40(%0),%0 \n" @@ -9537,7 +9537,7 @@ void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width) { "lea 0x40(%1),%1 \n" "sub $0x20,%2 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src_uv), // %0 "+r"(dst_vu), // %1 "+r"(width) // %2 @@ -9559,7 +9559,7 @@ void HalfMergeUVRow_SSSE3(const uint8_t* src_u, "pxor %%xmm5,%%xmm5 \n" LABELALIGN - "1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" // load 16 U values "movdqu (%1),%%xmm1 \n" // load 16 V values "movdqu 0(%0,%4,1),%%xmm2 \n" // 16 from next row @@ -9605,7 +9605,7 @@ void HalfMergeUVRow_AVX2(const uint8_t* src_u, "vpxor %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN - "1: \n" + "1: \n" "vmovdqu (%0),%%ymm0 \n" // load 32 U values "vmovdqu (%1),%%ymm1 \n" // load 32 V values "vmovdqu 0(%0,%4,1),%%ymm2 \n" // 32 from next row @@ -9629,7 +9629,7 @@ void HalfMergeUVRow_AVX2(const uint8_t* src_u, "lea 0x20(%2),%2 \n" "sub $0x20,%3 \n" // 32 src pixels per loop "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src_u), // %0 "+r"(src_v), // %1 "+r"(dst_uv), // %2 @@ -9644,7 +9644,7 @@ void ClampFloatToZero_SSE2(const float* src_x, float* dst_y, int width) { "pxor %%xmm1,%%xmm1 \n" LABELALIGN - "1: \n" + "1: \n" "movd (%0),%%xmm0 \n" // load float "maxss %%xmm1, %%xmm0 \n" // clamp to zero "add 4, %0 \n" @@ -9676,7 +9676,7 @@ void Convert8To8Row_AVX2(const uint8_t* src_y, // 32 pixels per loop. LABELALIGN - "1: \n" + "1: \n" "vmovdqu (%0),%%ymm0 \n" "vpunpckhbw %%ymm4,%%ymm0,%%ymm1 \n" // mutates "vpunpcklbw %%ymm4,%%ymm0,%%ymm0 \n" @@ -9688,7 +9688,7 @@ void Convert8To8Row_AVX2(const uint8_t* src_y, "add $0x20,%0 \n" "sub $0x20,%2 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src_y), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 diff --git a/source/row_neon.cc b/source/row_neon.cc index 16ad3a936..cb86b3f42 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -143,7 +143,7 @@ void I444ToARGBRow_NEON(const uint8_t* src_y, asm volatile( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" - "1: \n" READYUV444 + "1: \n" READYUV444 "subs %[width], %[width], #8 \n" YUVTORGB RGBTORGB8 "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n" "bgt 1b \n" @@ -165,7 +165,7 @@ void I444ToRGB24Row_NEON(const uint8_t* src_y, int width) { asm volatile( YUVTORGB_SETUP - "1: \n" READYUV444 + "1: \n" READYUV444 "subs %[width], %[width], #8 \n" YUVTORGB RGBTORGB8 "vst3.8 {d0, d2, d4}, [%[dst_rgb24]]! \n" "bgt 1b \n" @@ -188,7 +188,7 @@ void I422ToARGBRow_NEON(const uint8_t* src_y, asm volatile( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" - "1: \n" READYUV422 + "1: \n" READYUV422 "subs %[width], %[width], #8 \n" YUVTORGB RGBTORGB8 "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n" "bgt 1b \n" @@ -211,7 +211,7 @@ void I444AlphaToARGBRow_NEON(const uint8_t* src_y, int width) { asm volatile( YUVTORGB_SETUP - "1: \n" READYUV444 + "1: \n" READYUV444 "subs %[width], %[width], #8 \n" YUVTORGB RGBTORGB8 "vld1.8 {d6}, [%[src_a]]! \n" "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n" @@ -236,7 +236,7 @@ void I422AlphaToARGBRow_NEON(const uint8_t* src_y, int width) { asm volatile( YUVTORGB_SETUP - "1: \n" READYUV422 + "1: \n" READYUV422 "subs %[width], %[width], #8 \n" YUVTORGB RGBTORGB8 "vld1.8 {d6}, [%[src_a]]! \n" "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! 
\n" @@ -261,7 +261,7 @@ void I422ToRGBARow_NEON(const uint8_t* src_y, asm volatile( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" - "1: \n" READYUV422 + "1: \n" READYUV422 "subs %[width], %[width], #8 \n" YUVTORGB RGBTORGB8 STORERGBA "bgt 1b \n" : [src_y] "+r"(src_y), // %[src_y] @@ -283,7 +283,7 @@ void I422ToRGB24Row_NEON(const uint8_t* src_y, asm volatile( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" - "1: \n" READYUV422 + "1: \n" READYUV422 "subs %[width], %[width], #8 \n" YUVTORGB RGBTORGB8 "vst3.8 {d0, d2, d4}, [%[dst_rgb24]]! \n" "bgt 1b \n" @@ -313,7 +313,7 @@ void I422ToRGB565Row_NEON(const uint8_t* src_y, asm volatile( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" - "1: \n" READYUV422 + "1: \n" READYUV422 "subs %[width], %[width], #8 \n" YUVTORGB RGBTORGB8 ARGBTORGB565 "vst1.8 {q2}, [%[dst_rgb565]]! \n" // store 8 pixels RGB565. @@ -345,7 +345,7 @@ void I422ToARGB1555Row_NEON(const uint8_t* src_y, int width) { asm volatile( YUVTORGB_SETUP - "1: \n" READYUV422 + "1: \n" READYUV422 "subs %[width], %[width], #8 \n" YUVTORGB RGBTORGB8 "vmov.u8 d6, #0xff \n" ARGBTOARGB1555 "vst1.8 {q3}, [%[dst_argb1555]]! \n" // store 8 pixels RGB1555. @@ -379,7 +379,7 @@ void I422ToARGB4444Row_NEON(const uint8_t* src_y, YUVTORGB_SETUP "vmov.u8 d6, #255 \n" "vmov.u8 d7, #0x0f \n" // vbic bits to clear - "1: \n" READYUV422 YUVTORGB + "1: \n" READYUV422 YUVTORGB RGBTORGB8 "subs %[width], %[width], #8 \n" ARGBTOARGB4444 "vst1.8 {q0}, [%[dst_argb4444]]! \n" // store 8 pixels @@ -401,7 +401,7 @@ void I400ToARGBRow_NEON(const uint8_t* src_y, asm volatile( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" - "1: \n" READYUV400 YUVTORGB + "1: \n" READYUV400 YUVTORGB RGBTORGB8 "subs %[width], %[width], #8 \n" "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n" @@ -417,7 +417,7 @@ void I400ToARGBRow_NEON(const uint8_t* src_y, void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { asm volatile( "vmov.u8 d23, #255 \n" - "1: \n" + "1: \n" "vld1.8 {d20}, [%0]! \n" "subs %2, %2, #8 \n" "vmov d21, d20 \n" @@ -439,7 +439,7 @@ void NV12ToARGBRow_NEON(const uint8_t* src_y, asm volatile( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" - "1: \n" READNV12 YUVTORGB RGBTORGB8 + "1: \n" READNV12 YUVTORGB RGBTORGB8 "subs %[width], %[width], #8 \n" "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n" "bgt 1b \n" @@ -460,7 +460,7 @@ void NV21ToARGBRow_NEON(const uint8_t* src_y, asm volatile( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" - "1: \n" READNV21 YUVTORGB RGBTORGB8 + "1: \n" READNV21 YUVTORGB RGBTORGB8 "subs %[width], %[width], #8 \n" "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n" "bgt 1b \n" @@ -481,7 +481,7 @@ void NV12ToRGB24Row_NEON(const uint8_t* src_y, asm volatile( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" - "1: \n" READNV12 YUVTORGB RGBTORGB8 + "1: \n" READNV12 YUVTORGB RGBTORGB8 "subs %[width], %[width], #8 \n" "vst3.8 {d0, d2, d4}, [%[dst_rgb24]]! \n" "bgt 1b \n" @@ -502,7 +502,7 @@ void NV21ToRGB24Row_NEON(const uint8_t* src_y, asm volatile( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" - "1: \n" READNV21 YUVTORGB RGBTORGB8 + "1: \n" READNV21 YUVTORGB RGBTORGB8 "subs %[width], %[width], #8 \n" "vst3.8 {d0, d2, d4}, [%[dst_rgb24]]! \n" "bgt 1b \n" @@ -523,7 +523,7 @@ void NV12ToRGB565Row_NEON(const uint8_t* src_y, asm volatile( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" - "1: \n" READNV12 YUVTORGB RGBTORGB8 + "1: \n" READNV12 YUVTORGB RGBTORGB8 "subs %[width], %[width], #8 \n" ARGBTORGB565 "vst1.8 {q2}, [%[dst_rgb565]]! \n" // store 8 pixels RGB565. 
"bgt 1b \n" @@ -543,7 +543,7 @@ void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2, asm volatile( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" - "1: \n" READYUY2 YUVTORGB RGBTORGB8 + "1: \n" READYUY2 YUVTORGB RGBTORGB8 "subs %[width], %[width], #8 \n" "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n" "bgt 1b \n" @@ -562,7 +562,7 @@ void UYVYToARGBRow_NEON(const uint8_t* src_uyvy, asm volatile( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" - "1: \n" READUYVY YUVTORGB RGBTORGB8 + "1: \n" READUYVY YUVTORGB RGBTORGB8 "subs %[width], %[width], #8 \n" "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n" "bgt 1b \n" @@ -580,7 +580,7 @@ void SplitUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_v, int width) { asm volatile( - "1: \n" + "1: \n" "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV "subs %3, %3, #16 \n" // 16 processed per loop "vst1.8 {q0}, [%1]! \n" // store U @@ -604,7 +604,7 @@ void DetileRow_NEON(const uint8_t* src, uint8_t* dst, int width) { asm volatile( - "1: \n" + "1: \n" "vld1.8 {q0}, [%0], %3 \n" // load 16 bytes "subs %2, %2, #16 \n" // 16 processed per loop "pld [%0, #1792] \n" @@ -624,7 +624,7 @@ void DetileRow_16_NEON(const uint16_t* src, uint16_t* dst, int width) { asm volatile( - "1: \n" + "1: \n" "vld1.16 {q0, q1}, [%0], %3 \n" // load 16 pixels "subs %2, %2, #16 \n" // 16 processed per loop "pld [%0, #3584] \n" @@ -645,7 +645,7 @@ void DetileSplitUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_v, int width) { asm volatile( - "1: \n" + "1: \n" "vld2.8 {d0, d1}, [%0], %4 \n" "subs %3, %3, #16 \n" "pld [%0, #1792] \n" @@ -670,7 +670,7 @@ void DetileToYUY2_NEON(const uint8_t* src_y, uint8_t* dst_yuy2, int width) { asm volatile( - "1: \n" + "1: \n" "vld1.8 {q0}, [%0], %4 \n" // Load 16 Y "pld [%0, #1792] \n" "vld1.8 {q1}, [%1], %5 \n" // Load 8 UV @@ -696,7 +696,7 @@ void DetileToYUY2_NEON(const uint8_t* src_y, uint8_t* dst_yuy2, int width) { asm volatile( - "1: \n" + "1: \n" "vld1.8 {q0}, [%0], %4 \n" // Load 16 Y "vld1.8 {q1}, [%1], %5 \n" // Load 8 UV "subs %3, %3, #16 \n" @@ -718,7 +718,7 @@ void DetileToYUY2_NEON(const uint8_t* src_y, void UnpackMT2T_NEON(const uint8_t* src, uint16_t* dst, size_t size) { asm volatile( - "1: \n" + "1: \n" "vld1.8 {q14}, [%0]! \n" // Load lower bits. "vld1.8 {q9}, [%0]! \n" // Load upper bits row // by row. @@ -762,7 +762,7 @@ void MergeUVRow_NEON(const uint8_t* src_u, uint8_t* dst_uv, int width) { asm volatile( - "1: \n" + "1: \n" "vld1.8 {q0}, [%0]! \n" // load U "vld1.8 {q1}, [%1]! \n" // load V "subs %3, %3, #16 \n" // 16 processed per loop @@ -784,7 +784,7 @@ void SplitRGBRow_NEON(const uint8_t* src_rgb, uint8_t* dst_b, int width) { asm volatile( - "1: \n" + "1: \n" "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB "vld3.8 {d1, d3, d5}, [%0]! \n" // next 8 RGB "subs %4, %4, #16 \n" // 16 processed per loop @@ -809,7 +809,7 @@ void MergeRGBRow_NEON(const uint8_t* src_r, uint8_t* dst_rgb, int width) { asm volatile( - "1: \n" + "1: \n" "vld1.8 {q0}, [%0]! \n" // load R "vld1.8 {q1}, [%1]! \n" // load G "vld1.8 {q2}, [%2]! \n" // load B @@ -835,7 +835,7 @@ void SplitARGBRow_NEON(const uint8_t* src_argb, uint8_t* dst_a, int width) { asm volatile( - "1: \n" + "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // next 8 ARGB "subs %5, %5, #16 \n" // 16 processed per loop @@ -863,7 +863,7 @@ void MergeARGBRow_NEON(const uint8_t* src_r, uint8_t* dst_argb, int width) { asm volatile( - "1: \n" + "1: \n" "vld1.8 {q2}, [%0]! \n" // load R "vld1.8 {q1}, [%1]! \n" // load G "vld1.8 {q0}, [%2]! 
\n" // load B @@ -890,7 +890,7 @@ void SplitXRGBRow_NEON(const uint8_t* src_argb, uint8_t* dst_b, int width) { asm volatile( - "1: \n" + "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // next 8 ARGB "subs %4, %4, #16 \n" // 16 processed per loop @@ -916,7 +916,7 @@ void MergeXRGBRow_NEON(const uint8_t* src_r, int width) { asm volatile( "vmov.u8 q3, #255 \n" // load A(255) - "1: \n" + "1: \n" "vld1.8 {q2}, [%0]! \n" // load R "vld1.8 {q1}, [%1]! \n" // load G "vld1.8 {q0}, [%2]! \n" // load B @@ -944,7 +944,7 @@ void MergeXR30Row_NEON(const uint16_t* src_r, asm volatile( "vmov.u32 q14, #1023 \n" "vdup.32 q15, %5 \n" - "1: \n" + "1: \n" "vld1.16 {d4}, [%2]! \n" // B "vld1.16 {d2}, [%1]! \n" // G "vld1.16 {d0}, [%0]! \n" // R @@ -980,7 +980,7 @@ void MergeXR30Row_10_NEON(const uint16_t* src_r, int width) { asm volatile( "vmov.u32 q14, #1023 \n" - "1: \n" + "1: \n" "vld1.16 {d4}, [%2]! \n" // B "vld1.16 {d2}, [%1]! \n" // G "vld1.16 {d0}, [%0]! \n" // R @@ -996,7 +996,7 @@ void MergeXR30Row_10_NEON(const uint16_t* src_r, "vorr.u32 q2, #0xc0000000 \n" // ARGB (AR30) "vst1.8 {q2}, [%3]! \n" "bgt 1b \n" - "3: \n" + "3: \n" : "+r"(src_r), // %0 "+r"(src_g), // %1 "+r"(src_b), // %2 @@ -1019,7 +1019,7 @@ void MergeAR64Row_NEON(const uint16_t* src_r, "vdup.u16 q15, %6 \n" "vdup.u16 q14, %7 \n" - "1: \n" + "1: \n" "vld1.16 {q2}, [%0]! \n" // R "vld1.16 {q1}, [%1]! \n" // G "vld1.16 {q0}, [%2]! \n" // B @@ -1060,7 +1060,7 @@ void MergeXR64Row_NEON(const uint16_t* src_r, "vmov.u8 q3, #0xff \n" // A (0xffff) "vdup.u16 q15, %5 \n" "vdup.u16 q14, %6 \n" - "1: \n" + "1: \n" "vld1.16 {q2}, [%0]! \n" // R "vld1.16 {q1}, [%1]! \n" // G "vld1.16 {q0}, [%2]! \n" // B @@ -1095,7 +1095,7 @@ void MergeARGB16To8Row_NEON(const uint16_t* src_r, asm volatile( "vdup.16 q15, %6 \n" - "1: \n" + "1: \n" "vld1.16 {q2}, [%0]! \n" // R "vld1.16 {q1}, [%1]! \n" // G "vld1.16 {q0}, [%2]! \n" // B @@ -1132,7 +1132,7 @@ void MergeXRGB16To8Row_NEON(const uint16_t* src_r, "vdup.16 q15, %5 \n" "vmov.u8 d6, #0xff \n" // A (0xff) - "1: \n" + "1: \n" "vld1.16 {q2}, [%0]! \n" // R "vld1.16 {q1}, [%1]! \n" // G "vld1.16 {q0}, [%2]! \n" // B @@ -1157,7 +1157,7 @@ void MergeXRGB16To8Row_NEON(const uint16_t* src_r, // Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15. void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) { asm volatile( - "1: \n" + "1: \n" "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32 "subs %2, %2, #32 \n" // 32 processed per loop "vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32 @@ -1174,7 +1174,7 @@ void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) { void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) { asm volatile( "vdup.8 q0, %2 \n" // duplicate 16 bytes - "1: \n" + "1: \n" "subs %1, %1, #16 \n" // 16 bytes per loop "vst1.8 {q0}, [%0]! \n" // store "bgt 1b \n" @@ -1188,7 +1188,7 @@ void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) { void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) { asm volatile( "vdup.u32 q0, %2 \n" // duplicate 4 ints - "1: \n" + "1: \n" "subs %1, %1, #4 \n" // 4 pixels per loop "vst1.8 {q0}, [%0]! \n" // store "bgt 1b \n" @@ -1204,7 +1204,7 @@ void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) { "add %0, %0, %2 \n" "sub %0, %0, #32 \n" // 32 bytes per loop - "1: \n" + "1: \n" "vld1.8 {q1, q2}, [%0], %3 \n" // src -= 32 "subs %2, #32 \n" // 32 pixels per loop. 
"vrev64.8 q0, q2 \n" @@ -1227,7 +1227,7 @@ void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width) { "add %0, %0, %2, lsl #1 \n" "sub %0, #16 \n" - "1: \n" + "1: \n" "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16 "subs %2, #8 \n" // 8 pixels per loop. "vrev64.8 q0, q0 \n" @@ -1250,7 +1250,7 @@ void MirrorSplitUVRow_NEON(const uint8_t* src_uv, "add %0, %0, %3, lsl #1 \n" "sub %0, #16 \n" - "1: \n" + "1: \n" "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16 "subs %3, #8 \n" // 8 pixels per loop. "vrev64.8 q0, q0 \n" @@ -1270,7 +1270,7 @@ void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { "add %0, %0, %2, lsl #2 \n" "sub %0, #32 \n" - "1: \n" + "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0], %3 \n" // src -= 32 "subs %2, #8 \n" // 8 pixels per loop. "vrev64.8 d0, d0 \n" @@ -1291,7 +1291,7 @@ void RGB24MirrorRow_NEON(const uint8_t* src_rgb24, int width) { src_rgb24 += width * 3 - 24; asm volatile( - "1: \n" + "1: \n" "vld3.8 {d0, d1, d2}, [%0], %3 \n" // src -= 24 "subs %2, #8 \n" // 8 pixels per loop. "vrev64.8 d0, d0 \n" @@ -1311,7 +1311,7 @@ void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, int width) { asm volatile( "vmov.u8 d4, #255 \n" // Alpha - "1: \n" + "1: \n" "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24. "subs %2, %2, #8 \n" // 8 processed per loop. "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. @@ -1327,7 +1327,7 @@ void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) { asm volatile( "vmov.u8 d4, #255 \n" // Alpha - "1: \n" + "1: \n" "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. "subs %2, %2, #8 \n" // 8 processed per loop. "vswp.u8 d1, d3 \n" // swap R, B @@ -1344,7 +1344,7 @@ void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) { void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { asm volatile( "vmov.u8 d0, #255 \n" // Alpha - "1: \n" + "1: \n" "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. "subs %2, %2, #8 \n" // 8 processed per loop. "vswp.u8 d1, d3 \n" // swap R, B @@ -1359,7 +1359,7 @@ void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { } void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { asm volatile( - "1: \n" + "1: \n" "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. "subs %2, %2, #8 \n" // 8 processed per loop. "vswp.u8 d1, d3 \n" // swap R, B @@ -1391,7 +1391,7 @@ void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, int width) { asm volatile( "vmov.u8 d3, #255 \n" // Alpha - "1: \n" + "1: \n" "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. "subs %2, %2, #8 \n" // 8 processed per loop. RGB565TOARGB @@ -1437,7 +1437,7 @@ void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555, int width) { asm volatile( "vmov.u8 d3, #255 \n" // Alpha - "1: \n" + "1: \n" "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. "subs %2, %2, #8 \n" // 8 processed per loop. ARGB1555TOARGB @@ -1466,7 +1466,7 @@ void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444, int width) { asm volatile( "vmov.u8 d3, #255 \n" // Alpha - "1: \n" + "1: \n" "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. "subs %2, %2, #8 \n" // 8 processed per loop. ARGB4444TOARGB @@ -1484,7 +1484,7 @@ void ARGBToRGB24Row_NEON(const uint8_t* src_argb, uint8_t* dst_rgb24, int width) { asm volatile( - "1: \n" + "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 16 pixels of ARGB. "vld4.8 {d1, d3, d5, d7}, [%0]! 
\n" "subs %2, %2, #16 \n" // 16 processed per loop. @@ -1501,7 +1501,7 @@ void ARGBToRGB24Row_NEON(const uint8_t* src_argb, void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) { asm volatile( - "1: \n" + "1: \n" "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. "vswp.u8 d1, d3 \n" // swap R, B @@ -1517,7 +1517,7 @@ void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) { void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { asm volatile( - "1: \n" + "1: \n" "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2. "subs %2, %2, #16 \n" // 16 processed per loop. "vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y. @@ -1532,7 +1532,7 @@ void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { asm volatile( - "1: \n" + "1: \n" "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY. "subs %2, %2, #16 \n" // 16 processed per loop. "vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y. @@ -1550,7 +1550,7 @@ void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2, uint8_t* dst_v, int width) { asm volatile( - "1: \n" + "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. "vst1.8 {d1}, [%1]! \n" // store 8 U. @@ -1570,7 +1570,7 @@ void UYVYToUV422Row_NEON(const uint8_t* src_uyvy, uint8_t* dst_v, int width) { asm volatile( - "1: \n" + "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. "vst1.8 {d0}, [%1]! \n" // store 8 U. @@ -1592,7 +1592,7 @@ void YUY2ToUVRow_NEON(const uint8_t* src_yuy2, int width) { asm volatile( "add %1, %0, %1 \n" // stride + src_yuy2 - "1: \n" + "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2. "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. @@ -1619,7 +1619,7 @@ void UYVYToUVRow_NEON(const uint8_t* src_uyvy, int width) { asm volatile( "add %1, %0, %1 \n" // stride + src_uyvy - "1: \n" + "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY. "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. @@ -1645,7 +1645,7 @@ void YUY2ToNVUVRow_NEON(const uint8_t* src_yuy2, int width) { asm volatile( "add %1, %0, %1 \n" // stride + src_yuy2 - "1: \n" + "1: \n" "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2. "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. "vld2.8 {q2, q3}, [%1]! \n" // load next row YUY2. @@ -1669,7 +1669,7 @@ void ARGBShuffleRow_NEON(const uint8_t* src_argb, int width) { asm volatile( "vld1.8 {q2}, [%3] \n" // shuffler - "1: \n" + "1: \n" "vld1.8 {q0}, [%0]! \n" // load 4 pixels. "subs %2, %2, #4 \n" // 4 processed per loop "vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels @@ -1690,7 +1690,7 @@ void I422ToYUY2Row_NEON(const uint8_t* src_y, uint8_t* dst_yuy2, int width) { asm volatile( - "1: \n" + "1: \n" "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys "vld1.8 {d1}, [%1]! \n" // load 8 Us "vld1.8 {d3}, [%2]! \n" // load 8 Vs @@ -1712,7 +1712,7 @@ void I422ToUYVYRow_NEON(const uint8_t* src_y, uint8_t* dst_uyvy, int width) { asm volatile( - "1: \n" + "1: \n" "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys "vld1.8 {d0}, [%1]! \n" // load 8 Us "vld1.8 {d2}, [%2]! 
\n" // load 8 Vs @@ -1732,7 +1732,7 @@ void ARGBToRGB565Row_NEON(const uint8_t* src_argb, uint8_t* dst_rgb565, int width) { asm volatile( - "1: \n" + "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. ARGBTORGB565 @@ -1751,7 +1751,7 @@ void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, int width) { asm volatile( "vdup.32 d7, %2 \n" // dither4 - "1: \n" + "1: \n" "vld4.8 {d0, d2, d4, d6}, [%1]! \n" // load 8 pixels of ARGB. "subs %3, %3, #8 \n" // 8 processed per loop. "vqadd.u8 d0, d0, d7 \n" @@ -1771,7 +1771,7 @@ void ARGBToARGB1555Row_NEON(const uint8_t* src_argb, uint8_t* dst_argb1555, int width) { asm volatile( - "1: \n" + "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. ARGBTOARGB1555 @@ -1790,7 +1790,7 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, asm volatile( "vmov.u8 d7, #0x0f \n" // bits to clear with // vbic. - "1: \n" + "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. ARGBTOARGB4444 @@ -1807,7 +1807,7 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, uint8_t* dst_a, int width) { asm volatile( - "1: \n" + "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels "subs %2, %2, #16 \n" // 16 processed per loop @@ -1847,7 +1847,7 @@ static void ARGBToUV444MatrixRow_NEON( "vneg.s8 d28, d28 \n" "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" + "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "subs %3, %3, #8 \n" // 8 processed per loop. "vmull.u8 q2, d0, d24 \n" // B @@ -1939,7 +1939,7 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb, "vmov.s16 q13, #18 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" + "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. "subs %4, %4, #16 \n" // 16 processed per loop. @@ -1984,7 +1984,7 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb, "vmov.s16 q13, #20 \n" // VB -0.08131 coefficient "vmov.s16 q14, #107 \n" // VG -0.41869 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" + "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. "subs %4, %4, #16 \n" // 16 processed per loop. @@ -2029,7 +2029,7 @@ void ABGRToUVJRow_NEON(const uint8_t* src_abgr, "vmov.s16 q13, #20 \n" // VB -0.08131 coefficient "vmov.s16 q14, #107 \n" // VG -0.41869 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" + "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels. "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels. "subs %4, %4, #16 \n" // 16 processed per loop. @@ -2075,7 +2075,7 @@ void RGB24ToUVJRow_NEON(const uint8_t* src_rgb24, "vmov.s16 q13, #20 \n" // VB -0.08131 coefficient "vmov.s16 q14, #107 \n" // VG -0.41869 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" + "1: \n" "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels. "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels. "subs %4, %4, #16 \n" // 16 processed per loop. @@ -2121,7 +2121,7 @@ void RAWToUVJRow_NEON(const uint8_t* src_raw, "vmov.s16 q13, #20 \n" // VB -0.08131 coefficient "vmov.s16 q14, #107 \n" // VG -0.41869 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" + "1: \n" "vld3.8 {d0, d2, d4}, [%0]! 
\n" // load 8 RAW pixels. "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels. "subs %4, %4, #16 \n" // 16 processed per loop. @@ -2166,7 +2166,7 @@ void BGRAToUVRow_NEON(const uint8_t* src_bgra, "vmov.s16 q13, #18 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" + "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels. "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels. "subs %4, %4, #16 \n" // 16 processed per loop. @@ -2211,7 +2211,7 @@ void ABGRToUVRow_NEON(const uint8_t* src_abgr, "vmov.s16 q13, #18 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" + "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels. "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels. "subs %4, %4, #16 \n" // 16 processed per loop. @@ -2256,7 +2256,7 @@ void RGBAToUVRow_NEON(const uint8_t* src_rgba, "vmov.s16 q13, #18 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" + "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels. "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels. "subs %4, %4, #16 \n" // 16 processed per loop. @@ -2301,7 +2301,7 @@ void RGB24ToUVRow_NEON(const uint8_t* src_rgb24, "vmov.s16 q13, #18 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" + "1: \n" "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels. "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels. "subs %4, %4, #16 \n" // 16 processed per loop. @@ -2346,7 +2346,7 @@ void RAWToUVRow_NEON(const uint8_t* src_raw, "vmov.s16 q13, #18 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" + "1: \n" "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels. "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels. "subs %4, %4, #16 \n" // 16 processed per loop. @@ -2392,7 +2392,7 @@ void RGB565ToUVRow_NEON(const uint8_t* src_rgb565, "vmov.s16 q13, #18 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" + "1: \n" "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. "subs %4, %4, #16 \n" // 16 processed per loop. RGB565TOARGB @@ -2457,7 +2457,7 @@ void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555, "vmov.s16 q13, #18 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" + "1: \n" "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. "subs %4, %4, #16 \n" // 16 processed per loop. RGB555TOARGB @@ -2522,7 +2522,7 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, "vmov.s16 q13, #18 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" + "1: \n" "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. "subs %4, %4, #16 \n" // 16 processed per loop. ARGB4444TOARGB @@ -2570,7 +2570,7 @@ void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient "vmov.u8 d27, #16 \n" // Add 16 constant - "1: \n" + "1: \n" "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. "subs %2, %2, #8 \n" // 8 processed per loop. 
RGB565TOARGB @@ -2596,7 +2596,7 @@ void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient "vmov.u8 d27, #16 \n" // Add 16 constant - "1: \n" + "1: \n" "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. "subs %2, %2, #8 \n" // 8 processed per loop. ARGB1555TOARGB @@ -2622,7 +2622,7 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient "vmov.u8 d27, #16 \n" // Add 16 constant - "1: \n" + "1: \n" "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. "subs %2, %2, #8 \n" // 8 processed per loop. ARGB4444TOARGB @@ -2644,7 +2644,7 @@ void ARGBToAR64Row_NEON(const uint8_t* src_argb, uint16_t* dst_ar64, int width) { asm volatile( - "1: \n" + "1: \n" "vld1.8 {q0}, [%0]! \n" "vld1.8 {q2}, [%0]! \n" "subs %2, %2, #8 \n" // 8 processed per loop. @@ -2669,7 +2669,7 @@ void ARGBToAB64Row_NEON(const uint8_t* src_argb, asm volatile( "vld1.8 {q4}, [%3] \n" // shuffler - "1: \n" + "1: \n" "vld1.8 {q0}, [%0]! \n" "vld1.8 {q2}, [%0]! \n" "subs %2, %2, #8 \n" // 8 processed per loop. @@ -2693,7 +2693,7 @@ void AR64ToARGBRow_NEON(const uint16_t* src_ar64, uint8_t* dst_argb, int width) { asm volatile( - "1: \n" + "1: \n" "vld1.16 {q0}, [%0]! \n" "vld1.16 {q1}, [%0]! \n" "vld1.16 {q2}, [%0]! \n" @@ -2721,7 +2721,7 @@ void AB64ToARGBRow_NEON(const uint16_t* src_ab64, asm volatile( "vld1.8 {d8}, [%3] \n" // shuffler - "1: \n" + "1: \n" "vld1.16 {q0}, [%0]! \n" "vld1.16 {q1}, [%0]! \n" "vld1.16 {q2}, [%0]! \n" @@ -2777,7 +2777,7 @@ static void ARGBToYMatrixRow_NEON(const uint8_t* src_argb, "vdup.u8 d21, d0[1] \n" "vdup.u8 d22, d0[2] \n" "vdup.u16 q12, d0[2] \n" - "1: \n" + "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 16 pixels of ARGB "vld4.8 {d1, d3, d5, d7}, [%0]! \n" "subs %2, %2, #16 \n" // 16 processed per loop. @@ -2827,7 +2827,7 @@ static void RGBAToYMatrixRow_NEON(const uint8_t* src_rgba, "vdup.u8 d21, d0[1] \n" "vdup.u8 d22, d0[2] \n" "vdup.u16 q12, d0[2] \n" - "1: \n" + "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 16 pixels of RGBA "vld4.8 {d1, d3, d5, d7}, [%0]! \n" "subs %2, %2, #16 \n" // 16 processed per loop. @@ -2871,7 +2871,7 @@ static void RGBToYMatrixRow_NEON(const uint8_t* src_rgb, "vdup.u8 d21, d0[1] \n" "vdup.u8 d22, d0[2] \n" "vdup.u16 q12, d0[2] \n" - "1: \n" + "1: \n" "vld3.8 {d2, d4, d6}, [%0]! \n" // load 16 pixels of // RGB24. "vld3.8 {d3, d5, d7}, [%0]! \n" @@ -2928,7 +2928,7 @@ void InterpolateRow_NEON(uint8_t* dst_ptr, "rsb %4, #256 \n" "vdup.8 d4, %4 \n" // General purpose row blend. - "1: \n" + "1: \n" "vld1.8 {q0}, [%1]! \n" "vld1.8 {q1}, [%2]! \n" "subs %3, %3, #16 \n" @@ -2943,7 +2943,7 @@ void InterpolateRow_NEON(uint8_t* dst_ptr, "b 99f \n" // Blend 50 / 50. - "50: \n" + "50: \n" "vld1.8 {q0}, [%1]! \n" "vld1.8 {q1}, [%2]! \n" "subs %3, %3, #16 \n" @@ -2953,13 +2953,13 @@ void InterpolateRow_NEON(uint8_t* dst_ptr, "b 99f \n" // Blend 100 / 0 - Copy row unchanged. - "100: \n" + "100: \n" "vld1.8 {q0}, [%1]! \n" "subs %3, %3, #16 \n" "vst1.8 {q0}, [%0]! \n" "bgt 100b \n" - "99: \n" + "99: \n" : "+r"(dst_ptr), // %0 "+r"(src_ptr), // %1 "+r"(src_stride), // %2 @@ -2988,7 +2988,7 @@ void InterpolateRow_16_NEON(uint16_t* dst_ptr, "vdup.16 d17, %4 \n" "vdup.16 d16, %5 \n" // General purpose row blend. - "1: \n" + "1: \n" "vld1.16 {q0}, [%1]! \n" "vld1.16 {q1}, [%2]! 
\n" "subs %3, %3, #8 \n" @@ -3003,7 +3003,7 @@ void InterpolateRow_16_NEON(uint16_t* dst_ptr, "b 99f \n" // Blend 50 / 50. - "50: \n" + "50: \n" "vld1.16 {q0}, [%1]! \n" "vld1.16 {q1}, [%2]! \n" "subs %3, %3, #8 \n" @@ -3013,13 +3013,13 @@ void InterpolateRow_16_NEON(uint16_t* dst_ptr, "b 99f \n" // Blend 100 / 0 - Copy row unchanged. - "100: \n" + "100: \n" "vld1.16 {q0}, [%1]! \n" "subs %3, %3, #8 \n" "vst1.16 {q0}, [%0]! \n" "bgt 100b \n" - "99: \n" + "99: \n" : "+r"(dst_ptr), // %0 "+r"(src_ptr), // %1 "+r"(src_ptr1), // %2 @@ -3038,7 +3038,7 @@ void ARGBBlendRow_NEON(const uint8_t* src_argb, "subs %3, #8 \n" "blt 89f \n" // Blend 8 pixels. - "8: \n" + "8: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0. "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1. "subs %3, %3, #8 \n" // 8 processed per loop. @@ -3056,12 +3056,12 @@ void ARGBBlendRow_NEON(const uint8_t* src_argb, "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB. "bge 8b \n" - "89: \n" + "89: \n" "adds %3, #8-1 \n" "blt 99f \n" // Blend 1 pixels. - "1: \n" + "1: \n" "vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0. "vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1. "subs %3, %3, #1 \n" // 1 processed per loop. @@ -3079,7 +3079,7 @@ void ARGBBlendRow_NEON(const uint8_t* src_argb, "vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n" // store 1 pixel. "bge 1b \n" - "99: \n" + "99: \n" : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 @@ -3097,7 +3097,7 @@ void ARGBAttenuateRow_NEON(const uint8_t* src_argb, "vmov.u16 q15, #0x00ff \n" // 255 for rounding up // Attenuate 8 pixels. - "1: \n" + "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. "vmull.u8 q10, d0, d3 \n" // b * a @@ -3129,7 +3129,7 @@ void ARGBQuantizeRow_NEON(uint8_t* dst_argb, "vdup.u16 q10, %4 \n" // interval add // 8 pixel loop. - "1: \n" + "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB. "subs %1, %1, #8 \n" // 8 processed per loop. "vmovl.u8 q0, d0 \n" // b (0 .. 255) @@ -3170,7 +3170,7 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb, "vshr.u16 q0, q0, #1 \n" // scale / 2. // 8 pixel loop. - "1: \n" + "1: \n" "vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. "vmovl.u8 q10, d20 \n" // b (0 .. 255) @@ -3202,7 +3202,7 @@ void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { "vmov.u8 d24, #29 \n" // B * 0.1140 coefficient "vmov.u8 d25, #150 \n" // G * 0.5870 coefficient "vmov.u8 d26, #77 \n" // R * 0.2990 coefficient - "1: \n" + "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "subs %2, %2, #8 \n" // 8 processed per loop. "vmull.u8 q2, d0, d24 \n" // B @@ -3235,7 +3235,7 @@ void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) { "vmov.u8 d28, #24 \n" // BB coefficient "vmov.u8 d29, #98 \n" // BG coefficient "vmov.u8 d30, #50 \n" // BR coefficient - "1: \n" + "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels. "subs %1, %1, #8 \n" // 8 processed per loop. "vmull.u8 q2, d0, d20 \n" // B to Sepia B @@ -3271,7 +3271,7 @@ void ARGBColorMatrixRow_NEON(const uint8_t* src_argb, "vmovl.s8 q0, d4 \n" // B,G coefficients s16. "vmovl.s8 q1, d5 \n" // R,A coefficients s16. - "1: \n" + "1: \n" "vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels. "subs %2, %2, #8 \n" // 8 processed per loop. "vmovl.u8 q8, d16 \n" // b (0 .. 
255) 16 bit @@ -3327,7 +3327,7 @@ void ARGBMultiplyRow_NEON(const uint8_t* src_argb, int width) { asm volatile( // 8 pixel loop. - "1: \n" + "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB "subs %3, %3, #8 \n" // 8 processed per loop. @@ -3356,7 +3356,7 @@ void ARGBAddRow_NEON(const uint8_t* src_argb, int width) { asm volatile( // 8 pixel loop. - "1: \n" + "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB "subs %3, %3, #8 \n" // 8 processed per loop. @@ -3379,7 +3379,7 @@ void ARGBSubtractRow_NEON(const uint8_t* src_argb, int width) { asm volatile( // 8 pixel loop. - "1: \n" + "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB "subs %3, %3, #8 \n" // 8 processed per loop. @@ -3407,7 +3407,7 @@ void SobelRow_NEON(const uint8_t* src_sobelx, asm volatile( "vmov.u8 d3, #255 \n" // alpha // 8 pixel loop. - "1: \n" + "1: \n" "vld1.8 {d0}, [%0]! \n" // load 8 sobelx. "vld1.8 {d1}, [%1]! \n" // load 8 sobely. "subs %3, %3, #8 \n" // 8 processed per loop. @@ -3431,7 +3431,7 @@ void SobelToPlaneRow_NEON(const uint8_t* src_sobelx, int width) { asm volatile( // 16 pixel loop. - "1: \n" + "1: \n" "vld1.8 {q0}, [%0]! \n" // load 16 sobelx. "vld1.8 {q1}, [%1]! \n" // load 16 sobely. "subs %3, %3, #16 \n" // 16 processed per loop. @@ -3458,7 +3458,7 @@ void SobelXYRow_NEON(const uint8_t* src_sobelx, asm volatile( "vmov.u8 d3, #255 \n" // alpha // 8 pixel loop. - "1: \n" + "1: \n" "vld1.8 {d2}, [%0]! \n" // load 8 sobelx. "vld1.8 {d0}, [%1]! \n" // load 8 sobely. "subs %3, %3, #8 \n" // 8 processed per loop. @@ -3483,7 +3483,7 @@ void SobelXRow_NEON(const uint8_t* src_y0, uint8_t* dst_sobelx, int width) { asm volatile( - "1: \n" + "1: \n" "vld1.8 {d0}, [%0],%5 \n" // top "vld1.8 {d1}, [%0],%6 \n" "subs %4, %4, #8 \n" // 8 pixels @@ -3521,7 +3521,7 @@ void SobelYRow_NEON(const uint8_t* src_y0, uint8_t* dst_sobely, int width) { asm volatile( - "1: \n" + "1: \n" "vld1.8 {d0}, [%0],%4 \n" // left "vld1.8 {d1}, [%1],%4 \n" "subs %3, %3, #8 \n" // 8 pixels @@ -3559,7 +3559,7 @@ void HalfFloatRow_NEON(const uint16_t* src, int width) { asm volatile( - "1: \n" + "1: \n" "vld1.16 {q0, q1}, [%0]! \n" // load 16 shorts "subs %2, %2, #16 \n" // 16 pixels per loop "vmovl.u16 q8, d0 \n" @@ -3593,7 +3593,7 @@ void ByteToFloatRow_NEON(const uint8_t* src, int width) { asm volatile( - "1: \n" + "1: \n" "vld1.8 {d2}, [%0]! \n" // load 8 bytes "subs %2, %2, #8 \n" // 8 pixels per loop "vmovl.u8 q1, d2 \n" // 8 shorts @@ -3624,7 +3624,7 @@ void GaussCol_NEON(const uint16_t* src0, "vmov.u16 d6, #4 \n" // constant 4 "vmov.u16 d7, #6 \n" // constant 6 - "1: \n" + "1: \n" "vld1.16 {q1}, [%0]! \n" // load 8 samples, 5 rows "vld1.16 {q2}, [%4]! \n" "subs %6, %6, #8 \n" // 8 processed per loop @@ -3661,7 +3661,7 @@ void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) { "vmov.u32 q10, #4 \n" // constant 4 "vmov.u32 q11, #6 \n" // constant 6 - "1: \n" + "1: \n" "vld1.32 {q0, q1}, [%0]! \n" // load 12 source samples "vld1.32 {q2}, [%0] \n" "subs %5, %5, #8 \n" // 8 processed per loop @@ -3696,7 +3696,7 @@ void NV21ToYUV24Row_NEON(const uint8_t* src_y, uint8_t* dst_yuv24, int width) { asm volatile( - "1: \n" + "1: \n" "vld1.8 {q2}, [%0]! \n" // load 16 Y values "vld2.8 {d0, d2}, [%1]! 
\n" // load 8 VU values "subs %3, %3, #16 \n" // 16 pixels per loop @@ -3721,7 +3721,7 @@ void AYUVToUVRow_NEON(const uint8_t* src_ayuv, int width) { asm volatile( "add %1, %0, %1 \n" // src_stride + src_AYUV - "1: \n" + "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels. "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV // pixels. @@ -3752,7 +3752,7 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv, int width) { asm volatile( "add %1, %0, %1 \n" // src_stride + src_AYUV - "1: \n" + "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels. "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV // pixels. @@ -3781,7 +3781,7 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv, // Similar to ARGBExtractAlphaRow_NEON void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) { asm volatile( - "1: \n" + "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV pixels "subs %2, %2, #16 \n" // 16 processed per loop @@ -3797,7 +3797,7 @@ void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) { // Convert UV plane of NV12 to VU of NV21. void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) { asm volatile( - "1: \n" + "1: \n" "vld2.8 {d0, d2}, [%0]! \n" // load 16 UV values "vld2.8 {d1, d3}, [%0]! \n" "subs %2, %2, #16 \n" // 16 pixels per loop @@ -3820,7 +3820,7 @@ void HalfMergeUVRow_NEON(const uint8_t* src_u, const uint8_t* src_u_1 = src_u + src_stride_u; const uint8_t* src_v_1 = src_v + src_stride_v; asm volatile( - "1: \n" + "1: \n" "vld1.8 {q0}, [%0]! \n" // load 16 U values "vld1.8 {q1}, [%2]! \n" // load 16 V values "vld1.8 {q2}, [%1]! \n" @@ -3852,7 +3852,7 @@ void SplitUVRow_16_NEON(const uint16_t* src_uv, int shift = depth - 16; // Negative for right shift. asm volatile( "vdup.16 q2, %4 \n" - "1: \n" + "1: \n" "vld2.16 {q0, q1}, [%0]! \n" // load 8 UV "subs %3, %3, #8 \n" // 8 src pixels per loop "vshl.u16 q0, q0, q2 \n" @@ -3876,7 +3876,7 @@ void MergeUVRow_16_NEON(const uint16_t* src_u, int shift = 16 - depth; asm volatile( "vdup.16 q2, %4 \n" - "1: \n" + "1: \n" "vld1.16 {q0}, [%0]! \n" // load 8 U "vld1.16 {q1}, [%1]! \n" // load 8 V "subs %3, %3, #8 \n" // 8 src pixels per loop @@ -3898,7 +3898,7 @@ void MultiplyRow_16_NEON(const uint16_t* src_y, int width) { asm volatile( "vdup.16 q2, %3 \n" - "1: \n" + "1: \n" "vld1.16 {q0}, [%0]! \n" "vld1.16 {q1}, [%0]! \n" "subs %2, %2, #16 \n" // 16 src pixels per loop @@ -3920,7 +3920,7 @@ void DivideRow_16_NEON(const uint16_t* src_y, int width) { asm volatile( "vdup.16 d8, %3 \n" - "1: \n" + "1: \n" "vld1.16 {q2, q3}, [%0]! \n" "subs %2, %2, #16 \n" // 16 src pixels per loop "vmull.u16 q0, d4, d8 \n" @@ -3952,7 +3952,7 @@ void Convert16To8Row_NEON(const uint16_t* src_y, int shift = 15 - __builtin_clz((int32_t)scale); // Negative shl is shr asm volatile( "vdup.16 q2, %3 \n" - "1: \n" + "1: \n" "vld1.16 {q0}, [%0]! \n" "vld1.16 {q1}, [%0]! \n" "subs %2, %2, #16 \n" // 16 src pixels per loop @@ -3981,7 +3981,7 @@ void Convert8To8Row_NEON(const uint8_t* src_y, asm volatile( "vdup.8 d8, %3 \n" "vdup.8 q5, %4 \n" - "1: \n" + "1: \n" "vld1.8 {q2, q3}, [%0]! 
\n" "subs %2, %2, #32 \n" // 32 src pixels per loop "vmull.u8 q0, d4, d8 \n" diff --git a/source/row_neon64.cc b/source/row_neon64.cc index 71e132876..8ec539b4e 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -242,7 +242,7 @@ void I444ToARGBRow_NEON(const uint8_t* src_y, asm volatile( YUVTORGB_SETUP "movi v19.8b, #255 \n" /* A */ - "1: \n" READYUV444 + "1: \n" READYUV444 "subs %w[width], %w[width], #8 \n" I4XXTORGB RGBTORGB8 "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" "b.gt 1b \n" @@ -264,7 +264,7 @@ void I444ToRGB24Row_NEON(const uint8_t* src_y, int width) { asm volatile( YUVTORGB_SETUP - "1: \n" READYUV444 + "1: \n" READYUV444 "subs %w[width], %w[width], #8 \n" I4XXTORGB RGBTORGB8 "st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n" "b.gt 1b \n" @@ -291,7 +291,7 @@ void I210ToAR30Row_NEON(const uint16_t* src_y, asm volatile(YUVTORGB_SETUP "dup v22.8h, %w[limit] \n" "dup v23.8h, %w[alpha] \n" - "1: \n" READYUV210 + "1: \n" READYUV210 "subs %w[width], %w[width], #8 \n" NVTORGB STOREAR30 "b.gt 1b \n" : [src_y] "+r"(src_y), // %[src_y] @@ -319,7 +319,7 @@ void I410ToAR30Row_NEON(const uint16_t* src_y, asm volatile(YUVTORGB_SETUP "dup v22.8h, %w[limit] \n" "dup v23.8h, %w[alpha] \n" - "1: \n" READYUV410 + "1: \n" READYUV410 "subs %w[width], %w[width], #8 \n" NVTORGB STOREAR30 "b.gt 1b \n" : [src_y] "+r"(src_y), // %[src_y] @@ -346,7 +346,7 @@ void I212ToAR30Row_NEON(const uint16_t* src_y, asm volatile(YUVTORGB_SETUP "dup v22.8h, %w[limit] \n" "movi v23.8h, #0xc0, lsl #8 \n" // A - "1: \n" READYUV212 + "1: \n" READYUV212 "subs %w[width], %w[width], #8 \n" NVTORGB STOREAR30 "b.gt 1b \n" : [src_y] "+r"(src_y), // %[src_y] @@ -369,7 +369,7 @@ void I210ToARGBRow_NEON(const uint16_t* src_y, asm volatile( YUVTORGB_SETUP "movi v19.8b, #255 \n" - "1: \n" READYUV210 + "1: \n" READYUV210 "subs %w[width], %w[width], #8 \n" NVTORGB RGBTORGB8 "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" "b.gt 1b \n" @@ -392,7 +392,7 @@ void I410ToARGBRow_NEON(const uint16_t* src_y, asm volatile( YUVTORGB_SETUP "movi v19.8b, #255 \n" - "1: \n" READYUV410 + "1: \n" READYUV410 "subs %w[width], %w[width], #8 \n" NVTORGB RGBTORGB8 "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" "b.gt 1b \n" @@ -417,7 +417,7 @@ void I212ToARGBRow_NEON(const uint16_t* src_y, asm volatile( YUVTORGB_SETUP "movi v19.8b, #255 \n" - "1: \n" READYUV212 + "1: \n" READYUV212 "subs %w[width], %w[width], #8 \n" NVTORGB RGBTORGB8 "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" "b.gt 1b \n" @@ -440,7 +440,7 @@ void I422ToARGBRow_NEON(const uint8_t* src_y, asm volatile( YUVTORGB_SETUP "movi v19.8b, #255 \n" /* A */ - "1: \n" READYUV422 + "1: \n" READYUV422 "subs %w[width], %w[width], #8 \n" I4XXTORGB RGBTORGB8 "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" "b.gt 1b \n" @@ -468,7 +468,7 @@ void P210ToARGBRow_NEON(const uint16_t* src_y, YUVTORGB_SETUP "movi v19.8b, #255 \n" "ldr q2, [%[kIndices]] \n" - "1: \n" // + "1: \n" // READYUVP210 "subs %w[width], %w[width], #8 \n" NVTORGB RGBTORGB8 "st4 {v16.8b, v17.8b, v18.8b, v19.8b}, [%[dst_argb]], #32 \n" @@ -497,7 +497,7 @@ void P410ToARGBRow_NEON(const uint16_t* src_y, YUVTORGB_SETUP "movi v19.8b, #255 \n" "ldr q2, [%[kIndices]] \n" - "1: \n" // + "1: \n" // READYUVP410 "subs %w[width], %w[width], #8 \n" NVTORGB RGBTORGB8 "st4 {v16.8b, v17.8b, v18.8b, v19.8b}, [%[dst_argb]], #32 \n" @@ -524,7 +524,7 @@ void P210ToAR30Row_NEON(const uint16_t* src_y, "dup v22.8h, %w[limit] \n" "movi v23.8h, #0xc0, lsl #8 \n" // A "ldr q2, [%[kIndices]] 
\n" - "1: \n" READYUVP210 + "1: \n" READYUVP210 "subs %w[width], %w[width], #8 \n" NVTORGB STOREAR30 "b.gt 1b \n" : [src_y] "+r"(src_y), // %[src_y] @@ -550,7 +550,7 @@ void P410ToAR30Row_NEON(const uint16_t* src_y, "dup v22.8h, %w[limit] \n" "movi v23.8h, #0xc0, lsl #8 \n" // A "ldr q2, [%[kIndices]] \n" - "1: \n" READYUVP410 + "1: \n" READYUVP410 "subs %w[width], %w[width], #8 \n" NVTORGB STOREAR30 "b.gt 1b \n" : [src_y] "+r"(src_y), // %[src_y] @@ -577,7 +577,7 @@ void I422ToAR30Row_NEON(const uint8_t* src_y, YUVTORGB_SETUP "dup v22.8h, %w[limit] \n" "movi v23.8h, #0xc0, lsl #8 \n" // A - "1: \n" READYUV422 + "1: \n" READYUV422 "subs %w[width], %w[width], #8 \n" I4XXTORGB STOREAR30 "b.gt 1b \n" : [src_y] "+r"(src_y), // %[src_y] @@ -600,7 +600,7 @@ void I444AlphaToARGBRow_NEON(const uint8_t* src_y, int width) { asm volatile( YUVTORGB_SETUP - "1: \n" + "1: \n" "ld1 {v19.8b}, [%[src_a]], #8 \n" READYUV444 "subs %w[width], %w[width], #8 \n" "prfm pldl1keep, [%[src_a], 448] \n" I4XXTORGB RGBTORGB8 @@ -626,7 +626,7 @@ void I410AlphaToARGBRow_NEON(const uint16_t* src_y, int width) { asm volatile( YUVTORGB_SETUP - "1: \n" + "1: \n" "ld1 {v19.16b}, [%[src_a]], #16 \n" READYUV410 "subs %w[width], %w[width], #8 \n" "uqshrn v19.8b, v19.8h, #2 \n" NVTORGB RGBTORGB8 @@ -652,7 +652,7 @@ void I210AlphaToARGBRow_NEON(const uint16_t* src_y, int width) { asm volatile( YUVTORGB_SETUP - "1: \n" + "1: \n" "ld1 {v19.16b}, [%[src_a]], #16 \n" READYUV210 "subs %w[width], %w[width], #8 \n" "uqshrn v19.8b, v19.8h, #2 \n" NVTORGB RGBTORGB8 @@ -678,7 +678,7 @@ void I422AlphaToARGBRow_NEON(const uint8_t* src_y, int width) { asm volatile( YUVTORGB_SETUP - "1: \n" + "1: \n" "ld1 {v19.8b}, [%[src_a]], #8 \n" READYUV422 "subs %w[width], %w[width], #8 \n" "prfm pldl1keep, [%[src_a], 448] \n" I4XXTORGB RGBTORGB8 @@ -704,7 +704,7 @@ void I422ToRGBARow_NEON(const uint8_t* src_y, asm volatile( YUVTORGB_SETUP "movi v15.8b, #255 \n" /* A */ - "1: \n" READYUV422 + "1: \n" READYUV422 "subs %w[width], %w[width], #8 \n" I4XXTORGB RGBTORGB8 "st4 {v15.8b,v16.8b,v17.8b,v18.8b}, [%[dst_rgba]], #32 \n" "b.gt 1b \n" @@ -726,7 +726,7 @@ void I422ToRGB24Row_NEON(const uint8_t* src_y, int width) { asm volatile( YUVTORGB_SETUP - "1: \n" READYUV422 + "1: \n" READYUV422 "subs %w[width], %w[width], #8 \n" I4XXTORGB RGBTORGB8 "st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n" "b.gt 1b \n" @@ -767,7 +767,7 @@ void I422ToRGB565Row_NEON(const uint8_t* src_y, int width) { asm volatile( YUVTORGB_SETUP - "1: \n" READYUV422 + "1: \n" READYUV422 "subs %w[width], %w[width], #8 \n" I4XXTORGB RGBTORGB8_TOP ARGBTORGB565_FROM_TOP "st1 {v18.8h}, [%[dst_rgb565]], #16 \n" // store 8 pixels RGB565. 
@@ -808,7 +808,7 @@ void I422ToARGB1555Row_NEON(const uint8_t* src_y, asm volatile( YUVTORGB_SETUP "movi v19.8h, #0x80, lsl #8 \n" - "1: \n" // + "1: \n" // READYUV422 "subs %w[width], %w[width], #8 \n" // I4XXTORGB RGBTORGB8_TOP ARGBTOARGB1555_FROM_TOP "st1 {v19.8h}, [%[dst_argb1555]], #16 \n" // store 8 pixels @@ -838,7 +838,7 @@ void I422ToARGB4444Row_NEON(const uint8_t* src_y, int width) { asm volatile( YUVTORGB_SETUP - "1: \n" READYUV422 + "1: \n" READYUV422 "subs %w[width], %w[width], #8 \n" I4XXTORGB RGBTORGB8 "movi v19.8b, #255 \n" ARGBTOARGB4444 "st1 {v0.8h}, [%[dst_argb4444]], #16 \n" // store 8 @@ -867,7 +867,7 @@ void I400ToARGBRow_NEON(const uint8_t* src_y, "umlal2 v6.8h, v1.16b, v31.16b \n" /* DG */ "umull v4.8h, v1.8b, v28.8b \n" /* DB */ "umull2 v5.8h, v1.16b, v29.16b \n" /* DR */ - "1: \n" READYUV400 I400TORGB + "1: \n" READYUV400 I400TORGB "subs %w[width], %w[width], #8 \n" RGBTORGB8 "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" "b.gt 1b \n" @@ -883,7 +883,7 @@ void I400ToARGBRow_NEON(const uint8_t* src_y, void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { asm volatile( "movi v23.8b, #255 \n" - "1: \n" + "1: \n" "ld1 {v20.8b}, [%0], #8 \n" "subs %w2, %w2, #8 \n" "prfm pldl1keep, [%0, 448] \n" @@ -901,7 +901,7 @@ void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { asm volatile( "movi v20.8b, #255 \n" - "1: \n" + "1: \n" "ldr d16, [%0], #8 \n" "subs %w2, %w2, #8 \n" "zip1 v18.16b, v16.16b, v16.16b \n" // YY @@ -928,7 +928,7 @@ void NV12ToARGBRow_NEON(const uint8_t* src_y, YUVTORGB_SETUP "movi v19.8b, #255 \n" "ldr q2, [%[kNV12Table]] \n" - "1: \n" READNV12 + "1: \n" READNV12 "subs %w[width], %w[width], #8 \n" NVTORGB RGBTORGB8 "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" "b.gt 1b \n" @@ -951,7 +951,7 @@ void NV21ToARGBRow_NEON(const uint8_t* src_y, YUVTORGB_SETUP "movi v19.8b, #255 \n" "ldr q2, [%[kNV12Table]] \n" - "1: \n" READNV12 + "1: \n" READNV12 "subs %w[width], %w[width], #8 \n" NVTORGB RGBTORGB8 "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" "b.gt 1b \n" @@ -973,7 +973,7 @@ void NV12ToRGB24Row_NEON(const uint8_t* src_y, asm volatile( YUVTORGB_SETUP "ldr q2, [%[kNV12Table]] \n" - "1: \n" READNV12 + "1: \n" READNV12 "subs %w[width], %w[width], #8 \n" NVTORGB RGBTORGB8 "st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n" "b.gt 1b \n" @@ -995,7 +995,7 @@ void NV21ToRGB24Row_NEON(const uint8_t* src_y, asm volatile( YUVTORGB_SETUP "ldr q2, [%[kNV12Table]] \n" - "1: \n" READNV12 + "1: \n" READNV12 "subs %w[width], %w[width], #8 \n" NVTORGB RGBTORGB8 "st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n" "b.gt 1b \n" @@ -1017,7 +1017,7 @@ void NV12ToRGB565Row_NEON(const uint8_t* src_y, asm volatile( YUVTORGB_SETUP "ldr q2, [%[kNV12Table]] \n" - "1: \n" READNV12 + "1: \n" READNV12 "subs %w[width], %w[width], #8 \n" NVTORGB RGBTORGB8_TOP ARGBTORGB565_FROM_TOP "st1 {v18.8h}, [%[dst_rgb565]], #16 \n" // store 8 @@ -1042,7 +1042,7 @@ void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2, YUVTORGB_SETUP "movi v19.8b, #255 \n" "ldr q2, [%[kNV21InterleavedTable]] \n" - "1: \n" READYUY2 + "1: \n" READYUY2 "subs %w[width], %w[width], #8 \n" NVTORGB RGBTORGB8 "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" "b.gt 1b \n" @@ -1063,7 +1063,7 @@ void UYVYToARGBRow_NEON(const uint8_t* src_uyvy, YUVTORGB_SETUP "movi v19.8b, #255 \n" "ldr q2, [%[kNV12InterleavedTable]] \n" - "1: \n" READUYVY + "1: \n" READUYVY "subs 
%w[width], %w[width], #8 \n" NVTORGB RGBTORGB8 "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" "b.gt 1b \n" @@ -1082,7 +1082,7 @@ void SplitUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_v, int width) { asm volatile( - "1: \n" + "1: \n" "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV "subs %w3, %w3, #16 \n" // 16 processed per loop "prfm pldl1keep, [%0, 448] \n" @@ -1107,7 +1107,7 @@ void DetileRow_NEON(const uint8_t* src, uint8_t* dst, int width) { asm volatile( - "1: \n" + "1: \n" "ld1 {v0.16b}, [%0], %3 \n" // load 16 bytes "subs %w2, %w2, #16 \n" // 16 processed per loop "prfm pldl1keep, [%0, 1792] \n" // 7 tiles of 256b ahead @@ -1127,7 +1127,7 @@ void DetileRow_16_NEON(const uint16_t* src, uint16_t* dst, int width) { asm volatile( - "1: \n" + "1: \n" "ld1 {v0.8h,v1.8h}, [%0], %3 \n" // load 16 pixels "subs %w2, %w2, #16 \n" // 16 processed per loop "prfm pldl1keep, [%0, 3584] \n" // 7 tiles of 512b ahead @@ -1148,7 +1148,7 @@ void DetileSplitUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_v, int width) { asm volatile( - "1: \n" + "1: \n" "ld2 {v0.8b,v1.8b}, [%0], %4 \n" "subs %w3, %w3, #16 \n" "prfm pldl1keep, [%0, 1792] \n" @@ -1173,7 +1173,7 @@ void DetileToYUY2_NEON(const uint8_t* src_y, uint8_t* dst_yuy2, int width) { asm volatile( - "1: \n" + "1: \n" "ld1 {v0.16b}, [%0], %4 \n" // load 16 Ys "subs %w3, %w3, #16 \n" // store 8 YUY2 "prfm pldl1keep, [%0, 1792] \n" @@ -1199,7 +1199,7 @@ void DetileToYUY2_NEON(const uint8_t* src_y, uint8_t* dst_yuy2, int width) { asm volatile( - "1: \n" + "1: \n" "ld1 {v0.16b}, [%0], %4 \n" // load 16 Ys "ld1 {v1.16b}, [%1], %5 \n" // load 8 UVs "subs %w3, %w3, #16 \n" @@ -1224,7 +1224,7 @@ void DetileToYUY2_NEON(const uint8_t* src_y, // tinyurl.com/mtk-10bit-video-format for format documentation. 
void UnpackMT2T_NEON(const uint8_t* src, uint16_t* dst, size_t size) { asm volatile( - "1: \n" + "1: \n" "ld1 {v7.16b}, [%0], #16 \n" "ld1 {v0.16b-v3.16b}, [%0], #64 \n" "subs %2, %2, #80 \n" @@ -1265,7 +1265,7 @@ void MergeUVRow_NEON(const uint8_t* src_u, uint8_t* dst_uv, int width) { asm volatile( - "1: \n" + "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load U "ld1 {v1.16b}, [%1], #16 \n" // load V "subs %w3, %w3, #16 \n" // 16 processed per loop @@ -1290,7 +1290,7 @@ void MergeUVRow_16_NEON(const uint16_t* src_u, int shift = 16 - depth; asm volatile( "dup v2.8h, %w4 \n" - "1: \n" + "1: \n" "ld1 {v0.8h}, [%0], #16 \n" // load 8 U "ld1 {v1.8h}, [%1], #16 \n" // load 8 V "subs %w3, %w3, #8 \n" // 8 src pixels per loop @@ -1314,7 +1314,7 @@ void MergeUVRow_NEON(const uint8_t* src_u, uint8_t* dst_uv, int width) { asm volatile( - "1: \n" + "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load U "ld1 {v1.16b}, [%1], #16 \n" // load V "subs %w3, %w3, #16 \n" // 16 processed per loop @@ -1341,7 +1341,7 @@ void MergeUVRow_16_NEON(const uint16_t* src_u, int shift = 16 - depth; asm volatile( "dup v4.8h, %w4 \n" - "1: \n" + "1: \n" "ld1 {v0.8h}, [%0], #16 \n" // load 8 U "ld1 {v1.8h}, [%1], #16 \n" // load 8 V "subs %w3, %w3, #8 \n" // 8 src pixels per loop @@ -1369,7 +1369,7 @@ void SplitRGBRow_NEON(const uint8_t* src_rgb, uint8_t* dst_b, int width) { asm volatile( - "1: \n" + "1: \n" "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RGB "subs %w4, %w4, #16 \n" // 16 processed per loop "prfm pldl1keep, [%0, 448] \n" @@ -1394,7 +1394,7 @@ void MergeRGBRow_NEON(const uint8_t* src_r, uint8_t* dst_rgb, int width) { asm volatile( - "1: \n" + "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load R "ld1 {v1.16b}, [%1], #16 \n" // load G "ld1 {v2.16b}, [%2], #16 \n" // load B @@ -1422,7 +1422,7 @@ void SplitARGBRow_NEON(const uint8_t* src_rgba, uint8_t* dst_a, int width) { asm volatile( - "1: \n" + "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB "subs %w5, %w5, #16 \n" // 16 processed per loop "prfm pldl1keep, [%0, 448] \n" @@ -1451,7 +1451,7 @@ void MergeARGBRow_NEON(const uint8_t* src_r, uint8_t* dst_argb, int width) { asm volatile( - "1: \n" + "1: \n" "ld1 {v0.16b}, [%2], #16 \n" // load B "ld1 {v1.16b}, [%1], #16 \n" // load G "ld1 {v2.16b}, [%0], #16 \n" // load R @@ -1482,7 +1482,7 @@ void MergeARGBRow_NEON(const uint8_t* src_r, uint8_t* dst_argb, int width) { asm volatile( - "1: \n" + "1: \n" "ld1 {v0.16b}, [%2], #16 \n" // load B "ld1 {v1.16b}, [%1], #16 \n" // load G "ld1 {v2.16b}, [%0], #16 \n" // load R @@ -1522,7 +1522,7 @@ void SplitXRGBRow_NEON(const uint8_t* src_rgba, uint8_t* dst_b, int width) { asm volatile( - "1: \n" + "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB "subs %w4, %w4, #16 \n" // 16 processed per loop "prfm pldl1keep, [%0, 448] \n" @@ -1548,7 +1548,7 @@ void MergeXRGBRow_NEON(const uint8_t* src_r, int width) { asm volatile( "movi v3.16b, #255 \n" // load A(255) - "1: \n" + "1: \n" "ld1 {v2.16b}, [%0], #16 \n" // load R "ld1 {v1.16b}, [%1], #16 \n" // load G "ld1 {v0.16b}, [%2], #16 \n" // load B @@ -1579,7 +1579,7 @@ void MergeXR30Row_NEON(const uint16_t* src_r, "movi v30.16b, #255 \n" "ushr v30.4s, v30.4s, #22 \n" // 1023 "dup v31.4s, %w5 \n" - "1: \n" + "1: \n" "ldr d2, [%2], #8 \n" // B "ldr d1, [%1], #8 \n" // G "ldr d0, [%0], #8 \n" // R @@ -1620,7 +1620,7 @@ void MergeXR30Row_10_NEON(const uint16_t* src_r, "dup v5.8h, %w[limit] \n" "movi v6.8h, #16 \n" // 1 << 4 "movi v7.8h, #4, lsl #8 \n" // 1 << 10 - "1: \n" + "1: \n" "ldr q0, [%0], 
#16 \n" // xxxxxxRrrrrrrrrr "ldr q1, [%1], #16 \n" // xxxxxxGggggggggg "ldr q2, [%2], #16 \n" // xxxxxxBbbbbbbbbb @@ -1656,7 +1656,7 @@ void MergeAR64Row_NEON(const uint16_t* src_r, "dup v30.8h, %w7 \n" "dup v31.8h, %w6 \n" - "1: \n" + "1: \n" "ldr q2, [%0], #16 \n" // R "ldr q1, [%1], #16 \n" // G "ldr q0, [%2], #16 \n" // B @@ -1701,7 +1701,7 @@ void MergeXR64Row_NEON(const uint16_t* src_r, "dup v30.8h, %w6 \n" "dup v31.8h, %w5 \n" - "1: \n" + "1: \n" "ldr q2, [%0], #16 \n" // R "ldr q1, [%1], #16 \n" // G "ldr q0, [%2], #16 \n" // B @@ -1738,7 +1738,7 @@ void MergeARGB16To8Row_NEON(const uint16_t* src_r, int shift = 16 - depth; asm volatile( "dup v31.8h, %w6 \n" - "1: \n" + "1: \n" "ldr q0, [%0], #16 \n" // B "ldr q1, [%1], #16 \n" // G "ldr q2, [%2], #16 \n" // R @@ -1777,7 +1777,7 @@ void MergeXRGB16To8Row_NEON(const uint16_t* src_r, asm volatile( "dup v31.8h, %w5 \n" "movi v3.16b, #0xff \n" // A (0xff) - "1: \n" + "1: \n" "ldr q0, [%0], #16 \n" // B "ldr q1, [%1], #16 \n" // G "ldr q2, [%2], #16 \n" // R @@ -1804,7 +1804,7 @@ void MergeXRGB16To8Row_NEON(const uint16_t* src_r, // Copy multiple of 32. void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) { asm volatile( - "1: \n" + "1: \n" "ldp q0, q1, [%0], #32 \n" "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #32 \n" // 32 processed per loop @@ -1822,7 +1822,7 @@ void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) { void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) { asm volatile( "dup v0.16b, %w2 \n" // duplicate 16 bytes - "1: \n" + "1: \n" "subs %w1, %w1, #16 \n" // 16 bytes per loop "st1 {v0.16b}, [%0], #16 \n" // store "b.gt 1b \n" @@ -1835,7 +1835,7 @@ void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) { void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) { asm volatile( "dup v0.4s, %w2 \n" // duplicate 4 ints - "1: \n" + "1: \n" "subs %w1, %w1, #4 \n" // 4 ints per loop "st1 {v0.16b}, [%0], #16 \n" // store "b.gt 1b \n" @@ -1855,7 +1855,7 @@ void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) { "ld1 {v3.16b}, [%3] \n" // shuffler "add %0, %0, %w2, sxtw \n" "sub %0, %0, #32 \n" - "1: \n" + "1: \n" "ldr q2, [%0, 16] \n" "ldr q1, [%0], -32 \n" // src -= 32 "subs %w2, %w2, #32 \n" // 32 pixels per loop. @@ -1880,7 +1880,7 @@ void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width) { "ld1 {v4.16b}, [%3] \n" // shuffler "add %0, %0, %w2, sxtw #1 \n" "sub %0, %0, #32 \n" - "1: \n" + "1: \n" "ldr q1, [%0, 16] \n" "ldr q0, [%0], -32 \n" // src -= 32 "subs %w2, %w2, #16 \n" // 16 pixels per loop. @@ -1904,7 +1904,7 @@ void MirrorSplitUVRow_NEON(const uint8_t* src_uv, "ld1 {v4.16b}, [%4] \n" // shuffler "add %0, %0, %w3, sxtw #1 \n" "sub %0, %0, #32 \n" - "1: \n" + "1: \n" "ldr q1, [%0, 16] \n" "ldr q0, [%0], -32 \n" // src -= 32 "subs %w3, %w3, #16 \n" // 16 pixels per loop. @@ -1933,7 +1933,7 @@ void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { "ld1 {v4.16b}, [%3] \n" // shuffler "add %0, %0, %w2, sxtw #2 \n" "sub %0, %0, #32 \n" - "1: \n" + "1: \n" "ldr q1, [%0, 16] \n" "ldr q0, [%0], -32 \n" // src -= 32 "subs %w2, %w2, #8 \n" // 8 pixels per loop. @@ -1957,7 +1957,7 @@ void RGB24MirrorRow_NEON(const uint8_t* src_rgb24, "add %0, %0, %w2, sxtw \n" "sub %0, %0, #48 \n" - "1: \n" + "1: \n" "ld3 {v0.16b, v1.16b, v2.16b}, [%0], %3 \n" // src -= 48 "subs %w2, %w2, #16 \n" // 16 pixels per loop. 
"tbl v0.16b, {v0.16b}, v3.16b \n" @@ -1978,7 +1978,7 @@ void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, int width) { asm volatile( "movi v4.8b, #255 \n" // Alpha - "1: \n" + "1: \n" "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of // RGB24. "subs %w2, %w2, #8 \n" // 8 processed per loop. @@ -1996,7 +1996,7 @@ void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) { asm volatile( "movi v5.8b, #255 \n" // Alpha - "1: \n" + "1: \n" "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b "subs %w2, %w2, #8 \n" // 8 processed per loop. "mov v3.8b, v1.8b \n" // move g @@ -2015,7 +2015,7 @@ void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) { void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { asm volatile( "movi v0.8b, #255 \n" // Alpha - "1: \n" + "1: \n" "ld3 {v3.8b,v4.8b,v5.8b}, [%0], #24 \n" // read r g b "subs %w2, %w2, #8 \n" // 8 processed per loop. "mov v2.8b, v4.8b \n" // move g @@ -2033,7 +2033,7 @@ void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { asm volatile( - "1: \n" + "1: \n" "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b "subs %w2, %w2, #8 \n" // 8 processed per loop. "mov v3.8b, v1.8b \n" // move g @@ -2065,7 +2065,7 @@ void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, int width) { asm volatile( "movi v3.16b, #255 \n" // Alpha - "1: \n" + "1: \n" "ldp q0, q4, [%0], #32 \n" // load 16 RGB565 pixels "subs %w2, %w2, #16 \n" // 16 processed per loop "prfm pldl1keep, [%0, 448] \n" RGB565TOARGB @@ -2112,7 +2112,7 @@ void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555, uint8_t* dst_argb, int width) { asm volatile( - "1: \n" + "1: \n" "ldp q0, q4, [%0], #32 \n" // load 16 ARGB1555 pixels "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #16 \n" // 16 processed per loop @@ -2148,7 +2148,7 @@ void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444, uint8_t* dst_argb, int width) { asm volatile( - "1: \n" + "1: \n" "ld1 {v1.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. "subs %w2, %w2, #8 \n" // 8 processed per loop. "prfm pldl1keep, [%0, 448] \n" ARGB4444TOARGB @@ -2180,7 +2180,7 @@ static void ABCDToAR30Row_NEON(const uint8_t* src_abcd, "movi v2.4s, #0xf, msl 16 \n" // 0xfffff "ldr q3, [%[kAR30Row_BoxShifts]] \n" "ldp q4, q5, [%[indices]] \n" - "1: \n" + "1: \n" "ldp q0, q20, [%[src]], #32 \n" "subs %w[width], %w[width], #8 \n" "tbl v1.16b, {v0.16b}, v5.16b \n" @@ -2219,7 +2219,7 @@ void ARGBToRGB24Row_NEON(const uint8_t* src_argb, uint8_t* dst_rgb24, int width) { asm volatile( - "1: \n" + "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB "subs %w2, %w2, #16 \n" // 16 pixels per loop. "prfm pldl1keep, [%0, 448] \n" @@ -2235,7 +2235,7 @@ void ARGBToRGB24Row_NEON(const uint8_t* src_argb, void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) { asm volatile( - "1: \n" + "1: \n" "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a "subs %w2, %w2, #8 \n" // 8 processed per loop. "mov v4.8b, v2.8b \n" // mov g @@ -2253,7 +2253,7 @@ void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) { void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { asm volatile( - "1: \n" + "1: \n" "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2. "subs %w2, %w2, #16 \n" // 16 processed per loop. 
"prfm pldl1keep, [%0, 448] \n" @@ -2269,7 +2269,7 @@ void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { asm volatile( - "1: \n" + "1: \n" "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY. "subs %w2, %w2, #16 \n" // 16 processed per loop. "prfm pldl1keep, [%0, 448] \n" @@ -2288,7 +2288,7 @@ void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2, uint8_t* dst_v, int width) { asm volatile( - "1: \n" + "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. "prfm pldl1keep, [%0, 448] \n" @@ -2309,7 +2309,7 @@ void UYVYToUV422Row_NEON(const uint8_t* src_uyvy, uint8_t* dst_v, int width) { asm volatile( - "1: \n" + "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. "prfm pldl1keep, [%0, 448] \n" @@ -2332,7 +2332,7 @@ void YUY2ToUVRow_NEON(const uint8_t* src_yuy2, int width) { const uint8_t* src_yuy2b = src_yuy2 + stride_yuy2; asm volatile( - "1: \n" + "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row @@ -2360,7 +2360,7 @@ void UYVYToUVRow_NEON(const uint8_t* src_uyvy, int width) { const uint8_t* src_uyvyb = src_uyvy + stride_uyvy; asm volatile( - "1: \n" + "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row @@ -2387,7 +2387,7 @@ void YUY2ToNVUVRow_NEON(const uint8_t* src_yuy2, int width) { const uint8_t* src_yuy2b = src_yuy2 + stride_yuy2; asm volatile( - "1: \n" + "1: \n" "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. "ld2 {v2.16b,v3.16b}, [%1], #32 \n" // load next row @@ -2411,7 +2411,7 @@ void ARGBShuffleRow_NEON(const uint8_t* src_argb, int width) { asm volatile( "ld1 {v2.16b}, [%3] \n" // shuffler - "1: \n" + "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels. "subs %w2, %w2, #4 \n" // 4 processed per loop "prfm pldl1keep, [%0, 448] \n" @@ -2432,7 +2432,7 @@ void I422ToYUY2Row_NEON(const uint8_t* src_y, uint8_t* dst_yuy2, int width) { asm volatile( - "1: \n" + "1: \n" "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys "subs %w4, %w4, #16 \n" // 16 pixels "mov v2.8b, v1.8b \n" @@ -2456,7 +2456,7 @@ void I422ToUYVYRow_NEON(const uint8_t* src_y, uint8_t* dst_uyvy, int width) { asm volatile( - "1: \n" + "1: \n" "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys "subs %w4, %w4, #16 \n" // 16 pixels "mov v3.8b, v2.8b \n" @@ -2478,7 +2478,7 @@ void ARGBToRGB565Row_NEON(const uint8_t* src_argb, uint8_t* dst_rgb565, int width) { asm volatile( - "1: \n" + "1: \n" "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 // pixels "subs %w2, %w2, #8 \n" // 8 processed per loop. @@ -2498,7 +2498,7 @@ void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, int width) { asm volatile( "dup v1.4s, %w3 \n" // dither4 - "1: \n" + "1: \n" "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 ARGB "subs %w2, %w2, #8 \n" // 8 processed per loop. "uqadd v16.8b, v16.8b, v1.8b \n" @@ -2518,7 +2518,7 @@ void ARGBToARGB1555Row_NEON(const uint8_t* src_argb, uint8_t* dst_argb1555, int width) { asm volatile( - "1: \n" + "1: \n" "ld2 {v16.8h,v17.8h}, [%0], #32 \n" // load 8 pixels "subs %w2, %w2, #8 \n" // 8 processed per loop. 
"prfm pldl1keep, [%0, 448] \n" ARGBTOARGB1555 @@ -2535,7 +2535,7 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, uint8_t* dst_argb4444, int width) { asm volatile( - "1: \n" + "1: \n" "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 // pixels "subs %w2, %w2, #8 \n" // 8 processed per loop. @@ -2554,7 +2554,7 @@ void ARGBToAR64Row_NEON(const uint8_t* src_argb, uint16_t* dst_ar64, int width) { asm volatile( - "1: \n" + "1: \n" "ldp q0, q2, [%0], #32 \n" // load 8 pixels "subs %w2, %w2, #8 \n" // 8 processed per loop. "mov v1.16b, v0.16b \n" @@ -2578,7 +2578,7 @@ void ARGBToAB64Row_NEON(const uint8_t* src_argb, int width) { asm volatile( "ldr q4, [%3] \n" // shuffler - "1: \n" + "1: \n" "ldp q0, q2, [%0], #32 \n" // load 8 pixels "subs %w2, %w2, #8 \n" // 8 processed per loop. "tbl v0.16b, {v0.16b}, v4.16b \n" @@ -2600,7 +2600,7 @@ void ARGBToAR64Row_NEON(const uint8_t* src_argb, uint16_t* dst_ar64, int width) { asm volatile( - "1: \n" + "1: \n" "ldp q0, q1, [%0], #32 \n" // load 8 ARGB pixels "subs %w2, %w2, #8 \n" // 8 processed per loop. "zip1 v2.16b, v0.16b, v0.16b \n" @@ -2626,7 +2626,7 @@ void ARGBToAB64Row_NEON(const uint8_t* src_argb, int width) { asm volatile( "ldp q6, q7, [%3] \n" // 2 shufflers - "1: \n" + "1: \n" "ldp q0, q1, [%0], #32 \n" // load 8 pixels "subs %w2, %w2, #8 \n" // 8 processed per loop. "tbl v2.16b, {v0.16b}, v6.16b \n" // ARGB to AB64 @@ -2652,7 +2652,7 @@ void AR64ToARGBRow_NEON(const uint16_t* src_ar64, int width) { asm volatile( "ldr q4, [%3] \n" // shuffler - "1: \n" + "1: \n" "ldp q0, q1, [%0], #32 \n" // load 4 pixels "ldp q2, q3, [%0], #32 \n" // load 4 pixels "subs %w2, %w2, #8 \n" // 8 processed per loop. @@ -2676,7 +2676,7 @@ void AB64ToARGBRow_NEON(const uint16_t* src_ab64, int width) { asm volatile( "ldr q4, [%3] \n" // shuffler - "1: \n" + "1: \n" "ldp q0, q1, [%0], #32 \n" // load 4 pixels "ldp q2, q3, [%0], #32 \n" // load 4 pixels "subs %w2, %w2, #8 \n" // 8 processed per loop. @@ -2696,7 +2696,7 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, uint8_t* dst_a, int width) { asm volatile( - "1: \n" + "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #16 \n" // 16 processed per loop @@ -2735,7 +2735,7 @@ static void ARGBToUV444MatrixRow_NEON( "neg v28.16b, v28.16b \n" "movi v29.16b, #0x80 \n" // 128.5 - "1: \n" + "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB "subs %w3, %w3, #8 \n" // 8 processed per loop. "umull v4.8h, v0.8b, v24.8b \n" // B @@ -2770,7 +2770,7 @@ static void ARGBToUV444MatrixRow_NEON_I8MM( const struct RgbUVConstants* rgbuvconstants) { asm("ld2r {v16.4s, v17.4s}, [%[rgbuvconstants]] \n" "movi v29.16b, #0x80 \n" // 128.5 - "1: \n" + "1: \n" "ldp q0, q1, [%[src]], #32 \n" "subs %w[width], %w[width], #8 \n" // 8 processed per loop. "movi v2.4s, #0 \n" @@ -2885,7 +2885,7 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb_1 = src_argb + src_stride_argb; asm volatile ( RGBTOUV_SETUP_REG - "1: \n" + "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. "subs %w4, %w4, #16 \n" // 16 processed per loop. "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. @@ -2931,7 +2931,7 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb, "movi v23.8h, #20 \n" // VB coeff (-0.08131) "movi v24.8h, #107 \n" // VG coeff (-0.41869) "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) - "1: \n" + "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. 
"subs %w4, %w4, #16 \n" // 16 processed per loop. "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. @@ -2976,7 +2976,7 @@ void ABGRToUVJRow_NEON(const uint8_t* src_abgr, "movi v23.8h, #20 \n" // VB coeff (-0.08131) "movi v24.8h, #107 \n" // VG coeff (-0.41869) "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) - "1: \n" + "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. "subs %w4, %w4, #16 \n" // 16 processed per loop. "uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts. @@ -3021,7 +3021,7 @@ void RGB24ToUVJRow_NEON(const uint8_t* src_rgb24, "movi v23.8h, #20 \n" // VB coeff (-0.08131) "movi v24.8h, #107 \n" // VG coeff (-0.41869) "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) - "1: \n" + "1: \n" "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels. "subs %w4, %w4, #16 \n" // 16 processed per loop. "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. @@ -3066,7 +3066,7 @@ void RAWToUVJRow_NEON(const uint8_t* src_raw, "movi v23.8h, #20 \n" // VB coeff (-0.08131) "movi v24.8h, #107 \n" // VG coeff (-0.41869) "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) - "1: \n" + "1: \n" "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels. "subs %w4, %w4, #16 \n" // 16 processed per loop. "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. @@ -3106,7 +3106,7 @@ void BGRAToUVRow_NEON(const uint8_t* src_bgra, const uint8_t* src_bgra_1 = src_bgra + src_stride_bgra; asm volatile ( RGBTOUV_SETUP_REG - "1: \n" + "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. "subs %w4, %w4, #16 \n" // 16 processed per loop. "uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts. @@ -3146,7 +3146,7 @@ void ABGRToUVRow_NEON(const uint8_t* src_abgr, const uint8_t* src_abgr_1 = src_abgr + src_stride_abgr; asm volatile ( RGBTOUV_SETUP_REG - "1: \n" + "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. "subs %w4, %w4, #16 \n" // 16 processed per loop. "uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts. @@ -3186,7 +3186,7 @@ void RGBAToUVRow_NEON(const uint8_t* src_rgba, const uint8_t* src_rgba_1 = src_rgba + src_stride_rgba; asm volatile ( RGBTOUV_SETUP_REG - "1: \n" + "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. "subs %w4, %w4, #16 \n" // 16 processed per loop. "uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts. @@ -3226,7 +3226,7 @@ void RGB24ToUVRow_NEON(const uint8_t* src_rgb24, const uint8_t* src_rgb24_1 = src_rgb24 + src_stride_rgb24; asm volatile ( RGBTOUV_SETUP_REG - "1: \n" + "1: \n" "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels. "subs %w4, %w4, #16 \n" // 16 processed per loop. "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. @@ -3266,7 +3266,7 @@ void RAWToUVRow_NEON(const uint8_t* src_raw, const uint8_t* src_raw_1 = src_raw + src_stride_raw; asm volatile ( RGBTOUV_SETUP_REG - "1: \n" + "1: \n" "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RAW pixels. "subs %w4, %w4, #16 \n" // 16 processed per loop. "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts. @@ -3307,7 +3307,7 @@ void RGB565ToUVRow_NEON(const uint8_t* src_rgb565, const uint8_t* src_rgb565_1 = src_rgb565 + src_stride_rgb565; asm volatile( RGBTOUV_SETUP_REG - "1: \n" + "1: \n" "ldp q0, q4, [%0], #32 \n" // load 16 RGB565 pixels. "subs %w4, %w4, #16 \n" // 16 processed per loop. 
RGB565TOARGB @@ -3351,7 +3351,7 @@ void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555, const uint8_t* src_argb1555_1 = src_argb1555 + src_stride_argb1555; asm volatile( RGBTOUV_SETUP_REG - "1: \n" + "1: \n" "ldp q0, q3, [%0], #32 \n" // load 16 ARGB1555 pixels. "subs %w4, %w4, #16 \n" // 16 processed per loop. RGB555TOARGB @@ -3395,7 +3395,7 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, const uint8_t* src_argb4444_1 = src_argb4444 + src_stride_argb4444; asm volatile( RGBTOUV_SETUP_REG // sets v20-v25 - "1: \n" + "1: \n" "ldp q0, q3, [%0], #32 \n" // load 16 ARGB4444 pixels. "subs %w4, %w4, #16 \n" // 16 processed per loop. ARGB4444TORGB @@ -3438,7 +3438,7 @@ void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { "movi v25.16b, #129 \n" // G * 0.5078 coefficient "movi v26.16b, #66 \n" // R * 0.2578 coefficient "movi v27.16b, #16 \n" // Add 16 constant - "1: \n" + "1: \n" "ldp q0, q4, [%0], #32 \n" // load 16 RGB565 pixels. "subs %w2, %w2, #16 \n" // 16 processed per loop. RGB565TOARGB @@ -3471,7 +3471,7 @@ void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, "movi v5.16b, #129 \n" // G * 0.5078 coefficient "movi v6.16b, #66 \n" // R * 0.2578 coefficient "movi v7.16b, #16 \n" // Add 16 constant - "1: \n" + "1: \n" "ldp q0, q3, [%0], #32 \n" // load 16 ARGB1555 // pixels. "subs %w2, %w2, #16 \n" // 16 processed per loop. @@ -3504,7 +3504,7 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, "movi v25.16b, #129 \n" // G * 0.5078 coefficient "movi v26.16b, #66 \n" // R * 0.2578 coefficient "movi v27.16b, #16 \n" // Add 16 constant - "1: \n" + "1: \n" "ldp q0, q3, [%0], #32 \n" // load 16 ARGB4444 // pixels. "subs %w2, %w2, #16 \n" // 16 processed per loop. @@ -3544,7 +3544,7 @@ static void ARGBToYMatrixRow_NEON(const uint8_t* src_argb, "dup v7.16b, v0.b[1] \n" "dup v16.16b, v0.b[2] \n" "dup v17.8h, v0.h[2] \n" - "1: \n" + "1: \n" "ld4 {v2.16b,v3.16b,v4.16b,v5.16b}, [%0], #64 \n" // load 16 // pixels. "subs %w2, %w2, #16 \n" // 16 processed per loop. @@ -3576,7 +3576,7 @@ static void ARGBToYMatrixRow_NEON_DotProd( "ldr d0, [%3] \n" // load rgbconstants "dup v16.4s, v0.s[0] \n" "dup v17.8h, v0.h[2] \n" - "1: \n" + "1: \n" "ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [%0], #64 \n" // load 16 // pixels. "subs %w2, %w2, #16 \n" // 16 processed per loop. @@ -3680,7 +3680,7 @@ static void RGBAToYMatrixRow_NEON(const uint8_t* src_rgba, "dup v7.16b, v0.b[1] \n" "dup v16.16b, v0.b[2] \n" "dup v17.8h, v0.h[2] \n" - "1: \n" + "1: \n" "ld4 {v1.16b,v2.16b,v3.16b,v4.16b}, [%0], #64 \n" // load 16 // pixels. "subs %w2, %w2, #16 \n" // 16 processed per loop. @@ -3752,7 +3752,7 @@ static void RGBToYMatrixRow_NEON(const uint8_t* src_rgb, "dup v6.16b, v0.b[1] \n" "dup v7.16b, v0.b[2] \n" "dup v16.8h, v0.h[2] \n" - "1: \n" + "1: \n" "ld3 {v2.16b,v3.16b,v4.16b}, [%0], #48 \n" // load 16 pixels. "subs %w2, %w2, #16 \n" // 16 processed per loop. "umull v0.8h, v2.8b, v5.8b \n" // B @@ -3807,7 +3807,7 @@ void InterpolateRow_NEON(uint8_t* dst_ptr, "dup v5.16b, %w4 \n" "dup v4.16b, %w5 \n" // General purpose row blend. - "1: \n" + "1: \n" "ld1 {v0.16b}, [%1], #16 \n" "ld1 {v1.16b}, [%2], #16 \n" "subs %w3, %w3, #16 \n" @@ -3824,7 +3824,7 @@ void InterpolateRow_NEON(uint8_t* dst_ptr, "b 99f \n" // Blend 50 / 50. - "50: \n" + "50: \n" "ld1 {v0.16b}, [%1], #16 \n" "ld1 {v1.16b}, [%2], #16 \n" "subs %w3, %w3, #16 \n" @@ -3836,14 +3836,14 @@ void InterpolateRow_NEON(uint8_t* dst_ptr, "b 99f \n" // Blend 100 / 0 - Copy row unchanged. 
- "100: \n" + "100: \n" "ld1 {v0.16b}, [%1], #16 \n" "subs %w3, %w3, #16 \n" "prfm pldl1keep, [%1, 448] \n" "st1 {v0.16b}, [%0], #16 \n" "b.gt 100b \n" - "99: \n" + "99: \n" : "+r"(dst_ptr), // %0 "+r"(src_ptr), // %1 "+r"(src_ptr1), // %2 @@ -3873,7 +3873,7 @@ void InterpolateRow_16_NEON(uint16_t* dst_ptr, "dup v5.8h, %w4 \n" "dup v4.8h, %w5 \n" // General purpose row blend. - "1: \n" + "1: \n" "ld1 {v0.8h}, [%1], #16 \n" "ld1 {v1.8h}, [%2], #16 \n" "subs %w3, %w3, #8 \n" @@ -3890,7 +3890,7 @@ void InterpolateRow_16_NEON(uint16_t* dst_ptr, "b 99f \n" // Blend 50 / 50. - "50: \n" + "50: \n" "ld1 {v0.8h}, [%1], #16 \n" "ld1 {v1.8h}, [%2], #16 \n" "subs %w3, %w3, #8 \n" @@ -3902,14 +3902,14 @@ void InterpolateRow_16_NEON(uint16_t* dst_ptr, "b 99f \n" // Blend 100 / 0 - Copy row unchanged. - "100: \n" + "100: \n" "ld1 {v0.8h}, [%1], #16 \n" "subs %w3, %w3, #8 \n" "prfm pldl1keep, [%1, 448] \n" "st1 {v0.8h}, [%0], #16 \n" "b.gt 100b \n" - "99: \n" + "99: \n" : "+r"(dst_ptr), // %0 "+r"(src_ptr), // %1 "+r"(src_ptr1), // %2 @@ -3946,7 +3946,7 @@ void InterpolateRow_16To8_NEON(uint8_t* dst_ptr, "dup v5.8h, %w4 \n" "dup v4.8h, %w5 \n" // General purpose row blend. - "1: \n" + "1: \n" "ld1 {v0.8h}, [%1], #16 \n" "ld1 {v1.8h}, [%2], #16 \n" "subs %w3, %w3, #8 \n" @@ -3965,7 +3965,7 @@ void InterpolateRow_16To8_NEON(uint8_t* dst_ptr, "b 99f \n" // Blend 50 / 50. - "50: \n" + "50: \n" "ld1 {v0.8h}, [%1], #16 \n" "ld1 {v1.8h}, [%2], #16 \n" "subs %w3, %w3, #8 \n" @@ -3979,7 +3979,7 @@ void InterpolateRow_16To8_NEON(uint8_t* dst_ptr, "b 99f \n" // Blend 100 / 0 - Copy row unchanged. - "100: \n" + "100: \n" "ldr q0, [%1], #16 \n" "ushl v0.8h, v0.8h, v2.8h \n" // shr = v2 is negative "prfm pldl1keep, [%1, 448] \n" @@ -3988,7 +3988,7 @@ void InterpolateRow_16To8_NEON(uint8_t* dst_ptr, "str d0, [%0], #8 \n" // store 8 pixels "b.gt 100b \n" - "99: \n" + "99: \n" : "+r"(dst_ptr), // %0 "+r"(src_ptr), // %1 "+r"(src_ptr1), // %2 @@ -4008,7 +4008,7 @@ void ARGBBlendRow_NEON(const uint8_t* src_argb, "subs %w3, %w3, #8 \n" "b.lt 89f \n" // Blend 8 pixels. - "8: \n" + "8: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1 "subs %w3, %w3, #8 \n" // 8 processed per loop. @@ -4031,12 +4031,12 @@ void ARGBBlendRow_NEON(const uint8_t* src_argb, // pixels "b.ge 8b \n" - "89: \n" + "89: \n" "adds %w3, %w3, #8-1 \n" "b.lt 99f \n" // Blend 1 pixels. - "1: \n" + "1: \n" "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel // ARGB0. "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel @@ -4060,7 +4060,7 @@ void ARGBBlendRow_NEON(const uint8_t* src_argb, "st4 {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n" // store 1 pixel. "b.ge 1b \n" - "99: \n" + "99: \n" : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 @@ -4079,7 +4079,7 @@ void ARGBAttenuateRow_NEON(const uint8_t* src_argb, "movi v7.8h, #0x00ff \n" // 255 for rounding up // Attenuate 8 pixels. - "1: \n" + "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB "subs %w2, %w2, #8 \n" // 8 processed per loop. "umull v4.8h, v0.8b, v3.8b \n" // b * a @@ -4112,7 +4112,7 @@ void ARGBQuantizeRow_NEON(uint8_t* dst_argb, "dup v6.8h, %w4 \n" // interval add // 8 pixel loop. - "1: \n" + "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB. "subs %w1, %w1, #8 \n" // 8 processed per loop. "uxtl v0.8h, v0.8b \n" // b (0 .. 255) @@ -4155,7 +4155,7 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb, "ushr v0.8h, v0.8h, #1 \n" // scale / 2. // 8 pixel loop. 
- "1: \n" + "1: \n" "ld1 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB "subs %w2, %w2, #8 \n" // 8 processed per loop. "uxtl v4.8h, v4.8b \n" @@ -4188,7 +4188,7 @@ void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { "movi v24.8b, #29 \n" // B * 0.1140 coefficient "movi v25.8b, #150 \n" // G * 0.5870 coefficient "movi v26.8b, #77 \n" // R * 0.2990 coefficient - "1: \n" + "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB "subs %w2, %w2, #8 \n" // 8 processed per loop. "umull v4.8h, v0.8b, v24.8b \n" // B @@ -4217,7 +4217,7 @@ void ARGBGrayRow_NEON_DotProd(const uint8_t* src_argb, asm volatile( "ld1r {v24.4s}, [%[coeffs]] \n" "ldr q25, [%[indices]] \n" - "1: \n" + "1: \n" "ldp q1, q3, [%[src]], #32 \n" // load 8 ARGB "subs %w[width], %w[width], #8 \n" // 8 processed per loop "movi v0.4s, #0 \n" @@ -4255,7 +4255,7 @@ void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) { "movi v28.8b, #24 \n" // BB coefficient "movi v29.8b, #98 \n" // BG coefficient "movi v30.8b, #50 \n" // BR coefficient - "1: \n" + "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels. "subs %w1, %w1, #8 \n" // 8 processed per loop. "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B @@ -4288,7 +4288,7 @@ void ARGBSepiaRow_NEON_DotProd(uint8_t* dst_argb, int width) { asm volatile( "ld3r {v20.4s, v21.4s, v22.4s}, [%[coeffs]] \n" "ldr d23, [%[indices]] \n" - "1: \n" + "1: \n" "ldp q0, q1, [%[dst]] \n" "subs %w1, %w1, #8 \n" "movi v2.4s, #0 \n" @@ -4333,7 +4333,7 @@ void ARGBColorMatrixRow_NEON(const uint8_t* src_argb, "sxtl v0.8h, v2.8b \n" // B,G coefficients s16. "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16. - "1: \n" + "1: \n" "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 ARGB "subs %w2, %w2, #8 \n" // 8 processed per loop. "uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit @@ -4390,7 +4390,7 @@ void ARGBColorMatrixRow_NEON_I8MM(const uint8_t* src_argb, asm volatile( "ld1 {v31.16b}, [%[matrix_argb]] \n" - "1: \n" + "1: \n" "ld1 {v0.16b, v1.16b}, [%[src_argb]], #32 \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. @@ -4446,7 +4446,7 @@ void ARGBMultiplyRow_NEON(const uint8_t* src_argb, int width) { asm volatile( // 8 pixel loop. - "1: \n" + "1: \n" "ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB "ld1 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more "subs %w3, %w3, #8 \n" // 8 processed per loop. @@ -4477,7 +4477,7 @@ void ARGBAddRow_NEON(const uint8_t* src_argb, int width) { asm volatile( // 8 pixel loop. - "1: \n" + "1: \n" "ldp q0, q1, [%0], #32 \n" // load 8 ARGB "ldp q4, q5, [%1], #32 \n" // load 8 more "subs %w3, %w3, #8 \n" // 8 processed per loop. @@ -4502,7 +4502,7 @@ void ARGBSubtractRow_NEON(const uint8_t* src_argb, int width) { asm volatile( // 8 pixel loop. - "1: \n" + "1: \n" "ldp q0, q1, [%0], #32 \n" // load 8 ARGB "ldp q4, q5, [%1], #32 \n" // load 8 more "subs %w3, %w3, #8 \n" // 8 processed per loop. @@ -4532,7 +4532,7 @@ void SobelRow_NEON(const uint8_t* src_sobelx, asm volatile( "movi v3.8b, #255 \n" // alpha // 8 pixel loop. - "1: \n" + "1: \n" "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx. "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely. "subs %w3, %w3, #8 \n" // 8 processed per loop. @@ -4558,7 +4558,7 @@ void SobelToPlaneRow_NEON(const uint8_t* src_sobelx, int width) { asm volatile( // 16 pixel loop. - "1: \n" + "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx. "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely. "subs %w3, %w3, #16 \n" // 16 processed per loop. 
@@ -4587,7 +4587,7 @@ void SobelXYRow_NEON(const uint8_t* src_sobelx, asm volatile( "movi v3.8b, #255 \n" // alpha // 8 pixel loop. - "1: \n" + "1: \n" "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx. "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely. "subs %w3, %w3, #8 \n" // 8 processed per loop. @@ -4614,7 +4614,7 @@ void SobelXRow_NEON(const uint8_t* src_y0, uint8_t* dst_sobelx, int width) { asm volatile( - "1: \n" + "1: \n" "ld1 {v0.8b}, [%0],%5 \n" // top "ld1 {v1.8b}, [%0],%6 \n" "subs %w4, %w4, #8 \n" // 8 pixels @@ -4655,7 +4655,7 @@ void SobelYRow_NEON(const uint8_t* src_y0, uint8_t* dst_sobely, int width) { asm volatile( - "1: \n" + "1: \n" "ld1 {v0.8b}, [%0],%4 \n" // left "ld1 {v1.8b}, [%1],%4 \n" "subs %w3, %w3, #8 \n" // 8 pixels @@ -4690,7 +4690,7 @@ void HalfFloatRow_NEON(const uint16_t* src, float scale, int width) { asm volatile( - "1: \n" + "1: \n" "ldp q0, q1, [%0], #32 \n" // load 16 shorts "subs %w2, %w2, #16 \n" // 16 pixels per loop "uxtl v2.4s, v0.4h \n" @@ -4724,7 +4724,7 @@ void ByteToFloatRow_NEON(const uint8_t* src, float scale, int width) { asm volatile( - "1: \n" + "1: \n" "ld1 {v1.8b}, [%0], #8 \n" // load 8 bytes "subs %w2, %w2, #8 \n" // 8 pixels per loop "uxtl v1.8h, v1.8b \n" // 8 shorts @@ -4749,7 +4749,7 @@ void ConvertFP16ToFP32Row_NEON(const uint16_t* src, // fp16 float* dst, int width) { asm volatile( - "1: \n" + "1: \n" "ld1 {v1.8h}, [%0], #16 \n" // load 8 halffloats "subs %w2, %w2, #8 \n" // 8 floats per loop "prfm pldl1keep, [%0, 448] \n" @@ -4773,7 +4773,7 @@ void ConvertFP16ToFP32Column_NEON(const uint16_t* src, // fp16 asm volatile( "cmp %w2, #8 \n" // Is there 8 rows? "b.lo 2f \n" - "1: \n" + "1: \n" "ld1 {v0.h}[0], [%0], %3 \n" // load 8 halffloats "ld1 {v0.h}[1], [%0], %3 \n" "ld1 {v0.h}[2], [%0], %3 \n" @@ -4790,13 +4790,13 @@ void ConvertFP16ToFP32Column_NEON(const uint16_t* src, // fp16 "b.gt 1b \n" "cmp %w2, #1 \n" // Is there 1 value? 
"b.lo 3f \n" - "2: \n" + "2: \n" "ld1 {v1.h}[0], [%0], %3 \n" // load 1 halffloats "subs %w2, %w2, #1 \n" // 1 floats per loop "fcvtl v2.4s, v1.4h \n" // 1 floats "str s2, [%1], #4 \n" // store 1 floats "b.gt 2b \n" - "3: \n" + "3: \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -4809,7 +4809,7 @@ void ConvertFP32ToFP16Row_NEON(const float* src, uint16_t* dst, // fp16 int width) { asm volatile( - "1: \n" + "1: \n" "ldp q2, q3, [%0], #32 \n" // load 8 floats "subs %w2, %w2, #8 \n" // 8 floats per loop "prfm pldl1keep, [%0, 448] \n" @@ -4833,7 +4833,7 @@ float ScaleMaxSamples_NEON(const float* src, "movi v5.4s, #0 \n" // max "movi v6.4s, #0 \n" - "1: \n" + "1: \n" "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples "subs %w2, %w2, #8 \n" // 8 processed per loop "fmul v3.4s, v1.4s, %4.s[0] \n" // scale @@ -4863,7 +4863,7 @@ float ScaleSumSamples_NEON(const float* src, "movi v5.4s, #0 \n" // max "movi v6.4s, #0 \n" // max - "1: \n" + "1: \n" "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples "subs %w2, %w2, #8 \n" // 8 processed per loop "fmul v3.4s, v1.4s, %4.s[0] \n" // scale @@ -4887,7 +4887,7 @@ float ScaleSumSamples_NEON(const float* src, void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) { asm volatile( - "1: \n" + "1: \n" "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples "subs %w2, %w2, #8 \n" // 8 processed per loop "prfm pldl1keep, [%0, 448] \n" @@ -4914,7 +4914,7 @@ void GaussCol_NEON(const uint16_t* src0, "movi v6.8h, #4 \n" // constant 4 "movi v7.8h, #6 \n" // constant 6 - "1: \n" + "1: \n" "ld1 {v1.8h}, [%0], #16 \n" // load 8 samples, 5 rows "ld1 {v2.8h}, [%4], #16 \n" "subs %w6, %w6, #8 \n" // 8 processed per loop @@ -4956,7 +4956,7 @@ void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) { "movi v6.4s, #4 \n" // constant 4 "movi v7.4s, #6 \n" // constant 6 - "1: \n" + "1: \n" "ld1 {v0.4s,v1.4s,v2.4s}, [%0], %6 \n" // load 12 source samples "subs %w5, %w5, #8 \n" // 8 processed per loop "add v0.4s, v0.4s, v1.4s \n" // * 1 @@ -4998,7 +4998,7 @@ void GaussCol_F32_NEON(const float* src0, asm volatile( "ld2r {v6.4s, v7.4s}, [%7] \n" // constants 4 and 6 - "1: \n" + "1: \n" "ld1 {v0.4s, v1.4s}, [%0], #32 \n" // load 8 samples, 5 rows "ld1 {v2.4s, v3.4s}, [%1], #32 \n" "subs %w6, %w6, #8 \n" // 8 processed per loop @@ -5036,7 +5036,7 @@ void GaussRow_F32_NEON(const float* src, float* dst, int width) { asm volatile( "ld3r {v6.4s, v7.4s, v8.4s}, [%3] \n" // constants 4, 6, 1/256 - "1: \n" + "1: \n" "ld1 {v0.4s, v1.4s, v2.4s}, [%0], %4 \n" // load 12 samples, 5 // rows "subs %w2, %w2, #8 \n" // 8 processed per loop @@ -5073,7 +5073,7 @@ void NV21ToYUV24Row_NEON(const uint8_t* src_y, uint8_t* dst_yuv24, int width) { asm volatile( - "1: \n" + "1: \n" "ld1 {v2.16b}, [%0], #16 \n" // load 16 Y values "ld2 {v0.8b, v1.8b}, [%1], #16 \n" // load 8 VU values "subs %w3, %w3, #16 \n" // 16 pixels per loop @@ -5105,7 +5105,7 @@ void NV21ToYUV24Row_NEON(const uint8_t* src_y, int width) { asm volatile( "ld1 {v5.16b,v6.16b,v7.16b}, [%4] \n" // 3 shuffler constants - "1: \n" + "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 16 Y values "ld1 {v1.16b}, [%1], #16 \n" // load 8 VU values "subs %w3, %w3, #16 \n" // 16 pixels per loop @@ -5135,7 +5135,7 @@ void AYUVToUVRow_NEON(const uint8_t* src_ayuv, const uint8_t* src_ayuv_1 = src_ayuv + src_stride_ayuv; asm volatile( - "1: \n" + "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ayuv "subs %w3, %w3, #16 \n" // 16 processed per loop. 
"uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts. @@ -5164,7 +5164,7 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv, const uint8_t* src_ayuv_1 = src_ayuv + src_stride_ayuv; asm volatile( - "1: \n" + "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ayuv "subs %w3, %w3, #16 \n" // 16 processed per loop. "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts. @@ -5189,7 +5189,7 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv, // Copy row of AYUV Y's into Y void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) { asm volatile( - "1: \n" + "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 "subs %w2, %w2, #16 \n" // 16 pixels per loop "prfm pldl1keep, [%0, 448] \n" @@ -5205,7 +5205,7 @@ void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) { // Convert UV plane of NV12 to VU of NV21. void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) { asm volatile( - "1: \n" + "1: \n" "ld1 {v0.16b}, [%0], 16 \n" // load 16 UV values "ld1 {v1.16b}, [%0], 16 \n" "subs %w2, %w2, #16 \n" // 16 pixels per loop @@ -5230,7 +5230,7 @@ void HalfMergeUVRow_NEON(const uint8_t* src_u, const uint8_t* src_u_1 = src_u + src_stride_u; const uint8_t* src_v_1 = src_v + src_stride_v; asm volatile( - "1: \n" + "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 16 U values "ld1 {v1.16b}, [%2], #16 \n" // load 16 V values "ld1 {v2.16b}, [%1], #16 \n" @@ -5266,7 +5266,7 @@ void SplitUVRow_16_NEON(const uint16_t* src_uv, int shift = depth - 16; // Negative for right shift. asm volatile( "dup v2.8h, %w4 \n" - "1: \n" + "1: \n" "ld2 {v0.8h, v1.8h}, [%0], #32 \n" // load 8 UV "subs %w3, %w3, #8 \n" // 8 src pixels per loop "ushl v0.8h, v0.8h, v2.8h \n" @@ -5289,7 +5289,7 @@ void MultiplyRow_16_NEON(const uint16_t* src_y, int width) { asm volatile( "dup v2.8h, %w3 \n" - "1: \n" + "1: \n" "ldp q0, q1, [%0], #32 \n" "subs %w2, %w2, #16 \n" // 16 src pixels per loop "mul v0.8h, v0.8h, v2.8h \n" @@ -5310,7 +5310,7 @@ void DivideRow_16_NEON(const uint16_t* src_y, int width) { asm volatile( "dup v4.8h, %w3 \n" - "1: \n" + "1: \n" "ldp q2, q3, [%0], #32 \n" "subs %w2, %w2, #16 \n" // 16 src pixels per loop "umull v0.4s, v2.4h, v4.4h \n" @@ -5344,7 +5344,7 @@ void Convert16To8Row_NEON(const uint16_t* src_y, int shift = 23 - __builtin_clz((int32_t)scale); asm volatile( "dup v2.8h, %w3 \n" - "1: \n" + "1: \n" "ldp q0, q1, [%0], #32 \n" "subs %w2, %w2, #16 \n" // 16 src pixels per loop "uqshl v0.8h, v0.8h, v2.8h \n" @@ -5372,7 +5372,7 @@ void Convert8To8Row_NEON(const uint8_t* src_y, asm volatile( "dup v4.16b, %w3 \n" // scale "dup v5.16b, %w4 \n" // bias - "1: \n" + "1: \n" "ldp q2, q3, [%0], #32 \n" "subs %w2, %w2, #32 \n" // 32 pixels per loop "umull v0.8h, v2.8b, v4.8b \n" diff --git a/source/scale_gcc.cc b/source/scale_gcc.cc index c5dabd409..f699a49bf 100644 --- a/source/scale_gcc.cc +++ b/source/scale_gcc.cc @@ -100,7 +100,7 @@ void ScaleRowDown2_SSSE3(const uint8_t* src_ptr, asm volatile( // 16 pixel loop. 
LABELALIGN - "1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" "lea 0x20(%0),%0 \n" @@ -130,7 +130,7 @@ void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr, "pxor %%xmm5,%%xmm5 \n" LABELALIGN - "1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" "lea 0x20(%0),%0 \n" @@ -161,7 +161,7 @@ void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr, "pxor %%xmm5,%%xmm5 \n" LABELALIGN - "1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" "movdqu 0x00(%0,%3,1),%%xmm2 \n" @@ -196,7 +196,7 @@ void ScaleRowDown2_AVX2(const uint8_t* src_ptr, int dst_width) { (void)src_stride; asm volatile( - "1: \n" + "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" "lea 0x40(%0),%0 \n" @@ -208,7 +208,7 @@ void ScaleRowDown2_AVX2(const uint8_t* src_ptr, "lea 0x20(%1),%1 \n" "sub $0x20,%2 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -228,7 +228,7 @@ void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr, "vpxor %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN - "1: \n" + "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" "lea 0x40(%0),%0 \n" @@ -242,7 +242,7 @@ void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr, "lea 0x20(%1),%1 \n" "sub $0x20,%2 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -261,7 +261,7 @@ void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr, "vpxor %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN - "1: \n" + "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" @@ -283,7 +283,7 @@ void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr, "lea 0x20(%1),%1 \n" "sub $0x20,%2 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -303,7 +303,7 @@ void ScaleRowDown4_SSSE3(const uint8_t* src_ptr, "pslld $0x10,%%xmm5 \n" LABELALIGN - "1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" "lea 0x20(%0),%0 \n" @@ -337,7 +337,7 @@ void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr, "lea 0x00(%4,%4,2),%3 \n" LABELALIGN - "1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" "movdqu 0x00(%0,%4,1),%%xmm2 \n" @@ -389,7 +389,7 @@ void ScaleRowDown4_AVX2(const uint8_t* src_ptr, "vpslld $0x10,%%ymm5,%%ymm5 \n" LABELALIGN - "1: \n" + "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" "lea 0x40(%0),%0 \n" @@ -404,7 +404,7 @@ void ScaleRowDown4_AVX2(const uint8_t* src_ptr, "lea 0x10(%1),%1 \n" "sub $0x10,%2 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -423,7 +423,7 @@ void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr, "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" LABELALIGN - "1: \n" + "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" @@ -457,7 +457,7 @@ void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr, "lea 0x10(%1),%1 \n" "sub $0x10,%2 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -482,7 +482,7 @@ void ScaleRowDown34_SSSE3(const uint8_t* src_ptr, "m"(kShuf2) // %2 ); asm volatile( - "1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm2 \n" "lea 0x20(%0),%0 \n" @@ -527,7 +527,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr, "m"(kRound34) // %2 ); asm volatile( - "1: \n" + "1: \n" "movdqu (%0),%%xmm6 \n" "movdqu 0x00(%0,%3,1),%%xmm7 \n" "pavgb %%xmm7,%%xmm6 \n" @@ -592,7 +592,7 @@ 
void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr, ); asm volatile( - "1: \n" + "1: \n" "movdqu (%0),%%xmm6 \n" "movdqu 0x00(%0,%3,1),%%xmm7 \n" "pavgb %%xmm6,%%xmm7 \n" @@ -646,7 +646,7 @@ void ScaleRowDown38_SSSE3(const uint8_t* src_ptr, "movdqa %4,%%xmm5 \n" LABELALIGN - "1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" "lea 0x20(%0),%0 \n" @@ -683,7 +683,7 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr, "m"(kScaleAb2) // %3 ); asm volatile( - "1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x00(%0,%3,1),%%xmm1 \n" "lea 0x10(%0),%0 \n" @@ -725,7 +725,7 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, "m"(kScaleAc33) // %2 ); asm volatile( - "1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x00(%0,%3,1),%%xmm6 \n" "movhlps %%xmm0,%%xmm1 \n" @@ -789,7 +789,7 @@ void ScaleRowUp2_Linear_SSE2(const uint8_t* src_ptr, "psllw $1,%%xmm6 \n" // all 2 LABELALIGN - "1: \n" + "1: \n" "movq (%0),%%xmm1 \n" // 01234567 "movq 1(%0),%%xmm2 \n" // 12345678 "movdqa %%xmm1,%%xmm3 \n" @@ -839,7 +839,7 @@ void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr, ptrdiff_t dst_stride, int dst_width) { asm volatile( - "1: \n" + "1: \n" "pxor %%xmm0,%%xmm0 \n" // 0 // above line "movq (%0),%%xmm1 \n" // 01234567 @@ -958,7 +958,7 @@ void ScaleRowUp2_Linear_12_SSSE3(const uint16_t* src_ptr, "psllw $1,%%xmm4 \n" // all 2 LABELALIGN - "1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" // 01234567 (16) "movdqu 2(%0),%%xmm1 \n" // 12345678 (16) @@ -1010,7 +1010,7 @@ void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t* src_ptr, "movdqa %5,%%xmm6 \n" LABELALIGN - "1: \n" + "1: \n" // above line "movdqu (%0),%%xmm0 \n" // 01234567 (16) "movdqu 2(%0),%%xmm1 \n" // 12345678 (16) @@ -1108,7 +1108,7 @@ void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr, "pslld $1,%%xmm4 \n" // all 2 LABELALIGN - "1: \n" + "1: \n" "movq (%0),%%xmm0 \n" // 0123 (16b) "movq 2(%0),%%xmm1 \n" // 1234 (16b) @@ -1161,7 +1161,7 @@ void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr, "pslld $3,%%xmm6 \n" // all 8 LABELALIGN - "1: \n" + "1: \n" "movq (%0),%%xmm0 \n" // 0011 (16b, 1u1v) "movq 4(%0),%%xmm1 \n" // 1122 (16b, 1u1v) "punpcklwd %%xmm7,%%xmm0 \n" // 0011 (near) (32b, 1u1v) @@ -1269,7 +1269,7 @@ void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr, "movdqa %3,%%xmm3 \n" LABELALIGN - "1: \n" + "1: \n" "movq (%0),%%xmm0 \n" // 01234567 "movq 1(%0),%%xmm1 \n" // 12345678 "punpcklwd %%xmm0,%%xmm0 \n" // 0101232345456767 @@ -1310,7 +1310,7 @@ void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr, "movdqa %5,%%xmm7 \n" LABELALIGN - "1: \n" + "1: \n" "movq (%0),%%xmm0 \n" // 01234567 "movq 1(%0),%%xmm1 \n" // 12345678 "punpcklwd %%xmm0,%%xmm0 \n" // 0101232345456767 @@ -1395,7 +1395,7 @@ void ScaleRowUp2_Linear_AVX2(const uint8_t* src_ptr, "vbroadcastf128 %3,%%ymm3 \n" LABELALIGN - "1: \n" + "1: \n" "vmovdqu (%0),%%xmm0 \n" // 0123456789ABCDEF "vmovdqu 1(%0),%%xmm1 \n" // 123456789ABCDEF0 "vpermq $0b11011000,%%ymm0,%%ymm0 \n" @@ -1417,7 +1417,7 @@ void ScaleRowUp2_Linear_AVX2(const uint8_t* src_ptr, "lea 0x20(%1),%1 \n" // 16 sample to 32 sample "sub $0x20,%2 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -1439,7 +1439,7 @@ void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr, "vbroadcastf128 %5,%%ymm7 \n" LABELALIGN - "1: \n" + "1: \n" "vmovdqu (%0),%%xmm0 \n" // 0123456789ABCDEF "vmovdqu 1(%0),%%xmm1 \n" // 123456789ABCDEF0 "vpermq $0b11011000,%%ymm0,%%ymm0 \n" @@ -1498,7 +1498,7 @@ void ScaleRowUp2_Bilinear_AVX2(const uint8_t* 
src_ptr, "lea 0x20(%1),%1 \n" // 16 sample to 32 sample "sub $0x20,%2 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -1521,7 +1521,7 @@ void ScaleRowUp2_Linear_12_AVX2(const uint16_t* src_ptr, "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2 LABELALIGN - "1: \n" + "1: \n" "vmovdqu (%0),%%ymm0 \n" // 0123456789ABCDEF (16b) "vmovdqu 2(%0),%%ymm1 \n" // 123456789ABCDEF0 (16b) @@ -1551,7 +1551,7 @@ void ScaleRowUp2_Linear_12_AVX2(const uint16_t* src_ptr, "lea 0x40(%1),%1 \n" // 16 sample to 32 sample "sub $0x20,%2 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -1573,7 +1573,7 @@ void ScaleRowUp2_Bilinear_12_AVX2(const uint16_t* src_ptr, "vpsllw $3,%%ymm4,%%ymm4 \n" // all 8 LABELALIGN - "1: \n" + "1: \n" "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b) "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b) @@ -1613,7 +1613,7 @@ void ScaleRowUp2_Bilinear_12_AVX2(const uint16_t* src_ptr, "lea 0x20(%1),%1 \n" // 8 sample to 16 sample "sub $0x10,%2 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -1634,7 +1634,7 @@ void ScaleRowUp2_Linear_16_AVX2(const uint16_t* src_ptr, "vpslld $1,%%ymm4,%%ymm4 \n" // all 2 LABELALIGN - "1: \n" + "1: \n" "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b, 1u1v) "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b, 1u1v) @@ -1663,7 +1663,7 @@ void ScaleRowUp2_Linear_16_AVX2(const uint16_t* src_ptr, "lea 0x20(%1),%1 \n" // 8 pixel to 16 pixel "sub $0x10,%2 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -1684,7 +1684,7 @@ void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, "vpslld $3,%%ymm6,%%ymm6 \n" // all 8 LABELALIGN - "1: \n" + "1: \n" "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b, 1u1v) "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b, 1u1v) @@ -1747,7 +1747,7 @@ void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, "lea 0x20(%1),%1 \n" // 8 pixel to 16 pixel "sub $0x10,%2 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -1765,7 +1765,7 @@ void ScaleAddRow_SSE2(const uint8_t* src_ptr, // 16 pixel loop. 
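// Note on the ScaleRowUp2 kernels above: the "all 2"/"all 8" shift
// constants and the broadcast 3s implement a 3:1 near/far weighting per
// output sample. A minimal scalar sketch of the linear (horizontal)
// case, assuming the shape of libyuv's C fallback (illustrative only,
// not part of this formatting patch):
static void ScaleRowUp2_Linear_sketch(const uint8_t* src,
                                      uint8_t* dst,
                                      int dst_width) {
  int src_width = dst_width / 2;
  for (int x = 0; x < src_width; ++x) {
    // Mix near:far = 3:1 with round-to-nearest; the off-by-one loads
    // ("movq 1(%0)", "vmovdqu 1(%0)") supply the src[x + 1] tap.
    dst[2 * x + 0] = (uint8_t)((3 * src[x] + src[x + 1] + 2) >> 2);
    dst[2 * x + 1] = (uint8_t)((src[x] + 3 * src[x + 1] + 2) >> 2);
  }
}
// The Bilinear variants apply the same 3:1 mix vertically as well,
// which is where the 9:3:3:1 weights and their wider rounding shift
// come from.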
LABELALIGN - "1: \n" + "1: \n" "movdqu (%0),%%xmm3 \n" "lea 0x10(%0),%0 \n" // src_ptr += 16 "movdqu (%1),%%xmm0 \n" @@ -1795,7 +1795,7 @@ void ScaleAddRow_AVX2(const uint8_t* src_ptr, asm volatile("vpxor %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN - "1: \n" + "1: \n" "vmovdqu (%0),%%ymm3 \n" "lea 0x20(%0),%0 \n" // src_ptr += 32 "vpermq $0xd8,%%ymm3,%%ymm3 \n" @@ -1808,7 +1808,7 @@ void ScaleAddRow_AVX2(const uint8_t* src_ptr, "lea 0x40(%1),%1 \n" "sub $0x20,%2 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(src_width) // %2 @@ -1854,7 +1854,7 @@ void ScaleFilterCols_SSSE3(uint8_t* dst_ptr, "pextrw $0x3,%%xmm2,%k4 \n" LABELALIGN - "2: \n" + "2: \n" "movdqa %%xmm2,%%xmm1 \n" "paddd %%xmm3,%%xmm2 \n" "movzwl 0x00(%1,%3,1),%k2 \n" @@ -1881,7 +1881,7 @@ void ScaleFilterCols_SSSE3(uint8_t* dst_ptr, "jge 2b \n" LABELALIGN - "29: \n" + "29: \n" "addl $0x1,%5 \n" "jl 99f \n" "movzwl 0x00(%1,%3,1),%k2 \n" @@ -1897,7 +1897,7 @@ void ScaleFilterCols_SSSE3(uint8_t* dst_ptr, "packuswb %%xmm2,%%xmm2 \n" "movd %%xmm2,%k2 \n" "mov %b2,(%0) \n" - "99: \n" + "99: \n" : "+r"(dst_ptr), // %0 "+r"(src_ptr), // %1 "=&a"(temp_pixel), // %2 @@ -1931,7 +1931,7 @@ void ScaleColsUp2_SSE2(uint8_t* dst_ptr, (void)x; (void)dx; asm volatile( - "1: \n" + "1: \n" "movdqu (%1),%%xmm0 \n" "lea 0x10(%1),%1 \n" "movdqa %%xmm0,%%xmm1 \n" @@ -1956,7 +1956,7 @@ void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb, int dst_width) { (void)src_stride; asm volatile( - "1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" "lea 0x20(%0),%0 \n" @@ -1978,7 +1978,7 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb, int dst_width) { (void)src_stride; asm volatile( - "1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" "lea 0x20(%0),%0 \n" @@ -2002,7 +2002,7 @@ void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb, uint8_t* dst_argb, int dst_width) { asm volatile( - "1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" "movdqu 0x00(%0,%3,1),%%xmm2 \n" @@ -2040,7 +2040,7 @@ void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb, "lea 0x00(%1,%1,2),%4 \n" LABELALIGN - "1: \n" + "1: \n" "movd (%0),%%xmm0 \n" "movd 0x00(%0,%1,1),%%xmm1 \n" "punpckldq %%xmm1,%%xmm0 \n" @@ -2078,7 +2078,7 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb, "lea 0x00(%0,%5,1),%5 \n" LABELALIGN - "1: \n" + "1: \n" "movq (%0),%%xmm0 \n" "movhps 0x00(%0,%1,1),%%xmm0 \n" "movq 0x00(%0,%1,2),%%xmm1 \n" @@ -2134,7 +2134,7 @@ void ScaleARGBCols_SSE2(uint8_t* dst_argb, "jl 49f \n" LABELALIGN - "40: \n" + "40: \n" "movd 0x00(%3,%0,4),%%xmm0 \n" "movd 0x00(%3,%1,4),%%xmm1 \n" "pextrw $0x5,%%xmm2,%k0 \n" @@ -2152,7 +2152,7 @@ void ScaleARGBCols_SSE2(uint8_t* dst_argb, "sub $0x4,%4 \n" "jge 40b \n" - "49: \n" + "49: \n" "test $0x2,%4 \n" "je 29f \n" "movd 0x00(%3,%0,4),%%xmm0 \n" @@ -2161,12 +2161,12 @@ void ScaleARGBCols_SSE2(uint8_t* dst_argb, "punpckldq %%xmm1,%%xmm0 \n" "movq %%xmm0,(%2) \n" "lea 0x8(%2),%2 \n" - "29: \n" + "29: \n" "test $0x1,%4 \n" "je 99f \n" "movd 0x00(%3,%0,4),%%xmm0 \n" "movd %%xmm0,(%2) \n" - "99: \n" + "99: \n" : "=&a"(x0), // %0 "=&d"(x1), // %1 "+r"(dst_argb), // %2 @@ -2187,7 +2187,7 @@ void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb, (void)x; (void)dx; asm volatile( - "1: \n" + "1: \n" "movdqu (%1),%%xmm0 \n" "lea 0x10(%1),%1 \n" "movdqa %%xmm0,%%xmm1 \n" @@ -2248,7 +2248,7 @@ void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb, "pextrw $0x3,%%xmm2,%k4 \n" LABELALIGN - "2: \n" + "2: \n" "movdqa %%xmm2,%%xmm1 \n" "paddd %%xmm3,%%xmm2 
\n" "movq 0x00(%1,%3,4),%%xmm0 \n" @@ -2268,7 +2268,7 @@ void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb, "jge 2b \n" LABELALIGN - "29: \n" + "29: \n" "add $0x1,%2 \n" "jl 99f \n" "psrlw $0x9,%%xmm2 \n" @@ -2281,7 +2281,7 @@ void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb, "packuswb %%xmm0,%%xmm0 \n" "movd %%xmm0,(%0) \n" - LABELALIGN "99: \n" + LABELALIGN "99: \n" : "+r"(dst_argb), // %0 "+r"(src_argb), // %1 @@ -2296,7 +2296,7 @@ void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb, // Divide num by div and return as 16.16 fixed point result. int FixedDiv_X86(int num, int div) { asm volatile( - "cdq \n" + "cdq \n" "shld $0x10,%%eax,%%edx \n" "shl $0x10,%%eax \n" "idiv %1 \n" @@ -2310,7 +2310,7 @@ int FixedDiv_X86(int num, int div) { // Divide num - 1 by div - 1 and return as 16.16 fixed point result. int FixedDiv1_X86(int num, int div) { asm volatile( - "cdq \n" + "cdq \n" "shld $0x10,%%eax,%%edx \n" "shl $0x10,%%eax \n" "sub $0x10001,%%eax \n" @@ -2350,7 +2350,7 @@ void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr, "movdqa %5,%%xmm3 \n" // merge shuffler LABELALIGN - "1: \n" + "1: \n" "movdqu (%0),%%xmm0 \n" // 8 UV row 0 "movdqu 0x00(%0,%3,1),%%xmm2 \n" // 8 UV row 1 "lea 0x10(%0),%0 \n" @@ -2390,7 +2390,7 @@ void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr, "vbroadcastf128 %5,%%ymm3 \n" // merge shuffler LABELALIGN - "1: \n" + "1: \n" "vmovdqu (%0),%%ymm0 \n" // 16 UV row 0 "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" // 16 UV row 1 "lea 0x20(%0),%0 \n" @@ -2407,7 +2407,7 @@ void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr, "lea 0x10(%1),%1 \n" // 8 UV "sub $0x8,%2 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -2432,7 +2432,7 @@ void ScaleUVRowUp2_Linear_SSSE3(const uint8_t* src_ptr, "movdqa %3,%%xmm3 \n" LABELALIGN - "1: \n" + "1: \n" "movq (%0),%%xmm0 \n" // 00112233 (1u1v) "movq 2(%0),%%xmm1 \n" // 11223344 (1u1v) "punpcklbw %%xmm1,%%xmm0 \n" // 0101121223233434 (2u2v) @@ -2473,7 +2473,7 @@ void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr, "movdqa %5,%%xmm7 \n" LABELALIGN - "1: \n" + "1: \n" "movq (%0),%%xmm0 \n" // 00112233 (1u1v) "movq 2(%0),%%xmm1 \n" // 11223344 (1u1v) "punpcklbw %%xmm1,%%xmm0 \n" // 0101121223233434 (2u2v) @@ -2557,7 +2557,7 @@ void ScaleUVRowUp2_Linear_AVX2(const uint8_t* src_ptr, "vbroadcastf128 %3,%%ymm3 \n" LABELALIGN - "1: \n" + "1: \n" "vmovdqu (%0),%%xmm0 \n" "vmovdqu 2(%0),%%xmm1 \n" "vpermq $0b11011000,%%ymm0,%%ymm0 \n" @@ -2578,7 +2578,7 @@ void ScaleUVRowUp2_Linear_AVX2(const uint8_t* src_ptr, "lea 0x20(%1),%1 \n" // 8 uv to 16 uv "sub $0x10,%2 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -2600,7 +2600,7 @@ void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr, "vbroadcastf128 %5,%%ymm7 \n" LABELALIGN - "1: \n" + "1: \n" "vmovdqu (%0),%%xmm0 \n" "vmovdqu 2(%0),%%xmm1 \n" "vpermq $0b11011000,%%ymm0,%%ymm0 \n" @@ -2657,7 +2657,7 @@ void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr, "lea 0x20(%1),%1 \n" // 8 uv to 16 uv "sub $0x10,%2 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -2680,7 +2680,7 @@ void ScaleUVRowUp2_Linear_16_SSE41(const uint16_t* src_ptr, "pslld $1,%%xmm4 \n" // all 2 LABELALIGN - "1: \n" + "1: \n" "movq (%0),%%xmm0 \n" // 0011 (16b, 1u1v) "movq 4(%0),%%xmm1 \n" // 1122 (16b, 1u1v) @@ -2732,7 +2732,7 @@ void ScaleUVRowUp2_Bilinear_16_SSE41(const uint16_t* src_ptr, "pslld $3,%%xmm6 \n" // all 8 LABELALIGN 
- "1: \n" + "1: \n" "movq (%0),%%xmm0 \n" // 0011 (16b, 1u1v) "movq 4(%0),%%xmm1 \n" // 1122 (16b, 1u1v) "punpcklwd %%xmm7,%%xmm0 \n" // 0011 (near) (32b, 1u1v) @@ -2822,7 +2822,7 @@ void ScaleUVRowUp2_Linear_16_AVX2(const uint16_t* src_ptr, "vpslld $1,%%ymm4,%%ymm4 \n" // all 2 LABELALIGN - "1: \n" + "1: \n" "vmovdqu (%0),%%xmm0 \n" // 00112233 (16b, 1u1v) "vmovdqu 4(%0),%%xmm1 \n" // 11223344 (16b, 1u1v) @@ -2850,7 +2850,7 @@ void ScaleUVRowUp2_Linear_16_AVX2(const uint16_t* src_ptr, "lea 0x20(%1),%1 \n" // 4 uv to 8 uv "sub $0x8,%2 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -2871,7 +2871,7 @@ void ScaleUVRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, "vpslld $3,%%ymm6,%%ymm6 \n" // all 8 LABELALIGN - "1: \n" + "1: \n" "vmovdqu (%0),%%xmm0 \n" // 00112233 (16b, 1u1v) "vmovdqu 4(%0),%%xmm1 \n" // 11223344 (16b, 1u1v) @@ -2932,7 +2932,7 @@ void ScaleUVRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, "lea 0x20(%1),%1 \n" // 4 uv to 8 uv "sub $0x8,%2 \n" "jg 1b \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 diff --git a/source/scale_neon.cc b/source/scale_neon.cc index 88378c575..0ed3287a6 100644 --- a/source/scale_neon.cc +++ b/source/scale_neon.cc @@ -30,7 +30,7 @@ void ScaleRowDown2_NEON(const uint8_t* src_ptr, int dst_width) { (void)src_stride; asm volatile( - "1: \n" + "1: \n" // load even pixels into q0, odd into q1 "vld2.8 {q0, q1}, [%0]! \n" "subs %2, %2, #16 \n" // 16 processed per loop @@ -51,7 +51,7 @@ void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr, int dst_width) { (void)src_stride; asm volatile( - "1: \n" + "1: \n" "vld2.8 {q0, q1}, [%0]! \n" // load 32 pixels "subs %2, %2, #16 \n" // 16 processed per loop "vrhadd.u8 q0, q0, q1 \n" // rounding half add @@ -73,7 +73,7 @@ void ScaleRowDown2Box_NEON(const uint8_t* src_ptr, asm volatile( // change the stride to row 2 pointer "add %1, %0 \n" - "1: \n" + "1: \n" "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc "vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc "subs %3, %3, #16 \n" // 16 processed per loop @@ -102,7 +102,7 @@ void ScaleRowDown4_NEON(const uint8_t* src_ptr, int dst_width) { (void)src_stride; asm volatile( - "1: \n" + "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 "subs %2, %2, #8 \n" // 8 processed per loop "vst1.8 {d2}, [%1]! \n" @@ -122,7 +122,7 @@ void ScaleRowDown4Box_NEON(const uint8_t* src_ptr, const uint8_t* src_ptr2 = src_ptr + src_stride * 2; const uint8_t* src_ptr3 = src_ptr + src_stride * 3; asm volatile( - "1: \n" + "1: \n" "vld1.8 {q0}, [%0]! \n" // load up 16x4 "vld1.8 {q1}, [%3]! \n" "vld1.8 {q2}, [%4]! \n" @@ -156,7 +156,7 @@ void ScaleRowDown34_NEON(const uint8_t* src_ptr, int dst_width) { (void)src_stride; asm volatile( - "1: \n" + "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 "subs %2, %2, #24 \n" "vmov d2, d3 \n" // order d0, d1, d2 @@ -176,7 +176,7 @@ void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr, asm volatile( "vmov.u8 d24, #3 \n" "add %3, %0 \n" - "1: \n" + "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 "subs %2, %2, #24 \n" @@ -233,7 +233,7 @@ void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr, asm volatile( "vmov.u8 d24, #3 \n" "add %3, %0 \n" - "1: \n" + "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 "vld4.8 {d4, d5, d6, d7}, [%3]! 
\n" // src line 1 "subs %2, %2, #24 \n" @@ -284,7 +284,7 @@ void ScaleRowDown38_NEON(const uint8_t* src_ptr, (void)src_stride; asm volatile( "vld1.8 {q3}, [%3] \n" - "1: \n" + "1: \n" "vld1.8 {d0, d1, d2, d3}, [%0]! \n" "subs %2, %2, #12 \n" "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n" @@ -311,7 +311,7 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr, "vld1.8 {q14}, [%6] \n" "vld1.8 {q15}, [%7] \n" "add %3, %0 \n" - "1: \n" + "1: \n" // d0 = 00 40 01 41 02 42 03 43 // d1 = 10 50 11 51 12 52 13 53 @@ -420,7 +420,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr, "vld1.16 {q13}, [%4] \n" "vld1.8 {q14}, [%5] \n" "add %3, %0 \n" - "1: \n" + "1: \n" // d0 = 00 40 01 41 02 42 03 43 // d1 = 10 50 11 51 12 52 13 53 @@ -512,7 +512,7 @@ void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr, asm volatile( "vmov.u8 d30, #3 \n" - "1: \n" + "1: \n" "vld1.8 {d4}, [%0]! \n" // 01234567 "vld1.8 {d5}, [%3]! \n" // 12345678 @@ -550,7 +550,7 @@ void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr, "vmov.u16 q15, #3 \n" "vmov.u8 d28, #3 \n" - "1: \n" + "1: \n" "vld1.8 {d4}, [%0]! \n" // 01234567 "vld1.8 {d5}, [%5]! \n" // 12345678 @@ -611,7 +611,7 @@ void ScaleRowUp2_Linear_12_NEON(const uint16_t* src_ptr, asm volatile( "vmov.u16 q15, #3 \n" - "1: \n" + "1: \n" "vld1.16 {q1}, [%0]! \n" // 01234567 (16b) "vld1.16 {q0}, [%3]! \n" // 12345678 (16b) @@ -647,7 +647,7 @@ void ScaleRowUp2_Bilinear_12_NEON(const uint16_t* src_ptr, asm volatile( "vmov.u16 q15, #3 \n" - "1: \n" + "1: \n" "vld1.16 {q0}, [%0]! \n" // 01234567 (16b) "vld1.16 {q1}, [%5]! \n" // 12345678 (16b) @@ -698,7 +698,7 @@ void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr, asm volatile( "vmov.u16 d31, #3 \n" - "1: \n" + "1: \n" "vld1.16 {q0}, [%0]! \n" // 01234567 (16b) "vld1.16 {q1}, [%3]! \n" // 12345678 (16b) @@ -743,7 +743,7 @@ void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, "vmov.u16 d31, #3 \n" "vmov.u32 q14, #3 \n" - "1: \n" + "1: \n" "vld1.16 {d0}, [%0]! \n" // 0123 (16b) "vld1.16 {d1}, [%5]! \n" // 1234 (16b) "vmovl.u16 q2, d0 \n" // 0123 (32b) @@ -794,7 +794,7 @@ void ScaleUVRowUp2_Linear_NEON(const uint8_t* src_ptr, asm volatile( "vmov.u8 d30, #3 \n" - "1: \n" + "1: \n" "vld1.8 {d4}, [%0]! \n" // 00112233 (1u1v) "vld1.8 {d5}, [%3]! \n" // 11223344 (1u1v) @@ -832,7 +832,7 @@ void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr, "vmov.u16 q15, #3 \n" "vmov.u8 d28, #3 \n" - "1: \n" + "1: \n" "vld1.8 {d4}, [%0]! \n" // 00112233 (1u1v) "vld1.8 {d5}, [%5]! \n" // 11223344 (1u1v) @@ -893,7 +893,7 @@ void ScaleUVRowUp2_Linear_16_NEON(const uint16_t* src_ptr, asm volatile( "vmov.u16 d30, #3 \n" - "1: \n" + "1: \n" "vld1.16 {q0}, [%0]! \n" // 00112233 (1u1v, 16) "vld1.16 {q1}, [%3]! \n" // 11223344 (1u1v, 16) @@ -939,7 +939,7 @@ void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, "vmov.u16 d30, #3 \n" "vmov.u32 q14, #3 \n" - "1: \n" + "1: \n" "vld1.8 {d0}, [%0]! \n" // 0011 (1u1v) "vld1.8 {d1}, [%5]! \n" // 1122 (1u1v) "vmovl.u16 q2, d0 \n" // 0011 (1u1v, 32b) @@ -989,7 +989,7 @@ void ScaleAddRow_NEON(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { asm volatile( - "1: \n" + "1: \n" "vld1.16 {q1, q2}, [%1] \n" // load accumulator "vld1.8 {q0}, [%0]! 
\n" // load 16 bytes "vaddw.u8 q2, q2, d1 \n" // add @@ -1036,7 +1036,7 @@ void ScaleFilterCols_NEON(uint8_t* dst_ptr, // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx "vadd.s32 q2, q1, q3 \n" "vshl.i32 q0, q3, #1 \n" // 8 * dx - "1: \n" + "1: \n" LOAD2_DATA8_LANE(0) LOAD2_DATA8_LANE(1) LOAD2_DATA8_LANE(2) @@ -1087,7 +1087,7 @@ void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr, int dst_width) { (void)src_stride; asm volatile( - "1: \n" + "1: \n" "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB "subs %2, %2, #8 \n" // 8 processed per loop @@ -1115,7 +1115,7 @@ void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb, int dst_width) { (void)src_stride; asm volatile( - "1: \n" + "1: \n" "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB "subs %2, %2, #8 \n" // 8 processed per loop @@ -1138,7 +1138,7 @@ void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr, asm volatile( // change the stride to row 2 pointer "add %1, %1, %0 \n" - "1: \n" + "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB "subs %3, %3, #8 \n" // 8 processed per loop. @@ -1176,7 +1176,7 @@ void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb, (void)src_stride; asm volatile( "mov r12, %3, lsl #2 \n" - "1: \n" + "1: \n" "vld1.32 {d0[0]}, [%0], r12 \n" "vld1.32 {d0[1]}, [%0], r12 \n" "vld1.32 {d1[0]}, [%0], r12 \n" @@ -1201,7 +1201,7 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb, asm volatile( "mov r12, %4, lsl #2 \n" "add %1, %1, %0 \n" - "1: \n" + "1: \n" "vld1.8 {d0}, [%0], r12 \n" // 4 2x2 blocks -> 2x1 "vld1.8 {d1}, [%1], r12 \n" "vld1.8 {d2}, [%0], r12 \n" @@ -1247,7 +1247,7 @@ void ScaleARGBCols_NEON(uint8_t* dst_argb, int tmp; const uint8_t* src_tmp = src_argb; asm volatile( - "1: \n" + "1: \n" // clang-format off LOAD1_DATA32_LANE(d0, 0) LOAD1_DATA32_LANE(d0, 1) @@ -1300,7 +1300,7 @@ void ScaleARGBFilterCols_NEON(uint8_t* dst_argb, "vmov.i16 q15, #0x7f \n" // 0x7F // x , x + 1 * dx, x + 2 * dx, x + 3 * dx "vadd.s32 q8, q1, q0 \n" - "1: \n" + "1: \n" // d0, d1: a // d2, d3: b LOAD2_DATA32_LANE(d0, d2, 0) @@ -1350,7 +1350,7 @@ void ScaleUVRowDown2_NEON(const uint8_t* src_ptr, int dst_width) { (void)src_stride; asm volatile( - "1: \n" + "1: \n" "vld2.16 {d0, d2}, [%0]! \n" // load 8 UV pixels. "vld2.16 {d1, d3}, [%0]! \n" // load next 8 UV "subs %2, %2, #8 \n" // 8 processed per loop. @@ -1369,7 +1369,7 @@ void ScaleUVRowDown2Linear_NEON(const uint8_t* src_ptr, int dst_width) { (void)src_stride; asm volatile( - "1: \n" + "1: \n" "vld2.16 {d0, d2}, [%0]! \n" // load 8 UV pixels. "vld2.16 {d1, d3}, [%0]! \n" // load next 8 UV "subs %2, %2, #8 \n" // 8 processed per loop. @@ -1390,7 +1390,7 @@ void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr, asm volatile( // change the stride to row 2 pointer "add %1, %1, %0 \n" - "1: \n" + "1: \n" "vld2.8 {d0, d2}, [%0]! \n" // load 8 UV pixels. "vld2.8 {d1, d3}, [%0]! \n" // load next 8 UV "subs %3, %3, #8 \n" // 8 processed per loop. 
@@ -1423,7 +1423,7 @@ void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr, const uint8_t* src3_ptr = src_ptr + src_stepx * 6; (void)src_stride; asm volatile( - "1: \n" + "1: \n" "vld1.16 {d0[0]}, [%0], %6 \n" "vld1.16 {d0[1]}, [%1], %6 \n" "vld1.16 {d0[2]}, [%2], %6 \n" diff --git a/source/scale_neon64.cc b/source/scale_neon64.cc index 848d55416..e7f9f6c03 100644 --- a/source/scale_neon64.cc +++ b/source/scale_neon64.cc @@ -27,7 +27,7 @@ void ScaleRowDown2_NEON(const uint8_t* src_ptr, int dst_width) { (void)src_stride; asm volatile( - "1: \n" + "1: \n" // load even pixels into v0, odd into v1 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" "subs %w2, %w2, #16 \n" // 16 processed per loop @@ -49,7 +49,7 @@ void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr, int dst_width) { (void)src_stride; asm volatile( - "1: \n" + "1: \n" // load even pixels into v0, odd into v1 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" "subs %w2, %w2, #16 \n" // 16 processed per loop @@ -73,7 +73,7 @@ void ScaleRowDown2Box_NEON(const uint8_t* src_ptr, asm volatile( // change the stride to row 2 pointer "add %1, %1, %0 \n" - "1: \n" + "1: \n" "ld1 {v0.16b, v1.16b}, [%0], #32 \n" // load row 1 and post inc "ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc "subs %w3, %w3, #16 \n" // 16 processed per loop @@ -102,7 +102,7 @@ void ScaleRowDown4_NEON(const uint8_t* src_ptr, int dst_width) { (void)src_stride; asm volatile( - "1: \n" + "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // src line 0 "subs %w2, %w2, #16 \n" // 16 processed per loop "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead @@ -123,7 +123,7 @@ void ScaleRowDown4Box_NEON(const uint8_t* src_ptr, const uint8_t* src_ptr2 = src_ptr + src_stride * 2; const uint8_t* src_ptr3 = src_ptr + src_stride * 3; asm volatile( - "1: \n" + "1: \n" "ldp q0, q4, [%0], #32 \n" // load up 16x8 "ldp q1, q5, [%2], #32 \n" "ldp q2, q6, [%3], #32 \n" @@ -175,7 +175,7 @@ void ScaleRowDown34_NEON(const uint8_t* src_ptr, "ld1 {v29.16b}, [%[kShuf34_0]] \n" "ld1 {v30.16b}, [%[kShuf34_1]] \n" "ld1 {v31.16b}, [%[kShuf34_2]] \n" - "1: \n" + "1: \n" "ld1 {v0.16b,v1.16b,v2.16b,v3.16b}, [%[src_ptr]], #64 \n" "subs %w[width], %w[width], #48 \n" "tbl v0.16b, {v0.16b, v1.16b}, v29.16b \n" @@ -201,7 +201,7 @@ void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr, "movi v24.16b, #3 \n" "add %3, %3, %0 \n" - "1: \n" + "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // src line 0 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%3], #64 \n" // src line 1 "subs %w2, %w2, #48 \n" @@ -279,7 +279,7 @@ void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr, "movi v20.16b, #3 \n" "add %3, %3, %0 \n" - "1: \n" + "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // src line 0 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%3], #64 \n" // src line 1 "subs %w2, %w2, #48 \n" @@ -339,7 +339,7 @@ void ScaleRowDown38_NEON(const uint8_t* src_ptr, "subs %w[width], %w[width], #12 \n" "b.eq 2f \n" - "1: \n" + "1: \n" "ldp q0, q1, [%[src_ptr]], #32 \n" "subs %w[width], %w[width], #12 \n" "tbl v2.16b, {v0.16b, v1.16b}, v3.16b \n" @@ -350,7 +350,7 @@ void ScaleRowDown38_NEON(const uint8_t* src_ptr, // Store exactly 12 bytes on the final iteration to avoid writing past // the end of the array. 
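// The comment above is the crux of ScaleRowDown38_NEON's tail handling:
// non-final iterations store a full vector (12 valid bytes plus slop
// that the next pass overwrites), so the last 12 bytes get exact-width
// stores. Hypothetical outline only; Shuffle38 is a stand-in for the
// tbl lookup and is not a real libyuv function (assumes <string.h>):
static void Shuffle38(const uint8_t* src, uint8_t out[16]);

static void RowDown38_sketch(const uint8_t* src, uint8_t* dst, int width) {
  uint8_t v[16];
  for (; width > 12; width -= 12) {
    Shuffle38(src, v);   // 12 valid bytes, 4 bytes of slop
    memcpy(dst, v, 16);  // slop lands where the next pass writes
    src += 32;
    dst += 12;
  }
  Shuffle38(src, v);
  memcpy(dst, v, 12);    // final group: exactly 12 bytes, no overrun
}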
- "2: \n" + "2: \n" "ldp q0, q1, [%[src_ptr]] \n" "tbl v2.16b, {v0.16b, v1.16b}, v3.16b \n" "st1 {v2.8b}, [%[dst_ptr]], #8 \n" @@ -384,7 +384,7 @@ void ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr, "ld1 {v31.16b}, [%[tblArray4]] \n" "ld1 {v30.16b}, [%[div996]] \n" - "1: \n" + "1: \n" "ldp q20, q0, [%[src_ptr]], #32 \n" "ldp q21, q1, [%[src_ptr1]], #32 \n" "ldp q22, q2, [%[src_ptr2]], #32 \n" @@ -451,7 +451,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr, "ld1 {v31.16b}, [%[tblArray3]] \n" "ld1 {v30.8h}, [%[div664]] \n" - "1: \n" + "1: \n" "ldp q20, q0, [%[src_ptr]], #32 \n" // abcdefgh ... "ldp q21, q1, [%[src_ptr1]], #32 \n" // ijklmnop ... "subs %w[width], %w[width], #12 \n" @@ -500,7 +500,7 @@ void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr, asm volatile( "movi v31.16b, #3 \n" - "1: \n" + "1: \n" "ldr q0, [%0], #16 \n" // 0123456789abcdef "ldr q1, [%1], #16 \n" // 123456789abcdefg "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead @@ -547,7 +547,7 @@ void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr, "movi v31.8b, #3 \n" "movi v30.8h, #3 \n" - "1: \n" + "1: \n" "ldr d0, [%0], #8 \n" // 01234567 "ldr d1, [%2], #8 \n" // 12345678 "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead @@ -602,7 +602,7 @@ void ScaleRowUp2_Linear_12_NEON(const uint16_t* src_ptr, asm volatile( "movi v31.8h, #3 \n" - "1: \n" + "1: \n" "ld1 {v0.8h}, [%0], #16 \n" // 01234567 (16b) "ld1 {v1.8h}, [%1], #16 \n" // 12345678 (16b) "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead @@ -639,7 +639,7 @@ void ScaleRowUp2_Bilinear_12_NEON(const uint16_t* src_ptr, asm volatile( "movi v31.8h, #3 \n" - "1: \n" + "1: \n" "ld1 {v2.8h}, [%0], #16 \n" // 01234567 (16b) "ld1 {v3.8h}, [%2], #16 \n" // 12345678 (16b) "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead @@ -693,7 +693,7 @@ void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr, asm volatile( "movi v31.8h, #3 \n" - "1: \n" + "1: \n" "ld1 {v0.8h}, [%0], #16 \n" // 01234567 (16b) "ld1 {v1.8h}, [%1], #16 \n" // 12345678 (16b) "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead @@ -739,7 +739,7 @@ void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, "movi v31.4h, #3 \n" "movi v30.4s, #3 \n" - "1: \n" + "1: \n" "ldr d0, [%0], #8 \n" // 0123 (16b) "ldr d1, [%2], #8 \n" // 1234 (16b) "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead @@ -793,7 +793,7 @@ void ScaleUVRowUp2_Linear_NEON(const uint8_t* src_ptr, asm volatile( "movi v31.8b, #3 \n" - "1: \n" + "1: \n" "ldr d0, [%0], #8 \n" // 00112233 (1u1v) "ldr d1, [%1], #8 \n" // 11223344 (1u1v) "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead @@ -833,7 +833,7 @@ void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr, "movi v31.8b, #3 \n" "movi v30.8h, #3 \n" - "1: \n" + "1: \n" "ldr d0, [%0], #8 \n" "ldr d1, [%2], #8 \n" "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead @@ -888,7 +888,7 @@ void ScaleUVRowUp2_Linear_16_NEON(const uint16_t* src_ptr, asm volatile( "movi v31.8h, #3 \n" - "1: \n" + "1: \n" "ld1 {v0.8h}, [%0], #16 \n" // 01234567 (16b) "ld1 {v1.8h}, [%1], #16 \n" // 12345678 (16b) "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead @@ -936,7 +936,7 @@ void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, "movi v31.4h, #3 \n" "movi v30.4s, #3 \n" - "1: \n" + "1: \n" "ldr d0, [%0], #8 \n" "ldr d1, [%2], #8 \n" "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead @@ -988,7 +988,7 @@ void ScaleAddRow_NEON(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { asm volatile( - "1: \n" + "1: \n" "ld1 {v1.8h, v2.8h}, [%1] \n" // 
load accumulator "ld1 {v0.16b}, [%0], #16 \n" // load 16 bytes "uaddw2 v2.8h, v2.8h, v0.16b \n" // add @@ -1042,7 +1042,7 @@ void ScaleFilterCols_NEON(uint8_t* dst_ptr, "trn1 v20.8h, v1.8h, v0.8h \n" "trn1 v21.8h, v2.8h, v0.8h \n" - "1: \n" SCALE_FILTER_COLS_STEP_ADDR + "1: \n" SCALE_FILTER_COLS_STEP_ADDR "ldr h6, [%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR "ld1 {v6.h}[1], [%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR "ld1 {v6.h}[2], [%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR @@ -1090,7 +1090,7 @@ void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr, int dst_width) { (void)src_stride; asm volatile( - "1: \n" + "1: \n" "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[src]], #64 \n" "subs %w[width], %w[width], #8 \n" "prfm pldl1keep, [%[src], 448] \n" @@ -1112,7 +1112,7 @@ void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb, (void)src_stride; const uint8_t* src_argb1 = src_argb + 32; asm volatile( - "1: \n" + "1: \n" "ld2 {v0.4s, v1.4s}, [%[src]] \n" "add %[src], %[src], #64 \n" "ld2 {v2.4s, v3.4s}, [%[src1]] \n" @@ -1136,7 +1136,7 @@ void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr, int dst_width) { const uint8_t* src_ptr1 = src_ptr + src_stride; asm volatile( - "1: \n" + "1: \n" "ld2 {v0.4s, v1.4s}, [%[src]], #32 \n" "ld2 {v20.4s, v21.4s}, [%[src1]], #32 \n" "uaddl v2.8h, v0.8b, v1.8b \n" @@ -1167,7 +1167,7 @@ void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb, int64_t i = 0; (void)src_stride; asm volatile( - "1: \n" + "1: \n" "ldr w10, [%[src], %[i]] \n" "ldr w11, [%[src1], %[i]] \n" "ldr w12, [%[src2], %[i]] \n" @@ -1196,7 +1196,7 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb, int dst_width) { asm volatile( "add %1, %1, %0 \n" - "1: \n" + "1: \n" "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 -> 2x1 "ld1 {v1.8b}, [%1], %4 \n" "ld1 {v2.8b}, [%0], %4 \n" @@ -1248,7 +1248,7 @@ void ScaleARGBCols_NEON(uint8_t* dst_argb, int64_t dx64 = (int64_t)dx; // NOLINT int64_t tmp64; asm volatile ( - "1: \n" + "1: \n" // clang-format off LOAD1_DATA32_LANE(v0, 0) LOAD1_DATA32_LANE(v0, 1) @@ -1306,7 +1306,7 @@ void ScaleARGBFilterCols_NEON(uint8_t* dst_argb, "add v5.4s, v1.4s, v0.4s \n" "ldr q18, [%[kIndices]] \n" - "1: \n" // + "1: \n" // SCALE_ARGB_FILTER_COLS_STEP_ADDR "ldr d1, [%6] \n" // SCALE_ARGB_FILTER_COLS_STEP_ADDR @@ -1359,7 +1359,7 @@ void ScaleRowDown2_16_NEON(const uint16_t* src_ptr, "subs %w[dst_width], %w[dst_width], #32 \n" "b.lt 2f \n" - "1: \n" + "1: \n" "ldp q0, q1, [%[src_ptr]] \n" "ldp q2, q3, [%[src_ptr], #32] \n" "ldp q4, q5, [%[src_ptr], #64] \n" @@ -1376,7 +1376,7 @@ void ScaleRowDown2_16_NEON(const uint16_t* src_ptr, "add %[dst_ptr], %[dst_ptr], #64 \n" "b.ge 1b \n" - "2: \n" + "2: \n" "adds %w[dst_width], %w[dst_width], #32 \n" "b.eq 99f \n" @@ -1386,7 +1386,7 @@ void ScaleRowDown2_16_NEON(const uint16_t* src_ptr, "uzp2 v1.8h, v2.8h, v3.8h \n" "stp q0, q1, [%[dst_ptr]] \n" - "99: \n" + "99: \n" : [src_ptr] "+r"(src_ptr), // %[src_ptr] [dst_ptr] "+r"(dst), // %[dst_ptr] [dst_width] "+r"(dst_width) // %[dst_width] @@ -1400,7 +1400,7 @@ void ScaleRowDown2Linear_16_NEON(const uint16_t* src_ptr, int dst_width) { (void)src_stride; asm volatile( - "1: \n" + "1: \n" "ld2 {v0.8h, v1.8h}, [%[src_ptr]], #32 \n" "ld2 {v2.8h, v3.8h}, [%[src_ptr]], #32 \n" "subs %w[dst_width], %w[dst_width], #16 \n" @@ -1424,7 +1424,7 @@ void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr, asm volatile( // change the stride to row 2 pointer "add %1, %0, %1, lsl #1 \n" // ptr + stide * 2 - "1: \n" + "1: \n" "ld1 {v0.8h, v1.8h}, [%0], #32 \n" // load row 1 and post inc "ld1 {v2.8h, v3.8h}, [%1], 
#32 \n" // load row 2 and post inc "subs %w3, %w3, #8 \n" // 8 processed per loop @@ -1453,7 +1453,7 @@ void ScaleUVRowDown2_NEON(const uint8_t* src_ptr, int dst_width) { (void)src_stride; asm volatile( - "1: \n" + "1: \n" "ld2 {v0.8h,v1.8h}, [%0], #32 \n" // load 16 UV "subs %w2, %w2, #8 \n" // 8 processed per loop. "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead @@ -1472,7 +1472,7 @@ void ScaleUVRowDown2Linear_NEON(const uint8_t* src_ptr, int dst_width) { (void)src_stride; asm volatile( - "1: \n" + "1: \n" "ld2 {v0.8h,v1.8h}, [%0], #32 \n" // load 16 UV "subs %w2, %w2, #8 \n" // 8 processed per loop. "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add @@ -1493,7 +1493,7 @@ void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr, asm volatile( // change the stride to row 2 pointer "add %1, %1, %0 \n" - "1: \n" + "1: \n" "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 UV "subs %w3, %w3, #8 \n" // 8 processed per loop. "uaddlp v0.8h, v0.16b \n" // U 16 bytes -> 8 shorts. @@ -1526,7 +1526,7 @@ void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr, const uint8_t* src3_ptr = src_ptr + src_stepx * 6; (void)src_stride; asm volatile( - "1: \n" + "1: \n" "ld1 {v0.h}[0], [%0], %6 \n" "ld1 {v1.h}[0], [%1], %6 \n" "ld1 {v2.h}[0], [%2], %6 \n"