diff --git a/README.chromium b/README.chromium index 4c281834f..6412e18ae 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: https://chromium.googlesource.com/libyuv/libyuv/ -Version: 1888 +Version: 1889 License: BSD License File: LICENSE Shipped: yes diff --git a/include/libyuv/macros_msa.h b/include/libyuv/macros_msa.h index b9a44fcce..6434a4da0 100644 --- a/include/libyuv/macros_msa.h +++ b/include/libyuv/macros_msa.h @@ -20,9 +20,9 @@ ({ \ const uint8_t* psrc_lw_m = (const uint8_t*)(psrc); \ uint32_t val_m; \ - asm volatile("lw %[val_m], %[psrc_lw_m] \n" \ - : [val_m] "=r"(val_m) \ - : [psrc_lw_m] "m"(*psrc_lw_m)); \ + asm("lw %[val_m], %[psrc_lw_m] \n" \ + : [val_m] "=r"(val_m) \ + : [psrc_lw_m] "m"(*psrc_lw_m)); \ val_m; \ }) @@ -31,9 +31,9 @@ ({ \ const uint8_t* psrc_ld_m = (const uint8_t*)(psrc); \ uint64_t val_m = 0; \ - asm volatile("ld %[val_m], %[psrc_ld_m] \n" \ - : [val_m] "=r"(val_m) \ - : [psrc_ld_m] "m"(*psrc_ld_m)); \ + asm("ld %[val_m], %[psrc_ld_m] \n" \ + : [val_m] "=r"(val_m) \ + : [psrc_ld_m] "m"(*psrc_ld_m)); \ val_m; \ }) #else // !(__mips == 64) @@ -55,9 +55,9 @@ ({ \ uint8_t* pdst_sw_m = (uint8_t*)(pdst); /* NOLINT */ \ uint32_t val_m = (val); \ - asm volatile("sw %[val_m], %[pdst_sw_m] \n" \ - : [pdst_sw_m] "=m"(*pdst_sw_m) \ - : [val_m] "r"(val_m)); \ + asm("sw %[val_m], %[pdst_sw_m] \n" \ + : [pdst_sw_m] "=m"(*pdst_sw_m) \ + : [val_m] "r"(val_m)); \ }) #if (__mips == 64) @@ -65,9 +65,9 @@ ({ \ uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */ \ uint64_t val_m = (val); \ - asm volatile("sd %[val_m], %[pdst_sd_m] \n" \ - : [pdst_sd_m] "=m"(*pdst_sd_m) \ - : [val_m] "r"(val_m)); \ + asm("sd %[val_m], %[pdst_sd_m] \n" \ + : [pdst_sd_m] "=m"(*pdst_sd_m) \ + : [val_m] "r"(val_m)); \ }) #else // !(__mips == 64) #define SD(val, pdst) \ @@ -86,8 +86,7 @@ uint8_t* psrc_lw_m = (uint8_t*)(psrc); \ uint32_t val_lw_m; \ \ - __asm__ volatile( \ - "lwr %[val_lw_m], 0(%[psrc_lw_m]) \n\t" \ + asm("lwr %[val_lw_m], 0(%[psrc_lw_m]) \n\t" \ "lwl %[val_lw_m], 3(%[psrc_lw_m]) \n\t" \ \ : [val_lw_m] "=&r"(val_lw_m) \ @@ -102,8 +101,7 @@ uint8_t* psrc_ld_m = (uint8_t*)(psrc); \ uint64_t val_ld_m = 0; \ \ - __asm__ volatile( \ - "ldr %[val_ld_m], 0(%[psrc_ld_m]) \n\t" \ + asm("ldr %[val_ld_m], 0(%[psrc_ld_m]) \n\t" \ "ldl %[val_ld_m], 7(%[psrc_ld_m]) \n\t" \ \ : [val_ld_m] "=&r"(val_ld_m) \ @@ -130,9 +128,9 @@ ({ \ uint8_t* pdst_sw_m = (uint8_t*)(pdst); /* NOLINT */ \ uint32_t val_m = (val); \ - asm volatile("usw %[val_m], %[pdst_sw_m] \n" \ - : [pdst_sw_m] "=m"(*pdst_sw_m) \ - : [val_m] "r"(val_m)); \ + asm("usw %[val_m], %[pdst_sw_m] \n" \ + : [pdst_sw_m] "=m"(*pdst_sw_m) \ + : [val_m] "r"(val_m)); \ }) #define SD(val, pdst) \ diff --git a/include/libyuv/version.h b/include/libyuv/version.h index d3099004a..001600c90 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1888 +#define LIBYUV_VERSION 1889 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/compare_gcc.cc b/source/compare_gcc.cc index 970f950f4..492969259 100644 --- a/source/compare_gcc.cc +++ b/source/compare_gcc.cc @@ -29,7 +29,8 @@ uint32_t HammingDistance_SSE42(const uint8_t* src_a, int count) { uint64_t diff; - asm("xor %3,%3 \n" + asm volatile ( + "xor %3,%3 \n" "xor %%r8,%%r8 \n" "xor %%r9,%%r9 \n" "xor %%r10,%%r10 \n" @@ -76,7 +77,7 @@ uint32_t HammingDistance_SSE42(const uint8_t* src_a, int count) { uint32_t diff = 0u; - asm( + asm volatile ( // 
Process 16 bytes per loop. LABELALIGN "1: \n" @@ -120,7 +121,8 @@ uint32_t HammingDistance_SSSE3(const uint8_t* src_a, int count) { uint32_t diff; - asm("movdqa %4,%%xmm2 \n" + asm volatile ( + "movdqa %4,%%xmm2 \n" "movdqa %5,%%xmm3 \n" "pxor %%xmm0,%%xmm0 \n" "pxor %%xmm1,%%xmm1 \n" @@ -178,7 +180,8 @@ uint32_t HammingDistance_AVX2(const uint8_t* src_a, int count) { uint32_t diff; - asm("vbroadcastf128 %4,%%ymm2 \n" + asm volatile ( + "vbroadcastf128 %4,%%ymm2 \n" "vbroadcastf128 %5,%%ymm3 \n" "vpxor %%ymm0,%%ymm0,%%ymm0 \n" "vpxor %%ymm1,%%ymm1,%%ymm1 \n" @@ -231,7 +234,8 @@ uint32_t SumSquareError_SSE2(const uint8_t* src_a, const uint8_t* src_b, int count) { uint32_t sse; - asm("pxor %%xmm0,%%xmm0 \n" + asm volatile ( + "pxor %%xmm0,%%xmm0 \n" "pxor %%xmm5,%%xmm5 \n" LABELALIGN @@ -296,7 +300,8 @@ static const uvec32 kHashMul3 = { uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) { uint32_t hash; - asm("movd %2,%%xmm0 \n" + asm volatile ( + "movd %2,%%xmm0 \n" "pxor %%xmm7,%%xmm7 \n" "movdqa %4,%%xmm6 \n" diff --git a/source/compare_neon.cc b/source/compare_neon.cc index afdd60121..c2aea6074 100644 --- a/source/compare_neon.cc +++ b/source/compare_neon.cc @@ -28,7 +28,7 @@ uint32_t HammingDistance_NEON(const uint8_t* src_a, int count) { uint32_t diff; - asm volatile( + asm volatile ( "vmov.u16 q4, #0 \n" // accumulator "1: \n" @@ -58,7 +58,7 @@ uint32_t SumSquareError_NEON(const uint8_t* src_a, const uint8_t* src_b, int count) { uint32_t sse; - asm volatile( + asm volatile ( "vmov.u8 q8, #0 \n" "vmov.u8 q10, #0 \n" "vmov.u8 q9, #0 \n" diff --git a/source/compare_neon64.cc b/source/compare_neon64.cc index b61b9f7ac..07292deff 100644 --- a/source/compare_neon64.cc +++ b/source/compare_neon64.cc @@ -26,7 +26,7 @@ uint32_t HammingDistance_NEON(const uint8_t* src_a, const uint8_t* src_b, int count) { uint32_t diff; - asm volatile( + asm volatile ( "movi v4.8h, #0 \n" "1: \n" @@ -55,7 +55,7 @@ uint32_t SumSquareError_NEON(const uint8_t* src_a, const uint8_t* src_b, int count) { uint32_t sse; - asm volatile( + asm volatile ( "movi v16.16b, #0 \n" "movi v17.16b, #0 \n" "movi v18.16b, #0 \n" @@ -157,7 +157,7 @@ uint32_t HammingDistance_NEON_DotProd(const uint8_t* src_a, const uint8_t* src_b, int count) { uint32_t diff; - asm volatile( + asm volatile ( "movi v4.4s, #0 \n" "movi v5.4s, #0 \n" "movi v6.16b, #1 \n" @@ -190,7 +190,7 @@ uint32_t SumSquareError_NEON_DotProd(const uint8_t* src_a, int count) { // count is guaranteed to be a multiple of 32. uint32_t sse; - asm volatile( + asm volatile ( "movi v4.4s, #0 \n" "movi v5.4s, #0 \n" diff --git a/source/rotate_gcc.cc b/source/rotate_gcc.cc index 034161421..48926b687 100644 --- a/source/rotate_gcc.cc +++ b/source/rotate_gcc.cc @@ -26,7 +26,7 @@ void TransposeWx8_SSSE3(const uint8_t* src, uint8_t* dst, int dst_stride, int width) { - asm( + asm volatile ( // Read in the data from the source pointer. // First round of bit swap. LABELALIGN @@ -116,7 +116,7 @@ void TransposeWx8_Fast_SSSE3(const uint8_t* src, uint8_t* dst, int dst_stride, int width) { - asm( + asm volatile ( // Read in the data from the source pointer. // First round of bit swap. LABELALIGN @@ -261,7 +261,7 @@ void TransposeUVWx8_SSE2(const uint8_t* src, uint8_t* dst_b, int dst_stride_b, int width) { - asm( + asm volatile ( // Read in the data from the source pointer. // First round of bit swap. LABELALIGN @@ -391,7 +391,7 @@ void Transpose4x4_32_SSE2(const uint8_t* src, uint8_t* dst, int dst_stride, int width) { - asm( + asm volatile ( // Main loop transpose 4x4. 
Read a column, write a row. "1: \n" "movdqu (%0),%%xmm0 \n" // a b c d @@ -447,7 +447,7 @@ void Transpose4x4_32_AVX2(const uint8_t* src, uint8_t* dst, int dst_stride, int width) { - asm( + asm volatile ( // Main loop transpose 2 blocks of 4x4. Read a column, write a row. "1: \n" "vmovdqu (%0),%%xmm0 \n" // a b c d diff --git a/source/rotate_msa.cc b/source/rotate_msa.cc index d4e62b12e..99bdca65b 100644 --- a/source/rotate_msa.cc +++ b/source/rotate_msa.cc @@ -51,6 +51,16 @@ extern "C" { out3 = (v16u8)__msa_ilvl_d((v2i64)in3, (v2i64)in2); \ } +void TransposeWx16_C(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width) { + TransposeWx8_C(src, src_stride, dst, dst_stride, width); + TransposeWx8_C((src + 8 * src_stride), src_stride, (dst + 8), dst_stride, + width); +} + void TransposeUVWx16_C(const uint8_t* src, int src_stride, uint8_t* dst_a, diff --git a/source/rotate_neon.cc b/source/rotate_neon.cc index e0defb480..334a9f998 100644 --- a/source/rotate_neon.cc +++ b/source/rotate_neon.cc @@ -27,7 +27,7 @@ void TransposeWx8_NEON(const uint8_t* src, int dst_stride, int width) { const uint8_t* temp; - asm( + asm volatile ( // loops are on blocks of 8. loop will stop when // counter gets to or below 0. starting the counter // at w-8 allow for this @@ -95,7 +95,7 @@ void TransposeUVWx8_NEON(const uint8_t* src, int dst_stride_b, int width) { const uint8_t* temp; - asm( + asm volatile ( // loops are on blocks of 8. loop will stop when // counter gets to or below 0. starting the counter // at w-8 allow for this @@ -184,7 +184,7 @@ void Transpose4x4_32_NEON(const uint8_t* src, uint8_t* dst1 = dst + dst_stride; uint8_t* dst2 = dst1 + dst_stride; uint8_t* dst3 = dst2 + dst_stride; - asm volatile( + asm volatile ( // Main loop transpose 4x4. Read a column, write a row. "1: \n" "vld4.32 {d0[0], d2[0], d4[0], d6[0]}, [%0], %9 \n" diff --git a/source/rotate_neon64.cc b/source/rotate_neon64.cc index 0062d6746..dbf08edac 100644 --- a/source/rotate_neon64.cc +++ b/source/rotate_neon64.cc @@ -27,7 +27,8 @@ void TransposeWx16_NEON(const uint8_t* src, int dst_stride, int width) { const uint8_t* src_temp; - asm("1: \n" + asm volatile ( + "1: \n" "mov %[src_temp], %[src] \n" "ld1 {v16.16b}, [%[src_temp]], %[src_stride] \n" @@ -144,7 +145,7 @@ void TransposeUVWx8_NEON(const uint8_t* src, int dst_stride_b, int width) { const uint8_t* temp; - asm( + asm volatile ( // loops are on blocks of 8. loop will stop when // counter gets to or below 0. starting the counter // at w-8 allow for this @@ -238,7 +239,7 @@ void Transpose4x4_32_NEON(const uint8_t* src, uint8_t* dst1 = dst + dst_stride; uint8_t* dst2 = dst1 + dst_stride; uint8_t* dst3 = dst2 + dst_stride; - asm volatile( + asm volatile ( // Main loop transpose 4x4. Read a column, write a row. 
"1: \n" "ld4 {v0.s, v1.s, v2.s, v3.s}[0], [%0], %9 \n" diff --git a/source/row_gcc.cc b/source/row_gcc.cc index b601b60b4..f8f41860a 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -161,7 +161,8 @@ static const lvec8 kShuffleNV21 = { #ifdef HAS_J400TOARGBROW_SSE2 void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width) { - asm("pcmpeqb %%xmm5,%%xmm5 \n" + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" "pslld $0x18,%%xmm5 \n" LABELALIGN @@ -191,7 +192,8 @@ void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width) { void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) { - asm("pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000 + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000 "pslld $0x18,%%xmm5 \n" "movdqa %3,%%xmm4 \n" @@ -228,7 +230,8 @@ void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24, } void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width) { - asm("pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000 + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000 "pslld $0x18,%%xmm5 \n" "movdqa %3,%%xmm4 \n" @@ -266,7 +269,8 @@ void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width) { // Same code as RAWToARGB with different shuffler and A in low bits void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { - asm("pcmpeqb %%xmm5,%%xmm5 \n" // 0x000000ff + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" // 0x000000ff "psrld $0x18,%%xmm5 \n" "movdqa %3,%%xmm4 \n" @@ -305,7 +309,8 @@ void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { - asm("movdqa %3,%%xmm3 \n" + asm volatile ( + "movdqa %3,%%xmm3 \n" "movdqa %4,%%xmm4 \n" "movdqa %5,%%xmm5 \n" @@ -334,7 +339,8 @@ void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, } void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm("mov $0x1080108,%%eax \n" + asm volatile ( + "mov $0x1080108,%%eax \n" "movd %%eax,%%xmm5 \n" "pshufd $0x0,%%xmm5,%%xmm5 \n" "mov $0x20802080,%%eax \n" @@ -381,7 +387,8 @@ void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { } void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm("mov $0x1080108,%%eax \n" + asm volatile ( + "mov $0x1080108,%%eax \n" "movd %%eax,%%xmm5 \n" "pshufd $0x0,%%xmm5,%%xmm5 \n" "mov $0x42004200,%%eax \n" @@ -431,7 +438,8 @@ void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { } void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm("mov $0xf0f0f0f,%%eax \n" + asm volatile ( + "mov $0xf0f0f0f,%%eax \n" "movd %%eax,%%xmm4 \n" "pshufd $0x0,%%xmm4,%%xmm4 \n" "movdqa %%xmm4,%%xmm5 \n" @@ -467,7 +475,8 @@ void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { } void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { - asm("movdqa %3,%%xmm6 \n" + asm volatile ( + "movdqa %3,%%xmm6 \n" LABELALIGN "1: \n" @@ -504,7 +513,8 @@ void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { } void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) { - asm("movdqa %3,%%xmm6 \n" + asm volatile ( + "movdqa %3,%%xmm6 \n" LABELALIGN "1: \n" @@ -545,7 +555,8 @@ void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) { static const lvec32 kPermdRGB24_AVX = {0, 1, 2, 4, 5, 6, 3, 7}; void ARGBToRGB24Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { - asm("vbroadcastf128 %3,%%ymm6 \n" + asm volatile ( 
+ "vbroadcastf128 %3,%%ymm6 \n" "vmovdqa %4,%%ymm7 \n" LABELALIGN @@ -604,7 +615,8 @@ static const ulvec8 kPermARGBToRGB24_2 = { 50u, 52u, 53u, 54u, 56u, 57u, 58u, 60u, 61u, 62u}; void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width) { - asm("vmovdqa %3,%%ymm5 \n" + asm volatile ( + "vmovdqa %3,%%ymm5 \n" "vmovdqa %4,%%ymm6 \n" "vmovdqa %5,%%ymm7 \n" @@ -637,7 +649,8 @@ void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width) { #ifdef HAS_ARGBTORAWROW_AVX2 void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { - asm("vbroadcastf128 %3,%%ymm6 \n" + asm volatile ( + "vbroadcastf128 %3,%%ymm6 \n" "vmovdqa %4,%%ymm7 \n" LABELALIGN @@ -681,7 +694,8 @@ void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { #endif void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm("pcmpeqb %%xmm3,%%xmm3 \n" + asm volatile ( + "pcmpeqb %%xmm3,%%xmm3 \n" "psrld $0x1b,%%xmm3 \n" "pcmpeqb %%xmm4,%%xmm4 \n" "psrld $0x1a,%%xmm4 \n" @@ -720,7 +734,8 @@ void ARGBToRGB565DitherRow_SSE2(const uint8_t* src, uint8_t* dst, uint32_t dither4, int width) { - asm("movd %3,%%xmm6 \n" + asm volatile ( + "movd %3,%%xmm6 \n" "punpcklbw %%xmm6,%%xmm6 \n" "movdqa %%xmm6,%%xmm7 \n" "punpcklwd %%xmm6,%%xmm6 \n" @@ -767,7 +782,8 @@ void ARGBToRGB565DitherRow_AVX2(const uint8_t* src, uint8_t* dst, uint32_t dither4, int width) { - asm("vbroadcastss %3,%%xmm6 \n" + asm volatile ( + "vbroadcastss %3,%%xmm6 \n" "vpunpcklbw %%xmm6,%%xmm6,%%xmm6 \n" "vpermq $0xd8,%%ymm6,%%ymm6 \n" "vpunpcklwd %%ymm6,%%ymm6,%%ymm6 \n" @@ -808,7 +824,8 @@ void ARGBToRGB565DitherRow_AVX2(const uint8_t* src, #endif // HAS_ARGBTORGB565DITHERROW_AVX2 void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm("pcmpeqb %%xmm4,%%xmm4 \n" + asm volatile ( + "pcmpeqb %%xmm4,%%xmm4 \n" "psrld $0x1b,%%xmm4 \n" "movdqa %%xmm4,%%xmm5 \n" "pslld $0x5,%%xmm5 \n" @@ -848,7 +865,8 @@ void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { } void ARGBToARGB4444Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm("pcmpeqb %%xmm4,%%xmm4 \n" + asm volatile ( + "pcmpeqb %%xmm4,%%xmm4 \n" "psllw $0xc,%%xmm4 \n" "movdqa %%xmm4,%%xmm3 \n" "psrlw $0x8,%%xmm3 \n" @@ -910,7 +928,8 @@ static const uint32_t kMaskAG10 = 0xc000ff00; static const uint32_t kMulAG10 = 64 * 65536 + 1028; void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { - asm("movdqa %3,%%xmm2 \n" // shuffler for RB + asm volatile ( + "movdqa %3,%%xmm2 \n" // shuffler for RB "movd %4,%%xmm3 \n" // multipler for RB "movd %5,%%xmm4 \n" // mask for R10 B10 "movd %6,%%xmm5 \n" // mask for AG @@ -948,7 +967,8 @@ void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { } void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { - asm("movdqa %3,%%xmm2 \n" // shuffler for RB + asm volatile ( + "movdqa %3,%%xmm2 \n" // shuffler for RB "movd %4,%%xmm3 \n" // multipler for RB "movd %5,%%xmm4 \n" // mask for R10 B10 "movd %6,%%xmm5 \n" // mask for AG @@ -987,7 +1007,8 @@ void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { #ifdef HAS_ARGBTOAR30ROW_AVX2 void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { - asm("vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB + asm volatile ( + "vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB "vbroadcastss %4,%%ymm3 \n" // multipler for RB "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10 "vbroadcastss %6,%%ymm5 \n" // mask for AG @@ -1023,7 +1044,8 @@ void 
ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { #ifdef HAS_ABGRTOAR30ROW_AVX2 void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { - asm("vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB + asm volatile ( + "vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB "vbroadcastss %4,%%ymm3 \n" // multipler for RB "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10 "vbroadcastss %6,%%ymm5 \n" // mask for AG @@ -1068,7 +1090,7 @@ static const uvec8 kShuffleARGBToAB64Hi = {10, 10, 9, 9, 8, 8, 11, 11, void ARGBToAR64Row_SSSE3(const uint8_t* src_argb, uint16_t* dst_ar64, int width) { - asm(LABELALIGN + asm volatile (LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqa %%xmm0,%%xmm1 \n" @@ -1090,7 +1112,8 @@ void ARGBToAR64Row_SSSE3(const uint8_t* src_argb, void ARGBToAB64Row_SSSE3(const uint8_t* src_argb, uint16_t* dst_ab64, int width) { - asm("movdqa %3,%%xmm2 \n" + asm volatile ( + "movdqa %3,%%xmm2 \n" "movdqa %4,%%xmm3 \n" LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" @@ -1114,7 +1137,7 @@ void ARGBToAB64Row_SSSE3(const uint8_t* src_argb, void AR64ToARGBRow_SSSE3(const uint16_t* src_ar64, uint8_t* dst_argb, int width) { - asm(LABELALIGN + asm volatile (LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -1136,7 +1159,8 @@ void AR64ToARGBRow_SSSE3(const uint16_t* src_ar64, void AB64ToARGBRow_SSSE3(const uint16_t* src_ab64, uint8_t* dst_argb, int width) { - asm("movdqa %3,%%xmm2 \n" + asm volatile ( + "movdqa %3,%%xmm2 \n" LABELALIGN "1: \n" @@ -1162,7 +1186,7 @@ void AB64ToARGBRow_SSSE3(const uint16_t* src_ab64, void ARGBToAR64Row_AVX2(const uint8_t* src_argb, uint16_t* dst_ar64, int width) { - asm(LABELALIGN + asm volatile (LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" "vpermq $0xd8,%%ymm0,%%ymm0 \n" @@ -1187,7 +1211,8 @@ void ARGBToAR64Row_AVX2(const uint8_t* src_argb, void ARGBToAB64Row_AVX2(const uint8_t* src_argb, uint16_t* dst_ab64, int width) { - asm("vbroadcastf128 %3,%%ymm2 \n" + asm volatile ( + "vbroadcastf128 %3,%%ymm2 \n" "vbroadcastf128 %4,%%ymm3 \n" LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" @@ -1214,7 +1239,7 @@ void ARGBToAB64Row_AVX2(const uint8_t* src_argb, void AR64ToARGBRow_AVX2(const uint16_t* src_ar64, uint8_t* dst_argb, int width) { - asm(LABELALIGN + asm volatile (LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" @@ -1240,7 +1265,8 @@ void AR64ToARGBRow_AVX2(const uint16_t* src_ar64, void AB64ToARGBRow_AVX2(const uint16_t* src_ab64, uint8_t* dst_argb, int width) { - asm("vbroadcastf128 %3,%%ymm2 \n" LABELALIGN + asm volatile ( + "vbroadcastf128 %3,%%ymm2 \n" LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" @@ -1334,7 +1360,8 @@ void AB64ToARGBRow_AVX2(const uint16_t* src_ab64, #ifdef HAS_ARGBTOYROW_SSSE3 // Convert 16 ARGB pixels (64 bytes) to 16 Y values. void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { - asm("movdqa %3,%%xmm4 \n" + asm volatile ( + "movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" "movdqa %5,%%xmm7 \n" @@ -1354,7 +1381,8 @@ void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { // Convert 16 ARGB pixels (64 bytes) to 16 YJ values. // Same as ARGBToYRow but different coefficients, no add 16. 
void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { - asm("movdqa %3,%%xmm4 \n" + asm volatile ( + "movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" LABELALIGN RGBTOY(xmm5) @@ -1371,7 +1399,8 @@ void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { // Convert 16 ABGR pixels (64 bytes) to 16 YJ values. // Same as ABGRToYRow but different coefficients, no add 16. void ABGRToYJRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) { - asm("movdqa %3,%%xmm4 \n" + asm volatile ( + "movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" LABELALIGN RGBTOY(xmm5) @@ -1388,7 +1417,8 @@ void ABGRToYJRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) { // Convert 16 ARGB pixels (64 bytes) to 16 YJ values. // Same as ARGBToYRow but different coefficients, no add 16. void RGBAToYJRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) { - asm("movdqa %3,%%xmm4 \n" + asm volatile ( + "movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" LABELALIGN RGBTOY(xmm5) @@ -1411,7 +1441,8 @@ static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7}; // Convert 32 ARGB pixels (128 bytes) to 32 Y values. void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { - asm("vbroadcastf128 %3,%%ymm4 \n" + asm volatile ( + "vbroadcastf128 %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" "vbroadcastf128 %5,%%ymm7 \n" "vmovdqu %6,%%ymm6 \n" LABELALIGN RGBTOY_AVX2( @@ -1431,7 +1462,8 @@ void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { #ifdef HAS_ABGRTOYROW_AVX2 // Convert 32 ABGR pixels (128 bytes) to 32 Y values. void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) { - asm("vbroadcastf128 %3,%%ymm4 \n" + asm volatile ( + "vbroadcastf128 %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" "vbroadcastf128 %5,%%ymm7 \n" "vmovdqu %6,%%ymm6 \n" LABELALIGN RGBTOY_AVX2( @@ -1451,7 +1483,8 @@ void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) { #ifdef HAS_ARGBTOYJROW_AVX2 // Convert 32 ARGB pixels (128 bytes) to 32 Y values. void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { - asm("vbroadcastf128 %3,%%ymm4 \n" + asm volatile ( + "vbroadcastf128 %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" "vmovdqu %5,%%ymm6 \n" LABELALIGN RGBTOY_AVX2( ymm5) "vzeroupper \n" @@ -1469,7 +1502,8 @@ void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { #ifdef HAS_ABGRTOYJROW_AVX2 // Convert 32 ABGR pixels (128 bytes) to 32 Y values. void ABGRToYJRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) { - asm("vbroadcastf128 %3,%%ymm4 \n" + asm volatile ( + "vbroadcastf128 %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" "vmovdqu %5,%%ymm6 \n" LABELALIGN RGBTOY_AVX2( ymm5) "vzeroupper \n" @@ -1487,7 +1521,8 @@ void ABGRToYJRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) { #ifdef HAS_RGBATOYJROW_AVX2 // Convert 32 ARGB pixels (128 bytes) to 32 Y values. 
void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width) { - asm("vbroadcastf128 %3,%%ymm4 \n" + asm volatile ( + "vbroadcastf128 %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" "vmovdqu %5,%%ymm6 \n" LABELALIGN RGBTOY_AVX2( ymm5) "vzeroupper \n" @@ -1507,7 +1542,8 @@ void ARGBToUVRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm("movdqa %5,%%xmm3 \n" + asm volatile ( + "movdqa %5,%%xmm3 \n" "movdqa %6,%%xmm4 \n" "movdqa %7,%%xmm5 \n" "sub %1,%2 \n" @@ -1579,7 +1615,8 @@ void ARGBToUVRow_AVX2(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm("vbroadcastf128 %5,%%ymm5 \n" + asm volatile ( + "vbroadcastf128 %5,%%ymm5 \n" "vbroadcastf128 %6,%%ymm6 \n" "vbroadcastf128 %7,%%ymm7 \n" "sub %1,%2 \n" @@ -1641,7 +1678,8 @@ void ABGRToUVRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm("vbroadcastf128 %5,%%ymm5 \n" + asm volatile ( + "vbroadcastf128 %5,%%ymm5 \n" "vbroadcastf128 %6,%%ymm6 \n" "vbroadcastf128 %7,%%ymm7 \n" "sub %1,%2 \n" @@ -1703,7 +1741,8 @@ void ARGBToUVJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm("vbroadcastf128 %5,%%ymm5 \n" + asm volatile ( + "vbroadcastf128 %5,%%ymm5 \n" "vbroadcastf128 %6,%%ymm6 \n" "vbroadcastf128 %7,%%ymm7 \n" "sub %1,%2 \n" @@ -1767,7 +1806,8 @@ void ABGRToUVJRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm("vbroadcastf128 %5,%%ymm5 \n" + asm volatile ( + "vbroadcastf128 %5,%%ymm5 \n" "vbroadcastf128 %6,%%ymm6 \n" "vbroadcastf128 %7,%%ymm7 \n" "sub %1,%2 \n" @@ -1830,7 +1870,8 @@ void ARGBToUVJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm("movdqa %5,%%xmm3 \n" + asm volatile ( + "movdqa %5,%%xmm3 \n" "movdqa %6,%%xmm4 \n" "movdqa %7,%%xmm5 \n" "sub %1,%2 \n" @@ -1895,7 +1936,8 @@ void ABGRToUVJRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm("movdqa %5,%%xmm3 \n" + asm volatile ( + "movdqa %5,%%xmm3 \n" "movdqa %6,%%xmm4 \n" "movdqa %7,%%xmm5 \n" "sub %1,%2 \n" @@ -1959,7 +2001,8 @@ void ARGBToUV444Row_SSSE3(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm("movdqa %4,%%xmm3 \n" + asm volatile ( + "movdqa %4,%%xmm3 \n" "movdqa %5,%%xmm4 \n" "movdqa %6,%%xmm5 \n" "sub %1,%2 \n" @@ -2012,7 +2055,8 @@ void ARGBToUV444Row_SSSE3(const uint8_t* src_argb, #endif // HAS_ARGBTOUV444ROW_SSSE3 void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width) { - asm("movdqa %3,%%xmm4 \n" + asm volatile ( + "movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" "movdqa %5,%%xmm7 \n" @@ -2032,7 +2076,8 @@ void BGRAToUVRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm("movdqa %5,%%xmm3 \n" + asm volatile ( + "movdqa %5,%%xmm3 \n" "movdqa %6,%%xmm4 \n" "movdqa %7,%%xmm5 \n" "sub %1,%2 \n" @@ -2090,7 +2135,8 @@ void BGRAToUVRow_SSSE3(const uint8_t* src_bgra, } void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) { - asm("movdqa %3,%%xmm4 \n" + asm volatile ( + "movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" "movdqa %5,%%xmm7 \n" @@ -2106,7 +2152,8 @@ void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) { } void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) { - asm("movdqa %3,%%xmm4 \n" + asm volatile ( + "movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" "movdqa %5,%%xmm7 \n" @@ -2126,7 +2173,8 @@ void ABGRToUVRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm("movdqa 
%5,%%xmm3 \n" + asm volatile ( + "movdqa %5,%%xmm3 \n" "movdqa %6,%%xmm4 \n" "movdqa %7,%%xmm5 \n" "sub %1,%2 \n" @@ -2188,7 +2236,8 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm("movdqa %5,%%xmm3 \n" + asm volatile ( + "movdqa %5,%%xmm3 \n" "movdqa %6,%%xmm4 \n" "movdqa %7,%%xmm5 \n" "sub %1,%2 \n" @@ -2608,7 +2657,7 @@ void OMITFP I444AlphaToARGBRow_SSSE3(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm(YUVTORGB_SETUP( + asm volatile (YUVTORGB_SETUP( yuvconstants) "sub %[u_buf],%[v_buf] \n" LABELALIGN "1: \n" READYUVA444 @@ -2934,7 +2983,7 @@ void OMITFP I210AlphaToARGBRow_SSSE3(const uint16_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm(YUVTORGB_SETUP( + asm volatile (YUVTORGB_SETUP( yuvconstants) "sub %[u_buf],%[v_buf] \n" LABELALIGN "1: \n" READYUVA210 @@ -2966,7 +3015,7 @@ void OMITFP I410AlphaToARGBRow_SSSE3(const uint16_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm(YUVTORGB_SETUP( + asm volatile (YUVTORGB_SETUP( yuvconstants) "sub %[u_buf],%[v_buf] \n" LABELALIGN "1: \n" READYUVA410 @@ -3032,7 +3081,7 @@ void OMITFP I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm(YUVTORGB_SETUP( + asm volatile (YUVTORGB_SETUP( yuvconstants) "sub %[u_buf],%[v_buf] \n" LABELALIGN "1: \n" READYUVA422 @@ -3060,7 +3109,7 @@ void OMITFP NV12ToARGBRow_SSSE3(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm(YUVTORGB_SETUP( + asm volatile (YUVTORGB_SETUP( yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" LABELALIGN "1: \n" READNV12 @@ -3081,7 +3130,7 @@ void OMITFP NV21ToARGBRow_SSSE3(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm(YUVTORGB_SETUP( + asm volatile (YUVTORGB_SETUP( yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" LABELALIGN "1: \n" READNV21 @@ -3102,7 +3151,8 @@ void OMITFP YUY2ToARGBRow_SSSE3(const uint8_t* yuy2_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm("movdqa %[kShuffleYUY2Y],%%xmm6 \n" + asm volatile ( + "movdqa %[kShuffleYUY2Y],%%xmm6 \n" "movdqa %[kShuffleYUY2UV],%%xmm7 \n" YUVTORGB_SETUP( yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" @@ -3123,7 +3173,8 @@ void OMITFP UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm("movdqa %[kShuffleUYVYY],%%xmm6 \n" + asm volatile ( + "movdqa %[kShuffleUYVYY],%%xmm6 \n" "movdqa %[kShuffleUYVYUV],%%xmm7 \n" YUVTORGB_SETUP( yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" @@ -3145,7 +3196,7 @@ void OMITFP P210ToARGBRow_SSSE3(const uint16_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm(YUVTORGB_SETUP( + asm volatile (YUVTORGB_SETUP( yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" LABELALIGN "1: \n" READP210 @@ -3166,7 +3217,7 @@ void OMITFP P410ToARGBRow_SSSE3(const uint16_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm(YUVTORGB_SETUP( + asm volatile (YUVTORGB_SETUP( yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" LABELALIGN "1: \n" READP410 @@ -4000,7 +4051,7 @@ void OMITFP I210AlphaToARGBRow_AVX2(const uint16_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm(YUVTORGB_SETUP_AVX2( + asm volatile (YUVTORGB_SETUP_AVX2( yuvconstants) "sub %[u_buf],%[v_buf] \n" LABELALIGN "1: \n" READYUVA210_AVX2 @@ 
-4035,7 +4086,7 @@ void OMITFP I410AlphaToARGBRow_AVX2(const uint16_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm(YUVTORGB_SETUP_AVX2( + asm volatile (YUVTORGB_SETUP_AVX2( yuvconstants) "sub %[u_buf],%[v_buf] \n" LABELALIGN "1: \n" READYUVA410_AVX2 @@ -4110,7 +4161,7 @@ void OMITFP I444AlphaToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm(YUVTORGB_SETUP_AVX2( + asm volatile (YUVTORGB_SETUP_AVX2( yuvconstants) "sub %[u_buf],%[v_buf] \n" LABELALIGN "1: \n" READYUVA444_AVX2 @@ -4144,7 +4195,7 @@ void OMITFP I422AlphaToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm(YUVTORGB_SETUP_AVX2( + asm volatile (YUVTORGB_SETUP_AVX2( yuvconstants) "sub %[u_buf],%[v_buf] \n" LABELALIGN "1: \n" READYUVA422_AVX2 @@ -4220,7 +4271,7 @@ void OMITFP NV12ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm(YUVTORGB_SETUP_AVX2( + asm volatile (YUVTORGB_SETUP_AVX2( yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN "1: \n" READNV12_AVX2 @@ -4246,7 +4297,7 @@ void OMITFP NV21ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm(YUVTORGB_SETUP_AVX2( + asm volatile (YUVTORGB_SETUP_AVX2( yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN "1: \n" READNV21_AVX2 @@ -4272,7 +4323,8 @@ void OMITFP YUY2ToARGBRow_AVX2(const uint8_t* yuy2_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm("vbroadcastf128 %[kShuffleYUY2Y],%%ymm6 \n" + asm volatile ( + "vbroadcastf128 %[kShuffleYUY2Y],%%ymm6 \n" "vbroadcastf128 %[kShuffleYUY2UV],%%ymm7 \n" YUVTORGB_SETUP_AVX2( yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" @@ -4298,7 +4350,8 @@ void OMITFP UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm("vbroadcastf128 %[kShuffleUYVYY],%%ymm6 \n" + asm volatile ( + "vbroadcastf128 %[kShuffleUYVYY],%%ymm6 \n" "vbroadcastf128 %[kShuffleUYVYUV],%%ymm7 \n" YUVTORGB_SETUP_AVX2( yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" @@ -4325,7 +4378,7 @@ void OMITFP P210ToARGBRow_AVX2(const uint16_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm(YUVTORGB_SETUP_AVX2( + asm volatile (YUVTORGB_SETUP_AVX2( yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN "1: \n" READP210_AVX2 @@ -4351,7 +4404,7 @@ void OMITFP P410ToARGBRow_AVX2(const uint16_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm(YUVTORGB_SETUP_AVX2( + asm volatile (YUVTORGB_SETUP_AVX2( yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN "1: \n" READP410_AVX2 @@ -4448,7 +4501,8 @@ void I400ToARGBRow_SSE2(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm("movdqa 96(%3),%%xmm2 \n" // yg = 18997 = 1.164 + asm volatile ( + "movdqa 96(%3),%%xmm2 \n" // yg = 18997 = 1.164 "movdqa 128(%3),%%xmm3 \n" // ygb = 1160 = 1.164 * 16 "pcmpeqb %%xmm4,%%xmm4 \n" // 0xff000000 "pslld $0x18,%%xmm4 \n" @@ -4492,7 +4546,8 @@ void I400ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm("vmovdqa 96(%3),%%ymm2 \n" // yg = 18997 = 1.164 + asm volatile ( + "vmovdqa 96(%3),%%ymm2 \n" // yg = 18997 = 1.164 "vmovdqa 128(%3),%%ymm3 \n" // ygb = -1160 = 1.164*16 "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 
0xff000000 "vpslld $0x18,%%ymm4,%%ymm4 \n" @@ -4535,7 +4590,8 @@ static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) { intptr_t temp_width = (intptr_t)(width); - asm("movdqa %3,%%xmm5 \n" + asm volatile ( + "movdqa %3,%%xmm5 \n" LABELALIGN "1: \n" @@ -4556,7 +4612,8 @@ void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) { #ifdef HAS_MIRRORROW_AVX2 void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { intptr_t temp_width = (intptr_t)(width); - asm("vbroadcastf128 %3,%%ymm5 \n" + asm volatile ( + "vbroadcastf128 %3,%%ymm5 \n" LABELALIGN "1: \n" @@ -4583,7 +4640,8 @@ static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u, void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width) { intptr_t temp_width = (intptr_t)(width); - asm("movdqa %3,%%xmm5 \n" + asm volatile ( + "movdqa %3,%%xmm5 \n" LABELALIGN "1: \n" @@ -4604,7 +4662,8 @@ void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width) { #ifdef HAS_MIRRORUVROW_AVX2 void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width) { intptr_t temp_width = (intptr_t)(width); - asm("vbroadcastf128 %3,%%ymm5 \n" + asm volatile ( + "vbroadcastf128 %3,%%ymm5 \n" LABELALIGN "1: \n" @@ -4633,7 +4692,8 @@ void MirrorSplitUVRow_SSSE3(const uint8_t* src, uint8_t* dst_v, int width) { intptr_t temp_width = (intptr_t)(width); - asm("movdqa %4,%%xmm1 \n" + asm volatile ( + "movdqa %4,%%xmm1 \n" "lea -0x10(%0,%3,2),%0 \n" "sub %1,%2 \n" @@ -4672,7 +4732,8 @@ void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24, int width) { intptr_t temp_width = (intptr_t)(width); src_rgb24 += width * 3 - 48; - asm("movdqa %3,%%xmm4 \n" + asm volatile ( + "movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" LABELALIGN @@ -4706,7 +4767,8 @@ void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24, void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { intptr_t temp_width = (intptr_t)(width); - asm("lea -0x10(%0,%2,4),%0 \n" + asm volatile ( + "lea -0x10(%0,%2,4),%0 \n" LABELALIGN "1: \n" @@ -4730,7 +4792,8 @@ void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { intptr_t temp_width = (intptr_t)(width); - asm("vmovdqu %3,%%ymm5 \n" + asm volatile ( + "vmovdqu %3,%%ymm5 \n" LABELALIGN "1: \n" @@ -4753,7 +4816,8 @@ void SplitUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm("vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + asm volatile ( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpsrlw $0x8,%%ymm5,%%ymm5 \n" "sub %1,%2 \n" @@ -4790,7 +4854,8 @@ void SplitUVRow_SSE2(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm("pcmpeqb %%xmm5,%%xmm5 \n" + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" "sub %1,%2 \n" @@ -4826,7 +4891,8 @@ void DetileRow_SSE2(const uint8_t* src, ptrdiff_t src_tile_stride, uint8_t* dst, int width) { - asm("1: \n" + asm volatile ( + "1: \n" "movdqu (%0),%%xmm0 \n" "sub $0x10,%2 \n" "lea (%0,%3),%0 \n" @@ -4846,7 +4912,8 @@ void DetileRow_16_SSE2(const uint16_t* src, ptrdiff_t src_tile_stride, uint16_t* dst, int width) { - asm("1: \n" + asm volatile ( + "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" "lea (%0,%3,2),%0 \n" @@ -4868,7 +4935,8 @@ void DetileRow_16_AVX(const uint16_t* src, ptrdiff_t src_tile_stride, uint16_t* dst, int width) { - asm("1: \n" 
+ asm volatile ( + "1: \n" "vmovdqu (%0),%%ymm0 \n" "lea (%0,%3,2),%0 \n" "vmovdqu %%ymm0,(%1) \n" @@ -4892,7 +4960,8 @@ void DetileToYUY2_SSE2(const uint8_t* src_y, ptrdiff_t src_uv_tile_stride, uint8_t* dst_yuy2, int width) { - asm("1: \n" + asm volatile ( + "1: \n" "movdqu (%0),%%xmm0 \n" // Load 16 Y "sub $0x10,%3 \n" "lea (%0,%4),%0 \n" @@ -4930,7 +4999,8 @@ void DetileSplitUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm("movdqu %4,%%xmm1 \n" + asm volatile ( + "movdqu %4,%%xmm1 \n" "1: \n" "movdqu (%0),%%xmm0 \n" "lea (%0, %5),%0 \n" @@ -4956,7 +5026,8 @@ void MergeUVRow_AVX512BW(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width) { - asm("sub %0,%1 \n" + asm volatile ( + "sub %0,%1 \n" LABELALIGN "1: \n" @@ -4984,7 +5055,8 @@ void MergeUVRow_AVX2(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width) { - asm("sub %0,%1 \n" + asm volatile ( + "sub %0,%1 \n" LABELALIGN "1: \n" @@ -5012,7 +5084,8 @@ void MergeUVRow_SSE2(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width) { - asm("sub %0,%1 \n" + asm volatile ( + "sub %0,%1 \n" LABELALIGN "1: \n" @@ -5042,7 +5115,8 @@ void MergeUVRow_16_AVX2(const uint16_t* src_u, uint16_t* dst_uv, int depth, int width) { - asm("vmovd %4,%%xmm3 \n" + asm volatile ( + "vmovd %4,%%xmm3 \n" "vmovd %5,%%xmm4 \n" "sub %0,%1 \n" @@ -5080,7 +5154,8 @@ void SplitUVRow_16_AVX2(const uint16_t* src_uv, int depth, int width) { depth = 16 - depth; - asm("vmovd %4,%%xmm3 \n" + asm volatile ( + "vmovd %4,%%xmm3 \n" "vbroadcastf128 %5,%%ymm4 \n" "sub %1,%2 \n" @@ -5125,7 +5200,8 @@ void MultiplyRow_16_AVX2(const uint16_t* src_y, uint16_t* dst_y, int scale, int width) { - asm("vmovd %3,%%xmm3 \n" + asm volatile ( + "vmovd %3,%%xmm3 \n" "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n" "vbroadcastss %%xmm3,%%ymm3 \n" "sub %0,%1 \n" @@ -5161,7 +5237,8 @@ void DivideRow_16_AVX2(const uint16_t* src_y, uint16_t* dst_y, int scale, int width) { - asm("vmovd %3,%%xmm3 \n" + asm volatile ( + "vmovd %3,%%xmm3 \n" "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n" "vbroadcastss %%xmm3,%%ymm3 \n" "sub %0,%1 \n" @@ -5197,7 +5274,8 @@ void Convert16To8Row_SSSE3(const uint16_t* src_y, uint8_t* dst_y, int scale, int width) { - asm("movd %3,%%xmm2 \n" + asm volatile ( + "movd %3,%%xmm2 \n" "punpcklwd %%xmm2,%%xmm2 \n" "pshufd $0x0,%%xmm2,%%xmm2 \n" @@ -5226,7 +5304,8 @@ void Convert16To8Row_AVX2(const uint16_t* src_y, uint8_t* dst_y, int scale, int width) { - asm("vmovd %3,%%xmm2 \n" + asm volatile ( + "vmovd %3,%%xmm2 \n" "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n" "vbroadcastss %%xmm2,%%ymm2 \n" @@ -5261,7 +5340,8 @@ void Convert8To16Row_SSE2(const uint8_t* src_y, uint16_t* dst_y, int scale, int width) { - asm("movd %3,%%xmm2 \n" + asm volatile ( + "movd %3,%%xmm2 \n" "punpcklwd %%xmm2,%%xmm2 \n" "pshufd $0x0,%%xmm2,%%xmm2 \n" @@ -5292,7 +5372,8 @@ void Convert8To16Row_AVX2(const uint8_t* src_y, uint16_t* dst_y, int scale, int width) { - asm("vmovd %3,%%xmm2 \n" + asm volatile ( + "vmovd %3,%%xmm2 \n" "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n" "vbroadcastss %%xmm2,%%ymm2 \n" @@ -5347,7 +5428,7 @@ void SplitRGBRow_SSSE3(const uint8_t* src_rgb, uint8_t* dst_g, uint8_t* dst_b, int width) { - asm(LABELALIGN + asm volatile (LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -5421,7 +5502,7 @@ void MergeRGBRow_SSSE3(const uint8_t* src_r, const uint8_t* src_b, uint8_t* dst_rgb, int width) { - asm(LABELALIGN + asm volatile (LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu (%1),%%xmm1 \n" @@ -5476,7 +5557,8 @@ void 
MergeARGBRow_SSE2(const uint8_t* src_r, const uint8_t* src_a, uint8_t* dst_argb, int width) { - asm("sub %0,%1 \n" + asm volatile ( + "sub %0,%1 \n" "sub %0,%2 \n" "sub %0,%3 \n" @@ -5516,7 +5598,7 @@ void MergeXRGBRow_SSE2(const uint8_t* src_r, const uint8_t* src_b, uint8_t* dst_argb, int width) { - asm(LABELALIGN + asm volatile (LABELALIGN "1: \n" "movq (%2),%%xmm0 \n" // B @@ -5554,7 +5636,8 @@ void MergeARGBRow_AVX2(const uint8_t* src_r, const uint8_t* src_a, uint8_t* dst_argb, int width) { - asm("sub %0,%1 \n" + asm volatile ( + "sub %0,%1 \n" "sub %0,%2 \n" "sub %0,%3 \n" @@ -5642,7 +5725,8 @@ void SplitARGBRow_SSE2(const uint8_t* src_argb, uint8_t* dst_b, uint8_t* dst_a, int width) { - asm("sub %1,%2 \n" + asm volatile ( + "sub %1,%2 \n" "sub %1,%3 \n" "sub %1,%4 \n" @@ -5692,7 +5776,7 @@ void SplitXRGBRow_SSE2(const uint8_t* src_argb, uint8_t* dst_g, uint8_t* dst_b, int width) { - asm(LABELALIGN + asm volatile (LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" // 00-0F @@ -5741,7 +5825,8 @@ void SplitARGBRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_b, uint8_t* dst_a, int width) { - asm("movdqa %6,%%xmm3 \n" + asm volatile ( + "movdqa %6,%%xmm3 \n" "sub %1,%2 \n" "sub %1,%3 \n" "sub %1,%4 \n" @@ -5786,7 +5871,8 @@ void SplitXRGBRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_g, uint8_t* dst_b, int width) { - asm("movdqa %5,%%xmm3 \n" + asm volatile ( + "movdqa %5,%%xmm3 \n" LABELALIGN "1: \n" @@ -5826,7 +5912,8 @@ void SplitARGBRow_AVX2(const uint8_t* src_argb, uint8_t* dst_b, uint8_t* dst_a, int width) { - asm("sub %1,%2 \n" + asm volatile ( + "sub %1,%2 \n" "sub %1,%3 \n" "sub %1,%4 \n" "vmovdqa %7,%%ymm3 \n" @@ -5876,7 +5963,8 @@ void SplitXRGBRow_AVX2(const uint8_t* src_argb, uint8_t* dst_g, uint8_t* dst_b, int width) { - asm("vmovdqa %6,%%ymm3 \n" + asm volatile ( + "vmovdqa %6,%%ymm3 \n" "vbroadcastf128 %5,%%ymm4 \n" LABELALIGN @@ -5922,7 +6010,8 @@ void MergeXR30Row_AVX2(const uint16_t* src_r, int depth, int width) { int shift = depth - 10; - asm("sub %0,%1 \n" + asm volatile ( + "sub %0,%1 \n" "sub %0,%2 \n" "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants "vpsrlw $14,%%ymm5,%%ymm5 \n" @@ -5987,7 +6076,8 @@ void MergeAR64Row_AVX2(const uint16_t* src_r, int shift = 16 - depth; int mask = (1 << depth) - 1; mask = (mask << 16) + mask; - asm("sub %0,%1 \n" + asm volatile ( + "sub %0,%1 \n" "sub %0,%2 \n" "sub %0,%3 \n" "vmovdqa %8,%%ymm5 \n" @@ -6057,7 +6147,8 @@ void MergeXR64Row_AVX2(const uint16_t* src_r, int shift = 16 - depth; int mask = (1 << depth) - 1; mask = (mask << 16) + mask; - asm("sub %0,%1 \n" + asm volatile ( + "sub %0,%1 \n" "sub %0,%2 \n" "vmovdqa %7,%%ymm5 \n" "vmovd %5,%%xmm6 \n" @@ -6119,7 +6210,8 @@ void MergeARGB16To8Row_AVX2(const uint16_t* src_r, int depth, int width) { int shift = depth - 8; - asm("sub %0,%1 \n" + asm volatile ( + "sub %0,%1 \n" "sub %0,%2 \n" "sub %0,%3 \n" "vbroadcastf128 %7,%%ymm5 \n" @@ -6174,7 +6266,8 @@ void MergeXRGB16To8Row_AVX2(const uint16_t* src_r, int depth, int width) { int shift = depth - 8; - asm("sub %0,%1 \n" + asm volatile ( + "sub %0,%1 \n" "sub %0,%2 \n" "vbroadcastf128 %6,%%ymm5 \n" "vmovd %5,%%xmm6 \n" @@ -6217,7 +6310,8 @@ void MergeXRGB16To8Row_AVX2(const uint16_t* src_r, #ifdef HAS_COPYROW_SSE2 void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm("test $0xf,%0 \n" + asm volatile ( + "test $0xf,%0 \n" "jne 2f \n" "test $0xf,%1 \n" "jne 2f \n" @@ -6256,7 +6350,7 @@ void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { #ifdef HAS_COPYROW_AVX void CopyRow_AVX(const uint8_t* src, uint8_t* 
dst, int width) { - asm(LABELALIGN + asm volatile (LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" @@ -6279,7 +6373,8 @@ void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width) { // Multiple of 1. void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width) { size_t width_tmp = (size_t)(width); - asm("rep movsb \n" + asm volatile ( + "rep movsb \n" : "+S"(src), // %0 "+D"(dst), // %1 "+c"(width_tmp) // %2 @@ -6291,7 +6386,8 @@ void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width) { #ifdef HAS_ARGBCOPYALPHAROW_SSE2 // width in pixels void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm("pcmpeqb %%xmm0,%%xmm0 \n" + asm volatile ( + "pcmpeqb %%xmm0,%%xmm0 \n" "pslld $0x18,%%xmm0 \n" "pcmpeqb %%xmm1,%%xmm1 \n" "psrld $0x8,%%xmm1 \n" @@ -6325,7 +6421,8 @@ void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { #ifdef HAS_ARGBCOPYALPHAROW_AVX2 // width in pixels void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { - asm("vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" + asm volatile ( + "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" "vpsrld $0x8,%%ymm0,%%ymm0 \n" LABELALIGN @@ -6354,7 +6451,7 @@ void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb, uint8_t* dst_a, int width) { - asm(LABELALIGN + asm volatile (LABELALIGN "1: \n" "movdqu (%0), %%xmm0 \n" "movdqu 0x10(%0), %%xmm1 \n" @@ -6383,7 +6480,8 @@ static const uvec8 kShuffleAlphaShort_AVX2 = { void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb, uint8_t* dst_a, int width) { - asm("vmovdqa %3,%%ymm4 \n" + asm volatile ( + "vmovdqa %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" LABELALIGN @@ -6418,7 +6516,8 @@ void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb, #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 // width in pixels void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm("pcmpeqb %%xmm0,%%xmm0 \n" + asm volatile ( + "pcmpeqb %%xmm0,%%xmm0 \n" "pslld $0x18,%%xmm0 \n" "pcmpeqb %%xmm1,%%xmm1 \n" "psrld $0x8,%%xmm1 \n" @@ -6454,7 +6553,8 @@ void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 // width in pixels void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { - asm("vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" + asm volatile ( + "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" "vpsrld $0x8,%%ymm0,%%ymm0 \n" LABELALIGN @@ -6484,7 +6584,8 @@ void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { void SetRow_X86(uint8_t* dst, uint8_t v8, int width) { size_t width_tmp = (size_t)(width >> 2); const uint32_t v32 = v8 * 0x01010101u; // Duplicate byte to all bytes. 
- asm("rep stosl \n" + asm volatile ( + "rep stosl \n" : "+D"(dst), // %0 "+c"(width_tmp) // %1 : "a"(v32) // %2 @@ -6493,7 +6594,8 @@ void SetRow_X86(uint8_t* dst, uint8_t v8, int width) { void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) { size_t width_tmp = (size_t)(width); - asm("rep stosb \n" + asm volatile ( + "rep stosb \n" : "+D"(dst), // %0 "+c"(width_tmp) // %1 : "a"(v8) // %2 @@ -6502,7 +6604,8 @@ void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) { void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width) { size_t width_tmp = (size_t)(width); - asm("rep stosl \n" + asm volatile ( + "rep stosl \n" : "+D"(dst_argb), // %0 "+c"(width_tmp) // %1 : "a"(v32) // %2 @@ -6512,7 +6615,8 @@ void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width) { #ifdef HAS_YUY2TOYROW_SSE2 void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { - asm("pcmpeqb %%xmm5,%%xmm5 \n" + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" LABELALIGN @@ -6538,7 +6642,7 @@ void YUY2ToNVUVRow_SSE2(const uint8_t* src_yuy2, int stride_yuy2, uint8_t* dst_uv, int width) { - asm(LABELALIGN + asm volatile (LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -6566,7 +6670,8 @@ void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm("pcmpeqb %%xmm5,%%xmm5 \n" + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" "sub %1,%2 \n" @@ -6604,7 +6709,8 @@ void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm("pcmpeqb %%xmm5,%%xmm5 \n" + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" "sub %1,%2 \n" @@ -6635,7 +6741,7 @@ void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2, } void UYVYToYRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { - asm(LABELALIGN + asm volatile (LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -6659,7 +6765,8 @@ void UYVYToUVRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm("pcmpeqb %%xmm5,%%xmm5 \n" + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" "sub %1,%2 \n" @@ -6697,7 +6804,8 @@ void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm("pcmpeqb %%xmm5,%%xmm5 \n" + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" "sub %1,%2 \n" @@ -6730,7 +6838,8 @@ void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy, #ifdef HAS_YUY2TOYROW_AVX2 void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { - asm("vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + asm volatile ( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpsrlw $0x8,%%ymm5,%%ymm5 \n" LABELALIGN @@ -6758,7 +6867,7 @@ void YUY2ToNVUVRow_AVX2(const uint8_t* src_yuy2, int stride_yuy2, uint8_t* dst_uv, int width) { - asm(LABELALIGN + asm volatile (LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" @@ -6786,7 +6895,8 @@ void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm("vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + asm volatile ( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpsrlw $0x8,%%ymm5,%%ymm5 \n" "sub %1,%2 \n" @@ -6825,7 +6935,8 @@ void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm("vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + asm volatile ( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpsrlw $0x8,%%ymm5,%%ymm5 \n" "sub %1,%2 \n" @@ -6859,7 +6970,7 @@ void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2, } void UYVYToYRow_AVX2(const 
uint8_t* src_uyvy, uint8_t* dst_y, int width) { - asm(LABELALIGN + asm volatile (LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" @@ -6884,7 +6995,8 @@ void UYVYToUVRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm("vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + asm volatile ( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpsrlw $0x8,%%ymm5,%%ymm5 \n" "sub %1,%2 \n" @@ -6923,7 +7035,8 @@ void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm("vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + asm volatile ( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpsrlw $0x8,%%ymm5,%%ymm5 \n" "sub %1,%2 \n" @@ -6967,7 +7080,8 @@ void ARGBBlendRow_SSSE3(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm("pcmpeqb %%xmm7,%%xmm7 \n" + asm volatile ( + "pcmpeqb %%xmm7,%%xmm7 \n" "psrlw $0xf,%%xmm7 \n" "pcmpeqb %%xmm6,%%xmm6 \n" "psrlw $0x8,%%xmm6 \n" @@ -7054,7 +7168,8 @@ void BlendPlaneRow_SSSE3(const uint8_t* src0, const uint8_t* alpha, uint8_t* dst, int width) { - asm("pcmpeqb %%xmm5,%%xmm5 \n" + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" "psllw $0x8,%%xmm5 \n" "mov $0x80808080,%%eax \n" "movd %%eax,%%xmm6 \n" @@ -7105,7 +7220,8 @@ void BlendPlaneRow_AVX2(const uint8_t* src0, const uint8_t* alpha, uint8_t* dst, int width) { - asm("vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + asm volatile ( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpsllw $0x8,%%ymm5,%%ymm5 \n" "mov $0x80808080,%%eax \n" "vmovd %%eax,%%xmm6 \n" @@ -7164,7 +7280,8 @@ static const vec8 kAttenuateShuffle = {6, -128, 6, -128, 6, -128, void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - asm("movdqa %3,%%xmm4 \n" + asm volatile ( + "movdqa %3,%%xmm4 \n" "pcmpeqb %%xmm5,%%xmm5 \n" "pslld $0x18,%%xmm5 \n" "pxor %%xmm6,%%xmm6 \n" @@ -7218,7 +7335,8 @@ static const lvec8 kAttenuateShuffle_AVX2 = { void ARGBAttenuateRow_AVX2(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - asm("vmovdqa %3,%%ymm4 \n" + asm volatile ( + "vmovdqa %3,%%ymm4 \n" "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpslld $0x18,%%ymm5,%%ymm5 \n" "vpxor %%ymm6,%%ymm6,%%ymm6 \n" @@ -7311,7 +7429,8 @@ void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb, uint8_t* dst_argb, int width) { uintptr_t alpha; - asm("sub %0,%1 \n" + asm volatile ( + "sub %0,%1 \n" "vbroadcastf128 %5,%%ymm5 \n" // 8 pixel loop. @@ -7372,7 +7491,8 @@ void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb, #ifdef HAS_ARGBGRAYROW_SSSE3 // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - asm("movdqa %3,%%xmm4 \n" + asm volatile ( + "movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" // 8 pixel loop. @@ -7433,7 +7553,8 @@ static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0, // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. 
void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) { - asm("movdqa %2,%%xmm2 \n" + asm volatile ( + "movdqa %2,%%xmm2 \n" "movdqa %3,%%xmm3 \n" "movdqa %4,%%xmm4 \n" @@ -7493,7 +7614,8 @@ void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, const int8_t* matrix_argb, int width) { - asm("movdqu (%3),%%xmm5 \n" + asm volatile ( + "movdqu (%3),%%xmm5 \n" "pshufd $0x00,%%xmm5,%%xmm2 \n" "pshufd $0x55,%%xmm5,%%xmm3 \n" "pshufd $0xaa,%%xmm5,%%xmm4 \n" @@ -7557,7 +7679,8 @@ void ARGBQuantizeRow_SSE2(uint8_t* dst_argb, int interval_size, int interval_offset, int width) { - asm("movd %2,%%xmm2 \n" + asm volatile ( + "movd %2,%%xmm2 \n" "movd %3,%%xmm3 \n" "movd %4,%%xmm4 \n" "pshuflw $0x40,%%xmm2,%%xmm2 \n" @@ -7607,7 +7730,8 @@ void ARGBShadeRow_SSE2(const uint8_t* src_argb, uint8_t* dst_argb, int width, uint32_t value) { - asm("movd %3,%%xmm2 \n" + asm volatile ( + "movd %3,%%xmm2 \n" "punpcklbw %%xmm2,%%xmm2 \n" "punpcklqdq %%xmm2,%%xmm2 \n" @@ -7642,7 +7766,8 @@ void ARGBMultiplyRow_SSE2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm("pxor %%xmm5,%%xmm5 \n" + asm volatile ( + "pxor %%xmm5,%%xmm5 \n" // 4 pixel loop. LABELALIGN @@ -7679,7 +7804,8 @@ void ARGBMultiplyRow_AVX2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm("vpxor %%ymm5,%%ymm5,%%ymm5 \n" + asm volatile ( + "vpxor %%ymm5,%%ymm5,%%ymm5 \n" // 4 pixel loop. LABELALIGN @@ -7831,7 +7957,8 @@ void SobelXRow_SSE2(const uint8_t* src_y0, const uint8_t* src_y2, uint8_t* dst_sobelx, int width) { - asm("sub %0,%1 \n" + asm volatile ( + "sub %0,%1 \n" "sub %0,%2 \n" "sub %0,%3 \n" "pxor %%xmm5,%%xmm5 \n" @@ -7884,7 +8011,8 @@ void SobelYRow_SSE2(const uint8_t* src_y0, const uint8_t* src_y1, uint8_t* dst_sobely, int width) { - asm("sub %0,%1 \n" + asm volatile ( + "sub %0,%1 \n" "sub %0,%2 \n" "pxor %%xmm5,%%xmm5 \n" @@ -7936,7 +8064,8 @@ void SobelRow_SSE2(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_argb, int width) { - asm("sub %0,%1 \n" + asm volatile ( + "sub %0,%1 \n" "pcmpeqb %%xmm5,%%xmm5 \n" "pslld $0x18,%%xmm5 \n" @@ -7982,7 +8111,8 @@ void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_y, int width) { - asm("sub %0,%1 \n" + asm volatile ( + "sub %0,%1 \n" "pcmpeqb %%xmm5,%%xmm5 \n" "pslld $0x18,%%xmm5 \n" @@ -8016,7 +8146,8 @@ void SobelXYRow_SSE2(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_argb, int width) { - asm("sub %0,%1 \n" + asm volatile ( + "sub %0,%1 \n" "pcmpeqb %%xmm5,%%xmm5 \n" // 8 pixel loop. 
@@ -8063,7 +8194,8 @@ void ComputeCumulativeSumRow_SSE2(const uint8_t* row, int32_t* cumsum, const int32_t* previous_cumsum, int width) { - asm("pxor %%xmm0,%%xmm0 \n" + asm volatile ( + "pxor %%xmm0,%%xmm0 \n" "pxor %%xmm1,%%xmm1 \n" "sub $0x4,%3 \n" "jl 49f \n" @@ -8142,7 +8274,8 @@ void CumulativeSumToAverageRow_SSE2(const int32_t* topleft, int area, uint8_t* dst, int count) { - asm("movd %5,%%xmm5 \n" + asm volatile ( + "movd %5,%%xmm5 \n" "cvtdq2ps %%xmm5,%%xmm5 \n" "rcpss %%xmm5,%%xmm4 \n" "pshufd $0x0,%%xmm4,%%xmm4 \n" @@ -8276,7 +8409,8 @@ void ARGBAffineRow_SSE2(const uint8_t* src_argb, int width) { intptr_t src_argb_stride_temp = src_argb_stride; intptr_t temp; - asm("movq (%3),%%xmm2 \n" + asm volatile ( + "movq (%3),%%xmm2 \n" "movq 0x08(%3),%%xmm7 \n" "shl $0x10,%1 \n" "add $0x4,%1 \n" @@ -8360,7 +8494,8 @@ void InterpolateRow_SSSE3(uint8_t* dst_ptr, ptrdiff_t src_stride, int width, int source_y_fraction) { - asm("sub %1,%0 \n" + asm volatile ( + "sub %1,%0 \n" "cmp $0x0,%3 \n" "je 100f \n" "cmp $0x80,%3 \n" @@ -8440,7 +8575,8 @@ void InterpolateRow_AVX2(uint8_t* dst_ptr, ptrdiff_t src_stride, int width, int source_y_fraction) { - asm("sub %1,%0 \n" + asm volatile ( + "sub %1,%0 \n" "cmp $0x0,%3 \n" "je 100f \n" "cmp $0x80,%3 \n" @@ -8516,7 +8652,8 @@ void ARGBShuffleRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, const uint8_t* shuffler, int width) { - asm("movdqu (%3),%%xmm5 \n" + asm volatile ( + "movdqu (%3),%%xmm5 \n" LABELALIGN "1: \n" @@ -8544,7 +8681,8 @@ void ARGBShuffleRow_AVX2(const uint8_t* src_argb, uint8_t* dst_argb, const uint8_t* shuffler, int width) { - asm("vbroadcastf128 (%3),%%ymm5 \n" + asm volatile ( + "vbroadcastf128 (%3),%%ymm5 \n" LABELALIGN "1: \n" @@ -8573,7 +8711,8 @@ void I422ToYUY2Row_SSE2(const uint8_t* src_y, const uint8_t* src_v, uint8_t* dst_yuy2, int width) { - asm("sub %1,%2 \n" + asm volatile ( + "sub %1,%2 \n" LABELALIGN "1: \n" @@ -8607,7 +8746,8 @@ void I422ToUYVYRow_SSE2(const uint8_t* src_y, const uint8_t* src_v, uint8_t* dst_uyvy, int width) { - asm("sub %1,%2 \n" + asm volatile ( + "sub %1,%2 \n" LABELALIGN "1: \n" @@ -8641,7 +8781,8 @@ void I422ToYUY2Row_AVX2(const uint8_t* src_y, const uint8_t* src_v, uint8_t* dst_yuy2, int width) { - asm("sub %1,%2 \n" + asm volatile ( + "sub %1,%2 \n" LABELALIGN "1: \n" @@ -8678,7 +8819,8 @@ void I422ToUYVYRow_AVX2(const uint8_t* src_y, const uint8_t* src_v, uint8_t* dst_uyvy, int width) { - asm("sub %1,%2 \n" + asm volatile ( + "sub %1,%2 \n" LABELALIGN "1: \n" @@ -8714,7 +8856,8 @@ void ARGBPolynomialRow_SSE2(const uint8_t* src_argb, uint8_t* dst_argb, const float* poly, int width) { - asm("pxor %%xmm3,%%xmm3 \n" + asm volatile ( + "pxor %%xmm3,%%xmm3 \n" // 2 pixel loop. 
LABELALIGN @@ -8768,7 +8911,8 @@ void ARGBPolynomialRow_AVX2(const uint8_t* src_argb, uint8_t* dst_argb, const float* poly, int width) { - asm("vbroadcastf128 (%3),%%ymm4 \n" + asm volatile ( + "vbroadcastf128 (%3),%%ymm4 \n" "vbroadcastf128 0x10(%3),%%ymm5 \n" "vbroadcastf128 0x20(%3),%%ymm6 \n" "vbroadcastf128 0x30(%3),%%ymm7 \n" @@ -8810,7 +8954,8 @@ void HalfFloatRow_SSE2(const uint16_t* src, float scale, int width) { scale *= kScaleBias; - asm("movd %3,%%xmm4 \n" + asm volatile ( + "movd %3,%%xmm4 \n" "pshufd $0x0,%%xmm4,%%xmm4 \n" "pxor %%xmm5,%%xmm5 \n" "sub %0,%1 \n" @@ -8847,7 +8992,8 @@ void HalfFloatRow_AVX2(const uint16_t* src, float scale, int width) { scale *= kScaleBias; - asm("vbroadcastss %3, %%ymm4 \n" + asm volatile ( + "vbroadcastss %3, %%ymm4 \n" "vpxor %%ymm5,%%ymm5,%%ymm5 \n" "sub %0,%1 \n" @@ -8887,7 +9033,8 @@ void HalfFloatRow_F16C(const uint16_t* src, uint16_t* dst, float scale, int width) { - asm("vbroadcastss %3, %%ymm4 \n" + asm volatile ( + "vbroadcastss %3, %%ymm4 \n" "sub %0,%1 \n" // 16 pixel loop. @@ -8921,7 +9068,8 @@ void HalfFloatRow_F16C(const uint16_t* src, #ifdef HAS_HALFFLOATROW_F16C void HalfFloat1Row_F16C(const uint16_t* src, uint16_t* dst, float, int width) { - asm("sub %0,%1 \n" + asm volatile ( + "sub %0,%1 \n" // 16 pixel loop. LABELALIGN "1: \n" @@ -9017,7 +9165,8 @@ void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb, uint32_t lumacoeff) { uintptr_t pixel_temp; uintptr_t table_temp; - asm("movd %6,%%xmm3 \n" + asm volatile ( + "movd %6,%%xmm3 \n" "pshufd $0x0,%%xmm3,%%xmm3 \n" "pcmpeqb %%xmm4,%%xmm4 \n" "psllw $0x8,%%xmm4 \n" @@ -9120,7 +9269,8 @@ void NV21ToYUV24Row_SSSE3(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_yuv24, int width) { - asm("sub %0,%1 \n" + asm volatile ( + "sub %0,%1 \n" "movdqa (%4),%%xmm4 \n" // 3 shuffler constants "movdqa 16(%4),%%xmm5 \n" "movdqa 32(%4),%%xmm6 \n" @@ -9157,7 +9307,8 @@ void NV21ToYUV24Row_AVX2(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_yuv24, int width) { - asm("sub %0,%1 \n" + asm volatile ( + "sub %0,%1 \n" "vbroadcastf128 (%4),%%ymm4 \n" // 3 shuffler constants "vbroadcastf128 16(%4),%%ymm5 \n" "vbroadcastf128 32(%4),%%ymm6 \n" @@ -9204,7 +9355,8 @@ void NV21ToYUV24Row_AVX512(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_yuv24, int width) { - asm("sub %0,%1 \n" + asm volatile ( + "sub %0,%1 \n" "vmovdqa (%4),%%ymm4 \n" // 3 shuffler constants "vmovdqa 32(%4),%%ymm5 \n" "vmovdqa 64(%4),%%ymm6 \n" LABELALIGN @@ -9242,7 +9394,8 @@ static const uvec8 kShuffleUVToVU = {1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u, // Convert UV plane of NV12 to VU of NV21. 
void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width) { - asm("movdqu %3,%%xmm5 \n" + asm volatile ( + "movdqu %3,%%xmm5 \n" LABELALIGN "1: \n" @@ -9266,7 +9419,8 @@ void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width) { #ifdef HAS_SWAPUVROW_AVX2 void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width) { - asm("vbroadcastf128 %3,%%ymm5 \n" + asm volatile ( + "vbroadcastf128 %3,%%ymm5 \n" LABELALIGN "1: \n" @@ -9295,7 +9449,8 @@ void HalfMergeUVRow_SSSE3(const uint8_t* src_u, int src_stride_v, uint8_t* dst_uv, int width) { - asm("pcmpeqb %%xmm4,%%xmm4 \n" + asm volatile ( + "pcmpeqb %%xmm4,%%xmm4 \n" "psrlw $0xf,%%xmm4 \n" "packuswb %%xmm4,%%xmm4 \n" "pxor %%xmm5,%%xmm5 \n" @@ -9340,7 +9495,8 @@ void HalfMergeUVRow_AVX2(const uint8_t* src_u, int src_stride_v, uint8_t* dst_uv, int width) { - asm("vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + asm volatile ( + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" "vpsrlw $0xf,%%ymm4,%%ymm4 \n" "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" "vpxor %%ymm5,%%ymm5,%%ymm5 \n" @@ -9381,7 +9537,8 @@ void HalfMergeUVRow_AVX2(const uint8_t* src_u, } void ClampFloatToZero_SSE2(const float* src_x, float* dst_y, int width) { - asm("pxor %%xmm1,%%xmm1 \n" + asm volatile ( + "pxor %%xmm1,%%xmm1 \n" LABELALIGN "1: \n" diff --git a/source/row_lasx.cc b/source/row_lasx.cc index be85022e8..6d49aa5e8 100644 --- a/source/row_lasx.cc +++ b/source/row_lasx.cc @@ -2037,7 +2037,7 @@ static void ARGBToYMatrixRow_LASX(const uint8_t* src_argb, int width, const struct RgbConstants* rgbconstants) { int32_t shuff[8] = {0, 4, 1, 5, 2, 6, 3, 7}; - asm volatile( + asm volatile ( "xvldrepl.b $xr0, %3, 0 \n\t" // load rgbconstants "xvldrepl.b $xr1, %3, 1 \n\t" // load rgbconstants "xvldrepl.b $xr2, %3, 2 \n\t" // load rgbconstants @@ -2099,7 +2099,7 @@ static void RGBAToYMatrixRow_LASX(const uint8_t* src_rgba, int width, const struct RgbConstants* rgbconstants) { int32_t shuff[8] = {0, 4, 1, 5, 2, 6, 3, 7}; - asm volatile( + asm volatile ( "xvldrepl.b $xr0, %3, 0 \n\t" // load rgbconstants "xvldrepl.b $xr1, %3, 1 \n\t" // load rgbconstants "xvldrepl.b $xr2, %3, 2 \n\t" // load rgbconstants @@ -2163,7 +2163,7 @@ static void RGBToYMatrixRow_LASX(const uint8_t* src_rgba, 1, 0, 4, 0, 7, 0, 10, 0, 13, 0, 16, 0, 19, 0, 22, 0, 25, 0, 28, 0, 31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0, 25, 0, 28, 0, 31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0}; - asm volatile( + asm volatile ( "xvldrepl.b $xr0, %3, 0 \n\t" // load rgbconstants "xvldrepl.b $xr1, %3, 1 \n\t" // load rgbconstants "xvldrepl.b $xr2, %3, 2 \n\t" // load rgbconstants diff --git a/source/row_lsx.cc b/source/row_lsx.cc index fa088c9e7..09f206cab 100644 --- a/source/row_lsx.cc +++ b/source/row_lsx.cc @@ -2805,8 +2805,7 @@ static void ARGBToYMatrixRow_LSX(const uint8_t* src_argb, uint8_t* dst_y, int width, const struct RgbConstants* rgbconstants) { - asm volatile( - "vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants + asm("vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants "vldrepl.b $vr1, %3, 1 \n\t" // load rgbconstants "vldrepl.b $vr2, %3, 2 \n\t" // load rgbconstants "vldrepl.h $vr3, %3, 4 \n\t" // load rgbconstants @@ -2864,8 +2863,7 @@ static void RGBAToYMatrixRow_LSX(const uint8_t* src_rgba, uint8_t* dst_y, int width, const struct RgbConstants* rgbconstants) { - asm volatile( - "vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants + asm("vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants "vldrepl.b $vr1, %3, 1 \n\t" // load rgbconstants "vldrepl.b $vr2, %3, 2 \n\t" // load rgbconstants "vldrepl.h $vr3, %3, 4 \n\t" // load 
rgbconstants @@ -2922,8 +2920,7 @@ static void RGBToYMatrixRow_LSX(const uint8_t* src_rgba, 7, 9, 10, 12, 13, 15, 1, 0, 4, 0, 7, 0, 10, 0, 13, 0, 16, 0, 19, 0, 22, 0, 25, 0, 28, 0, 31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0}; - asm volatile( - "vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants + asm("vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants "vldrepl.b $vr1, %3, 1 \n\t" // load rgbconstants "vldrepl.b $vr2, %3, 2 \n\t" // load rgbconstants "vldrepl.h $vr3, %3, 4 \n\t" // load rgbconstants diff --git a/source/row_neon.cc b/source/row_neon.cc index 8adbbbb5d..ef9e1c3c5 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -140,7 +140,7 @@ void I444ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile( + asm volatile ( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" "1: \n" READYUV444 YUVTORGB @@ -164,7 +164,7 @@ void I444ToRGB24Row_NEON(const uint8_t* src_y, uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width) { - asm volatile( + asm volatile ( YUVTORGB_SETUP "1: \n" READYUV444 YUVTORGB RGBTORGB8 @@ -187,7 +187,7 @@ void I422ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile( + asm volatile ( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" "1: \n" READYUV422 YUVTORGB @@ -212,7 +212,7 @@ void I444AlphaToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile( + asm volatile ( YUVTORGB_SETUP "1: \n" READYUV444 YUVTORGB RGBTORGB8 @@ -238,7 +238,7 @@ void I422AlphaToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile( + asm volatile ( YUVTORGB_SETUP "1: \n" READYUV422 YUVTORGB RGBTORGB8 @@ -263,7 +263,7 @@ void I422ToRGBARow_NEON(const uint8_t* src_y, uint8_t* dst_rgba, const struct YuvConstants* yuvconstants, int width) { - asm volatile( + asm volatile ( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" "1: \n" READYUV422 YUVTORGB @@ -285,7 +285,7 @@ void I422ToRGB24Row_NEON(const uint8_t* src_y, uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width) { - asm volatile( + asm volatile ( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" "1: \n" READYUV422 YUVTORGB @@ -316,7 +316,7 @@ void I422ToRGB565Row_NEON(const uint8_t* src_y, uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width) { - asm volatile( + asm volatile ( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" "1: \n" READYUV422 YUVTORGB @@ -348,7 +348,7 @@ void I422ToARGB1555Row_NEON(const uint8_t* src_y, uint8_t* dst_argb1555, const struct YuvConstants* yuvconstants, int width) { - asm volatile( + asm volatile ( YUVTORGB_SETUP "1: \n" READYUV422 YUVTORGB RGBTORGB8 @@ -381,7 +381,7 @@ void I422ToARGB4444Row_NEON(const uint8_t* src_y, uint8_t* dst_argb4444, const struct YuvConstants* yuvconstants, int width) { - asm volatile( + asm volatile ( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" "vmov.u8 d7, #0x0f \n" // vbic bits to clear @@ -404,7 +404,7 @@ void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile( + asm volatile ( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" "1: \n" READYUV400 YUVTORGB @@ -421,7 +421,7 @@ void I400ToARGBRow_NEON(const uint8_t* src_y, } void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( "vmov.u8 d23, #255 \n" "1: \n" "vld1.8 {d20}, [%0]! 
\n" @@ -442,7 +442,7 @@ void NV12ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile( + asm volatile ( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" "1: \n" READNV12 YUVTORGB RGBTORGB8 @@ -463,7 +463,7 @@ void NV21ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile( + asm volatile ( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" "1: \n" READNV21 YUVTORGB RGBTORGB8 @@ -484,7 +484,7 @@ void NV12ToRGB24Row_NEON(const uint8_t* src_y, uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width) { - asm volatile( + asm volatile ( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" "1: \n" READNV12 YUVTORGB RGBTORGB8 @@ -505,7 +505,7 @@ void NV21ToRGB24Row_NEON(const uint8_t* src_y, uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width) { - asm volatile( + asm volatile ( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" "1: \n" READNV21 YUVTORGB RGBTORGB8 @@ -526,7 +526,7 @@ void NV12ToRGB565Row_NEON(const uint8_t* src_y, uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width) { - asm volatile( + asm volatile ( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" "1: \n" READNV12 YUVTORGB RGBTORGB8 @@ -546,7 +546,7 @@ void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile( + asm volatile ( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" "1: \n" READYUY2 YUVTORGB RGBTORGB8 @@ -565,7 +565,7 @@ void UYVYToARGBRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile( + asm volatile ( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" "1: \n" READUYVY YUVTORGB RGBTORGB8 @@ -585,7 +585,7 @@ void SplitUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile( + asm volatile ( "1: \n" "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV "subs %3, %3, #16 \n" // 16 processed per loop @@ -609,7 +609,7 @@ void DetileRow_NEON(const uint8_t* src, ptrdiff_t src_tile_stride, uint8_t* dst, int width) { - asm volatile( + asm volatile ( "1: \n" "vld1.8 {q0}, [%0], %3 \n" // load 16 bytes "subs %2, %2, #16 \n" // 16 processed per loop @@ -629,7 +629,7 @@ void DetileRow_16_NEON(const uint16_t* src, ptrdiff_t src_tile_stride, uint16_t* dst, int width) { - asm volatile( + asm volatile ( "1: \n" "vld1.16 {q0, q1}, [%0], %3 \n" // load 16 pixels "subs %2, %2, #16 \n" // 16 processed per loop @@ -650,7 +650,7 @@ void DetileSplitUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile( + asm volatile ( "1: \n" "vld2.8 {d0, d1}, [%0], %4 \n" "subs %3, %3, #16 \n" @@ -675,7 +675,7 @@ void DetileToYUY2_NEON(const uint8_t* src_y, ptrdiff_t src_uv_tile_stride, uint8_t* dst_yuy2, int width) { - asm volatile( + asm volatile ( "1: \n" "vld1.8 {q0}, [%0], %4 \n" // Load 16 Y "pld [%0, #1792] \n" @@ -701,7 +701,7 @@ void DetileToYUY2_NEON(const uint8_t* src_y, ptrdiff_t src_uv_tile_stride, uint8_t* dst_yuy2, int width) { - asm volatile( + asm volatile ( "1: \n" "vld1.8 {q0}, [%0], %4 \n" // Load 16 Y "vld1.8 {q1}, [%1], %5 \n" // Load 8 UV @@ -723,7 +723,7 @@ void DetileToYUY2_NEON(const uint8_t* src_y, #endif void UnpackMT2T_NEON(const uint8_t* src, uint16_t* dst, size_t size) { - asm volatile( + asm volatile ( "1: \n" "vld1.8 {q14}, [%0]! \n" // Load lower bits. "vld1.8 {q9}, [%0]! 
\n" // Load upper bits row @@ -767,7 +767,7 @@ void MergeUVRow_NEON(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width) { - asm volatile( + asm volatile ( "1: \n" "vld1.8 {q0}, [%0]! \n" // load U "vld1.8 {q1}, [%1]! \n" // load V @@ -789,7 +789,7 @@ void SplitRGBRow_NEON(const uint8_t* src_rgb, uint8_t* dst_g, uint8_t* dst_b, int width) { - asm volatile( + asm volatile ( "1: \n" "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB "vld3.8 {d1, d3, d5}, [%0]! \n" // next 8 RGB @@ -814,7 +814,7 @@ void MergeRGBRow_NEON(const uint8_t* src_r, const uint8_t* src_b, uint8_t* dst_rgb, int width) { - asm volatile( + asm volatile ( "1: \n" "vld1.8 {q0}, [%0]! \n" // load R "vld1.8 {q1}, [%1]! \n" // load G @@ -840,7 +840,7 @@ void SplitARGBRow_NEON(const uint8_t* src_argb, uint8_t* dst_b, uint8_t* dst_a, int width) { - asm volatile( + asm volatile ( "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // next 8 ARGB @@ -868,7 +868,7 @@ void MergeARGBRow_NEON(const uint8_t* src_r, const uint8_t* src_a, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( "1: \n" "vld1.8 {q2}, [%0]! \n" // load R "vld1.8 {q1}, [%1]! \n" // load G @@ -895,7 +895,7 @@ void SplitXRGBRow_NEON(const uint8_t* src_argb, uint8_t* dst_g, uint8_t* dst_b, int width) { - asm volatile( + asm volatile ( "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // next 8 ARGB @@ -920,7 +920,7 @@ void MergeXRGBRow_NEON(const uint8_t* src_r, const uint8_t* src_b, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( "vmov.u8 q3, #255 \n" // load A(255) "1: \n" "vld1.8 {q2}, [%0]! \n" // load R @@ -947,7 +947,7 @@ void MergeXR30Row_NEON(const uint16_t* src_r, int depth, int width) { int shift = 10 - depth; - asm volatile( + asm volatile ( "vmov.u32 q14, #1023 \n" "vdup.32 q15, %5 \n" "1: \n" @@ -984,7 +984,7 @@ void MergeXR30Row_10_NEON(const uint16_t* src_r, uint8_t* dst_ar30, int /* depth */, int width) { - asm volatile( + asm volatile ( "vmov.u32 q14, #1023 \n" "1: \n" "vld1.16 {d4}, [%2]! \n" // B @@ -1021,7 +1021,7 @@ void MergeAR64Row_NEON(const uint16_t* src_r, int width) { int shift = 16 - depth; int mask = (1 << depth) - 1; - asm volatile( + asm volatile ( "vdup.u16 q15, %6 \n" "vdup.u16 q14, %7 \n" @@ -1061,7 +1061,7 @@ void MergeXR64Row_NEON(const uint16_t* src_r, int width) { int shift = 16 - depth; int mask = (1 << depth) - 1; - asm volatile( + asm volatile ( "vmov.u8 q3, #0xff \n" // A (0xffff) "vdup.u16 q15, %5 \n" @@ -1098,7 +1098,7 @@ void MergeARGB16To8Row_NEON(const uint16_t* src_r, int depth, int width) { int shift = 8 - depth; - asm volatile( + asm volatile ( "vdup.16 q15, %6 \n" "1: \n" @@ -1134,7 +1134,7 @@ void MergeXRGB16To8Row_NEON(const uint16_t* src_r, int depth, int width) { int shift = 8 - depth; - asm volatile( + asm volatile ( "vdup.16 q15, %5 \n" "vmov.u8 d6, #0xff \n" // A (0xff) @@ -1162,7 +1162,7 @@ void MergeXRGB16To8Row_NEON(const uint16_t* src_r, // Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15. void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( + asm volatile ( "1: \n" "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32 "subs %2, %2, #32 \n" // 32 processed per loop @@ -1178,7 +1178,7 @@ void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) { // SetRow writes 'width' bytes using an 8 bit value repeated. 
void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) { - asm volatile( + asm volatile ( "vdup.8 q0, %2 \n" // duplicate 16 bytes "1: \n" "subs %1, %1, #16 \n" // 16 bytes per loop @@ -1192,7 +1192,7 @@ void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) { // ARGBSetRow writes 'width' pixels using an 32 bit value repeated. void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) { - asm volatile( + asm volatile ( "vdup.u32 q0, %2 \n" // duplicate 4 ints "1: \n" "subs %1, %1, #4 \n" // 4 pixels per loop @@ -1205,7 +1205,7 @@ void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) { } void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( + asm volatile ( // Start at end of source row. "add %0, %0, %2 \n" "sub %0, %0, #32 \n" // 32 bytes per loop @@ -1227,7 +1227,7 @@ void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) { } void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width) { - asm volatile( + asm volatile ( // Start at end of source row. "mov r12, #-16 \n" "add %0, %0, %2, lsl #1 \n" @@ -1250,7 +1250,7 @@ void MirrorSplitUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile( + asm volatile ( // Start at end of source row. "mov r12, #-16 \n" "add %0, %0, %3, lsl #1 \n" @@ -1272,7 +1272,7 @@ void MirrorSplitUVRow_NEON(const uint8_t* src_uv, } void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( "add %0, %0, %2, lsl #2 \n" "sub %0, #32 \n" @@ -1296,7 +1296,7 @@ void RGB24MirrorRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_rgb24, int width) { src_rgb24 += width * 3 - 24; - asm volatile( + asm volatile ( "1: \n" "vld3.8 {d0, d1, d2}, [%0], %3 \n" // src -= 24 "subs %2, #8 \n" // 8 pixels per loop. @@ -1315,7 +1315,7 @@ void RGB24MirrorRow_NEON(const uint8_t* src_rgb24, void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( "vmov.u8 d4, #255 \n" // Alpha "1: \n" "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24. @@ -1331,7 +1331,7 @@ void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, } void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( "vmov.u8 d4, #255 \n" // Alpha "1: \n" "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. @@ -1348,7 +1348,7 @@ void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) { } void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { - asm volatile( + asm volatile ( "vmov.u8 d0, #255 \n" // Alpha "1: \n" "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. @@ -1364,7 +1364,7 @@ void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { ); } void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { - asm volatile( + asm volatile ( "1: \n" "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. "subs %2, %2, #8 \n" // 8 processed per loop. @@ -1395,7 +1395,7 @@ void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( "vmov.u8 d3, #255 \n" // Alpha "1: \n" "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. 
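Not every hunk adds the qualifier: the row_lsx.cc conversions above drop volatile instead, on the assumption that everything those statements do is already described by their operands. A sketch of that case, again hypothetical rather than taken from libyuv:

#include <cstdint>

// A pure load: the "m" input names exactly what is read and the "=r" output
// names exactly what is produced, so the statement has no hidden effects.
static inline uint32_t LoadU32(const uint32_t* p) {
  uint32_t v;
  asm("movl %1,%0" : "=r"(v) : "m"(*p));
  return v;
}

Left non-volatile, duplicate loads of the same location can be combined or hoisted out of loops; marking such a statement volatile would only inhibit those optimizations without adding correctness.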
@@ -1441,7 +1441,7 @@ void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( "vmov.u8 d3, #255 \n" // Alpha "1: \n" "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. @@ -1470,7 +1470,7 @@ void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555, void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( "vmov.u8 d3, #255 \n" // Alpha "1: \n" "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. @@ -1489,7 +1489,7 @@ void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444, void ARGBToRGB24Row_NEON(const uint8_t* src_argb, uint8_t* dst_rgb24, int width) { - asm volatile( + asm volatile ( "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 16 pixels of ARGB. "vld4.8 {d1, d3, d5, d7}, [%0]! \n" @@ -1506,7 +1506,7 @@ void ARGBToRGB24Row_NEON(const uint8_t* src_argb, } void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) { - asm volatile( + asm volatile ( "1: \n" "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. @@ -1522,7 +1522,7 @@ void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) { } void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { - asm volatile( + asm volatile ( "1: \n" "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2. "subs %2, %2, #16 \n" // 16 processed per loop. @@ -1537,7 +1537,7 @@ void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { } void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { - asm volatile( + asm volatile ( "1: \n" "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY. "subs %2, %2, #16 \n" // 16 processed per loop. @@ -1555,7 +1555,7 @@ void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile( + asm volatile ( "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. @@ -1575,7 +1575,7 @@ void UYVYToUV422Row_NEON(const uint8_t* src_uyvy, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile( + asm volatile ( "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. @@ -1596,7 +1596,7 @@ void YUY2ToUVRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile( + asm volatile ( "add %1, %0, %1 \n" // stride + src_yuy2 "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. @@ -1623,7 +1623,7 @@ void UYVYToUVRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile( + asm volatile ( "add %1, %0, %1 \n" // stride + src_uyvy "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. @@ -1649,7 +1649,7 @@ void YUY2ToNVUVRow_NEON(const uint8_t* src_yuy2, int stride_yuy2, uint8_t* dst_uv, int width) { - asm volatile( + asm volatile ( "add %1, %0, %1 \n" // stride + src_yuy2 "1: \n" "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2. @@ -1673,7 +1673,7 @@ void ARGBShuffleRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, const uint8_t* shuffler, int width) { - asm volatile( + asm volatile ( "vld1.8 {q2}, [%3] \n" // shuffler "1: \n" "vld1.8 {q0}, [%0]! \n" // load 4 pixels. 
@@ -1695,7 +1695,7 @@ void I422ToYUY2Row_NEON(const uint8_t* src_y, const uint8_t* src_v, uint8_t* dst_yuy2, int width) { - asm volatile( + asm volatile ( "1: \n" "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys "vld1.8 {d1}, [%1]! \n" // load 8 Us @@ -1717,7 +1717,7 @@ void I422ToUYVYRow_NEON(const uint8_t* src_y, const uint8_t* src_v, uint8_t* dst_uyvy, int width) { - asm volatile( + asm volatile ( "1: \n" "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys "vld1.8 {d0}, [%1]! \n" // load 8 Us @@ -1737,7 +1737,7 @@ void I422ToUYVYRow_NEON(const uint8_t* src_y, void ARGBToRGB565Row_NEON(const uint8_t* src_argb, uint8_t* dst_rgb565, int width) { - asm volatile( + asm volatile ( "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. @@ -1755,7 +1755,7 @@ void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, uint8_t* dst_rgb, uint32_t dither4, int width) { - asm volatile( + asm volatile ( "vdup.32 d7, %2 \n" // dither4 "1: \n" "vld4.8 {d0, d2, d4, d6}, [%1]! \n" // load 8 pixels of ARGB. @@ -1776,7 +1776,7 @@ void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, void ARGBToARGB1555Row_NEON(const uint8_t* src_argb, uint8_t* dst_argb1555, int width) { - asm volatile( + asm volatile ( "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. @@ -1793,7 +1793,7 @@ void ARGBToARGB1555Row_NEON(const uint8_t* src_argb, void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, uint8_t* dst_argb4444, int width) { - asm volatile( + asm volatile ( "vmov.u8 d7, #0x0f \n" // bits to clear with // vbic. "1: \n" @@ -1812,7 +1812,7 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, uint8_t* dst_a, int width) { - asm volatile( + asm volatile ( "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels "vld4.8 {d1, d3, d5, d7}, [%0]! 
\n" // load next 8 ARGB pixels @@ -1838,7 +1838,7 @@ void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb, uint8_t* dst_v, int width, const struct RgbUVConstants* rgbuvconstants) { - asm volatile( + asm volatile ( "vld1.8 {d0}, [%4] \n" // load rgbuvconstants "vdup.u8 d24, d0[0] \n" // UB 0.875 coefficient @@ -2366,7 +2366,7 @@ void RGB565ToUVRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile( + asm volatile ( "add %1, %0, %1 \n" // src_stride + src_argb "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 // coefficient @@ -2432,7 +2432,7 @@ void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile( + asm volatile ( "add %1, %0, %1 \n" // src_stride + src_argb "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 // coefficient @@ -2498,7 +2498,7 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile( + asm volatile ( "add %1, %0, %1 \n" // src_stride + src_argb "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 // coefficient @@ -2550,7 +2550,7 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, } void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { - asm volatile( + asm volatile ( "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient @@ -2576,7 +2576,7 @@ void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, uint8_t* dst_y, int width) { - asm volatile( + asm volatile ( "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient @@ -2602,7 +2602,7 @@ void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, uint8_t* dst_y, int width) { - asm volatile( + asm volatile ( "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient @@ -2628,7 +2628,7 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, void ARGBToAR64Row_NEON(const uint8_t* src_argb, uint16_t* dst_ar64, int width) { - asm volatile( + asm volatile ( "1: \n" "vld1.8 {q0}, [%0]! \n" "vld1.8 {q2}, [%0]! \n" @@ -2651,7 +2651,7 @@ static const uvec8 kShuffleARGBToABGR = {2, 1, 0, 3, 6, 5, 4, 7, void ARGBToAB64Row_NEON(const uint8_t* src_argb, uint16_t* dst_ab64, int width) { - asm volatile( + asm volatile ( "vld1.8 {q4}, [%3] \n" // shuffler "1: \n" @@ -2677,7 +2677,7 @@ void ARGBToAB64Row_NEON(const uint8_t* src_argb, void AR64ToARGBRow_NEON(const uint16_t* src_ar64, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( "1: \n" "vld1.16 {q0}, [%0]! \n" "vld1.16 {q1}, [%0]! 
\n" @@ -2703,7 +2703,7 @@ static const uvec8 kShuffleAB64ToARGB = {5, 3, 1, 7, 13, 11, 9, 15}; void AB64ToARGBRow_NEON(const uint16_t* src_ab64, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( "vld1.8 {d8}, [%3] \n" // shuffler "1: \n" @@ -2756,7 +2756,7 @@ void ARGBToYMatrixRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width, const struct RgbConstants* rgbconstants) { - asm volatile( + asm volatile ( "vld1.8 {d0}, [%3] \n" // load rgbconstants "vdup.u8 d20, d0[0] \n" "vdup.u8 d21, d0[1] \n" @@ -2806,7 +2806,7 @@ void RGBAToYMatrixRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width, const struct RgbConstants* rgbconstants) { - asm volatile( + asm volatile ( "vld1.8 {d0}, [%3] \n" // load rgbconstants "vdup.u8 d20, d0[0] \n" "vdup.u8 d21, d0[1] \n" @@ -2850,7 +2850,7 @@ void RGBToYMatrixRow_NEON(const uint8_t* src_rgb, uint8_t* dst_y, int width, const struct RgbConstants* rgbconstants) { - asm volatile( + asm volatile ( "vld1.8 {d0}, [%3] \n" // load rgbconstants "vdup.u8 d20, d0[0] \n" "vdup.u8 d21, d0[1] \n" @@ -2902,7 +2902,7 @@ void InterpolateRow_NEON(uint8_t* dst_ptr, int dst_width, int source_y_fraction) { int y1_fraction = source_y_fraction; - asm volatile( + asm volatile ( "cmp %4, #0 \n" "beq 100f \n" "add %2, %1 \n" @@ -2964,7 +2964,7 @@ void InterpolateRow_16_NEON(uint16_t* dst_ptr, int y0_fraction = 256 - y1_fraction; const uint16_t* src_ptr1 = src_ptr + src_stride; - asm volatile( + asm volatile ( "cmp %4, #0 \n" "beq 100f \n" "cmp %4, #128 \n" @@ -3019,7 +3019,7 @@ void ARGBBlendRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( "subs %3, #8 \n" "blt 89f \n" // Blend 8 pixels. @@ -3078,7 +3078,7 @@ void ARGBBlendRow_NEON(const uint8_t* src_argb, void ARGBAttenuateRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( "vmov.u16 q15, #0x00ff \n" // 255 for rounding up // Attenuate 8 pixels. @@ -3107,7 +3107,7 @@ void ARGBQuantizeRow_NEON(uint8_t* dst_argb, int interval_size, int interval_offset, int width) { - asm volatile( + asm volatile ( "vdup.u16 q8, %2 \n" "vshr.u16 q8, q8, #1 \n" // scale >>= 1 "vdup.u16 q9, %3 \n" // interval multiply. @@ -3149,7 +3149,7 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width, uint32_t value) { - asm volatile( + asm volatile ( "vdup.u32 q0, %3 \n" // duplicate scale value. "vzip.u8 d0, d1 \n" // d0 aarrggbb. "vshr.u16 q0, q0, #1 \n" // scale / 2. @@ -3183,7 +3183,7 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb, // Similar to ARGBToYJ but stores ARGB. // C code is (29 * b + 150 * g + 77 * r + 128) >> 8; void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( "vmov.u8 d24, #29 \n" // B * 0.1140 coefficient "vmov.u8 d25, #150 \n" // G * 0.5870 coefficient "vmov.u8 d26, #77 \n" // R * 0.2990 coefficient @@ -3210,7 +3210,7 @@ void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { // g = (r * 45 + g * 88 + b * 22) >> 7 // r = (r * 50 + g * 98 + b * 24) >> 7 void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( "vmov.u8 d20, #17 \n" // BB coefficient "vmov.u8 d21, #68 \n" // BG coefficient "vmov.u8 d22, #35 \n" // BR coefficient @@ -3251,7 +3251,7 @@ void ARGBColorMatrixRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, const int8_t* matrix_argb, int width) { - asm volatile( + asm volatile ( "vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors. 
"vmovl.s8 q0, d4 \n" // B,G coefficients s16. "vmovl.s8 q1, d5 \n" // R,A coefficients s16. @@ -3310,7 +3310,7 @@ void ARGBMultiplyRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( // 8 pixel loop. "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. @@ -3339,7 +3339,7 @@ void ARGBAddRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( // 8 pixel loop. "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. @@ -3362,7 +3362,7 @@ void ARGBSubtractRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( // 8 pixel loop. "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. @@ -3389,7 +3389,7 @@ void SobelRow_NEON(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( "vmov.u8 d3, #255 \n" // alpha // 8 pixel loop. "1: \n" @@ -3414,7 +3414,7 @@ void SobelToPlaneRow_NEON(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_y, int width) { - asm volatile( + asm volatile ( // 16 pixel loop. "1: \n" "vld1.8 {q0}, [%0]! \n" // load 16 sobelx. @@ -3440,7 +3440,7 @@ void SobelXYRow_NEON(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( "vmov.u8 d3, #255 \n" // alpha // 8 pixel loop. "1: \n" @@ -3467,7 +3467,7 @@ void SobelXRow_NEON(const uint8_t* src_y0, const uint8_t* src_y2, uint8_t* dst_sobelx, int width) { - asm volatile( + asm volatile ( "1: \n" "vld1.8 {d0}, [%0],%5 \n" // top "vld1.8 {d1}, [%0],%6 \n" @@ -3505,7 +3505,7 @@ void SobelYRow_NEON(const uint8_t* src_y0, const uint8_t* src_y1, uint8_t* dst_sobely, int width) { - asm volatile( + asm volatile ( "1: \n" "vld1.8 {d0}, [%0],%4 \n" // left "vld1.8 {d1}, [%1],%4 \n" @@ -3542,7 +3542,7 @@ void HalfFloat1Row_NEON(const uint16_t* src, uint16_t* dst, float /*unused*/, int width) { - asm volatile( + asm volatile ( "1: \n" "vld1.8 {q1}, [%0]! \n" // load 8 shorts @@ -3568,7 +3568,7 @@ void HalfFloatRow_NEON(const uint16_t* src, uint16_t* dst, float scale, int width) { - asm volatile( + asm volatile ( "1: \n" "vld1.8 {q1}, [%0]! \n" // load 8 shorts @@ -3594,7 +3594,7 @@ void ByteToFloatRow_NEON(const uint8_t* src, float* dst, float scale, int width) { - asm volatile( + asm volatile ( "1: \n" "vld1.8 {d2}, [%0]! \n" // load 8 bytes @@ -3623,7 +3623,7 @@ void GaussCol_NEON(const uint16_t* src0, const uint16_t* src4, uint32_t* dst, int width) { - asm volatile( + asm volatile ( "vmov.u16 d6, #4 \n" // constant 4 "vmov.u16 d7, #6 \n" // constant 6 @@ -3660,7 +3660,7 @@ void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) { const uint32_t* src1 = src + 1; const uint32_t* src2 = src + 2; const uint32_t* src3 = src + 3; - asm volatile( + asm volatile ( "vmov.u32 q10, #4 \n" // constant 4 "vmov.u32 q11, #6 \n" // constant 6 @@ -3698,7 +3698,7 @@ void NV21ToYUV24Row_NEON(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_yuv24, int width) { - asm volatile( + asm volatile ( "1: \n" "vld1.8 {q2}, [%0]! \n" // load 16 Y values "vld2.8 {d0, d2}, [%1]! \n" // load 8 VU values @@ -3722,7 +3722,7 @@ void AYUVToUVRow_NEON(const uint8_t* src_ayuv, int src_stride_ayuv, uint8_t* dst_uv, int width) { - asm volatile( + asm volatile ( "add %1, %0, %1 \n" // src_stride + src_AYUV "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels. 
@@ -3753,7 +3753,7 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv, int src_stride_ayuv, uint8_t* dst_vu, int width) { - asm volatile( + asm volatile ( "add %1, %0, %1 \n" // src_stride + src_AYUV "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels. @@ -3783,7 +3783,7 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv, // Copy row of AYUV Y's into Y. // Similar to ARGBExtractAlphaRow_NEON void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) { - asm volatile( + asm volatile ( "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV pixels @@ -3799,7 +3799,7 @@ void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) { // Convert UV plane of NV12 to VU of NV21. void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) { - asm volatile( + asm volatile ( "1: \n" "vld2.8 {d0, d2}, [%0]! \n" // load 16 UV values "vld2.8 {d1, d3}, [%0]! \n" @@ -3822,7 +3822,7 @@ void HalfMergeUVRow_NEON(const uint8_t* src_u, int width) { const uint8_t* src_u_1 = src_u + src_stride_u; const uint8_t* src_v_1 = src_v + src_stride_v; - asm volatile( + asm volatile ( "1: \n" "vld1.8 {q0}, [%0]! \n" // load 16 U values "vld1.8 {q1}, [%2]! \n" // load 16 V values @@ -3853,7 +3853,7 @@ void SplitUVRow_16_NEON(const uint16_t* src_uv, int depth, int width) { int shift = depth - 16; // Negative for right shift. - asm volatile( + asm volatile ( "vdup.16 q2, %4 \n" "1: \n" "vld2.16 {q0, q1}, [%0]! \n" // load 8 UV @@ -3877,7 +3877,7 @@ void MergeUVRow_16_NEON(const uint16_t* src_u, int depth, int width) { int shift = 16 - depth; - asm volatile( + asm volatile ( "vdup.16 q2, %4 \n" "1: \n" "vld1.16 {q0}, [%0]! \n" // load 8 U @@ -3899,7 +3899,7 @@ void MultiplyRow_16_NEON(const uint16_t* src_y, uint16_t* dst_y, int scale, int width) { - asm volatile( + asm volatile ( "vdup.16 q2, %3 \n" "1: \n" "vld1.16 {q0}, [%0]! \n" @@ -3921,7 +3921,7 @@ void DivideRow_16_NEON(const uint16_t* src_y, uint16_t* dst_y, int scale, int width) { - asm volatile( + asm volatile ( "vdup.16 d8, %3 \n" "1: \n" "vld1.16 {q2, q3}, [%0]! \n" @@ -3953,7 +3953,7 @@ void Convert16To8Row_NEON(const uint16_t* src_y, int scale, int width) { int shift = 15 - __builtin_clz((int32_t)scale); // Negative shl is shr - asm volatile( + asm volatile ( "vdup.16 q2, %3 \n" "1: \n" "vld1.16 {q0}, [%0]! 
\n" diff --git a/source/row_neon64.cc b/source/row_neon64.cc index 372b1efc2..70b44d226 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -210,7 +210,7 @@ void I444ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile( + asm volatile ( YUVTORGB_SETUP "movi v19.8b, #255 \n" /* A */ "1: \n" READYUV444 I4XXTORGB @@ -234,7 +234,7 @@ void I444ToRGB24Row_NEON(const uint8_t* src_y, uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width) { - asm volatile( + asm volatile ( YUVTORGB_SETUP "1: \n" READYUV444 I4XXTORGB RGBTORGB8 @@ -261,7 +261,7 @@ void I210ToAR30Row_NEON(const uint16_t* src_y, const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias; uint16_t limit = 0x3ff0; uint16_t alpha = 0xc000; - asm(YUVTORGB_SETUP + asm volatile (YUVTORGB_SETUP "dup v22.8h, %w[limit] \n" "dup v23.8h, %w[alpha] \n" "1: \n" READYUV210 NVTORGB @@ -289,7 +289,7 @@ void I410ToAR30Row_NEON(const uint16_t* src_y, const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias; uint16_t limit = 0x3ff0; uint16_t alpha = 0xc000; - asm(YUVTORGB_SETUP + asm volatile (YUVTORGB_SETUP "dup v22.8h, %w[limit] \n" "dup v23.8h, %w[alpha] \n" "1: \n" READYUV410 NVTORGB @@ -313,7 +313,7 @@ void I210ToARGBRow_NEON(const uint16_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm(YUVTORGB_SETUP + asm volatile (YUVTORGB_SETUP "movi v19.8b, #255 \n" "1: \n" READYUV210 NVTORGB RGBTORGB8 "subs %w[width], %w[width], #8 \n" @@ -335,7 +335,7 @@ void I410ToARGBRow_NEON(const uint16_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm(YUVTORGB_SETUP + asm volatile (YUVTORGB_SETUP "movi v19.8b, #255 \n" "1: \n" READYUV410 NVTORGB RGBTORGB8 "subs %w[width], %w[width], #8 \n" @@ -357,7 +357,7 @@ void I422ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile( + asm volatile ( YUVTORGB_SETUP "movi v19.8b, #255 \n" /* A */ "1: \n" READYUV422 I4XXTORGB @@ -382,7 +382,7 @@ void I444AlphaToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile( + asm volatile ( YUVTORGB_SETUP "1: \n" "ld1 {v19.8b}, [%[src_a]], #8 \n" READYUV444 @@ -408,7 +408,7 @@ void I410AlphaToARGBRow_NEON(const uint16_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm(YUVTORGB_SETUP + asm volatile (YUVTORGB_SETUP "1: \n" "ld1 {v19.16b}, [%[src_a]], #16 \n" READYUV410 "uqshrn v19.8b, v19.8h, #2 \n" NVTORGB RGBTORGB8 @@ -433,7 +433,7 @@ void I210AlphaToARGBRow_NEON(const uint16_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm(YUVTORGB_SETUP + asm volatile (YUVTORGB_SETUP "1: \n" "ld1 {v19.16b}, [%[src_a]], #16 \n" READYUV210 "uqshrn v19.8b, v19.8h, #2 \n" NVTORGB RGBTORGB8 @@ -458,7 +458,7 @@ void I422AlphaToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile( + asm volatile ( YUVTORGB_SETUP "1: \n" "ld1 {v19.8b}, [%[src_a]], #8 \n" READYUV422 @@ -483,7 +483,7 @@ void I422ToRGBARow_NEON(const uint8_t* src_y, uint8_t* dst_rgba, const struct YuvConstants* yuvconstants, int width) { - asm volatile( + asm volatile ( YUVTORGB_SETUP "movi v15.8b, #255 \n" /* A */ "1: \n" READYUV422 I4XXTORGB @@ -507,7 +507,7 @@ void I422ToRGB24Row_NEON(const uint8_t* src_y, uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width) { - asm volatile( + 
asm volatile ( YUVTORGB_SETUP "1: \n" READYUV422 I4XXTORGB RGBTORGB8 @@ -549,7 +549,7 @@ void I422ToRGB565Row_NEON(const uint8_t* src_y, uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width) { - asm volatile( + asm volatile ( YUVTORGB_SETUP "1: \n" READYUV422 I4XXTORGB RGBTORGB8_TOP @@ -591,7 +591,7 @@ void I422ToARGB1555Row_NEON(const uint8_t* src_y, uint8_t* dst_argb1555, const struct YuvConstants* yuvconstants, int width) { - asm(YUVTORGB_SETUP + asm volatile (YUVTORGB_SETUP "movi v19.8h, #0x80, lsl #8 \n" "1: \n" // READYUV422 I4XXTORGB RGBTORGB8_TOP @@ -621,7 +621,7 @@ void I422ToARGB4444Row_NEON(const uint8_t* src_y, uint8_t* dst_argb4444, const struct YuvConstants* yuvconstants, int width) { - asm volatile( + asm volatile ( YUVTORGB_SETUP "1: \n" READYUV422 I4XXTORGB RGBTORGB8 @@ -645,7 +645,7 @@ void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile( + asm volatile ( YUVTORGB_SETUP "movi v1.16b, #128 \n" "movi v19.8b, #255 \n" @@ -668,7 +668,7 @@ void I400ToARGBRow_NEON(const uint8_t* src_y, #if LIBYUV_USE_ST4 void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( "movi v23.8b, #255 \n" "1: \n" "ld1 {v20.8b}, [%0], #8 \n" @@ -686,7 +686,7 @@ void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { } #else void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( "movi v20.8b, #255 \n" "1: \n" "ldr d16, [%0], #8 \n" @@ -711,7 +711,7 @@ void NV12ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile( + asm volatile ( YUVTORGB_SETUP "movi v19.8b, #255 \n" "ldr q2, [%[kNV12Table]] \n" @@ -734,7 +734,7 @@ void NV21ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile( + asm volatile ( YUVTORGB_SETUP "movi v19.8b, #255 \n" "ldr q2, [%[kNV12Table]] \n" @@ -757,7 +757,7 @@ void NV12ToRGB24Row_NEON(const uint8_t* src_y, uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width) { - asm volatile( + asm volatile ( YUVTORGB_SETUP "ldr q2, [%[kNV12Table]] \n" "1: \n" READNV12 NVTORGB RGBTORGB8 @@ -779,7 +779,7 @@ void NV21ToRGB24Row_NEON(const uint8_t* src_y, uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width) { - asm volatile( + asm volatile ( YUVTORGB_SETUP "ldr q2, [%[kNV12Table]] \n" "1: \n" READNV12 NVTORGB RGBTORGB8 @@ -801,7 +801,7 @@ void NV12ToRGB565Row_NEON(const uint8_t* src_y, uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width) { - asm volatile( + asm volatile ( YUVTORGB_SETUP "ldr q2, [%[kNV12Table]] \n" "1: \n" READNV12 NVTORGB @@ -825,7 +825,7 @@ void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile( + asm volatile ( YUVTORGB_SETUP "movi v19.8b, #255 \n" "ldr q2, [%[kNV21InterleavedTable]] \n" @@ -846,7 +846,7 @@ void UYVYToARGBRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile( + asm volatile ( YUVTORGB_SETUP "movi v19.8b, #255 \n" "ldr q2, [%[kNV12InterleavedTable]] \n" @@ -868,7 +868,7 @@ void SplitUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile( + asm volatile ( "1: \n" "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV "subs %w3, %w3, #16 \n" // 16 
processed per loop @@ -893,7 +893,7 @@ void DetileRow_NEON(const uint8_t* src, ptrdiff_t src_tile_stride, uint8_t* dst, int width) { - asm volatile( + asm volatile ( "1: \n" "ld1 {v0.16b}, [%0], %3 \n" // load 16 bytes "subs %w2, %w2, #16 \n" // 16 processed per loop @@ -913,7 +913,7 @@ void DetileRow_16_NEON(const uint16_t* src, ptrdiff_t src_tile_stride, uint16_t* dst, int width) { - asm volatile( + asm volatile ( "1: \n" "ld1 {v0.8h,v1.8h}, [%0], %3 \n" // load 16 pixels "subs %w2, %w2, #16 \n" // 16 processed per loop @@ -934,7 +934,7 @@ void DetileSplitUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile( + asm volatile ( "1: \n" "ld2 {v0.8b,v1.8b}, [%0], %4 \n" "subs %w3, %w3, #16 \n" @@ -959,7 +959,7 @@ void DetileToYUY2_NEON(const uint8_t* src_y, ptrdiff_t src_uv_tile_stride, uint8_t* dst_yuy2, int width) { - asm volatile( + asm volatile ( "1: \n" "ld1 {v0.16b}, [%0], %4 \n" // load 16 Ys "prfm pldl1keep, [%0, 1792] \n" @@ -985,7 +985,7 @@ void DetileToYUY2_NEON(const uint8_t* src_y, ptrdiff_t src_uv_tile_stride, uint8_t* dst_yuy2, int width) { - asm volatile( + asm volatile ( "1: \n" "ld1 {v0.16b}, [%0], %4 \n" // load 16 Ys "ld1 {v1.16b}, [%1], %5 \n" // load 8 UVs @@ -1010,7 +1010,7 @@ void DetileToYUY2_NEON(const uint8_t* src_y, // Unpack MT2T into tiled P010 64 pixels at a time. See // tinyurl.com/mtk-10bit-video-format for format documentation. void UnpackMT2T_NEON(const uint8_t* src, uint16_t* dst, size_t size) { - asm volatile( + asm volatile ( "1: \n" "ld1 {v7.16b}, [%0], #16 \n" "ld1 {v0.16b-v3.16b}, [%0], #64 \n" @@ -1051,7 +1051,7 @@ void MergeUVRow_NEON(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width) { - asm volatile( + asm volatile ( "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load U "ld1 {v1.16b}, [%1], #16 \n" // load V @@ -1075,7 +1075,7 @@ void MergeUVRow_16_NEON(const uint16_t* src_u, int depth, int width) { int shift = 16 - depth; - asm volatile( + asm volatile ( "dup v2.8h, %w4 \n" "1: \n" "ld1 {v0.8h}, [%0], #16 \n" // load 8 U @@ -1100,7 +1100,7 @@ void MergeUVRow_NEON(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width) { - asm volatile( + asm volatile ( "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load U "ld1 {v1.16b}, [%1], #16 \n" // load V @@ -1126,7 +1126,7 @@ void MergeUVRow_16_NEON(const uint16_t* src_u, int depth, int width) { int shift = 16 - depth; - asm volatile( + asm volatile ( "dup v4.8h, %w4 \n" "1: \n" "ld1 {v0.8h}, [%0], #16 \n" // load 8 U @@ -1155,7 +1155,7 @@ void SplitRGBRow_NEON(const uint8_t* src_rgb, uint8_t* dst_g, uint8_t* dst_b, int width) { - asm volatile( + asm volatile ( "1: \n" "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RGB "subs %w4, %w4, #16 \n" // 16 processed per loop @@ -1180,7 +1180,7 @@ void MergeRGBRow_NEON(const uint8_t* src_r, const uint8_t* src_b, uint8_t* dst_rgb, int width) { - asm volatile( + asm volatile ( "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load R "ld1 {v1.16b}, [%1], #16 \n" // load G @@ -1208,7 +1208,7 @@ void SplitARGBRow_NEON(const uint8_t* src_rgba, uint8_t* dst_b, uint8_t* dst_a, int width) { - asm volatile( + asm volatile ( "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB "subs %w5, %w5, #16 \n" // 16 processed per loop @@ -1237,7 +1237,7 @@ void MergeARGBRow_NEON(const uint8_t* src_r, const uint8_t* src_a, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( "1: \n" "ld1 {v0.16b}, [%2], #16 \n" // load B "ld1 {v1.16b}, [%1], #16 \n" // load G @@ -1268,7 +1268,7 @@ void 
MergeARGBRow_NEON(const uint8_t* src_r, const uint8_t* src_a, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( "1: \n" "ld1 {v0.16b}, [%2], #16 \n" // load B "ld1 {v1.16b}, [%1], #16 \n" // load G @@ -1308,7 +1308,7 @@ void SplitXRGBRow_NEON(const uint8_t* src_rgba, uint8_t* dst_g, uint8_t* dst_b, int width) { - asm volatile( + asm volatile ( "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB "subs %w4, %w4, #16 \n" // 16 processed per loop @@ -1333,7 +1333,7 @@ void MergeXRGBRow_NEON(const uint8_t* src_r, const uint8_t* src_b, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( "movi v3.16b, #255 \n" // load A(255) "1: \n" "ld1 {v2.16b}, [%0], #16 \n" // load R @@ -1362,7 +1362,7 @@ void MergeXR30Row_NEON(const uint16_t* src_r, int depth, int width) { int shift = 10 - depth; - asm volatile( + asm volatile ( "movi v30.16b, #255 \n" "ushr v30.4s, v30.4s, #22 \n" // 1023 "dup v31.4s, %w5 \n" @@ -1403,7 +1403,7 @@ void MergeXR30Row_10_NEON(const uint16_t* src_r, // Neon has no "shift left and accumulate/orr", so use a multiply-add to // perform the shift instead. int limit = 1023; - asm volatile( + asm volatile ( "dup v5.8h, %w[limit] \n" "movi v6.8h, #16 \n" // 1 << 4 "movi v7.8h, #4, lsl #8 \n" // 1 << 10 @@ -1439,7 +1439,7 @@ void MergeAR64Row_NEON(const uint16_t* src_r, int width) { int shift = 16 - depth; int mask = (1 << depth) - 1; - asm volatile( + asm volatile ( "dup v30.8h, %w7 \n" "dup v31.8h, %w6 \n" @@ -1482,7 +1482,7 @@ void MergeXR64Row_NEON(const uint16_t* src_r, int width) { int shift = 16 - depth; int mask = (1 << depth) - 1; - asm volatile( + asm volatile ( "movi v3.16b, #0xff \n" // A (0xffff) "dup v30.8h, %w6 \n" @@ -1523,7 +1523,7 @@ void MergeARGB16To8Row_NEON(const uint16_t* src_r, int width) { // Shift is 8 - depth, +8 so the result is in the top half of each lane. int shift = 16 - depth; - asm volatile( + asm volatile ( "dup v31.8h, %w6 \n" "1: \n" "ldr q0, [%0], #16 \n" // B @@ -1561,7 +1561,7 @@ void MergeXRGB16To8Row_NEON(const uint16_t* src_r, int width) { // Shift is 8 - depth, +8 so the result is in the top half of each lane. int shift = 16 - depth; - asm volatile( + asm volatile ( "dup v31.8h, %w5 \n" "movi v3.16b, #0xff \n" // A (0xff) "1: \n" @@ -1590,7 +1590,7 @@ void MergeXRGB16To8Row_NEON(const uint16_t* src_r, // Copy multiple of 32. void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( + asm volatile ( "1: \n" "ldp q0, q1, [%0], #32 \n" "prfm pldl1keep, [%0, 448] \n" @@ -1607,7 +1607,7 @@ void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) { // SetRow writes 'width' bytes using an 8 bit value repeated. void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) { - asm volatile( + asm volatile ( "dup v0.16b, %w2 \n" // duplicate 16 bytes "1: \n" "subs %w1, %w1, #16 \n" // 16 bytes per loop @@ -1620,7 +1620,7 @@ void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) { } void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) { - asm volatile( + asm volatile ( "dup v0.4s, %w2 \n" // duplicate 4 ints "1: \n" "subs %w1, %w1, #4 \n" // 4 ints per loop @@ -1637,7 +1637,7 @@ static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( + asm volatile ( // Start at end of source row. 
"ld1 {v3.16b}, [%3] \n" // shuffler "add %0, %0, %w2, sxtw \n" @@ -1662,7 +1662,7 @@ static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u, 6u, 7u, 4u, 5u, 2u, 3u, 0u, 1u}; void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width) { - asm volatile( + asm volatile ( // Start at end of source row. "ld1 {v4.16b}, [%3] \n" // shuffler "add %0, %0, %w2, sxtw #1 \n" @@ -1686,7 +1686,7 @@ void MirrorSplitUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile( + asm volatile ( // Start at end of source row. "ld1 {v4.16b}, [%4] \n" // shuffler "add %0, %0, %w3, sxtw #1 \n" @@ -1715,7 +1715,7 @@ static const uvec8 kShuffleMirrorARGB = {12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u}; void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( // Start at end of source row. "ld1 {v4.16b}, [%3] \n" // shuffler "add %0, %0, %w2, sxtw #2 \n" @@ -1738,7 +1738,7 @@ void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { void RGB24MirrorRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_rgb24, int width) { - asm volatile( + asm volatile ( "ld1 {v3.16b}, [%4] \n" // shuffler "add %0, %0, %w2, sxtw #1 \n" // Start at end of row. "add %0, %0, %w2, sxtw \n" @@ -1763,7 +1763,7 @@ void RGB24MirrorRow_NEON(const uint8_t* src_rgb24, void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( "movi v4.8b, #255 \n" // Alpha "1: \n" "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of @@ -1781,7 +1781,7 @@ void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, } void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( "movi v5.8b, #255 \n" // Alpha "1: \n" "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b @@ -1800,7 +1800,7 @@ void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) { } void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { - asm volatile( + asm volatile ( "movi v0.8b, #255 \n" // Alpha "1: \n" "ld3 {v3.8b,v4.8b,v5.8b}, [%0], #24 \n" // read r g b @@ -1819,7 +1819,7 @@ void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { } void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { - asm volatile( + asm volatile ( "1: \n" "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b "subs %w2, %w2, #8 \n" // 8 processed per loop. @@ -1849,7 +1849,7 @@ void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( "movi v3.8b, #255 \n" // Alpha "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. @@ -1892,7 +1892,7 @@ void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. "prfm pldl1keep, [%0, 448] \n" @@ -1926,7 +1926,7 @@ void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555, void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( "1: \n" "ld1 {v1.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. "subs %w2, %w2, #8 \n" // 8 processed per loop. 
@@ -1955,7 +1955,7 @@ static void ABCDToAR30Row_NEON(const uint8_t* src_abcd, uint8_t* dst_ar30, int width, const uint8_t* indices) { - asm volatile( + asm volatile ( "movi v2.4s, #0xf, msl 16 \n" // 0xfffff "ldr q3, [%[kAR30Row_BoxShifts]] \n" "ldp q4, q5, [%[indices]] \n" @@ -1997,7 +1997,7 @@ void ARGBToAR30Row_NEON(const uint8_t* src_argb, uint8_t* dst_ar30, int width) { void ARGBToRGB24Row_NEON(const uint8_t* src_argb, uint8_t* dst_rgb24, int width) { - asm volatile( + asm volatile ( "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB "subs %w2, %w2, #16 \n" // 16 pixels per loop. @@ -2013,7 +2013,7 @@ void ARGBToRGB24Row_NEON(const uint8_t* src_argb, } void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) { - asm volatile( + asm volatile ( "1: \n" "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a "subs %w2, %w2, #8 \n" // 8 processed per loop. @@ -2031,7 +2031,7 @@ void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) { } void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { - asm volatile( + asm volatile ( "1: \n" "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2. "subs %w2, %w2, #16 \n" // 16 processed per loop. @@ -2047,7 +2047,7 @@ void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { } void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { - asm volatile( + asm volatile ( "1: \n" "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY. "subs %w2, %w2, #16 \n" // 16 processed per loop. @@ -2066,7 +2066,7 @@ void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile( + asm volatile ( "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. @@ -2087,7 +2087,7 @@ void UYVYToUV422Row_NEON(const uint8_t* src_uyvy, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile( + asm volatile ( "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. @@ -2110,7 +2110,7 @@ void YUY2ToUVRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_v, int width) { const uint8_t* src_yuy2b = src_yuy2 + stride_yuy2; - asm volatile( + asm volatile ( "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. @@ -2138,7 +2138,7 @@ void UYVYToUVRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_v, int width) { const uint8_t* src_uyvyb = src_uyvy + stride_uyvy; - asm volatile( + asm volatile ( "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. @@ -2165,7 +2165,7 @@ void YUY2ToNVUVRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_uv, int width) { const uint8_t* src_yuy2b = src_yuy2 + stride_yuy2; - asm volatile( + asm volatile ( "1: \n" "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. @@ -2188,7 +2188,7 @@ void ARGBShuffleRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, const uint8_t* shuffler, int width) { - asm volatile( + asm volatile ( "ld1 {v2.16b}, [%3] \n" // shuffler "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels. 
@@ -2210,7 +2210,7 @@ void I422ToYUY2Row_NEON(const uint8_t* src_y, const uint8_t* src_v, uint8_t* dst_yuy2, int width) { - asm volatile( + asm volatile ( "1: \n" "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys "subs %w4, %w4, #16 \n" // 16 pixels @@ -2234,7 +2234,7 @@ void I422ToUYVYRow_NEON(const uint8_t* src_y, const uint8_t* src_v, uint8_t* dst_uyvy, int width) { - asm volatile( + asm volatile ( "1: \n" "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys "mov v3.8b, v2.8b \n" @@ -2256,7 +2256,7 @@ void I422ToUYVYRow_NEON(const uint8_t* src_y, void ARGBToRGB565Row_NEON(const uint8_t* src_argb, uint8_t* dst_rgb565, int width) { - asm volatile( + asm volatile ( "1: \n" "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 // pixels @@ -2275,7 +2275,7 @@ void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, uint8_t* dst_rgb, uint32_t dither4, int width) { - asm volatile( + asm volatile ( "dup v1.4s, %w3 \n" // dither4 "1: \n" "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 ARGB @@ -2296,7 +2296,7 @@ void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, void ARGBToARGB1555Row_NEON(const uint8_t* src_argb, uint8_t* dst_argb1555, int width) { - asm volatile( + asm volatile ( "1: \n" "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 // pixels @@ -2314,7 +2314,7 @@ void ARGBToARGB1555Row_NEON(const uint8_t* src_argb, void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, uint8_t* dst_argb4444, int width) { - asm volatile( + asm volatile ( "1: \n" "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 // pixels @@ -2333,7 +2333,7 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, void ARGBToAR64Row_NEON(const uint8_t* src_argb, uint16_t* dst_ar64, int width) { - asm volatile( + asm volatile ( "1: \n" "ldp q0, q2, [%0], #32 \n" // load 8 pixels "mov v1.16b, v0.16b \n" @@ -2356,7 +2356,7 @@ static const uvec8 kShuffleARGBToABGR = {2, 1, 0, 3, 6, 5, 4, 7, void ARGBToAB64Row_NEON(const uint8_t* src_argb, uint16_t* dst_ab64, int width) { - asm volatile( + asm volatile ( "ldr q4, [%3] \n" // shuffler "1: \n" "ldp q0, q2, [%0], #32 \n" // load 8 pixels @@ -2379,7 +2379,7 @@ void ARGBToAB64Row_NEON(const uint8_t* src_argb, void ARGBToAR64Row_NEON(const uint8_t* src_argb, uint16_t* dst_ar64, int width) { - asm volatile( + asm volatile ( "1: \n" "ldp q0, q1, [%0], #32 \n" // load 8 ARGB pixels "subs %w2, %w2, #8 \n" // 8 processed per loop. 
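Note: the AR64 path widens each 8-bit channel to 16 bits; the mov-then-interleave sequences above duplicate every byte into both halves of the 16-bit result (v * 0x101), so 0x00 maps to 0x0000 and 0xff to 0xffff. A scalar sketch (hypothetical name):

    #include <stdint.h>

    // ARGB (8 bits/channel) -> AR64 (16 bits/channel) by byte replication,
    // covering the full 16-bit range without changing relative intensity.
    static void ARGBToAR64Row_Sketch(const uint8_t* src_argb, uint16_t* dst_ar64,
                                     int width) {
      for (int i = 0; i < width * 4; ++i) {       // 4 channels per pixel
        dst_ar64[i] = (uint16_t)(src_argb[i] * 0x101);
      }
    }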
@@ -2404,7 +2404,7 @@ static const uvec8 kShuffleARGBToAB64[2] = { void ARGBToAB64Row_NEON(const uint8_t* src_argb, uint16_t* dst_ab64, int width) { - asm volatile( + asm volatile ( "ldp q6, q7, [%3] \n" // 2 shufflers "1: \n" "ldp q0, q1, [%0], #32 \n" // load 8 pixels @@ -2430,7 +2430,7 @@ static const uvec8 kShuffleAR64ToARGB = {1, 3, 5, 7, 9, 11, 13, 15, void AR64ToARGBRow_NEON(const uint16_t* src_ar64, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( "ldr q4, [%3] \n" // shuffler "1: \n" "ldp q0, q1, [%0], #32 \n" // load 4 pixels @@ -2454,7 +2454,7 @@ static const uvec8 kShuffleAB64ToARGB = {5, 3, 1, 7, 13, 11, 9, 15, void AB64ToARGBRow_NEON(const uint16_t* src_ab64, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( "ldr q4, [%3] \n" // shuffler "1: \n" "ldp q0, q1, [%0], #32 \n" // load 4 pixels @@ -2475,7 +2475,7 @@ void AB64ToARGBRow_NEON(const uint16_t* src_ab64, void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, uint8_t* dst_a, int width) { - asm volatile( + asm volatile ( "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 "prfm pldl1keep, [%0, 448] \n" @@ -2501,7 +2501,7 @@ void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb, uint8_t* dst_v, int width, const struct RgbUVConstants* rgbuvconstants) { - asm volatile( + asm volatile ( "ldr d0, [%4] \n" // load rgbuvconstants "dup v24.16b, v0.b[0] \n" // UB 0.875 coefficient "dup v25.16b, v0.b[1] \n" // UG -0.5781 coefficient @@ -3009,7 +3009,7 @@ void RGB565ToUVRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_v, int width) { const uint8_t* src_rgb565_1 = src_rgb565 + src_stride_rgb565; - asm volatile( + asm volatile ( RGBTOUV_SETUP_REG "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. @@ -3067,7 +3067,7 @@ void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555, uint8_t* dst_v, int width) { const uint8_t* src_argb1555_1 = src_argb1555 + src_stride_argb1555; - asm volatile( + asm volatile ( RGBTOUV_SETUP_REG "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. @@ -3125,7 +3125,7 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, uint8_t* dst_v, int width) { const uint8_t* src_argb4444_1 = src_argb4444 + src_stride_argb4444; - asm volatile( + asm volatile ( RGBTOUV_SETUP_REG // sets v20-v25 "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. 
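Note: the ToUVRow kernels above all share one shape: average a 2x2 block (two rows via src_stride, two adjacent columns), then apply the U/V matrix with a 0x8080 bias that rounds and re-centers on 128. A scalar sketch of the matrix step, assuming the classic BT.601 constants that the 0.875 / -0.5781 style comments reference (the matrix-row variants load these from rgbuvconstants instead):

    #include <stdint.h>

    // Coefficients scaled so the >>8 plus the 0x8080 bias lands U/V in 16..240.
    static inline uint8_t RGBToU_Sketch(int r, int g, int b) {
      return (uint8_t)((112 * b - 74 * g - 38 * r + 0x8080) >> 8);
    }
    static inline uint8_t RGBToV_Sketch(int r, int g, int b) {
      return (uint8_t)((112 * r - 94 * g - 18 * b + 0x8080) >> 8);
    }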
@@ -3179,7 +3179,7 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, } void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { - asm volatile( + asm volatile ( "movi v24.8b, #25 \n" // B * 0.1016 coefficient "movi v25.8b, #129 \n" // G * 0.5078 coefficient "movi v26.8b, #66 \n" // R * 0.2578 coefficient @@ -3207,7 +3207,7 @@ void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, uint8_t* dst_y, int width) { - asm volatile( + asm volatile ( "movi v4.8b, #25 \n" // B * 0.1016 coefficient "movi v5.8b, #129 \n" // G * 0.5078 coefficient "movi v6.8b, #66 \n" // R * 0.2578 coefficient @@ -3234,7 +3234,7 @@ void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, uint8_t* dst_y, int width) { - asm volatile( + asm volatile ( "movi v24.8b, #25 \n" // B * 0.1016 coefficient "movi v25.8b, #129 \n" // G * 0.5078 coefficient "movi v26.8b, #66 \n" // R * 0.2578 coefficient @@ -3268,7 +3268,7 @@ void ARGBToYMatrixRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width, const struct RgbConstants* rgbconstants) { - asm volatile( + asm volatile ( "ldr d0, [%3] \n" // load rgbconstants "dup v6.16b, v0.b[0] \n" "dup v7.16b, v0.b[1] \n" @@ -3302,7 +3302,7 @@ ARGBToYMatrixRow_NEON_DotProd(const uint8_t* src_argb, uint8_t* dst_y, int width, const struct RgbConstants* rgbconstants) { - asm volatile( + asm volatile ( "ldr d0, [%3] \n" // load rgbconstants "dup v16.4s, v0.s[0] \n" "dup v17.8h, v0.h[2] \n" @@ -3404,7 +3404,7 @@ void RGBAToYMatrixRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width, const struct RgbConstants* rgbconstants) { - asm volatile( + asm volatile ( "ldr d0, [%3] \n" // load rgbconstants "dup v6.16b, v0.b[0] \n" "dup v7.16b, v0.b[1] \n" @@ -3476,7 +3476,7 @@ void RGBToYMatrixRow_NEON(const uint8_t* src_rgb, uint8_t* dst_y, int width, const struct RgbConstants* rgbconstants) { - asm volatile( + asm volatile ( "ldr d0, [%3] \n" // load rgbconstants "dup v5.16b, v0.b[0] \n" "dup v6.16b, v0.b[1] \n" @@ -3528,7 +3528,7 @@ void InterpolateRow_NEON(uint8_t* dst_ptr, int y1_fraction = source_y_fraction; int y0_fraction = 256 - y1_fraction; const uint8_t* src_ptr1 = src_ptr + src_stride; - asm volatile( + asm volatile ( "cmp %w4, #0 \n" "b.eq 100f \n" "cmp %w4, #128 \n" @@ -3594,7 +3594,7 @@ void InterpolateRow_16_NEON(uint16_t* dst_ptr, int y0_fraction = 256 - y1_fraction; const uint16_t* src_ptr1 = src_ptr + src_stride; - asm volatile( + asm volatile ( "cmp %w4, #0 \n" "b.eq 100f \n" "cmp %w4, #128 \n" @@ -3666,7 +3666,7 @@ void InterpolateRow_16To8_NEON(uint8_t* dst_ptr, const uint16_t* src_ptr1 = src_ptr + src_stride; int shift = 15 - __builtin_clz((int32_t)scale); // Negative shl is shr - asm volatile( + asm volatile ( "dup v6.8h, %w6 \n" "cmp %w4, #0 \n" "b.eq 100f \n" @@ -3734,7 +3734,7 @@ void ARGBBlendRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( "subs %w3, %w3, #8 \n" "b.lt 89f \n" // Blend 8 pixels. @@ -3805,7 +3805,7 @@ void ARGBBlendRow_NEON(const uint8_t* src_argb, void ARGBAttenuateRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( "movi v7.8h, #0x00ff \n" // 255 for rounding up // Attenuate 8 pixels. 
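Note: ARGBAttenuateRow premultiplies each color channel by the pixel's alpha; the v7 constant commented as "255 for rounding up" suggests the rounding form below, which biases upward so a channel of 255 attenuates exactly to a. A per-pixel sketch (exact rounding has varied across libyuv revisions, so treat this as illustrative):

    #include <stdint.h>

    // Premultiply one ARGB pixel's B, G, R by its alpha; alpha is unchanged.
    static inline void AttenuatePixel_Sketch(uint8_t* bgra) {
      uint32_t a = bgra[3];
      bgra[0] = (uint8_t)((bgra[0] * a + 255) >> 8);  // B
      bgra[1] = (uint8_t)((bgra[1] * a + 255) >> 8);  // G
      bgra[2] = (uint8_t)((bgra[2] * a + 255) >> 8);  // R
    }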
@@ -3835,7 +3835,7 @@ void ARGBQuantizeRow_NEON(uint8_t* dst_argb, int interval_size, int interval_offset, int width) { - asm volatile( + asm volatile ( "dup v4.8h, %w2 \n" "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1 "dup v5.8h, %w3 \n" // interval multiply. @@ -3878,7 +3878,7 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width, uint32_t value) { - asm volatile( + asm volatile ( "dup v0.4s, %w3 \n" // duplicate scale value. "zip1 v0.16b, v0.16b, v0.16b \n" // v0.16b aarrggbbaarrggbb. "ushr v0.8h, v0.8h, #1 \n" // scale / 2. @@ -3913,7 +3913,7 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb, // Similar to ARGBToYJ but stores ARGB. // C code is (29 * b + 150 * g + 77 * r + 128) >> 8; void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( "movi v24.8b, #29 \n" // B * 0.1140 coefficient "movi v25.8b, #150 \n" // G * 0.5870 coefficient "movi v26.8b, #77 \n" // R * 0.2990 coefficient @@ -3942,7 +3942,7 @@ void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { // r = (r * 50 + g * 98 + b * 24) >> 7 void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( "movi v20.8b, #17 \n" // BB coefficient "movi v21.8b, #68 \n" // BG coefficient "movi v22.8b, #35 \n" // BR coefficient @@ -3984,7 +3984,7 @@ void ARGBColorMatrixRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, const int8_t* matrix_argb, int width) { - asm volatile( + asm volatile ( "ld1 {v2.16b}, [%3] \n" // load 3 ARGB vectors. "sxtl v0.8h, v2.8b \n" // B,G coefficients s16. "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16. @@ -4100,7 +4100,7 @@ void ARGBMultiplyRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( // 8 pixel loop. "1: \n" "ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB @@ -4131,7 +4131,7 @@ void ARGBAddRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( // 8 pixel loop. "1: \n" "ldp q0, q1, [%0], #32 \n" // load 8 ARGB @@ -4156,7 +4156,7 @@ void ARGBSubtractRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( // 8 pixel loop. "1: \n" "ldp q0, q1, [%0], #32 \n" // load 8 ARGB @@ -4185,7 +4185,7 @@ void SobelRow_NEON(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( "movi v3.8b, #255 \n" // alpha // 8 pixel loop. "1: \n" @@ -4212,7 +4212,7 @@ void SobelToPlaneRow_NEON(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_y, int width) { - asm volatile( + asm volatile ( // 16 pixel loop. "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx. @@ -4240,7 +4240,7 @@ void SobelXYRow_NEON(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( "movi v3.8b, #255 \n" // alpha // 8 pixel loop. 
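Note: the gray and sepia kernels above carry their math in comments, e.g. "(29 * b + 150 * g + 77 * r + 128) >> 8" and "r = (r * 50 + g * 98 + b * 24) >> 7". A scalar transcription (hypothetical helpers; the sepia green-row constants are not visible in this hunk and are assumed here, so treat them as illustrative):

    #include <stdint.h>

    static inline uint8_t Clamp255(int v) { return (uint8_t)(v > 255 ? 255 : v); }

    // Gray: luma from the commented formula, replicated into B, G and R.
    static inline void GrayPixel_Sketch(uint8_t* bgra) {
      uint8_t y =
          (uint8_t)((29 * bgra[0] + 150 * bgra[1] + 77 * bgra[2] + 128) >> 8);
      bgra[0] = bgra[1] = bgra[2] = y;  // alpha (bgra[3]) left as-is
    }

    // Sepia: per-channel dot products scaled by 128. The blue row (17/68/35)
    // and red row (24/98/50) come from the comments above; the green row
    // (22/88/45) is assumed. Sums can exceed 255, hence the clamp.
    static inline void SepiaPixel_Sketch(uint8_t* bgra) {
      int b = bgra[0], g = bgra[1], r = bgra[2];
      bgra[0] = Clamp255((b * 17 + g * 68 + r * 35) >> 7);
      bgra[1] = Clamp255((b * 22 + g * 88 + r * 45) >> 7);  // assumed row
      bgra[2] = Clamp255((b * 24 + g * 98 + r * 50) >> 7);
    }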
"1: \n" @@ -4269,7 +4269,7 @@ void SobelXRow_NEON(const uint8_t* src_y0, const uint8_t* src_y2, uint8_t* dst_sobelx, int width) { - asm volatile( + asm volatile ( "1: \n" "ld1 {v0.8b}, [%0],%5 \n" // top "ld1 {v1.8b}, [%0],%6 \n" @@ -4310,7 +4310,7 @@ void SobelYRow_NEON(const uint8_t* src_y0, const uint8_t* src_y1, uint8_t* dst_sobely, int width) { - asm volatile( + asm volatile ( "1: \n" "ld1 {v0.8b}, [%0],%4 \n" // left "ld1 {v1.8b}, [%1],%4 \n" @@ -4346,7 +4346,7 @@ void HalfFloat1Row_NEON(const uint16_t* src, uint16_t* dst, float /*unused*/, int width) { - asm volatile( + asm volatile ( "1: \n" "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts "subs %w2, %w2, #8 \n" // 8 pixels per loop @@ -4370,7 +4370,7 @@ void HalfFloatRow_NEON(const uint16_t* src, uint16_t* dst, float scale, int width) { - asm volatile( + asm volatile ( "1: \n" "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts "subs %w2, %w2, #8 \n" // 8 pixels per loop @@ -4396,7 +4396,7 @@ void ByteToFloatRow_NEON(const uint8_t* src, float* dst, float scale, int width) { - asm volatile( + asm volatile ( "1: \n" "ld1 {v1.8b}, [%0], #8 \n" // load 8 bytes "subs %w2, %w2, #8 \n" // 8 pixels per loop @@ -4421,7 +4421,7 @@ void ByteToFloatRow_NEON(const uint8_t* src, void ConvertFP16ToFP32Row_NEON(const uint16_t* src, // fp16 float* dst, int width) { - asm volatile( + asm volatile ( "1: \n" "ld1 {v1.8h}, [%0], #16 \n" // load 8 halffloats "subs %w2, %w2, #8 \n" // 8 floats per loop @@ -4443,7 +4443,7 @@ void ConvertFP16ToFP32Column_NEON(const uint16_t* src, // fp16 int src_stride, // stride in elements float* dst, int width) { - asm volatile( + asm volatile ( "cmp %w2, #8 \n" // Is there 8 rows? "b.lo 2f \n" "1: \n" @@ -4481,7 +4481,7 @@ void ConvertFP16ToFP32Column_NEON(const uint16_t* src, // fp16 void ConvertFP32ToFP16Row_NEON(const float* src, uint16_t* dst, // fp16 int width) { - asm volatile( + asm volatile ( "1: \n" "ldp q2, q3, [%0], #32 \n" // load 8 floats "subs %w2, %w2, #8 \n" // 8 floats per loop @@ -4502,7 +4502,7 @@ float ScaleMaxSamples_NEON(const float* src, float scale, int width) { float fmax; - asm volatile( + asm volatile ( "movi v5.4s, #0 \n" // max "movi v6.4s, #0 \n" @@ -4532,7 +4532,7 @@ float ScaleSumSamples_NEON(const float* src, float scale, int width) { float fsum; - asm volatile( + asm volatile ( "movi v5.4s, #0 \n" // max "movi v6.4s, #0 \n" // max @@ -4559,7 +4559,7 @@ float ScaleSumSamples_NEON(const float* src, } void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) { - asm volatile( + asm volatile ( "1: \n" "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples "prfm pldl1keep, [%0, 448] \n" @@ -4583,7 +4583,7 @@ void GaussCol_NEON(const uint16_t* src0, const uint16_t* src4, uint32_t* dst, int width) { - asm volatile( + asm volatile ( "movi v6.8h, #4 \n" // constant 4 "movi v7.8h, #6 \n" // constant 6 @@ -4625,7 +4625,7 @@ void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) { const uint32_t* src1 = src + 1; const uint32_t* src2 = src + 2; const uint32_t* src3 = src + 3; - asm volatile( + asm volatile ( "movi v6.4s, #4 \n" // constant 4 "movi v7.4s, #6 \n" // constant 6 @@ -4668,7 +4668,7 @@ void GaussCol_F32_NEON(const float* src0, const float* src4, float* dst, int width) { - asm volatile( + asm volatile ( "ld2r {v6.4s, v7.4s}, [%7] \n" // constants 4 and 6 "1: \n" @@ -4706,7 +4706,7 @@ void GaussCol_F32_NEON(const float* src0, // filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. 
void GaussRow_F32_NEON(const float* src, float* dst, int width) { - asm volatile( + asm volatile ( "ld3r {v6.4s, v7.4s, v8.4s}, [%3] \n" // constants 4, 6, 1/256 "1: \n" @@ -4745,7 +4745,7 @@ void NV21ToYUV24Row_NEON(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_yuv24, int width) { - asm volatile( + asm volatile ( "1: \n" "ld1 {v2.16b}, [%0], #16 \n" // load 16 Y values "ld2 {v0.8b, v1.8b}, [%1], #16 \n" // load 8 VU values @@ -4776,7 +4776,7 @@ void NV21ToYUV24Row_NEON(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_yuv24, int width) { - asm volatile( + asm volatile ( "ld1 {v5.16b,v6.16b,v7.16b}, [%4] \n" // 3 shuffler constants "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 16 Y values @@ -4806,7 +4806,7 @@ void AYUVToUVRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_uv, int width) { const uint8_t* src_ayuv_1 = src_ayuv + src_stride_ayuv; - asm volatile( + asm volatile ( "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ayuv @@ -4835,7 +4835,7 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv, uint8_t* dst_vu, int width) { const uint8_t* src_ayuv_1 = src_ayuv + src_stride_ayuv; - asm volatile( + asm volatile ( "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ayuv @@ -4861,7 +4861,7 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv, // Copy row of AYUV Y's into Y void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) { - asm volatile( + asm volatile ( "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 "subs %w2, %w2, #16 \n" // 16 pixels per loop @@ -4877,7 +4877,7 @@ void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) { // Convert UV plane of NV12 to VU of NV21. void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) { - asm volatile( + asm volatile ( "1: \n" "ld1 {v0.16b}, [%0], 16 \n" // load 16 UV values "ld1 {v1.16b}, [%0], 16 \n" @@ -4902,7 +4902,7 @@ void HalfMergeUVRow_NEON(const uint8_t* src_u, int width) { const uint8_t* src_u_1 = src_u + src_stride_u; const uint8_t* src_v_1 = src_v + src_stride_v; - asm volatile( + asm volatile ( "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 16 U values "ld1 {v1.16b}, [%2], #16 \n" // load 16 V values @@ -4937,7 +4937,7 @@ void SplitUVRow_16_NEON(const uint16_t* src_uv, int depth, int width) { int shift = depth - 16; // Negative for right shift. - asm volatile( + asm volatile ( "dup v2.8h, %w4 \n" "1: \n" "ld2 {v0.8h, v1.8h}, [%0], #32 \n" // load 8 UV @@ -4960,7 +4960,7 @@ void MultiplyRow_16_NEON(const uint16_t* src_y, uint16_t* dst_y, int scale, int width) { - asm volatile( + asm volatile ( "dup v2.8h, %w3 \n" "1: \n" "ldp q0, q1, [%0], #32 \n" @@ -4981,7 +4981,7 @@ void DivideRow_16_NEON(const uint16_t* src_y, uint16_t* dst_y, int scale, int width) { - asm volatile( + asm volatile ( "dup v4.8h, %w3 \n" "1: \n" "ldp q2, q3, [%0], #32 \n" @@ -5015,7 +5015,7 @@ void Convert16To8Row_NEON(const uint16_t* src_y, // saturate, then we can just use UZP2 to narrow rather than a pair of // saturating narrow instructions. int shift = 23 - __builtin_clz((int32_t)scale); - asm volatile( + asm volatile ( "dup v2.8h, %w3 \n" "1: \n" "ldp q0, q1, [%0], #32 \n" diff --git a/source/row_rvv.cc b/source/row_rvv.cc index 3b04ec5f7..0533866c0 100644 --- a/source/row_rvv.cc +++ b/source/row_rvv.cc @@ -47,7 +47,7 @@ extern "C" { // register) is set to round-to-nearest-up mode(0). 
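Note on the csrwi hunks below: they only adjust spacing, since a GNU asm statement with no output operands is already implicitly volatile. The substantive edits in this patch are the ones adding volatile to asm blocks that do have outputs, which stops the optimizer from deleting or reordering them when their results look unused. A small RISC-V illustration (hypothetical helpers; rdcycle is a standard counter-read instruction, not from this patch):

    #include <stdint.h>

    // No outputs: implicitly volatile, so the explicit keyword is documentation.
    static inline void SetVxrmRoundToNearestUp(void) {
      asm volatile("csrwi vxrm, 0");
    }

    // Has an output: without volatile the compiler may treat the asm as a pure
    // function of its inputs and delete or hoist it; volatile forbids that.
    static inline uint32_t ReadCycleCounter(void) {
      uint32_t cycles;
      asm volatile("rdcycle %0" : "=r"(cycles));
      return cycles;
    }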
#define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, yg, bb, bg, br) \ { \ - asm volatile("csrwi vxrm, 0"); \ + asm volatile ("csrwi vxrm, 0"); \ ub = yuvconst->kUVCoeff[0]; \ vr = yuvconst->kUVCoeff[1]; \ ug = yuvconst->kUVCoeff[2]; \ @@ -1238,7 +1238,7 @@ void I400ToARGBRow_RVV(const uint8_t* src_y, vuint16m4_t v_yg = __riscv_vmv_v_x_u16m4(yuvconstants->kRGBCoeffBias[0], vl); // To match behavior on other platforms, vxrm (fixed-point rounding mode // register) sets to round-to-nearest-up mode(0). - asm volatile("csrwi vxrm, 0"); + asm volatile ("csrwi vxrm, 0"); if (is_yb_positive) { v_yb = __riscv_vmv_v_x_u16m4(yuvconstants->kRGBCoeffBias[4] - 32, vl); } else { @@ -1632,7 +1632,7 @@ void InterpolateRow_RVV(uint8_t* dst_ptr, } // To match behavior on other platforms, vxrm (fixed-point rounding mode // register) is set to round-to-nearest-up(0). - asm volatile("csrwi vxrm, 0"); + asm volatile ("csrwi vxrm, 0"); // Blend 50 / 50. if (y1_fraction == 128) { do { diff --git a/source/row_sve.cc b/source/row_sve.cc index f6c1fe624..89a86d53b 100644 --- a/source/row_sve.cc +++ b/source/row_sve.cc @@ -139,7 +139,8 @@ void I444ToARGBRow_SVE2(const uint8_t* src_y, const struct YuvConstants* yuvconstants, int width) { uint64_t vl; - asm("cnth %[vl] \n" + asm volatile ( + "cnth %[vl] \n" "ptrue p0.b \n" YUVTORGB_SVE_SETUP "dup z19.b, #255 \n" /* A */ "subs %w[width], %w[width], %w[vl] \n" @@ -181,7 +182,8 @@ void I400ToARGBRow_SVE2(const uint8_t* src_y, const struct YuvConstants* yuvconstants, int width) { uint64_t vl; - asm("cnth %[vl] \n" + asm volatile ( + "cnth %[vl] \n" "ptrue p0.b \n" "dup z19.b, #255 \n" // A YUVTORGB_SVE_SETUP @@ -229,7 +231,8 @@ void I422ToARGBRow_SVE2(const uint8_t* src_y, const struct YuvConstants* yuvconstants, int width) { uint64_t vl; - asm("cnth %[vl] \n" + asm volatile ( + "cnth %[vl] \n" "ptrue p0.b \n" YUVTORGB_SVE_SETUP "dup z19.b, #255 \n" /* A */ "subs %w[width], %w[width], %w[vl] \n" @@ -273,7 +276,8 @@ void I422ToRGBARow_SVE2(const uint8_t* src_y, const struct YuvConstants* yuvconstants, int width) { uint64_t vl; - asm("cnth %[vl] \n" + asm volatile ( + "cnth %[vl] \n" "ptrue p0.b \n" YUVTORGB_SVE_SETUP "dup z19.b, #255 \n" // A "subs %w[width], %w[width], %w[vl] \n" @@ -318,7 +322,8 @@ void I444AlphaToARGBRow_SVE2(const uint8_t* src_y, const struct YuvConstants* yuvconstants, int width) { uint64_t vl; - asm("cnth %[vl] \n" + asm volatile ( + "cnth %[vl] \n" "ptrue p0.b \n" YUVTORGB_SVE_SETUP "subs %w[width], %w[width], %w[vl] \n" "b.lt 2f \n" @@ -366,7 +371,8 @@ void I422AlphaToARGBRow_SVE2(const uint8_t* src_y, const struct YuvConstants* yuvconstants, int width) { uint64_t vl; - asm("cnth %[vl] \n" + asm volatile ( + "cnth %[vl] \n" "ptrue p0.b \n" YUVTORGB_SVE_SETUP "subs %w[width], %w[width], %w[vl] \n" "b.lt 2f \n" @@ -416,11 +422,13 @@ static inline void NVToARGBRow_SVE2(const uint8_t* src_y, uint32_t nv_v_start, uint32_t nv_v_step) { uint64_t vl; - asm("cnth %0" : "=r"(vl)); + asm volatile ( + "cnth %0" : "=r"(vl)); int width_last_y = width & (vl - 1); width_last_y = width_last_y == 0 ? 
vl : width_last_y; int width_last_uv = width_last_y + (width_last_y & 1); - asm("ptrue p0.b \n" YUVTORGB_SVE_SETUP + asm volatile ( + "ptrue p0.b \n" YUVTORGB_SVE_SETUP "index z22.s, %w[nv_u_start], %w[nv_u_step] \n" "index z23.s, %w[nv_v_start], %w[nv_v_step] \n" "dup z19.b, #255 \n" // A @@ -534,7 +542,7 @@ void ARGBToUVMatrixRow_SVE2(const uint8_t* src_argb, const int16_t* uvconstants) { const uint8_t* src_argb_1 = src_argb + src_stride_argb; uint64_t vl; - asm volatile( + asm volatile ( "ptrue p0.b \n" "ld1rd {z24.d}, p0/z, [%[uvconstants]] \n" "ld1rd {z25.d}, p0/z, [%[uvconstants], #8] \n" @@ -746,7 +754,8 @@ void ARGBToRGB565Row_SVE2(const uint8_t* src_argb, unsigned bsl_mask = 0x7e0; uint64_t vl; width *= 2; - asm("mov z3.h, #3 \n" + asm volatile ( + "mov z3.h, #3 \n" "dup z4.h, %w[bsl_mask] \n" "cntb %[vl] \n" @@ -787,7 +796,8 @@ void ARGBToRGB565DitherRow_SVE2(const uint8_t* src_argb, unsigned bsl_mask = 0x7e0; uint64_t vl; width *= 2; - asm("mov z3.h, #3 \n" + asm volatile ( + "mov z3.h, #3 \n" "dup z4.h, %w[bsl_mask] \n" "dup z2.s, %w[dither4] \n" "zip1 z2.b, z2.b, z2.b \n" @@ -844,7 +854,8 @@ void ARGB1555ToARGBRow_SVE2(const uint8_t* src_argb1555, uint8_t* dst_argb, int width) { uint64_t vl; - asm("mov z4.h, #0x0300 \n" + asm volatile ( + "mov z4.h, #0x0300 \n" "ptrue p0.b \n" "cnth %x[vl] \n" @@ -912,7 +923,8 @@ void AYUVToUVRow_SVE2(const uint8_t* src_ayuv, // Output a row of UV values, filtering 2x2 rows of AYUV. const uint8_t* src_ayuv1 = src_ayuv + src_stride_ayuv; int vl; - asm("cntb %x[vl] \n" + asm volatile ( + "cntb %x[vl] \n" "subs %w[width], %w[width], %w[vl] \n" "b.lt 2f \n" @@ -950,7 +962,8 @@ void AYUVToVURow_SVE2(const uint8_t* src_ayuv, // Output a row of VU values, filtering 2x2 rows of AYUV. const uint8_t* src_ayuv1 = src_ayuv + src_stride_ayuv; int vl; - asm("cntb %x[vl] \n" + asm volatile ( + "cntb %x[vl] \n" "cmp %w[width], %w[vl] \n" "subs %w[width], %w[width], %w[vl] \n" "b.lt 2f \n" @@ -990,10 +1003,12 @@ void YUY2ToARGBRow_SVE2(const uint8_t* src_yuy2, uint32_t nv_v_start = 0x0003'0003U; uint32_t nv_v_step = 0x0004'0004U; uint64_t vl; - asm("cnth %0" : "=r"(vl)); + asm volatile ( + "cnth %0" : "=r"(vl)); int width_last_y = width & (vl - 1); int width_last_uv = width_last_y + (width_last_y & 1); - asm("ptrue p0.b \n" + asm volatile ( + "ptrue p0.b \n" "index z22.s, %w[nv_u_start], %w[nv_u_step] \n" "index z23.s, %w[nv_v_start], %w[nv_v_step] \n" "dup z19.b, #255 \n" // A @@ -1047,10 +1062,12 @@ void UYVYToARGBRow_SVE2(const uint8_t* src_uyvy, uint32_t nv_v_start = 0x0002'0002U; uint32_t nv_v_step = 0x0004'0004U; uint64_t vl; - asm("cnth %0" : "=r"(vl)); + asm volatile ( + "cnth %0" : "=r"(vl)); int width_last_y = width & (vl - 1); int width_last_uv = width_last_y + (width_last_y & 1); - asm("ptrue p0.b \n" + asm volatile ( + "ptrue p0.b \n" "index z22.s, %w[nv_u_start], %w[nv_u_step] \n" "index z23.s, %w[nv_v_start], %w[nv_v_step] \n" "dup z19.b, #255 \n" // A diff --git a/source/scale_gcc.cc b/source/scale_gcc.cc index 304770d0c..9dfe64a93 100644 --- a/source/scale_gcc.cc +++ b/source/scale_gcc.cc @@ -193,7 +193,7 @@ void ScaleRowDown2_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm(LABELALIGN + asm volatile (LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" @@ -472,7 +472,7 @@ void ScaleRowDown34_SSSE3(const uint8_t* src_ptr, "m"(kShuf1), // %1 "m"(kShuf2) // %2 ); - asm(LABELALIGN + asm volatile (LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm2 \n" @@ -515,7 +515,7 @@ 
void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr, "m"(kMadd11), // %1 "m"(kRound34) // %2 ); - asm(LABELALIGN + asm volatile (LABELALIGN "1: \n" "movdqu (%0),%%xmm6 \n" "movdqu 0x00(%0,%3,1),%%xmm7 \n" @@ -578,7 +578,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr, "m"(kRound34) // %2 ); - asm(LABELALIGN + asm volatile (LABELALIGN "1: \n" "movdqu (%0),%%xmm6 \n" "movdqu 0x00(%0,%3,1),%%xmm7 \n" @@ -667,7 +667,7 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr, "m"(kShufAb2), // %2 "m"(kScaleAb2) // %3 ); - asm(LABELALIGN + asm volatile (LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x00(%0,%3,1),%%xmm1 \n" @@ -708,7 +708,7 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, "m"(kShufAc3), // %1 "m"(kScaleAc33) // %2 ); - asm(LABELALIGN + asm volatile (LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x00(%0,%3,1),%%xmm6 \n" @@ -821,7 +821,7 @@ void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { - asm(LABELALIGN + asm volatile (LABELALIGN "1: \n" "pxor %%xmm0,%%xmm0 \n" // 0 // above line @@ -1900,7 +1900,7 @@ void ScaleColsUp2_SSE2(uint8_t* dst_ptr, int dx) { (void)x; (void)dx; - asm(LABELALIGN + asm volatile (LABELALIGN "1: \n" "movdqu (%1),%%xmm0 \n" "lea 0x10(%1),%1 \n" @@ -1925,7 +1925,7 @@ void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb, uint8_t* dst_argb, int dst_width) { (void)src_stride; - asm(LABELALIGN + asm volatile (LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -1947,7 +1947,7 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb, uint8_t* dst_argb, int dst_width) { (void)src_stride; - asm(LABELALIGN + asm volatile (LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -1971,7 +1971,7 @@ void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb, ptrdiff_t src_stride, uint8_t* dst_argb, int dst_width) { - asm(LABELALIGN + asm volatile (LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -2153,7 +2153,7 @@ void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb, int dx) { (void)x; (void)dx; - asm(LABELALIGN + asm volatile (LABELALIGN "1: \n" "movdqu (%1),%%xmm0 \n" "lea 0x10(%1),%1 \n" diff --git a/source/scale_neon.cc b/source/scale_neon.cc index ccc751062..309d7b0bf 100644 --- a/source/scale_neon.cc +++ b/source/scale_neon.cc @@ -28,7 +28,7 @@ void ScaleRowDown2_NEON(const uint8_t* src_ptr, uint8_t* dst, int dst_width) { (void)src_stride; - asm volatile( + asm volatile ( "1: \n" // load even pixels into q0, odd into q1 "vld2.8 {q0, q1}, [%0]! \n" @@ -49,7 +49,7 @@ void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr, uint8_t* dst, int dst_width) { (void)src_stride; - asm volatile( + asm volatile ( "1: \n" "vld2.8 {q0, q1}, [%0]! \n" // load 32 pixels "subs %2, %2, #16 \n" // 16 processed per loop @@ -69,7 +69,7 @@ void ScaleRowDown2Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width) { - asm volatile( + asm volatile ( // change the stride to row 2 pointer "add %1, %0 \n" "1: \n" @@ -100,7 +100,7 @@ void ScaleRowDown4_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm volatile( + asm volatile ( "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! 
\n" // src line 0 "subs %2, %2, #8 \n" // 8 processed per loop @@ -120,7 +120,7 @@ void ScaleRowDown4Box_NEON(const uint8_t* src_ptr, const uint8_t* src_ptr1 = src_ptr + src_stride; const uint8_t* src_ptr2 = src_ptr + src_stride * 2; const uint8_t* src_ptr3 = src_ptr + src_stride * 3; - asm volatile( + asm volatile ( "1: \n" "vld1.8 {q0}, [%0]! \n" // load up 16x4 "vld1.8 {q1}, [%3]! \n" @@ -154,7 +154,7 @@ void ScaleRowDown34_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm volatile( + asm volatile ( "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 "subs %2, %2, #24 \n" @@ -172,7 +172,7 @@ void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { - asm volatile( + asm volatile ( "vmov.u8 d24, #3 \n" "add %3, %0 \n" "1: \n" @@ -229,7 +229,7 @@ void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { - asm volatile( + asm volatile ( "vmov.u8 d24, #3 \n" "add %3, %0 \n" "1: \n" @@ -281,7 +281,7 @@ void ScaleRowDown38_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm volatile( + asm volatile ( "vld1.8 {q3}, [%3] \n" "1: \n" "vld1.8 {d0, d1, d2, d3}, [%0]! \n" @@ -305,7 +305,7 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr, int dst_width) { const uint8_t* src_ptr1 = src_ptr + src_stride * 2; - asm volatile( + asm volatile ( "vld1.16 {q13}, [%5] \n" "vld1.8 {q14}, [%6] \n" "vld1.8 {q15}, [%7] \n" @@ -415,7 +415,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { - asm volatile( + asm volatile ( "vld1.16 {q13}, [%4] \n" "vld1.8 {q14}, [%5] \n" "add %3, %0 \n" @@ -508,7 +508,7 @@ void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { const uint8_t* src_temp = src_ptr + 1; - asm volatile( + asm volatile ( "vmov.u8 d30, #3 \n" "1: \n" @@ -545,7 +545,7 @@ void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr, const uint8_t* src_temp = src_ptr + 1; const uint8_t* src_temp1 = src_ptr1 + 1; - asm volatile( + asm volatile ( "vmov.u16 q15, #3 \n" "vmov.u8 d28, #3 \n" @@ -607,7 +607,7 @@ void ScaleRowUp2_Linear_12_NEON(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { const uint16_t* src_temp = src_ptr + 1; - asm volatile( + asm volatile ( "vmov.u16 q15, #3 \n" "1: \n" @@ -643,7 +643,7 @@ void ScaleRowUp2_Bilinear_12_NEON(const uint16_t* src_ptr, const uint16_t* src_temp = src_ptr + 1; const uint16_t* src_temp1 = src_ptr1 + 1; - asm volatile( + asm volatile ( "vmov.u16 q15, #3 \n" "1: \n" @@ -694,7 +694,7 @@ void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { const uint16_t* src_temp = src_ptr + 1; - asm volatile( + asm volatile ( "vmov.u16 d31, #3 \n" "1: \n" @@ -738,7 +738,7 @@ void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, const uint16_t* src_temp = src_ptr + 1; const uint16_t* src_temp1 = src_ptr1 + 1; - asm volatile( + asm volatile ( "vmov.u16 d31, #3 \n" "vmov.u32 q14, #3 \n" @@ -790,7 +790,7 @@ void ScaleUVRowUp2_Linear_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { const uint8_t* src_temp = src_ptr + 2; - asm volatile( + asm volatile ( "vmov.u8 d30, #3 \n" "1: \n" @@ -827,7 +827,7 @@ void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr, const uint8_t* src_temp = src_ptr + 2; const uint8_t* src_temp1 = src_ptr1 + 2; - asm volatile( + asm volatile ( "vmov.u16 q15, #3 \n" "vmov.u8 d28, #3 \n" @@ -889,7 +889,7 @@ 
void ScaleUVRowUp2_Linear_16_NEON(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { const uint16_t* src_temp = src_ptr + 2; - asm volatile( + asm volatile ( "vmov.u16 d30, #3 \n" "1: \n" @@ -934,7 +934,7 @@ void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, const uint16_t* src_temp = src_ptr + 2; const uint16_t* src_temp1 = src_ptr1 + 2; - asm volatile( + asm volatile ( "vmov.u16 d30, #3 \n" "vmov.u32 q14, #3 \n" @@ -987,7 +987,7 @@ void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, void ScaleAddRow_NEON(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { - asm volatile( + asm volatile ( "1: \n" "vld1.16 {q1, q2}, [%1] \n" // load accumulator "vld1.8 {q0}, [%0]! \n" // load 16 bytes @@ -1086,7 +1086,7 @@ void ScaleFilterRows_NEON(uint8_t* dst_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) { - asm volatile( + asm volatile ( "cmp %4, #0 \n" "beq 100f \n" "add %2, %1 \n" @@ -1170,7 +1170,7 @@ void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr, uint8_t* dst, int dst_width) { (void)src_stride; - asm volatile( + asm volatile ( "1: \n" "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB @@ -1198,7 +1198,7 @@ void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int dst_width) { (void)src_stride; - asm volatile( + asm volatile ( "1: \n" "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB @@ -1219,7 +1219,7 @@ void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width) { - asm volatile( + asm volatile ( // change the stride to row 2 pointer "add %1, %1, %0 \n" "1: \n" @@ -1258,7 +1258,7 @@ void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int dst_width) { (void)src_stride; - asm volatile( + asm volatile ( "mov r12, %3, lsl #2 \n" "1: \n" "vld1.32 {d0[0]}, [%0], r12 \n" @@ -1282,7 +1282,7 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb, int src_stepx, uint8_t* dst_argb, int dst_width) { - asm volatile( + asm volatile ( "mov r12, %4, lsl #2 \n" "add %1, %1, %0 \n" "1: \n" @@ -1330,7 +1330,7 @@ void ScaleARGBCols_NEON(uint8_t* dst_argb, int dx) { int tmp; const uint8_t* src_tmp = src_argb; - asm volatile( + asm volatile ( "1: \n" // clang-format off LOAD1_DATA32_LANE(d0, 0) @@ -1433,7 +1433,7 @@ void ScaleUVRowDown2_NEON(const uint8_t* src_ptr, uint8_t* dst, int dst_width) { (void)src_stride; - asm volatile( + asm volatile ( "1: \n" "vld2.16 {d0, d2}, [%0]! \n" // load 8 UV pixels. "vld2.16 {d1, d3}, [%0]! \n" // load next 8 UV @@ -1452,7 +1452,7 @@ void ScaleUVRowDown2Linear_NEON(const uint8_t* src_ptr, uint8_t* dst, int dst_width) { (void)src_stride; - asm volatile( + asm volatile ( "1: \n" "vld2.16 {d0, d2}, [%0]! \n" // load 8 UV pixels. "vld2.16 {d1, d3}, [%0]! 
\n" // load next 8 UV @@ -1471,7 +1471,7 @@ void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width) { - asm volatile( + asm volatile ( // change the stride to row 2 pointer "add %1, %1, %0 \n" "1: \n" @@ -1506,7 +1506,7 @@ void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr, const uint8_t* src2_ptr = src_ptr + src_stepx * 4; const uint8_t* src3_ptr = src_ptr + src_stepx * 6; (void)src_stride; - asm volatile( + asm volatile ( "1: \n" "vld1.16 {d0[0]}, [%0], %6 \n" "vld1.16 {d0[1]}, [%1], %6 \n" diff --git a/source/scale_neon64.cc b/source/scale_neon64.cc index 12b4b4d09..da1e3d436 100644 --- a/source/scale_neon64.cc +++ b/source/scale_neon64.cc @@ -26,7 +26,7 @@ void ScaleRowDown2_NEON(const uint8_t* src_ptr, uint8_t* dst, int dst_width) { (void)src_stride; - asm volatile( + asm volatile ( "1: \n" // load even pixels into v0, odd into v1 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" @@ -48,7 +48,7 @@ void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr, uint8_t* dst, int dst_width) { (void)src_stride; - asm volatile( + asm volatile ( "1: \n" // load even pixels into v0, odd into v1 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" @@ -70,7 +70,7 @@ void ScaleRowDown2Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width) { - asm volatile( + asm volatile ( // change the stride to row 2 pointer "add %1, %1, %0 \n" "1: \n" @@ -101,7 +101,7 @@ void ScaleRowDown4_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm volatile( + asm volatile ( "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 "subs %w2, %w2, #8 \n" // 8 processed per loop @@ -122,7 +122,7 @@ void ScaleRowDown4Box_NEON(const uint8_t* src_ptr, const uint8_t* src_ptr1 = src_ptr + src_stride; const uint8_t* src_ptr2 = src_ptr + src_stride * 2; const uint8_t* src_ptr3 = src_ptr + src_stride * 3; - asm volatile( + asm volatile ( "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load up 16x4 "ld1 {v1.16b}, [%2], #16 \n" @@ -159,7 +159,7 @@ void ScaleRowDown34_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm volatile( + asm volatile ( "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 "subs %w2, %w2, #24 \n" @@ -178,7 +178,7 @@ void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { - asm volatile( + asm volatile ( "movi v20.8b, #3 \n" "add %3, %3, %0 \n" "1: \n" @@ -237,7 +237,7 @@ void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { - asm volatile( + asm volatile ( "movi v20.8b, #3 \n" "add %3, %3, %0 \n" "1: \n" @@ -292,7 +292,7 @@ void ScaleRowDown38_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm volatile( + asm volatile ( "ld1 {v3.16b}, [%3] \n" "1: \n" "ld1 {v0.16b,v1.16b}, [%0], #32 \n" @@ -317,7 +317,7 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr, const uint8_t* src_ptr1 = src_ptr + src_stride * 2; ptrdiff_t tmp_src_stride = src_stride; - asm volatile( + asm volatile ( "ld1 {v29.8h}, [%5] \n" "ld1 {v30.16b}, [%6] \n" "ld1 {v31.8h}, [%7] \n" @@ -439,7 +439,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr, int dst_width) { // TODO(fbarchard): use src_stride directly for clang 3.5+. 
ptrdiff_t tmp_src_stride = src_stride; - asm volatile( + asm volatile ( "ld1 {v30.8h}, [%4] \n" "ld1 {v31.16b}, [%5] \n" "add %2, %2, %0 \n" @@ -539,7 +539,7 @@ void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { const uint8_t* src_temp = src_ptr + 1; - asm volatile( + asm volatile ( "movi v31.8b, #3 \n" "1: \n" @@ -578,7 +578,7 @@ void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr, const uint8_t* src_temp = src_ptr + 1; const uint8_t* src_temp1 = src_ptr1 + 1; - asm volatile( + asm volatile ( "movi v31.8b, #3 \n" "movi v30.8h, #3 \n" @@ -634,7 +634,7 @@ void ScaleRowUp2_Linear_12_NEON(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { const uint16_t* src_temp = src_ptr + 1; - asm volatile( + asm volatile ( "movi v31.8h, #3 \n" "1: \n" @@ -671,7 +671,7 @@ void ScaleRowUp2_Bilinear_12_NEON(const uint16_t* src_ptr, const uint16_t* src_temp = src_ptr + 1; const uint16_t* src_temp1 = src_ptr1 + 1; - asm volatile( + asm volatile ( "movi v31.8h, #3 \n" "1: \n" @@ -725,7 +725,7 @@ void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { const uint16_t* src_temp = src_ptr + 1; - asm volatile( + asm volatile ( "movi v31.8h, #3 \n" "1: \n" @@ -770,7 +770,7 @@ void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, const uint16_t* src_temp = src_ptr + 1; const uint16_t* src_temp1 = src_ptr1 + 1; - asm volatile( + asm volatile ( "movi v31.4h, #3 \n" "movi v30.4s, #3 \n" @@ -825,7 +825,7 @@ void ScaleUVRowUp2_Linear_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { const uint8_t* src_temp = src_ptr + 2; - asm volatile( + asm volatile ( "movi v31.8b, #3 \n" "1: \n" @@ -864,7 +864,7 @@ void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr, const uint8_t* src_temp = src_ptr + 2; const uint8_t* src_temp1 = src_ptr1 + 2; - asm volatile( + asm volatile ( "movi v31.8b, #3 \n" "movi v30.8h, #3 \n" @@ -920,7 +920,7 @@ void ScaleUVRowUp2_Linear_16_NEON(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { const uint16_t* src_temp = src_ptr + 2; - asm volatile( + asm volatile ( "movi v31.8h, #3 \n" "1: \n" @@ -967,7 +967,7 @@ void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, const uint16_t* src_temp = src_ptr + 2; const uint16_t* src_temp1 = src_ptr1 + 2; - asm volatile( + asm volatile ( "movi v31.4h, #3 \n" "movi v30.4s, #3 \n" @@ -1022,7 +1022,7 @@ void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, void ScaleAddRow_NEON(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { - asm volatile( + asm volatile ( "1: \n" "ld1 {v1.8h, v2.8h}, [%1] \n" // load accumulator "ld1 {v0.16b}, [%0], #16 \n" // load 16 bytes @@ -1123,7 +1123,7 @@ void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr, uint8_t* dst, int dst_width) { (void)src_stride; - asm volatile( + asm volatile ( "1: \n" // load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3 "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n" @@ -1145,7 +1145,7 @@ void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int dst_width) { (void)src_stride; - asm volatile( + asm volatile ( "1: \n" // load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3 "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n" @@ -1169,7 +1169,7 @@ void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr, uint8_t* dst, int dst_width) { const uint8_t* src_ptr1 = src_ptr + src_stride; - asm volatile( + asm volatile ( "1: \n" "ld2 {v0.4s, v1.4s}, [%[src]], #32 \n" "ld2 {v20.4s, v21.4s}, [%[src1]], #32 \n" @@ -1200,7 +1200,7 @@ void 
ScaleARGBRowDownEven_NEON(const uint8_t* src_argb, const uint8_t* src_argb3 = src_argb + src_stepx * 12; int64_t i = 0; (void)src_stride; - asm volatile( + asm volatile ( "1: \n" "ldr w10, [%[src], %[i]] \n" "ldr w11, [%[src1], %[i]] \n" @@ -1232,7 +1232,7 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb, int src_stepx, uint8_t* dst_argb, int dst_width) { - asm volatile( + asm volatile ( "add %1, %1, %0 \n" "1: \n" "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 -> 2x1 @@ -1287,7 +1287,7 @@ void ScaleARGBCols_NEON(uint8_t* dst_argb, int64_t x64 = (int64_t)x; // NOLINT int64_t dx64 = (int64_t)dx; // NOLINT int64_t tmp64; - asm volatile( + asm volatile ( "1: \n" // clang-format off LOAD1_DATA32_LANE(v0, 0) @@ -1394,7 +1394,7 @@ void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst, int dst_width) { - asm volatile( + asm volatile ( // change the stride to row 2 pointer "add %1, %0, %1, lsl #1 \n" // ptr + stide * 2 "1: \n" @@ -1426,7 +1426,7 @@ void ScaleRowUp2_16_NEON(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst, int dst_width) { - asm volatile( + asm volatile ( "add %1, %0, %1, lsl #1 \n" // ptr + stide * 2 "movi v0.8h, #9 \n" // constants "movi v1.4s, #3 \n" @@ -1477,7 +1477,7 @@ void ScaleUVRowDown2_NEON(const uint8_t* src_ptr, uint8_t* dst, int dst_width) { (void)src_stride; - asm volatile( + asm volatile ( "1: \n" "ld2 {v0.8h,v1.8h}, [%0], #32 \n" // load 16 UV "subs %w2, %w2, #8 \n" // 8 processed per loop. @@ -1496,7 +1496,7 @@ void ScaleUVRowDown2Linear_NEON(const uint8_t* src_ptr, uint8_t* dst, int dst_width) { (void)src_stride; - asm volatile( + asm volatile ( "1: \n" "ld2 {v0.8h,v1.8h}, [%0], #32 \n" // load 16 UV "subs %w2, %w2, #8 \n" // 8 processed per loop. @@ -1515,7 +1515,7 @@ void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width) { - asm volatile( + asm volatile ( // change the stride to row 2 pointer "add %1, %1, %0 \n" "1: \n" @@ -1550,7 +1550,7 @@ void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr, const uint8_t* src2_ptr = src_ptr + src_stepx * 4; const uint8_t* src3_ptr = src_ptr + src_stepx * 6; (void)src_stride; - asm volatile( + asm volatile ( "1: \n" "ld1 {v0.h}[0], [%0], %6 \n" "ld1 {v1.h}[0], [%1], %6 \n" diff --git a/source/scale_rvv.cc b/source/scale_rvv.cc index 5a6f6e5fc..6ed58de2f 100644 --- a/source/scale_rvv.cc +++ b/source/scale_rvv.cc @@ -100,7 +100,7 @@ void ScaleARGBRowDown2Linear_RVV(const uint8_t* src_argb, const uint32_t* src = (const uint32_t*)(src_argb); // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode // register) is set to round-to-nearest-up mode(0). - asm volatile("csrwi vxrm, 0"); + asm volatile ("csrwi vxrm, 0"); do { vuint8m4_t v_odd, v_even, v_dst; vuint32m4_t v_odd_32, v_even_32; @@ -165,7 +165,7 @@ void ScaleARGBRowDown2Box_RVV(const uint8_t* src_argb, const uint32_t* src1 = (const uint32_t*)(src_argb + src_stride); // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode // register) is set to round-to-nearest-up mode(0). - asm volatile("csrwi vxrm, 0"); + asm volatile ("csrwi vxrm, 0"); do { vuint8m4_t v_row0_odd, v_row0_even, v_row1_odd, v_row1_even, v_dst; vuint16m8_t v_row0_sum, v_row1_sum, v_dst_16; @@ -262,7 +262,7 @@ void ScaleARGBRowDownEvenBox_RVV(const uint8_t* src_argb, const int stride_byte = src_stepx * 4; // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode // register) is set to round-to-nearest-up mode(0). 
- asm volatile("csrwi vxrm, 0"); + asm volatile ("csrwi vxrm, 0"); do { vuint8m4_t v_row0_low, v_row0_high, v_row1_low, v_row1_high, v_dst; vuint16m8_t v_row0_sum, v_row1_sum, v_sum; @@ -340,7 +340,7 @@ void ScaleRowDown2Linear_RVV(const uint8_t* src_ptr, (void)src_stride; // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode // register) is set to round-to-nearest-up mode(0). - asm volatile("csrwi vxrm, 0"); + asm volatile ("csrwi vxrm, 0"); do { vuint8m4_t v_s0, v_s1, v_dst; size_t vl = __riscv_vsetvl_e8m4(w); @@ -395,7 +395,7 @@ void ScaleRowDown2Box_RVV(const uint8_t* src_ptr, size_t w = (size_t)dst_width; // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode // register) is set to round-to-nearest-up mode(0). - asm volatile("csrwi vxrm, 0"); + asm volatile ("csrwi vxrm, 0"); do { size_t vl = __riscv_vsetvl_e8m4(w); vuint8m4_t v_s0, v_s1, v_t0, v_t1; @@ -528,7 +528,7 @@ void ScaleRowDown4Box_RVV(const uint8_t* src_ptr, size_t w = (size_t)dst_width; // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode // register) is set to round-to-nearest-up mode(0). - asm volatile("csrwi vxrm, 0"); + asm volatile ("csrwi vxrm, 0"); do { vuint8m2_t v_s0, v_s1, v_s2, v_s3; vuint8m2_t v_t0, v_t1, v_t2, v_t3; @@ -698,7 +698,7 @@ void ScaleRowDown34_0_Box_RVV(const uint8_t* src_ptr, const uint8_t* t = src_ptr + src_stride; // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode // register) is set to round-to-nearest-up mode(0). - asm volatile("csrwi vxrm, 0"); + asm volatile ("csrwi vxrm, 0"); do { vuint8m2_t v_s0, v_s1, v_s2, v_s3; vuint16m4_t v_t0_u16, v_t1_u16, v_t2_u16, v_t3_u16; @@ -827,7 +827,7 @@ void ScaleRowDown34_1_Box_RVV(const uint8_t* src_ptr, const uint8_t* t = src_ptr + src_stride; // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode // register) is set to round-to-nearest-up mode(0). - asm volatile("csrwi vxrm, 0"); + asm volatile ("csrwi vxrm, 0"); do { vuint8m2_t v_s0, v_s1, v_s2, v_s3; vuint8m2_t v_ave0, v_ave1, v_ave2, v_ave3; @@ -1490,7 +1490,7 @@ void ScaleUVRowDown2Linear_RVV(const uint8_t* src_uv, (void)src_stride; // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode // register) is set to round-to-nearest-up mode(0). - asm volatile("csrwi vxrm, 0"); + asm volatile ("csrwi vxrm, 0"); do { vuint8m4_t v_u0v0, v_u1v1, v_avg; vuint16m4_t v_u0v0_16, v_u1v1_16; @@ -1559,7 +1559,7 @@ void ScaleUVRowDown2Box_RVV(const uint8_t* src_uv, size_t w = (size_t)dst_width; // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode // register) is set to round-to-nearest-up mode(0). - asm volatile("csrwi vxrm, 0"); + asm volatile ("csrwi vxrm, 0"); do { vuint8m2_t v_u0_row0, v_v0_row0, v_u1_row0, v_v1_row0; vuint8m2_t v_u0_row1, v_v0_row1, v_u1_row1, v_v1_row1;