From b0dfa70114d607fba655d5af6f481cc5d0a559b9 Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Mon, 17 Jun 2024 12:51:24 -0700 Subject: [PATCH] RVV remove unused variables - ARM Planar test use regular asm volatile syntax - x86 row functions remove volatile from asm Bug: 347111119, 347112532 Change-Id: I535b3dfa1a7a19824503bd95584a63b047b0e9a1 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5637058 Reviewed-by: Justin Green --- source/compare_gcc.cc | 48 +- source/rotate_gcc.cc | 10 +- source/row_gcc.cc | 1677 ++++++++++++++++---------------------- source/row_rvv.cc | 3 - source/scale_gcc.cc | 350 ++++---- unit_test/planar_test.cc | 11 +- 6 files changed, 890 insertions(+), 1209 deletions(-) diff --git a/source/compare_gcc.cc b/source/compare_gcc.cc index 33cbe25de..970f950f4 100644 --- a/source/compare_gcc.cc +++ b/source/compare_gcc.cc @@ -21,14 +21,15 @@ extern "C" { // This module is for GCC x86 and x64. #if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) +// "memory" clobber prevents the reads from being removed + #if defined(__x86_64__) uint32_t HammingDistance_SSE42(const uint8_t* src_a, const uint8_t* src_b, int count) { - uint64_t diff = 0u; + uint64_t diff; - asm volatile( - "xor %3,%3 \n" + asm("xor %3,%3 \n" "xor %%r8,%%r8 \n" "xor %%r9,%%r9 \n" "xor %%r10,%%r10 \n" @@ -63,9 +64,9 @@ uint32_t HammingDistance_SSE42(const uint8_t* src_a, : "+r"(src_a), // %0 "+r"(src_b), // %1 "+r"(count), // %2 - "=r"(diff) // %3 + "=&r"(diff) // %3 : - : "memory", "cc", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10"); + : "cc", "memory", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10"); return (uint32_t)(diff); } @@ -75,7 +76,7 @@ uint32_t HammingDistance_SSE42(const uint8_t* src_a, int count) { uint32_t diff = 0u; - asm volatile( + asm( // Process 16 bytes per loop. 
LABELALIGN "1: \n" @@ -104,7 +105,7 @@ uint32_t HammingDistance_SSE42(const uint8_t* src_a, "+r"(count), // %2 "+r"(diff) // %3 : - : "memory", "cc", "ecx", "edx"); + : "cc", "memory", "ecx", "edx"); return diff; } @@ -117,10 +118,9 @@ static const vec8 kBitCount = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4}; uint32_t HammingDistance_SSSE3(const uint8_t* src_a, const uint8_t* src_b, int count) { - uint32_t diff = 0u; + uint32_t diff; - asm volatile( - "movdqa %4,%%xmm2 \n" + asm("movdqa %4,%%xmm2 \n" "movdqa %5,%%xmm3 \n" "pxor %%xmm0,%%xmm0 \n" "pxor %%xmm1,%%xmm1 \n" @@ -166,7 +166,7 @@ uint32_t HammingDistance_SSSE3(const uint8_t* src_a, "=r"(diff) // %3 : "m"(kNibbleMask), // %4 "m"(kBitCount) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + : "cc", "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); return diff; @@ -176,10 +176,9 @@ uint32_t HammingDistance_SSSE3(const uint8_t* src_a, uint32_t HammingDistance_AVX2(const uint8_t* src_a, const uint8_t* src_b, int count) { - uint32_t diff = 0u; + uint32_t diff; - asm volatile( - "vbroadcastf128 %4,%%ymm2 \n" + asm("vbroadcastf128 %4,%%ymm2 \n" "vbroadcastf128 %5,%%ymm3 \n" "vpxor %%ymm0,%%ymm0,%%ymm0 \n" "vpxor %%ymm1,%%ymm1,%%ymm1 \n" @@ -214,7 +213,7 @@ uint32_t HammingDistance_AVX2(const uint8_t* src_a, "vpaddd %%ymm1,%%ymm0,%%ymm0 \n" "vpermq $0xaa,%%ymm0,%%ymm1 \n" "vpaddd %%ymm1,%%ymm0,%%ymm0 \n" - "vmovd %%xmm0, %3 \n" + "vmovd %%xmm0,%3 \n" "vzeroupper \n" : "+r"(src_a), // %0 "+r"(src_b), // %1 @@ -222,7 +221,7 @@ uint32_t HammingDistance_AVX2(const uint8_t* src_a, "=r"(diff) // %3 : "m"(kNibbleMask), // %4 "m"(kBitCount) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); + : "cc", "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); return diff; } @@ -232,8 +231,7 @@ uint32_t SumSquareError_SSE2(const uint8_t* src_a, const uint8_t* src_b, int count) { uint32_t sse; - asm volatile( - "pxor %%xmm0,%%xmm0 \n" + asm("pxor %%xmm0,%%xmm0 \n" "pxor %%xmm5,%%xmm5 \n" LABELALIGN @@ -261,13 +259,12 @@ uint32_t SumSquareError_SSE2(const uint8_t* src_a, "pshufd $0x1,%%xmm0,%%xmm1 \n" "paddd %%xmm1,%%xmm0 \n" "movd %%xmm0,%3 \n" - : "+r"(src_a), // %0 "+r"(src_b), // %1 "+r"(count), // %2 - "=g"(sse) // %3 - ::"memory", - "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); + "=r"(sse) // %3 + : + : "cc", "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); return sse; } @@ -299,8 +296,7 @@ static const uvec32 kHashMul3 = { uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) { uint32_t hash; - asm volatile( - "movd %2,%%xmm0 \n" + asm("movd %2,%%xmm0 \n" "pxor %%xmm7,%%xmm7 \n" "movdqa %4,%%xmm6 \n" @@ -341,13 +337,13 @@ uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) { : "+r"(src), // %0 "+r"(count), // %1 "+rm"(seed), // %2 - "=g"(hash) // %3 + "=r"(hash) // %3 : "m"(kHash16x33), // %4 "m"(kHashMul0), // %5 "m"(kHashMul1), // %6 "m"(kHashMul2), // %7 "m"(kHashMul3) // %8 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + : "cc", "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); return hash; } diff --git a/source/rotate_gcc.cc b/source/rotate_gcc.cc index fd5eee05f..034161421 100644 --- a/source/rotate_gcc.cc +++ b/source/rotate_gcc.cc @@ -26,7 +26,7 @@ void TransposeWx8_SSSE3(const uint8_t* src, uint8_t* dst, int dst_stride, int width) { - asm volatile( + asm( // Read in the data from the source pointer. // First round of bit swap. 
LABELALIGN @@ -116,7 +116,7 @@ void TransposeWx8_Fast_SSSE3(const uint8_t* src, uint8_t* dst, int dst_stride, int width) { - asm volatile( + asm( // Read in the data from the source pointer. // First round of bit swap. LABELALIGN @@ -261,7 +261,7 @@ void TransposeUVWx8_SSE2(const uint8_t* src, uint8_t* dst_b, int dst_stride_b, int width) { - asm volatile( + asm( // Read in the data from the source pointer. // First round of bit swap. LABELALIGN @@ -391,7 +391,7 @@ void Transpose4x4_32_SSE2(const uint8_t* src, uint8_t* dst, int dst_stride, int width) { - asm volatile( + asm( // Main loop transpose 4x4. Read a column, write a row. "1: \n" "movdqu (%0),%%xmm0 \n" // a b c d @@ -447,7 +447,7 @@ void Transpose4x4_32_AVX2(const uint8_t* src, uint8_t* dst, int dst_stride, int width) { - asm volatile( + asm( // Main loop transpose 2 blocks of 4x4. Read a column, write a row. "1: \n" "vmovdqu (%0),%%xmm0 \n" // a b c d diff --git a/source/row_gcc.cc b/source/row_gcc.cc index a8ff21344..b601b60b4 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -161,8 +161,7 @@ static const lvec8 kShuffleNV21 = { #ifdef HAS_J400TOARGBROW_SSE2 void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" + asm("pcmpeqb %%xmm5,%%xmm5 \n" "pslld $0x18,%%xmm5 \n" LABELALIGN @@ -192,8 +191,7 @@ void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width) { void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000 + asm("pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000 "pslld $0x18,%%xmm5 \n" "movdqa %3,%%xmm4 \n" @@ -230,8 +228,7 @@ void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24, } void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000 + asm("pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000 "pslld $0x18,%%xmm5 \n" "movdqa %3,%%xmm4 \n" @@ -269,8 +266,7 @@ void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width) { // Same code as RAWToARGB with different shuffler and A in low bits void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" // 0x000000ff + asm("pcmpeqb %%xmm5,%%xmm5 \n" // 0x000000ff "psrld $0x18,%%xmm5 \n" "movdqa %3,%%xmm4 \n" @@ -309,8 +305,7 @@ void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { - asm volatile( - "movdqa %3,%%xmm3 \n" + asm("movdqa %3,%%xmm3 \n" "movdqa %4,%%xmm4 \n" "movdqa %5,%%xmm5 \n" @@ -339,8 +334,7 @@ void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, } void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "mov $0x1080108,%%eax \n" + asm("mov $0x1080108,%%eax \n" "movd %%eax,%%xmm5 \n" "pshufd $0x0,%%xmm5,%%xmm5 \n" "mov $0x20802080,%%eax \n" @@ -387,8 +381,7 @@ void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { } void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "mov $0x1080108,%%eax \n" + asm("mov $0x1080108,%%eax \n" "movd %%eax,%%xmm5 \n" "pshufd $0x0,%%xmm5,%%xmm5 \n" "mov $0x42004200,%%eax \n" @@ -438,8 +431,7 @@ void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { } void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "mov $0xf0f0f0f,%%eax \n" + asm("mov $0xf0f0f0f,%%eax \n" "movd 
%%eax,%%xmm4 \n" "pshufd $0x0,%%xmm4,%%xmm4 \n" "movdqa %%xmm4,%%xmm5 \n" @@ -475,9 +467,9 @@ void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { } void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { - asm volatile("movdqa %3,%%xmm6 \n" + asm("movdqa %3,%%xmm6 \n" - LABELALIGN + LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -504,18 +496,17 @@ void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { "lea 0x30(%1),%1 \n" "sub $0x10,%2 \n" "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "m"(kShuffleMaskARGBToRGB24) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", - "xmm6"); + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(kShuffleMaskARGBToRGB24) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) { - asm volatile("movdqa %3,%%xmm6 \n" + asm("movdqa %3,%%xmm6 \n" - LABELALIGN + LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -542,12 +533,11 @@ void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) { "lea 0x30(%1),%1 \n" "sub $0x10,%2 \n" "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "m"(kShuffleMaskARGBToRAW) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", - "xmm6"); + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(kShuffleMaskARGBToRAW) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } #ifdef HAS_ARGBTORGB24ROW_AVX2 @@ -555,8 +545,7 @@ void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) { static const lvec32 kPermdRGB24_AVX = {0, 1, 2, 4, 5, 6, 3, 7}; void ARGBToRGB24Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "vbroadcastf128 %3,%%ymm6 \n" + asm("vbroadcastf128 %3,%%ymm6 \n" "vmovdqa %4,%%ymm7 \n" LABELALIGN @@ -615,8 +604,7 @@ static const ulvec8 kPermARGBToRGB24_2 = { 50u, 52u, 53u, 54u, 56u, 57u, 58u, 60u, 61u, 62u}; void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "vmovdqa %3,%%ymm5 \n" + asm("vmovdqa %3,%%ymm5 \n" "vmovdqa %4,%%ymm6 \n" "vmovdqa %5,%%ymm7 \n" @@ -649,8 +637,7 @@ void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width) { #ifdef HAS_ARGBTORAWROW_AVX2 void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "vbroadcastf128 %3,%%ymm6 \n" + asm("vbroadcastf128 %3,%%ymm6 \n" "vmovdqa %4,%%ymm7 \n" LABELALIGN @@ -694,8 +681,7 @@ void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { #endif void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "pcmpeqb %%xmm3,%%xmm3 \n" + asm("pcmpeqb %%xmm3,%%xmm3 \n" "psrld $0x1b,%%xmm3 \n" "pcmpeqb %%xmm4,%%xmm4 \n" "psrld $0x1a,%%xmm4 \n" @@ -734,8 +720,7 @@ void ARGBToRGB565DitherRow_SSE2(const uint8_t* src, uint8_t* dst, uint32_t dither4, int width) { - asm volatile( - "movd %3,%%xmm6 \n" + asm("movd %3,%%xmm6 \n" "punpcklbw %%xmm6,%%xmm6 \n" "movdqa %%xmm6,%%xmm7 \n" "punpcklwd %%xmm6,%%xmm6 \n" @@ -782,8 +767,7 @@ void ARGBToRGB565DitherRow_AVX2(const uint8_t* src, uint8_t* dst, uint32_t dither4, int width) { - asm volatile( - "vbroadcastss %3,%%xmm6 \n" + asm("vbroadcastss %3,%%xmm6 \n" "vpunpcklbw %%xmm6,%%xmm6,%%xmm6 \n" "vpermq $0xd8,%%ymm6,%%ymm6 \n" "vpunpcklwd %%ymm6,%%ymm6,%%ymm6 \n" @@ -824,8 +808,7 @@ void 
ARGBToRGB565DitherRow_AVX2(const uint8_t* src, #endif // HAS_ARGBTORGB565DITHERROW_AVX2 void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "pcmpeqb %%xmm4,%%xmm4 \n" + asm("pcmpeqb %%xmm4,%%xmm4 \n" "psrld $0x1b,%%xmm4 \n" "movdqa %%xmm4,%%xmm5 \n" "pslld $0x5,%%xmm5 \n" @@ -865,8 +848,7 @@ void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { } void ARGBToARGB4444Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "pcmpeqb %%xmm4,%%xmm4 \n" + asm("pcmpeqb %%xmm4,%%xmm4 \n" "psllw $0xc,%%xmm4 \n" "movdqa %%xmm4,%%xmm3 \n" "psrlw $0x8,%%xmm3 \n" @@ -928,8 +910,7 @@ static const uint32_t kMaskAG10 = 0xc000ff00; static const uint32_t kMulAG10 = 64 * 65536 + 1028; void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "movdqa %3,%%xmm2 \n" // shuffler for RB + asm("movdqa %3,%%xmm2 \n" // shuffler for RB "movd %4,%%xmm3 \n" // multipler for RB "movd %5,%%xmm4 \n" // mask for R10 B10 "movd %6,%%xmm5 \n" // mask for AG @@ -967,8 +948,7 @@ void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { } void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "movdqa %3,%%xmm2 \n" // shuffler for RB + asm("movdqa %3,%%xmm2 \n" // shuffler for RB "movd %4,%%xmm3 \n" // multipler for RB "movd %5,%%xmm4 \n" // mask for R10 B10 "movd %6,%%xmm5 \n" // mask for AG @@ -1007,8 +987,7 @@ void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { #ifdef HAS_ARGBTOAR30ROW_AVX2 void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB + asm("vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB "vbroadcastss %4,%%ymm3 \n" // multipler for RB "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10 "vbroadcastss %6,%%ymm5 \n" // mask for AG @@ -1044,8 +1023,7 @@ void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { #ifdef HAS_ABGRTOAR30ROW_AVX2 void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB + asm("vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB "vbroadcastss %4,%%ymm3 \n" // multipler for RB "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10 "vbroadcastss %6,%%ymm5 \n" // mask for AG @@ -1090,7 +1068,7 @@ static const uvec8 kShuffleARGBToAB64Hi = {10, 10, 9, 9, 8, 8, 11, 11, void ARGBToAR64Row_SSSE3(const uint8_t* src_argb, uint16_t* dst_ar64, int width) { - asm volatile(LABELALIGN + asm(LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqa %%xmm0,%%xmm1 \n" @@ -1102,18 +1080,17 @@ void ARGBToAR64Row_SSSE3(const uint8_t* src_argb, "lea 0x20(%1),%1 \n" "sub $0x4,%2 \n" "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_ar64), // %1 - "+r"(width) // %2 - ::"memory", - "cc", "xmm0", "xmm1"); + : "+r"(src_argb), // %0 + "+r"(dst_ar64), // %1 + "+r"(width) // %2 + ::"memory", + "cc", "xmm0", "xmm1"); } void ARGBToAB64Row_SSSE3(const uint8_t* src_argb, uint16_t* dst_ab64, int width) { - asm volatile( - "movdqa %3,%%xmm2 \n" + asm("movdqa %3,%%xmm2 \n" "movdqa %4,%%xmm3 \n" LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" @@ -1137,7 +1114,7 @@ void ARGBToAB64Row_SSSE3(const uint8_t* src_argb, void AR64ToARGBRow_SSSE3(const uint16_t* src_ar64, uint8_t* dst_argb, int width) { - asm volatile(LABELALIGN + asm(LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -1149,19 +1126,19 @@ void AR64ToARGBRow_SSSE3(const uint16_t* src_ar64, "lea 0x10(%1),%1 \n" "sub $0x4,%2 \n" "jg 
1b \n" - : "+r"(src_ar64), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - ::"memory", - "cc", "xmm0", "xmm1"); + : "+r"(src_ar64), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + ::"memory", + "cc", "xmm0", "xmm1"); } void AB64ToARGBRow_SSSE3(const uint16_t* src_ab64, uint8_t* dst_argb, int width) { - asm volatile("movdqa %3,%%xmm2 \n" + asm("movdqa %3,%%xmm2 \n" - LABELALIGN + LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -1174,18 +1151,18 @@ void AB64ToARGBRow_SSSE3(const uint16_t* src_ab64, "lea 0x10(%1),%1 \n" "sub $0x4,%2 \n" "jg 1b \n" - : "+r"(src_ab64), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kShuffleARGBToABGR) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2"); + : "+r"(src_ab64), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kShuffleARGBToABGR) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #ifdef HAS_ARGBTOAR64ROW_AVX2 void ARGBToAR64Row_AVX2(const uint8_t* src_argb, uint16_t* dst_ar64, int width) { - asm volatile(LABELALIGN + asm(LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" "vpermq $0xd8,%%ymm0,%%ymm0 \n" @@ -1198,11 +1175,11 @@ void ARGBToAR64Row_AVX2(const uint8_t* src_argb, "sub $0x8,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(dst_ar64), // %1 - "+r"(width) // %2 - ::"memory", - "cc", "xmm0", "xmm1"); + : "+r"(src_argb), // %0 + "+r"(dst_ar64), // %1 + "+r"(width) // %2 + ::"memory", + "cc", "xmm0", "xmm1"); } #endif @@ -1210,8 +1187,7 @@ void ARGBToAR64Row_AVX2(const uint8_t* src_argb, void ARGBToAB64Row_AVX2(const uint8_t* src_argb, uint16_t* dst_ab64, int width) { - asm volatile( - "vbroadcastf128 %3,%%ymm2 \n" + asm("vbroadcastf128 %3,%%ymm2 \n" "vbroadcastf128 %4,%%ymm3 \n" LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" @@ -1238,7 +1214,7 @@ void ARGBToAB64Row_AVX2(const uint8_t* src_argb, void AR64ToARGBRow_AVX2(const uint16_t* src_ar64, uint8_t* dst_argb, int width) { - asm volatile(LABELALIGN + asm(LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" @@ -1252,11 +1228,11 @@ void AR64ToARGBRow_AVX2(const uint16_t* src_ar64, "sub $0x8,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_ar64), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - ::"memory", - "cc", "xmm0", "xmm1"); + : "+r"(src_ar64), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + ::"memory", + "cc", "xmm0", "xmm1"); } #endif @@ -1264,7 +1240,7 @@ void AR64ToARGBRow_AVX2(const uint16_t* src_ar64, void AB64ToARGBRow_AVX2(const uint16_t* src_ab64, uint8_t* dst_argb, int width) { - asm volatile("vbroadcastf128 %3,%%ymm2 \n" LABELALIGN + asm("vbroadcastf128 %3,%%ymm2 \n" LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" @@ -1279,16 +1255,14 @@ void AB64ToARGBRow_AVX2(const uint16_t* src_ab64, "sub $0x8,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_ab64), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kShuffleARGBToABGR) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2"); + : "+r"(src_ab64), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kShuffleARGBToABGR) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif -// clang-format off - // TODO(mraptis): Consider passing R, G, B multipliers as parameter. // round parameter is register containing value to add before shift. 
#define RGBTOY(round) \ @@ -1313,8 +1287,10 @@ void AB64ToARGBRow_AVX2(const uint16_t* src_ab64, "phaddw %%xmm0,%%xmm6 \n" \ "phaddw %%xmm2,%%xmm1 \n" \ "prefetcht0 1280(%0) \n" \ - "paddw %%" #round ",%%xmm6 \n" \ - "paddw %%" #round ",%%xmm1 \n" \ + "paddw %%" #round \ + ",%%xmm6 \n" \ + "paddw %%" #round \ + ",%%xmm1 \n" \ "psrlw $0x8,%%xmm6 \n" \ "psrlw $0x8,%%xmm1 \n" \ "packuswb %%xmm1,%%xmm6 \n" \ @@ -1341,8 +1317,10 @@ void AB64ToARGBRow_AVX2(const uint16_t* src_ab64, "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" /* mutates. */ \ "vphaddw %%ymm3,%%ymm2,%%ymm2 \n" \ "prefetcht0 1280(%0) \n" \ - "vpaddw %%" #round ",%%ymm0,%%ymm0 \n" /* Add .5 for rounding. */ \ - "vpaddw %%" #round ",%%ymm2,%%ymm2 \n" \ + "vpaddw %%" #round \ + ",%%ymm0,%%ymm0 \n" /* Add .5 for rounding. */ \ + "vpaddw %%" #round \ + ",%%ymm2,%%ymm2 \n" \ "vpsrlw $0x8,%%ymm0,%%ymm0 \n" \ "vpsrlw $0x8,%%ymm2,%%ymm2 \n" \ "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" /* mutates. */ \ @@ -1353,13 +1331,10 @@ void AB64ToARGBRow_AVX2(const uint16_t* src_ab64, "jg 1b \n" \ "vzeroupper \n" -// clang-format on - #ifdef HAS_ARGBTOYROW_SSSE3 // Convert 16 ARGB pixels (64 bytes) to 16 Y values. void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { - asm volatile( - "movdqa %3,%%xmm4 \n" + asm("movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" "movdqa %5,%%xmm7 \n" @@ -1379,8 +1354,7 @@ void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { // Convert 16 ARGB pixels (64 bytes) to 16 YJ values. // Same as ARGBToYRow but different coefficients, no add 16. void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { - asm volatile( - "movdqa %3,%%xmm4 \n" + asm("movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" LABELALIGN RGBTOY(xmm5) @@ -1397,8 +1371,7 @@ void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { // Convert 16 ABGR pixels (64 bytes) to 16 YJ values. // Same as ABGRToYRow but different coefficients, no add 16. void ABGRToYJRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) { - asm volatile( - "movdqa %3,%%xmm4 \n" + asm("movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" LABELALIGN RGBTOY(xmm5) @@ -1415,8 +1388,7 @@ void ABGRToYJRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) { // Convert 16 ARGB pixels (64 bytes) to 16 YJ values. // Same as ARGBToYRow but different coefficients, no add 16. void RGBAToYJRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) { - asm volatile( - "movdqa %3,%%xmm4 \n" + asm("movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" LABELALIGN RGBTOY(xmm5) @@ -1439,8 +1411,7 @@ static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7}; // Convert 32 ARGB pixels (128 bytes) to 32 Y values. void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { - asm volatile( - "vbroadcastf128 %3,%%ymm4 \n" + asm("vbroadcastf128 %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" "vbroadcastf128 %5,%%ymm7 \n" "vmovdqu %6,%%ymm6 \n" LABELALIGN RGBTOY_AVX2( @@ -1460,8 +1431,7 @@ void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { #ifdef HAS_ABGRTOYROW_AVX2 // Convert 32 ABGR pixels (128 bytes) to 32 Y values. 
void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) { - asm volatile( - "vbroadcastf128 %3,%%ymm4 \n" + asm("vbroadcastf128 %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" "vbroadcastf128 %5,%%ymm7 \n" "vmovdqu %6,%%ymm6 \n" LABELALIGN RGBTOY_AVX2( @@ -1481,8 +1451,7 @@ void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) { #ifdef HAS_ARGBTOYJROW_AVX2 // Convert 32 ARGB pixels (128 bytes) to 32 Y values. void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { - asm volatile( - "vbroadcastf128 %3,%%ymm4 \n" + asm("vbroadcastf128 %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" "vmovdqu %5,%%ymm6 \n" LABELALIGN RGBTOY_AVX2( ymm5) "vzeroupper \n" @@ -1500,8 +1469,7 @@ void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { #ifdef HAS_ABGRTOYJROW_AVX2 // Convert 32 ABGR pixels (128 bytes) to 32 Y values. void ABGRToYJRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) { - asm volatile( - "vbroadcastf128 %3,%%ymm4 \n" + asm("vbroadcastf128 %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" "vmovdqu %5,%%ymm6 \n" LABELALIGN RGBTOY_AVX2( ymm5) "vzeroupper \n" @@ -1519,8 +1487,7 @@ void ABGRToYJRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) { #ifdef HAS_RGBATOYJROW_AVX2 // Convert 32 ARGB pixels (128 bytes) to 32 Y values. void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width) { - asm volatile( - "vbroadcastf128 %3,%%ymm4 \n" + asm("vbroadcastf128 %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" "vmovdqu %5,%%ymm6 \n" LABELALIGN RGBTOY_AVX2( ymm5) "vzeroupper \n" @@ -1540,8 +1507,7 @@ void ARGBToUVRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile( - "movdqa %5,%%xmm3 \n" + asm("movdqa %5,%%xmm3 \n" "movdqa %6,%%xmm4 \n" "movdqa %7,%%xmm5 \n" "sub %1,%2 \n" @@ -1613,8 +1579,7 @@ void ARGBToUVRow_AVX2(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile( - "vbroadcastf128 %5,%%ymm5 \n" + asm("vbroadcastf128 %5,%%ymm5 \n" "vbroadcastf128 %6,%%ymm6 \n" "vbroadcastf128 %7,%%ymm7 \n" "sub %1,%2 \n" @@ -1676,8 +1641,7 @@ void ABGRToUVRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile( - "vbroadcastf128 %5,%%ymm5 \n" + asm("vbroadcastf128 %5,%%ymm5 \n" "vbroadcastf128 %6,%%ymm6 \n" "vbroadcastf128 %7,%%ymm7 \n" "sub %1,%2 \n" @@ -1739,8 +1703,7 @@ void ARGBToUVJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile( - "vbroadcastf128 %5,%%ymm5 \n" + asm("vbroadcastf128 %5,%%ymm5 \n" "vbroadcastf128 %6,%%ymm6 \n" "vbroadcastf128 %7,%%ymm7 \n" "sub %1,%2 \n" @@ -1804,8 +1767,7 @@ void ABGRToUVJRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile( - "vbroadcastf128 %5,%%ymm5 \n" + asm("vbroadcastf128 %5,%%ymm5 \n" "vbroadcastf128 %6,%%ymm6 \n" "vbroadcastf128 %7,%%ymm7 \n" "sub %1,%2 \n" @@ -1868,8 +1830,7 @@ void ARGBToUVJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile( - "movdqa %5,%%xmm3 \n" + asm("movdqa %5,%%xmm3 \n" "movdqa %6,%%xmm4 \n" "movdqa %7,%%xmm5 \n" "sub %1,%2 \n" @@ -1934,8 +1895,7 @@ void ABGRToUVJRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile( - "movdqa %5,%%xmm3 \n" + asm("movdqa %5,%%xmm3 \n" "movdqa %6,%%xmm4 \n" "movdqa %7,%%xmm5 \n" "sub %1,%2 \n" @@ -1999,8 +1959,7 @@ void ARGBToUV444Row_SSSE3(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile( - "movdqa 
%4,%%xmm3 \n" + asm("movdqa %4,%%xmm3 \n" "movdqa %5,%%xmm4 \n" "movdqa %6,%%xmm5 \n" "sub %1,%2 \n" @@ -2053,8 +2012,7 @@ void ARGBToUV444Row_SSSE3(const uint8_t* src_argb, #endif // HAS_ARGBTOUV444ROW_SSSE3 void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width) { - asm volatile( - "movdqa %3,%%xmm4 \n" + asm("movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" "movdqa %5,%%xmm7 \n" @@ -2074,8 +2032,7 @@ void BGRAToUVRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile( - "movdqa %5,%%xmm3 \n" + asm("movdqa %5,%%xmm3 \n" "movdqa %6,%%xmm4 \n" "movdqa %7,%%xmm5 \n" "sub %1,%2 \n" @@ -2133,8 +2090,7 @@ void BGRAToUVRow_SSSE3(const uint8_t* src_bgra, } void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) { - asm volatile( - "movdqa %3,%%xmm4 \n" + asm("movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" "movdqa %5,%%xmm7 \n" @@ -2150,8 +2106,7 @@ void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) { } void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) { - asm volatile( - "movdqa %3,%%xmm4 \n" + asm("movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" "movdqa %5,%%xmm7 \n" @@ -2171,8 +2126,7 @@ void ABGRToUVRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile( - "movdqa %5,%%xmm3 \n" + asm("movdqa %5,%%xmm3 \n" "movdqa %6,%%xmm4 \n" "movdqa %7,%%xmm5 \n" "sub %1,%2 \n" @@ -2234,8 +2188,7 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile( - "movdqa %5,%%xmm3 \n" + asm("movdqa %5,%%xmm3 \n" "movdqa %6,%%xmm4 \n" "movdqa %7,%%xmm5 \n" "sub %1,%2 \n" @@ -2624,7 +2577,7 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm ( YUVTORGB_SETUP(yuvconstants) "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" @@ -2655,33 +2608,26 @@ void OMITFP I444AlphaToARGBRow_SSSE3(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - // clang-format off - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" + asm(YUVTORGB_SETUP( + yuvconstants) "sub %[u_buf],%[v_buf] \n" - LABELALIGN - "1: \n" - READYUVA444 - YUVTORGB(yuvconstants) - STOREARGB + LABELALIGN "1: \n" READYUVA444 + YUVTORGB(yuvconstants) STOREARGB "subl $0x8,%[width] \n" "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [a_buf]"+r"(a_buf), // %[a_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] + : [y_buf] "+r"(y_buf), // %[y_buf] + [u_buf] "+r"(u_buf), // %[u_buf] + [v_buf] "+r"(v_buf), // %[v_buf] + [a_buf] "+r"(a_buf), // %[a_buf] + [dst_argb] "+r"(dst_argb), // %[dst_argb] #if defined(__i386__) - [width]"+m"(width) // %[width] + [width] "+m"(width) // %[width] #else - [width]"+rm"(width) // %[width] + [width] "+rm"(width) // %[width] #endif - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); - // clang-format on + : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", + "xmm5"); } #endif // HAS_I444ALPHATOARGBROW_SSSE3 @@ -2691,7 +2637,7 @@ void OMITFP I422ToRGB24Row_SSSE3(const uint8_t* y_buf, uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm ( YUVTORGB_SETUP(yuvconstants) "movdqa 
%[kShuffleMaskARGBToRGB24_0],%%xmm5 \n" "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n" @@ -2727,7 +2673,7 @@ void OMITFP I444ToRGB24Row_SSSE3(const uint8_t* y_buf, uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm ( YUVTORGB_SETUP(yuvconstants) "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n" "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n" @@ -2763,7 +2709,7 @@ void OMITFP I422ToARGBRow_SSSE3(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm ( YUVTORGB_SETUP(yuvconstants) "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" @@ -2792,7 +2738,7 @@ void OMITFP I422ToAR30Row_SSSE3(const uint8_t* y_buf, uint8_t* dst_ar30, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm ( YUVTORGB_SETUP(yuvconstants) "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" // AR30 constants @@ -2827,7 +2773,7 @@ void OMITFP I210ToARGBRow_SSSE3(const uint16_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm ( YUVTORGB_SETUP(yuvconstants) "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" @@ -2857,7 +2803,7 @@ void OMITFP I212ToARGBRow_SSSE3(const uint16_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm ( YUVTORGB_SETUP(yuvconstants) "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" @@ -2887,7 +2833,7 @@ void OMITFP I210ToAR30Row_SSSE3(const uint16_t* y_buf, uint8_t* dst_ar30, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm ( YUVTORGB_SETUP(yuvconstants) "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" @@ -2922,7 +2868,7 @@ void OMITFP I212ToAR30Row_SSSE3(const uint16_t* y_buf, uint8_t* dst_ar30, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm ( YUVTORGB_SETUP(yuvconstants) "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" @@ -2957,7 +2903,7 @@ void OMITFP I410ToARGBRow_SSSE3(const uint16_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm ( YUVTORGB_SETUP(yuvconstants) "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" @@ -2988,8 +2934,7 @@ void OMITFP I210AlphaToARGBRow_SSSE3(const uint16_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile( - YUVTORGB_SETUP( + asm(YUVTORGB_SETUP( yuvconstants) "sub %[u_buf],%[v_buf] \n" LABELALIGN "1: \n" READYUVA210 @@ -3021,32 +2966,26 @@ void OMITFP I410AlphaToARGBRow_SSSE3(const uint16_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - // clang-format off - asm volatile( - YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" + asm(YUVTORGB_SETUP( + yuvconstants) "sub %[u_buf],%[v_buf] \n" - LABELALIGN - "1: \n" - READYUVA410 - YUVTORGB(yuvconstants) - STOREARGB + LABELALIGN "1: \n" READYUVA410 + YUVTORGB(yuvconstants) STOREARGB "subl $0x8,%[width] \n" "jg 1b \n" - : [y_buf] "+r"(y_buf), // %[y_buf] - [u_buf] "+r"(u_buf), // %[u_buf] - [v_buf] "+r"(v_buf), // %[v_buf] - [a_buf] "+r"(a_buf), - [dst_argb] "+r"(dst_argb), // %[dst_argb] + : [y_buf] "+r"(y_buf), // %[y_buf] + [u_buf] "+r"(u_buf), // %[u_buf] + [v_buf] "+r"(v_buf), // %[v_buf] + [a_buf] "+r"(a_buf), + [dst_argb] "+r"(dst_argb), // %[dst_argb] #if defined(__i386__) - [width] "+m"(width) // %[width] + [width] "+m"(width) // %[width] #else - [width] "+rm"(width) // %[width] + [width] "+rm"(width) // %[width] #endif - : [yuvconstants] "r"(yuvconstants) // 
%[yuvconstants] - : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", - "xmm5"); - // clang-format on + : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", + "xmm5"); } #endif @@ -3057,7 +2996,7 @@ void OMITFP I410ToAR30Row_SSSE3(const uint16_t* y_buf, uint8_t* dst_ar30, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm ( YUVTORGB_SETUP(yuvconstants) "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" @@ -3093,33 +3032,26 @@ void OMITFP I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - // clang-format off - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" + asm(YUVTORGB_SETUP( + yuvconstants) "sub %[u_buf],%[v_buf] \n" - LABELALIGN - "1: \n" - READYUVA422 - YUVTORGB(yuvconstants) - STOREARGB + LABELALIGN "1: \n" READYUVA422 + YUVTORGB(yuvconstants) STOREARGB "subl $0x8,%[width] \n" "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [a_buf]"+r"(a_buf), // %[a_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] + : [y_buf] "+r"(y_buf), // %[y_buf] + [u_buf] "+r"(u_buf), // %[u_buf] + [v_buf] "+r"(v_buf), // %[v_buf] + [a_buf] "+r"(a_buf), // %[a_buf] + [dst_argb] "+r"(dst_argb), // %[dst_argb] #if defined(__i386__) - [width]"+m"(width) // %[width] + [width] "+m"(width) // %[width] #else - [width]"+rm"(width) // %[width] + [width] "+rm"(width) // %[width] #endif - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); - // clang-format on + : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", + "xmm5"); } #endif // HAS_I422ALPHATOARGBROW_SSSE3 @@ -3128,27 +3060,20 @@ void OMITFP NV12ToARGBRow_SSSE3(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - // clang-format off - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "pcmpeqb %%xmm5,%%xmm5 \n" + asm(YUVTORGB_SETUP( + yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" - LABELALIGN - "1: \n" - READNV12 - YUVTORGB(yuvconstants) - STOREARGB + LABELALIGN "1: \n" READNV12 + YUVTORGB(yuvconstants) STOREARGB "sub $0x8,%[width] \n" "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [uv_buf]"+r"(uv_buf), // %[uv_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); - // clang-format on + : [y_buf] "+r"(y_buf), // %[y_buf] + [uv_buf] "+r"(uv_buf), // %[uv_buf] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+rm"(width) // %[width] + : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", + "xmm5"); } void OMITFP NV21ToARGBRow_SSSE3(const uint8_t* y_buf, @@ -3156,88 +3081,63 @@ void OMITFP NV21ToARGBRow_SSSE3(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - // clang-format off - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "pcmpeqb %%xmm5,%%xmm5 \n" + asm(YUVTORGB_SETUP( + yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" - LABELALIGN - "1: \n" - READNV21 - YUVTORGB(yuvconstants) - STOREARGB + LABELALIGN "1: \n" READNV21 + YUVTORGB(yuvconstants) STOREARGB "sub $0x8,%[width] \n" "jg 1b \n" - : 
[y_buf]"+r"(y_buf), // %[y_buf] - [vu_buf]"+r"(vu_buf), // %[vu_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] - [kShuffleNV21]"m"(kShuffleNV21) - : "memory", "cc", YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); - // clang-format on + : [y_buf] "+r"(y_buf), // %[y_buf] + [vu_buf] "+r"(vu_buf), // %[vu_buf] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+rm"(width) // %[width] + : [yuvconstants] "r"(yuvconstants), // %[yuvconstants] + [kShuffleNV21] "m"(kShuffleNV21) + : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", + "xmm5"); } void OMITFP YUY2ToARGBRow_SSSE3(const uint8_t* yuy2_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - // clang-format off - asm volatile ( - "movdqa %[kShuffleYUY2Y],%%xmm6 \n" - "movdqa %[kShuffleYUY2UV],%%xmm7 \n" - YUVTORGB_SETUP(yuvconstants) - "pcmpeqb %%xmm5,%%xmm5 \n" + asm("movdqa %[kShuffleYUY2Y],%%xmm6 \n" + "movdqa %[kShuffleYUY2UV],%%xmm7 \n" YUVTORGB_SETUP( + yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" - LABELALIGN - "1: \n" - READYUY2 - YUVTORGB(yuvconstants) - STOREARGB + LABELALIGN "1: \n" READYUY2 + YUVTORGB(yuvconstants) STOREARGB "sub $0x8,%[width] \n" "jg 1b \n" - : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] - [kShuffleYUY2Y]"m"(kShuffleYUY2Y), - [kShuffleYUY2UV]"m"(kShuffleYUY2UV) - : "memory", "cc", YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); - // clang-format on + : [yuy2_buf] "+r"(yuy2_buf), // %[yuy2_buf] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+rm"(width) // %[width] + : [yuvconstants] "r"(yuvconstants), // %[yuvconstants] + [kShuffleYUY2Y] "m"(kShuffleYUY2Y), [kShuffleYUY2UV] "m"(kShuffleYUY2UV) + : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", + "xmm5", "xmm6", "xmm7"); } void OMITFP UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - // clang-format off - asm volatile ( - "movdqa %[kShuffleUYVYY],%%xmm6 \n" - "movdqa %[kShuffleUYVYUV],%%xmm7 \n" - YUVTORGB_SETUP(yuvconstants) - "pcmpeqb %%xmm5,%%xmm5 \n" + asm("movdqa %[kShuffleUYVYY],%%xmm6 \n" + "movdqa %[kShuffleUYVYUV],%%xmm7 \n" YUVTORGB_SETUP( + yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" - LABELALIGN - "1: \n" - READUYVY - YUVTORGB(yuvconstants) - STOREARGB + LABELALIGN "1: \n" READUYVY + YUVTORGB(yuvconstants) STOREARGB "sub $0x8,%[width] \n" "jg 1b \n" - : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] - [kShuffleUYVYY]"m"(kShuffleUYVYY), - [kShuffleUYVYUV]"m"(kShuffleUYVYUV) - : "memory", "cc", YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); - // clang-format on + : [uyvy_buf] "+r"(uyvy_buf), // %[uyvy_buf] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+rm"(width) // %[width] + : [yuvconstants] "r"(yuvconstants), // %[yuvconstants] + [kShuffleUYVYY] "m"(kShuffleUYVYY), [kShuffleUYVYUV] "m"(kShuffleUYVYUV) + : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", + "xmm5"); } void OMITFP P210ToARGBRow_SSSE3(const uint16_t* y_buf, @@ -3245,8 +3145,7 @@ void OMITFP P210ToARGBRow_SSSE3(const uint16_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int 
width) { - asm volatile( - YUVTORGB_SETUP( + asm(YUVTORGB_SETUP( yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" LABELALIGN "1: \n" READP210 @@ -3267,8 +3166,7 @@ void OMITFP P410ToARGBRow_SSSE3(const uint16_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile( - YUVTORGB_SETUP( + asm(YUVTORGB_SETUP( yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" LABELALIGN "1: \n" READP410 @@ -3289,7 +3187,7 @@ void OMITFP P210ToAR30Row_SSSE3(const uint16_t* y_buf, uint8_t* dst_ar30, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm ( YUVTORGB_SETUP(yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $14,%%xmm5 \n" @@ -3320,7 +3218,7 @@ void OMITFP P410ToAR30Row_SSSE3(const uint16_t* y_buf, uint8_t* dst_ar30, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm ( YUVTORGB_SETUP(yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $14,%%xmm5 \n" @@ -3352,7 +3250,7 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, uint8_t* dst_rgba, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm ( YUVTORGB_SETUP(yuvconstants) "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" @@ -3763,7 +3661,7 @@ void OMITFP I444ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm ( YUVTORGB_SETUP_AVX2(yuvconstants) "sub %[u_buf],%[v_buf] \n" "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" @@ -3797,7 +3695,7 @@ void OMITFP I422ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm ( YUVTORGB_SETUP_AVX2(yuvconstants) "sub %[u_buf],%[v_buf] \n" "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" @@ -3837,7 +3735,7 @@ void OMITFP I422ToARGBRow_AVX512BW(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm ( YUVTORGB_SETUP_AVX512BW(yuvconstants) "sub %[u_buf],%[v_buf] \n" "vpcmpeqb %%xmm5,%%xmm5,%%xmm5 \n" @@ -3876,7 +3774,7 @@ void OMITFP I422ToAR30Row_AVX2(const uint8_t* y_buf, uint8_t* dst_ar30, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm ( YUVTORGB_SETUP_AVX2(yuvconstants) "sub %[u_buf],%[v_buf] \n" "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants @@ -3916,7 +3814,7 @@ void OMITFP I210ToARGBRow_AVX2(const uint16_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm ( YUVTORGB_SETUP_AVX2(yuvconstants) "sub %[u_buf],%[v_buf] \n" "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" @@ -3951,7 +3849,7 @@ void OMITFP I212ToARGBRow_AVX2(const uint16_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm ( YUVTORGB_SETUP_AVX2(yuvconstants) "sub %[u_buf],%[v_buf] \n" "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" @@ -3986,7 +3884,7 @@ void OMITFP I210ToAR30Row_AVX2(const uint16_t* y_buf, uint8_t* dst_ar30, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm ( YUVTORGB_SETUP_AVX2(yuvconstants) "sub %[u_buf],%[v_buf] \n" "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants @@ -4026,7 +3924,7 @@ void OMITFP I212ToAR30Row_AVX2(const uint16_t* y_buf, uint8_t* dst_ar30, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm ( YUVTORGB_SETUP_AVX2(yuvconstants) "sub %[u_buf],%[v_buf] \n" "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants @@ -4066,7 +3964,7 @@ void OMITFP I410ToARGBRow_AVX2(const uint16_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile 
( + asm ( YUVTORGB_SETUP_AVX2(yuvconstants) "sub %[u_buf],%[v_buf] \n" "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" @@ -4102,8 +4000,7 @@ void OMITFP I210AlphaToARGBRow_AVX2(const uint16_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile( - YUVTORGB_SETUP_AVX2( + asm(YUVTORGB_SETUP_AVX2( yuvconstants) "sub %[u_buf],%[v_buf] \n" LABELALIGN "1: \n" READYUVA210_AVX2 @@ -4138,8 +4035,7 @@ void OMITFP I410AlphaToARGBRow_AVX2(const uint16_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile( - YUVTORGB_SETUP_AVX2( + asm(YUVTORGB_SETUP_AVX2( yuvconstants) "sub %[u_buf],%[v_buf] \n" LABELALIGN "1: \n" READYUVA410_AVX2 @@ -4173,7 +4069,7 @@ void OMITFP I410ToAR30Row_AVX2(const uint16_t* y_buf, uint8_t* dst_ar30, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm ( YUVTORGB_SETUP_AVX2(yuvconstants) "sub %[u_buf],%[v_buf] \n" "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants @@ -4214,34 +4110,27 @@ void OMITFP I444AlphaToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - // clang-format off - asm volatile ( - YUVTORGB_SETUP_AVX2(yuvconstants) - "sub %[u_buf],%[v_buf] \n" + asm(YUVTORGB_SETUP_AVX2( + yuvconstants) "sub %[u_buf],%[v_buf] \n" - LABELALIGN - "1: \n" - READYUVA444_AVX2 - YUVTORGB_AVX2(yuvconstants) - STOREARGB_AVX2 + LABELALIGN "1: \n" READYUVA444_AVX2 + YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 "subl $0x10,%[width] \n" "jg 1b \n" "vzeroupper \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [a_buf]"+r"(a_buf), // %[a_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] + : [y_buf] "+r"(y_buf), // %[y_buf] + [u_buf] "+r"(u_buf), // %[u_buf] + [v_buf] "+r"(v_buf), // %[v_buf] + [a_buf] "+r"(a_buf), // %[a_buf] + [dst_argb] "+r"(dst_argb), // %[dst_argb] #if defined(__i386__) - [width]"+m"(width) // %[width] + [width] "+m"(width) // %[width] #else - [width]"+rm"(width) // %[width] + [width] "+rm"(width) // %[width] #endif - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS_AVX2 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); - // clang-format on + : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5"); } #endif // HAS_I444ALPHATOARGBROW_AVX2 @@ -4255,34 +4144,27 @@ void OMITFP I422AlphaToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - // clang-format off - asm volatile ( - YUVTORGB_SETUP_AVX2(yuvconstants) - "sub %[u_buf],%[v_buf] \n" + asm(YUVTORGB_SETUP_AVX2( + yuvconstants) "sub %[u_buf],%[v_buf] \n" - LABELALIGN - "1: \n" - READYUVA422_AVX2 - YUVTORGB_AVX2(yuvconstants) - STOREARGB_AVX2 + LABELALIGN "1: \n" READYUVA422_AVX2 + YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 "subl $0x10,%[width] \n" "jg 1b \n" "vzeroupper \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [a_buf]"+r"(a_buf), // %[a_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] + : [y_buf] "+r"(y_buf), // %[y_buf] + [u_buf] "+r"(u_buf), // %[u_buf] + [v_buf] "+r"(v_buf), // %[v_buf] + [a_buf] "+r"(a_buf), // %[a_buf] + [dst_argb] "+r"(dst_argb), // %[dst_argb] #if defined(__i386__) - [width]"+m"(width) // %[width] + [width] "+m"(width) // %[width] #else - [width]"+rm"(width) // %[width] + [width] "+rm"(width) // %[width] #endif - : 
[yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS_AVX2 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); - // clang-format on + : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5"); } #endif // HAS_I422ALPHATOARGBROW_AVX2 @@ -4295,7 +4177,7 @@ void OMITFP I422ToRGBARow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm ( YUVTORGB_SETUP_AVX2(yuvconstants) "sub %[u_buf],%[v_buf] \n" "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" @@ -4338,28 +4220,21 @@ void OMITFP NV12ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - // clang-format off - asm volatile ( - YUVTORGB_SETUP_AVX2(yuvconstants) - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + asm(YUVTORGB_SETUP_AVX2( + yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - LABELALIGN - "1: \n" - READNV12_AVX2 - YUVTORGB_AVX2(yuvconstants) - STOREARGB_AVX2 + LABELALIGN "1: \n" READNV12_AVX2 + YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 "sub $0x10,%[width] \n" "jg 1b \n" "vzeroupper \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [uv_buf]"+r"(uv_buf), // %[uv_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS_AVX2 - "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); - // clang-format on + : [y_buf] "+r"(y_buf), // %[y_buf] + [uv_buf] "+r"(uv_buf), // %[uv_buf] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+rm"(width) // %[width] + : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm0", "xmm1", "xmm2", + "xmm3", "xmm4", "xmm5"); } #endif // HAS_NV12TOARGBROW_AVX2 @@ -4371,29 +4246,22 @@ void OMITFP NV21ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - // clang-format off - asm volatile ( - YUVTORGB_SETUP_AVX2(yuvconstants) - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + asm(YUVTORGB_SETUP_AVX2( + yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - LABELALIGN - "1: \n" - READNV21_AVX2 - YUVTORGB_AVX2(yuvconstants) - STOREARGB_AVX2 + LABELALIGN "1: \n" READNV21_AVX2 + YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 "sub $0x10,%[width] \n" "jg 1b \n" "vzeroupper \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [vu_buf]"+r"(vu_buf), // %[vu_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] - [kShuffleNV21]"m"(kShuffleNV21) - : "memory", "cc", YUVTORGB_REGS_AVX2 - "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); - // clang-format on + : [y_buf] "+r"(y_buf), // %[y_buf] + [vu_buf] "+r"(vu_buf), // %[vu_buf] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+rm"(width) // %[width] + : [yuvconstants] "r"(yuvconstants), // %[yuvconstants] + [kShuffleNV21] "m"(kShuffleNV21) + : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm0", "xmm1", "xmm2", + "xmm3", "xmm4", "xmm5"); } #endif // HAS_NV21TOARGBROW_AVX2 @@ -4404,31 +4272,22 @@ void OMITFP YUY2ToARGBRow_AVX2(const uint8_t* yuy2_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - // clang-format off - asm volatile ( - "vbroadcastf128 %[kShuffleYUY2Y],%%ymm6 \n" - "vbroadcastf128 %[kShuffleYUY2UV],%%ymm7 \n" - YUVTORGB_SETUP_AVX2(yuvconstants) - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + asm("vbroadcastf128 %[kShuffleYUY2Y],%%ymm6 \n" + 
"vbroadcastf128 %[kShuffleYUY2UV],%%ymm7 \n" YUVTORGB_SETUP_AVX2( + yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - LABELALIGN - "1: \n" - READYUY2_AVX2 - YUVTORGB_AVX2(yuvconstants) - STOREARGB_AVX2 + LABELALIGN "1: \n" READYUY2_AVX2 + YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 "sub $0x10,%[width] \n" "jg 1b \n" "vzeroupper \n" - : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] - [kShuffleYUY2Y]"m"(kShuffleYUY2Y), - [kShuffleYUY2UV]"m"(kShuffleYUY2UV) - : "memory", "cc", YUVTORGB_REGS_AVX2 - "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); - // clang-format on + : [yuy2_buf] "+r"(yuy2_buf), // %[yuy2_buf] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+rm"(width) // %[width] + : [yuvconstants] "r"(yuvconstants), // %[yuvconstants] + [kShuffleYUY2Y] "m"(kShuffleYUY2Y), [kShuffleYUY2UV] "m"(kShuffleYUY2UV) + : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm0", "xmm1", "xmm2", + "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } #endif // HAS_YUY2TOARGBROW_AVX2 @@ -4439,31 +4298,22 @@ void OMITFP UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - // clang-format off - asm volatile ( - "vbroadcastf128 %[kShuffleUYVYY],%%ymm6 \n" - "vbroadcastf128 %[kShuffleUYVYUV],%%ymm7 \n" - YUVTORGB_SETUP_AVX2(yuvconstants) - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + asm("vbroadcastf128 %[kShuffleUYVYY],%%ymm6 \n" + "vbroadcastf128 %[kShuffleUYVYUV],%%ymm7 \n" YUVTORGB_SETUP_AVX2( + yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - LABELALIGN - "1: \n" - READUYVY_AVX2 - YUVTORGB_AVX2(yuvconstants) - STOREARGB_AVX2 + LABELALIGN "1: \n" READUYVY_AVX2 + YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 "sub $0x10,%[width] \n" "jg 1b \n" "vzeroupper \n" - : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] - [kShuffleUYVYY]"m"(kShuffleUYVYY), - [kShuffleUYVYUV]"m"(kShuffleUYVYUV) - : "memory", "cc", YUVTORGB_REGS_AVX2 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); - // clang-format on + : [uyvy_buf] "+r"(uyvy_buf), // %[uyvy_buf] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+rm"(width) // %[width] + : [yuvconstants] "r"(yuvconstants), // %[yuvconstants] + [kShuffleUYVYY] "m"(kShuffleUYVYY), [kShuffleUYVYUV] "m"(kShuffleUYVYUV) + : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7"); } #endif // HAS_UYVYTOARGBROW_AVX2 @@ -4475,28 +4325,21 @@ void OMITFP P210ToARGBRow_AVX2(const uint16_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - // clang-format off - asm volatile ( - YUVTORGB_SETUP_AVX2(yuvconstants) - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + asm(YUVTORGB_SETUP_AVX2( + yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - LABELALIGN - "1: \n" - READP210_AVX2 - YUVTORGB_AVX2(yuvconstants) - STOREARGB_AVX2 + LABELALIGN "1: \n" READP210_AVX2 + YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 "sub $0x10,%[width] \n" "jg 1b \n" "vzeroupper \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [uv_buf]"+r"(uv_buf), // %[uv_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS_AVX2 - "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); - // clang-format on + : [y_buf] 
"+r"(y_buf), // %[y_buf] + [uv_buf] "+r"(uv_buf), // %[uv_buf] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+rm"(width) // %[width] + : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm0", "xmm1", "xmm2", + "xmm3", "xmm4", "xmm5"); } #endif // HAS_P210TOARGBROW_AVX2 @@ -4508,28 +4351,21 @@ void OMITFP P410ToARGBRow_AVX2(const uint16_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - // clang-format off - asm volatile ( - YUVTORGB_SETUP_AVX2(yuvconstants) - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + asm(YUVTORGB_SETUP_AVX2( + yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - LABELALIGN - "1: \n" - READP410_AVX2 - YUVTORGB_AVX2(yuvconstants) - STOREARGB_AVX2 + LABELALIGN "1: \n" READP410_AVX2 + YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 "sub $0x10,%[width] \n" "jg 1b \n" "vzeroupper \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [uv_buf]"+r"(uv_buf), // %[uv_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS_AVX2 - "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); - // clang-format on + : [y_buf] "+r"(y_buf), // %[y_buf] + [uv_buf] "+r"(uv_buf), // %[uv_buf] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+rm"(width) // %[width] + : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm0", "xmm1", "xmm2", + "xmm3", "xmm4", "xmm5"); } #endif // HAS_P410TOARGBROW_AVX2 @@ -4541,7 +4377,7 @@ void OMITFP P210ToAR30Row_AVX2(const uint16_t* y_buf, uint8_t* dst_ar30, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm ( YUVTORGB_SETUP_AVX2(yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants "vpsrlw $14,%%ymm5,%%ymm5 \n" @@ -4578,7 +4414,7 @@ void OMITFP P410ToAR30Row_AVX2(const uint16_t* y_buf, uint8_t* dst_ar30, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm ( YUVTORGB_SETUP_AVX2(yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants "vpsrlw $14,%%ymm5,%%ymm5 \n" @@ -4612,8 +4448,7 @@ void I400ToARGBRow_SSE2(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile( - "movdqa 96(%3),%%xmm2 \n" // yg = 18997 = 1.164 + asm("movdqa 96(%3),%%xmm2 \n" // yg = 18997 = 1.164 "movdqa 128(%3),%%xmm3 \n" // ygb = 1160 = 1.164 * 16 "pcmpeqb %%xmm4,%%xmm4 \n" // 0xff000000 "pslld $0x18,%%xmm4 \n" @@ -4657,8 +4492,7 @@ void I400ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile( - "vmovdqa 96(%3),%%ymm2 \n" // yg = 18997 = 1.164 + asm("vmovdqa 96(%3),%%ymm2 \n" // yg = 18997 = 1.164 "vmovdqa 128(%3),%%ymm3 \n" // ygb = -1160 = 1.164*16 "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 0xff000000 "vpslld $0x18,%%ymm4,%%ymm4 \n" @@ -4701,9 +4535,9 @@ static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) { intptr_t temp_width = (intptr_t)(width); - asm volatile("movdqa %3,%%xmm5 \n" + asm("movdqa %3,%%xmm5 \n" - LABELALIGN + LABELALIGN "1: \n" "movdqu -0x10(%0,%2,1),%%xmm0 \n" "pshufb %%xmm5,%%xmm0 \n" @@ -4711,20 +4545,20 @@ void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) { "lea 0x10(%1),%1 \n" "sub $0x10,%2 \n" "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(temp_width) // %2 - : "m"(kShuffleMirror) // %3 - : "memory", "cc", "xmm0", 
"xmm5"); + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : "m"(kShuffleMirror) // %3 + : "memory", "cc", "xmm0", "xmm5"); } #endif // HAS_MIRRORROW_SSSE3 #ifdef HAS_MIRRORROW_AVX2 void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { intptr_t temp_width = (intptr_t)(width); - asm volatile("vbroadcastf128 %3,%%ymm5 \n" + asm("vbroadcastf128 %3,%%ymm5 \n" - LABELALIGN + LABELALIGN "1: \n" "vmovdqu -0x20(%0,%2,1),%%ymm0 \n" "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" @@ -4734,11 +4568,11 @@ void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { "sub $0x20,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(temp_width) // %2 - : "m"(kShuffleMirror) // %3 - : "memory", "cc", "xmm0", "xmm5"); + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : "m"(kShuffleMirror) // %3 + : "memory", "cc", "xmm0", "xmm5"); } #endif // HAS_MIRRORROW_AVX2 @@ -4749,9 +4583,9 @@ static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u, void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width) { intptr_t temp_width = (intptr_t)(width); - asm volatile("movdqa %3,%%xmm5 \n" + asm("movdqa %3,%%xmm5 \n" - LABELALIGN + LABELALIGN "1: \n" "movdqu -0x10(%0,%2,2),%%xmm0 \n" "pshufb %%xmm5,%%xmm0 \n" @@ -4759,20 +4593,20 @@ void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width) { "lea 0x10(%1),%1 \n" "sub $0x8,%2 \n" "jg 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_uv), // %1 - "+r"(temp_width) // %2 - : "m"(kShuffleMirrorUV) // %3 - : "memory", "cc", "xmm0", "xmm5"); + : "+r"(src_uv), // %0 + "+r"(dst_uv), // %1 + "+r"(temp_width) // %2 + : "m"(kShuffleMirrorUV) // %3 + : "memory", "cc", "xmm0", "xmm5"); } #endif // HAS_MIRRORUVROW_SSSE3 #ifdef HAS_MIRRORUVROW_AVX2 void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width) { intptr_t temp_width = (intptr_t)(width); - asm volatile("vbroadcastf128 %3,%%ymm5 \n" + asm("vbroadcastf128 %3,%%ymm5 \n" - LABELALIGN + LABELALIGN "1: \n" "vmovdqu -0x20(%0,%2,2),%%ymm0 \n" "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" @@ -4782,11 +4616,11 @@ void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width) { "sub $0x10,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_uv), // %0 - "+r"(dst_uv), // %1 - "+r"(temp_width) // %2 - : "m"(kShuffleMirrorUV) // %3 - : "memory", "cc", "xmm0", "xmm5"); + : "+r"(src_uv), // %0 + "+r"(dst_uv), // %1 + "+r"(temp_width) // %2 + : "m"(kShuffleMirrorUV) // %3 + : "memory", "cc", "xmm0", "xmm5"); } #endif // HAS_MIRRORUVROW_AVX2 @@ -4799,8 +4633,7 @@ void MirrorSplitUVRow_SSSE3(const uint8_t* src, uint8_t* dst_v, int width) { intptr_t temp_width = (intptr_t)(width); - asm volatile( - "movdqa %4,%%xmm1 \n" + asm("movdqa %4,%%xmm1 \n" "lea -0x10(%0,%3,2),%0 \n" "sub %1,%2 \n" @@ -4839,8 +4672,7 @@ void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24, int width) { intptr_t temp_width = (intptr_t)(width); src_rgb24 += width * 3 - 48; - asm volatile( - "movdqa %3,%%xmm4 \n" + asm("movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" LABELALIGN @@ -4874,9 +4706,9 @@ void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24, void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { intptr_t temp_width = (intptr_t)(width); - asm volatile("lea -0x10(%0,%2,4),%0 \n" + asm("lea -0x10(%0,%2,4),%0 \n" - LABELALIGN + LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "pshufd $0x1b,%%xmm0,%%xmm0 \n" @@ -4885,11 +4717,11 @@ void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { "lea 0x10(%1),%1 \n" "sub $0x4,%2 \n" "jg 1b \n" - : 
"+r"(src), // %0 - "+r"(dst), // %1 - "+r"(temp_width) // %2 - : - : "memory", "cc", "xmm0"); + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : + : "memory", "cc", "xmm0"); } #endif // HAS_ARGBMIRRORROW_SSE2 @@ -4898,9 +4730,9 @@ void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { intptr_t temp_width = (intptr_t)(width); - asm volatile("vmovdqu %3,%%ymm5 \n" + asm("vmovdqu %3,%%ymm5 \n" - LABELALIGN + LABELALIGN "1: \n" "vpermd -0x20(%0,%2,4),%%ymm5,%%ymm0 \n" "vmovdqu %%ymm0,(%1) \n" @@ -4908,11 +4740,11 @@ void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { "sub $0x8,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(temp_width) // %2 - : "m"(kARGBShuffleMirror_AVX2) // %3 - : "memory", "cc", "xmm0", "xmm5"); + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : "m"(kARGBShuffleMirror_AVX2) // %3 + : "memory", "cc", "xmm0", "xmm5"); } #endif // HAS_ARGBMIRRORROW_AVX2 @@ -4921,8 +4753,7 @@ void SplitUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + asm("vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpsrlw $0x8,%%ymm5,%%ymm5 \n" "sub %1,%2 \n" @@ -4959,8 +4790,7 @@ void SplitUVRow_SSE2(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" + asm("pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" "sub %1,%2 \n" @@ -4996,8 +4826,7 @@ void DetileRow_SSE2(const uint8_t* src, ptrdiff_t src_tile_stride, uint8_t* dst, int width) { - asm volatile( - "1: \n" + asm("1: \n" "movdqu (%0),%%xmm0 \n" "sub $0x10,%2 \n" "lea (%0,%3),%0 \n" @@ -5017,8 +4846,7 @@ void DetileRow_16_SSE2(const uint16_t* src, ptrdiff_t src_tile_stride, uint16_t* dst, int width) { - asm volatile( - "1: \n" + asm("1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" "lea (%0,%3,2),%0 \n" @@ -5040,8 +4868,7 @@ void DetileRow_16_AVX(const uint16_t* src, ptrdiff_t src_tile_stride, uint16_t* dst, int width) { - asm volatile( - "1: \n" + asm("1: \n" "vmovdqu (%0),%%ymm0 \n" "lea (%0,%3,2),%0 \n" "vmovdqu %%ymm0,(%1) \n" @@ -5065,8 +4892,7 @@ void DetileToYUY2_SSE2(const uint8_t* src_y, ptrdiff_t src_uv_tile_stride, uint8_t* dst_yuy2, int width) { - asm volatile( - "1: \n" + asm("1: \n" "movdqu (%0),%%xmm0 \n" // Load 16 Y "sub $0x10,%3 \n" "lea (%0,%4),%0 \n" @@ -5104,8 +4930,7 @@ void DetileSplitUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile( - "movdqu %4,%%xmm1 \n" + asm("movdqu %4,%%xmm1 \n" "1: \n" "movdqu (%0),%%xmm0 \n" "lea (%0, %5),%0 \n" @@ -5131,9 +4956,9 @@ void MergeUVRow_AVX512BW(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width) { - asm volatile("sub %0,%1 \n" + asm("sub %0,%1 \n" - LABELALIGN + LABELALIGN "1: \n" "vpmovzxbw (%0),%%zmm0 \n" "vpmovzxbw 0x00(%0,%1,1),%%zmm1 \n" @@ -5145,12 +4970,12 @@ void MergeUVRow_AVX512BW(const uint8_t* src_u, "sub $0x20,%3 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_u), // %0 - "+r"(src_v), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_MERGEUVROW_AVX512BW @@ -5159,9 +4984,9 @@ void MergeUVRow_AVX2(const uint8_t* src_u, const uint8_t* 
src_v, uint8_t* dst_uv, int width) { - asm volatile("sub %0,%1 \n" + asm("sub %0,%1 \n" - LABELALIGN + LABELALIGN "1: \n" "vpmovzxbw (%0),%%ymm0 \n" "vpmovzxbw 0x00(%0,%1,1),%%ymm1 \n" @@ -5173,12 +4998,12 @@ void MergeUVRow_AVX2(const uint8_t* src_u, "sub $0x10,%3 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_u), // %0 - "+r"(src_v), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_MERGEUVROW_AVX2 @@ -5187,9 +5012,9 @@ void MergeUVRow_SSE2(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width) { - asm volatile("sub %0,%1 \n" + asm("sub %0,%1 \n" - LABELALIGN + LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x00(%0,%1,1),%%xmm1 \n" @@ -5202,12 +5027,12 @@ void MergeUVRow_SSE2(const uint8_t* src_u, "lea 0x20(%2),%2 \n" "sub $0x10,%3 \n" "jg 1b \n" - : "+r"(src_u), // %0 - "+r"(src_v), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_MERGEUVROW_SSE2 @@ -5217,12 +5042,9 @@ void MergeUVRow_16_AVX2(const uint16_t* src_u, uint16_t* dst_uv, int depth, int width) { - // clang-format off - asm volatile ( - "vmovd %4,%%xmm3 \n" + asm("vmovd %4,%%xmm3 \n" "vmovd %5,%%xmm4 \n" - "sub %0,%1 \n" // 8 pixels per loop. @@ -5239,14 +5061,13 @@ void MergeUVRow_16_AVX2(const uint16_t* src_u, "sub $0x8,%3 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_u), // %0 - "+r"(src_v), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 - : "r"(16 - depth), // %4 - "r"(32 - depth) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); - // clang-format on + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : "r"(16 - depth), // %4 + "r"(32 - depth) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); } #endif // HAS_MERGEUVROW_AVX2 @@ -5259,14 +5080,12 @@ void SplitUVRow_16_AVX2(const uint16_t* src_uv, int depth, int width) { depth = 16 - depth; - // clang-format off - asm volatile ( - "vmovd %4,%%xmm3 \n" + asm("vmovd %4,%%xmm3 \n" "vbroadcastf128 %5,%%ymm4 \n" "sub %1,%2 \n" - // 16 pixels per loop. - LABELALIGN + // 16 pixels per loop. + LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" @@ -5286,14 +5105,13 @@ void SplitUVRow_16_AVX2(const uint16_t* src_uv, "sub $0x10,%3 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_uv), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : "r"(depth), // %4 - "m"(kSplitUVShuffle16) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); - // clang-format on + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"(depth), // %4 + "m"(kSplitUVShuffle16) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); } #endif // HAS_SPLITUVROW_16_AVX2 @@ -5307,15 +5125,13 @@ void MultiplyRow_16_AVX2(const uint16_t* src_y, uint16_t* dst_y, int scale, int width) { - // clang-format off - asm volatile ( - "vmovd %3,%%xmm3 \n" + asm("vmovd %3,%%xmm3 \n" "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n" "vbroadcastss %%xmm3,%%ymm3 \n" "sub %0,%1 \n" - // 32 pixels per loop. - LABELALIGN + // 32 pixels per loop. 
+ LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" @@ -5327,12 +5143,11 @@ void MultiplyRow_16_AVX2(const uint16_t* src_y, "sub $0x20,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_y), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "r"(scale) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm3"); - // clang-format on + : "+r"(src_y), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(scale) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm3"); } #endif // HAS_MULTIPLYROW_16_AVX2 @@ -5346,15 +5161,13 @@ void DivideRow_16_AVX2(const uint16_t* src_y, uint16_t* dst_y, int scale, int width) { - // clang-format off - asm volatile ( - "vmovd %3,%%xmm3 \n" + asm("vmovd %3,%%xmm3 \n" "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n" "vbroadcastss %%xmm3,%%ymm3 \n" "sub %0,%1 \n" - // 32 pixels per loop. - LABELALIGN + // 32 pixels per loop. + LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" @@ -5366,13 +5179,12 @@ void DivideRow_16_AVX2(const uint16_t* src_y, "sub $0x20,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_y), // %0 - "+r"(dst_y), // %1 - "+r"(width), // %2 - "+r"(scale) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm3"); - // clang-format on + : "+r"(src_y), // %0 + "+r"(dst_y), // %1 + "+r"(width), // %2 + "+r"(scale) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm3"); } #endif // HAS_MULTIPLYROW_16_AVX2 @@ -5385,14 +5197,12 @@ void Convert16To8Row_SSSE3(const uint16_t* src_y, uint8_t* dst_y, int scale, int width) { - // clang-format off - asm volatile ( - "movd %3,%%xmm2 \n" + asm("movd %3,%%xmm2 \n" "punpcklwd %%xmm2,%%xmm2 \n" "pshufd $0x0,%%xmm2,%%xmm2 \n" - // 32 pixels per loop. - LABELALIGN + // 32 pixels per loop. + LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -5404,12 +5214,11 @@ void Convert16To8Row_SSSE3(const uint16_t* src_y, "add $0x10,%1 \n" "sub $0x10,%2 \n" "jg 1b \n" - : "+r"(src_y), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "r"(scale) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2"); - // clang-format on + : "+r"(src_y), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(scale) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #ifdef HAS_CONVERT16TO8ROW_AVX2 @@ -5417,14 +5226,12 @@ void Convert16To8Row_AVX2(const uint16_t* src_y, uint8_t* dst_y, int scale, int width) { - // clang-format off - asm volatile ( - "vmovd %3,%%xmm2 \n" + asm("vmovd %3,%%xmm2 \n" "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n" "vbroadcastss %%xmm2,%%ymm2 \n" - // 32 pixels per loop. - LABELALIGN + // 32 pixels per loop. + LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" @@ -5438,12 +5245,11 @@ void Convert16To8Row_AVX2(const uint16_t* src_y, "sub $0x20,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_y), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "r"(scale) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2"); - // clang-format on + : "+r"(src_y), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(scale) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_CONVERT16TO8ROW_AVX2 @@ -5455,14 +5261,12 @@ void Convert8To16Row_SSE2(const uint8_t* src_y, uint16_t* dst_y, int scale, int width) { - // clang-format off - asm volatile ( - "movd %3,%%xmm2 \n" + asm("movd %3,%%xmm2 \n" "punpcklwd %%xmm2,%%xmm2 \n" "pshufd $0x0,%%xmm2,%%xmm2 \n" - // 32 pixels per loop. - LABELALIGN + // 32 pixels per loop. 
+ LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqa %%xmm0,%%xmm1 \n" @@ -5476,12 +5280,11 @@ void Convert8To16Row_SSE2(const uint8_t* src_y, "add $0x20,%1 \n" "sub $0x10,%2 \n" "jg 1b \n" - : "+r"(src_y), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "r"(scale) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2"); - // clang-format on + : "+r"(src_y), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(scale) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #ifdef HAS_CONVERT8TO16ROW_AVX2 @@ -5489,14 +5292,12 @@ void Convert8To16Row_AVX2(const uint8_t* src_y, uint16_t* dst_y, int scale, int width) { - // clang-format off - asm volatile ( - "vmovd %3,%%xmm2 \n" + asm("vmovd %3,%%xmm2 \n" "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n" "vbroadcastss %%xmm2,%%ymm2 \n" - // 32 pixels per loop. - LABELALIGN + // 32 pixels per loop. + LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" "vpermq $0xd8,%%ymm0,%%ymm0 \n" @@ -5511,12 +5312,11 @@ void Convert8To16Row_AVX2(const uint8_t* src_y, "sub $0x20,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_y), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "r"(scale) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2"); - // clang-format on + : "+r"(src_y), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(scale) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_CONVERT8TO16ROW_AVX2 @@ -5547,7 +5347,7 @@ void SplitRGBRow_SSSE3(const uint8_t* src_rgb, uint8_t* dst_g, uint8_t* dst_b, int width) { - asm volatile(LABELALIGN + asm(LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -5584,13 +5384,13 @@ void SplitRGBRow_SSSE3(const uint8_t* src_rgb, "lea 0x30(%0),%0 \n" "sub $0x10,%4 \n" "jg 1b \n" - : "+r"(src_rgb), // %0 - "+r"(dst_r), // %1 - "+r"(dst_g), // %2 - "+r"(dst_b), // %3 - "+r"(width) // %4 - : "r"(&kSplitRGBShuffle[0]) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2"); + : "+r"(src_rgb), // %0 + "+r"(dst_r), // %1 + "+r"(dst_g), // %2 + "+r"(dst_b), // %3 + "+r"(width) // %4 + : "r"(&kSplitRGBShuffle[0]) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_SPLITRGBROW_SSSE3 @@ -5621,7 +5421,7 @@ void MergeRGBRow_SSSE3(const uint8_t* src_r, const uint8_t* src_b, uint8_t* dst_rgb, int width) { - asm volatile(LABELALIGN + asm(LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu (%1),%%xmm1 \n" @@ -5659,13 +5459,13 @@ void MergeRGBRow_SSSE3(const uint8_t* src_r, "lea 0x30(%3),%3 \n" "sub $0x10,%4 \n" "jg 1b \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(dst_rgb), // %3 - "+r"(width) // %4 - : "r"(&kMergeRGBShuffle[0]) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2"); + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_rgb), // %3 + "+r"(width) // %4 + : "r"(&kMergeRGBShuffle[0]) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_MERGERGBROW_SSSE3 @@ -5676,8 +5476,7 @@ void MergeARGBRow_SSE2(const uint8_t* src_r, const uint8_t* src_a, uint8_t* dst_argb, int width) { - asm volatile( - "sub %0,%1 \n" + asm("sub %0,%1 \n" "sub %0,%2 \n" "sub %0,%3 \n" @@ -5717,7 +5516,7 @@ void MergeXRGBRow_SSE2(const uint8_t* src_r, const uint8_t* src_b, uint8_t* dst_argb, int width) { - asm volatile(LABELALIGN + asm(LABELALIGN "1: \n" "movq (%2),%%xmm0 \n" // B @@ -5738,13 +5537,13 @@ void MergeXRGBRow_SSE2(const uint8_t* src_r, "lea 32(%3),%3 \n" "sub $0x8,%4 \n" "jg 1b \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(dst_argb), // %3 - "+r"(width) // %4 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); + : "+r"(src_r), // %0 + 
"+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_MERGEARGBROW_SSE2 @@ -5755,8 +5554,7 @@ void MergeARGBRow_AVX2(const uint8_t* src_r, const uint8_t* src_a, uint8_t* dst_argb, int width) { - asm volatile( - "sub %0,%1 \n" + asm("sub %0,%1 \n" "sub %0,%2 \n" "sub %0,%3 \n" @@ -5800,7 +5598,7 @@ void MergeXRGBRow_AVX2(const uint8_t* src_r, const uint8_t* src_b, uint8_t* dst_argb, int width) { - asm volatile( + asm( LABELALIGN "1: \n" @@ -5844,8 +5642,7 @@ void SplitARGBRow_SSE2(const uint8_t* src_argb, uint8_t* dst_b, uint8_t* dst_a, int width) { - asm volatile( - "sub %1,%2 \n" + asm("sub %1,%2 \n" "sub %1,%3 \n" "sub %1,%4 \n" @@ -5895,8 +5692,7 @@ void SplitXRGBRow_SSE2(const uint8_t* src_argb, uint8_t* dst_g, uint8_t* dst_b, int width) { - asm volatile( - LABELALIGN + asm(LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" // 00-0F @@ -5945,8 +5741,7 @@ void SplitARGBRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_b, uint8_t* dst_a, int width) { - asm volatile( - "movdqa %6,%%xmm3 \n" + asm("movdqa %6,%%xmm3 \n" "sub %1,%2 \n" "sub %1,%3 \n" "sub %1,%4 \n" @@ -5991,8 +5786,7 @@ void SplitXRGBRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_g, uint8_t* dst_b, int width) { - asm volatile( - "movdqa %5,%%xmm3 \n" + asm("movdqa %5,%%xmm3 \n" LABELALIGN "1: \n" @@ -6032,8 +5826,7 @@ void SplitARGBRow_AVX2(const uint8_t* src_argb, uint8_t* dst_b, uint8_t* dst_a, int width) { - asm volatile( - "sub %1,%2 \n" + asm("sub %1,%2 \n" "sub %1,%3 \n" "sub %1,%4 \n" "vmovdqa %7,%%ymm3 \n" @@ -6083,8 +5876,7 @@ void SplitXRGBRow_AVX2(const uint8_t* src_argb, uint8_t* dst_g, uint8_t* dst_b, int width) { - asm volatile( - "vmovdqa %6,%%ymm3 \n" + asm("vmovdqa %6,%%ymm3 \n" "vbroadcastf128 %5,%%ymm4 \n" LABELALIGN @@ -6130,8 +5922,7 @@ void MergeXR30Row_AVX2(const uint16_t* src_r, int depth, int width) { int shift = depth - 10; - asm volatile( - "sub %0,%1 \n" + asm("sub %0,%1 \n" "sub %0,%2 \n" "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants "vpsrlw $14,%%ymm5,%%ymm5 \n" @@ -6196,8 +5987,7 @@ void MergeAR64Row_AVX2(const uint16_t* src_r, int shift = 16 - depth; int mask = (1 << depth) - 1; mask = (mask << 16) + mask; - asm volatile( - "sub %0,%1 \n" + asm("sub %0,%1 \n" "sub %0,%2 \n" "sub %0,%3 \n" "vmovdqa %8,%%ymm5 \n" @@ -6267,8 +6057,7 @@ void MergeXR64Row_AVX2(const uint16_t* src_r, int shift = 16 - depth; int mask = (1 << depth) - 1; mask = (mask << 16) + mask; - asm volatile( - "sub %0,%1 \n" + asm("sub %0,%1 \n" "sub %0,%2 \n" "vmovdqa %7,%%ymm5 \n" "vmovd %5,%%xmm6 \n" @@ -6330,8 +6119,7 @@ void MergeARGB16To8Row_AVX2(const uint16_t* src_r, int depth, int width) { int shift = depth - 8; - asm volatile( - "sub %0,%1 \n" + asm("sub %0,%1 \n" "sub %0,%2 \n" "sub %0,%3 \n" "vbroadcastf128 %7,%%ymm5 \n" @@ -6386,8 +6174,7 @@ void MergeXRGB16To8Row_AVX2(const uint16_t* src_r, int depth, int width) { int shift = depth - 8; - asm volatile( - "sub %0,%1 \n" + asm("sub %0,%1 \n" "sub %0,%2 \n" "vbroadcastf128 %6,%%ymm5 \n" "vmovd %5,%%xmm6 \n" @@ -6430,8 +6217,7 @@ void MergeXRGB16To8Row_AVX2(const uint16_t* src_r, #ifdef HAS_COPYROW_SSE2 void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "test $0xf,%0 \n" + asm("test $0xf,%0 \n" "jne 2f \n" "test $0xf,%1 \n" "jne 2f \n" @@ -6470,7 +6256,7 @@ void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { #ifdef HAS_COPYROW_AVX void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width) { - asm volatile(LABELALIGN + 
asm(LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" @@ -6481,11 +6267,11 @@ void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width) { "sub $0x40,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "xmm0", "xmm1"); + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1"); } #endif // HAS_COPYROW_AVX @@ -6493,20 +6279,19 @@ void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width) { // Multiple of 1. void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width) { size_t width_tmp = (size_t)(width); - asm volatile("rep movsb \n" - : "+S"(src), // %0 - "+D"(dst), // %1 - "+c"(width_tmp) // %2 - : - : "memory", "cc"); + asm("rep movsb \n" + : "+S"(src), // %0 + "+D"(dst), // %1 + "+c"(width_tmp) // %2 + : + : "memory", "cc"); } #endif // HAS_COPYROW_ERMS #ifdef HAS_ARGBCOPYALPHAROW_SSE2 // width in pixels void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "pcmpeqb %%xmm0,%%xmm0 \n" + asm("pcmpeqb %%xmm0,%%xmm0 \n" "pslld $0x18,%%xmm0 \n" "pcmpeqb %%xmm1,%%xmm1 \n" "psrld $0x8,%%xmm1 \n" @@ -6540,8 +6325,7 @@ void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { #ifdef HAS_ARGBCOPYALPHAROW_AVX2 // width in pixels void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" + asm("vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" "vpsrld $0x8,%%ymm0,%%ymm0 \n" LABELALIGN @@ -6570,7 +6354,7 @@ void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb, uint8_t* dst_a, int width) { - asm volatile(LABELALIGN + asm(LABELALIGN "1: \n" "movdqu (%0), %%xmm0 \n" "movdqu 0x10(%0), %%xmm1 \n" @@ -6583,11 +6367,11 @@ void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb, "lea 0x8(%1), %1 \n" "sub $0x8, %2 \n" "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_a), // %1 - "+rm"(width) // %2 - : - : "memory", "cc", "xmm0", "xmm1"); + : "+r"(src_argb), // %0 + "+r"(dst_a), // %1 + "+rm"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1"); } #endif // HAS_ARGBEXTRACTALPHAROW_SSE2 @@ -6599,8 +6383,7 @@ static const uvec8 kShuffleAlphaShort_AVX2 = { void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb, uint8_t* dst_a, int width) { - asm volatile( - "vmovdqa %3,%%ymm4 \n" + asm("vmovdqa %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" LABELALIGN @@ -6635,8 +6418,7 @@ void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb, #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 // width in pixels void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "pcmpeqb %%xmm0,%%xmm0 \n" + asm("pcmpeqb %%xmm0,%%xmm0 \n" "pslld $0x18,%%xmm0 \n" "pcmpeqb %%xmm1,%%xmm1 \n" "psrld $0x8,%%xmm1 \n" @@ -6672,8 +6454,7 @@ void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 // width in pixels void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" + asm("vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" "vpsrld $0x8,%%ymm0,%%ymm0 \n" LABELALIGN @@ -6703,36 +6484,35 @@ void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { void SetRow_X86(uint8_t* dst, uint8_t v8, int width) { size_t width_tmp = (size_t)(width >> 2); const uint32_t v32 = v8 * 0x01010101u; // Duplicate byte to all bytes. 
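For reference, the "rep stosl" statement below stores the replicated 32-bit pattern width >> 2 times; a rough C equivalent is sketched here (hypothetical helper for illustration only, not part of this change).

#include <stdint.h>
#include <stddef.h>

// Illustration: each "rep stosl" iteration stores the 4-byte pattern and
// advances the destination; the count register holds width >> 2.
static void SetRow32_Sketch(uint8_t* dst, uint32_t v32, size_t count) {
  uint32_t* d = (uint32_t*)dst;
  while (count--) {
    *d++ = v32;
  }
}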
- asm volatile("rep stosl \n" - : "+D"(dst), // %0 - "+c"(width_tmp) // %1 - : "a"(v32) // %2 - : "memory", "cc"); + asm("rep stosl \n" + : "+D"(dst), // %0 + "+c"(width_tmp) // %1 + : "a"(v32) // %2 + : "memory", "cc"); } void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) { size_t width_tmp = (size_t)(width); - asm volatile("rep stosb \n" - : "+D"(dst), // %0 - "+c"(width_tmp) // %1 - : "a"(v8) // %2 - : "memory", "cc"); + asm("rep stosb \n" + : "+D"(dst), // %0 + "+c"(width_tmp) // %1 + : "a"(v8) // %2 + : "memory", "cc"); } void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width) { size_t width_tmp = (size_t)(width); - asm volatile("rep stosl \n" - : "+D"(dst_argb), // %0 - "+c"(width_tmp) // %1 - : "a"(v32) // %2 - : "memory", "cc"); + asm("rep stosl \n" + : "+D"(dst_argb), // %0 + "+c"(width_tmp) // %1 + : "a"(v32) // %2 + : "memory", "cc"); } #endif // HAS_SETROW_X86 #ifdef HAS_YUY2TOYROW_SSE2 void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" + asm("pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" LABELALIGN @@ -6758,7 +6538,7 @@ void YUY2ToNVUVRow_SSE2(const uint8_t* src_yuy2, int stride_yuy2, uint8_t* dst_uv, int width) { - asm volatile(LABELALIGN + asm(LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -6774,11 +6554,11 @@ void YUY2ToNVUVRow_SSE2(const uint8_t* src_yuy2, "lea 0x10(%1),%1 \n" "sub $0x10,%2 \n" "jg 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_uv), // %1 - "+r"(width) // %2 - : "r"((intptr_t)(stride_yuy2)) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); + : "+r"(src_yuy2), // %0 + "+r"(dst_uv), // %1 + "+r"(width) // %2 + : "r"((intptr_t)(stride_yuy2)) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); } void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2, @@ -6786,8 +6566,7 @@ void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" + asm("pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" "sub %1,%2 \n" @@ -6825,8 +6604,7 @@ void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" + asm("pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" "sub %1,%2 \n" @@ -6857,7 +6635,7 @@ void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2, } void UYVYToYRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { - asm volatile(LABELALIGN + asm(LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -6869,11 +6647,11 @@ void UYVYToYRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { "lea 0x10(%1),%1 \n" "sub $0x10,%2 \n" "jg 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "xmm0", "xmm1"); + : "+r"(src_uyvy), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1"); } void UYVYToUVRow_SSE2(const uint8_t* src_uyvy, @@ -6881,8 +6659,7 @@ void UYVYToUVRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" + asm("pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" "sub %1,%2 \n" @@ -6920,8 +6697,7 @@ void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" + asm("pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" "sub %1,%2 \n" @@ -6954,8 +6730,7 @@ void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy, #ifdef HAS_YUY2TOYROW_AVX2 void YUY2ToYRow_AVX2(const uint8_t* 
src_yuy2, uint8_t* dst_y, int width) { - asm volatile( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + asm("vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpsrlw $0x8,%%ymm5,%%ymm5 \n" LABELALIGN @@ -6983,7 +6758,7 @@ void YUY2ToNVUVRow_AVX2(const uint8_t* src_yuy2, int stride_yuy2, uint8_t* dst_uv, int width) { - asm volatile(LABELALIGN + asm(LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" @@ -6999,11 +6774,11 @@ void YUY2ToNVUVRow_AVX2(const uint8_t* src_yuy2, "sub $0x20,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_uv), // %1 - "+r"(width) // %2 - : "r"((intptr_t)(stride_yuy2)) // %3 - : "memory", "cc", "xmm0", "xmm1"); + : "+r"(src_yuy2), // %0 + "+r"(dst_uv), // %1 + "+r"(width) // %2 + : "r"((intptr_t)(stride_yuy2)) // %3 + : "memory", "cc", "xmm0", "xmm1"); } void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2, @@ -7011,8 +6786,7 @@ void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + asm("vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpsrlw $0x8,%%ymm5,%%ymm5 \n" "sub %1,%2 \n" @@ -7051,8 +6825,7 @@ void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + asm("vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpsrlw $0x8,%%ymm5,%%ymm5 \n" "sub %1,%2 \n" @@ -7086,7 +6859,7 @@ void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2, } void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { - asm volatile(LABELALIGN + asm(LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" @@ -7100,19 +6873,18 @@ void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { "sub $0x20,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm5"); + : "+r"(src_uyvy), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm5"); } void UYVYToUVRow_AVX2(const uint8_t* src_uyvy, int stride_uyvy, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + asm("vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpsrlw $0x8,%%ymm5,%%ymm5 \n" "sub %1,%2 \n" @@ -7151,8 +6923,7 @@ void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + asm("vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpsrlw $0x8,%%ymm5,%%ymm5 \n" "sub %1,%2 \n" @@ -7196,8 +6967,7 @@ void ARGBBlendRow_SSSE3(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm volatile( - "pcmpeqb %%xmm7,%%xmm7 \n" + asm("pcmpeqb %%xmm7,%%xmm7 \n" "psrlw $0xf,%%xmm7 \n" "pcmpeqb %%xmm6,%%xmm6 \n" "psrlw $0x8,%%xmm6 \n" @@ -7284,8 +7054,7 @@ void BlendPlaneRow_SSSE3(const uint8_t* src0, const uint8_t* alpha, uint8_t* dst, int width) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" + asm("pcmpeqb %%xmm5,%%xmm5 \n" "psllw $0x8,%%xmm5 \n" "mov $0x80808080,%%eax \n" "movd %%eax,%%xmm6 \n" @@ -7336,8 +7105,7 @@ void BlendPlaneRow_AVX2(const uint8_t* src0, const uint8_t* alpha, uint8_t* dst, int width) { - asm volatile( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + asm("vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpsllw $0x8,%%ymm5,%%ymm5 \n" "mov $0x80808080,%%eax \n" "vmovd %%eax,%%xmm6 \n" @@ -7396,8 +7164,7 @@ static const vec8 kAttenuateShuffle = {6, -128, 6, -128, 6, -128, void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - asm volatile( - "movdqa %3,%%xmm4 \n" 
+ asm("movdqa %3,%%xmm4 \n" "pcmpeqb %%xmm5,%%xmm5 \n" "pslld $0x18,%%xmm5 \n" "pxor %%xmm6,%%xmm6 \n" @@ -7451,8 +7218,7 @@ static const lvec8 kAttenuateShuffle_AVX2 = { void ARGBAttenuateRow_AVX2(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - asm volatile( - "vmovdqa %3,%%ymm4 \n" + asm("vmovdqa %3,%%ymm4 \n" "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpslld $0x18,%%ymm5,%%ymm5 \n" "vpxor %%ymm6,%%ymm6,%%ymm6 \n" @@ -7497,7 +7263,7 @@ void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb, uint8_t* dst_argb, int width) { uintptr_t alpha; - asm volatile( + asm( // 4 pixel loop. LABELALIGN "1: \n" @@ -7545,8 +7311,7 @@ void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb, uint8_t* dst_argb, int width) { uintptr_t alpha; - asm volatile( - "sub %0,%1 \n" + asm("sub %0,%1 \n" "vbroadcastf128 %5,%%ymm5 \n" // 8 pixel loop. @@ -7607,8 +7372,7 @@ void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb, #ifdef HAS_ARGBGRAYROW_SSSE3 // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - asm volatile( - "movdqa %3,%%xmm4 \n" + asm("movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" // 8 pixel loop. @@ -7669,8 +7433,7 @@ static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0, // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) { - asm volatile( - "movdqa %2,%%xmm2 \n" + asm("movdqa %2,%%xmm2 \n" "movdqa %3,%%xmm3 \n" "movdqa %4,%%xmm4 \n" @@ -7730,8 +7493,7 @@ void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, const int8_t* matrix_argb, int width) { - asm volatile( - "movdqu (%3),%%xmm5 \n" + asm("movdqu (%3),%%xmm5 \n" "pshufd $0x00,%%xmm5,%%xmm2 \n" "pshufd $0x55,%%xmm5,%%xmm3 \n" "pshufd $0xaa,%%xmm5,%%xmm4 \n" @@ -7795,8 +7557,7 @@ void ARGBQuantizeRow_SSE2(uint8_t* dst_argb, int interval_size, int interval_offset, int width) { - asm volatile( - "movd %2,%%xmm2 \n" + asm("movd %2,%%xmm2 \n" "movd %3,%%xmm3 \n" "movd %4,%%xmm4 \n" "pshuflw $0x40,%%xmm2,%%xmm2 \n" @@ -7846,8 +7607,7 @@ void ARGBShadeRow_SSE2(const uint8_t* src_argb, uint8_t* dst_argb, int width, uint32_t value) { - asm volatile( - "movd %3,%%xmm2 \n" + asm("movd %3,%%xmm2 \n" "punpcklbw %%xmm2,%%xmm2 \n" "punpcklqdq %%xmm2,%%xmm2 \n" @@ -7882,10 +7642,10 @@ void ARGBMultiplyRow_SSE2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm volatile("pxor %%xmm5,%%xmm5 \n" + asm("pxor %%xmm5,%%xmm5 \n" - // 4 pixel loop. - LABELALIGN + // 4 pixel loop. + LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "lea 0x10(%0),%0 \n" @@ -7904,12 +7664,12 @@ void ARGBMultiplyRow_SSE2(const uint8_t* src_argb, "lea 0x10(%2),%2 \n" "sub $0x4,%3 \n" "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); + : "+r"(src_argb), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #endif // HAS_ARGBMULTIPLYROW_SSE2 @@ -7919,10 +7679,10 @@ void ARGBMultiplyRow_AVX2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm volatile("vpxor %%ymm5,%%ymm5,%%ymm5 \n" + asm("vpxor %%ymm5,%%ymm5,%%ymm5 \n" - // 4 pixel loop. - LABELALIGN + // 4 pixel loop. 
+ LABELALIGN "1: \n" "vmovdqu (%0),%%ymm1 \n" "lea 0x20(%0),%0 \n" @@ -7940,12 +7700,12 @@ void ARGBMultiplyRow_AVX2(const uint8_t* src_argb, "sub $0x8,%3 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); + : "+r"(src_argb), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #endif // HAS_ARGBMULTIPLYROW_AVX2 @@ -7955,7 +7715,7 @@ void ARGBAddRow_SSE2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm volatile( + asm( // 4 pixel loop. LABELALIGN "1: \n" @@ -7983,7 +7743,7 @@ void ARGBAddRow_AVX2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm volatile( + asm( // 4 pixel loop. LABELALIGN "1: \n" @@ -8011,7 +7771,7 @@ void ARGBSubtractRow_SSE2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm volatile( + asm( // 4 pixel loop. LABELALIGN "1: \n" @@ -8039,7 +7799,7 @@ void ARGBSubtractRow_AVX2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm volatile( + asm( // 4 pixel loop. LABELALIGN "1: \n" @@ -8071,8 +7831,7 @@ void SobelXRow_SSE2(const uint8_t* src_y0, const uint8_t* src_y2, uint8_t* dst_sobelx, int width) { - asm volatile( - "sub %0,%1 \n" + asm("sub %0,%1 \n" "sub %0,%2 \n" "sub %0,%3 \n" "pxor %%xmm5,%%xmm5 \n" @@ -8125,8 +7884,7 @@ void SobelYRow_SSE2(const uint8_t* src_y0, const uint8_t* src_y1, uint8_t* dst_sobely, int width) { - asm volatile( - "sub %0,%1 \n" + asm("sub %0,%1 \n" "sub %0,%2 \n" "pxor %%xmm5,%%xmm5 \n" @@ -8178,8 +7936,7 @@ void SobelRow_SSE2(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_argb, int width) { - asm volatile( - "sub %0,%1 \n" + asm("sub %0,%1 \n" "pcmpeqb %%xmm5,%%xmm5 \n" "pslld $0x18,%%xmm5 \n" @@ -8225,8 +7982,7 @@ void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_y, int width) { - asm volatile( - "sub %0,%1 \n" + asm("sub %0,%1 \n" "pcmpeqb %%xmm5,%%xmm5 \n" "pslld $0x18,%%xmm5 \n" @@ -8260,8 +8016,7 @@ void SobelXYRow_SSE2(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_argb, int width) { - asm volatile( - "sub %0,%1 \n" + asm("sub %0,%1 \n" "pcmpeqb %%xmm5,%%xmm5 \n" // 8 pixel loop. 
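A note on the qualifier dropped throughout these hunks: an extended asm statement with no output operands is implicitly volatile, while one with outputs stays in place as long as its results are actually used, so the explicit qualifier is redundant for these row functions. A minimal stand-alone sketch of that behaviour (assumed example, not code from this tree):

#include <stdint.h>

// No volatile needed: the output operand ties the statement to its use,
// so the compiler keeps it while y is consumed by the caller.
static inline uint32_t BSwap32_Sketch(uint32_t x) {
  uint32_t y = x;
  asm("bswap %0" : "+r"(y));
  return y;
}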
@@ -8308,8 +8063,7 @@ void ComputeCumulativeSumRow_SSE2(const uint8_t* row, int32_t* cumsum, const int32_t* previous_cumsum, int width) { - asm volatile( - "pxor %%xmm0,%%xmm0 \n" + asm("pxor %%xmm0,%%xmm0 \n" "pxor %%xmm1,%%xmm1 \n" "sub $0x4,%3 \n" "jl 49f \n" @@ -8388,8 +8142,7 @@ void CumulativeSumToAverageRow_SSE2(const int32_t* topleft, int area, uint8_t* dst, int count) { - asm volatile( - "movd %5,%%xmm5 \n" + asm("movd %5,%%xmm5 \n" "cvtdq2ps %%xmm5,%%xmm5 \n" "rcpss %%xmm5,%%xmm4 \n" "pshufd $0x0,%%xmm4,%%xmm4 \n" @@ -8523,8 +8276,7 @@ void ARGBAffineRow_SSE2(const uint8_t* src_argb, int width) { intptr_t src_argb_stride_temp = src_argb_stride; intptr_t temp; - asm volatile( - "movq (%3),%%xmm2 \n" + asm("movq (%3),%%xmm2 \n" "movq 0x08(%3),%%xmm7 \n" "shl $0x10,%1 \n" "add $0x4,%1 \n" @@ -8608,8 +8360,7 @@ void InterpolateRow_SSSE3(uint8_t* dst_ptr, ptrdiff_t src_stride, int width, int source_y_fraction) { - asm volatile( - "sub %1,%0 \n" + asm("sub %1,%0 \n" "cmp $0x0,%3 \n" "je 100f \n" "cmp $0x80,%3 \n" @@ -8689,8 +8440,7 @@ void InterpolateRow_AVX2(uint8_t* dst_ptr, ptrdiff_t src_stride, int width, int source_y_fraction) { - asm volatile( - "sub %1,%0 \n" + asm("sub %1,%0 \n" "cmp $0x0,%3 \n" "je 100f \n" "cmp $0x80,%3 \n" @@ -8766,9 +8516,9 @@ void ARGBShuffleRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, const uint8_t* shuffler, int width) { - asm volatile("movdqu (%3),%%xmm5 \n" + asm("movdqu (%3),%%xmm5 \n" - LABELALIGN + LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -8780,11 +8530,11 @@ void ARGBShuffleRow_SSSE3(const uint8_t* src_argb, "lea 0x20(%1),%1 \n" "sub $0x8,%2 \n" "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(shuffler) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm5"); + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(shuffler) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm5"); } #endif // HAS_ARGBSHUFFLEROW_SSSE3 @@ -8794,9 +8544,9 @@ void ARGBShuffleRow_AVX2(const uint8_t* src_argb, uint8_t* dst_argb, const uint8_t* shuffler, int width) { - asm volatile("vbroadcastf128 (%3),%%ymm5 \n" + asm("vbroadcastf128 (%3),%%ymm5 \n" - LABELALIGN + LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" @@ -8809,11 +8559,11 @@ void ARGBShuffleRow_AVX2(const uint8_t* src_argb, "sub $0x10,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(shuffler) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm5"); + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(shuffler) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm5"); } #endif // HAS_ARGBSHUFFLEROW_AVX2 @@ -8823,9 +8573,9 @@ void I422ToYUY2Row_SSE2(const uint8_t* src_y, const uint8_t* src_v, uint8_t* dst_yuy2, int width) { - asm volatile("sub %1,%2 \n" + asm("sub %1,%2 \n" - LABELALIGN + LABELALIGN "1: \n" "movq (%1),%%xmm2 \n" "movq 0x00(%1,%2,1),%%xmm1 \n" @@ -8841,13 +8591,13 @@ void I422ToYUY2Row_SSE2(const uint8_t* src_y, "lea 0x20(%3),%3 \n" "sub $0x10,%4 \n" "jg 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_yuy2), // %3 - "+rm"(width) // %4 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_yuy2), // %3 + "+rm"(width) // %4 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_I422TOYUY2ROW_SSE2 @@ -8857,9 +8607,9 @@ void I422ToUYVYRow_SSE2(const uint8_t* src_y, const uint8_t* src_v, uint8_t* 
dst_uyvy, int width) { - asm volatile("sub %1,%2 \n" + asm("sub %1,%2 \n" - LABELALIGN + LABELALIGN "1: \n" "movq (%1),%%xmm2 \n" "movq 0x00(%1,%2,1),%%xmm1 \n" @@ -8875,13 +8625,13 @@ void I422ToUYVYRow_SSE2(const uint8_t* src_y, "lea 0x20(%3),%3 \n" "sub $0x10,%4 \n" "jg 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_uyvy), // %3 - "+rm"(width) // %4 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_uyvy), // %3 + "+rm"(width) // %4 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_I422TOUYVYROW_SSE2 @@ -8891,9 +8641,9 @@ void I422ToYUY2Row_AVX2(const uint8_t* src_y, const uint8_t* src_v, uint8_t* dst_yuy2, int width) { - asm volatile("sub %1,%2 \n" + asm("sub %1,%2 \n" - LABELALIGN + LABELALIGN "1: \n" "vpmovzxbw (%1),%%ymm1 \n" "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n" @@ -8912,13 +8662,13 @@ void I422ToYUY2Row_AVX2(const uint8_t* src_y, "sub $0x20,%4 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_yuy2), // %3 - "+rm"(width) // %4 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_yuy2), // %3 + "+rm"(width) // %4 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_I422TOYUY2ROW_AVX2 @@ -8928,9 +8678,9 @@ void I422ToUYVYRow_AVX2(const uint8_t* src_y, const uint8_t* src_v, uint8_t* dst_uyvy, int width) { - asm volatile("sub %1,%2 \n" + asm("sub %1,%2 \n" - LABELALIGN + LABELALIGN "1: \n" "vpmovzxbw (%1),%%ymm1 \n" "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n" @@ -8949,13 +8699,13 @@ void I422ToUYVYRow_AVX2(const uint8_t* src_y, "sub $0x20,%4 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_uyvy), // %3 - "+rm"(width) // %4 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_uyvy), // %3 + "+rm"(width) // %4 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_I422TOUYVYROW_AVX2 @@ -8964,10 +8714,10 @@ void ARGBPolynomialRow_SSE2(const uint8_t* src_argb, uint8_t* dst_argb, const float* poly, int width) { - asm volatile("pxor %%xmm3,%%xmm3 \n" + asm("pxor %%xmm3,%%xmm3 \n" - // 2 pixel loop. - LABELALIGN + // 2 pixel loop. 
+ LABELALIGN "1: \n" "movq (%0),%%xmm0 \n" "lea 0x8(%0),%0 \n" @@ -9005,12 +8755,11 @@ void ARGBPolynomialRow_SSE2(const uint8_t* src_argb, "lea 0x8(%1),%1 \n" "sub $0x2,%2 \n" "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(poly) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", - "xmm6"); + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(poly) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } #endif // HAS_ARGBPOLYNOMIALROW_SSE2 @@ -9019,8 +8768,7 @@ void ARGBPolynomialRow_AVX2(const uint8_t* src_argb, uint8_t* dst_argb, const float* poly, int width) { - asm volatile( - "vbroadcastf128 (%3),%%ymm4 \n" + asm("vbroadcastf128 (%3),%%ymm4 \n" "vbroadcastf128 0x10(%3),%%ymm5 \n" "vbroadcastf128 0x20(%3),%%ymm6 \n" "vbroadcastf128 0x30(%3),%%ymm7 \n" @@ -9062,8 +8810,7 @@ void HalfFloatRow_SSE2(const uint16_t* src, float scale, int width) { scale *= kScaleBias; - asm volatile( - "movd %3,%%xmm4 \n" + asm("movd %3,%%xmm4 \n" "pshufd $0x0,%%xmm4,%%xmm4 \n" "pxor %%xmm5,%%xmm5 \n" "sub %0,%1 \n" @@ -9100,8 +8847,7 @@ void HalfFloatRow_AVX2(const uint16_t* src, float scale, int width) { scale *= kScaleBias; - asm volatile( - "vbroadcastss %3, %%ymm4 \n" + asm("vbroadcastss %3, %%ymm4 \n" "vpxor %%ymm5,%%ymm5,%%ymm5 \n" "sub %0,%1 \n" @@ -9141,8 +8887,7 @@ void HalfFloatRow_F16C(const uint16_t* src, uint16_t* dst, float scale, int width) { - asm volatile( - "vbroadcastss %3, %%ymm4 \n" + asm("vbroadcastss %3, %%ymm4 \n" "sub %0,%1 \n" // 16 pixel loop. @@ -9176,8 +8921,7 @@ void HalfFloatRow_F16C(const uint16_t* src, #ifdef HAS_HALFFLOATROW_F16C void HalfFloat1Row_F16C(const uint16_t* src, uint16_t* dst, float, int width) { - asm volatile( - "sub %0,%1 \n" + asm("sub %0,%1 \n" // 16 pixel loop. LABELALIGN "1: \n" @@ -9207,7 +8951,7 @@ void ARGBColorTableRow_X86(uint8_t* dst_argb, const uint8_t* table_argb, int width) { uintptr_t pixel_temp; - asm volatile( + asm( // 1 pixel loop. LABELALIGN "1: \n" @@ -9240,7 +8984,7 @@ void RGBColorTableRow_X86(uint8_t* dst_argb, const uint8_t* table_argb, int width) { uintptr_t pixel_temp; - asm volatile( + asm( // 1 pixel loop. LABELALIGN "1: \n" @@ -9273,8 +9017,7 @@ void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb, uint32_t lumacoeff) { uintptr_t pixel_temp; uintptr_t table_temp; - asm volatile( - "movd %6,%%xmm3 \n" + asm("movd %6,%%xmm3 \n" "pshufd $0x0,%%xmm3,%%xmm3 \n" "pcmpeqb %%xmm4,%%xmm4 \n" "psllw $0x8,%%xmm4 \n" @@ -9377,8 +9120,7 @@ void NV21ToYUV24Row_SSSE3(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_yuv24, int width) { - asm volatile( - "sub %0,%1 \n" + asm("sub %0,%1 \n" "movdqa (%4),%%xmm4 \n" // 3 shuffler constants "movdqa 16(%4),%%xmm5 \n" "movdqa 32(%4),%%xmm6 \n" @@ -9415,8 +9157,7 @@ void NV21ToYUV24Row_AVX2(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_yuv24, int width) { - asm volatile( - "sub %0,%1 \n" + asm("sub %0,%1 \n" "vbroadcastf128 (%4),%%ymm4 \n" // 3 shuffler constants "vbroadcastf128 16(%4),%%ymm5 \n" "vbroadcastf128 32(%4),%%ymm6 \n" @@ -9463,8 +9204,7 @@ void NV21ToYUV24Row_AVX512(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_yuv24, int width) { - asm volatile( - "sub %0,%1 \n" + asm("sub %0,%1 \n" "vmovdqa (%4),%%ymm4 \n" // 3 shuffler constants "vmovdqa 32(%4),%%ymm5 \n" "vmovdqa 64(%4),%%ymm6 \n" LABELALIGN @@ -9502,9 +9242,9 @@ static const uvec8 kShuffleUVToVU = {1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u, // Convert UV plane of NV12 to VU of NV21. 
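The shuffle mask above swaps each UV byte pair in place; a scalar sketch of the same transform (illustration only, name assumed):

#include <stdint.h>

// Scalar form of the UV -> VU byte-pair swap that kShuffleUVToVU applies
// 16 bytes at a time via pshufb; width counts UV pairs.
static void SwapUVRow_Sketch(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
  int i;
  for (i = 0; i < width; ++i) {
    dst_vu[2 * i + 0] = src_uv[2 * i + 1];  // V first in the output
    dst_vu[2 * i + 1] = src_uv[2 * i + 0];  // then U
  }
}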
void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width) { - asm volatile("movdqu %3,%%xmm5 \n" + asm("movdqu %3,%%xmm5 \n" - LABELALIGN + LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -9516,19 +9256,19 @@ void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width) { "lea 0x20(%1),%1 \n" "sub $0x10,%2 \n" "jg 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_vu), // %1 - "+r"(width) // %2 - : "m"(kShuffleUVToVU) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm5"); + : "+r"(src_uv), // %0 + "+r"(dst_vu), // %1 + "+r"(width) // %2 + : "m"(kShuffleUVToVU) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm5"); } #endif // HAS_SWAPUVROW_SSSE3 #ifdef HAS_SWAPUVROW_AVX2 void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width) { - asm volatile("vbroadcastf128 %3,%%ymm5 \n" + asm("vbroadcastf128 %3,%%ymm5 \n" - LABELALIGN + LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" @@ -9541,11 +9281,11 @@ void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width) { "sub $0x20,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_uv), // %0 - "+r"(dst_vu), // %1 - "+r"(width) // %2 - : "m"(kShuffleUVToVU) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm5"); + : "+r"(src_uv), // %0 + "+r"(dst_vu), // %1 + "+r"(width) // %2 + : "m"(kShuffleUVToVU) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm5"); } #endif // HAS_SWAPUVROW_AVX2 @@ -9555,8 +9295,7 @@ void HalfMergeUVRow_SSSE3(const uint8_t* src_u, int src_stride_v, uint8_t* dst_uv, int width) { - asm volatile( - "pcmpeqb %%xmm4,%%xmm4 \n" + asm("pcmpeqb %%xmm4,%%xmm4 \n" "psrlw $0xf,%%xmm4 \n" "packuswb %%xmm4,%%xmm4 \n" "pxor %%xmm5,%%xmm5 \n" @@ -9601,8 +9340,7 @@ void HalfMergeUVRow_AVX2(const uint8_t* src_u, int src_stride_v, uint8_t* dst_uv, int width) { - asm volatile( - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + asm("vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" "vpsrlw $0xf,%%ymm4,%%ymm4 \n" "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" "vpxor %%ymm5,%%ymm5,%%ymm5 \n" @@ -9643,8 +9381,7 @@ void HalfMergeUVRow_AVX2(const uint8_t* src_u, } void ClampFloatToZero_SSE2(const float* src_x, float* dst_y, int width) { - asm volatile( - "pxor %%xmm1,%%xmm1 \n" + asm("pxor %%xmm1,%%xmm1 \n" LABELALIGN "1: \n" diff --git a/source/row_rvv.cc b/source/row_rvv.cc index 39a5c0dbc..3b04ec5f7 100644 --- a/source/row_rvv.cc +++ b/source/row_rvv.cc @@ -497,7 +497,6 @@ void ARGBToRAWRow_RVV(const uint8_t* src_argb, uint8_t* dst_raw, int width) { vuint8m2_t v_b = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 0); vuint8m2_t v_g = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 1); vuint8m2_t v_r = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 2); - vuint8m2_t v_a = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 3); vuint8m2x3_t v_dst_bgr = __riscv_vcreate_v_u8m2x3(v_r, v_g, v_b); __riscv_vsseg3e8_v_u8m2x3(dst_raw, v_dst_bgr, vl); w -= vl; @@ -2101,7 +2100,6 @@ void ARGBToYMatrixRow_RVV(const uint8_t* src_argb, vuint8m2_t v_b = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 0); vuint8m2_t v_g = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 1); vuint8m2_t v_r = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 2); - vuint8m2_t v_a = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 3); v_y_u16 = __riscv_vwmulu_vv_u16m4(v_r, v_ry, vl); v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_gy, v_g, vl); v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_by, v_b, vl); @@ -2191,7 +2189,6 @@ void RGBAToYMatrixRow_RVV(const uint8_t* src_rgba, vuint16m4_t v_y_u16; size_t vl = __riscv_vsetvl_e8m2(w); vuint8m2x4_t v_src_rgba = __riscv_vlseg4e8_v_u8m2x4(src_rgba, vl); - vuint8m2_t v_a = 
__riscv_vget_v_u8m2x4_u8m2(v_src_rgba, 0); vuint8m2_t v_b = __riscv_vget_v_u8m2x4_u8m2(v_src_rgba, 1); vuint8m2_t v_g = __riscv_vget_v_u8m2x4_u8m2(v_src_rgba, 2); vuint8m2_t v_r = __riscv_vget_v_u8m2x4_u8m2(v_src_rgba, 3); diff --git a/source/scale_gcc.cc b/source/scale_gcc.cc index 17eeffadf..304770d0c 100644 --- a/source/scale_gcc.cc +++ b/source/scale_gcc.cc @@ -97,7 +97,7 @@ void ScaleRowDown2_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm volatile( + asm( // 16 pixel loop. LABELALIGN "1: \n" @@ -114,8 +114,8 @@ void ScaleRowDown2_SSSE3(const uint8_t* src_ptr, : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 - ::"memory", - "cc", "xmm0", "xmm1"); + : + : "memory", "cc", "xmm0", "xmm1"); } void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr, @@ -123,8 +123,7 @@ void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm volatile( - "pcmpeqb %%xmm4,%%xmm4 \n" + asm("pcmpeqb %%xmm4,%%xmm4 \n" "psrlw $0xf,%%xmm4 \n" "packuswb %%xmm4,%%xmm4 \n" "pxor %%xmm5,%%xmm5 \n" @@ -146,16 +145,15 @@ void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr, : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 - ::"memory", - "cc", "xmm0", "xmm1", "xmm4", "xmm5"); + : + : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"); } void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { - asm volatile( - "pcmpeqb %%xmm4,%%xmm4 \n" + asm("pcmpeqb %%xmm4,%%xmm4 \n" "psrlw $0xf,%%xmm4 \n" "packuswb %%xmm4,%%xmm4 \n" "pxor %%xmm5,%%xmm5 \n" @@ -195,7 +193,7 @@ void ScaleRowDown2_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm volatile(LABELALIGN + asm(LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" @@ -209,11 +207,11 @@ void ScaleRowDown2_AVX2(const uint8_t* src_ptr, "sub $0x20,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - ::"memory", - "cc", "xmm0", "xmm1"); + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1"); } void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr, @@ -221,8 +219,7 @@ void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm volatile( - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + asm("vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" "vpsrlw $0xf,%%ymm4,%%ymm4 \n" "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" "vpxor %%ymm5,%%ymm5,%%ymm5 \n" @@ -246,16 +243,15 @@ void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr, : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 - ::"memory", - "cc", "xmm0", "xmm1", "xmm4", "xmm5"); + : + : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"); } void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { - asm volatile( - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + asm("vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" "vpsrlw $0xf,%%ymm4,%%ymm4 \n" "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" "vpxor %%ymm5,%%ymm5,%%ymm5 \n" @@ -297,8 +293,7 @@ void ScaleRowDown4_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" + asm("pcmpeqb %%xmm5,%%xmm5 \n" "psrld $0x18,%%xmm5 \n" "pslld $0x10,%%xmm5 \n" @@ -319,8 +314,8 @@ void ScaleRowDown4_SSSE3(const uint8_t* src_ptr, : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 - ::"memory", - "cc", "xmm0", "xmm1", "xmm5"); + : + : 
"memory", "cc", "xmm0", "xmm1", "xmm5"); } void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr, @@ -328,8 +323,7 @@ void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { intptr_t stridex3; - asm volatile( - "pcmpeqb %%xmm4,%%xmm4 \n" + asm("pcmpeqb %%xmm4,%%xmm4 \n" "psrlw $0xf,%%xmm4 \n" "movdqa %%xmm4,%%xmm5 \n" "packuswb %%xmm4,%%xmm4 \n" @@ -383,8 +377,7 @@ void ScaleRowDown4_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm volatile( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + asm("vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpsrld $0x18,%%ymm5,%%ymm5 \n" "vpslld $0x10,%%ymm5,%%ymm5 \n" @@ -408,16 +401,15 @@ void ScaleRowDown4_AVX2(const uint8_t* src_ptr, : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 - ::"memory", - "cc", "xmm0", "xmm1", "xmm5"); + : + : "memory", "cc", "xmm0", "xmm1", "xmm5"); } void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { - asm volatile( - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + asm("vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" "vpsrlw $0xf,%%ymm4,%%ymm4 \n" "vpsllw $0x3,%%ymm4,%%ymm5 \n" "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" @@ -472,8 +464,7 @@ void ScaleRowDown34_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm volatile( - "movdqa %0,%%xmm3 \n" + asm("movdqa %0,%%xmm3 \n" "movdqa %1,%%xmm4 \n" "movdqa %2,%%xmm5 \n" : @@ -481,7 +472,7 @@ void ScaleRowDown34_SSSE3(const uint8_t* src_ptr, "m"(kShuf1), // %1 "m"(kShuf2) // %2 ); - asm volatile(LABELALIGN + asm(LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm2 \n" @@ -497,19 +488,18 @@ void ScaleRowDown34_SSSE3(const uint8_t* src_ptr, "lea 0x18(%1),%1 \n" "sub $0x18,%2 \n" "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - ::"memory", - "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { - asm volatile( - "movdqa %0,%%xmm2 \n" // kShuf01 + asm("movdqa %0,%%xmm2 \n" // kShuf01 "movdqa %1,%%xmm3 \n" // kShuf11 "movdqa %2,%%xmm4 \n" // kShuf21 : @@ -517,8 +507,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr, "m"(kShuf11), // %1 "m"(kShuf21) // %2 ); - asm volatile( - "movdqa %0,%%xmm5 \n" // kMadd01 + asm("movdqa %0,%%xmm5 \n" // kMadd01 "movdqa %1,%%xmm0 \n" // kMadd11 "movdqa %2,%%xmm1 \n" // kRound34 : @@ -526,7 +515,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr, "m"(kMadd11), // %1 "m"(kRound34) // %2 ); - asm volatile(LABELALIGN + asm(LABELALIGN "1: \n" "movdqu (%0),%%xmm6 \n" "movdqu 0x00(%0,%3,1),%%xmm7 \n" @@ -559,21 +548,20 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr, "lea 0x18(%1),%1 \n" "sub $0x18,%2 \n" "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)), // %3 - "m"(kMadd21) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", - "xmm6", "xmm7"); + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "m"(kMadd21) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { - asm volatile( - "movdqa %0,%%xmm2 \n" // 
kShuf01 + asm("movdqa %0,%%xmm2 \n" // kShuf01 "movdqa %1,%%xmm3 \n" // kShuf11 "movdqa %2,%%xmm4 \n" // kShuf21 : @@ -581,8 +569,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr, "m"(kShuf11), // %1 "m"(kShuf21) // %2 ); - asm volatile( - "movdqa %0,%%xmm5 \n" // kMadd01 + asm("movdqa %0,%%xmm5 \n" // kMadd01 "movdqa %1,%%xmm0 \n" // kMadd11 "movdqa %2,%%xmm1 \n" // kRound34 : @@ -591,7 +578,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr, "m"(kRound34) // %2 ); - asm volatile(LABELALIGN + asm(LABELALIGN "1: \n" "movdqu (%0),%%xmm6 \n" "movdqu 0x00(%0,%3,1),%%xmm7 \n" @@ -627,13 +614,13 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr, "lea 0x18(%1),%1 \n" "sub $0x18,%2 \n" "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)), // %3 - "m"(kMadd21) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", - "xmm6", "xmm7"); + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "m"(kMadd21) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } void ScaleRowDown38_SSSE3(const uint8_t* src_ptr, @@ -641,8 +628,7 @@ void ScaleRowDown38_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm volatile( - "movdqa %3,%%xmm4 \n" + asm("movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" LABELALIGN @@ -671,8 +657,7 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { - asm volatile( - "movdqa %0,%%xmm2 \n" + asm("movdqa %0,%%xmm2 \n" "movdqa %1,%%xmm3 \n" "movdqa %2,%%xmm4 \n" "movdqa %3,%%xmm5 \n" @@ -682,7 +667,7 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr, "m"(kShufAb2), // %2 "m"(kScaleAb2) // %3 ); - asm volatile(LABELALIGN + asm(LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x00(%0,%3,1),%%xmm1 \n" @@ -703,20 +688,18 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr, "lea 0x6(%1),%1 \n" "sub $0x6,%2 \n" "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", - "xmm6"); + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { - asm volatile( - "movdqa %0,%%xmm2 \n" + asm("movdqa %0,%%xmm2 \n" "movdqa %1,%%xmm3 \n" "movdqa %2,%%xmm4 \n" "pxor %%xmm5,%%xmm5 \n" @@ -725,7 +708,7 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, "m"(kShufAc3), // %1 "m"(kScaleAc33) // %2 ); - asm volatile(LABELALIGN + asm(LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x00(%0,%3,1),%%xmm6 \n" @@ -765,12 +748,12 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, "lea 0x6(%1),%1 \n" "sub $0x6,%2 \n" "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", - "xmm6", "xmm7"); + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } static const uvec8 kLinearShuffleFar = {2, 3, 0, 1, 6, 7, 4, 5, @@ -783,8 +766,7 @@ static const uvec8 
kLinearMadd31 = {3, 1, 1, 3, 3, 1, 1, 3, void ScaleRowUp2_Linear_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { - asm volatile( - "pxor %%xmm0,%%xmm0 \n" // 0 + asm("pxor %%xmm0,%%xmm0 \n" // 0 "pcmpeqw %%xmm6,%%xmm6 \n" "psrlw $15,%%xmm6 \n" "psllw $1,%%xmm6 \n" // all 2 @@ -839,8 +821,7 @@ void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { - asm volatile( - LABELALIGN + asm(LABELALIGN "1: \n" "pxor %%xmm0,%%xmm0 \n" // 0 // above line @@ -953,8 +934,7 @@ void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr, void ScaleRowUp2_Linear_12_SSSE3(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { - asm volatile( - "movdqa %3,%%xmm5 \n" + asm("movdqa %3,%%xmm5 \n" "pcmpeqw %%xmm4,%%xmm4 \n" "psrlw $15,%%xmm4 \n" "psllw $1,%%xmm4 \n" // all 2 @@ -1005,8 +985,7 @@ void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t* src_ptr, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { - asm volatile( - "pcmpeqw %%xmm7,%%xmm7 \n" + asm("pcmpeqw %%xmm7,%%xmm7 \n" "psrlw $15,%%xmm7 \n" "psllw $3,%%xmm7 \n" // all 8 "movdqa %5,%%xmm6 \n" @@ -1103,8 +1082,7 @@ void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t* src_ptr, void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { - asm volatile( - "pxor %%xmm5,%%xmm5 \n" + asm("pxor %%xmm5,%%xmm5 \n" "pcmpeqd %%xmm4,%%xmm4 \n" "psrld $31,%%xmm4 \n" "pslld $1,%%xmm4 \n" // all 2 @@ -1156,8 +1134,7 @@ void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { - asm volatile( - "pxor %%xmm7,%%xmm7 \n" + asm("pxor %%xmm7,%%xmm7 \n" "pcmpeqd %%xmm6,%%xmm6 \n" "psrld $31,%%xmm6 \n" "pslld $3,%%xmm6 \n" // all 8 @@ -1264,8 +1241,7 @@ void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr, void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { - asm volatile( - "pcmpeqw %%xmm4,%%xmm4 \n" + asm("pcmpeqw %%xmm4,%%xmm4 \n" "psrlw $15,%%xmm4 \n" "psllw $1,%%xmm4 \n" // all 2 "movdqa %3,%%xmm3 \n" @@ -1305,8 +1281,7 @@ void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { - asm volatile( - "pcmpeqw %%xmm6,%%xmm6 \n" + asm("pcmpeqw %%xmm6,%%xmm6 \n" "psrlw $15,%%xmm6 \n" "psllw $3,%%xmm6 \n" // all 8 "movdqa %5,%%xmm7 \n" @@ -1390,8 +1365,7 @@ void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr, void ScaleRowUp2_Linear_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { - asm volatile( - "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" + asm("vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" "vpsrlw $15,%%ymm4,%%ymm4 \n" "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2 "vbroadcastf128 %3,%%ymm3 \n" @@ -1434,8 +1408,7 @@ void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { - asm volatile( - "vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n" + asm("vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n" "vpsrlw $15,%%ymm6,%%ymm6 \n" "vpsllw $3,%%ymm6,%%ymm6 \n" // all 8 "vbroadcastf128 %5,%%ymm7 \n" @@ -1516,8 +1489,7 @@ void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr, void ScaleRowUp2_Linear_12_AVX2(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { - asm volatile( - "vbroadcastf128 %3,%%ymm5 \n" + asm("vbroadcastf128 %3,%%ymm5 \n" "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" "vpsrlw $15,%%ymm4,%%ymm4 \n" "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2 @@ -1568,8 +1540,7 @@ void ScaleRowUp2_Bilinear_12_AVX2(const uint16_t* src_ptr, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { - asm volatile( - 
"vbroadcastf128 %5,%%ymm5 \n" + asm("vbroadcastf128 %5,%%ymm5 \n" "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" "vpsrlw $15,%%ymm4,%%ymm4 \n" "vpsllw $3,%%ymm4,%%ymm4 \n" // all 8 @@ -1630,8 +1601,7 @@ void ScaleRowUp2_Bilinear_12_AVX2(const uint16_t* src_ptr, void ScaleRowUp2_Linear_16_AVX2(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { - asm volatile( - "vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n" + asm("vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n" "vpsrld $31,%%ymm4,%%ymm4 \n" "vpslld $1,%%ymm4,%%ymm4 \n" // all 2 @@ -1680,8 +1650,7 @@ void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { - asm volatile( - "vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n" + asm("vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n" "vpsrld $31,%%ymm6,%%ymm6 \n" "vpslld $3,%%ymm6,%%ymm6 \n" // all 8 @@ -1763,10 +1732,10 @@ void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, void ScaleAddRow_SSE2(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { - asm volatile("pxor %%xmm5,%%xmm5 \n" + asm("pxor %%xmm5,%%xmm5 \n" - // 16 pixel loop. - LABELALIGN + // 16 pixel loop. + LABELALIGN "1: \n" "movdqu (%0),%%xmm3 \n" "lea 0x10(%0),%0 \n" // src_ptr += 16 @@ -1782,11 +1751,11 @@ void ScaleAddRow_SSE2(const uint8_t* src_ptr, "lea 0x20(%1),%1 \n" "sub $0x10,%2 \n" "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(src_width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(src_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #ifdef HAS_SCALEADDROW_AVX2 @@ -1794,9 +1763,9 @@ void ScaleAddRow_SSE2(const uint8_t* src_ptr, void ScaleAddRow_AVX2(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { - asm volatile("vpxor %%ymm5,%%ymm5,%%ymm5 \n" + asm("vpxor %%ymm5,%%ymm5,%%ymm5 \n" - LABELALIGN + LABELALIGN "1: \n" "vmovdqu (%0),%%ymm3 \n" "lea 0x20(%0),%0 \n" // src_ptr += 32 @@ -1811,11 +1780,11 @@ void ScaleAddRow_AVX2(const uint8_t* src_ptr, "sub $0x20,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(src_width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(src_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #endif // HAS_SCALEADDROW_AVX2 @@ -1835,8 +1804,7 @@ void ScaleFilterCols_SSSE3(uint8_t* dst_ptr, int x, int dx) { intptr_t x0, x1, temp_pixel; - asm volatile( - "movd %6,%%xmm2 \n" + asm("movd %6,%%xmm2 \n" "movd %7,%%xmm3 \n" "movl $0x04040000,%k2 \n" "movd %k2,%%xmm5 \n" @@ -1932,7 +1900,7 @@ void ScaleColsUp2_SSE2(uint8_t* dst_ptr, int dx) { (void)x; (void)dx; - asm volatile(LABELALIGN + asm(LABELALIGN "1: \n" "movdqu (%1),%%xmm0 \n" "lea 0x10(%1),%1 \n" @@ -1945,11 +1913,11 @@ void ScaleColsUp2_SSE2(uint8_t* dst_ptr, "sub $0x20,%2 \n" "jg 1b \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(dst_width) // %2 - ::"memory", - "cc", "xmm0", "xmm1"); + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1"); } void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb, @@ -1957,7 +1925,7 @@ void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb, uint8_t* dst_argb, int dst_width) { (void)src_stride; - asm volatile(LABELALIGN + asm(LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -1967,11 +1935,11 @@ void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb, "lea 0x10(%1),%1 \n" "sub $0x4,%2 \n" "jg 1b \n" - : "+r"(src_argb), // %0 - 
"+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - ::"memory", - "cc", "xmm0", "xmm1"); + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1"); } void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb, @@ -1979,7 +1947,7 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb, uint8_t* dst_argb, int dst_width) { (void)src_stride; - asm volatile(LABELALIGN + asm(LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -1992,18 +1960,18 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb, "lea 0x10(%1),%1 \n" "sub $0x4,%2 \n" "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - ::"memory", - "cc", "xmm0", "xmm1"); + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1"); } void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb, ptrdiff_t src_stride, uint8_t* dst_argb, int dst_width) { - asm volatile(LABELALIGN + asm(LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -2020,11 +1988,11 @@ void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb, "lea 0x10(%1),%1 \n" "sub $0x4,%2 \n" "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); } // Reads 4 pixels at a time. @@ -2037,8 +2005,7 @@ void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb, intptr_t src_stepx_x4 = (intptr_t)(src_stepx); intptr_t src_stepx_x12; (void)src_stride; - asm volatile( - "lea 0x00(,%1,4),%1 \n" + asm("lea 0x00(,%1,4),%1 \n" "lea 0x00(%1,%1,2),%4 \n" LABELALIGN @@ -2060,8 +2027,8 @@ void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb, "+r"(dst_argb), // %2 "+r"(dst_width), // %3 "=&r"(src_stepx_x12) // %4 - ::"memory", - "cc", "xmm0", "xmm1", "xmm2", "xmm3"); + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); } // Blends four 2x2 to 4x1. 
@@ -2074,8 +2041,7 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb, intptr_t src_stepx_x4 = (intptr_t)(src_stepx); intptr_t src_stepx_x12; intptr_t row1 = (intptr_t)(src_stride); - asm volatile( - "lea 0x00(,%1,4),%1 \n" + asm("lea 0x00(,%1,4),%1 \n" "lea 0x00(%1,%1,2),%4 \n" "lea 0x00(%0,%5,1),%5 \n" @@ -2107,8 +2073,8 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb, "+rm"(dst_width), // %3 "=&r"(src_stepx_x12), // %4 "+r"(row1) // %5 - ::"memory", - "cc", "xmm0", "xmm1", "xmm2", "xmm3"); + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); } void ScaleARGBCols_SSE2(uint8_t* dst_argb, @@ -2117,8 +2083,7 @@ void ScaleARGBCols_SSE2(uint8_t* dst_argb, int x, int dx) { intptr_t x0, x1; - asm volatile( - "movd %5,%%xmm2 \n" + asm("movd %5,%%xmm2 \n" "movd %6,%%xmm3 \n" "pshufd $0x0,%%xmm2,%%xmm2 \n" "pshufd $0x11,%%xmm3,%%xmm0 \n" @@ -2188,7 +2153,7 @@ void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb, int dx) { (void)x; (void)dx; - asm volatile(LABELALIGN + asm(LABELALIGN "1: \n" "movdqu (%1),%%xmm0 \n" "lea 0x10(%1),%1 \n" @@ -2201,11 +2166,11 @@ void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb, "sub $0x8,%2 \n" "jg 1b \n" - : "+r"(dst_argb), // %0 - "+r"(src_argb), // %1 - "+r"(dst_width) // %2 - ::"memory", - "cc", "xmm0", "xmm1"); + : "+r"(dst_argb), // %0 + "+r"(src_argb), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1"); } // Shuffle table for arranging 2 pixels into pairs for pmaddubsw @@ -2226,16 +2191,14 @@ void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb, int x, int dx) { intptr_t x0, x1; - asm volatile( - "movdqa %0,%%xmm4 \n" + asm("movdqa %0,%%xmm4 \n" "movdqa %1,%%xmm5 \n" : : "m"(kShuffleColARGB), // %0 "m"(kShuffleFractions) // %1 ); - asm volatile( - "movd %5,%%xmm2 \n" + asm("movd %5,%%xmm2 \n" "movd %6,%%xmm3 \n" "pcmpeqb %%xmm6,%%xmm6 \n" "psrlw $0x9,%%xmm6 \n" @@ -2283,8 +2246,7 @@ void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb, "packuswb %%xmm0,%%xmm0 \n" "movd %%xmm0,(%0) \n" - LABELALIGN - "99: \n" // clang-format error. + LABELALIGN "99: \n" : "+r"(dst_argb), // %0 "+r"(src_argb), // %1 @@ -2298,8 +2260,7 @@ void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb, // Divide num by div and return as 16.16 fixed point result. int FixedDiv_X86(int num, int div) { - asm volatile( - "cdq \n" + asm("cdq \n" "shld $0x10,%%eax,%%edx \n" "shl $0x10,%%eax \n" "idiv %1 \n" @@ -2312,8 +2273,7 @@ int FixedDiv_X86(int num, int div) { // Divide num - 1 by div - 1 and return as 16.16 fixed point result. 
int FixedDiv1_X86(int num, int div) { - asm volatile( - "cdq \n" + asm("cdq \n" "shld $0x10,%%eax,%%edx \n" "shl $0x10,%%eax \n" "sub $0x10001,%%eax \n" @@ -2344,8 +2304,7 @@ void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { - asm volatile( - "pcmpeqb %%xmm4,%%xmm4 \n" // 01010101 + asm("pcmpeqb %%xmm4,%%xmm4 \n" // 01010101 "psrlw $0xf,%%xmm4 \n" "packuswb %%xmm4,%%xmm4 \n" "pxor %%xmm5, %%xmm5 \n" // zero @@ -2384,8 +2343,7 @@ void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { - asm volatile( - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 01010101 + asm("vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 01010101 "vpsrlw $0xf,%%ymm4,%%ymm4 \n" "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" "vpxor %%ymm5,%%ymm5,%%ymm5 \n" // zero @@ -2428,8 +2386,7 @@ static const uvec8 kUVLinearMadd31 = {3, 1, 3, 1, 1, 3, 1, 3, void ScaleUVRowUp2_Linear_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { - asm volatile( - "pcmpeqw %%xmm4,%%xmm4 \n" + asm("pcmpeqw %%xmm4,%%xmm4 \n" "psrlw $15,%%xmm4 \n" "psllw $1,%%xmm4 \n" // all 2 "movdqa %3,%%xmm3 \n" @@ -2469,8 +2426,7 @@ void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { - asm volatile( - "pcmpeqw %%xmm6,%%xmm6 \n" + asm("pcmpeqw %%xmm6,%%xmm6 \n" "psrlw $15,%%xmm6 \n" "psllw $3,%%xmm6 \n" // all 8 "movdqa %5,%%xmm7 \n" @@ -2553,8 +2509,7 @@ void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr, void ScaleUVRowUp2_Linear_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { - asm volatile( - "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" + asm("vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" "vpsrlw $15,%%ymm4,%%ymm4 \n" "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2 "vbroadcastf128 %3,%%ymm3 \n" @@ -2596,8 +2551,7 @@ void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { - asm volatile( - "vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n" + asm("vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n" "vpsrlw $15,%%ymm6,%%ymm6 \n" "vpsllw $3,%%ymm6,%%ymm6 \n" // all 8 "vbroadcastf128 %5,%%ymm7 \n" @@ -2676,8 +2630,7 @@ void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr, void ScaleUVRowUp2_Linear_16_SSE41(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { - asm volatile( - "pxor %%xmm5,%%xmm5 \n" + asm("pxor %%xmm5,%%xmm5 \n" "pcmpeqd %%xmm4,%%xmm4 \n" "psrld $31,%%xmm4 \n" "pslld $1,%%xmm4 \n" // all 2 @@ -2728,8 +2681,7 @@ void ScaleUVRowUp2_Bilinear_16_SSE41(const uint16_t* src_ptr, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { - asm volatile( - "pxor %%xmm7,%%xmm7 \n" + asm("pxor %%xmm7,%%xmm7 \n" "pcmpeqd %%xmm6,%%xmm6 \n" "psrld $31,%%xmm6 \n" "pslld $3,%%xmm6 \n" // all 8 @@ -2819,8 +2771,7 @@ void ScaleUVRowUp2_Bilinear_16_SSE41(const uint16_t* src_ptr, void ScaleUVRowUp2_Linear_16_AVX2(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { - asm volatile( - "vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n" + asm("vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n" "vpsrld $31,%%ymm4,%%ymm4 \n" "vpslld $1,%%ymm4,%%ymm4 \n" // all 2 @@ -2868,8 +2819,7 @@ void ScaleUVRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { - asm volatile( - "vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n" + asm("vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n" "vpsrld $31,%%ymm6,%%ymm6 \n" "vpslld $3,%%ymm6,%%ymm6 \n" // all 8 diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc index ec1d72eb8..9973318f5 100644 --- a/unit_test/planar_test.cc +++ 
b/unit_test/planar_test.cc
@@ -2551,11 +2551,12 @@ int TestHalfFloatPlane(int benchmark_width,
 #if defined(__arm__)
 static void EnableFlushDenormalToZero(void) {
   uint32_t cw;
-  __asm__ __volatile__(
-      "vmrs %0, fpscr \n"
-      "orr %0, %0, #0x1000000 \n"
-      "vmsr fpscr, %0 \n"
-      : "=r"(cw)::"memory");
+  asm volatile (
+      "vmrs %0, fpscr \n"
+      "orr %0, %0, #0x1000000 \n"
+      "vmsr fpscr, %0 \n"
+      : "=r"(cw)
+      ::"memory", "cc");  // Clobber List
 }
 #endif
 
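For context, the x86 changes in this patch rely on two GCC inline-asm rules: a plain asm statement whose only effects are its output operands may be deleted, hoisted, or reordered by the compiler when those outputs are unused, whereas asm volatile must always be emitted; and an asm that reads or writes memory not named as an operand still needs a "memory" clobber so surrounding loads and stores are not cached across it. A minimal sketch under those assumptions; the helper names below are hypothetical and are not libyuv code:

// Hypothetical illustration only, not part of the patch.
#if defined(__x86_64__) || defined(__i386__)
#include <stdint.h>

// Plain asm with an output operand: if 'sum' is never used, GCC may drop the
// statement entirely, and it may be moved or combined with identical copies.
// That is the behavior the row/scale functions above opt into by dropping
// 'volatile'.
static inline uint32_t add_u32(uint32_t a, uint32_t b) {
  uint32_t sum = a;
  asm("add %1,%0" : "+r"(sum) : "r"(b) : "cc");
  return sum;
}

// This asm reads memory through a pointer passed only as a register operand,
// so a "memory" clobber is still required to keep the compiler from caching
// or reordering those reads around the statement.
static inline uint32_t load_first_byte(const uint8_t* p) {
  uint32_t v;
  asm("movzbl (%1),%0" : "=r"(v) : "r"(p) : "memory");
  return v;
}
#endif

By contrast, an asm statement with no output operands, or one marked volatile such as the fpscr write in the ARM test fixup above, is treated as having side effects and is never removed.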