diff --git a/source/compare_neon64.cc b/source/compare_neon64.cc
index 756f83cb3..36c5e575c 100644
--- a/source/compare_neon64.cc
+++ b/source/compare_neon64.cc
@@ -116,7 +116,7 @@ uint32_t HashDjb2_NEON(const uint8_t* src, int count, uint32_t seed) {
   uint32_t hash = seed;
   const uint32_t c16 = 0x92d9e201;  // 33^16
   uint32_t tmp, tmp2;
-  asm("ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%[kIdx]] \n"
+  asm("ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%[kIdx]] \n"
       "ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%[kMuls]] \n"
       // count is always a multiple of 16.
diff --git a/source/row_common.cc b/source/row_common.cc
index 36561e0b7..057c3bb9f 100644
--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -653,7 +653,6 @@ static __inline uint8_t RGBToV(uint8_t r, uint8_t g, uint8_t b) {
 }
 #endif
-
 #define AVGB(a, b) (((a) + (b) + 1) >> 1)
 
 // ARM uses uint16.  TODO: Make ARM use uint8 to allow dotproduct.
 #if !defined(LIBYUV_ARGBTOUV_PAVGB)
@@ -787,7 +786,6 @@ MAKEROWY(RAW, 0, 1, 2, 3)
 //  g -0.41869 * 256 = -107.18464 = -107
 //  r 0.50000 * 256 = 128.0 = 128
-
 #ifdef LIBYUV_RGB7
 // Old 7 bit math for compatibility on unsupported platforms.
 static __inline uint8_t RGBToYJ(uint8_t r, uint8_t g, uint8_t b) {
diff --git a/source/row_gcc.cc b/source/row_gcc.cc
index ce8af5839..5c6431aa6 100644
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -508,35 +508,35 @@ void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
 }
 
 void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
-  asm volatile("movdqa %3,%%xmm6 \n"
+  asm volatile("movdqa %3,%%xmm6 \n"
 
                LABELALIGN
-      "1: \n"
-      "movdqu (%0),%%xmm0 \n"
-      "movdqu 0x10(%0),%%xmm1 \n"
-      "movdqu 0x20(%0),%%xmm2 \n"
-      "movdqu 0x30(%0),%%xmm3 \n"
-      "lea 0x40(%0),%0 \n"
-      "pshufb %%xmm6,%%xmm0 \n"
-      "pshufb %%xmm6,%%xmm1 \n"
-      "pshufb %%xmm6,%%xmm2 \n"
-      "pshufb %%xmm6,%%xmm3 \n"
-      "movdqa %%xmm1,%%xmm4 \n"
-      "psrldq $0x4,%%xmm1 \n"
-      "pslldq $0xc,%%xmm4 \n"
-      "movdqa %%xmm2,%%xmm5 \n"
-      "por %%xmm4,%%xmm0 \n"
-      "pslldq $0x8,%%xmm5 \n"
-      "movdqu %%xmm0,(%1) \n"
-      "por %%xmm5,%%xmm1 \n"
-      "psrldq $0x8,%%xmm2 \n"
-      "pslldq $0x4,%%xmm3 \n"
-      "por %%xmm3,%%xmm2 \n"
-      "movdqu %%xmm1,0x10(%1) \n"
-      "movdqu %%xmm2,0x20(%1) \n"
-      "lea 0x30(%1),%1 \n"
-      "sub $0x10,%2 \n"
-      "jg 1b \n"
+               "1: \n"
+               "movdqu (%0),%%xmm0 \n"
+               "movdqu 0x10(%0),%%xmm1 \n"
+               "movdqu 0x20(%0),%%xmm2 \n"
+               "movdqu 0x30(%0),%%xmm3 \n"
+               "lea 0x40(%0),%0 \n"
+               "pshufb %%xmm6,%%xmm0 \n"
+               "pshufb %%xmm6,%%xmm1 \n"
+               "pshufb %%xmm6,%%xmm2 \n"
+               "pshufb %%xmm6,%%xmm3 \n"
+               "movdqa %%xmm1,%%xmm4 \n"
+               "psrldq $0x4,%%xmm1 \n"
+               "pslldq $0xc,%%xmm4 \n"
+               "movdqa %%xmm2,%%xmm5 \n"
+               "por %%xmm4,%%xmm0 \n"
+               "pslldq $0x8,%%xmm5 \n"
+               "movdqu %%xmm0,(%1) \n"
+               "por %%xmm5,%%xmm1 \n"
+               "psrldq $0x8,%%xmm2 \n"
+               "pslldq $0x4,%%xmm3 \n"
+               "por %%xmm3,%%xmm2 \n"
+               "movdqu %%xmm1,0x10(%1) \n"
+               "movdqu %%xmm2,0x20(%1) \n"
+               "lea 0x30(%1),%1 \n"
+               "sub $0x10,%2 \n"
+               "jg 1b \n"
                : "+r"(src),   // %0
                  "+r"(dst),   // %1
                  "+r"(width)  // %2
@@ -546,35 +546,35 @@ void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
 }
 
 void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
-  asm volatile("movdqa %3,%%xmm6 \n"
+  asm volatile("movdqa %3,%%xmm6 \n"
 
                LABELALIGN
-      "1: \n"
-      "movdqu (%0),%%xmm0 \n"
-      "movdqu 0x10(%0),%%xmm1 \n"
-      "movdqu 0x20(%0),%%xmm2 \n"
-      "movdqu 0x30(%0),%%xmm3 \n"
-      "lea 0x40(%0),%0 \n"
-      "pshufb %%xmm6,%%xmm0 \n"
-      "pshufb %%xmm6,%%xmm1 \n"
-      "pshufb %%xmm6,%%xmm2 \n"
-      "pshufb %%xmm6,%%xmm3 \n"
-      "movdqa %%xmm1,%%xmm4 \n"
-      "psrldq $0x4,%%xmm1 \n"
-      "pslldq $0xc,%%xmm4 \n"
-      "movdqa %%xmm2,%%xmm5 \n"
-      "por %%xmm4,%%xmm0 \n"
-      "pslldq $0x8,%%xmm5 \n"
-      "movdqu %%xmm0,(%1) \n"
-      "por %%xmm5,%%xmm1 \n"
-      "psrldq $0x8,%%xmm2 \n"
-      "pslldq $0x4,%%xmm3 \n"
-      "por %%xmm3,%%xmm2 \n"
-      "movdqu %%xmm1,0x10(%1) \n"
-      "movdqu %%xmm2,0x20(%1) \n"
-      "lea 0x30(%1),%1 \n"
-      "sub $0x10,%2 \n"
-      "jg 1b \n"
+               "1: \n"
+               "movdqu (%0),%%xmm0 \n"
+               "movdqu 0x10(%0),%%xmm1 \n"
+               "movdqu 0x20(%0),%%xmm2 \n"
+               "movdqu 0x30(%0),%%xmm3 \n"
+               "lea 0x40(%0),%0 \n"
+               "pshufb %%xmm6,%%xmm0 \n"
+               "pshufb %%xmm6,%%xmm1 \n"
+               "pshufb %%xmm6,%%xmm2 \n"
+               "pshufb %%xmm6,%%xmm3 \n"
+               "movdqa %%xmm1,%%xmm4 \n"
+               "psrldq $0x4,%%xmm1 \n"
+               "pslldq $0xc,%%xmm4 \n"
+               "movdqa %%xmm2,%%xmm5 \n"
+               "por %%xmm4,%%xmm0 \n"
+               "pslldq $0x8,%%xmm5 \n"
+               "movdqu %%xmm0,(%1) \n"
+               "por %%xmm5,%%xmm1 \n"
+               "psrldq $0x8,%%xmm2 \n"
+               "pslldq $0x4,%%xmm3 \n"
+               "por %%xmm3,%%xmm2 \n"
+               "movdqu %%xmm1,0x10(%1) \n"
+               "movdqu %%xmm2,0x20(%1) \n"
+               "lea 0x30(%1),%1 \n"
+               "sub $0x10,%2 \n"
+               "jg 1b \n"
                : "+r"(src),   // %0
                  "+r"(dst),   // %1
                  "+r"(width)  // %2
@@ -1192,21 +1192,21 @@ void AR64ToARGBRow_SSSE3(const uint16_t* src_ar64,
 void AB64ToARGBRow_SSSE3(const uint16_t* src_ab64,
                          uint8_t* dst_argb,
                          int width) {
-  asm volatile("movdqa %3,%%xmm2 \n"
+  asm volatile("movdqa %3,%%xmm2 \n"
 
                LABELALIGN
-      "1: \n"
-      "movdqu (%0),%%xmm0 \n"
-      "movdqu 0x10(%0),%%xmm1 \n"
-      "psrlw $8,%%xmm0 \n"
-      "psrlw $8,%%xmm1 \n"
-      "packuswb %%xmm1,%%xmm0 \n"
-      "pshufb %%xmm2,%%xmm0 \n"
-      "movdqu %%xmm0,(%1) \n"
-      "lea 0x20(%0),%0 \n"
-      "lea 0x10(%1),%1 \n"
-      "sub $0x4,%2 \n"
-      "jg 1b \n"
+               "1: \n"
+               "movdqu (%0),%%xmm0 \n"
+               "movdqu 0x10(%0),%%xmm1 \n"
+               "psrlw $8,%%xmm0 \n"
+               "psrlw $8,%%xmm1 \n"
+               "packuswb %%xmm1,%%xmm0 \n"
+               "pshufb %%xmm2,%%xmm0 \n"
+               "movdqu %%xmm0,(%1) \n"
+               "lea 0x20(%0),%0 \n"
+               "lea 0x10(%1),%1 \n"
+               "sub $0x4,%2 \n"
+               "jg 1b \n"
                : "+r"(src_ab64),  // %0
                  "+r"(dst_argb),  // %1
                  "+r"(width)      // %2
@@ -1297,21 +1297,21 @@ void AR64ToARGBRow_AVX2(const uint16_t* src_ar64,
 void AB64ToARGBRow_AVX2(const uint16_t* src_ab64,
                         uint8_t* dst_argb,
                         int width) {
-  asm volatile("vbroadcastf128 %3,%%ymm2 \n" LABELALIGN
-      "1: \n"
-      "vmovdqu (%0),%%ymm0 \n"
-      "vmovdqu 0x20(%0),%%ymm1 \n"
-      "vpsrlw $8,%%ymm0,%%ymm0 \n"
-      "vpsrlw $8,%%ymm1,%%ymm1 \n"
-      "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
-      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
-      "vpshufb %%ymm2,%%ymm0,%%ymm0 \n"
-      "vmovdqu %%ymm0,(%1) \n"
-      "lea 0x40(%0),%0 \n"
-      "lea 0x20(%1),%1 \n"
-      "sub $0x8,%2 \n"
-      "jg 1b \n"
-      "vzeroupper \n"
+  asm volatile("vbroadcastf128 %3,%%ymm2 \n" LABELALIGN
+               "1: \n"
+               "vmovdqu (%0),%%ymm0 \n"
+               "vmovdqu 0x20(%0),%%ymm1 \n"
+               "vpsrlw $8,%%ymm0,%%ymm0 \n"
+               "vpsrlw $8,%%ymm1,%%ymm1 \n"
+               "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+               "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+               "vpshufb %%ymm2,%%ymm0,%%ymm0 \n"
+               "vmovdqu %%ymm0,(%1) \n"
+               "lea 0x40(%0),%0 \n"
+               "lea 0x20(%1),%1 \n"
+               "sub $0x8,%2 \n"
+               "jg 1b \n"
+               "vzeroupper \n"
                : "+r"(src_ab64),  // %0
                  "+r"(dst_argb),  // %1
                  "+r"(width)      // %2
@@ -1478,8 +1478,7 @@ void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
       "vbroadcastf128 %4,%%ymm5 \n"
       "vbroadcastf128 %5,%%ymm7 \n"
       "vmovdqa %6,%%ymm6 \n"  //
-      LABELALIGN RGBTOY_AVX2(
-          ymm7) "vzeroupper \n"
+      LABELALIGN RGBTOY_AVX2(ymm7) "vzeroupper \n"
       : "+r"(src_argb),  // %0
         "+r"(dst_y),     // %1
        "+r"(width)      // %2
@@ -1500,7 +1499,7 @@ void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
       "vbroadcastf128 %4,%%ymm5 \n"
       "vbroadcastf128 %5,%%ymm7 \n"
       "vmovdqa %6,%%ymm6 \n" LABELALIGN RGBTOY_AVX2(
-          ymm7) "vzeroupper \n"
+          ymm7) "vzeroupper \n"
       : "+r"(src_abgr),  // %0
         "+r"(dst_y),     // %1
         "+r"(width)      // %2
@@ -1541,7 +1540,7 @@ void ABGRToYJRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
       "vbroadcastf128 %3,%%ymm4 \n"
       "vbroadcastf128 %4,%%ymm5 \n"
       "vmovdqa %5,%%ymm6 \n" LABELALIGN RGBTOY_AVX2(
-          ymm5) "vzeroupper \n"
+          ymm5) "vzeroupper \n"
       : "+r"(src_abgr),  // %0
         "+r"(dst_y),     // %1
         "+r"(width)      // %2
@@ -1560,8 +1559,7 @@ void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
       "vbroadcastf128 %3,%%ymm4 \n"
       "vbroadcastf128 %4,%%ymm5 \n"
       "vmovdqa %5,%%ymm6 \n"  //
-      LABELALIGN RGBTOY_AVX2(
-          ymm5) "vzeroupper \n"
+      LABELALIGN RGBTOY_AVX2(ymm5) "vzeroupper \n"
       : "+r"(src_rgba),  // %0
         "+r"(dst_y),     // %1
         "+r"(width)      // %2
@@ -2604,27 +2602,26 @@ void OMITFP I444AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
                                      uint8_t* dst_argb,
                                      const struct YuvConstants* yuvconstants,
                                      int width) {
-  asm volatile(
-      YUVTORGB_SETUP(
-          yuvconstants) "sub %[u_buf],%[v_buf] \n"
+  asm volatile(YUVTORGB_SETUP(
+                   yuvconstants) "sub %[u_buf],%[v_buf] \n"
 
-      LABELALIGN "1: \n" READYUVA444
-          YUVTORGB(yuvconstants) STOREARGB
-      "subl $0x8,%[width] \n"
-      "jg 1b \n"
-      : [y_buf] "+r"(y_buf),          // %[y_buf]
-        [u_buf] "+r"(u_buf),          // %[u_buf]
-        [v_buf] "+r"(v_buf),          // %[v_buf]
-        [a_buf] "+r"(a_buf),          // %[a_buf]
-        [dst_argb] "+r"(dst_argb),    // %[dst_argb]
+               LABELALIGN "1: \n" READYUVA444 YUVTORGB(yuvconstants)
+                   STOREARGB
+               "subl $0x8,%[width] \n"
+               "jg 1b \n"
+               : [y_buf] "+r"(y_buf),          // %[y_buf]
+                 [u_buf] "+r"(u_buf),          // %[u_buf]
+                 [v_buf] "+r"(v_buf),          // %[v_buf]
+                 [a_buf] "+r"(a_buf),          // %[a_buf]
+                 [dst_argb] "+r"(dst_argb),    // %[dst_argb]
 #if defined(__i386__)
-        [width] "+m"(width)           // %[width]
+                 [width] "+m"(width)           // %[width]
 #else
-        [width] "+rm"(width)          // %[width]
+                 [width] "+rm"(width)          // %[width]
 #endif
-      : [yuvconstants] "r"(yuvconstants)  // %[yuvconstants]
-      : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4",
-        "xmm5");
+               : [yuvconstants] "r"(yuvconstants)  // %[yuvconstants]
+               : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3",
+                 "xmm4", "xmm5");
 }
 
 #endif  // HAS_I444ALPHATOARGBROW_SSSE3
@@ -2931,27 +2928,26 @@ void OMITFP I210AlphaToARGBRow_SSSE3(const uint16_t* y_buf,
                                      uint8_t* dst_argb,
                                      const struct YuvConstants* yuvconstants,
                                      int width) {
-  asm volatile(
-      YUVTORGB_SETUP(
-          yuvconstants) "sub %[u_buf],%[v_buf] \n"
+  asm volatile(YUVTORGB_SETUP(
+                   yuvconstants) "sub %[u_buf],%[v_buf] \n"
 
-      LABELALIGN "1: \n" READYUVA210
-          YUVTORGB(yuvconstants) STOREARGB
-      "subl $0x8,%[width] \n"
-      "jg 1b \n"
-      : [y_buf] "+r"(y_buf),          // %[y_buf]
-        [u_buf] "+r"(u_buf),          // %[u_buf]
-        [v_buf] "+r"(v_buf),          // %[v_buf]
-        [a_buf] "+r"(a_buf),
-        [dst_argb] "+r"(dst_argb),    // %[dst_argb]
+               LABELALIGN "1: \n" READYUVA210 YUVTORGB(yuvconstants)
+                   STOREARGB
+               "subl $0x8,%[width] \n"
+               "jg 1b \n"
+               : [y_buf] "+r"(y_buf),          // %[y_buf]
+                 [u_buf] "+r"(u_buf),          // %[u_buf]
+                 [v_buf] "+r"(v_buf),          // %[v_buf]
+                 [a_buf] "+r"(a_buf),
+                 [dst_argb] "+r"(dst_argb),    // %[dst_argb]
 #if defined(__i386__)
-        [width] "+m"(width)           // %[width]
+                 [width] "+m"(width)           // %[width]
 #else
-        [width] "+rm"(width)          // %[width]
+                 [width] "+rm"(width)          // %[width]
 #endif
-      : [yuvconstants] "r"(yuvconstants)  // %[yuvconstants]
-      : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4",
-        "xmm5");
+               : [yuvconstants] "r"(yuvconstants)  // %[yuvconstants]
+               : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3",
+                 "xmm4", "xmm5");
 }
 
 #endif
@@ -2964,27 +2960,26 @@ void OMITFP I410AlphaToARGBRow_SSSE3(const uint16_t* y_buf,
                                      uint8_t* dst_argb,
                                      const struct YuvConstants* yuvconstants,
                                      int width) {
-  asm volatile(
-      YUVTORGB_SETUP(
-          yuvconstants) "sub %[u_buf],%[v_buf] \n"
+  asm volatile(YUVTORGB_SETUP(
+                   yuvconstants) "sub %[u_buf],%[v_buf] \n"
 
-      LABELALIGN "1: \n" READYUVA410
-          YUVTORGB(yuvconstants) STOREARGB
-      "subl $0x8,%[width] \n"
-      "jg 1b \n"
-      : [y_buf] "+r"(y_buf),          // %[y_buf]
-        [u_buf] "+r"(u_buf),          // %[u_buf]
-        [v_buf] "+r"(v_buf),          // %[v_buf]
-        [a_buf] "+r"(a_buf),
-        [dst_argb] "+r"(dst_argb),    // %[dst_argb]
+               LABELALIGN "1: \n" READYUVA410 YUVTORGB(yuvconstants)
+                   STOREARGB
+               "subl $0x8,%[width] \n"
+               "jg 1b \n"
+               : [y_buf] "+r"(y_buf),          // %[y_buf]
+                 [u_buf] "+r"(u_buf),          // %[u_buf]
+                 [v_buf] "+r"(v_buf),          // %[v_buf]
+                 [a_buf] "+r"(a_buf),
+                 [dst_argb] "+r"(dst_argb),    // %[dst_argb]
 #if defined(__i386__)
-        [width] "+m"(width)           // %[width]
+                 [width] "+m"(width)           // %[width]
 #else
-        [width] "+rm"(width)          // %[width]
+                 [width] "+rm"(width)          // %[width]
 #endif
-      : [yuvconstants] "r"(yuvconstants)  // %[yuvconstants]
-      : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4",
-        "xmm5");
+               : [yuvconstants] "r"(yuvconstants)  // %[yuvconstants]
+               : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3",
+                 "xmm4", "xmm5");
 }
 
 #endif
@@ -3031,27 +3026,26 @@ void OMITFP I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
                                      uint8_t* dst_argb,
                                      const struct YuvConstants* yuvconstants,
                                      int width) {
-  asm volatile(
-      YUVTORGB_SETUP(
-          yuvconstants) "sub %[u_buf],%[v_buf] \n"
+  asm volatile(YUVTORGB_SETUP(
+                   yuvconstants) "sub %[u_buf],%[v_buf] \n"
 
-      LABELALIGN "1: \n" READYUVA422
-          YUVTORGB(yuvconstants) STOREARGB
-      "subl $0x8,%[width] \n"
-      "jg 1b \n"
-      : [y_buf] "+r"(y_buf),          // %[y_buf]
-        [u_buf] "+r"(u_buf),          // %[u_buf]
-        [v_buf] "+r"(v_buf),          // %[v_buf]
-        [a_buf] "+r"(a_buf),          // %[a_buf]
-        [dst_argb] "+r"(dst_argb),    // %[dst_argb]
+               LABELALIGN "1: \n" READYUVA422 YUVTORGB(yuvconstants)
+                   STOREARGB
+               "subl $0x8,%[width] \n"
+               "jg 1b \n"
+               : [y_buf] "+r"(y_buf),          // %[y_buf]
+                 [u_buf] "+r"(u_buf),          // %[u_buf]
+                 [v_buf] "+r"(v_buf),          // %[v_buf]
+                 [a_buf] "+r"(a_buf),          // %[a_buf]
+                 [dst_argb] "+r"(dst_argb),    // %[dst_argb]
 #if defined(__i386__)
-        [width] "+m"(width)           // %[width]
+                 [width] "+m"(width)           // %[width]
 #else
-        [width] "+rm"(width)          // %[width]
+                 [width] "+rm"(width)          // %[width]
 #endif
-      : [yuvconstants] "r"(yuvconstants)  // %[yuvconstants]
-      : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4",
-        "xmm5");
+               : [yuvconstants] "r"(yuvconstants)  // %[yuvconstants]
+               : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3",
+                 "xmm4", "xmm5");
 }
 
 #endif  // HAS_I422ALPHATOARGBROW_SSSE3
@@ -3060,21 +3054,20 @@ void OMITFP NV12ToARGBRow_SSSE3(const uint8_t* y_buf,
                                 uint8_t* dst_argb,
                                 const struct YuvConstants* yuvconstants,
                                 int width) {
-  asm volatile(
-      YUVTORGB_SETUP(
-          yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n"
+  asm volatile(YUVTORGB_SETUP(
+                   yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n"
 
-      LABELALIGN "1: \n" READNV12
-          YUVTORGB(yuvconstants) STOREARGB
-      "sub $0x8,%[width] \n"
-      "jg 1b \n"
-      : [y_buf] "+r"(y_buf),              // %[y_buf]
-        [uv_buf] "+r"(uv_buf),            // %[uv_buf]
-        [dst_argb] "+r"(dst_argb),        // %[dst_argb]
-        [width] "+rm"(width)              // %[width]
-      : [yuvconstants] "r"(yuvconstants)  // %[yuvconstants]
-      : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4",
-        "xmm5");
+               LABELALIGN "1: \n" READNV12 YUVTORGB(yuvconstants)
+                   STOREARGB
+               "sub $0x8,%[width] \n"
+               "jg 1b \n"
+               : [y_buf] "+r"(y_buf),              // %[y_buf]
+                 [uv_buf] "+r"(uv_buf),            // %[uv_buf]
+                 [dst_argb] "+r"(dst_argb),        // %[dst_argb]
+                 [width] "+rm"(width)              // %[width]
+               : [yuvconstants] "r"(yuvconstants)  // %[yuvconstants]
+               : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3",
+                 "xmm4", "xmm5");
 }
 
 void OMITFP NV21ToARGBRow_SSSE3(const uint8_t* y_buf,
@@ -3082,22 +3075,21 @@ void OMITFP NV21ToARGBRow_SSSE3(const uint8_t* y_buf,
                                 uint8_t* dst_argb,
                                 const struct YuvConstants* yuvconstants,
                                 int width) {
-  asm volatile(
-      YUVTORGB_SETUP(
-          yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n"
+  asm volatile(YUVTORGB_SETUP(
+                   yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n"
 
-      LABELALIGN "1: \n" READNV21
-          YUVTORGB(yuvconstants) STOREARGB
-      "sub $0x8,%[width] \n"
-      "jg 1b \n"
-      : [y_buf] "+r"(y_buf),               // %[y_buf]
-        [vu_buf] "+r"(vu_buf),             // %[vu_buf]
-        [dst_argb] "+r"(dst_argb),         // %[dst_argb]
-        [width] "+rm"(width)               // %[width]
-      : [yuvconstants] "r"(yuvconstants),  // %[yuvconstants]
-        [kShuffleNV21] "m"(kShuffleNV21)
-      : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4",
-        "xmm5");
+               LABELALIGN "1: \n" READNV21 YUVTORGB(yuvconstants)
+                   STOREARGB
+               "sub $0x8,%[width] \n"
+               "jg 1b \n"
+               : [y_buf] "+r"(y_buf),               // %[y_buf]
+                 [vu_buf] "+r"(vu_buf),             // %[vu_buf]
+                 [dst_argb] "+r"(dst_argb),         // %[dst_argb]
+                 [width] "+rm"(width)               // %[width]
+               : [yuvconstants] "r"(yuvconstants),  // %[yuvconstants]
+                 [kShuffleNV21] "m"(kShuffleNV21)
+               : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3",
+                 "xmm4", "xmm5");
 }
 
 void OMITFP YUY2ToARGBRow_SSSE3(const uint8_t* yuy2_buf,
@@ -3107,10 +3099,9 @@ void OMITFP YUY2ToARGBRow_SSSE3(const uint8_t* yuy2_buf,
   asm volatile(
       "movdqa %[kShuffleYUY2Y],%%xmm6 \n"
      "movdqa %[kShuffleYUY2UV],%%xmm7 \n" YUVTORGB_SETUP(
-          yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n"
+          yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n"
 
-      LABELALIGN "1: \n" READYUY2
-          YUVTORGB(yuvconstants) STOREARGB
+      LABELALIGN "1: \n" READYUY2 YUVTORGB(yuvconstants) STOREARGB
       "sub $0x8,%[width] \n"
       "jg 1b \n"
       : [yuy2_buf] "+r"(yuy2_buf),  // %[yuy2_buf]
@@ -3129,10 +3120,9 @@ void OMITFP UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf,
   asm volatile(
       "movdqa %[kShuffleUYVYY],%%xmm6 \n"
       "movdqa %[kShuffleUYVYUV],%%xmm7 \n" YUVTORGB_SETUP(
-          yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n"
+          yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n"
 
-      LABELALIGN "1: \n" READUYVY
-          YUVTORGB(yuvconstants) STOREARGB
+      LABELALIGN "1: \n" READUYVY YUVTORGB(yuvconstants) STOREARGB
       "sub $0x8,%[width] \n"
       "jg 1b \n"
       : [uyvy_buf] "+r"(uyvy_buf),  // %[uyvy_buf]
@@ -3149,21 +3139,20 @@ void OMITFP P210ToARGBRow_SSSE3(const uint16_t* y_buf,
                                 uint8_t* dst_argb,
                                 const struct YuvConstants* yuvconstants,
                                 int width) {
-  asm volatile(
-      YUVTORGB_SETUP(
-          yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n"
+  asm volatile(YUVTORGB_SETUP(
+                   yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n"
 
-      LABELALIGN "1: \n" READP210
-          YUVTORGB(yuvconstants) STOREARGB
-      "sub $0x8,%[width] \n"
-      "jg 1b \n"
-      : [y_buf] "+r"(y_buf),              // %[y_buf]
-        [uv_buf] "+r"(uv_buf),            // %[u_buf]
-        [dst_argb] "+r"(dst_argb),        // %[dst_argb]
-        [width] "+rm"(width)              // %[width]
-      : [yuvconstants] "r"(yuvconstants)  // %[yuvconstants]
-      : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4",
-        "xmm5");
+               LABELALIGN "1: \n" READP210 YUVTORGB(yuvconstants)
+                   STOREARGB
+               "sub $0x8,%[width] \n"
+               "jg 1b \n"
+               : [y_buf] "+r"(y_buf),              // %[y_buf]
+                 [uv_buf] "+r"(uv_buf),            // %[u_buf]
+                 [dst_argb] "+r"(dst_argb),        // %[dst_argb]
+                 [width] "+rm"(width)              // %[width]
+               : [yuvconstants] "r"(yuvconstants)  // %[yuvconstants]
+               : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3",
+                 "xmm4", "xmm5");
 }
 
 void OMITFP P410ToARGBRow_SSSE3(const uint16_t* y_buf,
@@ -3171,21 +3160,20 @@ void OMITFP P410ToARGBRow_SSSE3(const uint16_t* y_buf,
                                 uint8_t* dst_argb,
                                 const struct YuvConstants* yuvconstants,
                                 int width) {
-  asm volatile(
-      YUVTORGB_SETUP(
-          yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n"
+  asm volatile(YUVTORGB_SETUP(
+                   yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n"
 
-      LABELALIGN "1: \n" READP410
-          YUVTORGB(yuvconstants) STOREARGB
-      "sub $0x8,%[width] \n"
-      "jg 1b \n"
-      : [y_buf] "+r"(y_buf),              // %[y_buf]
-        [uv_buf] "+r"(uv_buf),            // %[u_buf]
-        [dst_argb] "+r"(dst_argb),        // %[dst_argb]
-        [width] "+rm"(width)              // %[width]
-      : [yuvconstants] "r"(yuvconstants)  // %[yuvconstants]
-      : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4",
-        "xmm5");
+               LABELALIGN "1: \n" READP410 YUVTORGB(yuvconstants)
+                   STOREARGB
+               "sub $0x8,%[width] \n"
+               "jg 1b \n"
+               : [y_buf] "+r"(y_buf),              // %[y_buf]
+                 [uv_buf] "+r"(uv_buf),            // %[u_buf]
+                 [dst_argb] "+r"(dst_argb),        // %[dst_argb]
+                 [width] "+rm"(width)              // %[width]
+               : [yuvconstants] "r"(yuvconstants)  // %[yuvconstants]
+               : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3",
+                 "xmm4", "xmm5");
 }
 
 void OMITFP P210ToAR30Row_SSSE3(const uint16_t* y_buf,
@@ -4006,29 +3994,28 @@ void OMITFP I210AlphaToARGBRow_AVX2(const uint16_t* y_buf,
                                    uint8_t* dst_argb,
                                    const struct YuvConstants* yuvconstants,
                                    int width) {
-  asm volatile(
-      YUVTORGB_SETUP_AVX2(
-          yuvconstants) "sub %[u_buf],%[v_buf] \n"
+  asm volatile(YUVTORGB_SETUP_AVX2(
+                   yuvconstants) "sub %[u_buf],%[v_buf] \n"
 
-      LABELALIGN "1: \n" READYUVA210_AVX2
-          YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2
-      "subl $0x10,%[width] \n"
-      "jg 1b \n"
-      "vzeroupper \n"
+               LABELALIGN "1: \n" READYUVA210_AVX2 YUVTORGB_AVX2(
+                   yuvconstants) STOREARGB_AVX2
+               "subl $0x10,%[width] \n"
+               "jg 1b \n"
+               "vzeroupper \n"
 
-      : [y_buf] "+r"(y_buf),          // %[y_buf]
-        [u_buf] "+r"(u_buf),          // %[u_buf]
-        [v_buf] "+r"(v_buf),          // %[v_buf]
-        [a_buf] "+r"(a_buf),          // %[a_buf]
-        [dst_argb] "+r"(dst_argb),    // %[dst_argb]
+               : [y_buf] "+r"(y_buf),          // %[y_buf]
+                 [u_buf] "+r"(u_buf),          // %[u_buf]
+                 [v_buf] "+r"(v_buf),          // %[v_buf]
+                 [a_buf] "+r"(a_buf),          // %[a_buf]
+                 [dst_argb] "+r"(dst_argb),    // %[dst_argb]
 #if defined(__i386__)
-        [width] "+m"(width)           // %[width]
+                 [width] "+m"(width)           // %[width]
 #else
-        [width] "+rm"(width)          // %[width]
+                 [width] "+rm"(width)          // %[width]
 #endif
-      : [yuvconstants] "r"(yuvconstants)  // %[yuvconstants]
-      : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3",
-        "xmm4", "xmm5");
+               : [yuvconstants] "r"(yuvconstants)  // %[yuvconstants]
+               : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2",
+                 "xmm3", "xmm4", "xmm5");
 }
 
 #endif  // HAS_I210TOARGBROW_AVX2
@@ -4042,29 +4029,28 @@ void OMITFP I410AlphaToARGBRow_AVX2(const uint16_t* y_buf,
                                    uint8_t* dst_argb,
                                    const struct YuvConstants* yuvconstants,
                                    int width) {
-  asm volatile(
-      YUVTORGB_SETUP_AVX2(
-          yuvconstants) "sub %[u_buf],%[v_buf] \n"
+  asm volatile(YUVTORGB_SETUP_AVX2(
+                   yuvconstants) "sub %[u_buf],%[v_buf] \n"
 
-      LABELALIGN "1: \n" READYUVA410_AVX2
-          YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2
-      "subl $0x10,%[width] \n"
-      "jg 1b \n"
-      "vzeroupper \n"
+               LABELALIGN "1: \n" READYUVA410_AVX2 YUVTORGB_AVX2(
+                   yuvconstants) STOREARGB_AVX2
+               "subl $0x10,%[width] \n"
+               "jg 1b \n"
+               "vzeroupper \n"
 
-      : [y_buf] "+r"(y_buf),          // %[y_buf]
-        [u_buf] "+r"(u_buf),          // %[u_buf]
-        [v_buf] "+r"(v_buf),          // %[v_buf]
-        [a_buf] "+r"(a_buf),          // %[a_buf]
-        [dst_argb] "+r"(dst_argb),    // %[dst_argb]
+               : [y_buf] "+r"(y_buf),          // %[y_buf]
+                 [u_buf] "+r"(u_buf),          // %[u_buf]
+                 [v_buf] "+r"(v_buf),          // %[v_buf]
+                 [a_buf] "+r"(a_buf),          // %[a_buf]
+                 [dst_argb] "+r"(dst_argb),    // %[dst_argb]
 #if defined(__i386__)
-        [width] "+m"(width)           // %[width]
+                 [width] "+m"(width)           // %[width]
 #else
-        [width] "+rm"(width)          // %[width]
+                 [width] "+rm"(width)          // %[width]
 #endif
-      : [yuvconstants] "r"(yuvconstants)  // %[yuvconstants]
-      : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3",
-        "xmm4", "xmm5");
+               : [yuvconstants] "r"(yuvconstants)  // %[yuvconstants]
+               : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2",
+                 "xmm3", "xmm4", "xmm5");
 }
 
 #endif  // HAS_I410TOARGBROW_AVX2
@@ -4118,28 +4104,27 @@ void OMITFP I444AlphaToARGBRow_AVX2(const uint8_t* y_buf,
                                    uint8_t* dst_argb,
                                    const struct YuvConstants* yuvconstants,
                                    int width) {
-  asm volatile(
-      YUVTORGB_SETUP_AVX2(
-          yuvconstants) "sub %[u_buf],%[v_buf] \n"
+  asm volatile(YUVTORGB_SETUP_AVX2(
+                   yuvconstants) "sub %[u_buf],%[v_buf] \n"
 
-      LABELALIGN "1: \n" READYUVA444_AVX2
-          YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2
-      "subl $0x10,%[width] \n"
-      "jg 1b \n"
-      "vzeroupper \n"
-      : [y_buf] "+r"(y_buf),          // %[y_buf]
-        [u_buf] "+r"(u_buf),          // %[u_buf]
-        [v_buf] "+r"(v_buf),          // %[v_buf]
-        [a_buf] "+r"(a_buf),          // %[a_buf]
-        [dst_argb] "+r"(dst_argb),    // %[dst_argb]
+               LABELALIGN "1: \n" READYUVA444_AVX2 YUVTORGB_AVX2(
+                   yuvconstants) STOREARGB_AVX2
+               "subl $0x10,%[width] \n"
+               "jg 1b \n"
+               "vzeroupper \n"
+               : [y_buf] "+r"(y_buf),          // %[y_buf]
+                 [u_buf] "+r"(u_buf),          // %[u_buf]
+                 [v_buf] "+r"(v_buf),          // %[v_buf]
+                 [a_buf] "+r"(a_buf),          // %[a_buf]
+                 [dst_argb] "+r"(dst_argb),    // %[dst_argb]
 #if defined(__i386__)
-        [width] "+m"(width)           // %[width]
+                 [width] "+m"(width)           // %[width]
 #else
-        [width] "+rm"(width)          // %[width]
+                 [width] "+rm"(width)          // %[width]
 #endif
-      : [yuvconstants] "r"(yuvconstants)  // %[yuvconstants]
-      : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3",
-        "xmm4", "xmm5");
+               : [yuvconstants] "r"(yuvconstants)  // %[yuvconstants]
+               : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2",
+                 "xmm3", "xmm4", "xmm5");
 }
 
 #endif  // HAS_I444ALPHATOARGBROW_AVX2
@@ -4153,28 +4138,27 @@ void OMITFP I422AlphaToARGBRow_AVX2(const uint8_t* y_buf,
                                    uint8_t* dst_argb,
                                    const struct YuvConstants* yuvconstants,
                                    int width) {
-  asm volatile(
-      YUVTORGB_SETUP_AVX2(
-          yuvconstants) "sub %[u_buf],%[v_buf] \n"
+  asm volatile(YUVTORGB_SETUP_AVX2(
+                   yuvconstants) "sub %[u_buf],%[v_buf] \n"
 
-      LABELALIGN "1: \n" READYUVA422_AVX2
-          YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2
-      "subl $0x10,%[width] \n"
-      "jg 1b \n"
-      "vzeroupper \n"
-      : [y_buf] "+r"(y_buf),          // %[y_buf]
-        [u_buf] "+r"(u_buf),          // %[u_buf]
-        [v_buf] "+r"(v_buf),          // %[v_buf]
-        [a_buf] "+r"(a_buf),          // %[a_buf]
-        [dst_argb] "+r"(dst_argb),    // %[dst_argb]
+               LABELALIGN "1: \n" READYUVA422_AVX2 YUVTORGB_AVX2(
+                   yuvconstants) STOREARGB_AVX2
+               "subl $0x10,%[width] \n"
+               "jg 1b \n"
+               "vzeroupper \n"
+               : [y_buf] "+r"(y_buf),          // %[y_buf]
+                 [u_buf] "+r"(u_buf),          // %[u_buf]
+                 [v_buf] "+r"(v_buf),          // %[v_buf]
+                 [a_buf] "+r"(a_buf),          // %[a_buf]
+                 [dst_argb] "+r"(dst_argb),    // %[dst_argb]
 #if defined(__i386__)
-        [width] "+m"(width)           // %[width]
+                 [width] "+m"(width)           // %[width]
 #else
-        [width] "+rm"(width)          // %[width]
+                 [width] "+rm"(width)          // %[width]
 #endif
-      : [yuvconstants] "r"(yuvconstants)  // %[yuvconstants]
-      : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3",
-        "xmm4", "xmm5");
+               : [yuvconstants] "r"(yuvconstants)  // %[yuvconstants]
+               : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2",
+                 "xmm3", "xmm4", "xmm5");
 }
 
 #endif  // HAS_I422ALPHATOARGBROW_AVX2
@@ -4230,22 +4214,21 @@ void OMITFP NV12ToARGBRow_AVX2(const uint8_t* y_buf,
                               uint8_t* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
-  asm volatile(
-      YUVTORGB_SETUP_AVX2(
-          yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+  asm volatile(YUVTORGB_SETUP_AVX2(
+                   yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
 
-      LABELALIGN "1: \n" READNV12_AVX2
-          YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2
-      "sub $0x10,%[width] \n"
-      "jg 1b \n"
-      "vzeroupper \n"
-      : [y_buf] "+r"(y_buf),              // %[y_buf]
-        [uv_buf] "+r"(uv_buf),            // %[uv_buf]
-        [dst_argb] "+r"(dst_argb),        // %[dst_argb]
-        [width] "+rm"(width)              // %[width]
-      : [yuvconstants] "r"(yuvconstants)  // %[yuvconstants]
-      : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm0", "xmm1", "xmm2",
-        "xmm3", "xmm4", "xmm5");
+               LABELALIGN "1: \n" READNV12_AVX2 YUVTORGB_AVX2(
+                   yuvconstants) STOREARGB_AVX2
+               "sub $0x10,%[width] \n"
+               "jg 1b \n"
+               "vzeroupper \n"
+               : [y_buf] "+r"(y_buf),              // %[y_buf]
+                 [uv_buf] "+r"(uv_buf),            // %[uv_buf]
+                 [dst_argb] "+r"(dst_argb),        // %[dst_argb]
+                 [width] "+rm"(width)              // %[width]
+               : [yuvconstants] "r"(yuvconstants)  // %[yuvconstants]
+               : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm0", "xmm1",
+                 "xmm2", "xmm3", "xmm4", "xmm5");
 }
 #endif  // HAS_NV12TOARGBROW_AVX2
 
@@ -4257,23 +4240,22 @@ void OMITFP NV21ToARGBRow_AVX2(const uint8_t* y_buf,
                               uint8_t* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
-  asm volatile(
-      YUVTORGB_SETUP_AVX2(
-          yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+  asm volatile(YUVTORGB_SETUP_AVX2(
+                   yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
 
-      LABELALIGN "1: \n" READNV21_AVX2
-          YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2
-      "sub $0x10,%[width] \n"
-      "jg 1b \n"
-      "vzeroupper \n"
-      : [y_buf] "+r"(y_buf),               // %[y_buf]
-        [vu_buf] "+r"(vu_buf),             // %[vu_buf]
-        [dst_argb] "+r"(dst_argb),         // %[dst_argb]
-        [width] "+rm"(width)               // %[width]
-      : [yuvconstants] "r"(yuvconstants),  // %[yuvconstants]
-        [kShuffleNV21] "m"(kShuffleNV21)
-      : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm0", "xmm1", "xmm2",
-        "xmm3", "xmm4", "xmm5");
+               LABELALIGN "1: \n" READNV21_AVX2 YUVTORGB_AVX2(
+                   yuvconstants) STOREARGB_AVX2
+               "sub $0x10,%[width] \n"
+               "jg 1b \n"
+               "vzeroupper \n"
+               : [y_buf] "+r"(y_buf),               // %[y_buf]
+                 [vu_buf] "+r"(vu_buf),             // %[vu_buf]
+                 [dst_argb] "+r"(dst_argb),         // %[dst_argb]
+                 [width] "+rm"(width)               // %[width]
+               : [yuvconstants] "r"(yuvconstants),  // %[yuvconstants]
+                 [kShuffleNV21] "m"(kShuffleNV21)
+               : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm0", "xmm1",
+                 "xmm2", "xmm3", "xmm4", "xmm5");
 }
 #endif  // HAS_NV21TOARGBROW_AVX2
 
@@ -4287,10 +4269,10 @@ void OMITFP YUY2ToARGBRow_AVX2(const uint8_t* yuy2_buf,
   asm volatile(
       "vbroadcastf128 %[kShuffleYUY2Y],%%ymm6 \n"
       "vbroadcastf128 %[kShuffleYUY2UV],%%ymm7 \n" YUVTORGB_SETUP_AVX2(
-          yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+          yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
 
-      LABELALIGN "1: \n" READYUY2_AVX2
-          YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2
+      LABELALIGN "1: \n" READYUY2_AVX2 YUVTORGB_AVX2(yuvconstants)
+          STOREARGB_AVX2
       "sub $0x10,%[width] \n"
       "jg 1b \n"
       "vzeroupper \n"
@@ -4314,10 +4296,10 @@ void OMITFP UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf,
   asm volatile(
       "vbroadcastf128 %[kShuffleUYVYY],%%ymm6 \n"
       "vbroadcastf128 %[kShuffleUYVYUV],%%ymm7 \n" YUVTORGB_SETUP_AVX2(
-          yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+          yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
 
-      LABELALIGN "1: \n" READUYVY_AVX2
-          YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2
+      LABELALIGN "1: \n" READUYVY_AVX2 YUVTORGB_AVX2(yuvconstants)
+          STOREARGB_AVX2
       "sub $0x10,%[width] \n"
       "jg 1b \n"
       "vzeroupper \n"
@@ -4339,22 +4321,21 @@ void OMITFP P210ToARGBRow_AVX2(const uint16_t* y_buf,
                               uint8_t* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
-  asm volatile(
-      YUVTORGB_SETUP_AVX2(
-          yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+  asm volatile(YUVTORGB_SETUP_AVX2(
+                   yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
 
-      LABELALIGN "1: \n" READP210_AVX2
-          YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2
-      "sub $0x10,%[width] \n"
-      "jg 1b \n"
-      "vzeroupper \n"
-      : [y_buf] "+r"(y_buf),              // %[y_buf]
-        [uv_buf] "+r"(uv_buf),            // %[uv_buf]
-        [dst_argb] "+r"(dst_argb),        // %[dst_argb]
-        [width] "+rm"(width)              // %[width]
-      : [yuvconstants] "r"(yuvconstants)  // %[yuvconstants]
-      : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm0", "xmm1", "xmm2",
-        "xmm3", "xmm4", "xmm5");
+               LABELALIGN "1: \n" READP210_AVX2 YUVTORGB_AVX2(
+                   yuvconstants) STOREARGB_AVX2
+               "sub $0x10,%[width] \n"
+               "jg 1b \n"
+               "vzeroupper \n"
+               : [y_buf] "+r"(y_buf),              // %[y_buf]
+                 [uv_buf] "+r"(uv_buf),            // %[uv_buf]
+                 [dst_argb] "+r"(dst_argb),        // %[dst_argb]
+                 [width] "+rm"(width)              // %[width]
+               : [yuvconstants] "r"(yuvconstants)  // %[yuvconstants]
+               : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm0", "xmm1",
+                 "xmm2", "xmm3", "xmm4", "xmm5");
 }
 #endif  // HAS_P210TOARGBROW_AVX2
 
@@ -4366,22 +4347,21 @@ void OMITFP P410ToARGBRow_AVX2(const uint16_t* y_buf,
                               uint8_t* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
-  asm volatile(
-      YUVTORGB_SETUP_AVX2(
-          yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+  asm volatile(YUVTORGB_SETUP_AVX2(
+                   yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
 
-      LABELALIGN "1: \n" READP410_AVX2
-          YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2
-      "sub $0x10,%[width] \n"
-      "jg 1b \n"
-      "vzeroupper \n"
-      : [y_buf] "+r"(y_buf),              // %[y_buf]
-        [uv_buf] "+r"(uv_buf),            // %[uv_buf]
-        [dst_argb] "+r"(dst_argb),        // %[dst_argb]
-        [width] "+rm"(width)              // %[width]
-      : [yuvconstants] "r"(yuvconstants)  // %[yuvconstants]
-      : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm0", "xmm1", "xmm2",
-        "xmm3", "xmm4", "xmm5");
+               LABELALIGN "1: \n" READP410_AVX2 YUVTORGB_AVX2(
+                   yuvconstants) STOREARGB_AVX2
+               "sub $0x10,%[width] \n"
+               "jg 1b \n"
+               "vzeroupper \n"
+               : [y_buf] "+r"(y_buf),              // %[y_buf]
+                 [uv_buf] "+r"(uv_buf),            // %[uv_buf]
+                 [dst_argb] "+r"(dst_argb),        // %[dst_argb]
+                 [width] "+rm"(width)              // %[width]
+               : [yuvconstants] "r"(yuvconstants)  // %[yuvconstants]
+               : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm0", "xmm1",
+                 "xmm2", "xmm3", "xmm4", "xmm5");
 }
 #endif  // HAS_P410TOARGBROW_AVX2
 
@@ -4553,16 +4533,16 @@ static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
 
 void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
   intptr_t temp_width = (intptr_t)(width);
-  asm volatile("movdqa %3,%%xmm5 \n"
+  asm volatile("movdqa %3,%%xmm5 \n"
 
                LABELALIGN
-      "1: \n"
-      "movdqu -0x10(%0,%2,1),%%xmm0 \n"
-      "pshufb %%xmm5,%%xmm0 \n"
-      "movdqu %%xmm0,(%1) \n"
-      "lea 0x10(%1),%1 \n"
-      "sub $0x10,%2 \n"
-      "jg 1b \n"
+               "1: \n"
+               "movdqu -0x10(%0,%2,1),%%xmm0 \n"
+               "pshufb %%xmm5,%%xmm0 \n"
+               "movdqu %%xmm0,(%1) \n"
+               "lea 0x10(%1),%1 \n"
+               "sub $0x10,%2 \n"
+               "jg 1b \n"
                : "+r"(src),         // %0
                  "+r"(dst),         // %1
                  "+r"(temp_width)   // %2
@@ -4574,18 +4554,18 @@ void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
 #ifdef HAS_MIRRORROW_AVX2
 void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
   intptr_t temp_width = (intptr_t)(width);
-  asm volatile("vbroadcastf128 %3,%%ymm5 \n"
+  asm volatile("vbroadcastf128 %3,%%ymm5 \n"
 
                LABELALIGN
-      "1: \n"
-      "vmovdqu -0x20(%0,%2,1),%%ymm0 \n"
-      "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
-      "vpermq $0x4e,%%ymm0,%%ymm0 \n"
-      "vmovdqu %%ymm0,(%1) \n"
-      "lea 0x20(%1),%1 \n"
-      "sub $0x20,%2 \n"
-      "jg 1b \n"
-      "vzeroupper \n"
+               "1: \n"
+               "vmovdqu -0x20(%0,%2,1),%%ymm0 \n"
+               "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
+               "vpermq $0x4e,%%ymm0,%%ymm0 \n"
+               "vmovdqu %%ymm0,(%1) \n"
+               "lea 0x20(%1),%1 \n"
+               "sub $0x20,%2 \n"
+               "jg 1b \n"
+               "vzeroupper \n"
                : "+r"(src),         // %0
                  "+r"(dst),         // %1
                  "+r"(temp_width)   // %2
@@ -4601,16 +4581,16 @@ static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u,
 
 void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
   intptr_t temp_width = (intptr_t)(width);
-  asm volatile("movdqa %3,%%xmm5 \n"
+  asm volatile("movdqa %3,%%xmm5 \n"
 
                LABELALIGN
-      "1: \n"
-      "movdqu -0x10(%0,%2,2),%%xmm0 \n"
-      "pshufb %%xmm5,%%xmm0 \n"
-      "movdqu %%xmm0,(%1) \n"
-      "lea 0x10(%1),%1 \n"
-      "sub $0x8,%2 \n"
-      "jg 1b \n"
+               "1: \n"
+               "movdqu -0x10(%0,%2,2),%%xmm0 \n"
+               "pshufb %%xmm5,%%xmm0 \n"
+               "movdqu %%xmm0,(%1) \n"
+               "lea 0x10(%1),%1 \n"
+               "sub $0x8,%2 \n"
+               "jg 1b \n"
                : "+r"(src_uv),      // %0
                  "+r"(dst_uv),      // %1
                  "+r"(temp_width)   // %2
@@ -4622,18 +4602,18 @@ void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
 #ifdef HAS_MIRRORUVROW_AVX2
 void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
   intptr_t temp_width = (intptr_t)(width);
-  asm volatile("vbroadcastf128 %3,%%ymm5 \n"
+  asm volatile("vbroadcastf128 %3,%%ymm5 \n"
 
                LABELALIGN
-      "1: \n"
-      "vmovdqu -0x20(%0,%2,2),%%ymm0 \n"
-      "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
-      "vpermq $0x4e,%%ymm0,%%ymm0 \n"
-      "vmovdqu %%ymm0,(%1) \n"
-      "lea 0x20(%1),%1 \n"
-      "sub $0x10,%2 \n"
-      "jg 1b \n"
-      "vzeroupper \n"
+               "1: \n"
+               "vmovdqu -0x20(%0,%2,2),%%ymm0 \n"
+               "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
+               "vpermq $0x4e,%%ymm0,%%ymm0 \n"
+               "vmovdqu %%ymm0,(%1) \n"
+               "lea 0x20(%1),%1 \n"
+               "sub $0x10,%2 \n"
+               "jg 1b \n"
+               "vzeroupper \n"
                : "+r"(src_uv),      // %0
                  "+r"(dst_uv),      // %1
                  "+r"(temp_width)   // %2
@@ -4726,17 +4706,17 @@ void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24,
 
 void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
   intptr_t temp_width = (intptr_t)(width);
-  asm volatile("lea -0x10(%0,%2,4),%0 \n"
+  asm volatile("lea -0x10(%0,%2,4),%0 \n"
 
                LABELALIGN
-      "1: \n"
-      "movdqu (%0),%%xmm0 \n"
-      "pshufd $0x1b,%%xmm0,%%xmm0 \n"
-      "lea -0x10(%0),%0 \n"
-      "movdqu %%xmm0,(%1) \n"
-      "lea 0x10(%1),%1 \n"
-      "sub $0x4,%2 \n"
-      "jg 1b \n"
+               "1: \n"
+               "movdqu (%0),%%xmm0 \n"
+               "pshufd $0x1b,%%xmm0,%%xmm0 \n"
+               "lea -0x10(%0),%0 \n"
+               "movdqu %%xmm0,(%1) \n"
+               "lea 0x10(%1),%1 \n"
+               "sub $0x4,%2 \n"
+               "jg 1b \n"
                : "+r"(src),         // %0
                  "+r"(dst),         // %1
                  "+r"(temp_width)   // %2
@@ -4750,16 +4730,16 @@ void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
 static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
 void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
   intptr_t temp_width = (intptr_t)(width);
-  asm volatile("vmovdqu %3,%%ymm5 \n"
+  asm volatile("vmovdqu %3,%%ymm5 \n"
 
                LABELALIGN
-      "1: \n"
-      "vpermd -0x20(%0,%2,4),%%ymm5,%%ymm0 \n"
-      "vmovdqu %%ymm0,(%1) \n"
-      "lea 0x20(%1),%1 \n"
-      "sub $0x8,%2 \n"
-      "jg 1b \n"
-      "vzeroupper \n"
+               "1: \n"
+               "vpermd -0x20(%0,%2,4),%%ymm5,%%ymm0 \n"
+               "vmovdqu %%ymm0,(%1) \n"
+               "lea 0x20(%1),%1 \n"
+               "sub $0x8,%2 \n"
+               "jg 1b \n"
+               "vzeroupper \n"
                : "+r"(src),         // %0
                  "+r"(dst),         // %1
                  "+r"(temp_width)   // %2
@@ -4983,20 +4963,20 @@ void MergeUVRow_AVX512BW(const uint8_t* src_u,
                          const uint8_t* src_v,
                          uint8_t* dst_uv,
                          int width) {
-  asm volatile("sub %0,%1 \n"
+  asm volatile("sub %0,%1 \n"
 
                LABELALIGN
-      "1: \n"
-      "vpmovzxbw (%0),%%zmm0 \n"
-      "vpmovzxbw 0x00(%0,%1,1),%%zmm1 \n"
-      "lea 0x20(%0),%0 \n"
-      "vpsllw $0x8,%%zmm1,%%zmm1 \n"
-      "vporq %%zmm0,%%zmm1,%%zmm2 \n"
-      "vmovdqu64 %%zmm2,(%2) \n"
-      "lea 0x40(%2),%2 \n"
-      "sub $0x20,%3 \n"
-      "jg 1b \n"
-      "vzeroupper \n"
+               "1: \n"
+               "vpmovzxbw (%0),%%zmm0 \n"
+               "vpmovzxbw 0x00(%0,%1,1),%%zmm1 \n"
+               "lea 0x20(%0),%0 \n"
+               "vpsllw $0x8,%%zmm1,%%zmm1 \n"
+               "vporq %%zmm0,%%zmm1,%%zmm2 \n"
+               "vmovdqu64 %%zmm2,(%2) \n"
+               "lea 0x40(%2),%2 \n"
+               "sub $0x20,%3 \n"
+               "jg 1b \n"
+               "vzeroupper \n"
                : "+r"(src_u),   // %0
                  "+r"(src_v),   // %1
                  "+r"(dst_uv),  // %2
@@ -5011,20 +4991,20 @@ void MergeUVRow_AVX2(const uint8_t* src_u,
                     const uint8_t* src_v,
                     uint8_t* dst_uv,
                     int width) {
-  asm volatile("sub %0,%1 \n"
+  asm volatile("sub %0,%1 \n"
 
                LABELALIGN
-      "1: \n"
-      "vpmovzxbw (%0),%%ymm0 \n"
-      "vpmovzxbw 0x00(%0,%1,1),%%ymm1 \n"
-      "lea 0x10(%0),%0 \n"
-      "vpsllw $0x8,%%ymm1,%%ymm1 \n"
-      "vpor %%ymm0,%%ymm1,%%ymm2 \n"
-      "vmovdqu %%ymm2,(%2) \n"
-      "lea 0x20(%2),%2 \n"
-      "sub $0x10,%3 \n"
-      "jg 1b \n"
-      "vzeroupper \n"
+               "1: \n"
+               "vpmovzxbw (%0),%%ymm0 \n"
+               "vpmovzxbw 0x00(%0,%1,1),%%ymm1 \n"
+               "lea 0x10(%0),%0 \n"
+               "vpsllw $0x8,%%ymm1,%%ymm1 \n"
+               "vpor %%ymm0,%%ymm1,%%ymm2 \n"
+               "vmovdqu %%ymm2,(%2) \n"
+               "lea 0x20(%2),%2 \n"
+               "sub $0x10,%3 \n"
+               "jg 1b \n"
+               "vzeroupper \n"
                : "+r"(src_u),   // %0
                  "+r"(src_v),   // %1
                  "+r"(dst_uv),  // %2
@@ -5039,21 +5019,21 @@ void MergeUVRow_SSE2(const uint8_t* src_u,
                     const uint8_t* src_v,
                     uint8_t* dst_uv,
                     int width) {
-  asm volatile("sub %0,%1 \n"
+  asm volatile("sub %0,%1 \n"
 
                LABELALIGN
-      "1: \n"
-      "movdqu (%0),%%xmm0 \n"
-      "movdqu 0x00(%0,%1,1),%%xmm1 \n"
-      "lea 0x10(%0),%0 \n"
-      "movdqa %%xmm0,%%xmm2 \n"
-      "punpcklbw %%xmm1,%%xmm0 \n"
-      "punpckhbw %%xmm1,%%xmm2 \n"
-      "movdqu %%xmm0,(%2) \n"
-      "movdqu %%xmm2,0x10(%2) \n"
-      "lea 0x20(%2),%2 \n"
-      "sub $0x10,%3 \n"
-      "jg 1b \n"
+               "1: \n"
+               "movdqu (%0),%%xmm0 \n"
+               "movdqu 0x00(%0,%1,1),%%xmm1 \n"
+               "lea 0x10(%0),%0 \n"
+               "movdqa %%xmm0,%%xmm2 \n"
+               "punpcklbw %%xmm1,%%xmm0 \n"
+               "punpckhbw %%xmm1,%%xmm2 \n"
+               "movdqu %%xmm0,(%2) \n"
+               "movdqu %%xmm2,0x10(%2) \n"
+               "lea 0x20(%2),%2 \n"
+               "sub $0x10,%3 \n"
+               "jg 1b \n"
                : "+r"(src_u),   // %0
                  "+r"(src_v),   // %1
                  "+r"(dst_uv),  // %2
@@ -5288,24 +5268,24 @@ void Convert16To8Row_AVX512BW(const uint16_t* src_y,
                              uint8_t* dst_y,
                              int scale,
                              int width) {
-  asm volatile("vpbroadcastw %3,%%zmm2 \n"
+  asm volatile("vpbroadcastw %3,%%zmm2 \n"
 
                // 64 pixels per loop.
                LABELALIGN
-      "1: \n"
-      "vmovups (%0),%%zmm0 \n"
-      "vmovups 0x40(%0),%%zmm1 \n"
-      "add $0x80,%0 \n"
-      "vpmulhuw %%zmm2,%%zmm0,%%zmm0 \n"
-      "vpmulhuw %%zmm2,%%zmm1,%%zmm1 \n"
-      "vpmovuswb %%zmm0,%%ymm0 \n"
-      "vpmovuswb %%zmm1,%%ymm1 \n"
-      "vmovups %%ymm0,(%1) \n"
-      "vmovups %%ymm1,0x20(%1) \n"
-      "add $0x40,%1 \n"
-      "sub $0x40,%2 \n"
-      "jg 1b \n"
-      "vzeroupper \n"
+               "1: \n"
+               "vmovups (%0),%%zmm0 \n"
+               "vmovups 0x40(%0),%%zmm1 \n"
+               "add $0x80,%0 \n"
+               "vpmulhuw %%zmm2,%%zmm0,%%zmm0 \n"
+               "vpmulhuw %%zmm2,%%zmm1,%%zmm1 \n"
+               "vpmovuswb %%zmm0,%%ymm0 \n"
+               "vpmovuswb %%zmm1,%%ymm1 \n"
+               "vmovups %%ymm0,(%1) \n"
+               "vmovups %%ymm1,0x20(%1) \n"
+               "add $0x40,%1 \n"
+               "sub $0x40,%2 \n"
+               "jg 1b \n"
+               "vzeroupper \n"
                : "+r"(src_y),  // %0
                  "+r"(dst_y),  // %1
                  "+r"(width)   // %2
@@ -6490,7 +6470,7 @@ void CopyRow_AVX512BW(const uint8_t* src, uint8_t* dst, int width) {
 
 // Multiple of 1.
 void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width) {
   size_t width_tmp = (size_t)(width);
-  asm volatile("rep movsb \n"
+  asm volatile("rep movsb \n"
                : "+S"(src),       // %0
                  "+D"(dst),       // %1
                  "+c"(width_tmp)  // %2
@@ -6700,7 +6680,7 @@ void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
 void SetRow_X86(uint8_t* dst, uint8_t v8, int width) {
   size_t width_tmp = (size_t)(width >> 2);
   const uint32_t v32 = v8 * 0x01010101u;  // Duplicate byte to all bytes.
-  asm volatile("rep stosl \n"
+  asm volatile("rep stosl \n"
               : "+D"(dst),       // %0
                  "+c"(width_tmp)  // %1
                : "a"(v32)         // %2
@@ -6709,7 +6689,7 @@ void SetRow_X86(uint8_t* dst, uint8_t v8, int width) {
 
 void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) {
   size_t width_tmp = (size_t)(width);
-  asm volatile("rep stosb \n"
+  asm volatile("rep stosb \n"
               : "+D"(dst),       // %0
                  "+c"(width_tmp)  // %1
               : "a"(v8)          // %2
@@ -6718,7 +6698,7 @@ void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) {
 
 void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width) {
   size_t width_tmp = (size_t)(width);
-  asm volatile("rep stosl \n"
+  asm volatile("rep stosl \n"
               : "+D"(dst_argb),  // %0
                  "+c"(width_tmp)  // %1
               : "a"(v32)         // %2
@@ -7879,28 +7859,28 @@ void ARGBMultiplyRow_SSE2(const uint8_t* src_argb,
                          const uint8_t* src_argb1,
                          uint8_t* dst_argb,
                          int width) {
-  asm volatile("pxor %%xmm5,%%xmm5 \n"
+  asm volatile("pxor %%xmm5,%%xmm5 \n"
 
                // 4 pixel loop.
                LABELALIGN
-      "1: \n"
-      "movdqu (%0),%%xmm0 \n"
-      "lea 0x10(%0),%0 \n"
-      "movdqu (%1),%%xmm2 \n"
-      "lea 0x10(%1),%1 \n"
-      "movdqu %%xmm0,%%xmm1 \n"
-      "movdqu %%xmm2,%%xmm3 \n"
-      "punpcklbw %%xmm0,%%xmm0 \n"
-      "punpckhbw %%xmm1,%%xmm1 \n"
-      "punpcklbw %%xmm5,%%xmm2 \n"
-      "punpckhbw %%xmm5,%%xmm3 \n"
-      "pmulhuw %%xmm2,%%xmm0 \n"
-      "pmulhuw %%xmm3,%%xmm1 \n"
-      "packuswb %%xmm1,%%xmm0 \n"
-      "movdqu %%xmm0,(%2) \n"
-      "lea 0x10(%2),%2 \n"
-      "sub $0x4,%3 \n"
-      "jg 1b \n"
+               "1: \n"
+               "movdqu (%0),%%xmm0 \n"
+               "lea 0x10(%0),%0 \n"
+               "movdqu (%1),%%xmm2 \n"
+               "lea 0x10(%1),%1 \n"
+               "movdqu %%xmm0,%%xmm1 \n"
+               "movdqu %%xmm2,%%xmm3 \n"
+               "punpcklbw %%xmm0,%%xmm0 \n"
+               "punpckhbw %%xmm1,%%xmm1 \n"
+               "punpcklbw %%xmm5,%%xmm2 \n"
+               "punpckhbw %%xmm5,%%xmm3 \n"
+               "pmulhuw %%xmm2,%%xmm0 \n"
+               "pmulhuw %%xmm3,%%xmm1 \n"
+               "packuswb %%xmm1,%%xmm0 \n"
+               "movdqu %%xmm0,(%2) \n"
+               "lea 0x10(%2),%2 \n"
+               "sub $0x4,%3 \n"
+               "jg 1b \n"
                : "+r"(src_argb),   // %0
                  "+r"(src_argb1),  // %1
                  "+r"(dst_argb),   // %2
@@ -7916,27 +7896,27 @@ void ARGBMultiplyRow_AVX2(const uint8_t* src_argb,
                          const uint8_t* src_argb1,
                          uint8_t* dst_argb,
                          int width) {
-  asm volatile("vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+  asm volatile("vpxor %%ymm5,%%ymm5,%%ymm5 \n"
 
                // 4 pixel loop.
                LABELALIGN
-      "1: \n"
-      "vmovdqu (%0),%%ymm1 \n"
-      "lea 0x20(%0),%0 \n"
-      "vmovdqu (%1),%%ymm3 \n"
-      "lea 0x20(%1),%1 \n"
-      "vpunpcklbw %%ymm1,%%ymm1,%%ymm0 \n"
-      "vpunpckhbw %%ymm1,%%ymm1,%%ymm1 \n"
-      "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
-      "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
-      "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
-      "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
-      "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
-      "vmovdqu %%ymm0,(%2) \n"
-      "lea 0x20(%2),%2 \n"
-      "sub $0x8,%3 \n"
-      "jg 1b \n"
-      "vzeroupper \n"
+               "1: \n"
+               "vmovdqu (%0),%%ymm1 \n"
+               "lea 0x20(%0),%0 \n"
+               "vmovdqu (%1),%%ymm3 \n"
+               "lea 0x20(%1),%1 \n"
+               "vpunpcklbw %%ymm1,%%ymm1,%%ymm0 \n"
+               "vpunpckhbw %%ymm1,%%ymm1,%%ymm1 \n"
+               "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
+               "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
+               "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
+               "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
+               "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+               "vmovdqu %%ymm0,(%2) \n"
+               "lea 0x20(%2),%2 \n"
+               "sub $0x8,%3 \n"
+               "jg 1b \n"
+               "vzeroupper \n"
                : "+r"(src_argb),   // %0
                  "+r"(src_argb1),  // %1
                  "+r"(dst_argb),   // %2
@@ -8763,20 +8743,20 @@ void ARGBShuffleRow_SSSE3(const uint8_t* src_argb,
                          uint8_t* dst_argb,
                          const uint8_t* shuffler,
                          int width) {
-  asm volatile("movdqu (%3),%%xmm5 \n"
+  asm volatile("movdqu (%3),%%xmm5 \n"
 
                LABELALIGN
-      "1: \n"
-      "movdqu (%0),%%xmm0 \n"
-      "movdqu 0x10(%0),%%xmm1 \n"
-      "lea 0x20(%0),%0 \n"
-      "pshufb %%xmm5,%%xmm0 \n"
-      "pshufb %%xmm5,%%xmm1 \n"
-      "movdqu %%xmm0,(%1) \n"
-      "movdqu %%xmm1,0x10(%1) \n"
-      "lea 0x20(%1),%1 \n"
-      "sub $0x8,%2 \n"
-      "jg 1b \n"
+               "1: \n"
+               "movdqu (%0),%%xmm0 \n"
+               "movdqu 0x10(%0),%%xmm1 \n"
+               "lea 0x20(%0),%0 \n"
+               "pshufb %%xmm5,%%xmm0 \n"
+               "pshufb %%xmm5,%%xmm1 \n"
+               "movdqu %%xmm0,(%1) \n"
+               "movdqu %%xmm1,0x10(%1) \n"
+               "lea 0x20(%1),%1 \n"
+               "sub $0x8,%2 \n"
+               "jg 1b \n"
                : "+r"(src_argb),  // %0
                  "+r"(dst_argb),  // %1
                  "+r"(width)      // %2
@@ -8791,21 +8771,21 @@ void ARGBShuffleRow_AVX2(const uint8_t* src_argb,
                         uint8_t* dst_argb,
                         const uint8_t* shuffler,
                         int width) {
-  asm volatile("vbroadcastf128 (%3),%%ymm5 \n"
+  asm volatile("vbroadcastf128 (%3),%%ymm5 \n"
 
                LABELALIGN
-      "1: \n"
-      "vmovdqu (%0),%%ymm0 \n"
-      "vmovdqu 0x20(%0),%%ymm1 \n"
-      "lea 0x40(%0),%0 \n"
-      "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
-      "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
-      "vmovdqu %%ymm0,(%1) \n"
-      "vmovdqu %%ymm1,0x20(%1) \n"
-      "lea 0x40(%1),%1 \n"
-      "sub $0x10,%2 \n"
-      "jg 1b \n"
-      "vzeroupper \n"
+               "1: \n"
+               "vmovdqu (%0),%%ymm0 \n"
+               "vmovdqu 0x20(%0),%%ymm1 \n"
+               "lea 0x40(%0),%0 \n"
+               "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
+               "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
+               "vmovdqu %%ymm0,(%1) \n"
+               "vmovdqu %%ymm1,0x20(%1) \n"
+               "lea 0x40(%1),%1 \n"
+               "sub $0x10,%2 \n"
+               "jg 1b \n"
+               "vzeroupper \n"
                : "+r"(src_argb),  // %0
                  "+r"(dst_argb),  // %1
                  "+r"(width)      // %2
@@ -8820,24 +8800,24 @@ void I422ToYUY2Row_SSE2(const uint8_t* src_y,
                        const uint8_t* src_v,
                        uint8_t* dst_yuy2,
                        int width) {
-  asm volatile("sub %1,%2 \n"
+  asm volatile("sub %1,%2 \n"
 
                LABELALIGN
-      "1: \n"
-      "movq (%1),%%xmm2 \n"
-      "movq 0x00(%1,%2,1),%%xmm1 \n"
-      "add $0x8,%1 \n"
-      "punpcklbw %%xmm1,%%xmm2 \n"
-      "movdqu (%0),%%xmm0 \n"
-      "add $0x10,%0 \n"
-      "movdqa %%xmm0,%%xmm1 \n"
-      "punpcklbw %%xmm2,%%xmm0 \n"
-      "punpckhbw %%xmm2,%%xmm1 \n"
-      "movdqu %%xmm0,(%3) \n"
-      "movdqu %%xmm1,0x10(%3) \n"
-      "lea 0x20(%3),%3 \n"
-      "sub $0x10,%4 \n"
-      "jg 1b \n"
+               "1: \n"
+               "movq (%1),%%xmm2 \n"
+               "movq 0x00(%1,%2,1),%%xmm1 \n"
+               "add $0x8,%1 \n"
+               "punpcklbw %%xmm1,%%xmm2 \n"
+               "movdqu (%0),%%xmm0 \n"
+               "add $0x10,%0 \n"
+               "movdqa %%xmm0,%%xmm1 \n"
+               "punpcklbw %%xmm2,%%xmm0 \n"
+               "punpckhbw %%xmm2,%%xmm1 \n"
+               "movdqu %%xmm0,(%3) \n"
+               "movdqu %%xmm1,0x10(%3) \n"
+               "lea 0x20(%3),%3 \n"
+               "sub $0x10,%4 \n"
+               "jg 1b \n"
                : "+r"(src_y),  // %0
                  "+r"(src_u),  // %1
                  "+r"(src_v),  // %2
@@ -8854,24 +8834,24 @@ void I422ToUYVYRow_SSE2(const uint8_t* src_y,
                        const uint8_t* src_v,
                        uint8_t* dst_uyvy,
                        int width) {
-  asm volatile("sub %1,%2 \n"
+  asm volatile("sub %1,%2 \n"
 
                LABELALIGN
-      "1: \n"
-      "movq (%1),%%xmm2 \n"
-      "movq 0x00(%1,%2,1),%%xmm1 \n"
-      "add $0x8,%1 \n"
-      "punpcklbw %%xmm1,%%xmm2 \n"
-      "movdqu (%0),%%xmm0 \n"
-      "movdqa %%xmm2,%%xmm1 \n"
-      "add $0x10,%0 \n"
-      "punpcklbw %%xmm0,%%xmm1 \n"
-      "punpckhbw %%xmm0,%%xmm2 \n"
-      "movdqu %%xmm1,(%3) \n"
-      "movdqu %%xmm2,0x10(%3) \n"
-      "lea 0x20(%3),%3 \n"
-      "sub $0x10,%4 \n"
-      "jg 1b \n"
+               "1: \n"
+               "movq (%1),%%xmm2 \n"
+               "movq 0x00(%1,%2,1),%%xmm1 \n"
+               "add $0x8,%1 \n"
+               "punpcklbw %%xmm1,%%xmm2 \n"
+               "movdqu (%0),%%xmm0 \n"
+               "movdqa %%xmm2,%%xmm1 \n"
+               "add $0x10,%0 \n"
+               "punpcklbw %%xmm0,%%xmm1 \n"
+               "punpckhbw %%xmm0,%%xmm2 \n"
+               "movdqu %%xmm1,(%3) \n"
+               "movdqu %%xmm2,0x10(%3) \n"
+               "lea 0x20(%3),%3 \n"
+               "sub $0x10,%4 \n"
+               "jg 1b \n"
                : "+r"(src_y),  // %0
                  "+r"(src_u),  // %1
                  "+r"(src_v),  // %2
@@ -8888,27 +8868,27 @@ void I422ToYUY2Row_AVX2(const uint8_t* src_y,
                        const uint8_t* src_v,
                        uint8_t* dst_yuy2,
                        int width) {
-  asm volatile("sub %1,%2 \n"
+  asm volatile("sub %1,%2 \n"
 
                LABELALIGN
-      "1: \n"
-      "vpmovzxbw (%1),%%ymm1 \n"
-      "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n"
-      "add $0x10,%1 \n"
-      "vpsllw $0x8,%%ymm2,%%ymm2 \n"
-      "vpor %%ymm1,%%ymm2,%%ymm2 \n"
-      "vmovdqu (%0),%%ymm0 \n"
-      "add $0x20,%0 \n"
-      "vpunpcklbw %%ymm2,%%ymm0,%%ymm1 \n"
-      "vpunpckhbw %%ymm2,%%ymm0,%%ymm2 \n"
-      "vextractf128 $0x0,%%ymm1,(%3) \n"
-      "vextractf128 $0x0,%%ymm2,0x10(%3) \n"
-      "vextractf128 $0x1,%%ymm1,0x20(%3) \n"
-      "vextractf128 $0x1,%%ymm2,0x30(%3) \n"
-      "lea 0x40(%3),%3 \n"
-      "sub $0x20,%4 \n"
-      "jg 1b \n"
-      "vzeroupper \n"
+               "1: \n"
+               "vpmovzxbw (%1),%%ymm1 \n"
+               "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n"
+               "add $0x10,%1 \n"
+               "vpsllw $0x8,%%ymm2,%%ymm2 \n"
+               "vpor %%ymm1,%%ymm2,%%ymm2 \n"
+               "vmovdqu (%0),%%ymm0 \n"
+               "add $0x20,%0 \n"
+               "vpunpcklbw %%ymm2,%%ymm0,%%ymm1 \n"
+               "vpunpckhbw %%ymm2,%%ymm0,%%ymm2 \n"
+               "vextractf128 $0x0,%%ymm1,(%3) \n"
+               "vextractf128 $0x0,%%ymm2,0x10(%3) \n"
+               "vextractf128 $0x1,%%ymm1,0x20(%3) \n"
+               "vextractf128 $0x1,%%ymm2,0x30(%3) \n"
+               "lea 0x40(%3),%3 \n"
+               "sub $0x20,%4 \n"
+               "jg 1b \n"
+               "vzeroupper \n"
                : "+r"(src_y),  // %0
                  "+r"(src_u),  // %1
                  "+r"(src_v),  // %2
@@ -8925,27 +8905,27 @@ void I422ToUYVYRow_AVX2(const uint8_t* src_y,
                        const uint8_t* src_v,
                        uint8_t* dst_uyvy,
                        int width) {
-  asm volatile("sub %1,%2 \n"
+  asm volatile("sub %1,%2 \n"
 
                LABELALIGN
-      "1: \n"
-      "vpmovzxbw (%1),%%ymm1 \n"
-      "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n"
-      "add $0x10,%1 \n"
-      "vpsllw $0x8,%%ymm2,%%ymm2 \n"
-      "vpor %%ymm1,%%ymm2,%%ymm2 \n"
-      "vmovdqu (%0),%%ymm0 \n"
-      "add $0x20,%0 \n"
-      "vpunpcklbw %%ymm0,%%ymm2,%%ymm1 \n"
-      "vpunpckhbw %%ymm0,%%ymm2,%%ymm2 \n"
-      "vextractf128 $0x0,%%ymm1,(%3) \n"
-      "vextractf128 $0x0,%%ymm2,0x10(%3) \n"
-      "vextractf128 $0x1,%%ymm1,0x20(%3) \n"
-      "vextractf128 $0x1,%%ymm2,0x30(%3) \n"
-      "lea 0x40(%3),%3 \n"
-      "sub $0x20,%4 \n"
-      "jg 1b \n"
-      "vzeroupper \n"
+               "1: \n"
+               "vpmovzxbw (%1),%%ymm1 \n"
+               "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n"
+               "add $0x10,%1 \n"
+               "vpsllw $0x8,%%ymm2,%%ymm2 \n"
+               "vpor %%ymm1,%%ymm2,%%ymm2 \n"
+               "vmovdqu (%0),%%ymm0 \n"
+               "add $0x20,%0 \n"
+               "vpunpcklbw %%ymm0,%%ymm2,%%ymm1 \n"
+               "vpunpckhbw %%ymm0,%%ymm2,%%ymm2 \n"
+               "vextractf128 $0x0,%%ymm1,(%3) \n"
+               "vextractf128 $0x0,%%ymm2,0x10(%3) \n"
+               "vextractf128 $0x1,%%ymm1,0x20(%3) \n"
+               "vextractf128 $0x1,%%ymm2,0x30(%3) \n"
+               "lea 0x40(%3),%3 \n"
+               "sub $0x20,%4 \n"
+               "jg 1b \n"
+               "vzeroupper \n"
                : "+r"(src_y),  // %0
                  "+r"(src_u),  // %1
                  "+r"(src_v),  // %2
@@ -8961,47 +8941,47 @@ void ARGBPolynomialRow_SSE2(const uint8_t* src_argb,
                            uint8_t* dst_argb,
                            const float* poly,
                            int width) {
-  asm volatile("pxor %%xmm3,%%xmm3 \n"
+  asm volatile("pxor %%xmm3,%%xmm3 \n"
 
                // 2 pixel loop.
                LABELALIGN
-      "1: \n"
-      "movq (%0),%%xmm0 \n"
-      "lea 0x8(%0),%0 \n"
-      "punpcklbw %%xmm3,%%xmm0 \n"
-      "movdqa %%xmm0,%%xmm4 \n"
-      "punpcklwd %%xmm3,%%xmm0 \n"
-      "punpckhwd %%xmm3,%%xmm4 \n"
-      "cvtdq2ps %%xmm0,%%xmm0 \n"
-      "cvtdq2ps %%xmm4,%%xmm4 \n"
-      "movdqa %%xmm0,%%xmm1 \n"
-      "movdqa %%xmm4,%%xmm5 \n"
-      "mulps 0x10(%3),%%xmm0 \n"
-      "mulps 0x10(%3),%%xmm4 \n"
-      "addps (%3),%%xmm0 \n"
-      "addps (%3),%%xmm4 \n"
-      "movdqa %%xmm1,%%xmm2 \n"
-      "movdqa %%xmm5,%%xmm6 \n"
-      "mulps %%xmm1,%%xmm2 \n"
-      "mulps %%xmm5,%%xmm6 \n"
-      "mulps %%xmm2,%%xmm1 \n"
-      "mulps %%xmm6,%%xmm5 \n"
-      "mulps 0x20(%3),%%xmm2 \n"
-      "mulps 0x20(%3),%%xmm6 \n"
-      "mulps 0x30(%3),%%xmm1 \n"
-      "mulps 0x30(%3),%%xmm5 \n"
-      "addps %%xmm2,%%xmm0 \n"
-      "addps %%xmm6,%%xmm4 \n"
-      "addps %%xmm1,%%xmm0 \n"
-      "addps %%xmm5,%%xmm4 \n"
-      "cvttps2dq %%xmm0,%%xmm0 \n"
-      "cvttps2dq %%xmm4,%%xmm4 \n"
-      "packuswb %%xmm4,%%xmm0 \n"
-      "packuswb %%xmm0,%%xmm0 \n"
-      "movq %%xmm0,(%1) \n"
-      "lea 0x8(%1),%1 \n"
-      "sub $0x2,%2 \n"
-      "jg 1b \n"
+               "1: \n"
+               "movq (%0),%%xmm0 \n"
+               "lea 0x8(%0),%0 \n"
+               "punpcklbw %%xmm3,%%xmm0 \n"
+               "movdqa %%xmm0,%%xmm4 \n"
+               "punpcklwd %%xmm3,%%xmm0 \n"
+               "punpckhwd %%xmm3,%%xmm4 \n"
+               "cvtdq2ps %%xmm0,%%xmm0 \n"
+               "cvtdq2ps %%xmm4,%%xmm4 \n"
+               "movdqa %%xmm0,%%xmm1 \n"
+               "movdqa %%xmm4,%%xmm5 \n"
+               "mulps 0x10(%3),%%xmm0 \n"
+               "mulps 0x10(%3),%%xmm4 \n"
+               "addps (%3),%%xmm0 \n"
+               "addps (%3),%%xmm4 \n"
+               "movdqa %%xmm1,%%xmm2 \n"
+               "movdqa %%xmm5,%%xmm6 \n"
+               "mulps %%xmm1,%%xmm2 \n"
+               "mulps %%xmm5,%%xmm6 \n"
+               "mulps %%xmm2,%%xmm1 \n"
+               "mulps %%xmm6,%%xmm5 \n"
+               "mulps 0x20(%3),%%xmm2 \n"
+               "mulps 0x20(%3),%%xmm6 \n"
+               "mulps 0x30(%3),%%xmm1 \n"
+               "mulps 0x30(%3),%%xmm5 \n"
+               "addps %%xmm2,%%xmm0 \n"
+               "addps %%xmm6,%%xmm4 \n"
+               "addps %%xmm1,%%xmm0 \n"
+               "addps %%xmm5,%%xmm4 \n"
+               "cvttps2dq %%xmm0,%%xmm0 \n"
+               "cvttps2dq %%xmm4,%%xmm4 \n"
+               "packuswb %%xmm4,%%xmm0 \n"
+               "packuswb %%xmm0,%%xmm0 \n"
+               "movq %%xmm0,(%1) \n"
+               "lea 0x8(%1),%1 \n"
+               "sub $0x2,%2 \n"
+               "jg 1b \n"
                : "+r"(src_argb),  // %0
                  "+r"(dst_argb),  // %1
                  "+r"(width)      // %2
@@ -9499,20 +9479,20 @@ static const uvec8 kShuffleUVToVU = {1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u,
 
 // Convert UV plane of NV12 to VU of NV21.
 void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
-  asm volatile("movdqu %3,%%xmm5 \n"
+  asm volatile("movdqu %3,%%xmm5 \n"
 
                LABELALIGN
-      "1: \n"
-      "movdqu (%0),%%xmm0 \n"
-      "movdqu 0x10(%0),%%xmm1 \n"
-      "lea 0x20(%0),%0 \n"
-      "pshufb %%xmm5,%%xmm0 \n"
-      "pshufb %%xmm5,%%xmm1 \n"
-      "movdqu %%xmm0,(%1) \n"
-      "movdqu %%xmm1,0x10(%1) \n"
-      "lea 0x20(%1),%1 \n"
-      "sub $0x10,%2 \n"
-      "jg 1b \n"
+               "1: \n"
+               "movdqu (%0),%%xmm0 \n"
+               "movdqu 0x10(%0),%%xmm1 \n"
+               "lea 0x20(%0),%0 \n"
+               "pshufb %%xmm5,%%xmm0 \n"
+               "pshufb %%xmm5,%%xmm1 \n"
+               "movdqu %%xmm0,(%1) \n"
+               "movdqu %%xmm1,0x10(%1) \n"
+               "lea 0x20(%1),%1 \n"
+               "sub $0x10,%2 \n"
+               "jg 1b \n"
                : "+r"(src_uv),  // %0
                  "+r"(dst_vu),  // %1
                  "+r"(width)    // %2
@@ -9523,21 +9503,21 @@ void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
 
 #ifdef HAS_SWAPUVROW_AVX2
 void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
-  asm volatile("vbroadcastf128 %3,%%ymm5 \n"
+  asm volatile("vbroadcastf128 %3,%%ymm5 \n"
 
                LABELALIGN
-      "1: \n"
-      "vmovdqu (%0),%%ymm0 \n"
-      "vmovdqu 0x20(%0),%%ymm1 \n"
-      "lea 0x40(%0),%0 \n"
-      "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
-      "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
-      "vmovdqu %%ymm0,(%1) \n"
-      "vmovdqu %%ymm1,0x20(%1) \n"
-      "lea 0x40(%1),%1 \n"
-      "sub $0x20,%2 \n"
-      "jg 1b \n"
-      "vzeroupper \n"
+               "1: \n"
+               "vmovdqu (%0),%%ymm0 \n"
+               "vmovdqu 0x20(%0),%%ymm1 \n"
+               "lea 0x40(%0),%0 \n"
+               "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
+               "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
+               "vmovdqu %%ymm0,(%1) \n"
+               "vmovdqu %%ymm1,0x20(%1) \n"
+               "lea 0x40(%1),%1 \n"
+               "sub $0x20,%2 \n"
+               "jg 1b \n"
+               "vzeroupper \n"
                : "+r"(src_uv),  // %0
                  "+r"(dst_vu),  // %1
                  "+r"(width)    // %2
diff --git a/source/row_neon.cc b/source/row_neon.cc
index cb86b3f42..0a1a83d1d 100644
--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@@ -263,7 +263,7 @@ void I422ToRGBARow_NEON(const uint8_t* src_y,
       "vmov.u8 d6, #255 \n"
       "1: \n" READYUV422
      "subs %[width], %[width], #8 \n" YUVTORGB RGBTORGB8
-          STORERGBA "bgt 1b \n"
+          STORERGBA "bgt 1b \n"
      : [src_y] "+r"(src_y),  // %[src_y]
        [src_u] "+r"(src_u),  // %[src_u]
        [src_v] "+r"(src_v),  // %[src_v]
@@ -379,8 +379,7 @@ void I422ToARGB4444Row_NEON(const uint8_t* src_y,
       YUVTORGB_SETUP
       "vmov.u8 d6, #255 \n"
       "vmov.u8 d7, #0x0f \n"  // vbic bits to clear
-      "1: \n" READYUV422 YUVTORGB
-          RGBTORGB8
+      "1: \n" READYUV422 YUVTORGB RGBTORGB8
       "subs %[width], %[width], #8 \n" ARGBTOARGB4444
       "vst1.8 {q0}, [%[dst_argb4444]]! \n"  // store 8 pixels
       "bgt 1b \n"
@@ -401,8 +400,7 @@ void I400ToARGBRow_NEON(const uint8_t* src_y,
   asm volatile(
       YUVTORGB_SETUP
       "vmov.u8 d6, #255 \n"
-      "1: \n" READYUV400 YUVTORGB
-          RGBTORGB8
+      "1: \n" READYUV400 YUVTORGB RGBTORGB8
       "subs %[width], %[width], #8 \n"
       "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n"
       "bgt 1b \n"
diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index 8ec539b4e..4b6947e18 100644
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -289,11 +289,11 @@ void I210ToAR30Row_NEON(const uint16_t* src_y,
   uint16_t limit = 0x3ff0;
   uint16_t alpha = 0xc000;
   asm volatile(YUVTORGB_SETUP
-      "dup v22.8h, %w[limit] \n"
-      "dup v23.8h, %w[alpha] \n"
-      "1: \n" READYUV210
-      "subs %w[width], %w[width], #8 \n" NVTORGB STOREAR30
-      "b.gt 1b \n"
+               "dup v22.8h, %w[limit] \n"
+               "dup v23.8h, %w[alpha] \n"
+               "1: \n" READYUV210
+               "subs %w[width], %w[width], #8 \n" NVTORGB STOREAR30
+               "b.gt 1b \n"
       : [src_y] "+r"(src_y),  // %[src_y]
        [src_u] "+r"(src_u),  // %[src_u]
        [src_v] "+r"(src_v),  // %[src_v]
@@ -317,11 +317,11 @@ void I410ToAR30Row_NEON(const uint16_t* src_y,
   uint16_t limit = 0x3ff0;
   uint16_t alpha = 0xc000;
   asm volatile(YUVTORGB_SETUP
-      "dup v22.8h, %w[limit] \n"
-      "dup v23.8h, %w[alpha] \n"
-      "1: \n" READYUV410
-      "subs %w[width], %w[width], #8 \n" NVTORGB STOREAR30
-      "b.gt 1b \n"
+               "dup v22.8h, %w[limit] \n"
+               "dup v23.8h, %w[alpha] \n"
+               "1: \n" READYUV410
+               "subs %w[width], %w[width], #8 \n" NVTORGB STOREAR30
+               "b.gt 1b \n"
       : [src_y] "+r"(src_y),  // %[src_y]
        [src_u] "+r"(src_u),  // %[src_u]
        [src_v] "+r"(src_v),  // %[src_v]
@@ -344,11 +344,11 @@ void I212ToAR30Row_NEON(const uint16_t* src_y,
   const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
   const uint16_t limit = 0x3ff0;
   asm volatile(YUVTORGB_SETUP
-      "dup v22.8h, %w[limit] \n"
-      "movi v23.8h, #0xc0, lsl #8 \n"  // A
-      "1: \n" READYUV212
-      "subs %w[width], %w[width], #8 \n" NVTORGB STOREAR30
-      "b.gt 1b \n"
+               "dup v22.8h, %w[limit] \n"
+               "movi v23.8h, #0xc0, lsl #8 \n"  // A
+               "1: \n" READYUV212
+               "subs %w[width], %w[width], #8 \n" NVTORGB STOREAR30
+               "b.gt 1b \n"
       : [src_y] "+r"(src_y),  // %[src_y]
        [src_u] "+r"(src_u),  // %[src_u]
        [src_v] "+r"(src_v),  // %[src_v]
@@ -521,12 +521,12 @@ void P210ToAR30Row_NEON(const uint16_t* src_y,
   const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
   const uint16_t limit = 0x3ff0;
   asm volatile(YUVTORGB_SETUP
-      "dup v22.8h, %w[limit] \n"
-      "movi v23.8h, #0xc0, lsl #8 \n"  // A
-      "ldr q2, [%[kIndices]] \n"
-      "1: \n" READYUVP210
-      "subs %w[width], %w[width], #8 \n" NVTORGB STOREAR30
-      "b.gt 1b \n"
+               "dup v22.8h, %w[limit] \n"
+               "movi v23.8h, #0xc0, lsl #8 \n"  // A
+               "ldr q2, [%[kIndices]] \n"
+               "1: \n" READYUVP210
+               "subs %w[width], %w[width], #8 \n" NVTORGB STOREAR30
+               "b.gt 1b \n"
       : [src_y] "+r"(src_y),        // %[src_y]
        [src_uv] "+r"(src_uv),      // %[src_uv]
        [dst_ar30] "+r"(dst_ar30),  // %[dst_ar30]
@@ -547,12 +547,12 @@ void P410ToAR30Row_NEON(const uint16_t* src_y,
   const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
   uint16_t limit = 0x3ff0;
   asm volatile(YUVTORGB_SETUP
-      "dup v22.8h, %w[limit] \n"
-      "movi v23.8h, #0xc0, lsl #8 \n"  // A
-      "ldr q2, [%[kIndices]] \n"
-      "1: \n" READYUVP410
-      "subs %w[width], %w[width], #8 \n" NVTORGB STOREAR30
-      "b.gt 1b \n"
+               "dup v22.8h, %w[limit] \n"
+               "movi v23.8h, #0xc0, lsl #8 \n"  // A
+               "ldr q2, [%[kIndices]] \n"
+               "1: \n" READYUVP410
+               "subs %w[width], %w[width], #8 \n" NVTORGB STOREAR30
+               "b.gt 1b \n"
       : [src_y] "+r"(src_y),        // %[src_y]
        [src_uv] "+r"(src_uv),      // %[src_uv]
        [dst_ar30] "+r"(dst_ar30),  // %[dst_ar30]
@@ -808,7 +808,7 @@ void I422ToARGB1555Row_NEON(const uint8_t* src_y,
   asm volatile(
       YUVTORGB_SETUP
       "movi v19.8h, #0x80, lsl #8 \n"
-      "1: \n"  //
+      "1: \n"  //
       READYUV422 "subs %w[width], %w[width], #8 \n"  //
       I4XXTORGB RGBTORGB8_TOP ARGBTOARGB1555_FROM_TOP
       "st1 {v19.8h}, [%[dst_argb1555]], #16 \n"  // store 8 pixels
@@ -2768,7 +2768,7 @@ static void ARGBToUV444MatrixRow_NEON_I8MM(
     uint8_t* dst_v,
     int width,
     const struct RgbUVConstants* rgbuvconstants) {
-  asm("ld2r {v16.4s, v17.4s}, [%[rgbuvconstants]] \n"
+  asm("ld2r {v16.4s, v17.4s}, [%[rgbuvconstants]] \n"
      "movi v29.16b, #0x80 \n"  // 128.5
      "1: \n"
      "ldp q0, q1, [%[src]], #32 \n"
diff --git a/source/scale_gcc.cc b/source/scale_gcc.cc
index f699a49bf..bab60a213 100644
--- a/source/scale_gcc.cc
+++ b/source/scale_gcc.cc
@@ -1761,25 +1761,25 @@ void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
 void ScaleAddRow_SSE2(const uint8_t* src_ptr,
                      uint16_t* dst_ptr,
                      int src_width) {
-  asm volatile("pxor %%xmm5,%%xmm5 \n"
+  asm volatile("pxor %%xmm5,%%xmm5 \n"
 
                // 16 pixel loop.
                LABELALIGN
-      "1: \n"
-      "movdqu (%0),%%xmm3 \n"
-      "lea 0x10(%0),%0 \n"  // src_ptr += 16
-      "movdqu (%1),%%xmm0 \n"
-      "movdqu 0x10(%1),%%xmm1 \n"
-      "movdqa %%xmm3,%%xmm2 \n"
-      "punpcklbw %%xmm5,%%xmm2 \n"
-      "punpckhbw %%xmm5,%%xmm3 \n"
-      "paddusw %%xmm2,%%xmm0 \n"
-      "paddusw %%xmm3,%%xmm1 \n"
-      "movdqu %%xmm0,(%1) \n"
-      "movdqu %%xmm1,0x10(%1) \n"
-      "lea 0x20(%1),%1 \n"
-      "sub $0x10,%2 \n"
-      "jg 1b \n"
+               "1: \n"
+               "movdqu (%0),%%xmm3 \n"
+               "lea 0x10(%0),%0 \n"  // src_ptr += 16
+               "movdqu (%1),%%xmm0 \n"
+               "movdqu 0x10(%1),%%xmm1 \n"
+               "movdqa %%xmm3,%%xmm2 \n"
+               "punpcklbw %%xmm5,%%xmm2 \n"
+               "punpckhbw %%xmm5,%%xmm3 \n"
+               "paddusw %%xmm2,%%xmm0 \n"
+               "paddusw %%xmm3,%%xmm1 \n"
+               "movdqu %%xmm0,(%1) \n"
+               "movdqu %%xmm1,0x10(%1) \n"
+               "lea 0x20(%1),%1 \n"
+               "sub $0x10,%2 \n"
+               "jg 1b \n"
                : "+r"(src_ptr),   // %0
                  "+r"(dst_ptr),   // %1
                  "+r"(src_width)  // %2
@@ -1792,23 +1792,23 @@ void ScaleAddRow_SSE2(const uint8_t* src_ptr,
 void ScaleAddRow_AVX2(const uint8_t* src_ptr,
                      uint16_t* dst_ptr,
                      int src_width) {
-  asm volatile("vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+  asm volatile("vpxor %%ymm5,%%ymm5,%%ymm5 \n"
 
                LABELALIGN
-      "1: \n"
-      "vmovdqu (%0),%%ymm3 \n"
-      "lea 0x20(%0),%0 \n"  // src_ptr += 32
-      "vpermq $0xd8,%%ymm3,%%ymm3 \n"
-      "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
-      "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
-      "vpaddusw (%1),%%ymm2,%%ymm0 \n"
-      "vpaddusw 0x20(%1),%%ymm3,%%ymm1 \n"
-      "vmovdqu %%ymm0,(%1) \n"
-      "vmovdqu %%ymm1,0x20(%1) \n"
-      "lea 0x40(%1),%1 \n"
-      "sub $0x20,%2 \n"
-      "jg 1b \n"
-      "vzeroupper \n"
+               "1: \n"
+               "vmovdqu (%0),%%ymm3 \n"
+               "lea 0x20(%0),%0 \n"  // src_ptr += 32
+               "vpermq $0xd8,%%ymm3,%%ymm3 \n"
+               "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
+               "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
+               "vpaddusw (%1),%%ymm2,%%ymm0 \n"
+               "vpaddusw 0x20(%1),%%ymm3,%%ymm1 \n"
+               "vmovdqu %%ymm0,(%1) \n"
+               "vmovdqu %%ymm1,0x20(%1) \n"
+               "lea 0x40(%1),%1 \n"
+               "sub $0x20,%2 \n"
+               "jg 1b \n"
+               "vzeroupper \n"
                : "+r"(src_ptr),   // %0
                  "+r"(dst_ptr),   // %1
                  "+r"(src_width)  // %2