From 94417b9d213364905ce849c25719b819b8dbbaaa Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Fri, 26 Sep 2025 12:19:35 -0700 Subject: [PATCH] Pass rgbconstants via struct pointer instead of elements with m Now 66 instructions SYM ARGBToUVRow_SSSE3: 62ccd0: BASE push ebp 62ccd1: BASE mov ebp, esp 62ccd3: BASE push ebx 62ccd4: BASE push edi 62ccd5: BASE push esi 62ccd6: BASE and esp, 0xfffffffc 62ccd9: BASE sub esp, 0xc 62ccdc: BASE call 0x62cce1 62cce1: BASE pop eax 62cce2: BASE add eax, 0xe1c27 62cce8: BASE mov ecx, dword ptr [ebp+0xc] 62cceb: BASE mov edx, dword ptr [ebp+0x8] 62ccee: BASE mov esi, dword ptr [ebp+0x10] 62ccf1: BASE mov edi, dword ptr [ebp+0x18] 62ccf4: BASE mov dword ptr [esp+0x8], edi 62ccf8: BASE mov edi, dword ptr [ebp+0x14] 62ccfb: BASE lea ebx, ptr [eax-0x5ecf88] 62cd01: SSE2 movdqa xmm4, xmmword ptr [ebx] 62cd05: SSE2 movdqa xmm5, xmmword ptr [ebx+0x10] 62cd0a: SSE2 pcmpeqb xmm6, xmm6 62cd0e: SSSE3 pabsb xmm6, xmm6 62cd13: SSE2 movdqa xmm7, xmmword ptr [eax-0x5ecfa8] 62cd1b: BASE sub edi, esi 62cd1d: SSE2 movdqu xmm0, xmmword ptr [edx] 62cd21: SSE2 movdqu xmm1, xmmword ptr [edx+0x10] 62cd26: SSE2 movdqu xmm2, xmmword ptr [edx+ecx*1] 62cd2b: SSE2 movdqu xmm3, xmmword ptr [edx+ecx*1+0x10] 62cd31: SSSE3 pshufb xmm0, xmm7 62cd36: SSSE3 pshufb xmm1, xmm7 62cd3b: SSSE3 pshufb xmm2, xmm7 62cd40: SSSE3 pshufb xmm3, xmm7 62cd45: SSSE3 pmaddubsw xmm0, xmm6 62cd4a: SSSE3 pmaddubsw xmm1, xmm6 62cd4f: SSSE3 pmaddubsw xmm2, xmm6 62cd54: SSSE3 pmaddubsw xmm3, xmm6 62cd59: SSE2 paddw xmm0, xmm2 62cd5d: SSE2 paddw xmm1, xmm3 62cd61: SSE2 pxor xmm2, xmm2 62cd65: SSE2 psrlw xmm0, 0x1 62cd6a: SSE2 psrlw xmm1, 0x1 62cd6f: SSE2 pavgw xmm0, xmm2 62cd73: SSE2 pavgw xmm1, xmm2 62cd77: SSE2 packuswb xmm0, xmm1 62cd7b: SSE2 movdqa xmm2, xmm6 62cd7f: SSE2 psllw xmm2, 0xf 62cd84: SSE2 movdqa xmm1, xmm0 62cd88: SSSE3 pmaddubsw xmm1, xmm5 62cd8d: SSSE3 pmaddubsw xmm0, xmm4 62cd92: SSSE3 phaddw xmm0, xmm1 62cd97: SSE2 psubw xmm2, xmm0 62cd9b: SSE2 psrlw xmm2, 0x8 
62cda0: SSE2 packuswb xmm2, xmm2 62cda4: SSE2 movd dword ptr [esi], xmm2 62cda8: SSE2 pshufd xmm2, xmm2, 0x55 62cdad: SSE2 movd dword ptr [esi+edi*1], xmm2 62cdb2: BASE lea edx, ptr [edx+0x20] 62cdb5: BASE lea esi, ptr [esi+0x4] 62cdb8: BASE sub dword ptr [esp+0x8], 0x8 62cdbd: BASE jnle 0x62cd1d 62cdc3: BASE lea esp, ptr [ebp-0xc] 62cdc6: BASE pop esi 62cdc7: BASE pop edi 62cdc8: BASE pop ebx 62cdc9: BASE pop ebp 62cdca: BASE ret Was 68 instructions ARGBToUVRow_SSSE3: 62ccd0: BASE push ebp 62ccd1: BASE mov ebp, esp 62ccd3: BASE push edi 62ccd4: BASE push esi 62ccd5: BASE and esp, 0xfffffff0 62ccd8: BASE sub esp, 0x30 62ccdb: BASE call 0x62cce0 62cce0: BASE pop eax 62cce1: BASE add eax, 0xe1c28 62cce7: BASE mov ecx, dword ptr [ebp+0xc] 62ccea: BASE mov edx, dword ptr [ebp+0x8] 62cced: BASE mov esi, dword ptr [ebp+0x10] 62ccf0: BASE mov edi, dword ptr [ebp+0x18] 62ccf3: BASE mov dword ptr [esp+0xc], edi 62ccf7: BASE mov edi, dword ptr [ebp+0x14] 62ccfa: SSE movaps xmm0, xmmword ptr [eax-0x5ecf88] 62cd01: SSE movaps xmmword ptr [esp+0x20], xmm0 62cd06: SSE movaps xmm0, xmmword ptr [eax-0x5ecf78] 62cd0d: SSE movaps xmmword ptr [esp+0x10], xmm0 62cd12: SSE2 movdqa xmm4, xmmword ptr [esp+0x20] 62cd18: SSE2 movdqa xmm5, xmmword ptr [esp+0x10] 62cd1e: SSE2 pcmpeqb xmm6, xmm6 62cd22: SSSE3 pabsb xmm6, xmm6 62cd27: SSE2 movdqa xmm7, xmmword ptr [eax-0x5ecfa8] 62cd2f: BASE sub edi, esi 62cd31: SSE2 movdqu xmm0, xmmword ptr [edx] 62cd35: SSE2 movdqu xmm1, xmmword ptr [edx+0x10] 62cd3a: SSE2 movdqu xmm2, xmmword ptr [edx+ecx*1] 62cd3f: SSE2 movdqu xmm3, xmmword ptr [edx+ecx*1+0x10] 62cd45: SSSE3 pshufb xmm0, xmm7 62cd4a: SSSE3 pshufb xmm1, xmm7 62cd4f: SSSE3 pshufb xmm2, xmm7 62cd54: SSSE3 pshufb xmm3, xmm7 62cd59: SSSE3 pmaddubsw xmm0, xmm6 62cd5e: SSSE3 pmaddubsw xmm1, xmm6 62cd63: SSSE3 pmaddubsw xmm2, xmm6 62cd68: SSSE3 pmaddubsw xmm3, xmm6 62cd6d: SSE2 paddw xmm0, xmm2 62cd71: SSE2 paddw xmm1, xmm3 62cd75: SSE2 pxor xmm2, xmm2 62cd79: SSE2 psrlw xmm0, 0x1 62cd7e: SSE2 
psrlw xmm1, 0x1 62cd83: SSE2 pavgw xmm0, xmm2 62cd87: SSE2 pavgw xmm1, xmm2 62cd8b: SSE2 packuswb xmm0, xmm1 62cd8f: SSE2 movdqa xmm2, xmm6 62cd93: SSE2 psllw xmm2, 0xf 62cd98: SSE2 movdqa xmm1, xmm0 62cd9c: SSSE3 pmaddubsw xmm1, xmm5 62cda1: SSSE3 pmaddubsw xmm0, xmm4 62cda6: SSSE3 phaddw xmm0, xmm1 62cdab: SSE2 psubw xmm2, xmm0 62cdaf: SSE2 psrlw xmm2, 0x8 62cdb4: SSE2 packuswb xmm2, xmm2 62cdb8: SSE2 movd dword ptr [esi], xmm2 62cdbc: SSE2 pshufd xmm2, xmm2, 0x55 62cdc1: SSE2 movd dword ptr [esi+edi*1], xmm2 62cdc6: BASE lea edx, ptr [edx+0x20] 62cdc9: BASE lea esi, ptr [esi+0x4] 62cdcc: BASE sub dword ptr [esp+0xc], 0x8 62cdd1: BASE jnle 0x62cd31 62cdd7: BASE lea esp, ptr [ebp-0x8] 62cdda: BASE pop esi 62cddb: BASE pop edi 62cddc: BASE pop ebp 62cddd: BASE ret 62cdde: BASE int3 BUG=444157316 Change-Id: Iad044f851359f5b052091c7bdab9b96946fc3682 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6987370 Reviewed-by: Justin Green --- README.chromium | 2 +- include/libyuv/version.h | 2 +- source/row_gcc.cc | 187 +++++++++++++++------------------------ 3 files changed, 75 insertions(+), 116 deletions(-) diff --git a/README.chromium b/README.chromium index 3365e9695..6ad6e8073 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: https://chromium.googlesource.com/libyuv/libyuv/ -Version: 1919 +Version: 1920 License: BSD-3-Clause License File: LICENSE Shipped: yes diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 0fc37aed8..52f5a7cd8 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1919 +#define LIBYUV_VERSION 1920 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/row_gcc.cc b/source/row_gcc.cc index 6461bf728..9af3fc408 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -9,9 +9,6 @@ */ #include "libyuv/row.h" -#if defined(__i386__) && defined(__pic__) 
-#include <string.h> -#endif #ifdef __cplusplus namespace libyuv { extern "C" { @@ -57,9 +54,6 @@ static const uvec16 kAddY16 = {0x7e80u, 0x7e80u, 0x7e80u, 0x7e80u, static const uvec16 kAddY0 = {0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u}; -static const uvec16 kAddUV128 = {0x8000u, 0x8000u, 0x8000u, 0x8000u, - 0x8000u, 0x8000u, 0x8000u, 0x8000u}; - static const uvec16 kSub128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u}; @@ -286,7 +280,7 @@ void RAWToARGBRow_AVX2(const uint8_t* src_raw, uint8_t* dst_argb, int width) { void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { asm volatile( "pcmpeqb %%xmm5,%%xmm5 \n" // 0x000000ff - "psrld $0x18,%%xmm5 \n" + "psrld $24,%%xmm5 \n" "movdqa %3,%%xmm4 \n" LABELALIGN @@ -364,8 +358,8 @@ void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { "pcmpeqb %%xmm3,%%xmm3 \n" "psllw $0xb,%%xmm3 \n" "pcmpeqb %%xmm4,%%xmm4 \n" - "psllw $0xa,%%xmm4 \n" - "psrlw $0x5,%%xmm4 \n" + "psllw $10,%%xmm4 \n" + "psrlw $5,%%xmm4 \n" "pcmpeqb %%xmm7,%%xmm7 \n" "psllw $0x8,%%xmm7 \n" "sub %0,%1 \n" @@ -1592,15 +1586,20 @@ struct RgbUVConstants { vec8 kRGBToV; }; +// Offsets into RgbUVConstants structure +#define KRGBTOU 0 +#define KRGBTOV 16 + void ARGBToUV444MatrixRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width, const struct RgbUVConstants* rgbuvconstants) { asm volatile( - "movdqa %4,%%xmm3 \n" - "movdqa %5,%%xmm4 \n" - "movdqa %6,%%xmm5 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" // 0x8000 + "psllw $15,%%xmm5 \n" + "movdqa 0x0(%4),%%xmm3 \n" // kRGBToU + "movdqa 0x10(%4),%%xmm4 \n" // kRGBToV "sub %1,%2 \n" LABELALIGN @@ -1655,9 +1654,7 @@ void ARGBToUV444MatrixRow_SSSE3(const uint8_t* src_argb, #else "+rm"(width) // %3 #endif - : "m"(rgbuvconstants->kRGBToU), // %4 - "m"(rgbuvconstants->kRGBToV), // %5 - "m"(kAddUV128) // %6 + : "r"(rgbuvconstants) // %4 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } #endif 
// HAS_ARGBTOUV444ROW_SSSE3 @@ -1670,10 +1667,11 @@ void ARGBToUV444MatrixRow_AVX2(const uint8_t* src_argb, int width, const struct RgbUVConstants* rgbuvconstants) { asm volatile( - "vbroadcastf128 %4,%%ymm3 \n" - "vbroadcastf128 %5,%%ymm4 \n" - "vbroadcastf128 %6,%%ymm5 \n" - "vmovdqa %7,%%ymm7 \n" + "vbroadcastf128 0x0(%4),%%ymm3 \n" // kRGBToU + "vbroadcastf128 0x10(%4),%%ymm4 \n" // kRGBToV + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // 0x8000 + "vpsllw $15,%%ymm5,%%ymm5 \n" + "vmovdqa %5,%%ymm7 \n" "sub %1,%2 \n" LABELALIGN @@ -1726,10 +1724,8 @@ void ARGBToUV444MatrixRow_AVX2(const uint8_t* src_argb, #else "+rm"(width) // %3 #endif - : "m"(rgbuvconstants->kRGBToU), // %4 - "m"(rgbuvconstants->kRGBToV), // %5 - "m"(kAddUV128), // %6 - "m"(kPermdARGBToY_AVX) // %7 + : "r"(rgbuvconstants), // %4 + "m"(kPermdARGBToY_AVX) // %5 : "memory", "cc", "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7"); } @@ -1751,60 +1747,48 @@ void ARGBToUVMatrixRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_v, int width, const struct RgbUVConstants* rgbuvconstants) { -#if defined(__i386__) && defined(__pic__) - // i386 + PIC builds: Inline asm may run out of general-purpose registers. - // In PIC, EBX is reserved for the GOT (and with a frame pointer EBP is also - // unavailable), so addressing struct fields via memory operands can require - // extra temporaries that the compiler cannot allocate given the asm constraints. - // To avoid this, copy the RGB-to-UV constants to stack locals first and let the - // asm use simple stack-relative addressing. 
- __attribute__((aligned(16))) vec8 local_kRGBToU = {}; - __attribute__((aligned(16))) vec8 local_kRGBToV = {}; - memcpy(&local_kRGBToU, &rgbuvconstants->kRGBToU, sizeof(local_kRGBToU)); - memcpy(&local_kRGBToV, &rgbuvconstants->kRGBToV, sizeof(local_kRGBToV)); -#endif asm volatile( - "movdqa %5,%%xmm4 \n" // RGBToU - "movdqa %6,%%xmm5 \n" // RGBToV - "pcmpeqb %%xmm6,%%xmm6 \n" // 0x0101 - "pabsb %%xmm6,%%xmm6 \n" - "movdqa %7,%%xmm7 \n" // kShuffleAARRGGBB - "sub %1,%2 \n" + "movdqa 0x0(%5),%%xmm4 \n" // RGBToU + "movdqa 0x10(%5),%%xmm5 \n" // RGBToV + "pcmpeqb %%xmm6,%%xmm6 \n" // 0x0101 + "pabsb %%xmm6,%%xmm6 \n" + "movdqa %6,%%xmm7 \n" // kShuffleAARRGGBB + "sub %1,%2 \n" "1: \n" - "movdqu (%0),%%xmm0 \n" // Read 8x2 ARGB Pixels - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x00(%0,%4,1),%%xmm2 \n" - "movdqu 0x10(%0,%4,1),%%xmm3 \n" - "pshufb %%xmm7,%%xmm0 \n" // aarrggbb - "pshufb %%xmm7,%%xmm1 \n" - "pshufb %%xmm7,%%xmm2 \n" - "pshufb %%xmm7,%%xmm3 \n" - "pmaddubsw %%xmm6,%%xmm0 \n" // 8x2 -> 4x2 - "pmaddubsw %%xmm6,%%xmm1 \n" - "pmaddubsw %%xmm6,%%xmm2 \n" - "pmaddubsw %%xmm6,%%xmm3 \n" - "paddw %%xmm2,%%xmm0 \n" // 4x2 -> 4x1 - "paddw %%xmm3,%%xmm1 \n" - "pxor %%xmm2,%%xmm2 \n" // 0 for vpavgw - "psrlw $1,%%xmm0 \n" - "psrlw $1,%%xmm1 \n" - "pavgw %%xmm2,%%xmm0 \n" - "pavgw %%xmm2,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" // mutates + "movdqu (%0),%%xmm0 \n" // Read 8x2 ARGB Pixels + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x00(%0,%4,1),%%xmm2 \n" + "movdqu 0x10(%0,%4,1),%%xmm3 \n" + "pshufb %%xmm7,%%xmm0 \n" // aarrggbb + "pshufb %%xmm7,%%xmm1 \n" + "pshufb %%xmm7,%%xmm2 \n" + "pshufb %%xmm7,%%xmm3 \n" + "pmaddubsw %%xmm6,%%xmm0 \n" // 8x2 -> 4x2 + "pmaddubsw %%xmm6,%%xmm1 \n" + "pmaddubsw %%xmm6,%%xmm2 \n" + "pmaddubsw %%xmm6,%%xmm3 \n" + "paddw %%xmm2,%%xmm0 \n" // 4x2 -> 4x1 + "paddw %%xmm3,%%xmm1 \n" + "pxor %%xmm2,%%xmm2 \n" // 0 for vpavgw + "psrlw $1,%%xmm0 \n" + "psrlw $1,%%xmm1 \n" + "pavgw %%xmm2,%%xmm0 \n" + "pavgw %%xmm2,%%xmm1 \n" + "packuswb 
%%xmm1,%%xmm0 \n" // mutates - "movdqa %%xmm6,%%xmm2 \n" - "psllw $15,%%xmm2 \n" // 0x8000 - "movdqa %%xmm0,%%xmm1 \n" - "pmaddubsw %%xmm5,%%xmm1 \n" // 4 V - "pmaddubsw %%xmm4,%%xmm0 \n" // 4 U - "phaddw %%xmm1,%%xmm0 \n" // uuuuvvvv - "psubw %%xmm0,%%xmm2 \n" - "psrlw $0x8,%%xmm2 \n" - "packuswb %%xmm2,%%xmm2 \n" - "movd %%xmm2,(%1) \n" // Write 4 U's - "pshufd $0x55,%%xmm2,%%xmm2 \n" // Copy V to low 4 bytes - "movd %%xmm2,0x00(%1,%2,1) \n" // Write 4 V's + "movdqa %%xmm6,%%xmm2 \n" + "psllw $15,%%xmm2 \n" // 0x8000 + "movdqa %%xmm0,%%xmm1 \n" + "pmaddubsw %%xmm5,%%xmm1 \n" // 4 V + "pmaddubsw %%xmm4,%%xmm0 \n" // 4 U + "phaddw %%xmm1,%%xmm0 \n" // uuuuvvvv + "psubw %%xmm0,%%xmm2 \n" + "psrlw $0x8,%%xmm2 \n" + "packuswb %%xmm2,%%xmm2 \n" + "movd %%xmm2,(%1) \n" // Write 4 U's + "pshufd $0x55,%%xmm2,%%xmm2 \n" // Copy V to low 4 bytes + "movd %%xmm2,0x00(%1,%2,1) \n" // Write 4 V's "lea 0x20(%0),%0 \n" "lea 0x4(%1),%1 \n" @@ -1819,16 +1803,10 @@ void ARGBToUVMatrixRow_SSSE3(const uint8_t* src_argb, "+rm"(width) // %3 #endif : "r"((intptr_t)(src_stride_argb)), // %4 -#if defined(__i386__) && defined(__pic__) - "m"(local_kRGBToU), // %5 - "m"(local_kRGBToV), // %6 -#else // defined(__i386__) && defined(__pic__) - "m"(rgbuvconstants->kRGBToU), // %5 - "m"(rgbuvconstants->kRGBToV), // %6 -#endif // defined(__i386__) && defined(__pic__) - "m"(kShuffleAARRGGBB) // %7 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", - "xmm6", "xmm7"); + "r"(rgbuvconstants), // %5 + "m"(kShuffleAARRGGBB) // %6 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #endif // HAS_ARGBTOUVROW_SSSE3 @@ -1838,30 +1816,17 @@ void ARGBToUVMatrixRow_SSSE3(const uint8_t* src_argb, // 16x2 -> 8x1 ARGB pixels converted to 8 U and 8 V // ARGBToUV does rounding average of 4 ARGB pixels void ARGBToUVMatrixRow_AVX2(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width, - const struct RgbUVConstants* 
rgbuvconstants) { -#if defined(__i386__) && defined(__pic__) - // i386 + PIC builds: Inline asm may run out of general-purpose registers. - // In PIC, EBX is reserved for the GOT (and with a frame pointer EBP is also - // unavailable), so addressing struct fields via memory operands can require - // extra temporaries that the compiler cannot allocate given the asm constraints. - // To avoid this, copy the RGB-to-UV constants to stack locals first and let the - // asm use simple stack-relative addressing. - __attribute__((aligned(32))) vec8 local_kRGBToU = {}; - __attribute__((aligned(32))) vec8 local_kRGBToV = {}; - memcpy(&local_kRGBToU, &rgbuvconstants->kRGBToU, sizeof(local_kRGBToU)); - memcpy(&local_kRGBToV, &rgbuvconstants->kRGBToV, sizeof(local_kRGBToV)); -#endif - + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width, + const struct RgbUVConstants* rgbuvconstants) { asm volatile( - "vbroadcastf128 %5,%%ymm4 \n" // RGBToU - "vbroadcastf128 %6,%%ymm5 \n" // RGBToV + "vbroadcastf128 0(%5),%%ymm4 \n" // RGBToU + "vbroadcastf128 0x10(%5),%%ymm5 \n" // RGBToV "vpcmpeqb %%ymm6,%%ymm6,%%ymm6 \n" // 0x0101 "vpabsb %%ymm6,%%ymm6 \n" - "vmovdqa %7,%%ymm7 \n" // kShuffleAARRGGBB + "vmovdqa %6,%%ymm7 \n" // kShuffleAARRGGBB "sub %1,%2 \n" "1: \n" @@ -1913,16 +1878,10 @@ void ARGBToUVMatrixRow_AVX2(const uint8_t* src_argb, "+rm"(width) // %3 #endif : "r"((intptr_t)(src_stride_argb)), // %4 -#if defined(__i386__) && defined(__pic__) - "m"(local_kRGBToU), // %5 - "m"(local_kRGBToV), // %6 -#else - "m"(rgbuvconstants->kRGBToU), // %5 - "m"(rgbuvconstants->kRGBToV), // %6 -#endif // defined(__i386__) && defined(__pic__) - "m"(kShuffleAARRGGBB) // %7 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", - "xmm6", "xmm7"); + "r"(rgbuvconstants), // %5 + "m"(kShuffleAARRGGBB) // %6 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #endif // HAS_ARGBTOUVROW_AVX2