diff --git a/README.chromium b/README.chromium index 8c29c7ca3..477263b0f 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1493 +Version: 1494 License: BSD License File: LICENSE diff --git a/include/libyuv/row.h b/include/libyuv/row.h index acfe90047..fcd62b0f5 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -147,6 +147,8 @@ extern "C" { #define HAS_YUY2TOYROW_SSE2 #define HAS_I444TOARGBROW_SSSE3 #define HAS_I444TOABGRROW_SSSE3 +#define HAS_I422ALPHATOARGBROW_SSSE3 +#define HAS_I422ALPHATOABGRROW_SSSE3 // Effects: #define HAS_ARGBADDROW_SSE2 @@ -187,14 +189,6 @@ extern "C" { #define HAS_I422TOABGRROW_SSSE3 #endif - -// The following are available on 32 bit x86 Visual C and clangcl. -// TODO(fbarchard): Port to gcc. -#if !defined(LIBYUV_DISABLE_X86) && defined (_M_IX86) -#define HAS_I422ALPHATOARGBROW_SSSE3 -#define HAS_I422ALPHATOABGRROW_SSSE3 -#endif - // The following are available for AVX2 Visual C and clangcl 32 bit: // TODO(fbarchard): Port to gcc. 
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \ @@ -255,6 +249,8 @@ extern "C" { #define HAS_YUY2TOUVROW_AVX2 #define HAS_YUY2TOYROW_AVX2 #define HAS_NV12TOARGBROW_AVX2 +#define HAS_I422ALPHATOARGBROW_AVX2 +#define HAS_I422ALPHATOABGRROW_AVX2 // Effects: #define HAS_ARGBADDROW_AVX2 @@ -1250,6 +1246,20 @@ void I422AlphaToABGRRow_SSSE3(const uint8* y_buf, uint8* dst_argb, struct YuvConstants* yuvconstants, int width); +void I422AlphaToARGBRow_AVX2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + const uint8* a_buf, + uint8* dst_argb, + struct YuvConstants* yuvconstants, + int width); +void I422AlphaToABGRRow_AVX2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + const uint8* a_buf, + uint8* dst_argb, + struct YuvConstants* yuvconstants, + int width); void I422ToARGBRow_SSSE3(const uint8* src_y, const uint8* src_u, const uint8* src_v, @@ -1456,6 +1466,20 @@ void I422AlphaToABGRRow_Any_SSSE3(const uint8* y_buf, uint8* dst_abgr, struct YuvConstants* yuvconstants, int width); +void I422AlphaToARGBRow_Any_AVX2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + const uint8* a_buf, + uint8* dst_argb, + struct YuvConstants* yuvconstants, + int width); +void I422AlphaToABGRRow_Any_AVX2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + const uint8* a_buf, + uint8* dst_abgr, + struct YuvConstants* yuvconstants, + int width); void I411ToARGBRow_Any_SSSE3(const uint8* src_y, const uint8* src_u, const uint8* src_v, diff --git a/include/libyuv/version.h b/include/libyuv/version.h index daa8755ce..52305e256 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1493 +#define LIBYUV_VERSION 1494 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/row_any.cc b/source/row_any.cc index cae38f7ac..55c24dc15 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -48,6 
+48,11 @@ extern "C" { ANY41C(I422AlphaToARGBRow_Any_SSSE3, I422AlphaToARGBRow_SSSE3, 1, 0, 4, 7) ANY41C(I422AlphaToABGRRow_Any_SSSE3, I422AlphaToABGRRow_SSSE3, 1, 0, 4, 7) #endif +#ifdef HAS_I422ALPHATOARGBROW_AVX2 +// The AVX2 rows step 16 pixels per iteration (the SSSE3 rows step 8), hence mask 15 instead of 7. +ANY41C(I422AlphaToARGBRow_Any_AVX2, I422AlphaToARGBRow_AVX2, 1, 0, 4, 15) +ANY41C(I422AlphaToABGRRow_Any_AVX2, I422AlphaToABGRRow_AVX2, 1, 0, 4, 15) +#endif #undef ANY41C // Any 3 planes to 1. diff --git a/source/row_gcc.cc b/source/row_gcc.cc index 42666cca7..a6df00264 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -1364,6 +1364,19 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba, "punpcklbw %%xmm4,%%xmm4 \n" \ "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" +// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha. +#define READYUVA422 \ + "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \ + MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \ + "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \ + "punpcklbw %%xmm1,%%xmm0 \n" \ + "punpcklwd %%xmm0,%%xmm0 \n" \ + "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ + "punpcklbw %%xmm4,%%xmm4 \n" \ + "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" \ + "movq " MEMACCESS([a_buf]) ",%%xmm5 \n" \ + "lea " MEMLEA(0x8, [a_buf]) ",%[a_buf] \n" + // Read 2 UV from 411, upsample to 8 UV #define READYUV411 \ "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \ @@ -1426,7 +1439,7 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba, "packuswb %%xmm1,%%xmm1 \n" \ "packuswb %%xmm2,%%xmm2 \n" -// Store 8 ARGB values. Assumes XMM5 is set. +// Store 8 ARGB values. Alpha is read from xmm5. #define STOREARGB \ "punpcklbw %%xmm1,%%xmm0 \n" \ "punpcklbw %%xmm5,%%xmm2 \n" \ @@ -1449,7 +1462,7 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba, "movdqu %%xmm0," MEMACCESS2(0x10, [dst_bgra]) " \n" \ "lea " MEMLEA(0x20, [dst_bgra]) ", %[dst_bgra] \n" -// Store 8 ABGR values. Assumes XMM5 is set. +// Store 8 ABGR values. Alpha is read from xmm5.
#define STOREABGR \ "punpcklbw %%xmm1,%%xmm2 \n" \ "punpcklbw %%xmm5,%%xmm0 \n" \ @@ -1460,7 +1473,7 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba, "movdqu %%xmm1," MEMACCESS2(0x10, [dst_abgr]) " \n" \ "lea " MEMLEA(0x20, [dst_abgr]) ", %[dst_abgr] \n" -// Store 8 RGBA values. Assumes XMM5 is set. +// Store 8 RGBA values. #define STORERGBA \ "pcmpeqb %%xmm5,%%xmm5 \n" \ "punpcklbw %%xmm2,%%xmm1 \n" \ @@ -1643,6 +1656,62 @@ void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf, ); } +void OMITFP I422AlphaToARGBRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + const uint8* a_buf, + uint8* dst_argb, + struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + "sub %[u_buf],%[v_buf] \n" + LABELALIGN + "1: \n" + READYUVA422 + YUVTORGB(yuvconstants) + STOREARGB + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [a_buf]"+r"(a_buf), // %[a_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} + +void OMITFP I422AlphaToABGRRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + const uint8* a_buf, + uint8* dst_abgr, + struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + "sub %[u_buf],%[v_buf] \n" + LABELALIGN + "1: \n" + READYUVA422 + YUVTORGB(yuvconstants) + STOREABGR + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [a_buf]"+r"(a_buf), // %[a_buf] + [dst_abgr]"+r"(dst_abgr), // %[dst_abgr] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} + void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf, const uint8* 
u_buf, const uint8* v_buf, @@ -1838,6 +1907,22 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf, "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" +// Read 8 UV from 422, upsample to 16 UV. With 16 Alpha. +#define READYUVA422_AVX2 \ + "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ + MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \ + "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \ + "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ + "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ + "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ + "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ + "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ + "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" \ + "vmovdqu " MEMACCESS([a_buf]) ",%%xmm5 \n" \ + "vpermq $0xd8,%%ymm5,%%ymm5 \n" \ + "lea " MEMLEA(0x10, [a_buf]) ",%[a_buf] \n" + // Read 8 UV from NV12, upsample to 16 UV. #define READNV12_AVX2 \ "vmovdqu " MEMACCESS([uv_buf]) ",%%xmm0 \n" \ @@ -1887,7 +1972,7 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf, "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \ "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n" -// Store 16 ARGB values. Assumes XMM5 is set. +// Store 16 ARGB values. #define STOREARGB_AVX2 \ "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ @@ -1899,6 +1984,18 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf, "vmovdqu %%ymm0," MEMACCESS2(0x20,[dst_argb]) " \n" \ "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n" +// Store 16 ABGR values. 
+#define STOREABGR_AVX2 \ + "vpunpcklbw %%ymm1,%%ymm2,%%ymm1 \n" \ + "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ + "vpunpcklbw %%ymm5,%%ymm0,%%ymm2 \n" \ + "vpermq $0xd8,%%ymm2,%%ymm2 \n" \ + "vpunpcklwd %%ymm2,%%ymm1,%%ymm0 \n" \ + "vpunpckhwd %%ymm2,%%ymm1,%%ymm1 \n" \ + "vmovdqu %%ymm0," MEMACCESS([dst_abgr]) " \n" \ + "vmovdqu %%ymm1," MEMACCESS2(0x20,[dst_abgr]) " \n" \ + "lea " MEMLEA(0x40,[dst_abgr]) ",%[dst_abgr] \n" + #if defined(HAS_I422TOBGRAROW_AVX2) // 16 pixels // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes). @@ -1974,13 +2071,79 @@ void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf, } #endif // HAS_I422TOARGBROW_AVX2 +#if defined(HAS_I422ALPHATOARGBROW_AVX2) +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB. +void OMITFP I422AlphaToARGBRow_AVX2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + const uint8* a_buf, + uint8* dst_argb, + struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + "sub %[u_buf],%[v_buf] \n" + LABELALIGN + "1: \n" + READYUVA422_AVX2 + YUVTORGB_AVX2(yuvconstants) + STOREARGB_AVX2 + "sub $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [a_buf]"+r"(a_buf), // %[a_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} +#endif // HAS_I422ALPHATOARGBROW_AVX2 + +#if defined(HAS_I422ALPHATOABGRROW_AVX2) +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ABGR. 
+void OMITFP I422AlphaToABGRRow_AVX2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + const uint8* a_buf, + uint8* dst_abgr, + struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + "sub %[u_buf],%[v_buf] \n" + LABELALIGN + "1: \n" + READYUVA422_AVX2 + YUVTORGB_AVX2(yuvconstants) + STOREABGR_AVX2 + "sub $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [a_buf]"+r"(a_buf), // %[a_buf] + [dst_abgr]"+r"(dst_abgr), // %[dst_abgr] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} +#endif // HAS_I422ALPHATOABGRROW_AVX2 + #if defined(HAS_I422TOABGRROW_AVX2) // 16 pixels // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes). void OMITFP I422ToABGRRow_AVX2(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, - uint8* dst_argb, + uint8* dst_abgr, struct YuvConstants* yuvconstants, int width) { asm volatile ( @@ -1990,24 +2153,14 @@ void OMITFP I422ToABGRRow_AVX2(const uint8* y_buf, "1: \n" READYUV422_AVX2 YUVTORGB_AVX2(yuvconstants) - - // Step 3: Weave into ABGR - "vpunpcklbw %%ymm1,%%ymm2,%%ymm1 \n" // RG - "vpermq $0xd8,%%ymm1,%%ymm1 \n" - "vpunpcklbw %%ymm5,%%ymm0,%%ymm2 \n" // BA - "vpermq $0xd8,%%ymm2,%%ymm2 \n" - "vpunpcklwd %%ymm2,%%ymm1,%%ymm0 \n" // RGBA first 8 pixels - "vpunpckhwd %%ymm2,%%ymm1,%%ymm1 \n" // RGBA next 8 pixels - "vmovdqu %%ymm0," MEMACCESS([dst_argb]) "\n" - "vmovdqu %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n" - "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n" + STOREABGR_AVX2 "sub $0x10,%[width] \n" "jg 1b \n" "vzeroupper \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] [v_buf]"+r"(v_buf), // %[v_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] + [dst_abgr]"+r"(dst_abgr), // %[dst_abgr] [width]"+rm"(width) // %[width] : 
[yuvconstants]"r"(yuvconstants) // %[yuvconstants] : "memory", "cc", NACL_R14 diff --git a/source/row_win.cc b/source/row_win.cc index fff337a7d..64c029ce2 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -1896,6 +1896,23 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, __asm lea eax, [eax + 16] \ } +// Read 8 UV from 422, upsample to 16 UV. With 16 Alpha. +#define READYUVA422_AVX2 __asm { \ + __asm vmovq xmm0, qword ptr [esi] /* U */ \ + __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \ + __asm lea esi, [esi + 8] \ + __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ + __asm vpermq ymm0, ymm0, 0xd8 \ + __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ + __asm vmovdqu xmm4, [eax] /* Y */ \ + __asm vpermq ymm4, ymm4, 0xd8 \ + __asm vpunpcklbw ymm4, ymm4, ymm4 \ + __asm lea eax, [eax + 16] \ + __asm vmovdqu xmm5, [ebp] /* A */ \ + __asm vpermq ymm5, ymm5, 0xd8 \ + __asm lea ebp, [ebp + 16] \ + } + // Read 4 UV from 411, upsample to 16 UV. #define READYUV411_AVX2 __asm { \ __asm vmovd xmm0, dword ptr [esi] /* U */ \ @@ -2057,6 +2074,92 @@ void I422ToARGBRow_AVX2(const uint8* y_buf, } #endif // HAS_I422TOARGBROW_AVX2 +#ifdef HAS_I422ALPHATOARGBROW_AVX2 +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB. 
+__declspec(naked) +void I422AlphaToARGBRow_AVX2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + const uint8* a_buf, + uint8* dst_argb, + struct YuvConstants* yuvconstants, + int width) { + __asm { + push esi + push edi + push ebx + push ebp + mov eax, [esp + 16 + 4] // Y + mov esi, [esp + 16 + 8] // U + mov edi, [esp + 16 + 12] // V + mov ebp, [esp + 16 + 16] // A + mov edx, [esp + 16 + 20] // argb + mov ebx, [esp + 16 + 24] // yuvconstants + mov ecx, [esp + 16 + 28] // width + sub edi, esi + + convertloop: + READYUVA422_AVX2 + YUVTORGB_AVX2(ebx) + STOREARGB_AVX2 + + sub ecx, 16 + jg convertloop + + pop ebp + pop ebx + pop edi + pop esi + vzeroupper + ret + } +} +#endif // HAS_I422ALPHATOARGBROW_AVX2 + +#ifdef HAS_I422ALPHATOABGRROW_AVX2 +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ABGR. +__declspec(naked) +void I422AlphaToABGRRow_AVX2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + const uint8* a_buf, + uint8* dst_abgr, + struct YuvConstants* yuvconstants, + int width) { + __asm { + push esi + push edi + push ebx + push ebp + mov eax, [esp + 16 + 4] // Y + mov esi, [esp + 16 + 8] // U + mov edi, [esp + 16 + 12] // V + mov ebp, [esp + 16 + 16] // A + mov edx, [esp + 16 + 20] // abgr + mov ebx, [esp + 16 + 24] // yuvconstants + mov ecx, [esp + 16 + 28] // width + sub edi, esi + + convertloop: + READYUVA422_AVX2 + YUVTORGB_AVX2(ebx) + STOREABGR_AVX2 + + sub ecx, 16 + jg convertloop + + pop ebp + pop ebx + pop edi + pop esi + vzeroupper + ret + } +} +#endif // HAS_I422ALPHATOABGRROW_AVX2 + #ifdef HAS_I444TOARGBROW_AVX2 // 16 pixels // 16 UV values with 16 Y producing 16 ARGB (64 bytes). @@ -2848,7 +2951,7 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf, } // 8 pixels. -// 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ARGB (32 bytes). +// 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ARGB. 
__declspec(naked) void I422AlphaToARGBRow_SSSE3(const uint8* y_buf, const uint8* u_buf, @@ -2870,7 +2973,6 @@ void I422AlphaToARGBRow_SSSE3(const uint8* y_buf, mov ebx, [esp + 16 + 24] // yuvconstants mov ecx, [esp + 16 + 28] // width sub edi, esi - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha convertloop: READYUVA422 @@ -2889,7 +2991,7 @@ void I422AlphaToARGBRow_SSSE3(const uint8* y_buf, } // 8 pixels. -// 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ABGR (32 bytes). +// 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ABGR. __declspec(naked) void I422AlphaToABGRRow_SSSE3(const uint8* y_buf, const uint8* u_buf, @@ -2911,7 +3013,6 @@ void I422AlphaToABGRRow_SSSE3(const uint8* y_buf, mov ebx, [esp + 16 + 24] // yuvconstants mov ecx, [esp + 16 + 28] // width sub edi, esi - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha convertloop: READYUVA422