diff --git a/README.chromium b/README.chromium index 95ff15644..e36271cef 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 565 +Version: 566 License: BSD License File: LICENSE diff --git a/include/libyuv/row.h b/include/libyuv/row.h index d7f43e347..ce6f16bfb 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -123,6 +123,12 @@ extern "C" { // TODO(fbarchard): Port to gcc. #if !defined(YUV_DISABLE_ASM) && defined(_M_IX86) #define HAS_ARGBCOLORTABLEROW_X86 +// Visual C 2012 required for AVX2. +#if _MSC_VER >= 1700 +// TODO(fbarchard): Hook these up to all functions. e.g. format conversion. +#define HAS_ARGBTOYROW_AVX2 +#define HAS_ARGBTOUVROW_AVX2 +#endif #endif // The following are Yasm x86 only. @@ -258,6 +264,13 @@ typedef __declspec(align(16)) int16 vec16[8]; typedef __declspec(align(16)) uint16 uvec16[8]; typedef __declspec(align(16)) int32 vec32[4]; typedef __declspec(align(16)) uint32 uvec32[4]; +typedef __declspec(align(32)) int8 lvec8[32]; +typedef __declspec(align(32)) uint8 ulvec8[32]; +typedef __declspec(align(32)) int16 lvec16[16]; +typedef __declspec(align(32)) uint16 ulvec16[16]; +typedef __declspec(align(32)) int32 lvec32[8]; +typedef __declspec(align(32)) uint32 ulvec32[8]; + #elif defined(__GNUC__) #define SIMD_ALIGNED(var) var __attribute__((aligned(16))) typedef int8 __attribute__((vector_size(16))) vec8; @@ -360,6 +373,9 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy, uint8* dst_argb, int width); +void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix); +void ARGBToYRow_Unaligned_AVX2(const uint8* src_argb, uint8* dst_y, int pix); +void ARGBToYRow_Any_AVX2(const uint8* src_argb, uint8* dst_y, int pix); void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix); void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix); @@ -430,6 +446,12 @@ void RGB565ToYRow_Any_NEON(const uint8* src_rgb565, uint8* dst_y, int pix); void ARGB1555ToYRow_Any_NEON(const uint8* src_argb1555, uint8* dst_y, int pix); void ARGB4444ToYRow_Any_NEON(const uint8* src_argb4444, uint8* dst_y, int pix); +void ARGBToUVRow_AVX2(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void ARGBToUVRow_Unaligned_AVX2(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void ARGBToUVRow_Any_AVX2(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); void ARGBToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width); void BGRAToUVRow_SSSE3(const uint8* src_bgra, int src_stride_bgra, diff --git a/include/libyuv/version.h b/include/libyuv/version.h index c5579b10b..156ecfd19 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 565 +#define LIBYUV_VERSION 566 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/convert.cc b/source/convert.cc index 7c70b5006..770e37dc5 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -739,7 +739,26 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb, } } } -#elif defined(HAS_ARGBTOYROW_NEON) +#endif +#if defined(HAS_ARGBTOYROW_AVX2) + bool clear = false; + if (TestCpuFlag(kCpuHasAVX2) && width >= 32) { + clear = true; + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + ARGBToYRow = ARGBToYRow_Any_AVX2; + if 
(IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_Unaligned_AVX2; + ARGBToYRow = ARGBToYRow_Unaligned_AVX2; + if (IS_ALIGNED(src_argb, 32) && IS_ALIGNED(src_stride_argb, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + if (IS_ALIGNED(dst_y, 32) && IS_ALIGNED(dst_stride_y, 32)) { + ARGBToYRow = ARGBToYRow_AVX2; + } + } + } + } +#endif +#if defined(HAS_ARGBTOYROW_NEON) if (TestCpuFlag(kCpuHasNEON) && width >= 8) { ARGBToYRow = ARGBToYRow_Any_NEON; if (IS_ALIGNED(width, 8)) { @@ -767,6 +786,12 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb, ARGBToUVRow(src_argb, 0, dst_u, dst_v, width); ARGBToYRow(src_argb, dst_y, width); } + +#if defined(HAS_ARGBTOYROW_AVX2) + if (clear) { + __asm vzeroupper; + } +#endif return 0; } diff --git a/source/row_any.cc b/source/row_any.cc index 2e43b1c62..cbc6cc012 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -195,6 +195,9 @@ BAYERANY(ARGBToBayerRow_Any_NEON, ARGBToBayerRow_NEON, ARGBToBayerRow_C, dst_y + (width - NUM) * BPP, NUM); \ } +#ifdef HAS_ARGBTOYROW_AVX2 +YANY(ARGBToYRow_Any_AVX2, ARGBToYRow_Unaligned_AVX2, 4, 1, 32) +#endif #ifdef HAS_ARGBTOYROW_SSSE3 YANY(ARGBToYRow_Any_SSSE3, ARGBToYRow_Unaligned_SSSE3, 4, 1, 16) YANY(BGRAToYRow_Any_SSSE3, BGRAToYRow_Unaligned_SSSE3, 4, 1, 16) @@ -251,37 +254,40 @@ YANY(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, ARGBAttenuateRow_C, #endif // RGB/YUV to UV does multiple of 16 with SIMD and remainder with C. -#define UVANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, BPP) \ +#define UVANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, BPP, MASK) \ void NAMEANY(const uint8* src_argb, int src_stride_argb, \ uint8* dst_u, uint8* dst_v, int width) { \ - int n = width & ~15; \ + int n = width & ~MASK; \ ANYTOUV_SIMD(src_argb, src_stride_argb, dst_u, dst_v, n); \ ANYTOUV_C(src_argb + n * BPP, src_stride_argb, \ dst_u + (n >> 1), \ dst_v + (n >> 1), \ - width & 15); \ + width & MASK); \ } +#ifdef HAS_ARGBTOYROW_AVX2 +UVANY(ARGBToUVRow_Any_AVX2, ARGBToUVRow_Unaligned_AVX2, ARGBToUVRow_C, 4, 31) +#endif #ifdef HAS_ARGBTOUVROW_SSSE3 -UVANY(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_Unaligned_SSSE3, ARGBToUVRow_C, 4) -UVANY(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_Unaligned_SSSE3, BGRAToUVRow_C, 4) -UVANY(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_Unaligned_SSSE3, ABGRToUVRow_C, 4) -UVANY(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_Unaligned_SSSE3, RGBAToUVRow_C, 4) -UVANY(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_Unaligned_SSE2, YUY2ToUVRow_C, 2) -UVANY(UYVYToUVRow_Any_SSE2, UYVYToUVRow_Unaligned_SSE2, UYVYToUVRow_C, 2) +UVANY(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_Unaligned_SSSE3, ARGBToUVRow_C, 4, 15) +UVANY(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_Unaligned_SSSE3, BGRAToUVRow_C, 4, 15) +UVANY(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_Unaligned_SSSE3, ABGRToUVRow_C, 4, 15) +UVANY(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_Unaligned_SSSE3, RGBAToUVRow_C, 4, 15) +UVANY(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_Unaligned_SSE2, YUY2ToUVRow_C, 2, 15) +UVANY(UYVYToUVRow_Any_SSE2, UYVYToUVRow_Unaligned_SSE2, UYVYToUVRow_C, 2, 15) #endif #ifdef HAS_ARGBTOUVROW_NEON -UVANY(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, ARGBToUVRow_C, 4) -UVANY(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, BGRAToUVRow_C, 4) -UVANY(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, ABGRToUVRow_C, 4) -UVANY(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, RGBAToUVRow_C, 4) -UVANY(RGB24ToUVRow_Any_NEON, RGB24ToUVRow_NEON, RGB24ToUVRow_C, 3) -UVANY(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, RAWToUVRow_C, 3) -UVANY(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, RGB565ToUVRow_C, 2) -UVANY(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, ARGB1555ToUVRow_C, 2) 
-UVANY(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, ARGB4444ToUVRow_C, 2) -UVANY(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, YUY2ToUVRow_C, 2) -UVANY(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, UYVYToUVRow_C, 2) +UVANY(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, ARGBToUVRow_C, 4, 15) +UVANY(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, BGRAToUVRow_C, 4, 15) +UVANY(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, ABGRToUVRow_C, 4, 15) +UVANY(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, RGBAToUVRow_C, 4, 15) +UVANY(RGB24ToUVRow_Any_NEON, RGB24ToUVRow_NEON, RGB24ToUVRow_C, 3, 15) +UVANY(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, RAWToUVRow_C, 3, 15) +UVANY(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, RGB565ToUVRow_C, 2, 15) +UVANY(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, ARGB1555ToUVRow_C, 2, 15) +UVANY(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, ARGB4444ToUVRow_C, 2, 15) +UVANY(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, YUY2ToUVRow_C, 2, 15) +UVANY(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, UYVYToUVRow_C, 2, 15) #endif #undef UVANY diff --git a/source/row_win.cc b/source/row_win.cc index 575543874..95b912e7c 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -25,14 +25,35 @@ static const vec8 kARGBToY = { 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0 }; +static const lvec8 kARGBToY_AVX = { + 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, + 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0 +}; + static const vec8 kARGBToU = { 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0 }; +// TODO(fbarchard): Rename kARGBToU_AVX to kARGBToU and use for SSSE3 version. +static const lvec8 kARGBToU_AVX = { + 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, + 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0 +}; + static const vec8 kARGBToV = { -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, }; +static const lvec8 kARGBToV_AVX = { + -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, + -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0 +}; + +// Unshuffle for vphaddw + vpackuswb vpermd. +static const lvec32 kShufARGBToY_AVX = { + 0, 4, 1, 5, 2, 6, 3, 7 +}; + // Constants for BGRA. static const vec8 kBGRAToY = { 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13 @@ -76,11 +97,25 @@ static const uvec8 kAddY16 = { 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u }; +static const ulvec8 kAddY16_AVX = { + 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, + 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, + 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, + 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u +}; + static const uvec8 kAddUV128 = { 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u }; +static const ulvec8 kAddUV128_AVX = { + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u +}; + // Shuffle table for converting RGB24 to ARGB. static const uvec8 kShuffleMaskRGB24ToARGB = { 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u @@ -727,6 +762,49 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { } } +#ifdef HAS_ARGBTOYROW_AVX2 +// Convert 32 ARGB pixels (128 bytes) to 32 Y values. 
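+// Each Y byte is ((13 * B + 65 * G + 33 * R) >> 7) + 16, computed with
+// vpmaddubsw/vphaddw against kARGBToY_AVX and biased by kAddY16_AVX; the
+// final vpermd with kShufARGBToY_AVX undoes the per-lane interleave left by
+// vphaddw + vpackuswb.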
+__declspec(naked) __declspec(align(32))
+void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
+  __asm {
+    mov        eax, [esp + 4]   /* src_argb */
+    mov        edx, [esp + 8]   /* dst_y */
+    mov        ecx, [esp + 12]  /* pix */
+    vmovdqa    ymm6, kShufARGBToY_AVX
+    vmovdqa    ymm5, kAddY16_AVX
+    vmovdqa    ymm4, kARGBToY_AVX
+
+    align      16
+ convertloop:
+    vmovdqa    ymm0, [eax]
+    vmovdqa    ymm1, [eax + 32]
+    vmovdqa    ymm2, [eax + 64]
+    vmovdqa    ymm3, [eax + 96]
+    vpmaddubsw ymm0, ymm0, ymm4
+    vpmaddubsw ymm1, ymm1, ymm4
+    vpmaddubsw ymm2, ymm2, ymm4
+    vpmaddubsw ymm3, ymm3, ymm4
+    lea        eax, [eax + 128]
+    vphaddw    ymm0, ymm0, ymm1
+    vphaddw    ymm2, ymm2, ymm3
+    vpsrlw     ymm0, ymm0, 7
+    vpsrlw     ymm2, ymm2, 7
+    vpackuswb  ymm0, ymm0, ymm2
+    vpermd     ymm0, ymm6, ymm0  // For vphaddw + vpackuswb mutation.
+    vpaddb     ymm0, ymm0, ymm5
+    sub        ecx, 32
+    vmovdqa    [edx], ymm0
+    lea        edx, [edx + 32]
+    jg         convertloop
+    ret
+    // Unused alternative fix-up with vpermq instead of the vpermd above:
+    // vphaddw    ymm0, ymm0, ymm1
+    // vpermq     ymm0, ymm0, 0xd8
+    // vpackuswb  ymm0, ymm0, ymm2
+    // vpermq     ymm0, ymm0, 0xd8
+  }
+}
+#endif  // HAS_ARGBTOYROW_AVX2
+
 __declspec(naked) __declspec(align(16))
 void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   __asm {
@@ -761,6 +839,44 @@ void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   }
 }
 
+#ifdef HAS_ARGBTOYROW_AVX2
+__declspec(naked) __declspec(align(32))
+void ARGBToYRow_Unaligned_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
+  __asm {
+    mov        eax, [esp + 4]   /* src_argb */
+    mov        edx, [esp + 8]   /* dst_y */
+    mov        ecx, [esp + 12]  /* pix */
+    vmovdqa    ymm6, kShufARGBToY_AVX
+    vmovdqa    ymm5, kAddY16_AVX
+    vmovdqa    ymm4, kARGBToY_AVX
+
+    align      16
+ convertloop:
+    vmovdqu    ymm0, [eax]
+    vmovdqu    ymm1, [eax + 32]
+    vmovdqu    ymm2, [eax + 64]
+    vmovdqu    ymm3, [eax + 96]
+    vpmaddubsw ymm0, ymm0, ymm4
+    vpmaddubsw ymm1, ymm1, ymm4
+    vpmaddubsw ymm2, ymm2, ymm4
+    vpmaddubsw ymm3, ymm3, ymm4
+    lea        eax, [eax + 128]
+    vphaddw    ymm0, ymm0, ymm1
+    vphaddw    ymm2, ymm2, ymm3
+    vpsrlw     ymm0, ymm0, 7
+    vpsrlw     ymm2, ymm2, 7
+    vpackuswb  ymm0, ymm0, ymm2
+    vpermd     ymm0, ymm6, ymm0  // For vphaddw + vpackuswb mutation.
+    vpaddb     ymm0, ymm0, ymm5
+    sub        ecx, 32
+    vmovdqu    [edx], ymm0
+    lea        edx, [edx + 32]
+    jg         convertloop
+    ret
+  }
+}
+#endif  // HAS_ARGBTOYROW_AVX2
+
 __declspec(naked) __declspec(align(16))
 void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   __asm {
@@ -1031,6 +1147,80 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
   }
 }
 
+#ifdef HAS_ARGBTOUVROW_AVX2
+__declspec(naked) __declspec(align(32))
+void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
+                      uint8* dst_u, uint8* dst_v, int width) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]   // src_argb
+    mov        esi, [esp + 8 + 8]   // src_stride_argb
+    mov        edx, [esp + 8 + 12]  // dst_u
+    mov        edi, [esp + 8 + 16]  // dst_v
+    mov        ecx, [esp + 8 + 20]  // pix
+    vmovdqa    ymm7, kARGBToU_AVX
+    vmovdqa    ymm6, kARGBToV_AVX
+    vmovdqa    ymm5, kAddUV128_AVX
+    sub        edi, edx             // stride from u to v
+
+    align      16
+ convertloop:
+    /* step 1 - subsample 32x2 argb pixels to 16x1 */
+    vmovdqa    ymm0, [eax]
+    vmovdqa    ymm1, [eax + 32]
+    vmovdqa    ymm2, [eax + 64]
+    vmovdqa    ymm3, [eax + 96]
+    vpavgb     ymm0, ymm0, [eax + esi]
+    vpavgb     ymm1, ymm1, [eax + esi + 32]
+    vpavgb     ymm2, ymm2, [eax + esi + 64]
+    vpavgb     ymm3, ymm3, [eax + esi + 96]
+    lea        eax, [eax + 128]
+    vmovdqa    ymm4, ymm0  // TODO(fbarchard): Remove.
+    vshufps    ymm0, ymm0, ymm1, 0x88
+    vshufps    ymm4, ymm4, ymm1, 0xdd
+    vpavgb     ymm0, ymm0, ymm4
+    vpermq     ymm0, ymm0, 0xd8  // TODO(fbarchard): Remove.
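+    // vshufps picks dwords within each 128-bit lane, so the vpermq above
+    // restores the 8 averaged pixels to sequential order; the block below
+    // repeats the subsample for the other 16 source pixels.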
+    vmovdqa    ymm4, ymm2  // TODO(fbarchard): Remove.
+    vshufps    ymm2, ymm2, ymm3, 0x88
+    vshufps    ymm4, ymm4, ymm3, 0xdd
+    vpavgb     ymm2, ymm2, ymm4
+    vpermq     ymm2, ymm2, 0xd8  // TODO(fbarchard): Remove.
+
+    // step 2 - convert to U and V
+    // from here down is very similar to Y code except
+    // instead of 32 different pixels, it's 16 pixels of U and 16 of V
+    vmovdqa    ymm1, ymm0  // TODO(fbarchard): Remove.
+    vmovdqa    ymm3, ymm2  // TODO(fbarchard): Remove.
+    vpmaddubsw ymm0, ymm0, ymm7  // U
+    vpmaddubsw ymm2, ymm2, ymm7
+    vpmaddubsw ymm1, ymm1, ymm6  // V
+    vpmaddubsw ymm3, ymm3, ymm6
+    vphaddw    ymm0, ymm0, ymm2
+    vpermq     ymm0, ymm0, 0xd8  // TODO(fbarchard): Remove.
+    vphaddw    ymm1, ymm1, ymm3
+    vpermq     ymm1, ymm1, 0xd8  // TODO(fbarchard): Remove.
+    vpsraw     ymm0, ymm0, 8
+    vpsraw     ymm1, ymm1, 8
+    vpacksswb  ymm0, ymm0, ymm1
+    vpermq     ymm0, ymm0, 0xd8
+    vpaddb     ymm0, ymm0, ymm5  // -> unsigned
+
+    // step 3 - store 16 U and 16 V values
+    sub        ecx, 32
+    vmovdqa    ymm1, ymm0
+    vextractf128 qword ptr [edx], ymm0, 0        // U
+    vextractf128 qword ptr [edx + edi], ymm0, 1  // V
+    lea        edx, [edx + 16]
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_ARGBTOUVROW_AVX2
+
 __declspec(naked) __declspec(align(16))
 void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                  uint8* dst_u, uint8* dst_v, int width) {
@@ -1101,6 +1291,80 @@ void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
   }
 }
 
+#ifdef HAS_ARGBTOUVROW_AVX2
+__declspec(naked) __declspec(align(32))
+void ARGBToUVRow_Unaligned_AVX2(const uint8* src_argb0, int src_stride_argb,
+                                uint8* dst_u, uint8* dst_v, int width) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]   // src_argb
+    mov        esi, [esp + 8 + 8]   // src_stride_argb
+    mov        edx, [esp + 8 + 12]  // dst_u
+    mov        edi, [esp + 8 + 16]  // dst_v
+    mov        ecx, [esp + 8 + 20]  // pix
+    vmovdqa    ymm7, kARGBToU_AVX
+    vmovdqa    ymm6, kARGBToV_AVX
+    vmovdqa    ymm5, kAddUV128_AVX
+    sub        edi, edx             // stride from u to v
+
+    align      16
+ convertloop:
+    /* step 1 - subsample 32x2 argb pixels to 16x1 */
+    vmovdqu    ymm0, [eax]
+    vmovdqu    ymm1, [eax + 32]
+    vmovdqu    ymm2, [eax + 64]
+    vmovdqu    ymm3, [eax + 96]
+    vpavgb     ymm0, ymm0, [eax + esi]
+    vpavgb     ymm1, ymm1, [eax + esi + 32]
+    vpavgb     ymm2, ymm2, [eax + esi + 64]
+    vpavgb     ymm3, ymm3, [eax + esi + 96]
+    lea        eax, [eax + 128]
+    vmovdqa    ymm4, ymm0
+    vshufps    ymm0, ymm0, ymm1, 0x88
+    vshufps    ymm4, ymm4, ymm1, 0xdd
+    vpavgb     ymm0, ymm0, ymm4
+    vpermq     ymm0, ymm0, 0xd8
+    vmovdqa    ymm4, ymm2
+    vshufps    ymm2, ymm2, ymm3, 0x88
+    vshufps    ymm4, ymm4, ymm3, 0xdd
+    vpavgb     ymm2, ymm2, ymm4
+    vpermq     ymm2, ymm2, 0xd8
+
+    // step 2 - convert to U and V
+    // from here down is very similar to Y code except
+    // instead of 32 different pixels, it's 16 pixels of U and 16 of V
+    vmovdqa    ymm1, ymm0
+    vmovdqa    ymm3, ymm2
+    vpmaddubsw ymm0, ymm0, ymm7  // U
+    vpmaddubsw ymm2, ymm2, ymm7
+    vpmaddubsw ymm1, ymm1, ymm6  // V
+    vpmaddubsw ymm3, ymm3, ymm6
+    vphaddw    ymm0, ymm0, ymm2
+    vpermq     ymm0, ymm0, 0xd8
+    vphaddw    ymm1, ymm1, ymm3
+    vpermq     ymm1, ymm1, 0xd8
+    vpsraw     ymm0, ymm0, 8
+    vpsraw     ymm1, ymm1, 8
+    vpacksswb  ymm0, ymm0, ymm1
+    vpermq     ymm0, ymm0, 0xd8
+    vpaddb     ymm0, ymm0, ymm5  // -> unsigned
+
+    // step 3 - store 16 U and 16 V values
+    sub        ecx, 32
+    vmovdqa    ymm1, ymm0
+    vextractf128 qword ptr [edx], ymm0, 0        // U
+    vextractf128 qword ptr [edx + edi], ymm0, 1  // V
+    lea        edx, [edx + 16]
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_ARGBTOUVROW_AVX2
+
 __declspec(naked) __declspec(align(16))
 void ARGBToUV444Row_SSSE3(const uint8* src_argb0, uint8* dst_u,
uint8* dst_v, int width) {
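
For reference only, not part of the diff: a minimal scalar sketch of the fixed-point math the new AVX2 kernels implement. Coefficients are taken from kARGBToY_AVX, kAddY16_AVX, kARGBToU_AVX, kARGBToV_AVX and kAddUV128_AVX above; function names are illustrative, and libyuv ARGB is stored B, G, R, A in memory.

    // Y path: vpmaddubsw/vphaddw with kARGBToY, vpsrlw 7, then vpaddb kAddY16.
    static int ScalarARGBToY(int b, int g, int r) {
      return ((13 * b + 65 * g + 33 * r) >> 7) + 16;
    }

    // U/V path: b/g/r are 2x2 box averages (vpavgb + vshufps subsample);
    // the >> 8 mirrors vpsraw (arithmetic shift), then vpaddb kAddUV128
    // biases the signed result to unsigned.
    static int ScalarARGBToU(int b, int g, int r) {
      return ((112 * b - 74 * g - 38 * r) >> 8) + 128;
    }

    static int ScalarARGBToV(int b, int g, int r) {
      return ((-18 * b - 94 * g + 112 * r) >> 8) + 128;
    }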