From 5333e94e70fffcb126fe507cabbaefa0783b8125 Mon Sep 17 00:00:00 2001
From: Frank Barchard
Date: Thu, 13 Oct 2016 23:20:57 -0700
Subject: [PATCH] Port ARGBExtractAlpha_AVX2 function to windows.

BUG=libyuv:572
TEST=try bots
R=wangcheng@google.com, magjed@chromium.org

Review URL: https://codereview.chromium.org/2416783004 .
---
 include/libyuv/row.h | 39 ++++++++++++++++-----------------------
 source/row_win.cc    | 35 +++++++++++++++++++++++++++++++++++
 2 files changed, 51 insertions(+), 23 deletions(-)

diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 608664bb0..09b24f91e 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -83,6 +83,7 @@ extern "C" {
 #define HAS_ABGRTOYROW_SSSE3
 #define HAS_ARGB1555TOARGBROW_SSE2
 #define HAS_ARGB4444TOARGBROW_SSE2
+#define HAS_ARGBEXTRACTALPHAROW_SSE2
 #define HAS_ARGBSETROW_X86
 #define HAS_ARGBSHUFFLEROW_SSE2
 #define HAS_ARGBSHUFFLEROW_SSSE3
@@ -97,12 +98,12 @@ extern "C" {
 #define HAS_ARGBTOUVROW_SSSE3
 #define HAS_ARGBTOYJROW_SSSE3
 #define HAS_ARGBTOYROW_SSSE3
-#define HAS_ARGBEXTRACTALPHAROW_SSE2
 #define HAS_BGRATOUVROW_SSSE3
 #define HAS_BGRATOYROW_SSSE3
 #define HAS_COPYROW_ERMS
 #define HAS_COPYROW_SSE2
 #define HAS_H422TOARGBROW_SSSE3
+#define HAS_HALFFLOATROW_SSE2
 #define HAS_I400TOARGBROW_SSE2
 #define HAS_I422TOARGB1555ROW_SSSE3
 #define HAS_I422TOARGB4444ROW_SSSE3
@@ -140,7 +141,6 @@ extern "C" {
 #define HAS_YUY2TOUV422ROW_SSE2
 #define HAS_YUY2TOUVROW_SSE2
 #define HAS_YUY2TOYROW_SSE2
-#define HAS_HALFFLOATROW_SSE2

 // Effects:
 #define HAS_ARGBADDROW_SSE2
@@ -189,6 +189,7 @@ extern "C" {
     defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
 #define HAS_ARGBCOPYALPHAROW_AVX2
 #define HAS_ARGBCOPYYTOALPHAROW_AVX2
+#define HAS_ARGBEXTRACTALPHAROW_AVX2
 #define HAS_ARGBMIRRORROW_AVX2
 #define HAS_ARGBPOLYNOMIALROW_AVX2
 #define HAS_ARGBSHUFFLEROW_AVX2
@@ -199,12 +200,8 @@ extern "C" {
 #define HAS_ARGBTOYROW_AVX2
 #define HAS_COPYROW_AVX
 #define HAS_H422TOARGBROW_AVX2
+#define HAS_HALFFLOATROW_AVX2
 #define HAS_I400TOARGBROW_AVX2
-#if !(defined(_DEBUG) && defined(__i386__))
-// TODO(fbarchard): fix build error on android_full_debug=1
-// https://code.google.com/p/libyuv/issues/detail?id=517
-#define HAS_I422ALPHATOARGBROW_AVX2
-#endif
 #define HAS_I422TOARGB1555ROW_AVX2
 #define HAS_I422TOARGB4444ROW_AVX2
 #define HAS_I422TOARGBROW_AVX2
@@ -228,7 +225,6 @@ extern "C" {
 #define HAS_YUY2TOUV422ROW_AVX2
 #define HAS_YUY2TOUVROW_AVX2
 #define HAS_YUY2TOYROW_AVX2
-#define HAS_HALFFLOATROW_AVX2

 // Effects:
 #define HAS_ARGBADDROW_AVX2
@@ -237,15 +233,12 @@ extern "C" {
 #define HAS_ARGBSUBTRACTROW_AVX2
 #define HAS_ARGBUNATTENUATEROW_AVX2
 #define HAS_BLENDPLANEROW_AVX2
+
+#if !(defined(_DEBUG) && defined(__i386__))
+// TODO(fbarchard): fix build error on android_full_debug=1
+// https://code.google.com/p/libyuv/issues/detail?id=517
+#define HAS_I422ALPHATOARGBROW_AVX2
 #endif
-
-
-// The following are available clang 3.4 or gcc 4.7.
-// TODO(fbarchard): Port to Visual C
-#if !defined(LIBYUV_DISABLE_X86) && \
-    (defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)) && \
-    !(defined(__clang__) && defined(_M_IX86) )
-#define HAS_ARGBEXTRACTALPHAROW_AVX2
 #endif

 // The following are available for AVX2 Visual C and clangcl 32 bit:
@@ -279,6 +272,7 @@ extern "C" {
 #define HAS_ARGB4444TOARGBROW_NEON
 #define HAS_ARGB4444TOUVROW_NEON
 #define HAS_ARGB4444TOYROW_NEON
+#define HAS_ARGBEXTRACTALPHAROW_NEON
 #define HAS_ARGBSETROW_NEON
 #define HAS_ARGBTOARGB1555ROW_NEON
 #define HAS_ARGBTOARGB4444ROW_NEON
@@ -291,7 +285,6 @@ extern "C" {
 #define HAS_ARGBTOUVROW_NEON
 #define HAS_ARGBTOYJROW_NEON
 #define HAS_ARGBTOYROW_NEON
-#define HAS_ARGBEXTRACTALPHAROW_NEON
 #define HAS_BGRATOUVROW_NEON
 #define HAS_BGRATOYROW_NEON
 #define HAS_COPYROW_NEON
@@ -371,15 +364,15 @@ extern "C" {
 #endif

 #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
-#define HAS_MIRRORROW_MSA
 #define HAS_ARGBMIRRORROW_MSA
-#define HAS_I422TOYUY2ROW_MSA
 #define HAS_I422TOUYVYROW_MSA
-#define HAS_YUY2TOYROW_MSA
-#define HAS_YUY2TOUVROW_MSA
-#define HAS_YUY2TOUV422ROW_MSA
-#define HAS_UYVYTOYROW_MSA
+#define HAS_I422TOYUY2ROW_MSA
+#define HAS_MIRRORROW_MSA
 #define HAS_UYVYTOUVROW_MSA
+#define HAS_UYVYTOYROW_MSA
+#define HAS_YUY2TOUV422ROW_MSA
+#define HAS_YUY2TOUVROW_MSA
+#define HAS_YUY2TOYROW_MSA
 #endif


diff --git a/source/row_win.cc b/source/row_win.cc
index e3c16e2a5..9dc805535 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -3445,6 +3445,41 @@ void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width) {
 }
 #endif  // HAS_ARGBEXTRACTALPHAROW_SSE2

+#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
+// Writes the top (alpha) byte of each 32-bit pixel to dst_a; width in pixels
+__declspec(naked)
+void ARGBExtractAlphaRow_AVX2(const uint8* src_argb, uint8* dst_a, int width) {
+  __asm {
+    mov        eax, [esp + 4]   // src_argb
+    mov        edx, [esp + 8]   // dst_a
+    mov        ecx, [esp + 12]  // width
+    vmovdqa    ymm4, ymmword ptr kPermdARGBToY_AVX  // dword indices for vpermd
+
+  extractloop:
+    vmovdqu    ymm0, [eax]  // first 8 of the 32 pixels read per iteration
+    vmovdqu    ymm1, [eax + 32]
+    vpsrld     ymm0, ymm0, 24  // top byte of each dword -> low byte
+    vpsrld     ymm1, ymm1, 24
+    vmovdqu    ymm2, [eax + 64]
+    vmovdqu    ymm3, [eax + 96]
+    lea        eax, [eax + 128]  // advance source by 32 pixels
+    vpackssdw  ymm0, ymm0, ymm1  // mutates: dwords to words
+    vpsrld     ymm2, ymm2, 24
+    vpsrld     ymm3, ymm3, 24
+    vpackssdw  ymm2, ymm2, ymm3  // mutates: dwords to words
+    vpackuswb  ymm0, ymm0, ymm2  // mutates: words to bytes
+    vpermd     ymm0, ymm4, ymm0  // unmutate: reorder dwords using ymm4
+    vmovdqu    [edx], ymm0  // store 32 alpha bytes
+    lea        edx, [edx + 32]
+    sub        ecx, 32  // 32 pixels consumed per iteration
+    jg         extractloop  // repeat while remaining width > 0
+
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_ARGBEXTRACTALPHAROW_AVX2
+
 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
 // width in pixels
 __declspec(naked)