From f6631bb814600f841f74a9d8a626b528be2fd8bb Mon Sep 17 00:00:00 2001
From: "fbarchard@google.com"
Date: Mon, 14 Oct 2013 19:37:21 +0000
Subject: [PATCH] CopyAlpha AVX2

BUG=none
TEST=Alpha*
R=ryanpetrie@google.com

Review URL: https://webrtc-codereview.appspot.com/2392004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@812 16f28f9a-4ce2-e073-06de-1de4eb20be90
---
Review note: ARGBCopyAlphaRow_SSE41 is selected on width alignment only, so
dst may be unaligned. dst is now loaded with movdqu before pblendvb, because
the non-VEX pblendvb faults when its m128 operand is not 16-byte aligned.

 README.chromium            |  2 +-
 include/libyuv/row.h       |  7 ++-
 include/libyuv/version.h   |  2 +-
 source/planar_functions.cc | 15 ++++++-
 source/row_win.cc          | 95 ++++++++++++++++++++++++++++++++------
 5 files changed, 103 insertions(+), 18 deletions(-)

diff --git a/README.chromium b/README.chromium
index dda773769..ea02c6053 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 811
+Version: 812
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 005e499e5..84f3bab10 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -167,8 +167,8 @@ extern "C" {
 // Effects:
 // TODO(fbarchard): Optimize and enable
 // #define HAS_ARGBLUMACOLORTABLEROW_SSSE3
-// TODO(fbarchard): Optimize and enable
-// #define HAS_ARGBCOPYALPHAROW_SSE2
+#define HAS_ARGBCOPYALPHAROW_SSE2
+#define HAS_ARGBCOPYALPHAROW_SSE41
 
 // Caveat: Visual C 2012 required for AVX2.
 #if _MSC_VER >= 1700
@@ -187,6 +187,7 @@ extern "C" {
 #define HAS_YUY2TOUV422ROW_AVX2
 #define HAS_YUY2TOUVROW_AVX2
 #define HAS_YUY2TOYROW_AVX2
+#define HAS_ARGBCOPYALPHAROW_AVX2
 
 // Effects:
 #define HAS_ARGBADDROW_AVX2
@@ -701,6 +702,8 @@ void CopyRow_C(const uint8* src, uint8* dst, int count);
 
 void ARGBCopyAlphaRow_C(const uint8* src, uint8* dst, int width);
 void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width);
+void ARGBCopyAlphaRow_SSE41(const uint8* src, uint8* dst, int width);
+void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width);
 
 void SetRow_X86(uint8* dst, uint32 v32, int count);
 void ARGBSetRows_X86(uint8* dst, uint32 v32, int width,
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 64a896621..564fd3f63 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 811
+#define LIBYUV_VERSION 812
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index d151c2a5c..65993c93c 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -2188,9 +2188,22 @@ int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb,
   void (*ARGBCopyAlphaRow)(const uint8* src_argb, uint8* dst_argb, int width) =
       ARGBCopyAlphaRow_C;
 #if defined(HAS_ARGBCOPYALPHAROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8)) {
+  if (TestCpuFlag(kCpuHasSSE2) &&
+      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
+      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16) &&
+      IS_ALIGNED(width, 8)) {
     ARGBCopyAlphaRow = ARGBCopyAlphaRow_SSE2;
   }
+#endif
+#if defined(HAS_ARGBCOPYALPHAROW_SSE41)
+  if (TestCpuFlag(kCpuHasSSE41) && IS_ALIGNED(width, 8)) {
+    ARGBCopyAlphaRow = ARGBCopyAlphaRow_SSE41;
+  }
+#endif
+#if defined(HAS_ARGBCOPYALPHAROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 16)) {
+    ARGBCopyAlphaRow = ARGBCopyAlphaRow_AVX2;
+  }
 #endif
   for (int y = 0; y < height; ++y) {
     ARGBCopyAlphaRow(src_argb, dst_argb, width);
diff --git a/source/row_win.cc b/source/row_win.cc
index 1632732ea..3a8eaefd0 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -3603,37 +3603,106 @@ void CopyRow_X86(const uint8* src, uint8* dst, int count) {
 }
 #endif  // HAS_COPYROW_X86
 
-
 #ifdef HAS_ARGBCOPYALPHAROW_SSE2
 // width in pixels
 __declspec(naked) __declspec(align(16))
 void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
   __asm {
-    mov        edx, edi
     mov        eax, [esp + 4]   // src
-    mov        edi, [esp + 8]   // dst
+    mov        edx, [esp + 8]   // dst
     mov        ecx, [esp + 12]  // count
-    pcmpeqb    xmm5, xmm5       // generate mask 0xff000000
-    pslld      xmm5, 24
+    pcmpeqb    xmm0, xmm0       // generate mask 0xff000000
+    pslld      xmm0, 24
+    pcmpeqb    xmm1, xmm1       // generate mask 0x00ffffff
+    psrld      xmm1, 8
 
-    align      16
+    align      4
   convertloop:
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
+    movdqa     xmm2, [eax]
+    movdqa     xmm3, [eax + 16]
     lea        eax, [eax + 32]
-    maskmovdqu xmm0, xmm5
-    lea        edi, [edi + 16]
-    maskmovdqu xmm1, xmm5
-    lea        edi, [edi + 16]
+    movdqa     xmm4, [edx]
+    movdqa     xmm5, [edx + 16]
+    pand       xmm2, xmm0
+    pand       xmm3, xmm0
+    pand       xmm4, xmm1
+    pand       xmm5, xmm1
+    por        xmm2, xmm4
+    por        xmm3, xmm5
+    movdqa     [edx], xmm2
+    movdqa     [edx + 16], xmm3
+    lea        edx, [edx + 32]
     sub        ecx, 8
     jg         convertloop
 
-    mov        edi, edx
     ret
   }
 }
 #endif  // HAS_ARGBCOPYALPHAROW_SSE2
 
+#ifdef HAS_ARGBCOPYALPHAROW_SSE41
+// Copy alpha from src ARGB pixels into dst; dst color bytes are preserved.
+// width in pixels, 8 pixels per iteration; src and dst may be unaligned.
+__declspec(naked) __declspec(align(16))
+void ARGBCopyAlphaRow_SSE41(const uint8* src, uint8* dst, int width) {
+  __asm {
+    mov        eax, [esp + 4]   // src
+    mov        edx, [esp + 8]   // dst
+    mov        ecx, [esp + 12]  // count
+    pcmpeqb    xmm0, xmm0       // generate mask 0x00ffffff
+    psrld      xmm0, 8
+
+    align      4
+  convertloop:
+    movdqu     xmm1, [eax]
+    movdqu     xmm2, [eax + 16]
+    lea        eax, [eax + 32]
+    // Load dst via movdqu: non-VEX pblendvb faults on an unaligned m128.
+    movdqu     xmm3, [edx]
+    movdqu     xmm4, [edx + 16]
+    pblendvb   xmm1, xmm3, xmm0  // dst color where mask set, src alpha else
+    pblendvb   xmm2, xmm4, xmm0
+    movdqu     [edx], xmm1
+    movdqu     [edx + 16], xmm2
+    lea        edx, [edx + 32]
+    sub        ecx, 8
+    jg         convertloop
+
+    ret
+  }
+}
+#endif  // HAS_ARGBCOPYALPHAROW_SSE41
+
+#ifdef HAS_ARGBCOPYALPHAROW_AVX2
+// width in pixels
+__declspec(naked) __declspec(align(16))
+void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
+  __asm {
+    mov        eax, [esp + 4]   // src
+    mov        edx, [esp + 8]   // dst
+    mov        ecx, [esp + 12]  // count
+    vpcmpeqb   ymm0, ymm0, ymm0 // generate mask 0x00ffffff
+    vpsrld     ymm0, ymm0, 8
+
+    align      4
+  convertloop:
+    vmovdqu    ymm1, [eax]
+    vmovdqu    ymm2, [eax + 32]
+    lea        eax, [eax + 64]
+    vpblendvb  ymm1, ymm1, [edx], ymm0
+    vpblendvb  ymm2, ymm2, [edx + 32], ymm0
+    vmovdqu    [edx], ymm1
+    vmovdqu    [edx + 32], ymm2
+    lea        edx, [edx + 64]
+    sub        ecx, 16
+    jg         convertloop
+
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_ARGBCOPYALPHAROW_AVX2
+
 #ifdef HAS_SETROW_X86
 // SetRow8 writes 'count' bytes using a 32 bit value repeated.
 __declspec(naked) __declspec(align(16))