From 3075de82856a044ebd3e808b2f0918d2b0e9713c Mon Sep 17 00:00:00 2001
From: "fbarchard@google.com"
Date: Tue, 15 Oct 2013 00:32:29 +0000
Subject: [PATCH] Use simple masking for AVX2 version of CopyAlpha so it can be implemented using a more generic bit mask function in future, and use more broadly known and optimized opcodes that will always be fast. Same performance as vblend.

BUG=none
TEST=CopyAlpha*
R=johannkoenig@google.com

Review URL: https://webrtc-codereview.appspot.com/2393005

git-svn-id: http://libyuv.googlecode.com/svn/trunk@813 16f28f9a-4ce2-e073-06de-1de4eb20be90
---
 README.chromium            |  2 +-
 include/libyuv/row.h       |  2 --
 include/libyuv/version.h   |  2 +-
 source/planar_functions.cc |  5 ----
 source/row_win.cc          | 51 ++++++++++----------------------------
 5 files changed, 15 insertions(+), 47 deletions(-)

diff --git a/README.chromium b/README.chromium
index ea02c6053..73080d93e 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 812
+Version: 813
 License: BSD
 License File: LICENSE

diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 84f3bab10..69a7076c0 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -168,7 +168,6 @@ extern "C" {
 // TODO(fbarchard): Optimize and enable
 // #define HAS_ARGBLUMACOLORTABLEROW_SSSE3
 #define HAS_ARGBCOPYALPHAROW_SSE2
-#define HAS_ARGBCOPYALPHAROW_SSE41

 // Caveat: Visual C 2012 required for AVX2.
 #if _MSC_VER >= 1700
@@ -702,7 +701,6 @@ void CopyRow_C(const uint8* src, uint8* dst, int count);

 void ARGBCopyAlphaRow_C(const uint8* src, uint8* dst, int width);
 void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width);
-void ARGBCopyAlphaRow_SSE41(const uint8* src, uint8* dst, int width);
 void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width);

 void SetRow_X86(uint8* dst, uint32 v32, int count);
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 564fd3f63..cd645723d 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_

-#define LIBYUV_VERSION 812
+#define LIBYUV_VERSION 813

 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index 65993c93c..243a87f59 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -2195,11 +2195,6 @@ int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb,
     ARGBCopyAlphaRow = ARGBCopyAlphaRow_SSE2;
   }
 #endif
-#if defined(HAS_ARGBCOPYALPHAROW_SSE41)
-  if (TestCpuFlag(kCpuHasSSE41) && IS_ALIGNED(width, 8)) {
-    ARGBCopyAlphaRow = ARGBCopyAlphaRow_SSE41;
-  }
-#endif
 #if defined(HAS_ARGBCOPYALPHAROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 16)) {
     ARGBCopyAlphaRow = ARGBCopyAlphaRow_AVX2;
diff --git a/source/row_win.cc b/source/row_win.cc
index 3a8eaefd0..dd2152f34 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -3640,35 +3640,6 @@ void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
 }
 #endif // HAS_ARGBCOPYALPHAROW_SSE2

-#ifdef HAS_ARGBCOPYALPHAROW_SSE41
-// width in pixels
-__declspec(naked) __declspec(align(16))
-void ARGBCopyAlphaRow_SSE41(const uint8* src, uint8* dst, int width) {
-  __asm {
-    mov eax, [esp + 4] // src
-    mov edx, [esp + 8] // dst
-    mov ecx, [esp + 12] // count
-    pcmpeqb xmm0, xmm0 // generate mask 0x00ffffff
-    psrld xmm0, 8
-
-    align 4
- convertloop:
-    movdqu xmm1, [eax]
-    movdqu xmm2, [eax + 16]
-    lea eax, [eax + 32]
-    pblendvb xmm1, [edx], xmm0
-    pblendvb xmm2, [edx + 16], xmm0
-    movdqu [edx], xmm1
-    movdqu [edx + 16], xmm2
-    lea edx, [edx + 32]
-    sub ecx, 8
-    jg convertloop
-
-    ret
-  }
-}
-#endif // HAS_ARGBCOPYALPHAROW_SSE41
-
 #ifdef HAS_ARGBCOPYALPHAROW_AVX2
 // width in pixels
 __declspec(naked) __declspec(align(16))
@@ -3677,18 +3648,21 @@ void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
   __asm {
     mov eax, [esp + 4] // src
     mov edx, [esp + 8] // dst
     mov ecx, [esp + 12] // count
-    vpcmpeqb ymm0, ymm0, ymm0 // generate mask 0x00ffffff
-    vpsrld ymm0, ymm0, 8
+    vpcmpeqb ymm0, ymm0, ymm0
+    vpsrld ymm1, ymm0, 8 // generate mask 0x00ffffff
+    vpslld ymm0, ymm0, 24 // generate mask 0xff000000
     align 4
 convertloop:
-    vmovdqu ymm1, [eax]
-    vmovdqu ymm2, [eax + 32]
+    vpand ymm2, ymm0, [eax]
+    vpand ymm3, ymm0, [eax + 32]
     lea eax, [eax + 64]
-    vpblendvb ymm1, ymm1, [edx], ymm0
-    vpblendvb ymm2, ymm2, [edx + 32], ymm0
-    vmovdqu [edx], ymm1
-    vmovdqu [edx + 32], ymm2
+    vpand ymm4, ymm1, [edx]
+    vpand ymm5, ymm1, [edx + 32]
+    vpor ymm2, ymm2, ymm4
+    vpor ymm3, ymm3, ymm5
+    vmovdqu [edx], ymm2
+    vmovdqu [edx + 32], ymm3
     lea edx, [edx + 64]
     sub ecx, 16
     jg convertloop
@@ -6958,7 +6932,8 @@ void ARGBPolynomialRow_SSE2(const uint8* src_argb,
     // 2 pixel loop.
     align 16
 convertloop:
-// (slow) pmovzxbd xmm0, dword ptr [eax] // BGRA pixel
+// pmovzxbd xmm0, dword ptr [eax] // BGRA pixel
+// pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel
     movq xmm0, qword ptr [eax] // BGRABGRA
     lea eax, [eax + 8]
     punpcklbw xmm0, xmm3
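
For anyone reading the diff above, the change in ARGBCopyAlphaRow_AVX2 replaces vpblendvb with a plain mask-and-merge: keep the alpha byte of each source pixel (mask 0xff000000), keep the color bytes of each destination pixel (mask 0x00ffffff), and OR the two together. Below is a minimal scalar C sketch of that idea; the function name CopyAlphaRow_MaskC is invented here for illustration and is not part of libyuv (the library's portable fallback is ARGBCopyAlphaRow_C).

#include <stdint.h>
#include <string.h>

// Illustrative only: ARGB pixels are stored little-endian as B,G,R,A bytes,
// so in each 32-bit word the alpha channel is the high byte. The mask
// 0xff000000 selects alpha from the source; 0x00ffffff keeps BGR in the dest.
static void CopyAlphaRow_MaskC(const uint8_t* src, uint8_t* dst, int width) {
  for (int i = 0; i < width; ++i) {
    uint32_t s, d;
    memcpy(&s, src + 4 * i, 4);                 // load source pixel
    memcpy(&d, dst + 4 * i, 4);                 // load destination pixel
    d = (s & 0xff000000u) | (d & 0x00ffffffu);  // alpha from src, BGR from dst
    memcpy(dst + 4 * i, &d, 4);                 // store merged pixel
  }
}

The AVX2 row in the patch performs the same merge 16 pixels at a time: vpand against 0xff000000 on the source block, vpand against 0x00ffffff on the destination block, then vpor to combine, which is why the loop decrements the count by 16 per iteration and why ARGBCopyAlpha only selects this path when the width is a multiple of 16.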