mirror of https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-07 09:16:48 +08:00
Use simple masking for the AVX2 version of CopyAlpha so it can be implemented with a more generic bit-mask function in the future, and use more broadly known and well-optimized opcodes that will always be fast. Same performance as vblend.
BUG=none
TEST=CopyAlpha*
R=johannkoenig@google.com

Review URL: https://webrtc-codereview.appspot.com/2393005

git-svn-id: http://libyuv.googlecode.com/svn/trunk@813 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent f6631bb814
commit 3075de8285
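Before the hunks, a minimal scalar sketch of the technique the commit adopts (the function name is illustrative, not libyuv's): CopyAlpha keeps the alpha byte of each source pixel and the B, G, R bytes of each destination pixel, so a byte blend can be replaced by two ANDs and an OR against constant masks.

#include <stdint.h>

// Scalar sketch of the masking scheme, assuming little-endian ARGB pixels
// stored as uint32 with alpha in the top byte. Not libyuv code.
static void CopyAlphaMask_C(const uint32_t* src_argb, uint32_t* dst_argb,
                            int width) {
  const uint32_t kAlphaMask = 0xff000000u;  // alpha byte of each pixel
  const uint32_t kColorMask = 0x00ffffffu;  // B, G, R bytes of each pixel
  for (int i = 0; i < width; ++i) {
    dst_argb[i] = (src_argb[i] & kAlphaMask) | (dst_argb[i] & kColorMask);
  }
}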
README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 812
+Version: 813
 License: BSD
 License File: LICENSE
 
include/libyuv/row.h
@@ -168,7 +168,6 @@ extern "C" {
 // TODO(fbarchard): Optimize and enable
 // #define HAS_ARGBLUMACOLORTABLEROW_SSSE3
 #define HAS_ARGBCOPYALPHAROW_SSE2
-#define HAS_ARGBCOPYALPHAROW_SSE41
 
 // Caveat: Visual C 2012 required for AVX2.
 #if _MSC_VER >= 1700
@@ -702,7 +701,6 @@ void CopyRow_C(const uint8* src, uint8* dst, int count);
 
 void ARGBCopyAlphaRow_C(const uint8* src, uint8* dst, int width);
 void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width);
-void ARGBCopyAlphaRow_SSE41(const uint8* src, uint8* dst, int width);
 void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width);
 
 void SetRow_X86(uint8* dst, uint32 v32, int count);
include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 812
+#define LIBYUV_VERSION 813
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
source/planar_functions.cc
@@ -2195,11 +2195,6 @@ int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb,
     ARGBCopyAlphaRow = ARGBCopyAlphaRow_SSE2;
   }
 #endif
-#if defined(HAS_ARGBCOPYALPHAROW_SSE41)
-  if (TestCpuFlag(kCpuHasSSE41) && IS_ALIGNED(width, 8)) {
-    ARGBCopyAlphaRow = ARGBCopyAlphaRow_SSE41;
-  }
-#endif
 #if defined(HAS_ARGBCOPYALPHAROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 16)) {
     ARGBCopyAlphaRow = ARGBCopyAlphaRow_AVX2;
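The hunk above shows libyuv's usual runtime dispatch: start from the scalar row function and upgrade to the widest SIMD kernel the CPU flags and the row width permit. A self-contained sketch of the idiom, with hypothetical stand-ins for TestCpuFlag(kCpuHasAVX2) and the row kernels:

#include <stdbool.h>
#include <stdint.h>

typedef void (*CopyAlphaRowFn)(const uint8_t* src, uint8_t* dst, int width);

// Always-correct scalar fallback: copy byte 3 (alpha) of each ARGB pixel.
static void CopyAlphaRow_Scalar(const uint8_t* src, uint8_t* dst, int width) {
  for (int i = 0; i < width; ++i) dst[i * 4 + 3] = src[i * 4 + 3];
}
static bool CpuHasAVX2(void) { return false; }  // stub; query CPUID in practice
static void CopyAlphaRow_Wide(const uint8_t* src, uint8_t* dst, int width) {
  CopyAlphaRow_Scalar(src, dst, width);  // placeholder for the AVX2 kernel
}

static CopyAlphaRowFn PickCopyAlphaRow(int width) {
  CopyAlphaRowFn fn = CopyAlphaRow_Scalar;
  if (CpuHasAVX2() && (width % 16) == 0) {  // widest kernel that fits the row
    fn = CopyAlphaRow_Wide;
  }
  return fn;
}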
source/row_win.cc
@@ -3640,35 +3640,6 @@ void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
 }
 #endif  // HAS_ARGBCOPYALPHAROW_SSE2
 
-#ifdef HAS_ARGBCOPYALPHAROW_SSE41
-// width in pixels
-__declspec(naked) __declspec(align(16))
-void ARGBCopyAlphaRow_SSE41(const uint8* src, uint8* dst, int width) {
-  __asm {
-    mov        eax, [esp + 4]   // src
-    mov        edx, [esp + 8]   // dst
-    mov        ecx, [esp + 12]  // count
-    pcmpeqb    xmm0, xmm0       // generate mask 0x00ffffff
-    psrld      xmm0, 8
-
-    align      4
- convertloop:
-    movdqu     xmm1, [eax]
-    movdqu     xmm2, [eax + 16]
-    lea        eax, [eax + 32]
-    pblendvb   xmm1, [edx], xmm0
-    pblendvb   xmm2, [edx + 16], xmm0
-    movdqu     [edx], xmm1
-    movdqu     [edx + 16], xmm2
-    lea        edx, [edx + 32]
-    sub        ecx, 8
-    jg         convertloop
-
-    ret
-  }
-}
-#endif  // HAS_ARGBCOPYALPHAROW_SSE41
-
 #ifdef HAS_ARGBCOPYALPHAROW_AVX2
 // width in pixels
 __declspec(naked) __declspec(align(16))
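For contrast with the masking version below, here is a hedged intrinsics rendering of the pblendvb scheme just deleted (the function name and the four-pixels-per-step width are illustrative): the 0x00ffffff mask makes the blend take the B, G, R bytes from dst and the alpha byte from src.

#include <smmintrin.h>  // SSE4.1
#include <stdint.h>

// Sketch of the removed blend approach; width assumed a multiple of 4.
// Not libyuv code.
static void CopyAlphaRow_SSE41_Sketch(const uint8_t* src, uint8_t* dst,
                                      int width) {
  const __m128i kColorMask = _mm_set1_epi32(0x00ffffff);  // pcmpeqb + psrld
  for (int i = 0; i < width; i += 4) {
    __m128i s = _mm_loadu_si128((const __m128i*)(src + i * 4));
    __m128i d = _mm_loadu_si128((const __m128i*)(dst + i * 4));
    // pblendvb: where a mask byte is 0xff take d (color), else keep s (alpha).
    _mm_storeu_si128((__m128i*)(dst + i * 4),
                     _mm_blendv_epi8(s, d, kColorMask));
  }
}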
@@ -3677,18 +3648,21 @@ void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
     mov        eax, [esp + 4]   // src
     mov        edx, [esp + 8]   // dst
     mov        ecx, [esp + 12]  // count
-    vpcmpeqb   ymm0, ymm0, ymm0 // generate mask 0x00ffffff
-    vpsrld     ymm0, ymm0, 8
+    vpcmpeqb   ymm0, ymm0, ymm0
+    vpsrld     ymm1, ymm0, 8    // generate mask 0x00ffffff
+    vpslld     ymm0, ymm0, 24   // generate mask 0xff000000
 
     align      4
  convertloop:
-    vmovdqu    ymm1, [eax]
-    vmovdqu    ymm2, [eax + 32]
+    vpand      ymm2, ymm0, [eax]
+    vpand      ymm3, ymm0, [eax + 32]
     lea        eax, [eax + 64]
-    vpblendvb  ymm1, ymm1, [edx], ymm0
-    vpblendvb  ymm2, ymm2, [edx + 32], ymm0
-    vmovdqu    [edx], ymm1
-    vmovdqu    [edx + 32], ymm2
+    vpand      ymm4, ymm1, [edx]
+    vpand      ymm5, ymm1, [edx + 32]
+    vpor       ymm2, ymm2, ymm4
+    vpor       ymm3, ymm3, ymm5
+    vmovdqu    [edx], ymm2
+    vmovdqu    [edx + 32], ymm3
     lea        edx, [edx + 64]
     sub        ecx, 16
     jg         convertloop
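The same loop in hedged AVX2 intrinsics (illustrative function name; eight pixels per iteration rather than the sixteen the asm handles, and width assumed a multiple of 8): two vpands isolate src alpha and dst color, one vpor merges them.

#include <immintrin.h>  // AVX2
#include <stdint.h>

// Intrinsics rendering of the masking loop above; not libyuv code.
static void CopyAlphaRow_AVX2_Sketch(const uint8_t* src, uint8_t* dst,
                                     int width) {
  const __m256i kAlphaMask = _mm256_set1_epi32((int)0xff000000);  // vpslld 24
  const __m256i kColorMask = _mm256_set1_epi32(0x00ffffff);       // vpsrld 8
  for (int i = 0; i < width; i += 8) {
    __m256i s = _mm256_loadu_si256((const __m256i*)(src + i * 4));
    __m256i d = _mm256_loadu_si256((const __m256i*)(dst + i * 4));
    __m256i a = _mm256_and_si256(s, kAlphaMask);  // src alpha bytes
    __m256i c = _mm256_and_si256(d, kColorMask);  // dst B, G, R bytes
    _mm256_storeu_si256((__m256i*)(dst + i * 4), _mm256_or_si256(a, c));
  }
}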
@@ -6958,7 +6932,8 @@ void ARGBPolynomialRow_SSE2(const uint8* src_argb,
     // 2 pixel loop.
     align      16
  convertloop:
-//    (slow) pmovzxbd  xmm0, dword ptr [eax]  // BGRA pixel
+//    pmovzxbd  xmm0, dword ptr [eax]  // BGRA pixel
+//    pmovzxbd  xmm4, dword ptr [eax + 4]  // BGRA pixel
     movq       xmm0, qword ptr [eax]  // BGRABGRA
     lea        eax, [eax + 8]
     punpcklbw  xmm0, xmm3
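This last hunk only rewords a comment in ARGBPolynomialRow_SSE2: the one-pixel pmovzxbd loads were measured slow there, so the code keeps a movq plus punpcklbw-with-zero that widens two pixels at once. A hedged sketch of the two widening routes (hypothetical helpers, not libyuv code):

#include <smmintrin.h>  // SSE4.1 for pmovzxbd
#include <stdint.h>
#include <string.h>

static __m128i WidenOnePixel_SSE41(const uint8_t* p) {
  int32_t px;
  memcpy(&px, p, 4);  // one BGRA pixel
  return _mm_cvtepu8_epi32(_mm_cvtsi32_si128(px));  // pmovzxbd: 4 x uint32
}
static __m128i WidenTwoPixelsLow_SSE2(const uint8_t* p) {
  __m128i v = _mm_loadl_epi64((const __m128i*)p);      // movq: BGRABGRA
  return _mm_unpacklo_epi8(v, _mm_setzero_si128());    // punpcklbw with zero
}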