mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2026-02-09 11:16:43 +08:00
CopyAlpha AVX2
BUG=none
TEST=Alpha*
R=ryanpetrie@google.com
Review URL: https://webrtc-codereview.appspot.com/2392004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@812 16f28f9a-4ce2-e073-06de-1de4eb20be90
This commit is contained in:
parent
88ce3c0caa
commit
f6631bb814
@ -1,6 +1,6 @@
|
|||||||
Name: libyuv
|
Name: libyuv
|
||||||
URL: http://code.google.com/p/libyuv/
|
URL: http://code.google.com/p/libyuv/
|
||||||
Version: 811
|
Version: 812
|
||||||
License: BSD
|
License: BSD
|
||||||
License File: LICENSE
|
License File: LICENSE
|
||||||
|
|
||||||
|
|||||||
@ -167,8 +167,8 @@ extern "C" {
|
|||||||
// Effects:
|
// Effects:
|
||||||
// TODO(fbarchard): Optimize and enable
|
// TODO(fbarchard): Optimize and enable
|
||||||
// #define HAS_ARGBLUMACOLORTABLEROW_SSSE3
|
// #define HAS_ARGBLUMACOLORTABLEROW_SSSE3
|
||||||
// TODO(fbarchard): Optimize and enable
|
#define HAS_ARGBCOPYALPHAROW_SSE2
|
||||||
// #define HAS_ARGBCOPYALPHAROW_SSE2
|
#define HAS_ARGBCOPYALPHAROW_SSE41
|
||||||
|
|
||||||
// Caveat: Visual C 2012 required for AVX2.
|
// Caveat: Visual C 2012 required for AVX2.
|
||||||
#if _MSC_VER >= 1700
|
#if _MSC_VER >= 1700
|
||||||
@ -187,6 +187,7 @@ extern "C" {
|
|||||||
#define HAS_YUY2TOUV422ROW_AVX2
|
#define HAS_YUY2TOUV422ROW_AVX2
|
||||||
#define HAS_YUY2TOUVROW_AVX2
|
#define HAS_YUY2TOUVROW_AVX2
|
||||||
#define HAS_YUY2TOYROW_AVX2
|
#define HAS_YUY2TOYROW_AVX2
|
||||||
|
#define HAS_ARGBCOPYALPHAROW_AVX2
|
||||||
|
|
||||||
// Effects:
|
// Effects:
|
||||||
#define HAS_ARGBADDROW_AVX2
|
#define HAS_ARGBADDROW_AVX2
|
||||||
@ -701,6 +702,8 @@ void CopyRow_C(const uint8* src, uint8* dst, int count);
|
|||||||
|
|
||||||
void ARGBCopyAlphaRow_C(const uint8* src, uint8* dst, int width);
|
void ARGBCopyAlphaRow_C(const uint8* src, uint8* dst, int width);
|
||||||
void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width);
|
void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width);
|
||||||
|
void ARGBCopyAlphaRow_SSE41(const uint8* src, uint8* dst, int width);
|
||||||
|
void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width);
|
||||||
|
|
||||||
void SetRow_X86(uint8* dst, uint32 v32, int count);
|
void SetRow_X86(uint8* dst, uint32 v32, int count);
|
||||||
void ARGBSetRows_X86(uint8* dst, uint32 v32, int width,
|
void ARGBSetRows_X86(uint8* dst, uint32 v32, int width,
|
||||||
|
|||||||
@ -11,6 +11,6 @@
|
|||||||
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
|
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
|
||||||
#define INCLUDE_LIBYUV_VERSION_H_
|
#define INCLUDE_LIBYUV_VERSION_H_
|
||||||
|
|
||||||
#define LIBYUV_VERSION 811
|
#define LIBYUV_VERSION 812
|
||||||
|
|
||||||
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
|
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
|
||||||
|
|||||||
@ -2188,9 +2188,22 @@ int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb,
|
|||||||
void (*ARGBCopyAlphaRow)(const uint8* src_argb, uint8* dst_argb, int width) =
|
void (*ARGBCopyAlphaRow)(const uint8* src_argb, uint8* dst_argb, int width) =
|
||||||
ARGBCopyAlphaRow_C;
|
ARGBCopyAlphaRow_C;
|
||||||
#if defined(HAS_ARGBCOPYALPHAROW_SSE2)
|
#if defined(HAS_ARGBCOPYALPHAROW_SSE2)
|
||||||
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8)) {
|
if (TestCpuFlag(kCpuHasSSE2) &&
|
||||||
|
IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
|
||||||
|
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16) &&
|
||||||
|
IS_ALIGNED(width, 8)) {
|
||||||
ARGBCopyAlphaRow = ARGBCopyAlphaRow_SSE2;
|
ARGBCopyAlphaRow = ARGBCopyAlphaRow_SSE2;
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
#if defined(HAS_ARGBCOPYALPHAROW_SSE41)
|
||||||
|
if (TestCpuFlag(kCpuHasSSE41) && IS_ALIGNED(width, 8)) {
|
||||||
|
ARGBCopyAlphaRow = ARGBCopyAlphaRow_SSE41;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
#if defined(HAS_ARGBCOPYALPHAROW_AVX2)
|
||||||
|
if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 16)) {
|
||||||
|
ARGBCopyAlphaRow = ARGBCopyAlphaRow_AVX2;
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
for (int y = 0; y < height; ++y) {
|
for (int y = 0; y < height; ++y) {
|
||||||
ARGBCopyAlphaRow(src_argb, dst_argb, width);
|
ARGBCopyAlphaRow(src_argb, dst_argb, width);
|
||||||
|
|||||||
@ -3603,37 +3603,102 @@ void CopyRow_X86(const uint8* src, uint8* dst, int count) {
|
|||||||
}
|
}
|
||||||
#endif // HAS_COPYROW_X86
|
#endif // HAS_COPYROW_X86
|
||||||
|
|
||||||
|
|
||||||
#ifdef HAS_ARGBCOPYALPHAROW_SSE2
|
#ifdef HAS_ARGBCOPYALPHAROW_SSE2
|
||||||
// width in pixels
|
// width in pixels
|
||||||
__declspec(naked) __declspec(align(16))
|
__declspec(naked) __declspec(align(16))
|
||||||
void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
|
void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
|
||||||
__asm {
|
__asm {
|
||||||
mov edx, edi
|
|
||||||
mov eax, [esp + 4] // src
|
mov eax, [esp + 4] // src
|
||||||
mov edi, [esp + 8] // dst
|
mov edx, [esp + 8] // dst
|
||||||
mov ecx, [esp + 12] // count
|
mov ecx, [esp + 12] // count
|
||||||
pcmpeqb xmm5, xmm5 // generate mask 0xff000000
|
pcmpeqb xmm0, xmm0 // generate mask 0xff000000
|
||||||
pslld xmm5, 24
|
pslld xmm0, 24
|
||||||
|
pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff
|
||||||
|
psrld xmm1, 8
|
||||||
|
|
||||||
align 16
|
align 4
|
||||||
convertloop:
|
convertloop:
|
||||||
movdqa xmm0, [eax]
|
movdqa xmm2, [eax]
|
||||||
movdqa xmm1, [eax + 16]
|
movdqa xmm3, [eax + 16]
|
||||||
lea eax, [eax + 32]
|
lea eax, [eax + 32]
|
||||||
maskmovdqu xmm0, xmm5
|
movdqa xmm4, [edx]
|
||||||
lea edi, [edi + 16]
|
movdqa xmm5, [edx + 16]
|
||||||
maskmovdqu xmm1, xmm5
|
pand xmm2, xmm0
|
||||||
lea edi, [edi + 16]
|
pand xmm3, xmm0
|
||||||
|
pand xmm4, xmm1
|
||||||
|
pand xmm5, xmm1
|
||||||
|
por xmm2, xmm4
|
||||||
|
por xmm3, xmm5
|
||||||
|
movdqa [edx], xmm2
|
||||||
|
movdqa [edx + 16], xmm3
|
||||||
|
lea edx, [edx + 32]
|
||||||
sub ecx, 8
|
sub ecx, 8
|
||||||
jg convertloop
|
jg convertloop
|
||||||
|
|
||||||
mov edi, edx
|
|
||||||
ret
|
ret
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif // HAS_ARGBCOPYALPHAROW_SSE2
|
#endif // HAS_ARGBCOPYALPHAROW_SSE2
|
||||||
|
|
||||||
|
#ifdef HAS_ARGBCOPYALPHAROW_SSE41
|
||||||
|
// width in pixels
|
||||||
|
__declspec(naked) __declspec(align(16))
|
||||||
|
void ARGBCopyAlphaRow_SSE41(const uint8* src, uint8* dst, int width) {
|
||||||
|
__asm {
|
||||||
|
mov eax, [esp + 4] // src
|
||||||
|
mov edx, [esp + 8] // dst
|
||||||
|
mov ecx, [esp + 12] // count
|
||||||
|
pcmpeqb xmm0, xmm0 // generate mask 0x00ffffff
|
||||||
|
psrld xmm0, 8
|
||||||
|
|
||||||
|
align 4
|
||||||
|
convertloop:
|
||||||
|
movdqu xmm1, [eax]
|
||||||
|
movdqu xmm2, [eax + 16]
|
||||||
|
lea eax, [eax + 32]
|
||||||
|
pblendvb xmm1, [edx], xmm0
|
||||||
|
pblendvb xmm2, [edx + 16], xmm0
|
||||||
|
movdqu [edx], xmm1
|
||||||
|
movdqu [edx + 16], xmm2
|
||||||
|
lea edx, [edx + 32]
|
||||||
|
sub ecx, 8
|
||||||
|
jg convertloop
|
||||||
|
|
||||||
|
ret
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif // HAS_ARGBCOPYALPHAROW_SSE41
|
||||||
|
|
||||||
|
#ifdef HAS_ARGBCOPYALPHAROW_AVX2
|
||||||
|
// width in pixels
|
||||||
|
__declspec(naked) __declspec(align(16))
|
||||||
|
void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
|
||||||
|
__asm {
|
||||||
|
mov eax, [esp + 4] // src
|
||||||
|
mov edx, [esp + 8] // dst
|
||||||
|
mov ecx, [esp + 12] // count
|
||||||
|
vpcmpeqb ymm0, ymm0, ymm0 // generate mask 0x00ffffff
|
||||||
|
vpsrld ymm0, ymm0, 8
|
||||||
|
|
||||||
|
align 4
|
||||||
|
convertloop:
|
||||||
|
vmovdqu ymm1, [eax]
|
||||||
|
vmovdqu ymm2, [eax + 32]
|
||||||
|
lea eax, [eax + 64]
|
||||||
|
vpblendvb ymm1, ymm1, [edx], ymm0
|
||||||
|
vpblendvb ymm2, ymm2, [edx + 32], ymm0
|
||||||
|
vmovdqu [edx], ymm1
|
||||||
|
vmovdqu [edx + 32], ymm2
|
||||||
|
lea edx, [edx + 64]
|
||||||
|
sub ecx, 16
|
||||||
|
jg convertloop
|
||||||
|
|
||||||
|
vzeroupper
|
||||||
|
ret
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif // HAS_ARGBCOPYALPHAROW_AVX2
|
||||||
|
|
||||||
#ifdef HAS_SETROW_X86
|
#ifdef HAS_SETROW_X86
|
||||||
// SetRow8 writes 'count' bytes using a 32 bit value repeated.
|
// SetRow8 writes 'count' bytes using a 32 bit value repeated.
|
||||||
__declspec(naked) __declspec(align(16))
|
__declspec(naked) __declspec(align(16))
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user