mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-06 16:56:55 +08:00
SSSE3 version of alpha blender does pshufb instead of shift and 2 pshufw.
BUG=none TEST=none Review URL: https://webrtc-codereview.appspot.com/446008 git-svn-id: http://libyuv.googlecode.com/svn/trunk@219 16f28f9a-4ce2-e073-06de-1de4eb20be90
This commit is contained in:
parent
67be98bd44
commit
8670b1ae04
@ -1,6 +1,6 @@
|
||||
Name: libyuv
|
||||
URL: http://code.google.com/p/libyuv/
|
||||
Version: 218
|
||||
Version: 219
|
||||
License: BSD
|
||||
License File: LICENSE
|
||||
|
||||
|
||||
@ -11,7 +11,7 @@
|
||||
#ifndef INCLUDE_LIBYUV_VERSION_H_
|
||||
#define INCLUDE_LIBYUV_VERSION_H_
|
||||
|
||||
#define LIBYUV_VERSION 218
|
||||
#define LIBYUV_VERSION 219
|
||||
|
||||
#endif // INCLUDE_LIBYUV_VERSION_H_
|
||||
|
||||
|
||||
@ -176,6 +176,15 @@ int ARGBBlend(const uint8* src_argb, int src_stride_argb,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBBLENDROW_SSSE3)
|
||||
if (TestCpuFlag(kCpuHasSSSE3)) {
|
||||
ARGBBlendRow = ARGBBlendRow_SSSE3;
|
||||
if (IS_ALIGNED(width, 4) &&
|
||||
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
|
||||
ARGBBlendRow = ARGBBlendRow_Aligned_SSSE3;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
for (int y = 0; y < height; ++y) {
|
||||
ARGBBlendRow(src_argb, dst_argb, width);
|
||||
|
||||
@ -76,6 +76,10 @@ extern "C" {
|
||||
#define HAS_I420TOABGRROW_NEON
|
||||
#endif
|
||||
|
||||
#if defined(_MSC_VER) && !defined(YUV_DISABLE_ASM)
|
||||
#define HAS_ARGBBLENDROW_SSSE3
|
||||
#endif
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#define SIMD_ALIGNED(var) __declspec(align(16)) var
|
||||
typedef __declspec(align(16)) signed char vec8[16];
|
||||
@ -241,8 +245,11 @@ void YToARGBRow_SSE2(const uint8* y_buf,
|
||||
int width);
|
||||
|
||||
// ARGB preattenuated alpha blend.
|
||||
void ARGBBlendRow_Aligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
|
||||
int width);
|
||||
void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb, uint8* dst_argb,
|
||||
int width);
|
||||
void ARGBBlendRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width);
|
||||
void ARGBBlendRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
|
||||
void ARGBBlendRow_C(const uint8* src_argb, uint8* dst_argb, int width);
|
||||
|
||||
|
||||
@ -1961,7 +1961,6 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
|
||||
#ifdef HAS_ARGBBLENDROW_SSE2
|
||||
// Blend 8 pixels at a time
|
||||
// Destination aligned to 16 bytes, multiple of 4 pixels
|
||||
// TODO(fbarchard): SSSE3 version with pshufb for alpha and maybe pmaddubsw
|
||||
__declspec(naked) __declspec(align(16))
|
||||
void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb, uint8* dst_argb,
|
||||
int width) {
|
||||
@ -1988,7 +1987,7 @@ void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb, uint8* dst_argb,
|
||||
pshufhw xmm3, xmm3,0F5h // 8 alpha words
|
||||
pshuflw xmm3, xmm3,0F5h
|
||||
pand xmm2, xmm6 // _r_b
|
||||
paddw xmm3, xmm7 // 256-alpha
|
||||
paddw xmm3, xmm7 // 256 - alpha
|
||||
pmullw xmm2, xmm3 // _r_b * alpha
|
||||
movdqa xmm1, [edx] // _a_g
|
||||
psrlw xmm1, 8 // _a_g
|
||||
@ -2006,12 +2005,12 @@ void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb, uint8* dst_argb,
|
||||
|
||||
movdqa xmm0, xmm3 // src argb
|
||||
pxor xmm3, xmm4 // ~alpha
|
||||
movdqa xmm2, [edx + 16] // _r_b
|
||||
movdqa xmm2, [edx + 16] // _r_b
|
||||
psrlw xmm3, 8 // alpha
|
||||
pshufhw xmm3, xmm3,0F5h // 8 alpha words
|
||||
pshuflw xmm3, xmm3,0F5h
|
||||
pand xmm2, xmm6 // _r_b
|
||||
paddw xmm3, xmm7 // 256-alpha
|
||||
paddw xmm3, xmm7 // 256 - alpha
|
||||
pmullw xmm2, xmm3 // _r_b * alpha
|
||||
movdqa xmm1, [edx + 16] // _a_g
|
||||
psrlw xmm1, 8 // _a_g
|
||||
@ -2058,7 +2057,7 @@ void ARGBBlendRow1_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
|
||||
pshufhw xmm3, xmm3,0F5h // 8 alpha words
|
||||
pshuflw xmm3, xmm3,0F5h
|
||||
pand xmm2, xmm6 // _r_b
|
||||
paddw xmm3, xmm7 // 256-alpha
|
||||
paddw xmm3, xmm7 // 256 - alpha
|
||||
pmullw xmm2, xmm3 // _r_b * alpha
|
||||
movd xmm1, [edx] // _a_g
|
||||
psrlw xmm1, 8 // _a_g
|
||||
@ -2100,9 +2099,115 @@ void ARGBBlendRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
|
||||
ARGBBlendRow1_SSE2(src_argb, dst_argb, width);
|
||||
}
|
||||
}
|
||||
|
||||
#endif // HAS_ARGBBLENDROW_SSE2
|
||||
|
||||
#ifdef HAS_ARGBBLENDROW_SSSE3
|
||||
// Blend 8 pixels at a time
|
||||
// Shuffle table for isolating alpha.
// For each of the 4 ARGB pixels in a 16 byte register, pshufb with this table
// copies the pixel's alpha byte (source index 3, 7, 11 or 15) into the low
// byte of two consecutive 16 bit words; the 0x80 entries make pshufb write
// zero into the high byte of each word.
|
||||
static const uvec8 kShuffleAlpha = {
|
||||
3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
|
||||
11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
|
||||
};
|
||||
|
||||
// Same as SSE2, but replaces
|
||||
// psrlw xmm3, 8 // alpha
|
||||
// pshufhw xmm3, xmm3,0F5h // 8 alpha words
|
||||
// pshuflw xmm3, xmm3,0F5h
|
||||
// with..
|
||||
// pshufb xmm3, kShuffleAlpha // alpha
|
||||
|
||||
// Destination aligned to 16 bytes, multiple of 4 pixels
|
||||
__declspec(naked) __declspec(align(16))
|
||||
void ARGBBlendRow_Aligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
|
||||
int width) {
|
||||
__asm {
|
||||
mov eax, [esp + 4] // src_argb
|
||||
mov edx, [esp + 8] // dst_argb
|
||||
mov ecx, [esp + 12] // width
|
||||
pcmpeqb xmm7, xmm7 // generate constant 1
|
||||
psrlw xmm7, 15
|
||||
pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff
|
||||
psrlw xmm6, 8
|
||||
pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00
|
||||
psllw xmm5, 8
|
||||
pcmpeqb xmm4, xmm4 // generate mask 0xff000000
|
||||
pslld xmm4, 24
|
||||
|
||||
align 16
|
||||
convertloop:
|
||||
movdqu xmm3, [eax]
|
||||
movdqa xmm0, xmm3 // src argb
|
||||
pxor xmm3, xmm4 // ~alpha
|
||||
pshufb xmm3, kShuffleAlpha // alpha
|
||||
movdqa xmm2, [edx] // _r_b
|
||||
pand xmm2, xmm6 // _r_b
|
||||
paddw xmm3, xmm7 // 256 - alpha
|
||||
pmullw xmm2, xmm3 // _r_b * alpha
|
||||
movdqa xmm1, [edx] // _a_g
|
||||
psrlw xmm1, 8 // _a_g
|
||||
por xmm0, xmm4 // set alpha to 255
|
||||
pmullw xmm1, xmm3 // _a_g * alpha
|
||||
movdqu xmm3, [eax + 16]
|
||||
lea eax, [eax + 32]
|
||||
psrlw xmm2, 8 // _r_b convert to 8 bits again
|
||||
paddusb xmm0, xmm2 // + src argb
|
||||
pand xmm1, xmm5 // a_g_ convert to 8 bits again
|
||||
paddusb xmm0, xmm1 // + src argb
|
||||
sub ecx, 4
|
||||
movdqa [edx], xmm0
|
||||
jle done
|
||||
|
||||
movdqa xmm0, xmm3 // src argb
|
||||
pxor xmm3, xmm4 // ~alpha
|
||||
movdqa xmm2, [edx + 16] // _r_b
|
||||
pshufb xmm3, kShuffleAlpha // alpha
|
||||
pand xmm2, xmm6 // _r_b
|
||||
paddw xmm3, xmm7 // 256 - alpha
|
||||
pmullw xmm2, xmm3 // _r_b * alpha
|
||||
movdqa xmm1, [edx + 16] // _a_g
|
||||
psrlw xmm1, 8 // _a_g
|
||||
por xmm0, xmm4 // set alpha to 255
|
||||
pmullw xmm1, xmm3 // _a_g * alpha
|
||||
psrlw xmm2, 8 // _r_b convert to 8 bits again
|
||||
paddusb xmm0, xmm2 // + src argb
|
||||
pand xmm1, xmm5 // a_g_ convert to 8 bits again
|
||||
paddusb xmm0, xmm1 // + src argb
|
||||
sub ecx, 4
|
||||
movdqa [edx + 16], xmm0
|
||||
lea edx, [edx + 32]
|
||||
jg convertloop
|
||||
|
||||
done:
|
||||
ret
|
||||
}
|
||||
}
|
||||
|
||||
// Blends a row of preattenuated ARGB with no alignment or width restriction.
// Leading pixels are blended with ARGBBlendRow1_SSE2 until dst_argb reaches a
// 16 byte boundary, the largest multiple of 4 pixels that remains goes through
// ARGBBlendRow_Aligned_SSSE3, and any 1 to 3 trailing pixels are blended with
// ARGBBlendRow1_SSE2 again.
// If dst_argb is not even 4 byte aligned it can never reach a 16 byte
// boundary in whole pixels, so the entire row is blended by
// ARGBBlendRow1_SSE2.
void ARGBBlendRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
  // Nothing to blend.  The aligned kernel always processes at least one group
  // of pixels before testing its count, so never call a helper with 0.
  if (width <= 0) {
    return;
  }
  // Do 1 to 3 pixels to get destination aligned.
  if ((uintptr_t)(dst_argb) & 15) {
    int count = width;
    if (((intptr_t)(dst_argb) & 3) == 0) {
      count = (-(intptr_t)(dst_argb) >> 2) & 3;
      // A row shorter than the distance to the next 16 byte boundary must not
      // be overrun; without this clamp width would also go negative below and
      // the aligned kernel would be run on pixels outside the row.
      if (count > width) {
        count = width;
      }
    }
    ARGBBlendRow1_SSE2(src_argb, dst_argb, count);
    src_argb += count * 4;
    dst_argb += count * 4;
    width -= count;
  }
  // Do multiple of 4 pixels
  if (width & ~3) {
    ARGBBlendRow_Aligned_SSSE3(src_argb, dst_argb, width & ~3);
  }
  // Do remaining 1 to 3 pixels
  if (width & 3) {
    src_argb += (width & ~3) * 4;
    dst_argb += (width & ~3) * 4;
    width &= 3;
    ARGBBlendRow1_SSE2(src_argb, dst_argb, width);
  }
}
|
||||
#endif // HAS_ARGBBLENDROW_SSSE3
|
||||
|
||||
#endif // _M_IX86
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user