mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-06 16:56:55 +08:00
SSSE3 version of alpha blender does pshufb instead of shift and 2 pshufw.
BUG=none TEST=none Review URL: https://webrtc-codereview.appspot.com/446008 git-svn-id: http://libyuv.googlecode.com/svn/trunk@219 16f28f9a-4ce2-e073-06de-1de4eb20be90
This commit is contained in:
parent
67be98bd44
commit
8670b1ae04
@ -1,6 +1,6 @@
|
|||||||
Name: libyuv
|
Name: libyuv
|
||||||
URL: http://code.google.com/p/libyuv/
|
URL: http://code.google.com/p/libyuv/
|
||||||
Version: 218
|
Version: 219
|
||||||
License: BSD
|
License: BSD
|
||||||
License File: LICENSE
|
License File: LICENSE
|
||||||
|
|
||||||
|
|||||||
@ -11,7 +11,7 @@
|
|||||||
#ifndef INCLUDE_LIBYUV_VERSION_H_
|
#ifndef INCLUDE_LIBYUV_VERSION_H_
|
||||||
#define INCLUDE_LIBYUV_VERSION_H_
|
#define INCLUDE_LIBYUV_VERSION_H_
|
||||||
|
|
||||||
#define LIBYUV_VERSION 218
|
#define LIBYUV_VERSION 219
|
||||||
|
|
||||||
#endif // INCLUDE_LIBYUV_VERSION_H_
|
#endif // INCLUDE_LIBYUV_VERSION_H_
|
||||||
|
|
||||||
|
|||||||
@ -176,6 +176,15 @@ int ARGBBlend(const uint8* src_argb, int src_stride_argb,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
#if defined(HAS_ARGBBLENDROW_SSSE3)
|
||||||
|
if (TestCpuFlag(kCpuHasSSSE3)) {
|
||||||
|
ARGBBlendRow = ARGBBlendRow_SSSE3;
|
||||||
|
if (IS_ALIGNED(width, 4) &&
|
||||||
|
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
|
||||||
|
ARGBBlendRow = ARGBBlendRow_Aligned_SSSE3;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
for (int y = 0; y < height; ++y) {
|
for (int y = 0; y < height; ++y) {
|
||||||
ARGBBlendRow(src_argb, dst_argb, width);
|
ARGBBlendRow(src_argb, dst_argb, width);
|
||||||
|
|||||||
@ -76,6 +76,10 @@ extern "C" {
|
|||||||
#define HAS_I420TOABGRROW_NEON
|
#define HAS_I420TOABGRROW_NEON
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if defined(_MSC_VER) && !defined(YUV_DISABLE_ASM)
|
||||||
|
#define HAS_ARGBBLENDROW_SSSE3
|
||||||
|
#endif
|
||||||
|
|
||||||
#if defined(_MSC_VER)
|
#if defined(_MSC_VER)
|
||||||
#define SIMD_ALIGNED(var) __declspec(align(16)) var
|
#define SIMD_ALIGNED(var) __declspec(align(16)) var
|
||||||
typedef __declspec(align(16)) signed char vec8[16];
|
typedef __declspec(align(16)) signed char vec8[16];
|
||||||
@ -241,8 +245,11 @@ void YToARGBRow_SSE2(const uint8* y_buf,
|
|||||||
int width);
|
int width);
|
||||||
|
|
||||||
// ARGB preattenuated alpha blend.
|
// ARGB preattenuated alpha blend.
|
||||||
|
void ARGBBlendRow_Aligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
|
||||||
|
int width);
|
||||||
void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb, uint8* dst_argb,
|
void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb, uint8* dst_argb,
|
||||||
int width);
|
int width);
|
||||||
|
void ARGBBlendRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width);
|
||||||
void ARGBBlendRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
|
void ARGBBlendRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
|
||||||
void ARGBBlendRow_C(const uint8* src_argb, uint8* dst_argb, int width);
|
void ARGBBlendRow_C(const uint8* src_argb, uint8* dst_argb, int width);
|
||||||
|
|
||||||
|
|||||||
@ -1961,7 +1961,6 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
|
|||||||
#ifdef HAS_ARGBBLENDROW_SSE2
|
#ifdef HAS_ARGBBLENDROW_SSE2
|
||||||
// Blend 8 pixels at a time
|
// Blend 8 pixels at a time
|
||||||
// Destination aligned to 16 bytes, multiple of 4 pixels
|
// Destination aligned to 16 bytes, multiple of 4 pixels
|
||||||
// TODO(fbarchard): SSSE3 version with pshufb for alpha and maybe pmaddubsw
|
|
||||||
__declspec(naked) __declspec(align(16))
|
__declspec(naked) __declspec(align(16))
|
||||||
void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb, uint8* dst_argb,
|
void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb, uint8* dst_argb,
|
||||||
int width) {
|
int width) {
|
||||||
@ -1988,7 +1987,7 @@ void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb, uint8* dst_argb,
|
|||||||
pshufhw xmm3, xmm3,0F5h // 8 alpha words
|
pshufhw xmm3, xmm3,0F5h // 8 alpha words
|
||||||
pshuflw xmm3, xmm3,0F5h
|
pshuflw xmm3, xmm3,0F5h
|
||||||
pand xmm2, xmm6 // _r_b
|
pand xmm2, xmm6 // _r_b
|
||||||
paddw xmm3, xmm7 // 256-alpha
|
paddw xmm3, xmm7 // 256 - alpha
|
||||||
pmullw xmm2, xmm3 // _r_b * alpha
|
pmullw xmm2, xmm3 // _r_b * alpha
|
||||||
movdqa xmm1, [edx] // _a_g
|
movdqa xmm1, [edx] // _a_g
|
||||||
psrlw xmm1, 8 // _a_g
|
psrlw xmm1, 8 // _a_g
|
||||||
@ -2011,7 +2010,7 @@ void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb, uint8* dst_argb,
|
|||||||
pshufhw xmm3, xmm3,0F5h // 8 alpha words
|
pshufhw xmm3, xmm3,0F5h // 8 alpha words
|
||||||
pshuflw xmm3, xmm3,0F5h
|
pshuflw xmm3, xmm3,0F5h
|
||||||
pand xmm2, xmm6 // _r_b
|
pand xmm2, xmm6 // _r_b
|
||||||
paddw xmm3, xmm7 // 256-alpha
|
paddw xmm3, xmm7 // 256 - alpha
|
||||||
pmullw xmm2, xmm3 // _r_b * alpha
|
pmullw xmm2, xmm3 // _r_b * alpha
|
||||||
movdqa xmm1, [edx + 16] // _a_g
|
movdqa xmm1, [edx + 16] // _a_g
|
||||||
psrlw xmm1, 8 // _a_g
|
psrlw xmm1, 8 // _a_g
|
||||||
@ -2058,7 +2057,7 @@ void ARGBBlendRow1_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
|
|||||||
pshufhw xmm3, xmm3,0F5h // 8 alpha words
|
pshufhw xmm3, xmm3,0F5h // 8 alpha words
|
||||||
pshuflw xmm3, xmm3,0F5h
|
pshuflw xmm3, xmm3,0F5h
|
||||||
pand xmm2, xmm6 // _r_b
|
pand xmm2, xmm6 // _r_b
|
||||||
paddw xmm3, xmm7 // 256-alpha
|
paddw xmm3, xmm7 // 256 - alpha
|
||||||
pmullw xmm2, xmm3 // _r_b * alpha
|
pmullw xmm2, xmm3 // _r_b * alpha
|
||||||
movd xmm1, [edx] // _a_g
|
movd xmm1, [edx] // _a_g
|
||||||
psrlw xmm1, 8 // _a_g
|
psrlw xmm1, 8 // _a_g
|
||||||
@ -2100,9 +2099,115 @@ void ARGBBlendRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
|
|||||||
ARGBBlendRow1_SSE2(src_argb, dst_argb, width);
|
ARGBBlendRow1_SSE2(src_argb, dst_argb, width);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif // HAS_ARGBBLENDROW_SSE2
|
#endif // HAS_ARGBBLENDROW_SSE2
|
||||||
|
|
||||||
|
#ifdef HAS_ARGBBLENDROW_SSSE3
|
||||||
|
// Blend 8 pixels at a time
|
||||||
|
// Shuffle table for pshufb that broadcasts each pixel's alpha byte.
// For every ARGB pixel (4 bytes, alpha at byte offset 3) the alpha byte is
// copied into the low byte of two consecutive 16-bit lanes; the 0x80 entries
// (high bit set) make pshufb write zero into the high byte of each lane.
// Result: 8 words laid out as a0, a0, a1, a1, a2, a2, a3, a3.
|
||||||
|
static const uvec8 kShuffleAlpha = {
|
||||||
|
3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,  // alpha of pixels 0 and 1
|
||||||
|
11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80  // alpha of pixels 2 and 3
|
||||||
|
};
|
||||||
|
|
||||||
|
// Same as SSE2, but replaces
|
||||||
|
// psrlw xmm3, 8 // alpha
|
||||||
|
// pshufhw xmm3, xmm3,0F5h // 8 alpha words
|
||||||
|
// pshuflw xmm3, xmm3,0F5h
|
||||||
|
// with..
|
||||||
|
// pshufb xmm3, kShuffleAlpha // alpha
|
||||||
|
|
||||||
|
// Destination aligned to 16 bytes, multiple of 4 pixels
|
||||||
|
__declspec(naked) __declspec(align(16))
|
||||||
|
void ARGBBlendRow_Aligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
|
||||||
|
int width) {
|
||||||
|
__asm {
|
||||||
|
mov eax, [esp + 4] // src_argb
|
||||||
|
mov edx, [esp + 8] // dst_argb
|
||||||
|
mov ecx, [esp + 12] // width
|
||||||
|
pcmpeqb xmm7, xmm7 // generate constant 1
|
||||||
|
psrlw xmm7, 15
|
||||||
|
pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff
|
||||||
|
psrlw xmm6, 8
|
||||||
|
pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00
|
||||||
|
psllw xmm5, 8
|
||||||
|
pcmpeqb xmm4, xmm4 // generate mask 0xff000000
|
||||||
|
pslld xmm4, 24
|
||||||
|
|
||||||
|
align 16
|
||||||
|
convertloop:
|
||||||
|
movdqu xmm3, [eax]
|
||||||
|
movdqa xmm0, xmm3 // src argb
|
||||||
|
pxor xmm3, xmm4 // ~alpha
|
||||||
|
pshufb xmm3, kShuffleAlpha // alpha
|
||||||
|
movdqa xmm2, [edx] // _r_b
|
||||||
|
pand xmm2, xmm6 // _r_b
|
||||||
|
paddw xmm3, xmm7 // 256 - alpha
|
||||||
|
pmullw xmm2, xmm3 // _r_b * alpha
|
||||||
|
movdqa xmm1, [edx] // _a_g
|
||||||
|
psrlw xmm1, 8 // _a_g
|
||||||
|
por xmm0, xmm4 // set alpha to 255
|
||||||
|
pmullw xmm1, xmm3 // _a_g * alpha
|
||||||
|
movdqu xmm3, [eax + 16]
|
||||||
|
lea eax, [eax + 32]
|
||||||
|
psrlw xmm2, 8 // _r_b convert to 8 bits again
|
||||||
|
paddusb xmm0, xmm2 // + src argb
|
||||||
|
pand xmm1, xmm5 // a_g_ convert to 8 bits again
|
||||||
|
paddusb xmm0, xmm1 // + src argb
|
||||||
|
sub ecx, 4
|
||||||
|
movdqa [edx], xmm0
|
||||||
|
jle done
|
||||||
|
|
||||||
|
movdqa xmm0, xmm3 // src argb
|
||||||
|
pxor xmm3, xmm4 // ~alpha
|
||||||
|
movdqa xmm2, [edx + 16] // _r_b
|
||||||
|
pshufb xmm3, kShuffleAlpha // alpha
|
||||||
|
pand xmm2, xmm6 // _r_b
|
||||||
|
paddw xmm3, xmm7 // 256 - alpha
|
||||||
|
pmullw xmm2, xmm3 // _r_b * alpha
|
||||||
|
movdqa xmm1, [edx + 16] // _a_g
|
||||||
|
psrlw xmm1, 8 // _a_g
|
||||||
|
por xmm0, xmm4 // set alpha to 255
|
||||||
|
pmullw xmm1, xmm3 // _a_g * alpha
|
||||||
|
psrlw xmm2, 8 // _r_b convert to 8 bits again
|
||||||
|
paddusb xmm0, xmm2 // + src argb
|
||||||
|
pand xmm1, xmm5 // a_g_ convert to 8 bits again
|
||||||
|
paddusb xmm0, xmm1 // + src argb
|
||||||
|
sub ecx, 4
|
||||||
|
movdqa [edx + 16], xmm0
|
||||||
|
lea edx, [edx + 32]
|
||||||
|
jg convertloop
|
||||||
|
|
||||||
|
done:
|
||||||
|
ret
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void ARGBBlendRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
|
||||||
|
// Do 1 to 3 pixels to get destination aligned.
|
||||||
|
if ((uintptr_t)(dst_argb) & 15) {
|
||||||
|
int count = width;
|
||||||
|
if (((intptr_t)(dst_argb) & 3) == 0) {
|
||||||
|
count = (-(intptr_t)(dst_argb) >> 2) & 3;
|
||||||
|
}
|
||||||
|
ARGBBlendRow1_SSE2(src_argb, dst_argb, count);
|
||||||
|
src_argb += count * 4;
|
||||||
|
dst_argb += count * 4;
|
||||||
|
width -= count;
|
||||||
|
}
|
||||||
|
// Do multiple of 4 pixels
|
||||||
|
if (width & ~3) {
|
||||||
|
ARGBBlendRow_Aligned_SSSE3(src_argb, dst_argb, width & ~3);
|
||||||
|
}
|
||||||
|
// Do remaining 1 to 3 pixels
|
||||||
|
if (width & 3) {
|
||||||
|
src_argb += (width & ~3) * 4;
|
||||||
|
dst_argb += (width & ~3) * 4;
|
||||||
|
width &= 3;
|
||||||
|
ARGBBlendRow1_SSE2(src_argb, dst_argb, width);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif // HAS_ARGBBLENDROW_SSSE3
|
||||||
|
|
||||||
#endif // _M_IX86
|
#endif // _M_IX86
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user