From 8670b1ae043acf3b12e928b0a6453e23fadce16c Mon Sep 17 00:00:00 2001 From: "fbarchard@google.com" Date: Mon, 19 Mar 2012 16:50:03 +0000 Subject: [PATCH] SSSE3 version of alpha blender does pshufb instead of shift and 2 pshufw. BUG=none TEST=none Review URL: https://webrtc-codereview.appspot.com/446008 git-svn-id: http://libyuv.googlecode.com/svn/trunk@219 16f28f9a-4ce2-e073-06de-1de4eb20be90 --- README.chromium | 2 +- include/libyuv/version.h | 2 +- source/planar_functions.cc | 9 +++ source/row.h | 7 +++ source/row_win.cc | 117 +++++++++++++++++++++++++++++++++++-- 5 files changed, 129 insertions(+), 8 deletions(-) diff --git a/README.chromium b/README.chromium index c74afb92d..fafe8d8f6 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 218 +Version: 219 License: BSD License File: LICENSE diff --git a/include/libyuv/version.h b/include/libyuv/version.h index f5cb202de..bdbe8e75a 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,7 +11,7 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 218 +#define LIBYUV_VERSION 219 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/planar_functions.cc b/source/planar_functions.cc index 367309c73..07e6173c7 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -176,6 +176,15 @@ int ARGBBlend(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBBLENDROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBBlendRow = ARGBBlendRow_SSSE3; + if (IS_ALIGNED(width, 4) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + ARGBBlendRow = ARGBBlendRow_Aligned_SSSE3; + } + } +#endif for (int y = 0; y < height; ++y) { ARGBBlendRow(src_argb, dst_argb, width); diff --git a/source/row.h b/source/row.h index c52dec9eb..db60c92ee 100644 --- a/source/row.h +++ b/source/row.h @@ -76,6 +76,10 @@ extern "C" { #define HAS_I420TOABGRROW_NEON 
#endif +#if defined(_MSC_VER) && !defined(YUV_DISABLE_ASM) +#define HAS_ARGBBLENDROW_SSSE3 +#endif + #if defined(_MSC_VER) #define SIMD_ALIGNED(var) __declspec(align(16)) var typedef __declspec(align(16)) signed char vec8[16]; @@ -241,8 +245,11 @@ void YToARGBRow_SSE2(const uint8* y_buf, int width); // ARGB preattenuated alpha blend. +void ARGBBlendRow_Aligned_SSSE3(const uint8* src_argb, uint8* dst_argb, + int width); void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb, uint8* dst_argb, int width); +void ARGBBlendRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width); void ARGBBlendRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width); void ARGBBlendRow_C(const uint8* src_argb, uint8* dst_argb, int width); diff --git a/source/row_win.cc b/source/row_win.cc index 1881e89a6..7007876a0 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -1961,7 +1961,6 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy, #ifdef HAS_ARGBBLENDROW_SSE2 // Blend 8 pixels at a time // Destination aligned to 16 bytes, multiple of 4 pixels -// TODO(fbarchard): SSSE3 version with pshufb for alpha and maybe pmaddubsw __declspec(naked) __declspec(align(16)) void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { @@ -1988,7 +1987,7 @@ void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb, uint8* dst_argb, pshufhw xmm3, xmm3,0F5h // 8 alpha words pshuflw xmm3, xmm3,0F5h pand xmm2, xmm6 // _r_b - paddw xmm3, xmm7 // 256-alpha + paddw xmm3, xmm7 // 256 - alpha pmullw xmm2, xmm3 // _r_b * alpha movdqa xmm1, [edx] // _a_g psrlw xmm1, 8 // _a_g @@ -2006,12 +2005,12 @@ void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb, uint8* dst_argb, movdqa xmm0, xmm3 // src argb pxor xmm3, xmm4 // ~alpha - movdqa xmm2, [edx + 16] // _r_b + movdqa xmm2, [edx + 16] // _r_b psrlw xmm3, 8 // alpha pshufhw xmm3, xmm3,0F5h // 8 alpha words pshuflw xmm3, xmm3,0F5h pand xmm2, xmm6 // _r_b - paddw xmm3, xmm7 // 256-alpha + paddw xmm3, xmm7 // 256 - alpha 
pmullw xmm2, xmm3 // _r_b * alpha movdqa xmm1, [edx + 16] // _a_g psrlw xmm1, 8 // _a_g @@ -2058,7 +2057,7 @@ void ARGBBlendRow1_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { pshufhw xmm3, xmm3,0F5h // 8 alpha words pshuflw xmm3, xmm3,0F5h pand xmm2, xmm6 // _r_b - paddw xmm3, xmm7 // 256-alpha + paddw xmm3, xmm7 // 256 - alpha pmullw xmm2, xmm3 // _r_b * alpha movd xmm1, [edx] // _a_g psrlw xmm1, 8 // _a_g @@ -2100,9 +2099,115 @@ void ARGBBlendRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { ARGBBlendRow1_SSE2(src_argb, dst_argb, width); } } - #endif // HAS_ARGBBLENDROW_SSE2 +#ifdef HAS_ARGBBLENDROW_SSSE3 +// Blend 8 pixels at a time +// Shuffle table to copy each pixel's alpha into both its words; 0x80 writes 0. +static const uvec8 kShuffleAlpha = { + 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, + 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80 +}; + +// Same as SSE2, but replaces +// psrlw xmm3, 8 // alpha +// pshufhw xmm3, xmm3,0F5h // 8 alpha words +// pshuflw xmm3, xmm3,0F5h +// with.. +// pshufb xmm3, kShuffleAlpha // alpha + +// Destination aligned to 16 bytes, multiple of 4 pixels +__declspec(naked) __declspec(align(16)) +void ARGBBlendRow_Aligned_SSSE3(const uint8* src_argb, uint8* dst_argb, + int width) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + pcmpeqb xmm7, xmm7 // generate constant 1 + psrlw xmm7, 15 + pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff + psrlw xmm6, 8 + pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 + psllw xmm5, 8 + pcmpeqb xmm4, xmm4 // generate mask 0xff000000 + pslld xmm4, 24 + + align 16 + convertloop: + movdqu xmm3, [eax] + movdqa xmm0, xmm3 // src argb + pxor xmm3, xmm4 // ~alpha + pshufb xmm3, kShuffleAlpha // alpha + movdqa xmm2, [edx] // _r_b + pand xmm2, xmm6 // _r_b + paddw xmm3, xmm7 // 256 - alpha + pmullw xmm2, xmm3 // _r_b * alpha + movdqa xmm1, [edx] // _a_g + psrlw xmm1, 8 // _a_g + por xmm0, xmm4 // set alpha to 255 + pmullw xmm1, xmm3 // _a_g * alpha + movdqu xmm3, [eax + 
16] + lea eax, [eax + 32] + psrlw xmm2, 8 // _r_b convert to 8 bits again + paddusb xmm0, xmm2 // + src argb + pand xmm1, xmm5 // a_g_ convert to 8 bits again + paddusb xmm0, xmm1 // + src argb + sub ecx, 4 + movdqa [edx], xmm0 + jle done // NOTE(review): [eax + 16] was already read above, past width here + + movdqa xmm0, xmm3 // src argb + pxor xmm3, xmm4 // ~alpha + movdqa xmm2, [edx + 16] // _r_b + pshufb xmm3, kShuffleAlpha // alpha + pand xmm2, xmm6 // _r_b + paddw xmm3, xmm7 // 256 - alpha + pmullw xmm2, xmm3 // _r_b * alpha + movdqa xmm1, [edx + 16] // _a_g + psrlw xmm1, 8 // _a_g + por xmm0, xmm4 // set alpha to 255 + pmullw xmm1, xmm3 // _a_g * alpha + psrlw xmm2, 8 // _r_b convert to 8 bits again + paddusb xmm0, xmm2 // + src argb + pand xmm1, xmm5 // a_g_ convert to 8 bits again + paddusb xmm0, xmm1 // + src argb + sub ecx, 4 + movdqa [edx + 16], xmm0 + lea edx, [edx + 32] + jg convertloop + + done: + ret + } +} + +void ARGBBlendRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { + // Do 1 to 3 pixels to get destination aligned (all of them if width <= 4). + if ((uintptr_t)(dst_argb) & 15) { + int count = width; + if (count > 4 && ((intptr_t)(dst_argb) & 3) == 0) { + count = (-(intptr_t)(dst_argb) >> 2) & 3; + } + ARGBBlendRow1_SSE2(src_argb, dst_argb, count); + src_argb += count * 4; + dst_argb += count * 4; + width -= count; + } + // Do multiple of 4 pixels + if (width & ~3) { + ARGBBlendRow_Aligned_SSSE3(src_argb, dst_argb, width & ~3); + } + // Do remaining 1 to 3 pixels + if (width & 3) { + src_argb += (width & ~3) * 4; + dst_argb += (width & ~3) * 4; + width &= 3; + ARGBBlendRow1_SSE2(src_argb, dst_argb, width); + } +} +#endif // HAS_ARGBBLENDROW_SSSE3 + #endif // _M_IX86 #ifdef __cplusplus