diff --git a/README.chromium b/README.chromium index 401ee75d9..dbbf4cc76 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 242 +Version: 243 License: BSD License File: LICENSE diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 70d29b9b4..90b99ec4c 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,7 +11,7 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 242 +#define LIBYUV_VERSION 243 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/planar_functions.cc b/source/planar_functions.cc index eb946745a..2f005736f 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -893,6 +893,13 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb, ARGBAttenuateRow = ARGBAttenuateRow_SSE2; } #endif +#if defined(HAS_ARGBATTENUATE_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4) && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; + } +#endif for (int y = 0; y < height; ++y) { ARGBAttenuateRow(src_argb, dst_argb, width); diff --git a/source/row.h b/source/row.h index 895517a6e..fb9040ec2 100644 --- a/source/row.h +++ b/source/row.h @@ -69,6 +69,11 @@ extern "C" { #define HAS_ARGBATTENUATE_SSE2 #endif +// The following are available on Windows 32 bit +#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86) +#define HAS_ARGBATTENUATE_SSSE3 +#endif + // The following are available on Neon platforms #if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__) #define HAS_MIRRORROW_NEON @@ -363,6 +368,7 @@ void UYVYToYRow_Any_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix); void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width); void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width); +void
ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width); #ifdef __cplusplus } // extern "C" diff --git a/source/row_win.cc b/source/row_win.cc index e18141e3a..c0519f295 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -2334,8 +2334,58 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { ret } } - #endif // HAS_ARGBATTENUATE_SSE2 + +#ifdef HAS_ARGBATTENUATE_SSSE3 +// Shuffle tables for pshufb: copy a pixel's alpha byte into both bytes of its B, G and R words; index 128 zeroes the alpha word. +static const uvec8 kShuffleAlpha0 = { + 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u, +}; +static const uvec8 kShuffleAlpha1 = { + 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, + 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u, +}; +__declspec(naked) __declspec(align(16)) +void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { // Scales B, G, R of each pixel by its alpha; alpha is copied unchanged. + __asm { + mov eax, [esp + 4] // src_argb (naked function: arguments are read straight off the stack) + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + sub edx, eax // edx = dst_argb - src_argb, so [eax + edx] addresses the destination + pcmpeqb xmm3, xmm3 // generate mask 0xff000000 + pslld xmm3, 24 + movdqa xmm4, kShuffleAlpha0 // alpha shuffle for pixels 0 and 1 + movdqa xmm5, kShuffleAlpha1 // alpha shuffle for pixels 2 and 3 + + align 16 + convertloop: + movdqa xmm0, [eax] // read 4 pixels (aligned load) + pshufb xmm0, xmm4 // first 2 alphas as words a * 257, alpha word = 0 + movdqa xmm1, [eax] // read 4 pixels + punpcklbw xmm1, xmm1 // first 2 pixels as words c * 257 + pmulhuw xmm0, xmm1 // high 16 bits of (a * 257) * (c * 257) + movdqa xmm1, [eax] // read 4 pixels + pshufb xmm1, xmm5 // next 2 alphas as words a * 257, alpha word = 0 + movdqa xmm2, [eax] // read 4 pixels + punpckhbw xmm2, xmm2 // next 2 pixels as words c * 257 + pmulhuw xmm1, xmm2 // high 16 bits of (a * 257) * (c * 257) + movdqa xmm2, [eax] // read 4 pixels again to recover the original alpha + pand xmm2, xmm3 // keep only the alpha bytes + psrlw xmm0, 8 // reduce products to 8 bits + psrlw xmm1, 8 // reduce products to 8 bits + packuswb xmm0, xmm1 // 4 attenuated pixels as bytes; alpha bytes are 0 here + por xmm0, xmm2 // copy original alpha + sub ecx, 4 // 4 pixels done; sets the flags tested by jg below + movdqa [eax + edx], xmm0 // store 4 pixels to dst (movdqa leaves flags intact) + lea eax, [eax + 16] // advance src by 16 bytes (lea leaves flags intact) + jg convertloop // loop while pixels remain + + ret + } +} + +#endif // HAS_ARGBATTENUATE_SSSE3 + #endif // _M_IX86 #ifdef __cplusplus