diff --git a/README.chromium b/README.chromium
index 7b8bdc7a1..ac76a2349 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 249
+Version: 251
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index d69289ad9..d9437d953 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,7 +11,7 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 249
+#define LIBYUV_VERSION 251
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_
diff --git a/source/compare.cc b/source/compare.cc
index c7ec336f6..c82b3918f 100644
--- a/source/compare.cc
+++ b/source/compare.cc
@@ -64,13 +64,13 @@ static const uvec32 kHashMul3 = {
   0x00000001,  // 33 ^ 0
 };
 
-//27: 66 0F 38 40 C6     pmulld      xmm0,xmm6
-//44: 66 0F 38 40 DD     pmulld      xmm3,xmm5
-//59: 66 0F 38 40 E5     pmulld      xmm4,xmm5
-//72: 66 0F 38 40 D5     pmulld      xmm2,xmm5
-//83: 66 0F 38 40 CD     pmulld      xmm1,xmm5
+// 27: 66 0F 38 40 C6     pmulld      xmm0,xmm6
+// 44: 66 0F 38 40 DD     pmulld      xmm3,xmm5
+// 59: 66 0F 38 40 E5     pmulld      xmm4,xmm5
+// 72: 66 0F 38 40 D5     pmulld      xmm2,xmm5
+// 83: 66 0F 38 40 CD     pmulld      xmm1,xmm5
 #define pmulld(reg) _asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 \
-    _asm _emit 0x40 _asm _emit reg
+    _asm _emit 0x40 _asm _emit reg
 
 __declspec(naked) __declspec(align(16))
 static uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
@@ -124,27 +124,34 @@ static uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
 #elif !defined(YUV_DISABLE_ASM) && \
     (defined(__x86_64__) || (defined(__i386__) && !defined(__pic__)))
 
+// GCC 4.2 on OSX has a link error when passing static or const to inline asm.
+// TODO(fbarchard): Use static const when gcc 4.2 support is dropped.
+#ifdef __APPLE__
+#define CONST
+#else
+#define CONST static const
+#endif
 #define HAS_HASHDJB2_SSE41
-static const uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 };  // 33 ^ 16
-static const uvec32 kHashMul0 = {
+CONST uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 };  // 33 ^ 16
+CONST uvec32 kHashMul0 = {
   0x0c3525e1,  // 33 ^ 15
   0xa3476dc1,  // 33 ^ 14
   0x3b4039a1,  // 33 ^ 13
   0x4f5f0981,  // 33 ^ 12
 };
-static const uvec32 kHashMul1 = {
+CONST uvec32 kHashMul1 = {
   0x30f35d61,  // 33 ^ 11
   0x855cb541,  // 33 ^ 10
   0x040a9121,  // 33 ^ 9
   0x747c7101,  // 33 ^ 8
 };
-static const uvec32 kHashMul2 = {
+CONST uvec32 kHashMul2 = {
   0xec41d4e1,  // 33 ^ 7
   0x4cfa3cc1,  // 33 ^ 6
   0x025528a1,  // 33 ^ 5
   0x00121881,  // 33 ^ 4
 };
-static const uvec32 kHashMul3 = {
+CONST uvec32 kHashMul3 = {
   0x00008c61,  // 33 ^ 3
   0x00000441,  // 33 ^ 2
   0x00000021,  // 33 ^ 1
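Note: the kHashMul tables above are just powers of 33 (djb2's multiplier),
which is what lets HashDjb2_SSE41 fold 16 bytes per iteration: the running
hash is scaled by 33^16 (kHash16x33) while each of the 16 input bytes is
scaled by its matching power. A minimal scalar sketch of the hash being
computed, for reference only (not part of the patch):

    #include <stdint.h>

    // Classic djb2 over count bytes, mod 2^32. The SSE4.1 path computes the
    // identical value 16 bytes at a time using the kHashMul constants.
    static uint32_t HashDjb2_C(const uint8_t* src, int count, uint32_t seed) {
      uint32_t hash = seed;
      for (int i = 0; i < count; ++i) {
        hash = hash * 33 + src[i];
      }
      return hash;
    }
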
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index 406c97aa6..0fae87118 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -143,6 +143,21 @@ int ARGBCopy(const uint8* src_argb, int src_stride_argb,
 ARGBBlendRow GetARGBBlend(uint8* dst_argb, int dst_stride_argb, int width) {
   void (*ARGBBlendRow)(const uint8* src_argb, const uint8* src_argb1,
                        uint8* dst_argb, int width) = ARGBBlendRow_C;
+#if defined(HAS_ARGBBLENDROW1_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBBlendRow = ARGBBlendRow1_SSSE3;
+#if defined(HAS_ARGBBLENDROW_SSSE3)
+    if (width >= 4) {
+      ARGBBlendRow = ARGBBlendRow_Any_SSSE3;
+      if (IS_ALIGNED(width, 4) &&
+          IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+        ARGBBlendRow = ARGBBlendRow_Aligned_SSSE3;
+      }
+    }
+#endif
+    return ARGBBlendRow;
+  }
+#endif
 #if defined(HAS_ARGBBLENDROW1_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
     ARGBBlendRow = ARGBBlendRow1_SSE2;
@@ -156,15 +171,6 @@ ARGBBlendRow GetARGBBlend(uint8* dst_argb, int dst_stride_argb, int width) {
     }
 #endif
   }
-#endif
-#if defined(HAS_ARGBBLENDROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 4) {
-    ARGBBlendRow = ARGBBlendRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 4) &&
-        IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-      ARGBBlendRow = ARGBBlendRow_Aligned_SSSE3;
-    }
-  }
 #endif
   return ARGBBlendRow;
 }
diff --git a/source/row.h b/source/row.h
index cb22aa554..98f9f32e4 100644
--- a/source/row.h
+++ b/source/row.h
@@ -40,7 +40,7 @@ extern "C" {
 #define HAS_ARGB4444TOARGBROW_SSE2
 #define HAS_ARGBATTENUATE_SSSE3
 #define HAS_ARGBBLENDROW_SSSE3
-#define HAS_ARGBBLENDROW1_SSE2
+#define HAS_ARGBBLENDROW1_SSSE3
 #define HAS_ARGBTOARGB1555ROW_SSE2
 #define HAS_ARGBTOARGB4444ROW_SSE2
 #define HAS_ARGBTORAWROW_SSSE3
@@ -79,6 +79,7 @@ extern "C" {
 #define HAS_MIRRORROW_SSE2
 #define HAS_ARGBATTENUATE_SSE2
 #define HAS_ARGBBLENDROW_SSE2
+#define HAS_ARGBBLENDROW1_SSE2
 #endif
 
 // The following are available on Neon platforms
@@ -281,6 +282,8 @@ void ARGBBlendRow_Aligned_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
                                 uint8* dst_argb, int width);
 void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                                uint8* dst_argb, int width);
+void ARGBBlendRow1_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
+                         uint8* dst_argb, int width);
 void ARGBBlendRow1_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                         uint8* dst_argb, int width);
 void ARGBBlendRow_Any_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
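For callers nothing changes: GetARGBBlend still returns one row function,
chosen once per plane. A hedged usage sketch (the loop variables and stride
names here are assumptions, not from this patch):

    // Pick the best blender for this destination, then run it row by row.
    void (*blend_row)(const uint8* src_argb0, const uint8* src_argb1,
                      uint8* dst_argb, int width) =
        GetARGBBlend(dst_argb, dst_stride_argb, width);
    for (int y = 0; y < height; ++y) {
      blend_row(src_argb0, src_argb1, dst_argb, width);
      src_argb0 += src_stride_argb0;
      src_argb1 += src_stride_argb1;
      dst_argb += dst_stride_argb;
    }

With the reordered dispatch in planar_functions.cc, an SSSE3 machine now takes
the SSSE3 path for every width (falling back to the new 1-pixel SSSE3 kernel
for width < 4) and returns early, instead of first being assigned the SSE2
kernels and only partially overridden when width >= 4.
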
diff --git a/source/row_common.cc b/source/row_common.cc
index 6ebb48e52..769fe793c 100644
--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -565,7 +565,7 @@ void ARGBBlendRow_Any_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
     if (count > 4 && ((intptr_t)(dst_argb) & 3) == 0) {
       count = (-(intptr_t)(dst_argb) >> 2) & 3;
     }
-    ARGBBlendRow1_SSE2(src_argb0, src_argb1, dst_argb, count);
+    ARGBBlendRow1_SSSE3(src_argb0, src_argb1, dst_argb, count);
     src_argb0 += count * 4;
     src_argb1 += count * 4;
     dst_argb += count * 4;
@@ -581,7 +581,7 @@ void ARGBBlendRow_Any_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
     src_argb1 += (width & ~3) * 4;
     dst_argb += (width & ~3) * 4;
     width &= 3;
-    ARGBBlendRow1_SSE2(src_argb0, src_argb1, dst_argb, width);
+    ARGBBlendRow1_SSSE3(src_argb0, src_argb1, dst_argb, width);
   }
 }
 #endif  // HAS_ARGBBLENDROW_SSSE3
diff --git a/source/row_posix.cc b/source/row_posix.cc
index 2838585ae..fe21b397f 100644
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -2235,6 +2235,58 @@ void ARGBBlendRow_Aligned_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
 }
 #endif  // HAS_ARGBBLENDROW_SSSE3
 
+
+#ifdef HAS_ARGBBLENDROW1_SSSE3
+// Blend 1 pixel at a time, unaligned.
+void ARGBBlendRow1_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
+                         uint8* dst_argb, int width) {
+  asm volatile (
+    "pcmpeqb   %%xmm7,%%xmm7                   \n"
+    "psrlw     $0xf,%%xmm7                     \n"
+    "pcmpeqb   %%xmm6,%%xmm6                   \n"
+    "psrlw     $0x8,%%xmm6                     \n"
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "psllw     $0x8,%%xmm5                     \n"
+    "pcmpeqb   %%xmm4,%%xmm4                   \n"
+    "pslld     $0x18,%%xmm4                    \n"
+
+    // 1 pixel loop
+  "1:                                          \n"
+    "movd      (%0),%%xmm3                     \n"
+    "lea       0x4(%0),%0                      \n"
+    "movdqa    %%xmm3,%%xmm0                   \n"
+    "pxor      %%xmm4,%%xmm3                   \n"
+    "movd      (%1),%%xmm2                     \n"
+    "pshufb    %4,%%xmm3                       \n"
+    "pand      %%xmm6,%%xmm2                   \n"
+    "paddw     %%xmm7,%%xmm3                   \n"
+    "pmullw    %%xmm3,%%xmm2                   \n"
+    "movd      (%1),%%xmm1                     \n"
+    "lea       0x4(%1),%1                      \n"
+    "psrlw     $0x8,%%xmm1                     \n"
+    "por       %%xmm4,%%xmm0                   \n"
+    "pmullw    %%xmm3,%%xmm1                   \n"
+    "psrlw     $0x8,%%xmm2                     \n"
+    "paddusb   %%xmm2,%%xmm0                   \n"
+    "pand      %%xmm5,%%xmm1                   \n"
+    "paddusb   %%xmm1,%%xmm0                   \n"
+    "sub       $0x1,%3                         \n"
+    "movd      %%xmm0,(%2)                     \n"
+    "lea       0x4(%2),%2                      \n"
+    "jg        1b                              \n"
+  : "+r"(src_argb0),    // %0
+    "+r"(src_argb1),    // %1
+    "+r"(dst_argb),     // %2
+    "+r"(width)         // %3
+  : "m"(kShuffleAlpha)  // %4
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+  );
+}
+#endif  // HAS_ARGBBLENDROW1_SSSE3
+
 #ifdef HAS_ARGBATTENUATE_SSE2
 // Attenuate 4 pixels at a time.
 // aligned to 16 bytes
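For reference, a scalar sketch of the math the new ARGBBlendRow1_SSSE3 kernel
above (and its Visual C twin below) implements: an "over" blend that treats
src_argb0 as the attenuated foreground and forces destination alpha to 255.
The helper and stdint types here are illustrative, not from the patch:

    #include <stdint.h>

    static inline uint8_t Clamp255(uint32_t v) {  // paddusb saturates likewise
      return v > 255 ? 255 : (uint8_t)v;
    }

    static void ARGBBlendRow1_C(const uint8_t* src0, const uint8_t* src1,
                                uint8_t* dst, int width) {
      for (int i = 0; i < width; ++i) {
        uint32_t ia = 256 - src0[3];  // pxor 0xff000000, then paddw 1
        dst[0] = Clamp255(src0[0] + ((src1[0] * ia) >> 8));  // B
        dst[1] = Clamp255(src0[1] + ((src1[1] * ia) >> 8));  // G
        dst[2] = Clamp255(src0[2] + ((src1[2] * ia) >> 8));  // R
        dst[3] = 255;  // por 0xff000000: alpha forced opaque
        src0 += 4;
        src1 += 4;
        dst += 4;
      }
    }
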
diff --git a/source/row_win.cc b/source/row_win.cc
index c9f46d4e1..2e538e321 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -2079,6 +2079,7 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
 // src_argb0 unaligned.
 // src_argb1 and dst_argb aligned to 16 bytes.
 // width must be multiple of 4 pixels.
+// TODO(fbarchard): Handle fewer than 4 pixels and unaligned pointers.
 __declspec(naked) __declspec(align(16))
 void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                                uint8* dst_argb, int width) {
@@ -2157,7 +2158,7 @@ void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1,
 // Blend 1 pixel at a time, unaligned.
 __declspec(naked) __declspec(align(16))
 void ARGBBlendRow1_SSE2(const uint8* src_argb0, const uint8* src_argb1,
-                          uint8* dst_argb, int width) {
+                        uint8* dst_argb, int width) {
   __asm {
     push       esi
     mov        eax, [esp + 4 + 4]   // src_argb0
@@ -2226,7 +2227,7 @@ static const uvec8 kShuffleAlpha = {
 // Destination aligned to 16 bytes, multiple of 4 pixels.
 __declspec(naked) __declspec(align(16))
 void ARGBBlendRow_Aligned_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
-                                  uint8* dst_argb, int width) {
+                                uint8* dst_argb, int width) {
   __asm {
     push       esi
     mov        eax, [esp + 4 + 4]   // src_argb0
@@ -2294,6 +2295,57 @@ void ARGBBlendRow_Aligned_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
   }
 }
 #endif  // HAS_ARGBBLENDROW_SSSE3
 
+#ifdef HAS_ARGBBLENDROW1_SSSE3
+// Blend 1 pixel at a time, unaligned.
+__declspec(naked) __declspec(align(16))
+void ARGBBlendRow1_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
+                         uint8* dst_argb, int width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]   // src_argb0
+    mov        esi, [esp + 4 + 8]   // src_argb1
+    mov        edx, [esp + 4 + 12]  // dst_argb
+    mov        ecx, [esp + 4 + 16]  // width
+    pcmpeqb    xmm7, xmm7           // generate constant 1
+    psrlw      xmm7, 15
+    pcmpeqb    xmm6, xmm6           // generate mask 0x00ff00ff
+    psrlw      xmm6, 8
+    pcmpeqb    xmm5, xmm5           // generate mask 0xff00ff00
+    psllw      xmm5, 8
+    pcmpeqb    xmm4, xmm4           // generate mask 0xff000000
+    pslld      xmm4, 24
+
+    align      16
+ convertloop:
+    movd       xmm3, [eax]
+    lea        eax, [eax + 4]
+    movdqa     xmm0, xmm3           // src argb
+    pxor       xmm3, xmm4           // ~alpha
+    movd       xmm2, [esi]          // _r_b
+    pshufb     xmm3, kShuffleAlpha  // alpha
+    pand       xmm2, xmm6           // _r_b
+    paddw      xmm3, xmm7           // 256 - alpha
+    pmullw     xmm2, xmm3           // _r_b * alpha
+    movd       xmm1, [esi]          // _a_g
+    lea        esi, [esi + 4]
+    psrlw      xmm1, 8              // _a_g
+    por        xmm0, xmm4           // set alpha to 255
+    pmullw     xmm1, xmm3           // _a_g * alpha
+    psrlw      xmm2, 8              // _r_b convert to 8 bits again
+    paddusb    xmm0, xmm2           // + src argb
+    pand       xmm1, xmm5           // a_g_ convert to 8 bits again
+    paddusb    xmm0, xmm1           // + src argb
+    sub        ecx, 1
+    movd       [edx], xmm0
+    lea        edx, [edx + 4]
+    jg         convertloop
+
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_ARGBBLENDROW1_SSSE3
+
 #ifdef HAS_ARGBATTENUATE_SSE2
 // Attenuate 4 pixels at a time.
 // aligned to 16 bytes
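One note on the pshufb step shared by the SSSE3 kernels: it broadcasts each
pixel's (complemented) alpha byte into the low byte of every 16-bit lane so a
single pmullw can scale all channels at once. kShuffleAlpha itself is defined
elsewhere in these files; a plausible reconstruction from how it is used here
(an assumption, not quoted from the source):

    // pshufb control: index 3 selects the alpha byte of pixel 0 (7, 11, 15
    // for pixels 1-3); 0x80 writes a zero byte, so each 16-bit lane becomes
    // the alpha value zero-extended to 16 bits.
    static const uvec8 kShuffleAlpha = {
      3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
      11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
    };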