diff --git a/README.chromium b/README.chromium index 8daeba949..1da0a1fc6 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 238 +Version: 239 License: BSD License File: LICENSE diff --git a/include/libyuv/version.h b/include/libyuv/version.h index fb97ce6da..fd72a817c 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,7 +11,7 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 238 +#define LIBYUV_VERSION 239 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/row.h b/source/row.h index 2b68ae016..a541c7621 100644 --- a/source/row.h +++ b/source/row.h @@ -65,6 +65,7 @@ extern "C" { #define HAS_YUY2TOUVROW_SSE2 #define HAS_UYVYTOUVROW_SSE2 #define HAS_ARGBBLENDROW_SSE2 +#define HAS_ARGBBLENDROW_SSSE3 #endif // The following are available on Neon platforms @@ -78,12 +79,6 @@ extern "C" { #define HAS_I420TOABGRROW_NEON #endif -// The following are only available on Win32 -// TODO(fbarchard): Port to GCC -#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86) -#define HAS_ARGBBLENDROW_SSSE3 -#endif - #if defined(_MSC_VER) #define SIMD_ALIGNED(var) __declspec(align(16)) var typedef __declspec(align(16)) int8 vec8[16]; diff --git a/source/row_common.cc b/source/row_common.cc index 32e2db95a..e61fb0dc8 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -458,7 +458,7 @@ void UYVYToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width) { // Blend src_argb0 over src_argb1 and store to dst_argb. // dst_argb may be src_argb0 or src_argb1. void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { + uint8* dst_argb, int width) { for (int x = 0; x < width - 1; x += 2) { uint32 a = src_argb0[3]; if (a == 0) { @@ -526,6 +526,66 @@ void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1, } } +#ifdef HAS_ARGBBLENDROW_SSE2 +void ARGBBlendRow_Any_SSE2(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + // Do 1 to 3 pixels to get destination aligned. + if ((uintptr_t)(dst_argb) & 15) { + int count = width; + if (count > 4 && ((intptr_t)(dst_argb) & 3) == 0) { + count = (-(intptr_t)(dst_argb) >> 2) & 3; + } + ARGBBlendRow1_SSE2(src_argb0, src_argb1, dst_argb, count); + src_argb0 += count * 4; + src_argb1 += count * 4; + dst_argb += count * 4; + width -= count; + } + // Do multiple of 4 pixels + if (width & ~3) { + ARGBBlendRow_Aligned_SSE2(src_argb0, src_argb1, dst_argb, width & ~3); + } + // Do remaining 1 to 3 pixels + if (width & 3) { + src_argb0 += (width & ~3) * 4; + src_argb1 += (width & ~3) * 4; + dst_argb += (width & ~3) * 4; + width &= 3; + ARGBBlendRow1_SSE2(src_argb0, src_argb1, dst_argb, width); + } +} +#endif // HAS_ARGBBLENDROW_SSE2 + +#ifdef HAS_ARGBBLENDROW_SSSE3 +void ARGBBlendRow_Any_SSSE3(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + // Do 1 to 3 pixels to get destination aligned. + if ((uintptr_t)(dst_argb) & 15) { + int count = width; + if (count > 4 && ((intptr_t)(dst_argb) & 3) == 0) { + count = (-(intptr_t)(dst_argb) >> 2) & 3; + } + ARGBBlendRow1_SSE2(src_argb0, src_argb1, dst_argb, count); + src_argb0 += count * 4; + src_argb1 += count * 4; + dst_argb += count * 4; + width -= count; + } + // Do multiple of 4 pixels. + if (width & ~3) { + ARGBBlendRow_Aligned_SSSE3(src_argb0, src_argb1, dst_argb, width & ~3); + } + // Do remaining 1 to 3 pixels + if (width & 3) { + src_argb0 += (width & ~3) * 4; + src_argb1 += (width & ~3) * 4; + dst_argb += (width & ~3) * 4; + width &= 3; + ARGBBlendRow1_SSE2(src_argb0, src_argb1, dst_argb, width); + } +} +#endif // HAS_ARGBBLENDROW_SSSE3 + // Wrappers to handle odd sizes/alignments #define YUVANY(NAMEANY, I420TORGB_SSE, I420TORGB_C) \ void NAMEANY(const uint8* y_buf, \ diff --git a/source/row_posix.cc b/source/row_posix.cc index 14a3a0bf9..d7d174bc2 100644 --- a/source/row_posix.cc +++ b/source/row_posix.cc @@ -2029,8 +2029,10 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy, #endif // HAS_YUY2TOYROW_SSE2 #ifdef HAS_ARGBBLENDROW_SSE2 -// Blend 8 pixels at a time -// Destination aligned to 16 bytes, multiple of 4 pixels +// Blend 8 pixels at a time. +// src_argb0 unaligned. +// src_argb1 and dst_argb aligned to 16 bytes. +// width must be multiple of 4 pixels. void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { asm volatile ( @@ -2045,7 +2047,7 @@ void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1, // 8 pixel loop "1: \n" - "movdqu (%0),%%xmm3 \n" // first 4 pixels + "movdqu (%0),%%xmm3 \n" "movdqa %%xmm3,%%xmm0 \n" "pxor %%xmm4,%%xmm3 \n" "movdqu (%1),%%xmm2 \n" @@ -2068,7 +2070,7 @@ void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1, "sub $0x4,%3 \n" "movdqa %%xmm0,(%2) \n" "jle 9f \n" - "movdqa %%xmm3,%%xmm0 \n" // next 4 pixels + "movdqa %%xmm3,%%xmm0 \n" "pxor %%xmm4,%%xmm3 \n" "movdqu 0x10(%1),%%xmm2 \n" "psrlw $0x8,%%xmm3 \n" @@ -2105,7 +2107,7 @@ void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1, // Blend 1 pixel at a time, unaligned void ARGBBlendRow1_SSE2(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { + uint8* dst_argb, int width) { asm volatile ( "pcmpeqb %%xmm7,%%xmm7 \n" "psrlw $0xf,%%xmm7 \n" @@ -2130,7 +2132,7 @@ void ARGBBlendRow1_SSE2(const uint8* src_argb0, const uint8* src_argb1, "paddw %%xmm7,%%xmm3 \n" "pmullw %%xmm3,%%xmm2 \n" "movd (%1),%%xmm1 \n" - "lea 0x4(%1),%1 \n" + "lea 0x4(%1),%1 \n" "psrlw $0x8,%%xmm1 \n" "por %%xmm4,%%xmm0 \n" "pmullw %%xmm3,%%xmm1 \n" @@ -2153,36 +2155,83 @@ void ARGBBlendRow1_SSE2(const uint8* src_argb0, const uint8* src_argb1, #endif ); } - -void ARGBBlendRow_Any_SSE2(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { - // Do 1 to 3 pixels to get destination aligned. - if ((uintptr_t)(dst_argb) & 15) { - int count = width; - if (count > 4 && ((intptr_t)(dst_argb) & 3) == 0) { - count = (-(intptr_t)(dst_argb) >> 2) & 3; - } - ARGBBlendRow1_SSE2(src_argb0, src_argb1, dst_argb, count); - src_argb0 += count * 4; - src_argb1 += count * 4; - dst_argb += count * 4; - width -= count; - } - // Do multiple of 4 pixels - if (width & ~3) { - ARGBBlendRow_Aligned_SSE2(src_argb0, src_argb1, dst_argb, width & ~3); - } - // Do remaining 1 to 3 pixels - if (width & 3) { - src_argb0 += (width & ~3) * 4; - src_argb1 += (width & ~3) * 4; - dst_argb += (width & ~3) * 4; - width &= 3; - ARGBBlendRow1_SSE2(src_argb0, src_argb1, dst_argb, width); - } -} #endif // HAS_ARGBBLENDROW_SSE2 +#ifdef HAS_ARGBBLENDROW_SSSE3 +// Shuffle table for reversing the bytes. +CONST uvec8 kShuffleAlpha = { + 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, + 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80 +}; +void ARGBBlendRow_Aligned_SSSE3(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + asm volatile ( + "pcmpeqb %%xmm7,%%xmm7 \n" + "psrlw $0xf,%%xmm7 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrlw $0x8,%%xmm6 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "psllw $0x8,%%xmm5 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "pslld $0x18,%%xmm4 \n" + + // 8 pixel loop + "1: \n" + "movdqu (%0),%%xmm3 \n" + "movdqa %%xmm3,%%xmm0 \n" + "pxor %%xmm4,%%xmm3 \n" + "pshufb %4,%%xmm3 \n" + "movdqu (%1),%%xmm2 \n" + "pand %%xmm6,%%xmm2 \n" + "paddw %%xmm7,%%xmm3 \n" + "pmullw %%xmm3,%%xmm2 \n" + "movdqu (%1),%%xmm1 \n" + "psrlw $0x8,%%xmm1 \n" + "por %%xmm4,%%xmm0 \n" + "pmullw %%xmm3,%%xmm1 \n" + "movdqu 0x10(%0),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "psrlw $0x8,%%xmm2 \n" + "paddusb %%xmm2,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "sub $0x4,%3 \n" + "movdqa %%xmm0,(%2) \n" + "jle 9f \n" + "movdqa %%xmm3,%%xmm0 \n" + "pxor %%xmm4,%%xmm3 \n" + "movdqu 0x10(%1),%%xmm2 \n" + "pshufb %4,%%xmm3 \n" + "pand %%xmm6,%%xmm2 \n" + "paddw %%xmm7,%%xmm3 \n" + "pmullw %%xmm3,%%xmm2 \n" + "movdqu 0x10(%1),%%xmm1 \n" + "lea 0x20(%1),%1 \n" + "psrlw $0x8,%%xmm1 \n" + "por %%xmm4,%%xmm0 \n" + "pmullw %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm2 \n" + "paddusb %%xmm2,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "sub $0x4,%3 \n" + "movdqa %%xmm0,0x10(%2) \n" + "lea 0x20(%2),%2 \n" + "jg 1b \n" + "9: \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : "m"(kShuffleAlpha) // %4 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" +#endif + ); +} +#endif // HAS_ARGBBLENDROW_SSSE3 + #endif // defined(__x86_64__) || defined(__i386__) #ifdef __cplusplus diff --git a/source/row_win.cc b/source/row_win.cc index 472780565..09c2eb162 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -2203,35 +2203,7 @@ void ARGBBlendRow1_SSE2(const uint8* src_argb0, const uint8* src_argb1, } } -void ARGBBlendRow_Any_SSE2(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { - // Do 1 to 3 pixels to get destination aligned. - if ((uintptr_t)(dst_argb) & 15) { - int count = width; - if (count > 4 && ((intptr_t)(dst_argb) & 3) == 0) { - count = (-(intptr_t)(dst_argb) >> 2) & 3; - } - ARGBBlendRow1_SSE2(src_argb0, src_argb1, dst_argb, count); - src_argb0 += count * 4; - src_argb1 += count * 4; - dst_argb += count * 4; - width -= count; - } - // Do multiple of 4 pixels - if (width & ~3) { - ARGBBlendRow_Aligned_SSE2(src_argb0, src_argb1, dst_argb, width & ~3); - } - // Do remaining 1 to 3 pixels - if (width & 3) { - src_argb0 += (width & ~3) * 4; - src_argb1 += (width & ~3) * 4; - dst_argb += (width & ~3) * 4; - width &= 3; - ARGBBlendRow1_SSE2(src_argb0, src_argb1, dst_argb, width); - } -} #endif // HAS_ARGBBLENDROW_SSE2 - #ifdef HAS_ARGBBLENDROW_SSSE3 // Shuffle table for reversing the bytes. static const uvec8 kShuffleAlpha = { @@ -2318,34 +2290,6 @@ void ARGBBlendRow_Aligned_SSSE3(const uint8* src_argb0, const uint8* src_argb1, ret } } - -void ARGBBlendRow_Any_SSSE3(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { - // Do 1 to 3 pixels to get destination aligned. - if ((uintptr_t)(dst_argb) & 15) { - int count = width; - if (count > 4 && ((intptr_t)(dst_argb) & 3) == 0) { - count = (-(intptr_t)(dst_argb) >> 2) & 3; - } - ARGBBlendRow1_SSE2(src_argb0, src_argb1, dst_argb, count); - src_argb0 += count * 4; - src_argb1 += count * 4; - dst_argb += count * 4; - width -= count; - } - // Do multiple of 4 pixels. - if (width & ~3) { - ARGBBlendRow_Aligned_SSSE3(src_argb0, src_argb1, dst_argb, width & ~3); - } - // Do remaining 1 to 3 pixels - if (width & 3) { - src_argb0 += (width & ~3) * 4; - src_argb1 += (width & ~3) * 4; - dst_argb += (width & ~3) * 4; - width &= 3; - ARGBBlendRow1_SSE2(src_argb0, src_argb1, dst_argb, width); - } -} #endif // HAS_ARGBBLENDROW_SSSE3 #endif // _M_IX86