diff --git a/README.chromium b/README.chromium
index 3e952aab2..3b3aed2b2 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1546
+Version: 1547
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 587e0f643..40e53e15c 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -252,6 +252,12 @@ extern "C" {
 #define HAS_RGB565TOARGBROW_AVX2
 #endif
 
+// The following are available for 32 bit Visual C and clangcl 32 bit:
+// TODO(fbarchard): Port to gcc.
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
+#define HAS_BLENDPLANEROW_SSSE3
+#endif
+
 // The following are also available on x64 Visual C.
 #if !defined(LIBYUV_DISABLE_X86) && defined (_M_X64) && \
     (!defined(__clang__) || defined(__SSSE3__))
@@ -1454,6 +1460,12 @@ void ARGBBlendRow_NEON(const uint8* src_argb, const uint8* src_argb1,
 void ARGBBlendRow_C(const uint8* src_argb, const uint8* src_argb1,
                     uint8* dst_argb, int width);
 
+// Unattenuated planar alpha blend.
+void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,
+                         const uint8* alpha, uint8* dst, int width);
+void BlendPlaneRow_C(const uint8* src0, const uint8* src1,
+                     const uint8* alpha, uint8* dst, int width);
+
 // ARGB multiply images. Same API as Blend, but these require
 // pointer and width alignment for SSE2.
 void ARGBMultiplyRow_C(const uint8* src_argb, const uint8* src_argb1,
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 85c18574e..10754081b 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 1546
+#define LIBYUV_VERSION 1547
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
diff --git a/source/row_common.cc b/source/row_common.cc
index cb806b911..3f003aa10 100644
--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -2016,6 +2016,19 @@ void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1,
   }
 }
 #undef BLEND
+
+// Blend two planes: dst = (alpha * src0 + (255 - alpha) * src1 + 255) >> 8.
+void BlendPlaneRow_C(const uint8* src0, const uint8* src1,
+                     const uint8* alpha, uint8* dst, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    uint32 f = *src0++;
+    uint32 b = *src1++;
+    uint32 a = *alpha++;
+    *dst++ = (((a) * f) + ((255 - a) * b) + 255) >> 8;
+  }
+}
+
 #define ATTENUATE(f, a) (a | (a << 8)) * (f | (f << 8)) >> 24
 
 // Multiply source RGB by alpha and store to destination.
diff --git a/source/row_win.cc b/source/row_win.cc
index aa94487c1..e3353cabf 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -4063,6 +4063,59 @@ void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
 }
 #endif  // HAS_YUY2TOYROW_SSE2
 
+#ifdef HAS_BLENDPLANEROW_SSSE3
+// Blend 8 pixels at a time.
+// dst = (alpha * src0 + (255 - alpha) * src1 + 255) >> 8
+// At least one group of 8 is processed; width is rounded up to a multiple of 8.
+__declspec(naked)
+void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,
+                         const uint8* alpha, uint8* dst, int width) {
+  __asm {
+    push       esi
+    push       edi
+    pcmpeqb    xmm5, xmm5       // generate mask 0xff00ff00
+    psllw      xmm5, 8
+    mov        eax, 0x80808080  // 128 for biasing image to signed.
+    movd       xmm6, eax
+    pshufd     xmm6, xmm6, 0x00
+
+    mov        eax, 0x807f807f  // 32768 + 127 for unbias and round.
+    movd       xmm7, eax
+    pshufd     xmm7, xmm7, 0x00
+    mov        eax, [esp + 8 + 4]   // src0
+    mov        edx, [esp + 8 + 8]   // src1
+    mov        esi, [esp + 8 + 12]  // alpha
+    mov        edi, [esp + 8 + 16]  // dst
+    mov        ecx, [esp + 8 + 20]  // width
+    sub        eax, esi  // src0, src1 and dst become offsets from alpha.
+    sub        edx, esi
+    sub        edi, esi
+
+    // 8 pixel loop.
+  convertloop8:
+    movq       xmm0, qword ptr [esi]        // alpha
+    punpcklbw  xmm0, xmm0
+    pxor       xmm0, xmm5         // a, 255-a
+    movq       xmm1, qword ptr [eax + esi]  // src0
+    movq       xmm2, qword ptr [edx + esi]  // src1
+    punpcklbw  xmm1, xmm2
+    psubb      xmm1, xmm6         // bias src0/1 - 128
+    pmaddubsw  xmm0, xmm1
+    paddw      xmm0, xmm7         // unbias result - 32768 and round.
+    psrlw      xmm0, 8
+    packuswb   xmm0, xmm0
+    movq       qword ptr [edi + esi], xmm0
+    lea        esi, [esi + 8]
+    sub        ecx, 8
+    jg         convertloop8  // jge ran one extra pass when width % 8 == 0.
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_BLENDPLANEROW_SSSE3
+
 #ifdef HAS_ARGBBLENDROW_SSSE3
 // Shuffle table for isolating alpha.
 static const uvec8 kShuffleAlpha = {
diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc
index 1416408f4..fc22fe139 100644
--- a/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
@@ -1163,6 +1163,87 @@ TEST_F(LibYUVPlanarTest, ARGBBlend_Opt) {
   EXPECT_LE(max_diff, 1);
 }
 
+#ifdef HAS_BLENDPLANEROW_SSSE3
+// TODO(fbarchard): Switch to I420Blend.
+static void TestBlendPlane(int width, int height, int benchmark_iterations,
+                           int invert, int off) {  // invert is unused.
+  int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
+  width = width * height;  // Blend the whole image as one long row.
+  height = 1;
+  if (width < 1) {
+    width = 1;
+  }
+  if (width < 256) {  // The sanity check below uses 256 pixels.
+    width = 256;
+  }
+  const int kBpp = 1;
+  const int kStride = width * kBpp;
+  align_buffer_64(src_argb_a, kStride * height + off);
+  align_buffer_64(src_argb_b, kStride * height + off);
+  align_buffer_64(src_argb_alpha, kStride * height + off);
+  align_buffer_64(dst_argb_c, kStride * height);
+  align_buffer_64(dst_argb_opt, kStride * height);
+
+  if (has_ssse3) {  // Sanity check: alpha 255 must reproduce src0 exactly.
+    for (int i = 0; i < 256; ++i) {
+      src_argb_a[i + off] = i;
+      src_argb_b[i + off] = 255 - i;
+      src_argb_alpha[i + off] = 255;
+    }
+    memset(dst_argb_opt, 0xfb, kStride * height);
+    BlendPlaneRow_SSSE3(src_argb_a + off,
+                        src_argb_b + off,
+                        src_argb_alpha + off,
+                        dst_argb_opt,
+                        256);  // Only the 256 initialized pixels.
+    for (int i = 0; i < 256; ++i) {
+      EXPECT_EQ(src_argb_a[i + off], dst_argb_opt[i]);
+    }
+  }
+  for (int i = 0; i < kStride * height; ++i) {  // Random inputs for C vs Opt.
+    src_argb_a[i + off] = (fastrand() & 0xff);
+    src_argb_b[i + off] = (fastrand() & 0xff);
+    src_argb_alpha[i + off] = (fastrand() & 0xff);
+  }
+  memset(dst_argb_c, 255, kStride * height);
+  memset(dst_argb_opt, 255, kStride * height);
+
+  BlendPlaneRow_C(src_argb_a + off,
+                  src_argb_b + off,
+                  src_argb_alpha + off,
+                  dst_argb_c,
+                  width * height);
+  for (int i = 0; i < benchmark_iterations; ++i) {
+    if (has_ssse3) {
+      BlendPlaneRow_SSSE3(src_argb_a + off,
+                          src_argb_b + off,
+                          src_argb_alpha + off,
+                          dst_argb_opt,
+                          width * height);
+    } else {
+      BlendPlaneRow_C(src_argb_a + off,
+                      src_argb_b + off,
+                      src_argb_alpha + off,
+                      dst_argb_opt,
+                      width * height);
+    }
+  }
+  for (int i = 0; i < kStride * height; ++i) {
+    EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]);
+  }
+  free_aligned_buffer_64(src_argb_a);
+  free_aligned_buffer_64(src_argb_b);
+  free_aligned_buffer_64(src_argb_alpha);  // Was leaked before.
+  free_aligned_buffer_64(dst_argb_c);
+  free_aligned_buffer_64(dst_argb_opt);
+}
+
+TEST_F(LibYUVPlanarTest, BlendPlane_Opt) {
+  TestBlendPlane(benchmark_width_, benchmark_height_, benchmark_iterations_,
+                 +1, 0);
+}
+#endif
+
 TEST_F(LibYUVPlanarTest, TestAffine) {
   SIMD_ALIGNED(uint8 orig_pixels_0[1280][4]);
   SIMD_ALIGNED(uint8 interpolate_pixels_C[1280][4]);