diff --git a/README.chromium b/README.chromium
index d046e76c4..c54a85c6c 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 201
+Version: 203
 License: BSD
 License File: LICENSE
diff --git a/include/libyuv/planar_functions.h b/include/libyuv/planar_functions.h
index 214235a01..051f848da 100644
--- a/include/libyuv/planar_functions.h
+++ b/include/libyuv/planar_functions.h
@@ -133,6 +133,11 @@ int ARGBCopy(const uint8* src_argb, int src_stride_argb,
              uint8* dst_argb, int dst_stride_argb,
              int width, int height);
 
+// Alpha Blend ARGB
+int ARGBBlend(const uint8* src_argb, int src_stride_argb,
+              uint8* dst_argb, int dst_stride_argb,
+              int width, int height);
+
 int I422ToYUY2(const uint8* src_y, int src_stride_y,
                const uint8* src_u, int src_stride_u,
                const uint8* src_v, int src_stride_v,
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 1c2305867..e069731c8 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,7 +11,7 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 201
+#define LIBYUV_VERSION 203
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index e4c0d9dfa..72afce84d 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -140,6 +140,43 @@ int ARGBCopy(const uint8* src_argb, int src_stride_argb,
   return 0;
 }
 
+
+// Alpha Blend ARGB
+int ARGBBlend(const uint8* src_argb, int src_stride_argb,
+              uint8* dst_argb, int dst_stride_argb,
+              int width, int height) {
+  if (!src_argb || !dst_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+
+  void (*ARGBBlendRow)(const uint8* src_argb, uint8* dst_argb, int width) =
+      ARGBBlendRow_C;
+#if defined(HAS_ARGBBLENDROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGBBlendRow = ARGBBlendRow_SSE2;
+  }
+#endif
+#if defined(HAS_ARGBBLENDROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) &&
+      IS_ALIGNED(width, 2)) {
+    ARGBBlendRow = ARGBBlendRow_SSSE3;
+  }
+#endif
+
+  for (int y = 0; y < height; ++y) {
+    ARGBBlendRow(src_argb, dst_argb, width);
+    src_argb += src_stride_argb;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
 // Convert I422 to ARGB.
 int I422ToARGB(const uint8* src_y, int src_stride_y,
                const uint8* src_u, int src_stride_u,
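
Not part of the patch: a minimal usage sketch of the new ARGBBlend() entry point, assuming the declaration above, the usual libyuv namespace, and the uint8 typedef from libyuv/basic_types.h. Strides are in bytes (at least width * 4 for ARGB), and a negative height blends the source bottom-up, as handled in the implementation above.

#include "libyuv/planar_functions.h"

// Blend a 32x32 ARGB sprite over the top-left corner of a frame in place.
// Fully transparent source pixels leave the destination untouched.
bool BlendSpriteOverFrame(const uint8* sprite_argb, int sprite_stride,
                          uint8* frame_argb, int frame_stride) {
  return libyuv::ARGBBlend(sprite_argb, sprite_stride,
                           frame_argb, frame_stride,
                           32, 32) == 0;
}
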
diff --git a/source/row.h b/source/row.h
index f1da41e50..14bc6dca0 100644
--- a/source/row.h
+++ b/source/row.h
@@ -64,6 +64,11 @@ extern "C" {
 #define HAS_UYVYTOUVROW_SSE2
 #endif
 
+#if defined(_MSC_VER)
+#define HAS_ARGBBLENDROW_SSSE3
+#define HAS_ARGBBLENDROW_SSE2
+#endif
+
 // The following are available on Neon platforms
 #if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM)
 #define HAS_MIRRORROW_NEON
@@ -239,6 +244,10 @@ void YToARGBRow_SSE2(const uint8* y_buf,
                      uint8* rgb_buf,
                      int width);
 
+void ARGBBlendRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBBlendRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBBlendRow_C(const uint8* src_argb, uint8* dst_argb, int width);
+
 // 'Any' wrappers use memcpy()
 void I420ToARGBRow_Any_SSSE3(const uint8* y_buf,
                              const uint8* u_buf,
diff --git a/source/row_common.cc b/source/row_common.cc
index 30b1da6fd..224f7f4f9 100644
--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -452,6 +452,138 @@ void UYVYToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width) {
   }
 }
 
+#define BLENDER(f, b, a) (f * a + b * (a ^ 0xff) + 0x80) >> 8
+void ARGBBlendRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
+  for (int x = 0; x < width - 1; x += 2) {
+    uint32 a = src_argb[3];
+    if (a) {
+      if (a < 255) {
+        const uint32 fb = src_argb[0];
+        const uint32 fg = src_argb[1];
+        const uint32 fr = src_argb[2];
+        const uint32 bb = dst_argb[0];
+        const uint32 bg = dst_argb[1];
+        const uint32 br = dst_argb[2];
+        dst_argb[0] = BLENDER(fb, bb, a);
+        dst_argb[1] = BLENDER(fg, bg, a);
+        dst_argb[2] = BLENDER(fr, br, a);
+        dst_argb[3] = 255u;
+      } else {
+        *(uint32*)dst_argb = *(uint32*)src_argb;
+      }
+    }
+    a = src_argb[4 + 3];
+    if (a) {
+      if (a < 255) {
+        const uint32 fb = src_argb[4 + 0];
+        const uint32 fg = src_argb[4 + 1];
+        const uint32 fr = src_argb[4 + 2];
+        const uint32 bb = dst_argb[4 + 0];
+        const uint32 bg = dst_argb[4 + 1];
+        const uint32 br = dst_argb[4 + 2];
+        dst_argb[4 + 0] = BLENDER(fb, bb, a);
+        dst_argb[4 + 1] = BLENDER(fg, bg, a);
+        dst_argb[4 + 2] = BLENDER(fr, br, a);
+        dst_argb[4 + 3] = 255u;
+      } else {
+        *(uint32*)(dst_argb + 4) = *(uint32*)(src_argb + 4);
+      }
+    }
+    src_argb += 8;
+    dst_argb += 8;
+  }
+
+  if (width & 1) {
+    const uint32 a = src_argb[3];
+    if (a) {
+      if (a < 255) {
+        const uint32 fb = src_argb[0];
+        const uint32 fg = src_argb[1];
+        const uint32 fr = src_argb[2];
+        const uint32 bb = dst_argb[0];
+        const uint32 bg = dst_argb[1];
+        const uint32 br = dst_argb[2];
+        dst_argb[0] = BLENDER(fb, bb, a);
+        dst_argb[1] = BLENDER(fg, bg, a);
+        dst_argb[2] = BLENDER(fr, br, a);
+        dst_argb[3] = 255u;
+      } else {
+        *(uint32*)dst_argb = *(uint32*)src_argb;
+      }
+    }
+  }
+}
+
+#if 0
+void ARGBBlendRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
+  for (int x = 0; x < width - 1; x += 2) {
+    uint32 f = *(uint32*)src_argb;
+    uint32 a = f >> 24;
+    if (a) {
+      const uint32 b = *(uint32*)dst_argb;
+      if (a < 255) {
+        const uint32 src_rb = f & 0x00ff00ff;
+        const uint32 dst_rb = b & 0x00ff00ff;
+        const uint32 out_rb = (src_rb * a + dst_rb * (a ^ 0xff) + 0x00800080) &
+                              0xff00ff00;
+
+        const uint32 src_g = f & 0x0000ff00;
+        const uint32 dst_g = b & 0x0000ff00;
+        const uint32 out_g = ((src_g * a + dst_g * (a ^ 0xff) + 0x00008000) &
+                              0x00ff0000);
+
+        f = ((out_rb | out_g) >> 8) | 0xff000000;
+      }
+      *(uint32*)dst_argb = f;
+    }
+
+    f = *(uint32*)(src_argb + 4);
+    a = f >> 24;
+    if (a) {
+      const uint32 b = *(uint32*)(dst_argb + 4);
+      if (a < 255) {
+        const uint32 src_rb = f & 0x00ff00ff;
+        const uint32 dst_rb = b & 0x00ff00ff;
+        const uint32 out_rb = (src_rb * a + dst_rb * (a ^ 0xff) + 0x00800080) &
+                              0xff00ff00;
+
+        const uint32 src_g = f & 0x0000ff00;
+        const uint32 dst_g = b & 0x0000ff00;
+        const uint32 out_g = ((src_g * a + dst_g * (a ^ 0xff) + 0x00008000) &
+                              0x00ff0000);
+
+        f = ((out_rb | out_g) >> 8) | 0xff000000;
+      }
+      *(uint32*)(dst_argb + 4) = f;
+    }
+    src_argb += 8;
+    dst_argb += 8;
+  }
+
+  if (width & 1) {
+    uint32 f = *(uint32*)src_argb;
+    uint32 a = f >> 24;
+    if (a) {
+      const uint32 b = *(uint32*)dst_argb;
+      if (a < 255) {
+        const uint32 src_rb = f & 0x00ff00ff;
+        const uint32 dst_rb = b & 0x00ff00ff;
+        const uint32 out_rb = (src_rb * a + dst_rb * (a ^ 0xff) + 0x00800080) &
+                              0xff00ff00;
+
+        const uint32 src_g = f & 0x0000ff00;
+        const uint32 dst_g = b & 0x0000ff00;
+        const uint32 out_g = ((src_g * a + dst_g * (a ^ 0xff) + 0x00008000) &
+                              0x00ff0000);
+
+        f = ((out_rb | out_g) >> 8) | 0xff000000;
+      }
+      *(uint32*)dst_argb = f;
+    }
+  }
+}
+#endif
+
 // Wrappers to handle odd sizes/alignments
 #define MAKEYUVANY(NAMEANY, NAME, COPYROW) \
 void NAMEANY(const uint8* y_buf, \
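
Not part of the patch: the BLENDER macro above weights the source channel by a and the destination by (255 - a), adds 0x80 to round, and shifts right by 8, i.e. it divides by 256 rather than 255, which keeps the per-channel cost to two multiplies, an add, and a shift. A standalone restatement with a worked example:

// Scalar equivalent of BLENDER() from row_common.cc (illustration only).
static inline unsigned BlendChannel(unsigned f, unsigned b, unsigned a) {
  return (f * a + b * (a ^ 0xff) + 0x80) >> 8;
}
// Example: f = 200, b = 100, a = 128:
//   (200 * 128 + 100 * 127 + 0x80) >> 8 = 38428 >> 8 = 150
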
diff --git a/source/row_win.cc b/source/row_win.cc
index 8b008e830..519edbb36 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -1909,6 +1909,121 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
 }
 #endif  // HAS_YUY2TOYROW_SSE2
 
+
+#ifdef HAS_ARGBBLENDROW_SSSE3
+// Shuffle table for copying alpha
+static const uvec8 kShuffleAlpha = {
+  7u, 7u, 7u, 7u, 7u, 7u, 0x80, 0x80, 15u, 15u, 15u, 15u, 15u, 15u, 0x80, 0x80
+};
+
+__declspec(naked)
+void ARGBBlendRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
+  __asm {
+    mov        eax, 0x00200020   // rounding constant for 8.6 fixed point
+    movd       xmm3, eax
+    pshufd     xmm3, xmm3, 0
+    mov        eax, 0x3f3f3f3f   // mask for alpha
+    movd       xmm7, eax
+    pshufd     xmm7, xmm7, 0
+    movdqa     xmm4, kShuffleAlpha
+    pcmpeqb    xmm5, xmm5        // generate mask 0x00ff00ff
+    psrlw      xmm5, 8
+    pcmpeqb    xmm6, xmm6        // generate 0x00010001 for negating
+    psrlw      xmm6, 15
+    mov        eax, [esp + 4]    // src_argb
+    mov        edx, [esp + 8]    // dst_argb
+    mov        ecx, [esp + 12]   // width
+    sub        edx, eax
+
+  convertloop:
+    movq       xmm0, qword ptr [eax]        // fetch 2 pixels
+    movq       xmm1, qword ptr [eax + edx]
+    punpcklbw  xmm1, xmm0        // mix 2 pixels aArRgGbB_aArRgGbB
+    movdqa     xmm2, xmm1        // alpha from byte 7 and 15
+    pshufb     xmm2, xmm4
+    pxor       xmm2, xmm5
+    psrlw      xmm2, 2
+    pand       xmm2, xmm7
+    paddw      xmm2, xmm6        // -a = (a^255)+1
+    pmaddubsw  xmm1, xmm2
+    paddw      xmm1, xmm3        // round
+    psrlw      xmm1, 6
+
+    packuswb   xmm1, xmm1        // pack 2 pixels
+    sub        ecx, 2
+    movq       qword ptr [eax + edx], xmm1
+    lea        eax, [eax + 8]
+    ja         convertloop
+
+    ret
+  }
+}
+#endif  // HAS_ARGBBLENDROW_SSSE3
+
+#ifdef HAS_ARGBBLENDROW_SSE2
+// TODO(fbarchard): Single multiply method b+a(f-b)
+// TODO(fbarchard): Unroll and pair
+// TODO(fbarchard): Test for transparent and opaque common cases
+__declspec(naked)
+void ARGBBlendRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
+  __asm {
+    pcmpeqb    xmm4, xmm4        // generate 0xffffffff for negating alpha
+    mov        eax, [esp + 4]    // src_argb
+    mov        edx, [esp + 8]    // dst_argb
+    mov        ecx, [esp + 12]   // width
+    sub        edx, eax
+    sub        ecx, 1
+    je         last1
+
+  convertloop:
+    movq       xmm0, qword ptr [eax]        // fetch 2 pixels
+    movq       xmm1, qword ptr [eax + edx]
+    punpcklbw  xmm0, xmm0        // src 16 bits
+    punpcklbw  xmm1, xmm1        // dst 16 bits
+    pshuflw    xmm2, xmm0, 0xff  // src alpha
+    pshufhw    xmm2, xmm2, 0xff
+    movdqa     xmm3, xmm2        // dst alpha
+    pxor       xmm3, xmm4
+    pmulhuw    xmm0, xmm2        // src * a
+    pmulhuw    xmm1, xmm3        // dst * (a ^ 0xffff)
+    paddw      xmm0, xmm1
+    psrlw      xmm0, 8
+    packuswb   xmm0, xmm0        // pack 2 pixels
+    sub        ecx, 2
+    movq       qword ptr [eax + edx], xmm0
+    lea        eax, [eax + 8]
+    ja         convertloop
+
+  last1:
+    add        ecx, 1
+    je         done
+
+    mov        ecx, [eax]        // handle remaining pixel
+    movd       xmm0, ecx
+    mov        ecx, [eax + edx]
+    movd       xmm1, ecx
+    punpcklbw  xmm0, xmm0        // src 16 bits
+    punpcklbw  xmm1, xmm1        // dst 16 bits
+    pshuflw    xmm2, xmm0, 0xff  // src alpha
+    pshufhw    xmm2, xmm2, 0xff
+    movdqa     xmm3, xmm2        // dst alpha
+    pxor       xmm3, xmm4
+    pmulhuw    xmm0, xmm2        // src * a
+    pmulhuw    xmm1, xmm3        // dst * (a ^ 0xffff)
+    paddw      xmm0, xmm1
+    psrlw      xmm0, 8
+    packuswb   xmm0, xmm0        // pack 1 pixel
+
+    movd       ecx, xmm0
+    mov        dword ptr [eax + edx], ecx
+
+  done:
+
+    ret
+  }
+}
+#endif  // HAS_ARGBBLENDROW_SSE2
+
 #endif  // _M_IX86
 
 #ifdef __cplusplus
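
Not part of the patch: a scalar sketch of the per-channel arithmetic the SSSE3 row sets up. The shuffle broadcasts each pixel's source alpha, the xor/shift/mask sequence reduces the source and destination weights to 6 bits (so they sum to 64), and a single pmaddubsw then computes dst*(64 - w) + src*w per channel in 8.6 fixed point, with 0x0020 as the rounding constant before the shift by 6. The function and variable names below are illustrative only.

// Scalar model of the SSSE3 weighting (illustration, not from the patch).
static inline unsigned BlendChannel6Bit(unsigned f, unsigned b, unsigned a) {
  const unsigned wf = a >> 2;                 // source weight, 0..63
  const unsigned wb = ((a ^ 0xff) >> 2) + 1;  // destination weight, 64 - wf
  return (f * wf + b * wb + 0x20) >> 6;       // round, drop 8.6 fraction bits
}
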
diff --git a/source/scale.cc b/source/scale.cc
index c2fcbb2c0..44ba9378d 100644
--- a/source/scale.cc
+++ b/source/scale.cc
@@ -1699,20 +1699,21 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
                               uint16* dst_ptr, int src_width,
                               int src_height) {
   int tmp_height = 0;
   intptr_t tmp_src = 0;
+  intptr_t tmp_src_stride = static_cast<intptr_t>(src_stride);
   asm volatile (
     "pxor      %%xmm4,%%xmm4                   \n"
-    "sub       $0x1,%5                         \n"
+    "sub       $0x1,%6                         \n"
   "1:                                          \n"
     "movdqa    (%0),%%xmm0                     \n"
     "mov       %0,%3                           \n"
-    "add       %6,%0                           \n"
+    "add       %4,%0                           \n"
     "movdqa    %%xmm0,%%xmm1                   \n"
     "punpcklbw %%xmm4,%%xmm0                   \n"
     "punpckhbw %%xmm4,%%xmm1                   \n"
-    "mov       %5,%2                           \n"
+    "mov       %6,%2                           \n"
   "2:                                          \n"
     "movdqa    (%0),%%xmm2                     \n"
-    "add       %6,%0                           \n"
+    "add       %4,%0                           \n"
     "movdqa    %%xmm2,%%xmm3                   \n"
     "punpcklbw %%xmm4,%%xmm2                   \n"
     "punpckhbw %%xmm4,%%xmm3                   \n"
@@ -1724,15 +1725,16 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
     "movdqa    %%xmm1,0x10(%1)                 \n"
     "lea       0x10(%3),%0                     \n"
     "lea       0x20(%1),%1                     \n"
-    "sub       $0x10,%4                        \n"
+    "sub       $0x10,%5                        \n"
     "ja        1b                              \n"
   : "+r"(src_ptr),         // %0
     "+r"(dst_ptr),         // %1
     "+r"(tmp_height),      // %2
     "+r"(tmp_src),         // %3
-    "+r"(src_width),       // %4
-    "+rm"(src_height)      // %5
-  : "rm"(static_cast<intptr_t>(src_stride))  // %6
+    "+r"(tmp_src_stride),  // %4
+    "+rm"(src_width),      // %5
+    "+rm"(src_height)      // %6
+  :
   : "memory", "cc"
 #if defined(__SSE2__)
     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
@@ -1740,7 +1742,6 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
 #endif
   );
 }
-
 #if defined(__i386__)
 extern "C" void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
                                       uint8* dst_ptr, int dst_width);
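
Not part of the patch: in the ScaleAddRows_SSE2 change above, src_stride is copied into a local (tmp_src_stride) and passed as a "+r" read/write register operand instead of a read-only "rm" input, which is why the remaining operands are renumbered. A minimal standalone illustration of that constraint style, with made-up names:

#include <stdint.h>

// Advance a pointer by a stride inside GCC extended asm using "+r" operands.
static inline const uint8_t* AdvanceByStride(const uint8_t* ptr, int stride) {
  intptr_t tmp_stride = static_cast<intptr_t>(stride);
  asm volatile (
    "add       %1,%0                           \n"
    : "+r"(ptr),          // %0: read/write pointer
      "+r"(tmp_stride)    // %1: read/write stride copy
    :
    : "cc"
  );
  return ptr;
}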