From c757f308eab211f9d5467a089052e7d84606f6c1 Mon Sep 17 00:00:00 2001 From: "fbarchard@google.com" Date: Tue, 3 Apr 2012 00:49:16 +0000 Subject: [PATCH] Alpha blend 2 sources and store to a destination. Useful for A under B blending. BUG=none TEST=none Review URL: https://webrtc-codereview.appspot.com/472005 git-svn-id: http://libyuv.googlecode.com/svn/trunk@233 16f28f9a-4ce2-e073-06de-1de4eb20be90 --- README.chromium | 2 +- include/libyuv/planar_functions.h | 28 ++- include/libyuv/version.h | 2 +- source/planar_functions.cc | 70 ++++++++ source/row.h | 12 ++ source/row_common.cc | 71 ++++++++ source/row_posix.cc | 170 ++++++++++++++++++ source/row_win.cc | 282 ++++++++++++++++++++++++++++++ 8 files changed, 627 insertions(+), 10 deletions(-) diff --git a/README.chromium b/README.chromium index e4e61cf01..b83e57530 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 232 +Version: 233 License: BSD License File: LICENSE diff --git a/include/libyuv/planar_functions.h b/include/libyuv/planar_functions.h index 756fa67c3..87de9b6b7 100644 --- a/include/libyuv/planar_functions.h +++ b/include/libyuv/planar_functions.h @@ -26,12 +26,12 @@ void SetPlane(uint8* dst_y, int dst_stride_y, int width, int height, uint32 value); -// Copy a plane of data (I420 to I400) +// Copy a plane of data (I420 to I400). void CopyPlane(const uint8* src_y, int src_stride_y, uint8* dst_y, int dst_stride_y, int width, int height); -// I420 mirror +// I420 mirror. int I420Mirror(const uint8* src_y, int src_stride_y, const uint8* src_u, int src_stride_u, const uint8* src_v, int src_stride_v, @@ -71,7 +71,7 @@ int I400ToARGB(const uint8* src_y, int src_stride_y, uint8* dst_argb, int dst_stride_argb, int width, int height); -// Convert I400 to ARGB. Reverse of ARGBToI400 +// Convert I400 to ARGB. Reverse of ARGBToI400. int I400ToARGB_Reference(const uint8* src_y, int src_stride_y, uint8* dst_argb, int dst_stride_argb, int width, int height); @@ -86,7 +86,7 @@ int RGB24ToARGB(const uint8* src_bg24, int src_stride_bg24, uint8* dst_argb, int dst_stride_argb, int width, int height); -// Deprecated function name +// Deprecated function name. #define BG24ToARGB RGB24ToARGB // Convert ABGR to ARGB. Also used for ARGB to ABGR. @@ -114,7 +114,7 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb, uint8* dst_y, int dst_stride_y, int width, int height); -// Draw a rectangle into I420 +// Draw a rectangle into I420. int I420Rect(uint8* dst_y, int dst_stride_y, uint8* dst_u, int dst_stride_u, uint8* dst_v, int dst_stride_v, @@ -122,7 +122,7 @@ int I420Rect(uint8* dst_y, int dst_stride_y, int width, int height, int value_y, int value_u, int value_v); -// Draw a rectangle into ARGB +// Draw a rectangle into ARGB. int ARGBRect(uint8* dst_argb, int dst_stride_argb, int x, int y, int width, int height, @@ -133,20 +133,32 @@ int ARGBCopy(const uint8* src_argb, int src_stride_argb, uint8* dst_argb, int dst_stride_argb, int width, int height); -// Alpha Blend ARGB row of pixels +// Alpha Blend ARGB row of pixels. void ARGBBlendRow(const uint8* src_argb, uint8* dst_argb, int width); -// Alpha Blend ARGB +// Alpha Blend 2 rows of ARGB pixels and store to destination. +void ARGBBlend2Row(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width); + +// Alpha Blend ARGB. int ARGBBlend(const uint8* src_argb, int src_stride_argb, uint8* dst_argb, int dst_stride_argb, int width, int height); +// Alpha Blend 2 ARGB images and store to destination. 
+int ARGB2Blend(const uint8* src_argb0, int src_stride_argb0, + const uint8* src_argb1, int src_stride_argb1, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert I422 to YUY2. int I422ToYUY2(const uint8* src_y, int src_stride_y, const uint8* src_u, int src_stride_u, const uint8* src_v, int src_stride_v, uint8* dst_frame, int dst_stride_frame, int width, int height); +// Convert I422 to UYVY. int I422ToUYVY(const uint8* src_y, int src_stride_y, const uint8* src_u, int src_stride_u, const uint8* src_v, int src_stride_v, diff --git a/include/libyuv/version.h b/include/libyuv/version.h index c9aaade97..aced5e45c 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,7 +11,7 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define INCLUDE_LIBYUV_VERSION 232 +#define INCLUDE_LIBYUV_VERSION 233 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/planar_functions.cc b/source/planar_functions.cc index 67cb7a46e..866fcb4fd 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -139,6 +139,12 @@ int ARGBCopy(const uint8* src_argb, int src_stride_argb, // Alpha Blend ARGB void ARGBBlendRow(const uint8* src_argb, uint8* dst_argb, int width) { +#if defined(HAS_ARGBBLENDROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBBlendRow_SSSE3(src_argb, dst_argb, width); + return; + } +#endif #if defined(HAS_ARGBBLENDROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { ARGBBlendRow_SSE2(src_argb, dst_argb, width); @@ -148,7 +154,26 @@ void ARGBBlendRow(const uint8* src_argb, uint8* dst_argb, int width) { ARGBBlendRow_C(src_argb, dst_argb, width); } +// Alpha Blend 2 rows of ARGB pixels and store to destination. +void ARGBBlend2Row(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { +#if defined(HAS_ARGBBLENDROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBBlend2Row_SSSE3(src_argb0, src_argb1, dst_argb, width); + return; + } +#endif +#if defined(HAS_ARGBBLENDROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBBlend2Row_SSE2(src_argb0, src_argb1, dst_argb, width); + return; + } +#endif + ARGBBlend2Row_C(src_argb0, src_argb1, dst_argb, width); +} + // Alpha Blend ARGB +// TODO(fbarchard): Call 3 pointer low levels to reduce code size. int ARGBBlend(const uint8* src_argb, int src_stride_argb, uint8* dst_argb, int dst_stride_argb, int width, int height) { @@ -191,6 +216,51 @@ int ARGBBlend(const uint8* src_argb, int src_stride_argb, return 0; } +// Alpha Blend 2 ARGB images and store to destination. +int ARGB2Blend(const uint8* src_argb0, int src_stride_argb0, + const uint8* src_argb1, int src_stride_argb1, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+
+  void (*ARGBBlend2Row)(const uint8* src_argb0, const uint8* src_argb1,
+                        uint8* dst_argb, int width) = ARGBBlend2Row_C;
+#if defined(HAS_ARGBBLENDROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGBBlend2Row = ARGBBlend2Row_SSE2;
+    if (IS_ALIGNED(width, 4) &&
+        IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+      ARGBBlend2Row = ARGBBlend2Row_Aligned_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBBLENDROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBBlend2Row = ARGBBlend2Row_SSSE3;
+    if (IS_ALIGNED(width, 4) &&
+        IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+      ARGBBlend2Row = ARGBBlend2Row_Aligned_SSSE3;
+    }
+  }
+#endif
+
+  for (int y = 0; y < height; ++y) {
+    ARGBBlend2Row(src_argb0, src_argb1, dst_argb, width);
+    src_argb0 += src_stride_argb0;
+    src_argb1 += src_stride_argb1;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
 // Convert I422 to ARGB.
 int I422ToARGB(const uint8* src_y, int src_stride_y,
                const uint8* src_u, int src_stride_u,
diff --git a/source/row.h b/source/row.h
index 2b53b1c61..c70160025 100644
--- a/source/row.h
+++ b/source/row.h
@@ -273,6 +273,18 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width);
 void ARGBBlendRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
 void ARGBBlendRow_C(const uint8* src_argb, uint8* dst_argb, int width);
 
+// ARGB preattenuated alpha blend with 2 sources and a destination.
+void ARGBBlend2Row_Aligned_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
+                                 uint8* dst_argb, int width);
+void ARGBBlend2Row_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+                                uint8* dst_argb, int width);
+void ARGBBlend2Row_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
+                         uint8* dst_argb, int width);
+void ARGBBlend2Row_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+                        uint8* dst_argb, int width);
+void ARGBBlend2Row_C(const uint8* src_argb0, const uint8* src_argb1,
+                     uint8* dst_argb, int width);
+
 // 'Any' functions handle any size and alignment.
 void I420ToARGBRow_Any_SSSE3(const uint8* y_buf,
                              const uint8* u_buf,
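
[Reviewer note: "preattenuated" above means the source pixels carry
premultiplied alpha, so a fully transparent pixel is all zeroes. A minimal
scalar sketch of the per-channel operation every ARGBBlend2Row variant
performs; blend_channel is a hypothetical helper, not part of this patch,
and uint8/uint32 are the libyuv basic types:

    // dst = fg + bg * (256 - a) / 256, with unsigned saturation.
    // At a == 0 the term is (256 * bg) >> 8 == bg; at a == 255 it is
    // (1 * bg) >> 8 == 0, so both endpoints are exact.
    static inline uint8 blend_channel(uint8 fg, uint8 bg, uint8 a) {
      const uint32 v = fg + (((256 - a) * bg) >> 8);
      return v > 255 ? 255 : static_cast<uint8>(v);
    }

Because fg is premultiplied, a == 0 implies fg == 0 and the result collapses
to bg; the C row function below simply short-circuits the a == 0 and a == 255
cases with whole-pixel copies.]
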
diff --git a/source/row_common.cc b/source/row_common.cc
index b57ffb4d0..d2f17ef30 100644
--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -518,6 +518,77 @@ void ARGBBlendRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
   }
 }
 
+// Blend src_argb0 over src_argb1 and store to dst_argb.
+// dst_argb may be src_argb0 or src_argb1.
+void ARGBBlend2Row_C(const uint8* src_argb0, const uint8* src_argb1,
+                     uint8* dst_argb, int width) {
+  for (int x = 0; x < width - 1; x += 2) {
+    uint32 a = src_argb0[3];
+    if (a == 0) {
+      *reinterpret_cast<uint32*>(dst_argb) =
+          *reinterpret_cast<const uint32*>(src_argb1);
+    } else if (a == 255) {
+      *reinterpret_cast<uint32*>(dst_argb) =
+          *reinterpret_cast<const uint32*>(src_argb0);
+    } else {
+      const uint32 fb = src_argb0[0];
+      const uint32 fg = src_argb0[1];
+      const uint32 fr = src_argb0[2];
+      const uint32 bb = src_argb1[0];
+      const uint32 bg = src_argb1[1];
+      const uint32 br = src_argb1[2];
+      dst_argb[0] = BLENDER(fb, bb, a);
+      dst_argb[1] = BLENDER(fg, bg, a);
+      dst_argb[2] = BLENDER(fr, br, a);
+      dst_argb[3] = 255u;
+    }
+    a = src_argb0[4 + 3];
+    if (a == 0) {
+      *reinterpret_cast<uint32*>(dst_argb + 4) =
+          *reinterpret_cast<const uint32*>(src_argb1 + 4);
+    } else if (a == 255) {
+      *reinterpret_cast<uint32*>(dst_argb + 4) =
+          *reinterpret_cast<const uint32*>(src_argb0 + 4);
+    } else {
+      const uint32 fb = src_argb0[4 + 0];
+      const uint32 fg = src_argb0[4 + 1];
+      const uint32 fr = src_argb0[4 + 2];
+      const uint32 bb = src_argb1[4 + 0];
+      const uint32 bg = src_argb1[4 + 1];
+      const uint32 br = src_argb1[4 + 2];
+      dst_argb[4 + 0] = BLENDER(fb, bb, a);
+      dst_argb[4 + 1] = BLENDER(fg, bg, a);
+      dst_argb[4 + 2] = BLENDER(fr, br, a);
+      dst_argb[4 + 3] = 255u;
+    }
+    src_argb0 += 8;
+    src_argb1 += 8;
+    dst_argb += 8;
+  }
+
+  if (width & 1) {
+    uint32 a = src_argb0[3];
+    if (a == 0) {
+      *reinterpret_cast<uint32*>(dst_argb) =
+          *reinterpret_cast<const uint32*>(src_argb1);
+    } else if (a == 255) {
+      *reinterpret_cast<uint32*>(dst_argb) =
+          *reinterpret_cast<const uint32*>(src_argb0);
+    } else {
+      const uint32 fb = src_argb0[0];
+      const uint32 fg = src_argb0[1];
+      const uint32 fr = src_argb0[2];
+      const uint32 bb = src_argb1[0];
+      const uint32 bg = src_argb1[1];
+      const uint32 br = src_argb1[2];
+      dst_argb[0] = BLENDER(fb, bb, a);
+      dst_argb[1] = BLENDER(fg, bg, a);
+      dst_argb[2] = BLENDER(fr, br, a);
+      dst_argb[3] = 255u;
+    }
+  }
+}
+
 // Wrappers to handle odd sizes/alignments
 #define YUVANY(NAMEANY, I420TORGB_SSE, I420TORGB_C) \
     void NAMEANY(const uint8* y_buf, \
diff --git a/source/row_posix.cc b/source/row_posix.cc
index f839e204e..e7cfb011a 100644
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -2176,6 +2176,176 @@ void ARGBBlendRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
 
 #endif  // HAS_ARGBBLENDROW_SSE2
 
+
+
+
+
+
+
+
+#ifdef HAS_ARGBBLENDROW_SSE2
+// Blend 8 pixels at a time.
+// Destination aligned to 16 bytes, multiple of 4 pixels.
+void ARGBBlend2Row_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+                                uint8* dst_argb, int width) {
+  asm volatile (
+    "pcmpeqb   %%xmm7,%%xmm7                   \n"
+    "psrlw     $0xf,%%xmm7                     \n"
+    "pcmpeqb   %%xmm6,%%xmm6                   \n"
+    "psrlw     $0x8,%%xmm6                     \n"
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "psllw     $0x8,%%xmm5                     \n"
+    "pcmpeqb   %%xmm4,%%xmm4                   \n"
+    "pslld     $0x18,%%xmm4                    \n"
+
+    // 8 pixel loop.
+  "1:                                          \n"
+    "movdqu    (%0),%%xmm3                     \n"  // first 4 pixels
+    "movdqa    %%xmm3,%%xmm0                   \n"
+    "pxor      %%xmm4,%%xmm3                   \n"
+    "movdqa    (%1),%%xmm2                     \n"
+    "psrlw     $0x8,%%xmm3                     \n"
+    "pshufhw   $0xf5,%%xmm3,%%xmm3             \n"
+    "pshuflw   $0xf5,%%xmm3,%%xmm3             \n"
+    "pand      %%xmm6,%%xmm2                   \n"
+    "paddw     %%xmm7,%%xmm3                   \n"
+    "pmullw    %%xmm3,%%xmm2                   \n"
+    "movdqa    (%1),%%xmm1                     \n"
+    "psrlw     $0x8,%%xmm1                     \n"
+    "por       %%xmm4,%%xmm0                   \n"
+    "pmullw    %%xmm3,%%xmm1                   \n"
+    "movdqu    0x10(%0),%%xmm3                 \n"
+    "lea       0x20(%0),%0                     \n"
+    "psrlw     $0x8,%%xmm2                     \n"
+    "paddusb   %%xmm2,%%xmm0                   \n"
+    "pand      %%xmm5,%%xmm1                   \n"
+    "paddusb   %%xmm1,%%xmm0                   \n"
+    "sub       $0x4,%3                         \n"
+    "movdqa    %%xmm0,(%2)                     \n"
+    "jle       9f                              \n"
+    "movdqa    %%xmm3,%%xmm0                   \n"  // next 4 pixels
+    "pxor      %%xmm4,%%xmm3                   \n"
+    "movdqa    0x10(%1),%%xmm2                 \n"
+    "psrlw     $0x8,%%xmm3                     \n"
+    "pshufhw   $0xf5,%%xmm3,%%xmm3             \n"
+    "pshuflw   $0xf5,%%xmm3,%%xmm3             \n"
+    "pand      %%xmm6,%%xmm2                   \n"
+    "paddw     %%xmm7,%%xmm3                   \n"
+    "pmullw    %%xmm3,%%xmm2                   \n"
+    "movdqa    0x10(%1),%%xmm1                 \n"
+    "lea       0x20(%1),%1                     \n"
+    "psrlw     $0x8,%%xmm1                     \n"
+    "por       %%xmm4,%%xmm0                   \n"
+    "pmullw    %%xmm3,%%xmm1                   \n"
+    "psrlw     $0x8,%%xmm2                     \n"
+    "paddusb   %%xmm2,%%xmm0                   \n"
+    "pand      %%xmm5,%%xmm1                   \n"
+    "paddusb   %%xmm1,%%xmm0                   \n"
+    "sub       $0x4,%3                         \n"
+    "movdqa    %%xmm0,0x10(%2)                 \n"
+    "lea       0x20(%2),%2                     \n"
+    "jg        1b                              \n"
+  "9:                                          \n"
+  : "+r"(src_argb0),  // %0
+    "+r"(src_argb1),  // %1
+    "+r"(dst_argb),   // %2
+    "+r"(width)       // %3
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+  );
+}
+
+// Blend 1 pixel at a time, unaligned.
+void ARGBBlend2Row1_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+                         uint8* dst_argb, int width) {
+  asm volatile (
+    "pcmpeqb   %%xmm7,%%xmm7                   \n"
+    "psrlw     $0xf,%%xmm7                     \n"
+    "pcmpeqb   %%xmm6,%%xmm6                   \n"
+    "psrlw     $0x8,%%xmm6                     \n"
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "psllw     $0x8,%%xmm5                     \n"
+    "pcmpeqb   %%xmm4,%%xmm4                   \n"
+    "pslld     $0x18,%%xmm4                    \n"
+
+    // 1 pixel loop.
+  "1:                                          \n"
+    "movd      (%0),%%xmm3                     \n"
+    "lea       0x4(%0),%0                      \n"
+    "movdqa    %%xmm3,%%xmm0                   \n"
+    "pxor      %%xmm4,%%xmm3                   \n"
+    "movd      (%1),%%xmm2                     \n"
+    "psrlw     $0x8,%%xmm3                     \n"
+    "pshufhw   $0xf5,%%xmm3,%%xmm3             \n"
+    "pshuflw   $0xf5,%%xmm3,%%xmm3             \n"
+    "pand      %%xmm6,%%xmm2                   \n"
+    "paddw     %%xmm7,%%xmm3                   \n"
+    "pmullw    %%xmm3,%%xmm2                   \n"
+    "movd      (%1),%%xmm1                     \n"
+    "lea       0x4(%1),%1                      \n"
+    "psrlw     $0x8,%%xmm1                     \n"
+    "por       %%xmm4,%%xmm0                   \n"
+    "pmullw    %%xmm3,%%xmm1                   \n"
+    "psrlw     $0x8,%%xmm2                     \n"
+    "paddusb   %%xmm2,%%xmm0                   \n"
+    "pand      %%xmm5,%%xmm1                   \n"
+    "paddusb   %%xmm1,%%xmm0                   \n"
+    "sub       $0x1,%3                         \n"
+    "movd      %%xmm0,(%2)                     \n"
+    "lea       0x4(%2),%2                      \n"
+    "jg        1b                              \n"
+  : "+r"(src_argb0),  // %0
+    "+r"(src_argb1),  // %1
+    "+r"(dst_argb),   // %2
+    "+r"(width)       // %3
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+  );
+}
+
+void ARGBBlend2Row_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+                        uint8* dst_argb, int width) {
+  // Do 1 to 3 pixels to get destination aligned; all pixels if it cannot be.
+  if ((uintptr_t)(dst_argb) & 15) {
+    int count = width;
+    if (count > 4 && ((intptr_t)(dst_argb) & 3) == 0) {
+      count = (-(intptr_t)(dst_argb) >> 2) & 3;
+    }
+    ARGBBlend2Row1_SSE2(src_argb0, src_argb1, dst_argb, count);
+    src_argb0 += count * 4;
+    src_argb1 += count * 4;
+    dst_argb += count * 4;
+    width -= count;
+  }
+  // Do multiple of 4 pixels.
+  if (width & ~3) {
+    ARGBBlend2Row_Aligned_SSE2(src_argb0, src_argb1, dst_argb, width & ~3);
+  }
+  // Do remaining 1 to 3 pixels.
+  if (width & 3) {
+    src_argb0 += (width & ~3) * 4;
+    src_argb1 += (width & ~3) * 4;
+    dst_argb += (width & ~3) * 4;
+    width &= 3;
+    ARGBBlend2Row1_SSE2(src_argb0, src_argb1, dst_argb, width);
+  }
+}
+#endif  // HAS_ARGBBLENDROW_SSE2
+
+
+
+
+
+
+
+
+
 #endif  // defined(__x86_64__) || defined(__i386__)
 
 #ifdef __cplusplus
diff --git a/source/row_win.cc b/source/row_win.cc
index c98cd1ab8..ada7788c7 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -2324,6 +2324,288 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
 }
 #endif  // HAS_ARGBBLENDROW_SSSE3
 
+
+
+
+
+
+
+///////////////////////////////////////
+///////////////////// 2 source versions
+///////////////////////////////////////
+
+
+
+
+
+
+
+#ifdef HAS_ARGBBLENDROW_SSE2
+// Blend 8 pixels at a time.
+// Destination aligned to 16 bytes, multiple of 4 pixels.
+__declspec(naked) __declspec(align(16))
+void ARGBBlend2Row_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+                                uint8* dst_argb, int width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]   // src_argb0
+    mov        esi, [esp + 4 + 8]   // src_argb1
+    mov        edx, [esp + 4 + 12]  // dst_argb
+    mov        ecx, [esp + 4 + 16]  // width
+    pcmpeqb    xmm7, xmm7       // generate constant 1
+    psrlw      xmm7, 15
+    pcmpeqb    xmm6, xmm6       // generate mask 0x00ff00ff
+    psrlw      xmm6, 8
+    pcmpeqb    xmm5, xmm5       // generate mask 0xff00ff00
+    psllw      xmm5, 8
+    pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
+    pslld      xmm4, 24
+
+    align      16
+ convertloop:
+    movdqu     xmm3, [eax]
+    movdqa     xmm0, xmm3       // src argb
+    pxor       xmm3, xmm4       // ~alpha
+    movdqa     xmm2, [esi]      // _r_b
+    psrlw      xmm3, 8          // alpha
+    pshufhw    xmm3, xmm3, 0F5h // 8 alpha words
+    pshuflw    xmm3, xmm3, 0F5h
+    pand       xmm2, xmm6       // _r_b
+    paddw      xmm3, xmm7       // 256 - alpha
+    pmullw     xmm2, xmm3       // _r_b * alpha
+    movdqa     xmm1, [esi]      // _a_g
+    psrlw      xmm1, 8          // _a_g
+    por        xmm0, xmm4       // set alpha to 255
+    pmullw     xmm1, xmm3       // _a_g * alpha
+    movdqu     xmm3, [eax + 16]
+    lea        eax, [eax + 32]
+    psrlw      xmm2, 8          // _r_b convert to 8 bits again
+    paddusb    xmm0, xmm2       // + src argb
+    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
+    paddusb    xmm0, xmm1       // + src argb
+    sub        ecx, 4
+    movdqa     [edx], xmm0
+    jle        done
+
+    movdqa     xmm0, xmm3       // src argb
+    pxor       xmm3, xmm4       // ~alpha
+    movdqa     xmm2, [esi + 16] // _r_b
+    psrlw      xmm3, 8          // alpha
+    pshufhw    xmm3, xmm3, 0F5h // 8 alpha words
+    pshuflw    xmm3, xmm3, 0F5h
+    pand       xmm2, xmm6       // _r_b
+    paddw      xmm3, xmm7       // 256 - alpha
+    pmullw     xmm2, xmm3       // _r_b * alpha
+    movdqa     xmm1, [esi + 16] // _a_g
+    lea        esi, [esi + 32]
+    psrlw      xmm1, 8          // _a_g
+    por        xmm0, xmm4       // set alpha to 255
+    pmullw     xmm1, xmm3       // _a_g * alpha
+    psrlw      xmm2, 8          // _r_b convert to 8 bits again
+    paddusb    xmm0, xmm2       // + src argb
+    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
+    paddusb    xmm0, xmm1       // + src argb
+    sub        ecx, 4
+    movdqa     [edx + 16], xmm0
+    lea        edx, [edx + 32]
+    jg         convertloop
+
+ done:
+    pop        esi
+    ret
+  }
+}
+
+// Blend 1 pixel at a time, unaligned.
+__declspec(naked) __declspec(align(16))
+void ARGBBlend2Row1_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+                         uint8* dst_argb, int width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]   // src_argb0
+    mov        esi, [esp + 4 + 8]   // src_argb1
+    mov        edx, [esp + 4 + 12]  // dst_argb
+    mov        ecx, [esp + 4 + 16]  // width
+    pcmpeqb    xmm7, xmm7       // generate constant 1
+    psrlw      xmm7, 15
+    pcmpeqb    xmm6, xmm6       // generate mask 0x00ff00ff
+    psrlw      xmm6, 8
+    pcmpeqb    xmm5, xmm5       // generate mask 0xff00ff00
+    psllw      xmm5, 8
+    pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
+    pslld      xmm4, 24
+
+    align      16
+ convertloop:
+    movd       xmm3, [eax]
+    lea        eax, [eax + 4]
+    movdqa     xmm0, xmm3       // src argb
+    pxor       xmm3, xmm4       // ~alpha
+    movd       xmm2, [esi]      // _r_b
+    psrlw      xmm3, 8          // alpha
+    pshufhw    xmm3, xmm3, 0F5h // 8 alpha words
+    pshuflw    xmm3, xmm3, 0F5h
+    pand       xmm2, xmm6       // _r_b
+    paddw      xmm3, xmm7       // 256 - alpha
+    pmullw     xmm2, xmm3       // _r_b * alpha
+    movd       xmm1, [esi]      // _a_g
+    lea        esi, [esi + 4]
+    psrlw      xmm1, 8          // _a_g
+    por        xmm0, xmm4       // set alpha to 255
+    pmullw     xmm1, xmm3       // _a_g * alpha
+    psrlw      xmm2, 8          // _r_b convert to 8 bits again
+    paddusb    xmm0, xmm2       // + src argb
+    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
+    paddusb    xmm0, xmm1       // + src argb
+    sub        ecx, 1
+    movd       [edx], xmm0
+    lea        edx, [edx + 4]
+    jg         convertloop
+
+    pop        esi
+    ret
+  }
+}
+
+void ARGBBlend2Row_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+                        uint8* dst_argb, int width) {
+  // Do 1 to 3 pixels to get destination aligned; all pixels if it cannot be.
+  if ((uintptr_t)(dst_argb) & 15) {
+    int count = width;
+    if (count > 4 && ((intptr_t)(dst_argb) & 3) == 0) {
+      count = (-(intptr_t)(dst_argb) >> 2) & 3;
+    }
+    ARGBBlend2Row1_SSE2(src_argb0, src_argb1, dst_argb, count);
+    src_argb0 += count * 4;
+    src_argb1 += count * 4;
+    dst_argb += count * 4;
+    width -= count;
+  }
+  // Do multiple of 4 pixels.
+  if (width & ~3) {
+    ARGBBlend2Row_Aligned_SSE2(src_argb0, src_argb1, dst_argb, width & ~3);
+  }
+  // Do remaining 1 to 3 pixels.
+  if (width & 3) {
+    src_argb0 += (width & ~3) * 4;
+    src_argb1 += (width & ~3) * 4;
+    dst_argb += (width & ~3) * 4;
+    width &= 3;
+    ARGBBlend2Row1_SSE2(src_argb0, src_argb1, dst_argb, width);
+  }
+}
+#endif  // HAS_ARGBBLENDROW_SSE2
+
+#ifdef HAS_ARGBBLENDROW_SSSE3
+// Blend 8 pixels at a time.
+// Uses kShuffleAlpha, the shuffle table defined above for ARGBBlendRow_SSSE3,
+// to splat the alpha of each pixel across a word.
+
+// Same as SSE2, but replaces:
+//   psrlw      xmm3, 8          // alpha
+//   pshufhw    xmm3, xmm3, 0F5h // 8 alpha words
+//   pshuflw    xmm3, xmm3, 0F5h
+// with:
+//   pshufb     xmm3, kShuffleAlpha // alpha
+
+// Destination aligned to 16 bytes, multiple of 4 pixels.
+__declspec(naked) __declspec(align(16))
+void ARGBBlend2Row_Aligned_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
+                                 uint8* dst_argb, int width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]   // src_argb0
+    mov        esi, [esp + 4 + 8]   // src_argb1
+    mov        edx, [esp + 4 + 12]  // dst_argb
+    mov        ecx, [esp + 4 + 16]  // width
+    pcmpeqb    xmm7, xmm7       // generate constant 1
+    psrlw      xmm7, 15
+    pcmpeqb    xmm6, xmm6       // generate mask 0x00ff00ff
+    psrlw      xmm6, 8
+    pcmpeqb    xmm5, xmm5       // generate mask 0xff00ff00
+    psllw      xmm5, 8
+    pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
+    pslld      xmm4, 24
+
+    align      16
+ convertloop:
+    movdqu     xmm3, [eax]
+    movdqa     xmm0, xmm3       // src argb
+    pxor       xmm3, xmm4       // ~alpha
+    pshufb     xmm3, kShuffleAlpha // alpha
+    movdqa     xmm2, [esi]      // _r_b
+    pand       xmm2, xmm6       // _r_b
+    paddw      xmm3, xmm7       // 256 - alpha
+    pmullw     xmm2, xmm3       // _r_b * alpha
+    movdqa     xmm1, [esi]      // _a_g
+    psrlw      xmm1, 8          // _a_g
+    por        xmm0, xmm4       // set alpha to 255
+    pmullw     xmm1, xmm3       // _a_g * alpha
+    movdqu     xmm3, [eax + 16]
+    lea        eax, [eax + 32]
+    psrlw      xmm2, 8          // _r_b convert to 8 bits again
+    paddusb    xmm0, xmm2       // + src argb
+    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
+    paddusb    xmm0, xmm1       // + src argb
+    sub        ecx, 4
+    movdqa     [edx], xmm0
+    jle        done
+
+    movdqa     xmm0, xmm3       // src argb
+    pxor       xmm3, xmm4       // ~alpha
+    movdqa     xmm2, [esi + 16] // _r_b
+    pshufb     xmm3, kShuffleAlpha // alpha
+    pand       xmm2, xmm6       // _r_b
+    paddw      xmm3, xmm7       // 256 - alpha
+    pmullw     xmm2, xmm3       // _r_b * alpha
+    movdqa     xmm1, [esi + 16] // _a_g
+    lea        esi, [esi + 32]
+    psrlw      xmm1, 8          // _a_g
+    por        xmm0, xmm4       // set alpha to 255
+    pmullw     xmm1, xmm3       // _a_g * alpha
+    psrlw      xmm2, 8          // _r_b convert to 8 bits again
+    paddusb    xmm0, xmm2       // + src argb
+    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
+    paddusb    xmm0, xmm1       // + src argb
+    sub        ecx, 4
+    movdqa     [edx + 16], xmm0
+    lea        edx, [edx + 32]
+    jg         convertloop
+
+ done:
+    pop        esi
+    ret
+  }
+}
+
+void ARGBBlend2Row_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
+                         uint8* dst_argb, int width) {
+  // Do 1 to 3 pixels to get destination aligned; all pixels if it cannot be.
+  if ((uintptr_t)(dst_argb) & 15) {
+    int count = width;
+    if (count > 4 && ((intptr_t)(dst_argb) & 3) == 0) {
+      count = (-(intptr_t)(dst_argb) >> 2) & 3;
+    }
+    ARGBBlend2Row1_SSE2(src_argb0, src_argb1, dst_argb, count);
+    src_argb0 += count * 4;
+    src_argb1 += count * 4;
+    dst_argb += count * 4;
+    width -= count;
+  }
+  // Do multiple of 4 pixels.
+  if (width & ~3) {
+    ARGBBlend2Row_Aligned_SSSE3(src_argb0, src_argb1, dst_argb, width & ~3);
+  }
+  // Do remaining 1 to 3 pixels.
+  if (width & 3) {
+    src_argb0 += (width & ~3) * 4;
+    src_argb1 += (width & ~3) * 4;
+    dst_argb += (width & ~3) * 4;
+    width &= 3;
+    ARGBBlend2Row1_SSE2(src_argb0, src_argb1, dst_argb, width);
+  }
+}
+#endif  // HAS_ARGBBLENDROW_SSSE3
+
 #endif  // _M_IX86
 
 #ifdef __cplusplus
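
[Reviewer note: a minimal usage sketch of the new public entry point. The
first source supplies the alpha channel and is composited onto the second;
the wrapper name, buffers, and strides here are illustrative only, the usual
libyuv namespace is assumed, and the input must be premultiplied ARGB:

    #include "libyuv/planar_functions.h"

    void CompositeFrame(const uint8* fg, const uint8* bg, uint8* out,
                        int width, int height) {
      const int stride = width * 4;    // packed ARGB rows, 4 bytes per pixel
      libyuv::ARGB2Blend(fg, stride,   // source 0: premultiplied foreground
                         bg, stride,   // source 1: background
                         out, stride,  // dst may also be either source
                         width, height);
    }

Passing a negative height writes the destination bottom-up, consistent with
the other planar functions in this library.]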