diff --git a/README.chromium b/README.chromium index d107dee3e..ab320872a 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 535 +Version: 537 License: BSD License File: LICENSE diff --git a/include/libyuv/row.h b/include/libyuv/row.h index f62d37687..459c19f74 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -118,6 +118,7 @@ extern "C" { // TODO(fbarchard): Port to gcc. #if !defined(YUV_DISABLE_ASM) && defined(_M_IX86) #define HAS_ARGBCOLORTABLEROW_X86 +#define HAS_ARGBMULTIPLYROW_SSE2 #endif // The following are Yasm x86 only. @@ -1278,6 +1279,7 @@ void ARGBInterpolateRow_SSSE3(uint8* dst_argb, const uint8* src_argb, void ARGBInterpolateRow_NEON(uint8* dst_argb, const uint8* src_argb, ptrdiff_t src_stride_argb, int dst_width, int source_y_fraction); +void ARGBMultiplyRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width); #ifdef __cplusplus } // extern "C" diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 2e0086e06..a53dd8297 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 536 +#define LIBYUV_VERSION 537 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/row_win.cc b/source/row_win.cc index 360918b38..82d77027a 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -4160,6 +4160,41 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, } #endif // HAS_ARGBSHADEROW_SSE2 +#ifdef HAS_ARGBMULTIPLYROW_SSE2 +// Multiply 2 rows of ARGB pixels together, 4 pixels at a time. +// Aligned to 16 bytes. 
+__declspec(naked) __declspec(align(16)) +void ARGBMultiplyRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_argb (read and overwritten in place) + mov ecx, [esp + 12] // width in pixels + pxor xmm5, xmm5 // constant 0 + sub edx, eax // edx = dst - src, so dst is addressed relative to eax + + align 16 + convertloop: + movdqa xmm0, [eax] // read 4 src pixels (aligned load) + movdqa xmm2, [eax + edx] // read 4 dest pixels (aligned load) + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + punpcklbw xmm0, xmm0 // src first 2: each byte b becomes word b * 0x0101 + punpckhbw xmm1, xmm1 // src next 2: each byte b becomes word b * 0x0101 + punpcklbw xmm2, xmm5 // dest first 2: bytes zero-extended to words + punpckhbw xmm3, xmm5 // dest next 2: bytes zero-extended to words + pmulhuw xmm0, xmm2 // high 16 bits of src * dest, first 2 + pmulhuw xmm1, xmm3 // high 16 bits of src * dest, next 2 + packuswb xmm0, xmm1 // pack words back to bytes, unsigned saturation + sub ecx, 4 // 4 pixels done; sets the flags tested by jg below + movdqa [eax + edx], xmm0 // store 4 pixels to dest (aligned store) + lea eax, [eax + 16] // advance 16 bytes; lea leaves flags intact + jg convertloop // loop while remaining width > 0 (no remainder path: width is effectively rounded up to a multiple of 4) + + ret + } +} +#endif // HAS_ARGBMULTIPLYROW_SSE2 + #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 // Consider float CumulativeSum. // Consider calling CumulativeSum one row at time as needed.