diff --git a/README.chromium b/README.chromium index ec57a1d6b..7447b9fc0 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 541 +Version: 542 License: BSD License File: LICENSE diff --git a/include/libyuv/planar_functions.h b/include/libyuv/planar_functions.h index 76b6517f3..7247f880e 100644 --- a/include/libyuv/planar_functions.h +++ b/include/libyuv/planar_functions.h @@ -209,6 +209,13 @@ int ARGBBlend(const uint8* src_argb0, int src_stride_argb0, uint8* dst_argb, int dst_stride_argb, int width, int height); +// Multiply ARGB image by ARGB image. +LIBYUV_API +int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0, + const uint8* src_argb1, int src_stride_argb1, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + // Convert I422 to YUY2. LIBYUV_API int I422ToYUY2(const uint8* src_y, int src_stride_y, @@ -265,11 +272,6 @@ int ARGBShade(const uint8* src_argb, int src_stride_argb, uint8* dst_argb, int dst_stride_argb, int width, int height, uint32 value); -// Multiply ARGB image by ARGB image. -int ARGBMultiply(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height); - // Interpolate between two ARGB images using specified amount of interpolation // (0 to 255) and store to destination. // 'interpolation' is specified as 8 bit fraction where 0 means 100% src_argb0 diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 22eccce76..ca594af74 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -967,6 +967,15 @@ void ARGBBlendRow_NEON(const uint8* src_argb, const uint8* src_argb1, void ARGBBlendRow_C(const uint8* src_argb, const uint8* src_argb1, uint8* dst_argb, int width); +// ARGB multiply two rows. Same API as Blend; the SSE2 version requires +// 16 byte aligned pointers and a width that is a multiple of 4. 
+void ARGBMultiplyRow_C(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBMultiplyRow_SSE2(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBMultiplyRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); + void ARGBToRGB24Row_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToRAWRow_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToRGB565Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); @@ -1270,7 +1279,6 @@ void CumulativeSumToAverageRow_C(const int32* topleft, const int32* botleft, void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum, const int32* previous_cumsum, int width); - LIBYUV_API void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride, uint8* dst_argb, const float* uv_dudv, int width); @@ -1287,10 +1295,6 @@ void ARGBInterpolateRow_SSSE3(uint8* dst_argb, const uint8* src_argb, void ARGBInterpolateRow_NEON(uint8* dst_argb, const uint8* src_argb, ptrdiff_t src_stride_argb, int dst_width, int source_y_fraction); -void ARGBMultiplyRow_C(const uint8* src_argb, uint8* dst_argb, int width); -void ARGBMultiplyRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width); -void ARGBMultiplyRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb, - int width); #ifdef __cplusplus } // extern "C" diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 73eb3fad9..69e5ff069 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 541 +#define LIBYUV_VERSION 542 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/planar_functions.cc b/source/planar_functions.cc index 750670ce3..9aca6555d 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -404,6 +404,50 @@ int ARGBBlend(const uint8* src_argb0, int src_stride_argb0, 
return 0; } +// Multiply 2 ARGB images together and store to destination. +LIBYUV_API +int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0, + const uint8* src_argb1, int src_stride_argb1, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + + void (*ARGBMultiplyRow)(const uint8* src0, const uint8* src1, uint8* dst, + int width) = ARGBMultiplyRow_C; +#if defined(HAS_ARGBMULTIPLYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 4 && + IS_ALIGNED(src_argb0, 16) && IS_ALIGNED(src_stride_argb0, 16) && + IS_ALIGNED(src_argb1, 16) && IS_ALIGNED(src_stride_argb1, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + ARGBMultiplyRow = ARGBMultiplyRow_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBMultiplyRow = ARGBMultiplyRow_SSE2; + } + } +#elif defined(HAS_ARGBMULTIPLYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 4)) { + ARGBMultiplyRow = ARGBMultiplyRow_NEON; + } +#endif + + // Multiply plane + for (int y = 0; y < height; ++y) { + ARGBMultiplyRow(src_argb0, src_argb1, dst_argb, width); + src_argb0 += src_stride_argb0; + src_argb1 += src_stride_argb1; + dst_argb += dst_stride_argb; + } + return 0; +} + // Convert I422 to BGRA. LIBYUV_API int I422ToBGRA(const uint8* src_y, int src_stride_y, @@ -1170,47 +1214,6 @@ int ARGBShade(const uint8* src_argb, int src_stride_argb, return 0; } -// ARGB multiply 2 images together. -LIBYUV_API -int ARGBMultiply(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { - if (!src_argb || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } - - void (*ARGBMultiplyRow)(const uint8* src, uint8* dst, int width) = - ARGBMultiplyRow_C; -#if defined(HAS_ARGBMULTIPLYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && width >= 4 && - IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { - ARGBMultiplyRow = ARGBMultiplyRow_Any_SSE2; - if (IS_ALIGNED(width, 4)) { - ARGBMultiplyRow = ARGBMultiplyRow_SSE2; - } - } -#elif defined(HAS_ARGBMULTIPLYROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 4)) { - ARGBMultiplyRow = ARGBMultiplyRow_NEON; - } -#endif - - // Multiply plane - for (int y = 0; y < height; ++y) { - ARGBMultiplyRow(src_argb, dst_argb, width); - src_argb += src_stride_argb; - dst_argb += dst_stride_argb; - } - return 0; -} - // Interpolate 2 ARGB images by specified amount (0 to 255). // TODO(fbarchard): Consider selecting a specialization for interpolation so // row function doesn't need to check interpolation on each row. 
diff --git a/source/row_any.cc b/source/row_any.cc index 1d928a3fd..8c6a8122a 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -373,10 +373,12 @@ MergeUVRow_ANY(MergeUVRow_Any_NEON, MergeUVRow_NEON, MergeUVRow_C, 15) #undef MergeUVRow_ANY #define MultiplyRow_ANY(NAMEANY, ARGBMULT_SIMD, ARGBMULT_C, MASK) \ - void NAMEANY(const uint8* src_argb, uint8* dst_argb, int width) { \ + void NAMEANY(const uint8* src_argb0, const uint8* src_argb1, \ + uint8* dst_argb, int width) { \ int n = width & ~MASK; \ - ARGBMULT_SIMD(src_argb, dst_argb, n); \ - ARGBMULT_C(src_argb + n * 4, \ + ARGBMULT_SIMD(src_argb0, src_argb1, dst_argb, n); \ + ARGBMULT_C(src_argb0 + n * 4, \ + src_argb1 + n * 4, \ dst_argb + n * 4, \ width & MASK); \ } diff --git a/source/row_common.cc b/source/row_common.cc index f4c55b6d2..b9f02d0e3 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -704,21 +704,23 @@ void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width, #define REPEAT8(v) (v) | ((v) << 8) #define SHADE(f, v) v * f >> 16 -void ARGBMultiplyRow_C(const uint8* src_argb, uint8* dst_argb, int width) { +void ARGBMultiplyRow_C(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { for (int i = 0; i < width; ++i) { - const uint32 b = REPEAT8(src_argb[0]); - const uint32 g = REPEAT8(src_argb[1]); - const uint32 r = REPEAT8(src_argb[2]); - const uint32 a = REPEAT8(src_argb[3]); - const uint32 b_scale = dst_argb[0]; - const uint32 g_scale = dst_argb[1]; - const uint32 r_scale = dst_argb[2]; - const uint32 a_scale = dst_argb[3]; + const uint32 b = REPEAT8(src_argb0[0]); + const uint32 g = REPEAT8(src_argb0[1]); + const uint32 r = REPEAT8(src_argb0[2]); + const uint32 a = REPEAT8(src_argb0[3]); + const uint32 b_scale = src_argb1[0]; + const uint32 g_scale = src_argb1[1]; + const uint32 r_scale = src_argb1[2]; + const uint32 a_scale = src_argb1[3]; dst_argb[0] = SHADE(b, b_scale); dst_argb[1] = SHADE(g, g_scale); dst_argb[2] = SHADE(r, r_scale); 
dst_argb[3] = SHADE(a, a_scale); - src_argb += 4; + src_argb0 += 4; + src_argb1 += 4; dst_argb += 4; } } diff --git a/source/row_posix.cc b/source/row_posix.cc index d62be2383..c710241ff 100644 --- a/source/row_posix.cc +++ b/source/row_posix.cc @@ -3963,10 +3963,12 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, #ifdef HAS_ARGBMULTIPLYROW_SSE2 // Multiple 2 rows of ARGB pixels together, 4 pixels at a time. // Aligned to 16 bytes. -void ARGBMultiplyRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { +void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { asm volatile ( "pxor %%xmm5,%%xmm5 \n" "sub %0,%1 \n" + "sub %0,%2 \n" // 4 pixel loop. ".p2align 4 \n" @@ -3982,13 +3984,14 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { "pmulhuw %%xmm2,%%xmm0 \n" "pmulhuw %%xmm3,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" - "sub $0x4,%2 \n" - "movdqa %%xmm0,(%0,%1,1) \n" + "sub $0x4,%3 \n" + "movdqa %%xmm0,(%0,%2,1) \n" "lea 0x10(%0),%0 \n" "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 : : "memory", "cc" #if defined(__SSE2__) diff --git a/source/row_win.cc b/source/row_win.cc index f988312ed..5b6b5448b 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -4280,18 +4280,22 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, // Multiple 2 rows of ARGB pixels together, 4 pixels at a time. // Aligned to 16 bytes. 
__declspec(naked) __declspec(align(16)) -void ARGBMultiplyRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { +void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width + push esi + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width pxor xmm5, xmm5 // constant 0 + sub esi, eax sub edx, eax align 16 convertloop: - movdqa xmm0, [eax] // read 4 pixels - movdqa xmm2, [eax + edx] // read 4 dest pixels + movdqa xmm0, [eax] // read 4 pixels from src_argb0 + movdqa xmm2, [eax + esi] // read 4 pixels from src_argb1 movdqa xmm1, xmm0 movdqa xmm3, xmm2 punpcklbw xmm0, xmm0 // first 2 @@ -4306,6 +4310,7 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { lea eax, [eax + 16] jg convertloop + pop esi ret } } diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc index 6e987f5c9..bfbf1bead 100644 --- a/unit_test/planar_test.cc +++ b/unit_test/planar_test.cc @@ -913,17 +913,21 @@ static int TestMultiply(int width, int height, int benchmark_iterations, src_argb_a[i + off] = (random() & 0xff); src_argb_b[i + off] = (random() & 0xff); } - memcpy(dst_argb_c, src_argb_b + off, kStride * height); - memcpy(dst_argb_opt, src_argb_b + off, kStride * height); + memset(dst_argb_c, 0, kStride * height); + memset(dst_argb_opt, 0, kStride * height); MaskCpuFlags(0); ARGBMultiply(src_argb_a + off, kStride, + src_argb_b + off, kStride, dst_argb_c, kStride, width, invert * height); MaskCpuFlags(-1); - ARGBMultiply(src_argb_a + off, kStride, - dst_argb_opt, kStride, - width, invert * height); + for (int i = 0; i < benchmark_iterations; ++i) { + ARGBMultiply(src_argb_a + off, kStride, + src_argb_b + off, kStride, + dst_argb_opt, kStride, + width, invert * height); + } int max_diff = 0; for (int i = 0; i < 
kStride * height; ++i) { int abs_diff = @@ -933,12 +937,6 @@ static int TestMultiply(int width, int height, int benchmark_iterations, max_diff = abs_diff; } } - // Benchmark. - for (int i = 0; i < benchmark_iterations - 1; ++i) { - ARGBMultiply(src_argb_a + off, kStride, - dst_argb_opt, kStride, - width, invert * height); - } free_aligned_buffer_64(src_argb_a) free_aligned_buffer_64(src_argb_b) free_aligned_buffer_64(dst_argb_c)