diff --git a/README.chromium b/README.chromium
index 7447b9fc0..d11d2fcc2 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 542
+Version: 543
 License: BSD
 License File: LICENSE
diff --git a/include/libyuv/planar_functions.h b/include/libyuv/planar_functions.h
index 7247f880e..f2b4aacc0 100644
--- a/include/libyuv/planar_functions.h
+++ b/include/libyuv/planar_functions.h
@@ -216,6 +216,13 @@ int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
                  uint8* dst_argb, int dst_stride_argb,
                  int width, int height);
 
+// Add two ARGB images together. Per-channel values saturate to 255.
+LIBYUV_API
+int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
+            const uint8* src_argb1, int src_stride_argb1,
+            uint8* dst_argb, int dst_stride_argb,
+            int width, int height);
+
 // Convert I422 to YUY2.
 LIBYUV_API
 int I422ToYUY2(const uint8* src_y, int src_stride_y,
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index ca594af74..79c7cd135 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -99,6 +99,7 @@ extern "C" {
 #define HAS_YUY2TOYROW_SSE2
 
 // Effects
+#define HAS_ARGBADDROW_SSE2
 #define HAS_ARGBAFFINEROW_SSE2
 #define HAS_ARGBATTENUATEROW_SSSE3
 #define HAS_ARGBBLENDROW_SSSE3
@@ -967,7 +968,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb, const uint8* src_argb1,
 void ARGBBlendRow_C(const uint8* src_argb, const uint8* src_argb1,
                     uint8* dst_argb, int width);
 
-// ARGB preattenuated alpha blend. Same API as Blend, but these require
+// ARGB multiply images. Same API as Blend, but these require
 // pointer and width alignment for SSE2.
 void ARGBMultiplyRow_C(const uint8* src_argb, const uint8* src_argb1,
                        uint8* dst_argb, int width);
@@ -976,6 +977,14 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb, const uint8* src_argb1,
 void ARGBMultiplyRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1,
                               uint8* dst_argb, int width);
 
+// ARGB add images.
+void ARGBAddRow_C(const uint8* src_argb, const uint8* src_argb1,
+                  uint8* dst_argb, int width);
+void ARGBAddRow_SSE2(const uint8* src_argb, const uint8* src_argb1,
+                     uint8* dst_argb, int width);
+void ARGBAddRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1,
+                         uint8* dst_argb, int width);
+
 void ARGBToRGB24Row_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToRAWRow_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToRGB565Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 69e5ff069..c24413b39 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 542
+#define LIBYUV_VERSION 543
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index 9aca6555d..5a613761b 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -448,6 +448,50 @@ int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
   return 0;
 }
 
+// Add 2 ARGB images together and store to destination.
+LIBYUV_API
+int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
+            const uint8* src_argb1, int src_stride_argb1,
+            uint8* dst_argb, int dst_stride_argb,
+            int width, int height) {
+  if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+
+  void (*ARGBAddRow)(const uint8* src0, const uint8* src1, uint8* dst,
+                     int width) = ARGBAddRow_C;
+#if defined(HAS_ARGBADDROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 4 &&
+      IS_ALIGNED(src_argb0, 16) && IS_ALIGNED(src_stride_argb0, 16) &&
+      IS_ALIGNED(src_argb1, 16) && IS_ALIGNED(src_stride_argb1, 16) &&
+      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+    ARGBAddRow = ARGBAddRow_Any_SSE2;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBAddRow = ARGBAddRow_SSE2;
+    }
+  }
+#elif defined(HAS_ARGBADDROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 4)) {
+    ARGBAddRow = ARGBAddRow_NEON;
+  }
+#endif
+
+  // Add plane
+  for (int y = 0; y < height; ++y) {
+    ARGBAddRow(src_argb0, src_argb1, dst_argb, width);
+    src_argb0 += src_stride_argb0;
+    src_argb1 += src_stride_argb1;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
 // Convert I422 to BGRA.
 LIBYUV_API
 int I422ToBGRA(const uint8* src_y, int src_stride_y,
diff --git a/source/row_any.cc b/source/row_any.cc
index 8c6a8122a..5d6dc5403 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -324,7 +324,7 @@ UV422ANY(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON,
 #endif
 #undef UV422ANY
 
-#define SplitUVRowANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, MASK)                 \
+#define SPLITUVROWANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, MASK)                 \
     void NAMEANY(const uint8* src_uv,                                         \
                  uint8* dst_u, uint8* dst_v, int width) {                     \
       int n = width & ~MASK;                                                  \
@@ -336,21 +336,21 @@ UV422ANY(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON,
     }
 
 #ifdef HAS_SPLITUVROW_SSE2
-SplitUVRowANY(SplitUVRow_Any_SSE2, SplitUVRow_Unaligned_SSE2, SplitUVRow_C, 15)
+SPLITUVROWANY(SplitUVRow_Any_SSE2, SplitUVRow_Unaligned_SSE2, SplitUVRow_C, 15)
 #endif
 #ifdef HAS_SPLITUVROW_AVX2
-SplitUVRowANY(SplitUVRow_Any_AVX2, SplitUVRow_Unaligned_AVX2, SplitUVRow_C, 31)
+SPLITUVROWANY(SplitUVRow_Any_AVX2, SplitUVRow_Unaligned_AVX2, SplitUVRow_C, 31)
 #endif
 #ifdef HAS_SPLITUVROW_NEON
-SplitUVRowANY(SplitUVRow_Any_NEON, SplitUVRow_NEON, SplitUVRow_C, 15)
+SPLITUVROWANY(SplitUVRow_Any_NEON, SplitUVRow_NEON, SplitUVRow_C, 15)
 #endif
 #ifdef HAS_SPLITUVROW_MIPS_DSPR2
-SplitUVRowANY(SplitUVRow_Any_MIPS_DSPR2, SplitUVRow_Unaligned_MIPS_DSPR2,
+SPLITUVROWANY(SplitUVRow_Any_MIPS_DSPR2, SplitUVRow_Unaligned_MIPS_DSPR2,
               SplitUVRow_C, 15)
 #endif
-#undef SplitUVRowANY
+#undef SPLITUVROWANY
 
-#define MergeUVRow_ANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, MASK)                \
+#define MERGEUVROW_ANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, MASK)                \
     void NAMEANY(const uint8* src_u, const uint8* src_v,                      \
                  uint8* dst_uv, int width) {                                  \
       int n = width & ~MASK;                                                  \
@@ -362,17 +362,17 @@ SplitUVRowANY(SplitUVRow_Any_MIPS_DSPR2, SplitUVRow_Unaligned_MIPS_DSPR2,
     }
 
 #ifdef HAS_MERGEUVROW_SSE2
-MergeUVRow_ANY(MergeUVRow_Any_SSE2, MergeUVRow_Unaligned_SSE2, MergeUVRow_C, 15)
+MERGEUVROW_ANY(MergeUVRow_Any_SSE2, MergeUVRow_Unaligned_SSE2, MergeUVRow_C, 15)
 #endif
 #ifdef HAS_MERGEUVROW_AVX2
-MergeUVRow_ANY(MergeUVRow_Any_AVX2, MergeUVRow_Unaligned_AVX2, MergeUVRow_C, 31)
+MERGEUVROW_ANY(MergeUVRow_Any_AVX2, MergeUVRow_Unaligned_AVX2, MergeUVRow_C, 31)
 #endif
 #ifdef HAS_MERGEUVROW_NEON
-MergeUVRow_ANY(MergeUVRow_Any_NEON, MergeUVRow_NEON, MergeUVRow_C, 15)
+MERGEUVROW_ANY(MergeUVRow_Any_NEON, MergeUVRow_NEON, MergeUVRow_C, 15)
 #endif
-#undef MergeUVRow_ANY
+#undef MERGEUVROW_ANY
 
-#define MultiplyRow_ANY(NAMEANY, ARGBMULT_SIMD, ARGBMULT_C, MASK)             \
+#define MATHROW_ANY(NAMEANY, ARGBMULT_SIMD, ARGBMULT_C, MASK)                 \
     void NAMEANY(const uint8* src_argb0, const uint8* src_argb1,              \
                  uint8* dst_argb, int width) {                                \
       int n = width & ~MASK;                                                  \
@@ -384,9 +384,13 @@ MergeUVRow_ANY(MergeUVRow_Any_NEON, MergeUVRow_NEON, MergeUVRow_C, 15)
     }
 
 #ifdef HAS_ARGBMULTIPLYROW_SSE2
-MultiplyRow_ANY(ARGBMultiplyRow_Any_SSE2, ARGBMultiplyRow_SSE2,
-                ARGBMultiplyRow_C, 3)
+MATHROW_ANY(ARGBMultiplyRow_Any_SSE2, ARGBMultiplyRow_SSE2, ARGBMultiplyRow_C, 3)
 #endif
+#ifdef HAS_ARGBADDROW_SSE2
+MATHROW_ANY(ARGBAddRow_Any_SSE2, ARGBAddRow_SSE2, ARGBAddRow_C, 3)
+#endif
+#undef MATHROW_ANY
+
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
diff --git a/source/row_common.cc b/source/row_common.cc
index b9f02d0e3..50a399fc8 100644
--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -727,6 +727,30 @@ void ARGBMultiplyRow_C(const uint8* src_argb0, const uint8* src_argb1,
 #undef REPEAT8
 #undef SHADE
 
+#define SHADE(f, v) ((((v) + (f)) > 255) ? 255 : ((v) + (f)))
+
+void ARGBAddRow_C(const uint8* src_argb0, const uint8* src_argb1,
+                  uint8* dst_argb, int width) {
+  for (int i = 0; i < width; ++i) {
+    const uint32 b = src_argb0[0];
+    const uint32 g = src_argb0[1];
+    const uint32 r = src_argb0[2];
+    const uint32 a = src_argb0[3];
+    const uint32 b_add = src_argb1[0];
+    const uint32 g_add = src_argb1[1];
+    const uint32 r_add = src_argb1[2];
+    const uint32 a_add = src_argb1[3];
+    dst_argb[0] = SHADE(b, b_add);
+    dst_argb[1] = SHADE(g, g_add);
+    dst_argb[2] = SHADE(r, r_add);
+    dst_argb[3] = SHADE(a, a_add);
+    src_argb0 += 4;
+    src_argb1 += 4;
+    dst_argb += 4;
+  }
+}
+#undef SHADE
+
 void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) {
   // Copy a Y to RGB.
   for (int x = 0; x < width; ++x) {
diff --git a/source/row_posix.cc b/source/row_posix.cc
index c710241ff..04be0beb9 100644
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -3948,10 +3948,10 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
     "movdqa    %%xmm0,(%0,%1,1)                \n"
     "lea       0x10(%0),%0                     \n"
     "jg        1b                              \n"
-  : "+r"(src_argb),       // %0
-    "+r"(dst_argb),       // %1
-    "+r"(width)           // %2
-  : "r"(value)            // %3
+  : "+r"(src_argb),  // %0
+    "+r"(dst_argb),  // %1
+    "+r"(width)      // %2
+  : "r"(value)       // %3
   : "memory", "cc"
 #if defined(__SSE2__)
     , "xmm0", "xmm1", "xmm2"
@@ -3961,7 +3961,7 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
 #endif  // HAS_ARGBSHADEROW_SSE2
 
 #ifdef HAS_ARGBMULTIPLYROW_SSE2
-// Multiple 2 rows of ARGB pixels together, 4 pixels at a time.
+// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
 // Aligned to 16 bytes.
 void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                           uint8* dst_argb, int width) {
@@ -3988,8 +3988,8 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
     "movdqa    %%xmm0,(%0,%2,1)                \n"
     "lea       0x10(%0),%0                     \n"
     "jg        1b                              \n"
-  : "+r"(src_argb0),       // %0
-    "+r"(src_argb1),       // %1
+  : "+r"(src_argb0),  // %0
+    "+r"(src_argb1),  // %1
     "+r"(dst_argb),   // %2
     "+r"(width)       // %3
   :
@@ -4001,6 +4001,39 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
 }
 #endif  // HAS_ARGBMULTIPLYROW_SSE2
 
+#ifdef HAS_ARGBADDROW_SSE2
+// Add 2 rows of ARGB pixels together, 4 pixels at a time.
+// Aligned to 16 bytes.
+void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+                     uint8* dst_argb, int width) {
+  asm volatile (
+    "pxor      %%xmm5,%%xmm5                   \n"
+    "sub       %0,%1                           \n"
+    "sub       %0,%2                           \n"
+
+    // 4 pixel loop.
+ ".p2align 4 \n" + "1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa (%0,%1),%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "sub $0x4,%3 \n" + "movdqa %%xmm0,(%0,%2,1) \n" + "lea 0x10(%0),%0 \n" + "jg 1b \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm5" +#endif + ); +} +#endif // HAS_ARGBADDROW_SSE2 + #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2 // Creates a table of cumulative sums where each value is a sum of all values // above and to the left of the value, inclusive of the value. diff --git a/source/row_win.cc b/source/row_win.cc index 5b6b5448b..c66c1ce92 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -4277,7 +4277,7 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, #endif // HAS_ARGBSHADEROW_SSE2 #ifdef HAS_ARGBMULTIPLYROW_SSE2 -// Multiple 2 rows of ARGB pixels together, 4 pixels at a time. +// Multiply 2 rows of ARGB pixels together, 4 pixels at a time. // Aligned to 16 bytes. __declspec(naked) __declspec(align(16)) void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, @@ -4294,7 +4294,7 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, align 16 convertloop: - movdqa xmm0, [eax] // read 4 pixels from src_argb0 + movdqa xmm0, [eax] // read 4 pixels from src_argb0 movdqa xmm2, [eax + esi] // read 4 pixels from src_argb1 movdqa xmm1, xmm0 movdqa xmm3, xmm2 @@ -4302,8 +4302,8 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, punpckhbw xmm1, xmm1 // next 2 punpcklbw xmm2, xmm5 // first 2 punpckhbw xmm3, xmm5 // next 2 - pmulhuw xmm0, xmm2 // argb * value - pmulhuw xmm1, xmm3 // argb * value + pmulhuw xmm0, xmm2 // src_argb0 * src_argb1 first 2 + pmulhuw xmm1, xmm3 // src_argb0 * src_argb1 next 2 packuswb xmm0, xmm1 sub ecx, 4 movdqa [eax + edx], xmm0 @@ -4316,6 +4316,38 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, } #endif // HAS_ARGBMULTIPLYROW_SSE2 +#ifdef HAS_ARGBADDROW_SSE2 +// Add 2 rows of ARGB pixels together, 4 pixels at a time. +// Aligned to 16 bytes. +__declspec(naked) __declspec(align(16)) +void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + pxor xmm5, xmm5 // constant 0 + sub esi, eax + sub edx, eax + + align 16 + convertloop: + movdqa xmm0, [eax] // read 4 pixels from src_argb0 + movdqa xmm1, [eax + esi] // read 4 pixels from src_argb1 + paddusb xmm0, xmm1 // src_argb0 + src_argb1 + sub ecx, 4 + movdqa [eax + edx], xmm0 + lea eax, [eax + 16] + jg convertloop + + pop esi + ret + } +} +#endif // HAS_ARGBADDROW_SSE2 + #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 // Consider float CumulativeSum. // Consider calling CumulativeSum one row at time as needed. 
diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc
index bfbf1bead..4f90b53c9 100644
--- a/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
@@ -968,4 +968,72 @@ TEST_F(libyuvTest, ARGBMultiply_Opt) {
   EXPECT_LE(max_diff, 1);
 }
 
+static int TestAdd(int width, int height, int benchmark_iterations,
+                   int invert, int off) {
+  const int kBpp = 4;
+  const int kStride = (width * kBpp + 15) & ~15;
+  align_buffer_64(src_argb_a, kStride * height + off);
+  align_buffer_64(src_argb_b, kStride * height + off);
+  align_buffer_64(dst_argb_c, kStride * height);
+  align_buffer_64(dst_argb_opt, kStride * height);
+  srandom(time(NULL));
+  for (int i = 0; i < kStride * height; ++i) {
+    src_argb_a[i + off] = (random() & 0xff);
+    src_argb_b[i + off] = (random() & 0xff);
+  }
+  memset(dst_argb_c, 0, kStride * height);
+  memset(dst_argb_opt, 0, kStride * height);
+
+  MaskCpuFlags(0);
+  ARGBAdd(src_argb_a + off, kStride,
+          src_argb_b + off, kStride,
+          dst_argb_c, kStride,
+          width, invert * height);
+  MaskCpuFlags(-1);
+  for (int i = 0; i < benchmark_iterations; ++i) {
+    ARGBAdd(src_argb_a + off, kStride,
+            src_argb_b + off, kStride,
+            dst_argb_opt, kStride,
+            width, invert * height);
+  }
+  int max_diff = 0;
+  for (int i = 0; i < kStride * height; ++i) {
+    int abs_diff =
+        abs(static_cast<int>(dst_argb_c[i]) -
+            static_cast<int>(dst_argb_opt[i]));
+    if (abs_diff > max_diff) {
+      max_diff = abs_diff;
+    }
+  }
+  free_aligned_buffer_64(src_argb_a)
+  free_aligned_buffer_64(src_argb_b)
+  free_aligned_buffer_64(dst_argb_c)
+  free_aligned_buffer_64(dst_argb_opt)
+  return max_diff;
+}
+
+TEST_F(libyuvTest, ARGBAdd_Any) {
+  int max_diff = TestAdd(benchmark_width_ - 1, benchmark_height_,
+                         benchmark_iterations_, +1, 0);
+  EXPECT_LE(max_diff, 1);
+}
+
+TEST_F(libyuvTest, ARGBAdd_Unaligned) {
+  int max_diff = TestAdd(benchmark_width_, benchmark_height_,
+                         benchmark_iterations_, +1, 1);
+  EXPECT_LE(max_diff, 1);
+}
+
+TEST_F(libyuvTest, ARGBAdd_Invert) {
+  int max_diff = TestAdd(benchmark_width_, benchmark_height_,
+                         benchmark_iterations_, -1, 0);
+  EXPECT_LE(max_diff, 1);
+}
+
+TEST_F(libyuvTest, ARGBAdd_Opt) {
+  int max_diff = TestAdd(benchmark_width_, benchmark_height_,
+                         benchmark_iterations_, +1, 0);
+  EXPECT_LE(max_diff, 1);
+}
+
 }  // namespace libyuv
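Note: a usage sketch of the new public entry point follows. Only ARGBAdd() itself comes from this patch; AddFrames, the buffer setup, and the dimensions are hypothetical. ARGBAdd returns 0 on success and -1 on bad arguments, and a negative height writes the destination bottom-up, per the implementation above.

// Sketch of calling ARGBAdd(). AddFrames and the buffers are illustrative;
// ARGBAdd checks pointer/stride alignment internally and falls back from the
// SSE2 row to the C row when the alignment requirements are not met.
#include <stdlib.h>
#include <string.h>
#include "libyuv/planar_functions.h"

int AddFrames() {
  const int kWidth = 64;
  const int kHeight = 32;
  const int kStride = kWidth * 4;  // 4 bytes per ARGB pixel
  uint8* src0 = (uint8*)malloc(kStride * kHeight);
  uint8* src1 = (uint8*)malloc(kStride * kHeight);
  uint8* dst = (uint8*)malloc(kStride * kHeight);
  memset(src0, 0x40, kStride * kHeight);
  memset(src1, 0xF0, kStride * kHeight);
  // Per-channel saturating add: 0x40 + 0xF0 exceeds 255, so dst clamps to
  // 0xFF. Passing -kHeight instead would invert the destination.
  int ret = libyuv::ARGBAdd(src0, kStride, src1, kStride,
                            dst, kStride, kWidth, kHeight);
  free(src0);
  free(src1);
  free(dst);
  return ret;  // 0 on success, -1 on invalid arguments
}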