diff --git a/README.chromium b/README.chromium index baad7cd52..f1e6cf56f 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 550 +Version: 551 License: BSD License File: LICENSE diff --git a/include/libyuv/planar_functions.h b/include/libyuv/planar_functions.h index f2b4aacc0..93417da9d 100644 --- a/include/libyuv/planar_functions.h +++ b/include/libyuv/planar_functions.h @@ -209,20 +209,27 @@ int ARGBBlend(const uint8* src_argb0, int src_stride_argb0, uint8* dst_argb, int dst_stride_argb, int width, int height); -// Multiply ARGB image by ARGB image. +// Multiply ARGB image by ARGB image. Shifted down by 8. Saturates to 255. LIBYUV_API int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0, const uint8* src_argb1, int src_stride_argb1, uint8* dst_argb, int dst_stride_argb, int width, int height); -// Add ARGB image with ARGB image. +// Add ARGB image with ARGB image. Saturates to 255. LIBYUV_API int ARGBAdd(const uint8* src_argb0, int src_stride_argb0, const uint8* src_argb1, int src_stride_argb1, uint8* dst_argb, int dst_stride_argb, int width, int height); +// Subtract ARGB image (argb1) from ARGB image (argb0). Saturates to 0. +LIBYUV_API +int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0, + const uint8* src_argb1, int src_stride_argb1, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + // Convert I422 to YUY2. 
LIBYUV_API int I422ToYUY2(const uint8* src_y, int src_stride_y, diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 636a3cc15..7997e5e62 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -111,6 +111,7 @@ extern "C" { #define HAS_ARGBQUANTIZEROW_SSE2 #define HAS_ARGBSEPIAROW_SSSE3 #define HAS_ARGBSHADEROW_SSE2 +#define HAS_ARGBSUBTRACTROW_SSE2 #define HAS_ARGBUNATTENUATEROW_SSE2 #define HAS_COMPUTECUMULATIVESUMROW_SSE2 #define HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 @@ -969,7 +970,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb, const uint8* src_argb1, void ARGBBlendRow_C(const uint8* src_argb, const uint8* src_argb1, uint8* dst_argb, int width); -// ARGB multiply images. Same API as Blend, but these require +// ARGB multiply images. Same API as Blend, but these require // pointer and width alignment for SSE2. void ARGBMultiplyRow_C(const uint8* src_argb, const uint8* src_argb1, uint8* dst_argb, int width); @@ -994,6 +995,19 @@ void ARGBAddRow_NEON(const uint8* src_argb, const uint8* src_argb1, void ARGBAddRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1, uint8* dst_argb, int width); +// ARGB subtract images. Same API as Blend, but these require +// pointer and width alignment for SSE2. 
+void ARGBSubtractRow_C(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBSubtractRow_SSE2(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBSubtractRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBSubtractRow_NEON(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBSubtractRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); + void ARGBToRGB24Row_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToRAWRow_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToRGB565Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 1891f1ee9..e6077bf3a 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 550 +#define LIBYUV_VERSION 551 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/planar_functions.cc b/source/planar_functions.cc index c24c2d307..e041dd10f 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -498,6 +498,53 @@ int ARGBAdd(const uint8* src_argb0, int src_stride_argb0, return 0; } +// Subtract 2 ARGB images and store to destination. +LIBYUV_API +int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0, + const uint8* src_argb1, int src_stride_argb1, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + + void (*ARGBSubtractRow)(const uint8* src0, const uint8* src1, uint8* dst, + int width) = ARGBSubtractRow_C; +#if defined(HAS_ARGBSUBTRACTROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 4 && + IS_ALIGNED(src_argb0, 16) && IS_ALIGNED(src_stride_argb0, 16) && + IS_ALIGNED(src_argb1, 16) && IS_ALIGNED(src_stride_argb1, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + ARGBSubtractRow = ARGBSubtractRow_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBSubtractRow = ARGBSubtractRow_SSE2; + } + } +#elif defined(HAS_ARGBSUBTRACTROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGBSubtractRow = ARGBSubtractRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBSubtractRow = ARGBSubtractRow_NEON; + } + } +#endif + + // Subtract plane + for (int y = 0; y < height; ++y) { + ARGBSubtractRow(src_argb0, src_argb1, dst_argb, width); + src_argb0 += src_stride_argb0; + src_argb1 += src_stride_argb1; + dst_argb += dst_stride_argb; + } + return 0; +} + // Convert I422 to BGRA. 
LIBYUV_API int I422ToBGRA(const uint8* src_y, int src_stride_y, diff --git a/source/row_any.cc b/source/row_any.cc index 7890a0e9c..43d76d4d2 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -390,6 +390,10 @@ MATHROW_ANY(ARGBMultiplyRow_Any_SSE2, ARGBMultiplyRow_SSE2, ARGBMultiplyRow_C, #ifdef HAS_ARGBADDROW_SSE2 MATHROW_ANY(ARGBAddRow_Any_SSE2, ARGBAddRow_SSE2, ARGBAddRow_C, 3) #endif +#ifdef HAS_ARGBSUBTRACTROW_SSE2 +MATHROW_ANY(ARGBSubtractRow_Any_SSE2, ARGBSubtractRow_SSE2, ARGBSubtractRow_C, + 3) +#endif #ifdef HAS_ARGBMULTIPLYROW_NEON MATHROW_ANY(ARGBMultiplyRow_Any_NEON, ARGBMultiplyRow_NEON, ARGBMultiplyRow_C, 7) @@ -397,6 +401,10 @@ MATHROW_ANY(ARGBMultiplyRow_Any_NEON, ARGBMultiplyRow_NEON, ARGBMultiplyRow_C, #ifdef HAS_ARGBADDROW_NEON MATHROW_ANY(ARGBAddRow_Any_NEON, ARGBAddRow_NEON, ARGBAddRow_C, 7) #endif +#ifdef HAS_ARGBSUBTRACTROW_NEON +MATHROW_ANY(ARGBSubtractRow_Any_NEON, ARGBSubtractRow_NEON, ARGBSubtractRow_C, + 7) +#endif #undef MATHROW_ANY #ifdef __cplusplus diff --git a/source/row_common.cc b/source/row_common.cc index 50a399fc8..a24ac074c 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -751,6 +751,30 @@ void ARGBAddRow_C(const uint8* src_argb0, const uint8* src_argb1, } #undef SHADE +#define SHADE(f, v) (v >= f) ? 
0 : (f - v) + +void ARGBSubtractRow_C(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + for (int i = 0; i < width; ++i) { + const uint32 b = src_argb0[0]; + const uint32 g = src_argb0[1]; + const uint32 r = src_argb0[2]; + const uint32 a = src_argb0[3]; + const uint32 b_sub = src_argb1[0]; + const uint32 g_sub = src_argb1[1]; + const uint32 r_sub = src_argb1[2]; + const uint32 a_sub = src_argb1[3]; + dst_argb[0] = SHADE(b, b_sub); + dst_argb[1] = SHADE(g, g_sub); + dst_argb[2] = SHADE(r, r_sub); + dst_argb[3] = SHADE(a, a_sub); + src_argb0 += 4; + src_argb1 += 4; + dst_argb += 4; + } +} +#undef SHADE + void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) { // Copy a Y to RGB. for (int x = 0; x < width; ++x) { diff --git a/source/row_neon.cc b/source/row_neon.cc index 758f8f206..fc0887d12 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -2552,7 +2552,31 @@ void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1, "+r"(dst_argb), // %2 "+r"(width) // %3 : - : "cc", "memory", "q0", "q1" + : "cc", "memory", "q0", "q1", "q2", "q3" + ); +} + +// Subtract 2 rows of ARGB pixels, 8 pixels at a time. +void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + asm volatile ( + // 8 pixel loop. + ".p2align 2 \n" + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqsub.u8 q0, q0, q2 \n" // subtract B, G + "vqsub.u8 q1, q1, q3 \n" // subtract R, A + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. 
+ "bgt 1b \n" + + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3" ); } diff --git a/source/row_posix.cc b/source/row_posix.cc index 04be0beb9..ef77eda20 100644 --- a/source/row_posix.cc +++ b/source/row_posix.cc @@ -4007,7 +4007,6 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { asm volatile ( - "pxor %%xmm5,%%xmm5 \n" "sub %0,%1 \n" "sub %0,%2 \n" @@ -4028,12 +4027,44 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, : : "memory", "cc" #if defined(__SSE2__) - , "xmm0", "xmm1", "xmm5" + , "xmm0", "xmm1" #endif ); } #endif // HAS_ARGBADDROW_SSE2 +#ifdef HAS_ARGBSUBTRACTROW_SSE2 +// Subtract 2 rows of ARGB pixels, 4 pixels at a time. +// Aligned to 16 bytes. +void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + asm volatile ( + "sub %0,%1 \n" + "sub %0,%2 \n" + + // 4 pixel loop. + ".p2align 4 \n" + "1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa (%0,%1),%%xmm1 \n" + "psubusb %%xmm1,%%xmm0 \n" + "sub $0x4,%3 \n" + "movdqa %%xmm0,(%0,%2,1) \n" + "lea 0x10(%0),%0 \n" + "jg 1b \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1" +#endif + ); +} +#endif // HAS_ARGBSUBTRACTROW_SSE2 + #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2 // Creates a table of cumulative sums where each value is a sum of all values // above and to the left of the value, inclusive of the value. 
diff --git a/source/row_win.cc b/source/row_win.cc index 33dc26d46..f81d6acec 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -4348,6 +4348,37 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, } #endif // HAS_ARGBADDROW_SSE2 +#ifdef HAS_ARGBSUBTRACTROW_SSE2 +// Subtract 2 rows of ARGB pixels, 4 pixels at a time. +// Aligned to 16 bytes. +__declspec(naked) __declspec(align(16)) +void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + sub esi, eax + sub edx, eax + + align 16 + convertloop: + movdqa xmm0, [eax] // read 4 pixels from src_argb0 + movdqa xmm1, [eax + esi] // read 4 pixels from src_argb1 + psubusb xmm0, xmm1 // src_argb0 - src_argb1 + sub ecx, 4 + movdqa [eax + edx], xmm0 + lea eax, [eax + 16] + jg convertloop + + pop esi + ret + } +} +#endif // HAS_ARGBSUBTRACTROW_SSE2 + #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 // Consider float CumulativeSum. // Consider calling CumulativeSum one row at time as needed. 
diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc index b03caf7ff..c39835c4b 100644 --- a/unit_test/planar_test.cc +++ b/unit_test/planar_test.cc @@ -1037,4 +1037,72 @@ TEST_F(libyuvTest, ARGBAdd_Opt) { EXPECT_LE(max_diff, 1); } +static int TestSubtract(int width, int height, int benchmark_iterations, + int invert, int off) { + const int kBpp = 4; + const int kStride = (width * kBpp + 15) & ~15; + align_buffer_64(src_argb_a, kStride * height + off); + align_buffer_64(src_argb_b, kStride * height + off); + align_buffer_64(dst_argb_c, kStride * height); + align_buffer_64(dst_argb_opt, kStride * height); + srandom(time(NULL)); + for (int i = 0; i < kStride * height; ++i) { + src_argb_a[i + off] = (random() & 0xff); + src_argb_b[i + off] = (random() & 0xff); + } + memset(dst_argb_c, 0, kStride * height); + memset(dst_argb_opt, 0, kStride * height); + + MaskCpuFlags(0); + ARGBSubtract(src_argb_a + off, kStride, + src_argb_b + off, kStride, + dst_argb_c, kStride, + width, invert * height); + MaskCpuFlags(-1); + for (int i = 0; i < benchmark_iterations; ++i) { + ARGBSubtract(src_argb_a + off, kStride, + src_argb_b + off, kStride, + dst_argb_opt, kStride, + width, invert * height); + } + int max_diff = 0; + for (int i = 0; i < kStride * height; ++i) { + int abs_diff = + abs(static_cast<int>(dst_argb_c[i]) - + static_cast<int>(dst_argb_opt[i])); + if (abs_diff > max_diff) { + max_diff = abs_diff; + } + } + free_aligned_buffer_64(src_argb_a) + free_aligned_buffer_64(src_argb_b) + free_aligned_buffer_64(dst_argb_c) + free_aligned_buffer_64(dst_argb_opt) + return max_diff; +} + +TEST_F(libyuvTest, ARGBSubtract_Any) { + int max_diff = TestSubtract(benchmark_width_ - 1, benchmark_height_, + benchmark_iterations_, +1, 0); + EXPECT_LE(max_diff, 1); +} + +TEST_F(libyuvTest, ARGBSubtract_Unaligned) { + int max_diff = TestSubtract(benchmark_width_, benchmark_height_, + benchmark_iterations_, +1, 1); + EXPECT_LE(max_diff, 1); +} + +TEST_F(libyuvTest, ARGBSubtract_Invert) { + int 
max_diff = TestSubtract(benchmark_width_, benchmark_height_, + benchmark_iterations_, -1, 0); + EXPECT_LE(max_diff, 1); +} + +TEST_F(libyuvTest, ARGBSubtract_Opt) { + int max_diff = TestSubtract(benchmark_width_, benchmark_height_, + benchmark_iterations_, +1, 0); + EXPECT_LE(max_diff, 1); +} + } // namespace libyuv