From 8ec6033478a736f1b84203792a0b254a19391100 Mon Sep 17 00:00:00 2001 From: "fbarchard@google.com" Date: Thu, 17 Jan 2013 20:18:08 +0000 Subject: [PATCH] ARGBMultiply 2 images together BUG=175 TEST=Out\release\libyuv_unittest --gtest_filter=*Mult* Review URL: https://webrtc-codereview.appspot.com/1043004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@541 16f28f9a-4ce2-e073-06de-1de4eb20be90 --- README.chromium | 2 +- include/libyuv/planar_functions.h | 5 +++ include/libyuv/row.h | 5 ++- include/libyuv/version.h | 2 +- source/planar_functions.cc | 41 ++++++++++++++++++ source/row_any.cc | 13 ++++++ source/row_common.cc | 24 +++++++++++ source/row_posix.cc | 38 +++++++++++++++++ unit_test/planar_test.cc | 70 +++++++++++++++++++++++++++++++ 9 files changed, 197 insertions(+), 3 deletions(-) diff --git a/README.chromium b/README.chromium index b6254244b..ec57a1d6b 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 540 +Version: 541 License: BSD License File: LICENSE diff --git a/include/libyuv/planar_functions.h b/include/libyuv/planar_functions.h index 688bef5dc..76b6517f3 100644 --- a/include/libyuv/planar_functions.h +++ b/include/libyuv/planar_functions.h @@ -265,6 +265,11 @@ int ARGBShade(const uint8* src_argb, int src_stride_argb, uint8* dst_argb, int dst_stride_argb, int width, int height, uint32 value); +// Multiply ARGB image by ARGB image. +int ARGBMultiply(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + // Interpolate between two ARGB images using specified amount of interpolation // (0 to 255) and store to destination. 
// 'interpolation' is specified as 8 bit fraction where 0 means 100% src_argb0 diff --git a/include/libyuv/row.h b/include/libyuv/row.h index abb9e1efe..22eccce76 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -106,6 +106,7 @@ extern "C" { #define HAS_ARGBGRAYROW_SSSE3 #define HAS_ARGBINTERPOLATEROW_SSSE3 #define HAS_ARGBMIRRORROW_SSSE3 +#define HAS_ARGBMULTIPLYROW_SSE2 #define HAS_ARGBQUANTIZEROW_SSE2 #define HAS_ARGBSEPIAROW_SSSE3 #define HAS_ARGBSHADEROW_SSE2 @@ -118,7 +119,6 @@ extern "C" { // TODO(fbarchard): Port to gcc. #if !defined(YUV_DISABLE_ASM) && defined(_M_IX86) #define HAS_ARGBCOLORTABLEROW_X86 -#define HAS_ARGBMULTIPLYROW_SSE2 #define HAS_ARGBTOUV444ROW_SSSE3 #endif @@ -1287,7 +1287,10 @@ void ARGBInterpolateRow_SSSE3(uint8* dst_argb, const uint8* src_argb, void ARGBInterpolateRow_NEON(uint8* dst_argb, const uint8* src_argb, ptrdiff_t src_stride_argb, int dst_width, int source_y_fraction); +void ARGBMultiplyRow_C(const uint8* src_argb, uint8* dst_argb, int width); void ARGBMultiplyRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width); +void ARGBMultiplyRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb, + int width); #ifdef __cplusplus } // extern "C" diff --git a/include/libyuv/version.h b/include/libyuv/version.h index fb04d6d07..73eb3fad9 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 540 +#define LIBYUV_VERSION 541 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/planar_functions.cc b/source/planar_functions.cc index 49a4d95e5..750670ce3 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -1170,6 +1170,47 @@ int ARGBShade(const uint8* src_argb, int src_stride_argb, return 0; } +// ARGB multiply 2 images together. 
+LIBYUV_API +int ARGBMultiply(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + if (!src_argb || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image: start at the last source row and negate the source stride so rows are read bottom-up. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + + void (*ARGBMultiplyRow)(const uint8* src, uint8* dst, int width) = + ARGBMultiplyRow_C; // Portable C row function is the default. +#if defined(HAS_ARGBMULTIPLYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 4 && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + ARGBMultiplyRow = ARGBMultiplyRow_Any_SSE2; // Any width of 4 or more. + if (IS_ALIGNED(width, 4)) { + ARGBMultiplyRow = ARGBMultiplyRow_SSE2; // Width is a multiple of 4. + } + } +#elif defined(HAS_ARGBMULTIPLYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 4)) { + ARGBMultiplyRow = ARGBMultiplyRow_NEON; + } +#endif + + // Multiply plane: each row of dst_argb is multiplied in place by the matching row of src_argb. Returns 0 on success. + for (int y = 0; y < height; ++y) { + ARGBMultiplyRow(src_argb, dst_argb, width); + src_argb += src_stride_argb; + dst_argb += dst_stride_argb; + } + return 0; +} + // Interpolate 2 ARGB images by specified amount (0 to 255). // TODO(fbarchard): Consider selecting a specialization for interpolation so // row function doesn't need to check interpolation on each row. 
diff --git a/source/row_any.cc b/source/row_any.cc index 5839274b3..1d928a3fd 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -372,6 +372,19 @@ MergeUVRow_ANY(MergeUVRow_Any_NEON, MergeUVRow_NEON, MergeUVRow_C, 15) #endif #undef MergeUVRow_ANY +#define MultiplyRow_ANY(NAMEANY, ARGBMULT_SIMD, ARGBMULT_C, MASK) \ + void NAMEANY(const uint8* src_argb, uint8* dst_argb, int width) { \ + int n = width & ~(MASK); /* Pixels handled by the SIMD row. */ \ + if (n > 0) { ARGBMULT_SIMD(src_argb, dst_argb, n); } /* SIMD row runs its loop body before testing the count, so skip it when n is 0. */ \ + ARGBMULT_C(src_argb + n * 4, /* Remaining pixels, 4 bytes each, go through the C row. */ \ + dst_argb + n * 4, \ + width & (MASK)); \ + } + +#ifdef HAS_ARGBMULTIPLYROW_SSE2 +MultiplyRow_ANY(ARGBMultiplyRow_Any_SSE2, ARGBMultiplyRow_SSE2, + ARGBMultiplyRow_C, 3) +#endif #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/source/row_common.cc b/source/row_common.cc index 8758c8ee4..f4c55b6d2 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -701,6 +701,30 @@ void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width, #undef REPEAT8 #undef SHADE +#define REPEAT8(v) ((v) | ((v) << 8)) // 8 bit value copied into both bytes of a 16 bit value. +#define SHADE(f, v) (((v) * (f)) >> 16) // Scale v by the 16 bit fraction f. + +void ARGBMultiplyRow_C(const uint8* src_argb, uint8* dst_argb, int width) { // Multiplies width ARGB pixels of dst_argb by src_argb, channel by channel, in place. + for (int i = 0; i < width; ++i) { + const uint32 b = REPEAT8(src_argb[0]); // Source channels widened to 16 bit fractions. + const uint32 g = REPEAT8(src_argb[1]); + const uint32 r = REPEAT8(src_argb[2]); + const uint32 a = REPEAT8(src_argb[3]); + const uint32 b_scale = dst_argb[0]; // Destination channels are read before being overwritten. + const uint32 g_scale = dst_argb[1]; + const uint32 r_scale = dst_argb[2]; + const uint32 a_scale = dst_argb[3]; + dst_argb[0] = SHADE(b, b_scale); + dst_argb[1] = SHADE(g, g_scale); + dst_argb[2] = SHADE(r, r_scale); + dst_argb[3] = SHADE(a, a_scale); + src_argb += 4; + dst_argb += 4; + } +} +#undef REPEAT8 +#undef SHADE + void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) { // Copy a Y to RGB. 
for (int x = 0; x < width; ++x) { diff --git a/source/row_posix.cc b/source/row_posix.cc index e9d76271f..d62be2383 100644 --- a/source/row_posix.cc +++ b/source/row_posix.cc @@ -3960,6 +3960,44 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, } #endif // HAS_ARGBSHADEROW_SSE2 +#ifdef HAS_ARGBMULTIPLYROW_SSE2 +// Multiply 2 rows of ARGB pixels together, 4 pixels at a time; the product is written back to dst_argb. +// src_argb and dst_argb must be aligned to 16 bytes (movdqa). The loop body runs before width is tested, so width must be at least 4. +void ARGBMultiplyRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { + asm volatile ( + "pxor %%xmm5,%%xmm5 \n" // xmm5 = 0, used to zero extend dst bytes. + "sub %0,%1 \n" // %1 = dst - src, so (%0,%1) addresses dst. + + // 4 pixel loop. + ".p2align 4 \n" + "1: \n" + "movdqa (%0),%%xmm0 \n" // 4 src pixels. + "movdqa (%0,%1),%%xmm2 \n" // 4 dst pixels. + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm3 \n" + "punpcklbw %%xmm0,%%xmm0 \n" // Low src bytes copied into both halves of each word. + "punpckhbw %%xmm1,%%xmm1 \n" // Same for the high src bytes. + "punpcklbw %%xmm5,%%xmm2 \n" // Low dst bytes zero extended to words. + "punpckhbw %%xmm5,%%xmm3 \n" // Same for the high dst bytes. + "pmulhuw %%xmm2,%%xmm0 \n" // Keep the high 16 bits of each word product. + "pmulhuw %%xmm3,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" // Pack the 8 words back into 16 bytes. + "sub $0x4,%2 \n" // 4 pixels done; flags drive the jg below. + "movdqa %%xmm0,(%0,%1,1) \n" // Store the result to dst. + "lea 0x10(%0),%0 \n" // Advance src by 16 bytes without touching flags. + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" +#endif + ); +} +#endif // HAS_ARGBMULTIPLYROW_SSE2 + #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2 // Creates a table of cumulative sums where each value is a sum of all values // above and to the left of the value, inclusive of the value. 
diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc index ae1792378..6e987f5c9 100644 --- a/unit_test/planar_test.cc +++ b/unit_test/planar_test.cc @@ -900,4 +900,74 @@ TEST_F(libyuvTest, TestCopyPlane) { EXPECT_EQ(0, err); } +static int TestMultiply(int width, int height, int benchmark_iterations, + int invert, int off) { // Returns the largest per byte difference between the two ARGBMultiply results computed below. + const int kBpp = 4; + const int kStride = (width * kBpp + 15) & ~15; // Row bytes rounded up to a multiple of 16. + align_buffer_64(src_argb_a, kStride * height + off); + align_buffer_64(src_argb_b, kStride * height + off); + align_buffer_64(dst_argb_c, kStride * height); + align_buffer_64(dst_argb_opt, kStride * height); + srandom(time(NULL)); + for (int i = 0; i < kStride * height; ++i) { + src_argb_a[i + off] = (random() & 0xff); + src_argb_b[i + off] = (random() & 0xff); + } + memcpy(dst_argb_c, src_argb_b + off, kStride * height); // Both destinations start from the same second image. + memcpy(dst_argb_opt, src_argb_b + off, kStride * height); + + MaskCpuFlags(0); // NOTE(review): presumably forces the C row functions; confirm against MaskCpuFlags. + ARGBMultiply(src_argb_a + off, kStride, + dst_argb_c, kStride, + width, invert * height); + MaskCpuFlags(-1); // NOTE(review): presumably re-enables CPU specific row functions; confirm. + ARGBMultiply(src_argb_a + off, kStride, + dst_argb_opt, kStride, + width, invert * height); + int max_diff = 0; + for (int i = 0; i < kStride * height; ++i) { + int abs_diff = + abs(static_cast<int>(dst_argb_c[i]) - + static_cast<int>(dst_argb_opt[i])); + if (abs_diff > max_diff) { + max_diff = abs_diff; + } + } + // Benchmark. max_diff was measured above, so the repeated in place multiplies below do not affect the result.
+ for (int i = 0; i < benchmark_iterations - 1; ++i) { + ARGBMultiply(src_argb_a + off, kStride, + dst_argb_opt, kStride, + width, invert * height); + } + free_aligned_buffer_64(src_argb_a) + free_aligned_buffer_64(src_argb_b) + free_aligned_buffer_64(dst_argb_c) + free_aligned_buffer_64(dst_argb_opt) + return max_diff; +} + +TEST_F(libyuvTest, ARGBMultiply_Any) { + int max_diff = TestMultiply(benchmark_width_ - 1, benchmark_height_, + benchmark_iterations_, +1, 0); + EXPECT_LE(max_diff, 1); +} + +TEST_F(libyuvTest, ARGBMultiply_Unaligned) { + int max_diff = TestMultiply(benchmark_width_, benchmark_height_, + benchmark_iterations_, +1, 1); + EXPECT_LE(max_diff, 1); +} + +TEST_F(libyuvTest, ARGBMultiply_Invert) { + int max_diff = TestMultiply(benchmark_width_, benchmark_height_, + benchmark_iterations_, -1, 0); + EXPECT_LE(max_diff, 1); +} + +TEST_F(libyuvTest, ARGBMultiply_Opt) { + int max_diff = TestMultiply(benchmark_width_, benchmark_height_, + benchmark_iterations_, +1, 0); + EXPECT_LE(max_diff, 1); +} + } // namespace libyuv