From bea690b3e03d24f77fea45c9a8592ea480a4acd8 Mon Sep 17 00:00:00 2001
From: Frank Barchard
Date: Sat, 5 Dec 2015 22:23:29 -0800
Subject: [PATCH] AVX2 YUV alpha blender and improved unittests

AVX2 version can process 16 pixels at a time for improved memory
bandwidth and fewer instructions. Unit tests improved to test unaligned
memory and to verify exactness when alpha is 0 or 255.

R=dhrosa@google.com, harryjin@google.com
BUG=libyuv:527

Review URL: https://codereview.chromium.org/1505433002 .
---
 README.chromium                   |   2 +-
 include/libyuv/planar_functions.h |  26 +++
 include/libyuv/row.h              |  13 +-
 include/libyuv/version.h          |   2 +-
 source/planar_functions.cc        | 162 ++++++++++++++++++
 source/row_gcc.cc                 |  51 +++++-
 source/row_win.cc                 |  62 ++++++-
 unit_test/planar_test.cc          | 267 ++++++++++++++++++++++++++----
 8 files changed, 539 insertions(+), 46 deletions(-)

diff --git a/README.chromium b/README.chromium
index 3b3aed2b2..b0bc90214 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1547
+Version: 1548
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/planar_functions.h b/include/libyuv/planar_functions.h
index 6d5dd082f..9d30225d4 100644
--- a/include/libyuv/planar_functions.h
+++ b/include/libyuv/planar_functions.h
@@ -302,6 +302,7 @@ LIBYUV_API
 ARGBBlendRow GetARGBBlend();
 
 // Alpha Blend ARGB images and store to destination.
+// Source is pre-multiplied by alpha using ARGBAttenuate.
 // Alpha of destination is set to 255.
 LIBYUV_API
 int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
@@ -309,6 +310,31 @@ int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
               uint8* dst_argb, int dst_stride_argb,
               int width, int height);
 
+// Alpha Blend plane and store to destination.
+// Source is not pre-multiplied by alpha.
+LIBYUV_API
+int BlendPlane(const uint8* src_y0, int src_stride_y0,
+               const uint8* src_y1, int src_stride_y1,
+               const uint8* alpha, int alpha_stride,
+               uint8* dst_y, int dst_stride_y,
+               int width, int height);
+
+// Alpha Blend YUV images and store to destination.
+// Source is not pre-multiplied by alpha.
+// Alpha is full width x height and subsampled to half size to apply to UV.
+LIBYUV_API
+int I420Blend(const uint8* src_y0, int src_stride_y0,
+              const uint8* src_u0, int src_stride_u0,
+              const uint8* src_v0, int src_stride_v0,
+              const uint8* src_y1, int src_stride_y1,
+              const uint8* src_u1, int src_stride_u1,
+              const uint8* src_v1, int src_stride_v1,
+              const uint8* alpha, int alpha_stride,
+              uint8* dst_y, int dst_stride_y,
+              uint8* dst_u, int dst_stride_u,
+              uint8* dst_v, int dst_stride_v,
+              int width, int height);
+
 // Multiply ARGB image by ARGB image. Shifted down by 8. Saturates to 255.
 LIBYUV_API
 int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index deed8a422..cf96c0516 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -233,6 +233,7 @@ extern "C" {
 #define HAS_ARGBMULTIPLYROW_AVX2
 #define HAS_ARGBSUBTRACTROW_AVX2
 #define HAS_ARGBUNATTENUATEROW_AVX2
+#define HAS_BLENDPLANEROW_AVX2
 #endif
 
 // The following are available for AVX2 Visual C and clangcl 32 bit:
@@ -253,12 +254,6 @@ extern "C" {
 #define HAS_RGB565TOARGBROW_AVX2
 #endif
 
-// The following are available for 32 bit Visual C and clangcl 32 bit:
-// TODO(fbarchard): Port to gcc.
-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
-#define HAS_BLENDPLANEROW_SSSE3
-#endif
-
 // The following are also available on x64 Visual C.
#if !defined(LIBYUV_DISABLE_X86) && defined (_M_X64) && \ (!defined(__clang__) || defined(__SSSE3__)) @@ -1464,6 +1459,12 @@ void ARGBBlendRow_C(const uint8* src_argb, const uint8* src_argb1, // Unattenuated planar alpha blend. void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1, const uint8* alpha, uint8* dst, int width); +void BlendPlaneRow_Any_SSSE3(const uint8* src0, const uint8* src1, + const uint8* alpha, uint8* dst, int width); +void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1, + const uint8* alpha, uint8* dst, int width); +void BlendPlaneRow_Any_AVX2(const uint8* src0, const uint8* src1, + const uint8* alpha, uint8* dst, int width); void BlendPlaneRow_C(const uint8* src0, const uint8* src1, const uint8* alpha, uint8* dst, int width); diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 10754081b..2c47a4c9f 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1547 +#define LIBYUV_VERSION 1548 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/planar_functions.cc b/source/planar_functions.cc index b15b6e523..85425feaf 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -17,6 +17,7 @@ #include "libyuv/mjpeg_decoder.h" #endif #include "libyuv/row.h" +#include "libyuv/scale_row.h" // for ScaleRowDown2 #ifdef __cplusplus namespace libyuv { @@ -577,6 +578,167 @@ int ARGBBlend(const uint8* src_argb0, int src_stride_argb0, return 0; } +// Alpha Blend plane and store to destination. +LIBYUV_API +int BlendPlane(const uint8* src_y0, int src_stride_y0, + const uint8* src_y1, int src_stride_y1, + const uint8* alpha, int alpha_stride, + uint8* dst_y, int dst_stride_y, + int width, int height) { + int y; + void (*BlendPlaneRow)(const uint8* src0, const uint8* src1, + const uint8* alpha, uint8* dst, int width) = BlendPlaneRow_C; + if (!src_y0 || !src_y1 || !alpha || !dst_y || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_y = dst_y + (height - 1) * dst_stride_y; + dst_stride_y = -dst_stride_y; + } + + // Coalesce rows for Y plane. + if (src_stride_y0 == width && + src_stride_y1 == width && + alpha_stride == width && + dst_stride_y == width) { + width *= height; + height = 1; + src_stride_y0 = src_stride_y1 = alpha_stride = dst_stride_y = 0; + } + +#if defined(HAS_BLENDPLANEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { +// TODO(fbarchard): Implement any versions for odd width. +// BlendPlaneRow = BlendPlaneRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + BlendPlaneRow = BlendPlaneRow_SSSE3; + } + } +#endif +#if defined(HAS_BLENDPLANEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { +// BlendPlaneRow = BlendPlaneRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + BlendPlaneRow = BlendPlaneRow_AVX2; + } + } +#endif + + for (y = 0; y < height; ++y) { + BlendPlaneRow(src_y0, src_y1, alpha, dst_y, width); + src_y0 += src_stride_y0; + src_y1 += src_stride_y1; + alpha += alpha_stride; + dst_y += dst_stride_y; + } + return 0; +} + +#define MAXTWIDTH 2048 +// Alpha Blend YUV images and store to destination. 
+LIBYUV_API
+int I420Blend(const uint8* src_y0, int src_stride_y0,
+              const uint8* src_u0, int src_stride_u0,
+              const uint8* src_v0, int src_stride_v0,
+              const uint8* src_y1, int src_stride_y1,
+              const uint8* src_u1, int src_stride_u1,
+              const uint8* src_v1, int src_stride_v1,
+              const uint8* alpha, int alpha_stride,
+              uint8* dst_y, int dst_stride_y,
+              uint8* dst_u, int dst_stride_u,
+              uint8* dst_v, int dst_stride_v,
+              int width, int height) {
+  int y;
+  void (*BlendPlaneRow)(const uint8* src0, const uint8* src1,
+      const uint8* alpha, uint8* dst, int width) = BlendPlaneRow_C;
+  void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride,
+      uint8* dst_ptr, int dst_width) = ScaleRowDown2Box_C;
+  if (!src_y0 || !src_u0 || !src_v0 || !src_y1 || !src_u1 || !src_v1 ||
+      !alpha || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+    return -1;
+  }
+
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_y = dst_y + (height - 1) * dst_stride_y;
+    dst_stride_y = -dst_stride_y;
+  }
+
+  // Blend Y plane.
+  BlendPlane(src_y0, src_stride_y0,
+             src_y1, src_stride_y1,
+             alpha, alpha_stride,
+             dst_y, dst_stride_y,
+             width, height);
+
+  // Half width/height for UV.
+  width = (width + 1) >> 1;
+  height = (height + 1) >> 1;
+
+#if defined(HAS_BLENDPLANEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+// TODO(fbarchard): Implement any versions for odd width.
+//    BlendPlaneRow = BlendPlaneRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      BlendPlaneRow = BlendPlaneRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_BLENDPLANEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+//    BlendPlaneRow = BlendPlaneRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      BlendPlaneRow = BlendPlaneRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_SCALEROWDOWN2_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ScaleRowDown2 = ScaleRowDown2Box_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ScaleRowDown2 = ScaleRowDown2Box_NEON;
+    }
+  }
+#endif
+#if defined(HAS_SCALEROWDOWN2_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ScaleRowDown2 = ScaleRowDown2Box_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      ScaleRowDown2 = ScaleRowDown2Box_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_SCALEROWDOWN2_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ScaleRowDown2 = ScaleRowDown2Box_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ScaleRowDown2 = ScaleRowDown2Box_AVX2;
+    }
+  }
+#endif
+
+  // Row buffer for intermediate alpha pixels.
+  align_buffer_64(halfalpha, width);
+  for (y = 0; y < height; ++y) {
+    // Subsample 2 rows of alpha to half width and half height.
+    ScaleRowDown2(alpha, alpha_stride, halfalpha, width);
+    alpha += alpha_stride * 2;
+    BlendPlaneRow(src_u0, src_u1, halfalpha, dst_u, width);
+    BlendPlaneRow(src_v0, src_v1, halfalpha, dst_v, width);
+    src_u0 += src_stride_u0;
+    src_u1 += src_stride_u1;
+    dst_u += dst_stride_u;
+    src_v0 += src_stride_v0;
+    src_v1 += src_stride_v1;
+    dst_v += dst_stride_v;
+  }
+  free_aligned_buffer_64(halfalpha);
+  return 0;
+}
+
 // Multiply 2 ARGB images and store to destination.
 LIBYUV_API
 int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
diff --git a/source/row_gcc.cc b/source/row_gcc.cc
index c3ff96282..12c7dd884 100644
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -3467,7 +3467,6 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
 }
 #endif  // HAS_ARGBBLENDROW_SSSE3
 
-
 #ifdef HAS_BLENDPLANEROW_SSSE3
 // Blend 8 pixels at a time.
// =((G2*C2)+(H2*(D2))+32768+127)/256 @@ -3514,6 +3513,56 @@ void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1, } #endif // HAS_BLENDPLANEROW_SSSE3 +#ifdef HAS_BLENDPLANEROW_AVX2 +// Blend 16 pixels at a time. +// =((G2*C2)+(H2*(D2))+32768+127)/256 +void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1, + const uint8* alpha, uint8* dst, int width) { + asm volatile ( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsllw $0x8,%%ymm5,%%ymm5 \n" + "mov $0x80808080,%%eax \n" + "vmovd %%eax,%%xmm6 \n" + "vbroadcastss %%xmm6,%%ymm6 \n" + "mov $0x807f807f,%%eax \n" + "vmovd %%eax,%%xmm7 \n" + "vbroadcastss %%xmm7,%%ymm7 \n" + "sub %2,%0 \n" + "sub %2,%1 \n" + "sub %2,%3 \n" + + // 16 pixel loop. + LABELALIGN + "1: \n" + "vmovdqu (%2),%%xmm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" + "vpxor %%ymm5,%%ymm0,%%ymm0 \n" + "vmovdqu (%0,%2,1),%%xmm1 \n" + "vmovdqu (%1,%2,1),%%xmm2 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm2,%%ymm2 \n" + "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n" + "vpsubb %%ymm6,%%ymm1,%%ymm1 \n" + "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm7,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%xmm0,(%3,%2,1) \n" + "lea 0x10(%2),%2 \n" + "sub $0x10,%4 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src0), // %0 + "+r"(src1), // %1 + "+r"(alpha), // %2 + "+r"(dst), // %3 + "+r"(width) // %4 + :: "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7" + ); +} +#endif // HAS_BLENDPLANEROW_AVX2 #ifdef HAS_ARGBATTENUATEROW_SSSE3 // Shuffle table duplicating alpha diff --git a/source/row_win.cc b/source/row_win.cc index e3353cabf..13076ce60 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -525,7 +525,7 @@ void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb, vmovd xmm5, eax vbroadcastss ymm5, xmm5 mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits - movd xmm6, eax + vmovd xmm6, eax vbroadcastss ymm6, xmm6 vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red vpsllw ymm3, ymm3, 11 @@ -576,7 +576,7 @@ void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb, vmovd xmm5, eax vbroadcastss ymm5, xmm5 mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits - movd xmm6, eax + vmovd xmm6, eax vbroadcastss ymm6, xmm6 vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red vpsllw ymm3, ymm3, 11 @@ -4106,7 +4106,7 @@ void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1, movq qword ptr [edi + esi], xmm0 lea esi, [esi + 8] sub ecx, 8 - jge convertloop8 + jg convertloop8 pop edi pop esi @@ -4115,6 +4115,62 @@ void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1, } #endif // HAS_BLENDPLANEROW_SSSE3 +#ifdef HAS_BLENDPLANEROW_AVX2 +// Blend 16 pixels at a time. +// =((G2*C2)+(H2*(D2))+32768+127)/256 +__declspec(naked) +void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1, + const uint8* alpha, uint8* dst, int width) { + __asm { + push esi + push edi + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff00ff00 + vpsllw ymm5, ymm5, 8 + mov eax, 0x80808080 // 128 for biasing image to signed. + vmovd xmm6, eax + vbroadcastss ymm6, xmm6 + mov eax, 0x807f807f // 32768 + 127 for unbias and round. 
+ vmovd xmm7, eax + vbroadcastss ymm7, xmm7 + mov eax, [esp + 8 + 4] // src0 + mov edx, [esp + 8 + 8] // src1 + mov esi, [esp + 8 + 12] // alpha + mov edi, [esp + 8 + 16] // dst + mov ecx, [esp + 8 + 20] // width + sub eax, esi + sub edx, esi + sub edi, esi + + // 16 pixel loop. + convertloop16: + vmovdqu xmm0, [esi] // alpha + vpermq ymm0, ymm0, 0xd8 + vpunpcklbw ymm0, ymm0, ymm0 + vpxor ymm0, ymm0, ymm5 // a, 255-a + vmovdqu xmm1, [eax + esi] // src0 + vmovdqu xmm2, [edx + esi] // src1 + vpermq ymm1, ymm1, 0xd8 + vpermq ymm2, ymm2, 0xd8 + vpunpcklbw ymm1, ymm1, ymm2 + vpsubb ymm1, ymm1, ymm6 // bias src0/1 - 128 + vpmaddubsw ymm0, ymm0, ymm1 + vpaddw ymm0, ymm0, ymm7 // unbias result - 32768 and round. + vpsrlw ymm0, ymm0, 8 + vpackuswb ymm0, ymm0, ymm0 + vpermq ymm0, ymm0, 0xd8 + vmovdqu [edi + esi], xmm0 + lea esi, [esi + 16] + sub ecx, 16 + jg convertloop16 + + pop edi + pop esi + vzeroupper + ret + } +} +#endif // HAS_BLENDPLANEROW_AVX2 + #ifdef HAS_ARGBBLENDROW_SSSE3 // Shuffle table for isolating alpha. static const uvec8 kShuffleAlpha = { diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc index fc22fe139..f5a8b2129 100644 --- a/unit_test/planar_test.cc +++ b/unit_test/planar_test.cc @@ -1163,16 +1163,14 @@ TEST_F(LibYUVPlanarTest, ARGBBlend_Opt) { EXPECT_LE(max_diff, 1); } -#ifdef HAS_BLENDPLANEROW_SSSE3 +#ifdef HAS_BLENDPLANEROW_AVX2 // TODO(fbarchard): Switch to I420Blend. -static void TestBlendPlane(int width, int height, int benchmark_iterations, - int invert, int off) { +static void TestBlendPlaneRow(int width, int height, int benchmark_iterations, + int invert, int off) { int has_ssse3 = TestCpuFlag(kCpuHasSSSE3); + int has_avx2 = TestCpuFlag(kCpuHasAVX2); width = width * height; height = 1; - if (width < 1) { - width = 1; - } if (width < 256) { width = 256; } @@ -1181,23 +1179,39 @@ static void TestBlendPlane(int width, int height, int benchmark_iterations, align_buffer_64(src_argb_a, kStride * height + off); align_buffer_64(src_argb_b, kStride * height + off); align_buffer_64(src_argb_alpha, kStride * height + off); - align_buffer_64(dst_argb_c, kStride * height); - align_buffer_64(dst_argb_opt, kStride * height); + align_buffer_64(dst_argb_c, kStride * height + off); + align_buffer_64(dst_argb_opt, kStride * height + off); + memset(dst_argb_c, 255, kStride * height + off); + memset(dst_argb_opt, 255, kStride * height + off); if (has_ssse3) { - for (int i = 0; i < 255; ++i) { - src_argb_a[i] = i; - src_argb_b[i] = 255 - i; - src_argb_alpha[i] = 255; + // Test source is maintained exactly if alpha is 255. + for (int i = 0; i < 256; ++i) { + src_argb_a[i + off] = i; + src_argb_b[i + off] = 255 - i; + src_argb_alpha[i + off] = 255; } - memset(dst_argb_opt, 0xfb, kStride * height); BlendPlaneRow_SSSE3(src_argb_a + off, src_argb_b + off, src_argb_alpha + off, - dst_argb_opt, - width * height); - for (int i = 0; i < kStride * height; ++i) { - EXPECT_EQ(src_argb_a[i], dst_argb_opt[i]); + dst_argb_opt + off, + 256); + for (int i = 0; i < 256; ++i) { + EXPECT_EQ(src_argb_a[i + off], dst_argb_opt[i + off]); + } + // Test destination is maintained exactly if alpha is 0. 
+ for (int i = 0; i < 256; ++i) { + src_argb_a[i + off] = i; + src_argb_b[i + off] = 255 - i; + src_argb_alpha[i + off] = 0; + } + BlendPlaneRow_SSSE3(src_argb_a + off, + src_argb_b + off, + src_argb_alpha + off, + dst_argb_opt + off, + 256); + for (int i = 0; i < 256; ++i) { + EXPECT_EQ(src_argb_b[i + off], dst_argb_opt[i + off]); } } for (int i = 0; i < kStride * height; ++i) { @@ -1205,34 +1219,122 @@ static void TestBlendPlane(int width, int height, int benchmark_iterations, src_argb_b[i + off] = (fastrand() & 0xff); src_argb_alpha[i + off] = (fastrand() & 0xff); } - memset(dst_argb_c, 255, kStride * height); - memset(dst_argb_opt, 255, kStride * height); BlendPlaneRow_C(src_argb_a + off, src_argb_b + off, src_argb_alpha + off, - dst_argb_c, + dst_argb_c + off, width * height); for (int i = 0; i < benchmark_iterations; ++i) { - if (has_ssse3) { - BlendPlaneRow_SSSE3(src_argb_a + off, - src_argb_b + off, - src_argb_alpha + off, - dst_argb_opt, - width * height); + if (has_avx2) { + BlendPlaneRow_AVX2(src_argb_a + off, + src_argb_b + off, + src_argb_alpha + off, + dst_argb_opt + off, + width * height); } else { - BlendPlaneRow_C(src_argb_a + off, - src_argb_b + off, - src_argb_alpha + off, - dst_argb_opt, - width * height); + if (has_ssse3) { + BlendPlaneRow_SSSE3(src_argb_a + off, + src_argb_b + off, + src_argb_alpha + off, + dst_argb_opt + off, + width * height); + } else { + BlendPlaneRow_C(src_argb_a + off, + src_argb_b + off, + src_argb_alpha + off, + dst_argb_opt + off, + width * height); + } } } for (int i = 0; i < kStride * height; ++i) { - EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]); + EXPECT_EQ(dst_argb_c[i + off], dst_argb_opt[i + off]); } free_aligned_buffer_64(src_argb_a); free_aligned_buffer_64(src_argb_b); + free_aligned_buffer_64(src_argb_alpha); + free_aligned_buffer_64(dst_argb_c); + free_aligned_buffer_64(dst_argb_opt); + return; +} + +TEST_F(LibYUVPlanarTest, BlendPlaneRow_Opt) { + TestBlendPlaneRow(benchmark_width_, benchmark_height_, benchmark_iterations_, + +1, 0); +} +TEST_F(LibYUVPlanarTest, BlendPlaneRow_Unaligned) { + TestBlendPlaneRow(benchmark_width_, benchmark_height_, benchmark_iterations_, + +1, 1); +} +#endif + +static void TestBlendPlane(int width, int height, int benchmark_iterations, + int disable_cpu_flags, int benchmark_cpu_info, + int invert, int off) { + if (width < 1) { + width = 1; + } + const int kBpp = 1; + const int kStride = width * kBpp; + align_buffer_64(src_argb_a, kStride * height + off); + align_buffer_64(src_argb_b, kStride * height + off); + align_buffer_64(src_argb_alpha, kStride * height + off); + align_buffer_64(dst_argb_c, kStride * height + off); + align_buffer_64(dst_argb_opt, kStride * height + off); + memset(dst_argb_c, 255, kStride * height + off); + memset(dst_argb_opt, 255, kStride * height + off); + + // Test source is maintained exactly if alpha is 255. + for (int i = 0; i < width; ++i) { + src_argb_a[i + off] = i & 255; + src_argb_b[i + off] = 255 - (i & 255); + } + memset(src_argb_alpha + off, 255, width); + BlendPlane(src_argb_a + off, width, + src_argb_b + off, width, + src_argb_alpha + off, width, + dst_argb_opt + off, width, + width, 1); + for (int i = 0; i < width; ++i) { + EXPECT_EQ(src_argb_a[i + off], dst_argb_opt[i + off]); + } + // Test destination is maintained exactly if alpha is 0. 
+ memset(src_argb_alpha + off, 0, width); + BlendPlane(src_argb_a + off, width, + src_argb_b + off, width, + src_argb_alpha + off, width, + dst_argb_opt + off, width, + width, 1); + for (int i = 0; i < width; ++i) { + EXPECT_EQ(src_argb_b[i + off], dst_argb_opt[i + off]); + } + for (int i = 0; i < kStride * height; ++i) { + src_argb_a[i + off] = (fastrand() & 0xff); + src_argb_b[i + off] = (fastrand() & 0xff); + src_argb_alpha[i + off] = (fastrand() & 0xff); + } + + MaskCpuFlags(disable_cpu_flags); + BlendPlane(src_argb_a + off, width, + src_argb_b + off, width, + src_argb_alpha + off, width, + dst_argb_c + off, width, + width, height); + MaskCpuFlags(benchmark_cpu_info); + for (int i = 0; i < benchmark_iterations; ++i) { + BlendPlane(src_argb_a + off, width, + src_argb_b + off, width, + src_argb_alpha + off, width, + dst_argb_opt + off, width, + width, height); + } + for (int i = 0; i < kStride * height; ++i) { + EXPECT_EQ(dst_argb_c[i + off], dst_argb_opt[i + off]); + } + free_aligned_buffer_64(src_argb_a); + free_aligned_buffer_64(src_argb_b); + free_aligned_buffer_64(src_argb_alpha); free_aligned_buffer_64(dst_argb_c); free_aligned_buffer_64(dst_argb_opt); return; @@ -1240,9 +1342,106 @@ static void TestBlendPlane(int width, int height, int benchmark_iterations, TEST_F(LibYUVPlanarTest, BlendPlane_Opt) { TestBlendPlane(benchmark_width_, benchmark_height_, benchmark_iterations_, - +1, 0); + disable_cpu_flags_, benchmark_cpu_info_, +1, 0); +} +TEST_F(LibYUVPlanarTest, BlendPlane_Unaligned) { + TestBlendPlane(benchmark_width_, benchmark_height_, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_, +1, 1); +} + +#define SUBSAMPLE(v, a) ((((v) + (a) - 1)) / (a)) + +static void TestI420Blend(int width, int height, int benchmark_iterations, + int disable_cpu_flags, int benchmark_cpu_info, + int invert, int off) { + width = ((width) > 0) ? 
(width) : 1; + const int kStrideUV = SUBSAMPLE(width, 2); + const int kSizeUV = kStrideUV * SUBSAMPLE(height, 2); + align_buffer_64(src_y0, width * height + off); + align_buffer_64(src_u0, kSizeUV + off); + align_buffer_64(src_v0, kSizeUV + off); + align_buffer_64(src_y1, width * height + off); + align_buffer_64(src_u1, kSizeUV + off); + align_buffer_64(src_v1, kSizeUV + off); + align_buffer_64(src_a, width * height + off); + align_buffer_64(dst_y_c, width * height + off); + align_buffer_64(dst_u_c, kSizeUV + off); + align_buffer_64(dst_v_c, kSizeUV + off); + align_buffer_64(dst_y_opt, width * height + off); + align_buffer_64(dst_u_opt, kSizeUV + off); + align_buffer_64(dst_v_opt, kSizeUV + off); + + MemRandomize(src_y0, width * height + off); + MemRandomize(src_u0, kSizeUV + off); + MemRandomize(src_v0, kSizeUV + off); + MemRandomize(src_y1, width * height + off); + MemRandomize(src_u1, kSizeUV + off); + MemRandomize(src_v1, kSizeUV + off); + MemRandomize(src_a, width * height + off); + memset(dst_y_c, 255, width * height + off); + memset(dst_u_c, 255, kSizeUV + off); + memset(dst_v_c, 255, kSizeUV + off); + memset(dst_y_opt, 255, width * height + off); + memset(dst_u_opt, 255, kSizeUV + off); + memset(dst_v_opt, 255, kSizeUV + off); + + MaskCpuFlags(disable_cpu_flags); + I420Blend(src_y0 + off, width, + src_u0 + off, kStrideUV, + src_v0 + off, kStrideUV, + src_y1 + off, width, + src_u1 + off, kStrideUV, + src_v1 + off, kStrideUV, + src_a + off, width, + dst_y_c + off, width, + dst_u_c + off, kStrideUV, + dst_v_c + off, kStrideUV, + width, height); + MaskCpuFlags(benchmark_cpu_info); + for (int i = 0; i < benchmark_iterations; ++i) { + I420Blend(src_y0 + off, width, + src_u0 + off, kStrideUV, + src_v0 + off, kStrideUV, + src_y1 + off, width, + src_u1 + off, kStrideUV, + src_v1 + off, kStrideUV, + src_a + off, width, + dst_y_opt + off, width, + dst_u_opt + off, kStrideUV, + dst_v_opt + off, kStrideUV, + width, height); + } + for (int i = 0; i < width * height; ++i) { + EXPECT_EQ(dst_y_c[i + off], dst_y_opt[i + off]); + } + for (int i = 0; i < kSizeUV; ++i) { + EXPECT_NEAR(dst_u_c[i + off], dst_u_opt[i + off], 1); // Subsample off by 1 + EXPECT_NEAR(dst_v_c[i + off], dst_v_opt[i + off], 1); + } + free_aligned_buffer_64(src_y0); + free_aligned_buffer_64(src_u0); + free_aligned_buffer_64(src_v0); + free_aligned_buffer_64(src_y1); + free_aligned_buffer_64(src_u1); + free_aligned_buffer_64(src_v1); + free_aligned_buffer_64(src_a); + free_aligned_buffer_64(dst_y_c); + free_aligned_buffer_64(dst_u_c); + free_aligned_buffer_64(dst_v_c); + free_aligned_buffer_64(dst_y_opt); + free_aligned_buffer_64(dst_u_opt); + free_aligned_buffer_64(dst_v_opt); + return; +} + +TEST_F(LibYUVPlanarTest, I420Blend_Opt) { + TestI420Blend(benchmark_width_, benchmark_height_, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_, +1, 0); +} +TEST_F(LibYUVPlanarTest, I420Blend_Unaligned) { + TestI420Blend(benchmark_width_, benchmark_height_, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_, +1, 1); } -#endif TEST_F(LibYUVPlanarTest, TestAffine) { SIMD_ALIGNED(uint8 orig_pixels_0[1280][4]);
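
A note on the row math above: per pixel, BlendPlaneRow_C and the SSSE3/AVX2
rows compute dst = (src0 * a + src1 * (255 - a) + 255) >> 8. In the vector
code, the 0x80808080 constant biases the interleaved source bytes into signed
range so vpmaddubsw can form a * (src0 - 128) + (255 - a) * (src1 - 128), and
the 0x807f807f constant adds 128 * 255 + 255 = 32768 + 127 per word to undo
the bias and round before the shift. Below is a minimal scalar sketch of that
formula, checking the exactness property the new unittests rely on; BlendPixel
is a hypothetical helper for illustration, not a libyuv function.

/* Scalar model of the planar blend used by the BlendPlaneRow functions.
 * Sketch for illustration only. */
#include <assert.h>
#include <stdint.h>

static uint8_t BlendPixel(uint8_t src0, uint8_t src1, uint8_t a) {
  /* dst = (src0 * a + src1 * (255 - a) + 255) >> 8.
   * The +255 rounding term makes the endpoints exact, which is what the
   * alpha == 0 and alpha == 255 unittests verify. */
  return (uint8_t)((src0 * a + src1 * (255 - a) + 255) >> 8);
}

int main(void) {
  int v;
  for (v = 0; v < 256; ++v) {
    assert(BlendPixel((uint8_t)v, (uint8_t)(255 - v), 255) == v);      /* alpha 255 keeps src0 */
    assert(BlendPixel((uint8_t)v, (uint8_t)(255 - v), 0) == 255 - v);  /* alpha 0 keeps src1 */
  }
  return 0;
}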
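
For callers of the new public entry point, a minimal usage sketch of
I420Blend, assuming contiguous I420 planes whose strides equal their widths
(half width for U and V); the wrapper name and buffers are hypothetical:

#include "libyuv/planar_functions.h"

// Hypothetical wrapper: blends foreground over background according to the
// full-resolution alpha plane. I420Blend box-filters alpha 2x2 internally
// before blending the half-size U and V planes.
void BlendI420Frames(const uint8* fg_y, const uint8* fg_u, const uint8* fg_v,
                     const uint8* bg_y, const uint8* bg_u, const uint8* bg_v,
                     const uint8* alpha,  // width x height, not premultiplied
                     uint8* dst_y, uint8* dst_u, uint8* dst_v,
                     int width, int height) {
  int half_width = (width + 1) / 2;
  I420Blend(fg_y, width, fg_u, half_width, fg_v, half_width,
            bg_y, width, bg_u, half_width, bg_v, half_width,
            alpha, width,
            dst_y, width, dst_u, half_width, dst_v, half_width,
            width, height);
}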