diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 0b1e01744..5ccf94d9d 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -271,7 +271,7 @@ extern "C" {
 #define HAS_I422TOARGBROW_SSSE3
 #endif
 
-// The following are available forr gcc/clang x86 platforms:
+// The following are available for gcc/clang x86 platforms:
 // TODO(fbarchard): Port to Visual C
 #if !defined(LIBYUV_DISABLE_X86) && \
     (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
@@ -279,6 +279,14 @@ extern "C" {
 #define HAS_SPLITRGBROW_SSSE3
 #endif
 
+// The following are available for AVX2 gcc/clang x86 platforms:
+// TODO(fbarchard): Port to Visual C
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) && \
+    (defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
+#define HAS_MERGEUV10ROW_AVX2
+#endif
+
 // The following are available on Neon platforms:
 #if !defined(LIBYUV_DISABLE_NEON) && \
     (defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON))
@@ -1523,6 +1531,15 @@ void MergeRGBRow_Any_NEON(const uint8* src_r,
                           uint8* dst_rgb,
                           int width);
 
+void MergeUV10Row_C(const uint16* src_u,
+                    const uint16* src_v,
+                    uint16* dst_uv,
+                    int width);
+void MergeUV10Row_AVX2(const uint16* src_u,
+                       const uint16* src_v,
+                       uint16* dst_uv,
+                       int width);
+
 void CopyRow_SSE2(const uint8* src, uint8* dst, int count);
 void CopyRow_AVX(const uint8* src, uint8* dst, int count);
 void CopyRow_ERMS(const uint8* src, uint8* dst, int count);
diff --git a/source/row_common.cc b/source/row_common.cc
index 2d01a789b..c3294ece5 100644
--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -1798,6 +1798,24 @@ void MergeRGBRow_C(const uint8* src_r,
   }
 }
 
+void MergeUV10Row_C(const uint16* src_u,
+                    const uint16* src_v,
+                    uint16* dst_uv,
+                    int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    dst_uv[0] = src_u[x] << 6;
+    dst_uv[1] = src_v[x] << 6;
+    dst_uv[2] = src_u[x + 1] << 6;
+    dst_uv[3] = src_v[x + 1] << 6;
+    dst_uv += 4;
+  }
+  if (width & 1) {
+    dst_uv[0] = src_u[width - 1] << 6;
+    dst_uv[1] = src_v[width - 1] << 6;
+  }
+}
+
 void CopyRow_C(const uint8* src, uint8* dst, int count) {
   memcpy(dst, src, count);
 }
diff --git a/source/row_gcc.cc b/source/row_gcc.cc
index 86f0880be..3af320454 100644
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -2753,6 +2753,48 @@ void MergeUVRow_SSE2(const uint8* src_u,
 }
 #endif  // HAS_MERGEUVROW_SSE2
 
+#ifdef HAS_MERGEUV10ROW_AVX2
+void MergeUV10Row_AVX2(const uint16* src_u,
+                       const uint16* src_v,
+                       uint16* dst_uv,
+                       int width) {
+  asm volatile(
+      "sub          %0,%1                        \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu      (%0),%%ymm0                  \n"
+      "vmovdqu      (%0,%1,1),%%ymm1             \n"
+      "add          $0x20,%0                     \n"
+      "vpsllw       $0x6,%%ymm0,%%ymm0           \n"
+      "vpsllw       $0x6,%%ymm1,%%ymm1           \n"
+      // "vpermq       $0xd8,%%ymm0,%%ymm0          \n"
+      // "vpermq       $0xd8,%%ymm1,%%ymm1          \n"
+      "vpunpcklwd   %%ymm1,%%ymm0,%%ymm2         \n"
+      "vpunpckhwd   %%ymm1,%%ymm0,%%ymm0         \n"
+
+      // "vmovdqu      %%ymm2,(%2)                  \n"
+      // "vmovdqu      %%ymm0,0x20(%2)              \n"
+
+      "vextractf128 $0x0,%%ymm2,(%2)             \n"
+      "vextractf128 $0x0,%%ymm0,0x10(%2)         \n"
+      "vextractf128 $0x1,%%ymm2,0x20(%2)         \n"
+      "vextractf128 $0x1,%%ymm0,0x30(%2)         \n"
+      "add          $0x40,%2                     \n"
+      "sub          $0x10,%3                     \n"
+      "jg           1b                           \n"
+      "vzeroupper                                \n"
+      : "+r"(src_u),   // %0
+        "+r"(src_v),   // %1
+        "+r"(dst_uv),  // %2
+        "+r"(width)    // %3
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm2"
+  );
+}
+#endif  // HAS_MERGEUV10ROW_AVX2
+
+
 #ifdef HAS_SPLITRGBROW_SSSE3
 // Shuffle table for converting RGB to Planar.
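Note on the AVX2 store pattern: vpunpcklwd/vpunpckhwd on ymm registers interleave within each 128-bit lane rather than across the whole register, so after the unpacks ymm2 holds pixels 0-3 and 8-11 while ymm0 holds pixels 4-7 and 12-15. The four vextractf128 stores write those lanes back in pixel order; the commented-out vpermq $0xd8 shuffles followed by two full-width vmovdqu stores are the equivalent alternative. The sketch below is an illustrative intrinsics rendering of the same 16-pixels-per-iteration loop, assuming width is a multiple of 16; the function name is hypothetical and not part of this patch.

#include <immintrin.h>
#include <stdint.h>

// Illustrative only: mirrors the data movement of MergeUV10Row_AVX2 above.
static void MergeUV10Row_Intrinsics(const uint16_t* src_u,
                                    const uint16_t* src_v,
                                    uint16_t* dst_uv,
                                    int width) {
  for (int x = 0; x < width; x += 16) {
    // Load 16 U and 16 V samples, shift the 10-bit values into the top bits.
    __m256i u = _mm256_slli_epi16(
        _mm256_loadu_si256((const __m256i*)(src_u + x)), 6);
    __m256i v = _mm256_slli_epi16(
        _mm256_loadu_si256((const __m256i*)(src_v + x)), 6);
    // Per-lane interleave: lo = pixels 0-3 | 8-11, hi = pixels 4-7 | 12-15.
    __m256i lo = _mm256_unpacklo_epi16(u, v);
    __m256i hi = _mm256_unpackhi_epi16(u, v);
    // Store the four 128-bit lanes in pixel order (the vextractf128 pattern).
    _mm_storeu_si128((__m128i*)(dst_uv + 2 * x),
                     _mm256_castsi256_si128(lo));
    _mm_storeu_si128((__m128i*)(dst_uv + 2 * x + 8),
                     _mm256_castsi256_si128(hi));
    _mm_storeu_si128((__m128i*)(dst_uv + 2 * x + 16),
                     _mm256_extracti128_si256(lo, 1));
    _mm_storeu_si128((__m128i*)(dst_uv + 2 * x + 24),
                     _mm256_extracti128_si256(hi, 1));
  }
}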
diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc
index 04591fbcf..1cbd13f8b 100644
--- a/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
@@ -2617,6 +2617,48 @@ TEST_F(LibYUVPlanarTest, SplitRGBPlane_Opt) {
   free_aligned_buffer_page_end(dst_pixels_c);
 }
 
+// TODO(fbarchard): improve test for platforms and cpu detect
+#ifdef HAS_MERGEUV10ROW_AVX2
+TEST_F(LibYUVPlanarTest, MergeUV10Row_Opt) {
+  const int kPixels = benchmark_width_ * benchmark_height_;
+  align_buffer_page_end(src_pixels_u, kPixels * 2);
+  align_buffer_page_end(src_pixels_v, kPixels * 2);
+  align_buffer_page_end(dst_pixels_uv_opt, kPixels * 2 * 2);
+  align_buffer_page_end(dst_pixels_uv_c, kPixels * 2 * 2);
+
+  MemRandomize(src_pixels_u, kPixels * 2);
+  MemRandomize(src_pixels_v, kPixels * 2);
+  memset(dst_pixels_uv_opt, 0, kPixels * 2 * 2);
+  memset(dst_pixels_uv_c, 1, kPixels * 2 * 2);
+
+  MergeUV10Row_C(reinterpret_cast<const uint16*>(src_pixels_u),
+                 reinterpret_cast<const uint16*>(src_pixels_v),
+                 reinterpret_cast<uint16*>(dst_pixels_uv_c), kPixels);
+
+  int has_avx2 = TestCpuFlag(kCpuHasAVX2);
+  for (int i = 0; i < benchmark_iterations_; ++i) {
+    if (has_avx2) {
+      MergeUV10Row_AVX2(reinterpret_cast<const uint16*>(src_pixels_u),
+                        reinterpret_cast<const uint16*>(src_pixels_v),
+                        reinterpret_cast<uint16*>(dst_pixels_uv_opt), kPixels);
+    } else {
+      MergeUV10Row_C(reinterpret_cast<const uint16*>(src_pixels_u),
+                     reinterpret_cast<const uint16*>(src_pixels_v),
+                     reinterpret_cast<uint16*>(dst_pixels_uv_opt), kPixels);
+    }
+  }
+
+  for (int i = 0; i < kPixels * 2 * 2; ++i) {
+    EXPECT_EQ(dst_pixels_uv_opt[i], dst_pixels_uv_c[i]);
+  }
+
+  free_aligned_buffer_page_end(src_pixels_u);
+  free_aligned_buffer_page_end(src_pixels_v);
+  free_aligned_buffer_page_end(dst_pixels_uv_opt);
+  free_aligned_buffer_page_end(dst_pixels_uv_c);
+}
+#endif
+
 float TestScaleMaxSamples(int benchmark_width,
                           int benchmark_height,
                           int benchmark_iterations,
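For reference, both paths produce a P010-style layout: each 10-bit sample is shifted into the top of its 16-bit word (bits 6-15), with U and V interleaved. Below is a minimal standalone check of the C reference behavior, assuming the declaration added to include/libyuv/row.h is visible; the driver itself is hypothetical and not part of this patch.

#include <assert.h>
#include "libyuv/row.h"

int main() {
  const uint16 u[3] = {0x001, 0x200, 0x3FF};  // 10-bit input samples
  const uint16 v[3] = {0x3FF, 0x100, 0x000};
  uint16 uv[6] = {0};
  MergeUV10Row_C(u, v, uv, 3);  // odd width exercises the tail pixel
  assert(uv[0] == 0x0040 && uv[1] == 0xFFC0);  // 0x001 << 6, 0x3FF << 6
  assert(uv[2] == 0x8000 && uv[3] == 0x4000);  // 0x200 << 6, 0x100 << 6
  assert(uv[4] == 0xFFC0 && uv[5] == 0x0000);  // last (odd) pixel written once
  return 0;
}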