From a0c32b9e495d8be4083e943bcb32a5dc99214e12 Mon Sep 17 00:00:00 2001
From: Frank Barchard
Date: Thu, 2 Nov 2017 19:52:44 -0700
Subject: [PATCH] MergeUV10Row_AVX2 for converting H010 to P010

H010 is 10 bit planar format with 10 bits in lower bits.
P010 is 10 bit biplanar format with 10 bits in upper bits.
This function weaves the U and V channels and shifts the bits
into the upper bits.

Bug: libyuv:751
Test: LibYUVPlanarTest.MergeUV10Row_Opt
Change-Id: I4a0bac0ef1ff95aa1b8d68261ec8e8e86f2d1fbf
Reviewed-on: https://chromium-review.googlesource.com/752692
Reviewed-by: Cheng Wang
Reviewed-by: Frank Barchard
Commit-Queue: Frank Barchard
---
 include/libyuv/row.h     | 19 +++++++++++++++++-
 source/row_common.cc     | 18 +++++++++++++++++
 source/row_gcc.cc        | 42 ++++++++++++++++++++++++++++++++++++++++
 unit_test/planar_test.cc | 42 ++++++++++++++++++++++++++++++++++++++++
 4 files changed, 120 insertions(+), 1 deletion(-)

diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 0b1e01744..5ccf94d9d 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -271,7 +271,7 @@ extern "C" {
 #define HAS_I422TOARGBROW_SSSE3
 #endif
 
-// The following are available forr gcc/clang x86 platforms:
+// The following are available for gcc/clang x86 platforms:
 // TODO(fbarchard): Port to Visual C
 #if !defined(LIBYUV_DISABLE_X86) && \
     (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
@@ -279,6 +279,14 @@ extern "C" {
 #define HAS_SPLITRGBROW_SSSE3
 #endif
 
+// The following are available for AVX2 gcc/clang x86 platforms:
+// TODO(fbarchard): Port to Visual C
+#if !defined(LIBYUV_DISABLE_X86) &&                                       \
+    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) && \
+    (defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
+#define HAS_MERGEUV10ROW_AVX2
+#endif
+
 // The following are available on Neon platforms:
 #if !defined(LIBYUV_DISABLE_NEON) && \
     (defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON))
@@ -1523,6 +1531,15 @@ void MergeRGBRow_Any_NEON(const uint8* src_r,
                           uint8* dst_rgb,
                           int width);
 
+void MergeUV10Row_C(const uint16* src_u,
+                    const uint16* src_v,
+                    uint16* dst_uv,
+                    int width);
+void MergeUV10Row_AVX2(const uint16* src_u,
+                       const uint16* src_v,
+                       uint16* dst_uv,
+                       int width);
+
 void CopyRow_SSE2(const uint8* src, uint8* dst, int count);
 void CopyRow_AVX(const uint8* src, uint8* dst, int count);
 void CopyRow_ERMS(const uint8* src, uint8* dst, int count);
diff --git a/source/row_common.cc b/source/row_common.cc
index 2d01a789b..c3294ece5 100644
--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -1798,6 +1798,24 @@ void MergeRGBRow_C(const uint8* src_r,
   }
 }
 
+void MergeUV10Row_C(const uint16* src_u,
+                    const uint16* src_v,
+                    uint16* dst_uv,
+                    int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    dst_uv[0] = src_u[x] << 6;
+    dst_uv[1] = src_v[x] << 6;
+    dst_uv[2] = src_u[x + 1] << 6;
+    dst_uv[3] = src_v[x + 1] << 6;
+    dst_uv += 4;
+  }
+  if (width & 1) {
+    dst_uv[0] = src_u[width - 1] << 6;
+    dst_uv[1] = src_v[width - 1] << 6;
+  }
+}
+
 void CopyRow_C(const uint8* src, uint8* dst, int count) {
   memcpy(dst, src, count);
 }
diff --git a/source/row_gcc.cc b/source/row_gcc.cc
index 86f0880be..3af320454 100644
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -2753,6 +2753,48 @@ void MergeUVRow_SSE2(const uint8* src_u,
 }
 #endif  // HAS_MERGEUVROW_SSE2
 
+#ifdef HAS_MERGEUV10ROW_AVX2
+void MergeUV10Row_AVX2(const uint16* src_u,
+                       const uint16* src_v,
+                       uint16* dst_uv,
+                       int width) {
+  asm volatile (
+    "sub       %0,%1                             \n"
+
+    LABELALIGN
+    "1:                                          \n"
+    "vmovdqu   (%0),%%ymm0                       \n"
+    "vmovdqu   (%0,%1,1),%%ymm1                  \n"
+    "add       $0x20,%0                          \n"
+    "vpsllw    $0x6,%%ymm0,%%ymm0                \n"
+    "vpsllw    $0x6,%%ymm1,%%ymm1                \n"
+    // vpunpck{l,h}wd interleave U/V words within each 128-bit lane only, so
+    // each lane of the two results is stored separately below.
+    "vpunpcklwd %%ymm1,%%ymm0,%%ymm2             \n"
+    "vpunpckhwd %%ymm1,%%ymm0,%%ymm0             \n"
+
+    // Store order: ymm2 low, ymm0 low, ymm2 high, ymm0 high. Four 16-byte
+    // stores = 64 bytes = 16 UV pairs per loop iteration.
+
+    "vextractf128 $0x0,%%ymm2,(%2)               \n"
+    "vextractf128 $0x0,%%ymm0,0x10(%2)           \n"
+    "vextractf128 $0x1,%%ymm2,0x20(%2)           \n"
+    "vextractf128 $0x1,%%ymm0,0x30(%2)           \n"
+    "add       $0x40,%2                          \n"
+    "sub       $0x10,%3                          \n"
+    "jg        1b                                \n"
+    "vzeroupper                                  \n"
+  : "+r"(src_u),   // %0
+    "+r"(src_v),   // %1
+    "+r"(dst_uv),  // %2
+    "+r"(width)    // %3
+  :
+  : "memory", "cc", "xmm0", "xmm1", "xmm2"
+  );
+}
+#endif  // HAS_MERGEUV10ROW_AVX2
+
+
 #ifdef HAS_SPLITRGBROW_SSSE3
 
 // Shuffle table for converting RGB to Planar.
diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc
index 04591fbcf..1cbd13f8b 100644
--- a/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
@@ -2617,6 +2617,48 @@ TEST_F(LibYUVPlanarTest, SplitRGBPlane_Opt) {
   free_aligned_buffer_page_end(dst_pixels_c);
 }
 
+// TODO(fbarchard): improve test for platforms and cpu detect
+#ifdef HAS_MERGEUV10ROW_AVX2
+TEST_F(LibYUVPlanarTest, MergeUV10Row_Opt) {
+  const int kPixels = benchmark_width_ * benchmark_height_;
+  align_buffer_page_end(src_pixels_u, kPixels * 2);
+  align_buffer_page_end(src_pixels_v, kPixels * 2);
+  align_buffer_page_end(dst_pixels_uv_opt, kPixels * 2 * 2);
+  align_buffer_page_end(dst_pixels_uv_c, kPixels * 2 * 2);
+
+  MemRandomize(src_pixels_u, kPixels * 2);
+  MemRandomize(src_pixels_v, kPixels * 2);
+  memset(dst_pixels_uv_opt, 0, kPixels * 2 * 2);
+  memset(dst_pixels_uv_c, 1, kPixels * 2 * 2);
+
+  MergeUV10Row_C(reinterpret_cast<const uint16*>(src_pixels_u),
+                 reinterpret_cast<const uint16*>(src_pixels_v),
+                 reinterpret_cast<uint16*>(dst_pixels_uv_c), kPixels);
+
+  int has_avx2 = TestCpuFlag(kCpuHasAVX2);
+  for (int i = 0; i < benchmark_iterations_; ++i) {
+    if (has_avx2) {
+      MergeUV10Row_AVX2(reinterpret_cast<const uint16*>(src_pixels_u),
+                        reinterpret_cast<const uint16*>(src_pixels_v),
+                        reinterpret_cast<uint16*>(dst_pixels_uv_opt), kPixels);
+    } else {
+      MergeUV10Row_C(reinterpret_cast<const uint16*>(src_pixels_u),
+                     reinterpret_cast<const uint16*>(src_pixels_v),
+                     reinterpret_cast<uint16*>(dst_pixels_uv_opt), kPixels);
+    }
+  }
+
+  for (int i = 0; i < kPixels * 2 * 2; ++i) {
+    EXPECT_EQ(dst_pixels_uv_opt[i], dst_pixels_uv_c[i]);
+  }
+
+  free_aligned_buffer_page_end(src_pixels_u);
+  free_aligned_buffer_page_end(src_pixels_v);
+  free_aligned_buffer_page_end(dst_pixels_uv_opt);
+  free_aligned_buffer_page_end(dst_pixels_uv_c);
+}
+#endif
+
 float TestScaleMaxSamples(int benchmark_width,
                           int benchmark_height,
                           int benchmark_iterations,