From 49d1e3b0363b28b8fbfacb680b42aa8d8db09ace Mon Sep 17 00:00:00 2001
From: Frank Barchard
Date: Thu, 9 Nov 2017 21:09:42 -0800
Subject: [PATCH] MultiplyRow_16_AVX2 for converting 10 bit YUV

When converting from lsb 10 bit formats to msb, the values need to be
shifted to the top 10 bits.  Using a multiply allows the different
numbers of bits to be copied:
// 128 = 9 bits
// 64 = 10 bits
// 16 = 12 bits
// 1 = 16 bits

Bug: libyuv:751
Test: LibYUVPlanarTest.MultiplyRow_16_Opt
Change-Id: I9cf226053a164baa14155215cb175065b1c4f169
Reviewed-on: https://chromium-review.googlesource.com/762951
Reviewed-by: richard winterton
Reviewed-by: Frank Barchard
Commit-Queue: Frank Barchard
---
 include/libyuv/row.h     | 10 ++++++++++
 source/row_common.cc     | 10 ++++++++++
 source/row_gcc.cc        | 36 +++++++++++++++++++++++++++++++++++-
 unit_test/planar_test.cc | 38 ++++++++++++++++++++++++++++++++++++++
 4 files changed, 93 insertions(+), 1 deletion(-)

diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 507509519..fc5caba4c 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -278,6 +278,7 @@ extern "C" {
     (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) && \
     (defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
 #define HAS_MERGEUVROW_16_AVX2
+#define HAS_MULTIPLYROW_16_AVX2
 #endif
 
 // The following are available on Neon platforms:
@@ -1532,6 +1533,15 @@ void MergeUVRow_16_AVX2(const uint16* src_u,
                         int scale,
                         int width);
 
+void MultiplyRow_16_AVX2(const uint16* src_y,
+                         uint16* dst_y,
+                         int scale,
+                         int width);
+void MultiplyRow_16_C(const uint16* src_y,
+                      uint16* dst_y,
+                      int scale,
+                      int width);
+
 void CopyRow_SSE2(const uint8* src, uint8* dst, int count);
 void CopyRow_AVX(const uint8* src, uint8* dst, int count);
 void CopyRow_ERMS(const uint8* src, uint8* dst, int count);
diff --git a/source/row_common.cc b/source/row_common.cc
index 8612665e5..6ffc4febb 100644
--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -1817,6 +1817,16 @@ void MergeUVRow_16_C(const uint16* src_u,
   }
 }
 
+void MultiplyRow_16_C(const uint16* src_y,
+                      uint16* dst_y,
+                      int scale,
+                      int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    dst_y[x] = src_y[x] * scale;
+  }
+}
+
 void CopyRow_C(const uint8* src, uint8* dst, int count) {
   memcpy(dst, src, count);
 }
diff --git a/source/row_gcc.cc b/source/row_gcc.cc
index ecb77983e..ca220a22b 100644
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -2758,7 +2758,6 @@ void MergeUVRow_SSE2(const uint8* src_u,
 // 64 = 10 bits
 // 16 = 12 bits
 // 1 = 16 bits
-
 #ifdef HAS_MERGEUVROW_16_AVX2
 void MergeUVRow_16_AVX2(const uint16* src_u,
                         const uint16* src_v,
@@ -2801,6 +2800,41 @@ void MergeUVRow_16_AVX2(const uint16* src_u,
 }
 #endif  // HAS_MERGEUVROW_AVX2
 
+
+#ifdef HAS_MULTIPLYROW_16_AVX2
+void MultiplyRow_16_AVX2(const uint16* src_y,
+                         uint16* dst_y,
+                         int scale,
+                         int width) {
+  // clang-format off
+  asm volatile (
+    "vmovd        %3,%%xmm3                    \n"
+    "vpunpcklwd   %%xmm3,%%xmm3,%%xmm3         \n"
+    "vbroadcastss %%xmm3,%%ymm3                \n"
+    "sub          %0,%1                        \n"
+
+    // 16 pixels per loop.
+    LABELALIGN
+    "1:                                        \n"
+    "vmovdqu      (%0),%%ymm0                  \n"
+    "vmovdqu      0x20(%0),%%ymm1              \n"
+    "vpmullw      %%ymm3,%%ymm0,%%ymm0         \n"
+    "vpmullw      %%ymm3,%%ymm1,%%ymm1         \n"
+    "vmovdqu      %%ymm0,(%0,%1)               \n"
+    "vmovdqu      %%ymm1,0x20(%0,%1)           \n"
+    "add          $0x40,%0                     \n"
+    "sub          $0x20,%2                     \n"
+    "jg           1b                           \n"
+    "vzeroupper                                \n"
+  : "+r"(src_y),   // %0
+    "+r"(dst_y),   // %1
+    "+r"(width)    // %2
+  : "r"(scale)     // %3
+  : "memory", "cc", "xmm0", "xmm1", "xmm3");
+  // clang-format on
+}
+#endif  // HAS_MULTIPLYROW_16_AVX2
+
 #ifdef HAS_SPLITRGBROW_SSSE3
 
 // Shuffle table for converting RGB to Planar.
diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc
index 34414c9fc..f9e6f8abb 100644
--- a/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
@@ -2661,6 +2661,44 @@ TEST_F(LibYUVPlanarTest, MergeUVRow_16_Opt) {
 }
 #endif
 
+// TODO(fbarchard): improve test for platforms and cpu detect
+#ifdef HAS_MULTIPLYROW_16_AVX2
+TEST_F(LibYUVPlanarTest, MultiplyRow_16_Opt) {
+  const int kPixels = benchmark_width_ * benchmark_height_;
+  align_buffer_page_end(src_pixels_y, kPixels * 2);
+  align_buffer_page_end(dst_pixels_y_opt, kPixels * 2);
+  align_buffer_page_end(dst_pixels_y_c, kPixels * 2);
+
+  MemRandomize(src_pixels_y, kPixels * 2);
+  memset(dst_pixels_y_opt, 0, kPixels * 2);
+  memset(dst_pixels_y_c, 1, kPixels * 2);
+
+  MultiplyRow_16_C(reinterpret_cast<const uint16*>(src_pixels_y),
+                   reinterpret_cast<uint16*>(dst_pixels_y_c), 64, kPixels);
+
+  int has_avx2 = TestCpuFlag(kCpuHasAVX2);
+  for (int i = 0; i < benchmark_iterations_; ++i) {
+    if (has_avx2) {
+      MultiplyRow_16_AVX2(reinterpret_cast<const uint16*>(src_pixels_y),
+                          reinterpret_cast<uint16*>(dst_pixels_y_opt), 64,
+                          kPixels);
+    } else {
+      MultiplyRow_16_C(reinterpret_cast<const uint16*>(src_pixels_y),
+                       reinterpret_cast<uint16*>(dst_pixels_y_opt), 64,
+                       kPixels);
+    }
+  }
+
+  for (int i = 0; i < kPixels * 2; ++i) {
+    EXPECT_EQ(dst_pixels_y_opt[i], dst_pixels_y_c[i]);
+  }
+
+  free_aligned_buffer_page_end(src_pixels_y);
+  free_aligned_buffer_page_end(dst_pixels_y_opt);
+  free_aligned_buffer_page_end(dst_pixels_y_c);
+}
+#endif
+
 float TestScaleMaxSamples(int benchmark_width,
                           int benchmark_height,
                           int benchmark_iterations,
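
The scale factors in the commit message work because multiplying by a power of two is a
left shift: scale 64 moves 10-bit lsb-aligned samples into the top 10 bits of a 16-bit
word (128 does the same for 9-bit, 16 for 12-bit, 1 leaves 16-bit data unchanged). Below
is a minimal standalone sketch of that conversion, not taken from the patch itself; it
uses plain scalar C with uint16_t instead of libyuv's uint16 typedef so it compiles on
its own, and the helper name MultiplyRow16Ref is made up for illustration.

  #include <assert.h>
  #include <stdint.h>

  /* Scalar reference: dst = src * scale, mirroring MultiplyRow_16_C above.      */
  /* scale 128 = 9-bit input, 64 = 10-bit, 16 = 12-bit, 1 = 16-bit passthrough.  */
  static void MultiplyRow16Ref(const uint16_t* src, uint16_t* dst,
                               int scale, int width) {
    for (int x = 0; x < width; ++x) {
      dst[x] = (uint16_t)(src[x] * scale);
    }
  }

  int main(void) {
    uint16_t src = 0x03FF;  /* largest 10-bit value, lsb aligned */
    uint16_t dst = 0;
    MultiplyRow16Ref(&src, &dst, 64, 1);
    assert(dst == 0xFFC0);  /* 0x03FF << 6: value now occupies the top 10 bits */
    return 0;
  }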