diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 96973da67..507509519 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -277,7 +277,7 @@ extern "C" {
 #if !defined(LIBYUV_DISABLE_X86) && \
     (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) && \
     (defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
-#define HAS_MERGEUV10ROW_AVX2
+#define HAS_MERGEUVROW_16_AVX2
 #endif
 
 // The following are available on Neon platforms:
@@ -1521,14 +1521,16 @@ void MergeRGBRow_Any_NEON(const uint8* src_r,
                           uint8* dst_rgb,
                           int width);
 
-void MergeUV10Row_C(const uint16* src_u,
-                    const uint16* src_v,
-                    uint16* dst_uv,
-                    int width);
-void MergeUV10Row_AVX2(const uint16* src_u,
-                       const uint16* src_v,
-                       uint16* dst_uv,
-                       int width);
+void MergeUVRow_16_C(const uint16* src_u,
+                     const uint16* src_v,
+                     uint16* dst_uv,
+                     int scale, /* 64 for 10 bit */
+                     int width);
+void MergeUVRow_16_AVX2(const uint16* src_u,
+                        const uint16* src_v,
+                        uint16* dst_uv,
+                        int scale,
+                        int width);
 
 void CopyRow_SSE2(const uint8* src, uint8* dst, int count);
 void CopyRow_AVX(const uint8* src, uint8* dst, int count);
diff --git a/source/row_common.cc b/source/row_common.cc
index c3294ece5..8612665e5 100644
--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -1798,21 +1798,22 @@ void MergeRGBRow_C(const uint8* src_r,
   }
 }
 
-void MergeUV10Row_C(const uint16* src_u,
-                    const uint16* src_v,
-                    uint16* dst_uv,
-                    int width) {
+void MergeUVRow_16_C(const uint16* src_u,
+                     const uint16* src_v,
+                     uint16* dst_uv,
+                     int scale,
+                     int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
-    dst_uv[0] = src_u[x] << 6;
-    dst_uv[1] = src_v[x] << 6;
-    dst_uv[2] = src_u[x + 1] << 6;
-    dst_uv[3] = src_v[x + 1] << 6;
+    dst_uv[0] = src_u[x] * scale;
+    dst_uv[1] = src_v[x] * scale;
+    dst_uv[2] = src_u[x + 1] * scale;
+    dst_uv[3] = src_v[x + 1] * scale;
     dst_uv += 4;
   }
   if (width & 1) {
-    dst_uv[0] = src_u[width - 1] << 6;
-    dst_uv[1] = src_v[width - 1] << 6;
+    dst_uv[0] = src_u[width - 1] * scale;
+    dst_uv[1] = src_v[width - 1] * scale;
   }
 }
 
diff --git a/source/row_gcc.cc b/source/row_gcc.cc
index ff2e8a378..ecb77983e 100644
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -2753,13 +2753,23 @@ void MergeUVRow_SSE2(const uint8* src_u,
 }
 #endif  // HAS_MERGEUVROW_SSE2
 
-#ifdef HAS_MERGEUV10ROW_AVX2
-void MergeUV10Row_AVX2(const uint16* src_u,
-                       const uint16* src_v,
-                       uint16* dst_uv,
-                       int width) {
+// Use scale to convert lsb formats to msb, depending how many bits there are:
+// 128 = 9 bits
+// 64 = 10 bits
+// 16 = 12 bits
+// 1 = 16 bits
+
+#ifdef HAS_MERGEUVROW_16_AVX2
+void MergeUVRow_16_AVX2(const uint16* src_u,
+                        const uint16* src_v,
+                        uint16* dst_uv,
+                        int scale,
+                        int width) {
   // clang-format off
   asm volatile (
+    "vmovd      %4,%%xmm3                      \n"
+    "vpunpcklwd %%xmm3,%%xmm3,%%xmm3           \n"
+    "vbroadcastss %%xmm3,%%ymm3                \n"
     "sub        %0,%1                          \n"
 
     // 16 pixels per loop.
@@ -2768,8 +2778,9 @@ void MergeUV10Row_AVX2(const uint16* src_u,
     "vmovdqu    (%0),%%ymm0                    \n"
     "vmovdqu    (%0,%1,1),%%ymm1               \n"
     "add        $0x20,%0                       \n"
-    "vpsllw     $0x6,%%ymm0,%%ymm0             \n"
-    "vpsllw     $0x6,%%ymm1,%%ymm1             \n"
+
+    "vpmullw    %%ymm3,%%ymm0,%%ymm0           \n"
+    "vpmullw    %%ymm3,%%ymm1,%%ymm1           \n"
     "vpunpcklwd %%ymm1,%%ymm0,%%ymm2           \n"  // mutates
     "vpunpckhwd %%ymm1,%%ymm0,%%ymm0           \n"
     "vextractf128 $0x0,%%ymm2,(%2)             \n"
@@ -2784,8 +2795,8 @@ void MergeUV10Row_AVX2(const uint16* src_u,
       "+r"(src_v),   // %1
       "+r"(dst_uv),  // %2
       "+r"(width)    // %3
-  :
-  : "memory", "cc", "xmm0", "xmm1", "xmm2");
+  : "r"(scale)       // %4
+  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
   // clang-format on
 }
 #endif  // HAS_MERGEUVROW_AVX2
diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc
index 1cbd13f8b..34414c9fc 100644
--- a/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
@@ -2618,8 +2618,8 @@ TEST_F(LibYUVPlanarTest, SplitRGBPlane_Opt) {
 }
 
 // TODO(fbarchard): improve test for platforms and cpu detect
-#ifdef HAS_MERGEUV10ROW_AVX2
-TEST_F(LibYUVPlanarTest, MergeUV10Row_Opt) {
+#ifdef HAS_MERGEUVROW_16_AVX2
+TEST_F(LibYUVPlanarTest, MergeUVRow_16_Opt) {
   const int kPixels = benchmark_width_ * benchmark_height_;
   align_buffer_page_end(src_pixels_u, kPixels * 2);
   align_buffer_page_end(src_pixels_v, kPixels * 2);
@@ -2631,20 +2631,22 @@ TEST_F(LibYUVPlanarTest, MergeUV10Row_Opt) {
   memset(dst_pixels_uv_opt, 0, kPixels * 2 * 2);
   memset(dst_pixels_uv_c, 1, kPixels * 2 * 2);
 
-  MergeUV10Row_C(reinterpret_cast<const uint16*>(src_pixels_u),
-                 reinterpret_cast<const uint16*>(src_pixels_v),
-                 reinterpret_cast<uint16*>(dst_pixels_uv_c), kPixels);
+  MergeUVRow_16_C(reinterpret_cast<const uint16*>(src_pixels_u),
+                  reinterpret_cast<const uint16*>(src_pixels_v),
+                  reinterpret_cast<uint16*>(dst_pixels_uv_c), 64, kPixels);
 
   int has_avx2 = TestCpuFlag(kCpuHasAVX2);
   for (int i = 0; i < benchmark_iterations_; ++i) {
     if (has_avx2) {
-      MergeUV10Row_AVX2(reinterpret_cast<const uint16*>(src_pixels_u),
-                        reinterpret_cast<const uint16*>(src_pixels_v),
-                        reinterpret_cast<uint16*>(dst_pixels_uv_opt), kPixels);
+      MergeUVRow_16_AVX2(reinterpret_cast<const uint16*>(src_pixels_u),
+                         reinterpret_cast<const uint16*>(src_pixels_v),
+                         reinterpret_cast<uint16*>(dst_pixels_uv_opt), 64,
+                         kPixels);
     } else {
-      MergeUV10Row_C(reinterpret_cast<const uint16*>(src_pixels_u),
-                     reinterpret_cast<const uint16*>(src_pixels_v),
-                     reinterpret_cast<uint16*>(dst_pixels_uv_opt), kPixels);
+      MergeUVRow_16_C(reinterpret_cast<const uint16*>(src_pixels_u),
+                      reinterpret_cast<const uint16*>(src_pixels_v),
+                      reinterpret_cast<uint16*>(dst_pixels_uv_opt), 64,
+                      kPixels);
     }
   }
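
Note on the new scale argument (an illustration, not part of the patch): the
old code shifted 10-bit lsb-aligned samples into msb position with a fixed
"<< 6"; the new code multiplies by a caller-supplied scale, so
scale = 65536 >> N covers any N-bit lsb-aligned format (128 for 9 bits, 64
for 10 bits, 16 for 12 bits, 1 for 16 bits, per the comment added in
row_gcc.cc). A minimal standalone sketch of the arithmetic, using standard
uint16_t in place of libyuv's uint16 typedef:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  const uint16_t sample = 0x03FF;   // max 10-bit value, lsb-aligned
  const int bits = 10;
  const int scale = 65536 >> bits;  // 64, matching "64 for 10 bit"
  const uint16_t msb = (uint16_t)(sample * scale);
  // Every supported scale is a power of two, so the multiply produces the
  // same result as the old fixed shift; the AVX2 path's vpmullw relies on
  // this same identity.
  assert(msb == (uint16_t)(sample << 6));
  printf("lsb 0x%04X -> msb 0x%04X (scale %d)\n", sample, msb, scale);
  return 0;
}

Because the scale is broadcast into ymm3 once before the loop (vmovd,
vpunpcklwd, vbroadcastss), the AVX2 path trades the immediate vpsllw for one
vpmullw per register with no extra per-pixel work, which is what lets a
single routine serve 9-, 10-, 12- and 16-bit inputs.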