diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index ea0186dd1..2b8aabfb6 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -270,6 +270,7 @@ extern "C" {
     (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
 #define HAS_ARGBTOAR30ROW_SSSE3
 #define HAS_CONVERT16TO8ROW_SSSE3
+#define HAS_CONVERT8TO16ROW_SSE2
 #define HAS_MERGERGBROW_SSSE3
 #define HAS_SPLITRGBROW_SSSE3
 #endif
@@ -281,6 +282,7 @@ extern "C" {
     (defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
 #define HAS_ARGBTOAR30ROW_AVX2
 #define HAS_CONVERT16TO8ROW_AVX2
+#define HAS_CONVERT8TO16ROW_AVX2
 #define HAS_MERGEUVROW_16_AVX2
 #define HAS_MULTIPLYROW_16_AVX2
 #endif
@@ -1428,6 +1430,24 @@ void MultiplyRow_16_AVX2(const uint16* src_y,
                          int width);
 void MultiplyRow_16_C(const uint16* src_y, uint16* dst_y, int scale, int width);
 
+void Convert8To16Row_C(const uint8* src_y, uint16* dst_y, int scale, int width);
+void Convert8To16Row_SSE2(const uint8* src_y,
+                          uint16* dst_y,
+                          int scale,
+                          int width);
+void Convert8To16Row_AVX2(const uint8* src_y,
+                          uint16* dst_y,
+                          int scale,
+                          int width);
+void Convert8To16Row_Any_SSE2(const uint8* src_y,
+                              uint16* dst_y,
+                              int scale,
+                              int width);
+void Convert8To16Row_Any_AVX2(const uint8* src_y,
+                              uint16* dst_y,
+                              int scale,
+                              int width);
+
 void Convert16To8Row_C(const uint16* src_y, uint8* dst_y, int scale, int width);
 void Convert16To8Row_SSSE3(const uint16* src_y,
                            uint8* dst_y,
diff --git a/source/row_any.cc b/source/row_any.cc
index cc9fb50c9..64695288a 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -699,26 +699,38 @@ ANY11P(ARGBShuffleRow_Any_MSA, ARGBShuffleRow_MSA, const uint8*, 4, 4, 7)
 #undef ANY11P
 
 // Any 1 to 1 with parameter and shorts. BPP measures in shorts.
-#define ANY11C(NAMEANY, ANY_SIMD, SBPP, BPP, MASK)                            \
-  void NAMEANY(const uint16* src_ptr, uint8* dst_ptr, int scale, int width) { \
-    SIMD_ALIGNED(uint16 temp[32]);                                            \
-    SIMD_ALIGNED(uint8 out[32]);                                              \
-    memset(temp, 0, 64); /* for msan */                                       \
-    int r = width & MASK;                                                     \
-    int n = width & ~MASK;                                                    \
-    if (n > 0) {                                                              \
-      ANY_SIMD(src_ptr, dst_ptr, scale, n);                                   \
-    }                                                                         \
-    memcpy(temp, src_ptr + n, r * SBPP);                                      \
-    ANY_SIMD(temp, out, scale, MASK + 1);                                     \
-    memcpy(dst_ptr + n, out, r * BPP);                                        \
+#define ANY11C(NAMEANY, ANY_SIMD, SBPP, BPP, STYPE, DTYPE, MASK)              \
+  void NAMEANY(const STYPE* src_ptr, DTYPE* dst_ptr, int scale, int width) {  \
+    SIMD_ALIGNED(STYPE temp[32]);                                             \
+    SIMD_ALIGNED(DTYPE out[32]);                                              \
+    memset(temp, 0, 32 * SBPP); /* for msan */                                \
+    int r = width & MASK;                                                     \
+    int n = width & ~MASK;                                                    \
+    if (n > 0) {                                                              \
+      ANY_SIMD(src_ptr, dst_ptr, scale, n);                                   \
+    }                                                                         \
+    memcpy(temp, src_ptr + n, r * SBPP);                                      \
+    ANY_SIMD(temp, out, scale, MASK + 1);                                     \
+    memcpy(dst_ptr + n, out, r * BPP);                                        \
   }
 
 #ifdef HAS_CONVERT16TO8ROW_SSSE3
-ANY11C(Convert16To8Row_Any_SSSE3, Convert16To8Row_SSSE3, 2, 1, 15)
+ANY11C(Convert16To8Row_Any_SSSE3,
+       Convert16To8Row_SSSE3,
+       2,
+       1,
+       uint16,
+       uint8,
+       15)
 #endif
 #ifdef HAS_CONVERT16TO8ROW_AVX2
-ANY11C(Convert16To8Row_Any_AVX2, Convert16To8Row_AVX2, 2, 1, 31)
+ANY11C(Convert16To8Row_Any_AVX2, Convert16To8Row_AVX2, 2, 1, uint16, uint8, 31)
+#endif
+#ifdef HAS_CONVERT8TO16ROW_SSE2
+ANY11C(Convert8To16Row_Any_SSE2, Convert8To16Row_SSE2, 1, 2, uint8, uint16, 15)
+#endif
+#ifdef HAS_CONVERT8TO16ROW_AVX2
+ANY11C(Convert8To16Row_Any_AVX2, Convert8To16Row_AVX2, 1, 2, uint8, uint16, 31)
 #endif
 #undef ANY11C
 
diff --git a/source/row_common.cc b/source/row_common.cc
index 4a5f53710..e5fef5d92 100644
--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -1877,6 +1877,19 @@ void Convert16To8Row_C(const uint16* src_y,
   }
 }
 
+// Use scale to convert lsb formats to msb, depending how many bits there are:
+// 1024 = 10 bits
+void Convert8To16Row_C(const uint8* src_y,
+                       uint16* dst_y,
+                       int scale,
+                       int width) {
+  int x;
+  scale *= 0x0101;  // replicates the byte.
+  for (x = 0; x < width; ++x) {
+    dst_y[x] = (src_y[x] * scale) >> 16;
+  }
+}
+
 void CopyRow_C(const uint8* src, uint8* dst, int count) {
   memcpy(dst, src, count);
 }
diff --git a/source/row_gcc.cc b/source/row_gcc.cc
index 70f72b207..a4a26c1ed 100644
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -2981,7 +2981,7 @@ void Convert16To8Row_SSSE3(const uint16* src_y,
   // clang-format on
 }
 
-#ifdef HAS_MULTIPLYROW_16_AVX2
+#ifdef HAS_CONVERT16TO8ROW_AVX2
 void Convert16To8Row_AVX2(const uint16* src_y,
                           uint8* dst_y,
                           int scale,
@@ -3014,7 +3014,81 @@ void Convert16To8Row_AVX2(const uint16* src_y,
   : "memory", "cc", "xmm0", "xmm1", "xmm2");
   // clang-format on
 }
-#endif  // HAS_MULTIPLYROW_16_AVX2
+#endif  // HAS_CONVERT16TO8ROW_AVX2
+
+// Use scale to convert to lsb formats depending how many bits there are:
+// 512 = 9 bits
+// 1024 = 10 bits
+// 4096 = 12 bits
+// TODO(fbarchard): reduce to SSE2
+void Convert8To16Row_SSE2(const uint8* src_y,
+                          uint16* dst_y,
+                          int scale,
+                          int width) {
+  // clang-format off
+  asm volatile (
+    "movd       %3,%%xmm2                      \n"
+    "punpcklwd  %%xmm2,%%xmm2                  \n"
+    "pshufd     $0x0,%%xmm2,%%xmm2             \n"
+
+    // 32 pixels per loop.
+    LABELALIGN
+    "1:                                        \n"
+    "movdqu     (%0),%%xmm0                    \n"
+    "movdqa     %%xmm0,%%xmm1                  \n"
+    "punpcklbw  %%xmm0,%%xmm0                  \n"
+    "punpckhbw  %%xmm1,%%xmm1                  \n"
+    "add        $0x10,%0                       \n"
+    "pmulhuw    %%xmm2,%%xmm0                  \n"
+    "pmulhuw    %%xmm2,%%xmm1                  \n"
+    "movdqu     %%xmm0,(%1)                    \n"
+    "movdqu     %%xmm1,0x10(%1)                \n"
+    "add        $0x20,%1                       \n"
+    "sub        $0x10,%2                       \n"
+    "jg         1b                             \n"
+  : "+r"(src_y),   // %0
+    "+r"(dst_y),   // %1
+    "+r"(width)    // %2
+  : "r"(scale)     // %3
+  : "memory", "cc", "xmm0", "xmm1", "xmm2");
+  // clang-format on
+}
+
+#ifdef HAS_CONVERT8TO16ROW_AVX2
+void Convert8To16Row_AVX2(const uint8* src_y,
+                          uint16* dst_y,
+                          int scale,
+                          int width) {
+  // clang-format off
+  asm volatile (
+    "vmovd      %3,%%xmm2                      \n"
+    "vpunpcklwd %%xmm2,%%xmm2,%%xmm2           \n"
+    "vbroadcastss %%xmm2,%%ymm2                \n"
+
+    // 32 pixels per loop.
+    LABELALIGN
+    "1:                                        \n"
+    "vmovdqu    (%0),%%ymm0                    \n"
+    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
+    "add        $0x20,%0                       \n"
+    "vpunpckhbw %%ymm0,%%ymm0,%%ymm1           \n"
+    "vpunpcklbw %%ymm0,%%ymm0,%%ymm0           \n"
+    "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
+    "vpmulhuw   %%ymm2,%%ymm1,%%ymm1           \n"
+    "vmovdqu    %%ymm0,(%1)                    \n"
+    "vmovdqu    %%ymm1,0x20(%1)                \n"
+    "add        $0x40,%1                       \n"
+    "sub        $0x20,%2                       \n"
+    "jg         1b                             \n"
+    "vzeroupper                                \n"
+  : "+r"(src_y),   // %0
+    "+r"(dst_y),   // %1
+    "+r"(width)    // %2
+  : "r"(scale)     // %3
+  : "memory", "cc", "xmm0", "xmm1", "xmm2");
+  // clang-format on
+}
+#endif  // HAS_CONVERT8TO16ROW_AVX2
 
 #ifdef HAS_SPLITRGBROW_SSSE3
diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc
index 02a9d5272..ce1736891 100644
--- a/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
@@ -2733,13 +2733,14 @@ TEST_F(LibYUVPlanarTest, Convert16To8Plane) {
 // TODO(fbarchard): Improve test for more platforms.
 #ifdef HAS_CONVERT16TO8ROW_AVX2
 TEST_F(LibYUVPlanarTest, Convert16To8Row_Opt) {
-  const int kPixels = benchmark_width_ * benchmark_height_;
+  // AVX2 does multiple of 32, so round count up
+  const int kPixels = (benchmark_width_ * benchmark_height_ + 31) & ~31;
   align_buffer_page_end(src_pixels_y, kPixels * 2);
   align_buffer_page_end(dst_pixels_y_opt, kPixels);
   align_buffer_page_end(dst_pixels_y_c, kPixels);
   MemRandomize(src_pixels_y, kPixels * 2);
 
-  // C code does not clamp so limit source range to 10 bits.
+  // clamp source range to 10 bits.
   for (int i = 0; i < kPixels; ++i) {
     reinterpret_cast<uint16*>(src_pixels_y)[i] &= 1023;
   }
@@ -2775,6 +2776,50 @@ TEST_F(LibYUVPlanarTest, Convert16To8Row_Opt) {
 }
 #endif  // HAS_CONVERT16TO8ROW_AVX2
 
+// TODO(fbarchard): Improve test for more platforms.
+#ifdef HAS_CONVERT8TO16ROW_AVX2
+TEST_F(LibYUVPlanarTest, Convert8To16Row_Opt) {
+  const int kPixels = (benchmark_width_ * benchmark_height_ + 31) & ~31;
+  align_buffer_page_end(src_pixels_y, kPixels);
+  align_buffer_page_end(dst_pixels_y_opt, kPixels * 2);
+  align_buffer_page_end(dst_pixels_y_c, kPixels * 2);
+
+  MemRandomize(src_pixels_y, kPixels);
+
+  memset(dst_pixels_y_opt, 0, kPixels);
+  memset(dst_pixels_y_c, 1, kPixels);
+
+  Convert8To16Row_C(src_pixels_y, reinterpret_cast<uint16*>(dst_pixels_y_c),
+                    1024, kPixels);
+
+  int has_avx2 = TestCpuFlag(kCpuHasAVX2);
+  int has_sse2 = TestCpuFlag(kCpuHasSSE2);
+  for (int i = 0; i < benchmark_iterations_; ++i) {
+    if (has_avx2) {
+      Convert8To16Row_AVX2(src_pixels_y,
+                           reinterpret_cast<uint16*>(dst_pixels_y_opt), 1024,
+                           kPixels);
+    } else if (has_sse2) {
+      Convert8To16Row_SSE2(src_pixels_y,
+                           reinterpret_cast<uint16*>(dst_pixels_y_opt), 1024,
+                           kPixels);
+    } else {
+      Convert8To16Row_C(src_pixels_y,
+                        reinterpret_cast<uint16*>(dst_pixels_y_opt), 1024,
+                        kPixels);
+    }
+  }
+
+  for (int i = 0; i < kPixels; ++i) {
+    EXPECT_EQ(dst_pixels_y_opt[i], dst_pixels_y_c[i]);
+  }
+
+  free_aligned_buffer_page_end(src_pixels_y);
+  free_aligned_buffer_page_end(dst_pixels_y_opt);
+  free_aligned_buffer_page_end(dst_pixels_y_c);
+}
+#endif  // HAS_CONVERT8TO16ROW_AVX2
+
 float TestScaleMaxSamples(int benchmark_width,
                           int benchmark_height,
                           int benchmark_iterations,
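
Note on the scale math (illustration only, not part of the patch): Convert8To16Row_C premultiplies the caller's scale by 0x0101, so src * scale is the same as replicating the source byte across 16 bits and then taking the high half via >> 16; this maps 0..255 onto the most significant bits of the requested depth (511 for scale 512, 1023 for 1024, 4095 for 4096). The SSE2/AVX2 paths do the same thing in registers: punpcklbw/punpckhbw of a register with itself duplicates each byte into a 16-bit lane, and pmulhuw against the broadcast scale keeps the high 16 bits of each product, i.e. the >> 16. A minimal standalone check of the formula, using standard integer types in place of libyuv's uint8/uint16 and a hypothetical Scale8To16 helper name:

    #include <assert.h>
    #include <stdint.h>

    // Mirrors the per-pixel math in Convert8To16Row_C: folding 0x0101 into
    // scale is equivalent to replicating the source byte (255 -> 0xFFFF)
    // before scaling and shifting down by 16.
    static uint16_t Scale8To16(uint8_t src, int scale) {
      scale *= 0x0101;
      return (uint16_t)((src * scale) >> 16);
    }

    int main(void) {
      assert(Scale8To16(0, 1024) == 0);
      assert(Scale8To16(255, 512) == 511);    // 9 bit target
      assert(Scale8To16(255, 1024) == 1023);  // 10 bit target
      assert(Scale8To16(255, 4096) == 4095);  // 12 bit target
      return 0;
    }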