diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 743f6b154..16e0fd834 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -37,7 +37,7 @@ extern "C" {
 // MemorySanitizer does not support assembly code yet. http://crbug.com/344505
 #if defined(__has_feature)
 #if __has_feature(memory_sanitizer)
-#define LIBYUV_DISABLE_X86
+// define LIBYUV_DISABLE_X86
 #endif
 #endif
 // True if compiling for SSSE3 as a requirement.
@@ -268,6 +268,7 @@ extern "C" {
 // TODO(fbarchard): Port to Visual C
 #if !defined(LIBYUV_DISABLE_X86) && \
     (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
+#define HAS_CONVERT16TO8ROW_SSSE3
 #define HAS_MERGERGBROW_SSSE3
 #define HAS_SPLITRGBROW_SSSE3
 #endif
@@ -1541,11 +1542,23 @@ void MultiplyRow_16_AVX2(const uint16* src_y,
                          uint16* dst_y,
                          int scale,
                          int width);
 void MultiplyRow_16_C(const uint16* src_y, uint16* dst_y, int scale, int width);
+void Convert16To8Row_C(const uint16* src_y, uint8* dst_y, int scale, int width);
+void Convert16To8Row_SSSE3(const uint16* src_y,
+                           uint8* dst_y,
+                           int scale,
+                           int width);
 void Convert16To8Row_AVX2(const uint16* src_y,
                           uint8* dst_y,
                           int scale,
                           int width);
-void Convert16To8Row_C(const uint16* src_y, uint8* dst_y, int scale, int width);
+void Convert16To8Row_Any_SSSE3(const uint16* src_y,
+                               uint8* dst_y,
+                               int scale,
+                               int width);
+void Convert16To8Row_Any_AVX2(const uint16* src_y,
+                              uint8* dst_y,
+                              int scale,
+                              int width);
 void CopyRow_SSE2(const uint8* src, uint8* dst, int count);
 void CopyRow_AVX(const uint8* src, uint8* dst, int count);
diff --git a/source/convert_argb.cc b/source/convert_argb.cc
index 8875da57a..0fdaf6c5e 100644
--- a/source/convert_argb.cc
+++ b/source/convert_argb.cc
@@ -462,15 +462,22 @@ static int H010ToAR30Matrix(const uint16* src_y,
     dst_stride_ar30 = -dst_stride_ar30;
   }
 
+#if defined(HAS_CONVERT16TO8ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    Convert16To8Row = Convert16To8Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      Convert16To8Row = Convert16To8Row_SSSE3;
+    }
+  }
+#endif
 #if defined(HAS_CONVERT16TO8ROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
-    Convert16To8Row = Convert16To8Row_C;  // TODO(fbarchard): Any AVX2
-    if (IS_ALIGNED(width, 64)) {
+    Convert16To8Row = Convert16To8Row_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
       Convert16To8Row = Convert16To8Row_AVX2;
     }
   }
 #endif
-
 #if defined(HAS_ARGBTOAR30ROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
     ARGBToAR30Row = ARGBToAR30Row_Any_AVX2;
@@ -479,7 +486,6 @@ static int H010ToAR30Matrix(const uint16* src_y,
       }
     }
   }
 #endif
-
 #if defined(HAS_I422TOARGBROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
diff --git a/source/row_any.cc b/source/row_any.cc
index 4f1877656..940f13983 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -732,10 +732,34 @@ ANY11P(ARGBShuffleRow_Any_MSA, ARGBShuffleRow_MSA, const uint8*, 4, 4, 7)
 #undef ANY11P
 
 // Any 1 to 1 with parameter and shorts. BPP measures in shorts.
+#define ANY11C(NAMEANY, ANY_SIMD, SBPP, BPP, MASK) \
+  void NAMEANY(const uint16* src_ptr, uint8* dst_ptr, int scale, int width) { \
+    SIMD_ALIGNED(uint16 temp[32]); \
+    SIMD_ALIGNED(uint8 out[32]); \
+    memset(temp, 0, 64); /* for msan */ \
+    int r = width & MASK; \
+    int n = width & ~MASK; \
+    if (n > 0) { \
+      ANY_SIMD(src_ptr, dst_ptr, scale, n); \
+    } \
+    memcpy(temp, src_ptr + n, r * SBPP); \
+    ANY_SIMD(temp, out, scale, MASK + 1); \
+    memcpy(dst_ptr + n, out, r * BPP); \
+  }
+
+#ifdef HAS_CONVERT16TO8ROW_SSSE3
+ANY11C(Convert16To8Row_Any_SSSE3, Convert16To8Row_SSSE3, 2, 1, 15)
+#endif
+#ifdef HAS_CONVERT16TO8ROW_AVX2
+ANY11C(Convert16To8Row_Any_AVX2, Convert16To8Row_AVX2, 2, 1, 31)
+#endif
+#undef ANY11C
+
+// Any 1 to 1 with parameter and shorts. BPP measures in shorts.
 #define ANY11P16(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK) \
   void NAMEANY(const uint16* src_ptr, uint16* dst_ptr, T param, int width) { \
-    SIMD_ALIGNED(uint16 temp[16 * 2]); \
-    memset(temp, 0, 32); /* for msan */ \
+    SIMD_ALIGNED(uint16 temp[32 * 2]); \
+    memset(temp, 0, 64); /* for msan */ \
     int r = width & MASK; \
     int n = width & ~MASK; \
     if (n > 0) { \
diff --git a/source/row_gcc.cc b/source/row_gcc.cc
index 45d287faa..a66da6fab 100644
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -2894,6 +2894,37 @@ void MultiplyRow_16_AVX2(const uint16* src_y,
 // 16384 = 10 bits
 // 4096 = 12 bits
 // 256 = 16 bits
+void Convert16To8Row_SSSE3(const uint16* src_y,
+                           uint8* dst_y,
+                           int scale,
+                           int width) {
+  // clang-format off
+  asm volatile (
+    "movd       %3,%%xmm3                      \n"
+    "punpcklwd  %%xmm3,%%xmm3                  \n"
+    "pshufd     $0x0,%%xmm3,%%xmm3             \n"
+
+    // 16 pixels per loop.
+    LABELALIGN
+    "1:                                        \n"
+    "movdqu     (%0),%%xmm0                    \n"
+    "movdqu     0x10(%0),%%xmm1                \n"
+    "pmulhuw    %%xmm3,%%xmm0                  \n"
+    "pmulhuw    %%xmm3,%%xmm1                  \n"
+    "packuswb   %%xmm1,%%xmm0                  \n"
+    "movdqu     %%xmm0,(%1)                    \n"
+    "add        $0x20,%0                       \n"
+    "add        $0x10,%1                       \n"
+    "sub        $0x10,%2                       \n"
+    "jg         1b                             \n"
+  : "+r"(src_y),   // %0
+    "+r"(dst_y),   // %1
+    "+r"(width)    // %2
+  : "r"(scale)     // %3
+  : "memory", "cc", "xmm0", "xmm1", "xmm3");
+  // clang-format on
+}
+
 #ifdef HAS_MULTIPLYROW_16_AVX2
 void Convert16To8Row_AVX2(const uint16* src_y,
                           uint8* dst_y,
diff --git a/unit_test/compare_test.cc b/unit_test/compare_test.cc
index ff39b2b0f..1c6d988ef 100644
--- a/unit_test/compare_test.cc
+++ b/unit_test/compare_test.cc
@@ -338,7 +338,7 @@ static const int kMaxOptCount = (1 << (32 - 3)) - 64;  // 536870848
 
 TEST_F(LibYUVCompareTest, TestHammingDistance_Opt) {
   uint32 h1 = 0;
-  const int kMaxWidth = benchmark_width_ * benchmark_height_;
+  const int kMaxWidth = (benchmark_width_ * benchmark_height_ + 31) & ~31;
   align_buffer_page_end(src_a, kMaxWidth);
   align_buffer_page_end(src_b, kMaxWidth);
   memset(src_a, 255u, kMaxWidth);
diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc
index 8bcb63d3c..b0bbb590a 100644
--- a/unit_test/convert_test.cc
+++ b/unit_test/convert_test.cc
@@ -1966,63 +1966,73 @@ TEST_F(LibYUVConvertTest, ARGBToAR30Row_Opt) {
 // Alias to copy pixels as is
 #define AR30ToAR30 ARGBToARGB
 
-#define TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
-                         ALIGN, YALIGN, W1280, DIFF, N, NEG, OFF, FMT_C, \
-                         BPP_C) \
-  TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \
-    const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
-    const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
-    const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \
-    const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
-    const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \
-    const int kBpc = 2; \
-    align_buffer_page_end(src_y, kWidth* kHeight* kBpc + OFF); \
-    align_buffer_page_end(src_u, kSizeUV* kBpc + OFF); \
-    align_buffer_page_end(src_v, kSizeUV* kBpc + OFF); \
-    align_buffer_page_end(dst_argb_c, kStrideB* kHeight + OFF); \
-    align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + OFF); \
-    for (int i = 0; i < kWidth * kHeight; ++i) { \
-      reinterpret_cast<uint16*>(src_y)[i + OFF] = (fastrand() & 0x3ff); \
-    } \
-    for (int i = 0; i < kSizeUV; ++i) { \
-      reinterpret_cast<uint16*>(src_u)[i + OFF] = (fastrand() & 0x3ff); \
-      reinterpret_cast<uint16*>(src_v)[i + OFF] = (fastrand() & 0x3ff); \
-    } \
-    memset(dst_argb_c + OFF, 1, kStrideB * kHeight); \
-    memset(dst_argb_opt + OFF, 101, kStrideB * kHeight); \
-    MaskCpuFlags(disable_cpu_flags_); \
-    FMT_PLANAR##To##FMT_B(reinterpret_cast<const uint16*>(src_y) + OFF, kWidth, \
-                          reinterpret_cast<const uint16*>(src_u) + OFF, kStrideUV, \
-                          reinterpret_cast<const uint16*>(src_v) + OFF, kStrideUV, \
-                          dst_argb_c + OFF, kStrideB, kWidth, NEG kHeight); \
-    MaskCpuFlags(benchmark_cpu_info_); \
-    for (int i = 0; i < benchmark_iterations_; ++i) { \
-      FMT_PLANAR##To##FMT_B(reinterpret_cast<const uint16*>(src_y) + OFF, kWidth, \
-                            reinterpret_cast<const uint16*>(src_u) + OFF, kStrideUV, \
-                            reinterpret_cast<const uint16*>(src_v) + OFF, kStrideUV, \
-                            dst_argb_opt + OFF, kStrideB, kWidth, \
-                            NEG kHeight); \
-    } \
-    int max_diff = 0; \
-    for (int i = 0; i < kWidth * BPP_C * kHeight; ++i) { \
-      int abs_diff = abs(static_cast<int>(dst_argb_c[i]) - \
-                         static_cast<int>(dst_argb_opt[i])); \
-      if (abs_diff > max_diff) { \
-        max_diff = abs_diff; \
-      } \
-    } \
-    EXPECT_LE(max_diff, DIFF); \
-    free_aligned_buffer_page_end(src_y); \
-    free_aligned_buffer_page_end(src_u); \
-    free_aligned_buffer_page_end(src_v); \
-    free_aligned_buffer_page_end(dst_argb_c); \
-    free_aligned_buffer_page_end(dst_argb_opt); \
+#define TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
+                         ALIGN, YALIGN, W1280, DIFF, N, NEG, SOFF, DOFF, \
+                         FMT_C, BPP_C) \
+  TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \
+    const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
+    const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
+    const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \
+    const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
+    const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \
+    const int kBpc = 2; \
+    align_buffer_page_end(src_y, kWidth* kHeight* kBpc + SOFF); \
+    align_buffer_page_end(src_u, kSizeUV* kBpc + SOFF); \
+    align_buffer_page_end(src_v, kSizeUV* kBpc + SOFF); \
+    align_buffer_page_end(dst_argb_c, kStrideB* kHeight + DOFF); \
+    align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + DOFF); \
+    for (int i = 0; i < kWidth * kHeight; ++i) { \
+      reinterpret_cast<uint16*>(src_y + SOFF)[i] = (fastrand() & 0x3ff); \
+    } \
+    for (int i = 0; i < kSizeUV; ++i) { \
+      reinterpret_cast<uint16*>(src_u + SOFF)[i] = (fastrand() & 0x3ff); \
+      reinterpret_cast<uint16*>(src_v + SOFF)[i] = (fastrand() & 0x3ff); \
+    } \
+    memset(dst_argb_c + DOFF, 1, kStrideB * kHeight); \
+    memset(dst_argb_opt + DOFF, 101, kStrideB * kHeight); \
+    MaskCpuFlags(disable_cpu_flags_); \
+    FMT_PLANAR##To##FMT_B(reinterpret_cast<const uint16*>(src_y + SOFF), kWidth, \
+                          reinterpret_cast<const uint16*>(src_u + SOFF), kStrideUV, \
+                          reinterpret_cast<const uint16*>(src_v + SOFF), kStrideUV, \
+                          dst_argb_c + DOFF, kStrideB, kWidth, NEG kHeight); \
+    MaskCpuFlags(benchmark_cpu_info_); \
+    for (int i = 0; i < benchmark_iterations_; ++i) { \
+      FMT_PLANAR##To##FMT_B( \
+          reinterpret_cast<const uint16*>(src_y + SOFF), kWidth, \
+          reinterpret_cast<const uint16*>(src_u + SOFF), kStrideUV, \
+          reinterpret_cast<const uint16*>(src_v + SOFF), kStrideUV, \
+          dst_argb_opt + DOFF, kStrideB, kWidth, NEG kHeight); \
+    } \
+    int max_diff = 0; \
+    for (int i = 0; i < kWidth * BPP_C * kHeight; ++i) { \
+      int abs_diff = abs(static_cast<int>(dst_argb_c[i + DOFF]) - \
+                         static_cast<int>(dst_argb_opt[i + DOFF])); \
+      if (abs_diff > max_diff) { \
+        max_diff = abs_diff; \
+      } \
+    } \
+    EXPECT_LE(max_diff, DIFF); \
+    free_aligned_buffer_page_end(src_y); \
+    free_aligned_buffer_page_end(src_u); \
+    free_aligned_buffer_page_end(src_v); \
+    free_aligned_buffer_page_end(dst_argb_c); \
+    free_aligned_buffer_page_end(dst_argb_opt); \
   }
 
 #define TESTPLANAR16TOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
                         YALIGN, DIFF, FMT_C, BPP_C) \
   TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
-                   YALIGN, benchmark_width_, DIFF, _Opt, +, 0, FMT_C, BPP_C)
+                   YALIGN, benchmark_width_ - 4, DIFF, _Any, +, 0, 0, FMT_C, \
+                   BPP_C) \
+  TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+                   YALIGN, benchmark_width_, DIFF, _Unaligned, +, 1, 1, FMT_C, \
+                   BPP_C) \
+  TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+                   YALIGN, benchmark_width_, DIFF, _Invert, -, 0, 0, FMT_C, \
+                   BPP_C) \
+  TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+                   YALIGN, benchmark_width_, DIFF, _Opt, +, 0, 0, FMT_C, \
+                   BPP_C)
 
 TESTPLANAR16TOB(H010, 2, 2, AR30, 4, 4, 1, 2, AR30, 4)
diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc
index 151bcafd1..6e1c27cad 100644
--- a/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
@@ -2720,10 +2720,14 @@ TEST_F(LibYUVPlanarTest, Convert16To8Row_Opt) {
                     dst_pixels_y_c, 16384, kPixels);
 
   int has_avx2 = TestCpuFlag(kCpuHasAVX2);
+  int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
   for (int i = 0; i < benchmark_iterations_; ++i) {
     if (has_avx2) {
       Convert16To8Row_AVX2(reinterpret_cast<const uint16*>(src_pixels_y),
                            dst_pixels_y_opt, 16384, kPixels);
+    } else if (has_ssse3) {
+      Convert16To8Row_SSSE3(reinterpret_cast<const uint16*>(src_pixels_y),
+                            dst_pixels_y_opt, 16384, kPixels);
     } else {
       Convert16To8Row_C(reinterpret_cast<const uint16*>(src_pixels_y),
                         dst_pixels_y_opt, 16384, kPixels);