Convert16To8Row_SSSE3 port from AVX2

H010ToAR30 uses Convert16To8Row_SSSE3 to convert 10 bit YUV to 8 bit,
after which the standard 8 bit YUV conversion path can be used.  This
improves performance on low end CPUs.
A future CL will bypass this conversion, allowing a 10 bit YUV source
directly, but the function will remain useful as a utility for YUV
conversions.
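
For reference, the conversion itself is a per-pixel scale-and-shift. A minimal
standalone sketch of what the C fallback does (clamp255 here is a hypothetical
stand-in for libyuv's internal helper):

#include <stdint.h>

// Multiply each 16-bit sample by 'scale' and keep the high 16 bits of the
// 32-bit product. With scale = 16384 (10-bit input), (v * 16384) >> 16 is
// v >> 2, so 0..1023 maps to 0..255.
static uint8_t clamp255(int v) {
  return (uint8_t)(v > 255 ? 255 : (v < 0 ? 0 : v));
}

void Convert16To8Row_Sketch(const uint16_t* src_y, uint8_t* dst_y, int scale,
                            int width) {
  for (int x = 0; x < width; ++x) {
    dst_y[x] = clamp255((src_y[x] * scale) >> 16);
  }
}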

Bug: libyuv:559, libyuv:751
Test: out/Release/libyuv_unittest --gtest_filter=*H010ToAR30* --libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=999 --libyuv_flags=-1 --libyuv_cpu_info=-1
Change-Id: I9b3ef22d88a5fd861de4cf1900b4c6e8fd24d0af
Reviewed-on: https://chromium-review.googlesource.com/792334
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Commit: 324fa32739 (parent 8445617191)
Author: Frank Barchard, 2017-11-28 10:30:55 -08:00; committed by Commit Bot
7 changed files with 149 additions and 61 deletions

@@ -37,7 +37,7 @@ extern "C" {
 // MemorySanitizer does not support assembly code yet. http://crbug.com/344505
 #if defined(__has_feature)
 #if __has_feature(memory_sanitizer)
-#define LIBYUV_DISABLE_X86
+// define LIBYUV_DISABLE_X86
 #endif
 #endif
 // True if compiling for SSSE3 as a requirement.
@@ -268,6 +268,7 @@ extern "C" {
 // TODO(fbarchard): Port to Visual C
 #if !defined(LIBYUV_DISABLE_X86) && \
     (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
+#define HAS_CONVERT16TO8ROW_SSSE3
 #define HAS_MERGERGBROW_SSSE3
 #define HAS_SPLITRGBROW_SSSE3
 #endif
@@ -1541,11 +1542,23 @@ void MultiplyRow_16_AVX2(const uint16* src_y,
                          uint16* dst_y,
                          int scale,
                          int width);
 void MultiplyRow_16_C(const uint16* src_y, uint16* dst_y, int scale, int width);
+void Convert16To8Row_C(const uint16* src_y, uint8* dst_y, int scale, int width);
+void Convert16To8Row_SSSE3(const uint16* src_y,
+                           uint8* dst_y,
+                           int scale,
+                           int width);
 void Convert16To8Row_AVX2(const uint16* src_y,
                           uint8* dst_y,
                           int scale,
                           int width);
-void Convert16To8Row_C(const uint16* src_y, uint8* dst_y, int scale, int width);
+void Convert16To8Row_Any_SSSE3(const uint16* src_y,
+                               uint8* dst_y,
+                               int scale,
+                               int width);
+void Convert16To8Row_Any_AVX2(const uint16* src_y,
+                              uint8* dst_y,
+                              int scale,
+                              int width);
 void CopyRow_SSE2(const uint8* src, uint8* dst, int count);
 void CopyRow_AVX(const uint8* src, uint8* dst, int count);

@@ -462,15 +462,22 @@ static int H010ToAR30Matrix(const uint16* src_y,
     dst_stride_ar30 = -dst_stride_ar30;
   }
+#if defined(HAS_CONVERT16TO8ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    Convert16To8Row = Convert16To8Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      Convert16To8Row = Convert16To8Row_SSSE3;
+    }
+  }
+#endif
 #if defined(HAS_CONVERT16TO8ROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
-    Convert16To8Row = Convert16To8Row_C;  // TODO(fbarchard): Any AVX2
-    if (IS_ALIGNED(width, 64)) {
+    Convert16To8Row = Convert16To8Row_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
       Convert16To8Row = Convert16To8Row_AVX2;
     }
   }
 #endif
 #if defined(HAS_ARGBTOAR30ROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
     ARGBToAR30Row = ARGBToAR30Row_Any_AVX2;
@@ -479,7 +486,6 @@ static int H010ToAR30Matrix(const uint16* src_y,
     }
   }
 #endif
 #if defined(HAS_I422TOARGBROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     I422ToARGBRow = I422ToARGBRow_Any_SSSE3;

@@ -732,10 +732,34 @@ ANY11P(ARGBShuffleRow_Any_MSA, ARGBShuffleRow_MSA, const uint8*, 4, 4, 7)
 #undef ANY11P
+// Any 1 to 1 with parameter and shorts to byte. BPP measures in shorts.
+#define ANY11C(NAMEANY, ANY_SIMD, SBPP, BPP, MASK) \
+  void NAMEANY(const uint16* src_ptr, uint8* dst_ptr, int scale, int width) { \
+    SIMD_ALIGNED(uint16 temp[32]); \
+    SIMD_ALIGNED(uint8 out[32]); \
+    memset(temp, 0, 64); /* for msan */ \
+    int r = width & MASK; \
+    int n = width & ~MASK; \
+    if (n > 0) { \
+      ANY_SIMD(src_ptr, dst_ptr, scale, n); \
+    } \
+    memcpy(temp, src_ptr + n, r * SBPP); \
+    ANY_SIMD(temp, out, scale, MASK + 1); \
+    memcpy(dst_ptr + n, out, r * BPP); \
+  }
+#ifdef HAS_CONVERT16TO8ROW_SSSE3
+ANY11C(Convert16To8Row_Any_SSSE3, Convert16To8Row_SSSE3, 2, 1, 15)
+#endif
+#ifdef HAS_CONVERT16TO8ROW_AVX2
+ANY11C(Convert16To8Row_Any_AVX2, Convert16To8Row_AVX2, 2, 1, 31)
+#endif
+#undef ANY11C
 // Any 1 to 1 with parameter and shorts. BPP measures in shorts.
 #define ANY11P16(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK) \
   void NAMEANY(const uint16* src_ptr, uint16* dst_ptr, T param, int width) { \
-    SIMD_ALIGNED(uint16 temp[16 * 2]); \
-    memset(temp, 0, 32); /* for msan */ \
+    SIMD_ALIGNED(uint16 temp[32 * 2]); \
+    memset(temp, 0, 64); /* for msan */ \
     int r = width & MASK; \
     int n = width & ~MASK; \
     if (n > 0) { \

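To make the wrapper concrete: ANY11C(Convert16To8Row_Any_SSSE3,
Convert16To8Row_SSSE3, 2, 1, 15) expands to roughly the following. The SIMD
kernel covers the largest multiple of 16 pixels, and the ragged tail is copied
into a zeroed, aligned temp buffer so the kernel can safely run one final
full-width step:

void Convert16To8Row_Any_SSSE3(const uint16* src_ptr, uint8* dst_ptr,
                               int scale, int width) {
  SIMD_ALIGNED(uint16 temp[32]);
  SIMD_ALIGNED(uint8 out[32]);
  memset(temp, 0, 64); /* for msan */
  int r = width & 15;   // tail pixels
  int n = width & ~15;  // largest multiple of 16
  if (n > 0) {
    Convert16To8Row_SSSE3(src_ptr, dst_ptr, scale, n);
  }
  memcpy(temp, src_ptr + n, r * 2);  // SBPP = 2 bytes per source short
  Convert16To8Row_SSSE3(temp, out, scale, 16);
  memcpy(dst_ptr + n, out, r * 1);   // BPP = 1 byte per output pixel
}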

@@ -2894,6 +2894,37 @@ void MultiplyRow_16_AVX2(const uint16* src_y,
 // 16384 = 10 bits
 // 4096 = 12 bits
 // 256 = 16 bits
+void Convert16To8Row_SSSE3(const uint16* src_y,
+                           uint8* dst_y,
+                           int scale,
+                           int width) {
+  // clang-format off
+  asm volatile (
+    "movd       %3,%%xmm3                      \n"
+    "punpcklwd  %%xmm3,%%xmm3                  \n"
+    "pshufd     $0x0,%%xmm3,%%xmm3             \n"
+    // 16 pixels per loop.
+    LABELALIGN
+    "1:                                        \n"
+    "movdqu     (%0),%%xmm0                    \n"
+    "movdqu     0x10(%0),%%xmm1                \n"
+    "pmulhuw    %%xmm3,%%xmm0                  \n"
+    "pmulhuw    %%xmm3,%%xmm1                  \n"
+    "packuswb   %%xmm1,%%xmm0                  \n"
+    "movdqu     %%xmm0,(%1)                    \n"
+    "add        $0x20,%0                       \n"
+    "add        $0x10,%1                       \n"
+    "sub        $0x10,%2                       \n"
+    "jg         1b                             \n"
+  : "+r"(src_y),  // %0
+    "+r"(dst_y),  // %1
+    "+r"(width)   // %2
+  : "r"(scale)    // %3
+  : "memory", "cc", "xmm0", "xmm1", "xmm3");
+  // clang-format on
+}
+
 #ifdef HAS_MULTIPLYROW_16_AVX2
 void Convert16To8Row_AVX2(const uint16* src_y,
                           uint8* dst_y,

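The kernel hinges on pmulhuw, which keeps the high 16 bits of each unsigned
16x16-bit product, so each lane computes dst = (src * scale) >> 16. A scalar
sanity check of the scale constants listed above:

#include <assert.h>
#include <stdint.h>

// Scalar model of one pmulhuw lane: high 16 bits of the 32-bit product.
static uint16_t pmulhuw_lane(uint16_t v, uint16_t scale) {
  return (uint16_t)(((uint32_t)v * scale) >> 16);
}

int main(void) {
  // 10-bit max (1023), scale 16384: (1023 << 14) >> 16 == 1023 >> 2 == 255.
  assert(pmulhuw_lane(1023, 16384) == 255);
  // 12-bit max (4095), scale 4096: 4095 >> 4 == 255.
  assert(pmulhuw_lane(4095, 4096) == 255);
  // 16-bit max (65535), scale 256: 65535 >> 8 == 255.
  assert(pmulhuw_lane(65535, 256) == 255);
  return 0;
}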

@@ -338,7 +338,7 @@ static const int kMaxOptCount = (1 << (32 - 3)) - 64;  // 536870848
 TEST_F(LibYUVCompareTest, TestHammingDistance_Opt) {
   uint32 h1 = 0;
-  const int kMaxWidth = benchmark_width_ * benchmark_height_;
+  const int kMaxWidth = (benchmark_width_ * benchmark_height_ + 31) & ~31;
   align_buffer_page_end(src_a, kMaxWidth);
   align_buffer_page_end(src_b, kMaxWidth);
   memset(src_a, 255u, kMaxWidth);

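The (x + 31) & ~31 expression rounds the buffer size up to the next multiple
of 32, so a SIMD kernel that consumes a fixed block per iteration never reads
past the end of the test buffer. A small sketch of the idiom (helper name
illustrative):

#include <assert.h>

// Hypothetical helper: round n up to the next multiple of a power-of-two m.
// Adding m - 1 then masking with ~(m - 1) clears the low bits.
static int RoundUpPow2(int n, int m) {
  return (n + m - 1) & ~(m - 1);
}

int main(void) {
  assert(RoundUpPow2(1280 * 720, 32) == 921600);  // already a multiple of 32
  assert(RoundUpPow2(100, 32) == 128);
  return 0;
}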

@@ -1966,63 +1966,73 @@ TEST_F(LibYUVConvertTest, ARGBToAR30Row_Opt) {
 // Alias to copy pixels as is
 #define AR30ToAR30 ARGBToARGB
-#define TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
-                         ALIGN, YALIGN, W1280, DIFF, N, NEG, OFF, FMT_C, \
-                         BPP_C) \
-  TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \
-    const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
-    const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
-    const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \
-    const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
-    const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \
-    const int kBpc = 2; \
-    align_buffer_page_end(src_y, kWidth* kHeight* kBpc + OFF); \
-    align_buffer_page_end(src_u, kSizeUV* kBpc + OFF); \
-    align_buffer_page_end(src_v, kSizeUV* kBpc + OFF); \
-    align_buffer_page_end(dst_argb_c, kStrideB* kHeight + OFF); \
-    align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + OFF); \
-    for (int i = 0; i < kWidth * kHeight; ++i) { \
-      reinterpret_cast<uint16*>(src_y)[i + OFF] = (fastrand() & 0x3ff); \
-    } \
-    for (int i = 0; i < kSizeUV; ++i) { \
-      reinterpret_cast<uint16*>(src_u)[i + OFF] = (fastrand() & 0x3ff); \
-      reinterpret_cast<uint16*>(src_v)[i + OFF] = (fastrand() & 0x3ff); \
-    } \
-    memset(dst_argb_c + OFF, 1, kStrideB * kHeight); \
-    memset(dst_argb_opt + OFF, 101, kStrideB * kHeight); \
-    MaskCpuFlags(disable_cpu_flags_); \
-    FMT_PLANAR##To##FMT_B(reinterpret_cast<uint16*>(src_y) + OFF, kWidth, \
-                          reinterpret_cast<uint16*>(src_u) + OFF, kStrideUV, \
-                          reinterpret_cast<uint16*>(src_v) + OFF, kStrideUV, \
-                          dst_argb_c + OFF, kStrideB, kWidth, NEG kHeight); \
-    MaskCpuFlags(benchmark_cpu_info_); \
-    for (int i = 0; i < benchmark_iterations_; ++i) { \
-      FMT_PLANAR##To##FMT_B(reinterpret_cast<uint16*>(src_y) + OFF, kWidth, \
-                            reinterpret_cast<uint16*>(src_u) + OFF, kStrideUV, \
-                            reinterpret_cast<uint16*>(src_v) + OFF, kStrideUV, \
-                            dst_argb_opt + OFF, kStrideB, kWidth, \
-                            NEG kHeight); \
-    } \
-    int max_diff = 0; \
-    for (int i = 0; i < kWidth * BPP_C * kHeight; ++i) { \
-      int abs_diff = abs(static_cast<int>(dst_argb_c[i]) - \
-                         static_cast<int>(dst_argb_opt[i])); \
-      if (abs_diff > max_diff) { \
-        max_diff = abs_diff; \
-      } \
-    } \
-    EXPECT_LE(max_diff, DIFF); \
-    free_aligned_buffer_page_end(src_y); \
-    free_aligned_buffer_page_end(src_u); \
-    free_aligned_buffer_page_end(src_v); \
-    free_aligned_buffer_page_end(dst_argb_c); \
-    free_aligned_buffer_page_end(dst_argb_opt); \
-  }
+#define TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
+                         ALIGN, YALIGN, W1280, DIFF, N, NEG, SOFF, DOFF, \
+                         FMT_C, BPP_C) \
+  TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \
+    const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
+    const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
+    const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \
+    const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
+    const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \
+    const int kBpc = 2; \
+    align_buffer_page_end(src_y, kWidth* kHeight* kBpc + SOFF); \
+    align_buffer_page_end(src_u, kSizeUV* kBpc + SOFF); \
+    align_buffer_page_end(src_v, kSizeUV* kBpc + SOFF); \
+    align_buffer_page_end(dst_argb_c, kStrideB* kHeight + DOFF); \
+    align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + DOFF); \
+    for (int i = 0; i < kWidth * kHeight; ++i) { \
+      reinterpret_cast<uint16*>(src_y + SOFF)[i] = (fastrand() & 0x3ff); \
+    } \
+    for (int i = 0; i < kSizeUV; ++i) { \
+      reinterpret_cast<uint16*>(src_u + SOFF)[i] = (fastrand() & 0x3ff); \
+      reinterpret_cast<uint16*>(src_v + SOFF)[i] = (fastrand() & 0x3ff); \
+    } \
+    memset(dst_argb_c + DOFF, 1, kStrideB * kHeight); \
+    memset(dst_argb_opt + DOFF, 101, kStrideB * kHeight); \
+    MaskCpuFlags(disable_cpu_flags_); \
+    FMT_PLANAR##To##FMT_B(reinterpret_cast<uint16*>(src_y + SOFF), kWidth, \
+                          reinterpret_cast<uint16*>(src_u + SOFF), kStrideUV, \
+                          reinterpret_cast<uint16*>(src_v + SOFF), kStrideUV, \
+                          dst_argb_c + DOFF, kStrideB, kWidth, NEG kHeight); \
+    MaskCpuFlags(benchmark_cpu_info_); \
+    for (int i = 0; i < benchmark_iterations_; ++i) { \
+      FMT_PLANAR##To##FMT_B( \
+          reinterpret_cast<uint16*>(src_y + SOFF), kWidth, \
+          reinterpret_cast<uint16*>(src_u + SOFF), kStrideUV, \
+          reinterpret_cast<uint16*>(src_v + SOFF), kStrideUV, \
+          dst_argb_opt + DOFF, kStrideB, kWidth, NEG kHeight); \
+    } \
+    int max_diff = 0; \
+    for (int i = 0; i < kWidth * BPP_C * kHeight; ++i) { \
+      int abs_diff = abs(static_cast<int>(dst_argb_c[i + DOFF]) - \
+                         static_cast<int>(dst_argb_opt[i + DOFF])); \
+      if (abs_diff > max_diff) { \
+        max_diff = abs_diff; \
+      } \
+    } \
+    EXPECT_LE(max_diff, DIFF); \
+    free_aligned_buffer_page_end(src_y); \
+    free_aligned_buffer_page_end(src_u); \
+    free_aligned_buffer_page_end(src_v); \
+    free_aligned_buffer_page_end(dst_argb_c); \
+    free_aligned_buffer_page_end(dst_argb_opt); \
+  }
 #define TESTPLANAR16TOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
                         YALIGN, DIFF, FMT_C, BPP_C) \
-  TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
-                   YALIGN, benchmark_width_, DIFF, _Opt, +, 0, FMT_C, BPP_C)
+  TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+                   YALIGN, benchmark_width_ - 4, DIFF, _Any, +, 0, 0, FMT_C, \
+                   BPP_C) \
+  TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+                   YALIGN, benchmark_width_, DIFF, _Unaligned, +, 1, 1, FMT_C, \
+                   BPP_C) \
+  TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+                   YALIGN, benchmark_width_, DIFF, _Invert, -, 0, 0, FMT_C, \
+                   BPP_C) \
+  TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+                   YALIGN, benchmark_width_, DIFF, _Opt, +, 0, 0, FMT_C, \
+                   BPP_C)
 TESTPLANAR16TOB(H010, 2, 2, AR30, 4, 4, 1, 2, AR30, 4)

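With separate SOFF and DOFF offsets, the one macro now generates four test
variants: _Any (width not a multiple of the SIMD step), _Unaligned (source and
destination shifted by one byte), _Invert (negative height), and _Opt (the
aligned fast path). A one-byte offset from an aligned base guarantees a
misaligned pointer, forcing kernels onto their unaligned-load paths; an
illustrative check, not libyuv code:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int main(void) {
  // base is 16-byte aligned; base + 1 cannot be, so SIMD code that assumed
  // alignment (movdqa-style loads) would fault or misbehave here.
  uint8_t* base = (uint8_t*)aligned_alloc(16, 64);
  uint8_t* unaligned = base + 1;
  printf("base %% 16 = %u, unaligned %% 16 = %u\n",
         (unsigned)((uintptr_t)base % 16),
         (unsigned)((uintptr_t)unaligned % 16));
  free(base);
  return 0;
}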

@@ -2720,10 +2720,14 @@ TEST_F(LibYUVPlanarTest, Convert16To8Row_Opt) {
   Convert16To8Row_C(reinterpret_cast<const uint16*>(src_pixels_y),
                     dst_pixels_y_c, 16384, kPixels);
   int has_avx2 = TestCpuFlag(kCpuHasAVX2);
+  int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
   for (int i = 0; i < benchmark_iterations_; ++i) {
     if (has_avx2) {
       Convert16To8Row_AVX2(reinterpret_cast<const uint16*>(src_pixels_y),
                            dst_pixels_y_opt, 16384, kPixels);
+    } else if (has_ssse3) {
+      Convert16To8Row_SSSE3(reinterpret_cast<const uint16*>(src_pixels_y),
+                            dst_pixels_y_opt, 16384, kPixels);
     } else {
       Convert16To8Row_C(reinterpret_cast<const uint16*>(src_pixels_y),
                         dst_pixels_y_opt, 16384, kPixels);