mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-08 01:36:47 +08:00
Convert8To16 for better H010 support
Convert planar 8 bit formats to planar 16 bit formats. Accepts a scale parameter that determines the number of bits in the output.

Bug: libyuv:751
Test: Convert8To16 unittest
Change-Id: I8f6ffe64428ddf5769b87e0c069093a50a2541e9
Reviewed-on: https://chromium-review.googlesource.com/835410
Reviewed-by: richard winterton <rrwinterton@gmail.com>
This commit is contained in:
parent
c67db60534
commit
768f103b8b
@ -270,6 +270,7 @@ extern "C" {
|
||||
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
|
||||
#define HAS_ARGBTOAR30ROW_SSSE3
|
||||
#define HAS_CONVERT16TO8ROW_SSSE3
|
||||
#define HAS_CONVERT8TO16ROW_SSE2
|
||||
#define HAS_MERGERGBROW_SSSE3
|
||||
#define HAS_SPLITRGBROW_SSSE3
|
||||
#endif
|
||||
@ -281,6 +282,7 @@ extern "C" {
|
||||
(defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
|
||||
#define HAS_ARGBTOAR30ROW_AVX2
|
||||
#define HAS_CONVERT16TO8ROW_AVX2
|
||||
#define HAS_CONVERT8TO16ROW_AVX2
|
||||
#define HAS_MERGEUVROW_16_AVX2
|
||||
#define HAS_MULTIPLYROW_16_AVX2
|
||||
#endif
|
||||
@ -1428,6 +1430,24 @@ void MultiplyRow_16_AVX2(const uint16* src_y,
|
||||
int width);
|
||||
void MultiplyRow_16_C(const uint16* src_y, uint16* dst_y, int scale, int width);
|
||||
|
||||
// 8 bit to 16 bit row conversion. scale selects the output bit depth
// (for example 1024 = 10 bits).
// Portable C version.
void Convert8To16Row_C(const uint8* src_y, uint16* dst_y, int scale, int width);
// SIMD versions.
void Convert8To16Row_SSE2(const uint8* src_y, uint16* dst_y,
                          int scale, int width);
void Convert8To16Row_AVX2(const uint8* src_y, uint16* dst_y,
                          int scale, int width);
// Wrappers around the SIMD versions, generated by the ANY11C macro.
void Convert8To16Row_Any_SSE2(const uint8* src_y, uint16* dst_y,
                              int scale, int width);
void Convert8To16Row_Any_AVX2(const uint8* src_y, uint16* dst_y,
                              int scale, int width);
|
||||
|
||||
void Convert16To8Row_C(const uint16* src_y, uint8* dst_y, int scale, int width);
|
||||
void Convert16To8Row_SSSE3(const uint16* src_y,
|
||||
uint8* dst_y,
|
||||
|
||||
@ -699,26 +699,38 @@ ANY11P(ARGBShuffleRow_Any_MSA, ARGBShuffleRow_MSA, const uint8*, 4, 4, 7)
|
||||
#undef ANY11P
|
||||
|
||||
// Any 1 to 1 with a scale parameter, where source and destination sample
// types may differ (8 bit <-> 16 bit).
//   NAMEANY      name of the generated any-width wrapper.
//   ANY_SIMD     SIMD row function; it is only called with a multiple of
//                MASK + 1 pixels.
//   SBPP, BPP    bytes per source / destination sample.
//   STYPE, DTYPE source / destination sample types.
//   MASK         SIMD step minus 1; at most 31 because the scratch rows hold
//                32 samples.
#define ANY11C(NAMEANY, ANY_SIMD, SBPP, BPP, STYPE, DTYPE, MASK)             \
  void NAMEANY(const STYPE* src_ptr, DTYPE* dst_ptr, int scale, int width) { \
    SIMD_ALIGNED(STYPE temp[32]);                                            \
    SIMD_ALIGNED(DTYPE out[32]);                                             \
    memset(temp, 0, 32 * SBPP); /* for msan */                               \
    int r = width & MASK;       /* leftover pixels */                        \
    int n = width & ~MASK;      /* pixels handled directly by ANY_SIMD */    \
    if (n > 0) {                                                             \
      ANY_SIMD(src_ptr, dst_ptr, scale, n);                                  \
    }                                                                        \
    /* Run one full SIMD step on a zero padded copy of the leftovers and */  \
    /* copy back only the r valid results. */                                \
    memcpy(temp, src_ptr + n, r * SBPP);                                     \
    ANY_SIMD(temp, out, scale, MASK + 1);                                    \
    memcpy(dst_ptr + n, out, r * BPP);                                       \
  }
|
||||
|
||||
// Any-width wrappers for the 16 bit -> 8 bit rows.
// Arguments: wrapper, SIMD row, source bytes per sample, destination bytes per
// sample, source type, destination type, SIMD step - 1.
#ifdef HAS_CONVERT16TO8ROW_SSSE3
ANY11C(Convert16To8Row_Any_SSSE3,
       Convert16To8Row_SSSE3,
       2,
       1,
       uint16,
       uint8,
       15)
#endif
#ifdef HAS_CONVERT16TO8ROW_AVX2
ANY11C(Convert16To8Row_Any_AVX2, Convert16To8Row_AVX2, 2, 1, uint16, uint8, 31)
#endif
// Any-width wrappers for the 8 bit -> 16 bit rows.
#ifdef HAS_CONVERT8TO16ROW_SSE2
ANY11C(Convert8To16Row_Any_SSE2, Convert8To16Row_SSE2, 1, 2, uint8, uint16, 15)
#endif
#ifdef HAS_CONVERT8TO16ROW_AVX2
ANY11C(Convert8To16Row_Any_AVX2, Convert8To16Row_AVX2, 1, 2, uint8, uint16, 31)
#endif
#undef ANY11C
|
||||
|
||||
|
||||
@ -1877,6 +1877,19 @@ void Convert16To8Row_C(const uint16* src_y,
|
||||
}
|
||||
}
|
||||
|
||||
// Use scale to convert lsb formats to msb, depending how many bits there are:
|
||||
// 1024 = 10 bits
|
||||
void Convert8To16Row_C(const uint8* src_y,
|
||||
uint16* dst_y,
|
||||
int scale,
|
||||
int width) {
|
||||
int x;
|
||||
scale *= 0x0101; // replicates the byte.
|
||||
for (x = 0; x < width; ++x) {
|
||||
dst_y[x] = (src_y[x] * scale) >> 16;
|
||||
}
|
||||
}
|
||||
|
||||
// Copy count bytes from src to dst. Uses memcpy, so the two rows must not
// overlap.
void CopyRow_C(const uint8* src, uint8* dst, int count) {
  const size_t num_bytes = (size_t)count;
  memcpy(dst, src, num_bytes);
}
|
||||
|
||||
@ -2981,7 +2981,7 @@ void Convert16To8Row_SSSE3(const uint16* src_y,
|
||||
// clang-format on
|
||||
}
|
||||
|
||||
#ifdef HAS_MULTIPLYROW_16_AVX2
|
||||
#ifdef HAS_CONVERT16TO8ROW_AVX2
|
||||
void Convert16To8Row_AVX2(const uint16* src_y,
|
||||
uint8* dst_y,
|
||||
int scale,
|
||||
@ -3014,7 +3014,81 @@ void Convert16To8Row_AVX2(const uint16* src_y,
|
||||
: "memory", "cc", "xmm0", "xmm1", "xmm2");
|
||||
// clang-format on
|
||||
}
|
||||
#endif // HAS_MULTIPLYROW_16_AVX2
|
||||
#endif // HAS_CONVERT16TO8ROW_AVX2
|
||||
|
||||
// Use scale to convert to lsb formats depending how many bits there are:
// 512 = 9 bits
// 1024 = 10 bits
// 4096 = 12 bits
// Each source byte is replicated into a 16 bit lane (byte * 0x0101) and
// multiplied by the low 16 bits of scale, keeping the high 16 bits of the
// product. This gives the same result as Convert8To16Row_C for scale below
// 65536.
// The loop always runs at least once and handles 16 pixels per pass, so width
// should be a positive multiple of 16 (the Any wrapper guarantees this).
#ifdef HAS_CONVERT8TO16ROW_SSE2
void Convert8To16Row_SSE2(const uint8* src_y,
                          uint16* dst_y,
                          int scale,
                          int width) {
  // clang-format off
  asm volatile (
    // Broadcast the low 16 bits of scale to all 8 words of xmm2.
    "movd       %3,%%xmm2                      \n"
    "punpcklwd  %%xmm2,%%xmm2                  \n"
    "pshufd     $0x0,%%xmm2,%%xmm2             \n"

    // 16 pixels per loop.
    LABELALIGN
    "1:                                        \n"
    "movdqu     (%0),%%xmm0                    \n"
    "movdqa     %%xmm0,%%xmm1                  \n"
    "punpcklbw  %%xmm0,%%xmm0                  \n"
    "punpckhbw  %%xmm1,%%xmm1                  \n"
    "add        $0x10,%0                       \n"
    "pmulhuw    %%xmm2,%%xmm0                  \n"
    "pmulhuw    %%xmm2,%%xmm1                  \n"
    "movdqu     %%xmm0,(%1)                    \n"
    "movdqu     %%xmm1,0x10(%1)                \n"
    "add        $0x20,%1                       \n"
    "sub        $0x10,%2                       \n"
    "jg         1b                             \n"
  : "+r"(src_y),   // %0
    "+r"(dst_y),   // %1
    "+r"(width)    // %2
  : "r"(scale)     // %3
  : "memory", "cc", "xmm0", "xmm1", "xmm2");
  // clang-format on
}
#endif  // HAS_CONVERT8TO16ROW_SSE2
|
||||
|
||||
#ifdef HAS_CONVERT8TO16ROW_AVX2
// AVX2 row that widens 8 bit samples to 16 bit: every byte is replicated into
// a 16 bit lane and multiplied by the low 16 bits of scale, keeping the high
// 16 bits of the product. The loop always runs at least once and handles 32
// pixels per pass.
void Convert8To16Row_AVX2(const uint8* src_y,
                          uint16* dst_y,
                          int scale,
                          int width) {
  // clang-format off
  asm volatile (
    // Broadcast the low 16 bits of scale to every word of ymm2.
    "vmovd      %[scale],%%xmm2                \n"
    "vpunpcklwd %%xmm2,%%xmm2,%%xmm2           \n"
    "vbroadcastss %%xmm2,%%ymm2                \n"

    // 32 pixels per loop.
    LABELALIGN
    "1:                                        \n"
    "vmovdqu    (%[src]),%%ymm0                \n"
    // Reorder the 64 bit quarters (0,2,1,3) so the per-lane unpacks below
    // produce the pixels in their original order.
    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
    "add        $0x20,%[src]                   \n"
    "vpunpckhbw %%ymm0,%%ymm0,%%ymm1           \n"
    "vpunpcklbw %%ymm0,%%ymm0,%%ymm0           \n"
    "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
    "vpmulhuw   %%ymm2,%%ymm1,%%ymm1           \n"
    "vmovdqu    %%ymm0,(%[dst])                \n"
    "vmovdqu    %%ymm1,0x20(%[dst])            \n"
    "add        $0x40,%[dst]                   \n"
    "sub        $0x20,%[width]                 \n"
    "jg         1b                             \n"
    "vzeroupper                                \n"
  : [src] "+r"(src_y),
    [dst] "+r"(dst_y),
    [width] "+r"(width)
  : [scale] "r"(scale)
  : "memory", "cc", "xmm0", "xmm1", "xmm2");
  // clang-format on
}
#endif  // HAS_CONVERT8TO16ROW_AVX2
|
||||
|
||||
#ifdef HAS_SPLITRGBROW_SSSE3
|
||||
|
||||
|
||||
@ -2733,13 +2733,14 @@ TEST_F(LibYUVPlanarTest, Convert16To8Plane) {
|
||||
// TODO(fbarchard): Improve test for more platforms.
|
||||
#ifdef HAS_CONVERT16TO8ROW_AVX2
|
||||
TEST_F(LibYUVPlanarTest, Convert16To8Row_Opt) {
|
||||
const int kPixels = benchmark_width_ * benchmark_height_;
|
||||
// AVX2 does multiple of 32, so round count up
|
||||
const int kPixels = (benchmark_width_ * benchmark_height_ + 31) & ~31;
|
||||
align_buffer_page_end(src_pixels_y, kPixels * 2);
|
||||
align_buffer_page_end(dst_pixels_y_opt, kPixels);
|
||||
align_buffer_page_end(dst_pixels_y_c, kPixels);
|
||||
|
||||
MemRandomize(src_pixels_y, kPixels * 2);
|
||||
// C code does not clamp so limit source range to 10 bits.
|
||||
// clamp source range to 10 bits.
|
||||
for (int i = 0; i < kPixels; ++i) {
|
||||
reinterpret_cast<uint16*>(src_pixels_y)[i] &= 1023;
|
||||
}
|
||||
@ -2775,6 +2776,50 @@ TEST_F(LibYUVPlanarTest, Convert16To8Row_Opt) {
|
||||
}
|
||||
#endif // HAS_CONVERT16TO8ROW_AVX2
|
||||
|
||||
// TODO(fbarchard): Improve test for more platforms.
#ifdef HAS_CONVERT8TO16ROW_AVX2
// Checks that the SIMD 8 bit -> 16 bit row (AVX2 when available, otherwise
// SSE2) produces the same 10 bit output as the C row.
TEST_F(LibYUVPlanarTest, Convert8To16Row_Opt) {
  // AVX2 does multiple of 32, so round count up
  const int kPixels = (benchmark_width_ * benchmark_height_ + 31) & ~31;
  align_buffer_page_end(src_pixels_y, kPixels);
  align_buffer_page_end(dst_pixels_y_opt, kPixels * 2);
  align_buffer_page_end(dst_pixels_y_c, kPixels * 2);

  MemRandomize(src_pixels_y, kPixels);

  // The destinations hold 16 bit samples, so they are kPixels * 2 bytes long.
  // Pre-fill all of it with different values so that any region a row
  // function fails to write shows up as a mismatch.
  memset(dst_pixels_y_opt, 0, kPixels * 2);
  memset(dst_pixels_y_c, 1, kPixels * 2);

  Convert8To16Row_C(src_pixels_y, reinterpret_cast<uint16*>(dst_pixels_y_c),
                    1024, kPixels);

  int has_avx2 = TestCpuFlag(kCpuHasAVX2);
  int has_sse2 = TestCpuFlag(kCpuHasSSE2);
  for (int i = 0; i < benchmark_iterations_; ++i) {
    if (has_avx2) {
      Convert8To16Row_AVX2(src_pixels_y,
                           reinterpret_cast<uint16*>(dst_pixels_y_opt), 1024,
                           kPixels);
    } else if (has_sse2) {
      Convert8To16Row_SSE2(src_pixels_y,
                           reinterpret_cast<uint16*>(dst_pixels_y_opt), 1024,
                           kPixels);
    } else {
      Convert8To16Row_C(src_pixels_y,
                        reinterpret_cast<uint16*>(dst_pixels_y_opt), 1024,
                        kPixels);
    }
  }

  // Compare byte by byte across the whole 16 bit output, not just its first
  // half.
  for (int i = 0; i < kPixels * 2; ++i) {
    EXPECT_EQ(dst_pixels_y_opt[i], dst_pixels_y_c[i]);
  }

  free_aligned_buffer_page_end(src_pixels_y);
  free_aligned_buffer_page_end(dst_pixels_y_opt);
  free_aligned_buffer_page_end(dst_pixels_y_c);
}
#endif  // HAS_CONVERT8TO16ROW_AVX2
|
||||
|
||||
float TestScaleMaxSamples(int benchmark_width,
|
||||
int benchmark_height,
|
||||
int benchmark_iterations,
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user