mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-07 01:06:46 +08:00
MergeUV10Row_AVX2 use multiply to handle different bit depths
Instead of a hardcoded shift, use a multiply by a scale parameter: 128 = 9 bits, 64 = 10 bits, 16 = 12 bits, 1 = 16 bits.
Bug: libyuv:751
Test: LibYUVPlanarTest.MergeUV10Row_Opt
Change-Id: Id925edfdbf91243370c90641b50eb8e7625ec329
Reviewed-on: https://chromium-review.googlesource.com/762523
Reviewed-by: richard winterton <rrwinterton@gmail.com>
Commit-Queue: Frank Barchard <fbarchard@google.com>
This commit is contained in:
parent
e26b0a7e0e
commit
2f58d126b9
@ -277,7 +277,7 @@ extern "C" {
|
||||
#if !defined(LIBYUV_DISABLE_X86) && \
|
||||
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) && \
|
||||
(defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
|
||||
#define HAS_MERGEUV10ROW_AVX2
|
||||
#define HAS_MERGEUVROW_16_AVX2
|
||||
#endif
|
||||
|
||||
// The following are available on Neon platforms:
|
||||
@ -1521,14 +1521,16 @@ void MergeRGBRow_Any_NEON(const uint8* src_r,
|
||||
uint8* dst_rgb,
|
||||
int width);
|
||||
|
||||
void MergeUV10Row_C(const uint16* src_u,
|
||||
const uint16* src_v,
|
||||
uint16* dst_uv,
|
||||
int width);
|
||||
void MergeUV10Row_AVX2(const uint16* src_u,
|
||||
const uint16* src_v,
|
||||
uint16* dst_uv,
|
||||
int width);
|
||||
void MergeUVRow_16_C(const uint16* src_u,
|
||||
const uint16* src_v,
|
||||
uint16* dst_uv,
|
||||
int scale, /* 64 for 10 bit */
|
||||
int width);
|
||||
void MergeUVRow_16_AVX2(const uint16* src_u,
|
||||
const uint16* src_v,
|
||||
uint16* dst_uv,
|
||||
int scale,
|
||||
int width);
|
||||
|
||||
void CopyRow_SSE2(const uint8* src, uint8* dst, int count);
|
||||
void CopyRow_AVX(const uint8* src, uint8* dst, int count);
|
||||
|
||||
@ -1798,21 +1798,22 @@ void MergeRGBRow_C(const uint8* src_r,
|
||||
}
|
||||
}
|
||||
|
||||
void MergeUV10Row_C(const uint16* src_u,
|
||||
const uint16* src_v,
|
||||
uint16* dst_uv,
|
||||
int width) {
|
||||
void MergeUVRow_16_C(const uint16* src_u,
|
||||
const uint16* src_v,
|
||||
uint16* dst_uv,
|
||||
int scale,
|
||||
int width) {
|
||||
int x;
|
||||
for (x = 0; x < width - 1; x += 2) {
|
||||
dst_uv[0] = src_u[x] << 6;
|
||||
dst_uv[1] = src_v[x] << 6;
|
||||
dst_uv[2] = src_u[x + 1] << 6;
|
||||
dst_uv[3] = src_v[x + 1] << 6;
|
||||
dst_uv[0] = src_u[x] * scale;
|
||||
dst_uv[1] = src_v[x] * scale;
|
||||
dst_uv[2] = src_u[x + 1] * scale;
|
||||
dst_uv[3] = src_v[x + 1] * scale;
|
||||
dst_uv += 4;
|
||||
}
|
||||
if (width & 1) {
|
||||
dst_uv[0] = src_u[width - 1] << 6;
|
||||
dst_uv[1] = src_v[width - 1] << 6;
|
||||
dst_uv[0] = src_u[width - 1] * scale;
|
||||
dst_uv[1] = src_v[width - 1] * scale;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -2753,13 +2753,23 @@ void MergeUVRow_SSE2(const uint8* src_u,
|
||||
}
|
||||
#endif // HAS_MERGEUVROW_SSE2
|
||||
|
||||
#ifdef HAS_MERGEUV10ROW_AVX2
|
||||
void MergeUV10Row_AVX2(const uint16* src_u,
|
||||
const uint16* src_v,
|
||||
uint16* dst_uv,
|
||||
int width) {
|
||||
// Use scale to convert lsb formats to msb, depending on how many bits there are:
|
||||
// 128 = 9 bits
|
||||
// 64 = 10 bits
|
||||
// 16 = 12 bits
|
||||
// 1 = 16 bits
|
||||
|
||||
#ifdef HAS_MERGEUVROW_16_AVX2
|
||||
void MergeUVRow_16_AVX2(const uint16* src_u,
|
||||
const uint16* src_v,
|
||||
uint16* dst_uv,
|
||||
int scale,
|
||||
int width) {
|
||||
// clang-format off
|
||||
asm volatile (
|
||||
"vmovd %4,%%xmm3 \n"
|
||||
"vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
|
||||
"vbroadcastss %%xmm3,%%ymm3 \n"
|
||||
"sub %0,%1 \n"
|
||||
|
||||
// 16 pixels per loop.
|
||||
@ -2768,8 +2778,9 @@ void MergeUV10Row_AVX2(const uint16* src_u,
|
||||
"vmovdqu (%0),%%ymm0 \n"
|
||||
"vmovdqu (%0,%1,1),%%ymm1 \n"
|
||||
"add $0x20,%0 \n"
|
||||
"vpsllw $0x6,%%ymm0,%%ymm0 \n"
|
||||
"vpsllw $0x6,%%ymm1,%%ymm1 \n"
|
||||
|
||||
"vpmullw %%ymm3,%%ymm0,%%ymm0 \n"
|
||||
"vpmullw %%ymm3,%%ymm1,%%ymm1 \n"
|
||||
"vpunpcklwd %%ymm1,%%ymm0,%%ymm2 \n" // mutates
|
||||
"vpunpckhwd %%ymm1,%%ymm0,%%ymm0 \n"
|
||||
"vextractf128 $0x0,%%ymm2,(%2) \n"
|
||||
@ -2784,8 +2795,8 @@ void MergeUV10Row_AVX2(const uint16* src_u,
|
||||
"+r"(src_v), // %1
|
||||
"+r"(dst_uv), // %2
|
||||
"+r"(width) // %3
|
||||
:
|
||||
: "memory", "cc", "xmm0", "xmm1", "xmm2");
|
||||
: "r"(scale) // %4
|
||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
|
||||
// clang-format on
|
||||
}
|
||||
#endif // HAS_MERGEUVROW_16_AVX2
|
||||
|
||||
@ -2618,8 +2618,8 @@ TEST_F(LibYUVPlanarTest, SplitRGBPlane_Opt) {
|
||||
}
|
||||
|
||||
// TODO(fbarchard): improve test for platforms and cpu detect
|
||||
#ifdef HAS_MERGEUV10ROW_AVX2
|
||||
TEST_F(LibYUVPlanarTest, MergeUV10Row_Opt) {
|
||||
#ifdef HAS_MERGEUVROW_16_AVX2
|
||||
TEST_F(LibYUVPlanarTest, MergeUVRow_16_Opt) {
|
||||
const int kPixels = benchmark_width_ * benchmark_height_;
|
||||
align_buffer_page_end(src_pixels_u, kPixels * 2);
|
||||
align_buffer_page_end(src_pixels_v, kPixels * 2);
|
||||
@ -2631,20 +2631,22 @@ TEST_F(LibYUVPlanarTest, MergeUV10Row_Opt) {
|
||||
memset(dst_pixels_uv_opt, 0, kPixels * 2 * 2);
|
||||
memset(dst_pixels_uv_c, 1, kPixels * 2 * 2);
|
||||
|
||||
MergeUV10Row_C(reinterpret_cast<const uint16*>(src_pixels_u),
|
||||
reinterpret_cast<const uint16*>(src_pixels_v),
|
||||
reinterpret_cast<uint16*>(dst_pixels_uv_c), kPixels);
|
||||
MergeUVRow_16_C(reinterpret_cast<const uint16*>(src_pixels_u),
|
||||
reinterpret_cast<const uint16*>(src_pixels_v),
|
||||
reinterpret_cast<uint16*>(dst_pixels_uv_c), 64, kPixels);
|
||||
|
||||
int has_avx2 = TestCpuFlag(kCpuHasAVX2);
|
||||
for (int i = 0; i < benchmark_iterations_; ++i) {
|
||||
if (has_avx2) {
|
||||
MergeUV10Row_AVX2(reinterpret_cast<const uint16*>(src_pixels_u),
|
||||
reinterpret_cast<const uint16*>(src_pixels_v),
|
||||
reinterpret_cast<uint16*>(dst_pixels_uv_opt), kPixels);
|
||||
MergeUVRow_16_AVX2(reinterpret_cast<const uint16*>(src_pixels_u),
|
||||
reinterpret_cast<const uint16*>(src_pixels_v),
|
||||
reinterpret_cast<uint16*>(dst_pixels_uv_opt), 64,
|
||||
kPixels);
|
||||
} else {
|
||||
MergeUV10Row_C(reinterpret_cast<const uint16*>(src_pixels_u),
|
||||
reinterpret_cast<const uint16*>(src_pixels_v),
|
||||
reinterpret_cast<uint16*>(dst_pixels_uv_opt), kPixels);
|
||||
MergeUVRow_16_C(reinterpret_cast<const uint16*>(src_pixels_u),
|
||||
reinterpret_cast<const uint16*>(src_pixels_v),
|
||||
reinterpret_cast<uint16*>(dst_pixels_uv_opt), 64,
|
||||
kPixels);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user