mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-07 01:06:46 +08:00
MergeUV10Row_AVX2 use multiply to handle different bit depths
Instead of a hardcoded shift, use a multiply by a parameter. The scale selects the source bit depth: 128 = 9 bits, 64 = 10 bits, 16 = 12 bits, 1 = 16 bits. Bug: libyuv:751. Test: LibYUVPlanarTest.MergeUV10Row_Opt. Change-Id: Id925edfdbf91243370c90641b50eb8e7625ec329. Reviewed-on: https://chromium-review.googlesource.com/762523. Reviewed-by: richard winterton <rrwinterton@gmail.com>. Commit-Queue: Frank Barchard <fbarchard@google.com>
This commit is contained in:
parent
e26b0a7e0e
commit
2f58d126b9
@ -277,7 +277,7 @@ extern "C" {
|
|||||||
#if !defined(LIBYUV_DISABLE_X86) && \
|
#if !defined(LIBYUV_DISABLE_X86) && \
|
||||||
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) && \
|
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) && \
|
||||||
(defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
|
(defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
|
||||||
#define HAS_MERGEUV10ROW_AVX2
|
#define HAS_MERGEUVROW_16_AVX2
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// The following are available on Neon platforms:
|
// The following are available on Neon platforms:
|
||||||
@ -1521,13 +1521,15 @@ void MergeRGBRow_Any_NEON(const uint8* src_r,
|
|||||||
uint8* dst_rgb,
|
uint8* dst_rgb,
|
||||||
int width);
|
int width);
|
||||||
|
|
||||||
void MergeUV10Row_C(const uint16* src_u,
|
void MergeUVRow_16_C(const uint16* src_u,
|
||||||
const uint16* src_v,
|
const uint16* src_v,
|
||||||
uint16* dst_uv,
|
uint16* dst_uv,
|
||||||
|
int scale, /* 64 for 10 bit */
|
||||||
int width);
|
int width);
|
||||||
void MergeUV10Row_AVX2(const uint16* src_u,
|
void MergeUVRow_16_AVX2(const uint16* src_u,
|
||||||
const uint16* src_v,
|
const uint16* src_v,
|
||||||
uint16* dst_uv,
|
uint16* dst_uv,
|
||||||
|
int scale,
|
||||||
int width);
|
int width);
|
||||||
|
|
||||||
void CopyRow_SSE2(const uint8* src, uint8* dst, int count);
|
void CopyRow_SSE2(const uint8* src, uint8* dst, int count);
|
||||||
|
|||||||
@ -1798,21 +1798,22 @@ void MergeRGBRow_C(const uint8* src_r,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void MergeUV10Row_C(const uint16* src_u,
|
void MergeUVRow_16_C(const uint16* src_u,
|
||||||
const uint16* src_v,
|
const uint16* src_v,
|
||||||
uint16* dst_uv,
|
uint16* dst_uv,
|
||||||
|
int scale,
|
||||||
int width) {
|
int width) {
|
||||||
int x;
|
int x;
|
||||||
for (x = 0; x < width - 1; x += 2) {
|
for (x = 0; x < width - 1; x += 2) {
|
||||||
dst_uv[0] = src_u[x] << 6;
|
dst_uv[0] = src_u[x] * scale;
|
||||||
dst_uv[1] = src_v[x] << 6;
|
dst_uv[1] = src_v[x] * scale;
|
||||||
dst_uv[2] = src_u[x + 1] << 6;
|
dst_uv[2] = src_u[x + 1] * scale;
|
||||||
dst_uv[3] = src_v[x + 1] << 6;
|
dst_uv[3] = src_v[x + 1] * scale;
|
||||||
dst_uv += 4;
|
dst_uv += 4;
|
||||||
}
|
}
|
||||||
if (width & 1) {
|
if (width & 1) {
|
||||||
dst_uv[0] = src_u[width - 1] << 6;
|
dst_uv[0] = src_u[width - 1] * scale;
|
||||||
dst_uv[1] = src_v[width - 1] << 6;
|
dst_uv[1] = src_v[width - 1] * scale;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -2753,13 +2753,23 @@ void MergeUVRow_SSE2(const uint8* src_u,
|
|||||||
}
|
}
|
||||||
#endif // HAS_MERGEUVROW_SSE2
|
#endif // HAS_MERGEUVROW_SSE2
|
||||||
|
|
||||||
#ifdef HAS_MERGEUV10ROW_AVX2
|
// Use scale to convert lsb formats to msb, depending how many bits there are:
|
||||||
void MergeUV10Row_AVX2(const uint16* src_u,
|
// 128 = 9 bits
|
||||||
|
// 64 = 10 bits
|
||||||
|
// 16 = 12 bits
|
||||||
|
// 1 = 16 bits
|
||||||
|
|
||||||
|
#ifdef HAS_MERGEUVROW_16_AVX2
|
||||||
|
void MergeUVRow_16_AVX2(const uint16* src_u,
|
||||||
const uint16* src_v,
|
const uint16* src_v,
|
||||||
uint16* dst_uv,
|
uint16* dst_uv,
|
||||||
|
int scale,
|
||||||
int width) {
|
int width) {
|
||||||
// clang-format off
|
// clang-format off
|
||||||
asm volatile (
|
asm volatile (
|
||||||
|
"vmovd %4,%%xmm3 \n"
|
||||||
|
"vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
|
||||||
|
"vbroadcastss %%xmm3,%%ymm3 \n"
|
||||||
"sub %0,%1 \n"
|
"sub %0,%1 \n"
|
||||||
|
|
||||||
// 16 pixels per loop.
|
// 16 pixels per loop.
|
||||||
@ -2768,8 +2778,9 @@ void MergeUV10Row_AVX2(const uint16* src_u,
|
|||||||
"vmovdqu (%0),%%ymm0 \n"
|
"vmovdqu (%0),%%ymm0 \n"
|
||||||
"vmovdqu (%0,%1,1),%%ymm1 \n"
|
"vmovdqu (%0,%1,1),%%ymm1 \n"
|
||||||
"add $0x20,%0 \n"
|
"add $0x20,%0 \n"
|
||||||
"vpsllw $0x6,%%ymm0,%%ymm0 \n"
|
|
||||||
"vpsllw $0x6,%%ymm1,%%ymm1 \n"
|
"vpmullw %%ymm3,%%ymm0,%%ymm0 \n"
|
||||||
|
"vpmullw %%ymm3,%%ymm1,%%ymm1 \n"
|
||||||
"vpunpcklwd %%ymm1,%%ymm0,%%ymm2 \n" // mutates
|
"vpunpcklwd %%ymm1,%%ymm0,%%ymm2 \n" // mutates
|
||||||
"vpunpckhwd %%ymm1,%%ymm0,%%ymm0 \n"
|
"vpunpckhwd %%ymm1,%%ymm0,%%ymm0 \n"
|
||||||
"vextractf128 $0x0,%%ymm2,(%2) \n"
|
"vextractf128 $0x0,%%ymm2,(%2) \n"
|
||||||
@ -2784,8 +2795,8 @@ void MergeUV10Row_AVX2(const uint16* src_u,
|
|||||||
"+r"(src_v), // %1
|
"+r"(src_v), // %1
|
||||||
"+r"(dst_uv), // %2
|
"+r"(dst_uv), // %2
|
||||||
"+r"(width) // %3
|
"+r"(width) // %3
|
||||||
:
|
: "r"(scale) // %4
|
||||||
: "memory", "cc", "xmm0", "xmm1", "xmm2");
|
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
|
||||||
// clang-format on
|
// clang-format on
|
||||||
}
|
}
|
||||||
#endif // HAS_MERGEUVROW_AVX2
|
#endif // HAS_MERGEUVROW_AVX2
|
||||||
|
|||||||
@ -2618,8 +2618,8 @@ TEST_F(LibYUVPlanarTest, SplitRGBPlane_Opt) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// TODO(fbarchard): improve test for platforms and cpu detect
|
// TODO(fbarchard): improve test for platforms and cpu detect
|
||||||
#ifdef HAS_MERGEUV10ROW_AVX2
|
#ifdef HAS_MERGEUVROW_16_AVX2
|
||||||
TEST_F(LibYUVPlanarTest, MergeUV10Row_Opt) {
|
TEST_F(LibYUVPlanarTest, MergeUVRow_16_Opt) {
|
||||||
const int kPixels = benchmark_width_ * benchmark_height_;
|
const int kPixels = benchmark_width_ * benchmark_height_;
|
||||||
align_buffer_page_end(src_pixels_u, kPixels * 2);
|
align_buffer_page_end(src_pixels_u, kPixels * 2);
|
||||||
align_buffer_page_end(src_pixels_v, kPixels * 2);
|
align_buffer_page_end(src_pixels_v, kPixels * 2);
|
||||||
@ -2631,20 +2631,22 @@ TEST_F(LibYUVPlanarTest, MergeUV10Row_Opt) {
|
|||||||
memset(dst_pixels_uv_opt, 0, kPixels * 2 * 2);
|
memset(dst_pixels_uv_opt, 0, kPixels * 2 * 2);
|
||||||
memset(dst_pixels_uv_c, 1, kPixels * 2 * 2);
|
memset(dst_pixels_uv_c, 1, kPixels * 2 * 2);
|
||||||
|
|
||||||
MergeUV10Row_C(reinterpret_cast<const uint16*>(src_pixels_u),
|
MergeUVRow_16_C(reinterpret_cast<const uint16*>(src_pixels_u),
|
||||||
reinterpret_cast<const uint16*>(src_pixels_v),
|
reinterpret_cast<const uint16*>(src_pixels_v),
|
||||||
reinterpret_cast<uint16*>(dst_pixels_uv_c), kPixels);
|
reinterpret_cast<uint16*>(dst_pixels_uv_c), 64, kPixels);
|
||||||
|
|
||||||
int has_avx2 = TestCpuFlag(kCpuHasAVX2);
|
int has_avx2 = TestCpuFlag(kCpuHasAVX2);
|
||||||
for (int i = 0; i < benchmark_iterations_; ++i) {
|
for (int i = 0; i < benchmark_iterations_; ++i) {
|
||||||
if (has_avx2) {
|
if (has_avx2) {
|
||||||
MergeUV10Row_AVX2(reinterpret_cast<const uint16*>(src_pixels_u),
|
MergeUVRow_16_AVX2(reinterpret_cast<const uint16*>(src_pixels_u),
|
||||||
reinterpret_cast<const uint16*>(src_pixels_v),
|
reinterpret_cast<const uint16*>(src_pixels_v),
|
||||||
reinterpret_cast<uint16*>(dst_pixels_uv_opt), kPixels);
|
reinterpret_cast<uint16*>(dst_pixels_uv_opt), 64,
|
||||||
|
kPixels);
|
||||||
} else {
|
} else {
|
||||||
MergeUV10Row_C(reinterpret_cast<const uint16*>(src_pixels_u),
|
MergeUVRow_16_C(reinterpret_cast<const uint16*>(src_pixels_u),
|
||||||
reinterpret_cast<const uint16*>(src_pixels_v),
|
reinterpret_cast<const uint16*>(src_pixels_v),
|
||||||
reinterpret_cast<uint16*>(dst_pixels_uv_opt), kPixels);
|
reinterpret_cast<uint16*>(dst_pixels_uv_opt), 64,
|
||||||
|
kPixels);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user