MergeUV10Row_AVX2 for converting H010 to P010

H010 is a 10 bit planar format with the significant bits in the low
bits of each 16 bit sample. P010 is a 10 bit biplanar format with the
significant bits in the high bits. This function interleaves the U and
V channels and shifts each value into the upper bits.
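
For reference, the per-sample transform is just a 6-bit left shift. A
minimal scalar sketch, using libyuv's uint16 typedef (the function name
is illustrative, not part of the library):

  // A 10-bit sample in the low bits of a uint16 (H010) moves to the
  // high bits (P010): 0x03FF << 6 == 0xFFC0.
  uint16 h010_to_p010(uint16 s) {
    return (uint16)(s << 6);
  }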

Bug: libyuv:751
Test: LibYUVPlanarTest.MergeUV10Row_Opt
Change-Id: I4a0bac0ef1ff95aa1b8d68261ec8e8e86f2d1fbf
Reviewed-on: https://chromium-review.googlesource.com/752692
Reviewed-by: Cheng Wang <wangcheng@google.com>
Reviewed-by: Frank Barchard <fbarchard@google.com>
Commit-Queue: Frank Barchard <fbarchard@google.com>
Author: Frank Barchard <fbarchard@google.com>
Date:   2017-11-02 19:52:44 -07:00
parent  75ec56b55a
commit  a0c32b9e49
4 changed files with 120 additions and 1 deletion

include/libyuv/row.h

@@ -271,7 +271,7 @@ extern "C" {
#define HAS_I422TOARGBROW_SSSE3
#endif
-// The following are available forr gcc/clang x86 platforms:
+// The following are available for gcc/clang x86 platforms:
// TODO(fbarchard): Port to Visual C
#if !defined(LIBYUV_DISABLE_X86) && \
    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
@@ -279,6 +279,14 @@ extern "C" {
#define HAS_SPLITRGBROW_SSSE3
#endif
// The following are available for AVX2 gcc/clang x86 platforms:
// TODO(fbarchard): Port to Visual C
#if !defined(LIBYUV_DISABLE_X86) && \
    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) && \
    (defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
#define HAS_MERGEUV10ROW_AVX2
#endif
// The following are available on Neon platforms:
#if !defined(LIBYUV_DISABLE_NEON) && \
    (defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON))
@@ -1523,6 +1531,15 @@ void MergeRGBRow_Any_NEON(const uint8* src_r,
                          uint8* dst_rgb,
                          int width);
void MergeUV10Row_C(const uint16* src_u,
                    const uint16* src_v,
                    uint16* dst_uv,
                    int width);
void MergeUV10Row_AVX2(const uint16* src_u,
                       const uint16* src_v,
                       uint16* dst_uv,
                       int width);
void CopyRow_SSE2(const uint8* src, uint8* dst, int count);
void CopyRow_AVX(const uint8* src, uint8* dst, int count);
void CopyRow_ERMS(const uint8* src, uint8* dst, int count);
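
This commit only adds and tests the row functions; nothing is wired
into a converter yet. A caller would likely follow libyuv's usual
runtime-dispatch pattern, sketched here under that assumption (the
function pointer is illustrative; with no _Any_ fallback in this
commit, the AVX2 row requires width to be a multiple of 16):

  void (*MergeUV10Row)(const uint16* src_u, const uint16* src_v,
                       uint16* dst_uv, int width) = MergeUV10Row_C;
  #if defined(HAS_MERGEUV10ROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 16)) {
    MergeUV10Row = MergeUV10Row_AVX2;
  }
  #endif
  // Then, per row: MergeUV10Row(src_u, src_v, dst_uv, width);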

source/row_common.cc

@@ -1798,6 +1798,24 @@ void MergeRGBRow_C(const uint8* src_r,
  }
}

void MergeUV10Row_C(const uint16* src_u,
                    const uint16* src_v,
                    uint16* dst_uv,
                    int width) {
  int x;
  for (x = 0; x < width - 1; x += 2) {
    dst_uv[0] = src_u[x] << 6;
    dst_uv[1] = src_v[x] << 6;
    dst_uv[2] = src_u[x + 1] << 6;
    dst_uv[3] = src_v[x + 1] << 6;
    dst_uv += 4;
  }
  if (width & 1) {
    dst_uv[0] = src_u[width - 1] << 6;
    dst_uv[1] = src_v[width - 1] << 6;
  }
}

void CopyRow_C(const uint8* src, uint8* dst, int count) {
  memcpy(dst, src, count);
}
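
A quick worked example of the reference row on an odd width of 3
pixels (values chosen for illustration):

  const uint16 u[3] = {0x001, 0x200, 0x3FF};
  const uint16 v[3] = {0x100, 0x080, 0x001};
  uint16 uv[6];
  MergeUV10Row_C(u, v, uv, 3);
  // uv == {0x0040, 0x4000, 0x8000, 0x2000, 0xFFC0, 0x0040}: U and V
  // interleaved, each value shifted into the top 10 bits; the last
  // pixel is written by the width & 1 tail.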

source/row_gcc.cc

@@ -2753,6 +2753,48 @@ void MergeUVRow_SSE2(const uint8* src_u,
}
#endif  // HAS_MERGEUVROW_SSE2
#ifdef HAS_MERGEUV10ROW_AVX2
void MergeUV10Row_AVX2(const uint16* src_u,
                       const uint16* src_v,
                       uint16* dst_uv,
                       int width) {
  asm volatile (
    // Compute the V-minus-U offset once so one pointer indexes both planes.
    "sub          %0,%1                        \n"
    LABELALIGN
    "1:                                        \n"
    "vmovdqu      (%0),%%ymm0                  \n"  // load 16 U samples
    "vmovdqu      (%0,%1,1),%%ymm1             \n"  // load 16 V samples
    "add          $0x20,%0                     \n"
    "vpsllw       $0x6,%%ymm0,%%ymm0           \n"  // shift into high 10 bits
    "vpsllw       $0x6,%%ymm1,%%ymm1           \n"
    // "vpermq    $0xd8,%%ymm0,%%ymm0          \n"
    // "vpermq    $0xd8,%%ymm1,%%ymm1          \n"
    // Interleave U and V words; unpack operates within each 128-bit lane.
    "vpunpcklwd   %%ymm1,%%ymm0,%%ymm2         \n"
    "vpunpckhwd   %%ymm1,%%ymm0,%%ymm0         \n"
    // "vmovdqu   %%ymm2,(%2)                  \n"
    // "vmovdqu   %%ymm0,0x20(%2)              \n"
    // Store lane by lane to restore linear pixel order.
    "vextractf128 $0x0,%%ymm2,(%2)             \n"
    "vextractf128 $0x0,%%ymm0,0x10(%2)         \n"
    "vextractf128 $0x1,%%ymm2,0x20(%2)         \n"
    "vextractf128 $0x1,%%ymm0,0x30(%2)         \n"
    "add          $0x40,%2                     \n"
    "sub          $0x10,%3                     \n"  // 16 pixels per iteration
    "jg           1b                           \n"
    "vzeroupper                                \n"
  : "+r"(src_u),   // %0
    "+r"(src_v),   // %1
    "+r"(dst_uv),  // %2
    "+r"(width)    // %3
  :
  : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif  // HAS_MERGEUV10ROW_AVX2
#ifdef HAS_SPLITRGBROW_SSSE3
// Shuffle table for converting RGB to Planar.

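vpunpcklwd/vpunpckhwd interleave within each 128-bit lane, so the
interleaved pixels come out lane-scrambled; the four vextractf128
stores write the lanes back in linear order. The commented-out
vpermq/vmovdqu lines show the alternative: pre-permute the qwords,
then use two full-width stores. A rough intrinsics rendering of one
loop iteration, offered as a readability sketch rather than the
shipped code (merge_uv10_16px is an illustrative name):

  #include <immintrin.h>
  #include <stdint.h>

  // One iteration: 16 U and 16 V samples in, 32 interleaved samples out.
  static void merge_uv10_16px(const uint16_t* u, const uint16_t* v,
                              uint16_t* uv) {
    __m256i u0 = _mm256_slli_epi16(
        _mm256_loadu_si256((const __m256i*)u), 6);  // shift to high bits
    __m256i v0 = _mm256_slli_epi16(
        _mm256_loadu_si256((const __m256i*)v), 6);
    // Per-lane interleave: lo = u0v0..u3v3 | u8v8..u11v11,
    //                      hi = u4v4..u7v7 | u12v12..u15v15.
    __m256i lo = _mm256_unpacklo_epi16(u0, v0);
    __m256i hi = _mm256_unpackhi_epi16(u0, v0);
    // Store lane by lane to restore linear pixel order, mirroring the
    // four vextractf128 stores in the asm above.
    _mm_storeu_si128((__m128i*)(uv + 0), _mm256_castsi256_si128(lo));
    _mm_storeu_si128((__m128i*)(uv + 8), _mm256_castsi256_si128(hi));
    _mm_storeu_si128((__m128i*)(uv + 16), _mm256_extracti128_si256(lo, 1));
    _mm_storeu_si128((__m128i*)(uv + 24), _mm256_extracti128_si256(hi, 1));
  }
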
unit_test/planar_test.cc

@@ -2617,6 +2617,48 @@ TEST_F(LibYUVPlanarTest, SplitRGBPlane_Opt) {
  free_aligned_buffer_page_end(dst_pixels_c);
}
// TODO(fbarchard): improve test for platforms and cpu detect
#ifdef HAS_MERGEUV10ROW_AVX2
TEST_F(LibYUVPlanarTest, MergeUV10Row_Opt) {
  const int kPixels = benchmark_width_ * benchmark_height_;
  align_buffer_page_end(src_pixels_u, kPixels * 2);
  align_buffer_page_end(src_pixels_v, kPixels * 2);
  align_buffer_page_end(dst_pixels_uv_opt, kPixels * 2 * 2);
  align_buffer_page_end(dst_pixels_uv_c, kPixels * 2 * 2);
  MemRandomize(src_pixels_u, kPixels * 2);
  MemRandomize(src_pixels_v, kPixels * 2);
  // Prefill the two outputs with different values so an untouched
  // buffer cannot pass the comparison below.
  memset(dst_pixels_uv_opt, 0, kPixels * 2 * 2);
  memset(dst_pixels_uv_c, 1, kPixels * 2 * 2);
  MergeUV10Row_C(reinterpret_cast<const uint16*>(src_pixels_u),
                 reinterpret_cast<const uint16*>(src_pixels_v),
                 reinterpret_cast<uint16*>(dst_pixels_uv_c), kPixels);
  int has_avx2 = TestCpuFlag(kCpuHasAVX2);
  for (int i = 0; i < benchmark_iterations_; ++i) {
    if (has_avx2) {
      MergeUV10Row_AVX2(reinterpret_cast<const uint16*>(src_pixels_u),
                        reinterpret_cast<const uint16*>(src_pixels_v),
                        reinterpret_cast<uint16*>(dst_pixels_uv_opt), kPixels);
    } else {
      MergeUV10Row_C(reinterpret_cast<const uint16*>(src_pixels_u),
                     reinterpret_cast<const uint16*>(src_pixels_v),
                     reinterpret_cast<uint16*>(dst_pixels_uv_opt), kPixels);
    }
  }
  for (int i = 0; i < kPixels * 2 * 2; ++i) {
    EXPECT_EQ(dst_pixels_uv_opt[i], dst_pixels_uv_c[i]);
  }
  free_aligned_buffer_page_end(src_pixels_u);
  free_aligned_buffer_page_end(src_pixels_v);
  free_aligned_buffer_page_end(dst_pixels_uv_opt);
  free_aligned_buffer_page_end(dst_pixels_uv_c);
}
#endif  // HAS_MERGEUV10ROW_AVX2
float TestScaleMaxSamples(int benchmark_width,
int benchmark_height,
int benchmark_iterations,
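
The Test: trailer in the commit message names the gtest case covering
both rows. Assuming a standard libyuv build where the tests compile
into the libyuv_unittest binary (the path depends on the build setup),
it can be run in isolation with:

  libyuv_unittest --gtest_filter=LibYUVPlanarTest.MergeUV10Row_Opt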