mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2026-01-01 03:12:16 +08:00
MultiplyRow_16_AVX2 for converting 10 bit YUV
When converting from lsb 10 bit formats to msb, the values need to be shifted to the top 10 bits. Using a multiply allows the different numbers of bits to be copied:
  128 = 9 bits
  64 = 10 bits
  16 = 12 bits
  1 = 16 bits

Bug: libyuv:751
Test: LibYUVPlanarTest.MultiplyRow_16_Opt
Change-Id: I9cf226053a164baa14155215cb175065b1c4f169
Reviewed-on: https://chromium-review.googlesource.com/762951
Reviewed-by: richard winterton <rrwinterton@gmail.com>
Reviewed-by: Frank Barchard <fbarchard@google.com>
Commit-Queue: Frank Barchard <fbarchard@google.com>
This commit is contained in:
parent
2f58d126b9
commit
49d1e3b036
@ -278,6 +278,7 @@ extern "C" {
|
||||
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) && \
|
||||
(defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
|
||||
#define HAS_MERGEUVROW_16_AVX2
|
||||
#define HAS_MULTIPLYROW_16_AVX2
|
||||
#endif
|
||||
|
||||
// The following are available on Neon platforms:
|
||||
@ -1532,6 +1533,15 @@ void MergeUVRow_16_AVX2(const uint16* src_u,
|
||||
int scale,
|
||||
int width);
|
||||
|
||||
void MultiplyRow_16_AVX2(const uint16* src_y,
|
||||
uint16* dst_y,
|
||||
int scale,
|
||||
int width);
|
||||
void MultiplyRow_16_C(const uint16* src_y,
|
||||
uint16* dst_y,
|
||||
int scale,
|
||||
int width);
|
||||
|
||||
void CopyRow_SSE2(const uint8* src, uint8* dst, int count);
|
||||
void CopyRow_AVX(const uint8* src, uint8* dst, int count);
|
||||
void CopyRow_ERMS(const uint8* src, uint8* dst, int count);
|
||||
|
||||
@ -1817,6 +1817,16 @@ void MergeUVRow_16_C(const uint16* src_u,
|
||||
}
|
||||
}
|
||||
|
||||
// Multiply each 16 bit pixel by a scale factor; C reference version.
// Used to shift lsb-justified 10/12 bit samples up to msb justification:
// scale 128 = 9 bits, 64 = 10 bits, 16 = 12 bits, 1 = 16 bits.
// The product is truncated to 16 bits on store, matching the SIMD paths.
void MultiplyRow_16_C(const uint16* src_y,
                      uint16* dst_y,
                      int scale,
                      int width) {
  const uint16* end = src_y + width;
  while (src_y < end) {
    *dst_y++ = (uint16)(*src_y++ * scale);
  }
}
|
||||
|
||||
void CopyRow_C(const uint8* src, uint8* dst, int count) {
|
||||
memcpy(dst, src, count);
|
||||
}
|
||||
|
||||
@ -2758,7 +2758,6 @@ void MergeUVRow_SSE2(const uint8* src_u,
|
||||
// 64 = 10 bits
|
||||
// 16 = 12 bits
|
||||
// 1 = 16 bits
|
||||
|
||||
#ifdef HAS_MERGEUVROW_16_AVX2
|
||||
void MergeUVRow_16_AVX2(const uint16* src_u,
|
||||
const uint16* src_v,
|
||||
@ -2801,6 +2800,41 @@ void MergeUVRow_16_AVX2(const uint16* src_u,
|
||||
}
|
||||
#endif // HAS_MERGEUVROW_AVX2
|
||||
|
||||
|
||||
#ifdef HAS_MULTIPLYROW_16_AVX2
|
||||
void MultiplyRow_16_AVX2(const uint16* src_y,
|
||||
uint16* dst_y,
|
||||
int scale,
|
||||
int width) {
|
||||
// clang-format off
|
||||
asm volatile (
|
||||
"vmovd %3,%%xmm3 \n"
|
||||
"vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
|
||||
"vbroadcastss %%xmm3,%%ymm3 \n"
|
||||
"sub %0,%1 \n"
|
||||
|
||||
// 16 pixels per loop.
|
||||
LABELALIGN
|
||||
"1: \n"
|
||||
"vmovdqu (%0),%%ymm0 \n"
|
||||
"vmovdqu 0x20(%0),%%ymm1 \n"
|
||||
"vpmullw %%ymm3,%%ymm0,%%ymm0 \n"
|
||||
"vpmullw %%ymm3,%%ymm1,%%ymm1 \n"
|
||||
"vmovdqu %%ymm0,(%0,%1) \n"
|
||||
"vmovdqu %%ymm1,0x20(%0,%1) \n"
|
||||
"add $0x40,%0 \n"
|
||||
"sub $0x20,%2 \n"
|
||||
"jg 1b \n"
|
||||
"vzeroupper \n"
|
||||
: "+r"(src_y), // %0
|
||||
"+r"(dst_y), // %1
|
||||
"+r"(width) // %2
|
||||
: "r"(scale) // %3
|
||||
: "memory", "cc", "xmm0", "xmm1", "xmm3");
|
||||
// clang-format on
|
||||
}
|
||||
#endif // HAS_MULTIPLYROW_16_AVX2
|
||||
|
||||
#ifdef HAS_SPLITRGBROW_SSSE3
|
||||
|
||||
// Shuffle table for converting RGB to Planar.
|
||||
|
||||
@ -2661,6 +2661,44 @@ TEST_F(LibYUVPlanarTest, MergeUVRow_16_Opt) {
|
||||
}
|
||||
#endif
|
||||
|
||||
// TODO(fbarchard): improve test for platforms and cpu detect
|
||||
#ifdef HAS_MULTIPLYROW_16_AVX2
// Check MultiplyRow_16_AVX2 against the C reference and benchmark it.
// Scale 64 shifts lsb-justified 10 bit samples up to msb justification.
TEST_F(LibYUVPlanarTest, MultiplyRow_16_Opt) {
  const int kPixels = benchmark_width_ * benchmark_height_;
  align_buffer_page_end(src_pixels_y, kPixels * 2);
  align_buffer_page_end(dst_pixels_y_opt, kPixels * 2);
  align_buffer_page_end(dst_pixels_y_c, kPixels * 2);

  MemRandomize(src_pixels_y, kPixels * 2);
  // Prefill the two destinations differently so an untouched buffer
  // cannot compare equal by accident.
  memset(dst_pixels_y_opt, 0, kPixels * 2);
  memset(dst_pixels_y_c, 1, kPixels * 2);

  const uint16* src = reinterpret_cast<const uint16*>(src_pixels_y);
  uint16* dst_c = reinterpret_cast<uint16*>(dst_pixels_y_c);
  uint16* dst_opt = reinterpret_cast<uint16*>(dst_pixels_y_opt);

  MultiplyRow_16_C(src, dst_c, 64, kPixels);

  const int has_avx2 = TestCpuFlag(kCpuHasAVX2);
  for (int i = 0; i < benchmark_iterations_; ++i) {
    if (has_avx2) {
      MultiplyRow_16_AVX2(src, dst_opt, 64, kPixels);
    } else {
      MultiplyRow_16_C(src, dst_opt, 64, kPixels);
    }
  }

  for (int i = 0; i < kPixels * 2; ++i) {
    EXPECT_EQ(dst_pixels_y_opt[i], dst_pixels_y_c[i]);
  }

  free_aligned_buffer_page_end(src_pixels_y);
  free_aligned_buffer_page_end(dst_pixels_y_opt);
  free_aligned_buffer_page_end(dst_pixels_y_c);
}
#endif
|
||||
|
||||
float TestScaleMaxSamples(int benchmark_width,
|
||||
int benchmark_height,
|
||||
int benchmark_iterations,
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user