mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-06 16:56:55 +08:00
I420ToARGB for AVX512
On Skylake Xeon:
AVX512 I420ToARGB_Opt (2050 ms)
AVX2 I420ToARGB_Opt (2533 ms)
SSSE3 I420ToARGB_Opt (3688 ms)

Bug: libyuv:911
Change-Id: I2214cc15dec24b06541895ca59d88990edbb2216
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/3382100
Reviewed-by: richard winterton <rrwinterton@gmail.com>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
parent
cdd62da670
commit
90ffd5cba9
@ -398,6 +398,13 @@ extern "C" {
|
||||
#define HAS_ARGBTORGB24ROW_AVX512VBMI
|
||||
#endif
|
||||
|
||||
// The following are available for AVX512 clang x64 platforms:
|
||||
// TODO(fbarchard): Port to x86
|
||||
// HAS_I422TOARGBROW_AVX512BW is defined only when LIBYUV_DISABLE_X86 is not
// defined and both __x86_64__ and CLANG_HAS_AVX512 are defined. It gates the
// I422ToARGBRow_AVX512BW row function and the code that dispatches to it.
#if !defined(LIBYUV_DISABLE_X86) && \
|
||||
defined(__x86_64__) && (defined(CLANG_HAS_AVX512))
|
||||
#define HAS_I422TOARGBROW_AVX512BW
|
||||
#endif
|
||||
|
||||
// The following are available on Neon platforms:
|
||||
#if !defined(LIBYUV_DISABLE_NEON) && \
|
||||
(defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON))
|
||||
@ -3027,6 +3034,12 @@ void I422ToARGBRow_AVX2(const uint8_t* y_buf,
|
||||
uint8_t* dst_argb,
|
||||
const struct YuvConstants* yuvconstants,
|
||||
int width);
|
||||
// AVX512BW row function: converts one row of Y, U and V samples to ARGB using
// the coefficients in yuvconstants. The implementation consumes 32 pixels per
// loop iteration, so it is meant for widths that are a multiple of 32; other
// widths go through I422ToARGBRow_Any_AVX512BW.
void I422ToARGBRow_AVX512BW(const uint8_t* y_buf,
|
||||
const uint8_t* u_buf,
|
||||
const uint8_t* v_buf,
|
||||
uint8_t* dst_argb,
|
||||
const struct YuvConstants* yuvconstants,
|
||||
int width);
|
||||
void I422ToRGBARow_AVX2(const uint8_t* y_buf,
|
||||
const uint8_t* u_buf,
|
||||
const uint8_t* v_buf,
|
||||
@ -3368,6 +3381,12 @@ void I422ToARGBRow_Any_AVX2(const uint8_t* y_buf,
|
||||
uint8_t* dst_ptr,
|
||||
const struct YuvConstants* yuvconstants,
|
||||
int width);
|
||||
// Wrapper around I422ToARGBRow_AVX512BW generated by the ANY31C macro. The
// dispatch code selects it when AVX512BW and AVX512VL are both reported and
// width is not a multiple of 32.
void I422ToARGBRow_Any_AVX512BW(const uint8_t* y_buf,
|
||||
const uint8_t* u_buf,
|
||||
const uint8_t* v_buf,
|
||||
uint8_t* dst_ptr,
|
||||
const struct YuvConstants* yuvconstants,
|
||||
int width);
|
||||
void I422ToRGBARow_Any_AVX2(const uint8_t* y_buf,
|
||||
const uint8_t* u_buf,
|
||||
const uint8_t* v_buf,
|
||||
|
||||
@ -7,7 +7,6 @@
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "libyuv/convert_argb.h"
|
||||
|
||||
#include "libyuv/cpu_id.h"
|
||||
@ -90,6 +89,14 @@ int I420ToARGBMatrix(const uint8_t* src_y,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_I422TOARGBROW_AVX512BW)
|
||||
if (TestCpuFlag(kCpuHasAVX512BW | kCpuHasAVX512VL) == (kCpuHasAVX512BW | kCpuHasAVX512VL)) {
|
||||
I422ToARGBRow = I422ToARGBRow_Any_AVX512BW;
|
||||
if (IS_ALIGNED(width, 32)) {
|
||||
I422ToARGBRow = I422ToARGBRow_AVX512BW;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_I422TOARGBROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
I422ToARGBRow = I422ToARGBRow_Any_NEON;
|
||||
@ -321,6 +328,14 @@ int I422ToARGBMatrix(const uint8_t* src_y,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_I422TOARGBROW_AVX512BW)
|
||||
if (TestCpuFlag(kCpuHasAVX512BW | kCpuHasAVX512VL) == (kCpuHasAVX512BW | kCpuHasAVX512VL)) {
|
||||
I422ToARGBRow = I422ToARGBRow_Any_AVX512BW;
|
||||
if (IS_ALIGNED(width, 32)) {
|
||||
I422ToARGBRow = I422ToARGBRow_AVX512BW;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_I422TOARGBROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
I422ToARGBRow = I422ToARGBRow_Any_NEON;
|
||||
@ -5142,6 +5157,14 @@ int I420ToRGB565Dither(const uint8_t* src_y,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_I422TOARGBROW_AVX512BW)
|
||||
if (TestCpuFlag(kCpuHasAVX512BW | kCpuHasAVX512VL) == (kCpuHasAVX512BW | kCpuHasAVX512VL)) {
|
||||
I422ToARGBRow = I422ToARGBRow_Any_AVX512BW;
|
||||
if (IS_ALIGNED(width, 32)) {
|
||||
I422ToARGBRow = I422ToARGBRow_AVX512BW;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_I422TOARGBROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
I422ToARGBRow = I422ToARGBRow_Any_NEON;
|
||||
|
||||
@ -374,6 +374,9 @@ ANY31C(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 31)
|
||||
#ifdef HAS_I422TOARGBROW_AVX2
|
||||
ANY31C(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, 1, 0, 4, 15)
|
||||
#endif
|
||||
// Instantiates I422ToARGBRow_Any_AVX512BW as a wrapper around
// I422ToARGBRow_AVX512BW.
// NOTE(review): the last argument (31) is presumably the width mask matching
// the kernel's 32-pixel step, and 4 the output bytes per pixel - confirm
// against the ANY31C definition.
#ifdef HAS_I422TOARGBROW_AVX512BW
|
||||
ANY31C(I422ToARGBRow_Any_AVX512BW, I422ToARGBRow_AVX512BW, 1, 0, 4, 31)
|
||||
#endif
|
||||
#ifdef HAS_I422TORGBAROW_AVX2
|
||||
ANY31C(I422ToRGBARow_Any_AVX2, I422ToRGBARow_AVX2, 1, 0, 4, 15)
|
||||
#endif
|
||||
|
||||
@ -3181,6 +3181,21 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
|
||||
"vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
|
||||
"lea 0x10(%[y_buf]),%[y_buf] \n"
|
||||
|
||||
// Read 16 U, 16 V and 32 Y bytes for the 32-pixel AVX512BW path.
// Preconditions: %[v_buf] holds the offset of the V row relative to %[u_buf]
// (V is addressed as u_buf + v_buf), and zmm16 / zmm17 hold the vpermq index
// vectors loaded by YUVTORGB_SETUP_AVX512BW.
// U and V are spread with vpermq, byte-interleaved, then each UV word is
// duplicated (vpunpcklwd of zmm3 with itself); the result is left in zmm3.
// Y is spread with vpermq and each byte is duplicated (vpunpcklbw of zmm4 with
// itself); the result is left in zmm4. zmm1 is used as scratch.
// Advances %[u_buf] by 0x10 and %[y_buf] by 0x20.
#define READYUV422_AVX512BW \
|
||||
"vmovdqu (%[u_buf]),%%xmm3 \n" \
|
||||
"vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
|
||||
"vpermq %%zmm3,%%zmm16,%%zmm3 \n" \
|
||||
"vpermq %%zmm1,%%zmm16,%%zmm1 \n" \
|
||||
"lea 0x10(%[u_buf]),%[u_buf] \n" \
|
||||
"vpunpcklbw %%zmm1,%%zmm3,%%zmm3 \n" \
|
||||
"vpermq $0xd8,%%zmm3,%%zmm3 \n" \
|
||||
"vpunpcklwd %%zmm3,%%zmm3,%%zmm3 \n" \
|
||||
"vmovdqu8 (%[y_buf]),%%ymm4 \n" \
|
||||
"vpermq %%zmm4,%%zmm17,%%zmm4 \n" \
|
||||
"vpermq $0xd8,%%zmm4,%%zmm4 \n" \
|
||||
"vpunpcklbw %%zmm4,%%zmm4,%%zmm4 \n" \
|
||||
"lea 0x20(%[y_buf]),%[y_buf] \n"
|
||||
|
||||
// Read 8 UV from 210, upsample to 16 UV
|
||||
// TODO(fbarchard): Consider vshufb to replace pack/unpack
|
||||
// TODO(fbarchard): Consider vunpcklpd to combine the 2 registers into 1.
|
||||
@ -3356,6 +3371,7 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
|
||||
"vpshufb %[kShuffleUYVYUV], %%ymm3, %%ymm3 \n" \
|
||||
"lea 0x20(%[uyvy_buf]),%[uyvy_buf] \n"
|
||||
|
||||
// TODO(fbarchard): Remove broadcastb
|
||||
#if defined(__x86_64__)
|
||||
#define YUVTORGB_SETUP_AVX2(yuvconstants) \
|
||||
"vpcmpeqb %%xmm13,%%xmm13,%%xmm13 \n" \
|
||||
@ -3367,6 +3383,24 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
|
||||
"vmovdqa 96(%[yuvconstants]),%%ymm11 \n" \
|
||||
"vmovdqa 128(%[yuvconstants]),%%ymm12 \n"
|
||||
|
||||
// Load the loop-invariant registers used by the AVX512BW YUV-to-RGB macros:
//   zmm8           - low quadword of the 16 bytes at offset 0 of yuvconstants,
//                    broadcast to every quadword.
//   zmm9 .. zmm12  - quadwords at byte offsets 32, 64, 96 and 128 of
//                    yuvconstants, each broadcast to every quadword.
//   zmm13          - 0x80 in every byte (all-ones words shifted left by 7,
//                    then the low byte broadcast).
//   zmm16 .. zmm18 - 64-byte tables read from the %[quadsplitperm],
//                    %[dquadsplitperm] and %[unperm] asm operands.
// The enclosing asm statement must therefore provide those three operands in
// addition to %[yuvconstants].
// NOTE(review): movdqa and movq below are legacy-SSE mnemonics placed between
// VEX/EVEX instructions, some after a zmm register has been written; consider
// vmovdqa / vmovq to avoid SSE/AVX state transitions - confirm on target CPUs.
#define YUVTORGB_SETUP_AVX512BW(yuvconstants) \
|
||||
"vpcmpeqb %%xmm13,%%xmm13,%%xmm13 \n" \
|
||||
"movdqa (%[yuvconstants]),%%xmm8 \n" \
|
||||
"vpbroadcastq %%xmm8, %%zmm8 \n" \
|
||||
"vpsllw $7,%%xmm13,%%xmm13 \n" \
|
||||
"vpbroadcastb %%xmm13,%%zmm13 \n" \
|
||||
"movq 32(%[yuvconstants]),%%xmm9 \n" \
|
||||
"vpbroadcastq %%xmm9,%%zmm9 \n" \
|
||||
"movq 64(%[yuvconstants]),%%xmm10 \n" \
|
||||
"vpbroadcastq %%xmm10,%%zmm10 \n" \
|
||||
"movq 96(%[yuvconstants]),%%xmm11 \n" \
|
||||
"vpbroadcastq %%xmm11,%%zmm11 \n" \
|
||||
"movq 128(%[yuvconstants]),%%xmm12 \n" \
|
||||
"vpbroadcastq %%xmm12,%%zmm12 \n" \
|
||||
"vmovdqu8 (%[quadsplitperm]),%%zmm16 \n" \
|
||||
"vmovdqu8 (%[dquadsplitperm]),%%zmm17 \n" \
|
||||
"vmovdqu8 (%[unperm]),%%zmm18 \n"
|
||||
|
||||
#define YUVTORGB16_AVX2(yuvconstants) \
|
||||
"vpsubb %%ymm13,%%ymm3,%%ymm3 \n" \
|
||||
"vpmulhuw %%ymm11,%%ymm4,%%ymm4 \n" \
|
||||
@ -3378,7 +3412,20 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
|
||||
"vpsubsw %%ymm1,%%ymm4,%%ymm1 \n" \
|
||||
"vpaddsw %%ymm4,%%ymm2,%%ymm2 \n"
|
||||
|
||||
// Core YUV-to-RGB arithmetic on 512-bit registers, producing 16-bit results.
// Inputs: zmm3 = UV bytes, zmm4 = Y words (from READYUV422_AVX512BW), and the
// constants in zmm8 .. zmm13 (from YUVTORGB_SETUP_AVX512BW).
//   zmm3 -= zmm13 (bytewise)
//   zmm4  = high 16 bits of the unsigned product zmm4 * zmm11, plus zmm12
//   zmm0  = zmm4 + maddubs(zmm8,  zmm3)   (saturating add)
//   zmm1  = zmm4 - maddubs(zmm9,  zmm3)   (saturating subtract)
//   zmm2  = zmm4 + maddubs(zmm10, zmm3)   (saturating add)
// The yuvconstants macro argument is not referenced in the expansion; the
// constants must already be in registers.
#define YUVTORGB16_AVX512BW(yuvconstants) \
|
||||
"vpsubb %%zmm13,%%zmm3,%%zmm3 \n" \
|
||||
"vpmulhuw %%zmm11,%%zmm4,%%zmm4 \n" \
|
||||
"vpmaddubsw %%zmm3,%%zmm8,%%zmm0 \n" \
|
||||
"vpmaddubsw %%zmm3,%%zmm9,%%zmm1 \n" \
|
||||
"vpmaddubsw %%zmm3,%%zmm10,%%zmm2 \n" \
|
||||
"vpaddw %%zmm4,%%zmm12,%%zmm4 \n" \
|
||||
"vpaddsw %%zmm4,%%zmm0,%%zmm0 \n" \
|
||||
"vpsubsw %%zmm1,%%zmm4,%%zmm1 \n" \
|
||||
"vpaddsw %%zmm4,%%zmm2,%%zmm2 \n"
|
||||
|
||||
#define YUVTORGB_REGS_AVX2 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13",
|
||||
// Clobber-list entries for the registers written by YUVTORGB_SETUP_AVX512BW
// (8-13 and 16-18, named by their xmm aliases). Ends with a comma so further
// clobbers can follow it directly in the asm statement.
#define YUVTORGB_REGS_AVX512BW \
|
||||
"xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm16", "xmm17", "xmm18",
|
||||
|
||||
#else // Convert 16 pixels: 16 UV and 16 Y.
|
||||
|
||||
@ -3413,6 +3460,15 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
|
||||
"vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \
|
||||
"vpackuswb %%ymm2,%%ymm2,%%ymm2 \n"
|
||||
|
||||
// Convert the YUV data in zmm3 / zmm4 to three registers of 8-bit channels.
// Runs YUVTORGB16_AVX512BW, arithmetic-shifts the 16-bit results in zmm0, zmm1
// and zmm2 right by 6, then packs each to unsigned bytes with saturation.
// Each register is packed with itself, so within every 128-bit lane the low
// and high 8 bytes hold the same packed values.
#define YUVTORGB_AVX512BW(yuvconstants) \
|
||||
YUVTORGB16_AVX512BW(yuvconstants) \
|
||||
"vpsraw $0x6,%%zmm0,%%zmm0 \n" \
|
||||
"vpsraw $0x6,%%zmm1,%%zmm1 \n" \
|
||||
"vpsraw $0x6,%%zmm2,%%zmm2 \n" \
|
||||
"vpackuswb %%zmm0,%%zmm0,%%zmm0 \n" \
|
||||
"vpackuswb %%zmm1,%%zmm1,%%zmm1 \n" \
|
||||
"vpackuswb %%zmm2,%%zmm2,%%zmm2 \n"
|
||||
|
||||
// Store 16 ARGB values.
|
||||
#define STOREARGB_AVX2 \
|
||||
"vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
|
||||
@ -3425,6 +3481,18 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
|
||||
"vmovdqu %%ymm0,0x20(%[dst_argb]) \n" \
|
||||
"lea 0x40(%[dst_argb]), %[dst_argb] \n"
|
||||
|
||||
// Store 32 ARGB values.
// Inputs: packed channel bytes in zmm0, zmm1 and zmm2, the fourth-byte source
// in zmm5, and the quadword index vector in zmm18.
// Byte-interleaves zmm0 with zmm1 and zmm2 with zmm5, reorders the quadwords
// of both results with vpermq, then word-interleaves them so each 4-byte pixel
// takes its bytes, in order, from zmm0, zmm1, zmm2 and zmm5.
// Writes 64 bytes at %[dst_argb] and 64 bytes at %[dst_argb] + 0x40, then
// advances %[dst_argb] by 0x80. Overwrites zmm0, zmm1 and zmm2.
|
||||
#define STOREARGB_AVX512BW \
|
||||
"vpunpcklbw %%zmm1,%%zmm0,%%zmm0 \n" \
|
||||
"vpermq %%zmm0,%%zmm18,%%zmm0 \n" \
|
||||
"vpunpcklbw %%zmm5,%%zmm2,%%zmm2 \n" \
|
||||
"vpermq %%zmm2,%%zmm18,%%zmm2 \n" \
|
||||
"vpunpcklwd %%zmm2,%%zmm0,%%zmm1 \n" \
|
||||
"vpunpckhwd %%zmm2,%%zmm0,%%zmm0 \n" \
|
||||
"vmovdqu8 %%zmm1,(%[dst_argb]) \n" \
|
||||
"vmovdqu8 %%zmm0,0x40(%[dst_argb]) \n" \
|
||||
"lea 0x80(%[dst_argb]), %[dst_argb] \n"
|
||||
|
||||
// Store 16 AR30 values.
|
||||
#define STOREAR30_AVX2 \
|
||||
"vpsraw $0x4,%%ymm0,%%ymm0 \n" \
|
||||
@ -3521,6 +3589,50 @@ void OMITFP I422ToARGBRow_AVX2(const uint8_t* y_buf,
|
||||
}
|
||||
#endif // HAS_I422TOARGBROW_AVX2
|
||||
|
||||
#if defined(HAS_I422TOARGBROW_AVX512BW)
|
||||
// Quadword index tables for vpermq (one 64-bit index per destination
// quadword). They are passed to the asm statement below as the
// quadsplitperm, dquadsplitperm and unperm operands and loaded into
// zmm16, zmm17 and zmm18 by YUVTORGB_SETUP_AVX512BW.
static const uint64_t kSplitQuadWords[8] = {0, 2, 2, 2, 1, 2, 2, 2};
|
||||
static const uint64_t kSplitDoubleQuadWords[8] = {0, 1, 4, 4, 2, 3, 4, 4};
|
||||
static const uint64_t kUnpermuteAVX512[8] = {0, 4, 1, 5, 2, 6, 3, 7};
|
||||
|
||||
// 32 pixels
|
||||
// 16 UV values upsampled to 32 UV, mixed with 32 Y producing 32 ARGB (128
|
||||
// bytes).
//
// y_buf:        source Y row; 32 bytes are read per iteration.
// u_buf, v_buf: source U and V rows; 16 bytes of each are read per iteration.
// dst_argb:     destination row; 128 bytes are written per iteration.
// yuvconstants: conversion constants, read at byte offsets 0, 32, 64, 96, 128.
// width:        pixel count. The loop subtracts 32 per iteration and repeats
//               while the remainder is positive, so a width that is not a
//               multiple of 32 is processed as if rounded up to the next one.
|
||||
void OMITFP I422ToARGBRow_AVX512BW(const uint8_t* y_buf,
|
||||
const uint8_t* u_buf,
|
||||
const uint8_t* v_buf,
|
||||
uint8_t* dst_argb,
|
||||
const struct YuvConstants* yuvconstants,
|
||||
int width) {
|
||||
asm volatile (
|
||||
YUVTORGB_SETUP_AVX512BW(yuvconstants)
|
||||
// Turn v_buf into an offset from u_buf; the read macro addresses V as
// u_buf + v_buf and only advances u_buf.
"sub %[u_buf],%[v_buf] \n"
|
||||
// zmm5 = all ones: the fourth byte of every stored pixel is 0xff.
"vpcmpeqb %%xmm5,%%xmm5,%%xmm5 \n"
|
||||
"vpbroadcastq %%xmm5,%%zmm5 \n"
|
||||
|
||||
LABELALIGN
|
||||
"1: \n"
|
||||
READYUV422_AVX512BW
|
||||
YUVTORGB_AVX512BW(yuvconstants)
|
||||
STOREARGB_AVX512BW
|
||||
// 32 pixels handled per iteration; loop while the remaining count is > 0.
"sub $0x20,%[width] \n"
|
||||
"jg 1b \n"
|
||||
|
||||
// Clear the upper vector state before returning.
"vzeroupper \n"
|
||||
: [y_buf]"+r"(y_buf), // %[y_buf]
|
||||
[u_buf]"+r"(u_buf), // %[u_buf]
|
||||
[v_buf]"+r"(v_buf), // %[v_buf]
|
||||
[dst_argb]"+r"(dst_argb), // %[dst_argb]
|
||||
[width]"+rm"(width) // %[width]
|
||||
: [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
|
||||
[quadsplitperm]"r"(kSplitQuadWords), // %[quadsplitperm]
|
||||
[dquadsplitperm]"r"(kSplitDoubleQuadWords), // %[dquadsplitperm]
|
||||
[unperm]"r"(kUnpermuteAVX512) // %[unperm]
|
||||
: "memory", "cc", YUVTORGB_REGS_AVX512BW
|
||||
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
|
||||
);
|
||||
}
|
||||
#endif // HAS_I422TOARGBROW_AVX512BW
|
||||
|
||||
#if defined(HAS_I422TOAR30ROW_AVX2)
|
||||
// 16 pixels
|
||||
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
|
||||
|
||||
@ -611,6 +611,14 @@ static void ScaleYUVToARGBBilinearUp(int src_width,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_I422TOARGBROW_AVX512BW)
|
||||
if (TestCpuFlag(kCpuHasAVX512BW | kCpuHasAVX512VL) == (kCpuHasAVX512BW | kCpuHasAVX512VL)) {
|
||||
I422ToARGBRow = I422ToARGBRow_Any_AVX512BW;
|
||||
if (IS_ALIGNED(src_width, 32)) {
|
||||
I422ToARGBRow = I422ToARGBRow_AVX512BW;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_I422TOARGBROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
I422ToARGBRow = I422ToARGBRow_Any_NEON;
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user