I420ToARGB for AVX512

On Skylake Xeon
AVX512  I420ToARGB_Opt (2050 ms)
AVX2    I420ToARGB_Opt (2533 ms)
SSSE3   I420ToARGB_Opt (3688 ms)

Bug: libyuv:911
Change-Id: I2214cc15dec24b06541895ca59d88990edbb2216
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/3382100
Reviewed-by: richard winterton <rrwinterton@gmail.com>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
Frank Barchard 2022-01-13 23:26:03 -08:00 committed by libyuv LUCI CQ
parent cdd62da670
commit 90ffd5cba9
6 changed files with 169 additions and 4 deletions

View File

@ -398,6 +398,13 @@ extern "C" {
#define HAS_ARGBTORGB24ROW_AVX512VBMI #define HAS_ARGBTORGB24ROW_AVX512VBMI
#endif #endif
// The following are available for AVX512 clang x64 platforms:
// TODO(fbarchard): Port to x86
// The gate below enables the AVX512BW I422ToARGB row function only when
// LIBYUV_DISABLE_X86 is not defined, the target defines __x86_64__, and
// CLANG_HAS_AVX512 is defined.
#if !defined(LIBYUV_DISABLE_X86) && \
defined(__x86_64__) && (defined(CLANG_HAS_AVX512))
#define HAS_I422TOARGBROW_AVX512BW
#endif
// The following are available on Neon platforms: // The following are available on Neon platforms:
#if !defined(LIBYUV_DISABLE_NEON) && \ #if !defined(LIBYUV_DISABLE_NEON) && \
(defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON)) (defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON))
@ -3027,6 +3034,12 @@ void I422ToARGBRow_AVX2(const uint8_t* y_buf,
uint8_t* dst_argb, uint8_t* dst_argb,
const struct YuvConstants* yuvconstants, const struct YuvConstants* yuvconstants,
int width); int width);
// AVX512BW row kernel: converts one row of planar Y, U and V samples to ARGB
// using the coefficients in yuvconstants.
// The implementation consumes 32 Y samples and 16 U/V samples per loop
// iteration, so dispatchers call it directly only when width is a multiple
// of 32 and otherwise use I422ToARGBRow_Any_AVX512BW.
void I422ToARGBRow_AVX512BW(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void I422ToRGBARow_AVX2(const uint8_t* y_buf, void I422ToRGBARow_AVX2(const uint8_t* y_buf,
const uint8_t* u_buf, const uint8_t* u_buf,
const uint8_t* v_buf, const uint8_t* v_buf,
@ -3368,6 +3381,12 @@ void I422ToARGBRow_Any_AVX2(const uint8_t* y_buf,
uint8_t* dst_ptr, uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants, const struct YuvConstants* yuvconstants,
int width); int width);
// Any-width counterpart of I422ToARGBRow_AVX512BW, generated with the ANY31C
// macro. Dispatchers select it by default when AVX512BW is usable and switch
// to the direct kernel only when width is a multiple of 32.
// NOTE(review): presumably ANY31C runs the kernel on the aligned part and
// handles the remaining pixels through a temporary buffer - confirm against
// the ANY31C definition.
void I422ToARGBRow_Any_AVX512BW(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
void I422ToRGBARow_Any_AVX2(const uint8_t* y_buf, void I422ToRGBARow_Any_AVX2(const uint8_t* y_buf,
const uint8_t* u_buf, const uint8_t* u_buf,
const uint8_t* v_buf, const uint8_t* v_buf,

View File

@ -7,7 +7,6 @@
* in the file PATENTS. All contributing project authors may * in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree. * be found in the AUTHORS file in the root of the source tree.
*/ */
#include "libyuv/convert_argb.h" #include "libyuv/convert_argb.h"
#include "libyuv/cpu_id.h" #include "libyuv/cpu_id.h"
@ -90,6 +89,14 @@ int I420ToARGBMatrix(const uint8_t* src_y,
} }
} }
#endif #endif
// Use the AVX512BW row function when the CPU reports both AVX512BW and
// AVX512VL: the result of TestCpuFlag is compared against the full mask, so
// having only one of the two flags is not enough. This replaces whatever
// I422ToARGBRow was set to earlier. The _Any_ wrapper is the default; the
// direct kernel is used only when width is a multiple of 32.
#if defined(HAS_I422TOARGBROW_AVX512BW)
if (TestCpuFlag(kCpuHasAVX512BW | kCpuHasAVX512VL) == (kCpuHasAVX512BW | kCpuHasAVX512VL)) {
I422ToARGBRow = I422ToARGBRow_Any_AVX512BW;
if (IS_ALIGNED(width, 32)) {
I422ToARGBRow = I422ToARGBRow_AVX512BW;
}
}
#endif
#if defined(HAS_I422TOARGBROW_NEON) #if defined(HAS_I422TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
I422ToARGBRow = I422ToARGBRow_Any_NEON; I422ToARGBRow = I422ToARGBRow_Any_NEON;
@ -321,6 +328,14 @@ int I422ToARGBMatrix(const uint8_t* src_y,
} }
} }
#endif #endif
// Use the AVX512BW row function when the CPU reports both AVX512BW and
// AVX512VL: the result of TestCpuFlag is compared against the full mask, so
// having only one of the two flags is not enough. This replaces whatever
// I422ToARGBRow was set to earlier. The _Any_ wrapper is the default; the
// direct kernel is used only when width is a multiple of 32.
#if defined(HAS_I422TOARGBROW_AVX512BW)
if (TestCpuFlag(kCpuHasAVX512BW | kCpuHasAVX512VL) == (kCpuHasAVX512BW | kCpuHasAVX512VL)) {
I422ToARGBRow = I422ToARGBRow_Any_AVX512BW;
if (IS_ALIGNED(width, 32)) {
I422ToARGBRow = I422ToARGBRow_AVX512BW;
}
}
#endif
#if defined(HAS_I422TOARGBROW_NEON) #if defined(HAS_I422TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
I422ToARGBRow = I422ToARGBRow_Any_NEON; I422ToARGBRow = I422ToARGBRow_Any_NEON;
@ -5142,6 +5157,14 @@ int I420ToRGB565Dither(const uint8_t* src_y,
} }
} }
#endif #endif
// Use the AVX512BW row function when the CPU reports both AVX512BW and
// AVX512VL: the result of TestCpuFlag is compared against the full mask, so
// having only one of the two flags is not enough. This replaces whatever
// I422ToARGBRow was set to earlier. The _Any_ wrapper is the default; the
// direct kernel is used only when width is a multiple of 32.
#if defined(HAS_I422TOARGBROW_AVX512BW)
if (TestCpuFlag(kCpuHasAVX512BW | kCpuHasAVX512VL) == (kCpuHasAVX512BW | kCpuHasAVX512VL)) {
I422ToARGBRow = I422ToARGBRow_Any_AVX512BW;
if (IS_ALIGNED(width, 32)) {
I422ToARGBRow = I422ToARGBRow_AVX512BW;
}
}
#endif
#if defined(HAS_I422TOARGBROW_NEON) #if defined(HAS_I422TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
I422ToARGBRow = I422ToARGBRow_Any_NEON; I422ToARGBRow = I422ToARGBRow_Any_NEON;

View File

@ -374,6 +374,9 @@ ANY31C(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 31)
#ifdef HAS_I422TOARGBROW_AVX2 #ifdef HAS_I422TOARGBROW_AVX2
ANY31C(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, 1, 0, 4, 15) ANY31C(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, 1, 0, 4, 15)
#endif #endif
// Instantiates I422ToARGBRow_Any_AVX512BW as the any-width wrapper around
// I422ToARGBRow_AVX512BW. The final argument (31) matches the kernel's
// 32-pixel step; the AVX2 instantiation above uses 15 for its 16-pixel step.
// NOTE(review): the remaining arguments (1, 0, 4) are presumably the UV
// subsampling shift, a second shift, and bytes per output pixel - confirm
// against the ANY31C macro definition.
#ifdef HAS_I422TOARGBROW_AVX512BW
ANY31C(I422ToARGBRow_Any_AVX512BW, I422ToARGBRow_AVX512BW, 1, 0, 4, 31)
#endif
#ifdef HAS_I422TORGBAROW_AVX2 #ifdef HAS_I422TORGBAROW_AVX2
ANY31C(I422ToRGBARow_Any_AVX2, I422ToRGBARow_AVX2, 1, 0, 4, 15) ANY31C(I422ToRGBARow_Any_AVX2, I422ToRGBARow_AVX2, 1, 0, 4, 15)
#endif #endif

View File

@ -3181,6 +3181,21 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
"vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
"lea 0x10(%[y_buf]),%[y_buf] \n" "lea 0x10(%[y_buf]),%[y_buf] \n"
// Read 16 U and 16 V bytes plus 32 Y bytes for one 32-pixel iteration.
// Inputs:  %[u_buf], %[y_buf]; %[v_buf] must already hold (v_buf - u_buf),
//          because V is addressed as u_buf + v_buf and only u_buf advances.
//          zmm16 / zmm17 must hold the qword permutation indices loaded by
//          YUVTORGB_SETUP_AVX512BW.
// Outputs: zmm3 = U and V bytes interleaved, then each UV pair duplicated
//          (vpunpcklwd of the register with itself) so there is one pair per
//          pixel; zmm4 = Y with every byte duplicated (vpunpcklbw of the
//          register with itself). zmm1 is used as scratch.
// Side effects: u_buf advances by 0x10 bytes, y_buf by 0x20 bytes.
#define READYUV422_AVX512BW \
"vmovdqu (%[u_buf]),%%xmm3 \n" \
"vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
"vpermq %%zmm3,%%zmm16,%%zmm3 \n" \
"vpermq %%zmm1,%%zmm16,%%zmm1 \n" \
"lea 0x10(%[u_buf]),%[u_buf] \n" \
"vpunpcklbw %%zmm1,%%zmm3,%%zmm3 \n" \
"vpermq $0xd8,%%zmm3,%%zmm3 \n" \
"vpunpcklwd %%zmm3,%%zmm3,%%zmm3 \n" \
"vmovdqu8 (%[y_buf]),%%ymm4 \n" \
"vpermq %%zmm4,%%zmm17,%%zmm4 \n" \
"vpermq $0xd8,%%zmm4,%%zmm4 \n" \
"vpunpcklbw %%zmm4,%%zmm4,%%zmm4 \n" \
"lea 0x20(%[y_buf]),%[y_buf] \n"
// Read 8 UV from 210, upsample to 16 UV // Read 8 UV from 210, upsample to 16 UV
// TODO(fbarchard): Consider vshufb to replace pack/unpack // TODO(fbarchard): Consider vshufb to replace pack/unpack
// TODO(fbarchard): Consider vunpcklpd to combine the 2 registers into 1. // TODO(fbarchard): Consider vunpcklpd to combine the 2 registers into 1.
@ -3356,6 +3371,7 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
"vpshufb %[kShuffleUYVYUV], %%ymm3, %%ymm3 \n" \ "vpshufb %[kShuffleUYVYUV], %%ymm3, %%ymm3 \n" \
"lea 0x20(%[uyvy_buf]),%[uyvy_buf] \n" "lea 0x20(%[uyvy_buf]),%[uyvy_buf] \n"
// TODO(fbarchard): Remove broadcastb
#if defined(__x86_64__) #if defined(__x86_64__)
#define YUVTORGB_SETUP_AVX2(yuvconstants) \ #define YUVTORGB_SETUP_AVX2(yuvconstants) \
"vpcmpeqb %%xmm13,%%xmm13,%%xmm13 \n" \ "vpcmpeqb %%xmm13,%%xmm13,%%xmm13 \n" \
@ -3367,6 +3383,24 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
"vmovdqa 96(%[yuvconstants]),%%ymm11 \n" \ "vmovdqa 96(%[yuvconstants]),%%ymm11 \n" \
"vmovdqa 128(%[yuvconstants]),%%ymm12 \n" "vmovdqa 128(%[yuvconstants]),%%ymm12 \n"
// One-time register setup for the AVX512BW YUV-to-RGB macros.
//   zmm8..zmm12 : the 64-bit value at byte offsets 0, 32, 64, 96 and 128 of
//                 %[yuvconstants], each broadcast to all eight qwords.
//   zmm13       : 0x80 in every byte (all-ones words shifted left by 7, then
//                 the low byte broadcast); YUVTORGB16_AVX512BW subtracts it
//                 from the UV bytes.
//   zmm16..18   : 64-byte permutation index tables read from
//                 %[quadsplitperm], %[dquadsplitperm] and %[unperm].
// The macro parameter is not used in the expansion; the asm refers to the
// named operand %[yuvconstants] directly.
// NOTE(review): movdqa and movq are legacy-SSE encodings mixed into VEX/EVEX
// code, and movdqa needs %[yuvconstants] to be 16-byte aligned while only the
// low 64 bits it loads are broadcast. Consider VEX forms or a broadcast
// straight from memory - confirm alignment guarantees and measure before
// changing.
#define YUVTORGB_SETUP_AVX512BW(yuvconstants) \
"vpcmpeqb %%xmm13,%%xmm13,%%xmm13 \n" \
"movdqa (%[yuvconstants]),%%xmm8 \n" \
"vpbroadcastq %%xmm8, %%zmm8 \n" \
"vpsllw $7,%%xmm13,%%xmm13 \n" \
"vpbroadcastb %%xmm13,%%zmm13 \n" \
"movq 32(%[yuvconstants]),%%xmm9 \n" \
"vpbroadcastq %%xmm9,%%zmm9 \n" \
"movq 64(%[yuvconstants]),%%xmm10 \n" \
"vpbroadcastq %%xmm10,%%zmm10 \n" \
"movq 96(%[yuvconstants]),%%xmm11 \n" \
"vpbroadcastq %%xmm11,%%zmm11 \n" \
"movq 128(%[yuvconstants]),%%xmm12 \n" \
"vpbroadcastq %%xmm12,%%zmm12 \n" \
"vmovdqu8 (%[quadsplitperm]),%%zmm16 \n" \
"vmovdqu8 (%[dquadsplitperm]),%%zmm17 \n" \
"vmovdqu8 (%[unperm]),%%zmm18 \n"
#define YUVTORGB16_AVX2(yuvconstants) \ #define YUVTORGB16_AVX2(yuvconstants) \
"vpsubb %%ymm13,%%ymm3,%%ymm3 \n" \ "vpsubb %%ymm13,%%ymm3,%%ymm3 \n" \
"vpmulhuw %%ymm11,%%ymm4,%%ymm4 \n" \ "vpmulhuw %%ymm11,%%ymm4,%%ymm4 \n" \
@ -3378,7 +3412,20 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
"vpsubsw %%ymm1,%%ymm4,%%ymm1 \n" \ "vpsubsw %%ymm1,%%ymm4,%%ymm1 \n" \
"vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n"
// Core 16-bit YUV-to-RGB arithmetic on 512-bit registers.
// Inputs:  zmm3 = UV bytes, zmm4 = Y words, constants in zmm8..zmm13 as set up
//          by YUVTORGB_SETUP_AVX512BW.
// Steps:   zmm3 -= zmm13 (bytewise); zmm4 = high 16 bits of the unsigned
//          product zmm4 * zmm11, then zmm4 += zmm12; zmm0/zmm1/zmm2 =
//          vpmaddubsw of the UV bytes with zmm8/zmm9/zmm10.
// Outputs: zmm0 = zmm0 + zmm4, zmm1 = zmm4 - zmm1, zmm2 = zmm2 + zmm4, all
//          with signed 16-bit saturation.
// NOTE(review): given the store order in STOREARGB_AVX512BW these are
// presumably the B, G and R channels respectively - confirm.
// The macro parameter is not used in the expansion.
#define YUVTORGB16_AVX512BW(yuvconstants) \
"vpsubb %%zmm13,%%zmm3,%%zmm3 \n" \
"vpmulhuw %%zmm11,%%zmm4,%%zmm4 \n" \
"vpmaddubsw %%zmm3,%%zmm8,%%zmm0 \n" \
"vpmaddubsw %%zmm3,%%zmm9,%%zmm1 \n" \
"vpmaddubsw %%zmm3,%%zmm10,%%zmm2 \n" \
"vpaddw %%zmm4,%%zmm12,%%zmm4 \n" \
"vpaddsw %%zmm4,%%zmm0,%%zmm0 \n" \
"vpsubsw %%zmm1,%%zmm4,%%zmm1 \n" \
"vpaddsw %%zmm4,%%zmm2,%%zmm2 \n"
#define YUVTORGB_REGS_AVX2 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", #define YUVTORGB_REGS_AVX2 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13",
// Clobber-list fragment for asm statements that use the AVX512BW macros:
// registers 8-13 hold the conversion constants and 16-18 the permutation
// tables. It ends with a comma so more clobbers can follow it directly.
#define YUVTORGB_REGS_AVX512BW \
"xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm16", "xmm17", "xmm18",
#else // Convert 16 pixels: 16 UV and 16 Y. #else // Convert 16 pixels: 16 UV and 16 Y.
@ -3413,6 +3460,15 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
"vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \
"vpackuswb %%ymm2,%%ymm2,%%ymm2 \n" "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n"
// Convert 32 pixels to 8-bit channels: run YUVTORGB16_AVX512BW, shift each
// 16-bit result right arithmetically by 6, then pack every register with
// itself into unsigned-saturated bytes. Results stay in zmm0, zmm1 and zmm2.
#define YUVTORGB_AVX512BW(yuvconstants) \
YUVTORGB16_AVX512BW(yuvconstants) \
"vpsraw $0x6,%%zmm0,%%zmm0 \n" \
"vpsraw $0x6,%%zmm1,%%zmm1 \n" \
"vpsraw $0x6,%%zmm2,%%zmm2 \n" \
"vpackuswb %%zmm0,%%zmm0,%%zmm0 \n" \
"vpackuswb %%zmm1,%%zmm1,%%zmm1 \n" \
"vpackuswb %%zmm2,%%zmm2,%%zmm2 \n"
// Store 16 ARGB values. // Store 16 ARGB values.
#define STOREARGB_AVX2 \ #define STOREARGB_AVX2 \
"vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
@ -3425,6 +3481,18 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
"vmovdqu %%ymm0,0x20(%[dst_argb]) \n" \ "vmovdqu %%ymm0,0x20(%[dst_argb]) \n" \
"lea 0x40(%[dst_argb]), %[dst_argb] \n" "lea 0x40(%[dst_argb]), %[dst_argb] \n"
// Store 32 ARGB values.
// Inputs:  zmm0, zmm1, zmm2 = packed 8-bit channels from YUVTORGB_AVX512BW,
//          zmm5 = bytes for the fourth channel (the caller fills it with all
//          ones), zmm18 = qword permutation indices from the setup macro.
// Interleaves zmm0 with zmm1 and zmm2 with zmm5 bytewise, reorders qwords via
// zmm18, interleaves the two results wordwise, then writes two 64-byte
// vectors (128 bytes) and advances dst_argb by 0x80.
#define STOREARGB_AVX512BW \
"vpunpcklbw %%zmm1,%%zmm0,%%zmm0 \n" \
"vpermq %%zmm0,%%zmm18,%%zmm0 \n" \
"vpunpcklbw %%zmm5,%%zmm2,%%zmm2 \n" \
"vpermq %%zmm2,%%zmm18,%%zmm2 \n" \
"vpunpcklwd %%zmm2,%%zmm0,%%zmm1 \n" \
"vpunpckhwd %%zmm2,%%zmm0,%%zmm0 \n" \
"vmovdqu8 %%zmm1,(%[dst_argb]) \n" \
"vmovdqu8 %%zmm0,0x40(%[dst_argb]) \n" \
"lea 0x80(%[dst_argb]), %[dst_argb] \n"
// Store 16 AR30 values. // Store 16 AR30 values.
#define STOREAR30_AVX2 \ #define STOREAR30_AVX2 \
"vpsraw $0x4,%%ymm0,%%ymm0 \n" \ "vpsraw $0x4,%%ymm0,%%ymm0 \n" \
@ -3521,6 +3589,50 @@ void OMITFP I422ToARGBRow_AVX2(const uint8_t* y_buf,
} }
#endif // HAS_I422TOARGBROW_AVX2 #endif // HAS_I422TOARGBROW_AVX2
#if defined(HAS_I422TOARGBROW_AVX512BW)
// Qword index tables for the register-indexed vpermq instructions. The setup
// macro loads them into zmm16, zmm17 and zmm18 respectively.
// kSplitQuadWords is applied to the 16-byte U and V loads.
static const uint64_t kSplitQuadWords[8] = {0, 2, 2, 2, 1, 2, 2, 2};
// kSplitDoubleQuadWords is applied to the 32-byte Y load.
static const uint64_t kSplitDoubleQuadWords[8] = {0, 1, 4, 4, 2, 3, 4, 4};
// kUnpermuteAVX512 is applied to the interleaved channel bytes before the
// store.
static const uint64_t kUnpermuteAVX512[8] = {0, 4, 1, 5, 2, 6, 3, 7};
// 32 pixels
// 16 UV values upsampled to 32 UV, mixed with 32 Y producing 32 ARGB (128
// bytes).
// Each loop iteration reads 32 Y bytes and 16 U/V bytes and writes 128 output
// bytes, then subtracts 32 from width and repeats while the result is > 0.
// The loop body always runs at least once and always handles a full 32
// pixels, so width is expected to be a positive multiple of 32; dispatchers
// use I422ToARGBRow_Any_AVX512BW for other widths.
void OMITFP I422ToARGBRow_AVX512BW(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
YUVTORGB_SETUP_AVX512BW(yuvconstants)
// Turn v_buf into an offset from u_buf; READYUV422_AVX512BW addresses V as
// u_buf + v_buf and advances only u_buf.
"sub %[u_buf],%[v_buf] \n"
// zmm5 = all ones: the 0xff bytes interleaved as the fourth channel by
// STOREARGB_AVX512BW.
"vpcmpeqb %%xmm5,%%xmm5,%%xmm5 \n"
"vpbroadcastq %%xmm5,%%zmm5 \n"
LABELALIGN
"1: \n"
READYUV422_AVX512BW
YUVTORGB_AVX512BW(yuvconstants)
STOREARGB_AVX512BW
"sub $0x20,%[width] \n"
"jg 1b \n"
"vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width]
: [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
[quadsplitperm]"r"(kSplitQuadWords), // %[quadsplitperm]
[dquadsplitperm]"r"(kSplitDoubleQuadWords), // %[dquadsplitperm]
[unperm]"r"(kUnpermuteAVX512) // %[unperm]
: "memory", "cc", YUVTORGB_REGS_AVX512BW
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
}
#endif // HAS_I422TOARGBROW_AVX512BW
#if defined(HAS_I422TOAR30ROW_AVX2) #if defined(HAS_I422TOAR30ROW_AVX2)
// 16 pixels // 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes). // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).

View File

@ -611,6 +611,14 @@ static void ScaleYUVToARGBBilinearUp(int src_width,
} }
} }
#endif #endif
// Use the AVX512BW row function when the CPU reports both AVX512BW and
// AVX512VL: the result of TestCpuFlag is compared against the full mask, so
// having only one of the two flags is not enough. This replaces whatever
// I422ToARGBRow was set to earlier. The _Any_ wrapper is the default; the
// direct kernel is used only when src_width is a multiple of 32.
#if defined(HAS_I422TOARGBROW_AVX512BW)
if (TestCpuFlag(kCpuHasAVX512BW | kCpuHasAVX512VL) == (kCpuHasAVX512BW | kCpuHasAVX512VL)) {
I422ToARGBRow = I422ToARGBRow_Any_AVX512BW;
if (IS_ALIGNED(src_width, 32)) {
I422ToARGBRow = I422ToARGBRow_AVX512BW;
}
}
#endif
#if defined(HAS_I422TOARGBROW_NEON) #if defined(HAS_I422TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
I422ToARGBRow = I422ToARGBRow_Any_NEON; I422ToARGBRow = I422ToARGBRow_Any_NEON;