mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2026-04-30 19:09:18 +08:00
ARGBToUV444 for AVX512
1.27x faster on AMD Zen5 (Turin).

Now (AVX512):
perf record ./libyuv_test '--gunit_filter=*ARGBToI444_Opt' --libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=10000 --libyuv_flags=-1 --libyuv_cpu_info=-1
[ OK ] LibYUVConvertTest.ARGBToI444_Opt (1071 ms)
Overhead  Symbol
53.49%  ARGBToYRow_AVX2
44.70%  ARGBToUV444Row_AVX512BW

Was (AVX2):
[ OK ] LibYUVConvertTest.ARGBToI444_Opt (1369 ms)
61.06%  ARGBToUV444Row_AVX2
37.67%  ARGBToYRow_AVX2

Bug: libyuv:42280902
Change-Id: I306fbac656d6f7834ce1559e86d01eb34931ec3c
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/7738362
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Dale Curtis <dalecurtis@chromium.org>
This commit is contained in:
parent
7903a6c632
commit
4c3d7d517a
@ -1,6 +1,6 @@
|
||||
Name: libyuv
|
||||
URL: https://chromium.googlesource.com/libyuv/libyuv/
|
||||
Version: 1922
|
||||
Version: 1923
|
||||
Revision: DEPS
|
||||
License: BSD-3-Clause
|
||||
License File: LICENSE
|
||||
|
||||
@ -379,6 +379,9 @@ extern "C" {
|
||||
// Feature gate for the AVX-512 row functions. The HAS_*_AVX512BW macros below
// are defined only when x86 code is not disabled (LIBYUV_DISABLE_X86 unset),
// the target is x86-64 (__x86_64__), and CLANG_HAS_AVX512 is defined.
#if !defined(LIBYUV_DISABLE_X86) && defined(__x86_64__) && \
|
||||
(defined(CLANG_HAS_AVX512))
|
||||
#define HAS_I422TOARGBROW_AVX512BW
|
||||
// Enables ARGBToUV444Row_AVX512BW and its _Any_ variant.
#define HAS_ARGBTOUV444ROW_AVX512BW
|
||||
// Enables the ARGBToUV444MatrixRow_AVX512BW dispatch and _Any_ variant.
#define HAS_ARGBTOUV444MATRIXROW_AVX512BW
|
||||
// Enables ARGBToUVJ444Row_AVX512BW and its _Any_ variant.
#define HAS_ARGBTOUVJ444ROW_AVX512BW
|
||||
#endif
|
||||
|
||||
// The following are available on Neon platforms:
|
||||
@ -2156,6 +2159,11 @@ void ARGBToUV444MatrixRow_AVX2(const uint8_t* src_argb,
|
||||
uint8_t* dst_v,
|
||||
int width,
|
||||
const struct ArgbConstants* c);
|
||||
// AVX-512 ARGB -> U/V (4:4:4) row kernel taking an explicit coefficient block.
// Reads ARGB pixels from src_argb and writes one U byte per pixel to dst_u and
// one V byte per pixel to dst_v. The definition in this change consumes width
// in steps of 64 pixels.
void ARGBToUV444MatrixRow_AVX512BW(const uint8_t* src_argb,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width,
|
||||
const struct ArgbConstants* c);
|
||||
void ARGBToUVMatrixRow_Any_SSSE3(const uint8_t* src_argb,
|
||||
int src_stride_argb,
|
||||
uint8_t* dst_u,
|
||||
@ -2178,11 +2186,11 @@ void ARGBToUV444MatrixRow_Any_AVX2(const uint8_t* src_argb,
|
||||
uint8_t* dst_v,
|
||||
int width,
|
||||
const struct ArgbConstants* c);
|
||||
// NOTE(review): a prototype with this same name and signature already begins
// in the hunk header directly above; in this diff rendering this copy is
// probably the removed (old) side - confirm against the real header.
void ARGBToUV444MatrixRow_Any_AVX2(const uint8_t* src_argb,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width,
|
||||
const struct ArgbConstants* c);
|
||||
// "Any" variant of ARGBToUV444MatrixRow_AVX512BW, generated by the ANY12M
// macro. The dispatch code in this change selects it by default and switches
// to the direct kernel only when width is a multiple of 64.
void ARGBToUV444MatrixRow_Any_AVX512BW(const uint8_t* src_argb,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width,
|
||||
const struct ArgbConstants* c);
|
||||
|
||||
void ABGRToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width);
|
||||
void RGBAToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width);
|
||||
@ -2735,19 +2743,35 @@ void ARGBToUV444Row_AVX2(const uint8_t* src_argb,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width);
|
||||
// ARGB -> U/V (4:4:4) row conversion, AVX-512 version. Its definition in this
// change forwards to ARGBToUV444MatrixRow_AVX512BW with kArgbI601Constants.
void ARGBToUV444Row_AVX512BW(const uint8_t* src_argb,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width);
|
||||
// "Any" variant of the AVX2 kernel, generated by the ANY12 macro.
void ARGBToUV444Row_Any_AVX2(const uint8_t* src_ptr,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width);
|
||||
// "Any" variant of the AVX-512 kernel, generated by the ANY12 macro. The
// dispatch code in this change uses it unless width is a multiple of 64.
void ARGBToUV444Row_Any_AVX512BW(const uint8_t* src_ptr,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width);
|
||||
|
||||
// ARGB -> U/V (4:4:4) row conversion for the "J" variant, AVX2 version.
void ARGBToUVJ444Row_AVX2(const uint8_t* src_argb,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width);
|
||||
// AVX-512 version of the "J" variant. Its definition in this change forwards
// to ARGBToUV444MatrixRow_AVX512BW with kArgbJPEGConstants.
void ARGBToUVJ444Row_AVX512BW(const uint8_t* src_argb,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width);
|
||||
// "Any" variant of the AVX2 "J" kernel, generated by the ANY12 macro.
void ARGBToUVJ444Row_Any_AVX2(const uint8_t* src_ptr,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width);
|
||||
// "Any" variant of the AVX-512 "J" kernel, generated by the ANY12 macro. The
// dispatch code in this change uses it unless width is a multiple of 64.
void ARGBToUVJ444Row_Any_AVX512BW(const uint8_t* src_ptr,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width);
|
||||
|
||||
void ARGBToUV444Row_C(const uint8_t* src_argb,
|
||||
uint8_t* dst_u,
|
||||
|
||||
@ -11,6 +11,6 @@
|
||||
#ifndef INCLUDE_LIBYUV_VERSION_H_
|
||||
#define INCLUDE_LIBYUV_VERSION_H_
|
||||
|
||||
#define LIBYUV_VERSION 1922
|
||||
#define LIBYUV_VERSION 1923
|
||||
|
||||
#endif // INCLUDE_LIBYUV_VERSION_H_
|
||||
|
||||
@ -3591,6 +3591,14 @@ int RAWToI444(const uint8_t* src_raw,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOUV444ROW_AVX512BW)
|
||||
if (TestCpuFlag(kCpuHasAVX512BW)) {
|
||||
ARGBToUV444Row = ARGBToUV444Row_Any_AVX512BW;
|
||||
if (IS_ALIGNED(width, 64)) {
|
||||
ARGBToUV444Row = ARGBToUV444Row_AVX512BW;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOUV444ROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
ARGBToUV444Row = ARGBToUV444Row_Any_NEON;
|
||||
@ -3794,6 +3802,14 @@ int RAWToJ444(const uint8_t* src_raw,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOUVJ444ROW_AVX512BW)
|
||||
if (TestCpuFlag(kCpuHasAVX512BW)) {
|
||||
ARGBToUVJ444Row = ARGBToUVJ444Row_Any_AVX512BW;
|
||||
if (IS_ALIGNED(width, 64)) {
|
||||
ARGBToUVJ444Row = ARGBToUVJ444Row_AVX512BW;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOUVJ444ROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
ARGBToUVJ444Row = ARGBToUVJ444Row_Any_NEON;
|
||||
|
||||
@ -68,6 +68,14 @@ int ARGBToI444(const uint8_t* src_argb,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOUV444ROW_AVX512BW)
|
||||
if (TestCpuFlag(kCpuHasAVX512BW)) {
|
||||
ARGBToUV444Row = ARGBToUV444Row_Any_AVX512BW;
|
||||
if (IS_ALIGNED(width, 64)) {
|
||||
ARGBToUV444Row = ARGBToUV444Row_AVX512BW;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOUV444ROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
ARGBToUV444Row = ARGBToUV444Row_Any_NEON;
|
||||
@ -200,6 +208,14 @@ int ARGBToI444Matrix(const uint8_t* src_argb,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOUV444MATRIXROW_AVX512BW)
|
||||
if (TestCpuFlag(kCpuHasAVX512BW)) {
|
||||
ARGBToUV444MatrixRow = ARGBToUV444MatrixRow_Any_AVX512BW;
|
||||
if (IS_ALIGNED(width, 64)) {
|
||||
ARGBToUV444MatrixRow = ARGBToUV444MatrixRow_AVX512BW;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOYMATRIXROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON;
|
||||
@ -2638,6 +2654,14 @@ int ARGBToJ444(const uint8_t* src_argb,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOUVJ444ROW_AVX512BW)
|
||||
if (TestCpuFlag(kCpuHasAVX512BW)) {
|
||||
ARGBToUVJ444Row = ARGBToUVJ444Row_Any_AVX512BW;
|
||||
if (IS_ALIGNED(width, 64)) {
|
||||
ARGBToUVJ444Row = ARGBToUVJ444Row_AVX512BW;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOUVJ444ROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
ARGBToUVJ444Row = ARGBToUVJ444Row_Any_NEON;
|
||||
|
||||
@ -2030,9 +2030,15 @@ ANY12(ARGBToUVJ444Row_Any_SSSE3, ARGBToUVJ444Row_SSSE3, 0, 4, 0, 15)
|
||||
// Instantiations of the ANY12 macro: each line generates the function named
// by the first argument on top of the SIMD kernel named by the second.
// NOTE(review): ANY12 is defined outside this view, so the meaning of the
// numeric arguments is not confirmed here. For the AVX512BW entries the last
// argument (63) is one less than the kernel's 64-pixel step - presumably a
// width mask; verify against the macro definition.
#ifdef HAS_ARGBTOUV444ROW_AVX2
|
||||
ANY12(ARGBToUV444Row_Any_AVX2, ARGBToUV444Row_AVX2, 0, 4, 0, 31)
|
||||
#endif
|
||||
#ifdef HAS_ARGBTOUV444ROW_AVX512BW
|
||||
ANY12(ARGBToUV444Row_Any_AVX512BW, ARGBToUV444Row_AVX512BW, 0, 4, 0, 63)
|
||||
#endif
|
||||
#ifdef HAS_ARGBTOUVJ444ROW_AVX2
|
||||
ANY12(ARGBToUVJ444Row_Any_AVX2, ARGBToUVJ444Row_AVX2, 0, 4, 0, 31)
|
||||
#endif
|
||||
#ifdef HAS_ARGBTOUVJ444ROW_AVX512BW
|
||||
ANY12(ARGBToUVJ444Row_Any_AVX512BW, ARGBToUVJ444Row_AVX512BW, 0, 4, 0, 63)
|
||||
#endif
|
||||
#ifdef HAS_YUY2TOUV422ROW_AVX2
|
||||
ANY12(YUY2ToUV422Row_Any_AVX2, YUY2ToUV422Row_AVX2, 1, 4, 1, 31)
|
||||
ANY12(UYVYToUV422Row_Any_AVX2, UYVYToUV422Row_AVX2, 1, 4, 1, 31)
|
||||
@ -2250,6 +2256,9 @@ ANY12MS(ARGBToUVMatrixRow_Any_SSSE3, ARGBToUVMatrixRow_SSSE3, 0, 4, 7)
|
||||
// Instantiations of the ANY12M macro for the matrix (explicit-coefficient)
// UV444 kernels: each line generates the function named by the first argument
// on top of the SIMD kernel named by the second.
// NOTE(review): ANY12M is defined outside this view. For the AVX512BW entry
// the last argument (63) is one less than the kernel's 64-pixel step -
// presumably a width mask; verify against the macro definition.
#ifdef HAS_ARGBTOUV444MATRIXROW_AVX2
|
||||
ANY12M(ARGBToUV444MatrixRow_Any_AVX2, ARGBToUV444MatrixRow_AVX2, 4, 31)
|
||||
#endif
|
||||
#ifdef HAS_ARGBTOUV444MATRIXROW_AVX512BW
|
||||
ANY12M(ARGBToUV444MatrixRow_Any_AVX512BW, ARGBToUV444MatrixRow_AVX512BW, 4, 63)
|
||||
#endif
|
||||
#ifdef HAS_ARGBTOUV444MATRIXROW_SSSE3
|
||||
ANY12M(ARGBToUV444MatrixRow_Any_SSSE3, ARGBToUV444MatrixRow_SSSE3, 4, 15)
|
||||
#endif
|
||||
|
||||
@ -1723,6 +1723,88 @@ void ARGBToUV444MatrixRow_AVX2(const uint8_t* src_argb,
|
||||
}
|
||||
#endif // HAS_ARGBTOUV444ROW_AVX2
|
||||
|
||||
#ifdef HAS_ARGBTOUV444ROW_AVX512BW
|
||||
// Dword indices for vpermd, loaded into zmm7 by ARGBToUV444MatrixRow_AVX512BW.
// Output dword j is taken from input dword table[j]. This gathers dword 0 of
// each of the four 128-bit lanes first, then dword 1 of each lane, and so on,
// which puts the results of the kernel's two lane-wise packs back into pixel
// order.
// NOTE(review): the name says "ARGBToY" but within this view the table is used
// only by the UV444 kernel.
static const uint32_t kPermdARGBToY_AVX512BW[16] = {0, 4, 8, 12, 1, 5, 9, 13,
|
||||
2, 6, 10, 14, 3, 7, 11, 15};
|
||||
|
||||
// Converts a row of ARGB pixels to full-resolution (4:4:4) U and V bytes using
// AVX-512 inline assembly.
//
// src_argb: source row; 256 bytes (64 pixels x 4 bytes) are read per iteration.
// dst_u, dst_v: destination rows; 64 bytes are written to each per iteration.
// width: pixel count. It is decremented by 64 per iteration and the loop
//        repeats while the remainder is > 0, so the body runs at least once
//        and always processes whole 64-pixel groups.
// c: coefficient block. The 32 bytes at offset 0x20 are used as the U
//    coefficients and the 32 bytes at offset 0x40 as the V coefficients
//    (struct layout is not visible here - TODO confirm against ArgbConstants).
//
// All loads and stores use vmovups, which has no alignment requirement.
void ARGBToUV444MatrixRow_AVX512BW(const uint8_t* src_argb,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width,
|
||||
const struct ArgbConstants* c) {
|
||||
asm volatile(
|
||||
"vbroadcasti64x4 0x20(%4),%%zmm3 \n" // zmm3 = U coefficients (kRGBToU), 32 bytes in both halves
|
||||
"vbroadcasti64x4 0x40(%4),%%zmm4 \n" // zmm4 = V coefficients (kRGBToV), 32 bytes in both halves
|
||||
"vpternlogd $0xff,%%zmm16,%%zmm16,%%zmm16 \n" // zmm16 = all ones (-1 in every word)
|
||||
"vpsllw $15,%%zmm16,%%zmm5 \n" // zmm5 = 0x8000 in every word
|
||||
"vmovups %5,%%zmm7 \n" // zmm7 = dword permute indices
|
||||
"sub %1,%2 \n" // %2 = dst_v - dst_u, so (%1,%2,1) addresses dst_v
|
||||
|
||||
LABELALIGN
|
||||
"1: \n"
|
||||
"vmovups (%0),%%zmm0 \n" // load 64 ARGB pixels (4 x 64 bytes)
|
||||
"vmovups 0x40(%0),%%zmm1 \n"
|
||||
"vmovups 0x80(%0),%%zmm2 \n"
|
||||
"vmovups 0xc0(%0),%%zmm6 \n"
|
||||
"vpmaddubsw %%zmm3,%%zmm0,%%zmm0 \n" // unsigned pixel bytes x signed U coefficients, byte pairs summed to words
|
||||
"vpmaddubsw %%zmm3,%%zmm1,%%zmm1 \n"
|
||||
"vpmaddubsw %%zmm3,%%zmm2,%%zmm2 \n"
|
||||
"vpmaddubsw %%zmm3,%%zmm6,%%zmm6 \n"
|
||||
"vpmaddwd %%zmm16,%%zmm0,%%zmm0 \n" // multiply by -1 and add word pairs: one negated sum (dword) per pixel
|
||||
"vpmaddwd %%zmm16,%%zmm1,%%zmm1 \n"
|
||||
"vpmaddwd %%zmm16,%%zmm2,%%zmm2 \n"
|
||||
"vpmaddwd %%zmm16,%%zmm6,%%zmm6 \n"
|
||||
"vpackssdw %%zmm1,%%zmm0,%%zmm0 \n" // dwords -> words; packs per 128-bit lane, so pixel order is interleaved
|
||||
"vpackssdw %%zmm6,%%zmm2,%%zmm2 \n"
|
||||
"vpsubw %%zmm5,%%zmm0,%%zmm0 \n" // subtract 0x8000 (same as adding it modulo 2^16)
|
||||
"vpsubw %%zmm5,%%zmm2,%%zmm2 \n"
|
||||
"vpsrlw $0x8,%%zmm0,%%zmm0 \n" // keep the high byte of each word
|
||||
"vpsrlw $0x8,%%zmm2,%%zmm2 \n"
|
||||
"vpackuswb %%zmm2,%%zmm0,%%zmm0 \n" // words -> bytes, again per 128-bit lane
|
||||
"vpermd %%zmm0,%%zmm7,%%zmm0 \n" // restore pixel order using the zmm7 indices
|
||||
"vmovups %%zmm0,(%1) \n" // store 64 U bytes
|
||||
|
||||
|
||||
"vmovups (%0),%%zmm0 \n" // reload the same 64 pixels for V
|
||||
"vmovups 0x40(%0),%%zmm1 \n"
|
||||
"vmovups 0x80(%0),%%zmm2 \n"
|
||||
"vmovups 0xc0(%0),%%zmm6 \n"
|
||||
"vpmaddubsw %%zmm4,%%zmm0,%%zmm0 \n" // same sequence as above with the V coefficients
|
||||
"vpmaddubsw %%zmm4,%%zmm1,%%zmm1 \n"
|
||||
"vpmaddubsw %%zmm4,%%zmm2,%%zmm2 \n"
|
||||
"vpmaddubsw %%zmm4,%%zmm6,%%zmm6 \n"
|
||||
"vpmaddwd %%zmm16,%%zmm0,%%zmm0 \n"
|
||||
"vpmaddwd %%zmm16,%%zmm1,%%zmm1 \n"
|
||||
"vpmaddwd %%zmm16,%%zmm2,%%zmm2 \n"
|
||||
"vpmaddwd %%zmm16,%%zmm6,%%zmm6 \n"
|
||||
"vpackssdw %%zmm1,%%zmm0,%%zmm0 \n" // lane-wise pack, pixel order interleaved
|
||||
"vpackssdw %%zmm6,%%zmm2,%%zmm2 \n"
|
||||
"vpsubw %%zmm5,%%zmm0,%%zmm0 \n"
|
||||
"vpsubw %%zmm5,%%zmm2,%%zmm2 \n"
|
||||
"vpsrlw $0x8,%%zmm0,%%zmm0 \n"
|
||||
"vpsrlw $0x8,%%zmm2,%%zmm2 \n"
|
||||
"vpackuswb %%zmm2,%%zmm0,%%zmm0 \n" // lane-wise pack, pixel order interleaved
|
||||
"vpermd %%zmm0,%%zmm7,%%zmm0 \n" // restore pixel order
|
||||
"vmovups %%zmm0,(%1,%2,1) \n" // store 64 V bytes at dst_v
|
||||
"lea 0x100(%0),%0 \n" // src_argb += 256 bytes
|
||||
"lea 0x40(%1),%1 \n" // dst_u += 64 (dst_v follows through %2)
|
||||
"subl $0x40,%3 \n" // width -= 64
|
||||
"jg 1b \n" // loop while width > 0
|
||||
"vzeroupper \n"
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_u), // %1
|
||||
"+r"(dst_v), // %2
|
||||
// NOTE(review): the HAS_*_AVX512BW gate in this change requires __x86_64__,
// so the __i386__ alternative below looks unreachable - confirm before
// relying on or removing it.
#if defined(__i386__)
|
||||
"+m"(width) // %3
|
||||
#else
|
||||
"+rm"(width) // %3
|
||||
#endif
|
||||
: "r"(c), // %4
|
||||
"m"(kPermdARGBToY_AVX512BW) // %5
|
||||
: "memory", "cc", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6",
|
||||
"zmm7", "zmm16");
|
||||
}
|
||||
#endif // HAS_ARGBTOUV444ROW_AVX512BW
|
||||
|
||||
#ifdef HAS_ARGBTOUVROW_SSSE3
|
||||
|
||||
// ARGBARGB to AARRGGBB shuffle
|
||||
@ -1904,6 +1986,16 @@ void ARGBToUV444Row_AVX2(const uint8_t* src_argb,
|
||||
}
|
||||
#endif // HAS_ARGBTOUV444ROW_AVX2
|
||||
|
||||
#ifdef HAS_ARGBTOUV444ROW_AVX512BW
|
||||
// ARGB -> U/V (4:4:4) row conversion, AVX-512 version. Thin wrapper: forwards
// all arguments unchanged to ARGBToUV444MatrixRow_AVX512BW and supplies
// kArgbI601Constants as the coefficient block (presumably the BT.601
// coefficients - confirm against the constant's definition).
void ARGBToUV444Row_AVX512BW(const uint8_t* src_argb,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width) {
|
||||
ARGBToUV444MatrixRow_AVX512BW(src_argb, dst_u, dst_v, width,
|
||||
&kArgbI601Constants);
|
||||
}
|
||||
#endif // HAS_ARGBTOUV444ROW_AVX512BW
|
||||
|
||||
#ifdef HAS_ARGBTOUVROW_SSSE3
|
||||
void ARGBToUVRow_SSSE3(const uint8_t* src_argb,
|
||||
int src_stride_argb,
|
||||
@ -1981,6 +2073,16 @@ void ARGBToUVJ444Row_AVX2(const uint8_t* src_argb,
|
||||
}
|
||||
#endif // HAS_ARGBTOUVJ444ROW_AVX2
|
||||
|
||||
#ifdef HAS_ARGBTOUVJ444ROW_AVX512BW
|
||||
// ARGB -> U/V (4:4:4) row conversion for the "J" variant, AVX-512 version.
// Thin wrapper: forwards all arguments unchanged to
// ARGBToUV444MatrixRow_AVX512BW and supplies kArgbJPEGConstants as the
// coefficient block (presumably the JPEG full-range coefficients - confirm
// against the constant's definition).
void ARGBToUVJ444Row_AVX512BW(const uint8_t* src_argb,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width) {
|
||||
ARGBToUV444MatrixRow_AVX512BW(src_argb, dst_u, dst_v, width,
|
||||
&kArgbJPEGConstants);
|
||||
}
|
||||
#endif // HAS_ARGBTOUVJ444ROW_AVX512BW
|
||||
|
||||
#ifdef HAS_ARGBTOUVJROW_SSSE3
|
||||
void ARGBToUVJRow_SSSE3(const uint8_t* src_argb,
|
||||
int src_stride_argb,
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user