Convert16To8Row_AVX512BW using vpmovuswb

- AVX2 uses a pack/permute sequence (vpackuswb plus a fix-up permute), which mutates element order (see the intrinsics sketch below)
- The AVX512 convert instruction (vpmovuswb) preserves element order, so no fix-up permute is needed
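
For illustration only, a minimal C intrinsics sketch of the two approaches (function names are hypothetical, not the libyuv kernels; assumes scaled values fit in 8 bits and an -mavx2/-mavx512bw build):

#include <immintrin.h>
#include <stdint.h>

// AVX2 style: vpackuswb interleaves the two 128-bit lanes, so a vpermq
// fix-up is needed afterwards to restore element order.
static inline void Convert16To8_AVX2_32px(const uint16_t* src, uint8_t* dst,
                                          int scale) {
  __m256i vscale = _mm256_set1_epi16((short)scale);
  __m256i a = _mm256_mulhi_epu16(
      _mm256_loadu_si256((const __m256i*)src), vscale);
  __m256i b = _mm256_mulhi_epu16(
      _mm256_loadu_si256((const __m256i*)(src + 16)), vscale);
  __m256i packed = _mm256_packus_epi16(a, b);       // lanes out of order
  packed = _mm256_permute4x64_epi64(packed, 0xd8);  // restore order
  _mm256_storeu_si256((__m256i*)dst, packed);
}

// AVX512BW style: vpmovuswb narrows a zmm to a ymm in place, keeping
// elements in order, so no fix-up permute is required.
static inline void Convert16To8_AVX512BW_32px(const uint16_t* src,
                                              uint8_t* dst, int scale) {
  __m512i vscale = _mm512_set1_epi16((short)scale);
  __m512i v = _mm512_mulhi_epu16(_mm512_loadu_si512((const void*)src), vscale);
  _mm256_storeu_si256((__m256i*)dst, _mm512_cvtusepi16_epi8(v));
}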

Benchmark of 640x360 on Sapphire Rapids
AVX512BW
[       OK ] LibYUVConvertTest.I010ToNV12_Opt (3547 ms)
[       OK ] LibYUVConvertTest.P010ToNV12_Opt (3186 ms)

AVX2
[       OK ] LibYUVConvertTest.I010ToNV12_Opt (4000 ms)
[       OK ] LibYUVConvertTest.P010ToNV12_Opt (3190 ms)

SSE2
[       OK ] LibYUVConvertTest.I010ToNV12_Opt (5433 ms)
[       OK ] LibYUVConvertTest.P010ToNV12_Opt (4840 ms)

Skylake Xeon
Now vpmovuswb
[       OK ] LibYUVConvertTest.I010ToNV12_Opt (7946 ms)
[       OK ] LibYUVConvertTest.P010ToNV12_Opt (7071 ms)

Was vpackuswb
[       OK ] LibYUVConvertTest.I010ToNV12_Opt (7684 ms)
[       OK ] LibYUVConvertTest.P010ToNV12_Opt (7059 ms)

Switch from vpunpcklwd + vbroadcastss to vpbroadcastw for broadcasting the scale parameter
Was
vpunpcklwd  %%xmm2,%%xmm2,%%xmm2
vbroadcastss %%xmm2,%%ymm2

Now
vpbroadcastw %%xmm2,%%ymm2
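
As a rough intrinsics equivalent of this change (hypothetical helpers, not the inline assembly itself; both return a ymm with the 16-bit scale in every lane):

#include <immintrin.h>

// Was: vpunpcklwd duplicates the scale word, then vbroadcastss broadcasts
// the resulting 32-bit pair to all lanes.
static inline __m256i BroadcastScaleOld(int scale) {
  __m128i x = _mm_cvtsi32_si128(scale);
  x = _mm_unpacklo_epi16(x, x);  // vpunpcklwd
  return _mm256_castps_si256(
      _mm256_broadcastss_ps(_mm_castsi128_ps(x)));  // vbroadcastss
}

// Now: a single vpbroadcastw replicates the 16-bit scale to every lane.
static inline __m256i BroadcastScaleNew(int scale) {
  return _mm256_broadcastw_epi16(_mm_cvtsi32_si128(scale));  // vpbroadcastw
}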

Bug: 357439226, 357721018
Change-Id: Ifc9c82ab70dba58af6efa0f57f5f7a344014652e
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5787040
Reviewed-by: Wan-Teh Chang <wtc@google.com>
commit 679e851f65 (parent c21dda06dd)
Frank Barchard, 2024-08-13 16:57:45 -07:00
6 changed files with 75 additions and 15 deletions
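
For context, a hedged usage sketch (assuming the existing Convert16To8Plane() API declared in libyuv/planar_functions.h; with AVX512BW detected and width a multiple of 64, the new Convert16To8Row_AVX512BW kernel is selected):

#include <stdint.h>
#include "libyuv/planar_functions.h"

// Convert a 10-bit Y plane to 8 bits. vpmulhuw computes (v * scale) >> 16,
// so scale 16384 yields v >> 2, i.e. 10-bit to 8-bit.
void TenBitToEightBitY(const uint16_t* src_y, int src_stride_y,
                       uint8_t* dst_y, int dst_stride_y,
                       int width, int height) {
  Convert16To8Plane(src_y, src_stride_y, dst_y, dst_stride_y,
                    16384, width, height);
}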


@@ -27,12 +27,10 @@ extern "C" {
#define LIBYUV_DISABLE_NEON
#endif
// clang >= 19.0.0 required for SME
#if !defined(LIBYUV_DISABLE_SME) && defined(__clang__) && defined(__aarch64__)
#if __clang_major__ < 19
// temporary disable SME
#if !defined(LIBYUV_DISABLE_SME)
#define LIBYUV_DISABLE_SME
#endif
#endif
// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
#if defined(__has_feature)


@@ -400,10 +400,11 @@ extern "C" {
// The following are available for AVX512 clang x86 platforms:
// TODO(fbarchard): Port to GCC and Visual C
// TODO(fbarchard): re-enable HAS_ARGBTORGB24ROW_AVX512VBMI. Issue libyuv:789
// TODO(b/42280744): re-enable HAS_ARGBTORGB24ROW_AVX512VBMI.
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(__x86_64__) || defined(__i386__)) && defined(CLANG_HAS_AVX512)
#define HAS_ARGBTORGB24ROW_AVX512VBMI
#define HAS_CONVERT16TO8ROW_AVX512BW
#define HAS_MERGEUVROW_AVX512BW
#endif
@@ -3337,6 +3338,10 @@ void Convert16To8Row_AVX2(const uint16_t* src_y,
uint8_t* dst_y,
int scale,
int width);
void Convert16To8Row_AVX512BW(const uint16_t* src_y,
uint8_t* dst_y,
int scale,
int width);
void Convert16To8Row_Any_SSSE3(const uint16_t* src_ptr,
uint8_t* dst_ptr,
int scale,
@@ -3345,6 +3350,10 @@ void Convert16To8Row_Any_AVX2(const uint16_t* src_ptr,
uint8_t* dst_ptr,
int scale,
int width);
void Convert16To8Row_Any_AVX512BW(const uint16_t* src_ptr,
uint8_t* dst_ptr,
int scale,
int width);
void Convert16To8Row_NEON(const uint16_t* src_y,
uint8_t* dst_y,
int scale,


@@ -705,6 +705,14 @@ int I010ToNV12(const uint16_t* src_y,
}
}
#endif
#if defined(HAS_CONVERT16TO8ROW_AVX512BW)
if (TestCpuFlag(kCpuHasAVX512BW)) {
Convert16To8Row = Convert16To8Row_Any_AVX512BW;
if (IS_ALIGNED(width, 64)) {
Convert16To8Row = Convert16To8Row_AVX512BW;
}
}
#endif
#if defined(HAS_MERGEUVROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {


@@ -149,6 +149,14 @@ void Convert16To8Plane(const uint16_t* src_y,
}
}
#endif
#if defined(HAS_CONVERT16TO8ROW_AVX512BW)
if (TestCpuFlag(kCpuHasAVX512BW)) {
Convert16To8Row = Convert16To8Row_Any_AVX512BW;
if (IS_ALIGNED(width, 64)) {
Convert16To8Row = Convert16To8Row_AVX512BW;
}
}
#endif
// Convert plane
for (y = 0; y < height; ++y) {


@@ -1684,8 +1684,8 @@ ANY11T(AB64ToARGBRow_Any_NEON, AB64ToARGBRow_NEON, 8, 4, uint16_t, uint8_t, 7)
// Any 1 to 1 with parameter and shorts. BPP measures in shorts.
#define ANY11C(NAMEANY, ANY_SIMD, SBPP, BPP, STYPE, DTYPE, MASK) \
void NAMEANY(const STYPE* src_ptr, DTYPE* dst_ptr, int scale, int width) { \
SIMD_ALIGNED(STYPE vin[32]); \
SIMD_ALIGNED(DTYPE vout[32]); \
SIMD_ALIGNED(STYPE vin[64]); \
SIMD_ALIGNED(DTYPE vout[64]); \
memset(vin, 0, sizeof(vin)); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
@@ -1715,6 +1715,15 @@ ANY11C(Convert16To8Row_Any_AVX2,
uint8_t,
31)
#endif
#ifdef HAS_CONVERT16TO8ROW_AVX512BW
ANY11C(Convert16To8Row_Any_AVX512BW,
Convert16To8Row_AVX512BW,
2,
1,
uint16_t,
uint8_t,
63)
#endif
#ifdef HAS_CONVERT16TO8ROW_NEON
ANY11C(Convert16To8Row_Any_NEON,
Convert16To8Row_NEON,


@@ -5202,8 +5202,7 @@ void MultiplyRow_16_AVX2(const uint16_t* src_y,
int width) {
asm volatile (
"vmovd %3,%%xmm3 \n"
"vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
"vbroadcastss %%xmm3,%%ymm3 \n"
"vpbroadcastw %%xmm3,%%ymm3 \n"
"sub %0,%1 \n"
// 32 pixels per loop.
@@ -5239,8 +5238,7 @@ void DivideRow_16_AVX2(const uint16_t* src_y,
int width) {
asm volatile (
"vmovd %3,%%xmm3 \n"
"vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
"vbroadcastss %%xmm3,%%ymm3 \n"
"vpbroadcastw %%xmm3,%%ymm3 \n"
"sub %0,%1 \n"
// 32 pixels per loop.
@@ -5306,8 +5304,7 @@ void Convert16To8Row_AVX2(const uint16_t* src_y,
int width) {
asm volatile (
"vmovd %3,%%xmm2 \n"
"vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n"
"vbroadcastss %%xmm2,%%ymm2 \n"
"vpbroadcastw %%xmm2,%%ymm2 \n"
// 32 pixels per loop.
LABELALIGN
@@ -5332,6 +5329,38 @@ void Convert16To8Row_AVX2(const uint16_t* src_y,
}
#endif // HAS_CONVERT16TO8ROW_AVX2
#ifdef HAS_CONVERT16TO8ROW_AVX512BW
void Convert16To8Row_AVX512BW(const uint16_t* src_y,
uint8_t* dst_y,
int scale,
int width) {
asm volatile (
"vpbroadcastw %3,%%zmm2 \n"
// 64 pixels per loop.
LABELALIGN
"1: \n"
"vmovdqu8 (%0),%%zmm0 \n"
"vmovdqu8 0x40(%0),%%zmm1 \n"
"add $0x80,%0 \n"
"vpmulhuw %%zmm2,%%zmm0,%%zmm0 \n"
"vpmulhuw %%zmm2,%%zmm1,%%zmm1 \n"
"vpmovuswb %%zmm0,%%ymm0 \n"
"vpmovuswb %%zmm1,%%ymm1 \n"
"vmovdqu8 %%ymm0,(%1) \n"
"vmovdqu8 %%ymm1,0x20(%1) \n"
"add $0x40,%1 \n"
"sub $0x40,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_y), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "r"(scale) // %3
: "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif  // HAS_CONVERT16TO8ROW_AVX512BW
// Use scale to convert to lsb formats depending how many bits there are:
// 512 = 9 bits
// 1024 = 10 bits
@@ -5374,8 +5403,7 @@ void Convert8To16Row_AVX2(const uint8_t* src_y,
int width) {
asm volatile (
"vmovd %3,%%xmm2 \n"
"vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n"
"vbroadcastss %%xmm2,%%ymm2 \n"
"vpbroadcastw %%xmm2,%%ymm2 \n"
// 32 pixels per loop.
LABELALIGN