Convert16To8Row_AVX512BW using vpmovuswb

- The AVX2 version packs with vpackuswb, which interleaves 128-bit lanes and needs an extra permute to restore element order.
- The AVX-512 convert instruction (vpmovuswb) narrows words to bytes while maintaining channel order; see the sketch below.
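
For illustration, a minimal standalone intrinsics sketch (not libyuv code; function names invented) contrasting the two narrowing strategies. Build with e.g. clang -O2 -mavx512bw:

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

// AVX2: vpackuswb packs within 128-bit lanes, producing
// [a0..7, b0..7, a8..15, b8..15], so a vpermq must restore order.
static __m256i Narrow32_AVX2(__m256i lo, __m256i hi) {
  __m256i packed = _mm256_packus_epi16(lo, hi);   // lane-interleaved
  return _mm256_permute4x64_epi64(packed, 0xd8);  // reorder qwords 0,2,1,3
}

// AVX-512BW: vpmovuswb narrows 32 words to 32 bytes in element order.
static __m256i Narrow32_AVX512BW(__m512i v) {
  return _mm512_cvtusepi16_epi8(v);  // unsigned saturate, order preserved
}

int main(void) {
  uint16_t src[32];
  uint8_t a[32], b[32];
  for (int i = 0; i < 32; ++i) src[i] = (uint16_t)i;
  __m256i lo = _mm256_loadu_si256((const __m256i*)src);
  __m256i hi = _mm256_loadu_si256((const __m256i*)(src + 16));
  _mm256_storeu_si256((__m256i*)a, Narrow32_AVX2(lo, hi));
  _mm256_storeu_si256((__m256i*)b, Narrow32_AVX512BW(_mm512_loadu_si512(src)));
  for (int i = 0; i < 32; ++i) printf("%u %u\n", a[i], b[i]);  // both print 0..31 in order
  return 0;
}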

Benchmark of 640x360 on Sapphire Rapids:

AVX512BW
[       OK ] LibYUVConvertTest.I010ToNV12_Opt (3547 ms)
[       OK ] LibYUVConvertTest.P010ToNV12_Opt (3186 ms)

AVX2
[       OK ] LibYUVConvertTest.I010ToNV12_Opt (4000 ms)
[       OK ] LibYUVConvertTest.P010ToNV12_Opt (3190 ms)

SSE2
[       OK ] LibYUVConvertTest.I010ToNV12_Opt (5433 ms)
[       OK ] LibYUVConvertTest.P010ToNV12_Opt (4840 ms)

Benchmark of 640x360 on Skylake Xeon:

Now (vpmovuswb)
[       OK ] LibYUVConvertTest.I010ToNV12_Opt (7946 ms)
[       OK ] LibYUVConvertTest.P010ToNV12_Opt (7071 ms)

Was (vpackuswb)
[       OK ] LibYUVConvertTest.I010ToNV12_Opt (7684 ms)
[       OK ] LibYUVConvertTest.P010ToNV12_Opt (7059 ms)

Switch from vpunpcklwd + vbroadcastss to a single vpbroadcastw to broadcast the scale value parameter.
Was
vpunpcklwd  %%xmm2,%%xmm2,%%xmm2
vbroadcastss %%xmm2,%%ymm2

Now
vpbroadcastw %%xmm2,%%ymm2
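
Both forms splat the low 16-bit scale into every word lane of ymm2; vpbroadcastw (AVX2) needs one instruction instead of two. A rough intrinsics equivalent (illustrative only; vpbroadcastd stands in for the float-domain vbroadcastss):

  __m128i s = _mm_cvtsi32_si128(scale);  // vmovd
  // was: duplicate the word into a dword, then broadcast the dword
  __m256i was = _mm256_broadcastd_epi32(_mm_unpacklo_epi16(s, s));
  // now: broadcast the word directly
  __m256i now = _mm256_broadcastw_epi16(s);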

Bug: 357439226, 357721018
Change-Id: Ifc9c82ab70dba58af6efa0f57f5f7a344014652e
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5787040
Reviewed-by: Wan-Teh Chang <wtc@google.com>
Frank Barchard 2024-08-13 16:57:45 -07:00
parent c21dda06dd
commit 679e851f65
6 changed files with 75 additions and 15 deletions

@@ -27,12 +27,10 @@ extern "C" {
 #define LIBYUV_DISABLE_NEON
 #endif
 
-// clang >= 19.0.0 required for SME
-#if !defined(LIBYUV_DISABLE_SME) && defined(__clang__) && defined(__aarch64__)
-#if __clang_major__ < 19
+// temporary disable SME
+#if !defined(LIBYUV_DISABLE_SME)
 #define LIBYUV_DISABLE_SME
 #endif
-#endif
 
 // MemorySanitizer does not support assembly code yet. http://crbug.com/344505
 #if defined(__has_feature)

@@ -400,10 +400,11 @@ extern "C" {
 
 // The following are available for AVX512 clang x86 platforms:
 // TODO(fbarchard): Port to GCC and Visual C
-// TODO(fbarchard): re-enable HAS_ARGBTORGB24ROW_AVX512VBMI. Issue libyuv:789
+// TODO(b/42280744): re-enable HAS_ARGBTORGB24ROW_AVX512VBMI.
 #if !defined(LIBYUV_DISABLE_X86) && \
     (defined(__x86_64__) || defined(__i386__)) && defined(CLANG_HAS_AVX512)
 #define HAS_ARGBTORGB24ROW_AVX512VBMI
+#define HAS_CONVERT16TO8ROW_AVX512BW
 #define HAS_MERGEUVROW_AVX512BW
 #endif
 
@@ -3337,6 +3338,10 @@ void Convert16To8Row_AVX2(const uint16_t* src_y,
                           uint8_t* dst_y,
                           int scale,
                           int width);
+void Convert16To8Row_AVX512BW(const uint16_t* src_y,
+                              uint8_t* dst_y,
+                              int scale,
+                              int width);
 void Convert16To8Row_Any_SSSE3(const uint16_t* src_ptr,
                                uint8_t* dst_ptr,
                                int scale,
@@ -3345,6 +3350,10 @@ void Convert16To8Row_Any_AVX2(const uint16_t* src_ptr,
                               uint8_t* dst_ptr,
                               int scale,
                               int width);
+void Convert16To8Row_Any_AVX512BW(const uint16_t* src_ptr,
+                                  uint8_t* dst_ptr,
+                                  int scale,
+                                  int width);
 void Convert16To8Row_NEON(const uint16_t* src_y,
                           uint8_t* dst_y,
                           int scale,

@@ -705,6 +705,14 @@ int I010ToNV12(const uint16_t* src_y,
     }
   }
 #endif
+#if defined(HAS_CONVERT16TO8ROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW)) {
+    Convert16To8Row = Convert16To8Row_Any_AVX512BW;
+    if (IS_ALIGNED(width, 64)) {
+      Convert16To8Row = Convert16To8Row_AVX512BW;
+    }
+  }
+#endif
 #if defined(HAS_MERGEUVROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {

@@ -149,6 +149,14 @@ void Convert16To8Plane(const uint16_t* src_y,
     }
   }
 #endif
+#if defined(HAS_CONVERT16TO8ROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW)) {
+    Convert16To8Row = Convert16To8Row_Any_AVX512BW;
+    if (IS_ALIGNED(width, 64)) {
+      Convert16To8Row = Convert16To8Row_AVX512BW;
+    }
+  }
+#endif
 
   // Convert plane
   for (y = 0; y < height; ++y) {

@@ -1684,8 +1684,8 @@ ANY11T(AB64ToARGBRow_Any_NEON, AB64ToARGBRow_NEON, 8, 4, uint16_t, uint8_t, 7)
 // Any 1 to 1 with parameter and shorts. BPP measures in shorts.
 #define ANY11C(NAMEANY, ANY_SIMD, SBPP, BPP, STYPE, DTYPE, MASK)             \
   void NAMEANY(const STYPE* src_ptr, DTYPE* dst_ptr, int scale, int width) { \
-    SIMD_ALIGNED(STYPE vin[32]);                                             \
-    SIMD_ALIGNED(DTYPE vout[32]);                                            \
+    SIMD_ALIGNED(STYPE vin[64]);                                             \
+    SIMD_ALIGNED(DTYPE vout[64]);                                            \
     memset(vin, 0, sizeof(vin)); /* for msan */                              \
     int r = width & MASK;                                                    \
     int n = width & ~MASK;                                                   \
@@ -1715,6 +1715,15 @@ ANY11C(Convert16To8Row_Any_AVX2,
        uint8_t,
        31)
 #endif
+#ifdef HAS_CONVERT16TO8ROW_AVX512BW
+ANY11C(Convert16To8Row_Any_AVX512BW,
+       Convert16To8Row_AVX512BW,
+       2,
+       1,
+       uint16_t,
+       uint8_t,
+       63)
+#endif
 #ifdef HAS_CONVERT16TO8ROW_NEON
 ANY11C(Convert16To8Row_Any_NEON,
        Convert16To8Row_NEON,

@@ -5202,8 +5202,7 @@ void MultiplyRow_16_AVX2(const uint16_t* src_y,
                          int width) {
   asm volatile (
       "vmovd        %3,%%xmm3                   \n"
-      "vpunpcklwd   %%xmm3,%%xmm3,%%xmm3        \n"
-      "vbroadcastss %%xmm3,%%ymm3               \n"
+      "vpbroadcastw %%xmm3,%%ymm3               \n"
       "sub          %0,%1                       \n"
 
       // 32 pixels per loop.
@@ -5239,8 +5238,7 @@ void DivideRow_16_AVX2(const uint16_t* src_y,
                        int width) {
   asm volatile (
       "vmovd        %3,%%xmm3                   \n"
-      "vpunpcklwd   %%xmm3,%%xmm3,%%xmm3        \n"
-      "vbroadcastss %%xmm3,%%ymm3               \n"
+      "vpbroadcastw %%xmm3,%%ymm3               \n"
       "sub          %0,%1                       \n"
 
       // 32 pixels per loop.
@@ -5306,8 +5304,7 @@ void Convert16To8Row_AVX2(const uint16_t* src_y,
                           int width) {
   asm volatile (
       "vmovd        %3,%%xmm2                   \n"
-      "vpunpcklwd   %%xmm2,%%xmm2,%%xmm2        \n"
-      "vbroadcastss %%xmm2,%%ymm2               \n"
+      "vpbroadcastw %%xmm2,%%ymm2               \n"
 
       // 32 pixels per loop.
       LABELALIGN
@@ -5332,6 +5329,38 @@ void Convert16To8Row_AVX2(const uint16_t* src_y,
 }
 #endif  // HAS_CONVERT16TO8ROW_AVX2
 
+#ifdef HAS_CONVERT16TO8ROW_AVX512BW
+void Convert16To8Row_AVX512BW(const uint16_t* src_y,
+                              uint8_t* dst_y,
+                              int scale,
+                              int width) {
+  asm volatile (
+      "vpbroadcastw %3,%%zmm2                   \n"
+
+      // 64 pixels per loop.
+      LABELALIGN
+      "1:                                       \n"
+      "vmovdqu8     (%0),%%zmm0                 \n"
+      "vmovdqu8     0x40(%0),%%zmm1             \n"
+      "add          $0x80,%0                    \n"
+      "vpmulhuw     %%zmm2,%%zmm0,%%zmm0        \n"
+      "vpmulhuw     %%zmm2,%%zmm1,%%zmm1        \n"
+      "vpmovuswb    %%zmm0,%%ymm0               \n"
+      "vpmovuswb    %%zmm1,%%ymm1               \n"
+      "vmovdqu8     %%ymm0,(%1)                 \n"
+      "vmovdqu8     %%ymm1,0x20(%1)             \n"
+      "add          $0x40,%1                    \n"
+      "sub          $0x40,%2                    \n"
+      "jg           1b                          \n"
+      "vzeroupper                               \n"
+      : "+r"(src_y),  // %0
+        "+r"(dst_y),  // %1
+        "+r"(width)   // %2
+      : "r"(scale)    // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif  // HAS_CONVERT16TO8ROW_AVX512BW
+
 // Use scale to convert to lsb formats depending how many bits there are:
 // 512 = 9 bits
 // 1024 = 10 bits
@@ -5374,8 +5403,7 @@ void Convert8To16Row_AVX2(const uint8_t* src_y,
                           int width) {
   asm volatile (
       "vmovd        %3,%%xmm2                   \n"
-      "vpunpcklwd   %%xmm2,%%xmm2,%%xmm2        \n"
-      "vbroadcastss %%xmm2,%%ymm2               \n"
+      "vpbroadcastw %%xmm2,%%ymm2               \n"
 
       // 32 pixels per loop.
       LABELALIGN
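
For reference, a scalar model of what the Convert16To8 rows compute: vpmulhuw keeps the high 16 bits of the unsigned product, and the saturating narrow (vpmovuswb here, vpackuswb in the AVX2 path) clamps each word to a byte. With the usual libyuv scale of 16384 (0x4000) for 10-bit input, (v * 16384) >> 16 == v >> 2, mapping 0..1023 to 0..255. Sketch only, not part of the change:

  static inline uint8_t Convert16To8(uint16_t v, uint16_t scale) {
    uint32_t t = ((uint32_t)v * scale) >> 16;  // vpmulhuw
    return (uint8_t)(t > 255 ? 255 : t);       // vpmovuswb (unsigned saturate)
  }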