mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-06 16:56:55 +08:00
Convert16To8Row_AVX512BW using vpmovuswb
- The AVX2 pack/perm sequence mutates the channel order.
- The cvt method (vpmovuswb) maintains channel order on AVX512.

Sapphire Rapids
Benchmark of 640x360 on Sapphire Rapids
AVX512BW
[ OK ] LibYUVConvertTest.I010ToNV12_Opt (3547 ms)
[ OK ] LibYUVConvertTest.P010ToNV12_Opt (3186 ms)
AVX2
[ OK ] LibYUVConvertTest.I010ToNV12_Opt (4000 ms)
[ OK ] LibYUVConvertTest.P010ToNV12_Opt (3190 ms)
SSE2
[ OK ] LibYUVConvertTest.I010ToNV12_Opt (5433 ms)
[ OK ] LibYUVConvertTest.P010ToNV12_Opt (4840 ms)

Skylake Xeon
Now vpmovuswb
[ OK ] LibYUVConvertTest.I010ToNV12_Opt (7946 ms)
[ OK ] LibYUVConvertTest.P010ToNV12_Opt (7071 ms)
Was vpackuswb
[ OK ] LibYUVConvertTest.I010ToNV12_Opt (7684 ms)
[ OK ] LibYUVConvertTest.P010ToNV12_Opt (7059 ms)

Switch from vpunpcklwd to vpbroadcastw for the scale value parameter.
Was:
  vpunpcklwd %%xmm2,%%xmm2,%%xmm2
  vbroadcastss %%xmm2,%%ymm2
Now:
  vpbroadcastw %%xmm2,%%ymm2

Bug: 357439226, 357721018
Change-Id: Ifc9c82ab70dba58af6efa0f57f5f7a344014652e
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5787040
Reviewed-by: Wan-Teh Chang <wtc@google.com>
This commit is contained in:
parent
c21dda06dd
commit
679e851f65
@ -27,12 +27,10 @@ extern "C" {
|
||||
#define LIBYUV_DISABLE_NEON
|
||||
#endif
|
||||
|
||||
// clang >= 19.0.0 required for SME
|
||||
#if !defined(LIBYUV_DISABLE_SME) && defined(__clang__) && defined(__aarch64__)
|
||||
#if __clang_major__ < 19
|
||||
// temporary disable SME
|
||||
#if !defined(LIBYUV_DISABLE_SME)
|
||||
#define LIBYUV_DISABLE_SME
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
|
||||
#if defined(__has_feature)
|
||||
|
||||
@ -400,10 +400,11 @@ extern "C" {
|
||||
|
||||
// The following are available for AVX512 clang x86 platforms:
|
||||
// TODO(fbarchard): Port to GCC and Visual C
|
||||
// TODO(fbarchard): re-enable HAS_ARGBTORGB24ROW_AVX512VBMI. Issue libyuv:789
|
||||
// TODO(b/42280744): re-enable HAS_ARGBTORGB24ROW_AVX512VBMI.
|
||||
#if !defined(LIBYUV_DISABLE_X86) && \
|
||||
(defined(__x86_64__) || defined(__i386__)) && defined(CLANG_HAS_AVX512)
|
||||
#define HAS_ARGBTORGB24ROW_AVX512VBMI
|
||||
#define HAS_CONVERT16TO8ROW_AVX512BW
|
||||
#define HAS_MERGEUVROW_AVX512BW
|
||||
#endif
|
||||
|
||||
@ -3337,6 +3338,10 @@ void Convert16To8Row_AVX2(const uint16_t* src_y,
|
||||
uint8_t* dst_y,
|
||||
int scale,
|
||||
int width);
|
||||
// AVX512BW row kernel with the same signature as Convert16To8Row_AVX2.
// Callers in this change select it directly only when width is a multiple
// of 64; other widths go through Convert16To8Row_Any_AVX512BW.
void Convert16To8Row_AVX512BW(const uint16_t* src_y,
|
||||
uint8_t* dst_y,
|
||||
int scale,
|
||||
int width);
|
||||
void Convert16To8Row_Any_SSSE3(const uint16_t* src_ptr,
|
||||
uint8_t* dst_ptr,
|
||||
int scale,
|
||||
@ -3345,6 +3350,10 @@ void Convert16To8Row_Any_AVX2(const uint16_t* src_ptr,
|
||||
uint8_t* dst_ptr,
|
||||
int scale,
|
||||
int width);
|
||||
// Wrapper around Convert16To8Row_AVX512BW generated by the ANY11C macro;
// used by the dispatch code when width is not a multiple of 64.
void Convert16To8Row_Any_AVX512BW(const uint16_t* src_ptr,
|
||||
uint8_t* dst_ptr,
|
||||
int scale,
|
||||
int width);
|
||||
void Convert16To8Row_NEON(const uint16_t* src_y,
|
||||
uint8_t* dst_y,
|
||||
int scale,
|
||||
|
||||
@ -705,6 +705,14 @@ int I010ToNV12(const uint16_t* src_y,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_CONVERT16TO8ROW_AVX512BW)
|
||||
if (TestCpuFlag(kCpuHasAVX512BW)) {
|
||||
Convert16To8Row = Convert16To8Row_Any_AVX512BW;
|
||||
if (IS_ALIGNED(width, 64)) {
|
||||
Convert16To8Row = Convert16To8Row_AVX512BW;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(HAS_MERGEUVROW_SSE2)
|
||||
if (TestCpuFlag(kCpuHasSSE2)) {
|
||||
|
||||
@ -149,6 +149,14 @@ void Convert16To8Plane(const uint16_t* src_y,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_CONVERT16TO8ROW_AVX512BW)
|
||||
if (TestCpuFlag(kCpuHasAVX512BW)) {
|
||||
Convert16To8Row = Convert16To8Row_Any_AVX512BW;
|
||||
if (IS_ALIGNED(width, 64)) {
|
||||
Convert16To8Row = Convert16To8Row_AVX512BW;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
// Convert plane
|
||||
for (y = 0; y < height; ++y) {
|
||||
|
||||
@ -1684,8 +1684,8 @@ ANY11T(AB64ToARGBRow_Any_NEON, AB64ToARGBRow_NEON, 8, 4, uint16_t, uint8_t, 7)
|
||||
// Any 1 to 1 with parameter and shorts. BPP measures in shorts.
|
||||
#define ANY11C(NAMEANY, ANY_SIMD, SBPP, BPP, STYPE, DTYPE, MASK) \
|
||||
void NAMEANY(const STYPE* src_ptr, DTYPE* dst_ptr, int scale, int width) { \
|
||||
SIMD_ALIGNED(STYPE vin[32]); \
|
||||
SIMD_ALIGNED(DTYPE vout[32]); \
|
||||
SIMD_ALIGNED(STYPE vin[64]); \
|
||||
SIMD_ALIGNED(DTYPE vout[64]); \
|
||||
memset(vin, 0, sizeof(vin)); /* for msan */ \
|
||||
int r = width & MASK; \
|
||||
int n = width & ~MASK; \
|
||||
@ -1715,6 +1715,15 @@ ANY11C(Convert16To8Row_Any_AVX2,
|
||||
uint8_t,
|
||||
31)
|
||||
#endif
|
||||
#ifdef HAS_CONVERT16TO8ROW_AVX512BW
|
||||
// Instantiate Convert16To8Row_Any_AVX512BW on top of Convert16To8Row_AVX512BW.
// Arguments follow ANY11C(NAMEANY, ANY_SIMD, SBPP, BPP, STYPE, DTYPE, MASK):
// SBPP = 2, BPP = 1, uint16_t source, uint8_t destination, and MASK = 63 so
// the macro splits width into n = width & ~63 and r = width & 63.
ANY11C(Convert16To8Row_Any_AVX512BW,
|
||||
Convert16To8Row_AVX512BW,
|
||||
2,
|
||||
1,
|
||||
uint16_t,
|
||||
uint8_t,
|
||||
63)
|
||||
#endif
|
||||
#ifdef HAS_CONVERT16TO8ROW_NEON
|
||||
ANY11C(Convert16To8Row_Any_NEON,
|
||||
Convert16To8Row_NEON,
|
||||
|
||||
@ -5202,8 +5202,7 @@ void MultiplyRow_16_AVX2(const uint16_t* src_y,
|
||||
int width) {
|
||||
asm volatile (
|
||||
"vmovd %3,%%xmm3 \n"
|
||||
"vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
|
||||
"vbroadcastss %%xmm3,%%ymm3 \n"
|
||||
"vpbroadcastw %%xmm3,%%ymm3 \n"
|
||||
"sub %0,%1 \n"
|
||||
|
||||
// 32 pixels per loop.
|
||||
@ -5239,8 +5238,7 @@ void DivideRow_16_AVX2(const uint16_t* src_y,
|
||||
int width) {
|
||||
asm volatile (
|
||||
"vmovd %3,%%xmm3 \n"
|
||||
"vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
|
||||
"vbroadcastss %%xmm3,%%ymm3 \n"
|
||||
"vpbroadcastw %%xmm3,%%ymm3 \n"
|
||||
"sub %0,%1 \n"
|
||||
|
||||
// 32 pixels per loop.
|
||||
@ -5306,8 +5304,7 @@ void Convert16To8Row_AVX2(const uint16_t* src_y,
|
||||
int width) {
|
||||
asm volatile (
|
||||
"vmovd %3,%%xmm2 \n"
|
||||
"vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n"
|
||||
"vbroadcastss %%xmm2,%%ymm2 \n"
|
||||
"vpbroadcastw %%xmm2,%%ymm2 \n"
|
||||
|
||||
// 32 pixels per loop.
|
||||
LABELALIGN
|
||||
@ -5332,6 +5329,38 @@ void Convert16To8Row_AVX2(const uint16_t* src_y,
|
||||
}
|
||||
#endif // HAS_CONVERT16TO8ROW_AVX2
|
||||
|
||||
#ifdef HAS_CONVERT16TO8ROW_AVX512BW
|
||||
// Convert a row of 16-bit samples to 8-bit samples using AVX512BW.
// Each output byte is the high 16 bits of (src * scale), narrowed to 8 bits
// with unsigned saturation. vpmovuswb narrows in place, so the output order
// matches the input order without a separate permute.
//   src_y: source row of uint16_t samples.
//   dst_y: destination row of uint8_t samples.
//   scale: multiplier; only its low 16 bits are used (vpbroadcastw).
//   width: number of samples. The loop body runs before the count is tested
//          and handles 64 samples per pass, so callers use this kernel
//          directly only when width is a multiple of 64.
void Convert16To8Row_AVX512BW(const uint16_t* src_y,
|
||||
uint8_t* dst_y,
|
||||
int scale,
|
||||
int width) {
|
||||
asm volatile (
|
||||
// Replicate the 16-bit scale into every word lane of zmm2.
"vpbroadcastw %3,%%zmm2 \n"
|
||||
|
||||
// 64 pixels per loop.
|
||||
LABELALIGN
|
||||
"1: \n"
|
||||
// Load 2 x 64 bytes = 64 source samples and advance src by 128 bytes.
"vmovdqu8 (%0),%%zmm0 \n"
|
||||
"vmovdqu8 0x40(%0),%%zmm1 \n"
|
||||
"add $0x80,%0 \n"
|
||||
// Unsigned multiply, keeping the high 16 bits: (sample * scale) >> 16.
"vpmulhuw %%zmm2,%%zmm0,%%zmm0 \n"
|
||||
"vpmulhuw %%zmm2,%%zmm1,%%zmm1 \n"
|
||||
// Narrow each 16-bit lane to 8 bits with unsigned saturation (zmm -> ymm).
"vpmovuswb %%zmm0,%%ymm0 \n"
|
||||
"vpmovuswb %%zmm1,%%ymm1 \n"
|
||||
// Store 2 x 32 bytes = 64 output samples and advance dst by 64 bytes.
"vmovdqu8 %%ymm0,(%1) \n"
|
||||
"vmovdqu8 %%ymm1,0x20(%1) \n"
|
||||
"add $0x40,%1 \n"
|
||||
// Consume 64 samples; repeat while the remaining count is positive.
"sub $0x40,%2 \n"
|
||||
"jg 1b \n"
|
||||
"vzeroupper \n"
|
||||
: "+r"(src_y), // %0
|
||||
"+r"(dst_y), // %1
|
||||
"+r"(width) // %2
|
||||
: "r"(scale) // %3
|
||||
: "memory", "cc", "xmm0", "xmm1", "xmm2");
|
||||
}
|
||||
#endif // HAS_CONVERT16TO8ROW_AVX512BW
|
||||
|
||||
// Use scale to convert to lsb formats depending how many bits there are:
|
||||
// 512 = 9 bits
|
||||
// 1024 = 10 bits
|
||||
@ -5374,8 +5403,7 @@ void Convert8To16Row_AVX2(const uint8_t* src_y,
|
||||
int width) {
|
||||
asm volatile (
|
||||
"vmovd %3,%%xmm2 \n"
|
||||
"vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n"
|
||||
"vbroadcastss %%xmm2,%%ymm2 \n"
|
||||
"vpbroadcastw %%xmm2,%%ymm2 \n"
|
||||
|
||||
// 32 pixels per loop.
|
||||
LABELALIGN
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user