Convert16To8Row_AVX512BW using vpmovuswb
- The AVX2 path's pack/perm sequence mutates channel order; the AVX-512 cvt
  (vpmovuswb) method maintains channel order.

Benchmark of 640x360 on Sapphire Rapids:

AVX512BW
[ OK ] LibYUVConvertTest.I010ToNV12_Opt (3547 ms)
[ OK ] LibYUVConvertTest.P010ToNV12_Opt (3186 ms)
AVX2
[ OK ] LibYUVConvertTest.I010ToNV12_Opt (4000 ms)
[ OK ] LibYUVConvertTest.P010ToNV12_Opt (3190 ms)
SSE2
[ OK ] LibYUVConvertTest.I010ToNV12_Opt (5433 ms)
[ OK ] LibYUVConvertTest.P010ToNV12_Opt (4840 ms)

Skylake Xeon:

Now (vpmovuswb)
[ OK ] LibYUVConvertTest.I010ToNV12_Opt (7946 ms)
[ OK ] LibYUVConvertTest.P010ToNV12_Opt (7071 ms)
Was (vpackuswb)
[ OK ] LibYUVConvertTest.I010ToNV12_Opt (7684 ms)
[ OK ] LibYUVConvertTest.P010ToNV12_Opt (7059 ms)

Also switch from vpunpcklwd + vbroadcastss to a single vpbroadcastw for
broadcasting the scale value parameter.

Was:
  vpunpcklwd %%xmm2,%%xmm2,%%xmm2
  vbroadcastss %%xmm2,%%ymm2
Now:
  vpbroadcastw %%xmm2,%%ymm2

Bug: 357439226, 357721018
Change-Id: Ifc9c82ab70dba58af6efa0f57f5f7a344014652e
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5787040
Reviewed-by: Wan-Teh Chang <wtc@google.com>
Parent: c21dda06dd
Commit: 679e851f65
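For orientation, every row kernel touched by this change computes the same per-pixel operation: an unsigned 16-bit sample is multiplied by scale, the high 16 bits of the product are kept (vpmulhuw), and the result is narrowed to 8 bits with unsigned saturation (e.g. scale = 16384 maps full-range 10-bit input onto 0..255). A minimal scalar sketch of that arithmetic, for illustration only (the function name below is hypothetical, not libyuv's actual C fallback):

#include <stdint.h>

// Illustrative scalar model of the 16-to-8 row kernels in this commit.
// dst = saturate_u8((src * scale) >> 16), which is what vpmulhuw followed
// by an unsigned-saturating narrow (vpackuswb or vpmovuswb) computes.
static void Convert16To8Row_Sketch(const uint16_t* src_y,
                                   uint8_t* dst_y,
                                   int scale,  // e.g. 16384 for 10-bit input
                                   int width) {
  for (int x = 0; x < width; ++x) {
    uint32_t v = ((uint32_t)src_y[x] * (uint32_t)scale) >> 16;
    dst_y[x] = (uint8_t)(v > 255 ? 255 : v);
  }
}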
@@ -27,12 +27,10 @@ extern "C" {
 #define LIBYUV_DISABLE_NEON
 #endif
 
-// clang >= 19.0.0 required for SME
-#if !defined(LIBYUV_DISABLE_SME) && defined(__clang__) && defined(__aarch64__)
-#if __clang_major__ < 19
+// temporary disable SME
+#if !defined(LIBYUV_DISABLE_SME)
 #define LIBYUV_DISABLE_SME
 #endif
-#endif
 
 // MemorySanitizer does not support assembly code yet. http://crbug.com/344505
 #if defined(__has_feature)
@@ -400,10 +400,11 @@ extern "C" {
 
 // The following are available for AVX512 clang x86 platforms:
 // TODO(fbarchard): Port to GCC and Visual C
-// TODO(fbarchard): re-enable HAS_ARGBTORGB24ROW_AVX512VBMI. Issue libyuv:789
+// TODO(b/42280744): re-enable HAS_ARGBTORGB24ROW_AVX512VBMI.
 #if !defined(LIBYUV_DISABLE_X86) && \
     (defined(__x86_64__) || defined(__i386__)) && defined(CLANG_HAS_AVX512)
 #define HAS_ARGBTORGB24ROW_AVX512VBMI
+#define HAS_CONVERT16TO8ROW_AVX512BW
 #define HAS_MERGEUVROW_AVX512BW
 #endif
 
@@ -3337,6 +3338,10 @@ void Convert16To8Row_AVX2(const uint16_t* src_y,
                           uint8_t* dst_y,
                           int scale,
                           int width);
+void Convert16To8Row_AVX512BW(const uint16_t* src_y,
+                              uint8_t* dst_y,
+                              int scale,
+                              int width);
 void Convert16To8Row_Any_SSSE3(const uint16_t* src_ptr,
                                uint8_t* dst_ptr,
                                int scale,
@@ -3345,6 +3350,10 @@ void Convert16To8Row_Any_AVX2(const uint16_t* src_ptr,
                               uint8_t* dst_ptr,
                               int scale,
                               int width);
+void Convert16To8Row_Any_AVX512BW(const uint16_t* src_ptr,
+                                  uint8_t* dst_ptr,
+                                  int scale,
+                                  int width);
 void Convert16To8Row_NEON(const uint16_t* src_y,
                           uint8_t* dst_y,
                           int scale,
@@ -705,6 +705,14 @@ int I010ToNV12(const uint16_t* src_y,
     }
   }
 #endif
+#if defined(HAS_CONVERT16TO8ROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW)) {
+    Convert16To8Row = Convert16To8Row_Any_AVX512BW;
+    if (IS_ALIGNED(width, 64)) {
+      Convert16To8Row = Convert16To8Row_AVX512BW;
+    }
+  }
+#endif
 
 #if defined(HAS_MERGEUVROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
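The block added above follows libyuv's usual run-time dispatch pattern: the row-function pointer starts at the portable C kernel, is upgraded to the _Any_ wrapper when the AVX512BW CPU flag is present, and to the full-width kernel only when the row width is an exact multiple of the 64 pixels it processes per iteration. A condensed sketch of that pattern (paraphrased; the real I010ToNV12 and Convert16To8Plane set up more backends than shown here):

  // Condensed illustration of the dispatch above, not a verbatim excerpt.
  void (*Convert16To8Row)(const uint16_t*, uint8_t*, int, int) =
      Convert16To8Row_C;  // portable fallback, handles any width
#if defined(HAS_CONVERT16TO8ROW_AVX512BW)
  if (TestCpuFlag(kCpuHasAVX512BW)) {
    Convert16To8Row = Convert16To8Row_Any_AVX512BW;  // SIMD body + padded-buffer tail
    if (IS_ALIGNED(width, 64)) {
      Convert16To8Row = Convert16To8Row_AVX512BW;  // full 64-pixel kernel only
    }
  }
#endif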
@@ -149,6 +149,14 @@ void Convert16To8Plane(const uint16_t* src_y,
     }
   }
 #endif
+#if defined(HAS_CONVERT16TO8ROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW)) {
+    Convert16To8Row = Convert16To8Row_Any_AVX512BW;
+    if (IS_ALIGNED(width, 64)) {
+      Convert16To8Row = Convert16To8Row_AVX512BW;
+    }
+  }
+#endif
 
   // Convert plane
   for (y = 0; y < height; ++y) {
@@ -1684,8 +1684,8 @@ ANY11T(AB64ToARGBRow_Any_NEON, AB64ToARGBRow_NEON, 8, 4, uint16_t, uint8_t, 7)
 // Any 1 to 1 with parameter and shorts. BPP measures in shorts.
 #define ANY11C(NAMEANY, ANY_SIMD, SBPP, BPP, STYPE, DTYPE, MASK)             \
   void NAMEANY(const STYPE* src_ptr, DTYPE* dst_ptr, int scale, int width) { \
-    SIMD_ALIGNED(STYPE vin[32]);                                             \
-    SIMD_ALIGNED(DTYPE vout[32]);                                            \
+    SIMD_ALIGNED(STYPE vin[64]);                                             \
+    SIMD_ALIGNED(DTYPE vout[64]);                                            \
     memset(vin, 0, sizeof(vin)); /* for msan */                              \
     int r = width & MASK;                                                    \
     int n = width & ~MASK;                                                   \
@@ -1715,6 +1715,15 @@ ANY11C(Convert16To8Row_Any_AVX2,
        uint8_t,
        31)
 #endif
+#ifdef HAS_CONVERT16TO8ROW_AVX512BW
+ANY11C(Convert16To8Row_Any_AVX512BW,
+       Convert16To8Row_AVX512BW,
+       2,
+       1,
+       uint16_t,
+       uint8_t,
+       63)
+#endif
 #ifdef HAS_CONVERT16TO8ROW_NEON
 ANY11C(Convert16To8Row_Any_NEON,
        Convert16To8Row_NEON,
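For readers unfamiliar with the ANY11C machinery: the MASK of 63 and the vin[64]/vout[64] buffers resized earlier in this change exist so the wrapper can finish a row whose width is not a multiple of 64 without reading or writing past the end. Roughly, the generated Convert16To8Row_Any_AVX512BW behaves like the sketch below (a simplified approximation; the authoritative definition is the ANY11C macro in row_any.cc):

#include <stdint.h>
#include <string.h>

// Declared in libyuv's row.h (added by this commit).
void Convert16To8Row_AVX512BW(const uint16_t* src_y, uint8_t* dst_y,
                              int scale, int width);

// Approximate expansion of ANY11C(Convert16To8Row_Any_AVX512BW,
// Convert16To8Row_AVX512BW, 2, 1, uint16_t, uint8_t, 63). Simplified sketch.
void Convert16To8Row_Any_AVX512BW_Sketch(const uint16_t* src_ptr,
                                         uint8_t* dst_ptr,
                                         int scale,
                                         int width) {
  uint16_t vin[64];             // SIMD_ALIGNED in the real macro; 64 = MASK + 1
  uint8_t vout[64];
  memset(vin, 0, sizeof(vin));  // keep MSan happy about the padded tail
  int r = width & 63;           // leftover pixels
  int n = width & ~63;          // largest multiple of 64
  if (n > 0) {
    Convert16To8Row_AVX512BW(src_ptr, dst_ptr, scale, n);  // main SIMD body
  }
  // Copy the tail into a full 64-pixel buffer, convert it, copy the result out.
  memcpy(vin, src_ptr + n, r * sizeof(uint16_t));
  Convert16To8Row_AVX512BW(vin, vout, scale, 64);
  memcpy(dst_ptr + n, vout, r * sizeof(uint8_t));
}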
@@ -5202,8 +5202,7 @@ void MultiplyRow_16_AVX2(const uint16_t* src_y,
                          int width) {
   asm volatile (
       "vmovd %3,%%xmm3 \n"
-      "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
-      "vbroadcastss %%xmm3,%%ymm3 \n"
+      "vpbroadcastw %%xmm3,%%ymm3 \n"
       "sub %0,%1 \n"
 
       // 32 pixels per loop.
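The second half of the commit (this hunk and the matching ones in DivideRow_16_AVX2, Convert16To8Row_AVX2, and Convert8To16Row_AVX2 below) replaces a two-instruction splat of the 16-bit scale value with a single vpbroadcastw. In intrinsics terms, both sequences produce a ymm register with scale replicated into every 16-bit lane; a hedged sketch using standard AVX2 intrinsics, for illustration only (the shipped kernels do this inside inline asm):

#include <immintrin.h>

// Old sequence: vmovd + vpunpcklwd + a 32-bit broadcast (vbroadcastss in the
// asm; vpbroadcastd is used here as an integer-typed equivalent of that splat).
static inline __m256i BroadcastScaleOld(int scale) {
  __m128i s = _mm_cvtsi32_si128(scale);  // vmovd: scale in the low 32 bits
  s = _mm_unpacklo_epi16(s, s);          // vpunpcklwd: duplicate the low word
  return _mm256_broadcastd_epi32(s);     // splat the 32-bit pair to all lanes
}

// New sequence: vmovd + vpbroadcastw, one instruction fewer.
static inline __m256i BroadcastScaleNew(int scale) {
  __m128i s = _mm_cvtsi32_si128(scale);  // vmovd
  return _mm256_broadcastw_epi16(s);     // vpbroadcastw: splat the low word
}

With intrinsics one would normally just write _mm256_set1_epi16((int16_t)scale) and let the compiler pick the broadcast; the asm change produces the same register contents with one fewer instruction.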
@@ -5239,8 +5238,7 @@ void DivideRow_16_AVX2(const uint16_t* src_y,
                        int width) {
   asm volatile (
       "vmovd %3,%%xmm3 \n"
-      "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
-      "vbroadcastss %%xmm3,%%ymm3 \n"
+      "vpbroadcastw %%xmm3,%%ymm3 \n"
       "sub %0,%1 \n"
 
       // 32 pixels per loop.
@@ -5306,8 +5304,7 @@ void Convert16To8Row_AVX2(const uint16_t* src_y,
                           int width) {
   asm volatile (
       "vmovd %3,%%xmm2 \n"
-      "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n"
-      "vbroadcastss %%xmm2,%%ymm2 \n"
+      "vpbroadcastw %%xmm2,%%ymm2 \n"
 
       // 32 pixels per loop.
       LABELALIGN
@@ -5332,6 +5329,38 @@ void Convert16To8Row_AVX2(const uint16_t* src_y,
 }
 #endif  // HAS_CONVERT16TO8ROW_AVX2
 
+#ifdef HAS_CONVERT16TO8ROW_AVX512BW
+void Convert16To8Row_AVX512BW(const uint16_t* src_y,
+                              uint8_t* dst_y,
+                              int scale,
+                              int width) {
+  asm volatile (
+      "vpbroadcastw %3,%%zmm2 \n"
+
+      // 64 pixels per loop.
+      LABELALIGN
+      "1: \n"
+      "vmovdqu8 (%0),%%zmm0 \n"
+      "vmovdqu8 0x40(%0),%%zmm1 \n"
+      "add $0x80,%0 \n"
+      "vpmulhuw %%zmm2,%%zmm0,%%zmm0 \n"
+      "vpmulhuw %%zmm2,%%zmm1,%%zmm1 \n"
+      "vpmovuswb %%zmm0,%%ymm0 \n"
+      "vpmovuswb %%zmm1,%%ymm1 \n"
+      "vmovdqu8 %%ymm0,(%1) \n"
+      "vmovdqu8 %%ymm1,0x20(%1) \n"
+      "add $0x40,%1 \n"
+      "sub $0x40,%2 \n"
+      "jg 1b \n"
+      "vzeroupper \n"
+      : "+r"(src_y),  // %0
+        "+r"(dst_y),  // %1
+        "+r"(width)   // %2
+      : "r"(scale)    // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif  // HAS_CONVERT16TO8ROW_AVX2
+
 // Use scale to convert to lsb formats depending how many bits there are:
 // 512 = 9 bits
 // 1024 = 10 bits
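The new kernel above is also why the commit message mentions channel order: vpackuswb in the AVX2 path packs within 128-bit lanes and needs a following permute to restore ordering, while vpmovuswb narrows a whole zmm to a ymm in element order with unsigned saturation, so no shuffle is required. A rough intrinsics rendering of one 64-pixel iteration of that loop (sketch only, assuming an AVX-512BW-capable CPU; the shipped code is the inline asm in the diff):

#include <immintrin.h>
#include <stdint.h>

// One 64-pixel iteration of the AVX512BW path, expressed with intrinsics.
static inline void Convert16To8_64Pixels(const uint16_t* src,
                                         uint8_t* dst,
                                         __m512i scale) {  // word-broadcast scale
  __m512i v0 = _mm512_loadu_si512((const void*)src);         // vmovdqu8 (%0)
  __m512i v1 = _mm512_loadu_si512((const void*)(src + 32));  // vmovdqu8 0x40(%0)
  v0 = _mm512_mulhi_epu16(v0, scale);  // vpmulhuw: (src * scale) >> 16 per lane
  v1 = _mm512_mulhi_epu16(v1, scale);
  // vpmovuswb: 32 x uint16 -> 32 x uint8 with unsigned saturation, in order.
  __m256i b0 = _mm512_cvtusepi16_epi8(v0);
  __m256i b1 = _mm512_cvtusepi16_epi8(v1);
  _mm256_storeu_si256((__m256i*)dst, b0);         // vmovdqu8 %%ymm0,(%1)
  _mm256_storeu_si256((__m256i*)(dst + 32), b1);  // vmovdqu8 %%ymm1,0x20(%1)
}

The scale argument would be prepared once per call, e.g. with _mm512_set1_epi16((int16_t)scale), mirroring the vpbroadcastw at the top of the asm.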
@@ -5374,8 +5403,7 @@ void Convert8To16Row_AVX2(const uint8_t* src_y,
                           int width) {
   asm volatile (
       "vmovd %3,%%xmm2 \n"
-      "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n"
-      "vbroadcastss %%xmm2,%%ymm2 \n"
+      "vpbroadcastw %%xmm2,%%ymm2 \n"
 
       // 32 pixels per loop.
       LABELALIGN