mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2026-06-15 00:16:08 +08:00
J400ToARGB switch from SSE2 to AVX2
- port for row_win - remove unused HAS_ macros Was C/SSE2 MSVC J400ToARGB_Opt (1967 ms) Clang J400ToARGB_Opt (568 ms) Now AVX2 MSVC J400ToARGB_Opt (411 ms) Clang J400ToARGB_Opt (418 ms) Test: libyuv_unittest --gtest_filter=*J400ToARGB* Bug: libyuv:508639302 Change-Id: Ifdfb026832b708b61f55477250cc5ee52449f421 TAG=agy CONV=186608fc-966a-4ea7-bf57-9fe07cc1383c Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/7877368 Commit-Queue: Frank Barchard <fbarchard@google.com> Reviewed-by: Justin Green <greenjustin@google.com>
This commit is contained in:
parent
904f562d86
commit
e449eb2172
@ -1,6 +1,6 @@
|
||||
Name: libyuv
|
||||
URL: https://chromium.googlesource.com/libyuv/libyuv/
|
||||
Version: 1941
|
||||
Version: 1942
|
||||
Revision: DEPS
|
||||
License: BSD-3-Clause
|
||||
License File: LICENSE
|
||||
|
||||
@ -49,7 +49,6 @@ extern "C" {
|
||||
#define HAS_ARGBTORGB565ROW_SSE2
|
||||
#define HAS_COPYROW_ERMS
|
||||
#define HAS_COPYROW_SSE2
|
||||
#define HAS_H422TOARGBROW_SSSE3
|
||||
#define HAS_I422TOARGB1555ROW_SSSE3
|
||||
#define HAS_I422TOARGB4444ROW_SSSE3
|
||||
#define HAS_I422TOARGBROW_SSSE3
|
||||
@ -60,11 +59,8 @@ extern "C" {
|
||||
#define HAS_I422TOYUY2ROW_SSE2
|
||||
#define HAS_I444TOARGBROW_SSSE3
|
||||
#define HAS_I444TORGB24ROW_SSSE3
|
||||
#define HAS_J400TOARGBROW_SSE2
|
||||
#define HAS_J422TOARGBROW_SSSE3
|
||||
#define HAS_MERGEUVROW_SSE2
|
||||
#define HAS_MIRRORROW_SSSE3
|
||||
#define HAS_MIRRORSPLITUVROW_SSSE3
|
||||
#define HAS_NV12TOARGBROW_SSSE3
|
||||
#define HAS_NV12TORGB24ROW_SSSE3
|
||||
#define HAS_NV12TORGB565ROW_SSSE3
|
||||
@ -78,8 +74,6 @@ extern "C" {
|
||||
#define HAS_SETROW_X86
|
||||
#define HAS_SPLITUVROW_SSE2
|
||||
#define HAS_UYVYTOARGBROW_SSSE3
|
||||
#define HAS_UYVYTOUV422ROW_SSE2
|
||||
#define HAS_UYVYTOUVROW_SSE2
|
||||
#define HAS_UYVYTOYROW_SSE2
|
||||
#define HAS_YUY2TOARGBROW_SSSE3
|
||||
#define HAS_YUY2TOUV422ROW_SSE2
|
||||
@ -163,7 +157,6 @@ extern "C" {
|
||||
#define HAS_ARGBSHUFFLEROW_AVX2
|
||||
#define HAS_ARGBTORGB565DITHERROW_AVX2
|
||||
#define HAS_COPYROW_AVX
|
||||
#define HAS_H422TOARGBROW_AVX2
|
||||
#define HAS_HALFFLOATROW_AVX2
|
||||
#define HAS_I422TOARGB1555ROW_AVX2
|
||||
#define HAS_I422TOARGB4444ROW_AVX2
|
||||
@ -173,7 +166,7 @@ extern "C" {
|
||||
#define HAS_I422TORGBAROW_AVX2
|
||||
#define HAS_I444TOARGBROW_AVX2
|
||||
#define HAS_I444TORGB24ROW_AVX2
|
||||
#define HAS_J422TOARGBROW_AVX2
|
||||
#define HAS_J400TOARGBROW_AVX2
|
||||
#define HAS_MIRRORROW_AVX2
|
||||
#define HAS_MIRRORSPLITUVROW_AVX2
|
||||
#define HAS_NV12TOARGBROW_AVX2
|
||||
@ -183,8 +176,6 @@ extern "C" {
|
||||
#define HAS_NV21TORGB24ROW_AVX2
|
||||
#define HAS_SPLITUVROW_AVX2
|
||||
#define HAS_UYVYTOARGBROW_AVX2
|
||||
#define HAS_UYVYTOUV422ROW_AVX2
|
||||
#define HAS_UYVYTOUVROW_AVX2
|
||||
#define HAS_UYVYTOYROW_AVX2
|
||||
#define HAS_YUY2TOARGBROW_AVX2
|
||||
#define HAS_YUY2TOUV422ROW_AVX2
|
||||
@ -256,16 +247,11 @@ extern "C" {
|
||||
#define HAS_ARGBTOYROW_SSSE3
|
||||
#define HAS_ARGBTOYMATRIXROW_SSSE3
|
||||
#define HAS_BGRATOYROW_SSSE3
|
||||
#define HAS_ABGRTOYROW_SSSE3
|
||||
#define HAS_RGBATOYROW_SSSE3
|
||||
|
||||
// TODO: adjust row_win to use 8 bit negative coefficients.
|
||||
#define HAS_ABGRTOUVJROW_SSSE3
|
||||
#define HAS_ARGBTOUVJROW_SSSE3
|
||||
#define HAS_ABGRTOUVROW_SSSE3
|
||||
#define HAS_ARGBTOUVROW_SSSE3
|
||||
#define HAS_BGRATOUVROW_SSSE3
|
||||
#define HAS_RGBATOUVROW_SSSE3
|
||||
#define HAS_ARGBTOUVMATRIXROW_SSSE3
|
||||
#define HAS_ARGBTOUV444MATRIXROW_SSSE3
|
||||
|
||||
@ -372,8 +358,7 @@ extern "C" {
|
||||
#define HAS_ARGBTOYJROW_AVX2
|
||||
#define HAS_ABGRTOYJROW_AVX2
|
||||
#define HAS_RGBATOYJROW_AVX2
|
||||
#define HAS_RGBATOYROW_AVX2
|
||||
#define HAS_BGRATOYROW_AVX2
|
||||
#define HAS_J400TOARGBROW_AVX2
|
||||
#endif
|
||||
|
||||
// The following are available for AVX512 clang x86 platforms:
|
||||
@ -397,8 +382,6 @@ extern "C" {
|
||||
#if !defined(LIBYUV_DISABLE_X86) && defined(__x86_64__) && \
|
||||
(defined(CLANG_HAS_AVX512))
|
||||
#define HAS_I422TOARGBROW_AVX512BW
|
||||
#define HAS_I444TOARGBROW_AVX512BW
|
||||
#define HAS_I444TORGB24ROW_AVX512BW
|
||||
#define HAS_ARGBTOUV444ROW_AVX512BW
|
||||
#define HAS_ARGBTOUV444MATRIXROW_AVX512BW
|
||||
#define HAS_ARGBTOYROW_AVX512BW
|
||||
@ -407,6 +390,7 @@ extern "C" {
|
||||
#define HAS_ARGBTOUVROW_AVX512BW
|
||||
#define HAS_ARGBTOUVJROW_AVX512BW
|
||||
#define HAS_ARGBTOUVMATRIXROW_AVX512BW
|
||||
#define HAS_J400TOARGBROW_AVX512BW
|
||||
#endif
|
||||
|
||||
// The following are available on Neon platforms:
|
||||
@ -449,7 +433,6 @@ extern "C" {
|
||||
#define HAS_ARGBTOYMATRIXROW_NEON
|
||||
#endif
|
||||
#define HAS_ARGBTOYROW_NEON
|
||||
#define HAS_AYUVTOUVROW_NEON
|
||||
#define HAS_AYUVTOVUROW_NEON
|
||||
#define HAS_AYUVTOYROW_NEON
|
||||
#define HAS_BGRATOUVROW_NEON
|
||||
@ -523,7 +506,6 @@ extern "C" {
|
||||
#define HAS_SWAPUVROW_NEON
|
||||
#define HAS_UNPACKMT2T_NEON
|
||||
#define HAS_UYVYTOARGBROW_NEON
|
||||
#define HAS_UYVYTOUV422ROW_NEON
|
||||
#define HAS_UYVYTOUVROW_NEON
|
||||
#define HAS_UYVYTOYROW_NEON
|
||||
#define HAS_YUY2TOARGBROW_NEON
|
||||
@ -4386,15 +4368,15 @@ void AB64ToARGBRow_Any_NEON(const uint16_t* src_ptr,
|
||||
uint8_t* dst_ptr,
|
||||
int width);
|
||||
|
||||
void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width);
|
||||
void J400ToARGBRow_AVX512BW(const uint8_t* src_y, uint8_t* dst_argb, int width);
|
||||
void J400ToARGBRow_AVX2(const uint8_t* src_y, uint8_t* dst_argb, int width);
|
||||
void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width);
|
||||
void J400ToARGBRow_LSX(const uint8_t* src_y, uint8_t* dst_argb, int width);
|
||||
void J400ToARGBRow_RVV(const uint8_t* src_y, uint8_t* dst_argb, int width);
|
||||
void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width);
|
||||
void J400ToARGBRow_Any_SSE2(const uint8_t* src_ptr,
|
||||
uint8_t* dst_ptr,
|
||||
int width);
|
||||
void J400ToARGBRow_Any_AVX512BW(const uint8_t* src_ptr,
|
||||
uint8_t* dst_ptr,
|
||||
int width);
|
||||
void J400ToARGBRow_Any_AVX2(const uint8_t* src_ptr,
|
||||
uint8_t* dst_ptr,
|
||||
int width);
|
||||
|
||||
@ -11,6 +11,6 @@
|
||||
#ifndef INCLUDE_LIBYUV_VERSION_H_
|
||||
#define INCLUDE_LIBYUV_VERSION_H_
|
||||
|
||||
#define LIBYUV_VERSION 1941
|
||||
#define LIBYUV_VERSION 1942
|
||||
|
||||
#endif // INCLUDE_LIBYUV_VERSION_H_
|
||||
|
||||
@ -3280,14 +3280,7 @@ int J400ToARGB(const uint8_t* src_y,
|
||||
height = 1;
|
||||
src_stride_y = dst_stride_argb = 0;
|
||||
}
|
||||
#if defined(HAS_J400TOARGBROW_SSE2)
|
||||
if (TestCpuFlag(kCpuHasSSE2)) {
|
||||
J400ToARGBRow = J400ToARGBRow_Any_SSE2;
|
||||
if (IS_ALIGNED(width, 8)) {
|
||||
J400ToARGBRow = J400ToARGBRow_SSE2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(HAS_J400TOARGBROW_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
J400ToARGBRow = J400ToARGBRow_Any_AVX2;
|
||||
@ -3296,6 +3289,14 @@ int J400ToARGB(const uint8_t* src_y,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_J400TOARGBROW_AVX512BW)
|
||||
if (TestCpuFlag(kCpuHasAVX512BW)) {
|
||||
J400ToARGBRow = J400ToARGBRow_Any_AVX512BW;
|
||||
if (IS_ALIGNED(width, 32)) {
|
||||
J400ToARGBRow = J400ToARGBRow_AVX512BW;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_J400TOARGBROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
J400ToARGBRow = J400ToARGBRow_Any_NEON;
|
||||
|
||||
@ -983,8 +983,9 @@ ANY11(ABGRToAR30Row_Any_AVX2, ABGRToAR30Row_AVX2, 0, 4, 4, 7)
|
||||
#if defined(HAS_ARGBTOAR30ROW_AVX2)
|
||||
ANY11(ARGBToAR30Row_Any_AVX2, ARGBToAR30Row_AVX2, 0, 4, 4, 7)
|
||||
#endif
|
||||
#if defined(HAS_J400TOARGBROW_SSE2)
|
||||
ANY11(J400ToARGBRow_Any_SSE2, J400ToARGBRow_SSE2, 0, 1, 4, 7)
|
||||
|
||||
#if defined(HAS_J400TOARGBROW_AVX512BW)
|
||||
ANY11(J400ToARGBRow_Any_AVX512BW, J400ToARGBRow_AVX512BW, 0, 1, 4, 31)
|
||||
#endif
|
||||
#if defined(HAS_J400TOARGBROW_AVX2)
|
||||
ANY11(J400ToARGBRow_Any_AVX2, J400ToARGBRow_AVX2, 0, 1, 4, 15)
|
||||
|
||||
@ -117,34 +117,74 @@ static const lvec8 kShuffleNV21 = {
|
||||
};
|
||||
#endif // HAS_RGB24TOARGBROW_SSSE3
|
||||
|
||||
#ifdef HAS_J400TOARGBROW_SSE2
|
||||
void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width) {
|
||||
#if defined(HAS_J400TOARGBROW_AVX2) || defined(HAS_J400TOARGBROW_AVX512BW)
|
||||
alignas(64) static const uint8_t kShuffleMaskJ400ToARGB[64] = {
|
||||
0u, 0u, 0u, 128u, 1u, 1u, 1u, 128u, 2u, 2u, 2u, 128u, 3u, 3u, 3u, 128u,
|
||||
4u, 4u, 4u, 128u, 5u, 5u, 5u, 128u, 6u, 6u, 6u, 128u, 7u, 7u, 7u, 128u,
|
||||
8u, 8u, 8u, 128u, 9u, 9u, 9u, 128u, 10u, 10u, 10u, 128u, 11u, 11u, 11u, 128u,
|
||||
12u, 12u, 12u, 128u, 13u, 13u, 13u, 128u, 14u, 14u, 14u, 128u, 15u, 15u, 15u, 128u
|
||||
};
|
||||
#endif
|
||||
|
||||
#ifdef HAS_J400TOARGBROW_AVX2
|
||||
void J400ToARGBRow_AVX2(const uint8_t* src_y, uint8_t* dst_argb, int width) {
|
||||
asm volatile(
|
||||
"pcmpeqb %%xmm5,%%xmm5 \n"
|
||||
"pslld $0x18,%%xmm5 \n"
|
||||
"vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n"
|
||||
"vpslld $0x18,%%ymm7,%%ymm7 \n"
|
||||
"vmovdqa (%3),%%ymm5 \n"
|
||||
"vmovdqa 0x20(%3),%%ymm6 \n"
|
||||
|
||||
LABELALIGN
|
||||
"1: \n"
|
||||
"movq (%0),%%xmm0 \n"
|
||||
"lea 0x8(%0),%0 \n"
|
||||
"punpcklbw %%xmm0,%%xmm0 \n"
|
||||
"movdqa %%xmm0,%%xmm1 \n"
|
||||
"punpcklwd %%xmm0,%%xmm0 \n"
|
||||
"punpckhwd %%xmm1,%%xmm1 \n"
|
||||
"por %%xmm5,%%xmm0 \n"
|
||||
"por %%xmm5,%%xmm1 \n"
|
||||
"movdqu %%xmm0,(%1) \n"
|
||||
"movdqu %%xmm1,0x10(%1) \n"
|
||||
"lea 0x20(%1),%1 \n"
|
||||
"sub $0x8,%2 \n"
|
||||
"vbroadcasti128 (%0),%%ymm0 \n"
|
||||
"vpshufb %%ymm5,%%ymm0,%%ymm1 \n"
|
||||
"vpshufb %%ymm6,%%ymm0,%%ymm2 \n"
|
||||
"vpor %%ymm7,%%ymm1,%%ymm1 \n"
|
||||
"vpor %%ymm7,%%ymm2,%%ymm2 \n"
|
||||
"vmovdqu %%ymm1,(%1) \n"
|
||||
"vmovdqu %%ymm2,0x20(%1) \n"
|
||||
"lea 0x10(%0),%0 \n"
|
||||
"lea 0x40(%1),%1 \n"
|
||||
"sub $0x10,%2 \n"
|
||||
"jg 1b \n"
|
||||
"vzeroupper \n"
|
||||
: "+r"(src_y), // %0
|
||||
"+r"(dst_argb), // %1
|
||||
"+r"(width) // %2
|
||||
::"memory",
|
||||
"cc", "xmm0", "xmm1", "xmm5");
|
||||
: "r"(kShuffleMaskJ400ToARGB) // %3
|
||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7");
|
||||
}
|
||||
#endif // HAS_J400TOARGBROW_SSE2
|
||||
#endif // HAS_J400TOARGBROW_AVX2
|
||||
|
||||
#ifdef HAS_J400TOARGBROW_AVX512BW
|
||||
void J400ToARGBRow_AVX512BW(const uint8_t* src_y, uint8_t* dst_argb, int width) {
|
||||
asm volatile(
|
||||
"vpternlogd $0xff,%%zmm7,%%zmm7,%%zmm7 \n" // 0xffffffff
|
||||
"vpslld $0x18,%%zmm7,%%zmm7 \n" // 0xff000000
|
||||
"vmovdqa64 %3,%%zmm5 \n"
|
||||
|
||||
LABELALIGN
|
||||
"1: \n"
|
||||
"vbroadcasti32x4 (%0),%%zmm0 \n"
|
||||
"vbroadcasti32x4 0x10(%0),%%zmm1 \n"
|
||||
"vpshufb %%zmm5,%%zmm0,%%zmm0 \n"
|
||||
"vpshufb %%zmm5,%%zmm1,%%zmm1 \n"
|
||||
"vpord %%zmm7,%%zmm0,%%zmm0 \n"
|
||||
"vpord %%zmm7,%%zmm1,%%zmm1 \n"
|
||||
"vmovdqu64 %%zmm0,(%1) \n"
|
||||
"vmovdqu64 %%zmm1,0x40(%1) \n"
|
||||
"lea 0x20(%0),%0 \n"
|
||||
"lea 0x80(%1),%1 \n"
|
||||
"sub $0x20,%2 \n"
|
||||
"jg 1b \n"
|
||||
"vzeroupper \n"
|
||||
: "+r"(src_y), // %0
|
||||
"+r"(dst_argb), // %1
|
||||
"+r"(width) // %2
|
||||
: "m"(kShuffleMaskJ400ToARGB) // %3
|
||||
: "memory", "cc", "xmm0", "xmm1", "xmm5", "xmm7");
|
||||
}
|
||||
#endif // HAS_J400TOARGBROW_AVX512BW
|
||||
|
||||
#ifdef HAS_RGB24TOARGBROW_SSSE3
|
||||
void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
|
||||
|
||||
@ -669,6 +669,41 @@ void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef HAS_J400TOARGBROW_AVX2
|
||||
alignas(32) static const uint8_t kShuffleMaskJ400ToARGB_0[32] = {
|
||||
0u, 0u, 0u, 128u, 1u, 1u, 1u, 128u, 2u, 2u, 2u, 128u, 3u, 3u, 3u, 128u,
|
||||
4u, 4u, 4u, 128u, 5u, 5u, 5u, 128u, 6u, 6u, 6u, 128u, 7u, 7u, 7u, 128u
|
||||
};
|
||||
alignas(32) static const uint8_t kShuffleMaskJ400ToARGB_1[32] = {
|
||||
8u, 8u, 8u, 128u, 9u, 9u, 9u, 128u, 10u, 10u, 10u, 128u, 11u, 11u, 11u, 128u,
|
||||
12u, 12u, 12u, 128u, 13u, 13u, 13u, 128u, 14u, 14u, 14u, 128u, 15u, 15u, 15u, 128u
|
||||
};
|
||||
|
||||
LIBYUV_TARGET_AVX2
|
||||
void J400ToARGBRow_AVX2(const uint8_t* src_y, uint8_t* dst_argb, int width) {
|
||||
__m256i ymm_mask0 = _mm256_load_si256((const __m256i*)kShuffleMaskJ400ToARGB_0);
|
||||
__m256i ymm_mask1 = _mm256_load_si256((const __m256i*)kShuffleMaskJ400ToARGB_1);
|
||||
__m256i ymm_alpha = _mm256_set1_epi32((int)0xff000000u);
|
||||
|
||||
while (width > 0) {
|
||||
__m256i ymm0 = _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i*)src_y));
|
||||
|
||||
__m256i ymm1 = _mm256_shuffle_epi8(ymm0, ymm_mask0);
|
||||
__m256i ymm2 = _mm256_shuffle_epi8(ymm0, ymm_mask1);
|
||||
|
||||
ymm1 = _mm256_or_si256(ymm1, ymm_alpha);
|
||||
ymm2 = _mm256_or_si256(ymm2, ymm_alpha);
|
||||
|
||||
_mm256_storeu_si256((__m256i*)dst_argb, ymm1);
|
||||
_mm256_storeu_si256((__m256i*)(dst_argb + 32), ymm2);
|
||||
|
||||
src_y += 16;
|
||||
dst_argb += 64;
|
||||
width -= 16;
|
||||
}
|
||||
}
|
||||
#endif // HAS_J400TOARGBROW_AVX2
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user