J400ToARGB switch from SSE2 to AVX2

- port for row_win
- remove unused HAS_ macros

Was C/SSE2
MSVC  J400ToARGB_Opt (1967 ms)
Clang J400ToARGB_Opt (568 ms)

Now AVX2
MSVC  J400ToARGB_Opt (411 ms)
Clang J400ToARGB_Opt (418 ms)

Test: libyuv_unittest --gtest_filter=*J400ToARGB*
Bug: libyuv:508639302

Change-Id: Ifdfb026832b708b61f55477250cc5ee52449f421
TAG=agy
CONV=186608fc-966a-4ea7-bf57-9fe07cc1383c
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/7877368
Commit-Queue: Frank Barchard <fbarchard@google.com>
Reviewed-by: Justin Green <greenjustin@google.com>
This commit is contained in:
Frank Barchard 2026-05-28 21:06:47 -07:00 committed by libyuv-scoped@luci-project-accounts.iam.gserviceaccount.com
parent 904f562d86
commit e449eb2172
7 changed files with 115 additions and 56 deletions

View File

@ -1,6 +1,6 @@
Name: libyuv
URL: https://chromium.googlesource.com/libyuv/libyuv/
Version: 1941
Version: 1942
Revision: DEPS
License: BSD-3-Clause
License File: LICENSE

View File

@ -49,7 +49,6 @@ extern "C" {
#define HAS_ARGBTORGB565ROW_SSE2
#define HAS_COPYROW_ERMS
#define HAS_COPYROW_SSE2
#define HAS_H422TOARGBROW_SSSE3
#define HAS_I422TOARGB1555ROW_SSSE3
#define HAS_I422TOARGB4444ROW_SSSE3
#define HAS_I422TOARGBROW_SSSE3
@ -60,11 +59,8 @@ extern "C" {
#define HAS_I422TOYUY2ROW_SSE2
#define HAS_I444TOARGBROW_SSSE3
#define HAS_I444TORGB24ROW_SSSE3
#define HAS_J400TOARGBROW_SSE2
#define HAS_J422TOARGBROW_SSSE3
#define HAS_MERGEUVROW_SSE2
#define HAS_MIRRORROW_SSSE3
#define HAS_MIRRORSPLITUVROW_SSSE3
#define HAS_NV12TOARGBROW_SSSE3
#define HAS_NV12TORGB24ROW_SSSE3
#define HAS_NV12TORGB565ROW_SSSE3
@ -78,8 +74,6 @@ extern "C" {
#define HAS_SETROW_X86
#define HAS_SPLITUVROW_SSE2
#define HAS_UYVYTOARGBROW_SSSE3
#define HAS_UYVYTOUV422ROW_SSE2
#define HAS_UYVYTOUVROW_SSE2
#define HAS_UYVYTOYROW_SSE2
#define HAS_YUY2TOARGBROW_SSSE3
#define HAS_YUY2TOUV422ROW_SSE2
@ -163,7 +157,6 @@ extern "C" {
#define HAS_ARGBSHUFFLEROW_AVX2
#define HAS_ARGBTORGB565DITHERROW_AVX2
#define HAS_COPYROW_AVX
#define HAS_H422TOARGBROW_AVX2
#define HAS_HALFFLOATROW_AVX2
#define HAS_I422TOARGB1555ROW_AVX2
#define HAS_I422TOARGB4444ROW_AVX2
@ -173,7 +166,7 @@ extern "C" {
#define HAS_I422TORGBAROW_AVX2
#define HAS_I444TOARGBROW_AVX2
#define HAS_I444TORGB24ROW_AVX2
#define HAS_J422TOARGBROW_AVX2
#define HAS_J400TOARGBROW_AVX2
#define HAS_MIRRORROW_AVX2
#define HAS_MIRRORSPLITUVROW_AVX2
#define HAS_NV12TOARGBROW_AVX2
@ -183,8 +176,6 @@ extern "C" {
#define HAS_NV21TORGB24ROW_AVX2
#define HAS_SPLITUVROW_AVX2
#define HAS_UYVYTOARGBROW_AVX2
#define HAS_UYVYTOUV422ROW_AVX2
#define HAS_UYVYTOUVROW_AVX2
#define HAS_UYVYTOYROW_AVX2
#define HAS_YUY2TOARGBROW_AVX2
#define HAS_YUY2TOUV422ROW_AVX2
@ -256,16 +247,11 @@ extern "C" {
#define HAS_ARGBTOYROW_SSSE3
#define HAS_ARGBTOYMATRIXROW_SSSE3
#define HAS_BGRATOYROW_SSSE3
#define HAS_ABGRTOYROW_SSSE3
#define HAS_RGBATOYROW_SSSE3
// TODO: adjust row_win to use 8 bit negative coefficients.
#define HAS_ABGRTOUVJROW_SSSE3
#define HAS_ARGBTOUVJROW_SSSE3
#define HAS_ABGRTOUVROW_SSSE3
#define HAS_ARGBTOUVROW_SSSE3
#define HAS_BGRATOUVROW_SSSE3
#define HAS_RGBATOUVROW_SSSE3
#define HAS_ARGBTOUVMATRIXROW_SSSE3
#define HAS_ARGBTOUV444MATRIXROW_SSSE3
@ -372,8 +358,7 @@ extern "C" {
#define HAS_ARGBTOYJROW_AVX2
#define HAS_ABGRTOYJROW_AVX2
#define HAS_RGBATOYJROW_AVX2
#define HAS_RGBATOYROW_AVX2
#define HAS_BGRATOYROW_AVX2
#define HAS_J400TOARGBROW_AVX2
#endif
// The following are available for AVX512 clang x86 platforms:
@ -397,8 +382,6 @@ extern "C" {
#if !defined(LIBYUV_DISABLE_X86) && defined(__x86_64__) && \
(defined(CLANG_HAS_AVX512))
#define HAS_I422TOARGBROW_AVX512BW
#define HAS_I444TOARGBROW_AVX512BW
#define HAS_I444TORGB24ROW_AVX512BW
#define HAS_ARGBTOUV444ROW_AVX512BW
#define HAS_ARGBTOUV444MATRIXROW_AVX512BW
#define HAS_ARGBTOYROW_AVX512BW
@ -407,6 +390,7 @@ extern "C" {
#define HAS_ARGBTOUVROW_AVX512BW
#define HAS_ARGBTOUVJROW_AVX512BW
#define HAS_ARGBTOUVMATRIXROW_AVX512BW
#define HAS_J400TOARGBROW_AVX512BW
#endif
// The following are available on Neon platforms:
@ -449,7 +433,6 @@ extern "C" {
#define HAS_ARGBTOYMATRIXROW_NEON
#endif
#define HAS_ARGBTOYROW_NEON
#define HAS_AYUVTOUVROW_NEON
#define HAS_AYUVTOVUROW_NEON
#define HAS_AYUVTOYROW_NEON
#define HAS_BGRATOUVROW_NEON
@ -523,7 +506,6 @@ extern "C" {
#define HAS_SWAPUVROW_NEON
#define HAS_UNPACKMT2T_NEON
#define HAS_UYVYTOARGBROW_NEON
#define HAS_UYVYTOUV422ROW_NEON
#define HAS_UYVYTOUVROW_NEON
#define HAS_UYVYTOYROW_NEON
#define HAS_YUY2TOARGBROW_NEON
@ -4386,15 +4368,15 @@ void AB64ToARGBRow_Any_NEON(const uint16_t* src_ptr,
uint8_t* dst_ptr,
int width);
void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width);
void J400ToARGBRow_AVX512BW(const uint8_t* src_y, uint8_t* dst_argb, int width);
void J400ToARGBRow_AVX2(const uint8_t* src_y, uint8_t* dst_argb, int width);
void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width);
void J400ToARGBRow_LSX(const uint8_t* src_y, uint8_t* dst_argb, int width);
void J400ToARGBRow_RVV(const uint8_t* src_y, uint8_t* dst_argb, int width);
void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width);
void J400ToARGBRow_Any_SSE2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
void J400ToARGBRow_Any_AVX512BW(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
void J400ToARGBRow_Any_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);

View File

@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1941
#define LIBYUV_VERSION 1942
#endif // INCLUDE_LIBYUV_VERSION_H_

View File

@ -3280,14 +3280,7 @@ int J400ToARGB(const uint8_t* src_y,
height = 1;
src_stride_y = dst_stride_argb = 0;
}
#if defined(HAS_J400TOARGBROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
J400ToARGBRow = J400ToARGBRow_Any_SSE2;
if (IS_ALIGNED(width, 8)) {
J400ToARGBRow = J400ToARGBRow_SSE2;
}
}
#endif
#if defined(HAS_J400TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
J400ToARGBRow = J400ToARGBRow_Any_AVX2;
@ -3296,6 +3289,14 @@ int J400ToARGB(const uint8_t* src_y,
}
}
#endif
#if defined(HAS_J400TOARGBROW_AVX512BW)
if (TestCpuFlag(kCpuHasAVX512BW)) {
J400ToARGBRow = J400ToARGBRow_Any_AVX512BW;
if (IS_ALIGNED(width, 32)) {
J400ToARGBRow = J400ToARGBRow_AVX512BW;
}
}
#endif
#if defined(HAS_J400TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
J400ToARGBRow = J400ToARGBRow_Any_NEON;

View File

@ -983,8 +983,9 @@ ANY11(ABGRToAR30Row_Any_AVX2, ABGRToAR30Row_AVX2, 0, 4, 4, 7)
#if defined(HAS_ARGBTOAR30ROW_AVX2)
ANY11(ARGBToAR30Row_Any_AVX2, ARGBToAR30Row_AVX2, 0, 4, 4, 7)
#endif
#if defined(HAS_J400TOARGBROW_SSE2)
ANY11(J400ToARGBRow_Any_SSE2, J400ToARGBRow_SSE2, 0, 1, 4, 7)
#if defined(HAS_J400TOARGBROW_AVX512BW)
ANY11(J400ToARGBRow_Any_AVX512BW, J400ToARGBRow_AVX512BW, 0, 1, 4, 31)
#endif
#if defined(HAS_J400TOARGBROW_AVX2)
ANY11(J400ToARGBRow_Any_AVX2, J400ToARGBRow_AVX2, 0, 1, 4, 15)

View File

@ -117,34 +117,74 @@ static const lvec8 kShuffleNV21 = {
};
#endif // HAS_RGB24TOARGBROW_SSSE3
#ifdef HAS_J400TOARGBROW_SSE2
void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width) {
#if defined(HAS_J400TOARGBROW_AVX2) || defined(HAS_J400TOARGBROW_AVX512BW)
alignas(64) static const uint8_t kShuffleMaskJ400ToARGB[64] = {
0u, 0u, 0u, 128u, 1u, 1u, 1u, 128u, 2u, 2u, 2u, 128u, 3u, 3u, 3u, 128u,
4u, 4u, 4u, 128u, 5u, 5u, 5u, 128u, 6u, 6u, 6u, 128u, 7u, 7u, 7u, 128u,
8u, 8u, 8u, 128u, 9u, 9u, 9u, 128u, 10u, 10u, 10u, 128u, 11u, 11u, 11u, 128u,
12u, 12u, 12u, 128u, 13u, 13u, 13u, 128u, 14u, 14u, 14u, 128u, 15u, 15u, 15u, 128u
};
#endif
#ifdef HAS_J400TOARGBROW_AVX2
void J400ToARGBRow_AVX2(const uint8_t* src_y, uint8_t* dst_argb, int width) {
asm volatile(
"pcmpeqb %%xmm5,%%xmm5 \n"
"pslld $0x18,%%xmm5 \n"
"vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n"
"vpslld $0x18,%%ymm7,%%ymm7 \n"
"vmovdqa (%3),%%ymm5 \n"
"vmovdqa 0x20(%3),%%ymm6 \n"
LABELALIGN
"1: \n"
"movq (%0),%%xmm0 \n"
"lea 0x8(%0),%0 \n"
"punpcklbw %%xmm0,%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"punpcklwd %%xmm0,%%xmm0 \n"
"punpckhwd %%xmm1,%%xmm1 \n"
"por %%xmm5,%%xmm0 \n"
"por %%xmm5,%%xmm1 \n"
"movdqu %%xmm0,(%1) \n"
"movdqu %%xmm1,0x10(%1) \n"
"lea 0x20(%1),%1 \n"
"sub $0x8,%2 \n"
"vbroadcasti128 (%0),%%ymm0 \n"
"vpshufb %%ymm5,%%ymm0,%%ymm1 \n"
"vpshufb %%ymm6,%%ymm0,%%ymm2 \n"
"vpor %%ymm7,%%ymm1,%%ymm1 \n"
"vpor %%ymm7,%%ymm2,%%ymm2 \n"
"vmovdqu %%ymm1,(%1) \n"
"vmovdqu %%ymm2,0x20(%1) \n"
"lea 0x10(%0),%0 \n"
"lea 0x40(%1),%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_y), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
::"memory",
"cc", "xmm0", "xmm1", "xmm5");
: "r"(kShuffleMaskJ400ToARGB) // %3
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7");
}
#endif // HAS_J400TOARGBROW_SSE2
#endif // HAS_J400TOARGBROW_AVX2
#ifdef HAS_J400TOARGBROW_AVX512BW
void J400ToARGBRow_AVX512BW(const uint8_t* src_y, uint8_t* dst_argb, int width) {
asm volatile(
"vpternlogd $0xff,%%zmm7,%%zmm7,%%zmm7 \n" // 0xffffffff
"vpslld $0x18,%%zmm7,%%zmm7 \n" // 0xff000000
"vmovdqa64 %3,%%zmm5 \n"
LABELALIGN
"1: \n"
"vbroadcasti32x4 (%0),%%zmm0 \n"
"vbroadcasti32x4 0x10(%0),%%zmm1 \n"
"vpshufb %%zmm5,%%zmm0,%%zmm0 \n"
"vpshufb %%zmm5,%%zmm1,%%zmm1 \n"
"vpord %%zmm7,%%zmm0,%%zmm0 \n"
"vpord %%zmm7,%%zmm1,%%zmm1 \n"
"vmovdqu64 %%zmm0,(%1) \n"
"vmovdqu64 %%zmm1,0x40(%1) \n"
"lea 0x20(%0),%0 \n"
"lea 0x80(%1),%1 \n"
"sub $0x20,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_y), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
: "m"(kShuffleMaskJ400ToARGB) // %3
: "memory", "cc", "xmm0", "xmm1", "xmm5", "xmm7");
}
#endif // HAS_J400TOARGBROW_AVX512BW
#ifdef HAS_RGB24TOARGBROW_SSSE3
void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,

View File

@ -669,6 +669,41 @@ void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
}
#endif
#ifdef HAS_J400TOARGBROW_AVX2
alignas(32) static const uint8_t kShuffleMaskJ400ToARGB_0[32] = {
0u, 0u, 0u, 128u, 1u, 1u, 1u, 128u, 2u, 2u, 2u, 128u, 3u, 3u, 3u, 128u,
4u, 4u, 4u, 128u, 5u, 5u, 5u, 128u, 6u, 6u, 6u, 128u, 7u, 7u, 7u, 128u
};
alignas(32) static const uint8_t kShuffleMaskJ400ToARGB_1[32] = {
8u, 8u, 8u, 128u, 9u, 9u, 9u, 128u, 10u, 10u, 10u, 128u, 11u, 11u, 11u, 128u,
12u, 12u, 12u, 128u, 13u, 13u, 13u, 128u, 14u, 14u, 14u, 128u, 15u, 15u, 15u, 128u
};
LIBYUV_TARGET_AVX2
void J400ToARGBRow_AVX2(const uint8_t* src_y, uint8_t* dst_argb, int width) {
__m256i ymm_mask0 = _mm256_load_si256((const __m256i*)kShuffleMaskJ400ToARGB_0);
__m256i ymm_mask1 = _mm256_load_si256((const __m256i*)kShuffleMaskJ400ToARGB_1);
__m256i ymm_alpha = _mm256_set1_epi32((int)0xff000000u);
while (width > 0) {
__m256i ymm0 = _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i*)src_y));
__m256i ymm1 = _mm256_shuffle_epi8(ymm0, ymm_mask0);
__m256i ymm2 = _mm256_shuffle_epi8(ymm0, ymm_mask1);
ymm1 = _mm256_or_si256(ymm1, ymm_alpha);
ymm2 = _mm256_or_si256(ymm2, ymm_alpha);
_mm256_storeu_si256((__m256i*)dst_argb, ymm1);
_mm256_storeu_si256((__m256i*)(dst_argb + 32), ymm2);
src_y += 16;
dst_argb += 64;
width -= 16;
}
}
#endif // HAS_J400TOARGBROW_AVX2
#endif
#ifdef __cplusplus