From e449eb2172b33db01959032940e51b903dc0d661 Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Thu, 28 May 2026 21:06:47 -0700 Subject: [PATCH] J400ToARGB switch from SSE2 to AVX2 - port for row_win - remove unused HAS_ macros Was C/SSE2 MSVC J400ToARGB_Opt (1967 ms) Clang J400ToARGB_Opt (568 ms) Now AVX2 MSVC J400ToARGB_Opt (411 ms) Clang J400ToARGB_Opt (418 ms) Test: libyuv_unittest --gtest_filter=*J400ToARGB* Bug: libyuv:508639302 Change-Id: Ifdfb026832b708b61f55477250cc5ee52449f421 TAG=agy CONV=186608fc-966a-4ea7-bf57-9fe07cc1383c Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/7877368 Commit-Queue: Frank Barchard Reviewed-by: Justin Green --- README.chromium | 2 +- include/libyuv/row.h | 32 ++++------------- include/libyuv/version.h | 2 +- source/convert_argb.cc | 17 ++++----- source/row_any.cc | 5 +-- source/row_gcc.cc | 78 ++++++++++++++++++++++++++++++---------- source/row_win.cc | 35 ++++++++++++++++++ 7 files changed, 115 insertions(+), 56 deletions(-) diff --git a/README.chromium b/README.chromium index c4a82bdae..c1f416458 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: https://chromium.googlesource.com/libyuv/libyuv/ -Version: 1941 +Version: 1942 Revision: DEPS License: BSD-3-Clause License File: LICENSE diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 529b3d610..ede80b13a 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -49,7 +49,6 @@ extern "C" { #define HAS_ARGBTORGB565ROW_SSE2 #define HAS_COPYROW_ERMS #define HAS_COPYROW_SSE2 -#define HAS_H422TOARGBROW_SSSE3 #define HAS_I422TOARGB1555ROW_SSSE3 #define HAS_I422TOARGB4444ROW_SSSE3 #define HAS_I422TOARGBROW_SSSE3 @@ -60,11 +59,8 @@ extern "C" { #define HAS_I422TOYUY2ROW_SSE2 #define HAS_I444TOARGBROW_SSSE3 #define HAS_I444TORGB24ROW_SSSE3 -#define HAS_J400TOARGBROW_SSE2 -#define HAS_J422TOARGBROW_SSSE3 #define HAS_MERGEUVROW_SSE2 #define HAS_MIRRORROW_SSSE3 -#define HAS_MIRRORSPLITUVROW_SSSE3 #define HAS_NV12TOARGBROW_SSSE3 #define HAS_NV12TORGB24ROW_SSSE3 #define HAS_NV12TORGB565ROW_SSSE3 @@ -78,8 +74,6 @@ extern "C" { #define HAS_SETROW_X86 #define HAS_SPLITUVROW_SSE2 #define HAS_UYVYTOARGBROW_SSSE3 -#define HAS_UYVYTOUV422ROW_SSE2 -#define HAS_UYVYTOUVROW_SSE2 #define HAS_UYVYTOYROW_SSE2 #define HAS_YUY2TOARGBROW_SSSE3 #define HAS_YUY2TOUV422ROW_SSE2 @@ -163,7 +157,6 @@ extern "C" { #define HAS_ARGBSHUFFLEROW_AVX2 #define HAS_ARGBTORGB565DITHERROW_AVX2 #define HAS_COPYROW_AVX -#define HAS_H422TOARGBROW_AVX2 #define HAS_HALFFLOATROW_AVX2 #define HAS_I422TOARGB1555ROW_AVX2 #define HAS_I422TOARGB4444ROW_AVX2 @@ -173,7 +166,7 @@ extern "C" { #define HAS_I422TORGBAROW_AVX2 #define HAS_I444TOARGBROW_AVX2 #define HAS_I444TORGB24ROW_AVX2 -#define HAS_J422TOARGBROW_AVX2 +#define HAS_J400TOARGBROW_AVX2 #define HAS_MIRRORROW_AVX2 #define HAS_MIRRORSPLITUVROW_AVX2 #define HAS_NV12TOARGBROW_AVX2 @@ -183,8 +176,6 @@ extern "C" { #define HAS_NV21TORGB24ROW_AVX2 #define HAS_SPLITUVROW_AVX2 #define HAS_UYVYTOARGBROW_AVX2 -#define HAS_UYVYTOUV422ROW_AVX2 -#define HAS_UYVYTOUVROW_AVX2 #define HAS_UYVYTOYROW_AVX2 #define HAS_YUY2TOARGBROW_AVX2 #define HAS_YUY2TOUV422ROW_AVX2 @@ -256,16 +247,11 @@ extern "C" { #define HAS_ARGBTOYROW_SSSE3 #define HAS_ARGBTOYMATRIXROW_SSSE3 #define HAS_BGRATOYROW_SSSE3 -#define HAS_ABGRTOYROW_SSSE3 -#define HAS_RGBATOYROW_SSSE3 // TODO: adjust row_win to use 8 bit negative coefficients. #define HAS_ABGRTOUVJROW_SSSE3 #define HAS_ARGBTOUVJROW_SSSE3 -#define HAS_ABGRTOUVROW_SSSE3 #define HAS_ARGBTOUVROW_SSSE3 -#define HAS_BGRATOUVROW_SSSE3 -#define HAS_RGBATOUVROW_SSSE3 #define HAS_ARGBTOUVMATRIXROW_SSSE3 #define HAS_ARGBTOUV444MATRIXROW_SSSE3 @@ -372,8 +358,7 @@ extern "C" { #define HAS_ARGBTOYJROW_AVX2 #define HAS_ABGRTOYJROW_AVX2 #define HAS_RGBATOYJROW_AVX2 -#define HAS_RGBATOYROW_AVX2 -#define HAS_BGRATOYROW_AVX2 +#define HAS_J400TOARGBROW_AVX2 #endif // The following are available for AVX512 clang x86 platforms: @@ -397,8 +382,6 @@ extern "C" { #if !defined(LIBYUV_DISABLE_X86) && defined(__x86_64__) && \ (defined(CLANG_HAS_AVX512)) #define HAS_I422TOARGBROW_AVX512BW -#define HAS_I444TOARGBROW_AVX512BW -#define HAS_I444TORGB24ROW_AVX512BW #define HAS_ARGBTOUV444ROW_AVX512BW #define HAS_ARGBTOUV444MATRIXROW_AVX512BW #define HAS_ARGBTOYROW_AVX512BW @@ -407,6 +390,7 @@ extern "C" { #define HAS_ARGBTOUVROW_AVX512BW #define HAS_ARGBTOUVJROW_AVX512BW #define HAS_ARGBTOUVMATRIXROW_AVX512BW +#define HAS_J400TOARGBROW_AVX512BW #endif // The following are available on Neon platforms: @@ -449,7 +433,6 @@ extern "C" { #define HAS_ARGBTOYMATRIXROW_NEON #endif #define HAS_ARGBTOYROW_NEON -#define HAS_AYUVTOUVROW_NEON #define HAS_AYUVTOVUROW_NEON #define HAS_AYUVTOYROW_NEON #define HAS_BGRATOUVROW_NEON @@ -523,7 +506,6 @@ extern "C" { #define HAS_SWAPUVROW_NEON #define HAS_UNPACKMT2T_NEON #define HAS_UYVYTOARGBROW_NEON -#define HAS_UYVYTOUV422ROW_NEON #define HAS_UYVYTOUVROW_NEON #define HAS_UYVYTOYROW_NEON #define HAS_YUY2TOARGBROW_NEON @@ -4386,15 +4368,15 @@ void AB64ToARGBRow_Any_NEON(const uint16_t* src_ptr, uint8_t* dst_ptr, int width); -void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width); +void J400ToARGBRow_AVX512BW(const uint8_t* src_y, uint8_t* dst_argb, int width); void J400ToARGBRow_AVX2(const uint8_t* src_y, uint8_t* dst_argb, int width); void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width); void J400ToARGBRow_LSX(const uint8_t* src_y, uint8_t* dst_argb, int width); void J400ToARGBRow_RVV(const uint8_t* src_y, uint8_t* dst_argb, int width); void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width); -void J400ToARGBRow_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); +void J400ToARGBRow_Any_AVX512BW(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); void J400ToARGBRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 5155b1913..ab2420cd5 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1941 +#define LIBYUV_VERSION 1942 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/convert_argb.cc b/source/convert_argb.cc index 7672a6692..82f10966a 100644 --- a/source/convert_argb.cc +++ b/source/convert_argb.cc @@ -3280,14 +3280,7 @@ int J400ToARGB(const uint8_t* src_y, height = 1; src_stride_y = dst_stride_argb = 0; } -#if defined(HAS_J400TOARGBROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - J400ToARGBRow = J400ToARGBRow_Any_SSE2; - if (IS_ALIGNED(width, 8)) { - J400ToARGBRow = J400ToARGBRow_SSE2; - } - } -#endif + #if defined(HAS_J400TOARGBROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { J400ToARGBRow = J400ToARGBRow_Any_AVX2; @@ -3296,6 +3289,14 @@ int J400ToARGB(const uint8_t* src_y, } } #endif +#if defined(HAS_J400TOARGBROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + J400ToARGBRow = J400ToARGBRow_Any_AVX512BW; + if (IS_ALIGNED(width, 32)) { + J400ToARGBRow = J400ToARGBRow_AVX512BW; + } + } +#endif #if defined(HAS_J400TOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { J400ToARGBRow = J400ToARGBRow_Any_NEON; diff --git a/source/row_any.cc b/source/row_any.cc index 20371c173..70b83e4e5 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -983,8 +983,9 @@ ANY11(ABGRToAR30Row_Any_AVX2, ABGRToAR30Row_AVX2, 0, 4, 4, 7) #if defined(HAS_ARGBTOAR30ROW_AVX2) ANY11(ARGBToAR30Row_Any_AVX2, ARGBToAR30Row_AVX2, 0, 4, 4, 7) #endif -#if defined(HAS_J400TOARGBROW_SSE2) -ANY11(J400ToARGBRow_Any_SSE2, J400ToARGBRow_SSE2, 0, 1, 4, 7) + +#if defined(HAS_J400TOARGBROW_AVX512BW) +ANY11(J400ToARGBRow_Any_AVX512BW, J400ToARGBRow_AVX512BW, 0, 1, 4, 31) #endif #if defined(HAS_J400TOARGBROW_AVX2) ANY11(J400ToARGBRow_Any_AVX2, J400ToARGBRow_AVX2, 0, 1, 4, 15) diff --git a/source/row_gcc.cc b/source/row_gcc.cc index 6ac03a52f..7d7f1df39 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -117,34 +117,74 @@ static const lvec8 kShuffleNV21 = { }; #endif // HAS_RGB24TOARGBROW_SSSE3 -#ifdef HAS_J400TOARGBROW_SSE2 -void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width) { +#if defined(HAS_J400TOARGBROW_AVX2) || defined(HAS_J400TOARGBROW_AVX512BW) +alignas(64) static const uint8_t kShuffleMaskJ400ToARGB[64] = { + 0u, 0u, 0u, 128u, 1u, 1u, 1u, 128u, 2u, 2u, 2u, 128u, 3u, 3u, 3u, 128u, + 4u, 4u, 4u, 128u, 5u, 5u, 5u, 128u, 6u, 6u, 6u, 128u, 7u, 7u, 7u, 128u, + 8u, 8u, 8u, 128u, 9u, 9u, 9u, 128u, 10u, 10u, 10u, 128u, 11u, 11u, 11u, 128u, + 12u, 12u, 12u, 128u, 13u, 13u, 13u, 128u, 14u, 14u, 14u, 128u, 15u, 15u, 15u, 128u +}; +#endif + +#ifdef HAS_J400TOARGBROW_AVX2 +void J400ToARGBRow_AVX2(const uint8_t* src_y, uint8_t* dst_argb, int width) { asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" - "pslld $0x18,%%xmm5 \n" + "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" + "vpslld $0x18,%%ymm7,%%ymm7 \n" + "vmovdqa (%3),%%ymm5 \n" + "vmovdqa 0x20(%3),%%ymm6 \n" LABELALIGN "1: \n" - "movq (%0),%%xmm0 \n" - "lea 0x8(%0),%0 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm0,%%xmm0 \n" - "punpckhwd %%xmm1,%%xmm1 \n" - "por %%xmm5,%%xmm0 \n" - "por %%xmm5,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x8,%2 \n" + "vbroadcasti128 (%0),%%ymm0 \n" + "vpshufb %%ymm5,%%ymm0,%%ymm1 \n" + "vpshufb %%ymm6,%%ymm0,%%ymm2 \n" + "vpor %%ymm7,%%ymm1,%%ymm1 \n" + "vpor %%ymm7,%%ymm2,%%ymm2 \n" + "vmovdqu %%ymm1,(%1) \n" + "vmovdqu %%ymm2,0x20(%1) \n" + "lea 0x10(%0),%0 \n" + "lea 0x40(%1),%1 \n" + "sub $0x10,%2 \n" "jg 1b \n" + "vzeroupper \n" : "+r"(src_y), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 - ::"memory", - "cc", "xmm0", "xmm1", "xmm5"); + : "r"(kShuffleMaskJ400ToARGB) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7"); } -#endif // HAS_J400TOARGBROW_SSE2 +#endif // HAS_J400TOARGBROW_AVX2 + +#ifdef HAS_J400TOARGBROW_AVX512BW +void J400ToARGBRow_AVX512BW(const uint8_t* src_y, uint8_t* dst_argb, int width) { + asm volatile( + "vpternlogd $0xff,%%zmm7,%%zmm7,%%zmm7 \n" // 0xffffffff + "vpslld $0x18,%%zmm7,%%zmm7 \n" // 0xff000000 + "vmovdqa64 %3,%%zmm5 \n" + + LABELALIGN + "1: \n" + "vbroadcasti32x4 (%0),%%zmm0 \n" + "vbroadcasti32x4 0x10(%0),%%zmm1 \n" + "vpshufb %%zmm5,%%zmm0,%%zmm0 \n" + "vpshufb %%zmm5,%%zmm1,%%zmm1 \n" + "vpord %%zmm7,%%zmm0,%%zmm0 \n" + "vpord %%zmm7,%%zmm1,%%zmm1 \n" + "vmovdqu64 %%zmm0,(%1) \n" + "vmovdqu64 %%zmm1,0x40(%1) \n" + "lea 0x20(%0),%0 \n" + "lea 0x80(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_y), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kShuffleMaskJ400ToARGB) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm5", "xmm7"); +} +#endif // HAS_J400TOARGBROW_AVX512BW #ifdef HAS_RGB24TOARGBROW_SSSE3 void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24, diff --git a/source/row_win.cc b/source/row_win.cc index cc0e67ae6..847d3a04d 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -669,6 +669,41 @@ void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { } #endif +#ifdef HAS_J400TOARGBROW_AVX2 +alignas(32) static const uint8_t kShuffleMaskJ400ToARGB_0[32] = { + 0u, 0u, 0u, 128u, 1u, 1u, 1u, 128u, 2u, 2u, 2u, 128u, 3u, 3u, 3u, 128u, + 4u, 4u, 4u, 128u, 5u, 5u, 5u, 128u, 6u, 6u, 6u, 128u, 7u, 7u, 7u, 128u +}; +alignas(32) static const uint8_t kShuffleMaskJ400ToARGB_1[32] = { + 8u, 8u, 8u, 128u, 9u, 9u, 9u, 128u, 10u, 10u, 10u, 128u, 11u, 11u, 11u, 128u, + 12u, 12u, 12u, 128u, 13u, 13u, 13u, 128u, 14u, 14u, 14u, 128u, 15u, 15u, 15u, 128u +}; + +LIBYUV_TARGET_AVX2 +void J400ToARGBRow_AVX2(const uint8_t* src_y, uint8_t* dst_argb, int width) { + __m256i ymm_mask0 = _mm256_load_si256((const __m256i*)kShuffleMaskJ400ToARGB_0); + __m256i ymm_mask1 = _mm256_load_si256((const __m256i*)kShuffleMaskJ400ToARGB_1); + __m256i ymm_alpha = _mm256_set1_epi32((int)0xff000000u); + + while (width > 0) { + __m256i ymm0 = _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i*)src_y)); + + __m256i ymm1 = _mm256_shuffle_epi8(ymm0, ymm_mask0); + __m256i ymm2 = _mm256_shuffle_epi8(ymm0, ymm_mask1); + + ymm1 = _mm256_or_si256(ymm1, ymm_alpha); + ymm2 = _mm256_or_si256(ymm2, ymm_alpha); + + _mm256_storeu_si256((__m256i*)dst_argb, ymm1); + _mm256_storeu_si256((__m256i*)(dst_argb + 32), ymm2); + + src_y += 16; + dst_argb += 64; + width -= 16; + } +} +#endif // HAS_J400TOARGBROW_AVX2 + #endif #ifdef __cplusplus