From 4c3d7d517ae80dbe5e222f7dcb11659f5b240f11 Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Wed, 8 Apr 2026 18:48:12 -0700 Subject: [PATCH] ARGBToUV444 for AVX512 1.27x faster on AMD Zen5 (turin) Now AVX512 perf record ./libyuv_test '--gunit_filter=*ARGBToI444_Opt' --libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=10000 --libyuv_flags=-1 --libyuv_cpu_info=-1 [ OK ] LibYUVConvertTest.ARGBToI444_Opt (1071 ms) Overhead Symbol 53.49% ARGBToYRow_AVX2 44.70% ARGBToUV444Row_AVX512BW Was AVX2 [ OK ] LibYUVConvertTest.ARGBToI444_Opt (1369 ms) 61.06% ARGBToUV444Row_AVX2 37.67% ARGBToYRow_AVX2 Bug: libyuv:42280902 Change-Id: I306fbac656d6f7834ce1559e86d01eb34931ec3c Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/7738362 Commit-Queue: Frank Barchard Reviewed-by: Dale Curtis --- README.chromium | 2 +- include/libyuv/row.h | 34 ++++++++++-- include/libyuv/version.h | 2 +- source/convert.cc | 16 ++++++ source/convert_from_argb.cc | 24 +++++++++ source/row_any.cc | 9 ++++ source/row_gcc.cc | 102 ++++++++++++++++++++++++++++++++++++ 7 files changed, 182 insertions(+), 7 deletions(-) diff --git a/README.chromium b/README.chromium index 592fc1899..698e99b24 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: https://chromium.googlesource.com/libyuv/libyuv/ -Version: 1922 +Version: 1923 Revision: DEPS License: BSD-3-Clause License File: LICENSE diff --git a/include/libyuv/row.h b/include/libyuv/row.h index db875b74f..9c11f3199 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -379,6 +379,9 @@ extern "C" { #if !defined(LIBYUV_DISABLE_X86) && defined(__x86_64__) && \ (defined(CLANG_HAS_AVX512)) #define HAS_I422TOARGBROW_AVX512BW +#define HAS_ARGBTOUV444ROW_AVX512BW +#define HAS_ARGBTOUV444MATRIXROW_AVX512BW +#define HAS_ARGBTOUVJ444ROW_AVX512BW #endif // The following are available on Neon platforms: @@ -2156,6 +2159,11 @@ void ARGBToUV444MatrixRow_AVX2(const uint8_t* src_argb, uint8_t* dst_v, int width, const struct ArgbConstants* c); +void ARGBToUV444MatrixRow_AVX512BW(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width, + const struct ArgbConstants* c); void ARGBToUVMatrixRow_Any_SSSE3(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, @@ -2178,11 +2186,11 @@ void ARGBToUV444MatrixRow_Any_AVX2(const uint8_t* src_argb, uint8_t* dst_v, int width, const struct ArgbConstants* c); -void ARGBToUV444MatrixRow_Any_AVX2(const uint8_t* src_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width, - const struct ArgbConstants* c); +void ARGBToUV444MatrixRow_Any_AVX512BW(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width, + const struct ArgbConstants* c); void ABGRToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); void RGBAToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); @@ -2735,19 +2743,35 @@ void ARGBToUV444Row_AVX2(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width); +void ARGBToUV444Row_AVX512BW(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void ARGBToUV444Row_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); +void ARGBToUV444Row_Any_AVX512BW(const uint8_t* src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void ARGBToUVJ444Row_AVX2(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width); +void ARGBToUVJ444Row_AVX512BW(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void ARGBToUVJ444Row_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); +void ARGBToUVJ444Row_Any_AVX512BW(const uint8_t* src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void ARGBToUV444Row_C(const uint8_t* src_argb, uint8_t* dst_u, diff --git a/include/libyuv/version.h b/include/libyuv/version.h index e8fb3ed5c..c132cdafb 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1922 +#define LIBYUV_VERSION 1923 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/convert.cc b/source/convert.cc index cddaf961b..fbc0ea26e 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -3591,6 +3591,14 @@ int RAWToI444(const uint8_t* src_raw, } } #endif +#if defined(HAS_ARGBTOUV444ROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + ARGBToUV444Row = ARGBToUV444Row_Any_AVX512BW; + if (IS_ALIGNED(width, 64)) { + ARGBToUV444Row = ARGBToUV444Row_AVX512BW; + } + } +#endif #if defined(HAS_ARGBTOUV444ROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToUV444Row = ARGBToUV444Row_Any_NEON; @@ -3794,6 +3802,14 @@ int RAWToJ444(const uint8_t* src_raw, } } #endif +#if defined(HAS_ARGBTOUVJ444ROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + ARGBToUVJ444Row = ARGBToUVJ444Row_Any_AVX512BW; + if (IS_ALIGNED(width, 64)) { + ARGBToUVJ444Row = ARGBToUVJ444Row_AVX512BW; + } + } +#endif #if defined(HAS_ARGBTOUVJ444ROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToUVJ444Row = ARGBToUVJ444Row_Any_NEON; diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc index c7bf41ea8..d3353ee79 100644 --- a/source/convert_from_argb.cc +++ b/source/convert_from_argb.cc @@ -68,6 +68,14 @@ int ARGBToI444(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOUV444ROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + ARGBToUV444Row = ARGBToUV444Row_Any_AVX512BW; + if (IS_ALIGNED(width, 64)) { + ARGBToUV444Row = ARGBToUV444Row_AVX512BW; + } + } +#endif #if defined(HAS_ARGBTOUV444ROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToUV444Row = ARGBToUV444Row_Any_NEON; @@ -200,6 +208,14 @@ int ARGBToI444Matrix(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOUV444MATRIXROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + ARGBToUV444MatrixRow = ARGBToUV444MatrixRow_Any_AVX512BW; + if (IS_ALIGNED(width, 64)) { + ARGBToUV444MatrixRow = ARGBToUV444MatrixRow_AVX512BW; + } + } +#endif #if defined(HAS_ARGBTOYMATRIXROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON; @@ -2638,6 +2654,14 @@ int ARGBToJ444(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOUVJ444ROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + ARGBToUVJ444Row = ARGBToUVJ444Row_Any_AVX512BW; + if (IS_ALIGNED(width, 64)) { + ARGBToUVJ444Row = ARGBToUVJ444Row_AVX512BW; + } + } +#endif #if defined(HAS_ARGBTOUVJ444ROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToUVJ444Row = ARGBToUVJ444Row_Any_NEON; diff --git a/source/row_any.cc b/source/row_any.cc index f34f3eb2e..ff8b980a4 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -2030,9 +2030,15 @@ ANY12(ARGBToUVJ444Row_Any_SSSE3, ARGBToUVJ444Row_SSSE3, 0, 4, 0, 15) #ifdef HAS_ARGBTOUV444ROW_AVX2 ANY12(ARGBToUV444Row_Any_AVX2, ARGBToUV444Row_AVX2, 0, 4, 0, 31) #endif +#ifdef HAS_ARGBTOUV444ROW_AVX512BW +ANY12(ARGBToUV444Row_Any_AVX512BW, ARGBToUV444Row_AVX512BW, 0, 4, 0, 63) +#endif #ifdef HAS_ARGBTOUVJ444ROW_AVX2 ANY12(ARGBToUVJ444Row_Any_AVX2, ARGBToUVJ444Row_AVX2, 0, 4, 0, 31) #endif +#ifdef HAS_ARGBTOUVJ444ROW_AVX512BW +ANY12(ARGBToUVJ444Row_Any_AVX512BW, ARGBToUVJ444Row_AVX512BW, 0, 4, 0, 63) +#endif #ifdef HAS_YUY2TOUV422ROW_AVX2 ANY12(YUY2ToUV422Row_Any_AVX2, YUY2ToUV422Row_AVX2, 1, 4, 1, 31) ANY12(UYVYToUV422Row_Any_AVX2, UYVYToUV422Row_AVX2, 1, 4, 1, 31) @@ -2250,6 +2256,9 @@ ANY12MS(ARGBToUVMatrixRow_Any_SSSE3, ARGBToUVMatrixRow_SSSE3, 0, 4, 7) #ifdef HAS_ARGBTOUV444MATRIXROW_AVX2 ANY12M(ARGBToUV444MatrixRow_Any_AVX2, ARGBToUV444MatrixRow_AVX2, 4, 31) #endif +#ifdef HAS_ARGBTOUV444MATRIXROW_AVX512BW +ANY12M(ARGBToUV444MatrixRow_Any_AVX512BW, ARGBToUV444MatrixRow_AVX512BW, 4, 63) +#endif #ifdef HAS_ARGBTOUV444MATRIXROW_SSSE3 ANY12M(ARGBToUV444MatrixRow_Any_SSSE3, ARGBToUV444MatrixRow_SSSE3, 4, 15) #endif diff --git a/source/row_gcc.cc b/source/row_gcc.cc index 48998d323..dc4957a45 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -1723,6 +1723,88 @@ void ARGBToUV444MatrixRow_AVX2(const uint8_t* src_argb, } #endif // HAS_ARGBTOUV444ROW_AVX2 +#ifdef HAS_ARGBTOUV444ROW_AVX512BW +static const uint32_t kPermdARGBToY_AVX512BW[16] = {0, 4, 8, 12, 1, 5, 9, 13, + 2, 6, 10, 14, 3, 7, 11, 15}; + +void ARGBToUV444MatrixRow_AVX512BW(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width, + const struct ArgbConstants* c) { + asm volatile( + "vbroadcasti64x4 0x20(%4),%%zmm3 \n" // kRGBToU + "vbroadcasti64x4 0x40(%4),%%zmm4 \n" // kRGBToV + "vpternlogd $0xff,%%zmm16,%%zmm16,%%zmm16 \n" // -1 + "vpsllw $15,%%zmm16,%%zmm5 \n" // 0x8000 + "vmovups %5,%%zmm7 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vmovups (%0),%%zmm0 \n" + "vmovups 0x40(%0),%%zmm1 \n" + "vmovups 0x80(%0),%%zmm2 \n" + "vmovups 0xc0(%0),%%zmm6 \n" + "vpmaddubsw %%zmm3,%%zmm0,%%zmm0 \n" + "vpmaddubsw %%zmm3,%%zmm1,%%zmm1 \n" + "vpmaddubsw %%zmm3,%%zmm2,%%zmm2 \n" + "vpmaddubsw %%zmm3,%%zmm6,%%zmm6 \n" + "vpmaddwd %%zmm16,%%zmm0,%%zmm0 \n" + "vpmaddwd %%zmm16,%%zmm1,%%zmm1 \n" + "vpmaddwd %%zmm16,%%zmm2,%%zmm2 \n" + "vpmaddwd %%zmm16,%%zmm6,%%zmm6 \n" + "vpackssdw %%zmm1,%%zmm0,%%zmm0 \n" // mutates + "vpackssdw %%zmm6,%%zmm2,%%zmm2 \n" + "vpsubw %%zmm5,%%zmm0,%%zmm0 \n" + "vpsubw %%zmm5,%%zmm2,%%zmm2 \n" + "vpsrlw $0x8,%%zmm0,%%zmm0 \n" + "vpsrlw $0x8,%%zmm2,%%zmm2 \n" + "vpackuswb %%zmm2,%%zmm0,%%zmm0 \n" // mutates + "vpermd %%zmm0,%%zmm7,%%zmm0 \n" // unmutate. + "vmovups %%zmm0,(%1) \n" + + "vmovups (%0),%%zmm0 \n" + "vmovups 0x40(%0),%%zmm1 \n" + "vmovups 0x80(%0),%%zmm2 \n" + "vmovups 0xc0(%0),%%zmm6 \n" + "vpmaddubsw %%zmm4,%%zmm0,%%zmm0 \n" + "vpmaddubsw %%zmm4,%%zmm1,%%zmm1 \n" + "vpmaddubsw %%zmm4,%%zmm2,%%zmm2 \n" + "vpmaddubsw %%zmm4,%%zmm6,%%zmm6 \n" + "vpmaddwd %%zmm16,%%zmm0,%%zmm0 \n" + "vpmaddwd %%zmm16,%%zmm1,%%zmm1 \n" + "vpmaddwd %%zmm16,%%zmm2,%%zmm2 \n" + "vpmaddwd %%zmm16,%%zmm6,%%zmm6 \n" + "vpackssdw %%zmm1,%%zmm0,%%zmm0 \n" // mutates + "vpackssdw %%zmm6,%%zmm2,%%zmm2 \n" + "vpsubw %%zmm5,%%zmm0,%%zmm0 \n" + "vpsubw %%zmm5,%%zmm2,%%zmm2 \n" + "vpsrlw $0x8,%%zmm0,%%zmm0 \n" + "vpsrlw $0x8,%%zmm2,%%zmm2 \n" + "vpackuswb %%zmm2,%%zmm0,%%zmm0 \n" // mutates + "vpermd %%zmm0,%%zmm7,%%zmm0 \n" // unmutate. + "vmovups %%zmm0,(%1,%2,1) \n" + "lea 0x100(%0),%0 \n" + "lea 0x40(%1),%1 \n" + "subl $0x40,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 +#if defined(__i386__) + "+m"(width) // %3 +#else + "+rm"(width) // %3 +#endif + : "r"(c), // %4 + "m"(kPermdARGBToY_AVX512BW) // %5 + : "memory", "cc", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", + "zmm7", "zmm16"); +} +#endif // HAS_ARGBTOUV444ROW_AVX512BW + #ifdef HAS_ARGBTOUVROW_SSSE3 // ARGBARGB to AARRGGBB shuffle @@ -1904,6 +1986,16 @@ void ARGBToUV444Row_AVX2(const uint8_t* src_argb, } #endif // HAS_ARGBTOUV444ROW_AVX2 +#ifdef HAS_ARGBTOUV444ROW_AVX512BW +void ARGBToUV444Row_AVX512BW(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + ARGBToUV444MatrixRow_AVX512BW(src_argb, dst_u, dst_v, width, + &kArgbI601Constants); +} +#endif // HAS_ARGBTOUV444ROW_AVX512BW + #ifdef HAS_ARGBTOUVROW_SSSE3 void ARGBToUVRow_SSSE3(const uint8_t* src_argb, int src_stride_argb, @@ -1981,6 +2073,16 @@ void ARGBToUVJ444Row_AVX2(const uint8_t* src_argb, } #endif // HAS_ARGBTOUVJ444ROW_AVX2 +#ifdef HAS_ARGBTOUVJ444ROW_AVX512BW +void ARGBToUVJ444Row_AVX512BW(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + ARGBToUV444MatrixRow_AVX512BW(src_argb, dst_u, dst_v, width, + &kArgbJPEGConstants); +} +#endif // HAS_ARGBTOUVJ444ROW_AVX512BW + #ifdef HAS_ARGBTOUVJROW_SSSE3 void ARGBToUVJRow_SSSE3(const uint8_t* src_argb, int src_stride_argb,