From 3bdb3b94caaf12487af7c5f91f830ef0064be250 Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Mon, 8 Jun 2026 14:03:52 -0700 Subject: [PATCH] I420ToRAW use 2 step AVX512 On Icelake Was AVX2 I420ToRAW_Opt (283 ms) 67.55% I422ToARGBRow_AVX2 26.46% ARGBToRGB24Row_AVX2 Now AVX512VBMI I420ToRAW_Opt (238 ms) 73.08% I422ToARGBRow_AVX512BW 21.59% ARGBToRGB24Row_AVX512VBMI Bug: 42280902 Change-Id: I9d4d21faed30c529a5e593819f103be115709f37 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/7909924 Reviewed-by: richard winterton Commit-Queue: Frank Barchard --- README.chromium | 2 +- include/libyuv/row.h | 26 ++++++++++++++++++++++++ include/libyuv/version.h | 2 +- source/convert_argb.cc | 32 +++++++++++++++++++++++++++++ source/row_any.cc | 6 ++++++ source/row_common.cc | 44 ++++++++++++++++++++++++++++++++++++++++ 6 files changed, 110 insertions(+), 2 deletions(-) diff --git a/README.chromium b/README.chromium index e025cb9d6..cc424502a 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: https://chromium.googlesource.com/libyuv/libyuv/ -Version: 1947 +Version: 1948 Revision: DEPS License: BSD-3-Clause License File: LICENSE diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 1ec86f5eb..835342acd 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -398,6 +398,8 @@ extern "C" { #define HAS_ARGBTOUV444MATRIXROW_AVX512BW #define HAS_ARGBTOYROW_AVX512BW #define HAS_ARGBTOYMATRIXROW_AVX512BW +#define HAS_I422TORGB24ROW_AVX512VBMI +#define HAS_I422TORGB24ROW_AVX512BW #define HAS_ARGBTOUVJ444ROW_AVX512BW #define HAS_ARGBTOUVROW_AVX512BW #define HAS_ARGBTOUVJROW_AVX512BW @@ -5148,6 +5150,18 @@ void I422ToRGB24Row_AVX2(const uint8_t* src_y, uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width); +void I422ToRGB24Row_AVX512VBMI(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width); +void I422ToRGB24Row_AVX512BW(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width); void I422ToARGBRow_Any_AVX2(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -5466,6 +5480,18 @@ void I422ToRGB24Row_Any_AVX2(const uint8_t* y_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); +void I422ToRGB24Row_Any_AVX512VBMI(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I422ToRGB24Row_Any_AVX512BW(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); void I400ToARGBRow_C(const uint8_t* src_y, uint8_t* rgb_buf, diff --git a/include/libyuv/version.h b/include/libyuv/version.h index b12b94978..9f9d18da7 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1947 +#define LIBYUV_VERSION 1948 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/convert_argb.cc b/source/convert_argb.cc index a0b9c5d37..3844e9691 100644 --- a/source/convert_argb.cc +++ b/source/convert_argb.cc @@ -5556,6 +5556,22 @@ int I420ToRGB24Matrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TORGB24ROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + I422ToRGB24Row = I422ToRGB24Row_Any_AVX512BW; + if (IS_ALIGNED(width, 32)) { + I422ToRGB24Row = I422ToRGB24Row_AVX512BW; + } + } +#endif +#if defined(HAS_I422TORGB24ROW_AVX512VBMI) + if (TestCpuFlag(kCpuHasAVX512VBMI)) { + I422ToRGB24Row = I422ToRGB24Row_Any_AVX512VBMI; + if (IS_ALIGNED(width, 32)) { + I422ToRGB24Row = I422ToRGB24Row_AVX512VBMI; + } + } +#endif #if defined(HAS_I422TORGB24ROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { I422ToRGB24Row = I422ToRGB24Row_Any_NEON; @@ -5761,6 +5777,22 @@ int I422ToRGB24Matrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TORGB24ROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + I422ToRGB24Row = I422ToRGB24Row_Any_AVX512BW; + if (IS_ALIGNED(width, 32)) { + I422ToRGB24Row = I422ToRGB24Row_AVX512BW; + } + } +#endif +#if defined(HAS_I422TORGB24ROW_AVX512VBMI) + if (TestCpuFlag(kCpuHasAVX512VBMI)) { + I422ToRGB24Row = I422ToRGB24Row_Any_AVX512VBMI; + if (IS_ALIGNED(width, 32)) { + I422ToRGB24Row = I422ToRGB24Row_AVX512VBMI; + } + } +#endif #if defined(HAS_I422TORGB24ROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { I422ToRGB24Row = I422ToRGB24Row_Any_NEON; diff --git a/source/row_any.cc b/source/row_any.cc index 340adc188..919b231e6 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -387,6 +387,12 @@ ANY31C(I444ToRGB24Row_Any_SSSE3, I444ToRGB24Row_SSSE3, 0, 0, 3, 15) #ifdef HAS_I422TORGB24ROW_AVX2 ANY31C(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 31) #endif +#ifdef HAS_I422TORGB24ROW_AVX512VBMI +ANY31C(I422ToRGB24Row_Any_AVX512VBMI, I422ToRGB24Row_AVX512VBMI, 1, 0, 3, 31) +#endif +#ifdef HAS_I422TORGB24ROW_AVX512BW +ANY31C(I422ToRGB24Row_Any_AVX512BW, I422ToRGB24Row_AVX512BW, 1, 0, 3, 31) +#endif #ifdef HAS_I422TOARGBROW_AVX2 ANY31C(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, 1, 0, 4, 15) #endif diff --git a/source/row_common.cc b/source/row_common.cc index f44b0f313..70ceaf5c8 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -4298,6 +4298,50 @@ void I422ToRGB24Row_AVX2(const uint8_t* src_y, } #endif +#if defined(HAS_I422TOARGBROW_AVX512BW) && defined(HAS_ARGBTORGB24ROW_AVX512VBMI) +void I422ToRGB24Row_AVX512VBMI(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + I422ToARGBRow_AVX512BW(src_y, src_u, src_v, row, yuvconstants, twidth); + ARGBToRGB24Row_AVX512VBMI(row, dst_rgb24, twidth); + src_y += twidth; + src_u += twidth / 2; + src_v += twidth / 2; + dst_rgb24 += twidth * 3; + width -= twidth; + } +} +#endif + +#if defined(HAS_I422TOARGBROW_AVX512BW) && defined(HAS_ARGBTORGB24ROW_AVX2) +void I422ToRGB24Row_AVX512BW(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + I422ToARGBRow_AVX512BW(src_y, src_u, src_v, row, yuvconstants, twidth); + ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth); + src_y += twidth; + src_u += twidth / 2; + src_v += twidth / 2; + dst_rgb24 += twidth * 3; + width -= twidth; + } +} +#endif + #if defined(HAS_I444TOARGBROW_AVX2) && defined(HAS_ARGBTORGB24ROW_AVX2) void I444ToRGB24Row_AVX2(const uint8_t* src_y, const uint8_t* src_u,