I420ToRAW use 2 step AVX512

On Icelake
Was AVX2
I420ToRAW_Opt (283 ms)
  67.55%  I422ToARGBRow_AVX2
  26.46%  ARGBToRGB24Row_AVX2

Now AVX512VBMI
I420ToRAW_Opt (238 ms)
  73.08%  I422ToARGBRow_AVX512BW
  21.59%  ARGBToRGB24Row_AVX512VBMI

Bug: 42280902
Change-Id: I9d4d21faed30c529a5e593819f103be115709f37
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/7909924
Reviewed-by: richard winterton <rrwinterton@gmail.com>
Commit-Queue: Frank Barchard <fbarchard@google.com>
This commit is contained in:
Frank Barchard 2026-06-08 14:03:52 -07:00 committed by libyuv-scoped@luci-project-accounts.iam.gserviceaccount.com
parent 4be798d7c5
commit 3bdb3b94ca
6 changed files with 110 additions and 2 deletions

View File

@ -1,6 +1,6 @@
Name: libyuv Name: libyuv
URL: https://chromium.googlesource.com/libyuv/libyuv/ URL: https://chromium.googlesource.com/libyuv/libyuv/
Version: 1947 Version: 1948
Revision: DEPS Revision: DEPS
License: BSD-3-Clause License: BSD-3-Clause
License File: LICENSE License File: LICENSE

View File

@ -398,6 +398,8 @@ extern "C" {
#define HAS_ARGBTOUV444MATRIXROW_AVX512BW #define HAS_ARGBTOUV444MATRIXROW_AVX512BW
#define HAS_ARGBTOYROW_AVX512BW #define HAS_ARGBTOYROW_AVX512BW
#define HAS_ARGBTOYMATRIXROW_AVX512BW #define HAS_ARGBTOYMATRIXROW_AVX512BW
#define HAS_I422TORGB24ROW_AVX512VBMI
#define HAS_I422TORGB24ROW_AVX512BW
#define HAS_ARGBTOUVJ444ROW_AVX512BW #define HAS_ARGBTOUVJ444ROW_AVX512BW
#define HAS_ARGBTOUVROW_AVX512BW #define HAS_ARGBTOUVROW_AVX512BW
#define HAS_ARGBTOUVJROW_AVX512BW #define HAS_ARGBTOUVJROW_AVX512BW
@ -5148,6 +5150,18 @@ void I422ToRGB24Row_AVX2(const uint8_t* src_y,
uint8_t* dst_rgb24, uint8_t* dst_rgb24,
const struct YuvConstants* yuvconstants, const struct YuvConstants* yuvconstants,
int width); int width);
// Converts `width` pixels of I422 (separate Y, U and V planes) to 3-byte
// packed output at dst_rgb24. The implementation is two-step: it runs
// I422ToARGBRow_AVX512BW into a temporary ARGB row and then
// ARGBToRGB24Row_AVX512VBMI. The conversion entry points pick this variant
// directly only when width is a multiple of 32.
void I422ToRGB24Row_AVX512VBMI(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_rgb24,
const struct YuvConstants* yuvconstants,
int width);
// Converts `width` pixels of I422 (separate Y, U and V planes) to 3-byte
// packed output at dst_rgb24. The implementation is two-step: it runs
// I422ToARGBRow_AVX512BW into a temporary ARGB row and then
// ARGBToRGB24Row_AVX2. The conversion entry points pick this variant directly
// only when width is a multiple of 32.
void I422ToRGB24Row_AVX512BW(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_rgb24,
const struct YuvConstants* yuvconstants,
int width);
void I422ToARGBRow_Any_AVX2(const uint8_t* y_buf, void I422ToARGBRow_Any_AVX2(const uint8_t* y_buf,
const uint8_t* u_buf, const uint8_t* u_buf,
const uint8_t* v_buf, const uint8_t* v_buf,
@ -5466,6 +5480,18 @@ void I422ToRGB24Row_Any_AVX2(const uint8_t* y_buf,
uint8_t* dst_ptr, uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants, const struct YuvConstants* yuvconstants,
int width); int width);
// "Any" wrapper around I422ToRGB24Row_AVX512VBMI, generated by the ANY31C
// macro. The conversion entry points use it when AVX512VBMI is available and
// width is not a multiple of 32.
// NOTE(review): presumably the wrapper handles the leftover pixels itself -
// confirm against the ANY31C definition.
void I422ToRGB24Row_Any_AVX512VBMI(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
// "Any" wrapper around I422ToRGB24Row_AVX512BW, generated by the ANY31C
// macro. The conversion entry points use it when AVX512BW is available and
// width is not a multiple of 32.
// NOTE(review): presumably the wrapper handles the leftover pixels itself -
// confirm against the ANY31C definition.
void I422ToRGB24Row_Any_AVX512BW(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
void I400ToARGBRow_C(const uint8_t* src_y, void I400ToARGBRow_C(const uint8_t* src_y,
uint8_t* rgb_buf, uint8_t* rgb_buf,

View File

@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ #ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1947 #define LIBYUV_VERSION 1948
#endif // INCLUDE_LIBYUV_VERSION_H_ #endif // INCLUDE_LIBYUV_VERSION_H_

View File

@ -5556,6 +5556,22 @@ int I420ToRGB24Matrix(const uint8_t* src_y,
} }
} }
#endif #endif
#if defined(HAS_I422TORGB24ROW_AVX512BW)
if (TestCpuFlag(kCpuHasAVX512BW)) {
I422ToRGB24Row = I422ToRGB24Row_Any_AVX512BW;
if (IS_ALIGNED(width, 32)) {
I422ToRGB24Row = I422ToRGB24Row_AVX512BW;
}
}
#endif
#if defined(HAS_I422TORGB24ROW_AVX512VBMI)
if (TestCpuFlag(kCpuHasAVX512VBMI)) {
I422ToRGB24Row = I422ToRGB24Row_Any_AVX512VBMI;
if (IS_ALIGNED(width, 32)) {
I422ToRGB24Row = I422ToRGB24Row_AVX512VBMI;
}
}
#endif
#if defined(HAS_I422TORGB24ROW_NEON) #if defined(HAS_I422TORGB24ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
I422ToRGB24Row = I422ToRGB24Row_Any_NEON; I422ToRGB24Row = I422ToRGB24Row_Any_NEON;
@ -5761,6 +5777,22 @@ int I422ToRGB24Matrix(const uint8_t* src_y,
} }
} }
#endif #endif
#if defined(HAS_I422TORGB24ROW_AVX512BW)
if (TestCpuFlag(kCpuHasAVX512BW)) {
I422ToRGB24Row = I422ToRGB24Row_Any_AVX512BW;
if (IS_ALIGNED(width, 32)) {
I422ToRGB24Row = I422ToRGB24Row_AVX512BW;
}
}
#endif
#if defined(HAS_I422TORGB24ROW_AVX512VBMI)
if (TestCpuFlag(kCpuHasAVX512VBMI)) {
I422ToRGB24Row = I422ToRGB24Row_Any_AVX512VBMI;
if (IS_ALIGNED(width, 32)) {
I422ToRGB24Row = I422ToRGB24Row_AVX512VBMI;
}
}
#endif
#if defined(HAS_I422TORGB24ROW_NEON) #if defined(HAS_I422TORGB24ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
I422ToRGB24Row = I422ToRGB24Row_Any_NEON; I422ToRGB24Row = I422ToRGB24Row_Any_NEON;

View File

@ -387,6 +387,12 @@ ANY31C(I444ToRGB24Row_Any_SSSE3, I444ToRGB24Row_SSSE3, 0, 0, 3, 15)
#ifdef HAS_I422TORGB24ROW_AVX2 #ifdef HAS_I422TORGB24ROW_AVX2
ANY31C(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 31) ANY31C(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 31)
#endif #endif
// Any-width wrapper for the two-step AVX512VBMI I422 to RGB24 row.
// NOTE(review): the arguments mirror the AVX2 instantiation; the trailing 31
// is presumably the width mask (32-pixel granularity) and the 3 the output
// bytes per pixel - confirm against the ANY31C definition.
#ifdef HAS_I422TORGB24ROW_AVX512VBMI
ANY31C(I422ToRGB24Row_Any_AVX512VBMI, I422ToRGB24Row_AVX512VBMI, 1, 0, 3, 31)
#endif
// Any-width wrapper for the two-step AVX512BW I422 to RGB24 row; same
// arguments as the AVX512VBMI instantiation.
#ifdef HAS_I422TORGB24ROW_AVX512BW
ANY31C(I422ToRGB24Row_Any_AVX512BW, I422ToRGB24Row_AVX512BW, 1, 0, 3, 31)
#endif
#ifdef HAS_I422TOARGBROW_AVX2 #ifdef HAS_I422TOARGBROW_AVX2
ANY31C(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, 1, 0, 4, 15) ANY31C(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, 1, 0, 4, 15)
#endif #endif

View File

@ -4298,6 +4298,50 @@ void I422ToRGB24Row_AVX2(const uint8_t* src_y,
} }
#endif #endif
#if defined(HAS_I422TOARGBROW_AVX512BW) && defined(HAS_ARGBTORGB24ROW_AVX512VBMI)
void I422ToRGB24Row_AVX512VBMI(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_rgb24,
const struct YuvConstants* yuvconstants,
int width) {
// Row buffer for intermediate ARGB pixels.
SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
while (width > 0) {
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
I422ToARGBRow_AVX512BW(src_y, src_u, src_v, row, yuvconstants, twidth);
ARGBToRGB24Row_AVX512VBMI(row, dst_rgb24, twidth);
src_y += twidth;
src_u += twidth / 2;
src_v += twidth / 2;
dst_rgb24 += twidth * 3;
width -= twidth;
}
}
#endif
#if defined(HAS_I422TOARGBROW_AVX512BW) && defined(HAS_ARGBTORGB24ROW_AVX2)
void I422ToRGB24Row_AVX512BW(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_rgb24,
const struct YuvConstants* yuvconstants,
int width) {
// Row buffer for intermediate ARGB pixels.
SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
while (width > 0) {
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
I422ToARGBRow_AVX512BW(src_y, src_u, src_v, row, yuvconstants, twidth);
ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth);
src_y += twidth;
src_u += twidth / 2;
src_v += twidth / 2;
dst_rgb24 += twidth * 3;
width -= twidth;
}
}
#endif
#if defined(HAS_I444TOARGBROW_AVX2) && defined(HAS_ARGBTORGB24ROW_AVX2) #if defined(HAS_I444TOARGBROW_AVX2) && defined(HAS_ARGBTORGB24ROW_AVX2)
void I444ToRGB24Row_AVX2(const uint8_t* src_y, void I444ToRGB24Row_AVX2(const uint8_t* src_y,
const uint8_t* src_u, const uint8_t* src_u,