ARGBToY for AVX512

- add ARGBToYMatrixRow_AVX512BW
- refactor SSE and AVX to use Matrix functions, making old functions
  call the new ones.

Zen5 1280x720
Was AVX2   LibYUVConvertTest.ARGBToI444_Opt (1125 ms)
Now AVX512 LibYUVConvertTest.ARGBToI444_Opt (641 ms)

Details by Gemini:
  1. Created 3 new Matrix functions:
    Added ARGBToYMatrixRow_SSSE3, ARGBToYMatrixRow_AVX2, and
    ARGBToYMatrixRow_AVX512BW to source/row_gcc.cc. These take the
    const struct ArgbConstants* c parameter similarly to
    ARGBToUV444MatrixRow_*. The x86 vector instructions dynamically
    calculate the needed values using the properties of the constants
    struct, including using vpmaddwd inside the AVX512 code to offset
    the lack of a native vphaddw.

  2. Replaced Old Functions with Wrappers:
    Modified the existing implementations of ARGBToYRow_SSSE3,
    ARGBToYJRow_SSSE3, ABGRToYRow_SSSE3, ABGRToYJRow_SSSE3,
    RGBAToYRow_SSSE3, RGBAToYJRow_SSSE3, BGRAToYRow_SSSE3 (and their
    _AVX2 equivalents) in source/row_gcc.cc to act as inline wrappers
    calling the new ARGBToYMatrixRow_* functions, passing the right
    matrix parameters (e.g. &kArgbI601Constants, &kArgbJPEGConstants,
    &kAbgrI601Constants).

  3. Added row_any.cc Handlers:
    Added ANY11MC definitions to source/row_any.cc to autogenerate
    ARGBToYMatrixRow_Any_SSSE3, ARGBToYMatrixRow_Any_AVX2, and
    ARGBToYMatrixRow_Any_AVX512BW, which safely handle non-aligned
    tails.

  4. Updated include/libyuv/row.h:
    Updated the headers with the proper void declarations for all newly
    generated Matrix and Any_ variants. Also defined
    HAS_ARGBTOYROW_AVX512BW in the CPU macros.

  5. Tested the Implementations:
    Compiled and tested on Linux x86; all tests pass cleanly. Also
    completed the Windows 32-bit build checks to guard against 32-bit
    regressions.

Bug: 477295731
Change-Id: I4f5eec9a961e24a9d760d0a1c0810fb5e29a0bd1
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/7759494
Reviewed-by: Dale Curtis <dalecurtis@chromium.org>
Reviewed-by: richard winterton <rrwinterton@gmail.com>
This commit is contained in:
Frank Barchard 2026-04-13 16:07:49 -07:00
parent 644251f252
commit 893eacf9b4
8 changed files with 390 additions and 223 deletions

View File

@ -1,6 +1,6 @@
Name: libyuv
URL: https://chromium.googlesource.com/libyuv/libyuv/
Version: 1924
Version: 1925
Revision: DEPS
License: BSD-3-Clause
License File: LICENSE

View File

@ -381,6 +381,7 @@ extern "C" {
#define HAS_I422TOARGBROW_AVX512BW
#define HAS_ARGBTOUV444ROW_AVX512BW
#define HAS_ARGBTOUV444MATRIXROW_AVX512BW
#define HAS_ARGBTOYROW_AVX512BW
#define HAS_ARGBTOUVJ444ROW_AVX512BW
#endif
@ -1746,19 +1747,31 @@ void ARGBToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width);
void ABGRToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width);
void ARGBToYRow_AVX512BW(const uint8_t* src_argb, uint8_t* dst_y, int width);
void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width);
void ARGBToYJRow_AVX512BW(const uint8_t* src_argb, uint8_t* dst_y, int width);
void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width);
void ARGBToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width);
void ABGRToYRow_AVX512BW(const uint8_t* src_argb, uint8_t* dst_y, int width);
void ABGRToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width);
void ABGRToYJRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width);
void ABGRToYJRow_AVX512BW(const uint8_t* src_argb, uint8_t* dst_y, int width);
void ABGRToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width);
void ABGRToYJRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width);
void ABGRToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width);
void RGBAToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGBAToYJRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width);
void RGBAToYJRow_AVX512BW(const uint8_t* src_argb, uint8_t* dst_y, int width);
void RGBAToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width);
void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width);
void BGRAToYRow_AVX512BW(const uint8_t* src_argb, uint8_t* dst_y, int width);
void BGRAToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width);
void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width);
void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width);
void RGBAToYRow_AVX512BW(const uint8_t* src_argb, uint8_t* dst_y, int width);
void RGBAToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width);
void RGB24ToYRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_y, int width);
void RGB24ToYJRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_yj, int width);
void RAWToYRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_y, int width);
@ -2149,6 +2162,31 @@ void ARGBToUV444MatrixRow_C(const uint8_t* src_argb,
uint8_t* dst_v,
int width,
const struct ArgbConstants* c);
void ARGBToYMatrixRow_SSSE3(const uint8_t* src_argb,
uint8_t* dst_y,
int width,
const struct ArgbConstants* c);
void ARGBToYMatrixRow_AVX2(const uint8_t* src_argb,
uint8_t* dst_y,
int width,
const struct ArgbConstants* c);
void ARGBToYMatrixRow_AVX512BW(const uint8_t* src_argb,
uint8_t* dst_y,
int width,
const struct ArgbConstants* c);
void ARGBToYMatrixRow_Any_SSSE3(const uint8_t* src_argb,
uint8_t* dst_y,
int width,
const struct ArgbConstants* c);
void ARGBToYMatrixRow_Any_AVX2(const uint8_t* src_argb,
uint8_t* dst_y,
int width,
const struct ArgbConstants* c);
void ARGBToYMatrixRow_Any_AVX512BW(const uint8_t* src_argb,
uint8_t* dst_y,
int width,
const struct ArgbConstants* c);
void ARGBToUV444MatrixRow_SSSE3(const uint8_t* src_argb,
uint8_t* dst_u,
uint8_t* dst_v,
@ -2204,6 +2242,15 @@ void RAWToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width);
void RGB565ToYRow_C(const uint8_t* src_rgb565, uint8_t* dst_y, int width);
void ARGB1555ToYRow_C(const uint8_t* src_argb1555, uint8_t* dst_y, int width);
void ARGB4444ToYRow_C(const uint8_t* src_argb4444, uint8_t* dst_y, int width);
void ARGBToYRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ARGBToYJRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ABGRToYRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ABGRToYJRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGBAToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGBAToYRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGBAToYJRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void BGRAToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void BGRAToYRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ARGBToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ARGBToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ABGRToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);

View File

@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1924
#define LIBYUV_VERSION 1925
#endif // INCLUDE_LIBYUV_VERSION_H_

View File

@ -2091,6 +2091,14 @@ int ARGBToI420(const uint8_t* src_argb,
}
}
#endif
#if defined(HAS_ARGBTOYROW_AVX512BW)
if (TestCpuFlag(kCpuHasAVX512BW)) {
ARGBToYRow = ARGBToYRow_Any_AVX512BW;
if (IS_ALIGNED(width, 64)) {
ARGBToYRow = ARGBToYRow_AVX512BW;
}
}
#endif
#if defined(HAS_ARGBTOUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToUVRow = ARGBToUVRow_Any_AVX2;
@ -2522,6 +2530,14 @@ int BGRAToI420(const uint8_t* src_bgra,
}
}
#endif
#if defined(HAS_ARGBTOYROW_AVX512BW)
if (TestCpuFlag(kCpuHasAVX512BW)) {
BGRAToYRow = BGRAToYRow_Any_AVX512BW;
if (IS_ALIGNED(width, 64)) {
BGRAToYRow = BGRAToYRow_AVX512BW;
}
}
#endif
#if defined(HAS_BGRATOUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
BGRAToUVRow = BGRAToUVRow_Any_AVX2;
@ -2621,6 +2637,14 @@ int ABGRToI420(const uint8_t* src_abgr,
}
}
#endif
#if defined(HAS_ARGBTOYROW_AVX512BW)
if (TestCpuFlag(kCpuHasAVX512BW)) {
ABGRToYRow = ABGRToYRow_Any_AVX512BW;
if (IS_ALIGNED(width, 64)) {
ABGRToYRow = ABGRToYRow_AVX512BW;
}
}
#endif
#if defined(HAS_ABGRTOUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ABGRToUVRow = ABGRToUVRow_Any_AVX2;
@ -2752,6 +2776,22 @@ int RGBAToI420(const uint8_t* src_rgba,
}
}
#endif
#if defined(HAS_ARGBTOYROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
RGBAToYRow = RGBAToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
RGBAToYRow = RGBAToYRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOYROW_AVX512BW)
if (TestCpuFlag(kCpuHasAVX512BW)) {
RGBAToYRow = RGBAToYRow_Any_AVX512BW;
if (IS_ALIGNED(width, 64)) {
RGBAToYRow = RGBAToYRow_AVX512BW;
}
}
#endif
#if defined(HAS_RGBATOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
RGBAToUVRow = RGBAToUVRow_Any_SSSE3;
@ -3125,6 +3165,14 @@ int RGB24ToJ420(const uint8_t* src_rgb24,
}
}
#endif
#if defined(HAS_ARGBTOYROW_AVX512BW)
if (TestCpuFlag(kCpuHasAVX512BW)) {
ARGBToYJRow = ARGBToYJRow_Any_AVX512BW;
if (IS_ALIGNED(width, 64)) {
ARGBToYJRow = ARGBToYJRow_AVX512BW;
}
}
#endif
#if defined(HAS_ARGBTOUVJROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;

View File

@ -124,6 +124,14 @@ int ARGBToI444(const uint8_t* src_argb,
}
}
#endif
#if defined(HAS_ARGBTOYROW_AVX512BW)
if (TestCpuFlag(kCpuHasAVX512BW)) {
ARGBToYRow = ARGBToYRow_Any_AVX512BW;
if (IS_ALIGNED(width, 64)) {
ARGBToYRow = ARGBToYRow_AVX512BW;
}
}
#endif
#if defined(HAS_ARGBTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYRow = ARGBToYRow_Any_NEON;
@ -1083,6 +1091,14 @@ int ABGRToNV12(const uint8_t* src_abgr,
}
}
#endif
#if defined(HAS_ARGBTOYROW_AVX512BW)
if (TestCpuFlag(kCpuHasAVX512BW)) {
ABGRToYRow = ABGRToYRow_Any_AVX512BW;
if (IS_ALIGNED(width, 64)) {
ABGRToYRow = ABGRToYRow_AVX512BW;
}
}
#endif
#if defined(HAS_ABGRTOUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ABGRToUVRow = ABGRToUVRow_Any_AVX2;
@ -2710,6 +2726,14 @@ int ARGBToJ444(const uint8_t* src_argb,
}
}
#endif
#if defined(HAS_ARGBTOYROW_AVX512BW)
if (TestCpuFlag(kCpuHasAVX512BW)) {
ARGBToYJRow = ARGBToYJRow_Any_AVX512BW;
if (IS_ALIGNED(width, 64)) {
ARGBToYJRow = ARGBToYJRow_AVX512BW;
}
}
#endif
#if defined(HAS_ARGBTOYJROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYJRow = ARGBToYJRow_Any_NEON;
@ -3171,6 +3195,14 @@ int RGBAToJ400(const uint8_t* src_rgba,
}
}
#endif
#if defined(HAS_ARGBTOYROW_AVX512BW)
if (TestCpuFlag(kCpuHasAVX512BW)) {
RGBAToYJRow = RGBAToYJRow_Any_AVX512BW;
if (IS_ALIGNED(width, 64)) {
RGBAToYJRow = RGBAToYJRow_AVX512BW;
}
}
#endif
#if defined(HAS_RGBATOYJROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
RGBAToYJRow = RGBAToYJRow_Any_NEON;
@ -3268,6 +3300,14 @@ int ABGRToJ420(const uint8_t* src_abgr,
}
}
#endif
#if defined(HAS_ARGBTOYROW_AVX512BW)
if (TestCpuFlag(kCpuHasAVX512BW)) {
ABGRToYJRow = ABGRToYJRow_Any_AVX512BW;
if (IS_ALIGNED(width, 64)) {
ABGRToYJRow = ABGRToYJRow_AVX512BW;
}
}
#endif
#if defined(HAS_ABGRTOUVJROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ABGRToUVJRow = ABGRToUVJRow_Any_AVX2;

View File

@ -4777,6 +4777,14 @@ static int ARGBSobelize(const uint8_t* src_argb,
}
}
#endif
#if defined(HAS_ARGBTOYROW_AVX512BW)
if (TestCpuFlag(kCpuHasAVX512BW)) {
ARGBToYJRow = ARGBToYJRow_Any_AVX512BW;
if (IS_ALIGNED(width, 64)) {
ARGBToYJRow = ARGBToYJRow_AVX512BW;
}
}
#endif
#if defined(HAS_ARGBTOYJROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYJRow = ARGBToYJRow_Any_NEON;

View File

@ -1070,11 +1070,29 @@ ANY11(YUY2ToYRow_Any_AVX2, YUY2ToYRow_AVX2, 1, 4, 1, 31)
#ifdef HAS_ARGBTOYROW_SSSE3
ANY11(ARGBToYRow_Any_SSSE3, ARGBToYRow_SSSE3, 0, 4, 1, 15)
#endif
#ifdef HAS_ARGBTOYROW_AVX512BW
ANY11(ARGBToYRow_Any_AVX512BW, ARGBToYRow_AVX512BW, 0, 4, 1, 63)
#endif
#ifdef HAS_BGRATOYROW_SSSE3
ANY11(BGRAToYRow_Any_SSSE3, BGRAToYRow_SSSE3, 0, 4, 1, 15)
ANY11(ABGRToYRow_Any_SSSE3, ABGRToYRow_SSSE3, 0, 4, 1, 15)
ANY11(RGBAToYRow_Any_SSSE3, RGBAToYRow_SSSE3, 0, 4, 1, 15)
#endif
#ifdef HAS_ARGBTOYROW_AVX512BW
ANY11(BGRAToYRow_Any_AVX512BW, BGRAToYRow_AVX512BW, 0, 4, 1, 63)
#endif
#ifdef HAS_ARGBTOYROW_AVX2
ANY11(BGRAToYRow_Any_AVX2, BGRAToYRow_AVX2, 0, 4, 1, 31)
#endif
#ifdef HAS_ARGBTOYROW_AVX512BW
ANY11(RGBAToYRow_Any_AVX512BW, RGBAToYRow_AVX512BW, 0, 4, 1, 63)
#endif
#ifdef HAS_ARGBTOYROW_AVX2
ANY11(RGBAToYRow_Any_AVX2, RGBAToYRow_AVX2, 0, 4, 1, 31)
#endif
#ifdef HAS_ARGBTOYROW_AVX512BW
ANY11(ABGRToYRow_Any_AVX512BW, ABGRToYRow_AVX512BW, 0, 4, 1, 63)
#endif
#ifdef HAS_YUY2TOYROW_SSE2
ANY11(YUY2ToYRow_Any_SSE2, YUY2ToYRow_SSE2, 1, 4, 1, 15)
ANY11(UYVYToYRow_Any_SSE2, UYVYToYRow_SSE2, 1, 4, 1, 15)
@ -1082,12 +1100,21 @@ ANY11(UYVYToYRow_Any_SSE2, UYVYToYRow_SSE2, 1, 4, 1, 15)
#ifdef HAS_ARGBTOYJROW_SSSE3
ANY11(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_SSSE3, 0, 4, 1, 15)
#endif
#ifdef HAS_ARGBTOYROW_AVX512BW
ANY11(ARGBToYJRow_Any_AVX512BW, ARGBToYJRow_AVX512BW, 0, 4, 1, 63)
#endif
#ifdef HAS_ABGRTOYJROW_SSSE3
ANY11(ABGRToYJRow_Any_SSSE3, ABGRToYJRow_SSSE3, 0, 4, 1, 15)
#endif
#ifdef HAS_ARGBTOYROW_AVX512BW
ANY11(ABGRToYJRow_Any_AVX512BW, ABGRToYJRow_AVX512BW, 0, 4, 1, 63)
#endif
#ifdef HAS_RGBATOYJROW_SSSE3
ANY11(RGBAToYJRow_Any_SSSE3, RGBAToYJRow_SSSE3, 0, 4, 1, 15)
#endif
#ifdef HAS_ARGBTOYROW_AVX512BW
ANY11(RGBAToYJRow_Any_AVX512BW, RGBAToYJRow_AVX512BW, 0, 4, 1, 63)
#endif
#ifdef HAS_ARGBTOYROW_NEON
ANY11(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 0, 4, 1, 15)
#endif
@ -2282,6 +2309,15 @@ ANY12M(ARGBToUV444MatrixRow_Any_NEON, ARGBToUV444MatrixRow_NEON, 4, 7)
memcpy(dst_ptr + (ptrdiff_t)n, vout, (ptrdiff_t)r); \
}
#ifdef HAS_ARGBTOYROW_SSSE3
ANY11MC(ARGBToYMatrixRow_Any_SSSE3, ARGBToYMatrixRow_SSSE3, 4, 15)
#endif
#ifdef HAS_ARGBTOYROW_AVX2
ANY11MC(ARGBToYMatrixRow_Any_AVX2, ARGBToYMatrixRow_AVX2, 4, 31)
#endif
#ifdef HAS_ARGBTOYROW_AVX512BW
ANY11MC(ARGBToYMatrixRow_Any_AVX512BW, ARGBToYMatrixRow_AVX512BW, 4, 63)
#endif
#ifdef HAS_ARGBTOYMATRIXROW_NEON
ANY11MC(ARGBToYMatrixRow_Any_NEON, ARGBToYMatrixRow_NEON, 4, 15)
#endif

View File

@ -24,37 +24,21 @@ extern "C" {
#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
// Constants for ARGB
static const uvec8 kARGBToY = {25u, 129u, 66u, 0u, 25u, 129u, 66u, 0u,
25u, 129u, 66u, 0u, 25u, 129u, 66u, 0u};
// JPeg full range.
static const uvec8 kARGBToYJ = {29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u,
29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u};
static const uvec8 kABGRToYJ = {77u, 150u, 29u, 0u, 77u, 150u, 29u, 0u,
77u, 150u, 29u, 0u, 77u, 150u, 29u, 0u};
static const uvec8 kRGBAToYJ = {0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u,
0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u};
#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
// Constants for BGRA
static const uvec8 kBGRAToY = {0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u,
0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u};
// Constants for ABGR
static const uvec8 kABGRToY = {66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u,
66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u};
// Constants for RGBA.
static const uvec8 kRGBAToY = {0u, 25u, 129u, 66u, 0u, 25u, 129u, 66u,
0u, 25u, 129u, 66u, 0u, 25u, 129u, 66u};
// 126 (7e) - (-109..110) = 16..235
static const uvec16 kAddY16 = {0x7e80u, 0x7e80u, 0x7e80u, 0x7e80u,
0x7e80u, 0x7e80u, 0x7e80u, 0x7e80u};
static const uvec16 kAddY0 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
0x8080u, 0x8080u, 0x8080u, 0x8080u};
static const uvec16 kSub128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
0x8080u, 0x8080u, 0x8080u, 0x8080u};
@ -1367,22 +1351,9 @@ void AB64ToARGBRow_AVX2(const uint16_t* src_ab64,
#ifdef HAS_ARGBTOYROW_SSSE3
// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
asm volatile(
"movdqa %3,%%xmm4 \n"
"movdqa %4,%%xmm5 \n"
"movdqa %5,%%xmm7 \n" //
LABELALIGN "" //
RGBTOY(xmm7) //
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "m"(kARGBToY), // %3
"m"(kSub128), // %4
"m"(kAddY16) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
ARGBToYMatrixRow_SSSE3(src_argb, dst_y, width, &kArgbI601Constants);
}
#endif // HAS_ARGBTOYROW_SSSE3
@ -1390,198 +1361,203 @@ void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
// Same as ARGBToYRow but different coefficients, no add 16.
void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
asm volatile(
"movdqa %3,%%xmm4 \n"
"movdqa %4,%%xmm5 \n"
"movdqa %5,%%xmm7 \n" //
LABELALIGN "" //
RGBTOY(xmm7) //
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "m"(kARGBToYJ), // %3
"m"(kSub128), // %4
"m"(kAddY0) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
ARGBToYMatrixRow_SSSE3(src_argb, dst_y, width, &kArgbJPEGConstants);
}
#endif // HAS_ARGBTOYJROW_SSSE3
#ifdef HAS_ABGRTOYJROW_SSSE3
// Convert 16 ABGR pixels (64 bytes) to 16 YJ values.
// Same as ABGRToYRow but different coefficients, no add 16.
void ABGRToYJRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
asm volatile(
"movdqa %3,%%xmm4 \n"
"movdqa %4,%%xmm5 \n"
"movdqa %5,%%xmm7 \n" //
LABELALIGN "" //
RGBTOY(xmm7) //
: "+r"(src_abgr), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "m"(kABGRToYJ), // %3
"m"(kSub128), // %4
"m"(kAddY0) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
void ABGRToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
ARGBToYMatrixRow_SSSE3(src_argb, dst_y, width, &kAbgrJPEGConstants);
}
#endif // HAS_ABGRTOYJROW_SSSE3
#ifdef HAS_RGBATOYJROW_SSSE3
// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
// Same as ARGBToYRow but different coefficients, no add 16.
void RGBAToYJRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
asm volatile(
"movdqa %3,%%xmm4 \n"
"movdqa %4,%%xmm5 \n"
"movdqa %5,%%xmm7 \n" //
LABELALIGN "" //
RGBTOY(xmm7) //
: "+r"(src_rgba), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "m"(kRGBAToYJ), // %3
"m"(kSub128), // %4
"m"(kAddY0) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
void RGBAToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
ARGBToYMatrixRow_SSSE3(src_argb, dst_y, width, &kRgbaJPEGConstants);
}
#endif // HAS_RGBATOYJROW_SSSE3
#if defined(HAS_ARGBTOYROW_AVX2) || defined(HAS_ABGRTOYROW_AVX2) || \
defined(HAS_ARGBEXTRACTALPHAROW_AVX2)
// vpermd for vphaddw + vpackuswb vpermd.
static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7};
#endif
#ifdef HAS_ARGBTOYROW_AVX2
// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
#ifdef HAS_ARGBTOYROW_AVX2
void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
asm volatile(
"vbroadcastf128 %3,%%ymm4 \n"
"vbroadcastf128 %4,%%ymm5 \n"
"vbroadcastf128 %5,%%ymm7 \n"
"vmovdqa %6,%%ymm6 \n" //
LABELALIGN "" //
RGBTOY_AVX2(ymm7) //
"vzeroupper \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "m"(kARGBToY), // %3
"m"(kSub128), // %4
"m"(kAddY16), // %5
"m"(kPermdARGBToY_AVX) // %6
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
ARGBToYMatrixRow_AVX2(src_argb, dst_y, width, &kArgbI601Constants);
}
#endif
#endif // HAS_ARGBTOYROW_AVX2
#ifdef HAS_ABGRTOYROW_AVX2
// Convert 32 ABGR pixels (128 bytes) to 32 Y values.
void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
asm volatile(
"vbroadcastf128 %3,%%ymm4 \n"
"vbroadcastf128 %4,%%ymm5 \n"
"vbroadcastf128 %5,%%ymm7 \n"
"vmovdqa %6,%%ymm6 \n" //
LABELALIGN "" //
RGBTOY_AVX2(ymm7) //
"vzeroupper \n"
: "+r"(src_abgr), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "m"(kABGRToY), // %3
"m"(kSub128), // %4
"m"(kAddY16), // %5
"m"(kPermdARGBToY_AVX) // %6
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
#ifdef HAS_ARGBTOYROW_AVX2
void ABGRToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
ARGBToYMatrixRow_AVX2(src_argb, dst_y, width, &kAbgrI601Constants);
}
#endif
#endif // HAS_ABGRTOYROW_AVX2
#ifdef HAS_ARGBTOYJROW_AVX2
// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
#ifdef HAS_ARGBTOYROW_AVX2
void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
asm volatile(
"vbroadcastf128 %3,%%ymm4 \n"
"vbroadcastf128 %4,%%ymm5 \n"
"vbroadcastf128 %5,%%ymm7 \n"
"vmovdqa %6,%%ymm6 \n" //
LABELALIGN "" //
RGBTOY_AVX2(ymm7) //
"vzeroupper \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "m"(kARGBToYJ), // %3
"m"(kSub128), // %4
"m"(kAddY0), // %5
"m"(kPermdARGBToY_AVX) // %6
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
ARGBToYMatrixRow_AVX2(src_argb, dst_y, width, &kArgbJPEGConstants);
}
#endif
#endif // HAS_ARGBTOYJROW_AVX2
#ifdef HAS_ABGRTOYJROW_AVX2
// Convert 32 ABGR pixels (128 bytes) to 32 Y values.
void ABGRToYJRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
asm volatile(
"vbroadcastf128 %3,%%ymm4 \n"
"vbroadcastf128 %4,%%ymm5 \n"
"vbroadcastf128 %5,%%ymm7 \n"
"vmovdqa %6,%%ymm6 \n" //
LABELALIGN "" //
RGBTOY_AVX2(ymm7) //
"vzeroupper \n"
: "+r"(src_abgr), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "m"(kABGRToYJ), // %3
"m"(kSub128), // %4
"m"(kAddY0), // %5
"m"(kPermdARGBToY_AVX) // %6
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
#ifdef HAS_ARGBTOYROW_AVX2
void ABGRToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
ARGBToYMatrixRow_AVX2(src_argb, dst_y, width, &kAbgrJPEGConstants);
}
#endif
#endif // HAS_ABGRTOYJROW_AVX2
#ifdef HAS_RGBATOYJROW_AVX2
// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
asm volatile(
"vbroadcastf128 %3,%%ymm4 \n"
"vbroadcastf128 %4,%%ymm5 \n"
"vbroadcastf128 %5,%%ymm7 \n"
"vmovdqa %6,%%ymm6 \n" //
#ifdef HAS_ARGBTOYROW_AVX2
void RGBAToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
ARGBToYMatrixRow_AVX2(src_argb, dst_y, width, &kRgbaJPEGConstants);
}
#endif
#endif // HAS_RGBATOYJROW_AVX2
LABELALIGN "" //
RGBTOY_AVX2(ymm7) //
"vzeroupper \n"
: "+r"(src_rgba), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "m"(kRGBAToYJ), // %3
"m"(kSub128), // %4
"m"(kAddY0), // %5
"m"(kPermdARGBToY_AVX) // %6
#if defined(HAS_ARGBTOYROW_AVX2) || defined(HAS_ARGBTOUV444ROW_AVX2) || \
defined(HAS_ARGBEXTRACTALPHAROW_AVX2)
// vpermd for vphaddw + vpackuswb vpermd.
static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7};
#endif
#ifdef HAS_ARGBTOYROW_SSSE3
// Convert a row of 32-bit pixels to Y using the coefficient and bias
// vectors supplied in |c| (e.g. &kArgbI601Constants), so one kernel
// serves every channel order and matrix. Tails that are not a multiple
// of 16 pixels are handled by the _Any_ wrapper in row_any.cc.
void ARGBToYMatrixRow_SSSE3(const uint8_t* src_argb,
uint8_t* dst_y,
int width,
const struct ArgbConstants* c) {
asm volatile(
// Build 0x80 in every byte of xmm5: all-ones -> 0x8000 words ->
// signed-saturating pack -> 0x80 bytes (the -128 byte bias).
"pcmpeqb %%xmm5,%%xmm5 \n"
"psllw $15,%%xmm5 \n"
"packsswb %%xmm5,%%xmm5 \n"
"movdqa 0(%3),%%xmm4 \n" // Y coefficients (offset 0 in ArgbConstants)
"movdqa 0x60(%3),%%xmm7 \n" // Y bias (offset 0x60 in ArgbConstants)
// Fold the byte bias into the add vector: xmm7 += 128 * sum(coeffs).
// NOTE(review): presumably compensates for the -128 applied to source
// bytes inside RGBTOY (macro not visible here) — confirm against row.h.
"movdqa %%xmm4,%%xmm6 \n"
"pmaddubsw %%xmm5,%%xmm6 \n"
"phaddw %%xmm6,%%xmm6 \n"
"psubw %%xmm6,%%xmm7 \n"
LABELALIGN ""
RGBTOY(xmm7)
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "r"(c) // %3
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
#endif // HAS_RGBATOYJROW_AVX2
#endif
#ifdef HAS_ARGBTOYROW_AVX2
// AVX2 variant of ARGBToYMatrixRow: converts 32 pixels per iteration
// to Y using the coefficient/bias vectors in |c|. Tails are handled by
// the _Any_ wrapper in row_any.cc.
void ARGBToYMatrixRow_AVX2(const uint8_t* src_argb,
uint8_t* dst_y,
int width,
const struct ArgbConstants* c) {
asm volatile(
// Build 0x80 in every byte of ymm5 (the -128 byte bias).
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
"vpsllw $15,%%ymm5,%%ymm5 \n"
"vpacksswb %%ymm5,%%ymm5,%%ymm5 \n"
"vbroadcastf128 0(%3),%%ymm4 \n" // Y coefficients, broadcast to both lanes
"vbroadcastf128 0x60(%3),%%ymm7 \n" // Y bias
// Fold the byte bias into ymm7: ymm7 += 128 * sum(coeffs).
"vpmaddubsw %%ymm5,%%ymm4,%%ymm6 \n"
"vphaddw %%ymm6,%%ymm6,%%ymm6 \n"
"vpsubw %%ymm6,%%ymm7,%%ymm7 \n"
"vmovdqa %4,%%ymm6 \n" // vpermd indices for lane fixup
LABELALIGN ""
RGBTOY_AVX2(ymm7)
"vzeroupper \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "r"(c), // %3
"m"(kPermdARGBToY_AVX) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
#endif
#if defined(HAS_ARGBTOYROW_AVX512BW) || defined(HAS_ARGBTOUV444ROW_AVX512BW)
// vpermd index vector that restores pixel order after the in-lane
// vpackssdw/vpackuswb narrowing in the AVX512 kernels.
static const uint32_t kPermdARGBToY_AVX512BW[16] = {0, 4, 8, 12, 1, 5, 9, 13,
2, 6, 10, 14, 3, 7, 11, 15};
#endif
#ifdef HAS_ARGBTOYROW_AVX512BW
// AVX512BW variant of ARGBToYMatrixRow: converts 64 pixels (256 bytes)
// per iteration to Y using the coefficient/bias vectors in |c|.
// AVX512 lacks vphaddw, so pairwise word sums are formed with
// vpmaddwd against a vector of ones (zmm16) followed by vpackssdw.
// Tails are handled by the _Any_ wrapper; callers require width to be
// a multiple of 64 for this kernel.
void ARGBToYMatrixRow_AVX512BW(const uint8_t* src_argb,
uint8_t* dst_y,
int width,
const struct ArgbConstants* c) {
asm volatile(
// zmm5 = 0x80 in every byte (the -128 bias); zmm16 = 1 in every word.
"vpternlogd $0xff,%%zmm16,%%zmm16,%%zmm16 \n"
"vpsllw $15,%%zmm16,%%zmm5 \n"
"vpacksswb %%zmm5,%%zmm5,%%zmm5 \n"
"vpsrlw $15,%%zmm16,%%zmm16 \n" // zmm16 = 1
"vbroadcasti64x4 0(%3),%%zmm4 \n" // Y coefficients
"vbroadcasti64x4 0x60(%3),%%zmm7 \n" // Y bias
// Fold the byte bias into zmm7: zmm7 += 128 * sum(coeffs).
"vpmaddubsw %%zmm5,%%zmm4,%%zmm6 \n"
"vpmaddwd %%zmm16,%%zmm6,%%zmm6 \n"
"vpackssdw %%zmm6,%%zmm6,%%zmm6 \n"
"vpsubw %%zmm6,%%zmm7,%%zmm7 \n"
"vmovups %4,%%zmm6 \n" // vpermd indices for lane fixup
LABELALIGN
"1: \n"
// Load 64 pixels (4 x 64 bytes).
"vmovups (%0),%%zmm0 \n"
"vmovups 0x40(%0),%%zmm1 \n"
"vmovups 0x80(%0),%%zmm2 \n"
"vmovups 0xc0(%0),%%zmm3 \n"
// Bias bytes to signed range so vpmaddubsw (unsigned coeff x signed
// pixel) cannot overflow; the bias is undone via zmm7 below.
"vpsubb %%zmm5,%%zmm0,%%zmm0 \n"
"vpsubb %%zmm5,%%zmm1,%%zmm1 \n"
"vpsubb %%zmm5,%%zmm2,%%zmm2 \n"
"vpsubb %%zmm5,%%zmm3,%%zmm3 \n"
"vpmaddubsw %%zmm0,%%zmm4,%%zmm0 \n"
"vpmaddubsw %%zmm1,%%zmm4,%%zmm1 \n"
"vpmaddubsw %%zmm2,%%zmm4,%%zmm2 \n"
"vpmaddubsw %%zmm3,%%zmm4,%%zmm3 \n"
"lea 0x100(%0),%0 \n"
// vpmaddwd by 1 + vpackssdw = horizontal word-pair add (no vphaddw).
"vpmaddwd %%zmm16,%%zmm0,%%zmm0 \n"
"vpmaddwd %%zmm16,%%zmm1,%%zmm1 \n"
"vpackssdw %%zmm1,%%zmm0,%%zmm0 \n"
"vpmaddwd %%zmm16,%%zmm2,%%zmm2 \n"
"vpmaddwd %%zmm16,%%zmm3,%%zmm3 \n"
"vpackssdw %%zmm3,%%zmm2,%%zmm2 \n"
// Add the (bias-corrected) offset and take the high byte of each word.
"vpaddw %%zmm7,%%zmm0,%%zmm0 \n"
"vpaddw %%zmm7,%%zmm2,%%zmm2 \n"
"vpsrlw $0x8,%%zmm0,%%zmm0 \n"
"vpsrlw $0x8,%%zmm2,%%zmm2 \n"
"vpackuswb %%zmm2,%%zmm0,%%zmm0 \n"
"vpermd %%zmm0,%%zmm6,%%zmm0 \n" // undo in-lane pack ordering
"vmovups %%zmm0,(%1) \n"
"lea 0x40(%1),%1 \n"
"sub $0x40,%2 \n" // 64 pixels per iteration
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "r"(c), // %3
"m"(kPermdARGBToY_AVX512BW) // %4
: "memory", "cc", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6",
"zmm7", "zmm16");
}
#endif
#ifdef HAS_ARGBTOUV444ROW_SSSE3
void ARGBToUV444MatrixRow_SSSE3(const uint8_t* src_argb,
uint8_t* dst_u,
uint8_t* dst_v,
@ -1724,8 +1700,6 @@ void ARGBToUV444MatrixRow_AVX2(const uint8_t* src_argb,
#endif // HAS_ARGBTOUV444ROW_AVX2
#ifdef HAS_ARGBTOUV444ROW_AVX512BW
static const uint32_t kPermdARGBToY_AVX512BW[16] = {0, 4, 8, 12, 1, 5, 9, 13,
2, 6, 10, 14, 3, 7, 11, 15};
void ARGBToUV444MatrixRow_AVX512BW(const uint8_t* src_argb,
uint8_t* dst_u,
@ -1977,6 +1951,62 @@ void ARGBToUV444Row_SSSE3(const uint8_t* src_argb,
}
#endif // HAS_ARGBTOUV444ROW_SSSE3
// Legacy per-format RGB-to-Y entry points, now thin wrappers that
// forward to the shared ARGBToYMatrixRow_* kernels with the matching
// coefficient set (I601 = limited range, JPEG = full range).
#ifdef HAS_ARGBTOYROW_AVX2
void RGBAToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
ARGBToYMatrixRow_AVX2(src_argb, dst_y, width, &kRgbaI601Constants);
}
#endif
#ifdef HAS_ARGBTOYROW_AVX2
void BGRAToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
ARGBToYMatrixRow_AVX2(src_argb, dst_y, width, &kBgraI601Constants);
}
#endif
#ifdef HAS_ARGBTOYROW_AVX512BW
void ARGBToYRow_AVX512BW(const uint8_t* src_argb, uint8_t* dst_y, int width) {
ARGBToYMatrixRow_AVX512BW(src_argb, dst_y, width, &kArgbI601Constants);
}
#endif
#ifdef HAS_ARGBTOYROW_AVX512BW
void ARGBToYJRow_AVX512BW(const uint8_t* src_argb, uint8_t* dst_y, int width) {
ARGBToYMatrixRow_AVX512BW(src_argb, dst_y, width, &kArgbJPEGConstants);
}
#endif
#ifdef HAS_ARGBTOYROW_AVX512BW
void ABGRToYRow_AVX512BW(const uint8_t* src_argb, uint8_t* dst_y, int width) {
ARGBToYMatrixRow_AVX512BW(src_argb, dst_y, width, &kAbgrI601Constants);
}
#endif
#ifdef HAS_ARGBTOYROW_AVX512BW
void ABGRToYJRow_AVX512BW(const uint8_t* src_argb, uint8_t* dst_y, int width) {
ARGBToYMatrixRow_AVX512BW(src_argb, dst_y, width, &kAbgrJPEGConstants);
}
#endif
#ifdef HAS_ARGBTOYROW_AVX512BW
void RGBAToYRow_AVX512BW(const uint8_t* src_argb, uint8_t* dst_y, int width) {
ARGBToYMatrixRow_AVX512BW(src_argb, dst_y, width, &kRgbaI601Constants);
}
#endif
#ifdef HAS_ARGBTOYROW_AVX512BW
void RGBAToYJRow_AVX512BW(const uint8_t* src_argb, uint8_t* dst_y, int width) {
ARGBToYMatrixRow_AVX512BW(src_argb, dst_y, width, &kRgbaJPEGConstants);
}
#endif
#ifdef HAS_ARGBTOYROW_AVX512BW
void BGRAToYRow_AVX512BW(const uint8_t* src_argb, uint8_t* dst_y, int width) {
ARGBToYMatrixRow_AVX512BW(src_argb, dst_y, width, &kBgraI601Constants);
}
#endif
#ifdef HAS_ARGBTOUV444ROW_AVX2
void ARGBToUV444Row_AVX2(const uint8_t* src_argb,
uint8_t* dst_u,
@ -2127,58 +2157,16 @@ void ABGRToUVJRow_AVX2(const uint8_t* src_abgr,
}
#endif // HAS_ABGRTOUVJROW_AVX2
void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
asm volatile(
"movdqa %3,%%xmm4 \n"
"movdqa %4,%%xmm5 \n"
"movdqa %5,%%xmm7 \n"
LABELALIGN "" //
RGBTOY(xmm7)
: "+r"(src_bgra), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "m"(kBGRAToY), // %3
"m"(kSub128), // %4
"m"(kAddY16) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
void BGRAToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
ARGBToYMatrixRow_SSSE3(src_argb, dst_y, width, &kBgraI601Constants);
}
void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
asm volatile(
"movdqa %3,%%xmm4 \n"
"movdqa %4,%%xmm5 \n"
"movdqa %5,%%xmm7 \n"
LABELALIGN "" //
RGBTOY(xmm7)
: "+r"(src_abgr), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "m"(kABGRToY), // %3
"m"(kSub128), // %4
"m"(kAddY16) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
void ABGRToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
ARGBToYMatrixRow_SSSE3(src_argb, dst_y, width, &kAbgrI601Constants);
}
void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
asm volatile(
"movdqa %3,%%xmm4 \n"
"movdqa %4,%%xmm5 \n"
"movdqa %5,%%xmm7 \n"
LABELALIGN "" //
RGBTOY(xmm7)
: "+r"(src_rgba), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "m"(kRGBAToY), // %3
"m"(kSub128), // %4
"m"(kAddY16) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
void RGBAToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
ARGBToYMatrixRow_SSSE3(src_argb, dst_y, width, &kRgbaI601Constants);
}
#if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2)