ARGBToUV Matrix for AVX2 and SSSE3

- Round before shifting to 8 bit to match NEON
  - RAWToARGB use unaligned loads and port to AVX2

Was C/SSSE3/AVX2
ARGBToI444_Opt (343 ms)
ARGBToJ444_Opt (677 ms)
RAWToI444_Opt (405 ms)
RAWToJ444_Opt (803 ms)

Now AVX2
ARGBToI444_Opt (283 ms)
ARGBToJ444_Opt (284 ms)
RAWToI444_Opt (316 ms)
RAWToJ444_Opt (339 ms)

Profile Now AVX2
  38.31%  ARGBToUVJ444Row_AVX2
  32.31%  RAWToARGBRow_AVX2
  23.99%  ARGBToYJRow_AVX2

Profile Was C/SSSE3/AVX2
    73.15%  ARGBToUVJ444Row_C
    15.74%  RAWToARGBRow_SSSE3
     8.87%  ARGBToYJRow_AVX2

Bug: 381138208
Change-Id: I696b2d83435bc985aa38df831e01ff1a658da56e
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6231592
Reviewed-by: Wan-Teh Chang <wtc@google.com>
Reviewed-by: Ben Weiss <bweiss@google.com>
Reviewed-by: richard winterton <rrwinterton@gmail.com>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
Frank Barchard 2025-02-10 18:19:58 -08:00 committed by libyuv LUCI CQ
parent d32d19ccf2
commit 61354d2671
9 changed files with 557 additions and 477 deletions

View File

@ -1,6 +1,6 @@
Name: libyuv
URL: https://chromium.googlesource.com/libyuv/libyuv/
Version: 1904
Version: 1905
License: BSD
License File: LICENSE
Shipped: yes

View File

@ -88,7 +88,6 @@ extern "C" {
#define HAS_YUY2TOYROW_SSE2
#if !defined(LIBYUV_BIT_EXACT)
#define HAS_ABGRTOUVROW_SSSE3
#define HAS_ARGBTOUV444ROW_SSSE3
#define HAS_ARGBTOUVJROW_SSSE3
#define HAS_ARGBTOUVROW_SSSE3
#define HAS_BGRATOUVROW_SSSE3
@ -234,6 +233,8 @@ extern "C" {
#define HAS_ARGBTOAB64ROW_SSSE3
#define HAS_ARGBTOAR30ROW_SSSE3
#define HAS_ARGBTOAR64ROW_SSSE3
#define HAS_ARGBTOUV444ROW_SSSE3
#define HAS_ARGBTOUVJ444ROW_SSSE3
#define HAS_ARGBUNATTENUATEROW_SSE2
#define HAS_CONVERT16TO8ROW_SSSE3
#define HAS_CONVERT8TO16ROW_SSE2
@ -259,13 +260,14 @@ extern "C" {
#define HAS_P210TOARGBROW_SSSE3
#define HAS_P410TOAR30ROW_SSSE3
#define HAS_P410TOARGBROW_SSSE3
#define HAS_RAWTOARGBROW_AVX2
#define HAS_RAWTORGBAROW_SSSE3
#define HAS_RGB24MIRRORROW_SSSE3
#define HAS_RGBATOYJROW_SSSE3
#define HAS_SPLITARGBROW_SSE2
#define HAS_SPLITARGBROW_SSSE3
#define HAS_SPLITRGBROW_SSSE3
#define HAS_SPLITRGBROW_SSE41
#define HAS_SPLITRGBROW_SSSE3
#define HAS_SPLITXRGBROW_SSE2
#define HAS_SPLITXRGBROW_SSSE3
#define HAS_SWAPUVROW_SSSE3
@ -298,6 +300,8 @@ extern "C" {
#define HAS_ARGBTOAR64ROW_AVX2
#define HAS_ARGBTORAWROW_AVX2
#define HAS_ARGBTORGB24ROW_AVX2
#define HAS_ARGBTOUV444ROW_AVX2
#define HAS_ARGBTOUVJ444ROW_AVX2
#define HAS_ARGBUNATTENUATEROW_AVX2
#define HAS_CONVERT16TO8ROW_AVX2
#define HAS_CONVERT8TO16ROW_AVX2
@ -332,8 +336,8 @@ extern "C" {
#define HAS_P410TOAR30ROW_AVX2
#define HAS_P410TOARGBROW_AVX2
#define HAS_RGBATOYJROW_AVX2
#define HAS_SPLITRGBROW_AVX2
#define HAS_SPLITARGBROW_AVX2
#define HAS_SPLITRGBROW_AVX2
#define HAS_SPLITUVROW_16_AVX2
#define HAS_SPLITXRGBROW_AVX2
#define HAS_SWAPUVROW_AVX2
@ -2699,6 +2703,33 @@ void ARGBToUV444Row_Any_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_v,
int width);
void ARGBToUVJ444Row_SSSE3(const uint8_t* src_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void ARGBToUVJ444Row_Any_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void ARGBToUV444Row_AVX2(const uint8_t* src_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void ARGBToUV444Row_Any_AVX2(const uint8_t* src_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void ARGBToUVJ444Row_AVX2(const uint8_t* src_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void ARGBToUVJ444Row_Any_AVX2(const uint8_t* src_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void ARGBToUV444Row_C(const uint8_t* src_argb,
uint8_t* dst_u,
uint8_t* dst_v,
@ -3853,6 +3884,7 @@ void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
uint8_t* dst_argb,
int width);
void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width);
void RAWToARGBRow_AVX2(const uint8_t* src_raw, uint8_t* dst_argb, int width);
void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width);
void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgb24, int width);
void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
@ -3955,6 +3987,7 @@ void RGB24ToARGBRow_Any_SSSE3(const uint8_t* src_ptr,
void RAWToARGBRow_Any_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
void RAWToARGBRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RAWToRGBARow_Any_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);

View File

@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1904
#define LIBYUV_VERSION 1905
#endif // INCLUDE_LIBYUV_VERSION_H_

View File

@ -3265,6 +3265,14 @@ int RAWToI420(const uint8_t* src_raw,
}
}
#endif
#if defined(HAS_RAWTOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
RAWToARGBRow = RAWToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
RAWToARGBRow = RAWToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYRow = ARGBToYRow_Any_SSSE3;
@ -3443,6 +3451,14 @@ int RAWToJ420(const uint8_t* src_raw,
}
}
#endif
#if defined(HAS_RAWTOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
RAWToARGBRow = RAWToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
RAWToARGBRow = RAWToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOYJROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
@ -3559,6 +3575,14 @@ int RAWToI444(const uint8_t* src_raw,
}
}
#endif
#if defined(HAS_ARGBTOUV444ROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToUV444Row = ARGBToUV444Row_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToUV444Row = ARGBToUV444Row_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOUV444ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToUV444Row = ARGBToUV444Row_Any_NEON;
@ -3669,6 +3693,14 @@ int RAWToI444(const uint8_t* src_raw,
}
}
#endif
#if defined(HAS_RAWTOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
RAWToARGBRow = RAWToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
RAWToARGBRow = RAWToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_RAWTOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
RAWToARGBRow = RAWToARGBRow_Any_NEON;
@ -3762,7 +3794,6 @@ int RAWToJ444(const uint8_t* src_raw,
src_stride_raw = -src_stride_raw;
}
// TODO: add row coalesce when main loop handles large width in blocks
// TODO: implement UVJ444 or trim the ifdef below
#if defined(HAS_ARGBTOUVJ444ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVJ444Row = ARGBToUVJ444Row_Any_SSSE3;
@ -3771,6 +3802,14 @@ int RAWToJ444(const uint8_t* src_raw,
}
}
#endif
#if defined(HAS_ARGBTOUVJ444ROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToUVJ444Row = ARGBToUVJ444Row_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToUVJ444Row = ARGBToUVJ444Row_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOUVJ444ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToUVJ444Row = ARGBToUVJ444Row_Any_NEON;
@ -3881,6 +3920,14 @@ int RAWToJ444(const uint8_t* src_raw,
}
}
#endif
#if defined(HAS_RAWTOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
RAWToARGBRow = RAWToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
RAWToARGBRow = RAWToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_RAWTOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
RAWToARGBRow = RAWToARGBRow_Any_NEON;

View File

@ -3833,6 +3833,14 @@ int RAWToARGB(const uint8_t* src_raw,
}
}
#endif
#if defined(HAS_RAWTOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
RAWToARGBRow = RAWToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
RAWToARGBRow = RAWToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_RAWTOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
RAWToARGBRow = RAWToARGBRow_Any_NEON;

View File

@ -60,6 +60,14 @@ int ARGBToI444(const uint8_t* src_argb,
}
}
#endif
#if defined(HAS_ARGBTOUV444ROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToUV444Row = ARGBToUV444Row_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToUV444Row = ARGBToUV444Row_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOUV444ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToUV444Row = ARGBToUV444Row_Any_NEON;
@ -2445,6 +2453,14 @@ int ARGBToJ444(const uint8_t* src_argb,
}
}
#endif
#if defined(HAS_ARGBTOUVJ444ROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToUVJ444Row = ARGBToUVJ444Row_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToUVJ444Row = ARGBToUVJ444Row_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOUVJ444ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToUVJ444Row = ARGBToUVJ444Row_Any_NEON;
@ -3630,6 +3646,14 @@ int RAWToJNV21(const uint8_t* src_raw,
}
}
#endif
#if defined(HAS_RAWTOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
RAWToARGBRow = RAWToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
RAWToARGBRow = RAWToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOYJROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYJRow = ARGBToYJRow_Any_SSSE3;

View File

@ -1033,6 +1033,9 @@ ANY11(RGB565ToARGBRow_Any_SSE2, RGB565ToARGBRow_SSE2, 0, 2, 4, 7)
ANY11(ARGB1555ToARGBRow_Any_SSE2, ARGB1555ToARGBRow_SSE2, 0, 2, 4, 7)
ANY11(ARGB4444ToARGBRow_Any_SSE2, ARGB4444ToARGBRow_SSE2, 0, 2, 4, 7)
#endif
#if defined(HAS_RAWTOARGBROW_AVX2)
ANY11(RAWToARGBRow_Any_AVX2, RAWToARGBRow_AVX2, 0, 3, 4, 31)
#endif
#if defined(HAS_RAWTORGBAROW_SSSE3)
ANY11(RAWToRGBARow_Any_SSSE3, RAWToRGBARow_SSSE3, 0, 3, 4, 15)
#endif
@ -2145,6 +2148,15 @@ ANY12(SplitUVRow_Any_LSX, SplitUVRow_LSX, 0, 2, 0, 31)
#ifdef HAS_ARGBTOUV444ROW_SSSE3
ANY12(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_SSSE3, 0, 4, 0, 15)
#endif
#ifdef HAS_ARGBTOUVJ444ROW_SSSE3
ANY12(ARGBToUVJ444Row_Any_SSSE3, ARGBToUVJ444Row_SSSE3, 0, 4, 0, 15)
#endif
#ifdef HAS_ARGBTOUV444ROW_AVX2
ANY12(ARGBToUV444Row_Any_AVX2, ARGBToUV444Row_AVX2, 0, 4, 0, 31)
#endif
#ifdef HAS_ARGBTOUVJ444ROW_AVX2
ANY12(ARGBToUVJ444Row_Any_AVX2, ARGBToUVJ444Row_AVX2, 0, 4, 0, 31)
#endif
#ifdef HAS_YUY2TOUV422ROW_AVX2
ANY12(YUY2ToUV422Row_Any_AVX2, YUY2ToUV422Row_AVX2, 1, 4, 1, 31)
ANY12(UYVYToUV422Row_Any_AVX2, UYVYToUV422Row_AVX2, 1, 4, 1, 31)

View File

@ -47,7 +47,6 @@ extern "C" {
#if !defined(LIBYUV_BIT_EXACT) && (defined(__x86_64__) || defined(_M_X64) || \
defined(__i386__) || defined(_M_IX86))
#define LIBYUV_ARGBTOUV_PAVGB 1
#define LIBYUV_RGBTOU_TRUNCATE 1
#endif
#if defined(LIBYUV_BIT_EXACT)
#define LIBYUV_UNATTENUATE_DUP 1
@ -626,10 +625,16 @@ void AR64ShuffleRow_C(const uint8_t* src_ar64,
}
#ifdef LIBYUV_RGB7
// Old 7 bit math for compatibility on unsupported platforms.
// Old 7 bit math for Visual C
static __inline uint8_t RGBToY(uint8_t r, uint8_t g, uint8_t b) {
return STATIC_CAST(uint8_t, ((33 * r + 65 * g + 13 * b) >> 7) + 16);
}
static __inline uint8_t RGBToU(uint8_t r, uint8_t g, uint8_t b) {
return STATIC_CAST(uint8_t, (112 * b - 74 * g - 38 * r + 0x8000) >> 8);
}
static __inline uint8_t RGBToV(uint8_t r, uint8_t g, uint8_t b) {
return STATIC_CAST(uint8_t, (112 * r - 94 * g - 18 * b + 0x8000) >> 8);
}
#else
// 8 bit
// Intel SSE/AVX uses the following equivalent formula
@ -640,20 +645,6 @@ static __inline uint8_t RGBToY(uint8_t r, uint8_t g, uint8_t b) {
static __inline uint8_t RGBToY(uint8_t r, uint8_t g, uint8_t b) {
return STATIC_CAST(uint8_t, (66 * r + 129 * g + 25 * b + 0x1080) >> 8);
}
#endif
#define AVGB(a, b) (((a) + (b) + 1) >> 1)
// LIBYUV_RGBTOU_TRUNCATE mimics x86 code that does not round.
#ifdef LIBYUV_RGBTOU_TRUNCATE
static __inline uint8_t RGBToU(uint8_t r, uint8_t g, uint8_t b) {
return STATIC_CAST(uint8_t, (112 * b - 74 * g - 38 * r + 0x8000) >> 8);
}
static __inline uint8_t RGBToV(uint8_t r, uint8_t g, uint8_t b) {
return STATIC_CAST(uint8_t, (112 * r - 94 * g - 18 * b + 0x8000) >> 8);
}
#else
// TODO(fbarchard): Add rounding to x86 SIMD and use this
static __inline uint8_t RGBToU(uint8_t r, uint8_t g, uint8_t b) {
return STATIC_CAST(uint8_t, (112 * b - 74 * g - 38 * r + 0x8080) >> 8);
}
@ -662,7 +653,9 @@ static __inline uint8_t RGBToV(uint8_t r, uint8_t g, uint8_t b) {
}
#endif
// ARM uses uint16
#define AVGB(a, b) (((a) + (b) + 1) >> 1)
// ARM uses uint16. TODO: Make ARM use uint8 to allow dotproduct.
#if !defined(LIBYUV_ARGBTOUV_PAVGB)
static __inline int RGBxToU(uint16_t r, uint16_t g, uint16_t b) {
return STATIC_CAST(uint8_t, (112 * b - 74 * g - 38 * r + 0x8080) >> 8);
@ -784,6 +777,16 @@ MAKEROWY(RAW, 0, 1, 2, 3)
// b -0.08131 * 255 = -20.73405 = -20
// g -0.41869 * 255 = -106.76595 = -107
// r 0.50000 * 255 = 127.5 = 127
// TODO: consider 256 for fixed point on UV
// JPeg 8 bit U:
// b 0.50000 * 256 = 128.0 = 128
// g -0.33126 * 256 = -84.80256 = -85
// r -0.16874 * 256 = -43.19744 = -43
// JPeg 8 bit V:
// b -0.08131 * 256 = -20.81536 = -21
// g -0.41869 * 256 = -107.18464 = -107
// r 0.50000 * 256 = 128.0 = 128
#ifdef LIBYUV_RGB7
// Old 7 bit math for compatibility on unsupported platforms.
@ -4379,13 +4382,17 @@ void RGB24ToYJRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
#endif // HAS_RGB24TOYJROW_AVX2
#ifdef HAS_RAWTOYJROW_AVX2
// Convert 16 RAW pixels (64 bytes) to 16 YJ values.
// Convert 32 RAW pixels (128 bytes) to 32 YJ values.
void RAWToYJRow_AVX2(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
// Row buffer for intermediate ARGB pixels.
SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
while (width > 0) {
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
#ifdef HAS_RAWTOARGBROW_AVX2
RAWToARGBRow_AVX2(src_raw, row, twidth);
#else
RAWToARGBRow_SSSE3(src_raw, row, twidth);
#endif
ARGBToYJRow_AVX2(row, dst_yj, twidth);
src_raw += twidth * 3;
dst_yj += twidth;

File diff suppressed because it is too large Load Diff