ARGBToUV SSE: use average of 4 pixels

- Previously the non-bit-exact SIMD path applied avgb (pavgb) twice, and the
  bit-exact build fell back to C.

On Skylake Xeon:

Now (bit-exact SSSE3):
ARGBToJ420_Opt (326 ms)

Was:
Exact C
ARGBToJ420_Opt (871 ms)
Not exact AVX2
ARGBToJ420_Opt (237 ms)
Not exact SSSE3
ARGBToJ420_Opt (312 ms)
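
Illustrative sketch (not part of this CL) of the one-LSB difference between the
two averaging schemes, using the same AVGB macro as row_common.cc:

// Sketch only: contrasts the old double-pavgb average with the exact
// rounded average of 4 used by the C reference and the new SSSE3 path.
#include <stdint.h>
#include <stdio.h>

#define AVGB(a, b) (((a) + (b) + 1) >> 1)  // same macro as row_common.cc

int main(void) {
  // One 2x2 block of a single channel; values chosen to show the difference.
  uint8_t p00 = 1, p01 = 0, p10 = 0, p11 = 0;
  uint8_t avg_twice = AVGB(AVGB(p00, p10), AVGB(p01, p11));        // rounds twice -> 1
  uint8_t avg_exact = (uint8_t)((p00 + p01 + p10 + p11 + 2) >> 2); // rounds once  -> 0
  printf("double pavgb: %u, exact: %u\n", avg_twice, avg_exact);
  return 0;
}

The new SSSE3 kernel computes the single-rounded form, so the SIMD and C
outputs now match bit for bit.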

Bug: 381138208
Change-Id: I6d1081bb52e36f06736c0c6575fa82bb2268629b
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6629605
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Ben Weiss <bweiss@google.com>
Frank Barchard 2025-06-16 16:56:37 -07:00
parent 889613683a
commit 6f729fbe65
7 changed files with 164 additions and 315 deletions


@@ -1,6 +1,6 @@
 Name: libyuv
 URL: https://chromium.googlesource.com/libyuv/libyuv/
-Version: 1912
+Version: 1913
 License: BSD-3-Clause
 License File: LICENSE
 Shipped: yes


@@ -79,12 +79,6 @@ extern "C" {
 #define HAS_YUY2TOUV422ROW_SSE2
 #define HAS_YUY2TOUVROW_SSE2
 #define HAS_YUY2TOYROW_SSE2
-#if !defined(LIBYUV_BIT_EXACT)
-#define HAS_ABGRTOUVROW_SSSE3
-#define HAS_ARGBTOUVROW_SSSE3
-#define HAS_BGRATOUVROW_SSSE3
-#define HAS_RGBATOUVROW_SSSE3
-#endif

 // Effects:
 #define HAS_ARGBADDROW_SSE2
@@ -272,11 +266,13 @@ extern "C" {
 #define HAS_RGB24TOYROW_SSSE3
 #define HAS_RGBATOYROW_SSSE3
-#if !defined(LIBYUV_BIT_EXACT)
 // TODO: adjust row_win to use 8 bit negative coefficients.
 #define HAS_ABGRTOUVJROW_SSSE3
 #define HAS_ARGBTOUVJROW_SSSE3
-#endif
+#define HAS_ABGRTOUVROW_SSSE3
+#define HAS_ARGBTOUVROW_SSSE3
+#define HAS_BGRATOUVROW_SSSE3
+#define HAS_RGBATOUVROW_SSSE3

 #if defined(__x86_64__) || !defined(__pic__)
 // TODO(fbarchard): fix build error on android_full_debug=1
@@ -350,12 +346,11 @@ extern "C" {
 #define HAS_SPLITXRGBROW_AVX2
 #define HAS_SWAPUVROW_AVX2
 #define HAS_YUY2TONVUVROW_AVX2
-#if !defined(LIBYUV_BIT_EXACT)
-#define HAS_ABGRTOUVJROW_AVX2
-#define HAS_ABGRTOUVROW_AVX2
-#define HAS_ARGBTOUVJROW_AVX2
-#define HAS_ARGBTOUVROW_AVX2
-#endif
+// TODO: Port SSSE3 to AVX2
+// #define HAS_ABGRTOUVJROW_AVX2
+// #define HAS_ABGRTOUVROW_AVX2
+// #define HAS_ARGBTOUVJROW_AVX2
+// #define HAS_ARGBTOUVROW_AVX2

 #if defined(__x86_64__) || !defined(__pic__)
 // TODO(fbarchard): fix build error on android_full_debug=1
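
For orientation, a hedged sketch of how these HAS_ macros are typically
consumed: compile-time availability plus a runtime TestCpuFlag() check selects
the row function, falling back to the C version (simplified; the real libyuv
dispatch also handles _Any variants and width alignment):

// Illustrative only; not part of this change.
#include "libyuv/cpu_id.h"
#include "libyuv/row.h"

typedef void (*ARGBToUVRowFn)(const uint8_t* src_argb, int src_stride_argb,
                              uint8_t* dst_u, uint8_t* dst_v, int width);

static ARGBToUVRowFn PickARGBToUVRow(void) {
  ARGBToUVRowFn fn = ARGBToUVRow_C;  // portable reference implementation
#if defined(HAS_ARGBTOUVROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
    fn = ARGBToUVRow_SSSE3;  // SIMD row, now bit-exact with the C path
  }
#endif
  return fn;
}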


@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_

-#define LIBYUV_VERSION 1912
+#define LIBYUV_VERSION 1913

 #endif  // INCLUDE_LIBYUV_VERSION_H_


@@ -36,10 +36,6 @@ extern "C" {
 // LIBYUV_UNLIMITED_BT709
 // LIBYUV_UNLIMITED_BT2020

-#if !defined(LIBYUV_BIT_EXACT) && (defined(__x86_64__) || defined(_M_X64) || \
-                                   defined(__i386__) || defined(_M_IX86))
-#define LIBYUV_ARGBTOUV_PAVGB 1
-#endif
 #if defined(LIBYUV_BIT_EXACT)
 #define LIBYUV_UNATTENUATE_DUP 1
 #endif
@@ -627,7 +623,6 @@ void AR64ShuffleRow_C(const uint8_t* src_ar64,
 // b -0.1406 * 128 = 17.9968 = -18
 // g -0.7344 * 128 = 94.0032 = -94
 // r 0.875 * 128 = 112.0 = 112
-
 static __inline uint8_t RGBToY(uint8_t r, uint8_t g, uint8_t b) {
   return STATIC_CAST(uint8_t, (66 * r + 129 * g + 25 * b + 0x1080) >> 8);
 }
@@ -639,46 +634,6 @@ static __inline uint8_t RGBToV(uint8_t r, uint8_t g, uint8_t b) {
 }

 #define AVGB(a, b) (((a) + (b) + 1) >> 1)
-// ARGBToY_C and ARGBToUV_C
-// Intel version of UV mimic SSE/AVX which does 2 pavgb
-#if defined(LIBYUV_ARGBTOUV_PAVGB)
-#define MAKEROWY(NAME, R, G, B, BPP) \
-  void NAME##ToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \
-    int x; \
-    for (x = 0; x < width; ++x) { \
-      dst_y[0] = RGBToY(src_rgb[R], src_rgb[G], src_rgb[B]); \
-      src_rgb += BPP; \
-      dst_y += 1; \
-    } \
-  } \
-  void NAME##ToUVRow_C(const uint8_t* src_rgb, int src_stride_rgb, \
-                       uint8_t* dst_u, uint8_t* dst_v, int width) { \
-    const uint8_t* src_rgb1 = src_rgb + src_stride_rgb; \
-    int x; \
-    for (x = 0; x < width - 1; x += 2) { \
-      uint8_t ab = AVGB(AVGB(src_rgb[B], src_rgb1[B]), \
-                        AVGB(src_rgb[B + BPP], src_rgb1[B + BPP])); \
-      uint8_t ag = AVGB(AVGB(src_rgb[G], src_rgb1[G]), \
-                        AVGB(src_rgb[G + BPP], src_rgb1[G + BPP])); \
-      uint8_t ar = AVGB(AVGB(src_rgb[R], src_rgb1[R]), \
-                        AVGB(src_rgb[R + BPP], src_rgb1[R + BPP])); \
-      dst_u[0] = RGBToU(ar, ag, ab); \
-      dst_v[0] = RGBToV(ar, ag, ab); \
-      src_rgb += BPP * 2; \
-      src_rgb1 += BPP * 2; \
-      dst_u += 1; \
-      dst_v += 1; \
-    } \
-    if (width & 1) { \
-      uint8_t ab = AVGB(src_rgb[B], src_rgb1[B]); \
-      uint8_t ag = AVGB(src_rgb[G], src_rgb1[G]); \
-      uint8_t ar = AVGB(src_rgb[R], src_rgb1[R]); \
-      dst_u[0] = RGBToU(ar, ag, ab); \
-      dst_v[0] = RGBToV(ar, ag, ab); \
-    } \
-  }
-#else
-// ARM version does average of 4 pixels with rounding
 #define MAKEROWY(NAME, R, G, B, BPP) \
   void NAME##ToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \
     int x; \
@@ -717,7 +672,6 @@ static __inline uint8_t RGBToV(uint8_t r, uint8_t g, uint8_t b) {
       dst_v[0] = RGBToV(ar, ag, ab); \
     } \
   }
-#endif

 MAKEROWY(ARGB, 2, 1, 0, 4)
 MAKEROWY(BGRA, 1, 2, 3, 4)
@@ -756,45 +710,6 @@ static __inline uint8_t RGBToVJ(uint8_t r, uint8_t g, uint8_t b) {
 }

 // ARGBToYJ_C and ARGBToUVJ_C
-// Intel version mimic SSE/AVX which does 2 pavgb
-#if defined(LIBYUV_ARGBTOUV_PAVGB)
-#define MAKEROWYJ(NAME, R, G, B, BPP) \
-  void NAME##ToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \
-    int x; \
-    for (x = 0; x < width; ++x) { \
-      dst_y[0] = RGBToYJ(src_rgb[R], src_rgb[G], src_rgb[B]); \
-      src_rgb += BPP; \
-      dst_y += 1; \
-    } \
-  } \
-  void NAME##ToUVJRow_C(const uint8_t* src_rgb, int src_stride_rgb, \
-                        uint8_t* dst_u, uint8_t* dst_v, int width) { \
-    const uint8_t* src_rgb1 = src_rgb + src_stride_rgb; \
-    int x; \
-    for (x = 0; x < width - 1; x += 2) { \
-      uint8_t ab = AVGB(AVGB(src_rgb[B], src_rgb1[B]), \
-                        AVGB(src_rgb[B + BPP], src_rgb1[B + BPP])); \
-      uint8_t ag = AVGB(AVGB(src_rgb[G], src_rgb1[G]), \
-                        AVGB(src_rgb[G + BPP], src_rgb1[G + BPP])); \
-      uint8_t ar = AVGB(AVGB(src_rgb[R], src_rgb1[R]), \
-                        AVGB(src_rgb[R + BPP], src_rgb1[R + BPP])); \
-      dst_u[0] = RGBToUJ(ar, ag, ab); \
-      dst_v[0] = RGBToVJ(ar, ag, ab); \
-      src_rgb += BPP * 2; \
-      src_rgb1 += BPP * 2; \
-      dst_u += 1; \
-      dst_v += 1; \
-    } \
-    if (width & 1) { \
-      uint8_t ab = AVGB(src_rgb[B], src_rgb1[B]); \
-      uint8_t ag = AVGB(src_rgb[G], src_rgb1[G]); \
-      uint8_t ar = AVGB(src_rgb[R], src_rgb1[R]); \
-      dst_u[0] = RGBToUJ(ar, ag, ab); \
-      dst_v[0] = RGBToVJ(ar, ag, ab); \
-    } \
-  }
-#else
-// ARM version does average of 4 pixels with rounding
 #define MAKEROWYJ(NAME, R, G, B, BPP) \
   void NAME##ToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \
     int x; \
@@ -834,8 +749,6 @@ static __inline uint8_t RGBToVJ(uint8_t r, uint8_t g, uint8_t b) {
     } \
   }

-#endif
-
 MAKEROWYJ(ARGB, 2, 1, 0, 4)
 MAKEROWYJ(ABGR, 0, 1, 2, 4)
 MAKEROWYJ(RGBA, 3, 2, 1, 4)
@@ -928,19 +841,11 @@ void RGB565ToUVRow_C(const uint8_t* src_rgb565,
     g3 = STATIC_CAST(uint8_t, (g3 << 2) | (g3 >> 4));
     r3 = STATIC_CAST(uint8_t, (r3 << 3) | (r3 >> 2));

-#if defined(LIBYUV_ARGBTOUV_PAVGB)
-    uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3));
-    uint8_t ag = AVGB(AVGB(g0, g2), AVGB(g1, g3));
-    uint8_t ar = AVGB(AVGB(r0, r2), AVGB(r1, r3));
-    dst_u[0] = RGBToU(ar, ag, ab);
-    dst_v[0] = RGBToV(ar, ag, ab);
-#else
     uint8_t b = (b0 + b1 + b2 + b3 + 2) >> 2;
     uint8_t g = (g0 + g1 + g2 + g3 + 2) >> 2;
     uint8_t r = (r0 + r1 + r2 + r3 + 2) >> 2;
     dst_u[0] = RGBToU(r, g, b);
     dst_v[0] = RGBToV(r, g, b);
-#endif

     src_rgb565 += 4;
     next_rgb565 += 4;
@@ -1009,19 +914,11 @@ void ARGB1555ToUVRow_C(const uint8_t* src_argb1555,
     g3 = STATIC_CAST(uint8_t, (g3 << 3) | (g3 >> 2));
     r3 = STATIC_CAST(uint8_t, (r3 << 3) | (r3 >> 2));

-#if defined(LIBYUV_ARGBTOUV_PAVGB)
-    uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3));
-    uint8_t ag = AVGB(AVGB(g0, g2), AVGB(g1, g3));
-    uint8_t ar = AVGB(AVGB(r0, r2), AVGB(r1, r3));
-    dst_u[0] = RGBToU(ar, ag, ab);
-    dst_v[0] = RGBToV(ar, ag, ab);
-#else
     uint8_t b = (b0 + b1 + b2 + b3 + 2) >> 2;
     uint8_t g = (g0 + g1 + g2 + g3 + 2) >> 2;
     uint8_t r = (r0 + r1 + r2 + r3 + 2) >> 2;
     dst_u[0] = RGBToU(r, g, b);
     dst_v[0] = RGBToV(r, g, b);
-#endif

     src_argb1555 += 4;
     next_argb1555 += 4;
@@ -1087,19 +984,11 @@ void ARGB4444ToUVRow_C(const uint8_t* src_argb4444,
     g3 = STATIC_CAST(uint8_t, (g3 << 4) | g3);
     r3 = STATIC_CAST(uint8_t, (r3 << 4) | r3);

-#if defined(LIBYUV_ARGBTOUV_PAVGB)
-    uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3));
-    uint8_t ag = AVGB(AVGB(g0, g2), AVGB(g1, g3));
-    uint8_t ar = AVGB(AVGB(r0, r2), AVGB(r1, r3));
-    dst_u[0] = RGBToU(ar, ag, ab);
-    dst_v[0] = RGBToV(ar, ag, ab);
-#else
     uint8_t b = (b0 + b1 + b2 + b3 + 2) >> 2;
     uint8_t g = (g0 + g1 + g2 + g3 + 2) >> 2;
     uint8_t r = (r0 + r1 + r2 + r3 + 2) >> 2;
     dst_u[0] = RGBToU(r, g, b);
     dst_v[0] = RGBToV(r, g, b);
-#endif

     src_argb4444 += 4;
     next_argb4444 += 4;


@@ -1733,161 +1733,71 @@ void ARGBToUV444MatrixRow_AVX2(const uint8_t* src_argb,
 #endif  // HAS_ARGBTOUV444ROW_AVX2

 #ifdef HAS_ARGBTOUVROW_SSSE3
-void OMITFP
-ARGBToUVMatrixRow_SSSE3(const uint8_t* src_argb,
+// 8x2 -> 4x1 ARGB pixels converted to 4 U and 4 V
+// ARGBToUV does rounding average of 4 ARGB pixels
+void ARGBToUVMatrixRow_SSSE3(const uint8_t* src_argb,
                              int src_stride_argb,
                              uint8_t* dst_u,
                              uint8_t* dst_v,
                              int width,
                              const struct RgbUVConstants* rgbuvconstants) {
   asm volatile(
-      "movdqa %0,%%xmm3 \n"
-      "movdqa %1,%%xmm4 \n"
-      "movdqa %2,%%xmm5 \n"
-      :
-      : "m"(rgbuvconstants->kRGBToU),  // %0
-        "m"(rgbuvconstants->kRGBToV),  // %1
-        "m"(kAddUV128)                 // %2
-      : "xmm3", "xmm4", "xmm5");
-  asm volatile("sub %1,%2 \n"
-      LABELALIGN
-      "1: \n"
-      "movdqu (%0),%%xmm0 \n"
-      "movdqu 0x00(%0,%4,1),%%xmm7 \n"
-      "pavgb %%xmm7,%%xmm0 \n"
-      "movdqu 0x10(%0),%%xmm1 \n"
-      "movdqu 0x10(%0,%4,1),%%xmm7 \n"
-      "pavgb %%xmm7,%%xmm1 \n"
-      "movdqu 0x20(%0),%%xmm2 \n"
-      "movdqu 0x20(%0,%4,1),%%xmm7 \n"
-      "pavgb %%xmm7,%%xmm2 \n"
-      "movdqu 0x30(%0),%%xmm6 \n"
-      "movdqu 0x30(%0,%4,1),%%xmm7 \n"
-      "pavgb %%xmm7,%%xmm6 \n"
-      "lea 0x40(%0),%0 \n"
-      "movdqa %%xmm0,%%xmm7 \n"
-      "shufps $0x88,%%xmm1,%%xmm0 \n"
-      "shufps $0xdd,%%xmm1,%%xmm7 \n"
-      "pavgb %%xmm7,%%xmm0 \n"
-      "movdqa %%xmm2,%%xmm7 \n"
-      "shufps $0x88,%%xmm6,%%xmm2 \n"
-      "shufps $0xdd,%%xmm6,%%xmm7 \n"
-      "pavgb %%xmm7,%%xmm2 \n"
-      "movdqa %%xmm0,%%xmm1 \n"
-      "movdqa %%xmm2,%%xmm6 \n"
-      "pmaddubsw %%xmm3,%%xmm0 \n"
-      "pmaddubsw %%xmm3,%%xmm2 \n"
-      "pmaddubsw %%xmm4,%%xmm1 \n"
-      "pmaddubsw %%xmm4,%%xmm6 \n"
-      "phaddw %%xmm2,%%xmm0 \n"
-      "phaddw %%xmm6,%%xmm1 \n"
-      "movdqa %%xmm5,%%xmm2 \n"
-      "movdqa %%xmm5,%%xmm6 \n"
-      "psubw %%xmm0,%%xmm2 \n"
-      "psubw %%xmm1,%%xmm6 \n"
-      "psrlw $0x8,%%xmm2 \n"
-      "psrlw $0x8,%%xmm6 \n"
-      "packuswb %%xmm6,%%xmm2 \n"
-      "movlps %%xmm2,(%1) \n"
-      "movhps %%xmm2,0x00(%1,%2,1) \n"
-      "lea 0x8(%1),%1 \n"
-      "subl $0x10,%3 \n"
-      "jg 1b \n"
-      : "+r"(src_argb),  // %0
-        "+r"(dst_u),     // %1
-        "+r"(dst_v),     // %2
-#if defined(__i386__)
-        "+m"(width)      // %3
-#else
-        "+rm"(width)     // %3
-#endif
-      : "r"((intptr_t)(src_stride_argb))  // %4
-      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
-        "xmm6", "xmm7");
-}
-#endif  // HAS_ARGBTOUVROW_SSSE3
-
-// vpshufb for vphaddw + vpackuswb packed to shorts.
-// Coefficients expressed as negatives to allow 128
-struct UVMatrixConstants {
-  lvec8 kShufARGBToUV;
-  ulvec8 kAddUV128;
-};
-
-static const UVMatrixConstants kShufARGBToUV_AVX = {
-    0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
-    0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
-    0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128,
-    0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128};
-
-void OMITFP
-ARGBToUVMatrixRow_AVX2(const uint8_t* src_argb,
-                       int src_stride_argb,
-                       uint8_t* dst_u,
-                       uint8_t* dst_v,
-                       int width,
-                       const struct RgbUVConstants* rgbuvconstants) {
-  asm volatile(
-      "vbroadcastf128 %0,%%ymm6 \n"
-      "vbroadcastf128 %1,%%ymm7 \n"
-      :
-      : "m"(rgbuvconstants->kRGBToU),  // %0
-        "m"(rgbuvconstants->kRGBToV)   // %1
-      :);
-  asm volatile(
-#if !defined(__i386__)
-      "vmovdqa 0(%5),%%ymm8 \n"
-#endif
-      "vmovdqa 32(%5),%%ymm5 \n"
+      "pcmpeqb %%xmm4,%%xmm4 \n"  // 0x0101
+      "pabsb %%xmm4,%%xmm4 \n"
+      "movdqa %5,%%xmm6 \n"  // ARGB to U
+      "movdqa %6,%%xmm7 \n"  // ARGB to V
       "sub %1,%2 \n"
      LABELALIGN
       "1: \n"
-      "vmovdqu (%0),%%ymm0 \n"
-      "vmovdqu 0x20(%0),%%ymm1 \n"
-      "vmovdqu 0x40(%0),%%ymm2 \n"
-      "vmovdqu 0x60(%0),%%ymm3 \n"
-      "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
-      "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
-      "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
-      "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
-      "lea 0x80(%0),%0 \n"
-      "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
-      "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
-      "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
-      "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
-      "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
-      "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
-      "vpmaddubsw %%ymm6,%%ymm0,%%ymm1 \n"
-      "vpmaddubsw %%ymm6,%%ymm2,%%ymm3 \n"
-      "vpmaddubsw %%ymm7,%%ymm0,%%ymm0 \n"
-      "vpmaddubsw %%ymm7,%%ymm2,%%ymm2 \n"
-      "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
-      "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
-      "vpsubw %%ymm0,%%ymm5,%%ymm0 \n"
-      "vpsubw %%ymm1,%%ymm5,%%ymm1 \n"
-      "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
-      "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
-      "vpackuswb %%ymm0,%%ymm1,%%ymm0 \n"
-      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
-#if defined(__i386__)
-      "vpshufb (%5),%%ymm0,%%ymm0 \n"
-#else
-      "vpshufb %%ymm8,%%ymm0,%%ymm0 \n"
-#endif
-      "vextractf128 $0x0,%%ymm0,(%1) \n"
-      "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n"
-      "lea 0x10(%1),%1 \n"
-      "subl $0x20,%3 \n"
+      "movdqu (%0),%%xmm0 \n"  // Read 8 ARGB Pixels
+      "movdqu 0x10(%0),%%xmm5 \n"
+      "movdqa %%xmm0,%%xmm1 \n"
+      "shufps $0x88,%%xmm5,%%xmm0 \n"  // Even pixels
+      "shufps $0xdd,%%xmm5,%%xmm1 \n"  // Odd pixels
+      "movdqa %%xmm0,%%xmm5 \n"
+      "punpcklbw %%xmm1,%%xmm0 \n"  // aarrgbb
+      "punpckhbw %%xmm5,%%xmm1 \n"
+      "pmaddubsw %%xmm4,%%xmm0 \n"  // paired add argb
+      "pmaddubsw %%xmm4,%%xmm1 \n"
+      "movdqu 0x00(%0,%4,1),%%xmm2 \n"  // Read 2nd row
+      "movdqu 0x10(%0,%4,1),%%xmm5 \n"
+      "movdqa %%xmm2,%%xmm3 \n"
+      "shufps $0x88,%%xmm5,%%xmm2 \n"  // Even
+      "shufps $0xdd,%%xmm5,%%xmm3 \n"  // Odd pixels
+      "movdqa %%xmm2,%%xmm5 \n"
+      "punpcklbw %%xmm3,%%xmm2 \n"  // aarrgbb
+      "punpckhbw %%xmm5,%%xmm3 \n"
+      "pmaddubsw %%xmm4,%%xmm2 \n"  // argb
+      "pmaddubsw %%xmm4,%%xmm3 \n"
+      "pxor %%xmm5,%%xmm5 \n"  // constant 0 for pavgw
+      "paddw %%xmm2,%%xmm0 \n"
+      "paddw %%xmm3,%%xmm1 \n"
+      "psrlw $1,%%xmm0 \n"  // round
+      "psrlw $1,%%xmm1 \n"
+      "pavgw %%xmm5,%%xmm0 \n"
+      "pavgw %%xmm5,%%xmm1 \n"
+      "packuswb %%xmm1,%%xmm0 \n"  // 4 ARGB pixels
+      "movdqa %%xmm0,%%xmm1 \n"
+      "pmaddubsw %%xmm6,%%xmm0 \n"  // u
+      "pmaddubsw %%xmm7,%%xmm1 \n"  // v
+      "phaddw %%xmm1,%%xmm0 \n"  // uuuuvvvv
+      "movdqa %7,%%xmm2 \n"  // 0x8000
+      "psubw %%xmm0,%%xmm2 \n"  // unsigned 0 to 0xffff
+      "psrlw $0x8,%%xmm2 \n"
+      "packuswb %%xmm2,%%xmm2 \n"
+      "movd %%xmm2,(%1) \n"  // Write 4 U's
+      "shufps $0xdd,%%xmm2,%%xmm2 \n"
+      "movd %%xmm2,0x00(%1,%2,1) \n"  // Write 4 V's
+      "lea 0x20(%0),%0 \n"
+      "lea 0x4(%1),%1 \n"
+      "subl $0x8,%3 \n"
       "jg 1b \n"
-      "vzeroupper \n"
       : "+r"(src_argb),  // %0
         "+r"(dst_u),     // %1
         "+r"(dst_v),     // %2
@@ -1897,10 +1807,14 @@ ARGBToUVMatrixRow_AVX2(const uint8_t* src_argb,
         "+rm"(width)  // %3
 #endif
       : "r"((intptr_t)(src_stride_argb)),  // %4
-        "r"(&kShufARGBToUV_AVX)  // %5
+        "m"(rgbuvconstants->kRGBToU),  // %5
+        "m"(rgbuvconstants->kRGBToV),  // %6
+        "m"(kAddUV128)  // %7
       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
         "xmm7");
 }
+#endif  // HAS_ARGBTOUVROW_SSSE3

 #ifdef HAS_ARGBTOUV444ROW_SSSE3
@@ -1944,14 +1858,6 @@ void ARGBToUVRow_SSSE3(const uint8_t* src_argb,
   ARGBToUVMatrixRow_SSSE3(src_argb, src_stride_argb, dst_u, dst_v, width,
                           &kARGBI601UVConstants);
 }

-void ARGBToUVRow_AVX2(const uint8_t* src_argb,
-                      int src_stride_argb,
-                      uint8_t* dst_u,
-                      uint8_t* dst_v,
-                      int width) {
-  ARGBToUVMatrixRow_AVX2(src_argb, src_stride_argb, dst_u, dst_v, width,
-                         &kARGBI601UVConstants);
-}
-
 static const struct RgbUVConstants kABGRI601UVConstants = {
     {38, 74, -112, 0, 38, 74, -112, 0, 38, 74, -112, 0, 38, 74, -112, 0},
@@ -1966,15 +1872,6 @@ void ABGRToUVRow_SSSE3(const uint8_t* src_abgr,
                           &kABGRI601UVConstants);
 }

-void ABGRToUVRow_AVX2(const uint8_t* src_abgr,
-                      int src_stride_abgr,
-                      uint8_t* dst_u,
-                      uint8_t* dst_v,
-                      int width) {
-  ARGBToUVMatrixRow_AVX2(src_abgr, src_stride_abgr, dst_u, dst_v, width,
-                         &kABGRI601UVConstants);
-}
-
 static const struct RgbUVConstants kBGRAI601UVConstants = {
     {0, 38, 74, -112, 0, 38, 74, -112, 0, 38, 74, -112, 0, 38, 74, -112},
     {0, -112, 94, 18, 0, -112, 94, 18, 0, -112, 94, 18, 0, -112, 94, 18}};
@@ -2035,28 +1932,10 @@ void ARGBToUVJ444Row_AVX2(const uint8_t* src_argb,
 }
 #endif  // HAS_ARGBTOUVJ444ROW_AVX2

-void ARGBToUVJRow_AVX2(const uint8_t* src_argb,
-                       int src_stride_argb,
-                       uint8_t* dst_u,
-                       uint8_t* dst_v,
-                       int width) {
-  ARGBToUVMatrixRow_AVX2(src_argb, src_stride_argb, dst_u, dst_v, width,
-                         &kARGBJPEGUVConstants);
-}
-
 static const struct RgbUVConstants kABGRJPEGUVConstants = {
     {43, 85, -128, 0, 43, 85, -128, 0, 43, 85, -128, 0, 43, 85, -128, 0},
     {-128, 107, 21, 0, -128, 107, 21, 0, -128, 107, 21, 0, -128, 107, 21, 0}};

-void ABGRToUVJRow_AVX2(const uint8_t* src_abgr,
-                       int src_stride_abgr,
-                       uint8_t* dst_u,
-                       uint8_t* dst_v,
-                       int width) {
-  ARGBToUVMatrixRow_AVX2(src_abgr, src_stride_abgr, dst_u, dst_v, width,
-                         &kABGRJPEGUVConstants);
-}
-
 #ifdef HAS_ARGBTOUVJROW_SSSE3
 void ARGBToUVJRow_SSSE3(const uint8_t* src_argb,
                         int src_stride_argb,
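
As a plain-C model of what one step of the new ARGBToUVMatrixRow_SSSE3 computes
for each 2x2 ARGB block: an exact rounded average of the four pixels followed by
the same U/V transform the C path applies. This is a sketch mirroring the
RGBToU/RGBToV helpers in row_common.cc; the asm reaches the same result with
pmaddubsw pair sums, shift-plus-pavgw rounding, and a bias subtraction against
the negated coefficients in the RgbUVConstants tables.

// Model only (BT.601 coefficients assumed); the real kernel produces
// 4 U/V pairs per loop iteration.
#include <stdint.h>

static uint8_t RGBToU(uint8_t r, uint8_t g, uint8_t b) {
  return (uint8_t)((112 * b - 74 * g - 38 * r + 0x8080) >> 8);
}
static uint8_t RGBToV(uint8_t r, uint8_t g, uint8_t b) {
  return (uint8_t)((112 * r - 94 * g - 18 * b + 0x8080) >> 8);
}

// src0/src1 point at two rows of the same 2 ARGB pixels (B,G,R,A byte order).
static void ARGBBlockToUV(const uint8_t* src0, const uint8_t* src1,
                          uint8_t* u, uint8_t* v) {
  uint8_t ab = (uint8_t)((src0[0] + src0[4] + src1[0] + src1[4] + 2) >> 2);
  uint8_t ag = (uint8_t)((src0[1] + src0[5] + src1[1] + src1[5] + 2) >> 2);
  uint8_t ar = (uint8_t)((src0[2] + src0[6] + src1[2] + src1[6] + 2) >> 2);
  *u = RGBToU(ar, ag, ab);
  *v = RGBToV(ar, ag, ab);
}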


@@ -184,7 +184,7 @@ void I444AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
 // 32 bit
 #else  // defined(_M_X64)

-#ifdef HAS_ARGBTOUVROW_SSSE3
+// if HAS_ARGBTOUVROW_SSSE3
 // 8 bit fixed point 0.5, for bias of UV.
 static const ulvec8 kBiasUV128 = {
@@ -221,7 +221,7 @@ static const lvec8 kShuffleUYVYUV = {0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8,
 // JPeg full range.
 static const vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0,
                                15, 75, 38, 0, 15, 75, 38, 0};
-#endif
+// endif

 // vpermd for vphaddw + vpackuswb vpermd.
 static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7};
@@ -1208,6 +1208,8 @@ __declspec(naked) void ARGBToYRow_SSSE3(const uint8_t* src_argb,
   }
 }

+#ifdef HAS_ARGBTOUVROW_SSSE3
 // Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
 // Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
 __declspec(naked) void ARGBToYJRow_SSSE3(const uint8_t* src_argb,
@@ -1244,6 +1246,7 @@ __declspec(naked) void ARGBToYJRow_SSSE3(const uint8_t* src_argb,
     ret
   }
 }
+#endif

 #ifdef HAS_ARGBTOYROW_AVX2
@@ -1430,6 +1433,8 @@ __declspec(naked) void RGBAToYRow_SSSE3(const uint8_t* src_argb,
   }
 }

+#ifdef HAS_ARGBTOUVROW_SSSE3
 __declspec(naked) void ARGBToUVRow_SSSE3(const uint8_t* src_argb,
                                          int src_stride_argb,
                                          uint8_t* dst_u,
@@ -1578,6 +1583,7 @@ __declspec(naked) void ARGBToUVJRow_SSSE3(const uint8_t* src_argb,
     ret
   }
 }
+#endif

 #ifdef HAS_ARGBTOUVROW_AVX2
 __declspec(naked) void ARGBToUVRow_AVX2(const uint8_t* src_argb,


@@ -2721,6 +2721,86 @@ TEST_F(LibYUVConvertTest, TestUYVYToARGB) {
   EXPECT_EQ(3486643515u, checksum);
 }

+#ifdef ENABLE_ROW_TESTS
+TEST_F(LibYUVConvertTest, TestARGBToUVRow) {
+  SIMD_ALIGNED(uint8_t orig_argb_pixels[256]);
+  SIMD_ALIGNED(uint8_t dest_u[32]);
+  SIMD_ALIGNED(uint8_t dest_v[32]);
+
+  for (int i = 0; i < 256; ++i) {
+    orig_argb_pixels[i] = i * 43;
+  }
+  orig_argb_pixels[0] = 0xff;  // blue
+  orig_argb_pixels[1] = 0x0;
+  orig_argb_pixels[2] = 0x0;
+  orig_argb_pixels[3] = 0xff;
+  orig_argb_pixels[4] = 0xff;  // blue
+  orig_argb_pixels[5] = 0x0;
+  orig_argb_pixels[6] = 0x0;
+  orig_argb_pixels[7] = 0xff;
+  orig_argb_pixels[8] = 0x0;
+  orig_argb_pixels[9] = 0xff;  // green
+  orig_argb_pixels[10] = 0x0;
+  orig_argb_pixels[11] = 0xff;
+  orig_argb_pixels[12] = 0x0;
+  orig_argb_pixels[13] = 0xff;  // green
+  orig_argb_pixels[14] = 0x0;
+  orig_argb_pixels[15] = 0xff;
+  orig_argb_pixels[16] = 0x0;
+  orig_argb_pixels[17] = 0x0;
+  orig_argb_pixels[18] = 0xff;  // red
+  orig_argb_pixels[19] = 0xff;
+  orig_argb_pixels[20] = 0x0;
+  orig_argb_pixels[21] = 0x0;
+  orig_argb_pixels[22] = 0xff;  // red
+  orig_argb_pixels[23] = 0xff;
+  orig_argb_pixels[24] = 0xff;
+  orig_argb_pixels[25] = 0xff;
+  orig_argb_pixels[26] = 0xff;  // white
+  orig_argb_pixels[27] = 0xff;
+  orig_argb_pixels[28] = 0xff;
+  orig_argb_pixels[29] = 0xff;
+  orig_argb_pixels[30] = 0xff;  // white
+  orig_argb_pixels[31] = 0xff;
+
+  int benchmark_iterations =
+      benchmark_width_ * benchmark_height_ * benchmark_iterations_ / 32;
+  for (int i = 0; i < benchmark_iterations; ++i) {
+#if defined(HAS_ARGBTOUVROW_SSSE3)
+    int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
+    if (has_ssse3) {
+      ARGBToUVRow_SSSE3(&orig_argb_pixels[0], 0, &dest_u[0], &dest_v[0], 64);
+    } else {
+      ARGBToUVRow_C(&orig_argb_pixels[0], 0, &dest_u[0], &dest_v[0], 64);
+    }
+#elif defined(HAS_ARGBTOUVROW_NEON)
+    ARGBToUVRow_NEON(&orig_argb_pixels[0], 0, &dest_u[0], &dest_v[0], 64);
+#else
+    ARGBToUVRow_C(&orig_argb_pixels[0], 0, &dest_u[0], &dest_v[0], 64);
+#endif
+  }
+
+  printf("u: ");
+  for (int i = 0; i < 32; ++i) {
+    printf("%3d ", (int)dest_u[i]);
+  }
+  printf("\nv: ");
+  for (int i = 0; i < 32; ++i) {
+    printf("%3d ", (int)dest_v[i]);
+  }
+  printf("\n");
+
+  uint32_t checksum_u = HashDjb2(&dest_u[0], sizeof(dest_u), 5381);
+  EXPECT_EQ(192508756u, checksum_u);
+  uint32_t checksum_v = HashDjb2(&dest_v[0], sizeof(dest_v), 5381);
+  EXPECT_EQ(2590663990u, checksum_v);
+}
+#endif
+
 #if defined(__x86_64__) || defined(_M_X64) || defined(__aarch64__)
 TEST_F(LibYUVConvertTest, TestI400LargeSize) {
   // The width and height are chosen as follows: