ARGBToUV SSE use average of 4 pixels

- Previously the non-bit-exact path used pavgb twice (an average of averages) and the bit-exact path fell back to C; see the sketch below the benchmarks.

On Skylake Xeon:

Now SSSE3 (bit exact):
ARGBToJ420_Opt (326 ms)

Was:
Bit-exact C:
ARGBToJ420_Opt (871 ms)
Non-exact AVX2:
ARGBToJ420_Opt (237 ms)
Non-exact SSSE3:
ARGBToJ420_Opt (312 ms)

Bug: 381138208
Change-Id: I6d1081bb52e36f06736c0c6575fa82bb2268629b
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6629605
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Ben Weiss <bweiss@google.com>
Frank Barchard 2025-06-16 16:56:37 -07:00
parent 889613683a
commit 6f729fbe65
7 changed files with 164 additions and 315 deletions


@ -1,6 +1,6 @@
Name: libyuv
URL: https://chromium.googlesource.com/libyuv/libyuv/
Version: 1912
Version: 1913
License: BSD-3-Clause
License File: LICENSE
Shipped: yes


@ -79,12 +79,6 @@ extern "C" {
#define HAS_YUY2TOUV422ROW_SSE2
#define HAS_YUY2TOUVROW_SSE2
#define HAS_YUY2TOYROW_SSE2
#if !defined(LIBYUV_BIT_EXACT)
#define HAS_ABGRTOUVROW_SSSE3
#define HAS_ARGBTOUVROW_SSSE3
#define HAS_BGRATOUVROW_SSSE3
#define HAS_RGBATOUVROW_SSSE3
#endif
// Effects:
#define HAS_ARGBADDROW_SSE2
@ -272,11 +266,13 @@ extern "C" {
#define HAS_RGB24TOYROW_SSSE3
#define HAS_RGBATOYROW_SSSE3
#if !defined(LIBYUV_BIT_EXACT)
// TODO: adjust row_win to use 8 bit negative coefficients.
#define HAS_ABGRTOUVJROW_SSSE3
#define HAS_ARGBTOUVJROW_SSSE3
#endif
#define HAS_ABGRTOUVROW_SSSE3
#define HAS_ARGBTOUVROW_SSSE3
#define HAS_BGRATOUVROW_SSSE3
#define HAS_RGBATOUVROW_SSSE3
#if defined(__x86_64__) || !defined(__pic__)
// TODO(fbarchard): fix build error on android_full_debug=1
@ -350,12 +346,11 @@ extern "C" {
#define HAS_SPLITXRGBROW_AVX2
#define HAS_SWAPUVROW_AVX2
#define HAS_YUY2TONVUVROW_AVX2
#if !defined(LIBYUV_BIT_EXACT)
#define HAS_ABGRTOUVJROW_AVX2
#define HAS_ABGRTOUVROW_AVX2
#define HAS_ARGBTOUVJROW_AVX2
#define HAS_ARGBTOUVROW_AVX2
#endif
// TODO: Port SSSE3 to AVX2
// #define HAS_ABGRTOUVJROW_AVX2
// #define HAS_ABGRTOUVROW_AVX2
// #define HAS_ARGBTOUVJROW_AVX2
// #define HAS_ARGBTOUVROW_AVX2
#if defined(__x86_64__) || !defined(__pic__)
// TODO(fbarchard): fix build error on android_full_debug=1


@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1912
#define LIBYUV_VERSION 1913
#endif // INCLUDE_LIBYUV_VERSION_H_


@ -36,10 +36,6 @@ extern "C" {
// LIBYUV_UNLIMITED_BT709
// LIBYUV_UNLIMITED_BT2020
#if !defined(LIBYUV_BIT_EXACT) && (defined(__x86_64__) || defined(_M_X64) || \
defined(__i386__) || defined(_M_IX86))
#define LIBYUV_ARGBTOUV_PAVGB 1
#endif
#if defined(LIBYUV_BIT_EXACT)
#define LIBYUV_UNATTENUATE_DUP 1
#endif
@ -627,7 +623,6 @@ void AR64ShuffleRow_C(const uint8_t* src_ar64,
// b -0.1406 * 128 = 17.9968 = -18
// g -0.7344 * 128 = 94.0032 = -94
// r 0.875 * 128 = 112.0 = 112
static __inline uint8_t RGBToY(uint8_t r, uint8_t g, uint8_t b) {
return STATIC_CAST(uint8_t, (66 * r + 129 * g + 25 * b + 0x1080) >> 8);
}
@ -639,46 +634,6 @@ static __inline uint8_t RGBToV(uint8_t r, uint8_t g, uint8_t b) {
}
#define AVGB(a, b) (((a) + (b) + 1) >> 1)
// ARGBToY_C and ARGBToUV_C
// Intel version of UV mimic SSE/AVX which does 2 pavgb
#if defined(LIBYUV_ARGBTOUV_PAVGB)
#define MAKEROWY(NAME, R, G, B, BPP) \
void NAME##ToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \
int x; \
for (x = 0; x < width; ++x) { \
dst_y[0] = RGBToY(src_rgb[R], src_rgb[G], src_rgb[B]); \
src_rgb += BPP; \
dst_y += 1; \
} \
} \
void NAME##ToUVRow_C(const uint8_t* src_rgb, int src_stride_rgb, \
uint8_t* dst_u, uint8_t* dst_v, int width) { \
const uint8_t* src_rgb1 = src_rgb + src_stride_rgb; \
int x; \
for (x = 0; x < width - 1; x += 2) { \
uint8_t ab = AVGB(AVGB(src_rgb[B], src_rgb1[B]), \
AVGB(src_rgb[B + BPP], src_rgb1[B + BPP])); \
uint8_t ag = AVGB(AVGB(src_rgb[G], src_rgb1[G]), \
AVGB(src_rgb[G + BPP], src_rgb1[G + BPP])); \
uint8_t ar = AVGB(AVGB(src_rgb[R], src_rgb1[R]), \
AVGB(src_rgb[R + BPP], src_rgb1[R + BPP])); \
dst_u[0] = RGBToU(ar, ag, ab); \
dst_v[0] = RGBToV(ar, ag, ab); \
src_rgb += BPP * 2; \
src_rgb1 += BPP * 2; \
dst_u += 1; \
dst_v += 1; \
} \
if (width & 1) { \
uint8_t ab = AVGB(src_rgb[B], src_rgb1[B]); \
uint8_t ag = AVGB(src_rgb[G], src_rgb1[G]); \
uint8_t ar = AVGB(src_rgb[R], src_rgb1[R]); \
dst_u[0] = RGBToU(ar, ag, ab); \
dst_v[0] = RGBToV(ar, ag, ab); \
} \
}
#else
// ARM version does average of 4 pixels with rounding
#define MAKEROWY(NAME, R, G, B, BPP) \
void NAME##ToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \
int x; \
@ -717,7 +672,6 @@ static __inline uint8_t RGBToV(uint8_t r, uint8_t g, uint8_t b) {
dst_v[0] = RGBToV(ar, ag, ab); \
} \
}
#endif
MAKEROWY(ARGB, 2, 1, 0, 4)
MAKEROWY(BGRA, 1, 2, 3, 4)
@ -756,45 +710,6 @@ static __inline uint8_t RGBToVJ(uint8_t r, uint8_t g, uint8_t b) {
}
// ARGBToYJ_C and ARGBToUVJ_C
// Intel version mimic SSE/AVX which does 2 pavgb
#if defined(LIBYUV_ARGBTOUV_PAVGB)
#define MAKEROWYJ(NAME, R, G, B, BPP) \
void NAME##ToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \
int x; \
for (x = 0; x < width; ++x) { \
dst_y[0] = RGBToYJ(src_rgb[R], src_rgb[G], src_rgb[B]); \
src_rgb += BPP; \
dst_y += 1; \
} \
} \
void NAME##ToUVJRow_C(const uint8_t* src_rgb, int src_stride_rgb, \
uint8_t* dst_u, uint8_t* dst_v, int width) { \
const uint8_t* src_rgb1 = src_rgb + src_stride_rgb; \
int x; \
for (x = 0; x < width - 1; x += 2) { \
uint8_t ab = AVGB(AVGB(src_rgb[B], src_rgb1[B]), \
AVGB(src_rgb[B + BPP], src_rgb1[B + BPP])); \
uint8_t ag = AVGB(AVGB(src_rgb[G], src_rgb1[G]), \
AVGB(src_rgb[G + BPP], src_rgb1[G + BPP])); \
uint8_t ar = AVGB(AVGB(src_rgb[R], src_rgb1[R]), \
AVGB(src_rgb[R + BPP], src_rgb1[R + BPP])); \
dst_u[0] = RGBToUJ(ar, ag, ab); \
dst_v[0] = RGBToVJ(ar, ag, ab); \
src_rgb += BPP * 2; \
src_rgb1 += BPP * 2; \
dst_u += 1; \
dst_v += 1; \
} \
if (width & 1) { \
uint8_t ab = AVGB(src_rgb[B], src_rgb1[B]); \
uint8_t ag = AVGB(src_rgb[G], src_rgb1[G]); \
uint8_t ar = AVGB(src_rgb[R], src_rgb1[R]); \
dst_u[0] = RGBToUJ(ar, ag, ab); \
dst_v[0] = RGBToVJ(ar, ag, ab); \
} \
}
#else
// ARM version does average of 4 pixels with rounding
#define MAKEROWYJ(NAME, R, G, B, BPP) \
void NAME##ToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \
int x; \
@ -834,8 +749,6 @@ static __inline uint8_t RGBToVJ(uint8_t r, uint8_t g, uint8_t b) {
} \
}
#endif
MAKEROWYJ(ARGB, 2, 1, 0, 4)
MAKEROWYJ(ABGR, 0, 1, 2, 4)
MAKEROWYJ(RGBA, 3, 2, 1, 4)
@ -928,19 +841,11 @@ void RGB565ToUVRow_C(const uint8_t* src_rgb565,
g3 = STATIC_CAST(uint8_t, (g3 << 2) | (g3 >> 4));
r3 = STATIC_CAST(uint8_t, (r3 << 3) | (r3 >> 2));
#if defined(LIBYUV_ARGBTOUV_PAVGB)
uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3));
uint8_t ag = AVGB(AVGB(g0, g2), AVGB(g1, g3));
uint8_t ar = AVGB(AVGB(r0, r2), AVGB(r1, r3));
dst_u[0] = RGBToU(ar, ag, ab);
dst_v[0] = RGBToV(ar, ag, ab);
#else
uint8_t b = (b0 + b1 + b2 + b3 + 2) >> 2;
uint8_t g = (g0 + g1 + g2 + g3 + 2) >> 2;
uint8_t r = (r0 + r1 + r2 + r3 + 2) >> 2;
dst_u[0] = RGBToU(r, g, b);
dst_v[0] = RGBToV(r, g, b);
#endif
src_rgb565 += 4;
next_rgb565 += 4;
@ -1009,19 +914,11 @@ void ARGB1555ToUVRow_C(const uint8_t* src_argb1555,
g3 = STATIC_CAST(uint8_t, (g3 << 3) | (g3 >> 2));
r3 = STATIC_CAST(uint8_t, (r3 << 3) | (r3 >> 2));
#if defined(LIBYUV_ARGBTOUV_PAVGB)
uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3));
uint8_t ag = AVGB(AVGB(g0, g2), AVGB(g1, g3));
uint8_t ar = AVGB(AVGB(r0, r2), AVGB(r1, r3));
dst_u[0] = RGBToU(ar, ag, ab);
dst_v[0] = RGBToV(ar, ag, ab);
#else
uint8_t b = (b0 + b1 + b2 + b3 + 2) >> 2;
uint8_t g = (g0 + g1 + g2 + g3 + 2) >> 2;
uint8_t r = (r0 + r1 + r2 + r3 + 2) >> 2;
dst_u[0] = RGBToU(r, g, b);
dst_v[0] = RGBToV(r, g, b);
#endif
src_argb1555 += 4;
next_argb1555 += 4;
@ -1087,19 +984,11 @@ void ARGB4444ToUVRow_C(const uint8_t* src_argb4444,
g3 = STATIC_CAST(uint8_t, (g3 << 4) | g3);
r3 = STATIC_CAST(uint8_t, (r3 << 4) | r3);
#if defined(LIBYUV_ARGBTOUV_PAVGB)
uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3));
uint8_t ag = AVGB(AVGB(g0, g2), AVGB(g1, g3));
uint8_t ar = AVGB(AVGB(r0, r2), AVGB(r1, r3));
dst_u[0] = RGBToU(ar, ag, ab);
dst_v[0] = RGBToV(ar, ag, ab);
#else
uint8_t b = (b0 + b1 + b2 + b3 + 2) >> 2;
uint8_t g = (g0 + g1 + g2 + g3 + 2) >> 2;
uint8_t r = (r0 + r1 + r2 + r3 + 2) >> 2;
dst_u[0] = RGBToU(r, g, b);
dst_v[0] = RGBToV(r, g, b);
#endif
src_argb4444 += 4;
next_argb4444 += 4;


@ -1733,161 +1733,71 @@ void ARGBToUV444MatrixRow_AVX2(const uint8_t* src_argb,
#endif // HAS_ARGBTOUV444ROW_AVX2
#ifdef HAS_ARGBTOUVROW_SSSE3
void OMITFP
ARGBToUVMatrixRow_SSSE3(const uint8_t* src_argb,
// 8x2 -> 4x1 ARGB pixels converted to 4 U and 4 V
// ARGBToUV does rounding average of 4 ARGB pixels
void ARGBToUVMatrixRow_SSSE3(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width,
const struct RgbUVConstants* rgbuvconstants) {
asm volatile(
"movdqa %0,%%xmm3 \n"
"movdqa %1,%%xmm4 \n"
"movdqa %2,%%xmm5 \n"
:
: "m"(rgbuvconstants->kRGBToU), // %0
"m"(rgbuvconstants->kRGBToV), // %1
"m"(kAddUV128) // %2
: "xmm3", "xmm4", "xmm5");
asm volatile("sub %1,%2 \n"
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x00(%0,%4,1),%%xmm7 \n"
"pavgb %%xmm7,%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
"movdqu 0x10(%0,%4,1),%%xmm7 \n"
"pavgb %%xmm7,%%xmm1 \n"
"movdqu 0x20(%0),%%xmm2 \n"
"movdqu 0x20(%0,%4,1),%%xmm7 \n"
"pavgb %%xmm7,%%xmm2 \n"
"movdqu 0x30(%0),%%xmm6 \n"
"movdqu 0x30(%0,%4,1),%%xmm7 \n"
"pavgb %%xmm7,%%xmm6 \n"
"lea 0x40(%0),%0 \n"
"movdqa %%xmm0,%%xmm7 \n"
"shufps $0x88,%%xmm1,%%xmm0 \n"
"shufps $0xdd,%%xmm1,%%xmm7 \n"
"pavgb %%xmm7,%%xmm0 \n"
"movdqa %%xmm2,%%xmm7 \n"
"shufps $0x88,%%xmm6,%%xmm2 \n"
"shufps $0xdd,%%xmm6,%%xmm7 \n"
"pavgb %%xmm7,%%xmm2 \n"
"movdqa %%xmm0,%%xmm1 \n"
"movdqa %%xmm2,%%xmm6 \n"
"pmaddubsw %%xmm3,%%xmm0 \n"
"pmaddubsw %%xmm3,%%xmm2 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm6 \n"
"phaddw %%xmm2,%%xmm0 \n"
"phaddw %%xmm6,%%xmm1 \n"
"movdqa %%xmm5,%%xmm2 \n"
"movdqa %%xmm5,%%xmm6 \n"
"psubw %%xmm0,%%xmm2 \n"
"psubw %%xmm1,%%xmm6 \n"
"psrlw $0x8,%%xmm2 \n"
"psrlw $0x8,%%xmm6 \n"
"packuswb %%xmm6,%%xmm2 \n"
"movlps %%xmm2,(%1) \n"
"movhps %%xmm2,0x00(%1,%2,1) \n"
"lea 0x8(%1),%1 \n"
"subl $0x10,%3 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
#if defined(__i386__)
"+m"(width) // %3
#else
"+rm"(width) // %3
#endif
: "r"((intptr_t)(src_stride_argb)) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
"xmm6", "xmm7");
}
#endif // HAS_ARGBTOUVROW_SSSE3
// vpshufb for vphaddw + vpackuswb packed to shorts.
// Coefficients expressed as negatives to allow 128
struct UVMatrixConstants {
lvec8 kShufARGBToUV;
ulvec8 kAddUV128;
};
static const UVMatrixConstants kShufARGBToUV_AVX = {
0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128,
0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128};
void OMITFP
ARGBToUVMatrixRow_AVX2(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width,
const struct RgbUVConstants* rgbuvconstants) {
asm volatile(
"vbroadcastf128 %0,%%ymm6 \n"
"vbroadcastf128 %1,%%ymm7 \n"
:
: "m"(rgbuvconstants->kRGBToU), // %0
"m"(rgbuvconstants->kRGBToV) // %1
:);
asm volatile(
#if !defined(__i386__)
"vmovdqa 0(%5),%%ymm8 \n"
#endif
"vmovdqa 32(%5),%%ymm5 \n"
"pcmpeqb %%xmm4,%%xmm4 \n" // 0x0101
"pabsb %%xmm4,%%xmm4 \n"
"movdqa %5,%%xmm6 \n" // ARGB to U
"movdqa %6,%%xmm7 \n" // ARGB to V
"sub %1,%2 \n"
LABELALIGN
"1: \n"
"vmovdqu (%0),%%ymm0 \n"
"vmovdqu 0x20(%0),%%ymm1 \n"
"vmovdqu 0x40(%0),%%ymm2 \n"
"vmovdqu 0x60(%0),%%ymm3 \n"
"vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
"vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
"vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
"vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
"lea 0x80(%0),%0 \n"
"vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
"vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
"vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
"vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
"vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
"vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
"movdqu (%0),%%xmm0 \n" // Read 8 ARGB Pixels
"movdqu 0x10(%0),%%xmm5 \n"
"movdqa %%xmm0,%%xmm1 \n"
"shufps $0x88,%%xmm5,%%xmm0 \n" // Even pixels
"shufps $0xdd,%%xmm5,%%xmm1 \n" // Odd pixels
"movdqa %%xmm0,%%xmm5 \n"
"punpcklbw %%xmm1,%%xmm0 \n" // aarrgbb
"punpckhbw %%xmm5,%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm0 \n" // paired add argb
"pmaddubsw %%xmm4,%%xmm1 \n"
"vpmaddubsw %%ymm6,%%ymm0,%%ymm1 \n"
"vpmaddubsw %%ymm6,%%ymm2,%%ymm3 \n"
"vpmaddubsw %%ymm7,%%ymm0,%%ymm0 \n"
"vpmaddubsw %%ymm7,%%ymm2,%%ymm2 \n"
"vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
"vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
"vpsubw %%ymm0,%%ymm5,%%ymm0 \n"
"vpsubw %%ymm1,%%ymm5,%%ymm1 \n"
"vpsrlw $0x8,%%ymm1,%%ymm1 \n"
"vpsrlw $0x8,%%ymm0,%%ymm0 \n"
"vpackuswb %%ymm0,%%ymm1,%%ymm0 \n"
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
#if defined(__i386__)
"vpshufb (%5),%%ymm0,%%ymm0 \n"
#else
"vpshufb %%ymm8,%%ymm0,%%ymm0 \n"
#endif
"vextractf128 $0x0,%%ymm0,(%1) \n"
"vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n"
"lea 0x10(%1),%1 \n"
"subl $0x20,%3 \n"
"movdqu 0x00(%0,%4,1),%%xmm2 \n" // Read 2nd row
"movdqu 0x10(%0,%4,1),%%xmm5 \n"
"movdqa %%xmm2,%%xmm3 \n"
"shufps $0x88,%%xmm5,%%xmm2 \n" // Even
"shufps $0xdd,%%xmm5,%%xmm3 \n" // Odd pixels
"movdqa %%xmm2,%%xmm5 \n"
"punpcklbw %%xmm3,%%xmm2 \n" // aarrgbb
"punpckhbw %%xmm5,%%xmm3 \n"
"pmaddubsw %%xmm4,%%xmm2 \n" // argb
"pmaddubsw %%xmm4,%%xmm3 \n"
"pxor %%xmm5,%%xmm5 \n" // constant 0 for pavgw
"paddw %%xmm2,%%xmm0 \n"
"paddw %%xmm3,%%xmm1 \n"
"psrlw $1,%%xmm0 \n" // round
"psrlw $1,%%xmm1 \n"
"pavgw %%xmm5,%%xmm0 \n"
"pavgw %%xmm5,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n" // 4 ARGB pixels
"movdqa %%xmm0,%%xmm1 \n"
"pmaddubsw %%xmm6,%%xmm0 \n" // u
"pmaddubsw %%xmm7,%%xmm1 \n" // v
"phaddw %%xmm1,%%xmm0 \n" // uuuuvvvv
"movdqa %7,%%xmm2 \n" // 0x8000
"psubw %%xmm0,%%xmm2 \n" // unsigned 0 to 0xffff
"psrlw $0x8,%%xmm2 \n"
"packuswb %%xmm2,%%xmm2 \n"
"movd %%xmm2,(%1) \n" // Write 4 U's
"shufps $0xdd,%%xmm2,%%xmm2 \n"
"movd %%xmm2,0x00(%1,%2,1) \n" // Write 4 V's
"lea 0x20(%0),%0 \n"
"lea 0x4(%1),%1 \n"
"subl $0x8,%3 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_argb), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@ -1897,10 +1807,14 @@ ARGBToUVMatrixRow_AVX2(const uint8_t* src_argb,
"+rm"(width) // %3
#endif
: "r"((intptr_t)(src_stride_argb)), // %4
"r"(&kShufARGBToUV_AVX) // %5
"m"(rgbuvconstants->kRGBToU), // %5
"m"(rgbuvconstants->kRGBToV), // %6
"m"(kAddUV128) // %7
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
#endif // HAS_ARGBTOUVROW_SSSE3
#ifdef HAS_ARGBTOUV444ROW_SSSE3
@ -1944,14 +1858,6 @@ void ARGBToUVRow_SSSE3(const uint8_t* src_argb,
ARGBToUVMatrixRow_SSSE3(src_argb, src_stride_argb, dst_u, dst_v, width,
&kARGBI601UVConstants);
}
void ARGBToUVRow_AVX2(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
ARGBToUVMatrixRow_AVX2(src_argb, src_stride_argb, dst_u, dst_v, width,
&kARGBI601UVConstants);
}
static const struct RgbUVConstants kABGRI601UVConstants = {
{38, 74, -112, 0, 38, 74, -112, 0, 38, 74, -112, 0, 38, 74, -112, 0},
@ -1966,15 +1872,6 @@ void ABGRToUVRow_SSSE3(const uint8_t* src_abgr,
&kABGRI601UVConstants);
}
void ABGRToUVRow_AVX2(const uint8_t* src_abgr,
int src_stride_abgr,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
ARGBToUVMatrixRow_AVX2(src_abgr, src_stride_abgr, dst_u, dst_v, width,
&kABGRI601UVConstants);
}
static const struct RgbUVConstants kBGRAI601UVConstants = {
{0, 38, 74, -112, 0, 38, 74, -112, 0, 38, 74, -112, 0, 38, 74, -112},
{0, -112, 94, 18, 0, -112, 94, 18, 0, -112, 94, 18, 0, -112, 94, 18}};
@ -2035,28 +1932,10 @@ void ARGBToUVJ444Row_AVX2(const uint8_t* src_argb,
}
#endif // HAS_ARGBTOUVJ444ROW_AVX2
void ARGBToUVJRow_AVX2(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
ARGBToUVMatrixRow_AVX2(src_argb, src_stride_argb, dst_u, dst_v, width,
&kARGBJPEGUVConstants);
}
static const struct RgbUVConstants kABGRJPEGUVConstants = {
{43, 85, -128, 0, 43, 85, -128, 0, 43, 85, -128, 0, 43, 85, -128, 0},
{-128, 107, 21, 0, -128, 107, 21, 0, -128, 107, 21, 0, -128, 107, 21, 0}};
void ABGRToUVJRow_AVX2(const uint8_t* src_abgr,
int src_stride_abgr,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
ARGBToUVMatrixRow_AVX2(src_abgr, src_stride_abgr, dst_u, dst_v, width,
&kABGRJPEGUVConstants);
}
#ifdef HAS_ARGBTOUVJROW_SSSE3
void ARGBToUVJRow_SSSE3(const uint8_t* src_argb,
int src_stride_argb,


@ -184,7 +184,7 @@ void I444AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
// 32 bit
#else // defined(_M_X64)
#ifdef HAS_ARGBTOUVROW_SSSE3
// if HAS_ARGBTOUVROW_SSSE3
// 8 bit fixed point 0.5, for bias of UV.
static const ulvec8 kBiasUV128 = {
@ -221,7 +221,7 @@ static const lvec8 kShuffleUYVYUV = {0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8,
// JPeg full range.
static const vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0,
15, 75, 38, 0, 15, 75, 38, 0};
#endif
// endif
// vpermd for vphaddw + vpackuswb vpermd.
static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7};
@ -1208,6 +1208,8 @@ __declspec(naked) void ARGBToYRow_SSSE3(const uint8_t* src_argb,
}
}
#ifdef HAS_ARGBTOUVROW_SSSE3
// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
// Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
__declspec(naked) void ARGBToYJRow_SSSE3(const uint8_t* src_argb,
@ -1244,6 +1246,7 @@ __declspec(naked) void ARGBToYJRow_SSSE3(const uint8_t* src_argb,
ret
}
}
#endif
#ifdef HAS_ARGBTOYROW_AVX2
@ -1430,6 +1433,8 @@ __declspec(naked) void RGBAToYRow_SSSE3(const uint8_t* src_argb,
}
}
#ifdef HAS_ARGBTOUVROW_SSSE3
__declspec(naked) void ARGBToUVRow_SSSE3(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
@ -1578,6 +1583,7 @@ __declspec(naked) void ARGBToUVJRow_SSSE3(const uint8_t* src_argb,
ret
}
}
#endif
#ifdef HAS_ARGBTOUVROW_AVX2
__declspec(naked) void ARGBToUVRow_AVX2(const uint8_t* src_argb,


@ -2721,6 +2721,86 @@ TEST_F(LibYUVConvertTest, TestUYVYToARGB) {
EXPECT_EQ(3486643515u, checksum);
}
#ifdef ENABLE_ROW_TESTS
TEST_F(LibYUVConvertTest, TestARGBToUVRow) {
SIMD_ALIGNED(uint8_t orig_argb_pixels[256]);
SIMD_ALIGNED(uint8_t dest_u[32]);
SIMD_ALIGNED(uint8_t dest_v[32]);
for (int i = 0; i < 256; ++i) {
orig_argb_pixels[i] = i * 43;
}
orig_argb_pixels[0] = 0xff; // blue
orig_argb_pixels[1] = 0x0;
orig_argb_pixels[2] = 0x0;
orig_argb_pixels[3] = 0xff;
orig_argb_pixels[4] = 0xff; // blue
orig_argb_pixels[5] = 0x0;
orig_argb_pixels[6] = 0x0;
orig_argb_pixels[7] = 0xff;
orig_argb_pixels[8] = 0x0;
orig_argb_pixels[9] = 0xff; // green
orig_argb_pixels[10] = 0x0;
orig_argb_pixels[11] = 0xff;
orig_argb_pixels[12] = 0x0;
orig_argb_pixels[13] = 0xff; // green
orig_argb_pixels[14] = 0x0;
orig_argb_pixels[15] = 0xff;
orig_argb_pixels[16] = 0x0;
orig_argb_pixels[17] = 0x0;
orig_argb_pixels[18] = 0xff; // red
orig_argb_pixels[19] = 0xff;
orig_argb_pixels[20] = 0x0;
orig_argb_pixels[21] = 0x0;
orig_argb_pixels[22] = 0xff; // red
orig_argb_pixels[23] = 0xff;
orig_argb_pixels[24] = 0xff;
orig_argb_pixels[25] = 0xff;
orig_argb_pixels[26] = 0xff; // white
orig_argb_pixels[27] = 0xff;
orig_argb_pixels[28] = 0xff;
orig_argb_pixels[29] = 0xff;
orig_argb_pixels[30] = 0xff; // white
orig_argb_pixels[31] = 0xff;
int benchmark_iterations =
benchmark_width_ * benchmark_height_ * benchmark_iterations_ / 32;
for (int i = 0; i < benchmark_iterations; ++i) {
#if defined(HAS_ARGBTOUVROW_SSSE3)
int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
if (has_ssse3) {
ARGBToUVRow_SSSE3(&orig_argb_pixels[0], 0, &dest_u[0], &dest_v[0], 64);
} else {
ARGBToUVRow_C(&orig_argb_pixels[0], 0, &dest_u[0], &dest_v[0], 64);
}
#elif defined(HAS_ARGBTOUVROW_NEON)
ARGBToUVRow_NEON(&orig_argb_pixels[0], 0, &dest_u[0], &dest_v[0], 64);
#else
ARGBToUVRow_C(&orig_argb_pixels[0], 0, &dest_u[0], &dest_v[0], 64);
#endif
}
printf("u: ");
for (int i = 0; i < 32; ++i) {
printf("%3d ", (int)dest_u[i]);
}
printf("\nv: ");
for (int i = 0; i < 32; ++i) {
printf("%3d ", (int)dest_v[i]);
}
printf("\n");
uint32_t checksum_u = HashDjb2(&dest_u[0], sizeof(dest_u), 5381);
EXPECT_EQ(192508756u, checksum_u);
uint32_t checksum_v = HashDjb2(&dest_v[0], sizeof(dest_v), 5381);
EXPECT_EQ(2590663990u, checksum_v);
}
#endif
#if defined(__x86_64__) || defined(_M_X64) || defined(__aarch64__)
TEST_F(LibYUVConvertTest, TestI400LargeSize) {
// The width and height are chosen as follows: