mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-07 17:26:49 +08:00
YUY2ToARGB avx2 in 1 step conversion.
Includes UYVYToARGB ssse3 fix.

Was:
YUY2ToARGB_Opt (433 ms)
  69.79% libyuv_unittest libyuv_unittest [.] I422ToARGBRow_AVX2
  20.73% libyuv_unittest libyuv_unittest [.] YUY2ToUV422Row_AVX2
   6.04% libyuv_unittest libyuv_unittest [.] YUY2ToYRow_AVX2
   0.77% libyuv_unittest libyuv_unittest [.] YUY2ToARGBRow_AVX2

Now:
YUY2ToARGB_Opt (280 ms)
  95.66% libyuv_unittest libyuv_unittest [.] YUY2ToARGBRow_AVX2

BUG=libyuv:494
R=harryjin@google.com
Review URL: https://codereview.chromium.org/1364813002
This commit is contained in:
parent
16f12b58cc
commit
000cf89ca8
@ -1,6 +1,6 @@
|
||||
Name: libyuv
|
||||
URL: http://code.google.com/p/libyuv/
|
||||
Version: 1489
|
||||
Version: 1490
|
||||
License: BSD
|
||||
License File: LICENSE
|
||||
|
||||
|
||||
@ -11,6 +11,6 @@
|
||||
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
|
||||
#define INCLUDE_LIBYUV_VERSION_H_
|
||||
|
||||
#define LIBYUV_VERSION 1489
|
||||
#define LIBYUV_VERSION 1490
|
||||
|
||||
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
|
||||
|
||||
@ -2607,48 +2607,6 @@ void NV12ToRGB565Row_AVX2(const uint8* src_y,
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(HAS_YUY2TOARGBROW_AVX2)
|
||||
// Converts a row of packed YUY2 to ARGB in three steps: the packed input is
// split into planar U/V and Y row buffers, and those buffers are then handed
// to I422ToARGBRow_AVX2.  The row is processed in chunks of at most MAXTWIDTH
// pixels so the stack buffers have a fixed size.
//   src_yuy2:     packed YUY2 input; advanced by 2 bytes per pixel.
//   dst_argb:     ARGB output; advanced by 4 bytes per pixel.
//   yuvconstants: passed through unchanged to I422ToARGBRow_AVX2.
//   width:        number of pixels to convert; nothing is done when <= 0.
void YUY2ToARGBRow_AVX2(const uint8* src_yuy2,
|
||||
uint8* dst_argb,
|
||||
struct YuvConstants* yuvconstants,
|
||||
int width) {
|
||||
// Row buffers for intermediate YUV pixels.
|
||||
SIMD_ALIGNED32(uint8 row_y[MAXTWIDTH]);
|
||||
SIMD_ALIGNED32(uint8 row_u[MAXTWIDTH / 2]);
|
||||
SIMD_ALIGNED32(uint8 row_v[MAXTWIDTH / 2]);
|
||||
while (width > 0) {
|
||||
// Clamp this chunk to the capacity of the row buffers.
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
|
||||
YUY2ToUV422Row_AVX2(src_yuy2, row_u, row_v, twidth);
|
||||
YUY2ToYRow_AVX2(src_yuy2, row_y, twidth);
|
||||
I422ToARGBRow_AVX2(row_y, row_u, row_v, dst_argb, yuvconstants, twidth);
|
||||
src_yuy2 += twidth * 2;
|
||||
dst_argb += twidth * 4;
|
||||
width -= twidth;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(HAS_UYVYTOARGBROW_AVX2)
|
||||
// Converts a row of packed UYVY to ARGB in three steps: the packed input is
// split into planar U/V and Y row buffers, and those buffers are then handed
// to I422ToARGBRow_AVX2.  The row is processed in chunks of at most MAXTWIDTH
// pixels so the stack buffers have a fixed size.
//   src_uyvy:     packed UYVY input; advanced by 2 bytes per pixel.
//   dst_argb:     ARGB output; advanced by 4 bytes per pixel.
//   yuvconstants: passed through unchanged to I422ToARGBRow_AVX2.
//   width:        number of pixels to convert; nothing is done when <= 0.
void UYVYToARGBRow_AVX2(const uint8* src_uyvy,
|
||||
uint8* dst_argb,
|
||||
struct YuvConstants* yuvconstants,
|
||||
int width) {
|
||||
// Row buffers for intermediate YUV pixels.
|
||||
SIMD_ALIGNED32(uint8 row_y[MAXTWIDTH]);
|
||||
SIMD_ALIGNED32(uint8 row_u[MAXTWIDTH / 2]);
|
||||
SIMD_ALIGNED32(uint8 row_v[MAXTWIDTH / 2]);
|
||||
while (width > 0) {
|
||||
// Clamp this chunk to the capacity of the row buffers.
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
|
||||
UYVYToUV422Row_AVX2(src_uyvy, row_u, row_v, twidth);
|
||||
UYVYToYRow_AVX2(src_uyvy, row_y, twidth);
|
||||
I422ToARGBRow_AVX2(row_y, row_u, row_v, dst_argb, yuvconstants, twidth);
|
||||
src_uyvy += twidth * 2;
|
||||
dst_argb += twidth * 4;
|
||||
width -= twidth;
|
||||
}
|
||||
}
|
||||
#endif // !defined(LIBYUV_DISABLE_X86)
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
|
||||
@ -140,6 +140,30 @@ static uvec8 kShuffleMaskARGBToRGB24_0 = {
|
||||
static uvec8 kShuffleMaskARGBToRAW_0 = {
|
||||
2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
|
||||
};
|
||||
|
||||
// Byte-shuffle tables for the AVX2 YUY2/UYVY readers.  Each table lists the
// same 16 indices twice, once per 16-byte half of the 32-byte vector
// (vpshufb on a ymm register shuffles each 128-bit lane independently).

// YUY2 shuf 16 Y to 32 Y.
// Selects the even source bytes (0, 2, ..., 14) and writes each one twice.
|
||||
static const lvec8 kShuffleYUY2Y = {
|
||||
0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14,
|
||||
0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14
|
||||
};
|
||||
|
||||
// YUY2 shuf 8 UV to 16 UV.
// Selects the odd byte pairs (1,3), (5,7), ... and writes each pair twice.
|
||||
static const lvec8 kShuffleYUY2UV = {
|
||||
1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15,
|
||||
1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15
|
||||
};
|
||||
|
||||
// UYVY shuf 16 Y to 32 Y.
// Selects the odd source bytes (1, 3, ..., 15) and writes each one twice.
|
||||
static const lvec8 kShuffleUYVYY = {
|
||||
1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15,
|
||||
1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15
|
||||
};
|
||||
|
||||
// UYVY shuf 8 UV to 16 UV.
// Selects the even byte pairs (0,2), (4,6), ... and writes each pair twice.
|
||||
static const lvec8 kShuffleUYVYUV = {
|
||||
0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14,
|
||||
0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14
|
||||
};
|
||||
#endif // HAS_RGB24TOARGBROW_SSSE3
|
||||
|
||||
#ifdef HAS_J400TOARGBROW_SSE2
|
||||
@ -1361,16 +1385,6 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
|
||||
"punpcklbw %%xmm4,%%xmm4 \n" \
|
||||
"lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"
|
||||
|
||||
// YUY2 shuf 8 Y to 16 Y.
|
||||
static const vec8 kShuffleYUY2Y = {
|
||||
0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14
|
||||
};
|
||||
|
||||
// YUY2 shuf 4 UV to 8 UV.
|
||||
static const vec8 kShuffleYUY2UV = {
|
||||
1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15
|
||||
};
|
||||
|
||||
// Read 4 YUY2 with 8 Y and update 4 UV to 8 UV.
|
||||
#define READYUY2 \
|
||||
"movdqu " MEMACCESS([yuy2_buf]) ",%%xmm4 \n" \
|
||||
@ -1379,16 +1393,6 @@ static const vec8 kShuffleYUY2UV = {
|
||||
"pshufb %[kShuffleYUY2UV], %%xmm0 \n" \
|
||||
"lea " MEMLEA(0x10, [yuy2_buf]) ",%[yuy2_buf] \n"
|
||||
|
||||
// UYVY shuf 8 Y to 16 Y.
|
||||
static const vec8 kShuffleUYVYY = {
|
||||
1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15
|
||||
};
|
||||
|
||||
// UYVY shuf 4 UV to 8 UV.
|
||||
static const vec8 kShuffleUYVYUV = {
|
||||
0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14
|
||||
};
|
||||
|
||||
// Read 4 UYVY with 8 Y and update 4 UV to 8 UV.
|
||||
#define READUYVY \
|
||||
"movdqu " MEMACCESS([uyvy_buf]) ",%%xmm4 \n" \
|
||||
@ -1422,7 +1426,7 @@ static const vec8 kShuffleUYVYUV = {
|
||||
"packuswb %%xmm1,%%xmm1 \n" \
|
||||
"packuswb %%xmm2,%%xmm2 \n"
|
||||
|
||||
// Store 8 ARGB values. Assumes XMM5 is zero.
|
||||
// Store 8 ARGB values. Assumes XMM5 is set.
|
||||
#define STOREARGB \
|
||||
"punpcklbw %%xmm1,%%xmm0 \n" \
|
||||
"punpcklbw %%xmm5,%%xmm2 \n" \
|
||||
@ -1433,7 +1437,7 @@ static const vec8 kShuffleUYVYUV = {
|
||||
"movdqu %%xmm1," MEMACCESS2(0x10, [dst_argb]) " \n" \
|
||||
"lea " MEMLEA(0x20, [dst_argb]) ", %[dst_argb] \n"
|
||||
|
||||
// Store 8 BGRA values. Assumes XMM5 is zero.
|
||||
// Store 8 BGRA values.
|
||||
#define STOREBGRA \
|
||||
"pcmpeqb %%xmm5,%%xmm5 \n" \
|
||||
"punpcklbw %%xmm0,%%xmm1 \n" \
|
||||
@ -1445,7 +1449,7 @@ static const vec8 kShuffleUYVYUV = {
|
||||
"movdqu %%xmm0," MEMACCESS2(0x10, [dst_bgra]) " \n" \
|
||||
"lea " MEMLEA(0x20, [dst_bgra]) ", %[dst_bgra] \n"
|
||||
|
||||
// Store 8 ABGR values. Assumes XMM5 is zero.
|
||||
// Store 8 ABGR values. Assumes XMM5 is set.
|
||||
#define STOREABGR \
|
||||
"punpcklbw %%xmm1,%%xmm2 \n" \
|
||||
"punpcklbw %%xmm5,%%xmm0 \n" \
|
||||
@ -1456,7 +1460,7 @@ static const vec8 kShuffleUYVYUV = {
|
||||
"movdqu %%xmm1," MEMACCESS2(0x10, [dst_abgr]) " \n" \
|
||||
"lea " MEMLEA(0x20, [dst_abgr]) ", %[dst_abgr] \n"
|
||||
|
||||
// Store 8 RGBA values. Assumes XMM5 is zero.
|
||||
// Store 8 RGBA values. Assumes XMM5 is set.
|
||||
#define STORERGBA \
|
||||
"pcmpeqb %%xmm5,%%xmm5 \n" \
|
||||
"punpcklbw %%xmm2,%%xmm1 \n" \
|
||||
@ -1522,7 +1526,6 @@ void OMITFP I444ToABGRRow_SSSE3(const uint8* y_buf,
|
||||
);
|
||||
}
|
||||
|
||||
// TODO(fbarchard): Consider putting masks into constants.
|
||||
void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
|
||||
const uint8* u_buf,
|
||||
const uint8* v_buf,
|
||||
@ -1829,7 +1832,27 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
|
||||
"lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
|
||||
"vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
|
||||
"vpermq $0xd8,%%ymm0,%%ymm0 \n" \
|
||||
"vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n"
|
||||
"vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
|
||||
"vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
|
||||
"vpermq $0xd8,%%ymm4,%%ymm4 \n" \
|
||||
"vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
|
||||
"lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"
|
||||
|
||||
// Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
// Loads the same 32 bytes at [yuy2_buf] twice: ymm4 is shuffled with
// kShuffleYUY2Y (Y bytes) and ymm0 with kShuffleYUY2UV (UV bytes).  Then
// yuy2_buf is advanced by 0x20 bytes.  The enclosing asm statement must
// provide the yuy2_buf, kShuffleYUY2Y and kShuffleYUY2UV operands.
|
||||
#define READYUY2_AVX2 \
|
||||
"vmovdqu " MEMACCESS([yuy2_buf]) ",%%ymm4 \n" \
|
||||
"vpshufb %[kShuffleYUY2Y], %%ymm4, %%ymm4 \n" \
|
||||
"vmovdqu " MEMACCESS([yuy2_buf]) ",%%ymm0 \n" \
|
||||
"vpshufb %[kShuffleYUY2UV], %%ymm0, %%ymm0 \n" \
|
||||
"lea " MEMLEA(0x20, [yuy2_buf]) ",%[yuy2_buf] \n"
|
||||
|
||||
// Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
// Loads the same 32 bytes at [uyvy_buf] twice: ymm4 is shuffled with
// kShuffleUYVYY (Y bytes) and ymm0 with kShuffleUYVYUV (UV bytes).  Then
// uyvy_buf is advanced by 0x20 bytes.  The enclosing asm statement must
// provide the uyvy_buf, kShuffleUYVYY and kShuffleUYVYUV operands.
|
||||
#define READUYVY_AVX2 \
|
||||
"vmovdqu " MEMACCESS([uyvy_buf]) ",%%ymm4 \n" \
|
||||
"vpshufb %[kShuffleUYVYY], %%ymm4, %%ymm4 \n" \
|
||||
"vmovdqu " MEMACCESS([uyvy_buf]) ",%%ymm0 \n" \
|
||||
"vpshufb %[kShuffleUYVYUV], %%ymm0, %%ymm0 \n" \
|
||||
"lea " MEMLEA(0x20, [uyvy_buf]) ",%[uyvy_buf] \n"
|
||||
|
||||
// Convert 16 pixels: 16 UV and 16 Y.
|
||||
#define YUVTORGB_AVX2(YuvConstants) \
|
||||
@ -1842,20 +1865,28 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
|
||||
"vpsubw %%ymm1,%%ymm3,%%ymm1 \n" \
|
||||
"vmovdqu " MEMACCESS2(96, [YuvConstants]) ",%%ymm3 \n" \
|
||||
"vpsubw %%ymm0,%%ymm3,%%ymm0 \n" \
|
||||
"vmovdqu " MEMACCESS([y_buf]) ",%%xmm3 \n" \
|
||||
"lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" \
|
||||
"vpermq $0xd8,%%ymm3,%%ymm3 \n" \
|
||||
"vpunpcklbw %%ymm3,%%ymm3,%%ymm3 \n" \
|
||||
"vpmulhuw " MEMACCESS2(192, [YuvConstants]) ",%%ymm3,%%ymm3 \n" \
|
||||
"vpaddsw %%ymm3,%%ymm0,%%ymm0 \n" \
|
||||
"vpaddsw %%ymm3,%%ymm1,%%ymm1 \n" \
|
||||
"vpaddsw %%ymm3,%%ymm2,%%ymm2 \n" \
|
||||
"vpsraw $0x6,%%ymm0,%%ymm0 \n" \
|
||||
"vpsraw $0x6,%%ymm1,%%ymm1 \n" \
|
||||
"vpsraw $0x6,%%ymm2,%%ymm2 \n" \
|
||||
"vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
|
||||
"vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \
|
||||
"vpackuswb %%ymm2,%%ymm2,%%ymm2 \n"
|
||||
"vpmulhuw " MEMACCESS2(192, [YuvConstants]) ",%%ymm4,%%ymm4 \n" \
|
||||
"vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \
|
||||
"vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \
|
||||
"vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" \
|
||||
"vpsraw $0x6,%%ymm0,%%ymm0 \n" \
|
||||
"vpsraw $0x6,%%ymm1,%%ymm1 \n" \
|
||||
"vpsraw $0x6,%%ymm2,%%ymm2 \n" \
|
||||
"vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
|
||||
"vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \
|
||||
"vpackuswb %%ymm2,%%ymm2,%%ymm2 \n"
|
||||
|
||||
// Store 16 ARGB values. Assumes YMM5 is set.
// ymm0 is interleaved with ymm1 and ymm2 with ymm5 (the alpha bytes, which
// the row functions using this macro fill with all ones via vpcmpeqb).  The
// two results are woven into 16 four-byte pixels, 64 bytes are written at
// [dst_argb], and dst_argb is advanced by 0x40.  Overwrites ymm0, ymm1, ymm2.
|
||||
#define STOREARGB_AVX2 \
|
||||
"vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
|
||||
"vpermq $0xd8,%%ymm0,%%ymm0 \n" \
|
||||
"vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" \
|
||||
"vpermq $0xd8,%%ymm2,%%ymm2 \n" \
|
||||
"vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" \
|
||||
"vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" \
|
||||
"vmovdqu %%ymm1," MEMACCESS([dst_argb]) " \n" \
|
||||
"vmovdqu %%ymm0," MEMACCESS2(0x20,[dst_argb]) " \n" \
|
||||
"lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
|
||||
|
||||
#if defined(HAS_I422TOBGRAROW_AVX2)
|
||||
// 16 pixels
|
||||
@ -1916,18 +1947,7 @@ void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf,
|
||||
"1: \n"
|
||||
READYUV422_AVX2
|
||||
YUVTORGB_AVX2(yuvconstants)
|
||||
|
||||
// Step 3: Weave into ARGB
|
||||
"vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" // BG
|
||||
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
|
||||
"vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" // RA
|
||||
"vpermq $0xd8,%%ymm2,%%ymm2 \n"
|
||||
"vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" // BGRA first 8 pixels
|
||||
"vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" // BGRA next 8 pixels
|
||||
|
||||
"vmovdqu %%ymm1," MEMACCESS([dst_argb]) "\n"
|
||||
"vmovdqu %%ymm0," MEMACCESS2(0x20,[dst_argb]) "\n"
|
||||
"lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
|
||||
STOREARGB_AVX2
|
||||
"sub $0x10,%[width] \n"
|
||||
"jg 1b \n"
|
||||
"vzeroupper \n"
|
||||
@ -2027,6 +2047,66 @@ void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf,
|
||||
}
|
||||
#endif // HAS_I422TORGBAROW_AVX2
|
||||
|
||||
#if defined(HAS_YUY2TOARGBROW_AVX2)
|
||||
// 16 pixels.
|
||||
// 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
//
// Converts a row of packed YUY2 directly to ARGB in one inline-asm loop
// (READYUY2_AVX2 -> YUVTORGB_AVX2 -> STOREARGB_AVX2).
//   yuy2_buf:     packed YUY2 input; READYUY2_AVX2 advances it by 32 bytes
//                 per iteration.
//   dst_argb:     ARGB output; STOREARGB_AVX2 advances it by 64 bytes per
//                 iteration.
//   yuvconstants: base address handed to YUVTORGB_AVX2.
//   width:        pixel count.  It is decremented by 16 after each iteration
//                 and the loop repeats while the result is > 0, so the body
//                 always runs at least once, and a width that is not a
//                 multiple of 16 still reads and writes a whole 16-pixel
//                 group.
|
||||
void OMITFP YUY2ToARGBRow_AVX2(const uint8* yuy2_buf,
|
||||
uint8* dst_argb,
|
||||
struct YuvConstants* yuvconstants,
|
||||
int width) {
|
||||
|
||||
asm volatile (
|
||||
// ymm5 = all ones; STOREARGB_AVX2 uses it as the alpha bytes.
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
|
||||
LABELALIGN
|
||||
"1: \n"
|
||||
READYUY2_AVX2
|
||||
YUVTORGB_AVX2(yuvconstants)
|
||||
STOREARGB_AVX2
|
||||
"sub $0x10,%[width] \n"
|
||||
"jg 1b \n"
|
||||
"vzeroupper \n"
|
||||
: [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf]
|
||||
[dst_argb]"+r"(dst_argb), // %[dst_argb]
|
||||
[width]"+rm"(width) // %[width]
|
||||
: [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
|
||||
[kShuffleYUY2Y]"m"(kShuffleYUY2Y),
|
||||
[kShuffleYUY2UV]"m"(kShuffleYUY2UV)
|
||||
// Does not use r14.
|
||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
|
||||
);
|
||||
}
|
||||
#endif // HAS_YUY2TOARGBROW_AVX2
|
||||
|
||||
#if defined(HAS_UYVYTOARGBROW_AVX2)
|
||||
// 16 pixels.
|
||||
// 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
//
// Converts a row of packed UYVY directly to ARGB in one inline-asm loop
// (READUYVY_AVX2 -> YUVTORGB_AVX2 -> STOREARGB_AVX2).
//   uyvy_buf:     packed UYVY input; READUYVY_AVX2 advances it by 32 bytes
//                 per iteration.
//   dst_argb:     ARGB output; STOREARGB_AVX2 advances it by 64 bytes per
//                 iteration.
//   yuvconstants: base address handed to YUVTORGB_AVX2.
//   width:        pixel count.  It is decremented by 16 after each iteration
//                 and the loop repeats while the result is > 0, so the body
//                 always runs at least once, and a width that is not a
//                 multiple of 16 still reads and writes a whole 16-pixel
//                 group.
|
||||
void OMITFP UYVYToARGBRow_AVX2(const uint8* uyvy_buf,
|
||||
uint8* dst_argb,
|
||||
struct YuvConstants* yuvconstants,
|
||||
int width) {
|
||||
|
||||
asm volatile (
|
||||
// ymm5 = all ones; STOREARGB_AVX2 uses it as the alpha bytes.
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
|
||||
LABELALIGN
|
||||
"1: \n"
|
||||
READUYVY_AVX2
|
||||
YUVTORGB_AVX2(yuvconstants)
|
||||
STOREARGB_AVX2
|
||||
"sub $0x10,%[width] \n"
|
||||
"jg 1b \n"
|
||||
"vzeroupper \n"
|
||||
: [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf]
|
||||
[dst_argb]"+r"(dst_argb), // %[dst_argb]
|
||||
[width]"+rm"(width) // %[width]
|
||||
: [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
|
||||
[kShuffleUYVYY]"m"(kShuffleUYVYY),
|
||||
[kShuffleUYVYUV]"m"(kShuffleUYVYUV)
|
||||
// Does not use r14.
|
||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
|
||||
);
|
||||
}
|
||||
#endif // HAS_UYVYTOARGBROW_AVX2
|
||||
|
||||
#ifdef HAS_I400TOARGBROW_SSE2
|
||||
void I400ToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) {
|
||||
asm volatile (
|
||||
|
||||
@ -243,6 +243,30 @@ static const uvec8 kShuffleMaskARGBToRAW_0 = {
|
||||
2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
|
||||
};
|
||||
|
||||
// Byte-shuffle tables for the AVX2 YUY2/UYVY readers.  Each table lists the
// same 16 indices twice, once per 16-byte half of the 32-byte vector
// (vpshufb on a ymm register shuffles each 128-bit lane independently).

// YUY2 shuf 16 Y to 32 Y.
// Selects the even source bytes (0, 2, ..., 14) and writes each one twice.
|
||||
static const lvec8 kShuffleYUY2Y = {
|
||||
0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14,
|
||||
0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14
|
||||
};
|
||||
|
||||
// YUY2 shuf 8 UV to 16 UV.
// Selects the odd byte pairs (1,3), (5,7), ... and writes each pair twice.
|
||||
static const lvec8 kShuffleYUY2UV = {
|
||||
1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15,
|
||||
1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15
|
||||
};
|
||||
|
||||
// UYVY shuf 16 Y to 32 Y.
// Selects the odd source bytes (1, 3, ..., 15) and writes each one twice.
|
||||
static const lvec8 kShuffleUYVYY = {
|
||||
1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15,
|
||||
1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15
|
||||
};
|
||||
|
||||
// UYVY shuf 8 UV to 16 UV.
// Selects the even byte pairs (0,2), (4,6), ... and writes each pair twice.
|
||||
static const lvec8 kShuffleUYVYUV = {
|
||||
0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14,
|
||||
0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14
|
||||
};
|
||||
|
||||
// Duplicates gray value 3 times and fills in alpha opaque.
|
||||
__declspec(naked)
|
||||
void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
|
||||
@ -1899,6 +1923,24 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
||||
__asm lea eax, [eax + 16] \
|
||||
}
|
||||
|
||||
// Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
// Expects the source pointer in eax.  The same 32 bytes are loaded twice:
// ymm4 is shuffled with kShuffleYUY2Y and ymm0 with kShuffleYUY2UV.
|
||||
#define READYUY2_AVX2 __asm { \
|
||||
__asm vmovdqu ymm4, [eax] /* YUY2 */ \
|
||||
__asm vpshufb ymm4, ymm4, ymmword ptr kShuffleYUY2Y \
|
||||
__asm vmovdqu ymm0, [eax] /* UV */ \
|
||||
__asm vpshufb ymm0, ymm0, ymmword ptr kShuffleYUY2UV \
|
||||
__asm lea eax, [eax + 32] /* advance past the 32 bytes just read */ \
|
||||
}
|
||||
|
||||
// Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
// Expects the source pointer in eax.  The same 32 bytes are loaded twice:
// ymm4 is shuffled with kShuffleUYVYY and ymm0 with kShuffleUYVYUV.
|
||||
#define READUYVY_AVX2 __asm { \
|
||||
__asm vmovdqu ymm4, [eax] /* UYVY */ \
|
||||
__asm vpshufb ymm4, ymm4, ymmword ptr kShuffleUYVYY \
|
||||
__asm vmovdqu ymm0, [eax] /* UV */ \
|
||||
__asm vpshufb ymm0, ymm0, ymmword ptr kShuffleUYVYUV \
|
||||
__asm lea eax, [eax + 32] /* advance past the 32 bytes just read */ \
|
||||
}
|
||||
|
||||
// Convert 16 pixels: 16 UV and 16 Y.
|
||||
#define YUVTORGB_AVX2(YuvConstants) __asm { \
|
||||
__asm vpmaddubsw ymm2, ymm0, ymmword ptr [YuvConstants + KUVTOR] /* R UV */\
|
||||
@ -2168,6 +2210,65 @@ void NV12ToARGBRow_AVX2(const uint8* y_buf,
|
||||
}
|
||||
#endif // HAS_NV12TOARGBROW_AVX2
|
||||
|
||||
// 16 pixels.
|
||||
// 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
//
// Naked function: the arguments are read straight from the stack.  Register
// use: eax = src_yuy2, edx = dst_argb, ebp = yuvconstants, ecx = width.
// ecx is decremented by 16 after each iteration and the loop repeats while
// the result is > 0, so the body always runs at least once and a width that
// is not a multiple of 16 still processes a whole 16-pixel group.
// NOTE(review): no #ifdef HAS_YUY2TOARGBROW_AVX2 guard is visible around this
// definition, unlike neighbouring row functions - confirm whether one is
// needed.
|
||||
__declspec(naked)
|
||||
void YUY2ToARGBRow_AVX2(const uint8* src_yuy2,
|
||||
uint8* dst_argb,
|
||||
struct YuvConstants* yuvconstants,
|
||||
int width) {
|
||||
__asm {
|
||||
// ebp is saved here, so every argument offset below carries an extra 4.
push ebp
|
||||
mov eax, [esp + 4 + 4] // yuy2
|
||||
mov edx, [esp + 4 + 8] // argb
|
||||
mov ebp, [esp + 4 + 12] // yuvconstants
|
||||
mov ecx, [esp + 4 + 16] // width
|
||||
vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
|
||||
|
||||
convertloop:
|
||||
READYUY2_AVX2
|
||||
YUVTORGB_AVX2(ebp)
|
||||
STOREARGB_AVX2
|
||||
|
||||
sub ecx, 16
|
||||
jg convertloop
|
||||
|
||||
pop ebp
|
||||
vzeroupper
|
||||
ret
|
||||
}
|
||||
}
|
||||
|
||||
// 16 pixels.
|
||||
// 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
//
// Naked function: the arguments are read straight from the stack.  Register
// use: eax = src_uyvy, edx = dst_argb, ebp = yuvconstants, ecx = width.
// ecx is decremented by 16 after each iteration and the loop repeats while
// the result is > 0, so the body always runs at least once and a width that
// is not a multiple of 16 still processes a whole 16-pixel group.
// NOTE(review): no #ifdef HAS_UYVYTOARGBROW_AVX2 guard is visible around this
// definition, unlike neighbouring row functions - confirm whether one is
// needed.
|
||||
__declspec(naked)
|
||||
void UYVYToARGBRow_AVX2(const uint8* src_uyvy,
|
||||
uint8* dst_argb,
|
||||
struct YuvConstants* yuvconstants,
|
||||
int width) {
|
||||
__asm {
|
||||
// ebp is saved here, so every argument offset below carries an extra 4.
push ebp
|
||||
mov eax, [esp + 4 + 4] // uyvy
|
||||
mov edx, [esp + 4 + 8] // argb
|
||||
mov ebp, [esp + 4 + 12] // yuvconstants
|
||||
mov ecx, [esp + 4 + 16] // width
|
||||
vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
|
||||
|
||||
convertloop:
|
||||
READUYVY_AVX2
|
||||
YUVTORGB_AVX2(ebp)
|
||||
STOREARGB_AVX2
|
||||
|
||||
sub ecx, 16
|
||||
jg convertloop
|
||||
|
||||
pop ebp
|
||||
vzeroupper
|
||||
ret
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#ifdef HAS_I422TOBGRAROW_AVX2
|
||||
// 16 pixels
|
||||
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes).
|
||||
@ -2338,17 +2439,7 @@ void I422ToABGRRow_AVX2(const uint8* y_buf,
|
||||
__asm lea eax, [eax + 8] \
|
||||
}
|
||||
|
||||
// YUY2 shuf 8 Y to 16 Y.
|
||||
static const vec8 kShuffleYUY2Y = {
|
||||
0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14
|
||||
};
|
||||
|
||||
// YUY2 shuf 4 UV to 8 UV.
|
||||
static const vec8 kShuffleYUY2UV = {
|
||||
1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15
|
||||
};
|
||||
|
||||
// Read 4 YUY2 with 8 Y and update 4 UV to 8 UV.
|
||||
// Read 4 YUY2 with 8 Y and upsample 4 UV to 8 UV.
|
||||
#define READYUY2 __asm { \
|
||||
__asm movdqu xmm4, [eax] /* YUY2 */ \
|
||||
__asm pshufb xmm4, xmmword ptr kShuffleYUY2Y \
|
||||
@ -2357,24 +2448,13 @@ static const vec8 kShuffleYUY2UV = {
|
||||
__asm lea eax, [eax + 16] \
|
||||
}
|
||||
|
||||
// UYVY shuf 8 Y to 16 Y.
|
||||
static const vec8 kShuffleUYVYY = {
|
||||
1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15
|
||||
};
|
||||
|
||||
// UYVY shuf 4 UV to 8 UV.
|
||||
static const vec8 kShuffleUYVYUV = {
|
||||
0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14
|
||||
};
|
||||
|
||||
// Read 4 UYVY with 8 Y and update 4 UV to 8 UV.
|
||||
// Read 4 UYVY with 8 Y and upsample 4 UV to 8 UV.
|
||||
#define READUYVY __asm { \
|
||||
__asm movdqu xmm4, [eax] /* UYVY */ \
|
||||
__asm pshufb xmm4, xmmword ptr kShuffleUYVYY \
|
||||
__asm movdqu xmm0, [eax] /* UV */ \
|
||||
__asm pshufb xmm0, xmmword ptr kShuffleUYVYUV \
|
||||
__asm lea eax, [eax + 16] \
|
||||
__asm lea eax, [eax + 8] \
|
||||
}
|
||||
|
||||
// Convert 8 pixels: 8 UV and 8 Y.
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user