SwapUV AVX2 and SSSE3

Based on ARGBShuffle but with count adjusted and new shuffle mask

BUG=libyuv:809

Change-Id: Idd936ee6bedcf285607a68c2fc54d876b4becc01
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/1711882
Reviewed-by: richard winterton <rrwinterton@gmail.com>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
Frank Barchard 2019-07-20 20:40:44 -07:00 committed by Commit Bot
parent 22ae4bfa05
commit fec9121b67
7 changed files with 93 additions and 2 deletions

View File

@ -1,6 +1,6 @@
Name: libyuv Name: libyuv
URL: http://code.google.com/p/libyuv/ URL: http://code.google.com/p/libyuv/
Version: 1733 Version: 1734
License: BSD License: BSD
License File: LICENSE License File: LICENSE

View File

@ -275,6 +275,7 @@ extern "C" {
#define HAS_I422TOAR30ROW_SSSE3 #define HAS_I422TOAR30ROW_SSSE3
#define HAS_MERGERGBROW_SSSE3 #define HAS_MERGERGBROW_SSSE3
#define HAS_SPLITRGBROW_SSSE3 #define HAS_SPLITRGBROW_SSSE3
#define HAS_SWAPUVROW_SSSE3
#endif #endif
// The following are available for AVX2 gcc/clang x86 platforms: // The following are available for AVX2 gcc/clang x86 platforms:
@ -295,6 +296,7 @@ extern "C" {
#define HAS_I422TOYUY2ROW_AVX2 #define HAS_I422TOYUY2ROW_AVX2
#define HAS_MERGEUVROW_16_AVX2 #define HAS_MERGEUVROW_16_AVX2
#define HAS_MULTIPLYROW_16_AVX2 #define HAS_MULTIPLYROW_16_AVX2
#define HAS_SWAPUVROW_AVX2
// TODO(fbarchard): Fix AVX2 version of YUV24 // TODO(fbarchard): Fix AVX2 version of YUV24
// #define HAS_NV21TOYUV24ROW_AVX2 // #define HAS_NV21TOYUV24ROW_AVX2
#endif #endif
@ -3374,6 +3376,10 @@ void UYVYToUV422Row_Any_MMI(const uint8_t* src_ptr,
void SwapUVRow_C(const uint8_t* src_uv, uint8_t* dst_vu, int width); void SwapUVRow_C(const uint8_t* src_uv, uint8_t* dst_vu, int width);
void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width); void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width);
void SwapUVRow_Any_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width); void SwapUVRow_Any_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width);
void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width);
void SwapUVRow_Any_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width);
void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width);
void SwapUVRow_Any_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width);
void AYUVToYRow_C(const uint8_t* src_ayuv, uint8_t* dst_y, int width); void AYUVToYRow_C(const uint8_t* src_ayuv, uint8_t* dst_y, int width);
void AYUVToUVRow_C(const uint8_t* src_ayuv, void AYUVToUVRow_C(const uint8_t* src_ayuv,
int stride_ayuv, int stride_ayuv,

View File

@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ #ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1733 #define LIBYUV_VERSION 1734
#endif // INCLUDE_LIBYUV_VERSION_H_ #endif // INCLUDE_LIBYUV_VERSION_H_

View File

@ -527,6 +527,22 @@ void SwapUVPlane(const uint8_t* src_uv,
src_stride_uv = dst_stride_vu = 0; src_stride_uv = dst_stride_vu = 0;
} }
#if defined(HAS_SWAPUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
SwapUVRow = SwapUVRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
SwapUVRow = SwapUVRow_SSSE3;
}
}
#endif
#if defined(HAS_SWAPUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
SwapUVRow = SwapUVRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
SwapUVRow = SwapUVRow_AVX2;
}
}
#endif
#if defined(HAS_SWAPUVROW_NEON) #if defined(HAS_SWAPUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
SwapUVRow = SwapUVRow_Any_NEON; SwapUVRow = SwapUVRow_Any_NEON;

View File

@ -710,6 +710,12 @@ ANY11(UYVYToYRow_Any_MMI, UYVYToYRow_MMI, 1, 4, 1, 15)
#ifdef HAS_AYUVTOYROW_NEON #ifdef HAS_AYUVTOYROW_NEON
ANY11(AYUVToYRow_Any_NEON, AYUVToYRow_NEON, 0, 4, 1, 15) ANY11(AYUVToYRow_Any_NEON, AYUVToYRow_NEON, 0, 4, 1, 15)
#endif #endif
#ifdef HAS_SWAPUVROW_SSSE3
ANY11(SwapUVRow_Any_SSSE3, SwapUVRow_SSSE3, 0, 2, 2, 15)
#endif
#ifdef HAS_SWAPUVROW_AVX2
ANY11(SwapUVRow_Any_AVX2, SwapUVRow_AVX2, 0, 2, 2, 31)
#endif
#ifdef HAS_SWAPUVROW_NEON #ifdef HAS_SWAPUVROW_NEON
ANY11(SwapUVRow_Any_NEON, SwapUVRow_NEON, 0, 2, 2, 15) ANY11(SwapUVRow_Any_NEON, SwapUVRow_NEON, 0, 2, 2, 15)
#endif #endif

View File

@ -6790,6 +6790,68 @@ void NV21ToYUV24Row_AVX2(const uint8_t* src_y,
} }
#endif // HAS_NV21TOYUV24ROW_AVX2 #endif // HAS_NV21TOYUV24ROW_AVX2
#ifdef HAS_SWAPUVROW_SSSE3
// Shuffle table for reversing the bytes.
static const uvec8 kShuffleUVToVU = {1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u,
9u, 8u, 11u, 10u, 13u, 12u, 15u, 14u};
void SwapUVRow_SSSE3(const uint8_t* src_uv,
uint8_t* dst_vu,
int width) {
asm volatile(
"movdqu %3,%%xmm5 \n"
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
"lea 0x20(%0),%0 \n"
"pshufb %%xmm5,%%xmm0 \n"
"pshufb %%xmm5,%%xmm1 \n"
"movdqu %%xmm0,(%1) \n"
"movdqu %%xmm1,0x10(%1) \n"
"lea 0x20(%1),%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_vu), // %1
"+r"(width) // %2
: "m"(kShuffleUVToVU) // %3
: "memory", "cc", "xmm0", "xmm1", "xmm5");
}
#endif // HAS_SWAPUVROW_SSSE3
#ifdef HAS_SWAPUVROW_AVX2
void SwapUVRow_AVX2(const uint8_t* src_uv,
uint8_t* dst_vu,
int width) {
asm volatile(
"vbroadcastf128 %3,%%ymm5 \n"
LABELALIGN
"1: \n"
"vmovdqu (%0),%%ymm0 \n"
"vmovdqu 0x20(%0),%%ymm1 \n"
"lea 0x40(%0),%0 \n"
"vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
"vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
"vmovdqu %%ymm0,(%1) \n"
"vmovdqu %%ymm1,0x20(%1) \n"
"lea 0x40(%1),%1 \n"
"sub $0x20,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_uv), // %0
"+r"(dst_vu), // %1
"+r"(width) // %2
: "m"(kShuffleUVToVU) // %3
: "memory", "cc", "xmm0", "xmm1", "xmm5");
}
#endif // HAS_SWAPUVROW_AVX2
#endif // defined(__x86_64__) || defined(__i386__) #endif // defined(__x86_64__) || defined(__i386__)
#ifdef __cplusplus #ifdef __cplusplus

View File

@ -994,6 +994,7 @@ TESTATOPLANAR(I400, 1, 1, I420, 2, 2, 2)
TESTATOPLANAR(J400, 1, 1, J420, 2, 2, 2) TESTATOPLANAR(J400, 1, 1, J420, 2, 2, 2)
TESTATOPLANAR(RAW, 3, 1, I420, 2, 2, 4) TESTATOPLANAR(RAW, 3, 1, I420, 2, 2, 4)
TESTATOPLANAR(RGB24, 3, 1, I420, 2, 2, 4) TESTATOPLANAR(RGB24, 3, 1, I420, 2, 2, 4)
// TODO(fbarchard): Investigate high error on Win32.
TESTATOPLANAR(RGB24, 3, 1, J420, 2, 2, 10) TESTATOPLANAR(RGB24, 3, 1, J420, 2, 2, 10)
TESTATOPLANAR(RGB565, 2, 1, I420, 2, 2, 5) TESTATOPLANAR(RGB565, 2, 1, I420, 2, 2, 5)
TESTATOPLANAR(RGBA, 4, 1, I420, 2, 2, 4) TESTATOPLANAR(RGBA, 4, 1, I420, 2, 2, 4)