SplitRGB for SSE4 and AVX2

libyuv_test '--gunit_filter=*SplitRGB*' --libyuv_width=640 --libyuv_height=360 --libyuv_repeat=100000 --libyuv_flags=-1 --libyuv_cpu_info=-1
Note: Google Test filter = *SplitRGB*

Skylake Xeon x86 32 bit
AVX2  LibYUVPlanarTest.SplitRGBPlane_Opt (4143 ms)
SSE4  LibYUVPlanarTest.SplitRGBPlane_Opt (4543 ms)
SSSE3 LibYUVPlanarTest.SplitRGBPlane_Opt (5346 ms)
C     LibYUVPlanarTest.SplitRGBPlane_Opt (22965 ms)

Skylake Xeon x86 64 bit
AVX2  LibYUVPlanarTest.SplitRGBPlane_Opt (4470 ms)
SSE4  LibYUVPlanarTest.SplitRGBPlane_Opt (4723 ms)
SSSE3 LibYUVPlanarTest.SplitRGBPlane_Opt (5465 ms)
C     LibYUVPlanarTest.SplitRGBPlane_Opt (4707 ms)

Bug: 379186682
Change-Id: Idce67a4ded836f2ee31854aa06f3903e7bcb7791
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6024314
Reviewed-by: richard winterton <rrwinterton@gmail.com>
This commit is contained in:
Frank Barchard 2024-11-14 16:32:25 -08:00
parent 823d960afc
commit 75f7cfdde5
5 changed files with 136 additions and 2 deletions

View File

@ -265,6 +265,7 @@ extern "C" {
#define HAS_SPLITARGBROW_SSE2
#define HAS_SPLITARGBROW_SSSE3
#define HAS_SPLITRGBROW_SSSE3
#define HAS_SPLITRGBROW_SSE41
#define HAS_SPLITXRGBROW_SSE2
#define HAS_SPLITXRGBROW_SSSE3
#define HAS_SWAPUVROW_SSSE3
@ -330,6 +331,7 @@ extern "C" {
#define HAS_P410TOAR30ROW_AVX2
#define HAS_P410TOARGBROW_AVX2
#define HAS_RGBATOYJROW_AVX2
#define HAS_SPLITRGBROW_AVX2
#define HAS_SPLITARGBROW_AVX2
#define HAS_SPLITUVROW_16_AVX2
#define HAS_SPLITXRGBROW_AVX2
@ -2803,6 +2805,16 @@ void SplitRGBRow_SSSE3(const uint8_t* src_rgb,
uint8_t* dst_g,
uint8_t* dst_b,
int width);
// SSE4.1 row kernel: de-interleaves packed 3-byte pixels from src_rgb into
// three planes. dst_r receives the first byte of each pixel, dst_g the second
// and dst_b the third. The kernel consumes 16 pixels (48 source bytes) per
// loop iteration, so width is expected to be a positive multiple of 16.
void SplitRGBRow_SSE41(const uint8_t* src_rgb,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
int width);
// AVX2 row kernel: de-interleaves packed 3-byte pixels from src_rgb into
// three planes. dst_r receives the first byte of each pixel, dst_g the second
// and dst_b the third. The kernel consumes 32 pixels (96 source bytes) per
// loop iteration, so width is expected to be a positive multiple of 32.
void SplitRGBRow_AVX2(const uint8_t* src_rgb,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
int width);
void SplitRGBRow_NEON(const uint8_t* src_rgb,
uint8_t* dst_r,
uint8_t* dst_g,
@ -2818,6 +2830,16 @@ void SplitRGBRow_Any_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_g,
uint8_t* dst_b,
int width);
// Any-width wrapper around SplitRGBRow_SSE41, generated by the ANY13 macro.
// SplitRGBPlane selects it when SSE4.1 is available and width is not a
// multiple of 16.
void SplitRGBRow_Any_SSE41(const uint8_t* src_ptr,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
int width);
// Any-width wrapper around SplitRGBRow_AVX2, generated by the ANY13 macro.
// SplitRGBPlane selects it when AVX2 is available and width is not a
// multiple of 32.
void SplitRGBRow_Any_AVX2(const uint8_t* src_ptr,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
int width);
void SplitRGBRow_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_r,
uint8_t* dst_g,

View File

@ -1285,6 +1285,22 @@ void SplitRGBPlane(const uint8_t* src_rgb,
}
}
#endif
#if defined(HAS_SPLITRGBROW_SSE41)
if (TestCpuFlag(kCpuHasSSE41)) {
SplitRGBRow = SplitRGBRow_Any_SSE41;
if (IS_ALIGNED(width, 16)) {
SplitRGBRow = SplitRGBRow_SSE41;
}
}
#endif
#if defined(HAS_SPLITRGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
SplitRGBRow = SplitRGBRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
SplitRGBRow = SplitRGBRow_AVX2;
}
}
#endif
#if defined(HAS_SPLITRGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
SplitRGBRow = SplitRGBRow_Any_NEON;

View File

@ -2205,6 +2205,12 @@ ANY12PT(SplitUVRow_16_Any_NEON, SplitUVRow_16_NEON, uint16_t, 2, 7)
#ifdef HAS_SPLITRGBROW_SSSE3
ANY13(SplitRGBRow_Any_SSSE3, SplitRGBRow_SSSE3, 3, 15)
#endif
#ifdef HAS_SPLITRGBROW_SSE41
// Defines SplitRGBRow_Any_SSE41 as a wrapper around SplitRGBRow_SSE41.
// NOTE(review): 3 and 15 look like the source bytes per pixel and the
// 16-pixel step mask (16 - 1) of the kernel - confirm against the ANY13
// macro definition.
ANY13(SplitRGBRow_Any_SSE41, SplitRGBRow_SSE41, 3, 15)
#endif
#ifdef HAS_SPLITRGBROW_AVX2
// Defines SplitRGBRow_Any_AVX2 as a wrapper around SplitRGBRow_AVX2.
// NOTE(review): 3 and 31 look like the source bytes per pixel and the
// 32-pixel step mask (32 - 1) of the kernel - confirm against the ANY13
// macro definition.
ANY13(SplitRGBRow_Any_AVX2, SplitRGBRow_AVX2, 3, 31)
#endif
#ifdef HAS_SPLITRGBROW_NEON
ANY13(SplitRGBRow_Any_NEON, SplitRGBRow_NEON, 3, 15)
#endif

View File

@ -5503,6 +5503,98 @@ void SplitRGBRow_SSSE3(const uint8_t* src_rgb,
}
#endif // HAS_SPLITRGBROW_SSSE3
#ifdef HAS_SPLITRGBROW_SSE41
// Blend mask and shuffle tables for converting RGB to Planar, SSE4.1.
// Also read by SplitRGBRow_AVX2, which broadcasts each 16-byte row to both
// 128-bit lanes.
// 16-byte alignment is required because SplitRGBRow_SSE41 reads the rows with
// movdqa and with pshufb memory operands.
alignas(16) static const uvec8 kSplitRGBShuffleSSE41[4] = {
// Row 0: byte-blend mask. Only the top bit of each byte is tested, so the
// 128 entries (byte positions 1, 4, 7, 10, 13) are the selected bytes. The 1
// entries become the top bit of the preceding byte when the kernels shift the
// mask right by one bit, which moves the selection to positions 2, 5, 8, 11,
// 14.
{1u, 128u, 0u, 1u, 128u, 0u, 1u, 128u, 0u, 1u, 128u, 0u, 1u, 128u, 0u, 1u},
// Row 1: pshufb control that puts the blended first-channel bytes (stored to
// dst_r) back into pixel order.
{0u, 3u, 6u, 9u, 12u, 15u, 2u, 5u, 8u, 11u, 14u, 1u, 4u, 7u, 10u, 13u},
// Row 2: same for the second-channel bytes (stored to dst_g).
{1u, 4u, 7u, 10u, 13u, 0u, 3u, 6u, 9u, 12u, 15u, 2u, 5u, 8u, 11u, 14u},
// Row 3: same for the third-channel bytes (stored to dst_b).
{2u, 5u, 8u, 11u, 14u, 1u, 4u, 7u, 10u, 13u, 0u, 3u, 6u, 9u, 12u, 15u}};
// Splits one row of packed 3-byte pixels into three planes using SSE4.1.
// Each loop iteration reads 48 bytes from src_rgb and writes 16 bytes to each
// of dst_r, dst_g and dst_b; dst_r receives the first byte of every pixel,
// dst_g the second and dst_b the third.
// The loop body always runs at least once and steps width by 16, so callers
// are expected to pass a positive multiple of 16 (SplitRGBPlane only selects
// this kernel directly when IS_ALIGNED(width, 16)).
void SplitRGBRow_SSE41(const uint8_t* src_rgb, uint8_t* dst_r,
uint8_t* dst_g, uint8_t* dst_b, int width) {
asm volatile(
"movdqa 0(%5), %%xmm0 \n"  // xmm0 = blend mask; pblendvb reads it implicitly
"1: \n"
"movdqu (%0),%%xmm1 \n"  // source bytes 0..15
"movdqu 0x10(%0),%%xmm2 \n"  // source bytes 16..31
"movdqu 0x20(%0),%%xmm3 \n"  // source bytes 32..47
"lea 0x30(%0),%0 \n"
"movdqa %%xmm1, %%xmm4 \n"  // keep an unblended copy of bytes 0..15
// Pass 1: rotate the bytes at positions 1, 4, 7, 10, 13 between registers.
"pblendvb %%xmm3, %%xmm1 \n"
"pblendvb %%xmm2, %%xmm3 \n"
"pblendvb %%xmm4, %%xmm2 \n"
"psrlq $0x1, %%xmm0 \n"  // mask top bits now at positions 2, 5, 8, 11, 14
// Pass 2: rotate the bytes at positions 2, 5, 8, 11, 14. Afterwards xmm1,
// xmm2 and xmm3 each hold the bytes of a single channel, out of pixel order.
"pblendvb %%xmm2, %%xmm1 \n"
"pblendvb %%xmm3, %%xmm2 \n"
"pblendvb %%xmm4, %%xmm3 \n"
"psllq $0x1, %%xmm0 \n"  // restore the mask's top bits for the next pass
// Reorder each channel's 16 bytes into pixel order.
"pshufb 16(%5), %%xmm1 \n"
"pshufb 32(%5), %%xmm2 \n"
"pshufb 48(%5), %%xmm3 \n"
"movdqu %%xmm1,(%1) \n"
"lea 0x10(%1),%1 \n"
"movdqu %%xmm2,(%2) \n"
"lea 0x10(%2),%2 \n"
"movdqu %%xmm3,(%3) \n"
"lea 0x10(%3),%3 \n"
"sub $0x10,%4 \n"  // 16 pixels done
"jg 1b \n"
: "+r"(src_rgb), // %0
"+r"(dst_r), // %1
"+r"(dst_g), // %2
"+r"(dst_b), // %3
"+r"(width) // %4
: "r"(&kSplitRGBShuffleSSE41[0]) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
#endif // HAS_SPLITRGBROW_SSE41
#ifdef HAS_SPLITRGBROW_AVX2
// Splits one row of packed 3-byte pixels into three planes using AVX2.
// Each loop iteration reads 96 bytes from src_rgb and writes 32 bytes to each
// of dst_r, dst_g and dst_b; dst_r receives the first byte of every pixel,
// dst_g the second and dst_b the third.
// The loop body always runs at least once and steps width by 32, so callers
// are expected to pass a positive multiple of 32 (SplitRGBPlane only selects
// this kernel directly when IS_ALIGNED(width, 32)).
// The blend mask and shuffle controls come from kSplitRGBShuffleSSE41, which
// is defined under HAS_SPLITRGBROW_SSE41; this kernel therefore needs that
// table to be compiled in as well.
void SplitRGBRow_AVX2(const uint8_t* src_rgb, uint8_t* dst_r,
                      uint8_t* dst_g, uint8_t* dst_b, int width) {
  asm volatile(
      // ymm0 = blend mask selecting byte positions 1, 4, 7, ... in each lane;
      // ymm7 = the same mask shifted to positions 2, 5, 8, ...
      "vbroadcasti128 0(%5), %%ymm0 \n"
      "vpsrlq $0x1,%%ymm0,%%ymm7 \n"
      "1: \n"
      "vmovdqu (%0),%%ymm4 \n"      // source bytes 0..31
      "vmovdqu 0x20(%0),%%ymm5 \n"  // source bytes 32..63
      "vmovdqu 0x40(%0),%%ymm6 \n"  // source bytes 64..95
      "lea 0x60(%0),%0 \n"
      // Regroup so the low lanes hold bytes 0..47 (pixels 0..15) and the high
      // lanes hold bytes 48..95 (pixels 16..31), 16 bytes per register.
      "vpblendd $240, %%ymm5, %%ymm4, %%ymm1 \n"
      "vperm2i128 $33, %%ymm6, %%ymm4, %%ymm2 \n"
      "vpblendd $240, %%ymm6, %%ymm5, %%ymm3 \n"
      // Pass 1: rotate the bytes at positions 1, 4, 7, ... between registers.
      "vpblendvb %%ymm0, %%ymm3, %%ymm1, %%ymm4 \n"
      "vpblendvb %%ymm0, %%ymm1, %%ymm2, %%ymm5 \n"
      "vpblendvb %%ymm0, %%ymm2, %%ymm3, %%ymm6 \n"
      // Pass 2: rotate the bytes at positions 2, 5, 8, ... Afterwards ymm1,
      // ymm2 and ymm3 each hold one channel, out of pixel order per lane.
      "vpblendvb %%ymm7, %%ymm5, %%ymm4, %%ymm1 \n"
      "vpblendvb %%ymm7, %%ymm6, %%ymm5, %%ymm2 \n"
      "vpblendvb %%ymm7, %%ymm4, %%ymm6, %%ymm3 \n"
      // All eight registers ymm0..ymm7 are in use, so the shuffle controls are
      // reloaded into ymm4..ymm6 every iteration.
      "vbroadcasti128 16(%5), %%ymm4 \n"
      "vbroadcasti128 32(%5), %%ymm5 \n"
      "vbroadcasti128 48(%5), %%ymm6 \n"
      // Reorder each channel's bytes into pixel order within each lane.
      "vpshufb %%ymm4, %%ymm1, %%ymm1 \n"
      "vpshufb %%ymm5, %%ymm2, %%ymm2 \n"
      "vpshufb %%ymm6, %%ymm3, %%ymm3 \n"
      "vmovdqu %%ymm1,(%1) \n"
      "lea 0x20(%1),%1 \n"
      "vmovdqu %%ymm2,(%2) \n"
      "lea 0x20(%2),%2 \n"
      "vmovdqu %%ymm3,(%3) \n"
      "lea 0x20(%3),%3 \n"
      "sub $0x20,%4 \n"  // 32 pixels done
      "jg 1b \n"
      // Clear the upper YMM halves so legacy-SSE code that runs after this
      // kernel does not pay the AVX/SSE state-transition penalty.
      "vzeroupper \n"
      : "+r"(src_rgb),  // %0
        "+r"(dst_r),    // %1
        "+r"(dst_g),    // %2
        "+r"(dst_b),    // %3
        "+r"(width)     // %4
      : "r"(&kSplitRGBShuffleSSE41[0])  // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif // HAS_SPLITRGBROW_AVX2
#ifdef HAS_MERGERGBROW_SSSE3
// Shuffle table for converting Planar to RGB.
static const uvec8 kMergeRGBShuffle[9] = {

View File

@ -322,8 +322,6 @@ void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr,
static const uvec8 kShuf38 = {0, 3, 6, 8, 11, 14, 16, 19,
22, 24, 27, 30, 0, 0, 0, 0};
static const uvec8 kShuf38_2 = {0, 16, 32, 2, 18, 33, 4, 20,
34, 6, 22, 35, 0, 0, 0, 0};
static const vec16 kMult38_Div664 = {
65536 / 12, 65536 / 12, 65536 / 8, 65536 / 12, 65536 / 12, 65536 / 8, 0, 0};
static const vec16 kMult38_Div996 = {65536 / 18, 65536 / 18, 65536 / 12,