mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-06 08:46:47 +08:00
SplitRGB for SSE4 and AVX2
libyuv_test '--gunit_filter=*SplitRGB*' --libyuv_width=640 --libyuv_height=360 --libyuv_repeat=100000 --libyuv_flags=-1 --libyuv_cpu_info=-1
Note: Google Test filter = *SplitRGB*

Skylake Xeon x86 32 bit
AVX2  LibYUVPlanarTest.SplitRGBPlane_Opt (4143 ms)
SSE4  LibYUVPlanarTest.SplitRGBPlane_Opt (4543 ms)
SSSE3 LibYUVPlanarTest.SplitRGBPlane_Opt (5346 ms)
C     LibYUVPlanarTest.SplitRGBPlane_Opt (22965 ms)

Skylake Xeon x86 64 bit
AVX2  LibYUVPlanarTest.SplitRGBPlane_Opt (4470 ms)
SSE4  LibYUVPlanarTest.SplitRGBPlane_Opt (4723 ms)
SSSE3 LibYUVPlanarTest.SplitRGBPlane_Opt (5465 ms)
C     LibYUVPlanarTest.SplitRGBPlane_Opt (4707 ms)

Bug: 379186682
Change-Id: Idce67a4ded836f2ee31854aa06f3903e7bcb7791
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6024314
Reviewed-by: richard winterton <rrwinterton@gmail.com>
This commit is contained in:
parent
823d960afc
commit
75f7cfdde5
@ -265,6 +265,7 @@ extern "C" {
|
||||
#define HAS_SPLITARGBROW_SSE2
|
||||
#define HAS_SPLITARGBROW_SSSE3
|
||||
#define HAS_SPLITRGBROW_SSSE3
|
||||
#define HAS_SPLITRGBROW_SSE41
|
||||
#define HAS_SPLITXRGBROW_SSE2
|
||||
#define HAS_SPLITXRGBROW_SSSE3
|
||||
#define HAS_SWAPUVROW_SSSE3
|
||||
@ -330,6 +331,7 @@ extern "C" {
|
||||
#define HAS_P410TOAR30ROW_AVX2
|
||||
#define HAS_P410TOARGBROW_AVX2
|
||||
#define HAS_RGBATOYJROW_AVX2
|
||||
#define HAS_SPLITRGBROW_AVX2
|
||||
#define HAS_SPLITARGBROW_AVX2
|
||||
#define HAS_SPLITUVROW_16_AVX2
|
||||
#define HAS_SPLITXRGBROW_AVX2
|
||||
@ -2803,6 +2805,16 @@ void SplitRGBRow_SSSE3(const uint8_t* src_rgb,
|
||||
uint8_t* dst_g,
|
||||
uint8_t* dst_b,
|
||||
int width);
|
||||
void SplitRGBRow_SSE41(const uint8_t* src_rgb,
|
||||
uint8_t* dst_r,
|
||||
uint8_t* dst_g,
|
||||
uint8_t* dst_b,
|
||||
int width);
|
||||
void SplitRGBRow_AVX2(const uint8_t* src_rgb,
|
||||
uint8_t* dst_r,
|
||||
uint8_t* dst_g,
|
||||
uint8_t* dst_b,
|
||||
int width);
|
||||
void SplitRGBRow_NEON(const uint8_t* src_rgb,
|
||||
uint8_t* dst_r,
|
||||
uint8_t* dst_g,
|
||||
@ -2818,6 +2830,16 @@ void SplitRGBRow_Any_SSSE3(const uint8_t* src_ptr,
|
||||
uint8_t* dst_g,
|
||||
uint8_t* dst_b,
|
||||
int width);
|
||||
void SplitRGBRow_Any_SSE41(const uint8_t* src_ptr,
|
||||
uint8_t* dst_r,
|
||||
uint8_t* dst_g,
|
||||
uint8_t* dst_b,
|
||||
int width);
|
||||
void SplitRGBRow_Any_AVX2(const uint8_t* src_ptr,
|
||||
uint8_t* dst_r,
|
||||
uint8_t* dst_g,
|
||||
uint8_t* dst_b,
|
||||
int width);
|
||||
void SplitRGBRow_Any_NEON(const uint8_t* src_ptr,
|
||||
uint8_t* dst_r,
|
||||
uint8_t* dst_g,
|
||||
|
||||
@ -1285,6 +1285,22 @@ void SplitRGBPlane(const uint8_t* src_rgb,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SPLITRGBROW_SSE41)
|
||||
if (TestCpuFlag(kCpuHasSSE41)) {
|
||||
SplitRGBRow = SplitRGBRow_Any_SSE41;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
SplitRGBRow = SplitRGBRow_SSE41;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SPLITRGBROW_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
SplitRGBRow = SplitRGBRow_Any_AVX2;
|
||||
if (IS_ALIGNED(width, 32)) {
|
||||
SplitRGBRow = SplitRGBRow_AVX2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SPLITRGBROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
SplitRGBRow = SplitRGBRow_Any_NEON;
|
||||
|
||||
@ -2205,6 +2205,12 @@ ANY12PT(SplitUVRow_16_Any_NEON, SplitUVRow_16_NEON, uint16_t, 2, 7)
|
||||
#ifdef HAS_SPLITRGBROW_SSSE3
|
||||
ANY13(SplitRGBRow_Any_SSSE3, SplitRGBRow_SSSE3, 3, 15)
|
||||
#endif
|
||||
#ifdef HAS_SPLITRGBROW_SSE41
|
||||
ANY13(SplitRGBRow_Any_SSE41, SplitRGBRow_SSE41, 3, 15)
|
||||
#endif
|
||||
#ifdef HAS_SPLITRGBROW_AVX2
|
||||
ANY13(SplitRGBRow_Any_AVX2, SplitRGBRow_AVX2, 3, 31)
|
||||
#endif
|
||||
#ifdef HAS_SPLITRGBROW_NEON
|
||||
ANY13(SplitRGBRow_Any_NEON, SplitRGBRow_NEON, 3, 15)
|
||||
#endif
|
||||
|
||||
@ -5503,6 +5503,98 @@ void SplitRGBRow_SSSE3(const uint8_t* src_rgb,
|
||||
}
|
||||
#endif // HAS_SPLITRGBROW_SSSE3
|
||||
|
||||
#ifdef HAS_SPLITRGBROW_SSE41
|
||||
// Shuffle table for converting RGB to Planar, SSE4.1.
|
||||
alignas(16) static const uvec8 kSplitRGBShuffleSSE41[4] = {
|
||||
{1u, 128u, 0u, 1u, 128u, 0u, 1u, 128u, 0u, 1u, 128u, 0u, 1u, 128u, 0u, 1u},
|
||||
{0u, 3u, 6u, 9u, 12u, 15u, 2u, 5u, 8u, 11u, 14u, 1u, 4u, 7u, 10u, 13u},
|
||||
{1u, 4u, 7u, 10u, 13u, 0u, 3u, 6u, 9u, 12u, 15u, 2u, 5u, 8u, 11u, 14u},
|
||||
{2u, 5u, 8u, 11u, 14u, 1u, 4u, 7u, 10u, 13u, 0u, 3u, 6u, 9u, 12u, 15u}};
|
||||
|
||||
void SplitRGBRow_SSE41(const uint8_t* src_rgb, uint8_t* dst_r,
|
||||
uint8_t* dst_g, uint8_t* dst_b, int width) {
|
||||
asm volatile(
|
||||
"movdqa 0(%5), %%xmm0 \n"
|
||||
"1: \n"
|
||||
"movdqu (%0),%%xmm1 \n"
|
||||
"movdqu 0x10(%0),%%xmm2 \n"
|
||||
"movdqu 0x20(%0),%%xmm3 \n"
|
||||
"lea 0x30(%0),%0 \n"
|
||||
"movdqa %%xmm1, %%xmm4 \n"
|
||||
"pblendvb %%xmm3, %%xmm1 \n"
|
||||
"pblendvb %%xmm2, %%xmm3 \n"
|
||||
"pblendvb %%xmm4, %%xmm2 \n"
|
||||
"psrlq $0x1, %%xmm0 \n"
|
||||
"pblendvb %%xmm2, %%xmm1 \n"
|
||||
"pblendvb %%xmm3, %%xmm2 \n"
|
||||
"pblendvb %%xmm4, %%xmm3 \n"
|
||||
"psllq $0x1, %%xmm0 \n"
|
||||
"pshufb 16(%5), %%xmm1 \n"
|
||||
"pshufb 32(%5), %%xmm2 \n"
|
||||
"pshufb 48(%5), %%xmm3 \n"
|
||||
"movdqu %%xmm1,(%1) \n"
|
||||
"lea 0x10(%1),%1 \n"
|
||||
"movdqu %%xmm2,(%2) \n"
|
||||
"lea 0x10(%2),%2 \n"
|
||||
"movdqu %%xmm3,(%3) \n"
|
||||
"lea 0x10(%3),%3 \n"
|
||||
"sub $0x10,%4 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src_rgb), // %0
|
||||
"+r"(dst_r), // %1
|
||||
"+r"(dst_g), // %2
|
||||
"+r"(dst_b), // %3
|
||||
"+r"(width) // %4
|
||||
: "r"(&kSplitRGBShuffleSSE41[0]) // %5
|
||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
|
||||
}
|
||||
#endif // HAS_SPLITRGBROW_SSE41
|
||||
|
||||
#ifdef HAS_SPLITRGBROW_AVX2
|
||||
void SplitRGBRow_AVX2(const uint8_t* src_rgb, uint8_t* dst_r,
|
||||
uint8_t* dst_g, uint8_t* dst_b, int width) {
|
||||
asm volatile(
|
||||
"vbroadcasti128 0(%5), %%ymm0 \n"
|
||||
"vpsrlq $0x1,%%ymm0,%%ymm7 \n"
|
||||
"1: \n"
|
||||
"vmovdqu (%0),%%ymm4 \n"
|
||||
"vmovdqu 0x20(%0),%%ymm5 \n"
|
||||
"vmovdqu 0x40(%0),%%ymm6 \n"
|
||||
"lea 0x60(%0),%0 \n"
|
||||
"vpblendd $240, %%ymm5, %%ymm4, %%ymm1 \n"
|
||||
"vperm2i128 $33, %%ymm6, %%ymm4, %%ymm2 \n"
|
||||
"vpblendd $240, %%ymm6, %%ymm5, %%ymm3 \n"
|
||||
"vpblendvb %%ymm0, %%ymm3, %%ymm1, %%ymm4 \n"
|
||||
"vpblendvb %%ymm0, %%ymm1, %%ymm2, %%ymm5 \n"
|
||||
"vpblendvb %%ymm0, %%ymm2, %%ymm3, %%ymm6 \n"
|
||||
"vpblendvb %%ymm7, %%ymm5, %%ymm4, %%ymm1 \n"
|
||||
"vpblendvb %%ymm7, %%ymm6, %%ymm5, %%ymm2 \n"
|
||||
"vpblendvb %%ymm7, %%ymm4, %%ymm6, %%ymm3 \n"
|
||||
"vbroadcasti128 16(%5), %%ymm4 \n"
|
||||
"vbroadcasti128 32(%5), %%ymm5 \n"
|
||||
"vbroadcasti128 48(%5), %%ymm6 \n"
|
||||
"vpshufb %%ymm4, %%ymm1, %%ymm1 \n"
|
||||
"vpshufb %%ymm5, %%ymm2, %%ymm2 \n"
|
||||
"vpshufb %%ymm6, %%ymm3, %%ymm3 \n"
|
||||
"vmovdqu %%ymm1,(%1) \n"
|
||||
"lea 0x20(%1),%1 \n"
|
||||
"vmovdqu %%ymm2,(%2) \n"
|
||||
"lea 0x20(%2),%2 \n"
|
||||
"vmovdqu %%ymm3,(%3) \n"
|
||||
"lea 0x20(%3),%3 \n"
|
||||
"sub $0x20,%4 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src_rgb), // %0
|
||||
"+r"(dst_r), // %1
|
||||
"+r"(dst_g), // %2
|
||||
"+r"(dst_b), // %3
|
||||
"+r"(width) // %4
|
||||
: "r"(&kSplitRGBShuffleSSE41[0]) // %5
|
||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
|
||||
"xmm7");
|
||||
}
|
||||
#endif // HAS_SPLITRGBROW_AVX2
|
||||
|
||||
#ifdef HAS_MERGERGBROW_SSSE3
|
||||
// Shuffle table for converting Planar to RGB.
|
||||
static const uvec8 kMergeRGBShuffle[9] = {
|
||||
|
||||
@ -322,8 +322,6 @@ void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr,
|
||||
|
||||
static const uvec8 kShuf38 = {0, 3, 6, 8, 11, 14, 16, 19,
|
||||
22, 24, 27, 30, 0, 0, 0, 0};
|
||||
static const uvec8 kShuf38_2 = {0, 16, 32, 2, 18, 33, 4, 20,
|
||||
34, 6, 22, 35, 0, 0, 0, 0};
|
||||
static const vec16 kMult38_Div664 = {
|
||||
65536 / 12, 65536 / 12, 65536 / 8, 65536 / 12, 65536 / 12, 65536 / 8, 0, 0};
|
||||
static const vec16 kMult38_Div996 = {65536 / 18, 65536 / 18, 65536 / 12,
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user