UVScale down use AVX2 and Neon for aarch32

Intel SkylakeX
Was SSSE3 UVScaleDownBy4_Box (2496 ms)
Now AVX2  UVScaleDownBy4_Box (1983 ms)

Was SSSE3 UVScaleDownBy2_Box (380 ms)
Now AVX2  UVScaleDownBy2_Box (360 ms)

Pixel 4 aarch32
Was UVScaleDownBy4_Box (4295 ms)
Now UVScaleDownBy4_Box (3307 ms)

Was UVScaleDownBy2_Box (1022 ms)
Now UVScaleDownBy2_Box (778 ms)

Bug: libyuv:838
Change-Id: Ic823fa15e5761c1b9a897da27341adbf1ed39883
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2470196
Reviewed-by: richard winterton <rrwinterton@gmail.com>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
Frank Barchard 2020-10-13 19:56:28 -07:00 committed by Commit Bot
parent 725c64015d
commit a4ec5cf9c2
4 changed files with 83 additions and 4 deletions

View File

@ -79,6 +79,15 @@ extern "C" {
#define HAS_SCALEUVROWDOWN2BOX_SSSE3
#endif
// The following are available for gcc/clang x86 platforms, but
// require clang 3.4 or gcc 4.7.
// Enabled only when all of these hold: x86 code is not disabled via
// LIBYUV_DISABLE_X86, the target is x86-64 or i386, the compiler is not
// Visual C (_MSC_VER undefined), and either CLANG_HAS_AVX2 or GCC_HAS_AVX2
// is defined (both are set outside this block).
// TODO(fbarchard): Port to Visual C
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(__x86_64__) || defined(__i386__)) && !defined(_MSC_VER) && \
(defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
#define HAS_SCALEUVROWDOWN2BOX_AVX2
#endif
// The following are available on all x86 platforms, but
// require VS2012, clang 3.4 or gcc 4.7.
// The code supports NaCL but requires a new compiler and validator.
@ -103,10 +112,6 @@ extern "C" {
#define HAS_SCALEROWDOWN34_NEON
#define HAS_SCALEROWDOWN38_NEON
#define HAS_SCALEROWDOWN4_NEON
#endif
// The following are available on 64 bit Neon platforms:
// (gated on __aarch64__, so this block never defines the macro for 32 bit
// ARM builds; LIBYUV_DISABLE_NEON turns it off entirely.)
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#define HAS_SCALEUVROWDOWN2BOX_NEON
#endif
@ -854,6 +859,10 @@ void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_uv,
int dst_width);
// AVX2 row function for the 2x box downscale of UV data.
// src_ptr:    first of the two source rows that are averaged.
// src_stride: byte offset from src_ptr to the second source row.
// dst_uv:     destination row.
// dst_width:  number of output UV pixels; the implementation consumes it in
//             steps of 8, so callers select this variant only when
//             dst_width is a multiple of 8.
void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_uv,
int dst_width);
void ScaleUVRowDown2_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
@ -902,6 +911,10 @@ void ScaleUVRowDown2Box_Any_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width);
// "Any width" companion of ScaleUVRowDown2Box_AVX2, instantiated through the
// SDANY macro with ScaleUVRowDown2Box_C as the fallback row function.
// Callers pick this variant when dst_width is not a multiple of 8.
void ScaleUVRowDown2Box_Any_AVX2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width);
void ScaleUVRowDown2_Any_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,

View File

@ -78,6 +78,14 @@ SDANY(ScaleUVRowDown2Box_Any_SSSE3,
2,
4)
#endif
#ifdef HAS_SCALEUVROWDOWN2BOX_AVX2
// Any-width wrapper for the AVX2 UV box filter.
// Arguments after the three function names: scale factor 2, 2 bytes per UV
// pixel, and the width remainder mask. ScaleUVRowDown2Box_AVX2 produces
// 8 UV pixels per loop iteration, so the mask is 7 (step - 1), following the
// same convention as ScaleRowDown2_Any_AVX2 (mask 31). A mask of 8 would let
// widths that are not a multiple of 8 reach the SIMD kernel, which always
// stores a full 16-byte group and would write past the requested width.
SDANY(ScaleUVRowDown2Box_Any_AVX2,
      ScaleUVRowDown2Box_AVX2,
      ScaleUVRowDown2Box_C,
      2,
      2,
      7)
#endif
#ifdef HAS_SCALEROWDOWN2_AVX2
SDANY(ScaleRowDown2_Any_AVX2, ScaleRowDown2_AVX2, ScaleRowDown2_C, 2, 1, 31)
SDANY(ScaleRowDown2Linear_Any_AVX2,

View File

@ -1412,6 +1412,48 @@ void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr,
}
#endif // HAS_SCALEUVROWDOWN2BOX_SSSE3
#ifdef HAS_SCALEUVROWDOWN2BOX_AVX2
// 2x box downscale of one UV row pair using AVX2 inline assembly.
//
// Each loop iteration loads 32 bytes from src_ptr and 32 bytes from
// src_ptr + src_stride, adds horizontally adjacent byte pairs in each row,
// adds the two rows, rounds the 4-sample sum as (sum + 2) >> 2 and stores
// 16 bytes to dst_ptr.
//
// dst_width is decremented by 8 per iteration and the loop repeats while the
// result is greater than zero, so a width that is not a multiple of 8 still
// reads a full 32-byte group per row and writes a full 16-byte group on the
// final pass. The dispatch code uses the _Any_ wrapper for such widths.
//
// NOTE(review): the byte orders produced by kShuffleSplitUV and
// kShuffleMergeUV are defined outside this block; the U/V layout comments
// below rely on them - confirm against the table definitions.
void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
// Build ymm4 with every byte equal to 1: all-ones, then each 16-bit word
// shifted right by 15 (-> 0x0001), then packed down to bytes.
"vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // all bits set
"vpsrlw $0xf,%%ymm4,%%ymm4 \n" // words = 0x0001
"vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" // bytes = 0x01
"vpxor %%ymm5,%%ymm5,%%ymm5 \n" // zero, used by vpavgw below
"vbroadcastf128 %4,%%ymm1 \n" // split shuffler in both 128-bit lanes
"vbroadcastf128 %5,%%ymm3 \n" // merge shuffler in both 128-bit lanes
LABELALIGN
"1: \n"
"vmovdqu (%0),%%ymm0 \n" // 32 bytes (16 UV) of row 0
"vmovdqu 0x00(%0,%3,1),%%ymm2 \n" // 32 bytes (16 UV) of row 1
"lea 0x20(%0),%0 \n" // src_ptr += 32
"vpshufb %%ymm1,%%ymm0,%%ymm0 \n" // regroup each lane as uuuuvvvv
"vpshufb %%ymm1,%%ymm2,%%ymm2 \n"
// Multiplying unsigned bytes by 1 and adding adjacent products yields the
// sum of each horizontal byte pair as a 16-bit word.
"vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" // horizontal add
"vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
"vpaddw %%ymm2,%%ymm0,%%ymm0 \n" // vertical add: 4-sample sum
// (sum >> 1) followed by average-with-zero ((x + 1) >> 1) equals
// (sum + 2) >> 2, i.e. the rounded mean of the four samples.
"vpsrlw $0x1,%%ymm0,%%ymm0 \n" // sum >> 1
"vpavgw %%ymm5,%%ymm0,%%ymm0 \n" // (x + 0 + 1) >> 1
"vpshufb %%ymm3,%%ymm0,%%ymm0 \n" // merge uv
// 0xd8 selects qwords 0,2,1,3, moving the low qword of each 128-bit lane
// into the low 128 bits that are stored next.
"vpermq $0xd8,%%ymm0,%%ymm0 \n" // combine qwords
"vmovdqu %%xmm0,(%1) \n" // store 16 bytes
"lea 0x10(%1),%1 \n" // dst_ptr += 16 (8 UV)
"sub $0x8,%2 \n" // dst_width -= 8
"jg 1b \n" // loop while dst_width > 0
"vzeroupper \n" // clear upper halves of the ymm registers
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)), // %3
"m"(kShuffleSplitUV), // %4
"m"(kShuffleMergeUV) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif // HAS_SCALEUVROWDOWN2BOX_AVX2
#endif // defined(__x86_64__) || defined(__i386__)
#ifdef __cplusplus

View File

@ -96,6 +96,14 @@ static void ScaleUVDown2(int src_width,
}
}
#endif
#if defined(HAS_SCALEUVROWDOWN2BOX_AVX2)
if (TestCpuFlag(kCpuHasAVX2) && filtering) {
ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_AVX2;
if (IS_ALIGNED(dst_width, 8)) {
ScaleUVRowDown2 = ScaleUVRowDown2Box_AVX2;
}
}
#endif
#if defined(HAS_SCALEUVROWDOWN2BOX_NEON)
if (TestCpuFlag(kCpuHasNEON) && filtering) {
ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_NEON;
@ -223,6 +231,14 @@ static void ScaleUVDown4Box(int src_width,
}
}
#endif
#if defined(HAS_SCALEUVROWDOWN2BOX_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_AVX2;
if (IS_ALIGNED(dst_width, 8)) {
ScaleUVRowDown2 = ScaleUVRowDown2Box_AVX2;
}
}
#endif
#if defined(HAS_SCALEUVROWDOWN2BOX_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_NEON;