mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-08 01:36:47 +08:00
UVScale down use AVX2 and Neon for aarch32
Intel SkylakeX: Was SSSE3 UVScaleDownBy4_Box (2496 ms), Now AVX2 UVScaleDownBy4_Box (1983 ms); Was SSSE3 UVScaleDownBy2_Box (380 ms), Now AVX2 UVScaleDownBy2_Box (360 ms). Pixel 4 aarch32: Was UVScaleDownBy4_Box (4295 ms), Now UVScaleDownBy4_Box (3307 ms); Was UVScaleDownBy2_Box (1022 ms), Now UVScaleDownBy2_Box (778 ms). Bug: libyuv:838 Change-Id: Ic823fa15e5761c1b9a897da27341adbf1ed39883 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2470196 Reviewed-by: richard winterton <rrwinterton@gmail.com> Commit-Queue: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
parent
725c64015d
commit
a4ec5cf9c2
@ -79,6 +79,15 @@ extern "C" {
|
||||
#define HAS_SCALEUVROWDOWN2BOX_SSSE3
|
||||
#endif
|
||||
|
||||
// The following are available for gcc/clang x86 platforms, but
|
||||
// require clang 3.4 or gcc 4.7.
|
||||
// TODO(fbarchard): Port to Visual C
|
||||
#if !defined(LIBYUV_DISABLE_X86) && \
|
||||
(defined(__x86_64__) || defined(__i386__)) && !defined(_MSC_VER) && \
|
||||
(defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
|
||||
#define HAS_SCALEUVROWDOWN2BOX_AVX2
|
||||
#endif
|
||||
|
||||
// The following are available on all x86 platforms, but
|
||||
// require VS2012, clang 3.4 or gcc 4.7.
|
||||
// The code supports NaCL but requires a new compiler and validator.
|
||||
@ -103,10 +112,6 @@ extern "C" {
|
||||
#define HAS_SCALEROWDOWN34_NEON
|
||||
#define HAS_SCALEROWDOWN38_NEON
|
||||
#define HAS_SCALEROWDOWN4_NEON
|
||||
#endif
|
||||
|
||||
// The following are available on 64 bit Neon platforms:
|
||||
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
|
||||
#define HAS_SCALEUVROWDOWN2BOX_NEON
|
||||
#endif
|
||||
|
||||
@ -854,6 +859,10 @@ void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint8_t* dst_uv,
|
||||
int dst_width);
|
||||
void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint8_t* dst_uv,
|
||||
int dst_width);
|
||||
void ScaleUVRowDown2_NEON(const uint8_t* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint8_t* dst,
|
||||
@ -902,6 +911,10 @@ void ScaleUVRowDown2Box_Any_SSSE3(const uint8_t* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint8_t* dst_ptr,
|
||||
int dst_width);
|
||||
void ScaleUVRowDown2Box_Any_AVX2(const uint8_t* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint8_t* dst_ptr,
|
||||
int dst_width);
|
||||
void ScaleUVRowDown2_Any_NEON(const uint8_t* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint8_t* dst_ptr,
|
||||
|
||||
@ -78,6 +78,14 @@ SDANY(ScaleUVRowDown2Box_Any_SSSE3,
|
||||
2,
|
||||
4)
|
||||
#endif
|
||||
#ifdef HAS_SCALEUVROWDOWN2BOX_AVX2
|
||||
// Any-width wrapper around ScaleUVRowDown2Box_AVX2, with ScaleUVRowDown2Box_C
// as the scalar fallback for the leftover pixels.
// Arguments after the function names: scale factor 2, 2 bytes per UV pixel,
// and the width mask. The AVX2 kernel emits 8 UV pairs per loop iteration and
// is only dispatched directly when dst_width is a multiple of 8, so the mask
// is 8 - 1 = 7 (same "step - 1" convention as the 31 used for the 32-pixel
// ScaleRowDown2_Any_AVX2). A mask of 8 would hand the SIMD kernel widths that
// are not multiples of 8, making it read and write past the row.
// NOTE(review): SDANY is defined outside this view - confirm it treats the
// last argument as (step - 1).
SDANY(ScaleUVRowDown2Box_Any_AVX2,
|
||||
ScaleUVRowDown2Box_AVX2,
|
||||
ScaleUVRowDown2Box_C,
|
||||
2,
|
||||
2,
|
||||
7)
|
||||
#endif
|
||||
#ifdef HAS_SCALEROWDOWN2_AVX2
|
||||
SDANY(ScaleRowDown2_Any_AVX2, ScaleRowDown2_AVX2, ScaleRowDown2_C, 2, 1, 31)
|
||||
SDANY(ScaleRowDown2Linear_Any_AVX2,
|
||||
|
||||
@ -1412,6 +1412,48 @@ void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr,
|
||||
}
|
||||
#endif // HAS_SCALEUVROWDOWN2BOX_SSSE3
|
||||
|
||||
#ifdef HAS_SCALEUVROWDOWN2BOX_AVX2
|
||||
// Scales interleaved UV data down by 2 in both directions with a 2x2 box
// filter, using AVX2 inline assembly.
//
// src_ptr:    first source row; the second row is read at src_ptr + src_stride.
// src_stride: byte offset from the first source row to the second.
// dst_ptr:    destination row; 16 bytes (8 UV pairs) are written per iteration.
// dst_width:  number of output UV pairs. The loop runs at least once and
//             consumes 8 per iteration until the count is <= 0, so a width
//             that is not a positive multiple of 8 still processes whole
//             groups of 8.
//
// Each iteration reads 32 bytes from each of the two source rows.
// NOTE(review): kShuffleSplitUV / kShuffleMergeUV and LABELALIGN are defined
// outside this block; the shuffle comments below rely on the original
// author's annotations ("uuuuvvvv", "merge uv") - confirm against the tables.
void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint8_t* dst_ptr,
|
||||
int dst_width) {
|
||||
asm volatile(
|
||||
"vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // ymm4 = all bits set
|
||||
"vpsrlw $0xf,%%ymm4,%%ymm4 \n" // each 16-bit word = 0x0001
|
||||
"vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" // each byte = 0x01 (01010101...)
|
||||
"vpxor %%ymm5,%%ymm5,%%ymm5 \n" // ymm5 = zero
|
||||
"vbroadcastf128 %4,%%ymm1 \n" // split shuffler in both 128-bit lanes
|
||||
"vbroadcastf128 %5,%%ymm3 \n" // merge shuffler in both 128-bit lanes
|
||||
|
|
||||
LABELALIGN
|
||||
"1: \n"
|
||||
"vmovdqu (%0),%%ymm0 \n" // 32 bytes = 16 UV pairs of row 0
|
||||
"vmovdqu 0x00(%0,%3,1),%%ymm2 \n" // 16 UV pairs of row 1 (src + stride)
|
||||
"lea 0x20(%0),%0 \n" // advance source by 32 bytes
|
||||
"vpshufb %%ymm1,%%ymm0,%%ymm0 \n" // row 0: uuuuvvvv per the split table
|
||||
"vpshufb %%ymm1,%%ymm2,%%ymm2 \n" // row 1: same split shuffle
|
||||
"vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" // horizontal add: byte pairs * 1 summed into words
|
||||
"vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" // same horizontal add for row 1
|
||||
"vpaddw %%ymm2,%%ymm0,%%ymm0 \n" // vertical add: word = sum of 4 source bytes
|
||||
"vpsrlw $0x1,%%ymm0,%%ymm0 \n" // sum >> 1 ...
|
||||
"vpavgw %%ymm5,%%ymm0,%%ymm0 \n" // ... then average with zero: rounded sum / 4
|
||||
"vpshufb %%ymm3,%%ymm0,%%ymm0 \n" // merge uv back to interleaved bytes
|
||||
"vpermq $0xd8,%%ymm0,%%ymm0 \n" // combine qwords: order 0,2,1,3 puts results in low 128 bits
|
||||
"vmovdqu %%xmm0,(%1) \n" // store 16 bytes of output
|
||||
"lea 0x10(%1),%1 \n" // advance destination by 8 UV pairs
|
||||
"sub $0x8,%2 \n" // 8 output UV pairs done
|
||||
"jg 1b \n" // loop while remaining width > 0
|
||||
"vzeroupper \n" // clear upper ymm halves before returning to non-AVX code
|
||||
: "+r"(src_ptr), // %0
|
||||
"+r"(dst_ptr), // %1
|
||||
"+r"(dst_width) // %2
|
||||
: "r"((intptr_t)(src_stride)), // %3
|
||||
"m"(kShuffleSplitUV), // %4
|
||||
"m"(kShuffleMergeUV) // %5
|
||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
|
||||
}
|
||||
#endif // HAS_SCALEUVROWDOWN2BOX_AVX2
|
||||
|
||||
#endif // defined(__x86_64__) || defined(__i386__)
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
||||
@ -96,6 +96,14 @@ static void ScaleUVDown2(int src_width,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SCALEUVROWDOWN2BOX_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2) && filtering) {
|
||||
ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_AVX2;
|
||||
if (IS_ALIGNED(dst_width, 8)) {
|
||||
ScaleUVRowDown2 = ScaleUVRowDown2Box_AVX2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SCALEUVROWDOWN2BOX_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON) && filtering) {
|
||||
ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_NEON;
|
||||
@ -223,6 +231,14 @@ static void ScaleUVDown4Box(int src_width,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SCALEUVROWDOWN2BOX_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_AVX2;
|
||||
if (IS_ALIGNED(dst_width, 8)) {
|
||||
ScaleUVRowDown2 = ScaleUVRowDown2Box_AVX2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SCALEUVROWDOWN2BOX_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_NEON;
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user