From a4ec5cf9c2a183e389d3fddd9c239e87af56db68 Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Tue, 13 Oct 2020 19:56:28 -0700 Subject: [PATCH] UVScale down use AVX2 and Neon for aarch32 Intel SkylakeX Was SSSE3 UVScaleDownBy4_Box (2496 ms) Now AVX2 UVScaleDownBy4_Box (1983 ms) Was SSSE3 UVScaleDownBy2_Box (380 ms) Now AVX2 UVScaleDownBy2_Box (360 ms) Pixel 4 aarch32 Was UVScaleDownBy4_Box (4295 ms) Now UVScaleDownBy4_Box (3307 ms) Was UVScaleDownBy2_Box (1022 ms) Now UVScaleDownBy2_Box (778 ms) Bug: libyuv:838 Change-Id: Ic823fa15e5761c1b9a897da27341adbf1ed39883 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2470196 Reviewed-by: richard winterton Commit-Queue: Frank Barchard --- include/libyuv/scale_row.h | 21 +++++++++++++++++---- source/scale_any.cc | 8 ++++++++ source/scale_gcc.cc | 42 ++++++++++++++++++++++++++++++++++++++++++ source/scale_uv.cc | 16 ++++++++++++++++ 4 files changed, 83 insertions(+), 4 deletions(-) diff --git a/include/libyuv/scale_row.h b/include/libyuv/scale_row.h index 9ebc10024..888582400 100644 --- a/include/libyuv/scale_row.h +++ b/include/libyuv/scale_row.h @@ -79,6 +79,15 @@ extern "C" { #define HAS_SCALEUVROWDOWN2BOX_SSSE3 #endif +// The following are available for gcc/clang x86 platforms, but +// require clang 3.4 or gcc 4.7. +// TODO(fbarchard): Port to Visual C +#if !defined(LIBYUV_DISABLE_X86) && \ (defined(__x86_64__) || defined(__i386__)) && !defined(_MSC_VER) && \ (defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)) +#define HAS_SCALEUVROWDOWN2BOX_AVX2 +#endif + // The following are available on all x86 platforms, but // require VS2012, clang 3.4 or gcc 4.7. // The code supports NaCL but requires a new compiler and validator. 
@@ -103,10 +112,6 @@ extern "C" { #define HAS_SCALEROWDOWN34_NEON #define HAS_SCALEROWDOWN38_NEON #define HAS_SCALEROWDOWN4_NEON -#endif - -// The following are available on 64 bit Neon platforms: -#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) #define HAS_SCALEUVROWDOWN2BOX_NEON #endif @@ -854,6 +859,10 @@ void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_uv, int dst_width); +void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_uv, + int dst_width); void ScaleUVRowDown2_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, @@ -902,6 +911,10 @@ void ScaleUVRowDown2Box_Any_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); +void ScaleUVRowDown2Box_Any_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); void ScaleUVRowDown2_Any_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, diff --git a/source/scale_any.cc b/source/scale_any.cc index 5fca6ffb9..b571aec96 100644 --- a/source/scale_any.cc +++ b/source/scale_any.cc @@ -78,6 +78,14 @@ SDANY(ScaleUVRowDown2Box_Any_SSSE3, 2, 4) #endif +#ifdef HAS_SCALEUVROWDOWN2BOX_AVX2 +SDANY(ScaleUVRowDown2Box_Any_AVX2, + ScaleUVRowDown2Box_AVX2, + ScaleUVRowDown2Box_C, + 2, + 2, + 8) +#endif #ifdef HAS_SCALEROWDOWN2_AVX2 SDANY(ScaleRowDown2_Any_AVX2, ScaleRowDown2_AVX2, ScaleRowDown2_C, 2, 1, 31) SDANY(ScaleRowDown2Linear_Any_AVX2, diff --git a/source/scale_gcc.cc b/source/scale_gcc.cc index 8806e1363..582403655 100644 --- a/source/scale_gcc.cc +++ b/source/scale_gcc.cc @@ -1412,6 +1412,48 @@ void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr, } #endif // HAS_SCALEUVROWDOWN2BOX_SSSE3 +#ifdef HAS_SCALEUVROWDOWN2BOX_AVX2 +void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 01010101 + "vpsrlw $0xf,%%ymm4,%%ymm4 \n" + "vpackuswb 
%%ymm4,%%ymm4,%%ymm4 \n" + "vpxor %%ymm5,%%ymm5,%%ymm5 \n" // zero + "vbroadcastf128 %4,%%ymm1 \n" // split shuffler + "vbroadcastf128 %5,%%ymm3 \n" // merge shuffler + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" // 16 UV row 0 + "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" // 16 UV row 1 + "lea 0x20(%0),%0 \n" + "vpshufb %%ymm1,%%ymm0,%%ymm0 \n" // uuuuvvvv + "vpshufb %%ymm1,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" // horizontal add + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" // vertical add + "vpsrlw $0x1,%%ymm0,%%ymm0 \n" // round + "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" + "vpshufb %%ymm3,%%ymm0,%%ymm0 \n" // merge uv + "vpermq $0xd8,%%ymm0,%%ymm0 \n" // combine qwords + "vmovdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" // 8 UV + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "m"(kShuffleSplitUV), // %4 + "m"(kShuffleMergeUV) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif // HAS_SCALEUVROWDOWN2BOX_AVX2 + #endif // defined(__x86_64__) || defined(__i386__) #ifdef __cplusplus diff --git a/source/scale_uv.cc b/source/scale_uv.cc index aa5f600b6..4e276518a 100644 --- a/source/scale_uv.cc +++ b/source/scale_uv.cc @@ -96,6 +96,14 @@ static void ScaleUVDown2(int src_width, } } #endif +#if defined(HAS_SCALEUVROWDOWN2BOX_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && filtering) { + ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_AVX2; + if (IS_ALIGNED(dst_width, 8)) { + ScaleUVRowDown2 = ScaleUVRowDown2Box_AVX2; + } + } +#endif #if defined(HAS_SCALEUVROWDOWN2BOX_NEON) if (TestCpuFlag(kCpuHasNEON) && filtering) { ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_NEON; @@ -223,6 +231,14 @@ static void ScaleUVDown4Box(int src_width, } } #endif +#if defined(HAS_SCALEUVROWDOWN2BOX_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_AVX2; + if (IS_ALIGNED(dst_width, 8)) { + 
ScaleUVRowDown2 = ScaleUVRowDown2Box_AVX2; + } + } +#endif #if defined(HAS_SCALEUVROWDOWN2BOX_NEON) if (TestCpuFlag(kCpuHasNEON)) { ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_NEON;