From 9f4636e298e0cc6ab553cbc49eb51a5cb447b1f4 Mon Sep 17 00:00:00 2001 From: "fbarchard@google.com" Date: Thu, 30 Apr 2015 01:58:32 +0000 Subject: [PATCH] AVX2 port of ScaleDownBy4. BUG=314 TESTED=out\release\libyuv_unittest --gtest_filter=*.ScaleDownBy4* Review URL: https://webrtc-codereview.appspot.com/46159004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@1390 16f28f9a-4ce2-e073-06de-1de4eb20be90 --- include/libyuv/scale_row.h | 11 +++++ source/scale.cc | 9 ++++ source/scale_any.cc | 5 ++ source/scale_win.cc | 95 +++++++++++++++++++++++++++++++++++++- 4 files changed, 118 insertions(+), 2 deletions(-) diff --git a/include/libyuv/scale_row.h b/include/libyuv/scale_row.h index 5dc10906a..e3c0c0b74 100644 --- a/include/libyuv/scale_row.h +++ b/include/libyuv/scale_row.h @@ -54,6 +54,7 @@ extern "C" { #if !defined(LIBYUV_DISABLE_X86) && defined(VISUALC_HAS_AVX2) #define HAS_SCALEADDROWS_AVX2 #define HAS_SCALEROWDOWN2_AVX2 +#define HAS_SCALEROWDOWN4_AVX2 #endif // The following are available on Neon platforms: @@ -229,6 +230,11 @@ void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width); void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width); +void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); + void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width); void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, @@ -261,6 +267,11 @@ void ScaleRowDown4_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width); void ScaleRowDown4Box_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width); +void ScaleRowDown4_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown4Box_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); + void ScaleRowDown34_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width); void ScaleRowDown34_1_Box_Any_SSSE3(const uint8* src_ptr, diff --git a/source/scale.cc b/source/scale.cc index 30c436770..5460cc7e4 100644 --- a/source/scale.cc +++ b/source/scale.cc @@ -191,6 +191,15 @@ static void ScalePlaneDown4(int src_width, int src_height, } } #endif +#if defined(HAS_SCALEROWDOWN4_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ScaleRowDown4 = filtering ? + ScaleRowDown4Box_Any_AVX2 : ScaleRowDown4_Any_AVX2; + if (IS_ALIGNED(dst_width, 16)) { + ScaleRowDown4 = filtering ? ScaleRowDown4Box_AVX2 : ScaleRowDown4_AVX2; + } + } +#endif #if defined(HAS_SCALEROWDOWN4_MIPS_DSPR2) if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(row_stride, 4) && IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && diff --git a/source/scale_any.cc b/source/scale_any.cc index 5fef8edac..61de9d35f 100644 --- a/source/scale_any.cc +++ b/source/scale_any.cc @@ -82,6 +82,11 @@ SDANY(ScaleRowDown4_Any_SSE2, ScaleRowDown4_SSE2, ScaleRowDown4_C, 4, 1, 7) SDANY(ScaleRowDown4Box_Any_SSE2, ScaleRowDown4Box_SSE2, ScaleRowDown4Box_C, 4, 1, 7) #endif +#ifdef HAS_SCALEROWDOWN4_AVX2 +SDANY(ScaleRowDown4_Any_AVX2, ScaleRowDown4_AVX2, ScaleRowDown4_C, 4, 1, 15) +SDANY(ScaleRowDown4Box_Any_AVX2, ScaleRowDown4Box_AVX2, ScaleRowDown4Box_C, + 4, 1, 15) +#endif #ifdef HAS_SCALEROWDOWN4_NEON SDANY(ScaleRowDown4_Any_NEON, ScaleRowDown4_NEON, ScaleRowDown4_C, 4, 1, 7) SDANY(ScaleRowDown4Box_Any_NEON, ScaleRowDown4Box_NEON, ScaleRowDown4Box_C, diff --git a/source/scale_win.cc b/source/scale_win.cc index e0693ccd6..10513fb89 100644 --- a/source/scale_win.cc +++ b/source/scale_win.cc @@ -353,11 +353,11 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, psrlw xmm7, 8 wloop: - movdqu xmm0, [eax] + movdqu xmm0, [eax] // average rows movdqu xmm1, [eax + 16] movdqu xmm2, [eax + esi] movdqu xmm3, [eax + esi + 16] - pavgb xmm0, xmm2 // average rows + pavgb xmm0, xmm2 pavgb xmm1, xmm3 movdqu xmm2, [eax + esi * 2] movdqu xmm3, [eax + esi * 2 + 16] @@ -396,6 +396,97 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, } } +#ifdef HAS_SCALEROWDOWN4_AVX2 +// Point samples 64 pixels to 16 pixels. +__declspec(naked) +void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff0000 + vpsrld ymm5, ymm5, 24 + vpslld ymm5, ymm5, 16 + + wloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + lea eax, [eax + 64] + vpand ymm0, ymm0, ymm5 + vpand ymm1, ymm1, ymm5 + vpackuswb ymm0, ymm0, ymm1 + vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb + vpsrlw ymm0, ymm0, 8 + vpackuswb ymm0, ymm0, ymm0 + vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb + vmovdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg wloop + + vzeroupper + ret + } +} + +// Blends 64x4 rectangle to 16x1. +__declspec(naked) +void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_ptr + mov esi, [esp + 8 + 8] // src_stride + mov edx, [esp + 8 + 12] // dst_ptr + mov ecx, [esp + 8 + 16] // dst_width + lea edi, [esi + esi * 2] // src_stride * 3 + vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0x00ff00ff + vpsrlw ymm7, ymm7, 8 + + wloop: + vmovdqu ymm0, [eax] // average rows + vmovdqu ymm1, [eax + 32] + vpavgb ymm0, ymm0, [eax + esi] + vpavgb ymm1, ymm1, [eax + esi + 32] + vmovdqu ymm2, [eax + esi * 2] + vmovdqu ymm3, [eax + esi * 2 + 32] + vpavgb ymm2, ymm2, [eax + edi] + vpavgb ymm3, ymm3, [eax + edi + 32] + lea eax, [eax + 64] + vpavgb ymm0, ymm0, ymm2 + vpavgb ymm1, ymm1, ymm3 + + vpand ymm2, ymm0, ymm7 // average columns (64 to 32 pixels) + vpand ymm3, ymm1, ymm7 + vpsrlw ymm0, ymm0, 8 + vpsrlw ymm1, ymm1, 8 + vpavgw ymm0, ymm0, ymm2 + vpavgw ymm1, ymm1, ymm3 + vpackuswb ymm0, ymm0, ymm1 + vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb + + vpand ymm2, ymm0, ymm7 // average columns (32 to 16 pixels) + vpsrlw ymm0, ymm0, 8 + vpavgw ymm0, ymm0, ymm2 + vpackuswb ymm0, ymm0, ymm0 + vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb + + vmovdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg wloop + + pop edi + pop esi + vzeroupper + ret + } +} +#endif // HAS_SCALEROWDOWN4_AVX2 + // Point samples 32 pixels to 24 pixels. // Produces three 8 byte values. For each 8 bytes, 16 bytes are read. // Then shuffled to do the scaling.