From d41fbf40dd2b4589ad78e5b5649c0ed1ec5b9736 Mon Sep 17 00:00:00 2001 From: "fbarchard@google.com" Date: Tue, 24 Mar 2015 23:25:30 +0000 Subject: [PATCH] Handle scale down by factor of 2 efficiently by calling SIMD for multiple of 16 destination pixels, and C for remainder. BUG=314 TESTED=out\release\libyuv_unittest.exe --gtest_catch_exceptions=0 --gtest_filter=*.ScaleDownBy2* R=bcornell@google.com Review URL: https://webrtc-codereview.appspot.com/48689004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@1344 16f28f9a-4ce2-e073-06de-1de4eb20be90 --- include/libyuv/scale_row.h | 15 +++++++++++++-- source/scale.cc | 26 ++++++++++++++++++-------- source/scale_any.cc | 30 ++++++++++++++++++++++++++++++ source/scale_win.cc | 1 - 4 files changed, 61 insertions(+), 11 deletions(-) diff --git a/include/libyuv/scale_row.h b/include/libyuv/scale_row.h index 19affbe86..07033fe72 100644 --- a/include/libyuv/scale_row.h +++ b/include/libyuv/scale_row.h @@ -222,6 +222,12 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width); +void ScaleRowDown2_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown2Linear_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown2Box_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, uint16* dst_ptr, int src_width, int src_height); @@ -271,10 +277,8 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, // Note - not static due to reuse in convert for 444 to 420. void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst, int dst_width); - void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst, int dst_width); - void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst, int dst_width); @@ -309,6 +313,13 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width); +void ScaleRowDown2_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown2Linear_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown2Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); + void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint16* dst_ptr, int src_width, int src_height); diff --git a/source/scale.cc b/source/scale.cc index 72b2c203d..15056732a 100644 --- a/source/scale.cc +++ b/source/scale.cc @@ -54,17 +54,27 @@ static void ScalePlaneDown2(int src_width, int src_height, } #if defined(HAS_SCALEROWDOWN2_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 16)) { - ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_NEON : - (filtering == kFilterLinear ? ScaleRowDown2Linear_NEON : - ScaleRowDown2Box_NEON); + if (TestCpuFlag(kCpuHasNEON)) { + ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_NEON : + (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_NEON : + ScaleRowDown2Box_Any_NEON); + if (IS_ALIGNED(dst_width, 16)) { + ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_NEON : + (filtering == kFilterLinear ? ScaleRowDown2Linear_NEON : + ScaleRowDown2Box_NEON); + } } #endif #if defined(HAS_SCALEROWDOWN2_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) { - ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSE2 : - (filtering == kFilterLinear ? ScaleRowDown2Linear_SSE2 : - ScaleRowDown2Box_SSE2); + if (TestCpuFlag(kCpuHasSSE2)) { + ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_SSE2 : + (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_SSE2 : + ScaleRowDown2Box_Any_SSE2); + if (IS_ALIGNED(dst_width, 16)) { + ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSE2 : + (filtering == kFilterLinear ? ScaleRowDown2Linear_SSE2 : + ScaleRowDown2Box_SSE2); + } } #endif #if defined(HAS_SCALEROWDOWN2_MIPS_DSPR2) diff --git a/source/scale_any.cc b/source/scale_any.cc index 702bca263..8adfbd3e1 100644 --- a/source/scale_any.cc +++ b/source/scale_any.cc @@ -35,6 +35,36 @@ CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7) #endif #undef CANY +// Fixed scale down. +#define SDANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \ + void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, \ + uint8* dst_ptr, int dst_width) { \ + int n = dst_width & ~MASK; \ + if (n > 0) { \ + SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \ + } \ + SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \ + dst_ptr + n * BPP, dst_width & MASK); \ + } + + +#ifdef HAS_SCALEROWDOWN2_SSE2 +SDANY(ScaleRowDown2_Any_SSE2, ScaleRowDown2_SSE2, ScaleRowDown2_C, 2, 1, 15) +SDANY(ScaleRowDown2Linear_Any_SSE2, ScaleRowDown2Linear_SSE2, + ScaleRowDown2Linear_C, 2, 1, 15) +SDANY(ScaleRowDown2Box_Any_SSE2, ScaleRowDown2Box_SSE2, + ScaleRowDown2Box_C, 2, 1, 15) +#endif +#ifdef HAS_SCALEROWDOWN2_NEON +SDANY(ScaleRowDown2_Any_NEON, ScaleRowDown2_NEON, ScaleRowDown2_C, 2, 1, 15) +SDANY(ScaleRowDown2Linear_Any_NEON, ScaleRowDown2Linear_NEON, + ScaleRowDown2Linear_C, 2, 1, 15) +SDANY(ScaleRowDown2Box_Any_NEON, ScaleRowDown2Box_NEON, + ScaleRowDown2Box_C, 2, 1, 15) +#endif + +#undef SDANY + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/source/scale_win.cc b/source/scale_win.cc index 77529ffe5..770828c77 100644 --- a/source/scale_win.cc +++ b/source/scale_win.cc @@ -666,7 +666,6 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, } // Bilinear column filtering. SSSE3 version. -// TODO(fbarchard): Port to Neon // TODO(fbarchard): Switch the following: // xor ebx, ebx // mov bx, word ptr [esi + eax] // 2 source x0 pixels