From 4e78b8dc2e2140f729429e18b5c20d58c5f7dff1 Mon Sep 17 00:00:00 2001 From: "fbarchard@google.com" Date: Mon, 27 Apr 2015 21:56:08 +0000 Subject: [PATCH] scale to 3/4 or 3/8 with odd width destinations efficiently. previously if width was not multiple of what the simd loop would do (24), scaling would fall back on slower C code. This change allows SIMD to be used for most of the scaling and C for the remainder, improving efficiency. BUG=314 TESTED=set LIBYUV_WIDTH=1896 & ScaleDownBy3by4_* R=tpsiaki@google.com Review URL: https://webrtc-codereview.appspot.com/48249004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@1380 16f28f9a-4ce2-e073-06de-1de4eb20be90 --- README.chromium | 2 +- include/libyuv/scale_row.h | 31 +++++++++++++++++ include/libyuv/version.h | 2 +- source/scale.cc | 70 +++++++++++++++++++++++++++++--------- source/scale_any.cc | 34 ++++++++++++++++++ source/scale_win.cc | 1 - 6 files changed, 120 insertions(+), 20 deletions(-) diff --git a/README.chromium b/README.chromium index d28b911fd..0968f54a5 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1378 +Version: 1379 License: BSD License File: LICENSE diff --git a/include/libyuv/scale_row.h b/include/libyuv/scale_row.h index d076d29cd..5dc10906a 100644 --- a/include/libyuv/scale_row.h +++ b/include/libyuv/scale_row.h @@ -261,6 +261,22 @@ void ScaleRowDown4_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width); void ScaleRowDown4Box_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width); +void ScaleRowDown34_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown34_1_Box_Any_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown34_0_Box_Any_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown38_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown38_3_Box_Any_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown38_2_Box_Any_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, uint16* dst_ptr, int src_width, int src_height); @@ -358,6 +374,21 @@ void ScaleRowDown4_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width); void ScaleRowDown4Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width); +void ScaleRowDown34_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown34_0_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown34_1_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +// 32 -> 12 +void ScaleRowDown38_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +// 32x3 -> 12x1 +void ScaleRowDown38_3_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +// 32x2 -> 12x1 +void ScaleRowDown38_2_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint16* dst_ptr, int src_width, int src_height); diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 492c13ff0..73fc938cc 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1378 +#define LIBYUV_VERSION 1379 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/scale.cc b/source/scale.cc index 52550fd18..501d391c3 100644 --- a/source/scale.cc +++ b/source/scale.cc @@ -277,24 +277,42 @@ static void ScalePlaneDown34(int src_width, int src_height, ScaleRowDown34_1 = ScaleRowDown34_1_Box_C; } #if defined(HAS_SCALEROWDOWN34_NEON) - if (TestCpuFlag(kCpuHasNEON) && (dst_width % 24 == 0)) { + if (TestCpuFlag(kCpuHasNEON)) { if (!filtering) { - ScaleRowDown34_0 = ScaleRowDown34_NEON; - ScaleRowDown34_1 = ScaleRowDown34_NEON; + ScaleRowDown34_0 = ScaleRowDown34_Any_NEON; + ScaleRowDown34_1 = ScaleRowDown34_Any_NEON; } else { - ScaleRowDown34_0 = ScaleRowDown34_0_Box_NEON; - ScaleRowDown34_1 = ScaleRowDown34_1_Box_NEON; + ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_NEON; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_NEON; + } + if (dst_width % 24 == 0) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_NEON; + ScaleRowDown34_1 = ScaleRowDown34_NEON; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_NEON; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_NEON; + } } } #endif #if defined(HAS_SCALEROWDOWN34_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) { + if (TestCpuFlag(kCpuHasSSSE3)) { if (!filtering) { - ScaleRowDown34_0 = ScaleRowDown34_SSSE3; - ScaleRowDown34_1 = ScaleRowDown34_SSSE3; + ScaleRowDown34_0 = ScaleRowDown34_Any_SSSE3; + ScaleRowDown34_1 = ScaleRowDown34_Any_SSSE3; } else { - ScaleRowDown34_0 = ScaleRowDown34_0_Box_SSSE3; - ScaleRowDown34_1 = ScaleRowDown34_1_Box_SSSE3; + ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_SSSE3; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_SSSE3; + } + if (dst_width % 24 == 0) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_SSSE3; + ScaleRowDown34_1 = ScaleRowDown34_SSSE3; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_SSSE3; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_SSSE3; + } } } #endif @@ -450,23 +468,41 @@ static void ScalePlaneDown38(int src_width, int src_height, ScaleRowDown38_3 = ScaleRowDown38_3_Box_C; ScaleRowDown38_2 = ScaleRowDown38_2_Box_C; } + #if defined(HAS_SCALEROWDOWN38_NEON) - if (TestCpuFlag(kCpuHasNEON) && (dst_width % 12 == 0)) { + if (TestCpuFlag(kCpuHasNEON)) { if (!filtering) { - ScaleRowDown38_3 = ScaleRowDown38_NEON; - ScaleRowDown38_2 = ScaleRowDown38_NEON; + ScaleRowDown38_3 = ScaleRowDown38_Any_NEON; + ScaleRowDown38_2 = ScaleRowDown38_Any_NEON; } else { - ScaleRowDown38_3 = ScaleRowDown38_3_Box_NEON; - ScaleRowDown38_2 = ScaleRowDown38_2_Box_NEON; + ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_NEON; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_NEON; + } + if (dst_width % 12 == 0) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_NEON; + ScaleRowDown38_2 = ScaleRowDown38_NEON; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_NEON; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_NEON; + } } } #endif #if defined(HAS_SCALEROWDOWN38_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) { + if (TestCpuFlag(kCpuHasSSSE3)) { if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_Any_SSSE3; + ScaleRowDown38_2 = ScaleRowDown38_Any_SSSE3; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_SSSE3; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_SSSE3; + } + if (dst_width % 12 == 0 && !filtering) { ScaleRowDown38_3 = ScaleRowDown38_SSSE3; ScaleRowDown38_2 = ScaleRowDown38_SSSE3; - } else { + } + if (dst_width % 6 == 0 && filtering) { ScaleRowDown38_3 = ScaleRowDown38_3_Box_SSSE3; ScaleRowDown38_2 = ScaleRowDown38_2_Box_SSSE3; } diff --git a/source/scale_any.cc b/source/scale_any.cc index 5354afed2..5fef8edac 100644 --- a/source/scale_any.cc +++ b/source/scale_any.cc @@ -87,6 +87,40 @@ SDANY(ScaleRowDown4_Any_NEON, ScaleRowDown4_NEON, ScaleRowDown4_C, 4, 1, 7) SDANY(ScaleRowDown4Box_Any_NEON, ScaleRowDown4Box_NEON, ScaleRowDown4Box_C, 4, 1, 7) #endif + + +#ifdef HAS_SCALEROWDOWN34_SSSE3 +SDANY(ScaleRowDown34_Any_SSSE3, ScaleRowDown34_SSSE3, ScaleRowDown34_C, + 3 / 4, 1, 23) +SDANY(ScaleRowDown34_0_Box_Any_SSSE3, ScaleRowDown34_0_Box_SSSE3, + ScaleRowDown34_0_Box_C, 3 / 4, 1, 23) +SDANY(ScaleRowDown34_1_Box_Any_SSSE3, ScaleRowDown34_1_Box_SSSE3, + ScaleRowDown34_1_Box_C, 3 / 4, 1, 23) +#endif +#ifdef HAS_SCALEROWDOWN34_NEON +SDANY(ScaleRowDown34_Any_NEON, ScaleRowDown34_NEON, ScaleRowDown34_C, + 3 / 4, 1, 23) +SDANY(ScaleRowDown34_0_Box_Any_NEON, ScaleRowDown34_0_Box_NEON, + ScaleRowDown34_0_Box_C, 3 / 4, 1, 23) +SDANY(ScaleRowDown34_1_Box_Any_NEON, ScaleRowDown34_1_Box_NEON, + ScaleRowDown34_1_Box_C, 3 / 4, 1, 23) +#endif +#ifdef HAS_SCALEROWDOWN38_SSSE3 +SDANY(ScaleRowDown38_Any_SSSE3, ScaleRowDown38_SSSE3, ScaleRowDown38_C, + 3 / 4, 1, 11) +SDANY(ScaleRowDown38_3_Box_Any_SSSE3, ScaleRowDown38_3_Box_SSSE3, + ScaleRowDown38_3_Box_C, 3 / 4, 1, 5) +SDANY(ScaleRowDown38_2_Box_Any_SSSE3, ScaleRowDown38_2_Box_SSSE3, + ScaleRowDown38_2_Box_C, 3 / 4, 1, 5) +#endif +#ifdef HAS_SCALEROWDOWN38_NEON +SDANY(ScaleRowDown38_Any_NEON, ScaleRowDown38_NEON, ScaleRowDown38_C, + 3 / 4, 1, 11) +SDANY(ScaleRowDown38_3_Box_Any_NEON, ScaleRowDown38_3_Box_NEON, + ScaleRowDown38_3_Box_C, 3 / 4, 1, 11) +SDANY(ScaleRowDown38_2_Box_Any_NEON, ScaleRowDown38_2_Box_NEON, + ScaleRowDown38_2_Box_C, 3 / 4, 1, 11) +#endif #undef SDANY // Fixed scale down. diff --git a/source/scale_win.cc b/source/scale_win.cc index db034b389..e0693ccd6 100644 --- a/source/scale_win.cc +++ b/source/scale_win.cc @@ -400,7 +400,6 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, // Produces three 8 byte values. For each 8 bytes, 16 bytes are read. // Then shuffled to do the scaling. -// Note that movdqa+palign may be better than movdqu. __declspec(naked) void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) {