From dbe4814361fb8fcbc462bbe45a2f39360e14a982 Mon Sep 17 00:00:00 2001 From: "fbarchard@google.com" Date: Thu, 28 Nov 2013 01:16:15 +0000 Subject: [PATCH] Move scale row functions to scale_win etc BUG=none TEST=untested R=tpsiaki@google.com Review URL: https://webrtc-codereview.appspot.com/4509005 git-svn-id: http://libyuv.googlecode.com/svn/trunk@879 16f28f9a-4ce2-e073-06de-1de4eb20be90 --- Android.mk | 1 + README.chromium | 2 +- include/libyuv/scale_row.h | 228 ++++ include/libyuv/version.h | 2 +- libyuv.gyp | 5 +- linux.mk | 1 + source/row_mips.cc | 5 + source/row_posix.cc | 4 +- source/scale.cc | 2119 ------------------------------------ source/scale_argb.cc | 988 ----------------- source/scale_argb_neon.cc | 142 +-- source/scale_common.cc | 438 ++++++++ source/scale_neon.cc | 120 +- source/scale_posix.cc | 1337 +++++++++++++++++++++++ source/scale_win.cc | 1282 ++++++++++++++++++++++ 15 files changed, 3418 insertions(+), 3256 deletions(-) create mode 100644 source/scale_posix.cc create mode 100644 source/scale_win.cc diff --git a/Android.mk b/Android.mk index 29b61c81e..2284239c7 100644 --- a/Android.mk +++ b/Android.mk @@ -29,6 +29,7 @@ LOCAL_SRC_FILES := \ source/scale_argb.cc \ source/scale_common.cc \ source/scale_mips.cc \ + source/scale_posix.cc \ source/video_common.cc # TODO(fbarchard): Enable mjpeg encoder. diff --git a/README.chromium b/README.chromium index 20e0cfdb5..6e9a2af1e 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 877 +Version: 879 License: BSD License File: LICENSE diff --git a/include/libyuv/scale_row.h b/include/libyuv/scale_row.h index 8100cb4ac..ab4d0e724 100644 --- a/include/libyuv/scale_row.h +++ b/include/libyuv/scale_row.h @@ -18,6 +18,43 @@ namespace libyuv { extern "C" { #endif +// The following are available on all x86 platforms: +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) +#define HAS_SCALEROWDOWN2_SSE2 +#define HAS_SCALEROWDOWN4_SSE2 +#define HAS_SCALEROWDOWN34_SSSE3 +#define HAS_SCALEROWDOWN38_SSSE3 +#define HAS_SCALEADDROWS_SSE2 +#define HAS_SCALEFILTERCOLS_SSSE3 +#define HAS_SCALECOLSUP2_SSE2 +#define HAS_SCALEARGBROWDOWN2_SSE2 +#define HAS_SCALEARGBROWDOWNEVEN_SSE2 +#define HAS_SCALEARGBCOLS_SSE2 +#define HAS_SCALEARGBFILTERCOLS_SSSE3 +#define HAS_SCALEARGBCOLSUP2_SSE2 +#endif + +// The following are available on Neon platforms: +#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \ + (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) +#define HAS_SCALEROWDOWN2_NEON +#define HAS_SCALEROWDOWN4_NEON +#define HAS_SCALEROWDOWN34_NEON +#define HAS_SCALEROWDOWN38_NEON +#define HAS_SCALEARGBROWDOWNEVEN_NEON +#define HAS_SCALEARGBROWDOWN2_NEON +#endif + +// The following are available on Mips platforms: +#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \ + defined(__mips__) +#define HAS_SCALEROWDOWN2_MIPS_DSPR2 +#define HAS_SCALEROWDOWN4_MIPS_DSPR2 +#define HAS_SCALEROWDOWN34_MIPS_DSPR2 +#define HAS_SCALEROWDOWN38_MIPS_DSPR2 +#endif + // Scale ARGB vertically with bilinear interpolation. 
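// Illustrative C sketch (not part of this patch) of what "scale vertically
// with bilinear interpolation" means for the ScalePlaneVertical declaration
// below: each destination row is a weighted blend of two adjacent source
// rows, with the weight taken from the fractional part of a 16.16
// fixed-point source y coordinate.  The function name, rounding and the
// exact fraction extraction here are illustrative assumptions, not the
// libyuv implementation.
#include <stdint.h>
static void InterpolateRowSketch(uint8_t* dst, const uint8_t* src0,
                                 const uint8_t* src1, int width_bytes,
                                 int y_fraction /* 0..256 */) {
  for (int i = 0; i < width_bytes; ++i) {
    dst[i] = (uint8_t)((src0[i] * (256 - y_fraction) +
                        src1[i] * y_fraction) >> 8);
  }
}
// For output row j, y = y0 + j * dy in 16.16 fixed point; src0 is the row at
// (y >> 16), src1 is the row below it, width_bytes is dst_width * bpp, and
// one common choice of weight is y_fraction = (y >> 8) & 255.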
void ScalePlaneVertical(int src_height, int dst_width, int dst_height, @@ -26,6 +63,197 @@ void ScalePlaneVertical(int src_height, int x, int y, int dy, int bpp, FilterMode filtering); +void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t /* src_stride */, + uint8* dst, int dst_width); +void ScaleRowDown2Linear_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t /* src_stride */, + uint8* dst, int dst_width); +void ScaleRowDown4Box_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t /* src_stride */, + uint8* dst, int dst_width); +void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* d, int dst_width); +void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* d, int dst_width); +void ScaleCols_C(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx); +void ScaleColsUp2_C(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int, int); +void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx); +void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t /* src_stride */, + uint8* dst, int dst_width); +void ScaleRowDown38_3_Box_C(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleAddRows_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint16* dst_ptr, int src_width, int src_height); +void ScaleARGBRowDown2_C(const uint8* src_argb, + ptrdiff_t /* src_stride */, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDown2Linear_C(const uint8* src_argb, + ptrdiff_t /* src_stride */, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDown2Box_C(const uint8* src_argb, ptrdiff_t src_stride, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDownEven_C(const uint8* src_argb, ptrdiff_t /* src_stride */, + int src_stepx, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDownEvenBox_C(const uint8* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, int dst_width); +void ScaleARGBCols_C(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx); +void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb, + int dst_width, int, int); +void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx); + +void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr, ptrdiff_t, + uint8* dst_ptr, int dst_width); +void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, 
int dst_width); +void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint16* dst_ptr, int src_width, + int src_height); +void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx); +void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int /* x */, int /* dx */); +void ScaleARGBRowDown2_SSE2(const uint8* src_argb, + ptrdiff_t /* src_stride */, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, + ptrdiff_t /* src_stride */, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, int dst_width); +void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx); +void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx); +void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb, + int dst_width, int /* x */, int /* dx */); +// Row functions. +void ScaleARGBRowDownEven_NEON(const uint8* src_argb, int src_stride, + int src_stepx, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, int src_stride, + int src_stepx, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */, + uint8* dst, int dst_width); +void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); + +// ScaleRowDown2Box also used by planar functions +// NEON downscalers with interpolation. + +// Note - not static due to reuse in convert for 444 to 420. +void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */, + uint8* dst, int dst_width); + +void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); + +void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */, + uint8* dst_ptr, int dst_width); +void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); + +// Down scale from 4 to 3 pixels. Use the neon multilane read/write +// to load up the every 4th pixel into a 4 different registers. +// Point samples 32 pixels to 24 pixels. 
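// Illustrative C equivalent (not part of this patch) of the 3/4 point
// sampler declared below: each group of 4 source pixels contributes pixels
// 0, 1 and 3 to the output, as in the ScaleRowDown34_C reference (assumed
// behavior; the NEON version vectorizes the same selection).  The function
// name is hypothetical.
#include <stdint.h>
static void ScaleRowDown34Sketch(const uint8_t* src, uint8_t* dst,
                                 int dst_width /* multiple of 3 */) {
  for (int x = 0; x < dst_width; x += 3) {
    dst[0] = src[0];
    dst[1] = src[1];
    dst[2] = src[3];
    dst += 3;
    src += 4;
  }
}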
+void ScaleRowDown34_NEON(const uint8* src_ptr, + ptrdiff_t /* src_stride */, + uint8* dst_ptr, int dst_width); +void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); + +// 32 -> 12 +void ScaleRowDown38_NEON(const uint8* src_ptr, + ptrdiff_t /* src_stride */, + uint8* dst_ptr, int dst_width); +// 32x3 -> 12x1 +void ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +// 32x2 -> 12x1 +void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); + +void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */, + uint8* dst, int dst_width); +void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */, + uint8* dst, int dst_width); +void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */, + uint8* dst, int dst_width); +void ScaleRowDown34_0_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* d, int dst_width); +void ScaleRowDown34_1_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* d, int dst_width); +void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */, + uint8* dst, int dst_width); +void ScaleRowDown38_2_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown38_3_Box_MIPS_DSPR2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/include/libyuv/version.h b/include/libyuv/version.h index f417f3285..1807ea68f 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 877 +#define LIBYUV_VERSION 879 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/libyuv.gyp b/libyuv.gyp index 73d258fec..e3f5046a9 100644 --- a/libyuv.gyp +++ b/libyuv.gyp @@ -74,6 +74,7 @@ 'include/libyuv/row.h', 'include/libyuv/scale.h', 'include/libyuv/scale_argb.h', + 'include/libyuv/scale_row.h', 'include/libyuv/version.h', 'include/libyuv/video_common.h', @@ -107,10 +108,12 @@ 'source/row_win.cc', 'source/scale.cc', 'source/scale_argb.cc', - 'source/scale_argb_neon.cc', + 'source/scale_argb_neon.cc', # Deprecated. 
'source/scale_common.cc', 'source/scale_mips.cc', 'source/scale_neon.cc', + 'source/scale_posix.cc', + 'source/scale_win.cc', 'source/video_common.cc', ], }, diff --git a/linux.mk b/linux.mk index eb524f8ea..5d12135a8 100644 --- a/linux.mk +++ b/linux.mk @@ -28,6 +28,7 @@ LOCAL_OBJ_FILES := \ source/scale_argb.o \ source/scale_common.o \ source/scale_mips.o \ + source/scale_posix.o \ source/video_common.o .cc.o: diff --git a/source/row_mips.cc b/source/row_mips.cc index 69677aa2d..fad2dcf44 100644 --- a/source/row_mips.cc +++ b/source/row_mips.cc @@ -15,6 +15,9 @@ namespace libyuv { extern "C" { #endif +// The following are available on Mips platforms: +#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) + #ifdef HAS_COPYROW_MIPS void CopyRow_MIPS(const uint8* src, uint8* dst, int count) { __asm__ __volatile__ ( @@ -968,6 +971,8 @@ void InterpolateRows_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr, } #endif // __mips_dsp_rev >= 2 +#endif // defined(__mips__) + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/source/row_posix.cc b/source/row_posix.cc index 81f321c9a..3281c1d30 100644 --- a/source/row_posix.cc +++ b/source/row_posix.cc @@ -10,14 +10,12 @@ #include "libyuv/row.h" -#include "libyuv/basic_types.h" - #ifdef __cplusplus namespace libyuv { extern "C" { #endif -// This module is for GCC x86 and x64 +// This module is for GCC x86 and x64. #if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) // TODO(nfullagar): For Native Client: When new toolchain becomes available, diff --git a/source/scale.cc b/source/scale.cc index 43c8aac20..e4f069d6b 100644 --- a/source/scale.cc +++ b/source/scale.cc @@ -34,2125 +34,6 @@ static __inline int Half(int v) { return v >= 0 ? ((v + 1) >> 1) : -((-v + 1) >> 1); } -// Note: Some SSE2 reference manuals -// cpuvol1.pdf agner_instruction_tables.pdf 253666.pdf 253667.pdf - -// ScaleRowDown2Box also used by planar functions -// NEON downscalers with interpolation. - -#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \ - (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) -#define HAS_SCALEROWDOWN2_NEON -// Note - not static due to reuse in convert for 444 to 420. -void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */, - uint8* dst, int dst_width); - -void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); - -#define HAS_SCALEROWDOWN4_NEON -void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */, - uint8* dst_ptr, int dst_width); -void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); - -#define HAS_SCALEROWDOWN34_NEON -// Down scale from 4 to 3 pixels. Use the neon multilane read/write -// to load up the every 4th pixel into a 4 different registers. -// Point samples 32 pixels to 24 pixels. 
-void ScaleRowDown34_NEON(const uint8* src_ptr, - ptrdiff_t /* src_stride */, - uint8* dst_ptr, int dst_width); -void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); - -#define HAS_SCALEROWDOWN38_NEON -// 32 -> 12 -void ScaleRowDown38_NEON(const uint8* src_ptr, - ptrdiff_t /* src_stride */, - uint8* dst_ptr, int dst_width); -// 32x3 -> 12x1 -void ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -// 32x2 -> 12x1 -void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); - -// SSE2 downscalers with interpolation. -// Constants for SSSE3 code -#elif !defined(LIBYUV_DISABLE_X86) && \ - (defined(_M_IX86) || defined(__i386__) || defined(__x86_64__)) - -// Offsets for source bytes 0 to 9 -static uvec8 kShuf0 = - { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 }; - -// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12. -static uvec8 kShuf1 = - { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 }; - -// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. -static uvec8 kShuf2 = - { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 }; - -// Offsets for source bytes 0 to 10 -static uvec8 kShuf01 = - { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 }; - -// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13. -static uvec8 kShuf11 = - { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 }; - -// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. -static uvec8 kShuf21 = - { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 }; - -// Coefficients for source bytes 0 to 10 -static uvec8 kMadd01 = - { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 }; - -// Coefficients for source bytes 10 to 21 -static uvec8 kMadd11 = - { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 }; - -// Coefficients for source bytes 21 to 31 -static uvec8 kMadd21 = - { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 }; - -// Coefficients for source bytes 21 to 31 -static vec16 kRound34 = - { 2, 2, 2, 2, 2, 2, 2, 2 }; - -static uvec8 kShuf38a = - { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; - -static uvec8 kShuf38b = - { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 }; - -// Arrange words 0,3,6 into 0,1,2 -static uvec8 kShufAc = - { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; - -// Arrange words 0,3,6 into 3,4,5 -static uvec8 kShufAc3 = - { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 }; - -// Scaling values for boxes of 3x3 and 2x3 -static uvec16 kScaleAc33 = - { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 }; - -// Arrange first value for pixels 0,1,2,3,4,5 -static uvec8 kShufAb0 = - { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 }; - -// Arrange second value for pixels 0,1,2,3,4,5 -static uvec8 kShufAb1 = - { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 }; - -// Arrange third value for pixels 0,1,2,3,4,5 -static uvec8 kShufAb2 = - { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 }; - -// Scaling values for boxes of 3x2 and 2x2 -static uvec16 kScaleAb2 = - { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 }; -#endif - -#if 
!defined(LIBYUV_DISABLE_X86) && \ - defined(_M_IX86) && defined(_MSC_VER) -#define HAS_SCALEROWDOWN2_SSE2 -// Reads 32 pixels, throws half away and writes 16 pixels. -// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. -__declspec(naked) __declspec(align(16)) -static void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - __asm { - mov eax, [esp + 4] // src_ptr - // src_stride ignored - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width - - align 16 - wloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - lea eax, [eax + 32] - psrlw xmm0, 8 // isolate odd pixels. - psrlw xmm1, 8 - packuswb xmm0, xmm1 - sub ecx, 16 - movdqa [edx], xmm0 - lea edx, [edx + 16] - jg wloop - - ret - } -} - -// Blends 32x1 rectangle to 16x1. -// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. -__declspec(naked) __declspec(align(16)) -void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - __asm { - mov eax, [esp + 4] // src_ptr - // src_stride - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff - psrlw xmm5, 8 - - align 16 - wloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - lea eax, [eax + 32] - - movdqa xmm2, xmm0 // average columns (32 to 16 pixels) - psrlw xmm0, 8 - movdqa xmm3, xmm1 - psrlw xmm1, 8 - pand xmm2, xmm5 - pand xmm3, xmm5 - pavgw xmm0, xmm2 - pavgw xmm1, xmm3 - packuswb xmm0, xmm1 - - sub ecx, 16 - movdqa [edx], xmm0 - lea edx, [edx + 16] - jg wloop - - ret - } -} - -// Blends 32x2 rectangle to 16x1. -// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. -__declspec(naked) __declspec(align(16)) -void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff - psrlw xmm5, 8 - - align 16 - wloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + esi] - movdqa xmm3, [eax + esi + 16] - lea eax, [eax + 32] - pavgb xmm0, xmm2 // average rows - pavgb xmm1, xmm3 - - movdqa xmm2, xmm0 // average columns (32 to 16 pixels) - psrlw xmm0, 8 - movdqa xmm3, xmm1 - psrlw xmm1, 8 - pand xmm2, xmm5 - pand xmm3, xmm5 - pavgw xmm0, xmm2 - pavgw xmm1, xmm3 - packuswb xmm0, xmm1 - - sub ecx, 16 - movdqa [edx], xmm0 - lea edx, [edx + 16] - jg wloop - - pop esi - ret - } -} - -// Reads 32 pixels, throws half away and writes 16 pixels. -// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. -__declspec(naked) __declspec(align(16)) -static void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - __asm { - mov eax, [esp + 4] // src_ptr - // src_stride ignored - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width - - align 16 - wloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - psrlw xmm0, 8 // isolate odd pixels. - psrlw xmm1, 8 - packuswb xmm0, xmm1 - sub ecx, 16 - movdqu [edx], xmm0 - lea edx, [edx + 16] - jg wloop - - ret - } -} - -// Blends 32x1 rectangle to 16x1. -// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. 
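// Illustrative C equivalent (not part of this patch) of the 32x1 -> 16x1
// blend implemented by the SSE2 code below: each output pixel is the
// rounded average of a horizontal pair of source pixels, matching the
// ScaleRowDown2Linear_C reference elsewhere in this patch.  The asm gets
// there by splitting even and odd bytes with pand/psrlw and averaging the
// halves with pavgw.  The function name is hypothetical.
#include <stdint.h>
static void ScaleRowDown2LinearSketch(const uint8_t* src, uint8_t* dst,
                                      int dst_width) {
  for (int i = 0; i < dst_width; ++i) {
    dst[i] = (uint8_t)((src[2 * i] + src[2 * i + 1] + 1) >> 1);
  }
}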
-__declspec(naked) __declspec(align(16)) -void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr, ptrdiff_t, - uint8* dst_ptr, int dst_width) { - __asm { - mov eax, [esp + 4] // src_ptr - // src_stride - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff - psrlw xmm5, 8 - - align 16 - wloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - - movdqa xmm2, xmm0 // average columns (32 to 16 pixels) - psrlw xmm0, 8 - movdqa xmm3, xmm1 - psrlw xmm1, 8 - pand xmm2, xmm5 - pand xmm3, xmm5 - pavgw xmm0, xmm2 - pavgw xmm1, xmm3 - packuswb xmm0, xmm1 - - sub ecx, 16 - movdqu [edx], xmm0 - lea edx, [edx + 16] - jg wloop - - ret - } -} - -// Blends 32x2 rectangle to 16x1. -// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. -__declspec(naked) __declspec(align(16)) -static void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff - psrlw xmm5, 8 - - align 16 - wloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + esi] - movdqu xmm3, [eax + esi + 16] - lea eax, [eax + 32] - pavgb xmm0, xmm2 // average rows - pavgb xmm1, xmm3 - - movdqa xmm2, xmm0 // average columns (32 to 16 pixels) - psrlw xmm0, 8 - movdqa xmm3, xmm1 - psrlw xmm1, 8 - pand xmm2, xmm5 - pand xmm3, xmm5 - pavgw xmm0, xmm2 - pavgw xmm1, xmm3 - packuswb xmm0, xmm1 - - sub ecx, 16 - movdqu [edx], xmm0 - lea edx, [edx + 16] - jg wloop - - pop esi - ret - } -} - -#define HAS_SCALEROWDOWN4_SSE2 -// Point samples 32 pixels to 8 pixels. -// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. -__declspec(naked) __declspec(align(16)) -static void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - __asm { - mov eax, [esp + 4] // src_ptr - // src_stride ignored - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff0000 - psrld xmm5, 24 - pslld xmm5, 16 - - align 16 - wloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - lea eax, [eax + 32] - pand xmm0, xmm5 - pand xmm1, xmm5 - packuswb xmm0, xmm1 - psrlw xmm0, 8 - packuswb xmm0, xmm0 - sub ecx, 8 - movq qword ptr [edx], xmm0 - lea edx, [edx + 8] - jg wloop - - ret - } -} - -// Blends 32x4 rectangle to 8x1. -// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. 
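// Illustrative C sketch (not part of this patch) of the 32x4 -> 8x1 box
// filter implemented by ScaleRowDown4Box_SSE2 below: each output pixel is
// the average of a 4x4 block of source pixels.  This sketch sums all 16
// samples and rounds once; the SSE2 code approximates the same result with
// cascaded pavgb/pavgw pairwise rounded averages.  The function name is
// hypothetical.
#include <stdint.h>
static void ScaleRowDown4BoxSketch(const uint8_t* src, int src_stride,
                                   uint8_t* dst, int dst_width) {
  for (int i = 0; i < dst_width; ++i) {
    int sum = 0;
    for (int y = 0; y < 4; ++y) {
      for (int x = 0; x < 4; ++x) {
        sum += src[y * src_stride + 4 * i + x];
      }
    }
    dst[i] = (uint8_t)((sum + 8) >> 4);
  }
}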
-__declspec(naked) __declspec(align(16)) -static void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_ptr - mov esi, [esp + 8 + 8] // src_stride - mov edx, [esp + 8 + 12] // dst_ptr - mov ecx, [esp + 8 + 16] // dst_width - lea edi, [esi + esi * 2] // src_stride * 3 - pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff - psrlw xmm7, 8 - - align 16 - wloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + esi] - movdqa xmm3, [eax + esi + 16] - pavgb xmm0, xmm2 // average rows - pavgb xmm1, xmm3 - movdqa xmm2, [eax + esi * 2] - movdqa xmm3, [eax + esi * 2 + 16] - movdqa xmm4, [eax + edi] - movdqa xmm5, [eax + edi + 16] - lea eax, [eax + 32] - pavgb xmm2, xmm4 - pavgb xmm3, xmm5 - pavgb xmm0, xmm2 - pavgb xmm1, xmm3 - - movdqa xmm2, xmm0 // average columns (32 to 16 pixels) - psrlw xmm0, 8 - movdqa xmm3, xmm1 - psrlw xmm1, 8 - pand xmm2, xmm7 - pand xmm3, xmm7 - pavgw xmm0, xmm2 - pavgw xmm1, xmm3 - packuswb xmm0, xmm1 - - movdqa xmm2, xmm0 // average columns (16 to 8 pixels) - psrlw xmm0, 8 - pand xmm2, xmm7 - pavgw xmm0, xmm2 - packuswb xmm0, xmm0 - - sub ecx, 8 - movq qword ptr [edx], xmm0 - lea edx, [edx + 8] - jg wloop - - pop edi - pop esi - ret - } -} - -#define HAS_SCALEROWDOWN34_SSSE3 -// Point samples 32 pixels to 24 pixels. -// Produces three 8 byte values. For each 8 bytes, 16 bytes are read. -// Then shuffled to do the scaling. - -// Note that movdqa+palign may be better than movdqu. -// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. -__declspec(naked) __declspec(align(16)) -static void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - __asm { - mov eax, [esp + 4] // src_ptr - // src_stride ignored - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width - movdqa xmm3, kShuf0 - movdqa xmm4, kShuf1 - movdqa xmm5, kShuf2 - - align 16 - wloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - lea eax, [eax + 32] - movdqa xmm2, xmm1 - palignr xmm1, xmm0, 8 - pshufb xmm0, xmm3 - pshufb xmm1, xmm4 - pshufb xmm2, xmm5 - movq qword ptr [edx], xmm0 - movq qword ptr [edx + 8], xmm1 - movq qword ptr [edx + 16], xmm2 - lea edx, [edx + 24] - sub ecx, 24 - jg wloop - - ret - } -} - -// Blends 32x2 rectangle to 24x1 -// Produces three 8 byte values. For each 8 bytes, 16 bytes are read. -// Then shuffled to do the scaling. - -// Register usage: -// xmm0 src_row 0 -// xmm1 src_row 1 -// xmm2 shuf 0 -// xmm3 shuf 1 -// xmm4 shuf 2 -// xmm5 madd 0 -// xmm6 madd 1 -// xmm7 kRound34 - -// Note that movdqa+palign may be better than movdqu. -// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. 
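// Illustrative C sketch (not part of this patch) of the horizontal 3/4 box
// filter encoded by the kShuf01/kMadd01/kRound34 constants above and used by
// the SSSE3 code below: each group of 4 row-blended source pixels produces
// 3 output pixels with weights 3:1, 1:1 and 1:3.  The *_1_Box variant first
// averages the two source rows 1:1; the *_0_Box variant weights them roughly
// 3:1 via a double pavgb.  The function name is hypothetical.
#include <stdint.h>
static void ScaleRowDown34FilterSketch(const uint8_t* row /* pre-blended */,
                                       uint8_t* dst,
                                       int dst_width /* multiple of 3 */) {
  for (int x = 0; x < dst_width; x += 3) {
    dst[0] = (uint8_t)((row[0] * 3 + row[1] + 2) >> 2);
    dst[1] = (uint8_t)((row[1] + row[2] + 1) >> 1);
    dst[2] = (uint8_t)((row[2] + row[3] * 3 + 2) >> 2);
    dst += 3;
    row += 4;
  }
}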
-__declspec(naked) __declspec(align(16)) -static void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width - movdqa xmm2, kShuf01 - movdqa xmm3, kShuf11 - movdqa xmm4, kShuf21 - movdqa xmm5, kMadd01 - movdqa xmm6, kMadd11 - movdqa xmm7, kRound34 - - align 16 - wloop: - movdqa xmm0, [eax] // pixels 0..7 - movdqa xmm1, [eax + esi] - pavgb xmm0, xmm1 - pshufb xmm0, xmm2 - pmaddubsw xmm0, xmm5 - paddsw xmm0, xmm7 - psrlw xmm0, 2 - packuswb xmm0, xmm0 - movq qword ptr [edx], xmm0 - movdqu xmm0, [eax + 8] // pixels 8..15 - movdqu xmm1, [eax + esi + 8] - pavgb xmm0, xmm1 - pshufb xmm0, xmm3 - pmaddubsw xmm0, xmm6 - paddsw xmm0, xmm7 - psrlw xmm0, 2 - packuswb xmm0, xmm0 - movq qword ptr [edx + 8], xmm0 - movdqa xmm0, [eax + 16] // pixels 16..23 - movdqa xmm1, [eax + esi + 16] - lea eax, [eax + 32] - pavgb xmm0, xmm1 - pshufb xmm0, xmm4 - movdqa xmm1, kMadd21 - pmaddubsw xmm0, xmm1 - paddsw xmm0, xmm7 - psrlw xmm0, 2 - packuswb xmm0, xmm0 - sub ecx, 24 - movq qword ptr [edx + 16], xmm0 - lea edx, [edx + 24] - jg wloop - - pop esi - ret - } -} - -// Note that movdqa+palign may be better than movdqu. -// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. -__declspec(naked) __declspec(align(16)) -static void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width - movdqa xmm2, kShuf01 - movdqa xmm3, kShuf11 - movdqa xmm4, kShuf21 - movdqa xmm5, kMadd01 - movdqa xmm6, kMadd11 - movdqa xmm7, kRound34 - - align 16 - wloop: - movdqa xmm0, [eax] // pixels 0..7 - movdqa xmm1, [eax + esi] - pavgb xmm1, xmm0 - pavgb xmm0, xmm1 - pshufb xmm0, xmm2 - pmaddubsw xmm0, xmm5 - paddsw xmm0, xmm7 - psrlw xmm0, 2 - packuswb xmm0, xmm0 - movq qword ptr [edx], xmm0 - movdqu xmm0, [eax + 8] // pixels 8..15 - movdqu xmm1, [eax + esi + 8] - pavgb xmm1, xmm0 - pavgb xmm0, xmm1 - pshufb xmm0, xmm3 - pmaddubsw xmm0, xmm6 - paddsw xmm0, xmm7 - psrlw xmm0, 2 - packuswb xmm0, xmm0 - movq qword ptr [edx + 8], xmm0 - movdqa xmm0, [eax + 16] // pixels 16..23 - movdqa xmm1, [eax + esi + 16] - lea eax, [eax + 32] - pavgb xmm1, xmm0 - pavgb xmm0, xmm1 - pshufb xmm0, xmm4 - movdqa xmm1, kMadd21 - pmaddubsw xmm0, xmm1 - paddsw xmm0, xmm7 - psrlw xmm0, 2 - packuswb xmm0, xmm0 - sub ecx, 24 - movq qword ptr [edx + 16], xmm0 - lea edx, [edx+24] - jg wloop - - pop esi - ret - } -} - -#define HAS_SCALEROWDOWN38_SSSE3 -// 3/8 point sampler - -// Scale 32 pixels to 12 -__declspec(naked) __declspec(align(16)) -static void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - __asm { - mov eax, [esp + 4] // src_ptr - // src_stride ignored - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width - movdqa xmm4, kShuf38a - movdqa xmm5, kShuf38b - - align 16 - xloop: - movdqa xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5 - movdqa xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11 - lea eax, [eax + 32] - pshufb xmm0, xmm4 - pshufb xmm1, xmm5 - paddusb xmm0, xmm1 - - sub ecx, 12 - movq qword ptr [edx], xmm0 // write 12 pixels - movhlps xmm1, xmm0 - movd [edx + 8], xmm1 - lea edx, [edx + 12] - jg xloop - - ret - } -} - -// Scale 16x3 
pixels to 6x1 with interpolation -__declspec(naked) __declspec(align(16)) -static void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width - movdqa xmm2, kShufAc - movdqa xmm3, kShufAc3 - movdqa xmm4, kScaleAc33 - pxor xmm5, xmm5 - - align 16 - xloop: - movdqa xmm0, [eax] // sum up 3 rows into xmm0/1 - movdqa xmm6, [eax + esi] - movhlps xmm1, xmm0 - movhlps xmm7, xmm6 - punpcklbw xmm0, xmm5 - punpcklbw xmm1, xmm5 - punpcklbw xmm6, xmm5 - punpcklbw xmm7, xmm5 - paddusw xmm0, xmm6 - paddusw xmm1, xmm7 - movdqa xmm6, [eax + esi * 2] - lea eax, [eax + 16] - movhlps xmm7, xmm6 - punpcklbw xmm6, xmm5 - punpcklbw xmm7, xmm5 - paddusw xmm0, xmm6 - paddusw xmm1, xmm7 - - movdqa xmm6, xmm0 // 8 pixels -> 0,1,2 of xmm6 - psrldq xmm0, 2 - paddusw xmm6, xmm0 - psrldq xmm0, 2 - paddusw xmm6, xmm0 - pshufb xmm6, xmm2 - - movdqa xmm7, xmm1 // 8 pixels -> 3,4,5 of xmm6 - psrldq xmm1, 2 - paddusw xmm7, xmm1 - psrldq xmm1, 2 - paddusw xmm7, xmm1 - pshufb xmm7, xmm3 - paddusw xmm6, xmm7 - - pmulhuw xmm6, xmm4 // divide by 9,9,6, 9,9,6 - packuswb xmm6, xmm6 - - sub ecx, 6 - movd [edx], xmm6 // write 6 pixels - psrlq xmm6, 16 - movd [edx + 2], xmm6 - lea edx, [edx + 6] - jg xloop - - pop esi - ret - } -} - -// Scale 16x2 pixels to 6x1 with interpolation -__declspec(naked) __declspec(align(16)) -static void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width - movdqa xmm2, kShufAb0 - movdqa xmm3, kShufAb1 - movdqa xmm4, kShufAb2 - movdqa xmm5, kScaleAb2 - - align 16 - xloop: - movdqa xmm0, [eax] // average 2 rows into xmm0 - pavgb xmm0, [eax + esi] - lea eax, [eax + 16] - - movdqa xmm1, xmm0 // 16 pixels -> 0,1,2,3,4,5 of xmm1 - pshufb xmm1, xmm2 - movdqa xmm6, xmm0 - pshufb xmm6, xmm3 - paddusw xmm1, xmm6 - pshufb xmm0, xmm4 - paddusw xmm1, xmm0 - - pmulhuw xmm1, xmm5 // divide by 3,3,2, 3,3,2 - packuswb xmm1, xmm1 - - sub ecx, 6 - movd [edx], xmm1 // write 6 pixels - psrlq xmm1, 16 - movd [edx + 2], xmm1 - lea edx, [edx + 6] - jg xloop - - pop esi - ret - } -} - -#define HAS_SCALEADDROWS_SSE2 - -// Reads 16xN bytes and produces 16 shorts at a time. -// TODO(fbarchard): Make this handle 4xN bytes for any width ARGB. 
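// Illustrative C equivalent (not part of this patch) of ScaleAddRows below:
// every column of src_height source bytes is summed into a 16-bit
// accumulator, which the planar box scaler later divides by the box area.
// This mirrors the ScaleAddRows_C declaration added to scale_row.h in this
// change; the function name here is hypothetical.
#include <stddef.h>
#include <stdint.h>
static void ScaleAddRowsSketch(const uint8_t* src, ptrdiff_t src_stride,
                               uint16_t* dst, int src_width, int src_height) {
  for (int x = 0; x < src_width; ++x) {
    unsigned int sum = 0;
    for (int y = 0; y < src_height; ++y) {
      sum += src[x + (ptrdiff_t)y * src_stride];
    }
    dst[x] = (uint16_t)sum;  // The SSE2 version uses saturating adds (paddusw).
  }
}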
-__declspec(naked) __declspec(align(16)) -static void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, - uint16* dst_ptr, int src_width, - int src_height) { - __asm { - push esi - push edi - push ebx - push ebp - mov esi, [esp + 16 + 4] // src_ptr - mov edx, [esp + 16 + 8] // src_stride - mov edi, [esp + 16 + 12] // dst_ptr - mov ecx, [esp + 16 + 16] // dst_width - mov ebx, [esp + 16 + 20] // height - pxor xmm4, xmm4 - dec ebx - - align 16 - xloop: - // first row - movdqa xmm0, [esi] - lea eax, [esi + edx] - movdqa xmm1, xmm0 - punpcklbw xmm0, xmm4 - punpckhbw xmm1, xmm4 - lea esi, [esi + 16] - mov ebp, ebx - test ebp, ebp - je ydone - - // sum remaining rows - align 16 - yloop: - movdqa xmm2, [eax] // read 16 pixels - lea eax, [eax + edx] // advance to next row - movdqa xmm3, xmm2 - punpcklbw xmm2, xmm4 - punpckhbw xmm3, xmm4 - paddusw xmm0, xmm2 // sum 16 words - paddusw xmm1, xmm3 - sub ebp, 1 - jg yloop - - align 16 - ydone: - movdqa [edi], xmm0 - movdqa [edi + 16], xmm1 - lea edi, [edi + 32] - - sub ecx, 16 - jg xloop - - pop ebp - pop ebx - pop edi - pop esi - ret - } -} - -// Bilinear column filtering. SSSE3 version. -// TODO(fbarchard): Port to Neon - -#define HAS_SCALEFILTERCOLS_SSSE3 -__declspec(naked) __declspec(align(16)) -static void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx) { - __asm { - push ebx - push esi - push edi - mov edi, [esp + 12 + 4] // dst_ptr - mov esi, [esp + 12 + 8] // src_ptr - mov ecx, [esp + 12 + 12] // dst_width - movd xmm2, [esp + 12 + 16] // x - movd xmm3, [esp + 12 + 20] // dx - mov eax, 0x04040000 // shuffle to line up fractions with pixel. - movd xmm5, eax - pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction. - psrlw xmm6, 9 - pextrw eax, xmm2, 1 // get x0 integer. preroll - sub ecx, 2 - jl xloop29 - - movdqa xmm0, xmm2 // x1 = x0 + dx - paddd xmm0, xmm3 - punpckldq xmm2, xmm0 // x0 x1 - punpckldq xmm3, xmm3 // dx dx - paddd xmm3, xmm3 // dx * 2, dx * 2 - pextrw edx, xmm2, 3 // get x1 integer. preroll - - // 2 Pixel loop. - align 4 - xloop2: - movdqa xmm1, xmm2 // x0, x1 fractions. - paddd xmm2, xmm3 // x += dx - movzx ebx, word ptr [esi + eax] // 2 source x0 pixels - movd xmm0, ebx - psrlw xmm1, 9 // 7 bit fractions. - movzx ebx, word ptr [esi + edx] // 2 source x1 pixels - movd xmm4, ebx - pshufb xmm1, xmm5 // 0011 - punpcklwd xmm0, xmm4 - pxor xmm1, xmm6 // 0..7f and 7f..0 - pmaddubsw xmm0, xmm1 // 16 bit, 2 pixels. - pextrw eax, xmm2, 1 // get x0 integer. next iteration. - pextrw edx, xmm2, 3 // get x1 integer. next iteration. - psrlw xmm0, 7 // 8.7 fixed point to low 8 bits. - packuswb xmm0, xmm0 // 8 bits, 2 pixels. - movd ebx, xmm0 - mov [edi], bx - lea edi, [edi + 2] - sub ecx, 2 // 2 pixels - jge xloop2 - - align 4 - xloop29: - - add ecx, 2 - 1 - jl xloop99 - - // 1 pixel remainder - movzx ebx, word ptr [esi + eax] // 2 source x0 pixels - movd xmm0, ebx - psrlw xmm2, 9 // 7 bit fractions. - pshufb xmm2, xmm5 // 0011 - pxor xmm2, xmm6 // 0..7f and 7f..0 - pmaddubsw xmm0, xmm2 // 16 bit - psrlw xmm0, 7 // 8.7 fixed point to low 8 bits. - packuswb xmm0, xmm0 // 8 bits - movd ebx, xmm0 - mov [edi], bl - - align 16 - xloop99: - - pop edi - pop esi - pop ebx - ret - } -} - -#define HAS_SCALECOLSUP2_SSE2 -// Reads 16 pixels, duplicates them and writes 32 pixels. -// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. 
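// Illustrative C equivalent (not part of this patch) of ScaleColsUp2 below:
// a 2x point upsample that simply writes every source pixel twice.  The
// SSE2 code performs the same duplication with punpcklbw/punpckhbw of a
// register against itself.  The function name is hypothetical.
#include <stdint.h>
static void ScaleColsUp2Sketch(uint8_t* dst, const uint8_t* src,
                               int dst_width /* even */) {
  for (int i = 0; i < dst_width / 2; ++i) {
    dst[2 * i] = dst[2 * i + 1] = src[i];
  }
}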
-__declspec(naked) __declspec(align(16)) -void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int /* x */, int /* dx */) { - __asm { - mov edx, [esp + 4] // dst_ptr - mov eax, [esp + 8] // src_ptr - mov ecx, [esp + 12] // dst_width - - align 16 - wloop: - movdqa xmm0, [eax] - lea eax, [eax + 16] - movdqa xmm1, xmm0 - punpcklbw xmm0, xmm0 - punpckhbw xmm1, xmm1 - sub ecx, 32 - movdqa [edx], xmm0 - movdqa [edx + 16], xmm1 - lea edx, [edx + 32] - jg wloop - - ret - } -} - -#elif !defined(LIBYUV_DISABLE_X86) && \ - ((defined(__x86_64__)) || defined(__i386__)) - -// TODO(nfullagar): For Native Client: When new toolchain becomes available, -// take advantage of bundle lock / unlock feature. This will reduce the amount -// of manual bundle alignment done below, and bundle alignment could even be -// moved into each macro that doesn't use %%nacl: such as MEMOPREG. - -#if defined(__native_client__) && defined(__x86_64__) -#define MEMACCESS(base) "%%nacl:(%%r15,%q" #base ")" -#define MEMACCESS2(offset, base) "%%nacl:" #offset "(%%r15,%q" #base ")" -#define MEMLEA(offset, base) #offset "(%q" #base ")" -#define MEMLEA3(offset, index, scale) \ - #offset "(,%q" #index "," #scale ")" -#define MEMLEA4(offset, base, index, scale) \ - #offset "(%q" #base ",%q" #index "," #scale ")" -#define MEMOPREG(opcode, offset, base, index, scale, reg) \ - "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \ - #opcode " (%%r15,%%r14),%%" #reg "\n" -#define MEMOPREGK(opcode, offset, base, index, scale, reg) \ - "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \ - #opcode " (%%r15,%%r14),%k" #reg "\n" -#define MEMOPMEM(opcode, reg, offset, base, index, scale) \ - "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \ - #opcode " %%" #reg ",(%%r15,%%r14)\n" -#define BUNDLEALIGN ".p2align 5 \n" -#else -#define MEMACCESS(base) "(%" #base ")" -#define MEMACCESS2(offset, base) #offset "(%" #base ")" -#define MEMLEA(offset, base) #offset "(%" #base ")" -#define MEMLEA3(offset, index, scale) \ - #offset "(,%" #index "," #scale ")" -#define MEMLEA4(offset, base, index, scale) \ - #offset "(%" #base ",%" #index "," #scale ")" -#define MEMOPREG(opcode, offset, base, index, scale, reg) \ - #opcode " " #offset "(%" #base ",%" #index "," #scale "),%%" #reg "\n" -#define MEMOPREGK(opcode, offset, base, index, scale, reg) \ - #opcode " " #offset "(%" #base ",%" #index "," #scale "),%k" #reg "\n" -#define MEMOPMEM(opcode, reg, offset, base, index, scale) \ - #opcode " %%" #reg ","#offset "(%" #base ",%" #index "," #scale ")\n" -#define BUNDLEALIGN -#endif - -// GCC versions of row functions are verbatim conversions from Visual C. 
-// Generated using gcc disassembly on Visual C object file: -// objdump -D yuvscaler.obj >yuvscaler.txt - -#define HAS_SCALEROWDOWN2_SSE2 -static void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - ".p2align 4 \n" - BUNDLEALIGN - "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1" -#endif - ); -} - -void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - ".p2align 4 \n" - BUNDLEALIGN - "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10, 0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "psrlw $0x8,%%xmm0 \n" - "movdqa %%xmm1,%%xmm3 \n" - "psrlw $0x8,%%xmm1 \n" - "pand %%xmm5,%%xmm2 \n" - "pand %%xmm5,%%xmm3 \n" - "pavgw %%xmm2,%%xmm0 \n" - "pavgw %%xmm3,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm5" -#endif - ); -} - -void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - ".p2align 4 \n" - BUNDLEALIGN - "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - MEMOPREG(movdqa,0x00,0,3,1,xmm2) // movdqa (%0,%3,1),%%xmm2 - BUNDLEALIGN - MEMOPREG(movdqa,0x10,0,3,1,xmm3) // movdqa 0x10(%0,%3,1),%%xmm3 - "lea " MEMLEA(0x20,0) ",%0 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "psrlw $0x8,%%xmm0 \n" - "movdqa %%xmm1,%%xmm3 \n" - "psrlw $0x8,%%xmm1 \n" - "pand %%xmm5,%%xmm2 \n" - "pand %%xmm5,%%xmm3 \n" - "pavgw %%xmm2,%%xmm0 \n" - "pavgw %%xmm3,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(static_cast(src_stride)) // %3 - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" -#endif - ); -} - -static void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - ".p2align 4 \n" - BUNDLEALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1" -#endif - ); -} - -static void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr, ptrdiff_t, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "pcmpeqb 
%%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - ".p2align 4 \n" - BUNDLEALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "psrlw $0x8,%%xmm0 \n" - "movdqa %%xmm1,%%xmm3 \n" - "psrlw $0x8,%%xmm1 \n" - "pand %%xmm5,%%xmm2 \n" - "pand %%xmm5,%%xmm3 \n" - "pavgw %%xmm2,%%xmm0 \n" - "pavgw %%xmm3,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm5" -#endif - ); -} - -static void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - ".p2align 4 \n" - BUNDLEALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2 - BUNDLEALIGN - MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3 - "lea " MEMLEA(0x20,0) ",%0 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "psrlw $0x8,%%xmm0 \n" - "movdqa %%xmm1,%%xmm3 \n" - "psrlw $0x8,%%xmm1 \n" - "pand %%xmm5,%%xmm2 \n" - "pand %%xmm5,%%xmm3 \n" - "pavgw %%xmm2,%%xmm0 \n" - "pavgw %%xmm3,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(static_cast(src_stride)) // %3 - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" -#endif - ); -} - -#define HAS_SCALEROWDOWN4_SSE2 -static void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrld $0x18,%%xmm5 \n" - "pslld $0x10,%%xmm5 \n" - ".p2align 4 \n" - BUNDLEALIGN - "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "psrlw $0x8,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm5" -#endif - ); -} - -static void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - intptr_t stridex3 = 0; - asm volatile ( - "pcmpeqb %%xmm7,%%xmm7 \n" - "psrlw $0x8,%%xmm7 \n" - "lea " MEMLEA4(0x00,4,4,2) ",%3 \n" - ".p2align 4 \n" - BUNDLEALIGN - "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - MEMOPREG(movdqa,0x00,0,4,1,xmm2) // movdqa (%0,%4,1),%%xmm2 - BUNDLEALIGN - MEMOPREG(movdqa,0x10,0,4,1,xmm3) // movdqa 0x10(%0,%4,1),%%xmm3 - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - MEMOPREG(movdqa,0x00,0,4,2,xmm2) // movdqa (%0,%4,2),%%xmm2 - BUNDLEALIGN - MEMOPREG(movdqa,0x10,0,4,2,xmm3) // movdqa 0x10(%0,%4,2),%%xmm3 - MEMOPREG(movdqa,0x00,0,3,1,xmm4) // movdqa (%0,%3,1),%%xmm4 - MEMOPREG(movdqa,0x10,0,3,1,xmm5) // movdqa 0x10(%0,%3,1),%%xmm5 - "lea " MEMLEA(0x20,0) 
",%0 \n" - "pavgb %%xmm4,%%xmm2 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm5,%%xmm3 \n" - "pavgb %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "psrlw $0x8,%%xmm0 \n" - "movdqa %%xmm1,%%xmm3 \n" - "psrlw $0x8,%%xmm1 \n" - "pand %%xmm7,%%xmm2 \n" - "pand %%xmm7,%%xmm3 \n" - "pavgw %%xmm2,%%xmm0 \n" - "pavgw %%xmm3,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "psrlw $0x8,%%xmm0 \n" - "pand %%xmm7,%%xmm2 \n" - "pavgw %%xmm2,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(stridex3) // %3 - : "r"(static_cast(src_stride)) // %4 - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm7" -#endif - ); -} - -#define HAS_SCALEROWDOWN34_SSSE3 -static void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "movdqa %0,%%xmm3 \n" - "movdqa %1,%%xmm4 \n" - "movdqa %2,%%xmm5 \n" - : - : "m"(kShuf0), // %0 - "m"(kShuf1), // %1 - "m"(kShuf2) // %2 - ); - asm volatile ( - ".p2align 4 \n" - BUNDLEALIGN - "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm2 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "movdqa %%xmm2,%%xmm1 \n" - "palignr $0x8,%%xmm0,%%xmm1 \n" - "pshufb %%xmm3,%%xmm0 \n" - "pshufb %%xmm4,%%xmm1 \n" - "pshufb %%xmm5,%%xmm2 \n" - "movq %%xmm0," MEMACCESS(1) " \n" - "movq %%xmm1," MEMACCESS2(0x8,1) " \n" - "movq %%xmm2," MEMACCESS2(0x10,1) " \n" - "lea " MEMLEA(0x18,1) ",%1 \n" - "sub $0x18,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif - ); -} - -static void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "movdqa %0,%%xmm2 \n" // kShuf01 - "movdqa %1,%%xmm3 \n" // kShuf11 - "movdqa %2,%%xmm4 \n" // kShuf21 - : - : "m"(kShuf01), // %0 - "m"(kShuf11), // %1 - "m"(kShuf21) // %2 - ); - asm volatile ( - "movdqa %0,%%xmm5 \n" // kMadd01 - "movdqa %1,%%xmm0 \n" // kMadd11 - "movdqa %2,%%xmm1 \n" // kRound34 - : - : "m"(kMadd01), // %0 - "m"(kMadd11), // %1 - "m"(kRound34) // %2 - ); - asm volatile ( - ".p2align 4 \n" - BUNDLEALIGN - "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm6 \n" - MEMOPREG(movdqa,0x00,0,3,1,xmm7) // movdqa (%0,%3),%%xmm7 - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm5,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6," MEMACCESS(1) " \n" - "movdqu " MEMACCESS2(0x8,0) ",%%xmm6 \n" - MEMOPREG(movdqu,0x8,0,3,1,xmm7) // movdqu 0x8(%0,%3),%%xmm7 - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm3,%%xmm6 \n" - "pmaddubsw %%xmm0,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6," MEMACCESS2(0x8,1) " \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm6 \n" - BUNDLEALIGN - MEMOPREG(movdqa,0x10,0,3,1,xmm7) // movdqa 0x10(%0,%3),%%xmm7 - "lea " MEMLEA(0x20,0) ",%0 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm4,%%xmm6 \n" - "pmaddubsw %4,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6," MEMACCESS2(0x10,1) " \n" - "lea " MEMLEA(0x18,1) ",%1 \n" - "sub $0x18,%2 \n" - "jg 1b \n" - : 
"+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(static_cast(src_stride)), // %3 - "m"(kMadd21) // %4 - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" -#endif - ); -} - -static void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "movdqa %0,%%xmm2 \n" // kShuf01 - "movdqa %1,%%xmm3 \n" // kShuf11 - "movdqa %2,%%xmm4 \n" // kShuf21 - : - : "m"(kShuf01), // %0 - "m"(kShuf11), // %1 - "m"(kShuf21) // %2 - ); - asm volatile ( - "movdqa %0,%%xmm5 \n" // kMadd01 - "movdqa %1,%%xmm0 \n" // kMadd11 - "movdqa %2,%%xmm1 \n" // kRound34 - : - : "m"(kMadd01), // %0 - "m"(kMadd11), // %1 - "m"(kRound34) // %2 - ); - - asm volatile ( - ".p2align 4 \n" - BUNDLEALIGN - "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm6 \n" - MEMOPREG(movdqa,0x00,0,3,1,xmm7) // movdqa (%0,%3,1),%%xmm7 - "pavgb %%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm5,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6," MEMACCESS(1) " \n" - "movdqu " MEMACCESS2(0x8,0) ",%%xmm6 \n" - MEMOPREG(movdqu,0x8,0,3,1,xmm7) // movdqu 0x8(%0,%3,1),%%xmm7 - "pavgb %%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm3,%%xmm6 \n" - "pmaddubsw %%xmm0,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6," MEMACCESS2(0x8,1) " \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm6 \n" - MEMOPREG(movdqa,0x10,0,3,1,xmm7) // movdqa 0x10(%0,%3,1),%%xmm7 - "lea " MEMLEA(0x20,0) ",%0 \n" - "pavgb %%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm4,%%xmm6 \n" - "pmaddubsw %4,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6," MEMACCESS2(0x10,1) " \n" - "lea " MEMLEA(0x18,1) ",%1 \n" - "sub $0x18,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(static_cast(src_stride)), // %3 - "m"(kMadd21) // %4 - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" -#endif - ); -} - -#define HAS_SCALEROWDOWN38_SSSE3 -static void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" - ".p2align 4 \n" - BUNDLEALIGN - "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "pshufb %%xmm4,%%xmm0 \n" - "pshufb %%xmm5,%%xmm1 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movq %%xmm0," MEMACCESS(1) " \n" - "movhlps %%xmm0,%%xmm1 \n" - "movd %%xmm1," MEMACCESS2(0x8,1) " \n" - "lea " MEMLEA(0xc,1) ",%1 \n" - "sub $0xc,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "m"(kShuf38a), // %3 - "m"(kShuf38b) // %4 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm4", "xmm5" -#endif - ); -} - -static void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "movdqa %0,%%xmm2 \n" - "movdqa %1,%%xmm3 \n" - "movdqa %2,%%xmm4 \n" - "movdqa %3,%%xmm5 \n" - : - : "m"(kShufAb0), // %0 - "m"(kShufAb1), // %1 - "m"(kShufAb2), // %2 - "m"(kScaleAb2) // %3 - ); - asm volatile ( - 
".p2align 4 \n" - BUNDLEALIGN - "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(pavgb,0x00,0,3,1,xmm0) // pavgb (%0,%3,1),%%xmm0 - "lea " MEMLEA(0x10,0) ",%0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pshufb %%xmm2,%%xmm1 \n" - "movdqa %%xmm0,%%xmm6 \n" - "pshufb %%xmm3,%%xmm6 \n" - "paddusw %%xmm6,%%xmm1 \n" - "pshufb %%xmm4,%%xmm0 \n" - "paddusw %%xmm0,%%xmm1 \n" - "pmulhuw %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "sub $0x6,%2 \n" - "movd %%xmm1," MEMACCESS(1) " \n" - "psrlq $0x10,%%xmm1 \n" - "movd %%xmm1," MEMACCESS2(0x2,1) " \n" - "lea " MEMLEA(0x6,1) ",%1 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(static_cast(src_stride)) // %3 - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" -#endif - ); -} - -static void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "movdqa %0,%%xmm2 \n" - "movdqa %1,%%xmm3 \n" - "movdqa %2,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" - : - : "m"(kShufAc), // %0 - "m"(kShufAc3), // %1 - "m"(kScaleAc33) // %2 - ); - asm volatile ( - ".p2align 4 \n" - BUNDLEALIGN - "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movdqa,0x00,0,3,1,xmm6) // movdqa (%0,%3,1),%%xmm6 - "movhlps %%xmm0,%%xmm1 \n" - "movhlps %%xmm6,%%xmm7 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm6 \n" - "punpcklbw %%xmm5,%%xmm7 \n" - "paddusw %%xmm6,%%xmm0 \n" - "paddusw %%xmm7,%%xmm1 \n" - MEMOPREG(movdqa,0x00,0,3,2,xmm6) // movdqa (%0,%3,2),%%xmm6 - "lea " MEMLEA(0x10,0) ",%0 \n" - "movhlps %%xmm6,%%xmm7 \n" - "punpcklbw %%xmm5,%%xmm6 \n" - "punpcklbw %%xmm5,%%xmm7 \n" - "paddusw %%xmm6,%%xmm0 \n" - "paddusw %%xmm7,%%xmm1 \n" - "movdqa %%xmm0,%%xmm6 \n" - "psrldq $0x2,%%xmm0 \n" - "paddusw %%xmm0,%%xmm6 \n" - "psrldq $0x2,%%xmm0 \n" - "paddusw %%xmm0,%%xmm6 \n" - "pshufb %%xmm2,%%xmm6 \n" - "movdqa %%xmm1,%%xmm7 \n" - "psrldq $0x2,%%xmm1 \n" - "paddusw %%xmm1,%%xmm7 \n" - "psrldq $0x2,%%xmm1 \n" - "paddusw %%xmm1,%%xmm7 \n" - "pshufb %%xmm3,%%xmm7 \n" - "paddusw %%xmm7,%%xmm6 \n" - "pmulhuw %%xmm4,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "sub $0x6,%2 \n" - "movd %%xmm6," MEMACCESS(1) " \n" - "psrlq $0x10,%%xmm6 \n" - "movd %%xmm6," MEMACCESS2(0x2,1) " \n" - "lea " MEMLEA(0x6,1) ",%1 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(static_cast(src_stride)) // %3 - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" -#endif - ); -} - -#define HAS_SCALEADDROWS_SSE2 -static void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, - uint16* dst_ptr, int src_width, int src_height) { - int tmp_height = 0; - intptr_t tmp_src = 0; - asm volatile ( - "pxor %%xmm4,%%xmm4 \n" - "sub $0x1,%5 \n" - ".p2align 4 \n" - BUNDLEALIGN - "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "mov %0,%3 \n" - "add %6,%0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm4,%%xmm0 \n" - "punpckhbw %%xmm4,%%xmm1 \n" - "mov %5,%2 \n" - "test %2,%2 \n" - "je 3f \n" - ".p2align 4 \n" - BUNDLEALIGN - "2: \n" - "movdqa " MEMACCESS(0) ",%%xmm2 \n" - "add %6,%0 \n" - "movdqa %%xmm2,%%xmm3 \n" - "punpcklbw %%xmm4,%%xmm2 \n" - "punpckhbw %%xmm4,%%xmm3 \n" - "paddusw %%xmm2,%%xmm0 \n" - "paddusw %%xmm3,%%xmm1 \n" - "sub $0x1,%2 \n" - "jg 2b \n" - 
".p2align 4 \n" - "3: \n" - BUNDLEALIGN - "movdqa %%xmm0," MEMACCESS(1) " \n" - "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" - "lea " MEMLEA(0x10,3) ",%0 \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x10,%4 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(tmp_height), // %2 - "+r"(tmp_src), // %3 - "+r"(src_width), // %4 - "+rm"(src_height) // %5 - : "rm"(static_cast(src_stride)) // %6 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" -#endif - ); -} - -// Bilinear column filtering. SSSE3 version. -#define HAS_SCALEFILTERCOLS_SSSE3 -static void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx) { - intptr_t x0 = 0, x1 = 0, temp_pixel = 0; - asm volatile ( - "movd %6,%%xmm2 \n" - "movd %7,%%xmm3 \n" - "movl $0x04040000,%k2 \n" - "movd %k2,%%xmm5 \n" - "pcmpeqb %%xmm6,%%xmm6 \n" - "psrlw $0x9,%%xmm6 \n" - "pextrw $0x1,%%xmm2,%k3 \n" - "subl $0x2,%5 \n" - "jl 29f \n" - "movdqa %%xmm2,%%xmm0 \n" - "paddd %%xmm3,%%xmm0 \n" - "punpckldq %%xmm0,%%xmm2 \n" - "punpckldq %%xmm3,%%xmm3 \n" - "paddd %%xmm3,%%xmm3 \n" - "pextrw $0x3,%%xmm2,%k4 \n" - ".p2align 2 \n" - BUNDLEALIGN - "2: \n" - "movdqa %%xmm2,%%xmm1 \n" - "paddd %%xmm3,%%xmm2 \n" - MEMOPREGK(movzwl,0x00,1,3,1,2) // movzwl (%1,%3,1),%k2 - "movd %k2,%%xmm0 \n" - "psrlw $0x9,%%xmm1 \n" - BUNDLEALIGN - MEMOPREGK(movzwl,0x00,1,4,1,2) // movzwl (%1,%4,1),%k2 - "movd %k2,%%xmm4 \n" - "pshufb %%xmm5,%%xmm1 \n" - "punpcklwd %%xmm4,%%xmm0 \n" - "pxor %%xmm6,%%xmm1 \n" - "pmaddubsw %%xmm1,%%xmm0 \n" - "pextrw $0x1,%%xmm2,%k3 \n" - "pextrw $0x3,%%xmm2,%k4 \n" - "psrlw $0x7,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movd %%xmm0,%k2 \n" - "mov %w2," MEMACCESS(0) " \n" - "lea " MEMLEA(0x2,0) ",%0 \n" - "sub $0x2,%5 \n" - "jge 2b \n" - ".p2align 2 \n" - BUNDLEALIGN - "29: \n" - "addl $0x1,%5 \n" - "jl 99f \n" - MEMOPREGK(movzwl,0x00,1,3,1,2) // movzwl (%1,%3,1),%k2 - "movd %k2,%%xmm0 \n" - "psrlw $0x9,%%xmm2 \n" - "pshufb %%xmm5,%%xmm2 \n" - "pxor %%xmm6,%%xmm2 \n" - "pmaddubsw %%xmm2,%%xmm0 \n" - "psrlw $0x7,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movd %%xmm0,%k2 \n" - "mov %b2," MEMACCESS(0) " \n" - "99: \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+a"(temp_pixel), // %2 - "+r"(x0), // %3 - "+r"(x1), // %4 - "+rm"(dst_width) // %5 - : "rm"(x), // %6 - "rm"(dx) // %7 - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" -#endif - ); -} - -// Reads 4 pixels, duplicates them and writes 8 pixels. -// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. 
-#define HAS_SCALECOLSUP2_SSE2 -void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int /* x */, int /* dx */) { - asm volatile ( - ".p2align 4 \n" - BUNDLEALIGN - "1: \n" - "movdqa " MEMACCESS(1) ",%%xmm0 \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm1 \n" - "sub $0x20,%2 \n" - "movdqa %%xmm0," MEMACCESS(0) " \n" - "movdqa %%xmm1," MEMACCESS2(0x10,0) " \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "jg 1b \n" - - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1" -#endif - ); -} - -#endif // defined(__x86_64__) || defined(__i386__) - -#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \ - defined(__mips_dsp) && (__mips_dsp_rev >= 2) -#define HAS_SCALEROWDOWN2_MIPS_DSPR2 -void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */, - uint8* dst, int dst_width); -void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -#define HAS_SCALEROWDOWN4_MIPS_DSPR2 -void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */, - uint8* dst, int dst_width); -void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -#define HAS_SCALEROWDOWN34_MIPS_DSPR2 -void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */, - uint8* dst, int dst_width); -void ScaleRowDown34_0_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* d, int dst_width); -void ScaleRowDown34_1_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* d, int dst_width); -#define HAS_SCALEROWDOWN38_MIPS_DSPR2 -void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */, - uint8* dst, int dst_width); -void ScaleRowDown38_2_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown38_3_Box_MIPS_DSPR2(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -#endif // defined(__mips_dsp) && (__mips_dsp_rev >= 2) - -// CPU agnostic row functions -static void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t /* src_stride */, - uint8* dst, int dst_width) { - uint8* dend = dst + dst_width - 1; - do { - dst[0] = src_ptr[1]; - dst[1] = src_ptr[3]; - dst += 2; - src_ptr += 4; - } while (dst < dend); - if (dst_width & 1) { - dst[0] = src_ptr[1]; - } -} - -void ScaleRowDown2Linear_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { - const uint8* s = src_ptr; - uint8* dend = dst + dst_width - 1; - do { - dst[0] = (s[0] + s[1] + 1) >> 1; - dst[1] = (s[2] + s[3] + 1) >> 1; - dst += 2; - s += 4; - } while (dst < dend); - if (dst_width & 1) { - dst[0] = (s[0] + s[1] + 1) >> 1; - } -} - -void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { - const uint8* s = src_ptr; - const uint8* t = src_ptr + src_stride; - uint8* dend = dst + dst_width - 1; - do { - dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; - dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2; - dst += 2; - s += 4; - t += 4; - } while (dst < dend); - if (dst_width & 1) { - dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; - } -} - -static void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t /* src_stride */, - uint8* dst, int dst_width) { - uint8* dend = dst + dst_width - 1; - do { - dst[0] = src_ptr[2]; - dst[1] = src_ptr[6]; - dst += 2; - src_ptr += 8; - } while (dst < 
dend); - if (dst_width & 1) { - dst[0] = src_ptr[2]; - } -} - -static void ScaleRowDown4Box_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { - intptr_t stride = src_stride; - uint8* dend = dst + dst_width - 1; - do { - dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + - src_ptr[stride + 0] + src_ptr[stride + 1] + - src_ptr[stride + 2] + src_ptr[stride + 3] + - src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] + - src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] + - src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] + - src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] + - 8) >> 4; - dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] + - src_ptr[stride + 4] + src_ptr[stride + 5] + - src_ptr[stride + 6] + src_ptr[stride + 7] + - src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5] + - src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7] + - src_ptr[stride * 3 + 4] + src_ptr[stride * 3 + 5] + - src_ptr[stride * 3 + 6] + src_ptr[stride * 3 + 7] + - 8) >> 4; - dst += 2; - src_ptr += 8; - } while (dst < dend); - if (dst_width & 1) { - dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + - src_ptr[stride + 0] + src_ptr[stride + 1] + - src_ptr[stride + 2] + src_ptr[stride + 3] + - src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] + - src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] + - src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] + - src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] + - 8) >> 4; - } -} - -static void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t /* src_stride */, - uint8* dst, int dst_width) { - assert((dst_width % 3 == 0) && (dst_width > 0)); - uint8* dend = dst + dst_width; - do { - dst[0] = src_ptr[0]; - dst[1] = src_ptr[1]; - dst[2] = src_ptr[3]; - dst += 3; - src_ptr += 4; - } while (dst < dend); -} - -// Filter rows 0 and 1 together, 3 : 1 -static void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* d, int dst_width) { - assert((dst_width % 3 == 0) && (dst_width > 0)); - const uint8* s = src_ptr; - const uint8* t = src_ptr + src_stride; - uint8* dend = d + dst_width; - do { - uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; - uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; - uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; - uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; - uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; - uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; - d[0] = (a0 * 3 + b0 + 2) >> 2; - d[1] = (a1 * 3 + b1 + 2) >> 2; - d[2] = (a2 * 3 + b2 + 2) >> 2; - d += 3; - s += 4; - t += 4; - } while (d < dend); -} - -// Filter rows 1 and 2 together, 1 : 1 -static void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* d, int dst_width) { - assert((dst_width % 3 == 0) && (dst_width > 0)); - const uint8* s = src_ptr; - const uint8* t = src_ptr + src_stride; - uint8* dend = d + dst_width; - do { - uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; - uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; - uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; - uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; - uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; - uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; - d[0] = (a0 + b0 + 1) >> 1; - d[1] = (a1 + b1 + 1) >> 1; - d[2] = (a2 + b2 + 1) >> 1; - d += 3; - s += 4; - t += 4; - } while (d < dend); -} - -// Scales a single row of pixels using point sampling. 
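Throughout these column scalers x and dx are 16.16 fixed-point source positions: x >> 16 is the integer pixel index and the low 16 bits are the sub-pixel fraction. A worked example of the stepping (the dx derivation here is illustrative only; the real callers in scale.cc compute it from the width ratio):

  // Downscaling a 640-byte row to 480 bytes: ~1.333 source pixels per output,
  // i.e. 0x15555 in 16.16 fixed point.
  int dx = (640 << 16) / 480;          // 0x15555
  for (int j = 0, x = 0; j < 480; ++j, x += dx) {
    int src_index = x >> 16;           // integer source column for dest column j
    int frac = x & 0xffff;             // sub-pixel fraction, used by ScaleFilterCols_C
    (void)src_index; (void)frac;       // sketch only
  }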
-void ScaleCols_C(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx) { - for (int j = 0; j < dst_width - 1; j += 2) { - dst_ptr[0] = src_ptr[x >> 16]; - x += dx; - dst_ptr[1] = src_ptr[x >> 16]; - x += dx; - dst_ptr += 2; - } - if (dst_width & 1) { - dst_ptr[0] = src_ptr[x >> 16]; - } -} - -// Scales a single row of pixels up by 2x using point sampling. -void ScaleColsUp2_C(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int, int) { - for (int j = 0; j < dst_width - 1; j += 2) { - dst_ptr[1] = dst_ptr[0] = src_ptr[0]; - src_ptr += 1; - dst_ptr += 2; - } - if (dst_width & 1) { - dst_ptr[0] = src_ptr[0]; - } -} - -// (1-f)a + fb can be replaced with a + f(b-a) -#define BLENDER(a, b, f) (static_cast(a) + \ - ((f) * (static_cast(b) - static_cast(a)) >> 16)) - -static void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx) { - for (int j = 0; j < dst_width - 1; j += 2) { - int xi = x >> 16; - int a = src_ptr[xi]; - int b = src_ptr[xi + 1]; - dst_ptr[0] = BLENDER(a, b, x & 0xffff); - x += dx; - xi = x >> 16; - a = src_ptr[xi]; - b = src_ptr[xi + 1]; - dst_ptr[1] = BLENDER(a, b, x & 0xffff); - x += dx; - dst_ptr += 2; - } - if (dst_width & 1) { - int xi = x >> 16; - int a = src_ptr[xi]; - int b = src_ptr[xi + 1]; - dst_ptr[0] = BLENDER(a, b, x & 0xffff); - } -} - -static void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t /* src_stride */, - uint8* dst, int dst_width) { - assert(dst_width % 3 == 0); - for (int x = 0; x < dst_width; x += 3) { - dst[0] = src_ptr[0]; - dst[1] = src_ptr[3]; - dst[2] = src_ptr[6]; - dst += 3; - src_ptr += 8; - } -} - -// 8x3 -> 3x1 -static void ScaleRowDown38_3_Box_C(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - assert((dst_width % 3 == 0) && (dst_width > 0)); - intptr_t stride = src_stride; - for (int i = 0; i < dst_width; i += 3) { - dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + - src_ptr[stride + 0] + src_ptr[stride + 1] + - src_ptr[stride + 2] + src_ptr[stride * 2 + 0] + - src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) * - (65536 / 9) >> 16; - dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + - src_ptr[stride + 3] + src_ptr[stride + 4] + - src_ptr[stride + 5] + src_ptr[stride * 2 + 3] + - src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) * - (65536 / 9) >> 16; - dst_ptr[2] = (src_ptr[6] + src_ptr[7] + - src_ptr[stride + 6] + src_ptr[stride + 7] + - src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) * - (65536 / 6) >> 16; - src_ptr += 8; - dst_ptr += 3; - } -} - -// 8x2 -> 3x1 -static void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - assert((dst_width % 3 == 0) && (dst_width > 0)); - intptr_t stride = src_stride; - for (int i = 0; i < dst_width; i += 3) { - dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + - src_ptr[stride + 0] + src_ptr[stride + 1] + - src_ptr[stride + 2]) * (65536 / 6) >> 16; - dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + - src_ptr[stride + 3] + src_ptr[stride + 4] + - src_ptr[stride + 5]) * (65536 / 6) >> 16; - dst_ptr[2] = (src_ptr[6] + src_ptr[7] + - src_ptr[stride + 6] + src_ptr[stride + 7]) * - (65536 / 4) >> 16; - src_ptr += 8; - dst_ptr += 3; - } -} - -void ScaleAddRows_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint16* dst_ptr, int src_width, int src_height) { - assert(src_width > 0); - assert(src_height > 0); - for (int x = 0; x < src_width; ++x) { - const uint8* s = src_ptr + x; - unsigned int sum = 0u; - for (int y = 0; y < src_height; ++y) { - 
sum += s[0]; - s += src_stride; - } - // TODO(fbarchard): Consider limitting height to 256 to avoid overflow. - dst_ptr[x] = sum < 65535u ? sum : 65535u; - } -} // Scale plane, 1/2 // This is an optimized version for scaling down a plane to 1/2 of diff --git a/source/scale_argb.cc b/source/scale_argb.cc index 6d102c7b4..9a718b386 100644 --- a/source/scale_argb.cc +++ b/source/scale_argb.cc @@ -27,994 +27,6 @@ static __inline int Abs(int v) { return v >= 0 ? v : -v; } -// ARGB scaling uses bilinear or point, but not box filter. -#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \ - (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) -#define HAS_SCALEARGBROWDOWNEVEN_NEON -#define HAS_SCALEARGBROWDOWN2_NEON -void ScaleARGBRowDownEven_NEON(const uint8* src_argb, int src_stride, - int src_stepx, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, int src_stride, - int src_stepx, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */, - uint8* dst, int dst_width); -void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -#endif - -#if !defined(LIBYUV_DISABLE_X86) && \ - defined(_M_IX86) && defined(_MSC_VER) -#define HAS_SCALEARGBROWDOWN2_SSE2 -// Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6) -// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. -__declspec(naked) __declspec(align(16)) -static void ScaleARGBRowDown2_SSE2(const uint8* src_argb, - ptrdiff_t /* src_stride */, - uint8* dst_argb, int dst_width) { - __asm { - mov eax, [esp + 4] // src_argb - // src_stride ignored - mov edx, [esp + 12] // dst_argb - mov ecx, [esp + 16] // dst_width - - align 16 - wloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - lea eax, [eax + 32] - shufps xmm0, xmm1, 0xdd - sub ecx, 4 - movdqa [edx], xmm0 - lea edx, [edx + 16] - jg wloop - - ret - } -} - -// Blends 8x1 rectangle to 4x1. -// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. -__declspec(naked) __declspec(align(16)) -static void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, - ptrdiff_t /* src_stride */, - uint8* dst_argb, int dst_width) { - __asm { - mov eax, [esp + 4] // src_argb - // src_stride ignored - mov edx, [esp + 12] // dst_argb - mov ecx, [esp + 16] // dst_width - - align 16 - wloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - lea eax, [eax + 32] - movdqa xmm2, xmm0 - shufps xmm0, xmm1, 0x88 // even pixels - shufps xmm2, xmm1, 0xdd // odd pixels - pavgb xmm0, xmm2 - sub ecx, 4 - movdqa [edx], xmm0 - lea edx, [edx + 16] - jg wloop - - ret - } -} - -// Blends 8x2 rectangle to 4x1. -// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. 
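Each "box" variant below averages a 2x2 block of ARGB bytes per channel, adding 2 for rounding before the shift; the portable equivalent, ScaleARGBRowDown2Box_C (moved into scale_common.cc by this patch), reduces to:

  // One output ARGB pixel from a 2x2 input block: per-channel rounded average.
  for (int c = 0; c < 4; ++c) {  // B, G, R, A
    dst_argb[c] = (src_argb[c] + src_argb[c + 4] +
                   src_argb[src_stride + c] + src_argb[src_stride + c + 4] +
                   2) >> 2;
  }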
-__declspec(naked) __declspec(align(16)) -static void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, - ptrdiff_t src_stride, - uint8* dst_argb, int dst_width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_argb - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_argb - mov ecx, [esp + 4 + 16] // dst_width - - align 16 - wloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + esi] - movdqa xmm3, [eax + esi + 16] - lea eax, [eax + 32] - pavgb xmm0, xmm2 // average rows - pavgb xmm1, xmm3 - movdqa xmm2, xmm0 // average columns (8 to 4 pixels) - shufps xmm0, xmm1, 0x88 // even pixels - shufps xmm2, xmm1, 0xdd // odd pixels - pavgb xmm0, xmm2 - sub ecx, 4 - movdqa [edx], xmm0 - lea edx, [edx + 16] - jg wloop - - pop esi - ret - } -} - -#define HAS_SCALEARGBROWDOWNEVEN_SSE2 -// Reads 4 pixels at a time. -// Alignment requirement: dst_argb 16 byte aligned. -__declspec(naked) __declspec(align(16)) -void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, - int src_stepx, - uint8* dst_argb, int dst_width) { - __asm { - push ebx - push edi - mov eax, [esp + 8 + 4] // src_argb - // src_stride ignored - mov ebx, [esp + 8 + 12] // src_stepx - mov edx, [esp + 8 + 16] // dst_argb - mov ecx, [esp + 8 + 20] // dst_width - lea ebx, [ebx * 4] - lea edi, [ebx + ebx * 2] - - align 16 - wloop: - movd xmm0, [eax] - movd xmm1, [eax + ebx] - punpckldq xmm0, xmm1 - movd xmm2, [eax + ebx * 2] - movd xmm3, [eax + edi] - lea eax, [eax + ebx * 4] - punpckldq xmm2, xmm3 - punpcklqdq xmm0, xmm2 - sub ecx, 4 - movdqa [edx], xmm0 - lea edx, [edx + 16] - jg wloop - - pop edi - pop ebx - ret - } -} - -// Blends four 2x2 to 4x1. -// Alignment requirement: dst_argb 16 byte aligned. -__declspec(naked) __declspec(align(16)) -static void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8* dst_argb, int dst_width) { - __asm { - push ebx - push esi - push edi - mov eax, [esp + 12 + 4] // src_argb - mov esi, [esp + 12 + 8] // src_stride - mov ebx, [esp + 12 + 12] // src_stepx - mov edx, [esp + 12 + 16] // dst_argb - mov ecx, [esp + 12 + 20] // dst_width - lea esi, [eax + esi] // row1 pointer - lea ebx, [ebx * 4] - lea edi, [ebx + ebx * 2] - - align 16 - wloop: - movq xmm0, qword ptr [eax] // row0 4 pairs - movhps xmm0, qword ptr [eax + ebx] - movq xmm1, qword ptr [eax + ebx * 2] - movhps xmm1, qword ptr [eax + edi] - lea eax, [eax + ebx * 4] - movq xmm2, qword ptr [esi] // row1 4 pairs - movhps xmm2, qword ptr [esi + ebx] - movq xmm3, qword ptr [esi + ebx * 2] - movhps xmm3, qword ptr [esi + edi] - lea esi, [esi + ebx * 4] - pavgb xmm0, xmm2 // average rows - pavgb xmm1, xmm3 - movdqa xmm2, xmm0 // average columns (8 to 4 pixels) - shufps xmm0, xmm1, 0x88 // even pixels - shufps xmm2, xmm1, 0xdd // odd pixels - pavgb xmm0, xmm2 - sub ecx, 4 - movdqa [edx], xmm0 - lea edx, [edx + 16] - jg wloop - - pop edi - pop esi - pop ebx - ret - } -} - -// Column scaling unfiltered. SSE2 version. 
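The prolog of the routine below packs four consecutive 16.16 source positions into xmm2 and the combined step into xmm3, so each loop iteration fetches four source pixels by integer index. In scalar form the setup amounts to this sketch:

  // What the movd/pshufd/paddd prolog builds, written out in scalar form.
  int xs[4] = { x, x + dx, x + dx * 2, x + dx * 3 };  // one 16.16 position per lane
  int dx4 = dx * 4;                                   // advance applied per 4 output pixels
  (void)xs; (void)dx4;                                // sketch only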
-#define HAS_SCALEARGBCOLS_SSE2 -__declspec(naked) __declspec(align(16)) -void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) { - __asm { - push edi - push esi - mov edi, [esp + 8 + 4] // dst_argb - mov esi, [esp + 8 + 8] // src_argb - mov ecx, [esp + 8 + 12] // dst_width - movd xmm2, [esp + 8 + 16] // x - movd xmm3, [esp + 8 + 20] // dx - - pshufd xmm2, xmm2, 0 // x0 x0 x0 x0 - pshufd xmm0, xmm3, 0x11 // dx 0 dx 0 - paddd xmm2, xmm0 - paddd xmm3, xmm3 // 0, 0, 0, dx * 2 - pshufd xmm0, xmm3, 0x05 // dx * 2, dx * 2, 0, 0 - paddd xmm2, xmm0 // x3 x2 x1 x0 - paddd xmm3, xmm3 // 0, 0, 0, dx * 4 - pshufd xmm3, xmm3, 0 // dx * 4, dx * 4, dx * 4, dx * 4 - - pextrw eax, xmm2, 1 // get x0 integer. - pextrw edx, xmm2, 3 // get x1 integer. - - cmp ecx, 0 - jle xloop99 - sub ecx, 4 - jl xloop49 - - // 4 Pixel loop. - align 4 - xloop4: - movd xmm0, [esi + eax * 4] // 1 source x0 pixels - movd xmm1, [esi + edx * 4] // 1 source x1 pixels - pextrw eax, xmm2, 5 // get x2 integer. - pextrw edx, xmm2, 7 // get x3 integer. - paddd xmm2, xmm3 // x += dx - punpckldq xmm0, xmm1 // x0 x1 - - movd xmm1, [esi + eax * 4] // 1 source x2 pixels - movd xmm4, [esi + edx * 4] // 1 source x3 pixels - pextrw eax, xmm2, 1 // get x0 integer. next iteration. - pextrw edx, xmm2, 3 // get x1 integer. next iteration. - punpckldq xmm1, xmm4 // x2 x3 - punpcklqdq xmm0, xmm1 // x0 x1 x2 x3 - sub ecx, 4 // 4 pixels - movdqu [edi], xmm0 - lea edi, [edi + 16] - jge xloop4 - - align 4 - xloop49: - test ecx, 2 - je xloop29 - - // 2 Pixels. - movd xmm0, [esi + eax * 4] // 1 source x0 pixels - movd xmm1, [esi + edx * 4] // 1 source x1 pixels - pextrw eax, xmm2, 5 // get x2 integer. - punpckldq xmm0, xmm1 // x0 x1 - - movq qword ptr [edi], xmm0 - lea edi, [edi + 8] - - xloop29: - test ecx, 1 - je xloop99 - - // 1 Pixels. - movd xmm0, [esi + eax * 4] // 1 source x2 pixels - movd dword ptr [edi], xmm0 - align 4 - xloop99: - - pop esi - pop edi - ret - } -} - -// Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version. -// TODO(fbarchard): Port to Neon - -// Shuffle table for arranging 2 pixels into pairs for pmaddubsw -static uvec8 kShuffleColARGB = { - 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel - 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel -}; - -// Shuffle table for duplicating 2 fractions into 8 bytes each -static uvec8 kShuffleFractions = { - 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, -}; - -#define HAS_SCALEARGBFILTERCOLS_SSSE3 -__declspec(naked) __declspec(align(16)) -static void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) { - __asm { - push esi - push edi - mov edi, [esp + 8 + 4] // dst_argb - mov esi, [esp + 8 + 8] // src_argb - mov ecx, [esp + 8 + 12] // dst_width - movd xmm2, [esp + 8 + 16] // x - movd xmm3, [esp + 8 + 20] // dx - movdqa xmm4, kShuffleColARGB - movdqa xmm5, kShuffleFractions - pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction. - psrlw xmm6, 9 - pextrw eax, xmm2, 1 // get x0 integer. preroll - sub ecx, 2 - jl xloop29 - - movdqa xmm0, xmm2 // x1 = x0 + dx - paddd xmm0, xmm3 - punpckldq xmm2, xmm0 // x0 x1 - punpckldq xmm3, xmm3 // dx dx - paddd xmm3, xmm3 // dx * 2, dx * 2 - pextrw edx, xmm2, 3 // get x1 integer. preroll - - // 2 Pixel loop. - align 4 - xloop2: - movdqa xmm1, xmm2 // x0, x1 fractions. - paddd xmm2, xmm3 // x += dx - movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels - psrlw xmm1, 9 // 7 bit fractions. 
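    // The pmaddubsw blend a few instructions below computes, per channel,
    // (a * (0x7f ^ f) + b * f) >> 7 with the 7-bit fraction f: the same
    // formula as the BLENDER1 macro used by the C fallback in this patch.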
- movhps xmm0, qword ptr [esi + edx * 4] // 2 source x1 pixels - pshufb xmm1, xmm5 // 0000000011111111 - pshufb xmm0, xmm4 // arrange pixels into pairs - pxor xmm1, xmm6 // 0..7f and 7f..0 - pmaddubsw xmm0, xmm1 // argb_argb 16 bit, 2 pixels. - pextrw eax, xmm2, 1 // get x0 integer. next iteration. - pextrw edx, xmm2, 3 // get x1 integer. next iteration. - psrlw xmm0, 7 // argb 8.7 fixed point to low 8 bits. - packuswb xmm0, xmm0 // argb_argb 8 bits, 2 pixels. - movq qword ptr [edi], xmm0 - lea edi, [edi + 8] - sub ecx, 2 // 2 pixels - jge xloop2 - - align 4 - xloop29: - - add ecx, 2 - 1 - jl xloop99 - - // 1 pixel remainder - psrlw xmm2, 9 // 7 bit fractions. - movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels - pshufb xmm2, xmm5 // 00000000 - pshufb xmm0, xmm4 // arrange pixels into pairs - pxor xmm2, xmm6 // 0..7f and 7f..0 - pmaddubsw xmm0, xmm2 // argb 16 bit, 1 pixel. - psrlw xmm0, 7 - packuswb xmm0, xmm0 // argb 8 bits, 1 pixel. - movd [edi], xmm0 - - align 16 - xloop99: - - pop edi - pop esi - ret - } -} - -// Reads 4 pixels, duplicates them and writes 8 pixels. -// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. -#define HAS_SCALEARGBCOLSUP2_SSE2 -__declspec(naked) __declspec(align(16)) -void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb, - int dst_width, int /* x */, int /* dx */) { - __asm { - mov edx, [esp + 4] // dst_argb - mov eax, [esp + 8] // src_argb - mov ecx, [esp + 12] // dst_width - - align 16 - wloop: - movdqa xmm0, [eax] - lea eax, [eax + 16] - movdqa xmm1, xmm0 - punpckldq xmm0, xmm0 - punpckhdq xmm1, xmm1 - sub ecx, 8 - movdqa [edx], xmm0 - movdqa [edx + 16], xmm1 - lea edx, [edx + 32] - jg wloop - - ret - } -} - -#elif !defined(LIBYUV_DISABLE_X86) && \ - (defined(__x86_64__) || defined(__i386__)) - -// TODO(nfullagar): For Native Client: When new toolchain becomes available, -// take advantage of bundle lock / unlock feature. This will reduce the amount -// of manual bundle alignment done below, and bundle alignment could even be -// moved into each macro that doesn't use %%nacl: such as MEMOPREG. 
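As a reading aid for the macros that follow: outside Native Client they expand to plain base+index addressing, while the x86-64 NaCl variants route every memory access through r15 and clobber r14 (hence the extra ", r14" clobber in these row functions). Expansions written out by hand from the definitions below:

  // Non-NaCl:
  //   MEMOPREG(movdqa,0x00,0,3,1,xmm2) -> "movdqa 0x00(%0,%3,1),%%xmm2\n"
  // x86-64 Native Client:
  //   MEMOPREG(movdqa,0x00,0,3,1,xmm2) -> "lea 0x00(%q0,%q3,1),%%r14d\n"
  //                                       "movdqa (%%r15,%%r14),%%xmm2\n"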
- -#if defined(__native_client__) && defined(__x86_64__) -#define MEMACCESS(base) "%%nacl:(%%r15,%q" #base ")" -#define MEMACCESS2(offset, base) "%%nacl:" #offset "(%%r15,%q" #base ")" -#define MEMLEA(offset, base) #offset "(%q" #base ")" -#define MEMLEA3(offset, index, scale) \ - #offset "(,%q" #index "," #scale ")" -#define MEMLEA4(offset, base, index, scale) \ - #offset "(%q" #base ",%q" #index "," #scale ")" -#define MEMOPREG(opcode, offset, base, index, scale, reg) \ - "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \ - #opcode " (%%r15,%%r14),%%" #reg "\n" -#define MEMOPMEM(opcode, reg, offset, base, index, scale) \ - "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \ - #opcode " %%" #reg ",(%%r15,%%r14)\n" -#define BUNDLEALIGN ".p2align 5 \n" -#else -#define MEMACCESS(base) "(%" #base ")" -#define MEMACCESS2(offset, base) #offset "(%" #base ")" -#define MEMLEA(offset, base) #offset "(%" #base ")" -#define MEMLEA3(offset, index, scale) \ - #offset "(,%" #index "," #scale ")" -#define MEMLEA4(offset, base, index, scale) \ - #offset "(%" #base ",%" #index "," #scale ")" -#define MEMOPREG(opcode, offset, base, index, scale, reg) \ - #opcode " " #offset "(%" #base ",%" #index "," #scale "),%%" #reg "\n" -#define MEMOPMEM(opcode, reg, offset, base, index, scale) \ - #opcode " %%" #reg ","#offset "(%" #base ",%" #index "," #scale ")\n" -#define BUNDLEALIGN -#endif - -// GCC versions of row functions are verbatim conversions from Visual C, -// with some additional macro injection for Native Client (see row_posix.cc -// for more details.) -// Generated using gcc disassembly on Visual C object file: -// objdump -D yuvscaler.obj >yuvscaler.txt -#define HAS_SCALEARGBROWDOWN2_SSE2 -static void ScaleARGBRowDown2_SSE2(const uint8* src_argb, - ptrdiff_t /* src_stride */, - uint8* dst_argb, int dst_width) { - asm volatile ( - ".p2align 4 \n" - BUNDLEALIGN - "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "shufps $0xdd,%%xmm1,%%xmm0 \n" - "sub $0x4,%2 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1" -#endif - ); -} - -static void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, - ptrdiff_t /* src_stride */, - uint8* dst_argb, int dst_width) { - asm volatile ( - ".p2align 4 \n" - BUNDLEALIGN - "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm2 \n" - "pavgb %%xmm2,%%xmm0 \n" - "sub $0x4,%2 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1" -#endif - ); -} - -static void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, - ptrdiff_t src_stride, - uint8* dst_argb, int dst_width) { - asm volatile ( - ".p2align 4 \n" - BUNDLEALIGN - "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - BUNDLEALIGN - MEMOPREG(movdqa,0x00,0,3,1,xmm2) // movdqa (%0,%3,1),%%xmm2 - MEMOPREG(movdqa,0x10,0,3,1,xmm3) // movdqa 0x10(%0,%3,1),%%xmm3 - "lea " MEMLEA(0x20,0) ",%0 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "shufps 
$0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm2 \n" - "pavgb %%xmm2,%%xmm0 \n" - "sub $0x4,%2 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - : "r"(static_cast(src_stride)) // %3 - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3" -#endif - ); -} - -#define HAS_SCALEARGBROWDOWNEVEN_SSE2 -// Reads 4 pixels at a time. -// Alignment requirement: dst_argb 16 byte aligned. -void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, - int src_stepx, - uint8* dst_argb, int dst_width) { - intptr_t src_stepx_x4 = static_cast(src_stepx); - intptr_t src_stepx_x12 = 0; - asm volatile ( - "lea " MEMLEA3(0x00,1,4) ",%1 \n" - "lea " MEMLEA4(0x00,1,1,2) ",%4 \n" - ".p2align 4 \n" - BUNDLEALIGN - "1: \n" - "movd " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movd,0x00,0,1,1,xmm1) // movd (%0,%1,1),%%xmm1 - "punpckldq %%xmm1,%%xmm0 \n" - BUNDLEALIGN - MEMOPREG(movd,0x00,0,1,2,xmm2) // movd (%0,%1,2),%%xmm2 - MEMOPREG(movd,0x00,0,4,1,xmm3) // movd (%0,%4,1),%%xmm3 - "lea " MEMLEA4(0x00,0,1,4) ",%0 \n" - "punpckldq %%xmm3,%%xmm2 \n" - "punpcklqdq %%xmm2,%%xmm0 \n" - "sub $0x4,%3 \n" - "movdqa %%xmm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x10,2) ",%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_stepx_x4), // %1 - "+r"(dst_argb), // %2 - "+r"(dst_width), // %3 - "+r"(src_stepx_x12) // %4 - : - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3" -#endif - ); -} - -// Blends four 2x2 to 4x1. -// Alignment requirement: dst_argb 16 byte aligned. -static void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, - ptrdiff_t src_stride, int src_stepx, - uint8* dst_argb, int dst_width) { - intptr_t src_stepx_x4 = static_cast(src_stepx); - intptr_t src_stepx_x12 = 0; - intptr_t row1 = static_cast(src_stride); - asm volatile ( - "lea " MEMLEA3(0x00,1,4) ",%1 \n" - "lea " MEMLEA4(0x00,1,1,2) ",%4 \n" - "lea " MEMLEA4(0x00,0,5,1) ",%5 \n" - ".p2align 4 \n" - BUNDLEALIGN - "1: \n" - "movq " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movhps,0x00,0,1,1,xmm0) // movhps (%0,%1,1),%%xmm0 - MEMOPREG(movq,0x00,0,1,2,xmm1) // movq (%0,%1,2),%%xmm1 - BUNDLEALIGN - MEMOPREG(movhps,0x00,0,4,1,xmm1) // movhps (%0,%4,1),%%xmm1 - "lea " MEMLEA4(0x00,0,1,4) ",%0 \n" - "movq " MEMACCESS(5) ",%%xmm2 \n" - BUNDLEALIGN - MEMOPREG(movhps,0x00,5,1,1,xmm2) // movhps (%5,%1,1),%%xmm2 - MEMOPREG(movq,0x00,5,1,2,xmm3) // movq (%5,%1,2),%%xmm3 - MEMOPREG(movhps,0x00,5,4,1,xmm3) // movhps (%5,%4,1),%%xmm3 - "lea " MEMLEA4(0x00,5,1,4) ",%5 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm2 \n" - "pavgb %%xmm2,%%xmm0 \n" - "sub $0x4,%3 \n" - "movdqa %%xmm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x10,2) ",%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_stepx_x4), // %1 - "+r"(dst_argb), // %2 - "+rm"(dst_width), // %3 - "+r"(src_stepx_x12), // %4 - "+r"(row1) // %5 - : - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3" -#endif - ); -} - -#define HAS_SCALEARGBCOLS_SSE2 -void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) { - intptr_t x0 = 0, x1 = 0; - asm volatile ( - "movd %5,%%xmm2 \n" - 
"movd %6,%%xmm3 \n" - "pshufd $0x0,%%xmm2,%%xmm2 \n" - "pshufd $0x11,%%xmm3,%%xmm0 \n" - "paddd %%xmm0,%%xmm2 \n" - "paddd %%xmm3,%%xmm3 \n" - "pshufd $0x5,%%xmm3,%%xmm0 \n" - "paddd %%xmm0,%%xmm2 \n" - "paddd %%xmm3,%%xmm3 \n" - "pshufd $0x0,%%xmm3,%%xmm3 \n" - "pextrw $0x1,%%xmm2,%k0 \n" - "pextrw $0x3,%%xmm2,%k1 \n" - "cmp $0x0,%4 \n" - "jl 99f \n" - "sub $0x4,%4 \n" - "jl 49f \n" - ".p2align 2 \n" - BUNDLEALIGN - "40: \n" - MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0 - MEMOPREG(movd,0x00,3,1,4,xmm1) // movd (%3,%1,4),%%xmm1 - "pextrw $0x5,%%xmm2,%k0 \n" - "pextrw $0x7,%%xmm2,%k1 \n" - "paddd %%xmm3,%%xmm2 \n" - "punpckldq %%xmm1,%%xmm0 \n" - MEMOPREG(movd,0x00,3,0,4,xmm1) // movd (%3,%0,4),%%xmm1 - MEMOPREG(movd,0x00,3,1,4,xmm4) // movd (%3,%1,4),%%xmm4 - "pextrw $0x1,%%xmm2,%k0 \n" - "pextrw $0x3,%%xmm2,%k1 \n" - "punpckldq %%xmm4,%%xmm1 \n" - "punpcklqdq %%xmm1,%%xmm0 \n" - "sub $0x4,%4 \n" - "movdqu %%xmm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x10,2) ",%2 \n" - "jge 40b \n" - - "49: \n" - "test $0x2,%4 \n" - "je 29f \n" - BUNDLEALIGN - MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0 - MEMOPREG(movd,0x00,3,1,4,xmm1) // movd (%3,%1,4),%%xmm1 - "pextrw $0x5,%%xmm2,%k0 \n" - "punpckldq %%xmm1,%%xmm0 \n" - "movq %%xmm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x8,2) ",%2 \n" - "29: \n" - "test $0x1,%4 \n" - "je 99f \n" - MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0 - "movd %%xmm0," MEMACCESS(2) " \n" - "99: \n" - : "+a"(x0), // %0 - "+d"(x1), // %1 - "+r"(dst_argb), // %2 - "+r"(src_argb), // %3 - "+r"(dst_width) // %4 - : "rm"(x), // %5 - "rm"(dx) // %6 - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" -#endif - ); -} - -// Reads 4 pixels, duplicates them and writes 8 pixels. -// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. -#define HAS_SCALEARGBCOLSUP2_SSE2 -void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb, - int dst_width, int /* x */, int /* dx */) { - asm volatile ( - ".p2align 4 \n" - BUNDLEALIGN - "1: \n" - "movdqa " MEMACCESS(1) ",%%xmm0 \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpckldq %%xmm0,%%xmm0 \n" - "punpckhdq %%xmm1,%%xmm1 \n" - "sub $0x8,%2 \n" - "movdqa %%xmm0," MEMACCESS(0) " \n" - "movdqa %%xmm1," MEMACCESS2(0x10,0) " \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "jg 1b \n" - - : "+r"(dst_argb), // %0 - "+r"(src_argb), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1" -#endif - ); -} - -// Shuffle table for arranging 2 pixels into pairs for pmaddubsw -static uvec8 kShuffleColARGB = { - 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel - 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel -}; - -// Shuffle table for duplicating 2 fractions into 8 bytes each -static uvec8 kShuffleFractions = { - 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, -}; - -// Bilinear row filtering combines 4x2 -> 4x1. 
SSSE3 version -#define HAS_SCALEARGBFILTERCOLS_SSSE3 -static void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) { - intptr_t x0 = 0, x1 = 0; - asm volatile ( - "movdqa %0,%%xmm4 \n" - "movdqa %1,%%xmm5 \n" - : - : "m"(kShuffleColARGB), // %0 - "m"(kShuffleFractions) // %1 - ); - - asm volatile ( - "movd %5,%%xmm2 \n" - "movd %6,%%xmm3 \n" - "pcmpeqb %%xmm6,%%xmm6 \n" - "psrlw $0x9,%%xmm6 \n" - "pextrw $0x1,%%xmm2,%k3 \n" - "sub $0x2,%2 \n" - "jl 29f \n" - "movdqa %%xmm2,%%xmm0 \n" - "paddd %%xmm3,%%xmm0 \n" - "punpckldq %%xmm0,%%xmm2 \n" - "punpckldq %%xmm3,%%xmm3 \n" - "paddd %%xmm3,%%xmm3 \n" - "pextrw $0x3,%%xmm2,%k4 \n" - - ".p2align 2 \n" - BUNDLEALIGN - "2: \n" - "movdqa %%xmm2,%%xmm1 \n" - "paddd %%xmm3,%%xmm2 \n" - MEMOPREG(movq,0x00,1,3,4,xmm0) // movq (%1,%3,4),%%xmm0 - "psrlw $0x9,%%xmm1 \n" - BUNDLEALIGN - MEMOPREG(movhps,0x00,1,4,4,xmm0) // movhps (%1,%4,4),%%xmm0 - "pshufb %%xmm5,%%xmm1 \n" - "pshufb %%xmm4,%%xmm0 \n" - "pxor %%xmm6,%%xmm1 \n" - "pmaddubsw %%xmm1,%%xmm0 \n" - "psrlw $0x7,%%xmm0 \n" - "pextrw $0x1,%%xmm2,%k3 \n" - "pextrw $0x3,%%xmm2,%k4 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0," MEMACCESS(0) " \n" - "lea " MEMLEA(0x8,0) ",%0 \n" - "sub $0x2,%2 \n" - "jge 2b \n" - - ".p2align 2 \n" - BUNDLEALIGN - "29: \n" - "add $0x1,%2 \n" - "jl 99f \n" - "psrlw $0x9,%%xmm2 \n" - BUNDLEALIGN - MEMOPREG(movq,0x00,1,3,4,xmm0) // movq (%1,%3,4),%%xmm0 - "pshufb %%xmm5,%%xmm2 \n" - "pshufb %%xmm4,%%xmm0 \n" - "pxor %%xmm6,%%xmm2 \n" - "pmaddubsw %%xmm2,%%xmm0 \n" - "psrlw $0x7,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movd %%xmm0," MEMACCESS(0) " \n" - - ".p2align 4 \n" - "99: \n" - : "+r"(dst_argb), // %0 - "+r"(src_argb), // %1 - "+rm"(dst_width), // %2 - "+r"(x0), // %3 - "+r"(x1) // %4 - : "rm"(x), // %5 - "rm"(dx) // %6 - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" -#endif - ); -} - -#endif // defined(__x86_64__) || defined(__i386__) - -static void ScaleARGBRowDown2_C(const uint8* src_argb, - ptrdiff_t /* src_stride */, - uint8* dst_argb, int dst_width) { - const uint32* src = reinterpret_cast(src_argb); - uint32* dst = reinterpret_cast(dst_argb); - - for (int x = 0; x < dst_width - 1; x += 2) { - dst[0] = src[1]; - dst[1] = src[3]; - src += 4; - dst += 2; - } - if (dst_width & 1) { - dst[0] = src[1]; - } -} - -static void ScaleARGBRowDown2Linear_C(const uint8* src_argb, - ptrdiff_t /* src_stride */, - uint8* dst_argb, int dst_width) { - for (int x = 0; x < dst_width; ++x) { - dst_argb[0] = (src_argb[0] + src_argb[4] + 1) >> 1; - dst_argb[1] = (src_argb[1] + src_argb[5] + 1) >> 1; - dst_argb[2] = (src_argb[2] + src_argb[6] + 1) >> 1; - dst_argb[3] = (src_argb[3] + src_argb[7] + 1) >> 1; - src_argb += 8; - dst_argb += 4; - } -} - -static void ScaleARGBRowDown2Box_C(const uint8* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width) { - for (int x = 0; x < dst_width; ++x) { - dst_argb[0] = (src_argb[0] + src_argb[4] + - src_argb[src_stride] + src_argb[src_stride + 4] + 2) >> 2; - dst_argb[1] = (src_argb[1] + src_argb[5] + - src_argb[src_stride + 1] + src_argb[src_stride + 5] + 2) >> 2; - dst_argb[2] = (src_argb[2] + src_argb[6] + - src_argb[src_stride + 2] + src_argb[src_stride + 6] + 2) >> 2; - dst_argb[3] = (src_argb[3] + src_argb[7] + - src_argb[src_stride + 3] + src_argb[src_stride + 7] + 2) >> 2; - src_argb += 8; - dst_argb += 4; - } -} - -void ScaleARGBRowDownEven_C(const 
uint8* src_argb, ptrdiff_t /* src_stride */, - int src_stepx, - uint8* dst_argb, int dst_width) { - const uint32* src = reinterpret_cast(src_argb); - uint32* dst = reinterpret_cast(dst_argb); - - for (int x = 0; x < dst_width - 1; x += 2) { - dst[0] = src[0]; - dst[1] = src[src_stepx]; - src += src_stepx * 2; - dst += 2; - } - if (dst_width & 1) { - dst[0] = src[0]; - } -} - -static void ScaleARGBRowDownEvenBox_C(const uint8* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8* dst_argb, int dst_width) { - for (int x = 0; x < dst_width; ++x) { - dst_argb[0] = (src_argb[0] + src_argb[4] + - src_argb[src_stride] + src_argb[src_stride + 4] + 2) >> 2; - dst_argb[1] = (src_argb[1] + src_argb[5] + - src_argb[src_stride + 1] + src_argb[src_stride + 5] + 2) >> 2; - dst_argb[2] = (src_argb[2] + src_argb[6] + - src_argb[src_stride + 2] + src_argb[src_stride + 6] + 2) >> 2; - dst_argb[3] = (src_argb[3] + src_argb[7] + - src_argb[src_stride + 3] + src_argb[src_stride + 7] + 2) >> 2; - src_argb += src_stepx * 4; - dst_argb += 4; - } -} - -// Scales a single row of pixels using point sampling. -void ScaleARGBCols_C(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) { - const uint32* src = reinterpret_cast(src_argb); - uint32* dst = reinterpret_cast(dst_argb); - for (int j = 0; j < dst_width - 1; j += 2) { - dst[0] = src[x >> 16]; - x += dx; - dst[1] = src[x >> 16]; - x += dx; - dst += 2; - } - if (dst_width & 1) { - dst[0] = src[x >> 16]; - } -} - -// Scales a single row of pixels up by 2x using point sampling. -void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb, - int dst_width, int, int) { - const uint32* src = reinterpret_cast(src_argb); - uint32* dst = reinterpret_cast(dst_argb); - for (int j = 0; j < dst_width - 1; j += 2) { - dst[1] = dst[0] = src[0]; - src += 1; - dst += 2; - } - if (dst_width & 1) { - dst[0] = src[0]; - } -} - -// Mimics SSSE3 blender -#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b) * f) >> 7 -#define BLENDERC(a, b, f, s) static_cast( \ - BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s) -#define BLENDER(a, b, f) \ - BLENDERC(a, b, f, 24) | BLENDERC(a, b, f, 16) | \ - BLENDERC(a, b, f, 8) | BLENDERC(a, b, f, 0) - -static void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) { - const uint32* src = reinterpret_cast(src_argb); - uint32* dst = reinterpret_cast(dst_argb); - for (int j = 0; j < dst_width - 1; j += 2) { - int xi = x >> 16; - int xf = (x >> 9) & 0x7f; - uint32 a = src[xi]; - uint32 b = src[xi + 1]; - dst[0] = BLENDER(a, b, xf); - x += dx; - xi = x >> 16; - xf = (x >> 9) & 0x7f; - a = src[xi]; - b = src[xi + 1]; - dst[1] = BLENDER(a, b, xf); - x += dx; - dst += 2; - } - if (dst_width & 1) { - int xi = x >> 16; - int xf = (x >> 9) & 0x7f; - uint32 a = src[xi]; - uint32 b = src[xi + 1]; - dst[0] = BLENDER(a, b, xf); - } -} - // ScaleARGB ARGB, 1/2 // This is an optimized version for scaling down a ARGB to 1/2 of // its original size. diff --git a/source/scale_argb_neon.cc b/source/scale_argb_neon.cc index 51b008724..43842b9d2 100644 --- a/source/scale_argb_neon.cc +++ b/source/scale_argb_neon.cc @@ -1,142 +1,2 @@ -/* - * Copyright 2012 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. 
All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ +// This file is deprecated. -#include "libyuv/basic_types.h" -#include "libyuv/row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// This module is for GCC Neon -#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) - -void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */, - uint8* dst, int dst_width) { - asm volatile ( - "1: \n" - // load even pixels into q0, odd into q1 - "vld2.32 {q0, q1}, [%0]! \n" - "vld2.32 {q2, q3}, [%0]! \n" - "subs %2, %2, #8 \n" // 8 processed per loop - "vst1.8 {q1}, [%1]! \n" // store odd pixels - "vst1.8 {q3}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List - ); -} - -void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { - asm volatile ( - // change the stride to row 2 pointer - "add %1, %1, %0 \n" - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. - "subs %3, %3, #8 \n" // 8 processed per loop. - "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. - "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts. - "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB pixels. - "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB pixels. - "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts. - "vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts. - "vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts. - "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack - "vrshrn.u16 d1, q1, #2 \n" - "vrshrn.u16 d2, q2, #2 \n" - "vrshrn.u16 d3, q3, #2 \n" - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_stride), // %1 - "+r"(dst), // %2 - "+r"(dst_width) // %3 - : - : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11" - ); -} - -// Reads 4 pixels at a time. -// Alignment requirement: src_argb 4 byte aligned. -void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t, int src_stepx, - uint8* dst_argb, int dst_width) { - asm volatile ( - "mov r12, %3, lsl #2 \n" - ".p2align 2 \n" - "1: \n" - "vld1.32 {d0[0]}, [%0], r12 \n" - "vld1.32 {d0[1]}, [%0], r12 \n" - "vld1.32 {d1[0]}, [%0], r12 \n" - "vld1.32 {d1[1]}, [%0], r12 \n" - "subs %2, %2, #4 \n" // 4 pixels per loop. - "vst1.8 {q0}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - : "r"(src_stepx) // %3 - : "memory", "cc", "r12", "q0" - ); -} - -// Reads 4 pixels at a time. -// Alignment requirement: src_argb 4 byte aligned. 
-void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride, - int src_stepx, - uint8* dst_argb, int dst_width) { - asm volatile ( - "mov r12, %4, lsl #2 \n" - "add %1, %1, %0 \n" - ".p2align 2 \n" - "1: \n" - "vld1.8 {d0}, [%0], r12 \n" // Read 4 2x2 blocks -> 2x1 - "vld1.8 {d1}, [%1], r12 \n" - "vld1.8 {d2}, [%0], r12 \n" - "vld1.8 {d3}, [%1], r12 \n" - "vld1.8 {d4}, [%0], r12 \n" - "vld1.8 {d5}, [%1], r12 \n" - "vld1.8 {d6}, [%0], r12 \n" - "vld1.8 {d7}, [%1], r12 \n" - "vaddl.u8 q0, d0, d1 \n" - "vaddl.u8 q1, d2, d3 \n" - "vaddl.u8 q2, d4, d5 \n" - "vaddl.u8 q3, d6, d7 \n" - "vswp.8 d1, d2 \n" // ab_cd -> ac_bd - "vswp.8 d5, d6 \n" // ef_gh -> eg_fh - "vadd.u16 q0, q0, q1 \n" // (a+b)_(c+d) - "vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h) - "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels. - "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels. - "subs %3, %3, #4 \n" // 4 pixels per loop. - "vst1.8 {q0}, [%2]! \n" - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_stride), // %1 - "+r"(dst_argb), // %2 - "+r"(dst_width) // %3 - : "r"(src_stepx) // %4 - : "memory", "cc", "r12", "q0", "q1", "q2", "q3" - ); -} -#endif // __ARM_NEON__ - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/source/scale_common.cc b/source/scale_common.cc index 2c13450ca..3c7d6a29e 100644 --- a/source/scale_common.cc +++ b/source/scale_common.cc @@ -23,6 +23,444 @@ namespace libyuv { extern "C" { #endif +// CPU agnostic row functions +void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t /* src_stride */, + uint8* dst, int dst_width) { + uint8* dend = dst + dst_width - 1; + do { + dst[0] = src_ptr[1]; + dst[1] = src_ptr[3]; + dst += 2; + src_ptr += 4; + } while (dst < dend); + if (dst_width & 1) { + dst[0] = src_ptr[1]; + } +} + +void ScaleRowDown2Linear_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + const uint8* s = src_ptr; + uint8* dend = dst + dst_width - 1; + do { + dst[0] = (s[0] + s[1] + 1) >> 1; + dst[1] = (s[2] + s[3] + 1) >> 1; + dst += 2; + s += 4; + } while (dst < dend); + if (dst_width & 1) { + dst[0] = (s[0] + s[1] + 1) >> 1; + } +} + +void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + const uint8* s = src_ptr; + const uint8* t = src_ptr + src_stride; + uint8* dend = dst + dst_width - 1; + do { + dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; + dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2; + dst += 2; + s += 4; + t += 4; + } while (dst < dend); + if (dst_width & 1) { + dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; + } +} + +void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t /* src_stride */, + uint8* dst, int dst_width) { + uint8* dend = dst + dst_width - 1; + do { + dst[0] = src_ptr[2]; + dst[1] = src_ptr[6]; + dst += 2; + src_ptr += 8; + } while (dst < dend); + if (dst_width & 1) { + dst[0] = src_ptr[2]; + } +} + +void ScaleRowDown4Box_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + intptr_t stride = src_stride; + uint8* dend = dst + dst_width - 1; + do { + dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + + src_ptr[stride + 0] + src_ptr[stride + 1] + + src_ptr[stride + 2] + src_ptr[stride + 3] + + src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] + + src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] + + src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] + + src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] + + 8) >> 4; + dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] + + src_ptr[stride + 4] + src_ptr[stride + 5] + + 
src_ptr[stride + 6] + src_ptr[stride + 7] + + src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5] + + src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7] + + src_ptr[stride * 3 + 4] + src_ptr[stride * 3 + 5] + + src_ptr[stride * 3 + 6] + src_ptr[stride * 3 + 7] + + 8) >> 4; + dst += 2; + src_ptr += 8; + } while (dst < dend); + if (dst_width & 1) { + dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + + src_ptr[stride + 0] + src_ptr[stride + 1] + + src_ptr[stride + 2] + src_ptr[stride + 3] + + src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] + + src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] + + src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] + + src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] + + 8) >> 4; + } +} + +void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t /* src_stride */, + uint8* dst, int dst_width) { + assert((dst_width % 3 == 0) && (dst_width > 0)); + uint8* dend = dst + dst_width; + do { + dst[0] = src_ptr[0]; + dst[1] = src_ptr[1]; + dst[2] = src_ptr[3]; + dst += 3; + src_ptr += 4; + } while (dst < dend); +} + +// Filter rows 0 and 1 together, 3 : 1 +void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* d, int dst_width) { + assert((dst_width % 3 == 0) && (dst_width > 0)); + const uint8* s = src_ptr; + const uint8* t = src_ptr + src_stride; + uint8* dend = d + dst_width; + do { + uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; + uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; + uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; + uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; + uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; + uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; + d[0] = (a0 * 3 + b0 + 2) >> 2; + d[1] = (a1 * 3 + b1 + 2) >> 2; + d[2] = (a2 * 3 + b2 + 2) >> 2; + d += 3; + s += 4; + t += 4; + } while (d < dend); +} + +// Filter rows 1 and 2 together, 1 : 1 +void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* d, int dst_width) { + assert((dst_width % 3 == 0) && (dst_width > 0)); + const uint8* s = src_ptr; + const uint8* t = src_ptr + src_stride; + uint8* dend = d + dst_width; + do { + uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; + uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; + uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; + uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; + uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; + uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; + d[0] = (a0 + b0 + 1) >> 1; + d[1] = (a1 + b1 + 1) >> 1; + d[2] = (a2 + b2 + 1) >> 1; + d += 3; + s += 4; + t += 4; + } while (d < dend); +} + +// Scales a single row of pixels using point sampling. +void ScaleCols_C(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx) { + for (int j = 0; j < dst_width - 1; j += 2) { + dst_ptr[0] = src_ptr[x >> 16]; + x += dx; + dst_ptr[1] = src_ptr[x >> 16]; + x += dx; + dst_ptr += 2; + } + if (dst_width & 1) { + dst_ptr[0] = src_ptr[x >> 16]; + } +} + +// Scales a single row of pixels up by 2x using point sampling. 
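For context, these C routines are the portable fallbacks behind the HAS_* macros this patch adds to scale_row.h; a caller picks a specialised version at run time roughly as follows. This is a hypothetical sketch, not the actual dispatch in scale.cc (which also checks pointer alignment and filtering mode); src_width, dst_width, dst_ptr, src_ptr, x and dx are assumed caller variables.

  // Hypothetical dispatch sketch. TestCpuFlag/kCpuHasSSE2 come from
  // libyuv/cpu_id.h; HAS_SCALECOLSUP2_SSE2 is defined in scale_row.h above.
  void (*ScaleColsFn)(uint8* dst, const uint8* src, int width, int x, int dx) =
      ScaleCols_C;
  #if defined(HAS_SCALECOLSUP2_SSE2)
  if (TestCpuFlag(kCpuHasSSE2) && src_width * 2 == dst_width) {
    ScaleColsFn = ScaleColsUp2_SSE2;  // exact 2x horizontal upscale
  }
  #endif
  ScaleColsFn(dst_ptr, src_ptr, dst_width, x, dx);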
+void ScaleColsUp2_C(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int, int) { + for (int j = 0; j < dst_width - 1; j += 2) { + dst_ptr[1] = dst_ptr[0] = src_ptr[0]; + src_ptr += 1; + dst_ptr += 2; + } + if (dst_width & 1) { + dst_ptr[0] = src_ptr[0]; + } +} + +// (1-f)a + fb can be replaced with a + f(b-a) +#define BLENDER(a, b, f) (static_cast(a) + \ + ((f) * (static_cast(b) - static_cast(a)) >> 16)) + +void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx) { + for (int j = 0; j < dst_width - 1; j += 2) { + int xi = x >> 16; + int a = src_ptr[xi]; + int b = src_ptr[xi + 1]; + dst_ptr[0] = BLENDER(a, b, x & 0xffff); + x += dx; + xi = x >> 16; + a = src_ptr[xi]; + b = src_ptr[xi + 1]; + dst_ptr[1] = BLENDER(a, b, x & 0xffff); + x += dx; + dst_ptr += 2; + } + if (dst_width & 1) { + int xi = x >> 16; + int a = src_ptr[xi]; + int b = src_ptr[xi + 1]; + dst_ptr[0] = BLENDER(a, b, x & 0xffff); + } +} +#undef BLENDER + +void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t /* src_stride */, + uint8* dst, int dst_width) { + assert(dst_width % 3 == 0); + for (int x = 0; x < dst_width; x += 3) { + dst[0] = src_ptr[0]; + dst[1] = src_ptr[3]; + dst[2] = src_ptr[6]; + dst += 3; + src_ptr += 8; + } +} + +// 8x3 -> 3x1 +void ScaleRowDown38_3_Box_C(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + assert((dst_width % 3 == 0) && (dst_width > 0)); + intptr_t stride = src_stride; + for (int i = 0; i < dst_width; i += 3) { + dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + + src_ptr[stride + 0] + src_ptr[stride + 1] + + src_ptr[stride + 2] + src_ptr[stride * 2 + 0] + + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) * + (65536 / 9) >> 16; + dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + + src_ptr[stride + 3] + src_ptr[stride + 4] + + src_ptr[stride + 5] + src_ptr[stride * 2 + 3] + + src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) * + (65536 / 9) >> 16; + dst_ptr[2] = (src_ptr[6] + src_ptr[7] + + src_ptr[stride + 6] + src_ptr[stride + 7] + + src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) * + (65536 / 6) >> 16; + src_ptr += 8; + dst_ptr += 3; + } +} + +// 8x2 -> 3x1 +void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + assert((dst_width % 3 == 0) && (dst_width > 0)); + intptr_t stride = src_stride; + for (int i = 0; i < dst_width; i += 3) { + dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + + src_ptr[stride + 0] + src_ptr[stride + 1] + + src_ptr[stride + 2]) * (65536 / 6) >> 16; + dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + + src_ptr[stride + 3] + src_ptr[stride + 4] + + src_ptr[stride + 5]) * (65536 / 6) >> 16; + dst_ptr[2] = (src_ptr[6] + src_ptr[7] + + src_ptr[stride + 6] + src_ptr[stride + 7]) * + (65536 / 4) >> 16; + src_ptr += 8; + dst_ptr += 3; + } +} + +void ScaleAddRows_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint16* dst_ptr, int src_width, int src_height) { + assert(src_width > 0); + assert(src_height > 0); + for (int x = 0; x < src_width; ++x) { + const uint8* s = src_ptr + x; + unsigned int sum = 0u; + for (int y = 0; y < src_height; ++y) { + sum += s[0]; + s += src_stride; + } + // TODO(fbarchard): Consider limitting height to 256 to avoid overflow. + dst_ptr[x] = sum < 65535u ? 
sum : 65535u; + } +} + +void ScaleARGBRowDown2_C(const uint8* src_argb, + ptrdiff_t /* src_stride */, + uint8* dst_argb, int dst_width) { + const uint32* src = reinterpret_cast(src_argb); + uint32* dst = reinterpret_cast(dst_argb); + + for (int x = 0; x < dst_width - 1; x += 2) { + dst[0] = src[1]; + dst[1] = src[3]; + src += 4; + dst += 2; + } + if (dst_width & 1) { + dst[0] = src[1]; + } +} + +void ScaleARGBRowDown2Linear_C(const uint8* src_argb, + ptrdiff_t /* src_stride */, + uint8* dst_argb, int dst_width) { + for (int x = 0; x < dst_width; ++x) { + dst_argb[0] = (src_argb[0] + src_argb[4] + 1) >> 1; + dst_argb[1] = (src_argb[1] + src_argb[5] + 1) >> 1; + dst_argb[2] = (src_argb[2] + src_argb[6] + 1) >> 1; + dst_argb[3] = (src_argb[3] + src_argb[7] + 1) >> 1; + src_argb += 8; + dst_argb += 4; + } +} + +void ScaleARGBRowDown2Box_C(const uint8* src_argb, ptrdiff_t src_stride, + uint8* dst_argb, int dst_width) { + for (int x = 0; x < dst_width; ++x) { + dst_argb[0] = (src_argb[0] + src_argb[4] + + src_argb[src_stride] + src_argb[src_stride + 4] + 2) >> 2; + dst_argb[1] = (src_argb[1] + src_argb[5] + + src_argb[src_stride + 1] + src_argb[src_stride + 5] + 2) >> 2; + dst_argb[2] = (src_argb[2] + src_argb[6] + + src_argb[src_stride + 2] + src_argb[src_stride + 6] + 2) >> 2; + dst_argb[3] = (src_argb[3] + src_argb[7] + + src_argb[src_stride + 3] + src_argb[src_stride + 7] + 2) >> 2; + src_argb += 8; + dst_argb += 4; + } +} + +void ScaleARGBRowDownEven_C(const uint8* src_argb, ptrdiff_t /* src_stride */, + int src_stepx, + uint8* dst_argb, int dst_width) { + const uint32* src = reinterpret_cast(src_argb); + uint32* dst = reinterpret_cast(dst_argb); + + for (int x = 0; x < dst_width - 1; x += 2) { + dst[0] = src[0]; + dst[1] = src[src_stepx]; + src += src_stepx * 2; + dst += 2; + } + if (dst_width & 1) { + dst[0] = src[0]; + } +} + +void ScaleARGBRowDownEvenBox_C(const uint8* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, int dst_width) { + for (int x = 0; x < dst_width; ++x) { + dst_argb[0] = (src_argb[0] + src_argb[4] + + src_argb[src_stride] + src_argb[src_stride + 4] + 2) >> 2; + dst_argb[1] = (src_argb[1] + src_argb[5] + + src_argb[src_stride + 1] + src_argb[src_stride + 5] + 2) >> 2; + dst_argb[2] = (src_argb[2] + src_argb[6] + + src_argb[src_stride + 2] + src_argb[src_stride + 6] + 2) >> 2; + dst_argb[3] = (src_argb[3] + src_argb[7] + + src_argb[src_stride + 3] + src_argb[src_stride + 7] + 2) >> 2; + src_argb += src_stepx * 4; + dst_argb += 4; + } +} + +// Scales a single row of pixels using point sampling. +void ScaleARGBCols_C(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) { + const uint32* src = reinterpret_cast(src_argb); + uint32* dst = reinterpret_cast(dst_argb); + for (int j = 0; j < dst_width - 1; j += 2) { + dst[0] = src[x >> 16]; + x += dx; + dst[1] = src[x >> 16]; + x += dx; + dst += 2; + } + if (dst_width & 1) { + dst[0] = src[x >> 16]; + } +} + +// Scales a single row of pixels up by 2x using point sampling. 
+void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb, + int dst_width, int, int) { + const uint32* src = reinterpret_cast(src_argb); + uint32* dst = reinterpret_cast(dst_argb); + for (int j = 0; j < dst_width - 1; j += 2) { + dst[1] = dst[0] = src[0]; + src += 1; + dst += 2; + } + if (dst_width & 1) { + dst[0] = src[0]; + } +} + +// Mimics SSSE3 blender +#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b) * f) >> 7 +#define BLENDERC(a, b, f, s) static_cast( \ + BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s) +#define BLENDER(a, b, f) \ + BLENDERC(a, b, f, 24) | BLENDERC(a, b, f, 16) | \ + BLENDERC(a, b, f, 8) | BLENDERC(a, b, f, 0) + +void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) { + const uint32* src = reinterpret_cast(src_argb); + uint32* dst = reinterpret_cast(dst_argb); + for (int j = 0; j < dst_width - 1; j += 2) { + int xi = x >> 16; + int xf = (x >> 9) & 0x7f; + uint32 a = src[xi]; + uint32 b = src[xi + 1]; + dst[0] = BLENDER(a, b, xf); + x += dx; + xi = x >> 16; + xf = (x >> 9) & 0x7f; + a = src[xi]; + b = src[xi + 1]; + dst[1] = BLENDER(a, b, xf); + x += dx; + dst += 2; + } + if (dst_width & 1) { + int xi = x >> 16; + int xf = (x >> 9) & 0x7f; + uint32 a = src[xi]; + uint32 b = src[xi + 1]; + dst[0] = BLENDER(a, b, xf); + } +} +#undef BLENDER1 +#undef BLENDERC +#undef BLENDER + // Scale plane vertically with bilinear interpolation. void ScalePlaneVertical(int src_height, int dst_width, int dst_height, diff --git a/source/scale_neon.cc b/source/scale_neon.cc index 5912f5251..566fb8d61 100644 --- a/source/scale_neon.cc +++ b/source/scale_neon.cc @@ -8,7 +8,6 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "libyuv/basic_types.h" #include "libyuv/row.h" #ifdef __cplusplus @@ -16,7 +15,7 @@ namespace libyuv { extern "C" { #endif -// This module is for GCC Neon +// This module is for GCC Neon. #if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) // NEON downscalers with interpolation. @@ -546,6 +545,123 @@ void ScaleFilterRows_NEON(uint8* dst_ptr, : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc" ); } + +void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */, + uint8* dst, int dst_width) { + asm volatile ( + "1: \n" + // load even pixels into q0, odd into q1 + "vld2.32 {q0, q1}, [%0]! \n" + "vld2.32 {q2, q3}, [%0]! \n" + "subs %2, %2, #8 \n" // 8 processed per loop + "vst1.8 {q1}, [%1]! \n" // store odd pixels + "vst1.8 {q3}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List + ); +} + +void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + asm volatile ( + // change the stride to row 2 pointer + "add %1, %1, %0 \n" + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts. + "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB pixels. + "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB pixels. + "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts. 
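+    // After the A channel is accumulated below, each 16-bit lane holds the
+    // sum of a 2x2 block; vrshrn #2 then divides by 4 with rounding, giving
+    // the box-filtered average for each channel.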
+ "vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts. + "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack + "vrshrn.u16 d1, q1, #2 \n" + "vrshrn.u16 d2, q2, #2 \n" + "vrshrn.u16 d3, q3, #2 \n" + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11" + ); +} + +// Reads 4 pixels at a time. +// Alignment requirement: src_argb 4 byte aligned. +void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t, int src_stepx, + uint8* dst_argb, int dst_width) { + asm volatile ( + "mov r12, %3, lsl #2 \n" + ".p2align 2 \n" + "1: \n" + "vld1.32 {d0[0]}, [%0], r12 \n" + "vld1.32 {d0[1]}, [%0], r12 \n" + "vld1.32 {d1[0]}, [%0], r12 \n" + "vld1.32 {d1[1]}, [%0], r12 \n" + "subs %2, %2, #4 \n" // 4 pixels per loop. + "vst1.8 {q0}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : "r"(src_stepx) // %3 + : "memory", "cc", "r12", "q0" + ); +} + +// Reads 4 pixels at a time. +// Alignment requirement: src_argb 4 byte aligned. +void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, int dst_width) { + asm volatile ( + "mov r12, %4, lsl #2 \n" + "add %1, %1, %0 \n" + ".p2align 2 \n" + "1: \n" + "vld1.8 {d0}, [%0], r12 \n" // Read 4 2x2 blocks -> 2x1 + "vld1.8 {d1}, [%1], r12 \n" + "vld1.8 {d2}, [%0], r12 \n" + "vld1.8 {d3}, [%1], r12 \n" + "vld1.8 {d4}, [%0], r12 \n" + "vld1.8 {d5}, [%1], r12 \n" + "vld1.8 {d6}, [%0], r12 \n" + "vld1.8 {d7}, [%1], r12 \n" + "vaddl.u8 q0, d0, d1 \n" + "vaddl.u8 q1, d2, d3 \n" + "vaddl.u8 q2, d4, d5 \n" + "vaddl.u8 q3, d6, d7 \n" + "vswp.8 d1, d2 \n" // ab_cd -> ac_bd + "vswp.8 d5, d6 \n" // ef_gh -> eg_fh + "vadd.u16 q0, q0, q1 \n" // (a+b)_(c+d) + "vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h) + "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels. + "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels. + "subs %3, %3, #4 \n" // 4 pixels per loop. + "vst1.8 {q0}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stride), // %1 + "+r"(dst_argb), // %2 + "+r"(dst_width) // %3 + : "r"(src_stepx) // %4 + : "memory", "cc", "r12", "q0", "q1", "q2", "q3" + ); +} + #endif // __ARM_NEON__ #ifdef __cplusplus diff --git a/source/scale_posix.cc b/source/scale_posix.cc new file mode 100644 index 000000000..6c69e3926 --- /dev/null +++ b/source/scale_posix.cc @@ -0,0 +1,1337 @@ +/* + * Copyright 2013 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for GCC x86 and x64. +#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) + +// Offsets for source bytes 0 to 9 +static uvec8 kShuf0 = + { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12. +static uvec8 kShuf1 = + { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. 
+static uvec8 kShuf2 = + { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Offsets for source bytes 0 to 10 +static uvec8 kShuf01 = + { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 }; + +// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13. +static uvec8 kShuf11 = + { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 }; + +// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. +static uvec8 kShuf21 = + { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 }; + +// Coefficients for source bytes 0 to 10 +static uvec8 kMadd01 = + { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 }; + +// Coefficients for source bytes 10 to 21 +static uvec8 kMadd11 = + { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 }; + +// Coefficients for source bytes 21 to 31 +static uvec8 kMadd21 = + { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 }; + +// Coefficients for source bytes 21 to 31 +static vec16 kRound34 = + { 2, 2, 2, 2, 2, 2, 2, 2 }; + +static uvec8 kShuf38a = + { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; + +static uvec8 kShuf38b = + { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 }; + +// Arrange words 0,3,6 into 0,1,2 +static uvec8 kShufAc = + { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Arrange words 0,3,6 into 3,4,5 +static uvec8 kShufAc3 = + { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 }; + +// Scaling values for boxes of 3x3 and 2x3 +static uvec16 kScaleAc33 = + { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 }; + +// Arrange first value for pixels 0,1,2,3,4,5 +static uvec8 kShufAb0 = + { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 }; + +// Arrange second value for pixels 0,1,2,3,4,5 +static uvec8 kShufAb1 = + { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 }; + +// Arrange third value for pixels 0,1,2,3,4,5 +static uvec8 kShufAb2 = + { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 }; + +// Scaling values for boxes of 3x2 and 2x2 +static uvec16 kScaleAb2 = + { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 }; + +// TODO(nfullagar): For Native Client: When new toolchain becomes available, +// take advantage of bundle lock / unlock feature. This will reduce the amount +// of manual bundle alignment done below, and bundle alignment could even be +// moved into each macro that doesn't use %%nacl: such as MEMOPREG. 
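+
+// Two conventions used throughout this file, noted here for orientation:
+// 1. The kShuf* tables above are pshufb control masks; an index of 128 has
+//    the high bit set, so the corresponding output byte is zeroed. The
+//    kScale* tables hold 16.16 reciprocals so pmulhuw can replace a divide,
+//    e.g. a 9-pixel box sum is divided by 9 as (sum * (65536 / 9)) >> 16,
+//    matching the C reference ScaleRowDown38_3_Box_C.
+// 2. The MEM* macros below hide Native Client sandboxing. A rough sketch of
+//    the expansion, derived from the definitions that follow:
+//      MEMOPREG(movdqa,0x00,0,3,1,xmm2)
+//        plain build: "movdqa 0x00(%0,%3,1),%%xmm2"
+//        NaCl x86-64: "lea 0x00(%q0,%q3,1),%%r14d" followed by
+//                     "movdqa (%%r15,%%r14),%%xmm2"
+//    i.e. the effective address goes through r14 and is accessed relative to
+//    the r15 sandbox base, which is why r14 appears in the clobber lists.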
+ +#if defined(__native_client__) && defined(__x86_64__) +#define MEMACCESS(base) "%%nacl:(%%r15,%q" #base ")" +#define MEMACCESS2(offset, base) "%%nacl:" #offset "(%%r15,%q" #base ")" +#define MEMLEA(offset, base) #offset "(%q" #base ")" +#define MEMLEA3(offset, index, scale) \ + #offset "(,%q" #index "," #scale ")" +#define MEMLEA4(offset, base, index, scale) \ + #offset "(%q" #base ",%q" #index "," #scale ")" +#define MEMOPREG(opcode, offset, base, index, scale, reg) \ + "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \ + #opcode " (%%r15,%%r14),%%" #reg "\n" +#define MEMOPREGK(opcode, offset, base, index, scale, reg) \ + "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \ + #opcode " (%%r15,%%r14),%k" #reg "\n" +#define MEMOPMEM(opcode, reg, offset, base, index, scale) \ + "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \ + #opcode " %%" #reg ",(%%r15,%%r14)\n" +#define BUNDLEALIGN ".p2align 5 \n" +#else +#define MEMACCESS(base) "(%" #base ")" +#define MEMACCESS2(offset, base) #offset "(%" #base ")" +#define MEMLEA(offset, base) #offset "(%" #base ")" +#define MEMLEA3(offset, index, scale) \ + #offset "(,%" #index "," #scale ")" +#define MEMLEA4(offset, base, index, scale) \ + #offset "(%" #base ",%" #index "," #scale ")" +#define MEMOPREG(opcode, offset, base, index, scale, reg) \ + #opcode " " #offset "(%" #base ",%" #index "," #scale "),%%" #reg "\n" +#define MEMOPREGK(opcode, offset, base, index, scale, reg) \ + #opcode " " #offset "(%" #base ",%" #index "," #scale "),%k" #reg "\n" +#define MEMOPMEM(opcode, reg, offset, base, index, scale) \ + #opcode " %%" #reg ","#offset "(%" #base ",%" #index "," #scale ")\n" +#define BUNDLEALIGN +#endif + +// GCC versions of row functions are verbatim conversions from Visual C. 
+// Generated using gcc disassembly on Visual C object file: +// objdump -D yuvscaler.obj >yuvscaler.txt + +void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + ".p2align 4 \n" + BUNDLEALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1" +#endif + ); +} + +void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + ".p2align 4 \n" + BUNDLEALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10, 0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "movdqa %%xmm1,%%xmm3 \n" + "psrlw $0x8,%%xmm1 \n" + "pand %%xmm5,%%xmm2 \n" + "pand %%xmm5,%%xmm3 \n" + "pavgw %%xmm2,%%xmm0 \n" + "pavgw %%xmm3,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm5" +#endif + ); +} + +void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + ".p2align 4 \n" + BUNDLEALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + MEMOPREG(movdqa,0x00,0,3,1,xmm2) // movdqa (%0,%3,1),%%xmm2 + BUNDLEALIGN + MEMOPREG(movdqa,0x10,0,3,1,xmm3) // movdqa 0x10(%0,%3,1),%%xmm3 + "lea " MEMLEA(0x20,0) ",%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "movdqa %%xmm1,%%xmm3 \n" + "psrlw $0x8,%%xmm1 \n" + "pand %%xmm5,%%xmm2 \n" + "pand %%xmm5,%%xmm3 \n" + "pavgw %%xmm2,%%xmm0 \n" + "pavgw %%xmm3,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(static_cast(src_stride)) // %3 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" +#endif + ); +} + +void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + ".p2align 4 \n" + BUNDLEALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1" +#endif + ); +} + +void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr, ptrdiff_t, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + ".p2align 4 \n" + 
BUNDLEALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "movdqa %%xmm1,%%xmm3 \n" + "psrlw $0x8,%%xmm1 \n" + "pand %%xmm5,%%xmm2 \n" + "pand %%xmm5,%%xmm3 \n" + "pavgw %%xmm2,%%xmm0 \n" + "pavgw %%xmm3,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm5" +#endif + ); +} + +void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + ".p2align 4 \n" + BUNDLEALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2 + BUNDLEALIGN + MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3 + "lea " MEMLEA(0x20,0) ",%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "movdqa %%xmm1,%%xmm3 \n" + "psrlw $0x8,%%xmm1 \n" + "pand %%xmm5,%%xmm2 \n" + "pand %%xmm5,%%xmm3 \n" + "pavgw %%xmm2,%%xmm0 \n" + "pavgw %%xmm3,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(static_cast(src_stride)) // %3 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" +#endif + ); +} + +void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrld $0x18,%%xmm5 \n" + "pslld $0x10,%%xmm5 \n" + ".p2align 4 \n" + BUNDLEALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "psrlw $0x8,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm5" +#endif + ); +} + +void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + intptr_t stridex3 = 0; + asm volatile ( + "pcmpeqb %%xmm7,%%xmm7 \n" + "psrlw $0x8,%%xmm7 \n" + "lea " MEMLEA4(0x00,4,4,2) ",%3 \n" + ".p2align 4 \n" + BUNDLEALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + MEMOPREG(movdqa,0x00,0,4,1,xmm2) // movdqa (%0,%4,1),%%xmm2 + BUNDLEALIGN + MEMOPREG(movdqa,0x10,0,4,1,xmm3) // movdqa 0x10(%0,%4,1),%%xmm3 + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + MEMOPREG(movdqa,0x00,0,4,2,xmm2) // movdqa (%0,%4,2),%%xmm2 + BUNDLEALIGN + MEMOPREG(movdqa,0x10,0,4,2,xmm3) // movdqa 0x10(%0,%4,2),%%xmm3 + MEMOPREG(movdqa,0x00,0,3,1,xmm4) // movdqa (%0,%3,1),%%xmm4 + MEMOPREG(movdqa,0x10,0,3,1,xmm5) // movdqa 0x10(%0,%3,1),%%xmm5 + "lea " MEMLEA(0x20,0) ",%0 \n" + "pavgb %%xmm4,%%xmm2 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm5,%%xmm3 \n" + "pavgb %%xmm3,%%xmm1 \n" + 
"movdqa %%xmm0,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "movdqa %%xmm1,%%xmm3 \n" + "psrlw $0x8,%%xmm1 \n" + "pand %%xmm7,%%xmm2 \n" + "pand %%xmm7,%%xmm3 \n" + "pavgw %%xmm2,%%xmm0 \n" + "pavgw %%xmm3,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "pand %%xmm7,%%xmm2 \n" + "pavgw %%xmm2,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(stridex3) // %3 + : "r"(static_cast(src_stride)) // %4 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm7" +#endif + ); +} + +void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "movdqa %0,%%xmm3 \n" + "movdqa %1,%%xmm4 \n" + "movdqa %2,%%xmm5 \n" + : + : "m"(kShuf0), // %0 + "m"(kShuf1), // %1 + "m"(kShuf2) // %2 + ); + asm volatile ( + ".p2align 4 \n" + BUNDLEALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm2 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "movdqa %%xmm2,%%xmm1 \n" + "palignr $0x8,%%xmm0,%%xmm1 \n" + "pshufb %%xmm3,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "pshufb %%xmm5,%%xmm2 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + "movq %%xmm1," MEMACCESS2(0x8,1) " \n" + "movq %%xmm2," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x18,1) ",%1 \n" + "sub $0x18,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "movdqa %0,%%xmm2 \n" // kShuf01 + "movdqa %1,%%xmm3 \n" // kShuf11 + "movdqa %2,%%xmm4 \n" // kShuf21 + : + : "m"(kShuf01), // %0 + "m"(kShuf11), // %1 + "m"(kShuf21) // %2 + ); + asm volatile ( + "movdqa %0,%%xmm5 \n" // kMadd01 + "movdqa %1,%%xmm0 \n" // kMadd11 + "movdqa %2,%%xmm1 \n" // kRound34 + : + : "m"(kMadd01), // %0 + "m"(kMadd11), // %1 + "m"(kRound34) // %2 + ); + asm volatile ( + ".p2align 4 \n" + BUNDLEALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm6 \n" + MEMOPREG(movdqa,0x00,0,3,1,xmm7) // movdqa (%0,%3),%%xmm7 + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm5,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6," MEMACCESS(1) " \n" + "movdqu " MEMACCESS2(0x8,0) ",%%xmm6 \n" + MEMOPREG(movdqu,0x8,0,3,1,xmm7) // movdqu 0x8(%0,%3),%%xmm7 + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm3,%%xmm6 \n" + "pmaddubsw %%xmm0,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6," MEMACCESS2(0x8,1) " \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm6 \n" + BUNDLEALIGN + MEMOPREG(movdqa,0x10,0,3,1,xmm7) // movdqa 0x10(%0,%3),%%xmm7 + "lea " MEMLEA(0x20,0) ",%0 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm4,%%xmm6 \n" + "pmaddubsw %4,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x18,1) ",%1 \n" + "sub $0x18,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(static_cast(src_stride)), // %3 + "m"(kMadd21) // %4 + : "memory", "cc" +#if 
defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" +#endif + ); +} + +void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "movdqa %0,%%xmm2 \n" // kShuf01 + "movdqa %1,%%xmm3 \n" // kShuf11 + "movdqa %2,%%xmm4 \n" // kShuf21 + : + : "m"(kShuf01), // %0 + "m"(kShuf11), // %1 + "m"(kShuf21) // %2 + ); + asm volatile ( + "movdqa %0,%%xmm5 \n" // kMadd01 + "movdqa %1,%%xmm0 \n" // kMadd11 + "movdqa %2,%%xmm1 \n" // kRound34 + : + : "m"(kMadd01), // %0 + "m"(kMadd11), // %1 + "m"(kRound34) // %2 + ); + + asm volatile ( + ".p2align 4 \n" + BUNDLEALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm6 \n" + MEMOPREG(movdqa,0x00,0,3,1,xmm7) // movdqa (%0,%3,1),%%xmm7 + "pavgb %%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm5,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6," MEMACCESS(1) " \n" + "movdqu " MEMACCESS2(0x8,0) ",%%xmm6 \n" + MEMOPREG(movdqu,0x8,0,3,1,xmm7) // movdqu 0x8(%0,%3,1),%%xmm7 + "pavgb %%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm3,%%xmm6 \n" + "pmaddubsw %%xmm0,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6," MEMACCESS2(0x8,1) " \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm6 \n" + MEMOPREG(movdqa,0x10,0,3,1,xmm7) // movdqa 0x10(%0,%3,1),%%xmm7 + "lea " MEMLEA(0x20,0) ",%0 \n" + "pavgb %%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm4,%%xmm6 \n" + "pmaddubsw %4,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x18,1) ",%1 \n" + "sub $0x18,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(static_cast(src_stride)), // %3 + "m"(kMadd21) // %4 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" +#endif + ); +} + +void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + ".p2align 4 \n" + BUNDLEALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "pshufb %%xmm4,%%xmm0 \n" + "pshufb %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + "movhlps %%xmm0,%%xmm1 \n" + "movd %%xmm1," MEMACCESS2(0x8,1) " \n" + "lea " MEMLEA(0xc,1) ",%1 \n" + "sub $0xc,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "m"(kShuf38a), // %3 + "m"(kShuf38b) // %4 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm4", "xmm5" +#endif + ); +} + +void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "movdqa %0,%%xmm2 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm4 \n" + "movdqa %3,%%xmm5 \n" + : + : "m"(kShufAb0), // %0 + "m"(kShufAb1), // %1 + "m"(kShufAb2), // %2 + "m"(kScaleAb2) // %3 + ); + asm volatile ( + ".p2align 4 \n" + BUNDLEALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(pavgb,0x00,0,3,1,xmm0) // pavgb (%0,%3,1),%%xmm0 + "lea " MEMLEA(0x10,0) ",%0 \n" + "movdqa %%xmm0,%%xmm1 \n" + 
"pshufb %%xmm2,%%xmm1 \n" + "movdqa %%xmm0,%%xmm6 \n" + "pshufb %%xmm3,%%xmm6 \n" + "paddusw %%xmm6,%%xmm1 \n" + "pshufb %%xmm4,%%xmm0 \n" + "paddusw %%xmm0,%%xmm1 \n" + "pmulhuw %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "sub $0x6,%2 \n" + "movd %%xmm1," MEMACCESS(1) " \n" + "psrlq $0x10,%%xmm1 \n" + "movd %%xmm1," MEMACCESS2(0x2,1) " \n" + "lea " MEMLEA(0x6,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(static_cast(src_stride)) // %3 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" +#endif + ); +} + +void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "movdqa %0,%%xmm2 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" + : + : "m"(kShufAc), // %0 + "m"(kShufAc3), // %1 + "m"(kScaleAc33) // %2 + ); + asm volatile ( + ".p2align 4 \n" + BUNDLEALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movdqa,0x00,0,3,1,xmm6) // movdqa (%0,%3,1),%%xmm6 + "movhlps %%xmm0,%%xmm1 \n" + "movhlps %%xmm6,%%xmm7 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm6 \n" + "punpcklbw %%xmm5,%%xmm7 \n" + "paddusw %%xmm6,%%xmm0 \n" + "paddusw %%xmm7,%%xmm1 \n" + MEMOPREG(movdqa,0x00,0,3,2,xmm6) // movdqa (%0,%3,2),%%xmm6 + "lea " MEMLEA(0x10,0) ",%0 \n" + "movhlps %%xmm6,%%xmm7 \n" + "punpcklbw %%xmm5,%%xmm6 \n" + "punpcklbw %%xmm5,%%xmm7 \n" + "paddusw %%xmm6,%%xmm0 \n" + "paddusw %%xmm7,%%xmm1 \n" + "movdqa %%xmm0,%%xmm6 \n" + "psrldq $0x2,%%xmm0 \n" + "paddusw %%xmm0,%%xmm6 \n" + "psrldq $0x2,%%xmm0 \n" + "paddusw %%xmm0,%%xmm6 \n" + "pshufb %%xmm2,%%xmm6 \n" + "movdqa %%xmm1,%%xmm7 \n" + "psrldq $0x2,%%xmm1 \n" + "paddusw %%xmm1,%%xmm7 \n" + "psrldq $0x2,%%xmm1 \n" + "paddusw %%xmm1,%%xmm7 \n" + "pshufb %%xmm3,%%xmm7 \n" + "paddusw %%xmm7,%%xmm6 \n" + "pmulhuw %%xmm4,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "sub $0x6,%2 \n" + "movd %%xmm6," MEMACCESS(1) " \n" + "psrlq $0x10,%%xmm6 \n" + "movd %%xmm6," MEMACCESS2(0x2,1) " \n" + "lea " MEMLEA(0x6,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(static_cast(src_stride)) // %3 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" +#endif + ); +} + +void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint16* dst_ptr, int src_width, int src_height) { + int tmp_height = 0; + intptr_t tmp_src = 0; + asm volatile ( + "pxor %%xmm4,%%xmm4 \n" + "sub $0x1,%5 \n" + ".p2align 4 \n" + BUNDLEALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "mov %0,%3 \n" + "add %6,%0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm4,%%xmm0 \n" + "punpckhbw %%xmm4,%%xmm1 \n" + "mov %5,%2 \n" + "test %2,%2 \n" + "je 3f \n" + ".p2align 4 \n" + BUNDLEALIGN + "2: \n" + "movdqa " MEMACCESS(0) ",%%xmm2 \n" + "add %6,%0 \n" + "movdqa %%xmm2,%%xmm3 \n" + "punpcklbw %%xmm4,%%xmm2 \n" + "punpckhbw %%xmm4,%%xmm3 \n" + "paddusw %%xmm2,%%xmm0 \n" + "paddusw %%xmm3,%%xmm1 \n" + "sub $0x1,%2 \n" + "jg 2b \n" + ".p2align 4 \n" + "3: \n" + BUNDLEALIGN + "movdqa %%xmm0," MEMACCESS(1) " \n" + "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x10,3) ",%0 \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x10,%4 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + 
"+r"(dst_ptr), // %1 + "+r"(tmp_height), // %2 + "+r"(tmp_src), // %3 + "+r"(src_width), // %4 + "+rm"(src_height) // %5 + : "rm"(static_cast(src_stride)) // %6 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" +#endif + ); +} + +// Bilinear column filtering. SSSE3 version. +void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx) { + intptr_t x0 = 0, x1 = 0, temp_pixel = 0; + asm volatile ( + "movd %6,%%xmm2 \n" + "movd %7,%%xmm3 \n" + "movl $0x04040000,%k2 \n" + "movd %k2,%%xmm5 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrlw $0x9,%%xmm6 \n" + "pextrw $0x1,%%xmm2,%k3 \n" + "subl $0x2,%5 \n" + "jl 29f \n" + "movdqa %%xmm2,%%xmm0 \n" + "paddd %%xmm3,%%xmm0 \n" + "punpckldq %%xmm0,%%xmm2 \n" + "punpckldq %%xmm3,%%xmm3 \n" + "paddd %%xmm3,%%xmm3 \n" + "pextrw $0x3,%%xmm2,%k4 \n" + ".p2align 2 \n" + BUNDLEALIGN + "2: \n" + "movdqa %%xmm2,%%xmm1 \n" + "paddd %%xmm3,%%xmm2 \n" + MEMOPREGK(movzwl,0x00,1,3,1,2) // movzwl (%1,%3,1),%k2 + "movd %k2,%%xmm0 \n" + "psrlw $0x9,%%xmm1 \n" + BUNDLEALIGN + MEMOPREGK(movzwl,0x00,1,4,1,2) // movzwl (%1,%4,1),%k2 + "movd %k2,%%xmm4 \n" + "pshufb %%xmm5,%%xmm1 \n" + "punpcklwd %%xmm4,%%xmm0 \n" + "pxor %%xmm6,%%xmm1 \n" + "pmaddubsw %%xmm1,%%xmm0 \n" + "pextrw $0x1,%%xmm2,%k3 \n" + "pextrw $0x3,%%xmm2,%k4 \n" + "psrlw $0x7,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movd %%xmm0,%k2 \n" + "mov %w2," MEMACCESS(0) " \n" + "lea " MEMLEA(0x2,0) ",%0 \n" + "sub $0x2,%5 \n" + "jge 2b \n" + ".p2align 2 \n" + BUNDLEALIGN + "29: \n" + "addl $0x1,%5 \n" + "jl 99f \n" + MEMOPREGK(movzwl,0x00,1,3,1,2) // movzwl (%1,%3,1),%k2 + "movd %k2,%%xmm0 \n" + "psrlw $0x9,%%xmm2 \n" + "pshufb %%xmm5,%%xmm2 \n" + "pxor %%xmm6,%%xmm2 \n" + "pmaddubsw %%xmm2,%%xmm0 \n" + "psrlw $0x7,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movd %%xmm0,%k2 \n" + "mov %b2," MEMACCESS(0) " \n" + "99: \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+a"(temp_pixel), // %2 + "+r"(x0), // %3 + "+r"(x1), // %4 + "+rm"(dst_width) // %5 + : "rm"(x), // %6 + "rm"(dx) // %7 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" +#endif + ); +} + +// Reads 4 pixels, duplicates them and writes 8 pixels. +// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. 
+void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int /* x */, int /* dx */) { + asm volatile ( + ".p2align 4 \n" + BUNDLEALIGN + "1: \n" + "movdqa " MEMACCESS(1) ",%%xmm0 \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "sub $0x20,%2 \n" + "movdqa %%xmm0," MEMACCESS(0) " \n" + "movdqa %%xmm1," MEMACCESS2(0x10,0) " \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "jg 1b \n" + + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1" +#endif + ); +} + +void ScaleARGBRowDown2_SSE2(const uint8* src_argb, + ptrdiff_t /* src_stride */, + uint8* dst_argb, int dst_width) { + asm volatile ( + ".p2align 4 \n" + BUNDLEALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "shufps $0xdd,%%xmm1,%%xmm0 \n" + "sub $0x4,%2 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1" +#endif + ); +} + +void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, + ptrdiff_t /* src_stride */, + uint8* dst_argb, int dst_width) { + asm volatile ( + ".p2align 4 \n" + BUNDLEALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm2 \n" + "pavgb %%xmm2,%%xmm0 \n" + "sub $0x4,%2 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1" +#endif + ); +} + +void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, + uint8* dst_argb, int dst_width) { + asm volatile ( + ".p2align 4 \n" + BUNDLEALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + BUNDLEALIGN + MEMOPREG(movdqa,0x00,0,3,1,xmm2) // movdqa (%0,%3,1),%%xmm2 + MEMOPREG(movdqa,0x10,0,3,1,xmm3) // movdqa 0x10(%0,%3,1),%%xmm3 + "lea " MEMLEA(0x20,0) ",%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm2 \n" + "pavgb %%xmm2,%%xmm0 \n" + "sub $0x4,%2 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : "r"(static_cast(src_stride)) // %3 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3" +#endif + ); +} + +// Reads 4 pixels at a time. +// Alignment requirement: dst_argb 16 byte aligned. 
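+// ScaleARGBRowDownEven_C (earlier in this change) shows the intended result:
+// every src_stepx-th ARGB pixel, starting at pixel 0, is copied to the
+// destination. The SSE2 version below gathers four such pixels per iteration
+// with movd loads and punpckldq/punpcklqdq merges, then stores them as one
+// aligned 16-byte write.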
+void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, int dst_width) { + intptr_t src_stepx_x4 = static_cast(src_stepx); + intptr_t src_stepx_x12 = 0; + asm volatile ( + "lea " MEMLEA3(0x00,1,4) ",%1 \n" + "lea " MEMLEA4(0x00,1,1,2) ",%4 \n" + ".p2align 4 \n" + BUNDLEALIGN + "1: \n" + "movd " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movd,0x00,0,1,1,xmm1) // movd (%0,%1,1),%%xmm1 + "punpckldq %%xmm1,%%xmm0 \n" + BUNDLEALIGN + MEMOPREG(movd,0x00,0,1,2,xmm2) // movd (%0,%1,2),%%xmm2 + MEMOPREG(movd,0x00,0,4,1,xmm3) // movd (%0,%4,1),%%xmm3 + "lea " MEMLEA4(0x00,0,1,4) ",%0 \n" + "punpckldq %%xmm3,%%xmm2 \n" + "punpcklqdq %%xmm2,%%xmm0 \n" + "sub $0x4,%3 \n" + "movdqa %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x10,2) ",%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stepx_x4), // %1 + "+r"(dst_argb), // %2 + "+r"(dst_width), // %3 + "+r"(src_stepx_x12) // %4 + : + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3" +#endif + ); +} + +// Blends four 2x2 to 4x1. +// Alignment requirement: dst_argb 16 byte aligned. +void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, int src_stepx, + uint8* dst_argb, int dst_width) { + intptr_t src_stepx_x4 = static_cast(src_stepx); + intptr_t src_stepx_x12 = 0; + intptr_t row1 = static_cast(src_stride); + asm volatile ( + "lea " MEMLEA3(0x00,1,4) ",%1 \n" + "lea " MEMLEA4(0x00,1,1,2) ",%4 \n" + "lea " MEMLEA4(0x00,0,5,1) ",%5 \n" + ".p2align 4 \n" + BUNDLEALIGN + "1: \n" + "movq " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movhps,0x00,0,1,1,xmm0) // movhps (%0,%1,1),%%xmm0 + MEMOPREG(movq,0x00,0,1,2,xmm1) // movq (%0,%1,2),%%xmm1 + BUNDLEALIGN + MEMOPREG(movhps,0x00,0,4,1,xmm1) // movhps (%0,%4,1),%%xmm1 + "lea " MEMLEA4(0x00,0,1,4) ",%0 \n" + "movq " MEMACCESS(5) ",%%xmm2 \n" + BUNDLEALIGN + MEMOPREG(movhps,0x00,5,1,1,xmm2) // movhps (%5,%1,1),%%xmm2 + MEMOPREG(movq,0x00,5,1,2,xmm3) // movq (%5,%1,2),%%xmm3 + MEMOPREG(movhps,0x00,5,4,1,xmm3) // movhps (%5,%4,1),%%xmm3 + "lea " MEMLEA4(0x00,5,1,4) ",%5 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm2 \n" + "pavgb %%xmm2,%%xmm0 \n" + "sub $0x4,%3 \n" + "movdqa %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x10,2) ",%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stepx_x4), // %1 + "+r"(dst_argb), // %2 + "+rm"(dst_width), // %3 + "+r"(src_stepx_x12), // %4 + "+r"(row1) // %5 + : + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3" +#endif + ); +} + +void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) { + intptr_t x0 = 0, x1 = 0; + asm volatile ( + "movd %5,%%xmm2 \n" + "movd %6,%%xmm3 \n" + "pshufd $0x0,%%xmm2,%%xmm2 \n" + "pshufd $0x11,%%xmm3,%%xmm0 \n" + "paddd %%xmm0,%%xmm2 \n" + "paddd %%xmm3,%%xmm3 \n" + "pshufd $0x5,%%xmm3,%%xmm0 \n" + "paddd %%xmm0,%%xmm2 \n" + "paddd %%xmm3,%%xmm3 \n" + "pshufd $0x0,%%xmm3,%%xmm3 \n" + "pextrw $0x1,%%xmm2,%k0 \n" + "pextrw $0x3,%%xmm2,%k1 \n" + "cmp $0x0,%4 \n" + "jl 99f \n" + "sub $0x4,%4 \n" + "jl 49f \n" + ".p2align 2 \n" + BUNDLEALIGN + "40: \n" + MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0 + MEMOPREG(movd,0x00,3,1,4,xmm1) // movd (%3,%1,4),%%xmm1 + "pextrw $0x5,%%xmm2,%k0 \n" + "pextrw $0x7,%%xmm2,%k1 \n" + "paddd %%xmm3,%%xmm2 
\n" + "punpckldq %%xmm1,%%xmm0 \n" + MEMOPREG(movd,0x00,3,0,4,xmm1) // movd (%3,%0,4),%%xmm1 + MEMOPREG(movd,0x00,3,1,4,xmm4) // movd (%3,%1,4),%%xmm4 + "pextrw $0x1,%%xmm2,%k0 \n" + "pextrw $0x3,%%xmm2,%k1 \n" + "punpckldq %%xmm4,%%xmm1 \n" + "punpcklqdq %%xmm1,%%xmm0 \n" + "sub $0x4,%4 \n" + "movdqu %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x10,2) ",%2 \n" + "jge 40b \n" + + "49: \n" + "test $0x2,%4 \n" + "je 29f \n" + BUNDLEALIGN + MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0 + MEMOPREG(movd,0x00,3,1,4,xmm1) // movd (%3,%1,4),%%xmm1 + "pextrw $0x5,%%xmm2,%k0 \n" + "punpckldq %%xmm1,%%xmm0 \n" + "movq %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x8,2) ",%2 \n" + "29: \n" + "test $0x1,%4 \n" + "je 99f \n" + MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0 + "movd %%xmm0," MEMACCESS(2) " \n" + "99: \n" + : "+a"(x0), // %0 + "+d"(x1), // %1 + "+r"(dst_argb), // %2 + "+r"(src_argb), // %3 + "+r"(dst_width) // %4 + : "rm"(x), // %5 + "rm"(dx) // %6 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" +#endif + ); +} + +// Reads 4 pixels, duplicates them and writes 8 pixels. +// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. +void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb, + int dst_width, int /* x */, int /* dx */) { + asm volatile ( + ".p2align 4 \n" + BUNDLEALIGN + "1: \n" + "movdqa " MEMACCESS(1) ",%%xmm0 \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpckldq %%xmm0,%%xmm0 \n" + "punpckhdq %%xmm1,%%xmm1 \n" + "sub $0x8,%2 \n" + "movdqa %%xmm0," MEMACCESS(0) " \n" + "movdqa %%xmm1," MEMACCESS2(0x10,0) " \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "jg 1b \n" + + : "+r"(dst_argb), // %0 + "+r"(src_argb), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1" +#endif + ); +} + +// Shuffle table for arranging 2 pixels into pairs for pmaddubsw +static uvec8 kShuffleColARGB = { + 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel + 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel +}; + +// Shuffle table for duplicating 2 fractions into 8 bytes each +static uvec8 kShuffleFractions = { + 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, +}; + +// Bilinear row filtering combines 4x2 -> 4x1. 
SSSE3 version +void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) { + intptr_t x0 = 0, x1 = 0; + asm volatile ( + "movdqa %0,%%xmm4 \n" + "movdqa %1,%%xmm5 \n" + : + : "m"(kShuffleColARGB), // %0 + "m"(kShuffleFractions) // %1 + ); + + asm volatile ( + "movd %5,%%xmm2 \n" + "movd %6,%%xmm3 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrlw $0x9,%%xmm6 \n" + "pextrw $0x1,%%xmm2,%k3 \n" + "sub $0x2,%2 \n" + "jl 29f \n" + "movdqa %%xmm2,%%xmm0 \n" + "paddd %%xmm3,%%xmm0 \n" + "punpckldq %%xmm0,%%xmm2 \n" + "punpckldq %%xmm3,%%xmm3 \n" + "paddd %%xmm3,%%xmm3 \n" + "pextrw $0x3,%%xmm2,%k4 \n" + + ".p2align 2 \n" + BUNDLEALIGN + "2: \n" + "movdqa %%xmm2,%%xmm1 \n" + "paddd %%xmm3,%%xmm2 \n" + MEMOPREG(movq,0x00,1,3,4,xmm0) // movq (%1,%3,4),%%xmm0 + "psrlw $0x9,%%xmm1 \n" + BUNDLEALIGN + MEMOPREG(movhps,0x00,1,4,4,xmm0) // movhps (%1,%4,4),%%xmm0 + "pshufb %%xmm5,%%xmm1 \n" + "pshufb %%xmm4,%%xmm0 \n" + "pxor %%xmm6,%%xmm1 \n" + "pmaddubsw %%xmm1,%%xmm0 \n" + "psrlw $0x7,%%xmm0 \n" + "pextrw $0x1,%%xmm2,%k3 \n" + "pextrw $0x3,%%xmm2,%k4 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0," MEMACCESS(0) " \n" + "lea " MEMLEA(0x8,0) ",%0 \n" + "sub $0x2,%2 \n" + "jge 2b \n" + + ".p2align 2 \n" + BUNDLEALIGN + "29: \n" + "add $0x1,%2 \n" + "jl 99f \n" + "psrlw $0x9,%%xmm2 \n" + BUNDLEALIGN + MEMOPREG(movq,0x00,1,3,4,xmm0) // movq (%1,%3,4),%%xmm0 + "pshufb %%xmm5,%%xmm2 \n" + "pshufb %%xmm4,%%xmm0 \n" + "pxor %%xmm6,%%xmm2 \n" + "pmaddubsw %%xmm2,%%xmm0 \n" + "psrlw $0x7,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movd %%xmm0," MEMACCESS(0) " \n" + + ".p2align 4 \n" + "99: \n" + : "+r"(dst_argb), // %0 + "+r"(src_argb), // %1 + "+rm"(dst_width), // %2 + "+r"(x0), // %3 + "+r"(x1) // %4 + : "rm"(x), // %5 + "rm"(dx) // %6 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" +#endif + ); +} + +#endif // defined(__x86_64__) || defined(__i386__) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/source/scale_win.cc b/source/scale_win.cc new file mode 100644 index 000000000..d722e3bc7 --- /dev/null +++ b/source/scale_win.cc @@ -0,0 +1,1282 @@ +/* + * Copyright 2013 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for Visual C x86. +#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) + +// Offsets for source bytes 0 to 9 +static uvec8 kShuf0 = + { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12. +static uvec8 kShuf1 = + { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. +static uvec8 kShuf2 = + { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Offsets for source bytes 0 to 10 +static uvec8 kShuf01 = + { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 }; + +// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13. 
+static uvec8 kShuf11 = + { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 }; + +// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. +static uvec8 kShuf21 = + { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 }; + +// Coefficients for source bytes 0 to 10 +static uvec8 kMadd01 = + { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 }; + +// Coefficients for source bytes 10 to 21 +static uvec8 kMadd11 = + { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 }; + +// Coefficients for source bytes 21 to 31 +static uvec8 kMadd21 = + { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 }; + +// Coefficients for source bytes 21 to 31 +static vec16 kRound34 = + { 2, 2, 2, 2, 2, 2, 2, 2 }; + +static uvec8 kShuf38a = + { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; + +static uvec8 kShuf38b = + { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 }; + +// Arrange words 0,3,6 into 0,1,2 +static uvec8 kShufAc = + { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Arrange words 0,3,6 into 3,4,5 +static uvec8 kShufAc3 = + { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 }; + +// Scaling values for boxes of 3x3 and 2x3 +static uvec16 kScaleAc33 = + { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 }; + +// Arrange first value for pixels 0,1,2,3,4,5 +static uvec8 kShufAb0 = + { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 }; + +// Arrange second value for pixels 0,1,2,3,4,5 +static uvec8 kShufAb1 = + { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 }; + +// Arrange third value for pixels 0,1,2,3,4,5 +static uvec8 kShufAb2 = + { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 }; + +// Scaling values for boxes of 3x2 and 2x2 +static uvec16 kScaleAb2 = + { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 }; + +// Reads 32 pixels, throws half away and writes 16 pixels. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. +__declspec(naked) __declspec(align(16)) +void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + + align 16 + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + psrlw xmm0, 8 // isolate odd pixels. + psrlw xmm1, 8 + packuswb xmm0, xmm1 + sub ecx, 16 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jg wloop + + ret + } +} + +// Blends 32x1 rectangle to 16x1. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. +__declspec(naked) __declspec(align(16)) +void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + + align 16 + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + + movdqa xmm2, xmm0 // average columns (32 to 16 pixels) + psrlw xmm0, 8 + movdqa xmm3, xmm1 + psrlw xmm1, 8 + pand xmm2, xmm5 + pand xmm3, xmm5 + pavgw xmm0, xmm2 + pavgw xmm1, xmm3 + packuswb xmm0, xmm1 + + sub ecx, 16 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jg wloop + + ret + } +} + +// Blends 32x2 rectangle to 16x1. 
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. +__declspec(naked) __declspec(align(16)) +void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + + align 16 + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + esi] + movdqa xmm3, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm0, xmm2 // average rows + pavgb xmm1, xmm3 + + movdqa xmm2, xmm0 // average columns (32 to 16 pixels) + psrlw xmm0, 8 + movdqa xmm3, xmm1 + psrlw xmm1, 8 + pand xmm2, xmm5 + pand xmm3, xmm5 + pavgw xmm0, xmm2 + pavgw xmm1, xmm3 + packuswb xmm0, xmm1 + + sub ecx, 16 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jg wloop + + pop esi + ret + } +} + +// Reads 32 pixels, throws half away and writes 16 pixels. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. +__declspec(naked) __declspec(align(16)) +void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + + align 16 + wloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + psrlw xmm0, 8 // isolate odd pixels. + psrlw xmm1, 8 + packuswb xmm0, xmm1 + sub ecx, 16 + movdqu [edx], xmm0 + lea edx, [edx + 16] + jg wloop + + ret + } +} + +// Blends 32x1 rectangle to 16x1. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. +__declspec(naked) __declspec(align(16)) +void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr, ptrdiff_t, + uint8* dst_ptr, int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + + align 16 + wloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + + movdqa xmm2, xmm0 // average columns (32 to 16 pixels) + psrlw xmm0, 8 + movdqa xmm3, xmm1 + psrlw xmm1, 8 + pand xmm2, xmm5 + pand xmm3, xmm5 + pavgw xmm0, xmm2 + pavgw xmm1, xmm3 + packuswb xmm0, xmm1 + + sub ecx, 16 + movdqu [edx], xmm0 + lea edx, [edx + 16] + jg wloop + + ret + } +} + +// Blends 32x2 rectangle to 16x1. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. +__declspec(naked) __declspec(align(16)) +void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + + align 16 + wloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + esi] + movdqu xmm3, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm0, xmm2 // average rows + pavgb xmm1, xmm3 + + movdqa xmm2, xmm0 // average columns (32 to 16 pixels) + psrlw xmm0, 8 + movdqa xmm3, xmm1 + psrlw xmm1, 8 + pand xmm2, xmm5 + pand xmm3, xmm5 + pavgw xmm0, xmm2 + pavgw xmm1, xmm3 + packuswb xmm0, xmm1 + + sub ecx, 16 + movdqu [edx], xmm0 + lea edx, [edx + 16] + jg wloop + + pop esi + ret + } +} + +// Point samples 32 pixels to 8 pixels. 
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. +__declspec(naked) __declspec(align(16)) +void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff0000 + psrld xmm5, 24 + pslld xmm5, 16 + + align 16 + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + pand xmm0, xmm5 + pand xmm1, xmm5 + packuswb xmm0, xmm1 + psrlw xmm0, 8 + packuswb xmm0, xmm0 + sub ecx, 8 + movq qword ptr [edx], xmm0 + lea edx, [edx + 8] + jg wloop + + ret + } +} + +// Blends 32x4 rectangle to 8x1. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. +__declspec(naked) __declspec(align(16)) +void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_ptr + mov esi, [esp + 8 + 8] // src_stride + mov edx, [esp + 8 + 12] // dst_ptr + mov ecx, [esp + 8 + 16] // dst_width + lea edi, [esi + esi * 2] // src_stride * 3 + pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff + psrlw xmm7, 8 + + align 16 + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + esi] + movdqa xmm3, [eax + esi + 16] + pavgb xmm0, xmm2 // average rows + pavgb xmm1, xmm3 + movdqa xmm2, [eax + esi * 2] + movdqa xmm3, [eax + esi * 2 + 16] + movdqa xmm4, [eax + edi] + movdqa xmm5, [eax + edi + 16] + lea eax, [eax + 32] + pavgb xmm2, xmm4 + pavgb xmm3, xmm5 + pavgb xmm0, xmm2 + pavgb xmm1, xmm3 + + movdqa xmm2, xmm0 // average columns (32 to 16 pixels) + psrlw xmm0, 8 + movdqa xmm3, xmm1 + psrlw xmm1, 8 + pand xmm2, xmm7 + pand xmm3, xmm7 + pavgw xmm0, xmm2 + pavgw xmm1, xmm3 + packuswb xmm0, xmm1 + + movdqa xmm2, xmm0 // average columns (16 to 8 pixels) + psrlw xmm0, 8 + pand xmm2, xmm7 + pavgw xmm0, xmm2 + packuswb xmm0, xmm0 + + sub ecx, 8 + movq qword ptr [edx], xmm0 + lea edx, [edx + 8] + jg wloop + + pop edi + pop esi + ret + } +} + +// Point samples 32 pixels to 24 pixels. +// Produces three 8 byte values. For each 8 bytes, 16 bytes are read. +// Then shuffled to do the scaling. + +// Note that movdqa+palign may be better than movdqu. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. +__declspec(naked) __declspec(align(16)) +void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + movdqa xmm3, kShuf0 + movdqa xmm4, kShuf1 + movdqa xmm5, kShuf2 + + align 16 + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + movdqa xmm2, xmm1 + palignr xmm1, xmm0, 8 + pshufb xmm0, xmm3 + pshufb xmm1, xmm4 + pshufb xmm2, xmm5 + movq qword ptr [edx], xmm0 + movq qword ptr [edx + 8], xmm1 + movq qword ptr [edx + 16], xmm2 + lea edx, [edx + 24] + sub ecx, 24 + jg wloop + + ret + } +} + +// Blends 32x2 rectangle to 24x1 +// Produces three 8 byte values. For each 8 bytes, 16 bytes are read. +// Then shuffled to do the scaling. + +// Register usage: +// xmm0 src_row 0 +// xmm1 src_row 1 +// xmm2 shuf 0 +// xmm3 shuf 1 +// xmm4 shuf 2 +// xmm5 madd 0 +// xmm6 madd 1 +// xmm7 kRound34 + +// Note that movdqa+palign may be better than movdqu. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. 
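+// In the two Box variants below, kMadd01/kMadd11/kMadd21 carry the horizontal
+// 3/4 box weights: pmaddubsw multiplies adjacent source columns by 3:1, 2:2
+// or 1:3, kRound34 adds the rounding constant 2, and psrlw 2 divides by 4, so
+// each output byte is roughly (3 * a + 1 * b + 2) >> 2 in the first phase.
+// The vertical blend differs between the two variants (one vs. two pavgb
+// passes over the source rows).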
+__declspec(naked) __declspec(align(16)) +void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width + movdqa xmm2, kShuf01 + movdqa xmm3, kShuf11 + movdqa xmm4, kShuf21 + movdqa xmm5, kMadd01 + movdqa xmm6, kMadd11 + movdqa xmm7, kRound34 + + align 16 + wloop: + movdqa xmm0, [eax] // pixels 0..7 + movdqa xmm1, [eax + esi] + pavgb xmm0, xmm1 + pshufb xmm0, xmm2 + pmaddubsw xmm0, xmm5 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edx], xmm0 + movdqu xmm0, [eax + 8] // pixels 8..15 + movdqu xmm1, [eax + esi + 8] + pavgb xmm0, xmm1 + pshufb xmm0, xmm3 + pmaddubsw xmm0, xmm6 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edx + 8], xmm0 + movdqa xmm0, [eax + 16] // pixels 16..23 + movdqa xmm1, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm0, xmm1 + pshufb xmm0, xmm4 + movdqa xmm1, kMadd21 + pmaddubsw xmm0, xmm1 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + sub ecx, 24 + movq qword ptr [edx + 16], xmm0 + lea edx, [edx + 24] + jg wloop + + pop esi + ret + } +} + +// Note that movdqa+palign may be better than movdqu. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. +__declspec(naked) __declspec(align(16)) +void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width + movdqa xmm2, kShuf01 + movdqa xmm3, kShuf11 + movdqa xmm4, kShuf21 + movdqa xmm5, kMadd01 + movdqa xmm6, kMadd11 + movdqa xmm7, kRound34 + + align 16 + wloop: + movdqa xmm0, [eax] // pixels 0..7 + movdqa xmm1, [eax + esi] + pavgb xmm1, xmm0 + pavgb xmm0, xmm1 + pshufb xmm0, xmm2 + pmaddubsw xmm0, xmm5 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edx], xmm0 + movdqu xmm0, [eax + 8] // pixels 8..15 + movdqu xmm1, [eax + esi + 8] + pavgb xmm1, xmm0 + pavgb xmm0, xmm1 + pshufb xmm0, xmm3 + pmaddubsw xmm0, xmm6 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edx + 8], xmm0 + movdqa xmm0, [eax + 16] // pixels 16..23 + movdqa xmm1, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm1, xmm0 + pavgb xmm0, xmm1 + pshufb xmm0, xmm4 + movdqa xmm1, kMadd21 + pmaddubsw xmm0, xmm1 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + sub ecx, 24 + movq qword ptr [edx + 16], xmm0 + lea edx, [edx+24] + jg wloop + + pop esi + ret + } +} + +// 3/8 point sampler + +// Scale 32 pixels to 12 +__declspec(naked) __declspec(align(16)) +void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + movdqa xmm4, kShuf38a + movdqa xmm5, kShuf38b + + align 16 + xloop: + movdqa xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5 + movdqa xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11 + lea eax, [eax + 32] + pshufb xmm0, xmm4 + pshufb xmm1, xmm5 + paddusb xmm0, xmm1 + + sub ecx, 12 + movq qword ptr [edx], xmm0 // write 12 pixels + movhlps xmm1, xmm0 + movd [edx + 8], xmm1 + lea edx, [edx + 12] + jg xloop + + ret + } +} + +// Scale 16x3 pixels to 6x1 with interpolation +__declspec(naked) 
__declspec(align(16)) +void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width + movdqa xmm2, kShufAc + movdqa xmm3, kShufAc3 + movdqa xmm4, kScaleAc33 + pxor xmm5, xmm5 + + align 16 + xloop: + movdqa xmm0, [eax] // sum up 3 rows into xmm0/1 + movdqa xmm6, [eax + esi] + movhlps xmm1, xmm0 + movhlps xmm7, xmm6 + punpcklbw xmm0, xmm5 + punpcklbw xmm1, xmm5 + punpcklbw xmm6, xmm5 + punpcklbw xmm7, xmm5 + paddusw xmm0, xmm6 + paddusw xmm1, xmm7 + movdqa xmm6, [eax + esi * 2] + lea eax, [eax + 16] + movhlps xmm7, xmm6 + punpcklbw xmm6, xmm5 + punpcklbw xmm7, xmm5 + paddusw xmm0, xmm6 + paddusw xmm1, xmm7 + + movdqa xmm6, xmm0 // 8 pixels -> 0,1,2 of xmm6 + psrldq xmm0, 2 + paddusw xmm6, xmm0 + psrldq xmm0, 2 + paddusw xmm6, xmm0 + pshufb xmm6, xmm2 + + movdqa xmm7, xmm1 // 8 pixels -> 3,4,5 of xmm6 + psrldq xmm1, 2 + paddusw xmm7, xmm1 + psrldq xmm1, 2 + paddusw xmm7, xmm1 + pshufb xmm7, xmm3 + paddusw xmm6, xmm7 + + pmulhuw xmm6, xmm4 // divide by 9,9,6, 9,9,6 + packuswb xmm6, xmm6 + + sub ecx, 6 + movd [edx], xmm6 // write 6 pixels + psrlq xmm6, 16 + movd [edx + 2], xmm6 + lea edx, [edx + 6] + jg xloop + + pop esi + ret + } +} + +// Scale 16x2 pixels to 6x1 with interpolation +__declspec(naked) __declspec(align(16)) +void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width + movdqa xmm2, kShufAb0 + movdqa xmm3, kShufAb1 + movdqa xmm4, kShufAb2 + movdqa xmm5, kScaleAb2 + + align 16 + xloop: + movdqa xmm0, [eax] // average 2 rows into xmm0 + pavgb xmm0, [eax + esi] + lea eax, [eax + 16] + + movdqa xmm1, xmm0 // 16 pixels -> 0,1,2,3,4,5 of xmm1 + pshufb xmm1, xmm2 + movdqa xmm6, xmm0 + pshufb xmm6, xmm3 + paddusw xmm1, xmm6 + pshufb xmm0, xmm4 + paddusw xmm1, xmm0 + + pmulhuw xmm1, xmm5 // divide by 3,3,2, 3,3,2 + packuswb xmm1, xmm1 + + sub ecx, 6 + movd [edx], xmm1 // write 6 pixels + psrlq xmm1, 16 + movd [edx + 2], xmm1 + lea edx, [edx + 6] + jg xloop + + pop esi + ret + } +} + +// Reads 16xN bytes and produces 16 shorts at a time. +// TODO(fbarchard): Make this handle 4xN bytes for any width ARGB. 
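As the comment above notes, ScaleAddRows_SSE2 sums src_height rows of bytes into 16-bit column accumulators across src_width. A scalar sketch of the same accumulation (hypothetical helper name, not part of this patch; the SIMD path uses saturating paddusw, which the clamp below imitates):

static void ScaleAddRows_Ref(const uint8* src_ptr, ptrdiff_t src_stride,
                             uint16* dst_ptr, int src_width, int src_height) {
  int x, y;
  for (x = 0; x < src_width; ++x) {
    const uint8* s = src_ptr + x;
    unsigned int sum = 0;
    for (y = 0; y < src_height; ++y) {
      sum += s[0];        // add this column's byte from each row
      s += src_stride;
    }
    dst_ptr[x] = (uint16)(sum < 65535u ? sum : 65535u);  // saturate like paddusw
  }
}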
+__declspec(naked) __declspec(align(16)) +void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint16* dst_ptr, int src_width, + int src_height) { + __asm { + push esi + push edi + push ebx + push ebp + mov esi, [esp + 16 + 4] // src_ptr + mov edx, [esp + 16 + 8] // src_stride + mov edi, [esp + 16 + 12] // dst_ptr + mov ecx, [esp + 16 + 16] // dst_width + mov ebx, [esp + 16 + 20] // height + pxor xmm4, xmm4 + dec ebx + + align 16 + xloop: + // first row + movdqa xmm0, [esi] + lea eax, [esi + edx] + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm4 + punpckhbw xmm1, xmm4 + lea esi, [esi + 16] + mov ebp, ebx + test ebp, ebp + je ydone + + // sum remaining rows + align 16 + yloop: + movdqa xmm2, [eax] // read 16 pixels + lea eax, [eax + edx] // advance to next row + movdqa xmm3, xmm2 + punpcklbw xmm2, xmm4 + punpckhbw xmm3, xmm4 + paddusw xmm0, xmm2 // sum 16 words + paddusw xmm1, xmm3 + sub ebp, 1 + jg yloop + + align 16 + ydone: + movdqa [edi], xmm0 + movdqa [edi + 16], xmm1 + lea edi, [edi + 32] + + sub ecx, 16 + jg xloop + + pop ebp + pop ebx + pop edi + pop esi + ret + } +} + +// Bilinear column filtering. SSSE3 version. +// TODO(fbarchard): Port to Neon + +__declspec(naked) __declspec(align(16)) +void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx) { + __asm { + push ebx + push esi + push edi + mov edi, [esp + 12 + 4] // dst_ptr + mov esi, [esp + 12 + 8] // src_ptr + mov ecx, [esp + 12 + 12] // dst_width + movd xmm2, [esp + 12 + 16] // x + movd xmm3, [esp + 12 + 20] // dx + mov eax, 0x04040000 // shuffle to line up fractions with pixel. + movd xmm5, eax + pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction. + psrlw xmm6, 9 + pextrw eax, xmm2, 1 // get x0 integer. preroll + sub ecx, 2 + jl xloop29 + + movdqa xmm0, xmm2 // x1 = x0 + dx + paddd xmm0, xmm3 + punpckldq xmm2, xmm0 // x0 x1 + punpckldq xmm3, xmm3 // dx dx + paddd xmm3, xmm3 // dx * 2, dx * 2 + pextrw edx, xmm2, 3 // get x1 integer. preroll + + // 2 Pixel loop. + align 4 + xloop2: + movdqa xmm1, xmm2 // x0, x1 fractions. + paddd xmm2, xmm3 // x += dx + movzx ebx, word ptr [esi + eax] // 2 source x0 pixels + movd xmm0, ebx + psrlw xmm1, 9 // 7 bit fractions. + movzx ebx, word ptr [esi + edx] // 2 source x1 pixels + movd xmm4, ebx + pshufb xmm1, xmm5 // 0011 + punpcklwd xmm0, xmm4 + pxor xmm1, xmm6 // 0..7f and 7f..0 + pmaddubsw xmm0, xmm1 // 16 bit, 2 pixels. + pextrw eax, xmm2, 1 // get x0 integer. next iteration. + pextrw edx, xmm2, 3 // get x1 integer. next iteration. + psrlw xmm0, 7 // 8.7 fixed point to low 8 bits. + packuswb xmm0, xmm0 // 8 bits, 2 pixels. + movd ebx, xmm0 + mov [edi], bx + lea edi, [edi + 2] + sub ecx, 2 // 2 pixels + jge xloop2 + + align 4 + xloop29: + + add ecx, 2 - 1 + jl xloop99 + + // 1 pixel remainder + movzx ebx, word ptr [esi + eax] // 2 source x0 pixels + movd xmm0, ebx + psrlw xmm2, 9 // 7 bit fractions. + pshufb xmm2, xmm5 // 0011 + pxor xmm2, xmm6 // 0..7f and 7f..0 + pmaddubsw xmm0, xmm2 // 16 bit + psrlw xmm0, 7 // 8.7 fixed point to low 8 bits. + packuswb xmm0, xmm0 // 8 bits + movd ebx, xmm0 + mov [edi], bl + + align 16 + xloop99: + + pop edi + pop esi + pop ebx + ret + } +} + +// Reads 16 pixels, duplicates them and writes 32 pixels. +// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. 
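The 2x column upscale that follows writes every source byte twice; the x and dx arguments are unused in this special case, and dst_width is assumed even (the SSE2 loop consumes 32 output bytes per iteration). A scalar sketch (hypothetical helper name, not part of this patch):

static void ScaleColsUp2_Ref(uint8* dst_ptr, const uint8* src_ptr,
                             int dst_width, int x, int dx) {
  int j;
  (void)x;   // unused for the 2x special case
  (void)dx;
  for (j = 0; j < dst_width; j += 2) {
    dst_ptr[j + 0] = src_ptr[j / 2];   // duplicate each source byte
    dst_ptr[j + 1] = src_ptr[j / 2];
  }
}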
+__declspec(naked) __declspec(align(16)) +void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int /* x */, int /* dx */) { + __asm { + mov edx, [esp + 4] // dst_ptr + mov eax, [esp + 8] // src_ptr + mov ecx, [esp + 12] // dst_width + + align 16 + wloop: + movdqa xmm0, [eax] + lea eax, [eax + 16] + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm0 + punpckhbw xmm1, xmm1 + sub ecx, 32 + movdqa [edx], xmm0 + movdqa [edx + 16], xmm1 + lea edx, [edx + 32] + jg wloop + + ret + } +} + +// Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6) +// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. +__declspec(naked) __declspec(align(16)) +void ScaleARGBRowDown2_SSE2(const uint8* src_argb, + ptrdiff_t /* src_stride */, + uint8* dst_argb, int dst_width) { + __asm { + mov eax, [esp + 4] // src_argb + // src_stride ignored + mov edx, [esp + 12] // dst_argb + mov ecx, [esp + 16] // dst_width + + align 16 + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + shufps xmm0, xmm1, 0xdd + sub ecx, 4 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jg wloop + + ret + } +} + +// Blends 8x1 rectangle to 4x1. +// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. +__declspec(naked) __declspec(align(16)) +void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, + ptrdiff_t /* src_stride */, + uint8* dst_argb, int dst_width) { + __asm { + mov eax, [esp + 4] // src_argb + // src_stride ignored + mov edx, [esp + 12] // dst_argb + mov ecx, [esp + 16] // dst_width + + align 16 + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + movdqa xmm2, xmm0 + shufps xmm0, xmm1, 0x88 // even pixels + shufps xmm2, xmm1, 0xdd // odd pixels + pavgb xmm0, xmm2 + sub ecx, 4 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jg wloop + + ret + } +} + +// Blends 8x2 rectangle to 4x1. +// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. +__declspec(naked) __declspec(align(16)) +void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, + uint8* dst_argb, int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_argb + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // dst_width + + align 16 + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + esi] + movdqa xmm3, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm0, xmm2 // average rows + pavgb xmm1, xmm3 + movdqa xmm2, xmm0 // average columns (8 to 4 pixels) + shufps xmm0, xmm1, 0x88 // even pixels + shufps xmm2, xmm1, 0xdd // odd pixels + pavgb xmm0, xmm2 + sub ecx, 4 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jg wloop + + pop esi + ret + } +} + +// Reads 4 pixels at a time. +// Alignment requirement: dst_argb 16 byte aligned. 
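ScaleARGBRowDownEven_SSE2 below copies every src_stepx-th ARGB pixel (4 bytes each); only the Box variant reads a second row via src_stride. A scalar sketch of the selection, assuming the library's uint32 typedef and 4-byte pixel alignment (hypothetical helper name, not part of this patch):

static void ScaleARGBRowDownEven_Ref(const uint8* src_argb,
                                     ptrdiff_t src_stride, int src_stepx,
                                     uint8* dst_argb, int dst_width) {
  const uint32* src = (const uint32*)(src_argb);
  uint32* dst = (uint32*)(dst_argb);
  int x;
  (void)src_stride;   // unused by the non-box variant
  for (x = 0; x < dst_width; ++x) {
    dst[x] = src[x * src_stepx];   // pick every src_stepx-th ARGB pixel
  }
}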
+__declspec(naked) __declspec(align(16)) +void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, int dst_width) { + __asm { + push ebx + push edi + mov eax, [esp + 8 + 4] // src_argb + // src_stride ignored + mov ebx, [esp + 8 + 12] // src_stepx + mov edx, [esp + 8 + 16] // dst_argb + mov ecx, [esp + 8 + 20] // dst_width + lea ebx, [ebx * 4] + lea edi, [ebx + ebx * 2] + + align 16 + wloop: + movd xmm0, [eax] + movd xmm1, [eax + ebx] + punpckldq xmm0, xmm1 + movd xmm2, [eax + ebx * 2] + movd xmm3, [eax + edi] + lea eax, [eax + ebx * 4] + punpckldq xmm2, xmm3 + punpcklqdq xmm0, xmm2 + sub ecx, 4 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jg wloop + + pop edi + pop ebx + ret + } +} + +// Blends four 2x2 to 4x1. +// Alignment requirement: dst_argb 16 byte aligned. +__declspec(naked) __declspec(align(16)) +void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, int dst_width) { + __asm { + push ebx + push esi + push edi + mov eax, [esp + 12 + 4] // src_argb + mov esi, [esp + 12 + 8] // src_stride + mov ebx, [esp + 12 + 12] // src_stepx + mov edx, [esp + 12 + 16] // dst_argb + mov ecx, [esp + 12 + 20] // dst_width + lea esi, [eax + esi] // row1 pointer + lea ebx, [ebx * 4] + lea edi, [ebx + ebx * 2] + + align 16 + wloop: + movq xmm0, qword ptr [eax] // row0 4 pairs + movhps xmm0, qword ptr [eax + ebx] + movq xmm1, qword ptr [eax + ebx * 2] + movhps xmm1, qword ptr [eax + edi] + lea eax, [eax + ebx * 4] + movq xmm2, qword ptr [esi] // row1 4 pairs + movhps xmm2, qword ptr [esi + ebx] + movq xmm3, qword ptr [esi + ebx * 2] + movhps xmm3, qword ptr [esi + edi] + lea esi, [esi + ebx * 4] + pavgb xmm0, xmm2 // average rows + pavgb xmm1, xmm3 + movdqa xmm2, xmm0 // average columns (8 to 4 pixels) + shufps xmm0, xmm1, 0x88 // even pixels + shufps xmm2, xmm1, 0xdd // odd pixels + pavgb xmm0, xmm2 + sub ecx, 4 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jg wloop + + pop edi + pop esi + pop ebx + ret + } +} + +// Column scaling unfiltered. SSE2 version. +__declspec(naked) __declspec(align(16)) +void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) { + __asm { + push edi + push esi + mov edi, [esp + 8 + 4] // dst_argb + mov esi, [esp + 8 + 8] // src_argb + mov ecx, [esp + 8 + 12] // dst_width + movd xmm2, [esp + 8 + 16] // x + movd xmm3, [esp + 8 + 20] // dx + + pshufd xmm2, xmm2, 0 // x0 x0 x0 x0 + pshufd xmm0, xmm3, 0x11 // dx 0 dx 0 + paddd xmm2, xmm0 + paddd xmm3, xmm3 // 0, 0, 0, dx * 2 + pshufd xmm0, xmm3, 0x05 // dx * 2, dx * 2, 0, 0 + paddd xmm2, xmm0 // x3 x2 x1 x0 + paddd xmm3, xmm3 // 0, 0, 0, dx * 4 + pshufd xmm3, xmm3, 0 // dx * 4, dx * 4, dx * 4, dx * 4 + + pextrw eax, xmm2, 1 // get x0 integer. + pextrw edx, xmm2, 3 // get x1 integer. + + cmp ecx, 0 + jle xloop99 + sub ecx, 4 + jl xloop49 + + // 4 Pixel loop. + align 4 + xloop4: + movd xmm0, [esi + eax * 4] // 1 source x0 pixels + movd xmm1, [esi + edx * 4] // 1 source x1 pixels + pextrw eax, xmm2, 5 // get x2 integer. + pextrw edx, xmm2, 7 // get x3 integer. + paddd xmm2, xmm3 // x += dx + punpckldq xmm0, xmm1 // x0 x1 + + movd xmm1, [esi + eax * 4] // 1 source x2 pixels + movd xmm4, [esi + edx * 4] // 1 source x3 pixels + pextrw eax, xmm2, 1 // get x0 integer. next iteration. + pextrw edx, xmm2, 3 // get x1 integer. next iteration. 
+ punpckldq xmm1, xmm4 // x2 x3 + punpcklqdq xmm0, xmm1 // x0 x1 x2 x3 + sub ecx, 4 // 4 pixels + movdqu [edi], xmm0 + lea edi, [edi + 16] + jge xloop4 + + align 4 + xloop49: + test ecx, 2 + je xloop29 + + // 2 Pixels. + movd xmm0, [esi + eax * 4] // 1 source x0 pixels + movd xmm1, [esi + edx * 4] // 1 source x1 pixels + pextrw eax, xmm2, 5 // get x2 integer. + punpckldq xmm0, xmm1 // x0 x1 + + movq qword ptr [edi], xmm0 + lea edi, [edi + 8] + + xloop29: + test ecx, 1 + je xloop99 + + // 1 Pixels. + movd xmm0, [esi + eax * 4] // 1 source x2 pixels + movd dword ptr [edi], xmm0 + align 4 + xloop99: + + pop esi + pop edi + ret + } +} + +// Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version. +// TODO(fbarchard): Port to Neon + +// Shuffle table for arranging 2 pixels into pairs for pmaddubsw +static uvec8 kShuffleColARGB = { + 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel + 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel +}; + +// Shuffle table for duplicating 2 fractions into 8 bytes each +static uvec8 kShuffleFractions = { + 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, +}; + +__declspec(naked) __declspec(align(16)) +void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) { + __asm { + push esi + push edi + mov edi, [esp + 8 + 4] // dst_argb + mov esi, [esp + 8 + 8] // src_argb + mov ecx, [esp + 8 + 12] // dst_width + movd xmm2, [esp + 8 + 16] // x + movd xmm3, [esp + 8 + 20] // dx + movdqa xmm4, kShuffleColARGB + movdqa xmm5, kShuffleFractions + pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction. + psrlw xmm6, 9 + pextrw eax, xmm2, 1 // get x0 integer. preroll + sub ecx, 2 + jl xloop29 + + movdqa xmm0, xmm2 // x1 = x0 + dx + paddd xmm0, xmm3 + punpckldq xmm2, xmm0 // x0 x1 + punpckldq xmm3, xmm3 // dx dx + paddd xmm3, xmm3 // dx * 2, dx * 2 + pextrw edx, xmm2, 3 // get x1 integer. preroll + + // 2 Pixel loop. + align 4 + xloop2: + movdqa xmm1, xmm2 // x0, x1 fractions. + paddd xmm2, xmm3 // x += dx + movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels + psrlw xmm1, 9 // 7 bit fractions. + movhps xmm0, qword ptr [esi + edx * 4] // 2 source x1 pixels + pshufb xmm1, xmm5 // 0000000011111111 + pshufb xmm0, xmm4 // arrange pixels into pairs + pxor xmm1, xmm6 // 0..7f and 7f..0 + pmaddubsw xmm0, xmm1 // argb_argb 16 bit, 2 pixels. + pextrw eax, xmm2, 1 // get x0 integer. next iteration. + pextrw edx, xmm2, 3 // get x1 integer. next iteration. + psrlw xmm0, 7 // argb 8.7 fixed point to low 8 bits. + packuswb xmm0, xmm0 // argb_argb 8 bits, 2 pixels. + movq qword ptr [edi], xmm0 + lea edi, [edi + 8] + sub ecx, 2 // 2 pixels + jge xloop2 + + align 4 + xloop29: + + add ecx, 2 - 1 + jl xloop99 + + // 1 pixel remainder + psrlw xmm2, 9 // 7 bit fractions. + movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels + pshufb xmm2, xmm5 // 00000000 + pshufb xmm0, xmm4 // arrange pixels into pairs + pxor xmm2, xmm6 // 0..7f and 7f..0 + pmaddubsw xmm0, xmm2 // argb 16 bit, 1 pixel. + psrlw xmm0, 7 + packuswb xmm0, xmm0 // argb 8 bits, 1 pixel. + movd [edi], xmm0 + + align 16 + xloop99: + + pop edi + pop esi + ret + } +} + +// Reads 4 pixels, duplicates them and writes 8 pixels. +// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. 
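The ARGB column scalers above step a 16.16 fixed-point x coordinate: the integer part (x >> 16) selects the source pixel, and in the filtered version the upper bits of the fraction drive the 7-bit blend. Before the 2x duplication special case below, a scalar sketch of the unfiltered variant (hypothetical helper name, not part of this patch; assumes the library's uint32 typedef):

static void ScaleARGBCols_Ref(uint8* dst_argb, const uint8* src_argb,
                              int dst_width, int x, int dx) {
  const uint32* src = (const uint32*)(src_argb);
  uint32* dst = (uint32*)(dst_argb);
  int j;
  for (j = 0; j < dst_width; ++j) {
    dst[j] = src[x >> 16];   // integer part of 16.16 x picks the source pixel
    x += dx;                 // advance by dx per destination pixel
  }
}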
+__declspec(naked) __declspec(align(16)) +void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb, + int dst_width, int /* x */, int /* dx */) { + __asm { + mov edx, [esp + 4] // dst_argb + mov eax, [esp + 8] // src_argb + mov ecx, [esp + 12] // dst_width + + align 16 + wloop: + movdqa xmm0, [eax] + lea eax, [eax + 16] + movdqa xmm1, xmm0 + punpckldq xmm0, xmm0 + punpckhdq xmm1, xmm1 + sub ecx, 8 + movdqa [edx], xmm0 + movdqa [edx + 16], xmm1 + lea edx, [edx + 32] + jg wloop + + ret + } +} + +#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif
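For callers of these column scalers, x and dx are 16.16 fixed-point values. One simple way to derive a step value, shown only as an illustrative convention and not as code from this patch (the library's own setup may center samples differently):

#include <stdint.h>

// Illustrative helper (hypothetical): 16.16 source step for scaling a
// src_width-wide row to dst_width pixels.
static int FixedPointStep16_16(int src_width, int dst_width) {
  return (int)(((int64_t)src_width << 16) / dst_width);
}

// Usage sketch: start x at 0 (left edge) and advance by dx per output pixel,
// e.g. ScaleARGBCols_SSE2(dst_argb, src_argb, dst_width, 0,
//                         FixedPointStep16_16(src_width, dst_width));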