From bcb69c8eb6f6610dcc7f98d9943a34b2d37f2ba9 Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Wed, 29 Apr 2026 16:54:54 -0700 Subject: [PATCH] [libyuv] Remove all x86 SSE optimizations Removed all SSE functions, macros, dispatching logic, and related unit tests across the repository to reduce code size and complexity. Left cpuid detection intact. Supported architectures like AVX2, NEON, SVE, etc. are unaffected. R=rrwinterton@gmail.com Bug: None Test: Build and run libyuv_unittest Change-Id: Id19608dba35b79c4c8fc31f920a6a968883d300f --- include/libyuv/compare_row.h | 24 - include/libyuv/rotate_row.h | 53 - include/libyuv/row.h | 1138 --------- include/libyuv/scale_row.h | 335 --- source/compare.cc | 22 +- source/compare_gcc.cc | 232 -- source/compare_win.cc | 93 - source/convert.cc | 214 +- source/convert_argb.cc | 383 +-- source/convert_from.cc | 16 +- source/convert_from_argb.cc | 228 +- source/planar_functions.cc | 255 +- source/rotate.cc | 21 +- source/rotate_any.cc | 9 - source/rotate_argb.cc | 10 +- source/rotate_gcc.cc | 413 --- source/rotate_win.cc | 221 -- source/row_any.cc | 347 +-- source/row_common.cc | 36 +- source/row_gcc.cc | 4342 +------------------------------- source/row_rvv.cc | 8 - source/row_win.cc | 12 - source/scale.cc | 119 +- source/scale_any.cc | 42 +- source/scale_argb.cc | 51 +- source/scale_common.cc | 12 +- source/scale_gcc.cc | 1832 +------------- source/scale_uv.cc | 50 +- source/scale_win.cc | 917 ------- unit_test/compare_test.cc | 16 +- unit_test/convert_argb_test.cc | 7 - unit_test/cpu_test.cc | 7 - unit_test/planar_test.cc | 16 +- unit_test/rotate_test.cc | 6 +- unit_test/scale_plane_test.cc | 5 +- util/psnr.cc | 122 - 36 files changed, 444 insertions(+), 11170 deletions(-) diff --git a/include/libyuv/compare_row.h b/include/libyuv/compare_row.h index a08734e90..231a0177b 100644 --- a/include/libyuv/compare_row.h +++ b/include/libyuv/compare_row.h @@ -19,14 +19,6 @@ namespace libyuv { extern "C" { #endif -// The following are available for Visual C and GCC: -#if !defined(LIBYUV_DISABLE_X86) && \ - ((defined(__x86_64__) && !defined(LIBYUV_ENABLE_ROWWIN)) || \ - defined(__i386__) || defined(_M_IX86)) -#define HAS_HASHDJB2_SSE41 -#define HAS_SUMSQUAREERROR_SSE2 -#define HAS_HAMMINGDISTANCE_SSE42 -#endif // The following are available for Visual C and clangcl 32 bit: #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) && \ @@ -36,12 +28,6 @@ extern "C" { #define HAS_SUMSQUAREERROR_AVX2 #endif -// The following are available for GCC and clangcl: -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(__x86_64__) || defined(__i386__)) && \ - !defined(LIBYUV_ENABLE_ROWWIN) -#define HAS_HAMMINGDISTANCE_SSSE3 -#endif // The following are available for GCC and clangcl: #if !defined(LIBYUV_DISABLE_X86) && defined(CLANG_HAS_AVX2) && \ @@ -68,12 +54,6 @@ extern "C" { uint32_t HammingDistance_C(const uint8_t* src_a, const uint8_t* src_b, int count); -uint32_t HammingDistance_SSE42(const uint8_t* src_a, - const uint8_t* src_b, - int count); -uint32_t HammingDistance_SSSE3(const uint8_t* src_a, - const uint8_t* src_b, - int count); uint32_t HammingDistance_AVX2(const uint8_t* src_a, const uint8_t* src_b, int count); @@ -86,9 +66,6 @@ uint32_t HammingDistance_NEON_DotProd(const uint8_t* src_a, uint32_t SumSquareError_C(const uint8_t* src_a, const uint8_t* src_b, int count); -uint32_t SumSquareError_SSE2(const uint8_t* src_a, - const uint8_t* src_b, - int count); uint32_t SumSquareError_AVX2(const uint8_t* src_a, const uint8_t* src_b, int count); @@ -100,7 +77,6 @@ uint32_t SumSquareError_NEON_DotProd(const uint8_t* src_a, int count); uint32_t HashDjb2_C(const uint8_t* src, int count, uint32_t seed); -uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed); uint32_t HashDjb2_AVX2(const uint8_t* src, int count, uint32_t seed); uint32_t HashDjb2_NEON(const uint8_t* src, int count, uint32_t seed); diff --git a/include/libyuv/rotate_row.h b/include/libyuv/rotate_row.h index 49f8a44bf..b84764657 100644 --- a/include/libyuv/rotate_row.h +++ b/include/libyuv/rotate_row.h @@ -20,28 +20,14 @@ extern "C" { #endif // The following are available for Visual C 32 bit: -// TODO - port to clangcl on rotate_win -#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) && \ - !defined(__clang__) -#define HAS_TRANSPOSEWX8_SSSE3 -#define HAS_TRANSPOSEUVWX8_SSE2 -#endif // The following are available for GCC 32 or 64 bit: #if !defined(LIBYUV_DISABLE_X86) && \ (defined(__i386__) || defined(__x86_64__)) && \ !defined(LIBYUV_ENABLE_ROWWIN) -#define HAS_TRANSPOSEWX8_SSSE3 -#define HAS_TRANSPOSE4X4_32_SSE2 #define HAS_TRANSPOSE4X4_32_AVX2 #endif -// The following are available for 64 bit GCC: -#if !defined(LIBYUV_DISABLE_X86) && defined(__x86_64__) && \ - !defined(LIBYUV_ENABLE_ROWWIN) -#define HAS_TRANSPOSEWX8_FAST_SSSE3 -#define HAS_TRANSPOSEUVWX8_SSE2 -#endif #if !defined(LIBYUV_DISABLE_NEON) && \ (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__)) @@ -98,16 +84,6 @@ void TransposeWxH_SME(const uint8_t* src, int dst_stride, int width, int height); -void TransposeWx8_SSSE3(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width); -void TransposeWx8_Fast_SSSE3(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width); void TransposeWx16_LSX(const uint8_t* src, int src_stride, uint8_t* dst, @@ -124,16 +100,6 @@ void TransposeWx16_Any_NEON(const uint8_t* src, uint8_t* dst, int dst_stride, int width); -void TransposeWx8_Any_SSSE3(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width); -void TransposeWx8_Fast_Any_SSSE3(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width); void TransposeWx16_Any_LSX(const uint8_t* src, int src_stride, uint8_t* dst, @@ -163,13 +129,6 @@ void TransposeUVWx16_C(const uint8_t* src, uint8_t* dst_b, int dst_stride_b, int width); -void TransposeUVWx8_SSE2(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int width); void TransposeUVWx8_NEON(const uint8_t* src, int src_stride, uint8_t* dst_a, @@ -193,13 +152,6 @@ void TransposeUVWx16_LSX(const uint8_t* src, int dst_stride_b, int width); -void TransposeUVWx8_Any_SSE2(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int width); void TransposeUVWx8_Any_NEON(const uint8_t* src, int src_stride, uint8_t* dst_a, @@ -239,11 +191,6 @@ void Transpose4x4_32_NEON(const uint8_t* src, int dst_stride, int width); -void Transpose4x4_32_SSE2(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width); void Transpose4x4_32_AVX2(const uint8_t* src, int src_stride, diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 40272cf5a..b8e3294ca 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -36,85 +36,14 @@ extern "C" { // The following are available on clang x86 platforms: #if defined(USE_ROW_GCC) // Conversions: -#define HAS_ARGB1555TOARGBROW_SSE2 -#define HAS_ARGB4444TOARGBROW_SSE2 -#define HAS_ARGBEXTRACTALPHAROW_SSE2 #define HAS_ARGBSETROW_X86 -#define HAS_ARGBSHUFFLEROW_SSSE3 -#define HAS_ARGBTOARGB1555ROW_SSE2 -#define HAS_ARGBTOARGB4444ROW_SSE2 -#define HAS_ARGBTORAWROW_SSSE3 -#define HAS_ARGBTORGB24ROW_SSSE3 -#define HAS_ARGBTORGB565DITHERROW_SSE2 -#define HAS_ARGBTORGB565ROW_SSE2 #define HAS_COPYROW_ERMS -#define HAS_COPYROW_SSE2 -#define HAS_H422TOARGBROW_SSSE3 -#define HAS_HALFFLOATROW_SSE2 -#define HAS_I422TOARGB1555ROW_SSSE3 -#define HAS_I422TOARGB4444ROW_SSSE3 -#define HAS_I422TOARGBROW_SSSE3 -#define HAS_I422TORGB24ROW_SSSE3 -#define HAS_I422TORGB565ROW_SSSE3 -#define HAS_I422TORGBAROW_SSSE3 -#define HAS_I422TOUYVYROW_SSE2 -#define HAS_I422TOYUY2ROW_SSE2 -#define HAS_I444TOARGBROW_SSSE3 -#define HAS_I444TORGB24ROW_SSSE3 -#define HAS_INTERPOLATEROW_SSSE3 -#define HAS_J400TOARGBROW_SSE2 -#define HAS_J422TOARGBROW_SSSE3 -#define HAS_MERGEUVROW_SSE2 -#define HAS_MIRRORROW_SSSE3 -#define HAS_MIRRORSPLITUVROW_SSSE3 -#define HAS_NV12TOARGBROW_SSSE3 -#define HAS_NV12TORGB24ROW_SSSE3 -#define HAS_NV12TORGB565ROW_SSSE3 -#define HAS_NV21TOARGBROW_SSSE3 -#define HAS_NV21TORGB24ROW_SSSE3 -#define HAS_RAWTOARGBROW_SSSE3 -#define HAS_RAWTORGB24ROW_SSSE3 -#define HAS_RGB24TOARGBROW_SSSE3 -#define HAS_RGB565TOARGBROW_SSE2 #define HAS_SETROW_ERMS #define HAS_SETROW_X86 -#define HAS_SPLITUVROW_SSE2 -#define HAS_UYVYTOARGBROW_SSSE3 -#define HAS_UYVYTOUV422ROW_SSE2 -#define HAS_UYVYTOUVROW_SSE2 -#define HAS_UYVYTOYROW_SSE2 -#define HAS_YUY2TOARGBROW_SSSE3 -#define HAS_YUY2TOUV422ROW_SSE2 -#define HAS_YUY2TOUVROW_SSE2 -#define HAS_YUY2TOYROW_SSE2 // Effects: -#define HAS_ARGBADDROW_SSE2 -#define HAS_ARGBAFFINEROW_SSE2 -#define HAS_ARGBBLENDROW_SSSE3 -#define HAS_ARGBCOLORMATRIXROW_SSSE3 #define HAS_ARGBCOLORTABLEROW_X86 -#define HAS_ARGBCOPYALPHAROW_SSE2 -#define HAS_ARGBCOPYYTOALPHAROW_SSE2 -#define HAS_ARGBGRAYROW_SSSE3 -#define HAS_ARGBLUMACOLORTABLEROW_SSSE3 -#define HAS_ARGBMIRRORROW_SSE2 -// TODO: Re-enable once rounding behaviour is fixed. -// #define HAS_ARGBMULTIPLYROW_SSE2 -#define HAS_ARGBPOLYNOMIALROW_SSE2 -#define HAS_ARGBQUANTIZEROW_SSE2 -#define HAS_ARGBSEPIAROW_SSSE3 -#define HAS_ARGBSHADEROW_SSE2 -#define HAS_ARGBSUBTRACTROW_SSE2 -#define HAS_BLENDPLANEROW_SSSE3 -#define HAS_COMPUTECUMULATIVESUMROW_SSE2 -#define HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 #define HAS_RGBCOLORTABLEROW_X86 -#define HAS_SOBELROW_SSE2 -#define HAS_SOBELTOPLANEROW_SSE2 -#define HAS_SOBELXROW_SSE2 -#define HAS_SOBELXYROW_SSE2 -#define HAS_SOBELYROW_SSE2 // The following functions fail on gcc/clang 32 bit with fpic and framepointer. // caveat: clangcl uses row_win.cc which works. @@ -123,8 +52,6 @@ extern "C" { !defined(LIBYUV_ENABLE_ROWWIN) // TODO(fbarchard): fix build error on android_full_debug=1 // https://code.google.com/p/libyuv/issues/detail?id=517 -#define HAS_I422ALPHATOARGBROW_SSSE3 -#define HAS_I444ALPHATOARGBROW_SSSE3 #endif #if (defined(__x86_64__) || !defined(__pic__) || defined(__clang__) || \ defined(_MSC_VER)) && \ @@ -200,77 +127,11 @@ extern "C" { #if !defined(LIBYUV_DISABLE_X86) && defined(USE_ROW_GCC) && \ (defined(__x86_64__) || defined(__i386__)) && \ !defined(LIBYUV_ENABLE_ROWWIN) -#define HAS_AB64TOARGBROW_SSSE3 -#define HAS_ABGRTOAR30ROW_SSSE3 -#define HAS_ABGRTOYJROW_SSSE3 -#define HAS_AR64TOARGBROW_SSSE3 -#define HAS_ARGBATTENUATEROW_SSSE3 -#define HAS_ARGBTOAB64ROW_SSSE3 -#define HAS_ARGBTOAR30ROW_SSSE3 -#define HAS_ARGBTOAR64ROW_SSSE3 -#define HAS_ARGBTOUV444ROW_SSSE3 -#define HAS_ARGBTOUVJ444ROW_SSSE3 -#define HAS_ARGBUNATTENUATEROW_SSE2 -#define HAS_CONVERT16TO8ROW_SSSE3 -#define HAS_CONVERT8TO16ROW_SSE2 -#define HAS_DETILEROW_16_SSE2 -#define HAS_DETILEROW_SSE2 -#define HAS_DETILESPLITUVROW_SSSE3 -#define HAS_DETILETOYUY2_SSE2 -#define HAS_HALFMERGEUVROW_SSSE3 -#define HAS_I210TOAR30ROW_SSSE3 -#define HAS_I210TOARGBROW_SSSE3 -#define HAS_I212TOAR30ROW_SSSE3 -#define HAS_I212TOARGBROW_SSSE3 -#define HAS_I400TOARGBROW_SSE2 -#define HAS_I410TOAR30ROW_SSSE3 -#define HAS_I410TOARGBROW_SSSE3 -#define HAS_I422TOAR30ROW_SSSE3 -#define HAS_MERGEARGBROW_SSE2 -#define HAS_MERGERGBROW_SSSE3 -#define HAS_MERGEXRGBROW_SSE2 -#define HAS_MIRRORUVROW_SSSE3 -#define HAS_NV21TOYUV24ROW_SSSE3 -#define HAS_P210TOAR30ROW_SSSE3 -#define HAS_P210TOARGBROW_SSSE3 -#define HAS_P410TOAR30ROW_SSSE3 -#define HAS_P410TOARGBROW_SSSE3 #define HAS_RAWTOARGBROW_AVX2 -#define HAS_RAWTORGBAROW_SSSE3 -#define HAS_RGB24MIRRORROW_SSSE3 -#define HAS_RGBATOYJROW_SSSE3 -#define HAS_SPLITARGBROW_SSE2 -#define HAS_SPLITARGBROW_SSSE3 -#define HAS_SPLITRGBROW_SSE41 -#define HAS_SPLITRGBROW_SSSE3 -#define HAS_SPLITXRGBROW_SSE2 -#define HAS_SPLITXRGBROW_SSSE3 -#define HAS_SWAPUVROW_SSSE3 -#define HAS_YUY2TONVUVROW_SSE2 // TODO: port row_win to use 8 bit coefficients. -#define HAS_ARGBTOYJROW_SSSE3 -#define HAS_ARGBTOYROW_SSSE3 -#define HAS_ARGBTOYMATRIXROW_SSSE3 -#define HAS_BGRATOYROW_SSSE3 -#define HAS_ABGRTOYROW_SSSE3 -#define HAS_RGBATOYROW_SSSE3 // TODO: adjust row_win to use 8 bit negative coefficients. -#define HAS_ABGRTOUVJROW_SSSE3 -#define HAS_ARGBTOUVJROW_SSSE3 -#define HAS_ABGRTOUVROW_SSSE3 -#define HAS_ARGBTOUVROW_SSSE3 -#define HAS_BGRATOUVROW_SSSE3 -#define HAS_RGBATOUVROW_SSSE3 -#define HAS_ARGBTOUVMATRIXROW_SSSE3 -#define HAS_ARGBTOUV444MATRIXROW_SSSE3 -#if defined(__x86_64__) || !defined(__pic__) -// TODO(fbarchard): fix build error on android_full_debug=1 -// https://code.google.com/p/libyuv/issues/detail?id=517 -#define HAS_I210ALPHATOARGBROW_SSSE3 -#define HAS_I410ALPHATOARGBROW_SSSE3 -#endif #endif // The following are available for AVX2 gcc/clang x86 platforms: @@ -1753,30 +1614,22 @@ void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width); void ARGBToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width); void ABGRToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width); void ARGBToYRow_AVX512BW(const uint8_t* src_argb, uint8_t* dst_y, int width); -void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width); void ARGBToYJRow_AVX512BW(const uint8_t* src_argb, uint8_t* dst_y, int width); void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width); void ARGBToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width); void ABGRToYRow_AVX512BW(const uint8_t* src_abgr, uint8_t* dst_y, int width); void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width); -void ABGRToYJRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width); void ABGRToYJRow_AVX512BW(const uint8_t* src_abgr, uint8_t* dst_y, int width); void ABGRToYJRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width); void ABGRToYJRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width); void ABGRToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width); void RGBAToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RGBAToYJRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width); void RGBAToYJRow_AVX512BW(const uint8_t* src_rgba, uint8_t* dst_y, int width); void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width); -void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width); void BGRAToYRow_AVX512BW(const uint8_t* src_bgra, uint8_t* dst_y, int width); void BGRAToYRow_AVX2(const uint8_t* src_bgra, uint8_t* dst_y, int width); -void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width); -void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width); void RGBAToYRow_AVX512BW(const uint8_t* src_rgba, uint8_t* dst_y, int width); void RGBAToYRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width); void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width); @@ -2167,12 +2020,6 @@ void ARGBToUVMatrixRow_C(const uint8_t* src_argb, uint8_t* dst_v, int width, const struct ArgbConstants* c); -void ARGBToUVMatrixRow_SSSE3(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width, - const struct ArgbConstants* c); void ARGBToUVMatrixRow_AVX2(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, @@ -2190,10 +2037,6 @@ void ARGBToUV444MatrixRow_C(const uint8_t* src_argb, uint8_t* dst_v, int width, const struct ArgbConstants* c); -void ARGBToYMatrixRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_y, - int width, - const struct ArgbConstants* c); void ARGBToYMatrixRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width, @@ -2202,10 +2045,6 @@ void ARGBToYMatrixRow_AVX512BW(const uint8_t* src_argb, uint8_t* dst_y, int width, const struct ArgbConstants* c); -void ARGBToYMatrixRow_Any_SSSE3(const uint8_t* src_argb, - uint8_t* dst_y, - int width, - const struct ArgbConstants* c); void ARGBToYMatrixRow_Any_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width, @@ -2251,11 +2090,6 @@ void ARGBToYMatrixRow_Any_LASX(const uint8_t* src_argb, const struct ArgbConstants* c); -void ARGBToUV444MatrixRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width, - const struct ArgbConstants* c); void ARGBToUV444MatrixRow_AVX2(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, @@ -2266,12 +2100,6 @@ void ARGBToUV444MatrixRow_AVX512BW(const uint8_t* src_argb, uint8_t* dst_v, int width, const struct ArgbConstants* c); -void ARGBToUVMatrixRow_Any_SSSE3(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width, - const struct ArgbConstants* c); void ARGBToUVMatrixRow_Any_AVX2(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, @@ -2284,11 +2112,6 @@ void ARGBToUVMatrixRow_Any_AVX512BW(const uint8_t* src_argb, uint8_t* dst_v, int width, const struct ArgbConstants* c); -void ARGBToUV444MatrixRow_Any_SSSE3(const uint8_t* src_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width, - const struct ArgbConstants* c); void ARGBToUV444MatrixRow_Any_AVX2(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, @@ -2317,13 +2140,6 @@ void RGBAToYRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr, int width void RGBAToYJRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void BGRAToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void BGRAToYRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void ARGBToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void ARGBToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void ABGRToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RGBAToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void BGRAToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void ABGRToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RGBAToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToYRow_Any_NEON_DotProd(const uint8_t* src_ptr, uint8_t* dst_ptr, @@ -2424,36 +2240,6 @@ void ABGRToUVJRow_AVX512BW(const uint8_t* src_abgr, uint8_t* dst_u, uint8_t* dst_v, int width); -void ARGBToUVRow_SSSE3(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGBToUVJRow_SSSE3(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ABGRToUVJRow_SSSE3(const uint8_t* src_abgr, - int src_stride_abgr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void BGRAToUVRow_SSSE3(const uint8_t* src_bgra, - int src_stride_bgra, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ABGRToUVRow_SSSE3(const uint8_t* src_abgr, - int src_stride_abgr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void RGBAToUVRow_SSSE3(const uint8_t* src_rgba, - int src_stride_rgba, - uint8_t* dst_u, - uint8_t* dst_v, - int width); void ARGBToUVRow_Any_AVX2(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u, @@ -2494,36 +2280,6 @@ void ABGRToUVJRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); -void ARGBToUVRow_Any_SSSE3(const uint8_t* src_ptr, - int src_stride, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGBToUVJRow_Any_SSSE3(const uint8_t* src_ptr, - int src_stride, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ABGRToUVJRow_Any_SSSE3(const uint8_t* src_ptr, - int src_stride, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void BGRAToUVRow_Any_SSSE3(const uint8_t* src_ptr, - int src_stride, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ABGRToUVRow_Any_SSSE3(const uint8_t* src_ptr, - int src_stride, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void RGBAToUVRow_Any_SSSE3(const uint8_t* src_ptr, - int src_stride, - uint8_t* dst_u, - uint8_t* dst_v, - int width); void ARGBToUV444Row_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, @@ -2854,23 +2610,7 @@ void ARGB4444ToUVRow_C(const uint8_t* src_argb4444, uint8_t* dst_v, int width); -void ARGBToUV444Row_SSSE3(const uint8_t* src_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGBToUV444Row_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGBToUVJ444Row_SSSE3(const uint8_t* src_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGBToUVJ444Row_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); void ARGBToUV444Row_AVX2(const uint8_t* src_argb, uint8_t* dst_u, @@ -2917,33 +2657,24 @@ void ARGBToUVJ444Row_C(const uint8_t* src_argb, int width); void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width); -void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width); void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width); void MirrorRow_LSX(const uint8_t* src, uint8_t* dst, int width); void MirrorRow_LASX(const uint8_t* src, uint8_t* dst, int width); void MirrorRow_C(const uint8_t* src, uint8_t* dst, int width); void MirrorRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void MirrorRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void MirrorRow_Any_SSE2(const uint8_t* src, uint8_t* dst, int width); void MirrorRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void MirrorRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void MirrorRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width); -void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width); void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width); void MirrorUVRow_LSX(const uint8_t* src_uv, uint8_t* dst_uv, int width); void MirrorUVRow_LASX(const uint8_t* src_uv, uint8_t* dst_uv, int width); void MirrorUVRow_C(const uint8_t* src_uv, uint8_t* dst_uv, int width); void MirrorUVRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void MirrorUVRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void MirrorUVRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void MirrorUVRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void MirrorUVRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void MirrorSplitUVRow_SSSE3(const uint8_t* src, - uint8_t* dst_u, - uint8_t* dst_v, - int width); void MirrorSplitUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, @@ -2960,7 +2691,6 @@ void MirrorSplitUVRow_C(const uint8_t* src_uv, void MirrorRow_16_C(const uint16_t* src, uint16_t* dst, int width); void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width); -void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width); void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width); void ARGBMirrorRow_LSX(const uint8_t* src, uint8_t* dst, int width); void ARGBMirrorRow_LASX(const uint8_t* src, uint8_t* dst, int width); @@ -2968,9 +2698,6 @@ void ARGBMirrorRow_C(const uint8_t* src, uint8_t* dst, int width); void ARGBMirrorRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void ARGBMirrorRow_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); void ARGBMirrorRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -2979,16 +2706,10 @@ void ARGBMirrorRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24, - uint8_t* dst_rgb24, - int width); void RGB24MirrorRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_rgb24, int width); void RGB24MirrorRow_C(const uint8_t* src_rgb24, uint8_t* dst_rgb24, int width); -void RGB24MirrorRow_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); void RGB24MirrorRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -2997,10 +2718,6 @@ void SplitUVRow_C(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width); -void SplitUVRow_SSE2(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width); void SplitUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, @@ -3017,10 +2734,6 @@ void SplitUVRow_RVV(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width); -void SplitUVRow_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); void SplitUVRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, @@ -3045,14 +2758,6 @@ void DetileRow_Any_NEON(const uint8_t* src, ptrdiff_t src_tile_stride, uint8_t* dst, int width); -void DetileRow_SSE2(const uint8_t* src, - ptrdiff_t src_tile_stride, - uint8_t* dst, - int width); -void DetileRow_Any_SSE2(const uint8_t* src, - ptrdiff_t src_tile_stride, - uint8_t* dst, - int width); void DetileRow_AVX(const uint8_t* src, ptrdiff_t src_tile_stride, uint8_t* dst, @@ -3073,14 +2778,6 @@ void DetileRow_16_Any_NEON(const uint16_t* src, ptrdiff_t src_tile_stride, uint16_t* dst, int width); -void DetileRow_16_SSE2(const uint16_t* src, - ptrdiff_t src_tile_stride, - uint16_t* dst, - int width); -void DetileRow_16_Any_SSE2(const uint16_t* src, - ptrdiff_t src_tile_stride, - uint16_t* dst, - int width); void DetileRow_16_AVX(const uint16_t* src, ptrdiff_t src_tile_stride, uint16_t* dst, @@ -3094,16 +2791,6 @@ void DetileSplitUVRow_C(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width); -void DetileSplitUVRow_SSSE3(const uint8_t* src_uv, - ptrdiff_t src_tile_stride, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void DetileSplitUVRow_Any_SSSE3(const uint8_t* src_uv, - ptrdiff_t src_tile_stride, - uint8_t* dst_u, - uint8_t* dst_v, - int width); void DetileSplitUVRow_NEON(const uint8_t* src_uv, ptrdiff_t src_tile_stride, uint8_t* dst_u, @@ -3120,18 +2807,6 @@ void DetileToYUY2_C(const uint8_t* src_y, ptrdiff_t src_uv_tile_stride, uint8_t* dst_yuy2, int width); -void DetileToYUY2_SSE2(const uint8_t* src_y, - ptrdiff_t src_y_tile_stride, - const uint8_t* src_uv, - ptrdiff_t src_uv_tile_stride, - uint8_t* dst_yuy2, - int width); -void DetileToYUY2_Any_SSE2(const uint8_t* src_y, - ptrdiff_t src_y_tile_stride, - const uint8_t* src_uv, - ptrdiff_t src_uv_tile_stride, - uint8_t* dst_yuy2, - int width); void DetileToYUY2_NEON(const uint8_t* src_y, ptrdiff_t src_y_tile_stride, const uint8_t* src_uv, @@ -3150,10 +2825,6 @@ void MergeUVRow_C(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width); -void MergeUVRow_SSE2(const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uv, - int width); void MergeUVRow_AVX2(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, @@ -3178,10 +2849,6 @@ void MergeUVRow_RVV(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width); -void MergeUVRow_Any_SSE2(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); void MergeUVRow_Any_AVX2(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, @@ -3213,12 +2880,6 @@ void HalfMergeUVRow_NEON(const uint8_t* src_u, uint8_t* dst_uv, int width); -void HalfMergeUVRow_SSSE3(const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_uv, - int width); void HalfMergeUVRow_AVX2(const uint8_t* src_u, int src_stride_u, @@ -3232,16 +2893,6 @@ void SplitRGBRow_C(const uint8_t* src_rgb, uint8_t* dst_g, uint8_t* dst_b, int width); -void SplitRGBRow_SSSE3(const uint8_t* src_rgb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width); -void SplitRGBRow_SSE41(const uint8_t* src_rgb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width); void SplitRGBRow_AVX2(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g, @@ -3257,16 +2908,6 @@ void SplitRGBRow_RVV(const uint8_t* src_rgb, uint8_t* dst_g, uint8_t* dst_b, int width); -void SplitRGBRow_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width); -void SplitRGBRow_Any_SSE41(const uint8_t* src_ptr, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width); void SplitRGBRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, @@ -3283,11 +2924,6 @@ void MergeRGBRow_C(const uint8_t* src_r, const uint8_t* src_b, uint8_t* dst_rgb, int width); -void MergeRGBRow_SSSE3(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - uint8_t* dst_rgb, - int width); void MergeRGBRow_NEON(const uint8_t* src_r, const uint8_t* src_g, const uint8_t* src_b, @@ -3298,11 +2934,6 @@ void MergeRGBRow_RVV(const uint8_t* src_r, const uint8_t* src_b, uint8_t* dst_rgb, int width); -void MergeRGBRow_Any_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - int width); void MergeRGBRow_Any_NEON(const uint8_t* src_r, const uint8_t* src_g, const uint8_t* src_b, @@ -3314,12 +2945,6 @@ void MergeARGBRow_C(const uint8_t* src_r, const uint8_t* src_a, uint8_t* dst_argb, int width); -void MergeARGBRow_SSE2(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - const uint8_t* src_a, - uint8_t* dst_argb, - int width); void MergeARGBRow_AVX2(const uint8_t* src_r, const uint8_t* src_g, const uint8_t* src_b, @@ -3338,12 +2963,6 @@ void MergeARGBRow_RVV(const uint8_t* src_r, const uint8_t* src_a, uint8_t* dst_argb, int width); -void MergeARGBRow_Any_SSE2(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_ptr, - int width); void MergeARGBRow_Any_AVX2(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -3362,18 +2981,6 @@ void SplitARGBRow_C(const uint8_t* src_argb, uint8_t* dst_b, uint8_t* dst_a, int width); -void SplitARGBRow_SSE2(const uint8_t* src_argb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - uint8_t* dst_a, - int width); -void SplitARGBRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - uint8_t* dst_a, - int width); void SplitARGBRow_AVX2(const uint8_t* src_argb, uint8_t* dst_r, uint8_t* dst_g, @@ -3392,18 +2999,6 @@ void SplitARGBRow_RVV(const uint8_t* src_rgba, uint8_t* dst_b, uint8_t* dst_a, int width); -void SplitARGBRow_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - uint8_t* dst_a, - int width); -void SplitARGBRow_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - uint8_t* dst_a, - int width); void SplitARGBRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, @@ -3421,11 +3016,6 @@ void MergeXRGBRow_C(const uint8_t* src_r, const uint8_t* src_b, uint8_t* dst_argb, int width); -void MergeXRGBRow_SSE2(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - uint8_t* dst_argb, - int width); void MergeXRGBRow_AVX2(const uint8_t* src_r, const uint8_t* src_g, const uint8_t* src_b, @@ -3441,11 +3031,6 @@ void MergeXRGBRow_RVV(const uint8_t* src_r, const uint8_t* src_b, uint8_t* dst_argb, int width); -void MergeXRGBRow_Any_SSE2(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - int width); void MergeXRGBRow_Any_AVX2(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -3461,16 +3046,6 @@ void SplitXRGBRow_C(const uint8_t* src_argb, uint8_t* dst_g, uint8_t* dst_b, int width); -void SplitXRGBRow_SSE2(const uint8_t* src_argb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width); -void SplitXRGBRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width); void SplitXRGBRow_AVX2(const uint8_t* src_argb, uint8_t* dst_r, uint8_t* dst_g, @@ -3486,16 +3061,6 @@ void SplitXRGBRow_RVV(const uint8_t* src_rgba, uint8_t* dst_g, uint8_t* dst_b, int width); -void SplitXRGBRow_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width); -void SplitXRGBRow_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width); void SplitXRGBRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, @@ -3791,18 +3356,10 @@ void Convert8To16Row_C(const uint8_t* src_y, uint16_t* dst_y, int scale, int width); -void Convert8To16Row_SSE2(const uint8_t* src_y, - uint16_t* dst_y, - int scale, - int width); void Convert8To16Row_AVX2(const uint8_t* src_y, uint16_t* dst_y, int scale, int width); -void Convert8To16Row_Any_SSE2(const uint8_t* src_ptr, - uint16_t* dst_ptr, - int scale, - int width); void Convert8To16Row_Any_AVX2(const uint8_t* src_ptr, uint16_t* dst_ptr, int scale, @@ -3824,10 +3381,6 @@ void Convert16To8Row_C(const uint16_t* src_y, uint8_t* dst_y, int scale, int width); -void Convert16To8Row_SSSE3(const uint16_t* src_y, - uint8_t* dst_y, - int scale, - int width); void Convert16To8Row_AVX2(const uint16_t* src_y, uint8_t* dst_y, int scale, @@ -3836,10 +3389,6 @@ void Convert16To8Row_AVX512BW(const uint16_t* src_y, uint8_t* dst_y, int scale, int width); -void Convert16To8Row_Any_SSSE3(const uint16_t* src_ptr, - uint8_t* dst_ptr, - int scale, - int width); void Convert16To8Row_Any_AVX2(const uint16_t* src_ptr, uint8_t* dst_ptr, int scale, @@ -3897,7 +3446,6 @@ void Convert8To8Row_Any_AVX2(const uint8_t* src_ptr, int bias, int width); -void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width); void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width); void CopyRow_AVX512BW(const uint8_t* src, uint8_t* dst, int width); void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width); @@ -3905,7 +3453,6 @@ void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width); void CopyRow_SME(const uint8_t* src, uint8_t* dst, int width); void CopyRow_RVV(const uint8_t* src, uint8_t* dst, int count); void CopyRow_C(const uint8_t* src, uint8_t* dst, int count); -void CopyRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void CopyRow_Any_AVX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void CopyRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void CopyRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -3913,19 +3460,12 @@ void CopyRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void CopyRow_16_C(const uint16_t* src, uint16_t* dst, int count); void ARGBCopyAlphaRow_C(const uint8_t* src, uint8_t* dst, int width); -void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width); void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width); -void ARGBCopyAlphaRow_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); void ARGBCopyAlphaRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBExtractAlphaRow_C(const uint8_t* src_argb, uint8_t* dst_a, int width); -void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb, - uint8_t* dst_a, - int width); void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb, uint8_t* dst_a, int width); @@ -3938,9 +3478,6 @@ void ARGBExtractAlphaRow_LSX(const uint8_t* src_argb, void ARGBExtractAlphaRow_RVV(const uint8_t* src_argb, uint8_t* dst_a, int width); -void ARGBExtractAlphaRow_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); void ARGBExtractAlphaRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -3952,12 +3489,8 @@ void ARGBExtractAlphaRow_Any_LSX(const uint8_t* src_ptr, int width); void ARGBCopyYToAlphaRow_C(const uint8_t* src, uint8_t* dst, int width); -void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width); void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width); void ARGBCopyYToAlphaRow_RVV(const uint8_t* src, uint8_t* dst, int width); -void ARGBCopyYToAlphaRow_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); void ARGBCopyYToAlphaRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -3983,10 +3516,6 @@ void ARGBShuffleRow_C(const uint8_t* src_argb, uint8_t* dst_argb, const uint8_t* shuffler, int width); -void ARGBShuffleRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_argb, - const uint8_t* shuffler, - int width); void ARGBShuffleRow_AVX2(const uint8_t* src_argb, uint8_t* dst_argb, const uint8_t* shuffler, @@ -4003,10 +3532,6 @@ void ARGBShuffleRow_LASX(const uint8_t* src_argb, uint8_t* dst_argb, const uint8_t* shuffler, int width); -void ARGBShuffleRow_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - const uint8_t* param, - int width); void ARGBShuffleRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, const uint8_t* param, @@ -4024,17 +3549,8 @@ void ARGBShuffleRow_Any_LASX(const uint8_t* src_ptr, const uint8_t* param, int width); -void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24, - uint8_t* dst_argb, - int width); -void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width); void RAWToARGBRow_AVX2(const uint8_t* src_raw, uint8_t* dst_argb, int width); void RAWToARGBRow_AVX512BW(const uint8_t* src_raw, uint8_t* dst_argb, int width); -void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width); -void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgb24, int width); -void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width); -void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width); -void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width); void RGB565ToARGBRow_AVX2(const uint8_t* src_rgb565, uint8_t* dst_argb, int width); @@ -4114,32 +3630,11 @@ void AR30ToABGRRow_C(const uint8_t* src_ar30, uint8_t* dst_abgr, int width); void ARGBToAR30Row_C(const uint8_t* src_argb, uint8_t* dst_ar30, int width); void AR30ToAB30Row_C(const uint8_t* src_ar30, uint8_t* dst_ab30, int width); -void RGB24ToARGBRow_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void RAWToARGBRow_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); void RAWToARGBRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RAWToARGBRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGB24ToARGBRow_AVX512BW(const uint8_t* src_rgb24, uint8_t* dst_argb, int width); void RGB24ToARGBRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RAWToRGBARow_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void RAWToRGB24Row_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void RGB565ToARGBRow_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGB1555ToARGBRow_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGB4444ToARGBRow_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); void RGB565ToARGBRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -4196,13 +3691,6 @@ void ARGB4444ToARGBRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width); -void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width); -void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width); -void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width); -void ARGBToARGB4444Row_SSE2(const uint8_t* src, uint8_t* dst, int width); -void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width); -void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width); void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width); void ARGBToRGB24Row_AVX2(const uint8_t* src, uint8_t* dst, int width); @@ -4213,10 +3701,6 @@ void ARGBToRGB565DitherRow_C(const uint8_t* src_argb, uint8_t* dst_rgb, uint32_t dither4, int width); -void ARGBToRGB565DitherRow_SSE2(const uint8_t* src, - uint8_t* dst, - uint32_t dither4, - int width); void ARGBToRGB565DitherRow_AVX2(const uint8_t* src, uint8_t* dst, uint32_t dither4, @@ -4315,18 +3799,6 @@ void AR64ShuffleRow_C(const uint8_t* src_ar64, uint8_t* dst_ar64, const uint8_t* shuffler, int width); -void ARGBToAR64Row_SSSE3(const uint8_t* src_argb, - uint16_t* dst_ar64, - int width); -void ARGBToAB64Row_SSSE3(const uint8_t* src_argb, - uint16_t* dst_ab64, - int width); -void AR64ToARGBRow_SSSE3(const uint16_t* src_ar64, - uint8_t* dst_argb, - int width); -void AB64ToARGBRow_SSSE3(const uint16_t* src_ab64, - uint8_t* dst_argb, - int width); void ARGBToAR64Row_AVX2(const uint8_t* src_argb, uint16_t* dst_ar64, int width); void ARGBToAB64Row_AVX2(const uint8_t* src_argb, uint16_t* dst_ab64, int width); void AR64ToARGBRow_AVX2(const uint16_t* src_ar64, uint8_t* dst_argb, int width); @@ -4341,18 +3813,6 @@ void AR64ToARGBRow_RVV(const uint16_t* src_ar64, uint8_t* dst_argb, int width); void AB64ToARGBRow_RVV(const uint16_t* src_ab64, uint8_t* dst_argb, int width); void AR64ToAB64Row_RVV(const uint16_t* src_ar64, uint16_t* dst_ab64, int width); void RGBAToARGBRow_RVV(const uint8_t* src_rgba, uint8_t* dst_argb, int width); -void ARGBToAR64Row_Any_SSSE3(const uint8_t* src_ptr, - uint16_t* dst_ptr, - int width); -void ARGBToAB64Row_Any_SSSE3(const uint8_t* src_ptr, - uint16_t* dst_ptr, - int width); -void AR64ToARGBRow_Any_SSSE3(const uint16_t* src_ptr, - uint8_t* dst_ptr, - int width); -void AB64ToARGBRow_Any_SSSE3(const uint16_t* src_ptr, - uint8_t* dst_ptr, - int width); void ARGBToAR64Row_Any_AVX2(const uint8_t* src_ptr, uint16_t* dst_ptr, int width); @@ -4378,15 +3838,11 @@ void AB64ToARGBRow_Any_NEON(const uint16_t* src_ptr, uint8_t* dst_ptr, int width); -void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width); void J400ToARGBRow_AVX2(const uint8_t* src_y, uint8_t* dst_argb, int width); void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width); void J400ToARGBRow_LSX(const uint8_t* src_y, uint8_t* dst_argb, int width); void J400ToARGBRow_RVV(const uint8_t* src_y, uint8_t* dst_argb, int width); void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width); -void J400ToARGBRow_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); void J400ToARGBRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -4589,93 +4045,19 @@ void I422ToRGBARow_AVX2(const uint8_t* y_buf, uint8_t* dst_rgba, const struct YuvConstants* yuvconstants, int width); -void I444ToARGBRow_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); void I444ToARGBRow_AVX2(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); -void I444ToRGB24Row_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width); void I444ToRGB24Row_AVX2(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width); -void I422ToARGBRow_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I422ToAR30Row_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width); -void I210ToAR30Row_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width); -void I210ToARGBRow_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I212ToAR30Row_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width); -void I212ToARGBRow_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I410ToAR30Row_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width); -void I410ToARGBRow_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I210AlphaToARGBRow_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - const uint16_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I410AlphaToARGBRow_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - const uint16_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); void I422ToAR30Row_AVX2(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -4732,13 +4114,6 @@ void I410AlphaToARGBRow_AVX2(const uint16_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); -void I444AlphaToARGBRow_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); void I444AlphaToARGBRow_AVX2(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -4746,13 +4121,6 @@ void I444AlphaToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); -void I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); void I422AlphaToARGBRow_AVX2(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -4760,31 +4128,11 @@ void I422AlphaToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); -void NV12ToARGBRow_SSSE3(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); void NV12ToARGBRow_AVX2(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); -void NV12ToRGB24Row_SSSE3(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width); -void NV21ToRGB24Row_SSSE3(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width); -void NV12ToRGB565Row_SSSE3(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width); void NV12ToRGB24Row_AVX2(const uint8_t* src_y, const uint8_t* src_uv, uint8_t* dst_rgb24, @@ -4795,10 +4143,6 @@ void NV21ToRGB24Row_AVX2(const uint8_t* src_y, uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width); -void NV21ToYUV24Row_SSSE3(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_yuv24, - int width); void NV21ToYUV24Row_AVX2(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_yuv24, @@ -4808,24 +4152,11 @@ void NV12ToRGB565Row_AVX2(const uint8_t* src_y, uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width); -void NV21ToARGBRow_SSSE3(const uint8_t* y_buf, - const uint8_t* vu_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); void NV21ToARGBRow_AVX2(const uint8_t* y_buf, const uint8_t* vu_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); -void YUY2ToARGBRow_SSSE3(const uint8_t* yuy2_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); void YUY2ToARGBRow_AVX2(const uint8_t* yuy2_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, @@ -4835,26 +4166,6 @@ void UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf, const struct YuvConstants* yuvconstants, int width); -void P210ToARGBRow_SSSE3(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void P410ToARGBRow_SSSE3(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void P210ToAR30Row_SSSE3(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width); -void P410ToAR30Row_SSSE3(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width); void P210ToARGBRow_AVX2(const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* dst_argb, @@ -4876,54 +4187,24 @@ void P410ToAR30Row_AVX2(const uint16_t* y_buf, const struct YuvConstants* yuvconstants, int width); -void I422ToRGBARow_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_rgba, - const struct YuvConstants* yuvconstants, - int width); -void I422ToARGB4444Row_SSSE3(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb4444, - const struct YuvConstants* yuvconstants, - int width); void I422ToARGB4444Row_AVX2(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_argb4444, const struct YuvConstants* yuvconstants, int width); -void I422ToARGB1555Row_SSSE3(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb1555, - const struct YuvConstants* yuvconstants, - int width); void I422ToARGB1555Row_AVX2(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_argb1555, const struct YuvConstants* yuvconstants, int width); -void I422ToRGB565Row_SSSE3(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width); void I422ToRGB565Row_AVX2(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width); -void I422ToRGB24Row_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width); void I422ToRGB24Row_AVX2(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -4948,18 +4229,6 @@ void I422ToRGBARow_Any_AVX2(const uint8_t* y_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void I444ToARGBRow_Any_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I444ToRGB24Row_Any_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); void I444ToARGBRow_Any_AVX2(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -4972,68 +4241,6 @@ void I444ToRGB24Row_Any_AVX2(const uint8_t* y_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void I422ToARGBRow_Any_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422ToAR30Row_Any_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I210ToAR30Row_Any_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I210ToARGBRow_Any_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I212ToAR30Row_Any_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I212ToARGBRow_Any_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I410ToAR30Row_Any_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I410ToARGBRow_Any_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I210AlphaToARGBRow_Any_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - const uint16_t* a_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I410AlphaToARGBRow_Any_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - const uint16_t* a_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); void I422ToAR30Row_Any_AVX2(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -5090,13 +4297,6 @@ void I410AlphaToARGBRow_Any_AVX2(const uint16_t* y_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void I444AlphaToARGBRow_Any_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); void I444AlphaToARGBRow_Any_AVX2(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -5104,13 +4304,6 @@ void I444AlphaToARGBRow_Any_AVX2(const uint8_t* y_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void I422AlphaToARGBRow_Any_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); void I422AlphaToARGBRow_Any_AVX2(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -5118,36 +4311,16 @@ void I422AlphaToARGBRow_Any_AVX2(const uint8_t* y_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void NV12ToARGBRow_Any_SSSE3(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); void NV12ToARGBRow_Any_AVX2(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void NV21ToARGBRow_Any_SSSE3(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); void NV21ToARGBRow_Any_AVX2(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void NV12ToRGB24Row_Any_SSSE3(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void NV21ToRGB24Row_Any_SSSE3(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); void NV12ToRGB24Row_Any_AVX2(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, @@ -5158,32 +4331,15 @@ void NV21ToRGB24Row_Any_AVX2(const uint8_t* y_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void NV21ToYUV24Row_Any_SSSE3(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); void NV21ToYUV24Row_Any_AVX2(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, int width); -void NV12ToRGB565Row_Any_SSSE3(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); void NV12ToRGB565Row_Any_AVX2(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void YUY2ToARGBRow_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void UYVYToARGBRow_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); void YUY2ToARGBRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, @@ -5192,26 +4348,6 @@ void UYVYToARGBRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void P210ToARGBRow_Any_SSSE3(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void P410ToARGBRow_Any_SSSE3(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void P210ToAR30Row_Any_SSSE3(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void P410ToAR30Row_Any_SSSE3(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); void P210ToARGBRow_Any_AVX2(const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* dst_ptr, @@ -5232,54 +4368,24 @@ void P410ToAR30Row_Any_AVX2(const uint16_t* y_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void I422ToRGBARow_Any_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422ToARGB4444Row_Any_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); void I422ToARGB4444Row_Any_AVX2(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void I422ToARGB1555Row_Any_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); void I422ToARGB1555Row_Any_AVX2(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void I422ToRGB565Row_Any_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); void I422ToRGB565Row_Any_AVX2(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void I422ToRGB24Row_Any_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); void I422ToRGB24Row_Any_AVX2(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -5291,10 +4397,6 @@ void I400ToARGBRow_C(const uint8_t* src_y, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width); -void I400ToARGBRow_SSE2(const uint8_t* y_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); void I400ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, @@ -5319,10 +4421,6 @@ void I400ToARGBRow_RVV(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); -void I400ToARGBRow_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - const struct YuvConstants* param, - int width); void I400ToARGBRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, const struct YuvConstants* param, @@ -5337,10 +4435,6 @@ void I400ToARGBRow_Any_LSX(const uint8_t* src_ptr, int width); // ARGB preattenuated alpha blend. -void ARGBBlendRow_SSSE3(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width); void ARGBBlendRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, @@ -5359,16 +4453,6 @@ void ARGBBlendRow_C(const uint8_t* src_argb, int width); // Unattenuated planar alpha blend. -void BlendPlaneRow_SSSE3(const uint8_t* src0, - const uint8_t* src1, - const uint8_t* alpha, - uint8_t* dst, - int width); -void BlendPlaneRow_Any_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - int width); void BlendPlaneRow_AVX2(const uint8_t* src0, const uint8_t* src1, const uint8_t* alpha, @@ -5396,14 +4480,6 @@ void ARGBMultiplyRow_C(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width); -void ARGBMultiplyRow_SSE2(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width); -void ARGBMultiplyRow_Any_SSE2(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); void ARGBMultiplyRow_AVX2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, @@ -5446,14 +4522,6 @@ void ARGBAddRow_C(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width); -void ARGBAddRow_SSE2(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width); -void ARGBAddRow_Any_SSE2(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); void ARGBAddRow_AVX2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, @@ -5493,14 +4561,6 @@ void ARGBSubtractRow_C(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width); -void ARGBSubtractRow_SSE2(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width); -void ARGBSubtractRow_Any_SSE2(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); void ARGBSubtractRow_AVX2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, @@ -5534,27 +4594,6 @@ void ARGBSubtractRow_Any_LASX(const uint8_t* y_buf, uint8_t* dst_ptr, int width); -void ARGBToRGB24Row_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBToRAWRow_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBToRGB565Row_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBToARGB1555Row_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBToARGB4444Row_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ABGRToAR30Row_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBToAR30Row_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); void ARGBToRAWRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToRGB24Row_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, @@ -5562,10 +4601,6 @@ void ARGBToRGB24Row_Any_AVX2(const uint8_t* src_ptr, void ARGBToRGB24Row_Any_AVX512VBMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void ARGBToRGB565DitherRow_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - const uint32_t param, - int width); void ARGBToRGB565DitherRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, const uint32_t param, @@ -6028,20 +5063,6 @@ void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width); -void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_y, int width); -void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2, - int stride_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void YUY2ToNVUVRow_SSE2(const uint8_t* src_yuy2, - int stride_yuy2, - uint8_t* dst_uv, - int width); -void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width); void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width); void YUY2ToUVRow_NEON(const uint8_t* src_yuy2, int stride_yuy2, @@ -6104,20 +5125,6 @@ void YUY2ToUV422Row_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); -void YUY2ToYRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void YUY2ToUVRow_Any_SSE2(const uint8_t* src_ptr, - int src_stride, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void YUY2ToNVUVRow_Any_SSE2(const uint8_t* src_yuy2, - int stride_yuy2, - uint8_t* dst_uv, - int width); -void YUY2ToUV422Row_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); void YUY2ToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void YUY2ToUVRow_Any_NEON(const uint8_t* src_ptr, int src_stride, @@ -6162,16 +5169,6 @@ void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy, uint8_t* dst_u, uint8_t* dst_v, int width); -void UYVYToYRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_y, int width); -void UYVYToUVRow_SSE2(const uint8_t* src_uyvy, - int stride_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width); void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width); void UYVYToUVRow_AVX2(const uint8_t* src_uyvy, int stride_uyvy, @@ -6233,16 +5230,6 @@ void UYVYToUV422Row_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); -void UYVYToYRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void UYVYToUVRow_Any_SSE2(const uint8_t* src_ptr, - int src_stride, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void UYVYToUV422Row_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); void UYVYToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void UYVYToUVRow_Any_NEON(const uint8_t* src_ptr, int src_stride, @@ -6276,8 +5263,6 @@ void UYVYToUV422Row_Any_LASX(const uint8_t* src_ptr, void SwapUVRow_C(const uint8_t* src_uv, uint8_t* dst_vu, int width); void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width); void SwapUVRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width); -void SwapUVRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width); void SwapUVRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void AYUVToYRow_C(const uint8_t* src_ayuv, uint8_t* dst_y, int width); @@ -6334,26 +5319,6 @@ void I422ToUYVYRow_C(const uint8_t* src_y, const uint8_t* src_v, uint8_t* dst_frame, int width); -void I422ToYUY2Row_SSE2(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_yuy2, - int width); -void I422ToUYVYRow_SSE2(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uyvy, - int width); -void I422ToYUY2Row_Any_SSE2(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - int width); -void I422ToUYVYRow_Any_SSE2(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - int width); void I422ToYUY2Row_AVX2(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -6437,9 +5402,6 @@ void I422ToUYVYRow_Any_LASX(const uint8_t* y_buf, // Effects related row functions. void ARGBAttenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width); -void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_argb, - int width); void ARGBAttenuateRow_AVX2(const uint8_t* src_argb, uint8_t* dst_argb, int width); @@ -6455,9 +5417,6 @@ void ARGBAttenuateRow_LASX(const uint8_t* src_argb, void ARGBAttenuateRow_RVV(const uint8_t* src_argb, uint8_t* dst_argb, int width); -void ARGBAttenuateRow_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); void ARGBAttenuateRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -6476,21 +5435,14 @@ extern const uint32_t fixed_invtbl8[256]; void ARGBUnattenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width); -void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb, - uint8_t* dst_argb, - int width); void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb, uint8_t* dst_argb, int width); -void ARGBUnattenuateRow_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); void ARGBUnattenuateRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBGrayRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width); -void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width); void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width); void ARGBGrayRow_NEON_DotProd(const uint8_t* src_argb, uint8_t* dst_argb, @@ -6499,7 +5451,6 @@ void ARGBGrayRow_LSX(const uint8_t* src_argb, uint8_t* dst_argb, int width); void ARGBGrayRow_LASX(const uint8_t* src_argb, uint8_t* dst_argb, int width); void ARGBSepiaRow_C(uint8_t* dst_argb, int width); -void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width); void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width); void ARGBSepiaRow_NEON_DotProd(uint8_t* dst_argb, int width); void ARGBSepiaRow_LSX(uint8_t* dst_argb, int width); @@ -6509,10 +5460,6 @@ void ARGBColorMatrixRow_C(const uint8_t* src_argb, uint8_t* dst_argb, const int8_t* matrix_argb, int width); -void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_argb, - const int8_t* matrix_argb, - int width); void ARGBColorMatrixRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, const int8_t* matrix_argb, @@ -6545,11 +5492,6 @@ void ARGBQuantizeRow_C(uint8_t* dst_argb, int interval_size, int interval_offset, int width); -void ARGBQuantizeRow_SSE2(uint8_t* dst_argb, - int scale, - int interval_size, - int interval_offset, - int width); void ARGBQuantizeRow_NEON(uint8_t* dst_argb, int scale, int interval_size, @@ -6565,10 +5507,6 @@ void ARGBShadeRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width, uint32_t value); -void ARGBShadeRow_SSE2(const uint8_t* src_argb, - uint8_t* dst_argb, - int width, - uint32_t value); void ARGBShadeRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width, @@ -6583,16 +5521,6 @@ void ARGBShadeRow_LASX(const uint8_t* src_argb, uint32_t value); // Used for blur. -void CumulativeSumToAverageRow_SSE2(const int32_t* topleft, - const int32_t* botleft, - int width, - int area, - uint8_t* dst, - int count); -void ComputeCumulativeSumRow_SSE2(const uint8_t* row, - int32_t* cumsum, - const int32_t* previous_cumsum, - int width); void CumulativeSumToAverageRow_C(const int32_t* tl, const int32_t* bl, @@ -6612,11 +5540,6 @@ void ARGBAffineRow_C(const uint8_t* src_argb, const float* uv_dudv, int width); LIBYUV_API -void ARGBAffineRow_SSE2(const uint8_t* src_argb, - int src_argb_stride, - uint8_t* dst_argb, - const float* src_dudv, - int width); // Used for I420Scale, ARGBScale, and ARGBInterpolate. void InterpolateRow_C(uint8_t* dst_ptr, @@ -6624,11 +5547,6 @@ void InterpolateRow_C(uint8_t* dst_ptr, ptrdiff_t src_stride, int width, int source_y_fraction); -void InterpolateRow_SSSE3(uint8_t* dst_ptr, - const uint8_t* src_ptr, - ptrdiff_t src_stride, - int dst_width, - int source_y_fraction); void InterpolateRow_AVX2(uint8_t* dst_ptr, const uint8_t* src_ptr, ptrdiff_t src_stride, @@ -6659,11 +5577,6 @@ void InterpolateRow_Any_NEON(uint8_t* dst_ptr, ptrdiff_t src_stride_ptr, int width, int source_y_fraction); -void InterpolateRow_Any_SSSE3(uint8_t* dst_ptr, - const uint8_t* src_ptr, - ptrdiff_t src_stride_ptr, - int width, - int source_y_fraction); void InterpolateRow_Any_AVX2(uint8_t* dst_ptr, const uint8_t* src_ptr, ptrdiff_t src_stride_ptr, @@ -6739,11 +5652,6 @@ void SobelXRow_C(const uint8_t* src_y0, const uint8_t* src_y2, uint8_t* dst_sobelx, int width); -void SobelXRow_SSE2(const uint8_t* src_y0, - const uint8_t* src_y1, - const uint8_t* src_y2, - uint8_t* dst_sobelx, - int width); void SobelXRow_NEON(const uint8_t* src_y0, const uint8_t* src_y1, const uint8_t* src_y2, @@ -6753,10 +5661,6 @@ void SobelYRow_C(const uint8_t* src_y0, const uint8_t* src_y1, uint8_t* dst_sobely, int width); -void SobelYRow_SSE2(const uint8_t* src_y0, - const uint8_t* src_y1, - uint8_t* dst_sobely, - int width); void SobelYRow_NEON(const uint8_t* src_y0, const uint8_t* src_y1, uint8_t* dst_sobely, @@ -6765,10 +5669,6 @@ void SobelRow_C(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_argb, int width); -void SobelRow_SSE2(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_argb, - int width); void SobelRow_NEON(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_argb, @@ -6781,10 +5681,6 @@ void SobelToPlaneRow_C(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_y, int width); -void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_y, - int width); void SobelToPlaneRow_NEON(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_y, @@ -6797,10 +5693,6 @@ void SobelXYRow_C(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_argb, int width); -void SobelXYRow_SSE2(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_argb, - int width); void SobelXYRow_NEON(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_argb, @@ -6809,10 +5701,6 @@ void SobelXYRow_LSX(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_argb, int width); -void SobelRow_Any_SSE2(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); void SobelRow_Any_NEON(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, @@ -6821,10 +5709,6 @@ void SobelRow_Any_LSX(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, int width); -void SobelToPlaneRow_Any_SSE2(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); void SobelToPlaneRow_Any_NEON(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, @@ -6833,10 +5717,6 @@ void SobelToPlaneRow_Any_LSX(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, int width); -void SobelXYRow_Any_SSE2(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); void SobelXYRow_Any_NEON(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, @@ -6850,10 +5730,6 @@ void ARGBPolynomialRow_C(const uint8_t* src_argb, uint8_t* dst_argb, const float* poly, int width); -void ARGBPolynomialRow_SSE2(const uint8_t* src_argb, - uint8_t* dst_argb, - const float* poly, - int width); void ARGBPolynomialRow_AVX2(const uint8_t* src_argb, uint8_t* dst_argb, const float* poly, @@ -6861,14 +5737,6 @@ void ARGBPolynomialRow_AVX2(const uint8_t* src_argb, // Scale and convert to half float. void HalfFloatRow_C(const uint16_t* src, uint16_t* dst, float scale, int width); -void HalfFloatRow_SSE2(const uint16_t* src, - uint16_t* dst, - float scale, - int width); -void HalfFloatRow_Any_SSE2(const uint16_t* src_ptr, - uint16_t* dst_ptr, - float param, - int width); void HalfFloatRow_AVX2(const uint16_t* src, uint16_t* dst, float scale, @@ -6944,11 +5812,6 @@ void ARGBLumaColorTableRow_C(const uint8_t* src_argb, int width, const uint8_t* luma, uint32_t lumacoeff); -void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_argb, - int width, - const uint8_t* luma, - uint32_t lumacoeff); float ScaleMaxSamples_C(const float* src, float* dst, float scale, int width); float ScaleMaxSamples_NEON(const float* src, @@ -6999,7 +5862,6 @@ void GaussCol_NEON(const uint16_t* src0, uint32_t* dst, int width); -void ClampFloatToZero_SSE2(const float* src_x, float* dst_y, int width); #ifdef __cplusplus } // extern "C" diff --git a/include/libyuv/scale_row.h b/include/libyuv/scale_row.h index 16f626439..8ec7a9436 100644 --- a/include/libyuv/scale_row.h +++ b/include/libyuv/scale_row.h @@ -27,39 +27,9 @@ extern "C" { defined(__i386__)) #define HAS_FIXEDDIV1_X86 #define HAS_FIXEDDIV_X86 -#define HAS_SCALEADDROW_SSE2 -#define HAS_SCALEARGBCOLS_SSE2 -#define HAS_SCALEARGBCOLSUP2_SSE2 -#define HAS_SCALEARGBFILTERCOLS_SSSE3 -#define HAS_SCALEARGBROWDOWN2_SSE2 -#define HAS_SCALEARGBROWDOWNEVEN_SSE2 -#define HAS_SCALECOLSUP2_SSE2 -#define HAS_SCALEFILTERCOLS_SSSE3 -#define HAS_SCALEROWDOWN2_SSSE3 -#define HAS_SCALEROWDOWN34_SSSE3 -#define HAS_SCALEROWDOWN38_SSSE3 -#define HAS_SCALEROWDOWN4_SSSE3 #endif // The following are available for gcc/clang x86 platforms: -// TODO(fbarchard): Port to Visual C -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(__x86_64__) || defined(__i386__)) && \ - !defined(LIBYUV_ENABLE_ROWWIN) -#define HAS_SCALEUVROWDOWN2BOX_SSSE3 -#define HAS_SCALEROWUP2_LINEAR_SSE2 -#define HAS_SCALEROWUP2_LINEAR_SSSE3 -#define HAS_SCALEROWUP2_BILINEAR_SSE2 -#define HAS_SCALEROWUP2_BILINEAR_SSSE3 -#define HAS_SCALEROWUP2_LINEAR_12_SSSE3 -#define HAS_SCALEROWUP2_BILINEAR_12_SSSE3 -#define HAS_SCALEROWUP2_LINEAR_16_SSE2 -#define HAS_SCALEROWUP2_BILINEAR_16_SSE2 -#define HAS_SCALEUVROWUP2_LINEAR_SSSE3 -#define HAS_SCALEUVROWUP2_BILINEAR_SSSE3 -#define HAS_SCALEUVROWUP2_LINEAR_16_SSE41 -#define HAS_SCALEUVROWUP2_BILINEAR_16_SSE41 -#endif // The following are available for gcc/clang x86 platforms, but // require clang 3.4 or gcc 4.7. @@ -610,18 +580,6 @@ void ScaleUVFilterCols64_C(uint8_t* dst_uv, int dx); // Specialized scalers for x86. -void ScaleRowDown2_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); void ScaleRowDown2_AVX2(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, @@ -634,14 +592,6 @@ void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); -void ScaleRowDown4_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); void ScaleRowDown4_AVX2(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, @@ -651,63 +601,7 @@ void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width); -void ScaleRowDown34_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown38_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowUp2_Linear_SSE2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleRowUp2_Linear_12_SSSE3(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width); -void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width); -void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); void ScaleRowUp2_Linear_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width); @@ -732,38 +626,6 @@ void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width); -void ScaleRowUp2_Linear_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowUp2_Bilinear_Any_SSE2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleRowUp2_Linear_12_Any_SSSE3(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width); -void ScaleRowUp2_Bilinear_12_Any_SSSE3(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleRowUp2_Linear_16_Any_SSE2(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width); -void ScaleRowUp2_Bilinear_16_Any_SSE2(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleRowUp2_Linear_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowUp2_Bilinear_Any_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); void ScaleRowUp2_Linear_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width); @@ -789,22 +651,6 @@ void ScaleRowUp2_Bilinear_16_Any_AVX2(const uint16_t* src_ptr, ptrdiff_t dst_stride, int dst_width); -void ScaleRowDown2_Any_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown2Linear_Any_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown2Box_Any_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown2Box_Odd_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); void ScaleRowDown2_Any_AVX2(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, @@ -821,14 +667,6 @@ void ScaleRowDown2Box_Odd_AVX2(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); -void ScaleRowDown4_Any_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown4Box_Any_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); void ScaleRowDown4_Any_AVX2(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, @@ -838,67 +676,14 @@ void ScaleRowDown4Box_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width); -void ScaleRowDown34_Any_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown34_1_Box_Any_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown34_0_Box_Any_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown38_Any_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown38_3_Box_Any_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown38_2_Box_Any_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleAddRow_SSE2(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width); void ScaleAddRow_AVX2(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width); -void ScaleAddRow_Any_SSE2(const uint8_t* src_ptr, - uint16_t* dst_ptr, - int src_width); void ScaleAddRow_Any_AVX2(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width); -void ScaleFilterCols_SSSE3(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x, - int dx); -void ScaleColsUp2_SSE2(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x, - int dx); // ARGB Column functions -void ScaleARGBCols_SSE2(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx); -void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx); -void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx); void ScaleARGBFilterCols_NEON(uint8_t* dst_argb, const uint8_t* src_argb, int dst_width, @@ -926,18 +711,6 @@ void ScaleARGBFilterCols_RVV(uint8_t* dst_argb, int dx); // ARGB Row functions -void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width); -void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width); -void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width); void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, @@ -986,18 +759,6 @@ void ScaleARGBRowDown2Box_LSX(const uint8_t* src_argb, ptrdiff_t src_stride, uint8_t* dst_argb, int dst_width); -void ScaleARGBRowDown2_Any_SSE2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleARGBRowDown2Linear_Any_SSE2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleARGBRowDown2Box_Any_SSE2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); void ScaleARGBRowDown2_Any_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, @@ -1022,16 +783,6 @@ void ScaleARGBRowDown2Box_Any_LSX(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); -void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_argb, - int dst_width); -void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_argb, - int dst_width); void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb, ptrdiff_t src_stride, int src_stepx, @@ -1062,16 +813,6 @@ void ScaleARGBRowDownEvenBox_RVV(const uint8_t* src_argb, int src_stepx, uint8_t* dst_argb, int dst_width); -void ScaleARGBRowDownEven_Any_SSE2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_ptr, - int dst_width); -void ScaleARGBRowDownEvenBox_Any_SSE2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_ptr, - int dst_width); void ScaleARGBRowDownEven_Any_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, int src_stepx, @@ -1094,18 +835,6 @@ void ScaleARGBRowDownEvenBox_Any_LSX(const uint8_t* src_ptr, int dst_width); // UV Row functions -void ScaleUVRowDown2_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_uv, - int dst_width); -void ScaleUVRowDown2Linear_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_uv, - int dst_width); -void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_uv, - int dst_width); void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_uv, @@ -1146,18 +875,6 @@ void ScaleUVRowDown2Box_RVV(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width); -void ScaleUVRowDown2_Any_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleUVRowDown2Linear_Any_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleUVRowDown2Box_Any_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); void ScaleUVRowDown2Box_Any_AVX2(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, @@ -1174,16 +891,6 @@ void ScaleUVRowDown2Box_Any_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); -void ScaleUVRowDownEven_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_uv, - int dst_width); -void ScaleUVRowDownEvenBox_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_uv, - int dst_width); void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, int src_stepx, @@ -1204,16 +911,6 @@ void ScaleUVRowDownEven_RVV(const uint8_t* src_ptr, int32_t src_stepx, uint8_t* dst_uv, int dst_width); -void ScaleUVRowDownEven_Any_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_ptr, - int dst_width); -void ScaleUVRowDownEvenBox_Any_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_ptr, - int dst_width); void ScaleUVRowDownEven_Any_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, int src_stepx, @@ -1225,22 +922,6 @@ void ScaleUVRowDownEvenBox_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width); -void ScaleUVRowUp2_Linear_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width); -void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleUVRowUp2_Linear_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width); -void ScaleUVRowUp2_Bilinear_Any_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); void ScaleUVRowUp2_Linear_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width); @@ -1281,22 +962,6 @@ void ScaleUVRowUp2_Bilinear_RVV(const uint8_t* src_ptr, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width); -void ScaleUVRowUp2_Linear_16_SSE41(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width); -void ScaleUVRowUp2_Bilinear_16_SSE41(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleUVRowUp2_Linear_16_Any_SSE41(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width); -void ScaleUVRowUp2_Bilinear_16_Any_SSE41(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); void ScaleUVRowUp2_Linear_16_AVX2(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width); diff --git a/source/compare.cc b/source/compare.cc index e85cc6d07..ca8c78cbb 100644 --- a/source/compare.cc +++ b/source/compare.cc @@ -32,32 +32,31 @@ LIBYUV_API uint32_t HashDjb2(const uint8_t* src, uint64_t count, uint32_t seed) { const int kBlockSize = 1 << 15; // 32768; int remainder; - uint32_t (*HashDjb2_SSE)(const uint8_t* src, int count, uint32_t seed) = + uint32_t (*HashDjb2_Opt)(const uint8_t* src, int count, uint32_t seed) = HashDjb2_C; -#if defined(HAS_HASHDJB2_SSE41) +#if 0 if (TestCpuFlag(kCpuHasSSE41)) { - HashDjb2_SSE = HashDjb2_SSE41; } #endif #if defined(HAS_HASHDJB2_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - HashDjb2_SSE = HashDjb2_AVX2; + HashDjb2_Opt = HashDjb2_AVX2; } #endif #if defined(HAS_HASHDJB2_NEON) if (TestCpuFlag(kCpuHasNEON)) { - HashDjb2_SSE = HashDjb2_NEON; + HashDjb2_Opt = HashDjb2_NEON; } #endif while (count >= (uint64_t)kBlockSize) { - seed = HashDjb2_SSE(src, kBlockSize, seed); + seed = HashDjb2_Opt(src, kBlockSize, seed); src += kBlockSize; count -= kBlockSize; } remainder = (int)count & ~15; if (remainder) { - seed = HashDjb2_SSE(src, remainder, seed); + seed = HashDjb2_Opt(src, remainder, seed); src += remainder; count -= remainder; } @@ -144,14 +143,12 @@ uint64_t ComputeHammingDistance(const uint8_t* src_a, HammingDistance = HammingDistance_NEON_DotProd; } #endif -#if defined(HAS_HAMMINGDISTANCE_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - HammingDistance = HammingDistance_SSSE3; } #endif -#if defined(HAS_HAMMINGDISTANCE_SSE42) +#if 0 if (TestCpuFlag(kCpuHasSSE42)) { - HammingDistance = HammingDistance_SSE42; } #endif #if defined(HAS_HAMMINGDISTANCE_AVX2) @@ -204,10 +201,9 @@ uint64_t ComputeSumSquareError(const uint8_t* src_a, SumSquareError = SumSquareError_NEON_DotProd; } #endif -#if defined(HAS_SUMSQUAREERROR_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { // Note only used for multiples of 16 so count is not checked. - SumSquareError = SumSquareError_SSE2; } #endif #if defined(HAS_SUMSQUAREERROR_AVX2) diff --git a/source/compare_gcc.cc b/source/compare_gcc.cc index 33a725e58..3f6502909 100644 --- a/source/compare_gcc.cc +++ b/source/compare_gcc.cc @@ -26,155 +26,16 @@ extern "C" { // "memory" clobber prevents the reads from being removed #if defined(__x86_64__) -uint32_t HammingDistance_SSE42(const uint8_t* src_a, - const uint8_t* src_b, - int count) { - uint64_t diff; - asm volatile( - "xor %3,%3 \n" - "xor %%r8,%%r8 \n" - "xor %%r9,%%r9 \n" - "xor %%r10,%%r10 \n" - - // Process 32 bytes per loop. - LABELALIGN - "1: \n" - "mov (%0),%%rcx \n" - "mov 0x8(%0),%%rdx \n" - "xor (%1),%%rcx \n" - "xor 0x8(%1),%%rdx \n" - "popcnt %%rcx,%%rcx \n" - "popcnt %%rdx,%%rdx \n" - "mov 0x10(%0),%%rsi \n" - "mov 0x18(%0),%%rdi \n" - "xor 0x10(%1),%%rsi \n" - "xor 0x18(%1),%%rdi \n" - "popcnt %%rsi,%%rsi \n" - "popcnt %%rdi,%%rdi \n" - "add $0x20,%0 \n" - "add $0x20,%1 \n" - "add %%rcx,%3 \n" - "add %%rdx,%%r8 \n" - "add %%rsi,%%r9 \n" - "add %%rdi,%%r10 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - - "add %%r8, %3 \n" - "add %%r9, %3 \n" - "add %%r10, %3 \n" - : "+r"(src_a), // %0 - "+r"(src_b), // %1 - "+r"(count), // %2 - "=&r"(diff) // %3 - : - : "cc", "memory", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10"); - - return (uint32_t)(diff); -} #else -uint32_t HammingDistance_SSE42(const uint8_t* src_a, - const uint8_t* src_b, - int count) { - uint32_t diff = 0u; - asm volatile( - // Process 16 bytes per loop. - LABELALIGN - "1: \n" - "mov (%0),%%ecx \n" - "mov 0x4(%0),%%edx \n" - "xor (%1),%%ecx \n" - "xor 0x4(%1),%%edx \n" - "popcnt %%ecx,%%ecx \n" - "add %%ecx,%3 \n" - "popcnt %%edx,%%edx \n" - "add %%edx,%3 \n" - "mov 0x8(%0),%%ecx \n" - "mov 0xc(%0),%%edx \n" - "xor 0x8(%1),%%ecx \n" - "xor 0xc(%1),%%edx \n" - "popcnt %%ecx,%%ecx \n" - "add %%ecx,%3 \n" - "popcnt %%edx,%%edx \n" - "add %%edx,%3 \n" - "add $0x10,%0 \n" - "add $0x10,%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_a), // %0 - "+r"(src_b), // %1 - "+r"(count), // %2 - "+r"(diff) // %3 - : - : "cc", "memory", "ecx", "edx"); - - return diff; -} #endif static const vec8 kNibbleMask = {15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15}; static const vec8 kBitCount = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4}; -uint32_t HammingDistance_SSSE3(const uint8_t* src_a, - const uint8_t* src_b, - int count) { - uint32_t diff; - asm volatile( - "movdqa %4,%%xmm2 \n" - "movdqa %5,%%xmm3 \n" - "pxor %%xmm0,%%xmm0 \n" - "pxor %%xmm1,%%xmm1 \n" - "sub %0,%1 \n" - - LABELALIGN - "1: \n" - "movdqa (%0),%%xmm4 \n" - "movdqa 0x10(%0), %%xmm5 \n" - "pxor (%0,%1), %%xmm4 \n" - "movdqa %%xmm4,%%xmm6 \n" - "pand %%xmm2,%%xmm6 \n" - "psrlw $0x4,%%xmm4 \n" - "movdqa %%xmm3,%%xmm7 \n" - "pshufb %%xmm6,%%xmm7 \n" - "pand %%xmm2,%%xmm4 \n" - "movdqa %%xmm3,%%xmm6 \n" - "pshufb %%xmm4,%%xmm6 \n" - "paddb %%xmm7,%%xmm6 \n" - "pxor 0x10(%0,%1),%%xmm5 \n" - "add $0x20,%0 \n" - "movdqa %%xmm5,%%xmm4 \n" - "pand %%xmm2,%%xmm5 \n" - "psrlw $0x4,%%xmm4 \n" - "movdqa %%xmm3,%%xmm7 \n" - "pshufb %%xmm5,%%xmm7 \n" - "pand %%xmm2,%%xmm4 \n" - "movdqa %%xmm3,%%xmm5 \n" - "pshufb %%xmm4,%%xmm5 \n" - "paddb %%xmm7,%%xmm5 \n" - "paddb %%xmm5,%%xmm6 \n" - "psadbw %%xmm1,%%xmm6 \n" - "paddd %%xmm6,%%xmm0 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - - "pshufd $0xaa,%%xmm0,%%xmm1 \n" - "paddd %%xmm1,%%xmm0 \n" - "movd %%xmm0, %3 \n" - : "+r"(src_a), // %0 - "+r"(src_b), // %1 - "+r"(count), // %2 - "=r"(diff) // %3 - : "m"(kNibbleMask), // %4 - "m"(kBitCount) // %5 - : "cc", "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); - - return diff; -} #ifdef HAS_HAMMINGDISTANCE_AVX2 uint32_t HammingDistance_AVX2(const uint8_t* src_a, @@ -232,47 +93,7 @@ uint32_t HammingDistance_AVX2(const uint8_t* src_a, } #endif // HAS_HAMMINGDISTANCE_AVX2 -uint32_t SumSquareError_SSE2(const uint8_t* src_a, - const uint8_t* src_b, - int count) { - uint32_t sse; - asm volatile( - "pxor %%xmm0,%%xmm0 \n" - "pxor %%xmm5,%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm1 \n" - "lea 0x10(%0),%0 \n" - "movdqu (%1),%%xmm2 \n" - "lea 0x10(%1),%1 \n" - "movdqa %%xmm1,%%xmm3 \n" - "psubusb %%xmm2,%%xmm1 \n" - "psubusb %%xmm3,%%xmm2 \n" - "por %%xmm2,%%xmm1 \n" - "movdqa %%xmm1,%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm1 \n" - "punpckhbw %%xmm5,%%xmm2 \n" - "pmaddwd %%xmm1,%%xmm1 \n" - "pmaddwd %%xmm2,%%xmm2 \n" - "paddd %%xmm1,%%xmm0 \n" - "paddd %%xmm2,%%xmm0 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - - "pshufd $0xee,%%xmm0,%%xmm1 \n" - "paddd %%xmm1,%%xmm0 \n" - "pshufd $0x1,%%xmm0,%%xmm1 \n" - "paddd %%xmm1,%%xmm0 \n" - "movd %%xmm0,%3 \n" - : "+r"(src_a), // %0 - "+r"(src_b), // %1 - "+r"(count), // %2 - "=r"(sse) // %3 - : - : "cc", "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); - return sse; -} static const uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0}; // 33 ^ 16 static const uvec32 kHashMul0 = { @@ -300,60 +121,7 @@ static const uvec32 kHashMul3 = { 0x00000001, // 33 ^ 0 }; -uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) { - uint32_t hash; - asm volatile( - "movd %2,%%xmm0 \n" - "pxor %%xmm7,%%xmm7 \n" - "movdqa %4,%%xmm6 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm1 \n" - "lea 0x10(%0),%0 \n" - "pmulld %%xmm6,%%xmm0 \n" - "movdqa %5,%%xmm5 \n" - "movdqa %%xmm1,%%xmm2 \n" - "punpcklbw %%xmm7,%%xmm2 \n" - "movdqa %%xmm2,%%xmm3 \n" - "punpcklwd %%xmm7,%%xmm3 \n" - "pmulld %%xmm5,%%xmm3 \n" - "movdqa %6,%%xmm5 \n" - "movdqa %%xmm2,%%xmm4 \n" - "punpckhwd %%xmm7,%%xmm4 \n" - "pmulld %%xmm5,%%xmm4 \n" - "movdqa %7,%%xmm5 \n" - "punpckhbw %%xmm7,%%xmm1 \n" - "movdqa %%xmm1,%%xmm2 \n" - "punpcklwd %%xmm7,%%xmm2 \n" - "pmulld %%xmm5,%%xmm2 \n" - "movdqa %8,%%xmm5 \n" - "punpckhwd %%xmm7,%%xmm1 \n" - "pmulld %%xmm5,%%xmm1 \n" - "paddd %%xmm4,%%xmm3 \n" - "paddd %%xmm2,%%xmm1 \n" - "paddd %%xmm3,%%xmm1 \n" - "pshufd $0xe,%%xmm1,%%xmm2 \n" - "paddd %%xmm2,%%xmm1 \n" - "pshufd $0x1,%%xmm1,%%xmm2 \n" - "paddd %%xmm2,%%xmm1 \n" - "paddd %%xmm1,%%xmm0 \n" - "sub $0x10,%1 \n" - "jg 1b \n" - "movd %%xmm0,%3 \n" - : "+r"(src), // %0 - "+r"(count), // %1 - "+rm"(seed), // %2 - "=r"(hash) // %3 - : "m"(kHash16x33), // %4 - "m"(kHashMul0), // %5 - "m"(kHashMul1), // %6 - "m"(kHashMul2), // %7 - "m"(kHashMul3) // %8 - : "cc", "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); - return hash; -} #endif // defined(__x86_64__) || (defined(__i386__) && !defined(__pic__))) #ifdef __cplusplus diff --git a/source/compare_win.cc b/source/compare_win.cc index 9d5bb27cd..1268a084d 100644 --- a/source/compare_win.cc +++ b/source/compare_win.cc @@ -26,57 +26,10 @@ extern "C" { #if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && defined(_M_IX86) && \ (!defined(__clang__) || defined(LIBYUV_ENABLE_ROWWIN)) -uint32_t HammingDistance_SSE42(const uint8_t* src_a, - const uint8_t* src_b, - int count) { - uint32_t diff = 0u; - int i; - for (i = 0; i < count - 3; i += 4) { - uint32_t x = *((uint32_t*)src_a) ^ *((uint32_t*)src_b); // NOLINT - src_a += 4; - src_b += 4; - diff += __popcnt(x); - } - return diff; -} __declspec(naked) uint32_t - SumSquareError_SSE2(const uint8_t* src_a, const uint8_t* src_b, int count) { - __asm { - mov eax, [esp + 4] // src_a - mov edx, [esp + 8] // src_b - mov ecx, [esp + 12] // count - pxor xmm0, xmm0 - pxor xmm5, xmm5 - wloop: - movdqu xmm1, [eax] - lea eax, [eax + 16] - movdqu xmm2, [edx] - lea edx, [edx + 16] - movdqa xmm3, xmm1 // abs trick - psubusb xmm1, xmm2 - psubusb xmm2, xmm3 - por xmm1, xmm2 - movdqa xmm2, xmm1 - punpcklbw xmm1, xmm5 - punpckhbw xmm2, xmm5 - pmaddwd xmm1, xmm1 - pmaddwd xmm2, xmm2 - paddd xmm0, xmm1 - paddd xmm0, xmm2 - sub ecx, 16 - jg wloop - - pshufd xmm1, xmm0, 0xee - paddd xmm0, xmm1 - pshufd xmm1, xmm0, 0x01 - paddd xmm0, xmm1 - movd eax, xmm0 - ret - } -} #ifdef HAS_SUMSQUAREERROR_AVX2 // C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX. @@ -147,53 +100,7 @@ uvec32 kHashMul3 = { }; __declspec(naked) uint32_t - HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) { - __asm { - mov eax, [esp + 4] // src - mov ecx, [esp + 8] // count - movd xmm0, [esp + 12] // seed - pxor xmm7, xmm7 // constant 0 for unpck - movdqa xmm6, xmmword ptr kHash16x33 - - wloop: - movdqu xmm1, [eax] // src[0-15] - lea eax, [eax + 16] - pmulld xmm0, xmm6 // hash *= 33 ^ 16 - movdqa xmm5, xmmword ptr kHashMul0 - movdqa xmm2, xmm1 - punpcklbw xmm2, xmm7 // src[0-7] - movdqa xmm3, xmm2 - punpcklwd xmm3, xmm7 // src[0-3] - pmulld xmm3, xmm5 - movdqa xmm5, xmmword ptr kHashMul1 - movdqa xmm4, xmm2 - punpckhwd xmm4, xmm7 // src[4-7] - pmulld xmm4, xmm5 - movdqa xmm5, xmmword ptr kHashMul2 - punpckhbw xmm1, xmm7 // src[8-15] - movdqa xmm2, xmm1 - punpcklwd xmm2, xmm7 // src[8-11] - pmulld xmm2, xmm5 - movdqa xmm5, xmmword ptr kHashMul3 - punpckhwd xmm1, xmm7 // src[12-15] - pmulld xmm1, xmm5 - paddd xmm3, xmm4 // add 16 results - paddd xmm1, xmm2 - paddd xmm1, xmm3 - - pshufd xmm2, xmm1, 0x0e // upper 2 dwords - paddd xmm1, xmm2 - pshufd xmm2, xmm1, 0x01 - paddd xmm1, xmm2 - paddd xmm0, xmm1 - sub ecx, 16 - jg wloop - - movd eax, xmm0 // return hash - ret - } -} // Visual C 2012 required for AVX2. #ifdef HAS_HASHDJB2_AVX2 diff --git a/source/convert.cc b/source/convert.cc index d9fb54778..3f2f53a1a 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -689,11 +689,9 @@ int I010ToNV12(const uint16_t* src_y, Convert16To8Row = Convert16To8Row_SME; } #endif -#if defined(HAS_CONVERT16TO8ROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - Convert16To8Row = Convert16To8Row_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - Convert16To8Row = Convert16To8Row_SSSE3; } } #endif @@ -714,11 +712,9 @@ int I010ToNV12(const uint16_t* src_y, } #endif -#if defined(HAS_MERGEUVROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - MergeUVRow = MergeUVRow_Any_SSE2; if (IS_ALIGNED(halfwidth, 16)) { - MergeUVRow = MergeUVRow_SSE2; } } #endif @@ -1154,11 +1150,9 @@ int I422ToNV21(const uint8_t* src_y, src_stride_u = -src_stride_u; src_stride_v = -src_stride_v; } -#if defined(HAS_MERGEUVROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - MergeUVRow = MergeUVRow_Any_SSE2; if (IS_ALIGNED(halfwidth, 16)) { - MergeUVRow = MergeUVRow_SSE2; } } #endif @@ -1204,11 +1198,9 @@ int I422ToNV21(const uint8_t* src_y, MergeUVRow = MergeUVRow_RVV; } #endif -#if defined(HAS_INTERPOLATEROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - InterpolateRow = InterpolateRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - InterpolateRow = InterpolateRow_SSSE3; } } #endif @@ -1673,13 +1665,9 @@ int YUY2ToI420(const uint8_t* src_yuy2, src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2; src_stride_yuy2 = -src_stride_yuy2; } -#if defined(HAS_YUY2TOYROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - YUY2ToUVRow = YUY2ToUVRow_Any_SSE2; - YUY2ToYRow = YUY2ToYRow_Any_SSE2; if (IS_ALIGNED(width, 16)) { - YUY2ToUVRow = YUY2ToUVRow_SSE2; - YUY2ToYRow = YUY2ToYRow_SSE2; } } #endif @@ -1764,13 +1752,9 @@ int UYVYToI420(const uint8_t* src_uyvy, src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy; src_stride_uyvy = -src_stride_uyvy; } -#if defined(HAS_UYVYTOYROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - UYVYToUVRow = UYVYToUVRow_Any_SSE2; - UYVYToYRow = UYVYToYRow_Any_SSE2; if (IS_ALIGNED(width, 16)) { - UYVYToUVRow = UYVYToUVRow_SSE2; - UYVYToYRow = UYVYToYRow_SSE2; } } #endif @@ -1863,13 +1847,9 @@ int AYUVToNV12(const uint8_t* src_ayuv, src_stride_ayuv = -src_stride_ayuv; } // place holders for future intel code -#if defined(HAS_AYUVTOYROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - AYUVToUVRow = AYUVToUVRow_Any_SSE2; - AYUVToYRow = AYUVToYRow_Any_SSE2; if (IS_ALIGNED(width, 16)) { - AYUVToUVRow = AYUVToUVRow_SSE2; - AYUVToYRow = AYUVToYRow_SSE2; } } #endif @@ -1940,13 +1920,9 @@ int AYUVToNV21(const uint8_t* src_ayuv, src_stride_ayuv = -src_stride_ayuv; } // place holders for future intel code -#if defined(HAS_AYUVTOYROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - AYUVToVURow = AYUVToVURow_Any_SSE2; - AYUVToYRow = AYUVToYRow_Any_SSE2; if (IS_ALIGNED(width, 16)) { - AYUVToVURow = AYUVToVURow_SSE2; - AYUVToYRow = AYUVToYRow_SSE2; } } #endif @@ -2070,19 +2046,15 @@ int ARGBToI420(const uint8_t* src_argb, } } #endif -#if defined(HAS_ARGBTOYROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_SSSE3; } } #endif -#if defined(HAS_ARGBTOUVROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVRow = ARGBToUVRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_SSSE3; } } #endif @@ -2183,11 +2155,9 @@ int ARGBToI420Matrix(const uint8_t* src_argb, const struct ArgbConstants* c) = ARGBToUVMatrixRow_C; -#if defined(HAS_ARGBTOYMATRIXROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_SSSE3; } } #endif @@ -2271,11 +2241,9 @@ ARGBToUVMatrixRow_C; ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME; } #endif -#if defined(HAS_ARGBTOUVMATRIXROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_SSSE3; } } #endif @@ -2421,19 +2389,15 @@ int ARGBToI420Alpha(const uint8_t* src_argb, } } #endif -#if defined(HAS_ARGBTOYROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_SSSE3; } } #endif -#if defined(HAS_ARGBTOUVROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVRow = ARGBToUVRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_SSSE3; } } #endif @@ -2487,7 +2451,7 @@ int ARGBToI420Alpha(const uint8_t* src_argb, } } #endif -#if defined(HAS_ARGBEXTRACTALPHAROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { ARGBExtractAlphaRow = IS_ALIGNED(width, 8) ? ARGBExtractAlphaRow_SSE2 : ARGBExtractAlphaRow_Any_SSE2; @@ -2614,19 +2578,15 @@ int BGRAToI420(const uint8_t* src_bgra, } } #endif -#if defined(HAS_BGRATOYROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - BGRAToYRow = BGRAToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - BGRAToYRow = BGRAToYRow_SSSE3; } } #endif -#if defined(HAS_BGRATOUVROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - BGRAToUVRow = BGRAToUVRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - BGRAToUVRow = BGRAToUVRow_SSSE3; } } #endif @@ -2721,19 +2681,15 @@ int ABGRToI420(const uint8_t* src_abgr, src_abgr = src_abgr + (height - 1) * src_stride_abgr; src_stride_abgr = -src_stride_abgr; } -#if defined(HAS_ABGRTOYROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ABGRToYRow = ABGRToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ABGRToYRow = ABGRToYRow_SSSE3; } } #endif -#if defined(HAS_ABGRTOUVROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ABGRToUVRow = ABGRToUVRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ABGRToUVRow = ABGRToUVRow_SSSE3; } } #endif @@ -2876,11 +2832,9 @@ int RGBAToI420(const uint8_t* src_rgba, src_rgba = src_rgba + (height - 1) * src_stride_rgba; src_stride_rgba = -src_stride_rgba; } -#if defined(HAS_RGBATOYROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - RGBAToYRow = RGBAToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - RGBAToYRow = RGBAToYRow_SSSE3; } } #endif @@ -2900,11 +2854,9 @@ int RGBAToI420(const uint8_t* src_rgba, } } #endif -#if defined(HAS_RGBATOUVROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - RGBAToUVRow = RGBAToUVRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - RGBAToUVRow = RGBAToUVRow_SSSE3; } } #endif @@ -3022,11 +2974,9 @@ int RGB24ToI420(const uint8_t* src_rgb24, ARGBToUVRow_C; void (*ARGBToYMatrixRow)(const uint8_t* src_argb, uint8_t* dst_y, int width, const struct ArgbConstants* c) = ARGBToYMatrixRow_C; -#if defined(HAS_ARGBTOYMATRIXROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_SSSE3; } } #endif @@ -3095,11 +3045,9 @@ int RGB24ToI420(const uint8_t* src_rgb24, src_stride_rgb24 = -src_stride_rgb24; } -#if defined(HAS_RGB24TOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - RGB24ToARGBRow = RGB24ToARGBRow_SSSE3; } } #endif @@ -3153,11 +3101,9 @@ int RGB24ToI420(const uint8_t* src_rgb24, RGB24ToARGBRow = RGB24ToARGBRow_RVV; } #endif -#if defined(HAS_ARGBTOUVROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVRow = ARGBToUVRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_SSSE3; } } #endif @@ -3288,11 +3234,9 @@ int RGB24ToJ420(const uint8_t* src_rgb24, // Other platforms do intermediate conversion from RGB24 to ARGB. #else // HAS_RGB24TOYJROW -#if defined(HAS_RGB24TOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - RGB24ToARGBRow = RGB24ToARGBRow_SSSE3; } } #endif @@ -3346,11 +3290,9 @@ int RGB24ToJ420(const uint8_t* src_rgb24, RGB24ToARGBRow = RGB24ToARGBRow_RVV; } #endif -#if defined(HAS_ARGBTOYJROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYJRow = ARGBToYJRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYJRow = ARGBToYJRow_SSSE3; } } #endif @@ -3370,11 +3312,9 @@ int RGB24ToJ420(const uint8_t* src_rgb24, } } #endif -#if defined(HAS_ARGBTOUVJROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToUVJRow = ARGBToUVJRow_SSSE3; } } #endif @@ -3458,11 +3398,9 @@ int RAWToI420(const uint8_t* src_raw, ARGBToUVRow_C; void (*ARGBToYMatrixRow)(const uint8_t* src_argb, uint8_t* dst_y, int width, const struct ArgbConstants* c) = ARGBToYMatrixRow_C; -#if defined(HAS_ARGBTOYMATRIXROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_SSSE3; } } #endif @@ -3531,11 +3469,9 @@ int RAWToI420(const uint8_t* src_raw, src_stride_raw = -src_stride_raw; } -#if defined(HAS_RAWTOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - RAWToARGBRow = RAWToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - RAWToARGBRow = RAWToARGBRow_SSSE3; } } #endif @@ -3589,11 +3525,9 @@ int RAWToI420(const uint8_t* src_raw, RAWToARGBRow = RAWToARGBRow_RVV; } #endif -#if defined(HAS_ARGBTOUVROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVRow = ARGBToUVRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_SSSE3; } } #endif @@ -3728,11 +3662,9 @@ int RAWToJ420(const uint8_t* src_raw, // Other platforms do intermediate conversion from RAW to ARGB. #else // HAS_RAWTOYJROW -#if defined(HAS_RAWTOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - RAWToARGBRow = RAWToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - RAWToARGBRow = RAWToARGBRow_SSSE3; } } #endif @@ -3786,11 +3718,9 @@ int RAWToJ420(const uint8_t* src_raw, RAWToARGBRow = RAWToARGBRow_RVV; } #endif -#if defined(HAS_ARGBTOYJROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYJRow = ARGBToYJRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYJRow = ARGBToYJRow_SSSE3; } } #endif @@ -3802,11 +3732,9 @@ int RAWToJ420(const uint8_t* src_raw, } } #endif -#if defined(HAS_ARGBTOUVJROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToUVJRow = ARGBToUVJRow_SSSE3; } } #endif @@ -3894,11 +3822,9 @@ int RAWToI444(const uint8_t* src_raw, } // TODO: add row coalesce when main loop handles large width in blocks // TODO: implement UV444 or trim the ifdef below -#if defined(HAS_ARGBTOUV444ROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUV444Row = ARGBToUV444Row_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToUV444Row = ARGBToUV444Row_SSSE3; } } #endif @@ -3950,11 +3876,9 @@ int RAWToI444(const uint8_t* src_raw, } } #endif -#if defined(HAS_ARGBTOYROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_SSSE3; } } #endif @@ -4012,11 +3936,9 @@ int RAWToI444(const uint8_t* src_raw, } #endif -#if defined(HAS_RAWTOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - RAWToARGBRow = RAWToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - RAWToARGBRow = RAWToARGBRow_SSSE3; } } #endif @@ -4121,11 +4043,9 @@ int RAWToJ444(const uint8_t* src_raw, src_stride_raw = -src_stride_raw; } // TODO: add row coalesce when main loop handles large width in blocks -#if defined(HAS_ARGBTOUVJ444ROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVJ444Row = ARGBToUVJ444Row_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToUVJ444Row = ARGBToUVJ444Row_SSSE3; } } #endif @@ -4177,11 +4097,9 @@ int RAWToJ444(const uint8_t* src_raw, } } #endif -#if defined(HAS_ARGBTOYJROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYJRow = ARGBToYJRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYJRow = ARGBToYJRow_SSSE3; } } #endif @@ -4231,11 +4149,9 @@ int RAWToJ444(const uint8_t* src_raw, } #endif -#if defined(HAS_RAWTOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - RAWToARGBRow = RAWToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - RAWToARGBRow = RAWToARGBRow_SSSE3; } } #endif @@ -4390,11 +4306,9 @@ int RGB565ToI420(const uint8_t* src_rgb565, } #endif // Other platforms do intermediate conversion from RGB565 to ARGB. -#if defined(HAS_RGB565TOARGBROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2; if (IS_ALIGNED(width, 8)) { - RGB565ToARGBRow = RGB565ToARGBRow_SSE2; } } #endif @@ -4406,11 +4320,9 @@ int RGB565ToI420(const uint8_t* src_rgb565, } } #endif -#if defined(HAS_ARGBTOYROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_SSSE3; } } #endif @@ -4430,11 +4342,9 @@ int RGB565ToI420(const uint8_t* src_rgb565, } } #endif -#if defined(HAS_ARGBTOUVROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVRow = ARGBToUVRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_SSSE3; } } #endif @@ -4581,11 +4491,9 @@ int ARGB1555ToI420(const uint8_t* src_argb1555, #endif // Other platforms do intermediate conversion from ARGB1555 to ARGB. -#if defined(HAS_ARGB1555TOARGBROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2; if (IS_ALIGNED(width, 8)) { - ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2; } } #endif @@ -4597,19 +4505,15 @@ int ARGB1555ToI420(const uint8_t* src_argb1555, } } #endif -#if defined(HAS_ARGBTOYROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_SSSE3; } } #endif -#if defined(HAS_ARGBTOUVROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVRow = ARGBToUVRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_SSSE3; } } #endif @@ -4750,11 +4654,9 @@ int ARGB4444ToI420(const uint8_t* src_argb4444, } } #endif -#if defined(HAS_ARGB4444TOARGBROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2; if (IS_ALIGNED(width, 8)) { - ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2; } } #endif @@ -4782,19 +4684,15 @@ int ARGB4444ToI420(const uint8_t* src_argb4444, } } #endif -#if defined(HAS_ARGBTOYROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_SSSE3; } } #endif -#if defined(HAS_ARGBTOUVROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVRow = ARGBToUVRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_SSSE3; } } #endif @@ -4917,11 +4815,9 @@ int RGB24ToJ400(const uint8_t* src_rgb24, RGB24ToARGBRow_C; void (*ARGBToYMatrixRow)(const uint8_t* src_argb, uint8_t* dst_y, int width, const struct ArgbConstants* c) = ARGBToYMatrixRow_C; -#if defined(HAS_ARGBTOYMATRIXROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_SSSE3; } } #endif @@ -4993,11 +4889,9 @@ int RGB24ToJ400(const uint8_t* src_rgb24, height = 1; src_stride_rgb24 = dst_stride_yj = 0; } -#if defined(HAS_RGB24TOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - RGB24ToARGBRow = RGB24ToARGBRow_SSSE3; } } #endif @@ -5082,11 +4976,9 @@ int RAWToJ400(const uint8_t* src_raw, RAWToARGBRow_C; void (*ARGBToYMatrixRow)(const uint8_t* src_argb, uint8_t* dst_y, int width, const struct ArgbConstants* c) = ARGBToYMatrixRow_C; -#if defined(HAS_ARGBTOYMATRIXROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_SSSE3; } } #endif @@ -5160,11 +5052,9 @@ int RAWToJ400(const uint8_t* src_raw, src_stride_raw = dst_stride_yj = 0; } -#if defined(HAS_RAWTOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - RAWToARGBRow = RAWToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - RAWToARGBRow = RAWToARGBRow_SSSE3; } } #endif diff --git a/source/convert_argb.cc b/source/convert_argb.cc index 7672a6692..c66b5bb30 100644 --- a/source/convert_argb.cc +++ b/source/convert_argb.cc @@ -76,11 +76,9 @@ int I420ToARGBMatrix(const uint8_t* src_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } -#if defined(HAS_I422TOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToARGBRow = I422ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I422ToARGBRow = I422ToARGBRow_SSSE3; } } #endif @@ -332,11 +330,9 @@ int I422ToARGBMatrix(const uint8_t* src_y, height = 1; src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0; } -#if defined(HAS_I422TOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToARGBRow = I422ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I422ToARGBRow = I422ToARGBRow_SSSE3; } } #endif @@ -586,11 +582,9 @@ int I444ToARGBMatrix(const uint8_t* src_y, height = 1; src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0; } -#if defined(HAS_I444TOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I444ToARGBRow = I444ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I444ToARGBRow = I444ToARGBRow_SSSE3; } } #endif @@ -823,11 +817,9 @@ int I444ToRGB24Matrix(const uint8_t* src_y, height = 1; src_stride_y = src_stride_u = src_stride_v = dst_stride_rgb24 = 0; } -#if defined(HAS_I444TORGB24ROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I444ToRGB24Row = I444ToRGB24Row_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - I444ToRGB24Row = I444ToRGB24Row_SSSE3; } } #endif @@ -965,11 +957,9 @@ int I010ToAR30Matrix(const uint16_t* src_y, I210ToAR30Row = I210ToAR30Row_SME; } #endif -#if defined(HAS_I210TOAR30ROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I210ToAR30Row = I210ToAR30Row_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I210ToAR30Row = I210ToAR30Row_SSSE3; } } #endif @@ -1125,11 +1115,9 @@ int I012ToAR30Matrix(const uint16_t* src_y, dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; dst_stride_ar30 = -dst_stride_ar30; } -#if defined(HAS_I212TOAR30ROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I212ToAR30Row = I212ToAR30Row_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I212ToAR30Row = I212ToAR30Row_SSSE3; } } #endif @@ -1219,11 +1207,9 @@ int I210ToAR30Matrix(const uint16_t* src_y, I210ToAR30Row = I210ToAR30Row_SME; } #endif -#if defined(HAS_I210TOAR30ROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I210ToAR30Row = I210ToAR30Row_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I210ToAR30Row = I210ToAR30Row_SSSE3; } } #endif @@ -1392,11 +1378,9 @@ int I410ToAR30Matrix(const uint16_t* src_y, I410ToAR30Row = I410ToAR30Row_SME; } #endif -#if defined(HAS_I410TOAR30ROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I410ToAR30Row = I410ToAR30Row_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I410ToAR30Row = I410ToAR30Row_SSSE3; } } #endif @@ -1446,11 +1430,9 @@ int I010ToARGBMatrix(const uint16_t* src_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } -#if defined(HAS_I210TOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I210ToARGBRow = I210ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I210ToARGBRow = I210ToARGBRow_SSSE3; } } #endif @@ -1628,11 +1610,9 @@ int I012ToARGBMatrix(const uint16_t* src_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } -#if defined(HAS_I212TOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I212ToARGBRow = I212ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I212ToARGBRow = I212ToARGBRow_SSSE3; } } #endif @@ -1702,11 +1682,9 @@ int I210ToARGBMatrix(const uint16_t* src_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } -#if defined(HAS_I210TOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I210ToARGBRow = I210ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I210ToARGBRow = I210ToARGBRow_SSSE3; } } #endif @@ -1881,11 +1859,9 @@ int I410ToARGBMatrix(const uint16_t* src_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } -#if defined(HAS_I410TOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I410ToARGBRow = I410ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I410ToARGBRow = I410ToARGBRow_SSSE3; } } #endif @@ -1949,11 +1925,9 @@ int P010ToARGBMatrix(const uint16_t* src_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } -#if defined(HAS_P210TOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - P210ToARGBRow = P210ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - P210ToARGBRow = P210ToARGBRow_SSSE3; } } #endif @@ -2018,11 +1992,9 @@ int P210ToARGBMatrix(const uint16_t* src_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } -#if defined(HAS_P210TOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - P210ToARGBRow = P210ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - P210ToARGBRow = P210ToARGBRow_SSSE3; } } #endif @@ -2085,11 +2057,9 @@ int P010ToAR30Matrix(const uint16_t* src_y, dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; dst_stride_ar30 = -dst_stride_ar30; } -#if defined(HAS_P210TOAR30ROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - P210ToAR30Row = P210ToAR30Row_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - P210ToAR30Row = P210ToAR30Row_SSSE3; } } #endif @@ -2154,11 +2124,9 @@ int P210ToAR30Matrix(const uint16_t* src_y, dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; dst_stride_ar30 = -dst_stride_ar30; } -#if defined(HAS_P210TOAR30ROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - P210ToAR30Row = P210ToAR30Row_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - P210ToAR30Row = P210ToAR30Row_SSSE3; } } #endif @@ -2232,11 +2200,9 @@ int I420AlphaToARGBMatrix(const uint8_t* src_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } -#if defined(HAS_I422ALPHATOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I422AlphaToARGBRow = I422AlphaToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I422AlphaToARGBRow = I422AlphaToARGBRow_SSSE3; } } #endif @@ -2287,11 +2253,9 @@ int I420AlphaToARGBMatrix(const uint8_t* src_y, I422AlphaToARGBRow = I422AlphaToARGBRow_RVV; } #endif -#if defined(HAS_ARGBATTENUATEROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; if (IS_ALIGNED(width, 4)) { - ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; } } #endif @@ -2385,11 +2349,9 @@ int I422AlphaToARGBMatrix(const uint8_t* src_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } -#if defined(HAS_I422ALPHATOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I422AlphaToARGBRow = I422AlphaToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I422AlphaToARGBRow = I422AlphaToARGBRow_SSSE3; } } #endif @@ -2440,11 +2402,9 @@ int I422AlphaToARGBMatrix(const uint8_t* src_y, I422AlphaToARGBRow = I422AlphaToARGBRow_RVV; } #endif -#if defined(HAS_ARGBATTENUATEROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; if (IS_ALIGNED(width, 4)) { - ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; } } #endif @@ -2536,11 +2496,9 @@ int I444AlphaToARGBMatrix(const uint8_t* src_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } -#if defined(HAS_I444ALPHATOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I444AlphaToARGBRow = I444AlphaToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I444AlphaToARGBRow = I444AlphaToARGBRow_SSSE3; } } #endif @@ -2575,11 +2533,9 @@ int I444AlphaToARGBMatrix(const uint8_t* src_y, I444AlphaToARGBRow = I444AlphaToARGBRow_RVV; } #endif -#if defined(HAS_ARGBATTENUATEROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; if (IS_ALIGNED(width, 4)) { - ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; } } #endif @@ -2818,11 +2774,9 @@ int I010AlphaToARGBMatrix(const uint16_t* src_y, I210AlphaToARGBRow = I210AlphaToARGBRow_SME; } #endif -#if defined(HAS_I210ALPHATOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I210AlphaToARGBRow = I210AlphaToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I210AlphaToARGBRow = I210AlphaToARGBRow_SSSE3; } } #endif @@ -2834,11 +2788,9 @@ int I010AlphaToARGBMatrix(const uint16_t* src_y, } } #endif -#if defined(HAS_ARGBATTENUATEROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; if (IS_ALIGNED(width, 4)) { - ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; } } #endif @@ -2950,11 +2902,9 @@ int I210AlphaToARGBMatrix(const uint16_t* src_y, I210AlphaToARGBRow = I210AlphaToARGBRow_SME; } #endif -#if defined(HAS_I210ALPHATOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I210AlphaToARGBRow = I210AlphaToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I210AlphaToARGBRow = I210AlphaToARGBRow_SSSE3; } } #endif @@ -2966,11 +2916,9 @@ int I210AlphaToARGBMatrix(const uint16_t* src_y, } } #endif -#if defined(HAS_ARGBATTENUATEROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; if (IS_ALIGNED(width, 4)) { - ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; } } #endif @@ -3080,11 +3028,9 @@ int I410AlphaToARGBMatrix(const uint16_t* src_y, I410AlphaToARGBRow = I410AlphaToARGBRow_SME; } #endif -#if defined(HAS_I410ALPHATOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I410AlphaToARGBRow = I410AlphaToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I410AlphaToARGBRow = I410AlphaToARGBRow_SSSE3; } } #endif @@ -3096,11 +3042,9 @@ int I410AlphaToARGBMatrix(const uint16_t* src_y, } } #endif -#if defined(HAS_ARGBATTENUATEROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; if (IS_ALIGNED(width, 4)) { - ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; } } #endif @@ -3186,11 +3130,9 @@ int I400ToARGBMatrix(const uint8_t* src_y, height = 1; src_stride_y = dst_stride_argb = 0; } -#if defined(HAS_I400TOARGBROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - I400ToARGBRow = I400ToARGBRow_Any_SSE2; if (IS_ALIGNED(width, 8)) { - I400ToARGBRow = I400ToARGBRow_SSE2; } } #endif @@ -3280,11 +3222,9 @@ int J400ToARGB(const uint8_t* src_y, height = 1; src_stride_y = dst_stride_argb = 0; } -#if defined(HAS_J400TOARGBROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - J400ToARGBRow = J400ToARGBRow_Any_SSE2; if (IS_ALIGNED(width, 8)) { - J400ToARGBRow = J400ToARGBRow_SSE2; } } #endif @@ -3630,11 +3570,9 @@ int RGB24ToARGB(const uint8_t* src_rgb24, height = 1; src_stride_rgb24 = dst_stride_argb = 0; } -#if defined(HAS_RGB24TOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - RGB24ToARGBRow = RGB24ToARGBRow_SSSE3; } } #endif @@ -3722,11 +3660,9 @@ int RAWToARGB(const uint8_t* src_raw, height = 1; src_stride_raw = dst_stride_argb = 0; } -#if defined(HAS_RAWTOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - RAWToARGBRow = RAWToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - RAWToARGBRow = RAWToARGBRow_SSSE3; } } #endif @@ -3815,11 +3751,9 @@ int RAWToRGBA(const uint8_t* src_raw, height = 1; src_stride_raw = dst_stride_rgba = 0; } -#if defined(HAS_RAWTORGBAROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - RAWToRGBARow = RAWToRGBARow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - RAWToRGBARow = RAWToRGBARow_SSSE3; } } #endif @@ -3876,11 +3810,9 @@ int RGB565ToARGB(const uint8_t* src_rgb565, height = 1; src_stride_rgb565 = dst_stride_argb = 0; } -#if defined(HAS_RGB565TOARGBROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2; if (IS_ALIGNED(width, 8)) { - RGB565ToARGBRow = RGB565ToARGBRow_SSE2; } } #endif @@ -3951,11 +3883,9 @@ int ARGB1555ToARGB(const uint8_t* src_argb1555, height = 1; src_stride_argb1555 = dst_stride_argb = 0; } -#if defined(HAS_ARGB1555TOARGBROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2; if (IS_ALIGNED(width, 8)) { - ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2; } } #endif @@ -4031,11 +3961,9 @@ int ARGB4444ToARGB(const uint8_t* src_argb4444, height = 1; src_stride_argb4444 = dst_stride_argb = 0; } -#if defined(HAS_ARGB4444TOARGBROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2; if (IS_ALIGNED(width, 8)) { - ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2; } } #endif @@ -4202,11 +4130,9 @@ int AR64ToARGB(const uint16_t* src_ar64, height = 1; src_stride_ar64 = dst_stride_argb = 0; } -#if defined(HAS_AR64TOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - AR64ToARGBRow = AR64ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 4)) { - AR64ToARGBRow = AR64ToARGBRow_SSSE3; } } #endif @@ -4266,11 +4192,9 @@ int AB64ToARGB(const uint16_t* src_ab64, height = 1; src_stride_ab64 = dst_stride_argb = 0; } -#if defined(HAS_AB64TOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - AB64ToARGBRow = AB64ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 4)) { - AB64ToARGBRow = AB64ToARGBRow_SSSE3; } } #endif @@ -4329,11 +4253,9 @@ int NV12ToARGBMatrix(const uint8_t* src_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } -#if defined(HAS_NV12TOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - NV12ToARGBRow = NV12ToARGBRow_SSSE3; } } #endif @@ -4421,11 +4343,9 @@ int NV21ToARGBMatrix(const uint8_t* src_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } -#if defined(HAS_NV21TOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - NV21ToARGBRow = NV21ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - NV21ToARGBRow = NV21ToARGBRow_SSSE3; } } #endif @@ -4590,11 +4510,9 @@ int NV12ToRGB24Matrix(const uint8_t* src_y, NV12ToRGB24Row = NV12ToRGB24Row_SME; } #endif -#if defined(HAS_NV12TORGB24ROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - NV12ToRGB24Row = NV12ToRGB24Row_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - NV12ToRGB24Row = NV12ToRGB24Row_SSSE3; } } #endif @@ -4666,11 +4584,9 @@ int NV21ToRGB24Matrix(const uint8_t* src_y, NV21ToRGB24Row = NV21ToRGB24Row_SME; } #endif -#if defined(HAS_NV21TORGB24ROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - NV21ToRGB24Row = NV21ToRGB24Row_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - NV21ToRGB24Row = NV21ToRGB24Row_SSSE3; } } #endif @@ -4786,11 +4702,9 @@ int NV21ToYUV24(const uint8_t* src_y, } } #endif -#if defined(HAS_NV21TOYUV24ROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - NV21ToYUV24Row = NV21ToYUV24Row_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - NV21ToYUV24Row = NV21ToYUV24Row_SSSE3; } } #endif @@ -4841,11 +4755,9 @@ int YUY2ToARGBMatrix(const uint8_t* src_yuy2, height = 1; src_stride_yuy2 = dst_stride_argb = 0; } -#if defined(HAS_YUY2TOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - YUY2ToARGBRow = YUY2ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - YUY2ToARGBRow = YUY2ToARGBRow_SSSE3; } } #endif @@ -4931,11 +4843,9 @@ int UYVYToARGBMatrix(const uint8_t* src_uyvy, height = 1; src_stride_uyvy = dst_stride_argb = 0; } -#if defined(HAS_UYVYTOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - UYVYToARGBRow = UYVYToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - UYVYToARGBRow = UYVYToARGBRow_SSSE3; } } #endif @@ -5140,11 +5050,9 @@ int I422ToRGBAMatrix(const uint8_t* src_y, dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba; dst_stride_rgba = -dst_stride_rgba; } -#if defined(HAS_I422TORGBAROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToRGBARow = I422ToRGBARow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I422ToRGBARow = I422ToRGBARow_SSSE3; } } #endif @@ -5267,11 +5175,9 @@ int NV12ToRGB565Matrix(const uint8_t* src_y, dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565; dst_stride_rgb565 = -dst_stride_rgb565; } -#if defined(HAS_NV12TORGB565ROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - NV12ToRGB565Row = NV12ToRGB565Row_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - NV12ToRGB565Row = NV12ToRGB565Row_SSSE3; } } #endif @@ -5362,11 +5268,9 @@ int I420ToRGBAMatrix(const uint8_t* src_y, dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba; dst_stride_rgba = -dst_stride_rgba; } -#if defined(HAS_I422TORGBAROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToRGBARow = I422ToRGBARow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I422ToRGBARow = I422ToRGBARow_SSSE3; } } #endif @@ -5494,11 +5398,9 @@ int I420ToRGB24Matrix(const uint8_t* src_y, dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24; dst_stride_rgb24 = -dst_stride_rgb24; } -#if defined(HAS_I422TORGB24ROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - I422ToRGB24Row = I422ToRGB24Row_SSSE3; } } #endif @@ -5698,11 +5600,9 @@ int I422ToRGB24Matrix(const uint8_t* src_y, dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24; dst_stride_rgb24 = -dst_stride_rgb24; } -#if defined(HAS_I422TORGB24ROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - I422ToRGB24Row = I422ToRGB24Row_SSSE3; } } #endif @@ -5827,11 +5727,9 @@ int I420ToARGB1555(const uint8_t* src_y, dst_argb1555 = dst_argb1555 + (height - 1) * dst_stride_argb1555; dst_stride_argb1555 = -dst_stride_argb1555; } -#if defined(HAS_I422TOARGB1555ROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToARGB1555Row = I422ToARGB1555Row_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I422ToARGB1555Row = I422ToARGB1555Row_SSSE3; } } #endif @@ -5918,11 +5816,9 @@ int I420ToARGB4444(const uint8_t* src_y, dst_argb4444 = dst_argb4444 + (height - 1) * dst_stride_argb4444; dst_stride_argb4444 = -dst_stride_argb4444; } -#if defined(HAS_I422TOARGB4444ROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToARGB4444Row = I422ToARGB4444Row_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I422ToARGB4444Row = I422ToARGB4444Row_SSSE3; } } #endif @@ -6010,11 +5906,9 @@ int I420ToRGB565Matrix(const uint8_t* src_y, dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565; dst_stride_rgb565 = -dst_stride_rgb565; } -#if defined(HAS_I422TORGB565ROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I422ToRGB565Row = I422ToRGB565Row_SSSE3; } } #endif @@ -6152,11 +6046,9 @@ int I422ToRGB565Matrix(const uint8_t* src_y, dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565; dst_stride_rgb565 = -dst_stride_rgb565; } -#if defined(HAS_I422TORGB565ROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I422ToRGB565Row = I422ToRGB565Row_SSSE3; } } #endif @@ -6268,11 +6160,9 @@ int I420ToRGB565Dither(const uint8_t* src_y, if (!dither4x4) { dither4x4 = kDither565_4x4; } -#if defined(HAS_I422TOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToARGBRow = I422ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I422ToARGBRow = I422ToARGBRow_SSSE3; } } #endif @@ -6332,11 +6222,9 @@ int I420ToRGB565Dither(const uint8_t* src_y, I422ToARGBRow = I422ToARGBRow_RVV; } #endif -#if defined(HAS_ARGBTORGB565DITHERROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2; if (IS_ALIGNED(width, 4)) { - ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_SSE2; } } #endif @@ -6429,11 +6317,9 @@ int I420ToAR30Matrix(const uint8_t* src_y, dst_stride_ar30 = -dst_stride_ar30; } -#if defined(HAS_I422TOAR30ROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToAR30Row = I422ToAR30Row_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I422ToAR30Row = I422ToAR30Row_SSSE3; } } #endif @@ -6575,11 +6461,9 @@ static int I420ToARGBMatrixBilinear(const uint8_t* src_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } -#if defined(HAS_I444TOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I444ToARGBRow = I444ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I444ToARGBRow = I444ToARGBRow_SSSE3; } } #endif @@ -6623,17 +6507,13 @@ static int I420ToARGBMatrixBilinear(const uint8_t* src_y, } #endif -#if defined(HAS_SCALEROWUP2_BILINEAR_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_SSE2; - ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSE2; } #endif -#if defined(HAS_SCALEROWUP2_BILINEAR_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_SSSE3; - ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSSE3; } #endif @@ -6724,11 +6604,9 @@ static int I422ToARGBMatrixLinear(const uint8_t* src_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } -#if defined(HAS_I444TOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I444ToARGBRow = I444ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I444ToARGBRow = I444ToARGBRow_SSSE3; } } #endif @@ -6771,14 +6649,12 @@ static int I422ToARGBMatrixLinear(const uint8_t* src_y, I444ToARGBRow = I444ToARGBRow_RVV; } #endif -#if defined(HAS_SCALEROWUP2_LINEAR_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSE2; } #endif -#if defined(HAS_SCALEROWUP2_LINEAR_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSSE3; } #endif #if defined(HAS_SCALEROWUP2_LINEAR_AVX2) @@ -6850,11 +6726,9 @@ static int I420ToRGB24MatrixBilinear(const uint8_t* src_y, dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24; dst_stride_rgb24 = -dst_stride_rgb24; } -#if defined(HAS_I444TORGB24ROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I444ToRGB24Row = I444ToRGB24Row_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - I444ToRGB24Row = I444ToRGB24Row_SSSE3; } } #endif @@ -6898,17 +6772,13 @@ static int I420ToRGB24MatrixBilinear(const uint8_t* src_y, } #endif -#if defined(HAS_SCALEROWUP2_BILINEAR_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_SSE2; - ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSE2; } #endif -#if defined(HAS_SCALEROWUP2_BILINEAR_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_SSSE3; - ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSSE3; } #endif @@ -7020,11 +6890,9 @@ static int I010ToAR30MatrixBilinear(const uint16_t* src_y, I410ToAR30Row = I410ToAR30Row_SME; } #endif -#if defined(HAS_I410TOAR30ROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I410ToAR30Row = I410ToAR30Row_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I410ToAR30Row = I410ToAR30Row_SSSE3; } } #endif @@ -7037,10 +6905,8 @@ static int I010ToAR30MatrixBilinear(const uint16_t* src_y, } #endif -#if defined(HAS_SCALEROWUP2_BILINEAR_12_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - Scale2RowUp_Bilinear_12 = ScaleRowUp2_Bilinear_12_Any_SSSE3; - ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_SSSE3; } #endif @@ -7144,11 +7010,9 @@ static int I210ToAR30MatrixLinear(const uint16_t* src_y, I410ToAR30Row = I410ToAR30Row_SME; } #endif -#if defined(HAS_I410TOAR30ROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I410ToAR30Row = I410ToAR30Row_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I410ToAR30Row = I410ToAR30Row_SSSE3; } } #endif @@ -7161,9 +7025,8 @@ static int I210ToAR30MatrixLinear(const uint16_t* src_y, } #endif -#if defined(HAS_SCALEROWUP2_LINEAR_12_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_SSSE3; } #endif #if defined(HAS_SCALEROWUP2_LINEAR_12_AVX2) @@ -7229,11 +7092,9 @@ static int I010ToARGBMatrixBilinear(const uint16_t* src_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } -#if defined(HAS_I410TOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I410ToARGBRow = I410ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I410ToARGBRow = I410ToARGBRow_SSSE3; } } #endif @@ -7264,10 +7125,8 @@ static int I010ToARGBMatrixBilinear(const uint16_t* src_y, } #endif -#if defined(HAS_SCALEROWUP2_BILINEAR_12_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - Scale2RowUp_Bilinear_12 = ScaleRowUp2_Bilinear_12_Any_SSSE3; - ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_SSSE3; } #endif @@ -7352,11 +7211,9 @@ static int I210ToARGBMatrixLinear(const uint16_t* src_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } -#if defined(HAS_I410TOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I410ToARGBRow = I410ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I410ToARGBRow = I410ToARGBRow_SSSE3; } } #endif @@ -7387,9 +7244,8 @@ static int I210ToARGBMatrixLinear(const uint16_t* src_y, } #endif -#if defined(HAS_SCALEROWUP2_LINEAR_12_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_SSSE3; } #endif #if defined(HAS_SCALEROWUP2_LINEAR_12_AVX2) @@ -7464,11 +7320,9 @@ static int I420AlphaToARGBMatrixBilinear( dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } -#if defined(HAS_I444ALPHATOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I444AlphaToARGBRow = I444AlphaToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I444AlphaToARGBRow = I444AlphaToARGBRow_SSSE3; } } #endif @@ -7511,11 +7365,9 @@ static int I420AlphaToARGBMatrixBilinear( I444AlphaToARGBRow = I444AlphaToARGBRow_RVV; } #endif -#if defined(HAS_ARGBATTENUATEROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; if (IS_ALIGNED(width, 4)) { - ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; } } #endif @@ -7557,17 +7409,13 @@ static int I420AlphaToARGBMatrixBilinear( } #endif -#if defined(HAS_SCALEROWUP2_BILINEAR_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_SSE2; - ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSE2; } #endif -#if defined(HAS_SCALEROWUP2_BILINEAR_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_SSSE3; - ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSSE3; } #endif @@ -7684,11 +7532,9 @@ static int I422AlphaToARGBMatrixLinear(const uint8_t* src_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } -#if defined(HAS_I444ALPHATOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I444AlphaToARGBRow = I444AlphaToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I444AlphaToARGBRow = I444AlphaToARGBRow_SSSE3; } } #endif @@ -7731,11 +7577,9 @@ static int I422AlphaToARGBMatrixLinear(const uint8_t* src_y, I444AlphaToARGBRow = I444AlphaToARGBRow_RVV; } #endif -#if defined(HAS_ARGBATTENUATEROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; if (IS_ALIGNED(width, 4)) { - ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; } } #endif @@ -7777,14 +7621,12 @@ static int I422AlphaToARGBMatrixLinear(const uint8_t* src_y, } #endif -#if defined(HAS_SCALEROWUP2_LINEAR_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSE2; } #endif -#if defined(HAS_SCALEROWUP2_LINEAR_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSSE3; } #endif #if defined(HAS_SCALEROWUP2_LINEAR_AVX2) @@ -7887,11 +7729,9 @@ static int I010AlphaToARGBMatrixBilinear( I410AlphaToARGBRow = I410AlphaToARGBRow_SME; } #endif -#if defined(HAS_I410ALPHATOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I410AlphaToARGBRow = I410AlphaToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I410AlphaToARGBRow = I410AlphaToARGBRow_SSSE3; } } #endif @@ -7903,11 +7743,9 @@ static int I010AlphaToARGBMatrixBilinear( } } #endif -#if defined(HAS_ARGBATTENUATEROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; if (IS_ALIGNED(width, 4)) { - ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; } } #endif @@ -7949,10 +7787,8 @@ static int I010AlphaToARGBMatrixBilinear( } #endif -#if defined(HAS_SCALEROWUP2_BILINEAR_12_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - Scale2RowUp_Bilinear_12 = ScaleRowUp2_Bilinear_12_Any_SSSE3; - ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_SSSE3; } #endif @@ -8081,11 +7917,9 @@ static int I210AlphaToARGBMatrixLinear(const uint16_t* src_y, I410AlphaToARGBRow = I410AlphaToARGBRow_SME; } #endif -#if defined(HAS_I410ALPHATOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I410AlphaToARGBRow = I410AlphaToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I410AlphaToARGBRow = I410AlphaToARGBRow_SSSE3; } } #endif @@ -8097,11 +7931,9 @@ static int I210AlphaToARGBMatrixLinear(const uint16_t* src_y, } } #endif -#if defined(HAS_ARGBATTENUATEROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; if (IS_ALIGNED(width, 4)) { - ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; } } #endif @@ -8143,9 +7975,8 @@ static int I210AlphaToARGBMatrixLinear(const uint16_t* src_y, } #endif -#if defined(HAS_SCALEROWUP2_LINEAR_12_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ScaleRowUp2_Linear = ScaleRowUp2_Linear_12_Any_SSSE3; } #endif #if defined(HAS_SCALEROWUP2_LINEAR_12_AVX2) @@ -8211,11 +8042,9 @@ static int P010ToARGBMatrixBilinear(const uint16_t* src_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } -#if defined(HAS_P410TOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - P410ToARGBRow = P410ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - P410ToARGBRow = P410ToARGBRow_SSSE3; } } #endif @@ -8246,9 +8075,8 @@ static int P010ToARGBMatrixBilinear(const uint16_t* src_y, } #endif -#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_SSE41 +#if 0 if (TestCpuFlag(kCpuHasSSE41)) { - Scale2RowUp_Bilinear_16 = ScaleUVRowUp2_Bilinear_16_Any_SSE41; } #endif @@ -8322,11 +8150,9 @@ static int P210ToARGBMatrixLinear(const uint16_t* src_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } -#if defined(HAS_P410TOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - P410ToARGBRow = P410ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - P410ToARGBRow = P410ToARGBRow_SSSE3; } } #endif @@ -8357,9 +8183,8 @@ static int P210ToARGBMatrixLinear(const uint16_t* src_y, } #endif -#ifdef HAS_SCALEUVROWUP2_LINEAR_16_SSE41 +#if 0 if (TestCpuFlag(kCpuHasSSE41)) { - ScaleRowUp2_Linear = ScaleUVRowUp2_Linear_16_Any_SSE41; } #endif @@ -8419,11 +8244,9 @@ static int P010ToAR30MatrixBilinear(const uint16_t* src_y, dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; dst_stride_ar30 = -dst_stride_ar30; } -#if defined(HAS_P410TOAR30ROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - P410ToAR30Row = P410ToAR30Row_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - P410ToAR30Row = P410ToAR30Row_SSSE3; } } #endif @@ -8454,9 +8277,8 @@ static int P010ToAR30MatrixBilinear(const uint16_t* src_y, } #endif -#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_SSE41 +#if 0 if (TestCpuFlag(kCpuHasSSE41)) { - Scale2RowUp_Bilinear_16 = ScaleUVRowUp2_Bilinear_16_Any_SSE41; } #endif @@ -8530,11 +8352,9 @@ static int P210ToAR30MatrixLinear(const uint16_t* src_y, dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; dst_stride_ar30 = -dst_stride_ar30; } -#if defined(HAS_P410TOAR30ROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - P410ToAR30Row = P410ToAR30Row_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - P410ToAR30Row = P410ToAR30Row_SSSE3; } } #endif @@ -8565,9 +8385,8 @@ static int P210ToAR30MatrixLinear(const uint16_t* src_y, } #endif -#ifdef HAS_SCALEUVROWUP2_LINEAR_16_SSE41 +#if 0 if (TestCpuFlag(kCpuHasSSE41)) { - ScaleRowUp2_Linear = ScaleUVRowUp2_Linear_16_Any_SSE41; } #endif @@ -8629,11 +8448,9 @@ static int I422ToRGB24MatrixLinear(const uint8_t* src_y, dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24; dst_stride_rgb24 = -dst_stride_rgb24; } -#if defined(HAS_I444TORGB24ROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I444ToRGB24Row = I444ToRGB24Row_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - I444ToRGB24Row = I444ToRGB24Row_SSSE3; } } #endif @@ -8668,14 +8485,12 @@ static int I422ToRGB24MatrixLinear(const uint8_t* src_y, I444ToRGB24Row = I444ToRGB24Row_RVV; } #endif -#if defined(HAS_SCALEROWUP2_LINEAR_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSE2; } #endif -#if defined(HAS_SCALEROWUP2_LINEAR_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSSE3; } #endif #if defined(HAS_SCALEROWUP2_LINEAR_AVX2) diff --git a/source/convert_from.cc b/source/convert_from.cc index 5cf88fa2d..161b6cf53 100644 --- a/source/convert_from.cc +++ b/source/convert_from.cc @@ -362,11 +362,9 @@ int I422ToYUY2(const uint8_t* src_y, height = 1; src_stride_y = src_stride_u = src_stride_v = dst_stride_yuy2 = 0; } -#if defined(HAS_I422TOYUY2ROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - I422ToYUY2Row = I422ToYUY2Row_Any_SSE2; if (IS_ALIGNED(width, 16)) { - I422ToYUY2Row = I422ToYUY2Row_SSE2; } } #endif @@ -421,11 +419,9 @@ int I420ToYUY2(const uint8_t* src_y, dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2; dst_stride_yuy2 = -dst_stride_yuy2; } -#if defined(HAS_I422TOYUY2ROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - I422ToYUY2Row = I422ToYUY2Row_Any_SSE2; if (IS_ALIGNED(width, 16)) { - I422ToYUY2Row = I422ToYUY2Row_SSE2; } } #endif @@ -508,11 +504,9 @@ int I422ToUYVY(const uint8_t* src_y, height = 1; src_stride_y = src_stride_u = src_stride_v = dst_stride_uyvy = 0; } -#if defined(HAS_I422TOUYVYROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - I422ToUYVYRow = I422ToUYVYRow_Any_SSE2; if (IS_ALIGNED(width, 16)) { - I422ToUYVYRow = I422ToUYVYRow_SSE2; } } #endif @@ -583,11 +577,9 @@ int I420ToUYVY(const uint8_t* src_y, dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy; dst_stride_uyvy = -dst_stride_uyvy; } -#if defined(HAS_I422TOUYVYROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - I422ToUYVYRow = I422ToUYVYRow_Any_SSE2; if (IS_ALIGNED(width, 16)) { - I422ToUYVYRow = I422ToUYVYRow_SSE2; } } #endif diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc index 2c66611e6..262335936 100644 --- a/source/convert_from_argb.cc +++ b/source/convert_from_argb.cc @@ -52,11 +52,9 @@ int ARGBToI444(const uint8_t* src_argb, height = 1; src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0; } -#if defined(HAS_ARGBTOUV444ROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUV444Row = ARGBToUV444Row_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToUV444Row = ARGBToUV444Row_SSSE3; } } #endif @@ -108,11 +106,9 @@ int ARGBToI444(const uint8_t* src_argb, } } #endif -#if defined(HAS_ARGBTOYROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_SSSE3; } } #endif @@ -201,11 +197,9 @@ int ARGBToI444Matrix(const uint8_t* src_argb, const struct ArgbConstants* c) = ARGBToUV444MatrixRow_C; -#if defined(HAS_ARGBTOYMATRIXROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_SSSE3; } } #endif @@ -263,11 +257,9 @@ ARGBToUV444MatrixRow_C; } #endif -#if defined(HAS_ARGBTOUV444MATRIXROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUV444MatrixRow = ARGBToUV444MatrixRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToUV444MatrixRow = ARGBToUV444MatrixRow_SSSE3; } } #endif @@ -351,19 +343,15 @@ int ARGBToI422(const uint8_t* src_argb, height = 1; src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0; } -#if defined(HAS_ARGBTOYROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_SSSE3; } } #endif -#if defined(HAS_ARGBTOUVROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVRow = ARGBToUVRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_SSSE3; } } #endif @@ -512,11 +500,9 @@ int ARGBToI422Matrix(const uint8_t* src_argb, const struct ArgbConstants* c) = ARGBToUVMatrixRow_C; -#if defined(HAS_ARGBTOYMATRIXROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_SSSE3; } } #endif @@ -600,11 +586,9 @@ ARGBToUVMatrixRow_C; ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME; } #endif -#if defined(HAS_ARGBTOUVMATRIXROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_SSSE3; } } #endif @@ -713,19 +697,15 @@ int ARGBToNV12(const uint8_t* src_argb, } } #endif -#if defined(HAS_ARGBTOYROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_SSSE3; } } #endif -#if defined(HAS_ARGBTOUVROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVRow = ARGBToUVRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_SSSE3; } } #endif @@ -784,11 +764,9 @@ int ARGBToNV12(const uint8_t* src_argb, ARGBToYRow = ARGBToYRow_RVV; } #endif -#if defined(HAS_MERGEUVROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - MergeUVRow = MergeUVRow_Any_SSE2; if (IS_ALIGNED(halfwidth, 16)) { - MergeUVRow = MergeUVRow_SSE2; } } #endif @@ -879,11 +857,9 @@ int ARGBToNV12Matrix(const uint8_t* src_argb, const struct ArgbConstants* c) = ARGBToUVMatrixRow_C; -#if defined(HAS_ARGBTOYMATRIXROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_SSSE3; } } #endif @@ -967,11 +943,9 @@ ARGBToUVMatrixRow_C; ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME; } #endif -#if defined(HAS_ARGBTOUVMATRIXROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_SSSE3; } } #endif @@ -995,11 +969,9 @@ ARGBToUVMatrixRow_C; src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } -#if defined(HAS_MERGEUVROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - MergeUVRow = MergeUVRow_Any_SSE2; if (IS_ALIGNED(halfwidth, 16)) { - MergeUVRow = MergeUVRow_SSE2; } } #endif @@ -1100,19 +1072,15 @@ int ARGBToNV21(const uint8_t* src_argb, src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } -#if defined(HAS_ARGBTOYROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_SSSE3; } } #endif -#if defined(HAS_ARGBTOUVROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVRow = ARGBToUVRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_SSSE3; } } #endif @@ -1229,11 +1197,9 @@ int ARGBToNV21(const uint8_t* src_argb, ARGBToYRow = ARGBToYRow_RVV; } #endif -#if defined(HAS_MERGEUVROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - MergeUVRow = MergeUVRow_Any_SSE2; if (IS_ALIGNED(halfwidth, 16)) { - MergeUVRow = MergeUVRow_SSE2; } } #endif @@ -1332,19 +1298,15 @@ int ABGRToNV12(const uint8_t* src_abgr, src_abgr = src_abgr + (height - 1) * src_stride_abgr; src_stride_abgr = -src_stride_abgr; } -#if defined(HAS_ABGRTOYROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ABGRToYRow = ABGRToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ABGRToYRow = ABGRToYRow_SSSE3; } } #endif -#if defined(HAS_ABGRTOUVROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ABGRToUVRow = ABGRToUVRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ABGRToUVRow = ABGRToUVRow_SSSE3; } } #endif @@ -1449,11 +1411,9 @@ int ABGRToNV12(const uint8_t* src_abgr, ABGRToYRow = ABGRToYRow_RVV; } #endif -#if defined(HAS_MERGEUVROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - MergeUVRow = MergeUVRow_Any_SSE2; if (IS_ALIGNED(halfwidth, 16)) { - MergeUVRow = MergeUVRow_SSE2; } } #endif @@ -1553,19 +1513,15 @@ int ABGRToNV21(const uint8_t* src_abgr, src_abgr = src_abgr + (height - 1) * src_stride_abgr; src_stride_abgr = -src_stride_abgr; } -#if defined(HAS_ABGRTOYROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ABGRToYRow = ABGRToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ABGRToYRow = ABGRToYRow_SSSE3; } } #endif -#if defined(HAS_ABGRTOUVROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ABGRToUVRow = ABGRToUVRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ABGRToUVRow = ABGRToUVRow_SSSE3; } } #endif @@ -1662,11 +1618,9 @@ int ABGRToNV21(const uint8_t* src_abgr, ABGRToYRow = ABGRToYRow_RVV; } #endif -#if defined(HAS_MERGEUVROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - MergeUVRow = MergeUVRow_Any_SSE2; if (IS_ALIGNED(halfwidth, 16)) { - MergeUVRow = MergeUVRow_SSE2; } } #endif @@ -1771,19 +1725,15 @@ int ARGBToYUY2(const uint8_t* src_argb, height = 1; src_stride_argb = dst_stride_yuy2 = 0; } -#if defined(HAS_ARGBTOYROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_SSSE3; } } #endif -#if defined(HAS_ARGBTOUVROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVRow = ARGBToUVRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_SSSE3; } } #endif @@ -1900,11 +1850,9 @@ int ARGBToYUY2(const uint8_t* src_argb, ARGBToYRow = ARGBToYRow_RVV; } #endif -#if defined(HAS_I422TOYUY2ROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - I422ToYUY2Row = I422ToYUY2Row_Any_SSE2; if (IS_ALIGNED(width, 16)) { - I422ToYUY2Row = I422ToYUY2Row_SSE2; } } #endif @@ -1995,19 +1943,15 @@ int ARGBToUYVY(const uint8_t* src_argb, height = 1; src_stride_argb = dst_stride_uyvy = 0; } -#if defined(HAS_ARGBTOYROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_SSSE3; } } #endif -#if defined(HAS_ARGBTOUVROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVRow = ARGBToUVRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_SSSE3; } } #endif @@ -2124,11 +2068,9 @@ int ARGBToUYVY(const uint8_t* src_argb, ARGBToYRow = ARGBToYRow_RVV; } #endif -#if defined(HAS_I422TOUYVYROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - I422ToUYVYRow = I422ToUYVYRow_Any_SSE2; if (IS_ALIGNED(width, 16)) { - I422ToUYVYRow = I422ToUYVYRow_SSE2; } } #endif @@ -2211,11 +2153,9 @@ int ARGBToI400(const uint8_t* src_argb, height = 1; src_stride_argb = dst_stride_y = 0; } -#if defined(HAS_ARGBTOYROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_SSSE3; } } #endif @@ -2364,11 +2304,9 @@ int ARGBToRGB24(const uint8_t* src_argb, height = 1; src_stride_argb = dst_stride_rgb24 = 0; } -#if defined(HAS_ARGBTORGB24ROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToRGB24Row = ARGBToRGB24Row_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToRGB24Row = ARGBToRGB24Row_SSSE3; } } #endif @@ -2456,11 +2394,9 @@ int ARGBToRAW(const uint8_t* src_argb, height = 1; src_stride_argb = dst_stride_raw = 0; } -#if defined(HAS_ARGBTORAWROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToRAWRow = ARGBToRAWRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToRAWRow = ARGBToRAWRow_SSSE3; } } #endif @@ -2544,11 +2480,9 @@ int ARGBToRGB565Dither(const uint8_t* src_argb, if (!dither4x4) { dither4x4 = kDither565_4x4; } -#if defined(HAS_ARGBTORGB565DITHERROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2; if (IS_ALIGNED(width, 4)) { - ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_SSE2; } } #endif @@ -2626,11 +2560,9 @@ int ARGBToRGB565(const uint8_t* src_argb, height = 1; src_stride_argb = dst_stride_rgb565 = 0; } -#if defined(HAS_ARGBTORGB565ROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - ARGBToRGB565Row = ARGBToRGB565Row_Any_SSE2; if (IS_ALIGNED(width, 4)) { - ARGBToRGB565Row = ARGBToRGB565Row_SSE2; } } #endif @@ -2706,11 +2638,9 @@ int ARGBToARGB1555(const uint8_t* src_argb, height = 1; src_stride_argb = dst_stride_argb1555 = 0; } -#if defined(HAS_ARGBTOARGB1555ROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - ARGBToARGB1555Row = ARGBToARGB1555Row_Any_SSE2; if (IS_ALIGNED(width, 4)) { - ARGBToARGB1555Row = ARGBToARGB1555Row_SSE2; } } #endif @@ -2780,11 +2710,9 @@ int ARGBToARGB4444(const uint8_t* src_argb, height = 1; src_stride_argb = dst_stride_argb4444 = 0; } -#if defined(HAS_ARGBTOARGB4444ROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - ARGBToARGB4444Row = ARGBToARGB4444Row_Any_SSE2; if (IS_ALIGNED(width, 4)) { - ARGBToARGB4444Row = ARGBToARGB4444Row_SSE2; } } #endif @@ -2862,11 +2790,9 @@ int ABGRToAR30(const uint8_t* src_abgr, } } #endif -#if defined(HAS_ABGRTOAR30ROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ABGRToAR30Row = ABGRToAR30Row_Any_SSSE3; if (IS_ALIGNED(width, 4)) { - ABGRToAR30Row = ABGRToAR30Row_SSSE3; } } #endif @@ -2919,11 +2845,9 @@ int ARGBToAR30(const uint8_t* src_argb, } } #endif -#if defined(HAS_ARGBTOAR30ROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToAR30Row = ARGBToAR30Row_Any_SSSE3; if (IS_ALIGNED(width, 4)) { - ARGBToAR30Row = ARGBToAR30Row_SSSE3; } } #endif @@ -2975,11 +2899,9 @@ int ARGBToJ444(const uint8_t* src_argb, height = 1; src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0; } -#if defined(HAS_ARGBTOUVJ444ROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVJ444Row = ARGBToUVJ444Row_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToUVJ444Row = ARGBToUVJ444Row_SSSE3; } } #endif @@ -3031,11 +2953,9 @@ int ARGBToJ444(const uint8_t* src_argb, } } #endif -#if defined(HAS_ARGBTOYJROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYJRow = ARGBToYJRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYJRow = ARGBToYJRow_SSSE3; } } #endif @@ -3179,19 +3099,15 @@ int ARGBToJ420(const uint8_t* src_argb, } } #endif -#if defined(HAS_ARGBTOYJROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYJRow = ARGBToYJRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYJRow = ARGBToYJRow_SSSE3; } } #endif -#if defined(HAS_ARGBTOUVJROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToUVJRow = ARGBToUVJRow_SSSE3; } } #endif @@ -3295,19 +3211,15 @@ int ARGBToJ422(const uint8_t* src_argb, height = 1; src_stride_argb = dst_stride_yj = dst_stride_uj = dst_stride_vj = 0; } -#if defined(HAS_ARGBTOYJROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYJRow = ARGBToYJRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYJRow = ARGBToYJRow_SSSE3; } } #endif -#if defined(HAS_ARGBTOUVJROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToUVJRow = ARGBToUVJRow_SSSE3; } } #endif @@ -3445,11 +3357,9 @@ int ARGBToJ400(const uint8_t* src_argb, height = 1; src_stride_argb = dst_stride_yj = 0; } -#if defined(HAS_ARGBTOYJROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYJRow = ARGBToYJRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYJRow = ARGBToYJRow_SSSE3; } } #endif @@ -3516,11 +3426,9 @@ int RGBAToJ400(const uint8_t* src_rgba, height = 1; src_stride_rgba = dst_stride_yj = 0; } -#if defined(HAS_RGBATOYJROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - RGBAToYJRow = RGBAToYJRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - RGBAToYJRow = RGBAToYJRow_SSSE3; } } #endif @@ -3613,19 +3521,15 @@ int ABGRToJ420(const uint8_t* src_abgr, src_abgr = src_abgr + (height - 1) * src_stride_abgr; src_stride_abgr = -src_stride_abgr; } -#if defined(HAS_ABGRTOYJROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ABGRToYJRow = ABGRToYJRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ABGRToYJRow = ABGRToYJRow_SSSE3; } } #endif -#if defined(HAS_ABGRTOUVJROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ABGRToUVJRow = ABGRToUVJRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ABGRToUVJRow = ABGRToUVJRow_SSSE3; } } #endif @@ -3781,19 +3685,15 @@ int ABGRToJ422(const uint8_t* src_abgr, height = 1; src_stride_abgr = dst_stride_yj = dst_stride_uj = dst_stride_vj = 0; } -#if defined(HAS_ABGRTOYJROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ABGRToYJRow = ABGRToYJRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ABGRToYJRow = ABGRToYJRow_SSSE3; } } #endif -#if defined(HAS_ABGRTOUVJROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ABGRToUVJRow = ABGRToUVJRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ABGRToUVJRow = ABGRToUVJRow_SSSE3; } } #endif @@ -3927,11 +3827,9 @@ int ABGRToJ400(const uint8_t* src_abgr, height = 1; src_stride_abgr = dst_stride_yj = 0; } -#if defined(HAS_ABGRTOYJROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ABGRToYJRow = ABGRToYJRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ABGRToYJRow = ABGRToYJRow_SSSE3; } } #endif @@ -4015,11 +3913,9 @@ int ARGBToAR64(const uint8_t* src_argb, height = 1; src_stride_argb = dst_stride_ar64 = 0; } -#if defined(HAS_ARGBTOAR64ROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToAR64Row = ARGBToAR64Row_Any_SSSE3; if (IS_ALIGNED(width, 4)) { - ARGBToAR64Row = ARGBToAR64Row_SSSE3; } } #endif @@ -4079,11 +3975,9 @@ int ARGBToAB64(const uint8_t* src_argb, height = 1; src_stride_argb = dst_stride_ab64 = 0; } -#if defined(HAS_ARGBTOAB64ROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToAB64Row = ARGBToAB64Row_Any_SSSE3; if (IS_ALIGNED(width, 4)) { - ARGBToAB64Row = ARGBToAB64Row_SSSE3; } } #endif @@ -4140,11 +4034,9 @@ int RAWToNV21Matrix(const uint8_t* src_raw, const struct ArgbConstants* c) = ARGBToYMatrixRow_C; void (*MergeUVRow)(const uint8_t* src_uj, const uint8_t* src_vj, uint8_t* dst_vu, int width) = MergeUVRow_C; -#if defined(HAS_ARGBTOYMATRIXROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_SSSE3; } } #endif @@ -4213,11 +4105,9 @@ int RAWToNV21Matrix(const uint8_t* src_raw, src_stride_raw = -src_stride_raw; } -#if defined(HAS_RAWTOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - RAWToARGBRow = RAWToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - RAWToARGBRow = RAWToARGBRow_SSSE3; } } #endif @@ -4297,11 +4187,9 @@ int RAWToNV21Matrix(const uint8_t* src_raw, ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME; } #endif -#if defined(HAS_ARGBTOUVMATRIXROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_SSSE3; } } #endif @@ -4322,11 +4210,9 @@ int RAWToNV21Matrix(const uint8_t* src_raw, } #endif -#if defined(HAS_MERGEUVROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - MergeUVRow = MergeUVRow_Any_SSE2; if (IS_ALIGNED(halfwidth, 16)) { - MergeUVRow = MergeUVRow_SSE2; } } #endif diff --git a/source/planar_functions.cc b/source/planar_functions.cc index fde3717a4..20ccd6a57 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -52,7 +52,7 @@ void CopyPlane(const uint8_t* src_y, return; } -#if defined(HAS_COPYROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2; } @@ -148,11 +148,9 @@ void Convert16To8Plane(const uint16_t* src_y, Convert16To8Row = Convert16To8Row_SME; } #endif -#if defined(HAS_CONVERT16TO8ROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - Convert16To8Row = Convert16To8Row_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - Convert16To8Row = Convert16To8Row_SSSE3; } } #endif @@ -209,11 +207,9 @@ void Convert8To16Plane(const uint8_t* src_y, height = 1; src_stride_y = dst_stride_y = 0; } -#if defined(HAS_CONVERT8TO16ROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - Convert8To16Row = Convert8To16Row_Any_SSE2; if (IS_ALIGNED(width, 16)) { - Convert8To16Row = Convert8To16Row_SSE2; } } #endif @@ -607,11 +603,9 @@ void SplitUVPlane(const uint8_t* src_uv, height = 1; src_stride_uv = dst_stride_u = dst_stride_v = 0; } -#if defined(HAS_SPLITUVROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - SplitUVRow = SplitUVRow_Any_SSE2; if (IS_ALIGNED(width, 16)) { - SplitUVRow = SplitUVRow_SSE2; } } #endif @@ -682,11 +676,9 @@ void MergeUVPlane(const uint8_t* src_u, height = 1; src_stride_u = src_stride_v = dst_stride_uv = 0; } -#if defined(HAS_MERGEUVROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - MergeUVRow = MergeUVRow_Any_SSE2; if (IS_ALIGNED(width, 16)) { - MergeUVRow = MergeUVRow_SSE2; } } #endif @@ -1008,11 +1000,9 @@ void SwapUVPlane(const uint8_t* src_uv, src_stride_uv = dst_stride_vu = 0; } -#if defined(HAS_SWAPUVROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - SwapUVRow = SwapUVRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - SwapUVRow = SwapUVRow_SSSE3; } } #endif @@ -1108,11 +1098,9 @@ int DetilePlane(const uint8_t* src_y, dst_stride_y = -dst_stride_y; } -#if defined(HAS_DETILEROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - DetileRow = DetileRow_Any_SSE2; if (IS_ALIGNED(width, 16)) { - DetileRow = DetileRow_SSE2; } } #endif @@ -1165,11 +1153,9 @@ int DetilePlane_16(const uint16_t* src_y, dst_stride_y = -dst_stride_y; } -#if defined(HAS_DETILEROW_16_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - DetileRow_16 = DetileRow_16_Any_SSE2; if (IS_ALIGNED(width, 16)) { - DetileRow_16 = DetileRow_16_SSE2; } } #endif @@ -1234,11 +1220,9 @@ void DetileSplitUVPlane(const uint8_t* src_uv, dst_stride_v = -dst_stride_v; } -#if defined(HAS_DETILESPLITUVROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - DetileSplitUVRow = DetileSplitUVRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - DetileSplitUVRow = DetileSplitUVRow_SSSE3; } } #endif @@ -1305,11 +1289,9 @@ void DetileToYUY2(const uint8_t* src_y, } #endif -#if defined(HAS_DETILETOYUY2_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - DetileToYUY2 = DetileToYUY2_Any_SSE2; if (IS_ALIGNED(width, 16)) { - DetileToYUY2 = DetileToYUY2_SSE2; } } #endif @@ -1368,19 +1350,15 @@ void SplitRGBPlane(const uint8_t* src_rgb, height = 1; src_stride_rgb = dst_stride_r = dst_stride_g = dst_stride_b = 0; } -#if defined(HAS_SPLITRGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - SplitRGBRow = SplitRGBRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - SplitRGBRow = SplitRGBRow_SSSE3; } } #endif -#if defined(HAS_SPLITRGBROW_SSE41) +#if 0 if (TestCpuFlag(kCpuHasSSE41)) { - SplitRGBRow = SplitRGBRow_Any_SSE41; if (IS_ALIGNED(width, 16)) { - SplitRGBRow = SplitRGBRow_SSE41; } } #endif @@ -1448,11 +1426,9 @@ void MergeRGBPlane(const uint8_t* src_r, height = 1; src_stride_r = src_stride_g = src_stride_b = dst_stride_rgb = 0; } -#if defined(HAS_MERGERGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - MergeRGBRow = MergeRGBRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - MergeRGBRow = MergeRGBRow_SSSE3; } } #endif @@ -1511,19 +1487,15 @@ static void SplitARGBPlaneAlpha(const uint8_t* src_argb, dst_stride_a = 0; } -#if defined(HAS_SPLITARGBROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - SplitARGBRow = SplitARGBRow_Any_SSE2; if (IS_ALIGNED(width, 8)) { - SplitARGBRow = SplitARGBRow_SSE2; } } #endif -#if defined(HAS_SPLITARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - SplitARGBRow = SplitARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - SplitARGBRow = SplitARGBRow_SSSE3; } } #endif @@ -1585,19 +1557,15 @@ static void SplitARGBPlaneOpaque(const uint8_t* src_argb, src_stride_argb = dst_stride_r = dst_stride_g = dst_stride_b = 0; } -#if defined(HAS_SPLITXRGBROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - SplitXRGBRow = SplitXRGBRow_Any_SSE2; if (IS_ALIGNED(width, 8)) { - SplitXRGBRow = SplitXRGBRow_SSE2; } } #endif -#if defined(HAS_SPLITXRGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - SplitXRGBRow = SplitXRGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - SplitXRGBRow = SplitXRGBRow_SSSE3; } } #endif @@ -1698,11 +1666,9 @@ static void MergeARGBPlaneAlpha(const uint8_t* src_r, src_stride_r = src_stride_g = src_stride_b = src_stride_a = dst_stride_argb = 0; } -#if defined(HAS_MERGEARGBROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - MergeARGBRow = MergeARGBRow_Any_SSE2; if (IS_ALIGNED(width, 8)) { - MergeARGBRow = MergeARGBRow_SSE2; } } #endif @@ -1765,11 +1731,9 @@ static void MergeARGBPlaneOpaque(const uint8_t* src_r, height = 1; src_stride_r = src_stride_g = src_stride_b = dst_stride_argb = 0; } -#if defined(HAS_MERGEXRGBROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - MergeXRGBRow = MergeXRGBRow_Any_SSE2; if (IS_ALIGNED(width, 8)) { - MergeXRGBRow = MergeXRGBRow_SSE2; } } #endif @@ -2205,13 +2169,9 @@ int YUY2ToI422(const uint8_t* src_yuy2, height = 1; src_stride_yuy2 = dst_stride_y = dst_stride_u = dst_stride_v = 0; } -#if defined(HAS_YUY2TOYROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2; - YUY2ToYRow = YUY2ToYRow_Any_SSE2; if (IS_ALIGNED(width, 16)) { - YUY2ToUV422Row = YUY2ToUV422Row_SSE2; - YUY2ToYRow = YUY2ToYRow_SSE2; } } #endif @@ -2301,13 +2261,9 @@ int UYVYToI422(const uint8_t* src_uyvy, height = 1; src_stride_uyvy = dst_stride_y = dst_stride_u = dst_stride_v = 0; } -#if defined(HAS_UYVYTOYROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - UYVYToUV422Row = UYVYToUV422Row_Any_SSE2; - UYVYToYRow = UYVYToYRow_Any_SSE2; if (IS_ALIGNED(width, 16)) { - UYVYToUV422Row = UYVYToUV422Row_SSE2; - UYVYToYRow = UYVYToYRow_SSE2; } } #endif @@ -2389,11 +2345,9 @@ int YUY2ToY(const uint8_t* src_yuy2, height = 1; src_stride_yuy2 = dst_stride_y = 0; } -#if defined(HAS_YUY2TOYROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - YUY2ToYRow = YUY2ToYRow_Any_SSE2; if (IS_ALIGNED(width, 16)) { - YUY2ToYRow = YUY2ToYRow_SSE2; } } #endif @@ -2448,11 +2402,9 @@ int UYVYToY(const uint8_t* src_uyvy, height = 1; src_stride_uyvy = dst_stride_y = 0; } -#if defined(HAS_UYVYTOYROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - UYVYToYRow = UYVYToYRow_Any_SSE2; if (IS_ALIGNED(width, 16)) { - UYVYToYRow = UYVYToYRow_SSE2; } } #endif @@ -2514,11 +2466,9 @@ void MirrorPlane(const uint8_t* src_y, } } #endif -#if defined(HAS_MIRRORROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - MirrorRow = MirrorRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - MirrorRow = MirrorRow_SSSE3; } } #endif @@ -2580,11 +2530,9 @@ void MirrorUVPlane(const uint8_t* src_uv, } } #endif -#if defined(HAS_MIRRORUVROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - MirrorUVRow = MirrorUVRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - MirrorUVRow = MirrorUVRow_SSSE3; } } #endif @@ -2752,11 +2700,9 @@ int ARGBMirror(const uint8_t* src_argb, } } #endif -#if defined(HAS_ARGBMIRRORROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - ARGBMirrorRow = ARGBMirrorRow_Any_SSE2; if (IS_ALIGNED(width, 4)) { - ARGBMirrorRow = ARGBMirrorRow_SSE2; } } #endif @@ -2822,11 +2768,9 @@ int RGB24Mirror(const uint8_t* src_rgb24, } } #endif -#if defined(HAS_RGB24MIRRORROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - RGB24MirrorRow = RGB24MirrorRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - RGB24MirrorRow = RGB24MirrorRow_SSSE3; } } #endif @@ -2869,9 +2813,8 @@ int ARGBBlend(const uint8_t* src_argb0, height = 1; src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0; } -#if defined(HAS_ARGBBLENDROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBBlendRow = ARGBBlendRow_SSSE3; } #endif #if defined(HAS_ARGBBLENDROW_NEON) @@ -2932,11 +2875,9 @@ int BlendPlane(const uint8_t* src_y0, src_stride_y0 = src_stride_y1 = alpha_stride = dst_stride_y = 0; } -#if defined(HAS_BLENDPLANEROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - BlendPlaneRow = BlendPlaneRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - BlendPlaneRow = BlendPlaneRow_SSSE3; } } #endif @@ -3014,11 +2955,9 @@ int I420Blend(const uint8_t* src_y0, BlendPlane(src_y0, src_stride_y0, src_y1, src_stride_y1, alpha, alpha_stride, dst_y, dst_stride_y, width, height); -#if defined(HAS_BLENDPLANEROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - BlendPlaneRow = BlendPlaneRow_Any_SSSE3; if (IS_ALIGNED(halfwidth, 8)) { - BlendPlaneRow = BlendPlaneRow_SSSE3; } } #endif @@ -3049,13 +2988,10 @@ int I420Blend(const uint8_t* src_y0, } } #endif -#if defined(HAS_SCALEROWDOWN2_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ScaleRowDown2 = ScaleRowDown2Box_Odd_SSSE3; if (IS_ALIGNED(width, 2)) { - ScaleRowDown2 = ScaleRowDown2Box_Any_SSSE3; if (IS_ALIGNED(halfwidth, 16)) { - ScaleRowDown2 = ScaleRowDown2Box_SSSE3; } } } @@ -3131,11 +3067,9 @@ int ARGBMultiply(const uint8_t* src_argb0, height = 1; src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0; } -#if defined(HAS_ARGBMULTIPLYROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - ARGBMultiplyRow = ARGBMultiplyRow_Any_SSE2; if (IS_ALIGNED(width, 4)) { - ARGBMultiplyRow = ARGBMultiplyRow_SSE2; } } #endif @@ -3216,16 +3150,13 @@ int ARGBAdd(const uint8_t* src_argb0, height = 1; src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0; } -#if defined(HAS_ARGBADDROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - ARGBAddRow = ARGBAddRow_SSE2; } #endif -#if defined(HAS_ARGBADDROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - ARGBAddRow = ARGBAddRow_Any_SSE2; if (IS_ALIGNED(width, 4)) { - ARGBAddRow = ARGBAddRow_SSE2; } } #endif @@ -3301,11 +3232,9 @@ int ARGBSubtract(const uint8_t* src_argb0, height = 1; src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0; } -#if defined(HAS_ARGBSUBTRACTROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - ARGBSubtractRow = ARGBSubtractRow_Any_SSE2; if (IS_ALIGNED(width, 4)) { - ARGBSubtractRow = ARGBSubtractRow_SSE2; } } #endif @@ -3378,11 +3307,9 @@ int RAWToRGB24(const uint8_t* src_raw, height = 1; src_stride_raw = dst_stride_rgb24 = 0; } -#if defined(HAS_RAWTORGB24ROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - RAWToRGB24Row = RAWToRGB24Row_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - RAWToRGB24Row = RAWToRGB24Row_SSSE3; } } #endif @@ -3610,11 +3537,9 @@ int ARGBAttenuate(const uint8_t* src_argb, height = 1; src_stride_argb = dst_stride_argb = 0; } -#if defined(HAS_ARGBATTENUATEROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; if (IS_ALIGNED(width, 4)) { - ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; } } #endif @@ -3689,11 +3614,9 @@ int ARGBUnattenuate(const uint8_t* src_argb, height = 1; src_stride_argb = dst_stride_argb = 0; } -#if defined(HAS_ARGBUNATTENUATEROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - ARGBUnattenuateRow = ARGBUnattenuateRow_Any_SSE2; if (IS_ALIGNED(width, 4)) { - ARGBUnattenuateRow = ARGBUnattenuateRow_SSE2; } } #endif @@ -3740,9 +3663,8 @@ int ARGBGrayTo(const uint8_t* src_argb, height = 1; src_stride_argb = dst_stride_argb = 0; } -#if defined(HAS_ARGBGRAYROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) { - ARGBGrayRow = ARGBGrayRow_SSSE3; } #endif #if defined(HAS_ARGBGRAYROW_NEON) @@ -3795,9 +3717,8 @@ int ARGBGray(uint8_t* dst_argb, height = 1; dst_stride_argb = 0; } -#if defined(HAS_ARGBGRAYROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) { - ARGBGrayRow = ARGBGrayRow_SSSE3; } #endif #if defined(HAS_ARGBGRAYROW_NEON) @@ -3848,9 +3769,8 @@ int ARGBSepia(uint8_t* dst_argb, height = 1; dst_stride_argb = 0; } -#if defined(HAS_ARGBSEPIAROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) { - ARGBSepiaRow = ARGBSepiaRow_SSSE3; } #endif #if defined(HAS_ARGBSEPIAROW_NEON) @@ -3909,9 +3829,8 @@ int ARGBColorMatrix(const uint8_t* src_argb, height = 1; src_stride_argb = dst_stride_argb = 0; } -#if defined(HAS_ARGBCOLORMATRIXROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) { - ARGBColorMatrixRow = ARGBColorMatrixRow_SSSE3; } #endif #if defined(HAS_ARGBCOLORMATRIXROW_NEON) @@ -4079,9 +3998,8 @@ int ARGBQuantize(uint8_t* dst_argb, height = 1; dst_stride_argb = 0; } -#if defined(HAS_ARGBQUANTIZEROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) { - ARGBQuantizeRow = ARGBQuantizeRow_SSE2; } #endif #if defined(HAS_ARGBQUANTIZEROW_NEON) @@ -4118,9 +4036,8 @@ int ARGBComputeCumulativeSum(const uint8_t* src_argb, if (!dst_cumsum || !src_argb || width <= 0 || height <= 0) { return -1; } -#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - ComputeCumulativeSumRow = ComputeCumulativeSumRow_SSE2; } #endif @@ -4176,10 +4093,8 @@ int ARGBBlur(const uint8_t* src_argb, if (radius <= 0 || height <= 1) { return -1; } -#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - ComputeCumulativeSumRow = ComputeCumulativeSumRow_SSE2; - CumulativeSumToAverageRow = CumulativeSumToAverageRow_SSE2; } #endif // Compute enough CumulativeSum for first row to be blurred. After this @@ -4273,9 +4188,8 @@ int ARGBShade(const uint8_t* src_argb, height = 1; src_stride_argb = dst_stride_argb = 0; } -#if defined(HAS_ARGBSHADEROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) { - ARGBShadeRow = ARGBShadeRow_SSE2; } #endif #if defined(HAS_ARGBSHADEROW_NEON) @@ -4332,11 +4246,9 @@ int InterpolatePlane(const uint8_t* src0, height = 1; src_stride0 = src_stride1 = dst_stride = 0; } -#if defined(HAS_INTERPOLATEROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - InterpolateRow = InterpolateRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - InterpolateRow = InterpolateRow_SSSE3; } } #endif @@ -4414,11 +4326,9 @@ int InterpolatePlane_16(const uint16_t* src0, height = 1; src_stride0 = src_stride1 = dst_stride = 0; } -#if defined(HAS_INTERPOLATEROW_16_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - InterpolateRow_16 = InterpolateRow_16_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - InterpolateRow_16 = InterpolateRow_16_SSSE3; } } #endif @@ -4544,11 +4454,9 @@ int ARGBShuffle(const uint8_t* src_argb, height = 1; src_stride_argb = dst_stride_argb = 0; } -#if defined(HAS_ARGBSHUFFLEROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBShuffleRow = ARGBShuffleRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - ARGBShuffleRow = ARGBShuffleRow_SSSE3; } } #endif @@ -4621,11 +4529,9 @@ int AR64Shuffle(const uint16_t* src_ar64, src_stride_ar64 = dst_stride_ar64 = 0; } // Assembly versions can be reused if it's implemented with shuffle. -#if defined(HAS_ARGBSHUFFLEROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - AR64ShuffleRow = ARGBShuffleRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - AR64ShuffleRow = ARGBShuffleRow_SSSE3; } } #endif @@ -4761,11 +4667,9 @@ static int ARGBSobelize(const uint8_t* src_argb, src_stride_argb = -src_stride_argb; } -#if defined(HAS_ARGBTOYJROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYJRow = ARGBToYJRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYJRow = ARGBToYJRow_SSSE3; } } #endif @@ -4815,9 +4719,8 @@ static int ARGBSobelize(const uint8_t* src_argb, } #endif -#if defined(HAS_SOBELYROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - SobelYRow = SobelYRow_SSE2; } #endif #if defined(HAS_SOBELYROW_NEON) @@ -4825,9 +4728,8 @@ static int ARGBSobelize(const uint8_t* src_argb, SobelYRow = SobelYRow_NEON; } #endif -#if defined(HAS_SOBELXROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - SobelXRow = SobelXRow_SSE2; } #endif #if defined(HAS_SOBELXROW_NEON) @@ -4895,11 +4797,9 @@ int ARGBSobel(const uint8_t* src_argb, int height) { void (*SobelRow)(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_argb, int width) = SobelRow_C; -#if defined(HAS_SOBELROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - SobelRow = SobelRow_Any_SSE2; if (IS_ALIGNED(width, 16)) { - SobelRow = SobelRow_SSE2; } } #endif @@ -4933,11 +4833,9 @@ int ARGBSobelToPlane(const uint8_t* src_argb, int height) { void (*SobelToPlaneRow)(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_, int width) = SobelToPlaneRow_C; -#if defined(HAS_SOBELTOPLANEROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - SobelToPlaneRow = SobelToPlaneRow_Any_SSE2; if (IS_ALIGNED(width, 16)) { - SobelToPlaneRow = SobelToPlaneRow_SSE2; } } #endif @@ -4972,11 +4870,9 @@ int ARGBSobelXY(const uint8_t* src_argb, int height) { void (*SobelXYRow)(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_argb, int width) = SobelXYRow_C; -#if defined(HAS_SOBELXYROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - SobelXYRow = SobelXYRow_Any_SSE2; if (IS_ALIGNED(width, 16)) { - SobelXYRow = SobelXYRow_SSE2; } } #endif @@ -5027,9 +4923,8 @@ int ARGBPolynomial(const uint8_t* src_argb, height = 1; src_stride_argb = dst_stride_argb = 0; } -#if defined(HAS_ARGBPOLYNOMIALROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 2)) { - ARGBPolynomialRow = ARGBPolynomialRow_SSE2; } #endif #if defined(HAS_ARGBPOLYNOMIALROW_AVX2) @@ -5077,11 +4972,9 @@ int HalfFloatPlane(const uint16_t* src_y, height = 1; src_stride_y = dst_stride_y = 0; } -#if defined(HAS_HALFFLOATROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - HalfFloatRow = HalfFloatRow_Any_SSE2; if (IS_ALIGNED(width, 8)) { - HalfFloatRow = HalfFloatRow_SSE2; } } #endif @@ -5189,9 +5082,8 @@ int ARGBLumaColorTable(const uint8_t* src_argb, height = 1; src_stride_argb = dst_stride_argb = 0; } -#if defined(HAS_ARGBLUMACOLORTABLEROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4)) { - ARGBLumaColorTableRow = ARGBLumaColorTableRow_SSSE3; } #endif @@ -5229,11 +5121,9 @@ int ARGBCopyAlpha(const uint8_t* src_argb, height = 1; src_stride_argb = dst_stride_argb = 0; } -#if defined(HAS_ARGBCOPYALPHAROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - ARGBCopyAlphaRow = ARGBCopyAlphaRow_Any_SSE2; if (IS_ALIGNED(width, 8)) { - ARGBCopyAlphaRow = ARGBCopyAlphaRow_SSE2; } } #endif @@ -5279,7 +5169,7 @@ int ARGBExtractAlpha(const uint8_t* src_argb, } void (*ARGBExtractAlphaRow)(const uint8_t* src_argb, uint8_t* dst_a, int width) = ARGBExtractAlphaRow_C; -#if defined(HAS_ARGBEXTRACTALPHAROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { ARGBExtractAlphaRow = IS_ALIGNED(width, 8) ? ARGBExtractAlphaRow_SSE2 : ARGBExtractAlphaRow_Any_SSE2; @@ -5343,11 +5233,9 @@ int ARGBCopyYToAlpha(const uint8_t* src_y, height = 1; src_stride_y = dst_stride_argb = 0; } -#if defined(HAS_ARGBCOPYYTOALPHAROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_Any_SSE2; if (IS_ALIGNED(width, 8)) { - ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_SSE2; } } #endif @@ -5397,11 +5285,9 @@ int YUY2ToNV12(const uint8_t* src_yuy2, src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2; src_stride_yuy2 = -src_stride_yuy2; } -#if defined(HAS_YUY2TOYROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - YUY2ToYRow = YUY2ToYRow_Any_SSE2; if (IS_ALIGNED(width, 16)) { - YUY2ToYRow = YUY2ToYRow_SSE2; } } #endif @@ -5438,11 +5324,9 @@ int YUY2ToNV12(const uint8_t* src_yuy2, } #endif -#if defined(HAS_YUY2TONVUVROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - YUY2ToNVUVRow = YUY2ToNVUVRow_Any_SSE2; if (IS_ALIGNED(width, 16)) { - YUY2ToNVUVRow = YUY2ToNVUVRow_SSE2; } } #endif @@ -5505,11 +5389,9 @@ int UYVYToNV12(const uint8_t* src_uyvy, src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy; src_stride_uyvy = -src_stride_uyvy; } -#if defined(HAS_SPLITUVROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - SplitUVRow = SplitUVRow_Any_SSE2; if (IS_ALIGNED(width, 16)) { - SplitUVRow = SplitUVRow_SSE2; } } #endif @@ -5543,11 +5425,9 @@ int UYVYToNV12(const uint8_t* src_uyvy, } #endif -#if defined(HAS_INTERPOLATEROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - InterpolateRow = InterpolateRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - InterpolateRow = InterpolateRow_SSSE3; } } #endif @@ -5642,9 +5522,8 @@ void HalfMergeUVPlane(const uint8_t* src_u, HalfMergeUVRow = HalfMergeUVRow_NEON; } #endif -#if defined(HAS_HALFMERGEUVROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) { - HalfMergeUVRow = HalfMergeUVRow_SSSE3; } #endif #if defined(HAS_HALFMERGEUVROW_AVX2) diff --git a/source/rotate.cc b/source/rotate.cc index d4a9fcd27..3db253a52 100644 --- a/source/rotate.cc +++ b/source/rotate.cc @@ -64,19 +64,15 @@ void TransposePlane(const uint8_t* src, TransposeWxH = TransposeWxH_SME; } #endif -#if defined(HAS_TRANSPOSEWX8_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - TransposeWx8 = TransposeWx8_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - TransposeWx8 = TransposeWx8_SSSE3; } } #endif -#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - TransposeWx8 = TransposeWx8_Fast_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - TransposeWx8 = TransposeWx8_Fast_SSSE3; } } #endif @@ -174,11 +170,9 @@ void RotatePlane180(const uint8_t* src, } } #endif -#if defined(HAS_MIRRORROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - MirrorRow = MirrorRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - MirrorRow = MirrorRow_SSSE3; } } #endif @@ -206,7 +200,7 @@ void RotatePlane180(const uint8_t* src, } } #endif -#if defined(HAS_COPYROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2; } @@ -301,11 +295,9 @@ void SplitTransposeUV(const uint8_t* src, TransposeUVWxH = TransposeUVWxH_SME; } #endif -#if defined(HAS_TRANSPOSEUVWX8_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - TransposeUVWx8 = TransposeUVWx8_Any_SSE2; if (IS_ALIGNED(width, 8)) { - TransposeUVWx8 = TransposeUVWx8_SSE2; } } #endif @@ -397,9 +389,8 @@ void SplitRotateUV180(const uint8_t* src, MirrorSplitUVRow = MirrorSplitUVRow_NEON; } #endif -#if defined(HAS_MIRRORSPLITUVROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) { - MirrorSplitUVRow = MirrorSplitUVRow_SSSE3; } #endif #if defined(HAS_MIRRORSPLITUVROW_LSX) diff --git a/source/rotate_any.cc b/source/rotate_any.cc index bf62c067b..6756d362f 100644 --- a/source/rotate_any.cc +++ b/source/rotate_any.cc @@ -35,12 +35,6 @@ TANY(TransposeWx8_Any_NEON, TransposeWx8_NEON, TransposeWx8_C, 7) #ifdef HAS_TRANSPOSEWX16_NEON TANY(TransposeWx16_Any_NEON, TransposeWx16_NEON, TransposeWx16_C, 15) #endif -#ifdef HAS_TRANSPOSEWX8_SSSE3 -TANY(TransposeWx8_Any_SSSE3, TransposeWx8_SSSE3, TransposeWx8_C, 7) -#endif -#ifdef HAS_TRANSPOSEWX8_FAST_SSSE3 -TANY(TransposeWx8_Fast_Any_SSSE3, TransposeWx8_Fast_SSSE3, TransposeWx8_C, 15) -#endif #ifdef HAS_TRANSPOSEWX16_LSX TANY(TransposeWx16_Any_LSX, TransposeWx16_LSX, TransposeWx16_C, 15) #endif @@ -62,9 +56,6 @@ TANY(TransposeWx16_Any_LSX, TransposeWx16_LSX, TransposeWx16_C, 15) #ifdef HAS_TRANSPOSEUVWX8_NEON TUVANY(TransposeUVWx8_Any_NEON, TransposeUVWx8_NEON, 7) #endif -#ifdef HAS_TRANSPOSEUVWX8_SSE2 -TUVANY(TransposeUVWx8_Any_SSE2, TransposeUVWx8_SSE2, 7) -#endif #ifdef HAS_TRANSPOSEUVWX16_LSX TUVANY(TransposeUVWx16_Any_LSX, TransposeUVWx16_LSX, 7) #endif diff --git a/source/rotate_argb.cc b/source/rotate_argb.cc index 8c76ca919..546430fb0 100644 --- a/source/rotate_argb.cc +++ b/source/rotate_argb.cc @@ -37,11 +37,9 @@ static int ARGBTranspose(const uint8_t* src_argb, if (src_stride_argb & 3) { return -1; } -#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_SSE2; if (IS_ALIGNED(height, 4)) { // Width of dest. - ScaleARGBRowDownEven = ScaleARGBRowDownEven_SSE2; } } #endif @@ -131,11 +129,9 @@ static int ARGBRotate180(const uint8_t* src_argb, } } #endif -#if defined(HAS_ARGBMIRRORROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - ARGBMirrorRow = ARGBMirrorRow_Any_SSE2; if (IS_ALIGNED(width, 4)) { - ARGBMirrorRow = ARGBMirrorRow_SSE2; } } #endif @@ -163,7 +159,7 @@ static int ARGBRotate180(const uint8_t* src_argb, } } #endif -#if defined(HAS_COPYROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2; } diff --git a/source/rotate_gcc.cc b/source/rotate_gcc.cc index 9847ecd48..1f8f49550 100644 --- a/source/rotate_gcc.cc +++ b/source/rotate_gcc.cc @@ -22,424 +22,11 @@ extern "C" { !defined(LIBYUV_ENABLE_ROWWIN) // Transpose 8x8. 32 or 64 bit, but not NaCL for 64 bit. -#if defined(HAS_TRANSPOSEWX8_SSSE3) -void TransposeWx8_SSSE3(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width) { - asm volatile( - // Read in the data from the source pointer. - // First round of bit swap. - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" - "movq (%0,%3),%%xmm1 \n" - "lea (%0,%3,2),%0 \n" - "punpcklbw %%xmm1,%%xmm0 \n" - "movq (%0),%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "palignr $0x8,%%xmm1,%%xmm1 \n" - "movq (%0,%3),%%xmm3 \n" - "lea (%0,%3,2),%0 \n" - "punpcklbw %%xmm3,%%xmm2 \n" - "movdqa %%xmm2,%%xmm3 \n" - "movq (%0),%%xmm4 \n" - "palignr $0x8,%%xmm3,%%xmm3 \n" - "movq (%0,%3),%%xmm5 \n" - "lea (%0,%3,2),%0 \n" - "punpcklbw %%xmm5,%%xmm4 \n" - "movdqa %%xmm4,%%xmm5 \n" - "movq (%0),%%xmm6 \n" - "palignr $0x8,%%xmm5,%%xmm5 \n" - "movq (%0,%3),%%xmm7 \n" - "lea (%0,%3,2),%0 \n" - "punpcklbw %%xmm7,%%xmm6 \n" - "neg %3 \n" - "movdqa %%xmm6,%%xmm7 \n" - "lea 0x8(%0,%3,8),%0 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - "neg %3 \n" - // Second round of bit swap. - "punpcklwd %%xmm2,%%xmm0 \n" - "punpcklwd %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - "palignr $0x8,%%xmm2,%%xmm2 \n" - "palignr $0x8,%%xmm3,%%xmm3 \n" - "punpcklwd %%xmm6,%%xmm4 \n" - "punpcklwd %%xmm7,%%xmm5 \n" - "movdqa %%xmm4,%%xmm6 \n" - "movdqa %%xmm5,%%xmm7 \n" - "palignr $0x8,%%xmm6,%%xmm6 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - // Third round of bit swap. - // Write to the destination pointer. - "punpckldq %%xmm4,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "movdqa %%xmm0,%%xmm4 \n" - "palignr $0x8,%%xmm4,%%xmm4 \n" - "movq %%xmm4,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm6,%%xmm2 \n" - "movdqa %%xmm2,%%xmm6 \n" - "movq %%xmm2,(%1) \n" - "palignr $0x8,%%xmm6,%%xmm6 \n" - "punpckldq %%xmm5,%%xmm1 \n" - "movq %%xmm6,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "movdqa %%xmm1,%%xmm5 \n" - "movq %%xmm1,(%1) \n" - "palignr $0x8,%%xmm5,%%xmm5 \n" - "movq %%xmm5,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm7,%%xmm3 \n" - "movq %%xmm3,(%1) \n" - "movdqa %%xmm3,%%xmm7 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - "sub $0x8,%2 \n" - "movq %%xmm7,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "r"((ptrdiff_t)(src_stride)), // %3 - "r"((ptrdiff_t)(dst_stride)) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif // defined(HAS_TRANSPOSEWX8_SSSE3) // Transpose 16x8. 64 bit -#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3) -void TransposeWx8_Fast_SSSE3(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width) { - asm volatile( - // Read in the data from the source pointer. - // First round of bit swap. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu (%0,%3),%%xmm1 \n" - "lea (%0,%3,2),%0 \n" - "movdqa %%xmm0,%%xmm8 \n" - "punpcklbw %%xmm1,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm8 \n" - "movdqu (%0),%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm8,%%xmm9 \n" - "palignr $0x8,%%xmm1,%%xmm1 \n" - "palignr $0x8,%%xmm9,%%xmm9 \n" - "movdqu (%0,%3),%%xmm3 \n" - "lea (%0,%3,2),%0 \n" - "movdqa %%xmm2,%%xmm10 \n" - "punpcklbw %%xmm3,%%xmm2 \n" - "punpckhbw %%xmm3,%%xmm10 \n" - "movdqa %%xmm2,%%xmm3 \n" - "movdqa %%xmm10,%%xmm11 \n" - "movdqu (%0),%%xmm4 \n" - "palignr $0x8,%%xmm3,%%xmm3 \n" - "palignr $0x8,%%xmm11,%%xmm11 \n" - "movdqu (%0,%3),%%xmm5 \n" - "lea (%0,%3,2),%0 \n" - "movdqa %%xmm4,%%xmm12 \n" - "punpcklbw %%xmm5,%%xmm4 \n" - "punpckhbw %%xmm5,%%xmm12 \n" - "movdqa %%xmm4,%%xmm5 \n" - "movdqa %%xmm12,%%xmm13 \n" - "movdqu (%0),%%xmm6 \n" - "palignr $0x8,%%xmm5,%%xmm5 \n" - "palignr $0x8,%%xmm13,%%xmm13 \n" - "movdqu (%0,%3),%%xmm7 \n" - "lea (%0,%3,2),%0 \n" - "movdqa %%xmm6,%%xmm14 \n" - "punpcklbw %%xmm7,%%xmm6 \n" - "punpckhbw %%xmm7,%%xmm14 \n" - "neg %3 \n" - "movdqa %%xmm6,%%xmm7 \n" - "movdqa %%xmm14,%%xmm15 \n" - "lea 0x10(%0,%3,8),%0 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - "palignr $0x8,%%xmm15,%%xmm15 \n" - "neg %3 \n" - // Second round of bit swap. - "punpcklwd %%xmm2,%%xmm0 \n" - "punpcklwd %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - "palignr $0x8,%%xmm2,%%xmm2 \n" - "palignr $0x8,%%xmm3,%%xmm3 \n" - "punpcklwd %%xmm6,%%xmm4 \n" - "punpcklwd %%xmm7,%%xmm5 \n" - "movdqa %%xmm4,%%xmm6 \n" - "movdqa %%xmm5,%%xmm7 \n" - "palignr $0x8,%%xmm6,%%xmm6 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - "punpcklwd %%xmm10,%%xmm8 \n" - "punpcklwd %%xmm11,%%xmm9 \n" - "movdqa %%xmm8,%%xmm10 \n" - "movdqa %%xmm9,%%xmm11 \n" - "palignr $0x8,%%xmm10,%%xmm10 \n" - "palignr $0x8,%%xmm11,%%xmm11 \n" - "punpcklwd %%xmm14,%%xmm12 \n" - "punpcklwd %%xmm15,%%xmm13 \n" - "movdqa %%xmm12,%%xmm14 \n" - "movdqa %%xmm13,%%xmm15 \n" - "palignr $0x8,%%xmm14,%%xmm14 \n" - "palignr $0x8,%%xmm15,%%xmm15 \n" - // Third round of bit swap. - // Write to the destination pointer. - "punpckldq %%xmm4,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "movdqa %%xmm0,%%xmm4 \n" - "palignr $0x8,%%xmm4,%%xmm4 \n" - "movq %%xmm4,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm6,%%xmm2 \n" - "movdqa %%xmm2,%%xmm6 \n" - "movq %%xmm2,(%1) \n" - "palignr $0x8,%%xmm6,%%xmm6 \n" - "punpckldq %%xmm5,%%xmm1 \n" - "movq %%xmm6,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "movdqa %%xmm1,%%xmm5 \n" - "movq %%xmm1,(%1) \n" - "palignr $0x8,%%xmm5,%%xmm5 \n" - "movq %%xmm5,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm7,%%xmm3 \n" - "movq %%xmm3,(%1) \n" - "movdqa %%xmm3,%%xmm7 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - "movq %%xmm7,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm12,%%xmm8 \n" - "movq %%xmm8,(%1) \n" - "movdqa %%xmm8,%%xmm12 \n" - "palignr $0x8,%%xmm12,%%xmm12 \n" - "movq %%xmm12,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm14,%%xmm10 \n" - "movdqa %%xmm10,%%xmm14 \n" - "movq %%xmm10,(%1) \n" - "palignr $0x8,%%xmm14,%%xmm14 \n" - "punpckldq %%xmm13,%%xmm9 \n" - "movq %%xmm14,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "movdqa %%xmm9,%%xmm13 \n" - "movq %%xmm9,(%1) \n" - "palignr $0x8,%%xmm13,%%xmm13 \n" - "movq %%xmm13,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm15,%%xmm11 \n" - "movq %%xmm11,(%1) \n" - "movdqa %%xmm11,%%xmm15 \n" - "palignr $0x8,%%xmm15,%%xmm15 \n" - "sub $0x10,%2 \n" - "movq %%xmm15,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "r"((ptrdiff_t)(src_stride)), // %3 - "r"((ptrdiff_t)(dst_stride)) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", - "xmm15"); -} -#endif // defined(HAS_TRANSPOSEWX8_FAST_SSSE3) // Transpose UV 8x8. 64 bit. -#if defined(HAS_TRANSPOSEUVWX8_SSE2) -void TransposeUVWx8_SSE2(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int width) { - asm volatile( - // Read in the data from the source pointer. - // First round of bit swap. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu (%0,%4),%%xmm1 \n" - "lea (%0,%4,2),%0 \n" - "movdqa %%xmm0,%%xmm8 \n" - "punpcklbw %%xmm1,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm8 \n" - "movdqa %%xmm8,%%xmm1 \n" - "movdqu (%0),%%xmm2 \n" - "movdqu (%0,%4),%%xmm3 \n" - "lea (%0,%4,2),%0 \n" - "movdqa %%xmm2,%%xmm8 \n" - "punpcklbw %%xmm3,%%xmm2 \n" - "punpckhbw %%xmm3,%%xmm8 \n" - "movdqa %%xmm8,%%xmm3 \n" - "movdqu (%0),%%xmm4 \n" - "movdqu (%0,%4),%%xmm5 \n" - "lea (%0,%4,2),%0 \n" - "movdqa %%xmm4,%%xmm8 \n" - "punpcklbw %%xmm5,%%xmm4 \n" - "punpckhbw %%xmm5,%%xmm8 \n" - "movdqa %%xmm8,%%xmm5 \n" - "movdqu (%0),%%xmm6 \n" - "movdqu (%0,%4),%%xmm7 \n" - "lea (%0,%4,2),%0 \n" - "movdqa %%xmm6,%%xmm8 \n" - "punpcklbw %%xmm7,%%xmm6 \n" - "neg %4 \n" - "lea 0x10(%0,%4,8),%0 \n" - "punpckhbw %%xmm7,%%xmm8 \n" - "movdqa %%xmm8,%%xmm7 \n" - "neg %4 \n" - // Second round of bit swap. - "movdqa %%xmm0,%%xmm8 \n" - "movdqa %%xmm1,%%xmm9 \n" - "punpckhwd %%xmm2,%%xmm8 \n" - "punpckhwd %%xmm3,%%xmm9 \n" - "punpcklwd %%xmm2,%%xmm0 \n" - "punpcklwd %%xmm3,%%xmm1 \n" - "movdqa %%xmm8,%%xmm2 \n" - "movdqa %%xmm9,%%xmm3 \n" - "movdqa %%xmm4,%%xmm8 \n" - "movdqa %%xmm5,%%xmm9 \n" - "punpckhwd %%xmm6,%%xmm8 \n" - "punpckhwd %%xmm7,%%xmm9 \n" - "punpcklwd %%xmm6,%%xmm4 \n" - "punpcklwd %%xmm7,%%xmm5 \n" - "movdqa %%xmm8,%%xmm6 \n" - "movdqa %%xmm9,%%xmm7 \n" - // Third round of bit swap. - // Write to the destination pointer. - "movdqa %%xmm0,%%xmm8 \n" - "punpckldq %%xmm4,%%xmm0 \n" - "movlpd %%xmm0,(%1) \n" // Write back U channel - "movhpd %%xmm0,(%2) \n" // Write back V channel - "punpckhdq %%xmm4,%%xmm8 \n" - "movlpd %%xmm8,(%1,%5) \n" - "lea (%1,%5,2),%1 \n" - "movhpd %%xmm8,(%2,%6) \n" - "lea (%2,%6,2),%2 \n" - "movdqa %%xmm2,%%xmm8 \n" - "punpckldq %%xmm6,%%xmm2 \n" - "movlpd %%xmm2,(%1) \n" - "movhpd %%xmm2,(%2) \n" - "punpckhdq %%xmm6,%%xmm8 \n" - "movlpd %%xmm8,(%1,%5) \n" - "lea (%1,%5,2),%1 \n" - "movhpd %%xmm8,(%2,%6) \n" - "lea (%2,%6,2),%2 \n" - "movdqa %%xmm1,%%xmm8 \n" - "punpckldq %%xmm5,%%xmm1 \n" - "movlpd %%xmm1,(%1) \n" - "movhpd %%xmm1,(%2) \n" - "punpckhdq %%xmm5,%%xmm8 \n" - "movlpd %%xmm8,(%1,%5) \n" - "lea (%1,%5,2),%1 \n" - "movhpd %%xmm8,(%2,%6) \n" - "lea (%2,%6,2),%2 \n" - "movdqa %%xmm3,%%xmm8 \n" - "punpckldq %%xmm7,%%xmm3 \n" - "movlpd %%xmm3,(%1) \n" - "movhpd %%xmm3,(%2) \n" - "punpckhdq %%xmm7,%%xmm8 \n" - "sub $0x8,%3 \n" - "movlpd %%xmm8,(%1,%5) \n" - "lea (%1,%5,2),%1 \n" - "movhpd %%xmm8,(%2,%6) \n" - "lea (%2,%6,2),%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst_a), // %1 - "+r"(dst_b), // %2 - "+r"(width) // %3 - : "r"((ptrdiff_t)(src_stride)), // %4 - "r"((ptrdiff_t)(dst_stride_a)), // %5 - "r"((ptrdiff_t)(dst_stride_b)) // %6 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7", "xmm8", "xmm9"); -} -#endif // defined(HAS_TRANSPOSEUVWX8_SSE2) -#if defined(HAS_TRANSPOSE4X4_32_SSE2) -// 4 values, little endian view -// a b c d -// e f g h -// i j k l -// m n o p - -// transpose 2x2 -// a e b f from row 0, 1 -// i m j n from row 2, 3 -// c g d h from row 0, 1 -// k o l p from row 2, 3 - -// transpose 4x4 -// a e i m from row 0, 1 -// b f j n from row 0, 1 -// c g k o from row 2, 3 -// d h l p from row 2, 3 - -// Transpose 32 bit values (ARGB) -void Transpose4x4_32_SSE2(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width) { - asm volatile( - // Main loop transpose 4x4. Read a column, write a row. - "1: \n" - "movdqu (%0),%%xmm0 \n" // a b c d - "movdqu (%0,%3),%%xmm1 \n" // e f g h - "lea (%0,%3,2),%0 \n" // src += stride * 2 - "movdqu (%0),%%xmm2 \n" // i j k l - "movdqu (%0,%3),%%xmm3 \n" // m n o p - "lea (%0,%3,2),%0 \n" // src += stride * 2 - - // Transpose 2x2 - "movdqa %%xmm0,%%xmm4 \n" - "movdqa %%xmm2,%%xmm5 \n" - "movdqa %%xmm0,%%xmm6 \n" - "movdqa %%xmm2,%%xmm7 \n" - "punpckldq %%xmm1,%%xmm4 \n" // a e b f from row 0, 1 - "punpckldq %%xmm3,%%xmm5 \n" // i m j n from row 2, 3 - "punpckhdq %%xmm1,%%xmm6 \n" // c g d h from row 0, 1 - "punpckhdq %%xmm3,%%xmm7 \n" // k o l p from row 2, 3 - - // Transpose 4x4 - "movdqa %%xmm4,%%xmm0 \n" - "movdqa %%xmm4,%%xmm1 \n" - "movdqa %%xmm6,%%xmm2 \n" - "movdqa %%xmm6,%%xmm3 \n" - "punpcklqdq %%xmm5,%%xmm0 \n" // a e i m from row 0, 1 - "punpckhqdq %%xmm5,%%xmm1 \n" // b f j n from row 0, 1 - "punpcklqdq %%xmm7,%%xmm2 \n" // c g k o from row 2, 3 - "punpckhqdq %%xmm7,%%xmm3 \n" // d h l p from row 2, 3 - - "movdqu %%xmm0,(%1) \n" - "lea 16(%1,%4),%1 \n" // dst += stride + 16 - "movdqu %%xmm1,-16(%1) \n" - "movdqu %%xmm2,-16(%1,%4) \n" - "movdqu %%xmm3,-16(%1,%4,2) \n" - "sub %4,%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+rm"(width) // %2 - : "r"((ptrdiff_t)(src_stride)), // %3 - "r"((ptrdiff_t)(dst_stride)) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif // defined(HAS_TRANSPOSE4X4_32_SSE2) #if defined(HAS_TRANSPOSE4X4_32_AVX2) diff --git a/source/rotate_win.cc b/source/rotate_win.cc index 03eeee3a6..b5e8a1f14 100644 --- a/source/rotate_win.cc +++ b/source/rotate_win.cc @@ -20,230 +20,9 @@ extern "C" { #if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && defined(_M_IX86) && \ (!defined(__clang__) || defined(LIBYUV_ENABLE_ROWWIN)) -__declspec(naked) void TransposeWx8_SSSE3(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width) { - __asm { - push edi - push esi - push ebp - mov eax, [esp + 12 + 4] // src - mov edi, [esp + 12 + 8] // src_stride - mov edx, [esp + 12 + 12] // dst - mov esi, [esp + 12 + 16] // dst_stride - mov ecx, [esp + 12 + 20] // width - // Read in the data from the source pointer. - // First round of bit swap. - align 4 - convertloop: - movq xmm0, qword ptr [eax] - lea ebp, [eax + 8] - movq xmm1, qword ptr [eax + edi] - lea eax, [eax + 2 * edi] - punpcklbw xmm0, xmm1 - movq xmm2, qword ptr [eax] - movdqa xmm1, xmm0 - palignr xmm1, xmm1, 8 - movq xmm3, qword ptr [eax + edi] - lea eax, [eax + 2 * edi] - punpcklbw xmm2, xmm3 - movdqa xmm3, xmm2 - movq xmm4, qword ptr [eax] - palignr xmm3, xmm3, 8 - movq xmm5, qword ptr [eax + edi] - punpcklbw xmm4, xmm5 - lea eax, [eax + 2 * edi] - movdqa xmm5, xmm4 - movq xmm6, qword ptr [eax] - palignr xmm5, xmm5, 8 - movq xmm7, qword ptr [eax + edi] - punpcklbw xmm6, xmm7 - mov eax, ebp - movdqa xmm7, xmm6 - palignr xmm7, xmm7, 8 - // Second round of bit swap. - punpcklwd xmm0, xmm2 - punpcklwd xmm1, xmm3 - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - palignr xmm2, xmm2, 8 - palignr xmm3, xmm3, 8 - punpcklwd xmm4, xmm6 - punpcklwd xmm5, xmm7 - movdqa xmm6, xmm4 - movdqa xmm7, xmm5 - palignr xmm6, xmm6, 8 - palignr xmm7, xmm7, 8 - // Third round of bit swap. - // Write to the destination pointer. - punpckldq xmm0, xmm4 - movq qword ptr [edx], xmm0 - movdqa xmm4, xmm0 - palignr xmm4, xmm4, 8 - movq qword ptr [edx + esi], xmm4 - lea edx, [edx + 2 * esi] - punpckldq xmm2, xmm6 - movdqa xmm6, xmm2 - palignr xmm6, xmm6, 8 - movq qword ptr [edx], xmm2 - punpckldq xmm1, xmm5 - movq qword ptr [edx + esi], xmm6 - lea edx, [edx + 2 * esi] - movdqa xmm5, xmm1 - movq qword ptr [edx], xmm1 - palignr xmm5, xmm5, 8 - punpckldq xmm3, xmm7 - movq qword ptr [edx + esi], xmm5 - lea edx, [edx + 2 * esi] - movq qword ptr [edx], xmm3 - movdqa xmm7, xmm3 - palignr xmm7, xmm7, 8 - sub ecx, 8 - movq qword ptr [edx + esi], xmm7 - lea edx, [edx + 2 * esi] - jg convertloop - pop ebp - pop esi - pop edi - ret - } -} -__declspec(naked) void TransposeUVWx8_SSE2(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int w) { - __asm { - push ebx - push esi - push edi - push ebp - mov eax, [esp + 16 + 4] // src - mov edi, [esp + 16 + 8] // src_stride - mov edx, [esp + 16 + 12] // dst_a - mov esi, [esp + 16 + 16] // dst_stride_a - mov ebx, [esp + 16 + 20] // dst_b - mov ebp, [esp + 16 + 24] // dst_stride_b - mov ecx, esp - sub esp, 4 + 16 - and esp, ~15 - mov [esp + 16], ecx - mov ecx, [ecx + 16 + 28] // w - - align 4 - // Read in the data from the source pointer. - // First round of bit swap. - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + edi] - lea eax, [eax + 2 * edi] - movdqa xmm7, xmm0 // use xmm7 as temp register. - punpcklbw xmm0, xmm1 - punpckhbw xmm7, xmm1 - movdqa xmm1, xmm7 - movdqu xmm2, [eax] - movdqu xmm3, [eax + edi] - lea eax, [eax + 2 * edi] - movdqa xmm7, xmm2 - punpcklbw xmm2, xmm3 - punpckhbw xmm7, xmm3 - movdqa xmm3, xmm7 - movdqu xmm4, [eax] - movdqu xmm5, [eax + edi] - lea eax, [eax + 2 * edi] - movdqa xmm7, xmm4 - punpcklbw xmm4, xmm5 - punpckhbw xmm7, xmm5 - movdqa xmm5, xmm7 - movdqu xmm6, [eax] - movdqu xmm7, [eax + edi] - lea eax, [eax + 2 * edi] - movdqu [esp], xmm5 // backup xmm5 - neg edi - movdqa xmm5, xmm6 // use xmm5 as temp register. - punpcklbw xmm6, xmm7 - punpckhbw xmm5, xmm7 - movdqa xmm7, xmm5 - lea eax, [eax + 8 * edi + 16] - neg edi - // Second round of bit swap. - movdqa xmm5, xmm0 - punpcklwd xmm0, xmm2 - punpckhwd xmm5, xmm2 - movdqa xmm2, xmm5 - movdqa xmm5, xmm1 - punpcklwd xmm1, xmm3 - punpckhwd xmm5, xmm3 - movdqa xmm3, xmm5 - movdqa xmm5, xmm4 - punpcklwd xmm4, xmm6 - punpckhwd xmm5, xmm6 - movdqa xmm6, xmm5 - movdqu xmm5, [esp] // restore xmm5 - movdqu [esp], xmm6 // backup xmm6 - movdqa xmm6, xmm5 // use xmm6 as temp register. - punpcklwd xmm5, xmm7 - punpckhwd xmm6, xmm7 - movdqa xmm7, xmm6 - - // Third round of bit swap. - // Write to the destination pointer. - movdqa xmm6, xmm0 - punpckldq xmm0, xmm4 - punpckhdq xmm6, xmm4 - movdqa xmm4, xmm6 - movdqu xmm6, [esp] // restore xmm6 - movlpd qword ptr [edx], xmm0 - movhpd qword ptr [ebx], xmm0 - movlpd qword ptr [edx + esi], xmm4 - lea edx, [edx + 2 * esi] - movhpd qword ptr [ebx + ebp], xmm4 - lea ebx, [ebx + 2 * ebp] - movdqa xmm0, xmm2 // use xmm0 as the temp register. - punpckldq xmm2, xmm6 - movlpd qword ptr [edx], xmm2 - movhpd qword ptr [ebx], xmm2 - punpckhdq xmm0, xmm6 - movlpd qword ptr [edx + esi], xmm0 - lea edx, [edx + 2 * esi] - movhpd qword ptr [ebx + ebp], xmm0 - lea ebx, [ebx + 2 * ebp] - movdqa xmm0, xmm1 // use xmm0 as the temp register. - punpckldq xmm1, xmm5 - movlpd qword ptr [edx], xmm1 - movhpd qword ptr [ebx], xmm1 - punpckhdq xmm0, xmm5 - movlpd qword ptr [edx + esi], xmm0 - lea edx, [edx + 2 * esi] - movhpd qword ptr [ebx + ebp], xmm0 - lea ebx, [ebx + 2 * ebp] - movdqa xmm0, xmm3 // use xmm0 as the temp register. - punpckldq xmm3, xmm7 - movlpd qword ptr [edx], xmm3 - movhpd qword ptr [ebx], xmm3 - punpckhdq xmm0, xmm7 - sub ecx, 8 - movlpd qword ptr [edx + esi], xmm0 - lea edx, [edx + 2 * esi] - movhpd qword ptr [ebx + ebp], xmm0 - lea ebx, [ebx + 2 * ebp] - jg convertloop - - mov esp, [esp + 16] - pop ebp - pop edi - pop esi - pop ebx - ret - } -} #endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) diff --git a/source/row_any.cc b/source/row_any.cc index 82a4abe8d..cb9fdede0 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -53,9 +53,6 @@ extern "C" { memcpy(dst_ptr + (np >> DUVSHIFT) * BPP, vout, SS(r, DUVSHIFT) * BPP); \ } -#ifdef HAS_MERGEARGBROW_SSE2 -ANY41(MergeARGBRow_Any_SSE2, MergeARGBRow_SSE2, 0, 0, 4, 7) -#endif #ifdef HAS_MERGEARGBROW_AVX2 ANY41(MergeARGBRow_Any_AVX2, MergeARGBRow_AVX2, 0, 0, 4, 15) #endif @@ -92,15 +89,9 @@ ANY41(MergeARGBRow_Any_NEON, MergeARGBRow_NEON, 0, 0, 4, 15) memcpy(dst_ptr + (np >> DUVSHIFT) * BPP, vout, SS(r, DUVSHIFT) * BPP); \ } -#ifdef HAS_I444ALPHATOARGBROW_SSSE3 -ANY41C(I444AlphaToARGBRow_Any_SSSE3, I444AlphaToARGBRow_SSSE3, 0, 0, 4, 7) -#endif #ifdef HAS_I444ALPHATOARGBROW_AVX2 ANY41C(I444AlphaToARGBRow_Any_AVX2, I444AlphaToARGBRow_AVX2, 0, 0, 4, 15) #endif -#ifdef HAS_I422ALPHATOARGBROW_SSSE3 -ANY41C(I422AlphaToARGBRow_Any_SSSE3, I422AlphaToARGBRow_SSSE3, 1, 0, 4, 7) -#endif #ifdef HAS_I422ALPHATOARGBROW_AVX2 ANY41C(I422AlphaToARGBRow_Any_AVX2, I422AlphaToARGBRow_AVX2, 1, 0, 4, 15) #endif @@ -161,8 +152,7 @@ ANY41CT(I410AlphaToARGBRow_Any_NEON, 7) #endif -#ifdef HAS_I210ALPHATOARGBROW_SSSE3 -ANY41CT(I210AlphaToARGBRow_Any_SSSE3, +#if 0 I210AlphaToARGBRow_SSSE3, 1, 0, @@ -183,8 +173,7 @@ ANY41CT(I210AlphaToARGBRow_Any_AVX2, 15) #endif -#ifdef HAS_I410ALPHATOARGBROW_SSSE3 -ANY41CT(I410AlphaToARGBRow_Any_SSSE3, +#if 0 I410AlphaToARGBRow_SSSE3, 0, 0, @@ -279,25 +268,15 @@ ANY41PT(MergeARGB16To8Row_Any_NEON, } // Merge functions. -#ifdef HAS_MERGERGBROW_SSSE3 -ANY31(MergeRGBRow_Any_SSSE3, MergeRGBRow_SSSE3, 0, 0, 3, 15) -#endif #ifdef HAS_MERGERGBROW_NEON ANY31(MergeRGBRow_Any_NEON, MergeRGBRow_NEON, 0, 0, 3, 15) #endif -#ifdef HAS_MERGEXRGBROW_SSE2 -ANY31(MergeXRGBRow_Any_SSE2, MergeXRGBRow_SSE2, 0, 0, 4, 7) -#endif #ifdef HAS_MERGEXRGBROW_AVX2 ANY31(MergeXRGBRow_Any_AVX2, MergeXRGBRow_AVX2, 0, 0, 4, 15) #endif #ifdef HAS_MERGEXRGBROW_NEON ANY31(MergeXRGBRow_Any_NEON, MergeXRGBRow_NEON, 0, 0, 4, 15) #endif -#ifdef HAS_I422TOYUY2ROW_SSE2 -ANY31(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, 1, 1, 4, 15) -ANY31(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, 1, 1, 4, 15) -#endif #ifdef HAS_I422TOYUY2ROW_AVX2 ANY31(I422ToYUY2Row_Any_AVX2, I422ToYUY2Row_AVX2, 1, 1, 4, 31) ANY31(I422ToUYVYRow_Any_AVX2, I422ToUYVYRow_AVX2, 1, 1, 4, 31) @@ -323,9 +302,6 @@ ANY31(I422ToUYVYRow_Any_LASX, I422ToUYVYRow_LASX, 1, 1, 4, 31) #ifdef HAS_BLENDPLANEROW_AVX2 ANY31(BlendPlaneRow_Any_AVX2, BlendPlaneRow_AVX2, 0, 0, 1, 31) #endif -#ifdef HAS_BLENDPLANEROW_SSSE3 -ANY31(BlendPlaneRow_Any_SSSE3, BlendPlaneRow_SSSE3, 0, 0, 1, 7) -#endif #undef ANY31 // Note that odd width replication includes 444 due to implementation @@ -355,36 +331,9 @@ ANY31(BlendPlaneRow_Any_SSSE3, BlendPlaneRow_SSSE3, 0, 0, 1, 7) memcpy(dst_ptr + (np >> DUVSHIFT) * BPP, vout, SS(r, DUVSHIFT) * BPP); \ } -#ifdef HAS_I422TOARGBROW_SSSE3 -ANY31C(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_SSSE3, 1, 0, 4, 7) -#endif -#ifdef HAS_I422TORGBAROW_SSSE3 -ANY31C(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_SSSE3, 1, 0, 4, 7) -#endif -#ifdef HAS_I422TOARGB4444ROW_SSSE3 -ANY31C(I422ToARGB4444Row_Any_SSSE3, I422ToARGB4444Row_SSSE3, 1, 0, 2, 7) -#endif -#ifdef HAS_I422TOARGB1555ROW_SSSE3 -ANY31C(I422ToARGB1555Row_Any_SSSE3, I422ToARGB1555Row_SSSE3, 1, 0, 2, 7) -#endif -#ifdef HAS_I422TORGB565ROW_SSSE3 -ANY31C(I422ToRGB565Row_Any_SSSE3, I422ToRGB565Row_SSSE3, 1, 0, 2, 7) -#endif -#ifdef HAS_I422TORGB24ROW_SSSE3 -ANY31C(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, 1, 0, 3, 15) -#endif -#ifdef HAS_I422TOAR30ROW_SSSE3 -ANY31C(I422ToAR30Row_Any_SSSE3, I422ToAR30Row_SSSE3, 1, 0, 4, 7) -#endif #ifdef HAS_I422TOAR30ROW_AVX2 ANY31C(I422ToAR30Row_Any_AVX2, I422ToAR30Row_AVX2, 1, 0, 4, 15) #endif -#ifdef HAS_I444TOARGBROW_SSSE3 -ANY31C(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_SSSE3, 0, 0, 4, 7) -#endif -#ifdef HAS_I444TORGB24ROW_SSSE3 -ANY31C(I444ToRGB24Row_Any_SSSE3, I444ToRGB24Row_SSSE3, 0, 0, 3, 15) -#endif #ifdef HAS_I422TORGB24ROW_AVX2 ANY31C(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 31) #endif @@ -470,36 +419,18 @@ ANY31C(I444ToARGBRow_Any_LSX, I444ToARGBRow_LSX, 0, 0, 4, 15) memcpy(dst_ptr + (np >> DUVSHIFT) * BPP, vout, SS(r, DUVSHIFT) * BPP); \ } -#ifdef HAS_I210TOAR30ROW_SSSE3 -ANY31CT(I210ToAR30Row_Any_SSSE3, I210ToAR30Row_SSSE3, 1, 0, uint16_t, 2, 4, 7) -#endif -#ifdef HAS_I210TOARGBROW_SSSE3 -ANY31CT(I210ToARGBRow_Any_SSSE3, I210ToARGBRow_SSSE3, 1, 0, uint16_t, 2, 4, 7) -#endif #ifdef HAS_I210TOARGBROW_AVX2 ANY31CT(I210ToARGBRow_Any_AVX2, I210ToARGBRow_AVX2, 1, 0, uint16_t, 2, 4, 15) #endif #ifdef HAS_I210TOAR30ROW_AVX2 ANY31CT(I210ToAR30Row_Any_AVX2, I210ToAR30Row_AVX2, 1, 0, uint16_t, 2, 4, 15) #endif -#ifdef HAS_I410TOAR30ROW_SSSE3 -ANY31CT(I410ToAR30Row_Any_SSSE3, I410ToAR30Row_SSSE3, 0, 0, uint16_t, 2, 4, 7) -#endif -#ifdef HAS_I410TOARGBROW_SSSE3 -ANY31CT(I410ToARGBRow_Any_SSSE3, I410ToARGBRow_SSSE3, 0, 0, uint16_t, 2, 4, 7) -#endif #ifdef HAS_I410TOARGBROW_AVX2 ANY31CT(I410ToARGBRow_Any_AVX2, I410ToARGBRow_AVX2, 0, 0, uint16_t, 2, 4, 15) #endif #ifdef HAS_I410TOAR30ROW_AVX2 ANY31CT(I410ToAR30Row_Any_AVX2, I410ToAR30Row_AVX2, 0, 0, uint16_t, 2, 4, 15) #endif -#ifdef HAS_I212TOAR30ROW_SSSE3 -ANY31CT(I212ToAR30Row_Any_SSSE3, I212ToAR30Row_SSSE3, 1, 0, uint16_t, 2, 4, 7) -#endif -#ifdef HAS_I212TOARGBROW_SSSE3 -ANY31CT(I212ToARGBRow_Any_SSSE3, I212ToARGBRow_SSSE3, 1, 0, uint16_t, 2, 4, 7) -#endif #ifdef HAS_I212TOARGBROW_AVX2 ANY31CT(I212ToARGBRow_Any_AVX2, I212ToARGBRow_AVX2, 1, 0, uint16_t, 2, 4, 15) #endif @@ -612,9 +543,6 @@ ANY31PT(MergeXRGB16To8Row_Any_NEON, } // Merge functions. -#ifdef HAS_MERGEUVROW_SSE2 -ANY21(MergeUVRow_Any_SSE2, MergeUVRow_SSE2, 0, 1, 1, 2, 15) -#endif #ifdef HAS_MERGEUVROW_AVX2 ANY21(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, 0, 1, 1, 2, 31) #endif @@ -630,22 +558,10 @@ ANY21(MergeUVRow_Any_LSX, MergeUVRow_LSX, 0, 1, 1, 2, 15) #ifdef HAS_NV21TOYUV24ROW_NEON ANY21(NV21ToYUV24Row_Any_NEON, NV21ToYUV24Row_NEON, 1, 1, 2, 3, 15) #endif -#ifdef HAS_NV21TOYUV24ROW_SSSE3 -ANY21(NV21ToYUV24Row_Any_SSSE3, NV21ToYUV24Row_SSSE3, 1, 1, 2, 3, 15) -#endif #ifdef HAS_NV21TOYUV24ROW_AVX2 ANY21(NV21ToYUV24Row_Any_AVX2, NV21ToYUV24Row_AVX2, 1, 1, 2, 3, 31) #endif // Math functions. -#ifdef HAS_ARGBMULTIPLYROW_SSE2 -ANY21(ARGBMultiplyRow_Any_SSE2, ARGBMultiplyRow_SSE2, 0, 4, 4, 4, 3) -#endif -#ifdef HAS_ARGBADDROW_SSE2 -ANY21(ARGBAddRow_Any_SSE2, ARGBAddRow_SSE2, 0, 4, 4, 4, 3) -#endif -#ifdef HAS_ARGBSUBTRACTROW_SSE2 -ANY21(ARGBSubtractRow_Any_SSE2, ARGBSubtractRow_SSE2, 0, 4, 4, 4, 3) -#endif #ifdef HAS_ARGBMULTIPLYROW_AVX2 ANY21(ARGBMultiplyRow_Any_AVX2, ARGBMultiplyRow_AVX2, 0, 4, 4, 4, 7) #endif @@ -682,27 +598,18 @@ ANY21(ARGBSubtractRow_Any_LSX, ARGBSubtractRow_LSX, 0, 4, 4, 4, 3) #ifdef HAS_ARGBSUBTRACTROW_LASX ANY21(ARGBSubtractRow_Any_LASX, ARGBSubtractRow_LASX, 0, 4, 4, 4, 7) #endif -#ifdef HAS_SOBELROW_SSE2 -ANY21(SobelRow_Any_SSE2, SobelRow_SSE2, 0, 1, 1, 4, 15) -#endif #ifdef HAS_SOBELROW_NEON ANY21(SobelRow_Any_NEON, SobelRow_NEON, 0, 1, 1, 4, 7) #endif #ifdef HAS_SOBELROW_LSX ANY21(SobelRow_Any_LSX, SobelRow_LSX, 0, 1, 1, 4, 15) #endif -#ifdef HAS_SOBELTOPLANEROW_SSE2 -ANY21(SobelToPlaneRow_Any_SSE2, SobelToPlaneRow_SSE2, 0, 1, 1, 1, 15) -#endif #ifdef HAS_SOBELTOPLANEROW_NEON ANY21(SobelToPlaneRow_Any_NEON, SobelToPlaneRow_NEON, 0, 1, 1, 1, 15) #endif #ifdef HAS_SOBELTOPLANEROW_LSX ANY21(SobelToPlaneRow_Any_LSX, SobelToPlaneRow_LSX, 0, 1, 1, 1, 31) #endif -#ifdef HAS_SOBELXYROW_SSE2 -ANY21(SobelXYRow_Any_SSE2, SobelXYRow_SSE2, 0, 1, 1, 4, 15) -#endif #ifdef HAS_SOBELXYROW_NEON ANY21(SobelXYRow_Any_NEON, SobelXYRow_NEON, 0, 1, 1, 4, 7) #endif @@ -735,9 +642,6 @@ ANY21(SobelXYRow_Any_LSX, SobelXYRow_LSX, 0, 1, 1, 4, 15) #ifdef HAS_YUY2TONVUVROW_NEON ANY21S(YUY2ToNVUVRow_Any_NEON, YUY2ToNVUVRow_NEON, 4, 2, 7) #endif -#ifdef HAS_YUY2TONVUVROW_SSE2 -ANY21S(YUY2ToNVUVRow_Any_SSE2, YUY2ToNVUVRow_SSE2, 4, 2, 7) -#endif #ifdef HAS_YUY2TONVUVROW_AVX2 ANY21S(YUY2ToNVUVRow_Any_AVX2, YUY2ToNVUVRow_AVX2, 4, 2, 15) #endif @@ -763,9 +667,6 @@ ANY21S(YUY2ToNVUVRow_Any_AVX2, YUY2ToNVUVRow_AVX2, 4, 2, 15) } // Biplanar to RGB. -#ifdef HAS_NV12TOARGBROW_SSSE3 -ANY21C(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_SSSE3, 1, 1, 2, 4, 7) -#endif #ifdef HAS_NV12TOARGBROW_AVX2 ANY21C(NV12ToARGBRow_Any_AVX2, NV12ToARGBRow_AVX2, 1, 1, 2, 4, 15) #endif @@ -778,9 +679,6 @@ ANY21C(NV12ToARGBRow_Any_LSX, NV12ToARGBRow_LSX, 1, 1, 2, 4, 7) #ifdef HAS_NV12TOARGBROW_LASX ANY21C(NV12ToARGBRow_Any_LASX, NV12ToARGBRow_LASX, 1, 1, 2, 4, 15) #endif -#ifdef HAS_NV21TOARGBROW_SSSE3 -ANY21C(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_SSSE3, 1, 1, 2, 4, 7) -#endif #ifdef HAS_NV21TOARGBROW_AVX2 ANY21C(NV21ToARGBRow_Any_AVX2, NV21ToARGBRow_AVX2, 1, 1, 2, 4, 15) #endif @@ -799,21 +697,12 @@ ANY21C(NV12ToRGB24Row_Any_NEON, NV12ToRGB24Row_NEON, 1, 1, 2, 3, 7) #ifdef HAS_NV21TORGB24ROW_NEON ANY21C(NV21ToRGB24Row_Any_NEON, NV21ToRGB24Row_NEON, 1, 1, 2, 3, 7) #endif -#ifdef HAS_NV12TORGB24ROW_SSSE3 -ANY21C(NV12ToRGB24Row_Any_SSSE3, NV12ToRGB24Row_SSSE3, 1, 1, 2, 3, 15) -#endif -#ifdef HAS_NV21TORGB24ROW_SSSE3 -ANY21C(NV21ToRGB24Row_Any_SSSE3, NV21ToRGB24Row_SSSE3, 1, 1, 2, 3, 15) -#endif #ifdef HAS_NV12TORGB24ROW_AVX2 ANY21C(NV12ToRGB24Row_Any_AVX2, NV12ToRGB24Row_AVX2, 1, 1, 2, 3, 31) #endif #ifdef HAS_NV21TORGB24ROW_AVX2 ANY21C(NV21ToRGB24Row_Any_AVX2, NV21ToRGB24Row_AVX2, 1, 1, 2, 3, 31) #endif -#ifdef HAS_NV12TORGB565ROW_SSSE3 -ANY21C(NV12ToRGB565Row_Any_SSSE3, NV12ToRGB565Row_SSSE3, 1, 1, 2, 2, 7) -#endif #ifdef HAS_NV12TORGB565ROW_AVX2 ANY21C(NV12ToRGB565Row_Any_AVX2, NV12ToRGB565Row_AVX2, 1, 1, 2, 2, 15) #endif @@ -847,12 +736,6 @@ ANY21C(NV12ToRGB565Row_Any_LASX, NV12ToRGB565Row_LASX, 1, 1, 2, 2, 15) memcpy(dst_ptr + (np >> DUVSHIFT) * BPP, vout, SS(r, DUVSHIFT) * BPP); \ } -#ifdef HAS_P210TOAR30ROW_SSSE3 -ANY21CT(P210ToAR30Row_Any_SSSE3, P210ToAR30Row_SSSE3, 1, 0, uint16_t, 2, 4, 7) -#endif -#ifdef HAS_P210TOARGBROW_SSSE3 -ANY21CT(P210ToARGBRow_Any_SSSE3, P210ToARGBRow_SSSE3, 1, 0, uint16_t, 2, 4, 7) -#endif #ifdef HAS_P210TOARGBROW_AVX2 ANY21CT(P210ToARGBRow_Any_AVX2, P210ToARGBRow_AVX2, 1, 0, uint16_t, 2, 4, 15) #endif @@ -865,12 +748,6 @@ ANY21CT(P210ToAR30Row_Any_NEON, P210ToAR30Row_NEON, 1, 0, uint16_t, 2, 4, 7) #ifdef HAS_P210TOARGBROW_NEON ANY21CT(P210ToARGBRow_Any_NEON, P210ToARGBRow_NEON, 1, 0, uint16_t, 2, 4, 7) #endif -#ifdef HAS_P410TOAR30ROW_SSSE3 -ANY21CT(P410ToAR30Row_Any_SSSE3, P410ToAR30Row_SSSE3, 0, 0, uint16_t, 2, 4, 7) -#endif -#ifdef HAS_P410TOARGBROW_SSSE3 -ANY21CT(P410ToARGBRow_Any_SSSE3, P410ToARGBRow_SSSE3, 0, 0, uint16_t, 2, 4, 7) -#endif #ifdef HAS_P410TOARGBROW_AVX2 ANY21CT(P410ToARGBRow_Any_AVX2, P410ToARGBRow_AVX2, 0, 0, uint16_t, 2, 4, 15) #endif @@ -937,19 +814,9 @@ ANY11(CopyRow_Any_AVX512BW, CopyRow_AVX512BW, 0, 1, 1, 127) #ifdef HAS_COPYROW_AVX ANY11(CopyRow_Any_AVX, CopyRow_AVX, 0, 1, 1, 63) #endif -#ifdef HAS_COPYROW_SSE2 -ANY11(CopyRow_Any_SSE2, CopyRow_SSE2, 0, 1, 1, 31) -#endif #ifdef HAS_COPYROW_NEON ANY11(CopyRow_Any_NEON, CopyRow_NEON, 0, 1, 1, 31) #endif -#if defined(HAS_ARGBTORGB24ROW_SSSE3) -ANY11(ARGBToRGB24Row_Any_SSSE3, ARGBToRGB24Row_SSSE3, 0, 4, 3, 15) -ANY11(ARGBToRAWRow_Any_SSSE3, ARGBToRAWRow_SSSE3, 0, 4, 3, 15) -ANY11(ARGBToRGB565Row_Any_SSE2, ARGBToRGB565Row_SSE2, 0, 4, 2, 3) -ANY11(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, 0, 4, 2, 3) -ANY11(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, 0, 4, 2, 3) -#endif #if defined(HAS_ARGBTORGB24ROW_AVX2) ANY11(ARGBToRGB24Row_Any_AVX2, ARGBToRGB24Row_AVX2, 0, 4, 3, 31) #endif @@ -966,37 +833,21 @@ ANY11(ARGBToRGB565Row_Any_AVX2, ARGBToRGB565Row_AVX2, 0, 4, 2, 7) ANY11(ARGBToARGB1555Row_Any_AVX2, ARGBToARGB1555Row_AVX2, 0, 4, 2, 7) ANY11(ARGBToARGB4444Row_Any_AVX2, ARGBToARGB4444Row_AVX2, 0, 4, 2, 7) #endif -#if defined(HAS_ABGRTOAR30ROW_SSSE3) -ANY11(ABGRToAR30Row_Any_SSSE3, ABGRToAR30Row_SSSE3, 0, 4, 4, 3) -#endif #if defined(HAS_ABGRTOAR30ROW_NEON) ANY11(ABGRToAR30Row_Any_NEON, ABGRToAR30Row_NEON, 0, 4, 4, 7) #endif #if defined(HAS_ARGBTOAR30ROW_NEON) ANY11(ARGBToAR30Row_Any_NEON, ARGBToAR30Row_NEON, 0, 4, 4, 7) #endif -#if defined(HAS_ARGBTOAR30ROW_SSSE3) -ANY11(ARGBToAR30Row_Any_SSSE3, ARGBToAR30Row_SSSE3, 0, 4, 4, 3) -#endif #if defined(HAS_ABGRTOAR30ROW_AVX2) ANY11(ABGRToAR30Row_Any_AVX2, ABGRToAR30Row_AVX2, 0, 4, 4, 7) #endif #if defined(HAS_ARGBTOAR30ROW_AVX2) ANY11(ARGBToAR30Row_Any_AVX2, ARGBToAR30Row_AVX2, 0, 4, 4, 7) #endif -#if defined(HAS_J400TOARGBROW_SSE2) -ANY11(J400ToARGBRow_Any_SSE2, J400ToARGBRow_SSE2, 0, 1, 4, 7) -#endif #if defined(HAS_J400TOARGBROW_AVX2) ANY11(J400ToARGBRow_Any_AVX2, J400ToARGBRow_AVX2, 0, 1, 4, 15) #endif -#if defined(HAS_RGB24TOARGBROW_SSSE3) -ANY11(RGB24ToARGBRow_Any_SSSE3, RGB24ToARGBRow_SSSE3, 0, 3, 4, 15) -ANY11(RAWToARGBRow_Any_SSSE3, RAWToARGBRow_SSSE3, 0, 3, 4, 15) -ANY11(RGB565ToARGBRow_Any_SSE2, RGB565ToARGBRow_SSE2, 0, 2, 4, 7) -ANY11(ARGB1555ToARGBRow_Any_SSE2, ARGB1555ToARGBRow_SSE2, 0, 2, 4, 7) -ANY11(ARGB4444ToARGBRow_Any_SSE2, ARGB4444ToARGBRow_SSE2, 0, 2, 4, 7) -#endif #if defined(HAS_RAWTOARGBROW_AVX2) ANY11(RAWToARGBRow_Any_AVX2, RAWToARGBRow_AVX2, 0, 3, 4, 31) #endif @@ -1006,12 +857,6 @@ ANY11(RAWToARGBRow_Any_AVX512BW, RAWToARGBRow_AVX512BW, 0, 3, 4, 63) #if defined(HAS_RGB24TOARGBROW_AVX512BW) ANY11(RGB24ToARGBRow_Any_AVX512BW, RGB24ToARGBRow_AVX512BW, 0, 3, 4, 63) #endif -#if defined(HAS_RAWTORGBAROW_SSSE3) -ANY11(RAWToRGBARow_Any_SSSE3, RAWToRGBARow_SSSE3, 0, 3, 4, 15) -#endif -#if defined(HAS_RAWTORGB24ROW_SSSE3) -ANY11(RAWToRGB24Row_Any_SSSE3, RAWToRGB24Row_SSSE3, 0, 3, 3, 7) -#endif #if defined(HAS_RGB565TOARGBROW_AVX2) ANY11(RGB565ToARGBRow_Any_AVX2, RGB565ToARGBRow_AVX2, 0, 2, 4, 15) #endif @@ -1073,17 +918,9 @@ ANY11(UYVYToYRow_Any_AVX2, UYVYToYRow_AVX2, 0, 2, 1, 31) #ifdef HAS_YUY2TOYROW_AVX2 ANY11(YUY2ToYRow_Any_AVX2, YUY2ToYRow_AVX2, 1, 4, 1, 31) #endif -#ifdef HAS_ARGBTOYROW_SSSE3 -ANY11(ARGBToYRow_Any_SSSE3, ARGBToYRow_SSSE3, 0, 4, 1, 15) -#endif #ifdef HAS_ARGBTOYROW_AVX512BW ANY11(ARGBToYRow_Any_AVX512BW, ARGBToYRow_AVX512BW, 0, 4, 1, 63) #endif -#ifdef HAS_BGRATOYROW_SSSE3 -ANY11(BGRAToYRow_Any_SSSE3, BGRAToYRow_SSSE3, 0, 4, 1, 15) -ANY11(ABGRToYRow_Any_SSSE3, ABGRToYRow_SSSE3, 0, 4, 1, 15) -ANY11(RGBAToYRow_Any_SSSE3, RGBAToYRow_SSSE3, 0, 4, 1, 15) -#endif #ifdef HAS_ARGBTOYROW_AVX512BW ANY11(BGRAToYRow_Any_AVX512BW, BGRAToYRow_AVX512BW, 0, 4, 1, 63) #endif @@ -1099,25 +936,12 @@ ANY11(RGBAToYRow_Any_AVX2, RGBAToYRow_AVX2, 0, 4, 1, 31) #ifdef HAS_ARGBTOYROW_AVX512BW ANY11(ABGRToYRow_Any_AVX512BW, ABGRToYRow_AVX512BW, 0, 4, 1, 63) #endif -#ifdef HAS_YUY2TOYROW_SSE2 -ANY11(YUY2ToYRow_Any_SSE2, YUY2ToYRow_SSE2, 1, 4, 1, 15) -ANY11(UYVYToYRow_Any_SSE2, UYVYToYRow_SSE2, 1, 4, 1, 15) -#endif -#ifdef HAS_ARGBTOYJROW_SSSE3 -ANY11(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_SSSE3, 0, 4, 1, 15) -#endif #ifdef HAS_ARGBTOYROW_AVX512BW ANY11(ARGBToYJRow_Any_AVX512BW, ARGBToYJRow_AVX512BW, 0, 4, 1, 63) #endif -#ifdef HAS_ABGRTOYJROW_SSSE3 -ANY11(ABGRToYJRow_Any_SSSE3, ABGRToYJRow_SSSE3, 0, 4, 1, 15) -#endif #ifdef HAS_ARGBTOYROW_AVX512BW ANY11(ABGRToYJRow_Any_AVX512BW, ABGRToYJRow_AVX512BW, 0, 4, 1, 63) #endif -#ifdef HAS_RGBATOYJROW_SSSE3 -ANY11(RGBAToYJRow_Any_SSSE3, RGBAToYJRow_SSSE3, 0, 4, 1, 15) -#endif #ifdef HAS_ARGBTOYROW_AVX512BW ANY11(RGBAToYJRow_Any_AVX512BW, RGBAToYJRow_AVX512BW, 0, 4, 1, 63) #endif @@ -1205,38 +1029,6 @@ ANY11(RGBAToYRow_Any_LSX, RGBAToYRow_LSX, 0, 4, 1, 15) #ifdef HAS_RGBATOYROW_LASX ANY11(RGBAToYRow_Any_LASX, RGBAToYRow_LASX, 0, 4, 1, 31) #endif -#ifdef HAS_RGB24TOYROW_NEON -#endif -#ifdef HAS_RGB24TOYJROW_AVX2 -#endif -#ifdef HAS_RGB24TOYJROW_SSSE3 -#endif -#ifdef HAS_RGB24TOYJROW_NEON -#endif -#ifdef HAS_RGB24TOYROW_LSX -#endif -#ifdef HAS_RGB24TOYJROW_LSX -#endif -#ifdef HAS_RGB24TOYJROW_LASX -#endif -#ifdef HAS_RGB24TOYROW_LASX -#endif -#ifdef HAS_RAWTOYROW_NEON -#endif -#ifdef HAS_RAWTOYJROW_AVX2 -#endif -#ifdef HAS_RAWTOYJROW_SSSE3 -#endif -#ifdef HAS_RAWTOYJROW_NEON -#endif -#ifdef HAS_RAWTOYROW_LSX -#endif -#ifdef HAS_RAWTOYROW_LASX -#endif -#ifdef HAS_RAWTOYJROW_LSX -#endif -#ifdef HAS_RAWTOYJROW_LASX -#endif #ifdef HAS_RGB565TOYROW_NEON ANY11(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 0, 2, 1, 15) #endif @@ -1287,9 +1079,6 @@ ANY11(UYVYToYRow_Any_LASX, UYVYToYRow_LASX, 1, 4, 1, 31) #ifdef HAS_AYUVTOYROW_NEON ANY11(AYUVToYRow_Any_NEON, AYUVToYRow_NEON, 0, 4, 1, 15) #endif -#ifdef HAS_SWAPUVROW_SSSE3 -ANY11(SwapUVRow_Any_SSSE3, SwapUVRow_SSSE3, 0, 2, 2, 15) -#endif #ifdef HAS_SWAPUVROW_AVX2 ANY11(SwapUVRow_Any_AVX2, SwapUVRow_AVX2, 0, 2, 2, 31) #endif @@ -1344,12 +1133,6 @@ ANY11(ARGB4444ToARGBRow_Any_LSX, ARGB4444ToARGBRow_LSX, 0, 2, 4, 15) #ifdef HAS_ARGB4444TOARGBROW_LASX ANY11(ARGB4444ToARGBRow_Any_LASX, ARGB4444ToARGBRow_LASX, 0, 2, 4, 31) #endif -#ifdef HAS_ARGBATTENUATEROW_SSSE3 -ANY11(ARGBAttenuateRow_Any_SSSE3, ARGBAttenuateRow_SSSE3, 0, 4, 4, 3) -#endif -#ifdef HAS_ARGBUNATTENUATEROW_SSE2 -ANY11(ARGBUnattenuateRow_Any_SSE2, ARGBUnattenuateRow_SSE2, 0, 4, 4, 3) -#endif #ifdef HAS_ARGBATTENUATEROW_AVX2 ANY11(ARGBAttenuateRow_Any_AVX2, ARGBAttenuateRow_AVX2, 0, 4, 4, 7) #endif @@ -1365,9 +1148,6 @@ ANY11(ARGBAttenuateRow_Any_LSX, ARGBAttenuateRow_LSX, 0, 4, 4, 7) #ifdef HAS_ARGBATTENUATEROW_LASX ANY11(ARGBAttenuateRow_Any_LASX, ARGBAttenuateRow_LASX, 0, 4, 4, 15) #endif -#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2 -ANY11(ARGBExtractAlphaRow_Any_SSE2, ARGBExtractAlphaRow_SSE2, 0, 4, 1, 7) -#endif #ifdef HAS_ARGBEXTRACTALPHAROW_AVX2 ANY11(ARGBExtractAlphaRow_Any_AVX2, ARGBExtractAlphaRow_AVX2, 0, 4, 1, 31) #endif @@ -1401,15 +1181,9 @@ ANY11(ARGBExtractAlphaRow_Any_LSX, ARGBExtractAlphaRow_LSX, 0, 4, 1, 15) #ifdef HAS_ARGBCOPYALPHAROW_AVX2 ANY11B(ARGBCopyAlphaRow_Any_AVX2, ARGBCopyAlphaRow_AVX2, 0, 4, 4, 15) #endif -#ifdef HAS_ARGBCOPYALPHAROW_SSE2 -ANY11B(ARGBCopyAlphaRow_Any_SSE2, ARGBCopyAlphaRow_SSE2, 0, 4, 4, 7) -#endif #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 ANY11B(ARGBCopyYToAlphaRow_Any_AVX2, ARGBCopyYToAlphaRow_AVX2, 0, 1, 4, 15) #endif -#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 -ANY11B(ARGBCopyYToAlphaRow_Any_SSE2, ARGBCopyYToAlphaRow_SSE2, 0, 1, 4, 7) -#endif #undef ANY11B // Any 1 to 1 with parameter. @@ -1429,8 +1203,7 @@ ANY11B(ARGBCopyYToAlphaRow_Any_SSE2, ARGBCopyYToAlphaRow_SSE2, 0, 1, 4, 7) memcpy(dst_ptr + np * BPP, vout, r * BPP); \ } -#if defined(HAS_I400TOARGBROW_SSE2) -ANY11P(I400ToARGBRow_Any_SSE2, +#if 0 I400ToARGBRow_SSE2, const struct YuvConstants*, 1, @@ -1462,8 +1235,7 @@ ANY11P(I400ToARGBRow_Any_LSX, 15) #endif -#if defined(HAS_ARGBTORGB565DITHERROW_SSE2) -ANY11P(ARGBToRGB565DitherRow_Any_SSE2, +#if 0 ARGBToRGB565DitherRow_SSE2, const uint32_t, 4, @@ -1502,9 +1274,6 @@ ANY11P(ARGBToRGB565DitherRow_Any_LASX, 2, 15) #endif -#ifdef HAS_ARGBSHUFFLEROW_SSSE3 -ANY11P(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_SSSE3, const uint8_t*, 4, 4, 7) -#endif #ifdef HAS_ARGBSHUFFLEROW_AVX2 ANY11P(ARGBShuffleRow_Any_AVX2, ARGBShuffleRow_AVX2, const uint8_t*, 4, 4, 15) #endif @@ -1537,21 +1306,9 @@ ANY11P(ARGBShuffleRow_Any_LASX, ARGBShuffleRow_LASX, const uint8_t*, 4, 4, 15) memcpy((uint8_t*)(dst_ptr) + np * BPP, vout, r * BPP); \ } -#ifdef HAS_ARGBTOAR64ROW_SSSE3 -ANY11T(ARGBToAR64Row_Any_SSSE3, ARGBToAR64Row_SSSE3, 4, 8, uint8_t, uint16_t, 3) -#endif -#ifdef HAS_ARGBTOAB64ROW_SSSE3 -ANY11T(ARGBToAB64Row_Any_SSSE3, ARGBToAB64Row_SSSE3, 4, 8, uint8_t, uint16_t, 3) -#endif -#ifdef HAS_AR64TOARGBROW_SSSE3 -ANY11T(AR64ToARGBRow_Any_SSSE3, AR64ToARGBRow_SSSE3, 8, 4, uint16_t, uint8_t, 3) -#endif -#ifdef HAS_ARGBTOAR64ROW_SSSE3 -ANY11T(AB64ToARGBRow_Any_SSSE3, AB64ToARGBRow_SSSE3, 8, 4, uint16_t, uint8_t, 3) -#endif #ifdef HAS_ARGBTOAR64ROW_AVX2 ANY11T(ARGBToAR64Row_Any_AVX2, ARGBToAR64Row_AVX2, 4, 8, uint8_t, uint16_t, 7) @@ -1604,8 +1361,7 @@ ANY11T(AB64ToARGBRow_Any_NEON, AB64ToARGBRow_NEON, 8, 4, uint16_t, uint8_t, 7) memcpy(dst_ptr + np, vout, r * BPP); \ } -#ifdef HAS_CONVERT16TO8ROW_SSSE3 -ANY11C(Convert16To8Row_Any_SSSE3, +#if 0 Convert16To8Row_SSSE3, 2, 1, @@ -1640,8 +1396,7 @@ ANY11C(Convert16To8Row_Any_NEON, uint8_t, 15) #endif -#ifdef HAS_CONVERT8TO16ROW_SSE2 -ANY11C(Convert8To16Row_Any_SSE2, +#if 0 Convert8To16Row_SSE2, 1, 2, @@ -1748,9 +1503,6 @@ ANY11SB(Convert8To8Row_Any_AVX2, memcpy(dst_ptr + np, vout, r * BPP); \ } -#ifdef HAS_HALFFLOATROW_SSE2 -ANY11P16(HalfFloatRow_Any_SSE2, HalfFloatRow_SSE2, uint16_t, uint16_t, 2, 2, 7) -#endif #ifdef HAS_HALFFLOATROW_AVX2 ANY11P16(HalfFloatRow_Any_AVX2, HalfFloatRow_AVX2, uint16_t, uint16_t, 2, 2, 15) #endif @@ -1793,10 +1545,6 @@ ANY11P16(HalfFloatRow_Any_LSX, HalfFloatRow_LSX, uint16_t, uint16_t, 2, 2, 31) memcpy(dst_ptr + np * BPP, vout, r * BPP); \ } -#if defined(HAS_YUY2TOARGBROW_SSSE3) -ANY11C(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_SSSE3, 1, 4, 4, 15) -ANY11C(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_SSSE3, 1, 4, 4, 15) -#endif #if defined(HAS_YUY2TOARGBROW_AVX2) ANY11C(YUY2ToARGBRow_Any_AVX2, YUY2ToARGBRow_AVX2, 1, 4, 4, 31) ANY11C(UYVYToARGBRow_Any_AVX2, UYVYToARGBRow_AVX2, 1, 4, 4, 31) @@ -1836,8 +1584,7 @@ ANY11C(UYVYToARGBRow_Any_LSX, UYVYToARGBRow_LSX, 1, 4, 4, 7) #ifdef HAS_INTERPOLATEROW_AVX2 ANY11I(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, uint8_t, uint8_t, 1, 1, 31) #endif -#ifdef HAS_INTERPOLATEROW_SSSE3 -ANY11I(InterpolateRow_Any_SSSE3, +#if 0 InterpolateRow_SSSE3, uint8_t, uint8_t, @@ -1926,9 +1673,6 @@ ANY11IS(InterpolateRow_16To8_Any_AVX2, #ifdef HAS_MIRRORROW_AVX2 ANY11M(MirrorRow_Any_AVX2, MirrorRow_AVX2, 1, 31) #endif -#ifdef HAS_MIRRORROW_SSSE3 -ANY11M(MirrorRow_Any_SSSE3, MirrorRow_SSSE3, 1, 15) -#endif #ifdef HAS_MIRRORROW_NEON ANY11M(MirrorRow_Any_NEON, MirrorRow_NEON, 1, 31) #endif @@ -1941,9 +1685,6 @@ ANY11M(MirrorRow_Any_LASX, MirrorRow_LASX, 1, 63) #ifdef HAS_MIRRORUVROW_AVX2 ANY11M(MirrorUVRow_Any_AVX2, MirrorUVRow_AVX2, 2, 15) #endif -#ifdef HAS_MIRRORUVROW_SSSE3 -ANY11M(MirrorUVRow_Any_SSSE3, MirrorUVRow_SSSE3, 2, 7) -#endif #ifdef HAS_MIRRORUVROW_NEON ANY11M(MirrorUVRow_Any_NEON, MirrorUVRow_NEON, 2, 31) #endif @@ -1956,9 +1697,6 @@ ANY11M(MirrorUVRow_Any_LASX, MirrorUVRow_LASX, 2, 15) #ifdef HAS_ARGBMIRRORROW_AVX2 ANY11M(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, 4, 7) #endif -#ifdef HAS_ARGBMIRRORROW_SSE2 -ANY11M(ARGBMirrorRow_Any_SSE2, ARGBMirrorRow_SSE2, 4, 3) -#endif #ifdef HAS_ARGBMIRRORROW_NEON ANY11M(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, 4, 7) #endif @@ -1968,9 +1706,6 @@ ANY11M(ARGBMirrorRow_Any_LSX, ARGBMirrorRow_LSX, 4, 7) #ifdef HAS_ARGBMIRRORROW_LASX ANY11M(ARGBMirrorRow_Any_LASX, ARGBMirrorRow_LASX, 4, 15) #endif -#ifdef HAS_RGB24MIRRORROW_SSSE3 -ANY11M(RGB24MirrorRow_Any_SSSE3, RGB24MirrorRow_SSSE3, 3, 15) -#endif #ifdef HAS_RGB24MIRRORROW_NEON ANY11M(RGB24MirrorRow_Any_NEON, RGB24MirrorRow_NEON, 3, 15) #endif @@ -2026,9 +1761,6 @@ ANY1(ARGBSetRow_Any_LSX, ARGBSetRow_LSX, uint32_t, 4, 3) memcpy(dst_v + (np >> DUVSHIFT), vout + 256, SS(r, DUVSHIFT)); \ } -#ifdef HAS_SPLITUVROW_SSE2 -ANY12(SplitUVRow_Any_SSE2, SplitUVRow_SSE2, 0, 2, 0, 15) -#endif #ifdef HAS_SPLITUVROW_AVX2 ANY12(SplitUVRow_Any_AVX2, SplitUVRow_AVX2, 0, 2, 0, 31) #endif @@ -2038,12 +1770,6 @@ ANY12(SplitUVRow_Any_NEON, SplitUVRow_NEON, 0, 2, 0, 15) #ifdef HAS_SPLITUVROW_LSX ANY12(SplitUVRow_Any_LSX, SplitUVRow_LSX, 0, 2, 0, 31) #endif -#ifdef HAS_ARGBTOUV444ROW_SSSE3 -ANY12(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_SSSE3, 0, 4, 0, 15) -#endif -#ifdef HAS_ARGBTOUVJ444ROW_SSSE3 -ANY12(ARGBToUVJ444Row_Any_SSSE3, ARGBToUVJ444Row_SSSE3, 0, 4, 0, 15) -#endif #ifdef HAS_ARGBTOUV444ROW_AVX2 ANY12(ARGBToUV444Row_Any_AVX2, ARGBToUV444Row_AVX2, 0, 4, 0, 31) #endif @@ -2060,10 +1786,6 @@ ANY12(ARGBToUVJ444Row_Any_AVX512BW, ARGBToUVJ444Row_AVX512BW, 0, 4, 0, 63) ANY12(YUY2ToUV422Row_Any_AVX2, YUY2ToUV422Row_AVX2, 1, 4, 1, 31) ANY12(UYVYToUV422Row_Any_AVX2, UYVYToUV422Row_AVX2, 1, 4, 1, 31) #endif -#ifdef HAS_YUY2TOUV422ROW_SSE2 -ANY12(YUY2ToUV422Row_Any_SSE2, YUY2ToUV422Row_SSE2, 1, 4, 1, 15) -ANY12(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_SSE2, 1, 4, 1, 15) -#endif #ifdef HAS_YUY2TOUV422ROW_NEON ANY12(ARGBToUV444Row_Any_NEON, ARGBToUV444Row_NEON, 0, 4, 0, 7) ANY12(ARGBToUVJ444Row_Any_NEON, ARGBToUVJ444Row_NEON, 0, 4, 0, 7) @@ -2134,24 +1856,12 @@ ANY12PT(SplitUVRow_16_Any_NEON, SplitUVRow_16_NEON, uint16_t, 2, 7) memcpy(dst_b + np, vout + 32, r); \ } -#ifdef HAS_SPLITRGBROW_SSSE3 -ANY13(SplitRGBRow_Any_SSSE3, SplitRGBRow_SSSE3, 3, 15) -#endif -#ifdef HAS_SPLITRGBROW_SSE41 -ANY13(SplitRGBRow_Any_SSE41, SplitRGBRow_SSE41, 3, 15) -#endif #ifdef HAS_SPLITRGBROW_AVX2 ANY13(SplitRGBRow_Any_AVX2, SplitRGBRow_AVX2, 3, 31) #endif #ifdef HAS_SPLITRGBROW_NEON ANY13(SplitRGBRow_Any_NEON, SplitRGBRow_NEON, 3, 15) #endif -#ifdef HAS_SPLITXRGBROW_SSE2 -ANY13(SplitXRGBRow_Any_SSE2, SplitXRGBRow_SSE2, 4, 7) -#endif -#ifdef HAS_SPLITXRGBROW_SSSE3 -ANY13(SplitXRGBRow_Any_SSSE3, SplitXRGBRow_SSSE3, 4, 7) -#endif #ifdef HAS_SPLITXRGBROW_AVX2 ANY13(SplitXRGBRow_Any_AVX2, SplitXRGBRow_AVX2, 4, 15) #endif @@ -2180,12 +1890,6 @@ ANY13(SplitXRGBRow_Any_NEON, SplitXRGBRow_NEON, 4, 15) memcpy(dst_a + np, vout + 48, r); \ } -#ifdef HAS_SPLITARGBROW_SSE2 -ANY14(SplitARGBRow_Any_SSE2, SplitARGBRow_SSE2, 4, 7) -#endif -#ifdef HAS_SPLITARGBROW_SSSE3 -ANY14(SplitARGBRow_Any_SSSE3, SplitARGBRow_SSSE3, 4, 7) -#endif #ifdef HAS_SPLITARGBROW_AVX2 ANY14(SplitARGBRow_Any_AVX2, SplitARGBRow_AVX2, 4, 15) #endif @@ -2276,18 +1980,12 @@ ANY12MS(ARGBToUVMatrixRow_Any_AVX2, ARGBToUVMatrixRow_AVX2, 0, 4, 15) #ifdef HAS_ARGBTOUVMATRIXROW_AVX512BW ANY12MS(ARGBToUVMatrixRow_Any_AVX512BW, ARGBToUVMatrixRow_AVX512BW, 0, 4, 63) #endif -#ifdef HAS_ARGBTOUVMATRIXROW_SSSE3 -ANY12MS(ARGBToUVMatrixRow_Any_SSSE3, ARGBToUVMatrixRow_SSSE3, 0, 4, 7) -#endif #ifdef HAS_ARGBTOUV444MATRIXROW_AVX2 ANY12M(ARGBToUV444MatrixRow_Any_AVX2, ARGBToUV444MatrixRow_AVX2, 4, 31) #endif #ifdef HAS_ARGBTOUV444MATRIXROW_AVX512BW ANY12M(ARGBToUV444MatrixRow_Any_AVX512BW, ARGBToUV444MatrixRow_AVX512BW, 4, 63) #endif -#ifdef HAS_ARGBTOUV444MATRIXROW_SSSE3 -ANY12M(ARGBToUV444MatrixRow_Any_SSSE3, ARGBToUV444MatrixRow_SSSE3, 4, 15) -#endif #ifdef HAS_ARGBTOUV444MATRIXROW_NEON ANY12M(ARGBToUV444MatrixRow_Any_NEON, ARGBToUV444MatrixRow_NEON, 4, 7) #endif @@ -2308,9 +2006,6 @@ ANY12M(ARGBToUV444MatrixRow_Any_NEON, ARGBToUV444MatrixRow_NEON, 4, 7) memcpy(dst_ptr + (ptrdiff_t)n, vout, (ptrdiff_t)r); \ } -#ifdef HAS_ARGBTOYROW_SSSE3 -ANY11MC(ARGBToYMatrixRow_Any_SSSE3, ARGBToYMatrixRow_SSSE3, 4, 15) -#endif #ifdef HAS_ARGBTOYROW_AVX2 ANY11MC(ARGBToYMatrixRow_Any_AVX2, ARGBToYMatrixRow_AVX2, 4, 31) #endif @@ -2358,26 +2053,10 @@ ANY12S(ABGRToUVJRow_Any_AVX2, ABGRToUVJRow_AVX2, 0, 4, 31) #ifdef HAS_ABGRTOUVJROW_AVX512BW ANY12S(ABGRToUVJRow_Any_AVX512BW, ABGRToUVJRow_AVX512BW, 0, 4, 63) #endif -#ifdef HAS_ARGBTOUVJROW_SSSE3 -ANY12S(ARGBToUVJRow_Any_SSSE3, ARGBToUVJRow_SSSE3, 0, 4, 15) -#endif -#ifdef HAS_ABGRTOUVJROW_SSSE3 -ANY12S(ABGRToUVJRow_Any_SSSE3, ABGRToUVJRow_SSSE3, 0, 4, 15) -#endif -#ifdef HAS_ARGBTOUVROW_SSSE3 -ANY12S(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_SSSE3, 0, 4, 15) -ANY12S(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_SSSE3, 0, 4, 15) -ANY12S(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_SSSE3, 0, 4, 15) -ANY12S(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_SSSE3, 0, 4, 15) -#endif #ifdef HAS_YUY2TOUVROW_AVX2 ANY12S(YUY2ToUVRow_Any_AVX2, YUY2ToUVRow_AVX2, 1, 4, 31) ANY12S(UYVYToUVRow_Any_AVX2, UYVYToUVRow_AVX2, 1, 4, 31) #endif -#ifdef HAS_YUY2TOUVROW_SSE2 -ANY12S(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_SSE2, 1, 4, 15) -ANY12S(UYVYToUVRow_Any_SSE2, UYVYToUVRow_SSE2, 1, 4, 15) -#endif #ifdef HAS_ARGBTOUVROW_NEON ANY12S(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, 0, 4, 15) #endif @@ -2594,15 +2273,9 @@ ANY11S(AYUVToVURow_Any_SVE2, AYUVToVURow_SVE2, 0, 4, 1) #ifdef HAS_DETILEROW_NEON ANYDETILE(DetileRow_Any_NEON, DetileRow_NEON, uint8_t, 1, 15) #endif -#ifdef HAS_DETILEROW_SSE2 -ANYDETILE(DetileRow_Any_SSE2, DetileRow_SSE2, uint8_t, 1, 15) -#endif #ifdef HAS_DETILEROW_16_NEON ANYDETILE(DetileRow_16_Any_NEON, DetileRow_16_NEON, uint16_t, 2, 15) #endif -#ifdef HAS_DETILEROW_16_SSE2 -ANYDETILE(DetileRow_16_Any_SSE2, DetileRow_16_SSE2, uint16_t, 2, 15) -#endif #ifdef HAS_DETILEROW_16_AVX ANYDETILE(DetileRow_16_Any_AVX, DetileRow_16_AVX, uint16_t, 2, 15) #endif @@ -2629,9 +2302,6 @@ ANYDETILE(DetileRow_16_Any_AVX, DetileRow_16_AVX, uint16_t, 2, 15) #ifdef HAS_DETILESPLITUVROW_NEON ANYDETILESPLITUV(DetileSplitUVRow_Any_NEON, DetileSplitUVRow_NEON, 15) #endif -#ifdef HAS_DETILESPLITUVROW_SSSE3 -ANYDETILESPLITUV(DetileSplitUVRow_Any_SSSE3, DetileSplitUVRow_SSSE3, 15) -#endif #define ANYDETILEMERGE(NAMEANY, ANY_SIMD, MASK) \ void NAMEANY(const uint8_t* src_y, ptrdiff_t src_y_tile_stride, \ @@ -2657,9 +2327,6 @@ ANYDETILESPLITUV(DetileSplitUVRow_Any_SSSE3, DetileSplitUVRow_SSSE3, 15) ANYDETILEMERGE(DetileToYUY2_Any_NEON, DetileToYUY2_NEON, 15) #endif -#ifdef HAS_DETILETOYUY2_SSE2 -ANYDETILEMERGE(DetileToYUY2_Any_SSE2, DetileToYUY2_SSE2, 15) -#endif #ifdef __cplusplus } // extern "C" diff --git a/source/row_common.cc b/source/row_common.cc index b2a0ec12b..c54a093df 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -4046,7 +4046,7 @@ void ARGBCopyYToAlphaRow_C(const uint8_t* src, uint8_t* dst, int width) { #define MAXTWIDTH 2048 #if !(defined(_MSC_VER) && !defined(__clang__) && defined(_M_IX86)) && \ - defined(HAS_I422TORGB565ROW_SSSE3) && !defined(LIBYUV_ENABLE_ROWWIN) + 0 && !defined(LIBYUV_ENABLE_ROWWIN) // row_win.cc has asm version, but GCC uses 2 step wrapper. void I422ToRGB565Row_SSSE3(const uint8_t* src_y, const uint8_t* src_u, @@ -4057,8 +4057,6 @@ void I422ToRGB565Row_SSSE3(const uint8_t* src_y, SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; - I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth); - ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth); src_y += twidth; src_u += twidth / 2; src_v += twidth / 2; @@ -4068,7 +4066,7 @@ void I422ToRGB565Row_SSSE3(const uint8_t* src_y, } #endif -#if defined(HAS_I422TOARGB1555ROW_SSSE3) +#if 0 void I422ToARGB1555Row_SSSE3(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -4079,8 +4077,6 @@ void I422ToARGB1555Row_SSSE3(const uint8_t* src_y, SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; - I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth); - ARGBToARGB1555Row_SSE2(row, dst_argb1555, twidth); src_y += twidth; src_u += twidth / 2; src_v += twidth / 2; @@ -4090,7 +4086,7 @@ void I422ToARGB1555Row_SSSE3(const uint8_t* src_y, } #endif -#if defined(HAS_I422TOARGB4444ROW_SSSE3) +#if 0 void I422ToARGB4444Row_SSSE3(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -4101,8 +4097,6 @@ void I422ToARGB4444Row_SSSE3(const uint8_t* src_y, SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; - I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth); - ARGBToARGB4444Row_SSE2(row, dst_argb4444, twidth); src_y += twidth; src_u += twidth / 2; src_v += twidth / 2; @@ -4112,7 +4106,7 @@ void I422ToARGB4444Row_SSSE3(const uint8_t* src_y, } #endif -#if defined(HAS_NV12TORGB565ROW_SSSE3) +#if 0 void NV12ToRGB565Row_SSSE3(const uint8_t* src_y, const uint8_t* src_uv, uint8_t* dst_rgb565, @@ -4122,8 +4116,6 @@ void NV12ToRGB565Row_SSSE3(const uint8_t* src_y, SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; - NV12ToARGBRow_SSSE3(src_y, src_uv, row, yuvconstants, twidth); - ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth); src_y += twidth; src_uv += twidth; dst_rgb565 += twidth * 2; @@ -4132,7 +4124,7 @@ void NV12ToRGB565Row_SSSE3(const uint8_t* src_y, } #endif -#if defined(HAS_NV12TORGB24ROW_SSSE3) +#if 0 void NV12ToRGB24Row_SSSE3(const uint8_t* src_y, const uint8_t* src_uv, uint8_t* dst_rgb24, @@ -4142,8 +4134,6 @@ void NV12ToRGB24Row_SSSE3(const uint8_t* src_y, SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; - NV12ToARGBRow_SSSE3(src_y, src_uv, row, yuvconstants, twidth); - ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth); src_y += twidth; src_uv += twidth; dst_rgb24 += twidth * 3; @@ -4152,7 +4142,7 @@ void NV12ToRGB24Row_SSSE3(const uint8_t* src_y, } #endif -#if defined(HAS_NV21TORGB24ROW_SSSE3) +#if 0 void NV21ToRGB24Row_SSSE3(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_rgb24, @@ -4162,8 +4152,6 @@ void NV21ToRGB24Row_SSSE3(const uint8_t* src_y, SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; - NV21ToARGBRow_SSSE3(src_y, src_vu, row, yuvconstants, twidth); - ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth); src_y += twidth; src_vu += twidth; dst_rgb24 += twidth * 3; @@ -4186,7 +4174,6 @@ void NV12ToRGB24Row_AVX2(const uint8_t* src_y, #if defined(HAS_ARGBTORGB24ROW_AVX2) ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth); #else - ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth); #endif src_y += twidth; src_uv += twidth; @@ -4210,7 +4197,6 @@ void NV21ToRGB24Row_AVX2(const uint8_t* src_y, #if defined(HAS_ARGBTORGB24ROW_AVX2) ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth); #else - ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth); #endif src_y += twidth; src_vu += twidth; @@ -4234,7 +4220,6 @@ void I422ToRGB565Row_AVX2(const uint8_t* src_y, #if defined(HAS_ARGBTORGB565ROW_AVX2) ARGBToRGB565Row_AVX2(row, dst_rgb565, twidth); #else - ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth); #endif src_y += twidth; src_u += twidth / 2; @@ -4260,7 +4245,6 @@ void I422ToARGB1555Row_AVX2(const uint8_t* src_y, #if defined(HAS_ARGBTOARGB1555ROW_AVX2) ARGBToARGB1555Row_AVX2(row, dst_argb1555, twidth); #else - ARGBToARGB1555Row_SSE2(row, dst_argb1555, twidth); #endif src_y += twidth; src_u += twidth / 2; @@ -4286,7 +4270,6 @@ void I422ToARGB4444Row_AVX2(const uint8_t* src_y, #if defined(HAS_ARGBTOARGB4444ROW_AVX2) ARGBToARGB4444Row_AVX2(row, dst_argb4444, twidth); #else - ARGBToARGB4444Row_SSE2(row, dst_argb4444, twidth); #endif src_y += twidth; src_u += twidth / 2; @@ -4312,7 +4295,6 @@ void I422ToRGB24Row_AVX2(const uint8_t* src_y, #if defined(HAS_ARGBTORGB24ROW_AVX2) ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth); #else - ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth); #endif src_y += twidth; src_u += twidth / 2; @@ -4338,7 +4320,6 @@ void I444ToRGB24Row_AVX2(const uint8_t* src_y, #if defined(HAS_ARGBTORGB24ROW_AVX2) ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth); #else - ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth); #endif src_y += twidth; src_u += twidth; @@ -4363,7 +4344,6 @@ void NV12ToRGB565Row_AVX2(const uint8_t* src_y, #if defined(HAS_ARGBTORGB565ROW_AVX2) ARGBToRGB565Row_AVX2(row, dst_rgb565, twidth); #else - ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth); #endif src_y += twidth; src_uv += twidth; @@ -4383,12 +4363,12 @@ void NV12ToRGB565Row_AVX2(const uint8_t* src_y, } #endif // HAS_RAWTOYJROW_AVX2 -#ifdef HAS_RGB24TOYJROW_SSSE3 +#if 0 // Convert 16 RGB24 pixels (64 bytes) to 16 YJ values. } #endif // HAS_RGB24TOYJROW_SSSE3 -#ifdef HAS_RAWTOYJROW_SSSE3 +#if 0 // Convert 16 RAW pixels (64 bytes) to 16 YJ values. } #endif // HAS_RAWTOYJROW_SSSE3 diff --git a/source/row_gcc.cc b/source/row_gcc.cc index 0da6e2ada..16b2c2780 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -21,7 +21,7 @@ extern "C" { (defined(__x86_64__) || defined(__i386__)) && \ !defined(LIBYUV_ENABLE_ROWWIN) -#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3) +#if 1 || 1 // Constants for ARGB @@ -30,9 +30,9 @@ static const uvec8 kARGBToYJ = {29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u}; -#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3) +#endif // 1 || 1 -#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3) +#if 1 || 1 // Constants for BGRA // Constants for ABGR @@ -43,9 +43,9 @@ static const uvec8 kARGBToYJ = {29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u, static const uvec16 kSub128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u}; -#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3) +#endif // 1 || 1 -#ifdef HAS_RGB24TOARGBROW_SSSE3 +#if 1 // Shuffle table for converting RGB24 to ARGB. static const uvec8 kShuffleMaskRGB24ToARGB = { @@ -113,112 +113,11 @@ static const lvec8 kShuffleNV21 = { }; #endif // HAS_RGB24TOARGBROW_SSSE3 -#ifdef HAS_J400TOARGBROW_SSE2 -void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" - "pslld $0x18,%%xmm5 \n" - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" - "lea 0x8(%0),%0 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm0,%%xmm0 \n" - "punpckhwd %%xmm1,%%xmm1 \n" - "por %%xmm5,%%xmm0 \n" - "por %%xmm5,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_y), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - ::"memory", - "cc", "xmm0", "xmm1", "xmm5"); -} -#endif // HAS_J400TOARGBROW_SSE2 +#if 1 -#ifdef HAS_RGB24TOARGBROW_SSSE3 -void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24, - uint8_t* dst_argb, - int width) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000 - "pslld $0x18,%%xmm5 \n" - "movdqa %3,%%xmm4 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm3 \n" - "lea 0x30(%0),%0 \n" - "movdqa %%xmm3,%%xmm2 \n" - "palignr $0x8,%%xmm1,%%xmm2 \n" - "pshufb %%xmm4,%%xmm2 \n" - "por %%xmm5,%%xmm2 \n" - "palignr $0xc,%%xmm0,%%xmm1 \n" - "pshufb %%xmm4,%%xmm0 \n" - "movdqu %%xmm2,0x20(%1) \n" - "por %%xmm5,%%xmm0 \n" - "pshufb %%xmm4,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "por %%xmm5,%%xmm1 \n" - "palignr $0x4,%%xmm3,%%xmm3 \n" - "pshufb %%xmm4,%%xmm3 \n" - "movdqu %%xmm1,0x10(%1) \n" - "por %%xmm5,%%xmm3 \n" - "movdqu %%xmm3,0x30(%1) \n" - "lea 0x40(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_rgb24), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kShuffleMaskRGB24ToARGB) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} -void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width) { - asm volatile( - "pcmpeqb %%xmm6,%%xmm6 \n" // 0xff000000 - "pslld $0x18,%%xmm6 \n" - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 12(%0),%%xmm1 \n" - "movdqu 24(%0),%%xmm2 \n" - "movdqu 32(%0),%%xmm3 \n" - "lea 0x30(%0),%0 \n" - "pshufb %%xmm4,%%xmm0 \n" - "pshufb %%xmm4,%%xmm1 \n" - "pshufb %%xmm4,%%xmm2 \n" - "pshufb %%xmm5,%%xmm3 \n" - "por %%xmm6,%%xmm0 \n" - "por %%xmm6,%%xmm1 \n" - "por %%xmm6,%%xmm2 \n" - "por %%xmm6,%%xmm3 \n" - "movdqu %%xmm0,0x00(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "movdqu %%xmm2,0x20(%1) \n" - "movdqu %%xmm3,0x30(%1) \n" - "lea 0x40(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kShuffleMaskRAWToARGB), // %3 - "m"(kShuffleMaskRAWToARGB_0) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} void RAWToARGBRow_AVX2(const uint8_t* src_raw, uint8_t* dst_argb, int width) { asm volatile( @@ -321,287 +220,19 @@ void RGB24ToARGBRow_AVX512BW(const uint8_t* src_rgb24, uint8_t* dst_argb, int wi // Same code as RAWToARGB with different shuffler and A in low bits -void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" // 0x000000ff - "psrld $24,%%xmm5 \n" - "movdqa %3,%%xmm4 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm3 \n" - "lea 0x30(%0),%0 \n" - "movdqa %%xmm3,%%xmm2 \n" - "palignr $0x8,%%xmm1,%%xmm2 \n" - "pshufb %%xmm4,%%xmm2 \n" - "por %%xmm5,%%xmm2 \n" - "palignr $0xc,%%xmm0,%%xmm1 \n" - "pshufb %%xmm4,%%xmm0 \n" - "movdqu %%xmm2,0x20(%1) \n" - "por %%xmm5,%%xmm0 \n" - "pshufb %%xmm4,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "por %%xmm5,%%xmm1 \n" - "palignr $0x4,%%xmm3,%%xmm3 \n" - "pshufb %%xmm4,%%xmm3 \n" - "movdqu %%xmm1,0x10(%1) \n" - "por %%xmm5,%%xmm3 \n" - "movdqu %%xmm3,0x30(%1) \n" - "lea 0x40(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_rgba), // %1 - "+r"(width) // %2 - : "m"(kShuffleMaskRAWToRGBA) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} -void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, - uint8_t* dst_rgb24, - int width) { - asm volatile( - "movdqa %3,%%xmm3 \n" - "movdqa %4,%%xmm4 \n" - "movdqa %5,%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x4(%0),%%xmm1 \n" - "movdqu 0x8(%0),%%xmm2 \n" - "lea 0x18(%0),%0 \n" - "pshufb %%xmm3,%%xmm0 \n" - "pshufb %%xmm4,%%xmm1 \n" - "pshufb %%xmm5,%%xmm2 \n" - "movq %%xmm0,(%1) \n" - "movq %%xmm1,0x8(%1) \n" - "movq %%xmm2,0x10(%1) \n" - "lea 0x18(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_rgb24), // %1 - "+r"(width) // %2 - : "m"(kShuffleMaskRAWToRGB24_0), // %3 - "m"(kShuffleMaskRAWToRGB24_1), // %4 - "m"(kShuffleMaskRAWToRGB24_2) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} -void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "mov $0x1080108,%%eax \n" - "movd %%eax,%%xmm5 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "mov $0x20802080,%%eax \n" - "movd %%eax,%%xmm6 \n" - "pshufd $0x0,%%xmm6,%%xmm6 \n" - "pcmpeqb %%xmm3,%%xmm3 \n" - "psllw $0xb,%%xmm3 \n" - "pcmpeqb %%xmm4,%%xmm4 \n" - "psllw $10,%%xmm4 \n" - "psrlw $5,%%xmm4 \n" - "pcmpeqb %%xmm7,%%xmm7 \n" - "psllw $0x8,%%xmm7 \n" - "sub %0,%1 \n" - "sub %0,%1 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "pand %%xmm3,%%xmm1 \n" - "psllw $0xb,%%xmm2 \n" - "pmulhuw %%xmm5,%%xmm1 \n" - "pmulhuw %%xmm5,%%xmm2 \n" - "psllw $0x8,%%xmm1 \n" - "por %%xmm2,%%xmm1 \n" - "pand %%xmm4,%%xmm0 \n" - "pmulhuw %%xmm6,%%xmm0 \n" - "por %%xmm7,%%xmm0 \n" - "movdqa %%xmm1,%%xmm2 \n" - "punpcklbw %%xmm0,%%xmm1 \n" - "punpckhbw %%xmm0,%%xmm2 \n" - "movdqu %%xmm1,0x00(%1,%0,2) \n" - "movdqu %%xmm2,0x10(%1,%0,2) \n" - "lea 0x10(%0),%0 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", - "xmm6", "xmm7"); -} -void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "mov $0x1080108,%%eax \n" - "movd %%eax,%%xmm5 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "mov $0x42004200,%%eax \n" - "movd %%eax,%%xmm6 \n" - "pshufd $0x0,%%xmm6,%%xmm6 \n" - "pcmpeqb %%xmm3,%%xmm3 \n" - "psllw $0xb,%%xmm3 \n" - "movdqa %%xmm3,%%xmm4 \n" - "psrlw $0x6,%%xmm4 \n" - "pcmpeqb %%xmm7,%%xmm7 \n" - "psllw $0x8,%%xmm7 \n" - "sub %0,%1 \n" - "sub %0,%1 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "psllw $0x1,%%xmm1 \n" - "psllw $0xb,%%xmm2 \n" - "pand %%xmm3,%%xmm1 \n" - "pmulhuw %%xmm5,%%xmm2 \n" - "pmulhuw %%xmm5,%%xmm1 \n" - "psllw $0x8,%%xmm1 \n" - "por %%xmm2,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "pand %%xmm4,%%xmm0 \n" - "psraw $0x8,%%xmm2 \n" - "pmulhuw %%xmm6,%%xmm0 \n" - "pand %%xmm7,%%xmm2 \n" - "por %%xmm2,%%xmm0 \n" - "movdqa %%xmm1,%%xmm2 \n" - "punpcklbw %%xmm0,%%xmm1 \n" - "punpckhbw %%xmm0,%%xmm2 \n" - "movdqu %%xmm1,0x00(%1,%0,2) \n" - "movdqu %%xmm2,0x10(%1,%0,2) \n" - "lea 0x10(%0),%0 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", - "xmm6", "xmm7"); -} -void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "mov $0xf0f0f0f,%%eax \n" - "movd %%eax,%%xmm4 \n" - "pshufd $0x0,%%xmm4,%%xmm4 \n" - "movdqa %%xmm4,%%xmm5 \n" - "pslld $0x4,%%xmm5 \n" - "sub %0,%1 \n" - "sub %0,%1 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "pand %%xmm4,%%xmm0 \n" - "pand %%xmm5,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm3 \n" - "psllw $0x4,%%xmm1 \n" - "psrlw $0x4,%%xmm3 \n" - "por %%xmm1,%%xmm0 \n" - "por %%xmm3,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm2,%%xmm0 \n" - "punpckhbw %%xmm2,%%xmm1 \n" - "movdqu %%xmm0,0x00(%1,%0,2) \n" - "movdqu %%xmm1,0x10(%1,%0,2) \n" - "lea 0x10(%0),%0 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} -void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { - asm volatile("movdqa %3,%%xmm6 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm3 \n" - "lea 0x40(%0),%0 \n" - "pshufb %%xmm6,%%xmm0 \n" - "pshufb %%xmm6,%%xmm1 \n" - "pshufb %%xmm6,%%xmm2 \n" - "pshufb %%xmm6,%%xmm3 \n" - "movdqa %%xmm1,%%xmm4 \n" - "psrldq $0x4,%%xmm1 \n" - "pslldq $0xc,%%xmm4 \n" - "movdqa %%xmm2,%%xmm5 \n" - "por %%xmm4,%%xmm0 \n" - "pslldq $0x8,%%xmm5 \n" - "movdqu %%xmm0,(%1) \n" - "por %%xmm5,%%xmm1 \n" - "psrldq $0x8,%%xmm2 \n" - "pslldq $0x4,%%xmm3 \n" - "por %%xmm3,%%xmm2 \n" - "movdqu %%xmm1,0x10(%1) \n" - "movdqu %%xmm2,0x20(%1) \n" - "lea 0x30(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "m"(kShuffleMaskARGBToRGB24) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", - "xmm6"); -} -void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) { - asm volatile("movdqa %3,%%xmm6 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm3 \n" - "lea 0x40(%0),%0 \n" - "pshufb %%xmm6,%%xmm0 \n" - "pshufb %%xmm6,%%xmm1 \n" - "pshufb %%xmm6,%%xmm2 \n" - "pshufb %%xmm6,%%xmm3 \n" - "movdqa %%xmm1,%%xmm4 \n" - "psrldq $0x4,%%xmm1 \n" - "pslldq $0xc,%%xmm4 \n" - "movdqa %%xmm2,%%xmm5 \n" - "por %%xmm4,%%xmm0 \n" - "pslldq $0x8,%%xmm5 \n" - "movdqu %%xmm0,(%1) \n" - "por %%xmm5,%%xmm1 \n" - "psrldq $0x8,%%xmm2 \n" - "pslldq $0x4,%%xmm3 \n" - "por %%xmm3,%%xmm2 \n" - "movdqu %%xmm1,0x10(%1) \n" - "movdqu %%xmm2,0x20(%1) \n" - "lea 0x30(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "m"(kShuffleMaskARGBToRAW) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", - "xmm6"); -} #ifdef HAS_ARGBTORGB24ROW_AVX2 // vpermd for 12+12 to 24 @@ -746,89 +377,9 @@ void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { } #endif -void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "pcmpeqb %%xmm3,%%xmm3 \n" - "psrld $0x1b,%%xmm3 \n" - "pcmpeqb %%xmm4,%%xmm4 \n" - "psrld $0x1a,%%xmm4 \n" - "pslld $0x5,%%xmm4 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pslld $0xb,%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "pslld $0x8,%%xmm0 \n" - "psrld $0x3,%%xmm1 \n" - "psrld $0x5,%%xmm2 \n" - "psrad $0x10,%%xmm0 \n" - "pand %%xmm3,%%xmm1 \n" - "pand %%xmm4,%%xmm2 \n" - "pand %%xmm5,%%xmm0 \n" - "por %%xmm2,%%xmm1 \n" - "por %%xmm1,%%xmm0 \n" - "packssdw %%xmm0,%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - ::"memory", - "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} -void ARGBToRGB565DitherRow_SSE2(const uint8_t* src, - uint8_t* dst, - uint32_t dither4, - int width) { - asm volatile( - "movd %3,%%xmm6 \n" - "punpcklbw %%xmm6,%%xmm6 \n" - "movdqa %%xmm6,%%xmm7 \n" - "punpcklwd %%xmm6,%%xmm6 \n" - "punpckhwd %%xmm7,%%xmm7 \n" - "pcmpeqb %%xmm3,%%xmm3 \n" - "psrld $0x1b,%%xmm3 \n" - "pcmpeqb %%xmm4,%%xmm4 \n" - "psrld $0x1a,%%xmm4 \n" - "pslld $0x5,%%xmm4 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pslld $0xb,%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "paddusb %%xmm6,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "pslld $0x8,%%xmm0 \n" - "psrld $0x3,%%xmm1 \n" - "psrld $0x5,%%xmm2 \n" - "psrad $0x10,%%xmm0 \n" - "pand %%xmm3,%%xmm1 \n" - "pand %%xmm4,%%xmm2 \n" - "pand %%xmm5,%%xmm0 \n" - "por %%xmm2,%%xmm1 \n" - "por %%xmm1,%%xmm0 \n" - "packssdw %%xmm0,%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "m"(dither4) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} #ifdef HAS_ARGBTORGB565DITHERROW_AVX2 void ARGBToRGB565DitherRow_AVX2(const uint8_t* src, @@ -876,75 +427,9 @@ void ARGBToRGB565DitherRow_AVX2(const uint8_t* src, } #endif // HAS_ARGBTORGB565DITHERROW_AVX2 -void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "pcmpeqb %%xmm4,%%xmm4 \n" - "psrld $0x1b,%%xmm4 \n" - "movdqa %%xmm4,%%xmm5 \n" - "pslld $0x5,%%xmm5 \n" - "movdqa %%xmm4,%%xmm6 \n" - "pslld $0xa,%%xmm6 \n" - "pcmpeqb %%xmm7,%%xmm7 \n" - "pslld $0xf,%%xmm7 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm0,%%xmm3 \n" - "psrad $0x10,%%xmm0 \n" - "psrld $0x3,%%xmm1 \n" - "psrld $0x6,%%xmm2 \n" - "psrld $0x9,%%xmm3 \n" - "pand %%xmm7,%%xmm0 \n" - "pand %%xmm4,%%xmm1 \n" - "pand %%xmm5,%%xmm2 \n" - "pand %%xmm6,%%xmm3 \n" - "por %%xmm1,%%xmm0 \n" - "por %%xmm3,%%xmm2 \n" - "por %%xmm2,%%xmm0 \n" - "packssdw %%xmm0,%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - ::"memory", - "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); -} -void ARGBToARGB4444Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "pcmpeqb %%xmm4,%%xmm4 \n" - "psllw $0xc,%%xmm4 \n" - "movdqa %%xmm4,%%xmm3 \n" - "psrlw $0x8,%%xmm3 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pand %%xmm3,%%xmm0 \n" - "pand %%xmm4,%%xmm1 \n" - "psrlq $0x4,%%xmm0 \n" - "psrlq $0x8,%%xmm1 \n" - "por %%xmm1,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - ::"memory", - "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); -} #endif // HAS_RGB24TOARGBROW_SSSE3 /* @@ -980,83 +465,9 @@ static const uint32_t kMaskRB10 = 0x3ff003ff; static const uint32_t kMaskAG10 = 0xc000ff00; static const uint32_t kMulAG10 = 64 * 65536 + 1028; -void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "movdqa %3,%%xmm2 \n" // shuffler for RB - "movd %4,%%xmm3 \n" // multipler for RB - "movd %5,%%xmm4 \n" // mask for R10 B10 - "movd %6,%%xmm5 \n" // mask for AG - "movd %7,%%xmm6 \n" // multipler for AG - "pshufd $0x0,%%xmm3,%%xmm3 \n" - "pshufd $0x0,%%xmm4,%%xmm4 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "pshufd $0x0,%%xmm6,%%xmm6 \n" - "sub %0,%1 \n" - "1: \n" - "movdqu (%0),%%xmm0 \n" // fetch 4 ARGB pixels - "movdqa %%xmm0,%%xmm1 \n" - "pshufb %%xmm2,%%xmm1 \n" // R0B0 - "pand %%xmm5,%%xmm0 \n" // A0G0 - "pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10 - "pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10 - "pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10 - "pslld $10,%%xmm0 \n" // A2 x10 G10 x10 - "por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10 - "movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels - "add $0x10,%0 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "m"(kShuffleRB30), // %3 - "m"(kMulRB10), // %4 - "m"(kMaskRB10), // %5 - "m"(kMaskAG10), // %6 - "m"(kMulAG10) // %7 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} -void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "movdqa %3,%%xmm2 \n" // shuffler for RB - "movd %4,%%xmm3 \n" // multipler for RB - "movd %5,%%xmm4 \n" // mask for R10 B10 - "movd %6,%%xmm5 \n" // mask for AG - "movd %7,%%xmm6 \n" // multipler for AG - "pshufd $0x0,%%xmm3,%%xmm3 \n" - "pshufd $0x0,%%xmm4,%%xmm4 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "pshufd $0x0,%%xmm6,%%xmm6 \n" - "sub %0,%1 \n" - - "1: \n" - "movdqu (%0),%%xmm0 \n" // fetch 4 ABGR pixels - "movdqa %%xmm0,%%xmm1 \n" - "pshufb %%xmm2,%%xmm1 \n" // R0B0 - "pand %%xmm5,%%xmm0 \n" // A0G0 - "pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10 - "pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10 - "pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10 - "pslld $10,%%xmm0 \n" // A2 x10 G10 x10 - "por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10 - "movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels - "add $0x10,%0 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "m"(kShuffleBR30), // %3 reversed shuffler - "m"(kMulRB10), // %4 - "m"(kMaskRB10), // %5 - "m"(kMaskAG10), // %6 - "m"(kMulAG10) // %7 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} #ifdef HAS_ARGBTOAR30ROW_AVX2 void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { @@ -1140,99 +551,13 @@ static const uvec8 kShuffleARGBToAB64Lo = {2, 2, 1, 1, 0, 0, 3, 3, static const uvec8 kShuffleARGBToAB64Hi = {10, 10, 9, 9, 8, 8, 11, 11, 14, 14, 13, 13, 12, 12, 15, 15}; -void ARGBToAR64Row_SSSE3(const uint8_t* src_argb, - uint16_t* dst_ar64, - int width) { - asm volatile( - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "lea 0x10(%0),%0 \n" - "lea 0x20(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_ar64), // %1 - "+r"(width) // %2 - ::"memory", - "cc", "xmm0", "xmm1"); -} -void ARGBToAB64Row_SSSE3(const uint8_t* src_argb, - uint16_t* dst_ab64, - int width) { - asm volatile( - "movdqa %3,%%xmm2 \n" - "movdqa %4,%%xmm3 \n" LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pshufb %%xmm2,%%xmm0 \n" - "pshufb %%xmm3,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "lea 0x10(%0),%0 \n" - "lea 0x20(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_ab64), // %1 - "+r"(width) // %2 - : "m"(kShuffleARGBToAB64Lo), // %3 - "m"(kShuffleARGBToAB64Hi) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2"); -} -void AR64ToARGBRow_SSSE3(const uint16_t* src_ar64, - uint8_t* dst_argb, - int width) { - asm volatile( - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "psrlw $8,%%xmm0 \n" - "psrlw $8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x20(%0),%0 \n" - "lea 0x10(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_ar64), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - ::"memory", - "cc", "xmm0", "xmm1"); -} -void AB64ToARGBRow_SSSE3(const uint16_t* src_ab64, - uint8_t* dst_argb, - int width) { - asm volatile("movdqa %3,%%xmm2 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "psrlw $8,%%xmm0 \n" - "psrlw $8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "pshufb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x20(%0),%0 \n" - "lea 0x10(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_ab64), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kShuffleARGBToABGR) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2"); -} + + + #ifdef HAS_ARGBTOAR64ROW_AVX2 void ARGBToAR64Row_AVX2(const uint8_t* src_argb, @@ -1407,37 +732,9 @@ void AB64ToARGBRow_AVX2(const uint16_t* src_ab64, // clang-format on -#ifdef HAS_ARGBTOYROW_SSSE3 -// Convert 16 ARGB pixels (64 bytes) to 16 Y values. -void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { - ARGBToYMatrixRow_SSSE3(src_argb, dst_y, width, &kArgbI601Constants); -} -#endif // HAS_ARGBTOYROW_SSSE3 -#ifdef HAS_ARGBTOYJROW_SSSE3 -// Convert 16 ARGB pixels (64 bytes) to 16 YJ values. -// Same as ARGBToYRow but different coefficients, no add 16. -void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { - ARGBToYMatrixRow_SSSE3(src_argb, dst_y, width, &kArgbJPEGConstants); -} -#endif // HAS_ARGBTOYJROW_SSSE3 -#ifdef HAS_ABGRTOYJROW_SSSE3 -// Convert 16 ABGR pixels (64 bytes) to 16 YJ values. -// Same as ABGRToYRow but different coefficients, no add 16. -void ABGRToYJRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) { - ARGBToYMatrixRow_SSSE3(src_abgr, dst_y, width, &kAbgrJPEGConstants); -} -#endif // HAS_ABGRTOYJROW_SSSE3 - -#ifdef HAS_RGBATOYJROW_SSSE3 -// Convert 16 RGBA pixels (64 bytes) to 16 YJ values. -// Same as ARGBToYRow but different coefficients, no add 16. -void RGBAToYJRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) { - ARGBToYMatrixRow_SSSE3(src_rgba, dst_y, width, &kRgbaJPEGConstants); -} -#endif // HAS_RGBATOYJROW_SSSE3 #if defined(HAS_ARGBTOYROW_AVX2) || defined(HAS_ABGRTOYROW_AVX2) || \ defined(HAS_ARGBEXTRACTALPHAROW_AVX2) @@ -1497,31 +794,6 @@ void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width) { static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7}; #endif -#ifdef HAS_ARGBTOYROW_SSSE3 -void ARGBToYMatrixRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_y, - int width, - const struct ArgbConstants* c) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psllw $15,%%xmm5 \n" - "packsswb %%xmm5,%%xmm5 \n" - "movdqa 0(%3),%%xmm4 \n" - "movdqa 0x60(%3),%%xmm7 \n" - "movdqa %%xmm4,%%xmm6 \n" - "pmaddubsw %%xmm5,%%xmm6 \n" - "phaddw %%xmm6,%%xmm6 \n" - "psubw %%xmm6,%%xmm7 \n" - LABELALIGN "" - RGBTOY(xmm7) - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "r"(c) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif #ifdef HAS_ARGBTOYROW_AVX2 void ARGBToYMatrixRow_AVX2(const uint8_t* src_argb, @@ -1620,75 +892,6 @@ void ARGBToYMatrixRow_AVX512BW(const uint8_t* src_argb, } #endif -#ifdef HAS_ARGBTOUV444ROW_SSSE3 -void ARGBToUV444MatrixRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width, - const struct ArgbConstants* c) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" // 0x8000 - "psllw $15,%%xmm5 \n" - "movdqa 0x20(%4),%%xmm3 \n" // kRGBToU - "movdqa 0x40(%4),%%xmm4 \n" // kRGBToV - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" - "pmaddubsw %%xmm3,%%xmm0 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm6,%%xmm2 \n" - "movdqa %%xmm5,%%xmm1 \n" - "movdqa %%xmm5,%%xmm6 \n" - "psubw %%xmm0,%%xmm1 \n" - "psubw %%xmm2,%%xmm6 \n" - "psrlw $0x8,%%xmm1 \n" - "psrlw $0x8,%%xmm6 \n" - "packuswb %%xmm6,%%xmm1 \n" - "movdqu %%xmm1,(%1) \n" - - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm6 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm6,%%xmm2 \n" - "movdqa %%xmm5,%%xmm1 \n" - "movdqa %%xmm5,%%xmm6 \n" - "psubw %%xmm0,%%xmm1 \n" - "psubw %%xmm2,%%xmm6 \n" - "psrlw $0x8,%%xmm1 \n" - "psrlw $0x8,%%xmm6 \n" - "packuswb %%xmm6,%%xmm1 \n" - "movdqu %%xmm1,0x00(%1,%2,1) \n" - - "lea 0x40(%0),%0 \n" - "lea 0x10(%1),%1 \n" - "subl $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 -#if defined(__i386__) - "+m"(width) // %3 -#else - "+rm"(width) // %3 -#endif - : "r"(c) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} -#endif // HAS_ARGBTOUV444ROW_SSSE3 #ifdef HAS_ARGBTOUV444ROW_AVX2 @@ -1842,7 +1045,7 @@ void ARGBToUV444MatrixRow_AVX512BW(const uint8_t* src_argb, } #endif // HAS_ARGBTOUV444ROW_AVX512BW -#ifdef HAS_ARGBTOUVROW_SSSE3 +#if 1 // ARGBARGB to AARRGGBB shuffle static const lvec8 kShuffleAARRGGBB = { @@ -1852,73 +1055,7 @@ static const lvec8 kShuffleAARRGGBB = { // 8x2 -> 4x1 ARGB pixels converted to 4 U and 4 V // ARGBToUV does rounding average of 4 ARGB pixels -void ARGBToUVMatrixRow_SSSE3(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width, - const struct ArgbConstants* c) { - asm volatile( - "movdqa 0x20(%5),%%xmm4 \n" // RGBToU - "movdqa 0x40(%5),%%xmm5 \n" // RGBToV - "pcmpeqb %%xmm6,%%xmm6 \n" // 0x0101 - "pabsb %%xmm6,%%xmm6 \n" - "movdqa %6,%%xmm7 \n" // kShuffleAARRGGBB - "sub %1,%2 \n" - "1: \n" - "movdqu (%0),%%xmm0 \n" // Read 8x2 ARGB Pixels - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x00(%0,%4,1),%%xmm2 \n" - "movdqu 0x10(%0,%4,1),%%xmm3 \n" - "pshufb %%xmm7,%%xmm0 \n" // aarrggbb - "pshufb %%xmm7,%%xmm1 \n" - "pshufb %%xmm7,%%xmm2 \n" - "pshufb %%xmm7,%%xmm3 \n" - "pmaddubsw %%xmm6,%%xmm0 \n" // 8x2 -> 4x2 - "pmaddubsw %%xmm6,%%xmm1 \n" - "pmaddubsw %%xmm6,%%xmm2 \n" - "pmaddubsw %%xmm6,%%xmm3 \n" - "paddw %%xmm2,%%xmm0 \n" // 4x2 -> 4x1 - "paddw %%xmm3,%%xmm1 \n" - "pxor %%xmm2,%%xmm2 \n" // 0 for vpavgw - "psrlw $1,%%xmm0 \n" - "psrlw $1,%%xmm1 \n" - "pavgw %%xmm2,%%xmm0 \n" - "pavgw %%xmm2,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" // mutates - - "movdqa %%xmm6,%%xmm2 \n" - "psllw $15,%%xmm2 \n" // 0x8000 - "movdqa %%xmm0,%%xmm1 \n" - "pmaddubsw %%xmm5,%%xmm1 \n" // 4 V - "pmaddubsw %%xmm4,%%xmm0 \n" // 4 U - "phaddw %%xmm1,%%xmm0 \n" // uuuuvvvv - "psubw %%xmm0,%%xmm2 \n" - "psrlw $0x8,%%xmm2 \n" - "packuswb %%xmm2,%%xmm2 \n" - "movd %%xmm2,(%1) \n" // Write 4 U's - "pshufd $0x55,%%xmm2,%%xmm2 \n" // Copy V to low 4 bytes - "movd %%xmm2,0x00(%1,%2,1) \n" // Write 4 V's - - "lea 0x20(%0),%0 \n" - "lea 0x4(%1),%1 \n" - "subl $0x8,%3 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 -#if defined(__i386__) - "+m"(width) // %3 -#else - "+rm"(width) // %3 -#endif - : "r"((ptrdiff_t)(src_stride_argb)), // %4 - "r"(c), // %5 - "m"(kShuffleAARRGGBB) // %6 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} #endif // HAS_ARGBTOUVROW_SSSE3 @@ -2004,15 +1141,6 @@ void ARGBToUVMatrixRow_AVX2(const uint8_t* src_argb, // VG -0.7344 coefficient = -94 // VR 0.875 coefficient = 112 -#ifdef HAS_ARGBTOUV444ROW_SSSE3 -void ARGBToUV444Row_SSSE3(const uint8_t* src_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - ARGBToUV444MatrixRow_SSSE3(src_argb, dst_u, dst_v, width, - &kArgbI601Constants); -} -#endif // HAS_ARGBTOUV444ROW_SSSE3 #ifdef HAS_ARGBTOYROW_AVX2 @@ -2089,43 +1217,6 @@ void ARGBToUV444Row_AVX512BW(const uint8_t* src_argb, } #endif // HAS_ARGBTOUV444ROW_AVX512BW -#ifdef HAS_ARGBTOUVROW_SSSE3 -void ARGBToUVRow_SSSE3(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - ARGBToUVMatrixRow_SSSE3(src_argb, src_stride_argb, dst_u, dst_v, width, - &kArgbI601Constants); -} - -void ABGRToUVRow_SSSE3(const uint8_t* src_abgr, - int src_stride_abgr, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - ARGBToUVMatrixRow_SSSE3(src_abgr, src_stride_abgr, dst_u, dst_v, width, - &kAbgrI601Constants); -} - -void BGRAToUVRow_SSSE3(const uint8_t* src_bgra, - int src_stride_bgra, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - ARGBToUVMatrixRow_SSSE3(src_bgra, src_stride_bgra, dst_u, dst_v, width, - &kBgraI601Constants); -} - -void RGBAToUVRow_SSSE3(const uint8_t* src_rgba, - int src_stride_rgba, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - ARGBToUVMatrixRow_SSSE3(src_rgba, src_stride_rgba, dst_u, dst_v, width, - &kRgbaI601Constants); -} -#endif // HAS_ARGBTOUVROW_SSSE3 #ifdef HAS_ARGBTOUVROW_AVX2 void ARGBToUVRow_AVX2(const uint8_t* src_argb, @@ -2147,15 +1238,6 @@ void ABGRToUVRow_AVX2(const uint8_t* src_abgr, } #endif // HAS_ARGBTOUVROW_AVX2 -#ifdef HAS_ARGBTOUVJ444ROW_SSSE3 -void ARGBToUVJ444Row_SSSE3(const uint8_t* src_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - ARGBToUV444MatrixRow_SSSE3(src_argb, dst_u, dst_v, width, - &kArgbJPEGConstants); -} -#endif // HAS_ARGBTOUVJ444ROW_SSSE3 #ifdef HAS_ARGBTOUVJ444ROW_AVX2 void ARGBToUVJ444Row_AVX2(const uint8_t* src_argb, @@ -2176,27 +1258,7 @@ void ARGBToUVJ444Row_AVX512BW(const uint8_t* src_argb, } #endif // HAS_ARGBTOUVJ444ROW_AVX512BW -#ifdef HAS_ARGBTOUVJROW_SSSE3 -void ARGBToUVJRow_SSSE3(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - ARGBToUVMatrixRow_SSSE3(src_argb, src_stride_argb, dst_u, dst_v, width, - &kArgbJPEGConstants); -} -#endif // HAS_ARGBTOUVJROW_SSSE3 -#ifdef HAS_ABGRTOUVJROW_SSSE3 -void ABGRToUVJRow_SSSE3(const uint8_t* src_abgr, - int src_stride_abgr, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - ARGBToUVMatrixRow_SSSE3(src_abgr, src_stride_abgr, dst_u, dst_v, width, - &kAbgrJPEGConstants); -} -#endif // HAS_ABGRTOUVJROW_SSSE3 #ifdef HAS_ARGBTOUVJROW_AVX2 void ARGBToUVJRow_AVX2(const uint8_t* src_argb, @@ -2343,19 +1405,13 @@ void ABGRToUVJRow_AVX512BW(const uint8_t* src_abgr, #endif // HAS_ABGRTOUVJROW_AVX512BW #endif // HAS_ARGBTOUVROW_AVX512BW -void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width) { - ARGBToYMatrixRow_SSSE3(src_bgra, dst_y, width, &kBgraI601Constants); -} -void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) { - ARGBToYMatrixRow_SSSE3(src_abgr, dst_y, width, &kAbgrI601Constants); -} -void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) { - ARGBToYMatrixRow_SSSE3(src_rgba, dst_y, width, &kRgbaI601Constants); -} -#if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2) + + + +#if 1 || defined(HAS_I422TOARGBROW_AVX2) // Read 8 UV from 444 #define READYUV444 \ @@ -2682,707 +1738,55 @@ void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) { "movdqu %%xmm3,0x10(%[dst_ar30]) \n" \ "lea 0x20(%[dst_ar30]), %[dst_ar30] \n" -void OMITFP I444ToARGBRow_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - LABELALIGN - "1: \n" - READYUV444 - YUVTORGB(yuvconstants) - STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); -} -#ifdef HAS_I444ALPHATOARGBROW_SSSE3 -void OMITFP I444AlphaToARGBRow_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile(YUVTORGB_SETUP( - yuvconstants) "sub %[u_buf],%[v_buf] \n" - LABELALIGN "1: \n" READYUVA444 YUVTORGB(yuvconstants) - STOREARGB - "subl $0x8,%[width] \n" - "jg 1b \n" - : [y_buf] "+r"(y_buf), // %[y_buf] - [u_buf] "+r"(u_buf), // %[u_buf] - [v_buf] "+r"(v_buf), // %[v_buf] - [a_buf] "+r"(a_buf), // %[a_buf] - [dst_argb] "+r"(dst_argb), // %[dst_argb] -#if defined(__i386__) - [width] "+m"(width) // %[width] -#else - [width] "+rm"(width) // %[width] -#endif - : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", - "xmm4", "xmm5"); -} -#endif // HAS_I444ALPHATOARGBROW_SSSE3 -void OMITFP I422ToRGB24Row_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n" - "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n" - "sub %[u_buf],%[v_buf] \n" - LABELALIGN - "1: \n" - READYUV422 - YUVTORGB(yuvconstants) - STORERGB24 - "subl $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24] -#if defined(__i386__) - [width]"+m"(width) // %[width] -#else - [width]"+rm"(width) // %[width] -#endif - : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] - [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0), - [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24) - : "memory", "cc", YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" - ); -} -void OMITFP I444ToRGB24Row_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n" - "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n" - "sub %[u_buf],%[v_buf] \n" - LABELALIGN - "1: \n" - READYUV444 - YUVTORGB(yuvconstants) - STORERGB24 - "subl $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24] -#if defined(__i386__) - [width]"+m"(width) // %[width] -#else - [width]"+rm"(width) // %[width] -#endif - : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] - [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0), - [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24) - : "memory", "cc", YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" - ); -} -void OMITFP I422ToARGBRow_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - LABELALIGN - "1: \n" - READYUV422 - YUVTORGB(yuvconstants) - STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); -} -void OMITFP I422ToAR30Row_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" // AR30 constants - "psrlw $14,%%xmm5 \n" - "psllw $4,%%xmm5 \n" // 2 alpha bits - "pxor %%xmm6,%%xmm6 \n" // 0 for min - "pcmpeqb %%xmm7,%%xmm7 \n" - "psrlw $6,%%xmm7 \n" // 1023 for max - - LABELALIGN - "1: \n" - READYUV422 - YUVTORGB16(yuvconstants) - STOREAR30 - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); -} // 10 bit YUV to ARGB -void OMITFP I210ToARGBRow_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - LABELALIGN - "1: \n" - READYUV210 - YUVTORGB(yuvconstants) - STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); -} // 12 bit YUV to ARGB -void OMITFP I212ToARGBRow_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - LABELALIGN - "1: \n" - READYUV212 - YUVTORGB(yuvconstants) - STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); -} // 10 bit YUV to AR30 -void OMITFP I210ToAR30Row_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $14,%%xmm5 \n" - "psllw $4,%%xmm5 \n" // 2 alpha bits - "pxor %%xmm6,%%xmm6 \n" // 0 for min - "pcmpeqb %%xmm7,%%xmm7 \n" - "psrlw $6,%%xmm7 \n" // 1023 for max - LABELALIGN - "1: \n" - READYUV210 - YUVTORGB16(yuvconstants) - STOREAR30 - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); -} // 12 bit YUV to AR30 -void OMITFP I212ToAR30Row_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $14,%%xmm5 \n" - "psllw $4,%%xmm5 \n" // 2 alpha bits - "pxor %%xmm6,%%xmm6 \n" // 0 for min - "pcmpeqb %%xmm7,%%xmm7 \n" - "psrlw $6,%%xmm7 \n" // 1023 for max - LABELALIGN - "1: \n" - READYUV212 - YUVTORGB16(yuvconstants) - STOREAR30 - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); -} // 10 bit YUV to ARGB -void OMITFP I410ToARGBRow_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - LABELALIGN - "1: \n" - READYUV410 - YUVTORGB(yuvconstants) - STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); -} -#ifdef HAS_I210ALPHATOARGBROW_SSSE3 -// 10 bit YUVA to ARGB -void OMITFP I210AlphaToARGBRow_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - const uint16_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile(YUVTORGB_SETUP( - yuvconstants) "sub %[u_buf],%[v_buf] \n" - LABELALIGN "1: \n" READYUVA210 YUVTORGB(yuvconstants) - STOREARGB - "subl $0x8,%[width] \n" - "jg 1b \n" - : [y_buf] "+r"(y_buf), // %[y_buf] - [u_buf] "+r"(u_buf), // %[u_buf] - [v_buf] "+r"(v_buf), // %[v_buf] - [a_buf] "+r"(a_buf), - [dst_argb] "+r"(dst_argb), // %[dst_argb] -#if defined(__i386__) - [width] "+m"(width) // %[width] -#else - [width] "+rm"(width) // %[width] -#endif - : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", - "xmm4", "xmm5"); -} -#endif - -#ifdef HAS_I410ALPHATOARGBROW_SSSE3 -// 10 bit YUVA to ARGB -void OMITFP I410AlphaToARGBRow_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - const uint16_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile(YUVTORGB_SETUP( - yuvconstants) "sub %[u_buf],%[v_buf] \n" - - LABELALIGN "1: \n" READYUVA410 YUVTORGB(yuvconstants) - STOREARGB - "subl $0x8,%[width] \n" - "jg 1b \n" - : [y_buf] "+r"(y_buf), // %[y_buf] - [u_buf] "+r"(u_buf), // %[u_buf] - [v_buf] "+r"(v_buf), // %[v_buf] - [a_buf] "+r"(a_buf), - [dst_argb] "+r"(dst_argb), // %[dst_argb] -#if defined(__i386__) - [width] "+m"(width) // %[width] -#else - [width] "+rm"(width) // %[width] -#endif - : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", - "xmm4", "xmm5"); -} -#endif // 10 bit YUV to AR30 -void OMITFP I410ToAR30Row_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $14,%%xmm5 \n" - "psllw $4,%%xmm5 \n" // 2 alpha bits - "pxor %%xmm6,%%xmm6 \n" // 0 for min - "pcmpeqb %%xmm7,%%xmm7 \n" - "psrlw $6,%%xmm7 \n" // 1023 for max - LABELALIGN - "1: \n" - READYUV410 - YUVTORGB16(yuvconstants) - STOREAR30 - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); -} -#ifdef HAS_I422ALPHATOARGBROW_SSSE3 -void OMITFP I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile(YUVTORGB_SETUP( - yuvconstants) "sub %[u_buf],%[v_buf] \n" - LABELALIGN "1: \n" READYUVA422 YUVTORGB(yuvconstants) - STOREARGB - "subl $0x8,%[width] \n" - "jg 1b \n" - : [y_buf] "+r"(y_buf), // %[y_buf] - [u_buf] "+r"(u_buf), // %[u_buf] - [v_buf] "+r"(v_buf), // %[v_buf] - [a_buf] "+r"(a_buf), // %[a_buf] - [dst_argb] "+r"(dst_argb), // %[dst_argb] -#if defined(__i386__) - [width] "+m"(width) // %[width] -#else - [width] "+rm"(width) // %[width] -#endif - : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", - "xmm4", "xmm5"); -} -#endif // HAS_I422ALPHATOARGBROW_SSSE3 -void OMITFP NV12ToARGBRow_SSSE3(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile(YUVTORGB_SETUP( - yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" - LABELALIGN "1: \n" READNV12 YUVTORGB(yuvconstants) - STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf] "+r"(y_buf), // %[y_buf] - [uv_buf] "+r"(uv_buf), // %[uv_buf] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+rm"(width) // %[width] - : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", - "xmm4", "xmm5"); -} -void OMITFP NV21ToARGBRow_SSSE3(const uint8_t* y_buf, - const uint8_t* vu_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile(YUVTORGB_SETUP( - yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" - LABELALIGN "1: \n" READNV21 YUVTORGB(yuvconstants) - STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf] "+r"(y_buf), // %[y_buf] - [vu_buf] "+r"(vu_buf), // %[vu_buf] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+rm"(width) // %[width] - : [yuvconstants] "r"(yuvconstants), // %[yuvconstants] - [kShuffleNV21] "m"(kShuffleNV21) - : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", - "xmm4", "xmm5"); -} -void OMITFP YUY2ToARGBRow_SSSE3(const uint8_t* yuy2_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - "movdqa %[kShuffleYUY2Y],%%xmm6 \n" - "movdqa %[kShuffleYUY2UV],%%xmm7 \n" YUVTORGB_SETUP( - yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" - LABELALIGN "1: \n" READYUY2 YUVTORGB(yuvconstants) STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" - : [yuy2_buf] "+r"(yuy2_buf), // %[yuy2_buf] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+rm"(width) // %[width] - : [yuvconstants] "r"(yuvconstants), // %[yuvconstants] - [kShuffleYUY2Y] "m"(kShuffleYUY2Y), [kShuffleYUY2UV] "m"(kShuffleYUY2UV) - : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", - "xmm5", "xmm6", "xmm7"); -} -void OMITFP UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - "movdqa %[kShuffleUYVYY],%%xmm6 \n" - "movdqa %[kShuffleUYVYUV],%%xmm7 \n" YUVTORGB_SETUP( - yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" - LABELALIGN "1: \n" READUYVY YUVTORGB(yuvconstants) STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" - : [uyvy_buf] "+r"(uyvy_buf), // %[uyvy_buf] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+rm"(width) // %[width] - : [yuvconstants] "r"(yuvconstants), // %[yuvconstants] - [kShuffleUYVYY] "m"(kShuffleUYVYY), [kShuffleUYVYUV] "m"(kShuffleUYVYUV) - : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", - "xmm5"); -} -void OMITFP P210ToARGBRow_SSSE3(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile(YUVTORGB_SETUP( - yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" - LABELALIGN "1: \n" READP210 YUVTORGB(yuvconstants) - STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf] "+r"(y_buf), // %[y_buf] - [uv_buf] "+r"(uv_buf), // %[u_buf] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+rm"(width) // %[width] - : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", - "xmm4", "xmm5"); -} -void OMITFP P410ToARGBRow_SSSE3(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile(YUVTORGB_SETUP( - yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" - LABELALIGN "1: \n" READP410 YUVTORGB(yuvconstants) - STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf] "+r"(y_buf), // %[y_buf] - [uv_buf] "+r"(uv_buf), // %[u_buf] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+rm"(width) // %[width] - : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", - "xmm4", "xmm5"); -} -void OMITFP P210ToAR30Row_SSSE3(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $14,%%xmm5 \n" - "psllw $4,%%xmm5 \n" // 2 alpha bits - "pxor %%xmm6,%%xmm6 \n" // 0 for min - "pcmpeqb %%xmm7,%%xmm7 \n" - "psrlw $6,%%xmm7 \n" // 1023 for max - LABELALIGN - "1: \n" - READP210 - YUVTORGB16(yuvconstants) - STOREAR30 - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [uv_buf]"+r"(uv_buf), // %[uv_buf] - [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); -} -void OMITFP P410ToAR30Row_SSSE3(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $14,%%xmm5 \n" - "psllw $4,%%xmm5 \n" // 2 alpha bits - "pxor %%xmm6,%%xmm6 \n" // 0 for min - "pcmpeqb %%xmm7,%%xmm7 \n" - "psrlw $6,%%xmm7 \n" // 1023 for max - LABELALIGN - "1: \n" - READP410 - YUVTORGB16(yuvconstants) - STOREAR30 - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [uv_buf]"+r"(uv_buf), // %[uv_buf] - [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); -} -void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_rgba, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - - LABELALIGN - "1: \n" - READYUV422 - YUVTORGB(yuvconstants) - STORERGBA - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_rgba]"+r"(dst_rgba), // %[dst_rgba] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); -} #endif // HAS_I422TOARGBROW_SSSE3 @@ -4556,48 +2960,6 @@ void OMITFP P410ToAR30Row_AVX2(const uint16_t* y_buf, } #endif // HAS_P410TOAR30ROW_AVX2 -#ifdef HAS_I400TOARGBROW_SSE2 -void I400ToARGBRow_SSE2(const uint8_t* y_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - "movdqa 96(%3),%%xmm2 \n" // yg = 18997 = 1.164 - "movdqa 128(%3),%%xmm3 \n" // ygb = 1160 = 1.164 * 16 - "pcmpeqb %%xmm4,%%xmm4 \n" // 0xff000000 - "pslld $0x18,%%xmm4 \n" - - LABELALIGN - "1: \n" - // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164 - "movq (%0),%%xmm0 \n" - "lea 0x8(%0),%0 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "paddsw %%xmm3,%%xmm0 \n" - "psraw $6, %%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - - // Step 2: Weave into ARGB - "punpcklbw %%xmm0,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm0,%%xmm0 \n" - "punpckhwd %%xmm1,%%xmm1 \n" - "por %%xmm4,%%xmm0 \n" - "por %%xmm4,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(y_buf), // %0 - "+r"(dst_argb), // %1 - "+rm"(width) // %2 - : "r"(yuvconstants) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); -} -#endif // HAS_I400TOARGBROW_SSE2 #ifdef HAS_I400TOARGBROW_AVX2 // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes). @@ -4643,29 +3005,12 @@ void I400ToARGBRow_AVX2(const uint8_t* y_buf, } #endif // HAS_I400TOARGBROW_AVX2 -#ifdef HAS_MIRRORROW_SSSE3 +#if 1 // Shuffle table for reversing the bytes. static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; -void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) { - ptrdiff_t temp_width = (ptrdiff_t)(width); - asm volatile("movdqa %3,%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu -0x10(%0,%2,1),%%xmm0 \n" - "pshufb %%xmm5,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(temp_width) // %2 - : "m"(kShuffleMirror) // %3 - : "memory", "cc", "xmm0", "xmm5"); -} #endif // HAS_MIRRORROW_SSSE3 #ifdef HAS_MIRRORROW_AVX2 @@ -4691,29 +3036,12 @@ void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { } #endif // HAS_MIRRORROW_AVX2 -#ifdef HAS_MIRRORUVROW_SSSE3 +#if 1 // Shuffle table for reversing the UV. static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u, 6u, 7u, 4u, 5u, 2u, 3u, 0u, 1u}; -void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width) { - ptrdiff_t temp_width = (ptrdiff_t)(width); - asm volatile("movdqa %3,%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu -0x10(%0,%2,2),%%xmm0 \n" - "pshufb %%xmm5,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_uv), // %1 - "+r"(temp_width) // %2 - : "m"(kShuffleMirrorUV) // %3 - : "memory", "cc", "xmm0", "xmm5"); -} #endif // HAS_MIRRORUVROW_SSSE3 #ifdef HAS_MIRRORUVROW_AVX2 @@ -4739,40 +3067,14 @@ void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width) { } #endif // HAS_MIRRORUVROW_AVX2 -#ifdef HAS_MIRRORSPLITUVROW_SSSE3 +#if 1 // Shuffle table for reversing the bytes of UV channels. static const uvec8 kShuffleMirrorSplitUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u}; -void MirrorSplitUVRow_SSSE3(const uint8_t* src, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - ptrdiff_t temp_width = (ptrdiff_t)(width); - asm volatile( - "movdqa %4,%%xmm1 \n" - "lea -0x10(%0,%3,2),%0 \n" - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "lea -0x10(%0),%0 \n" - "pshufb %%xmm1,%%xmm0 \n" - "movlpd %%xmm0,(%1) \n" - "movhpd %%xmm0,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $8,%3 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(temp_width) // %3 - : "m"(kShuffleMirrorSplitUV) // %4 - : "memory", "cc", "xmm0", "xmm1"); -} #endif // HAS_MIRRORSPLITUVROW_SSSE3 -#ifdef HAS_RGB24MIRRORROW_SSSE3 +#if 1 // Shuffle first 5 pixels to last 5 mirrored. first byte zero static const uvec8 kShuffleMirrorRGB0 = {128u, 12u, 13u, 14u, 9u, 10u, 11u, 6u, @@ -4783,64 +3085,9 @@ static const uvec8 kShuffleMirrorRGB1 = { 13u, 14u, 15u, 10u, 11u, 12u, 7u, 8u, 9u, 4u, 5u, 6u, 1u, 2u, 3u, 128u}; // Shuffle 5 pixels at a time (15 bytes) -void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24, - uint8_t* dst_rgb24, - int width) { - ptrdiff_t temp_width = (ptrdiff_t)(width); - src_rgb24 += width * 3 - 48; - asm volatile( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" // first 5 - "movdqu 15(%0),%%xmm1 \n" // next 5 - "movdqu 30(%0),%%xmm2 \n" // next 5 - "movdqu 32(%0),%%xmm3 \n" // last 1 special - "pshufb %%xmm4,%%xmm0 \n" - "pshufb %%xmm4,%%xmm1 \n" - "pshufb %%xmm4,%%xmm2 \n" - "pshufb %%xmm5,%%xmm3 \n" - "lea -0x30(%0),%0 \n" - "movdqu %%xmm0,32(%1) \n" // last 5 - "movdqu %%xmm1,17(%1) \n" // next 5 - "movdqu %%xmm2,2(%1) \n" // next 5 - "movlpd %%xmm3,0(%1) \n" // first 1 - "lea 0x30(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_rgb24), // %0 - "+r"(dst_rgb24), // %1 - "+r"(temp_width) // %2 - : "m"(kShuffleMirrorRGB0), // %3 - "m"(kShuffleMirrorRGB1) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} #endif // HAS_RGB24MIRRORROW_SSSE3 -#ifdef HAS_ARGBMIRRORROW_SSE2 - -void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { - ptrdiff_t temp_width = (ptrdiff_t)(width); - asm volatile("lea -0x10(%0,%2,4),%0 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "pshufd $0x1b,%%xmm0,%%xmm0 \n" - "lea -0x10(%0),%0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(temp_width) // %2 - : - : "memory", "cc", "xmm0"); -} -#endif // HAS_ARGBMIRRORROW_SSE2 #ifdef HAS_ARGBMIRRORROW_AVX2 // Shuffle table for reversing the bytes. @@ -4903,86 +3150,8 @@ void SplitUVRow_AVX2(const uint8_t* src_uv, } #endif // HAS_SPLITUVROW_AVX2 -#ifdef HAS_SPLITUVROW_SSE2 -void SplitUVRow_SSE2(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "psrlw $0x8,%%xmm2 \n" - "psrlw $0x8,%%xmm3 \n" - "packuswb %%xmm3,%%xmm2 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm2,0x00(%1,%2,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); -} -#endif // HAS_SPLITUVROW_SSE2 -#ifdef HAS_DETILEROW_SSE2 -void DetileRow_SSE2(const uint8_t* src, - ptrdiff_t src_tile_stride, - uint8_t* dst, - int width) { - asm volatile( - "1: \n" - "movdqu (%0),%%xmm0 \n" - "sub $0x10,%2 \n" - "lea (%0,%3),%0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "r"(src_tile_stride) // %3 - : "cc", "memory", "xmm0"); -} -#endif // HAS_DETILEROW_SSE2 - -#ifdef HAS_DETILEROW_16_SSE2 -void DetileRow_16_SSE2(const uint16_t* src, - ptrdiff_t src_tile_stride, - uint16_t* dst, - int width) { - asm volatile( - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea (%0,%3,2),%0 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "r"(src_tile_stride) // %3 - : "cc", "memory", "xmm0", "xmm1"); -} -#endif // HAS_DETILEROW_SSE2 #ifdef HAS_DETILEROW_16_AVX void DetileRow_16_AVX(const uint16_t* src, @@ -5006,40 +3175,8 @@ void DetileRow_16_AVX(const uint16_t* src, } #endif // HAS_DETILEROW_AVX -#ifdef HAS_DETILETOYUY2_SSE2 -// Read 16 Y, 8 UV, and write 8 YUYV. -void DetileToYUY2_SSE2(const uint8_t* src_y, - ptrdiff_t src_y_tile_stride, - const uint8_t* src_uv, - ptrdiff_t src_uv_tile_stride, - uint8_t* dst_yuy2, - int width) { - asm volatile( - "1: \n" - "movdqu (%0),%%xmm0 \n" // Load 16 Y - "sub $0x10,%3 \n" - "lea (%0,%4),%0 \n" - "movdqu (%1),%%xmm1 \n" // Load 8 UV - "lea (%1,%5),%1 \n" - "movdqu %%xmm0,%%xmm2 \n" - "punpcklbw %%xmm1,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm2 \n" - "movdqu %%xmm0,(%2) \n" - "movdqu %%xmm2,0x10(%2) \n" - "lea 0x20(%2),%2 \n" - "jg 1b \n" - : "+r"(src_y), // %0 - "+r"(src_uv), // %1 - "+r"(dst_yuy2), // %2 - "+r"(width) // %3 - : "r"(src_y_tile_stride), // %4 - "r"(src_uv_tile_stride) // %5 - : "cc", "memory", "xmm0", "xmm1", "xmm2" // Clobber list - ); -} -#endif -#ifdef HAS_DETILESPLITUVROW_SSSE3 +#if 1 // TODO(greenjustin): Look into generating these constants instead of loading // them since this can cause branch mispredicts for fPIC code on 32-bit // machines. @@ -5048,31 +3185,7 @@ static const uvec8 kDeinterlaceUV = {0, 2, 4, 6, 8, 10, 12, 14, // TODO(greenjustin): Research alternatives to pshufb, since pshufb can be very // slow on older SSE2 processors. -void DetileSplitUVRow_SSSE3(const uint8_t* src_uv, - ptrdiff_t src_tile_stride, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "movdqu %4,%%xmm1 \n" - "1: \n" - "movdqu (%0),%%xmm0 \n" - "lea (%0, %5),%0 \n" - "pshufb %%xmm1,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "movhps %%xmm0,(%2) \n" - "lea 0x8(%2),%2 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : "m"(kDeinterlaceUV), // %4 - "r"(src_tile_stride) // %5 - : "cc", "memory", "xmm0", "xmm1"); -} + #endif // HAS_DETILESPLITUVROW_SSSE3 #ifdef HAS_MERGEUVROW_AVX512BW @@ -5131,34 +3244,6 @@ void MergeUVRow_AVX2(const uint8_t* src_u, } #endif // HAS_MERGEUVROW_AVX2 -#ifdef HAS_MERGEUVROW_SSE2 -void MergeUVRow_SSE2(const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uv, - int width) { - asm volatile("sub %0,%1 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%1,1),%%xmm1 \n" - "lea 0x10(%0),%0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "punpcklbw %%xmm1,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm2 \n" - "movdqu %%xmm0,(%2) \n" - "movdqu %%xmm2,0x10(%2) \n" - "lea 0x20(%2),%2 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_u), // %0 - "+r"(src_v), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); -} -#endif // HAS_MERGEUVROW_SSE2 #ifdef HAS_MERGEUVROW_16_AVX2 void MergeUVRow_16_AVX2(const uint16_t* src_u, @@ -5319,34 +3404,7 @@ void DivideRow_16_AVX2(const uint16_t* src_y, // 16384 = 10 bits // 4096 = 12 bits // 256 = 16 bits -void Convert16To8Row_SSSE3(const uint16_t* src_y, - uint8_t* dst_y, - int scale, - int width) { - asm volatile( - "movd %3,%%xmm2 \n" - "punpcklwd %%xmm2,%%xmm2 \n" - "pshufd $0x0,%%xmm2,%%xmm2 \n" - // 32 pixels per loop. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "add $0x20,%0 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "pmulhuw %%xmm2,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "add $0x10,%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_y), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "r"(scale) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2"); -} #ifdef HAS_CONVERT16TO8ROW_AVX2 void Convert16To8Row_AVX2(const uint16_t* src_y, @@ -5415,36 +3473,7 @@ void Convert16To8Row_AVX512BW(const uint16_t* src_y, // 512 = 9 bits // 1024 = 10 bits // 4096 = 12 bits -void Convert8To16Row_SSE2(const uint8_t* src_y, - uint16_t* dst_y, - int scale, - int width) { - asm volatile( - "movd %3,%%xmm2 \n" - "punpcklwd %%xmm2,%%xmm2 \n" - "pshufd $0x0,%%xmm2,%%xmm2 \n" - // 32 pixels per loop. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm1 \n" - "add $0x10,%0 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "pmulhuw %%xmm2,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "add $0x20,%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_y), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "r"(scale) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2"); -} #ifdef HAS_CONVERT8TO16ROW_AVX2 void Convert8To16Row_AVX2(const uint8_t* src_y, @@ -5478,7 +3507,7 @@ void Convert8To16Row_AVX2(const uint8_t* src_y, } #endif // HAS_CONVERT8TO16ROW_AVX2 -#ifdef HAS_SPLITRGBROW_SSSE3 +#if 1 // Shuffle table for converting RGB to Planar. static const uvec8 kSplitRGBShuffle[9] = { {0u, 3u, 6u, 9u, 12u, 15u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, @@ -5500,59 +3529,10 @@ static const uvec8 kSplitRGBShuffle[9] = { {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 0u, 3u, 6u, 9u, 12u, 15u}}; -void SplitRGBRow_SSSE3(const uint8_t* src_rgb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width) { - asm volatile( - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "pshufb 0(%5), %%xmm0 \n" - "pshufb 16(%5), %%xmm1 \n" - "pshufb 32(%5), %%xmm2 \n" - "por %%xmm1,%%xmm0 \n" - "por %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "pshufb 48(%5),%%xmm0 \n" - "pshufb 64(%5),%%xmm1 \n" - "pshufb 80(%5), %%xmm2 \n" - "por %%xmm1,%%xmm0 \n" - "por %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "pshufb 96(%5), %%xmm0 \n" - "pshufb 112(%5), %%xmm1 \n" - "pshufb 128(%5), %%xmm2 \n" - "por %%xmm1,%%xmm0 \n" - "por %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%3) \n" - "lea 0x10(%3),%3 \n" - "lea 0x30(%0),%0 \n" - "sub $0x10,%4 \n" - "jg 1b \n" - : "+r"(src_rgb), // %0 - "+r"(dst_r), // %1 - "+r"(dst_g), // %2 - "+r"(dst_b), // %3 - "+r"(width) // %4 - : "r"(&kSplitRGBShuffle[0]) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2"); -} #endif // HAS_SPLITRGBROW_SSSE3 -#ifdef HAS_SPLITRGBROW_SSE41 +#if 1 // Shuffle table for converting RGB to Planar, SSE4.1. Note: these are used for // the AVX2 implementation as well. static const uvec8 kSplitRGBShuffleSSE41[5] = { @@ -5563,46 +3543,7 @@ static const uvec8 kSplitRGBShuffleSSE41[5] = { {0u, 0u, 128u, 0u, 0u, 128u, 0u, 0u, 128u, 0u, 0u, 128u, 0u, 0u, 128u, 0u}, }; -void SplitRGBRow_SSE41(const uint8_t* src_rgb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width) { - asm volatile( - "movdqa 48(%5), %%xmm0 \n" - "1: \n" - "movdqu (%0),%%xmm1 \n" - "movdqu 0x10(%0),%%xmm2 \n" - "movdqu 0x20(%0),%%xmm3 \n" - "lea 0x30(%0),%0 \n" - "movdqa %%xmm1, %%xmm4 \n" - "pblendvb %%xmm3, %%xmm1 \n" - "pblendvb %%xmm2, %%xmm3 \n" - "pblendvb %%xmm4, %%xmm2 \n" - "palignr $0xF, %%xmm0, %%xmm0 \n" - "pblendvb %%xmm2, %%xmm1 \n" - "pblendvb %%xmm3, %%xmm2 \n" - "pblendvb %%xmm4, %%xmm3 \n" - "palignr $0x1, %%xmm0, %%xmm0 \n" - "pshufb 0(%5), %%xmm1 \n" - "pshufb 16(%5), %%xmm2 \n" - "pshufb 32(%5), %%xmm3 \n" - "movdqu %%xmm1,(%1) \n" - "lea 0x10(%1),%1 \n" - "movdqu %%xmm2,(%2) \n" - "lea 0x10(%2),%2 \n" - "movdqu %%xmm3,(%3) \n" - "lea 0x10(%3),%3 \n" - "sub $0x10,%4 \n" - "jg 1b \n" - : "+r"(src_rgb), // %0 - "+r"(dst_r), // %1 - "+r"(dst_g), // %2 - "+r"(dst_b), // %3 - "+r"(width) // %4 - : "r"(&kSplitRGBShuffleSSE41[0]) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); -} + #endif // HAS_SPLITRGBROW_SSE41 #ifdef HAS_SPLITRGBROW_AVX2 @@ -5669,7 +3610,7 @@ void SplitRGBRow_AVX2(const uint8_t* src_rgb, } #endif // HAS_SPLITRGBROW_AVX2 -#ifdef HAS_MERGERGBROW_SSSE3 +#if 1 // Shuffle table for converting Planar to RGB. static const uvec8 kMergeRGBShuffle[9] = { {0u, 128u, 128u, 1u, 128u, 128u, 2u, 128u, 128u, 3u, 128u, 128u, 4u, 128u, @@ -5691,137 +3632,10 @@ static const uvec8 kMergeRGBShuffle[9] = { {10u, 128u, 128u, 11u, 128u, 128u, 12u, 128u, 128u, 13u, 128u, 128u, 14u, 128u, 128u, 15u}}; -void MergeRGBRow_SSSE3(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - uint8_t* dst_rgb, - int width) { - asm volatile( - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu (%1),%%xmm1 \n" - "movdqu (%2),%%xmm2 \n" - "pshufb (%5), %%xmm0 \n" - "pshufb 16(%5), %%xmm1 \n" - "pshufb 32(%5), %%xmm2 \n" - "por %%xmm1,%%xmm0 \n" - "por %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%3) \n" - "movdqu (%0),%%xmm0 \n" - "movdqu (%1),%%xmm1 \n" - "movdqu (%2),%%xmm2 \n" - "pshufb 48(%5), %%xmm0 \n" - "pshufb 64(%5), %%xmm1 \n" - "pshufb 80(%5), %%xmm2 \n" - "por %%xmm1,%%xmm0 \n" - "por %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,16(%3) \n" - - "movdqu (%0),%%xmm0 \n" - "movdqu (%1),%%xmm1 \n" - "movdqu (%2),%%xmm2 \n" - "pshufb 96(%5), %%xmm0 \n" - "pshufb 112(%5), %%xmm1 \n" - "pshufb 128(%5), %%xmm2 \n" - "por %%xmm1,%%xmm0 \n" - "por %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,32(%3) \n" - - "lea 0x10(%0),%0 \n" - "lea 0x10(%1),%1 \n" - "lea 0x10(%2),%2 \n" - "lea 0x30(%3),%3 \n" - "sub $0x10,%4 \n" - "jg 1b \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(dst_rgb), // %3 - "+r"(width) // %4 - : "r"(&kMergeRGBShuffle[0]) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2"); -} #endif // HAS_MERGERGBROW_SSSE3 -#ifdef HAS_MERGEARGBROW_SSE2 -void MergeARGBRow_SSE2(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - const uint8_t* src_a, - uint8_t* dst_argb, - int width) { - asm volatile( - "sub %0,%1 \n" - "sub %0,%2 \n" - "sub %0,%3 \n" - LABELALIGN - "1: \n" - - "movq (%0,%2),%%xmm0 \n" // B - "movq (%0),%%xmm1 \n" // R - "movq (%0,%1),%%xmm2 \n" // G - "punpcklbw %%xmm1,%%xmm0 \n" // BR - "movq (%0,%3),%%xmm1 \n" // A - "punpcklbw %%xmm1,%%xmm2 \n" // GA - "movdqa %%xmm0,%%xmm1 \n" // BR - "punpckhbw %%xmm2,%%xmm1 \n" // BGRA (hi) - "punpcklbw %%xmm2,%%xmm0 \n" // BGRA (lo) - "movdqu %%xmm0,(%4) \n" - "movdqu %%xmm1,16(%4) \n" - - "lea 8(%0),%0 \n" - "lea 32(%4),%4 \n" - "sub $0x8,%5 \n" - "jg 1b \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(src_a), // %3 - "+r"(dst_argb), // %4 - "+r"(width) // %5 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); -} -#endif - -#ifdef HAS_MERGEXRGBROW_SSE2 -void MergeXRGBRow_SSE2(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - uint8_t* dst_argb, - int width) { - asm volatile( - "1: \n" - - "movq (%2),%%xmm0 \n" // B - "movq (%0),%%xmm1 \n" // R - "movq (%1),%%xmm2 \n" // G - "punpcklbw %%xmm1,%%xmm0 \n" // BR - "pcmpeqd %%xmm1,%%xmm1 \n" // A(255) - "punpcklbw %%xmm1,%%xmm2 \n" // GA - "movdqa %%xmm0,%%xmm1 \n" // BR - "punpckhbw %%xmm2,%%xmm1 \n" // BGRA (hi) - "punpcklbw %%xmm2,%%xmm0 \n" // BGRA (lo) - "movdqu %%xmm0,(%3) \n" - "movdqu %%xmm1,16(%3) \n" - - "lea 8(%0),%0 \n" - "lea 8(%1),%1 \n" - "lea 8(%2),%2 \n" - "lea 32(%3),%3 \n" - "sub $0x8,%4 \n" - "jg 1b \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(dst_argb), // %3 - "+r"(width) // %4 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); -} -#endif // HAS_MERGEARGBROW_SSE2 #ifdef HAS_MERGEARGBROW_AVX2 void MergeARGBRow_AVX2(const uint8_t* src_r, @@ -5910,191 +3724,11 @@ void MergeXRGBRow_AVX2(const uint8_t* src_r, } #endif // HAS_MERGEARGBROW_AVX2 -#ifdef HAS_SPLITARGBROW_SSE2 -void SplitARGBRow_SSE2(const uint8_t* src_argb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - uint8_t* dst_a, - int width) { - asm volatile( - "sub %1,%2 \n" - "sub %1,%3 \n" - "sub %1,%4 \n" - LABELALIGN - "1: \n" - - "movdqu (%0),%%xmm0 \n" // 00-0F - "movdqu 16(%0),%%xmm1 \n" // 10-1F - "movdqa %%xmm0,%%xmm2 \n" - "punpcklqdq %%xmm1,%%xmm0 \n" // 00-07 10-17 - "punpckhqdq %%xmm1,%%xmm2 \n" // 08-0F 18-1F - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm2,%%xmm0 \n" // 08192A3B4C5D6E7F (lo) - "punpckhbw %%xmm2,%%xmm1 \n" // 08192A3B4C5D6E7F (hi) - "movdqa %%xmm0,%%xmm2 \n" - "punpcklqdq %%xmm1,%%xmm0 \n" // 08192A3B08192A3B - "punpckhqdq %%xmm1,%%xmm2 \n" // 4C5D6E7F4C5D6E7F - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm2,%%xmm0 \n" // 048C159D26AE37BF (lo) - "punpckhbw %%xmm2,%%xmm1 \n" // 048C159D26AE37BF (hi) - "movdqa %%xmm0,%%xmm2 \n" - "punpckldq %%xmm1,%%xmm0 \n" // 048C048C159D159D (BG) - "punpckhdq %%xmm1,%%xmm2 \n" // 26AE26AE37BF37BF (RA) - "movlps %%xmm0,(%1,%3) \n" // B - "movhps %%xmm0,(%1,%2) \n" // G - "movlps %%xmm2,(%1) \n" // R - "movhps %%xmm2,(%1,%4) \n" // A - - "lea 32(%0),%0 \n" - "lea 8(%1),%1 \n" - "sub $0x8,%5 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_r), // %1 - "+r"(dst_g), // %2 - "+r"(dst_b), // %3 - "+r"(dst_a), // %4 - "+rm"(width) // %5 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); -} -#endif - -#ifdef HAS_SPLITXRGBROW_SSE2 -void SplitXRGBRow_SSE2(const uint8_t* src_argb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width) { - asm volatile( - "1: \n" - - "movdqu (%0),%%xmm0 \n" // 00-0F - "movdqu 16(%0),%%xmm1 \n" // 10-1F - "movdqa %%xmm0,%%xmm2 \n" - "punpcklqdq %%xmm1,%%xmm0 \n" // 00-07 10-17 - "punpckhqdq %%xmm1,%%xmm2 \n" // 08-0F 18-1F - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm2,%%xmm0 \n" // 08192A3B4C5D6E7F (lo) - "punpckhbw %%xmm2,%%xmm1 \n" // 08192A3B4C5D6E7F (hi) - "movdqa %%xmm0,%%xmm2 \n" - "punpcklqdq %%xmm1,%%xmm0 \n" // 08192A3B08192A3B - "punpckhqdq %%xmm1,%%xmm2 \n" // 4C5D6E7F4C5D6E7F - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm2,%%xmm0 \n" // 048C159D26AE37BF (lo) - "punpckhbw %%xmm2,%%xmm1 \n" // 048C159D26AE37BF (hi) - "movdqa %%xmm0,%%xmm2 \n" - "punpckldq %%xmm1,%%xmm0 \n" // 048C048C159D159D (BG) - "punpckhdq %%xmm1,%%xmm2 \n" // 26AE26AE37BF37BF (RA) - "movlps %%xmm0,(%3) \n" // B - "movhps %%xmm0,(%2) \n" // G - "movlps %%xmm2,(%1) \n" // R - - "lea 32(%0),%0 \n" - "lea 8(%1),%1 \n" - "lea 8(%2),%2 \n" - "lea 8(%3),%3 \n" - "sub $0x8,%4 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_r), // %1 - "+r"(dst_g), // %2 - "+r"(dst_b), // %3 - "+rm"(width) // %4 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); -} -#endif static const uvec8 kShuffleMaskARGBSplit = {0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}; -#ifdef HAS_SPLITARGBROW_SSSE3 -void SplitARGBRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - uint8_t* dst_a, - int width) { - asm volatile( - "movdqa %6,%%xmm3 \n" - "sub %1,%2 \n" - "sub %1,%3 \n" - "sub %1,%4 \n" - LABELALIGN - "1: \n" - - "movdqu (%0),%%xmm0 \n" // 00-0F - "movdqu 16(%0),%%xmm1 \n" // 10-1F - "pshufb %%xmm3,%%xmm0 \n" // 048C159D26AE37BF (lo) - "pshufb %%xmm3,%%xmm1 \n" // 048C159D26AE37BF (hi) - "movdqa %%xmm0,%%xmm2 \n" - "punpckldq %%xmm1,%%xmm0 \n" // 048C048C159D159D (BG) - "punpckhdq %%xmm1,%%xmm2 \n" // 26AE26AE37BF37BF (RA) - "movlps %%xmm0,(%1,%3) \n" // B - "movhps %%xmm0,(%1,%2) \n" // G - "movlps %%xmm2,(%1) \n" // R - "movhps %%xmm2,(%1,%4) \n" // A - - "lea 32(%0),%0 \n" - "lea 8(%1),%1 \n" - "subl $0x8,%5 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_r), // %1 - "+r"(dst_g), // %2 - "+r"(dst_b), // %3 - "+r"(dst_a), // %4 -#if defined(__i386__) - "+m"(width) // %5 -#else - "+rm"(width) // %5 -#endif - : "m"(kShuffleMaskARGBSplit) // %6 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); -} -#endif - -#ifdef HAS_SPLITXRGBROW_SSSE3 -void SplitXRGBRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width) { - asm volatile( - "movdqa %5,%%xmm3 \n" - - LABELALIGN - "1: \n" - - "movdqu (%0),%%xmm0 \n" // 00-0F - "movdqu 16(%0),%%xmm1 \n" // 10-1F - "pshufb %%xmm3,%%xmm0 \n" // 048C159D26AE37BF (lo) - "pshufb %%xmm3,%%xmm1 \n" // 048C159D26AE37BF (hi) - "movdqa %%xmm0,%%xmm2 \n" - "punpckldq %%xmm1,%%xmm0 \n" // 048C048C159D159D (BG) - "punpckhdq %%xmm1,%%xmm2 \n" // 26AE26AE37BF37BF (RA) - "movlps %%xmm0,(%3) \n" // B - "movhps %%xmm0,(%2) \n" // G - "movlps %%xmm2,(%1) \n" // R - - "lea 32(%0),%0 \n" - "lea 8(%1),%1 \n" - "lea 8(%2),%2 \n" - "lea 8(%3),%3 \n" - "sub $0x8,%4 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_r), // %1 - "+r"(dst_g), // %2 - "+r"(dst_b), // %3 - "+r"(width) // %4 - : "m"(kShuffleMaskARGBSplit) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); -} -#endif #ifdef HAS_SPLITARGBROW_AVX2 static const ulvec32 kShuffleMaskARGBPermute = {0, 4, 1, 5, 2, 6, 3, 7}; @@ -6500,45 +4134,6 @@ void MergeXRGB16To8Row_AVX2(const uint16_t* src_r, } #endif -#ifdef HAS_COPYROW_SSE2 -void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "test $0xf,%0 \n" - "jne 2f \n" - "test $0xf,%1 \n" - "jne 2f \n" - - LABELALIGN - "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "movdqa %%xmm0,(%1) \n" - "movdqa %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "jmp 9f \n" - - LABELALIGN - "2: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 2b \n" - - LABELALIGN "9: \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "xmm0", "xmm1"); -} -#endif // HAS_COPYROW_SSE2 #ifdef HAS_COPYROW_AVX void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width) { @@ -6595,40 +4190,6 @@ void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width) { } #endif // HAS_COPYROW_ERMS -#ifdef HAS_ARGBCOPYALPHAROW_SSE2 -// width in pixels -void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "pcmpeqb %%xmm0,%%xmm0 \n" - "pslld $0x18,%%xmm0 \n" - "pcmpeqb %%xmm1,%%xmm1 \n" - "psrld $0x8,%%xmm1 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm2 \n" - "movdqu 0x10(%0),%%xmm3 \n" - "lea 0x20(%0),%0 \n" - "movdqu (%1),%%xmm4 \n" - "movdqu 0x10(%1),%%xmm5 \n" - "pand %%xmm0,%%xmm2 \n" - "pand %%xmm0,%%xmm3 \n" - "pand %%xmm1,%%xmm4 \n" - "pand %%xmm1,%%xmm5 \n" - "por %%xmm4,%%xmm2 \n" - "por %%xmm5,%%xmm3 \n" - "movdqu %%xmm2,(%1) \n" - "movdqu %%xmm3,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} -#endif // HAS_ARGBCOPYALPHAROW_SSE2 #ifdef HAS_ARGBCOPYALPHAROW_AVX2 // width in pixels @@ -6658,31 +4219,6 @@ void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { } #endif // HAS_ARGBCOPYALPHAROW_AVX2 -#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2 -// width in pixels -void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb, - uint8_t* dst_a, - int width) { - asm volatile( - "1: \n" - "movdqu (%0), %%xmm0 \n" - "movdqu 0x10(%0), %%xmm1 \n" - "lea 0x20(%0), %0 \n" - "psrld $0x18, %%xmm0 \n" - "psrld $0x18, %%xmm1 \n" - "packssdw %%xmm1, %%xmm0 \n" - "packuswb %%xmm0, %%xmm0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1), %1 \n" - "sub $0x8, %2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_a), // %1 - "+rm"(width) // %2 - : - : "memory", "cc", "xmm0", "xmm1"); -} -#endif // HAS_ARGBEXTRACTALPHAROW_SSE2 #ifdef HAS_ARGBEXTRACTALPHAROW_AVX2 static const uvec8 kShuffleAlphaShort_AVX2 = { @@ -6725,42 +4261,6 @@ void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb, } #endif // HAS_ARGBEXTRACTALPHAROW_AVX2 -#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 -// width in pixels -void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "pcmpeqb %%xmm0,%%xmm0 \n" - "pslld $0x18,%%xmm0 \n" - "pcmpeqb %%xmm1,%%xmm1 \n" - "psrld $0x8,%%xmm1 \n" - - LABELALIGN - "1: \n" - "movq (%0),%%xmm2 \n" - "lea 0x8(%0),%0 \n" - "punpcklbw %%xmm2,%%xmm2 \n" - "punpckhwd %%xmm2,%%xmm3 \n" - "punpcklwd %%xmm2,%%xmm2 \n" - "movdqu (%1),%%xmm4 \n" - "movdqu 0x10(%1),%%xmm5 \n" - "pand %%xmm0,%%xmm2 \n" - "pand %%xmm0,%%xmm3 \n" - "pand %%xmm1,%%xmm4 \n" - "pand %%xmm1,%%xmm5 \n" - "por %%xmm4,%%xmm2 \n" - "por %%xmm5,%%xmm3 \n" - "movdqu %%xmm2,(%1) \n" - "movdqu %%xmm3,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} -#endif // HAS_ARGBCOPYYTOALPHAROW_SSE2 #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 // width in pixels @@ -6822,228 +4322,6 @@ void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width) { } #endif // HAS_SETROW_X86 -#ifdef HAS_YUY2TOYROW_SSE2 -void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm5"); -} - -void YUY2ToNVUVRow_SSE2(const uint8_t* src_yuy2, - int stride_yuy2, - uint8_t* dst_uv, - int width) { - asm volatile( - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x00(%0,%3,1),%%xmm2 \n" - "movdqu 0x10(%0,%3,1),%%xmm3 \n" - "lea 0x20(%0),%0 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_uv), // %1 - "+r"(width) // %2 - : "r"((ptrdiff_t)(stride_yuy2)) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); -} - -void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2, - int stride_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x00(%0,%4,1),%%xmm2 \n" - "movdqu 0x10(%0,%4,1),%%xmm3 \n" - "lea 0x20(%0),%0 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0,(%1) \n" - "movq %%xmm1,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : "r"((ptrdiff_t)(stride_yuy2)) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); -} - -void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0,(%1) \n" - "movq %%xmm1,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm5"); -} - -void UYVYToYRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { - asm volatile( - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "xmm0", "xmm1"); -} - -void UYVYToUVRow_SSE2(const uint8_t* src_uyvy, - int stride_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x00(%0,%4,1),%%xmm2 \n" - "movdqu 0x10(%0,%4,1),%%xmm3 \n" - "lea 0x20(%0),%0 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0,(%1) \n" - "movq %%xmm1,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : "r"((ptrdiff_t)(stride_uyvy)) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); -} - -void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0,(%1) \n" - "movq %%xmm1,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm5"); -} -#endif // HAS_YUY2TOYROW_SSE2 #ifdef HAS_YUY2TOYROW_AVX2 void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { @@ -7279,144 +4557,15 @@ void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy, } #endif // HAS_YUY2TOYROW_AVX2 -#ifdef HAS_ARGBBLENDROW_SSSE3 +#if 1 // Shuffle table for isolating alpha. static const uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80}; // Blend 8 pixels at a time -void ARGBBlendRow_SSSE3(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - asm volatile( - "pcmpeqb %%xmm7,%%xmm7 \n" - "psrlw $0xf,%%xmm7 \n" - "pcmpeqb %%xmm6,%%xmm6 \n" - "psrlw $0x8,%%xmm6 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "psllw $0x8,%%xmm5 \n" - "pcmpeqb %%xmm4,%%xmm4 \n" - "pslld $0x18,%%xmm4 \n" - "sub $0x4,%3 \n" - "jl 49f \n" - // 4 pixel loop. - LABELALIGN - "40: \n" - "movdqu (%0),%%xmm3 \n" - "lea 0x10(%0),%0 \n" - "movdqa %%xmm3,%%xmm0 \n" - "pxor %%xmm4,%%xmm3 \n" - "movdqu (%1),%%xmm2 \n" - "pshufb %4,%%xmm3 \n" - "pand %%xmm6,%%xmm2 \n" - "paddw %%xmm7,%%xmm3 \n" - "pmullw %%xmm3,%%xmm2 \n" - "movdqu (%1),%%xmm1 \n" - "lea 0x10(%1),%1 \n" - "psrlw $0x8,%%xmm1 \n" - "por %%xmm4,%%xmm0 \n" - "pmullw %%xmm3,%%xmm1 \n" - "psrlw $0x8,%%xmm2 \n" - "paddusb %%xmm2,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%3 \n" - "jge 40b \n" - - "49: \n" - "add $0x3,%3 \n" - "jl 99f \n" - - // 1 pixel loop. - "91: \n" - "movd (%0),%%xmm3 \n" - "lea 0x4(%0),%0 \n" - "movdqa %%xmm3,%%xmm0 \n" - "pxor %%xmm4,%%xmm3 \n" - "movd (%1),%%xmm2 \n" - "pshufb %4,%%xmm3 \n" - "pand %%xmm6,%%xmm2 \n" - "paddw %%xmm7,%%xmm3 \n" - "pmullw %%xmm3,%%xmm2 \n" - "movd (%1),%%xmm1 \n" - "lea 0x4(%1),%1 \n" - "psrlw $0x8,%%xmm1 \n" - "por %%xmm4,%%xmm0 \n" - "pmullw %%xmm3,%%xmm1 \n" - "psrlw $0x8,%%xmm2 \n" - "paddusb %%xmm2,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movd %%xmm0,(%2) \n" - "lea 0x4(%2),%2 \n" - "sub $0x1,%3 \n" - "jge 91b \n" - "99: \n" - : "+r"(src_argb), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : "m"(kShuffleAlpha) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} #endif // HAS_ARGBBLENDROW_SSSE3 -#ifdef HAS_BLENDPLANEROW_SSSE3 -// Blend 8 pixels at a time. -// unsigned version of math -// =((A2*C2)+(B2*(255-C2))+255)/256 -// signed version of math -// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256 -void BlendPlaneRow_SSSE3(const uint8_t* src0, - const uint8_t* src1, - const uint8_t* alpha, - uint8_t* dst, - int width) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psllw $0x8,%%xmm5 \n" - "mov $0x80808080,%%eax \n" - "movd %%eax,%%xmm6 \n" - "pshufd $0x0,%%xmm6,%%xmm6 \n" - "mov $0x807f807f,%%eax \n" - "movd %%eax,%%xmm7 \n" - "pshufd $0x0,%%xmm7,%%xmm7 \n" - "sub %2,%0 \n" - "sub %2,%1 \n" - "sub %2,%3 \n" - - // 8 pixel loop. - LABELALIGN - "1: \n" - "movq (%2),%%xmm0 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "pxor %%xmm5,%%xmm0 \n" - "movq (%0,%2,1),%%xmm1 \n" - "movq (%1,%2,1),%%xmm2 \n" - "punpcklbw %%xmm2,%%xmm1 \n" - "psubb %%xmm6,%%xmm1 \n" - "pmaddubsw %%xmm1,%%xmm0 \n" - "paddw %%xmm7,%%xmm0 \n" - "psrlw $0x8,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,(%3,%2,1) \n" - "lea 0x8(%2),%2 \n" - "sub $0x8,%4 \n" - "jg 1b \n" - : "+r"(src0), // %0 - "+r"(src1), // %1 - "+r"(alpha), // %2 - "+r"(dst), // %3 - "+rm"(width) // %4 - ::"memory", - "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7"); -} -#endif // HAS_BLENDPLANEROW_SSSE3 #ifdef HAS_BLENDPLANEROW_AVX2 // Blend 32 pixels at a time. @@ -7479,57 +4628,14 @@ void BlendPlaneRow_AVX2(const uint8_t* src0, } #endif // HAS_BLENDPLANEROW_AVX2 -#ifdef HAS_ARGBATTENUATEROW_SSSE3 +#if 1 // Shuffle table duplicating alpha. static const vec8 kAttenuateShuffle = {6, -128, 6, -128, 6, -128, -128, -128, 14, -128, 14, -128, 14, -128, -128, -128}; // Attenuate 4 pixels at a time. -void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_argb, - int width) { - asm volatile( - "movdqa %3,%%xmm4 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pslld $0x18,%%xmm5 \n" - "pxor %%xmm6,%%xmm6 \n" - "pcmpeqb %%xmm7,%%xmm7 \n" - "punpcklbw %%xmm6,%%xmm7 \n" - "sub %0,%1 \n" - // 4 pixel loop. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm6 \n" - "movdqa %%xmm6,%%xmm0 \n" - "movdqa %%xmm6,%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "punpckhbw %%xmm5,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - "pshufb %%xmm4,%%xmm2 \n" // a,a,a,0 - "pshufb %%xmm4,%%xmm3 \n" - "pmullw %%xmm2,%%xmm0 \n" // rgb * alpha - "pmullw %%xmm3,%%xmm1 \n" - "paddw %%xmm7,%%xmm0 \n" // + 255 - "paddw %%xmm7,%%xmm1 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "pand %%xmm5,%%xmm6 \n" - "por %%xmm6,%%xmm0 \n" - "movdqu %%xmm0,(%0,%1) \n" - "lea 0x10(%0),%0 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kAttenuateShuffle) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} #endif // HAS_ARGBATTENUATEROW_SSSE3 #ifdef HAS_ARGBATTENUATEROW_AVX2 @@ -7584,50 +4690,6 @@ void ARGBAttenuateRow_AVX2(const uint8_t* src_argb, } #endif // HAS_ARGBATTENUATEROW_AVX2 -#ifdef HAS_ARGBUNATTENUATEROW_SSE2 -// Unattenuate 4 pixels at a time. -void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb, - uint8_t* dst_argb, - int width) { - uintptr_t alpha; - asm volatile( - // 4 pixel loop. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movzb 0x03(%0),%3 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "movd 0x00(%4,%3,4),%%xmm2 \n" - "movzb 0x07(%0),%3 \n" - "movd 0x00(%4,%3,4),%%xmm3 \n" - "pshuflw $0x40,%%xmm2,%%xmm2 \n" - "pshuflw $0x40,%%xmm3,%%xmm3 \n" - "movlhps %%xmm3,%%xmm2 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "movdqu (%0),%%xmm1 \n" - "movzb 0x0b(%0),%3 \n" - "punpckhbw %%xmm1,%%xmm1 \n" - "movd 0x00(%4,%3,4),%%xmm2 \n" - "movzb 0x0f(%0),%3 \n" - "movd 0x00(%4,%3,4),%%xmm3 \n" - "pshuflw $0x40,%%xmm2,%%xmm2 \n" - "pshuflw $0x40,%%xmm3,%%xmm3 \n" - "movlhps %%xmm3,%%xmm2 \n" - "pmulhuw %%xmm2,%%xmm1 \n" - "lea 0x10(%0),%0 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width), // %2 - "=&r"(alpha) // %3 - : "r"(fixed_invtbl8) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} -#endif // HAS_ARGBUNATTENUATEROW_SSE2 #ifdef HAS_ARGBUNATTENUATEROW_AVX2 // Shuffle table duplicating alpha. @@ -7697,56 +4759,8 @@ void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb, } #endif // HAS_ARGBUNATTENUATEROW_AVX2 -#ifdef HAS_ARGBGRAYROW_SSSE3 -// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels -void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - asm volatile( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" - // 8 pixel loop. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "psubb %%xmm5,%%xmm0 \n" - "psubb %%xmm5,%%xmm1 \n" - "movdqu %%xmm4,%%xmm6 \n" - "pmaddubsw %%xmm0,%%xmm6 \n" - "movdqu %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm1,%%xmm0 \n" - "phaddw %%xmm0,%%xmm6 \n" - "paddw %%xmm5,%%xmm6 \n" - "psrlw $0x8,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movdqu (%0),%%xmm2 \n" - "movdqu 0x10(%0),%%xmm3 \n" - "lea 0x20(%0),%0 \n" - "psrld $0x18,%%xmm2 \n" - "psrld $0x18,%%xmm3 \n" - "packuswb %%xmm3,%%xmm2 \n" - "packuswb %%xmm2,%%xmm2 \n" - "movdqa %%xmm6,%%xmm3 \n" - "punpcklbw %%xmm6,%%xmm6 \n" - "punpcklbw %%xmm2,%%xmm3 \n" - "movdqa %%xmm6,%%xmm1 \n" - "punpcklwd %%xmm3,%%xmm6 \n" - "punpckhwd %%xmm3,%%xmm1 \n" - "movdqu %%xmm6,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kARGBToYJ), // %3 - "m"(kSub128) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} -#endif // HAS_ARGBGRAYROW_SSSE3 - -#ifdef HAS_ARGBSEPIAROW_SSSE3 +#if 1 // b = (r * 35 + g * 68 + b * 17) >> 7 // g = (r * 45 + g * 88 + b * 22) >> 7 // r = (r * 50 + g * 98 + b * 24) >> 7 @@ -7761,250 +4775,12 @@ static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0}; // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. -void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) { - asm volatile( - "movdqa %2,%%xmm2 \n" - "movdqa %3,%%xmm3 \n" - "movdqa %4,%%xmm4 \n" - // 8 pixel loop. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm6 \n" - "pmaddubsw %%xmm2,%%xmm0 \n" - "pmaddubsw %%xmm2,%%xmm6 \n" - "phaddw %%xmm6,%%xmm0 \n" - "psrlw $0x7,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movdqu (%0),%%xmm5 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm5 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "phaddw %%xmm1,%%xmm5 \n" - "psrlw $0x7,%%xmm5 \n" - "packuswb %%xmm5,%%xmm5 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "movdqu (%0),%%xmm5 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm5 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "phaddw %%xmm1,%%xmm5 \n" - "psrlw $0x7,%%xmm5 \n" - "packuswb %%xmm5,%%xmm5 \n" - "movdqu (%0),%%xmm6 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "psrld $0x18,%%xmm6 \n" - "psrld $0x18,%%xmm1 \n" - "packuswb %%xmm1,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "punpcklbw %%xmm6,%%xmm5 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm5,%%xmm0 \n" - "punpckhwd %%xmm5,%%xmm1 \n" - "movdqu %%xmm0,(%0) \n" - "movdqu %%xmm1,0x10(%0) \n" - "lea 0x20(%0),%0 \n" - "sub $0x8,%1 \n" - "jg 1b \n" - : "+r"(dst_argb), // %0 - "+r"(width) // %1 - : "m"(kARGBToSepiaB), // %2 - "m"(kARGBToSepiaG), // %3 - "m"(kARGBToSepiaR) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} #endif // HAS_ARGBSEPIAROW_SSSE3 -#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3 -// Tranform 8 ARGB pixels (32 bytes) with color matrix. -// Same as Sepia except matrix is provided. -void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_argb, - const int8_t* matrix_argb, - int width) { - asm volatile( - "movdqu (%3),%%xmm5 \n" - "pshufd $0x00,%%xmm5,%%xmm2 \n" - "pshufd $0x55,%%xmm5,%%xmm3 \n" - "pshufd $0xaa,%%xmm5,%%xmm4 \n" - "pshufd $0xff,%%xmm5,%%xmm5 \n" - // 8 pixel loop. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm7 \n" - "pmaddubsw %%xmm2,%%xmm0 \n" - "pmaddubsw %%xmm2,%%xmm7 \n" - "movdqu (%0),%%xmm6 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "phaddsw %%xmm7,%%xmm0 \n" - "phaddsw %%xmm1,%%xmm6 \n" - "psraw $0x6,%%xmm0 \n" - "psraw $0x6,%%xmm6 \n" - "packuswb %%xmm0,%%xmm0 \n" - "packuswb %%xmm6,%%xmm6 \n" - "punpcklbw %%xmm6,%%xmm0 \n" - "movdqu (%0),%%xmm1 \n" - "movdqu 0x10(%0),%%xmm7 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm7 \n" - "phaddsw %%xmm7,%%xmm1 \n" - "movdqu (%0),%%xmm6 \n" - "movdqu 0x10(%0),%%xmm7 \n" - "pmaddubsw %%xmm5,%%xmm6 \n" - "pmaddubsw %%xmm5,%%xmm7 \n" - "phaddsw %%xmm7,%%xmm6 \n" - "psraw $0x6,%%xmm1 \n" - "psraw $0x6,%%xmm6 \n" - "packuswb %%xmm1,%%xmm1 \n" - "packuswb %%xmm6,%%xmm6 \n" - "punpcklbw %%xmm6,%%xmm1 \n" - "movdqa %%xmm0,%%xmm6 \n" - "punpcklwd %%xmm1,%%xmm0 \n" - "punpckhwd %%xmm1,%%xmm6 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm6,0x10(%1) \n" - "lea 0x20(%0),%0 \n" - "lea 0x20(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(matrix_argb) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif // HAS_ARGBCOLORMATRIXROW_SSSE3 -#ifdef HAS_ARGBQUANTIZEROW_SSE2 -// Quantize 4 ARGB pixels (16 bytes). -void ARGBQuantizeRow_SSE2(uint8_t* dst_argb, - int scale, - int interval_size, - int interval_offset, - int width) { - asm volatile( - "movd %2,%%xmm2 \n" - "movd %3,%%xmm3 \n" - "movd %4,%%xmm4 \n" - "pshuflw $0x40,%%xmm2,%%xmm2 \n" - "pshufd $0x44,%%xmm2,%%xmm2 \n" - "pshuflw $0x40,%%xmm3,%%xmm3 \n" - "pshufd $0x44,%%xmm3,%%xmm3 \n" - "pshuflw $0x40,%%xmm4,%%xmm4 \n" - "pshufd $0x44,%%xmm4,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" - "pcmpeqb %%xmm6,%%xmm6 \n" - "pslld $0x18,%%xmm6 \n" - // 4 pixel loop. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "movdqu (%0),%%xmm1 \n" - "punpckhbw %%xmm5,%%xmm1 \n" - "pmulhuw %%xmm2,%%xmm1 \n" - "pmullw %%xmm3,%%xmm0 \n" - "movdqu (%0),%%xmm7 \n" - "pmullw %%xmm3,%%xmm1 \n" - "pand %%xmm6,%%xmm7 \n" - "paddw %%xmm4,%%xmm0 \n" - "paddw %%xmm4,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "por %%xmm7,%%xmm0 \n" - "movdqu %%xmm0,(%0) \n" - "lea 0x10(%0),%0 \n" - "sub $0x4,%1 \n" - "jg 1b \n" - : "+r"(dst_argb), // %0 - "+r"(width) // %1 - : "r"(scale), // %2 - "r"(interval_size), // %3 - "r"(interval_offset) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif // HAS_ARGBQUANTIZEROW_SSE2 - -#ifdef HAS_ARGBSHADEROW_SSE2 -// Shade 4 pixels at a time by specified value. -void ARGBShadeRow_SSE2(const uint8_t* src_argb, - uint8_t* dst_argb, - int width, - uint32_t value) { - asm volatile( - "movd %3,%%xmm2 \n" - "punpcklbw %%xmm2,%%xmm2 \n" - "punpcklqdq %%xmm2,%%xmm2 \n" - - // 4 pixel loop. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm1 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "pmulhuw %%xmm2,%%xmm1 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(value) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2"); -} -#endif // HAS_ARGBSHADEROW_SSE2 - -#ifdef HAS_ARGBMULTIPLYROW_SSE2 -// Multiply 2 rows of ARGB pixels together, 4 pixels at a time. -void ARGBMultiplyRow_SSE2(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - asm volatile("pxor %%xmm5,%%xmm5 \n" - - // 4 pixel loop. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movdqu (%1),%%xmm2 \n" - "lea 0x10(%1),%1 \n" - "movdqu %%xmm0,%%xmm1 \n" - "movdqu %%xmm2,%%xmm3 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "punpckhbw %%xmm5,%%xmm3 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "pmulhuw %%xmm3,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%3 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); -} -#endif // HAS_ARGBMULTIPLYROW_SSE2 #ifdef HAS_ARGBMULTIPLYROW_AVX2 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. @@ -8042,33 +4818,6 @@ void ARGBMultiplyRow_AVX2(const uint8_t* src_argb, } #endif // HAS_ARGBMULTIPLYROW_AVX2 -#ifdef HAS_ARGBADDROW_SSE2 -// Add 2 rows of ARGB pixels together, 4 pixels at a time. -void ARGBAddRow_SSE2(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - asm volatile( - // 4 pixel loop. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movdqu (%1),%%xmm1 \n" - "lea 0x10(%1),%1 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%3 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1"); -} -#endif // HAS_ARGBADDROW_SSE2 #ifdef HAS_ARGBADDROW_AVX2 // Add 2 rows of ARGB pixels together, 4 pixels at a time. @@ -8098,33 +4847,6 @@ void ARGBAddRow_AVX2(const uint8_t* src_argb, } #endif // HAS_ARGBADDROW_AVX2 -#ifdef HAS_ARGBSUBTRACTROW_SSE2 -// Subtract 2 rows of ARGB pixels, 4 pixels at a time. -void ARGBSubtractRow_SSE2(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - asm volatile( - // 4 pixel loop. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movdqu (%1),%%xmm1 \n" - "lea 0x10(%1),%1 \n" - "psubusb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%3 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1"); -} -#endif // HAS_ARGBSUBTRACTROW_SSE2 #ifdef HAS_ARGBSUBTRACTROW_AVX2 // Subtract 2 rows of ARGB pixels, 8 pixels at a time. @@ -8154,626 +4876,19 @@ void ARGBSubtractRow_AVX2(const uint8_t* src_argb, } #endif // HAS_ARGBSUBTRACTROW_AVX2 -#ifdef HAS_SOBELXROW_SSE2 -// SobelX as a matrix is -// -1 0 1 -// -2 0 2 -// -1 0 1 -void SobelXRow_SSE2(const uint8_t* src_y0, - const uint8_t* src_y1, - const uint8_t* src_y2, - uint8_t* dst_sobelx, - int width) { - asm volatile( - "sub %0,%1 \n" - "sub %0,%2 \n" - "sub %0,%3 \n" - "pxor %%xmm5,%%xmm5 \n" - // 8 pixel loop. - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" - "movq 0x2(%0),%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm1 \n" - "psubw %%xmm1,%%xmm0 \n" - "movq 0x00(%0,%1,1),%%xmm1 \n" - "movq 0x02(%0,%1,1),%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "psubw %%xmm2,%%xmm1 \n" - "movq 0x00(%0,%2,1),%%xmm2 \n" - "movq 0x02(%0,%2,1),%%xmm3 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm3 \n" - "psubw %%xmm3,%%xmm2 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm1,%%xmm0 \n" - "paddw %%xmm1,%%xmm0 \n" - "pxor %%xmm1,%%xmm1 \n" - "psubw %%xmm0,%%xmm1 \n" - "pmaxsw %%xmm1,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,0x00(%0,%3,1) \n" - "lea 0x8(%0),%0 \n" - "sub $0x8,%4 \n" - "jg 1b \n" - : "+r"(src_y0), // %0 - "+r"(src_y1), // %1 - "+r"(src_y2), // %2 - "+r"(dst_sobelx), // %3 - "+r"(width) // %4 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); -} -#endif // HAS_SOBELXROW_SSE2 -#ifdef HAS_SOBELYROW_SSE2 -// SobelY as a matrix is -// -1 -2 -1 -// 0 0 0 -// 1 2 1 -void SobelYRow_SSE2(const uint8_t* src_y0, - const uint8_t* src_y1, - uint8_t* dst_sobely, - int width) { - asm volatile( - "sub %0,%1 \n" - "sub %0,%2 \n" - "pxor %%xmm5,%%xmm5 \n" - // 8 pixel loop. - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" - "movq 0x00(%0,%1,1),%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm1 \n" - "psubw %%xmm1,%%xmm0 \n" - "movq 0x1(%0),%%xmm1 \n" - "movq 0x01(%0,%1,1),%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "psubw %%xmm2,%%xmm1 \n" - "movq 0x2(%0),%%xmm2 \n" - "movq 0x02(%0,%1,1),%%xmm3 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm3 \n" - "psubw %%xmm3,%%xmm2 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm1,%%xmm0 \n" - "paddw %%xmm1,%%xmm0 \n" - "pxor %%xmm1,%%xmm1 \n" - "psubw %%xmm0,%%xmm1 \n" - "pmaxsw %%xmm1,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,0x00(%0,%2,1) \n" - "lea 0x8(%0),%0 \n" - "sub $0x8,%3 \n" - "jg 1b \n" - : "+r"(src_y0), // %0 - "+r"(src_y1), // %1 - "+r"(dst_sobely), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); -} -#endif // HAS_SOBELYROW_SSE2 -#ifdef HAS_SOBELROW_SSE2 -// Adds Sobel X and Sobel Y and stores Sobel into ARGB. -// A = 255 -// R = Sobel -// G = Sobel -// B = Sobel -void SobelRow_SSE2(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_argb, - int width) { - asm volatile( - "sub %0,%1 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pslld $0x18,%%xmm5 \n" - // 8 pixel loop. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%1,1),%%xmm1 \n" - "lea 0x10(%0),%0 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "punpcklbw %%xmm0,%%xmm2 \n" - "punpckhbw %%xmm0,%%xmm0 \n" - "movdqa %%xmm2,%%xmm1 \n" - "punpcklwd %%xmm2,%%xmm1 \n" - "punpckhwd %%xmm2,%%xmm2 \n" - "por %%xmm5,%%xmm1 \n" - "por %%xmm5,%%xmm2 \n" - "movdqa %%xmm0,%%xmm3 \n" - "punpcklwd %%xmm0,%%xmm3 \n" - "punpckhwd %%xmm0,%%xmm0 \n" - "por %%xmm5,%%xmm3 \n" - "por %%xmm5,%%xmm0 \n" - "movdqu %%xmm1,(%2) \n" - "movdqu %%xmm2,0x10(%2) \n" - "movdqu %%xmm3,0x20(%2) \n" - "movdqu %%xmm0,0x30(%2) \n" - "lea 0x40(%2),%2 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_sobelx), // %0 - "+r"(src_sobely), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); -} -#endif // HAS_SOBELROW_SSE2 -#ifdef HAS_SOBELTOPLANEROW_SSE2 -// Adds Sobel X and Sobel Y and stores Sobel into a plane. -void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_y, - int width) { - asm volatile( - "sub %0,%1 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pslld $0x18,%%xmm5 \n" - // 8 pixel loop. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%1,1),%%xmm1 \n" - "lea 0x10(%0),%0 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_sobelx), // %0 - "+r"(src_sobely), // %1 - "+r"(dst_y), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1"); -} -#endif // HAS_SOBELTOPLANEROW_SSE2 - -#ifdef HAS_SOBELXYROW_SSE2 -// Mixes Sobel X, Sobel Y and Sobel into ARGB. -// A = 255 -// R = Sobel X -// G = Sobel -// B = Sobel Y -void SobelXYRow_SSE2(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_argb, - int width) { - asm volatile( - "sub %0,%1 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - - // 8 pixel loop. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%1,1),%%xmm1 \n" - "lea 0x10(%0),%0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "paddusb %%xmm1,%%xmm2 \n" - "movdqa %%xmm0,%%xmm3 \n" - "punpcklbw %%xmm5,%%xmm3 \n" - "punpckhbw %%xmm5,%%xmm0 \n" - "movdqa %%xmm1,%%xmm4 \n" - "punpcklbw %%xmm2,%%xmm4 \n" - "punpckhbw %%xmm2,%%xmm1 \n" - "movdqa %%xmm4,%%xmm6 \n" - "punpcklwd %%xmm3,%%xmm6 \n" - "punpckhwd %%xmm3,%%xmm4 \n" - "movdqa %%xmm1,%%xmm7 \n" - "punpcklwd %%xmm0,%%xmm7 \n" - "punpckhwd %%xmm0,%%xmm1 \n" - "movdqu %%xmm6,(%2) \n" - "movdqu %%xmm4,0x10(%2) \n" - "movdqu %%xmm7,0x20(%2) \n" - "movdqu %%xmm1,0x30(%2) \n" - "lea 0x40(%2),%2 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_sobelx), // %0 - "+r"(src_sobely), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif // HAS_SOBELXYROW_SSE2 - -#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2 -// Creates a table of cumulative sums where each value is a sum of all values -// above and to the left of the value, inclusive of the value. -void ComputeCumulativeSumRow_SSE2(const uint8_t* row, - int32_t* cumsum, - const int32_t* previous_cumsum, - int width) { - asm volatile( - "pxor %%xmm0,%%xmm0 \n" - "pxor %%xmm1,%%xmm1 \n" - "sub $0x4,%3 \n" - "jl 49f \n" - "test $0xf,%1 \n" - "jne 49f \n" - - // 4 pixel loop. - LABELALIGN - "40: \n" - "movdqu (%0),%%xmm2 \n" - "lea 0x10(%0),%0 \n" - "movdqa %%xmm2,%%xmm4 \n" - "punpcklbw %%xmm1,%%xmm2 \n" - "movdqa %%xmm2,%%xmm3 \n" - "punpcklwd %%xmm1,%%xmm2 \n" - "punpckhwd %%xmm1,%%xmm3 \n" - "punpckhbw %%xmm1,%%xmm4 \n" - "movdqa %%xmm4,%%xmm5 \n" - "punpcklwd %%xmm1,%%xmm4 \n" - "punpckhwd %%xmm1,%%xmm5 \n" - "paddd %%xmm2,%%xmm0 \n" - "movdqu (%2),%%xmm2 \n" - "paddd %%xmm0,%%xmm2 \n" - "paddd %%xmm3,%%xmm0 \n" - "movdqu 0x10(%2),%%xmm3 \n" - "paddd %%xmm0,%%xmm3 \n" - "paddd %%xmm4,%%xmm0 \n" - "movdqu 0x20(%2),%%xmm4 \n" - "paddd %%xmm0,%%xmm4 \n" - "paddd %%xmm5,%%xmm0 \n" - "movdqu 0x30(%2),%%xmm5 \n" - "lea 0x40(%2),%2 \n" - "paddd %%xmm0,%%xmm5 \n" - "movdqu %%xmm2,(%1) \n" - "movdqu %%xmm3,0x10(%1) \n" - "movdqu %%xmm4,0x20(%1) \n" - "movdqu %%xmm5,0x30(%1) \n" - "lea 0x40(%1),%1 \n" - "sub $0x4,%3 \n" - "jge 40b \n" - - "49: \n" - "add $0x3,%3 \n" - "jl 19f \n" - - // 1 pixel loop. - LABELALIGN - "10: \n" - "movd (%0),%%xmm2 \n" - "lea 0x4(%0),%0 \n" - "punpcklbw %%xmm1,%%xmm2 \n" - "punpcklwd %%xmm1,%%xmm2 \n" - "paddd %%xmm2,%%xmm0 \n" - "movdqu (%2),%%xmm2 \n" - "lea 0x10(%2),%2 \n" - "paddd %%xmm0,%%xmm2 \n" - "movdqu %%xmm2,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x1,%3 \n" - "jge 10b \n" - - "19: \n" - : "+r"(row), // %0 - "+r"(cumsum), // %1 - "+r"(previous_cumsum), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} -#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2 - -#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 -void CumulativeSumToAverageRow_SSE2(const int32_t* topleft, - const int32_t* botleft, - int width, - int area, - uint8_t* dst, - int count) { - asm volatile( - "movd %5,%%xmm5 \n" - "cvtdq2ps %%xmm5,%%xmm5 \n" - "rcpss %%xmm5,%%xmm4 \n" - "pshufd $0x0,%%xmm4,%%xmm4 \n" - "sub $0x4,%3 \n" - "jl 49f \n" - "cmpl $0x80,%5 \n" - "ja 40f \n" - - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "pcmpeqb %%xmm6,%%xmm6 \n" - "psrld $0x10,%%xmm6 \n" - "cvtdq2ps %%xmm6,%%xmm6 \n" - "addps %%xmm6,%%xmm5 \n" - "mulps %%xmm4,%%xmm5 \n" - "cvtps2dq %%xmm5,%%xmm5 \n" - "packssdw %%xmm5,%%xmm5 \n" - - // 4 pixel small loop. - LABELALIGN - "4: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm3 \n" - "psubd 0x00(%0,%4,4),%%xmm0 \n" - "psubd 0x10(%0,%4,4),%%xmm1 \n" - "psubd 0x20(%0,%4,4),%%xmm2 \n" - "psubd 0x30(%0,%4,4),%%xmm3 \n" - "lea 0x40(%0),%0 \n" - "psubd (%1),%%xmm0 \n" - "psubd 0x10(%1),%%xmm1 \n" - "psubd 0x20(%1),%%xmm2 \n" - "psubd 0x30(%1),%%xmm3 \n" - "paddd 0x00(%1,%4,4),%%xmm0 \n" - "paddd 0x10(%1,%4,4),%%xmm1 \n" - "paddd 0x20(%1,%4,4),%%xmm2 \n" - "paddd 0x30(%1,%4,4),%%xmm3 \n" - "lea 0x40(%1),%1 \n" - "packssdw %%xmm1,%%xmm0 \n" - "packssdw %%xmm3,%%xmm2 \n" - "pmulhuw %%xmm5,%%xmm0 \n" - "pmulhuw %%xmm5,%%xmm2 \n" - "packuswb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%3 \n" - "jge 4b \n" - "jmp 49f \n" - - // 4 pixel loop - LABELALIGN - "40: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm3 \n" - "psubd 0x00(%0,%4,4),%%xmm0 \n" - "psubd 0x10(%0,%4,4),%%xmm1 \n" - "psubd 0x20(%0,%4,4),%%xmm2 \n" - "psubd 0x30(%0,%4,4),%%xmm3 \n" - "lea 0x40(%0),%0 \n" - "psubd (%1),%%xmm0 \n" - "psubd 0x10(%1),%%xmm1 \n" - "psubd 0x20(%1),%%xmm2 \n" - "psubd 0x30(%1),%%xmm3 \n" - "paddd 0x00(%1,%4,4),%%xmm0 \n" - "paddd 0x10(%1,%4,4),%%xmm1 \n" - "paddd 0x20(%1,%4,4),%%xmm2 \n" - "paddd 0x30(%1,%4,4),%%xmm3 \n" - "lea 0x40(%1),%1 \n" - "cvtdq2ps %%xmm0,%%xmm0 \n" - "cvtdq2ps %%xmm1,%%xmm1 \n" - "mulps %%xmm4,%%xmm0 \n" - "mulps %%xmm4,%%xmm1 \n" - "cvtdq2ps %%xmm2,%%xmm2 \n" - "cvtdq2ps %%xmm3,%%xmm3 \n" - "mulps %%xmm4,%%xmm2 \n" - "mulps %%xmm4,%%xmm3 \n" - "cvtps2dq %%xmm0,%%xmm0 \n" - "cvtps2dq %%xmm1,%%xmm1 \n" - "cvtps2dq %%xmm2,%%xmm2 \n" - "cvtps2dq %%xmm3,%%xmm3 \n" - "packssdw %%xmm1,%%xmm0 \n" - "packssdw %%xmm3,%%xmm2 \n" - "packuswb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%3 \n" - "jge 40b \n" - - "49: \n" - "add $0x3,%3 \n" - "jl 19f \n" - - // 1 pixel loop - LABELALIGN - "10: \n" - "movdqu (%0),%%xmm0 \n" - "psubd 0x00(%0,%4,4),%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "psubd (%1),%%xmm0 \n" - "paddd 0x00(%1,%4,4),%%xmm0 \n" - "lea 0x10(%1),%1 \n" - "cvtdq2ps %%xmm0,%%xmm0 \n" - "mulps %%xmm4,%%xmm0 \n" - "cvtps2dq %%xmm0,%%xmm0 \n" - "packssdw %%xmm0,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movd %%xmm0,(%2) \n" - "lea 0x4(%2),%2 \n" - "sub $0x1,%3 \n" - "jge 10b \n" - "19: \n" - : "+r"(topleft), // %0 - "+r"(botleft), // %1 - "+r"(dst), // %2 - "+rm"(count) // %3 - : "r"((ptrdiff_t)(width)), // %4 - "rm"(area) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} -#endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 - -#ifdef HAS_ARGBAFFINEROW_SSE2 +#if 1 // Copy ARGB pixels from source image with slope to a row of destination. LIBYUV_API -void ARGBAffineRow_SSE2(const uint8_t* src_argb, - int src_argb_stride, - uint8_t* dst_argb, - const float* src_dudv, - int width) { - ptrdiff_t src_argb_stride_temp = src_argb_stride; - intptr_t temp; - asm volatile( - "movq (%3),%%xmm2 \n" - "movq 0x08(%3),%%xmm7 \n" - "shl $0x10,%1 \n" - "add $0x4,%1 \n" - "movd %1,%%xmm5 \n" - "sub $0x4,%4 \n" - "jl 49f \n" - "pshufd $0x44,%%xmm7,%%xmm7 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "movdqa %%xmm2,%%xmm0 \n" - "addps %%xmm7,%%xmm0 \n" - "movlhps %%xmm0,%%xmm2 \n" - "movdqa %%xmm7,%%xmm4 \n" - "addps %%xmm4,%%xmm4 \n" - "movdqa %%xmm2,%%xmm3 \n" - "addps %%xmm4,%%xmm3 \n" - "addps %%xmm4,%%xmm4 \n" - - // 4 pixel loop - LABELALIGN - "40: \n" - "cvttps2dq %%xmm2,%%xmm0 \n" // x,y float->int first 2 - "cvttps2dq %%xmm3,%%xmm1 \n" // x,y float->int next 2 - "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts - "pmaddwd %%xmm5,%%xmm0 \n" // off = x*4 + y*stride - "movd %%xmm0,%k1 \n" - "pshufd $0x39,%%xmm0,%%xmm0 \n" - "movd %%xmm0,%k5 \n" - "pshufd $0x39,%%xmm0,%%xmm0 \n" - "movd 0x00(%0,%1,1),%%xmm1 \n" - "movd 0x00(%0,%5,1),%%xmm6 \n" - "punpckldq %%xmm6,%%xmm1 \n" - "addps %%xmm4,%%xmm2 \n" - "movq %%xmm1,(%2) \n" - "movd %%xmm0,%k1 \n" - "pshufd $0x39,%%xmm0,%%xmm0 \n" - "movd %%xmm0,%k5 \n" - "movd 0x00(%0,%1,1),%%xmm0 \n" - "movd 0x00(%0,%5,1),%%xmm6 \n" - "punpckldq %%xmm6,%%xmm0 \n" - "addps %%xmm4,%%xmm3 \n" - "movq %%xmm0,0x08(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%4 \n" - "jge 40b \n" - - "49: \n" - "add $0x3,%4 \n" - "jl 19f \n" - - // 1 pixel loop - LABELALIGN - "10: \n" - "cvttps2dq %%xmm2,%%xmm0 \n" - "packssdw %%xmm0,%%xmm0 \n" - "pmaddwd %%xmm5,%%xmm0 \n" - "addps %%xmm7,%%xmm2 \n" - "movd %%xmm0,%k1 \n" - "movd 0x00(%0,%1,1),%%xmm0 \n" - "movd %%xmm0,(%2) \n" - "lea 0x04(%2),%2 \n" - "sub $0x1,%4 \n" - "jge 10b \n" - "19: \n" - : "+r"(src_argb), // %0 - "+r"(src_argb_stride_temp), // %1 - "+r"(dst_argb), // %2 - "+r"(src_dudv), // %3 - "+rm"(width), // %4 - "=&r"(temp) // %5 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} #endif // HAS_ARGBAFFINEROW_SSE2 -#ifdef HAS_INTERPOLATEROW_SSSE3 -// Bilinear filter 16x2 -> 16x1 -void InterpolateRow_SSSE3(uint8_t* dst_ptr, - const uint8_t* src_ptr, - ptrdiff_t src_stride, - int width, - int source_y_fraction) { - asm volatile( - "sub %1,%0 \n" - "cmp $0x0,%3 \n" - "je 100f \n" - "cmp $0x80,%3 \n" - "je 50f \n" - - "movd %3,%%xmm0 \n" - "neg %3 \n" - "add $0x100,%3 \n" - "movd %3,%%xmm5 \n" - "punpcklbw %%xmm0,%%xmm5 \n" - "punpcklwd %%xmm5,%%xmm5 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "mov $0x80808080,%%eax \n" - "movd %%eax,%%xmm4 \n" - "pshufd $0x0,%%xmm4,%%xmm4 \n" - - // General purpose row blend. - LABELALIGN - "1: \n" - "movdqu (%1),%%xmm0 \n" - "movdqu 0x00(%1,%4,1),%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm2,%%xmm0 \n" - "punpckhbw %%xmm2,%%xmm1 \n" - "psubb %%xmm4,%%xmm0 \n" - "psubb %%xmm4,%%xmm1 \n" - "movdqa %%xmm5,%%xmm2 \n" - "movdqa %%xmm5,%%xmm3 \n" - "pmaddubsw %%xmm0,%%xmm2 \n" - "pmaddubsw %%xmm1,%%xmm3 \n" - "paddw %%xmm4,%%xmm2 \n" - "paddw %%xmm4,%%xmm3 \n" - "psrlw $0x8,%%xmm2 \n" - "psrlw $0x8,%%xmm3 \n" - "packuswb %%xmm3,%%xmm2 \n" - "movdqu %%xmm2,0x00(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "jmp 99f \n" - - // Blend 50 / 50. - LABELALIGN - "50: \n" - "movdqu (%1),%%xmm0 \n" - "movdqu 0x00(%1,%4,1),%%xmm1 \n" - "pavgb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,0x00(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 50b \n" - "jmp 99f \n" - - // Blend 100 / 0 - Copy row unchanged. - LABELALIGN - "100: \n" - "movdqu (%1),%%xmm0 \n" - "movdqu %%xmm0,0x00(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 100b \n" - - "99: \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+rm"(width), // %2 - "+r"(source_y_fraction) // %3 - : "r"(src_stride) // %4 - : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} -#endif // HAS_INTERPOLATEROW_SSSE3 #ifdef HAS_INTERPOLATEROW_AVX2 // Bilinear filter 32x2 -> 32x1 @@ -8853,33 +4968,6 @@ void InterpolateRow_AVX2(uint8_t* dst_ptr, } #endif // HAS_INTERPOLATEROW_AVX2 -#ifdef HAS_ARGBSHUFFLEROW_SSSE3 -// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. -void ARGBShuffleRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_argb, - const uint8_t* shuffler, - int width) { - asm volatile("movdqu (%3),%%xmm5 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pshufb %%xmm5,%%xmm0 \n" - "pshufb %%xmm5,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(shuffler) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm5"); -} -#endif // HAS_ARGBSHUFFLEROW_SSSE3 #ifdef HAS_ARGBSHUFFLEROW_AVX2 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. @@ -8910,73 +4998,7 @@ void ARGBShuffleRow_AVX2(const uint8_t* src_argb, } #endif // HAS_ARGBSHUFFLEROW_AVX2 -#ifdef HAS_I422TOYUY2ROW_SSE2 -void I422ToYUY2Row_SSE2(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_yuy2, - int width) { - asm volatile("sub %1,%2 \n" - LABELALIGN - "1: \n" - "movq (%1),%%xmm2 \n" - "movq 0x00(%1,%2,1),%%xmm1 \n" - "add $0x8,%1 \n" - "punpcklbw %%xmm1,%%xmm2 \n" - "movdqu (%0),%%xmm0 \n" - "add $0x10,%0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm2,%%xmm0 \n" - "punpckhbw %%xmm2,%%xmm1 \n" - "movdqu %%xmm0,(%3) \n" - "movdqu %%xmm1,0x10(%3) \n" - "lea 0x20(%3),%3 \n" - "sub $0x10,%4 \n" - "jg 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_yuy2), // %3 - "+rm"(width) // %4 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); -} -#endif // HAS_I422TOYUY2ROW_SSE2 - -#ifdef HAS_I422TOUYVYROW_SSE2 -void I422ToUYVYRow_SSE2(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uyvy, - int width) { - asm volatile("sub %1,%2 \n" - - LABELALIGN - "1: \n" - "movq (%1),%%xmm2 \n" - "movq 0x00(%1,%2,1),%%xmm1 \n" - "add $0x8,%1 \n" - "punpcklbw %%xmm1,%%xmm2 \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm2,%%xmm1 \n" - "add $0x10,%0 \n" - "punpcklbw %%xmm0,%%xmm1 \n" - "punpckhbw %%xmm0,%%xmm2 \n" - "movdqu %%xmm1,(%3) \n" - "movdqu %%xmm2,0x10(%3) \n" - "lea 0x20(%3),%3 \n" - "sub $0x10,%4 \n" - "jg 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_uyvy), // %3 - "+rm"(width) // %4 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); -} -#endif // HAS_I422TOUYVYROW_SSE2 #ifdef HAS_I422TOYUY2ROW_AVX2 void I422ToYUY2Row_AVX2(const uint8_t* src_y, @@ -9052,60 +5074,6 @@ void I422ToUYVYRow_AVX2(const uint8_t* src_y, } #endif // HAS_I422TOUYVYROW_AVX2 -#ifdef HAS_ARGBPOLYNOMIALROW_SSE2 -void ARGBPolynomialRow_SSE2(const uint8_t* src_argb, - uint8_t* dst_argb, - const float* poly, - int width) { - asm volatile("pxor %%xmm3,%%xmm3 \n" - - // 2 pixel loop. - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" - "lea 0x8(%0),%0 \n" - "punpcklbw %%xmm3,%%xmm0 \n" - "movdqa %%xmm0,%%xmm4 \n" - "punpcklwd %%xmm3,%%xmm0 \n" - "punpckhwd %%xmm3,%%xmm4 \n" - "cvtdq2ps %%xmm0,%%xmm0 \n" - "cvtdq2ps %%xmm4,%%xmm4 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm4,%%xmm5 \n" - "mulps 0x10(%3),%%xmm0 \n" - "mulps 0x10(%3),%%xmm4 \n" - "addps (%3),%%xmm0 \n" - "addps (%3),%%xmm4 \n" - "movdqa %%xmm1,%%xmm2 \n" - "movdqa %%xmm5,%%xmm6 \n" - "mulps %%xmm1,%%xmm2 \n" - "mulps %%xmm5,%%xmm6 \n" - "mulps %%xmm2,%%xmm1 \n" - "mulps %%xmm6,%%xmm5 \n" - "mulps 0x20(%3),%%xmm2 \n" - "mulps 0x20(%3),%%xmm6 \n" - "mulps 0x30(%3),%%xmm1 \n" - "mulps 0x30(%3),%%xmm5 \n" - "addps %%xmm2,%%xmm0 \n" - "addps %%xmm6,%%xmm4 \n" - "addps %%xmm1,%%xmm0 \n" - "addps %%xmm5,%%xmm4 \n" - "cvttps2dq %%xmm0,%%xmm0 \n" - "cvttps2dq %%xmm4,%%xmm4 \n" - "packuswb %%xmm4,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x2,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(poly) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", - "xmm6"); -} -#endif // HAS_ARGBPOLYNOMIALROW_SSE2 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2 void ARGBPolynomialRow_AVX2(const uint8_t* src_argb, @@ -9148,43 +5116,9 @@ void ARGBPolynomialRow_AVX2(const uint8_t* src_argb, } #endif // HAS_ARGBPOLYNOMIALROW_AVX2 -#ifdef HAS_HALFFLOATROW_SSE2 +#if 1 static float kScaleBias = 1.9259299444e-34f; -void HalfFloatRow_SSE2(const uint16_t* src, - uint16_t* dst, - float scale, - int width) { - scale *= kScaleBias; - asm volatile( - "movd %3,%%xmm4 \n" - "pshufd $0x0,%%xmm4,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" - "sub %0,%1 \n" - // 16 pixel loop. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm2 \n" // 8 shorts - "add $0x10,%0 \n" - "movdqa %%xmm2,%%xmm3 \n" - "punpcklwd %%xmm5,%%xmm2 \n" // 8 ints in xmm2/1 - "cvtdq2ps %%xmm2,%%xmm2 \n" // 8 floats - "punpckhwd %%xmm5,%%xmm3 \n" - "cvtdq2ps %%xmm3,%%xmm3 \n" - "mulps %%xmm4,%%xmm2 \n" - "mulps %%xmm4,%%xmm3 \n" - "psrld $0xd,%%xmm2 \n" - "psrld $0xd,%%xmm3 \n" - "packssdw %%xmm3,%%xmm2 \n" - "movdqu %%xmm2,-0x10(%0,%1,1) \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "m"(scale) // %3 - : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5"); -} #endif // HAS_HALFFLOATROW_SSE2 #ifdef HAS_HALFFLOATROW_AVX2 @@ -9357,106 +5291,6 @@ void RGBColorTableRow_X86(uint8_t* dst_argb, } #endif // HAS_RGBCOLORTABLEROW_X86 -#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3 -// Tranform RGB pixels with luma table. -void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_argb, - int width, - const uint8_t* luma, - uint32_t lumacoeff) { - uintptr_t pixel_temp; - uintptr_t table_temp; - asm volatile( - "movd %6,%%xmm3 \n" - "pshufd $0x0,%%xmm3,%%xmm3 \n" - "pcmpeqb %%xmm4,%%xmm4 \n" - "psllw $0x8,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" - - // 4 pixel loop. - LABELALIGN - "1: \n" - "movdqu (%2),%%xmm0 \n" - "pmaddubsw %%xmm3,%%xmm0 \n" - "phaddw %%xmm0,%%xmm0 \n" - "pand %%xmm4,%%xmm0 \n" - "punpcklwd %%xmm5,%%xmm0 \n" - "movd %%xmm0,%k1 \n" // 32 bit offset - "add %5,%1 \n" - "pshufd $0x39,%%xmm0,%%xmm0 \n" - - "movzb (%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,(%3) \n" - "movzb 0x1(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0x1(%3) \n" - "movzb 0x2(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0x2(%3) \n" - "movzb 0x3(%2),%0 \n" - "mov %b0,0x3(%3) \n" - - "movd %%xmm0,%k1 \n" // 32 bit offset - "add %5,%1 \n" - "pshufd $0x39,%%xmm0,%%xmm0 \n" - - "movzb 0x4(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0x4(%3) \n" - "movzb 0x5(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0x5(%3) \n" - "movzb 0x6(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0x6(%3) \n" - "movzb 0x7(%2),%0 \n" - "mov %b0,0x7(%3) \n" - - "movd %%xmm0,%k1 \n" // 32 bit offset - "add %5,%1 \n" - "pshufd $0x39,%%xmm0,%%xmm0 \n" - - "movzb 0x8(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0x8(%3) \n" - "movzb 0x9(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0x9(%3) \n" - "movzb 0xa(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0xa(%3) \n" - "movzb 0xb(%2),%0 \n" - "mov %b0,0xb(%3) \n" - - "movd %%xmm0,%k1 \n" // 32 bit offset - "add %5,%1 \n" - - "movzb 0xc(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0xc(%3) \n" - "movzb 0xd(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0xd(%3) \n" - "movzb 0xe(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0xe(%3) \n" - "movzb 0xf(%2),%0 \n" - "mov %b0,0xf(%3) \n" - "lea 0x10(%2),%2 \n" - "lea 0x10(%3),%3 \n" - "sub $0x4,%4 \n" - "jg 1b \n" - : "=&d"(pixel_temp), // %0 - "=&a"(table_temp), // %1 - "+r"(src_argb), // %2 - "+r"(dst_argb), // %3 - "+rm"(width) // %4 - : "r"(luma), // %5 - "rm"(lumacoeff) // %6 - : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5"); -} -#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 static const uvec8 kYUV24Shuffle[3] = { {8, 9, 0, 8, 9, 1, 10, 11, 2, 10, 11, 3, 12, 13, 4, 12}, @@ -9466,40 +5300,7 @@ static const uvec8 kYUV24Shuffle[3] = { // Convert biplanar NV21 to packed YUV24 // NV21 has VU in memory for chroma. // YUV24 is VUY in memory -void NV21ToYUV24Row_SSSE3(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_yuv24, - int width) { - asm volatile( - "sub %0,%1 \n" - "movdqa (%4),%%xmm4 \n" // 3 shuffler constants - "movdqa 16(%4),%%xmm5 \n" - "movdqa 32(%4),%%xmm6 \n" - "1: \n" - "movdqu (%0),%%xmm2 \n" // load 16 Y values - "movdqu (%0,%1),%%xmm3 \n" // load 8 VU values - "lea 16(%0),%0 \n" - "movdqa %%xmm2,%%xmm0 \n" - "movdqa %%xmm2,%%xmm1 \n" - "shufps $0x44,%%xmm3,%%xmm0 \n" // Y 0..7, UV 0..3 - "shufps $0x99,%%xmm3,%%xmm1 \n" // Y 4..11, UV 2..5 - "shufps $0xee,%%xmm3,%%xmm2 \n" // Y 8..15, UV 4..7 - "pshufb %%xmm4, %%xmm0 \n" // weave into YUV24 - "pshufb %%xmm5, %%xmm1 \n" - "pshufb %%xmm6, %%xmm2 \n" - "movdqu %%xmm0,(%2) \n" - "movdqu %%xmm1,16(%2) \n" - "movdqu %%xmm2,32(%2) \n" - "lea 48(%2),%2 \n" - "sub $16,%3 \n" // 16 pixels per loop - "jg 1b \n" - : "+r"(src_y), // %0 - "+r"(src_vu), // %1 - "+r"(dst_yuv24), // %2 - "+r"(width) // %3 - : "r"(&kYUV24Shuffle[0]) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} + // Convert biplanar NV21 to packed YUV24 // NV21 has VU in memory for chroma. @@ -9587,34 +5388,14 @@ void NV21ToYUV24Row_AVX512(const uint8_t* src_y, #endif // HAS_NV21ToYUV24ROW_AVX512 -#ifdef HAS_SWAPUVROW_SSSE3 +#if 1 // Shuffle table for reversing the bytes. static const uvec8 kShuffleUVToVU = {1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u, 9u, 8u, 11u, 10u, 13u, 12u, 15u, 14u}; // Convert UV plane of NV12 to VU of NV21. -void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width) { - asm volatile("movdqu %3,%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pshufb %%xmm5,%%xmm0 \n" - "pshufb %%xmm5,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_vu), // %1 - "+r"(width) // %2 - : "m"(kShuffleUVToVU) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm5"); -} #endif // HAS_SWAPUVROW_SSSE3 #ifdef HAS_SWAPUVROW_AVX2 @@ -9642,50 +5423,7 @@ void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width) { } #endif // HAS_SWAPUVROW_AVX2 -void HalfMergeUVRow_SSSE3(const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_uv, - int width) { - asm volatile( - "pcmpeqb %%xmm4,%%xmm4 \n" // 0x0101 - "pabsb %%xmm4,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" // load 16 U values - "movdqu (%1),%%xmm1 \n" // load 16 V values - "movdqu 0(%0,%4,1),%%xmm2 \n" // 16 from next row - "movdqu 0(%1,%5,1),%%xmm3 \n" - "lea 0x10(%0),%0 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" // half size - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "lea 0x10(%1),%1 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm3,%%xmm1 \n" - "psrlw $0x1,%%xmm0 \n" - "psrlw $0x1,%%xmm1 \n" - "pavgw %%xmm5,%%xmm0 \n" - "pavgw %%xmm5,%%xmm1 \n" - "packuswb %%xmm0,%%xmm0 \n" - "packuswb %%xmm1,%%xmm1 \n" - "punpcklbw %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" // store 8 UV pixels - "lea 0x10(%2),%2 \n" - "sub $0x10,%3 \n" // 16 src pixels per loop - "jg 1b \n" - : "+r"(src_u), // %0 - "+r"(src_v), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 - : "r"((ptrdiff_t)(src_stride_u)), // %4 - "r"((ptrdiff_t)(src_stride_v)) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} void HalfMergeUVRow_AVX2(const uint8_t* src_u, int src_stride_u, @@ -9733,25 +5471,7 @@ void HalfMergeUVRow_AVX2(const uint8_t* src_u, : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } -void ClampFloatToZero_SSE2(const float* src_x, float* dst_y, int width) { - asm volatile( - "pxor %%xmm1,%%xmm1 \n" - LABELALIGN - "1: \n" - "movd (%0),%%xmm0 \n" // load float - "maxss %%xmm1, %%xmm0 \n" // clamp to zero - "add 4, %0 \n" - "movd %%xmm0, (%1) \n" // store float - "add 4, %1 \n" - "sub $0x4,%2 \n" // 1 float per loop - "jg 1b \n" - : "+r"(src_x), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "xmm0", "xmm1"); -} #ifdef HAS_CONVERT16TO8ROW_AVX2 void Convert8To8Row_AVX2(const uint8_t* src_y, diff --git a/source/row_rvv.cc b/source/row_rvv.cc index 93bc431bc..270bae4e3 100644 --- a/source/row_rvv.cc +++ b/source/row_rvv.cc @@ -1419,17 +1419,9 @@ static void RGBToYMatrixRow_RVV(const uint8_t* src_rgb, } #endif -#ifdef HAS_RGB24TOYJROW_RVV -#endif -#ifdef HAS_RAWTOYJROW_RVV -#endif -#ifdef HAS_RGB24TOYROW_RVV -#endif -#ifdef HAS_RAWTOYROW_RVV -#endif // Blend src_argb over src_argb1 and store to dst_argb. // dst_argb may be src_argb or src_argb1. diff --git a/source/row_win.cc b/source/row_win.cc index 77070d031..9d200c847 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -102,21 +102,9 @@ extern "C" { _mm_storeu_si128((__m128i*)(dst_argb + 16), xmm1); \ dst_argb += 32; -#if defined(HAS_I422TOARGBROW_SSSE3) -#endif -#if defined(HAS_I422ALPHATOARGBROW_SSSE3) -#endif - -#if defined(HAS_I444TOARGBROW_SSSE3) - -#endif - -#if defined(HAS_I444ALPHATOARGBROW_SSSE3) - -#endif #if defined(HAS_ARGBTOYROW_AVX2) diff --git a/source/scale.cc b/source/scale.cc index 9c1e9b264..7d1948ddb 100644 --- a/source/scale.cc +++ b/source/scale.cc @@ -81,7 +81,7 @@ static void ScalePlaneDown2(int src_width, : ScaleRowDown2Box_SME; } #endif -#if defined(HAS_SCALEROWDOWN2_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { ScaleRowDown2 = filtering == kFilterNone @@ -185,7 +185,7 @@ static void ScalePlaneDown2_16(int src_width, : ScaleRowDown2Box_16_SME; } #endif -#if defined(HAS_SCALEROWDOWN2_16_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) { ScaleRowDown2 = filtering == kFilterNone @@ -284,7 +284,7 @@ static void ScalePlaneDown4(int src_width, } } #endif -#if defined(HAS_SCALEROWDOWN4_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { ScaleRowDown4 = filtering ? ScaleRowDown4Box_Any_SSSE3 : ScaleRowDown4_Any_SSSE3; @@ -353,7 +353,7 @@ static void ScalePlaneDown4_16(int src_width, filtering ? ScaleRowDown4Box_16_NEON : ScaleRowDown4_16_NEON; } #endif -#if defined(HAS_SCALEROWDOWN4_16_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { ScaleRowDown4 = filtering ? ScaleRowDown4Box_16_SSE2 : ScaleRowDown4_16_SSE2; @@ -442,23 +442,15 @@ static void ScalePlaneDown34(int src_width, } } #endif -#if defined(HAS_SCALEROWDOWN34_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { if (dst_width % 24 == 0) { if (!filtering) { - ScaleRowDown34_0 = ScaleRowDown34_SSSE3; - ScaleRowDown34_1 = ScaleRowDown34_SSSE3; } else { - ScaleRowDown34_0 = ScaleRowDown34_0_Box_SSSE3; - ScaleRowDown34_1 = ScaleRowDown34_1_Box_SSSE3; } } else { if (!filtering) { - ScaleRowDown34_0 = ScaleRowDown34_Any_SSSE3; - ScaleRowDown34_1 = ScaleRowDown34_Any_SSSE3; } else { - ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_SSSE3; - ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_SSSE3; } } } @@ -534,14 +526,10 @@ static void ScalePlaneDown34_16(int src_width, } } #endif -#if defined(HAS_SCALEROWDOWN34_16_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) { if (!filtering) { - ScaleRowDown34_0 = ScaleRowDown34_16_SSSE3; - ScaleRowDown34_1 = ScaleRowDown34_16_SSSE3; } else { - ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_SSSE3; - ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_SSSE3; } } #endif @@ -630,22 +618,14 @@ static void ScalePlaneDown38(int src_width, } } #endif -#if defined(HAS_SCALEROWDOWN38_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { if (!filtering) { - ScaleRowDown38_3 = ScaleRowDown38_Any_SSSE3; - ScaleRowDown38_2 = ScaleRowDown38_Any_SSSE3; } else { - ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_SSSE3; - ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_SSSE3; } if (dst_width % 12 == 0 && !filtering) { - ScaleRowDown38_3 = ScaleRowDown38_SSSE3; - ScaleRowDown38_2 = ScaleRowDown38_SSSE3; } if (dst_width % 6 == 0 && filtering) { - ScaleRowDown38_3 = ScaleRowDown38_3_Box_SSSE3; - ScaleRowDown38_2 = ScaleRowDown38_2_Box_SSSE3; } } #endif @@ -740,14 +720,10 @@ static void ScalePlaneDown38_16(int src_width, } } #endif -#if defined(HAS_SCALEROWDOWN38_16_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) { if (!filtering) { - ScaleRowDown38_3 = ScaleRowDown38_16_SSSE3; - ScaleRowDown38_2 = ScaleRowDown38_16_SSSE3; } else { - ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_SSSE3; - ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_SSSE3; } } #endif @@ -926,11 +902,9 @@ static int ScalePlaneBox(int src_width, : ((dx != 0x10000) ? ScaleAddCols1_C : ScaleAddCols0_C); void (*ScaleAddRow)(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) = ScaleAddRow_C; -#if defined(HAS_SCALEADDROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - ScaleAddRow = ScaleAddRow_Any_SSE2; if (IS_ALIGNED(src_width, 16)) { - ScaleAddRow = ScaleAddRow_SSE2; } } #endif @@ -1015,9 +989,8 @@ static int ScalePlaneBox_16(int src_width, void (*ScaleAddRow)(const uint16_t* src_ptr, uint32_t* dst_ptr, int src_width) = ScaleAddRow_16_C; -#if defined(HAS_SCALEADDROW_16_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(src_width, 16)) { - ScaleAddRow = ScaleAddRow_16_SSE2; } #endif @@ -1076,11 +1049,9 @@ static int ScalePlaneBilinearDown(int src_width, &dx, &dy); src_width = Abs(src_width); -#if defined(HAS_INTERPOLATEROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - InterpolateRow = InterpolateRow_Any_SSSE3; if (IS_ALIGNED(src_width, 16)) { - InterpolateRow = InterpolateRow_SSSE3; } } #endif @@ -1119,9 +1090,8 @@ static int ScalePlaneBilinearDown(int src_width, } #endif -#if defined(HAS_SCALEFILTERCOLS_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { - ScaleFilterCols = ScaleFilterCols_SSSE3; } #endif #if defined(HAS_SCALEFILTERCOLS_NEON) @@ -1196,19 +1166,15 @@ static int ScalePlaneBilinearDown_16(int src_width, &dx, &dy); src_width = Abs(src_width); -#if defined(HAS_INTERPOLATEROW_16_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - InterpolateRow = InterpolateRow_16_Any_SSE2; if (IS_ALIGNED(src_width, 16)) { - InterpolateRow = InterpolateRow_16_SSE2; } } #endif -#if defined(HAS_INTERPOLATEROW_16_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - InterpolateRow = InterpolateRow_16_Any_SSSE3; if (IS_ALIGNED(src_width, 16)) { - InterpolateRow = InterpolateRow_16_SSSE3; } } #endif @@ -1234,9 +1200,8 @@ static int ScalePlaneBilinearDown_16(int src_width, } #endif -#if defined(HAS_SCALEFILTERCOLS_16_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { - ScaleFilterCols = ScaleFilterCols_16_SSSE3; } #endif if (y > max_y) { @@ -1290,11 +1255,9 @@ static int ScalePlaneBilinearUp(int src_width, &dx, &dy); src_width = Abs(src_width); -#if defined(HAS_INTERPOLATEROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - InterpolateRow = InterpolateRow_Any_SSSE3; if (IS_ALIGNED(dst_width, 16)) { - InterpolateRow = InterpolateRow_SSSE3; } } #endif @@ -1328,9 +1291,8 @@ static int ScalePlaneBilinearUp(int src_width, if (filtering && src_width >= 32768) { ScaleFilterCols = ScaleFilterCols64_C; } -#if defined(HAS_SCALEFILTERCOLS_SSSE3) +#if 0 if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { - ScaleFilterCols = ScaleFilterCols_SSSE3; } #endif #if defined(HAS_SCALEFILTERCOLS_NEON) @@ -1351,9 +1313,8 @@ static int ScalePlaneBilinearUp(int src_width, #endif if (!filtering && src_width * 2 == dst_width && x < 0x8000) { ScaleFilterCols = ScaleColsUp2_C; -#if defined(HAS_SCALECOLS_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { - ScaleFilterCols = ScaleColsUp2_SSE2; } #endif } @@ -1439,15 +1400,13 @@ static void ScalePlaneUp2_Linear(int src_width, // This function can only scale up by 2 times horizontally. assert(src_width == ((dst_width + 1) / 2)); -#ifdef HAS_SCALEROWUP2_LINEAR_SSE2 +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - ScaleRowUp = ScaleRowUp2_Linear_Any_SSE2; } #endif -#ifdef HAS_SCALEROWUP2_LINEAR_SSSE3 +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ScaleRowUp = ScaleRowUp2_Linear_Any_SSSE3; } #endif @@ -1504,15 +1463,13 @@ static void ScalePlaneUp2_Bilinear(int src_width, assert(src_width == ((dst_width + 1) / 2)); assert(src_height == ((dst_height + 1) / 2)); -#ifdef HAS_SCALEROWUP2_BILINEAR_SSE2 +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - Scale2RowUp = ScaleRowUp2_Bilinear_Any_SSE2; } #endif -#ifdef HAS_SCALEROWUP2_BILINEAR_SSSE3 +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - Scale2RowUp = ScaleRowUp2_Bilinear_Any_SSSE3; } #endif @@ -1570,9 +1527,8 @@ static void ScalePlaneUp2_12_Linear(int src_width, // This function can only scale up by 2 times horizontally. assert(src_width == ((dst_width + 1) / 2)); -#ifdef HAS_SCALEROWUP2_LINEAR_12_SSSE3 +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ScaleRowUp = ScaleRowUp2_Linear_12_Any_SSSE3; } #endif @@ -1625,9 +1581,8 @@ static void ScalePlaneUp2_12_Bilinear(int src_width, assert(src_width == ((dst_width + 1) / 2)); assert(src_height == ((dst_height + 1) / 2)); -#ifdef HAS_SCALEROWUP2_BILINEAR_12_SSSE3 +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_SSSE3; } #endif @@ -1673,9 +1628,8 @@ static void ScalePlaneUp2_16_Linear(int src_width, // This function can only scale up by 2 times horizontally. assert(src_width == ((dst_width + 1) / 2)); -#ifdef HAS_SCALEROWUP2_LINEAR_16_SSE2 +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - ScaleRowUp = ScaleRowUp2_Linear_16_Any_SSE2; } #endif @@ -1723,9 +1677,8 @@ static void ScalePlaneUp2_16_Bilinear(int src_width, assert(src_width == ((dst_width + 1) / 2)); assert(src_height == ((dst_height + 1) / 2)); -#ifdef HAS_SCALEROWUP2_BILINEAR_16_SSE2 +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - Scale2RowUp = ScaleRowUp2_Bilinear_16_Any_SSE2; } #endif @@ -1779,19 +1732,15 @@ static int ScalePlaneBilinearUp_16(int src_width, &dx, &dy); src_width = Abs(src_width); -#if defined(HAS_INTERPOLATEROW_16_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - InterpolateRow = InterpolateRow_16_Any_SSE2; if (IS_ALIGNED(dst_width, 16)) { - InterpolateRow = InterpolateRow_16_SSE2; } } #endif -#if defined(HAS_INTERPOLATEROW_16_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - InterpolateRow = InterpolateRow_16_Any_SSSE3; if (IS_ALIGNED(dst_width, 16)) { - InterpolateRow = InterpolateRow_16_SSSE3; } } #endif @@ -1820,16 +1769,14 @@ static int ScalePlaneBilinearUp_16(int src_width, if (filtering && src_width >= 32768) { ScaleFilterCols = ScaleFilterCols64_16_C; } -#if defined(HAS_SCALEFILTERCOLS_16_SSSE3) +#if 0 if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { - ScaleFilterCols = ScaleFilterCols_16_SSSE3; } #endif if (!filtering && src_width * 2 == dst_width && x < 0x8000) { ScaleFilterCols = ScaleColsUp2_16_C; -#if defined(HAS_SCALECOLS_16_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { - ScaleFilterCols = ScaleColsUp2_16_SSE2; } #endif } @@ -1917,9 +1864,8 @@ static void ScalePlaneSimple(int src_width, if (src_width * 2 == dst_width && x < 0x8000) { ScaleCols = ScaleColsUp2_C; -#if defined(HAS_SCALECOLS_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { - ScaleCols = ScaleColsUp2_SSE2; } #endif } @@ -1954,9 +1900,8 @@ static void ScalePlaneSimple_16(int src_width, if (src_width * 2 == dst_width && x < 0x8000) { ScaleCols = ScaleColsUp2_16_C; -#if defined(HAS_SCALECOLS_16_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { - ScaleCols = ScaleColsUp2_16_SSE2; } #endif } diff --git a/source/scale_any.cc b/source/scale_any.cc index c380bebbc..f7f457e3a 100644 --- a/source/scale_any.cc +++ b/source/scale_any.cc @@ -49,7 +49,7 @@ extern "C" { dst_ptr + n * BPP, r + 1); \ } -#ifdef HAS_SCALEROWDOWN2_SSSE3 +#if 0 SDANY(ScaleRowDown2_Any_SSSE3, ScaleRowDown2_SSSE3, ScaleRowDown2_C, 2, 1, 15) SDANY(ScaleRowDown2Linear_Any_SSSE3, ScaleRowDown2Linear_SSSE3, @@ -70,7 +70,7 @@ SDODD(ScaleRowDown2Box_Odd_SSSE3, 1, 15) #endif -#ifdef HAS_SCALEUVROWDOWN2BOX_SSSE3 +#if 0 SDANY(ScaleUVRowDown2Box_Any_SSSE3, ScaleUVRowDown2Box_SSSE3, ScaleUVRowDown2Box_C, @@ -167,7 +167,7 @@ SDANY(ScaleRowDown2Box_Any_LSX, 1, 31) #endif -#ifdef HAS_SCALEROWDOWN4_SSSE3 +#if 0 SDANY(ScaleRowDown4_Any_SSSE3, ScaleRowDown4_SSSE3, ScaleRowDown4_C, 4, 1, 7) SDANY(ScaleRowDown4Box_Any_SSSE3, ScaleRowDown4Box_SSSE3, @@ -203,7 +203,7 @@ SDANY(ScaleRowDown4Box_Any_LSX, 1, 15) #endif -#ifdef HAS_SCALEROWDOWN34_SSSE3 +#if 0 SDANY(ScaleRowDown34_Any_SSSE3, ScaleRowDown34_SSSE3, ScaleRowDown34_C, @@ -284,7 +284,7 @@ SDANY(ScaleRowDown34_1_Box_Any_LSX, 1, 47) #endif -#ifdef HAS_SCALEROWDOWN38_SSSE3 +#if 0 SDANY(ScaleRowDown38_Any_SSSE3, ScaleRowDown38_SSSE3, ScaleRowDown38_C, @@ -345,7 +345,7 @@ SDANY(ScaleRowDown38_2_Box_Any_LSX, 11) #endif -#ifdef HAS_SCALEARGBROWDOWN2_SSE2 +#if 0 SDANY(ScaleARGBRowDown2_Any_SSE2, ScaleARGBRowDown2_SSE2, ScaleARGBRowDown2_C, @@ -420,7 +420,7 @@ SDANY(ScaleARGBRowDown2Box_Any_LSX, dst_ptr + n * BPP, r); \ } -#ifdef HAS_SCALEARGBROWDOWNEVEN_SSE2 +#if 0 SDAANY(ScaleARGBRowDownEven_Any_SSE2, ScaleARGBRowDownEven_SSE2, ScaleARGBRowDownEven_C, @@ -484,7 +484,7 @@ SDAANY(ScaleUVRowDownEven_Any_NEON, memcpy(dst_ptr + n * BPP, dst_temp, r * BPP); \ } -#ifdef HAS_SCALEADDROW_SSE2 +#if 0 SAROW(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, 1, 2, 15) #endif #ifdef HAS_SCALEADDROW_AVX2 @@ -510,7 +510,7 @@ SAROW(ScaleAddRow_Any_LSX, ScaleAddRow_LSX, 1, 2, 15) SCALEADDROW_C(src_ptr + n, dst_ptr + n, src_width & MASK); \ } -#ifdef HAS_SCALEADDROW_SSE2 +#if 0 SAANY(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, ScaleAddRow_C, 15) #endif #ifdef HAS_SCALEADDROW_AVX2 @@ -597,7 +597,7 @@ SUH2LANY(ScaleRowUp2_Linear_16_Any_C, 0, uint16_t) -#ifdef HAS_SCALEROWUP2_LINEAR_SSE2 +#if 0 SUH2LANY(ScaleRowUp2_Linear_Any_SSE2, ScaleRowUp2_Linear_SSE2, ScaleRowUp2_Linear_C, @@ -605,7 +605,7 @@ SUH2LANY(ScaleRowUp2_Linear_Any_SSE2, uint8_t) #endif -#ifdef HAS_SCALEROWUP2_LINEAR_SSSE3 +#if 0 SUH2LANY(ScaleRowUp2_Linear_Any_SSSE3, ScaleRowUp2_Linear_SSSE3, ScaleRowUp2_Linear_C, @@ -613,7 +613,7 @@ SUH2LANY(ScaleRowUp2_Linear_Any_SSSE3, uint8_t) #endif -#ifdef HAS_SCALEROWUP2_LINEAR_12_SSSE3 +#if 0 SUH2LANY(ScaleRowUp2_Linear_12_Any_SSSE3, ScaleRowUp2_Linear_12_SSSE3, ScaleRowUp2_Linear_16_C, @@ -621,7 +621,7 @@ SUH2LANY(ScaleRowUp2_Linear_12_Any_SSSE3, uint16_t) #endif -#ifdef HAS_SCALEROWUP2_LINEAR_16_SSE2 +#if 0 SUH2LANY(ScaleRowUp2_Linear_16_Any_SSE2, ScaleRowUp2_Linear_16_SSE2, ScaleRowUp2_Linear_16_C, @@ -725,7 +725,7 @@ SU2BLANY(ScaleRowUp2_Bilinear_16_Any_C, 0, uint16_t) -#ifdef HAS_SCALEROWUP2_BILINEAR_SSE2 +#if 0 SU2BLANY(ScaleRowUp2_Bilinear_Any_SSE2, ScaleRowUp2_Bilinear_SSE2, ScaleRowUp2_Bilinear_C, @@ -733,7 +733,7 @@ SU2BLANY(ScaleRowUp2_Bilinear_Any_SSE2, uint8_t) #endif -#ifdef HAS_SCALEROWUP2_BILINEAR_12_SSSE3 +#if 0 SU2BLANY(ScaleRowUp2_Bilinear_12_Any_SSSE3, ScaleRowUp2_Bilinear_12_SSSE3, ScaleRowUp2_Bilinear_16_C, @@ -741,7 +741,7 @@ SU2BLANY(ScaleRowUp2_Bilinear_12_Any_SSSE3, uint16_t) #endif -#ifdef HAS_SCALEROWUP2_BILINEAR_16_SSE2 +#if 0 SU2BLANY(ScaleRowUp2_Bilinear_16_Any_SSE2, ScaleRowUp2_Bilinear_16_SSE2, ScaleRowUp2_Bilinear_16_C, @@ -749,7 +749,7 @@ SU2BLANY(ScaleRowUp2_Bilinear_16_Any_SSE2, uint16_t) #endif -#ifdef HAS_SCALEROWUP2_BILINEAR_SSSE3 +#if 0 SU2BLANY(ScaleRowUp2_Bilinear_Any_SSSE3, ScaleRowUp2_Bilinear_SSSE3, ScaleRowUp2_Bilinear_C, @@ -837,7 +837,7 @@ SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_C, 0, uint16_t) -#ifdef HAS_SCALEUVROWUP2_LINEAR_SSSE3 +#if 0 SBUH2LANY(ScaleUVRowUp2_Linear_Any_SSSE3, ScaleUVRowUp2_Linear_SSSE3, ScaleUVRowUp2_Linear_C, @@ -853,7 +853,7 @@ SBUH2LANY(ScaleUVRowUp2_Linear_Any_AVX2, uint8_t) #endif -#ifdef HAS_SCALEUVROWUP2_LINEAR_16_SSE41 +#if 0 SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_SSE41, ScaleUVRowUp2_Linear_16_SSE41, ScaleUVRowUp2_Linear_16_C, @@ -935,7 +935,7 @@ SBU2BLANY(ScaleUVRowUp2_Bilinear_16_Any_C, 0, uint16_t) -#ifdef HAS_SCALEUVROWUP2_BILINEAR_SSSE3 +#if 0 SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_SSSE3, ScaleUVRowUp2_Bilinear_SSSE3, ScaleUVRowUp2_Bilinear_C, @@ -951,7 +951,7 @@ SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_AVX2, uint8_t) #endif -#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_SSE41 +#if 0 SBU2BLANY(ScaleUVRowUp2_Bilinear_16_Any_SSE41, ScaleUVRowUp2_Bilinear_16_SSE41, ScaleUVRowUp2_Bilinear_16_C, diff --git a/source/scale_argb.cc b/source/scale_argb.cc index 506409c15..02951eccc 100644 --- a/source/scale_argb.cc +++ b/source/scale_argb.cc @@ -67,7 +67,7 @@ static void ScaleARGBDown2(int src_width, src_argb += (y >> 16) * (intptr_t)src_stride + ((x >> 16) - 1) * 4; } -#if defined(HAS_SCALEARGBROWDOWN2_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { ScaleARGBRowDown2 = filtering == kFilterNone @@ -180,11 +180,9 @@ static int ScaleARGBDown4Box(int src_width, (void)dx; assert(dx == 65536 * 4); // Test scale factor of 4. assert((dy & 0x3ffff) == 0); // Test vertical scale is multiple of 4. -#if defined(HAS_SCALEARGBROWDOWN2_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - ScaleARGBRowDown2 = ScaleARGBRowDown2Box_Any_SSE2; if (IS_ALIGNED(dst_width, 4)) { - ScaleARGBRowDown2 = ScaleARGBRowDown2Box_SSE2; } } #endif @@ -246,7 +244,7 @@ static void ScaleARGBDownEven(int src_width, assert(IS_ALIGNED(src_width, 2)); assert(IS_ALIGNED(src_height, 2)); src_argb += (y >> 16) * (intptr_t)src_stride + (x >> 16) * 4; -#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_SSE2 : ScaleARGBRowDownEven_Any_SSE2; @@ -331,11 +329,9 @@ static int ScaleARGBBilinearDown(int src_width, clip_src_width = (int)(xr - xl) * 4; // Width aligned to 4. src_argb += xl * 4; x -= (int)(xl << 16); -#if defined(HAS_INTERPOLATEROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - InterpolateRow = InterpolateRow_Any_SSSE3; if (IS_ALIGNED(clip_src_width, 16)) { - InterpolateRow = InterpolateRow_SSSE3; } } #endif @@ -373,9 +369,8 @@ static int ScaleARGBBilinearDown(int src_width, InterpolateRow = InterpolateRow_RVV; } #endif -#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { - ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3; } #endif #if defined(HAS_SCALEARGBFILTERCOLS_NEON) @@ -454,11 +449,9 @@ static int ScaleARGBBilinearUp(int src_width, int dst_width, int x, int dx) = filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C; const int max_y = (src_height - 1) << 16; -#if defined(HAS_INTERPOLATEROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - InterpolateRow = InterpolateRow_Any_SSSE3; if (IS_ALIGNED(dst_width, 4)) { - InterpolateRow = InterpolateRow_SSSE3; } } #endif @@ -500,9 +493,8 @@ static int ScaleARGBBilinearUp(int src_width, ScaleARGBFilterCols = filtering ? ScaleARGBFilterCols64_C : ScaleARGBCols64_C; } -#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3) +#if 0 if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { - ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3; } #endif #if defined(HAS_SCALEARGBFILTERCOLS_NEON) @@ -526,9 +518,8 @@ static int ScaleARGBBilinearUp(int src_width, ScaleARGBFilterCols = ScaleARGBFilterCols_RVV; } #endif -#if defined(HAS_SCALEARGBCOLS_SSE2) +#if 0 if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) { - ScaleARGBFilterCols = ScaleARGBCols_SSE2; } #endif #if defined(HAS_SCALEARGBCOLS_NEON) @@ -549,9 +540,8 @@ static int ScaleARGBBilinearUp(int src_width, #endif if (!filtering && src_width * 2 == dst_width && x < 0x8000) { ScaleARGBFilterCols = ScaleARGBColsUp2_C; -#if defined(HAS_SCALEARGBCOLSUP2_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { - ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2; } #endif } @@ -638,11 +628,9 @@ static int ScaleYUVToARGBBilinearUp(int src_width, void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* rgb_buf, int width) = I422ToARGBRow_C; -#if defined(HAS_I422TOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToARGBRow = I422ToARGBRow_Any_SSSE3; if (IS_ALIGNED(src_width, 8)) { - I422ToARGBRow = I422ToARGBRow_SSSE3; } } #endif @@ -706,11 +694,9 @@ static int ScaleYUVToARGBBilinearUp(int src_width, void (*InterpolateRow)(uint8_t* dst_argb, const uint8_t* src_argb, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; -#if defined(HAS_INTERPOLATEROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - InterpolateRow = InterpolateRow_Any_SSSE3; if (IS_ALIGNED(dst_width, 4)) { - InterpolateRow = InterpolateRow_SSSE3; } } #endif @@ -756,9 +742,8 @@ static int ScaleYUVToARGBBilinearUp(int src_width, ScaleARGBFilterCols = filtering ? ScaleARGBFilterCols64_C : ScaleARGBCols64_C; } -#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3) +#if 0 if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { - ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3; } #endif #if defined(HAS_SCALEARGBFILTERCOLS_NEON) @@ -782,9 +767,8 @@ static int ScaleYUVToARGBBilinearUp(int src_width, ScaleARGBFilterCols = ScaleARGBFilterCols_RVV; } #endif -#if defined(HAS_SCALEARGBCOLS_SSE2) +#if 0 if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) { - ScaleARGBFilterCols = ScaleARGBCols_SSE2; } #endif #if defined(HAS_SCALEARGBCOLS_NEON) @@ -805,9 +789,8 @@ static int ScaleYUVToARGBBilinearUp(int src_width, #endif if (!filtering && src_width * 2 == dst_width && x < 0x8000) { ScaleARGBFilterCols = ScaleARGBColsUp2_C; -#if defined(HAS_SCALEARGBCOLSUP2_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { - ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2; } #endif } @@ -914,9 +897,8 @@ static void ScaleARGBSimple(int src_width, int dst_width, int x, int dx) = (src_width >= 32768) ? ScaleARGBCols64_C : ScaleARGBCols_C; (void)src_height; -#if defined(HAS_SCALEARGBCOLS_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2) && src_width < 32768) { - ScaleARGBCols = ScaleARGBCols_SSE2; } #endif #if defined(HAS_SCALEARGBCOLS_NEON) @@ -937,9 +919,8 @@ static void ScaleARGBSimple(int src_width, #endif if (src_width * 2 == dst_width && x < 0x8000) { ScaleARGBCols = ScaleARGBColsUp2_C; -#if defined(HAS_SCALEARGBCOLSUP2_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { - ScaleARGBCols = ScaleARGBColsUp2_SSE2; } #endif } diff --git a/source/scale_common.cc b/source/scale_common.cc index 537f030aa..367292158 100644 --- a/source/scale_common.cc +++ b/source/scale_common.cc @@ -1636,11 +1636,9 @@ void ScalePlaneVertical(int src_height, assert(dst_width > 0); assert(dst_height > 0); src_argb += (x >> 16) * bpp; -#if defined(HAS_INTERPOLATEROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - InterpolateRow = InterpolateRow_Any_SSSE3; if (IS_ALIGNED(dst_width_bytes, 16)) { - InterpolateRow = InterpolateRow_SSSE3; } } #endif @@ -1718,19 +1716,15 @@ void ScalePlaneVertical_16(int src_height, assert(dst_width > 0); assert(dst_height > 0); src_argb += (x >> 16) * wpp; -#if defined(HAS_INTERPOLATEROW_16_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - InterpolateRow = InterpolateRow_16_Any_SSE2; if (IS_ALIGNED(dst_width_words, 16)) { - InterpolateRow = InterpolateRow_16_SSE2; } } #endif -#if defined(HAS_INTERPOLATEROW_16_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - InterpolateRow = InterpolateRow_16_Any_SSSE3; if (IS_ALIGNED(dst_width_words, 16)) { - InterpolateRow = InterpolateRow_16_SSSE3; } } #endif diff --git a/source/scale_gcc.cc b/source/scale_gcc.cc index 6a2524230..58fa88da3 100644 --- a/source/scale_gcc.cc +++ b/source/scale_gcc.cc @@ -94,101 +94,11 @@ static const uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, // Generated using gcc disassembly on Visual C object file: // objdump -D yuvscaler.obj >yuvscaler.txt -void ScaleRowDown2_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - (void)src_stride; - asm volatile( - // 16 pixel loop. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "xmm0", "xmm1"); -} -void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - (void)src_stride; - asm volatile( - "pcmpeqb %%xmm4,%%xmm4 \n" // 0x0101 - "pabsb %%xmm4,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pavgw %%xmm5,%%xmm0 \n" - "pavgw %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"); -} -void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - asm volatile( - "pcmpeqb %%xmm4,%%xmm4 \n" // 0x0101 - "pabsb %%xmm4,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x00(%0,%3,1),%%xmm2 \n" - "movdqu 0x10(%0,%3,1),%%xmm3 \n" - "lea 0x20(%0),%0 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm3,%%xmm1 \n" - "psrlw $0x1,%%xmm0 \n" - "psrlw $0x1,%%xmm1 \n" - "pavgw %%xmm5,%%xmm0 \n" - "pavgw %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(src_stride) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); -} #ifdef HAS_SCALEROWDOWN2_AVX2 void ScaleRowDown2_AVX2(const uint8_t* src_ptr, @@ -291,89 +201,9 @@ void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr, } #endif // HAS_SCALEROWDOWN2_AVX2 -void ScaleRowDown4_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - (void)src_stride; - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrld $0x18,%%xmm5 \n" - "pslld $0x10,%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "psrlw $0x8,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm5"); -} -void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - ptrdiff_t stridex3; - asm volatile( - "pcmpeqb %%xmm4,%%xmm4 \n" - "pabsw %%xmm4,%%xmm5 \n" - "pabsb %%xmm4,%%xmm4 \n" // 0x0101 - "psllw $0x3,%%xmm5 \n" // 0x0008 - "lea 0x00(%4,%4,2),%3 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x00(%0,%4,1),%%xmm2 \n" - "movdqu 0x10(%0,%4,1),%%xmm3 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm3,%%xmm1 \n" - "movdqu 0x00(%0,%4,2),%%xmm2 \n" - "movdqu 0x10(%0,%4,2),%%xmm3 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm3,%%xmm1 \n" - "movdqu 0x00(%0,%3,1),%%xmm2 \n" - "movdqu 0x10(%0,%3,1),%%xmm3 \n" - "lea 0x20(%0),%0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm3,%%xmm1 \n" - "phaddw %%xmm1,%%xmm0 \n" - "paddw %%xmm5,%%xmm0 \n" - "psrlw $0x4,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "=&r"(stridex3) // %3 - : "r"(src_stride) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} #ifdef HAS_SCALEROWDOWN4_AVX2 void ScaleRowDown4_AVX2(const uint8_t* src_ptr, @@ -465,310 +295,17 @@ void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr, } #endif // HAS_SCALEROWDOWN4_AVX2 -void ScaleRowDown34_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - (void)src_stride; - asm volatile( - "movdqa %0,%%xmm3 \n" - "movdqa %1,%%xmm4 \n" - "movdqa %2,%%xmm5 \n" - : - : "m"(kShuf0), // %0 - "m"(kShuf1), // %1 - "m"(kShuf2) // %2 - ); - asm volatile( - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm2 \n" - "lea 0x20(%0),%0 \n" - "movdqa %%xmm2,%%xmm1 \n" - "palignr $0x8,%%xmm0,%%xmm1 \n" - "pshufb %%xmm3,%%xmm0 \n" - "pshufb %%xmm4,%%xmm1 \n" - "pshufb %%xmm5,%%xmm2 \n" - "movq %%xmm0,(%1) \n" - "movq %%xmm1,0x8(%1) \n" - "movq %%xmm2,0x10(%1) \n" - "lea 0x18(%1),%1 \n" - "sub $0x18,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} -void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - asm volatile( - "movdqa %0,%%xmm2 \n" // kShuf01 - "movdqa %1,%%xmm3 \n" // kShuf11 - "movdqa %2,%%xmm4 \n" // kShuf21 - : - : "m"(kShuf01), // %0 - "m"(kShuf11), // %1 - "m"(kShuf21) // %2 - ); - asm volatile( - "movdqa %0,%%xmm5 \n" // kMadd01 - "movdqa %1,%%xmm0 \n" // kMadd11 - "movdqa %2,%%xmm1 \n" // kRound34 - : - : "m"(kMadd01), // %0 - "m"(kMadd11), // %1 - "m"(kRound34) // %2 - ); - asm volatile( - "1: \n" - "movdqu (%0),%%xmm6 \n" - "movdqu 0x00(%0,%3,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm5,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6,(%1) \n" - "movdqu 0x8(%0),%%xmm6 \n" - "movdqu 0x8(%0,%3,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm3,%%xmm6 \n" - "pmaddubsw %%xmm0,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6,0x8(%1) \n" - "movdqu 0x10(%0),%%xmm6 \n" - "movdqu 0x10(%0,%3,1),%%xmm7 \n" - "lea 0x20(%0),%0 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm4,%%xmm6 \n" - "pmaddubsw %4,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6,0x10(%1) \n" - "lea 0x18(%1),%1 \n" - "sub $0x18,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(src_stride), // %3 - "m"(kMadd21) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - asm volatile( - "movdqa %0,%%xmm2 \n" // kShuf01 - "movdqa %1,%%xmm3 \n" // kShuf11 - "movdqa %2,%%xmm4 \n" // kShuf21 - : - : "m"(kShuf01), // %0 - "m"(kShuf11), // %1 - "m"(kShuf21) // %2 - ); - asm volatile( - "movdqa %0,%%xmm5 \n" // kMadd01 - "movdqa %1,%%xmm0 \n" // kMadd11 - "movdqa %2,%%xmm1 \n" // kRound34 - : - : "m"(kMadd01), // %0 - "m"(kMadd11), // %1 - "m"(kRound34) // %2 - ); - asm volatile( - "1: \n" - "movdqu (%0),%%xmm6 \n" - "movdqu 0x00(%0,%3,1),%%xmm7 \n" - "pavgb %%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm5,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6,(%1) \n" - "movdqu 0x8(%0),%%xmm6 \n" - "movdqu 0x8(%0,%3,1),%%xmm7 \n" - "pavgb %%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm3,%%xmm6 \n" - "pmaddubsw %%xmm0,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6,0x8(%1) \n" - "movdqu 0x10(%0),%%xmm6 \n" - "movdqu 0x10(%0,%3,1),%%xmm7 \n" - "lea 0x20(%0),%0 \n" - "pavgb %%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm4,%%xmm6 \n" - "pmaddubsw %4,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6,0x10(%1) \n" - "lea 0x18(%1),%1 \n" - "sub $0x18,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(src_stride), // %3 - "m"(kMadd21) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -void ScaleRowDown38_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - (void)src_stride; - asm volatile( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pshufb %%xmm4,%%xmm0 \n" - "pshufb %%xmm5,%%xmm1 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "movhlps %%xmm0,%%xmm1 \n" - "movd %%xmm1,0x8(%1) \n" - "lea 0xc(%1),%1 \n" - "sub $0xc,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "m"(kShuf38a), // %3 - "m"(kShuf38b) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"); -} -void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - asm volatile( - "movdqa %0,%%xmm2 \n" - "movdqa %1,%%xmm3 \n" - "movdqa %2,%%xmm4 \n" - "movdqa %3,%%xmm5 \n" - : - : "m"(kShufAb0), // %0 - "m"(kShufAb1), // %1 - "m"(kShufAb2), // %2 - "m"(kScaleAb2) // %3 - ); - asm volatile( - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%3,1),%%xmm1 \n" - "lea 0x10(%0),%0 \n" - "pavgb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pshufb %%xmm2,%%xmm1 \n" - "movdqa %%xmm0,%%xmm6 \n" - "pshufb %%xmm3,%%xmm6 \n" - "paddusw %%xmm6,%%xmm1 \n" - "pshufb %%xmm4,%%xmm0 \n" - "paddusw %%xmm0,%%xmm1 \n" - "pmulhuw %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movd %%xmm1,(%1) \n" - "psrlq $0x10,%%xmm1 \n" - "movd %%xmm1,0x2(%1) \n" - "lea 0x6(%1),%1 \n" - "sub $0x6,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(src_stride) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} -void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - asm volatile( - "movdqa %0,%%xmm2 \n" - "movdqa %1,%%xmm3 \n" - "movdqa %2,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" - : - : "m"(kShufAc), // %0 - "m"(kShufAc3), // %1 - "m"(kScaleAc33) // %2 - ); - asm volatile( - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%3,1),%%xmm6 \n" - "movhlps %%xmm0,%%xmm1 \n" - "movhlps %%xmm6,%%xmm7 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm6 \n" - "punpcklbw %%xmm5,%%xmm7 \n" - "paddusw %%xmm6,%%xmm0 \n" - "paddusw %%xmm7,%%xmm1 \n" - "movdqu 0x00(%0,%3,2),%%xmm6 \n" - "lea 0x10(%0),%0 \n" - "movhlps %%xmm6,%%xmm7 \n" - "punpcklbw %%xmm5,%%xmm6 \n" - "punpcklbw %%xmm5,%%xmm7 \n" - "paddusw %%xmm6,%%xmm0 \n" - "paddusw %%xmm7,%%xmm1 \n" - "movdqa %%xmm0,%%xmm6 \n" - "psrldq $0x2,%%xmm0 \n" - "paddusw %%xmm0,%%xmm6 \n" - "psrldq $0x2,%%xmm0 \n" - "paddusw %%xmm0,%%xmm6 \n" - "pshufb %%xmm2,%%xmm6 \n" - "movdqa %%xmm1,%%xmm7 \n" - "psrldq $0x2,%%xmm1 \n" - "paddusw %%xmm1,%%xmm7 \n" - "psrldq $0x2,%%xmm1 \n" - "paddusw %%xmm1,%%xmm7 \n" - "pshufb %%xmm3,%%xmm7 \n" - "paddusw %%xmm7,%%xmm6 \n" - "pmulhuw %%xmm4,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movd %%xmm6,(%1) \n" - "psrlq $0x10,%%xmm6 \n" - "movd %%xmm6,0x2(%1) \n" - "lea 0x6(%1),%1 \n" - "sub $0x6,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(src_stride) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} + + + + static const uvec8 kLinearShuffleFar = {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; @@ -776,611 +313,13 @@ static const uvec8 kLinearShuffleFar = {2, 3, 0, 1, 6, 7, 4, 5, static const uvec8 kLinearMadd31 = {3, 1, 1, 3, 3, 1, 1, 3, 3, 1, 1, 3, 3, 1, 1, 3}; -#ifdef HAS_SCALEROWUP2_LINEAR_SSE2 -void ScaleRowUp2_Linear_SSE2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width) { - asm volatile( - "pxor %%xmm0,%%xmm0 \n" // 0 - "pcmpeqw %%xmm6,%%xmm6 \n" - "psrlw $15,%%xmm6 \n" - "psllw $1,%%xmm6 \n" // all 2 - LABELALIGN - "1: \n" - "movq (%0),%%xmm1 \n" // 01234567 - "movq 1(%0),%%xmm2 \n" // 12345678 - "movdqa %%xmm1,%%xmm3 \n" - "punpcklbw %%xmm2,%%xmm3 \n" // 0112233445566778 - "punpcklbw %%xmm1,%%xmm1 \n" // 0011223344556677 - "punpcklbw %%xmm2,%%xmm2 \n" // 1122334455667788 - "movdqa %%xmm1,%%xmm4 \n" - "punpcklbw %%xmm0,%%xmm4 \n" // 00112233 (16) - "movdqa %%xmm2,%%xmm5 \n" - "punpcklbw %%xmm0,%%xmm5 \n" // 11223344 (16) - "paddw %%xmm5,%%xmm4 \n" - "movdqa %%xmm3,%%xmm5 \n" - "paddw %%xmm6,%%xmm4 \n" - "punpcklbw %%xmm0,%%xmm5 \n" // 01122334 (16) - "paddw %%xmm5,%%xmm5 \n" - "paddw %%xmm4,%%xmm5 \n" // 3*near+far+2 (lo) - "psrlw $2,%%xmm5 \n" // 3/4*near+1/4*far (lo) - "punpckhbw %%xmm0,%%xmm1 \n" // 44556677 (16) - "punpckhbw %%xmm0,%%xmm2 \n" // 55667788 (16) - "paddw %%xmm2,%%xmm1 \n" - "punpckhbw %%xmm0,%%xmm3 \n" // 45566778 (16) - "paddw %%xmm6,%%xmm1 \n" - "paddw %%xmm3,%%xmm3 \n" - "paddw %%xmm3,%%xmm1 \n" // 3*near+far+2 (hi) - "psrlw $2,%%xmm1 \n" // 3/4*near+1/4*far (hi) - "packuswb %%xmm1,%%xmm5 \n" - "movdqu %%xmm5,(%1) \n" - "lea 0x8(%0),%0 \n" - "lea 0x10(%1),%1 \n" // 8 sample to 16 sample - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} -#endif -#ifdef HAS_SCALEROWUP2_BILINEAR_SSE2 -void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - asm volatile( - "1: \n" - "pxor %%xmm0,%%xmm0 \n" // 0 - // above line - "movq (%0),%%xmm1 \n" // 01234567 - "movq 1(%0),%%xmm2 \n" // 12345678 - "movdqa %%xmm1,%%xmm3 \n" - "punpcklbw %%xmm2,%%xmm3 \n" // 0112233445566778 - "punpcklbw %%xmm1,%%xmm1 \n" // 0011223344556677 - "punpcklbw %%xmm2,%%xmm2 \n" // 1122334455667788 - "movdqa %%xmm1,%%xmm4 \n" - "punpcklbw %%xmm0,%%xmm4 \n" // 00112233 (16) - "movdqa %%xmm2,%%xmm5 \n" - "punpcklbw %%xmm0,%%xmm5 \n" // 11223344 (16) - "paddw %%xmm5,%%xmm4 \n" // near+far - "movdqa %%xmm3,%%xmm5 \n" - "punpcklbw %%xmm0,%%xmm5 \n" // 01122334 (16) - "paddw %%xmm5,%%xmm5 \n" // 2*near - "paddw %%xmm5,%%xmm4 \n" // 3*near+far (1, lo) - "punpckhbw %%xmm0,%%xmm1 \n" // 44556677 (16) - "punpckhbw %%xmm0,%%xmm2 \n" // 55667788 (16) - "paddw %%xmm2,%%xmm1 \n" - "punpckhbw %%xmm0,%%xmm3 \n" // 45566778 (16) - "paddw %%xmm3,%%xmm3 \n" // 2*near - "paddw %%xmm3,%%xmm1 \n" // 3*near+far (1, hi) - - // below line - "movq (%0,%3),%%xmm6 \n" // 01234567 - "movq 1(%0,%3),%%xmm2 \n" // 12345678 - "movdqa %%xmm6,%%xmm3 \n" - "punpcklbw %%xmm2,%%xmm3 \n" // 0112233445566778 - "punpcklbw %%xmm6,%%xmm6 \n" // 0011223344556677 - "punpcklbw %%xmm2,%%xmm2 \n" // 1122334455667788 - - "movdqa %%xmm6,%%xmm5 \n" - "punpcklbw %%xmm0,%%xmm5 \n" // 00112233 (16) - "movdqa %%xmm2,%%xmm7 \n" - "punpcklbw %%xmm0,%%xmm7 \n" // 11223344 (16) - "paddw %%xmm7,%%xmm5 \n" // near+far - "movdqa %%xmm3,%%xmm7 \n" - "punpcklbw %%xmm0,%%xmm7 \n" // 01122334 (16) - "paddw %%xmm7,%%xmm7 \n" // 2*near - "paddw %%xmm7,%%xmm5 \n" // 3*near+far (2, lo) - - "punpckhbw %%xmm0,%%xmm6 \n" // 44556677 (16) - "punpckhbw %%xmm0,%%xmm2 \n" // 55667788 (16) - "paddw %%xmm6,%%xmm2 \n" // near+far - "punpckhbw %%xmm0,%%xmm3 \n" // 45566778 (16) - "paddw %%xmm3,%%xmm3 \n" // 2*near - "paddw %%xmm3,%%xmm2 \n" // 3*near+far (2, hi) - - // xmm4 xmm1 - // xmm5 xmm2 - "pcmpeqw %%xmm0,%%xmm0 \n" - "psrlw $15,%%xmm0 \n" - "psllw $3,%%xmm0 \n" // all 8 - - "movdqa %%xmm4,%%xmm3 \n" - "movdqa %%xmm5,%%xmm6 \n" - "paddw %%xmm3,%%xmm3 \n" // 6*near+2*far (1, lo) - "paddw %%xmm0,%%xmm6 \n" // 3*near+far+8 (2, lo) - "paddw %%xmm4,%%xmm3 \n" // 9*near+3*far (1, lo) - "paddw %%xmm6,%%xmm3 \n" // 9 3 3 1 + 8 (1, lo) - "psrlw $4,%%xmm3 \n" // ^ div by 16 - - "movdqa %%xmm1,%%xmm7 \n" - "movdqa %%xmm2,%%xmm6 \n" - "paddw %%xmm7,%%xmm7 \n" // 6*near+2*far (1, hi) - "paddw %%xmm0,%%xmm6 \n" // 3*near+far+8 (2, hi) - "paddw %%xmm1,%%xmm7 \n" // 9*near+3*far (1, hi) - "paddw %%xmm6,%%xmm7 \n" // 9 3 3 1 + 8 (1, hi) - "psrlw $4,%%xmm7 \n" // ^ div by 16 - - "packuswb %%xmm7,%%xmm3 \n" - "movdqu %%xmm3,(%1) \n" // save above line - - "movdqa %%xmm5,%%xmm3 \n" - "paddw %%xmm0,%%xmm4 \n" // 3*near+far+8 (1, lo) - "paddw %%xmm3,%%xmm3 \n" // 6*near+2*far (2, lo) - "paddw %%xmm3,%%xmm5 \n" // 9*near+3*far (2, lo) - "paddw %%xmm4,%%xmm5 \n" // 9 3 3 1 + 8 (lo) - "psrlw $4,%%xmm5 \n" // ^ div by 16 - - "movdqa %%xmm2,%%xmm3 \n" - "paddw %%xmm0,%%xmm1 \n" // 3*near+far+8 (1, hi) - "paddw %%xmm3,%%xmm3 \n" // 6*near+2*far (2, hi) - "paddw %%xmm3,%%xmm2 \n" // 9*near+3*far (2, hi) - "paddw %%xmm1,%%xmm2 \n" // 9 3 3 1 + 8 (hi) - "psrlw $4,%%xmm2 \n" // ^ div by 16 - - "packuswb %%xmm2,%%xmm5 \n" - "movdqu %%xmm5,(%1,%4) \n" // save below line - - "lea 0x8(%0),%0 \n" - "lea 0x10(%1),%1 \n" // 8 sample to 16 sample - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(src_stride), // %3 - "r"(dst_stride) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif - -#ifdef HAS_SCALEROWUP2_LINEAR_12_SSSE3 -void ScaleRowUp2_Linear_12_SSSE3(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width) { - asm volatile( - "movdqa %3,%%xmm5 \n" - "pcmpeqw %%xmm4,%%xmm4 \n" - "psrlw $15,%%xmm4 \n" - "psllw $1,%%xmm4 \n" // all 2 - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" // 01234567 (16) - "movdqu 2(%0),%%xmm1 \n" // 12345678 (16) - - "movdqa %%xmm0,%%xmm2 \n" - "punpckhwd %%xmm1,%%xmm2 \n" // 45566778 (16) - "punpcklwd %%xmm1,%%xmm0 \n" // 01122334 (16) - - "movdqa %%xmm2,%%xmm3 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pshufb %%xmm5,%%xmm3 \n" // 54657687 (far) - "pshufb %%xmm5,%%xmm1 \n" // 10213243 (far) - - "paddw %%xmm4,%%xmm1 \n" // far+2 - "paddw %%xmm4,%%xmm3 \n" // far+2 - "paddw %%xmm0,%%xmm1 \n" // near+far+2 - "paddw %%xmm2,%%xmm3 \n" // near+far+2 - "paddw %%xmm0,%%xmm0 \n" // 2*near - "paddw %%xmm2,%%xmm2 \n" // 2*near - "paddw %%xmm1,%%xmm0 \n" // 3*near+far+2 (lo) - "paddw %%xmm3,%%xmm2 \n" // 3*near+far+2 (hi) - - "psrlw $2,%%xmm0 \n" // 3/4*near+1/4*far - "psrlw $2,%%xmm2 \n" // 3/4*near+1/4*far - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm2,16(%1) \n" - - "lea 0x10(%0),%0 \n" - "lea 0x20(%1),%1 \n" // 8 sample to 16 sample - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "m"(kLinearShuffleFar) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} -#endif - -#ifdef HAS_SCALEROWUP2_BILINEAR_12_SSSE3 -void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - asm volatile( - "pcmpeqw %%xmm7,%%xmm7 \n" - "psrlw $15,%%xmm7 \n" - "psllw $3,%%xmm7 \n" // all 8 - "movdqa %5,%%xmm6 \n" - - LABELALIGN - "1: \n" - // above line - "movdqu (%0),%%xmm0 \n" // 01234567 (16) - "movdqu 2(%0),%%xmm1 \n" // 12345678 (16) - "movdqa %%xmm0,%%xmm2 \n" - "punpckhwd %%xmm1,%%xmm2 \n" // 45566778 (16) - "punpcklwd %%xmm1,%%xmm0 \n" // 01122334 (16) - "movdqa %%xmm2,%%xmm3 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pshufb %%xmm6,%%xmm3 \n" // 54657687 (far) - "pshufb %%xmm6,%%xmm1 \n" // 10213243 (far) - "paddw %%xmm0,%%xmm1 \n" // near+far - "paddw %%xmm2,%%xmm3 \n" // near+far - "paddw %%xmm0,%%xmm0 \n" // 2*near - "paddw %%xmm2,%%xmm2 \n" // 2*near - "paddw %%xmm1,%%xmm0 \n" // 3*near+far (1, lo) - "paddw %%xmm3,%%xmm2 \n" // 3*near+far (1, hi) - - // below line - "movdqu (%0,%3,2),%%xmm1 \n" // 01234567 (16) - "movdqu 2(%0,%3,2),%%xmm4 \n" // 12345678 (16) - "movdqa %%xmm1,%%xmm3 \n" - "punpckhwd %%xmm4,%%xmm3 \n" // 45566778 (16) - "punpcklwd %%xmm4,%%xmm1 \n" // 01122334 (16) - "movdqa %%xmm3,%%xmm5 \n" - "movdqa %%xmm1,%%xmm4 \n" - "pshufb %%xmm6,%%xmm5 \n" // 54657687 (far) - "pshufb %%xmm6,%%xmm4 \n" // 10213243 (far) - "paddw %%xmm1,%%xmm4 \n" // near+far - "paddw %%xmm3,%%xmm5 \n" // near+far - "paddw %%xmm1,%%xmm1 \n" // 2*near - "paddw %%xmm3,%%xmm3 \n" // 2*near - "paddw %%xmm4,%%xmm1 \n" // 3*near+far (2, lo) - "paddw %%xmm5,%%xmm3 \n" // 3*near+far (2, hi) - - // xmm0 xmm2 - // xmm1 xmm3 - - "movdqa %%xmm0,%%xmm4 \n" - "movdqa %%xmm1,%%xmm5 \n" - "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (1, lo) - "paddw %%xmm7,%%xmm5 \n" // 3*near+far+8 (2, lo) - "paddw %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo) - "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo) - "psrlw $4,%%xmm4 \n" // ^ div by 16 - "movdqu %%xmm4,(%1) \n" - - "movdqa %%xmm2,%%xmm4 \n" - "movdqa %%xmm3,%%xmm5 \n" - "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (1, hi) - "paddw %%xmm7,%%xmm5 \n" // 3*near+far+8 (2, hi) - "paddw %%xmm2,%%xmm4 \n" // 9*near+3*far (1, hi) - "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, hi) - "psrlw $4,%%xmm4 \n" // ^ div by 16 - "movdqu %%xmm4,0x10(%1) \n" - - "movdqa %%xmm1,%%xmm4 \n" - "paddw %%xmm7,%%xmm0 \n" // 3*near+far+8 (1, lo) - "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (2, lo) - "paddw %%xmm4,%%xmm1 \n" // 9*near+3*far (2, lo) - "paddw %%xmm0,%%xmm1 \n" // 9 3 3 1 + 8 (2, lo) - "psrlw $4,%%xmm1 \n" // ^ div by 16 - "movdqu %%xmm1,(%1,%4,2) \n" - - "movdqa %%xmm3,%%xmm4 \n" - "paddw %%xmm7,%%xmm2 \n" // 3*near+far+8 (1, hi) - "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (2, hi) - "paddw %%xmm4,%%xmm3 \n" // 9*near+3*far (2, hi) - "paddw %%xmm2,%%xmm3 \n" // 9 3 3 1 + 8 (2, hi) - "psrlw $4,%%xmm3 \n" // ^ div by 16 - "movdqu %%xmm3,0x10(%1,%4,2) \n" - - "lea 0x10(%0),%0 \n" - "lea 0x20(%1),%1 \n" // 8 sample to 16 sample - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(src_stride), // %3 - "r"(dst_stride), // %4 - "m"(kLinearShuffleFar) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif - -#ifdef HAS_SCALEROWUP2_LINEAR_16_SSE2 -void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width) { - asm volatile( - "pxor %%xmm5,%%xmm5 \n" - "pcmpeqd %%xmm4,%%xmm4 \n" - "psrld $31,%%xmm4 \n" - "pslld $1,%%xmm4 \n" // all 2 - - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" // 0123 (16b) - "movq 2(%0),%%xmm1 \n" // 1234 (16b) - - "punpcklwd %%xmm5,%%xmm0 \n" // 0123 (32b) - "punpcklwd %%xmm5,%%xmm1 \n" // 1234 (32b) - - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - - "pshufd $0b10110001,%%xmm2,%%xmm2 \n" // 1032 (even, far) - "pshufd $0b10110001,%%xmm3,%%xmm3 \n" // 2143 (odd, far) - - "paddd %%xmm4,%%xmm2 \n" // far+2 (lo) - "paddd %%xmm4,%%xmm3 \n" // far+2 (hi) - "paddd %%xmm0,%%xmm2 \n" // near+far+2 (lo) - "paddd %%xmm1,%%xmm3 \n" // near+far+2 (hi) - "paddd %%xmm0,%%xmm0 \n" // 2*near (lo) - "paddd %%xmm1,%%xmm1 \n" // 2*near (hi) - "paddd %%xmm2,%%xmm0 \n" // 3*near+far+2 (lo) - "paddd %%xmm3,%%xmm1 \n" // 3*near+far+2 (hi) - - "psrld $2,%%xmm0 \n" // 3/4*near+1/4*far (lo) - "psrld $2,%%xmm1 \n" // 3/4*near+1/4*far (hi) - "packssdw %%xmm1,%%xmm0 \n" - "pshufd $0b11011000,%%xmm0,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - - "lea 0x8(%0),%0 \n" - "lea 0x10(%1),%1 \n" // 4 pixel to 8 pixel - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} -#endif - -#ifdef HAS_SCALEROWUP2_BILINEAR_16_SSE2 -void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - asm volatile( - "pxor %%xmm7,%%xmm7 \n" - "pcmpeqd %%xmm6,%%xmm6 \n" - "psrld $31,%%xmm6 \n" - "pslld $3,%%xmm6 \n" // all 8 - - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" // 0011 (16b, 1u1v) - "movq 4(%0),%%xmm1 \n" // 1122 (16b, 1u1v) - "punpcklwd %%xmm7,%%xmm0 \n" // 0011 (near) (32b, 1u1v) - "punpcklwd %%xmm7,%%xmm1 \n" // 1122 (near) (32b, 1u1v) - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - "pshufd $0b01001110,%%xmm2,%%xmm2 \n" // 1100 (far) (1, lo) - "pshufd $0b01001110,%%xmm3,%%xmm3 \n" // 2211 (far) (1, hi) - "paddd %%xmm0,%%xmm2 \n" // near+far (1, lo) - "paddd %%xmm1,%%xmm3 \n" // near+far (1, hi) - "paddd %%xmm0,%%xmm0 \n" // 2*near (1, lo) - "paddd %%xmm1,%%xmm1 \n" // 2*near (1, hi) - "paddd %%xmm2,%%xmm0 \n" // 3*near+far (1, lo) - "paddd %%xmm3,%%xmm1 \n" // 3*near+far (1, hi) - - "movq (%0),%%xmm0 \n" // 0123 (16b) - "movq 2(%0),%%xmm1 \n" // 1234 (16b) - "punpcklwd %%xmm7,%%xmm0 \n" // 0123 (32b) - "punpcklwd %%xmm7,%%xmm1 \n" // 1234 (32b) - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - "pshufd $0b10110001,%%xmm2,%%xmm2 \n" // 1032 (even, far) - "pshufd $0b10110001,%%xmm3,%%xmm3 \n" // 2143 (odd, far) - "paddd %%xmm0,%%xmm2 \n" // near+far (lo) - "paddd %%xmm1,%%xmm3 \n" // near+far (hi) - "paddd %%xmm0,%%xmm0 \n" // 2*near (lo) - "paddd %%xmm1,%%xmm1 \n" // 2*near (hi) - "paddd %%xmm2,%%xmm0 \n" // 3*near+far (1, lo) - "paddd %%xmm3,%%xmm1 \n" // 3*near+far (1, hi) - - "movq (%0,%3,2),%%xmm2 \n" - "movq 2(%0,%3,2),%%xmm3 \n" - "punpcklwd %%xmm7,%%xmm2 \n" // 0123 (32b) - "punpcklwd %%xmm7,%%xmm3 \n" // 1234 (32b) - "movdqa %%xmm2,%%xmm4 \n" - "movdqa %%xmm3,%%xmm5 \n" - "pshufd $0b10110001,%%xmm4,%%xmm4 \n" // 1032 (even, far) - "pshufd $0b10110001,%%xmm5,%%xmm5 \n" // 2143 (odd, far) - "paddd %%xmm2,%%xmm4 \n" // near+far (lo) - "paddd %%xmm3,%%xmm5 \n" // near+far (hi) - "paddd %%xmm2,%%xmm2 \n" // 2*near (lo) - "paddd %%xmm3,%%xmm3 \n" // 2*near (hi) - "paddd %%xmm4,%%xmm2 \n" // 3*near+far (2, lo) - "paddd %%xmm5,%%xmm3 \n" // 3*near+far (2, hi) - - "movdqa %%xmm0,%%xmm4 \n" - "movdqa %%xmm2,%%xmm5 \n" - "paddd %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo) - "paddd %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo) - "paddd %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo) - "paddd %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo) - "psrld $4,%%xmm4 \n" // ^ div by 16 (1, lo) - - "movdqa %%xmm2,%%xmm5 \n" - "paddd %%xmm2,%%xmm5 \n" // 6*near+2*far (2, lo) - "paddd %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo) - "paddd %%xmm2,%%xmm5 \n" // 9*near+3*far (2, lo) - "paddd %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo) - "psrld $4,%%xmm5 \n" // ^ div by 16 (2, lo) - - "movdqa %%xmm1,%%xmm0 \n" - "movdqa %%xmm3,%%xmm2 \n" - "paddd %%xmm1,%%xmm0 \n" // 6*near+2*far (1, hi) - "paddd %%xmm6,%%xmm2 \n" // 3*near+far+8 (2, hi) - "paddd %%xmm1,%%xmm0 \n" // 9*near+3*far (1, hi) - "paddd %%xmm2,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi) - "psrld $4,%%xmm0 \n" // ^ div by 16 (1, hi) - - "movdqa %%xmm3,%%xmm2 \n" - "paddd %%xmm3,%%xmm2 \n" // 6*near+2*far (2, hi) - "paddd %%xmm6,%%xmm1 \n" // 3*near+far+8 (1, hi) - "paddd %%xmm3,%%xmm2 \n" // 9*near+3*far (2, hi) - "paddd %%xmm1,%%xmm2 \n" // 9 3 3 1 + 8 (2, hi) - "psrld $4,%%xmm2 \n" // ^ div by 16 (2, hi) - - "packssdw %%xmm0,%%xmm4 \n" - "pshufd $0b11011000,%%xmm4,%%xmm4 \n" - "movdqu %%xmm4,(%1) \n" // store above - "packssdw %%xmm2,%%xmm5 \n" - "pshufd $0b11011000,%%xmm5,%%xmm5 \n" - "movdqu %%xmm5,(%1,%4,2) \n" // store below - - "lea 0x8(%0),%0 \n" - "lea 0x10(%1),%1 \n" // 4 pixel to 8 pixel - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(src_stride), // %3 - "r"(dst_stride) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif - -#ifdef HAS_SCALEROWUP2_LINEAR_SSSE3 -void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width) { - asm volatile( - "pcmpeqw %%xmm4,%%xmm4 \n" - "psrlw $15,%%xmm4 \n" - "psllw $1,%%xmm4 \n" // all 2 - "movdqa %3,%%xmm3 \n" - - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" // 01234567 - "movq 1(%0),%%xmm1 \n" // 12345678 - "punpcklwd %%xmm0,%%xmm0 \n" // 0101232345456767 - "punpcklwd %%xmm1,%%xmm1 \n" // 1212343456567878 - "movdqa %%xmm0,%%xmm2 \n" - "punpckhdq %%xmm1,%%xmm2 \n" // 4545565667677878 - "punpckldq %%xmm1,%%xmm0 \n" // 0101121223233434 - "pmaddubsw %%xmm3,%%xmm2 \n" // 3*near+far (hi) - "pmaddubsw %%xmm3,%%xmm0 \n" // 3*near+far (lo) - "paddw %%xmm4,%%xmm0 \n" // 3*near+far+2 (lo) - "paddw %%xmm4,%%xmm2 \n" // 3*near+far+2 (hi) - "psrlw $2,%%xmm0 \n" // 3/4*near+1/4*far (lo) - "psrlw $2,%%xmm2 \n" // 3/4*near+1/4*far (hi) - "packuswb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x8(%0),%0 \n" - "lea 0x10(%1),%1 \n" // 8 sample to 16 sample - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "m"(kLinearMadd31) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); -} -#endif - -#ifdef HAS_SCALEROWUP2_BILINEAR_SSSE3 -void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - asm volatile( - "pcmpeqw %%xmm6,%%xmm6 \n" - "psrlw $15,%%xmm6 \n" - "psllw $3,%%xmm6 \n" // all 8 - "movdqa %5,%%xmm7 \n" - - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" // 01234567 - "movq 1(%0),%%xmm1 \n" // 12345678 - "punpcklwd %%xmm0,%%xmm0 \n" // 0101232345456767 - "punpcklwd %%xmm1,%%xmm1 \n" // 1212343456567878 - "movdqa %%xmm0,%%xmm2 \n" - "punpckhdq %%xmm1,%%xmm2 \n" // 4545565667677878 - "punpckldq %%xmm1,%%xmm0 \n" // 0101121223233434 - "pmaddubsw %%xmm7,%%xmm2 \n" // 3*near+far (1, hi) - "pmaddubsw %%xmm7,%%xmm0 \n" // 3*near+far (1, lo) - - "movq (%0,%3),%%xmm1 \n" - "movq 1(%0,%3),%%xmm4 \n" - "punpcklwd %%xmm1,%%xmm1 \n" - "punpcklwd %%xmm4,%%xmm4 \n" - "movdqa %%xmm1,%%xmm3 \n" - "punpckhdq %%xmm4,%%xmm3 \n" - "punpckldq %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm7,%%xmm3 \n" // 3*near+far (2, hi) - "pmaddubsw %%xmm7,%%xmm1 \n" // 3*near+far (2, lo) - - // xmm0 xmm2 - // xmm1 xmm3 - - "movdqa %%xmm0,%%xmm4 \n" - "movdqa %%xmm1,%%xmm5 \n" - "paddw %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo) - "paddw %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo) - "paddw %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo) - "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo) - "psrlw $4,%%xmm4 \n" // ^ div by 16 (1, lo) - - "movdqa %%xmm1,%%xmm5 \n" - "paddw %%xmm1,%%xmm5 \n" // 6*near+2*far (2, lo) - "paddw %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo) - "paddw %%xmm1,%%xmm5 \n" // 9*near+3*far (2, lo) - "paddw %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo) - "psrlw $4,%%xmm5 \n" // ^ div by 16 (2, lo) - - "movdqa %%xmm2,%%xmm0 \n" - "movdqa %%xmm3,%%xmm1 \n" - "paddw %%xmm2,%%xmm0 \n" // 6*near+2*far (1, hi) - "paddw %%xmm6,%%xmm1 \n" // 3*near+far+8 (2, hi) - "paddw %%xmm2,%%xmm0 \n" // 9*near+3*far (1, hi) - "paddw %%xmm1,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi) - "psrlw $4,%%xmm0 \n" // ^ div by 16 (1, hi) - - "movdqa %%xmm3,%%xmm1 \n" - "paddw %%xmm3,%%xmm1 \n" // 6*near+2*far (2, hi) - "paddw %%xmm6,%%xmm2 \n" // 3*near+far+8 (1, hi) - "paddw %%xmm3,%%xmm1 \n" // 9*near+3*far (2, hi) - "paddw %%xmm2,%%xmm1 \n" // 9 3 3 1 + 8 (2, hi) - "psrlw $4,%%xmm1 \n" // ^ div by 16 (2, hi) - - "packuswb %%xmm0,%%xmm4 \n" - "movdqu %%xmm4,(%1) \n" // store above - "packuswb %%xmm1,%%xmm5 \n" - "movdqu %%xmm5,(%1,%4) \n" // store below - - "lea 0x8(%0),%0 \n" - "lea 0x10(%1),%1 \n" // 8 sample to 16 sample - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(src_stride), // %3 - "r"(dst_stride), // %4 - "m"(kLinearMadd31) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif #ifdef HAS_SCALEROWUP2_LINEAR_AVX2 void ScaleRowUp2_Linear_AVX2(const uint8_t* src_ptr, @@ -1756,34 +695,7 @@ void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, #endif // Reads 16xN bytes and produces 16 shorts at a time. -void ScaleAddRow_SSE2(const uint8_t* src_ptr, - uint16_t* dst_ptr, - int src_width) { - asm volatile("pxor %%xmm5,%%xmm5 \n" - // 16 pixel loop. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm3 \n" - "lea 0x10(%0),%0 \n" // src_ptr += 16 - "movdqu (%1),%%xmm0 \n" - "movdqu 0x10(%1),%%xmm1 \n" - "movdqa %%xmm3,%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "punpckhbw %%xmm5,%%xmm3 \n" - "paddusw %%xmm2,%%xmm0 \n" - "paddusw %%xmm3,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(src_width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); -} #ifdef HAS_SCALEADDROW_AVX2 // Reads 32 bytes and accumulates to 32 shorts at a time. @@ -1825,383 +737,31 @@ static const uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040}; // Bilinear column filtering. SSSE3 version. -void ScaleFilterCols_SSSE3(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x, - int dx) { - intptr_t x0, x1, temp_pixel; - asm volatile( - "movd %6,%%xmm2 \n" - "movd %7,%%xmm3 \n" - "movl $0x04040000,%k2 \n" - "movd %k2,%%xmm5 \n" - "pcmpeqb %%xmm6,%%xmm6 \n" - "psrlw $0x9,%%xmm6 \n" // 0x007f007f - "pcmpeqb %%xmm7,%%xmm7 \n" - "psrlw $15,%%xmm7 \n" // 0x00010001 - "pextrw $0x1,%%xmm2,%k3 \n" - "subl $0x2,%5 \n" - "jl 29f \n" - "movdqa %%xmm2,%%xmm0 \n" - "paddd %%xmm3,%%xmm0 \n" - "punpckldq %%xmm0,%%xmm2 \n" - "punpckldq %%xmm3,%%xmm3 \n" - "paddd %%xmm3,%%xmm3 \n" - "pextrw $0x3,%%xmm2,%k4 \n" - - LABELALIGN - "2: \n" - "movdqa %%xmm2,%%xmm1 \n" - "paddd %%xmm3,%%xmm2 \n" - "movzwl 0x00(%1,%3,1),%k2 \n" - "movd %k2,%%xmm0 \n" - "psrlw $0x9,%%xmm1 \n" - "movzwl 0x00(%1,%4,1),%k2 \n" - "movd %k2,%%xmm4 \n" - "pshufb %%xmm5,%%xmm1 \n" - "punpcklwd %%xmm4,%%xmm0 \n" - "psubb %8,%%xmm0 \n" // make pixels signed. - "pxor %%xmm6,%%xmm1 \n" // 128 - f = (f ^ 127 ) + - // 1 - "paddusb %%xmm7,%%xmm1 \n" - "pmaddubsw %%xmm0,%%xmm1 \n" - "pextrw $0x1,%%xmm2,%k3 \n" - "pextrw $0x3,%%xmm2,%k4 \n" - "paddw %9,%%xmm1 \n" // make pixels unsigned. - "psrlw $0x7,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movd %%xmm1,%k2 \n" - "mov %w2,(%0) \n" - "lea 0x2(%0),%0 \n" - "subl $0x2,%5 \n" - "jge 2b \n" - - LABELALIGN - "29: \n" - "addl $0x1,%5 \n" - "jl 99f \n" - "movzwl 0x00(%1,%3,1),%k2 \n" - "movd %k2,%%xmm0 \n" - "psrlw $0x9,%%xmm2 \n" - "pshufb %%xmm5,%%xmm2 \n" - "psubb %8,%%xmm0 \n" // make pixels signed. - "pxor %%xmm6,%%xmm2 \n" - "paddusb %%xmm7,%%xmm2 \n" - "pmaddubsw %%xmm0,%%xmm2 \n" - "paddw %9,%%xmm2 \n" // make pixels unsigned. - "psrlw $0x7,%%xmm2 \n" - "packuswb %%xmm2,%%xmm2 \n" - "movd %%xmm2,%k2 \n" - "mov %b2,(%0) \n" - "99: \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "=&a"(temp_pixel), // %2 - "=&r"(x0), // %3 - "=&r"(x1), // %4 -#if defined(__x86_64__) - "+rm"(dst_width) // %5 -#else - "+m"(dst_width) // %5 -#endif - : "rm"(x), // %6 - "rm"(dx), // %7 -#if defined(__x86_64__) - "x"(kFsub80), // %8 - "x"(kFadd40) // %9 -#else - "m"(kFsub80), // %8 - "m"(kFadd40) // %9 -#endif - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} // Reads 4 pixels, duplicates them and writes 8 pixels. // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. -void ScaleColsUp2_SSE2(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x, - int dx) { - (void)x; - (void)dx; - asm volatile( - "1: \n" - "movdqu (%1),%%xmm0 \n" - "lea 0x10(%1),%1 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm1 \n" - "movdqu %%xmm0,(%0) \n" - "movdqu %%xmm1,0x10(%0) \n" - "lea 0x20(%0),%0 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "xmm0", "xmm1"); -} -void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width) { - (void)src_stride; - asm volatile( - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "shufps $0xdd,%%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "xmm0", "xmm1"); -} -void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width) { - (void)src_stride; - asm volatile( - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm2 \n" - "pavgb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "xmm0", "xmm1"); -} -void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width) { - asm volatile( - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x00(%0,%3,1),%%xmm2 \n" - "movdqu 0x10(%0,%3,1),%%xmm3 \n" - "lea 0x20(%0),%0 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm2 \n" - "pavgb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - : "r"(src_stride) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); -} + + + // Reads 4 pixels at a time. // Alignment requirement: dst_argb 16 byte aligned. -void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_argb, - int dst_width) { - ptrdiff_t src_stepx_x4 = (ptrdiff_t)(src_stepx); - ptrdiff_t src_stepx_x12; - (void)src_stride; - asm volatile( - "lea 0x00(,%1,4),%1 \n" - "lea 0x00(%1,%1,2),%4 \n" - LABELALIGN - "1: \n" - "movd (%0),%%xmm0 \n" - "movd 0x00(%0,%1,1),%%xmm1 \n" - "punpckldq %%xmm1,%%xmm0 \n" - "movd 0x00(%0,%1,2),%%xmm2 \n" - "movd 0x00(%0,%4,1),%%xmm3 \n" - "lea 0x00(%0,%1,4),%0 \n" - "punpckldq %%xmm3,%%xmm2 \n" - "punpcklqdq %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%3 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_stepx_x4), // %1 - "+r"(dst_argb), // %2 - "+r"(dst_width), // %3 - "=&r"(src_stepx_x12) // %4 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); -} // Blends four 2x2 to 4x1. // Alignment requirement: dst_argb 16 byte aligned. -void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_argb, - int dst_width) { - ptrdiff_t src_stepx_x4 = (ptrdiff_t)(src_stepx); - ptrdiff_t src_stepx_x12; - asm volatile( - "lea 0x00(,%1,4),%1 \n" - "lea 0x00(%1,%1,2),%4 \n" - "lea 0x00(%0,%5,1),%5 \n" - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" - "movhps 0x00(%0,%1,1),%%xmm0 \n" - "movq 0x00(%0,%1,2),%%xmm1 \n" - "movhps 0x00(%0,%4,1),%%xmm1 \n" - "lea 0x00(%0,%1,4),%0 \n" - "movq (%5),%%xmm2 \n" - "movhps 0x00(%5,%1,1),%%xmm2 \n" - "movq 0x00(%5,%1,2),%%xmm3 \n" - "movhps 0x00(%5,%4,1),%%xmm3 \n" - "lea 0x00(%5,%1,4),%5 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm2 \n" - "pavgb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%3 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_stepx_x4), // %1 - "+r"(dst_argb), // %2 - "+rm"(dst_width), // %3 - "=&r"(src_stepx_x12), // %4 - "+r"(src_stride) // %5 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); -} -void ScaleARGBCols_SSE2(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx) { - intptr_t x0, x1; - asm volatile( - "movd %5,%%xmm2 \n" - "movd %6,%%xmm3 \n" - "pshufd $0x0,%%xmm2,%%xmm2 \n" - "pshufd $0x11,%%xmm3,%%xmm0 \n" - "paddd %%xmm0,%%xmm2 \n" - "paddd %%xmm3,%%xmm3 \n" - "pshufd $0x5,%%xmm3,%%xmm0 \n" - "paddd %%xmm0,%%xmm2 \n" - "paddd %%xmm3,%%xmm3 \n" - "pshufd $0x0,%%xmm3,%%xmm3 \n" - "pextrw $0x1,%%xmm2,%k0 \n" - "pextrw $0x3,%%xmm2,%k1 \n" - "cmp $0x0,%4 \n" - "jl 99f \n" - "sub $0x4,%4 \n" - "jl 49f \n" - LABELALIGN - "40: \n" - "movd 0x00(%3,%0,4),%%xmm0 \n" - "movd 0x00(%3,%1,4),%%xmm1 \n" - "pextrw $0x5,%%xmm2,%k0 \n" - "pextrw $0x7,%%xmm2,%k1 \n" - "paddd %%xmm3,%%xmm2 \n" - "punpckldq %%xmm1,%%xmm0 \n" - "movd 0x00(%3,%0,4),%%xmm1 \n" - "movd 0x00(%3,%1,4),%%xmm4 \n" - "pextrw $0x1,%%xmm2,%k0 \n" - "pextrw $0x3,%%xmm2,%k1 \n" - "punpckldq %%xmm4,%%xmm1 \n" - "punpcklqdq %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%4 \n" - "jge 40b \n" - - "49: \n" - "test $0x2,%4 \n" - "je 29f \n" - "movd 0x00(%3,%0,4),%%xmm0 \n" - "movd 0x00(%3,%1,4),%%xmm1 \n" - "pextrw $0x5,%%xmm2,%k0 \n" - "punpckldq %%xmm1,%%xmm0 \n" - "movq %%xmm0,(%2) \n" - "lea 0x8(%2),%2 \n" - "29: \n" - "test $0x1,%4 \n" - "je 99f \n" - "movd 0x00(%3,%0,4),%%xmm0 \n" - "movd %%xmm0,(%2) \n" - "99: \n" - : "=&a"(x0), // %0 - "=&d"(x1), // %1 - "+r"(dst_argb), // %2 - "+r"(src_argb), // %3 - "+r"(dst_width) // %4 - : "rm"(x), // %5 - "rm"(dx) // %6 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); -} // Reads 4 pixels, duplicates them and writes 8 pixels. // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. -void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx) { - (void)x; - (void)dx; - asm volatile( - "1: \n" - "movdqu (%1),%%xmm0 \n" - "lea 0x10(%1),%1 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpckldq %%xmm0,%%xmm0 \n" - "punpckhdq %%xmm1,%%xmm1 \n" - "movdqu %%xmm0,(%0) \n" - "movdqu %%xmm1,0x10(%0) \n" - "lea 0x20(%0),%0 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(dst_argb), // %0 - "+r"(src_argb), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "xmm0", "xmm1"); -} // Shuffle table for arranging 2 pixels into pairs for pmaddubsw static const uvec8 kShuffleColARGB = { @@ -2215,80 +775,7 @@ static const uvec8 kShuffleFractions = { }; // Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version -void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx) { - intptr_t x0, x1; - asm volatile( - "movdqa %0,%%xmm4 \n" - "movdqa %1,%%xmm5 \n" - : - : "m"(kShuffleColARGB), // %0 - "m"(kShuffleFractions) // %1 - ); - asm volatile( - "movd %5,%%xmm2 \n" - "movd %6,%%xmm3 \n" - "pcmpeqb %%xmm6,%%xmm6 \n" - "psrlw $0x9,%%xmm6 \n" - "pextrw $0x1,%%xmm2,%k3 \n" - "sub $0x2,%2 \n" - "jl 29f \n" - "movdqa %%xmm2,%%xmm0 \n" - "paddd %%xmm3,%%xmm0 \n" - "punpckldq %%xmm0,%%xmm2 \n" - "punpckldq %%xmm3,%%xmm3 \n" - "paddd %%xmm3,%%xmm3 \n" - "pextrw $0x3,%%xmm2,%k4 \n" - - LABELALIGN - "2: \n" - "movdqa %%xmm2,%%xmm1 \n" - "paddd %%xmm3,%%xmm2 \n" - "movq 0x00(%1,%3,4),%%xmm0 \n" - "psrlw $0x9,%%xmm1 \n" - "movhps 0x00(%1,%4,4),%%xmm0 \n" - "pshufb %%xmm5,%%xmm1 \n" - "pshufb %%xmm4,%%xmm0 \n" - "pxor %%xmm6,%%xmm1 \n" - "pmaddubsw %%xmm1,%%xmm0 \n" - "psrlw $0x7,%%xmm0 \n" - "pextrw $0x1,%%xmm2,%k3 \n" - "pextrw $0x3,%%xmm2,%k4 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,(%0) \n" - "lea 0x8(%0),%0 \n" - "sub $0x2,%2 \n" - "jge 2b \n" - - LABELALIGN - "29: \n" - "add $0x1,%2 \n" - "jl 99f \n" - "psrlw $0x9,%%xmm2 \n" - "movq 0x00(%1,%3,4),%%xmm0 \n" - "pshufb %%xmm5,%%xmm2 \n" - "pshufb %%xmm4,%%xmm0 \n" - "pxor %%xmm6,%%xmm2 \n" - "pmaddubsw %%xmm2,%%xmm0 \n" - "psrlw $0x7,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movd %%xmm0,(%0) \n" - - LABELALIGN "99: \n" - - : "+r"(dst_argb), // %0 - "+r"(src_argb), // %1 - "+rm"(dst_width), // %2 - "=&r"(x0), // %3 - "=&r"(x1) // %4 - : "rm"(x), // %5 - "rm"(dx) // %6 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} // Divide num by div and return as 16.16 fixed point result. int FixedDiv_X86(int num, int div) { @@ -2321,7 +808,7 @@ int FixedDiv1_X86(int num, int div) { return num; } -#if defined(HAS_SCALEUVROWDOWN2BOX_SSSE3) || \ +#if 1 || \ defined(HAS_SCALEUVROWDOWN2BOX_AVX2) // Shuffle table for splitting UV into upper and lower part of register. @@ -2332,46 +819,6 @@ static const uvec8 kShuffleMergeUV = {0u, 8u, 2u, 10u, 4u, 12u, 0x80, 0x80, 0x80, 0x80}; #endif -#ifdef HAS_SCALEUVROWDOWN2BOX_SSSE3 - -void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - asm volatile( - "pcmpeqb %%xmm4,%%xmm4 \n" // 01010101 - "psrlw $0xf,%%xmm4 \n" - "packuswb %%xmm4,%%xmm4 \n" - "pxor %%xmm5, %%xmm5 \n" // zero - "movdqa %4,%%xmm1 \n" // split shuffler - "movdqa %5,%%xmm3 \n" // merge shuffler - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" // 8 UV row 0 - "movdqu 0x00(%0,%3,1),%%xmm2 \n" // 8 UV row 1 - "lea 0x10(%0),%0 \n" - "pshufb %%xmm1,%%xmm0 \n" // uuuuvvvv - "pshufb %%xmm1,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" // horizontal add - "pmaddubsw %%xmm4,%%xmm2 \n" - "paddw %%xmm2,%%xmm0 \n" // vertical add - "psrlw $0x1,%%xmm0 \n" // round - "pavgw %%xmm5,%%xmm0 \n" - "pshufb %%xmm3,%%xmm0 \n" // merge uv - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" // 4 UV - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(src_stride), // %3 - "m"(kShuffleSplitUV), // %4 - "m"(kShuffleMergeUV) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} -#endif // HAS_SCALEUVROWDOWN2BOX_SSSE3 #ifdef HAS_SCALEUVROWDOWN2BOX_AVX2 void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr, @@ -2417,129 +864,7 @@ void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr, static const uvec8 kUVLinearMadd31 = {3, 1, 3, 1, 1, 3, 1, 3, 3, 1, 3, 1, 1, 3, 1, 3}; -#ifdef HAS_SCALEUVROWUP2_LINEAR_SSSE3 -void ScaleUVRowUp2_Linear_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width) { - asm volatile( - "pcmpeqw %%xmm4,%%xmm4 \n" - "psrlw $15,%%xmm4 \n" - "psllw $1,%%xmm4 \n" // all 2 - "movdqa %3,%%xmm3 \n" - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" // 00112233 (1u1v) - "movq 2(%0),%%xmm1 \n" // 11223344 (1u1v) - "punpcklbw %%xmm1,%%xmm0 \n" // 0101121223233434 (2u2v) - "movdqa %%xmm0,%%xmm2 \n" - "punpckhdq %%xmm0,%%xmm2 \n" // 2323232334343434 (2u2v) - "punpckldq %%xmm0,%%xmm0 \n" // 0101010112121212 (2u2v) - "pmaddubsw %%xmm3,%%xmm2 \n" // 3*near+far (1u1v16, hi) - "pmaddubsw %%xmm3,%%xmm0 \n" // 3*near+far (1u1v16, lo) - "paddw %%xmm4,%%xmm0 \n" // 3*near+far+2 (lo) - "paddw %%xmm4,%%xmm2 \n" // 3*near+far+2 (hi) - "psrlw $2,%%xmm0 \n" // 3/4*near+1/4*far (lo) - "psrlw $2,%%xmm2 \n" // 3/4*near+1/4*far (hi) - "packuswb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - - "lea 0x8(%0),%0 \n" - "lea 0x10(%1),%1 \n" // 4 uv to 8 uv - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "m"(kUVLinearMadd31) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} -#endif - -#ifdef HAS_SCALEUVROWUP2_BILINEAR_SSSE3 -void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - asm volatile( - "pcmpeqw %%xmm6,%%xmm6 \n" - "psrlw $15,%%xmm6 \n" - "psllw $3,%%xmm6 \n" // all 8 - "movdqa %5,%%xmm7 \n" - - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" // 00112233 (1u1v) - "movq 2(%0),%%xmm1 \n" // 11223344 (1u1v) - "punpcklbw %%xmm1,%%xmm0 \n" // 0101121223233434 (2u2v) - "movdqa %%xmm0,%%xmm2 \n" - "punpckhdq %%xmm0,%%xmm2 \n" // 2323232334343434 (2u2v) - "punpckldq %%xmm0,%%xmm0 \n" // 0101010112121212 (2u2v) - "pmaddubsw %%xmm7,%%xmm2 \n" // 3*near+far (1u1v16, hi) - "pmaddubsw %%xmm7,%%xmm0 \n" // 3*near+far (1u1v16, lo) - - "movq (%0,%3),%%xmm1 \n" - "movq 2(%0,%3),%%xmm4 \n" - "punpcklbw %%xmm4,%%xmm1 \n" - "movdqa %%xmm1,%%xmm3 \n" - "punpckhdq %%xmm1,%%xmm3 \n" - "punpckldq %%xmm1,%%xmm1 \n" - "pmaddubsw %%xmm7,%%xmm3 \n" // 3*near+far (2, hi) - "pmaddubsw %%xmm7,%%xmm1 \n" // 3*near+far (2, lo) - - // xmm0 xmm2 - // xmm1 xmm3 - - "movdqa %%xmm0,%%xmm4 \n" - "movdqa %%xmm1,%%xmm5 \n" - "paddw %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo) - "paddw %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo) - "paddw %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo) - "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo) - "psrlw $4,%%xmm4 \n" // ^ div by 16 (1, lo) - - "movdqa %%xmm1,%%xmm5 \n" - "paddw %%xmm1,%%xmm5 \n" // 6*near+2*far (2, lo) - "paddw %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo) - "paddw %%xmm1,%%xmm5 \n" // 9*near+3*far (2, lo) - "paddw %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo) - "psrlw $4,%%xmm5 \n" // ^ div by 16 (2, lo) - - "movdqa %%xmm2,%%xmm0 \n" - "movdqa %%xmm3,%%xmm1 \n" - "paddw %%xmm2,%%xmm0 \n" // 6*near+2*far (1, hi) - "paddw %%xmm6,%%xmm1 \n" // 3*near+far+8 (2, hi) - "paddw %%xmm2,%%xmm0 \n" // 9*near+3*far (1, hi) - "paddw %%xmm1,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi) - "psrlw $4,%%xmm0 \n" // ^ div by 16 (1, hi) - - "movdqa %%xmm3,%%xmm1 \n" - "paddw %%xmm3,%%xmm1 \n" // 6*near+2*far (2, hi) - "paddw %%xmm6,%%xmm2 \n" // 3*near+far+8 (1, hi) - "paddw %%xmm3,%%xmm1 \n" // 9*near+3*far (2, hi) - "paddw %%xmm2,%%xmm1 \n" // 9 3 3 1 + 8 (2, hi) - "psrlw $4,%%xmm1 \n" // ^ div by 16 (2, hi) - - "packuswb %%xmm0,%%xmm4 \n" - "movdqu %%xmm4,(%1) \n" // store above - "packuswb %%xmm1,%%xmm5 \n" - "movdqu %%xmm5,(%1,%4) \n" // store below - - "lea 0x8(%0),%0 \n" - "lea 0x10(%1),%1 \n" // 4 uv to 8 uv - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(src_stride), // %3 - "r"(dst_stride), // %4 - "m"(kUVLinearMadd31) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif #ifdef HAS_SCALEUVROWUP2_LINEAR_AVX2 @@ -2665,148 +990,7 @@ void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr, } #endif -#ifdef HAS_SCALEUVROWUP2_LINEAR_16_SSE41 -void ScaleUVRowUp2_Linear_16_SSE41(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width) { - asm volatile( - "pxor %%xmm5,%%xmm5 \n" - "pcmpeqd %%xmm4,%%xmm4 \n" - "psrld $31,%%xmm4 \n" - "pslld $1,%%xmm4 \n" // all 2 - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" // 0011 (16b, 1u1v) - "movq 4(%0),%%xmm1 \n" // 1122 (16b, 1u1v) - - "punpcklwd %%xmm5,%%xmm0 \n" // 0011 (32b, 1u1v) - "punpcklwd %%xmm5,%%xmm1 \n" // 1122 (32b, 1u1v) - - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - - "pshufd $0b01001110,%%xmm2,%%xmm2 \n" // 1100 (lo, far) - "pshufd $0b01001110,%%xmm3,%%xmm3 \n" // 2211 (hi, far) - - "paddd %%xmm4,%%xmm2 \n" // far+2 (lo) - "paddd %%xmm4,%%xmm3 \n" // far+2 (hi) - "paddd %%xmm0,%%xmm2 \n" // near+far+2 (lo) - "paddd %%xmm1,%%xmm3 \n" // near+far+2 (hi) - "paddd %%xmm0,%%xmm0 \n" // 2*near (lo) - "paddd %%xmm1,%%xmm1 \n" // 2*near (hi) - "paddd %%xmm2,%%xmm0 \n" // 3*near+far+2 (lo) - "paddd %%xmm3,%%xmm1 \n" // 3*near+far+2 (hi) - - "psrld $2,%%xmm0 \n" // 3/4*near+1/4*far (lo) - "psrld $2,%%xmm1 \n" // 3/4*near+1/4*far (hi) - "packusdw %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - - "lea 0x8(%0),%0 \n" - "lea 0x10(%1),%1 \n" // 2 uv to 4 uv - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} -#endif - -#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_SSE41 -void ScaleUVRowUp2_Bilinear_16_SSE41(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - asm volatile( - "pxor %%xmm7,%%xmm7 \n" - "pcmpeqd %%xmm6,%%xmm6 \n" - "psrld $31,%%xmm6 \n" - "pslld $3,%%xmm6 \n" // all 8 - - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" // 0011 (16b, 1u1v) - "movq 4(%0),%%xmm1 \n" // 1122 (16b, 1u1v) - "punpcklwd %%xmm7,%%xmm0 \n" // 0011 (near) (32b, 1u1v) - "punpcklwd %%xmm7,%%xmm1 \n" // 1122 (near) (32b, 1u1v) - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - "pshufd $0b01001110,%%xmm2,%%xmm2 \n" // 1100 (far) (1, lo) - "pshufd $0b01001110,%%xmm3,%%xmm3 \n" // 2211 (far) (1, hi) - "paddd %%xmm0,%%xmm2 \n" // near+far (1, lo) - "paddd %%xmm1,%%xmm3 \n" // near+far (1, hi) - "paddd %%xmm0,%%xmm0 \n" // 2*near (1, lo) - "paddd %%xmm1,%%xmm1 \n" // 2*near (1, hi) - "paddd %%xmm2,%%xmm0 \n" // 3*near+far (1, lo) - "paddd %%xmm3,%%xmm1 \n" // 3*near+far (1, hi) - - "movq (%0,%3,2),%%xmm2 \n" - "movq 4(%0,%3,2),%%xmm3 \n" - "punpcklwd %%xmm7,%%xmm2 \n" - "punpcklwd %%xmm7,%%xmm3 \n" - "movdqa %%xmm2,%%xmm4 \n" - "movdqa %%xmm3,%%xmm5 \n" - "pshufd $0b01001110,%%xmm4,%%xmm4 \n" // 1100 (far) (2, lo) - "pshufd $0b01001110,%%xmm5,%%xmm5 \n" // 2211 (far) (2, hi) - "paddd %%xmm2,%%xmm4 \n" // near+far (2, lo) - "paddd %%xmm3,%%xmm5 \n" // near+far (2, hi) - "paddd %%xmm2,%%xmm2 \n" // 2*near (2, lo) - "paddd %%xmm3,%%xmm3 \n" // 2*near (2, hi) - "paddd %%xmm4,%%xmm2 \n" // 3*near+far (2, lo) - "paddd %%xmm5,%%xmm3 \n" // 3*near+far (2, hi) - - "movdqa %%xmm0,%%xmm4 \n" - "movdqa %%xmm2,%%xmm5 \n" - "paddd %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo) - "paddd %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo) - "paddd %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo) - "paddd %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo) - "psrld $4,%%xmm4 \n" // ^ div by 16 (1, lo) - - "movdqa %%xmm2,%%xmm5 \n" - "paddd %%xmm2,%%xmm5 \n" // 6*near+2*far (2, lo) - "paddd %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo) - "paddd %%xmm2,%%xmm5 \n" // 9*near+3*far (2, lo) - "paddd %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo) - "psrld $4,%%xmm5 \n" // ^ div by 16 (2, lo) - - "movdqa %%xmm1,%%xmm0 \n" - "movdqa %%xmm3,%%xmm2 \n" - "paddd %%xmm1,%%xmm0 \n" // 6*near+2*far (1, hi) - "paddd %%xmm6,%%xmm2 \n" // 3*near+far+8 (2, hi) - "paddd %%xmm1,%%xmm0 \n" // 9*near+3*far (1, hi) - "paddd %%xmm2,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi) - "psrld $4,%%xmm0 \n" // ^ div by 16 (1, hi) - - "movdqa %%xmm3,%%xmm2 \n" - "paddd %%xmm3,%%xmm2 \n" // 6*near+2*far (2, hi) - "paddd %%xmm6,%%xmm1 \n" // 3*near+far+8 (1, hi) - "paddd %%xmm3,%%xmm2 \n" // 9*near+3*far (2, hi) - "paddd %%xmm1,%%xmm2 \n" // 9 3 3 1 + 8 (2, hi) - "psrld $4,%%xmm2 \n" // ^ div by 16 (2, hi) - - "packusdw %%xmm0,%%xmm4 \n" - "movdqu %%xmm4,(%1) \n" // store above - "packusdw %%xmm2,%%xmm5 \n" - "movdqu %%xmm5,(%1,%4,2) \n" // store below - - "lea 0x8(%0),%0 \n" - "lea 0x10(%1),%1 \n" // 2 uv to 4 uv - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(src_stride), // %3 - "r"(dst_stride) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif #ifdef HAS_SCALEUVROWUP2_LINEAR_16_AVX2 void ScaleUVRowUp2_Linear_16_AVX2(const uint16_t* src_ptr, diff --git a/source/scale_uv.cc b/source/scale_uv.cc index 3d41a2398..f80e476f2 100644 --- a/source/scale_uv.cc +++ b/source/scale_uv.cc @@ -88,11 +88,9 @@ static void ScaleUVDown2(int src_width, src_uv += (y >> 16) * (intptr_t)src_stride + ((x >> 16) - 1) * 2; } -#if defined(HAS_SCALEUVROWDOWN2BOX_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3) && filtering) { - ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_SSSE3; if (IS_ALIGNED(dst_width, 4)) { - ScaleUVRowDown2 = ScaleUVRowDown2Box_SSSE3; } } #endif @@ -138,7 +136,7 @@ static void ScaleUVDown2(int src_width, #endif // This code is not enabled. Only box filter is available at this time. -#if defined(HAS_SCALEUVROWDOWN2_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { ScaleUVRowDown2 = filtering == kFilterNone @@ -200,11 +198,9 @@ static int ScaleUVDown4Box(int src_width, assert(dx == 65536 * 4); // Test scale factor of 4. assert((dy & 0x3ffff) == 0); // Test vertical scale is multiple of 4. -#if defined(HAS_SCALEUVROWDOWN2BOX_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_SSSE3; if (IS_ALIGNED(dst_width, 4)) { - ScaleUVRowDown2 = ScaleUVRowDown2Box_SSSE3; } } #endif @@ -276,7 +272,7 @@ static void ScaleUVDownEven(int src_width, assert(IS_ALIGNED(src_width, 2)); assert(IS_ALIGNED(src_height, 2)); src_uv += (y >> 16) * (intptr_t)src_stride + (x >> 16) * 2; -#if defined(HAS_SCALEUVROWDOWNEVEN_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { ScaleUVRowDownEven = filtering ? ScaleUVRowDownEvenBox_Any_SSSE3 : ScaleUVRowDownEven_Any_SSSE3; @@ -363,11 +359,9 @@ static int ScaleUVBilinearDown(int src_width, clip_src_width = (int)(xr - xl) * 2; // Width aligned to 2. src_uv += xl * 2; x -= (int)(xl << 16); -#if defined(HAS_INTERPOLATEROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - InterpolateRow = InterpolateRow_Any_SSSE3; if (IS_ALIGNED(clip_src_width, 16)) { - InterpolateRow = InterpolateRow_SSSE3; } } #endif @@ -405,9 +399,8 @@ static int ScaleUVBilinearDown(int src_width, InterpolateRow = InterpolateRow_RVV; } #endif -#if defined(HAS_SCALEUVFILTERCOLS_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { - ScaleUVFilterCols = ScaleUVFilterCols_SSSE3; } #endif #if defined(HAS_SCALEUVFILTERCOLS_NEON) @@ -473,11 +466,9 @@ static int ScaleUVBilinearUp(int src_width, int dst_width, int x, int dx) = filtering ? ScaleUVFilterCols_C : ScaleUVCols_C; const int max_y = (src_height - 1) << 16; -#if defined(HAS_INTERPOLATEROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - InterpolateRow = InterpolateRow_Any_SSSE3; if (IS_ALIGNED(dst_width, 8)) { - InterpolateRow = InterpolateRow_SSSE3; } } #endif @@ -518,9 +509,8 @@ static int ScaleUVBilinearUp(int src_width, if (src_width >= 32768) { ScaleUVFilterCols = filtering ? ScaleUVFilterCols64_C : ScaleUVCols64_C; } -#if defined(HAS_SCALEUVFILTERCOLS_SSSE3) +#if 0 if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { - ScaleUVFilterCols = ScaleUVFilterCols_SSSE3; } #endif #if defined(HAS_SCALEUVFILTERCOLS_NEON) @@ -531,9 +521,8 @@ static int ScaleUVBilinearUp(int src_width, } } #endif -#if defined(HAS_SCALEUVCOLS_SSSE3) +#if 0 if (!filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { - ScaleUVFilterCols = ScaleUVCols_SSSE3; } #endif #if defined(HAS_SCALEUVCOLS_NEON) @@ -546,9 +535,8 @@ static int ScaleUVBilinearUp(int src_width, #endif if (!filtering && src_width * 2 == dst_width && x < 0x8000) { ScaleUVFilterCols = ScaleUVColsUp2_C; -#if defined(HAS_SCALEUVCOLSUP2_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(dst_width, 8)) { - ScaleUVFilterCols = ScaleUVColsUp2_SSSE3; } #endif } @@ -636,9 +624,8 @@ static void ScaleUVLinearUp2(int src_width, (void)src_width; assert(src_width == ((dst_width + 1) / 2)); -#ifdef HAS_SCALEUVROWUP2_LINEAR_SSSE3 +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ScaleRowUp = ScaleUVRowUp2_Linear_Any_SSSE3; } #endif @@ -696,9 +683,8 @@ static void ScaleUVBilinearUp2(int src_width, assert(src_width == ((dst_width + 1) / 2)); assert(src_height == ((dst_height + 1) / 2)); -#ifdef HAS_SCALEUVROWUP2_BILINEAR_SSSE3 +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - Scale2RowUp = ScaleUVRowUp2_Bilinear_Any_SSSE3; } #endif @@ -757,9 +743,8 @@ static void ScaleUVLinearUp2_16(int src_width, (void)src_width; assert(src_width == ((dst_width + 1) / 2)); -#ifdef HAS_SCALEUVROWUP2_LINEAR_16_SSE41 +#if 0 if (TestCpuFlag(kCpuHasSSE41)) { - ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_SSE41; } #endif @@ -811,9 +796,8 @@ static void ScaleUVBilinearUp2_16(int src_width, assert(src_width == ((dst_width + 1) / 2)); assert(src_height == ((dst_height + 1) / 2)); -#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_SSE41 +#if 0 if (TestCpuFlag(kCpuHasSSE41)) { - Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_SSE41; } #endif @@ -865,9 +849,8 @@ static void ScaleUVSimple(int src_width, int x, int dx) = (src_width >= 32768) ? ScaleUVCols64_C : ScaleUVCols_C; (void)src_height; -#if defined(HAS_SCALEUVCOLS_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { - ScaleUVCols = ScaleUVCols_SSSE3; } #endif #if defined(HAS_SCALEUVCOLS_NEON) @@ -880,9 +863,8 @@ static void ScaleUVSimple(int src_width, #endif if (src_width * 2 == dst_width && x < 0x8000) { ScaleUVCols = ScaleUVColsUp2_C; -#if defined(HAS_SCALEUVCOLSUP2_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(dst_width, 8)) { - ScaleUVCols = ScaleUVColsUp2_SSSE3; } #endif } diff --git a/source/scale_win.cc b/source/scale_win.cc index 32c0506fa..9b31a335a 100644 --- a/source/scale_win.cc +++ b/source/scale_win.cc @@ -90,109 +90,13 @@ static const uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0}; // Reads 32 pixels, throws half away and writes 16 pixels. -__declspec(naked) void ScaleRowDown2_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - mov eax, [esp + 4] // src_ptr - // src_stride ignored - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width - wloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - psrlw xmm0, 8 // isolate odd pixels. - psrlw xmm1, 8 - packuswb xmm0, xmm1 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - jg wloop - - ret - } -} // Blends 32x1 rectangle to 16x1. -__declspec(naked) void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - mov eax, [esp + 4] // src_ptr - // src_stride - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width - pcmpeqb xmm4, xmm4 // constant 0x0101 - psrlw xmm4, 15 - packuswb xmm4, xmm4 - pxor xmm5, xmm5 // constant 0 - - wloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - pmaddubsw xmm0, xmm4 // horizontal add - pmaddubsw xmm1, xmm4 - pavgw xmm0, xmm5 // (x + 1) / 2 - pavgw xmm1, xmm5 - packuswb xmm0, xmm1 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - jg wloop - - ret - } -} // Blends 32x2 rectangle to 16x1. -__declspec(naked) void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width - pcmpeqb xmm4, xmm4 // constant 0x0101 - psrlw xmm4, 15 - packuswb xmm4, xmm4 - pxor xmm5, xmm5 // constant 0 - - wloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + esi] - movdqu xmm3, [eax + esi + 16] - lea eax, [eax + 32] - pmaddubsw xmm0, xmm4 // horizontal add - pmaddubsw xmm1, xmm4 - pmaddubsw xmm2, xmm4 - pmaddubsw xmm3, xmm4 - paddw xmm0, xmm2 // vertical add - paddw xmm1, xmm3 - psrlw xmm0, 1 - psrlw xmm1, 1 - pavgw xmm0, xmm5 // (x + 1) / 2 - pavgw xmm1, xmm5 - packuswb xmm0, xmm1 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - jg wloop - - pop esi - ret - } -} #ifdef HAS_SCALEROWDOWN2_AVX2 // Reads 64 pixels, throws half away and writes 32 pixels. @@ -310,94 +214,10 @@ __declspec(naked) void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr, #endif // HAS_SCALEROWDOWN2_AVX2 // Point samples 32 pixels to 8 pixels. -__declspec(naked) void ScaleRowDown4_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - mov eax, [esp + 4] // src_ptr - // src_stride ignored - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff0000 - psrld xmm5, 24 - pslld xmm5, 16 - wloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - pand xmm0, xmm5 - pand xmm1, xmm5 - packuswb xmm0, xmm1 - psrlw xmm0, 8 - packuswb xmm0, xmm0 - movq qword ptr [edx], xmm0 - lea edx, [edx + 8] - sub ecx, 8 - jg wloop - - ret - } -} // Blends 32x4 rectangle to 8x1. -__declspec(naked) void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_ptr - mov esi, [esp + 8 + 8] // src_stride - mov edx, [esp + 8 + 12] // dst_ptr - mov ecx, [esp + 8 + 16] // dst_width - lea edi, [esi + esi * 2] // src_stride * 3 - pcmpeqb xmm4, xmm4 // constant 0x0101 - psrlw xmm4, 15 - movdqa xmm5, xmm4 - packuswb xmm4, xmm4 - psllw xmm5, 3 // constant 0x0008 - wloop: - movdqu xmm0, [eax] // average rows - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + esi] - movdqu xmm3, [eax + esi + 16] - pmaddubsw xmm0, xmm4 // horizontal add - pmaddubsw xmm1, xmm4 - pmaddubsw xmm2, xmm4 - pmaddubsw xmm3, xmm4 - paddw xmm0, xmm2 // vertical add rows 0, 1 - paddw xmm1, xmm3 - movdqu xmm2, [eax + esi * 2] - movdqu xmm3, [eax + esi * 2 + 16] - pmaddubsw xmm2, xmm4 - pmaddubsw xmm3, xmm4 - paddw xmm0, xmm2 // add row 2 - paddw xmm1, xmm3 - movdqu xmm2, [eax + edi] - movdqu xmm3, [eax + edi + 16] - lea eax, [eax + 32] - pmaddubsw xmm2, xmm4 - pmaddubsw xmm3, xmm4 - paddw xmm0, xmm2 // add row 3 - paddw xmm1, xmm3 - phaddw xmm0, xmm1 - paddw xmm0, xmm5 // + 8 for round - psrlw xmm0, 4 // /16 for average of 4 * 4 - packuswb xmm0, xmm0 - movq qword ptr [edx], xmm0 - lea edx, [edx + 8] - sub ecx, 8 - jg wloop - - pop edi - pop esi - ret - } -} #ifdef HAS_SCALEROWDOWN4_AVX2 // Point samples 64 pixels to 16 pixels. @@ -500,38 +320,7 @@ __declspec(naked) void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr, // Produces three 8 byte values. For each 8 bytes, 16 bytes are read. // Then shuffled to do the scaling. -__declspec(naked) void ScaleRowDown34_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - mov eax, [esp + 4] // src_ptr - // src_stride ignored - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width - movdqa xmm3, xmmword ptr kShuf0 - movdqa xmm4, xmmword ptr kShuf1 - movdqa xmm5, xmmword ptr kShuf2 - wloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - movdqa xmm2, xmm1 - palignr xmm1, xmm0, 8 - pshufb xmm0, xmm3 - pshufb xmm1, xmm4 - pshufb xmm2, xmm5 - movq qword ptr [edx], xmm0 - movq qword ptr [edx + 8], xmm1 - movq qword ptr [edx + 16], xmm2 - lea edx, [edx + 24] - sub ecx, 24 - jg wloop - - ret - } -} // Blends 32x2 rectangle to 24x1 // Produces three 8 byte values. For each 8 bytes, 16 bytes are read. @@ -548,295 +337,24 @@ __declspec(naked) void ScaleRowDown34_SSSE3(const uint8_t* src_ptr, // xmm7 kRound34 // Note that movdqa+palign may be better than movdqu. -__declspec(naked) void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width - movdqa xmm2, xmmword ptr kShuf01 - movdqa xmm3, xmmword ptr kShuf11 - movdqa xmm4, xmmword ptr kShuf21 - movdqa xmm5, xmmword ptr kMadd01 - movdqa xmm6, xmmword ptr kMadd11 - movdqa xmm7, xmmword ptr kRound34 - wloop: - movdqu xmm0, [eax] // pixels 0..7 - movdqu xmm1, [eax + esi] - pavgb xmm0, xmm1 - pshufb xmm0, xmm2 - pmaddubsw xmm0, xmm5 - paddsw xmm0, xmm7 - psrlw xmm0, 2 - packuswb xmm0, xmm0 - movq qword ptr [edx], xmm0 - movdqu xmm0, [eax + 8] // pixels 8..15 - movdqu xmm1, [eax + esi + 8] - pavgb xmm0, xmm1 - pshufb xmm0, xmm3 - pmaddubsw xmm0, xmm6 - paddsw xmm0, xmm7 - psrlw xmm0, 2 - packuswb xmm0, xmm0 - movq qword ptr [edx + 8], xmm0 - movdqu xmm0, [eax + 16] // pixels 16..23 - movdqu xmm1, [eax + esi + 16] - lea eax, [eax + 32] - pavgb xmm0, xmm1 - pshufb xmm0, xmm4 - movdqa xmm1, xmmword ptr kMadd21 - pmaddubsw xmm0, xmm1 - paddsw xmm0, xmm7 - psrlw xmm0, 2 - packuswb xmm0, xmm0 - movq qword ptr [edx + 16], xmm0 - lea edx, [edx + 24] - sub ecx, 24 - jg wloop - - pop esi - ret - } -} // Note that movdqa+palign may be better than movdqu. -__declspec(naked) void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width - movdqa xmm2, xmmword ptr kShuf01 - movdqa xmm3, xmmword ptr kShuf11 - movdqa xmm4, xmmword ptr kShuf21 - movdqa xmm5, xmmword ptr kMadd01 - movdqa xmm6, xmmword ptr kMadd11 - movdqa xmm7, xmmword ptr kRound34 - wloop: - movdqu xmm0, [eax] // pixels 0..7 - movdqu xmm1, [eax + esi] - pavgb xmm1, xmm0 - pavgb xmm0, xmm1 - pshufb xmm0, xmm2 - pmaddubsw xmm0, xmm5 - paddsw xmm0, xmm7 - psrlw xmm0, 2 - packuswb xmm0, xmm0 - movq qword ptr [edx], xmm0 - movdqu xmm0, [eax + 8] // pixels 8..15 - movdqu xmm1, [eax + esi + 8] - pavgb xmm1, xmm0 - pavgb xmm0, xmm1 - pshufb xmm0, xmm3 - pmaddubsw xmm0, xmm6 - paddsw xmm0, xmm7 - psrlw xmm0, 2 - packuswb xmm0, xmm0 - movq qword ptr [edx + 8], xmm0 - movdqu xmm0, [eax + 16] // pixels 16..23 - movdqu xmm1, [eax + esi + 16] - lea eax, [eax + 32] - pavgb xmm1, xmm0 - pavgb xmm0, xmm1 - pshufb xmm0, xmm4 - movdqa xmm1, xmmword ptr kMadd21 - pmaddubsw xmm0, xmm1 - paddsw xmm0, xmm7 - psrlw xmm0, 2 - packuswb xmm0, xmm0 - movq qword ptr [edx + 16], xmm0 - lea edx, [edx+24] - sub ecx, 24 - jg wloop - - pop esi - ret - } -} // 3/8 point sampler // Scale 32 pixels to 12 -__declspec(naked) void ScaleRowDown38_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - mov eax, [esp + 4] // src_ptr - // src_stride ignored - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width - movdqa xmm4, xmmword ptr kShuf38a - movdqa xmm5, xmmword ptr kShuf38b - xloop: - movdqu xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5 - movdqu xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11 - lea eax, [eax + 32] - pshufb xmm0, xmm4 - pshufb xmm1, xmm5 - paddusb xmm0, xmm1 - - movq qword ptr [edx], xmm0 // write 12 pixels - movhlps xmm1, xmm0 - movd [edx + 8], xmm1 - lea edx, [edx + 12] - sub ecx, 12 - jg xloop - - ret - } -} // Scale 16x3 pixels to 6x1 with interpolation -__declspec(naked) void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width - movdqa xmm2, xmmword ptr kShufAc - movdqa xmm3, xmmword ptr kShufAc3 - movdqa xmm4, xmmword ptr kScaleAc33 - pxor xmm5, xmm5 - xloop: - movdqu xmm0, [eax] // sum up 3 rows into xmm0/1 - movdqu xmm6, [eax + esi] - movhlps xmm1, xmm0 - movhlps xmm7, xmm6 - punpcklbw xmm0, xmm5 - punpcklbw xmm1, xmm5 - punpcklbw xmm6, xmm5 - punpcklbw xmm7, xmm5 - paddusw xmm0, xmm6 - paddusw xmm1, xmm7 - movdqu xmm6, [eax + esi * 2] - lea eax, [eax + 16] - movhlps xmm7, xmm6 - punpcklbw xmm6, xmm5 - punpcklbw xmm7, xmm5 - paddusw xmm0, xmm6 - paddusw xmm1, xmm7 - - movdqa xmm6, xmm0 // 8 pixels -> 0,1,2 of xmm6 - psrldq xmm0, 2 - paddusw xmm6, xmm0 - psrldq xmm0, 2 - paddusw xmm6, xmm0 - pshufb xmm6, xmm2 - - movdqa xmm7, xmm1 // 8 pixels -> 3,4,5 of xmm6 - psrldq xmm1, 2 - paddusw xmm7, xmm1 - psrldq xmm1, 2 - paddusw xmm7, xmm1 - pshufb xmm7, xmm3 - paddusw xmm6, xmm7 - - pmulhuw xmm6, xmm4 // divide by 9,9,6, 9,9,6 - packuswb xmm6, xmm6 - - movd [edx], xmm6 // write 6 pixels - psrlq xmm6, 16 - movd [edx + 2], xmm6 - lea edx, [edx + 6] - sub ecx, 6 - jg xloop - - pop esi - ret - } -} // Scale 16x2 pixels to 6x1 with interpolation -__declspec(naked) void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width - movdqa xmm2, xmmword ptr kShufAb0 - movdqa xmm3, xmmword ptr kShufAb1 - movdqa xmm4, xmmword ptr kShufAb2 - movdqa xmm5, xmmword ptr kScaleAb2 - xloop: - movdqu xmm0, [eax] // average 2 rows into xmm0 - movdqu xmm1, [eax + esi] - lea eax, [eax + 16] - pavgb xmm0, xmm1 - - movdqa xmm1, xmm0 // 16 pixels -> 0,1,2,3,4,5 of xmm1 - pshufb xmm1, xmm2 - movdqa xmm6, xmm0 - pshufb xmm6, xmm3 - paddusw xmm1, xmm6 - pshufb xmm0, xmm4 - paddusw xmm1, xmm0 - - pmulhuw xmm1, xmm5 // divide by 3,3,2, 3,3,2 - packuswb xmm1, xmm1 - - movd [edx], xmm1 // write 6 pixels - psrlq xmm1, 16 - movd [edx + 2], xmm1 - lea edx, [edx + 6] - sub ecx, 6 - jg xloop - - pop esi - ret - } -} // Reads 16 bytes and accumulates to 16 shorts at a time. -__declspec(naked) void ScaleAddRow_SSE2(const uint8_t* src_ptr, - uint16_t* dst_ptr, - int src_width) { - __asm { - mov eax, [esp + 4] // src_ptr - mov edx, [esp + 8] // dst_ptr - mov ecx, [esp + 12] // src_width - pxor xmm5, xmm5 - // sum rows - xloop: - movdqu xmm3, [eax] // read 16 bytes - lea eax, [eax + 16] - movdqu xmm0, [edx] // read 16 words from destination - movdqu xmm1, [edx + 16] - movdqa xmm2, xmm3 - punpcklbw xmm2, xmm5 - punpckhbw xmm3, xmm5 - paddusw xmm0, xmm2 // sum 16 words - paddusw xmm1, xmm3 - movdqu [edx], xmm0 // write 16 words to destination - movdqu [edx + 16], xmm1 - lea edx, [edx + 32] - sub ecx, 16 - jg xloop - ret - } -} #ifdef HAS_SCALEADDROW_AVX2 // Reads 32 bytes and accumulates to 32 shorts at a time. @@ -880,369 +398,28 @@ static const uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040}; // Bilinear column filtering. SSSE3 version. -__declspec(naked) void ScaleFilterCols_SSSE3(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x, - int dx) { - __asm { - push ebx - push esi - push edi - mov edi, [esp + 12 + 4] // dst_ptr - mov esi, [esp + 12 + 8] // src_ptr - mov ecx, [esp + 12 + 12] // dst_width - movd xmm2, [esp + 12 + 16] // x - movd xmm3, [esp + 12 + 20] // dx - mov eax, 0x04040000 // shuffle to line up fractions with pixel. - movd xmm5, eax - pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction. - psrlw xmm6, 9 - pcmpeqb xmm7, xmm7 // generate 0x0001 - psrlw xmm7, 15 - pextrw eax, xmm2, 1 // get x0 integer. preroll - sub ecx, 2 - jl xloop29 - movdqa xmm0, xmm2 // x1 = x0 + dx - paddd xmm0, xmm3 - punpckldq xmm2, xmm0 // x0 x1 - punpckldq xmm3, xmm3 // dx dx - paddd xmm3, xmm3 // dx * 2, dx * 2 - pextrw edx, xmm2, 3 // get x1 integer. preroll - - // 2 Pixel loop. - xloop2: - movdqa xmm1, xmm2 // x0, x1 fractions. - paddd xmm2, xmm3 // x += dx - movzx ebx, word ptr [esi + eax] // 2 source x0 pixels - movd xmm0, ebx - psrlw xmm1, 9 // 7 bit fractions. - movzx ebx, word ptr [esi + edx] // 2 source x1 pixels - movd xmm4, ebx - pshufb xmm1, xmm5 // 0011 - punpcklwd xmm0, xmm4 - psubb xmm0, xmmword ptr kFsub80 // make pixels signed. - pxor xmm1, xmm6 // 0..7f and 7f..0 - paddusb xmm1, xmm7 // +1 so 0..7f and 80..1 - pmaddubsw xmm1, xmm0 // 16 bit, 2 pixels. - pextrw eax, xmm2, 1 // get x0 integer. next iteration. - pextrw edx, xmm2, 3 // get x1 integer. next iteration. - paddw xmm1, xmmword ptr kFadd40 // make pixels unsigned and round. - psrlw xmm1, 7 // 8.7 fixed point to low 8 bits. - packuswb xmm1, xmm1 // 8 bits, 2 pixels. - movd ebx, xmm1 - mov [edi], bx - lea edi, [edi + 2] - sub ecx, 2 // 2 pixels - jge xloop2 - - xloop29: - add ecx, 2 - 1 - jl xloop99 - - // 1 pixel remainder - movzx ebx, word ptr [esi + eax] // 2 source x0 pixels - movd xmm0, ebx - psrlw xmm2, 9 // 7 bit fractions. - pshufb xmm2, xmm5 // 0011 - psubb xmm0, xmmword ptr kFsub80 // make pixels signed. - pxor xmm2, xmm6 // 0..7f and 7f..0 - paddusb xmm2, xmm7 // +1 so 0..7f and 80..1 - pmaddubsw xmm2, xmm0 // 16 bit - paddw xmm2, xmmword ptr kFadd40 // make pixels unsigned and round. - psrlw xmm2, 7 // 8.7 fixed point to low 8 bits. - packuswb xmm2, xmm2 // 8 bits - movd ebx, xmm2 - mov [edi], bl - - xloop99: - - pop edi - pop esi - pop ebx - ret - } -} // Reads 16 pixels, duplicates them and writes 32 pixels. -__declspec(naked) void ScaleColsUp2_SSE2(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x, - int dx) { - __asm { - mov edx, [esp + 4] // dst_ptr - mov eax, [esp + 8] // src_ptr - mov ecx, [esp + 12] // dst_width - wloop: - movdqu xmm0, [eax] - lea eax, [eax + 16] - movdqa xmm1, xmm0 - punpcklbw xmm0, xmm0 - punpckhbw xmm1, xmm1 - movdqu [edx], xmm0 - movdqu [edx + 16], xmm1 - lea edx, [edx + 32] - sub ecx, 32 - jg wloop - - ret - } -} // Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6) -__declspec(naked) void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width) { - __asm { - mov eax, [esp + 4] // src_argb - // src_stride ignored - mov edx, [esp + 12] // dst_argb - mov ecx, [esp + 16] // dst_width - wloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - shufps xmm0, xmm1, 0xdd - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jg wloop - - ret - } -} // Blends 8x1 rectangle to 4x1. -__declspec(naked) void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width) { - __asm { - mov eax, [esp + 4] // src_argb - // src_stride ignored - mov edx, [esp + 12] // dst_argb - mov ecx, [esp + 16] // dst_width - wloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - movdqa xmm2, xmm0 - shufps xmm0, xmm1, 0x88 // even pixels - shufps xmm2, xmm1, 0xdd // odd pixels - pavgb xmm0, xmm2 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jg wloop - - ret - } -} // Blends 8x2 rectangle to 4x1. -__declspec(naked) void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_argb - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_argb - mov ecx, [esp + 4 + 16] // dst_width - wloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + esi] - movdqu xmm3, [eax + esi + 16] - lea eax, [eax + 32] - pavgb xmm0, xmm2 // average rows - pavgb xmm1, xmm3 - movdqa xmm2, xmm0 // average columns (8 to 4 pixels) - shufps xmm0, xmm1, 0x88 // even pixels - shufps xmm2, xmm1, 0xdd // odd pixels - pavgb xmm0, xmm2 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jg wloop - - pop esi - ret - } -} // Reads 4 pixels at a time. -__declspec(naked) void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_argb, - int dst_width) { - __asm { - push ebx - push edi - mov eax, [esp + 8 + 4] // src_argb - // src_stride ignored - mov ebx, [esp + 8 + 12] // src_stepx - mov edx, [esp + 8 + 16] // dst_argb - mov ecx, [esp + 8 + 20] // dst_width - lea ebx, [ebx * 4] - lea edi, [ebx + ebx * 2] - wloop: - movd xmm0, [eax] - movd xmm1, [eax + ebx] - punpckldq xmm0, xmm1 - movd xmm2, [eax + ebx * 2] - movd xmm3, [eax + edi] - lea eax, [eax + ebx * 4] - punpckldq xmm2, xmm3 - punpcklqdq xmm0, xmm2 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jg wloop - - pop edi - pop ebx - ret - } -} // Blends four 2x2 to 4x1. -__declspec(naked) void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_argb, - int dst_width) { - __asm { - push ebx - push esi - push edi - mov eax, [esp + 12 + 4] // src_argb - mov esi, [esp + 12 + 8] // src_stride - mov ebx, [esp + 12 + 12] // src_stepx - mov edx, [esp + 12 + 16] // dst_argb - mov ecx, [esp + 12 + 20] // dst_width - lea esi, [eax + esi] // row1 pointer - lea ebx, [ebx * 4] - lea edi, [ebx + ebx * 2] - wloop: - movq xmm0, qword ptr [eax] // row0 4 pairs - movhps xmm0, qword ptr [eax + ebx] - movq xmm1, qword ptr [eax + ebx * 2] - movhps xmm1, qword ptr [eax + edi] - lea eax, [eax + ebx * 4] - movq xmm2, qword ptr [esi] // row1 4 pairs - movhps xmm2, qword ptr [esi + ebx] - movq xmm3, qword ptr [esi + ebx * 2] - movhps xmm3, qword ptr [esi + edi] - lea esi, [esi + ebx * 4] - pavgb xmm0, xmm2 // average rows - pavgb xmm1, xmm3 - movdqa xmm2, xmm0 // average columns (8 to 4 pixels) - shufps xmm0, xmm1, 0x88 // even pixels - shufps xmm2, xmm1, 0xdd // odd pixels - pavgb xmm0, xmm2 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jg wloop - - pop edi - pop esi - pop ebx - ret - } -} // Column scaling unfiltered. SSE2 version. -__declspec(naked) void ScaleARGBCols_SSE2(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx) { - __asm { - push edi - push esi - mov edi, [esp + 8 + 4] // dst_argb - mov esi, [esp + 8 + 8] // src_argb - mov ecx, [esp + 8 + 12] // dst_width - movd xmm2, [esp + 8 + 16] // x - movd xmm3, [esp + 8 + 20] // dx - pshufd xmm2, xmm2, 0 // x0 x0 x0 x0 - pshufd xmm0, xmm3, 0x11 // dx 0 dx 0 - paddd xmm2, xmm0 - paddd xmm3, xmm3 // 0, 0, 0, dx * 2 - pshufd xmm0, xmm3, 0x05 // dx * 2, dx * 2, 0, 0 - paddd xmm2, xmm0 // x3 x2 x1 x0 - paddd xmm3, xmm3 // 0, 0, 0, dx * 4 - pshufd xmm3, xmm3, 0 // dx * 4, dx * 4, dx * 4, dx * 4 - - pextrw eax, xmm2, 1 // get x0 integer. - pextrw edx, xmm2, 3 // get x1 integer. - - cmp ecx, 0 - jle xloop99 - sub ecx, 4 - jl xloop49 - - // 4 Pixel loop. - xloop4: - movd xmm0, [esi + eax * 4] // 1 source x0 pixels - movd xmm1, [esi + edx * 4] // 1 source x1 pixels - pextrw eax, xmm2, 5 // get x2 integer. - pextrw edx, xmm2, 7 // get x3 integer. - paddd xmm2, xmm3 // x += dx - punpckldq xmm0, xmm1 // x0 x1 - - movd xmm1, [esi + eax * 4] // 1 source x2 pixels - movd xmm4, [esi + edx * 4] // 1 source x3 pixels - pextrw eax, xmm2, 1 // get x0 integer. next iteration. - pextrw edx, xmm2, 3 // get x1 integer. next iteration. - punpckldq xmm1, xmm4 // x2 x3 - punpcklqdq xmm0, xmm1 // x0 x1 x2 x3 - movdqu [edi], xmm0 - lea edi, [edi + 16] - sub ecx, 4 // 4 pixels - jge xloop4 - - xloop49: - test ecx, 2 - je xloop29 - - // 2 Pixels. - movd xmm0, [esi + eax * 4] // 1 source x0 pixels - movd xmm1, [esi + edx * 4] // 1 source x1 pixels - pextrw eax, xmm2, 5 // get x2 integer. - punpckldq xmm0, xmm1 // x0 x1 - - movq qword ptr [edi], xmm0 - lea edi, [edi + 8] - - xloop29: - test ecx, 1 - je xloop99 - - // 1 Pixels. - movd xmm0, [esi + eax * 4] // 1 source x2 pixels - movd dword ptr [edi], xmm0 - xloop99: - - pop esi - pop edi - ret - } -} // Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version. // TODO(fbarchard): Port to Neon @@ -1258,104 +435,10 @@ static const uvec8 kShuffleFractions = { 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, }; -__declspec(naked) void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx) { - __asm { - push esi - push edi - mov edi, [esp + 8 + 4] // dst_argb - mov esi, [esp + 8 + 8] // src_argb - mov ecx, [esp + 8 + 12] // dst_width - movd xmm2, [esp + 8 + 16] // x - movd xmm3, [esp + 8 + 20] // dx - movdqa xmm4, xmmword ptr kShuffleColARGB - movdqa xmm5, xmmword ptr kShuffleFractions - pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction. - psrlw xmm6, 9 - pextrw eax, xmm2, 1 // get x0 integer. preroll - sub ecx, 2 - jl xloop29 - movdqa xmm0, xmm2 // x1 = x0 + dx - paddd xmm0, xmm3 - punpckldq xmm2, xmm0 // x0 x1 - punpckldq xmm3, xmm3 // dx dx - paddd xmm3, xmm3 // dx * 2, dx * 2 - pextrw edx, xmm2, 3 // get x1 integer. preroll - - // 2 Pixel loop. - xloop2: - movdqa xmm1, xmm2 // x0, x1 fractions. - paddd xmm2, xmm3 // x += dx - movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels - psrlw xmm1, 9 // 7 bit fractions. - movhps xmm0, qword ptr [esi + edx * 4] // 2 source x1 pixels - pshufb xmm1, xmm5 // 0000000011111111 - pshufb xmm0, xmm4 // arrange pixels into pairs - pxor xmm1, xmm6 // 0..7f and 7f..0 - pmaddubsw xmm0, xmm1 // argb_argb 16 bit, 2 pixels. - pextrw eax, xmm2, 1 // get x0 integer. next iteration. - pextrw edx, xmm2, 3 // get x1 integer. next iteration. - psrlw xmm0, 7 // argb 8.7 fixed point to low 8 bits. - packuswb xmm0, xmm0 // argb_argb 8 bits, 2 pixels. - movq qword ptr [edi], xmm0 - lea edi, [edi + 8] - sub ecx, 2 // 2 pixels - jge xloop2 - - xloop29: - - add ecx, 2 - 1 - jl xloop99 - - // 1 pixel remainder - psrlw xmm2, 9 // 7 bit fractions. - movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels - pshufb xmm2, xmm5 // 00000000 - pshufb xmm0, xmm4 // arrange pixels into pairs - pxor xmm2, xmm6 // 0..7f and 7f..0 - pmaddubsw xmm0, xmm2 // argb 16 bit, 1 pixel. - psrlw xmm0, 7 - packuswb xmm0, xmm0 // argb 8 bits, 1 pixel. - movd [edi], xmm0 - - xloop99: - - pop edi - pop esi - ret - } -} // Reads 4 pixels, duplicates them and writes 8 pixels. -__declspec(naked) void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx) { - __asm { - mov edx, [esp + 4] // dst_argb - mov eax, [esp + 8] // src_argb - mov ecx, [esp + 12] // dst_width - wloop: - movdqu xmm0, [eax] - lea eax, [eax + 16] - movdqa xmm1, xmm0 - punpckldq xmm0, xmm0 - punpckhdq xmm1, xmm1 - movdqu [edx], xmm0 - movdqu [edx + 16], xmm1 - lea edx, [edx + 32] - sub ecx, 8 - jg wloop - - ret - } -} // Divide num by div and return as 16.16 fixed point result. __declspec(naked) int FixedDiv_X86(int num, int div) { diff --git a/unit_test/compare_test.cc b/unit_test/compare_test.cc index c29562cb8..d1dce2ff2 100644 --- a/unit_test/compare_test.cc +++ b/unit_test/compare_test.cc @@ -240,21 +240,15 @@ TEST_F(LibYUVCompareTest, BenchmarkHammingDistance_Opt) { if (has_avx2) { h1 = HammingDistance_AVX2(src_a, src_b, kMaxWidth); } else { - int has_sse42 = TestCpuFlag(kCpuHasSSE42); - if (has_sse42) { h1 = HammingDistance_SSE42(src_a, src_b, kMaxWidth); } else { - int has_ssse3 = TestCpuFlag(kCpuHasSSSE3); - if (has_ssse3) { h1 = HammingDistance_SSSE3(src_a, src_b, kMaxWidth); } else { h1 = HammingDistance_C(src_a, src_b, kMaxWidth); } } } -#elif defined(HAS_HAMMINGDISTANCE_SSE42) - int has_sse42 = TestCpuFlag(kCpuHasSSE42); - if (has_sse42) { +#elif 0 h1 = HammingDistance_SSE42(src_a, src_b, kMaxWidth); } else { h1 = HammingDistance_C(src_a, src_b, kMaxWidth); @@ -361,21 +355,15 @@ TEST_F(LibYUVCompareTest, TestHammingDistance_Opt) { if (has_avx2) { h1 = HammingDistance_AVX2(src_a, src_b, kMaxWidth); } else { - int has_sse42 = TestCpuFlag(kCpuHasSSE42); - if (has_sse42) { h1 = HammingDistance_SSE42(src_a, src_b, kMaxWidth); } else { - int has_ssse3 = TestCpuFlag(kCpuHasSSSE3); - if (has_ssse3) { h1 = HammingDistance_SSSE3(src_a, src_b, kMaxWidth); } else { h1 = HammingDistance_C(src_a, src_b, kMaxWidth); } } } -#elif defined(HAS_HAMMINGDISTANCE_SSE42) - int has_sse42 = TestCpuFlag(kCpuHasSSE42); - if (has_sse42) { +#elif 0 h1 = HammingDistance_SSE42(src_a, src_b, kMaxWidth); } else { h1 = HammingDistance_C(src_a, src_b, kMaxWidth); diff --git a/unit_test/convert_argb_test.cc b/unit_test/convert_argb_test.cc index 177f3a669..141ae7cfb 100644 --- a/unit_test/convert_argb_test.cc +++ b/unit_test/convert_argb_test.cc @@ -762,9 +762,6 @@ TESTATOA(RAW, uint8_t, 3, 3, 1, RGB24, uint8_t, 3, 3, 1) TESTATOA(RGB24, uint8_t, 3, 3, 1, J400, uint8_t, 1, 1, 1) // TESTATOA(RGB24, uint8_t, 3, 3, 1, RGB24Mirror, uint8_t, 3, 3, 1) TESTATOA(RAW, uint8_t, 3, 3, 1, J400, uint8_t, 1, 1, 1) -#ifdef LITTLE_ENDIAN_ONLY_TEST -// TESTATOA(RGB565, uint8_t, 2, 2, 1, ARGB, uint8_t, 4, 4, 1) -#endif TESTATOA(RGBA, uint8_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1) // TESTATOA(UYVY, uint8_t, 2, 4, 1, ARGB, uint8_t, 4, 4, 1) // TESTATOA(YUY2, uint8_t, 2, 4, 1, ARGB, uint8_t, 4, 4, 1) @@ -1686,12 +1683,10 @@ TEST_F(LibYUVConvertTest, ARGBToAR30Row_Opt) { ARGBToAR30Row_C(src, dst_c, kPixels); int has_avx2 = TestCpuFlag(kCpuHasAVX2); - int has_ssse3 = TestCpuFlag(kCpuHasSSSE3); for (int i = 0; i < benchmark_iterations_; ++i) { if (has_avx2) { ARGBToAR30Row_AVX2(src, dst_opt, kPixels); } else if (has_ssse3) { - ARGBToAR30Row_SSSE3(src, dst_opt, kPixels); } else { ARGBToAR30Row_C(src, dst_opt, kPixels); } @@ -1720,12 +1715,10 @@ TEST_F(LibYUVConvertTest, ABGRToAR30Row_Opt) { ABGRToAR30Row_C(src, dst_c, kPixels); int has_avx2 = TestCpuFlag(kCpuHasAVX2); - int has_ssse3 = TestCpuFlag(kCpuHasSSSE3); for (int i = 0; i < benchmark_iterations_; ++i) { if (has_avx2) { ABGRToAR30Row_AVX2(src, dst_opt, kPixels); } else if (has_ssse3) { - ABGRToAR30Row_SSSE3(src, dst_opt, kPixels); } else { ABGRToAR30Row_C(src, dst_opt, kPixels); } diff --git a/unit_test/cpu_test.cc b/unit_test/cpu_test.cc index 80186de7a..1d39bc20e 100644 --- a/unit_test/cpu_test.cc +++ b/unit_test/cpu_test.cc @@ -168,10 +168,6 @@ TEST_F(LibYUVBaseTest, TestCpuHas) { defined(_M_X64) int has_x86 = TestCpuFlag(kCpuHasX86); if (has_x86) { - int has_sse2 = TestCpuFlag(kCpuHasSSE2); - int has_ssse3 = TestCpuFlag(kCpuHasSSSE3); - int has_sse41 = TestCpuFlag(kCpuHasSSE41); - int has_sse42 = TestCpuFlag(kCpuHasSSE42); int has_avx = TestCpuFlag(kCpuHasAVX); int has_avx2 = TestCpuFlag(kCpuHasAVX2); int has_erms = TestCpuFlag(kCpuHasERMS); @@ -190,10 +186,7 @@ TEST_F(LibYUVBaseTest, TestCpuHas) { int has_avxvnniint8 = TestCpuFlag(kCpuHasAVXVNNIINT8); int has_amxint8 = TestCpuFlag(kCpuHasAMXINT8); printf("Has X86 0x%x\n", has_x86); - printf("Has SSE2 0x%x\n", has_sse2); printf("Has SSSE3 0x%x\n", has_ssse3); - printf("Has SSE4.1 0x%x\n", has_sse41); - printf("Has SSE4.2 0x%x\n", has_sse42); printf("Has AVX 0x%x\n", has_avx); printf("Has AVX2 0x%x\n", has_avx2); printf("Has ERMS 0x%x\n", has_erms); diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc index 2e26b4cf6..80674b94a 100644 --- a/unit_test/planar_test.cc +++ b/unit_test/planar_test.cc @@ -1536,17 +1536,11 @@ TEST_F(LibYUVPlanarTest, TestAffine) { EXPECT_EQ(96u, interpolate_pixels_C[128][0]); EXPECT_EQ(191u, interpolate_pixels_C[255][3]); -#if defined(HAS_ARGBAFFINEROW_SSE2) +#if 0 SIMD_ALIGNED(uint8_t interpolate_pixels_Opt[1280][4]); - ARGBAffineRow_SSE2(&orig_pixels_0[0][0], 0, &interpolate_pixels_Opt[0][0], - uv_step, 1280); EXPECT_EQ(0, memcmp(interpolate_pixels_Opt, interpolate_pixels_C, 1280 * 4)); - int has_sse2 = TestCpuFlag(kCpuHasSSE2); - if (has_sse2) { for (int i = 0; i < benchmark_pixels_div1280_; ++i) { - ARGBAffineRow_SSE2(&orig_pixels_0[0][0], 0, &interpolate_pixels_Opt[0][0], - uv_step, 1280); } } #endif @@ -3916,14 +3910,11 @@ TEST_F(LibYUVPlanarTest, Convert16To8Row_Opt) { dst_pixels_y_c, 16384, kPixels); int has_avx2 = TestCpuFlag(kCpuHasAVX2); - int has_ssse3 = TestCpuFlag(kCpuHasSSSE3); for (int i = 0; i < benchmark_iterations_; ++i) { if (has_avx2) { Convert16To8Row_AVX2(reinterpret_cast(src_pixels_y), dst_pixels_y_opt, 16384, kPixels); } else if (has_ssse3) { - Convert16To8Row_SSSE3(reinterpret_cast(src_pixels_y), - dst_pixels_y_opt, 16384, kPixels); } else { Convert16To8Row_C(reinterpret_cast(src_pixels_y), dst_pixels_y_opt, 16384, kPixels); @@ -4020,16 +4011,11 @@ TEST_F(LibYUVPlanarTest, Convert8To16Row_Opt) { 1024, kPixels); int has_avx2 = TestCpuFlag(kCpuHasAVX2); - int has_sse2 = TestCpuFlag(kCpuHasSSE2); for (int i = 0; i < benchmark_iterations_; ++i) { if (has_avx2) { Convert8To16Row_AVX2(src_pixels_y, reinterpret_cast(dst_pixels_y_opt), 1024, kPixels); - } else if (has_sse2) { - Convert8To16Row_SSE2(src_pixels_y, - reinterpret_cast(dst_pixels_y_opt), 1024, - kPixels); } else { Convert8To16Row_C(src_pixels_y, reinterpret_cast(dst_pixels_y_opt), 1024, diff --git a/unit_test/rotate_test.cc b/unit_test/rotate_test.cc index abc08efa8..a2d6daa00 100644 --- a/unit_test/rotate_test.cc +++ b/unit_test/rotate_test.cc @@ -892,10 +892,8 @@ TEST_F(LibYUVRotateTest, Transpose4x4_Test) { Transpose4x4_32_NEON((const uint8_t*)src_pixels, height * 4, (uint8_t*)dst_pixels_opt, width * 4, width); } else -#elif defined(HAS_TRANSPOSE4X4_32_SSE2) +#elif 0 if (TestCpuFlag(kCpuHasSSE2)) { - Transpose4x4_32_SSE2((const uint8_t*)src_pixels, height * 4, - (uint8_t*)dst_pixels_opt, width * 4, width); } else #endif { @@ -938,8 +936,6 @@ TEST_F(LibYUVRotateTest, Transpose4x4_Opt) { Transpose4x4_32_AVX2((const uint8_t*)src_pixels, height * 4, (uint8_t*)dst_pixels_opt, width * 4, width); } else if (TestCpuFlag(kCpuHasSSE2)) { - Transpose4x4_32_SSE2((const uint8_t*)src_pixels, height * 4, - (uint8_t*)dst_pixels_opt, width * 4, width); } else #endif { diff --git a/unit_test/scale_plane_test.cc b/unit_test/scale_plane_test.cc index 979c70aad..c9b773f52 100644 --- a/unit_test/scale_plane_test.cc +++ b/unit_test/scale_plane_test.cc @@ -43,7 +43,7 @@ namespace libyuv { #ifdef ENABLE_ROW_TESTS -#ifdef HAS_SCALEROWDOWN2_SSSE3 +#if 0 TEST_F(LibYUVScaleTest, TestScaleRowDown2Box_Odd_SSSE3) { SIMD_ALIGNED(uint8_t orig_pixels[128 * 2]); SIMD_ALIGNED(uint8_t dst_pixels_opt[64]); @@ -52,7 +52,6 @@ TEST_F(LibYUVScaleTest, TestScaleRowDown2Box_Odd_SSSE3) { memset(dst_pixels_opt, 0, sizeof(dst_pixels_opt)); memset(dst_pixels_c, 0, sizeof(dst_pixels_c)); - int has_ssse3 = TestCpuFlag(kCpuHasSSSE3); if (!has_ssse3) { printf("Warning SSSE3 not detected; Skipping test.\n"); } else { @@ -114,7 +113,6 @@ TEST_F(LibYUVScaleTest, TestScaleRowDown2Box_Odd_SSSE3) { EXPECT_EQ(0u, dst_pixels_c[63]); // Test regular half size SSSE3. - ScaleRowDown2Box_SSSE3(orig_pixels, 128, dst_pixels_opt, 64); EXPECT_EQ(64u, dst_pixels_opt[0]); EXPECT_EQ(25u, dst_pixels_opt[1]); @@ -125,7 +123,6 @@ TEST_F(LibYUVScaleTest, TestScaleRowDown2Box_Odd_SSSE3) { // Compare C and SSSE3 match. ScaleRowDown2Box_Odd_C(orig_pixels, 128, dst_pixels_c, 64); - ScaleRowDown2Box_Odd_SSSE3(orig_pixels, 128, dst_pixels_opt, 64); for (int i = 0; i < 64; ++i) { EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); } diff --git a/util/psnr.cc b/util/psnr.cc index c7bee7f97..170a4b835 100644 --- a/util/psnr.cc +++ b/util/psnr.cc @@ -106,128 +106,11 @@ static uint32_t SumSquareError_NEON(const uint8_t* src_a, return sse; } #elif !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) -#define HAS_SUMSQUAREERROR_SSE2 -__declspec(naked) static uint32_t SumSquareError_SSE2(const uint8_t* /*src_a*/, - const uint8_t* /*src_b*/, - int /*count*/) { - __asm { - mov eax, [esp + 4] // src_a - mov edx, [esp + 8] // src_b - mov ecx, [esp + 12] // count - pxor xmm0, xmm0 - pxor xmm5, xmm5 - sub edx, eax - wloop: - movdqu xmm1, [eax] - movdqu xmm2, [eax + edx] - lea eax, [eax + 16] - movdqu xmm3, xmm1 - psubusb xmm1, xmm2 - psubusb xmm2, xmm3 - por xmm1, xmm2 - movdqu xmm2, xmm1 - punpcklbw xmm1, xmm5 - punpckhbw xmm2, xmm5 - pmaddwd xmm1, xmm1 - pmaddwd xmm2, xmm2 - paddd xmm0, xmm1 - paddd xmm0, xmm2 - sub ecx, 16 - ja wloop - - pshufd xmm1, xmm0, 0EEh - paddd xmm0, xmm1 - pshufd xmm1, xmm0, 01h - paddd xmm0, xmm1 - movd eax, xmm0 - ret - } -} #elif !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) -#define HAS_SUMSQUAREERROR_SSE2 -static uint32_t SumSquareError_SSE2(const uint8_t* src_a, - const uint8_t* src_b, - int count) { - uint32_t sse; - asm volatile( // NOLINT - "pxor %%xmm0,%%xmm0 \n" - "pxor %%xmm5,%%xmm5 \n" - "sub %0,%1 \n" - "1: \n" - "movdqu (%0),%%xmm1 \n" - "movdqu (%0,%1,1),%%xmm2 \n" - "lea 0x10(%0),%0 \n" - "movdqu %%xmm1,%%xmm3 \n" - "psubusb %%xmm2,%%xmm1 \n" - "psubusb %%xmm3,%%xmm2 \n" - "por %%xmm2,%%xmm1 \n" - "movdqu %%xmm1,%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm1 \n" - "punpckhbw %%xmm5,%%xmm2 \n" - "pmaddwd %%xmm1,%%xmm1 \n" - "pmaddwd %%xmm2,%%xmm2 \n" - "paddd %%xmm1,%%xmm0 \n" - "paddd %%xmm2,%%xmm0 \n" - "sub $0x10,%2 \n" - "ja 1b \n" - - "pshufd $0xee,%%xmm0,%%xmm1 \n" - "paddd %%xmm1,%%xmm0 \n" - "pshufd $0x1,%%xmm0,%%xmm1 \n" - "paddd %%xmm1,%%xmm0 \n" - "movd %%xmm0,%3 \n" - - : "+r"(src_a), // %0 - "+r"(src_b), // %1 - "+r"(count), // %2 - "=g"(sse) // %3 - : - : "memory", "cc" -#if defined(__SSE2__) - , - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" -#endif - ); // NOLINT - return sse; -} #endif // LIBYUV_DISABLE_X86 etc -#if defined(HAS_SUMSQUAREERROR_SSE2) -#if (defined(__pic__) || defined(__APPLE__)) && defined(__i386__) -static __inline void __cpuid(int cpu_info[4], int info_type) { - asm volatile( // NOLINT - "mov %%ebx, %%edi \n" - "cpuid \n" - "xchg %%edi, %%ebx \n" - : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), - "=d"(cpu_info[3]) - : "a"(info_type)); -} -// For gcc/clang but not clangcl. -#elif !defined(_MSC_VER) && (defined(__i386__) || defined(__x86_64__)) -static __inline void __cpuid(int cpu_info[4], int info_type) { - asm volatile( // NOLINT - "cpuid \n" - : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), - "=d"(cpu_info[3]) - : "a"(info_type)); -} -#endif - -static int CpuHasSSE2() { -#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) - int cpu_info[4]; - __cpuid(cpu_info, 1); - if (cpu_info[3] & 0x04000000) { - return 1; - } -#endif - return 0; -} -#endif // HAS_SUMSQUAREERROR_SSE2 - static uint32_t SumSquareError_C(const uint8_t* src_a, const uint8_t* src_b, int count) { @@ -246,11 +129,6 @@ double ComputeSumSquareError(const uint8_t* src_a, int count) = SumSquareError_C; #if defined(HAS_SUMSQUAREERROR_NEON) SumSquareError = SumSquareError_NEON; -#endif -#if defined(HAS_SUMSQUAREERROR_SSE2) - if (CpuHasSSE2()) { - SumSquareError = SumSquareError_SSE2; - } #endif const int kBlockSize = 1 << 15; uint64_t sse = 0;