diff --git a/include/libyuv/compare_row.h b/include/libyuv/compare_row.h index a08734e90..231a0177b 100644 --- a/include/libyuv/compare_row.h +++ b/include/libyuv/compare_row.h @@ -19,14 +19,6 @@ namespace libyuv { extern "C" { #endif -// The following are available for Visual C and GCC: -#if !defined(LIBYUV_DISABLE_X86) && \ - ((defined(__x86_64__) && !defined(LIBYUV_ENABLE_ROWWIN)) || \ - defined(__i386__) || defined(_M_IX86)) -#define HAS_HASHDJB2_SSE41 -#define HAS_SUMSQUAREERROR_SSE2 -#define HAS_HAMMINGDISTANCE_SSE42 -#endif // The following are available for Visual C and clangcl 32 bit: #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) && \ @@ -36,12 +28,6 @@ extern "C" { #define HAS_SUMSQUAREERROR_AVX2 #endif -// The following are available for GCC and clangcl: -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(__x86_64__) || defined(__i386__)) && \ - !defined(LIBYUV_ENABLE_ROWWIN) -#define HAS_HAMMINGDISTANCE_SSSE3 -#endif // The following are available for GCC and clangcl: #if !defined(LIBYUV_DISABLE_X86) && defined(CLANG_HAS_AVX2) && \ @@ -68,12 +54,6 @@ extern "C" { uint32_t HammingDistance_C(const uint8_t* src_a, const uint8_t* src_b, int count); -uint32_t HammingDistance_SSE42(const uint8_t* src_a, - const uint8_t* src_b, - int count); -uint32_t HammingDistance_SSSE3(const uint8_t* src_a, - const uint8_t* src_b, - int count); uint32_t HammingDistance_AVX2(const uint8_t* src_a, const uint8_t* src_b, int count); @@ -86,9 +66,6 @@ uint32_t HammingDistance_NEON_DotProd(const uint8_t* src_a, uint32_t SumSquareError_C(const uint8_t* src_a, const uint8_t* src_b, int count); -uint32_t SumSquareError_SSE2(const uint8_t* src_a, - const uint8_t* src_b, - int count); uint32_t SumSquareError_AVX2(const uint8_t* src_a, const uint8_t* src_b, int count); @@ -100,7 +77,6 @@ uint32_t SumSquareError_NEON_DotProd(const uint8_t* src_a, int count); uint32_t HashDjb2_C(const uint8_t* src, int count, uint32_t seed); -uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed); uint32_t HashDjb2_AVX2(const uint8_t* src, int count, uint32_t seed); uint32_t HashDjb2_NEON(const uint8_t* src, int count, uint32_t seed); diff --git a/include/libyuv/rotate_row.h b/include/libyuv/rotate_row.h index 49f8a44bf..b84764657 100644 --- a/include/libyuv/rotate_row.h +++ b/include/libyuv/rotate_row.h @@ -20,28 +20,14 @@ extern "C" { #endif // The following are available for Visual C 32 bit: -// TODO - port to clangcl on rotate_win -#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) && \ - !defined(__clang__) -#define HAS_TRANSPOSEWX8_SSSE3 -#define HAS_TRANSPOSEUVWX8_SSE2 -#endif // The following are available for GCC 32 or 64 bit: #if !defined(LIBYUV_DISABLE_X86) && \ (defined(__i386__) || defined(__x86_64__)) && \ !defined(LIBYUV_ENABLE_ROWWIN) -#define HAS_TRANSPOSEWX8_SSSE3 -#define HAS_TRANSPOSE4X4_32_SSE2 #define HAS_TRANSPOSE4X4_32_AVX2 #endif -// The following are available for 64 bit GCC: -#if !defined(LIBYUV_DISABLE_X86) && defined(__x86_64__) && \ - !defined(LIBYUV_ENABLE_ROWWIN) -#define HAS_TRANSPOSEWX8_FAST_SSSE3 -#define HAS_TRANSPOSEUVWX8_SSE2 -#endif #if !defined(LIBYUV_DISABLE_NEON) && \ (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__)) @@ -98,16 +84,6 @@ void TransposeWxH_SME(const uint8_t* src, int dst_stride, int width, int height); -void TransposeWx8_SSSE3(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width); -void TransposeWx8_Fast_SSSE3(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width); void TransposeWx16_LSX(const uint8_t* src, int src_stride, uint8_t* dst, @@ -124,16 +100,6 @@ void TransposeWx16_Any_NEON(const uint8_t* src, uint8_t* dst, int dst_stride, int width); -void TransposeWx8_Any_SSSE3(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width); -void TransposeWx8_Fast_Any_SSSE3(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width); void TransposeWx16_Any_LSX(const uint8_t* src, int src_stride, uint8_t* dst, @@ -163,13 +129,6 @@ void TransposeUVWx16_C(const uint8_t* src, uint8_t* dst_b, int dst_stride_b, int width); -void TransposeUVWx8_SSE2(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int width); void TransposeUVWx8_NEON(const uint8_t* src, int src_stride, uint8_t* dst_a, @@ -193,13 +152,6 @@ void TransposeUVWx16_LSX(const uint8_t* src, int dst_stride_b, int width); -void TransposeUVWx8_Any_SSE2(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int width); void TransposeUVWx8_Any_NEON(const uint8_t* src, int src_stride, uint8_t* dst_a, @@ -239,11 +191,6 @@ void Transpose4x4_32_NEON(const uint8_t* src, int dst_stride, int width); -void Transpose4x4_32_SSE2(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width); void Transpose4x4_32_AVX2(const uint8_t* src, int src_stride, diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 40272cf5a..b8e3294ca 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -36,85 +36,14 @@ extern "C" { // The following are available on clang x86 platforms: #if defined(USE_ROW_GCC) // Conversions: -#define HAS_ARGB1555TOARGBROW_SSE2 -#define HAS_ARGB4444TOARGBROW_SSE2 -#define HAS_ARGBEXTRACTALPHAROW_SSE2 #define HAS_ARGBSETROW_X86 -#define HAS_ARGBSHUFFLEROW_SSSE3 -#define HAS_ARGBTOARGB1555ROW_SSE2 -#define HAS_ARGBTOARGB4444ROW_SSE2 -#define HAS_ARGBTORAWROW_SSSE3 -#define HAS_ARGBTORGB24ROW_SSSE3 -#define HAS_ARGBTORGB565DITHERROW_SSE2 -#define HAS_ARGBTORGB565ROW_SSE2 #define HAS_COPYROW_ERMS -#define HAS_COPYROW_SSE2 -#define HAS_H422TOARGBROW_SSSE3 -#define HAS_HALFFLOATROW_SSE2 -#define HAS_I422TOARGB1555ROW_SSSE3 -#define HAS_I422TOARGB4444ROW_SSSE3 -#define HAS_I422TOARGBROW_SSSE3 -#define HAS_I422TORGB24ROW_SSSE3 -#define HAS_I422TORGB565ROW_SSSE3 -#define HAS_I422TORGBAROW_SSSE3 -#define HAS_I422TOUYVYROW_SSE2 -#define HAS_I422TOYUY2ROW_SSE2 -#define HAS_I444TOARGBROW_SSSE3 -#define HAS_I444TORGB24ROW_SSSE3 -#define HAS_INTERPOLATEROW_SSSE3 -#define HAS_J400TOARGBROW_SSE2 -#define HAS_J422TOARGBROW_SSSE3 -#define HAS_MERGEUVROW_SSE2 -#define HAS_MIRRORROW_SSSE3 -#define HAS_MIRRORSPLITUVROW_SSSE3 -#define HAS_NV12TOARGBROW_SSSE3 -#define HAS_NV12TORGB24ROW_SSSE3 -#define HAS_NV12TORGB565ROW_SSSE3 -#define HAS_NV21TOARGBROW_SSSE3 -#define HAS_NV21TORGB24ROW_SSSE3 -#define HAS_RAWTOARGBROW_SSSE3 -#define HAS_RAWTORGB24ROW_SSSE3 -#define HAS_RGB24TOARGBROW_SSSE3 -#define HAS_RGB565TOARGBROW_SSE2 #define HAS_SETROW_ERMS #define HAS_SETROW_X86 -#define HAS_SPLITUVROW_SSE2 -#define HAS_UYVYTOARGBROW_SSSE3 -#define HAS_UYVYTOUV422ROW_SSE2 -#define HAS_UYVYTOUVROW_SSE2 -#define HAS_UYVYTOYROW_SSE2 -#define HAS_YUY2TOARGBROW_SSSE3 -#define HAS_YUY2TOUV422ROW_SSE2 -#define HAS_YUY2TOUVROW_SSE2 -#define HAS_YUY2TOYROW_SSE2 // Effects: -#define HAS_ARGBADDROW_SSE2 -#define HAS_ARGBAFFINEROW_SSE2 -#define HAS_ARGBBLENDROW_SSSE3 -#define HAS_ARGBCOLORMATRIXROW_SSSE3 #define HAS_ARGBCOLORTABLEROW_X86 -#define HAS_ARGBCOPYALPHAROW_SSE2 -#define HAS_ARGBCOPYYTOALPHAROW_SSE2 -#define HAS_ARGBGRAYROW_SSSE3 -#define HAS_ARGBLUMACOLORTABLEROW_SSSE3 -#define HAS_ARGBMIRRORROW_SSE2 -// TODO: Re-enable once rounding behaviour is fixed. -// #define HAS_ARGBMULTIPLYROW_SSE2 -#define HAS_ARGBPOLYNOMIALROW_SSE2 -#define HAS_ARGBQUANTIZEROW_SSE2 -#define HAS_ARGBSEPIAROW_SSSE3 -#define HAS_ARGBSHADEROW_SSE2 -#define HAS_ARGBSUBTRACTROW_SSE2 -#define HAS_BLENDPLANEROW_SSSE3 -#define HAS_COMPUTECUMULATIVESUMROW_SSE2 -#define HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 #define HAS_RGBCOLORTABLEROW_X86 -#define HAS_SOBELROW_SSE2 -#define HAS_SOBELTOPLANEROW_SSE2 -#define HAS_SOBELXROW_SSE2 -#define HAS_SOBELXYROW_SSE2 -#define HAS_SOBELYROW_SSE2 // The following functions fail on gcc/clang 32 bit with fpic and framepointer. // caveat: clangcl uses row_win.cc which works. @@ -123,8 +52,6 @@ extern "C" { !defined(LIBYUV_ENABLE_ROWWIN) // TODO(fbarchard): fix build error on android_full_debug=1 // https://code.google.com/p/libyuv/issues/detail?id=517 -#define HAS_I422ALPHATOARGBROW_SSSE3 -#define HAS_I444ALPHATOARGBROW_SSSE3 #endif #if (defined(__x86_64__) || !defined(__pic__) || defined(__clang__) || \ defined(_MSC_VER)) && \ @@ -200,77 +127,11 @@ extern "C" { #if !defined(LIBYUV_DISABLE_X86) && defined(USE_ROW_GCC) && \ (defined(__x86_64__) || defined(__i386__)) && \ !defined(LIBYUV_ENABLE_ROWWIN) -#define HAS_AB64TOARGBROW_SSSE3 -#define HAS_ABGRTOAR30ROW_SSSE3 -#define HAS_ABGRTOYJROW_SSSE3 -#define HAS_AR64TOARGBROW_SSSE3 -#define HAS_ARGBATTENUATEROW_SSSE3 -#define HAS_ARGBTOAB64ROW_SSSE3 -#define HAS_ARGBTOAR30ROW_SSSE3 -#define HAS_ARGBTOAR64ROW_SSSE3 -#define HAS_ARGBTOUV444ROW_SSSE3 -#define HAS_ARGBTOUVJ444ROW_SSSE3 -#define HAS_ARGBUNATTENUATEROW_SSE2 -#define HAS_CONVERT16TO8ROW_SSSE3 -#define HAS_CONVERT8TO16ROW_SSE2 -#define HAS_DETILEROW_16_SSE2 -#define HAS_DETILEROW_SSE2 -#define HAS_DETILESPLITUVROW_SSSE3 -#define HAS_DETILETOYUY2_SSE2 -#define HAS_HALFMERGEUVROW_SSSE3 -#define HAS_I210TOAR30ROW_SSSE3 -#define HAS_I210TOARGBROW_SSSE3 -#define HAS_I212TOAR30ROW_SSSE3 -#define HAS_I212TOARGBROW_SSSE3 -#define HAS_I400TOARGBROW_SSE2 -#define HAS_I410TOAR30ROW_SSSE3 -#define HAS_I410TOARGBROW_SSSE3 -#define HAS_I422TOAR30ROW_SSSE3 -#define HAS_MERGEARGBROW_SSE2 -#define HAS_MERGERGBROW_SSSE3 -#define HAS_MERGEXRGBROW_SSE2 -#define HAS_MIRRORUVROW_SSSE3 -#define HAS_NV21TOYUV24ROW_SSSE3 -#define HAS_P210TOAR30ROW_SSSE3 -#define HAS_P210TOARGBROW_SSSE3 -#define HAS_P410TOAR30ROW_SSSE3 -#define HAS_P410TOARGBROW_SSSE3 #define HAS_RAWTOARGBROW_AVX2 -#define HAS_RAWTORGBAROW_SSSE3 -#define HAS_RGB24MIRRORROW_SSSE3 -#define HAS_RGBATOYJROW_SSSE3 -#define HAS_SPLITARGBROW_SSE2 -#define HAS_SPLITARGBROW_SSSE3 -#define HAS_SPLITRGBROW_SSE41 -#define HAS_SPLITRGBROW_SSSE3 -#define HAS_SPLITXRGBROW_SSE2 -#define HAS_SPLITXRGBROW_SSSE3 -#define HAS_SWAPUVROW_SSSE3 -#define HAS_YUY2TONVUVROW_SSE2 // TODO: port row_win to use 8 bit coefficients. -#define HAS_ARGBTOYJROW_SSSE3 -#define HAS_ARGBTOYROW_SSSE3 -#define HAS_ARGBTOYMATRIXROW_SSSE3 -#define HAS_BGRATOYROW_SSSE3 -#define HAS_ABGRTOYROW_SSSE3 -#define HAS_RGBATOYROW_SSSE3 // TODO: adjust row_win to use 8 bit negative coefficients. -#define HAS_ABGRTOUVJROW_SSSE3 -#define HAS_ARGBTOUVJROW_SSSE3 -#define HAS_ABGRTOUVROW_SSSE3 -#define HAS_ARGBTOUVROW_SSSE3 -#define HAS_BGRATOUVROW_SSSE3 -#define HAS_RGBATOUVROW_SSSE3 -#define HAS_ARGBTOUVMATRIXROW_SSSE3 -#define HAS_ARGBTOUV444MATRIXROW_SSSE3 -#if defined(__x86_64__) || !defined(__pic__) -// TODO(fbarchard): fix build error on android_full_debug=1 -// https://code.google.com/p/libyuv/issues/detail?id=517 -#define HAS_I210ALPHATOARGBROW_SSSE3 -#define HAS_I410ALPHATOARGBROW_SSSE3 -#endif #endif // The following are available for AVX2 gcc/clang x86 platforms: @@ -1753,30 +1614,22 @@ void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width); void ARGBToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width); void ABGRToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width); void ARGBToYRow_AVX512BW(const uint8_t* src_argb, uint8_t* dst_y, int width); -void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width); void ARGBToYJRow_AVX512BW(const uint8_t* src_argb, uint8_t* dst_y, int width); void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width); void ARGBToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width); void ABGRToYRow_AVX512BW(const uint8_t* src_abgr, uint8_t* dst_y, int width); void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width); -void ABGRToYJRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width); void ABGRToYJRow_AVX512BW(const uint8_t* src_abgr, uint8_t* dst_y, int width); void ABGRToYJRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width); void ABGRToYJRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width); void ABGRToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width); void RGBAToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RGBAToYJRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width); void RGBAToYJRow_AVX512BW(const uint8_t* src_rgba, uint8_t* dst_y, int width); void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width); -void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width); void BGRAToYRow_AVX512BW(const uint8_t* src_bgra, uint8_t* dst_y, int width); void BGRAToYRow_AVX2(const uint8_t* src_bgra, uint8_t* dst_y, int width); -void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width); -void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width); void RGBAToYRow_AVX512BW(const uint8_t* src_rgba, uint8_t* dst_y, int width); void RGBAToYRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width); void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width); @@ -2167,12 +2020,6 @@ void ARGBToUVMatrixRow_C(const uint8_t* src_argb, uint8_t* dst_v, int width, const struct ArgbConstants* c); -void ARGBToUVMatrixRow_SSSE3(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width, - const struct ArgbConstants* c); void ARGBToUVMatrixRow_AVX2(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, @@ -2190,10 +2037,6 @@ void ARGBToUV444MatrixRow_C(const uint8_t* src_argb, uint8_t* dst_v, int width, const struct ArgbConstants* c); -void ARGBToYMatrixRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_y, - int width, - const struct ArgbConstants* c); void ARGBToYMatrixRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width, @@ -2202,10 +2045,6 @@ void ARGBToYMatrixRow_AVX512BW(const uint8_t* src_argb, uint8_t* dst_y, int width, const struct ArgbConstants* c); -void ARGBToYMatrixRow_Any_SSSE3(const uint8_t* src_argb, - uint8_t* dst_y, - int width, - const struct ArgbConstants* c); void ARGBToYMatrixRow_Any_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width, @@ -2251,11 +2090,6 @@ void ARGBToYMatrixRow_Any_LASX(const uint8_t* src_argb, const struct ArgbConstants* c); -void ARGBToUV444MatrixRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width, - const struct ArgbConstants* c); void ARGBToUV444MatrixRow_AVX2(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, @@ -2266,12 +2100,6 @@ void ARGBToUV444MatrixRow_AVX512BW(const uint8_t* src_argb, uint8_t* dst_v, int width, const struct ArgbConstants* c); -void ARGBToUVMatrixRow_Any_SSSE3(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width, - const struct ArgbConstants* c); void ARGBToUVMatrixRow_Any_AVX2(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, @@ -2284,11 +2112,6 @@ void ARGBToUVMatrixRow_Any_AVX512BW(const uint8_t* src_argb, uint8_t* dst_v, int width, const struct ArgbConstants* c); -void ARGBToUV444MatrixRow_Any_SSSE3(const uint8_t* src_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width, - const struct ArgbConstants* c); void ARGBToUV444MatrixRow_Any_AVX2(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, @@ -2317,13 +2140,6 @@ void RGBAToYRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr, int width void RGBAToYJRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void BGRAToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void BGRAToYRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void ARGBToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void ARGBToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void ABGRToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RGBAToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void BGRAToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void ABGRToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RGBAToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToYRow_Any_NEON_DotProd(const uint8_t* src_ptr, uint8_t* dst_ptr, @@ -2424,36 +2240,6 @@ void ABGRToUVJRow_AVX512BW(const uint8_t* src_abgr, uint8_t* dst_u, uint8_t* dst_v, int width); -void ARGBToUVRow_SSSE3(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGBToUVJRow_SSSE3(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ABGRToUVJRow_SSSE3(const uint8_t* src_abgr, - int src_stride_abgr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void BGRAToUVRow_SSSE3(const uint8_t* src_bgra, - int src_stride_bgra, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ABGRToUVRow_SSSE3(const uint8_t* src_abgr, - int src_stride_abgr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void RGBAToUVRow_SSSE3(const uint8_t* src_rgba, - int src_stride_rgba, - uint8_t* dst_u, - uint8_t* dst_v, - int width); void ARGBToUVRow_Any_AVX2(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u, @@ -2494,36 +2280,6 @@ void ABGRToUVJRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); -void ARGBToUVRow_Any_SSSE3(const uint8_t* src_ptr, - int src_stride, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGBToUVJRow_Any_SSSE3(const uint8_t* src_ptr, - int src_stride, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ABGRToUVJRow_Any_SSSE3(const uint8_t* src_ptr, - int src_stride, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void BGRAToUVRow_Any_SSSE3(const uint8_t* src_ptr, - int src_stride, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ABGRToUVRow_Any_SSSE3(const uint8_t* src_ptr, - int src_stride, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void RGBAToUVRow_Any_SSSE3(const uint8_t* src_ptr, - int src_stride, - uint8_t* dst_u, - uint8_t* dst_v, - int width); void ARGBToUV444Row_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, @@ -2854,23 +2610,7 @@ void ARGB4444ToUVRow_C(const uint8_t* src_argb4444, uint8_t* dst_v, int width); -void ARGBToUV444Row_SSSE3(const uint8_t* src_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGBToUV444Row_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGBToUVJ444Row_SSSE3(const uint8_t* src_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void ARGBToUVJ444Row_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); void ARGBToUV444Row_AVX2(const uint8_t* src_argb, uint8_t* dst_u, @@ -2917,33 +2657,24 @@ void ARGBToUVJ444Row_C(const uint8_t* src_argb, int width); void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width); -void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width); void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width); void MirrorRow_LSX(const uint8_t* src, uint8_t* dst, int width); void MirrorRow_LASX(const uint8_t* src, uint8_t* dst, int width); void MirrorRow_C(const uint8_t* src, uint8_t* dst, int width); void MirrorRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void MirrorRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void MirrorRow_Any_SSE2(const uint8_t* src, uint8_t* dst, int width); void MirrorRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void MirrorRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void MirrorRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width); -void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width); void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width); void MirrorUVRow_LSX(const uint8_t* src_uv, uint8_t* dst_uv, int width); void MirrorUVRow_LASX(const uint8_t* src_uv, uint8_t* dst_uv, int width); void MirrorUVRow_C(const uint8_t* src_uv, uint8_t* dst_uv, int width); void MirrorUVRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void MirrorUVRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void MirrorUVRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void MirrorUVRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void MirrorUVRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void MirrorSplitUVRow_SSSE3(const uint8_t* src, - uint8_t* dst_u, - uint8_t* dst_v, - int width); void MirrorSplitUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, @@ -2960,7 +2691,6 @@ void MirrorSplitUVRow_C(const uint8_t* src_uv, void MirrorRow_16_C(const uint16_t* src, uint16_t* dst, int width); void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width); -void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width); void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width); void ARGBMirrorRow_LSX(const uint8_t* src, uint8_t* dst, int width); void ARGBMirrorRow_LASX(const uint8_t* src, uint8_t* dst, int width); @@ -2968,9 +2698,6 @@ void ARGBMirrorRow_C(const uint8_t* src, uint8_t* dst, int width); void ARGBMirrorRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void ARGBMirrorRow_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); void ARGBMirrorRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -2979,16 +2706,10 @@ void ARGBMirrorRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24, - uint8_t* dst_rgb24, - int width); void RGB24MirrorRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_rgb24, int width); void RGB24MirrorRow_C(const uint8_t* src_rgb24, uint8_t* dst_rgb24, int width); -void RGB24MirrorRow_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); void RGB24MirrorRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -2997,10 +2718,6 @@ void SplitUVRow_C(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width); -void SplitUVRow_SSE2(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width); void SplitUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, @@ -3017,10 +2734,6 @@ void SplitUVRow_RVV(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width); -void SplitUVRow_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); void SplitUVRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, @@ -3045,14 +2758,6 @@ void DetileRow_Any_NEON(const uint8_t* src, ptrdiff_t src_tile_stride, uint8_t* dst, int width); -void DetileRow_SSE2(const uint8_t* src, - ptrdiff_t src_tile_stride, - uint8_t* dst, - int width); -void DetileRow_Any_SSE2(const uint8_t* src, - ptrdiff_t src_tile_stride, - uint8_t* dst, - int width); void DetileRow_AVX(const uint8_t* src, ptrdiff_t src_tile_stride, uint8_t* dst, @@ -3073,14 +2778,6 @@ void DetileRow_16_Any_NEON(const uint16_t* src, ptrdiff_t src_tile_stride, uint16_t* dst, int width); -void DetileRow_16_SSE2(const uint16_t* src, - ptrdiff_t src_tile_stride, - uint16_t* dst, - int width); -void DetileRow_16_Any_SSE2(const uint16_t* src, - ptrdiff_t src_tile_stride, - uint16_t* dst, - int width); void DetileRow_16_AVX(const uint16_t* src, ptrdiff_t src_tile_stride, uint16_t* dst, @@ -3094,16 +2791,6 @@ void DetileSplitUVRow_C(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width); -void DetileSplitUVRow_SSSE3(const uint8_t* src_uv, - ptrdiff_t src_tile_stride, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void DetileSplitUVRow_Any_SSSE3(const uint8_t* src_uv, - ptrdiff_t src_tile_stride, - uint8_t* dst_u, - uint8_t* dst_v, - int width); void DetileSplitUVRow_NEON(const uint8_t* src_uv, ptrdiff_t src_tile_stride, uint8_t* dst_u, @@ -3120,18 +2807,6 @@ void DetileToYUY2_C(const uint8_t* src_y, ptrdiff_t src_uv_tile_stride, uint8_t* dst_yuy2, int width); -void DetileToYUY2_SSE2(const uint8_t* src_y, - ptrdiff_t src_y_tile_stride, - const uint8_t* src_uv, - ptrdiff_t src_uv_tile_stride, - uint8_t* dst_yuy2, - int width); -void DetileToYUY2_Any_SSE2(const uint8_t* src_y, - ptrdiff_t src_y_tile_stride, - const uint8_t* src_uv, - ptrdiff_t src_uv_tile_stride, - uint8_t* dst_yuy2, - int width); void DetileToYUY2_NEON(const uint8_t* src_y, ptrdiff_t src_y_tile_stride, const uint8_t* src_uv, @@ -3150,10 +2825,6 @@ void MergeUVRow_C(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width); -void MergeUVRow_SSE2(const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uv, - int width); void MergeUVRow_AVX2(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, @@ -3178,10 +2849,6 @@ void MergeUVRow_RVV(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width); -void MergeUVRow_Any_SSE2(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); void MergeUVRow_Any_AVX2(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, @@ -3213,12 +2880,6 @@ void HalfMergeUVRow_NEON(const uint8_t* src_u, uint8_t* dst_uv, int width); -void HalfMergeUVRow_SSSE3(const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_uv, - int width); void HalfMergeUVRow_AVX2(const uint8_t* src_u, int src_stride_u, @@ -3232,16 +2893,6 @@ void SplitRGBRow_C(const uint8_t* src_rgb, uint8_t* dst_g, uint8_t* dst_b, int width); -void SplitRGBRow_SSSE3(const uint8_t* src_rgb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width); -void SplitRGBRow_SSE41(const uint8_t* src_rgb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width); void SplitRGBRow_AVX2(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g, @@ -3257,16 +2908,6 @@ void SplitRGBRow_RVV(const uint8_t* src_rgb, uint8_t* dst_g, uint8_t* dst_b, int width); -void SplitRGBRow_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width); -void SplitRGBRow_Any_SSE41(const uint8_t* src_ptr, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width); void SplitRGBRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, @@ -3283,11 +2924,6 @@ void MergeRGBRow_C(const uint8_t* src_r, const uint8_t* src_b, uint8_t* dst_rgb, int width); -void MergeRGBRow_SSSE3(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - uint8_t* dst_rgb, - int width); void MergeRGBRow_NEON(const uint8_t* src_r, const uint8_t* src_g, const uint8_t* src_b, @@ -3298,11 +2934,6 @@ void MergeRGBRow_RVV(const uint8_t* src_r, const uint8_t* src_b, uint8_t* dst_rgb, int width); -void MergeRGBRow_Any_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - int width); void MergeRGBRow_Any_NEON(const uint8_t* src_r, const uint8_t* src_g, const uint8_t* src_b, @@ -3314,12 +2945,6 @@ void MergeARGBRow_C(const uint8_t* src_r, const uint8_t* src_a, uint8_t* dst_argb, int width); -void MergeARGBRow_SSE2(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - const uint8_t* src_a, - uint8_t* dst_argb, - int width); void MergeARGBRow_AVX2(const uint8_t* src_r, const uint8_t* src_g, const uint8_t* src_b, @@ -3338,12 +2963,6 @@ void MergeARGBRow_RVV(const uint8_t* src_r, const uint8_t* src_a, uint8_t* dst_argb, int width); -void MergeARGBRow_Any_SSE2(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_ptr, - int width); void MergeARGBRow_Any_AVX2(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -3362,18 +2981,6 @@ void SplitARGBRow_C(const uint8_t* src_argb, uint8_t* dst_b, uint8_t* dst_a, int width); -void SplitARGBRow_SSE2(const uint8_t* src_argb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - uint8_t* dst_a, - int width); -void SplitARGBRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - uint8_t* dst_a, - int width); void SplitARGBRow_AVX2(const uint8_t* src_argb, uint8_t* dst_r, uint8_t* dst_g, @@ -3392,18 +2999,6 @@ void SplitARGBRow_RVV(const uint8_t* src_rgba, uint8_t* dst_b, uint8_t* dst_a, int width); -void SplitARGBRow_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - uint8_t* dst_a, - int width); -void SplitARGBRow_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - uint8_t* dst_a, - int width); void SplitARGBRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, @@ -3421,11 +3016,6 @@ void MergeXRGBRow_C(const uint8_t* src_r, const uint8_t* src_b, uint8_t* dst_argb, int width); -void MergeXRGBRow_SSE2(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - uint8_t* dst_argb, - int width); void MergeXRGBRow_AVX2(const uint8_t* src_r, const uint8_t* src_g, const uint8_t* src_b, @@ -3441,11 +3031,6 @@ void MergeXRGBRow_RVV(const uint8_t* src_r, const uint8_t* src_b, uint8_t* dst_argb, int width); -void MergeXRGBRow_Any_SSE2(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - int width); void MergeXRGBRow_Any_AVX2(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -3461,16 +3046,6 @@ void SplitXRGBRow_C(const uint8_t* src_argb, uint8_t* dst_g, uint8_t* dst_b, int width); -void SplitXRGBRow_SSE2(const uint8_t* src_argb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width); -void SplitXRGBRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width); void SplitXRGBRow_AVX2(const uint8_t* src_argb, uint8_t* dst_r, uint8_t* dst_g, @@ -3486,16 +3061,6 @@ void SplitXRGBRow_RVV(const uint8_t* src_rgba, uint8_t* dst_g, uint8_t* dst_b, int width); -void SplitXRGBRow_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width); -void SplitXRGBRow_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width); void SplitXRGBRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, @@ -3791,18 +3356,10 @@ void Convert8To16Row_C(const uint8_t* src_y, uint16_t* dst_y, int scale, int width); -void Convert8To16Row_SSE2(const uint8_t* src_y, - uint16_t* dst_y, - int scale, - int width); void Convert8To16Row_AVX2(const uint8_t* src_y, uint16_t* dst_y, int scale, int width); -void Convert8To16Row_Any_SSE2(const uint8_t* src_ptr, - uint16_t* dst_ptr, - int scale, - int width); void Convert8To16Row_Any_AVX2(const uint8_t* src_ptr, uint16_t* dst_ptr, int scale, @@ -3824,10 +3381,6 @@ void Convert16To8Row_C(const uint16_t* src_y, uint8_t* dst_y, int scale, int width); -void Convert16To8Row_SSSE3(const uint16_t* src_y, - uint8_t* dst_y, - int scale, - int width); void Convert16To8Row_AVX2(const uint16_t* src_y, uint8_t* dst_y, int scale, @@ -3836,10 +3389,6 @@ void Convert16To8Row_AVX512BW(const uint16_t* src_y, uint8_t* dst_y, int scale, int width); -void Convert16To8Row_Any_SSSE3(const uint16_t* src_ptr, - uint8_t* dst_ptr, - int scale, - int width); void Convert16To8Row_Any_AVX2(const uint16_t* src_ptr, uint8_t* dst_ptr, int scale, @@ -3897,7 +3446,6 @@ void Convert8To8Row_Any_AVX2(const uint8_t* src_ptr, int bias, int width); -void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width); void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width); void CopyRow_AVX512BW(const uint8_t* src, uint8_t* dst, int width); void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width); @@ -3905,7 +3453,6 @@ void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width); void CopyRow_SME(const uint8_t* src, uint8_t* dst, int width); void CopyRow_RVV(const uint8_t* src, uint8_t* dst, int count); void CopyRow_C(const uint8_t* src, uint8_t* dst, int count); -void CopyRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void CopyRow_Any_AVX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void CopyRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void CopyRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -3913,19 +3460,12 @@ void CopyRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void CopyRow_16_C(const uint16_t* src, uint16_t* dst, int count); void ARGBCopyAlphaRow_C(const uint8_t* src, uint8_t* dst, int width); -void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width); void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width); -void ARGBCopyAlphaRow_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); void ARGBCopyAlphaRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBExtractAlphaRow_C(const uint8_t* src_argb, uint8_t* dst_a, int width); -void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb, - uint8_t* dst_a, - int width); void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb, uint8_t* dst_a, int width); @@ -3938,9 +3478,6 @@ void ARGBExtractAlphaRow_LSX(const uint8_t* src_argb, void ARGBExtractAlphaRow_RVV(const uint8_t* src_argb, uint8_t* dst_a, int width); -void ARGBExtractAlphaRow_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); void ARGBExtractAlphaRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -3952,12 +3489,8 @@ void ARGBExtractAlphaRow_Any_LSX(const uint8_t* src_ptr, int width); void ARGBCopyYToAlphaRow_C(const uint8_t* src, uint8_t* dst, int width); -void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width); void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width); void ARGBCopyYToAlphaRow_RVV(const uint8_t* src, uint8_t* dst, int width); -void ARGBCopyYToAlphaRow_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); void ARGBCopyYToAlphaRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -3983,10 +3516,6 @@ void ARGBShuffleRow_C(const uint8_t* src_argb, uint8_t* dst_argb, const uint8_t* shuffler, int width); -void ARGBShuffleRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_argb, - const uint8_t* shuffler, - int width); void ARGBShuffleRow_AVX2(const uint8_t* src_argb, uint8_t* dst_argb, const uint8_t* shuffler, @@ -4003,10 +3532,6 @@ void ARGBShuffleRow_LASX(const uint8_t* src_argb, uint8_t* dst_argb, const uint8_t* shuffler, int width); -void ARGBShuffleRow_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - const uint8_t* param, - int width); void ARGBShuffleRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, const uint8_t* param, @@ -4024,17 +3549,8 @@ void ARGBShuffleRow_Any_LASX(const uint8_t* src_ptr, const uint8_t* param, int width); -void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24, - uint8_t* dst_argb, - int width); -void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width); void RAWToARGBRow_AVX2(const uint8_t* src_raw, uint8_t* dst_argb, int width); void RAWToARGBRow_AVX512BW(const uint8_t* src_raw, uint8_t* dst_argb, int width); -void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width); -void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgb24, int width); -void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width); -void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width); -void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width); void RGB565ToARGBRow_AVX2(const uint8_t* src_rgb565, uint8_t* dst_argb, int width); @@ -4114,32 +3630,11 @@ void AR30ToABGRRow_C(const uint8_t* src_ar30, uint8_t* dst_abgr, int width); void ARGBToAR30Row_C(const uint8_t* src_argb, uint8_t* dst_ar30, int width); void AR30ToAB30Row_C(const uint8_t* src_ar30, uint8_t* dst_ab30, int width); -void RGB24ToARGBRow_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void RAWToARGBRow_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); void RAWToARGBRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RAWToARGBRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGB24ToARGBRow_AVX512BW(const uint8_t* src_rgb24, uint8_t* dst_argb, int width); void RGB24ToARGBRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RAWToRGBARow_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void RAWToRGB24Row_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void RGB565ToARGBRow_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGB1555ToARGBRow_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGB4444ToARGBRow_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); void RGB565ToARGBRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -4196,13 +3691,6 @@ void ARGB4444ToARGBRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width); -void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width); -void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width); -void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width); -void ARGBToARGB4444Row_SSE2(const uint8_t* src, uint8_t* dst, int width); -void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width); -void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width); void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width); void ARGBToRGB24Row_AVX2(const uint8_t* src, uint8_t* dst, int width); @@ -4213,10 +3701,6 @@ void ARGBToRGB565DitherRow_C(const uint8_t* src_argb, uint8_t* dst_rgb, uint32_t dither4, int width); -void ARGBToRGB565DitherRow_SSE2(const uint8_t* src, - uint8_t* dst, - uint32_t dither4, - int width); void ARGBToRGB565DitherRow_AVX2(const uint8_t* src, uint8_t* dst, uint32_t dither4, @@ -4315,18 +3799,6 @@ void AR64ShuffleRow_C(const uint8_t* src_ar64, uint8_t* dst_ar64, const uint8_t* shuffler, int width); -void ARGBToAR64Row_SSSE3(const uint8_t* src_argb, - uint16_t* dst_ar64, - int width); -void ARGBToAB64Row_SSSE3(const uint8_t* src_argb, - uint16_t* dst_ab64, - int width); -void AR64ToARGBRow_SSSE3(const uint16_t* src_ar64, - uint8_t* dst_argb, - int width); -void AB64ToARGBRow_SSSE3(const uint16_t* src_ab64, - uint8_t* dst_argb, - int width); void ARGBToAR64Row_AVX2(const uint8_t* src_argb, uint16_t* dst_ar64, int width); void ARGBToAB64Row_AVX2(const uint8_t* src_argb, uint16_t* dst_ab64, int width); void AR64ToARGBRow_AVX2(const uint16_t* src_ar64, uint8_t* dst_argb, int width); @@ -4341,18 +3813,6 @@ void AR64ToARGBRow_RVV(const uint16_t* src_ar64, uint8_t* dst_argb, int width); void AB64ToARGBRow_RVV(const uint16_t* src_ab64, uint8_t* dst_argb, int width); void AR64ToAB64Row_RVV(const uint16_t* src_ar64, uint16_t* dst_ab64, int width); void RGBAToARGBRow_RVV(const uint8_t* src_rgba, uint8_t* dst_argb, int width); -void ARGBToAR64Row_Any_SSSE3(const uint8_t* src_ptr, - uint16_t* dst_ptr, - int width); -void ARGBToAB64Row_Any_SSSE3(const uint8_t* src_ptr, - uint16_t* dst_ptr, - int width); -void AR64ToARGBRow_Any_SSSE3(const uint16_t* src_ptr, - uint8_t* dst_ptr, - int width); -void AB64ToARGBRow_Any_SSSE3(const uint16_t* src_ptr, - uint8_t* dst_ptr, - int width); void ARGBToAR64Row_Any_AVX2(const uint8_t* src_ptr, uint16_t* dst_ptr, int width); @@ -4378,15 +3838,11 @@ void AB64ToARGBRow_Any_NEON(const uint16_t* src_ptr, uint8_t* dst_ptr, int width); -void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width); void J400ToARGBRow_AVX2(const uint8_t* src_y, uint8_t* dst_argb, int width); void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width); void J400ToARGBRow_LSX(const uint8_t* src_y, uint8_t* dst_argb, int width); void J400ToARGBRow_RVV(const uint8_t* src_y, uint8_t* dst_argb, int width); void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width); -void J400ToARGBRow_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); void J400ToARGBRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -4589,93 +4045,19 @@ void I422ToRGBARow_AVX2(const uint8_t* y_buf, uint8_t* dst_rgba, const struct YuvConstants* yuvconstants, int width); -void I444ToARGBRow_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); void I444ToARGBRow_AVX2(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); -void I444ToRGB24Row_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width); void I444ToRGB24Row_AVX2(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width); -void I422ToARGBRow_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I422ToAR30Row_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width); -void I210ToAR30Row_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width); -void I210ToARGBRow_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I212ToAR30Row_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width); -void I212ToARGBRow_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I410ToAR30Row_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width); -void I410ToARGBRow_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I210AlphaToARGBRow_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - const uint16_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I410AlphaToARGBRow_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - const uint16_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); void I422ToAR30Row_AVX2(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -4732,13 +4114,6 @@ void I410AlphaToARGBRow_AVX2(const uint16_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); -void I444AlphaToARGBRow_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); void I444AlphaToARGBRow_AVX2(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -4746,13 +4121,6 @@ void I444AlphaToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); -void I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); void I422AlphaToARGBRow_AVX2(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -4760,31 +4128,11 @@ void I422AlphaToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); -void NV12ToARGBRow_SSSE3(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); void NV12ToARGBRow_AVX2(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); -void NV12ToRGB24Row_SSSE3(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width); -void NV21ToRGB24Row_SSSE3(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width); -void NV12ToRGB565Row_SSSE3(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width); void NV12ToRGB24Row_AVX2(const uint8_t* src_y, const uint8_t* src_uv, uint8_t* dst_rgb24, @@ -4795,10 +4143,6 @@ void NV21ToRGB24Row_AVX2(const uint8_t* src_y, uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width); -void NV21ToYUV24Row_SSSE3(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_yuv24, - int width); void NV21ToYUV24Row_AVX2(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_yuv24, @@ -4808,24 +4152,11 @@ void NV12ToRGB565Row_AVX2(const uint8_t* src_y, uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width); -void NV21ToARGBRow_SSSE3(const uint8_t* y_buf, - const uint8_t* vu_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); void NV21ToARGBRow_AVX2(const uint8_t* y_buf, const uint8_t* vu_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); -void YUY2ToARGBRow_SSSE3(const uint8_t* yuy2_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); void YUY2ToARGBRow_AVX2(const uint8_t* yuy2_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, @@ -4835,26 +4166,6 @@ void UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf, const struct YuvConstants* yuvconstants, int width); -void P210ToARGBRow_SSSE3(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void P410ToARGBRow_SSSE3(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void P210ToAR30Row_SSSE3(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width); -void P410ToAR30Row_SSSE3(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width); void P210ToARGBRow_AVX2(const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* dst_argb, @@ -4876,54 +4187,24 @@ void P410ToAR30Row_AVX2(const uint16_t* y_buf, const struct YuvConstants* yuvconstants, int width); -void I422ToRGBARow_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_rgba, - const struct YuvConstants* yuvconstants, - int width); -void I422ToARGB4444Row_SSSE3(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb4444, - const struct YuvConstants* yuvconstants, - int width); void I422ToARGB4444Row_AVX2(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_argb4444, const struct YuvConstants* yuvconstants, int width); -void I422ToARGB1555Row_SSSE3(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb1555, - const struct YuvConstants* yuvconstants, - int width); void I422ToARGB1555Row_AVX2(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_argb1555, const struct YuvConstants* yuvconstants, int width); -void I422ToRGB565Row_SSSE3(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width); void I422ToRGB565Row_AVX2(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width); -void I422ToRGB24Row_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width); void I422ToRGB24Row_AVX2(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -4948,18 +4229,6 @@ void I422ToRGBARow_Any_AVX2(const uint8_t* y_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void I444ToARGBRow_Any_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I444ToRGB24Row_Any_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); void I444ToARGBRow_Any_AVX2(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -4972,68 +4241,6 @@ void I444ToRGB24Row_Any_AVX2(const uint8_t* y_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void I422ToARGBRow_Any_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422ToAR30Row_Any_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I210ToAR30Row_Any_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I210ToARGBRow_Any_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I212ToAR30Row_Any_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I212ToARGBRow_Any_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I410ToAR30Row_Any_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I410ToARGBRow_Any_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I210AlphaToARGBRow_Any_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - const uint16_t* a_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I410AlphaToARGBRow_Any_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - const uint16_t* a_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); void I422ToAR30Row_Any_AVX2(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -5090,13 +4297,6 @@ void I410AlphaToARGBRow_Any_AVX2(const uint16_t* y_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void I444AlphaToARGBRow_Any_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); void I444AlphaToARGBRow_Any_AVX2(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -5104,13 +4304,6 @@ void I444AlphaToARGBRow_Any_AVX2(const uint8_t* y_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void I422AlphaToARGBRow_Any_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); void I422AlphaToARGBRow_Any_AVX2(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -5118,36 +4311,16 @@ void I422AlphaToARGBRow_Any_AVX2(const uint8_t* y_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void NV12ToARGBRow_Any_SSSE3(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); void NV12ToARGBRow_Any_AVX2(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void NV21ToARGBRow_Any_SSSE3(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); void NV21ToARGBRow_Any_AVX2(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void NV12ToRGB24Row_Any_SSSE3(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void NV21ToRGB24Row_Any_SSSE3(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); void NV12ToRGB24Row_Any_AVX2(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, @@ -5158,32 +4331,15 @@ void NV21ToRGB24Row_Any_AVX2(const uint8_t* y_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void NV21ToYUV24Row_Any_SSSE3(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); void NV21ToYUV24Row_Any_AVX2(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, int width); -void NV12ToRGB565Row_Any_SSSE3(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); void NV12ToRGB565Row_Any_AVX2(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void YUY2ToARGBRow_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void UYVYToARGBRow_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); void YUY2ToARGBRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, @@ -5192,26 +4348,6 @@ void UYVYToARGBRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void P210ToARGBRow_Any_SSSE3(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void P410ToARGBRow_Any_SSSE3(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void P210ToAR30Row_Any_SSSE3(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void P410ToAR30Row_Any_SSSE3(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); void P210ToARGBRow_Any_AVX2(const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* dst_ptr, @@ -5232,54 +4368,24 @@ void P410ToAR30Row_Any_AVX2(const uint16_t* y_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void I422ToRGBARow_Any_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422ToARGB4444Row_Any_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); void I422ToARGB4444Row_Any_AVX2(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void I422ToARGB1555Row_Any_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); void I422ToARGB1555Row_Any_AVX2(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void I422ToRGB565Row_Any_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); void I422ToRGB565Row_Any_AVX2(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void I422ToRGB24Row_Any_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); void I422ToRGB24Row_Any_AVX2(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -5291,10 +4397,6 @@ void I400ToARGBRow_C(const uint8_t* src_y, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width); -void I400ToARGBRow_SSE2(const uint8_t* y_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width); void I400ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, @@ -5319,10 +4421,6 @@ void I400ToARGBRow_RVV(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); -void I400ToARGBRow_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - const struct YuvConstants* param, - int width); void I400ToARGBRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, const struct YuvConstants* param, @@ -5337,10 +4435,6 @@ void I400ToARGBRow_Any_LSX(const uint8_t* src_ptr, int width); // ARGB preattenuated alpha blend. -void ARGBBlendRow_SSSE3(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width); void ARGBBlendRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, @@ -5359,16 +4453,6 @@ void ARGBBlendRow_C(const uint8_t* src_argb, int width); // Unattenuated planar alpha blend. -void BlendPlaneRow_SSSE3(const uint8_t* src0, - const uint8_t* src1, - const uint8_t* alpha, - uint8_t* dst, - int width); -void BlendPlaneRow_Any_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - int width); void BlendPlaneRow_AVX2(const uint8_t* src0, const uint8_t* src1, const uint8_t* alpha, @@ -5396,14 +4480,6 @@ void ARGBMultiplyRow_C(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width); -void ARGBMultiplyRow_SSE2(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width); -void ARGBMultiplyRow_Any_SSE2(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); void ARGBMultiplyRow_AVX2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, @@ -5446,14 +4522,6 @@ void ARGBAddRow_C(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width); -void ARGBAddRow_SSE2(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width); -void ARGBAddRow_Any_SSE2(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); void ARGBAddRow_AVX2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, @@ -5493,14 +4561,6 @@ void ARGBSubtractRow_C(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width); -void ARGBSubtractRow_SSE2(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width); -void ARGBSubtractRow_Any_SSE2(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); void ARGBSubtractRow_AVX2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, @@ -5534,27 +4594,6 @@ void ARGBSubtractRow_Any_LASX(const uint8_t* y_buf, uint8_t* dst_ptr, int width); -void ARGBToRGB24Row_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBToRAWRow_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBToRGB565Row_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBToARGB1555Row_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBToARGB4444Row_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ABGRToAR30Row_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBToAR30Row_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); void ARGBToRAWRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToRGB24Row_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, @@ -5562,10 +4601,6 @@ void ARGBToRGB24Row_Any_AVX2(const uint8_t* src_ptr, void ARGBToRGB24Row_Any_AVX512VBMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void ARGBToRGB565DitherRow_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - const uint32_t param, - int width); void ARGBToRGB565DitherRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, const uint32_t param, @@ -6028,20 +5063,6 @@ void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width); -void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_y, int width); -void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2, - int stride_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void YUY2ToNVUVRow_SSE2(const uint8_t* src_yuy2, - int stride_yuy2, - uint8_t* dst_uv, - int width); -void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width); void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width); void YUY2ToUVRow_NEON(const uint8_t* src_yuy2, int stride_yuy2, @@ -6104,20 +5125,6 @@ void YUY2ToUV422Row_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); -void YUY2ToYRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void YUY2ToUVRow_Any_SSE2(const uint8_t* src_ptr, - int src_stride, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void YUY2ToNVUVRow_Any_SSE2(const uint8_t* src_yuy2, - int stride_yuy2, - uint8_t* dst_uv, - int width); -void YUY2ToUV422Row_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); void YUY2ToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void YUY2ToUVRow_Any_NEON(const uint8_t* src_ptr, int src_stride, @@ -6162,16 +5169,6 @@ void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy, uint8_t* dst_u, uint8_t* dst_v, int width); -void UYVYToYRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_y, int width); -void UYVYToUVRow_SSE2(const uint8_t* src_uyvy, - int stride_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width); void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width); void UYVYToUVRow_AVX2(const uint8_t* src_uyvy, int stride_uyvy, @@ -6233,16 +5230,6 @@ void UYVYToUV422Row_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); -void UYVYToYRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void UYVYToUVRow_Any_SSE2(const uint8_t* src_ptr, - int src_stride, - uint8_t* dst_u, - uint8_t* dst_v, - int width); -void UYVYToUV422Row_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_u, - uint8_t* dst_v, - int width); void UYVYToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void UYVYToUVRow_Any_NEON(const uint8_t* src_ptr, int src_stride, @@ -6276,8 +5263,6 @@ void UYVYToUV422Row_Any_LASX(const uint8_t* src_ptr, void SwapUVRow_C(const uint8_t* src_uv, uint8_t* dst_vu, int width); void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width); void SwapUVRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width); -void SwapUVRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width); void SwapUVRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void AYUVToYRow_C(const uint8_t* src_ayuv, uint8_t* dst_y, int width); @@ -6334,26 +5319,6 @@ void I422ToUYVYRow_C(const uint8_t* src_y, const uint8_t* src_v, uint8_t* dst_frame, int width); -void I422ToYUY2Row_SSE2(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_yuy2, - int width); -void I422ToUYVYRow_SSE2(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uyvy, - int width); -void I422ToYUY2Row_Any_SSE2(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - int width); -void I422ToUYVYRow_Any_SSE2(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - int width); void I422ToYUY2Row_AVX2(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -6437,9 +5402,6 @@ void I422ToUYVYRow_Any_LASX(const uint8_t* y_buf, // Effects related row functions. void ARGBAttenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width); -void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_argb, - int width); void ARGBAttenuateRow_AVX2(const uint8_t* src_argb, uint8_t* dst_argb, int width); @@ -6455,9 +5417,6 @@ void ARGBAttenuateRow_LASX(const uint8_t* src_argb, void ARGBAttenuateRow_RVV(const uint8_t* src_argb, uint8_t* dst_argb, int width); -void ARGBAttenuateRow_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); void ARGBAttenuateRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -6476,21 +5435,14 @@ extern const uint32_t fixed_invtbl8[256]; void ARGBUnattenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width); -void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb, - uint8_t* dst_argb, - int width); void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb, uint8_t* dst_argb, int width); -void ARGBUnattenuateRow_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); void ARGBUnattenuateRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBGrayRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width); -void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width); void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width); void ARGBGrayRow_NEON_DotProd(const uint8_t* src_argb, uint8_t* dst_argb, @@ -6499,7 +5451,6 @@ void ARGBGrayRow_LSX(const uint8_t* src_argb, uint8_t* dst_argb, int width); void ARGBGrayRow_LASX(const uint8_t* src_argb, uint8_t* dst_argb, int width); void ARGBSepiaRow_C(uint8_t* dst_argb, int width); -void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width); void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width); void ARGBSepiaRow_NEON_DotProd(uint8_t* dst_argb, int width); void ARGBSepiaRow_LSX(uint8_t* dst_argb, int width); @@ -6509,10 +5460,6 @@ void ARGBColorMatrixRow_C(const uint8_t* src_argb, uint8_t* dst_argb, const int8_t* matrix_argb, int width); -void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_argb, - const int8_t* matrix_argb, - int width); void ARGBColorMatrixRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, const int8_t* matrix_argb, @@ -6545,11 +5492,6 @@ void ARGBQuantizeRow_C(uint8_t* dst_argb, int interval_size, int interval_offset, int width); -void ARGBQuantizeRow_SSE2(uint8_t* dst_argb, - int scale, - int interval_size, - int interval_offset, - int width); void ARGBQuantizeRow_NEON(uint8_t* dst_argb, int scale, int interval_size, @@ -6565,10 +5507,6 @@ void ARGBShadeRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width, uint32_t value); -void ARGBShadeRow_SSE2(const uint8_t* src_argb, - uint8_t* dst_argb, - int width, - uint32_t value); void ARGBShadeRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width, @@ -6583,16 +5521,6 @@ void ARGBShadeRow_LASX(const uint8_t* src_argb, uint32_t value); // Used for blur. -void CumulativeSumToAverageRow_SSE2(const int32_t* topleft, - const int32_t* botleft, - int width, - int area, - uint8_t* dst, - int count); -void ComputeCumulativeSumRow_SSE2(const uint8_t* row, - int32_t* cumsum, - const int32_t* previous_cumsum, - int width); void CumulativeSumToAverageRow_C(const int32_t* tl, const int32_t* bl, @@ -6612,11 +5540,6 @@ void ARGBAffineRow_C(const uint8_t* src_argb, const float* uv_dudv, int width); LIBYUV_API -void ARGBAffineRow_SSE2(const uint8_t* src_argb, - int src_argb_stride, - uint8_t* dst_argb, - const float* src_dudv, - int width); // Used for I420Scale, ARGBScale, and ARGBInterpolate. void InterpolateRow_C(uint8_t* dst_ptr, @@ -6624,11 +5547,6 @@ void InterpolateRow_C(uint8_t* dst_ptr, ptrdiff_t src_stride, int width, int source_y_fraction); -void InterpolateRow_SSSE3(uint8_t* dst_ptr, - const uint8_t* src_ptr, - ptrdiff_t src_stride, - int dst_width, - int source_y_fraction); void InterpolateRow_AVX2(uint8_t* dst_ptr, const uint8_t* src_ptr, ptrdiff_t src_stride, @@ -6659,11 +5577,6 @@ void InterpolateRow_Any_NEON(uint8_t* dst_ptr, ptrdiff_t src_stride_ptr, int width, int source_y_fraction); -void InterpolateRow_Any_SSSE3(uint8_t* dst_ptr, - const uint8_t* src_ptr, - ptrdiff_t src_stride_ptr, - int width, - int source_y_fraction); void InterpolateRow_Any_AVX2(uint8_t* dst_ptr, const uint8_t* src_ptr, ptrdiff_t src_stride_ptr, @@ -6739,11 +5652,6 @@ void SobelXRow_C(const uint8_t* src_y0, const uint8_t* src_y2, uint8_t* dst_sobelx, int width); -void SobelXRow_SSE2(const uint8_t* src_y0, - const uint8_t* src_y1, - const uint8_t* src_y2, - uint8_t* dst_sobelx, - int width); void SobelXRow_NEON(const uint8_t* src_y0, const uint8_t* src_y1, const uint8_t* src_y2, @@ -6753,10 +5661,6 @@ void SobelYRow_C(const uint8_t* src_y0, const uint8_t* src_y1, uint8_t* dst_sobely, int width); -void SobelYRow_SSE2(const uint8_t* src_y0, - const uint8_t* src_y1, - uint8_t* dst_sobely, - int width); void SobelYRow_NEON(const uint8_t* src_y0, const uint8_t* src_y1, uint8_t* dst_sobely, @@ -6765,10 +5669,6 @@ void SobelRow_C(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_argb, int width); -void SobelRow_SSE2(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_argb, - int width); void SobelRow_NEON(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_argb, @@ -6781,10 +5681,6 @@ void SobelToPlaneRow_C(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_y, int width); -void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_y, - int width); void SobelToPlaneRow_NEON(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_y, @@ -6797,10 +5693,6 @@ void SobelXYRow_C(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_argb, int width); -void SobelXYRow_SSE2(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_argb, - int width); void SobelXYRow_NEON(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_argb, @@ -6809,10 +5701,6 @@ void SobelXYRow_LSX(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_argb, int width); -void SobelRow_Any_SSE2(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); void SobelRow_Any_NEON(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, @@ -6821,10 +5709,6 @@ void SobelRow_Any_LSX(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, int width); -void SobelToPlaneRow_Any_SSE2(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); void SobelToPlaneRow_Any_NEON(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, @@ -6833,10 +5717,6 @@ void SobelToPlaneRow_Any_LSX(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, int width); -void SobelXYRow_Any_SSE2(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - int width); void SobelXYRow_Any_NEON(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, @@ -6850,10 +5730,6 @@ void ARGBPolynomialRow_C(const uint8_t* src_argb, uint8_t* dst_argb, const float* poly, int width); -void ARGBPolynomialRow_SSE2(const uint8_t* src_argb, - uint8_t* dst_argb, - const float* poly, - int width); void ARGBPolynomialRow_AVX2(const uint8_t* src_argb, uint8_t* dst_argb, const float* poly, @@ -6861,14 +5737,6 @@ void ARGBPolynomialRow_AVX2(const uint8_t* src_argb, // Scale and convert to half float. void HalfFloatRow_C(const uint16_t* src, uint16_t* dst, float scale, int width); -void HalfFloatRow_SSE2(const uint16_t* src, - uint16_t* dst, - float scale, - int width); -void HalfFloatRow_Any_SSE2(const uint16_t* src_ptr, - uint16_t* dst_ptr, - float param, - int width); void HalfFloatRow_AVX2(const uint16_t* src, uint16_t* dst, float scale, @@ -6944,11 +5812,6 @@ void ARGBLumaColorTableRow_C(const uint8_t* src_argb, int width, const uint8_t* luma, uint32_t lumacoeff); -void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_argb, - int width, - const uint8_t* luma, - uint32_t lumacoeff); float ScaleMaxSamples_C(const float* src, float* dst, float scale, int width); float ScaleMaxSamples_NEON(const float* src, @@ -6999,7 +5862,6 @@ void GaussCol_NEON(const uint16_t* src0, uint32_t* dst, int width); -void ClampFloatToZero_SSE2(const float* src_x, float* dst_y, int width); #ifdef __cplusplus } // extern "C" diff --git a/include/libyuv/scale_row.h b/include/libyuv/scale_row.h index 16f626439..8ec7a9436 100644 --- a/include/libyuv/scale_row.h +++ b/include/libyuv/scale_row.h @@ -27,39 +27,9 @@ extern "C" { defined(__i386__)) #define HAS_FIXEDDIV1_X86 #define HAS_FIXEDDIV_X86 -#define HAS_SCALEADDROW_SSE2 -#define HAS_SCALEARGBCOLS_SSE2 -#define HAS_SCALEARGBCOLSUP2_SSE2 -#define HAS_SCALEARGBFILTERCOLS_SSSE3 -#define HAS_SCALEARGBROWDOWN2_SSE2 -#define HAS_SCALEARGBROWDOWNEVEN_SSE2 -#define HAS_SCALECOLSUP2_SSE2 -#define HAS_SCALEFILTERCOLS_SSSE3 -#define HAS_SCALEROWDOWN2_SSSE3 -#define HAS_SCALEROWDOWN34_SSSE3 -#define HAS_SCALEROWDOWN38_SSSE3 -#define HAS_SCALEROWDOWN4_SSSE3 #endif // The following are available for gcc/clang x86 platforms: -// TODO(fbarchard): Port to Visual C -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(__x86_64__) || defined(__i386__)) && \ - !defined(LIBYUV_ENABLE_ROWWIN) -#define HAS_SCALEUVROWDOWN2BOX_SSSE3 -#define HAS_SCALEROWUP2_LINEAR_SSE2 -#define HAS_SCALEROWUP2_LINEAR_SSSE3 -#define HAS_SCALEROWUP2_BILINEAR_SSE2 -#define HAS_SCALEROWUP2_BILINEAR_SSSE3 -#define HAS_SCALEROWUP2_LINEAR_12_SSSE3 -#define HAS_SCALEROWUP2_BILINEAR_12_SSSE3 -#define HAS_SCALEROWUP2_LINEAR_16_SSE2 -#define HAS_SCALEROWUP2_BILINEAR_16_SSE2 -#define HAS_SCALEUVROWUP2_LINEAR_SSSE3 -#define HAS_SCALEUVROWUP2_BILINEAR_SSSE3 -#define HAS_SCALEUVROWUP2_LINEAR_16_SSE41 -#define HAS_SCALEUVROWUP2_BILINEAR_16_SSE41 -#endif // The following are available for gcc/clang x86 platforms, but // require clang 3.4 or gcc 4.7. @@ -610,18 +580,6 @@ void ScaleUVFilterCols64_C(uint8_t* dst_uv, int dx); // Specialized scalers for x86. -void ScaleRowDown2_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); void ScaleRowDown2_AVX2(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, @@ -634,14 +592,6 @@ void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); -void ScaleRowDown4_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); void ScaleRowDown4_AVX2(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, @@ -651,63 +601,7 @@ void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width); -void ScaleRowDown34_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown38_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowUp2_Linear_SSE2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleRowUp2_Linear_12_SSSE3(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width); -void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width); -void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); void ScaleRowUp2_Linear_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width); @@ -732,38 +626,6 @@ void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width); -void ScaleRowUp2_Linear_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowUp2_Bilinear_Any_SSE2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleRowUp2_Linear_12_Any_SSSE3(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width); -void ScaleRowUp2_Bilinear_12_Any_SSSE3(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleRowUp2_Linear_16_Any_SSE2(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width); -void ScaleRowUp2_Bilinear_16_Any_SSE2(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleRowUp2_Linear_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowUp2_Bilinear_Any_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); void ScaleRowUp2_Linear_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width); @@ -789,22 +651,6 @@ void ScaleRowUp2_Bilinear_16_Any_AVX2(const uint16_t* src_ptr, ptrdiff_t dst_stride, int dst_width); -void ScaleRowDown2_Any_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown2Linear_Any_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown2Box_Any_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown2Box_Odd_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); void ScaleRowDown2_Any_AVX2(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, @@ -821,14 +667,6 @@ void ScaleRowDown2Box_Odd_AVX2(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); -void ScaleRowDown4_Any_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown4Box_Any_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); void ScaleRowDown4_Any_AVX2(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, @@ -838,67 +676,14 @@ void ScaleRowDown4Box_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width); -void ScaleRowDown34_Any_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown34_1_Box_Any_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown34_0_Box_Any_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown38_Any_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown38_3_Box_Any_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleRowDown38_2_Box_Any_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleAddRow_SSE2(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width); void ScaleAddRow_AVX2(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width); -void ScaleAddRow_Any_SSE2(const uint8_t* src_ptr, - uint16_t* dst_ptr, - int src_width); void ScaleAddRow_Any_AVX2(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width); -void ScaleFilterCols_SSSE3(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x, - int dx); -void ScaleColsUp2_SSE2(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x, - int dx); // ARGB Column functions -void ScaleARGBCols_SSE2(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx); -void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx); -void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx); void ScaleARGBFilterCols_NEON(uint8_t* dst_argb, const uint8_t* src_argb, int dst_width, @@ -926,18 +711,6 @@ void ScaleARGBFilterCols_RVV(uint8_t* dst_argb, int dx); // ARGB Row functions -void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width); -void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width); -void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width); void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, @@ -986,18 +759,6 @@ void ScaleARGBRowDown2Box_LSX(const uint8_t* src_argb, ptrdiff_t src_stride, uint8_t* dst_argb, int dst_width); -void ScaleARGBRowDown2_Any_SSE2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleARGBRowDown2Linear_Any_SSE2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleARGBRowDown2Box_Any_SSE2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); void ScaleARGBRowDown2_Any_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, @@ -1022,16 +783,6 @@ void ScaleARGBRowDown2Box_Any_LSX(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); -void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_argb, - int dst_width); -void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_argb, - int dst_width); void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb, ptrdiff_t src_stride, int src_stepx, @@ -1062,16 +813,6 @@ void ScaleARGBRowDownEvenBox_RVV(const uint8_t* src_argb, int src_stepx, uint8_t* dst_argb, int dst_width); -void ScaleARGBRowDownEven_Any_SSE2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_ptr, - int dst_width); -void ScaleARGBRowDownEvenBox_Any_SSE2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_ptr, - int dst_width); void ScaleARGBRowDownEven_Any_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, int src_stepx, @@ -1094,18 +835,6 @@ void ScaleARGBRowDownEvenBox_Any_LSX(const uint8_t* src_ptr, int dst_width); // UV Row functions -void ScaleUVRowDown2_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_uv, - int dst_width); -void ScaleUVRowDown2Linear_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_uv, - int dst_width); -void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_uv, - int dst_width); void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_uv, @@ -1146,18 +875,6 @@ void ScaleUVRowDown2Box_RVV(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width); -void ScaleUVRowDown2_Any_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleUVRowDown2Linear_Any_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleUVRowDown2Box_Any_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); void ScaleUVRowDown2Box_Any_AVX2(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, @@ -1174,16 +891,6 @@ void ScaleUVRowDown2Box_Any_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); -void ScaleUVRowDownEven_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_uv, - int dst_width); -void ScaleUVRowDownEvenBox_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_uv, - int dst_width); void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, int src_stepx, @@ -1204,16 +911,6 @@ void ScaleUVRowDownEven_RVV(const uint8_t* src_ptr, int32_t src_stepx, uint8_t* dst_uv, int dst_width); -void ScaleUVRowDownEven_Any_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_ptr, - int dst_width); -void ScaleUVRowDownEvenBox_Any_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_ptr, - int dst_width); void ScaleUVRowDownEven_Any_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, int src_stepx, @@ -1225,22 +922,6 @@ void ScaleUVRowDownEvenBox_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width); -void ScaleUVRowUp2_Linear_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width); -void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleUVRowUp2_Linear_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width); -void ScaleUVRowUp2_Bilinear_Any_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); void ScaleUVRowUp2_Linear_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width); @@ -1281,22 +962,6 @@ void ScaleUVRowUp2_Bilinear_RVV(const uint8_t* src_ptr, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width); -void ScaleUVRowUp2_Linear_16_SSE41(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width); -void ScaleUVRowUp2_Bilinear_16_SSE41(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); -void ScaleUVRowUp2_Linear_16_Any_SSE41(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width); -void ScaleUVRowUp2_Bilinear_16_Any_SSE41(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width); void ScaleUVRowUp2_Linear_16_AVX2(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width); diff --git a/source/compare.cc b/source/compare.cc index e85cc6d07..ca8c78cbb 100644 --- a/source/compare.cc +++ b/source/compare.cc @@ -32,32 +32,31 @@ LIBYUV_API uint32_t HashDjb2(const uint8_t* src, uint64_t count, uint32_t seed) { const int kBlockSize = 1 << 15; // 32768; int remainder; - uint32_t (*HashDjb2_SSE)(const uint8_t* src, int count, uint32_t seed) = + uint32_t (*HashDjb2_Opt)(const uint8_t* src, int count, uint32_t seed) = HashDjb2_C; -#if defined(HAS_HASHDJB2_SSE41) +#if 0 if (TestCpuFlag(kCpuHasSSE41)) { - HashDjb2_SSE = HashDjb2_SSE41; } #endif #if defined(HAS_HASHDJB2_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - HashDjb2_SSE = HashDjb2_AVX2; + HashDjb2_Opt = HashDjb2_AVX2; } #endif #if defined(HAS_HASHDJB2_NEON) if (TestCpuFlag(kCpuHasNEON)) { - HashDjb2_SSE = HashDjb2_NEON; + HashDjb2_Opt = HashDjb2_NEON; } #endif while (count >= (uint64_t)kBlockSize) { - seed = HashDjb2_SSE(src, kBlockSize, seed); + seed = HashDjb2_Opt(src, kBlockSize, seed); src += kBlockSize; count -= kBlockSize; } remainder = (int)count & ~15; if (remainder) { - seed = HashDjb2_SSE(src, remainder, seed); + seed = HashDjb2_Opt(src, remainder, seed); src += remainder; count -= remainder; } @@ -144,14 +143,12 @@ uint64_t ComputeHammingDistance(const uint8_t* src_a, HammingDistance = HammingDistance_NEON_DotProd; } #endif -#if defined(HAS_HAMMINGDISTANCE_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - HammingDistance = HammingDistance_SSSE3; } #endif -#if defined(HAS_HAMMINGDISTANCE_SSE42) +#if 0 if (TestCpuFlag(kCpuHasSSE42)) { - HammingDistance = HammingDistance_SSE42; } #endif #if defined(HAS_HAMMINGDISTANCE_AVX2) @@ -204,10 +201,9 @@ uint64_t ComputeSumSquareError(const uint8_t* src_a, SumSquareError = SumSquareError_NEON_DotProd; } #endif -#if defined(HAS_SUMSQUAREERROR_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { // Note only used for multiples of 16 so count is not checked. - SumSquareError = SumSquareError_SSE2; } #endif #if defined(HAS_SUMSQUAREERROR_AVX2) diff --git a/source/compare_gcc.cc b/source/compare_gcc.cc index 33a725e58..3f6502909 100644 --- a/source/compare_gcc.cc +++ b/source/compare_gcc.cc @@ -26,155 +26,16 @@ extern "C" { // "memory" clobber prevents the reads from being removed #if defined(__x86_64__) -uint32_t HammingDistance_SSE42(const uint8_t* src_a, - const uint8_t* src_b, - int count) { - uint64_t diff; - asm volatile( - "xor %3,%3 \n" - "xor %%r8,%%r8 \n" - "xor %%r9,%%r9 \n" - "xor %%r10,%%r10 \n" - - // Process 32 bytes per loop. - LABELALIGN - "1: \n" - "mov (%0),%%rcx \n" - "mov 0x8(%0),%%rdx \n" - "xor (%1),%%rcx \n" - "xor 0x8(%1),%%rdx \n" - "popcnt %%rcx,%%rcx \n" - "popcnt %%rdx,%%rdx \n" - "mov 0x10(%0),%%rsi \n" - "mov 0x18(%0),%%rdi \n" - "xor 0x10(%1),%%rsi \n" - "xor 0x18(%1),%%rdi \n" - "popcnt %%rsi,%%rsi \n" - "popcnt %%rdi,%%rdi \n" - "add $0x20,%0 \n" - "add $0x20,%1 \n" - "add %%rcx,%3 \n" - "add %%rdx,%%r8 \n" - "add %%rsi,%%r9 \n" - "add %%rdi,%%r10 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - - "add %%r8, %3 \n" - "add %%r9, %3 \n" - "add %%r10, %3 \n" - : "+r"(src_a), // %0 - "+r"(src_b), // %1 - "+r"(count), // %2 - "=&r"(diff) // %3 - : - : "cc", "memory", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10"); - - return (uint32_t)(diff); -} #else -uint32_t HammingDistance_SSE42(const uint8_t* src_a, - const uint8_t* src_b, - int count) { - uint32_t diff = 0u; - asm volatile( - // Process 16 bytes per loop. - LABELALIGN - "1: \n" - "mov (%0),%%ecx \n" - "mov 0x4(%0),%%edx \n" - "xor (%1),%%ecx \n" - "xor 0x4(%1),%%edx \n" - "popcnt %%ecx,%%ecx \n" - "add %%ecx,%3 \n" - "popcnt %%edx,%%edx \n" - "add %%edx,%3 \n" - "mov 0x8(%0),%%ecx \n" - "mov 0xc(%0),%%edx \n" - "xor 0x8(%1),%%ecx \n" - "xor 0xc(%1),%%edx \n" - "popcnt %%ecx,%%ecx \n" - "add %%ecx,%3 \n" - "popcnt %%edx,%%edx \n" - "add %%edx,%3 \n" - "add $0x10,%0 \n" - "add $0x10,%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_a), // %0 - "+r"(src_b), // %1 - "+r"(count), // %2 - "+r"(diff) // %3 - : - : "cc", "memory", "ecx", "edx"); - - return diff; -} #endif static const vec8 kNibbleMask = {15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15}; static const vec8 kBitCount = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4}; -uint32_t HammingDistance_SSSE3(const uint8_t* src_a, - const uint8_t* src_b, - int count) { - uint32_t diff; - asm volatile( - "movdqa %4,%%xmm2 \n" - "movdqa %5,%%xmm3 \n" - "pxor %%xmm0,%%xmm0 \n" - "pxor %%xmm1,%%xmm1 \n" - "sub %0,%1 \n" - - LABELALIGN - "1: \n" - "movdqa (%0),%%xmm4 \n" - "movdqa 0x10(%0), %%xmm5 \n" - "pxor (%0,%1), %%xmm4 \n" - "movdqa %%xmm4,%%xmm6 \n" - "pand %%xmm2,%%xmm6 \n" - "psrlw $0x4,%%xmm4 \n" - "movdqa %%xmm3,%%xmm7 \n" - "pshufb %%xmm6,%%xmm7 \n" - "pand %%xmm2,%%xmm4 \n" - "movdqa %%xmm3,%%xmm6 \n" - "pshufb %%xmm4,%%xmm6 \n" - "paddb %%xmm7,%%xmm6 \n" - "pxor 0x10(%0,%1),%%xmm5 \n" - "add $0x20,%0 \n" - "movdqa %%xmm5,%%xmm4 \n" - "pand %%xmm2,%%xmm5 \n" - "psrlw $0x4,%%xmm4 \n" - "movdqa %%xmm3,%%xmm7 \n" - "pshufb %%xmm5,%%xmm7 \n" - "pand %%xmm2,%%xmm4 \n" - "movdqa %%xmm3,%%xmm5 \n" - "pshufb %%xmm4,%%xmm5 \n" - "paddb %%xmm7,%%xmm5 \n" - "paddb %%xmm5,%%xmm6 \n" - "psadbw %%xmm1,%%xmm6 \n" - "paddd %%xmm6,%%xmm0 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - - "pshufd $0xaa,%%xmm0,%%xmm1 \n" - "paddd %%xmm1,%%xmm0 \n" - "movd %%xmm0, %3 \n" - : "+r"(src_a), // %0 - "+r"(src_b), // %1 - "+r"(count), // %2 - "=r"(diff) // %3 - : "m"(kNibbleMask), // %4 - "m"(kBitCount) // %5 - : "cc", "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); - - return diff; -} #ifdef HAS_HAMMINGDISTANCE_AVX2 uint32_t HammingDistance_AVX2(const uint8_t* src_a, @@ -232,47 +93,7 @@ uint32_t HammingDistance_AVX2(const uint8_t* src_a, } #endif // HAS_HAMMINGDISTANCE_AVX2 -uint32_t SumSquareError_SSE2(const uint8_t* src_a, - const uint8_t* src_b, - int count) { - uint32_t sse; - asm volatile( - "pxor %%xmm0,%%xmm0 \n" - "pxor %%xmm5,%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm1 \n" - "lea 0x10(%0),%0 \n" - "movdqu (%1),%%xmm2 \n" - "lea 0x10(%1),%1 \n" - "movdqa %%xmm1,%%xmm3 \n" - "psubusb %%xmm2,%%xmm1 \n" - "psubusb %%xmm3,%%xmm2 \n" - "por %%xmm2,%%xmm1 \n" - "movdqa %%xmm1,%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm1 \n" - "punpckhbw %%xmm5,%%xmm2 \n" - "pmaddwd %%xmm1,%%xmm1 \n" - "pmaddwd %%xmm2,%%xmm2 \n" - "paddd %%xmm1,%%xmm0 \n" - "paddd %%xmm2,%%xmm0 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - - "pshufd $0xee,%%xmm0,%%xmm1 \n" - "paddd %%xmm1,%%xmm0 \n" - "pshufd $0x1,%%xmm0,%%xmm1 \n" - "paddd %%xmm1,%%xmm0 \n" - "movd %%xmm0,%3 \n" - : "+r"(src_a), // %0 - "+r"(src_b), // %1 - "+r"(count), // %2 - "=r"(sse) // %3 - : - : "cc", "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); - return sse; -} static const uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0}; // 33 ^ 16 static const uvec32 kHashMul0 = { @@ -300,60 +121,7 @@ static const uvec32 kHashMul3 = { 0x00000001, // 33 ^ 0 }; -uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) { - uint32_t hash; - asm volatile( - "movd %2,%%xmm0 \n" - "pxor %%xmm7,%%xmm7 \n" - "movdqa %4,%%xmm6 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm1 \n" - "lea 0x10(%0),%0 \n" - "pmulld %%xmm6,%%xmm0 \n" - "movdqa %5,%%xmm5 \n" - "movdqa %%xmm1,%%xmm2 \n" - "punpcklbw %%xmm7,%%xmm2 \n" - "movdqa %%xmm2,%%xmm3 \n" - "punpcklwd %%xmm7,%%xmm3 \n" - "pmulld %%xmm5,%%xmm3 \n" - "movdqa %6,%%xmm5 \n" - "movdqa %%xmm2,%%xmm4 \n" - "punpckhwd %%xmm7,%%xmm4 \n" - "pmulld %%xmm5,%%xmm4 \n" - "movdqa %7,%%xmm5 \n" - "punpckhbw %%xmm7,%%xmm1 \n" - "movdqa %%xmm1,%%xmm2 \n" - "punpcklwd %%xmm7,%%xmm2 \n" - "pmulld %%xmm5,%%xmm2 \n" - "movdqa %8,%%xmm5 \n" - "punpckhwd %%xmm7,%%xmm1 \n" - "pmulld %%xmm5,%%xmm1 \n" - "paddd %%xmm4,%%xmm3 \n" - "paddd %%xmm2,%%xmm1 \n" - "paddd %%xmm3,%%xmm1 \n" - "pshufd $0xe,%%xmm1,%%xmm2 \n" - "paddd %%xmm2,%%xmm1 \n" - "pshufd $0x1,%%xmm1,%%xmm2 \n" - "paddd %%xmm2,%%xmm1 \n" - "paddd %%xmm1,%%xmm0 \n" - "sub $0x10,%1 \n" - "jg 1b \n" - "movd %%xmm0,%3 \n" - : "+r"(src), // %0 - "+r"(count), // %1 - "+rm"(seed), // %2 - "=r"(hash) // %3 - : "m"(kHash16x33), // %4 - "m"(kHashMul0), // %5 - "m"(kHashMul1), // %6 - "m"(kHashMul2), // %7 - "m"(kHashMul3) // %8 - : "cc", "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); - return hash; -} #endif // defined(__x86_64__) || (defined(__i386__) && !defined(__pic__))) #ifdef __cplusplus diff --git a/source/compare_win.cc b/source/compare_win.cc index 9d5bb27cd..1268a084d 100644 --- a/source/compare_win.cc +++ b/source/compare_win.cc @@ -26,57 +26,10 @@ extern "C" { #if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && defined(_M_IX86) && \ (!defined(__clang__) || defined(LIBYUV_ENABLE_ROWWIN)) -uint32_t HammingDistance_SSE42(const uint8_t* src_a, - const uint8_t* src_b, - int count) { - uint32_t diff = 0u; - int i; - for (i = 0; i < count - 3; i += 4) { - uint32_t x = *((uint32_t*)src_a) ^ *((uint32_t*)src_b); // NOLINT - src_a += 4; - src_b += 4; - diff += __popcnt(x); - } - return diff; -} __declspec(naked) uint32_t - SumSquareError_SSE2(const uint8_t* src_a, const uint8_t* src_b, int count) { - __asm { - mov eax, [esp + 4] // src_a - mov edx, [esp + 8] // src_b - mov ecx, [esp + 12] // count - pxor xmm0, xmm0 - pxor xmm5, xmm5 - wloop: - movdqu xmm1, [eax] - lea eax, [eax + 16] - movdqu xmm2, [edx] - lea edx, [edx + 16] - movdqa xmm3, xmm1 // abs trick - psubusb xmm1, xmm2 - psubusb xmm2, xmm3 - por xmm1, xmm2 - movdqa xmm2, xmm1 - punpcklbw xmm1, xmm5 - punpckhbw xmm2, xmm5 - pmaddwd xmm1, xmm1 - pmaddwd xmm2, xmm2 - paddd xmm0, xmm1 - paddd xmm0, xmm2 - sub ecx, 16 - jg wloop - - pshufd xmm1, xmm0, 0xee - paddd xmm0, xmm1 - pshufd xmm1, xmm0, 0x01 - paddd xmm0, xmm1 - movd eax, xmm0 - ret - } -} #ifdef HAS_SUMSQUAREERROR_AVX2 // C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX. @@ -147,53 +100,7 @@ uvec32 kHashMul3 = { }; __declspec(naked) uint32_t - HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) { - __asm { - mov eax, [esp + 4] // src - mov ecx, [esp + 8] // count - movd xmm0, [esp + 12] // seed - pxor xmm7, xmm7 // constant 0 for unpck - movdqa xmm6, xmmword ptr kHash16x33 - - wloop: - movdqu xmm1, [eax] // src[0-15] - lea eax, [eax + 16] - pmulld xmm0, xmm6 // hash *= 33 ^ 16 - movdqa xmm5, xmmword ptr kHashMul0 - movdqa xmm2, xmm1 - punpcklbw xmm2, xmm7 // src[0-7] - movdqa xmm3, xmm2 - punpcklwd xmm3, xmm7 // src[0-3] - pmulld xmm3, xmm5 - movdqa xmm5, xmmword ptr kHashMul1 - movdqa xmm4, xmm2 - punpckhwd xmm4, xmm7 // src[4-7] - pmulld xmm4, xmm5 - movdqa xmm5, xmmword ptr kHashMul2 - punpckhbw xmm1, xmm7 // src[8-15] - movdqa xmm2, xmm1 - punpcklwd xmm2, xmm7 // src[8-11] - pmulld xmm2, xmm5 - movdqa xmm5, xmmword ptr kHashMul3 - punpckhwd xmm1, xmm7 // src[12-15] - pmulld xmm1, xmm5 - paddd xmm3, xmm4 // add 16 results - paddd xmm1, xmm2 - paddd xmm1, xmm3 - - pshufd xmm2, xmm1, 0x0e // upper 2 dwords - paddd xmm1, xmm2 - pshufd xmm2, xmm1, 0x01 - paddd xmm1, xmm2 - paddd xmm0, xmm1 - sub ecx, 16 - jg wloop - - movd eax, xmm0 // return hash - ret - } -} // Visual C 2012 required for AVX2. #ifdef HAS_HASHDJB2_AVX2 diff --git a/source/convert.cc b/source/convert.cc index d9fb54778..3f2f53a1a 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -689,11 +689,9 @@ int I010ToNV12(const uint16_t* src_y, Convert16To8Row = Convert16To8Row_SME; } #endif -#if defined(HAS_CONVERT16TO8ROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - Convert16To8Row = Convert16To8Row_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - Convert16To8Row = Convert16To8Row_SSSE3; } } #endif @@ -714,11 +712,9 @@ int I010ToNV12(const uint16_t* src_y, } #endif -#if defined(HAS_MERGEUVROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - MergeUVRow = MergeUVRow_Any_SSE2; if (IS_ALIGNED(halfwidth, 16)) { - MergeUVRow = MergeUVRow_SSE2; } } #endif @@ -1154,11 +1150,9 @@ int I422ToNV21(const uint8_t* src_y, src_stride_u = -src_stride_u; src_stride_v = -src_stride_v; } -#if defined(HAS_MERGEUVROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - MergeUVRow = MergeUVRow_Any_SSE2; if (IS_ALIGNED(halfwidth, 16)) { - MergeUVRow = MergeUVRow_SSE2; } } #endif @@ -1204,11 +1198,9 @@ int I422ToNV21(const uint8_t* src_y, MergeUVRow = MergeUVRow_RVV; } #endif -#if defined(HAS_INTERPOLATEROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - InterpolateRow = InterpolateRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - InterpolateRow = InterpolateRow_SSSE3; } } #endif @@ -1673,13 +1665,9 @@ int YUY2ToI420(const uint8_t* src_yuy2, src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2; src_stride_yuy2 = -src_stride_yuy2; } -#if defined(HAS_YUY2TOYROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - YUY2ToUVRow = YUY2ToUVRow_Any_SSE2; - YUY2ToYRow = YUY2ToYRow_Any_SSE2; if (IS_ALIGNED(width, 16)) { - YUY2ToUVRow = YUY2ToUVRow_SSE2; - YUY2ToYRow = YUY2ToYRow_SSE2; } } #endif @@ -1764,13 +1752,9 @@ int UYVYToI420(const uint8_t* src_uyvy, src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy; src_stride_uyvy = -src_stride_uyvy; } -#if defined(HAS_UYVYTOYROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - UYVYToUVRow = UYVYToUVRow_Any_SSE2; - UYVYToYRow = UYVYToYRow_Any_SSE2; if (IS_ALIGNED(width, 16)) { - UYVYToUVRow = UYVYToUVRow_SSE2; - UYVYToYRow = UYVYToYRow_SSE2; } } #endif @@ -1863,13 +1847,9 @@ int AYUVToNV12(const uint8_t* src_ayuv, src_stride_ayuv = -src_stride_ayuv; } // place holders for future intel code -#if defined(HAS_AYUVTOYROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - AYUVToUVRow = AYUVToUVRow_Any_SSE2; - AYUVToYRow = AYUVToYRow_Any_SSE2; if (IS_ALIGNED(width, 16)) { - AYUVToUVRow = AYUVToUVRow_SSE2; - AYUVToYRow = AYUVToYRow_SSE2; } } #endif @@ -1940,13 +1920,9 @@ int AYUVToNV21(const uint8_t* src_ayuv, src_stride_ayuv = -src_stride_ayuv; } // place holders for future intel code -#if defined(HAS_AYUVTOYROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - AYUVToVURow = AYUVToVURow_Any_SSE2; - AYUVToYRow = AYUVToYRow_Any_SSE2; if (IS_ALIGNED(width, 16)) { - AYUVToVURow = AYUVToVURow_SSE2; - AYUVToYRow = AYUVToYRow_SSE2; } } #endif @@ -2070,19 +2046,15 @@ int ARGBToI420(const uint8_t* src_argb, } } #endif -#if defined(HAS_ARGBTOYROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_SSSE3; } } #endif -#if defined(HAS_ARGBTOUVROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVRow = ARGBToUVRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_SSSE3; } } #endif @@ -2183,11 +2155,9 @@ int ARGBToI420Matrix(const uint8_t* src_argb, const struct ArgbConstants* c) = ARGBToUVMatrixRow_C; -#if defined(HAS_ARGBTOYMATRIXROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_SSSE3; } } #endif @@ -2271,11 +2241,9 @@ ARGBToUVMatrixRow_C; ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME; } #endif -#if defined(HAS_ARGBTOUVMATRIXROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_SSSE3; } } #endif @@ -2421,19 +2389,15 @@ int ARGBToI420Alpha(const uint8_t* src_argb, } } #endif -#if defined(HAS_ARGBTOYROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_SSSE3; } } #endif -#if defined(HAS_ARGBTOUVROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVRow = ARGBToUVRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_SSSE3; } } #endif @@ -2487,7 +2451,7 @@ int ARGBToI420Alpha(const uint8_t* src_argb, } } #endif -#if defined(HAS_ARGBEXTRACTALPHAROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { ARGBExtractAlphaRow = IS_ALIGNED(width, 8) ? ARGBExtractAlphaRow_SSE2 : ARGBExtractAlphaRow_Any_SSE2; @@ -2614,19 +2578,15 @@ int BGRAToI420(const uint8_t* src_bgra, } } #endif -#if defined(HAS_BGRATOYROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - BGRAToYRow = BGRAToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - BGRAToYRow = BGRAToYRow_SSSE3; } } #endif -#if defined(HAS_BGRATOUVROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - BGRAToUVRow = BGRAToUVRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - BGRAToUVRow = BGRAToUVRow_SSSE3; } } #endif @@ -2721,19 +2681,15 @@ int ABGRToI420(const uint8_t* src_abgr, src_abgr = src_abgr + (height - 1) * src_stride_abgr; src_stride_abgr = -src_stride_abgr; } -#if defined(HAS_ABGRTOYROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ABGRToYRow = ABGRToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ABGRToYRow = ABGRToYRow_SSSE3; } } #endif -#if defined(HAS_ABGRTOUVROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ABGRToUVRow = ABGRToUVRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ABGRToUVRow = ABGRToUVRow_SSSE3; } } #endif @@ -2876,11 +2832,9 @@ int RGBAToI420(const uint8_t* src_rgba, src_rgba = src_rgba + (height - 1) * src_stride_rgba; src_stride_rgba = -src_stride_rgba; } -#if defined(HAS_RGBATOYROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - RGBAToYRow = RGBAToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - RGBAToYRow = RGBAToYRow_SSSE3; } } #endif @@ -2900,11 +2854,9 @@ int RGBAToI420(const uint8_t* src_rgba, } } #endif -#if defined(HAS_RGBATOUVROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - RGBAToUVRow = RGBAToUVRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - RGBAToUVRow = RGBAToUVRow_SSSE3; } } #endif @@ -3022,11 +2974,9 @@ int RGB24ToI420(const uint8_t* src_rgb24, ARGBToUVRow_C; void (*ARGBToYMatrixRow)(const uint8_t* src_argb, uint8_t* dst_y, int width, const struct ArgbConstants* c) = ARGBToYMatrixRow_C; -#if defined(HAS_ARGBTOYMATRIXROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_SSSE3; } } #endif @@ -3095,11 +3045,9 @@ int RGB24ToI420(const uint8_t* src_rgb24, src_stride_rgb24 = -src_stride_rgb24; } -#if defined(HAS_RGB24TOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - RGB24ToARGBRow = RGB24ToARGBRow_SSSE3; } } #endif @@ -3153,11 +3101,9 @@ int RGB24ToI420(const uint8_t* src_rgb24, RGB24ToARGBRow = RGB24ToARGBRow_RVV; } #endif -#if defined(HAS_ARGBTOUVROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVRow = ARGBToUVRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_SSSE3; } } #endif @@ -3288,11 +3234,9 @@ int RGB24ToJ420(const uint8_t* src_rgb24, // Other platforms do intermediate conversion from RGB24 to ARGB. #else // HAS_RGB24TOYJROW -#if defined(HAS_RGB24TOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - RGB24ToARGBRow = RGB24ToARGBRow_SSSE3; } } #endif @@ -3346,11 +3290,9 @@ int RGB24ToJ420(const uint8_t* src_rgb24, RGB24ToARGBRow = RGB24ToARGBRow_RVV; } #endif -#if defined(HAS_ARGBTOYJROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYJRow = ARGBToYJRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYJRow = ARGBToYJRow_SSSE3; } } #endif @@ -3370,11 +3312,9 @@ int RGB24ToJ420(const uint8_t* src_rgb24, } } #endif -#if defined(HAS_ARGBTOUVJROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToUVJRow = ARGBToUVJRow_SSSE3; } } #endif @@ -3458,11 +3398,9 @@ int RAWToI420(const uint8_t* src_raw, ARGBToUVRow_C; void (*ARGBToYMatrixRow)(const uint8_t* src_argb, uint8_t* dst_y, int width, const struct ArgbConstants* c) = ARGBToYMatrixRow_C; -#if defined(HAS_ARGBTOYMATRIXROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_SSSE3; } } #endif @@ -3531,11 +3469,9 @@ int RAWToI420(const uint8_t* src_raw, src_stride_raw = -src_stride_raw; } -#if defined(HAS_RAWTOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - RAWToARGBRow = RAWToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - RAWToARGBRow = RAWToARGBRow_SSSE3; } } #endif @@ -3589,11 +3525,9 @@ int RAWToI420(const uint8_t* src_raw, RAWToARGBRow = RAWToARGBRow_RVV; } #endif -#if defined(HAS_ARGBTOUVROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVRow = ARGBToUVRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_SSSE3; } } #endif @@ -3728,11 +3662,9 @@ int RAWToJ420(const uint8_t* src_raw, // Other platforms do intermediate conversion from RAW to ARGB. #else // HAS_RAWTOYJROW -#if defined(HAS_RAWTOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - RAWToARGBRow = RAWToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - RAWToARGBRow = RAWToARGBRow_SSSE3; } } #endif @@ -3786,11 +3718,9 @@ int RAWToJ420(const uint8_t* src_raw, RAWToARGBRow = RAWToARGBRow_RVV; } #endif -#if defined(HAS_ARGBTOYJROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYJRow = ARGBToYJRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYJRow = ARGBToYJRow_SSSE3; } } #endif @@ -3802,11 +3732,9 @@ int RAWToJ420(const uint8_t* src_raw, } } #endif -#if defined(HAS_ARGBTOUVJROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToUVJRow = ARGBToUVJRow_SSSE3; } } #endif @@ -3894,11 +3822,9 @@ int RAWToI444(const uint8_t* src_raw, } // TODO: add row coalesce when main loop handles large width in blocks // TODO: implement UV444 or trim the ifdef below -#if defined(HAS_ARGBTOUV444ROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUV444Row = ARGBToUV444Row_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToUV444Row = ARGBToUV444Row_SSSE3; } } #endif @@ -3950,11 +3876,9 @@ int RAWToI444(const uint8_t* src_raw, } } #endif -#if defined(HAS_ARGBTOYROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_SSSE3; } } #endif @@ -4012,11 +3936,9 @@ int RAWToI444(const uint8_t* src_raw, } #endif -#if defined(HAS_RAWTOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - RAWToARGBRow = RAWToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - RAWToARGBRow = RAWToARGBRow_SSSE3; } } #endif @@ -4121,11 +4043,9 @@ int RAWToJ444(const uint8_t* src_raw, src_stride_raw = -src_stride_raw; } // TODO: add row coalesce when main loop handles large width in blocks -#if defined(HAS_ARGBTOUVJ444ROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVJ444Row = ARGBToUVJ444Row_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToUVJ444Row = ARGBToUVJ444Row_SSSE3; } } #endif @@ -4177,11 +4097,9 @@ int RAWToJ444(const uint8_t* src_raw, } } #endif -#if defined(HAS_ARGBTOYJROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYJRow = ARGBToYJRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYJRow = ARGBToYJRow_SSSE3; } } #endif @@ -4231,11 +4149,9 @@ int RAWToJ444(const uint8_t* src_raw, } #endif -#if defined(HAS_RAWTOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - RAWToARGBRow = RAWToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - RAWToARGBRow = RAWToARGBRow_SSSE3; } } #endif @@ -4390,11 +4306,9 @@ int RGB565ToI420(const uint8_t* src_rgb565, } #endif // Other platforms do intermediate conversion from RGB565 to ARGB. -#if defined(HAS_RGB565TOARGBROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2; if (IS_ALIGNED(width, 8)) { - RGB565ToARGBRow = RGB565ToARGBRow_SSE2; } } #endif @@ -4406,11 +4320,9 @@ int RGB565ToI420(const uint8_t* src_rgb565, } } #endif -#if defined(HAS_ARGBTOYROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_SSSE3; } } #endif @@ -4430,11 +4342,9 @@ int RGB565ToI420(const uint8_t* src_rgb565, } } #endif -#if defined(HAS_ARGBTOUVROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVRow = ARGBToUVRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_SSSE3; } } #endif @@ -4581,11 +4491,9 @@ int ARGB1555ToI420(const uint8_t* src_argb1555, #endif // Other platforms do intermediate conversion from ARGB1555 to ARGB. -#if defined(HAS_ARGB1555TOARGBROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2; if (IS_ALIGNED(width, 8)) { - ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2; } } #endif @@ -4597,19 +4505,15 @@ int ARGB1555ToI420(const uint8_t* src_argb1555, } } #endif -#if defined(HAS_ARGBTOYROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_SSSE3; } } #endif -#if defined(HAS_ARGBTOUVROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVRow = ARGBToUVRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_SSSE3; } } #endif @@ -4750,11 +4654,9 @@ int ARGB4444ToI420(const uint8_t* src_argb4444, } } #endif -#if defined(HAS_ARGB4444TOARGBROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2; if (IS_ALIGNED(width, 8)) { - ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2; } } #endif @@ -4782,19 +4684,15 @@ int ARGB4444ToI420(const uint8_t* src_argb4444, } } #endif -#if defined(HAS_ARGBTOYROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_SSSE3; } } #endif -#if defined(HAS_ARGBTOUVROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVRow = ARGBToUVRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_SSSE3; } } #endif @@ -4917,11 +4815,9 @@ int RGB24ToJ400(const uint8_t* src_rgb24, RGB24ToARGBRow_C; void (*ARGBToYMatrixRow)(const uint8_t* src_argb, uint8_t* dst_y, int width, const struct ArgbConstants* c) = ARGBToYMatrixRow_C; -#if defined(HAS_ARGBTOYMATRIXROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_SSSE3; } } #endif @@ -4993,11 +4889,9 @@ int RGB24ToJ400(const uint8_t* src_rgb24, height = 1; src_stride_rgb24 = dst_stride_yj = 0; } -#if defined(HAS_RGB24TOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - RGB24ToARGBRow = RGB24ToARGBRow_SSSE3; } } #endif @@ -5082,11 +4976,9 @@ int RAWToJ400(const uint8_t* src_raw, RAWToARGBRow_C; void (*ARGBToYMatrixRow)(const uint8_t* src_argb, uint8_t* dst_y, int width, const struct ArgbConstants* c) = ARGBToYMatrixRow_C; -#if defined(HAS_ARGBTOYMATRIXROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_SSSE3; } } #endif @@ -5160,11 +5052,9 @@ int RAWToJ400(const uint8_t* src_raw, src_stride_raw = dst_stride_yj = 0; } -#if defined(HAS_RAWTOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - RAWToARGBRow = RAWToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - RAWToARGBRow = RAWToARGBRow_SSSE3; } } #endif diff --git a/source/convert_argb.cc b/source/convert_argb.cc index 7672a6692..c66b5bb30 100644 --- a/source/convert_argb.cc +++ b/source/convert_argb.cc @@ -76,11 +76,9 @@ int I420ToARGBMatrix(const uint8_t* src_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } -#if defined(HAS_I422TOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToARGBRow = I422ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I422ToARGBRow = I422ToARGBRow_SSSE3; } } #endif @@ -332,11 +330,9 @@ int I422ToARGBMatrix(const uint8_t* src_y, height = 1; src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0; } -#if defined(HAS_I422TOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToARGBRow = I422ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I422ToARGBRow = I422ToARGBRow_SSSE3; } } #endif @@ -586,11 +582,9 @@ int I444ToARGBMatrix(const uint8_t* src_y, height = 1; src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0; } -#if defined(HAS_I444TOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I444ToARGBRow = I444ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I444ToARGBRow = I444ToARGBRow_SSSE3; } } #endif @@ -823,11 +817,9 @@ int I444ToRGB24Matrix(const uint8_t* src_y, height = 1; src_stride_y = src_stride_u = src_stride_v = dst_stride_rgb24 = 0; } -#if defined(HAS_I444TORGB24ROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I444ToRGB24Row = I444ToRGB24Row_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - I444ToRGB24Row = I444ToRGB24Row_SSSE3; } } #endif @@ -965,11 +957,9 @@ int I010ToAR30Matrix(const uint16_t* src_y, I210ToAR30Row = I210ToAR30Row_SME; } #endif -#if defined(HAS_I210TOAR30ROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I210ToAR30Row = I210ToAR30Row_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I210ToAR30Row = I210ToAR30Row_SSSE3; } } #endif @@ -1125,11 +1115,9 @@ int I012ToAR30Matrix(const uint16_t* src_y, dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; dst_stride_ar30 = -dst_stride_ar30; } -#if defined(HAS_I212TOAR30ROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I212ToAR30Row = I212ToAR30Row_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I212ToAR30Row = I212ToAR30Row_SSSE3; } } #endif @@ -1219,11 +1207,9 @@ int I210ToAR30Matrix(const uint16_t* src_y, I210ToAR30Row = I210ToAR30Row_SME; } #endif -#if defined(HAS_I210TOAR30ROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I210ToAR30Row = I210ToAR30Row_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I210ToAR30Row = I210ToAR30Row_SSSE3; } } #endif @@ -1392,11 +1378,9 @@ int I410ToAR30Matrix(const uint16_t* src_y, I410ToAR30Row = I410ToAR30Row_SME; } #endif -#if defined(HAS_I410TOAR30ROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I410ToAR30Row = I410ToAR30Row_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I410ToAR30Row = I410ToAR30Row_SSSE3; } } #endif @@ -1446,11 +1430,9 @@ int I010ToARGBMatrix(const uint16_t* src_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } -#if defined(HAS_I210TOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I210ToARGBRow = I210ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I210ToARGBRow = I210ToARGBRow_SSSE3; } } #endif @@ -1628,11 +1610,9 @@ int I012ToARGBMatrix(const uint16_t* src_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } -#if defined(HAS_I212TOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I212ToARGBRow = I212ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I212ToARGBRow = I212ToARGBRow_SSSE3; } } #endif @@ -1702,11 +1682,9 @@ int I210ToARGBMatrix(const uint16_t* src_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } -#if defined(HAS_I210TOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I210ToARGBRow = I210ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I210ToARGBRow = I210ToARGBRow_SSSE3; } } #endif @@ -1881,11 +1859,9 @@ int I410ToARGBMatrix(const uint16_t* src_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } -#if defined(HAS_I410TOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I410ToARGBRow = I410ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I410ToARGBRow = I410ToARGBRow_SSSE3; } } #endif @@ -1949,11 +1925,9 @@ int P010ToARGBMatrix(const uint16_t* src_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } -#if defined(HAS_P210TOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - P210ToARGBRow = P210ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - P210ToARGBRow = P210ToARGBRow_SSSE3; } } #endif @@ -2018,11 +1992,9 @@ int P210ToARGBMatrix(const uint16_t* src_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } -#if defined(HAS_P210TOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - P210ToARGBRow = P210ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - P210ToARGBRow = P210ToARGBRow_SSSE3; } } #endif @@ -2085,11 +2057,9 @@ int P010ToAR30Matrix(const uint16_t* src_y, dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; dst_stride_ar30 = -dst_stride_ar30; } -#if defined(HAS_P210TOAR30ROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - P210ToAR30Row = P210ToAR30Row_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - P210ToAR30Row = P210ToAR30Row_SSSE3; } } #endif @@ -2154,11 +2124,9 @@ int P210ToAR30Matrix(const uint16_t* src_y, dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; dst_stride_ar30 = -dst_stride_ar30; } -#if defined(HAS_P210TOAR30ROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - P210ToAR30Row = P210ToAR30Row_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - P210ToAR30Row = P210ToAR30Row_SSSE3; } } #endif @@ -2232,11 +2200,9 @@ int I420AlphaToARGBMatrix(const uint8_t* src_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } -#if defined(HAS_I422ALPHATOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I422AlphaToARGBRow = I422AlphaToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I422AlphaToARGBRow = I422AlphaToARGBRow_SSSE3; } } #endif @@ -2287,11 +2253,9 @@ int I420AlphaToARGBMatrix(const uint8_t* src_y, I422AlphaToARGBRow = I422AlphaToARGBRow_RVV; } #endif -#if defined(HAS_ARGBATTENUATEROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; if (IS_ALIGNED(width, 4)) { - ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; } } #endif @@ -2385,11 +2349,9 @@ int I422AlphaToARGBMatrix(const uint8_t* src_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } -#if defined(HAS_I422ALPHATOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I422AlphaToARGBRow = I422AlphaToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I422AlphaToARGBRow = I422AlphaToARGBRow_SSSE3; } } #endif @@ -2440,11 +2402,9 @@ int I422AlphaToARGBMatrix(const uint8_t* src_y, I422AlphaToARGBRow = I422AlphaToARGBRow_RVV; } #endif -#if defined(HAS_ARGBATTENUATEROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; if (IS_ALIGNED(width, 4)) { - ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; } } #endif @@ -2536,11 +2496,9 @@ int I444AlphaToARGBMatrix(const uint8_t* src_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } -#if defined(HAS_I444ALPHATOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I444AlphaToARGBRow = I444AlphaToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I444AlphaToARGBRow = I444AlphaToARGBRow_SSSE3; } } #endif @@ -2575,11 +2533,9 @@ int I444AlphaToARGBMatrix(const uint8_t* src_y, I444AlphaToARGBRow = I444AlphaToARGBRow_RVV; } #endif -#if defined(HAS_ARGBATTENUATEROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; if (IS_ALIGNED(width, 4)) { - ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; } } #endif @@ -2818,11 +2774,9 @@ int I010AlphaToARGBMatrix(const uint16_t* src_y, I210AlphaToARGBRow = I210AlphaToARGBRow_SME; } #endif -#if defined(HAS_I210ALPHATOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I210AlphaToARGBRow = I210AlphaToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I210AlphaToARGBRow = I210AlphaToARGBRow_SSSE3; } } #endif @@ -2834,11 +2788,9 @@ int I010AlphaToARGBMatrix(const uint16_t* src_y, } } #endif -#if defined(HAS_ARGBATTENUATEROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; if (IS_ALIGNED(width, 4)) { - ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; } } #endif @@ -2950,11 +2902,9 @@ int I210AlphaToARGBMatrix(const uint16_t* src_y, I210AlphaToARGBRow = I210AlphaToARGBRow_SME; } #endif -#if defined(HAS_I210ALPHATOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I210AlphaToARGBRow = I210AlphaToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I210AlphaToARGBRow = I210AlphaToARGBRow_SSSE3; } } #endif @@ -2966,11 +2916,9 @@ int I210AlphaToARGBMatrix(const uint16_t* src_y, } } #endif -#if defined(HAS_ARGBATTENUATEROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; if (IS_ALIGNED(width, 4)) { - ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; } } #endif @@ -3080,11 +3028,9 @@ int I410AlphaToARGBMatrix(const uint16_t* src_y, I410AlphaToARGBRow = I410AlphaToARGBRow_SME; } #endif -#if defined(HAS_I410ALPHATOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I410AlphaToARGBRow = I410AlphaToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I410AlphaToARGBRow = I410AlphaToARGBRow_SSSE3; } } #endif @@ -3096,11 +3042,9 @@ int I410AlphaToARGBMatrix(const uint16_t* src_y, } } #endif -#if defined(HAS_ARGBATTENUATEROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; if (IS_ALIGNED(width, 4)) { - ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; } } #endif @@ -3186,11 +3130,9 @@ int I400ToARGBMatrix(const uint8_t* src_y, height = 1; src_stride_y = dst_stride_argb = 0; } -#if defined(HAS_I400TOARGBROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - I400ToARGBRow = I400ToARGBRow_Any_SSE2; if (IS_ALIGNED(width, 8)) { - I400ToARGBRow = I400ToARGBRow_SSE2; } } #endif @@ -3280,11 +3222,9 @@ int J400ToARGB(const uint8_t* src_y, height = 1; src_stride_y = dst_stride_argb = 0; } -#if defined(HAS_J400TOARGBROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - J400ToARGBRow = J400ToARGBRow_Any_SSE2; if (IS_ALIGNED(width, 8)) { - J400ToARGBRow = J400ToARGBRow_SSE2; } } #endif @@ -3630,11 +3570,9 @@ int RGB24ToARGB(const uint8_t* src_rgb24, height = 1; src_stride_rgb24 = dst_stride_argb = 0; } -#if defined(HAS_RGB24TOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - RGB24ToARGBRow = RGB24ToARGBRow_SSSE3; } } #endif @@ -3722,11 +3660,9 @@ int RAWToARGB(const uint8_t* src_raw, height = 1; src_stride_raw = dst_stride_argb = 0; } -#if defined(HAS_RAWTOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - RAWToARGBRow = RAWToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - RAWToARGBRow = RAWToARGBRow_SSSE3; } } #endif @@ -3815,11 +3751,9 @@ int RAWToRGBA(const uint8_t* src_raw, height = 1; src_stride_raw = dst_stride_rgba = 0; } -#if defined(HAS_RAWTORGBAROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - RAWToRGBARow = RAWToRGBARow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - RAWToRGBARow = RAWToRGBARow_SSSE3; } } #endif @@ -3876,11 +3810,9 @@ int RGB565ToARGB(const uint8_t* src_rgb565, height = 1; src_stride_rgb565 = dst_stride_argb = 0; } -#if defined(HAS_RGB565TOARGBROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2; if (IS_ALIGNED(width, 8)) { - RGB565ToARGBRow = RGB565ToARGBRow_SSE2; } } #endif @@ -3951,11 +3883,9 @@ int ARGB1555ToARGB(const uint8_t* src_argb1555, height = 1; src_stride_argb1555 = dst_stride_argb = 0; } -#if defined(HAS_ARGB1555TOARGBROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2; if (IS_ALIGNED(width, 8)) { - ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2; } } #endif @@ -4031,11 +3961,9 @@ int ARGB4444ToARGB(const uint8_t* src_argb4444, height = 1; src_stride_argb4444 = dst_stride_argb = 0; } -#if defined(HAS_ARGB4444TOARGBROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2; if (IS_ALIGNED(width, 8)) { - ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2; } } #endif @@ -4202,11 +4130,9 @@ int AR64ToARGB(const uint16_t* src_ar64, height = 1; src_stride_ar64 = dst_stride_argb = 0; } -#if defined(HAS_AR64TOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - AR64ToARGBRow = AR64ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 4)) { - AR64ToARGBRow = AR64ToARGBRow_SSSE3; } } #endif @@ -4266,11 +4192,9 @@ int AB64ToARGB(const uint16_t* src_ab64, height = 1; src_stride_ab64 = dst_stride_argb = 0; } -#if defined(HAS_AB64TOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - AB64ToARGBRow = AB64ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 4)) { - AB64ToARGBRow = AB64ToARGBRow_SSSE3; } } #endif @@ -4329,11 +4253,9 @@ int NV12ToARGBMatrix(const uint8_t* src_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } -#if defined(HAS_NV12TOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - NV12ToARGBRow = NV12ToARGBRow_SSSE3; } } #endif @@ -4421,11 +4343,9 @@ int NV21ToARGBMatrix(const uint8_t* src_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } -#if defined(HAS_NV21TOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - NV21ToARGBRow = NV21ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - NV21ToARGBRow = NV21ToARGBRow_SSSE3; } } #endif @@ -4590,11 +4510,9 @@ int NV12ToRGB24Matrix(const uint8_t* src_y, NV12ToRGB24Row = NV12ToRGB24Row_SME; } #endif -#if defined(HAS_NV12TORGB24ROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - NV12ToRGB24Row = NV12ToRGB24Row_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - NV12ToRGB24Row = NV12ToRGB24Row_SSSE3; } } #endif @@ -4666,11 +4584,9 @@ int NV21ToRGB24Matrix(const uint8_t* src_y, NV21ToRGB24Row = NV21ToRGB24Row_SME; } #endif -#if defined(HAS_NV21TORGB24ROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - NV21ToRGB24Row = NV21ToRGB24Row_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - NV21ToRGB24Row = NV21ToRGB24Row_SSSE3; } } #endif @@ -4786,11 +4702,9 @@ int NV21ToYUV24(const uint8_t* src_y, } } #endif -#if defined(HAS_NV21TOYUV24ROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - NV21ToYUV24Row = NV21ToYUV24Row_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - NV21ToYUV24Row = NV21ToYUV24Row_SSSE3; } } #endif @@ -4841,11 +4755,9 @@ int YUY2ToARGBMatrix(const uint8_t* src_yuy2, height = 1; src_stride_yuy2 = dst_stride_argb = 0; } -#if defined(HAS_YUY2TOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - YUY2ToARGBRow = YUY2ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - YUY2ToARGBRow = YUY2ToARGBRow_SSSE3; } } #endif @@ -4931,11 +4843,9 @@ int UYVYToARGBMatrix(const uint8_t* src_uyvy, height = 1; src_stride_uyvy = dst_stride_argb = 0; } -#if defined(HAS_UYVYTOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - UYVYToARGBRow = UYVYToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - UYVYToARGBRow = UYVYToARGBRow_SSSE3; } } #endif @@ -5140,11 +5050,9 @@ int I422ToRGBAMatrix(const uint8_t* src_y, dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba; dst_stride_rgba = -dst_stride_rgba; } -#if defined(HAS_I422TORGBAROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToRGBARow = I422ToRGBARow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I422ToRGBARow = I422ToRGBARow_SSSE3; } } #endif @@ -5267,11 +5175,9 @@ int NV12ToRGB565Matrix(const uint8_t* src_y, dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565; dst_stride_rgb565 = -dst_stride_rgb565; } -#if defined(HAS_NV12TORGB565ROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - NV12ToRGB565Row = NV12ToRGB565Row_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - NV12ToRGB565Row = NV12ToRGB565Row_SSSE3; } } #endif @@ -5362,11 +5268,9 @@ int I420ToRGBAMatrix(const uint8_t* src_y, dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba; dst_stride_rgba = -dst_stride_rgba; } -#if defined(HAS_I422TORGBAROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToRGBARow = I422ToRGBARow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I422ToRGBARow = I422ToRGBARow_SSSE3; } } #endif @@ -5494,11 +5398,9 @@ int I420ToRGB24Matrix(const uint8_t* src_y, dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24; dst_stride_rgb24 = -dst_stride_rgb24; } -#if defined(HAS_I422TORGB24ROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - I422ToRGB24Row = I422ToRGB24Row_SSSE3; } } #endif @@ -5698,11 +5600,9 @@ int I422ToRGB24Matrix(const uint8_t* src_y, dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24; dst_stride_rgb24 = -dst_stride_rgb24; } -#if defined(HAS_I422TORGB24ROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - I422ToRGB24Row = I422ToRGB24Row_SSSE3; } } #endif @@ -5827,11 +5727,9 @@ int I420ToARGB1555(const uint8_t* src_y, dst_argb1555 = dst_argb1555 + (height - 1) * dst_stride_argb1555; dst_stride_argb1555 = -dst_stride_argb1555; } -#if defined(HAS_I422TOARGB1555ROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToARGB1555Row = I422ToARGB1555Row_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I422ToARGB1555Row = I422ToARGB1555Row_SSSE3; } } #endif @@ -5918,11 +5816,9 @@ int I420ToARGB4444(const uint8_t* src_y, dst_argb4444 = dst_argb4444 + (height - 1) * dst_stride_argb4444; dst_stride_argb4444 = -dst_stride_argb4444; } -#if defined(HAS_I422TOARGB4444ROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToARGB4444Row = I422ToARGB4444Row_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I422ToARGB4444Row = I422ToARGB4444Row_SSSE3; } } #endif @@ -6010,11 +5906,9 @@ int I420ToRGB565Matrix(const uint8_t* src_y, dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565; dst_stride_rgb565 = -dst_stride_rgb565; } -#if defined(HAS_I422TORGB565ROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I422ToRGB565Row = I422ToRGB565Row_SSSE3; } } #endif @@ -6152,11 +6046,9 @@ int I422ToRGB565Matrix(const uint8_t* src_y, dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565; dst_stride_rgb565 = -dst_stride_rgb565; } -#if defined(HAS_I422TORGB565ROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I422ToRGB565Row = I422ToRGB565Row_SSSE3; } } #endif @@ -6268,11 +6160,9 @@ int I420ToRGB565Dither(const uint8_t* src_y, if (!dither4x4) { dither4x4 = kDither565_4x4; } -#if defined(HAS_I422TOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToARGBRow = I422ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I422ToARGBRow = I422ToARGBRow_SSSE3; } } #endif @@ -6332,11 +6222,9 @@ int I420ToRGB565Dither(const uint8_t* src_y, I422ToARGBRow = I422ToARGBRow_RVV; } #endif -#if defined(HAS_ARGBTORGB565DITHERROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2; if (IS_ALIGNED(width, 4)) { - ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_SSE2; } } #endif @@ -6429,11 +6317,9 @@ int I420ToAR30Matrix(const uint8_t* src_y, dst_stride_ar30 = -dst_stride_ar30; } -#if defined(HAS_I422TOAR30ROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToAR30Row = I422ToAR30Row_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I422ToAR30Row = I422ToAR30Row_SSSE3; } } #endif @@ -6575,11 +6461,9 @@ static int I420ToARGBMatrixBilinear(const uint8_t* src_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } -#if defined(HAS_I444TOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I444ToARGBRow = I444ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I444ToARGBRow = I444ToARGBRow_SSSE3; } } #endif @@ -6623,17 +6507,13 @@ static int I420ToARGBMatrixBilinear(const uint8_t* src_y, } #endif -#if defined(HAS_SCALEROWUP2_BILINEAR_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_SSE2; - ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSE2; } #endif -#if defined(HAS_SCALEROWUP2_BILINEAR_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_SSSE3; - ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSSE3; } #endif @@ -6724,11 +6604,9 @@ static int I422ToARGBMatrixLinear(const uint8_t* src_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } -#if defined(HAS_I444TOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I444ToARGBRow = I444ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I444ToARGBRow = I444ToARGBRow_SSSE3; } } #endif @@ -6771,14 +6649,12 @@ static int I422ToARGBMatrixLinear(const uint8_t* src_y, I444ToARGBRow = I444ToARGBRow_RVV; } #endif -#if defined(HAS_SCALEROWUP2_LINEAR_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSE2; } #endif -#if defined(HAS_SCALEROWUP2_LINEAR_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSSE3; } #endif #if defined(HAS_SCALEROWUP2_LINEAR_AVX2) @@ -6850,11 +6726,9 @@ static int I420ToRGB24MatrixBilinear(const uint8_t* src_y, dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24; dst_stride_rgb24 = -dst_stride_rgb24; } -#if defined(HAS_I444TORGB24ROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I444ToRGB24Row = I444ToRGB24Row_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - I444ToRGB24Row = I444ToRGB24Row_SSSE3; } } #endif @@ -6898,17 +6772,13 @@ static int I420ToRGB24MatrixBilinear(const uint8_t* src_y, } #endif -#if defined(HAS_SCALEROWUP2_BILINEAR_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_SSE2; - ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSE2; } #endif -#if defined(HAS_SCALEROWUP2_BILINEAR_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_SSSE3; - ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSSE3; } #endif @@ -7020,11 +6890,9 @@ static int I010ToAR30MatrixBilinear(const uint16_t* src_y, I410ToAR30Row = I410ToAR30Row_SME; } #endif -#if defined(HAS_I410TOAR30ROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I410ToAR30Row = I410ToAR30Row_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I410ToAR30Row = I410ToAR30Row_SSSE3; } } #endif @@ -7037,10 +6905,8 @@ static int I010ToAR30MatrixBilinear(const uint16_t* src_y, } #endif -#if defined(HAS_SCALEROWUP2_BILINEAR_12_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - Scale2RowUp_Bilinear_12 = ScaleRowUp2_Bilinear_12_Any_SSSE3; - ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_SSSE3; } #endif @@ -7144,11 +7010,9 @@ static int I210ToAR30MatrixLinear(const uint16_t* src_y, I410ToAR30Row = I410ToAR30Row_SME; } #endif -#if defined(HAS_I410TOAR30ROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I410ToAR30Row = I410ToAR30Row_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I410ToAR30Row = I410ToAR30Row_SSSE3; } } #endif @@ -7161,9 +7025,8 @@ static int I210ToAR30MatrixLinear(const uint16_t* src_y, } #endif -#if defined(HAS_SCALEROWUP2_LINEAR_12_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_SSSE3; } #endif #if defined(HAS_SCALEROWUP2_LINEAR_12_AVX2) @@ -7229,11 +7092,9 @@ static int I010ToARGBMatrixBilinear(const uint16_t* src_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } -#if defined(HAS_I410TOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I410ToARGBRow = I410ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I410ToARGBRow = I410ToARGBRow_SSSE3; } } #endif @@ -7264,10 +7125,8 @@ static int I010ToARGBMatrixBilinear(const uint16_t* src_y, } #endif -#if defined(HAS_SCALEROWUP2_BILINEAR_12_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - Scale2RowUp_Bilinear_12 = ScaleRowUp2_Bilinear_12_Any_SSSE3; - ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_SSSE3; } #endif @@ -7352,11 +7211,9 @@ static int I210ToARGBMatrixLinear(const uint16_t* src_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } -#if defined(HAS_I410TOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I410ToARGBRow = I410ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I410ToARGBRow = I410ToARGBRow_SSSE3; } } #endif @@ -7387,9 +7244,8 @@ static int I210ToARGBMatrixLinear(const uint16_t* src_y, } #endif -#if defined(HAS_SCALEROWUP2_LINEAR_12_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_SSSE3; } #endif #if defined(HAS_SCALEROWUP2_LINEAR_12_AVX2) @@ -7464,11 +7320,9 @@ static int I420AlphaToARGBMatrixBilinear( dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } -#if defined(HAS_I444ALPHATOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I444AlphaToARGBRow = I444AlphaToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I444AlphaToARGBRow = I444AlphaToARGBRow_SSSE3; } } #endif @@ -7511,11 +7365,9 @@ static int I420AlphaToARGBMatrixBilinear( I444AlphaToARGBRow = I444AlphaToARGBRow_RVV; } #endif -#if defined(HAS_ARGBATTENUATEROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; if (IS_ALIGNED(width, 4)) { - ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; } } #endif @@ -7557,17 +7409,13 @@ static int I420AlphaToARGBMatrixBilinear( } #endif -#if defined(HAS_SCALEROWUP2_BILINEAR_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_SSE2; - ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSE2; } #endif -#if defined(HAS_SCALEROWUP2_BILINEAR_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_SSSE3; - ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSSE3; } #endif @@ -7684,11 +7532,9 @@ static int I422AlphaToARGBMatrixLinear(const uint8_t* src_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } -#if defined(HAS_I444ALPHATOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I444AlphaToARGBRow = I444AlphaToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I444AlphaToARGBRow = I444AlphaToARGBRow_SSSE3; } } #endif @@ -7731,11 +7577,9 @@ static int I422AlphaToARGBMatrixLinear(const uint8_t* src_y, I444AlphaToARGBRow = I444AlphaToARGBRow_RVV; } #endif -#if defined(HAS_ARGBATTENUATEROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; if (IS_ALIGNED(width, 4)) { - ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; } } #endif @@ -7777,14 +7621,12 @@ static int I422AlphaToARGBMatrixLinear(const uint8_t* src_y, } #endif -#if defined(HAS_SCALEROWUP2_LINEAR_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSE2; } #endif -#if defined(HAS_SCALEROWUP2_LINEAR_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSSE3; } #endif #if defined(HAS_SCALEROWUP2_LINEAR_AVX2) @@ -7887,11 +7729,9 @@ static int I010AlphaToARGBMatrixBilinear( I410AlphaToARGBRow = I410AlphaToARGBRow_SME; } #endif -#if defined(HAS_I410ALPHATOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I410AlphaToARGBRow = I410AlphaToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I410AlphaToARGBRow = I410AlphaToARGBRow_SSSE3; } } #endif @@ -7903,11 +7743,9 @@ static int I010AlphaToARGBMatrixBilinear( } } #endif -#if defined(HAS_ARGBATTENUATEROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; if (IS_ALIGNED(width, 4)) { - ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; } } #endif @@ -7949,10 +7787,8 @@ static int I010AlphaToARGBMatrixBilinear( } #endif -#if defined(HAS_SCALEROWUP2_BILINEAR_12_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - Scale2RowUp_Bilinear_12 = ScaleRowUp2_Bilinear_12_Any_SSSE3; - ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_SSSE3; } #endif @@ -8081,11 +7917,9 @@ static int I210AlphaToARGBMatrixLinear(const uint16_t* src_y, I410AlphaToARGBRow = I410AlphaToARGBRow_SME; } #endif -#if defined(HAS_I410ALPHATOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I410AlphaToARGBRow = I410AlphaToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I410AlphaToARGBRow = I410AlphaToARGBRow_SSSE3; } } #endif @@ -8097,11 +7931,9 @@ static int I210AlphaToARGBMatrixLinear(const uint16_t* src_y, } } #endif -#if defined(HAS_ARGBATTENUATEROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; if (IS_ALIGNED(width, 4)) { - ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; } } #endif @@ -8143,9 +7975,8 @@ static int I210AlphaToARGBMatrixLinear(const uint16_t* src_y, } #endif -#if defined(HAS_SCALEROWUP2_LINEAR_12_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ScaleRowUp2_Linear = ScaleRowUp2_Linear_12_Any_SSSE3; } #endif #if defined(HAS_SCALEROWUP2_LINEAR_12_AVX2) @@ -8211,11 +8042,9 @@ static int P010ToARGBMatrixBilinear(const uint16_t* src_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } -#if defined(HAS_P410TOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - P410ToARGBRow = P410ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - P410ToARGBRow = P410ToARGBRow_SSSE3; } } #endif @@ -8246,9 +8075,8 @@ static int P010ToARGBMatrixBilinear(const uint16_t* src_y, } #endif -#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_SSE41 +#if 0 if (TestCpuFlag(kCpuHasSSE41)) { - Scale2RowUp_Bilinear_16 = ScaleUVRowUp2_Bilinear_16_Any_SSE41; } #endif @@ -8322,11 +8150,9 @@ static int P210ToARGBMatrixLinear(const uint16_t* src_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } -#if defined(HAS_P410TOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - P410ToARGBRow = P410ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - P410ToARGBRow = P410ToARGBRow_SSSE3; } } #endif @@ -8357,9 +8183,8 @@ static int P210ToARGBMatrixLinear(const uint16_t* src_y, } #endif -#ifdef HAS_SCALEUVROWUP2_LINEAR_16_SSE41 +#if 0 if (TestCpuFlag(kCpuHasSSE41)) { - ScaleRowUp2_Linear = ScaleUVRowUp2_Linear_16_Any_SSE41; } #endif @@ -8419,11 +8244,9 @@ static int P010ToAR30MatrixBilinear(const uint16_t* src_y, dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; dst_stride_ar30 = -dst_stride_ar30; } -#if defined(HAS_P410TOAR30ROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - P410ToAR30Row = P410ToAR30Row_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - P410ToAR30Row = P410ToAR30Row_SSSE3; } } #endif @@ -8454,9 +8277,8 @@ static int P010ToAR30MatrixBilinear(const uint16_t* src_y, } #endif -#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_SSE41 +#if 0 if (TestCpuFlag(kCpuHasSSE41)) { - Scale2RowUp_Bilinear_16 = ScaleUVRowUp2_Bilinear_16_Any_SSE41; } #endif @@ -8530,11 +8352,9 @@ static int P210ToAR30MatrixLinear(const uint16_t* src_y, dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; dst_stride_ar30 = -dst_stride_ar30; } -#if defined(HAS_P410TOAR30ROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - P410ToAR30Row = P410ToAR30Row_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - P410ToAR30Row = P410ToAR30Row_SSSE3; } } #endif @@ -8565,9 +8385,8 @@ static int P210ToAR30MatrixLinear(const uint16_t* src_y, } #endif -#ifdef HAS_SCALEUVROWUP2_LINEAR_16_SSE41 +#if 0 if (TestCpuFlag(kCpuHasSSE41)) { - ScaleRowUp2_Linear = ScaleUVRowUp2_Linear_16_Any_SSE41; } #endif @@ -8629,11 +8448,9 @@ static int I422ToRGB24MatrixLinear(const uint8_t* src_y, dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24; dst_stride_rgb24 = -dst_stride_rgb24; } -#if defined(HAS_I444TORGB24ROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I444ToRGB24Row = I444ToRGB24Row_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - I444ToRGB24Row = I444ToRGB24Row_SSSE3; } } #endif @@ -8668,14 +8485,12 @@ static int I422ToRGB24MatrixLinear(const uint8_t* src_y, I444ToRGB24Row = I444ToRGB24Row_RVV; } #endif -#if defined(HAS_SCALEROWUP2_LINEAR_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSE2; } #endif -#if defined(HAS_SCALEROWUP2_LINEAR_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSSE3; } #endif #if defined(HAS_SCALEROWUP2_LINEAR_AVX2) diff --git a/source/convert_from.cc b/source/convert_from.cc index 5cf88fa2d..161b6cf53 100644 --- a/source/convert_from.cc +++ b/source/convert_from.cc @@ -362,11 +362,9 @@ int I422ToYUY2(const uint8_t* src_y, height = 1; src_stride_y = src_stride_u = src_stride_v = dst_stride_yuy2 = 0; } -#if defined(HAS_I422TOYUY2ROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - I422ToYUY2Row = I422ToYUY2Row_Any_SSE2; if (IS_ALIGNED(width, 16)) { - I422ToYUY2Row = I422ToYUY2Row_SSE2; } } #endif @@ -421,11 +419,9 @@ int I420ToYUY2(const uint8_t* src_y, dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2; dst_stride_yuy2 = -dst_stride_yuy2; } -#if defined(HAS_I422TOYUY2ROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - I422ToYUY2Row = I422ToYUY2Row_Any_SSE2; if (IS_ALIGNED(width, 16)) { - I422ToYUY2Row = I422ToYUY2Row_SSE2; } } #endif @@ -508,11 +504,9 @@ int I422ToUYVY(const uint8_t* src_y, height = 1; src_stride_y = src_stride_u = src_stride_v = dst_stride_uyvy = 0; } -#if defined(HAS_I422TOUYVYROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - I422ToUYVYRow = I422ToUYVYRow_Any_SSE2; if (IS_ALIGNED(width, 16)) { - I422ToUYVYRow = I422ToUYVYRow_SSE2; } } #endif @@ -583,11 +577,9 @@ int I420ToUYVY(const uint8_t* src_y, dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy; dst_stride_uyvy = -dst_stride_uyvy; } -#if defined(HAS_I422TOUYVYROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - I422ToUYVYRow = I422ToUYVYRow_Any_SSE2; if (IS_ALIGNED(width, 16)) { - I422ToUYVYRow = I422ToUYVYRow_SSE2; } } #endif diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc index 2c66611e6..262335936 100644 --- a/source/convert_from_argb.cc +++ b/source/convert_from_argb.cc @@ -52,11 +52,9 @@ int ARGBToI444(const uint8_t* src_argb, height = 1; src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0; } -#if defined(HAS_ARGBTOUV444ROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUV444Row = ARGBToUV444Row_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToUV444Row = ARGBToUV444Row_SSSE3; } } #endif @@ -108,11 +106,9 @@ int ARGBToI444(const uint8_t* src_argb, } } #endif -#if defined(HAS_ARGBTOYROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_SSSE3; } } #endif @@ -201,11 +197,9 @@ int ARGBToI444Matrix(const uint8_t* src_argb, const struct ArgbConstants* c) = ARGBToUV444MatrixRow_C; -#if defined(HAS_ARGBTOYMATRIXROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_SSSE3; } } #endif @@ -263,11 +257,9 @@ ARGBToUV444MatrixRow_C; } #endif -#if defined(HAS_ARGBTOUV444MATRIXROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUV444MatrixRow = ARGBToUV444MatrixRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToUV444MatrixRow = ARGBToUV444MatrixRow_SSSE3; } } #endif @@ -351,19 +343,15 @@ int ARGBToI422(const uint8_t* src_argb, height = 1; src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0; } -#if defined(HAS_ARGBTOYROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_SSSE3; } } #endif -#if defined(HAS_ARGBTOUVROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVRow = ARGBToUVRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_SSSE3; } } #endif @@ -512,11 +500,9 @@ int ARGBToI422Matrix(const uint8_t* src_argb, const struct ArgbConstants* c) = ARGBToUVMatrixRow_C; -#if defined(HAS_ARGBTOYMATRIXROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_SSSE3; } } #endif @@ -600,11 +586,9 @@ ARGBToUVMatrixRow_C; ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME; } #endif -#if defined(HAS_ARGBTOUVMATRIXROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_SSSE3; } } #endif @@ -713,19 +697,15 @@ int ARGBToNV12(const uint8_t* src_argb, } } #endif -#if defined(HAS_ARGBTOYROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_SSSE3; } } #endif -#if defined(HAS_ARGBTOUVROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVRow = ARGBToUVRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_SSSE3; } } #endif @@ -784,11 +764,9 @@ int ARGBToNV12(const uint8_t* src_argb, ARGBToYRow = ARGBToYRow_RVV; } #endif -#if defined(HAS_MERGEUVROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - MergeUVRow = MergeUVRow_Any_SSE2; if (IS_ALIGNED(halfwidth, 16)) { - MergeUVRow = MergeUVRow_SSE2; } } #endif @@ -879,11 +857,9 @@ int ARGBToNV12Matrix(const uint8_t* src_argb, const struct ArgbConstants* c) = ARGBToUVMatrixRow_C; -#if defined(HAS_ARGBTOYMATRIXROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_SSSE3; } } #endif @@ -967,11 +943,9 @@ ARGBToUVMatrixRow_C; ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME; } #endif -#if defined(HAS_ARGBTOUVMATRIXROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_SSSE3; } } #endif @@ -995,11 +969,9 @@ ARGBToUVMatrixRow_C; src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } -#if defined(HAS_MERGEUVROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - MergeUVRow = MergeUVRow_Any_SSE2; if (IS_ALIGNED(halfwidth, 16)) { - MergeUVRow = MergeUVRow_SSE2; } } #endif @@ -1100,19 +1072,15 @@ int ARGBToNV21(const uint8_t* src_argb, src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } -#if defined(HAS_ARGBTOYROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_SSSE3; } } #endif -#if defined(HAS_ARGBTOUVROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVRow = ARGBToUVRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_SSSE3; } } #endif @@ -1229,11 +1197,9 @@ int ARGBToNV21(const uint8_t* src_argb, ARGBToYRow = ARGBToYRow_RVV; } #endif -#if defined(HAS_MERGEUVROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - MergeUVRow = MergeUVRow_Any_SSE2; if (IS_ALIGNED(halfwidth, 16)) { - MergeUVRow = MergeUVRow_SSE2; } } #endif @@ -1332,19 +1298,15 @@ int ABGRToNV12(const uint8_t* src_abgr, src_abgr = src_abgr + (height - 1) * src_stride_abgr; src_stride_abgr = -src_stride_abgr; } -#if defined(HAS_ABGRTOYROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ABGRToYRow = ABGRToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ABGRToYRow = ABGRToYRow_SSSE3; } } #endif -#if defined(HAS_ABGRTOUVROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ABGRToUVRow = ABGRToUVRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ABGRToUVRow = ABGRToUVRow_SSSE3; } } #endif @@ -1449,11 +1411,9 @@ int ABGRToNV12(const uint8_t* src_abgr, ABGRToYRow = ABGRToYRow_RVV; } #endif -#if defined(HAS_MERGEUVROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - MergeUVRow = MergeUVRow_Any_SSE2; if (IS_ALIGNED(halfwidth, 16)) { - MergeUVRow = MergeUVRow_SSE2; } } #endif @@ -1553,19 +1513,15 @@ int ABGRToNV21(const uint8_t* src_abgr, src_abgr = src_abgr + (height - 1) * src_stride_abgr; src_stride_abgr = -src_stride_abgr; } -#if defined(HAS_ABGRTOYROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ABGRToYRow = ABGRToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ABGRToYRow = ABGRToYRow_SSSE3; } } #endif -#if defined(HAS_ABGRTOUVROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ABGRToUVRow = ABGRToUVRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ABGRToUVRow = ABGRToUVRow_SSSE3; } } #endif @@ -1662,11 +1618,9 @@ int ABGRToNV21(const uint8_t* src_abgr, ABGRToYRow = ABGRToYRow_RVV; } #endif -#if defined(HAS_MERGEUVROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - MergeUVRow = MergeUVRow_Any_SSE2; if (IS_ALIGNED(halfwidth, 16)) { - MergeUVRow = MergeUVRow_SSE2; } } #endif @@ -1771,19 +1725,15 @@ int ARGBToYUY2(const uint8_t* src_argb, height = 1; src_stride_argb = dst_stride_yuy2 = 0; } -#if defined(HAS_ARGBTOYROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_SSSE3; } } #endif -#if defined(HAS_ARGBTOUVROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVRow = ARGBToUVRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_SSSE3; } } #endif @@ -1900,11 +1850,9 @@ int ARGBToYUY2(const uint8_t* src_argb, ARGBToYRow = ARGBToYRow_RVV; } #endif -#if defined(HAS_I422TOYUY2ROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - I422ToYUY2Row = I422ToYUY2Row_Any_SSE2; if (IS_ALIGNED(width, 16)) { - I422ToYUY2Row = I422ToYUY2Row_SSE2; } } #endif @@ -1995,19 +1943,15 @@ int ARGBToUYVY(const uint8_t* src_argb, height = 1; src_stride_argb = dst_stride_uyvy = 0; } -#if defined(HAS_ARGBTOYROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_SSSE3; } } #endif -#if defined(HAS_ARGBTOUVROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVRow = ARGBToUVRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_SSSE3; } } #endif @@ -2124,11 +2068,9 @@ int ARGBToUYVY(const uint8_t* src_argb, ARGBToYRow = ARGBToYRow_RVV; } #endif -#if defined(HAS_I422TOUYVYROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - I422ToUYVYRow = I422ToUYVYRow_Any_SSE2; if (IS_ALIGNED(width, 16)) { - I422ToUYVYRow = I422ToUYVYRow_SSE2; } } #endif @@ -2211,11 +2153,9 @@ int ARGBToI400(const uint8_t* src_argb, height = 1; src_stride_argb = dst_stride_y = 0; } -#if defined(HAS_ARGBTOYROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_SSSE3; } } #endif @@ -2364,11 +2304,9 @@ int ARGBToRGB24(const uint8_t* src_argb, height = 1; src_stride_argb = dst_stride_rgb24 = 0; } -#if defined(HAS_ARGBTORGB24ROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToRGB24Row = ARGBToRGB24Row_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToRGB24Row = ARGBToRGB24Row_SSSE3; } } #endif @@ -2456,11 +2394,9 @@ int ARGBToRAW(const uint8_t* src_argb, height = 1; src_stride_argb = dst_stride_raw = 0; } -#if defined(HAS_ARGBTORAWROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToRAWRow = ARGBToRAWRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToRAWRow = ARGBToRAWRow_SSSE3; } } #endif @@ -2544,11 +2480,9 @@ int ARGBToRGB565Dither(const uint8_t* src_argb, if (!dither4x4) { dither4x4 = kDither565_4x4; } -#if defined(HAS_ARGBTORGB565DITHERROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2; if (IS_ALIGNED(width, 4)) { - ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_SSE2; } } #endif @@ -2626,11 +2560,9 @@ int ARGBToRGB565(const uint8_t* src_argb, height = 1; src_stride_argb = dst_stride_rgb565 = 0; } -#if defined(HAS_ARGBTORGB565ROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - ARGBToRGB565Row = ARGBToRGB565Row_Any_SSE2; if (IS_ALIGNED(width, 4)) { - ARGBToRGB565Row = ARGBToRGB565Row_SSE2; } } #endif @@ -2706,11 +2638,9 @@ int ARGBToARGB1555(const uint8_t* src_argb, height = 1; src_stride_argb = dst_stride_argb1555 = 0; } -#if defined(HAS_ARGBTOARGB1555ROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - ARGBToARGB1555Row = ARGBToARGB1555Row_Any_SSE2; if (IS_ALIGNED(width, 4)) { - ARGBToARGB1555Row = ARGBToARGB1555Row_SSE2; } } #endif @@ -2780,11 +2710,9 @@ int ARGBToARGB4444(const uint8_t* src_argb, height = 1; src_stride_argb = dst_stride_argb4444 = 0; } -#if defined(HAS_ARGBTOARGB4444ROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - ARGBToARGB4444Row = ARGBToARGB4444Row_Any_SSE2; if (IS_ALIGNED(width, 4)) { - ARGBToARGB4444Row = ARGBToARGB4444Row_SSE2; } } #endif @@ -2862,11 +2790,9 @@ int ABGRToAR30(const uint8_t* src_abgr, } } #endif -#if defined(HAS_ABGRTOAR30ROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ABGRToAR30Row = ABGRToAR30Row_Any_SSSE3; if (IS_ALIGNED(width, 4)) { - ABGRToAR30Row = ABGRToAR30Row_SSSE3; } } #endif @@ -2919,11 +2845,9 @@ int ARGBToAR30(const uint8_t* src_argb, } } #endif -#if defined(HAS_ARGBTOAR30ROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToAR30Row = ARGBToAR30Row_Any_SSSE3; if (IS_ALIGNED(width, 4)) { - ARGBToAR30Row = ARGBToAR30Row_SSSE3; } } #endif @@ -2975,11 +2899,9 @@ int ARGBToJ444(const uint8_t* src_argb, height = 1; src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0; } -#if defined(HAS_ARGBTOUVJ444ROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVJ444Row = ARGBToUVJ444Row_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToUVJ444Row = ARGBToUVJ444Row_SSSE3; } } #endif @@ -3031,11 +2953,9 @@ int ARGBToJ444(const uint8_t* src_argb, } } #endif -#if defined(HAS_ARGBTOYJROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYJRow = ARGBToYJRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYJRow = ARGBToYJRow_SSSE3; } } #endif @@ -3179,19 +3099,15 @@ int ARGBToJ420(const uint8_t* src_argb, } } #endif -#if defined(HAS_ARGBTOYJROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYJRow = ARGBToYJRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYJRow = ARGBToYJRow_SSSE3; } } #endif -#if defined(HAS_ARGBTOUVJROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToUVJRow = ARGBToUVJRow_SSSE3; } } #endif @@ -3295,19 +3211,15 @@ int ARGBToJ422(const uint8_t* src_argb, height = 1; src_stride_argb = dst_stride_yj = dst_stride_uj = dst_stride_vj = 0; } -#if defined(HAS_ARGBTOYJROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYJRow = ARGBToYJRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYJRow = ARGBToYJRow_SSSE3; } } #endif -#if defined(HAS_ARGBTOUVJROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToUVJRow = ARGBToUVJRow_SSSE3; } } #endif @@ -3445,11 +3357,9 @@ int ARGBToJ400(const uint8_t* src_argb, height = 1; src_stride_argb = dst_stride_yj = 0; } -#if defined(HAS_ARGBTOYJROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYJRow = ARGBToYJRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYJRow = ARGBToYJRow_SSSE3; } } #endif @@ -3516,11 +3426,9 @@ int RGBAToJ400(const uint8_t* src_rgba, height = 1; src_stride_rgba = dst_stride_yj = 0; } -#if defined(HAS_RGBATOYJROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - RGBAToYJRow = RGBAToYJRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - RGBAToYJRow = RGBAToYJRow_SSSE3; } } #endif @@ -3613,19 +3521,15 @@ int ABGRToJ420(const uint8_t* src_abgr, src_abgr = src_abgr + (height - 1) * src_stride_abgr; src_stride_abgr = -src_stride_abgr; } -#if defined(HAS_ABGRTOYJROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ABGRToYJRow = ABGRToYJRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ABGRToYJRow = ABGRToYJRow_SSSE3; } } #endif -#if defined(HAS_ABGRTOUVJROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ABGRToUVJRow = ABGRToUVJRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ABGRToUVJRow = ABGRToUVJRow_SSSE3; } } #endif @@ -3781,19 +3685,15 @@ int ABGRToJ422(const uint8_t* src_abgr, height = 1; src_stride_abgr = dst_stride_yj = dst_stride_uj = dst_stride_vj = 0; } -#if defined(HAS_ABGRTOYJROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ABGRToYJRow = ABGRToYJRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ABGRToYJRow = ABGRToYJRow_SSSE3; } } #endif -#if defined(HAS_ABGRTOUVJROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ABGRToUVJRow = ABGRToUVJRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ABGRToUVJRow = ABGRToUVJRow_SSSE3; } } #endif @@ -3927,11 +3827,9 @@ int ABGRToJ400(const uint8_t* src_abgr, height = 1; src_stride_abgr = dst_stride_yj = 0; } -#if defined(HAS_ABGRTOYJROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ABGRToYJRow = ABGRToYJRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ABGRToYJRow = ABGRToYJRow_SSSE3; } } #endif @@ -4015,11 +3913,9 @@ int ARGBToAR64(const uint8_t* src_argb, height = 1; src_stride_argb = dst_stride_ar64 = 0; } -#if defined(HAS_ARGBTOAR64ROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToAR64Row = ARGBToAR64Row_Any_SSSE3; if (IS_ALIGNED(width, 4)) { - ARGBToAR64Row = ARGBToAR64Row_SSSE3; } } #endif @@ -4079,11 +3975,9 @@ int ARGBToAB64(const uint8_t* src_argb, height = 1; src_stride_argb = dst_stride_ab64 = 0; } -#if defined(HAS_ARGBTOAB64ROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToAB64Row = ARGBToAB64Row_Any_SSSE3; if (IS_ALIGNED(width, 4)) { - ARGBToAB64Row = ARGBToAB64Row_SSSE3; } } #endif @@ -4140,11 +4034,9 @@ int RAWToNV21Matrix(const uint8_t* src_raw, const struct ArgbConstants* c) = ARGBToYMatrixRow_C; void (*MergeUVRow)(const uint8_t* src_uj, const uint8_t* src_vj, uint8_t* dst_vu, int width) = MergeUVRow_C; -#if defined(HAS_ARGBTOYMATRIXROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_SSSE3; } } #endif @@ -4213,11 +4105,9 @@ int RAWToNV21Matrix(const uint8_t* src_raw, src_stride_raw = -src_stride_raw; } -#if defined(HAS_RAWTOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - RAWToARGBRow = RAWToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - RAWToARGBRow = RAWToARGBRow_SSSE3; } } #endif @@ -4297,11 +4187,9 @@ int RAWToNV21Matrix(const uint8_t* src_raw, ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME; } #endif -#if defined(HAS_ARGBTOUVMATRIXROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_SSSE3; } } #endif @@ -4322,11 +4210,9 @@ int RAWToNV21Matrix(const uint8_t* src_raw, } #endif -#if defined(HAS_MERGEUVROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - MergeUVRow = MergeUVRow_Any_SSE2; if (IS_ALIGNED(halfwidth, 16)) { - MergeUVRow = MergeUVRow_SSE2; } } #endif diff --git a/source/planar_functions.cc b/source/planar_functions.cc index fde3717a4..20ccd6a57 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -52,7 +52,7 @@ void CopyPlane(const uint8_t* src_y, return; } -#if defined(HAS_COPYROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2; } @@ -148,11 +148,9 @@ void Convert16To8Plane(const uint16_t* src_y, Convert16To8Row = Convert16To8Row_SME; } #endif -#if defined(HAS_CONVERT16TO8ROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - Convert16To8Row = Convert16To8Row_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - Convert16To8Row = Convert16To8Row_SSSE3; } } #endif @@ -209,11 +207,9 @@ void Convert8To16Plane(const uint8_t* src_y, height = 1; src_stride_y = dst_stride_y = 0; } -#if defined(HAS_CONVERT8TO16ROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - Convert8To16Row = Convert8To16Row_Any_SSE2; if (IS_ALIGNED(width, 16)) { - Convert8To16Row = Convert8To16Row_SSE2; } } #endif @@ -607,11 +603,9 @@ void SplitUVPlane(const uint8_t* src_uv, height = 1; src_stride_uv = dst_stride_u = dst_stride_v = 0; } -#if defined(HAS_SPLITUVROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - SplitUVRow = SplitUVRow_Any_SSE2; if (IS_ALIGNED(width, 16)) { - SplitUVRow = SplitUVRow_SSE2; } } #endif @@ -682,11 +676,9 @@ void MergeUVPlane(const uint8_t* src_u, height = 1; src_stride_u = src_stride_v = dst_stride_uv = 0; } -#if defined(HAS_MERGEUVROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - MergeUVRow = MergeUVRow_Any_SSE2; if (IS_ALIGNED(width, 16)) { - MergeUVRow = MergeUVRow_SSE2; } } #endif @@ -1008,11 +1000,9 @@ void SwapUVPlane(const uint8_t* src_uv, src_stride_uv = dst_stride_vu = 0; } -#if defined(HAS_SWAPUVROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - SwapUVRow = SwapUVRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - SwapUVRow = SwapUVRow_SSSE3; } } #endif @@ -1108,11 +1098,9 @@ int DetilePlane(const uint8_t* src_y, dst_stride_y = -dst_stride_y; } -#if defined(HAS_DETILEROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - DetileRow = DetileRow_Any_SSE2; if (IS_ALIGNED(width, 16)) { - DetileRow = DetileRow_SSE2; } } #endif @@ -1165,11 +1153,9 @@ int DetilePlane_16(const uint16_t* src_y, dst_stride_y = -dst_stride_y; } -#if defined(HAS_DETILEROW_16_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - DetileRow_16 = DetileRow_16_Any_SSE2; if (IS_ALIGNED(width, 16)) { - DetileRow_16 = DetileRow_16_SSE2; } } #endif @@ -1234,11 +1220,9 @@ void DetileSplitUVPlane(const uint8_t* src_uv, dst_stride_v = -dst_stride_v; } -#if defined(HAS_DETILESPLITUVROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - DetileSplitUVRow = DetileSplitUVRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - DetileSplitUVRow = DetileSplitUVRow_SSSE3; } } #endif @@ -1305,11 +1289,9 @@ void DetileToYUY2(const uint8_t* src_y, } #endif -#if defined(HAS_DETILETOYUY2_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - DetileToYUY2 = DetileToYUY2_Any_SSE2; if (IS_ALIGNED(width, 16)) { - DetileToYUY2 = DetileToYUY2_SSE2; } } #endif @@ -1368,19 +1350,15 @@ void SplitRGBPlane(const uint8_t* src_rgb, height = 1; src_stride_rgb = dst_stride_r = dst_stride_g = dst_stride_b = 0; } -#if defined(HAS_SPLITRGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - SplitRGBRow = SplitRGBRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - SplitRGBRow = SplitRGBRow_SSSE3; } } #endif -#if defined(HAS_SPLITRGBROW_SSE41) +#if 0 if (TestCpuFlag(kCpuHasSSE41)) { - SplitRGBRow = SplitRGBRow_Any_SSE41; if (IS_ALIGNED(width, 16)) { - SplitRGBRow = SplitRGBRow_SSE41; } } #endif @@ -1448,11 +1426,9 @@ void MergeRGBPlane(const uint8_t* src_r, height = 1; src_stride_r = src_stride_g = src_stride_b = dst_stride_rgb = 0; } -#if defined(HAS_MERGERGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - MergeRGBRow = MergeRGBRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - MergeRGBRow = MergeRGBRow_SSSE3; } } #endif @@ -1511,19 +1487,15 @@ static void SplitARGBPlaneAlpha(const uint8_t* src_argb, dst_stride_a = 0; } -#if defined(HAS_SPLITARGBROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - SplitARGBRow = SplitARGBRow_Any_SSE2; if (IS_ALIGNED(width, 8)) { - SplitARGBRow = SplitARGBRow_SSE2; } } #endif -#if defined(HAS_SPLITARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - SplitARGBRow = SplitARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - SplitARGBRow = SplitARGBRow_SSSE3; } } #endif @@ -1585,19 +1557,15 @@ static void SplitARGBPlaneOpaque(const uint8_t* src_argb, src_stride_argb = dst_stride_r = dst_stride_g = dst_stride_b = 0; } -#if defined(HAS_SPLITXRGBROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - SplitXRGBRow = SplitXRGBRow_Any_SSE2; if (IS_ALIGNED(width, 8)) { - SplitXRGBRow = SplitXRGBRow_SSE2; } } #endif -#if defined(HAS_SPLITXRGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - SplitXRGBRow = SplitXRGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - SplitXRGBRow = SplitXRGBRow_SSSE3; } } #endif @@ -1698,11 +1666,9 @@ static void MergeARGBPlaneAlpha(const uint8_t* src_r, src_stride_r = src_stride_g = src_stride_b = src_stride_a = dst_stride_argb = 0; } -#if defined(HAS_MERGEARGBROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - MergeARGBRow = MergeARGBRow_Any_SSE2; if (IS_ALIGNED(width, 8)) { - MergeARGBRow = MergeARGBRow_SSE2; } } #endif @@ -1765,11 +1731,9 @@ static void MergeARGBPlaneOpaque(const uint8_t* src_r, height = 1; src_stride_r = src_stride_g = src_stride_b = dst_stride_argb = 0; } -#if defined(HAS_MERGEXRGBROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - MergeXRGBRow = MergeXRGBRow_Any_SSE2; if (IS_ALIGNED(width, 8)) { - MergeXRGBRow = MergeXRGBRow_SSE2; } } #endif @@ -2205,13 +2169,9 @@ int YUY2ToI422(const uint8_t* src_yuy2, height = 1; src_stride_yuy2 = dst_stride_y = dst_stride_u = dst_stride_v = 0; } -#if defined(HAS_YUY2TOYROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2; - YUY2ToYRow = YUY2ToYRow_Any_SSE2; if (IS_ALIGNED(width, 16)) { - YUY2ToUV422Row = YUY2ToUV422Row_SSE2; - YUY2ToYRow = YUY2ToYRow_SSE2; } } #endif @@ -2301,13 +2261,9 @@ int UYVYToI422(const uint8_t* src_uyvy, height = 1; src_stride_uyvy = dst_stride_y = dst_stride_u = dst_stride_v = 0; } -#if defined(HAS_UYVYTOYROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - UYVYToUV422Row = UYVYToUV422Row_Any_SSE2; - UYVYToYRow = UYVYToYRow_Any_SSE2; if (IS_ALIGNED(width, 16)) { - UYVYToUV422Row = UYVYToUV422Row_SSE2; - UYVYToYRow = UYVYToYRow_SSE2; } } #endif @@ -2389,11 +2345,9 @@ int YUY2ToY(const uint8_t* src_yuy2, height = 1; src_stride_yuy2 = dst_stride_y = 0; } -#if defined(HAS_YUY2TOYROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - YUY2ToYRow = YUY2ToYRow_Any_SSE2; if (IS_ALIGNED(width, 16)) { - YUY2ToYRow = YUY2ToYRow_SSE2; } } #endif @@ -2448,11 +2402,9 @@ int UYVYToY(const uint8_t* src_uyvy, height = 1; src_stride_uyvy = dst_stride_y = 0; } -#if defined(HAS_UYVYTOYROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - UYVYToYRow = UYVYToYRow_Any_SSE2; if (IS_ALIGNED(width, 16)) { - UYVYToYRow = UYVYToYRow_SSE2; } } #endif @@ -2514,11 +2466,9 @@ void MirrorPlane(const uint8_t* src_y, } } #endif -#if defined(HAS_MIRRORROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - MirrorRow = MirrorRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - MirrorRow = MirrorRow_SSSE3; } } #endif @@ -2580,11 +2530,9 @@ void MirrorUVPlane(const uint8_t* src_uv, } } #endif -#if defined(HAS_MIRRORUVROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - MirrorUVRow = MirrorUVRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - MirrorUVRow = MirrorUVRow_SSSE3; } } #endif @@ -2752,11 +2700,9 @@ int ARGBMirror(const uint8_t* src_argb, } } #endif -#if defined(HAS_ARGBMIRRORROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - ARGBMirrorRow = ARGBMirrorRow_Any_SSE2; if (IS_ALIGNED(width, 4)) { - ARGBMirrorRow = ARGBMirrorRow_SSE2; } } #endif @@ -2822,11 +2768,9 @@ int RGB24Mirror(const uint8_t* src_rgb24, } } #endif -#if defined(HAS_RGB24MIRRORROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - RGB24MirrorRow = RGB24MirrorRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - RGB24MirrorRow = RGB24MirrorRow_SSSE3; } } #endif @@ -2869,9 +2813,8 @@ int ARGBBlend(const uint8_t* src_argb0, height = 1; src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0; } -#if defined(HAS_ARGBBLENDROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBBlendRow = ARGBBlendRow_SSSE3; } #endif #if defined(HAS_ARGBBLENDROW_NEON) @@ -2932,11 +2875,9 @@ int BlendPlane(const uint8_t* src_y0, src_stride_y0 = src_stride_y1 = alpha_stride = dst_stride_y = 0; } -#if defined(HAS_BLENDPLANEROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - BlendPlaneRow = BlendPlaneRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - BlendPlaneRow = BlendPlaneRow_SSSE3; } } #endif @@ -3014,11 +2955,9 @@ int I420Blend(const uint8_t* src_y0, BlendPlane(src_y0, src_stride_y0, src_y1, src_stride_y1, alpha, alpha_stride, dst_y, dst_stride_y, width, height); -#if defined(HAS_BLENDPLANEROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - BlendPlaneRow = BlendPlaneRow_Any_SSSE3; if (IS_ALIGNED(halfwidth, 8)) { - BlendPlaneRow = BlendPlaneRow_SSSE3; } } #endif @@ -3049,13 +2988,10 @@ int I420Blend(const uint8_t* src_y0, } } #endif -#if defined(HAS_SCALEROWDOWN2_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ScaleRowDown2 = ScaleRowDown2Box_Odd_SSSE3; if (IS_ALIGNED(width, 2)) { - ScaleRowDown2 = ScaleRowDown2Box_Any_SSSE3; if (IS_ALIGNED(halfwidth, 16)) { - ScaleRowDown2 = ScaleRowDown2Box_SSSE3; } } } @@ -3131,11 +3067,9 @@ int ARGBMultiply(const uint8_t* src_argb0, height = 1; src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0; } -#if defined(HAS_ARGBMULTIPLYROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - ARGBMultiplyRow = ARGBMultiplyRow_Any_SSE2; if (IS_ALIGNED(width, 4)) { - ARGBMultiplyRow = ARGBMultiplyRow_SSE2; } } #endif @@ -3216,16 +3150,13 @@ int ARGBAdd(const uint8_t* src_argb0, height = 1; src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0; } -#if defined(HAS_ARGBADDROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - ARGBAddRow = ARGBAddRow_SSE2; } #endif -#if defined(HAS_ARGBADDROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - ARGBAddRow = ARGBAddRow_Any_SSE2; if (IS_ALIGNED(width, 4)) { - ARGBAddRow = ARGBAddRow_SSE2; } } #endif @@ -3301,11 +3232,9 @@ int ARGBSubtract(const uint8_t* src_argb0, height = 1; src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0; } -#if defined(HAS_ARGBSUBTRACTROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - ARGBSubtractRow = ARGBSubtractRow_Any_SSE2; if (IS_ALIGNED(width, 4)) { - ARGBSubtractRow = ARGBSubtractRow_SSE2; } } #endif @@ -3378,11 +3307,9 @@ int RAWToRGB24(const uint8_t* src_raw, height = 1; src_stride_raw = dst_stride_rgb24 = 0; } -#if defined(HAS_RAWTORGB24ROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - RAWToRGB24Row = RAWToRGB24Row_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - RAWToRGB24Row = RAWToRGB24Row_SSSE3; } } #endif @@ -3610,11 +3537,9 @@ int ARGBAttenuate(const uint8_t* src_argb, height = 1; src_stride_argb = dst_stride_argb = 0; } -#if defined(HAS_ARGBATTENUATEROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; if (IS_ALIGNED(width, 4)) { - ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; } } #endif @@ -3689,11 +3614,9 @@ int ARGBUnattenuate(const uint8_t* src_argb, height = 1; src_stride_argb = dst_stride_argb = 0; } -#if defined(HAS_ARGBUNATTENUATEROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - ARGBUnattenuateRow = ARGBUnattenuateRow_Any_SSE2; if (IS_ALIGNED(width, 4)) { - ARGBUnattenuateRow = ARGBUnattenuateRow_SSE2; } } #endif @@ -3740,9 +3663,8 @@ int ARGBGrayTo(const uint8_t* src_argb, height = 1; src_stride_argb = dst_stride_argb = 0; } -#if defined(HAS_ARGBGRAYROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) { - ARGBGrayRow = ARGBGrayRow_SSSE3; } #endif #if defined(HAS_ARGBGRAYROW_NEON) @@ -3795,9 +3717,8 @@ int ARGBGray(uint8_t* dst_argb, height = 1; dst_stride_argb = 0; } -#if defined(HAS_ARGBGRAYROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) { - ARGBGrayRow = ARGBGrayRow_SSSE3; } #endif #if defined(HAS_ARGBGRAYROW_NEON) @@ -3848,9 +3769,8 @@ int ARGBSepia(uint8_t* dst_argb, height = 1; dst_stride_argb = 0; } -#if defined(HAS_ARGBSEPIAROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) { - ARGBSepiaRow = ARGBSepiaRow_SSSE3; } #endif #if defined(HAS_ARGBSEPIAROW_NEON) @@ -3909,9 +3829,8 @@ int ARGBColorMatrix(const uint8_t* src_argb, height = 1; src_stride_argb = dst_stride_argb = 0; } -#if defined(HAS_ARGBCOLORMATRIXROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) { - ARGBColorMatrixRow = ARGBColorMatrixRow_SSSE3; } #endif #if defined(HAS_ARGBCOLORMATRIXROW_NEON) @@ -4079,9 +3998,8 @@ int ARGBQuantize(uint8_t* dst_argb, height = 1; dst_stride_argb = 0; } -#if defined(HAS_ARGBQUANTIZEROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) { - ARGBQuantizeRow = ARGBQuantizeRow_SSE2; } #endif #if defined(HAS_ARGBQUANTIZEROW_NEON) @@ -4118,9 +4036,8 @@ int ARGBComputeCumulativeSum(const uint8_t* src_argb, if (!dst_cumsum || !src_argb || width <= 0 || height <= 0) { return -1; } -#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - ComputeCumulativeSumRow = ComputeCumulativeSumRow_SSE2; } #endif @@ -4176,10 +4093,8 @@ int ARGBBlur(const uint8_t* src_argb, if (radius <= 0 || height <= 1) { return -1; } -#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - ComputeCumulativeSumRow = ComputeCumulativeSumRow_SSE2; - CumulativeSumToAverageRow = CumulativeSumToAverageRow_SSE2; } #endif // Compute enough CumulativeSum for first row to be blurred. After this @@ -4273,9 +4188,8 @@ int ARGBShade(const uint8_t* src_argb, height = 1; src_stride_argb = dst_stride_argb = 0; } -#if defined(HAS_ARGBSHADEROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) { - ARGBShadeRow = ARGBShadeRow_SSE2; } #endif #if defined(HAS_ARGBSHADEROW_NEON) @@ -4332,11 +4246,9 @@ int InterpolatePlane(const uint8_t* src0, height = 1; src_stride0 = src_stride1 = dst_stride = 0; } -#if defined(HAS_INTERPOLATEROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - InterpolateRow = InterpolateRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - InterpolateRow = InterpolateRow_SSSE3; } } #endif @@ -4414,11 +4326,9 @@ int InterpolatePlane_16(const uint16_t* src0, height = 1; src_stride0 = src_stride1 = dst_stride = 0; } -#if defined(HAS_INTERPOLATEROW_16_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - InterpolateRow_16 = InterpolateRow_16_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - InterpolateRow_16 = InterpolateRow_16_SSSE3; } } #endif @@ -4544,11 +4454,9 @@ int ARGBShuffle(const uint8_t* src_argb, height = 1; src_stride_argb = dst_stride_argb = 0; } -#if defined(HAS_ARGBSHUFFLEROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBShuffleRow = ARGBShuffleRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - ARGBShuffleRow = ARGBShuffleRow_SSSE3; } } #endif @@ -4621,11 +4529,9 @@ int AR64Shuffle(const uint16_t* src_ar64, src_stride_ar64 = dst_stride_ar64 = 0; } // Assembly versions can be reused if it's implemented with shuffle. -#if defined(HAS_ARGBSHUFFLEROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - AR64ShuffleRow = ARGBShuffleRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - AR64ShuffleRow = ARGBShuffleRow_SSSE3; } } #endif @@ -4761,11 +4667,9 @@ static int ARGBSobelize(const uint8_t* src_argb, src_stride_argb = -src_stride_argb; } -#if defined(HAS_ARGBTOYJROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYJRow = ARGBToYJRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYJRow = ARGBToYJRow_SSSE3; } } #endif @@ -4815,9 +4719,8 @@ static int ARGBSobelize(const uint8_t* src_argb, } #endif -#if defined(HAS_SOBELYROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - SobelYRow = SobelYRow_SSE2; } #endif #if defined(HAS_SOBELYROW_NEON) @@ -4825,9 +4728,8 @@ static int ARGBSobelize(const uint8_t* src_argb, SobelYRow = SobelYRow_NEON; } #endif -#if defined(HAS_SOBELXROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - SobelXRow = SobelXRow_SSE2; } #endif #if defined(HAS_SOBELXROW_NEON) @@ -4895,11 +4797,9 @@ int ARGBSobel(const uint8_t* src_argb, int height) { void (*SobelRow)(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_argb, int width) = SobelRow_C; -#if defined(HAS_SOBELROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - SobelRow = SobelRow_Any_SSE2; if (IS_ALIGNED(width, 16)) { - SobelRow = SobelRow_SSE2; } } #endif @@ -4933,11 +4833,9 @@ int ARGBSobelToPlane(const uint8_t* src_argb, int height) { void (*SobelToPlaneRow)(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_, int width) = SobelToPlaneRow_C; -#if defined(HAS_SOBELTOPLANEROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - SobelToPlaneRow = SobelToPlaneRow_Any_SSE2; if (IS_ALIGNED(width, 16)) { - SobelToPlaneRow = SobelToPlaneRow_SSE2; } } #endif @@ -4972,11 +4870,9 @@ int ARGBSobelXY(const uint8_t* src_argb, int height) { void (*SobelXYRow)(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_argb, int width) = SobelXYRow_C; -#if defined(HAS_SOBELXYROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - SobelXYRow = SobelXYRow_Any_SSE2; if (IS_ALIGNED(width, 16)) { - SobelXYRow = SobelXYRow_SSE2; } } #endif @@ -5027,9 +4923,8 @@ int ARGBPolynomial(const uint8_t* src_argb, height = 1; src_stride_argb = dst_stride_argb = 0; } -#if defined(HAS_ARGBPOLYNOMIALROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 2)) { - ARGBPolynomialRow = ARGBPolynomialRow_SSE2; } #endif #if defined(HAS_ARGBPOLYNOMIALROW_AVX2) @@ -5077,11 +4972,9 @@ int HalfFloatPlane(const uint16_t* src_y, height = 1; src_stride_y = dst_stride_y = 0; } -#if defined(HAS_HALFFLOATROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - HalfFloatRow = HalfFloatRow_Any_SSE2; if (IS_ALIGNED(width, 8)) { - HalfFloatRow = HalfFloatRow_SSE2; } } #endif @@ -5189,9 +5082,8 @@ int ARGBLumaColorTable(const uint8_t* src_argb, height = 1; src_stride_argb = dst_stride_argb = 0; } -#if defined(HAS_ARGBLUMACOLORTABLEROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4)) { - ARGBLumaColorTableRow = ARGBLumaColorTableRow_SSSE3; } #endif @@ -5229,11 +5121,9 @@ int ARGBCopyAlpha(const uint8_t* src_argb, height = 1; src_stride_argb = dst_stride_argb = 0; } -#if defined(HAS_ARGBCOPYALPHAROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - ARGBCopyAlphaRow = ARGBCopyAlphaRow_Any_SSE2; if (IS_ALIGNED(width, 8)) { - ARGBCopyAlphaRow = ARGBCopyAlphaRow_SSE2; } } #endif @@ -5279,7 +5169,7 @@ int ARGBExtractAlpha(const uint8_t* src_argb, } void (*ARGBExtractAlphaRow)(const uint8_t* src_argb, uint8_t* dst_a, int width) = ARGBExtractAlphaRow_C; -#if defined(HAS_ARGBEXTRACTALPHAROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { ARGBExtractAlphaRow = IS_ALIGNED(width, 8) ? ARGBExtractAlphaRow_SSE2 : ARGBExtractAlphaRow_Any_SSE2; @@ -5343,11 +5233,9 @@ int ARGBCopyYToAlpha(const uint8_t* src_y, height = 1; src_stride_y = dst_stride_argb = 0; } -#if defined(HAS_ARGBCOPYYTOALPHAROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_Any_SSE2; if (IS_ALIGNED(width, 8)) { - ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_SSE2; } } #endif @@ -5397,11 +5285,9 @@ int YUY2ToNV12(const uint8_t* src_yuy2, src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2; src_stride_yuy2 = -src_stride_yuy2; } -#if defined(HAS_YUY2TOYROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - YUY2ToYRow = YUY2ToYRow_Any_SSE2; if (IS_ALIGNED(width, 16)) { - YUY2ToYRow = YUY2ToYRow_SSE2; } } #endif @@ -5438,11 +5324,9 @@ int YUY2ToNV12(const uint8_t* src_yuy2, } #endif -#if defined(HAS_YUY2TONVUVROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - YUY2ToNVUVRow = YUY2ToNVUVRow_Any_SSE2; if (IS_ALIGNED(width, 16)) { - YUY2ToNVUVRow = YUY2ToNVUVRow_SSE2; } } #endif @@ -5505,11 +5389,9 @@ int UYVYToNV12(const uint8_t* src_uyvy, src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy; src_stride_uyvy = -src_stride_uyvy; } -#if defined(HAS_SPLITUVROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - SplitUVRow = SplitUVRow_Any_SSE2; if (IS_ALIGNED(width, 16)) { - SplitUVRow = SplitUVRow_SSE2; } } #endif @@ -5543,11 +5425,9 @@ int UYVYToNV12(const uint8_t* src_uyvy, } #endif -#if defined(HAS_INTERPOLATEROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - InterpolateRow = InterpolateRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - InterpolateRow = InterpolateRow_SSSE3; } } #endif @@ -5642,9 +5522,8 @@ void HalfMergeUVPlane(const uint8_t* src_u, HalfMergeUVRow = HalfMergeUVRow_NEON; } #endif -#if defined(HAS_HALFMERGEUVROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) { - HalfMergeUVRow = HalfMergeUVRow_SSSE3; } #endif #if defined(HAS_HALFMERGEUVROW_AVX2) diff --git a/source/rotate.cc b/source/rotate.cc index d4a9fcd27..3db253a52 100644 --- a/source/rotate.cc +++ b/source/rotate.cc @@ -64,19 +64,15 @@ void TransposePlane(const uint8_t* src, TransposeWxH = TransposeWxH_SME; } #endif -#if defined(HAS_TRANSPOSEWX8_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - TransposeWx8 = TransposeWx8_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - TransposeWx8 = TransposeWx8_SSSE3; } } #endif -#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - TransposeWx8 = TransposeWx8_Fast_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - TransposeWx8 = TransposeWx8_Fast_SSSE3; } } #endif @@ -174,11 +170,9 @@ void RotatePlane180(const uint8_t* src, } } #endif -#if defined(HAS_MIRRORROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - MirrorRow = MirrorRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - MirrorRow = MirrorRow_SSSE3; } } #endif @@ -206,7 +200,7 @@ void RotatePlane180(const uint8_t* src, } } #endif -#if defined(HAS_COPYROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2; } @@ -301,11 +295,9 @@ void SplitTransposeUV(const uint8_t* src, TransposeUVWxH = TransposeUVWxH_SME; } #endif -#if defined(HAS_TRANSPOSEUVWX8_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - TransposeUVWx8 = TransposeUVWx8_Any_SSE2; if (IS_ALIGNED(width, 8)) { - TransposeUVWx8 = TransposeUVWx8_SSE2; } } #endif @@ -397,9 +389,8 @@ void SplitRotateUV180(const uint8_t* src, MirrorSplitUVRow = MirrorSplitUVRow_NEON; } #endif -#if defined(HAS_MIRRORSPLITUVROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) { - MirrorSplitUVRow = MirrorSplitUVRow_SSSE3; } #endif #if defined(HAS_MIRRORSPLITUVROW_LSX) diff --git a/source/rotate_any.cc b/source/rotate_any.cc index bf62c067b..6756d362f 100644 --- a/source/rotate_any.cc +++ b/source/rotate_any.cc @@ -35,12 +35,6 @@ TANY(TransposeWx8_Any_NEON, TransposeWx8_NEON, TransposeWx8_C, 7) #ifdef HAS_TRANSPOSEWX16_NEON TANY(TransposeWx16_Any_NEON, TransposeWx16_NEON, TransposeWx16_C, 15) #endif -#ifdef HAS_TRANSPOSEWX8_SSSE3 -TANY(TransposeWx8_Any_SSSE3, TransposeWx8_SSSE3, TransposeWx8_C, 7) -#endif -#ifdef HAS_TRANSPOSEWX8_FAST_SSSE3 -TANY(TransposeWx8_Fast_Any_SSSE3, TransposeWx8_Fast_SSSE3, TransposeWx8_C, 15) -#endif #ifdef HAS_TRANSPOSEWX16_LSX TANY(TransposeWx16_Any_LSX, TransposeWx16_LSX, TransposeWx16_C, 15) #endif @@ -62,9 +56,6 @@ TANY(TransposeWx16_Any_LSX, TransposeWx16_LSX, TransposeWx16_C, 15) #ifdef HAS_TRANSPOSEUVWX8_NEON TUVANY(TransposeUVWx8_Any_NEON, TransposeUVWx8_NEON, 7) #endif -#ifdef HAS_TRANSPOSEUVWX8_SSE2 -TUVANY(TransposeUVWx8_Any_SSE2, TransposeUVWx8_SSE2, 7) -#endif #ifdef HAS_TRANSPOSEUVWX16_LSX TUVANY(TransposeUVWx16_Any_LSX, TransposeUVWx16_LSX, 7) #endif diff --git a/source/rotate_argb.cc b/source/rotate_argb.cc index 8c76ca919..546430fb0 100644 --- a/source/rotate_argb.cc +++ b/source/rotate_argb.cc @@ -37,11 +37,9 @@ static int ARGBTranspose(const uint8_t* src_argb, if (src_stride_argb & 3) { return -1; } -#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_SSE2; if (IS_ALIGNED(height, 4)) { // Width of dest. - ScaleARGBRowDownEven = ScaleARGBRowDownEven_SSE2; } } #endif @@ -131,11 +129,9 @@ static int ARGBRotate180(const uint8_t* src_argb, } } #endif -#if defined(HAS_ARGBMIRRORROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - ARGBMirrorRow = ARGBMirrorRow_Any_SSE2; if (IS_ALIGNED(width, 4)) { - ARGBMirrorRow = ARGBMirrorRow_SSE2; } } #endif @@ -163,7 +159,7 @@ static int ARGBRotate180(const uint8_t* src_argb, } } #endif -#if defined(HAS_COPYROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2; } diff --git a/source/rotate_gcc.cc b/source/rotate_gcc.cc index 9847ecd48..1f8f49550 100644 --- a/source/rotate_gcc.cc +++ b/source/rotate_gcc.cc @@ -22,424 +22,11 @@ extern "C" { !defined(LIBYUV_ENABLE_ROWWIN) // Transpose 8x8. 32 or 64 bit, but not NaCL for 64 bit. -#if defined(HAS_TRANSPOSEWX8_SSSE3) -void TransposeWx8_SSSE3(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width) { - asm volatile( - // Read in the data from the source pointer. - // First round of bit swap. - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" - "movq (%0,%3),%%xmm1 \n" - "lea (%0,%3,2),%0 \n" - "punpcklbw %%xmm1,%%xmm0 \n" - "movq (%0),%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "palignr $0x8,%%xmm1,%%xmm1 \n" - "movq (%0,%3),%%xmm3 \n" - "lea (%0,%3,2),%0 \n" - "punpcklbw %%xmm3,%%xmm2 \n" - "movdqa %%xmm2,%%xmm3 \n" - "movq (%0),%%xmm4 \n" - "palignr $0x8,%%xmm3,%%xmm3 \n" - "movq (%0,%3),%%xmm5 \n" - "lea (%0,%3,2),%0 \n" - "punpcklbw %%xmm5,%%xmm4 \n" - "movdqa %%xmm4,%%xmm5 \n" - "movq (%0),%%xmm6 \n" - "palignr $0x8,%%xmm5,%%xmm5 \n" - "movq (%0,%3),%%xmm7 \n" - "lea (%0,%3,2),%0 \n" - "punpcklbw %%xmm7,%%xmm6 \n" - "neg %3 \n" - "movdqa %%xmm6,%%xmm7 \n" - "lea 0x8(%0,%3,8),%0 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - "neg %3 \n" - // Second round of bit swap. - "punpcklwd %%xmm2,%%xmm0 \n" - "punpcklwd %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - "palignr $0x8,%%xmm2,%%xmm2 \n" - "palignr $0x8,%%xmm3,%%xmm3 \n" - "punpcklwd %%xmm6,%%xmm4 \n" - "punpcklwd %%xmm7,%%xmm5 \n" - "movdqa %%xmm4,%%xmm6 \n" - "movdqa %%xmm5,%%xmm7 \n" - "palignr $0x8,%%xmm6,%%xmm6 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - // Third round of bit swap. - // Write to the destination pointer. - "punpckldq %%xmm4,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "movdqa %%xmm0,%%xmm4 \n" - "palignr $0x8,%%xmm4,%%xmm4 \n" - "movq %%xmm4,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm6,%%xmm2 \n" - "movdqa %%xmm2,%%xmm6 \n" - "movq %%xmm2,(%1) \n" - "palignr $0x8,%%xmm6,%%xmm6 \n" - "punpckldq %%xmm5,%%xmm1 \n" - "movq %%xmm6,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "movdqa %%xmm1,%%xmm5 \n" - "movq %%xmm1,(%1) \n" - "palignr $0x8,%%xmm5,%%xmm5 \n" - "movq %%xmm5,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm7,%%xmm3 \n" - "movq %%xmm3,(%1) \n" - "movdqa %%xmm3,%%xmm7 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - "sub $0x8,%2 \n" - "movq %%xmm7,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "r"((ptrdiff_t)(src_stride)), // %3 - "r"((ptrdiff_t)(dst_stride)) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif // defined(HAS_TRANSPOSEWX8_SSSE3) // Transpose 16x8. 64 bit -#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3) -void TransposeWx8_Fast_SSSE3(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width) { - asm volatile( - // Read in the data from the source pointer. - // First round of bit swap. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu (%0,%3),%%xmm1 \n" - "lea (%0,%3,2),%0 \n" - "movdqa %%xmm0,%%xmm8 \n" - "punpcklbw %%xmm1,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm8 \n" - "movdqu (%0),%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm8,%%xmm9 \n" - "palignr $0x8,%%xmm1,%%xmm1 \n" - "palignr $0x8,%%xmm9,%%xmm9 \n" - "movdqu (%0,%3),%%xmm3 \n" - "lea (%0,%3,2),%0 \n" - "movdqa %%xmm2,%%xmm10 \n" - "punpcklbw %%xmm3,%%xmm2 \n" - "punpckhbw %%xmm3,%%xmm10 \n" - "movdqa %%xmm2,%%xmm3 \n" - "movdqa %%xmm10,%%xmm11 \n" - "movdqu (%0),%%xmm4 \n" - "palignr $0x8,%%xmm3,%%xmm3 \n" - "palignr $0x8,%%xmm11,%%xmm11 \n" - "movdqu (%0,%3),%%xmm5 \n" - "lea (%0,%3,2),%0 \n" - "movdqa %%xmm4,%%xmm12 \n" - "punpcklbw %%xmm5,%%xmm4 \n" - "punpckhbw %%xmm5,%%xmm12 \n" - "movdqa %%xmm4,%%xmm5 \n" - "movdqa %%xmm12,%%xmm13 \n" - "movdqu (%0),%%xmm6 \n" - "palignr $0x8,%%xmm5,%%xmm5 \n" - "palignr $0x8,%%xmm13,%%xmm13 \n" - "movdqu (%0,%3),%%xmm7 \n" - "lea (%0,%3,2),%0 \n" - "movdqa %%xmm6,%%xmm14 \n" - "punpcklbw %%xmm7,%%xmm6 \n" - "punpckhbw %%xmm7,%%xmm14 \n" - "neg %3 \n" - "movdqa %%xmm6,%%xmm7 \n" - "movdqa %%xmm14,%%xmm15 \n" - "lea 0x10(%0,%3,8),%0 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - "palignr $0x8,%%xmm15,%%xmm15 \n" - "neg %3 \n" - // Second round of bit swap. - "punpcklwd %%xmm2,%%xmm0 \n" - "punpcklwd %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - "palignr $0x8,%%xmm2,%%xmm2 \n" - "palignr $0x8,%%xmm3,%%xmm3 \n" - "punpcklwd %%xmm6,%%xmm4 \n" - "punpcklwd %%xmm7,%%xmm5 \n" - "movdqa %%xmm4,%%xmm6 \n" - "movdqa %%xmm5,%%xmm7 \n" - "palignr $0x8,%%xmm6,%%xmm6 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - "punpcklwd %%xmm10,%%xmm8 \n" - "punpcklwd %%xmm11,%%xmm9 \n" - "movdqa %%xmm8,%%xmm10 \n" - "movdqa %%xmm9,%%xmm11 \n" - "palignr $0x8,%%xmm10,%%xmm10 \n" - "palignr $0x8,%%xmm11,%%xmm11 \n" - "punpcklwd %%xmm14,%%xmm12 \n" - "punpcklwd %%xmm15,%%xmm13 \n" - "movdqa %%xmm12,%%xmm14 \n" - "movdqa %%xmm13,%%xmm15 \n" - "palignr $0x8,%%xmm14,%%xmm14 \n" - "palignr $0x8,%%xmm15,%%xmm15 \n" - // Third round of bit swap. - // Write to the destination pointer. - "punpckldq %%xmm4,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "movdqa %%xmm0,%%xmm4 \n" - "palignr $0x8,%%xmm4,%%xmm4 \n" - "movq %%xmm4,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm6,%%xmm2 \n" - "movdqa %%xmm2,%%xmm6 \n" - "movq %%xmm2,(%1) \n" - "palignr $0x8,%%xmm6,%%xmm6 \n" - "punpckldq %%xmm5,%%xmm1 \n" - "movq %%xmm6,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "movdqa %%xmm1,%%xmm5 \n" - "movq %%xmm1,(%1) \n" - "palignr $0x8,%%xmm5,%%xmm5 \n" - "movq %%xmm5,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm7,%%xmm3 \n" - "movq %%xmm3,(%1) \n" - "movdqa %%xmm3,%%xmm7 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - "movq %%xmm7,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm12,%%xmm8 \n" - "movq %%xmm8,(%1) \n" - "movdqa %%xmm8,%%xmm12 \n" - "palignr $0x8,%%xmm12,%%xmm12 \n" - "movq %%xmm12,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm14,%%xmm10 \n" - "movdqa %%xmm10,%%xmm14 \n" - "movq %%xmm10,(%1) \n" - "palignr $0x8,%%xmm14,%%xmm14 \n" - "punpckldq %%xmm13,%%xmm9 \n" - "movq %%xmm14,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "movdqa %%xmm9,%%xmm13 \n" - "movq %%xmm9,(%1) \n" - "palignr $0x8,%%xmm13,%%xmm13 \n" - "movq %%xmm13,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm15,%%xmm11 \n" - "movq %%xmm11,(%1) \n" - "movdqa %%xmm11,%%xmm15 \n" - "palignr $0x8,%%xmm15,%%xmm15 \n" - "sub $0x10,%2 \n" - "movq %%xmm15,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "r"((ptrdiff_t)(src_stride)), // %3 - "r"((ptrdiff_t)(dst_stride)) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", - "xmm15"); -} -#endif // defined(HAS_TRANSPOSEWX8_FAST_SSSE3) // Transpose UV 8x8. 64 bit. -#if defined(HAS_TRANSPOSEUVWX8_SSE2) -void TransposeUVWx8_SSE2(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int width) { - asm volatile( - // Read in the data from the source pointer. - // First round of bit swap. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu (%0,%4),%%xmm1 \n" - "lea (%0,%4,2),%0 \n" - "movdqa %%xmm0,%%xmm8 \n" - "punpcklbw %%xmm1,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm8 \n" - "movdqa %%xmm8,%%xmm1 \n" - "movdqu (%0),%%xmm2 \n" - "movdqu (%0,%4),%%xmm3 \n" - "lea (%0,%4,2),%0 \n" - "movdqa %%xmm2,%%xmm8 \n" - "punpcklbw %%xmm3,%%xmm2 \n" - "punpckhbw %%xmm3,%%xmm8 \n" - "movdqa %%xmm8,%%xmm3 \n" - "movdqu (%0),%%xmm4 \n" - "movdqu (%0,%4),%%xmm5 \n" - "lea (%0,%4,2),%0 \n" - "movdqa %%xmm4,%%xmm8 \n" - "punpcklbw %%xmm5,%%xmm4 \n" - "punpckhbw %%xmm5,%%xmm8 \n" - "movdqa %%xmm8,%%xmm5 \n" - "movdqu (%0),%%xmm6 \n" - "movdqu (%0,%4),%%xmm7 \n" - "lea (%0,%4,2),%0 \n" - "movdqa %%xmm6,%%xmm8 \n" - "punpcklbw %%xmm7,%%xmm6 \n" - "neg %4 \n" - "lea 0x10(%0,%4,8),%0 \n" - "punpckhbw %%xmm7,%%xmm8 \n" - "movdqa %%xmm8,%%xmm7 \n" - "neg %4 \n" - // Second round of bit swap. - "movdqa %%xmm0,%%xmm8 \n" - "movdqa %%xmm1,%%xmm9 \n" - "punpckhwd %%xmm2,%%xmm8 \n" - "punpckhwd %%xmm3,%%xmm9 \n" - "punpcklwd %%xmm2,%%xmm0 \n" - "punpcklwd %%xmm3,%%xmm1 \n" - "movdqa %%xmm8,%%xmm2 \n" - "movdqa %%xmm9,%%xmm3 \n" - "movdqa %%xmm4,%%xmm8 \n" - "movdqa %%xmm5,%%xmm9 \n" - "punpckhwd %%xmm6,%%xmm8 \n" - "punpckhwd %%xmm7,%%xmm9 \n" - "punpcklwd %%xmm6,%%xmm4 \n" - "punpcklwd %%xmm7,%%xmm5 \n" - "movdqa %%xmm8,%%xmm6 \n" - "movdqa %%xmm9,%%xmm7 \n" - // Third round of bit swap. - // Write to the destination pointer. - "movdqa %%xmm0,%%xmm8 \n" - "punpckldq %%xmm4,%%xmm0 \n" - "movlpd %%xmm0,(%1) \n" // Write back U channel - "movhpd %%xmm0,(%2) \n" // Write back V channel - "punpckhdq %%xmm4,%%xmm8 \n" - "movlpd %%xmm8,(%1,%5) \n" - "lea (%1,%5,2),%1 \n" - "movhpd %%xmm8,(%2,%6) \n" - "lea (%2,%6,2),%2 \n" - "movdqa %%xmm2,%%xmm8 \n" - "punpckldq %%xmm6,%%xmm2 \n" - "movlpd %%xmm2,(%1) \n" - "movhpd %%xmm2,(%2) \n" - "punpckhdq %%xmm6,%%xmm8 \n" - "movlpd %%xmm8,(%1,%5) \n" - "lea (%1,%5,2),%1 \n" - "movhpd %%xmm8,(%2,%6) \n" - "lea (%2,%6,2),%2 \n" - "movdqa %%xmm1,%%xmm8 \n" - "punpckldq %%xmm5,%%xmm1 \n" - "movlpd %%xmm1,(%1) \n" - "movhpd %%xmm1,(%2) \n" - "punpckhdq %%xmm5,%%xmm8 \n" - "movlpd %%xmm8,(%1,%5) \n" - "lea (%1,%5,2),%1 \n" - "movhpd %%xmm8,(%2,%6) \n" - "lea (%2,%6,2),%2 \n" - "movdqa %%xmm3,%%xmm8 \n" - "punpckldq %%xmm7,%%xmm3 \n" - "movlpd %%xmm3,(%1) \n" - "movhpd %%xmm3,(%2) \n" - "punpckhdq %%xmm7,%%xmm8 \n" - "sub $0x8,%3 \n" - "movlpd %%xmm8,(%1,%5) \n" - "lea (%1,%5,2),%1 \n" - "movhpd %%xmm8,(%2,%6) \n" - "lea (%2,%6,2),%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst_a), // %1 - "+r"(dst_b), // %2 - "+r"(width) // %3 - : "r"((ptrdiff_t)(src_stride)), // %4 - "r"((ptrdiff_t)(dst_stride_a)), // %5 - "r"((ptrdiff_t)(dst_stride_b)) // %6 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7", "xmm8", "xmm9"); -} -#endif // defined(HAS_TRANSPOSEUVWX8_SSE2) -#if defined(HAS_TRANSPOSE4X4_32_SSE2) -// 4 values, little endian view -// a b c d -// e f g h -// i j k l -// m n o p - -// transpose 2x2 -// a e b f from row 0, 1 -// i m j n from row 2, 3 -// c g d h from row 0, 1 -// k o l p from row 2, 3 - -// transpose 4x4 -// a e i m from row 0, 1 -// b f j n from row 0, 1 -// c g k o from row 2, 3 -// d h l p from row 2, 3 - -// Transpose 32 bit values (ARGB) -void Transpose4x4_32_SSE2(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width) { - asm volatile( - // Main loop transpose 4x4. Read a column, write a row. - "1: \n" - "movdqu (%0),%%xmm0 \n" // a b c d - "movdqu (%0,%3),%%xmm1 \n" // e f g h - "lea (%0,%3,2),%0 \n" // src += stride * 2 - "movdqu (%0),%%xmm2 \n" // i j k l - "movdqu (%0,%3),%%xmm3 \n" // m n o p - "lea (%0,%3,2),%0 \n" // src += stride * 2 - - // Transpose 2x2 - "movdqa %%xmm0,%%xmm4 \n" - "movdqa %%xmm2,%%xmm5 \n" - "movdqa %%xmm0,%%xmm6 \n" - "movdqa %%xmm2,%%xmm7 \n" - "punpckldq %%xmm1,%%xmm4 \n" // a e b f from row 0, 1 - "punpckldq %%xmm3,%%xmm5 \n" // i m j n from row 2, 3 - "punpckhdq %%xmm1,%%xmm6 \n" // c g d h from row 0, 1 - "punpckhdq %%xmm3,%%xmm7 \n" // k o l p from row 2, 3 - - // Transpose 4x4 - "movdqa %%xmm4,%%xmm0 \n" - "movdqa %%xmm4,%%xmm1 \n" - "movdqa %%xmm6,%%xmm2 \n" - "movdqa %%xmm6,%%xmm3 \n" - "punpcklqdq %%xmm5,%%xmm0 \n" // a e i m from row 0, 1 - "punpckhqdq %%xmm5,%%xmm1 \n" // b f j n from row 0, 1 - "punpcklqdq %%xmm7,%%xmm2 \n" // c g k o from row 2, 3 - "punpckhqdq %%xmm7,%%xmm3 \n" // d h l p from row 2, 3 - - "movdqu %%xmm0,(%1) \n" - "lea 16(%1,%4),%1 \n" // dst += stride + 16 - "movdqu %%xmm1,-16(%1) \n" - "movdqu %%xmm2,-16(%1,%4) \n" - "movdqu %%xmm3,-16(%1,%4,2) \n" - "sub %4,%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+rm"(width) // %2 - : "r"((ptrdiff_t)(src_stride)), // %3 - "r"((ptrdiff_t)(dst_stride)) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif // defined(HAS_TRANSPOSE4X4_32_SSE2) #if defined(HAS_TRANSPOSE4X4_32_AVX2) diff --git a/source/rotate_win.cc b/source/rotate_win.cc index 03eeee3a6..b5e8a1f14 100644 --- a/source/rotate_win.cc +++ b/source/rotate_win.cc @@ -20,230 +20,9 @@ extern "C" { #if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && defined(_M_IX86) && \ (!defined(__clang__) || defined(LIBYUV_ENABLE_ROWWIN)) -__declspec(naked) void TransposeWx8_SSSE3(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width) { - __asm { - push edi - push esi - push ebp - mov eax, [esp + 12 + 4] // src - mov edi, [esp + 12 + 8] // src_stride - mov edx, [esp + 12 + 12] // dst - mov esi, [esp + 12 + 16] // dst_stride - mov ecx, [esp + 12 + 20] // width - // Read in the data from the source pointer. - // First round of bit swap. - align 4 - convertloop: - movq xmm0, qword ptr [eax] - lea ebp, [eax + 8] - movq xmm1, qword ptr [eax + edi] - lea eax, [eax + 2 * edi] - punpcklbw xmm0, xmm1 - movq xmm2, qword ptr [eax] - movdqa xmm1, xmm0 - palignr xmm1, xmm1, 8 - movq xmm3, qword ptr [eax + edi] - lea eax, [eax + 2 * edi] - punpcklbw xmm2, xmm3 - movdqa xmm3, xmm2 - movq xmm4, qword ptr [eax] - palignr xmm3, xmm3, 8 - movq xmm5, qword ptr [eax + edi] - punpcklbw xmm4, xmm5 - lea eax, [eax + 2 * edi] - movdqa xmm5, xmm4 - movq xmm6, qword ptr [eax] - palignr xmm5, xmm5, 8 - movq xmm7, qword ptr [eax + edi] - punpcklbw xmm6, xmm7 - mov eax, ebp - movdqa xmm7, xmm6 - palignr xmm7, xmm7, 8 - // Second round of bit swap. - punpcklwd xmm0, xmm2 - punpcklwd xmm1, xmm3 - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - palignr xmm2, xmm2, 8 - palignr xmm3, xmm3, 8 - punpcklwd xmm4, xmm6 - punpcklwd xmm5, xmm7 - movdqa xmm6, xmm4 - movdqa xmm7, xmm5 - palignr xmm6, xmm6, 8 - palignr xmm7, xmm7, 8 - // Third round of bit swap. - // Write to the destination pointer. - punpckldq xmm0, xmm4 - movq qword ptr [edx], xmm0 - movdqa xmm4, xmm0 - palignr xmm4, xmm4, 8 - movq qword ptr [edx + esi], xmm4 - lea edx, [edx + 2 * esi] - punpckldq xmm2, xmm6 - movdqa xmm6, xmm2 - palignr xmm6, xmm6, 8 - movq qword ptr [edx], xmm2 - punpckldq xmm1, xmm5 - movq qword ptr [edx + esi], xmm6 - lea edx, [edx + 2 * esi] - movdqa xmm5, xmm1 - movq qword ptr [edx], xmm1 - palignr xmm5, xmm5, 8 - punpckldq xmm3, xmm7 - movq qword ptr [edx + esi], xmm5 - lea edx, [edx + 2 * esi] - movq qword ptr [edx], xmm3 - movdqa xmm7, xmm3 - palignr xmm7, xmm7, 8 - sub ecx, 8 - movq qword ptr [edx + esi], xmm7 - lea edx, [edx + 2 * esi] - jg convertloop - pop ebp - pop esi - pop edi - ret - } -} -__declspec(naked) void TransposeUVWx8_SSE2(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int w) { - __asm { - push ebx - push esi - push edi - push ebp - mov eax, [esp + 16 + 4] // src - mov edi, [esp + 16 + 8] // src_stride - mov edx, [esp + 16 + 12] // dst_a - mov esi, [esp + 16 + 16] // dst_stride_a - mov ebx, [esp + 16 + 20] // dst_b - mov ebp, [esp + 16 + 24] // dst_stride_b - mov ecx, esp - sub esp, 4 + 16 - and esp, ~15 - mov [esp + 16], ecx - mov ecx, [ecx + 16 + 28] // w - - align 4 - // Read in the data from the source pointer. - // First round of bit swap. - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + edi] - lea eax, [eax + 2 * edi] - movdqa xmm7, xmm0 // use xmm7 as temp register. - punpcklbw xmm0, xmm1 - punpckhbw xmm7, xmm1 - movdqa xmm1, xmm7 - movdqu xmm2, [eax] - movdqu xmm3, [eax + edi] - lea eax, [eax + 2 * edi] - movdqa xmm7, xmm2 - punpcklbw xmm2, xmm3 - punpckhbw xmm7, xmm3 - movdqa xmm3, xmm7 - movdqu xmm4, [eax] - movdqu xmm5, [eax + edi] - lea eax, [eax + 2 * edi] - movdqa xmm7, xmm4 - punpcklbw xmm4, xmm5 - punpckhbw xmm7, xmm5 - movdqa xmm5, xmm7 - movdqu xmm6, [eax] - movdqu xmm7, [eax + edi] - lea eax, [eax + 2 * edi] - movdqu [esp], xmm5 // backup xmm5 - neg edi - movdqa xmm5, xmm6 // use xmm5 as temp register. - punpcklbw xmm6, xmm7 - punpckhbw xmm5, xmm7 - movdqa xmm7, xmm5 - lea eax, [eax + 8 * edi + 16] - neg edi - // Second round of bit swap. - movdqa xmm5, xmm0 - punpcklwd xmm0, xmm2 - punpckhwd xmm5, xmm2 - movdqa xmm2, xmm5 - movdqa xmm5, xmm1 - punpcklwd xmm1, xmm3 - punpckhwd xmm5, xmm3 - movdqa xmm3, xmm5 - movdqa xmm5, xmm4 - punpcklwd xmm4, xmm6 - punpckhwd xmm5, xmm6 - movdqa xmm6, xmm5 - movdqu xmm5, [esp] // restore xmm5 - movdqu [esp], xmm6 // backup xmm6 - movdqa xmm6, xmm5 // use xmm6 as temp register. - punpcklwd xmm5, xmm7 - punpckhwd xmm6, xmm7 - movdqa xmm7, xmm6 - - // Third round of bit swap. - // Write to the destination pointer. - movdqa xmm6, xmm0 - punpckldq xmm0, xmm4 - punpckhdq xmm6, xmm4 - movdqa xmm4, xmm6 - movdqu xmm6, [esp] // restore xmm6 - movlpd qword ptr [edx], xmm0 - movhpd qword ptr [ebx], xmm0 - movlpd qword ptr [edx + esi], xmm4 - lea edx, [edx + 2 * esi] - movhpd qword ptr [ebx + ebp], xmm4 - lea ebx, [ebx + 2 * ebp] - movdqa xmm0, xmm2 // use xmm0 as the temp register. - punpckldq xmm2, xmm6 - movlpd qword ptr [edx], xmm2 - movhpd qword ptr [ebx], xmm2 - punpckhdq xmm0, xmm6 - movlpd qword ptr [edx + esi], xmm0 - lea edx, [edx + 2 * esi] - movhpd qword ptr [ebx + ebp], xmm0 - lea ebx, [ebx + 2 * ebp] - movdqa xmm0, xmm1 // use xmm0 as the temp register. - punpckldq xmm1, xmm5 - movlpd qword ptr [edx], xmm1 - movhpd qword ptr [ebx], xmm1 - punpckhdq xmm0, xmm5 - movlpd qword ptr [edx + esi], xmm0 - lea edx, [edx + 2 * esi] - movhpd qword ptr [ebx + ebp], xmm0 - lea ebx, [ebx + 2 * ebp] - movdqa xmm0, xmm3 // use xmm0 as the temp register. - punpckldq xmm3, xmm7 - movlpd qword ptr [edx], xmm3 - movhpd qword ptr [ebx], xmm3 - punpckhdq xmm0, xmm7 - sub ecx, 8 - movlpd qword ptr [edx + esi], xmm0 - lea edx, [edx + 2 * esi] - movhpd qword ptr [ebx + ebp], xmm0 - lea ebx, [ebx + 2 * ebp] - jg convertloop - - mov esp, [esp + 16] - pop ebp - pop edi - pop esi - pop ebx - ret - } -} #endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) diff --git a/source/row_any.cc b/source/row_any.cc index 82a4abe8d..cb9fdede0 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -53,9 +53,6 @@ extern "C" { memcpy(dst_ptr + (np >> DUVSHIFT) * BPP, vout, SS(r, DUVSHIFT) * BPP); \ } -#ifdef HAS_MERGEARGBROW_SSE2 -ANY41(MergeARGBRow_Any_SSE2, MergeARGBRow_SSE2, 0, 0, 4, 7) -#endif #ifdef HAS_MERGEARGBROW_AVX2 ANY41(MergeARGBRow_Any_AVX2, MergeARGBRow_AVX2, 0, 0, 4, 15) #endif @@ -92,15 +89,9 @@ ANY41(MergeARGBRow_Any_NEON, MergeARGBRow_NEON, 0, 0, 4, 15) memcpy(dst_ptr + (np >> DUVSHIFT) * BPP, vout, SS(r, DUVSHIFT) * BPP); \ } -#ifdef HAS_I444ALPHATOARGBROW_SSSE3 -ANY41C(I444AlphaToARGBRow_Any_SSSE3, I444AlphaToARGBRow_SSSE3, 0, 0, 4, 7) -#endif #ifdef HAS_I444ALPHATOARGBROW_AVX2 ANY41C(I444AlphaToARGBRow_Any_AVX2, I444AlphaToARGBRow_AVX2, 0, 0, 4, 15) #endif -#ifdef HAS_I422ALPHATOARGBROW_SSSE3 -ANY41C(I422AlphaToARGBRow_Any_SSSE3, I422AlphaToARGBRow_SSSE3, 1, 0, 4, 7) -#endif #ifdef HAS_I422ALPHATOARGBROW_AVX2 ANY41C(I422AlphaToARGBRow_Any_AVX2, I422AlphaToARGBRow_AVX2, 1, 0, 4, 15) #endif @@ -161,8 +152,7 @@ ANY41CT(I410AlphaToARGBRow_Any_NEON, 7) #endif -#ifdef HAS_I210ALPHATOARGBROW_SSSE3 -ANY41CT(I210AlphaToARGBRow_Any_SSSE3, +#if 0 I210AlphaToARGBRow_SSSE3, 1, 0, @@ -183,8 +173,7 @@ ANY41CT(I210AlphaToARGBRow_Any_AVX2, 15) #endif -#ifdef HAS_I410ALPHATOARGBROW_SSSE3 -ANY41CT(I410AlphaToARGBRow_Any_SSSE3, +#if 0 I410AlphaToARGBRow_SSSE3, 0, 0, @@ -279,25 +268,15 @@ ANY41PT(MergeARGB16To8Row_Any_NEON, } // Merge functions. -#ifdef HAS_MERGERGBROW_SSSE3 -ANY31(MergeRGBRow_Any_SSSE3, MergeRGBRow_SSSE3, 0, 0, 3, 15) -#endif #ifdef HAS_MERGERGBROW_NEON ANY31(MergeRGBRow_Any_NEON, MergeRGBRow_NEON, 0, 0, 3, 15) #endif -#ifdef HAS_MERGEXRGBROW_SSE2 -ANY31(MergeXRGBRow_Any_SSE2, MergeXRGBRow_SSE2, 0, 0, 4, 7) -#endif #ifdef HAS_MERGEXRGBROW_AVX2 ANY31(MergeXRGBRow_Any_AVX2, MergeXRGBRow_AVX2, 0, 0, 4, 15) #endif #ifdef HAS_MERGEXRGBROW_NEON ANY31(MergeXRGBRow_Any_NEON, MergeXRGBRow_NEON, 0, 0, 4, 15) #endif -#ifdef HAS_I422TOYUY2ROW_SSE2 -ANY31(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, 1, 1, 4, 15) -ANY31(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, 1, 1, 4, 15) -#endif #ifdef HAS_I422TOYUY2ROW_AVX2 ANY31(I422ToYUY2Row_Any_AVX2, I422ToYUY2Row_AVX2, 1, 1, 4, 31) ANY31(I422ToUYVYRow_Any_AVX2, I422ToUYVYRow_AVX2, 1, 1, 4, 31) @@ -323,9 +302,6 @@ ANY31(I422ToUYVYRow_Any_LASX, I422ToUYVYRow_LASX, 1, 1, 4, 31) #ifdef HAS_BLENDPLANEROW_AVX2 ANY31(BlendPlaneRow_Any_AVX2, BlendPlaneRow_AVX2, 0, 0, 1, 31) #endif -#ifdef HAS_BLENDPLANEROW_SSSE3 -ANY31(BlendPlaneRow_Any_SSSE3, BlendPlaneRow_SSSE3, 0, 0, 1, 7) -#endif #undef ANY31 // Note that odd width replication includes 444 due to implementation @@ -355,36 +331,9 @@ ANY31(BlendPlaneRow_Any_SSSE3, BlendPlaneRow_SSSE3, 0, 0, 1, 7) memcpy(dst_ptr + (np >> DUVSHIFT) * BPP, vout, SS(r, DUVSHIFT) * BPP); \ } -#ifdef HAS_I422TOARGBROW_SSSE3 -ANY31C(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_SSSE3, 1, 0, 4, 7) -#endif -#ifdef HAS_I422TORGBAROW_SSSE3 -ANY31C(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_SSSE3, 1, 0, 4, 7) -#endif -#ifdef HAS_I422TOARGB4444ROW_SSSE3 -ANY31C(I422ToARGB4444Row_Any_SSSE3, I422ToARGB4444Row_SSSE3, 1, 0, 2, 7) -#endif -#ifdef HAS_I422TOARGB1555ROW_SSSE3 -ANY31C(I422ToARGB1555Row_Any_SSSE3, I422ToARGB1555Row_SSSE3, 1, 0, 2, 7) -#endif -#ifdef HAS_I422TORGB565ROW_SSSE3 -ANY31C(I422ToRGB565Row_Any_SSSE3, I422ToRGB565Row_SSSE3, 1, 0, 2, 7) -#endif -#ifdef HAS_I422TORGB24ROW_SSSE3 -ANY31C(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, 1, 0, 3, 15) -#endif -#ifdef HAS_I422TOAR30ROW_SSSE3 -ANY31C(I422ToAR30Row_Any_SSSE3, I422ToAR30Row_SSSE3, 1, 0, 4, 7) -#endif #ifdef HAS_I422TOAR30ROW_AVX2 ANY31C(I422ToAR30Row_Any_AVX2, I422ToAR30Row_AVX2, 1, 0, 4, 15) #endif -#ifdef HAS_I444TOARGBROW_SSSE3 -ANY31C(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_SSSE3, 0, 0, 4, 7) -#endif -#ifdef HAS_I444TORGB24ROW_SSSE3 -ANY31C(I444ToRGB24Row_Any_SSSE3, I444ToRGB24Row_SSSE3, 0, 0, 3, 15) -#endif #ifdef HAS_I422TORGB24ROW_AVX2 ANY31C(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 31) #endif @@ -470,36 +419,18 @@ ANY31C(I444ToARGBRow_Any_LSX, I444ToARGBRow_LSX, 0, 0, 4, 15) memcpy(dst_ptr + (np >> DUVSHIFT) * BPP, vout, SS(r, DUVSHIFT) * BPP); \ } -#ifdef HAS_I210TOAR30ROW_SSSE3 -ANY31CT(I210ToAR30Row_Any_SSSE3, I210ToAR30Row_SSSE3, 1, 0, uint16_t, 2, 4, 7) -#endif -#ifdef HAS_I210TOARGBROW_SSSE3 -ANY31CT(I210ToARGBRow_Any_SSSE3, I210ToARGBRow_SSSE3, 1, 0, uint16_t, 2, 4, 7) -#endif #ifdef HAS_I210TOARGBROW_AVX2 ANY31CT(I210ToARGBRow_Any_AVX2, I210ToARGBRow_AVX2, 1, 0, uint16_t, 2, 4, 15) #endif #ifdef HAS_I210TOAR30ROW_AVX2 ANY31CT(I210ToAR30Row_Any_AVX2, I210ToAR30Row_AVX2, 1, 0, uint16_t, 2, 4, 15) #endif -#ifdef HAS_I410TOAR30ROW_SSSE3 -ANY31CT(I410ToAR30Row_Any_SSSE3, I410ToAR30Row_SSSE3, 0, 0, uint16_t, 2, 4, 7) -#endif -#ifdef HAS_I410TOARGBROW_SSSE3 -ANY31CT(I410ToARGBRow_Any_SSSE3, I410ToARGBRow_SSSE3, 0, 0, uint16_t, 2, 4, 7) -#endif #ifdef HAS_I410TOARGBROW_AVX2 ANY31CT(I410ToARGBRow_Any_AVX2, I410ToARGBRow_AVX2, 0, 0, uint16_t, 2, 4, 15) #endif #ifdef HAS_I410TOAR30ROW_AVX2 ANY31CT(I410ToAR30Row_Any_AVX2, I410ToAR30Row_AVX2, 0, 0, uint16_t, 2, 4, 15) #endif -#ifdef HAS_I212TOAR30ROW_SSSE3 -ANY31CT(I212ToAR30Row_Any_SSSE3, I212ToAR30Row_SSSE3, 1, 0, uint16_t, 2, 4, 7) -#endif -#ifdef HAS_I212TOARGBROW_SSSE3 -ANY31CT(I212ToARGBRow_Any_SSSE3, I212ToARGBRow_SSSE3, 1, 0, uint16_t, 2, 4, 7) -#endif #ifdef HAS_I212TOARGBROW_AVX2 ANY31CT(I212ToARGBRow_Any_AVX2, I212ToARGBRow_AVX2, 1, 0, uint16_t, 2, 4, 15) #endif @@ -612,9 +543,6 @@ ANY31PT(MergeXRGB16To8Row_Any_NEON, } // Merge functions. -#ifdef HAS_MERGEUVROW_SSE2 -ANY21(MergeUVRow_Any_SSE2, MergeUVRow_SSE2, 0, 1, 1, 2, 15) -#endif #ifdef HAS_MERGEUVROW_AVX2 ANY21(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, 0, 1, 1, 2, 31) #endif @@ -630,22 +558,10 @@ ANY21(MergeUVRow_Any_LSX, MergeUVRow_LSX, 0, 1, 1, 2, 15) #ifdef HAS_NV21TOYUV24ROW_NEON ANY21(NV21ToYUV24Row_Any_NEON, NV21ToYUV24Row_NEON, 1, 1, 2, 3, 15) #endif -#ifdef HAS_NV21TOYUV24ROW_SSSE3 -ANY21(NV21ToYUV24Row_Any_SSSE3, NV21ToYUV24Row_SSSE3, 1, 1, 2, 3, 15) -#endif #ifdef HAS_NV21TOYUV24ROW_AVX2 ANY21(NV21ToYUV24Row_Any_AVX2, NV21ToYUV24Row_AVX2, 1, 1, 2, 3, 31) #endif // Math functions. -#ifdef HAS_ARGBMULTIPLYROW_SSE2 -ANY21(ARGBMultiplyRow_Any_SSE2, ARGBMultiplyRow_SSE2, 0, 4, 4, 4, 3) -#endif -#ifdef HAS_ARGBADDROW_SSE2 -ANY21(ARGBAddRow_Any_SSE2, ARGBAddRow_SSE2, 0, 4, 4, 4, 3) -#endif -#ifdef HAS_ARGBSUBTRACTROW_SSE2 -ANY21(ARGBSubtractRow_Any_SSE2, ARGBSubtractRow_SSE2, 0, 4, 4, 4, 3) -#endif #ifdef HAS_ARGBMULTIPLYROW_AVX2 ANY21(ARGBMultiplyRow_Any_AVX2, ARGBMultiplyRow_AVX2, 0, 4, 4, 4, 7) #endif @@ -682,27 +598,18 @@ ANY21(ARGBSubtractRow_Any_LSX, ARGBSubtractRow_LSX, 0, 4, 4, 4, 3) #ifdef HAS_ARGBSUBTRACTROW_LASX ANY21(ARGBSubtractRow_Any_LASX, ARGBSubtractRow_LASX, 0, 4, 4, 4, 7) #endif -#ifdef HAS_SOBELROW_SSE2 -ANY21(SobelRow_Any_SSE2, SobelRow_SSE2, 0, 1, 1, 4, 15) -#endif #ifdef HAS_SOBELROW_NEON ANY21(SobelRow_Any_NEON, SobelRow_NEON, 0, 1, 1, 4, 7) #endif #ifdef HAS_SOBELROW_LSX ANY21(SobelRow_Any_LSX, SobelRow_LSX, 0, 1, 1, 4, 15) #endif -#ifdef HAS_SOBELTOPLANEROW_SSE2 -ANY21(SobelToPlaneRow_Any_SSE2, SobelToPlaneRow_SSE2, 0, 1, 1, 1, 15) -#endif #ifdef HAS_SOBELTOPLANEROW_NEON ANY21(SobelToPlaneRow_Any_NEON, SobelToPlaneRow_NEON, 0, 1, 1, 1, 15) #endif #ifdef HAS_SOBELTOPLANEROW_LSX ANY21(SobelToPlaneRow_Any_LSX, SobelToPlaneRow_LSX, 0, 1, 1, 1, 31) #endif -#ifdef HAS_SOBELXYROW_SSE2 -ANY21(SobelXYRow_Any_SSE2, SobelXYRow_SSE2, 0, 1, 1, 4, 15) -#endif #ifdef HAS_SOBELXYROW_NEON ANY21(SobelXYRow_Any_NEON, SobelXYRow_NEON, 0, 1, 1, 4, 7) #endif @@ -735,9 +642,6 @@ ANY21(SobelXYRow_Any_LSX, SobelXYRow_LSX, 0, 1, 1, 4, 15) #ifdef HAS_YUY2TONVUVROW_NEON ANY21S(YUY2ToNVUVRow_Any_NEON, YUY2ToNVUVRow_NEON, 4, 2, 7) #endif -#ifdef HAS_YUY2TONVUVROW_SSE2 -ANY21S(YUY2ToNVUVRow_Any_SSE2, YUY2ToNVUVRow_SSE2, 4, 2, 7) -#endif #ifdef HAS_YUY2TONVUVROW_AVX2 ANY21S(YUY2ToNVUVRow_Any_AVX2, YUY2ToNVUVRow_AVX2, 4, 2, 15) #endif @@ -763,9 +667,6 @@ ANY21S(YUY2ToNVUVRow_Any_AVX2, YUY2ToNVUVRow_AVX2, 4, 2, 15) } // Biplanar to RGB. -#ifdef HAS_NV12TOARGBROW_SSSE3 -ANY21C(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_SSSE3, 1, 1, 2, 4, 7) -#endif #ifdef HAS_NV12TOARGBROW_AVX2 ANY21C(NV12ToARGBRow_Any_AVX2, NV12ToARGBRow_AVX2, 1, 1, 2, 4, 15) #endif @@ -778,9 +679,6 @@ ANY21C(NV12ToARGBRow_Any_LSX, NV12ToARGBRow_LSX, 1, 1, 2, 4, 7) #ifdef HAS_NV12TOARGBROW_LASX ANY21C(NV12ToARGBRow_Any_LASX, NV12ToARGBRow_LASX, 1, 1, 2, 4, 15) #endif -#ifdef HAS_NV21TOARGBROW_SSSE3 -ANY21C(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_SSSE3, 1, 1, 2, 4, 7) -#endif #ifdef HAS_NV21TOARGBROW_AVX2 ANY21C(NV21ToARGBRow_Any_AVX2, NV21ToARGBRow_AVX2, 1, 1, 2, 4, 15) #endif @@ -799,21 +697,12 @@ ANY21C(NV12ToRGB24Row_Any_NEON, NV12ToRGB24Row_NEON, 1, 1, 2, 3, 7) #ifdef HAS_NV21TORGB24ROW_NEON ANY21C(NV21ToRGB24Row_Any_NEON, NV21ToRGB24Row_NEON, 1, 1, 2, 3, 7) #endif -#ifdef HAS_NV12TORGB24ROW_SSSE3 -ANY21C(NV12ToRGB24Row_Any_SSSE3, NV12ToRGB24Row_SSSE3, 1, 1, 2, 3, 15) -#endif -#ifdef HAS_NV21TORGB24ROW_SSSE3 -ANY21C(NV21ToRGB24Row_Any_SSSE3, NV21ToRGB24Row_SSSE3, 1, 1, 2, 3, 15) -#endif #ifdef HAS_NV12TORGB24ROW_AVX2 ANY21C(NV12ToRGB24Row_Any_AVX2, NV12ToRGB24Row_AVX2, 1, 1, 2, 3, 31) #endif #ifdef HAS_NV21TORGB24ROW_AVX2 ANY21C(NV21ToRGB24Row_Any_AVX2, NV21ToRGB24Row_AVX2, 1, 1, 2, 3, 31) #endif -#ifdef HAS_NV12TORGB565ROW_SSSE3 -ANY21C(NV12ToRGB565Row_Any_SSSE3, NV12ToRGB565Row_SSSE3, 1, 1, 2, 2, 7) -#endif #ifdef HAS_NV12TORGB565ROW_AVX2 ANY21C(NV12ToRGB565Row_Any_AVX2, NV12ToRGB565Row_AVX2, 1, 1, 2, 2, 15) #endif @@ -847,12 +736,6 @@ ANY21C(NV12ToRGB565Row_Any_LASX, NV12ToRGB565Row_LASX, 1, 1, 2, 2, 15) memcpy(dst_ptr + (np >> DUVSHIFT) * BPP, vout, SS(r, DUVSHIFT) * BPP); \ } -#ifdef HAS_P210TOAR30ROW_SSSE3 -ANY21CT(P210ToAR30Row_Any_SSSE3, P210ToAR30Row_SSSE3, 1, 0, uint16_t, 2, 4, 7) -#endif -#ifdef HAS_P210TOARGBROW_SSSE3 -ANY21CT(P210ToARGBRow_Any_SSSE3, P210ToARGBRow_SSSE3, 1, 0, uint16_t, 2, 4, 7) -#endif #ifdef HAS_P210TOARGBROW_AVX2 ANY21CT(P210ToARGBRow_Any_AVX2, P210ToARGBRow_AVX2, 1, 0, uint16_t, 2, 4, 15) #endif @@ -865,12 +748,6 @@ ANY21CT(P210ToAR30Row_Any_NEON, P210ToAR30Row_NEON, 1, 0, uint16_t, 2, 4, 7) #ifdef HAS_P210TOARGBROW_NEON ANY21CT(P210ToARGBRow_Any_NEON, P210ToARGBRow_NEON, 1, 0, uint16_t, 2, 4, 7) #endif -#ifdef HAS_P410TOAR30ROW_SSSE3 -ANY21CT(P410ToAR30Row_Any_SSSE3, P410ToAR30Row_SSSE3, 0, 0, uint16_t, 2, 4, 7) -#endif -#ifdef HAS_P410TOARGBROW_SSSE3 -ANY21CT(P410ToARGBRow_Any_SSSE3, P410ToARGBRow_SSSE3, 0, 0, uint16_t, 2, 4, 7) -#endif #ifdef HAS_P410TOARGBROW_AVX2 ANY21CT(P410ToARGBRow_Any_AVX2, P410ToARGBRow_AVX2, 0, 0, uint16_t, 2, 4, 15) #endif @@ -937,19 +814,9 @@ ANY11(CopyRow_Any_AVX512BW, CopyRow_AVX512BW, 0, 1, 1, 127) #ifdef HAS_COPYROW_AVX ANY11(CopyRow_Any_AVX, CopyRow_AVX, 0, 1, 1, 63) #endif -#ifdef HAS_COPYROW_SSE2 -ANY11(CopyRow_Any_SSE2, CopyRow_SSE2, 0, 1, 1, 31) -#endif #ifdef HAS_COPYROW_NEON ANY11(CopyRow_Any_NEON, CopyRow_NEON, 0, 1, 1, 31) #endif -#if defined(HAS_ARGBTORGB24ROW_SSSE3) -ANY11(ARGBToRGB24Row_Any_SSSE3, ARGBToRGB24Row_SSSE3, 0, 4, 3, 15) -ANY11(ARGBToRAWRow_Any_SSSE3, ARGBToRAWRow_SSSE3, 0, 4, 3, 15) -ANY11(ARGBToRGB565Row_Any_SSE2, ARGBToRGB565Row_SSE2, 0, 4, 2, 3) -ANY11(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, 0, 4, 2, 3) -ANY11(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, 0, 4, 2, 3) -#endif #if defined(HAS_ARGBTORGB24ROW_AVX2) ANY11(ARGBToRGB24Row_Any_AVX2, ARGBToRGB24Row_AVX2, 0, 4, 3, 31) #endif @@ -966,37 +833,21 @@ ANY11(ARGBToRGB565Row_Any_AVX2, ARGBToRGB565Row_AVX2, 0, 4, 2, 7) ANY11(ARGBToARGB1555Row_Any_AVX2, ARGBToARGB1555Row_AVX2, 0, 4, 2, 7) ANY11(ARGBToARGB4444Row_Any_AVX2, ARGBToARGB4444Row_AVX2, 0, 4, 2, 7) #endif -#if defined(HAS_ABGRTOAR30ROW_SSSE3) -ANY11(ABGRToAR30Row_Any_SSSE3, ABGRToAR30Row_SSSE3, 0, 4, 4, 3) -#endif #if defined(HAS_ABGRTOAR30ROW_NEON) ANY11(ABGRToAR30Row_Any_NEON, ABGRToAR30Row_NEON, 0, 4, 4, 7) #endif #if defined(HAS_ARGBTOAR30ROW_NEON) ANY11(ARGBToAR30Row_Any_NEON, ARGBToAR30Row_NEON, 0, 4, 4, 7) #endif -#if defined(HAS_ARGBTOAR30ROW_SSSE3) -ANY11(ARGBToAR30Row_Any_SSSE3, ARGBToAR30Row_SSSE3, 0, 4, 4, 3) -#endif #if defined(HAS_ABGRTOAR30ROW_AVX2) ANY11(ABGRToAR30Row_Any_AVX2, ABGRToAR30Row_AVX2, 0, 4, 4, 7) #endif #if defined(HAS_ARGBTOAR30ROW_AVX2) ANY11(ARGBToAR30Row_Any_AVX2, ARGBToAR30Row_AVX2, 0, 4, 4, 7) #endif -#if defined(HAS_J400TOARGBROW_SSE2) -ANY11(J400ToARGBRow_Any_SSE2, J400ToARGBRow_SSE2, 0, 1, 4, 7) -#endif #if defined(HAS_J400TOARGBROW_AVX2) ANY11(J400ToARGBRow_Any_AVX2, J400ToARGBRow_AVX2, 0, 1, 4, 15) #endif -#if defined(HAS_RGB24TOARGBROW_SSSE3) -ANY11(RGB24ToARGBRow_Any_SSSE3, RGB24ToARGBRow_SSSE3, 0, 3, 4, 15) -ANY11(RAWToARGBRow_Any_SSSE3, RAWToARGBRow_SSSE3, 0, 3, 4, 15) -ANY11(RGB565ToARGBRow_Any_SSE2, RGB565ToARGBRow_SSE2, 0, 2, 4, 7) -ANY11(ARGB1555ToARGBRow_Any_SSE2, ARGB1555ToARGBRow_SSE2, 0, 2, 4, 7) -ANY11(ARGB4444ToARGBRow_Any_SSE2, ARGB4444ToARGBRow_SSE2, 0, 2, 4, 7) -#endif #if defined(HAS_RAWTOARGBROW_AVX2) ANY11(RAWToARGBRow_Any_AVX2, RAWToARGBRow_AVX2, 0, 3, 4, 31) #endif @@ -1006,12 +857,6 @@ ANY11(RAWToARGBRow_Any_AVX512BW, RAWToARGBRow_AVX512BW, 0, 3, 4, 63) #if defined(HAS_RGB24TOARGBROW_AVX512BW) ANY11(RGB24ToARGBRow_Any_AVX512BW, RGB24ToARGBRow_AVX512BW, 0, 3, 4, 63) #endif -#if defined(HAS_RAWTORGBAROW_SSSE3) -ANY11(RAWToRGBARow_Any_SSSE3, RAWToRGBARow_SSSE3, 0, 3, 4, 15) -#endif -#if defined(HAS_RAWTORGB24ROW_SSSE3) -ANY11(RAWToRGB24Row_Any_SSSE3, RAWToRGB24Row_SSSE3, 0, 3, 3, 7) -#endif #if defined(HAS_RGB565TOARGBROW_AVX2) ANY11(RGB565ToARGBRow_Any_AVX2, RGB565ToARGBRow_AVX2, 0, 2, 4, 15) #endif @@ -1073,17 +918,9 @@ ANY11(UYVYToYRow_Any_AVX2, UYVYToYRow_AVX2, 0, 2, 1, 31) #ifdef HAS_YUY2TOYROW_AVX2 ANY11(YUY2ToYRow_Any_AVX2, YUY2ToYRow_AVX2, 1, 4, 1, 31) #endif -#ifdef HAS_ARGBTOYROW_SSSE3 -ANY11(ARGBToYRow_Any_SSSE3, ARGBToYRow_SSSE3, 0, 4, 1, 15) -#endif #ifdef HAS_ARGBTOYROW_AVX512BW ANY11(ARGBToYRow_Any_AVX512BW, ARGBToYRow_AVX512BW, 0, 4, 1, 63) #endif -#ifdef HAS_BGRATOYROW_SSSE3 -ANY11(BGRAToYRow_Any_SSSE3, BGRAToYRow_SSSE3, 0, 4, 1, 15) -ANY11(ABGRToYRow_Any_SSSE3, ABGRToYRow_SSSE3, 0, 4, 1, 15) -ANY11(RGBAToYRow_Any_SSSE3, RGBAToYRow_SSSE3, 0, 4, 1, 15) -#endif #ifdef HAS_ARGBTOYROW_AVX512BW ANY11(BGRAToYRow_Any_AVX512BW, BGRAToYRow_AVX512BW, 0, 4, 1, 63) #endif @@ -1099,25 +936,12 @@ ANY11(RGBAToYRow_Any_AVX2, RGBAToYRow_AVX2, 0, 4, 1, 31) #ifdef HAS_ARGBTOYROW_AVX512BW ANY11(ABGRToYRow_Any_AVX512BW, ABGRToYRow_AVX512BW, 0, 4, 1, 63) #endif -#ifdef HAS_YUY2TOYROW_SSE2 -ANY11(YUY2ToYRow_Any_SSE2, YUY2ToYRow_SSE2, 1, 4, 1, 15) -ANY11(UYVYToYRow_Any_SSE2, UYVYToYRow_SSE2, 1, 4, 1, 15) -#endif -#ifdef HAS_ARGBTOYJROW_SSSE3 -ANY11(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_SSSE3, 0, 4, 1, 15) -#endif #ifdef HAS_ARGBTOYROW_AVX512BW ANY11(ARGBToYJRow_Any_AVX512BW, ARGBToYJRow_AVX512BW, 0, 4, 1, 63) #endif -#ifdef HAS_ABGRTOYJROW_SSSE3 -ANY11(ABGRToYJRow_Any_SSSE3, ABGRToYJRow_SSSE3, 0, 4, 1, 15) -#endif #ifdef HAS_ARGBTOYROW_AVX512BW ANY11(ABGRToYJRow_Any_AVX512BW, ABGRToYJRow_AVX512BW, 0, 4, 1, 63) #endif -#ifdef HAS_RGBATOYJROW_SSSE3 -ANY11(RGBAToYJRow_Any_SSSE3, RGBAToYJRow_SSSE3, 0, 4, 1, 15) -#endif #ifdef HAS_ARGBTOYROW_AVX512BW ANY11(RGBAToYJRow_Any_AVX512BW, RGBAToYJRow_AVX512BW, 0, 4, 1, 63) #endif @@ -1205,38 +1029,6 @@ ANY11(RGBAToYRow_Any_LSX, RGBAToYRow_LSX, 0, 4, 1, 15) #ifdef HAS_RGBATOYROW_LASX ANY11(RGBAToYRow_Any_LASX, RGBAToYRow_LASX, 0, 4, 1, 31) #endif -#ifdef HAS_RGB24TOYROW_NEON -#endif -#ifdef HAS_RGB24TOYJROW_AVX2 -#endif -#ifdef HAS_RGB24TOYJROW_SSSE3 -#endif -#ifdef HAS_RGB24TOYJROW_NEON -#endif -#ifdef HAS_RGB24TOYROW_LSX -#endif -#ifdef HAS_RGB24TOYJROW_LSX -#endif -#ifdef HAS_RGB24TOYJROW_LASX -#endif -#ifdef HAS_RGB24TOYROW_LASX -#endif -#ifdef HAS_RAWTOYROW_NEON -#endif -#ifdef HAS_RAWTOYJROW_AVX2 -#endif -#ifdef HAS_RAWTOYJROW_SSSE3 -#endif -#ifdef HAS_RAWTOYJROW_NEON -#endif -#ifdef HAS_RAWTOYROW_LSX -#endif -#ifdef HAS_RAWTOYROW_LASX -#endif -#ifdef HAS_RAWTOYJROW_LSX -#endif -#ifdef HAS_RAWTOYJROW_LASX -#endif #ifdef HAS_RGB565TOYROW_NEON ANY11(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 0, 2, 1, 15) #endif @@ -1287,9 +1079,6 @@ ANY11(UYVYToYRow_Any_LASX, UYVYToYRow_LASX, 1, 4, 1, 31) #ifdef HAS_AYUVTOYROW_NEON ANY11(AYUVToYRow_Any_NEON, AYUVToYRow_NEON, 0, 4, 1, 15) #endif -#ifdef HAS_SWAPUVROW_SSSE3 -ANY11(SwapUVRow_Any_SSSE3, SwapUVRow_SSSE3, 0, 2, 2, 15) -#endif #ifdef HAS_SWAPUVROW_AVX2 ANY11(SwapUVRow_Any_AVX2, SwapUVRow_AVX2, 0, 2, 2, 31) #endif @@ -1344,12 +1133,6 @@ ANY11(ARGB4444ToARGBRow_Any_LSX, ARGB4444ToARGBRow_LSX, 0, 2, 4, 15) #ifdef HAS_ARGB4444TOARGBROW_LASX ANY11(ARGB4444ToARGBRow_Any_LASX, ARGB4444ToARGBRow_LASX, 0, 2, 4, 31) #endif -#ifdef HAS_ARGBATTENUATEROW_SSSE3 -ANY11(ARGBAttenuateRow_Any_SSSE3, ARGBAttenuateRow_SSSE3, 0, 4, 4, 3) -#endif -#ifdef HAS_ARGBUNATTENUATEROW_SSE2 -ANY11(ARGBUnattenuateRow_Any_SSE2, ARGBUnattenuateRow_SSE2, 0, 4, 4, 3) -#endif #ifdef HAS_ARGBATTENUATEROW_AVX2 ANY11(ARGBAttenuateRow_Any_AVX2, ARGBAttenuateRow_AVX2, 0, 4, 4, 7) #endif @@ -1365,9 +1148,6 @@ ANY11(ARGBAttenuateRow_Any_LSX, ARGBAttenuateRow_LSX, 0, 4, 4, 7) #ifdef HAS_ARGBATTENUATEROW_LASX ANY11(ARGBAttenuateRow_Any_LASX, ARGBAttenuateRow_LASX, 0, 4, 4, 15) #endif -#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2 -ANY11(ARGBExtractAlphaRow_Any_SSE2, ARGBExtractAlphaRow_SSE2, 0, 4, 1, 7) -#endif #ifdef HAS_ARGBEXTRACTALPHAROW_AVX2 ANY11(ARGBExtractAlphaRow_Any_AVX2, ARGBExtractAlphaRow_AVX2, 0, 4, 1, 31) #endif @@ -1401,15 +1181,9 @@ ANY11(ARGBExtractAlphaRow_Any_LSX, ARGBExtractAlphaRow_LSX, 0, 4, 1, 15) #ifdef HAS_ARGBCOPYALPHAROW_AVX2 ANY11B(ARGBCopyAlphaRow_Any_AVX2, ARGBCopyAlphaRow_AVX2, 0, 4, 4, 15) #endif -#ifdef HAS_ARGBCOPYALPHAROW_SSE2 -ANY11B(ARGBCopyAlphaRow_Any_SSE2, ARGBCopyAlphaRow_SSE2, 0, 4, 4, 7) -#endif #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 ANY11B(ARGBCopyYToAlphaRow_Any_AVX2, ARGBCopyYToAlphaRow_AVX2, 0, 1, 4, 15) #endif -#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 -ANY11B(ARGBCopyYToAlphaRow_Any_SSE2, ARGBCopyYToAlphaRow_SSE2, 0, 1, 4, 7) -#endif #undef ANY11B // Any 1 to 1 with parameter. @@ -1429,8 +1203,7 @@ ANY11B(ARGBCopyYToAlphaRow_Any_SSE2, ARGBCopyYToAlphaRow_SSE2, 0, 1, 4, 7) memcpy(dst_ptr + np * BPP, vout, r * BPP); \ } -#if defined(HAS_I400TOARGBROW_SSE2) -ANY11P(I400ToARGBRow_Any_SSE2, +#if 0 I400ToARGBRow_SSE2, const struct YuvConstants*, 1, @@ -1462,8 +1235,7 @@ ANY11P(I400ToARGBRow_Any_LSX, 15) #endif -#if defined(HAS_ARGBTORGB565DITHERROW_SSE2) -ANY11P(ARGBToRGB565DitherRow_Any_SSE2, +#if 0 ARGBToRGB565DitherRow_SSE2, const uint32_t, 4, @@ -1502,9 +1274,6 @@ ANY11P(ARGBToRGB565DitherRow_Any_LASX, 2, 15) #endif -#ifdef HAS_ARGBSHUFFLEROW_SSSE3 -ANY11P(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_SSSE3, const uint8_t*, 4, 4, 7) -#endif #ifdef HAS_ARGBSHUFFLEROW_AVX2 ANY11P(ARGBShuffleRow_Any_AVX2, ARGBShuffleRow_AVX2, const uint8_t*, 4, 4, 15) #endif @@ -1537,21 +1306,9 @@ ANY11P(ARGBShuffleRow_Any_LASX, ARGBShuffleRow_LASX, const uint8_t*, 4, 4, 15) memcpy((uint8_t*)(dst_ptr) + np * BPP, vout, r * BPP); \ } -#ifdef HAS_ARGBTOAR64ROW_SSSE3 -ANY11T(ARGBToAR64Row_Any_SSSE3, ARGBToAR64Row_SSSE3, 4, 8, uint8_t, uint16_t, 3) -#endif -#ifdef HAS_ARGBTOAB64ROW_SSSE3 -ANY11T(ARGBToAB64Row_Any_SSSE3, ARGBToAB64Row_SSSE3, 4, 8, uint8_t, uint16_t, 3) -#endif -#ifdef HAS_AR64TOARGBROW_SSSE3 -ANY11T(AR64ToARGBRow_Any_SSSE3, AR64ToARGBRow_SSSE3, 8, 4, uint16_t, uint8_t, 3) -#endif -#ifdef HAS_ARGBTOAR64ROW_SSSE3 -ANY11T(AB64ToARGBRow_Any_SSSE3, AB64ToARGBRow_SSSE3, 8, 4, uint16_t, uint8_t, 3) -#endif #ifdef HAS_ARGBTOAR64ROW_AVX2 ANY11T(ARGBToAR64Row_Any_AVX2, ARGBToAR64Row_AVX2, 4, 8, uint8_t, uint16_t, 7) @@ -1604,8 +1361,7 @@ ANY11T(AB64ToARGBRow_Any_NEON, AB64ToARGBRow_NEON, 8, 4, uint16_t, uint8_t, 7) memcpy(dst_ptr + np, vout, r * BPP); \ } -#ifdef HAS_CONVERT16TO8ROW_SSSE3 -ANY11C(Convert16To8Row_Any_SSSE3, +#if 0 Convert16To8Row_SSSE3, 2, 1, @@ -1640,8 +1396,7 @@ ANY11C(Convert16To8Row_Any_NEON, uint8_t, 15) #endif -#ifdef HAS_CONVERT8TO16ROW_SSE2 -ANY11C(Convert8To16Row_Any_SSE2, +#if 0 Convert8To16Row_SSE2, 1, 2, @@ -1748,9 +1503,6 @@ ANY11SB(Convert8To8Row_Any_AVX2, memcpy(dst_ptr + np, vout, r * BPP); \ } -#ifdef HAS_HALFFLOATROW_SSE2 -ANY11P16(HalfFloatRow_Any_SSE2, HalfFloatRow_SSE2, uint16_t, uint16_t, 2, 2, 7) -#endif #ifdef HAS_HALFFLOATROW_AVX2 ANY11P16(HalfFloatRow_Any_AVX2, HalfFloatRow_AVX2, uint16_t, uint16_t, 2, 2, 15) #endif @@ -1793,10 +1545,6 @@ ANY11P16(HalfFloatRow_Any_LSX, HalfFloatRow_LSX, uint16_t, uint16_t, 2, 2, 31) memcpy(dst_ptr + np * BPP, vout, r * BPP); \ } -#if defined(HAS_YUY2TOARGBROW_SSSE3) -ANY11C(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_SSSE3, 1, 4, 4, 15) -ANY11C(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_SSSE3, 1, 4, 4, 15) -#endif #if defined(HAS_YUY2TOARGBROW_AVX2) ANY11C(YUY2ToARGBRow_Any_AVX2, YUY2ToARGBRow_AVX2, 1, 4, 4, 31) ANY11C(UYVYToARGBRow_Any_AVX2, UYVYToARGBRow_AVX2, 1, 4, 4, 31) @@ -1836,8 +1584,7 @@ ANY11C(UYVYToARGBRow_Any_LSX, UYVYToARGBRow_LSX, 1, 4, 4, 7) #ifdef HAS_INTERPOLATEROW_AVX2 ANY11I(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, uint8_t, uint8_t, 1, 1, 31) #endif -#ifdef HAS_INTERPOLATEROW_SSSE3 -ANY11I(InterpolateRow_Any_SSSE3, +#if 0 InterpolateRow_SSSE3, uint8_t, uint8_t, @@ -1926,9 +1673,6 @@ ANY11IS(InterpolateRow_16To8_Any_AVX2, #ifdef HAS_MIRRORROW_AVX2 ANY11M(MirrorRow_Any_AVX2, MirrorRow_AVX2, 1, 31) #endif -#ifdef HAS_MIRRORROW_SSSE3 -ANY11M(MirrorRow_Any_SSSE3, MirrorRow_SSSE3, 1, 15) -#endif #ifdef HAS_MIRRORROW_NEON ANY11M(MirrorRow_Any_NEON, MirrorRow_NEON, 1, 31) #endif @@ -1941,9 +1685,6 @@ ANY11M(MirrorRow_Any_LASX, MirrorRow_LASX, 1, 63) #ifdef HAS_MIRRORUVROW_AVX2 ANY11M(MirrorUVRow_Any_AVX2, MirrorUVRow_AVX2, 2, 15) #endif -#ifdef HAS_MIRRORUVROW_SSSE3 -ANY11M(MirrorUVRow_Any_SSSE3, MirrorUVRow_SSSE3, 2, 7) -#endif #ifdef HAS_MIRRORUVROW_NEON ANY11M(MirrorUVRow_Any_NEON, MirrorUVRow_NEON, 2, 31) #endif @@ -1956,9 +1697,6 @@ ANY11M(MirrorUVRow_Any_LASX, MirrorUVRow_LASX, 2, 15) #ifdef HAS_ARGBMIRRORROW_AVX2 ANY11M(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, 4, 7) #endif -#ifdef HAS_ARGBMIRRORROW_SSE2 -ANY11M(ARGBMirrorRow_Any_SSE2, ARGBMirrorRow_SSE2, 4, 3) -#endif #ifdef HAS_ARGBMIRRORROW_NEON ANY11M(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, 4, 7) #endif @@ -1968,9 +1706,6 @@ ANY11M(ARGBMirrorRow_Any_LSX, ARGBMirrorRow_LSX, 4, 7) #ifdef HAS_ARGBMIRRORROW_LASX ANY11M(ARGBMirrorRow_Any_LASX, ARGBMirrorRow_LASX, 4, 15) #endif -#ifdef HAS_RGB24MIRRORROW_SSSE3 -ANY11M(RGB24MirrorRow_Any_SSSE3, RGB24MirrorRow_SSSE3, 3, 15) -#endif #ifdef HAS_RGB24MIRRORROW_NEON ANY11M(RGB24MirrorRow_Any_NEON, RGB24MirrorRow_NEON, 3, 15) #endif @@ -2026,9 +1761,6 @@ ANY1(ARGBSetRow_Any_LSX, ARGBSetRow_LSX, uint32_t, 4, 3) memcpy(dst_v + (np >> DUVSHIFT), vout + 256, SS(r, DUVSHIFT)); \ } -#ifdef HAS_SPLITUVROW_SSE2 -ANY12(SplitUVRow_Any_SSE2, SplitUVRow_SSE2, 0, 2, 0, 15) -#endif #ifdef HAS_SPLITUVROW_AVX2 ANY12(SplitUVRow_Any_AVX2, SplitUVRow_AVX2, 0, 2, 0, 31) #endif @@ -2038,12 +1770,6 @@ ANY12(SplitUVRow_Any_NEON, SplitUVRow_NEON, 0, 2, 0, 15) #ifdef HAS_SPLITUVROW_LSX ANY12(SplitUVRow_Any_LSX, SplitUVRow_LSX, 0, 2, 0, 31) #endif -#ifdef HAS_ARGBTOUV444ROW_SSSE3 -ANY12(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_SSSE3, 0, 4, 0, 15) -#endif -#ifdef HAS_ARGBTOUVJ444ROW_SSSE3 -ANY12(ARGBToUVJ444Row_Any_SSSE3, ARGBToUVJ444Row_SSSE3, 0, 4, 0, 15) -#endif #ifdef HAS_ARGBTOUV444ROW_AVX2 ANY12(ARGBToUV444Row_Any_AVX2, ARGBToUV444Row_AVX2, 0, 4, 0, 31) #endif @@ -2060,10 +1786,6 @@ ANY12(ARGBToUVJ444Row_Any_AVX512BW, ARGBToUVJ444Row_AVX512BW, 0, 4, 0, 63) ANY12(YUY2ToUV422Row_Any_AVX2, YUY2ToUV422Row_AVX2, 1, 4, 1, 31) ANY12(UYVYToUV422Row_Any_AVX2, UYVYToUV422Row_AVX2, 1, 4, 1, 31) #endif -#ifdef HAS_YUY2TOUV422ROW_SSE2 -ANY12(YUY2ToUV422Row_Any_SSE2, YUY2ToUV422Row_SSE2, 1, 4, 1, 15) -ANY12(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_SSE2, 1, 4, 1, 15) -#endif #ifdef HAS_YUY2TOUV422ROW_NEON ANY12(ARGBToUV444Row_Any_NEON, ARGBToUV444Row_NEON, 0, 4, 0, 7) ANY12(ARGBToUVJ444Row_Any_NEON, ARGBToUVJ444Row_NEON, 0, 4, 0, 7) @@ -2134,24 +1856,12 @@ ANY12PT(SplitUVRow_16_Any_NEON, SplitUVRow_16_NEON, uint16_t, 2, 7) memcpy(dst_b + np, vout + 32, r); \ } -#ifdef HAS_SPLITRGBROW_SSSE3 -ANY13(SplitRGBRow_Any_SSSE3, SplitRGBRow_SSSE3, 3, 15) -#endif -#ifdef HAS_SPLITRGBROW_SSE41 -ANY13(SplitRGBRow_Any_SSE41, SplitRGBRow_SSE41, 3, 15) -#endif #ifdef HAS_SPLITRGBROW_AVX2 ANY13(SplitRGBRow_Any_AVX2, SplitRGBRow_AVX2, 3, 31) #endif #ifdef HAS_SPLITRGBROW_NEON ANY13(SplitRGBRow_Any_NEON, SplitRGBRow_NEON, 3, 15) #endif -#ifdef HAS_SPLITXRGBROW_SSE2 -ANY13(SplitXRGBRow_Any_SSE2, SplitXRGBRow_SSE2, 4, 7) -#endif -#ifdef HAS_SPLITXRGBROW_SSSE3 -ANY13(SplitXRGBRow_Any_SSSE3, SplitXRGBRow_SSSE3, 4, 7) -#endif #ifdef HAS_SPLITXRGBROW_AVX2 ANY13(SplitXRGBRow_Any_AVX2, SplitXRGBRow_AVX2, 4, 15) #endif @@ -2180,12 +1890,6 @@ ANY13(SplitXRGBRow_Any_NEON, SplitXRGBRow_NEON, 4, 15) memcpy(dst_a + np, vout + 48, r); \ } -#ifdef HAS_SPLITARGBROW_SSE2 -ANY14(SplitARGBRow_Any_SSE2, SplitARGBRow_SSE2, 4, 7) -#endif -#ifdef HAS_SPLITARGBROW_SSSE3 -ANY14(SplitARGBRow_Any_SSSE3, SplitARGBRow_SSSE3, 4, 7) -#endif #ifdef HAS_SPLITARGBROW_AVX2 ANY14(SplitARGBRow_Any_AVX2, SplitARGBRow_AVX2, 4, 15) #endif @@ -2276,18 +1980,12 @@ ANY12MS(ARGBToUVMatrixRow_Any_AVX2, ARGBToUVMatrixRow_AVX2, 0, 4, 15) #ifdef HAS_ARGBTOUVMATRIXROW_AVX512BW ANY12MS(ARGBToUVMatrixRow_Any_AVX512BW, ARGBToUVMatrixRow_AVX512BW, 0, 4, 63) #endif -#ifdef HAS_ARGBTOUVMATRIXROW_SSSE3 -ANY12MS(ARGBToUVMatrixRow_Any_SSSE3, ARGBToUVMatrixRow_SSSE3, 0, 4, 7) -#endif #ifdef HAS_ARGBTOUV444MATRIXROW_AVX2 ANY12M(ARGBToUV444MatrixRow_Any_AVX2, ARGBToUV444MatrixRow_AVX2, 4, 31) #endif #ifdef HAS_ARGBTOUV444MATRIXROW_AVX512BW ANY12M(ARGBToUV444MatrixRow_Any_AVX512BW, ARGBToUV444MatrixRow_AVX512BW, 4, 63) #endif -#ifdef HAS_ARGBTOUV444MATRIXROW_SSSE3 -ANY12M(ARGBToUV444MatrixRow_Any_SSSE3, ARGBToUV444MatrixRow_SSSE3, 4, 15) -#endif #ifdef HAS_ARGBTOUV444MATRIXROW_NEON ANY12M(ARGBToUV444MatrixRow_Any_NEON, ARGBToUV444MatrixRow_NEON, 4, 7) #endif @@ -2308,9 +2006,6 @@ ANY12M(ARGBToUV444MatrixRow_Any_NEON, ARGBToUV444MatrixRow_NEON, 4, 7) memcpy(dst_ptr + (ptrdiff_t)n, vout, (ptrdiff_t)r); \ } -#ifdef HAS_ARGBTOYROW_SSSE3 -ANY11MC(ARGBToYMatrixRow_Any_SSSE3, ARGBToYMatrixRow_SSSE3, 4, 15) -#endif #ifdef HAS_ARGBTOYROW_AVX2 ANY11MC(ARGBToYMatrixRow_Any_AVX2, ARGBToYMatrixRow_AVX2, 4, 31) #endif @@ -2358,26 +2053,10 @@ ANY12S(ABGRToUVJRow_Any_AVX2, ABGRToUVJRow_AVX2, 0, 4, 31) #ifdef HAS_ABGRTOUVJROW_AVX512BW ANY12S(ABGRToUVJRow_Any_AVX512BW, ABGRToUVJRow_AVX512BW, 0, 4, 63) #endif -#ifdef HAS_ARGBTOUVJROW_SSSE3 -ANY12S(ARGBToUVJRow_Any_SSSE3, ARGBToUVJRow_SSSE3, 0, 4, 15) -#endif -#ifdef HAS_ABGRTOUVJROW_SSSE3 -ANY12S(ABGRToUVJRow_Any_SSSE3, ABGRToUVJRow_SSSE3, 0, 4, 15) -#endif -#ifdef HAS_ARGBTOUVROW_SSSE3 -ANY12S(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_SSSE3, 0, 4, 15) -ANY12S(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_SSSE3, 0, 4, 15) -ANY12S(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_SSSE3, 0, 4, 15) -ANY12S(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_SSSE3, 0, 4, 15) -#endif #ifdef HAS_YUY2TOUVROW_AVX2 ANY12S(YUY2ToUVRow_Any_AVX2, YUY2ToUVRow_AVX2, 1, 4, 31) ANY12S(UYVYToUVRow_Any_AVX2, UYVYToUVRow_AVX2, 1, 4, 31) #endif -#ifdef HAS_YUY2TOUVROW_SSE2 -ANY12S(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_SSE2, 1, 4, 15) -ANY12S(UYVYToUVRow_Any_SSE2, UYVYToUVRow_SSE2, 1, 4, 15) -#endif #ifdef HAS_ARGBTOUVROW_NEON ANY12S(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, 0, 4, 15) #endif @@ -2594,15 +2273,9 @@ ANY11S(AYUVToVURow_Any_SVE2, AYUVToVURow_SVE2, 0, 4, 1) #ifdef HAS_DETILEROW_NEON ANYDETILE(DetileRow_Any_NEON, DetileRow_NEON, uint8_t, 1, 15) #endif -#ifdef HAS_DETILEROW_SSE2 -ANYDETILE(DetileRow_Any_SSE2, DetileRow_SSE2, uint8_t, 1, 15) -#endif #ifdef HAS_DETILEROW_16_NEON ANYDETILE(DetileRow_16_Any_NEON, DetileRow_16_NEON, uint16_t, 2, 15) #endif -#ifdef HAS_DETILEROW_16_SSE2 -ANYDETILE(DetileRow_16_Any_SSE2, DetileRow_16_SSE2, uint16_t, 2, 15) -#endif #ifdef HAS_DETILEROW_16_AVX ANYDETILE(DetileRow_16_Any_AVX, DetileRow_16_AVX, uint16_t, 2, 15) #endif @@ -2629,9 +2302,6 @@ ANYDETILE(DetileRow_16_Any_AVX, DetileRow_16_AVX, uint16_t, 2, 15) #ifdef HAS_DETILESPLITUVROW_NEON ANYDETILESPLITUV(DetileSplitUVRow_Any_NEON, DetileSplitUVRow_NEON, 15) #endif -#ifdef HAS_DETILESPLITUVROW_SSSE3 -ANYDETILESPLITUV(DetileSplitUVRow_Any_SSSE3, DetileSplitUVRow_SSSE3, 15) -#endif #define ANYDETILEMERGE(NAMEANY, ANY_SIMD, MASK) \ void NAMEANY(const uint8_t* src_y, ptrdiff_t src_y_tile_stride, \ @@ -2657,9 +2327,6 @@ ANYDETILESPLITUV(DetileSplitUVRow_Any_SSSE3, DetileSplitUVRow_SSSE3, 15) ANYDETILEMERGE(DetileToYUY2_Any_NEON, DetileToYUY2_NEON, 15) #endif -#ifdef HAS_DETILETOYUY2_SSE2 -ANYDETILEMERGE(DetileToYUY2_Any_SSE2, DetileToYUY2_SSE2, 15) -#endif #ifdef __cplusplus } // extern "C" diff --git a/source/row_common.cc b/source/row_common.cc index b2a0ec12b..c54a093df 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -4046,7 +4046,7 @@ void ARGBCopyYToAlphaRow_C(const uint8_t* src, uint8_t* dst, int width) { #define MAXTWIDTH 2048 #if !(defined(_MSC_VER) && !defined(__clang__) && defined(_M_IX86)) && \ - defined(HAS_I422TORGB565ROW_SSSE3) && !defined(LIBYUV_ENABLE_ROWWIN) + 0 && !defined(LIBYUV_ENABLE_ROWWIN) // row_win.cc has asm version, but GCC uses 2 step wrapper. void I422ToRGB565Row_SSSE3(const uint8_t* src_y, const uint8_t* src_u, @@ -4057,8 +4057,6 @@ void I422ToRGB565Row_SSSE3(const uint8_t* src_y, SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; - I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth); - ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth); src_y += twidth; src_u += twidth / 2; src_v += twidth / 2; @@ -4068,7 +4066,7 @@ void I422ToRGB565Row_SSSE3(const uint8_t* src_y, } #endif -#if defined(HAS_I422TOARGB1555ROW_SSSE3) +#if 0 void I422ToARGB1555Row_SSSE3(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -4079,8 +4077,6 @@ void I422ToARGB1555Row_SSSE3(const uint8_t* src_y, SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; - I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth); - ARGBToARGB1555Row_SSE2(row, dst_argb1555, twidth); src_y += twidth; src_u += twidth / 2; src_v += twidth / 2; @@ -4090,7 +4086,7 @@ void I422ToARGB1555Row_SSSE3(const uint8_t* src_y, } #endif -#if defined(HAS_I422TOARGB4444ROW_SSSE3) +#if 0 void I422ToARGB4444Row_SSSE3(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -4101,8 +4097,6 @@ void I422ToARGB4444Row_SSSE3(const uint8_t* src_y, SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; - I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth); - ARGBToARGB4444Row_SSE2(row, dst_argb4444, twidth); src_y += twidth; src_u += twidth / 2; src_v += twidth / 2; @@ -4112,7 +4106,7 @@ void I422ToARGB4444Row_SSSE3(const uint8_t* src_y, } #endif -#if defined(HAS_NV12TORGB565ROW_SSSE3) +#if 0 void NV12ToRGB565Row_SSSE3(const uint8_t* src_y, const uint8_t* src_uv, uint8_t* dst_rgb565, @@ -4122,8 +4116,6 @@ void NV12ToRGB565Row_SSSE3(const uint8_t* src_y, SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; - NV12ToARGBRow_SSSE3(src_y, src_uv, row, yuvconstants, twidth); - ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth); src_y += twidth; src_uv += twidth; dst_rgb565 += twidth * 2; @@ -4132,7 +4124,7 @@ void NV12ToRGB565Row_SSSE3(const uint8_t* src_y, } #endif -#if defined(HAS_NV12TORGB24ROW_SSSE3) +#if 0 void NV12ToRGB24Row_SSSE3(const uint8_t* src_y, const uint8_t* src_uv, uint8_t* dst_rgb24, @@ -4142,8 +4134,6 @@ void NV12ToRGB24Row_SSSE3(const uint8_t* src_y, SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; - NV12ToARGBRow_SSSE3(src_y, src_uv, row, yuvconstants, twidth); - ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth); src_y += twidth; src_uv += twidth; dst_rgb24 += twidth * 3; @@ -4152,7 +4142,7 @@ void NV12ToRGB24Row_SSSE3(const uint8_t* src_y, } #endif -#if defined(HAS_NV21TORGB24ROW_SSSE3) +#if 0 void NV21ToRGB24Row_SSSE3(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_rgb24, @@ -4162,8 +4152,6 @@ void NV21ToRGB24Row_SSSE3(const uint8_t* src_y, SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; - NV21ToARGBRow_SSSE3(src_y, src_vu, row, yuvconstants, twidth); - ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth); src_y += twidth; src_vu += twidth; dst_rgb24 += twidth * 3; @@ -4186,7 +4174,6 @@ void NV12ToRGB24Row_AVX2(const uint8_t* src_y, #if defined(HAS_ARGBTORGB24ROW_AVX2) ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth); #else - ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth); #endif src_y += twidth; src_uv += twidth; @@ -4210,7 +4197,6 @@ void NV21ToRGB24Row_AVX2(const uint8_t* src_y, #if defined(HAS_ARGBTORGB24ROW_AVX2) ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth); #else - ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth); #endif src_y += twidth; src_vu += twidth; @@ -4234,7 +4220,6 @@ void I422ToRGB565Row_AVX2(const uint8_t* src_y, #if defined(HAS_ARGBTORGB565ROW_AVX2) ARGBToRGB565Row_AVX2(row, dst_rgb565, twidth); #else - ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth); #endif src_y += twidth; src_u += twidth / 2; @@ -4260,7 +4245,6 @@ void I422ToARGB1555Row_AVX2(const uint8_t* src_y, #if defined(HAS_ARGBTOARGB1555ROW_AVX2) ARGBToARGB1555Row_AVX2(row, dst_argb1555, twidth); #else - ARGBToARGB1555Row_SSE2(row, dst_argb1555, twidth); #endif src_y += twidth; src_u += twidth / 2; @@ -4286,7 +4270,6 @@ void I422ToARGB4444Row_AVX2(const uint8_t* src_y, #if defined(HAS_ARGBTOARGB4444ROW_AVX2) ARGBToARGB4444Row_AVX2(row, dst_argb4444, twidth); #else - ARGBToARGB4444Row_SSE2(row, dst_argb4444, twidth); #endif src_y += twidth; src_u += twidth / 2; @@ -4312,7 +4295,6 @@ void I422ToRGB24Row_AVX2(const uint8_t* src_y, #if defined(HAS_ARGBTORGB24ROW_AVX2) ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth); #else - ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth); #endif src_y += twidth; src_u += twidth / 2; @@ -4338,7 +4320,6 @@ void I444ToRGB24Row_AVX2(const uint8_t* src_y, #if defined(HAS_ARGBTORGB24ROW_AVX2) ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth); #else - ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth); #endif src_y += twidth; src_u += twidth; @@ -4363,7 +4344,6 @@ void NV12ToRGB565Row_AVX2(const uint8_t* src_y, #if defined(HAS_ARGBTORGB565ROW_AVX2) ARGBToRGB565Row_AVX2(row, dst_rgb565, twidth); #else - ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth); #endif src_y += twidth; src_uv += twidth; @@ -4383,12 +4363,12 @@ void NV12ToRGB565Row_AVX2(const uint8_t* src_y, } #endif // HAS_RAWTOYJROW_AVX2 -#ifdef HAS_RGB24TOYJROW_SSSE3 +#if 0 // Convert 16 RGB24 pixels (64 bytes) to 16 YJ values. } #endif // HAS_RGB24TOYJROW_SSSE3 -#ifdef HAS_RAWTOYJROW_SSSE3 +#if 0 // Convert 16 RAW pixels (64 bytes) to 16 YJ values. } #endif // HAS_RAWTOYJROW_SSSE3 diff --git a/source/row_gcc.cc b/source/row_gcc.cc index 0da6e2ada..16b2c2780 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -21,7 +21,7 @@ extern "C" { (defined(__x86_64__) || defined(__i386__)) && \ !defined(LIBYUV_ENABLE_ROWWIN) -#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3) +#if 1 || 1 // Constants for ARGB @@ -30,9 +30,9 @@ static const uvec8 kARGBToYJ = {29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u}; -#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3) +#endif // 1 || 1 -#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3) +#if 1 || 1 // Constants for BGRA // Constants for ABGR @@ -43,9 +43,9 @@ static const uvec8 kARGBToYJ = {29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u, static const uvec16 kSub128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u}; -#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3) +#endif // 1 || 1 -#ifdef HAS_RGB24TOARGBROW_SSSE3 +#if 1 // Shuffle table for converting RGB24 to ARGB. static const uvec8 kShuffleMaskRGB24ToARGB = { @@ -113,112 +113,11 @@ static const lvec8 kShuffleNV21 = { }; #endif // HAS_RGB24TOARGBROW_SSSE3 -#ifdef HAS_J400TOARGBROW_SSE2 -void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" - "pslld $0x18,%%xmm5 \n" - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" - "lea 0x8(%0),%0 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm0,%%xmm0 \n" - "punpckhwd %%xmm1,%%xmm1 \n" - "por %%xmm5,%%xmm0 \n" - "por %%xmm5,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_y), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - ::"memory", - "cc", "xmm0", "xmm1", "xmm5"); -} -#endif // HAS_J400TOARGBROW_SSE2 +#if 1 -#ifdef HAS_RGB24TOARGBROW_SSSE3 -void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24, - uint8_t* dst_argb, - int width) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000 - "pslld $0x18,%%xmm5 \n" - "movdqa %3,%%xmm4 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm3 \n" - "lea 0x30(%0),%0 \n" - "movdqa %%xmm3,%%xmm2 \n" - "palignr $0x8,%%xmm1,%%xmm2 \n" - "pshufb %%xmm4,%%xmm2 \n" - "por %%xmm5,%%xmm2 \n" - "palignr $0xc,%%xmm0,%%xmm1 \n" - "pshufb %%xmm4,%%xmm0 \n" - "movdqu %%xmm2,0x20(%1) \n" - "por %%xmm5,%%xmm0 \n" - "pshufb %%xmm4,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "por %%xmm5,%%xmm1 \n" - "palignr $0x4,%%xmm3,%%xmm3 \n" - "pshufb %%xmm4,%%xmm3 \n" - "movdqu %%xmm1,0x10(%1) \n" - "por %%xmm5,%%xmm3 \n" - "movdqu %%xmm3,0x30(%1) \n" - "lea 0x40(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_rgb24), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kShuffleMaskRGB24ToARGB) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} -void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width) { - asm volatile( - "pcmpeqb %%xmm6,%%xmm6 \n" // 0xff000000 - "pslld $0x18,%%xmm6 \n" - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 12(%0),%%xmm1 \n" - "movdqu 24(%0),%%xmm2 \n" - "movdqu 32(%0),%%xmm3 \n" - "lea 0x30(%0),%0 \n" - "pshufb %%xmm4,%%xmm0 \n" - "pshufb %%xmm4,%%xmm1 \n" - "pshufb %%xmm4,%%xmm2 \n" - "pshufb %%xmm5,%%xmm3 \n" - "por %%xmm6,%%xmm0 \n" - "por %%xmm6,%%xmm1 \n" - "por %%xmm6,%%xmm2 \n" - "por %%xmm6,%%xmm3 \n" - "movdqu %%xmm0,0x00(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "movdqu %%xmm2,0x20(%1) \n" - "movdqu %%xmm3,0x30(%1) \n" - "lea 0x40(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kShuffleMaskRAWToARGB), // %3 - "m"(kShuffleMaskRAWToARGB_0) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} void RAWToARGBRow_AVX2(const uint8_t* src_raw, uint8_t* dst_argb, int width) { asm volatile( @@ -321,287 +220,19 @@ void RGB24ToARGBRow_AVX512BW(const uint8_t* src_rgb24, uint8_t* dst_argb, int wi // Same code as RAWToARGB with different shuffler and A in low bits -void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" // 0x000000ff - "psrld $24,%%xmm5 \n" - "movdqa %3,%%xmm4 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm3 \n" - "lea 0x30(%0),%0 \n" - "movdqa %%xmm3,%%xmm2 \n" - "palignr $0x8,%%xmm1,%%xmm2 \n" - "pshufb %%xmm4,%%xmm2 \n" - "por %%xmm5,%%xmm2 \n" - "palignr $0xc,%%xmm0,%%xmm1 \n" - "pshufb %%xmm4,%%xmm0 \n" - "movdqu %%xmm2,0x20(%1) \n" - "por %%xmm5,%%xmm0 \n" - "pshufb %%xmm4,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "por %%xmm5,%%xmm1 \n" - "palignr $0x4,%%xmm3,%%xmm3 \n" - "pshufb %%xmm4,%%xmm3 \n" - "movdqu %%xmm1,0x10(%1) \n" - "por %%xmm5,%%xmm3 \n" - "movdqu %%xmm3,0x30(%1) \n" - "lea 0x40(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_rgba), // %1 - "+r"(width) // %2 - : "m"(kShuffleMaskRAWToRGBA) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} -void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, - uint8_t* dst_rgb24, - int width) { - asm volatile( - "movdqa %3,%%xmm3 \n" - "movdqa %4,%%xmm4 \n" - "movdqa %5,%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x4(%0),%%xmm1 \n" - "movdqu 0x8(%0),%%xmm2 \n" - "lea 0x18(%0),%0 \n" - "pshufb %%xmm3,%%xmm0 \n" - "pshufb %%xmm4,%%xmm1 \n" - "pshufb %%xmm5,%%xmm2 \n" - "movq %%xmm0,(%1) \n" - "movq %%xmm1,0x8(%1) \n" - "movq %%xmm2,0x10(%1) \n" - "lea 0x18(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_rgb24), // %1 - "+r"(width) // %2 - : "m"(kShuffleMaskRAWToRGB24_0), // %3 - "m"(kShuffleMaskRAWToRGB24_1), // %4 - "m"(kShuffleMaskRAWToRGB24_2) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} -void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "mov $0x1080108,%%eax \n" - "movd %%eax,%%xmm5 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "mov $0x20802080,%%eax \n" - "movd %%eax,%%xmm6 \n" - "pshufd $0x0,%%xmm6,%%xmm6 \n" - "pcmpeqb %%xmm3,%%xmm3 \n" - "psllw $0xb,%%xmm3 \n" - "pcmpeqb %%xmm4,%%xmm4 \n" - "psllw $10,%%xmm4 \n" - "psrlw $5,%%xmm4 \n" - "pcmpeqb %%xmm7,%%xmm7 \n" - "psllw $0x8,%%xmm7 \n" - "sub %0,%1 \n" - "sub %0,%1 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "pand %%xmm3,%%xmm1 \n" - "psllw $0xb,%%xmm2 \n" - "pmulhuw %%xmm5,%%xmm1 \n" - "pmulhuw %%xmm5,%%xmm2 \n" - "psllw $0x8,%%xmm1 \n" - "por %%xmm2,%%xmm1 \n" - "pand %%xmm4,%%xmm0 \n" - "pmulhuw %%xmm6,%%xmm0 \n" - "por %%xmm7,%%xmm0 \n" - "movdqa %%xmm1,%%xmm2 \n" - "punpcklbw %%xmm0,%%xmm1 \n" - "punpckhbw %%xmm0,%%xmm2 \n" - "movdqu %%xmm1,0x00(%1,%0,2) \n" - "movdqu %%xmm2,0x10(%1,%0,2) \n" - "lea 0x10(%0),%0 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", - "xmm6", "xmm7"); -} -void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "mov $0x1080108,%%eax \n" - "movd %%eax,%%xmm5 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "mov $0x42004200,%%eax \n" - "movd %%eax,%%xmm6 \n" - "pshufd $0x0,%%xmm6,%%xmm6 \n" - "pcmpeqb %%xmm3,%%xmm3 \n" - "psllw $0xb,%%xmm3 \n" - "movdqa %%xmm3,%%xmm4 \n" - "psrlw $0x6,%%xmm4 \n" - "pcmpeqb %%xmm7,%%xmm7 \n" - "psllw $0x8,%%xmm7 \n" - "sub %0,%1 \n" - "sub %0,%1 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "psllw $0x1,%%xmm1 \n" - "psllw $0xb,%%xmm2 \n" - "pand %%xmm3,%%xmm1 \n" - "pmulhuw %%xmm5,%%xmm2 \n" - "pmulhuw %%xmm5,%%xmm1 \n" - "psllw $0x8,%%xmm1 \n" - "por %%xmm2,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "pand %%xmm4,%%xmm0 \n" - "psraw $0x8,%%xmm2 \n" - "pmulhuw %%xmm6,%%xmm0 \n" - "pand %%xmm7,%%xmm2 \n" - "por %%xmm2,%%xmm0 \n" - "movdqa %%xmm1,%%xmm2 \n" - "punpcklbw %%xmm0,%%xmm1 \n" - "punpckhbw %%xmm0,%%xmm2 \n" - "movdqu %%xmm1,0x00(%1,%0,2) \n" - "movdqu %%xmm2,0x10(%1,%0,2) \n" - "lea 0x10(%0),%0 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", - "xmm6", "xmm7"); -} -void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "mov $0xf0f0f0f,%%eax \n" - "movd %%eax,%%xmm4 \n" - "pshufd $0x0,%%xmm4,%%xmm4 \n" - "movdqa %%xmm4,%%xmm5 \n" - "pslld $0x4,%%xmm5 \n" - "sub %0,%1 \n" - "sub %0,%1 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "pand %%xmm4,%%xmm0 \n" - "pand %%xmm5,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm3 \n" - "psllw $0x4,%%xmm1 \n" - "psrlw $0x4,%%xmm3 \n" - "por %%xmm1,%%xmm0 \n" - "por %%xmm3,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm2,%%xmm0 \n" - "punpckhbw %%xmm2,%%xmm1 \n" - "movdqu %%xmm0,0x00(%1,%0,2) \n" - "movdqu %%xmm1,0x10(%1,%0,2) \n" - "lea 0x10(%0),%0 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} -void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { - asm volatile("movdqa %3,%%xmm6 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm3 \n" - "lea 0x40(%0),%0 \n" - "pshufb %%xmm6,%%xmm0 \n" - "pshufb %%xmm6,%%xmm1 \n" - "pshufb %%xmm6,%%xmm2 \n" - "pshufb %%xmm6,%%xmm3 \n" - "movdqa %%xmm1,%%xmm4 \n" - "psrldq $0x4,%%xmm1 \n" - "pslldq $0xc,%%xmm4 \n" - "movdqa %%xmm2,%%xmm5 \n" - "por %%xmm4,%%xmm0 \n" - "pslldq $0x8,%%xmm5 \n" - "movdqu %%xmm0,(%1) \n" - "por %%xmm5,%%xmm1 \n" - "psrldq $0x8,%%xmm2 \n" - "pslldq $0x4,%%xmm3 \n" - "por %%xmm3,%%xmm2 \n" - "movdqu %%xmm1,0x10(%1) \n" - "movdqu %%xmm2,0x20(%1) \n" - "lea 0x30(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "m"(kShuffleMaskARGBToRGB24) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", - "xmm6"); -} -void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) { - asm volatile("movdqa %3,%%xmm6 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm3 \n" - "lea 0x40(%0),%0 \n" - "pshufb %%xmm6,%%xmm0 \n" - "pshufb %%xmm6,%%xmm1 \n" - "pshufb %%xmm6,%%xmm2 \n" - "pshufb %%xmm6,%%xmm3 \n" - "movdqa %%xmm1,%%xmm4 \n" - "psrldq $0x4,%%xmm1 \n" - "pslldq $0xc,%%xmm4 \n" - "movdqa %%xmm2,%%xmm5 \n" - "por %%xmm4,%%xmm0 \n" - "pslldq $0x8,%%xmm5 \n" - "movdqu %%xmm0,(%1) \n" - "por %%xmm5,%%xmm1 \n" - "psrldq $0x8,%%xmm2 \n" - "pslldq $0x4,%%xmm3 \n" - "por %%xmm3,%%xmm2 \n" - "movdqu %%xmm1,0x10(%1) \n" - "movdqu %%xmm2,0x20(%1) \n" - "lea 0x30(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "m"(kShuffleMaskARGBToRAW) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", - "xmm6"); -} #ifdef HAS_ARGBTORGB24ROW_AVX2 // vpermd for 12+12 to 24 @@ -746,89 +377,9 @@ void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { } #endif -void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "pcmpeqb %%xmm3,%%xmm3 \n" - "psrld $0x1b,%%xmm3 \n" - "pcmpeqb %%xmm4,%%xmm4 \n" - "psrld $0x1a,%%xmm4 \n" - "pslld $0x5,%%xmm4 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pslld $0xb,%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "pslld $0x8,%%xmm0 \n" - "psrld $0x3,%%xmm1 \n" - "psrld $0x5,%%xmm2 \n" - "psrad $0x10,%%xmm0 \n" - "pand %%xmm3,%%xmm1 \n" - "pand %%xmm4,%%xmm2 \n" - "pand %%xmm5,%%xmm0 \n" - "por %%xmm2,%%xmm1 \n" - "por %%xmm1,%%xmm0 \n" - "packssdw %%xmm0,%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - ::"memory", - "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} -void ARGBToRGB565DitherRow_SSE2(const uint8_t* src, - uint8_t* dst, - uint32_t dither4, - int width) { - asm volatile( - "movd %3,%%xmm6 \n" - "punpcklbw %%xmm6,%%xmm6 \n" - "movdqa %%xmm6,%%xmm7 \n" - "punpcklwd %%xmm6,%%xmm6 \n" - "punpckhwd %%xmm7,%%xmm7 \n" - "pcmpeqb %%xmm3,%%xmm3 \n" - "psrld $0x1b,%%xmm3 \n" - "pcmpeqb %%xmm4,%%xmm4 \n" - "psrld $0x1a,%%xmm4 \n" - "pslld $0x5,%%xmm4 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pslld $0xb,%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "paddusb %%xmm6,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "pslld $0x8,%%xmm0 \n" - "psrld $0x3,%%xmm1 \n" - "psrld $0x5,%%xmm2 \n" - "psrad $0x10,%%xmm0 \n" - "pand %%xmm3,%%xmm1 \n" - "pand %%xmm4,%%xmm2 \n" - "pand %%xmm5,%%xmm0 \n" - "por %%xmm2,%%xmm1 \n" - "por %%xmm1,%%xmm0 \n" - "packssdw %%xmm0,%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "m"(dither4) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} #ifdef HAS_ARGBTORGB565DITHERROW_AVX2 void ARGBToRGB565DitherRow_AVX2(const uint8_t* src, @@ -876,75 +427,9 @@ void ARGBToRGB565DitherRow_AVX2(const uint8_t* src, } #endif // HAS_ARGBTORGB565DITHERROW_AVX2 -void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "pcmpeqb %%xmm4,%%xmm4 \n" - "psrld $0x1b,%%xmm4 \n" - "movdqa %%xmm4,%%xmm5 \n" - "pslld $0x5,%%xmm5 \n" - "movdqa %%xmm4,%%xmm6 \n" - "pslld $0xa,%%xmm6 \n" - "pcmpeqb %%xmm7,%%xmm7 \n" - "pslld $0xf,%%xmm7 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm0,%%xmm3 \n" - "psrad $0x10,%%xmm0 \n" - "psrld $0x3,%%xmm1 \n" - "psrld $0x6,%%xmm2 \n" - "psrld $0x9,%%xmm3 \n" - "pand %%xmm7,%%xmm0 \n" - "pand %%xmm4,%%xmm1 \n" - "pand %%xmm5,%%xmm2 \n" - "pand %%xmm6,%%xmm3 \n" - "por %%xmm1,%%xmm0 \n" - "por %%xmm3,%%xmm2 \n" - "por %%xmm2,%%xmm0 \n" - "packssdw %%xmm0,%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - ::"memory", - "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); -} -void ARGBToARGB4444Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "pcmpeqb %%xmm4,%%xmm4 \n" - "psllw $0xc,%%xmm4 \n" - "movdqa %%xmm4,%%xmm3 \n" - "psrlw $0x8,%%xmm3 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pand %%xmm3,%%xmm0 \n" - "pand %%xmm4,%%xmm1 \n" - "psrlq $0x4,%%xmm0 \n" - "psrlq $0x8,%%xmm1 \n" - "por %%xmm1,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - ::"memory", - "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); -} #endif // HAS_RGB24TOARGBROW_SSSE3 /* @@ -980,83 +465,9 @@ static const uint32_t kMaskRB10 = 0x3ff003ff; static const uint32_t kMaskAG10 = 0xc000ff00; static const uint32_t kMulAG10 = 64 * 65536 + 1028; -void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "movdqa %3,%%xmm2 \n" // shuffler for RB - "movd %4,%%xmm3 \n" // multipler for RB - "movd %5,%%xmm4 \n" // mask for R10 B10 - "movd %6,%%xmm5 \n" // mask for AG - "movd %7,%%xmm6 \n" // multipler for AG - "pshufd $0x0,%%xmm3,%%xmm3 \n" - "pshufd $0x0,%%xmm4,%%xmm4 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "pshufd $0x0,%%xmm6,%%xmm6 \n" - "sub %0,%1 \n" - "1: \n" - "movdqu (%0),%%xmm0 \n" // fetch 4 ARGB pixels - "movdqa %%xmm0,%%xmm1 \n" - "pshufb %%xmm2,%%xmm1 \n" // R0B0 - "pand %%xmm5,%%xmm0 \n" // A0G0 - "pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10 - "pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10 - "pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10 - "pslld $10,%%xmm0 \n" // A2 x10 G10 x10 - "por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10 - "movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels - "add $0x10,%0 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "m"(kShuffleRB30), // %3 - "m"(kMulRB10), // %4 - "m"(kMaskRB10), // %5 - "m"(kMaskAG10), // %6 - "m"(kMulAG10) // %7 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} -void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "movdqa %3,%%xmm2 \n" // shuffler for RB - "movd %4,%%xmm3 \n" // multipler for RB - "movd %5,%%xmm4 \n" // mask for R10 B10 - "movd %6,%%xmm5 \n" // mask for AG - "movd %7,%%xmm6 \n" // multipler for AG - "pshufd $0x0,%%xmm3,%%xmm3 \n" - "pshufd $0x0,%%xmm4,%%xmm4 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "pshufd $0x0,%%xmm6,%%xmm6 \n" - "sub %0,%1 \n" - - "1: \n" - "movdqu (%0),%%xmm0 \n" // fetch 4 ABGR pixels - "movdqa %%xmm0,%%xmm1 \n" - "pshufb %%xmm2,%%xmm1 \n" // R0B0 - "pand %%xmm5,%%xmm0 \n" // A0G0 - "pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10 - "pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10 - "pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10 - "pslld $10,%%xmm0 \n" // A2 x10 G10 x10 - "por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10 - "movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels - "add $0x10,%0 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "m"(kShuffleBR30), // %3 reversed shuffler - "m"(kMulRB10), // %4 - "m"(kMaskRB10), // %5 - "m"(kMaskAG10), // %6 - "m"(kMulAG10) // %7 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} #ifdef HAS_ARGBTOAR30ROW_AVX2 void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { @@ -1140,99 +551,13 @@ static const uvec8 kShuffleARGBToAB64Lo = {2, 2, 1, 1, 0, 0, 3, 3, static const uvec8 kShuffleARGBToAB64Hi = {10, 10, 9, 9, 8, 8, 11, 11, 14, 14, 13, 13, 12, 12, 15, 15}; -void ARGBToAR64Row_SSSE3(const uint8_t* src_argb, - uint16_t* dst_ar64, - int width) { - asm volatile( - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "lea 0x10(%0),%0 \n" - "lea 0x20(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_ar64), // %1 - "+r"(width) // %2 - ::"memory", - "cc", "xmm0", "xmm1"); -} -void ARGBToAB64Row_SSSE3(const uint8_t* src_argb, - uint16_t* dst_ab64, - int width) { - asm volatile( - "movdqa %3,%%xmm2 \n" - "movdqa %4,%%xmm3 \n" LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pshufb %%xmm2,%%xmm0 \n" - "pshufb %%xmm3,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "lea 0x10(%0),%0 \n" - "lea 0x20(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_ab64), // %1 - "+r"(width) // %2 - : "m"(kShuffleARGBToAB64Lo), // %3 - "m"(kShuffleARGBToAB64Hi) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2"); -} -void AR64ToARGBRow_SSSE3(const uint16_t* src_ar64, - uint8_t* dst_argb, - int width) { - asm volatile( - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "psrlw $8,%%xmm0 \n" - "psrlw $8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x20(%0),%0 \n" - "lea 0x10(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_ar64), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - ::"memory", - "cc", "xmm0", "xmm1"); -} -void AB64ToARGBRow_SSSE3(const uint16_t* src_ab64, - uint8_t* dst_argb, - int width) { - asm volatile("movdqa %3,%%xmm2 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "psrlw $8,%%xmm0 \n" - "psrlw $8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "pshufb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x20(%0),%0 \n" - "lea 0x10(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_ab64), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kShuffleARGBToABGR) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2"); -} + + + #ifdef HAS_ARGBTOAR64ROW_AVX2 void ARGBToAR64Row_AVX2(const uint8_t* src_argb, @@ -1407,37 +732,9 @@ void AB64ToARGBRow_AVX2(const uint16_t* src_ab64, // clang-format on -#ifdef HAS_ARGBTOYROW_SSSE3 -// Convert 16 ARGB pixels (64 bytes) to 16 Y values. -void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { - ARGBToYMatrixRow_SSSE3(src_argb, dst_y, width, &kArgbI601Constants); -} -#endif // HAS_ARGBTOYROW_SSSE3 -#ifdef HAS_ARGBTOYJROW_SSSE3 -// Convert 16 ARGB pixels (64 bytes) to 16 YJ values. -// Same as ARGBToYRow but different coefficients, no add 16. -void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { - ARGBToYMatrixRow_SSSE3(src_argb, dst_y, width, &kArgbJPEGConstants); -} -#endif // HAS_ARGBTOYJROW_SSSE3 -#ifdef HAS_ABGRTOYJROW_SSSE3 -// Convert 16 ABGR pixels (64 bytes) to 16 YJ values. -// Same as ABGRToYRow but different coefficients, no add 16. -void ABGRToYJRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) { - ARGBToYMatrixRow_SSSE3(src_abgr, dst_y, width, &kAbgrJPEGConstants); -} -#endif // HAS_ABGRTOYJROW_SSSE3 - -#ifdef HAS_RGBATOYJROW_SSSE3 -// Convert 16 RGBA pixels (64 bytes) to 16 YJ values. -// Same as ARGBToYRow but different coefficients, no add 16. -void RGBAToYJRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) { - ARGBToYMatrixRow_SSSE3(src_rgba, dst_y, width, &kRgbaJPEGConstants); -} -#endif // HAS_RGBATOYJROW_SSSE3 #if defined(HAS_ARGBTOYROW_AVX2) || defined(HAS_ABGRTOYROW_AVX2) || \ defined(HAS_ARGBEXTRACTALPHAROW_AVX2) @@ -1497,31 +794,6 @@ void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width) { static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7}; #endif -#ifdef HAS_ARGBTOYROW_SSSE3 -void ARGBToYMatrixRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_y, - int width, - const struct ArgbConstants* c) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psllw $15,%%xmm5 \n" - "packsswb %%xmm5,%%xmm5 \n" - "movdqa 0(%3),%%xmm4 \n" - "movdqa 0x60(%3),%%xmm7 \n" - "movdqa %%xmm4,%%xmm6 \n" - "pmaddubsw %%xmm5,%%xmm6 \n" - "phaddw %%xmm6,%%xmm6 \n" - "psubw %%xmm6,%%xmm7 \n" - LABELALIGN "" - RGBTOY(xmm7) - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "r"(c) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif #ifdef HAS_ARGBTOYROW_AVX2 void ARGBToYMatrixRow_AVX2(const uint8_t* src_argb, @@ -1620,75 +892,6 @@ void ARGBToYMatrixRow_AVX512BW(const uint8_t* src_argb, } #endif -#ifdef HAS_ARGBTOUV444ROW_SSSE3 -void ARGBToUV444MatrixRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width, - const struct ArgbConstants* c) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" // 0x8000 - "psllw $15,%%xmm5 \n" - "movdqa 0x20(%4),%%xmm3 \n" // kRGBToU - "movdqa 0x40(%4),%%xmm4 \n" // kRGBToV - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" - "pmaddubsw %%xmm3,%%xmm0 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm6,%%xmm2 \n" - "movdqa %%xmm5,%%xmm1 \n" - "movdqa %%xmm5,%%xmm6 \n" - "psubw %%xmm0,%%xmm1 \n" - "psubw %%xmm2,%%xmm6 \n" - "psrlw $0x8,%%xmm1 \n" - "psrlw $0x8,%%xmm6 \n" - "packuswb %%xmm6,%%xmm1 \n" - "movdqu %%xmm1,(%1) \n" - - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm6 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm6,%%xmm2 \n" - "movdqa %%xmm5,%%xmm1 \n" - "movdqa %%xmm5,%%xmm6 \n" - "psubw %%xmm0,%%xmm1 \n" - "psubw %%xmm2,%%xmm6 \n" - "psrlw $0x8,%%xmm1 \n" - "psrlw $0x8,%%xmm6 \n" - "packuswb %%xmm6,%%xmm1 \n" - "movdqu %%xmm1,0x00(%1,%2,1) \n" - - "lea 0x40(%0),%0 \n" - "lea 0x10(%1),%1 \n" - "subl $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 -#if defined(__i386__) - "+m"(width) // %3 -#else - "+rm"(width) // %3 -#endif - : "r"(c) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} -#endif // HAS_ARGBTOUV444ROW_SSSE3 #ifdef HAS_ARGBTOUV444ROW_AVX2 @@ -1842,7 +1045,7 @@ void ARGBToUV444MatrixRow_AVX512BW(const uint8_t* src_argb, } #endif // HAS_ARGBTOUV444ROW_AVX512BW -#ifdef HAS_ARGBTOUVROW_SSSE3 +#if 1 // ARGBARGB to AARRGGBB shuffle static const lvec8 kShuffleAARRGGBB = { @@ -1852,73 +1055,7 @@ static const lvec8 kShuffleAARRGGBB = { // 8x2 -> 4x1 ARGB pixels converted to 4 U and 4 V // ARGBToUV does rounding average of 4 ARGB pixels -void ARGBToUVMatrixRow_SSSE3(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width, - const struct ArgbConstants* c) { - asm volatile( - "movdqa 0x20(%5),%%xmm4 \n" // RGBToU - "movdqa 0x40(%5),%%xmm5 \n" // RGBToV - "pcmpeqb %%xmm6,%%xmm6 \n" // 0x0101 - "pabsb %%xmm6,%%xmm6 \n" - "movdqa %6,%%xmm7 \n" // kShuffleAARRGGBB - "sub %1,%2 \n" - "1: \n" - "movdqu (%0),%%xmm0 \n" // Read 8x2 ARGB Pixels - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x00(%0,%4,1),%%xmm2 \n" - "movdqu 0x10(%0,%4,1),%%xmm3 \n" - "pshufb %%xmm7,%%xmm0 \n" // aarrggbb - "pshufb %%xmm7,%%xmm1 \n" - "pshufb %%xmm7,%%xmm2 \n" - "pshufb %%xmm7,%%xmm3 \n" - "pmaddubsw %%xmm6,%%xmm0 \n" // 8x2 -> 4x2 - "pmaddubsw %%xmm6,%%xmm1 \n" - "pmaddubsw %%xmm6,%%xmm2 \n" - "pmaddubsw %%xmm6,%%xmm3 \n" - "paddw %%xmm2,%%xmm0 \n" // 4x2 -> 4x1 - "paddw %%xmm3,%%xmm1 \n" - "pxor %%xmm2,%%xmm2 \n" // 0 for vpavgw - "psrlw $1,%%xmm0 \n" - "psrlw $1,%%xmm1 \n" - "pavgw %%xmm2,%%xmm0 \n" - "pavgw %%xmm2,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" // mutates - - "movdqa %%xmm6,%%xmm2 \n" - "psllw $15,%%xmm2 \n" // 0x8000 - "movdqa %%xmm0,%%xmm1 \n" - "pmaddubsw %%xmm5,%%xmm1 \n" // 4 V - "pmaddubsw %%xmm4,%%xmm0 \n" // 4 U - "phaddw %%xmm1,%%xmm0 \n" // uuuuvvvv - "psubw %%xmm0,%%xmm2 \n" - "psrlw $0x8,%%xmm2 \n" - "packuswb %%xmm2,%%xmm2 \n" - "movd %%xmm2,(%1) \n" // Write 4 U's - "pshufd $0x55,%%xmm2,%%xmm2 \n" // Copy V to low 4 bytes - "movd %%xmm2,0x00(%1,%2,1) \n" // Write 4 V's - - "lea 0x20(%0),%0 \n" - "lea 0x4(%1),%1 \n" - "subl $0x8,%3 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 -#if defined(__i386__) - "+m"(width) // %3 -#else - "+rm"(width) // %3 -#endif - : "r"((ptrdiff_t)(src_stride_argb)), // %4 - "r"(c), // %5 - "m"(kShuffleAARRGGBB) // %6 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} #endif // HAS_ARGBTOUVROW_SSSE3 @@ -2004,15 +1141,6 @@ void ARGBToUVMatrixRow_AVX2(const uint8_t* src_argb, // VG -0.7344 coefficient = -94 // VR 0.875 coefficient = 112 -#ifdef HAS_ARGBTOUV444ROW_SSSE3 -void ARGBToUV444Row_SSSE3(const uint8_t* src_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - ARGBToUV444MatrixRow_SSSE3(src_argb, dst_u, dst_v, width, - &kArgbI601Constants); -} -#endif // HAS_ARGBTOUV444ROW_SSSE3 #ifdef HAS_ARGBTOYROW_AVX2 @@ -2089,43 +1217,6 @@ void ARGBToUV444Row_AVX512BW(const uint8_t* src_argb, } #endif // HAS_ARGBTOUV444ROW_AVX512BW -#ifdef HAS_ARGBTOUVROW_SSSE3 -void ARGBToUVRow_SSSE3(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - ARGBToUVMatrixRow_SSSE3(src_argb, src_stride_argb, dst_u, dst_v, width, - &kArgbI601Constants); -} - -void ABGRToUVRow_SSSE3(const uint8_t* src_abgr, - int src_stride_abgr, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - ARGBToUVMatrixRow_SSSE3(src_abgr, src_stride_abgr, dst_u, dst_v, width, - &kAbgrI601Constants); -} - -void BGRAToUVRow_SSSE3(const uint8_t* src_bgra, - int src_stride_bgra, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - ARGBToUVMatrixRow_SSSE3(src_bgra, src_stride_bgra, dst_u, dst_v, width, - &kBgraI601Constants); -} - -void RGBAToUVRow_SSSE3(const uint8_t* src_rgba, - int src_stride_rgba, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - ARGBToUVMatrixRow_SSSE3(src_rgba, src_stride_rgba, dst_u, dst_v, width, - &kRgbaI601Constants); -} -#endif // HAS_ARGBTOUVROW_SSSE3 #ifdef HAS_ARGBTOUVROW_AVX2 void ARGBToUVRow_AVX2(const uint8_t* src_argb, @@ -2147,15 +1238,6 @@ void ABGRToUVRow_AVX2(const uint8_t* src_abgr, } #endif // HAS_ARGBTOUVROW_AVX2 -#ifdef HAS_ARGBTOUVJ444ROW_SSSE3 -void ARGBToUVJ444Row_SSSE3(const uint8_t* src_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - ARGBToUV444MatrixRow_SSSE3(src_argb, dst_u, dst_v, width, - &kArgbJPEGConstants); -} -#endif // HAS_ARGBTOUVJ444ROW_SSSE3 #ifdef HAS_ARGBTOUVJ444ROW_AVX2 void ARGBToUVJ444Row_AVX2(const uint8_t* src_argb, @@ -2176,27 +1258,7 @@ void ARGBToUVJ444Row_AVX512BW(const uint8_t* src_argb, } #endif // HAS_ARGBTOUVJ444ROW_AVX512BW -#ifdef HAS_ARGBTOUVJROW_SSSE3 -void ARGBToUVJRow_SSSE3(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - ARGBToUVMatrixRow_SSSE3(src_argb, src_stride_argb, dst_u, dst_v, width, - &kArgbJPEGConstants); -} -#endif // HAS_ARGBTOUVJROW_SSSE3 -#ifdef HAS_ABGRTOUVJROW_SSSE3 -void ABGRToUVJRow_SSSE3(const uint8_t* src_abgr, - int src_stride_abgr, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - ARGBToUVMatrixRow_SSSE3(src_abgr, src_stride_abgr, dst_u, dst_v, width, - &kAbgrJPEGConstants); -} -#endif // HAS_ABGRTOUVJROW_SSSE3 #ifdef HAS_ARGBTOUVJROW_AVX2 void ARGBToUVJRow_AVX2(const uint8_t* src_argb, @@ -2343,19 +1405,13 @@ void ABGRToUVJRow_AVX512BW(const uint8_t* src_abgr, #endif // HAS_ABGRTOUVJROW_AVX512BW #endif // HAS_ARGBTOUVROW_AVX512BW -void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width) { - ARGBToYMatrixRow_SSSE3(src_bgra, dst_y, width, &kBgraI601Constants); -} -void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) { - ARGBToYMatrixRow_SSSE3(src_abgr, dst_y, width, &kAbgrI601Constants); -} -void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) { - ARGBToYMatrixRow_SSSE3(src_rgba, dst_y, width, &kRgbaI601Constants); -} -#if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2) + + + +#if 1 || defined(HAS_I422TOARGBROW_AVX2) // Read 8 UV from 444 #define READYUV444 \ @@ -2682,707 +1738,55 @@ void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) { "movdqu %%xmm3,0x10(%[dst_ar30]) \n" \ "lea 0x20(%[dst_ar30]), %[dst_ar30] \n" -void OMITFP I444ToARGBRow_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - LABELALIGN - "1: \n" - READYUV444 - YUVTORGB(yuvconstants) - STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); -} -#ifdef HAS_I444ALPHATOARGBROW_SSSE3 -void OMITFP I444AlphaToARGBRow_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile(YUVTORGB_SETUP( - yuvconstants) "sub %[u_buf],%[v_buf] \n" - LABELALIGN "1: \n" READYUVA444 YUVTORGB(yuvconstants) - STOREARGB - "subl $0x8,%[width] \n" - "jg 1b \n" - : [y_buf] "+r"(y_buf), // %[y_buf] - [u_buf] "+r"(u_buf), // %[u_buf] - [v_buf] "+r"(v_buf), // %[v_buf] - [a_buf] "+r"(a_buf), // %[a_buf] - [dst_argb] "+r"(dst_argb), // %[dst_argb] -#if defined(__i386__) - [width] "+m"(width) // %[width] -#else - [width] "+rm"(width) // %[width] -#endif - : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", - "xmm4", "xmm5"); -} -#endif // HAS_I444ALPHATOARGBROW_SSSE3 -void OMITFP I422ToRGB24Row_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n" - "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n" - "sub %[u_buf],%[v_buf] \n" - LABELALIGN - "1: \n" - READYUV422 - YUVTORGB(yuvconstants) - STORERGB24 - "subl $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24] -#if defined(__i386__) - [width]"+m"(width) // %[width] -#else - [width]"+rm"(width) // %[width] -#endif - : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] - [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0), - [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24) - : "memory", "cc", YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" - ); -} -void OMITFP I444ToRGB24Row_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n" - "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n" - "sub %[u_buf],%[v_buf] \n" - LABELALIGN - "1: \n" - READYUV444 - YUVTORGB(yuvconstants) - STORERGB24 - "subl $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24] -#if defined(__i386__) - [width]"+m"(width) // %[width] -#else - [width]"+rm"(width) // %[width] -#endif - : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] - [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0), - [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24) - : "memory", "cc", YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" - ); -} -void OMITFP I422ToARGBRow_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - LABELALIGN - "1: \n" - READYUV422 - YUVTORGB(yuvconstants) - STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); -} -void OMITFP I422ToAR30Row_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" // AR30 constants - "psrlw $14,%%xmm5 \n" - "psllw $4,%%xmm5 \n" // 2 alpha bits - "pxor %%xmm6,%%xmm6 \n" // 0 for min - "pcmpeqb %%xmm7,%%xmm7 \n" - "psrlw $6,%%xmm7 \n" // 1023 for max - - LABELALIGN - "1: \n" - READYUV422 - YUVTORGB16(yuvconstants) - STOREAR30 - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); -} // 10 bit YUV to ARGB -void OMITFP I210ToARGBRow_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - LABELALIGN - "1: \n" - READYUV210 - YUVTORGB(yuvconstants) - STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); -} // 12 bit YUV to ARGB -void OMITFP I212ToARGBRow_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - LABELALIGN - "1: \n" - READYUV212 - YUVTORGB(yuvconstants) - STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); -} // 10 bit YUV to AR30 -void OMITFP I210ToAR30Row_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $14,%%xmm5 \n" - "psllw $4,%%xmm5 \n" // 2 alpha bits - "pxor %%xmm6,%%xmm6 \n" // 0 for min - "pcmpeqb %%xmm7,%%xmm7 \n" - "psrlw $6,%%xmm7 \n" // 1023 for max - LABELALIGN - "1: \n" - READYUV210 - YUVTORGB16(yuvconstants) - STOREAR30 - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); -} // 12 bit YUV to AR30 -void OMITFP I212ToAR30Row_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $14,%%xmm5 \n" - "psllw $4,%%xmm5 \n" // 2 alpha bits - "pxor %%xmm6,%%xmm6 \n" // 0 for min - "pcmpeqb %%xmm7,%%xmm7 \n" - "psrlw $6,%%xmm7 \n" // 1023 for max - LABELALIGN - "1: \n" - READYUV212 - YUVTORGB16(yuvconstants) - STOREAR30 - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); -} // 10 bit YUV to ARGB -void OMITFP I410ToARGBRow_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - LABELALIGN - "1: \n" - READYUV410 - YUVTORGB(yuvconstants) - STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); -} -#ifdef HAS_I210ALPHATOARGBROW_SSSE3 -// 10 bit YUVA to ARGB -void OMITFP I210AlphaToARGBRow_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - const uint16_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile(YUVTORGB_SETUP( - yuvconstants) "sub %[u_buf],%[v_buf] \n" - LABELALIGN "1: \n" READYUVA210 YUVTORGB(yuvconstants) - STOREARGB - "subl $0x8,%[width] \n" - "jg 1b \n" - : [y_buf] "+r"(y_buf), // %[y_buf] - [u_buf] "+r"(u_buf), // %[u_buf] - [v_buf] "+r"(v_buf), // %[v_buf] - [a_buf] "+r"(a_buf), - [dst_argb] "+r"(dst_argb), // %[dst_argb] -#if defined(__i386__) - [width] "+m"(width) // %[width] -#else - [width] "+rm"(width) // %[width] -#endif - : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", - "xmm4", "xmm5"); -} -#endif - -#ifdef HAS_I410ALPHATOARGBROW_SSSE3 -// 10 bit YUVA to ARGB -void OMITFP I410AlphaToARGBRow_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - const uint16_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile(YUVTORGB_SETUP( - yuvconstants) "sub %[u_buf],%[v_buf] \n" - - LABELALIGN "1: \n" READYUVA410 YUVTORGB(yuvconstants) - STOREARGB - "subl $0x8,%[width] \n" - "jg 1b \n" - : [y_buf] "+r"(y_buf), // %[y_buf] - [u_buf] "+r"(u_buf), // %[u_buf] - [v_buf] "+r"(v_buf), // %[v_buf] - [a_buf] "+r"(a_buf), - [dst_argb] "+r"(dst_argb), // %[dst_argb] -#if defined(__i386__) - [width] "+m"(width) // %[width] -#else - [width] "+rm"(width) // %[width] -#endif - : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", - "xmm4", "xmm5"); -} -#endif // 10 bit YUV to AR30 -void OMITFP I410ToAR30Row_SSSE3(const uint16_t* y_buf, - const uint16_t* u_buf, - const uint16_t* v_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $14,%%xmm5 \n" - "psllw $4,%%xmm5 \n" // 2 alpha bits - "pxor %%xmm6,%%xmm6 \n" // 0 for min - "pcmpeqb %%xmm7,%%xmm7 \n" - "psrlw $6,%%xmm7 \n" // 1023 for max - LABELALIGN - "1: \n" - READYUV410 - YUVTORGB16(yuvconstants) - STOREAR30 - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); -} -#ifdef HAS_I422ALPHATOARGBROW_SSSE3 -void OMITFP I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - const uint8_t* a_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile(YUVTORGB_SETUP( - yuvconstants) "sub %[u_buf],%[v_buf] \n" - LABELALIGN "1: \n" READYUVA422 YUVTORGB(yuvconstants) - STOREARGB - "subl $0x8,%[width] \n" - "jg 1b \n" - : [y_buf] "+r"(y_buf), // %[y_buf] - [u_buf] "+r"(u_buf), // %[u_buf] - [v_buf] "+r"(v_buf), // %[v_buf] - [a_buf] "+r"(a_buf), // %[a_buf] - [dst_argb] "+r"(dst_argb), // %[dst_argb] -#if defined(__i386__) - [width] "+m"(width) // %[width] -#else - [width] "+rm"(width) // %[width] -#endif - : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", - "xmm4", "xmm5"); -} -#endif // HAS_I422ALPHATOARGBROW_SSSE3 -void OMITFP NV12ToARGBRow_SSSE3(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile(YUVTORGB_SETUP( - yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" - LABELALIGN "1: \n" READNV12 YUVTORGB(yuvconstants) - STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf] "+r"(y_buf), // %[y_buf] - [uv_buf] "+r"(uv_buf), // %[uv_buf] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+rm"(width) // %[width] - : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", - "xmm4", "xmm5"); -} -void OMITFP NV21ToARGBRow_SSSE3(const uint8_t* y_buf, - const uint8_t* vu_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile(YUVTORGB_SETUP( - yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" - LABELALIGN "1: \n" READNV21 YUVTORGB(yuvconstants) - STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf] "+r"(y_buf), // %[y_buf] - [vu_buf] "+r"(vu_buf), // %[vu_buf] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+rm"(width) // %[width] - : [yuvconstants] "r"(yuvconstants), // %[yuvconstants] - [kShuffleNV21] "m"(kShuffleNV21) - : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", - "xmm4", "xmm5"); -} -void OMITFP YUY2ToARGBRow_SSSE3(const uint8_t* yuy2_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - "movdqa %[kShuffleYUY2Y],%%xmm6 \n" - "movdqa %[kShuffleYUY2UV],%%xmm7 \n" YUVTORGB_SETUP( - yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" - LABELALIGN "1: \n" READYUY2 YUVTORGB(yuvconstants) STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" - : [yuy2_buf] "+r"(yuy2_buf), // %[yuy2_buf] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+rm"(width) // %[width] - : [yuvconstants] "r"(yuvconstants), // %[yuvconstants] - [kShuffleYUY2Y] "m"(kShuffleYUY2Y), [kShuffleYUY2UV] "m"(kShuffleYUY2UV) - : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", - "xmm5", "xmm6", "xmm7"); -} -void OMITFP UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - "movdqa %[kShuffleUYVYY],%%xmm6 \n" - "movdqa %[kShuffleUYVYUV],%%xmm7 \n" YUVTORGB_SETUP( - yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" - LABELALIGN "1: \n" READUYVY YUVTORGB(yuvconstants) STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" - : [uyvy_buf] "+r"(uyvy_buf), // %[uyvy_buf] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+rm"(width) // %[width] - : [yuvconstants] "r"(yuvconstants), // %[yuvconstants] - [kShuffleUYVYY] "m"(kShuffleUYVYY), [kShuffleUYVYUV] "m"(kShuffleUYVYUV) - : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", - "xmm5"); -} -void OMITFP P210ToARGBRow_SSSE3(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile(YUVTORGB_SETUP( - yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" - LABELALIGN "1: \n" READP210 YUVTORGB(yuvconstants) - STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf] "+r"(y_buf), // %[y_buf] - [uv_buf] "+r"(uv_buf), // %[u_buf] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+rm"(width) // %[width] - : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", - "xmm4", "xmm5"); -} -void OMITFP P410ToARGBRow_SSSE3(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile(YUVTORGB_SETUP( - yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" - LABELALIGN "1: \n" READP410 YUVTORGB(yuvconstants) - STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf] "+r"(y_buf), // %[y_buf] - [uv_buf] "+r"(uv_buf), // %[u_buf] - [dst_argb] "+r"(dst_argb), // %[dst_argb] - [width] "+rm"(width) // %[width] - : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", - "xmm4", "xmm5"); -} -void OMITFP P210ToAR30Row_SSSE3(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $14,%%xmm5 \n" - "psllw $4,%%xmm5 \n" // 2 alpha bits - "pxor %%xmm6,%%xmm6 \n" // 0 for min - "pcmpeqb %%xmm7,%%xmm7 \n" - "psrlw $6,%%xmm7 \n" // 1023 for max - LABELALIGN - "1: \n" - READP210 - YUVTORGB16(yuvconstants) - STOREAR30 - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [uv_buf]"+r"(uv_buf), // %[uv_buf] - [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); -} -void OMITFP P410ToAR30Row_SSSE3(const uint16_t* y_buf, - const uint16_t* uv_buf, - uint8_t* dst_ar30, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $14,%%xmm5 \n" - "psllw $4,%%xmm5 \n" // 2 alpha bits - "pxor %%xmm6,%%xmm6 \n" // 0 for min - "pcmpeqb %%xmm7,%%xmm7 \n" - "psrlw $6,%%xmm7 \n" // 1023 for max - LABELALIGN - "1: \n" - READP410 - YUVTORGB16(yuvconstants) - STOREAR30 - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [uv_buf]"+r"(uv_buf), // %[uv_buf] - [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); -} -void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_rgba, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - - LABELALIGN - "1: \n" - READYUV422 - YUVTORGB(yuvconstants) - STORERGBA - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_rgba]"+r"(dst_rgba), // %[dst_rgba] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); -} #endif // HAS_I422TOARGBROW_SSSE3 @@ -4556,48 +2960,6 @@ void OMITFP P410ToAR30Row_AVX2(const uint16_t* y_buf, } #endif // HAS_P410TOAR30ROW_AVX2 -#ifdef HAS_I400TOARGBROW_SSE2 -void I400ToARGBRow_SSE2(const uint8_t* y_buf, - uint8_t* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile( - "movdqa 96(%3),%%xmm2 \n" // yg = 18997 = 1.164 - "movdqa 128(%3),%%xmm3 \n" // ygb = 1160 = 1.164 * 16 - "pcmpeqb %%xmm4,%%xmm4 \n" // 0xff000000 - "pslld $0x18,%%xmm4 \n" - - LABELALIGN - "1: \n" - // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164 - "movq (%0),%%xmm0 \n" - "lea 0x8(%0),%0 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "paddsw %%xmm3,%%xmm0 \n" - "psraw $6, %%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - - // Step 2: Weave into ARGB - "punpcklbw %%xmm0,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm0,%%xmm0 \n" - "punpckhwd %%xmm1,%%xmm1 \n" - "por %%xmm4,%%xmm0 \n" - "por %%xmm4,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(y_buf), // %0 - "+r"(dst_argb), // %1 - "+rm"(width) // %2 - : "r"(yuvconstants) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); -} -#endif // HAS_I400TOARGBROW_SSE2 #ifdef HAS_I400TOARGBROW_AVX2 // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes). @@ -4643,29 +3005,12 @@ void I400ToARGBRow_AVX2(const uint8_t* y_buf, } #endif // HAS_I400TOARGBROW_AVX2 -#ifdef HAS_MIRRORROW_SSSE3 +#if 1 // Shuffle table for reversing the bytes. static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; -void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) { - ptrdiff_t temp_width = (ptrdiff_t)(width); - asm volatile("movdqa %3,%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu -0x10(%0,%2,1),%%xmm0 \n" - "pshufb %%xmm5,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(temp_width) // %2 - : "m"(kShuffleMirror) // %3 - : "memory", "cc", "xmm0", "xmm5"); -} #endif // HAS_MIRRORROW_SSSE3 #ifdef HAS_MIRRORROW_AVX2 @@ -4691,29 +3036,12 @@ void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { } #endif // HAS_MIRRORROW_AVX2 -#ifdef HAS_MIRRORUVROW_SSSE3 +#if 1 // Shuffle table for reversing the UV. static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u, 6u, 7u, 4u, 5u, 2u, 3u, 0u, 1u}; -void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width) { - ptrdiff_t temp_width = (ptrdiff_t)(width); - asm volatile("movdqa %3,%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu -0x10(%0,%2,2),%%xmm0 \n" - "pshufb %%xmm5,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_uv), // %1 - "+r"(temp_width) // %2 - : "m"(kShuffleMirrorUV) // %3 - : "memory", "cc", "xmm0", "xmm5"); -} #endif // HAS_MIRRORUVROW_SSSE3 #ifdef HAS_MIRRORUVROW_AVX2 @@ -4739,40 +3067,14 @@ void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width) { } #endif // HAS_MIRRORUVROW_AVX2 -#ifdef HAS_MIRRORSPLITUVROW_SSSE3 +#if 1 // Shuffle table for reversing the bytes of UV channels. static const uvec8 kShuffleMirrorSplitUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u}; -void MirrorSplitUVRow_SSSE3(const uint8_t* src, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - ptrdiff_t temp_width = (ptrdiff_t)(width); - asm volatile( - "movdqa %4,%%xmm1 \n" - "lea -0x10(%0,%3,2),%0 \n" - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "lea -0x10(%0),%0 \n" - "pshufb %%xmm1,%%xmm0 \n" - "movlpd %%xmm0,(%1) \n" - "movhpd %%xmm0,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $8,%3 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(temp_width) // %3 - : "m"(kShuffleMirrorSplitUV) // %4 - : "memory", "cc", "xmm0", "xmm1"); -} #endif // HAS_MIRRORSPLITUVROW_SSSE3 -#ifdef HAS_RGB24MIRRORROW_SSSE3 +#if 1 // Shuffle first 5 pixels to last 5 mirrored. first byte zero static const uvec8 kShuffleMirrorRGB0 = {128u, 12u, 13u, 14u, 9u, 10u, 11u, 6u, @@ -4783,64 +3085,9 @@ static const uvec8 kShuffleMirrorRGB1 = { 13u, 14u, 15u, 10u, 11u, 12u, 7u, 8u, 9u, 4u, 5u, 6u, 1u, 2u, 3u, 128u}; // Shuffle 5 pixels at a time (15 bytes) -void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24, - uint8_t* dst_rgb24, - int width) { - ptrdiff_t temp_width = (ptrdiff_t)(width); - src_rgb24 += width * 3 - 48; - asm volatile( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" // first 5 - "movdqu 15(%0),%%xmm1 \n" // next 5 - "movdqu 30(%0),%%xmm2 \n" // next 5 - "movdqu 32(%0),%%xmm3 \n" // last 1 special - "pshufb %%xmm4,%%xmm0 \n" - "pshufb %%xmm4,%%xmm1 \n" - "pshufb %%xmm4,%%xmm2 \n" - "pshufb %%xmm5,%%xmm3 \n" - "lea -0x30(%0),%0 \n" - "movdqu %%xmm0,32(%1) \n" // last 5 - "movdqu %%xmm1,17(%1) \n" // next 5 - "movdqu %%xmm2,2(%1) \n" // next 5 - "movlpd %%xmm3,0(%1) \n" // first 1 - "lea 0x30(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_rgb24), // %0 - "+r"(dst_rgb24), // %1 - "+r"(temp_width) // %2 - : "m"(kShuffleMirrorRGB0), // %3 - "m"(kShuffleMirrorRGB1) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} #endif // HAS_RGB24MIRRORROW_SSSE3 -#ifdef HAS_ARGBMIRRORROW_SSE2 - -void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { - ptrdiff_t temp_width = (ptrdiff_t)(width); - asm volatile("lea -0x10(%0,%2,4),%0 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "pshufd $0x1b,%%xmm0,%%xmm0 \n" - "lea -0x10(%0),%0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(temp_width) // %2 - : - : "memory", "cc", "xmm0"); -} -#endif // HAS_ARGBMIRRORROW_SSE2 #ifdef HAS_ARGBMIRRORROW_AVX2 // Shuffle table for reversing the bytes. @@ -4903,86 +3150,8 @@ void SplitUVRow_AVX2(const uint8_t* src_uv, } #endif // HAS_SPLITUVROW_AVX2 -#ifdef HAS_SPLITUVROW_SSE2 -void SplitUVRow_SSE2(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "psrlw $0x8,%%xmm2 \n" - "psrlw $0x8,%%xmm3 \n" - "packuswb %%xmm3,%%xmm2 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm2,0x00(%1,%2,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); -} -#endif // HAS_SPLITUVROW_SSE2 -#ifdef HAS_DETILEROW_SSE2 -void DetileRow_SSE2(const uint8_t* src, - ptrdiff_t src_tile_stride, - uint8_t* dst, - int width) { - asm volatile( - "1: \n" - "movdqu (%0),%%xmm0 \n" - "sub $0x10,%2 \n" - "lea (%0,%3),%0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "r"(src_tile_stride) // %3 - : "cc", "memory", "xmm0"); -} -#endif // HAS_DETILEROW_SSE2 - -#ifdef HAS_DETILEROW_16_SSE2 -void DetileRow_16_SSE2(const uint16_t* src, - ptrdiff_t src_tile_stride, - uint16_t* dst, - int width) { - asm volatile( - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea (%0,%3,2),%0 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "r"(src_tile_stride) // %3 - : "cc", "memory", "xmm0", "xmm1"); -} -#endif // HAS_DETILEROW_SSE2 #ifdef HAS_DETILEROW_16_AVX void DetileRow_16_AVX(const uint16_t* src, @@ -5006,40 +3175,8 @@ void DetileRow_16_AVX(const uint16_t* src, } #endif // HAS_DETILEROW_AVX -#ifdef HAS_DETILETOYUY2_SSE2 -// Read 16 Y, 8 UV, and write 8 YUYV. -void DetileToYUY2_SSE2(const uint8_t* src_y, - ptrdiff_t src_y_tile_stride, - const uint8_t* src_uv, - ptrdiff_t src_uv_tile_stride, - uint8_t* dst_yuy2, - int width) { - asm volatile( - "1: \n" - "movdqu (%0),%%xmm0 \n" // Load 16 Y - "sub $0x10,%3 \n" - "lea (%0,%4),%0 \n" - "movdqu (%1),%%xmm1 \n" // Load 8 UV - "lea (%1,%5),%1 \n" - "movdqu %%xmm0,%%xmm2 \n" - "punpcklbw %%xmm1,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm2 \n" - "movdqu %%xmm0,(%2) \n" - "movdqu %%xmm2,0x10(%2) \n" - "lea 0x20(%2),%2 \n" - "jg 1b \n" - : "+r"(src_y), // %0 - "+r"(src_uv), // %1 - "+r"(dst_yuy2), // %2 - "+r"(width) // %3 - : "r"(src_y_tile_stride), // %4 - "r"(src_uv_tile_stride) // %5 - : "cc", "memory", "xmm0", "xmm1", "xmm2" // Clobber list - ); -} -#endif -#ifdef HAS_DETILESPLITUVROW_SSSE3 +#if 1 // TODO(greenjustin): Look into generating these constants instead of loading // them since this can cause branch mispredicts for fPIC code on 32-bit // machines. @@ -5048,31 +3185,7 @@ static const uvec8 kDeinterlaceUV = {0, 2, 4, 6, 8, 10, 12, 14, // TODO(greenjustin): Research alternatives to pshufb, since pshufb can be very // slow on older SSE2 processors. -void DetileSplitUVRow_SSSE3(const uint8_t* src_uv, - ptrdiff_t src_tile_stride, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "movdqu %4,%%xmm1 \n" - "1: \n" - "movdqu (%0),%%xmm0 \n" - "lea (%0, %5),%0 \n" - "pshufb %%xmm1,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "movhps %%xmm0,(%2) \n" - "lea 0x8(%2),%2 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : "m"(kDeinterlaceUV), // %4 - "r"(src_tile_stride) // %5 - : "cc", "memory", "xmm0", "xmm1"); -} + #endif // HAS_DETILESPLITUVROW_SSSE3 #ifdef HAS_MERGEUVROW_AVX512BW @@ -5131,34 +3244,6 @@ void MergeUVRow_AVX2(const uint8_t* src_u, } #endif // HAS_MERGEUVROW_AVX2 -#ifdef HAS_MERGEUVROW_SSE2 -void MergeUVRow_SSE2(const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uv, - int width) { - asm volatile("sub %0,%1 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%1,1),%%xmm1 \n" - "lea 0x10(%0),%0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "punpcklbw %%xmm1,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm2 \n" - "movdqu %%xmm0,(%2) \n" - "movdqu %%xmm2,0x10(%2) \n" - "lea 0x20(%2),%2 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_u), // %0 - "+r"(src_v), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); -} -#endif // HAS_MERGEUVROW_SSE2 #ifdef HAS_MERGEUVROW_16_AVX2 void MergeUVRow_16_AVX2(const uint16_t* src_u, @@ -5319,34 +3404,7 @@ void DivideRow_16_AVX2(const uint16_t* src_y, // 16384 = 10 bits // 4096 = 12 bits // 256 = 16 bits -void Convert16To8Row_SSSE3(const uint16_t* src_y, - uint8_t* dst_y, - int scale, - int width) { - asm volatile( - "movd %3,%%xmm2 \n" - "punpcklwd %%xmm2,%%xmm2 \n" - "pshufd $0x0,%%xmm2,%%xmm2 \n" - // 32 pixels per loop. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "add $0x20,%0 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "pmulhuw %%xmm2,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "add $0x10,%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_y), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "r"(scale) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2"); -} #ifdef HAS_CONVERT16TO8ROW_AVX2 void Convert16To8Row_AVX2(const uint16_t* src_y, @@ -5415,36 +3473,7 @@ void Convert16To8Row_AVX512BW(const uint16_t* src_y, // 512 = 9 bits // 1024 = 10 bits // 4096 = 12 bits -void Convert8To16Row_SSE2(const uint8_t* src_y, - uint16_t* dst_y, - int scale, - int width) { - asm volatile( - "movd %3,%%xmm2 \n" - "punpcklwd %%xmm2,%%xmm2 \n" - "pshufd $0x0,%%xmm2,%%xmm2 \n" - // 32 pixels per loop. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm1 \n" - "add $0x10,%0 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "pmulhuw %%xmm2,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "add $0x20,%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_y), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "r"(scale) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2"); -} #ifdef HAS_CONVERT8TO16ROW_AVX2 void Convert8To16Row_AVX2(const uint8_t* src_y, @@ -5478,7 +3507,7 @@ void Convert8To16Row_AVX2(const uint8_t* src_y, } #endif // HAS_CONVERT8TO16ROW_AVX2 -#ifdef HAS_SPLITRGBROW_SSSE3 +#if 1 // Shuffle table for converting RGB to Planar. static const uvec8 kSplitRGBShuffle[9] = { {0u, 3u, 6u, 9u, 12u, 15u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, @@ -5500,59 +3529,10 @@ static const uvec8 kSplitRGBShuffle[9] = { {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 0u, 3u, 6u, 9u, 12u, 15u}}; -void SplitRGBRow_SSSE3(const uint8_t* src_rgb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width) { - asm volatile( - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "pshufb 0(%5), %%xmm0 \n" - "pshufb 16(%5), %%xmm1 \n" - "pshufb 32(%5), %%xmm2 \n" - "por %%xmm1,%%xmm0 \n" - "por %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "pshufb 48(%5),%%xmm0 \n" - "pshufb 64(%5),%%xmm1 \n" - "pshufb 80(%5), %%xmm2 \n" - "por %%xmm1,%%xmm0 \n" - "por %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "pshufb 96(%5), %%xmm0 \n" - "pshufb 112(%5), %%xmm1 \n" - "pshufb 128(%5), %%xmm2 \n" - "por %%xmm1,%%xmm0 \n" - "por %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%3) \n" - "lea 0x10(%3),%3 \n" - "lea 0x30(%0),%0 \n" - "sub $0x10,%4 \n" - "jg 1b \n" - : "+r"(src_rgb), // %0 - "+r"(dst_r), // %1 - "+r"(dst_g), // %2 - "+r"(dst_b), // %3 - "+r"(width) // %4 - : "r"(&kSplitRGBShuffle[0]) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2"); -} #endif // HAS_SPLITRGBROW_SSSE3 -#ifdef HAS_SPLITRGBROW_SSE41 +#if 1 // Shuffle table for converting RGB to Planar, SSE4.1. Note: these are used for // the AVX2 implementation as well. static const uvec8 kSplitRGBShuffleSSE41[5] = { @@ -5563,46 +3543,7 @@ static const uvec8 kSplitRGBShuffleSSE41[5] = { {0u, 0u, 128u, 0u, 0u, 128u, 0u, 0u, 128u, 0u, 0u, 128u, 0u, 0u, 128u, 0u}, }; -void SplitRGBRow_SSE41(const uint8_t* src_rgb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width) { - asm volatile( - "movdqa 48(%5), %%xmm0 \n" - "1: \n" - "movdqu (%0),%%xmm1 \n" - "movdqu 0x10(%0),%%xmm2 \n" - "movdqu 0x20(%0),%%xmm3 \n" - "lea 0x30(%0),%0 \n" - "movdqa %%xmm1, %%xmm4 \n" - "pblendvb %%xmm3, %%xmm1 \n" - "pblendvb %%xmm2, %%xmm3 \n" - "pblendvb %%xmm4, %%xmm2 \n" - "palignr $0xF, %%xmm0, %%xmm0 \n" - "pblendvb %%xmm2, %%xmm1 \n" - "pblendvb %%xmm3, %%xmm2 \n" - "pblendvb %%xmm4, %%xmm3 \n" - "palignr $0x1, %%xmm0, %%xmm0 \n" - "pshufb 0(%5), %%xmm1 \n" - "pshufb 16(%5), %%xmm2 \n" - "pshufb 32(%5), %%xmm3 \n" - "movdqu %%xmm1,(%1) \n" - "lea 0x10(%1),%1 \n" - "movdqu %%xmm2,(%2) \n" - "lea 0x10(%2),%2 \n" - "movdqu %%xmm3,(%3) \n" - "lea 0x10(%3),%3 \n" - "sub $0x10,%4 \n" - "jg 1b \n" - : "+r"(src_rgb), // %0 - "+r"(dst_r), // %1 - "+r"(dst_g), // %2 - "+r"(dst_b), // %3 - "+r"(width) // %4 - : "r"(&kSplitRGBShuffleSSE41[0]) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); -} + #endif // HAS_SPLITRGBROW_SSE41 #ifdef HAS_SPLITRGBROW_AVX2 @@ -5669,7 +3610,7 @@ void SplitRGBRow_AVX2(const uint8_t* src_rgb, } #endif // HAS_SPLITRGBROW_AVX2 -#ifdef HAS_MERGERGBROW_SSSE3 +#if 1 // Shuffle table for converting Planar to RGB. static const uvec8 kMergeRGBShuffle[9] = { {0u, 128u, 128u, 1u, 128u, 128u, 2u, 128u, 128u, 3u, 128u, 128u, 4u, 128u, @@ -5691,137 +3632,10 @@ static const uvec8 kMergeRGBShuffle[9] = { {10u, 128u, 128u, 11u, 128u, 128u, 12u, 128u, 128u, 13u, 128u, 128u, 14u, 128u, 128u, 15u}}; -void MergeRGBRow_SSSE3(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - uint8_t* dst_rgb, - int width) { - asm volatile( - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu (%1),%%xmm1 \n" - "movdqu (%2),%%xmm2 \n" - "pshufb (%5), %%xmm0 \n" - "pshufb 16(%5), %%xmm1 \n" - "pshufb 32(%5), %%xmm2 \n" - "por %%xmm1,%%xmm0 \n" - "por %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%3) \n" - "movdqu (%0),%%xmm0 \n" - "movdqu (%1),%%xmm1 \n" - "movdqu (%2),%%xmm2 \n" - "pshufb 48(%5), %%xmm0 \n" - "pshufb 64(%5), %%xmm1 \n" - "pshufb 80(%5), %%xmm2 \n" - "por %%xmm1,%%xmm0 \n" - "por %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,16(%3) \n" - - "movdqu (%0),%%xmm0 \n" - "movdqu (%1),%%xmm1 \n" - "movdqu (%2),%%xmm2 \n" - "pshufb 96(%5), %%xmm0 \n" - "pshufb 112(%5), %%xmm1 \n" - "pshufb 128(%5), %%xmm2 \n" - "por %%xmm1,%%xmm0 \n" - "por %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,32(%3) \n" - - "lea 0x10(%0),%0 \n" - "lea 0x10(%1),%1 \n" - "lea 0x10(%2),%2 \n" - "lea 0x30(%3),%3 \n" - "sub $0x10,%4 \n" - "jg 1b \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(dst_rgb), // %3 - "+r"(width) // %4 - : "r"(&kMergeRGBShuffle[0]) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2"); -} #endif // HAS_MERGERGBROW_SSSE3 -#ifdef HAS_MERGEARGBROW_SSE2 -void MergeARGBRow_SSE2(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - const uint8_t* src_a, - uint8_t* dst_argb, - int width) { - asm volatile( - "sub %0,%1 \n" - "sub %0,%2 \n" - "sub %0,%3 \n" - LABELALIGN - "1: \n" - - "movq (%0,%2),%%xmm0 \n" // B - "movq (%0),%%xmm1 \n" // R - "movq (%0,%1),%%xmm2 \n" // G - "punpcklbw %%xmm1,%%xmm0 \n" // BR - "movq (%0,%3),%%xmm1 \n" // A - "punpcklbw %%xmm1,%%xmm2 \n" // GA - "movdqa %%xmm0,%%xmm1 \n" // BR - "punpckhbw %%xmm2,%%xmm1 \n" // BGRA (hi) - "punpcklbw %%xmm2,%%xmm0 \n" // BGRA (lo) - "movdqu %%xmm0,(%4) \n" - "movdqu %%xmm1,16(%4) \n" - - "lea 8(%0),%0 \n" - "lea 32(%4),%4 \n" - "sub $0x8,%5 \n" - "jg 1b \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(src_a), // %3 - "+r"(dst_argb), // %4 - "+r"(width) // %5 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); -} -#endif - -#ifdef HAS_MERGEXRGBROW_SSE2 -void MergeXRGBRow_SSE2(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - uint8_t* dst_argb, - int width) { - asm volatile( - "1: \n" - - "movq (%2),%%xmm0 \n" // B - "movq (%0),%%xmm1 \n" // R - "movq (%1),%%xmm2 \n" // G - "punpcklbw %%xmm1,%%xmm0 \n" // BR - "pcmpeqd %%xmm1,%%xmm1 \n" // A(255) - "punpcklbw %%xmm1,%%xmm2 \n" // GA - "movdqa %%xmm0,%%xmm1 \n" // BR - "punpckhbw %%xmm2,%%xmm1 \n" // BGRA (hi) - "punpcklbw %%xmm2,%%xmm0 \n" // BGRA (lo) - "movdqu %%xmm0,(%3) \n" - "movdqu %%xmm1,16(%3) \n" - - "lea 8(%0),%0 \n" - "lea 8(%1),%1 \n" - "lea 8(%2),%2 \n" - "lea 32(%3),%3 \n" - "sub $0x8,%4 \n" - "jg 1b \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(dst_argb), // %3 - "+r"(width) // %4 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); -} -#endif // HAS_MERGEARGBROW_SSE2 #ifdef HAS_MERGEARGBROW_AVX2 void MergeARGBRow_AVX2(const uint8_t* src_r, @@ -5910,191 +3724,11 @@ void MergeXRGBRow_AVX2(const uint8_t* src_r, } #endif // HAS_MERGEARGBROW_AVX2 -#ifdef HAS_SPLITARGBROW_SSE2 -void SplitARGBRow_SSE2(const uint8_t* src_argb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - uint8_t* dst_a, - int width) { - asm volatile( - "sub %1,%2 \n" - "sub %1,%3 \n" - "sub %1,%4 \n" - LABELALIGN - "1: \n" - - "movdqu (%0),%%xmm0 \n" // 00-0F - "movdqu 16(%0),%%xmm1 \n" // 10-1F - "movdqa %%xmm0,%%xmm2 \n" - "punpcklqdq %%xmm1,%%xmm0 \n" // 00-07 10-17 - "punpckhqdq %%xmm1,%%xmm2 \n" // 08-0F 18-1F - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm2,%%xmm0 \n" // 08192A3B4C5D6E7F (lo) - "punpckhbw %%xmm2,%%xmm1 \n" // 08192A3B4C5D6E7F (hi) - "movdqa %%xmm0,%%xmm2 \n" - "punpcklqdq %%xmm1,%%xmm0 \n" // 08192A3B08192A3B - "punpckhqdq %%xmm1,%%xmm2 \n" // 4C5D6E7F4C5D6E7F - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm2,%%xmm0 \n" // 048C159D26AE37BF (lo) - "punpckhbw %%xmm2,%%xmm1 \n" // 048C159D26AE37BF (hi) - "movdqa %%xmm0,%%xmm2 \n" - "punpckldq %%xmm1,%%xmm0 \n" // 048C048C159D159D (BG) - "punpckhdq %%xmm1,%%xmm2 \n" // 26AE26AE37BF37BF (RA) - "movlps %%xmm0,(%1,%3) \n" // B - "movhps %%xmm0,(%1,%2) \n" // G - "movlps %%xmm2,(%1) \n" // R - "movhps %%xmm2,(%1,%4) \n" // A - - "lea 32(%0),%0 \n" - "lea 8(%1),%1 \n" - "sub $0x8,%5 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_r), // %1 - "+r"(dst_g), // %2 - "+r"(dst_b), // %3 - "+r"(dst_a), // %4 - "+rm"(width) // %5 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); -} -#endif - -#ifdef HAS_SPLITXRGBROW_SSE2 -void SplitXRGBRow_SSE2(const uint8_t* src_argb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width) { - asm volatile( - "1: \n" - - "movdqu (%0),%%xmm0 \n" // 00-0F - "movdqu 16(%0),%%xmm1 \n" // 10-1F - "movdqa %%xmm0,%%xmm2 \n" - "punpcklqdq %%xmm1,%%xmm0 \n" // 00-07 10-17 - "punpckhqdq %%xmm1,%%xmm2 \n" // 08-0F 18-1F - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm2,%%xmm0 \n" // 08192A3B4C5D6E7F (lo) - "punpckhbw %%xmm2,%%xmm1 \n" // 08192A3B4C5D6E7F (hi) - "movdqa %%xmm0,%%xmm2 \n" - "punpcklqdq %%xmm1,%%xmm0 \n" // 08192A3B08192A3B - "punpckhqdq %%xmm1,%%xmm2 \n" // 4C5D6E7F4C5D6E7F - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm2,%%xmm0 \n" // 048C159D26AE37BF (lo) - "punpckhbw %%xmm2,%%xmm1 \n" // 048C159D26AE37BF (hi) - "movdqa %%xmm0,%%xmm2 \n" - "punpckldq %%xmm1,%%xmm0 \n" // 048C048C159D159D (BG) - "punpckhdq %%xmm1,%%xmm2 \n" // 26AE26AE37BF37BF (RA) - "movlps %%xmm0,(%3) \n" // B - "movhps %%xmm0,(%2) \n" // G - "movlps %%xmm2,(%1) \n" // R - - "lea 32(%0),%0 \n" - "lea 8(%1),%1 \n" - "lea 8(%2),%2 \n" - "lea 8(%3),%3 \n" - "sub $0x8,%4 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_r), // %1 - "+r"(dst_g), // %2 - "+r"(dst_b), // %3 - "+rm"(width) // %4 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); -} -#endif static const uvec8 kShuffleMaskARGBSplit = {0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}; -#ifdef HAS_SPLITARGBROW_SSSE3 -void SplitARGBRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - uint8_t* dst_a, - int width) { - asm volatile( - "movdqa %6,%%xmm3 \n" - "sub %1,%2 \n" - "sub %1,%3 \n" - "sub %1,%4 \n" - LABELALIGN - "1: \n" - - "movdqu (%0),%%xmm0 \n" // 00-0F - "movdqu 16(%0),%%xmm1 \n" // 10-1F - "pshufb %%xmm3,%%xmm0 \n" // 048C159D26AE37BF (lo) - "pshufb %%xmm3,%%xmm1 \n" // 048C159D26AE37BF (hi) - "movdqa %%xmm0,%%xmm2 \n" - "punpckldq %%xmm1,%%xmm0 \n" // 048C048C159D159D (BG) - "punpckhdq %%xmm1,%%xmm2 \n" // 26AE26AE37BF37BF (RA) - "movlps %%xmm0,(%1,%3) \n" // B - "movhps %%xmm0,(%1,%2) \n" // G - "movlps %%xmm2,(%1) \n" // R - "movhps %%xmm2,(%1,%4) \n" // A - - "lea 32(%0),%0 \n" - "lea 8(%1),%1 \n" - "subl $0x8,%5 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_r), // %1 - "+r"(dst_g), // %2 - "+r"(dst_b), // %3 - "+r"(dst_a), // %4 -#if defined(__i386__) - "+m"(width) // %5 -#else - "+rm"(width) // %5 -#endif - : "m"(kShuffleMaskARGBSplit) // %6 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); -} -#endif - -#ifdef HAS_SPLITXRGBROW_SSSE3 -void SplitXRGBRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width) { - asm volatile( - "movdqa %5,%%xmm3 \n" - - LABELALIGN - "1: \n" - - "movdqu (%0),%%xmm0 \n" // 00-0F - "movdqu 16(%0),%%xmm1 \n" // 10-1F - "pshufb %%xmm3,%%xmm0 \n" // 048C159D26AE37BF (lo) - "pshufb %%xmm3,%%xmm1 \n" // 048C159D26AE37BF (hi) - "movdqa %%xmm0,%%xmm2 \n" - "punpckldq %%xmm1,%%xmm0 \n" // 048C048C159D159D (BG) - "punpckhdq %%xmm1,%%xmm2 \n" // 26AE26AE37BF37BF (RA) - "movlps %%xmm0,(%3) \n" // B - "movhps %%xmm0,(%2) \n" // G - "movlps %%xmm2,(%1) \n" // R - - "lea 32(%0),%0 \n" - "lea 8(%1),%1 \n" - "lea 8(%2),%2 \n" - "lea 8(%3),%3 \n" - "sub $0x8,%4 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_r), // %1 - "+r"(dst_g), // %2 - "+r"(dst_b), // %3 - "+r"(width) // %4 - : "m"(kShuffleMaskARGBSplit) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); -} -#endif #ifdef HAS_SPLITARGBROW_AVX2 static const ulvec32 kShuffleMaskARGBPermute = {0, 4, 1, 5, 2, 6, 3, 7}; @@ -6500,45 +4134,6 @@ void MergeXRGB16To8Row_AVX2(const uint16_t* src_r, } #endif -#ifdef HAS_COPYROW_SSE2 -void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "test $0xf,%0 \n" - "jne 2f \n" - "test $0xf,%1 \n" - "jne 2f \n" - - LABELALIGN - "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "movdqa %%xmm0,(%1) \n" - "movdqa %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "jmp 9f \n" - - LABELALIGN - "2: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 2b \n" - - LABELALIGN "9: \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "xmm0", "xmm1"); -} -#endif // HAS_COPYROW_SSE2 #ifdef HAS_COPYROW_AVX void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width) { @@ -6595,40 +4190,6 @@ void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width) { } #endif // HAS_COPYROW_ERMS -#ifdef HAS_ARGBCOPYALPHAROW_SSE2 -// width in pixels -void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "pcmpeqb %%xmm0,%%xmm0 \n" - "pslld $0x18,%%xmm0 \n" - "pcmpeqb %%xmm1,%%xmm1 \n" - "psrld $0x8,%%xmm1 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm2 \n" - "movdqu 0x10(%0),%%xmm3 \n" - "lea 0x20(%0),%0 \n" - "movdqu (%1),%%xmm4 \n" - "movdqu 0x10(%1),%%xmm5 \n" - "pand %%xmm0,%%xmm2 \n" - "pand %%xmm0,%%xmm3 \n" - "pand %%xmm1,%%xmm4 \n" - "pand %%xmm1,%%xmm5 \n" - "por %%xmm4,%%xmm2 \n" - "por %%xmm5,%%xmm3 \n" - "movdqu %%xmm2,(%1) \n" - "movdqu %%xmm3,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} -#endif // HAS_ARGBCOPYALPHAROW_SSE2 #ifdef HAS_ARGBCOPYALPHAROW_AVX2 // width in pixels @@ -6658,31 +4219,6 @@ void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { } #endif // HAS_ARGBCOPYALPHAROW_AVX2 -#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2 -// width in pixels -void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb, - uint8_t* dst_a, - int width) { - asm volatile( - "1: \n" - "movdqu (%0), %%xmm0 \n" - "movdqu 0x10(%0), %%xmm1 \n" - "lea 0x20(%0), %0 \n" - "psrld $0x18, %%xmm0 \n" - "psrld $0x18, %%xmm1 \n" - "packssdw %%xmm1, %%xmm0 \n" - "packuswb %%xmm0, %%xmm0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1), %1 \n" - "sub $0x8, %2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_a), // %1 - "+rm"(width) // %2 - : - : "memory", "cc", "xmm0", "xmm1"); -} -#endif // HAS_ARGBEXTRACTALPHAROW_SSE2 #ifdef HAS_ARGBEXTRACTALPHAROW_AVX2 static const uvec8 kShuffleAlphaShort_AVX2 = { @@ -6725,42 +4261,6 @@ void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb, } #endif // HAS_ARGBEXTRACTALPHAROW_AVX2 -#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 -// width in pixels -void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "pcmpeqb %%xmm0,%%xmm0 \n" - "pslld $0x18,%%xmm0 \n" - "pcmpeqb %%xmm1,%%xmm1 \n" - "psrld $0x8,%%xmm1 \n" - - LABELALIGN - "1: \n" - "movq (%0),%%xmm2 \n" - "lea 0x8(%0),%0 \n" - "punpcklbw %%xmm2,%%xmm2 \n" - "punpckhwd %%xmm2,%%xmm3 \n" - "punpcklwd %%xmm2,%%xmm2 \n" - "movdqu (%1),%%xmm4 \n" - "movdqu 0x10(%1),%%xmm5 \n" - "pand %%xmm0,%%xmm2 \n" - "pand %%xmm0,%%xmm3 \n" - "pand %%xmm1,%%xmm4 \n" - "pand %%xmm1,%%xmm5 \n" - "por %%xmm4,%%xmm2 \n" - "por %%xmm5,%%xmm3 \n" - "movdqu %%xmm2,(%1) \n" - "movdqu %%xmm3,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} -#endif // HAS_ARGBCOPYYTOALPHAROW_SSE2 #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 // width in pixels @@ -6822,228 +4322,6 @@ void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width) { } #endif // HAS_SETROW_X86 -#ifdef HAS_YUY2TOYROW_SSE2 -void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm5"); -} - -void YUY2ToNVUVRow_SSE2(const uint8_t* src_yuy2, - int stride_yuy2, - uint8_t* dst_uv, - int width) { - asm volatile( - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x00(%0,%3,1),%%xmm2 \n" - "movdqu 0x10(%0,%3,1),%%xmm3 \n" - "lea 0x20(%0),%0 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_uv), // %1 - "+r"(width) // %2 - : "r"((ptrdiff_t)(stride_yuy2)) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); -} - -void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2, - int stride_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x00(%0,%4,1),%%xmm2 \n" - "movdqu 0x10(%0,%4,1),%%xmm3 \n" - "lea 0x20(%0),%0 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0,(%1) \n" - "movq %%xmm1,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : "r"((ptrdiff_t)(stride_yuy2)) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); -} - -void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0,(%1) \n" - "movq %%xmm1,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm5"); -} - -void UYVYToYRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { - asm volatile( - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "xmm0", "xmm1"); -} - -void UYVYToUVRow_SSE2(const uint8_t* src_uyvy, - int stride_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x00(%0,%4,1),%%xmm2 \n" - "movdqu 0x10(%0,%4,1),%%xmm3 \n" - "lea 0x20(%0),%0 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0,(%1) \n" - "movq %%xmm1,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : "r"((ptrdiff_t)(stride_uyvy)) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); -} - -void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0,(%1) \n" - "movq %%xmm1,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm5"); -} -#endif // HAS_YUY2TOYROW_SSE2 #ifdef HAS_YUY2TOYROW_AVX2 void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { @@ -7279,144 +4557,15 @@ void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy, } #endif // HAS_YUY2TOYROW_AVX2 -#ifdef HAS_ARGBBLENDROW_SSSE3 +#if 1 // Shuffle table for isolating alpha. static const uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80}; // Blend 8 pixels at a time -void ARGBBlendRow_SSSE3(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - asm volatile( - "pcmpeqb %%xmm7,%%xmm7 \n" - "psrlw $0xf,%%xmm7 \n" - "pcmpeqb %%xmm6,%%xmm6 \n" - "psrlw $0x8,%%xmm6 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "psllw $0x8,%%xmm5 \n" - "pcmpeqb %%xmm4,%%xmm4 \n" - "pslld $0x18,%%xmm4 \n" - "sub $0x4,%3 \n" - "jl 49f \n" - // 4 pixel loop. - LABELALIGN - "40: \n" - "movdqu (%0),%%xmm3 \n" - "lea 0x10(%0),%0 \n" - "movdqa %%xmm3,%%xmm0 \n" - "pxor %%xmm4,%%xmm3 \n" - "movdqu (%1),%%xmm2 \n" - "pshufb %4,%%xmm3 \n" - "pand %%xmm6,%%xmm2 \n" - "paddw %%xmm7,%%xmm3 \n" - "pmullw %%xmm3,%%xmm2 \n" - "movdqu (%1),%%xmm1 \n" - "lea 0x10(%1),%1 \n" - "psrlw $0x8,%%xmm1 \n" - "por %%xmm4,%%xmm0 \n" - "pmullw %%xmm3,%%xmm1 \n" - "psrlw $0x8,%%xmm2 \n" - "paddusb %%xmm2,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%3 \n" - "jge 40b \n" - - "49: \n" - "add $0x3,%3 \n" - "jl 99f \n" - - // 1 pixel loop. - "91: \n" - "movd (%0),%%xmm3 \n" - "lea 0x4(%0),%0 \n" - "movdqa %%xmm3,%%xmm0 \n" - "pxor %%xmm4,%%xmm3 \n" - "movd (%1),%%xmm2 \n" - "pshufb %4,%%xmm3 \n" - "pand %%xmm6,%%xmm2 \n" - "paddw %%xmm7,%%xmm3 \n" - "pmullw %%xmm3,%%xmm2 \n" - "movd (%1),%%xmm1 \n" - "lea 0x4(%1),%1 \n" - "psrlw $0x8,%%xmm1 \n" - "por %%xmm4,%%xmm0 \n" - "pmullw %%xmm3,%%xmm1 \n" - "psrlw $0x8,%%xmm2 \n" - "paddusb %%xmm2,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movd %%xmm0,(%2) \n" - "lea 0x4(%2),%2 \n" - "sub $0x1,%3 \n" - "jge 91b \n" - "99: \n" - : "+r"(src_argb), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : "m"(kShuffleAlpha) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} #endif // HAS_ARGBBLENDROW_SSSE3 -#ifdef HAS_BLENDPLANEROW_SSSE3 -// Blend 8 pixels at a time. -// unsigned version of math -// =((A2*C2)+(B2*(255-C2))+255)/256 -// signed version of math -// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256 -void BlendPlaneRow_SSSE3(const uint8_t* src0, - const uint8_t* src1, - const uint8_t* alpha, - uint8_t* dst, - int width) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psllw $0x8,%%xmm5 \n" - "mov $0x80808080,%%eax \n" - "movd %%eax,%%xmm6 \n" - "pshufd $0x0,%%xmm6,%%xmm6 \n" - "mov $0x807f807f,%%eax \n" - "movd %%eax,%%xmm7 \n" - "pshufd $0x0,%%xmm7,%%xmm7 \n" - "sub %2,%0 \n" - "sub %2,%1 \n" - "sub %2,%3 \n" - - // 8 pixel loop. - LABELALIGN - "1: \n" - "movq (%2),%%xmm0 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "pxor %%xmm5,%%xmm0 \n" - "movq (%0,%2,1),%%xmm1 \n" - "movq (%1,%2,1),%%xmm2 \n" - "punpcklbw %%xmm2,%%xmm1 \n" - "psubb %%xmm6,%%xmm1 \n" - "pmaddubsw %%xmm1,%%xmm0 \n" - "paddw %%xmm7,%%xmm0 \n" - "psrlw $0x8,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,(%3,%2,1) \n" - "lea 0x8(%2),%2 \n" - "sub $0x8,%4 \n" - "jg 1b \n" - : "+r"(src0), // %0 - "+r"(src1), // %1 - "+r"(alpha), // %2 - "+r"(dst), // %3 - "+rm"(width) // %4 - ::"memory", - "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7"); -} -#endif // HAS_BLENDPLANEROW_SSSE3 #ifdef HAS_BLENDPLANEROW_AVX2 // Blend 32 pixels at a time. @@ -7479,57 +4628,14 @@ void BlendPlaneRow_AVX2(const uint8_t* src0, } #endif // HAS_BLENDPLANEROW_AVX2 -#ifdef HAS_ARGBATTENUATEROW_SSSE3 +#if 1 // Shuffle table duplicating alpha. static const vec8 kAttenuateShuffle = {6, -128, 6, -128, 6, -128, -128, -128, 14, -128, 14, -128, 14, -128, -128, -128}; // Attenuate 4 pixels at a time. -void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_argb, - int width) { - asm volatile( - "movdqa %3,%%xmm4 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pslld $0x18,%%xmm5 \n" - "pxor %%xmm6,%%xmm6 \n" - "pcmpeqb %%xmm7,%%xmm7 \n" - "punpcklbw %%xmm6,%%xmm7 \n" - "sub %0,%1 \n" - // 4 pixel loop. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm6 \n" - "movdqa %%xmm6,%%xmm0 \n" - "movdqa %%xmm6,%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "punpckhbw %%xmm5,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - "pshufb %%xmm4,%%xmm2 \n" // a,a,a,0 - "pshufb %%xmm4,%%xmm3 \n" - "pmullw %%xmm2,%%xmm0 \n" // rgb * alpha - "pmullw %%xmm3,%%xmm1 \n" - "paddw %%xmm7,%%xmm0 \n" // + 255 - "paddw %%xmm7,%%xmm1 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "pand %%xmm5,%%xmm6 \n" - "por %%xmm6,%%xmm0 \n" - "movdqu %%xmm0,(%0,%1) \n" - "lea 0x10(%0),%0 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kAttenuateShuffle) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} #endif // HAS_ARGBATTENUATEROW_SSSE3 #ifdef HAS_ARGBATTENUATEROW_AVX2 @@ -7584,50 +4690,6 @@ void ARGBAttenuateRow_AVX2(const uint8_t* src_argb, } #endif // HAS_ARGBATTENUATEROW_AVX2 -#ifdef HAS_ARGBUNATTENUATEROW_SSE2 -// Unattenuate 4 pixels at a time. -void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb, - uint8_t* dst_argb, - int width) { - uintptr_t alpha; - asm volatile( - // 4 pixel loop. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movzb 0x03(%0),%3 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "movd 0x00(%4,%3,4),%%xmm2 \n" - "movzb 0x07(%0),%3 \n" - "movd 0x00(%4,%3,4),%%xmm3 \n" - "pshuflw $0x40,%%xmm2,%%xmm2 \n" - "pshuflw $0x40,%%xmm3,%%xmm3 \n" - "movlhps %%xmm3,%%xmm2 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "movdqu (%0),%%xmm1 \n" - "movzb 0x0b(%0),%3 \n" - "punpckhbw %%xmm1,%%xmm1 \n" - "movd 0x00(%4,%3,4),%%xmm2 \n" - "movzb 0x0f(%0),%3 \n" - "movd 0x00(%4,%3,4),%%xmm3 \n" - "pshuflw $0x40,%%xmm2,%%xmm2 \n" - "pshuflw $0x40,%%xmm3,%%xmm3 \n" - "movlhps %%xmm3,%%xmm2 \n" - "pmulhuw %%xmm2,%%xmm1 \n" - "lea 0x10(%0),%0 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width), // %2 - "=&r"(alpha) // %3 - : "r"(fixed_invtbl8) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} -#endif // HAS_ARGBUNATTENUATEROW_SSE2 #ifdef HAS_ARGBUNATTENUATEROW_AVX2 // Shuffle table duplicating alpha. @@ -7697,56 +4759,8 @@ void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb, } #endif // HAS_ARGBUNATTENUATEROW_AVX2 -#ifdef HAS_ARGBGRAYROW_SSSE3 -// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels -void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - asm volatile( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" - // 8 pixel loop. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "psubb %%xmm5,%%xmm0 \n" - "psubb %%xmm5,%%xmm1 \n" - "movdqu %%xmm4,%%xmm6 \n" - "pmaddubsw %%xmm0,%%xmm6 \n" - "movdqu %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm1,%%xmm0 \n" - "phaddw %%xmm0,%%xmm6 \n" - "paddw %%xmm5,%%xmm6 \n" - "psrlw $0x8,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movdqu (%0),%%xmm2 \n" - "movdqu 0x10(%0),%%xmm3 \n" - "lea 0x20(%0),%0 \n" - "psrld $0x18,%%xmm2 \n" - "psrld $0x18,%%xmm3 \n" - "packuswb %%xmm3,%%xmm2 \n" - "packuswb %%xmm2,%%xmm2 \n" - "movdqa %%xmm6,%%xmm3 \n" - "punpcklbw %%xmm6,%%xmm6 \n" - "punpcklbw %%xmm2,%%xmm3 \n" - "movdqa %%xmm6,%%xmm1 \n" - "punpcklwd %%xmm3,%%xmm6 \n" - "punpckhwd %%xmm3,%%xmm1 \n" - "movdqu %%xmm6,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kARGBToYJ), // %3 - "m"(kSub128) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} -#endif // HAS_ARGBGRAYROW_SSSE3 - -#ifdef HAS_ARGBSEPIAROW_SSSE3 +#if 1 // b = (r * 35 + g * 68 + b * 17) >> 7 // g = (r * 45 + g * 88 + b * 22) >> 7 // r = (r * 50 + g * 98 + b * 24) >> 7 @@ -7761,250 +4775,12 @@ static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0}; // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. -void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) { - asm volatile( - "movdqa %2,%%xmm2 \n" - "movdqa %3,%%xmm3 \n" - "movdqa %4,%%xmm4 \n" - // 8 pixel loop. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm6 \n" - "pmaddubsw %%xmm2,%%xmm0 \n" - "pmaddubsw %%xmm2,%%xmm6 \n" - "phaddw %%xmm6,%%xmm0 \n" - "psrlw $0x7,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movdqu (%0),%%xmm5 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm5 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "phaddw %%xmm1,%%xmm5 \n" - "psrlw $0x7,%%xmm5 \n" - "packuswb %%xmm5,%%xmm5 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "movdqu (%0),%%xmm5 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm5 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "phaddw %%xmm1,%%xmm5 \n" - "psrlw $0x7,%%xmm5 \n" - "packuswb %%xmm5,%%xmm5 \n" - "movdqu (%0),%%xmm6 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "psrld $0x18,%%xmm6 \n" - "psrld $0x18,%%xmm1 \n" - "packuswb %%xmm1,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "punpcklbw %%xmm6,%%xmm5 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm5,%%xmm0 \n" - "punpckhwd %%xmm5,%%xmm1 \n" - "movdqu %%xmm0,(%0) \n" - "movdqu %%xmm1,0x10(%0) \n" - "lea 0x20(%0),%0 \n" - "sub $0x8,%1 \n" - "jg 1b \n" - : "+r"(dst_argb), // %0 - "+r"(width) // %1 - : "m"(kARGBToSepiaB), // %2 - "m"(kARGBToSepiaG), // %3 - "m"(kARGBToSepiaR) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} #endif // HAS_ARGBSEPIAROW_SSSE3 -#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3 -// Tranform 8 ARGB pixels (32 bytes) with color matrix. -// Same as Sepia except matrix is provided. -void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_argb, - const int8_t* matrix_argb, - int width) { - asm volatile( - "movdqu (%3),%%xmm5 \n" - "pshufd $0x00,%%xmm5,%%xmm2 \n" - "pshufd $0x55,%%xmm5,%%xmm3 \n" - "pshufd $0xaa,%%xmm5,%%xmm4 \n" - "pshufd $0xff,%%xmm5,%%xmm5 \n" - // 8 pixel loop. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm7 \n" - "pmaddubsw %%xmm2,%%xmm0 \n" - "pmaddubsw %%xmm2,%%xmm7 \n" - "movdqu (%0),%%xmm6 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "phaddsw %%xmm7,%%xmm0 \n" - "phaddsw %%xmm1,%%xmm6 \n" - "psraw $0x6,%%xmm0 \n" - "psraw $0x6,%%xmm6 \n" - "packuswb %%xmm0,%%xmm0 \n" - "packuswb %%xmm6,%%xmm6 \n" - "punpcklbw %%xmm6,%%xmm0 \n" - "movdqu (%0),%%xmm1 \n" - "movdqu 0x10(%0),%%xmm7 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm7 \n" - "phaddsw %%xmm7,%%xmm1 \n" - "movdqu (%0),%%xmm6 \n" - "movdqu 0x10(%0),%%xmm7 \n" - "pmaddubsw %%xmm5,%%xmm6 \n" - "pmaddubsw %%xmm5,%%xmm7 \n" - "phaddsw %%xmm7,%%xmm6 \n" - "psraw $0x6,%%xmm1 \n" - "psraw $0x6,%%xmm6 \n" - "packuswb %%xmm1,%%xmm1 \n" - "packuswb %%xmm6,%%xmm6 \n" - "punpcklbw %%xmm6,%%xmm1 \n" - "movdqa %%xmm0,%%xmm6 \n" - "punpcklwd %%xmm1,%%xmm0 \n" - "punpckhwd %%xmm1,%%xmm6 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm6,0x10(%1) \n" - "lea 0x20(%0),%0 \n" - "lea 0x20(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(matrix_argb) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif // HAS_ARGBCOLORMATRIXROW_SSSE3 -#ifdef HAS_ARGBQUANTIZEROW_SSE2 -// Quantize 4 ARGB pixels (16 bytes). -void ARGBQuantizeRow_SSE2(uint8_t* dst_argb, - int scale, - int interval_size, - int interval_offset, - int width) { - asm volatile( - "movd %2,%%xmm2 \n" - "movd %3,%%xmm3 \n" - "movd %4,%%xmm4 \n" - "pshuflw $0x40,%%xmm2,%%xmm2 \n" - "pshufd $0x44,%%xmm2,%%xmm2 \n" - "pshuflw $0x40,%%xmm3,%%xmm3 \n" - "pshufd $0x44,%%xmm3,%%xmm3 \n" - "pshuflw $0x40,%%xmm4,%%xmm4 \n" - "pshufd $0x44,%%xmm4,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" - "pcmpeqb %%xmm6,%%xmm6 \n" - "pslld $0x18,%%xmm6 \n" - // 4 pixel loop. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "movdqu (%0),%%xmm1 \n" - "punpckhbw %%xmm5,%%xmm1 \n" - "pmulhuw %%xmm2,%%xmm1 \n" - "pmullw %%xmm3,%%xmm0 \n" - "movdqu (%0),%%xmm7 \n" - "pmullw %%xmm3,%%xmm1 \n" - "pand %%xmm6,%%xmm7 \n" - "paddw %%xmm4,%%xmm0 \n" - "paddw %%xmm4,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "por %%xmm7,%%xmm0 \n" - "movdqu %%xmm0,(%0) \n" - "lea 0x10(%0),%0 \n" - "sub $0x4,%1 \n" - "jg 1b \n" - : "+r"(dst_argb), // %0 - "+r"(width) // %1 - : "r"(scale), // %2 - "r"(interval_size), // %3 - "r"(interval_offset) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif // HAS_ARGBQUANTIZEROW_SSE2 - -#ifdef HAS_ARGBSHADEROW_SSE2 -// Shade 4 pixels at a time by specified value. -void ARGBShadeRow_SSE2(const uint8_t* src_argb, - uint8_t* dst_argb, - int width, - uint32_t value) { - asm volatile( - "movd %3,%%xmm2 \n" - "punpcklbw %%xmm2,%%xmm2 \n" - "punpcklqdq %%xmm2,%%xmm2 \n" - - // 4 pixel loop. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm1 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "pmulhuw %%xmm2,%%xmm1 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(value) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2"); -} -#endif // HAS_ARGBSHADEROW_SSE2 - -#ifdef HAS_ARGBMULTIPLYROW_SSE2 -// Multiply 2 rows of ARGB pixels together, 4 pixels at a time. -void ARGBMultiplyRow_SSE2(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - asm volatile("pxor %%xmm5,%%xmm5 \n" - - // 4 pixel loop. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movdqu (%1),%%xmm2 \n" - "lea 0x10(%1),%1 \n" - "movdqu %%xmm0,%%xmm1 \n" - "movdqu %%xmm2,%%xmm3 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "punpckhbw %%xmm5,%%xmm3 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "pmulhuw %%xmm3,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%3 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); -} -#endif // HAS_ARGBMULTIPLYROW_SSE2 #ifdef HAS_ARGBMULTIPLYROW_AVX2 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. @@ -8042,33 +4818,6 @@ void ARGBMultiplyRow_AVX2(const uint8_t* src_argb, } #endif // HAS_ARGBMULTIPLYROW_AVX2 -#ifdef HAS_ARGBADDROW_SSE2 -// Add 2 rows of ARGB pixels together, 4 pixels at a time. -void ARGBAddRow_SSE2(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - asm volatile( - // 4 pixel loop. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movdqu (%1),%%xmm1 \n" - "lea 0x10(%1),%1 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%3 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1"); -} -#endif // HAS_ARGBADDROW_SSE2 #ifdef HAS_ARGBADDROW_AVX2 // Add 2 rows of ARGB pixels together, 4 pixels at a time. @@ -8098,33 +4847,6 @@ void ARGBAddRow_AVX2(const uint8_t* src_argb, } #endif // HAS_ARGBADDROW_AVX2 -#ifdef HAS_ARGBSUBTRACTROW_SSE2 -// Subtract 2 rows of ARGB pixels, 4 pixels at a time. -void ARGBSubtractRow_SSE2(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - asm volatile( - // 4 pixel loop. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movdqu (%1),%%xmm1 \n" - "lea 0x10(%1),%1 \n" - "psubusb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%3 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1"); -} -#endif // HAS_ARGBSUBTRACTROW_SSE2 #ifdef HAS_ARGBSUBTRACTROW_AVX2 // Subtract 2 rows of ARGB pixels, 8 pixels at a time. @@ -8154,626 +4876,19 @@ void ARGBSubtractRow_AVX2(const uint8_t* src_argb, } #endif // HAS_ARGBSUBTRACTROW_AVX2 -#ifdef HAS_SOBELXROW_SSE2 -// SobelX as a matrix is -// -1 0 1 -// -2 0 2 -// -1 0 1 -void SobelXRow_SSE2(const uint8_t* src_y0, - const uint8_t* src_y1, - const uint8_t* src_y2, - uint8_t* dst_sobelx, - int width) { - asm volatile( - "sub %0,%1 \n" - "sub %0,%2 \n" - "sub %0,%3 \n" - "pxor %%xmm5,%%xmm5 \n" - // 8 pixel loop. - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" - "movq 0x2(%0),%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm1 \n" - "psubw %%xmm1,%%xmm0 \n" - "movq 0x00(%0,%1,1),%%xmm1 \n" - "movq 0x02(%0,%1,1),%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "psubw %%xmm2,%%xmm1 \n" - "movq 0x00(%0,%2,1),%%xmm2 \n" - "movq 0x02(%0,%2,1),%%xmm3 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm3 \n" - "psubw %%xmm3,%%xmm2 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm1,%%xmm0 \n" - "paddw %%xmm1,%%xmm0 \n" - "pxor %%xmm1,%%xmm1 \n" - "psubw %%xmm0,%%xmm1 \n" - "pmaxsw %%xmm1,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,0x00(%0,%3,1) \n" - "lea 0x8(%0),%0 \n" - "sub $0x8,%4 \n" - "jg 1b \n" - : "+r"(src_y0), // %0 - "+r"(src_y1), // %1 - "+r"(src_y2), // %2 - "+r"(dst_sobelx), // %3 - "+r"(width) // %4 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); -} -#endif // HAS_SOBELXROW_SSE2 -#ifdef HAS_SOBELYROW_SSE2 -// SobelY as a matrix is -// -1 -2 -1 -// 0 0 0 -// 1 2 1 -void SobelYRow_SSE2(const uint8_t* src_y0, - const uint8_t* src_y1, - uint8_t* dst_sobely, - int width) { - asm volatile( - "sub %0,%1 \n" - "sub %0,%2 \n" - "pxor %%xmm5,%%xmm5 \n" - // 8 pixel loop. - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" - "movq 0x00(%0,%1,1),%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm1 \n" - "psubw %%xmm1,%%xmm0 \n" - "movq 0x1(%0),%%xmm1 \n" - "movq 0x01(%0,%1,1),%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "psubw %%xmm2,%%xmm1 \n" - "movq 0x2(%0),%%xmm2 \n" - "movq 0x02(%0,%1,1),%%xmm3 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm3 \n" - "psubw %%xmm3,%%xmm2 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm1,%%xmm0 \n" - "paddw %%xmm1,%%xmm0 \n" - "pxor %%xmm1,%%xmm1 \n" - "psubw %%xmm0,%%xmm1 \n" - "pmaxsw %%xmm1,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,0x00(%0,%2,1) \n" - "lea 0x8(%0),%0 \n" - "sub $0x8,%3 \n" - "jg 1b \n" - : "+r"(src_y0), // %0 - "+r"(src_y1), // %1 - "+r"(dst_sobely), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); -} -#endif // HAS_SOBELYROW_SSE2 -#ifdef HAS_SOBELROW_SSE2 -// Adds Sobel X and Sobel Y and stores Sobel into ARGB. -// A = 255 -// R = Sobel -// G = Sobel -// B = Sobel -void SobelRow_SSE2(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_argb, - int width) { - asm volatile( - "sub %0,%1 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pslld $0x18,%%xmm5 \n" - // 8 pixel loop. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%1,1),%%xmm1 \n" - "lea 0x10(%0),%0 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "punpcklbw %%xmm0,%%xmm2 \n" - "punpckhbw %%xmm0,%%xmm0 \n" - "movdqa %%xmm2,%%xmm1 \n" - "punpcklwd %%xmm2,%%xmm1 \n" - "punpckhwd %%xmm2,%%xmm2 \n" - "por %%xmm5,%%xmm1 \n" - "por %%xmm5,%%xmm2 \n" - "movdqa %%xmm0,%%xmm3 \n" - "punpcklwd %%xmm0,%%xmm3 \n" - "punpckhwd %%xmm0,%%xmm0 \n" - "por %%xmm5,%%xmm3 \n" - "por %%xmm5,%%xmm0 \n" - "movdqu %%xmm1,(%2) \n" - "movdqu %%xmm2,0x10(%2) \n" - "movdqu %%xmm3,0x20(%2) \n" - "movdqu %%xmm0,0x30(%2) \n" - "lea 0x40(%2),%2 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_sobelx), // %0 - "+r"(src_sobely), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); -} -#endif // HAS_SOBELROW_SSE2 -#ifdef HAS_SOBELTOPLANEROW_SSE2 -// Adds Sobel X and Sobel Y and stores Sobel into a plane. -void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_y, - int width) { - asm volatile( - "sub %0,%1 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pslld $0x18,%%xmm5 \n" - // 8 pixel loop. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%1,1),%%xmm1 \n" - "lea 0x10(%0),%0 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_sobelx), // %0 - "+r"(src_sobely), // %1 - "+r"(dst_y), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1"); -} -#endif // HAS_SOBELTOPLANEROW_SSE2 - -#ifdef HAS_SOBELXYROW_SSE2 -// Mixes Sobel X, Sobel Y and Sobel into ARGB. -// A = 255 -// R = Sobel X -// G = Sobel -// B = Sobel Y -void SobelXYRow_SSE2(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_argb, - int width) { - asm volatile( - "sub %0,%1 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - - // 8 pixel loop. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%1,1),%%xmm1 \n" - "lea 0x10(%0),%0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "paddusb %%xmm1,%%xmm2 \n" - "movdqa %%xmm0,%%xmm3 \n" - "punpcklbw %%xmm5,%%xmm3 \n" - "punpckhbw %%xmm5,%%xmm0 \n" - "movdqa %%xmm1,%%xmm4 \n" - "punpcklbw %%xmm2,%%xmm4 \n" - "punpckhbw %%xmm2,%%xmm1 \n" - "movdqa %%xmm4,%%xmm6 \n" - "punpcklwd %%xmm3,%%xmm6 \n" - "punpckhwd %%xmm3,%%xmm4 \n" - "movdqa %%xmm1,%%xmm7 \n" - "punpcklwd %%xmm0,%%xmm7 \n" - "punpckhwd %%xmm0,%%xmm1 \n" - "movdqu %%xmm6,(%2) \n" - "movdqu %%xmm4,0x10(%2) \n" - "movdqu %%xmm7,0x20(%2) \n" - "movdqu %%xmm1,0x30(%2) \n" - "lea 0x40(%2),%2 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_sobelx), // %0 - "+r"(src_sobely), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif // HAS_SOBELXYROW_SSE2 - -#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2 -// Creates a table of cumulative sums where each value is a sum of all values -// above and to the left of the value, inclusive of the value. -void ComputeCumulativeSumRow_SSE2(const uint8_t* row, - int32_t* cumsum, - const int32_t* previous_cumsum, - int width) { - asm volatile( - "pxor %%xmm0,%%xmm0 \n" - "pxor %%xmm1,%%xmm1 \n" - "sub $0x4,%3 \n" - "jl 49f \n" - "test $0xf,%1 \n" - "jne 49f \n" - - // 4 pixel loop. - LABELALIGN - "40: \n" - "movdqu (%0),%%xmm2 \n" - "lea 0x10(%0),%0 \n" - "movdqa %%xmm2,%%xmm4 \n" - "punpcklbw %%xmm1,%%xmm2 \n" - "movdqa %%xmm2,%%xmm3 \n" - "punpcklwd %%xmm1,%%xmm2 \n" - "punpckhwd %%xmm1,%%xmm3 \n" - "punpckhbw %%xmm1,%%xmm4 \n" - "movdqa %%xmm4,%%xmm5 \n" - "punpcklwd %%xmm1,%%xmm4 \n" - "punpckhwd %%xmm1,%%xmm5 \n" - "paddd %%xmm2,%%xmm0 \n" - "movdqu (%2),%%xmm2 \n" - "paddd %%xmm0,%%xmm2 \n" - "paddd %%xmm3,%%xmm0 \n" - "movdqu 0x10(%2),%%xmm3 \n" - "paddd %%xmm0,%%xmm3 \n" - "paddd %%xmm4,%%xmm0 \n" - "movdqu 0x20(%2),%%xmm4 \n" - "paddd %%xmm0,%%xmm4 \n" - "paddd %%xmm5,%%xmm0 \n" - "movdqu 0x30(%2),%%xmm5 \n" - "lea 0x40(%2),%2 \n" - "paddd %%xmm0,%%xmm5 \n" - "movdqu %%xmm2,(%1) \n" - "movdqu %%xmm3,0x10(%1) \n" - "movdqu %%xmm4,0x20(%1) \n" - "movdqu %%xmm5,0x30(%1) \n" - "lea 0x40(%1),%1 \n" - "sub $0x4,%3 \n" - "jge 40b \n" - - "49: \n" - "add $0x3,%3 \n" - "jl 19f \n" - - // 1 pixel loop. - LABELALIGN - "10: \n" - "movd (%0),%%xmm2 \n" - "lea 0x4(%0),%0 \n" - "punpcklbw %%xmm1,%%xmm2 \n" - "punpcklwd %%xmm1,%%xmm2 \n" - "paddd %%xmm2,%%xmm0 \n" - "movdqu (%2),%%xmm2 \n" - "lea 0x10(%2),%2 \n" - "paddd %%xmm0,%%xmm2 \n" - "movdqu %%xmm2,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x1,%3 \n" - "jge 10b \n" - - "19: \n" - : "+r"(row), // %0 - "+r"(cumsum), // %1 - "+r"(previous_cumsum), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} -#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2 - -#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 -void CumulativeSumToAverageRow_SSE2(const int32_t* topleft, - const int32_t* botleft, - int width, - int area, - uint8_t* dst, - int count) { - asm volatile( - "movd %5,%%xmm5 \n" - "cvtdq2ps %%xmm5,%%xmm5 \n" - "rcpss %%xmm5,%%xmm4 \n" - "pshufd $0x0,%%xmm4,%%xmm4 \n" - "sub $0x4,%3 \n" - "jl 49f \n" - "cmpl $0x80,%5 \n" - "ja 40f \n" - - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "pcmpeqb %%xmm6,%%xmm6 \n" - "psrld $0x10,%%xmm6 \n" - "cvtdq2ps %%xmm6,%%xmm6 \n" - "addps %%xmm6,%%xmm5 \n" - "mulps %%xmm4,%%xmm5 \n" - "cvtps2dq %%xmm5,%%xmm5 \n" - "packssdw %%xmm5,%%xmm5 \n" - - // 4 pixel small loop. - LABELALIGN - "4: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm3 \n" - "psubd 0x00(%0,%4,4),%%xmm0 \n" - "psubd 0x10(%0,%4,4),%%xmm1 \n" - "psubd 0x20(%0,%4,4),%%xmm2 \n" - "psubd 0x30(%0,%4,4),%%xmm3 \n" - "lea 0x40(%0),%0 \n" - "psubd (%1),%%xmm0 \n" - "psubd 0x10(%1),%%xmm1 \n" - "psubd 0x20(%1),%%xmm2 \n" - "psubd 0x30(%1),%%xmm3 \n" - "paddd 0x00(%1,%4,4),%%xmm0 \n" - "paddd 0x10(%1,%4,4),%%xmm1 \n" - "paddd 0x20(%1,%4,4),%%xmm2 \n" - "paddd 0x30(%1,%4,4),%%xmm3 \n" - "lea 0x40(%1),%1 \n" - "packssdw %%xmm1,%%xmm0 \n" - "packssdw %%xmm3,%%xmm2 \n" - "pmulhuw %%xmm5,%%xmm0 \n" - "pmulhuw %%xmm5,%%xmm2 \n" - "packuswb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%3 \n" - "jge 4b \n" - "jmp 49f \n" - - // 4 pixel loop - LABELALIGN - "40: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm3 \n" - "psubd 0x00(%0,%4,4),%%xmm0 \n" - "psubd 0x10(%0,%4,4),%%xmm1 \n" - "psubd 0x20(%0,%4,4),%%xmm2 \n" - "psubd 0x30(%0,%4,4),%%xmm3 \n" - "lea 0x40(%0),%0 \n" - "psubd (%1),%%xmm0 \n" - "psubd 0x10(%1),%%xmm1 \n" - "psubd 0x20(%1),%%xmm2 \n" - "psubd 0x30(%1),%%xmm3 \n" - "paddd 0x00(%1,%4,4),%%xmm0 \n" - "paddd 0x10(%1,%4,4),%%xmm1 \n" - "paddd 0x20(%1,%4,4),%%xmm2 \n" - "paddd 0x30(%1,%4,4),%%xmm3 \n" - "lea 0x40(%1),%1 \n" - "cvtdq2ps %%xmm0,%%xmm0 \n" - "cvtdq2ps %%xmm1,%%xmm1 \n" - "mulps %%xmm4,%%xmm0 \n" - "mulps %%xmm4,%%xmm1 \n" - "cvtdq2ps %%xmm2,%%xmm2 \n" - "cvtdq2ps %%xmm3,%%xmm3 \n" - "mulps %%xmm4,%%xmm2 \n" - "mulps %%xmm4,%%xmm3 \n" - "cvtps2dq %%xmm0,%%xmm0 \n" - "cvtps2dq %%xmm1,%%xmm1 \n" - "cvtps2dq %%xmm2,%%xmm2 \n" - "cvtps2dq %%xmm3,%%xmm3 \n" - "packssdw %%xmm1,%%xmm0 \n" - "packssdw %%xmm3,%%xmm2 \n" - "packuswb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%3 \n" - "jge 40b \n" - - "49: \n" - "add $0x3,%3 \n" - "jl 19f \n" - - // 1 pixel loop - LABELALIGN - "10: \n" - "movdqu (%0),%%xmm0 \n" - "psubd 0x00(%0,%4,4),%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "psubd (%1),%%xmm0 \n" - "paddd 0x00(%1,%4,4),%%xmm0 \n" - "lea 0x10(%1),%1 \n" - "cvtdq2ps %%xmm0,%%xmm0 \n" - "mulps %%xmm4,%%xmm0 \n" - "cvtps2dq %%xmm0,%%xmm0 \n" - "packssdw %%xmm0,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movd %%xmm0,(%2) \n" - "lea 0x4(%2),%2 \n" - "sub $0x1,%3 \n" - "jge 10b \n" - "19: \n" - : "+r"(topleft), // %0 - "+r"(botleft), // %1 - "+r"(dst), // %2 - "+rm"(count) // %3 - : "r"((ptrdiff_t)(width)), // %4 - "rm"(area) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} -#endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 - -#ifdef HAS_ARGBAFFINEROW_SSE2 +#if 1 // Copy ARGB pixels from source image with slope to a row of destination. LIBYUV_API -void ARGBAffineRow_SSE2(const uint8_t* src_argb, - int src_argb_stride, - uint8_t* dst_argb, - const float* src_dudv, - int width) { - ptrdiff_t src_argb_stride_temp = src_argb_stride; - intptr_t temp; - asm volatile( - "movq (%3),%%xmm2 \n" - "movq 0x08(%3),%%xmm7 \n" - "shl $0x10,%1 \n" - "add $0x4,%1 \n" - "movd %1,%%xmm5 \n" - "sub $0x4,%4 \n" - "jl 49f \n" - "pshufd $0x44,%%xmm7,%%xmm7 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "movdqa %%xmm2,%%xmm0 \n" - "addps %%xmm7,%%xmm0 \n" - "movlhps %%xmm0,%%xmm2 \n" - "movdqa %%xmm7,%%xmm4 \n" - "addps %%xmm4,%%xmm4 \n" - "movdqa %%xmm2,%%xmm3 \n" - "addps %%xmm4,%%xmm3 \n" - "addps %%xmm4,%%xmm4 \n" - - // 4 pixel loop - LABELALIGN - "40: \n" - "cvttps2dq %%xmm2,%%xmm0 \n" // x,y float->int first 2 - "cvttps2dq %%xmm3,%%xmm1 \n" // x,y float->int next 2 - "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts - "pmaddwd %%xmm5,%%xmm0 \n" // off = x*4 + y*stride - "movd %%xmm0,%k1 \n" - "pshufd $0x39,%%xmm0,%%xmm0 \n" - "movd %%xmm0,%k5 \n" - "pshufd $0x39,%%xmm0,%%xmm0 \n" - "movd 0x00(%0,%1,1),%%xmm1 \n" - "movd 0x00(%0,%5,1),%%xmm6 \n" - "punpckldq %%xmm6,%%xmm1 \n" - "addps %%xmm4,%%xmm2 \n" - "movq %%xmm1,(%2) \n" - "movd %%xmm0,%k1 \n" - "pshufd $0x39,%%xmm0,%%xmm0 \n" - "movd %%xmm0,%k5 \n" - "movd 0x00(%0,%1,1),%%xmm0 \n" - "movd 0x00(%0,%5,1),%%xmm6 \n" - "punpckldq %%xmm6,%%xmm0 \n" - "addps %%xmm4,%%xmm3 \n" - "movq %%xmm0,0x08(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%4 \n" - "jge 40b \n" - - "49: \n" - "add $0x3,%4 \n" - "jl 19f \n" - - // 1 pixel loop - LABELALIGN - "10: \n" - "cvttps2dq %%xmm2,%%xmm0 \n" - "packssdw %%xmm0,%%xmm0 \n" - "pmaddwd %%xmm5,%%xmm0 \n" - "addps %%xmm7,%%xmm2 \n" - "movd %%xmm0,%k1 \n" - "movd 0x00(%0,%1,1),%%xmm0 \n" - "movd %%xmm0,(%2) \n" - "lea 0x04(%2),%2 \n" - "sub $0x1,%4 \n" - "jge 10b \n" - "19: \n" - : "+r"(src_argb), // %0 - "+r"(src_argb_stride_temp), // %1 - "+r"(dst_argb), // %2 - "+r"(src_dudv), // %3 - "+rm"(width), // %4 - "=&r"(temp) // %5 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} #endif // HAS_ARGBAFFINEROW_SSE2 -#ifdef HAS_INTERPOLATEROW_SSSE3 -// Bilinear filter 16x2 -> 16x1 -void InterpolateRow_SSSE3(uint8_t* dst_ptr, - const uint8_t* src_ptr, - ptrdiff_t src_stride, - int width, - int source_y_fraction) { - asm volatile( - "sub %1,%0 \n" - "cmp $0x0,%3 \n" - "je 100f \n" - "cmp $0x80,%3 \n" - "je 50f \n" - - "movd %3,%%xmm0 \n" - "neg %3 \n" - "add $0x100,%3 \n" - "movd %3,%%xmm5 \n" - "punpcklbw %%xmm0,%%xmm5 \n" - "punpcklwd %%xmm5,%%xmm5 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "mov $0x80808080,%%eax \n" - "movd %%eax,%%xmm4 \n" - "pshufd $0x0,%%xmm4,%%xmm4 \n" - - // General purpose row blend. - LABELALIGN - "1: \n" - "movdqu (%1),%%xmm0 \n" - "movdqu 0x00(%1,%4,1),%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm2,%%xmm0 \n" - "punpckhbw %%xmm2,%%xmm1 \n" - "psubb %%xmm4,%%xmm0 \n" - "psubb %%xmm4,%%xmm1 \n" - "movdqa %%xmm5,%%xmm2 \n" - "movdqa %%xmm5,%%xmm3 \n" - "pmaddubsw %%xmm0,%%xmm2 \n" - "pmaddubsw %%xmm1,%%xmm3 \n" - "paddw %%xmm4,%%xmm2 \n" - "paddw %%xmm4,%%xmm3 \n" - "psrlw $0x8,%%xmm2 \n" - "psrlw $0x8,%%xmm3 \n" - "packuswb %%xmm3,%%xmm2 \n" - "movdqu %%xmm2,0x00(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "jmp 99f \n" - - // Blend 50 / 50. - LABELALIGN - "50: \n" - "movdqu (%1),%%xmm0 \n" - "movdqu 0x00(%1,%4,1),%%xmm1 \n" - "pavgb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,0x00(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 50b \n" - "jmp 99f \n" - - // Blend 100 / 0 - Copy row unchanged. - LABELALIGN - "100: \n" - "movdqu (%1),%%xmm0 \n" - "movdqu %%xmm0,0x00(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 100b \n" - - "99: \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+rm"(width), // %2 - "+r"(source_y_fraction) // %3 - : "r"(src_stride) // %4 - : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} -#endif // HAS_INTERPOLATEROW_SSSE3 #ifdef HAS_INTERPOLATEROW_AVX2 // Bilinear filter 32x2 -> 32x1 @@ -8853,33 +4968,6 @@ void InterpolateRow_AVX2(uint8_t* dst_ptr, } #endif // HAS_INTERPOLATEROW_AVX2 -#ifdef HAS_ARGBSHUFFLEROW_SSSE3 -// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. -void ARGBShuffleRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_argb, - const uint8_t* shuffler, - int width) { - asm volatile("movdqu (%3),%%xmm5 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pshufb %%xmm5,%%xmm0 \n" - "pshufb %%xmm5,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(shuffler) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm5"); -} -#endif // HAS_ARGBSHUFFLEROW_SSSE3 #ifdef HAS_ARGBSHUFFLEROW_AVX2 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. @@ -8910,73 +4998,7 @@ void ARGBShuffleRow_AVX2(const uint8_t* src_argb, } #endif // HAS_ARGBSHUFFLEROW_AVX2 -#ifdef HAS_I422TOYUY2ROW_SSE2 -void I422ToYUY2Row_SSE2(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_yuy2, - int width) { - asm volatile("sub %1,%2 \n" - LABELALIGN - "1: \n" - "movq (%1),%%xmm2 \n" - "movq 0x00(%1,%2,1),%%xmm1 \n" - "add $0x8,%1 \n" - "punpcklbw %%xmm1,%%xmm2 \n" - "movdqu (%0),%%xmm0 \n" - "add $0x10,%0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm2,%%xmm0 \n" - "punpckhbw %%xmm2,%%xmm1 \n" - "movdqu %%xmm0,(%3) \n" - "movdqu %%xmm1,0x10(%3) \n" - "lea 0x20(%3),%3 \n" - "sub $0x10,%4 \n" - "jg 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_yuy2), // %3 - "+rm"(width) // %4 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); -} -#endif // HAS_I422TOYUY2ROW_SSE2 - -#ifdef HAS_I422TOUYVYROW_SSE2 -void I422ToUYVYRow_SSE2(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uyvy, - int width) { - asm volatile("sub %1,%2 \n" - - LABELALIGN - "1: \n" - "movq (%1),%%xmm2 \n" - "movq 0x00(%1,%2,1),%%xmm1 \n" - "add $0x8,%1 \n" - "punpcklbw %%xmm1,%%xmm2 \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm2,%%xmm1 \n" - "add $0x10,%0 \n" - "punpcklbw %%xmm0,%%xmm1 \n" - "punpckhbw %%xmm0,%%xmm2 \n" - "movdqu %%xmm1,(%3) \n" - "movdqu %%xmm2,0x10(%3) \n" - "lea 0x20(%3),%3 \n" - "sub $0x10,%4 \n" - "jg 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_uyvy), // %3 - "+rm"(width) // %4 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); -} -#endif // HAS_I422TOUYVYROW_SSE2 #ifdef HAS_I422TOYUY2ROW_AVX2 void I422ToYUY2Row_AVX2(const uint8_t* src_y, @@ -9052,60 +5074,6 @@ void I422ToUYVYRow_AVX2(const uint8_t* src_y, } #endif // HAS_I422TOUYVYROW_AVX2 -#ifdef HAS_ARGBPOLYNOMIALROW_SSE2 -void ARGBPolynomialRow_SSE2(const uint8_t* src_argb, - uint8_t* dst_argb, - const float* poly, - int width) { - asm volatile("pxor %%xmm3,%%xmm3 \n" - - // 2 pixel loop. - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" - "lea 0x8(%0),%0 \n" - "punpcklbw %%xmm3,%%xmm0 \n" - "movdqa %%xmm0,%%xmm4 \n" - "punpcklwd %%xmm3,%%xmm0 \n" - "punpckhwd %%xmm3,%%xmm4 \n" - "cvtdq2ps %%xmm0,%%xmm0 \n" - "cvtdq2ps %%xmm4,%%xmm4 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm4,%%xmm5 \n" - "mulps 0x10(%3),%%xmm0 \n" - "mulps 0x10(%3),%%xmm4 \n" - "addps (%3),%%xmm0 \n" - "addps (%3),%%xmm4 \n" - "movdqa %%xmm1,%%xmm2 \n" - "movdqa %%xmm5,%%xmm6 \n" - "mulps %%xmm1,%%xmm2 \n" - "mulps %%xmm5,%%xmm6 \n" - "mulps %%xmm2,%%xmm1 \n" - "mulps %%xmm6,%%xmm5 \n" - "mulps 0x20(%3),%%xmm2 \n" - "mulps 0x20(%3),%%xmm6 \n" - "mulps 0x30(%3),%%xmm1 \n" - "mulps 0x30(%3),%%xmm5 \n" - "addps %%xmm2,%%xmm0 \n" - "addps %%xmm6,%%xmm4 \n" - "addps %%xmm1,%%xmm0 \n" - "addps %%xmm5,%%xmm4 \n" - "cvttps2dq %%xmm0,%%xmm0 \n" - "cvttps2dq %%xmm4,%%xmm4 \n" - "packuswb %%xmm4,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x2,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(poly) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", - "xmm6"); -} -#endif // HAS_ARGBPOLYNOMIALROW_SSE2 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2 void ARGBPolynomialRow_AVX2(const uint8_t* src_argb, @@ -9148,43 +5116,9 @@ void ARGBPolynomialRow_AVX2(const uint8_t* src_argb, } #endif // HAS_ARGBPOLYNOMIALROW_AVX2 -#ifdef HAS_HALFFLOATROW_SSE2 +#if 1 static float kScaleBias = 1.9259299444e-34f; -void HalfFloatRow_SSE2(const uint16_t* src, - uint16_t* dst, - float scale, - int width) { - scale *= kScaleBias; - asm volatile( - "movd %3,%%xmm4 \n" - "pshufd $0x0,%%xmm4,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" - "sub %0,%1 \n" - // 16 pixel loop. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm2 \n" // 8 shorts - "add $0x10,%0 \n" - "movdqa %%xmm2,%%xmm3 \n" - "punpcklwd %%xmm5,%%xmm2 \n" // 8 ints in xmm2/1 - "cvtdq2ps %%xmm2,%%xmm2 \n" // 8 floats - "punpckhwd %%xmm5,%%xmm3 \n" - "cvtdq2ps %%xmm3,%%xmm3 \n" - "mulps %%xmm4,%%xmm2 \n" - "mulps %%xmm4,%%xmm3 \n" - "psrld $0xd,%%xmm2 \n" - "psrld $0xd,%%xmm3 \n" - "packssdw %%xmm3,%%xmm2 \n" - "movdqu %%xmm2,-0x10(%0,%1,1) \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "m"(scale) // %3 - : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5"); -} #endif // HAS_HALFFLOATROW_SSE2 #ifdef HAS_HALFFLOATROW_AVX2 @@ -9357,106 +5291,6 @@ void RGBColorTableRow_X86(uint8_t* dst_argb, } #endif // HAS_RGBCOLORTABLEROW_X86 -#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3 -// Tranform RGB pixels with luma table. -void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb, - uint8_t* dst_argb, - int width, - const uint8_t* luma, - uint32_t lumacoeff) { - uintptr_t pixel_temp; - uintptr_t table_temp; - asm volatile( - "movd %6,%%xmm3 \n" - "pshufd $0x0,%%xmm3,%%xmm3 \n" - "pcmpeqb %%xmm4,%%xmm4 \n" - "psllw $0x8,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" - - // 4 pixel loop. - LABELALIGN - "1: \n" - "movdqu (%2),%%xmm0 \n" - "pmaddubsw %%xmm3,%%xmm0 \n" - "phaddw %%xmm0,%%xmm0 \n" - "pand %%xmm4,%%xmm0 \n" - "punpcklwd %%xmm5,%%xmm0 \n" - "movd %%xmm0,%k1 \n" // 32 bit offset - "add %5,%1 \n" - "pshufd $0x39,%%xmm0,%%xmm0 \n" - - "movzb (%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,(%3) \n" - "movzb 0x1(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0x1(%3) \n" - "movzb 0x2(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0x2(%3) \n" - "movzb 0x3(%2),%0 \n" - "mov %b0,0x3(%3) \n" - - "movd %%xmm0,%k1 \n" // 32 bit offset - "add %5,%1 \n" - "pshufd $0x39,%%xmm0,%%xmm0 \n" - - "movzb 0x4(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0x4(%3) \n" - "movzb 0x5(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0x5(%3) \n" - "movzb 0x6(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0x6(%3) \n" - "movzb 0x7(%2),%0 \n" - "mov %b0,0x7(%3) \n" - - "movd %%xmm0,%k1 \n" // 32 bit offset - "add %5,%1 \n" - "pshufd $0x39,%%xmm0,%%xmm0 \n" - - "movzb 0x8(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0x8(%3) \n" - "movzb 0x9(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0x9(%3) \n" - "movzb 0xa(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0xa(%3) \n" - "movzb 0xb(%2),%0 \n" - "mov %b0,0xb(%3) \n" - - "movd %%xmm0,%k1 \n" // 32 bit offset - "add %5,%1 \n" - - "movzb 0xc(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0xc(%3) \n" - "movzb 0xd(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0xd(%3) \n" - "movzb 0xe(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0xe(%3) \n" - "movzb 0xf(%2),%0 \n" - "mov %b0,0xf(%3) \n" - "lea 0x10(%2),%2 \n" - "lea 0x10(%3),%3 \n" - "sub $0x4,%4 \n" - "jg 1b \n" - : "=&d"(pixel_temp), // %0 - "=&a"(table_temp), // %1 - "+r"(src_argb), // %2 - "+r"(dst_argb), // %3 - "+rm"(width) // %4 - : "r"(luma), // %5 - "rm"(lumacoeff) // %6 - : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5"); -} -#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 static const uvec8 kYUV24Shuffle[3] = { {8, 9, 0, 8, 9, 1, 10, 11, 2, 10, 11, 3, 12, 13, 4, 12}, @@ -9466,40 +5300,7 @@ static const uvec8 kYUV24Shuffle[3] = { // Convert biplanar NV21 to packed YUV24 // NV21 has VU in memory for chroma. // YUV24 is VUY in memory -void NV21ToYUV24Row_SSSE3(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_yuv24, - int width) { - asm volatile( - "sub %0,%1 \n" - "movdqa (%4),%%xmm4 \n" // 3 shuffler constants - "movdqa 16(%4),%%xmm5 \n" - "movdqa 32(%4),%%xmm6 \n" - "1: \n" - "movdqu (%0),%%xmm2 \n" // load 16 Y values - "movdqu (%0,%1),%%xmm3 \n" // load 8 VU values - "lea 16(%0),%0 \n" - "movdqa %%xmm2,%%xmm0 \n" - "movdqa %%xmm2,%%xmm1 \n" - "shufps $0x44,%%xmm3,%%xmm0 \n" // Y 0..7, UV 0..3 - "shufps $0x99,%%xmm3,%%xmm1 \n" // Y 4..11, UV 2..5 - "shufps $0xee,%%xmm3,%%xmm2 \n" // Y 8..15, UV 4..7 - "pshufb %%xmm4, %%xmm0 \n" // weave into YUV24 - "pshufb %%xmm5, %%xmm1 \n" - "pshufb %%xmm6, %%xmm2 \n" - "movdqu %%xmm0,(%2) \n" - "movdqu %%xmm1,16(%2) \n" - "movdqu %%xmm2,32(%2) \n" - "lea 48(%2),%2 \n" - "sub $16,%3 \n" // 16 pixels per loop - "jg 1b \n" - : "+r"(src_y), // %0 - "+r"(src_vu), // %1 - "+r"(dst_yuv24), // %2 - "+r"(width) // %3 - : "r"(&kYUV24Shuffle[0]) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} + // Convert biplanar NV21 to packed YUV24 // NV21 has VU in memory for chroma. @@ -9587,34 +5388,14 @@ void NV21ToYUV24Row_AVX512(const uint8_t* src_y, #endif // HAS_NV21ToYUV24ROW_AVX512 -#ifdef HAS_SWAPUVROW_SSSE3 +#if 1 // Shuffle table for reversing the bytes. static const uvec8 kShuffleUVToVU = {1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u, 9u, 8u, 11u, 10u, 13u, 12u, 15u, 14u}; // Convert UV plane of NV12 to VU of NV21. -void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width) { - asm volatile("movdqu %3,%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pshufb %%xmm5,%%xmm0 \n" - "pshufb %%xmm5,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_vu), // %1 - "+r"(width) // %2 - : "m"(kShuffleUVToVU) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm5"); -} #endif // HAS_SWAPUVROW_SSSE3 #ifdef HAS_SWAPUVROW_AVX2 @@ -9642,50 +5423,7 @@ void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width) { } #endif // HAS_SWAPUVROW_AVX2 -void HalfMergeUVRow_SSSE3(const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_uv, - int width) { - asm volatile( - "pcmpeqb %%xmm4,%%xmm4 \n" // 0x0101 - "pabsb %%xmm4,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" // load 16 U values - "movdqu (%1),%%xmm1 \n" // load 16 V values - "movdqu 0(%0,%4,1),%%xmm2 \n" // 16 from next row - "movdqu 0(%1,%5,1),%%xmm3 \n" - "lea 0x10(%0),%0 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" // half size - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "lea 0x10(%1),%1 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm3,%%xmm1 \n" - "psrlw $0x1,%%xmm0 \n" - "psrlw $0x1,%%xmm1 \n" - "pavgw %%xmm5,%%xmm0 \n" - "pavgw %%xmm5,%%xmm1 \n" - "packuswb %%xmm0,%%xmm0 \n" - "packuswb %%xmm1,%%xmm1 \n" - "punpcklbw %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" // store 8 UV pixels - "lea 0x10(%2),%2 \n" - "sub $0x10,%3 \n" // 16 src pixels per loop - "jg 1b \n" - : "+r"(src_u), // %0 - "+r"(src_v), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 - : "r"((ptrdiff_t)(src_stride_u)), // %4 - "r"((ptrdiff_t)(src_stride_v)) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} void HalfMergeUVRow_AVX2(const uint8_t* src_u, int src_stride_u, @@ -9733,25 +5471,7 @@ void HalfMergeUVRow_AVX2(const uint8_t* src_u, : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } -void ClampFloatToZero_SSE2(const float* src_x, float* dst_y, int width) { - asm volatile( - "pxor %%xmm1,%%xmm1 \n" - LABELALIGN - "1: \n" - "movd (%0),%%xmm0 \n" // load float - "maxss %%xmm1, %%xmm0 \n" // clamp to zero - "add 4, %0 \n" - "movd %%xmm0, (%1) \n" // store float - "add 4, %1 \n" - "sub $0x4,%2 \n" // 1 float per loop - "jg 1b \n" - : "+r"(src_x), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "xmm0", "xmm1"); -} #ifdef HAS_CONVERT16TO8ROW_AVX2 void Convert8To8Row_AVX2(const uint8_t* src_y, diff --git a/source/row_rvv.cc b/source/row_rvv.cc index 93bc431bc..270bae4e3 100644 --- a/source/row_rvv.cc +++ b/source/row_rvv.cc @@ -1419,17 +1419,9 @@ static void RGBToYMatrixRow_RVV(const uint8_t* src_rgb, } #endif -#ifdef HAS_RGB24TOYJROW_RVV -#endif -#ifdef HAS_RAWTOYJROW_RVV -#endif -#ifdef HAS_RGB24TOYROW_RVV -#endif -#ifdef HAS_RAWTOYROW_RVV -#endif // Blend src_argb over src_argb1 and store to dst_argb. // dst_argb may be src_argb or src_argb1. diff --git a/source/row_win.cc b/source/row_win.cc index 77070d031..9d200c847 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -102,21 +102,9 @@ extern "C" { _mm_storeu_si128((__m128i*)(dst_argb + 16), xmm1); \ dst_argb += 32; -#if defined(HAS_I422TOARGBROW_SSSE3) -#endif -#if defined(HAS_I422ALPHATOARGBROW_SSSE3) -#endif - -#if defined(HAS_I444TOARGBROW_SSSE3) - -#endif - -#if defined(HAS_I444ALPHATOARGBROW_SSSE3) - -#endif #if defined(HAS_ARGBTOYROW_AVX2) diff --git a/source/scale.cc b/source/scale.cc index 9c1e9b264..7d1948ddb 100644 --- a/source/scale.cc +++ b/source/scale.cc @@ -81,7 +81,7 @@ static void ScalePlaneDown2(int src_width, : ScaleRowDown2Box_SME; } #endif -#if defined(HAS_SCALEROWDOWN2_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { ScaleRowDown2 = filtering == kFilterNone @@ -185,7 +185,7 @@ static void ScalePlaneDown2_16(int src_width, : ScaleRowDown2Box_16_SME; } #endif -#if defined(HAS_SCALEROWDOWN2_16_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) { ScaleRowDown2 = filtering == kFilterNone @@ -284,7 +284,7 @@ static void ScalePlaneDown4(int src_width, } } #endif -#if defined(HAS_SCALEROWDOWN4_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { ScaleRowDown4 = filtering ? ScaleRowDown4Box_Any_SSSE3 : ScaleRowDown4_Any_SSSE3; @@ -353,7 +353,7 @@ static void ScalePlaneDown4_16(int src_width, filtering ? ScaleRowDown4Box_16_NEON : ScaleRowDown4_16_NEON; } #endif -#if defined(HAS_SCALEROWDOWN4_16_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { ScaleRowDown4 = filtering ? ScaleRowDown4Box_16_SSE2 : ScaleRowDown4_16_SSE2; @@ -442,23 +442,15 @@ static void ScalePlaneDown34(int src_width, } } #endif -#if defined(HAS_SCALEROWDOWN34_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { if (dst_width % 24 == 0) { if (!filtering) { - ScaleRowDown34_0 = ScaleRowDown34_SSSE3; - ScaleRowDown34_1 = ScaleRowDown34_SSSE3; } else { - ScaleRowDown34_0 = ScaleRowDown34_0_Box_SSSE3; - ScaleRowDown34_1 = ScaleRowDown34_1_Box_SSSE3; } } else { if (!filtering) { - ScaleRowDown34_0 = ScaleRowDown34_Any_SSSE3; - ScaleRowDown34_1 = ScaleRowDown34_Any_SSSE3; } else { - ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_SSSE3; - ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_SSSE3; } } } @@ -534,14 +526,10 @@ static void ScalePlaneDown34_16(int src_width, } } #endif -#if defined(HAS_SCALEROWDOWN34_16_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) { if (!filtering) { - ScaleRowDown34_0 = ScaleRowDown34_16_SSSE3; - ScaleRowDown34_1 = ScaleRowDown34_16_SSSE3; } else { - ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_SSSE3; - ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_SSSE3; } } #endif @@ -630,22 +618,14 @@ static void ScalePlaneDown38(int src_width, } } #endif -#if defined(HAS_SCALEROWDOWN38_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { if (!filtering) { - ScaleRowDown38_3 = ScaleRowDown38_Any_SSSE3; - ScaleRowDown38_2 = ScaleRowDown38_Any_SSSE3; } else { - ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_SSSE3; - ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_SSSE3; } if (dst_width % 12 == 0 && !filtering) { - ScaleRowDown38_3 = ScaleRowDown38_SSSE3; - ScaleRowDown38_2 = ScaleRowDown38_SSSE3; } if (dst_width % 6 == 0 && filtering) { - ScaleRowDown38_3 = ScaleRowDown38_3_Box_SSSE3; - ScaleRowDown38_2 = ScaleRowDown38_2_Box_SSSE3; } } #endif @@ -740,14 +720,10 @@ static void ScalePlaneDown38_16(int src_width, } } #endif -#if defined(HAS_SCALEROWDOWN38_16_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) { if (!filtering) { - ScaleRowDown38_3 = ScaleRowDown38_16_SSSE3; - ScaleRowDown38_2 = ScaleRowDown38_16_SSSE3; } else { - ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_SSSE3; - ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_SSSE3; } } #endif @@ -926,11 +902,9 @@ static int ScalePlaneBox(int src_width, : ((dx != 0x10000) ? ScaleAddCols1_C : ScaleAddCols0_C); void (*ScaleAddRow)(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) = ScaleAddRow_C; -#if defined(HAS_SCALEADDROW_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - ScaleAddRow = ScaleAddRow_Any_SSE2; if (IS_ALIGNED(src_width, 16)) { - ScaleAddRow = ScaleAddRow_SSE2; } } #endif @@ -1015,9 +989,8 @@ static int ScalePlaneBox_16(int src_width, void (*ScaleAddRow)(const uint16_t* src_ptr, uint32_t* dst_ptr, int src_width) = ScaleAddRow_16_C; -#if defined(HAS_SCALEADDROW_16_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(src_width, 16)) { - ScaleAddRow = ScaleAddRow_16_SSE2; } #endif @@ -1076,11 +1049,9 @@ static int ScalePlaneBilinearDown(int src_width, &dx, &dy); src_width = Abs(src_width); -#if defined(HAS_INTERPOLATEROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - InterpolateRow = InterpolateRow_Any_SSSE3; if (IS_ALIGNED(src_width, 16)) { - InterpolateRow = InterpolateRow_SSSE3; } } #endif @@ -1119,9 +1090,8 @@ static int ScalePlaneBilinearDown(int src_width, } #endif -#if defined(HAS_SCALEFILTERCOLS_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { - ScaleFilterCols = ScaleFilterCols_SSSE3; } #endif #if defined(HAS_SCALEFILTERCOLS_NEON) @@ -1196,19 +1166,15 @@ static int ScalePlaneBilinearDown_16(int src_width, &dx, &dy); src_width = Abs(src_width); -#if defined(HAS_INTERPOLATEROW_16_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - InterpolateRow = InterpolateRow_16_Any_SSE2; if (IS_ALIGNED(src_width, 16)) { - InterpolateRow = InterpolateRow_16_SSE2; } } #endif -#if defined(HAS_INTERPOLATEROW_16_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - InterpolateRow = InterpolateRow_16_Any_SSSE3; if (IS_ALIGNED(src_width, 16)) { - InterpolateRow = InterpolateRow_16_SSSE3; } } #endif @@ -1234,9 +1200,8 @@ static int ScalePlaneBilinearDown_16(int src_width, } #endif -#if defined(HAS_SCALEFILTERCOLS_16_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { - ScaleFilterCols = ScaleFilterCols_16_SSSE3; } #endif if (y > max_y) { @@ -1290,11 +1255,9 @@ static int ScalePlaneBilinearUp(int src_width, &dx, &dy); src_width = Abs(src_width); -#if defined(HAS_INTERPOLATEROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - InterpolateRow = InterpolateRow_Any_SSSE3; if (IS_ALIGNED(dst_width, 16)) { - InterpolateRow = InterpolateRow_SSSE3; } } #endif @@ -1328,9 +1291,8 @@ static int ScalePlaneBilinearUp(int src_width, if (filtering && src_width >= 32768) { ScaleFilterCols = ScaleFilterCols64_C; } -#if defined(HAS_SCALEFILTERCOLS_SSSE3) +#if 0 if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { - ScaleFilterCols = ScaleFilterCols_SSSE3; } #endif #if defined(HAS_SCALEFILTERCOLS_NEON) @@ -1351,9 +1313,8 @@ static int ScalePlaneBilinearUp(int src_width, #endif if (!filtering && src_width * 2 == dst_width && x < 0x8000) { ScaleFilterCols = ScaleColsUp2_C; -#if defined(HAS_SCALECOLS_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { - ScaleFilterCols = ScaleColsUp2_SSE2; } #endif } @@ -1439,15 +1400,13 @@ static void ScalePlaneUp2_Linear(int src_width, // This function can only scale up by 2 times horizontally. assert(src_width == ((dst_width + 1) / 2)); -#ifdef HAS_SCALEROWUP2_LINEAR_SSE2 +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - ScaleRowUp = ScaleRowUp2_Linear_Any_SSE2; } #endif -#ifdef HAS_SCALEROWUP2_LINEAR_SSSE3 +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ScaleRowUp = ScaleRowUp2_Linear_Any_SSSE3; } #endif @@ -1504,15 +1463,13 @@ static void ScalePlaneUp2_Bilinear(int src_width, assert(src_width == ((dst_width + 1) / 2)); assert(src_height == ((dst_height + 1) / 2)); -#ifdef HAS_SCALEROWUP2_BILINEAR_SSE2 +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - Scale2RowUp = ScaleRowUp2_Bilinear_Any_SSE2; } #endif -#ifdef HAS_SCALEROWUP2_BILINEAR_SSSE3 +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - Scale2RowUp = ScaleRowUp2_Bilinear_Any_SSSE3; } #endif @@ -1570,9 +1527,8 @@ static void ScalePlaneUp2_12_Linear(int src_width, // This function can only scale up by 2 times horizontally. assert(src_width == ((dst_width + 1) / 2)); -#ifdef HAS_SCALEROWUP2_LINEAR_12_SSSE3 +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ScaleRowUp = ScaleRowUp2_Linear_12_Any_SSSE3; } #endif @@ -1625,9 +1581,8 @@ static void ScalePlaneUp2_12_Bilinear(int src_width, assert(src_width == ((dst_width + 1) / 2)); assert(src_height == ((dst_height + 1) / 2)); -#ifdef HAS_SCALEROWUP2_BILINEAR_12_SSSE3 +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_SSSE3; } #endif @@ -1673,9 +1628,8 @@ static void ScalePlaneUp2_16_Linear(int src_width, // This function can only scale up by 2 times horizontally. assert(src_width == ((dst_width + 1) / 2)); -#ifdef HAS_SCALEROWUP2_LINEAR_16_SSE2 +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - ScaleRowUp = ScaleRowUp2_Linear_16_Any_SSE2; } #endif @@ -1723,9 +1677,8 @@ static void ScalePlaneUp2_16_Bilinear(int src_width, assert(src_width == ((dst_width + 1) / 2)); assert(src_height == ((dst_height + 1) / 2)); -#ifdef HAS_SCALEROWUP2_BILINEAR_16_SSE2 +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - Scale2RowUp = ScaleRowUp2_Bilinear_16_Any_SSE2; } #endif @@ -1779,19 +1732,15 @@ static int ScalePlaneBilinearUp_16(int src_width, &dx, &dy); src_width = Abs(src_width); -#if defined(HAS_INTERPOLATEROW_16_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - InterpolateRow = InterpolateRow_16_Any_SSE2; if (IS_ALIGNED(dst_width, 16)) { - InterpolateRow = InterpolateRow_16_SSE2; } } #endif -#if defined(HAS_INTERPOLATEROW_16_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - InterpolateRow = InterpolateRow_16_Any_SSSE3; if (IS_ALIGNED(dst_width, 16)) { - InterpolateRow = InterpolateRow_16_SSSE3; } } #endif @@ -1820,16 +1769,14 @@ static int ScalePlaneBilinearUp_16(int src_width, if (filtering && src_width >= 32768) { ScaleFilterCols = ScaleFilterCols64_16_C; } -#if defined(HAS_SCALEFILTERCOLS_16_SSSE3) +#if 0 if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { - ScaleFilterCols = ScaleFilterCols_16_SSSE3; } #endif if (!filtering && src_width * 2 == dst_width && x < 0x8000) { ScaleFilterCols = ScaleColsUp2_16_C; -#if defined(HAS_SCALECOLS_16_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { - ScaleFilterCols = ScaleColsUp2_16_SSE2; } #endif } @@ -1917,9 +1864,8 @@ static void ScalePlaneSimple(int src_width, if (src_width * 2 == dst_width && x < 0x8000) { ScaleCols = ScaleColsUp2_C; -#if defined(HAS_SCALECOLS_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { - ScaleCols = ScaleColsUp2_SSE2; } #endif } @@ -1954,9 +1900,8 @@ static void ScalePlaneSimple_16(int src_width, if (src_width * 2 == dst_width && x < 0x8000) { ScaleCols = ScaleColsUp2_16_C; -#if defined(HAS_SCALECOLS_16_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { - ScaleCols = ScaleColsUp2_16_SSE2; } #endif } diff --git a/source/scale_any.cc b/source/scale_any.cc index c380bebbc..f7f457e3a 100644 --- a/source/scale_any.cc +++ b/source/scale_any.cc @@ -49,7 +49,7 @@ extern "C" { dst_ptr + n * BPP, r + 1); \ } -#ifdef HAS_SCALEROWDOWN2_SSSE3 +#if 0 SDANY(ScaleRowDown2_Any_SSSE3, ScaleRowDown2_SSSE3, ScaleRowDown2_C, 2, 1, 15) SDANY(ScaleRowDown2Linear_Any_SSSE3, ScaleRowDown2Linear_SSSE3, @@ -70,7 +70,7 @@ SDODD(ScaleRowDown2Box_Odd_SSSE3, 1, 15) #endif -#ifdef HAS_SCALEUVROWDOWN2BOX_SSSE3 +#if 0 SDANY(ScaleUVRowDown2Box_Any_SSSE3, ScaleUVRowDown2Box_SSSE3, ScaleUVRowDown2Box_C, @@ -167,7 +167,7 @@ SDANY(ScaleRowDown2Box_Any_LSX, 1, 31) #endif -#ifdef HAS_SCALEROWDOWN4_SSSE3 +#if 0 SDANY(ScaleRowDown4_Any_SSSE3, ScaleRowDown4_SSSE3, ScaleRowDown4_C, 4, 1, 7) SDANY(ScaleRowDown4Box_Any_SSSE3, ScaleRowDown4Box_SSSE3, @@ -203,7 +203,7 @@ SDANY(ScaleRowDown4Box_Any_LSX, 1, 15) #endif -#ifdef HAS_SCALEROWDOWN34_SSSE3 +#if 0 SDANY(ScaleRowDown34_Any_SSSE3, ScaleRowDown34_SSSE3, ScaleRowDown34_C, @@ -284,7 +284,7 @@ SDANY(ScaleRowDown34_1_Box_Any_LSX, 1, 47) #endif -#ifdef HAS_SCALEROWDOWN38_SSSE3 +#if 0 SDANY(ScaleRowDown38_Any_SSSE3, ScaleRowDown38_SSSE3, ScaleRowDown38_C, @@ -345,7 +345,7 @@ SDANY(ScaleRowDown38_2_Box_Any_LSX, 11) #endif -#ifdef HAS_SCALEARGBROWDOWN2_SSE2 +#if 0 SDANY(ScaleARGBRowDown2_Any_SSE2, ScaleARGBRowDown2_SSE2, ScaleARGBRowDown2_C, @@ -420,7 +420,7 @@ SDANY(ScaleARGBRowDown2Box_Any_LSX, dst_ptr + n * BPP, r); \ } -#ifdef HAS_SCALEARGBROWDOWNEVEN_SSE2 +#if 0 SDAANY(ScaleARGBRowDownEven_Any_SSE2, ScaleARGBRowDownEven_SSE2, ScaleARGBRowDownEven_C, @@ -484,7 +484,7 @@ SDAANY(ScaleUVRowDownEven_Any_NEON, memcpy(dst_ptr + n * BPP, dst_temp, r * BPP); \ } -#ifdef HAS_SCALEADDROW_SSE2 +#if 0 SAROW(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, 1, 2, 15) #endif #ifdef HAS_SCALEADDROW_AVX2 @@ -510,7 +510,7 @@ SAROW(ScaleAddRow_Any_LSX, ScaleAddRow_LSX, 1, 2, 15) SCALEADDROW_C(src_ptr + n, dst_ptr + n, src_width & MASK); \ } -#ifdef HAS_SCALEADDROW_SSE2 +#if 0 SAANY(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, ScaleAddRow_C, 15) #endif #ifdef HAS_SCALEADDROW_AVX2 @@ -597,7 +597,7 @@ SUH2LANY(ScaleRowUp2_Linear_16_Any_C, 0, uint16_t) -#ifdef HAS_SCALEROWUP2_LINEAR_SSE2 +#if 0 SUH2LANY(ScaleRowUp2_Linear_Any_SSE2, ScaleRowUp2_Linear_SSE2, ScaleRowUp2_Linear_C, @@ -605,7 +605,7 @@ SUH2LANY(ScaleRowUp2_Linear_Any_SSE2, uint8_t) #endif -#ifdef HAS_SCALEROWUP2_LINEAR_SSSE3 +#if 0 SUH2LANY(ScaleRowUp2_Linear_Any_SSSE3, ScaleRowUp2_Linear_SSSE3, ScaleRowUp2_Linear_C, @@ -613,7 +613,7 @@ SUH2LANY(ScaleRowUp2_Linear_Any_SSSE3, uint8_t) #endif -#ifdef HAS_SCALEROWUP2_LINEAR_12_SSSE3 +#if 0 SUH2LANY(ScaleRowUp2_Linear_12_Any_SSSE3, ScaleRowUp2_Linear_12_SSSE3, ScaleRowUp2_Linear_16_C, @@ -621,7 +621,7 @@ SUH2LANY(ScaleRowUp2_Linear_12_Any_SSSE3, uint16_t) #endif -#ifdef HAS_SCALEROWUP2_LINEAR_16_SSE2 +#if 0 SUH2LANY(ScaleRowUp2_Linear_16_Any_SSE2, ScaleRowUp2_Linear_16_SSE2, ScaleRowUp2_Linear_16_C, @@ -725,7 +725,7 @@ SU2BLANY(ScaleRowUp2_Bilinear_16_Any_C, 0, uint16_t) -#ifdef HAS_SCALEROWUP2_BILINEAR_SSE2 +#if 0 SU2BLANY(ScaleRowUp2_Bilinear_Any_SSE2, ScaleRowUp2_Bilinear_SSE2, ScaleRowUp2_Bilinear_C, @@ -733,7 +733,7 @@ SU2BLANY(ScaleRowUp2_Bilinear_Any_SSE2, uint8_t) #endif -#ifdef HAS_SCALEROWUP2_BILINEAR_12_SSSE3 +#if 0 SU2BLANY(ScaleRowUp2_Bilinear_12_Any_SSSE3, ScaleRowUp2_Bilinear_12_SSSE3, ScaleRowUp2_Bilinear_16_C, @@ -741,7 +741,7 @@ SU2BLANY(ScaleRowUp2_Bilinear_12_Any_SSSE3, uint16_t) #endif -#ifdef HAS_SCALEROWUP2_BILINEAR_16_SSE2 +#if 0 SU2BLANY(ScaleRowUp2_Bilinear_16_Any_SSE2, ScaleRowUp2_Bilinear_16_SSE2, ScaleRowUp2_Bilinear_16_C, @@ -749,7 +749,7 @@ SU2BLANY(ScaleRowUp2_Bilinear_16_Any_SSE2, uint16_t) #endif -#ifdef HAS_SCALEROWUP2_BILINEAR_SSSE3 +#if 0 SU2BLANY(ScaleRowUp2_Bilinear_Any_SSSE3, ScaleRowUp2_Bilinear_SSSE3, ScaleRowUp2_Bilinear_C, @@ -837,7 +837,7 @@ SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_C, 0, uint16_t) -#ifdef HAS_SCALEUVROWUP2_LINEAR_SSSE3 +#if 0 SBUH2LANY(ScaleUVRowUp2_Linear_Any_SSSE3, ScaleUVRowUp2_Linear_SSSE3, ScaleUVRowUp2_Linear_C, @@ -853,7 +853,7 @@ SBUH2LANY(ScaleUVRowUp2_Linear_Any_AVX2, uint8_t) #endif -#ifdef HAS_SCALEUVROWUP2_LINEAR_16_SSE41 +#if 0 SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_SSE41, ScaleUVRowUp2_Linear_16_SSE41, ScaleUVRowUp2_Linear_16_C, @@ -935,7 +935,7 @@ SBU2BLANY(ScaleUVRowUp2_Bilinear_16_Any_C, 0, uint16_t) -#ifdef HAS_SCALEUVROWUP2_BILINEAR_SSSE3 +#if 0 SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_SSSE3, ScaleUVRowUp2_Bilinear_SSSE3, ScaleUVRowUp2_Bilinear_C, @@ -951,7 +951,7 @@ SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_AVX2, uint8_t) #endif -#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_SSE41 +#if 0 SBU2BLANY(ScaleUVRowUp2_Bilinear_16_Any_SSE41, ScaleUVRowUp2_Bilinear_16_SSE41, ScaleUVRowUp2_Bilinear_16_C, diff --git a/source/scale_argb.cc b/source/scale_argb.cc index 506409c15..02951eccc 100644 --- a/source/scale_argb.cc +++ b/source/scale_argb.cc @@ -67,7 +67,7 @@ static void ScaleARGBDown2(int src_width, src_argb += (y >> 16) * (intptr_t)src_stride + ((x >> 16) - 1) * 4; } -#if defined(HAS_SCALEARGBROWDOWN2_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { ScaleARGBRowDown2 = filtering == kFilterNone @@ -180,11 +180,9 @@ static int ScaleARGBDown4Box(int src_width, (void)dx; assert(dx == 65536 * 4); // Test scale factor of 4. assert((dy & 0x3ffff) == 0); // Test vertical scale is multiple of 4. -#if defined(HAS_SCALEARGBROWDOWN2_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - ScaleARGBRowDown2 = ScaleARGBRowDown2Box_Any_SSE2; if (IS_ALIGNED(dst_width, 4)) { - ScaleARGBRowDown2 = ScaleARGBRowDown2Box_SSE2; } } #endif @@ -246,7 +244,7 @@ static void ScaleARGBDownEven(int src_width, assert(IS_ALIGNED(src_width, 2)); assert(IS_ALIGNED(src_height, 2)); src_argb += (y >> 16) * (intptr_t)src_stride + (x >> 16) * 4; -#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_SSE2 : ScaleARGBRowDownEven_Any_SSE2; @@ -331,11 +329,9 @@ static int ScaleARGBBilinearDown(int src_width, clip_src_width = (int)(xr - xl) * 4; // Width aligned to 4. src_argb += xl * 4; x -= (int)(xl << 16); -#if defined(HAS_INTERPOLATEROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - InterpolateRow = InterpolateRow_Any_SSSE3; if (IS_ALIGNED(clip_src_width, 16)) { - InterpolateRow = InterpolateRow_SSSE3; } } #endif @@ -373,9 +369,8 @@ static int ScaleARGBBilinearDown(int src_width, InterpolateRow = InterpolateRow_RVV; } #endif -#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { - ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3; } #endif #if defined(HAS_SCALEARGBFILTERCOLS_NEON) @@ -454,11 +449,9 @@ static int ScaleARGBBilinearUp(int src_width, int dst_width, int x, int dx) = filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C; const int max_y = (src_height - 1) << 16; -#if defined(HAS_INTERPOLATEROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - InterpolateRow = InterpolateRow_Any_SSSE3; if (IS_ALIGNED(dst_width, 4)) { - InterpolateRow = InterpolateRow_SSSE3; } } #endif @@ -500,9 +493,8 @@ static int ScaleARGBBilinearUp(int src_width, ScaleARGBFilterCols = filtering ? ScaleARGBFilterCols64_C : ScaleARGBCols64_C; } -#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3) +#if 0 if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { - ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3; } #endif #if defined(HAS_SCALEARGBFILTERCOLS_NEON) @@ -526,9 +518,8 @@ static int ScaleARGBBilinearUp(int src_width, ScaleARGBFilterCols = ScaleARGBFilterCols_RVV; } #endif -#if defined(HAS_SCALEARGBCOLS_SSE2) +#if 0 if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) { - ScaleARGBFilterCols = ScaleARGBCols_SSE2; } #endif #if defined(HAS_SCALEARGBCOLS_NEON) @@ -549,9 +540,8 @@ static int ScaleARGBBilinearUp(int src_width, #endif if (!filtering && src_width * 2 == dst_width && x < 0x8000) { ScaleARGBFilterCols = ScaleARGBColsUp2_C; -#if defined(HAS_SCALEARGBCOLSUP2_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { - ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2; } #endif } @@ -638,11 +628,9 @@ static int ScaleYUVToARGBBilinearUp(int src_width, void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* rgb_buf, int width) = I422ToARGBRow_C; -#if defined(HAS_I422TOARGBROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToARGBRow = I422ToARGBRow_Any_SSSE3; if (IS_ALIGNED(src_width, 8)) { - I422ToARGBRow = I422ToARGBRow_SSSE3; } } #endif @@ -706,11 +694,9 @@ static int ScaleYUVToARGBBilinearUp(int src_width, void (*InterpolateRow)(uint8_t* dst_argb, const uint8_t* src_argb, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; -#if defined(HAS_INTERPOLATEROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - InterpolateRow = InterpolateRow_Any_SSSE3; if (IS_ALIGNED(dst_width, 4)) { - InterpolateRow = InterpolateRow_SSSE3; } } #endif @@ -756,9 +742,8 @@ static int ScaleYUVToARGBBilinearUp(int src_width, ScaleARGBFilterCols = filtering ? ScaleARGBFilterCols64_C : ScaleARGBCols64_C; } -#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3) +#if 0 if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { - ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3; } #endif #if defined(HAS_SCALEARGBFILTERCOLS_NEON) @@ -782,9 +767,8 @@ static int ScaleYUVToARGBBilinearUp(int src_width, ScaleARGBFilterCols = ScaleARGBFilterCols_RVV; } #endif -#if defined(HAS_SCALEARGBCOLS_SSE2) +#if 0 if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) { - ScaleARGBFilterCols = ScaleARGBCols_SSE2; } #endif #if defined(HAS_SCALEARGBCOLS_NEON) @@ -805,9 +789,8 @@ static int ScaleYUVToARGBBilinearUp(int src_width, #endif if (!filtering && src_width * 2 == dst_width && x < 0x8000) { ScaleARGBFilterCols = ScaleARGBColsUp2_C; -#if defined(HAS_SCALEARGBCOLSUP2_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { - ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2; } #endif } @@ -914,9 +897,8 @@ static void ScaleARGBSimple(int src_width, int dst_width, int x, int dx) = (src_width >= 32768) ? ScaleARGBCols64_C : ScaleARGBCols_C; (void)src_height; -#if defined(HAS_SCALEARGBCOLS_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2) && src_width < 32768) { - ScaleARGBCols = ScaleARGBCols_SSE2; } #endif #if defined(HAS_SCALEARGBCOLS_NEON) @@ -937,9 +919,8 @@ static void ScaleARGBSimple(int src_width, #endif if (src_width * 2 == dst_width && x < 0x8000) { ScaleARGBCols = ScaleARGBColsUp2_C; -#if defined(HAS_SCALEARGBCOLSUP2_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { - ScaleARGBCols = ScaleARGBColsUp2_SSE2; } #endif } diff --git a/source/scale_common.cc b/source/scale_common.cc index 537f030aa..367292158 100644 --- a/source/scale_common.cc +++ b/source/scale_common.cc @@ -1636,11 +1636,9 @@ void ScalePlaneVertical(int src_height, assert(dst_width > 0); assert(dst_height > 0); src_argb += (x >> 16) * bpp; -#if defined(HAS_INTERPOLATEROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - InterpolateRow = InterpolateRow_Any_SSSE3; if (IS_ALIGNED(dst_width_bytes, 16)) { - InterpolateRow = InterpolateRow_SSSE3; } } #endif @@ -1718,19 +1716,15 @@ void ScalePlaneVertical_16(int src_height, assert(dst_width > 0); assert(dst_height > 0); src_argb += (x >> 16) * wpp; -#if defined(HAS_INTERPOLATEROW_16_SSE2) +#if 0 if (TestCpuFlag(kCpuHasSSE2)) { - InterpolateRow = InterpolateRow_16_Any_SSE2; if (IS_ALIGNED(dst_width_words, 16)) { - InterpolateRow = InterpolateRow_16_SSE2; } } #endif -#if defined(HAS_INTERPOLATEROW_16_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - InterpolateRow = InterpolateRow_16_Any_SSSE3; if (IS_ALIGNED(dst_width_words, 16)) { - InterpolateRow = InterpolateRow_16_SSSE3; } } #endif diff --git a/source/scale_gcc.cc b/source/scale_gcc.cc index 6a2524230..58fa88da3 100644 --- a/source/scale_gcc.cc +++ b/source/scale_gcc.cc @@ -94,101 +94,11 @@ static const uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, // Generated using gcc disassembly on Visual C object file: // objdump -D yuvscaler.obj >yuvscaler.txt -void ScaleRowDown2_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - (void)src_stride; - asm volatile( - // 16 pixel loop. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "xmm0", "xmm1"); -} -void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - (void)src_stride; - asm volatile( - "pcmpeqb %%xmm4,%%xmm4 \n" // 0x0101 - "pabsb %%xmm4,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pavgw %%xmm5,%%xmm0 \n" - "pavgw %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"); -} -void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - asm volatile( - "pcmpeqb %%xmm4,%%xmm4 \n" // 0x0101 - "pabsb %%xmm4,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x00(%0,%3,1),%%xmm2 \n" - "movdqu 0x10(%0,%3,1),%%xmm3 \n" - "lea 0x20(%0),%0 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm3,%%xmm1 \n" - "psrlw $0x1,%%xmm0 \n" - "psrlw $0x1,%%xmm1 \n" - "pavgw %%xmm5,%%xmm0 \n" - "pavgw %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(src_stride) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); -} #ifdef HAS_SCALEROWDOWN2_AVX2 void ScaleRowDown2_AVX2(const uint8_t* src_ptr, @@ -291,89 +201,9 @@ void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr, } #endif // HAS_SCALEROWDOWN2_AVX2 -void ScaleRowDown4_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - (void)src_stride; - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrld $0x18,%%xmm5 \n" - "pslld $0x10,%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "psrlw $0x8,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm5"); -} -void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - ptrdiff_t stridex3; - asm volatile( - "pcmpeqb %%xmm4,%%xmm4 \n" - "pabsw %%xmm4,%%xmm5 \n" - "pabsb %%xmm4,%%xmm4 \n" // 0x0101 - "psllw $0x3,%%xmm5 \n" // 0x0008 - "lea 0x00(%4,%4,2),%3 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x00(%0,%4,1),%%xmm2 \n" - "movdqu 0x10(%0,%4,1),%%xmm3 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm3,%%xmm1 \n" - "movdqu 0x00(%0,%4,2),%%xmm2 \n" - "movdqu 0x10(%0,%4,2),%%xmm3 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm3,%%xmm1 \n" - "movdqu 0x00(%0,%3,1),%%xmm2 \n" - "movdqu 0x10(%0,%3,1),%%xmm3 \n" - "lea 0x20(%0),%0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm3,%%xmm1 \n" - "phaddw %%xmm1,%%xmm0 \n" - "paddw %%xmm5,%%xmm0 \n" - "psrlw $0x4,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "=&r"(stridex3) // %3 - : "r"(src_stride) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} #ifdef HAS_SCALEROWDOWN4_AVX2 void ScaleRowDown4_AVX2(const uint8_t* src_ptr, @@ -465,310 +295,17 @@ void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr, } #endif // HAS_SCALEROWDOWN4_AVX2 -void ScaleRowDown34_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - (void)src_stride; - asm volatile( - "movdqa %0,%%xmm3 \n" - "movdqa %1,%%xmm4 \n" - "movdqa %2,%%xmm5 \n" - : - : "m"(kShuf0), // %0 - "m"(kShuf1), // %1 - "m"(kShuf2) // %2 - ); - asm volatile( - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm2 \n" - "lea 0x20(%0),%0 \n" - "movdqa %%xmm2,%%xmm1 \n" - "palignr $0x8,%%xmm0,%%xmm1 \n" - "pshufb %%xmm3,%%xmm0 \n" - "pshufb %%xmm4,%%xmm1 \n" - "pshufb %%xmm5,%%xmm2 \n" - "movq %%xmm0,(%1) \n" - "movq %%xmm1,0x8(%1) \n" - "movq %%xmm2,0x10(%1) \n" - "lea 0x18(%1),%1 \n" - "sub $0x18,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} -void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - asm volatile( - "movdqa %0,%%xmm2 \n" // kShuf01 - "movdqa %1,%%xmm3 \n" // kShuf11 - "movdqa %2,%%xmm4 \n" // kShuf21 - : - : "m"(kShuf01), // %0 - "m"(kShuf11), // %1 - "m"(kShuf21) // %2 - ); - asm volatile( - "movdqa %0,%%xmm5 \n" // kMadd01 - "movdqa %1,%%xmm0 \n" // kMadd11 - "movdqa %2,%%xmm1 \n" // kRound34 - : - : "m"(kMadd01), // %0 - "m"(kMadd11), // %1 - "m"(kRound34) // %2 - ); - asm volatile( - "1: \n" - "movdqu (%0),%%xmm6 \n" - "movdqu 0x00(%0,%3,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm5,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6,(%1) \n" - "movdqu 0x8(%0),%%xmm6 \n" - "movdqu 0x8(%0,%3,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm3,%%xmm6 \n" - "pmaddubsw %%xmm0,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6,0x8(%1) \n" - "movdqu 0x10(%0),%%xmm6 \n" - "movdqu 0x10(%0,%3,1),%%xmm7 \n" - "lea 0x20(%0),%0 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm4,%%xmm6 \n" - "pmaddubsw %4,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6,0x10(%1) \n" - "lea 0x18(%1),%1 \n" - "sub $0x18,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(src_stride), // %3 - "m"(kMadd21) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - asm volatile( - "movdqa %0,%%xmm2 \n" // kShuf01 - "movdqa %1,%%xmm3 \n" // kShuf11 - "movdqa %2,%%xmm4 \n" // kShuf21 - : - : "m"(kShuf01), // %0 - "m"(kShuf11), // %1 - "m"(kShuf21) // %2 - ); - asm volatile( - "movdqa %0,%%xmm5 \n" // kMadd01 - "movdqa %1,%%xmm0 \n" // kMadd11 - "movdqa %2,%%xmm1 \n" // kRound34 - : - : "m"(kMadd01), // %0 - "m"(kMadd11), // %1 - "m"(kRound34) // %2 - ); - asm volatile( - "1: \n" - "movdqu (%0),%%xmm6 \n" - "movdqu 0x00(%0,%3,1),%%xmm7 \n" - "pavgb %%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm5,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6,(%1) \n" - "movdqu 0x8(%0),%%xmm6 \n" - "movdqu 0x8(%0,%3,1),%%xmm7 \n" - "pavgb %%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm3,%%xmm6 \n" - "pmaddubsw %%xmm0,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6,0x8(%1) \n" - "movdqu 0x10(%0),%%xmm6 \n" - "movdqu 0x10(%0,%3,1),%%xmm7 \n" - "lea 0x20(%0),%0 \n" - "pavgb %%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm4,%%xmm6 \n" - "pmaddubsw %4,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6,0x10(%1) \n" - "lea 0x18(%1),%1 \n" - "sub $0x18,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(src_stride), // %3 - "m"(kMadd21) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -void ScaleRowDown38_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - (void)src_stride; - asm volatile( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pshufb %%xmm4,%%xmm0 \n" - "pshufb %%xmm5,%%xmm1 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "movhlps %%xmm0,%%xmm1 \n" - "movd %%xmm1,0x8(%1) \n" - "lea 0xc(%1),%1 \n" - "sub $0xc,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "m"(kShuf38a), // %3 - "m"(kShuf38b) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"); -} -void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - asm volatile( - "movdqa %0,%%xmm2 \n" - "movdqa %1,%%xmm3 \n" - "movdqa %2,%%xmm4 \n" - "movdqa %3,%%xmm5 \n" - : - : "m"(kShufAb0), // %0 - "m"(kShufAb1), // %1 - "m"(kShufAb2), // %2 - "m"(kScaleAb2) // %3 - ); - asm volatile( - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%3,1),%%xmm1 \n" - "lea 0x10(%0),%0 \n" - "pavgb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pshufb %%xmm2,%%xmm1 \n" - "movdqa %%xmm0,%%xmm6 \n" - "pshufb %%xmm3,%%xmm6 \n" - "paddusw %%xmm6,%%xmm1 \n" - "pshufb %%xmm4,%%xmm0 \n" - "paddusw %%xmm0,%%xmm1 \n" - "pmulhuw %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movd %%xmm1,(%1) \n" - "psrlq $0x10,%%xmm1 \n" - "movd %%xmm1,0x2(%1) \n" - "lea 0x6(%1),%1 \n" - "sub $0x6,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(src_stride) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} -void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - asm volatile( - "movdqa %0,%%xmm2 \n" - "movdqa %1,%%xmm3 \n" - "movdqa %2,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" - : - : "m"(kShufAc), // %0 - "m"(kShufAc3), // %1 - "m"(kScaleAc33) // %2 - ); - asm volatile( - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%3,1),%%xmm6 \n" - "movhlps %%xmm0,%%xmm1 \n" - "movhlps %%xmm6,%%xmm7 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm6 \n" - "punpcklbw %%xmm5,%%xmm7 \n" - "paddusw %%xmm6,%%xmm0 \n" - "paddusw %%xmm7,%%xmm1 \n" - "movdqu 0x00(%0,%3,2),%%xmm6 \n" - "lea 0x10(%0),%0 \n" - "movhlps %%xmm6,%%xmm7 \n" - "punpcklbw %%xmm5,%%xmm6 \n" - "punpcklbw %%xmm5,%%xmm7 \n" - "paddusw %%xmm6,%%xmm0 \n" - "paddusw %%xmm7,%%xmm1 \n" - "movdqa %%xmm0,%%xmm6 \n" - "psrldq $0x2,%%xmm0 \n" - "paddusw %%xmm0,%%xmm6 \n" - "psrldq $0x2,%%xmm0 \n" - "paddusw %%xmm0,%%xmm6 \n" - "pshufb %%xmm2,%%xmm6 \n" - "movdqa %%xmm1,%%xmm7 \n" - "psrldq $0x2,%%xmm1 \n" - "paddusw %%xmm1,%%xmm7 \n" - "psrldq $0x2,%%xmm1 \n" - "paddusw %%xmm1,%%xmm7 \n" - "pshufb %%xmm3,%%xmm7 \n" - "paddusw %%xmm7,%%xmm6 \n" - "pmulhuw %%xmm4,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movd %%xmm6,(%1) \n" - "psrlq $0x10,%%xmm6 \n" - "movd %%xmm6,0x2(%1) \n" - "lea 0x6(%1),%1 \n" - "sub $0x6,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(src_stride) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} + + + + static const uvec8 kLinearShuffleFar = {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; @@ -776,611 +313,13 @@ static const uvec8 kLinearShuffleFar = {2, 3, 0, 1, 6, 7, 4, 5, static const uvec8 kLinearMadd31 = {3, 1, 1, 3, 3, 1, 1, 3, 3, 1, 1, 3, 3, 1, 1, 3}; -#ifdef HAS_SCALEROWUP2_LINEAR_SSE2 -void ScaleRowUp2_Linear_SSE2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width) { - asm volatile( - "pxor %%xmm0,%%xmm0 \n" // 0 - "pcmpeqw %%xmm6,%%xmm6 \n" - "psrlw $15,%%xmm6 \n" - "psllw $1,%%xmm6 \n" // all 2 - LABELALIGN - "1: \n" - "movq (%0),%%xmm1 \n" // 01234567 - "movq 1(%0),%%xmm2 \n" // 12345678 - "movdqa %%xmm1,%%xmm3 \n" - "punpcklbw %%xmm2,%%xmm3 \n" // 0112233445566778 - "punpcklbw %%xmm1,%%xmm1 \n" // 0011223344556677 - "punpcklbw %%xmm2,%%xmm2 \n" // 1122334455667788 - "movdqa %%xmm1,%%xmm4 \n" - "punpcklbw %%xmm0,%%xmm4 \n" // 00112233 (16) - "movdqa %%xmm2,%%xmm5 \n" - "punpcklbw %%xmm0,%%xmm5 \n" // 11223344 (16) - "paddw %%xmm5,%%xmm4 \n" - "movdqa %%xmm3,%%xmm5 \n" - "paddw %%xmm6,%%xmm4 \n" - "punpcklbw %%xmm0,%%xmm5 \n" // 01122334 (16) - "paddw %%xmm5,%%xmm5 \n" - "paddw %%xmm4,%%xmm5 \n" // 3*near+far+2 (lo) - "psrlw $2,%%xmm5 \n" // 3/4*near+1/4*far (lo) - "punpckhbw %%xmm0,%%xmm1 \n" // 44556677 (16) - "punpckhbw %%xmm0,%%xmm2 \n" // 55667788 (16) - "paddw %%xmm2,%%xmm1 \n" - "punpckhbw %%xmm0,%%xmm3 \n" // 45566778 (16) - "paddw %%xmm6,%%xmm1 \n" - "paddw %%xmm3,%%xmm3 \n" - "paddw %%xmm3,%%xmm1 \n" // 3*near+far+2 (hi) - "psrlw $2,%%xmm1 \n" // 3/4*near+1/4*far (hi) - "packuswb %%xmm1,%%xmm5 \n" - "movdqu %%xmm5,(%1) \n" - "lea 0x8(%0),%0 \n" - "lea 0x10(%1),%1 \n" // 8 sample to 16 sample - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} -#endif -#ifdef HAS_SCALEROWUP2_BILINEAR_SSE2 -void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - asm volatile( - "1: \n" - "pxor %%xmm0,%%xmm0 \n" // 0 - // above line - "movq (%0),%%xmm1 \n" // 01234567 - "movq 1(%0),%%xmm2 \n" // 12345678 - "movdqa %%xmm1,%%xmm3 \n" - "punpcklbw %%xmm2,%%xmm3 \n" // 0112233445566778 - "punpcklbw %%xmm1,%%xmm1 \n" // 0011223344556677 - "punpcklbw %%xmm2,%%xmm2 \n" // 1122334455667788 - "movdqa %%xmm1,%%xmm4 \n" - "punpcklbw %%xmm0,%%xmm4 \n" // 00112233 (16) - "movdqa %%xmm2,%%xmm5 \n" - "punpcklbw %%xmm0,%%xmm5 \n" // 11223344 (16) - "paddw %%xmm5,%%xmm4 \n" // near+far - "movdqa %%xmm3,%%xmm5 \n" - "punpcklbw %%xmm0,%%xmm5 \n" // 01122334 (16) - "paddw %%xmm5,%%xmm5 \n" // 2*near - "paddw %%xmm5,%%xmm4 \n" // 3*near+far (1, lo) - "punpckhbw %%xmm0,%%xmm1 \n" // 44556677 (16) - "punpckhbw %%xmm0,%%xmm2 \n" // 55667788 (16) - "paddw %%xmm2,%%xmm1 \n" - "punpckhbw %%xmm0,%%xmm3 \n" // 45566778 (16) - "paddw %%xmm3,%%xmm3 \n" // 2*near - "paddw %%xmm3,%%xmm1 \n" // 3*near+far (1, hi) - - // below line - "movq (%0,%3),%%xmm6 \n" // 01234567 - "movq 1(%0,%3),%%xmm2 \n" // 12345678 - "movdqa %%xmm6,%%xmm3 \n" - "punpcklbw %%xmm2,%%xmm3 \n" // 0112233445566778 - "punpcklbw %%xmm6,%%xmm6 \n" // 0011223344556677 - "punpcklbw %%xmm2,%%xmm2 \n" // 1122334455667788 - - "movdqa %%xmm6,%%xmm5 \n" - "punpcklbw %%xmm0,%%xmm5 \n" // 00112233 (16) - "movdqa %%xmm2,%%xmm7 \n" - "punpcklbw %%xmm0,%%xmm7 \n" // 11223344 (16) - "paddw %%xmm7,%%xmm5 \n" // near+far - "movdqa %%xmm3,%%xmm7 \n" - "punpcklbw %%xmm0,%%xmm7 \n" // 01122334 (16) - "paddw %%xmm7,%%xmm7 \n" // 2*near - "paddw %%xmm7,%%xmm5 \n" // 3*near+far (2, lo) - - "punpckhbw %%xmm0,%%xmm6 \n" // 44556677 (16) - "punpckhbw %%xmm0,%%xmm2 \n" // 55667788 (16) - "paddw %%xmm6,%%xmm2 \n" // near+far - "punpckhbw %%xmm0,%%xmm3 \n" // 45566778 (16) - "paddw %%xmm3,%%xmm3 \n" // 2*near - "paddw %%xmm3,%%xmm2 \n" // 3*near+far (2, hi) - - // xmm4 xmm1 - // xmm5 xmm2 - "pcmpeqw %%xmm0,%%xmm0 \n" - "psrlw $15,%%xmm0 \n" - "psllw $3,%%xmm0 \n" // all 8 - - "movdqa %%xmm4,%%xmm3 \n" - "movdqa %%xmm5,%%xmm6 \n" - "paddw %%xmm3,%%xmm3 \n" // 6*near+2*far (1, lo) - "paddw %%xmm0,%%xmm6 \n" // 3*near+far+8 (2, lo) - "paddw %%xmm4,%%xmm3 \n" // 9*near+3*far (1, lo) - "paddw %%xmm6,%%xmm3 \n" // 9 3 3 1 + 8 (1, lo) - "psrlw $4,%%xmm3 \n" // ^ div by 16 - - "movdqa %%xmm1,%%xmm7 \n" - "movdqa %%xmm2,%%xmm6 \n" - "paddw %%xmm7,%%xmm7 \n" // 6*near+2*far (1, hi) - "paddw %%xmm0,%%xmm6 \n" // 3*near+far+8 (2, hi) - "paddw %%xmm1,%%xmm7 \n" // 9*near+3*far (1, hi) - "paddw %%xmm6,%%xmm7 \n" // 9 3 3 1 + 8 (1, hi) - "psrlw $4,%%xmm7 \n" // ^ div by 16 - - "packuswb %%xmm7,%%xmm3 \n" - "movdqu %%xmm3,(%1) \n" // save above line - - "movdqa %%xmm5,%%xmm3 \n" - "paddw %%xmm0,%%xmm4 \n" // 3*near+far+8 (1, lo) - "paddw %%xmm3,%%xmm3 \n" // 6*near+2*far (2, lo) - "paddw %%xmm3,%%xmm5 \n" // 9*near+3*far (2, lo) - "paddw %%xmm4,%%xmm5 \n" // 9 3 3 1 + 8 (lo) - "psrlw $4,%%xmm5 \n" // ^ div by 16 - - "movdqa %%xmm2,%%xmm3 \n" - "paddw %%xmm0,%%xmm1 \n" // 3*near+far+8 (1, hi) - "paddw %%xmm3,%%xmm3 \n" // 6*near+2*far (2, hi) - "paddw %%xmm3,%%xmm2 \n" // 9*near+3*far (2, hi) - "paddw %%xmm1,%%xmm2 \n" // 9 3 3 1 + 8 (hi) - "psrlw $4,%%xmm2 \n" // ^ div by 16 - - "packuswb %%xmm2,%%xmm5 \n" - "movdqu %%xmm5,(%1,%4) \n" // save below line - - "lea 0x8(%0),%0 \n" - "lea 0x10(%1),%1 \n" // 8 sample to 16 sample - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(src_stride), // %3 - "r"(dst_stride) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif - -#ifdef HAS_SCALEROWUP2_LINEAR_12_SSSE3 -void ScaleRowUp2_Linear_12_SSSE3(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width) { - asm volatile( - "movdqa %3,%%xmm5 \n" - "pcmpeqw %%xmm4,%%xmm4 \n" - "psrlw $15,%%xmm4 \n" - "psllw $1,%%xmm4 \n" // all 2 - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" // 01234567 (16) - "movdqu 2(%0),%%xmm1 \n" // 12345678 (16) - - "movdqa %%xmm0,%%xmm2 \n" - "punpckhwd %%xmm1,%%xmm2 \n" // 45566778 (16) - "punpcklwd %%xmm1,%%xmm0 \n" // 01122334 (16) - - "movdqa %%xmm2,%%xmm3 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pshufb %%xmm5,%%xmm3 \n" // 54657687 (far) - "pshufb %%xmm5,%%xmm1 \n" // 10213243 (far) - - "paddw %%xmm4,%%xmm1 \n" // far+2 - "paddw %%xmm4,%%xmm3 \n" // far+2 - "paddw %%xmm0,%%xmm1 \n" // near+far+2 - "paddw %%xmm2,%%xmm3 \n" // near+far+2 - "paddw %%xmm0,%%xmm0 \n" // 2*near - "paddw %%xmm2,%%xmm2 \n" // 2*near - "paddw %%xmm1,%%xmm0 \n" // 3*near+far+2 (lo) - "paddw %%xmm3,%%xmm2 \n" // 3*near+far+2 (hi) - - "psrlw $2,%%xmm0 \n" // 3/4*near+1/4*far - "psrlw $2,%%xmm2 \n" // 3/4*near+1/4*far - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm2,16(%1) \n" - - "lea 0x10(%0),%0 \n" - "lea 0x20(%1),%1 \n" // 8 sample to 16 sample - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "m"(kLinearShuffleFar) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} -#endif - -#ifdef HAS_SCALEROWUP2_BILINEAR_12_SSSE3 -void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - asm volatile( - "pcmpeqw %%xmm7,%%xmm7 \n" - "psrlw $15,%%xmm7 \n" - "psllw $3,%%xmm7 \n" // all 8 - "movdqa %5,%%xmm6 \n" - - LABELALIGN - "1: \n" - // above line - "movdqu (%0),%%xmm0 \n" // 01234567 (16) - "movdqu 2(%0),%%xmm1 \n" // 12345678 (16) - "movdqa %%xmm0,%%xmm2 \n" - "punpckhwd %%xmm1,%%xmm2 \n" // 45566778 (16) - "punpcklwd %%xmm1,%%xmm0 \n" // 01122334 (16) - "movdqa %%xmm2,%%xmm3 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pshufb %%xmm6,%%xmm3 \n" // 54657687 (far) - "pshufb %%xmm6,%%xmm1 \n" // 10213243 (far) - "paddw %%xmm0,%%xmm1 \n" // near+far - "paddw %%xmm2,%%xmm3 \n" // near+far - "paddw %%xmm0,%%xmm0 \n" // 2*near - "paddw %%xmm2,%%xmm2 \n" // 2*near - "paddw %%xmm1,%%xmm0 \n" // 3*near+far (1, lo) - "paddw %%xmm3,%%xmm2 \n" // 3*near+far (1, hi) - - // below line - "movdqu (%0,%3,2),%%xmm1 \n" // 01234567 (16) - "movdqu 2(%0,%3,2),%%xmm4 \n" // 12345678 (16) - "movdqa %%xmm1,%%xmm3 \n" - "punpckhwd %%xmm4,%%xmm3 \n" // 45566778 (16) - "punpcklwd %%xmm4,%%xmm1 \n" // 01122334 (16) - "movdqa %%xmm3,%%xmm5 \n" - "movdqa %%xmm1,%%xmm4 \n" - "pshufb %%xmm6,%%xmm5 \n" // 54657687 (far) - "pshufb %%xmm6,%%xmm4 \n" // 10213243 (far) - "paddw %%xmm1,%%xmm4 \n" // near+far - "paddw %%xmm3,%%xmm5 \n" // near+far - "paddw %%xmm1,%%xmm1 \n" // 2*near - "paddw %%xmm3,%%xmm3 \n" // 2*near - "paddw %%xmm4,%%xmm1 \n" // 3*near+far (2, lo) - "paddw %%xmm5,%%xmm3 \n" // 3*near+far (2, hi) - - // xmm0 xmm2 - // xmm1 xmm3 - - "movdqa %%xmm0,%%xmm4 \n" - "movdqa %%xmm1,%%xmm5 \n" - "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (1, lo) - "paddw %%xmm7,%%xmm5 \n" // 3*near+far+8 (2, lo) - "paddw %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo) - "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo) - "psrlw $4,%%xmm4 \n" // ^ div by 16 - "movdqu %%xmm4,(%1) \n" - - "movdqa %%xmm2,%%xmm4 \n" - "movdqa %%xmm3,%%xmm5 \n" - "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (1, hi) - "paddw %%xmm7,%%xmm5 \n" // 3*near+far+8 (2, hi) - "paddw %%xmm2,%%xmm4 \n" // 9*near+3*far (1, hi) - "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, hi) - "psrlw $4,%%xmm4 \n" // ^ div by 16 - "movdqu %%xmm4,0x10(%1) \n" - - "movdqa %%xmm1,%%xmm4 \n" - "paddw %%xmm7,%%xmm0 \n" // 3*near+far+8 (1, lo) - "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (2, lo) - "paddw %%xmm4,%%xmm1 \n" // 9*near+3*far (2, lo) - "paddw %%xmm0,%%xmm1 \n" // 9 3 3 1 + 8 (2, lo) - "psrlw $4,%%xmm1 \n" // ^ div by 16 - "movdqu %%xmm1,(%1,%4,2) \n" - - "movdqa %%xmm3,%%xmm4 \n" - "paddw %%xmm7,%%xmm2 \n" // 3*near+far+8 (1, hi) - "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (2, hi) - "paddw %%xmm4,%%xmm3 \n" // 9*near+3*far (2, hi) - "paddw %%xmm2,%%xmm3 \n" // 9 3 3 1 + 8 (2, hi) - "psrlw $4,%%xmm3 \n" // ^ div by 16 - "movdqu %%xmm3,0x10(%1,%4,2) \n" - - "lea 0x10(%0),%0 \n" - "lea 0x20(%1),%1 \n" // 8 sample to 16 sample - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(src_stride), // %3 - "r"(dst_stride), // %4 - "m"(kLinearShuffleFar) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif - -#ifdef HAS_SCALEROWUP2_LINEAR_16_SSE2 -void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width) { - asm volatile( - "pxor %%xmm5,%%xmm5 \n" - "pcmpeqd %%xmm4,%%xmm4 \n" - "psrld $31,%%xmm4 \n" - "pslld $1,%%xmm4 \n" // all 2 - - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" // 0123 (16b) - "movq 2(%0),%%xmm1 \n" // 1234 (16b) - - "punpcklwd %%xmm5,%%xmm0 \n" // 0123 (32b) - "punpcklwd %%xmm5,%%xmm1 \n" // 1234 (32b) - - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - - "pshufd $0b10110001,%%xmm2,%%xmm2 \n" // 1032 (even, far) - "pshufd $0b10110001,%%xmm3,%%xmm3 \n" // 2143 (odd, far) - - "paddd %%xmm4,%%xmm2 \n" // far+2 (lo) - "paddd %%xmm4,%%xmm3 \n" // far+2 (hi) - "paddd %%xmm0,%%xmm2 \n" // near+far+2 (lo) - "paddd %%xmm1,%%xmm3 \n" // near+far+2 (hi) - "paddd %%xmm0,%%xmm0 \n" // 2*near (lo) - "paddd %%xmm1,%%xmm1 \n" // 2*near (hi) - "paddd %%xmm2,%%xmm0 \n" // 3*near+far+2 (lo) - "paddd %%xmm3,%%xmm1 \n" // 3*near+far+2 (hi) - - "psrld $2,%%xmm0 \n" // 3/4*near+1/4*far (lo) - "psrld $2,%%xmm1 \n" // 3/4*near+1/4*far (hi) - "packssdw %%xmm1,%%xmm0 \n" - "pshufd $0b11011000,%%xmm0,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - - "lea 0x8(%0),%0 \n" - "lea 0x10(%1),%1 \n" // 4 pixel to 8 pixel - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} -#endif - -#ifdef HAS_SCALEROWUP2_BILINEAR_16_SSE2 -void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - asm volatile( - "pxor %%xmm7,%%xmm7 \n" - "pcmpeqd %%xmm6,%%xmm6 \n" - "psrld $31,%%xmm6 \n" - "pslld $3,%%xmm6 \n" // all 8 - - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" // 0011 (16b, 1u1v) - "movq 4(%0),%%xmm1 \n" // 1122 (16b, 1u1v) - "punpcklwd %%xmm7,%%xmm0 \n" // 0011 (near) (32b, 1u1v) - "punpcklwd %%xmm7,%%xmm1 \n" // 1122 (near) (32b, 1u1v) - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - "pshufd $0b01001110,%%xmm2,%%xmm2 \n" // 1100 (far) (1, lo) - "pshufd $0b01001110,%%xmm3,%%xmm3 \n" // 2211 (far) (1, hi) - "paddd %%xmm0,%%xmm2 \n" // near+far (1, lo) - "paddd %%xmm1,%%xmm3 \n" // near+far (1, hi) - "paddd %%xmm0,%%xmm0 \n" // 2*near (1, lo) - "paddd %%xmm1,%%xmm1 \n" // 2*near (1, hi) - "paddd %%xmm2,%%xmm0 \n" // 3*near+far (1, lo) - "paddd %%xmm3,%%xmm1 \n" // 3*near+far (1, hi) - - "movq (%0),%%xmm0 \n" // 0123 (16b) - "movq 2(%0),%%xmm1 \n" // 1234 (16b) - "punpcklwd %%xmm7,%%xmm0 \n" // 0123 (32b) - "punpcklwd %%xmm7,%%xmm1 \n" // 1234 (32b) - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - "pshufd $0b10110001,%%xmm2,%%xmm2 \n" // 1032 (even, far) - "pshufd $0b10110001,%%xmm3,%%xmm3 \n" // 2143 (odd, far) - "paddd %%xmm0,%%xmm2 \n" // near+far (lo) - "paddd %%xmm1,%%xmm3 \n" // near+far (hi) - "paddd %%xmm0,%%xmm0 \n" // 2*near (lo) - "paddd %%xmm1,%%xmm1 \n" // 2*near (hi) - "paddd %%xmm2,%%xmm0 \n" // 3*near+far (1, lo) - "paddd %%xmm3,%%xmm1 \n" // 3*near+far (1, hi) - - "movq (%0,%3,2),%%xmm2 \n" - "movq 2(%0,%3,2),%%xmm3 \n" - "punpcklwd %%xmm7,%%xmm2 \n" // 0123 (32b) - "punpcklwd %%xmm7,%%xmm3 \n" // 1234 (32b) - "movdqa %%xmm2,%%xmm4 \n" - "movdqa %%xmm3,%%xmm5 \n" - "pshufd $0b10110001,%%xmm4,%%xmm4 \n" // 1032 (even, far) - "pshufd $0b10110001,%%xmm5,%%xmm5 \n" // 2143 (odd, far) - "paddd %%xmm2,%%xmm4 \n" // near+far (lo) - "paddd %%xmm3,%%xmm5 \n" // near+far (hi) - "paddd %%xmm2,%%xmm2 \n" // 2*near (lo) - "paddd %%xmm3,%%xmm3 \n" // 2*near (hi) - "paddd %%xmm4,%%xmm2 \n" // 3*near+far (2, lo) - "paddd %%xmm5,%%xmm3 \n" // 3*near+far (2, hi) - - "movdqa %%xmm0,%%xmm4 \n" - "movdqa %%xmm2,%%xmm5 \n" - "paddd %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo) - "paddd %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo) - "paddd %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo) - "paddd %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo) - "psrld $4,%%xmm4 \n" // ^ div by 16 (1, lo) - - "movdqa %%xmm2,%%xmm5 \n" - "paddd %%xmm2,%%xmm5 \n" // 6*near+2*far (2, lo) - "paddd %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo) - "paddd %%xmm2,%%xmm5 \n" // 9*near+3*far (2, lo) - "paddd %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo) - "psrld $4,%%xmm5 \n" // ^ div by 16 (2, lo) - - "movdqa %%xmm1,%%xmm0 \n" - "movdqa %%xmm3,%%xmm2 \n" - "paddd %%xmm1,%%xmm0 \n" // 6*near+2*far (1, hi) - "paddd %%xmm6,%%xmm2 \n" // 3*near+far+8 (2, hi) - "paddd %%xmm1,%%xmm0 \n" // 9*near+3*far (1, hi) - "paddd %%xmm2,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi) - "psrld $4,%%xmm0 \n" // ^ div by 16 (1, hi) - - "movdqa %%xmm3,%%xmm2 \n" - "paddd %%xmm3,%%xmm2 \n" // 6*near+2*far (2, hi) - "paddd %%xmm6,%%xmm1 \n" // 3*near+far+8 (1, hi) - "paddd %%xmm3,%%xmm2 \n" // 9*near+3*far (2, hi) - "paddd %%xmm1,%%xmm2 \n" // 9 3 3 1 + 8 (2, hi) - "psrld $4,%%xmm2 \n" // ^ div by 16 (2, hi) - - "packssdw %%xmm0,%%xmm4 \n" - "pshufd $0b11011000,%%xmm4,%%xmm4 \n" - "movdqu %%xmm4,(%1) \n" // store above - "packssdw %%xmm2,%%xmm5 \n" - "pshufd $0b11011000,%%xmm5,%%xmm5 \n" - "movdqu %%xmm5,(%1,%4,2) \n" // store below - - "lea 0x8(%0),%0 \n" - "lea 0x10(%1),%1 \n" // 4 pixel to 8 pixel - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(src_stride), // %3 - "r"(dst_stride) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif - -#ifdef HAS_SCALEROWUP2_LINEAR_SSSE3 -void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width) { - asm volatile( - "pcmpeqw %%xmm4,%%xmm4 \n" - "psrlw $15,%%xmm4 \n" - "psllw $1,%%xmm4 \n" // all 2 - "movdqa %3,%%xmm3 \n" - - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" // 01234567 - "movq 1(%0),%%xmm1 \n" // 12345678 - "punpcklwd %%xmm0,%%xmm0 \n" // 0101232345456767 - "punpcklwd %%xmm1,%%xmm1 \n" // 1212343456567878 - "movdqa %%xmm0,%%xmm2 \n" - "punpckhdq %%xmm1,%%xmm2 \n" // 4545565667677878 - "punpckldq %%xmm1,%%xmm0 \n" // 0101121223233434 - "pmaddubsw %%xmm3,%%xmm2 \n" // 3*near+far (hi) - "pmaddubsw %%xmm3,%%xmm0 \n" // 3*near+far (lo) - "paddw %%xmm4,%%xmm0 \n" // 3*near+far+2 (lo) - "paddw %%xmm4,%%xmm2 \n" // 3*near+far+2 (hi) - "psrlw $2,%%xmm0 \n" // 3/4*near+1/4*far (lo) - "psrlw $2,%%xmm2 \n" // 3/4*near+1/4*far (hi) - "packuswb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x8(%0),%0 \n" - "lea 0x10(%1),%1 \n" // 8 sample to 16 sample - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "m"(kLinearMadd31) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); -} -#endif - -#ifdef HAS_SCALEROWUP2_BILINEAR_SSSE3 -void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - asm volatile( - "pcmpeqw %%xmm6,%%xmm6 \n" - "psrlw $15,%%xmm6 \n" - "psllw $3,%%xmm6 \n" // all 8 - "movdqa %5,%%xmm7 \n" - - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" // 01234567 - "movq 1(%0),%%xmm1 \n" // 12345678 - "punpcklwd %%xmm0,%%xmm0 \n" // 0101232345456767 - "punpcklwd %%xmm1,%%xmm1 \n" // 1212343456567878 - "movdqa %%xmm0,%%xmm2 \n" - "punpckhdq %%xmm1,%%xmm2 \n" // 4545565667677878 - "punpckldq %%xmm1,%%xmm0 \n" // 0101121223233434 - "pmaddubsw %%xmm7,%%xmm2 \n" // 3*near+far (1, hi) - "pmaddubsw %%xmm7,%%xmm0 \n" // 3*near+far (1, lo) - - "movq (%0,%3),%%xmm1 \n" - "movq 1(%0,%3),%%xmm4 \n" - "punpcklwd %%xmm1,%%xmm1 \n" - "punpcklwd %%xmm4,%%xmm4 \n" - "movdqa %%xmm1,%%xmm3 \n" - "punpckhdq %%xmm4,%%xmm3 \n" - "punpckldq %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm7,%%xmm3 \n" // 3*near+far (2, hi) - "pmaddubsw %%xmm7,%%xmm1 \n" // 3*near+far (2, lo) - - // xmm0 xmm2 - // xmm1 xmm3 - - "movdqa %%xmm0,%%xmm4 \n" - "movdqa %%xmm1,%%xmm5 \n" - "paddw %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo) - "paddw %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo) - "paddw %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo) - "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo) - "psrlw $4,%%xmm4 \n" // ^ div by 16 (1, lo) - - "movdqa %%xmm1,%%xmm5 \n" - "paddw %%xmm1,%%xmm5 \n" // 6*near+2*far (2, lo) - "paddw %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo) - "paddw %%xmm1,%%xmm5 \n" // 9*near+3*far (2, lo) - "paddw %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo) - "psrlw $4,%%xmm5 \n" // ^ div by 16 (2, lo) - - "movdqa %%xmm2,%%xmm0 \n" - "movdqa %%xmm3,%%xmm1 \n" - "paddw %%xmm2,%%xmm0 \n" // 6*near+2*far (1, hi) - "paddw %%xmm6,%%xmm1 \n" // 3*near+far+8 (2, hi) - "paddw %%xmm2,%%xmm0 \n" // 9*near+3*far (1, hi) - "paddw %%xmm1,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi) - "psrlw $4,%%xmm0 \n" // ^ div by 16 (1, hi) - - "movdqa %%xmm3,%%xmm1 \n" - "paddw %%xmm3,%%xmm1 \n" // 6*near+2*far (2, hi) - "paddw %%xmm6,%%xmm2 \n" // 3*near+far+8 (1, hi) - "paddw %%xmm3,%%xmm1 \n" // 9*near+3*far (2, hi) - "paddw %%xmm2,%%xmm1 \n" // 9 3 3 1 + 8 (2, hi) - "psrlw $4,%%xmm1 \n" // ^ div by 16 (2, hi) - - "packuswb %%xmm0,%%xmm4 \n" - "movdqu %%xmm4,(%1) \n" // store above - "packuswb %%xmm1,%%xmm5 \n" - "movdqu %%xmm5,(%1,%4) \n" // store below - - "lea 0x8(%0),%0 \n" - "lea 0x10(%1),%1 \n" // 8 sample to 16 sample - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(src_stride), // %3 - "r"(dst_stride), // %4 - "m"(kLinearMadd31) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif #ifdef HAS_SCALEROWUP2_LINEAR_AVX2 void ScaleRowUp2_Linear_AVX2(const uint8_t* src_ptr, @@ -1756,34 +695,7 @@ void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, #endif // Reads 16xN bytes and produces 16 shorts at a time. -void ScaleAddRow_SSE2(const uint8_t* src_ptr, - uint16_t* dst_ptr, - int src_width) { - asm volatile("pxor %%xmm5,%%xmm5 \n" - // 16 pixel loop. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm3 \n" - "lea 0x10(%0),%0 \n" // src_ptr += 16 - "movdqu (%1),%%xmm0 \n" - "movdqu 0x10(%1),%%xmm1 \n" - "movdqa %%xmm3,%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "punpckhbw %%xmm5,%%xmm3 \n" - "paddusw %%xmm2,%%xmm0 \n" - "paddusw %%xmm3,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(src_width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); -} #ifdef HAS_SCALEADDROW_AVX2 // Reads 32 bytes and accumulates to 32 shorts at a time. @@ -1825,383 +737,31 @@ static const uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040}; // Bilinear column filtering. SSSE3 version. -void ScaleFilterCols_SSSE3(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x, - int dx) { - intptr_t x0, x1, temp_pixel; - asm volatile( - "movd %6,%%xmm2 \n" - "movd %7,%%xmm3 \n" - "movl $0x04040000,%k2 \n" - "movd %k2,%%xmm5 \n" - "pcmpeqb %%xmm6,%%xmm6 \n" - "psrlw $0x9,%%xmm6 \n" // 0x007f007f - "pcmpeqb %%xmm7,%%xmm7 \n" - "psrlw $15,%%xmm7 \n" // 0x00010001 - "pextrw $0x1,%%xmm2,%k3 \n" - "subl $0x2,%5 \n" - "jl 29f \n" - "movdqa %%xmm2,%%xmm0 \n" - "paddd %%xmm3,%%xmm0 \n" - "punpckldq %%xmm0,%%xmm2 \n" - "punpckldq %%xmm3,%%xmm3 \n" - "paddd %%xmm3,%%xmm3 \n" - "pextrw $0x3,%%xmm2,%k4 \n" - - LABELALIGN - "2: \n" - "movdqa %%xmm2,%%xmm1 \n" - "paddd %%xmm3,%%xmm2 \n" - "movzwl 0x00(%1,%3,1),%k2 \n" - "movd %k2,%%xmm0 \n" - "psrlw $0x9,%%xmm1 \n" - "movzwl 0x00(%1,%4,1),%k2 \n" - "movd %k2,%%xmm4 \n" - "pshufb %%xmm5,%%xmm1 \n" - "punpcklwd %%xmm4,%%xmm0 \n" - "psubb %8,%%xmm0 \n" // make pixels signed. - "pxor %%xmm6,%%xmm1 \n" // 128 - f = (f ^ 127 ) + - // 1 - "paddusb %%xmm7,%%xmm1 \n" - "pmaddubsw %%xmm0,%%xmm1 \n" - "pextrw $0x1,%%xmm2,%k3 \n" - "pextrw $0x3,%%xmm2,%k4 \n" - "paddw %9,%%xmm1 \n" // make pixels unsigned. - "psrlw $0x7,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movd %%xmm1,%k2 \n" - "mov %w2,(%0) \n" - "lea 0x2(%0),%0 \n" - "subl $0x2,%5 \n" - "jge 2b \n" - - LABELALIGN - "29: \n" - "addl $0x1,%5 \n" - "jl 99f \n" - "movzwl 0x00(%1,%3,1),%k2 \n" - "movd %k2,%%xmm0 \n" - "psrlw $0x9,%%xmm2 \n" - "pshufb %%xmm5,%%xmm2 \n" - "psubb %8,%%xmm0 \n" // make pixels signed. - "pxor %%xmm6,%%xmm2 \n" - "paddusb %%xmm7,%%xmm2 \n" - "pmaddubsw %%xmm0,%%xmm2 \n" - "paddw %9,%%xmm2 \n" // make pixels unsigned. - "psrlw $0x7,%%xmm2 \n" - "packuswb %%xmm2,%%xmm2 \n" - "movd %%xmm2,%k2 \n" - "mov %b2,(%0) \n" - "99: \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "=&a"(temp_pixel), // %2 - "=&r"(x0), // %3 - "=&r"(x1), // %4 -#if defined(__x86_64__) - "+rm"(dst_width) // %5 -#else - "+m"(dst_width) // %5 -#endif - : "rm"(x), // %6 - "rm"(dx), // %7 -#if defined(__x86_64__) - "x"(kFsub80), // %8 - "x"(kFadd40) // %9 -#else - "m"(kFsub80), // %8 - "m"(kFadd40) // %9 -#endif - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} // Reads 4 pixels, duplicates them and writes 8 pixels. // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. -void ScaleColsUp2_SSE2(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x, - int dx) { - (void)x; - (void)dx; - asm volatile( - "1: \n" - "movdqu (%1),%%xmm0 \n" - "lea 0x10(%1),%1 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm1 \n" - "movdqu %%xmm0,(%0) \n" - "movdqu %%xmm1,0x10(%0) \n" - "lea 0x20(%0),%0 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "xmm0", "xmm1"); -} -void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width) { - (void)src_stride; - asm volatile( - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "shufps $0xdd,%%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "xmm0", "xmm1"); -} -void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width) { - (void)src_stride; - asm volatile( - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm2 \n" - "pavgb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "xmm0", "xmm1"); -} -void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width) { - asm volatile( - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x00(%0,%3,1),%%xmm2 \n" - "movdqu 0x10(%0,%3,1),%%xmm3 \n" - "lea 0x20(%0),%0 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm2 \n" - "pavgb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - : "r"(src_stride) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); -} + + + // Reads 4 pixels at a time. // Alignment requirement: dst_argb 16 byte aligned. -void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_argb, - int dst_width) { - ptrdiff_t src_stepx_x4 = (ptrdiff_t)(src_stepx); - ptrdiff_t src_stepx_x12; - (void)src_stride; - asm volatile( - "lea 0x00(,%1,4),%1 \n" - "lea 0x00(%1,%1,2),%4 \n" - LABELALIGN - "1: \n" - "movd (%0),%%xmm0 \n" - "movd 0x00(%0,%1,1),%%xmm1 \n" - "punpckldq %%xmm1,%%xmm0 \n" - "movd 0x00(%0,%1,2),%%xmm2 \n" - "movd 0x00(%0,%4,1),%%xmm3 \n" - "lea 0x00(%0,%1,4),%0 \n" - "punpckldq %%xmm3,%%xmm2 \n" - "punpcklqdq %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%3 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_stepx_x4), // %1 - "+r"(dst_argb), // %2 - "+r"(dst_width), // %3 - "=&r"(src_stepx_x12) // %4 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); -} // Blends four 2x2 to 4x1. // Alignment requirement: dst_argb 16 byte aligned. -void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_argb, - int dst_width) { - ptrdiff_t src_stepx_x4 = (ptrdiff_t)(src_stepx); - ptrdiff_t src_stepx_x12; - asm volatile( - "lea 0x00(,%1,4),%1 \n" - "lea 0x00(%1,%1,2),%4 \n" - "lea 0x00(%0,%5,1),%5 \n" - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" - "movhps 0x00(%0,%1,1),%%xmm0 \n" - "movq 0x00(%0,%1,2),%%xmm1 \n" - "movhps 0x00(%0,%4,1),%%xmm1 \n" - "lea 0x00(%0,%1,4),%0 \n" - "movq (%5),%%xmm2 \n" - "movhps 0x00(%5,%1,1),%%xmm2 \n" - "movq 0x00(%5,%1,2),%%xmm3 \n" - "movhps 0x00(%5,%4,1),%%xmm3 \n" - "lea 0x00(%5,%1,4),%5 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm2 \n" - "pavgb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%3 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_stepx_x4), // %1 - "+r"(dst_argb), // %2 - "+rm"(dst_width), // %3 - "=&r"(src_stepx_x12), // %4 - "+r"(src_stride) // %5 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); -} -void ScaleARGBCols_SSE2(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx) { - intptr_t x0, x1; - asm volatile( - "movd %5,%%xmm2 \n" - "movd %6,%%xmm3 \n" - "pshufd $0x0,%%xmm2,%%xmm2 \n" - "pshufd $0x11,%%xmm3,%%xmm0 \n" - "paddd %%xmm0,%%xmm2 \n" - "paddd %%xmm3,%%xmm3 \n" - "pshufd $0x5,%%xmm3,%%xmm0 \n" - "paddd %%xmm0,%%xmm2 \n" - "paddd %%xmm3,%%xmm3 \n" - "pshufd $0x0,%%xmm3,%%xmm3 \n" - "pextrw $0x1,%%xmm2,%k0 \n" - "pextrw $0x3,%%xmm2,%k1 \n" - "cmp $0x0,%4 \n" - "jl 99f \n" - "sub $0x4,%4 \n" - "jl 49f \n" - LABELALIGN - "40: \n" - "movd 0x00(%3,%0,4),%%xmm0 \n" - "movd 0x00(%3,%1,4),%%xmm1 \n" - "pextrw $0x5,%%xmm2,%k0 \n" - "pextrw $0x7,%%xmm2,%k1 \n" - "paddd %%xmm3,%%xmm2 \n" - "punpckldq %%xmm1,%%xmm0 \n" - "movd 0x00(%3,%0,4),%%xmm1 \n" - "movd 0x00(%3,%1,4),%%xmm4 \n" - "pextrw $0x1,%%xmm2,%k0 \n" - "pextrw $0x3,%%xmm2,%k1 \n" - "punpckldq %%xmm4,%%xmm1 \n" - "punpcklqdq %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%4 \n" - "jge 40b \n" - - "49: \n" - "test $0x2,%4 \n" - "je 29f \n" - "movd 0x00(%3,%0,4),%%xmm0 \n" - "movd 0x00(%3,%1,4),%%xmm1 \n" - "pextrw $0x5,%%xmm2,%k0 \n" - "punpckldq %%xmm1,%%xmm0 \n" - "movq %%xmm0,(%2) \n" - "lea 0x8(%2),%2 \n" - "29: \n" - "test $0x1,%4 \n" - "je 99f \n" - "movd 0x00(%3,%0,4),%%xmm0 \n" - "movd %%xmm0,(%2) \n" - "99: \n" - : "=&a"(x0), // %0 - "=&d"(x1), // %1 - "+r"(dst_argb), // %2 - "+r"(src_argb), // %3 - "+r"(dst_width) // %4 - : "rm"(x), // %5 - "rm"(dx) // %6 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); -} // Reads 4 pixels, duplicates them and writes 8 pixels. // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. -void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx) { - (void)x; - (void)dx; - asm volatile( - "1: \n" - "movdqu (%1),%%xmm0 \n" - "lea 0x10(%1),%1 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpckldq %%xmm0,%%xmm0 \n" - "punpckhdq %%xmm1,%%xmm1 \n" - "movdqu %%xmm0,(%0) \n" - "movdqu %%xmm1,0x10(%0) \n" - "lea 0x20(%0),%0 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(dst_argb), // %0 - "+r"(src_argb), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "xmm0", "xmm1"); -} // Shuffle table for arranging 2 pixels into pairs for pmaddubsw static const uvec8 kShuffleColARGB = { @@ -2215,80 +775,7 @@ static const uvec8 kShuffleFractions = { }; // Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version -void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx) { - intptr_t x0, x1; - asm volatile( - "movdqa %0,%%xmm4 \n" - "movdqa %1,%%xmm5 \n" - : - : "m"(kShuffleColARGB), // %0 - "m"(kShuffleFractions) // %1 - ); - asm volatile( - "movd %5,%%xmm2 \n" - "movd %6,%%xmm3 \n" - "pcmpeqb %%xmm6,%%xmm6 \n" - "psrlw $0x9,%%xmm6 \n" - "pextrw $0x1,%%xmm2,%k3 \n" - "sub $0x2,%2 \n" - "jl 29f \n" - "movdqa %%xmm2,%%xmm0 \n" - "paddd %%xmm3,%%xmm0 \n" - "punpckldq %%xmm0,%%xmm2 \n" - "punpckldq %%xmm3,%%xmm3 \n" - "paddd %%xmm3,%%xmm3 \n" - "pextrw $0x3,%%xmm2,%k4 \n" - - LABELALIGN - "2: \n" - "movdqa %%xmm2,%%xmm1 \n" - "paddd %%xmm3,%%xmm2 \n" - "movq 0x00(%1,%3,4),%%xmm0 \n" - "psrlw $0x9,%%xmm1 \n" - "movhps 0x00(%1,%4,4),%%xmm0 \n" - "pshufb %%xmm5,%%xmm1 \n" - "pshufb %%xmm4,%%xmm0 \n" - "pxor %%xmm6,%%xmm1 \n" - "pmaddubsw %%xmm1,%%xmm0 \n" - "psrlw $0x7,%%xmm0 \n" - "pextrw $0x1,%%xmm2,%k3 \n" - "pextrw $0x3,%%xmm2,%k4 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,(%0) \n" - "lea 0x8(%0),%0 \n" - "sub $0x2,%2 \n" - "jge 2b \n" - - LABELALIGN - "29: \n" - "add $0x1,%2 \n" - "jl 99f \n" - "psrlw $0x9,%%xmm2 \n" - "movq 0x00(%1,%3,4),%%xmm0 \n" - "pshufb %%xmm5,%%xmm2 \n" - "pshufb %%xmm4,%%xmm0 \n" - "pxor %%xmm6,%%xmm2 \n" - "pmaddubsw %%xmm2,%%xmm0 \n" - "psrlw $0x7,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movd %%xmm0,(%0) \n" - - LABELALIGN "99: \n" - - : "+r"(dst_argb), // %0 - "+r"(src_argb), // %1 - "+rm"(dst_width), // %2 - "=&r"(x0), // %3 - "=&r"(x1) // %4 - : "rm"(x), // %5 - "rm"(dx) // %6 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} // Divide num by div and return as 16.16 fixed point result. int FixedDiv_X86(int num, int div) { @@ -2321,7 +808,7 @@ int FixedDiv1_X86(int num, int div) { return num; } -#if defined(HAS_SCALEUVROWDOWN2BOX_SSSE3) || \ +#if 1 || \ defined(HAS_SCALEUVROWDOWN2BOX_AVX2) // Shuffle table for splitting UV into upper and lower part of register. @@ -2332,46 +819,6 @@ static const uvec8 kShuffleMergeUV = {0u, 8u, 2u, 10u, 4u, 12u, 0x80, 0x80, 0x80, 0x80}; #endif -#ifdef HAS_SCALEUVROWDOWN2BOX_SSSE3 - -void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - asm volatile( - "pcmpeqb %%xmm4,%%xmm4 \n" // 01010101 - "psrlw $0xf,%%xmm4 \n" - "packuswb %%xmm4,%%xmm4 \n" - "pxor %%xmm5, %%xmm5 \n" // zero - "movdqa %4,%%xmm1 \n" // split shuffler - "movdqa %5,%%xmm3 \n" // merge shuffler - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" // 8 UV row 0 - "movdqu 0x00(%0,%3,1),%%xmm2 \n" // 8 UV row 1 - "lea 0x10(%0),%0 \n" - "pshufb %%xmm1,%%xmm0 \n" // uuuuvvvv - "pshufb %%xmm1,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" // horizontal add - "pmaddubsw %%xmm4,%%xmm2 \n" - "paddw %%xmm2,%%xmm0 \n" // vertical add - "psrlw $0x1,%%xmm0 \n" // round - "pavgw %%xmm5,%%xmm0 \n" - "pshufb %%xmm3,%%xmm0 \n" // merge uv - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" // 4 UV - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(src_stride), // %3 - "m"(kShuffleSplitUV), // %4 - "m"(kShuffleMergeUV) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} -#endif // HAS_SCALEUVROWDOWN2BOX_SSSE3 #ifdef HAS_SCALEUVROWDOWN2BOX_AVX2 void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr, @@ -2417,129 +864,7 @@ void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr, static const uvec8 kUVLinearMadd31 = {3, 1, 3, 1, 1, 3, 1, 3, 3, 1, 3, 1, 1, 3, 1, 3}; -#ifdef HAS_SCALEUVROWUP2_LINEAR_SSSE3 -void ScaleUVRowUp2_Linear_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int dst_width) { - asm volatile( - "pcmpeqw %%xmm4,%%xmm4 \n" - "psrlw $15,%%xmm4 \n" - "psllw $1,%%xmm4 \n" // all 2 - "movdqa %3,%%xmm3 \n" - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" // 00112233 (1u1v) - "movq 2(%0),%%xmm1 \n" // 11223344 (1u1v) - "punpcklbw %%xmm1,%%xmm0 \n" // 0101121223233434 (2u2v) - "movdqa %%xmm0,%%xmm2 \n" - "punpckhdq %%xmm0,%%xmm2 \n" // 2323232334343434 (2u2v) - "punpckldq %%xmm0,%%xmm0 \n" // 0101010112121212 (2u2v) - "pmaddubsw %%xmm3,%%xmm2 \n" // 3*near+far (1u1v16, hi) - "pmaddubsw %%xmm3,%%xmm0 \n" // 3*near+far (1u1v16, lo) - "paddw %%xmm4,%%xmm0 \n" // 3*near+far+2 (lo) - "paddw %%xmm4,%%xmm2 \n" // 3*near+far+2 (hi) - "psrlw $2,%%xmm0 \n" // 3/4*near+1/4*far (lo) - "psrlw $2,%%xmm2 \n" // 3/4*near+1/4*far (hi) - "packuswb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - - "lea 0x8(%0),%0 \n" - "lea 0x10(%1),%1 \n" // 4 uv to 8 uv - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "m"(kUVLinearMadd31) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); -} -#endif - -#ifdef HAS_SCALEUVROWUP2_BILINEAR_SSSE3 -void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - asm volatile( - "pcmpeqw %%xmm6,%%xmm6 \n" - "psrlw $15,%%xmm6 \n" - "psllw $3,%%xmm6 \n" // all 8 - "movdqa %5,%%xmm7 \n" - - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" // 00112233 (1u1v) - "movq 2(%0),%%xmm1 \n" // 11223344 (1u1v) - "punpcklbw %%xmm1,%%xmm0 \n" // 0101121223233434 (2u2v) - "movdqa %%xmm0,%%xmm2 \n" - "punpckhdq %%xmm0,%%xmm2 \n" // 2323232334343434 (2u2v) - "punpckldq %%xmm0,%%xmm0 \n" // 0101010112121212 (2u2v) - "pmaddubsw %%xmm7,%%xmm2 \n" // 3*near+far (1u1v16, hi) - "pmaddubsw %%xmm7,%%xmm0 \n" // 3*near+far (1u1v16, lo) - - "movq (%0,%3),%%xmm1 \n" - "movq 2(%0,%3),%%xmm4 \n" - "punpcklbw %%xmm4,%%xmm1 \n" - "movdqa %%xmm1,%%xmm3 \n" - "punpckhdq %%xmm1,%%xmm3 \n" - "punpckldq %%xmm1,%%xmm1 \n" - "pmaddubsw %%xmm7,%%xmm3 \n" // 3*near+far (2, hi) - "pmaddubsw %%xmm7,%%xmm1 \n" // 3*near+far (2, lo) - - // xmm0 xmm2 - // xmm1 xmm3 - - "movdqa %%xmm0,%%xmm4 \n" - "movdqa %%xmm1,%%xmm5 \n" - "paddw %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo) - "paddw %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo) - "paddw %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo) - "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo) - "psrlw $4,%%xmm4 \n" // ^ div by 16 (1, lo) - - "movdqa %%xmm1,%%xmm5 \n" - "paddw %%xmm1,%%xmm5 \n" // 6*near+2*far (2, lo) - "paddw %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo) - "paddw %%xmm1,%%xmm5 \n" // 9*near+3*far (2, lo) - "paddw %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo) - "psrlw $4,%%xmm5 \n" // ^ div by 16 (2, lo) - - "movdqa %%xmm2,%%xmm0 \n" - "movdqa %%xmm3,%%xmm1 \n" - "paddw %%xmm2,%%xmm0 \n" // 6*near+2*far (1, hi) - "paddw %%xmm6,%%xmm1 \n" // 3*near+far+8 (2, hi) - "paddw %%xmm2,%%xmm0 \n" // 9*near+3*far (1, hi) - "paddw %%xmm1,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi) - "psrlw $4,%%xmm0 \n" // ^ div by 16 (1, hi) - - "movdqa %%xmm3,%%xmm1 \n" - "paddw %%xmm3,%%xmm1 \n" // 6*near+2*far (2, hi) - "paddw %%xmm6,%%xmm2 \n" // 3*near+far+8 (1, hi) - "paddw %%xmm3,%%xmm1 \n" // 9*near+3*far (2, hi) - "paddw %%xmm2,%%xmm1 \n" // 9 3 3 1 + 8 (2, hi) - "psrlw $4,%%xmm1 \n" // ^ div by 16 (2, hi) - - "packuswb %%xmm0,%%xmm4 \n" - "movdqu %%xmm4,(%1) \n" // store above - "packuswb %%xmm1,%%xmm5 \n" - "movdqu %%xmm5,(%1,%4) \n" // store below - - "lea 0x8(%0),%0 \n" - "lea 0x10(%1),%1 \n" // 4 uv to 8 uv - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(src_stride), // %3 - "r"(dst_stride), // %4 - "m"(kUVLinearMadd31) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif #ifdef HAS_SCALEUVROWUP2_LINEAR_AVX2 @@ -2665,148 +990,7 @@ void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr, } #endif -#ifdef HAS_SCALEUVROWUP2_LINEAR_16_SSE41 -void ScaleUVRowUp2_Linear_16_SSE41(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width) { - asm volatile( - "pxor %%xmm5,%%xmm5 \n" - "pcmpeqd %%xmm4,%%xmm4 \n" - "psrld $31,%%xmm4 \n" - "pslld $1,%%xmm4 \n" // all 2 - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" // 0011 (16b, 1u1v) - "movq 4(%0),%%xmm1 \n" // 1122 (16b, 1u1v) - - "punpcklwd %%xmm5,%%xmm0 \n" // 0011 (32b, 1u1v) - "punpcklwd %%xmm5,%%xmm1 \n" // 1122 (32b, 1u1v) - - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - - "pshufd $0b01001110,%%xmm2,%%xmm2 \n" // 1100 (lo, far) - "pshufd $0b01001110,%%xmm3,%%xmm3 \n" // 2211 (hi, far) - - "paddd %%xmm4,%%xmm2 \n" // far+2 (lo) - "paddd %%xmm4,%%xmm3 \n" // far+2 (hi) - "paddd %%xmm0,%%xmm2 \n" // near+far+2 (lo) - "paddd %%xmm1,%%xmm3 \n" // near+far+2 (hi) - "paddd %%xmm0,%%xmm0 \n" // 2*near (lo) - "paddd %%xmm1,%%xmm1 \n" // 2*near (hi) - "paddd %%xmm2,%%xmm0 \n" // 3*near+far+2 (lo) - "paddd %%xmm3,%%xmm1 \n" // 3*near+far+2 (hi) - - "psrld $2,%%xmm0 \n" // 3/4*near+1/4*far (lo) - "psrld $2,%%xmm1 \n" // 3/4*near+1/4*far (hi) - "packusdw %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - - "lea 0x8(%0),%0 \n" - "lea 0x10(%1),%1 \n" // 2 uv to 4 uv - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} -#endif - -#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_SSE41 -void ScaleUVRowUp2_Bilinear_16_SSE41(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - asm volatile( - "pxor %%xmm7,%%xmm7 \n" - "pcmpeqd %%xmm6,%%xmm6 \n" - "psrld $31,%%xmm6 \n" - "pslld $3,%%xmm6 \n" // all 8 - - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" // 0011 (16b, 1u1v) - "movq 4(%0),%%xmm1 \n" // 1122 (16b, 1u1v) - "punpcklwd %%xmm7,%%xmm0 \n" // 0011 (near) (32b, 1u1v) - "punpcklwd %%xmm7,%%xmm1 \n" // 1122 (near) (32b, 1u1v) - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - "pshufd $0b01001110,%%xmm2,%%xmm2 \n" // 1100 (far) (1, lo) - "pshufd $0b01001110,%%xmm3,%%xmm3 \n" // 2211 (far) (1, hi) - "paddd %%xmm0,%%xmm2 \n" // near+far (1, lo) - "paddd %%xmm1,%%xmm3 \n" // near+far (1, hi) - "paddd %%xmm0,%%xmm0 \n" // 2*near (1, lo) - "paddd %%xmm1,%%xmm1 \n" // 2*near (1, hi) - "paddd %%xmm2,%%xmm0 \n" // 3*near+far (1, lo) - "paddd %%xmm3,%%xmm1 \n" // 3*near+far (1, hi) - - "movq (%0,%3,2),%%xmm2 \n" - "movq 4(%0,%3,2),%%xmm3 \n" - "punpcklwd %%xmm7,%%xmm2 \n" - "punpcklwd %%xmm7,%%xmm3 \n" - "movdqa %%xmm2,%%xmm4 \n" - "movdqa %%xmm3,%%xmm5 \n" - "pshufd $0b01001110,%%xmm4,%%xmm4 \n" // 1100 (far) (2, lo) - "pshufd $0b01001110,%%xmm5,%%xmm5 \n" // 2211 (far) (2, hi) - "paddd %%xmm2,%%xmm4 \n" // near+far (2, lo) - "paddd %%xmm3,%%xmm5 \n" // near+far (2, hi) - "paddd %%xmm2,%%xmm2 \n" // 2*near (2, lo) - "paddd %%xmm3,%%xmm3 \n" // 2*near (2, hi) - "paddd %%xmm4,%%xmm2 \n" // 3*near+far (2, lo) - "paddd %%xmm5,%%xmm3 \n" // 3*near+far (2, hi) - - "movdqa %%xmm0,%%xmm4 \n" - "movdqa %%xmm2,%%xmm5 \n" - "paddd %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo) - "paddd %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo) - "paddd %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo) - "paddd %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo) - "psrld $4,%%xmm4 \n" // ^ div by 16 (1, lo) - - "movdqa %%xmm2,%%xmm5 \n" - "paddd %%xmm2,%%xmm5 \n" // 6*near+2*far (2, lo) - "paddd %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo) - "paddd %%xmm2,%%xmm5 \n" // 9*near+3*far (2, lo) - "paddd %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo) - "psrld $4,%%xmm5 \n" // ^ div by 16 (2, lo) - - "movdqa %%xmm1,%%xmm0 \n" - "movdqa %%xmm3,%%xmm2 \n" - "paddd %%xmm1,%%xmm0 \n" // 6*near+2*far (1, hi) - "paddd %%xmm6,%%xmm2 \n" // 3*near+far+8 (2, hi) - "paddd %%xmm1,%%xmm0 \n" // 9*near+3*far (1, hi) - "paddd %%xmm2,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi) - "psrld $4,%%xmm0 \n" // ^ div by 16 (1, hi) - - "movdqa %%xmm3,%%xmm2 \n" - "paddd %%xmm3,%%xmm2 \n" // 6*near+2*far (2, hi) - "paddd %%xmm6,%%xmm1 \n" // 3*near+far+8 (1, hi) - "paddd %%xmm3,%%xmm2 \n" // 9*near+3*far (2, hi) - "paddd %%xmm1,%%xmm2 \n" // 9 3 3 1 + 8 (2, hi) - "psrld $4,%%xmm2 \n" // ^ div by 16 (2, hi) - - "packusdw %%xmm0,%%xmm4 \n" - "movdqu %%xmm4,(%1) \n" // store above - "packusdw %%xmm2,%%xmm5 \n" - "movdqu %%xmm5,(%1,%4,2) \n" // store below - - "lea 0x8(%0),%0 \n" - "lea 0x10(%1),%1 \n" // 2 uv to 4 uv - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(src_stride), // %3 - "r"(dst_stride) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif #ifdef HAS_SCALEUVROWUP2_LINEAR_16_AVX2 void ScaleUVRowUp2_Linear_16_AVX2(const uint16_t* src_ptr, diff --git a/source/scale_uv.cc b/source/scale_uv.cc index 3d41a2398..f80e476f2 100644 --- a/source/scale_uv.cc +++ b/source/scale_uv.cc @@ -88,11 +88,9 @@ static void ScaleUVDown2(int src_width, src_uv += (y >> 16) * (intptr_t)src_stride + ((x >> 16) - 1) * 2; } -#if defined(HAS_SCALEUVROWDOWN2BOX_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3) && filtering) { - ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_SSSE3; if (IS_ALIGNED(dst_width, 4)) { - ScaleUVRowDown2 = ScaleUVRowDown2Box_SSSE3; } } #endif @@ -138,7 +136,7 @@ static void ScaleUVDown2(int src_width, #endif // This code is not enabled. Only box filter is available at this time. -#if defined(HAS_SCALEUVROWDOWN2_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { ScaleUVRowDown2 = filtering == kFilterNone @@ -200,11 +198,9 @@ static int ScaleUVDown4Box(int src_width, assert(dx == 65536 * 4); // Test scale factor of 4. assert((dy & 0x3ffff) == 0); // Test vertical scale is multiple of 4. -#if defined(HAS_SCALEUVROWDOWN2BOX_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_SSSE3; if (IS_ALIGNED(dst_width, 4)) { - ScaleUVRowDown2 = ScaleUVRowDown2Box_SSSE3; } } #endif @@ -276,7 +272,7 @@ static void ScaleUVDownEven(int src_width, assert(IS_ALIGNED(src_width, 2)); assert(IS_ALIGNED(src_height, 2)); src_uv += (y >> 16) * (intptr_t)src_stride + (x >> 16) * 2; -#if defined(HAS_SCALEUVROWDOWNEVEN_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { ScaleUVRowDownEven = filtering ? ScaleUVRowDownEvenBox_Any_SSSE3 : ScaleUVRowDownEven_Any_SSSE3; @@ -363,11 +359,9 @@ static int ScaleUVBilinearDown(int src_width, clip_src_width = (int)(xr - xl) * 2; // Width aligned to 2. src_uv += xl * 2; x -= (int)(xl << 16); -#if defined(HAS_INTERPOLATEROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - InterpolateRow = InterpolateRow_Any_SSSE3; if (IS_ALIGNED(clip_src_width, 16)) { - InterpolateRow = InterpolateRow_SSSE3; } } #endif @@ -405,9 +399,8 @@ static int ScaleUVBilinearDown(int src_width, InterpolateRow = InterpolateRow_RVV; } #endif -#if defined(HAS_SCALEUVFILTERCOLS_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { - ScaleUVFilterCols = ScaleUVFilterCols_SSSE3; } #endif #if defined(HAS_SCALEUVFILTERCOLS_NEON) @@ -473,11 +466,9 @@ static int ScaleUVBilinearUp(int src_width, int dst_width, int x, int dx) = filtering ? ScaleUVFilterCols_C : ScaleUVCols_C; const int max_y = (src_height - 1) << 16; -#if defined(HAS_INTERPOLATEROW_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - InterpolateRow = InterpolateRow_Any_SSSE3; if (IS_ALIGNED(dst_width, 8)) { - InterpolateRow = InterpolateRow_SSSE3; } } #endif @@ -518,9 +509,8 @@ static int ScaleUVBilinearUp(int src_width, if (src_width >= 32768) { ScaleUVFilterCols = filtering ? ScaleUVFilterCols64_C : ScaleUVCols64_C; } -#if defined(HAS_SCALEUVFILTERCOLS_SSSE3) +#if 0 if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { - ScaleUVFilterCols = ScaleUVFilterCols_SSSE3; } #endif #if defined(HAS_SCALEUVFILTERCOLS_NEON) @@ -531,9 +521,8 @@ static int ScaleUVBilinearUp(int src_width, } } #endif -#if defined(HAS_SCALEUVCOLS_SSSE3) +#if 0 if (!filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { - ScaleUVFilterCols = ScaleUVCols_SSSE3; } #endif #if defined(HAS_SCALEUVCOLS_NEON) @@ -546,9 +535,8 @@ static int ScaleUVBilinearUp(int src_width, #endif if (!filtering && src_width * 2 == dst_width && x < 0x8000) { ScaleUVFilterCols = ScaleUVColsUp2_C; -#if defined(HAS_SCALEUVCOLSUP2_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(dst_width, 8)) { - ScaleUVFilterCols = ScaleUVColsUp2_SSSE3; } #endif } @@ -636,9 +624,8 @@ static void ScaleUVLinearUp2(int src_width, (void)src_width; assert(src_width == ((dst_width + 1) / 2)); -#ifdef HAS_SCALEUVROWUP2_LINEAR_SSSE3 +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - ScaleRowUp = ScaleUVRowUp2_Linear_Any_SSSE3; } #endif @@ -696,9 +683,8 @@ static void ScaleUVBilinearUp2(int src_width, assert(src_width == ((dst_width + 1) / 2)); assert(src_height == ((dst_height + 1) / 2)); -#ifdef HAS_SCALEUVROWUP2_BILINEAR_SSSE3 +#if 0 if (TestCpuFlag(kCpuHasSSSE3)) { - Scale2RowUp = ScaleUVRowUp2_Bilinear_Any_SSSE3; } #endif @@ -757,9 +743,8 @@ static void ScaleUVLinearUp2_16(int src_width, (void)src_width; assert(src_width == ((dst_width + 1) / 2)); -#ifdef HAS_SCALEUVROWUP2_LINEAR_16_SSE41 +#if 0 if (TestCpuFlag(kCpuHasSSE41)) { - ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_SSE41; } #endif @@ -811,9 +796,8 @@ static void ScaleUVBilinearUp2_16(int src_width, assert(src_width == ((dst_width + 1) / 2)); assert(src_height == ((dst_height + 1) / 2)); -#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_SSE41 +#if 0 if (TestCpuFlag(kCpuHasSSE41)) { - Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_SSE41; } #endif @@ -865,9 +849,8 @@ static void ScaleUVSimple(int src_width, int x, int dx) = (src_width >= 32768) ? ScaleUVCols64_C : ScaleUVCols_C; (void)src_height; -#if defined(HAS_SCALEUVCOLS_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { - ScaleUVCols = ScaleUVCols_SSSE3; } #endif #if defined(HAS_SCALEUVCOLS_NEON) @@ -880,9 +863,8 @@ static void ScaleUVSimple(int src_width, #endif if (src_width * 2 == dst_width && x < 0x8000) { ScaleUVCols = ScaleUVColsUp2_C; -#if defined(HAS_SCALEUVCOLSUP2_SSSE3) +#if 0 if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(dst_width, 8)) { - ScaleUVCols = ScaleUVColsUp2_SSSE3; } #endif } diff --git a/source/scale_win.cc b/source/scale_win.cc index 32c0506fa..9b31a335a 100644 --- a/source/scale_win.cc +++ b/source/scale_win.cc @@ -90,109 +90,13 @@ static const uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0}; // Reads 32 pixels, throws half away and writes 16 pixels. -__declspec(naked) void ScaleRowDown2_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - mov eax, [esp + 4] // src_ptr - // src_stride ignored - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width - wloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - psrlw xmm0, 8 // isolate odd pixels. - psrlw xmm1, 8 - packuswb xmm0, xmm1 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - jg wloop - - ret - } -} // Blends 32x1 rectangle to 16x1. -__declspec(naked) void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - mov eax, [esp + 4] // src_ptr - // src_stride - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width - pcmpeqb xmm4, xmm4 // constant 0x0101 - psrlw xmm4, 15 - packuswb xmm4, xmm4 - pxor xmm5, xmm5 // constant 0 - - wloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - pmaddubsw xmm0, xmm4 // horizontal add - pmaddubsw xmm1, xmm4 - pavgw xmm0, xmm5 // (x + 1) / 2 - pavgw xmm1, xmm5 - packuswb xmm0, xmm1 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - jg wloop - - ret - } -} // Blends 32x2 rectangle to 16x1. -__declspec(naked) void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width - pcmpeqb xmm4, xmm4 // constant 0x0101 - psrlw xmm4, 15 - packuswb xmm4, xmm4 - pxor xmm5, xmm5 // constant 0 - - wloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + esi] - movdqu xmm3, [eax + esi + 16] - lea eax, [eax + 32] - pmaddubsw xmm0, xmm4 // horizontal add - pmaddubsw xmm1, xmm4 - pmaddubsw xmm2, xmm4 - pmaddubsw xmm3, xmm4 - paddw xmm0, xmm2 // vertical add - paddw xmm1, xmm3 - psrlw xmm0, 1 - psrlw xmm1, 1 - pavgw xmm0, xmm5 // (x + 1) / 2 - pavgw xmm1, xmm5 - packuswb xmm0, xmm1 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - jg wloop - - pop esi - ret - } -} #ifdef HAS_SCALEROWDOWN2_AVX2 // Reads 64 pixels, throws half away and writes 32 pixels. @@ -310,94 +214,10 @@ __declspec(naked) void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr, #endif // HAS_SCALEROWDOWN2_AVX2 // Point samples 32 pixels to 8 pixels. -__declspec(naked) void ScaleRowDown4_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - mov eax, [esp + 4] // src_ptr - // src_stride ignored - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff0000 - psrld xmm5, 24 - pslld xmm5, 16 - wloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - pand xmm0, xmm5 - pand xmm1, xmm5 - packuswb xmm0, xmm1 - psrlw xmm0, 8 - packuswb xmm0, xmm0 - movq qword ptr [edx], xmm0 - lea edx, [edx + 8] - sub ecx, 8 - jg wloop - - ret - } -} // Blends 32x4 rectangle to 8x1. -__declspec(naked) void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_ptr - mov esi, [esp + 8 + 8] // src_stride - mov edx, [esp + 8 + 12] // dst_ptr - mov ecx, [esp + 8 + 16] // dst_width - lea edi, [esi + esi * 2] // src_stride * 3 - pcmpeqb xmm4, xmm4 // constant 0x0101 - psrlw xmm4, 15 - movdqa xmm5, xmm4 - packuswb xmm4, xmm4 - psllw xmm5, 3 // constant 0x0008 - wloop: - movdqu xmm0, [eax] // average rows - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + esi] - movdqu xmm3, [eax + esi + 16] - pmaddubsw xmm0, xmm4 // horizontal add - pmaddubsw xmm1, xmm4 - pmaddubsw xmm2, xmm4 - pmaddubsw xmm3, xmm4 - paddw xmm0, xmm2 // vertical add rows 0, 1 - paddw xmm1, xmm3 - movdqu xmm2, [eax + esi * 2] - movdqu xmm3, [eax + esi * 2 + 16] - pmaddubsw xmm2, xmm4 - pmaddubsw xmm3, xmm4 - paddw xmm0, xmm2 // add row 2 - paddw xmm1, xmm3 - movdqu xmm2, [eax + edi] - movdqu xmm3, [eax + edi + 16] - lea eax, [eax + 32] - pmaddubsw xmm2, xmm4 - pmaddubsw xmm3, xmm4 - paddw xmm0, xmm2 // add row 3 - paddw xmm1, xmm3 - phaddw xmm0, xmm1 - paddw xmm0, xmm5 // + 8 for round - psrlw xmm0, 4 // /16 for average of 4 * 4 - packuswb xmm0, xmm0 - movq qword ptr [edx], xmm0 - lea edx, [edx + 8] - sub ecx, 8 - jg wloop - - pop edi - pop esi - ret - } -} #ifdef HAS_SCALEROWDOWN4_AVX2 // Point samples 64 pixels to 16 pixels. @@ -500,38 +320,7 @@ __declspec(naked) void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr, // Produces three 8 byte values. For each 8 bytes, 16 bytes are read. // Then shuffled to do the scaling. -__declspec(naked) void ScaleRowDown34_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - mov eax, [esp + 4] // src_ptr - // src_stride ignored - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width - movdqa xmm3, xmmword ptr kShuf0 - movdqa xmm4, xmmword ptr kShuf1 - movdqa xmm5, xmmword ptr kShuf2 - wloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - movdqa xmm2, xmm1 - palignr xmm1, xmm0, 8 - pshufb xmm0, xmm3 - pshufb xmm1, xmm4 - pshufb xmm2, xmm5 - movq qword ptr [edx], xmm0 - movq qword ptr [edx + 8], xmm1 - movq qword ptr [edx + 16], xmm2 - lea edx, [edx + 24] - sub ecx, 24 - jg wloop - - ret - } -} // Blends 32x2 rectangle to 24x1 // Produces three 8 byte values. For each 8 bytes, 16 bytes are read. @@ -548,295 +337,24 @@ __declspec(naked) void ScaleRowDown34_SSSE3(const uint8_t* src_ptr, // xmm7 kRound34 // Note that movdqa+palign may be better than movdqu. -__declspec(naked) void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width - movdqa xmm2, xmmword ptr kShuf01 - movdqa xmm3, xmmword ptr kShuf11 - movdqa xmm4, xmmword ptr kShuf21 - movdqa xmm5, xmmword ptr kMadd01 - movdqa xmm6, xmmword ptr kMadd11 - movdqa xmm7, xmmword ptr kRound34 - wloop: - movdqu xmm0, [eax] // pixels 0..7 - movdqu xmm1, [eax + esi] - pavgb xmm0, xmm1 - pshufb xmm0, xmm2 - pmaddubsw xmm0, xmm5 - paddsw xmm0, xmm7 - psrlw xmm0, 2 - packuswb xmm0, xmm0 - movq qword ptr [edx], xmm0 - movdqu xmm0, [eax + 8] // pixels 8..15 - movdqu xmm1, [eax + esi + 8] - pavgb xmm0, xmm1 - pshufb xmm0, xmm3 - pmaddubsw xmm0, xmm6 - paddsw xmm0, xmm7 - psrlw xmm0, 2 - packuswb xmm0, xmm0 - movq qword ptr [edx + 8], xmm0 - movdqu xmm0, [eax + 16] // pixels 16..23 - movdqu xmm1, [eax + esi + 16] - lea eax, [eax + 32] - pavgb xmm0, xmm1 - pshufb xmm0, xmm4 - movdqa xmm1, xmmword ptr kMadd21 - pmaddubsw xmm0, xmm1 - paddsw xmm0, xmm7 - psrlw xmm0, 2 - packuswb xmm0, xmm0 - movq qword ptr [edx + 16], xmm0 - lea edx, [edx + 24] - sub ecx, 24 - jg wloop - - pop esi - ret - } -} // Note that movdqa+palign may be better than movdqu. -__declspec(naked) void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width - movdqa xmm2, xmmword ptr kShuf01 - movdqa xmm3, xmmword ptr kShuf11 - movdqa xmm4, xmmword ptr kShuf21 - movdqa xmm5, xmmword ptr kMadd01 - movdqa xmm6, xmmword ptr kMadd11 - movdqa xmm7, xmmword ptr kRound34 - wloop: - movdqu xmm0, [eax] // pixels 0..7 - movdqu xmm1, [eax + esi] - pavgb xmm1, xmm0 - pavgb xmm0, xmm1 - pshufb xmm0, xmm2 - pmaddubsw xmm0, xmm5 - paddsw xmm0, xmm7 - psrlw xmm0, 2 - packuswb xmm0, xmm0 - movq qword ptr [edx], xmm0 - movdqu xmm0, [eax + 8] // pixels 8..15 - movdqu xmm1, [eax + esi + 8] - pavgb xmm1, xmm0 - pavgb xmm0, xmm1 - pshufb xmm0, xmm3 - pmaddubsw xmm0, xmm6 - paddsw xmm0, xmm7 - psrlw xmm0, 2 - packuswb xmm0, xmm0 - movq qword ptr [edx + 8], xmm0 - movdqu xmm0, [eax + 16] // pixels 16..23 - movdqu xmm1, [eax + esi + 16] - lea eax, [eax + 32] - pavgb xmm1, xmm0 - pavgb xmm0, xmm1 - pshufb xmm0, xmm4 - movdqa xmm1, xmmword ptr kMadd21 - pmaddubsw xmm0, xmm1 - paddsw xmm0, xmm7 - psrlw xmm0, 2 - packuswb xmm0, xmm0 - movq qword ptr [edx + 16], xmm0 - lea edx, [edx+24] - sub ecx, 24 - jg wloop - - pop esi - ret - } -} // 3/8 point sampler // Scale 32 pixels to 12 -__declspec(naked) void ScaleRowDown38_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - mov eax, [esp + 4] // src_ptr - // src_stride ignored - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width - movdqa xmm4, xmmword ptr kShuf38a - movdqa xmm5, xmmword ptr kShuf38b - xloop: - movdqu xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5 - movdqu xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11 - lea eax, [eax + 32] - pshufb xmm0, xmm4 - pshufb xmm1, xmm5 - paddusb xmm0, xmm1 - - movq qword ptr [edx], xmm0 // write 12 pixels - movhlps xmm1, xmm0 - movd [edx + 8], xmm1 - lea edx, [edx + 12] - sub ecx, 12 - jg xloop - - ret - } -} // Scale 16x3 pixels to 6x1 with interpolation -__declspec(naked) void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width - movdqa xmm2, xmmword ptr kShufAc - movdqa xmm3, xmmword ptr kShufAc3 - movdqa xmm4, xmmword ptr kScaleAc33 - pxor xmm5, xmm5 - xloop: - movdqu xmm0, [eax] // sum up 3 rows into xmm0/1 - movdqu xmm6, [eax + esi] - movhlps xmm1, xmm0 - movhlps xmm7, xmm6 - punpcklbw xmm0, xmm5 - punpcklbw xmm1, xmm5 - punpcklbw xmm6, xmm5 - punpcklbw xmm7, xmm5 - paddusw xmm0, xmm6 - paddusw xmm1, xmm7 - movdqu xmm6, [eax + esi * 2] - lea eax, [eax + 16] - movhlps xmm7, xmm6 - punpcklbw xmm6, xmm5 - punpcklbw xmm7, xmm5 - paddusw xmm0, xmm6 - paddusw xmm1, xmm7 - - movdqa xmm6, xmm0 // 8 pixels -> 0,1,2 of xmm6 - psrldq xmm0, 2 - paddusw xmm6, xmm0 - psrldq xmm0, 2 - paddusw xmm6, xmm0 - pshufb xmm6, xmm2 - - movdqa xmm7, xmm1 // 8 pixels -> 3,4,5 of xmm6 - psrldq xmm1, 2 - paddusw xmm7, xmm1 - psrldq xmm1, 2 - paddusw xmm7, xmm1 - pshufb xmm7, xmm3 - paddusw xmm6, xmm7 - - pmulhuw xmm6, xmm4 // divide by 9,9,6, 9,9,6 - packuswb xmm6, xmm6 - - movd [edx], xmm6 // write 6 pixels - psrlq xmm6, 16 - movd [edx + 2], xmm6 - lea edx, [edx + 6] - sub ecx, 6 - jg xloop - - pop esi - ret - } -} // Scale 16x2 pixels to 6x1 with interpolation -__declspec(naked) void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width - movdqa xmm2, xmmword ptr kShufAb0 - movdqa xmm3, xmmword ptr kShufAb1 - movdqa xmm4, xmmword ptr kShufAb2 - movdqa xmm5, xmmword ptr kScaleAb2 - xloop: - movdqu xmm0, [eax] // average 2 rows into xmm0 - movdqu xmm1, [eax + esi] - lea eax, [eax + 16] - pavgb xmm0, xmm1 - - movdqa xmm1, xmm0 // 16 pixels -> 0,1,2,3,4,5 of xmm1 - pshufb xmm1, xmm2 - movdqa xmm6, xmm0 - pshufb xmm6, xmm3 - paddusw xmm1, xmm6 - pshufb xmm0, xmm4 - paddusw xmm1, xmm0 - - pmulhuw xmm1, xmm5 // divide by 3,3,2, 3,3,2 - packuswb xmm1, xmm1 - - movd [edx], xmm1 // write 6 pixels - psrlq xmm1, 16 - movd [edx + 2], xmm1 - lea edx, [edx + 6] - sub ecx, 6 - jg xloop - - pop esi - ret - } -} // Reads 16 bytes and accumulates to 16 shorts at a time. -__declspec(naked) void ScaleAddRow_SSE2(const uint8_t* src_ptr, - uint16_t* dst_ptr, - int src_width) { - __asm { - mov eax, [esp + 4] // src_ptr - mov edx, [esp + 8] // dst_ptr - mov ecx, [esp + 12] // src_width - pxor xmm5, xmm5 - // sum rows - xloop: - movdqu xmm3, [eax] // read 16 bytes - lea eax, [eax + 16] - movdqu xmm0, [edx] // read 16 words from destination - movdqu xmm1, [edx + 16] - movdqa xmm2, xmm3 - punpcklbw xmm2, xmm5 - punpckhbw xmm3, xmm5 - paddusw xmm0, xmm2 // sum 16 words - paddusw xmm1, xmm3 - movdqu [edx], xmm0 // write 16 words to destination - movdqu [edx + 16], xmm1 - lea edx, [edx + 32] - sub ecx, 16 - jg xloop - ret - } -} #ifdef HAS_SCALEADDROW_AVX2 // Reads 32 bytes and accumulates to 32 shorts at a time. @@ -880,369 +398,28 @@ static const uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040}; // Bilinear column filtering. SSSE3 version. -__declspec(naked) void ScaleFilterCols_SSSE3(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x, - int dx) { - __asm { - push ebx - push esi - push edi - mov edi, [esp + 12 + 4] // dst_ptr - mov esi, [esp + 12 + 8] // src_ptr - mov ecx, [esp + 12 + 12] // dst_width - movd xmm2, [esp + 12 + 16] // x - movd xmm3, [esp + 12 + 20] // dx - mov eax, 0x04040000 // shuffle to line up fractions with pixel. - movd xmm5, eax - pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction. - psrlw xmm6, 9 - pcmpeqb xmm7, xmm7 // generate 0x0001 - psrlw xmm7, 15 - pextrw eax, xmm2, 1 // get x0 integer. preroll - sub ecx, 2 - jl xloop29 - movdqa xmm0, xmm2 // x1 = x0 + dx - paddd xmm0, xmm3 - punpckldq xmm2, xmm0 // x0 x1 - punpckldq xmm3, xmm3 // dx dx - paddd xmm3, xmm3 // dx * 2, dx * 2 - pextrw edx, xmm2, 3 // get x1 integer. preroll - - // 2 Pixel loop. - xloop2: - movdqa xmm1, xmm2 // x0, x1 fractions. - paddd xmm2, xmm3 // x += dx - movzx ebx, word ptr [esi + eax] // 2 source x0 pixels - movd xmm0, ebx - psrlw xmm1, 9 // 7 bit fractions. - movzx ebx, word ptr [esi + edx] // 2 source x1 pixels - movd xmm4, ebx - pshufb xmm1, xmm5 // 0011 - punpcklwd xmm0, xmm4 - psubb xmm0, xmmword ptr kFsub80 // make pixels signed. - pxor xmm1, xmm6 // 0..7f and 7f..0 - paddusb xmm1, xmm7 // +1 so 0..7f and 80..1 - pmaddubsw xmm1, xmm0 // 16 bit, 2 pixels. - pextrw eax, xmm2, 1 // get x0 integer. next iteration. - pextrw edx, xmm2, 3 // get x1 integer. next iteration. - paddw xmm1, xmmword ptr kFadd40 // make pixels unsigned and round. - psrlw xmm1, 7 // 8.7 fixed point to low 8 bits. - packuswb xmm1, xmm1 // 8 bits, 2 pixels. - movd ebx, xmm1 - mov [edi], bx - lea edi, [edi + 2] - sub ecx, 2 // 2 pixels - jge xloop2 - - xloop29: - add ecx, 2 - 1 - jl xloop99 - - // 1 pixel remainder - movzx ebx, word ptr [esi + eax] // 2 source x0 pixels - movd xmm0, ebx - psrlw xmm2, 9 // 7 bit fractions. - pshufb xmm2, xmm5 // 0011 - psubb xmm0, xmmword ptr kFsub80 // make pixels signed. - pxor xmm2, xmm6 // 0..7f and 7f..0 - paddusb xmm2, xmm7 // +1 so 0..7f and 80..1 - pmaddubsw xmm2, xmm0 // 16 bit - paddw xmm2, xmmword ptr kFadd40 // make pixels unsigned and round. - psrlw xmm2, 7 // 8.7 fixed point to low 8 bits. - packuswb xmm2, xmm2 // 8 bits - movd ebx, xmm2 - mov [edi], bl - - xloop99: - - pop edi - pop esi - pop ebx - ret - } -} // Reads 16 pixels, duplicates them and writes 32 pixels. -__declspec(naked) void ScaleColsUp2_SSE2(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x, - int dx) { - __asm { - mov edx, [esp + 4] // dst_ptr - mov eax, [esp + 8] // src_ptr - mov ecx, [esp + 12] // dst_width - wloop: - movdqu xmm0, [eax] - lea eax, [eax + 16] - movdqa xmm1, xmm0 - punpcklbw xmm0, xmm0 - punpckhbw xmm1, xmm1 - movdqu [edx], xmm0 - movdqu [edx + 16], xmm1 - lea edx, [edx + 32] - sub ecx, 32 - jg wloop - - ret - } -} // Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6) -__declspec(naked) void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width) { - __asm { - mov eax, [esp + 4] // src_argb - // src_stride ignored - mov edx, [esp + 12] // dst_argb - mov ecx, [esp + 16] // dst_width - wloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - shufps xmm0, xmm1, 0xdd - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jg wloop - - ret - } -} // Blends 8x1 rectangle to 4x1. -__declspec(naked) void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width) { - __asm { - mov eax, [esp + 4] // src_argb - // src_stride ignored - mov edx, [esp + 12] // dst_argb - mov ecx, [esp + 16] // dst_width - wloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - movdqa xmm2, xmm0 - shufps xmm0, xmm1, 0x88 // even pixels - shufps xmm2, xmm1, 0xdd // odd pixels - pavgb xmm0, xmm2 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jg wloop - - ret - } -} // Blends 8x2 rectangle to 4x1. -__declspec(naked) void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_argb - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_argb - mov ecx, [esp + 4 + 16] // dst_width - wloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + esi] - movdqu xmm3, [eax + esi + 16] - lea eax, [eax + 32] - pavgb xmm0, xmm2 // average rows - pavgb xmm1, xmm3 - movdqa xmm2, xmm0 // average columns (8 to 4 pixels) - shufps xmm0, xmm1, 0x88 // even pixels - shufps xmm2, xmm1, 0xdd // odd pixels - pavgb xmm0, xmm2 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jg wloop - - pop esi - ret - } -} // Reads 4 pixels at a time. -__declspec(naked) void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_argb, - int dst_width) { - __asm { - push ebx - push edi - mov eax, [esp + 8 + 4] // src_argb - // src_stride ignored - mov ebx, [esp + 8 + 12] // src_stepx - mov edx, [esp + 8 + 16] // dst_argb - mov ecx, [esp + 8 + 20] // dst_width - lea ebx, [ebx * 4] - lea edi, [ebx + ebx * 2] - wloop: - movd xmm0, [eax] - movd xmm1, [eax + ebx] - punpckldq xmm0, xmm1 - movd xmm2, [eax + ebx * 2] - movd xmm3, [eax + edi] - lea eax, [eax + ebx * 4] - punpckldq xmm2, xmm3 - punpcklqdq xmm0, xmm2 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jg wloop - - pop edi - pop ebx - ret - } -} // Blends four 2x2 to 4x1. -__declspec(naked) void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_argb, - int dst_width) { - __asm { - push ebx - push esi - push edi - mov eax, [esp + 12 + 4] // src_argb - mov esi, [esp + 12 + 8] // src_stride - mov ebx, [esp + 12 + 12] // src_stepx - mov edx, [esp + 12 + 16] // dst_argb - mov ecx, [esp + 12 + 20] // dst_width - lea esi, [eax + esi] // row1 pointer - lea ebx, [ebx * 4] - lea edi, [ebx + ebx * 2] - wloop: - movq xmm0, qword ptr [eax] // row0 4 pairs - movhps xmm0, qword ptr [eax + ebx] - movq xmm1, qword ptr [eax + ebx * 2] - movhps xmm1, qword ptr [eax + edi] - lea eax, [eax + ebx * 4] - movq xmm2, qword ptr [esi] // row1 4 pairs - movhps xmm2, qword ptr [esi + ebx] - movq xmm3, qword ptr [esi + ebx * 2] - movhps xmm3, qword ptr [esi + edi] - lea esi, [esi + ebx * 4] - pavgb xmm0, xmm2 // average rows - pavgb xmm1, xmm3 - movdqa xmm2, xmm0 // average columns (8 to 4 pixels) - shufps xmm0, xmm1, 0x88 // even pixels - shufps xmm2, xmm1, 0xdd // odd pixels - pavgb xmm0, xmm2 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jg wloop - - pop edi - pop esi - pop ebx - ret - } -} // Column scaling unfiltered. SSE2 version. -__declspec(naked) void ScaleARGBCols_SSE2(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx) { - __asm { - push edi - push esi - mov edi, [esp + 8 + 4] // dst_argb - mov esi, [esp + 8 + 8] // src_argb - mov ecx, [esp + 8 + 12] // dst_width - movd xmm2, [esp + 8 + 16] // x - movd xmm3, [esp + 8 + 20] // dx - pshufd xmm2, xmm2, 0 // x0 x0 x0 x0 - pshufd xmm0, xmm3, 0x11 // dx 0 dx 0 - paddd xmm2, xmm0 - paddd xmm3, xmm3 // 0, 0, 0, dx * 2 - pshufd xmm0, xmm3, 0x05 // dx * 2, dx * 2, 0, 0 - paddd xmm2, xmm0 // x3 x2 x1 x0 - paddd xmm3, xmm3 // 0, 0, 0, dx * 4 - pshufd xmm3, xmm3, 0 // dx * 4, dx * 4, dx * 4, dx * 4 - - pextrw eax, xmm2, 1 // get x0 integer. - pextrw edx, xmm2, 3 // get x1 integer. - - cmp ecx, 0 - jle xloop99 - sub ecx, 4 - jl xloop49 - - // 4 Pixel loop. - xloop4: - movd xmm0, [esi + eax * 4] // 1 source x0 pixels - movd xmm1, [esi + edx * 4] // 1 source x1 pixels - pextrw eax, xmm2, 5 // get x2 integer. - pextrw edx, xmm2, 7 // get x3 integer. - paddd xmm2, xmm3 // x += dx - punpckldq xmm0, xmm1 // x0 x1 - - movd xmm1, [esi + eax * 4] // 1 source x2 pixels - movd xmm4, [esi + edx * 4] // 1 source x3 pixels - pextrw eax, xmm2, 1 // get x0 integer. next iteration. - pextrw edx, xmm2, 3 // get x1 integer. next iteration. - punpckldq xmm1, xmm4 // x2 x3 - punpcklqdq xmm0, xmm1 // x0 x1 x2 x3 - movdqu [edi], xmm0 - lea edi, [edi + 16] - sub ecx, 4 // 4 pixels - jge xloop4 - - xloop49: - test ecx, 2 - je xloop29 - - // 2 Pixels. - movd xmm0, [esi + eax * 4] // 1 source x0 pixels - movd xmm1, [esi + edx * 4] // 1 source x1 pixels - pextrw eax, xmm2, 5 // get x2 integer. - punpckldq xmm0, xmm1 // x0 x1 - - movq qword ptr [edi], xmm0 - lea edi, [edi + 8] - - xloop29: - test ecx, 1 - je xloop99 - - // 1 Pixels. - movd xmm0, [esi + eax * 4] // 1 source x2 pixels - movd dword ptr [edi], xmm0 - xloop99: - - pop esi - pop edi - ret - } -} // Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version. // TODO(fbarchard): Port to Neon @@ -1258,104 +435,10 @@ static const uvec8 kShuffleFractions = { 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, }; -__declspec(naked) void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx) { - __asm { - push esi - push edi - mov edi, [esp + 8 + 4] // dst_argb - mov esi, [esp + 8 + 8] // src_argb - mov ecx, [esp + 8 + 12] // dst_width - movd xmm2, [esp + 8 + 16] // x - movd xmm3, [esp + 8 + 20] // dx - movdqa xmm4, xmmword ptr kShuffleColARGB - movdqa xmm5, xmmword ptr kShuffleFractions - pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction. - psrlw xmm6, 9 - pextrw eax, xmm2, 1 // get x0 integer. preroll - sub ecx, 2 - jl xloop29 - movdqa xmm0, xmm2 // x1 = x0 + dx - paddd xmm0, xmm3 - punpckldq xmm2, xmm0 // x0 x1 - punpckldq xmm3, xmm3 // dx dx - paddd xmm3, xmm3 // dx * 2, dx * 2 - pextrw edx, xmm2, 3 // get x1 integer. preroll - - // 2 Pixel loop. - xloop2: - movdqa xmm1, xmm2 // x0, x1 fractions. - paddd xmm2, xmm3 // x += dx - movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels - psrlw xmm1, 9 // 7 bit fractions. - movhps xmm0, qword ptr [esi + edx * 4] // 2 source x1 pixels - pshufb xmm1, xmm5 // 0000000011111111 - pshufb xmm0, xmm4 // arrange pixels into pairs - pxor xmm1, xmm6 // 0..7f and 7f..0 - pmaddubsw xmm0, xmm1 // argb_argb 16 bit, 2 pixels. - pextrw eax, xmm2, 1 // get x0 integer. next iteration. - pextrw edx, xmm2, 3 // get x1 integer. next iteration. - psrlw xmm0, 7 // argb 8.7 fixed point to low 8 bits. - packuswb xmm0, xmm0 // argb_argb 8 bits, 2 pixels. - movq qword ptr [edi], xmm0 - lea edi, [edi + 8] - sub ecx, 2 // 2 pixels - jge xloop2 - - xloop29: - - add ecx, 2 - 1 - jl xloop99 - - // 1 pixel remainder - psrlw xmm2, 9 // 7 bit fractions. - movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels - pshufb xmm2, xmm5 // 00000000 - pshufb xmm0, xmm4 // arrange pixels into pairs - pxor xmm2, xmm6 // 0..7f and 7f..0 - pmaddubsw xmm0, xmm2 // argb 16 bit, 1 pixel. - psrlw xmm0, 7 - packuswb xmm0, xmm0 // argb 8 bits, 1 pixel. - movd [edi], xmm0 - - xloop99: - - pop edi - pop esi - ret - } -} // Reads 4 pixels, duplicates them and writes 8 pixels. -__declspec(naked) void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx) { - __asm { - mov edx, [esp + 4] // dst_argb - mov eax, [esp + 8] // src_argb - mov ecx, [esp + 12] // dst_width - wloop: - movdqu xmm0, [eax] - lea eax, [eax + 16] - movdqa xmm1, xmm0 - punpckldq xmm0, xmm0 - punpckhdq xmm1, xmm1 - movdqu [edx], xmm0 - movdqu [edx + 16], xmm1 - lea edx, [edx + 32] - sub ecx, 8 - jg wloop - - ret - } -} // Divide num by div and return as 16.16 fixed point result. __declspec(naked) int FixedDiv_X86(int num, int div) { diff --git a/unit_test/compare_test.cc b/unit_test/compare_test.cc index c29562cb8..d1dce2ff2 100644 --- a/unit_test/compare_test.cc +++ b/unit_test/compare_test.cc @@ -240,21 +240,15 @@ TEST_F(LibYUVCompareTest, BenchmarkHammingDistance_Opt) { if (has_avx2) { h1 = HammingDistance_AVX2(src_a, src_b, kMaxWidth); } else { - int has_sse42 = TestCpuFlag(kCpuHasSSE42); - if (has_sse42) { h1 = HammingDistance_SSE42(src_a, src_b, kMaxWidth); } else { - int has_ssse3 = TestCpuFlag(kCpuHasSSSE3); - if (has_ssse3) { h1 = HammingDistance_SSSE3(src_a, src_b, kMaxWidth); } else { h1 = HammingDistance_C(src_a, src_b, kMaxWidth); } } } -#elif defined(HAS_HAMMINGDISTANCE_SSE42) - int has_sse42 = TestCpuFlag(kCpuHasSSE42); - if (has_sse42) { +#elif 0 h1 = HammingDistance_SSE42(src_a, src_b, kMaxWidth); } else { h1 = HammingDistance_C(src_a, src_b, kMaxWidth); @@ -361,21 +355,15 @@ TEST_F(LibYUVCompareTest, TestHammingDistance_Opt) { if (has_avx2) { h1 = HammingDistance_AVX2(src_a, src_b, kMaxWidth); } else { - int has_sse42 = TestCpuFlag(kCpuHasSSE42); - if (has_sse42) { h1 = HammingDistance_SSE42(src_a, src_b, kMaxWidth); } else { - int has_ssse3 = TestCpuFlag(kCpuHasSSSE3); - if (has_ssse3) { h1 = HammingDistance_SSSE3(src_a, src_b, kMaxWidth); } else { h1 = HammingDistance_C(src_a, src_b, kMaxWidth); } } } -#elif defined(HAS_HAMMINGDISTANCE_SSE42) - int has_sse42 = TestCpuFlag(kCpuHasSSE42); - if (has_sse42) { +#elif 0 h1 = HammingDistance_SSE42(src_a, src_b, kMaxWidth); } else { h1 = HammingDistance_C(src_a, src_b, kMaxWidth); diff --git a/unit_test/convert_argb_test.cc b/unit_test/convert_argb_test.cc index 177f3a669..141ae7cfb 100644 --- a/unit_test/convert_argb_test.cc +++ b/unit_test/convert_argb_test.cc @@ -762,9 +762,6 @@ TESTATOA(RAW, uint8_t, 3, 3, 1, RGB24, uint8_t, 3, 3, 1) TESTATOA(RGB24, uint8_t, 3, 3, 1, J400, uint8_t, 1, 1, 1) // TESTATOA(RGB24, uint8_t, 3, 3, 1, RGB24Mirror, uint8_t, 3, 3, 1) TESTATOA(RAW, uint8_t, 3, 3, 1, J400, uint8_t, 1, 1, 1) -#ifdef LITTLE_ENDIAN_ONLY_TEST -// TESTATOA(RGB565, uint8_t, 2, 2, 1, ARGB, uint8_t, 4, 4, 1) -#endif TESTATOA(RGBA, uint8_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1) // TESTATOA(UYVY, uint8_t, 2, 4, 1, ARGB, uint8_t, 4, 4, 1) // TESTATOA(YUY2, uint8_t, 2, 4, 1, ARGB, uint8_t, 4, 4, 1) @@ -1686,12 +1683,10 @@ TEST_F(LibYUVConvertTest, ARGBToAR30Row_Opt) { ARGBToAR30Row_C(src, dst_c, kPixels); int has_avx2 = TestCpuFlag(kCpuHasAVX2); - int has_ssse3 = TestCpuFlag(kCpuHasSSSE3); for (int i = 0; i < benchmark_iterations_; ++i) { if (has_avx2) { ARGBToAR30Row_AVX2(src, dst_opt, kPixels); } else if (has_ssse3) { - ARGBToAR30Row_SSSE3(src, dst_opt, kPixels); } else { ARGBToAR30Row_C(src, dst_opt, kPixels); } @@ -1720,12 +1715,10 @@ TEST_F(LibYUVConvertTest, ABGRToAR30Row_Opt) { ABGRToAR30Row_C(src, dst_c, kPixels); int has_avx2 = TestCpuFlag(kCpuHasAVX2); - int has_ssse3 = TestCpuFlag(kCpuHasSSSE3); for (int i = 0; i < benchmark_iterations_; ++i) { if (has_avx2) { ABGRToAR30Row_AVX2(src, dst_opt, kPixels); } else if (has_ssse3) { - ABGRToAR30Row_SSSE3(src, dst_opt, kPixels); } else { ABGRToAR30Row_C(src, dst_opt, kPixels); } diff --git a/unit_test/cpu_test.cc b/unit_test/cpu_test.cc index 80186de7a..1d39bc20e 100644 --- a/unit_test/cpu_test.cc +++ b/unit_test/cpu_test.cc @@ -168,10 +168,6 @@ TEST_F(LibYUVBaseTest, TestCpuHas) { defined(_M_X64) int has_x86 = TestCpuFlag(kCpuHasX86); if (has_x86) { - int has_sse2 = TestCpuFlag(kCpuHasSSE2); - int has_ssse3 = TestCpuFlag(kCpuHasSSSE3); - int has_sse41 = TestCpuFlag(kCpuHasSSE41); - int has_sse42 = TestCpuFlag(kCpuHasSSE42); int has_avx = TestCpuFlag(kCpuHasAVX); int has_avx2 = TestCpuFlag(kCpuHasAVX2); int has_erms = TestCpuFlag(kCpuHasERMS); @@ -190,10 +186,7 @@ TEST_F(LibYUVBaseTest, TestCpuHas) { int has_avxvnniint8 = TestCpuFlag(kCpuHasAVXVNNIINT8); int has_amxint8 = TestCpuFlag(kCpuHasAMXINT8); printf("Has X86 0x%x\n", has_x86); - printf("Has SSE2 0x%x\n", has_sse2); printf("Has SSSE3 0x%x\n", has_ssse3); - printf("Has SSE4.1 0x%x\n", has_sse41); - printf("Has SSE4.2 0x%x\n", has_sse42); printf("Has AVX 0x%x\n", has_avx); printf("Has AVX2 0x%x\n", has_avx2); printf("Has ERMS 0x%x\n", has_erms); diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc index 2e26b4cf6..80674b94a 100644 --- a/unit_test/planar_test.cc +++ b/unit_test/planar_test.cc @@ -1536,17 +1536,11 @@ TEST_F(LibYUVPlanarTest, TestAffine) { EXPECT_EQ(96u, interpolate_pixels_C[128][0]); EXPECT_EQ(191u, interpolate_pixels_C[255][3]); -#if defined(HAS_ARGBAFFINEROW_SSE2) +#if 0 SIMD_ALIGNED(uint8_t interpolate_pixels_Opt[1280][4]); - ARGBAffineRow_SSE2(&orig_pixels_0[0][0], 0, &interpolate_pixels_Opt[0][0], - uv_step, 1280); EXPECT_EQ(0, memcmp(interpolate_pixels_Opt, interpolate_pixels_C, 1280 * 4)); - int has_sse2 = TestCpuFlag(kCpuHasSSE2); - if (has_sse2) { for (int i = 0; i < benchmark_pixels_div1280_; ++i) { - ARGBAffineRow_SSE2(&orig_pixels_0[0][0], 0, &interpolate_pixels_Opt[0][0], - uv_step, 1280); } } #endif @@ -3916,14 +3910,11 @@ TEST_F(LibYUVPlanarTest, Convert16To8Row_Opt) { dst_pixels_y_c, 16384, kPixels); int has_avx2 = TestCpuFlag(kCpuHasAVX2); - int has_ssse3 = TestCpuFlag(kCpuHasSSSE3); for (int i = 0; i < benchmark_iterations_; ++i) { if (has_avx2) { Convert16To8Row_AVX2(reinterpret_cast(src_pixels_y), dst_pixels_y_opt, 16384, kPixels); } else if (has_ssse3) { - Convert16To8Row_SSSE3(reinterpret_cast(src_pixels_y), - dst_pixels_y_opt, 16384, kPixels); } else { Convert16To8Row_C(reinterpret_cast(src_pixels_y), dst_pixels_y_opt, 16384, kPixels); @@ -4020,16 +4011,11 @@ TEST_F(LibYUVPlanarTest, Convert8To16Row_Opt) { 1024, kPixels); int has_avx2 = TestCpuFlag(kCpuHasAVX2); - int has_sse2 = TestCpuFlag(kCpuHasSSE2); for (int i = 0; i < benchmark_iterations_; ++i) { if (has_avx2) { Convert8To16Row_AVX2(src_pixels_y, reinterpret_cast(dst_pixels_y_opt), 1024, kPixels); - } else if (has_sse2) { - Convert8To16Row_SSE2(src_pixels_y, - reinterpret_cast(dst_pixels_y_opt), 1024, - kPixels); } else { Convert8To16Row_C(src_pixels_y, reinterpret_cast(dst_pixels_y_opt), 1024, diff --git a/unit_test/rotate_test.cc b/unit_test/rotate_test.cc index abc08efa8..a2d6daa00 100644 --- a/unit_test/rotate_test.cc +++ b/unit_test/rotate_test.cc @@ -892,10 +892,8 @@ TEST_F(LibYUVRotateTest, Transpose4x4_Test) { Transpose4x4_32_NEON((const uint8_t*)src_pixels, height * 4, (uint8_t*)dst_pixels_opt, width * 4, width); } else -#elif defined(HAS_TRANSPOSE4X4_32_SSE2) +#elif 0 if (TestCpuFlag(kCpuHasSSE2)) { - Transpose4x4_32_SSE2((const uint8_t*)src_pixels, height * 4, - (uint8_t*)dst_pixels_opt, width * 4, width); } else #endif { @@ -938,8 +936,6 @@ TEST_F(LibYUVRotateTest, Transpose4x4_Opt) { Transpose4x4_32_AVX2((const uint8_t*)src_pixels, height * 4, (uint8_t*)dst_pixels_opt, width * 4, width); } else if (TestCpuFlag(kCpuHasSSE2)) { - Transpose4x4_32_SSE2((const uint8_t*)src_pixels, height * 4, - (uint8_t*)dst_pixels_opt, width * 4, width); } else #endif { diff --git a/unit_test/scale_plane_test.cc b/unit_test/scale_plane_test.cc index 979c70aad..c9b773f52 100644 --- a/unit_test/scale_plane_test.cc +++ b/unit_test/scale_plane_test.cc @@ -43,7 +43,7 @@ namespace libyuv { #ifdef ENABLE_ROW_TESTS -#ifdef HAS_SCALEROWDOWN2_SSSE3 +#if 0 TEST_F(LibYUVScaleTest, TestScaleRowDown2Box_Odd_SSSE3) { SIMD_ALIGNED(uint8_t orig_pixels[128 * 2]); SIMD_ALIGNED(uint8_t dst_pixels_opt[64]); @@ -52,7 +52,6 @@ TEST_F(LibYUVScaleTest, TestScaleRowDown2Box_Odd_SSSE3) { memset(dst_pixels_opt, 0, sizeof(dst_pixels_opt)); memset(dst_pixels_c, 0, sizeof(dst_pixels_c)); - int has_ssse3 = TestCpuFlag(kCpuHasSSSE3); if (!has_ssse3) { printf("Warning SSSE3 not detected; Skipping test.\n"); } else { @@ -114,7 +113,6 @@ TEST_F(LibYUVScaleTest, TestScaleRowDown2Box_Odd_SSSE3) { EXPECT_EQ(0u, dst_pixels_c[63]); // Test regular half size SSSE3. - ScaleRowDown2Box_SSSE3(orig_pixels, 128, dst_pixels_opt, 64); EXPECT_EQ(64u, dst_pixels_opt[0]); EXPECT_EQ(25u, dst_pixels_opt[1]); @@ -125,7 +123,6 @@ TEST_F(LibYUVScaleTest, TestScaleRowDown2Box_Odd_SSSE3) { // Compare C and SSSE3 match. ScaleRowDown2Box_Odd_C(orig_pixels, 128, dst_pixels_c, 64); - ScaleRowDown2Box_Odd_SSSE3(orig_pixels, 128, dst_pixels_opt, 64); for (int i = 0; i < 64; ++i) { EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); } diff --git a/util/psnr.cc b/util/psnr.cc index c7bee7f97..170a4b835 100644 --- a/util/psnr.cc +++ b/util/psnr.cc @@ -106,128 +106,11 @@ static uint32_t SumSquareError_NEON(const uint8_t* src_a, return sse; } #elif !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) -#define HAS_SUMSQUAREERROR_SSE2 -__declspec(naked) static uint32_t SumSquareError_SSE2(const uint8_t* /*src_a*/, - const uint8_t* /*src_b*/, - int /*count*/) { - __asm { - mov eax, [esp + 4] // src_a - mov edx, [esp + 8] // src_b - mov ecx, [esp + 12] // count - pxor xmm0, xmm0 - pxor xmm5, xmm5 - sub edx, eax - wloop: - movdqu xmm1, [eax] - movdqu xmm2, [eax + edx] - lea eax, [eax + 16] - movdqu xmm3, xmm1 - psubusb xmm1, xmm2 - psubusb xmm2, xmm3 - por xmm1, xmm2 - movdqu xmm2, xmm1 - punpcklbw xmm1, xmm5 - punpckhbw xmm2, xmm5 - pmaddwd xmm1, xmm1 - pmaddwd xmm2, xmm2 - paddd xmm0, xmm1 - paddd xmm0, xmm2 - sub ecx, 16 - ja wloop - - pshufd xmm1, xmm0, 0EEh - paddd xmm0, xmm1 - pshufd xmm1, xmm0, 01h - paddd xmm0, xmm1 - movd eax, xmm0 - ret - } -} #elif !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) -#define HAS_SUMSQUAREERROR_SSE2 -static uint32_t SumSquareError_SSE2(const uint8_t* src_a, - const uint8_t* src_b, - int count) { - uint32_t sse; - asm volatile( // NOLINT - "pxor %%xmm0,%%xmm0 \n" - "pxor %%xmm5,%%xmm5 \n" - "sub %0,%1 \n" - "1: \n" - "movdqu (%0),%%xmm1 \n" - "movdqu (%0,%1,1),%%xmm2 \n" - "lea 0x10(%0),%0 \n" - "movdqu %%xmm1,%%xmm3 \n" - "psubusb %%xmm2,%%xmm1 \n" - "psubusb %%xmm3,%%xmm2 \n" - "por %%xmm2,%%xmm1 \n" - "movdqu %%xmm1,%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm1 \n" - "punpckhbw %%xmm5,%%xmm2 \n" - "pmaddwd %%xmm1,%%xmm1 \n" - "pmaddwd %%xmm2,%%xmm2 \n" - "paddd %%xmm1,%%xmm0 \n" - "paddd %%xmm2,%%xmm0 \n" - "sub $0x10,%2 \n" - "ja 1b \n" - - "pshufd $0xee,%%xmm0,%%xmm1 \n" - "paddd %%xmm1,%%xmm0 \n" - "pshufd $0x1,%%xmm0,%%xmm1 \n" - "paddd %%xmm1,%%xmm0 \n" - "movd %%xmm0,%3 \n" - - : "+r"(src_a), // %0 - "+r"(src_b), // %1 - "+r"(count), // %2 - "=g"(sse) // %3 - : - : "memory", "cc" -#if defined(__SSE2__) - , - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" -#endif - ); // NOLINT - return sse; -} #endif // LIBYUV_DISABLE_X86 etc -#if defined(HAS_SUMSQUAREERROR_SSE2) -#if (defined(__pic__) || defined(__APPLE__)) && defined(__i386__) -static __inline void __cpuid(int cpu_info[4], int info_type) { - asm volatile( // NOLINT - "mov %%ebx, %%edi \n" - "cpuid \n" - "xchg %%edi, %%ebx \n" - : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), - "=d"(cpu_info[3]) - : "a"(info_type)); -} -// For gcc/clang but not clangcl. -#elif !defined(_MSC_VER) && (defined(__i386__) || defined(__x86_64__)) -static __inline void __cpuid(int cpu_info[4], int info_type) { - asm volatile( // NOLINT - "cpuid \n" - : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), - "=d"(cpu_info[3]) - : "a"(info_type)); -} -#endif - -static int CpuHasSSE2() { -#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) - int cpu_info[4]; - __cpuid(cpu_info, 1); - if (cpu_info[3] & 0x04000000) { - return 1; - } -#endif - return 0; -} -#endif // HAS_SUMSQUAREERROR_SSE2 - static uint32_t SumSquareError_C(const uint8_t* src_a, const uint8_t* src_b, int count) { @@ -246,11 +129,6 @@ double ComputeSumSquareError(const uint8_t* src_a, int count) = SumSquareError_C; #if defined(HAS_SUMSQUAREERROR_NEON) SumSquareError = SumSquareError_NEON; -#endif -#if defined(HAS_SUMSQUAREERROR_SSE2) - if (CpuHasSSE2()) { - SumSquareError = SumSquareError_SSE2; - } #endif const int kBlockSize = 1 << 15; uint64_t sse = 0;