diff --git a/README.chromium b/README.chromium index 1407f963e..a805c91be 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: https://chromium.googlesource.com/libyuv/libyuv/ -Version: 1934 +Version: 1928 Revision: DEPS License: BSD-3-Clause License File: LICENSE diff --git a/include/libyuv/convert_from_argb.h b/include/libyuv/convert_from_argb.h index 8adec16dc..c0473fd70 100644 --- a/include/libyuv/convert_from_argb.h +++ b/include/libyuv/convert_from_argb.h @@ -456,40 +456,6 @@ int ARGBToUYVY(const uint8_t* src_argb, int width, int height); -// RAW to NV21 with Matrix -LIBYUV_API -int RAWToNV21Matrix(const uint8_t* src_raw, - int src_stride_raw, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_vu, - int dst_stride_vu, - const struct ArgbConstants* argbconstants, - int width, - int height); - -// RAW to NV21 -LIBYUV_API -int RAWToNV21(const uint8_t* src_raw, - int src_stride_raw, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_vu, - int dst_stride_vu, - int width, - int height); - -// RGB24 to NV12 -LIBYUV_API -int RGB24ToNV12(const uint8_t* src_rgb24, - int src_stride_rgb24, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_uv, - int dst_stride_uv, - int width, - int height); - // RAW to JNV21 full range NV21 LIBYUV_API int RAWToJNV21(const uint8_t* src_raw, diff --git a/include/libyuv/planar_functions.h b/include/libyuv/planar_functions.h index 20bf78198..852736a97 100644 --- a/include/libyuv/planar_functions.h +++ b/include/libyuv/planar_functions.h @@ -487,9 +487,6 @@ int NV21ToNV12(const uint8_t* src_y, int width, int height); -// Alias -#define NV12ToNV21 NV21ToNV12 - LIBYUV_API int YUY2ToY(const uint8_t* src_yuy2, int src_stride_yuy2, diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 40272cf5a..b47d42eed 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -140,13 +140,6 @@ extern "C" { // The following are available on all x86 platforms, but // require VS2012, clang 3.4 or gcc 4.7. -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(__x86_64__) || defined(__i386__) || \ - defined(_M_X64) || defined(_M_X86)) -#define HAS_ARGBTOUVMATRIXROW_AVX2 -#define HAS_MERGEUVROW_AVX2 -#endif - #if !defined(LIBYUV_DISABLE_X86) && defined(USE_ROW_GCC) && \ (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2) || \ defined(GCC_HAS_AVX2)) @@ -170,6 +163,7 @@ extern "C" { #define HAS_I444TORGB24ROW_AVX2 #define HAS_INTERPOLATEROW_AVX2 #define HAS_J422TOARGBROW_AVX2 +#define HAS_MERGEUVROW_AVX2 #define HAS_MIRRORROW_AVX2 #define HAS_NV12TOARGBROW_AVX2 #define HAS_NV12TORGB24ROW_AVX2 @@ -200,6 +194,7 @@ extern "C" { #if !defined(LIBYUV_DISABLE_X86) && defined(USE_ROW_GCC) && \ (defined(__x86_64__) || defined(__i386__)) && \ !defined(LIBYUV_ENABLE_ROWWIN) +#define HAS_RAWTOYJROW_SSSE3 #define HAS_AB64TOARGBROW_SSSE3 #define HAS_ABGRTOAR30ROW_SSSE3 #define HAS_ABGRTOYJROW_SSSE3 @@ -250,9 +245,11 @@ extern "C" { // TODO: port row_win to use 8 bit coefficients. #define HAS_ARGBTOYJROW_SSSE3 #define HAS_ARGBTOYROW_SSSE3 -#define HAS_ARGBTOYMATRIXROW_SSSE3 #define HAS_BGRATOYROW_SSSE3 +#define HAS_RAWTOYROW_SSSE3 #define HAS_ABGRTOYROW_SSSE3 +#define HAS_RGB24TOYJROW_SSSE3 +#define HAS_RGB24TOYROW_SSSE3 #define HAS_RGBATOYROW_SSSE3 // TODO: adjust row_win to use 8 bit negative coefficients. @@ -300,7 +297,6 @@ extern "C" { #define HAS_ARGBTOUV444MATRIXROW_AVX2 #define HAS_ARGBTOYJROW_AVX2 #define HAS_ARGBTOYROW_AVX2 -#define HAS_ARGBTOYMATRIXROW_AVX2 #define HAS_ARGBUNATTENUATEROW_AVX2 #define HAS_CONVERT16TO8ROW_AVX2 #define HAS_CONVERT8TO16ROW_AVX2 @@ -334,6 +330,8 @@ extern "C" { #define HAS_P210TOARGBROW_AVX2 #define HAS_P410TOAR30ROW_AVX2 #define HAS_P410TOARGBROW_AVX2 +#define HAS_RAWTOYJROW_AVX2 +#define HAS_RGB24TOYJROW_AVX2 #define HAS_RGBATOYJROW_AVX2 #define HAS_SPLITARGBROW_AVX2 #define HAS_SPLITRGBROW_AVX2 @@ -356,13 +354,7 @@ extern "C" { defined(_M_X64) || defined(_M_X86)) && \ ((defined(_MSC_VER) && !defined(__clang__)) || \ defined(LIBYUV_ENABLE_ROWWIN)) -#define HAS_RAWTOARGBROW_AVX2 -#if defined(__x86_64__) || defined(_M_X64) -#define HAS_RAWTOARGBROW_AVX512BW -#define HAS_RGB24TOARGBROW_AVX512BW -#endif #define HAS_ARGBTOYROW_AVX2 -#define HAS_ARGBTOYMATRIXROW_AVX2 #define HAS_ABGRTOYROW_AVX2 #define HAS_ARGBTOYJROW_AVX2 #define HAS_ABGRTOYJROW_AVX2 @@ -378,10 +370,6 @@ extern "C" { (defined(__x86_64__) || defined(__i386__)) && defined(CLANG_HAS_AVX512) && \ !defined(LIBYUV_ENABLE_ROWWIN) #define HAS_COPYROW_AVX512BW -#if defined(__x86_64__) || defined(_M_X64) -#define HAS_RAWTOARGBROW_AVX512BW -#define HAS_RGB24TOARGBROW_AVX512BW -#endif #define HAS_ARGBTORGB24ROW_AVX512VBMI #define HAS_CONVERT16TO8ROW_AVX512BW #define HAS_MERGEUVROW_AVX512BW @@ -395,7 +383,6 @@ extern "C" { #define HAS_ARGBTOUV444ROW_AVX512BW #define HAS_ARGBTOUV444MATRIXROW_AVX512BW #define HAS_ARGBTOYROW_AVX512BW -#define HAS_ARGBTOYMATRIXROW_AVX512BW #define HAS_ARGBTOUVJ444ROW_AVX512BW #define HAS_ARGBTOUVROW_AVX512BW #define HAS_ARGBTOUVJROW_AVX512BW @@ -433,7 +420,6 @@ extern "C" { #define HAS_ARGBTOUV444ROW_NEON #define HAS_ARGBTOUVJ444ROW_NEON #define HAS_ARGBTOUVJROW_NEON -#define HAS_ARGBTOUVMATRIXROW_NEON #define HAS_ARGBTOUVROW_NEON #define HAS_ARGBTOYJROW_NEON #if !defined(__aarch64__) @@ -496,9 +482,13 @@ extern "C" { #define HAS_RAWTORGBAROW_NEON #define HAS_RAWTOUVJROW_NEON #define HAS_RAWTOUVROW_NEON +#define HAS_RAWTOYJROW_NEON +#define HAS_RAWTOYROW_NEON #define HAS_RGB24TOARGBROW_NEON #define HAS_RGB24TOUVJROW_NEON #define HAS_RGB24TOUVROW_NEON +#define HAS_RGB24TOYJROW_NEON +#define HAS_RGB24TOYROW_NEON #define HAS_RGB565TOARGBROW_NEON #define HAS_RGB565TOUVROW_NEON #define HAS_RGB565TOYROW_NEON @@ -569,7 +559,6 @@ extern "C" { #define HAS_ARGBSEPIAROW_NEON_DOTPROD #define HAS_ARGBTOYJROW_NEON_DOTPROD #define HAS_ARGBTOYROW_NEON_DOTPROD -#define HAS_ARGBTOYMATRIXROW_NEON_DOTPROD #define HAS_BGRATOYROW_NEON_DOTPROD #define HAS_RGBATOYJROW_NEON_DOTPROD #define HAS_RGBATOYROW_NEON_DOTPROD @@ -580,7 +569,6 @@ extern "C" { #define HAS_ARGBTOUV444ROW_NEON_I8MM #define HAS_ARGBTOUVJ444ROW_NEON_I8MM #define HAS_ARGBTOUVJROW_NEON_I8MM -#define HAS_ARGBTOUVMATRIXROW_NEON_I8MM #define HAS_ARGBTOUVROW_NEON_I8MM #define HAS_BGRATOUVROW_NEON_I8MM #define HAS_RGBATOUVROW_NEON_I8MM @@ -596,7 +584,6 @@ extern "C" { #define HAS_ARGBTORGB565DITHERROW_SVE2 #define HAS_ARGBTORGB565ROW_SVE2 #define HAS_ARGBTOUVJROW_SVE2 -#define HAS_ARGBTOUVMATRIXROW_SVE2 #define HAS_ARGBTOUVROW_SVE2 #define HAS_AYUVTOUVROW_SVE2 #define HAS_AYUVTOVUROW_SVE2 @@ -648,7 +635,6 @@ extern "C" { #define HAS_ABGRTOUVROW_SME #define HAS_ARGBMULTIPLYROW_SME #define HAS_ARGBTOUVJROW_SME -#define HAS_ARGBTOUVMATRIXROW_SME #define HAS_ARGBTOUVROW_SME #define HAS_BGRATOUVROW_SME #define HAS_CONVERT16TO8ROW_SME @@ -757,8 +743,10 @@ extern "C" { #define HAS_RAWTOARGBROW_LSX #define HAS_RAWTORGB24ROW_LSX #define HAS_RAWTOUVROW_LSX +#define HAS_RAWTOYROW_LSX #define HAS_RGB24TOARGBROW_LSX #define HAS_RGB24TOUVROW_LSX +#define HAS_RGB24TOYROW_LSX #define HAS_RGB565TOARGBROW_LSX #define HAS_RGB565TOUVROW_LSX #define HAS_RGB565TOYROW_LSX @@ -778,9 +766,10 @@ extern "C" { #define HAS_YUY2TOUV422ROW_LSX #define HAS_YUY2TOYROW_LSX #define HAS_ARGBTOYROW_LSX -#define HAS_ARGBTOYMATRIXROW_LSX #define HAS_ABGRTOYJROW_LSX #define HAS_RGBATOYJROW_LSX +#define HAS_RGB24TOYJROW_LSX +#define HAS_RAWTOYJROW_LSX #endif #if !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx) @@ -813,7 +802,6 @@ extern "C" { #define HAS_ARGBTOUVROW_LASX #define HAS_ARGBTOYJROW_LASX #define HAS_ARGBTOYROW_LASX -#define HAS_ARGBTOYMATRIXROW_LASX #define HAS_ABGRTOYJROW_LASX #define HAS_ABGRTOYROW_LASX #define HAS_I422ALPHATOARGBROW_LASX @@ -832,8 +820,10 @@ extern "C" { #define HAS_NV21TOARGBROW_LASX #define HAS_RAWTOARGBROW_LASX #define HAS_RAWTOUVROW_LASX +#define HAS_RAWTOYROW_LASX #define HAS_RGB24TOARGBROW_LASX #define HAS_RGB24TOUVROW_LASX +#define HAS_RGB24TOYROW_LASX #define HAS_RGB565TOARGBROW_LASX #define HAS_RGB565TOUVROW_LASX #define HAS_RGB565TOYROW_LASX @@ -846,6 +836,8 @@ extern "C" { #define HAS_RGBATOYROW_LASX #define HAS_RGBATOYJROW_LASX #define HAS_BGRATOYROW_LASX +#define HAS_RGB24TOYJROW_LASX +#define HAS_RAWTOYJROW_LASX #endif #if !defined(LIBYUV_DISABLE_RVV) && defined(__riscv_vector) @@ -875,6 +867,10 @@ extern "C" { #define HAS_BGRATOYROW_RVV #define HAS_COPYROW_RVV #define HAS_INTERPOLATEROW_RVV +#define HAS_RAWTOYJROW_RVV +#define HAS_RAWTOYROW_RVV +#define HAS_RGB24TOYJROW_RVV +#define HAS_RGB24TOYROW_RVV #define HAS_RGBATOYJROW_RVV #define HAS_RGBATOYMATRIXROW_RVV #define HAS_RGBATOYROW_RVV @@ -896,7 +892,8 @@ extern "C" { // __riscv_vcreate_v_u8m2x3 // __riscv_vcreate_v_u8m2x4 // __riscv_vcreate_v_u8m4x2 -#if defined(LIBYUV_RVV_HAS_VCREATE) +#if !defined(LIBYUV_RVV_HAS_TUPLE_TYPE) || \ + (defined(LIBYUV_RVV_HAS_TUPLE_TYPE) && defined(LIBYUV_RVV_HAS_VCREATE)) #define HAS_AB64TOARGBROW_RVV #define HAS_AR64TOAB64ROW_RVV #define HAS_ARGBATTENUATEROW_RVV @@ -1779,6 +1776,12 @@ void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width); void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width); void RGBAToYRow_AVX512BW(const uint8_t* src_rgba, uint8_t* dst_y, int width); void RGBAToYRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width); +void RGB24ToYRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_y, int width); +void RGB24ToYJRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_yj, int width); +void RAWToYRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_y, int width); +void RAWToYJRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_yj, int width); +void RGB24ToYJRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_yj, int width); +void RAWToYJRow_AVX2(const uint8_t* src_raw, uint8_t* dst_yj, int width); void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width); void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_yj, int width); void ABGRToYJRow_NEON(const uint8_t* src_abgr, uint8_t* dst_yj, int width); @@ -1844,43 +1847,6 @@ void ARGBToUVJ444Row_NEON_I8MM(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width); -void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width, - const struct ArgbConstants* c); -void ARGBToUVMatrixRow_Any_NEON(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width, - const struct ArgbConstants* c); -void ARGBToUVMatrixRow_NEON_I8MM(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width, - const struct ArgbConstants* c); -void ARGBToUVMatrixRow_Any_NEON_I8MM(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width, - const struct ArgbConstants* c); -void ARGBToUVMatrixRow_SVE2(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width, - const struct ArgbConstants* c); -void ARGBToUVMatrixRow_SME(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width, - const struct ArgbConstants* c); - void ARGBToUVRow_NEON(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, @@ -2131,6 +2097,10 @@ void ABGRToYRow_NEON_DotProd(const uint8_t* src_abgr, void RGBAToYRow_NEON_DotProd(const uint8_t* src_rgba, uint8_t* dst_y, int width); +void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width); +void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width); +void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width); +void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width); void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width); void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, uint8_t* dst_y, @@ -2141,19 +2111,31 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, void BGRAToYRow_RVV(const uint8_t* src_bgra, uint8_t* dst_y, int width); void ABGRToYRow_RVV(const uint8_t* src_abgr, uint8_t* dst_y, int width); void RGBAToYRow_RVV(const uint8_t* src_rgba, uint8_t* dst_y, int width); +void RGB24ToYRow_RVV(const uint8_t* src_rgb24, uint8_t* dst_y, int width); +void RGB24ToYJRow_RVV(const uint8_t* src_rgb24, uint8_t* dst_yj, int width); +void RAWToYRow_RVV(const uint8_t* src_raw, uint8_t* dst_y, int width); +void RAWToYJRow_RVV(const uint8_t* src_raw, uint8_t* dst_yj, int width); void BGRAToYRow_LSX(const uint8_t* src_bgra, uint8_t* dst_y, int width); void ABGRToYRow_LSX(const uint8_t* src_abgr, uint8_t* dst_y, int width); void RGBAToYRow_LSX(const uint8_t* src_rgba, uint8_t* dst_y, int width); void ARGB1555ToYRow_LSX(const uint8_t* src_argb1555, uint8_t* dst_y, int width); +void RGB24ToYJRow_LSX(const uint8_t* src_rgb24, uint8_t* dst_yj, int width); void ABGRToYRow_LASX(const uint8_t* src_abgr, uint8_t* dst_y, int width); void ARGB1555ToYRow_LASX(const uint8_t* src_argb1555, uint8_t* dst_y, int width); void RGB565ToYRow_LSX(const uint8_t* src_rgb565, uint8_t* dst_y, int width); void RGB565ToYRow_LASX(const uint8_t* src_rgb565, uint8_t* dst_y, int width); +void RGB24ToYRow_LSX(const uint8_t* src_rgb24, uint8_t* dst_y, int width); +void RGB24ToYRow_LASX(const uint8_t* src_rgb24, uint8_t* dst_y, int width); +void RAWToYRow_LSX(const uint8_t* src_raw, uint8_t* dst_y, int width); +void RAWToYRow_LASX(const uint8_t* src_raw, uint8_t* dst_y, int width); void RGBAToYRow_LASX(const uint8_t* src_rgba, uint8_t* dst_y, int width); void BGRAToYRow_LASX(const uint8_t* src_bgra, uint8_t* dst_y, int width); +void RGB24ToYJRow_LASX(const uint8_t* src_rgb24, uint8_t* dst_yj, int width); +void RAWToYJRow_LSX(const uint8_t* src_raw, uint8_t* dst_yj, int width); +void RAWToYJRow_LASX(const uint8_t* src_raw, uint8_t* dst_yj, int width); void ARGBToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); void ARGBToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); @@ -2215,42 +2197,6 @@ void ARGBToYMatrixRow_Any_AVX512BW(const uint8_t* src_argb, int width, const struct ArgbConstants* c); -void RGBToYMatrixRow_NEON(const uint8_t* src_rgb, - uint8_t* dst_y, - int width, - const struct ArgbConstants* c); - -void ARGBToYMatrixRow_NEON_DotProd(const uint8_t* src_argb, - uint8_t* dst_y, - int width, - const struct ArgbConstants* c); -void ARGBToYMatrixRow_LSX(const uint8_t* src_argb, - uint8_t* dst_y, - int width, - const struct ArgbConstants* c); -void ARGBToYMatrixRow_LASX(const uint8_t* src_argb, - uint8_t* dst_y, - int width, - const struct ArgbConstants* c); -void ARGBToYMatrixRow_RVV(const uint8_t* src_argb, - uint8_t* dst_y, - int width, - const struct ArgbConstants* c); - -void ARGBToYMatrixRow_Any_NEON_DotProd(const uint8_t* src_argb, - uint8_t* dst_y, - int width, - const struct ArgbConstants* c); -void ARGBToYMatrixRow_Any_LSX(const uint8_t* src_argb, - uint8_t* dst_y, - int width, - const struct ArgbConstants* c); -void ARGBToYMatrixRow_Any_LASX(const uint8_t* src_argb, - uint8_t* dst_y, - int width, - const struct ArgbConstants* c); - - void ARGBToUV444MatrixRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, @@ -2305,6 +2251,10 @@ void RGBAToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); void BGRAToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); void ABGRToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); void RGBAToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); +void RGB24ToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); +void RGB24ToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); +void RAWToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); +void RAWToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); void RGB565ToYRow_C(const uint8_t* src_rgb565, uint8_t* dst_y, int width); void ARGB1555ToYRow_C(const uint8_t* src_argb1555, uint8_t* dst_y, int width); void ARGB4444ToYRow_C(const uint8_t* src_argb4444, uint8_t* dst_y, int width); @@ -2324,6 +2274,14 @@ void RGBAToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void BGRAToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ABGRToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGBAToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGB24ToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGB24ToYJRow_Any_SSSE3(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void RAWToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RAWToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGB24ToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RAWToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToYRow_Any_NEON_DotProd(const uint8_t* src_ptr, uint8_t* dst_ptr, @@ -2352,6 +2310,10 @@ void RGBAToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGBAToYRow_Any_NEON_DotProd(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGB24ToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGB24ToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RAWToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RAWToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGB565ToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGB1555ToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, @@ -2365,21 +2327,29 @@ void ABGRToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGBAToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToYJRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGB24ToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGB565ToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ABGRToYJRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RAWToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGBAToYJRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGB24ToYJRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RAWToYJRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGB1555ToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGB565ToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGB24ToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToYJRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ABGRToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ABGRToYJRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RAWToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGBAToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGBAToYJRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void BGRAToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGB24ToYJRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RAWToYJRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGB1555ToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -4029,7 +3999,6 @@ void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24, int width); void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width); void RAWToARGBRow_AVX2(const uint8_t* src_raw, uint8_t* dst_argb, int width); -void RAWToARGBRow_AVX512BW(const uint8_t* src_raw, uint8_t* dst_argb, int width); void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width); void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgb24, int width); void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width); @@ -4121,9 +4090,6 @@ void RAWToARGBRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RAWToARGBRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RAWToARGBRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RGB24ToARGBRow_AVX512BW(const uint8_t* src_rgb24, uint8_t* dst_argb, int width); -void RGB24ToARGBRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RAWToRGBARow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); diff --git a/include/libyuv/row_sve.h b/include/libyuv/row_sve.h index f7e2123a7..e47b9fe5e 100644 --- a/include/libyuv/row_sve.h +++ b/include/libyuv/row_sve.h @@ -2019,7 +2019,7 @@ static const int8_t kABGRToUVJCoefficients[] = { 43, 85, -128, 0, -128, 107, 21, 0, }; -#define ARGBTOUVMATRIX_SVE \ +#define ABCDTOUVMATRIX_SVE \ "ld1d {z0.d}, p1/z, [%[src0]] \n" /* ABCD(bgra) */ \ "ld1d {z1.d}, p2/z, [%[src0], #1, mul vl] \n" /* EFGH(bgra) */ \ "ld1d {z2.d}, p3/z, [%[src0], #2, mul vl] \n" /* IJKL(bgra) */ \ @@ -2113,7 +2113,7 @@ static inline void ARGBToUVMatrixRow_SVE_SC(const uint8_t* src_argb, "ptrue p4.d \n" "ptrue p5.h \n" "1: \n" // - ARGBTOUVMATRIX_SVE + ABCDTOUVMATRIX_SVE "b.gt 1b \n" "2: \n" @@ -2126,7 +2126,7 @@ static inline void ARGBToUVMatrixRow_SVE_SC(const uint8_t* src_argb, "whilelt p3.d, %w[vl2], %w[width] \n" "whilelt p4.d, %w[vl3], %w[width] \n" "whilelt p5.h, wzr, %w[width] \n" // - ARGBTOUVMATRIX_SVE + ABCDTOUVMATRIX_SVE "b.gt 3b \n" "99: \n" diff --git a/include/libyuv/version.h b/include/libyuv/version.h index b745710eb..06231806f 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1934 +#define LIBYUV_VERSION 1928 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/psnr.o b/psnr.o new file mode 100644 index 000000000..bb3fe2adc Binary files /dev/null and b/psnr.o differ diff --git a/source/convert.cc b/source/convert.cc index d9fb54778..07a58f602 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -13,7 +13,6 @@ #include "libyuv/basic_types.h" #include "libyuv/cpu_id.h" #include "libyuv/planar_functions.h" -#include "libyuv/convert_from_argb.h" #include "libyuv/rotate.h" #include "libyuv/row.h" #include "libyuv/scale.h" // For ScalePlane() @@ -22,8 +21,6 @@ #ifdef __cplusplus namespace libyuv { -extern const struct ArgbConstants kArgbI601Constants; -extern const struct ArgbConstants kArgbJPEGConstants; extern "C" { #endif @@ -725,7 +722,7 @@ int I010ToNV12(const uint16_t* src_y, #if defined(HAS_MERGEUVROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { MergeUVRow = MergeUVRow_Any_AVX2; - if (IS_ALIGNED(halfwidth, 32)) { + if (IS_ALIGNED(halfwidth, 16)) { MergeUVRow = MergeUVRow_AVX2; } } @@ -1165,7 +1162,7 @@ int I422ToNV21(const uint8_t* src_y, #if defined(HAS_MERGEUVROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { MergeUVRow = MergeUVRow_Any_AVX2; - if (IS_ALIGNED(halfwidth, 32)) { + if (IS_ALIGNED(halfwidth, 16)) { MergeUVRow = MergeUVRow_AVX2; } } @@ -2181,96 +2178,7 @@ int ARGBToI420Matrix(const uint8_t* src_argb, void (*ARGBToUVMatrixRow)(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, int width, const struct ArgbConstants* c) = -ARGBToUVMatrixRow_C; - -#if defined(HAS_ARGBTOYMATRIXROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOYMATRIXROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBTOYMATRIXROW_AVX512BW) - if (TestCpuFlag(kCpuHasAVX512BW)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_AVX512BW; - if (IS_ALIGNED(width, 64)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_AVX512BW; - } - } -#endif -#if defined(HAS_ARGBTOYMATRIXROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_NEON; - } - } -#endif -#if defined(HAS_ARGBTOYMATRIXROW_NEON_DOTPROD) - if (TestCpuFlag(kCpuHasNeonDotProd)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON_DotProd; - if (IS_ALIGNED(width, 16)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_NEON_DotProd; - } - } -#endif -#if defined(HAS_ARGBTOYMATRIXROW_LSX) - if (TestCpuFlag(kCpuHasLSX)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_LSX; - if (IS_ALIGNED(width, 16)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_LSX; - } - } -#endif -#if defined(HAS_ARGBTOYMATRIXROW_LASX) - if (TestCpuFlag(kCpuHasLASX)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_LASX; - if (IS_ALIGNED(width, 32)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_LASX; - } - } -#endif -#if defined(HAS_ARGBTOYMATRIXROW_RVV) - if (TestCpuFlag(kCpuHasRVV)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_RVV; - } -#endif - -#if defined(HAS_ARGBTOUVMATRIXROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON; - } - } -#endif -#if defined(HAS_ARGBTOUVMATRIXROW_NEON_I8MM) - if (TestCpuFlag(kCpuHasNEON) && TestCpuFlag(kCpuHasNeonI8MM)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM; - if (IS_ALIGNED(width, 16)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON_I8MM; - } - } -#endif -#if defined(HAS_ARGBTOUVMATRIXROW_SVE2) - if (TestCpuFlag(kCpuHasSVE2)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2; - } -#endif -#if defined(HAS_ARGBTOUVMATRIXROW_SME) - if (TestCpuFlag(kCpuHasSME)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME; - } -#endif + ARGBToUVMatrixRow_C; #if defined(HAS_ARGBTOUVMATRIXROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_SSSE3; @@ -2286,6 +2194,14 @@ ARGBToUVMatrixRow_C; ARGBToUVMatrixRow = ARGBToUVMatrixRow_AVX2; } } +#endif +#if defined(HAS_ARGBTOYMATRIXROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToYMatrixRow = ARGBToYMatrixRow_NEON; + } + } #endif if (!src_argb || !dst_y || !dst_u || !dst_v || !argbconstants || width <= 0 || height == 0) { @@ -3015,76 +2931,21 @@ int RGB24ToI420(const uint8_t* src_rgb24, int width, int height) { int y; +#if defined(HAS_RGB24TOYROW) + void (*RGB24ToUVRow)(const uint8_t* src_rgb24, int src_stride_rgb24, + uint8_t* dst_u, uint8_t* dst_v, int width) = + RGB24ToUVRow_C; + void (*RGB24ToYRow)(const uint8_t* src_rgb24, uint8_t* dst_y, int width) = + RGB24ToYRow_C; +#else void (*RGB24ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) = RGB24ToARGBRow_C; void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, int width) = ARGBToUVRow_C; - void (*ARGBToYMatrixRow)(const uint8_t* src_argb, uint8_t* dst_y, int width, - const struct ArgbConstants* c) = ARGBToYMatrixRow_C; -#if defined(HAS_ARGBTOYMATRIXROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_SSSE3; - } - } + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = + ARGBToYRow_C; #endif -#if defined(HAS_ARGBTOYMATRIXROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBTOYMATRIXROW_AVX512BW) - if (TestCpuFlag(kCpuHasAVX512BW)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_AVX512BW; - if (IS_ALIGNED(width, 64)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_AVX512BW; - } - } -#endif -#if defined(HAS_ARGBTOYMATRIXROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_NEON; - } - } -#endif -#if defined(HAS_ARGBTOYMATRIXROW_NEON_DOTPROD) - if (TestCpuFlag(kCpuHasNeonDotProd)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON_DotProd; - if (IS_ALIGNED(width, 16)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_NEON_DotProd; - } - } -#endif -#if defined(HAS_ARGBTOYMATRIXROW_LSX) - if (TestCpuFlag(kCpuHasLSX)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_LSX; - if (IS_ALIGNED(width, 16)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_LSX; - } - } -#endif -#if defined(HAS_ARGBTOYMATRIXROW_LASX) - if (TestCpuFlag(kCpuHasLASX)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_LASX; - if (IS_ALIGNED(width, 32)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_LASX; - } - } -#endif -#if defined(HAS_ARGBTOYMATRIXROW_RVV) - if (TestCpuFlag(kCpuHasRVV)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_RVV; - } -#endif - - if (!src_rgb24 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } @@ -3095,6 +2956,48 @@ int RGB24ToI420(const uint8_t* src_rgb24, src_stride_rgb24 = -src_stride_rgb24; } +#if defined(HAS_RGB24TOYROW) + +// Neon version does direct RGB24 to YUV. +#if defined(HAS_RGB24TOYROW_NEON) && defined(HAS_RGB24TOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RGB24ToUVRow = RGB24ToUVRow_Any_NEON; + RGB24ToYRow = RGB24ToYRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + RGB24ToYRow = RGB24ToYRow_NEON; + RGB24ToUVRow = RGB24ToUVRow_NEON; + } + } +#endif +#if defined(HAS_RGB24TOYROW_LSX) && defined(HAS_RGB24TOUVROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + RGB24ToUVRow = RGB24ToUVRow_Any_LSX; + RGB24ToYRow = RGB24ToYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + RGB24ToYRow = RGB24ToYRow_LSX; + RGB24ToUVRow = RGB24ToUVRow_LSX; + } + } +#endif +#if defined(HAS_RGB24TOYROW_LASX) && defined(HAS_RGB24TOUVROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + RGB24ToUVRow = RGB24ToUVRow_Any_LASX; + RGB24ToYRow = RGB24ToYRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + RGB24ToYRow = RGB24ToYRow_LASX; + RGB24ToUVRow = RGB24ToUVRow_LASX; + } + } +#endif +#if defined(HAS_RGB24TOYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + RGB24ToYRow = RGB24ToYRow_RVV; + } +#endif + +// Other platforms do intermediate conversion from RGB24 to ARGB. +#else // HAS_RGB24TOYROW + #if defined(HAS_RGB24TOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3; @@ -3103,54 +3006,28 @@ int RGB24ToI420(const uint8_t* src_rgb24, } } #endif -#if defined(HAS_RGB24TOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - RGB24ToARGBRow = RGB24ToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - RGB24ToARGBRow = RGB24ToARGBRow_AVX2; - } - } -#endif -#if defined(HAS_RGB24TOARGBROW_AVX512BW) - if (TestCpuFlag(kCpuHasAVX512BW)) { - RGB24ToARGBRow = RGB24ToARGBRow_Any_AVX512BW; - if (IS_ALIGNED(width, 64)) { - RGB24ToARGBRow = RGB24ToARGBRow_AVX512BW; - } - } -#endif -#if defined(HAS_RGB24TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - RGB24ToARGBRow = RGB24ToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - RGB24ToARGBRow = RGB24ToARGBRow_NEON; - } - } -#endif -#if defined(HAS_RGB24TOARGBROW_SVE2) - if (TestCpuFlag(kCpuHasSVE2)) { - RGB24ToARGBRow = RGB24ToARGBRow_SVE2; - } -#endif -#if defined(HAS_RGB24TOARGBROW_LSX) - if (TestCpuFlag(kCpuHasLSX)) { - RGB24ToARGBRow = RGB24ToARGBRow_Any_LSX; +#if defined(HAS_ARGBTOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - RGB24ToARGBRow = RGB24ToARGBRow_LSX; + ARGBToYRow = ARGBToYRow_SSSE3; } } #endif -#if defined(HAS_RGB24TOARGBROW_LASX) - if (TestCpuFlag(kCpuHasLASX)) { - RGB24ToARGBRow = RGB24ToARGBRow_Any_LASX; +#if defined(HAS_ARGBTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYRow = ARGBToYRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { - RGB24ToARGBRow = RGB24ToARGBRow_LASX; + ARGBToYRow = ARGBToYRow_AVX2; } } #endif -#if defined(HAS_RGB24TOARGBROW_RVV) - if (TestCpuFlag(kCpuHasRVV)) { - RGB24ToARGBRow = RGB24ToARGBRow_RVV; +#if defined(HAS_ARGBTOYROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + ARGBToYRow = ARGBToYRow_Any_AVX512BW; + if (IS_ALIGNED(width, 64)) { + ARGBToYRow = ARGBToYRow_AVX512BW; + } } #endif #if defined(HAS_ARGBTOUVROW_SSSE3) @@ -3177,31 +3054,47 @@ int RGB24ToI420(const uint8_t* src_rgb24, } } #endif +#endif // HAS_RGB24TOYROW { +#if !defined(HAS_RGB24TOYROW) // Allocate 2 rows of ARGB. const int row_size = (width * 4 + 31) & ~31; align_buffer_64(row, row_size * 2); if (!row) return 1; +#endif for (y = 0; y < height - 1; y += 2) { +#if defined(HAS_RGB24TOYROW) + RGB24ToUVRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width); + RGB24ToYRow(src_rgb24, dst_y, width); + RGB24ToYRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width); +#else RGB24ToARGBRow(src_rgb24, row, width); RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + row_size, width); ARGBToUVRow(row, row_size, dst_u, dst_v, width); - ARGBToYMatrixRow(row, dst_y, width, &kArgbI601Constants); - ARGBToYMatrixRow(row + row_size, dst_y + dst_stride_y, width, &kArgbI601Constants); + ARGBToYRow(row, dst_y, width); + ARGBToYRow(row + row_size, dst_y + dst_stride_y, width); +#endif src_rgb24 += src_stride_rgb24 * 2; dst_y += dst_stride_y * 2; dst_u += dst_stride_u; dst_v += dst_stride_v; } if (height & 1) { +#if defined(HAS_RGB24TOYROW) + RGB24ToUVRow(src_rgb24, 0, dst_u, dst_v, width); + RGB24ToYRow(src_rgb24, dst_y, width); +#else RGB24ToARGBRow(src_rgb24, row, width); ARGBToUVRow(row, 0, dst_u, dst_v, width); - ARGBToYMatrixRow(row, dst_y, width, &kArgbI601Constants); + ARGBToYRow(row, dst_y, width); +#endif } +#if !defined(HAS_RGB24TOYROW) free_aligned_buffer_64(row); +#endif } return 0; } @@ -3296,56 +3189,6 @@ int RGB24ToJ420(const uint8_t* src_rgb24, } } #endif -#if defined(HAS_RGB24TOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - RGB24ToARGBRow = RGB24ToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - RGB24ToARGBRow = RGB24ToARGBRow_AVX2; - } - } -#endif -#if defined(HAS_RGB24TOARGBROW_AVX512BW) - if (TestCpuFlag(kCpuHasAVX512BW)) { - RGB24ToARGBRow = RGB24ToARGBRow_Any_AVX512BW; - if (IS_ALIGNED(width, 64)) { - RGB24ToARGBRow = RGB24ToARGBRow_AVX512BW; - } - } -#endif -#if defined(HAS_RGB24TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - RGB24ToARGBRow = RGB24ToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - RGB24ToARGBRow = RGB24ToARGBRow_NEON; - } - } -#endif -#if defined(HAS_RGB24TOARGBROW_SVE2) - if (TestCpuFlag(kCpuHasSVE2)) { - RGB24ToARGBRow = RGB24ToARGBRow_SVE2; - } -#endif -#if defined(HAS_RGB24TOARGBROW_LSX) - if (TestCpuFlag(kCpuHasLSX)) { - RGB24ToARGBRow = RGB24ToARGBRow_Any_LSX; - if (IS_ALIGNED(width, 16)) { - RGB24ToARGBRow = RGB24ToARGBRow_LSX; - } - } -#endif -#if defined(HAS_RGB24TOARGBROW_LASX) - if (TestCpuFlag(kCpuHasLASX)) { - RGB24ToARGBRow = RGB24ToARGBRow_Any_LASX; - if (IS_ALIGNED(width, 32)) { - RGB24ToARGBRow = RGB24ToARGBRow_LASX; - } - } -#endif -#if defined(HAS_RGB24TOARGBROW_RVV) - if (TestCpuFlag(kCpuHasRVV)) { - RGB24ToARGBRow = RGB24ToARGBRow_RVV; - } -#endif #if defined(HAS_ARGBTOYJROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToYJRow = ARGBToYJRow_Any_SSSE3; @@ -3451,76 +3294,20 @@ int RAWToI420(const uint8_t* src_raw, int width, int height) { int y; +#if defined(HAS_RAWTOYROW) + void (*RAWToUVRow)(const uint8_t* src_raw, int src_stride_raw, uint8_t* dst_u, + uint8_t* dst_v, int width) = RAWToUVRow_C; + void (*RAWToYRow)(const uint8_t* src_raw, uint8_t* dst_y, int width) = + RAWToYRow_C; +#else void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) = RAWToARGBRow_C; void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, int width) = ARGBToUVRow_C; - void (*ARGBToYMatrixRow)(const uint8_t* src_argb, uint8_t* dst_y, int width, - const struct ArgbConstants* c) = ARGBToYMatrixRow_C; -#if defined(HAS_ARGBTOYMATRIXROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_SSSE3; - } - } + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = + ARGBToYRow_C; #endif -#if defined(HAS_ARGBTOYMATRIXROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBTOYMATRIXROW_AVX512BW) - if (TestCpuFlag(kCpuHasAVX512BW)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_AVX512BW; - if (IS_ALIGNED(width, 64)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_AVX512BW; - } - } -#endif -#if defined(HAS_ARGBTOYMATRIXROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_NEON; - } - } -#endif -#if defined(HAS_ARGBTOYMATRIXROW_NEON_DOTPROD) - if (TestCpuFlag(kCpuHasNeonDotProd)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON_DotProd; - if (IS_ALIGNED(width, 16)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_NEON_DotProd; - } - } -#endif -#if defined(HAS_ARGBTOYMATRIXROW_LSX) - if (TestCpuFlag(kCpuHasLSX)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_LSX; - if (IS_ALIGNED(width, 16)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_LSX; - } - } -#endif -#if defined(HAS_ARGBTOYMATRIXROW_LASX) - if (TestCpuFlag(kCpuHasLASX)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_LASX; - if (IS_ALIGNED(width, 32)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_LASX; - } - } -#endif -#if defined(HAS_ARGBTOYMATRIXROW_RVV) - if (TestCpuFlag(kCpuHasRVV)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_RVV; - } -#endif - - if (!src_raw || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } @@ -3531,6 +3318,48 @@ int RAWToI420(const uint8_t* src_raw, src_stride_raw = -src_stride_raw; } +#if defined(HAS_RAWTOYROW) + +// Neon version does direct RAW to YUV. +#if defined(HAS_RAWTOYROW_NEON) && defined(HAS_RAWTOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RAWToUVRow = RAWToUVRow_Any_NEON; + RAWToYRow = RAWToYRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + RAWToYRow = RAWToYRow_NEON; + RAWToUVRow = RAWToUVRow_NEON; + } + } +#endif +#if defined(HAS_RAWTOYROW_LSX) && defined(HAS_RAWTOUVROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + RAWToUVRow = RAWToUVRow_Any_LSX; + RAWToYRow = RAWToYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + RAWToYRow = RAWToYRow_LSX; + RAWToUVRow = RAWToUVRow_LSX; + } + } +#endif +#if defined(HAS_RAWTOYROW_LASX) && defined(HAS_RAWTOUVROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + RAWToUVRow = RAWToUVRow_Any_LASX; + RAWToYRow = RAWToYRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + RAWToYRow = RAWToYRow_LASX; + RAWToUVRow = RAWToUVRow_LASX; + } + } +#endif +#if defined(HAS_RAWTOYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + RAWToYRow = RAWToYRow_RVV; + } +#endif + +// Other platforms do intermediate conversion from RAW to ARGB. +#else // HAS_RAWTOYROW + #if defined(HAS_RAWTOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { RAWToARGBRow = RAWToARGBRow_Any_SSSE3; @@ -3547,46 +3376,28 @@ int RAWToI420(const uint8_t* src_raw, } } #endif -#if defined(HAS_RAWTOARGBROW_AVX512BW) - if (TestCpuFlag(kCpuHasAVX512BW)) { - RAWToARGBRow = RAWToARGBRow_Any_AVX512BW; - if (IS_ALIGNED(width, 64)) { - RAWToARGBRow = RAWToARGBRow_AVX512BW; - } - } -#endif -#if defined(HAS_RAWTOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - RAWToARGBRow = RAWToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - RAWToARGBRow = RAWToARGBRow_NEON; - } - } -#endif -#if defined(HAS_RAWTOARGBROW_SVE2) - if (TestCpuFlag(kCpuHasSVE2)) { - RAWToARGBRow = RAWToARGBRow_SVE2; - } -#endif -#if defined(HAS_RAWTOARGBROW_LSX) - if (TestCpuFlag(kCpuHasLSX)) { - RAWToARGBRow = RAWToARGBRow_Any_LSX; +#if defined(HAS_ARGBTOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - RAWToARGBRow = RAWToARGBRow_LSX; + ARGBToYRow = ARGBToYRow_SSSE3; } } #endif -#if defined(HAS_RAWTOARGBROW_LASX) - if (TestCpuFlag(kCpuHasLASX)) { - RAWToARGBRow = RAWToARGBRow_Any_LASX; +#if defined(HAS_ARGBTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYRow = ARGBToYRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { - RAWToARGBRow = RAWToARGBRow_LASX; + ARGBToYRow = ARGBToYRow_AVX2; } } #endif -#if defined(HAS_RAWTOARGBROW_RVV) - if (TestCpuFlag(kCpuHasRVV)) { - RAWToARGBRow = RAWToARGBRow_RVV; +#if defined(HAS_ARGBTOYROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + ARGBToYRow = ARGBToYRow_Any_AVX512BW; + if (IS_ALIGNED(width, 64)) { + ARGBToYRow = ARGBToYRow_AVX512BW; + } } #endif #if defined(HAS_ARGBTOUVROW_SSSE3) @@ -3613,31 +3424,47 @@ int RAWToI420(const uint8_t* src_raw, } } #endif +#endif // HAS_RAWTOYROW { +#if !defined(HAS_RAWTOYROW) // Allocate 2 rows of ARGB. const int row_size = (width * 4 + 31) & ~31; align_buffer_64(row, row_size * 2); if (!row) return 1; +#endif for (y = 0; y < height - 1; y += 2) { +#if defined(HAS_RAWTOYROW) + RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width); + RAWToYRow(src_raw, dst_y, width); + RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width); +#else RAWToARGBRow(src_raw, row, width); RAWToARGBRow(src_raw + src_stride_raw, row + row_size, width); ARGBToUVRow(row, row_size, dst_u, dst_v, width); - ARGBToYMatrixRow(row, dst_y, width, &kArgbI601Constants); - ARGBToYMatrixRow(row + row_size, dst_y + dst_stride_y, width, &kArgbI601Constants); + ARGBToYRow(row, dst_y, width); + ARGBToYRow(row + row_size, dst_y + dst_stride_y, width); +#endif src_raw += src_stride_raw * 2; dst_y += dst_stride_y * 2; dst_u += dst_stride_u; dst_v += dst_stride_v; } if (height & 1) { +#if defined(HAS_RAWTOYROW) + RAWToUVRow(src_raw, 0, dst_u, dst_v, width); + RAWToYRow(src_raw, dst_y, width); +#else RAWToARGBRow(src_raw, row, width); ARGBToUVRow(row, 0, dst_u, dst_v, width); - ARGBToYMatrixRow(row, dst_y, width, &kArgbI601Constants); + ARGBToYRow(row, dst_y, width); +#endif } +#if !defined(HAS_RAWTOYROW) free_aligned_buffer_64(row); +#endif } return 0; } @@ -3744,48 +3571,6 @@ int RAWToJ420(const uint8_t* src_raw, } } #endif -#if defined(HAS_RAWTOARGBROW_AVX512BW) - if (TestCpuFlag(kCpuHasAVX512BW)) { - RAWToARGBRow = RAWToARGBRow_Any_AVX512BW; - if (IS_ALIGNED(width, 64)) { - RAWToARGBRow = RAWToARGBRow_AVX512BW; - } - } -#endif -#if defined(HAS_RAWTOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - RAWToARGBRow = RAWToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - RAWToARGBRow = RAWToARGBRow_NEON; - } - } -#endif -#if defined(HAS_RAWTOARGBROW_SVE2) - if (TestCpuFlag(kCpuHasSVE2)) { - RAWToARGBRow = RAWToARGBRow_SVE2; - } -#endif -#if defined(HAS_RAWTOARGBROW_LSX) - if (TestCpuFlag(kCpuHasLSX)) { - RAWToARGBRow = RAWToARGBRow_Any_LSX; - if (IS_ALIGNED(width, 16)) { - RAWToARGBRow = RAWToARGBRow_LSX; - } - } -#endif -#if defined(HAS_RAWTOARGBROW_LASX) - if (TestCpuFlag(kCpuHasLASX)) { - RAWToARGBRow = RAWToARGBRow_Any_LASX; - if (IS_ALIGNED(width, 32)) { - RAWToARGBRow = RAWToARGBRow_LASX; - } - } -#endif -#if defined(HAS_RAWTOARGBROW_RVV) - if (TestCpuFlag(kCpuHasRVV)) { - RAWToARGBRow = RAWToARGBRow_RVV; - } -#endif #if defined(HAS_ARGBTOYJROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToYJRow = ARGBToYJRow_Any_SSSE3; @@ -4028,14 +3813,6 @@ int RAWToI444(const uint8_t* src_raw, } } #endif -#if defined(HAS_RAWTOARGBROW_AVX512BW) - if (TestCpuFlag(kCpuHasAVX512BW)) { - RAWToARGBRow = RAWToARGBRow_Any_AVX512BW; - if (IS_ALIGNED(width, 64)) { - RAWToARGBRow = RAWToARGBRow_AVX512BW; - } - } -#endif #if defined(HAS_RAWTOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { RAWToARGBRow = RAWToARGBRow_Any_NEON; @@ -4247,14 +4024,6 @@ int RAWToJ444(const uint8_t* src_raw, } } #endif -#if defined(HAS_RAWTOARGBROW_AVX512BW) - if (TestCpuFlag(kCpuHasAVX512BW)) { - RAWToARGBRow = RAWToARGBRow_Any_AVX512BW; - if (IS_ALIGNED(width, 64)) { - RAWToARGBRow = RAWToARGBRow_AVX512BW; - } - } -#endif #if defined(HAS_RAWTOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { RAWToARGBRow = RAWToARGBRow_Any_NEON; @@ -4913,72 +4682,8 @@ int RGB24ToJ400(const uint8_t* src_rgb24, int width, int height) { int y; - void (*RGB24ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) = - RGB24ToARGBRow_C; - void (*ARGBToYMatrixRow)(const uint8_t* src_argb, uint8_t* dst_y, int width, - const struct ArgbConstants* c) = ARGBToYMatrixRow_C; -#if defined(HAS_ARGBTOYMATRIXROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOYMATRIXROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBTOYMATRIXROW_AVX512BW) - if (TestCpuFlag(kCpuHasAVX512BW)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_AVX512BW; - if (IS_ALIGNED(width, 64)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_AVX512BW; - } - } -#endif -#if defined(HAS_ARGBTOYMATRIXROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_NEON; - } - } -#endif -#if defined(HAS_ARGBTOYMATRIXROW_NEON_DOTPROD) - if (TestCpuFlag(kCpuHasNeonDotProd)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON_DotProd; - if (IS_ALIGNED(width, 16)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_NEON_DotProd; - } - } -#endif -#if defined(HAS_ARGBTOYMATRIXROW_LSX) - if (TestCpuFlag(kCpuHasLSX)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_LSX; - if (IS_ALIGNED(width, 16)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_LSX; - } - } -#endif -#if defined(HAS_ARGBTOYMATRIXROW_LASX) - if (TestCpuFlag(kCpuHasLASX)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_LASX; - if (IS_ALIGNED(width, 32)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_LASX; - } - } -#endif -#if defined(HAS_ARGBTOYMATRIXROW_RVV) - if (TestCpuFlag(kCpuHasRVV)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_RVV; - } -#endif - + void (*RGB24ToYJRow)(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) = + RGB24ToYJRow_C; if (!src_rgb24 || !dst_yj || width <= 0 || height == 0) { return -1; } @@ -4993,78 +4698,56 @@ int RGB24ToJ400(const uint8_t* src_rgb24, height = 1; src_stride_rgb24 = dst_stride_yj = 0; } -#if defined(HAS_RGB24TOARGBROW_SSSE3) +#if defined(HAS_RGB24TOYJROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3; + RGB24ToYJRow = RGB24ToYJRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - RGB24ToARGBRow = RGB24ToARGBRow_SSSE3; + RGB24ToYJRow = RGB24ToYJRow_SSSE3; } } #endif -#if defined(HAS_RGB24TOARGBROW_AVX2) +#if defined(HAS_RGB24TOYJROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - RGB24ToARGBRow = RGB24ToARGBRow_Any_AVX2; + RGB24ToYJRow = RGB24ToYJRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { - RGB24ToARGBRow = RGB24ToARGBRow_AVX2; + RGB24ToYJRow = RGB24ToYJRow_AVX2; } } #endif -#if defined(HAS_RGB24TOARGBROW_AVX512BW) - if (TestCpuFlag(kCpuHasAVX512BW)) { - RGB24ToARGBRow = RGB24ToARGBRow_Any_AVX512BW; - if (IS_ALIGNED(width, 64)) { - RGB24ToARGBRow = RGB24ToARGBRow_AVX512BW; - } - } -#endif -#if defined(HAS_RGB24TOARGBROW_NEON) +#if defined(HAS_RGB24TOYJROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - RGB24ToARGBRow = RGB24ToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - RGB24ToARGBRow = RGB24ToARGBRow_NEON; - } - } -#endif -#if defined(HAS_RGB24TOARGBROW_SVE2) - if (TestCpuFlag(kCpuHasSVE2)) { - RGB24ToARGBRow = RGB24ToARGBRow_SVE2; - } -#endif -#if defined(HAS_RGB24TOARGBROW_LSX) - if (TestCpuFlag(kCpuHasLSX)) { - RGB24ToARGBRow = RGB24ToARGBRow_Any_LSX; + RGB24ToYJRow = RGB24ToYJRow_Any_NEON; if (IS_ALIGNED(width, 16)) { - RGB24ToARGBRow = RGB24ToARGBRow_LSX; + RGB24ToYJRow = RGB24ToYJRow_NEON; } } #endif -#if defined(HAS_RGB24TOARGBROW_LASX) +#if defined(HAS_RGB24TOYJROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + RGB24ToYJRow = RGB24ToYJRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + RGB24ToYJRow = RGB24ToYJRow_LSX; + } + } +#endif +#if defined(HAS_RGB24TOYJROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { - RGB24ToARGBRow = RGB24ToARGBRow_Any_LASX; + RGB24ToYJRow = RGB24ToYJRow_Any_LASX; if (IS_ALIGNED(width, 32)) { - RGB24ToARGBRow = RGB24ToARGBRow_LASX; + RGB24ToYJRow = RGB24ToYJRow_LASX; } } #endif -#if defined(HAS_RGB24TOARGBROW_RVV) +#if defined(HAS_RGB24TOYJROW_RVV) if (TestCpuFlag(kCpuHasRVV)) { - RGB24ToARGBRow = RGB24ToARGBRow_RVV; + RGB24ToYJRow = RGB24ToYJRow_RVV; } #endif -{ - // Allocate 1 row of ARGB. - const int row_size = (width * 4 + 31) & ~31; - align_buffer_64(row, row_size); - if (!row) - return 1; - for (y = 0; y < height; ++y) { - RGB24ToARGBRow(src_rgb24, row, width); - ARGBToYMatrixRow(row, dst_yj, width, &kArgbJPEGConstants); - src_rgb24 += src_stride_rgb24; - dst_yj += dst_stride_yj; - } - free_aligned_buffer_64(row); + for (y = 0; y < height; ++y) { + RGB24ToYJRow(src_rgb24, dst_yj, width); + src_rgb24 += src_stride_rgb24; + dst_yj += dst_stride_yj; } return 0; } @@ -5078,76 +4761,12 @@ int RAWToJ400(const uint8_t* src_raw, int width, int height) { int y; - void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) = - RAWToARGBRow_C; - void (*ARGBToYMatrixRow)(const uint8_t* src_argb, uint8_t* dst_y, int width, - const struct ArgbConstants* c) = ARGBToYMatrixRow_C; -#if defined(HAS_ARGBTOYMATRIXROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOYMATRIXROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBTOYMATRIXROW_AVX512BW) - if (TestCpuFlag(kCpuHasAVX512BW)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_AVX512BW; - if (IS_ALIGNED(width, 64)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_AVX512BW; - } - } -#endif -#if defined(HAS_ARGBTOYMATRIXROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_NEON; - } - } -#endif -#if defined(HAS_ARGBTOYMATRIXROW_NEON_DOTPROD) - if (TestCpuFlag(kCpuHasNeonDotProd)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON_DotProd; - if (IS_ALIGNED(width, 16)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_NEON_DotProd; - } - } -#endif -#if defined(HAS_ARGBTOYMATRIXROW_LSX) - if (TestCpuFlag(kCpuHasLSX)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_LSX; - if (IS_ALIGNED(width, 16)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_LSX; - } - } -#endif -#if defined(HAS_ARGBTOYMATRIXROW_LASX) - if (TestCpuFlag(kCpuHasLASX)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_LASX; - if (IS_ALIGNED(width, 32)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_LASX; - } - } -#endif -#if defined(HAS_ARGBTOYMATRIXROW_RVV) - if (TestCpuFlag(kCpuHasRVV)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_RVV; - } -#endif - - + void (*RAWToYJRow)(const uint8_t* src_raw, uint8_t* dst_yj, int width) = + RAWToYJRow_C; if (!src_raw || !dst_yj || width <= 0 || height == 0) { return -1; } + if (height < 0) { height = -height; src_raw = src_raw + (height - 1) * src_stride_raw; @@ -5160,79 +4779,56 @@ int RAWToJ400(const uint8_t* src_raw, src_stride_raw = dst_stride_yj = 0; } -#if defined(HAS_RAWTOARGBROW_SSSE3) +#if defined(HAS_RAWTOYJROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - RAWToARGBRow = RAWToARGBRow_Any_SSSE3; + RAWToYJRow = RAWToYJRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - RAWToARGBRow = RAWToARGBRow_SSSE3; + RAWToYJRow = RAWToYJRow_SSSE3; } } #endif -#if defined(HAS_RAWTOARGBROW_AVX2) +#if defined(HAS_RAWTOYJROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - RAWToARGBRow = RAWToARGBRow_Any_AVX2; + RAWToYJRow = RAWToYJRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { - RAWToARGBRow = RAWToARGBRow_AVX2; + RAWToYJRow = RAWToYJRow_AVX2; } } #endif -#if defined(HAS_RAWTOARGBROW_AVX512BW) - if (TestCpuFlag(kCpuHasAVX512BW)) { - RAWToARGBRow = RAWToARGBRow_Any_AVX512BW; - if (IS_ALIGNED(width, 64)) { - RAWToARGBRow = RAWToARGBRow_AVX512BW; - } - } -#endif -#if defined(HAS_RAWTOARGBROW_NEON) +#if defined(HAS_RAWTOYJROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - RAWToARGBRow = RAWToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - RAWToARGBRow = RAWToARGBRow_NEON; - } - } -#endif -#if defined(HAS_RAWTOARGBROW_SVE2) - if (TestCpuFlag(kCpuHasSVE2)) { - RAWToARGBRow = RAWToARGBRow_SVE2; - } -#endif -#if defined(HAS_RAWTOARGBROW_LSX) - if (TestCpuFlag(kCpuHasLSX)) { - RAWToARGBRow = RAWToARGBRow_Any_LSX; + RAWToYJRow = RAWToYJRow_Any_NEON; if (IS_ALIGNED(width, 16)) { - RAWToARGBRow = RAWToARGBRow_LSX; + RAWToYJRow = RAWToYJRow_NEON; } } #endif -#if defined(HAS_RAWTOARGBROW_LASX) +#if defined(HAS_RAWTOYJROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + RAWToYJRow = RAWToYJRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + RAWToYJRow = RAWToYJRow_LSX; + } + } +#endif +#if defined(HAS_RAWTOYJROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { - RAWToARGBRow = RAWToARGBRow_Any_LASX; + RAWToYJRow = RAWToYJRow_Any_LASX; if (IS_ALIGNED(width, 32)) { - RAWToARGBRow = RAWToARGBRow_LASX; + RAWToYJRow = RAWToYJRow_LASX; } } #endif -#if defined(HAS_RAWTOARGBROW_RVV) +#if defined(HAS_RAWTOYJROW_RVV) if (TestCpuFlag(kCpuHasRVV)) { - RAWToARGBRow = RAWToARGBRow_RVV; + RAWToYJRow = RAWToYJRow_RVV; } #endif - { - // Allocate 1 row of ARGB. - const int row_size = (width * 4 + 31) & ~31; - align_buffer_64(row, row_size); - if (!row) - return 1; - - for (y = 0; y < height; ++y) { - RAWToARGBRow(src_raw, row, width); - ARGBToYMatrixRow(row, dst_yj, width, &kArgbJPEGConstants); - src_raw += src_stride_raw; - dst_yj += dst_stride_yj; - } - free_aligned_buffer_64(row); + for (y = 0; y < height; ++y) { + RAWToYJRow(src_raw, dst_yj, width); + src_raw += src_stride_raw; + dst_yj += dst_stride_yj; } return 0; } diff --git a/source/convert_argb.cc b/source/convert_argb.cc index 7672a6692..794f24903 100644 --- a/source/convert_argb.cc +++ b/source/convert_argb.cc @@ -3638,22 +3638,6 @@ int RGB24ToARGB(const uint8_t* src_rgb24, } } #endif -#if defined(HAS_RGB24TOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - RGB24ToARGBRow = RGB24ToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - RGB24ToARGBRow = RGB24ToARGBRow_AVX2; - } - } -#endif -#if defined(HAS_RGB24TOARGBROW_AVX512BW) - if (TestCpuFlag(kCpuHasAVX512BW)) { - RGB24ToARGBRow = RGB24ToARGBRow_Any_AVX512BW; - if (IS_ALIGNED(width, 64)) { - RGB24ToARGBRow = RGB24ToARGBRow_AVX512BW; - } - } -#endif #if defined(HAS_RGB24TOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { RGB24ToARGBRow = RGB24ToARGBRow_Any_NEON; @@ -3688,7 +3672,8 @@ int RGB24ToARGB(const uint8_t* src_rgb24, RGB24ToARGBRow = RGB24ToARGBRow_RVV; } #endif -for (y = 0; y < height; ++y) { + + for (y = 0; y < height; ++y) { RGB24ToARGBRow(src_rgb24, dst_argb, width); src_rgb24 += src_stride_rgb24; dst_argb += dst_stride_argb; @@ -3738,14 +3723,6 @@ int RAWToARGB(const uint8_t* src_raw, } } #endif -#if defined(HAS_RAWTOARGBROW_AVX512BW) - if (TestCpuFlag(kCpuHasAVX512BW)) { - RAWToARGBRow = RAWToARGBRow_Any_AVX512BW; - if (IS_ALIGNED(width, 64)) { - RAWToARGBRow = RAWToARGBRow_AVX512BW; - } - } -#endif #if defined(HAS_RAWTOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { RAWToARGBRow = RAWToARGBRow_Any_NEON; diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc index 2c66611e6..7f7be08ea 100644 --- a/source/convert_from_argb.cc +++ b/source/convert_from_argb.cc @@ -199,70 +199,7 @@ int ARGBToI444Matrix(const uint8_t* src_argb, void (*ARGBToUV444MatrixRow)(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width, const struct ArgbConstants* c) = -ARGBToUV444MatrixRow_C; - -#if defined(HAS_ARGBTOYMATRIXROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOYMATRIXROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBTOYMATRIXROW_AVX512BW) - if (TestCpuFlag(kCpuHasAVX512BW)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_AVX512BW; - if (IS_ALIGNED(width, 64)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_AVX512BW; - } - } -#endif -#if defined(HAS_ARGBTOYMATRIXROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_NEON; - } - } -#endif -#if defined(HAS_ARGBTOYMATRIXROW_NEON_DOTPROD) - if (TestCpuFlag(kCpuHasNeonDotProd)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON_DotProd; - if (IS_ALIGNED(width, 16)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_NEON_DotProd; - } - } -#endif -#if defined(HAS_ARGBTOYMATRIXROW_LSX) - if (TestCpuFlag(kCpuHasLSX)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_LSX; - if (IS_ALIGNED(width, 16)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_LSX; - } - } -#endif -#if defined(HAS_ARGBTOYMATRIXROW_LASX) - if (TestCpuFlag(kCpuHasLASX)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_LASX; - if (IS_ALIGNED(width, 32)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_LASX; - } - } -#endif -#if defined(HAS_ARGBTOYMATRIXROW_RVV) - if (TestCpuFlag(kCpuHasRVV)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_RVV; - } -#endif - + ARGBToUV444MatrixRow_C; #if defined(HAS_ARGBTOUV444MATRIXROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToUV444MatrixRow = ARGBToUV444MatrixRow_Any_SSSE3; @@ -287,6 +224,14 @@ ARGBToUV444MatrixRow_C; } } #endif +#if defined(HAS_ARGBTOYMATRIXROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToYMatrixRow = ARGBToYMatrixRow_NEON; + } + } +#endif #if defined(HAS_ARGBTOUV444MATRIXROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToUV444MatrixRow = ARGBToUV444MatrixRow_Any_NEON; @@ -510,96 +455,7 @@ int ARGBToI422Matrix(const uint8_t* src_argb, void (*ARGBToUVMatrixRow)(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, int width, const struct ArgbConstants* c) = -ARGBToUVMatrixRow_C; - -#if defined(HAS_ARGBTOYMATRIXROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOYMATRIXROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBTOYMATRIXROW_AVX512BW) - if (TestCpuFlag(kCpuHasAVX512BW)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_AVX512BW; - if (IS_ALIGNED(width, 64)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_AVX512BW; - } - } -#endif -#if defined(HAS_ARGBTOYMATRIXROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_NEON; - } - } -#endif -#if defined(HAS_ARGBTOYMATRIXROW_NEON_DOTPROD) - if (TestCpuFlag(kCpuHasNeonDotProd)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON_DotProd; - if (IS_ALIGNED(width, 16)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_NEON_DotProd; - } - } -#endif -#if defined(HAS_ARGBTOYMATRIXROW_LSX) - if (TestCpuFlag(kCpuHasLSX)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_LSX; - if (IS_ALIGNED(width, 16)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_LSX; - } - } -#endif -#if defined(HAS_ARGBTOYMATRIXROW_LASX) - if (TestCpuFlag(kCpuHasLASX)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_LASX; - if (IS_ALIGNED(width, 32)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_LASX; - } - } -#endif -#if defined(HAS_ARGBTOYMATRIXROW_RVV) - if (TestCpuFlag(kCpuHasRVV)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_RVV; - } -#endif - -#if defined(HAS_ARGBTOUVMATRIXROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON; - } - } -#endif -#if defined(HAS_ARGBTOUVMATRIXROW_NEON_I8MM) - if (TestCpuFlag(kCpuHasNEON) && TestCpuFlag(kCpuHasNeonI8MM)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM; - if (IS_ALIGNED(width, 16)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON_I8MM; - } - } -#endif -#if defined(HAS_ARGBTOUVMATRIXROW_SVE2) - if (TestCpuFlag(kCpuHasSVE2)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2; - } -#endif -#if defined(HAS_ARGBTOUVMATRIXROW_SME) - if (TestCpuFlag(kCpuHasSME)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME; - } -#endif + ARGBToUVMatrixRow_C; #if defined(HAS_ARGBTOUVMATRIXROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_SSSE3; @@ -615,6 +471,14 @@ ARGBToUVMatrixRow_C; ARGBToUVMatrixRow = ARGBToUVMatrixRow_AVX2; } } +#endif +#if defined(HAS_ARGBTOYMATRIXROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToYMatrixRow = ARGBToYMatrixRow_NEON; + } + } #endif if (!src_argb || !dst_y || !dst_u || !dst_v || !argbconstants || width <= 0 || height == 0) { @@ -795,7 +659,7 @@ int ARGBToNV12(const uint8_t* src_argb, #if defined(HAS_MERGEUVROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { MergeUVRow = MergeUVRow_Any_AVX2; - if (IS_ALIGNED(halfwidth, 32)) { + if (IS_ALIGNED(halfwidth, 16)) { MergeUVRow = MergeUVRow_AVX2; } } @@ -877,96 +741,7 @@ int ARGBToNV12Matrix(const uint8_t* src_argb, void (*ARGBToUVMatrixRow)(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, int width, const struct ArgbConstants* c) = -ARGBToUVMatrixRow_C; - -#if defined(HAS_ARGBTOYMATRIXROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOYMATRIXROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBTOYMATRIXROW_AVX512BW) - if (TestCpuFlag(kCpuHasAVX512BW)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_AVX512BW; - if (IS_ALIGNED(width, 64)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_AVX512BW; - } - } -#endif -#if defined(HAS_ARGBTOYMATRIXROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_NEON; - } - } -#endif -#if defined(HAS_ARGBTOYMATRIXROW_NEON_DOTPROD) - if (TestCpuFlag(kCpuHasNeonDotProd)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON_DotProd; - if (IS_ALIGNED(width, 16)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_NEON_DotProd; - } - } -#endif -#if defined(HAS_ARGBTOYMATRIXROW_LSX) - if (TestCpuFlag(kCpuHasLSX)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_LSX; - if (IS_ALIGNED(width, 16)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_LSX; - } - } -#endif -#if defined(HAS_ARGBTOYMATRIXROW_LASX) - if (TestCpuFlag(kCpuHasLASX)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_LASX; - if (IS_ALIGNED(width, 32)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_LASX; - } - } -#endif -#if defined(HAS_ARGBTOYMATRIXROW_RVV) - if (TestCpuFlag(kCpuHasRVV)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_RVV; - } -#endif - -#if defined(HAS_ARGBTOUVMATRIXROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON; - } - } -#endif -#if defined(HAS_ARGBTOUVMATRIXROW_NEON_I8MM) - if (TestCpuFlag(kCpuHasNEON) && TestCpuFlag(kCpuHasNeonI8MM)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM; - if (IS_ALIGNED(width, 16)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON_I8MM; - } - } -#endif -#if defined(HAS_ARGBTOUVMATRIXROW_SVE2) - if (TestCpuFlag(kCpuHasSVE2)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2; - } -#endif -#if defined(HAS_ARGBTOUVMATRIXROW_SME) - if (TestCpuFlag(kCpuHasSME)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME; - } -#endif + ARGBToUVMatrixRow_C; #if defined(HAS_ARGBTOUVMATRIXROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_SSSE3; @@ -982,6 +757,14 @@ ARGBToUVMatrixRow_C; ARGBToUVMatrixRow = ARGBToUVMatrixRow_AVX2; } } +#endif +#if defined(HAS_ARGBTOYMATRIXROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToYMatrixRow = ARGBToYMatrixRow_NEON; + } + } #endif void (*MergeUVRow)(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width) = MergeUVRow_C; @@ -1006,7 +789,7 @@ ARGBToUVMatrixRow_C; #if defined(HAS_MERGEUVROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { MergeUVRow = MergeUVRow_Any_AVX2; - if (IS_ALIGNED(halfwidth, 32)) { + if (IS_ALIGNED(halfwidth, 16)) { MergeUVRow = MergeUVRow_AVX2; } } @@ -1240,7 +1023,7 @@ int ARGBToNV21(const uint8_t* src_argb, #if defined(HAS_MERGEUVROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { MergeUVRow = MergeUVRow_Any_AVX2; - if (IS_ALIGNED(halfwidth, 32)) { + if (IS_ALIGNED(halfwidth, 16)) { MergeUVRow = MergeUVRow_AVX2; } } @@ -1460,7 +1243,7 @@ int ABGRToNV12(const uint8_t* src_abgr, #if defined(HAS_MERGEUVROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { MergeUVRow = MergeUVRow_Any_AVX2; - if (IS_ALIGNED(halfwidth, 32)) { + if (IS_ALIGNED(halfwidth, 16)) { MergeUVRow = MergeUVRow_AVX2; } } @@ -1673,7 +1456,7 @@ int ABGRToNV21(const uint8_t* src_abgr, #if defined(HAS_MERGEUVROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { MergeUVRow = MergeUVRow_Any_AVX2; - if (IS_ALIGNED(halfwidth, 32)) { + if (IS_ALIGNED(halfwidth, 16)) { MergeUVRow = MergeUVRow_AVX2; } } @@ -4117,93 +3900,41 @@ int ARGBToAB64(const uint8_t* src_argb, return 0; } -// Convert RAW to NV21 with Matrix. +// Enabled if 1 pass is available +#if defined(HAS_RAWTOYJROW_NEON) || defined(HAS_RAWTOYJROW_RVV) +#define HAS_RAWTOYJROW +#endif + +// RAW to JNV21 full range NV21 LIBYUV_API -int RAWToNV21Matrix(const uint8_t* src_raw, - int src_stride_raw, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_vu, - int dst_stride_vu, - const struct ArgbConstants* argbconstants, - int width, - int height) { +int RAWToJNV21(const uint8_t* src_raw, + int src_stride_raw, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_vu, + int dst_stride_vu, + int width, + int height) { int y; int halfwidth = (width + 1) >> 1; +#if defined(HAS_RAWTOYJROW) + void (*RAWToUVJRow)(const uint8_t* src_raw, int src_stride_raw, + uint8_t* dst_uj, uint8_t* dst_vj, int width) = + RAWToUVJRow_C; + void (*RAWToYJRow)(const uint8_t* src_raw, uint8_t* dst_y, int width) = + RAWToYJRow_C; +#else void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) = RAWToARGBRow_C; - void (*ARGBToUVMatrixRow)(const uint8_t* src_argb0, int src_stride_argb, - uint8_t* dst_u, uint8_t* dst_v, int width, - const struct ArgbConstants* c) = - ARGBToUVMatrixRow_C; - void (*ARGBToYMatrixRow)(const uint8_t* src_argb, uint8_t* dst_y, int width, - const struct ArgbConstants* c) = ARGBToYMatrixRow_C; + void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_uj, uint8_t* dst_vj, int width) = + ARGBToUVJRow_C; + void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = + ARGBToYJRow_C; +#endif void (*MergeUVRow)(const uint8_t* src_uj, const uint8_t* src_vj, uint8_t* dst_vu, int width) = MergeUVRow_C; -#if defined(HAS_ARGBTOYMATRIXROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOYMATRIXROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBTOYMATRIXROW_AVX512BW) - if (TestCpuFlag(kCpuHasAVX512BW)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_AVX512BW; - if (IS_ALIGNED(width, 64)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_AVX512BW; - } - } -#endif -#if defined(HAS_ARGBTOYMATRIXROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_NEON; - } - } -#endif -#if defined(HAS_ARGBTOYMATRIXROW_NEON_DOTPROD) - if (TestCpuFlag(kCpuHasNeonDotProd)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON_DotProd; - if (IS_ALIGNED(width, 16)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_NEON_DotProd; - } - } -#endif -#if defined(HAS_ARGBTOYMATRIXROW_LSX) - if (TestCpuFlag(kCpuHasLSX)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_LSX; - if (IS_ALIGNED(width, 16)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_LSX; - } - } -#endif -#if defined(HAS_ARGBTOYMATRIXROW_LASX) - if (TestCpuFlag(kCpuHasLASX)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_LASX; - if (IS_ALIGNED(width, 32)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_LASX; - } - } -#endif -#if defined(HAS_ARGBTOYMATRIXROW_RVV) - if (TestCpuFlag(kCpuHasRVV)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_RVV; - } -#endif - - - if (!src_raw || !dst_y || !dst_vu || !argbconstants || width <= 0 || height == 0) { + if (!src_raw || !dst_y || !dst_vu || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -4213,6 +3944,44 @@ int RAWToNV21Matrix(const uint8_t* src_raw, src_stride_raw = -src_stride_raw; } +#if defined(HAS_RAWTOYJROW) + +// Neon version does direct RAW to YUV. +#if defined(HAS_RAWTOYJROW_NEON) && defined(HAS_RAWTOUVJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RAWToUVJRow = RAWToUVJRow_Any_NEON; + RAWToYJRow = RAWToYJRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + RAWToYJRow = RAWToYJRow_NEON; + RAWToUVJRow = RAWToUVJRow_NEON; + } + } +#endif +#if defined(HAS_RAWTOYJROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + RAWToYJRow = RAWToYJRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + RAWToYJRow = RAWToYJRow_LSX; + } + } +#endif +#if defined(HAS_RAWTOYJROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + RAWToYJRow = RAWToYJRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + RAWToYJRow = RAWToYJRow_LASX; + } + } +#endif +#if defined(HAS_RAWTOYJROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + RAWToYJRow = RAWToYJRow_RVV; + } +#endif + +// Other platforms do intermediate conversion from RAW to ARGB. +#else // HAS_RAWTOYJROW + #if defined(HAS_RAWTOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { RAWToARGBRow = RAWToARGBRow_Any_SSSE3; @@ -4229,99 +3998,47 @@ int RAWToNV21Matrix(const uint8_t* src_raw, } } #endif -#if defined(HAS_RAWTOARGBROW_AVX512BW) - if (TestCpuFlag(kCpuHasAVX512BW)) { - RAWToARGBRow = RAWToARGBRow_Any_AVX512BW; - if (IS_ALIGNED(width, 64)) { - RAWToARGBRow = RAWToARGBRow_AVX512BW; - } - } -#endif -#if defined(HAS_RAWTOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - RAWToARGBRow = RAWToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - RAWToARGBRow = RAWToARGBRow_NEON; - } - } -#endif -#if defined(HAS_RAWTOARGBROW_SVE2) - if (TestCpuFlag(kCpuHasSVE2)) { - RAWToARGBRow = RAWToARGBRow_SVE2; - } -#endif -#if defined(HAS_RAWTOARGBROW_LSX) - if (TestCpuFlag(kCpuHasLSX)) { - RAWToARGBRow = RAWToARGBRow_Any_LSX; - if (IS_ALIGNED(width, 16)) { - RAWToARGBRow = RAWToARGBRow_LSX; - } - } -#endif -#if defined(HAS_RAWTOARGBROW_LASX) - if (TestCpuFlag(kCpuHasLASX)) { - RAWToARGBRow = RAWToARGBRow_Any_LASX; - if (IS_ALIGNED(width, 32)) { - RAWToARGBRow = RAWToARGBRow_LASX; - } - } -#endif -#if defined(HAS_RAWTOARGBROW_RVV) - if (TestCpuFlag(kCpuHasRVV)) { - RAWToARGBRow = RAWToARGBRow_RVV; - } -#endif -#if defined(HAS_ARGBTOUVMATRIXROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON; - } - } -#endif -#if defined(HAS_ARGBTOUVMATRIXROW_NEON_I8MM) - if (TestCpuFlag(kCpuHasNEON) && TestCpuFlag(kCpuHasNeonI8MM)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM; - if (IS_ALIGNED(width, 16)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON_I8MM; - } - } -#endif -#if defined(HAS_ARGBTOUVMATRIXROW_SVE2) - if (TestCpuFlag(kCpuHasSVE2)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2; - } -#endif -#if defined(HAS_ARGBTOUVMATRIXROW_SME) - if (TestCpuFlag(kCpuHasSME)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME; - } -#endif -#if defined(HAS_ARGBTOUVMATRIXROW_SSSE3) +#if defined(HAS_ARGBTOYJROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOUVMATRIXROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_AVX2; + ARGBToYJRow = ARGBToYJRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_AVX2; + ARGBToYJRow = ARGBToYJRow_SSSE3; } } #endif -#if defined(HAS_ARGBTOUVMATRIXROW_AVX512BW) - if (TestCpuFlag(kCpuHasAVX512BW)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_AVX512BW; +#if defined(HAS_ARGBTOYJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYJRow = ARGBToYJRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_AVX512BW; + ARGBToYJRow = ARGBToYJRow_AVX2; } } #endif - +#if defined(HAS_ARGBTOUVJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVJRow = ARGBToUVJRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOUVJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVJRow = ARGBToUVJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVJRow = ARGBToUVJRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOUVJROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + ARGBToUVJRow = ARGBToUVJRow_Any_AVX512BW; + if (IS_ALIGNED(width, 64)) { + ARGBToUVJRow = ARGBToUVJRow_AVX512BW; + } + } +#endif +#endif // HAS_RAWTOYJROW #if defined(HAS_MERGEUVROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { MergeUVRow = MergeUVRow_Any_SSE2; @@ -4333,7 +4050,7 @@ int RAWToNV21Matrix(const uint8_t* src_raw, #if defined(HAS_MERGEUVROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { MergeUVRow = MergeUVRow_Any_AVX2; - if (IS_ALIGNED(halfwidth, 32)) { + if (IS_ALIGNED(halfwidth, 16)) { MergeUVRow = MergeUVRow_AVX2; } } @@ -4372,86 +4089,58 @@ int RAWToNV21Matrix(const uint8_t* src_raw, MergeUVRow = MergeUVRow_RVV; } #endif - { - // Allocate 2 rows of ARGB. - const int row_size = (width * 4 + 31) & ~31; - align_buffer_64(row, row_size * 2); - // Allocate 1 row of U and 1 row of V. - align_buffer_64(row_u, halfwidth); - align_buffer_64(row_v, halfwidth); - - if (!row || !row_u || !row_v) { - free_aligned_buffer_64(row); - free_aligned_buffer_64(row_u); - free_aligned_buffer_64(row_v); +#if defined(HAS_RAWTOYJROW) + // Allocate a row of uv. + const int row_uv_size = ((halfwidth + 31) & ~31); + align_buffer_64(row_uj, row_uv_size * 2); + uint8_t* row_vj = row_uj + row_uv_size; +#else + // Allocate row of uv and 2 rows of ARGB. + const int row_size = ((width * 4 + 31) & ~31); + const int row_uv_size = ((halfwidth + 31) & ~31); + align_buffer_64(row_uj, row_uv_size * 2 + row_size * 2); + uint8_t* row_vj = row_uj + row_uv_size; + uint8_t* row = row_vj + row_uv_size; +#endif + if (!row_uj) return 1; - } for (y = 0; y < height - 1; y += 2) { +#if defined(HAS_RAWTOYJROW) + RAWToUVJRow(src_raw, src_stride_raw, row_uj, row_vj, width); + MergeUVRow(row_vj, row_uj, dst_vu, halfwidth); + RAWToYJRow(src_raw, dst_y, width); + RAWToYJRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width); +#else RAWToARGBRow(src_raw, row, width); RAWToARGBRow(src_raw + src_stride_raw, row + row_size, width); - ARGBToUVMatrixRow(row, row_size, row_u, row_v, width, argbconstants); - MergeUVRow(row_v, row_u, dst_vu, halfwidth); - ARGBToYMatrixRow(row, dst_y, width, argbconstants); - ARGBToYMatrixRow(row + row_size, dst_y + dst_stride_y, width, argbconstants); + ARGBToUVJRow(row, row_size, row_uj, row_vj, width); + MergeUVRow(row_vj, row_uj, dst_vu, halfwidth); + ARGBToYJRow(row, dst_y, width); + ARGBToYJRow(row + row_size, dst_y + dst_stride_y, width); +#endif src_raw += src_stride_raw * 2; dst_y += dst_stride_y * 2; dst_vu += dst_stride_vu; } if (height & 1) { +#if defined(HAS_RAWTOYJROW) + RAWToUVJRow(src_raw, 0, row_uj, row_vj, width); + MergeUVRow(row_vj, row_uj, dst_vu, halfwidth); + RAWToYJRow(src_raw, dst_y, width); +#else RAWToARGBRow(src_raw, row, width); - ARGBToUVMatrixRow(row, 0, row_u, row_v, width, argbconstants); - MergeUVRow(row_v, row_u, dst_vu, halfwidth); - ARGBToYMatrixRow(row, dst_y, width, argbconstants); + ARGBToUVJRow(row, 0, row_uj, row_vj, width); + MergeUVRow(row_vj, row_uj, dst_vu, halfwidth); + ARGBToYJRow(row, dst_y, width); +#endif } - free_aligned_buffer_64(row_v); - free_aligned_buffer_64(row_u); - free_aligned_buffer_64(row); + free_aligned_buffer_64(row_uj); } return 0; } - -LIBYUV_API -int RAWToJNV21(const uint8_t* src_raw, - int src_stride_raw, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_vu, - int dst_stride_vu, - int width, - int height) { - return RAWToNV21Matrix(src_raw, src_stride_raw, dst_y, dst_stride_y, dst_vu, - dst_stride_vu, &kArgbJPEGConstants, width, height); -} - -LIBYUV_API -int RAWToNV21(const uint8_t* src_raw, - int src_stride_raw, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_vu, - int dst_stride_vu, - int width, - int height) { - return RAWToNV21Matrix(src_raw, src_stride_raw, dst_y, dst_stride_y, dst_vu, - dst_stride_vu, &kArgbI601Constants, width, height); -} - -LIBYUV_API -int RGB24ToNV12(const uint8_t* src_rgb24, - int src_stride_rgb24, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_uv, - int dst_stride_uv, - int width, - int height) { - return RAWToNV21Matrix(src_rgb24, src_stride_rgb24, dst_y, dst_stride_y, - dst_uv, dst_stride_uv, &kAbgrI601Constants, width, - height); -} - +#undef HAS_RAWTOYJROW #ifdef __cplusplus } // extern "C" diff --git a/source/planar_functions.cc b/source/planar_functions.cc index fde3717a4..96cac25f3 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -693,7 +693,7 @@ void MergeUVPlane(const uint8_t* src_u, #if defined(HAS_MERGEUVROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { MergeUVRow = MergeUVRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { + if (IS_ALIGNED(width, 16)) { MergeUVRow = MergeUVRow_AVX2; } } diff --git a/source/rotate_gcc.cc b/source/rotate_gcc.cc index 9847ecd48..ae7436b12 100644 --- a/source/rotate_gcc.cc +++ b/source/rotate_gcc.cc @@ -101,11 +101,11 @@ void TransposeWx8_SSSE3(const uint8_t* src, "movq %%xmm7,(%1,%4) \n" "lea (%1,%4,2),%1 \n" "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "r"((ptrdiff_t)(src_stride)), // %3 - "r"((ptrdiff_t)(dst_stride)) // %4 + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)) // %4 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } @@ -243,11 +243,11 @@ void TransposeWx8_Fast_SSSE3(const uint8_t* src, "movq %%xmm15,(%1,%4) \n" "lea (%1,%4,2),%1 \n" "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "r"((ptrdiff_t)(src_stride)), // %3 - "r"((ptrdiff_t)(dst_stride)) // %4 + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)) // %4 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15"); @@ -356,13 +356,13 @@ void TransposeUVWx8_SSE2(const uint8_t* src, "movhpd %%xmm8,(%2,%6) \n" "lea (%2,%6,2),%2 \n" "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst_a), // %1 - "+r"(dst_b), // %2 - "+r"(width) // %3 - : "r"((ptrdiff_t)(src_stride)), // %4 - "r"((ptrdiff_t)(dst_stride_a)), // %5 - "r"((ptrdiff_t)(dst_stride_b)) // %6 + : "+r"(src), // %0 + "+r"(dst_a), // %1 + "+r"(dst_b), // %2 + "+r"(width) // %3 + : "r"((intptr_t)(src_stride)), // %4 + "r"((intptr_t)(dst_stride_a)), // %5 + "r"((intptr_t)(dst_stride_b)) // %6 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9"); } diff --git a/source/row_any.cc b/source/row_any.cc index 82a4abe8d..8ac48d3c0 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -616,7 +616,7 @@ ANY31PT(MergeXRGB16To8Row_Any_NEON, ANY21(MergeUVRow_Any_SSE2, MergeUVRow_SSE2, 0, 1, 1, 2, 15) #endif #ifdef HAS_MERGEUVROW_AVX2 -ANY21(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, 0, 1, 1, 2, 31) +ANY21(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, 0, 1, 1, 2, 15) #endif #ifdef HAS_MERGEUVROW_AVX512BW ANY21(MergeUVRow_Any_AVX512BW, MergeUVRow_AVX512BW, 0, 1, 1, 2, 31) @@ -1000,12 +1000,6 @@ ANY11(ARGB4444ToARGBRow_Any_SSE2, ARGB4444ToARGBRow_SSE2, 0, 2, 4, 7) #if defined(HAS_RAWTOARGBROW_AVX2) ANY11(RAWToARGBRow_Any_AVX2, RAWToARGBRow_AVX2, 0, 3, 4, 31) #endif -#if defined(HAS_RAWTOARGBROW_AVX512BW) -ANY11(RAWToARGBRow_Any_AVX512BW, RAWToARGBRow_AVX512BW, 0, 3, 4, 63) -#endif -#if defined(HAS_RGB24TOARGBROW_AVX512BW) -ANY11(RGB24ToARGBRow_Any_AVX512BW, RGB24ToARGBRow_AVX512BW, 0, 3, 4, 63) -#endif #if defined(HAS_RAWTORGBAROW_SSSE3) ANY11(RAWToRGBARow_Any_SSSE3, RAWToRGBARow_SSSE3, 0, 3, 4, 15) #endif @@ -1206,36 +1200,52 @@ ANY11(RGBAToYRow_Any_LSX, RGBAToYRow_LSX, 0, 4, 1, 15) ANY11(RGBAToYRow_Any_LASX, RGBAToYRow_LASX, 0, 4, 1, 31) #endif #ifdef HAS_RGB24TOYROW_NEON +ANY11(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 0, 3, 1, 15) #endif #ifdef HAS_RGB24TOYJROW_AVX2 +ANY11(RGB24ToYJRow_Any_AVX2, RGB24ToYJRow_AVX2, 0, 3, 1, 31) #endif #ifdef HAS_RGB24TOYJROW_SSSE3 +ANY11(RGB24ToYJRow_Any_SSSE3, RGB24ToYJRow_SSSE3, 0, 3, 1, 15) #endif #ifdef HAS_RGB24TOYJROW_NEON +ANY11(RGB24ToYJRow_Any_NEON, RGB24ToYJRow_NEON, 0, 3, 1, 15) #endif #ifdef HAS_RGB24TOYROW_LSX +ANY11(RGB24ToYRow_Any_LSX, RGB24ToYRow_LSX, 0, 3, 1, 15) #endif #ifdef HAS_RGB24TOYJROW_LSX +ANY11(RGB24ToYJRow_Any_LSX, RGB24ToYJRow_LSX, 0, 3, 1, 15) #endif #ifdef HAS_RGB24TOYJROW_LASX +ANY11(RGB24ToYJRow_Any_LASX, RGB24ToYJRow_LASX, 0, 3, 1, 31) #endif #ifdef HAS_RGB24TOYROW_LASX +ANY11(RGB24ToYRow_Any_LASX, RGB24ToYRow_LASX, 0, 3, 1, 31) #endif #ifdef HAS_RAWTOYROW_NEON +ANY11(RAWToYRow_Any_NEON, RAWToYRow_NEON, 0, 3, 1, 15) #endif #ifdef HAS_RAWTOYJROW_AVX2 +ANY11(RAWToYJRow_Any_AVX2, RAWToYJRow_AVX2, 0, 3, 1, 31) #endif #ifdef HAS_RAWTOYJROW_SSSE3 +ANY11(RAWToYJRow_Any_SSSE3, RAWToYJRow_SSSE3, 0, 3, 1, 15) #endif #ifdef HAS_RAWTOYJROW_NEON +ANY11(RAWToYJRow_Any_NEON, RAWToYJRow_NEON, 0, 3, 1, 15) #endif #ifdef HAS_RAWTOYROW_LSX +ANY11(RAWToYRow_Any_LSX, RAWToYRow_LSX, 0, 3, 1, 15) #endif #ifdef HAS_RAWTOYROW_LASX +ANY11(RAWToYRow_Any_LASX, RAWToYRow_LASX, 0, 3, 1, 31) #endif #ifdef HAS_RAWTOYJROW_LSX +ANY11(RAWToYJRow_Any_LSX, RAWToYJRow_LSX, 0, 3, 1, 15) #endif #ifdef HAS_RAWTOYJROW_LASX +ANY11(RAWToYJRow_Any_LASX, RAWToYJRow_LASX, 0, 3, 1, 31) #endif #ifdef HAS_RGB565TOYROW_NEON ANY11(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 0, 2, 1, 15) @@ -2264,12 +2274,6 @@ ANY14(SplitARGBRow_Any_NEON, SplitARGBRow_NEON, 4, 15) memcpy(dst_v + (np >> 1), vout + 256, SS(r, 1)); \ } -#ifdef HAS_ARGBTOUVMATRIXROW_NEON -ANY12MS(ARGBToUVMatrixRow_Any_NEON, ARGBToUVMatrixRow_NEON, 0, 4, 15) -#endif -#ifdef HAS_ARGBTOUVMATRIXROW_NEON_I8MM -ANY12MS(ARGBToUVMatrixRow_Any_NEON_I8MM, ARGBToUVMatrixRow_NEON_I8MM, 0, 4, 15) -#endif #ifdef HAS_ARGBTOUVMATRIXROW_AVX2 ANY12MS(ARGBToUVMatrixRow_Any_AVX2, ARGBToUVMatrixRow_AVX2, 0, 4, 15) #endif @@ -2320,18 +2324,6 @@ ANY11MC(ARGBToYMatrixRow_Any_AVX512BW, ARGBToYMatrixRow_AVX512BW, 4, 63) #ifdef HAS_ARGBTOYMATRIXROW_NEON ANY11MC(ARGBToYMatrixRow_Any_NEON, ARGBToYMatrixRow_NEON, 4, 15) #endif -#ifdef HAS_ARGBTOYMATRIXROW_NEON_DOTPROD -ANY11MC(ARGBToYMatrixRow_Any_NEON_DotProd, ARGBToYMatrixRow_NEON_DotProd, 4, 15) -#endif -#ifdef HAS_ARGBTOYMATRIXROW_LSX -ANY11MC(ARGBToYMatrixRow_Any_LSX, ARGBToYMatrixRow_LSX, 4, 15) -#endif -#ifdef HAS_ARGBTOYMATRIXROW_LASX -ANY11MC(ARGBToYMatrixRow_Any_LASX, ARGBToYMatrixRow_LASX, 4, 31) -#endif -#ifdef HAS_ARGBTOYMATRIXROW_RVV -ANY11MC(ARGBToYMatrixRow_Any_RVV, ARGBToYMatrixRow_RVV, 4, 15) -#endif #undef ANY11MC #ifdef HAS_ARGBTOUVROW_AVX2 diff --git a/source/row_common.cc b/source/row_common.cc index b2a0ec12b..8b192a539 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -678,6 +678,8 @@ MAKEROWY(ARGB, 2, 1, 0, 4) MAKEROWY(BGRA, 1, 2, 3, 4) MAKEROWY(ABGR, 0, 1, 2, 4) MAKEROWY(RGBA, 3, 2, 1, 4) +MAKEROWY(RGB24, 2, 1, 0, 3) +MAKEROWY(RAW, 0, 1, 2, 3) #undef MAKEROWY // JPeg uses BT.601-1 full range @@ -751,6 +753,8 @@ static __inline uint8_t RGBToVJ(uint8_t r, uint8_t g, uint8_t b) { MAKEROWYJ(ARGB, 2, 1, 0, 4) MAKEROWYJ(ABGR, 0, 1, 2, 4) MAKEROWYJ(RGBA, 3, 2, 1, 4) +MAKEROWYJ(RGB24, 2, 1, 0, 3) +MAKEROWYJ(RAW, 0, 1, 2, 3) #undef MAKEROWYJ static __inline uint8_t RGBToYMatrix(uint8_t r, @@ -4375,21 +4379,69 @@ void NV12ToRGB565Row_AVX2(const uint8_t* src_y, #ifdef HAS_RGB24TOYJROW_AVX2 // Convert 16 RGB24 pixels (64 bytes) to 16 YJ values. +void RGB24ToYJRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + RGB24ToARGBRow_SSSE3(src_rgb24, row, twidth); + ARGBToYJRow_AVX2(row, dst_yj, twidth); + src_rgb24 += twidth * 3; + dst_yj += twidth; + width -= twidth; + } } #endif // HAS_RGB24TOYJROW_AVX2 #ifdef HAS_RAWTOYJROW_AVX2 // Convert 32 RAW pixels (128 bytes) to 32 YJ values. +void RAWToYJRow_AVX2(const uint8_t* src_raw, uint8_t* dst_yj, int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; +#ifdef HAS_RAWTOARGBROW_AVX2 + RAWToARGBRow_AVX2(src_raw, row, twidth); +#else + RAWToARGBRow_SSSE3(src_raw, row, twidth); +#endif + ARGBToYJRow_AVX2(row, dst_yj, twidth); + src_raw += twidth * 3; + dst_yj += twidth; + width -= twidth; + } } #endif // HAS_RAWTOYJROW_AVX2 #ifdef HAS_RGB24TOYJROW_SSSE3 // Convert 16 RGB24 pixels (64 bytes) to 16 YJ values. +void RGB24ToYJRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + RGB24ToARGBRow_SSSE3(src_rgb24, row, twidth); + ARGBToYJRow_SSSE3(row, dst_yj, twidth); + src_rgb24 += twidth * 3; + dst_yj += twidth; + width -= twidth; + } } #endif // HAS_RGB24TOYJROW_SSSE3 #ifdef HAS_RAWTOYJROW_SSSE3 // Convert 16 RAW pixels (64 bytes) to 16 YJ values. +void RAWToYJRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_yj, int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + RAWToARGBRow_SSSE3(src_raw, row, twidth); + ARGBToYJRow_SSSE3(row, dst_yj, twidth); + src_raw += twidth * 3; + dst_yj += twidth; + width -= twidth; + } } #endif // HAS_RAWTOYJROW_SSSE3 diff --git a/source/row_gcc.cc b/source/row_gcc.cc index 0da6e2ada..9ed7fce9c 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -262,64 +262,6 @@ void RAWToARGBRow_AVX2(const uint8_t* src_raw, uint8_t* dst_argb, int width) { : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } -#ifdef HAS_RAWTOARGBROW_AVX512BW -static const uint32_t kPermdRAWToARGB_AVX512BW[16] = { - 0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 9, 10, 11, 12}; - -void RGBToARGBRow_AVX512BW(const uint8_t* src_raw, uint8_t* dst_argb, const uint32_t* shuffler, int width) { - asm volatile( - "vpternlogd $0xff,%%zmm6,%%zmm6,%%zmm6 \n" // 0xffffffff - "vpslld $0x18,%%zmm6,%%zmm6 \n" // 0xff000000 - "movabs $0xffffffffffff,%%rax \n" // 48 bytes mask - "kmovq %%rax,%%k1 \n" - "vmovdqu32 %3,%%zmm5 \n" - "vbroadcasti32x4 %4,%%zmm4 \n" - - LABELALIGN // - "1: \n" - "vmovdqu8 (%0),%%zmm0%{%%k1%}%{z%} \n" - "vmovdqu8 48(%0),%%zmm1%{%%k1%}%{z%} \n" - "vmovdqu8 96(%0),%%zmm2%{%%k1%}%{z%} \n" - "vmovdqu8 144(%0),%%zmm3%{%%k1%}%{z%} \n" - "lea 192(%0),%0 \n" - "vpermd %%zmm0,%%zmm5,%%zmm0 \n" - "vpermd %%zmm1,%%zmm5,%%zmm1 \n" - "vpermd %%zmm2,%%zmm5,%%zmm2 \n" - "vpermd %%zmm3,%%zmm5,%%zmm3 \n" - "vpshufb %%zmm4,%%zmm0,%%zmm0 \n" - "vpshufb %%zmm4,%%zmm1,%%zmm1 \n" - "vpshufb %%zmm4,%%zmm2,%%zmm2 \n" - "vpshufb %%zmm4,%%zmm3,%%zmm3 \n" - "vpord %%zmm6,%%zmm0,%%zmm0 \n" - "vpord %%zmm6,%%zmm1,%%zmm1 \n" - "vpord %%zmm6,%%zmm2,%%zmm2 \n" - "vpord %%zmm6,%%zmm3,%%zmm3 \n" - "vmovdqu32 %%zmm0,(%1) \n" - "vmovdqu32 %%zmm1,0x40(%1) \n" - "vmovdqu32 %%zmm2,0x80(%1) \n" - "vmovdqu32 %%zmm3,0xc0(%1) \n" - "lea 0x100(%1),%1 \n" - "sub $0x40,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_raw), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kPermdRAWToARGB_AVX512BW), // %3 - "m"(*shuffler) // %4 - : "memory", "cc", "rax", "k1", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6"); -} - -void RAWToARGBRow_AVX512BW(const uint8_t* src_raw, uint8_t* dst_argb, int width) { - RGBToARGBRow_AVX512BW(src_raw, dst_argb, (const uint32_t*)&kShuffleMaskRAWToARGB, width); -} - -void RGB24ToARGBRow_AVX512BW(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) { - RGBToARGBRow_AVX512BW(src_rgb24, dst_argb, (const uint32_t*)&kShuffleMaskRGB24ToARGB, width); -} -#endif - - // Same code as RAWToARGB with different shuffler and A in low bits void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { asm volatile( @@ -1913,9 +1855,9 @@ void ARGBToUVMatrixRow_SSSE3(const uint8_t* src_argb, #else "+rm"(width) // %3 #endif - : "r"((ptrdiff_t)(src_stride_argb)), // %4 - "r"(c), // %5 - "m"(kShuffleAARRGGBB) // %6 + : "r"((intptr_t)(src_stride_argb)), // %4 + "r"(c), // %5 + "m"(kShuffleAARRGGBB) // %6 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } @@ -1988,9 +1930,9 @@ void ARGBToUVMatrixRow_AVX2(const uint8_t* src_argb, #else "+rm"(width) // %3 #endif - : "r"((ptrdiff_t)(src_stride_argb)), // %4 - "r"(c), // %5 - "m"(kShuffleAARRGGBB) // %6 + : "r"((intptr_t)(src_stride_argb)), // %4 + "r"(c), // %5 + "m"(kShuffleAARRGGBB) // %6 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } @@ -2293,11 +2235,11 @@ void ARGBToUVMatrixRow_AVX512BW(const uint8_t* src_argb, #else "+rm"(width) // %3 #endif - : "r"((ptrdiff_t)(src_stride_argb)), // %4 - "r"(c), // %5 - "m"(kShuffleAARRGGBB), // %6 - "m"(kPermdARGBToY_AVX512BW), // %7 - "m"(kPermdARGBToUV_AVX512BW) // %8 + : "r"((intptr_t)(src_stride_argb)), // %4 + "r"(c), // %5 + "m"(kShuffleAARRGGBB), // %6 + "m"(kPermdARGBToY_AVX512BW), // %7 + "m"(kPermdARGBToUV_AVX512BW) // %8 : "memory", "cc", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm16", "zmm17", "zmm18", "zmm19"); } @@ -4649,7 +4591,7 @@ static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) { - ptrdiff_t temp_width = (ptrdiff_t)(width); + intptr_t temp_width = (intptr_t)(width); asm volatile("movdqa %3,%%xmm5 \n" LABELALIGN @@ -4670,7 +4612,7 @@ void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) { #ifdef HAS_MIRRORROW_AVX2 void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { - ptrdiff_t temp_width = (ptrdiff_t)(width); + intptr_t temp_width = (intptr_t)(width); asm volatile("vbroadcastf128 %3,%%ymm5 \n" LABELALIGN @@ -4697,7 +4639,7 @@ static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u, 6u, 7u, 4u, 5u, 2u, 3u, 0u, 1u}; void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width) { - ptrdiff_t temp_width = (ptrdiff_t)(width); + intptr_t temp_width = (intptr_t)(width); asm volatile("movdqa %3,%%xmm5 \n" LABELALIGN @@ -4718,7 +4660,7 @@ void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width) { #ifdef HAS_MIRRORUVROW_AVX2 void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width) { - ptrdiff_t temp_width = (ptrdiff_t)(width); + intptr_t temp_width = (intptr_t)(width); asm volatile("vbroadcastf128 %3,%%ymm5 \n" LABELALIGN @@ -4747,7 +4689,7 @@ void MirrorSplitUVRow_SSSE3(const uint8_t* src, uint8_t* dst_u, uint8_t* dst_v, int width) { - ptrdiff_t temp_width = (ptrdiff_t)(width); + intptr_t temp_width = (intptr_t)(width); asm volatile( "movdqa %4,%%xmm1 \n" "lea -0x10(%0,%3,2),%0 \n" @@ -4786,7 +4728,7 @@ static const uvec8 kShuffleMirrorRGB1 = { void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_rgb24, int width) { - ptrdiff_t temp_width = (ptrdiff_t)(width); + intptr_t temp_width = (intptr_t)(width); src_rgb24 += width * 3 - 48; asm volatile( "movdqa %3,%%xmm4 \n" @@ -4822,7 +4764,7 @@ void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24, #ifdef HAS_ARGBMIRRORROW_SSE2 void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { - ptrdiff_t temp_width = (ptrdiff_t)(width); + intptr_t temp_width = (intptr_t)(width); asm volatile("lea -0x10(%0,%2,4),%0 \n" LABELALIGN @@ -4846,7 +4788,7 @@ void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { // Shuffle table for reversing the bytes. static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { - ptrdiff_t temp_width = (ptrdiff_t)(width); + intptr_t temp_width = (intptr_t)(width); asm volatile("vmovdqu %3,%%ymm5 \n" LABELALIGN @@ -6867,10 +6809,10 @@ void YUY2ToNVUVRow_SSE2(const uint8_t* src_yuy2, "lea 0x10(%1),%1 \n" "sub $0x10,%2 \n" "jg 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_uv), // %1 - "+r"(width) // %2 - : "r"((ptrdiff_t)(stride_yuy2)) // %3 + : "+r"(src_yuy2), // %0 + "+r"(dst_uv), // %1 + "+r"(width) // %2 + : "r"((intptr_t)(stride_yuy2)) // %3 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); } @@ -6906,11 +6848,11 @@ void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2, "lea 0x8(%1),%1 \n" "sub $0x10,%3 \n" "jg 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : "r"((ptrdiff_t)(stride_yuy2)) // %4 + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"((intptr_t)(stride_yuy2)) // %4 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } @@ -7001,11 +6943,11 @@ void UYVYToUVRow_SSE2(const uint8_t* src_uyvy, "lea 0x8(%1),%1 \n" "sub $0x10,%3 \n" "jg 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : "r"((ptrdiff_t)(stride_uyvy)) // %4 + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"((intptr_t)(stride_uyvy)) // %4 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } @@ -7092,10 +7034,10 @@ void YUY2ToNVUVRow_AVX2(const uint8_t* src_yuy2, "sub $0x20,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_uv), // %1 - "+r"(width) // %2 - : "r"((ptrdiff_t)(stride_yuy2)) // %3 + : "+r"(src_yuy2), // %0 + "+r"(dst_uv), // %1 + "+r"(width) // %2 + : "r"((intptr_t)(stride_yuy2)) // %3 : "memory", "cc", "xmm0", "xmm1"); } @@ -7132,11 +7074,11 @@ void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2, "sub $0x20,%3 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : "r"((ptrdiff_t)(stride_yuy2)) // %4 + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"((intptr_t)(stride_yuy2)) // %4 : "memory", "cc", "xmm0", "xmm1", "xmm5"); } @@ -7232,11 +7174,11 @@ void UYVYToUVRow_AVX2(const uint8_t* src_uyvy, "sub $0x20,%3 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : "r"((ptrdiff_t)(stride_uyvy)) // %4 + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"((intptr_t)(stride_uyvy)) // %4 : "memory", "cc", "xmm0", "xmm1", "xmm5"); } @@ -8596,12 +8538,12 @@ void CumulativeSumToAverageRow_SSE2(const int32_t* topleft, "sub $0x1,%3 \n" "jge 10b \n" "19: \n" - : "+r"(topleft), // %0 - "+r"(botleft), // %1 - "+r"(dst), // %2 - "+rm"(count) // %3 - : "r"((ptrdiff_t)(width)), // %4 - "rm"(area) // %5 + : "+r"(topleft), // %0 + "+r"(botleft), // %1 + "+r"(dst), // %2 + "+rm"(count) // %3 + : "r"((intptr_t)(width)), // %4 + "rm"(area) // %5 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } #endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 @@ -8614,7 +8556,7 @@ void ARGBAffineRow_SSE2(const uint8_t* src_argb, uint8_t* dst_argb, const float* src_dudv, int width) { - ptrdiff_t src_argb_stride_temp = src_argb_stride; + intptr_t src_argb_stride_temp = src_argb_stride; intptr_t temp; asm volatile( "movq (%3),%%xmm2 \n" @@ -8766,11 +8708,11 @@ void InterpolateRow_SSSE3(uint8_t* dst_ptr, "jg 100b \n" "99: \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+rm"(width), // %2 - "+r"(source_y_fraction) // %3 - : "r"(src_stride) // %4 + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+rm"(width), // %2 + "+r"(source_y_fraction) // %3 + : "r"((intptr_t)(src_stride)) // %4 : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif // HAS_INTERPOLATEROW_SSSE3 @@ -8844,11 +8786,11 @@ void InterpolateRow_AVX2(uint8_t* dst_ptr, "99: \n" "vzeroupper \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(width), // %2 - "+r"(source_y_fraction) // %3 - : "r"(src_stride) // %4 + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(width), // %2 + "+r"(source_y_fraction) // %3 + : "r"((intptr_t)(src_stride)) // %4 : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm4", "xmm5"); } #endif // HAS_INTERPOLATEROW_AVX2 @@ -9678,12 +9620,12 @@ void HalfMergeUVRow_SSSE3(const uint8_t* src_u, "lea 0x10(%2),%2 \n" "sub $0x10,%3 \n" // 16 src pixels per loop "jg 1b \n" - : "+r"(src_u), // %0 - "+r"(src_v), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 - : "r"((ptrdiff_t)(src_stride_u)), // %4 - "r"((ptrdiff_t)(src_stride_v)) // %5 + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : "r"((intptr_t)(src_stride_u)), // %4 + "r"((intptr_t)(src_stride_v)) // %5 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } @@ -9724,12 +9666,12 @@ void HalfMergeUVRow_AVX2(const uint8_t* src_u, "sub $0x20,%3 \n" // 32 src pixels per loop "jg 1b \n" "vzeroupper \n" - : "+r"(src_u), // %0 - "+r"(src_v), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 - : "r"((ptrdiff_t)(src_stride_u)), // %4 - "r"((ptrdiff_t)(src_stride_v)) // %5 + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : "r"((intptr_t)(src_stride_u)), // %4 + "r"((intptr_t)(src_stride_v)) // %5 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } diff --git a/source/row_lasx.cc b/source/row_lasx.cc index 94cb44ed1..19deb9a8f 100644 --- a/source/row_lasx.cc +++ b/source/row_lasx.cc @@ -2013,24 +2013,24 @@ void NV21ToARGBRow_LASX(const uint8_t* src_y, } } -#ifndef ArgbConstants -struct ArgbConstants { +#ifndef RgbConstants +struct RgbConstants { uint8_t kRGBToY[4]; uint16_t kAddY; uint16_t pad; }; -#define ArgbConstants ArgbConstants +#define RgbConstants RgbConstants // RGB to JPeg coefficients // B * 0.1140 coefficient = 29 // G * 0.5870 coefficient = 150 // R * 0.2990 coefficient = 77 // Add 0.5 = 0x80 -static const struct ArgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, +static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, 128, 0}; -static const struct ArgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0}; +static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0}; // RGB to BT.601 coefficients // B * 0.1016 coefficient = 25 @@ -2038,20 +2038,20 @@ static const struct ArgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0} // R * 0.2578 coefficient = 66 // Add 16.5 = 0x1080 -static const struct ArgbConstants kRgb24I601Constants = {{25, 129, 66, 0}, +static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0}, 0x1080, 0}; -static const struct ArgbConstants kRawI601Constants = {{66, 129, 25, 0}, +static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0}, 0x1080, 0}; -#endif // ArgbConstants +#endif // RgbConstants // ARGB expects first 3 values to contain RGB and 4th value is ignored. -void ARGBToYMatrixRow_LASX(const uint8_t* src_argb, +static void ARGBToYMatrixRow_LASX(const uint8_t* src_argb, uint8_t* dst_y, int width, - const struct ArgbConstants* c) { + const struct RgbConstants* rgbconstants) { int32_t shuff[8] = {0, 4, 1, 5, 2, 6, 3, 7}; asm volatile( "xvldrepl.b $xr0, %3, 0 \n\t" // load rgbconstants @@ -2088,7 +2088,7 @@ void ARGBToYMatrixRow_LASX(const uint8_t* src_argb, : "+&r"(src_argb), // %0 "+&r"(dst_y), // %1 "+&r"(width) // %2 - : "r"(c), "r"(shuff) + : "r"(rgbconstants), "r"(shuff) : "memory"); } @@ -2113,7 +2113,7 @@ void ABGRToYJRow_LASX(const uint8_t* src_abgr, uint8_t* dst_yj, int width) { static void RGBAToYMatrixRow_LASX(const uint8_t* src_rgba, uint8_t* dst_y, int width, - const struct ArgbConstants* c) { + const struct RgbConstants* rgbconstants) { int32_t shuff[8] = {0, 4, 1, 5, 2, 6, 3, 7}; asm volatile( "xvldrepl.b $xr0, %3, 0 \n\t" // load rgbconstants @@ -2150,7 +2150,7 @@ static void RGBAToYMatrixRow_LASX(const uint8_t* src_rgba, : "+&r"(src_rgba), // %0 "+&r"(dst_y), // %1 "+&r"(width) // %2 - : "r"(c), "r"(shuff) + : "r"(rgbconstants), "r"(shuff) : "memory"); } @@ -2169,7 +2169,7 @@ void BGRAToYRow_LASX(const uint8_t* src_bgra, uint8_t* dst_y, int width) { static void RGBToYMatrixRow_LASX(const uint8_t* src_rgba, uint8_t* dst_y, int width, - const struct ArgbConstants* c) { + const struct RgbConstants* rgbconstants) { int8_t shuff[128] = { 0, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18, 20, 21, 23, 0, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18, 20, 21, 23, @@ -2219,14 +2219,26 @@ static void RGBToYMatrixRow_LASX(const uint8_t* src_rgba, : "+&r"(src_rgba), // %0 "+&r"(dst_y), // %1 "+&r"(width) // %2 - : "r"(c), // %3 + : "r"(rgbconstants), // %3 "r"(shuff) // %4 : "memory"); } +void RGB24ToYJRow_LASX(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { + RGBToYMatrixRow_LASX(src_rgb24, dst_yj, width, &kRgb24JPEGConstants); +} +void RAWToYJRow_LASX(const uint8_t* src_raw, uint8_t* dst_yj, int width) { + RGBToYMatrixRow_LASX(src_raw, dst_yj, width, &kRawJPEGConstants); +} +void RGB24ToYRow_LASX(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { + RGBToYMatrixRow_LASX(src_rgb24, dst_y, width, &kRgb24I601Constants); +} +void RAWToYRow_LASX(const uint8_t* src_raw, uint8_t* dst_y, int width) { + RGBToYMatrixRow_LASX(src_raw, dst_y, width, &kRawI601Constants); +} void ARGBToUVJRow_LASX(const uint8_t* src_argb, int src_stride_argb, diff --git a/source/row_lsx.cc b/source/row_lsx.cc index 41689578a..d3cc2b5d9 100644 --- a/source/row_lsx.cc +++ b/source/row_lsx.cc @@ -2798,24 +2798,24 @@ void HalfFloatRow_LSX(const uint16_t* src, } } -#ifndef ArgbConstants -struct ArgbConstants { +#ifndef RgbConstants +struct RgbConstants { uint8_t kRGBToY[4]; uint16_t kAddY; uint16_t pad; }; -#define ArgbConstants ArgbConstants +#define RgbConstants RgbConstants // RGB to JPeg coefficients // B * 0.1140 coefficient = 29 // G * 0.5870 coefficient = 150 // R * 0.2990 coefficient = 77 // Add 0.5 = 0x80 -static const struct ArgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, +static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, 128, 0}; -static const struct ArgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0}; +static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0}; // RGB to BT.601 coefficients // B * 0.1016 coefficient = 25 @@ -2823,20 +2823,20 @@ static const struct ArgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0} // R * 0.2578 coefficient = 66 // Add 16.5 = 0x1080 -static const struct ArgbConstants kRgb24I601Constants = {{25, 129, 66, 0}, +static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0}, 0x1080, 0}; -static const struct ArgbConstants kRawI601Constants = {{66, 129, 25, 0}, +static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0}, 0x1080, 0}; -#endif // ArgbConstants +#endif // RgbConstants // ARGB expects first 3 values to contain RGB and 4th value is ignored. -void ARGBToYMatrixRow_LSX(const uint8_t* src_argb, +static void ARGBToYMatrixRow_LSX(const uint8_t* src_argb, uint8_t* dst_y, int width, - const struct ArgbConstants* c) { + const struct RgbConstants* rgbconstants) { asm volatile( "vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants "vldrepl.b $vr1, %3, 1 \n\t" // load rgbconstants @@ -2870,7 +2870,7 @@ void ARGBToYMatrixRow_LSX(const uint8_t* src_argb, : "+&r"(src_argb), // %0 "+&r"(dst_y), // %1 "+&r"(width) // %2 - : "r"(c) + : "r"(rgbconstants) : "memory"); } @@ -2895,7 +2895,7 @@ void ABGRToYJRow_LSX(const uint8_t* src_abgr, uint8_t* dst_yj, int width) { static void RGBAToYMatrixRow_LSX(const uint8_t* src_rgba, uint8_t* dst_y, int width, - const struct ArgbConstants* c) { + const struct RgbConstants* rgbconstants) { asm volatile( "vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants "vldrepl.b $vr1, %3, 1 \n\t" // load rgbconstants @@ -2929,7 +2929,7 @@ static void RGBAToYMatrixRow_LSX(const uint8_t* src_rgba, : "+&r"(src_rgba), // %0 "+&r"(dst_y), // %1 "+&r"(width) // %2 - : "r"(c) + : "r"(rgbconstants) : "memory"); } @@ -2948,7 +2948,7 @@ void BGRAToYRow_LSX(const uint8_t* src_bgra, uint8_t* dst_y, int width) { static void RGBToYMatrixRow_LSX(const uint8_t* src_rgba, uint8_t* dst_y, int width, - const struct ArgbConstants* c) { + const struct RgbConstants* rgbconstants) { int8_t shuff[64] = {0, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18, 20, 21, 23, 24, 26, 27, 29, 30, 0, 1, 3, 4, 6, 7, 9, 10, 12, 13, 15, 1, 0, 4, 0, 7, 0, 10, @@ -2990,14 +2990,26 @@ static void RGBToYMatrixRow_LSX(const uint8_t* src_rgba, : "+&r"(src_rgba), // %0 "+&r"(dst_y), // %1 "+&r"(width) // %2 - : "r"(c), // %3 + : "r"(rgbconstants), // %3 "r"(shuff) // %4 : "memory"); } +void RGB24ToYJRow_LSX(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { + RGBToYMatrixRow_LSX(src_rgb24, dst_yj, width, &kRgb24JPEGConstants); +} +void RAWToYJRow_LSX(const uint8_t* src_raw, uint8_t* dst_yj, int width) { + RGBToYMatrixRow_LSX(src_raw, dst_yj, width, &kRawJPEGConstants); +} +void RGB24ToYRow_LSX(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { + RGBToYMatrixRow_LSX(src_rgb24, dst_y, width, &kRgb24I601Constants); +} +void RAWToYRow_LSX(const uint8_t* src_raw, uint8_t* dst_y, int width) { + RGBToYMatrixRow_LSX(src_raw, dst_y, width, &kRawI601Constants); +} // undef for unified sources build #undef YUVTORGB_SETUP diff --git a/source/row_neon.cc b/source/row_neon.cc index 895e6f113..6c3118913 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -1918,72 +1918,6 @@ void ARGBToUVJ444Row_NEON(const uint8_t* src_argb, // clang-format on // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr. -void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width, - const struct ArgbConstants* c) { - asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_argb - "vld1.8 {d18}, [%5] \n" // load kRGBToU - "vld1.8 {d19}, [%6] \n" // load kRGBToV - "vmovl.s8 q8, d18 \n" // U coeffs in q8 (d16, d17) - "vmovl.s8 q9, d19 \n" // V coeffs in q9 (d18, d19) - "vdup.16 q10, d16[0] \n" // U0 - "vdup.16 q11, d16[1] \n" // U1 - "vdup.16 q12, d16[2] \n" // U2 - "vdup.16 q13, d18[0] \n" // V0 - "vdup.16 q14, d18[1] \n" // V1 - "vdup.16 q15, d18[2] \n" // V2 - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. - "subs %4, %4, #16 \n" // 16 processed per loop. - "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. - "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels. - "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels. - "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. - "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. - - "vrshr.u16 q0, q0, #2 \n" // average of 4 - "vrshr.u16 q1, q1, #2 \n" - "vrshr.u16 q2, q2, #2 \n" - - "vmov.u16 q3, #0x8000 \n" // 128.0 - - "vmul.s16 q8, q0, q10 \n" // U = B * U0 - "vmla.s16 q8, q1, q11 \n" // U += G * U1 - "vmla.s16 q8, q2, q12 \n" // U += R * U2 - - "vmul.s16 q9, q0, q13 \n" // V = B * V0 - "vmla.s16 q9, q1, q14 \n" // V += G * V1 - "vmla.s16 q9, q2, q15 \n" // V += R * V2 - - "vsub.u16 q8, q3, q8 \n" // 128.0 - U - "vsub.u16 q9, q3, q9 \n" // 128.0 - V - - "vqshrn.u16 d0, q8, #8 \n" // Saturating shift right - "vqshrn.u16 d1, q9, #8 \n" - - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_stride_argb), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : "r"(&c->kRGBToU), // %5 - "r"(&c->kRGBToV) // %6 - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); -} - void ARGBToUVRow_NEON(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, @@ -2896,7 +2830,7 @@ void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) { RGBAToYMatrixRow_NEON(src_bgra, dst_y, width, &kAbgrI601Constants); } -void RGBToYMatrixRow_NEON(const uint8_t* src_rgb, +static void RGBToYMatrixRow_NEON(const uint8_t* src_rgb, uint8_t* dst_y, int width, const struct ArgbConstants* c) { @@ -2931,9 +2865,21 @@ void RGBToYMatrixRow_NEON(const uint8_t* src_rgb, "q12"); } +void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { + RGBToYMatrixRow_NEON(src_rgb24, dst_yj, width, &kArgbJPEGConstants); +} +void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) { + RGBToYMatrixRow_NEON(src_raw, dst_yj, width, &kAbgrJPEGConstants); +} +void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { + RGBToYMatrixRow_NEON(src_rgb24, dst_y, width, &kArgbI601Constants); +} +void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) { + RGBToYMatrixRow_NEON(src_raw, dst_y, width, &kAbgrI601Constants); +} // Bilinear filter 16x2 -> 16x1 void InterpolateRow_NEON(uint8_t* dst_ptr, diff --git a/source/row_neon64.cc b/source/row_neon64.cc index 19016cc3b..c0fdc6d0d 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -9,7 +9,6 @@ */ #include "libyuv/row.h" -#include "libyuv/convert_from_argb.h" #ifdef __cplusplus namespace libyuv { @@ -2894,26 +2893,14 @@ void ARGBToUVJ444Row_NEON_I8MM(const uint8_t* src_argb, // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr. // TODO(fbarchard): consider ptrdiff_t for all strides. -void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width, - const struct ArgbConstants* c) { +void ARGBToUVRow_NEON(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { const uint8_t* src_argb_1 = src_argb + src_stride_argb; asm volatile ( - "ldr q16, [%[c], #16] \n" // kRGBToU - "ldr q17, [%[c], #32] \n" // kRGBToV - "sxtl v16.8h, v16.8b \n" // sign extend U coeffs to 16-bit - "sxtl v17.8h, v17.8b \n" // sign extend V coeffs to 16-bit - "dup v20.8h, v16.h[0] \n" // U0 (-BU) - "dup v21.8h, v16.h[1] \n" // U1 (-GU) - "dup v22.8h, v16.h[2] \n" // U2 (-RU) - "dup v23.8h, v17.h[0] \n" // V0 (-BV) - "dup v24.8h, v17.h[1] \n" // V1 (-GV) - "dup v26.8h, v17.h[2] \n" // V2 (-RV) - "movi v25.8h, #0x80, lsl #8 \n" // 128.0 in 16-bit (0x8000) - + RGBTOUV_SETUP_REG "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. "subs %w4, %w4, #16 \n" // 16 processed per loop. @@ -2922,7 +2909,7 @@ void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb, "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. - "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more. + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. "prfm pldl1keep, [%1, 448] \n" "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. @@ -2932,20 +2919,7 @@ void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb, "urshr v1.8h, v1.8h, #2 \n" "urshr v2.8h, v2.8h, #2 \n" - // U = B*U0 + G*U1 + R*U2 - "mul v3.8h, v0.8h, v20.8h \n" - "mla v3.8h, v1.8h, v21.8h \n" - "mla v3.8h, v2.8h, v22.8h \n" - - // V = B*V0 + G*V1 + R*V2 - "mul v4.8h, v0.8h, v23.8h \n" - "mla v4.8h, v1.8h, v24.8h \n" - "mla v4.8h, v2.8h, v26.8h \n" - - // U = (128.0 - U) >> 8, V = (128.0 - V) >> 8 - "subhn v0.8b, v25.8h, v3.8h \n" - "subhn v1.8b, v25.8h, v4.8h \n" - + RGBTOUV(v0.8h, v1.8h, v2.8h) "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. "b.gt 1b \n" @@ -2954,21 +2928,12 @@ void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb, "+r"(dst_u), // %2 "+r"(dst_v), // %3 "+r"(width) // %4 - : [c] "r"(c) // %5 + : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v16", "v17", "v20", "v21", "v22", "v23", "v24", "v25", "v26" + "v20", "v21", "v22", "v23", "v24", "v25" ); } -void ARGBToUVRow_NEON(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - ARGBToUVMatrixRow_NEON(src_argb, src_stride_argb, dst_u, dst_v, width, - &kArgbI601Constants); -} - void ARGBToUVJRow_NEON(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, @@ -3484,7 +3449,7 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, } // Process any of ARGB, ABGR, BGRA, RGBA, by adjusting the uvconstants layout. -static void ARGBToUVMatrixRow_NEON_I8MM_Impl(const uint8_t* src, +static void ABCDToUVMatrixRow_NEON_I8MM(const uint8_t* src, int src_stride, uint8_t* dst_u, uint8_t* dst_v, @@ -3581,25 +3546,12 @@ static const int8_t kRGBAToUVCoefficients[] = { 0, -112, 74, 38, 0, 18, 94, -112, }; -void ARGBToUVMatrixRow_NEON_I8MM(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width, - const struct ArgbConstants* c) { - int8_t uvconstants[8] = { - (int8_t)c->kRGBToU[0], (int8_t)c->kRGBToU[1], (int8_t)c->kRGBToU[2], (int8_t)c->kRGBToU[3], - (int8_t)c->kRGBToV[0], (int8_t)c->kRGBToV[1], (int8_t)c->kRGBToV[2], (int8_t)c->kRGBToV[3]}; - ARGBToUVMatrixRow_NEON_I8MM_Impl(src_argb, src_stride_argb, dst_u, dst_v, width, - uvconstants); -} - void ARGBToUVRow_NEON_I8MM(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, int width) { - ARGBToUVMatrixRow_NEON_I8MM_Impl(src_argb, src_stride_argb, dst_u, dst_v, width, + ABCDToUVMatrixRow_NEON_I8MM(src_argb, src_stride_argb, dst_u, dst_v, width, kARGBToUVCoefficients); } @@ -3608,7 +3560,7 @@ void ABGRToUVRow_NEON_I8MM(const uint8_t* src_abgr, uint8_t* dst_u, uint8_t* dst_v, int width) { - ARGBToUVMatrixRow_NEON_I8MM_Impl(src_abgr, src_stride_abgr, dst_u, dst_v, width, + ABCDToUVMatrixRow_NEON_I8MM(src_abgr, src_stride_abgr, dst_u, dst_v, width, kABGRToUVCoefficients); } @@ -3617,7 +3569,7 @@ void BGRAToUVRow_NEON_I8MM(const uint8_t* src_bgra, uint8_t* dst_u, uint8_t* dst_v, int width) { - ARGBToUVMatrixRow_NEON_I8MM_Impl(src_bgra, src_stride_bgra, dst_u, dst_v, width, + ABCDToUVMatrixRow_NEON_I8MM(src_bgra, src_stride_bgra, dst_u, dst_v, width, kBGRAToUVCoefficients); } @@ -3626,7 +3578,7 @@ void RGBAToUVRow_NEON_I8MM(const uint8_t* src_rgba, uint8_t* dst_u, uint8_t* dst_v, int width) { - ARGBToUVMatrixRow_NEON_I8MM_Impl(src_rgba, src_stride_rgba, dst_u, dst_v, width, + ABCDToUVMatrixRow_NEON_I8MM(src_rgba, src_stride_rgba, dst_u, dst_v, width, kRGBAToUVCoefficients); } @@ -3654,7 +3606,7 @@ void ARGBToUVJRow_NEON_I8MM(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width) { - ARGBToUVMatrixRow_NEON_I8MM_Impl(src_argb, src_stride_argb, dst_u, dst_v, width, + ABCDToUVMatrixRow_NEON_I8MM(src_argb, src_stride_argb, dst_u, dst_v, width, kARGBToUVJCoefficients); } @@ -3663,7 +3615,7 @@ void ABGRToUVJRow_NEON_I8MM(const uint8_t* src_abgr, uint8_t* dst_u, uint8_t* dst_v, int width) { - ARGBToUVMatrixRow_NEON_I8MM_Impl(src_abgr, src_stride_abgr, dst_u, dst_v, width, + ABCDToUVMatrixRow_NEON_I8MM(src_abgr, src_stride_abgr, dst_u, dst_v, width, kABGRToUVJCoefficients); } @@ -3763,20 +3715,22 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27"); } - +struct RgbConstants { + uint8_t kRGBToY[4]; + uint16_t kAddY; +}; // ARGB expects first 3 values to contain RGB and 4th value is ignored. -void ARGBToYMatrixRow_NEON(const uint8_t* src_argb, +static void ARGBToYMatrixRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width, - const struct ArgbConstants* c) { + const struct RgbConstants* rgbconstants) { asm volatile( - "ldr s0, [%3] \n" // load rgbconstants - "ldr s1, [%3, #48] \n" + "ldr d0, [%3] \n" // load rgbconstants "dup v6.16b, v0.b[0] \n" "dup v7.16b, v0.b[1] \n" "dup v16.16b, v0.b[2] \n" - "dup v17.8h, v1.h[0] \n" + "dup v17.8h, v0.h[2] \n" "1: \n" "ld4 {v2.16b,v3.16b,v4.16b,v5.16b}, [%0], #64 \n" // load 16 // pixels. @@ -3795,21 +3749,20 @@ void ARGBToYMatrixRow_NEON(const uint8_t* src_argb, : "+r"(src_argb), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 - : "r"(c) // %3 + : "r"(rgbconstants) // %3 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17"); } -void ARGBToYMatrixRow_NEON_DotProd( +static void ARGBToYMatrixRow_NEON_DotProd( const uint8_t* src_argb, uint8_t* dst_y, int width, - const struct ArgbConstants* c) { + const struct RgbConstants* rgbconstants) { asm volatile( - "ldr s0, [%3] \n" // load rgbconstants - "ldr s1, [%3, #48] \n" + "ldr d0, [%3] \n" // load rgbconstants "dup v16.4s, v0.s[0] \n" - "dup v17.8h, v1.h[0] \n" + "dup v17.8h, v0.h[2] \n" "1: \n" "ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [%0], #64 \n" // load 16 // pixels. @@ -3831,7 +3784,7 @@ void ARGBToYMatrixRow_NEON_DotProd( : "+r"(src_argb), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 - : "r"(c) // %3 + : "r"(rgbconstants) // %3 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17"); } @@ -3841,10 +3794,12 @@ void ARGBToYMatrixRow_NEON_DotProd( // G * 0.5870 coefficient = 150 // R * 0.2990 coefficient = 77 // Add 0.5 -static const struct ArgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, {}, {}, {0x0080}, {}}; -static const struct ArgbConstants kRgb24JPEGDotProdConstants = {{0, 29, 150, 77}, {}, {}, {0x0080}, {}}; +static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, + 0x0080}; +static const struct RgbConstants kRgb24JPEGDotProdConstants = {{0, 29, 150, 77}, + 0x0080}; -static const struct ArgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, {}, {}, {0x0080}, {}}; +static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 0x0080}; // RGB to BT.601 coefficients // B * 0.1016 coefficient = 25 @@ -3852,11 +3807,14 @@ static const struct ArgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, {}, {}, // R * 0.2578 coefficient = 66 // Add 16.5 = 0x1080 -static const struct ArgbConstants kRgb24I601Constants = {{25, 129, 66, 0}, {}, {}, {0x1080}, {}}; -static const struct ArgbConstants kRgb24I601DotProdConstants = {{0, 25, 129, 66}, {}, {}, {0x1080}, {}}; +static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0}, + 0x1080}; +static const struct RgbConstants kRgb24I601DotProdConstants = {{0, 25, 129, 66}, + 0x1080}; -static const struct ArgbConstants kRawI601Constants = {{66, 129, 25, 0}, {}, {}, {0x1080}, {}}; -static const struct ArgbConstants kRawI601DotProdConstants = {{0, 66, 129, 25}, {}, {}, {0x1080}, {}}; +static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0}, 0x1080}; +static const struct RgbConstants kRawI601DotProdConstants = {{0, 66, 129, 25}, + 0x1080}; void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { ARGBToYMatrixRow_NEON(src_argb, dst_y, width, &kRgb24I601Constants); @@ -3903,14 +3861,13 @@ void ABGRToYJRow_NEON_DotProd(const uint8_t* src_abgr, static void RGBAToYMatrixRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width, - const struct ArgbConstants* c) { + const struct RgbConstants* rgbconstants) { asm volatile( - "ldr s0, [%3] \n" // load rgbconstants - "ldr s1, [%3, #48] \n" + "ldr d0, [%3] \n" // load rgbconstants "dup v6.16b, v0.b[0] \n" "dup v7.16b, v0.b[1] \n" "dup v16.16b, v0.b[2] \n" - "dup v17.8h, v1.h[0] \n" + "dup v17.8h, v0.h[2] \n" "1: \n" "ld4 {v1.16b,v2.16b,v3.16b,v4.16b}, [%0], #64 \n" // load 16 // pixels. @@ -3929,7 +3886,7 @@ static void RGBAToYMatrixRow_NEON(const uint8_t* src_rgba, : "+r"(src_rgba), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 - : "r"(c) // %3 + : "r"(rgbconstants) // %3 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17"); } @@ -3973,10 +3930,10 @@ void BGRAToYRow_NEON_DotProd(const uint8_t* src_bgra, &kRawI601DotProdConstants); } -void RGBToYMatrixRow_NEON(const uint8_t* src_rgb, +static void RGBToYMatrixRow_NEON(const uint8_t* src_rgb, uint8_t* dst_y, int width, - const struct ArgbConstants* c) { + const struct RgbConstants* rgbconstants) { asm volatile( "ldr d0, [%3] \n" // load rgbconstants "dup v5.16b, v0.b[0] \n" @@ -4000,13 +3957,25 @@ void RGBToYMatrixRow_NEON(const uint8_t* src_rgb, : "+r"(src_rgb), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 - : "r"(c) // %3 + : "r"(rgbconstants) // %3 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); } +void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { + RGBToYMatrixRow_NEON(src_rgb24, dst_yj, width, &kRgb24JPEGConstants); +} +void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) { + RGBToYMatrixRow_NEON(src_raw, dst_yj, width, &kRawJPEGConstants); +} +void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { + RGBToYMatrixRow_NEON(src_rgb24, dst_y, width, &kRgb24I601Constants); +} +void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) { + RGBToYMatrixRow_NEON(src_raw, dst_y, width, &kRawI601Constants); +} // Bilinear filter 16x2 -> 16x1 void InterpolateRow_NEON(uint8_t* dst_ptr, diff --git a/source/row_rvv.cc b/source/row_rvv.cc index 93bc431bc..0bdcd879b 100644 --- a/source/row_rvv.cc +++ b/source/row_rvv.cc @@ -126,6 +126,7 @@ extern "C" { } #endif +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE // Read [2*VLEN/8] Y from src_y; Read [VLEN/8] U and [VLEN/8] V from src_uv #define READNV12(vl, w, src_y, src_uv, v_u, v_v, v_y_16) \ { \ @@ -169,6 +170,45 @@ extern "C" { v_y = __riscv_vle8_v_u8m2(src_y, vl); \ v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl); \ } +#else +// Read [2*VLEN/8] Y from src_y; Read [VLEN/8] U and [VLEN/8] V from src_uv +#define READNV12(vl, w, src_y, src_uv, v_u, v_v, v_y_16) \ + { \ + vuint8m1_t v_tmp0, v_tmp1; \ + vuint8m2_t v_y; \ + vuint16m2_t v_u_16, v_v_16; \ + vl = __riscv_vsetvl_e8m1((w + 1) / 2); \ + __riscv_vlseg2e8_v_u8m1(&v_tmp0, &v_tmp1, src_uv, vl); \ + v_u_16 = __riscv_vwaddu_vx_u16m2(v_tmp0, 0, vl); \ + v_v_16 = __riscv_vwaddu_vx_u16m2(v_tmp1, 0, vl); \ + v_v_16 = __riscv_vmul_vx_u16m2(v_v_16, 0x0101, vl); \ + v_u_16 = __riscv_vmul_vx_u16m2(v_u_16, 0x0101, vl); \ + v_v = __riscv_vreinterpret_v_u16m2_u8m2(v_v_16); \ + v_u = __riscv_vreinterpret_v_u16m2_u8m2(v_u_16); \ + vl = __riscv_vsetvl_e8m2(w); \ + v_y = __riscv_vle8_v_u8m2(src_y, vl); \ + v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl); \ + } + +// Read 2*[VLEN/8] Y from src_y; Read [VLEN/8] U and [VLEN/8] V from src_vu +#define READNV21(vl, w, src_y, src_vu, v_u, v_v, v_y_16) \ + { \ + vuint8m1_t v_tmp0, v_tmp1; \ + vuint8m2_t v_y; \ + vuint16m2_t v_u_16, v_v_16; \ + vl = __riscv_vsetvl_e8m1((w + 1) / 2); \ + __riscv_vlseg2e8_v_u8m1(&v_tmp0, &v_tmp1, src_vu, vl); \ + v_u_16 = __riscv_vwaddu_vx_u16m2(v_tmp1, 0, vl); \ + v_v_16 = __riscv_vwaddu_vx_u16m2(v_tmp0, 0, vl); \ + v_v_16 = __riscv_vmul_vx_u16m2(v_v_16, 0x0101, vl); \ + v_u_16 = __riscv_vmul_vx_u16m2(v_u_16, 0x0101, vl); \ + v_v = __riscv_vreinterpret_v_u16m2_u8m2(v_v_16); \ + v_u = __riscv_vreinterpret_v_u16m2_u8m2(v_u_16); \ + vl = __riscv_vsetvl_e8m2(w); \ + v_y = __riscv_vle8_v_u8m2(src_y, vl); \ + v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl); \ + } +#endif #ifdef HAS_ARGBTOAR64ROW_RVV void ARGBToAR64Row_RVV(const uint8_t* src_argb, uint16_t* dst_ar64, int width) { @@ -189,6 +229,7 @@ void ARGBToAR64Row_RVV(const uint8_t* src_argb, uint16_t* dst_ar64, int width) { #endif #ifdef HAS_ARGBTOAB64ROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE void ARGBToAB64Row_RVV(const uint8_t* src_argb, uint16_t* dst_ab64, int width) { size_t avl = (size_t)width; do { @@ -215,6 +256,29 @@ void ARGBToAB64Row_RVV(const uint8_t* src_argb, uint16_t* dst_ab64, int width) { dst_ab64 += 4 * vl; } while (avl > 0); } +#else +void ARGBToAB64Row_RVV(const uint8_t* src_argb, uint16_t* dst_ab64, int width) { + size_t avl = (size_t)width; + do { + vuint16m2_t v_b_16, v_g_16, v_r_16, v_a_16; + vuint8m1_t v_b, v_g, v_r, v_a; + size_t vl = __riscv_vsetvl_e8m1(avl); + __riscv_vlseg4e8_v_u8m1(&v_b, &v_g, &v_r, &v_a, src_argb, vl); + v_b_16 = __riscv_vwaddu_vx_u16m2(v_b, 0, vl); + v_g_16 = __riscv_vwaddu_vx_u16m2(v_g, 0, vl); + v_r_16 = __riscv_vwaddu_vx_u16m2(v_r, 0, vl); + v_a_16 = __riscv_vwaddu_vx_u16m2(v_a, 0, vl); + v_b_16 = __riscv_vmul_vx_u16m2(v_b_16, 0x0101, vl); + v_g_16 = __riscv_vmul_vx_u16m2(v_g_16, 0x0101, vl); + v_r_16 = __riscv_vmul_vx_u16m2(v_r_16, 0x0101, vl); + v_a_16 = __riscv_vmul_vx_u16m2(v_a_16, 0x0101, vl); + __riscv_vsseg4e16_v_u16m2(dst_ab64, v_r_16, v_g_16, v_b_16, v_a_16, vl); + avl -= vl; + src_argb += 4 * vl; + dst_ab64 += 4 * vl; + } while (avl > 0); +} +#endif #endif #ifdef HAS_AR64TOARGBROW_RVV @@ -235,6 +299,7 @@ void AR64ToARGBRow_RVV(const uint16_t* src_ar64, uint8_t* dst_argb, int width) { #endif #ifdef HAS_AR64TOAB64ROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE void AR64ToAB64Row_RVV(const uint16_t* src_ar64, uint16_t* dst_ab64, int width) { @@ -253,9 +318,26 @@ void AR64ToAB64Row_RVV(const uint16_t* src_ar64, dst_ab64 += vl * 4; } while (w > 0); } +#else +void AR64ToAB64Row_RVV(const uint16_t* src_ar64, + uint16_t* dst_ab64, + int width) { + size_t w = (size_t)width; + do { + size_t vl = __riscv_vsetvl_e16m2(w); + vuint16m2_t v_b, v_g, v_r, v_a; + __riscv_vlseg4e16_v_u16m2(&v_b, &v_g, &v_r, &v_a, src_ar64, vl); + __riscv_vsseg4e16_v_u16m2(dst_ab64, v_r, v_g, v_b, v_a, vl); + w -= vl; + src_ar64 += vl * 4; + dst_ab64 += vl * 4; + } while (w > 0); +} +#endif #endif #ifdef HAS_AB64TOARGBROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE void AB64ToARGBRow_RVV(const uint16_t* src_ab64, uint8_t* dst_argb, int width) { size_t avl = (size_t)width; do { @@ -276,9 +358,29 @@ void AB64ToARGBRow_RVV(const uint16_t* src_ab64, uint8_t* dst_argb, int width) { dst_argb += 4 * vl; } while (avl > 0); } +#else +void AB64ToARGBRow_RVV(const uint16_t* src_ab64, uint8_t* dst_argb, int width) { + size_t avl = (size_t)width; + do { + vuint16m2_t v_b_16, v_g_16, v_r_16, v_a_16; + vuint8m1_t v_b, v_g, v_r, v_a; + size_t vl = __riscv_vsetvl_e16m2(avl); + __riscv_vlseg4e16_v_u16m2(&v_r_16, &v_g_16, &v_b_16, &v_a_16, src_ab64, vl); + v_b = __riscv_vnsrl_wx_u8m1(v_b_16, 8, vl); + v_g = __riscv_vnsrl_wx_u8m1(v_g_16, 8, vl); + v_r = __riscv_vnsrl_wx_u8m1(v_r_16, 8, vl); + v_a = __riscv_vnsrl_wx_u8m1(v_a_16, 8, vl); + __riscv_vsseg4e8_v_u8m1(dst_argb, v_b, v_g, v_r, v_a, vl); + avl -= vl; + src_ab64 += 4 * vl; + dst_argb += 4 * vl; + } while (avl > 0); +} +#endif #endif #ifdef HAS_RAWTOARGBROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE void RAWToARGBRow_RVV(const uint8_t* src_raw, uint8_t* dst_argb, int width) { size_t w = (size_t)width; size_t vl = __riscv_vsetvl_e8m2(w); @@ -296,9 +398,26 @@ void RAWToARGBRow_RVV(const uint8_t* src_raw, uint8_t* dst_argb, int width) { vl = __riscv_vsetvl_e8m2(w); } while (w > 0); } +#else +void RAWToARGBRow_RVV(const uint8_t* src_raw, uint8_t* dst_argb, int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl); + do { + vuint8m2_t v_b, v_g, v_r; + __riscv_vlseg3e8_v_u8m2(&v_r, &v_g, &v_b, src_raw, vl); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); + w -= vl; + src_raw += vl * 3; + dst_argb += vl * 4; + vl = __riscv_vsetvl_e8m2(w); + } while (w > 0); +} +#endif #endif #ifdef HAS_RAWTORGBAROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE void RAWToRGBARow_RVV(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { size_t w = (size_t)width; size_t vl = __riscv_vsetvl_e8m2(w); @@ -316,9 +435,26 @@ void RAWToRGBARow_RVV(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { vl = __riscv_vsetvl_e8m2(w); } while (w > 0); } +#else +void RAWToRGBARow_RVV(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl); + do { + vuint8m2_t v_b, v_g, v_r; + __riscv_vlseg3e8_v_u8m2(&v_r, &v_g, &v_b, src_raw, vl); + __riscv_vsseg4e8_v_u8m2(dst_rgba, v_a, v_b, v_g, v_r, vl); + w -= vl; + src_raw += vl * 3; + dst_rgba += vl * 4; + vl = __riscv_vsetvl_e8m2(w); + } while (w > 0); +} +#endif #endif #ifdef HAS_RAWTORGB24ROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE void RAWToRGB24Row_RVV(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { size_t w = (size_t)width; do { @@ -334,9 +470,24 @@ void RAWToRGB24Row_RVV(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { dst_rgb24 += vl * 3; } while (w > 0); } +#else +void RAWToRGB24Row_RVV(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { + size_t w = (size_t)width; + do { + vuint8m2_t v_b, v_g, v_r; + size_t vl = __riscv_vsetvl_e8m2(w); + __riscv_vlseg3e8_v_u8m2(&v_b, &v_g, &v_r, src_raw, vl); + __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_r, v_g, v_b, vl); + w -= vl; + src_raw += vl * 3; + dst_rgb24 += vl * 3; + } while (w > 0); +} +#endif #endif #ifdef HAS_ARGBTORAWROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE void ARGBToRAWRow_RVV(const uint8_t* src_argb, uint8_t* dst_raw, int width) { size_t w = (size_t)width; do { @@ -352,9 +503,24 @@ void ARGBToRAWRow_RVV(const uint8_t* src_argb, uint8_t* dst_raw, int width) { dst_raw += vl * 3; } while (w > 0); } +#else +void ARGBToRAWRow_RVV(const uint8_t* src_argb, uint8_t* dst_raw, int width) { + size_t w = (size_t)width; + do { + vuint8m2_t v_b, v_g, v_r, v_a; + size_t vl = __riscv_vsetvl_e8m2(w); + __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl); + __riscv_vsseg3e8_v_u8m2(dst_raw, v_r, v_g, v_b, vl); + w -= vl; + src_argb += vl * 4; + dst_raw += vl * 3; + } while (w > 0); +} +#endif #endif #ifdef HAS_ARGBTORGB24ROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE void ARGBToRGB24Row_RVV(const uint8_t* src_argb, uint8_t* dst_rgb24, int width) { @@ -372,9 +538,26 @@ void ARGBToRGB24Row_RVV(const uint8_t* src_argb, dst_rgb24 += vl * 3; } while (w > 0); } +#else +void ARGBToRGB24Row_RVV(const uint8_t* src_argb, + uint8_t* dst_rgb24, + int width) { + size_t w = (size_t)width; + do { + vuint8m2_t v_b, v_g, v_r, v_a; + size_t vl = __riscv_vsetvl_e8m2(w); + __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl); + __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_b, v_g, v_r, vl); + w -= vl; + src_argb += vl * 4; + dst_rgb24 += vl * 3; + } while (w > 0); +} +#endif #endif #ifdef HAS_ARGBTOABGRROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE void ARGBToABGRRow_RVV(const uint8_t* src_argb, uint8_t* dst_abgr, int width) { size_t w = (size_t)width; do { @@ -391,9 +574,24 @@ void ARGBToABGRRow_RVV(const uint8_t* src_argb, uint8_t* dst_abgr, int width) { dst_abgr += vl * 4; } while (w > 0); } +#else +void ARGBToABGRRow_RVV(const uint8_t* src_argb, uint8_t* dst_abgr, int width) { + size_t w = (size_t)width; + do { + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2_t v_a, v_r, v_g, v_b; + __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl); + __riscv_vsseg4e8_v_u8m2(dst_abgr, v_r, v_g, v_b, v_a, vl); + w -= vl; + src_argb += vl * 4; + dst_abgr += vl * 4; + } while (w > 0); +} +#endif #endif #ifdef HAS_ARGBTOBGRAROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE void ARGBToBGRARow_RVV(const uint8_t* src_argb, uint8_t* dst_bgra, int width) { size_t w = (size_t)width; do { @@ -410,9 +608,24 @@ void ARGBToBGRARow_RVV(const uint8_t* src_argb, uint8_t* dst_bgra, int width) { dst_bgra += vl * 4; } while (w > 0); } +#else +void ARGBToBGRARow_RVV(const uint8_t* src_argb, uint8_t* dst_bgra, int width) { + size_t w = (size_t)width; + do { + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2_t v_a, v_r, v_g, v_b; + __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl); + __riscv_vsseg4e8_v_u8m2(dst_bgra, v_a, v_r, v_g, v_b, vl); + w -= vl; + src_argb += vl * 4; + dst_bgra += vl * 4; + } while (w > 0); +} +#endif #endif #ifdef HAS_ARGBTORGBAROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE void ARGBToRGBARow_RVV(const uint8_t* src_argb, uint8_t* dst_rgba, int width) { size_t w = (size_t)width; do { @@ -429,9 +642,24 @@ void ARGBToRGBARow_RVV(const uint8_t* src_argb, uint8_t* dst_rgba, int width) { dst_rgba += vl * 4; } while (w > 0); } +#else +void ARGBToRGBARow_RVV(const uint8_t* src_argb, uint8_t* dst_rgba, int width) { + size_t w = (size_t)width; + do { + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2_t v_a, v_r, v_g, v_b; + __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl); + __riscv_vsseg4e8_v_u8m2(dst_rgba, v_a, v_b, v_g, v_r, vl); + w -= vl; + src_argb += vl * 4; + dst_rgba += vl * 4; + } while (w > 0); +} +#endif #endif #ifdef HAS_RGBATOARGBROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE void RGBAToARGBRow_RVV(const uint8_t* src_rgba, uint8_t* dst_argb, int width) { size_t w = (size_t)width; do { @@ -448,9 +676,24 @@ void RGBAToARGBRow_RVV(const uint8_t* src_rgba, uint8_t* dst_argb, int width) { dst_argb += vl * 4; } while (w > 0); } +#else +void RGBAToARGBRow_RVV(const uint8_t* src_rgba, uint8_t* dst_argb, int width) { + size_t w = (size_t)width; + do { + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2_t v_a, v_r, v_g, v_b; + __riscv_vlseg4e8_v_u8m2(&v_a, &v_b, &v_g, &v_r, src_rgba, vl); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); + w -= vl; + src_rgba += vl * 4; + dst_argb += vl * 4; + } while (w > 0); +} +#endif #endif #ifdef HAS_RGB24TOARGBROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE void RGB24ToARGBRow_RVV(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) { @@ -470,9 +713,28 @@ void RGB24ToARGBRow_RVV(const uint8_t* src_rgb24, vl = __riscv_vsetvl_e8m2(w); } while (w > 0); } +#else +void RGB24ToARGBRow_RVV(const uint8_t* src_rgb24, + uint8_t* dst_argb, + int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl); + do { + vuint8m2_t v_b, v_g, v_r; + __riscv_vlseg3e8_v_u8m2(&v_b, &v_g, &v_r, src_rgb24, vl); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); + w -= vl; + src_rgb24 += vl * 3; + dst_argb += vl * 4; + vl = __riscv_vsetvl_e8m2(w); + } while (w > 0); +} +#endif #endif #ifdef HAS_I444TOARGBROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE void I444ToARGBRow_RVV(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -503,9 +765,40 @@ void I444ToARGBRow_RVV(const uint8_t* src_y, dst_argb += vl * 4; } while (w > 0); } +#else +void I444ToARGBRow_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r, v_a; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + v_a = __riscv_vmv_v_x_u8m2(255u, vl); + do { + READYUV444(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); + w -= vl; + src_y += vl; + src_u += vl; + src_v += vl; + dst_argb += vl * 4; + } while (w > 0); +} +#endif #endif #ifdef HAS_I444ALPHATOARGBROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE void I444AlphaToARGBRow_RVV(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -538,9 +831,42 @@ void I444AlphaToARGBRow_RVV(const uint8_t* src_y, dst_argb += vl * 4; } while (w > 0); } +#else +void I444AlphaToARGBRow_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + size_t vl; + size_t w = (size_t)width; + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r, v_a; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + do { + READYUV444(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16); + v_a = __riscv_vle8_v_u8m2(src_a, vl); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); + w -= vl; + src_y += vl; + src_a += vl; + src_u += vl; + src_v += vl; + dst_argb += vl * 4; + } while (w > 0); +} +#endif #endif #ifdef HAS_I444TORGB24ROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE void I444ToRGB24Row_RVV(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -570,9 +896,39 @@ void I444ToRGB24Row_RVV(const uint8_t* src_y, dst_rgb24 += vl * 3; } while (w > 0); } +#else +void I444ToRGB24Row_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + size_t vl; + size_t w = (size_t)width; + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + do { + READYUV444(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_b, v_g, v_r, vl); + w -= vl; + src_y += vl; + src_u += vl; + src_v += vl; + dst_rgb24 += vl * 3; + } while (w > 0); +} +#endif #endif #ifdef HAS_I422TOARGBROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE void I422ToARGBRow_RVV(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -603,9 +959,40 @@ void I422ToARGBRow_RVV(const uint8_t* src_y, dst_argb += vl * 4; } while (w > 0); } +#else +void I422ToARGBRow_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r, v_a; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + v_a = __riscv_vmv_v_x_u8m2(255u, vl); + do { + READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); + w -= vl; + src_y += vl; + src_u += vl / 2; + src_v += vl / 2; + dst_argb += vl * 4; + } while (w > 0); +} +#endif #endif #ifdef HAS_I422ALPHATOARGBROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE void I422AlphaToARGBRow_RVV(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -638,9 +1025,42 @@ void I422AlphaToARGBRow_RVV(const uint8_t* src_y, dst_argb += vl * 4; } while (w > 0); } +#else +void I422AlphaToARGBRow_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + size_t vl; + size_t w = (size_t)width; + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r, v_a; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + do { + READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16); + v_a = __riscv_vle8_v_u8m2(src_a, vl); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); + w -= vl; + src_y += vl; + src_a += vl; + src_u += vl / 2; + src_v += vl / 2; + dst_argb += vl * 4; + } while (w > 0); +} +#endif #endif #ifdef HAS_I422TORGBAROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE void I422ToRGBARow_RVV(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -671,9 +1091,40 @@ void I422ToRGBARow_RVV(const uint8_t* src_y, dst_rgba += vl * 4; } while (w > 0); } +#else +void I422ToRGBARow_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgba, + const struct YuvConstants* yuvconstants, + int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r, v_a; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + v_a = __riscv_vmv_v_x_u8m2(255u, vl); + do { + READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + __riscv_vsseg4e8_v_u8m2(dst_rgba, v_a, v_b, v_g, v_r, vl); + w -= vl; + src_y += vl; + src_u += vl / 2; + src_v += vl / 2; + dst_rgba += vl * 4; + } while (w > 0); +} +#endif #endif #ifdef HAS_I422TORGB24ROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE void I422ToRGB24Row_RVV(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -703,10 +1154,39 @@ void I422ToRGB24Row_RVV(const uint8_t* src_y, dst_rgb24 += vl * 3; } while (w > 0); } +#else +void I422ToRGB24Row_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + size_t vl; + size_t w = (size_t)width; + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + do { + READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_b, v_g, v_r, vl); + w -= vl; + src_y += vl; + src_u += vl / 2; + src_v += vl / 2; + dst_rgb24 += vl * 3; + } while (w > 0); +} +#endif #endif #ifdef HAS_I400TOARGBROW_RVV -#if defined(LIBYUV_RVV_HAS_VXRM_ARG) +#if defined(LIBYUV_RVV_HAS_TUPLE_TYPE) && defined(LIBYUV_RVV_HAS_VXRM_ARG) void I400ToARGBRow_RVV(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, @@ -787,6 +1267,7 @@ void I400ToARGBRow_RVV(const uint8_t* src_y, #endif #ifdef HAS_J400TOARGBROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE void J400ToARGBRow_RVV(const uint8_t* src_y, uint8_t* dst_argb, int width) { size_t w = (size_t)width; size_t vl = __riscv_vsetvl_e8m2(w); @@ -801,6 +1282,22 @@ void J400ToARGBRow_RVV(const uint8_t* src_y, uint8_t* dst_argb, int width) { vl = __riscv_vsetvl_e8m2(w); } while (w > 0); } +#else +void J400ToARGBRow_RVV(const uint8_t* src_y, uint8_t* dst_argb, int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl); + do { + vuint8m2_t v_y; + v_y = __riscv_vle8_v_u8m2(src_y, vl); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_y, v_y, v_y, v_a, vl); + w -= vl; + src_y += vl; + dst_argb += vl * 4; + vl = __riscv_vsetvl_e8m2(w); + } while (w > 0); +} +#endif #endif #ifdef HAS_COPYROW_RVV @@ -818,6 +1315,7 @@ void CopyRow_RVV(const uint8_t* src, uint8_t* dst, int width) { #endif #ifdef HAS_NV12TOARGBROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE void NV12ToARGBRow_RVV(const uint8_t* src_y, const uint8_t* src_uv, uint8_t* dst_argb, @@ -846,9 +1344,38 @@ void NV12ToARGBRow_RVV(const uint8_t* src_y, dst_argb += vl * 4; } while (w > 0); } +#else +void NV12ToARGBRow_RVV(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r, v_a; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + v_a = __riscv_vmv_v_x_u8m2(255u, vl); + do { + READNV12(vl, w, src_y, src_uv, v_u, v_v, v_y_16); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); + w -= vl; + src_y += vl; + src_uv += vl; + dst_argb += vl * 4; + } while (w > 0); +} +#endif #endif #ifdef HAS_NV12TORGB24ROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE void NV12ToRGB24Row_RVV(const uint8_t* src_y, const uint8_t* src_uv, uint8_t* dst_rgb24, @@ -876,9 +1403,37 @@ void NV12ToRGB24Row_RVV(const uint8_t* src_y, dst_rgb24 += vl * 3; } while (w > 0); } +#else +void NV12ToRGB24Row_RVV(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + do { + READNV12(vl, w, src_y, src_uv, v_u, v_v, v_y_16); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_b, v_g, v_r, vl); + w -= vl; + src_y += vl; + src_uv += vl; + dst_rgb24 += vl * 3; + } while (w > 0); +} +#endif #endif #ifdef HAS_NV21TOARGBROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE void NV21ToARGBRow_RVV(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_argb, @@ -907,9 +1462,38 @@ void NV21ToARGBRow_RVV(const uint8_t* src_y, dst_argb += vl * 4; } while (w > 0); } +#else +void NV21ToARGBRow_RVV(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r, v_a; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + v_a = __riscv_vmv_v_x_u8m2(255u, vl); + do { + READNV21(vl, w, src_y, src_vu, v_u, v_v, v_y_16); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); + w -= vl; + src_y += vl; + src_vu += vl; + dst_argb += vl * 4; + } while (w > 0); +} +#endif #endif #ifdef HAS_NV21TORGB24ROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE void NV21ToRGB24Row_RVV(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_rgb24, @@ -937,6 +1521,33 @@ void NV21ToRGB24Row_RVV(const uint8_t* src_y, dst_rgb24 += vl * 3; } while (w > 0); } +#else +void NV21ToRGB24Row_RVV(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + do { + READNV21(vl, w, src_y, src_vu, v_u, v_v, v_y_16); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_b, v_g, v_r, vl); + w -= vl; + src_y += vl; + src_vu += vl; + dst_rgb24 += vl * 3; + } while (w > 0); +} +#endif #endif // Bilinear filter [VLEN/8]x2 -> [VLEN/8]x1 @@ -1056,6 +1667,7 @@ void InterpolateRow_RVV(uint8_t* dst_ptr, #endif #ifdef HAS_SPLITRGBROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE void SplitRGBRow_RVV(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g, @@ -1078,9 +1690,32 @@ void SplitRGBRow_RVV(const uint8_t* src_rgb, src_rgb += vl * 3; } while (w > 0); } +#else +void SplitRGBRow_RVV(const uint8_t* src_rgb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width) { + size_t w = (size_t)width; + do { + vuint8m2_t v_b, v_g, v_r; + size_t vl = __riscv_vsetvl_e8m2(w); + __riscv_vlseg3e8_v_u8m2(&v_r, &v_g, &v_b, src_rgb, vl); + __riscv_vse8_v_u8m2(dst_r, v_r, vl); + __riscv_vse8_v_u8m2(dst_g, v_g, vl); + __riscv_vse8_v_u8m2(dst_b, v_b, vl); + w -= vl; + dst_r += vl; + dst_g += vl; + dst_b += vl; + src_rgb += vl * 3; + } while (w > 0); +} +#endif #endif #ifdef HAS_MERGERGBROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE void MergeRGBRow_RVV(const uint8_t* src_r, const uint8_t* src_g, const uint8_t* src_b, @@ -1101,9 +1736,31 @@ void MergeRGBRow_RVV(const uint8_t* src_r, dst_rgb += vl * 3; } while (w > 0); } +#else +void MergeRGBRow_RVV(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_rgb, + int width) { + size_t w = (size_t)width; + do { + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2_t v_r = __riscv_vle8_v_u8m2(src_r, vl); + vuint8m2_t v_g = __riscv_vle8_v_u8m2(src_g, vl); + vuint8m2_t v_b = __riscv_vle8_v_u8m2(src_b, vl); + __riscv_vsseg3e8_v_u8m2(dst_rgb, v_r, v_g, v_b, vl); + w -= vl; + src_r += vl; + src_g += vl; + src_b += vl; + dst_rgb += vl * 3; + } while (w > 0); +} +#endif #endif #ifdef HAS_SPLITARGBROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE void SplitARGBRow_RVV(const uint8_t* src_argb, uint8_t* dst_r, uint8_t* dst_g, @@ -1130,9 +1787,35 @@ void SplitARGBRow_RVV(const uint8_t* src_argb, src_argb += vl * 4; } while (w > 0); } +#else +void SplitARGBRow_RVV(const uint8_t* src_argb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + uint8_t* dst_a, + int width) { + size_t w = (size_t)width; + do { + vuint8m2_t v_b, v_g, v_r, v_a; + size_t vl = __riscv_vsetvl_e8m2(w); + __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl); + __riscv_vse8_v_u8m2(dst_a, v_a, vl); + __riscv_vse8_v_u8m2(dst_r, v_r, vl); + __riscv_vse8_v_u8m2(dst_g, v_g, vl); + __riscv_vse8_v_u8m2(dst_b, v_b, vl); + w -= vl; + dst_a += vl; + dst_r += vl; + dst_g += vl; + dst_b += vl; + src_argb += vl * 4; + } while (w > 0); +} +#endif #endif #ifdef HAS_MERGEARGBROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE void MergeARGBRow_RVV(const uint8_t* src_r, const uint8_t* src_g, const uint8_t* src_b, @@ -1156,9 +1839,34 @@ void MergeARGBRow_RVV(const uint8_t* src_r, dst_argb += vl * 4; } while (w > 0); } +#else +void MergeARGBRow_RVV(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + const uint8_t* src_a, + uint8_t* dst_argb, + int width) { + size_t w = (size_t)width; + do { + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2_t v_r = __riscv_vle8_v_u8m2(src_r, vl); + vuint8m2_t v_g = __riscv_vle8_v_u8m2(src_g, vl); + vuint8m2_t v_b = __riscv_vle8_v_u8m2(src_b, vl); + vuint8m2_t v_a = __riscv_vle8_v_u8m2(src_a, vl); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); + w -= vl; + src_r += vl; + src_g += vl; + src_b += vl; + src_a += vl; + dst_argb += vl * 4; + } while (w > 0); +} +#endif #endif #ifdef HAS_SPLITXRGBROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE void SplitXRGBRow_RVV(const uint8_t* src_argb, uint8_t* dst_r, uint8_t* dst_g, @@ -1181,9 +1889,32 @@ void SplitXRGBRow_RVV(const uint8_t* src_argb, src_argb += vl * 4; } while (w > 0); } +#else +void SplitXRGBRow_RVV(const uint8_t* src_argb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width) { + size_t w = (size_t)width; + do { + vuint8m2_t v_b, v_g, v_r, v_a; + size_t vl = __riscv_vsetvl_e8m2(w); + __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl); + __riscv_vse8_v_u8m2(dst_r, v_r, vl); + __riscv_vse8_v_u8m2(dst_g, v_g, vl); + __riscv_vse8_v_u8m2(dst_b, v_b, vl); + w -= vl; + dst_r += vl; + dst_g += vl; + dst_b += vl; + src_argb += vl * 4; + } while (w > 0); +} +#endif #endif #ifdef HAS_MERGEXRGBROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE void MergeXRGBRow_RVV(const uint8_t* src_r, const uint8_t* src_g, const uint8_t* src_b, @@ -1206,9 +1937,34 @@ void MergeXRGBRow_RVV(const uint8_t* src_r, vl = __riscv_vsetvl_e8m2(w); } while (w > 0); } +#else +void MergeXRGBRow_RVV(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_argb, + int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl); + do { + vuint8m2_t v_r, v_g, v_b; + v_r = __riscv_vle8_v_u8m2(src_r, vl); + v_g = __riscv_vle8_v_u8m2(src_g, vl); + v_b = __riscv_vle8_v_u8m2(src_b, vl); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); + w -= vl; + src_r += vl; + src_g += vl; + src_b += vl; + dst_argb += vl * 4; + vl = __riscv_vsetvl_e8m2(w); + } while (w > 0); +} +#endif #endif #ifdef HAS_SPLITUVROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE void SplitUVRow_RVV(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, @@ -1227,9 +1983,29 @@ void SplitUVRow_RVV(const uint8_t* src_uv, src_uv += 2 * vl; } while (w > 0); } +#else +void SplitUVRow_RVV(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + size_t w = (size_t)width; + do { + size_t vl = __riscv_vsetvl_e8m4(w); + vuint8m4_t v_u, v_v; + __riscv_vlseg2e8_v_u8m4(&v_u, &v_v, src_uv, vl); + __riscv_vse8_v_u8m4(dst_u, v_u, vl); + __riscv_vse8_v_u8m4(dst_v, v_v, vl); + w -= vl; + dst_u += vl; + dst_v += vl; + src_uv += 2 * vl; + } while (w > 0); +} +#endif #endif #ifdef HAS_MERGEUVROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE void MergeUVRow_RVV(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, @@ -1247,18 +2023,43 @@ void MergeUVRow_RVV(const uint8_t* src_u, dst_uv += 2 * vl; } while (w > 0); } +#else +void MergeUVRow_RVV(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width) { + size_t w = (size_t)width; + do { + vuint8m4_t v_u, v_v; + size_t vl = __riscv_vsetvl_e8m4(w); + v_u = __riscv_vle8_v_u8m4(src_u, vl); + v_v = __riscv_vle8_v_u8m4(src_v, vl); + __riscv_vsseg2e8_v_u8m4(dst_uv, v_u, v_v, vl); + w -= vl; + src_u += vl; + src_v += vl; + dst_uv += 2 * vl; + } while (w > 0); +} +#endif #endif - +struct RgbConstants { + uint8_t kRGBToY[4]; + uint16_t kAddY; + uint16_t pad; +}; // RGB to JPeg coefficients // B * 0.1140 coefficient = 29 // G * 0.5870 coefficient = 150 // R * 0.2990 coefficient = 77 // Add 0.5 = 0x80 -static const struct ArgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, {0}, {0}, {128}, {0}}; +static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, + 128, + 0}; -static const struct ArgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, {0}, {0}, {128}, {0}}; +static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0}; // RGB to BT.601 coefficients // B * 0.1016 coefficient = 25 @@ -1266,25 +2067,30 @@ static const struct ArgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, {0}, {0 // R * 0.2578 coefficient = 66 // Add 16.5 = 0x1080 -static const struct ArgbConstants kRgb24I601Constants = {{25, 129, 66, 0}, {0}, {0}, {0x1080}, {0}}; +static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0}, + 0x1080, + 0}; -static const struct ArgbConstants kRawI601Constants = {{66, 129, 25, 0}, {0}, {0}, {0x1080}, {0}}; +static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0}, + 0x1080, + 0}; // ARGB expects first 3 values to contain RGB and 4th value is ignored #ifdef HAS_ARGBTOYMATRIXROW_RVV -void ARGBToYMatrixRow_RVV(const uint8_t* src_argb, +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE +static void ARGBToYMatrixRow_RVV(const uint8_t* src_argb, uint8_t* dst_y, int width, - const struct ArgbConstants* c) { + const struct RgbConstants* rgbconstants) { assert(width != 0); size_t w = (size_t)width; vuint8m2_t v_by, v_gy, v_ry; // vectors are to store RGBToY constant vuint16m4_t v_addy; // vector is to store kAddY size_t vl = __riscv_vsetvl_e8m2(w); - v_by = __riscv_vmv_v_x_u8m2(c->kRGBToY[0], vl); - v_gy = __riscv_vmv_v_x_u8m2(c->kRGBToY[1], vl); - v_ry = __riscv_vmv_v_x_u8m2(c->kRGBToY[2], vl); - v_addy = __riscv_vmv_v_x_u16m4(c->kAddY[0], vl); + v_by = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[0], vl); + v_gy = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[1], vl); + v_ry = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[2], vl); + v_addy = __riscv_vmv_v_x_u16m4(rgbconstants->kAddY, vl); do { vuint8m2_t v_y; vuint16m4_t v_y_u16; @@ -1304,6 +2110,37 @@ void ARGBToYMatrixRow_RVV(const uint8_t* src_argb, dst_y += vl; } while (w > 0); } +#else +static void ARGBToYMatrixRow_RVV(const uint8_t* src_argb, + uint8_t* dst_y, + int width, + const struct RgbConstants* rgbconstants) { + assert(width != 0); + size_t w = (size_t)width; + vuint8m2_t v_by, v_gy, v_ry; // vectors are to store RGBToY constant + vuint16m4_t v_addy; // vector is to store kAddY + size_t vl = __riscv_vsetvl_e8m2(w); + v_by = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[0], vl); + v_gy = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[1], vl); + v_ry = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[2], vl); + v_addy = __riscv_vmv_v_x_u16m4(rgbconstants->kAddY, vl); + do { + vuint8m2_t v_b, v_g, v_r, v_a, v_y; + vuint16m4_t v_y_u16; + size_t vl = __riscv_vsetvl_e8m2(w); + __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl); + v_y_u16 = __riscv_vwmulu_vv_u16m4(v_r, v_ry, vl); + v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_gy, v_g, vl); + v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_by, v_b, vl); + v_y_u16 = __riscv_vadd_vv_u16m4(v_y_u16, v_addy, vl); + v_y = __riscv_vnsrl_wx_u8m2(v_y_u16, 8, vl); + __riscv_vse8_v_u8m2(dst_y, v_y, vl); + w -= vl; + src_argb += 4 * vl; + dst_y += vl; + } while (w > 0); +} +#endif #endif #ifdef HAS_ARGBTOYROW_RVV @@ -1332,19 +2169,20 @@ void ABGRToYJRow_RVV(const uint8_t* src_abgr, uint8_t* dst_yj, int width) { // RGBA expects first value to be A and ignored, then 3 values to contain RGB. #ifdef HAS_RGBATOYMATRIXROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE static void RGBAToYMatrixRow_RVV(const uint8_t* src_rgba, uint8_t* dst_y, int width, - const struct ArgbConstants* c) { + const struct RgbConstants* rgbconstants) { assert(width != 0); size_t w = (size_t)width; vuint8m2_t v_by, v_gy, v_ry; // vectors are to store RGBToY constant vuint16m4_t v_addy; // vector is to store kAddY size_t vl = __riscv_vsetvl_e8m2(w); - v_by = __riscv_vmv_v_x_u8m2(c->kRGBToY[0], vl); - v_gy = __riscv_vmv_v_x_u8m2(c->kRGBToY[1], vl); - v_ry = __riscv_vmv_v_x_u8m2(c->kRGBToY[2], vl); - v_addy = __riscv_vmv_v_x_u16m4(c->kAddY[0], vl); + v_by = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[0], vl); + v_gy = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[1], vl); + v_ry = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[2], vl); + v_addy = __riscv_vmv_v_x_u16m4(rgbconstants->kAddY, vl); do { vuint8m2_t v_y; vuint16m4_t v_y_u16; @@ -1364,6 +2202,37 @@ static void RGBAToYMatrixRow_RVV(const uint8_t* src_rgba, dst_y += vl; } while (w > 0); } +#else +static void RGBAToYMatrixRow_RVV(const uint8_t* src_rgba, + uint8_t* dst_y, + int width, + const struct RgbConstants* rgbconstants) { + assert(width != 0); + size_t w = (size_t)width; + vuint8m2_t v_by, v_gy, v_ry; // vectors are to store RGBToY constant + vuint16m4_t v_addy; // vector is to store kAddY + size_t vl = __riscv_vsetvl_e8m2(w); + v_by = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[0], vl); + v_gy = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[1], vl); + v_ry = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[2], vl); + v_addy = __riscv_vmv_v_x_u16m4(rgbconstants->kAddY, vl); + do { + vuint8m2_t v_b, v_g, v_r, v_a, v_y; + vuint16m4_t v_y_u16; + size_t vl = __riscv_vsetvl_e8m2(w); + __riscv_vlseg4e8_v_u8m2(&v_a, &v_b, &v_g, &v_r, src_rgba, vl); + v_y_u16 = __riscv_vwmulu_vv_u16m4(v_r, v_ry, vl); + v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_gy, v_g, vl); + v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_by, v_b, vl); + v_y_u16 = __riscv_vadd_vv_u16m4(v_y_u16, v_addy, vl); + v_y = __riscv_vnsrl_wx_u8m2(v_y_u16, 8, vl); + __riscv_vse8_v_u8m2(dst_y, v_y, vl); + w -= vl; + src_rgba += 4 * vl; + dst_y += vl; + } while (w > 0); +} +#endif #endif #ifdef HAS_RGBATOYROW_RVV @@ -1385,19 +2254,20 @@ void BGRAToYRow_RVV(const uint8_t* src_bgra, uint8_t* dst_y, int width) { #endif #ifdef HAS_RGBTOYMATRIXROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE static void RGBToYMatrixRow_RVV(const uint8_t* src_rgb, uint8_t* dst_y, int width, - const struct ArgbConstants* c) { + const struct RgbConstants* rgbconstants) { assert(width != 0); size_t w = (size_t)width; vuint8m2_t v_by, v_gy, v_ry; // vectors are to store RGBToY constant vuint16m4_t v_addy; // vector is to store kAddY size_t vl = __riscv_vsetvl_e8m2(w); - v_by = __riscv_vmv_v_x_u8m2(c->kRGBToY[0], vl); - v_gy = __riscv_vmv_v_x_u8m2(c->kRGBToY[1], vl); - v_ry = __riscv_vmv_v_x_u8m2(c->kRGBToY[2], vl); - v_addy = __riscv_vmv_v_x_u16m4(c->kAddY[0], vl); + v_by = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[0], vl); + v_gy = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[1], vl); + v_ry = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[2], vl); + v_addy = __riscv_vmv_v_x_u16m4(rgbconstants->kAddY, vl); do { vuint8m2_t v_y; vuint16m4_t v_y_u16; @@ -1417,24 +2287,68 @@ static void RGBToYMatrixRow_RVV(const uint8_t* src_rgb, dst_y += vl; } while (w > 0); } +#else +static void RGBToYMatrixRow_RVV(const uint8_t* src_rgb, + uint8_t* dst_y, + int width, + const struct RgbConstants* rgbconstants) { + assert(width != 0); + size_t w = (size_t)width; + vuint8m2_t v_by, v_gy, v_ry; // vectors are to store RGBToY constant + vuint16m4_t v_addy; // vector is to store kAddY + size_t vl = __riscv_vsetvl_e8m2(w); + v_by = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[0], vl); + v_gy = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[1], vl); + v_ry = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[2], vl); + v_addy = __riscv_vmv_v_x_u16m4(rgbconstants->kAddY, vl); + do { + vuint8m2_t v_b, v_g, v_r, v_y; + vuint16m4_t v_y_u16; + size_t vl = __riscv_vsetvl_e8m2(w); + __riscv_vlseg3e8_v_u8m2(&v_b, &v_g, &v_r, src_rgb, vl); + v_y_u16 = __riscv_vwmulu_vv_u16m4(v_r, v_ry, vl); + v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_gy, v_g, vl); + v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_by, v_b, vl); + v_y_u16 = __riscv_vadd_vv_u16m4(v_y_u16, v_addy, vl); + v_y = __riscv_vnsrl_wx_u8m2(v_y_u16, 8, vl); + __riscv_vse8_v_u8m2(dst_y, v_y, vl); + w -= vl; + src_rgb += 3 * vl; + dst_y += vl; + } while (w > 0); +} +#endif #endif #ifdef HAS_RGB24TOYJROW_RVV +void RGB24ToYJRow_RVV(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { + RGBToYMatrixRow_RVV(src_rgb24, dst_yj, width, &kRgb24JPEGConstants); +} #endif #ifdef HAS_RAWTOYJROW_RVV +void RAWToYJRow_RVV(const uint8_t* src_raw, uint8_t* dst_yj, int width) { + RGBToYMatrixRow_RVV(src_raw, dst_yj, width, &kRawJPEGConstants); +} #endif #ifdef HAS_RGB24TOYROW_RVV +void RGB24ToYRow_RVV(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { + RGBToYMatrixRow_RVV(src_rgb24, dst_y, width, &kRgb24I601Constants); +} #endif #ifdef HAS_RAWTOYROW_RVV +void RAWToYRow_RVV(const uint8_t* src_raw, uint8_t* dst_y, int width) { + RGBToYMatrixRow_RVV(src_raw, dst_y, width, &kRawI601Constants); +} #endif // Blend src_argb over src_argb1 and store to dst_argb. // dst_argb may be src_argb or src_argb1. // src_argb: RGB values have already been pre-multiplied by the a. #ifdef HAS_ARGBBLENDROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE void ARGBBlendRow_RVV(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, @@ -1481,6 +2395,48 @@ void ARGBBlendRow_RVV(const uint8_t* src_argb, dst_argb += 4 * vl; } while (w > 0); } +#else +void ARGBBlendRow_RVV(const uint8_t* src_argb, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvlmax_e8m2(); + // clamp255((((256 - a) * b) >> 8) + f) + // = b * (256 - a) / 256 + f + // = b - (b * a / 256) + f + vuint8m2_t v_255 = __riscv_vmv_v_x_u8m2(255, vl); + do { + vuint8m2_t v_src0_b, v_src0_g, v_src0_r, v_src0_a; + vuint8m2_t v_src1_b, v_src1_g, v_src1_r, v_src1_a; + vuint8m2_t v_tmp_b, v_tmp_g, v_tmp_r; + vuint8m2_t v_dst_b, v_dst_g, v_dst_r; + vl = __riscv_vsetvl_e8m2(w); + __riscv_vlseg4e8_v_u8m2(&v_src0_b, &v_src0_g, &v_src0_r, &v_src0_a, + src_argb, vl); + __riscv_vlseg4e8_v_u8m2(&v_src1_b, &v_src1_g, &v_src1_r, &v_src1_a, + src_argb1, vl); + + v_tmp_b = __riscv_vmulhu_vv_u8m2(v_src1_b, v_src0_a, vl); + v_tmp_g = __riscv_vmulhu_vv_u8m2(v_src1_g, v_src0_a, vl); + v_tmp_r = __riscv_vmulhu_vv_u8m2(v_src1_r, v_src0_a, vl); + + v_dst_b = __riscv_vsub_vv_u8m2(v_src1_b, v_tmp_b, vl); + v_dst_g = __riscv_vsub_vv_u8m2(v_src1_g, v_tmp_g, vl); + v_dst_r = __riscv_vsub_vv_u8m2(v_src1_r, v_tmp_r, vl); + + v_dst_b = __riscv_vsaddu_vv_u8m2(v_dst_b, v_src0_b, vl); + v_dst_g = __riscv_vsaddu_vv_u8m2(v_dst_g, v_src0_g, vl); + v_dst_r = __riscv_vsaddu_vv_u8m2(v_dst_r, v_src0_r, vl); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_dst_b, v_dst_g, v_dst_r, v_255, vl); + + w -= vl; + src_argb += 4 * vl; + src_argb1 += 4 * vl; + dst_argb += 4 * vl; + } while (w > 0); +} +#endif #endif #ifdef HAS_BLENDPLANEROW_RVV @@ -1518,6 +2474,7 @@ void BlendPlaneRow_RVV(const uint8_t* src0, // Attenuate: (f * a + 255) >> 8 #ifdef HAS_ARGBATTENUATEROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE void ARGBAttenuateRow_RVV(const uint8_t* src_argb, uint8_t* dst_argb, int width) { @@ -1551,9 +2508,39 @@ void ARGBAttenuateRow_RVV(const uint8_t* src_argb, dst_argb += vl * 4; } while (w > 0); } +#else +void ARGBAttenuateRow_RVV(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + size_t w = (size_t)width; + do { + vuint8m2_t v_b, v_g, v_r, v_a; + vuint16m4_t v_ba_16, v_ga_16, v_ra_16; + size_t vl = __riscv_vsetvl_e8m2(w); + __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl); + // f * a + v_ba_16 = __riscv_vwmulu_vv_u16m4(v_b, v_a, vl); + v_ga_16 = __riscv_vwmulu_vv_u16m4(v_g, v_a, vl); + v_ra_16 = __riscv_vwmulu_vv_u16m4(v_r, v_a, vl); + // f * a + 255 + v_ba_16 = __riscv_vadd_vx_u16m4(v_ba_16, 255u, vl); + v_ga_16 = __riscv_vadd_vx_u16m4(v_ga_16, 255u, vl); + v_ra_16 = __riscv_vadd_vx_u16m4(v_ra_16, 255u, vl); + // (f * a + 255) >> 8 + v_b = __riscv_vnsrl_wx_u8m2(v_ba_16, 8, vl); + v_g = __riscv_vnsrl_wx_u8m2(v_ga_16, 8, vl); + v_r = __riscv_vnsrl_wx_u8m2(v_ra_16, 8, vl); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); + w -= vl; + src_argb += vl * 4; + dst_argb += vl * 4; + } while (w > 0); +} +#endif #endif #ifdef HAS_ARGBEXTRACTALPHAROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE void ARGBExtractAlphaRow_RVV(const uint8_t* src_argb, uint8_t* dst_a, int width) { @@ -1568,6 +2555,22 @@ void ARGBExtractAlphaRow_RVV(const uint8_t* src_argb, dst_a += vl; } while (w > 0); } +#else +void ARGBExtractAlphaRow_RVV(const uint8_t* src_argb, + uint8_t* dst_a, + int width) { + size_t w = (size_t)width; + do { + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2_t v_b, v_g, v_r, v_a; + __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl); + __riscv_vse8_v_u8m2(dst_a, v_a, vl); + w -= vl; + src_argb += vl * 4; + dst_a += vl; + } while (w > 0); +} +#endif #endif #ifdef HAS_ARGBCOPYYTOALPHAROW_RVV diff --git a/source/row_sme.cc b/source/row_sme.cc index fca536dc4..bd61b20bf 100644 --- a/source/row_sme.cc +++ b/source/row_sme.cc @@ -1120,20 +1120,6 @@ __arm_locally_streaming void Convert8To16Row_SME(const uint8_t* src_y, : "cc", "memory", "z0", "z1", "z2", "p0", "p1"); } -__arm_locally_streaming void ARGBToUVMatrixRow_SME( - const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width, - const struct ArgbConstants* c) { - int8_t uvconstants[8] = { - (int8_t)c->kRGBToU[0], (int8_t)c->kRGBToU[1], (int8_t)c->kRGBToU[2], (int8_t)c->kRGBToU[3], - (int8_t)c->kRGBToV[0], (int8_t)c->kRGBToV[1], (int8_t)c->kRGBToV[2], (int8_t)c->kRGBToV[3]}; - ARGBToUVMatrixRow_SVE_SC(src_argb, src_stride_argb, dst_u, dst_v, width, - uvconstants); -} - __arm_locally_streaming void ARGBToUVRow_SME(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, diff --git a/source/row_sve.cc b/source/row_sve.cc index 7d8734921..4a51b68fc 100644 --- a/source/row_sve.cc +++ b/source/row_sve.cc @@ -217,19 +217,6 @@ void NV21ToRGB24Row_SVE2(const uint8_t* src_y, NV21ToRGB24Row_SVE_SC(src_y, src_vu, dst_rgb24, yuvconstants, width); } -void ARGBToUVMatrixRow_SVE2(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width, - const struct ArgbConstants* c) { - int8_t uvconstants[8] = { - (int8_t)c->kRGBToU[0], (int8_t)c->kRGBToU[1], (int8_t)c->kRGBToU[2], (int8_t)c->kRGBToU[3], - (int8_t)c->kRGBToV[0], (int8_t)c->kRGBToV[1], (int8_t)c->kRGBToV[2], (int8_t)c->kRGBToV[3]}; - ARGBToUVMatrixRow_SVE_SC(src_argb, src_stride_argb, dst_u, dst_v, width, - uvconstants); -} - void ARGBToUVRow_SVE2(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, diff --git a/source/row_win.cc b/source/row_win.cc index 77070d031..e680ffd9d 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -122,10 +122,8 @@ extern "C" { #if defined(__clang__) || defined(__GNUC__) #define LIBYUV_TARGET_AVX2 __attribute__((target("avx2"))) -#define LIBYUV_TARGET_AVX512BW __attribute__((target("avx512bw,avx512vl,avx512f"))) #else #define LIBYUV_TARGET_AVX2 -#define LIBYUV_TARGET_AVX512BW #endif LIBYUV_TARGET_AVX2 @@ -212,197 +210,6 @@ LIBYUV_TARGET_AVX2 void BGRAToYRow_AVX2(const uint8_t* src_bgra, uint8_t* dst_y, int width) { ARGBToYMatrixRow_AVX2(src_bgra, dst_y, width, &kBgraI601Constants); } - -#ifdef HAS_RAWTOARGBROW_AVX2 -LIBYUV_TARGET_AVX2 -void RAWToARGBRow_AVX2(const uint8_t* src_raw, uint8_t* dst_argb, int width) { - __m256i ymm_alpha = _mm256_set1_epi32(0xff000000); - __m128i shuf_low = _mm_set_epi8(-1, 9, 10, 11, -1, 6, 7, 8, -1, 3, 4, 5, -1, 0, 1, 2); - __m128i shuf_high = _mm_set_epi8(-1, 13, 14, 15, -1, 10, 11, 12, -1, 7, 8, 9, -1, 4, 5, 6); - __m256i ymm_shuf = _mm256_broadcastsi128_si256(shuf_low); - __m256i ymm_shuf2 = _mm256_broadcastsi128_si256(shuf_high); - - while (width > 0) { - __m128i xmm0 = _mm_loadu_si128((const __m128i*)src_raw); - __m256i ymm0 = _mm256_castsi128_si256(xmm0); - ymm0 = _mm256_inserti128_si256(ymm0, _mm_loadu_si128((const __m128i*)(src_raw + 12)), 1); - - __m128i xmm1 = _mm_loadu_si128((const __m128i*)(src_raw + 24)); - __m256i ymm1 = _mm256_castsi128_si256(xmm1); - ymm1 = _mm256_inserti128_si256(ymm1, _mm_loadu_si128((const __m128i*)(src_raw + 36)), 1); - - __m128i xmm2 = _mm_loadu_si128((const __m128i*)(src_raw + 48)); - __m256i ymm2 = _mm256_castsi128_si256(xmm2); - ymm2 = _mm256_inserti128_si256(ymm2, _mm_loadu_si128((const __m128i*)(src_raw + 60)), 1); - - __m128i xmm3 = _mm_loadu_si128((const __m128i*)(src_raw + 68)); - __m256i ymm3 = _mm256_castsi128_si256(xmm3); - ymm3 = _mm256_inserti128_si256(ymm3, _mm_loadu_si128((const __m128i*)(src_raw + 80)), 1); - - ymm0 = _mm256_shuffle_epi8(ymm0, ymm_shuf); - ymm1 = _mm256_shuffle_epi8(ymm1, ymm_shuf); - ymm2 = _mm256_shuffle_epi8(ymm2, ymm_shuf); - ymm3 = _mm256_shuffle_epi8(ymm3, ymm_shuf2); - - ymm0 = _mm256_or_si256(ymm0, ymm_alpha); - ymm1 = _mm256_or_si256(ymm1, ymm_alpha); - ymm2 = _mm256_or_si256(ymm2, ymm_alpha); - ymm3 = _mm256_or_si256(ymm3, ymm_alpha); - - _mm256_storeu_si256((__m256i*)dst_argb, ymm0); - _mm256_storeu_si256((__m256i*)(dst_argb + 32), ymm1); - _mm256_storeu_si256((__m256i*)(dst_argb + 64), ymm2); - _mm256_storeu_si256((__m256i*)(dst_argb + 96), ymm3); - - src_raw += 96; - dst_argb += 128; - width -= 32; - } -} -#endif - -#ifdef HAS_RAWTOARGBROW_AVX512BW -LIBYUV_TARGET_AVX512BW -void RGBToARGBRow_AVX512BW(const uint8_t* src_raw, uint8_t* dst_argb, const __m128i* shuffler, int width) { - __m512i zmm_alpha = _mm512_set1_epi32(0xff000000); - __m512i zmm_perm = _mm512_set_epi32( - 12, 11, 10, 9, 9, 8, 7, 6, 6, 5, 4, 3, 3, 2, 1, 0); - __m512i zmm_shuf = _mm512_broadcast_i32x4(_mm_loadu_si128(shuffler)); - - while (width > 0) { - __m512i zmm0 = _mm512_maskz_loadu_epi8(0xffffffffffffull, src_raw); - __m512i zmm1 = _mm512_maskz_loadu_epi8(0xffffffffffffull, src_raw + 48); - __m512i zmm2 = _mm512_maskz_loadu_epi8(0xffffffffffffull, src_raw + 96); - __m512i zmm3 = _mm512_maskz_loadu_epi8(0xffffffffffffull, src_raw + 144); - - zmm0 = _mm512_permutexvar_epi32(zmm_perm, zmm0); - zmm1 = _mm512_permutexvar_epi32(zmm_perm, zmm1); - zmm2 = _mm512_permutexvar_epi32(zmm_perm, zmm2); - zmm3 = _mm512_permutexvar_epi32(zmm_perm, zmm3); - - zmm0 = _mm512_shuffle_epi8(zmm0, zmm_shuf); - zmm1 = _mm512_shuffle_epi8(zmm1, zmm_shuf); - zmm2 = _mm512_shuffle_epi8(zmm2, zmm_shuf); - zmm3 = _mm512_shuffle_epi8(zmm3, zmm_shuf); - - zmm0 = _mm512_or_si512(zmm0, zmm_alpha); - zmm1 = _mm512_or_si512(zmm1, zmm_alpha); - zmm2 = _mm512_or_si512(zmm2, zmm_alpha); - zmm3 = _mm512_or_si512(zmm3, zmm_alpha); - - _mm512_storeu_si512(dst_argb, zmm0); - _mm512_storeu_si512(dst_argb + 64, zmm1); - _mm512_storeu_si512(dst_argb + 128, zmm2); - _mm512_storeu_si512(dst_argb + 192, zmm3); - - src_raw += 192; - dst_argb += 256; - width -= 64; - } -} - -LIBYUV_TARGET_AVX512BW -void RAWToARGBRow_AVX512BW(const uint8_t* src_raw, uint8_t* dst_argb, int width) { - __m128i shuf = _mm_set_epi8(-1, 9, 10, 11, -1, 6, 7, 8, -1, 3, 4, 5, -1, 0, 1, 2); - RGBToARGBRow_AVX512BW(src_raw, dst_argb, &shuf, width); -} - -LIBYUV_TARGET_AVX512BW -void RGB24ToARGBRow_AVX512BW(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) { - __m128i shuf = _mm_set_epi8(-1, 11, 10, 9, -1, 8, 7, 6, -1, 5, 4, 3, -1, 2, 1, 0); - RGBToARGBRow_AVX512BW(src_rgb24, dst_argb, &shuf, width); -} -#endif - -#ifdef HAS_ARGBTOUVMATRIXROW_AVX2 -LIBYUV_TARGET_AVX2 __attribute__((no_sanitize("cfi-icall"))) -void ARGBToUVMatrixRow_AVX2(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width, - const struct ArgbConstants* c) { - __m256i ymm_u = _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i*)c->kRGBToU)); - __m256i ymm_v = _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i*)c->kRGBToV)); - __m256i ymm_0101 = _mm256_set1_epi16(0x0101); - __m256i ymm_shuf = _mm256_setr_epi8(0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15, - 0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15); - __m256i ymm_8000 = _mm256_set1_epi16((short)0x8000); - __m256i ymm_zero = _mm256_setzero_si256(); - - while (width > 0) { - __m256i ymm0 = _mm256_loadu_si256((const __m256i*)src_argb); - __m256i ymm1 = _mm256_loadu_si256((const __m256i*)(src_argb + 32)); - __m256i ymm2 = _mm256_loadu_si256((const __m256i*)(src_argb + src_stride_argb)); - __m256i ymm3 = _mm256_loadu_si256((const __m256i*)(src_argb + src_stride_argb + 32)); - - ymm0 = _mm256_shuffle_epi8(ymm0, ymm_shuf); - ymm1 = _mm256_shuffle_epi8(ymm1, ymm_shuf); - ymm2 = _mm256_shuffle_epi8(ymm2, ymm_shuf); - ymm3 = _mm256_shuffle_epi8(ymm3, ymm_shuf); - - ymm0 = _mm256_maddubs_epi16(ymm0, ymm_0101); - ymm1 = _mm256_maddubs_epi16(ymm1, ymm_0101); - ymm2 = _mm256_maddubs_epi16(ymm2, ymm_0101); - ymm3 = _mm256_maddubs_epi16(ymm3, ymm_0101); - - ymm0 = _mm256_add_epi16(ymm0, ymm2); - ymm1 = _mm256_add_epi16(ymm1, ymm3); - - ymm0 = _mm256_srli_epi16(ymm0, 1); - ymm1 = _mm256_srli_epi16(ymm1, 1); - ymm0 = _mm256_avg_epu16(ymm0, ymm_zero); - ymm1 = _mm256_avg_epu16(ymm1, ymm_zero); - - ymm0 = _mm256_packus_epi16(ymm0, ymm1); - ymm0 = _mm256_permute4x64_epi64(ymm0, 0xd8); - - ymm1 = _mm256_maddubs_epi16(ymm0, ymm_v); - ymm0 = _mm256_maddubs_epi16(ymm0, ymm_u); - - ymm0 = _mm256_hadd_epi16(ymm0, ymm1); - ymm0 = _mm256_permute4x64_epi64(ymm0, 0xd8); - ymm0 = _mm256_sub_epi16(ymm_8000, ymm0); - ymm0 = _mm256_srli_epi16(ymm0, 8); - ymm0 = _mm256_packus_epi16(ymm0, ymm0); - - __m128i xmm_u = _mm256_castsi256_si128(ymm0); - __m128i xmm_v = _mm256_extracti128_si256(ymm0, 1); - - _mm_storel_epi64((__m128i*)dst_u, xmm_u); - _mm_storel_epi64((__m128i*)dst_v, xmm_v); - - src_argb += 64; - dst_u += 8; - dst_v += 8; - width -= 16; - } -} -#endif - -#ifdef HAS_MERGEUVROW_AVX2 -LIBYUV_TARGET_AVX2 -void MergeUVRow_AVX2(const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uv, - int width) { - while (width > 0) { - __m256i ymm0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i*)src_u)); - __m256i ymm1 = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i*)src_v)); - - ymm1 = _mm256_slli_epi16(ymm1, 8); - ymm0 = _mm256_or_si256(ymm0, ymm1); - - _mm256_storeu_si256((__m256i*)dst_uv, ymm0); - - src_u += 16; - src_v += 16; - dst_uv += 32; - width -= 16; - } -} -#endif - #endif diff --git a/source/scale_common.cc b/source/scale_common.cc index 537f030aa..e51af8d7a 100644 --- a/source/scale_common.cc +++ b/source/scale_common.cc @@ -362,35 +362,36 @@ void ScaleRowDown4Box_C(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width) { + intptr_t stride = src_stride; int x; for (x = 0; x < dst_width - 1; x += 2) { dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + - src_ptr[src_stride + 0] + src_ptr[src_stride + 1] + - src_ptr[src_stride + 2] + src_ptr[src_stride + 3] + - src_ptr[src_stride * 2 + 0] + src_ptr[src_stride * 2 + 1] + - src_ptr[src_stride * 2 + 2] + src_ptr[src_stride * 2 + 3] + - src_ptr[src_stride * 3 + 0] + src_ptr[src_stride * 3 + 1] + - src_ptr[src_stride * 3 + 2] + src_ptr[src_stride * 3 + 3] + 8) >> + src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] + + src_ptr[stride + 3] + src_ptr[stride * 2 + 0] + + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] + + src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] + + src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] + + src_ptr[stride * 3 + 3] + 8) >> 4; dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] + - src_ptr[src_stride + 4] + src_ptr[src_stride + 5] + - src_ptr[src_stride + 6] + src_ptr[src_stride + 7] + - src_ptr[src_stride * 2 + 4] + src_ptr[src_stride * 2 + 5] + - src_ptr[src_stride * 2 + 6] + src_ptr[src_stride * 2 + 7] + - src_ptr[src_stride * 3 + 4] + src_ptr[src_stride * 3 + 5] + - src_ptr[src_stride * 3 + 6] + src_ptr[src_stride * 3 + 7] + 8) >> + src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride + 6] + + src_ptr[stride + 7] + src_ptr[stride * 2 + 4] + + src_ptr[stride * 2 + 5] + src_ptr[stride * 2 + 6] + + src_ptr[stride * 2 + 7] + src_ptr[stride * 3 + 4] + + src_ptr[stride * 3 + 5] + src_ptr[stride * 3 + 6] + + src_ptr[stride * 3 + 7] + 8) >> 4; dst += 2; src_ptr += 8; } if (dst_width & 1) { dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + - src_ptr[src_stride + 0] + src_ptr[src_stride + 1] + - src_ptr[src_stride + 2] + src_ptr[src_stride + 3] + - src_ptr[src_stride * 2 + 0] + src_ptr[src_stride * 2 + 1] + - src_ptr[src_stride * 2 + 2] + src_ptr[src_stride * 2 + 3] + - src_ptr[src_stride * 3 + 0] + src_ptr[src_stride * 3 + 1] + - src_ptr[src_stride * 3 + 2] + src_ptr[src_stride * 3 + 3] + 8) >> + src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] + + src_ptr[stride + 3] + src_ptr[stride * 2 + 0] + + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] + + src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] + + src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] + + src_ptr[stride * 3 + 3] + 8) >> 4; } } @@ -399,35 +400,36 @@ void ScaleRowDown4Box_16_C(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst, int dst_width) { + intptr_t stride = src_stride; int x; for (x = 0; x < dst_width - 1; x += 2) { dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + - src_ptr[src_stride + 0] + src_ptr[src_stride + 1] + - src_ptr[src_stride + 2] + src_ptr[src_stride + 3] + - src_ptr[src_stride * 2 + 0] + src_ptr[src_stride * 2 + 1] + - src_ptr[src_stride * 2 + 2] + src_ptr[src_stride * 2 + 3] + - src_ptr[src_stride * 3 + 0] + src_ptr[src_stride * 3 + 1] + - src_ptr[src_stride * 3 + 2] + src_ptr[src_stride * 3 + 3] + 8) >> + src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] + + src_ptr[stride + 3] + src_ptr[stride * 2 + 0] + + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] + + src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] + + src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] + + src_ptr[stride * 3 + 3] + 8) >> 4; dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] + - src_ptr[src_stride + 4] + src_ptr[src_stride + 5] + - src_ptr[src_stride + 6] + src_ptr[src_stride + 7] + - src_ptr[src_stride * 2 + 4] + src_ptr[src_stride * 2 + 5] + - src_ptr[src_stride * 2 + 6] + src_ptr[src_stride * 2 + 7] + - src_ptr[src_stride * 3 + 4] + src_ptr[src_stride * 3 + 5] + - src_ptr[src_stride * 3 + 6] + src_ptr[src_stride * 3 + 7] + 8) >> + src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride + 6] + + src_ptr[stride + 7] + src_ptr[stride * 2 + 4] + + src_ptr[stride * 2 + 5] + src_ptr[stride * 2 + 6] + + src_ptr[stride * 2 + 7] + src_ptr[stride * 3 + 4] + + src_ptr[stride * 3 + 5] + src_ptr[stride * 3 + 6] + + src_ptr[stride * 3 + 7] + 8) >> 4; dst += 2; src_ptr += 8; } if (dst_width & 1) { dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + - src_ptr[src_stride + 0] + src_ptr[src_stride + 1] + - src_ptr[src_stride + 2] + src_ptr[src_stride + 3] + - src_ptr[src_stride * 2 + 0] + src_ptr[src_stride * 2 + 1] + - src_ptr[src_stride * 2 + 2] + src_ptr[src_stride * 2 + 3] + - src_ptr[src_stride * 3 + 0] + src_ptr[src_stride * 3 + 1] + - src_ptr[src_stride * 3 + 2] + src_ptr[src_stride * 3 + 3] + 8) >> + src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] + + src_ptr[stride + 3] + src_ptr[stride * 2 + 0] + + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] + + src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] + + src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] + + src_ptr[stride * 3 + 3] + 8) >> 4; } } @@ -890,26 +892,27 @@ void ScaleRowDown38_3_Box_C(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { + intptr_t stride = src_stride; int i; assert((dst_width % 3 == 0) && (dst_width > 0)); for (i = 0; i < dst_width; i += 3) { - dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + - src_ptr[src_stride + 0] + src_ptr[src_stride + 1] + - src_ptr[src_stride + 2] + src_ptr[src_stride * 2 + 0] + - src_ptr[src_stride * 2 + 1] + src_ptr[src_stride * 2 + 2]) * - (65536 / 9) >> - 16; - dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + - src_ptr[src_stride + 3] + src_ptr[src_stride + 4] + - src_ptr[src_stride + 5] + src_ptr[src_stride * 2 + 3] + - src_ptr[src_stride * 2 + 4] + src_ptr[src_stride * 2 + 5]) * - (65536 / 9) >> - 16; - dst_ptr[2] = (src_ptr[6] + src_ptr[7] + src_ptr[src_stride + 6] + - src_ptr[src_stride + 7] + src_ptr[src_stride * 2 + 6] + - src_ptr[src_stride * 2 + 7]) * - (65536 / 6) >> - 16; + dst_ptr[0] = + (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] + + src_ptr[stride + 1] + src_ptr[stride + 2] + src_ptr[stride * 2 + 0] + + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) * + (65536 / 9) >> + 16; + dst_ptr[1] = + (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] + + src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride * 2 + 3] + + src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) * + (65536 / 9) >> + 16; + dst_ptr[2] = + (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7] + + src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) * + (65536 / 6) >> + 16; src_ptr += 8; dst_ptr += 3; } @@ -919,26 +922,27 @@ void ScaleRowDown38_3_Box_16_C(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, int dst_width) { + intptr_t stride = src_stride; int i; assert((dst_width % 3 == 0) && (dst_width > 0)); for (i = 0; i < dst_width; i += 3) { - dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + - src_ptr[src_stride + 0] + src_ptr[src_stride + 1] + - src_ptr[src_stride + 2] + src_ptr[src_stride * 2 + 0] + - src_ptr[src_stride * 2 + 1] + src_ptr[src_stride * 2 + 2]) * - (65536u / 9u) >> - 16; - dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + - src_ptr[src_stride + 3] + src_ptr[src_stride + 4] + - src_ptr[src_stride + 5] + src_ptr[src_stride * 2 + 3] + - src_ptr[src_stride * 2 + 4] + src_ptr[src_stride * 2 + 5]) * - (65536u / 9u) >> - 16; - dst_ptr[2] = (src_ptr[6] + src_ptr[7] + src_ptr[src_stride + 6] + - src_ptr[src_stride + 7] + src_ptr[src_stride * 2 + 6] + - src_ptr[src_stride * 2 + 7]) * - (65536u / 6u) >> - 16; + dst_ptr[0] = + (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] + + src_ptr[stride + 1] + src_ptr[stride + 2] + src_ptr[stride * 2 + 0] + + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) * + (65536u / 9u) >> + 16; + dst_ptr[1] = + (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] + + src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride * 2 + 3] + + src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) * + (65536u / 9u) >> + 16; + dst_ptr[2] = + (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7] + + src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) * + (65536u / 6u) >> + 16; src_ptr += 8; dst_ptr += 3; } @@ -949,23 +953,22 @@ void ScaleRowDown38_2_Box_C(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { + intptr_t stride = src_stride; int i; assert((dst_width % 3 == 0) && (dst_width > 0)); for (i = 0; i < dst_width; i += 3) { - dst_ptr[0] = - (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[src_stride + 0] + - src_ptr[src_stride + 1] + src_ptr[src_stride + 2]) * - (65536 / 6) >> - 16; - dst_ptr[1] = - (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[src_stride + 3] + - src_ptr[src_stride + 4] + src_ptr[src_stride + 5]) * - (65536 / 6) >> - 16; - dst_ptr[2] = (src_ptr[6] + src_ptr[7] + src_ptr[src_stride + 6] + - src_ptr[src_stride + 7]) * - (65536 / 4) >> + dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] + + src_ptr[stride + 1] + src_ptr[stride + 2]) * + (65536 / 6) >> 16; + dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] + + src_ptr[stride + 4] + src_ptr[stride + 5]) * + (65536 / 6) >> + 16; + dst_ptr[2] = + (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7]) * + (65536 / 4) >> + 16; src_ptr += 8; dst_ptr += 3; } @@ -975,23 +978,22 @@ void ScaleRowDown38_2_Box_16_C(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, int dst_width) { + intptr_t stride = src_stride; int i; assert((dst_width % 3 == 0) && (dst_width > 0)); for (i = 0; i < dst_width; i += 3) { - dst_ptr[0] = - (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[src_stride + 0] + - src_ptr[src_stride + 1] + src_ptr[src_stride + 2]) * - (65536u / 6u) >> - 16; - dst_ptr[1] = - (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[src_stride + 3] + - src_ptr[src_stride + 4] + src_ptr[src_stride + 5]) * - (65536u / 6u) >> - 16; - dst_ptr[2] = (src_ptr[6] + src_ptr[7] + src_ptr[src_stride + 6] + - src_ptr[src_stride + 7]) * - (65536u / 4u) >> + dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] + + src_ptr[stride + 1] + src_ptr[stride + 2]) * + (65536u / 6u) >> 16; + dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] + + src_ptr[stride + 4] + src_ptr[stride + 5]) * + (65536u / 6u) >> + 16; + dst_ptr[2] = + (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7]) * + (65536u / 4u) >> + 16; src_ptr += 8; dst_ptr += 3; } @@ -1687,7 +1689,7 @@ void ScalePlaneVertical(int src_height, } yi = y >> 16; yf = filtering ? ((y >> 8) & 255) : 0; - InterpolateRow(dst_argb, src_argb + yi * (ptrdiff_t)src_stride, src_stride, + InterpolateRow(dst_argb, src_argb + yi * src_stride, src_stride, dst_width_bytes, yf); dst_argb += dst_stride; y += dy; @@ -1763,7 +1765,7 @@ void ScalePlaneVertical_16(int src_height, } yi = y >> 16; yf = filtering ? ((y >> 8) & 255) : 0; - InterpolateRow(dst_argb, src_argb + yi * (ptrdiff_t)src_stride, src_stride, + InterpolateRow(dst_argb, src_argb + yi * src_stride, src_stride, dst_width_words, yf); dst_argb += dst_stride; y += dy; @@ -1832,8 +1834,8 @@ void ScalePlaneVertical_16To8(int src_height, } yi = y >> 16; yf = filtering ? ((y >> 8) & 255) : 0; - InterpolateRow_16To8(dst_argb, src_argb + yi * (ptrdiff_t)src_stride, - src_stride, scale, dst_width_words, yf); + InterpolateRow_16To8(dst_argb, src_argb + yi * src_stride, src_stride, + scale, dst_width_words, yf); dst_argb += dst_stride; y += dy; } diff --git a/source/scale_gcc.cc b/source/scale_gcc.cc index 6a2524230..fdd38dfe5 100644 --- a/source/scale_gcc.cc +++ b/source/scale_gcc.cc @@ -183,10 +183,10 @@ void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr, "lea 0x10(%1),%1 \n" "sub $0x10,%2 \n" "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(src_stride) // %3 + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } @@ -283,10 +283,10 @@ void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr, "sub $0x20,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(src_stride) // %3 + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #endif // HAS_SCALEROWDOWN2_AVX2 @@ -326,7 +326,7 @@ void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { - ptrdiff_t stridex3; + intptr_t stridex3; asm volatile( "pcmpeqb %%xmm4,%%xmm4 \n" "pabsw %%xmm4,%%xmm5 \n" @@ -367,11 +367,11 @@ void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr, "lea 0x8(%1),%1 \n" "sub $0x8,%2 \n" "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "=&r"(stridex3) // %3 - : "r"(src_stride) // %4 + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "=&r"(stridex3) // %3 + : "r"((intptr_t)(src_stride)) // %4 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } @@ -456,11 +456,11 @@ void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr, "sub $0x10,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(src_stride), // %3 - "r"(src_stride * 3) // %4 + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(src_stride * 3)) // %4 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif // HAS_SCALEROWDOWN4_AVX2 @@ -557,11 +557,11 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr, "lea 0x18(%1),%1 \n" "sub $0x18,%2 \n" "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(src_stride), // %3 - "m"(kMadd21) // %4 + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "m"(kMadd21) // %4 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } @@ -625,11 +625,11 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr, "lea 0x18(%1),%1 \n" "sub $0x18,%2 \n" "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(src_stride), // %3 - "m"(kMadd21) // %4 + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "m"(kMadd21) // %4 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } @@ -701,10 +701,10 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr, "lea 0x6(%1),%1 \n" "sub $0x6,%2 \n" "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(src_stride) // %3 + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } @@ -762,10 +762,10 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, "lea 0x6(%1),%1 \n" "sub $0x6,%2 \n" "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(src_stride) // %3 + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } @@ -935,11 +935,11 @@ void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr, "lea 0x10(%1),%1 \n" // 8 sample to 16 sample "sub $0x10,%2 \n" "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(src_stride), // %3 - "r"(dst_stride) // %4 + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)) // %4 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } @@ -1084,12 +1084,12 @@ void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t* src_ptr, "lea 0x20(%1),%1 \n" // 8 sample to 16 sample "sub $0x10,%2 \n" "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(src_stride), // %3 - "r"(dst_stride), // %4 - "m"(kLinearShuffleFar) // %5 + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)), // %4 + "m"(kLinearShuffleFar) // %5 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } @@ -1246,11 +1246,11 @@ void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr, "lea 0x10(%1),%1 \n" // 4 pixel to 8 pixel "sub $0x8,%2 \n" "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(src_stride), // %3 - "r"(dst_stride) // %4 + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)) // %4 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } @@ -1371,12 +1371,12 @@ void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr, "lea 0x10(%1),%1 \n" // 8 sample to 16 sample "sub $0x10,%2 \n" "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(src_stride), // %3 - "r"(dst_stride), // %4 - "m"(kLinearMadd31) // %5 + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)), // %4 + "m"(kLinearMadd31) // %5 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } @@ -1497,12 +1497,12 @@ void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr, "sub $0x20,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(src_stride), // %3 - "r"(dst_stride), // %4 - "m"(kLinearMadd31) // %5 + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)), // %4 + "m"(kLinearMadd31) // %5 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } @@ -1612,12 +1612,12 @@ void ScaleRowUp2_Bilinear_12_AVX2(const uint16_t* src_ptr, "sub $0x10,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(src_stride), // %3 - "r"(dst_stride), // %4 - "m"(kLinearShuffleFar) // %5 + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)), // %4 + "m"(kLinearShuffleFar) // %5 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif @@ -1746,11 +1746,11 @@ void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, "sub $0x10,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(src_stride), // %3 - "r"(dst_stride) // %4 + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)) // %4 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } #endif @@ -2016,10 +2016,10 @@ void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb, "lea 0x10(%1),%1 \n" "sub $0x4,%2 \n" "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - : "r"(src_stride) // %3 + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); } @@ -2030,8 +2030,8 @@ void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb, int src_stepx, uint8_t* dst_argb, int dst_width) { - ptrdiff_t src_stepx_x4 = (ptrdiff_t)(src_stepx); - ptrdiff_t src_stepx_x12; + intptr_t src_stepx_x4 = (intptr_t)(src_stepx); + intptr_t src_stepx_x12; (void)src_stride; asm volatile( "lea 0x00(,%1,4),%1 \n" @@ -2067,8 +2067,9 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb, int src_stepx, uint8_t* dst_argb, int dst_width) { - ptrdiff_t src_stepx_x4 = (ptrdiff_t)(src_stepx); - ptrdiff_t src_stepx_x12; + intptr_t src_stepx_x4 = (intptr_t)(src_stepx); + intptr_t src_stepx_x12; + intptr_t row1 = (intptr_t)(src_stride); asm volatile( "lea 0x00(,%1,4),%1 \n" "lea 0x00(%1,%1,2),%4 \n" @@ -2101,7 +2102,7 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb, "+r"(dst_argb), // %2 "+rm"(dst_width), // %3 "=&r"(src_stepx_x12), // %4 - "+r"(src_stride) // %5 + "+r"(row1) // %5 : : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); } @@ -2363,12 +2364,12 @@ void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr, "lea 0x8(%1),%1 \n" // 4 UV "sub $0x4,%2 \n" "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(src_stride), // %3 - "m"(kShuffleSplitUV), // %4 - "m"(kShuffleMergeUV) // %5 + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "m"(kShuffleSplitUV), // %4 + "m"(kShuffleMergeUV) // %5 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif // HAS_SCALEUVROWDOWN2BOX_SSSE3 @@ -2404,12 +2405,12 @@ void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr, "sub $0x8,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(src_stride), // %3 - "m"(kShuffleSplitUV), // %4 - "m"(kShuffleMergeUV) // %5 + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "m"(kShuffleSplitUV), // %4 + "m"(kShuffleMergeUV) // %5 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif // HAS_SCALEUVROWDOWN2BOX_AVX2 @@ -2530,12 +2531,12 @@ void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr, "lea 0x10(%1),%1 \n" // 4 uv to 8 uv "sub $0x8,%2 \n" "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(src_stride), // %3 - "r"(dst_stride), // %4 - "m"(kUVLinearMadd31) // %5 + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)), // %4 + "m"(kUVLinearMadd31) // %5 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } @@ -2654,12 +2655,12 @@ void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr, "sub $0x10,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(src_stride), // %3 - "r"(dst_stride), // %4 - "m"(kUVLinearMadd31) // %5 + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)), // %4 + "m"(kUVLinearMadd31) // %5 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } @@ -2798,11 +2799,11 @@ void ScaleUVRowUp2_Bilinear_16_SSE41(const uint16_t* src_ptr, "lea 0x10(%1),%1 \n" // 2 uv to 4 uv "sub $0x4,%2 \n" "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(src_stride), // %3 - "r"(dst_stride) // %4 + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)) // %4 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } @@ -2929,11 +2930,11 @@ void ScaleUVRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, "sub $0x8,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(src_stride), // %3 - "r"(dst_stride) // %4 + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)) // %4 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } #endif diff --git a/unit_test/convert_argb_test.cc b/unit_test/convert_argb_test.cc index 177f3a669..e9a91804b 100644 --- a/unit_test/convert_argb_test.cc +++ b/unit_test/convert_argb_test.cc @@ -2827,8 +2827,9 @@ TEST_F(LibYUVConvertTest, TestI400LargeSize) { int has_large_malloc = 1; #endif if (!has_large_malloc) { - GTEST_SKIP() << "WARNING: Large allocation may assert for " - << (size_t)kWidth * kHeight << " bytes"; + printf("WARNING: Skipped. Large allocation may assert for %zd\n", + (size_t)kWidth * kHeight); + return; } // Allocate one extra column so that the coalesce optimizations do not trigger @@ -2840,16 +2841,20 @@ TEST_F(LibYUVConvertTest, TestI400LargeSize) { fflush(stdout); align_buffer_page_end(orig_i400, (size_t)kWidth * kHeight); if (!orig_i400) { - GTEST_SKIP() << "WARNING: unable to allocate I400 image of " - << (size_t)kWidth * kHeight << " bytes"; + printf("WARNING: unable to allocate I400 image of %zd bytes\n", + (size_t)kWidth * kHeight); + fflush(stdout); + return; } printf("INFO: allocate I400 image returned %p\n", orig_i400); fflush(stdout); align_buffer_page_end(dest_argb, (size_t)kWidth * kHeight * 4); if (!dest_argb) { + printf("WARNING: unable to allocate ARGB image of %zd bytes\n", + (size_t)kWidth * kHeight * 4); + fflush(stdout); free_aligned_buffer_page_end(orig_i400); - GTEST_SKIP() << "WARNING: unable to allocate ARGB image of " - << (size_t)kWidth * kHeight * 4 << " bytes"; + return; } printf("INFO: allocate ARGB image returned %p\n", dest_argb); fflush(stdout); @@ -2867,72 +2872,4 @@ TEST_F(LibYUVConvertTest, TestI400LargeSize) { #endif // !defined(LEAN_TESTS) - -#define TESTATOBPI(FMT_A, TYPE_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, SUBSAMP_X, \ - SUBSAMP_Y, W1280, N, NEG, OFF) \ - TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##N) { \ - const int kWidth = W1280; \ - const int kHeight = benchmark_height_; \ - const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \ - const int kStrideA = \ - (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \ - const int kStrideY = kWidth; \ - const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X) * 2; \ - const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \ - align_buffer_page_end(src_argb, \ - kStrideA* kHeightA*(int)sizeof(TYPE_A) + OFF); \ - align_buffer_page_end(dst_y_c, kStrideY* kHeight); \ - align_buffer_page_end(dst_uv_c, kSizeUV); \ - align_buffer_page_end(dst_y_opt, kStrideY* kHeight); \ - align_buffer_page_end(dst_uv_opt, kSizeUV); \ - for (int i = 0; i < kStrideA * kHeightA * (int)sizeof(TYPE_A); ++i) { \ - src_argb[i + OFF] = (fastrand() & 0xff); \ - } \ - memset(dst_y_c, 1, kStrideY* kHeight); \ - memset(dst_uv_c, 2, kSizeUV); \ - memset(dst_y_opt, 101, kStrideY* kHeight); \ - memset(dst_uv_opt, 102, kSizeUV); \ - MaskCpuFlags(disable_cpu_flags_); \ - FMT_A##To##FMT_B((TYPE_A*)(src_argb + OFF), kStrideA, dst_y_c, kStrideY, \ - dst_uv_c, kStrideUV, kWidth, NEG kHeight); \ - MaskCpuFlags(benchmark_cpu_info_); \ - for (int i = 0; i < benchmark_iterations_; ++i) { \ - FMT_A##To##FMT_B((TYPE_A*)(src_argb + OFF), kStrideA, dst_y_opt, \ - kStrideY, dst_uv_opt, kStrideUV, kWidth, NEG kHeight); \ - } \ - for (int i = 0; i < kStrideY * kHeight; ++i) { \ - EXPECT_EQ(dst_y_c[i], dst_y_opt[i]); \ - } \ - for (int i = 0; i < kSizeUV; ++i) { \ - EXPECT_EQ(dst_uv_c[i], dst_uv_opt[i]); \ - } \ - free_aligned_buffer_page_end(src_argb); \ - free_aligned_buffer_page_end(dst_y_c); \ - free_aligned_buffer_page_end(dst_uv_c); \ - free_aligned_buffer_page_end(dst_y_opt); \ - free_aligned_buffer_page_end(dst_uv_opt); \ - } - -#if defined(ENABLE_FULL_TESTS) -#define TESTATOBP(FMT_A, TYPE_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, SUBSAMP_X, \ - SUBSAMP_Y) \ - TESTATOBPI(FMT_A, TYPE_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, SUBSAMP_X, \ - SUBSAMP_Y, benchmark_width_ + 1, _Any, +, 0) \ - TESTATOBPI(FMT_A, TYPE_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, SUBSAMP_X, \ - SUBSAMP_Y, benchmark_width_, _Unaligned, +, 4) \ - TESTATOBPI(FMT_A, TYPE_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, SUBSAMP_X, \ - SUBSAMP_Y, benchmark_width_, _Invert, -, 0) \ - TESTATOBPI(FMT_A, TYPE_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, SUBSAMP_X, \ - SUBSAMP_Y, benchmark_width_, _Opt, +, 0) -#else -#define TESTATOBP(FMT_A, TYPE_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, SUBSAMP_X, \ - SUBSAMP_Y) \ - TESTATOBPI(FMT_A, TYPE_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, SUBSAMP_X, \ - SUBSAMP_Y, benchmark_width_, _Opt, +, 0) -#endif - -TESTATOBP(RAW, uint8_t, 3, 3, 1, NV21, 2, 2) -TESTATOBP(RGB24, uint8_t, 3, 3, 1, NV12, 2, 2) -TESTATOBP(RAW, uint8_t, 3, 3, 1, JNV21, 2, 2) - } // namespace libyuv diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc index 3d5ce3799..f5c9c6259 100644 --- a/unit_test/convert_test.cc +++ b/unit_test/convert_test.cc @@ -825,6 +825,7 @@ TESTATOBP(ARGB, 1, 4, NV12, 2, 2) TESTATOBP(ARGB, 1, 4, NV21, 2, 2) TESTATOBP(ABGR, 1, 4, NV12, 2, 2) TESTATOBP(ABGR, 1, 4, NV21, 2, 2) +TESTATOBP(RAW, 1, 3, JNV21, 2, 2) TESTATOBP(YUY2, 2, 4, NV12, 2, 2) TESTATOBP(UYVY, 2, 4, NV12, 2, 2) TESTATOBP(AYUV, 1, 4, NV12, 2, 2) diff --git a/unit_test/rotate_test.cc b/unit_test/rotate_test.cc index abc08efa8..9a9a4a305 100644 --- a/unit_test/rotate_test.cc +++ b/unit_test/rotate_test.cc @@ -892,11 +892,6 @@ TEST_F(LibYUVRotateTest, Transpose4x4_Test) { Transpose4x4_32_NEON((const uint8_t*)src_pixels, height * 4, (uint8_t*)dst_pixels_opt, width * 4, width); } else -#elif defined(HAS_TRANSPOSE4X4_32_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - Transpose4x4_32_SSE2((const uint8_t*)src_pixels, height * 4, - (uint8_t*)dst_pixels_opt, width * 4, width); - } else #endif { Transpose4x4_32_C((const uint8_t*)src_pixels, height * 4, diff --git a/unit_test/scale_plane_test.cc b/unit_test/scale_plane_test.cc index 979c70aad..3e801f250 100644 --- a/unit_test/scale_plane_test.cc +++ b/unit_test/scale_plane_test.cc @@ -8,14 +8,9 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include -#include #include -#include #include -#include - #include "../unit_test/unit_test.h" #include "libyuv/cpu_id.h" #include "libyuv/scale.h" @@ -43,95 +38,6 @@ namespace libyuv { #ifdef ENABLE_ROW_TESTS -#ifdef HAS_SCALEROWDOWN2_SSSE3 -TEST_F(LibYUVScaleTest, TestScaleRowDown2Box_Odd_SSSE3) { - SIMD_ALIGNED(uint8_t orig_pixels[128 * 2]); - SIMD_ALIGNED(uint8_t dst_pixels_opt[64]); - SIMD_ALIGNED(uint8_t dst_pixels_c[64]); - memset(orig_pixels, 0, sizeof(orig_pixels)); - memset(dst_pixels_opt, 0, sizeof(dst_pixels_opt)); - memset(dst_pixels_c, 0, sizeof(dst_pixels_c)); - - int has_ssse3 = TestCpuFlag(kCpuHasSSSE3); - if (!has_ssse3) { - printf("Warning SSSE3 not detected; Skipping test.\n"); - } else { - // TL. - orig_pixels[0] = 255u; - orig_pixels[1] = 0u; - orig_pixels[128 + 0] = 0u; - orig_pixels[128 + 1] = 0u; - // TR. - orig_pixels[2] = 0u; - orig_pixels[3] = 100u; - orig_pixels[128 + 2] = 0u; - orig_pixels[128 + 3] = 0u; - // BL. - orig_pixels[4] = 0u; - orig_pixels[5] = 0u; - orig_pixels[128 + 4] = 50u; - orig_pixels[128 + 5] = 0u; - // BR. - orig_pixels[6] = 0u; - orig_pixels[7] = 0u; - orig_pixels[128 + 6] = 0u; - orig_pixels[128 + 7] = 20u; - // Odd. - orig_pixels[126] = 4u; - orig_pixels[127] = 255u; - orig_pixels[128 + 126] = 16u; - orig_pixels[128 + 127] = 255u; - - // Test regular half size. - ScaleRowDown2Box_C(orig_pixels, 128, dst_pixels_c, 64); - - EXPECT_EQ(64u, dst_pixels_c[0]); - EXPECT_EQ(25u, dst_pixels_c[1]); - EXPECT_EQ(13u, dst_pixels_c[2]); - EXPECT_EQ(5u, dst_pixels_c[3]); - EXPECT_EQ(0u, dst_pixels_c[4]); - EXPECT_EQ(133u, dst_pixels_c[63]); - - // Test Odd width version - Last pixel is just 1 horizontal pixel. - ScaleRowDown2Box_Odd_C(orig_pixels, 128, dst_pixels_c, 64); - - EXPECT_EQ(64u, dst_pixels_c[0]); - EXPECT_EQ(25u, dst_pixels_c[1]); - EXPECT_EQ(13u, dst_pixels_c[2]); - EXPECT_EQ(5u, dst_pixels_c[3]); - EXPECT_EQ(0u, dst_pixels_c[4]); - EXPECT_EQ(10u, dst_pixels_c[63]); - - // Test one pixel less, should skip the last pixel. - memset(dst_pixels_c, 0, sizeof(dst_pixels_c)); - ScaleRowDown2Box_Odd_C(orig_pixels, 128, dst_pixels_c, 63); - - EXPECT_EQ(64u, dst_pixels_c[0]); - EXPECT_EQ(25u, dst_pixels_c[1]); - EXPECT_EQ(13u, dst_pixels_c[2]); - EXPECT_EQ(5u, dst_pixels_c[3]); - EXPECT_EQ(0u, dst_pixels_c[4]); - EXPECT_EQ(0u, dst_pixels_c[63]); - - // Test regular half size SSSE3. - ScaleRowDown2Box_SSSE3(orig_pixels, 128, dst_pixels_opt, 64); - - EXPECT_EQ(64u, dst_pixels_opt[0]); - EXPECT_EQ(25u, dst_pixels_opt[1]); - EXPECT_EQ(13u, dst_pixels_opt[2]); - EXPECT_EQ(5u, dst_pixels_opt[3]); - EXPECT_EQ(0u, dst_pixels_opt[4]); - EXPECT_EQ(133u, dst_pixels_opt[63]); - - // Compare C and SSSE3 match. - ScaleRowDown2Box_Odd_C(orig_pixels, 128, dst_pixels_c, 64); - ScaleRowDown2Box_Odd_SSSE3(orig_pixels, 128, dst_pixels_opt, 64); - for (int i = 0; i < 64; ++i) { - EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); - } - } -} -#endif // HAS_SCALEROWDOWN2_SSSE3 TEST_F(LibYUVScaleTest, TestScaleRowDown2Box_16) { SIMD_ALIGNED(uint16_t orig_pixels[2560 * 2]); @@ -467,71 +373,4 @@ TEST_F(LibYUVScaleTest, PlaneTest1_16_Box) { free_aligned_buffer_page_end(dst_pixels_alloc); free_aligned_buffer_page_end(orig_pixels_alloc); } - -// POC: int * int overflow in ScalePlaneVertical (scale_common.cc). -// -// `yi * src_stride` is evaluated as int * int. When the product exceeds -// INT_MAX it wraps negative and InterpolateRow reads from BEFORE the -// source allocation. -// -// Parameters: -// - dst_width == src_width -// -> ScalePlane dispatches to ScalePlaneVertical -// - src_height == 5, dst_height == 1 -// -> single iteration with yi == 2 -// - src_stride == 0x7FFFFFF8 -// -> 2 * 0x7FFFFFF8 == 0xFFFFFFF0 == -16 (int) -// -// The source buffer is sized so that the *correct* 64-bit offset -// (2 * 0x7FFFFFF8 == 4294967280) plus kWidth bytes is in-bounds. With the -// bug, the 32-bit product is -16 and ASAN reports a heap-buffer-overflow -// READ "16 bytes before" the allocation. -TEST_F(LibYUVScaleTest, ScalePlaneVertical_IntStrideOverflow) { - const int kWidth = 16; - const int kSrcHeight = 5; - const int kDstHeight = 1; - const int kStride = 0x7FFFFFF8; // 2147483640 - - // src_size is big enough for the only row this call legitimately touches - // (yi == 2) when computed in 64-bit: 2 * stride + width = 4 GiB. - size_t src_size = kStride; - if (src_size > SIZE_MAX / 2) { - GTEST_SKIP() << "could not represent allocation size in size_t"; - } - src_size *= 2; - if (src_size > SIZE_MAX - kWidth) { - GTEST_SKIP() << "could not represent allocation size in size_t"; - } - src_size += kWidth; - -#if defined(__aarch64__) - // Infer malloc can accept a large size for cpu with dot product (a76/a55) - int has_large_malloc = TestCpuFlag(kCpuHasNeonDotProd); -#else - int has_large_malloc = 1; -#endif - if (!has_large_malloc) { - GTEST_SKIP() << "large allocation may assert for " << src_size << " bytes"; - } - - uint8_t* src = new (std::nothrow) uint8_t[src_size]; - if (!src) { - GTEST_SKIP() << "could not allocate " << src_size << " bytes"; - } - uint8_t* dst = new uint8_t[kWidth]; - memset(dst, 0, kWidth); - - // Force the scalar path so the crash site is deterministic - // (InterpolateRow_C -> memcpy when yf == 0). - MaskCpuFlags(disable_cpu_flags_); - - int r = ScalePlane(src, kStride, kWidth, kSrcHeight, dst, kWidth, kWidth, - kDstHeight, kFilterNone); - - // Not reached under ASAN. - EXPECT_EQ(0, r); - delete[] src; - delete[] dst; -} - } // namespace libyuv