From 36e0fd216bedfd7404cd88b33434143b445a2cf4 Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Wed, 29 Apr 2026 17:06:56 -0700 Subject: [PATCH] [libyuv] Remove all x86 SSE optimizations Removed all SSE functions, macros, dispatching logic, and related unit tests across the repository to reduce code size and complexity. Left cpuid detection intact. Supported architectures like AVX2, NEON, SVE, etc. are unaffected. R=rrwinterton@gmail.com Bug: None Test: Build and run libyuv_unittest Change-Id: Id19608dba35b79c4c8fc31f920a6a968883d300f --- README.chromium | 2 +- include/libyuv/convert_from_argb.h | 34 - include/libyuv/planar_functions.h | 3 - include/libyuv/row.h | 176 ++--- include/libyuv/row_sve.h | 6 +- include/libyuv/version.h | 2 +- psnr.o | Bin 0 -> 2560 bytes source/convert.cc | 874 +++++++---------------- source/convert_argb.cc | 27 +- source/convert_from_argb.cc | 645 +++++------------ source/planar_functions.cc | 2 +- source/rotate_gcc.cc | 34 +- source/row_any.cc | 42 +- source/row_common.cc | 52 ++ source/row_gcc.cc | 210 ++---- source/row_lasx.cc | 42 +- source/row_lsx.cc | 42 +- source/row_neon.cc | 80 +-- source/row_neon64.cc | 155 ++-- source/row_rvv.cc | 1047 +++++++++++++++++++++++++++- source/row_sme.cc | 14 - source/row_sve.cc | 13 - source/row_win.cc | 193 ----- source/scale_common.cc | 202 +++--- source/scale_gcc.cc | 239 +++---- unit_test/convert_argb_test.cc | 85 +-- unit_test/convert_test.cc | 1 + unit_test/rotate_test.cc | 5 - unit_test/scale_plane_test.cc | 161 ----- 29 files changed, 2031 insertions(+), 2357 deletions(-) create mode 100644 psnr.o diff --git a/README.chromium b/README.chromium index 1407f963e..a805c91be 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: https://chromium.googlesource.com/libyuv/libyuv/ -Version: 1934 +Version: 1928 Revision: DEPS License: BSD-3-Clause License File: LICENSE diff --git a/include/libyuv/convert_from_argb.h b/include/libyuv/convert_from_argb.h index 8adec16dc..c0473fd70 100644 --- a/include/libyuv/convert_from_argb.h +++ b/include/libyuv/convert_from_argb.h @@ -456,40 +456,6 @@ int ARGBToUYVY(const uint8_t* src_argb, int width, int height); -// RAW to NV21 with Matrix -LIBYUV_API -int RAWToNV21Matrix(const uint8_t* src_raw, - int src_stride_raw, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_vu, - int dst_stride_vu, - const struct ArgbConstants* argbconstants, - int width, - int height); - -// RAW to NV21 -LIBYUV_API -int RAWToNV21(const uint8_t* src_raw, - int src_stride_raw, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_vu, - int dst_stride_vu, - int width, - int height); - -// RGB24 to NV12 -LIBYUV_API -int RGB24ToNV12(const uint8_t* src_rgb24, - int src_stride_rgb24, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_uv, - int dst_stride_uv, - int width, - int height); - // RAW to JNV21 full range NV21 LIBYUV_API int RAWToJNV21(const uint8_t* src_raw, diff --git a/include/libyuv/planar_functions.h b/include/libyuv/planar_functions.h index 20bf78198..852736a97 100644 --- a/include/libyuv/planar_functions.h +++ b/include/libyuv/planar_functions.h @@ -487,9 +487,6 @@ int NV21ToNV12(const uint8_t* src_y, int width, int height); -// Alias -#define NV12ToNV21 NV21ToNV12 - LIBYUV_API int YUY2ToY(const uint8_t* src_yuy2, int src_stride_yuy2, diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 40272cf5a..b47d42eed 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -140,13 +140,6 @@ extern "C" { // The following are available on all x86 platforms, but // require VS2012, clang 3.4 or gcc 4.7. -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(__x86_64__) || defined(__i386__) || \ - defined(_M_X64) || defined(_M_X86)) -#define HAS_ARGBTOUVMATRIXROW_AVX2 -#define HAS_MERGEUVROW_AVX2 -#endif - #if !defined(LIBYUV_DISABLE_X86) && defined(USE_ROW_GCC) && \ (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2) || \ defined(GCC_HAS_AVX2)) @@ -170,6 +163,7 @@ extern "C" { #define HAS_I444TORGB24ROW_AVX2 #define HAS_INTERPOLATEROW_AVX2 #define HAS_J422TOARGBROW_AVX2 +#define HAS_MERGEUVROW_AVX2 #define HAS_MIRRORROW_AVX2 #define HAS_NV12TOARGBROW_AVX2 #define HAS_NV12TORGB24ROW_AVX2 @@ -200,6 +194,7 @@ extern "C" { #if !defined(LIBYUV_DISABLE_X86) && defined(USE_ROW_GCC) && \ (defined(__x86_64__) || defined(__i386__)) && \ !defined(LIBYUV_ENABLE_ROWWIN) +#define HAS_RAWTOYJROW_SSSE3 #define HAS_AB64TOARGBROW_SSSE3 #define HAS_ABGRTOAR30ROW_SSSE3 #define HAS_ABGRTOYJROW_SSSE3 @@ -250,9 +245,11 @@ extern "C" { // TODO: port row_win to use 8 bit coefficients. #define HAS_ARGBTOYJROW_SSSE3 #define HAS_ARGBTOYROW_SSSE3 -#define HAS_ARGBTOYMATRIXROW_SSSE3 #define HAS_BGRATOYROW_SSSE3 +#define HAS_RAWTOYROW_SSSE3 #define HAS_ABGRTOYROW_SSSE3 +#define HAS_RGB24TOYJROW_SSSE3 +#define HAS_RGB24TOYROW_SSSE3 #define HAS_RGBATOYROW_SSSE3 // TODO: adjust row_win to use 8 bit negative coefficients. @@ -300,7 +297,6 @@ extern "C" { #define HAS_ARGBTOUV444MATRIXROW_AVX2 #define HAS_ARGBTOYJROW_AVX2 #define HAS_ARGBTOYROW_AVX2 -#define HAS_ARGBTOYMATRIXROW_AVX2 #define HAS_ARGBUNATTENUATEROW_AVX2 #define HAS_CONVERT16TO8ROW_AVX2 #define HAS_CONVERT8TO16ROW_AVX2 @@ -334,6 +330,8 @@ extern "C" { #define HAS_P210TOARGBROW_AVX2 #define HAS_P410TOAR30ROW_AVX2 #define HAS_P410TOARGBROW_AVX2 +#define HAS_RAWTOYJROW_AVX2 +#define HAS_RGB24TOYJROW_AVX2 #define HAS_RGBATOYJROW_AVX2 #define HAS_SPLITARGBROW_AVX2 #define HAS_SPLITRGBROW_AVX2 @@ -356,13 +354,7 @@ extern "C" { defined(_M_X64) || defined(_M_X86)) && \ ((defined(_MSC_VER) && !defined(__clang__)) || \ defined(LIBYUV_ENABLE_ROWWIN)) -#define HAS_RAWTOARGBROW_AVX2 -#if defined(__x86_64__) || defined(_M_X64) -#define HAS_RAWTOARGBROW_AVX512BW -#define HAS_RGB24TOARGBROW_AVX512BW -#endif #define HAS_ARGBTOYROW_AVX2 -#define HAS_ARGBTOYMATRIXROW_AVX2 #define HAS_ABGRTOYROW_AVX2 #define HAS_ARGBTOYJROW_AVX2 #define HAS_ABGRTOYJROW_AVX2 @@ -378,10 +370,6 @@ extern "C" { (defined(__x86_64__) || defined(__i386__)) && defined(CLANG_HAS_AVX512) && \ !defined(LIBYUV_ENABLE_ROWWIN) #define HAS_COPYROW_AVX512BW -#if defined(__x86_64__) || defined(_M_X64) -#define HAS_RAWTOARGBROW_AVX512BW -#define HAS_RGB24TOARGBROW_AVX512BW -#endif #define HAS_ARGBTORGB24ROW_AVX512VBMI #define HAS_CONVERT16TO8ROW_AVX512BW #define HAS_MERGEUVROW_AVX512BW @@ -395,7 +383,6 @@ extern "C" { #define HAS_ARGBTOUV444ROW_AVX512BW #define HAS_ARGBTOUV444MATRIXROW_AVX512BW #define HAS_ARGBTOYROW_AVX512BW -#define HAS_ARGBTOYMATRIXROW_AVX512BW #define HAS_ARGBTOUVJ444ROW_AVX512BW #define HAS_ARGBTOUVROW_AVX512BW #define HAS_ARGBTOUVJROW_AVX512BW @@ -433,7 +420,6 @@ extern "C" { #define HAS_ARGBTOUV444ROW_NEON #define HAS_ARGBTOUVJ444ROW_NEON #define HAS_ARGBTOUVJROW_NEON -#define HAS_ARGBTOUVMATRIXROW_NEON #define HAS_ARGBTOUVROW_NEON #define HAS_ARGBTOYJROW_NEON #if !defined(__aarch64__) @@ -496,9 +482,13 @@ extern "C" { #define HAS_RAWTORGBAROW_NEON #define HAS_RAWTOUVJROW_NEON #define HAS_RAWTOUVROW_NEON +#define HAS_RAWTOYJROW_NEON +#define HAS_RAWTOYROW_NEON #define HAS_RGB24TOARGBROW_NEON #define HAS_RGB24TOUVJROW_NEON #define HAS_RGB24TOUVROW_NEON +#define HAS_RGB24TOYJROW_NEON +#define HAS_RGB24TOYROW_NEON #define HAS_RGB565TOARGBROW_NEON #define HAS_RGB565TOUVROW_NEON #define HAS_RGB565TOYROW_NEON @@ -569,7 +559,6 @@ extern "C" { #define HAS_ARGBSEPIAROW_NEON_DOTPROD #define HAS_ARGBTOYJROW_NEON_DOTPROD #define HAS_ARGBTOYROW_NEON_DOTPROD -#define HAS_ARGBTOYMATRIXROW_NEON_DOTPROD #define HAS_BGRATOYROW_NEON_DOTPROD #define HAS_RGBATOYJROW_NEON_DOTPROD #define HAS_RGBATOYROW_NEON_DOTPROD @@ -580,7 +569,6 @@ extern "C" { #define HAS_ARGBTOUV444ROW_NEON_I8MM #define HAS_ARGBTOUVJ444ROW_NEON_I8MM #define HAS_ARGBTOUVJROW_NEON_I8MM -#define HAS_ARGBTOUVMATRIXROW_NEON_I8MM #define HAS_ARGBTOUVROW_NEON_I8MM #define HAS_BGRATOUVROW_NEON_I8MM #define HAS_RGBATOUVROW_NEON_I8MM @@ -596,7 +584,6 @@ extern "C" { #define HAS_ARGBTORGB565DITHERROW_SVE2 #define HAS_ARGBTORGB565ROW_SVE2 #define HAS_ARGBTOUVJROW_SVE2 -#define HAS_ARGBTOUVMATRIXROW_SVE2 #define HAS_ARGBTOUVROW_SVE2 #define HAS_AYUVTOUVROW_SVE2 #define HAS_AYUVTOVUROW_SVE2 @@ -648,7 +635,6 @@ extern "C" { #define HAS_ABGRTOUVROW_SME #define HAS_ARGBMULTIPLYROW_SME #define HAS_ARGBTOUVJROW_SME -#define HAS_ARGBTOUVMATRIXROW_SME #define HAS_ARGBTOUVROW_SME #define HAS_BGRATOUVROW_SME #define HAS_CONVERT16TO8ROW_SME @@ -757,8 +743,10 @@ extern "C" { #define HAS_RAWTOARGBROW_LSX #define HAS_RAWTORGB24ROW_LSX #define HAS_RAWTOUVROW_LSX +#define HAS_RAWTOYROW_LSX #define HAS_RGB24TOARGBROW_LSX #define HAS_RGB24TOUVROW_LSX +#define HAS_RGB24TOYROW_LSX #define HAS_RGB565TOARGBROW_LSX #define HAS_RGB565TOUVROW_LSX #define HAS_RGB565TOYROW_LSX @@ -778,9 +766,10 @@ extern "C" { #define HAS_YUY2TOUV422ROW_LSX #define HAS_YUY2TOYROW_LSX #define HAS_ARGBTOYROW_LSX -#define HAS_ARGBTOYMATRIXROW_LSX #define HAS_ABGRTOYJROW_LSX #define HAS_RGBATOYJROW_LSX +#define HAS_RGB24TOYJROW_LSX +#define HAS_RAWTOYJROW_LSX #endif #if !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx) @@ -813,7 +802,6 @@ extern "C" { #define HAS_ARGBTOUVROW_LASX #define HAS_ARGBTOYJROW_LASX #define HAS_ARGBTOYROW_LASX -#define HAS_ARGBTOYMATRIXROW_LASX #define HAS_ABGRTOYJROW_LASX #define HAS_ABGRTOYROW_LASX #define HAS_I422ALPHATOARGBROW_LASX @@ -832,8 +820,10 @@ extern "C" { #define HAS_NV21TOARGBROW_LASX #define HAS_RAWTOARGBROW_LASX #define HAS_RAWTOUVROW_LASX +#define HAS_RAWTOYROW_LASX #define HAS_RGB24TOARGBROW_LASX #define HAS_RGB24TOUVROW_LASX +#define HAS_RGB24TOYROW_LASX #define HAS_RGB565TOARGBROW_LASX #define HAS_RGB565TOUVROW_LASX #define HAS_RGB565TOYROW_LASX @@ -846,6 +836,8 @@ extern "C" { #define HAS_RGBATOYROW_LASX #define HAS_RGBATOYJROW_LASX #define HAS_BGRATOYROW_LASX +#define HAS_RGB24TOYJROW_LASX +#define HAS_RAWTOYJROW_LASX #endif #if !defined(LIBYUV_DISABLE_RVV) && defined(__riscv_vector) @@ -875,6 +867,10 @@ extern "C" { #define HAS_BGRATOYROW_RVV #define HAS_COPYROW_RVV #define HAS_INTERPOLATEROW_RVV +#define HAS_RAWTOYJROW_RVV +#define HAS_RAWTOYROW_RVV +#define HAS_RGB24TOYJROW_RVV +#define HAS_RGB24TOYROW_RVV #define HAS_RGBATOYJROW_RVV #define HAS_RGBATOYMATRIXROW_RVV #define HAS_RGBATOYROW_RVV @@ -896,7 +892,8 @@ extern "C" { // __riscv_vcreate_v_u8m2x3 // __riscv_vcreate_v_u8m2x4 // __riscv_vcreate_v_u8m4x2 -#if defined(LIBYUV_RVV_HAS_VCREATE) +#if !defined(LIBYUV_RVV_HAS_TUPLE_TYPE) || \ + (defined(LIBYUV_RVV_HAS_TUPLE_TYPE) && defined(LIBYUV_RVV_HAS_VCREATE)) #define HAS_AB64TOARGBROW_RVV #define HAS_AR64TOAB64ROW_RVV #define HAS_ARGBATTENUATEROW_RVV @@ -1779,6 +1776,12 @@ void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width); void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width); void RGBAToYRow_AVX512BW(const uint8_t* src_rgba, uint8_t* dst_y, int width); void RGBAToYRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width); +void RGB24ToYRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_y, int width); +void RGB24ToYJRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_yj, int width); +void RAWToYRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_y, int width); +void RAWToYJRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_yj, int width); +void RGB24ToYJRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_yj, int width); +void RAWToYJRow_AVX2(const uint8_t* src_raw, uint8_t* dst_yj, int width); void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width); void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_yj, int width); void ABGRToYJRow_NEON(const uint8_t* src_abgr, uint8_t* dst_yj, int width); @@ -1844,43 +1847,6 @@ void ARGBToUVJ444Row_NEON_I8MM(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width); -void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width, - const struct ArgbConstants* c); -void ARGBToUVMatrixRow_Any_NEON(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width, - const struct ArgbConstants* c); -void ARGBToUVMatrixRow_NEON_I8MM(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width, - const struct ArgbConstants* c); -void ARGBToUVMatrixRow_Any_NEON_I8MM(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width, - const struct ArgbConstants* c); -void ARGBToUVMatrixRow_SVE2(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width, - const struct ArgbConstants* c); -void ARGBToUVMatrixRow_SME(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width, - const struct ArgbConstants* c); - void ARGBToUVRow_NEON(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, @@ -2131,6 +2097,10 @@ void ABGRToYRow_NEON_DotProd(const uint8_t* src_abgr, void RGBAToYRow_NEON_DotProd(const uint8_t* src_rgba, uint8_t* dst_y, int width); +void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width); +void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width); +void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width); +void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width); void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width); void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, uint8_t* dst_y, @@ -2141,19 +2111,31 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, void BGRAToYRow_RVV(const uint8_t* src_bgra, uint8_t* dst_y, int width); void ABGRToYRow_RVV(const uint8_t* src_abgr, uint8_t* dst_y, int width); void RGBAToYRow_RVV(const uint8_t* src_rgba, uint8_t* dst_y, int width); +void RGB24ToYRow_RVV(const uint8_t* src_rgb24, uint8_t* dst_y, int width); +void RGB24ToYJRow_RVV(const uint8_t* src_rgb24, uint8_t* dst_yj, int width); +void RAWToYRow_RVV(const uint8_t* src_raw, uint8_t* dst_y, int width); +void RAWToYJRow_RVV(const uint8_t* src_raw, uint8_t* dst_yj, int width); void BGRAToYRow_LSX(const uint8_t* src_bgra, uint8_t* dst_y, int width); void ABGRToYRow_LSX(const uint8_t* src_abgr, uint8_t* dst_y, int width); void RGBAToYRow_LSX(const uint8_t* src_rgba, uint8_t* dst_y, int width); void ARGB1555ToYRow_LSX(const uint8_t* src_argb1555, uint8_t* dst_y, int width); +void RGB24ToYJRow_LSX(const uint8_t* src_rgb24, uint8_t* dst_yj, int width); void ABGRToYRow_LASX(const uint8_t* src_abgr, uint8_t* dst_y, int width); void ARGB1555ToYRow_LASX(const uint8_t* src_argb1555, uint8_t* dst_y, int width); void RGB565ToYRow_LSX(const uint8_t* src_rgb565, uint8_t* dst_y, int width); void RGB565ToYRow_LASX(const uint8_t* src_rgb565, uint8_t* dst_y, int width); +void RGB24ToYRow_LSX(const uint8_t* src_rgb24, uint8_t* dst_y, int width); +void RGB24ToYRow_LASX(const uint8_t* src_rgb24, uint8_t* dst_y, int width); +void RAWToYRow_LSX(const uint8_t* src_raw, uint8_t* dst_y, int width); +void RAWToYRow_LASX(const uint8_t* src_raw, uint8_t* dst_y, int width); void RGBAToYRow_LASX(const uint8_t* src_rgba, uint8_t* dst_y, int width); void BGRAToYRow_LASX(const uint8_t* src_bgra, uint8_t* dst_y, int width); +void RGB24ToYJRow_LASX(const uint8_t* src_rgb24, uint8_t* dst_yj, int width); +void RAWToYJRow_LSX(const uint8_t* src_raw, uint8_t* dst_yj, int width); +void RAWToYJRow_LASX(const uint8_t* src_raw, uint8_t* dst_yj, int width); void ARGBToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); void ARGBToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); @@ -2215,42 +2197,6 @@ void ARGBToYMatrixRow_Any_AVX512BW(const uint8_t* src_argb, int width, const struct ArgbConstants* c); -void RGBToYMatrixRow_NEON(const uint8_t* src_rgb, - uint8_t* dst_y, - int width, - const struct ArgbConstants* c); - -void ARGBToYMatrixRow_NEON_DotProd(const uint8_t* src_argb, - uint8_t* dst_y, - int width, - const struct ArgbConstants* c); -void ARGBToYMatrixRow_LSX(const uint8_t* src_argb, - uint8_t* dst_y, - int width, - const struct ArgbConstants* c); -void ARGBToYMatrixRow_LASX(const uint8_t* src_argb, - uint8_t* dst_y, - int width, - const struct ArgbConstants* c); -void ARGBToYMatrixRow_RVV(const uint8_t* src_argb, - uint8_t* dst_y, - int width, - const struct ArgbConstants* c); - -void ARGBToYMatrixRow_Any_NEON_DotProd(const uint8_t* src_argb, - uint8_t* dst_y, - int width, - const struct ArgbConstants* c); -void ARGBToYMatrixRow_Any_LSX(const uint8_t* src_argb, - uint8_t* dst_y, - int width, - const struct ArgbConstants* c); -void ARGBToYMatrixRow_Any_LASX(const uint8_t* src_argb, - uint8_t* dst_y, - int width, - const struct ArgbConstants* c); - - void ARGBToUV444MatrixRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, @@ -2305,6 +2251,10 @@ void RGBAToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); void BGRAToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); void ABGRToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); void RGBAToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); +void RGB24ToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); +void RGB24ToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); +void RAWToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); +void RAWToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); void RGB565ToYRow_C(const uint8_t* src_rgb565, uint8_t* dst_y, int width); void ARGB1555ToYRow_C(const uint8_t* src_argb1555, uint8_t* dst_y, int width); void ARGB4444ToYRow_C(const uint8_t* src_argb4444, uint8_t* dst_y, int width); @@ -2324,6 +2274,14 @@ void RGBAToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void BGRAToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ABGRToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGBAToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGB24ToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGB24ToYJRow_Any_SSSE3(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void RAWToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RAWToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGB24ToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RAWToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToYRow_Any_NEON_DotProd(const uint8_t* src_ptr, uint8_t* dst_ptr, @@ -2352,6 +2310,10 @@ void RGBAToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGBAToYRow_Any_NEON_DotProd(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGB24ToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGB24ToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RAWToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RAWToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGB565ToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGB1555ToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, @@ -2365,21 +2327,29 @@ void ABGRToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGBAToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToYJRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGB24ToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGB565ToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ABGRToYJRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RAWToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGBAToYJRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGB24ToYJRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RAWToYJRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGB1555ToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGB565ToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGB24ToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToYJRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ABGRToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ABGRToYJRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RAWToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGBAToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGBAToYJRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void BGRAToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGB24ToYJRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RAWToYJRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGB1555ToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -4029,7 +3999,6 @@ void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24, int width); void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width); void RAWToARGBRow_AVX2(const uint8_t* src_raw, uint8_t* dst_argb, int width); -void RAWToARGBRow_AVX512BW(const uint8_t* src_raw, uint8_t* dst_argb, int width); void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width); void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgb24, int width); void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width); @@ -4121,9 +4090,6 @@ void RAWToARGBRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RAWToARGBRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RAWToARGBRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RGB24ToARGBRow_AVX512BW(const uint8_t* src_rgb24, uint8_t* dst_argb, int width); -void RGB24ToARGBRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RAWToRGBARow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); diff --git a/include/libyuv/row_sve.h b/include/libyuv/row_sve.h index f7e2123a7..e47b9fe5e 100644 --- a/include/libyuv/row_sve.h +++ b/include/libyuv/row_sve.h @@ -2019,7 +2019,7 @@ static const int8_t kABGRToUVJCoefficients[] = { 43, 85, -128, 0, -128, 107, 21, 0, }; -#define ARGBTOUVMATRIX_SVE \ +#define ABCDTOUVMATRIX_SVE \ "ld1d {z0.d}, p1/z, [%[src0]] \n" /* ABCD(bgra) */ \ "ld1d {z1.d}, p2/z, [%[src0], #1, mul vl] \n" /* EFGH(bgra) */ \ "ld1d {z2.d}, p3/z, [%[src0], #2, mul vl] \n" /* IJKL(bgra) */ \ @@ -2113,7 +2113,7 @@ static inline void ARGBToUVMatrixRow_SVE_SC(const uint8_t* src_argb, "ptrue p4.d \n" "ptrue p5.h \n" "1: \n" // - ARGBTOUVMATRIX_SVE + ABCDTOUVMATRIX_SVE "b.gt 1b \n" "2: \n" @@ -2126,7 +2126,7 @@ static inline void ARGBToUVMatrixRow_SVE_SC(const uint8_t* src_argb, "whilelt p3.d, %w[vl2], %w[width] \n" "whilelt p4.d, %w[vl3], %w[width] \n" "whilelt p5.h, wzr, %w[width] \n" // - ARGBTOUVMATRIX_SVE + ABCDTOUVMATRIX_SVE "b.gt 3b \n" "99: \n" diff --git a/include/libyuv/version.h b/include/libyuv/version.h index b745710eb..06231806f 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1934 +#define LIBYUV_VERSION 1928 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/psnr.o b/psnr.o new file mode 100644 index 0000000000000000000000000000000000000000..bb3fe2adc5cc430318c0d3cd74a15990a660920b GIT binary patch literal 2560 zcmbtV-)|d55T3K0CM|8x282MZgoP-jK->fs)bjJ%bm<}`1rk}b2%278o1)fsIXjX; z2`yM4PF5uz5cCoLg1+^k2ajY=yLRf0OQfOS-d7H+{1%R>@fxU7Bv~ zt%R=e(L&ednsG zQT@}xI?Gt6;ii}5`GHS--khOF}p5K#LvzDcC zTKlu&UlZNPij66(J6HCMXS%7__Gxt3jJK!9axquPN>nz;H(vB4jbVai|--PnW zxT@EbwF%lUF7!v~*7tLI+OEzLD3qYuG?~7=mJU%{-~R3U!P+oiSAObM?#Y?Ay|y%; z@NPXM-358$k80D98y6T}Xq!j8>)fwo+f0ZS4in9`8CSyqwWe(zy@J4&KtG0hLUa)j z-JH2ww46;!*W|l+$3!b9&5t}B3SKM7!MToJxqg+9H92mPn)Gg4*9Px%U+#Q$AaFfu z!J)CS=aP>f%+9=L7m}mTq@GTV4DZ>cHGunrKrlX>1fvo+wrw69`T{TJcEsr+IYGnz zt?|zyV_P;vrZFP$y+An12wi0MN1rq(fVgjy6J?$T_D3p;|I5#jSQNStMb~D0Mkxk` zbE=wX#}pb2RL9g9zdMxs5sj&+=#ViGUx~%yE17|z6#%q>06=87LC~FuITfL3|29ya z82sh;^GEx>!B`U1`LCmFL<<+PhG7epJb+`QClBv?C*t=a&bI0?VlQg^OWgp~Tf|<) zCyviua;8*pQke`)zcs%1)P!Amed_QeOwHz}-k-IdtZQKFF8t8Jx>bFrQ@2$)2`TtTiN1770YD1e1D~iOcu`Beq0}jtNA=JpKTkJe)T`^Q-2f2 zGN0>H@4~ey_cyx!G8riJx1~EC=F|5#e0LCnioU> 1; +#if defined(HAS_RAWTOYJROW) + void (*RAWToUVJRow)(const uint8_t* src_raw, int src_stride_raw, + uint8_t* dst_uj, uint8_t* dst_vj, int width) = + RAWToUVJRow_C; + void (*RAWToYJRow)(const uint8_t* src_raw, uint8_t* dst_y, int width) = + RAWToYJRow_C; +#else void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) = RAWToARGBRow_C; - void (*ARGBToUVMatrixRow)(const uint8_t* src_argb0, int src_stride_argb, - uint8_t* dst_u, uint8_t* dst_v, int width, - const struct ArgbConstants* c) = - ARGBToUVMatrixRow_C; - void (*ARGBToYMatrixRow)(const uint8_t* src_argb, uint8_t* dst_y, int width, - const struct ArgbConstants* c) = ARGBToYMatrixRow_C; + void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_uj, uint8_t* dst_vj, int width) = + ARGBToUVJRow_C; + void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = + ARGBToYJRow_C; +#endif void (*MergeUVRow)(const uint8_t* src_uj, const uint8_t* src_vj, uint8_t* dst_vu, int width) = MergeUVRow_C; -#if defined(HAS_ARGBTOYMATRIXROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOYMATRIXROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBTOYMATRIXROW_AVX512BW) - if (TestCpuFlag(kCpuHasAVX512BW)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_AVX512BW; - if (IS_ALIGNED(width, 64)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_AVX512BW; - } - } -#endif -#if defined(HAS_ARGBTOYMATRIXROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_NEON; - } - } -#endif -#if defined(HAS_ARGBTOYMATRIXROW_NEON_DOTPROD) - if (TestCpuFlag(kCpuHasNeonDotProd)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON_DotProd; - if (IS_ALIGNED(width, 16)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_NEON_DotProd; - } - } -#endif -#if defined(HAS_ARGBTOYMATRIXROW_LSX) - if (TestCpuFlag(kCpuHasLSX)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_LSX; - if (IS_ALIGNED(width, 16)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_LSX; - } - } -#endif -#if defined(HAS_ARGBTOYMATRIXROW_LASX) - if (TestCpuFlag(kCpuHasLASX)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_Any_LASX; - if (IS_ALIGNED(width, 32)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_LASX; - } - } -#endif -#if defined(HAS_ARGBTOYMATRIXROW_RVV) - if (TestCpuFlag(kCpuHasRVV)) { - ARGBToYMatrixRow = ARGBToYMatrixRow_RVV; - } -#endif - - - if (!src_raw || !dst_y || !dst_vu || !argbconstants || width <= 0 || height == 0) { + if (!src_raw || !dst_y || !dst_vu || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -4213,6 +3944,44 @@ int RAWToNV21Matrix(const uint8_t* src_raw, src_stride_raw = -src_stride_raw; } +#if defined(HAS_RAWTOYJROW) + +// Neon version does direct RAW to YUV. +#if defined(HAS_RAWTOYJROW_NEON) && defined(HAS_RAWTOUVJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RAWToUVJRow = RAWToUVJRow_Any_NEON; + RAWToYJRow = RAWToYJRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + RAWToYJRow = RAWToYJRow_NEON; + RAWToUVJRow = RAWToUVJRow_NEON; + } + } +#endif +#if defined(HAS_RAWTOYJROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + RAWToYJRow = RAWToYJRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + RAWToYJRow = RAWToYJRow_LSX; + } + } +#endif +#if defined(HAS_RAWTOYJROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + RAWToYJRow = RAWToYJRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + RAWToYJRow = RAWToYJRow_LASX; + } + } +#endif +#if defined(HAS_RAWTOYJROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + RAWToYJRow = RAWToYJRow_RVV; + } +#endif + +// Other platforms do intermediate conversion from RAW to ARGB. +#else // HAS_RAWTOYJROW + #if defined(HAS_RAWTOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { RAWToARGBRow = RAWToARGBRow_Any_SSSE3; @@ -4229,99 +3998,47 @@ int RAWToNV21Matrix(const uint8_t* src_raw, } } #endif -#if defined(HAS_RAWTOARGBROW_AVX512BW) - if (TestCpuFlag(kCpuHasAVX512BW)) { - RAWToARGBRow = RAWToARGBRow_Any_AVX512BW; - if (IS_ALIGNED(width, 64)) { - RAWToARGBRow = RAWToARGBRow_AVX512BW; - } - } -#endif -#if defined(HAS_RAWTOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - RAWToARGBRow = RAWToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - RAWToARGBRow = RAWToARGBRow_NEON; - } - } -#endif -#if defined(HAS_RAWTOARGBROW_SVE2) - if (TestCpuFlag(kCpuHasSVE2)) { - RAWToARGBRow = RAWToARGBRow_SVE2; - } -#endif -#if defined(HAS_RAWTOARGBROW_LSX) - if (TestCpuFlag(kCpuHasLSX)) { - RAWToARGBRow = RAWToARGBRow_Any_LSX; - if (IS_ALIGNED(width, 16)) { - RAWToARGBRow = RAWToARGBRow_LSX; - } - } -#endif -#if defined(HAS_RAWTOARGBROW_LASX) - if (TestCpuFlag(kCpuHasLASX)) { - RAWToARGBRow = RAWToARGBRow_Any_LASX; - if (IS_ALIGNED(width, 32)) { - RAWToARGBRow = RAWToARGBRow_LASX; - } - } -#endif -#if defined(HAS_RAWTOARGBROW_RVV) - if (TestCpuFlag(kCpuHasRVV)) { - RAWToARGBRow = RAWToARGBRow_RVV; - } -#endif -#if defined(HAS_ARGBTOUVMATRIXROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON; - } - } -#endif -#if defined(HAS_ARGBTOUVMATRIXROW_NEON_I8MM) - if (TestCpuFlag(kCpuHasNEON) && TestCpuFlag(kCpuHasNeonI8MM)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM; - if (IS_ALIGNED(width, 16)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON_I8MM; - } - } -#endif -#if defined(HAS_ARGBTOUVMATRIXROW_SVE2) - if (TestCpuFlag(kCpuHasSVE2)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2; - } -#endif -#if defined(HAS_ARGBTOUVMATRIXROW_SME) - if (TestCpuFlag(kCpuHasSME)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME; - } -#endif -#if defined(HAS_ARGBTOUVMATRIXROW_SSSE3) +#if defined(HAS_ARGBTOYJROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOUVMATRIXROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_AVX2; + ARGBToYJRow = ARGBToYJRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_AVX2; + ARGBToYJRow = ARGBToYJRow_SSSE3; } } #endif -#if defined(HAS_ARGBTOUVMATRIXROW_AVX512BW) - if (TestCpuFlag(kCpuHasAVX512BW)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_AVX512BW; +#if defined(HAS_ARGBTOYJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYJRow = ARGBToYJRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_AVX512BW; + ARGBToYJRow = ARGBToYJRow_AVX2; } } #endif - +#if defined(HAS_ARGBTOUVJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVJRow = ARGBToUVJRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOUVJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVJRow = ARGBToUVJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVJRow = ARGBToUVJRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOUVJROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + ARGBToUVJRow = ARGBToUVJRow_Any_AVX512BW; + if (IS_ALIGNED(width, 64)) { + ARGBToUVJRow = ARGBToUVJRow_AVX512BW; + } + } +#endif +#endif // HAS_RAWTOYJROW #if defined(HAS_MERGEUVROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { MergeUVRow = MergeUVRow_Any_SSE2; @@ -4333,7 +4050,7 @@ int RAWToNV21Matrix(const uint8_t* src_raw, #if defined(HAS_MERGEUVROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { MergeUVRow = MergeUVRow_Any_AVX2; - if (IS_ALIGNED(halfwidth, 32)) { + if (IS_ALIGNED(halfwidth, 16)) { MergeUVRow = MergeUVRow_AVX2; } } @@ -4372,86 +4089,58 @@ int RAWToNV21Matrix(const uint8_t* src_raw, MergeUVRow = MergeUVRow_RVV; } #endif - { - // Allocate 2 rows of ARGB. - const int row_size = (width * 4 + 31) & ~31; - align_buffer_64(row, row_size * 2); - // Allocate 1 row of U and 1 row of V. - align_buffer_64(row_u, halfwidth); - align_buffer_64(row_v, halfwidth); - - if (!row || !row_u || !row_v) { - free_aligned_buffer_64(row); - free_aligned_buffer_64(row_u); - free_aligned_buffer_64(row_v); +#if defined(HAS_RAWTOYJROW) + // Allocate a row of uv. + const int row_uv_size = ((halfwidth + 31) & ~31); + align_buffer_64(row_uj, row_uv_size * 2); + uint8_t* row_vj = row_uj + row_uv_size; +#else + // Allocate row of uv and 2 rows of ARGB. + const int row_size = ((width * 4 + 31) & ~31); + const int row_uv_size = ((halfwidth + 31) & ~31); + align_buffer_64(row_uj, row_uv_size * 2 + row_size * 2); + uint8_t* row_vj = row_uj + row_uv_size; + uint8_t* row = row_vj + row_uv_size; +#endif + if (!row_uj) return 1; - } for (y = 0; y < height - 1; y += 2) { +#if defined(HAS_RAWTOYJROW) + RAWToUVJRow(src_raw, src_stride_raw, row_uj, row_vj, width); + MergeUVRow(row_vj, row_uj, dst_vu, halfwidth); + RAWToYJRow(src_raw, dst_y, width); + RAWToYJRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width); +#else RAWToARGBRow(src_raw, row, width); RAWToARGBRow(src_raw + src_stride_raw, row + row_size, width); - ARGBToUVMatrixRow(row, row_size, row_u, row_v, width, argbconstants); - MergeUVRow(row_v, row_u, dst_vu, halfwidth); - ARGBToYMatrixRow(row, dst_y, width, argbconstants); - ARGBToYMatrixRow(row + row_size, dst_y + dst_stride_y, width, argbconstants); + ARGBToUVJRow(row, row_size, row_uj, row_vj, width); + MergeUVRow(row_vj, row_uj, dst_vu, halfwidth); + ARGBToYJRow(row, dst_y, width); + ARGBToYJRow(row + row_size, dst_y + dst_stride_y, width); +#endif src_raw += src_stride_raw * 2; dst_y += dst_stride_y * 2; dst_vu += dst_stride_vu; } if (height & 1) { +#if defined(HAS_RAWTOYJROW) + RAWToUVJRow(src_raw, 0, row_uj, row_vj, width); + MergeUVRow(row_vj, row_uj, dst_vu, halfwidth); + RAWToYJRow(src_raw, dst_y, width); +#else RAWToARGBRow(src_raw, row, width); - ARGBToUVMatrixRow(row, 0, row_u, row_v, width, argbconstants); - MergeUVRow(row_v, row_u, dst_vu, halfwidth); - ARGBToYMatrixRow(row, dst_y, width, argbconstants); + ARGBToUVJRow(row, 0, row_uj, row_vj, width); + MergeUVRow(row_vj, row_uj, dst_vu, halfwidth); + ARGBToYJRow(row, dst_y, width); +#endif } - free_aligned_buffer_64(row_v); - free_aligned_buffer_64(row_u); - free_aligned_buffer_64(row); + free_aligned_buffer_64(row_uj); } return 0; } - -LIBYUV_API -int RAWToJNV21(const uint8_t* src_raw, - int src_stride_raw, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_vu, - int dst_stride_vu, - int width, - int height) { - return RAWToNV21Matrix(src_raw, src_stride_raw, dst_y, dst_stride_y, dst_vu, - dst_stride_vu, &kArgbJPEGConstants, width, height); -} - -LIBYUV_API -int RAWToNV21(const uint8_t* src_raw, - int src_stride_raw, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_vu, - int dst_stride_vu, - int width, - int height) { - return RAWToNV21Matrix(src_raw, src_stride_raw, dst_y, dst_stride_y, dst_vu, - dst_stride_vu, &kArgbI601Constants, width, height); -} - -LIBYUV_API -int RGB24ToNV12(const uint8_t* src_rgb24, - int src_stride_rgb24, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_uv, - int dst_stride_uv, - int width, - int height) { - return RAWToNV21Matrix(src_rgb24, src_stride_rgb24, dst_y, dst_stride_y, - dst_uv, dst_stride_uv, &kAbgrI601Constants, width, - height); -} - +#undef HAS_RAWTOYJROW #ifdef __cplusplus } // extern "C" diff --git a/source/planar_functions.cc b/source/planar_functions.cc index fde3717a4..96cac25f3 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -693,7 +693,7 @@ void MergeUVPlane(const uint8_t* src_u, #if defined(HAS_MERGEUVROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { MergeUVRow = MergeUVRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { + if (IS_ALIGNED(width, 16)) { MergeUVRow = MergeUVRow_AVX2; } } diff --git a/source/rotate_gcc.cc b/source/rotate_gcc.cc index 9847ecd48..ae7436b12 100644 --- a/source/rotate_gcc.cc +++ b/source/rotate_gcc.cc @@ -101,11 +101,11 @@ void TransposeWx8_SSSE3(const uint8_t* src, "movq %%xmm7,(%1,%4) \n" "lea (%1,%4,2),%1 \n" "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "r"((ptrdiff_t)(src_stride)), // %3 - "r"((ptrdiff_t)(dst_stride)) // %4 + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)) // %4 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } @@ -243,11 +243,11 @@ void TransposeWx8_Fast_SSSE3(const uint8_t* src, "movq %%xmm15,(%1,%4) \n" "lea (%1,%4,2),%1 \n" "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "r"((ptrdiff_t)(src_stride)), // %3 - "r"((ptrdiff_t)(dst_stride)) // %4 + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)) // %4 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15"); @@ -356,13 +356,13 @@ void TransposeUVWx8_SSE2(const uint8_t* src, "movhpd %%xmm8,(%2,%6) \n" "lea (%2,%6,2),%2 \n" "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst_a), // %1 - "+r"(dst_b), // %2 - "+r"(width) // %3 - : "r"((ptrdiff_t)(src_stride)), // %4 - "r"((ptrdiff_t)(dst_stride_a)), // %5 - "r"((ptrdiff_t)(dst_stride_b)) // %6 + : "+r"(src), // %0 + "+r"(dst_a), // %1 + "+r"(dst_b), // %2 + "+r"(width) // %3 + : "r"((intptr_t)(src_stride)), // %4 + "r"((intptr_t)(dst_stride_a)), // %5 + "r"((intptr_t)(dst_stride_b)) // %6 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9"); } diff --git a/source/row_any.cc b/source/row_any.cc index 82a4abe8d..8ac48d3c0 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -616,7 +616,7 @@ ANY31PT(MergeXRGB16To8Row_Any_NEON, ANY21(MergeUVRow_Any_SSE2, MergeUVRow_SSE2, 0, 1, 1, 2, 15) #endif #ifdef HAS_MERGEUVROW_AVX2 -ANY21(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, 0, 1, 1, 2, 31) +ANY21(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, 0, 1, 1, 2, 15) #endif #ifdef HAS_MERGEUVROW_AVX512BW ANY21(MergeUVRow_Any_AVX512BW, MergeUVRow_AVX512BW, 0, 1, 1, 2, 31) @@ -1000,12 +1000,6 @@ ANY11(ARGB4444ToARGBRow_Any_SSE2, ARGB4444ToARGBRow_SSE2, 0, 2, 4, 7) #if defined(HAS_RAWTOARGBROW_AVX2) ANY11(RAWToARGBRow_Any_AVX2, RAWToARGBRow_AVX2, 0, 3, 4, 31) #endif -#if defined(HAS_RAWTOARGBROW_AVX512BW) -ANY11(RAWToARGBRow_Any_AVX512BW, RAWToARGBRow_AVX512BW, 0, 3, 4, 63) -#endif -#if defined(HAS_RGB24TOARGBROW_AVX512BW) -ANY11(RGB24ToARGBRow_Any_AVX512BW, RGB24ToARGBRow_AVX512BW, 0, 3, 4, 63) -#endif #if defined(HAS_RAWTORGBAROW_SSSE3) ANY11(RAWToRGBARow_Any_SSSE3, RAWToRGBARow_SSSE3, 0, 3, 4, 15) #endif @@ -1206,36 +1200,52 @@ ANY11(RGBAToYRow_Any_LSX, RGBAToYRow_LSX, 0, 4, 1, 15) ANY11(RGBAToYRow_Any_LASX, RGBAToYRow_LASX, 0, 4, 1, 31) #endif #ifdef HAS_RGB24TOYROW_NEON +ANY11(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 0, 3, 1, 15) #endif #ifdef HAS_RGB24TOYJROW_AVX2 +ANY11(RGB24ToYJRow_Any_AVX2, RGB24ToYJRow_AVX2, 0, 3, 1, 31) #endif #ifdef HAS_RGB24TOYJROW_SSSE3 +ANY11(RGB24ToYJRow_Any_SSSE3, RGB24ToYJRow_SSSE3, 0, 3, 1, 15) #endif #ifdef HAS_RGB24TOYJROW_NEON +ANY11(RGB24ToYJRow_Any_NEON, RGB24ToYJRow_NEON, 0, 3, 1, 15) #endif #ifdef HAS_RGB24TOYROW_LSX +ANY11(RGB24ToYRow_Any_LSX, RGB24ToYRow_LSX, 0, 3, 1, 15) #endif #ifdef HAS_RGB24TOYJROW_LSX +ANY11(RGB24ToYJRow_Any_LSX, RGB24ToYJRow_LSX, 0, 3, 1, 15) #endif #ifdef HAS_RGB24TOYJROW_LASX +ANY11(RGB24ToYJRow_Any_LASX, RGB24ToYJRow_LASX, 0, 3, 1, 31) #endif #ifdef HAS_RGB24TOYROW_LASX +ANY11(RGB24ToYRow_Any_LASX, RGB24ToYRow_LASX, 0, 3, 1, 31) #endif #ifdef HAS_RAWTOYROW_NEON +ANY11(RAWToYRow_Any_NEON, RAWToYRow_NEON, 0, 3, 1, 15) #endif #ifdef HAS_RAWTOYJROW_AVX2 +ANY11(RAWToYJRow_Any_AVX2, RAWToYJRow_AVX2, 0, 3, 1, 31) #endif #ifdef HAS_RAWTOYJROW_SSSE3 +ANY11(RAWToYJRow_Any_SSSE3, RAWToYJRow_SSSE3, 0, 3, 1, 15) #endif #ifdef HAS_RAWTOYJROW_NEON +ANY11(RAWToYJRow_Any_NEON, RAWToYJRow_NEON, 0, 3, 1, 15) #endif #ifdef HAS_RAWTOYROW_LSX +ANY11(RAWToYRow_Any_LSX, RAWToYRow_LSX, 0, 3, 1, 15) #endif #ifdef HAS_RAWTOYROW_LASX +ANY11(RAWToYRow_Any_LASX, RAWToYRow_LASX, 0, 3, 1, 31) #endif #ifdef HAS_RAWTOYJROW_LSX +ANY11(RAWToYJRow_Any_LSX, RAWToYJRow_LSX, 0, 3, 1, 15) #endif #ifdef HAS_RAWTOYJROW_LASX +ANY11(RAWToYJRow_Any_LASX, RAWToYJRow_LASX, 0, 3, 1, 31) #endif #ifdef HAS_RGB565TOYROW_NEON ANY11(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 0, 2, 1, 15) @@ -2264,12 +2274,6 @@ ANY14(SplitARGBRow_Any_NEON, SplitARGBRow_NEON, 4, 15) memcpy(dst_v + (np >> 1), vout + 256, SS(r, 1)); \ } -#ifdef HAS_ARGBTOUVMATRIXROW_NEON -ANY12MS(ARGBToUVMatrixRow_Any_NEON, ARGBToUVMatrixRow_NEON, 0, 4, 15) -#endif -#ifdef HAS_ARGBTOUVMATRIXROW_NEON_I8MM -ANY12MS(ARGBToUVMatrixRow_Any_NEON_I8MM, ARGBToUVMatrixRow_NEON_I8MM, 0, 4, 15) -#endif #ifdef HAS_ARGBTOUVMATRIXROW_AVX2 ANY12MS(ARGBToUVMatrixRow_Any_AVX2, ARGBToUVMatrixRow_AVX2, 0, 4, 15) #endif @@ -2320,18 +2324,6 @@ ANY11MC(ARGBToYMatrixRow_Any_AVX512BW, ARGBToYMatrixRow_AVX512BW, 4, 63) #ifdef HAS_ARGBTOYMATRIXROW_NEON ANY11MC(ARGBToYMatrixRow_Any_NEON, ARGBToYMatrixRow_NEON, 4, 15) #endif -#ifdef HAS_ARGBTOYMATRIXROW_NEON_DOTPROD -ANY11MC(ARGBToYMatrixRow_Any_NEON_DotProd, ARGBToYMatrixRow_NEON_DotProd, 4, 15) -#endif -#ifdef HAS_ARGBTOYMATRIXROW_LSX -ANY11MC(ARGBToYMatrixRow_Any_LSX, ARGBToYMatrixRow_LSX, 4, 15) -#endif -#ifdef HAS_ARGBTOYMATRIXROW_LASX -ANY11MC(ARGBToYMatrixRow_Any_LASX, ARGBToYMatrixRow_LASX, 4, 31) -#endif -#ifdef HAS_ARGBTOYMATRIXROW_RVV -ANY11MC(ARGBToYMatrixRow_Any_RVV, ARGBToYMatrixRow_RVV, 4, 15) -#endif #undef ANY11MC #ifdef HAS_ARGBTOUVROW_AVX2 diff --git a/source/row_common.cc b/source/row_common.cc index b2a0ec12b..8b192a539 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -678,6 +678,8 @@ MAKEROWY(ARGB, 2, 1, 0, 4) MAKEROWY(BGRA, 1, 2, 3, 4) MAKEROWY(ABGR, 0, 1, 2, 4) MAKEROWY(RGBA, 3, 2, 1, 4) +MAKEROWY(RGB24, 2, 1, 0, 3) +MAKEROWY(RAW, 0, 1, 2, 3) #undef MAKEROWY // JPeg uses BT.601-1 full range @@ -751,6 +753,8 @@ static __inline uint8_t RGBToVJ(uint8_t r, uint8_t g, uint8_t b) { MAKEROWYJ(ARGB, 2, 1, 0, 4) MAKEROWYJ(ABGR, 0, 1, 2, 4) MAKEROWYJ(RGBA, 3, 2, 1, 4) +MAKEROWYJ(RGB24, 2, 1, 0, 3) +MAKEROWYJ(RAW, 0, 1, 2, 3) #undef MAKEROWYJ static __inline uint8_t RGBToYMatrix(uint8_t r, @@ -4375,21 +4379,69 @@ void NV12ToRGB565Row_AVX2(const uint8_t* src_y, #ifdef HAS_RGB24TOYJROW_AVX2 // Convert 16 RGB24 pixels (64 bytes) to 16 YJ values. +void RGB24ToYJRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + RGB24ToARGBRow_SSSE3(src_rgb24, row, twidth); + ARGBToYJRow_AVX2(row, dst_yj, twidth); + src_rgb24 += twidth * 3; + dst_yj += twidth; + width -= twidth; + } } #endif // HAS_RGB24TOYJROW_AVX2 #ifdef HAS_RAWTOYJROW_AVX2 // Convert 32 RAW pixels (128 bytes) to 32 YJ values. +void RAWToYJRow_AVX2(const uint8_t* src_raw, uint8_t* dst_yj, int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; +#ifdef HAS_RAWTOARGBROW_AVX2 + RAWToARGBRow_AVX2(src_raw, row, twidth); +#else + RAWToARGBRow_SSSE3(src_raw, row, twidth); +#endif + ARGBToYJRow_AVX2(row, dst_yj, twidth); + src_raw += twidth * 3; + dst_yj += twidth; + width -= twidth; + } } #endif // HAS_RAWTOYJROW_AVX2 #ifdef HAS_RGB24TOYJROW_SSSE3 // Convert 16 RGB24 pixels (64 bytes) to 16 YJ values. +void RGB24ToYJRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + RGB24ToARGBRow_SSSE3(src_rgb24, row, twidth); + ARGBToYJRow_SSSE3(row, dst_yj, twidth); + src_rgb24 += twidth * 3; + dst_yj += twidth; + width -= twidth; + } } #endif // HAS_RGB24TOYJROW_SSSE3 #ifdef HAS_RAWTOYJROW_SSSE3 // Convert 16 RAW pixels (64 bytes) to 16 YJ values. +void RAWToYJRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_yj, int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + RAWToARGBRow_SSSE3(src_raw, row, twidth); + ARGBToYJRow_SSSE3(row, dst_yj, twidth); + src_raw += twidth * 3; + dst_yj += twidth; + width -= twidth; + } } #endif // HAS_RAWTOYJROW_SSSE3 diff --git a/source/row_gcc.cc b/source/row_gcc.cc index 0da6e2ada..9ed7fce9c 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -262,64 +262,6 @@ void RAWToARGBRow_AVX2(const uint8_t* src_raw, uint8_t* dst_argb, int width) { : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } -#ifdef HAS_RAWTOARGBROW_AVX512BW -static const uint32_t kPermdRAWToARGB_AVX512BW[16] = { - 0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 9, 10, 11, 12}; - -void RGBToARGBRow_AVX512BW(const uint8_t* src_raw, uint8_t* dst_argb, const uint32_t* shuffler, int width) { - asm volatile( - "vpternlogd $0xff,%%zmm6,%%zmm6,%%zmm6 \n" // 0xffffffff - "vpslld $0x18,%%zmm6,%%zmm6 \n" // 0xff000000 - "movabs $0xffffffffffff,%%rax \n" // 48 bytes mask - "kmovq %%rax,%%k1 \n" - "vmovdqu32 %3,%%zmm5 \n" - "vbroadcasti32x4 %4,%%zmm4 \n" - - LABELALIGN // - "1: \n" - "vmovdqu8 (%0),%%zmm0%{%%k1%}%{z%} \n" - "vmovdqu8 48(%0),%%zmm1%{%%k1%}%{z%} \n" - "vmovdqu8 96(%0),%%zmm2%{%%k1%}%{z%} \n" - "vmovdqu8 144(%0),%%zmm3%{%%k1%}%{z%} \n" - "lea 192(%0),%0 \n" - "vpermd %%zmm0,%%zmm5,%%zmm0 \n" - "vpermd %%zmm1,%%zmm5,%%zmm1 \n" - "vpermd %%zmm2,%%zmm5,%%zmm2 \n" - "vpermd %%zmm3,%%zmm5,%%zmm3 \n" - "vpshufb %%zmm4,%%zmm0,%%zmm0 \n" - "vpshufb %%zmm4,%%zmm1,%%zmm1 \n" - "vpshufb %%zmm4,%%zmm2,%%zmm2 \n" - "vpshufb %%zmm4,%%zmm3,%%zmm3 \n" - "vpord %%zmm6,%%zmm0,%%zmm0 \n" - "vpord %%zmm6,%%zmm1,%%zmm1 \n" - "vpord %%zmm6,%%zmm2,%%zmm2 \n" - "vpord %%zmm6,%%zmm3,%%zmm3 \n" - "vmovdqu32 %%zmm0,(%1) \n" - "vmovdqu32 %%zmm1,0x40(%1) \n" - "vmovdqu32 %%zmm2,0x80(%1) \n" - "vmovdqu32 %%zmm3,0xc0(%1) \n" - "lea 0x100(%1),%1 \n" - "sub $0x40,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_raw), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kPermdRAWToARGB_AVX512BW), // %3 - "m"(*shuffler) // %4 - : "memory", "cc", "rax", "k1", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6"); -} - -void RAWToARGBRow_AVX512BW(const uint8_t* src_raw, uint8_t* dst_argb, int width) { - RGBToARGBRow_AVX512BW(src_raw, dst_argb, (const uint32_t*)&kShuffleMaskRAWToARGB, width); -} - -void RGB24ToARGBRow_AVX512BW(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) { - RGBToARGBRow_AVX512BW(src_rgb24, dst_argb, (const uint32_t*)&kShuffleMaskRGB24ToARGB, width); -} -#endif - - // Same code as RAWToARGB with different shuffler and A in low bits void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { asm volatile( @@ -1913,9 +1855,9 @@ void ARGBToUVMatrixRow_SSSE3(const uint8_t* src_argb, #else "+rm"(width) // %3 #endif - : "r"((ptrdiff_t)(src_stride_argb)), // %4 - "r"(c), // %5 - "m"(kShuffleAARRGGBB) // %6 + : "r"((intptr_t)(src_stride_argb)), // %4 + "r"(c), // %5 + "m"(kShuffleAARRGGBB) // %6 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } @@ -1988,9 +1930,9 @@ void ARGBToUVMatrixRow_AVX2(const uint8_t* src_argb, #else "+rm"(width) // %3 #endif - : "r"((ptrdiff_t)(src_stride_argb)), // %4 - "r"(c), // %5 - "m"(kShuffleAARRGGBB) // %6 + : "r"((intptr_t)(src_stride_argb)), // %4 + "r"(c), // %5 + "m"(kShuffleAARRGGBB) // %6 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } @@ -2293,11 +2235,11 @@ void ARGBToUVMatrixRow_AVX512BW(const uint8_t* src_argb, #else "+rm"(width) // %3 #endif - : "r"((ptrdiff_t)(src_stride_argb)), // %4 - "r"(c), // %5 - "m"(kShuffleAARRGGBB), // %6 - "m"(kPermdARGBToY_AVX512BW), // %7 - "m"(kPermdARGBToUV_AVX512BW) // %8 + : "r"((intptr_t)(src_stride_argb)), // %4 + "r"(c), // %5 + "m"(kShuffleAARRGGBB), // %6 + "m"(kPermdARGBToY_AVX512BW), // %7 + "m"(kPermdARGBToUV_AVX512BW) // %8 : "memory", "cc", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm16", "zmm17", "zmm18", "zmm19"); } @@ -4649,7 +4591,7 @@ static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) { - ptrdiff_t temp_width = (ptrdiff_t)(width); + intptr_t temp_width = (intptr_t)(width); asm volatile("movdqa %3,%%xmm5 \n" LABELALIGN @@ -4670,7 +4612,7 @@ void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) { #ifdef HAS_MIRRORROW_AVX2 void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { - ptrdiff_t temp_width = (ptrdiff_t)(width); + intptr_t temp_width = (intptr_t)(width); asm volatile("vbroadcastf128 %3,%%ymm5 \n" LABELALIGN @@ -4697,7 +4639,7 @@ static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u, 6u, 7u, 4u, 5u, 2u, 3u, 0u, 1u}; void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width) { - ptrdiff_t temp_width = (ptrdiff_t)(width); + intptr_t temp_width = (intptr_t)(width); asm volatile("movdqa %3,%%xmm5 \n" LABELALIGN @@ -4718,7 +4660,7 @@ void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width) { #ifdef HAS_MIRRORUVROW_AVX2 void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width) { - ptrdiff_t temp_width = (ptrdiff_t)(width); + intptr_t temp_width = (intptr_t)(width); asm volatile("vbroadcastf128 %3,%%ymm5 \n" LABELALIGN @@ -4747,7 +4689,7 @@ void MirrorSplitUVRow_SSSE3(const uint8_t* src, uint8_t* dst_u, uint8_t* dst_v, int width) { - ptrdiff_t temp_width = (ptrdiff_t)(width); + intptr_t temp_width = (intptr_t)(width); asm volatile( "movdqa %4,%%xmm1 \n" "lea -0x10(%0,%3,2),%0 \n" @@ -4786,7 +4728,7 @@ static const uvec8 kShuffleMirrorRGB1 = { void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_rgb24, int width) { - ptrdiff_t temp_width = (ptrdiff_t)(width); + intptr_t temp_width = (intptr_t)(width); src_rgb24 += width * 3 - 48; asm volatile( "movdqa %3,%%xmm4 \n" @@ -4822,7 +4764,7 @@ void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24, #ifdef HAS_ARGBMIRRORROW_SSE2 void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { - ptrdiff_t temp_width = (ptrdiff_t)(width); + intptr_t temp_width = (intptr_t)(width); asm volatile("lea -0x10(%0,%2,4),%0 \n" LABELALIGN @@ -4846,7 +4788,7 @@ void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { // Shuffle table for reversing the bytes. static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { - ptrdiff_t temp_width = (ptrdiff_t)(width); + intptr_t temp_width = (intptr_t)(width); asm volatile("vmovdqu %3,%%ymm5 \n" LABELALIGN @@ -6867,10 +6809,10 @@ void YUY2ToNVUVRow_SSE2(const uint8_t* src_yuy2, "lea 0x10(%1),%1 \n" "sub $0x10,%2 \n" "jg 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_uv), // %1 - "+r"(width) // %2 - : "r"((ptrdiff_t)(stride_yuy2)) // %3 + : "+r"(src_yuy2), // %0 + "+r"(dst_uv), // %1 + "+r"(width) // %2 + : "r"((intptr_t)(stride_yuy2)) // %3 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); } @@ -6906,11 +6848,11 @@ void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2, "lea 0x8(%1),%1 \n" "sub $0x10,%3 \n" "jg 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : "r"((ptrdiff_t)(stride_yuy2)) // %4 + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"((intptr_t)(stride_yuy2)) // %4 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } @@ -7001,11 +6943,11 @@ void UYVYToUVRow_SSE2(const uint8_t* src_uyvy, "lea 0x8(%1),%1 \n" "sub $0x10,%3 \n" "jg 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : "r"((ptrdiff_t)(stride_uyvy)) // %4 + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"((intptr_t)(stride_uyvy)) // %4 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } @@ -7092,10 +7034,10 @@ void YUY2ToNVUVRow_AVX2(const uint8_t* src_yuy2, "sub $0x20,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_uv), // %1 - "+r"(width) // %2 - : "r"((ptrdiff_t)(stride_yuy2)) // %3 + : "+r"(src_yuy2), // %0 + "+r"(dst_uv), // %1 + "+r"(width) // %2 + : "r"((intptr_t)(stride_yuy2)) // %3 : "memory", "cc", "xmm0", "xmm1"); } @@ -7132,11 +7074,11 @@ void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2, "sub $0x20,%3 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : "r"((ptrdiff_t)(stride_yuy2)) // %4 + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"((intptr_t)(stride_yuy2)) // %4 : "memory", "cc", "xmm0", "xmm1", "xmm5"); } @@ -7232,11 +7174,11 @@ void UYVYToUVRow_AVX2(const uint8_t* src_uyvy, "sub $0x20,%3 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : "r"((ptrdiff_t)(stride_uyvy)) // %4 + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"((intptr_t)(stride_uyvy)) // %4 : "memory", "cc", "xmm0", "xmm1", "xmm5"); } @@ -8596,12 +8538,12 @@ void CumulativeSumToAverageRow_SSE2(const int32_t* topleft, "sub $0x1,%3 \n" "jge 10b \n" "19: \n" - : "+r"(topleft), // %0 - "+r"(botleft), // %1 - "+r"(dst), // %2 - "+rm"(count) // %3 - : "r"((ptrdiff_t)(width)), // %4 - "rm"(area) // %5 + : "+r"(topleft), // %0 + "+r"(botleft), // %1 + "+r"(dst), // %2 + "+rm"(count) // %3 + : "r"((intptr_t)(width)), // %4 + "rm"(area) // %5 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } #endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 @@ -8614,7 +8556,7 @@ void ARGBAffineRow_SSE2(const uint8_t* src_argb, uint8_t* dst_argb, const float* src_dudv, int width) { - ptrdiff_t src_argb_stride_temp = src_argb_stride; + intptr_t src_argb_stride_temp = src_argb_stride; intptr_t temp; asm volatile( "movq (%3),%%xmm2 \n" @@ -8766,11 +8708,11 @@ void InterpolateRow_SSSE3(uint8_t* dst_ptr, "jg 100b \n" "99: \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+rm"(width), // %2 - "+r"(source_y_fraction) // %3 - : "r"(src_stride) // %4 + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+rm"(width), // %2 + "+r"(source_y_fraction) // %3 + : "r"((intptr_t)(src_stride)) // %4 : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif // HAS_INTERPOLATEROW_SSSE3 @@ -8844,11 +8786,11 @@ void InterpolateRow_AVX2(uint8_t* dst_ptr, "99: \n" "vzeroupper \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(width), // %2 - "+r"(source_y_fraction) // %3 - : "r"(src_stride) // %4 + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(width), // %2 + "+r"(source_y_fraction) // %3 + : "r"((intptr_t)(src_stride)) // %4 : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm4", "xmm5"); } #endif // HAS_INTERPOLATEROW_AVX2 @@ -9678,12 +9620,12 @@ void HalfMergeUVRow_SSSE3(const uint8_t* src_u, "lea 0x10(%2),%2 \n" "sub $0x10,%3 \n" // 16 src pixels per loop "jg 1b \n" - : "+r"(src_u), // %0 - "+r"(src_v), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 - : "r"((ptrdiff_t)(src_stride_u)), // %4 - "r"((ptrdiff_t)(src_stride_v)) // %5 + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : "r"((intptr_t)(src_stride_u)), // %4 + "r"((intptr_t)(src_stride_v)) // %5 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } @@ -9724,12 +9666,12 @@ void HalfMergeUVRow_AVX2(const uint8_t* src_u, "sub $0x20,%3 \n" // 32 src pixels per loop "jg 1b \n" "vzeroupper \n" - : "+r"(src_u), // %0 - "+r"(src_v), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 - : "r"((ptrdiff_t)(src_stride_u)), // %4 - "r"((ptrdiff_t)(src_stride_v)) // %5 + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : "r"((intptr_t)(src_stride_u)), // %4 + "r"((intptr_t)(src_stride_v)) // %5 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } diff --git a/source/row_lasx.cc b/source/row_lasx.cc index 94cb44ed1..19deb9a8f 100644 --- a/source/row_lasx.cc +++ b/source/row_lasx.cc @@ -2013,24 +2013,24 @@ void NV21ToARGBRow_LASX(const uint8_t* src_y, } } -#ifndef ArgbConstants -struct ArgbConstants { +#ifndef RgbConstants +struct RgbConstants { uint8_t kRGBToY[4]; uint16_t kAddY; uint16_t pad; }; -#define ArgbConstants ArgbConstants +#define RgbConstants RgbConstants // RGB to JPeg coefficients // B * 0.1140 coefficient = 29 // G * 0.5870 coefficient = 150 // R * 0.2990 coefficient = 77 // Add 0.5 = 0x80 -static const struct ArgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, +static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, 128, 0}; -static const struct ArgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0}; +static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0}; // RGB to BT.601 coefficients // B * 0.1016 coefficient = 25 @@ -2038,20 +2038,20 @@ static const struct ArgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0} // R * 0.2578 coefficient = 66 // Add 16.5 = 0x1080 -static const struct ArgbConstants kRgb24I601Constants = {{25, 129, 66, 0}, +static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0}, 0x1080, 0}; -static const struct ArgbConstants kRawI601Constants = {{66, 129, 25, 0}, +static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0}, 0x1080, 0}; -#endif // ArgbConstants +#endif // RgbConstants // ARGB expects first 3 values to contain RGB and 4th value is ignored. -void ARGBToYMatrixRow_LASX(const uint8_t* src_argb, +static void ARGBToYMatrixRow_LASX(const uint8_t* src_argb, uint8_t* dst_y, int width, - const struct ArgbConstants* c) { + const struct RgbConstants* rgbconstants) { int32_t shuff[8] = {0, 4, 1, 5, 2, 6, 3, 7}; asm volatile( "xvldrepl.b $xr0, %3, 0 \n\t" // load rgbconstants @@ -2088,7 +2088,7 @@ void ARGBToYMatrixRow_LASX(const uint8_t* src_argb, : "+&r"(src_argb), // %0 "+&r"(dst_y), // %1 "+&r"(width) // %2 - : "r"(c), "r"(shuff) + : "r"(rgbconstants), "r"(shuff) : "memory"); } @@ -2113,7 +2113,7 @@ void ABGRToYJRow_LASX(const uint8_t* src_abgr, uint8_t* dst_yj, int width) { static void RGBAToYMatrixRow_LASX(const uint8_t* src_rgba, uint8_t* dst_y, int width, - const struct ArgbConstants* c) { + const struct RgbConstants* rgbconstants) { int32_t shuff[8] = {0, 4, 1, 5, 2, 6, 3, 7}; asm volatile( "xvldrepl.b $xr0, %3, 0 \n\t" // load rgbconstants @@ -2150,7 +2150,7 @@ static void RGBAToYMatrixRow_LASX(const uint8_t* src_rgba, : "+&r"(src_rgba), // %0 "+&r"(dst_y), // %1 "+&r"(width) // %2 - : "r"(c), "r"(shuff) + : "r"(rgbconstants), "r"(shuff) : "memory"); } @@ -2169,7 +2169,7 @@ void BGRAToYRow_LASX(const uint8_t* src_bgra, uint8_t* dst_y, int width) { static void RGBToYMatrixRow_LASX(const uint8_t* src_rgba, uint8_t* dst_y, int width, - const struct ArgbConstants* c) { + const struct RgbConstants* rgbconstants) { int8_t shuff[128] = { 0, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18, 20, 21, 23, 0, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18, 20, 21, 23, @@ -2219,14 +2219,26 @@ static void RGBToYMatrixRow_LASX(const uint8_t* src_rgba, : "+&r"(src_rgba), // %0 "+&r"(dst_y), // %1 "+&r"(width) // %2 - : "r"(c), // %3 + : "r"(rgbconstants), // %3 "r"(shuff) // %4 : "memory"); } +void RGB24ToYJRow_LASX(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { + RGBToYMatrixRow_LASX(src_rgb24, dst_yj, width, &kRgb24JPEGConstants); +} +void RAWToYJRow_LASX(const uint8_t* src_raw, uint8_t* dst_yj, int width) { + RGBToYMatrixRow_LASX(src_raw, dst_yj, width, &kRawJPEGConstants); +} +void RGB24ToYRow_LASX(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { + RGBToYMatrixRow_LASX(src_rgb24, dst_y, width, &kRgb24I601Constants); +} +void RAWToYRow_LASX(const uint8_t* src_raw, uint8_t* dst_y, int width) { + RGBToYMatrixRow_LASX(src_raw, dst_y, width, &kRawI601Constants); +} void ARGBToUVJRow_LASX(const uint8_t* src_argb, int src_stride_argb, diff --git a/source/row_lsx.cc b/source/row_lsx.cc index 41689578a..d3cc2b5d9 100644 --- a/source/row_lsx.cc +++ b/source/row_lsx.cc @@ -2798,24 +2798,24 @@ void HalfFloatRow_LSX(const uint16_t* src, } } -#ifndef ArgbConstants -struct ArgbConstants { +#ifndef RgbConstants +struct RgbConstants { uint8_t kRGBToY[4]; uint16_t kAddY; uint16_t pad; }; -#define ArgbConstants ArgbConstants +#define RgbConstants RgbConstants // RGB to JPeg coefficients // B * 0.1140 coefficient = 29 // G * 0.5870 coefficient = 150 // R * 0.2990 coefficient = 77 // Add 0.5 = 0x80 -static const struct ArgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, +static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, 128, 0}; -static const struct ArgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0}; +static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0}; // RGB to BT.601 coefficients // B * 0.1016 coefficient = 25 @@ -2823,20 +2823,20 @@ static const struct ArgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0} // R * 0.2578 coefficient = 66 // Add 16.5 = 0x1080 -static const struct ArgbConstants kRgb24I601Constants = {{25, 129, 66, 0}, +static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0}, 0x1080, 0}; -static const struct ArgbConstants kRawI601Constants = {{66, 129, 25, 0}, +static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0}, 0x1080, 0}; -#endif // ArgbConstants +#endif // RgbConstants // ARGB expects first 3 values to contain RGB and 4th value is ignored. -void ARGBToYMatrixRow_LSX(const uint8_t* src_argb, +static void ARGBToYMatrixRow_LSX(const uint8_t* src_argb, uint8_t* dst_y, int width, - const struct ArgbConstants* c) { + const struct RgbConstants* rgbconstants) { asm volatile( "vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants "vldrepl.b $vr1, %3, 1 \n\t" // load rgbconstants @@ -2870,7 +2870,7 @@ void ARGBToYMatrixRow_LSX(const uint8_t* src_argb, : "+&r"(src_argb), // %0 "+&r"(dst_y), // %1 "+&r"(width) // %2 - : "r"(c) + : "r"(rgbconstants) : "memory"); } @@ -2895,7 +2895,7 @@ void ABGRToYJRow_LSX(const uint8_t* src_abgr, uint8_t* dst_yj, int width) { static void RGBAToYMatrixRow_LSX(const uint8_t* src_rgba, uint8_t* dst_y, int width, - const struct ArgbConstants* c) { + const struct RgbConstants* rgbconstants) { asm volatile( "vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants "vldrepl.b $vr1, %3, 1 \n\t" // load rgbconstants @@ -2929,7 +2929,7 @@ static void RGBAToYMatrixRow_LSX(const uint8_t* src_rgba, : "+&r"(src_rgba), // %0 "+&r"(dst_y), // %1 "+&r"(width) // %2 - : "r"(c) + : "r"(rgbconstants) : "memory"); } @@ -2948,7 +2948,7 @@ void BGRAToYRow_LSX(const uint8_t* src_bgra, uint8_t* dst_y, int width) { static void RGBToYMatrixRow_LSX(const uint8_t* src_rgba, uint8_t* dst_y, int width, - const struct ArgbConstants* c) { + const struct RgbConstants* rgbconstants) { int8_t shuff[64] = {0, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18, 20, 21, 23, 24, 26, 27, 29, 30, 0, 1, 3, 4, 6, 7, 9, 10, 12, 13, 15, 1, 0, 4, 0, 7, 0, 10, @@ -2990,14 +2990,26 @@ static void RGBToYMatrixRow_LSX(const uint8_t* src_rgba, : "+&r"(src_rgba), // %0 "+&r"(dst_y), // %1 "+&r"(width) // %2 - : "r"(c), // %3 + : "r"(rgbconstants), // %3 "r"(shuff) // %4 : "memory"); } +void RGB24ToYJRow_LSX(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { + RGBToYMatrixRow_LSX(src_rgb24, dst_yj, width, &kRgb24JPEGConstants); +} +void RAWToYJRow_LSX(const uint8_t* src_raw, uint8_t* dst_yj, int width) { + RGBToYMatrixRow_LSX(src_raw, dst_yj, width, &kRawJPEGConstants); +} +void RGB24ToYRow_LSX(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { + RGBToYMatrixRow_LSX(src_rgb24, dst_y, width, &kRgb24I601Constants); +} +void RAWToYRow_LSX(const uint8_t* src_raw, uint8_t* dst_y, int width) { + RGBToYMatrixRow_LSX(src_raw, dst_y, width, &kRawI601Constants); +} // undef for unified sources build #undef YUVTORGB_SETUP diff --git a/source/row_neon.cc b/source/row_neon.cc index 895e6f113..6c3118913 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -1918,72 +1918,6 @@ void ARGBToUVJ444Row_NEON(const uint8_t* src_argb, // clang-format on // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr. -void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width, - const struct ArgbConstants* c) { - asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_argb - "vld1.8 {d18}, [%5] \n" // load kRGBToU - "vld1.8 {d19}, [%6] \n" // load kRGBToV - "vmovl.s8 q8, d18 \n" // U coeffs in q8 (d16, d17) - "vmovl.s8 q9, d19 \n" // V coeffs in q9 (d18, d19) - "vdup.16 q10, d16[0] \n" // U0 - "vdup.16 q11, d16[1] \n" // U1 - "vdup.16 q12, d16[2] \n" // U2 - "vdup.16 q13, d18[0] \n" // V0 - "vdup.16 q14, d18[1] \n" // V1 - "vdup.16 q15, d18[2] \n" // V2 - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. - "subs %4, %4, #16 \n" // 16 processed per loop. - "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. - "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels. - "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels. - "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. - "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. - - "vrshr.u16 q0, q0, #2 \n" // average of 4 - "vrshr.u16 q1, q1, #2 \n" - "vrshr.u16 q2, q2, #2 \n" - - "vmov.u16 q3, #0x8000 \n" // 128.0 - - "vmul.s16 q8, q0, q10 \n" // U = B * U0 - "vmla.s16 q8, q1, q11 \n" // U += G * U1 - "vmla.s16 q8, q2, q12 \n" // U += R * U2 - - "vmul.s16 q9, q0, q13 \n" // V = B * V0 - "vmla.s16 q9, q1, q14 \n" // V += G * V1 - "vmla.s16 q9, q2, q15 \n" // V += R * V2 - - "vsub.u16 q8, q3, q8 \n" // 128.0 - U - "vsub.u16 q9, q3, q9 \n" // 128.0 - V - - "vqshrn.u16 d0, q8, #8 \n" // Saturating shift right - "vqshrn.u16 d1, q9, #8 \n" - - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_stride_argb), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : "r"(&c->kRGBToU), // %5 - "r"(&c->kRGBToV) // %6 - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); -} - void ARGBToUVRow_NEON(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, @@ -2896,7 +2830,7 @@ void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) { RGBAToYMatrixRow_NEON(src_bgra, dst_y, width, &kAbgrI601Constants); } -void RGBToYMatrixRow_NEON(const uint8_t* src_rgb, +static void RGBToYMatrixRow_NEON(const uint8_t* src_rgb, uint8_t* dst_y, int width, const struct ArgbConstants* c) { @@ -2931,9 +2865,21 @@ void RGBToYMatrixRow_NEON(const uint8_t* src_rgb, "q12"); } +void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { + RGBToYMatrixRow_NEON(src_rgb24, dst_yj, width, &kArgbJPEGConstants); +} +void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) { + RGBToYMatrixRow_NEON(src_raw, dst_yj, width, &kAbgrJPEGConstants); +} +void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { + RGBToYMatrixRow_NEON(src_rgb24, dst_y, width, &kArgbI601Constants); +} +void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) { + RGBToYMatrixRow_NEON(src_raw, dst_y, width, &kAbgrI601Constants); +} // Bilinear filter 16x2 -> 16x1 void InterpolateRow_NEON(uint8_t* dst_ptr, diff --git a/source/row_neon64.cc b/source/row_neon64.cc index 19016cc3b..c0fdc6d0d 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -9,7 +9,6 @@ */ #include "libyuv/row.h" -#include "libyuv/convert_from_argb.h" #ifdef __cplusplus namespace libyuv { @@ -2894,26 +2893,14 @@ void ARGBToUVJ444Row_NEON_I8MM(const uint8_t* src_argb, // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr. // TODO(fbarchard): consider ptrdiff_t for all strides. -void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width, - const struct ArgbConstants* c) { +void ARGBToUVRow_NEON(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { const uint8_t* src_argb_1 = src_argb + src_stride_argb; asm volatile ( - "ldr q16, [%[c], #16] \n" // kRGBToU - "ldr q17, [%[c], #32] \n" // kRGBToV - "sxtl v16.8h, v16.8b \n" // sign extend U coeffs to 16-bit - "sxtl v17.8h, v17.8b \n" // sign extend V coeffs to 16-bit - "dup v20.8h, v16.h[0] \n" // U0 (-BU) - "dup v21.8h, v16.h[1] \n" // U1 (-GU) - "dup v22.8h, v16.h[2] \n" // U2 (-RU) - "dup v23.8h, v17.h[0] \n" // V0 (-BV) - "dup v24.8h, v17.h[1] \n" // V1 (-GV) - "dup v26.8h, v17.h[2] \n" // V2 (-RV) - "movi v25.8h, #0x80, lsl #8 \n" // 128.0 in 16-bit (0x8000) - + RGBTOUV_SETUP_REG "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. "subs %w4, %w4, #16 \n" // 16 processed per loop. @@ -2922,7 +2909,7 @@ void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb, "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. - "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more. + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. "prfm pldl1keep, [%1, 448] \n" "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. @@ -2932,20 +2919,7 @@ void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb, "urshr v1.8h, v1.8h, #2 \n" "urshr v2.8h, v2.8h, #2 \n" - // U = B*U0 + G*U1 + R*U2 - "mul v3.8h, v0.8h, v20.8h \n" - "mla v3.8h, v1.8h, v21.8h \n" - "mla v3.8h, v2.8h, v22.8h \n" - - // V = B*V0 + G*V1 + R*V2 - "mul v4.8h, v0.8h, v23.8h \n" - "mla v4.8h, v1.8h, v24.8h \n" - "mla v4.8h, v2.8h, v26.8h \n" - - // U = (128.0 - U) >> 8, V = (128.0 - V) >> 8 - "subhn v0.8b, v25.8h, v3.8h \n" - "subhn v1.8b, v25.8h, v4.8h \n" - + RGBTOUV(v0.8h, v1.8h, v2.8h) "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. "b.gt 1b \n" @@ -2954,21 +2928,12 @@ void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb, "+r"(dst_u), // %2 "+r"(dst_v), // %3 "+r"(width) // %4 - : [c] "r"(c) // %5 + : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v16", "v17", "v20", "v21", "v22", "v23", "v24", "v25", "v26" + "v20", "v21", "v22", "v23", "v24", "v25" ); } -void ARGBToUVRow_NEON(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - ARGBToUVMatrixRow_NEON(src_argb, src_stride_argb, dst_u, dst_v, width, - &kArgbI601Constants); -} - void ARGBToUVJRow_NEON(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, @@ -3484,7 +3449,7 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, } // Process any of ARGB, ABGR, BGRA, RGBA, by adjusting the uvconstants layout. -static void ARGBToUVMatrixRow_NEON_I8MM_Impl(const uint8_t* src, +static void ABCDToUVMatrixRow_NEON_I8MM(const uint8_t* src, int src_stride, uint8_t* dst_u, uint8_t* dst_v, @@ -3581,25 +3546,12 @@ static const int8_t kRGBAToUVCoefficients[] = { 0, -112, 74, 38, 0, 18, 94, -112, }; -void ARGBToUVMatrixRow_NEON_I8MM(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width, - const struct ArgbConstants* c) { - int8_t uvconstants[8] = { - (int8_t)c->kRGBToU[0], (int8_t)c->kRGBToU[1], (int8_t)c->kRGBToU[2], (int8_t)c->kRGBToU[3], - (int8_t)c->kRGBToV[0], (int8_t)c->kRGBToV[1], (int8_t)c->kRGBToV[2], (int8_t)c->kRGBToV[3]}; - ARGBToUVMatrixRow_NEON_I8MM_Impl(src_argb, src_stride_argb, dst_u, dst_v, width, - uvconstants); -} - void ARGBToUVRow_NEON_I8MM(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, int width) { - ARGBToUVMatrixRow_NEON_I8MM_Impl(src_argb, src_stride_argb, dst_u, dst_v, width, + ABCDToUVMatrixRow_NEON_I8MM(src_argb, src_stride_argb, dst_u, dst_v, width, kARGBToUVCoefficients); } @@ -3608,7 +3560,7 @@ void ABGRToUVRow_NEON_I8MM(const uint8_t* src_abgr, uint8_t* dst_u, uint8_t* dst_v, int width) { - ARGBToUVMatrixRow_NEON_I8MM_Impl(src_abgr, src_stride_abgr, dst_u, dst_v, width, + ABCDToUVMatrixRow_NEON_I8MM(src_abgr, src_stride_abgr, dst_u, dst_v, width, kABGRToUVCoefficients); } @@ -3617,7 +3569,7 @@ void BGRAToUVRow_NEON_I8MM(const uint8_t* src_bgra, uint8_t* dst_u, uint8_t* dst_v, int width) { - ARGBToUVMatrixRow_NEON_I8MM_Impl(src_bgra, src_stride_bgra, dst_u, dst_v, width, + ABCDToUVMatrixRow_NEON_I8MM(src_bgra, src_stride_bgra, dst_u, dst_v, width, kBGRAToUVCoefficients); } @@ -3626,7 +3578,7 @@ void RGBAToUVRow_NEON_I8MM(const uint8_t* src_rgba, uint8_t* dst_u, uint8_t* dst_v, int width) { - ARGBToUVMatrixRow_NEON_I8MM_Impl(src_rgba, src_stride_rgba, dst_u, dst_v, width, + ABCDToUVMatrixRow_NEON_I8MM(src_rgba, src_stride_rgba, dst_u, dst_v, width, kRGBAToUVCoefficients); } @@ -3654,7 +3606,7 @@ void ARGBToUVJRow_NEON_I8MM(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width) { - ARGBToUVMatrixRow_NEON_I8MM_Impl(src_argb, src_stride_argb, dst_u, dst_v, width, + ABCDToUVMatrixRow_NEON_I8MM(src_argb, src_stride_argb, dst_u, dst_v, width, kARGBToUVJCoefficients); } @@ -3663,7 +3615,7 @@ void ABGRToUVJRow_NEON_I8MM(const uint8_t* src_abgr, uint8_t* dst_u, uint8_t* dst_v, int width) { - ARGBToUVMatrixRow_NEON_I8MM_Impl(src_abgr, src_stride_abgr, dst_u, dst_v, width, + ABCDToUVMatrixRow_NEON_I8MM(src_abgr, src_stride_abgr, dst_u, dst_v, width, kABGRToUVJCoefficients); } @@ -3763,20 +3715,22 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27"); } - +struct RgbConstants { + uint8_t kRGBToY[4]; + uint16_t kAddY; +}; // ARGB expects first 3 values to contain RGB and 4th value is ignored. -void ARGBToYMatrixRow_NEON(const uint8_t* src_argb, +static void ARGBToYMatrixRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width, - const struct ArgbConstants* c) { + const struct RgbConstants* rgbconstants) { asm volatile( - "ldr s0, [%3] \n" // load rgbconstants - "ldr s1, [%3, #48] \n" + "ldr d0, [%3] \n" // load rgbconstants "dup v6.16b, v0.b[0] \n" "dup v7.16b, v0.b[1] \n" "dup v16.16b, v0.b[2] \n" - "dup v17.8h, v1.h[0] \n" + "dup v17.8h, v0.h[2] \n" "1: \n" "ld4 {v2.16b,v3.16b,v4.16b,v5.16b}, [%0], #64 \n" // load 16 // pixels. @@ -3795,21 +3749,20 @@ void ARGBToYMatrixRow_NEON(const uint8_t* src_argb, : "+r"(src_argb), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 - : "r"(c) // %3 + : "r"(rgbconstants) // %3 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17"); } -void ARGBToYMatrixRow_NEON_DotProd( +static void ARGBToYMatrixRow_NEON_DotProd( const uint8_t* src_argb, uint8_t* dst_y, int width, - const struct ArgbConstants* c) { + const struct RgbConstants* rgbconstants) { asm volatile( - "ldr s0, [%3] \n" // load rgbconstants - "ldr s1, [%3, #48] \n" + "ldr d0, [%3] \n" // load rgbconstants "dup v16.4s, v0.s[0] \n" - "dup v17.8h, v1.h[0] \n" + "dup v17.8h, v0.h[2] \n" "1: \n" "ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [%0], #64 \n" // load 16 // pixels. @@ -3831,7 +3784,7 @@ void ARGBToYMatrixRow_NEON_DotProd( : "+r"(src_argb), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 - : "r"(c) // %3 + : "r"(rgbconstants) // %3 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17"); } @@ -3841,10 +3794,12 @@ void ARGBToYMatrixRow_NEON_DotProd( // G * 0.5870 coefficient = 150 // R * 0.2990 coefficient = 77 // Add 0.5 -static const struct ArgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, {}, {}, {0x0080}, {}}; -static const struct ArgbConstants kRgb24JPEGDotProdConstants = {{0, 29, 150, 77}, {}, {}, {0x0080}, {}}; +static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, + 0x0080}; +static const struct RgbConstants kRgb24JPEGDotProdConstants = {{0, 29, 150, 77}, + 0x0080}; -static const struct ArgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, {}, {}, {0x0080}, {}}; +static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 0x0080}; // RGB to BT.601 coefficients // B * 0.1016 coefficient = 25 @@ -3852,11 +3807,14 @@ static const struct ArgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, {}, {}, // R * 0.2578 coefficient = 66 // Add 16.5 = 0x1080 -static const struct ArgbConstants kRgb24I601Constants = {{25, 129, 66, 0}, {}, {}, {0x1080}, {}}; -static const struct ArgbConstants kRgb24I601DotProdConstants = {{0, 25, 129, 66}, {}, {}, {0x1080}, {}}; +static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0}, + 0x1080}; +static const struct RgbConstants kRgb24I601DotProdConstants = {{0, 25, 129, 66}, + 0x1080}; -static const struct ArgbConstants kRawI601Constants = {{66, 129, 25, 0}, {}, {}, {0x1080}, {}}; -static const struct ArgbConstants kRawI601DotProdConstants = {{0, 66, 129, 25}, {}, {}, {0x1080}, {}}; +static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0}, 0x1080}; +static const struct RgbConstants kRawI601DotProdConstants = {{0, 66, 129, 25}, + 0x1080}; void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { ARGBToYMatrixRow_NEON(src_argb, dst_y, width, &kRgb24I601Constants); @@ -3903,14 +3861,13 @@ void ABGRToYJRow_NEON_DotProd(const uint8_t* src_abgr, static void RGBAToYMatrixRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width, - const struct ArgbConstants* c) { + const struct RgbConstants* rgbconstants) { asm volatile( - "ldr s0, [%3] \n" // load rgbconstants - "ldr s1, [%3, #48] \n" + "ldr d0, [%3] \n" // load rgbconstants "dup v6.16b, v0.b[0] \n" "dup v7.16b, v0.b[1] \n" "dup v16.16b, v0.b[2] \n" - "dup v17.8h, v1.h[0] \n" + "dup v17.8h, v0.h[2] \n" "1: \n" "ld4 {v1.16b,v2.16b,v3.16b,v4.16b}, [%0], #64 \n" // load 16 // pixels. @@ -3929,7 +3886,7 @@ static void RGBAToYMatrixRow_NEON(const uint8_t* src_rgba, : "+r"(src_rgba), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 - : "r"(c) // %3 + : "r"(rgbconstants) // %3 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17"); } @@ -3973,10 +3930,10 @@ void BGRAToYRow_NEON_DotProd(const uint8_t* src_bgra, &kRawI601DotProdConstants); } -void RGBToYMatrixRow_NEON(const uint8_t* src_rgb, +static void RGBToYMatrixRow_NEON(const uint8_t* src_rgb, uint8_t* dst_y, int width, - const struct ArgbConstants* c) { + const struct RgbConstants* rgbconstants) { asm volatile( "ldr d0, [%3] \n" // load rgbconstants "dup v5.16b, v0.b[0] \n" @@ -4000,13 +3957,25 @@ void RGBToYMatrixRow_NEON(const uint8_t* src_rgb, : "+r"(src_rgb), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 - : "r"(c) // %3 + : "r"(rgbconstants) // %3 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); } +void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { + RGBToYMatrixRow_NEON(src_rgb24, dst_yj, width, &kRgb24JPEGConstants); +} +void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) { + RGBToYMatrixRow_NEON(src_raw, dst_yj, width, &kRawJPEGConstants); +} +void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { + RGBToYMatrixRow_NEON(src_rgb24, dst_y, width, &kRgb24I601Constants); +} +void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) { + RGBToYMatrixRow_NEON(src_raw, dst_y, width, &kRawI601Constants); +} // Bilinear filter 16x2 -> 16x1 void InterpolateRow_NEON(uint8_t* dst_ptr, diff --git a/source/row_rvv.cc b/source/row_rvv.cc index 93bc431bc..0bdcd879b 100644 --- a/source/row_rvv.cc +++ b/source/row_rvv.cc @@ -126,6 +126,7 @@ extern "C" { } #endif +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE // Read [2*VLEN/8] Y from src_y; Read [VLEN/8] U and [VLEN/8] V from src_uv #define READNV12(vl, w, src_y, src_uv, v_u, v_v, v_y_16) \ { \ @@ -169,6 +170,45 @@ extern "C" { v_y = __riscv_vle8_v_u8m2(src_y, vl); \ v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl); \ } +#else +// Read [2*VLEN/8] Y from src_y; Read [VLEN/8] U and [VLEN/8] V from src_uv +#define READNV12(vl, w, src_y, src_uv, v_u, v_v, v_y_16) \ + { \ + vuint8m1_t v_tmp0, v_tmp1; \ + vuint8m2_t v_y; \ + vuint16m2_t v_u_16, v_v_16; \ + vl = __riscv_vsetvl_e8m1((w + 1) / 2); \ + __riscv_vlseg2e8_v_u8m1(&v_tmp0, &v_tmp1, src_uv, vl); \ + v_u_16 = __riscv_vwaddu_vx_u16m2(v_tmp0, 0, vl); \ + v_v_16 = __riscv_vwaddu_vx_u16m2(v_tmp1, 0, vl); \ + v_v_16 = __riscv_vmul_vx_u16m2(v_v_16, 0x0101, vl); \ + v_u_16 = __riscv_vmul_vx_u16m2(v_u_16, 0x0101, vl); \ + v_v = __riscv_vreinterpret_v_u16m2_u8m2(v_v_16); \ + v_u = __riscv_vreinterpret_v_u16m2_u8m2(v_u_16); \ + vl = __riscv_vsetvl_e8m2(w); \ + v_y = __riscv_vle8_v_u8m2(src_y, vl); \ + v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl); \ + } + +// Read 2*[VLEN/8] Y from src_y; Read [VLEN/8] U and [VLEN/8] V from src_vu +#define READNV21(vl, w, src_y, src_vu, v_u, v_v, v_y_16) \ + { \ + vuint8m1_t v_tmp0, v_tmp1; \ + vuint8m2_t v_y; \ + vuint16m2_t v_u_16, v_v_16; \ + vl = __riscv_vsetvl_e8m1((w + 1) / 2); \ + __riscv_vlseg2e8_v_u8m1(&v_tmp0, &v_tmp1, src_vu, vl); \ + v_u_16 = __riscv_vwaddu_vx_u16m2(v_tmp1, 0, vl); \ + v_v_16 = __riscv_vwaddu_vx_u16m2(v_tmp0, 0, vl); \ + v_v_16 = __riscv_vmul_vx_u16m2(v_v_16, 0x0101, vl); \ + v_u_16 = __riscv_vmul_vx_u16m2(v_u_16, 0x0101, vl); \ + v_v = __riscv_vreinterpret_v_u16m2_u8m2(v_v_16); \ + v_u = __riscv_vreinterpret_v_u16m2_u8m2(v_u_16); \ + vl = __riscv_vsetvl_e8m2(w); \ + v_y = __riscv_vle8_v_u8m2(src_y, vl); \ + v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl); \ + } +#endif #ifdef HAS_ARGBTOAR64ROW_RVV void ARGBToAR64Row_RVV(const uint8_t* src_argb, uint16_t* dst_ar64, int width) { @@ -189,6 +229,7 @@ void ARGBToAR64Row_RVV(const uint8_t* src_argb, uint16_t* dst_ar64, int width) { #endif #ifdef HAS_ARGBTOAB64ROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE void ARGBToAB64Row_RVV(const uint8_t* src_argb, uint16_t* dst_ab64, int width) { size_t avl = (size_t)width; do { @@ -215,6 +256,29 @@ void ARGBToAB64Row_RVV(const uint8_t* src_argb, uint16_t* dst_ab64, int width) { dst_ab64 += 4 * vl; } while (avl > 0); } +#else +void ARGBToAB64Row_RVV(const uint8_t* src_argb, uint16_t* dst_ab64, int width) { + size_t avl = (size_t)width; + do { + vuint16m2_t v_b_16, v_g_16, v_r_16, v_a_16; + vuint8m1_t v_b, v_g, v_r, v_a; + size_t vl = __riscv_vsetvl_e8m1(avl); + __riscv_vlseg4e8_v_u8m1(&v_b, &v_g, &v_r, &v_a, src_argb, vl); + v_b_16 = __riscv_vwaddu_vx_u16m2(v_b, 0, vl); + v_g_16 = __riscv_vwaddu_vx_u16m2(v_g, 0, vl); + v_r_16 = __riscv_vwaddu_vx_u16m2(v_r, 0, vl); + v_a_16 = __riscv_vwaddu_vx_u16m2(v_a, 0, vl); + v_b_16 = __riscv_vmul_vx_u16m2(v_b_16, 0x0101, vl); + v_g_16 = __riscv_vmul_vx_u16m2(v_g_16, 0x0101, vl); + v_r_16 = __riscv_vmul_vx_u16m2(v_r_16, 0x0101, vl); + v_a_16 = __riscv_vmul_vx_u16m2(v_a_16, 0x0101, vl); + __riscv_vsseg4e16_v_u16m2(dst_ab64, v_r_16, v_g_16, v_b_16, v_a_16, vl); + avl -= vl; + src_argb += 4 * vl; + dst_ab64 += 4 * vl; + } while (avl > 0); +} +#endif #endif #ifdef HAS_AR64TOARGBROW_RVV @@ -235,6 +299,7 @@ void AR64ToARGBRow_RVV(const uint16_t* src_ar64, uint8_t* dst_argb, int width) { #endif #ifdef HAS_AR64TOAB64ROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE void AR64ToAB64Row_RVV(const uint16_t* src_ar64, uint16_t* dst_ab64, int width) { @@ -253,9 +318,26 @@ void AR64ToAB64Row_RVV(const uint16_t* src_ar64, dst_ab64 += vl * 4; } while (w > 0); } +#else +void AR64ToAB64Row_RVV(const uint16_t* src_ar64, + uint16_t* dst_ab64, + int width) { + size_t w = (size_t)width; + do { + size_t vl = __riscv_vsetvl_e16m2(w); + vuint16m2_t v_b, v_g, v_r, v_a; + __riscv_vlseg4e16_v_u16m2(&v_b, &v_g, &v_r, &v_a, src_ar64, vl); + __riscv_vsseg4e16_v_u16m2(dst_ab64, v_r, v_g, v_b, v_a, vl); + w -= vl; + src_ar64 += vl * 4; + dst_ab64 += vl * 4; + } while (w > 0); +} +#endif #endif #ifdef HAS_AB64TOARGBROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE void AB64ToARGBRow_RVV(const uint16_t* src_ab64, uint8_t* dst_argb, int width) { size_t avl = (size_t)width; do { @@ -276,9 +358,29 @@ void AB64ToARGBRow_RVV(const uint16_t* src_ab64, uint8_t* dst_argb, int width) { dst_argb += 4 * vl; } while (avl > 0); } +#else +void AB64ToARGBRow_RVV(const uint16_t* src_ab64, uint8_t* dst_argb, int width) { + size_t avl = (size_t)width; + do { + vuint16m2_t v_b_16, v_g_16, v_r_16, v_a_16; + vuint8m1_t v_b, v_g, v_r, v_a; + size_t vl = __riscv_vsetvl_e16m2(avl); + __riscv_vlseg4e16_v_u16m2(&v_r_16, &v_g_16, &v_b_16, &v_a_16, src_ab64, vl); + v_b = __riscv_vnsrl_wx_u8m1(v_b_16, 8, vl); + v_g = __riscv_vnsrl_wx_u8m1(v_g_16, 8, vl); + v_r = __riscv_vnsrl_wx_u8m1(v_r_16, 8, vl); + v_a = __riscv_vnsrl_wx_u8m1(v_a_16, 8, vl); + __riscv_vsseg4e8_v_u8m1(dst_argb, v_b, v_g, v_r, v_a, vl); + avl -= vl; + src_ab64 += 4 * vl; + dst_argb += 4 * vl; + } while (avl > 0); +} +#endif #endif #ifdef HAS_RAWTOARGBROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE void RAWToARGBRow_RVV(const uint8_t* src_raw, uint8_t* dst_argb, int width) { size_t w = (size_t)width; size_t vl = __riscv_vsetvl_e8m2(w); @@ -296,9 +398,26 @@ void RAWToARGBRow_RVV(const uint8_t* src_raw, uint8_t* dst_argb, int width) { vl = __riscv_vsetvl_e8m2(w); } while (w > 0); } +#else +void RAWToARGBRow_RVV(const uint8_t* src_raw, uint8_t* dst_argb, int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl); + do { + vuint8m2_t v_b, v_g, v_r; + __riscv_vlseg3e8_v_u8m2(&v_r, &v_g, &v_b, src_raw, vl); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); + w -= vl; + src_raw += vl * 3; + dst_argb += vl * 4; + vl = __riscv_vsetvl_e8m2(w); + } while (w > 0); +} +#endif #endif #ifdef HAS_RAWTORGBAROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE void RAWToRGBARow_RVV(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { size_t w = (size_t)width; size_t vl = __riscv_vsetvl_e8m2(w); @@ -316,9 +435,26 @@ void RAWToRGBARow_RVV(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { vl = __riscv_vsetvl_e8m2(w); } while (w > 0); } +#else +void RAWToRGBARow_RVV(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl); + do { + vuint8m2_t v_b, v_g, v_r; + __riscv_vlseg3e8_v_u8m2(&v_r, &v_g, &v_b, src_raw, vl); + __riscv_vsseg4e8_v_u8m2(dst_rgba, v_a, v_b, v_g, v_r, vl); + w -= vl; + src_raw += vl * 3; + dst_rgba += vl * 4; + vl = __riscv_vsetvl_e8m2(w); + } while (w > 0); +} +#endif #endif #ifdef HAS_RAWTORGB24ROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE void RAWToRGB24Row_RVV(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { size_t w = (size_t)width; do { @@ -334,9 +470,24 @@ void RAWToRGB24Row_RVV(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { dst_rgb24 += vl * 3; } while (w > 0); } +#else +void RAWToRGB24Row_RVV(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { + size_t w = (size_t)width; + do { + vuint8m2_t v_b, v_g, v_r; + size_t vl = __riscv_vsetvl_e8m2(w); + __riscv_vlseg3e8_v_u8m2(&v_b, &v_g, &v_r, src_raw, vl); + __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_r, v_g, v_b, vl); + w -= vl; + src_raw += vl * 3; + dst_rgb24 += vl * 3; + } while (w > 0); +} +#endif #endif #ifdef HAS_ARGBTORAWROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE void ARGBToRAWRow_RVV(const uint8_t* src_argb, uint8_t* dst_raw, int width) { size_t w = (size_t)width; do { @@ -352,9 +503,24 @@ void ARGBToRAWRow_RVV(const uint8_t* src_argb, uint8_t* dst_raw, int width) { dst_raw += vl * 3; } while (w > 0); } +#else +void ARGBToRAWRow_RVV(const uint8_t* src_argb, uint8_t* dst_raw, int width) { + size_t w = (size_t)width; + do { + vuint8m2_t v_b, v_g, v_r, v_a; + size_t vl = __riscv_vsetvl_e8m2(w); + __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl); + __riscv_vsseg3e8_v_u8m2(dst_raw, v_r, v_g, v_b, vl); + w -= vl; + src_argb += vl * 4; + dst_raw += vl * 3; + } while (w > 0); +} +#endif #endif #ifdef HAS_ARGBTORGB24ROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE void ARGBToRGB24Row_RVV(const uint8_t* src_argb, uint8_t* dst_rgb24, int width) { @@ -372,9 +538,26 @@ void ARGBToRGB24Row_RVV(const uint8_t* src_argb, dst_rgb24 += vl * 3; } while (w > 0); } +#else +void ARGBToRGB24Row_RVV(const uint8_t* src_argb, + uint8_t* dst_rgb24, + int width) { + size_t w = (size_t)width; + do { + vuint8m2_t v_b, v_g, v_r, v_a; + size_t vl = __riscv_vsetvl_e8m2(w); + __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl); + __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_b, v_g, v_r, vl); + w -= vl; + src_argb += vl * 4; + dst_rgb24 += vl * 3; + } while (w > 0); +} +#endif #endif #ifdef HAS_ARGBTOABGRROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE void ARGBToABGRRow_RVV(const uint8_t* src_argb, uint8_t* dst_abgr, int width) { size_t w = (size_t)width; do { @@ -391,9 +574,24 @@ void ARGBToABGRRow_RVV(const uint8_t* src_argb, uint8_t* dst_abgr, int width) { dst_abgr += vl * 4; } while (w > 0); } +#else +void ARGBToABGRRow_RVV(const uint8_t* src_argb, uint8_t* dst_abgr, int width) { + size_t w = (size_t)width; + do { + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2_t v_a, v_r, v_g, v_b; + __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl); + __riscv_vsseg4e8_v_u8m2(dst_abgr, v_r, v_g, v_b, v_a, vl); + w -= vl; + src_argb += vl * 4; + dst_abgr += vl * 4; + } while (w > 0); +} +#endif #endif #ifdef HAS_ARGBTOBGRAROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE void ARGBToBGRARow_RVV(const uint8_t* src_argb, uint8_t* dst_bgra, int width) { size_t w = (size_t)width; do { @@ -410,9 +608,24 @@ void ARGBToBGRARow_RVV(const uint8_t* src_argb, uint8_t* dst_bgra, int width) { dst_bgra += vl * 4; } while (w > 0); } +#else +void ARGBToBGRARow_RVV(const uint8_t* src_argb, uint8_t* dst_bgra, int width) { + size_t w = (size_t)width; + do { + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2_t v_a, v_r, v_g, v_b; + __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl); + __riscv_vsseg4e8_v_u8m2(dst_bgra, v_a, v_r, v_g, v_b, vl); + w -= vl; + src_argb += vl * 4; + dst_bgra += vl * 4; + } while (w > 0); +} +#endif #endif #ifdef HAS_ARGBTORGBAROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE void ARGBToRGBARow_RVV(const uint8_t* src_argb, uint8_t* dst_rgba, int width) { size_t w = (size_t)width; do { @@ -429,9 +642,24 @@ void ARGBToRGBARow_RVV(const uint8_t* src_argb, uint8_t* dst_rgba, int width) { dst_rgba += vl * 4; } while (w > 0); } +#else +void ARGBToRGBARow_RVV(const uint8_t* src_argb, uint8_t* dst_rgba, int width) { + size_t w = (size_t)width; + do { + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2_t v_a, v_r, v_g, v_b; + __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl); + __riscv_vsseg4e8_v_u8m2(dst_rgba, v_a, v_b, v_g, v_r, vl); + w -= vl; + src_argb += vl * 4; + dst_rgba += vl * 4; + } while (w > 0); +} +#endif #endif #ifdef HAS_RGBATOARGBROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE void RGBAToARGBRow_RVV(const uint8_t* src_rgba, uint8_t* dst_argb, int width) { size_t w = (size_t)width; do { @@ -448,9 +676,24 @@ void RGBAToARGBRow_RVV(const uint8_t* src_rgba, uint8_t* dst_argb, int width) { dst_argb += vl * 4; } while (w > 0); } +#else +void RGBAToARGBRow_RVV(const uint8_t* src_rgba, uint8_t* dst_argb, int width) { + size_t w = (size_t)width; + do { + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2_t v_a, v_r, v_g, v_b; + __riscv_vlseg4e8_v_u8m2(&v_a, &v_b, &v_g, &v_r, src_rgba, vl); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); + w -= vl; + src_rgba += vl * 4; + dst_argb += vl * 4; + } while (w > 0); +} +#endif #endif #ifdef HAS_RGB24TOARGBROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE void RGB24ToARGBRow_RVV(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) { @@ -470,9 +713,28 @@ void RGB24ToARGBRow_RVV(const uint8_t* src_rgb24, vl = __riscv_vsetvl_e8m2(w); } while (w > 0); } +#else +void RGB24ToARGBRow_RVV(const uint8_t* src_rgb24, + uint8_t* dst_argb, + int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl); + do { + vuint8m2_t v_b, v_g, v_r; + __riscv_vlseg3e8_v_u8m2(&v_b, &v_g, &v_r, src_rgb24, vl); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); + w -= vl; + src_rgb24 += vl * 3; + dst_argb += vl * 4; + vl = __riscv_vsetvl_e8m2(w); + } while (w > 0); +} +#endif #endif #ifdef HAS_I444TOARGBROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE void I444ToARGBRow_RVV(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -503,9 +765,40 @@ void I444ToARGBRow_RVV(const uint8_t* src_y, dst_argb += vl * 4; } while (w > 0); } +#else +void I444ToARGBRow_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r, v_a; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + v_a = __riscv_vmv_v_x_u8m2(255u, vl); + do { + READYUV444(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); + w -= vl; + src_y += vl; + src_u += vl; + src_v += vl; + dst_argb += vl * 4; + } while (w > 0); +} +#endif #endif #ifdef HAS_I444ALPHATOARGBROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE void I444AlphaToARGBRow_RVV(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -538,9 +831,42 @@ void I444AlphaToARGBRow_RVV(const uint8_t* src_y, dst_argb += vl * 4; } while (w > 0); } +#else +void I444AlphaToARGBRow_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + size_t vl; + size_t w = (size_t)width; + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r, v_a; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + do { + READYUV444(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16); + v_a = __riscv_vle8_v_u8m2(src_a, vl); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); + w -= vl; + src_y += vl; + src_a += vl; + src_u += vl; + src_v += vl; + dst_argb += vl * 4; + } while (w > 0); +} +#endif #endif #ifdef HAS_I444TORGB24ROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE void I444ToRGB24Row_RVV(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -570,9 +896,39 @@ void I444ToRGB24Row_RVV(const uint8_t* src_y, dst_rgb24 += vl * 3; } while (w > 0); } +#else +void I444ToRGB24Row_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + size_t vl; + size_t w = (size_t)width; + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + do { + READYUV444(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_b, v_g, v_r, vl); + w -= vl; + src_y += vl; + src_u += vl; + src_v += vl; + dst_rgb24 += vl * 3; + } while (w > 0); +} +#endif #endif #ifdef HAS_I422TOARGBROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE void I422ToARGBRow_RVV(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -603,9 +959,40 @@ void I422ToARGBRow_RVV(const uint8_t* src_y, dst_argb += vl * 4; } while (w > 0); } +#else +void I422ToARGBRow_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r, v_a; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + v_a = __riscv_vmv_v_x_u8m2(255u, vl); + do { + READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); + w -= vl; + src_y += vl; + src_u += vl / 2; + src_v += vl / 2; + dst_argb += vl * 4; + } while (w > 0); +} +#endif #endif #ifdef HAS_I422ALPHATOARGBROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE void I422AlphaToARGBRow_RVV(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -638,9 +1025,42 @@ void I422AlphaToARGBRow_RVV(const uint8_t* src_y, dst_argb += vl * 4; } while (w > 0); } +#else +void I422AlphaToARGBRow_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + size_t vl; + size_t w = (size_t)width; + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r, v_a; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + do { + READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16); + v_a = __riscv_vle8_v_u8m2(src_a, vl); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); + w -= vl; + src_y += vl; + src_a += vl; + src_u += vl / 2; + src_v += vl / 2; + dst_argb += vl * 4; + } while (w > 0); +} +#endif #endif #ifdef HAS_I422TORGBAROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE void I422ToRGBARow_RVV(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -671,9 +1091,40 @@ void I422ToRGBARow_RVV(const uint8_t* src_y, dst_rgba += vl * 4; } while (w > 0); } +#else +void I422ToRGBARow_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgba, + const struct YuvConstants* yuvconstants, + int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r, v_a; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + v_a = __riscv_vmv_v_x_u8m2(255u, vl); + do { + READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + __riscv_vsseg4e8_v_u8m2(dst_rgba, v_a, v_b, v_g, v_r, vl); + w -= vl; + src_y += vl; + src_u += vl / 2; + src_v += vl / 2; + dst_rgba += vl * 4; + } while (w > 0); +} +#endif #endif #ifdef HAS_I422TORGB24ROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE void I422ToRGB24Row_RVV(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -703,10 +1154,39 @@ void I422ToRGB24Row_RVV(const uint8_t* src_y, dst_rgb24 += vl * 3; } while (w > 0); } +#else +void I422ToRGB24Row_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + size_t vl; + size_t w = (size_t)width; + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + do { + READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_b, v_g, v_r, vl); + w -= vl; + src_y += vl; + src_u += vl / 2; + src_v += vl / 2; + dst_rgb24 += vl * 3; + } while (w > 0); +} +#endif #endif #ifdef HAS_I400TOARGBROW_RVV -#if defined(LIBYUV_RVV_HAS_VXRM_ARG) +#if defined(LIBYUV_RVV_HAS_TUPLE_TYPE) && defined(LIBYUV_RVV_HAS_VXRM_ARG) void I400ToARGBRow_RVV(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, @@ -787,6 +1267,7 @@ void I400ToARGBRow_RVV(const uint8_t* src_y, #endif #ifdef HAS_J400TOARGBROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE void J400ToARGBRow_RVV(const uint8_t* src_y, uint8_t* dst_argb, int width) { size_t w = (size_t)width; size_t vl = __riscv_vsetvl_e8m2(w); @@ -801,6 +1282,22 @@ void J400ToARGBRow_RVV(const uint8_t* src_y, uint8_t* dst_argb, int width) { vl = __riscv_vsetvl_e8m2(w); } while (w > 0); } +#else +void J400ToARGBRow_RVV(const uint8_t* src_y, uint8_t* dst_argb, int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl); + do { + vuint8m2_t v_y; + v_y = __riscv_vle8_v_u8m2(src_y, vl); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_y, v_y, v_y, v_a, vl); + w -= vl; + src_y += vl; + dst_argb += vl * 4; + vl = __riscv_vsetvl_e8m2(w); + } while (w > 0); +} +#endif #endif #ifdef HAS_COPYROW_RVV @@ -818,6 +1315,7 @@ void CopyRow_RVV(const uint8_t* src, uint8_t* dst, int width) { #endif #ifdef HAS_NV12TOARGBROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE void NV12ToARGBRow_RVV(const uint8_t* src_y, const uint8_t* src_uv, uint8_t* dst_argb, @@ -846,9 +1344,38 @@ void NV12ToARGBRow_RVV(const uint8_t* src_y, dst_argb += vl * 4; } while (w > 0); } +#else +void NV12ToARGBRow_RVV(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r, v_a; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + v_a = __riscv_vmv_v_x_u8m2(255u, vl); + do { + READNV12(vl, w, src_y, src_uv, v_u, v_v, v_y_16); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); + w -= vl; + src_y += vl; + src_uv += vl; + dst_argb += vl * 4; + } while (w > 0); +} +#endif #endif #ifdef HAS_NV12TORGB24ROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE void NV12ToRGB24Row_RVV(const uint8_t* src_y, const uint8_t* src_uv, uint8_t* dst_rgb24, @@ -876,9 +1403,37 @@ void NV12ToRGB24Row_RVV(const uint8_t* src_y, dst_rgb24 += vl * 3; } while (w > 0); } +#else +void NV12ToRGB24Row_RVV(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + do { + READNV12(vl, w, src_y, src_uv, v_u, v_v, v_y_16); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_b, v_g, v_r, vl); + w -= vl; + src_y += vl; + src_uv += vl; + dst_rgb24 += vl * 3; + } while (w > 0); +} +#endif #endif #ifdef HAS_NV21TOARGBROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE void NV21ToARGBRow_RVV(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_argb, @@ -907,9 +1462,38 @@ void NV21ToARGBRow_RVV(const uint8_t* src_y, dst_argb += vl * 4; } while (w > 0); } +#else +void NV21ToARGBRow_RVV(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r, v_a; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + v_a = __riscv_vmv_v_x_u8m2(255u, vl); + do { + READNV21(vl, w, src_y, src_vu, v_u, v_v, v_y_16); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); + w -= vl; + src_y += vl; + src_vu += vl; + dst_argb += vl * 4; + } while (w > 0); +} +#endif #endif #ifdef HAS_NV21TORGB24ROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE void NV21ToRGB24Row_RVV(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_rgb24, @@ -937,6 +1521,33 @@ void NV21ToRGB24Row_RVV(const uint8_t* src_y, dst_rgb24 += vl * 3; } while (w > 0); } +#else +void NV21ToRGB24Row_RVV(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + do { + READNV21(vl, w, src_y, src_vu, v_u, v_v, v_y_16); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_b, v_g, v_r, vl); + w -= vl; + src_y += vl; + src_vu += vl; + dst_rgb24 += vl * 3; + } while (w > 0); +} +#endif #endif // Bilinear filter [VLEN/8]x2 -> [VLEN/8]x1 @@ -1056,6 +1667,7 @@ void InterpolateRow_RVV(uint8_t* dst_ptr, #endif #ifdef HAS_SPLITRGBROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE void SplitRGBRow_RVV(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g, @@ -1078,9 +1690,32 @@ void SplitRGBRow_RVV(const uint8_t* src_rgb, src_rgb += vl * 3; } while (w > 0); } +#else +void SplitRGBRow_RVV(const uint8_t* src_rgb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width) { + size_t w = (size_t)width; + do { + vuint8m2_t v_b, v_g, v_r; + size_t vl = __riscv_vsetvl_e8m2(w); + __riscv_vlseg3e8_v_u8m2(&v_r, &v_g, &v_b, src_rgb, vl); + __riscv_vse8_v_u8m2(dst_r, v_r, vl); + __riscv_vse8_v_u8m2(dst_g, v_g, vl); + __riscv_vse8_v_u8m2(dst_b, v_b, vl); + w -= vl; + dst_r += vl; + dst_g += vl; + dst_b += vl; + src_rgb += vl * 3; + } while (w > 0); +} +#endif #endif #ifdef HAS_MERGERGBROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE void MergeRGBRow_RVV(const uint8_t* src_r, const uint8_t* src_g, const uint8_t* src_b, @@ -1101,9 +1736,31 @@ void MergeRGBRow_RVV(const uint8_t* src_r, dst_rgb += vl * 3; } while (w > 0); } +#else +void MergeRGBRow_RVV(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_rgb, + int width) { + size_t w = (size_t)width; + do { + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2_t v_r = __riscv_vle8_v_u8m2(src_r, vl); + vuint8m2_t v_g = __riscv_vle8_v_u8m2(src_g, vl); + vuint8m2_t v_b = __riscv_vle8_v_u8m2(src_b, vl); + __riscv_vsseg3e8_v_u8m2(dst_rgb, v_r, v_g, v_b, vl); + w -= vl; + src_r += vl; + src_g += vl; + src_b += vl; + dst_rgb += vl * 3; + } while (w > 0); +} +#endif #endif #ifdef HAS_SPLITARGBROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE void SplitARGBRow_RVV(const uint8_t* src_argb, uint8_t* dst_r, uint8_t* dst_g, @@ -1130,9 +1787,35 @@ void SplitARGBRow_RVV(const uint8_t* src_argb, src_argb += vl * 4; } while (w > 0); } +#else +void SplitARGBRow_RVV(const uint8_t* src_argb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + uint8_t* dst_a, + int width) { + size_t w = (size_t)width; + do { + vuint8m2_t v_b, v_g, v_r, v_a; + size_t vl = __riscv_vsetvl_e8m2(w); + __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl); + __riscv_vse8_v_u8m2(dst_a, v_a, vl); + __riscv_vse8_v_u8m2(dst_r, v_r, vl); + __riscv_vse8_v_u8m2(dst_g, v_g, vl); + __riscv_vse8_v_u8m2(dst_b, v_b, vl); + w -= vl; + dst_a += vl; + dst_r += vl; + dst_g += vl; + dst_b += vl; + src_argb += vl * 4; + } while (w > 0); +} +#endif #endif #ifdef HAS_MERGEARGBROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE void MergeARGBRow_RVV(const uint8_t* src_r, const uint8_t* src_g, const uint8_t* src_b, @@ -1156,9 +1839,34 @@ void MergeARGBRow_RVV(const uint8_t* src_r, dst_argb += vl * 4; } while (w > 0); } +#else +void MergeARGBRow_RVV(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + const uint8_t* src_a, + uint8_t* dst_argb, + int width) { + size_t w = (size_t)width; + do { + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2_t v_r = __riscv_vle8_v_u8m2(src_r, vl); + vuint8m2_t v_g = __riscv_vle8_v_u8m2(src_g, vl); + vuint8m2_t v_b = __riscv_vle8_v_u8m2(src_b, vl); + vuint8m2_t v_a = __riscv_vle8_v_u8m2(src_a, vl); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); + w -= vl; + src_r += vl; + src_g += vl; + src_b += vl; + src_a += vl; + dst_argb += vl * 4; + } while (w > 0); +} +#endif #endif #ifdef HAS_SPLITXRGBROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE void SplitXRGBRow_RVV(const uint8_t* src_argb, uint8_t* dst_r, uint8_t* dst_g, @@ -1181,9 +1889,32 @@ void SplitXRGBRow_RVV(const uint8_t* src_argb, src_argb += vl * 4; } while (w > 0); } +#else +void SplitXRGBRow_RVV(const uint8_t* src_argb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width) { + size_t w = (size_t)width; + do { + vuint8m2_t v_b, v_g, v_r, v_a; + size_t vl = __riscv_vsetvl_e8m2(w); + __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl); + __riscv_vse8_v_u8m2(dst_r, v_r, vl); + __riscv_vse8_v_u8m2(dst_g, v_g, vl); + __riscv_vse8_v_u8m2(dst_b, v_b, vl); + w -= vl; + dst_r += vl; + dst_g += vl; + dst_b += vl; + src_argb += vl * 4; + } while (w > 0); +} +#endif #endif #ifdef HAS_MERGEXRGBROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE void MergeXRGBRow_RVV(const uint8_t* src_r, const uint8_t* src_g, const uint8_t* src_b, @@ -1206,9 +1937,34 @@ void MergeXRGBRow_RVV(const uint8_t* src_r, vl = __riscv_vsetvl_e8m2(w); } while (w > 0); } +#else +void MergeXRGBRow_RVV(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_argb, + int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl); + do { + vuint8m2_t v_r, v_g, v_b; + v_r = __riscv_vle8_v_u8m2(src_r, vl); + v_g = __riscv_vle8_v_u8m2(src_g, vl); + v_b = __riscv_vle8_v_u8m2(src_b, vl); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); + w -= vl; + src_r += vl; + src_g += vl; + src_b += vl; + dst_argb += vl * 4; + vl = __riscv_vsetvl_e8m2(w); + } while (w > 0); +} +#endif #endif #ifdef HAS_SPLITUVROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE void SplitUVRow_RVV(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, @@ -1227,9 +1983,29 @@ void SplitUVRow_RVV(const uint8_t* src_uv, src_uv += 2 * vl; } while (w > 0); } +#else +void SplitUVRow_RVV(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + size_t w = (size_t)width; + do { + size_t vl = __riscv_vsetvl_e8m4(w); + vuint8m4_t v_u, v_v; + __riscv_vlseg2e8_v_u8m4(&v_u, &v_v, src_uv, vl); + __riscv_vse8_v_u8m4(dst_u, v_u, vl); + __riscv_vse8_v_u8m4(dst_v, v_v, vl); + w -= vl; + dst_u += vl; + dst_v += vl; + src_uv += 2 * vl; + } while (w > 0); +} +#endif #endif #ifdef HAS_MERGEUVROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE void MergeUVRow_RVV(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, @@ -1247,18 +2023,43 @@ void MergeUVRow_RVV(const uint8_t* src_u, dst_uv += 2 * vl; } while (w > 0); } +#else +void MergeUVRow_RVV(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width) { + size_t w = (size_t)width; + do { + vuint8m4_t v_u, v_v; + size_t vl = __riscv_vsetvl_e8m4(w); + v_u = __riscv_vle8_v_u8m4(src_u, vl); + v_v = __riscv_vle8_v_u8m4(src_v, vl); + __riscv_vsseg2e8_v_u8m4(dst_uv, v_u, v_v, vl); + w -= vl; + src_u += vl; + src_v += vl; + dst_uv += 2 * vl; + } while (w > 0); +} +#endif #endif - +struct RgbConstants { + uint8_t kRGBToY[4]; + uint16_t kAddY; + uint16_t pad; +}; // RGB to JPeg coefficients // B * 0.1140 coefficient = 29 // G * 0.5870 coefficient = 150 // R * 0.2990 coefficient = 77 // Add 0.5 = 0x80 -static const struct ArgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, {0}, {0}, {128}, {0}}; +static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, + 128, + 0}; -static const struct ArgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, {0}, {0}, {128}, {0}}; +static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0}; // RGB to BT.601 coefficients // B * 0.1016 coefficient = 25 @@ -1266,25 +2067,30 @@ static const struct ArgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, {0}, {0 // R * 0.2578 coefficient = 66 // Add 16.5 = 0x1080 -static const struct ArgbConstants kRgb24I601Constants = {{25, 129, 66, 0}, {0}, {0}, {0x1080}, {0}}; +static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0}, + 0x1080, + 0}; -static const struct ArgbConstants kRawI601Constants = {{66, 129, 25, 0}, {0}, {0}, {0x1080}, {0}}; +static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0}, + 0x1080, + 0}; // ARGB expects first 3 values to contain RGB and 4th value is ignored #ifdef HAS_ARGBTOYMATRIXROW_RVV -void ARGBToYMatrixRow_RVV(const uint8_t* src_argb, +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE +static void ARGBToYMatrixRow_RVV(const uint8_t* src_argb, uint8_t* dst_y, int width, - const struct ArgbConstants* c) { + const struct RgbConstants* rgbconstants) { assert(width != 0); size_t w = (size_t)width; vuint8m2_t v_by, v_gy, v_ry; // vectors are to store RGBToY constant vuint16m4_t v_addy; // vector is to store kAddY size_t vl = __riscv_vsetvl_e8m2(w); - v_by = __riscv_vmv_v_x_u8m2(c->kRGBToY[0], vl); - v_gy = __riscv_vmv_v_x_u8m2(c->kRGBToY[1], vl); - v_ry = __riscv_vmv_v_x_u8m2(c->kRGBToY[2], vl); - v_addy = __riscv_vmv_v_x_u16m4(c->kAddY[0], vl); + v_by = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[0], vl); + v_gy = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[1], vl); + v_ry = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[2], vl); + v_addy = __riscv_vmv_v_x_u16m4(rgbconstants->kAddY, vl); do { vuint8m2_t v_y; vuint16m4_t v_y_u16; @@ -1304,6 +2110,37 @@ void ARGBToYMatrixRow_RVV(const uint8_t* src_argb, dst_y += vl; } while (w > 0); } +#else +static void ARGBToYMatrixRow_RVV(const uint8_t* src_argb, + uint8_t* dst_y, + int width, + const struct RgbConstants* rgbconstants) { + assert(width != 0); + size_t w = (size_t)width; + vuint8m2_t v_by, v_gy, v_ry; // vectors are to store RGBToY constant + vuint16m4_t v_addy; // vector is to store kAddY + size_t vl = __riscv_vsetvl_e8m2(w); + v_by = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[0], vl); + v_gy = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[1], vl); + v_ry = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[2], vl); + v_addy = __riscv_vmv_v_x_u16m4(rgbconstants->kAddY, vl); + do { + vuint8m2_t v_b, v_g, v_r, v_a, v_y; + vuint16m4_t v_y_u16; + size_t vl = __riscv_vsetvl_e8m2(w); + __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl); + v_y_u16 = __riscv_vwmulu_vv_u16m4(v_r, v_ry, vl); + v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_gy, v_g, vl); + v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_by, v_b, vl); + v_y_u16 = __riscv_vadd_vv_u16m4(v_y_u16, v_addy, vl); + v_y = __riscv_vnsrl_wx_u8m2(v_y_u16, 8, vl); + __riscv_vse8_v_u8m2(dst_y, v_y, vl); + w -= vl; + src_argb += 4 * vl; + dst_y += vl; + } while (w > 0); +} +#endif #endif #ifdef HAS_ARGBTOYROW_RVV @@ -1332,19 +2169,20 @@ void ABGRToYJRow_RVV(const uint8_t* src_abgr, uint8_t* dst_yj, int width) { // RGBA expects first value to be A and ignored, then 3 values to contain RGB. #ifdef HAS_RGBATOYMATRIXROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE static void RGBAToYMatrixRow_RVV(const uint8_t* src_rgba, uint8_t* dst_y, int width, - const struct ArgbConstants* c) { + const struct RgbConstants* rgbconstants) { assert(width != 0); size_t w = (size_t)width; vuint8m2_t v_by, v_gy, v_ry; // vectors are to store RGBToY constant vuint16m4_t v_addy; // vector is to store kAddY size_t vl = __riscv_vsetvl_e8m2(w); - v_by = __riscv_vmv_v_x_u8m2(c->kRGBToY[0], vl); - v_gy = __riscv_vmv_v_x_u8m2(c->kRGBToY[1], vl); - v_ry = __riscv_vmv_v_x_u8m2(c->kRGBToY[2], vl); - v_addy = __riscv_vmv_v_x_u16m4(c->kAddY[0], vl); + v_by = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[0], vl); + v_gy = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[1], vl); + v_ry = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[2], vl); + v_addy = __riscv_vmv_v_x_u16m4(rgbconstants->kAddY, vl); do { vuint8m2_t v_y; vuint16m4_t v_y_u16; @@ -1364,6 +2202,37 @@ static void RGBAToYMatrixRow_RVV(const uint8_t* src_rgba, dst_y += vl; } while (w > 0); } +#else +static void RGBAToYMatrixRow_RVV(const uint8_t* src_rgba, + uint8_t* dst_y, + int width, + const struct RgbConstants* rgbconstants) { + assert(width != 0); + size_t w = (size_t)width; + vuint8m2_t v_by, v_gy, v_ry; // vectors are to store RGBToY constant + vuint16m4_t v_addy; // vector is to store kAddY + size_t vl = __riscv_vsetvl_e8m2(w); + v_by = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[0], vl); + v_gy = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[1], vl); + v_ry = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[2], vl); + v_addy = __riscv_vmv_v_x_u16m4(rgbconstants->kAddY, vl); + do { + vuint8m2_t v_b, v_g, v_r, v_a, v_y; + vuint16m4_t v_y_u16; + size_t vl = __riscv_vsetvl_e8m2(w); + __riscv_vlseg4e8_v_u8m2(&v_a, &v_b, &v_g, &v_r, src_rgba, vl); + v_y_u16 = __riscv_vwmulu_vv_u16m4(v_r, v_ry, vl); + v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_gy, v_g, vl); + v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_by, v_b, vl); + v_y_u16 = __riscv_vadd_vv_u16m4(v_y_u16, v_addy, vl); + v_y = __riscv_vnsrl_wx_u8m2(v_y_u16, 8, vl); + __riscv_vse8_v_u8m2(dst_y, v_y, vl); + w -= vl; + src_rgba += 4 * vl; + dst_y += vl; + } while (w > 0); +} +#endif #endif #ifdef HAS_RGBATOYROW_RVV @@ -1385,19 +2254,20 @@ void BGRAToYRow_RVV(const uint8_t* src_bgra, uint8_t* dst_y, int width) { #endif #ifdef HAS_RGBTOYMATRIXROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE static void RGBToYMatrixRow_RVV(const uint8_t* src_rgb, uint8_t* dst_y, int width, - const struct ArgbConstants* c) { + const struct RgbConstants* rgbconstants) { assert(width != 0); size_t w = (size_t)width; vuint8m2_t v_by, v_gy, v_ry; // vectors are to store RGBToY constant vuint16m4_t v_addy; // vector is to store kAddY size_t vl = __riscv_vsetvl_e8m2(w); - v_by = __riscv_vmv_v_x_u8m2(c->kRGBToY[0], vl); - v_gy = __riscv_vmv_v_x_u8m2(c->kRGBToY[1], vl); - v_ry = __riscv_vmv_v_x_u8m2(c->kRGBToY[2], vl); - v_addy = __riscv_vmv_v_x_u16m4(c->kAddY[0], vl); + v_by = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[0], vl); + v_gy = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[1], vl); + v_ry = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[2], vl); + v_addy = __riscv_vmv_v_x_u16m4(rgbconstants->kAddY, vl); do { vuint8m2_t v_y; vuint16m4_t v_y_u16; @@ -1417,24 +2287,68 @@ static void RGBToYMatrixRow_RVV(const uint8_t* src_rgb, dst_y += vl; } while (w > 0); } +#else +static void RGBToYMatrixRow_RVV(const uint8_t* src_rgb, + uint8_t* dst_y, + int width, + const struct RgbConstants* rgbconstants) { + assert(width != 0); + size_t w = (size_t)width; + vuint8m2_t v_by, v_gy, v_ry; // vectors are to store RGBToY constant + vuint16m4_t v_addy; // vector is to store kAddY + size_t vl = __riscv_vsetvl_e8m2(w); + v_by = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[0], vl); + v_gy = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[1], vl); + v_ry = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[2], vl); + v_addy = __riscv_vmv_v_x_u16m4(rgbconstants->kAddY, vl); + do { + vuint8m2_t v_b, v_g, v_r, v_y; + vuint16m4_t v_y_u16; + size_t vl = __riscv_vsetvl_e8m2(w); + __riscv_vlseg3e8_v_u8m2(&v_b, &v_g, &v_r, src_rgb, vl); + v_y_u16 = __riscv_vwmulu_vv_u16m4(v_r, v_ry, vl); + v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_gy, v_g, vl); + v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_by, v_b, vl); + v_y_u16 = __riscv_vadd_vv_u16m4(v_y_u16, v_addy, vl); + v_y = __riscv_vnsrl_wx_u8m2(v_y_u16, 8, vl); + __riscv_vse8_v_u8m2(dst_y, v_y, vl); + w -= vl; + src_rgb += 3 * vl; + dst_y += vl; + } while (w > 0); +} +#endif #endif #ifdef HAS_RGB24TOYJROW_RVV +void RGB24ToYJRow_RVV(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { + RGBToYMatrixRow_RVV(src_rgb24, dst_yj, width, &kRgb24JPEGConstants); +} #endif #ifdef HAS_RAWTOYJROW_RVV +void RAWToYJRow_RVV(const uint8_t* src_raw, uint8_t* dst_yj, int width) { + RGBToYMatrixRow_RVV(src_raw, dst_yj, width, &kRawJPEGConstants); +} #endif #ifdef HAS_RGB24TOYROW_RVV +void RGB24ToYRow_RVV(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { + RGBToYMatrixRow_RVV(src_rgb24, dst_y, width, &kRgb24I601Constants); +} #endif #ifdef HAS_RAWTOYROW_RVV +void RAWToYRow_RVV(const uint8_t* src_raw, uint8_t* dst_y, int width) { + RGBToYMatrixRow_RVV(src_raw, dst_y, width, &kRawI601Constants); +} #endif // Blend src_argb over src_argb1 and store to dst_argb. // dst_argb may be src_argb or src_argb1. // src_argb: RGB values have already been pre-multiplied by the a. #ifdef HAS_ARGBBLENDROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE void ARGBBlendRow_RVV(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, @@ -1481,6 +2395,48 @@ void ARGBBlendRow_RVV(const uint8_t* src_argb, dst_argb += 4 * vl; } while (w > 0); } +#else +void ARGBBlendRow_RVV(const uint8_t* src_argb, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvlmax_e8m2(); + // clamp255((((256 - a) * b) >> 8) + f) + // = b * (256 - a) / 256 + f + // = b - (b * a / 256) + f + vuint8m2_t v_255 = __riscv_vmv_v_x_u8m2(255, vl); + do { + vuint8m2_t v_src0_b, v_src0_g, v_src0_r, v_src0_a; + vuint8m2_t v_src1_b, v_src1_g, v_src1_r, v_src1_a; + vuint8m2_t v_tmp_b, v_tmp_g, v_tmp_r; + vuint8m2_t v_dst_b, v_dst_g, v_dst_r; + vl = __riscv_vsetvl_e8m2(w); + __riscv_vlseg4e8_v_u8m2(&v_src0_b, &v_src0_g, &v_src0_r, &v_src0_a, + src_argb, vl); + __riscv_vlseg4e8_v_u8m2(&v_src1_b, &v_src1_g, &v_src1_r, &v_src1_a, + src_argb1, vl); + + v_tmp_b = __riscv_vmulhu_vv_u8m2(v_src1_b, v_src0_a, vl); + v_tmp_g = __riscv_vmulhu_vv_u8m2(v_src1_g, v_src0_a, vl); + v_tmp_r = __riscv_vmulhu_vv_u8m2(v_src1_r, v_src0_a, vl); + + v_dst_b = __riscv_vsub_vv_u8m2(v_src1_b, v_tmp_b, vl); + v_dst_g = __riscv_vsub_vv_u8m2(v_src1_g, v_tmp_g, vl); + v_dst_r = __riscv_vsub_vv_u8m2(v_src1_r, v_tmp_r, vl); + + v_dst_b = __riscv_vsaddu_vv_u8m2(v_dst_b, v_src0_b, vl); + v_dst_g = __riscv_vsaddu_vv_u8m2(v_dst_g, v_src0_g, vl); + v_dst_r = __riscv_vsaddu_vv_u8m2(v_dst_r, v_src0_r, vl); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_dst_b, v_dst_g, v_dst_r, v_255, vl); + + w -= vl; + src_argb += 4 * vl; + src_argb1 += 4 * vl; + dst_argb += 4 * vl; + } while (w > 0); +} +#endif #endif #ifdef HAS_BLENDPLANEROW_RVV @@ -1518,6 +2474,7 @@ void BlendPlaneRow_RVV(const uint8_t* src0, // Attenuate: (f * a + 255) >> 8 #ifdef HAS_ARGBATTENUATEROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE void ARGBAttenuateRow_RVV(const uint8_t* src_argb, uint8_t* dst_argb, int width) { @@ -1551,9 +2508,39 @@ void ARGBAttenuateRow_RVV(const uint8_t* src_argb, dst_argb += vl * 4; } while (w > 0); } +#else +void ARGBAttenuateRow_RVV(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + size_t w = (size_t)width; + do { + vuint8m2_t v_b, v_g, v_r, v_a; + vuint16m4_t v_ba_16, v_ga_16, v_ra_16; + size_t vl = __riscv_vsetvl_e8m2(w); + __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl); + // f * a + v_ba_16 = __riscv_vwmulu_vv_u16m4(v_b, v_a, vl); + v_ga_16 = __riscv_vwmulu_vv_u16m4(v_g, v_a, vl); + v_ra_16 = __riscv_vwmulu_vv_u16m4(v_r, v_a, vl); + // f * a + 255 + v_ba_16 = __riscv_vadd_vx_u16m4(v_ba_16, 255u, vl); + v_ga_16 = __riscv_vadd_vx_u16m4(v_ga_16, 255u, vl); + v_ra_16 = __riscv_vadd_vx_u16m4(v_ra_16, 255u, vl); + // (f * a + 255) >> 8 + v_b = __riscv_vnsrl_wx_u8m2(v_ba_16, 8, vl); + v_g = __riscv_vnsrl_wx_u8m2(v_ga_16, 8, vl); + v_r = __riscv_vnsrl_wx_u8m2(v_ra_16, 8, vl); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); + w -= vl; + src_argb += vl * 4; + dst_argb += vl * 4; + } while (w > 0); +} +#endif #endif #ifdef HAS_ARGBEXTRACTALPHAROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE void ARGBExtractAlphaRow_RVV(const uint8_t* src_argb, uint8_t* dst_a, int width) { @@ -1568,6 +2555,22 @@ void ARGBExtractAlphaRow_RVV(const uint8_t* src_argb, dst_a += vl; } while (w > 0); } +#else +void ARGBExtractAlphaRow_RVV(const uint8_t* src_argb, + uint8_t* dst_a, + int width) { + size_t w = (size_t)width; + do { + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2_t v_b, v_g, v_r, v_a; + __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl); + __riscv_vse8_v_u8m2(dst_a, v_a, vl); + w -= vl; + src_argb += vl * 4; + dst_a += vl; + } while (w > 0); +} +#endif #endif #ifdef HAS_ARGBCOPYYTOALPHAROW_RVV diff --git a/source/row_sme.cc b/source/row_sme.cc index fca536dc4..bd61b20bf 100644 --- a/source/row_sme.cc +++ b/source/row_sme.cc @@ -1120,20 +1120,6 @@ __arm_locally_streaming void Convert8To16Row_SME(const uint8_t* src_y, : "cc", "memory", "z0", "z1", "z2", "p0", "p1"); } -__arm_locally_streaming void ARGBToUVMatrixRow_SME( - const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width, - const struct ArgbConstants* c) { - int8_t uvconstants[8] = { - (int8_t)c->kRGBToU[0], (int8_t)c->kRGBToU[1], (int8_t)c->kRGBToU[2], (int8_t)c->kRGBToU[3], - (int8_t)c->kRGBToV[0], (int8_t)c->kRGBToV[1], (int8_t)c->kRGBToV[2], (int8_t)c->kRGBToV[3]}; - ARGBToUVMatrixRow_SVE_SC(src_argb, src_stride_argb, dst_u, dst_v, width, - uvconstants); -} - __arm_locally_streaming void ARGBToUVRow_SME(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, diff --git a/source/row_sve.cc b/source/row_sve.cc index 7d8734921..4a51b68fc 100644 --- a/source/row_sve.cc +++ b/source/row_sve.cc @@ -217,19 +217,6 @@ void NV21ToRGB24Row_SVE2(const uint8_t* src_y, NV21ToRGB24Row_SVE_SC(src_y, src_vu, dst_rgb24, yuvconstants, width); } -void ARGBToUVMatrixRow_SVE2(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width, - const struct ArgbConstants* c) { - int8_t uvconstants[8] = { - (int8_t)c->kRGBToU[0], (int8_t)c->kRGBToU[1], (int8_t)c->kRGBToU[2], (int8_t)c->kRGBToU[3], - (int8_t)c->kRGBToV[0], (int8_t)c->kRGBToV[1], (int8_t)c->kRGBToV[2], (int8_t)c->kRGBToV[3]}; - ARGBToUVMatrixRow_SVE_SC(src_argb, src_stride_argb, dst_u, dst_v, width, - uvconstants); -} - void ARGBToUVRow_SVE2(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, diff --git a/source/row_win.cc b/source/row_win.cc index 77070d031..e680ffd9d 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -122,10 +122,8 @@ extern "C" { #if defined(__clang__) || defined(__GNUC__) #define LIBYUV_TARGET_AVX2 __attribute__((target("avx2"))) -#define LIBYUV_TARGET_AVX512BW __attribute__((target("avx512bw,avx512vl,avx512f"))) #else #define LIBYUV_TARGET_AVX2 -#define LIBYUV_TARGET_AVX512BW #endif LIBYUV_TARGET_AVX2 @@ -212,197 +210,6 @@ LIBYUV_TARGET_AVX2 void BGRAToYRow_AVX2(const uint8_t* src_bgra, uint8_t* dst_y, int width) { ARGBToYMatrixRow_AVX2(src_bgra, dst_y, width, &kBgraI601Constants); } - -#ifdef HAS_RAWTOARGBROW_AVX2 -LIBYUV_TARGET_AVX2 -void RAWToARGBRow_AVX2(const uint8_t* src_raw, uint8_t* dst_argb, int width) { - __m256i ymm_alpha = _mm256_set1_epi32(0xff000000); - __m128i shuf_low = _mm_set_epi8(-1, 9, 10, 11, -1, 6, 7, 8, -1, 3, 4, 5, -1, 0, 1, 2); - __m128i shuf_high = _mm_set_epi8(-1, 13, 14, 15, -1, 10, 11, 12, -1, 7, 8, 9, -1, 4, 5, 6); - __m256i ymm_shuf = _mm256_broadcastsi128_si256(shuf_low); - __m256i ymm_shuf2 = _mm256_broadcastsi128_si256(shuf_high); - - while (width > 0) { - __m128i xmm0 = _mm_loadu_si128((const __m128i*)src_raw); - __m256i ymm0 = _mm256_castsi128_si256(xmm0); - ymm0 = _mm256_inserti128_si256(ymm0, _mm_loadu_si128((const __m128i*)(src_raw + 12)), 1); - - __m128i xmm1 = _mm_loadu_si128((const __m128i*)(src_raw + 24)); - __m256i ymm1 = _mm256_castsi128_si256(xmm1); - ymm1 = _mm256_inserti128_si256(ymm1, _mm_loadu_si128((const __m128i*)(src_raw + 36)), 1); - - __m128i xmm2 = _mm_loadu_si128((const __m128i*)(src_raw + 48)); - __m256i ymm2 = _mm256_castsi128_si256(xmm2); - ymm2 = _mm256_inserti128_si256(ymm2, _mm_loadu_si128((const __m128i*)(src_raw + 60)), 1); - - __m128i xmm3 = _mm_loadu_si128((const __m128i*)(src_raw + 68)); - __m256i ymm3 = _mm256_castsi128_si256(xmm3); - ymm3 = _mm256_inserti128_si256(ymm3, _mm_loadu_si128((const __m128i*)(src_raw + 80)), 1); - - ymm0 = _mm256_shuffle_epi8(ymm0, ymm_shuf); - ymm1 = _mm256_shuffle_epi8(ymm1, ymm_shuf); - ymm2 = _mm256_shuffle_epi8(ymm2, ymm_shuf); - ymm3 = _mm256_shuffle_epi8(ymm3, ymm_shuf2); - - ymm0 = _mm256_or_si256(ymm0, ymm_alpha); - ymm1 = _mm256_or_si256(ymm1, ymm_alpha); - ymm2 = _mm256_or_si256(ymm2, ymm_alpha); - ymm3 = _mm256_or_si256(ymm3, ymm_alpha); - - _mm256_storeu_si256((__m256i*)dst_argb, ymm0); - _mm256_storeu_si256((__m256i*)(dst_argb + 32), ymm1); - _mm256_storeu_si256((__m256i*)(dst_argb + 64), ymm2); - _mm256_storeu_si256((__m256i*)(dst_argb + 96), ymm3); - - src_raw += 96; - dst_argb += 128; - width -= 32; - } -} -#endif - -#ifdef HAS_RAWTOARGBROW_AVX512BW -LIBYUV_TARGET_AVX512BW -void RGBToARGBRow_AVX512BW(const uint8_t* src_raw, uint8_t* dst_argb, const __m128i* shuffler, int width) { - __m512i zmm_alpha = _mm512_set1_epi32(0xff000000); - __m512i zmm_perm = _mm512_set_epi32( - 12, 11, 10, 9, 9, 8, 7, 6, 6, 5, 4, 3, 3, 2, 1, 0); - __m512i zmm_shuf = _mm512_broadcast_i32x4(_mm_loadu_si128(shuffler)); - - while (width > 0) { - __m512i zmm0 = _mm512_maskz_loadu_epi8(0xffffffffffffull, src_raw); - __m512i zmm1 = _mm512_maskz_loadu_epi8(0xffffffffffffull, src_raw + 48); - __m512i zmm2 = _mm512_maskz_loadu_epi8(0xffffffffffffull, src_raw + 96); - __m512i zmm3 = _mm512_maskz_loadu_epi8(0xffffffffffffull, src_raw + 144); - - zmm0 = _mm512_permutexvar_epi32(zmm_perm, zmm0); - zmm1 = _mm512_permutexvar_epi32(zmm_perm, zmm1); - zmm2 = _mm512_permutexvar_epi32(zmm_perm, zmm2); - zmm3 = _mm512_permutexvar_epi32(zmm_perm, zmm3); - - zmm0 = _mm512_shuffle_epi8(zmm0, zmm_shuf); - zmm1 = _mm512_shuffle_epi8(zmm1, zmm_shuf); - zmm2 = _mm512_shuffle_epi8(zmm2, zmm_shuf); - zmm3 = _mm512_shuffle_epi8(zmm3, zmm_shuf); - - zmm0 = _mm512_or_si512(zmm0, zmm_alpha); - zmm1 = _mm512_or_si512(zmm1, zmm_alpha); - zmm2 = _mm512_or_si512(zmm2, zmm_alpha); - zmm3 = _mm512_or_si512(zmm3, zmm_alpha); - - _mm512_storeu_si512(dst_argb, zmm0); - _mm512_storeu_si512(dst_argb + 64, zmm1); - _mm512_storeu_si512(dst_argb + 128, zmm2); - _mm512_storeu_si512(dst_argb + 192, zmm3); - - src_raw += 192; - dst_argb += 256; - width -= 64; - } -} - -LIBYUV_TARGET_AVX512BW -void RAWToARGBRow_AVX512BW(const uint8_t* src_raw, uint8_t* dst_argb, int width) { - __m128i shuf = _mm_set_epi8(-1, 9, 10, 11, -1, 6, 7, 8, -1, 3, 4, 5, -1, 0, 1, 2); - RGBToARGBRow_AVX512BW(src_raw, dst_argb, &shuf, width); -} - -LIBYUV_TARGET_AVX512BW -void RGB24ToARGBRow_AVX512BW(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) { - __m128i shuf = _mm_set_epi8(-1, 11, 10, 9, -1, 8, 7, 6, -1, 5, 4, 3, -1, 2, 1, 0); - RGBToARGBRow_AVX512BW(src_rgb24, dst_argb, &shuf, width); -} -#endif - -#ifdef HAS_ARGBTOUVMATRIXROW_AVX2 -LIBYUV_TARGET_AVX2 __attribute__((no_sanitize("cfi-icall"))) -void ARGBToUVMatrixRow_AVX2(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width, - const struct ArgbConstants* c) { - __m256i ymm_u = _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i*)c->kRGBToU)); - __m256i ymm_v = _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i*)c->kRGBToV)); - __m256i ymm_0101 = _mm256_set1_epi16(0x0101); - __m256i ymm_shuf = _mm256_setr_epi8(0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15, - 0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15); - __m256i ymm_8000 = _mm256_set1_epi16((short)0x8000); - __m256i ymm_zero = _mm256_setzero_si256(); - - while (width > 0) { - __m256i ymm0 = _mm256_loadu_si256((const __m256i*)src_argb); - __m256i ymm1 = _mm256_loadu_si256((const __m256i*)(src_argb + 32)); - __m256i ymm2 = _mm256_loadu_si256((const __m256i*)(src_argb + src_stride_argb)); - __m256i ymm3 = _mm256_loadu_si256((const __m256i*)(src_argb + src_stride_argb + 32)); - - ymm0 = _mm256_shuffle_epi8(ymm0, ymm_shuf); - ymm1 = _mm256_shuffle_epi8(ymm1, ymm_shuf); - ymm2 = _mm256_shuffle_epi8(ymm2, ymm_shuf); - ymm3 = _mm256_shuffle_epi8(ymm3, ymm_shuf); - - ymm0 = _mm256_maddubs_epi16(ymm0, ymm_0101); - ymm1 = _mm256_maddubs_epi16(ymm1, ymm_0101); - ymm2 = _mm256_maddubs_epi16(ymm2, ymm_0101); - ymm3 = _mm256_maddubs_epi16(ymm3, ymm_0101); - - ymm0 = _mm256_add_epi16(ymm0, ymm2); - ymm1 = _mm256_add_epi16(ymm1, ymm3); - - ymm0 = _mm256_srli_epi16(ymm0, 1); - ymm1 = _mm256_srli_epi16(ymm1, 1); - ymm0 = _mm256_avg_epu16(ymm0, ymm_zero); - ymm1 = _mm256_avg_epu16(ymm1, ymm_zero); - - ymm0 = _mm256_packus_epi16(ymm0, ymm1); - ymm0 = _mm256_permute4x64_epi64(ymm0, 0xd8); - - ymm1 = _mm256_maddubs_epi16(ymm0, ymm_v); - ymm0 = _mm256_maddubs_epi16(ymm0, ymm_u); - - ymm0 = _mm256_hadd_epi16(ymm0, ymm1); - ymm0 = _mm256_permute4x64_epi64(ymm0, 0xd8); - ymm0 = _mm256_sub_epi16(ymm_8000, ymm0); - ymm0 = _mm256_srli_epi16(ymm0, 8); - ymm0 = _mm256_packus_epi16(ymm0, ymm0); - - __m128i xmm_u = _mm256_castsi256_si128(ymm0); - __m128i xmm_v = _mm256_extracti128_si256(ymm0, 1); - - _mm_storel_epi64((__m128i*)dst_u, xmm_u); - _mm_storel_epi64((__m128i*)dst_v, xmm_v); - - src_argb += 64; - dst_u += 8; - dst_v += 8; - width -= 16; - } -} -#endif - -#ifdef HAS_MERGEUVROW_AVX2 -LIBYUV_TARGET_AVX2 -void MergeUVRow_AVX2(const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uv, - int width) { - while (width > 0) { - __m256i ymm0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i*)src_u)); - __m256i ymm1 = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i*)src_v)); - - ymm1 = _mm256_slli_epi16(ymm1, 8); - ymm0 = _mm256_or_si256(ymm0, ymm1); - - _mm256_storeu_si256((__m256i*)dst_uv, ymm0); - - src_u += 16; - src_v += 16; - dst_uv += 32; - width -= 16; - } -} -#endif - #endif diff --git a/source/scale_common.cc b/source/scale_common.cc index 537f030aa..e51af8d7a 100644 --- a/source/scale_common.cc +++ b/source/scale_common.cc @@ -362,35 +362,36 @@ void ScaleRowDown4Box_C(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width) { + intptr_t stride = src_stride; int x; for (x = 0; x < dst_width - 1; x += 2) { dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + - src_ptr[src_stride + 0] + src_ptr[src_stride + 1] + - src_ptr[src_stride + 2] + src_ptr[src_stride + 3] + - src_ptr[src_stride * 2 + 0] + src_ptr[src_stride * 2 + 1] + - src_ptr[src_stride * 2 + 2] + src_ptr[src_stride * 2 + 3] + - src_ptr[src_stride * 3 + 0] + src_ptr[src_stride * 3 + 1] + - src_ptr[src_stride * 3 + 2] + src_ptr[src_stride * 3 + 3] + 8) >> + src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] + + src_ptr[stride + 3] + src_ptr[stride * 2 + 0] + + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] + + src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] + + src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] + + src_ptr[stride * 3 + 3] + 8) >> 4; dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] + - src_ptr[src_stride + 4] + src_ptr[src_stride + 5] + - src_ptr[src_stride + 6] + src_ptr[src_stride + 7] + - src_ptr[src_stride * 2 + 4] + src_ptr[src_stride * 2 + 5] + - src_ptr[src_stride * 2 + 6] + src_ptr[src_stride * 2 + 7] + - src_ptr[src_stride * 3 + 4] + src_ptr[src_stride * 3 + 5] + - src_ptr[src_stride * 3 + 6] + src_ptr[src_stride * 3 + 7] + 8) >> + src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride + 6] + + src_ptr[stride + 7] + src_ptr[stride * 2 + 4] + + src_ptr[stride * 2 + 5] + src_ptr[stride * 2 + 6] + + src_ptr[stride * 2 + 7] + src_ptr[stride * 3 + 4] + + src_ptr[stride * 3 + 5] + src_ptr[stride * 3 + 6] + + src_ptr[stride * 3 + 7] + 8) >> 4; dst += 2; src_ptr += 8; } if (dst_width & 1) { dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + - src_ptr[src_stride + 0] + src_ptr[src_stride + 1] + - src_ptr[src_stride + 2] + src_ptr[src_stride + 3] + - src_ptr[src_stride * 2 + 0] + src_ptr[src_stride * 2 + 1] + - src_ptr[src_stride * 2 + 2] + src_ptr[src_stride * 2 + 3] + - src_ptr[src_stride * 3 + 0] + src_ptr[src_stride * 3 + 1] + - src_ptr[src_stride * 3 + 2] + src_ptr[src_stride * 3 + 3] + 8) >> + src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] + + src_ptr[stride + 3] + src_ptr[stride * 2 + 0] + + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] + + src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] + + src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] + + src_ptr[stride * 3 + 3] + 8) >> 4; } } @@ -399,35 +400,36 @@ void ScaleRowDown4Box_16_C(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst, int dst_width) { + intptr_t stride = src_stride; int x; for (x = 0; x < dst_width - 1; x += 2) { dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + - src_ptr[src_stride + 0] + src_ptr[src_stride + 1] + - src_ptr[src_stride + 2] + src_ptr[src_stride + 3] + - src_ptr[src_stride * 2 + 0] + src_ptr[src_stride * 2 + 1] + - src_ptr[src_stride * 2 + 2] + src_ptr[src_stride * 2 + 3] + - src_ptr[src_stride * 3 + 0] + src_ptr[src_stride * 3 + 1] + - src_ptr[src_stride * 3 + 2] + src_ptr[src_stride * 3 + 3] + 8) >> + src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] + + src_ptr[stride + 3] + src_ptr[stride * 2 + 0] + + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] + + src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] + + src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] + + src_ptr[stride * 3 + 3] + 8) >> 4; dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] + - src_ptr[src_stride + 4] + src_ptr[src_stride + 5] + - src_ptr[src_stride + 6] + src_ptr[src_stride + 7] + - src_ptr[src_stride * 2 + 4] + src_ptr[src_stride * 2 + 5] + - src_ptr[src_stride * 2 + 6] + src_ptr[src_stride * 2 + 7] + - src_ptr[src_stride * 3 + 4] + src_ptr[src_stride * 3 + 5] + - src_ptr[src_stride * 3 + 6] + src_ptr[src_stride * 3 + 7] + 8) >> + src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride + 6] + + src_ptr[stride + 7] + src_ptr[stride * 2 + 4] + + src_ptr[stride * 2 + 5] + src_ptr[stride * 2 + 6] + + src_ptr[stride * 2 + 7] + src_ptr[stride * 3 + 4] + + src_ptr[stride * 3 + 5] + src_ptr[stride * 3 + 6] + + src_ptr[stride * 3 + 7] + 8) >> 4; dst += 2; src_ptr += 8; } if (dst_width & 1) { dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + - src_ptr[src_stride + 0] + src_ptr[src_stride + 1] + - src_ptr[src_stride + 2] + src_ptr[src_stride + 3] + - src_ptr[src_stride * 2 + 0] + src_ptr[src_stride * 2 + 1] + - src_ptr[src_stride * 2 + 2] + src_ptr[src_stride * 2 + 3] + - src_ptr[src_stride * 3 + 0] + src_ptr[src_stride * 3 + 1] + - src_ptr[src_stride * 3 + 2] + src_ptr[src_stride * 3 + 3] + 8) >> + src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] + + src_ptr[stride + 3] + src_ptr[stride * 2 + 0] + + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] + + src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] + + src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] + + src_ptr[stride * 3 + 3] + 8) >> 4; } } @@ -890,26 +892,27 @@ void ScaleRowDown38_3_Box_C(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { + intptr_t stride = src_stride; int i; assert((dst_width % 3 == 0) && (dst_width > 0)); for (i = 0; i < dst_width; i += 3) { - dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + - src_ptr[src_stride + 0] + src_ptr[src_stride + 1] + - src_ptr[src_stride + 2] + src_ptr[src_stride * 2 + 0] + - src_ptr[src_stride * 2 + 1] + src_ptr[src_stride * 2 + 2]) * - (65536 / 9) >> - 16; - dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + - src_ptr[src_stride + 3] + src_ptr[src_stride + 4] + - src_ptr[src_stride + 5] + src_ptr[src_stride * 2 + 3] + - src_ptr[src_stride * 2 + 4] + src_ptr[src_stride * 2 + 5]) * - (65536 / 9) >> - 16; - dst_ptr[2] = (src_ptr[6] + src_ptr[7] + src_ptr[src_stride + 6] + - src_ptr[src_stride + 7] + src_ptr[src_stride * 2 + 6] + - src_ptr[src_stride * 2 + 7]) * - (65536 / 6) >> - 16; + dst_ptr[0] = + (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] + + src_ptr[stride + 1] + src_ptr[stride + 2] + src_ptr[stride * 2 + 0] + + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) * + (65536 / 9) >> + 16; + dst_ptr[1] = + (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] + + src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride * 2 + 3] + + src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) * + (65536 / 9) >> + 16; + dst_ptr[2] = + (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7] + + src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) * + (65536 / 6) >> + 16; src_ptr += 8; dst_ptr += 3; } @@ -919,26 +922,27 @@ void ScaleRowDown38_3_Box_16_C(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, int dst_width) { + intptr_t stride = src_stride; int i; assert((dst_width % 3 == 0) && (dst_width > 0)); for (i = 0; i < dst_width; i += 3) { - dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + - src_ptr[src_stride + 0] + src_ptr[src_stride + 1] + - src_ptr[src_stride + 2] + src_ptr[src_stride * 2 + 0] + - src_ptr[src_stride * 2 + 1] + src_ptr[src_stride * 2 + 2]) * - (65536u / 9u) >> - 16; - dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + - src_ptr[src_stride + 3] + src_ptr[src_stride + 4] + - src_ptr[src_stride + 5] + src_ptr[src_stride * 2 + 3] + - src_ptr[src_stride * 2 + 4] + src_ptr[src_stride * 2 + 5]) * - (65536u / 9u) >> - 16; - dst_ptr[2] = (src_ptr[6] + src_ptr[7] + src_ptr[src_stride + 6] + - src_ptr[src_stride + 7] + src_ptr[src_stride * 2 + 6] + - src_ptr[src_stride * 2 + 7]) * - (65536u / 6u) >> - 16; + dst_ptr[0] = + (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] + + src_ptr[stride + 1] + src_ptr[stride + 2] + src_ptr[stride * 2 + 0] + + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) * + (65536u / 9u) >> + 16; + dst_ptr[1] = + (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] + + src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride * 2 + 3] + + src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) * + (65536u / 9u) >> + 16; + dst_ptr[2] = + (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7] + + src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) * + (65536u / 6u) >> + 16; src_ptr += 8; dst_ptr += 3; } @@ -949,23 +953,22 @@ void ScaleRowDown38_2_Box_C(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { + intptr_t stride = src_stride; int i; assert((dst_width % 3 == 0) && (dst_width > 0)); for (i = 0; i < dst_width; i += 3) { - dst_ptr[0] = - (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[src_stride + 0] + - src_ptr[src_stride + 1] + src_ptr[src_stride + 2]) * - (65536 / 6) >> - 16; - dst_ptr[1] = - (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[src_stride + 3] + - src_ptr[src_stride + 4] + src_ptr[src_stride + 5]) * - (65536 / 6) >> - 16; - dst_ptr[2] = (src_ptr[6] + src_ptr[7] + src_ptr[src_stride + 6] + - src_ptr[src_stride + 7]) * - (65536 / 4) >> + dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] + + src_ptr[stride + 1] + src_ptr[stride + 2]) * + (65536 / 6) >> 16; + dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] + + src_ptr[stride + 4] + src_ptr[stride + 5]) * + (65536 / 6) >> + 16; + dst_ptr[2] = + (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7]) * + (65536 / 4) >> + 16; src_ptr += 8; dst_ptr += 3; } @@ -975,23 +978,22 @@ void ScaleRowDown38_2_Box_16_C(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, int dst_width) { + intptr_t stride = src_stride; int i; assert((dst_width % 3 == 0) && (dst_width > 0)); for (i = 0; i < dst_width; i += 3) { - dst_ptr[0] = - (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[src_stride + 0] + - src_ptr[src_stride + 1] + src_ptr[src_stride + 2]) * - (65536u / 6u) >> - 16; - dst_ptr[1] = - (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[src_stride + 3] + - src_ptr[src_stride + 4] + src_ptr[src_stride + 5]) * - (65536u / 6u) >> - 16; - dst_ptr[2] = (src_ptr[6] + src_ptr[7] + src_ptr[src_stride + 6] + - src_ptr[src_stride + 7]) * - (65536u / 4u) >> + dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] + + src_ptr[stride + 1] + src_ptr[stride + 2]) * + (65536u / 6u) >> 16; + dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] + + src_ptr[stride + 4] + src_ptr[stride + 5]) * + (65536u / 6u) >> + 16; + dst_ptr[2] = + (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7]) * + (65536u / 4u) >> + 16; src_ptr += 8; dst_ptr += 3; } @@ -1687,7 +1689,7 @@ void ScalePlaneVertical(int src_height, } yi = y >> 16; yf = filtering ? ((y >> 8) & 255) : 0; - InterpolateRow(dst_argb, src_argb + yi * (ptrdiff_t)src_stride, src_stride, + InterpolateRow(dst_argb, src_argb + yi * src_stride, src_stride, dst_width_bytes, yf); dst_argb += dst_stride; y += dy; @@ -1763,7 +1765,7 @@ void ScalePlaneVertical_16(int src_height, } yi = y >> 16; yf = filtering ? ((y >> 8) & 255) : 0; - InterpolateRow(dst_argb, src_argb + yi * (ptrdiff_t)src_stride, src_stride, + InterpolateRow(dst_argb, src_argb + yi * src_stride, src_stride, dst_width_words, yf); dst_argb += dst_stride; y += dy; @@ -1832,8 +1834,8 @@ void ScalePlaneVertical_16To8(int src_height, } yi = y >> 16; yf = filtering ? ((y >> 8) & 255) : 0; - InterpolateRow_16To8(dst_argb, src_argb + yi * (ptrdiff_t)src_stride, - src_stride, scale, dst_width_words, yf); + InterpolateRow_16To8(dst_argb, src_argb + yi * src_stride, src_stride, + scale, dst_width_words, yf); dst_argb += dst_stride; y += dy; } diff --git a/source/scale_gcc.cc b/source/scale_gcc.cc index 6a2524230..fdd38dfe5 100644 --- a/source/scale_gcc.cc +++ b/source/scale_gcc.cc @@ -183,10 +183,10 @@ void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr, "lea 0x10(%1),%1 \n" "sub $0x10,%2 \n" "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(src_stride) // %3 + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } @@ -283,10 +283,10 @@ void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr, "sub $0x20,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(src_stride) // %3 + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #endif // HAS_SCALEROWDOWN2_AVX2 @@ -326,7 +326,7 @@ void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { - ptrdiff_t stridex3; + intptr_t stridex3; asm volatile( "pcmpeqb %%xmm4,%%xmm4 \n" "pabsw %%xmm4,%%xmm5 \n" @@ -367,11 +367,11 @@ void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr, "lea 0x8(%1),%1 \n" "sub $0x8,%2 \n" "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "=&r"(stridex3) // %3 - : "r"(src_stride) // %4 + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "=&r"(stridex3) // %3 + : "r"((intptr_t)(src_stride)) // %4 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } @@ -456,11 +456,11 @@ void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr, "sub $0x10,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(src_stride), // %3 - "r"(src_stride * 3) // %4 + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(src_stride * 3)) // %4 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif // HAS_SCALEROWDOWN4_AVX2 @@ -557,11 +557,11 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr, "lea 0x18(%1),%1 \n" "sub $0x18,%2 \n" "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(src_stride), // %3 - "m"(kMadd21) // %4 + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "m"(kMadd21) // %4 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } @@ -625,11 +625,11 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr, "lea 0x18(%1),%1 \n" "sub $0x18,%2 \n" "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(src_stride), // %3 - "m"(kMadd21) // %4 + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "m"(kMadd21) // %4 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } @@ -701,10 +701,10 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr, "lea 0x6(%1),%1 \n" "sub $0x6,%2 \n" "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(src_stride) // %3 + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } @@ -762,10 +762,10 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, "lea 0x6(%1),%1 \n" "sub $0x6,%2 \n" "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(src_stride) // %3 + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } @@ -935,11 +935,11 @@ void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr, "lea 0x10(%1),%1 \n" // 8 sample to 16 sample "sub $0x10,%2 \n" "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(src_stride), // %3 - "r"(dst_stride) // %4 + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)) // %4 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } @@ -1084,12 +1084,12 @@ void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t* src_ptr, "lea 0x20(%1),%1 \n" // 8 sample to 16 sample "sub $0x10,%2 \n" "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(src_stride), // %3 - "r"(dst_stride), // %4 - "m"(kLinearShuffleFar) // %5 + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)), // %4 + "m"(kLinearShuffleFar) // %5 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } @@ -1246,11 +1246,11 @@ void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr, "lea 0x10(%1),%1 \n" // 4 pixel to 8 pixel "sub $0x8,%2 \n" "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(src_stride), // %3 - "r"(dst_stride) // %4 + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)) // %4 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } @@ -1371,12 +1371,12 @@ void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr, "lea 0x10(%1),%1 \n" // 8 sample to 16 sample "sub $0x10,%2 \n" "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(src_stride), // %3 - "r"(dst_stride), // %4 - "m"(kLinearMadd31) // %5 + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)), // %4 + "m"(kLinearMadd31) // %5 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } @@ -1497,12 +1497,12 @@ void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr, "sub $0x20,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(src_stride), // %3 - "r"(dst_stride), // %4 - "m"(kLinearMadd31) // %5 + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)), // %4 + "m"(kLinearMadd31) // %5 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } @@ -1612,12 +1612,12 @@ void ScaleRowUp2_Bilinear_12_AVX2(const uint16_t* src_ptr, "sub $0x10,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(src_stride), // %3 - "r"(dst_stride), // %4 - "m"(kLinearShuffleFar) // %5 + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)), // %4 + "m"(kLinearShuffleFar) // %5 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif @@ -1746,11 +1746,11 @@ void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, "sub $0x10,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(src_stride), // %3 - "r"(dst_stride) // %4 + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)) // %4 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } #endif @@ -2016,10 +2016,10 @@ void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb, "lea 0x10(%1),%1 \n" "sub $0x4,%2 \n" "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - : "r"(src_stride) // %3 + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); } @@ -2030,8 +2030,8 @@ void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb, int src_stepx, uint8_t* dst_argb, int dst_width) { - ptrdiff_t src_stepx_x4 = (ptrdiff_t)(src_stepx); - ptrdiff_t src_stepx_x12; + intptr_t src_stepx_x4 = (intptr_t)(src_stepx); + intptr_t src_stepx_x12; (void)src_stride; asm volatile( "lea 0x00(,%1,4),%1 \n" @@ -2067,8 +2067,9 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb, int src_stepx, uint8_t* dst_argb, int dst_width) { - ptrdiff_t src_stepx_x4 = (ptrdiff_t)(src_stepx); - ptrdiff_t src_stepx_x12; + intptr_t src_stepx_x4 = (intptr_t)(src_stepx); + intptr_t src_stepx_x12; + intptr_t row1 = (intptr_t)(src_stride); asm volatile( "lea 0x00(,%1,4),%1 \n" "lea 0x00(%1,%1,2),%4 \n" @@ -2101,7 +2102,7 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb, "+r"(dst_argb), // %2 "+rm"(dst_width), // %3 "=&r"(src_stepx_x12), // %4 - "+r"(src_stride) // %5 + "+r"(row1) // %5 : : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); } @@ -2363,12 +2364,12 @@ void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr, "lea 0x8(%1),%1 \n" // 4 UV "sub $0x4,%2 \n" "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(src_stride), // %3 - "m"(kShuffleSplitUV), // %4 - "m"(kShuffleMergeUV) // %5 + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "m"(kShuffleSplitUV), // %4 + "m"(kShuffleMergeUV) // %5 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif // HAS_SCALEUVROWDOWN2BOX_SSSE3 @@ -2404,12 +2405,12 @@ void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr, "sub $0x8,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(src_stride), // %3 - "m"(kShuffleSplitUV), // %4 - "m"(kShuffleMergeUV) // %5 + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "m"(kShuffleSplitUV), // %4 + "m"(kShuffleMergeUV) // %5 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif // HAS_SCALEUVROWDOWN2BOX_AVX2 @@ -2530,12 +2531,12 @@ void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr, "lea 0x10(%1),%1 \n" // 4 uv to 8 uv "sub $0x8,%2 \n" "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(src_stride), // %3 - "r"(dst_stride), // %4 - "m"(kUVLinearMadd31) // %5 + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)), // %4 + "m"(kUVLinearMadd31) // %5 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } @@ -2654,12 +2655,12 @@ void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr, "sub $0x10,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(src_stride), // %3 - "r"(dst_stride), // %4 - "m"(kUVLinearMadd31) // %5 + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)), // %4 + "m"(kUVLinearMadd31) // %5 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } @@ -2798,11 +2799,11 @@ void ScaleUVRowUp2_Bilinear_16_SSE41(const uint16_t* src_ptr, "lea 0x10(%1),%1 \n" // 2 uv to 4 uv "sub $0x4,%2 \n" "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(src_stride), // %3 - "r"(dst_stride) // %4 + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)) // %4 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } @@ -2929,11 +2930,11 @@ void ScaleUVRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, "sub $0x8,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(src_stride), // %3 - "r"(dst_stride) // %4 + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)) // %4 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } #endif diff --git a/unit_test/convert_argb_test.cc b/unit_test/convert_argb_test.cc index 177f3a669..e9a91804b 100644 --- a/unit_test/convert_argb_test.cc +++ b/unit_test/convert_argb_test.cc @@ -2827,8 +2827,9 @@ TEST_F(LibYUVConvertTest, TestI400LargeSize) { int has_large_malloc = 1; #endif if (!has_large_malloc) { - GTEST_SKIP() << "WARNING: Large allocation may assert for " - << (size_t)kWidth * kHeight << " bytes"; + printf("WARNING: Skipped. Large allocation may assert for %zd\n", + (size_t)kWidth * kHeight); + return; } // Allocate one extra column so that the coalesce optimizations do not trigger @@ -2840,16 +2841,20 @@ TEST_F(LibYUVConvertTest, TestI400LargeSize) { fflush(stdout); align_buffer_page_end(orig_i400, (size_t)kWidth * kHeight); if (!orig_i400) { - GTEST_SKIP() << "WARNING: unable to allocate I400 image of " - << (size_t)kWidth * kHeight << " bytes"; + printf("WARNING: unable to allocate I400 image of %zd bytes\n", + (size_t)kWidth * kHeight); + fflush(stdout); + return; } printf("INFO: allocate I400 image returned %p\n", orig_i400); fflush(stdout); align_buffer_page_end(dest_argb, (size_t)kWidth * kHeight * 4); if (!dest_argb) { + printf("WARNING: unable to allocate ARGB image of %zd bytes\n", + (size_t)kWidth * kHeight * 4); + fflush(stdout); free_aligned_buffer_page_end(orig_i400); - GTEST_SKIP() << "WARNING: unable to allocate ARGB image of " - << (size_t)kWidth * kHeight * 4 << " bytes"; + return; } printf("INFO: allocate ARGB image returned %p\n", dest_argb); fflush(stdout); @@ -2867,72 +2872,4 @@ TEST_F(LibYUVConvertTest, TestI400LargeSize) { #endif // !defined(LEAN_TESTS) - -#define TESTATOBPI(FMT_A, TYPE_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, SUBSAMP_X, \ - SUBSAMP_Y, W1280, N, NEG, OFF) \ - TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##N) { \ - const int kWidth = W1280; \ - const int kHeight = benchmark_height_; \ - const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \ - const int kStrideA = \ - (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \ - const int kStrideY = kWidth; \ - const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X) * 2; \ - const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \ - align_buffer_page_end(src_argb, \ - kStrideA* kHeightA*(int)sizeof(TYPE_A) + OFF); \ - align_buffer_page_end(dst_y_c, kStrideY* kHeight); \ - align_buffer_page_end(dst_uv_c, kSizeUV); \ - align_buffer_page_end(dst_y_opt, kStrideY* kHeight); \ - align_buffer_page_end(dst_uv_opt, kSizeUV); \ - for (int i = 0; i < kStrideA * kHeightA * (int)sizeof(TYPE_A); ++i) { \ - src_argb[i + OFF] = (fastrand() & 0xff); \ - } \ - memset(dst_y_c, 1, kStrideY* kHeight); \ - memset(dst_uv_c, 2, kSizeUV); \ - memset(dst_y_opt, 101, kStrideY* kHeight); \ - memset(dst_uv_opt, 102, kSizeUV); \ - MaskCpuFlags(disable_cpu_flags_); \ - FMT_A##To##FMT_B((TYPE_A*)(src_argb + OFF), kStrideA, dst_y_c, kStrideY, \ - dst_uv_c, kStrideUV, kWidth, NEG kHeight); \ - MaskCpuFlags(benchmark_cpu_info_); \ - for (int i = 0; i < benchmark_iterations_; ++i) { \ - FMT_A##To##FMT_B((TYPE_A*)(src_argb + OFF), kStrideA, dst_y_opt, \ - kStrideY, dst_uv_opt, kStrideUV, kWidth, NEG kHeight); \ - } \ - for (int i = 0; i < kStrideY * kHeight; ++i) { \ - EXPECT_EQ(dst_y_c[i], dst_y_opt[i]); \ - } \ - for (int i = 0; i < kSizeUV; ++i) { \ - EXPECT_EQ(dst_uv_c[i], dst_uv_opt[i]); \ - } \ - free_aligned_buffer_page_end(src_argb); \ - free_aligned_buffer_page_end(dst_y_c); \ - free_aligned_buffer_page_end(dst_uv_c); \ - free_aligned_buffer_page_end(dst_y_opt); \ - free_aligned_buffer_page_end(dst_uv_opt); \ - } - -#if defined(ENABLE_FULL_TESTS) -#define TESTATOBP(FMT_A, TYPE_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, SUBSAMP_X, \ - SUBSAMP_Y) \ - TESTATOBPI(FMT_A, TYPE_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, SUBSAMP_X, \ - SUBSAMP_Y, benchmark_width_ + 1, _Any, +, 0) \ - TESTATOBPI(FMT_A, TYPE_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, SUBSAMP_X, \ - SUBSAMP_Y, benchmark_width_, _Unaligned, +, 4) \ - TESTATOBPI(FMT_A, TYPE_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, SUBSAMP_X, \ - SUBSAMP_Y, benchmark_width_, _Invert, -, 0) \ - TESTATOBPI(FMT_A, TYPE_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, SUBSAMP_X, \ - SUBSAMP_Y, benchmark_width_, _Opt, +, 0) -#else -#define TESTATOBP(FMT_A, TYPE_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, SUBSAMP_X, \ - SUBSAMP_Y) \ - TESTATOBPI(FMT_A, TYPE_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, SUBSAMP_X, \ - SUBSAMP_Y, benchmark_width_, _Opt, +, 0) -#endif - -TESTATOBP(RAW, uint8_t, 3, 3, 1, NV21, 2, 2) -TESTATOBP(RGB24, uint8_t, 3, 3, 1, NV12, 2, 2) -TESTATOBP(RAW, uint8_t, 3, 3, 1, JNV21, 2, 2) - } // namespace libyuv diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc index 3d5ce3799..f5c9c6259 100644 --- a/unit_test/convert_test.cc +++ b/unit_test/convert_test.cc @@ -825,6 +825,7 @@ TESTATOBP(ARGB, 1, 4, NV12, 2, 2) TESTATOBP(ARGB, 1, 4, NV21, 2, 2) TESTATOBP(ABGR, 1, 4, NV12, 2, 2) TESTATOBP(ABGR, 1, 4, NV21, 2, 2) +TESTATOBP(RAW, 1, 3, JNV21, 2, 2) TESTATOBP(YUY2, 2, 4, NV12, 2, 2) TESTATOBP(UYVY, 2, 4, NV12, 2, 2) TESTATOBP(AYUV, 1, 4, NV12, 2, 2) diff --git a/unit_test/rotate_test.cc b/unit_test/rotate_test.cc index abc08efa8..9a9a4a305 100644 --- a/unit_test/rotate_test.cc +++ b/unit_test/rotate_test.cc @@ -892,11 +892,6 @@ TEST_F(LibYUVRotateTest, Transpose4x4_Test) { Transpose4x4_32_NEON((const uint8_t*)src_pixels, height * 4, (uint8_t*)dst_pixels_opt, width * 4, width); } else -#elif defined(HAS_TRANSPOSE4X4_32_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - Transpose4x4_32_SSE2((const uint8_t*)src_pixels, height * 4, - (uint8_t*)dst_pixels_opt, width * 4, width); - } else #endif { Transpose4x4_32_C((const uint8_t*)src_pixels, height * 4, diff --git a/unit_test/scale_plane_test.cc b/unit_test/scale_plane_test.cc index 979c70aad..3e801f250 100644 --- a/unit_test/scale_plane_test.cc +++ b/unit_test/scale_plane_test.cc @@ -8,14 +8,9 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include -#include #include -#include #include -#include - #include "../unit_test/unit_test.h" #include "libyuv/cpu_id.h" #include "libyuv/scale.h" @@ -43,95 +38,6 @@ namespace libyuv { #ifdef ENABLE_ROW_TESTS -#ifdef HAS_SCALEROWDOWN2_SSSE3 -TEST_F(LibYUVScaleTest, TestScaleRowDown2Box_Odd_SSSE3) { - SIMD_ALIGNED(uint8_t orig_pixels[128 * 2]); - SIMD_ALIGNED(uint8_t dst_pixels_opt[64]); - SIMD_ALIGNED(uint8_t dst_pixels_c[64]); - memset(orig_pixels, 0, sizeof(orig_pixels)); - memset(dst_pixels_opt, 0, sizeof(dst_pixels_opt)); - memset(dst_pixels_c, 0, sizeof(dst_pixels_c)); - - int has_ssse3 = TestCpuFlag(kCpuHasSSSE3); - if (!has_ssse3) { - printf("Warning SSSE3 not detected; Skipping test.\n"); - } else { - // TL. - orig_pixels[0] = 255u; - orig_pixels[1] = 0u; - orig_pixels[128 + 0] = 0u; - orig_pixels[128 + 1] = 0u; - // TR. - orig_pixels[2] = 0u; - orig_pixels[3] = 100u; - orig_pixels[128 + 2] = 0u; - orig_pixels[128 + 3] = 0u; - // BL. - orig_pixels[4] = 0u; - orig_pixels[5] = 0u; - orig_pixels[128 + 4] = 50u; - orig_pixels[128 + 5] = 0u; - // BR. - orig_pixels[6] = 0u; - orig_pixels[7] = 0u; - orig_pixels[128 + 6] = 0u; - orig_pixels[128 + 7] = 20u; - // Odd. - orig_pixels[126] = 4u; - orig_pixels[127] = 255u; - orig_pixels[128 + 126] = 16u; - orig_pixels[128 + 127] = 255u; - - // Test regular half size. - ScaleRowDown2Box_C(orig_pixels, 128, dst_pixels_c, 64); - - EXPECT_EQ(64u, dst_pixels_c[0]); - EXPECT_EQ(25u, dst_pixels_c[1]); - EXPECT_EQ(13u, dst_pixels_c[2]); - EXPECT_EQ(5u, dst_pixels_c[3]); - EXPECT_EQ(0u, dst_pixels_c[4]); - EXPECT_EQ(133u, dst_pixels_c[63]); - - // Test Odd width version - Last pixel is just 1 horizontal pixel. - ScaleRowDown2Box_Odd_C(orig_pixels, 128, dst_pixels_c, 64); - - EXPECT_EQ(64u, dst_pixels_c[0]); - EXPECT_EQ(25u, dst_pixels_c[1]); - EXPECT_EQ(13u, dst_pixels_c[2]); - EXPECT_EQ(5u, dst_pixels_c[3]); - EXPECT_EQ(0u, dst_pixels_c[4]); - EXPECT_EQ(10u, dst_pixels_c[63]); - - // Test one pixel less, should skip the last pixel. - memset(dst_pixels_c, 0, sizeof(dst_pixels_c)); - ScaleRowDown2Box_Odd_C(orig_pixels, 128, dst_pixels_c, 63); - - EXPECT_EQ(64u, dst_pixels_c[0]); - EXPECT_EQ(25u, dst_pixels_c[1]); - EXPECT_EQ(13u, dst_pixels_c[2]); - EXPECT_EQ(5u, dst_pixels_c[3]); - EXPECT_EQ(0u, dst_pixels_c[4]); - EXPECT_EQ(0u, dst_pixels_c[63]); - - // Test regular half size SSSE3. - ScaleRowDown2Box_SSSE3(orig_pixels, 128, dst_pixels_opt, 64); - - EXPECT_EQ(64u, dst_pixels_opt[0]); - EXPECT_EQ(25u, dst_pixels_opt[1]); - EXPECT_EQ(13u, dst_pixels_opt[2]); - EXPECT_EQ(5u, dst_pixels_opt[3]); - EXPECT_EQ(0u, dst_pixels_opt[4]); - EXPECT_EQ(133u, dst_pixels_opt[63]); - - // Compare C and SSSE3 match. - ScaleRowDown2Box_Odd_C(orig_pixels, 128, dst_pixels_c, 64); - ScaleRowDown2Box_Odd_SSSE3(orig_pixels, 128, dst_pixels_opt, 64); - for (int i = 0; i < 64; ++i) { - EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); - } - } -} -#endif // HAS_SCALEROWDOWN2_SSSE3 TEST_F(LibYUVScaleTest, TestScaleRowDown2Box_16) { SIMD_ALIGNED(uint16_t orig_pixels[2560 * 2]); @@ -467,71 +373,4 @@ TEST_F(LibYUVScaleTest, PlaneTest1_16_Box) { free_aligned_buffer_page_end(dst_pixels_alloc); free_aligned_buffer_page_end(orig_pixels_alloc); } - -// POC: int * int overflow in ScalePlaneVertical (scale_common.cc). -// -// `yi * src_stride` is evaluated as int * int. When the product exceeds -// INT_MAX it wraps negative and InterpolateRow reads from BEFORE the -// source allocation. -// -// Parameters: -// - dst_width == src_width -// -> ScalePlane dispatches to ScalePlaneVertical -// - src_height == 5, dst_height == 1 -// -> single iteration with yi == 2 -// - src_stride == 0x7FFFFFF8 -// -> 2 * 0x7FFFFFF8 == 0xFFFFFFF0 == -16 (int) -// -// The source buffer is sized so that the *correct* 64-bit offset -// (2 * 0x7FFFFFF8 == 4294967280) plus kWidth bytes is in-bounds. With the -// bug, the 32-bit product is -16 and ASAN reports a heap-buffer-overflow -// READ "16 bytes before" the allocation. -TEST_F(LibYUVScaleTest, ScalePlaneVertical_IntStrideOverflow) { - const int kWidth = 16; - const int kSrcHeight = 5; - const int kDstHeight = 1; - const int kStride = 0x7FFFFFF8; // 2147483640 - - // src_size is big enough for the only row this call legitimately touches - // (yi == 2) when computed in 64-bit: 2 * stride + width = 4 GiB. - size_t src_size = kStride; - if (src_size > SIZE_MAX / 2) { - GTEST_SKIP() << "could not represent allocation size in size_t"; - } - src_size *= 2; - if (src_size > SIZE_MAX - kWidth) { - GTEST_SKIP() << "could not represent allocation size in size_t"; - } - src_size += kWidth; - -#if defined(__aarch64__) - // Infer malloc can accept a large size for cpu with dot product (a76/a55) - int has_large_malloc = TestCpuFlag(kCpuHasNeonDotProd); -#else - int has_large_malloc = 1; -#endif - if (!has_large_malloc) { - GTEST_SKIP() << "large allocation may assert for " << src_size << " bytes"; - } - - uint8_t* src = new (std::nothrow) uint8_t[src_size]; - if (!src) { - GTEST_SKIP() << "could not allocate " << src_size << " bytes"; - } - uint8_t* dst = new uint8_t[kWidth]; - memset(dst, 0, kWidth); - - // Force the scalar path so the crash site is deterministic - // (InterpolateRow_C -> memcpy when yf == 0). - MaskCpuFlags(disable_cpu_flags_); - - int r = ScalePlane(src, kStride, kWidth, kSrcHeight, dst, kWidth, kWidth, - kDstHeight, kFilterNone); - - // Not reached under ASAN. - EXPECT_EQ(0, r); - delete[] src; - delete[] dst; -} - } // namespace libyuv