diff --git a/include/libyuv/row.h b/include/libyuv/row.h index a622e3185..6519fd801 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -421,7 +421,9 @@ extern "C" { #define HAS_ABGRTOUVJROW_NEON #define HAS_ABGRTOUVROW_NEON #define HAS_ABGRTOYJROW_NEON +#define HAS_ABGRTOYJROW_NEON_DOTPROD #define HAS_ABGRTOYROW_NEON +#define HAS_ABGRTOYROW_NEON_DOTPROD #define HAS_AR64TOARGBROW_NEON #define HAS_ARGB1555TOARGBROW_NEON #define HAS_ARGB1555TOUVROW_NEON @@ -443,12 +445,15 @@ extern "C" { #define HAS_ARGBTOUVJROW_NEON #define HAS_ARGBTOUVROW_NEON #define HAS_ARGBTOYJROW_NEON +#define HAS_ARGBTOYJROW_NEON_DOTPROD #define HAS_ARGBTOYROW_NEON +#define HAS_ARGBTOYROW_NEON_DOTPROD #define HAS_AYUVTOUVROW_NEON #define HAS_AYUVTOVUROW_NEON #define HAS_AYUVTOYROW_NEON #define HAS_BGRATOUVROW_NEON #define HAS_BGRATOYROW_NEON +#define HAS_BGRATOYROW_NEON_DOTPROD #define HAS_BYTETOFLOATROW_NEON #define HAS_CONVERT16TO8ROW_NEON #define HAS_COPYROW_NEON @@ -512,7 +517,9 @@ extern "C" { #define HAS_RGB565TOYROW_NEON #define HAS_RGBATOUVROW_NEON #define HAS_RGBATOYJROW_NEON +#define HAS_RGBATOYJROW_NEON_DOTPROD #define HAS_RGBATOYROW_NEON +#define HAS_RGBATOYROW_NEON_DOTPROD #define HAS_SETROW_NEON #define HAS_SPLITARGBROW_NEON #define HAS_SPLITRGBROW_NEON @@ -1409,6 +1416,18 @@ void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width); void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_yj, int width); void ABGRToYJRow_NEON(const uint8_t* src_abgr, uint8_t* dst_yj, int width); void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_yj, int width); +void ARGBToYRow_NEON_DotProd(const uint8_t* src_argb, + uint8_t* dst_y, + int width); +void ARGBToYJRow_NEON_DotProd(const uint8_t* src_argb, + uint8_t* dst_yj, + int width); +void ABGRToYJRow_NEON_DotProd(const uint8_t* src_abgr, + uint8_t* dst_yj, + int width); +void RGBAToYJRow_NEON_DotProd(const uint8_t* src_rgba, + uint8_t* dst_yj, + int width); void ARGBToYRow_RVV(const uint8_t* src_argb, 
uint8_t* dst_y, int width); void ARGBToYJRow_RVV(const uint8_t* src_argb, uint8_t* dst_yj, int width); void ABGRToYJRow_RVV(const uint8_t* src_rgba, uint8_t* dst_yj, int width); @@ -1632,6 +1651,15 @@ void RAWToUVRow_LASX(const uint8_t* src_raw, void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width); void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width); void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width); +void BGRAToYRow_NEON_DotProd(const uint8_t* src_bgra, + uint8_t* dst_y, + int width); +void ABGRToYRow_NEON_DotProd(const uint8_t* src_abgr, + uint8_t* dst_y, + int width); +void RGBAToYRow_NEON_DotProd(const uint8_t* src_rgba, + uint8_t* dst_y, + int width); void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width); void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width); void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width); @@ -1709,12 +1737,33 @@ void RAWToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGB24ToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RAWToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGBToYRow_Any_NEON_DotProd(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); void ARGBToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGBToYJRow_Any_NEON_DotProd(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); void ABGRToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ABGRToYJRow_Any_NEON_DotProd(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); void RGBAToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGBAToYJRow_Any_NEON_DotProd(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); void BGRAToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void 
BGRAToYRow_Any_NEON_DotProd(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); void ABGRToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ABGRToYRow_Any_NEON_DotProd(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); void RGBAToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGBAToYRow_Any_NEON_DotProd(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); void RGB24ToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGB24ToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RAWToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); diff --git a/source/convert.cc b/source/convert.cc index 2aa1865ac..e852a90cd 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -1904,6 +1904,14 @@ int ARGBToI420(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_NEON_DOTPROD) + if (TestCpuFlag(kCpuHasNeonDotProd)) { + ARGBToYRow = ARGBToYRow_Any_NEON_DotProd; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_NEON_DotProd; + } + } +#endif #if defined(HAS_ARGBTOUVROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToUVRow = ARGBToUVRow_Any_NEON; @@ -2066,6 +2074,14 @@ int ARGBToI420Alpha(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_NEON_DOTPROD) + if (TestCpuFlag(kCpuHasNeonDotProd)) { + ARGBToYRow = ARGBToYRow_Any_NEON_DotProd; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_NEON_DotProd; + } + } +#endif #if defined(HAS_ARGBTOUVROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToUVRow = ARGBToUVRow_Any_NEON; @@ -2229,6 +2245,14 @@ int BGRAToI420(const uint8_t* src_bgra, } } #endif +#if defined(HAS_BGRATOYROW_NEON_DOTPROD) + if (TestCpuFlag(kCpuHasNeonDotProd)) { + BGRAToYRow = BGRAToYRow_Any_NEON_DotProd; + if (IS_ALIGNED(width, 16)) { + BGRAToYRow = BGRAToYRow_NEON_DotProd; + } + } +#endif #if defined(HAS_BGRATOUVROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { BGRAToUVRow = BGRAToUVRow_Any_NEON; @@ -2388,6 +2412,14 @@ 
int ABGRToI420(const uint8_t* src_abgr, } } #endif +#if defined(HAS_ABGRTOYROW_NEON_DOTPROD) + if (TestCpuFlag(kCpuHasNeonDotProd)) { + ABGRToYRow = ABGRToYRow_Any_NEON_DotProd; + if (IS_ALIGNED(width, 16)) { + ABGRToYRow = ABGRToYRow_NEON_DotProd; + } + } +#endif #if defined(HAS_ABGRTOUVROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ABGRToUVRow = ABGRToUVRow_Any_NEON; @@ -2497,6 +2529,14 @@ int RGBAToI420(const uint8_t* src_rgba, } } #endif +#if defined(HAS_RGBATOYROW_NEON_DOTPROD) + if (TestCpuFlag(kCpuHasNeonDotProd)) { + RGBAToYRow = RGBAToYRow_Any_NEON_DotProd; + if (IS_ALIGNED(width, 16)) { + RGBAToYRow = RGBAToYRow_NEON_DotProd; + } + } +#endif #if defined(HAS_RGBATOUVROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { RGBAToUVRow = RGBAToUVRow_Any_NEON; diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc index b45de8c8f..6c361c05a 100644 --- a/source/convert_from_argb.cc +++ b/source/convert_from_argb.cc @@ -116,6 +116,14 @@ int ARGBToI444(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_NEON_DOTPROD) + if (TestCpuFlag(kCpuHasNeonDotProd)) { + ARGBToYRow = ARGBToYRow_Any_NEON_DotProd; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_NEON_DotProd; + } + } +#endif #if defined(HAS_ARGBTOYROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ARGBToYRow = ARGBToYRow_Any_MSA; @@ -231,6 +239,14 @@ int ARGBToI422(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_NEON_DOTPROD) + if (TestCpuFlag(kCpuHasNeonDotProd)) { + ARGBToYRow = ARGBToYRow_Any_NEON_DotProd; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_NEON_DotProd; + } + } +#endif #if defined(HAS_ARGBTOUVROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToUVRow = ARGBToUVRow_Any_NEON; @@ -331,6 +347,14 @@ int ARGBToNV12(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_NEON_DOTPROD) + if (TestCpuFlag(kCpuHasNeonDotProd)) { + ARGBToYRow = ARGBToYRow_Any_NEON_DotProd; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_NEON_DotProd; + } + } +#endif #if 
defined(HAS_ARGBTOUVROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToUVRow = ARGBToUVRow_Any_NEON; @@ -553,6 +577,14 @@ int ARGBToNV21(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_NEON_DOTPROD) + if (TestCpuFlag(kCpuHasNeonDotProd)) { + ARGBToYRow = ARGBToYRow_Any_NEON_DotProd; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_NEON_DotProd; + } + } +#endif #if defined(HAS_ARGBTOUVROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToUVRow = ARGBToUVRow_Any_NEON; @@ -752,6 +784,14 @@ int ABGRToNV12(const uint8_t* src_abgr, } } #endif +#if defined(HAS_ABGRTOYROW_NEON_DOTPROD) + if (TestCpuFlag(kCpuHasNeonDotProd)) { + ABGRToYRow = ABGRToYRow_Any_NEON_DotProd; + if (IS_ALIGNED(width, 16)) { + ABGRToYRow = ABGRToYRow_NEON_DotProd; + } + } +#endif #if defined(HAS_ABGRTOUVROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ABGRToUVRow = ABGRToUVRow_Any_NEON; @@ -940,6 +980,14 @@ int ABGRToNV21(const uint8_t* src_abgr, } } #endif +#if defined(HAS_ABGRTOYROW_NEON_DOTPROD) + if (TestCpuFlag(kCpuHasNeonDotProd)) { + ABGRToYRow = ABGRToYRow_Any_NEON_DotProd; + if (IS_ALIGNED(width, 16)) { + ABGRToYRow = ABGRToYRow_NEON_DotProd; + } + } +#endif #if defined(HAS_ABGRTOUVROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ABGRToUVRow = ABGRToUVRow_Any_NEON; @@ -1133,6 +1181,14 @@ int ARGBToYUY2(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_NEON_DOTPROD) + if (TestCpuFlag(kCpuHasNeonDotProd)) { + ARGBToYRow = ARGBToYRow_Any_NEON_DotProd; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_NEON_DotProd; + } + } +#endif #if defined(HAS_ARGBTOUVROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToUVRow = ARGBToUVRow_Any_NEON; @@ -1329,6 +1385,14 @@ int ARGBToUYVY(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_NEON_DOTPROD) + if (TestCpuFlag(kCpuHasNeonDotProd)) { + ARGBToYRow = ARGBToYRow_Any_NEON_DotProd; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_NEON_DotProd; + } + } +#endif #if defined(HAS_ARGBTOUVROW_NEON) if 
(TestCpuFlag(kCpuHasNEON)) { ARGBToUVRow = ARGBToUVRow_Any_NEON; @@ -1501,6 +1565,14 @@ int ARGBToI400(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_NEON_DOTPROD) + if (TestCpuFlag(kCpuHasNeonDotProd)) { + ARGBToYRow = ARGBToYRow_Any_NEON_DotProd; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_NEON_DotProd; + } + } +#endif #if defined(HAS_ARGBTOYROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ARGBToYRow = ARGBToYRow_Any_MSA; @@ -2248,6 +2320,14 @@ int ARGBToJ420(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYJROW_NEON_DOTPROD) + if (TestCpuFlag(kCpuHasNeonDotProd)) { + ARGBToYJRow = ARGBToYJRow_Any_NEON_DotProd; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_NEON_DotProd; + } + } +#endif #if defined(HAS_ARGBTOUVJROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToUVJRow = ARGBToUVJRow_Any_NEON; @@ -2416,6 +2496,14 @@ int ARGBToJ422(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYJROW_NEON_DOTPROD) + if (TestCpuFlag(kCpuHasNeonDotProd)) { + ARGBToYJRow = ARGBToYJRow_Any_NEON_DotProd; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_NEON_DotProd; + } + } +#endif #if defined(HAS_ARGBTOUVJROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToUVJRow = ARGBToUVJRow_Any_NEON; @@ -2522,6 +2610,14 @@ int ARGBToJ400(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYJROW_NEON_DOTPROD) + if (TestCpuFlag(kCpuHasNeonDotProd)) { + ARGBToYJRow = ARGBToYJRow_Any_NEON_DotProd; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_NEON_DotProd; + } + } +#endif #if defined(HAS_ARGBTOYJROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ARGBToYJRow = ARGBToYJRow_Any_MSA; @@ -2593,6 +2689,14 @@ int RGBAToJ400(const uint8_t* src_rgba, } } #endif +#if defined(HAS_RGBATOYJROW_NEON_DOTPROD) + if (TestCpuFlag(kCpuHasNeonDotProd)) { + RGBAToYJRow = RGBAToYJRow_Any_NEON_DotProd; + if (IS_ALIGNED(width, 16)) { + RGBAToYJRow = RGBAToYJRow_NEON_DotProd; + } + } +#endif #if defined(HAS_RGBATOYJROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { 
RGBAToYJRow = RGBAToYJRow_Any_MSA; @@ -2698,6 +2802,14 @@ int ABGRToJ420(const uint8_t* src_abgr, } } #endif +#if defined(HAS_ABGRTOYJROW_NEON_DOTPROD) + if (TestCpuFlag(kCpuHasNeonDotProd)) { + ABGRToYJRow = ABGRToYJRow_Any_NEON_DotProd; + if (IS_ALIGNED(width, 16)) { + ABGRToYJRow = ABGRToYJRow_NEON_DotProd; + } + } +#endif #if defined(HAS_ABGRTOUVJROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ABGRToUVJRow = ABGRToUVJRow_Any_NEON; @@ -2828,6 +2940,14 @@ int ABGRToJ422(const uint8_t* src_abgr, } } #endif +#if defined(HAS_ABGRTOYJROW_NEON_DOTPROD) + if (TestCpuFlag(kCpuHasNeonDotProd)) { + ABGRToYJRow = ABGRToYJRow_Any_NEON_DotProd; + if (IS_ALIGNED(width, 16)) { + ABGRToYJRow = ABGRToYJRow_NEON_DotProd; + } + } +#endif #if defined(HAS_ABGRTOUVJROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ABGRToUVJRow = ABGRToUVJRow_Any_NEON; @@ -2930,6 +3050,14 @@ int ABGRToJ400(const uint8_t* src_abgr, } } #endif +#if defined(HAS_ABGRTOYJROW_NEON_DOTPROD) + if (TestCpuFlag(kCpuHasNeonDotProd)) { + ABGRToYJRow = ABGRToYJRow_Any_NEON_DotProd; + if (IS_ALIGNED(width, 16)) { + ABGRToYJRow = ABGRToYJRow_NEON_DotProd; + } + } +#endif #if defined(HAS_ABGRTOYJROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ABGRToYJRow = ABGRToYJRow_Any_MSA; diff --git a/source/row_any.cc b/source/row_any.cc index e574543cc..8ed5a49c2 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -1072,6 +1072,9 @@ ANY11(RGBAToYJRow_Any_SSSE3, RGBAToYJRow_SSSE3, 0, 4, 1, 15) #ifdef HAS_ARGBTOYROW_NEON ANY11(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 0, 4, 1, 15) #endif +#ifdef HAS_ARGBTOYROW_NEON_DOTPROD +ANY11(ARGBToYRow_Any_NEON_DotProd, ARGBToYRow_NEON_DotProd, 0, 4, 1, 15) +#endif #ifdef HAS_ARGBTOYROW_MSA ANY11(ARGBToYRow_Any_MSA, ARGBToYRow_MSA, 0, 4, 1, 15) #endif @@ -1084,12 +1087,21 @@ ANY11(ARGBToYRow_Any_LASX, ARGBToYRow_LASX, 0, 4, 1, 31) #ifdef HAS_ARGBTOYJROW_NEON ANY11(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 0, 4, 1, 15) #endif +#ifdef HAS_ARGBTOYJROW_NEON_DOTPROD +ANY11(ARGBToYJRow_Any_NEON_DotProd, 
ARGBToYJRow_NEON_DotProd, 0, 4, 1, 15) +#endif #ifdef HAS_ABGRTOYJROW_NEON ANY11(ABGRToYJRow_Any_NEON, ABGRToYJRow_NEON, 0, 4, 1, 15) #endif +#ifdef HAS_ABGRTOYJROW_NEON_DOTPROD +ANY11(ABGRToYJRow_Any_NEON_DotProd, ABGRToYJRow_NEON_DotProd, 0, 4, 1, 15) +#endif #ifdef HAS_RGBATOYJROW_NEON ANY11(RGBAToYJRow_Any_NEON, RGBAToYJRow_NEON, 0, 4, 1, 15) #endif +#ifdef HAS_RGBATOYJROW_NEON_DOTPROD +ANY11(RGBAToYJRow_Any_NEON_DotProd, RGBAToYJRow_NEON_DotProd, 0, 4, 1, 15) +#endif #ifdef HAS_ARGBTOYJROW_MSA ANY11(ARGBToYJRow_Any_MSA, ARGBToYJRow_MSA, 0, 4, 1, 15) #endif @@ -1114,6 +1126,9 @@ ANY11(ABGRToYJRow_Any_LASX, ABGRToYJRow_LASX, 0, 4, 1, 31) #ifdef HAS_BGRATOYROW_NEON ANY11(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, 0, 4, 1, 15) #endif +#ifdef HAS_BGRATOYROW_NEON_DOTPROD +ANY11(BGRAToYRow_Any_NEON_DotProd, BGRAToYRow_NEON_DotProd, 0, 4, 1, 15) +#endif #ifdef HAS_BGRATOYROW_MSA ANY11(BGRAToYRow_Any_MSA, BGRAToYRow_MSA, 0, 4, 1, 15) #endif @@ -1126,6 +1141,9 @@ ANY11(BGRAToYRow_Any_LASX, BGRAToYRow_LASX, 0, 4, 1, 31) #ifdef HAS_ABGRTOYROW_NEON ANY11(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, 0, 4, 1, 15) #endif +#ifdef HAS_ABGRTOYROW_NEON_DOTPROD +ANY11(ABGRToYRow_Any_NEON_DotProd, ABGRToYRow_NEON_DotProd, 0, 4, 1, 15) +#endif #ifdef HAS_ABGRTOYROW_MSA ANY11(ABGRToYRow_Any_MSA, ABGRToYRow_MSA, 0, 4, 1, 7) #endif @@ -1138,6 +1156,9 @@ ANY11(ABGRToYRow_Any_LASX, ABGRToYRow_LASX, 0, 4, 1, 31) #ifdef HAS_RGBATOYROW_NEON ANY11(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, 0, 4, 1, 15) #endif +#ifdef HAS_RGBATOYROW_NEON_DOTPROD +ANY11(RGBAToYRow_Any_NEON_DotProd, RGBAToYRow_NEON_DotProd, 0, 4, 1, 15) +#endif #ifdef HAS_RGBATOYROW_MSA ANY11(RGBAToYRow_Any_MSA, RGBAToYRow_MSA, 0, 4, 1, 15) #endif diff --git a/source/row_neon64.cc b/source/row_neon64.cc index 4920292cf..e06f65e56 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -3029,12 +3029,49 @@ void ARGBToYMatrixRow_NEON(const uint8_t* src_argb, "v17"); } +void +ARGBToYMatrixRow_NEON_DotProd(const uint8_t* src_argb, + 
uint8_t* dst_y, + int width, + const struct RgbConstants* rgbconstants) { + asm volatile( + "ldr d0, [%3] \n" // load the first 8 bytes of rgbconstants + "dup v16.4s, v0.s[0] \n" // bytes 0..3 (presumably the 4 byte weights) into every 32-bit lane + "dup v17.8h, v0.h[2] \n" // bytes 4..5 (presumably the 16-bit add term) into every 16-bit lane + "1: \n" // each pass reads 64 source bytes and writes 16 Y bytes + "ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [%0], #64 \n" // load 16 + // pixels. + "subs %w2, %w2, #16 \n" // 16 processed per loop. + "movi v0.16b, #0 \n" // v0..v3: zero the udot accumulators + "movi v1.16b, #0 \n" + "movi v2.16b, #0 \n" + "movi v3.16b, #0 \n" + "udot v0.4s, v4.16b, v16.16b \n" // per 32-bit lane: sum of 4 pixel bytes * 4 weights + "udot v1.4s, v5.16b, v16.16b \n" + "udot v2.4s, v6.16b, v16.16b \n" + "udot v3.4s, v7.16b, v16.16b \n" + "uzp1 v0.8h, v0.8h, v1.8h \n" // keep the low 16 bits of each 32-bit sum + "uzp1 v1.8h, v2.8h, v3.8h \n" // (the weights visible in this file sum to at most 256) + "addhn v0.8b, v0.8h, v17.8h \n" // add v17, keep the high byte: (sum + add) >> 8 + "addhn v1.8b, v1.8h, v17.8h \n" + "st1 {v0.8b, v1.8b}, [%1], #16 \n" // store 16 pixels Y. 
+ "b.gt 1b \n" // repeat while the remaining width is > 0 + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(rgbconstants) // %3 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17"); +} + // RGB to JPeg coefficients // B * 0.1140 coefficient = 29 // G * 0.5870 coefficient = 150 // R * 0.2990 coefficient = 77 // Add 0.5 = 0x80 static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, 128}; +static const struct RgbConstants kRgb24JPEGDotProdConstants = {{0, 29, 150, 77}, + 128}; // kRgb24JPEGConstants with the zero weight moved to byte 0 static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128}; @@ -3046,8 +3083,12 @@ static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128}; static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0}, 0x1080}; +static const struct RgbConstants kRgb24I601DotProdConstants = {{0, 25, 129, 66}, + 0x1080}; // kRgb24I601Constants with the zero weight moved to byte 0 static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0}, 0x1080}; +static const struct RgbConstants kRawI601DotProdConstants = {{0, 66, 129, 25}, + 0x1080}; // kRawI601Constants with the zero weight moved to byte 0 void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { ARGBToYMatrixRow_NEON(src_argb, dst_y, width, &kRgb24I601Constants); @@ -3065,6 +3106,30 @@ void ABGRToYJRow_NEON(const uint8_t* src_abgr, 
uint8_t* dst_yj, int width) { ARGBToYMatrixRow_NEON(src_abgr, dst_yj, width, &kRawJPEGConstants); } +void ARGBToYRow_NEON_DotProd(const uint8_t* src_argb, + uint8_t* dst_y, + int width) { + ARGBToYMatrixRow_NEON_DotProd(src_argb, dst_y, width, &kRgb24I601Constants); +} + +void ARGBToYJRow_NEON_DotProd(const uint8_t* src_argb, + uint8_t* dst_yj, + int width) { + ARGBToYMatrixRow_NEON_DotProd(src_argb, dst_yj, width, &kRgb24JPEGConstants); +} + +void ABGRToYRow_NEON_DotProd(const uint8_t* src_abgr, + uint8_t* dst_y, + int width) { + ARGBToYMatrixRow_NEON_DotProd(src_abgr, dst_y, width, &kRawI601Constants); +} + +void ABGRToYJRow_NEON_DotProd(const uint8_t* src_abgr, + uint8_t* dst_yj, + int width) { + ARGBToYMatrixRow_NEON_DotProd(src_abgr, dst_yj, width, &kRawJPEGConstants); +} + // RGBA expects first value to be A and ignored, then 3 values to contain RGB. // Same code as ARGB, except the LD4 void RGBAToYMatrixRow_NEON(const uint8_t* src_rgba, @@ -3112,6 +3177,33 @@ void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) { RGBAToYMatrixRow_NEON(src_bgra, dst_y, width, &kRawI601Constants); } +void RGBAToYRow_NEON_DotProd(const uint8_t* src_rgba, + uint8_t* dst_y, + int width) { + // RGBA has A in byte 0, so the ARGB dot-product kernel is reused with the + // I601 weights moved to bytes 1..3 and a zero weight on byte 0. + ARGBToYMatrixRow_NEON_DotProd(src_rgba, dst_y, width, + &kRgb24I601DotProdConstants); +} + +void RGBAToYJRow_NEON_DotProd(const uint8_t* src_rgba, + uint8_t* dst_yj, + int width) { + // RGBA has A in byte 0, so the ARGB dot-product kernel is reused with the + // JPEG weights moved to bytes 1..3 and a zero weight on byte 0. + ARGBToYMatrixRow_NEON_DotProd(src_rgba, dst_yj, width, + &kRgb24JPEGDotProdConstants); +} + +void BGRAToYRow_NEON_DotProd(const uint8_t* src_bgra, + uint8_t* dst_y, + int width) { + // BGRA is handled like RGBA: the ARGB dot-product kernel is reused with the + // Raw I601 weights moved to bytes 1..3 and a zero weight on byte 0. 
+ ARGBToYMatrixRow_NEON_DotProd(src_bgra, dst_y, width, + &kRawI601DotProdConstants); +} + void RGBToYMatrixRow_NEON(const uint8_t* src_rgb, uint8_t* dst_y, int width,