diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 6e8f46f5c..46cec2723 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -551,9 +551,15 @@ extern "C" {
 #define HAS_ARGBGRAYROW_NEON_DOTPROD
 #define HAS_ARGBSEPIAROW_NEON_DOTPROD
+#define HAS_ABGRTOUVJROW_NEON_I8MM
+#define HAS_ABGRTOUVROW_NEON_I8MM
 #define HAS_ARGBCOLORMATRIXROW_NEON_I8MM
 #define HAS_ARGBTOUV444ROW_NEON_I8MM
 #define HAS_ARGBTOUVJ444ROW_NEON_I8MM
+#define HAS_ARGBTOUVJROW_NEON_I8MM
+#define HAS_ARGBTOUVROW_NEON_I8MM
+#define HAS_BGRATOUVROW_NEON_I8MM
+#define HAS_RGBATOUVROW_NEON_I8MM
 #endif
 
 // The following are available on AArch64 SVE platforms:
@@ -1912,6 +1918,11 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width);
+void ARGBToUVRow_NEON_I8MM(const uint8_t* src_argb,
+                           int src_stride_argb,
+                           uint8_t* dst_u,
+                           uint8_t* dst_v,
+                           int width);
 void ARGBToUVRow_SVE2(const uint8_t* src_argb,
                       int src_stride_argb,
                       uint8_t* dst_u,
@@ -1949,6 +1960,11 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb,
                        uint8_t* dst_u,
                        uint8_t* dst_v,
                        int width);
+void ARGBToUVJRow_NEON_I8MM(const uint8_t* src_argb,
+                            int src_stride_argb,
+                            uint8_t* dst_u,
+                            uint8_t* dst_v,
+                            int width);
 void ARGBToUVJRow_SVE2(const uint8_t* src_argb,
                        int src_stride_argb,
                        uint8_t* dst_u,
@@ -1959,6 +1975,11 @@ void ABGRToUVJRow_NEON(const uint8_t* src_abgr,
                        uint8_t* dst_uj,
                        uint8_t* dst_vj,
                        int width);
+void ABGRToUVJRow_NEON_I8MM(const uint8_t* src_abgr,
+                            int src_stride_abgr,
+                            uint8_t* dst_uj,
+                            uint8_t* dst_vj,
+                            int width);
 void ABGRToUVJRow_SVE2(const uint8_t* src_abgr,
                        int src_stride_abgr,
                        uint8_t* dst_uj,
@@ -1969,6 +1990,11 @@ void BGRAToUVRow_NEON(const uint8_t* src_bgra,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width);
+void BGRAToUVRow_NEON_I8MM(const uint8_t* src_bgra,
+                           int src_stride_bgra,
+                           uint8_t* dst_u,
+                           uint8_t* dst_v,
+                           int width);
 void BGRAToUVRow_SVE2(const uint8_t* src_bgra,
                       int src_stride_bgra,
                       uint8_t* dst_u,
@@ -1979,6 +2005,11 @@ void ABGRToUVRow_NEON(const uint8_t* src_abgr,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width);
+void ABGRToUVRow_NEON_I8MM(const uint8_t* src_abgr,
+                           int src_stride_abgr,
+                           uint8_t* dst_u,
+                           uint8_t* dst_v,
+                           int width);
 void ABGRToUVRow_SVE2(const uint8_t* src_abgr,
                       int src_stride_abgr,
                       uint8_t* dst_u,
@@ -1989,6 +2020,11 @@ void RGBAToUVRow_NEON(const uint8_t* src_rgba,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width);
+void RGBAToUVRow_NEON_I8MM(const uint8_t* src_rgba,
+                           int src_stride_rgba,
+                           uint8_t* dst_u,
+                           uint8_t* dst_v,
+                           int width);
 void RGBAToUVRow_SVE2(const uint8_t* src_rgba,
                       int src_stride_rgba,
                       uint8_t* dst_u,
@@ -2431,6 +2467,11 @@ void ARGBToUVRow_Any_NEON(const uint8_t* src_ptr,
                           uint8_t* dst_u,
                           uint8_t* dst_v,
                           int width);
+void ARGBToUVRow_Any_NEON_I8MM(const uint8_t* src_ptr,
+                               int src_stride,
+                               uint8_t* dst_u,
+                               uint8_t* dst_v,
+                               int width);
 void ARGBToUVRow_Any_SVE2(const uint8_t* src_ptr,
                           int src_stride,
                           uint8_t* dst_u,
@@ -2468,6 +2509,11 @@ void ARGBToUVJRow_Any_NEON(const uint8_t* src_ptr,
                            uint8_t* dst_u,
                            uint8_t* dst_v,
                            int width);
+void ARGBToUVJRow_Any_NEON_I8MM(const uint8_t* src_ptr,
+                                int src_stride,
+                                uint8_t* dst_u,
+                                uint8_t* dst_v,
+                                int width);
 void ARGBToUVJRow_Any_SVE2(const uint8_t* src_ptr,
                            int src_stride,
                            uint8_t* dst_u,
@@ -2478,6 +2524,11 @@ void ABGRToUVJRow_Any_NEON(const uint8_t* src_ptr,
                            uint8_t* dst_u,
                            uint8_t* dst_v,
                            int width);
+void ABGRToUVJRow_Any_NEON_I8MM(const uint8_t* src_ptr,
+                                int src_stride,
+                                uint8_t* dst_u,
+                                uint8_t* dst_v,
+                                int width);
 void ABGRToUVJRow_Any_SVE2(const uint8_t* src_ptr,
                            int src_stride,
                            uint8_t* dst_u,
@@ -2488,6 +2539,11 @@ void BGRAToUVRow_Any_NEON(const uint8_t* src_ptr,
                           uint8_t* dst_u,
                           uint8_t* dst_v,
                           int width);
+void BGRAToUVRow_Any_NEON_I8MM(const uint8_t* src_ptr,
+                               int src_stride,
+                               uint8_t* dst_u,
+                               uint8_t* dst_v,
+                               int width);
 void BGRAToUVRow_Any_SVE2(const uint8_t* src_ptr,
                           int src_stride,
                           uint8_t* dst_u,
@@ -2498,6 +2554,11 @@ void ABGRToUVRow_Any_NEON(const uint8_t* src_ptr,
                           uint8_t* dst_u,
                           uint8_t* dst_v,
                           int width);
+void ABGRToUVRow_Any_NEON_I8MM(const uint8_t* src_ptr,
+                               int src_stride,
+                               uint8_t* dst_u,
+                               uint8_t* dst_v,
+                               int width);
 void ABGRToUVRow_Any_SVE2(const uint8_t* src_ptr,
                           int src_stride,
                           uint8_t* dst_u,
@@ -2508,6 +2569,11 @@ void RGBAToUVRow_Any_NEON(const uint8_t* src_ptr,
                           uint8_t* dst_u,
                           uint8_t* dst_v,
                           int width);
+void RGBAToUVRow_Any_NEON_I8MM(const uint8_t* src_ptr,
+                               int src_stride,
+                               uint8_t* dst_u,
+                               uint8_t* dst_v,
+                               int width);
 void RGBAToUVRow_Any_SVE2(const uint8_t* src_ptr,
                           int src_stride,
                           uint8_t* dst_u,
diff --git a/source/convert.cc b/source/convert.cc
index 7ebdad27b..24b0f0b6d 100644
--- a/source/convert.cc
+++ b/source/convert.cc
@@ -2087,6 +2087,14 @@ int ARGBToI420(const uint8_t* src_argb,
     }
   }
 #endif
+#if defined(HAS_ARGBTOUVROW_NEON_I8MM)
+  if (TestCpuFlag(kCpuHasNeonI8MM)) {
+    ARGBToUVRow = ARGBToUVRow_Any_NEON_I8MM;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_NEON_I8MM;
+    }
+  }
+#endif
 #if defined(HAS_ARGBTOUVROW_SVE2)
   if (TestCpuFlag(kCpuHasSVE2)) {
     ARGBToUVRow = ARGBToUVRow_Any_SVE2;
@@ -2265,6 +2273,14 @@ int ARGBToI420Alpha(const uint8_t* src_argb,
     }
   }
 #endif
+#if defined(HAS_ARGBTOUVROW_NEON_I8MM)
+  if (TestCpuFlag(kCpuHasNeonI8MM)) {
+    ARGBToUVRow = ARGBToUVRow_Any_NEON_I8MM;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_NEON_I8MM;
+    }
+  }
+#endif
 #if defined(HAS_ARGBTOUVROW_SVE2)
   if (TestCpuFlag(kCpuHasSVE2)) {
     ARGBToUVRow = ARGBToUVRow_Any_SVE2;
@@ -2444,6 +2460,14 @@ int BGRAToI420(const uint8_t* src_bgra,
     }
   }
 #endif
+#if defined(HAS_BGRATOUVROW_NEON_I8MM)
+  if (TestCpuFlag(kCpuHasNeonI8MM)) {
+    BGRAToUVRow = BGRAToUVRow_Any_NEON_I8MM;
+    if (IS_ALIGNED(width, 16)) {
+      BGRAToUVRow = BGRAToUVRow_NEON_I8MM;
+    }
+  }
+#endif
 #if defined(HAS_BGRATOUVROW_SVE2)
   if (TestCpuFlag(kCpuHasSVE2)) {
     BGRAToUVRow = BGRAToUVRow_Any_SVE2;
@@ -2619,6 +2643,14 @@ int ABGRToI420(const uint8_t* src_abgr,
     }
   }
 #endif
+#if defined(HAS_ABGRTOUVROW_NEON_I8MM)
+  if (TestCpuFlag(kCpuHasNeonI8MM)) {
+    ABGRToUVRow = ABGRToUVRow_Any_NEON_I8MM;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToUVRow = ABGRToUVRow_NEON_I8MM;
+    }
+  }
+#endif
 #if defined(HAS_ABGRTOUVROW_SVE2)
   if (TestCpuFlag(kCpuHasSVE2)) {
     ABGRToUVRow = ABGRToUVRow_Any_SVE2;
@@ -2744,6 +2776,14 @@ int RGBAToI420(const uint8_t* src_rgba,
     }
   }
 #endif
+#if defined(HAS_RGBATOUVROW_NEON_I8MM)
+  if (TestCpuFlag(kCpuHasNeonI8MM)) {
+    RGBAToUVRow = RGBAToUVRow_Any_NEON_I8MM;
+    if (IS_ALIGNED(width, 16)) {
+      RGBAToUVRow = RGBAToUVRow_NEON_I8MM;
+    }
+  }
+#endif
 #if defined(HAS_RGBATOUVROW_SVE2)
   if (TestCpuFlag(kCpuHasSVE2)) {
     RGBAToUVRow = RGBAToUVRow_Any_SVE2;
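
A note on the gating in the convert.cc hunks above: the usdot instruction these kernels are built around multiplies unsigned pixel bytes by signed coefficient bytes, and that mixed-sign form belongs to the Arm FEAT_I8MM extension, not to baseline FEAT_DotProd. That is why every dispatch site tests kCpuHasNeonI8MM rather than the plain dot-product flag. A minimal standalone probe using libyuv's public cpu_id API (this little program is illustrative, not part of the patch):

  #include <stdio.h>
  #include "libyuv/cpu_id.h"

  int main() {
    // TestCpuFlag performs CPU detection once and caches the result.
    if (libyuv::TestCpuFlag(libyuv::kCpuHasNeonI8MM)) {
      printf("FEAT_I8MM present: the I8MM UV kernels will be selected.\n");
    } else {
      printf("No FEAT_I8MM: DotProd/Neon/C fallbacks are used instead.\n");
    }
    return 0;
  }
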
diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc
index 8d2e8d05e..8451821c8 100644
--- a/source/convert_from_argb.cc
+++ b/source/convert_from_argb.cc
@@ -271,6 +271,14 @@ int ARGBToI422(const uint8_t* src_argb,
     }
   }
 #endif
+#if defined(HAS_ARGBTOUVROW_NEON_I8MM)
+  if (TestCpuFlag(kCpuHasNeonI8MM)) {
+    ARGBToUVRow = ARGBToUVRow_Any_NEON_I8MM;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_NEON_I8MM;
+    }
+  }
+#endif
 #if defined(HAS_ARGBTOUVROW_SVE2)
   if (TestCpuFlag(kCpuHasSVE2)) {
     ARGBToUVRow = ARGBToUVRow_Any_SVE2;
@@ -387,6 +395,14 @@ int ARGBToNV12(const uint8_t* src_argb,
     }
   }
 #endif
+#if defined(HAS_ARGBTOUVROW_NEON_I8MM)
+  if (TestCpuFlag(kCpuHasNeonI8MM)) {
+    ARGBToUVRow = ARGBToUVRow_Any_NEON_I8MM;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_NEON_I8MM;
+    }
+  }
+#endif
 #if defined(HAS_ARGBTOUVROW_SVE2)
   if (TestCpuFlag(kCpuHasSVE2)) {
     ARGBToUVRow = ARGBToUVRow_Any_SVE2;
@@ -630,6 +646,14 @@ int ARGBToNV21(const uint8_t* src_argb,
     }
   }
 #endif
+#if defined(HAS_ARGBTOUVROW_NEON_I8MM)
+  if (TestCpuFlag(kCpuHasNeonI8MM)) {
+    ARGBToUVRow = ARGBToUVRow_Any_NEON_I8MM;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_NEON_I8MM;
+    }
+  }
+#endif
 #if defined(HAS_ARGBTOUVROW_SVE2)
   if (TestCpuFlag(kCpuHasSVE2)) {
     ARGBToUVRow = ARGBToUVRow_Any_SVE2;
@@ -850,6 +874,14 @@ int ABGRToNV12(const uint8_t* src_abgr,
     }
   }
 #endif
+#if defined(HAS_ABGRTOUVROW_NEON_I8MM)
+  if (TestCpuFlag(kCpuHasNeonI8MM)) {
+    ABGRToUVRow = ABGRToUVRow_Any_NEON_I8MM;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToUVRow = ABGRToUVRow_NEON_I8MM;
+    }
+  }
+#endif
 #if defined(HAS_ABGRTOUVROW_SVE2)
   if (TestCpuFlag(kCpuHasSVE2)) {
     ABGRToUVRow = ABGRToUVRow_Any_SVE2;
@@ -1059,6 +1091,14 @@ int ABGRToNV21(const uint8_t* src_abgr,
     }
   }
 #endif
+#if defined(HAS_ABGRTOUVROW_NEON_I8MM)
+  if (TestCpuFlag(kCpuHasNeonI8MM)) {
+    ABGRToUVRow = ABGRToUVRow_Any_NEON_I8MM;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToUVRow = ABGRToUVRow_NEON_I8MM;
+    }
+  }
+#endif
 #if defined(HAS_ABGRTOUVROW_SVE2)
   if (TestCpuFlag(kCpuHasSVE2)) {
     ABGRToUVRow = ABGRToUVRow_Any_SVE2;
@@ -1273,6 +1313,14 @@ int ARGBToYUY2(const uint8_t* src_argb,
     }
   }
 #endif
+#if defined(HAS_ARGBTOUVROW_NEON_I8MM)
+  if (TestCpuFlag(kCpuHasNeonI8MM)) {
+    ARGBToUVRow = ARGBToUVRow_Any_NEON_I8MM;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_NEON_I8MM;
+    }
+  }
+#endif
 #if defined(HAS_ARGBTOUVROW_SVE2)
   if (TestCpuFlag(kCpuHasSVE2)) {
     ARGBToUVRow = ARGBToUVRow_Any_SVE2;
@@ -1485,6 +1533,14 @@ int ARGBToUYVY(const uint8_t* src_argb,
     }
   }
 #endif
+#if defined(HAS_ARGBTOUVROW_NEON_I8MM)
+  if (TestCpuFlag(kCpuHasNeonI8MM)) {
+    ARGBToUVRow = ARGBToUVRow_Any_NEON_I8MM;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_NEON_I8MM;
+    }
+  }
+#endif
 #if defined(HAS_ARGBTOUVROW_SVE2)
   if (TestCpuFlag(kCpuHasSVE2)) {
     ARGBToUVRow = ARGBToUVRow_Any_SVE2;
@@ -2625,6 +2681,14 @@ int ARGBToJ420(const uint8_t* src_argb,
     }
   }
 #endif
+#if defined(HAS_ARGBTOUVJROW_NEON_I8MM)
+  if (TestCpuFlag(kCpuHasNeonI8MM)) {
+    ARGBToUVJRow = ARGBToUVJRow_Any_NEON_I8MM;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVJRow = ARGBToUVJRow_NEON_I8MM;
+    }
+  }
+#endif
 #if defined(HAS_ARGBTOUVJROW_SVE2)
   if (TestCpuFlag(kCpuHasSVE2)) {
     ARGBToUVJRow = ARGBToUVJRow_Any_SVE2;
@@ -2809,6 +2873,14 @@ int ARGBToJ422(const uint8_t* src_argb,
     }
   }
 #endif
+#if defined(HAS_ARGBTOUVJROW_NEON_I8MM)
+  if (TestCpuFlag(kCpuHasNeonI8MM)) {
+    ARGBToUVJRow = ARGBToUVJRow_Any_NEON_I8MM;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVJRow = ARGBToUVJRow_NEON_I8MM;
+    }
+  }
+#endif
 #if defined(HAS_ARGBTOUVJROW_SVE2)
   if (TestCpuFlag(kCpuHasSVE2)) {
     ARGBToUVJRow = ARGBToUVJRow_Any_SVE2;
@@ -3123,6 +3195,14 @@ int ABGRToJ420(const uint8_t* src_abgr,
     }
   }
 #endif
+#if defined(HAS_ABGRTOUVJROW_NEON_I8MM)
+  if (TestCpuFlag(kCpuHasNeonI8MM)) {
+    ABGRToUVJRow = ABGRToUVJRow_Any_NEON_I8MM;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToUVJRow = ABGRToUVJRow_NEON_I8MM;
+    }
+  }
+#endif
 #if defined(HAS_ABGRTOUVJROW_SVE2)
  if (TestCpuFlag(kCpuHasSVE2)) {
     ABGRToUVJRow = ABGRToUVJRow_Any_SVE2;
@@ -3269,6 +3349,14 @@ int ABGRToJ422(const uint8_t* src_abgr,
     }
   }
 #endif
+#if defined(HAS_ABGRTOUVJROW_NEON_I8MM)
+  if (TestCpuFlag(kCpuHasNeonI8MM)) {
+    ABGRToUVJRow = ABGRToUVJRow_Any_NEON_I8MM;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToUVJRow = ABGRToUVJRow_NEON_I8MM;
+    }
+  }
+#endif
 #if defined(HAS_ABGRTOUVJROW_SVE2)
   if (TestCpuFlag(kCpuHasSVE2)) {
     ABGRToUVJRow = ABGRToUVJRow_Any_SVE2;
diff --git a/source/row_any.cc b/source/row_any.cc
index 85fb6ffb5..5dac7a9c7 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -2358,6 +2358,9 @@ ANY12S(UYVYToUVRow_Any_SSE2, UYVYToUVRow_SSE2, 1, 4, 15)
 #ifdef HAS_ARGBTOUVROW_NEON
 ANY12S(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, 0, 4, 15)
 #endif
+#ifdef HAS_ARGBTOUVROW_NEON_I8MM
+ANY12S(ARGBToUVRow_Any_NEON_I8MM, ARGBToUVRow_NEON_I8MM, 0, 4, 15)
+#endif
 #ifdef HAS_ARGBTOUVROW_SVE2
 ANY12S(ARGBToUVRow_Any_SVE2, ARGBToUVRow_SVE2, 0, 4, 1)
 #endif
@@ -2373,12 +2376,18 @@ ANY12S(ARGBToUVRow_Any_LASX, ARGBToUVRow_LASX, 0, 4, 31)
 #ifdef HAS_ARGBTOUVJROW_NEON
 ANY12S(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, 0, 4, 15)
 #endif
+#ifdef HAS_ARGBTOUVJROW_NEON_I8MM
+ANY12S(ARGBToUVJRow_Any_NEON_I8MM, ARGBToUVJRow_NEON_I8MM, 0, 4, 15)
+#endif
 #ifdef HAS_ARGBTOUVJROW_SVE2
 ANY12S(ARGBToUVJRow_Any_SVE2, ARGBToUVJRow_SVE2, 0, 4, 1)
 #endif
 #ifdef HAS_ABGRTOUVJROW_NEON
 ANY12S(ABGRToUVJRow_Any_NEON, ABGRToUVJRow_NEON, 0, 4, 15)
 #endif
+#ifdef HAS_ABGRTOUVJROW_NEON_I8MM
+ANY12S(ABGRToUVJRow_Any_NEON_I8MM, ABGRToUVJRow_NEON_I8MM, 0, 4, 15)
+#endif
 #ifdef HAS_ABGRTOUVJROW_SVE2
 ANY12S(ABGRToUVJRow_Any_SVE2, ABGRToUVJRow_SVE2, 0, 4, 1)
 #endif
@@ -2394,6 +2403,9 @@ ANY12S(ARGBToUVJRow_Any_LASX, ARGBToUVJRow_LASX, 0, 4, 31)
 #ifdef HAS_BGRATOUVROW_NEON
 ANY12S(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, 0, 4, 15)
 #endif
+#ifdef HAS_BGRATOUVROW_NEON_I8MM
+ANY12S(BGRAToUVRow_Any_NEON_I8MM, BGRAToUVRow_NEON_I8MM, 0, 4, 15)
+#endif
 #ifdef HAS_BGRATOUVROW_SVE2
 ANY12S(BGRAToUVRow_Any_SVE2, BGRAToUVRow_SVE2, 0, 4, 1)
 #endif
@@ -2406,6 +2418,9 @@ ANY12S(BGRAToUVRow_Any_LSX, BGRAToUVRow_LSX, 0, 4, 15)
 #ifdef HAS_ABGRTOUVROW_NEON
 ANY12S(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, 0, 4, 15)
 #endif
+#ifdef HAS_ABGRTOUVROW_NEON_I8MM
+ANY12S(ABGRToUVRow_Any_NEON_I8MM, ABGRToUVRow_NEON_I8MM, 0, 4, 15)
+#endif
 #ifdef HAS_ABGRTOUVROW_SVE2
 ANY12S(ABGRToUVRow_Any_SVE2, ABGRToUVRow_SVE2, 0, 4, 1)
 #endif
@@ -2418,6 +2433,9 @@ ANY12S(ABGRToUVRow_Any_LSX, ABGRToUVRow_LSX, 0, 4, 15)
 #ifdef HAS_RGBATOUVROW_NEON
 ANY12S(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, 0, 4, 15)
 #endif
+#ifdef HAS_RGBATOUVROW_NEON_I8MM
+ANY12S(RGBAToUVRow_Any_NEON_I8MM, RGBAToUVRow_NEON_I8MM, 0, 4, 15)
+#endif
 #ifdef HAS_RGBATOUVROW_SVE2
 ANY12S(RGBAToUVRow_Any_SVE2, RGBAToUVRow_SVE2, 0, 4, 1)
 #endif
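
The ANY12S entries above register any-width wrappers around the fixed-width kernels: MASK is 15 for the 16-pixel Neon kernels and 1 for the SVE2 ones, which are vector-length agnostic and only need help with an odd trailing pixel. Roughly, the wrapper runs the kernel over the multiple-of-16 body of the row, then stages the ragged tail through a small padded buffer. A simplified sketch of the wrapper shape (illustrative only, hence the _sketch suffix; the real macro also aligns its buffers and parameterizes the UV shift and copy sizes):

  #include <stdint.h>
  #include <string.h>

  void ARGBToUVRow_NEON_I8MM(const uint8_t* src_argb, int src_stride_argb,
                             uint8_t* dst_u, uint8_t* dst_v, int width);

  // Sketch of ANY12S(ARGBToUVRow_Any_NEON_I8MM, ARGBToUVRow_NEON_I8MM, 0, 4, 15).
  void ARGBToUVRow_Any_NEON_I8MM_sketch(const uint8_t* src_ptr, int src_stride,
                                        uint8_t* dst_u, uint8_t* dst_v,
                                        int width) {
    uint8_t vin[128 * 2];   // two staged source rows for the tail
    uint8_t vout[128 * 2];  // staged U and V output for the tail
    int awidth = width & ~15;  // body the 16-pixel kernel handles directly
    int r = width & 15;        // ragged tail, 0..15 pixels
    if (awidth > 0) {
      ARGBToUVRow_NEON_I8MM(src_ptr, src_stride, dst_u, dst_v, awidth);
    }
    if (r > 0) {
      memset(vin, 0, sizeof(vin));  // pad the last group with zero pixels
      memcpy(vin, src_ptr + awidth * 4, r * 4);
      memcpy(vin + 128, src_ptr + src_stride + awidth * 4, r * 4);
      ARGBToUVRow_NEON_I8MM(vin, 128, vout, vout + 128, 16);
      memcpy(dst_u + awidth / 2, vout, (r + 1) / 2);
      memcpy(dst_v + awidth / 2, vout + 128, (r + 1) / 2);
    }
  }
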
diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index 0a0427c46..4408675d2 100644
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -3448,6 +3448,177 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
       );
 }
 
+// Process any of ARGB, ABGR, BGRA, RGBA, by adjusting the uvconstants layout.
+static void ABCDToUVMatrixRow_NEON_I8MM(const uint8_t* src,
+                                        int src_stride,
+                                        uint8_t* dst_u,
+                                        uint8_t* dst_v,
+                                        int width,
+                                        const int8_t* uvconstants) {
+  const uint8_t* src1 = src + src_stride;
+  asm volatile(
+      "movi        v23.8h, #0x80, lsl #8         \n"  // 128.0 (0x8000 in 16-bit)
+      "ld2r        {v24.4s, v25.4s}, [%[uvconstants]] \n"
+
+      "1:                                        \n"
+      "ld2         {v0.4s, v1.4s}, [%[src]], #32 \n"  // load 8 pixels
+      "ld2         {v2.4s, v3.4s}, [%[src]], #32 \n"  // load 8 pixels
+      "subs        %w[width], %w[width], #16     \n"  // 16 processed per loop
+      "uaddl       v4.8h, v0.8b, v1.8b           \n"  // ABCDABCD
+      "uaddl2      v5.8h, v0.16b, v1.16b         \n"  // ABCDABCD
+      "uaddl       v6.8h, v2.8b, v3.8b           \n"  // ABCDABCD
+      "uaddl2      v7.8h, v2.16b, v3.16b         \n"  // ABCDABCD
+
+      "ld2         {v0.4s, v1.4s}, [%[src1]], #32 \n"  // load 8 pixels
+      "ld2         {v2.4s, v3.4s}, [%[src1]], #32 \n"  // load 8 pixels
+      "uaddw       v4.8h, v4.8h, v0.8b           \n"  // ABCDABCD
+      "uaddw2      v5.8h, v5.8h, v0.16b          \n"  // ABCDABCD
+      "uaddw       v6.8h, v6.8h, v2.8b           \n"  // ABCDABCD
+      "uaddw2      v7.8h, v7.8h, v2.16b          \n"  // ABCDABCD
+      "prfm        pldl1keep, [%[src], 448]      \n"
+      "uaddw       v4.8h, v4.8h, v1.8b           \n"  // ABCDABCD
+      "uaddw2      v5.8h, v5.8h, v1.16b          \n"  // ABCDABCD
+      "uaddw       v6.8h, v6.8h, v3.8b           \n"  // ABCDABCD
+      "uaddw2      v7.8h, v7.8h, v3.16b          \n"  // ABCDABCD
+      "prfm        pldl1keep, [%[src1], 448]     \n"
+
+      "rshrn       v4.8b, v4.8h, #2              \n"  // average of 4 pixels
+      "rshrn       v6.8b, v6.8h, #2              \n"  // average of 4 pixels
+      "rshrn2      v4.16b, v5.8h, #2             \n"  // average of 4 pixels
+      "rshrn2      v6.16b, v7.8h, #2             \n"  // average of 4 pixels
+
+      "movi        v0.4s, #0                     \n"  // U
+      "movi        v1.4s, #0                     \n"  // U
+      "usdot       v0.4s, v4.16b, v24.16b        \n"
+      "usdot       v1.4s, v6.16b, v24.16b        \n"
+
+      "movi        v2.4s, #0                     \n"  // V
+      "movi        v3.4s, #0                     \n"  // V
+      "usdot       v2.4s, v4.16b, v25.16b        \n"
+      "usdot       v3.4s, v6.16b, v25.16b        \n"
+
+      "uzp1        v0.8h, v0.8h, v1.8h           \n"  // U
+      "uzp1        v1.8h, v2.8h, v3.8h           \n"  // V
+
+      "subhn       v0.8b, v23.8h, v0.8h          \n"  // +128 -> unsigned
+      "subhn       v1.8b, v23.8h, v1.8h          \n"  // +128 -> unsigned
+
+      "str         d0, [%[dst_u]], #8            \n"  // store 8 pixels U
+      "str         d1, [%[dst_v]], #8            \n"  // store 8 pixels V
+      "b.gt        1b                            \n"
+      : [src] "+r"(src),                    // %[src]
+        [src1] "+r"(src1),                  // %[src1]
+        [dst_u] "+r"(dst_u),                // %[dst_u]
+        [dst_v] "+r"(dst_v),                // %[dst_v]
+        [width] "+r"(width)                 // %[width]
+      : [uvconstants] "r"(uvconstants)      // %[uvconstants]
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v23",
+        "v24", "v25");
+}
+
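
For readers cross-checking the assembly, here is a scalar model of what the kernel computes per 2x2 pixel block. The helper below is written for this note only (it is not part of the patch); it assumes width is even, while the real kernel additionally requires a multiple of 16:

  #include <stdint.h>

  // Scalar equivalent of ABCDToUVMatrixRow_NEON_I8MM (illustrative).
  // uvconstants layout matches the kernel: bytes 0..3 hold the negated U
  // coefficients for the four channels of a pixel, bytes 4..7 the negated
  // V coefficients.
  static void ABCDToUVMatrixRow_C(const uint8_t* src, int src_stride,
                                  uint8_t* dst_u, uint8_t* dst_v, int width,
                                  const int8_t* uvconstants) {
    const uint8_t* src1 = src + src_stride;
    for (int x = 0; x < width; x += 2) {
      int32_t sum_u = 0;
      int32_t sum_v = 0;
      for (int c = 0; c < 4; ++c) {
        // Rounding average of the 2x2 block (uaddl/uaddw then rshrn #2).
        int avg = (src[c] + src[4 + c] + src1[c] + src1[4 + c] + 2) >> 2;
        sum_u += avg * uvconstants[c];      // usdot against v24
        sum_v += avg * uvconstants[4 + c];  // usdot against v25
      }
      // subhn against 0x8000 takes the high byte of (32768 - sum), i.e.
      // 128 - sum/256. Because the stored coefficients are negated, this
      // is 128 + (true dot product)/256, the usual biased chroma value.
      *dst_u++ = (uint8_t)((0x8000 - sum_u) >> 8);
      *dst_v++ = (uint8_t)((0x8000 - sum_v) >> 8);
      src += 8;
      src1 += 8;
    }
  }

The uzp1 in the kernel keeps only the low 16 bits of each 32-bit usdot accumulator, which is safe because the dot products are bounded well inside int16_t range for both coefficient sets.
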
+// RGB to BT601 coefficients
+// UB   0.875 coefficient = 112
+// UG -0.5781 coefficient = -74
+// UR -0.2969 coefficient = -38
+// VB -0.1406 coefficient = -18
+// VG -0.7344 coefficient = -94
+// VR   0.875 coefficient = 112
+// I8MM constants are stored negated such that we can store 128 in int8_t.
+
+static const int8_t kARGBToUVCoefficients[] = {
+    // -UB, -UG, -UR, 0, -VB, -VG, -VR, 0
+    -112, 74, 38, 0, 18, 94, -112, 0,
+};
+
+static const int8_t kABGRToUVCoefficients[] = {
+    // -UR, -UG, -UB, 0, -VR, -VG, -VB, 0
+    38, 74, -112, 0, -112, 94, 18, 0,
+};
+
+static const int8_t kBGRAToUVCoefficients[] = {
+    // 0, -UR, -UG, -UB, 0, -VR, -VG, -VB
+    0, 38, 74, -112, 0, -112, 94, 18,
+};
+
+static const int8_t kRGBAToUVCoefficients[] = {
+    // 0, -UB, -UG, -UR, 0, -VB, -VG, -VR
+    0, -112, 74, 38, 0, 18, 94, -112,
+};
+
+void ARGBToUVRow_NEON_I8MM(const uint8_t* src_argb,
+                           int src_stride_argb,
+                           uint8_t* dst_u,
+                           uint8_t* dst_v,
+                           int width) {
+  ABCDToUVMatrixRow_NEON_I8MM(src_argb, src_stride_argb, dst_u, dst_v, width,
+                              kARGBToUVCoefficients);
+}
+
+void ABGRToUVRow_NEON_I8MM(const uint8_t* src_abgr,
+                           int src_stride_abgr,
+                           uint8_t* dst_u,
+                           uint8_t* dst_v,
+                           int width) {
+  ABCDToUVMatrixRow_NEON_I8MM(src_abgr, src_stride_abgr, dst_u, dst_v, width,
+                              kABGRToUVCoefficients);
+}
+
+void BGRAToUVRow_NEON_I8MM(const uint8_t* src_bgra,
+                           int src_stride_bgra,
+                           uint8_t* dst_u,
+                           uint8_t* dst_v,
+                           int width) {
+  ABCDToUVMatrixRow_NEON_I8MM(src_bgra, src_stride_bgra, dst_u, dst_v, width,
+                              kBGRAToUVCoefficients);
+}
+
+void RGBAToUVRow_NEON_I8MM(const uint8_t* src_rgba,
+                           int src_stride_rgba,
+                           uint8_t* dst_u,
+                           uint8_t* dst_v,
+                           int width) {
+  ABCDToUVMatrixRow_NEON_I8MM(src_rgba, src_stride_rgba, dst_u, dst_v, width,
+                              kRGBAToUVCoefficients);
+}
+
+// RGB to JPEG coefficients
+// UB  0.500 coefficient = 128
+// UG -0.33126 coefficient = -85
+// UR -0.16874 coefficient = -43
+// VB -0.08131 coefficient = -21
+// VG -0.41869 coefficient = -107
+// VR  0.500 coefficient = 128
+// I8MM constants are stored negated such that we can store 128 in int8_t.
+
+static const int8_t kARGBToUVJCoefficients[] = {
+    // -UB, -UG, -UR, 0, -VB, -VG, -VR, 0
+    -128, 85, 43, 0, 21, 107, -128, 0,
+};
+
+static const int8_t kABGRToUVJCoefficients[] = {
+    // -UR, -UG, -UB, 0, -VR, -VG, -VB, 0
+    43, 85, -128, 0, -128, 107, 21, 0,
+};
+
+void ARGBToUVJRow_NEON_I8MM(const uint8_t* src_argb,
+                            int src_stride_argb,
+                            uint8_t* dst_u,
+                            uint8_t* dst_v,
+                            int width) {
+  ABCDToUVMatrixRow_NEON_I8MM(src_argb, src_stride_argb, dst_u, dst_v, width,
+                              kARGBToUVJCoefficients);
+}
+
+void ABGRToUVJRow_NEON_I8MM(const uint8_t* src_abgr,
+                            int src_stride_abgr,
+                            uint8_t* dst_u,
+                            uint8_t* dst_v,
+                            int width) {
+  ABCDToUVMatrixRow_NEON_I8MM(src_abgr, src_stride_abgr, dst_u, dst_v, width,
+                              kABGRToUVJCoefficients);
+}
+
 void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
   asm volatile(
       "movi        v24.16b, #25                  \n"  // B * 0.1016 coefficient
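
As a quick sanity check on the negated BT601 constants, pushing two extreme inputs through the U path by hand shows the bias working out: neutral gray must land exactly on 128, and pure blue near the top of the U range. A self-contained check of that arithmetic (illustrative, not part of the patch):

  #include <assert.h>

  int main() {
    // Neutral gray, B = G = R = 128: the negated U terms cancel to zero,
    // so (0x8000 - 0) >> 8 gives exactly 128.
    int dot_gray = -112 * 128 + 74 * 128 + 38 * 128;
    assert(dot_gray == 0);
    assert(((0x8000 - dot_gray) >> 8) == 128);
    // Pure blue, B = 255, G = R = 0: U saturates near the BT601 maximum.
    int dot_blue = -112 * 255;  // -28560
    assert(((0x8000 - dot_blue) >> 8) == 239);
    return 0;
  }
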