diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 93feb0c6d..43ffe247c 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -564,10 +564,16 @@ extern "C" {
 
 // The following are available on AArch64 SVE platforms:
 #if !defined(LIBYUV_DISABLE_SVE) && defined(__aarch64__)
-#define HAS_I444TOARGBROW_SVE2
+#define HAS_ABGRTOUVJROW_SVE2
+#define HAS_ABGRTOUVROW_SVE2
+#define HAS_ARGBTOUVJROW_SVE2
+#define HAS_ARGBTOUVROW_SVE2
+#define HAS_BGRATOUVROW_SVE2
+#define HAS_I422ALPHATOARGBROW_SVE2
 #define HAS_I422TOARGBROW_SVE2
 #define HAS_I444ALPHATOARGBROW_SVE2
-#define HAS_I422ALPHATOARGBROW_SVE2
+#define HAS_I444TOARGBROW_SVE2
+#define HAS_RGBATOUVROW_SVE2
 #endif
 
 // The following are available on AArch64 platforms:
@@ -1489,6 +1495,11 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width);
+void ARGBToUVRow_SVE2(const uint8_t* src_argb,
+                      int src_stride_argb,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width);
 void ARGBToUV444Row_MSA(const uint8_t* src_argb,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
@@ -1521,26 +1532,51 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb,
                        uint8_t* dst_u,
                        uint8_t* dst_v,
                        int width);
+void ARGBToUVJRow_SVE2(const uint8_t* src_argb,
+                       int src_stride_argb,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
+                       int width);
 void ABGRToUVJRow_NEON(const uint8_t* src_abgr,
                        int src_stride_abgr,
                        uint8_t* dst_uj,
                        uint8_t* dst_vj,
                        int width);
+void ABGRToUVJRow_SVE2(const uint8_t* src_abgr,
+                       int src_stride_abgr,
+                       uint8_t* dst_uj,
+                       uint8_t* dst_vj,
+                       int width);
 void BGRAToUVRow_NEON(const uint8_t* src_bgra,
                       int src_stride_bgra,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width);
+void BGRAToUVRow_SVE2(const uint8_t* src_bgra,
+                      int src_stride_bgra,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width);
 void ABGRToUVRow_NEON(const uint8_t* src_abgr,
                       int src_stride_abgr,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width);
+void ABGRToUVRow_SVE2(const uint8_t* src_abgr,
+                      int src_stride_abgr,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width);
 void RGBAToUVRow_NEON(const uint8_t* src_rgba,
                       int src_stride_rgba,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width);
+void RGBAToUVRow_SVE2(const uint8_t* src_rgba,
+                      int src_stride_rgba,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width);
 void RGB24ToUVRow_NEON(const uint8_t* src_rgb24,
                        int src_stride_rgb24,
                        uint8_t* dst_u,
@@ -1966,6 +2002,11 @@ void ARGBToUVRow_Any_NEON(const uint8_t* src_ptr,
                           uint8_t* dst_u,
                           uint8_t* dst_v,
                           int width);
+void ARGBToUVRow_Any_SVE2(const uint8_t* src_ptr,
+                          int src_stride,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width);
 void ARGBToUV444Row_Any_MSA(const uint8_t* src_ptr,
                             uint8_t* dst_u,
                             uint8_t* dst_v,
@@ -1998,26 +2039,51 @@ void ARGBToUVJRow_Any_NEON(const uint8_t* src_ptr,
                            uint8_t* dst_u,
                            uint8_t* dst_v,
                            int width);
+void ARGBToUVJRow_Any_SVE2(const uint8_t* src_ptr,
+                           int src_stride,
+                           uint8_t* dst_u,
+                           uint8_t* dst_v,
+                           int width);
 void ABGRToUVJRow_Any_NEON(const uint8_t* src_ptr,
                            int src_stride,
                            uint8_t* dst_u,
                            uint8_t* dst_v,
                            int width);
+void ABGRToUVJRow_Any_SVE2(const uint8_t* src_ptr,
+                           int src_stride,
+                           uint8_t* dst_u,
+                           uint8_t* dst_v,
+                           int width);
 void BGRAToUVRow_Any_NEON(const uint8_t* src_ptr,
                           int src_stride,
                           uint8_t* dst_u,
                           uint8_t* dst_v,
                           int width);
+void BGRAToUVRow_Any_SVE2(const uint8_t* src_ptr,
+                          int src_stride,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width);
 void ABGRToUVRow_Any_NEON(const uint8_t* src_ptr,
                           int src_stride,
                           uint8_t* dst_u,
                           uint8_t* dst_v,
                           int width);
+void ABGRToUVRow_Any_SVE2(const uint8_t* src_ptr,
+                          int src_stride,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width);
 void RGBAToUVRow_Any_NEON(const uint8_t* src_ptr,
                           int src_stride,
                           uint8_t* dst_u,
                           uint8_t* dst_v,
                           int width);
+void RGBAToUVRow_Any_SVE2(const uint8_t* src_ptr,
+                          int src_stride,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width);
 void RGB24ToUVRow_Any_NEON(const uint8_t* src_ptr,
                            int src_stride,
                            uint8_t* dst_u,
diff --git a/source/convert.cc b/source/convert.cc
index e852a90cd..fdd0cb644 100644
--- a/source/convert.cc
+++ b/source/convert.cc
@@ -1920,6 +1920,14 @@ int ARGBToI420(const uint8_t* src_argb,
     }
   }
 #endif
+#if defined(HAS_ARGBTOUVROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    ARGBToUVRow = ARGBToUVRow_Any_SVE2;
+    if (IS_ALIGNED(width, 2)) {
+      ARGBToUVRow = ARGBToUVRow_SVE2;
+    }
+  }
+#endif
 #if defined(HAS_ARGBTOYROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     ARGBToYRow = ARGBToYRow_Any_SSSE3;
@@ -2090,6 +2098,14 @@ int ARGBToI420Alpha(const uint8_t* src_argb,
     }
   }
 #endif
+#if defined(HAS_ARGBTOUVROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    ARGBToUVRow = ARGBToUVRow_Any_SVE2;
+    if (IS_ALIGNED(width, 2)) {
+      ARGBToUVRow = ARGBToUVRow_SVE2;
+    }
+  }
+#endif
 #if defined(HAS_ARGBTOYROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     ARGBToYRow = ARGBToYRow_Any_SSSE3;
@@ -2261,6 +2277,14 @@ int BGRAToI420(const uint8_t* src_bgra,
     }
   }
 #endif
+#if defined(HAS_BGRATOUVROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    BGRAToUVRow = BGRAToUVRow_Any_SVE2;
+    if (IS_ALIGNED(width, 2)) {
+      BGRAToUVRow = BGRAToUVRow_SVE2;
+    }
+  }
+#endif
 #if defined(HAS_BGRATOYROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     BGRAToYRow = BGRAToYRow_Any_SSSE3;
@@ -2428,6 +2452,14 @@ int ABGRToI420(const uint8_t* src_abgr,
     }
   }
 #endif
+#if defined(HAS_ABGRTOUVROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    ABGRToUVRow = ABGRToUVRow_Any_SVE2;
+    if (IS_ALIGNED(width, 2)) {
+      ABGRToUVRow = ABGRToUVRow_SVE2;
+    }
+  }
+#endif
 #if defined(HAS_ABGRTOYROW_MSA) && defined(HAS_ABGRTOUVROW_MSA)
   if (TestCpuFlag(kCpuHasMSA)) {
     ABGRToYRow = ABGRToYRow_Any_MSA;
@@ -2545,6 +2577,14 @@ int RGBAToI420(const uint8_t* src_rgba,
     }
   }
 #endif
+#if defined(HAS_RGBATOUVROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    RGBAToUVRow = RGBAToUVRow_Any_SVE2;
+    if (IS_ALIGNED(width, 2)) {
+      RGBAToUVRow = RGBAToUVRow_SVE2;
+    }
+  }
+#endif
 #if defined(HAS_RGBATOYROW_MSA) && defined(HAS_RGBATOUVROW_MSA)
   if (TestCpuFlag(kCpuHasMSA)) {
     RGBAToYRow = RGBAToYRow_Any_MSA;
diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc
index 6c361c05a..c684ac00d 100644
--- a/source/convert_from_argb.cc
+++ b/source/convert_from_argb.cc
@@ -255,6 +255,14 @@ int ARGBToI422(const uint8_t* src_argb,
     }
   }
 #endif
+#if defined(HAS_ARGBTOUVROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    ARGBToUVRow = ARGBToUVRow_Any_SVE2;
+    if (IS_ALIGNED(width, 2)) {
+      ARGBToUVRow = ARGBToUVRow_SVE2;
+    }
+  }
+#endif
 #if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
   if (TestCpuFlag(kCpuHasMSA)) {
     ARGBToYRow = ARGBToYRow_Any_MSA;
@@ -363,6 +371,14 @@ int ARGBToNV12(const uint8_t* src_argb,
     }
   }
 #endif
+#if defined(HAS_ARGBTOUVROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    ARGBToUVRow = ARGBToUVRow_Any_SVE2;
+    if (IS_ALIGNED(width, 2)) {
+      ARGBToUVRow = ARGBToUVRow_SVE2;
+    }
+  }
+#endif
 #if defined(HAS_ARGBTOYROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     ARGBToYRow = ARGBToYRow_Any_SSSE3;
@@ -593,6 +609,14 @@ int ARGBToNV21(const uint8_t* src_argb,
     }
   }
 #endif
+#if defined(HAS_ARGBTOUVROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    ARGBToUVRow = ARGBToUVRow_Any_SVE2;
+    if (IS_ALIGNED(width, 2)) {
+      ARGBToUVRow = ARGBToUVRow_SVE2;
+    }
+  }
+#endif
 #if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
   if (TestCpuFlag(kCpuHasMSA)) {
     ARGBToYRow = ARGBToYRow_Any_MSA;
@@ -800,6 +824,14 @@ int ABGRToNV12(const uint8_t* src_abgr,
     }
   }
 #endif
+#if defined(HAS_ABGRTOUVROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    ABGRToUVRow = ABGRToUVRow_Any_SVE2;
+    if (IS_ALIGNED(width, 2)) {
+      ABGRToUVRow = ABGRToUVRow_SVE2;
+    }
+  }
+#endif
 #if defined(HAS_ABGRTOYROW_MSA) && defined(HAS_ABGRTOUVROW_MSA)
   if (TestCpuFlag(kCpuHasMSA)) {
     ABGRToYRow = ABGRToYRow_Any_MSA;
@@ -996,6 +1028,14 @@ int ABGRToNV21(const uint8_t* src_abgr,
     }
   }
 #endif
+#if defined(HAS_ABGRTOUVROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    ABGRToUVRow = ABGRToUVRow_Any_SVE2;
+    if (IS_ALIGNED(width, 2)) {
+      ABGRToUVRow = ABGRToUVRow_SVE2;
+    }
+  }
+#endif
 #if defined(HAS_ABGRTOYROW_MSA) && defined(HAS_ABGRTOUVROW_MSA)
   if (TestCpuFlag(kCpuHasMSA)) {
     ABGRToYRow = ABGRToYRow_Any_MSA;
@@ -1197,6 +1237,14 @@ int ARGBToYUY2(const uint8_t* src_argb,
     }
   }
 #endif
+#if defined(HAS_ARGBTOUVROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    ARGBToUVRow = ARGBToUVRow_Any_SVE2;
+    if (IS_ALIGNED(width, 2)) {
+      ARGBToUVRow = ARGBToUVRow_SVE2;
+    }
+  }
+#endif
 #if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
   if (TestCpuFlag(kCpuHasMSA)) {
     ARGBToYRow = ARGBToYRow_Any_MSA;
@@ -1401,6 +1449,14 @@ int ARGBToUYVY(const uint8_t* src_argb,
     }
   }
 #endif
+#if defined(HAS_ARGBTOUVROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    ARGBToUVRow = ARGBToUVRow_Any_SVE2;
+    if (IS_ALIGNED(width, 2)) {
+      ARGBToUVRow = ARGBToUVRow_SVE2;
+    }
+  }
+#endif
 #if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
   if (TestCpuFlag(kCpuHasMSA)) {
     ARGBToYRow = ARGBToYRow_Any_MSA;
@@ -2336,6 +2392,14 @@ int ARGBToJ420(const uint8_t* src_argb,
     }
   }
 #endif
+#if defined(HAS_ARGBTOUVJROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    ARGBToUVJRow = ARGBToUVJRow_Any_SVE2;
+    if (IS_ALIGNED(width, 2)) {
+      ARGBToUVJRow = ARGBToUVJRow_SVE2;
+    }
+  }
+#endif
 #if defined(HAS_ARGBTOYJROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
@@ -2512,6 +2576,14 @@ int ARGBToJ422(const uint8_t* src_argb,
     }
   }
 #endif
+#if defined(HAS_ARGBTOUVJROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    ARGBToUVJRow = ARGBToUVJRow_Any_SVE2;
+    if (IS_ALIGNED(width, 2)) {
+      ARGBToUVJRow = ARGBToUVJRow_SVE2;
+    }
+  }
+#endif
 #if defined(HAS_ARGBTOYJROW_MSA) && defined(HAS_ARGBTOUVJROW_MSA)
   if (TestCpuFlag(kCpuHasMSA)) {
     ARGBToYJRow = ARGBToYJRow_Any_MSA;
@@ -2818,6 +2890,14 @@ int ABGRToJ420(const uint8_t* src_abgr,
     }
   }
 #endif
+#if defined(HAS_ABGRTOUVJROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    ABGRToUVJRow = ABGRToUVJRow_Any_SVE2;
+    if (IS_ALIGNED(width, 2)) {
+      ABGRToUVJRow = ABGRToUVJRow_SVE2;
+    }
+  }
+#endif
 #if defined(HAS_ABGRTOYJROW_MSA) && defined(HAS_ABGRTOUVJROW_MSA)
   if (TestCpuFlag(kCpuHasMSA)) {
     ABGRToYJRow = ABGRToYJRow_Any_MSA;
@@ -2956,6 +3036,14 @@ int ABGRToJ422(const uint8_t* src_abgr,
     }
   }
 #endif
+#if defined(HAS_ABGRTOUVJROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    ABGRToUVJRow = ABGRToUVJRow_Any_SVE2;
+    if (IS_ALIGNED(width, 2)) {
+      ABGRToUVJRow = ABGRToUVJRow_SVE2;
+    }
+  }
+#endif
 #if defined(HAS_ABGRTOYJROW_MSA) && defined(HAS_ABGRTOUVJROW_MSA)
   if (TestCpuFlag(kCpuHasMSA)) {
     ABGRToYJRow = ABGRToYJRow_Any_MSA;
diff --git a/source/row_any.cc b/source/row_any.cc
index 8ed5a49c2..a466e3428 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -2225,6 +2225,9 @@ ANY12S(UYVYToUVRow_Any_SSE2, UYVYToUVRow_SSE2, 1, 4, 15)
 #ifdef HAS_ARGBTOUVROW_NEON
 ANY12S(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, 0, 4, 15)
 #endif
+#ifdef HAS_ARGBTOUVROW_SVE2
+ANY12S(ARGBToUVRow_Any_SVE2, ARGBToUVRow_SVE2, 0, 4, 1)
+#endif
 #ifdef HAS_ARGBTOUVROW_MSA
 ANY12S(ARGBToUVRow_Any_MSA, ARGBToUVRow_MSA, 0, 4, 31)
 #endif
@@ -2237,9 +2240,15 @@ ANY12S(ARGBToUVRow_Any_LASX, ARGBToUVRow_LASX, 0, 4, 31)
 #ifdef HAS_ARGBTOUVJROW_NEON
 ANY12S(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, 0, 4, 15)
 #endif
+#ifdef HAS_ARGBTOUVJROW_SVE2
+ANY12S(ARGBToUVJRow_Any_SVE2, ARGBToUVJRow_SVE2, 0, 4, 1)
+#endif
 #ifdef HAS_ABGRTOUVJROW_NEON
 ANY12S(ABGRToUVJRow_Any_NEON, ABGRToUVJRow_NEON, 0, 4, 15)
 #endif
+#ifdef HAS_ABGRTOUVJROW_SVE2
+ANY12S(ABGRToUVJRow_Any_SVE2, ABGRToUVJRow_SVE2, 0, 4, 1)
+#endif
 #ifdef HAS_ARGBTOUVJROW_MSA
 ANY12S(ARGBToUVJRow_Any_MSA, ARGBToUVJRow_MSA, 0, 4, 31)
 #endif
@@ -2252,6 +2261,9 @@ ANY12S(ARGBToUVJRow_Any_LASX, ARGBToUVJRow_LASX, 0, 4, 31)
 #ifdef HAS_BGRATOUVROW_NEON
 ANY12S(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, 0, 4, 15)
 #endif
+#ifdef HAS_BGRATOUVROW_SVE2
+ANY12S(BGRAToUVRow_Any_SVE2, BGRAToUVRow_SVE2, 0, 4, 1)
+#endif
 #ifdef HAS_BGRATOUVROW_MSA
 ANY12S(BGRAToUVRow_Any_MSA, BGRAToUVRow_MSA, 0, 4, 15)
 #endif
@@ -2261,6 +2273,9 @@ ANY12S(BGRAToUVRow_Any_LSX, BGRAToUVRow_LSX, 0, 4, 15)
 #ifdef HAS_ABGRTOUVROW_NEON
 ANY12S(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, 0, 4, 15)
 #endif
+#ifdef HAS_ABGRTOUVROW_SVE2
+ANY12S(ABGRToUVRow_Any_SVE2, ABGRToUVRow_SVE2, 0, 4, 1)
+#endif
 #ifdef HAS_ABGRTOUVROW_MSA
 ANY12S(ABGRToUVRow_Any_MSA, ABGRToUVRow_MSA, 0, 4, 15)
 #endif
@@ -2270,6 +2285,9 @@ ANY12S(ABGRToUVRow_Any_LSX, ABGRToUVRow_LSX, 0, 4, 15)
 #ifdef HAS_RGBATOUVROW_NEON
 ANY12S(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, 0, 4, 15)
 #endif
+#ifdef HAS_RGBATOUVROW_SVE2
+ANY12S(RGBAToUVRow_Any_SVE2, RGBAToUVRow_SVE2, 0, 4, 1)
+#endif
 #ifdef HAS_RGBATOUVROW_MSA
 ANY12S(RGBAToUVRow_Any_MSA, RGBAToUVRow_MSA, 0, 4, 15)
 #endif
diff --git a/source/row_sve.cc b/source/row_sve.cc
index a7048b65a..ff65af0ab 100644
--- a/source/row_sve.cc
+++ b/source/row_sve.cc
@@ -262,6 +262,243 @@ void I422AlphaToARGBRow_SVE2(const uint8_t* src_y,
       : "cc", "memory", YUVTORGB_SVE_REGS);
 }
 
+// Dot-product constants are stored as four-tuples with the two innermost
+// elements flipped to account for the interleaving nature of the widening
+// addition instructions.
+
+static const int16_t kArgbToUvArr[] = {
+    // UB, -UR, -UG, 0, -VB, VR, -VG, 0
+    56, -19, -37, 0, -9, 56, -47, 0,
+};
+
+static const int16_t kRgbaToUvArr[] = {
+    // 0, -UG, UB, -UR, 0, -VG, -VB, VR
+    0, -37, 56, -19, 0, -47, -9, 56,
+};
+
+static const int16_t kBgraToUvArr[] = {
+    // 0, -UG, -UR, UB, 0, -VG, VR, -VB
+    0, -37, -19, 56, 0, -47, 56, -9,
+};
+
+static const int16_t kAbgrToUvArr[] = {
+    // -UR, UB, -UG, 0, VR, -VB, -VG, 0
+    -19, 56, -37, 0, 56, -9, -47, 0,
+};
+
+static const int16_t kArgbToUvjArr[] = {
+    // UB, -UR, -UG, 0, -VB, VR, -VG, 0
+    63, -21, -42, 0, -10, 63, -53, 0,
+};
+
+static const int16_t kAbgrToUvjArr[] = {
+    // -UR, UB, -UG, 0, VR, -VB, -VG, 0
+    -21, 63, -42, 0, 63, -10, -53, 0,
+};
+
+void ARGBToUVMatrixRow_SVE2(const uint8_t* src_argb,
+                            int src_stride_argb,
+                            uint8_t* dst_u,
+                            uint8_t* dst_v,
+                            int width,
+                            const int16_t* uvconstants) {
+  const uint8_t* src_argb_1 = src_argb + src_stride_argb;
+  uint64_t vl;
+  asm volatile(
+      "ptrue p0.b \n"
+      "ld1rd {z24.d}, p0/z, [%[uvconstants]] \n"
+      "ld1rd {z25.d}, p0/z, [%[uvconstants], #8] \n"
+      "mov z26.b, #0x80 \n"
+
+      "cntb %[vl] \n"
+      "subs %w[width], %w[width], %w[vl] \n"
+      "b.lt 2f \n"
+
+      // Process 4x vectors from each input row per iteration.
+      // Cannot use predication here due to unrolling.
+      "1: \n"  // e.g.
+ "ld1b {z0.b}, p0/z, [%[src0], #0, mul vl] \n" // bgrabgra + "ld1b {z4.b}, p0/z, [%[src1], #0, mul vl] \n" // bgrabgra + "ld1b {z1.b}, p0/z, [%[src0], #1, mul vl] \n" // bgrabgra + "ld1b {z5.b}, p0/z, [%[src1], #1, mul vl] \n" // bgrabgra + "ld1b {z2.b}, p0/z, [%[src0], #2, mul vl] \n" // bgrabgra + "ld1b {z6.b}, p0/z, [%[src1], #2, mul vl] \n" // bgrabgra + "ld1b {z3.b}, p0/z, [%[src0], #3, mul vl] \n" // bgrabgra + "ld1b {z7.b}, p0/z, [%[src1], #3, mul vl] \n" // bgrabgra + "incb %[src0], all, mul #4 \n" + "incb %[src1], all, mul #4 \n" + + "uaddlb z16.h, z0.b, z4.b \n" // brbrbrbr + "uaddlt z17.h, z0.b, z4.b \n" // gagagaga + "uaddlb z18.h, z1.b, z5.b \n" // brbrbrbr + "uaddlt z19.h, z1.b, z5.b \n" // gagagaga + "uaddlb z20.h, z2.b, z6.b \n" // brbrbrbr + "uaddlt z21.h, z2.b, z6.b \n" // gagagaga + "uaddlb z22.h, z3.b, z7.b \n" // brbrbrbr + "uaddlt z23.h, z3.b, z7.b \n" // gagagaga + + "trn1 z0.s, z16.s, z17.s \n" // brgabgra + "trn2 z1.s, z16.s, z17.s \n" // brgabgra + "trn1 z2.s, z18.s, z19.s \n" // brgabgra + "trn2 z3.s, z18.s, z19.s \n" // brgabgra + "trn1 z4.s, z20.s, z21.s \n" // brgabgra + "trn2 z5.s, z20.s, z21.s \n" // brgabgra + "trn1 z6.s, z22.s, z23.s \n" // brgabgra + "trn2 z7.s, z22.s, z23.s \n" // brgabgra + + "subs %w[width], %w[width], %w[vl] \n" // 4*VL per loop + + "urhadd z0.h, p0/m, z0.h, z1.h \n" // brgabrga + "urhadd z2.h, p0/m, z2.h, z3.h \n" // brgabrga + "urhadd z4.h, p0/m, z4.h, z5.h \n" // brgabrga + "urhadd z6.h, p0/m, z6.h, z7.h \n" // brgabrga + + "movi v16.8h, #0 \n" + "movi v17.8h, #0 \n" + "movi v18.8h, #0 \n" + "movi v19.8h, #0 \n" + + "movi v20.8h, #0 \n" + "movi v21.8h, #0 \n" + "movi v22.8h, #0 \n" + "movi v23.8h, #0 \n" + + "sdot z16.d, z0.h, z24.h \n" // UUxxxxxx + "sdot z17.d, z2.h, z24.h \n" // UUxxxxxx + "sdot z18.d, z4.h, z24.h \n" // UUxxxxxx + "sdot z19.d, z6.h, z24.h \n" // UUxxxxxx + + "sdot z20.d, z0.h, z25.h \n" // VVxxxxxx + "sdot z21.d, z2.h, z25.h \n" // VVxxxxxx + "sdot z22.d, z4.h, z25.h \n" // VVxxxxxx + "sdot z23.d, z6.h, z25.h \n" // VVxxxxxx + + "uzp1 z16.s, z16.s, z17.s \n" // UUxx + "uzp1 z18.s, z18.s, z19.s \n" // UUxx + "uzp1 z20.s, z20.s, z21.s \n" // VVxx + "uzp1 z22.s, z22.s, z23.s \n" // VVxx + + "uzp1 z16.h, z16.h, z18.h \n" // UU + "uzp1 z20.h, z20.h, z22.h \n" // VV + + "addhnb z16.b, z16.h, z26.h \n" // U + "addhnb z20.b, z20.h, z26.h \n" // V + + "st1b {z16.h}, p0, [%[dst_u]] \n" // U + "st1b {z20.h}, p0, [%[dst_v]] \n" // V + "inch %[dst_u] \n" + "inch %[dst_v] \n" + + "b.ge 1b \n" + + "2: \n" + "adds %w[width], %w[width], %w[vl] \n" // VL per loop + "b.le 99f \n" + + // Process remaining pixels from each input row. + // Use predication to do one vector from each input array, so may loop up + // to three iterations. 
+ "cntw %x[vl] \n" + + "3: \n" + "whilelt p1.s, wzr, %w[width] \n" + "ld1d {z0.d}, p1/z, [%[src0]] \n" // bgrabgra + "ld1d {z4.d}, p1/z, [%[src1]] \n" // bgrabgra + "incb %[src0] \n" + "incb %[src1] \n" + + "uaddlb z16.h, z0.b, z4.b \n" // brbrbrbr + "uaddlt z17.h, z0.b, z4.b \n" // gagagaga + + "trn1 z0.s, z16.s, z17.s \n" // brgabgra + "trn2 z1.s, z16.s, z17.s \n" // brgabgra + + "urhadd z0.h, p0/m, z0.h, z1.h \n" // brgabrga + + "subs %w[width], %w[width], %w[vl] \n" // VL per loop + + "movi v16.8h, #0 \n" + "movi v20.8h, #0 \n" + + "sdot z16.d, z0.h, z24.h \n" + "sdot z20.d, z0.h, z25.h \n" + + "addhnb z16.b, z16.h, z26.h \n" // U + "addhnb z20.b, z20.h, z26.h \n" // V + + "st1b {z16.d}, p0, [%[dst_u]] \n" // U + "st1b {z20.d}, p0, [%[dst_v]] \n" // V + "incd %[dst_u] \n" + "incd %[dst_v] \n" + "b.gt 3b \n" + + "99: \n" + : [src0] "+r"(src_argb), // %[src0] + [src1] "+r"(src_argb_1), // %[src1] + [dst_u] "+r"(dst_u), // %[dst_u] + [dst_v] "+r"(dst_v), // %[dst_v] + [width] "+r"(width), // %[width] + [vl] "=&r"(vl) // %[vl] + : [uvconstants] "r"(uvconstants) + : "cc", "memory", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z16", + "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", + "p0"); +} + +void ARGBToUVRow_SVE2(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + ARGBToUVMatrixRow_SVE2(src_argb, src_stride_argb, dst_u, dst_v, width, + kArgbToUvArr); +} + +void ARGBToUVJRow_SVE2(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + ARGBToUVMatrixRow_SVE2(src_argb, src_stride_argb, dst_u, dst_v, width, + kArgbToUvjArr); +} + +void ABGRToUVJRow_SVE2(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_uj, + uint8_t* dst_vj, + int width) { + ARGBToUVMatrixRow_SVE2(src_abgr, src_stride_abgr, dst_uj, dst_vj, width, + kAbgrToUvjArr); +} + +void BGRAToUVRow_SVE2(const uint8_t* src_bgra, + int src_stride_bgra, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + ARGBToUVMatrixRow_SVE2(src_bgra, src_stride_bgra, dst_u, dst_v, width, + kBgraToUvArr); +} + +void ABGRToUVRow_SVE2(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + ARGBToUVMatrixRow_SVE2(src_abgr, src_stride_abgr, dst_u, dst_v, width, + kAbgrToUvArr); +} + +void RGBAToUVRow_SVE2(const uint8_t* src_rgba, + int src_stride_rgba, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + ARGBToUVMatrixRow_SVE2(src_rgba, src_stride_rgba, dst_u, dst_v, width, + kRgbaToUvArr); +} + #endif // !defined(LIBYUV_DISABLE_SVE) && defined(__aarch64__) #ifdef __cplusplus