From 6f1d8b1e11868bdcff72eeaf7e0a80fd82fde929 Mon Sep 17 00:00:00 2001
From: George Steed
Date: Fri, 12 Apr 2024 16:44:54 +0100
Subject: [PATCH] [AArch64] Add SVE2 implementations for ARGBToUVRow and similar

By keeping the data in its interleaved format we can use a common kernel
for all input channel orderings, simply passing a different vector of
constants for each ordering.

A similar approach is possible with only Neon by making use of multiplies
and repeated application of ADDP to combine channels; however, this is
slower on older cores like Cortex-A53, so it is not pursued further.

For odd widths we need a slightly different implementation for the final
element, so introduce an "any" kernel to handle it rather than bloating
the code for the common case.

Observed effect on runtimes compared to the existing Neon kernels:

              | Cortex-A510 | Cortex-A720 | Cortex-X2
ABGRToUVJRow  |      -15.5% |       +5.4% |    -33.1%
ABGRToUVRow   |      -15.6% |       +5.3% |    -35.9%
ARGBToUVJRow  |      -10.1% |       +5.4% |    -32.7%
ARGBToUVRow   |      -10.1% |       +5.4% |    -29.3%
BGRAToUVRow   |      -15.5% |       +4.6% |    -32.8%
RGBAToUVRow   |      -10.1% |       +4.2% |    -36.0%

Bug: libyuv:973
Change-Id: I041ca44db0ae8a2adffcdf24e822eebe962baf33
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5505537
Reviewed-by: Frank Barchard
Commit-Queue: Frank Barchard
---
 include/libyuv/row.h        |  70 +++++++++-
 source/convert.cc           |  40 ++++++
 source/convert_from_argb.cc |  88 ++++++++++++
 source/row_any.cc           |  18 +++
 source/row_sve.cc           | 237 ++++++++++++++++++++++++++++++++++++
 5 files changed, 451 insertions(+), 2 deletions(-)

diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 93feb0c6d..43ffe247c 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -564,10 +564,16 @@ extern "C" {
 
 // The following are available on AArch64 SVE platforms:
 #if !defined(LIBYUV_DISABLE_SVE) && defined(__aarch64__)
-#define HAS_I444TOARGBROW_SVE2
+#define HAS_ABGRTOUVJROW_SVE2
+#define HAS_ABGRTOUVROW_SVE2
+#define HAS_ARGBTOUVJROW_SVE2
+#define HAS_ARGBTOUVROW_SVE2
+#define HAS_BGRATOUVROW_SVE2
+#define HAS_I422ALPHATOARGBROW_SVE2
 #define HAS_I422TOARGBROW_SVE2
 #define HAS_I444ALPHATOARGBROW_SVE2
-#define HAS_I422ALPHATOARGBROW_SVE2
+#define HAS_I444TOARGBROW_SVE2
+#define HAS_RGBATOUVROW_SVE2
 #endif
 
 // The following are available on AArch64 platforms:
@@ -1489,6 +1495,11 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width);
+void ARGBToUVRow_SVE2(const uint8_t* src_argb,
+                      int src_stride_argb,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width);
 void ARGBToUV444Row_MSA(const uint8_t* src_argb,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
@@ -1521,26 +1532,51 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb,
                        uint8_t* dst_u,
                        uint8_t* dst_v,
                        int width);
+void ARGBToUVJRow_SVE2(const uint8_t* src_argb,
+                       int src_stride_argb,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
+                       int width);
 void ABGRToUVJRow_NEON(const uint8_t* src_abgr,
                        int src_stride_abgr,
                        uint8_t* dst_uj,
                        uint8_t* dst_vj,
                        int width);
+void ABGRToUVJRow_SVE2(const uint8_t* src_abgr,
+                       int src_stride_abgr,
+                       uint8_t* dst_uj,
+                       uint8_t* dst_vj,
+                       int width);
 void BGRAToUVRow_NEON(const uint8_t* src_bgra,
                       int src_stride_bgra,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width);
+void BGRAToUVRow_SVE2(const uint8_t* src_bgra,
+                      int src_stride_bgra,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width);
 void ABGRToUVRow_NEON(const uint8_t* src_abgr,
                       int src_stride_abgr,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width);
+void ABGRToUVRow_SVE2(const uint8_t* src_abgr,
+                      int src_stride_abgr,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+ int width); void RGBAToUVRow_NEON(const uint8_t* src_rgba, int src_stride_rgba, uint8_t* dst_u, uint8_t* dst_v, int width); +void RGBAToUVRow_SVE2(const uint8_t* src_rgba, + int src_stride_rgba, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void RGB24ToUVRow_NEON(const uint8_t* src_rgb24, int src_stride_rgb24, uint8_t* dst_u, @@ -1966,6 +2002,11 @@ void ARGBToUVRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); +void ARGBToUVRow_Any_SVE2(const uint8_t* src_ptr, + int src_stride, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void ARGBToUV444Row_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, @@ -1998,26 +2039,51 @@ void ARGBToUVJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); +void ARGBToUVJRow_Any_SVE2(const uint8_t* src_ptr, + int src_stride, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void ABGRToUVJRow_Any_NEON(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); +void ABGRToUVJRow_Any_SVE2(const uint8_t* src_ptr, + int src_stride, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void BGRAToUVRow_Any_NEON(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); +void BGRAToUVRow_Any_SVE2(const uint8_t* src_ptr, + int src_stride, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void ABGRToUVRow_Any_NEON(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); +void ABGRToUVRow_Any_SVE2(const uint8_t* src_ptr, + int src_stride, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void RGBAToUVRow_Any_NEON(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); +void RGBAToUVRow_Any_SVE2(const uint8_t* src_ptr, + int src_stride, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void RGB24ToUVRow_Any_NEON(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u, diff --git a/source/convert.cc b/source/convert.cc index e852a90cd..fdd0cb644 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -1920,6 +1920,14 @@ int ARGBToI420(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOUVROW_SVE2) + if (TestCpuFlag(kCpuHasSVE2)) { + ARGBToUVRow = ARGBToUVRow_Any_SVE2; + if (IS_ALIGNED(width, 2)) { + ARGBToUVRow = ARGBToUVRow_SVE2; + } + } +#endif #if defined(HAS_ARGBTOYROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToYRow = ARGBToYRow_Any_SSSE3; @@ -2090,6 +2098,14 @@ int ARGBToI420Alpha(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOUVROW_SVE2) + if (TestCpuFlag(kCpuHasSVE2)) { + ARGBToUVRow = ARGBToUVRow_Any_SVE2; + if (IS_ALIGNED(width, 2)) { + ARGBToUVRow = ARGBToUVRow_SVE2; + } + } +#endif #if defined(HAS_ARGBTOYROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToYRow = ARGBToYRow_Any_SSSE3; @@ -2261,6 +2277,14 @@ int BGRAToI420(const uint8_t* src_bgra, } } #endif +#if defined(HAS_BGRATOUVROW_SVE2) + if (TestCpuFlag(kCpuHasSVE2)) { + BGRAToUVRow = BGRAToUVRow_Any_SVE2; + if (IS_ALIGNED(width, 2)) { + BGRAToUVRow = BGRAToUVRow_SVE2; + } + } +#endif #if defined(HAS_BGRATOYROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { BGRAToYRow = BGRAToYRow_Any_SSSE3; @@ -2428,6 +2452,14 @@ int ABGRToI420(const uint8_t* src_abgr, } } #endif +#if defined(HAS_ABGRTOUVROW_SVE2) + if (TestCpuFlag(kCpuHasSVE2)) { + ABGRToUVRow = ABGRToUVRow_Any_SVE2; + if (IS_ALIGNED(width, 2)) { + ABGRToUVRow = ABGRToUVRow_SVE2; + } + } +#endif #if defined(HAS_ABGRTOYROW_MSA) && defined(HAS_ABGRTOUVROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ABGRToYRow = ABGRToYRow_Any_MSA; @@ -2545,6 +2577,14 @@ int RGBAToI420(const 
uint8_t* src_rgba, } } #endif +#if defined(HAS_RGBATOUVROW_SVE2) + if (TestCpuFlag(kCpuHasSVE2)) { + RGBAToUVRow = RGBAToUVRow_Any_SVE2; + if (IS_ALIGNED(width, 2)) { + RGBAToUVRow = RGBAToUVRow_SVE2; + } + } +#endif #if defined(HAS_RGBATOYROW_MSA) && defined(HAS_RGBATOUVROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { RGBAToYRow = RGBAToYRow_Any_MSA; diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc index 6c361c05a..c684ac00d 100644 --- a/source/convert_from_argb.cc +++ b/source/convert_from_argb.cc @@ -255,6 +255,14 @@ int ARGBToI422(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOUVROW_SVE2) + if (TestCpuFlag(kCpuHasSVE2)) { + ARGBToUVRow = ARGBToUVRow_Any_SVE2; + if (IS_ALIGNED(width, 2)) { + ARGBToUVRow = ARGBToUVRow_SVE2; + } + } +#endif #if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ARGBToYRow = ARGBToYRow_Any_MSA; @@ -363,6 +371,14 @@ int ARGBToNV12(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOUVROW_SVE2) + if (TestCpuFlag(kCpuHasSVE2)) { + ARGBToUVRow = ARGBToUVRow_Any_SVE2; + if (IS_ALIGNED(width, 2)) { + ARGBToUVRow = ARGBToUVRow_SVE2; + } + } +#endif #if defined(HAS_ARGBTOYROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToYRow = ARGBToYRow_Any_SSSE3; @@ -593,6 +609,14 @@ int ARGBToNV21(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOUVROW_SVE2) + if (TestCpuFlag(kCpuHasSVE2)) { + ARGBToUVRow = ARGBToUVRow_Any_SVE2; + if (IS_ALIGNED(width, 2)) { + ARGBToUVRow = ARGBToUVRow_SVE2; + } + } +#endif #if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ARGBToYRow = ARGBToYRow_Any_MSA; @@ -800,6 +824,14 @@ int ABGRToNV12(const uint8_t* src_abgr, } } #endif +#if defined(HAS_ABGRTOUVROW_SVE2) + if (TestCpuFlag(kCpuHasSVE2)) { + ABGRToUVRow = ABGRToUVRow_Any_SVE2; + if (IS_ALIGNED(width, 2)) { + ABGRToUVRow = ABGRToUVRow_SVE2; + } + } +#endif #if defined(HAS_ABGRTOYROW_MSA) && defined(HAS_ABGRTOUVROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ABGRToYRow = ABGRToYRow_Any_MSA; @@ -996,6 +1028,14 @@ int ABGRToNV21(const uint8_t* src_abgr, } } #endif +#if defined(HAS_ABGRTOUVROW_SVE2) + if (TestCpuFlag(kCpuHasSVE2)) { + ABGRToUVRow = ABGRToUVRow_Any_SVE2; + if (IS_ALIGNED(width, 2)) { + ABGRToUVRow = ABGRToUVRow_SVE2; + } + } +#endif #if defined(HAS_ABGRTOYROW_MSA) && defined(HAS_ABGRTOUVROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ABGRToYRow = ABGRToYRow_Any_MSA; @@ -1197,6 +1237,14 @@ int ARGBToYUY2(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOUVROW_SVE2) + if (TestCpuFlag(kCpuHasSVE2)) { + ARGBToUVRow = ARGBToUVRow_Any_SVE2; + if (IS_ALIGNED(width, 2)) { + ARGBToUVRow = ARGBToUVRow_SVE2; + } + } +#endif #if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ARGBToYRow = ARGBToYRow_Any_MSA; @@ -1401,6 +1449,14 @@ int ARGBToUYVY(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOUVROW_SVE2) + if (TestCpuFlag(kCpuHasSVE2)) { + ARGBToUVRow = ARGBToUVRow_Any_SVE2; + if (IS_ALIGNED(width, 2)) { + ARGBToUVRow = ARGBToUVRow_SVE2; + } + } +#endif #if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ARGBToYRow = ARGBToYRow_Any_MSA; @@ -2336,6 +2392,14 @@ int ARGBToJ420(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOUVJROW_SVE2) + if (TestCpuFlag(kCpuHasSVE2)) { + ARGBToUVJRow = ARGBToUVJRow_Any_SVE2; + if (IS_ALIGNED(width, 2)) { + ARGBToUVJRow = ARGBToUVJRow_SVE2; + } + } +#endif #if defined(HAS_ARGBTOYJROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToYJRow = 
ARGBToYJRow_Any_SSSE3; @@ -2512,6 +2576,14 @@ int ARGBToJ422(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOUVJROW_SVE2) + if (TestCpuFlag(kCpuHasSVE2)) { + ARGBToUVJRow = ARGBToUVJRow_Any_SVE2; + if (IS_ALIGNED(width, 2)) { + ARGBToUVJRow = ARGBToUVJRow_SVE2; + } + } +#endif #if defined(HAS_ARGBTOYJROW_MSA) && defined(HAS_ARGBTOUVJROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ARGBToYJRow = ARGBToYJRow_Any_MSA; @@ -2818,6 +2890,14 @@ int ABGRToJ420(const uint8_t* src_abgr, } } #endif +#if defined(HAS_ABGRTOUVJROW_SVE2) + if (TestCpuFlag(kCpuHasSVE2)) { + ABGRToUVJRow = ABGRToUVJRow_Any_SVE2; + if (IS_ALIGNED(width, 2)) { + ABGRToUVJRow = ABGRToUVJRow_SVE2; + } + } +#endif #if defined(HAS_ABGRTOYJROW_MSA) && defined(HAS_ABGRTOUVJROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ABGRToYJRow = ABGRToYJRow_Any_MSA; @@ -2956,6 +3036,14 @@ int ABGRToJ422(const uint8_t* src_abgr, } } #endif +#if defined(HAS_ABGRTOUVJROW_SVE2) + if (TestCpuFlag(kCpuHasSVE2)) { + ABGRToUVJRow = ABGRToUVJRow_Any_SVE2; + if (IS_ALIGNED(width, 2)) { + ABGRToUVJRow = ABGRToUVJRow_SVE2; + } + } +#endif #if defined(HAS_ABGRTOYJROW_MSA) && defined(HAS_ABGRTOUVJROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ABGRToYJRow = ABGRToYJRow_Any_MSA; diff --git a/source/row_any.cc b/source/row_any.cc index 8ed5a49c2..a466e3428 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -2225,6 +2225,9 @@ ANY12S(UYVYToUVRow_Any_SSE2, UYVYToUVRow_SSE2, 1, 4, 15) #ifdef HAS_ARGBTOUVROW_NEON ANY12S(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, 0, 4, 15) #endif +#ifdef HAS_ARGBTOUVROW_SVE2 +ANY12S(ARGBToUVRow_Any_SVE2, ARGBToUVRow_SVE2, 0, 4, 1) +#endif #ifdef HAS_ARGBTOUVROW_MSA ANY12S(ARGBToUVRow_Any_MSA, ARGBToUVRow_MSA, 0, 4, 31) #endif @@ -2237,9 +2240,15 @@ ANY12S(ARGBToUVRow_Any_LASX, ARGBToUVRow_LASX, 0, 4, 31) #ifdef HAS_ARGBTOUVJROW_NEON ANY12S(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, 0, 4, 15) #endif +#ifdef HAS_ARGBTOUVJROW_SVE2 +ANY12S(ARGBToUVJRow_Any_SVE2, ARGBToUVJRow_SVE2, 0, 4, 1) +#endif #ifdef HAS_ABGRTOUVJROW_NEON ANY12S(ABGRToUVJRow_Any_NEON, ABGRToUVJRow_NEON, 0, 4, 15) #endif +#ifdef HAS_ABGRTOUVJROW_SVE2 +ANY12S(ABGRToUVJRow_Any_SVE2, ABGRToUVJRow_SVE2, 0, 4, 1) +#endif #ifdef HAS_ARGBTOUVJROW_MSA ANY12S(ARGBToUVJRow_Any_MSA, ARGBToUVJRow_MSA, 0, 4, 31) #endif @@ -2252,6 +2261,9 @@ ANY12S(ARGBToUVJRow_Any_LASX, ARGBToUVJRow_LASX, 0, 4, 31) #ifdef HAS_BGRATOUVROW_NEON ANY12S(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, 0, 4, 15) #endif +#ifdef HAS_BGRATOUVROW_SVE2 +ANY12S(BGRAToUVRow_Any_SVE2, BGRAToUVRow_SVE2, 0, 4, 1) +#endif #ifdef HAS_BGRATOUVROW_MSA ANY12S(BGRAToUVRow_Any_MSA, BGRAToUVRow_MSA, 0, 4, 15) #endif @@ -2261,6 +2273,9 @@ ANY12S(BGRAToUVRow_Any_LSX, BGRAToUVRow_LSX, 0, 4, 15) #ifdef HAS_ABGRTOUVROW_NEON ANY12S(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, 0, 4, 15) #endif +#ifdef HAS_ABGRTOUVROW_SVE2 +ANY12S(ABGRToUVRow_Any_SVE2, ABGRToUVRow_SVE2, 0, 4, 1) +#endif #ifdef HAS_ABGRTOUVROW_MSA ANY12S(ABGRToUVRow_Any_MSA, ABGRToUVRow_MSA, 0, 4, 15) #endif @@ -2270,6 +2285,9 @@ ANY12S(ABGRToUVRow_Any_LSX, ABGRToUVRow_LSX, 0, 4, 15) #ifdef HAS_RGBATOUVROW_NEON ANY12S(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, 0, 4, 15) #endif +#ifdef HAS_RGBATOUVROW_SVE2 +ANY12S(RGBAToUVRow_Any_SVE2, RGBAToUVRow_SVE2, 0, 4, 1) +#endif #ifdef HAS_RGBATOUVROW_MSA ANY12S(RGBAToUVRow_Any_MSA, RGBAToUVRow_MSA, 0, 4, 15) #endif diff --git a/source/row_sve.cc b/source/row_sve.cc index a7048b65a..ff65af0ab 100644 --- a/source/row_sve.cc +++ b/source/row_sve.cc @@ -262,6 +262,243 @@ void I422AlphaToARGBRow_SVE2(const uint8_t* src_y, : "cc", "memory", 
YUVTORGB_SVE_REGS); } +// Dot-product constants are stored as four-tuples with the two innermost +// elements flipped to account for the interleaving nature of the widening +// addition instructions. + +static const int16_t kArgbToUvArr[] = { + // UB, -UR, -UG, 0, -VB, VR, -VG, 0 + 56, -19, -37, 0, -9, 56, -47, 0, +}; + +static const int16_t kRgbaToUvArr[] = { + // 0, -UG, UB, -UR, 0, -VG, -VB, VR + 0, -37, 56, -19, 0, -47, -9, 56, +}; + +static const int16_t kBgraToUvArr[] = { + // 0, -UG, -UR, UB, 0, -VG, VR, -VB + 0, -37, -19, 56, 0, -47, 56, -9, +}; + +static const int16_t kAbgrToUvArr[] = { + // -UR, UB, -UG, 0, VR, -VB, -VG, 0 + -19, 56, -37, 0, 56, -9, -47, 0, +}; + +static const int16_t kArgbToUvjArr[] = { + // UB, -UR, -UG, 0, -VB, VR, -VG, 0 + 63, -21, -42, 0, -10, 63, -53, 0, +}; + +static const int16_t kAbgrToUvjArr[] = { + // -UR, UB, -UG, 0, VR, -VB, -VG, 0 + -21, 63, -42, 0, 63, -10, -53, 0, +}; + +void ARGBToUVMatrixRow_SVE2(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width, + const int16_t* uvconstants) { + const uint8_t* src_argb_1 = src_argb + src_stride_argb; + uint64_t vl; + asm volatile( + "ptrue p0.b \n" + "ld1rd {z24.d}, p0/z, [%[uvconstants]] \n" + "ld1rd {z25.d}, p0/z, [%[uvconstants], #8] \n" + "mov z26.b, #0x80 \n" + + "cntb %[vl] \n" + "subs %w[width], %w[width], %w[vl] \n" + "b.lt 2f \n" + + // Process 4x vectors from each input row per iteration. + // Cannot use predication here due to unrolling. + "1: \n" // e.g. + "ld1b {z0.b}, p0/z, [%[src0], #0, mul vl] \n" // bgrabgra + "ld1b {z4.b}, p0/z, [%[src1], #0, mul vl] \n" // bgrabgra + "ld1b {z1.b}, p0/z, [%[src0], #1, mul vl] \n" // bgrabgra + "ld1b {z5.b}, p0/z, [%[src1], #1, mul vl] \n" // bgrabgra + "ld1b {z2.b}, p0/z, [%[src0], #2, mul vl] \n" // bgrabgra + "ld1b {z6.b}, p0/z, [%[src1], #2, mul vl] \n" // bgrabgra + "ld1b {z3.b}, p0/z, [%[src0], #3, mul vl] \n" // bgrabgra + "ld1b {z7.b}, p0/z, [%[src1], #3, mul vl] \n" // bgrabgra + "incb %[src0], all, mul #4 \n" + "incb %[src1], all, mul #4 \n" + + "uaddlb z16.h, z0.b, z4.b \n" // brbrbrbr + "uaddlt z17.h, z0.b, z4.b \n" // gagagaga + "uaddlb z18.h, z1.b, z5.b \n" // brbrbrbr + "uaddlt z19.h, z1.b, z5.b \n" // gagagaga + "uaddlb z20.h, z2.b, z6.b \n" // brbrbrbr + "uaddlt z21.h, z2.b, z6.b \n" // gagagaga + "uaddlb z22.h, z3.b, z7.b \n" // brbrbrbr + "uaddlt z23.h, z3.b, z7.b \n" // gagagaga + + "trn1 z0.s, z16.s, z17.s \n" // brgabgra + "trn2 z1.s, z16.s, z17.s \n" // brgabgra + "trn1 z2.s, z18.s, z19.s \n" // brgabgra + "trn2 z3.s, z18.s, z19.s \n" // brgabgra + "trn1 z4.s, z20.s, z21.s \n" // brgabgra + "trn2 z5.s, z20.s, z21.s \n" // brgabgra + "trn1 z6.s, z22.s, z23.s \n" // brgabgra + "trn2 z7.s, z22.s, z23.s \n" // brgabgra + + "subs %w[width], %w[width], %w[vl] \n" // 4*VL per loop + + "urhadd z0.h, p0/m, z0.h, z1.h \n" // brgabrga + "urhadd z2.h, p0/m, z2.h, z3.h \n" // brgabrga + "urhadd z4.h, p0/m, z4.h, z5.h \n" // brgabrga + "urhadd z6.h, p0/m, z6.h, z7.h \n" // brgabrga + + "movi v16.8h, #0 \n" + "movi v17.8h, #0 \n" + "movi v18.8h, #0 \n" + "movi v19.8h, #0 \n" + + "movi v20.8h, #0 \n" + "movi v21.8h, #0 \n" + "movi v22.8h, #0 \n" + "movi v23.8h, #0 \n" + + "sdot z16.d, z0.h, z24.h \n" // UUxxxxxx + "sdot z17.d, z2.h, z24.h \n" // UUxxxxxx + "sdot z18.d, z4.h, z24.h \n" // UUxxxxxx + "sdot z19.d, z6.h, z24.h \n" // UUxxxxxx + + "sdot z20.d, z0.h, z25.h \n" // VVxxxxxx + "sdot z21.d, z2.h, z25.h \n" // VVxxxxxx + "sdot z22.d, z4.h, z25.h \n" // VVxxxxxx + "sdot z23.d, z6.h, z25.h \n" // VVxxxxxx 
+ + "uzp1 z16.s, z16.s, z17.s \n" // UUxx + "uzp1 z18.s, z18.s, z19.s \n" // UUxx + "uzp1 z20.s, z20.s, z21.s \n" // VVxx + "uzp1 z22.s, z22.s, z23.s \n" // VVxx + + "uzp1 z16.h, z16.h, z18.h \n" // UU + "uzp1 z20.h, z20.h, z22.h \n" // VV + + "addhnb z16.b, z16.h, z26.h \n" // U + "addhnb z20.b, z20.h, z26.h \n" // V + + "st1b {z16.h}, p0, [%[dst_u]] \n" // U + "st1b {z20.h}, p0, [%[dst_v]] \n" // V + "inch %[dst_u] \n" + "inch %[dst_v] \n" + + "b.ge 1b \n" + + "2: \n" + "adds %w[width], %w[width], %w[vl] \n" // VL per loop + "b.le 99f \n" + + // Process remaining pixels from each input row. + // Use predication to do one vector from each input array, so may loop up + // to three iterations. + "cntw %x[vl] \n" + + "3: \n" + "whilelt p1.s, wzr, %w[width] \n" + "ld1d {z0.d}, p1/z, [%[src0]] \n" // bgrabgra + "ld1d {z4.d}, p1/z, [%[src1]] \n" // bgrabgra + "incb %[src0] \n" + "incb %[src1] \n" + + "uaddlb z16.h, z0.b, z4.b \n" // brbrbrbr + "uaddlt z17.h, z0.b, z4.b \n" // gagagaga + + "trn1 z0.s, z16.s, z17.s \n" // brgabgra + "trn2 z1.s, z16.s, z17.s \n" // brgabgra + + "urhadd z0.h, p0/m, z0.h, z1.h \n" // brgabrga + + "subs %w[width], %w[width], %w[vl] \n" // VL per loop + + "movi v16.8h, #0 \n" + "movi v20.8h, #0 \n" + + "sdot z16.d, z0.h, z24.h \n" + "sdot z20.d, z0.h, z25.h \n" + + "addhnb z16.b, z16.h, z26.h \n" // U + "addhnb z20.b, z20.h, z26.h \n" // V + + "st1b {z16.d}, p0, [%[dst_u]] \n" // U + "st1b {z20.d}, p0, [%[dst_v]] \n" // V + "incd %[dst_u] \n" + "incd %[dst_v] \n" + "b.gt 3b \n" + + "99: \n" + : [src0] "+r"(src_argb), // %[src0] + [src1] "+r"(src_argb_1), // %[src1] + [dst_u] "+r"(dst_u), // %[dst_u] + [dst_v] "+r"(dst_v), // %[dst_v] + [width] "+r"(width), // %[width] + [vl] "=&r"(vl) // %[vl] + : [uvconstants] "r"(uvconstants) + : "cc", "memory", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z16", + "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", + "p0"); +} + +void ARGBToUVRow_SVE2(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + ARGBToUVMatrixRow_SVE2(src_argb, src_stride_argb, dst_u, dst_v, width, + kArgbToUvArr); +} + +void ARGBToUVJRow_SVE2(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + ARGBToUVMatrixRow_SVE2(src_argb, src_stride_argb, dst_u, dst_v, width, + kArgbToUvjArr); +} + +void ABGRToUVJRow_SVE2(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_uj, + uint8_t* dst_vj, + int width) { + ARGBToUVMatrixRow_SVE2(src_abgr, src_stride_abgr, dst_uj, dst_vj, width, + kAbgrToUvjArr); +} + +void BGRAToUVRow_SVE2(const uint8_t* src_bgra, + int src_stride_bgra, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + ARGBToUVMatrixRow_SVE2(src_bgra, src_stride_bgra, dst_u, dst_v, width, + kBgraToUvArr); +} + +void ABGRToUVRow_SVE2(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + ARGBToUVMatrixRow_SVE2(src_abgr, src_stride_abgr, dst_u, dst_v, width, + kAbgrToUvArr); +} + +void RGBAToUVRow_SVE2(const uint8_t* src_rgba, + int src_stride_rgba, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + ARGBToUVMatrixRow_SVE2(src_rgba, src_stride_rgba, dst_u, dst_v, width, + kRgbaToUvArr); +} + #endif // !defined(LIBYUV_DISABLE_SVE) && defined(__aarch64__) #ifdef __cplusplus