From 007b920232baa76e8f37ad1a184409a5ecb70836 Mon Sep 17 00:00:00 2001 From: George Steed Date: Fri, 13 Jun 2025 09:13:02 +0100 Subject: [PATCH] [AArch64] Add SME implementation of ARGBToUVRow and similar Mostly just a straightforward copy of the existing SVE2 code ported to Streaming-SVE. Introduce new "any" kernels for non-multiple of two cases, matching what we already do for SVE2. The existing SVE2 code makes use of the Neon MOVI instruction that is not supported in Streaming-SVE, so adjust the code to use FMOV instead which has the same performance characteristics. Change-Id: I74b7ea1fe8e6af75dfaf92826a4de775a1559f77 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6663806 Reviewed-by: Justin Green Reviewed-by: Frank Barchard --- include/libyuv/row.h | 66 ++++++++++++ include/libyuv/row_sve.h | 173 +++++++++++++++++++++++++++++++ source/convert.cc | 40 ++++++++ source/convert_from_argb.cc | 88 ++++++++++++++++ source/row_any.cc | 18 ++++ source/row_sme.cc | 54 ++++++++++ source/row_sve.cc | 196 +++--------------------------------- 7 files changed, 451 insertions(+), 184 deletions(-) diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 0441a43fd..f89217792 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -615,7 +615,12 @@ extern "C" { // The following are available on AArch64 SME platforms: #if !defined(LIBYUV_DISABLE_SME) && defined(CLANG_HAS_SME) && \ defined(__aarch64__) +#define HAS_ABGRTOUVJROW_SME +#define HAS_ABGRTOUVROW_SME #define HAS_ARGBMULTIPLYROW_SME +#define HAS_ARGBTOUVJROW_SME +#define HAS_ARGBTOUVROW_SME +#define HAS_BGRATOUVROW_SME #define HAS_CONVERT16TO8ROW_SME #define HAS_CONVERT8TO16ROW_SME #define HAS_CONVERT8TO8ROW_SME @@ -654,6 +659,7 @@ extern "C" { #define HAS_P210TOARGBROW_SME #define HAS_P410TOAR30ROW_SME #define HAS_P410TOARGBROW_SME +#define HAS_RGBATOUVROW_SME #define HAS_YUY2TOARGBROW_SME #endif @@ -1954,6 +1960,11 @@ void ARGBToUVRow_SVE2(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width); +void ARGBToUVRow_SME(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void ARGBToUV444Row_MSA(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, @@ -1996,6 +2007,11 @@ void ARGBToUVJRow_SVE2(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width); +void ARGBToUVJRow_SME(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void ABGRToUVJRow_NEON(const uint8_t* src_abgr, int src_stride_abgr, uint8_t* dst_uj, @@ -2011,6 +2027,11 @@ void ABGRToUVJRow_SVE2(const uint8_t* src_abgr, uint8_t* dst_uj, uint8_t* dst_vj, int width); +void ABGRToUVJRow_SME(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_uj, + uint8_t* dst_vj, + int width); void BGRAToUVRow_NEON(const uint8_t* src_bgra, int src_stride_bgra, uint8_t* dst_u, @@ -2026,6 +2047,11 @@ void BGRAToUVRow_SVE2(const uint8_t* src_bgra, uint8_t* dst_u, uint8_t* dst_v, int width); +void BGRAToUVRow_SME(const uint8_t* src_bgra, + int src_stride_bgra, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void ABGRToUVRow_NEON(const uint8_t* src_abgr, int src_stride_abgr, uint8_t* dst_u, @@ -2041,6 +2067,11 @@ void ABGRToUVRow_SVE2(const uint8_t* src_abgr, uint8_t* dst_u, uint8_t* dst_v, int width); +void ABGRToUVRow_SME(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void RGBAToUVRow_NEON(const uint8_t* src_rgba, int src_stride_rgba, uint8_t* dst_u, @@ -2056,6 +2087,11 @@ void 
RGBAToUVRow_SVE2(const uint8_t* src_rgba, uint8_t* dst_u, uint8_t* dst_v, int width); +void RGBAToUVRow_SME(const uint8_t* src_rgba, + int src_stride_rgba, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void RGB24ToUVRow_NEON(const uint8_t* src_rgb24, int src_stride_rgb24, uint8_t* dst_u, @@ -2503,6 +2539,11 @@ void ARGBToUVRow_Any_SVE2(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); +void ARGBToUVRow_Any_SME(const uint8_t* src_ptr, + int src_stride, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void ARGBToUV444Row_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, @@ -2545,6 +2586,11 @@ void ARGBToUVJRow_Any_SVE2(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); +void ARGBToUVJRow_Any_SME(const uint8_t* src_ptr, + int src_stride, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void ABGRToUVJRow_Any_NEON(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u, @@ -2560,6 +2606,11 @@ void ABGRToUVJRow_Any_SVE2(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); +void ABGRToUVJRow_Any_SME(const uint8_t* src_ptr, + int src_stride, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void BGRAToUVRow_Any_NEON(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u, @@ -2575,6 +2626,11 @@ void BGRAToUVRow_Any_SVE2(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); +void BGRAToUVRow_Any_SME(const uint8_t* src_ptr, + int src_stride, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void ABGRToUVRow_Any_NEON(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u, @@ -2590,6 +2646,11 @@ void ABGRToUVRow_Any_SVE2(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); +void ABGRToUVRow_Any_SME(const uint8_t* src_ptr, + int src_stride, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void RGBAToUVRow_Any_NEON(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u, @@ -2605,6 +2666,11 @@ void RGBAToUVRow_Any_SVE2(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); +void RGBAToUVRow_Any_SME(const uint8_t* src_ptr, + int src_stride, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void RGB24ToUVRow_Any_NEON(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u, diff --git a/include/libyuv/row_sve.h b/include/libyuv/row_sve.h index ece62bcc5..3feed1045 100644 --- a/include/libyuv/row_sve.h +++ b/include/libyuv/row_sve.h @@ -1971,6 +1971,179 @@ static inline void Convert8To8Row_SVE_SC(const uint8_t* src_y, : "cc", "memory", "z0", "z1", "z2", "z3", "p0", "p1"); } +// SVE constants are stored negated such that we can store 128 in int8_t. 
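+// (An int8_t can hold -128 but not +128, and the JPEG UB/VR weights below
+// are 128.) USDOT therefore accumulates the negated weighted sum of the
+// 2x2-averaged B/G/R values, and SUBHNB against the 0x8000 bias vector
+// computes (0x8000 - sum) >> 8, undoing the negation and adding the +128
+// chroma bias in a single step, e.g. U = 128 + (112*B - 74*G - 38*R) / 256
+// for BT601 ARGB input.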
+ +// RGB to BT601 coefficients +// UB 0.875 coefficient = 112 +// UG -0.5781 coefficient = -74 +// UR -0.2969 coefficient = -38 +// VB -0.1406 coefficient = -18 +// VG -0.7344 coefficient = -94 +// VR 0.875 coefficient = 112 + +static const int8_t kARGBToUVCoefficients[] = { + // -UB, -UG, -UR, 0, -VB, -VG, -VR, 0 + -112, 74, 38, 0, 18, 94, -112, 0, +}; + +static const int8_t kABGRToUVCoefficients[] = { + // -UR, -UG, -UB, 0, -VR, -VG, -VB, 0 + 38, 74, -112, 0, -112, 94, 18, 0, +}; + +static const int8_t kBGRAToUVCoefficients[] = { + // 0, -UR, -UG, -UB, 0, -VR, -VG, -VB + 0, 38, 74, -112, 0, -112, 94, 18, +}; + +static const int8_t kRGBAToUVCoefficients[] = { + // 0, -UB, -UG, -UR, 0, -VB, -VG, -VR + 0, -112, 74, 38, 0, 18, 94, -112, +}; + +// RGB to JPEG coefficients +// UB 0.500 coefficient = 128 +// UG -0.33126 coefficient = -85 +// UR -0.16874 coefficient = -43 +// VB -0.08131 coefficient = -21 +// VG -0.41869 coefficient = -107 +// VR 0.500 coefficient = 128 + +static const int8_t kARGBToUVJCoefficients[] = { + // -UB, -UG, -UR, 0, -VB, -VG, -VR, 0 + -128, 85, 43, 0, 21, 107, -128, 0, +}; + +static const int8_t kABGRToUVJCoefficients[] = { + // -UR, -UG, -UB, 0, -VR, -VG, -VB, 0 + 43, 85, -128, 0, -128, 107, 21, 0, +}; + +#define ABCDTOUVMATRIX_SVE \ + "ld1d {z0.d}, p1/z, [%[src0]] \n" /* ABCD(bgra) */ \ + "ld1d {z1.d}, p2/z, [%[src0], #1, mul vl] \n" /* EFGH(bgra) */ \ + "ld1d {z2.d}, p3/z, [%[src0], #2, mul vl] \n" /* IJKL(bgra) */ \ + "ld1d {z3.d}, p4/z, [%[src0], #3, mul vl] \n" /* MNOP(bgra) */ \ + "ld1d {z4.d}, p1/z, [%[src1]] \n" /* ABCD(bgra) */ \ + "ld1d {z5.d}, p2/z, [%[src1], #1, mul vl] \n" /* EFGH(bgra) */ \ + "ld1d {z6.d}, p3/z, [%[src1], #2, mul vl] \n" /* IJKL(bgra) */ \ + "ld1d {z7.d}, p4/z, [%[src1], #3, mul vl] \n" /* MNOP(bgra) */ \ + "incb %[src0], all, mul #4 \n" \ + "incb %[src1], all, mul #4 \n" \ + \ + "uaddlb z16.h, z0.b, z4.b \n" /* ABCD(br) */ \ + "uaddlb z18.h, z1.b, z5.b \n" /* EFGH(br) */ \ + "uaddlb z20.h, z2.b, z6.b \n" /* IJKL(br) */ \ + "uaddlb z22.h, z3.b, z7.b \n" /* MNOP(br) */ \ + "uaddlt z17.h, z0.b, z4.b \n" /* ABCD(ga) */ \ + "uaddlt z19.h, z1.b, z5.b \n" /* EFGH(ga) */ \ + "uaddlt z21.h, z2.b, z6.b \n" /* IJKL(ga) */ \ + "uaddlt z23.h, z3.b, z7.b \n" /* MNOP(ga) */ \ + \ + /* Use ADDP on 32-bit elements to add adjacent pairs of 9-bit unsigned */ \ + "addp z16.s, p0/m, z16.s, z18.s \n" /* ABEFCDGH(br) */ \ + "addp z17.s, p0/m, z17.s, z19.s \n" /* ABEFCDGH(ga) */ \ + "addp z20.s, p0/m, z20.s, z22.s \n" /* IJMNKLOP(br) */ \ + "addp z21.s, p0/m, z21.s, z23.s \n" /* IJMNKLOP(ga) */ \ + \ + "rshrnb z0.b, z16.h, #2 \n" /* ABEFCDGH(b0r0) */ \ + "rshrnb z1.b, z20.h, #2 \n" /* IJMNKLOP(b0r0) */ \ + "rshrnt z0.b, z17.h, #2 \n" /* ABEFCDGH(bgra) */ \ + "rshrnt z1.b, z21.h, #2 \n" /* IJMNKLOP(bgra) */ \ + \ + "tbl z0.s, {z0.s}, z27.s \n" /* ABCDEFGH */ \ + "tbl z1.s, {z1.s}, z27.s \n" /* IJKLMNOP */ \ + \ + "subs %w[width], %w[width], %w[vl], lsl #2 \n" /* VL per loop */ \ + \ + "fmov s16, wzr \n" \ + "fmov s17, wzr \n" \ + "fmov s20, wzr \n" \ + "fmov s21, wzr \n" \ + \ + "usdot z16.s, z0.b, z24.b \n" \ + "usdot z17.s, z1.b, z24.b \n" \ + "usdot z20.s, z0.b, z25.b \n" \ + "usdot z21.s, z1.b, z25.b \n" \ + \ + "subhnb z16.b, z26.h, z16.h \n" /* U */ \ + "subhnb z20.b, z26.h, z20.h \n" /* V */ \ + "subhnb z17.b, z26.h, z17.h \n" /* U */ \ + "subhnb z21.b, z26.h, z21.h \n" /* V */ \ + \ + "uzp1 z16.h, z16.h, z17.h \n" \ + "uzp1 z20.h, z20.h, z21.h \n" \ + \ + "st1b {z16.h}, p5, [%[dst_u]] \n" /* U */ \ + "st1b {z20.h}, p5, [%[dst_v]] \n" /* V */ \ + "inch 
%[dst_u] \n" \ + "inch %[dst_v] \n" + +static inline void ARGBToUVMatrixRow_SVE_SC(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width, + const int8_t* uvconstants) + STREAMING_COMPATIBLE { + const uint8_t* src_argb_1 = src_argb + src_stride_argb; + uint64_t vl; + asm("cntd %x0" : "=r"(vl)); + + // Width is a multiple of two here, so halve it. + width >>= 1; + + asm volatile( + "ptrue p0.b \n" + "ld1rw {z24.s}, p0/z, [%[uvconstants]] \n" + "ld1rw {z25.s}, p0/z, [%[uvconstants], #4] \n" + "mov z26.h, #0x8000 \n" // 128.0 (0x8000) + + // Generate some TBL indices to undo the interleaving from ADDP. + "index z0.s, #0, #1 \n" + "index z1.s, #1, #1 \n" + "uzp1 z27.s, z0.s, z1.s \n" + + "subs %w[width], %w[width], %w[vl], lsl #2 \n" + "b.lt 2f \n" + + "ptrue p1.d \n" + "ptrue p2.d \n" + "ptrue p3.d \n" + "ptrue p4.d \n" + "ptrue p5.h \n" + "1: \n" // + ABCDTOUVMATRIX_SVE + "b.gt 1b \n" + + "2: \n" + "adds %w[width], %w[width], %w[vl], lsl #2 \n" + "b.eq 99f \n" + + "3: \n" + "whilelt p1.d, wzr, %w[width] \n" + "whilelt p2.d, %w[vl], %w[width] \n" + "whilelt p3.d, %w[vl2], %w[width] \n" + "whilelt p4.d, %w[vl3], %w[width] \n" + "whilelt p5.h, wzr, %w[width] \n" // + ABCDTOUVMATRIX_SVE + "b.gt 3b \n" + + "99: \n" + : [src0] "+r"(src_argb), // %[src0] + [src1] "+r"(src_argb_1), // %[src1] + [dst_u] "+r"(dst_u), // %[dst_u] + [dst_v] "+r"(dst_v), // %[dst_v] + [width] "+r"(width) // %[width] + : [uvconstants] "r"(uvconstants), // %[uvconstants] + [vl] "r"(vl), // %[vl] + [vl2] "r"(vl * 2), // %[vl2] + [vl3] "r"(vl * 3) // %[vl3] + : "cc", "memory", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z16", + "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", + "z27", "p0", "p1", "p2", "p3", "p4", "p5"); +} + #endif // !defined(LIBYUV_DISABLE_SVE) && defined(__aarch64__) #ifdef __cplusplus diff --git a/source/convert.cc b/source/convert.cc index 24b0f0b6d..ecc01f0b2 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -2103,6 +2103,14 @@ int ARGBToI420(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOUVROW_SME) + if (TestCpuFlag(kCpuHasSME)) { + ARGBToUVRow = ARGBToUVRow_Any_SME; + if (IS_ALIGNED(width, 2)) { + ARGBToUVRow = ARGBToUVRow_SME; + } + } +#endif #if defined(HAS_ARGBTOYROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToYRow = ARGBToYRow_Any_SSSE3; @@ -2289,6 +2297,14 @@ int ARGBToI420Alpha(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOUVROW_SME) + if (TestCpuFlag(kCpuHasSME)) { + ARGBToUVRow = ARGBToUVRow_Any_SME; + if (IS_ALIGNED(width, 2)) { + ARGBToUVRow = ARGBToUVRow_SME; + } + } +#endif #if defined(HAS_ARGBTOYROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToYRow = ARGBToYRow_Any_SSSE3; @@ -2476,6 +2492,14 @@ int BGRAToI420(const uint8_t* src_bgra, } } #endif +#if defined(HAS_BGRATOUVROW_SME) + if (TestCpuFlag(kCpuHasSME)) { + BGRAToUVRow = BGRAToUVRow_Any_SME; + if (IS_ALIGNED(width, 2)) { + BGRAToUVRow = BGRAToUVRow_SME; + } + } +#endif #if defined(HAS_BGRATOYROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { BGRAToYRow = BGRAToYRow_Any_SSSE3; @@ -2659,6 +2683,14 @@ int ABGRToI420(const uint8_t* src_abgr, } } #endif +#if defined(HAS_ABGRTOUVROW_SME) + if (TestCpuFlag(kCpuHasSME)) { + ABGRToUVRow = ABGRToUVRow_Any_SME; + if (IS_ALIGNED(width, 2)) { + ABGRToUVRow = ABGRToUVRow_SME; + } + } +#endif #if defined(HAS_ABGRTOYROW_MSA) && defined(HAS_ABGRTOUVROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ABGRToYRow = ABGRToYRow_Any_MSA; @@ -2792,6 +2824,14 @@ int RGBAToI420(const uint8_t* src_rgba, } } #endif +#if 
defined(HAS_RGBATOUVROW_SME) + if (TestCpuFlag(kCpuHasSME)) { + RGBAToUVRow = RGBAToUVRow_Any_SME; + if (IS_ALIGNED(width, 2)) { + RGBAToUVRow = RGBAToUVRow_SME; + } + } +#endif #if defined(HAS_RGBATOYROW_MSA) && defined(HAS_RGBATOUVROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { RGBAToYRow = RGBAToYRow_Any_MSA; diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc index 8451821c8..c37558266 100644 --- a/source/convert_from_argb.cc +++ b/source/convert_from_argb.cc @@ -287,6 +287,14 @@ int ARGBToI422(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOUVROW_SME) + if (TestCpuFlag(kCpuHasSME)) { + ARGBToUVRow = ARGBToUVRow_Any_SME; + if (IS_ALIGNED(width, 2)) { + ARGBToUVRow = ARGBToUVRow_SME; + } + } +#endif #if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ARGBToYRow = ARGBToYRow_Any_MSA; @@ -411,6 +419,14 @@ int ARGBToNV12(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOUVROW_SME) + if (TestCpuFlag(kCpuHasSME)) { + ARGBToUVRow = ARGBToUVRow_Any_SME; + if (IS_ALIGNED(width, 2)) { + ARGBToUVRow = ARGBToUVRow_SME; + } + } +#endif #if defined(HAS_ARGBTOYROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToYRow = ARGBToYRow_Any_SSSE3; @@ -662,6 +678,14 @@ int ARGBToNV21(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOUVROW_SME) + if (TestCpuFlag(kCpuHasSME)) { + ARGBToUVRow = ARGBToUVRow_Any_SME; + if (IS_ALIGNED(width, 2)) { + ARGBToUVRow = ARGBToUVRow_SME; + } + } +#endif #if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ARGBToYRow = ARGBToYRow_Any_MSA; @@ -890,6 +914,14 @@ int ABGRToNV12(const uint8_t* src_abgr, } } #endif +#if defined(HAS_ABGRTOUVROW_SME) + if (TestCpuFlag(kCpuHasSME)) { + ABGRToUVRow = ABGRToUVRow_Any_SME; + if (IS_ALIGNED(width, 2)) { + ABGRToUVRow = ABGRToUVRow_SME; + } + } +#endif #if defined(HAS_ABGRTOYROW_MSA) && defined(HAS_ABGRTOUVROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ABGRToYRow = ABGRToYRow_Any_MSA; @@ -1107,6 +1139,14 @@ int ABGRToNV21(const uint8_t* src_abgr, } } #endif +#if defined(HAS_ABGRTOUVROW_SME) + if (TestCpuFlag(kCpuHasSME)) { + ABGRToUVRow = ABGRToUVRow_Any_SME; + if (IS_ALIGNED(width, 2)) { + ABGRToUVRow = ABGRToUVRow_SME; + } + } +#endif #if defined(HAS_ABGRTOYROW_MSA) && defined(HAS_ABGRTOUVROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ABGRToYRow = ABGRToYRow_Any_MSA; @@ -1329,6 +1369,14 @@ int ARGBToYUY2(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOUVROW_SME) + if (TestCpuFlag(kCpuHasSME)) { + ARGBToUVRow = ARGBToUVRow_Any_SME; + if (IS_ALIGNED(width, 2)) { + ARGBToUVRow = ARGBToUVRow_SME; + } + } +#endif #if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ARGBToYRow = ARGBToYRow_Any_MSA; @@ -1549,6 +1597,14 @@ int ARGBToUYVY(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOUVROW_SME) + if (TestCpuFlag(kCpuHasSME)) { + ARGBToUVRow = ARGBToUVRow_Any_SME; + if (IS_ALIGNED(width, 2)) { + ARGBToUVRow = ARGBToUVRow_SME; + } + } +#endif #if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ARGBToYRow = ARGBToYRow_Any_MSA; @@ -2697,6 +2753,14 @@ int ARGBToJ420(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOUVJROW_SME) + if (TestCpuFlag(kCpuHasSME)) { + ARGBToUVJRow = ARGBToUVJRow_Any_SME; + if (IS_ALIGNED(width, 2)) { + ARGBToUVJRow = ARGBToUVJRow_SME; + } + } +#endif #if defined(HAS_ARGBTOYJROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToYJRow = ARGBToYJRow_Any_SSSE3; @@ -2889,6 +2953,14 @@ int ARGBToJ422(const 
uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOUVJROW_SME) + if (TestCpuFlag(kCpuHasSME)) { + ARGBToUVJRow = ARGBToUVJRow_Any_SME; + if (IS_ALIGNED(width, 2)) { + ARGBToUVJRow = ARGBToUVJRow_SME; + } + } +#endif #if defined(HAS_ARGBTOYJROW_MSA) && defined(HAS_ARGBTOUVJROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ARGBToYJRow = ARGBToYJRow_Any_MSA; @@ -3211,6 +3283,14 @@ int ABGRToJ420(const uint8_t* src_abgr, } } #endif +#if defined(HAS_ABGRTOUVJROW_SME) + if (TestCpuFlag(kCpuHasSME)) { + ABGRToUVJRow = ABGRToUVJRow_Any_SME; + if (IS_ALIGNED(width, 2)) { + ABGRToUVJRow = ABGRToUVJRow_SME; + } + } +#endif #if defined(HAS_ABGRTOYJROW_MSA) && defined(HAS_ABGRTOUVJROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ABGRToYJRow = ABGRToYJRow_Any_MSA; @@ -3365,6 +3445,14 @@ int ABGRToJ422(const uint8_t* src_abgr, } } #endif +#if defined(HAS_ABGRTOUVJROW_SME) + if (TestCpuFlag(kCpuHasSME)) { + ABGRToUVJRow = ABGRToUVJRow_Any_SME; + if (IS_ALIGNED(width, 2)) { + ABGRToUVJRow = ABGRToUVJRow_SME; + } + } +#endif #if defined(HAS_ABGRTOYJROW_MSA) && defined(HAS_ABGRTOUVJROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ABGRToYJRow = ABGRToYJRow_Any_MSA; diff --git a/source/row_any.cc b/source/row_any.cc index a1b1fc13a..edfaa7f5f 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -2404,6 +2404,9 @@ ANY12S(ARGBToUVRow_Any_NEON_I8MM, ARGBToUVRow_NEON_I8MM, 0, 4, 15) #ifdef HAS_ARGBTOUVROW_SVE2 ANY12S(ARGBToUVRow_Any_SVE2, ARGBToUVRow_SVE2, 0, 4, 1) #endif +#ifdef HAS_ARGBTOUVROW_SME +ANY12S(ARGBToUVRow_Any_SME, ARGBToUVRow_SME, 0, 4, 1) +#endif #ifdef HAS_ARGBTOUVROW_MSA ANY12S(ARGBToUVRow_Any_MSA, ARGBToUVRow_MSA, 0, 4, 31) #endif @@ -2422,6 +2425,9 @@ ANY12S(ARGBToUVJRow_Any_NEON_I8MM, ARGBToUVJRow_NEON_I8MM, 0, 4, 15) #ifdef HAS_ARGBTOUVJROW_SVE2 ANY12S(ARGBToUVJRow_Any_SVE2, ARGBToUVJRow_SVE2, 0, 4, 1) #endif +#ifdef HAS_ARGBTOUVJROW_SME +ANY12S(ARGBToUVJRow_Any_SME, ARGBToUVJRow_SME, 0, 4, 1) +#endif #ifdef HAS_ABGRTOUVJROW_NEON ANY12S(ABGRToUVJRow_Any_NEON, ABGRToUVJRow_NEON, 0, 4, 15) #endif @@ -2431,6 +2437,9 @@ ANY12S(ABGRToUVJRow_Any_NEON_I8MM, ABGRToUVJRow_NEON_I8MM, 0, 4, 15) #ifdef HAS_ABGRTOUVJROW_SVE2 ANY12S(ABGRToUVJRow_Any_SVE2, ABGRToUVJRow_SVE2, 0, 4, 1) #endif +#ifdef HAS_ABGRTOUVJROW_SME +ANY12S(ABGRToUVJRow_Any_SME, ABGRToUVJRow_SME, 0, 4, 1) +#endif #ifdef HAS_ARGBTOUVJROW_MSA ANY12S(ARGBToUVJRow_Any_MSA, ARGBToUVJRow_MSA, 0, 4, 31) #endif @@ -2449,6 +2458,9 @@ ANY12S(BGRAToUVRow_Any_NEON_I8MM, BGRAToUVRow_NEON_I8MM, 0, 4, 15) #ifdef HAS_BGRATOUVROW_SVE2 ANY12S(BGRAToUVRow_Any_SVE2, BGRAToUVRow_SVE2, 0, 4, 1) #endif +#ifdef HAS_BGRATOUVROW_SME +ANY12S(BGRAToUVRow_Any_SME, BGRAToUVRow_SME, 0, 4, 1) +#endif #ifdef HAS_BGRATOUVROW_MSA ANY12S(BGRAToUVRow_Any_MSA, BGRAToUVRow_MSA, 0, 4, 15) #endif @@ -2464,6 +2476,9 @@ ANY12S(ABGRToUVRow_Any_NEON_I8MM, ABGRToUVRow_NEON_I8MM, 0, 4, 15) #ifdef HAS_ABGRTOUVROW_SVE2 ANY12S(ABGRToUVRow_Any_SVE2, ABGRToUVRow_SVE2, 0, 4, 1) #endif +#ifdef HAS_ABGRTOUVROW_SME +ANY12S(ABGRToUVRow_Any_SME, ABGRToUVRow_SME, 0, 4, 1) +#endif #ifdef HAS_ABGRTOUVROW_MSA ANY12S(ABGRToUVRow_Any_MSA, ABGRToUVRow_MSA, 0, 4, 15) #endif @@ -2479,6 +2494,9 @@ ANY12S(RGBAToUVRow_Any_NEON_I8MM, RGBAToUVRow_NEON_I8MM, 0, 4, 15) #ifdef HAS_RGBATOUVROW_SVE2 ANY12S(RGBAToUVRow_Any_SVE2, RGBAToUVRow_SVE2, 0, 4, 1) #endif +#ifdef HAS_RGBATOUVROW_SME +ANY12S(RGBAToUVRow_Any_SME, RGBAToUVRow_SME, 0, 4, 1) +#endif #ifdef HAS_RGBATOUVROW_MSA ANY12S(RGBAToUVRow_Any_MSA, RGBAToUVRow_MSA, 0, 4, 15) #endif diff --git a/source/row_sme.cc b/source/row_sme.cc index 6b68ad5ed..a78f74150 100644 --- 
a/source/row_sme.cc +++ b/source/row_sme.cc @@ -1120,6 +1120,60 @@ __arm_locally_streaming void Convert8To16Row_SME(const uint8_t* src_y, : "cc", "memory", "z0", "z1", "z2", "p0", "p1"); } +__arm_locally_streaming void ARGBToUVRow_SME(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + ARGBToUVMatrixRow_SVE_SC(src_argb, src_stride_argb, dst_u, dst_v, width, + kARGBToUVCoefficients); +} + +__arm_locally_streaming void ARGBToUVJRow_SME(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + ARGBToUVMatrixRow_SVE_SC(src_argb, src_stride_argb, dst_u, dst_v, width, + kARGBToUVJCoefficients); +} + +__arm_locally_streaming void ABGRToUVJRow_SME(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_uj, + uint8_t* dst_vj, + int width) { + ARGBToUVMatrixRow_SVE_SC(src_abgr, src_stride_abgr, dst_uj, dst_vj, width, + kABGRToUVJCoefficients); +} + +__arm_locally_streaming void BGRAToUVRow_SME(const uint8_t* src_bgra, + int src_stride_bgra, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + ARGBToUVMatrixRow_SVE_SC(src_bgra, src_stride_bgra, dst_u, dst_v, width, + kBGRAToUVCoefficients); +} + +__arm_locally_streaming void ABGRToUVRow_SME(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + ARGBToUVMatrixRow_SVE_SC(src_abgr, src_stride_abgr, dst_u, dst_v, width, + kABGRToUVCoefficients); +} + +__arm_locally_streaming void RGBAToUVRow_SME(const uint8_t* src_rgba, + int src_stride_rgba, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + ARGBToUVMatrixRow_SVE_SC(src_rgba, src_stride_rgba, dst_u, dst_v, width, + kRGBAToUVCoefficients); +} + #endif // !defined(LIBYUV_DISABLE_SME) && defined(CLANG_HAS_SME) && // defined(__aarch64__) diff --git a/source/row_sve.cc b/source/row_sve.cc index 7251fe79d..d4e0ffb13 100644 --- a/source/row_sve.cc +++ b/source/row_sve.cc @@ -217,185 +217,13 @@ void NV21ToRGB24Row_SVE2(const uint8_t* src_y, NV21ToRGB24Row_SVE_SC(src_y, src_vu, dst_rgb24, yuvconstants, width); } -// SVE constants are stored negated such that we can store 128 in int8_t. 
- -// RGB to BT601 coefficients -// UB 0.875 coefficient = 112 -// UG -0.5781 coefficient = -74 -// UR -0.2969 coefficient = -38 -// VB -0.1406 coefficient = -18 -// VG -0.7344 coefficient = -94 -// VR 0.875 coefficient = 112 - -static const int8_t kARGBToUVCoefficients[] = { - // -UB, -UG, -UR, 0, -VB, -VG, -VR, 0 - -112, 74, 38, 0, 18, 94, -112, 0, -}; - -static const int8_t kABGRToUVCoefficients[] = { - // -UR, -UG, -UB, 0, -VR, -VG, -VB, 0 - 38, 74, -112, 0, -112, 94, 18, 0, -}; - -static const int8_t kBGRAToUVCoefficients[] = { - // 0, -UR, -UG, -UB, 0, -VR, -VG, -VB - 0, 38, 74, -112, 0, -112, 94, 18, -}; - -static const int8_t kRGBAToUVCoefficients[] = { - // 0, -UB, -UG, -UR, 0, -VB, -VG, -VR - 0, -112, 74, 38, 0, 18, 94, -112, -}; - -// RGB to JPEG coefficients -// UB 0.500 coefficient = 128 -// UG -0.33126 coefficient = -85 -// UR -0.16874 coefficient = -43 -// VB -0.08131 coefficient = -21 -// VG -0.41869 coefficient = -107 -// VR 0.500 coefficient = 128 - -static const int8_t kARGBToUVJCoefficients[] = { - // -UB, -UG, -UR, 0, -VB, -VG, -VR, 0 - -128, 85, 43, 0, 21, 107, -128, 0, -}; - -static const int8_t kABGRToUVJCoefficients[] = { - // -UR, -UG, -UB, 0, -VR, -VG, -VB, 0 - 43, 85, -128, 0, -128, 107, 21, 0, -}; - -#define ABCDTOUVMATRIX_SVE \ - "ld1d {z0.d}, p1/z, [%[src0]] \n" /* ABCD(bgra) */ \ - "ld1d {z1.d}, p2/z, [%[src0], #1, mul vl] \n" /* EFGH(bgra) */ \ - "ld1d {z2.d}, p3/z, [%[src0], #2, mul vl] \n" /* IJKL(bgra) */ \ - "ld1d {z3.d}, p4/z, [%[src0], #3, mul vl] \n" /* MNOP(bgra) */ \ - "ld1d {z4.d}, p1/z, [%[src1]] \n" /* ABCD(bgra) */ \ - "ld1d {z5.d}, p2/z, [%[src1], #1, mul vl] \n" /* EFGH(bgra) */ \ - "ld1d {z6.d}, p3/z, [%[src1], #2, mul vl] \n" /* IJKL(bgra) */ \ - "ld1d {z7.d}, p4/z, [%[src1], #3, mul vl] \n" /* MNOP(bgra) */ \ - "incb %[src0], all, mul #4 \n" \ - "incb %[src1], all, mul #4 \n" \ - \ - "uaddlb z16.h, z0.b, z4.b \n" /* ABCD(br) */ \ - "uaddlb z18.h, z1.b, z5.b \n" /* EFGH(br) */ \ - "uaddlb z20.h, z2.b, z6.b \n" /* IJKL(br) */ \ - "uaddlb z22.h, z3.b, z7.b \n" /* MNOP(br) */ \ - "uaddlt z17.h, z0.b, z4.b \n" /* ABCD(ga) */ \ - "uaddlt z19.h, z1.b, z5.b \n" /* EFGH(ga) */ \ - "uaddlt z21.h, z2.b, z6.b \n" /* IJKL(ga) */ \ - "uaddlt z23.h, z3.b, z7.b \n" /* MNOP(ga) */ \ - \ - /* Use ADDP on 32-bit elements to add adjacent pairs of 9-bit unsigned */ \ - "addp z16.s, p0/m, z16.s, z18.s \n" /* ABEFCDGH(br) */ \ - "addp z17.s, p0/m, z17.s, z19.s \n" /* ABEFCDGH(ga) */ \ - "addp z20.s, p0/m, z20.s, z22.s \n" /* IJMNKLOP(br) */ \ - "addp z21.s, p0/m, z21.s, z23.s \n" /* IJMNKLOP(ga) */ \ - \ - "rshrnb z0.b, z16.h, #2 \n" /* ABEFCDGH(b0r0) */ \ - "rshrnb z1.b, z20.h, #2 \n" /* IJMNKLOP(b0r0) */ \ - "rshrnt z0.b, z17.h, #2 \n" /* ABEFCDGH(bgra) */ \ - "rshrnt z1.b, z21.h, #2 \n" /* IJMNKLOP(bgra) */ \ - \ - "tbl z0.s, {z0.s}, z27.s \n" /* ABCDEFGH */ \ - "tbl z1.s, {z1.s}, z27.s \n" /* IJKLMNOP */ \ - \ - "subs %w[width], %w[width], %w[vl], lsl #2 \n" /* VL per loop */ \ - \ - "movi v16.8h, #0 \n" \ - "movi v17.8h, #0 \n" \ - "movi v20.8h, #0 \n" \ - "movi v21.8h, #0 \n" \ - \ - "usdot z16.s, z0.b, z24.b \n" \ - "usdot z17.s, z1.b, z24.b \n" \ - "usdot z20.s, z0.b, z25.b \n" \ - "usdot z21.s, z1.b, z25.b \n" \ - \ - "subhnb z16.b, z26.h, z16.h \n" /* U */ \ - "subhnb z20.b, z26.h, z20.h \n" /* V */ \ - "subhnb z17.b, z26.h, z17.h \n" /* U */ \ - "subhnb z21.b, z26.h, z21.h \n" /* V */ \ - \ - "uzp1 z16.h, z16.h, z17.h \n" \ - "uzp1 z20.h, z20.h, z21.h \n" \ - \ - "st1b {z16.h}, p5, [%[dst_u]] \n" /* U */ \ - "st1b {z20.h}, p5, [%[dst_v]] \n" /* V */ \ 
- "inch %[dst_u] \n" \ - "inch %[dst_v] \n" - -static void ARGBToUVMatrixRow_SVE2(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width, - const int8_t* uvconstants) { - const uint8_t* src_argb_1 = src_argb + src_stride_argb; - uint64_t vl; - asm("cntd %x0" : "=r"(vl)); - - // Width is a multiple of two here, so halve it. - width >>= 1; - - asm volatile( - "ptrue p0.b \n" - "ld1rw {z24.s}, p0/z, [%[uvconstants]] \n" - "ld1rw {z25.s}, p0/z, [%[uvconstants], #4] \n" - "mov z26.h, #0x8000 \n" // 128.0 (0x8000) - - // Generate some TBL indices to undo the interleaving from ADDP. - "index z0.s, #0, #1 \n" - "index z1.s, #1, #1 \n" - "uzp1 z27.s, z0.s, z1.s \n" - - "subs %w[width], %w[width], %w[vl], lsl #2 \n" - "b.lt 2f \n" - - "ptrue p1.d \n" - "ptrue p2.d \n" - "ptrue p3.d \n" - "ptrue p4.d \n" - "ptrue p5.h \n" - "1: \n" // - ABCDTOUVMATRIX_SVE - "b.gt 1b \n" - - "2: \n" - "adds %w[width], %w[width], %w[vl], lsl #2 \n" - "b.eq 99f \n" - - "3: \n" - "whilelt p1.d, wzr, %w[width] \n" - "whilelt p2.d, %w[vl], %w[width] \n" - "whilelt p3.d, %w[vl2], %w[width] \n" - "whilelt p4.d, %w[vl3], %w[width] \n" - "whilelt p5.h, wzr, %w[width] \n" // - ABCDTOUVMATRIX_SVE - "b.gt 3b \n" - - "99: \n" - : [src0] "+r"(src_argb), // %[src0] - [src1] "+r"(src_argb_1), // %[src1] - [dst_u] "+r"(dst_u), // %[dst_u] - [dst_v] "+r"(dst_v), // %[dst_v] - [width] "+r"(width) // %[width] - : [uvconstants] "r"(uvconstants), // %[uvconstants] - [vl] "r"(vl), // %[vl] - [vl2] "r"(vl * 2), // %[vl2] - [vl3] "r"(vl * 3) // %[vl3] - : "cc", "memory", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z16", - "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", - "z27", "p0", "p1", "p2", "p3", "p4", "p5"); -} - void ARGBToUVRow_SVE2(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, int width) { - ARGBToUVMatrixRow_SVE2(src_argb, src_stride_argb, dst_u, dst_v, width, - kARGBToUVCoefficients); + ARGBToUVMatrixRow_SVE_SC(src_argb, src_stride_argb, dst_u, dst_v, width, + kARGBToUVCoefficients); } void ARGBToUVJRow_SVE2(const uint8_t* src_argb, @@ -403,8 +231,8 @@ void ARGBToUVJRow_SVE2(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width) { - ARGBToUVMatrixRow_SVE2(src_argb, src_stride_argb, dst_u, dst_v, width, - kARGBToUVJCoefficients); + ARGBToUVMatrixRow_SVE_SC(src_argb, src_stride_argb, dst_u, dst_v, width, + kARGBToUVJCoefficients); } void ABGRToUVJRow_SVE2(const uint8_t* src_abgr, @@ -412,8 +240,8 @@ void ABGRToUVJRow_SVE2(const uint8_t* src_abgr, uint8_t* dst_uj, uint8_t* dst_vj, int width) { - ARGBToUVMatrixRow_SVE2(src_abgr, src_stride_abgr, dst_uj, dst_vj, width, - kABGRToUVJCoefficients); + ARGBToUVMatrixRow_SVE_SC(src_abgr, src_stride_abgr, dst_uj, dst_vj, width, + kABGRToUVJCoefficients); } void BGRAToUVRow_SVE2(const uint8_t* src_bgra, @@ -421,8 +249,8 @@ void BGRAToUVRow_SVE2(const uint8_t* src_bgra, uint8_t* dst_u, uint8_t* dst_v, int width) { - ARGBToUVMatrixRow_SVE2(src_bgra, src_stride_bgra, dst_u, dst_v, width, - kBGRAToUVCoefficients); + ARGBToUVMatrixRow_SVE_SC(src_bgra, src_stride_bgra, dst_u, dst_v, width, + kBGRAToUVCoefficients); } void ABGRToUVRow_SVE2(const uint8_t* src_abgr, @@ -430,8 +258,8 @@ void ABGRToUVRow_SVE2(const uint8_t* src_abgr, uint8_t* dst_u, uint8_t* dst_v, int width) { - ARGBToUVMatrixRow_SVE2(src_abgr, src_stride_abgr, dst_u, dst_v, width, - kABGRToUVCoefficients); + ARGBToUVMatrixRow_SVE_SC(src_abgr, src_stride_abgr, dst_u, dst_v, width, + kABGRToUVCoefficients); } void 
RGBAToUVRow_SVE2(const uint8_t* src_rgba, @@ -439,8 +267,8 @@ void RGBAToUVRow_SVE2(const uint8_t* src_rgba, uint8_t* dst_u, uint8_t* dst_v, int width) { - ARGBToUVMatrixRow_SVE2(src_rgba, src_stride_rgba, dst_u, dst_v, width, - kRGBAToUVCoefficients); + ARGBToUVMatrixRow_SVE_SC(src_rgba, src_stride_rgba, dst_u, dst_v, width, + kRGBAToUVCoefficients); } #define ARGBTORGB565_SVE \
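
Note on the FMOV adjustment called out in the commit message: MOVI is an Advanced SIMD (Neon) instruction and is not legal in streaming mode, so the accumulator zeroing in ABCDTOUVMATRIX_SVE is rewritten. Writing a scalar FP register also zeroes the remaining bits of the corresponding Z register, so the new form clears the full accumulator exactly as the old one did. The two forms, as they appear in the removed SVE2-only macro and in the new streaming-compatible macro in row_sve.h (spacing condensed):

  "movi v16.8h, #0 \n"   // removed: Neon MOVI, unavailable in Streaming-SVE
  "fmov s16, wzr   \n"   // added: streaming-compatible, zeroes all of z16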