From 004352ba164b68e22f13ca4b18b0b483d32c152c Mon Sep 17 00:00:00 2001
From: George Steed
Date: Mon, 22 Apr 2024 12:02:39 +0100
Subject: [PATCH] [AArch64] Add SVE2 implementations for AYUVTo{UV,VU}Row

These kernels are mostly identical to each other except for the order
of the results, so we can use a single macro to parameterize the
pairwise addition and share it between both implementations, just with
the register order flipped.

Similar to other 2x2 kernels, the implementation here differs slightly
for the last element if the problem size is odd, so use an "any" kernel
to avoid needing to handle this in the common code path.

Observed reduction in runtime compared to the existing Neon code:

            | AYUVToUVRow | AYUVToVURow
Cortex-A510 |      -33.1% |      -33.0%
Cortex-A720 |      -25.1% |      -25.1%
Cortex-X2   |      -59.5% |      -53.9%
Cortex-X4   |      -39.2% |      -39.4%

Bug: libyuv:973
Change-Id: I957db9ea31c8830535c243175790db0ff2a3ccae
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5522316
Reviewed-by: Justin Green
Reviewed-by: Frank Barchard
Commit-Queue: Frank Barchard
---
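For reference, a scalar model of the computation these kernels perform
(illustration only, not part of the patch; the function name below is
ours). It averages each 2x2 block of U and V samples from two adjacent
AYUV rows, using the same rounding as the urshr #2 in the SVE2 code,
and stores the results interleaved as UV:

#include <stdint.h>

// Average each 2x2 block of U and V samples from two adjacent AYUV
// rows (VUYA byte order in memory) and store them interleaved as UV.
// width is in source pixels and assumed even here; odd widths are
// handled by the "any" kernel.
static void AYUVToUVRow_Model(const uint8_t* src_ayuv,
                              int src_stride_ayuv,
                              uint8_t* dst_uv,
                              int width) {
  const uint8_t* row0 = src_ayuv;
  const uint8_t* row1 = src_ayuv + src_stride_ayuv;
  for (int x = 0; x < width; x += 2) {
    // Byte layout per pixel: [0]=V, [1]=U, [2]=Y, [3]=A.
    dst_uv[0] = (uint8_t)((row0[1] + row0[5] + row1[1] + row1[5] + 2) >> 2);
    dst_uv[1] = (uint8_t)((row0[0] + row0[4] + row1[0] + row1[4] + 2) >> 2);
    row0 += 8;  // Advance past two 4-byte source pixels.
    row1 += 8;
    dst_uv += 2;
  }
}

AYUVToVURow is the same computation with the two stores swapped, which
is why a single macro with flipped register arguments covers both SVE2
implementations below.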
 include/libyuv/row.h |  18 ++++++++
 source/convert.cc    |  16 +++++++
 source/row_any.cc    |   6 +++
 source/row_sve.cc    | 100 +++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 140 insertions(+)

diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index f1084dd8a..6a7d30c0d 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -580,6 +580,8 @@ extern "C" {
 #define HAS_ARGBTORGB565ROW_SVE2
 #define HAS_ARGBTOUVJROW_SVE2
 #define HAS_ARGBTOUVROW_SVE2
+#define HAS_AYUVTOUVROW_SVE2
+#define HAS_AYUVTOVUROW_SVE2
 #define HAS_BGRATOUVROW_SVE2
 #define HAS_I422ALPHATOARGBROW_SVE2
 #define HAS_I422TOARGBROW_SVE2
@@ -5766,19 +5768,35 @@
 void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
                       int src_stride_ayuv,
                       uint8_t* dst_uv,
                       int width);
+void AYUVToUVRow_SVE2(const uint8_t* src_ayuv,
+                      int src_stride_ayuv,
+                      uint8_t* dst_uv,
+                      int width);
 void AYUVToVURow_NEON(const uint8_t* src_ayuv,
                       int src_stride_ayuv,
                       uint8_t* dst_vu,
                       int width);
+void AYUVToVURow_SVE2(const uint8_t* src_ayuv,
+                      int src_stride_ayuv,
+                      uint8_t* dst_vu,
+                      int width);
 void AYUVToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
 void AYUVToUVRow_Any_NEON(const uint8_t* src_ptr,
                           int src_stride,
                           uint8_t* dst_vu,
                           int width);
+void AYUVToUVRow_Any_SVE2(const uint8_t* src_ptr,
+                          int src_stride,
+                          uint8_t* dst_vu,
+                          int width);
 void AYUVToVURow_Any_NEON(const uint8_t* src_ptr,
                           int src_stride,
                           uint8_t* dst_vu,
                           int width);
+void AYUVToVURow_Any_SVE2(const uint8_t* src_ptr,
+                          int src_stride,
+                          uint8_t* dst_vu,
+                          int width);
 void I422ToYUY2Row_C(const uint8_t* src_y,
                      const uint8_t* src_u,

diff --git a/source/convert.cc b/source/convert.cc
index fdd0cb644..2d9a24518 100644
--- a/source/convert.cc
+++ b/source/convert.cc
@@ -1784,6 +1784,14 @@ int AYUVToNV12(const uint8_t* src_ayuv,
     }
   }
 #endif
+#if defined(HAS_AYUVTOUVROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    AYUVToUVRow = AYUVToUVRow_Any_SVE2;
+    if (IS_ALIGNED(width, 2)) {
+      AYUVToUVRow = AYUVToUVRow_SVE2;
+    }
+  }
+#endif
 
   for (y = 0; y < height - 1; y += 2) {
     AYUVToUVRow(src_ayuv, src_stride_ayuv, dst_uv, width);
@@ -1853,6 +1861,14 @@ int AYUVToNV21(const uint8_t* src_ayuv,
     }
   }
 #endif
+#if defined(HAS_AYUVTOVUROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    AYUVToVURow = AYUVToVURow_Any_SVE2;
+    if (IS_ALIGNED(width, 2)) {
+      AYUVToVURow = AYUVToVURow_SVE2;
+    }
+  }
+#endif
 
   for (y = 0; y < height - 1; y += 2) {
     AYUVToVURow(src_ayuv, src_stride_ayuv, dst_vu, width);

diff --git a/source/row_any.cc b/source/row_any.cc
index 351aa4a2c..f7a5d838d 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -2446,6 +2446,12 @@ ANY12S(UYVYToUVRow_Any_LASX, UYVYToUVRow_LASX, 1, 4, 31)
 ANY11S(AYUVToUVRow_Any_NEON, AYUVToUVRow_NEON, 0, 4, 15)
 ANY11S(AYUVToVURow_Any_NEON, AYUVToVURow_NEON, 0, 4, 15)
 #endif
+#ifdef HAS_AYUVTOUVROW_SVE2
+ANY11S(AYUVToUVRow_Any_SVE2, AYUVToUVRow_SVE2, 0, 4, 1)
+#endif
+#ifdef HAS_AYUVTOVUROW_SVE2
+ANY11S(AYUVToVURow_Any_SVE2, AYUVToVURow_SVE2, 0, 4, 1)
+#endif
 #undef ANY11S
 
 #define ANYDETILE(NAMEANY, ANY_SIMD, T, BPP, MASK) \

diff --git a/source/row_sve.cc b/source/row_sve.cc
index b1d2cfe2f..4ef937d1c 100644
--- a/source/row_sve.cc
+++ b/source/row_sve.cc
@@ -651,6 +651,106 @@ void ARGB1555ToARGBRow_SVE2(const uint8_t* src_argb1555,
       : "cc", "memory", "z0", "z1", "z2", "z3", "z4", "p0", "p1", "p2");
 }
 
+// clang-format off
+#define AYUVTOUV_SVE(zU0, zV0, zU1, zV1)                                       \
+  "ld2h     {z0.h, z1.h}, p0/z, [%[src0]]             \n" /* VUVU.. YAYA.. */  \
+  "ld2h     {z1.h, z2.h}, p1/z, [%[src0], #2, mul vl] \n" /* VUVU.. YAYA.. */  \
+  "ld2h     {z2.h, z3.h}, p0/z, [%[src1]]             \n" /* VUVU.. YAYA.. */  \
+  "ld2h     {z3.h, z4.h}, p1/z, [%[src1], #2, mul vl] \n" /* VUVU.. YAYA.. */  \
+  "incb     %[src0], all, mul #4                      \n"                      \
+  "incb     %[src1], all, mul #4                      \n"                      \
+  "uaddlb   z4.h, z0.b, z2.b                          \n" /* V */              \
+  "uaddlt   z5.h, z0.b, z2.b                          \n" /* U */              \
+  "uaddlb   z6.h, z1.b, z3.b                          \n" /* V */              \
+  "uaddlt   z7.h, z1.b, z3.b                          \n" /* U */              \
+  "addp     " #zU0 ".h, p0/m, " #zU0 ".h, " #zV0 ".h  \n" /* UV */             \
+  "addp     " #zU1 ".h, p1/m, " #zU1 ".h, " #zV1 ".h  \n" /* UV */             \
+  "subs     %w[width], %w[width], %w[vl]              \n"                      \
+  "urshr    " #zU0 ".h, p0/m, " #zU0 ".h, #2          \n" /* U0V0 */           \
+  "urshr    " #zU1 ".h, p1/m, " #zU1 ".h, #2          \n" /* U0V0 */           \
+  "st1b     {" #zU0 ".h}, p0, [%[dst]]                \n"                      \
+  "st1b     {" #zU1 ".h}, p1, [%[dst], #1, mul vl]    \n"                      \
+  "incb     %[dst]                                    \n"
+// clang-format on
+
+// Filter 2 rows of AYUV UV's (444) into UV (420).
+// AYUV is VUYA in memory.  UV for NV12 is UV order in memory.
+void AYUVToUVRow_SVE2(const uint8_t* src_ayuv,
+                      int src_stride_ayuv,
+                      uint8_t* dst_uv,
+                      int width) {
+  // Output a row of UV values, filtering 2x2 rows of AYUV.
+  const uint8_t* src_ayuv1 = src_ayuv + src_stride_ayuv;
+  int vl;
+  asm("cntb     %x[vl]                                \n"
+      "subs     %w[width], %w[width], %w[vl]          \n"
+      "b.lt     2f                                    \n"
+
+      "ptrue    p0.h                                  \n"
+      "ptrue    p1.h                                  \n"
+      "1:                                             \n"
+      AYUVTOUV_SVE(z5, z4, z7, z6)
+      "b.ge     1b                                    \n"
+
+      "2:                                             \n"
+      "adds     %w[width], %w[width], %w[vl]          \n"
+      "b.eq     99f                                   \n"
+
+      "cnth     %x[vl]                                \n"
+      "whilelt  p0.h, wzr, %w[width]                  \n"  // first vector
+      "whilelt  p1.h, %w[vl], %w[width]               \n"  // second vector
+      AYUVTOUV_SVE(z5, z4, z7, z6)
+
+      "99:                                            \n"
+      : [src0]"+r"(src_ayuv),   // %[src0]
+        [src1]"+r"(src_ayuv1),  // %[src1]
+        [dst]"+r"(dst_uv),      // %[dst]
+        [width]"+r"(width),     // %[width]
+        [vl]"=&r"(vl)           // %[vl]
+      :
+      : "cc", "memory", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "p0",
+        "p1");
+}
+
+// Filter 2 rows of AYUV UV's (444) into VU (420).
+void AYUVToVURow_SVE2(const uint8_t* src_ayuv,
+                      int src_stride_ayuv,
+                      uint8_t* dst_vu,
+                      int width) {
+  // Output a row of VU values, filtering 2x2 rows of AYUV.
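+  // Identical to AYUVToUVRow_SVE2 except that AYUVTOUV_SVE is invoked
+  // with the U and V sum registers swapped (z4, z5, z6, z7), so the
+  // pairwise addp places V sums in even lanes and U sums in odd lanes.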
+  const uint8_t* src_ayuv1 = src_ayuv + src_stride_ayuv;
+  int vl;
+  asm("cntb     %x[vl]                                \n"
+      "subs     %w[width], %w[width], %w[vl]          \n"
+      "b.lt     2f                                    \n"
+
+      "ptrue    p0.h                                  \n"
+      "ptrue    p1.h                                  \n"
+      "1:                                             \n"
+      AYUVTOUV_SVE(z4, z5, z6, z7)
+      "b.ge     1b                                    \n"
+
+      "2:                                             \n"
+      "adds     %w[width], %w[width], %w[vl]          \n"
+      "b.eq     99f                                   \n"
+
+      "cnth     %x[vl]                                \n"
+      "whilelt  p0.h, wzr, %w[width]                  \n"  // first vector
+      "whilelt  p1.h, %w[vl], %w[width]               \n"  // second vector
+      AYUVTOUV_SVE(z4, z5, z6, z7)
+
+      "99:                                            \n"
+      : [src0]"+r"(src_ayuv),   // %[src0]
+        [src1]"+r"(src_ayuv1),  // %[src1]
+        [dst]"+r"(dst_vu),      // %[dst]
+        [width]"+r"(width),     // %[width]
+        [vl]"=&r"(vl)           // %[vl]
+      :
+      : "cc", "memory", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "p0",
+        "p1");
+}
+
 #endif  // !defined(LIBYUV_DISABLE_SVE) && defined(__aarch64__)
 
 #ifdef __cplusplus
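A note on the parameterized pairwise addition (illustration only, not
part of the patch): SVE2's predicated ADDP interleaves pairwise sums
from its two source vectors, with even-numbered result lanes taken from
the first source and odd-numbered lanes from the second, so swapping
the operands is all it takes to flip between UV and VU output order. A
rough C model of the 16-bit element case, ignoring predication (the
helper name is ours):

#include <stdint.h>

// Model of "addp zdn.h, pg/m, zdn.h, zm.h" with an all-true predicate:
// even result lanes hold pairwise sums from zdn, odd lanes pairwise
// sums from zm. With zdn = U sums and zm = V sums this produces
// U,V,U,V,...; swapping the operands produces V,U,V,U,..., which is the
// only difference between AYUVToUVRow_SVE2 and AYUVToVURow_SVE2.
static void sve2_addp_h_model(uint16_t* zdn, const uint16_t* zm, int lanes) {
  for (int i = 0; i + 1 < lanes; i += 2) {
    uint16_t even = (uint16_t)(zdn[i] + zdn[i + 1]);
    uint16_t odd = (uint16_t)(zm[i] + zm[i + 1]);
    zdn[i] = even;
    zdn[i + 1] = odd;
  }
}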