From 9144583f22ba23900e89c03c8483d2f42c712f6c Mon Sep 17 00:00:00 2001
From: George Steed
Date: Thu, 12 Sep 2024 10:21:17 +0100
Subject: [PATCH] [AArch64] Add SME impls of MultiplyRow_16 and ARGBMultiplyRow

Mostly just a translation of the existing Neon code to SME.

Change-Id: Ic3d6b8ac774c9a1bb9204ed6c78c8802668bffe9
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6067147
Reviewed-by: Frank Barchard
---
 include/libyuv/row.h       | 10 ++++
 source/planar_functions.cc | 10 ++++
 source/row_sme.cc          | 96 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 116 insertions(+)

diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 1953f9fe0..4dafb63b8 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -589,8 +589,10 @@ extern "C" {
 // The following are available on AArch64 SME platforms:
 #if !defined(LIBYUV_DISABLE_SME) && defined(CLANG_HAS_SME) && \
     defined(__aarch64__)
+#define HAS_ARGBMULTIPLYROW_SME
 #define HAS_I422TOARGBROW_SME
 #define HAS_I444TOARGBROW_SME
+#define HAS_MULTIPLYROW_16_SME
 #endif
 
 // The following are available on AArch64 platforms:
@@ -3372,6 +3374,10 @@ void MultiplyRow_16_Any_NEON(const uint16_t* src_ptr,
                              uint16_t* dst_ptr,
                              int scale,
                              int width);
+void MultiplyRow_16_SME(const uint16_t* src_y,
+                        uint16_t* dst_y,
+                        int scale,
+                        int width);
 
 void DivideRow_16_C(const uint16_t* src_y,
                     uint16_t* dst_y,
@@ -5039,6 +5045,10 @@ void ARGBMultiplyRow_Any_NEON(const uint8_t* y_buf,
                               const uint8_t* uv_buf,
                               uint8_t* dst_ptr,
                               int width);
+void ARGBMultiplyRow_SME(const uint8_t* src_argb,
+                         const uint8_t* src_argb1,
+                         uint8_t* dst_argb,
+                         int width);
 void ARGBMultiplyRow_MSA(const uint8_t* src_argb0,
                          const uint8_t* src_argb1,
                          uint8_t* dst_argb,
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index 3c2f3f850..cc909eb73 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -829,6 +829,11 @@ void ConvertToMSBPlane_16(const uint16_t* src_y,
     }
   }
 #endif
+#if defined(HAS_MULTIPLYROW_16_SME)
+  if (TestCpuFlag(kCpuHasSME)) {
+    MultiplyRow_16 = MultiplyRow_16_SME;
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     MultiplyRow_16(src_y, dst_y, scale, width);
@@ -3134,6 +3139,11 @@ int ARGBMultiply(const uint8_t* src_argb0,
     }
   }
 #endif
+#if defined(HAS_ARGBMULTIPLYROW_SME)
+  if (TestCpuFlag(kCpuHasSME)) {
+    ARGBMultiplyRow = ARGBMultiplyRow_SME;
+  }
+#endif
 #if defined(HAS_ARGBMULTIPLYROW_MSA)
   if (TestCpuFlag(kCpuHasMSA)) {
     ARGBMultiplyRow = ARGBMultiplyRow_Any_MSA;
diff --git a/source/row_sme.cc b/source/row_sme.cc
index 7676d9e64..da94cd7be 100644
--- a/source/row_sme.cc
+++ b/source/row_sme.cc
@@ -216,6 +216,102 @@ __arm_locally_streaming void I422ToARGBRow_SME(
       : "cc", "memory", YUVTORGB_SVE_REGS);
 }
 
+__arm_locally_streaming void MultiplyRow_16_SME(const uint16_t* src_y,
+                                                uint16_t* dst_y,
+                                                int scale,
+                                                int width) {
+  // Streaming-SVE only, no use of ZA tile.
+  int vl;
+  asm volatile(
+      "cnth     %x[vl]                                \n"
+      "mov      z0.h, %w[scale]                       \n"
+      "subs     %w[width], %w[width], %w[vl]          \n"
+      "b.lt     2f                                    \n"
+
+      // Run bulk of computation with an all-true predicate to avoid predicate
+      // generation overhead.
+      "ptrue    p0.h                                  \n"
+      "1:                                             \n"
+      "ld1h     {z1.h}, p0/z, [%[src_y]]              \n"
+      "incb     %[src_y]                              \n"
+      "mul      z1.h, z0.h, z1.h                      \n"
+      "subs     %w[width], %w[width], %w[vl]          \n"
+      "st1h     {z1.h}, p0, [%[dst_y]]                \n"
+      "incb     %[dst_y]                              \n"
+      "b.ge     1b                                    \n"
+
+      "2:                                             \n"
+      "adds     %w[width], %w[width], %w[vl]          \n"
+      "b.eq     99f                                   \n"
+
+      // Calculate a predicate for the final iteration to deal with the tail.
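+      // whilelt sets the low %w[width] lanes of p0 active and the rest
+      // inactive, so the final partial vector is loaded, scaled and stored
+      // without reading or writing past the end of the row.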
+ "whilelt p0.h, wzr, %w[width] \n" + "ld1h {z1.h}, p0/z, [%[src_y]] \n" + "mul z1.h, z0.h, z1.h \n" + "st1h {z1.h}, p0, [%[dst_y]] \n" + + "99: \n" + : [src_y] "+r"(src_y), // %[src_y] + [dst_y] "+r"(dst_y), // %[dst_y] + [width] "+r"(width), // %[width] + [vl] "=&r"(vl) // %[vl] + : [scale] "r"(scale) // %[scale] + : "memory", "cc", "z0", "z1", "p0"); +} + +__arm_locally_streaming void ARGBMultiplyRow_SME(const uint8_t* src_argb, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + // Streaming-SVE only, no use of ZA tile. + width *= 4; + int vl; + asm volatile( + "cntb %x[vl] \n" + "subs %w[width], %w[width], %w[vl] \n" + "b.lt 2f \n" + + // Run bulk of computation with an all-true predicate to avoid predicate + // generation overhead. + "ptrue p0.b \n" + "1: \n" + "ld1b {z0.b}, p0/z, [%[src_argb]] \n" + "ld1b {z1.b}, p0/z, [%[src_argb1]] \n" + "incb %[src_argb] \n" + "incb %[src_argb1] \n" + "umullb z2.h, z0.b, z1.b \n" + "umullt z1.h, z0.b, z1.b \n" + "rshrnb z0.b, z2.h, #8 \n" + "rshrnt z0.b, z1.h, #8 \n" + "subs %w[width], %w[width], %w[vl] \n" + "st1b {z0.b}, p0, [%[dst_argb]] \n" + "incb %[dst_argb] \n" + "b.ge 1b \n" + + "2: \n" + "adds %w[width], %w[width], %w[vl] \n" + "b.eq 99f \n" + + // Calculate a predicate for the final iteration to deal with the tail. + "whilelt p0.b, wzr, %w[width] \n" + "ld1b {z0.b}, p0/z, [%[src_argb]] \n" + "ld1b {z1.b}, p0/z, [%[src_argb1]] \n" + "umullb z2.h, z0.b, z1.b \n" + "umullt z1.h, z0.b, z1.b \n" + "rshrnb z0.b, z2.h, #8 \n" + "rshrnt z0.b, z1.h, #8 \n" + "st1b {z0.b}, p0, [%[dst_argb]] \n" + + "99: \n" + : [src_argb] "+r"(src_argb), // %[src_argb] + [src_argb1] "+r"(src_argb1), // %[src_argb1] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width), // %[width] + [vl] "=&r"(vl) // %[vl] + : + : "memory", "cc", "z0", "z1", "z2", "p0", "p1"); +} + #endif // !defined(LIBYUV_DISABLE_SME) && defined(CLANG_HAS_SME) && // defined(__aarch64__)