From c4a0c8d34aa28cec37fd4ac9ea6fd2a1eb187c10 Mon Sep 17 00:00:00 2001
From: George Steed
Date: Tue, 28 Jan 2025 10:21:16 +0000
Subject: [PATCH] [AArch64] Add SVE2 and SME implementations for Convert8To8Row

SVE can make use of the UMULH instruction to avoid needing separate
widening multiply and narrowing steps for the scale application.
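As an illustration (not part of the patch), the per-pixel operation being
vectorized is dst = ((src * scale) >> 8) + bias, as can be read off the
UMULH/ADD sequence below. A minimal scalar sketch, with a hypothetical
helper name, showing what UMULH on byte lanes fuses:

  #include <stdint.h>

  // Scalar model of one row (hypothetical helper, for illustration only).
  static void Convert8To8Row_ScalarModel(const uint8_t* src_y, uint8_t* dst_y,
                                         int scale, int bias, int width) {
    for (int x = 0; x < width; ++x) {
      // UMULH z0.b, z0.b, z2.b returns the high byte of the 8x8->16 bit
      // unsigned product, i.e. (src * scale) >> 8, in one instruction, so
      // no separate widening multiply (UMULL) and narrowing shift (SHRN)
      // are needed as in the Neon version.
      uint8_t scaled = (uint8_t)(((uint32_t)src_y[x] * (uint8_t)scale) >> 8);
      // ADD z0.b, z0.b, z3.b then applies the bias (modulo 256; the byte
      // vector ADD wraps rather than saturates).
      dst_y[x] = (uint8_t)(scaled + (uint8_t)bias);
    }
  }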
+ "whilelt p0.b, wzr, %w2 \n" + "whilelt p1.b, %w[vl], %w2 \n" // + CONVERT8TO8_SVE + + "99: \n" + : [src] "+r"(src_y), // %[src] + [dst] "+r"(dst_y), // %[dst] + [width] "+r"(width), // %[width] + [vl] "=&r"(vl) // %[vl] + : [scale] "r"(scale), // %[scale] + [bias] "r"(bias) // %[bias] + : "cc", "memory", "z0", "z1", "z2", "z3", "p0", "p1"); +} + #endif // !defined(LIBYUV_DISABLE_SVE) && defined(__aarch64__) #ifdef __cplusplus diff --git a/source/planar_functions.cc b/source/planar_functions.cc index ca0bfea90..c2d4b67a4 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -271,6 +271,16 @@ void Convert8To8Plane(const uint8_t* src_y, } } #endif +#if defined(HAS_CONVERT8TO8ROW_SVE2) + if (TestCpuFlag(kCpuHasSVE2)) { + Convert8To8Row = Convert8To8Row_SVE2; + } +#endif +#if defined(HAS_CONVERT8TO8ROW_SME) + if (TestCpuFlag(kCpuHasSME)) { + Convert8To8Row = Convert8To8Row_SME; + } +#endif #if defined(HAS_CONVERT8TO8ROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { Convert8To8Row = Convert8To8Row_Any_AVX2; diff --git a/source/row_sme.cc b/source/row_sme.cc index 561b7f73d..1cbc42f3e 100644 --- a/source/row_sme.cc +++ b/source/row_sme.cc @@ -1075,6 +1075,14 @@ __arm_locally_streaming void InterpolateRow_16To8_SME(uint8_t* dst_ptr, : "cc", "memory", "z0", "z1", "z2", "z3", "z4", "z31", "p0"); } +__arm_locally_streaming void Convert8To8Row_SME(const uint8_t* src_y, + uint8_t* dst_y, + int scale, + int bias, + int width) { + Convert8To8Row_SVE_SC(src_y, dst_y, scale, bias, width); +} + #endif // !defined(LIBYUV_DISABLE_SME) && defined(CLANG_HAS_SME) && // defined(__aarch64__) diff --git a/source/row_sve.cc b/source/row_sve.cc index 8076c9ebc..0bab8e16f 100644 --- a/source/row_sve.cc +++ b/source/row_sve.cc @@ -1237,6 +1237,14 @@ void I212ToARGBRow_SVE2(const uint16_t* src_y, I212ToARGBRow_SVE_SC(src_y, src_u, src_v, dst_argb, yuvconstants, width); } +void Convert8To8Row_SVE2(const uint8_t* src_y, + uint8_t* dst_y, + int scale, + int bias, + int width) { + Convert8To8Row_SVE_SC(src_y, dst_y, scale, bias, width); +} + #endif // !defined(LIBYUV_DISABLE_SVE) && defined(__aarch64__) #ifdef __cplusplus