From f27b983f382be8d49b1d473562918820aa124ed1 Mon Sep 17 00:00:00 2001
From: George Steed
Date: Sun, 5 May 2024 20:51:51 +0100
Subject: [PATCH] [AArch64] Add SVE2 implementation of DivideRow_16

SVE contains the UMULH instruction, which allows us to multiply and take
the high half of the result in a single instruction rather than needing
separate widening-multiply and narrowing-shift steps.

Observed reduction in runtime compared to the existing Neon code:

Cortex-A510: -21.2%
Cortex-A520: -20.9%
Cortex-A715: -47.9%
Cortex-A720: -47.6%
Cortex-X2:   -5.2%
Cortex-X3:   -2.6%
Cortex-X4:   -32.4%
Cortex-X925: -1.5%

Bug: b/42280942
Change-Id: I25154699b17772db1fb5cb84c049919181d86f4b
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5975318
Reviewed-by: Justin Green
Reviewed-by: Frank Barchard
---
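
For context, the kernel computes per lane what the scalar loop below does:
UMULH of two unsigned .h elements yields exactly (a * b) >> 16, so the
multiply and the shift collapse into one instruction. The second helper is
a sketch of the two-step widening-multiply / narrowing-shift sequence that
Neon needs for the same result (the in-tree Neon kernel is hand-written
assembly, not intrinsics). Both function names below are illustrative
only, not from the libyuv tree.

#include <arm_neon.h>
#include <stdint.h>

// Scalar reference: dst[x] = (src[x] * scale) >> 16, with scale truncated
// to 16 bits to match "dup z0.h, %w[scale]" in the SVE2 kernel.
static void divide_row_16_ref(const uint16_t* src_y,
                              uint16_t* dst_y,
                              int scale,
                              int width) {
  for (int x = 0; x < width; ++x) {
    dst_y[x] = (uint16_t)(((uint32_t)src_y[x] * (uint16_t)scale) >> 16);
  }
}

// Neon contrast: a widening multiply (16x16 -> 32) plus a narrowing shift
// per four lanes, versus a single UMULH per vector in SVE2.
static uint16x4_t divide4_neon(uint16x4_t v, uint16x4_t vscale) {
  uint32x4_t wide = vmull_u16(v, vscale);  // widening multiply
  return vshrn_n_u32(wide, 16);            // take high half, narrow to u16
}
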
+ "whilelt p0.h, wzr, %w[width] \n" + "whilelt p1.h, %w[vl], %w[width] \n" + "ld1h {z1.h}, p0/z, [%[src]] \n" + "ld1h {z2.h}, p1/z, [%[src], #1, mul vl] \n" + "umulh z1.h, z1.h, z0.h \n" + "umulh z2.h, z2.h, z0.h \n" + "st1h {z1.h}, p0, [%[dst]] \n" + "st1h {z2.h}, p1, [%[dst], #1, mul vl] \n" + + "99: \n" + : [src] "+r"(src_y), // %[src] + [dst] "+r"(dst_y), // %[dst] + [width] "+r"(width), // %[width] + [vl] "=&r"(vl) // %[vl] + : [scale] "r"(scale) // %[scale] + : "cc", "memory", "z0", "z1", "z2", "p0", "p1"); +} + #endif // !defined(LIBYUV_DISABLE_SVE) && defined(__aarch64__) #ifdef __cplusplus