From 5c12e0b2de33e9a3031526c1f392cc0d11d49f5f Mon Sep 17 00:00:00 2001
From: George Steed
Date: Tue, 7 May 2024 13:26:07 +0100
Subject: [PATCH] [AArch64] Add SVE2 implementations of HalfFloat{,1}Row

For HalfFloat1Row, SVE has direct 16-bit integer to half-float
conversion instructions, so there is no need to widen to 32 bits.

For HalfFloatRow, SVE zero-extending loads avoid the need for separate
UXTL(2) instructions.

Observed reductions in runtime compared to the existing Neon code:

            | HalfFloat1Row | HalfFloatRow
Cortex-A510 |        -38.3% |       -17.3%
Cortex-A520 |        -37.6% |       -18.8%
Cortex-A720 |        -50.1% |        -7.8%
Cortex-X2   |        -50.2% |        -0.4%
Cortex-X4   |        -51.5% |       -12.5%

Bug: b/42280942
Change-Id: I445071ccd453113144ce42d465ba03c9ee89ec9e
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5975319
Reviewed-by: Justin Green
Reviewed-by: Frank Barchard
---
 include/libyuv/row.h       |   9 +++
 source/planar_functions.cc |   5 ++
 source/row_sve.cc          | 119 +++++++++++++++++++++++++++++++++++++
 3 files changed, 133 insertions(+)

diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index da7bab8f3..ccb849862 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -549,6 +549,7 @@ extern "C" {
 #define HAS_AYUVTOVUROW_SVE2
 #define HAS_BGRATOUVROW_SVE2
 #define HAS_DIVIDEROW_16_SVE2
+#define HAS_HALFFLOATROW_SVE2
 #define HAS_I400TOARGBROW_SVE2
 #define HAS_I422ALPHATOARGBROW_SVE2
 #define HAS_I422TOARGB1555ROW_SVE2
@@ -6609,6 +6610,10 @@ void HalfFloatRow_Any_NEON(const uint16_t* src_ptr,
                            uint16_t* dst_ptr,
                            float param,
                            int width);
+void HalfFloatRow_SVE2(const uint16_t* src,
+                       uint16_t* dst,
+                       float scale,
+                       int width);
 void HalfFloat1Row_NEON(const uint16_t* src,
                         uint16_t* dst,
                         float scale,
@@ -6617,6 +6622,10 @@ void HalfFloat1Row_Any_NEON(const uint16_t* src_ptr,
                             uint16_t* dst_ptr,
                             float param,
                             int width);
+void HalfFloat1Row_SVE2(const uint16_t* src,
+                        uint16_t* dst,
+                        float scale,
+                        int width);
 void HalfFloatRow_MSA(const uint16_t* src,
                       uint16_t* dst,
                       float scale,
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index e13626f81..7e81b5cc2 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -5195,6 +5195,11 @@ int HalfFloatPlane(const uint16_t* src_y,
     }
   }
 #endif
+#if defined(HAS_HALFFLOATROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    HalfFloatRow = scale == 1.0f ? HalfFloat1Row_SVE2 : HalfFloatRow_SVE2;
+  }
+#endif
 #if defined(HAS_HALFFLOATROW_MSA)
   if (TestCpuFlag(kCpuHasMSA)) {
     HalfFloatRow = HalfFloatRow_Any_MSA;
diff --git a/source/row_sve.cc b/source/row_sve.cc
index b70824444..4920fef3d 100644
--- a/source/row_sve.cc
+++ b/source/row_sve.cc
@@ -1670,6 +1670,125 @@ void DivideRow_16_SVE2(const uint16_t* src_y,
       : "cc", "memory", "z0", "z1", "z2", "p0", "p1");
 }
 
+#define HALFFLOAT_SVE                  \
+  "scvtf z0.s, p0/m, z0.s \n"          \
+  "scvtf z1.s, p0/m, z1.s \n"          \
+  "scvtf z2.s, p0/m, z2.s \n"          \
+  "scvtf z3.s, p0/m, z3.s \n"          \
+  "fmul z0.s, z0.s, z4.s \n"           \
+  "fmul z1.s, z1.s, z4.s \n"           \
+  "fmul z2.s, z2.s, z4.s \n"           \
+  "fmul z3.s, z3.s, z4.s \n"           \
+  "uqshrnb z0.h, z0.s, #13 \n"         \
+  "uqshrnb z1.h, z1.s, #13 \n"         \
+  "uqshrnb z2.h, z2.s, #13 \n"         \
+  "uqshrnb z3.h, z3.s, #13 \n"
+
+void HalfFloatRow_SVE2(const uint16_t* src,
+                       uint16_t* dst,
+                       float scale,
+                       int width) {
+  uint64_t vl;
+  asm("cntw %x0" : "=r"(vl));
+  asm volatile(
+      "mov z4.s, %s[scale] \n"
+      "subs %w[width], %w[width], %w[vl], lsl #2 \n"
+      "b.lt 2f \n"
+
+      // Run bulk of computation with all-true predicates to avoid predicate
+      // generation overhead.
+ "ptrue p0.s \n" + "1: \n" + "ld1h {z0.s}, p0/z, [%[src]] \n" + "ld1h {z1.s}, p0/z, [%[src], #1, mul vl] \n" + "ld1h {z2.s}, p0/z, [%[src], #2, mul vl] \n" + "ld1h {z3.s}, p0/z, [%[src], #3, mul vl] \n" + "incb %[src], all, mul #2 \n" HALFFLOAT_SVE + "subs %w[width], %w[width], %w[vl], lsl #2 \n" + "st1h {z0.s}, p0, [%[dst]] \n" + "st1h {z1.s}, p0, [%[dst], #1, mul vl] \n" + "st1h {z2.s}, p0, [%[dst], #2, mul vl] \n" + "st1h {z3.s}, p0, [%[dst], #3, mul vl] \n" + "incb %[dst], all, mul #2 \n" + "b.ge 1b \n" + + "2: \n" + "adds %w[width], %w[width], %w[vl], lsl #2 \n" + "b.eq 99f \n" + + // Calculate predicates for the final iteration to deal with the tail. + "whilelt p0.s, wzr, %w[width] \n" + "whilelt p1.s, %w[vl], %w[width] \n" + "whilelt p2.s, %w[vl2], %w[width] \n" + "whilelt p3.s, %w[vl3], %w[width] \n" + "ld1h {z0.s}, p0/z, [%[src]] \n" + "ld1h {z1.s}, p1/z, [%[src], #1, mul vl] \n" + "ld1h {z2.s}, p2/z, [%[src], #2, mul vl] \n" + "ld1h {z3.s}, p3/z, [%[src], #3, mul vl] \n" HALFFLOAT_SVE + "st1h {z0.s}, p0, [%[dst]] \n" + "st1h {z1.s}, p1, [%[dst], #1, mul vl] \n" + "st1h {z2.s}, p2, [%[dst], #2, mul vl] \n" + "st1h {z3.s}, p3, [%[dst], #3, mul vl] \n" + + "99: \n" + : [src] "+r"(src), // %[src] + [dst] "+r"(dst), // %[dst] + [width] "+r"(width) // %[width] + : [vl] "r"(vl), // %[vl] + [vl2] "r"(vl * 2), // %[vl2] + [vl3] "r"(vl * 3), // %[vl3] + [scale] "w"(scale * 1.9259299444e-34f) // %[scale] + : "cc", "memory", "z0", "z1", "z2", "z3", "z4", "p0", "p1", "p2", "p3"); +} + +void HalfFloat1Row_SVE2(const uint16_t* src, + uint16_t* dst, + float scale, + int width) { + uint64_t vl; + asm volatile( + "cnth %x[vl] \n" + "subs %w[width], %w[width], %w[vl], lsl #1 \n" + "b.lt 2f \n" + + // Run bulk of computation with all-true predicates to avoid predicate + // generation overhead. + "ptrue p0.h \n" + "1: \n" + "ld1h {z0.h}, p0/z, [%[src]] \n" + "ld1h {z1.h}, p0/z, [%[src], #1, mul vl] \n" + "incb %[src], all, mul #2 \n" + "ucvtf z0.h, p0/m, z0.h \n" + "ucvtf z1.h, p0/m, z1.h \n" + "subs %w[width], %w[width], %w[vl], lsl #1 \n" + "st1h {z0.h}, p0, [%[dst]] \n" + "st1h {z1.h}, p0, [%[dst], #1, mul vl] \n" + "incb %[dst], all, mul #2 \n" + "b.ge 1b \n" + + "2: \n" + "adds %w[width], %w[width], %w[vl], lsl #1 \n" + "b.eq 99f \n" + + // Calculate predicates for the final iteration to deal with the tail. + "whilelt p0.h, wzr, %w[width] \n" + "whilelt p1.h, %w[vl], %w[width] \n" + "ld1h {z0.h}, p0/z, [%[src]] \n" + "ld1h {z1.h}, p1/z, [%[src], #1, mul vl] \n" + "ucvtf z0.h, p0/m, z0.h \n" + "ucvtf z1.h, p0/m, z1.h \n" + "st1h {z0.h}, p0, [%[dst]] \n" + "st1h {z1.h}, p1, [%[dst], #1, mul vl] \n" + + "99: \n" + : [src] "+r"(src), // %[src] + [dst] "+r"(dst), // %[dst] + [width] "+r"(width), // %[width] + [vl] "=&r"(vl) // %[vl] + : + : "cc", "memory", "z0", "z1", "p0", "p1"); +} + #endif // !defined(LIBYUV_DISABLE_SVE) && defined(__aarch64__) #ifdef __cplusplus