From c613c3f1024d6d59b30f3816fc717a4f074b532e Mon Sep 17 00:00:00 2001
From: George Steed
Date: Thu, 25 Apr 2024 14:51:56 +0100
Subject: [PATCH] [AArch64] Add SVE2 implementations for RAWTo{ARGB,RGBA}Row

We can construct particular predicates to load only up to 3/4 of a full
vector, allowing us to use TBL to shuffle elements into the correct place
rather than needing to rely on more expensive LD3 or ST4 instructions.

Reduction in runtimes observed compared to the existing Neon
implementation:

            | RAWToARGBRow | RAWToRGBARow
Cortex-A510 |       -32.4% |       -31.9%
Cortex-A720 |       -15.7% |       -15.6%
Cortex-X2   |       -24.6% |       -24.4%

Bug: libyuv:973
Change-Id: I271c625d97bab3b0e08ac1e9d7fcf7d18f3d6894
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5631542
Reviewed-by: Frank Barchard
Reviewed-by: Justin Green
---
 include/libyuv/row.h   |  4 +++
 source/convert_argb.cc | 10 +++++++
 source/row_sve.cc      | 65 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 79 insertions(+)

diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 86b024321..8e3c6cb0d 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -600,6 +600,8 @@ extern "C" {
 #define HAS_I444TOARGBROW_SVE2
 #define HAS_NV12TOARGBROW_SVE2
 #define HAS_NV21TOARGBROW_SVE2
+#define HAS_RAWTOARGBROW_SVE2
+#define HAS_RAWTORGBAROW_SVE2
 #define HAS_RGBATOUVROW_SVE2
 #define HAS_UYVYTOARGBROW_SVE2
 #define HAS_YUY2TOARGBROW_SVE2
@@ -3510,7 +3512,9 @@ void RGB24ToARGBRow_LASX(const uint8_t* src_rgb24,
                          int width);
 void RGB24ToARGBRow_RVV(const uint8_t* src_rgb24, uint8_t* dst_argb, int width);
 void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width);
+void RAWToARGBRow_SVE2(const uint8_t* src_raw, uint8_t* dst_argb, int width);
 void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width);
+void RAWToRGBARow_SVE2(const uint8_t* src_raw, uint8_t* dst_rgba, int width);
 void RAWToARGBRow_MSA(const uint8_t* src_raw, uint8_t* dst_argb, int width);
 void RAWToARGBRow_LSX(const uint8_t* src_raw, uint8_t* dst_argb, int width);
 void RAWToARGBRow_LASX(const uint8_t* src_raw, uint8_t* dst_argb, int width);
diff --git a/source/convert_argb.cc b/source/convert_argb.cc
index 11948726a..73f004b81 100644
--- a/source/convert_argb.cc
+++ b/source/convert_argb.cc
@@ -3555,6 +3555,11 @@ int RAWToARGB(const uint8_t* src_raw,
     }
   }
 #endif
+#if defined(HAS_RAWTOARGBROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    RAWToARGBRow = RAWToARGBRow_SVE2;
+  }
+#endif
 #if defined(HAS_RAWTOARGBROW_MSA)
   if (TestCpuFlag(kCpuHasMSA)) {
     RAWToARGBRow = RAWToARGBRow_Any_MSA;
@@ -3635,6 +3640,11 @@ int RAWToRGBA(const uint8_t* src_raw,
     }
   }
 #endif
+#if defined(HAS_RAWTORGBAROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    RAWToRGBARow = RAWToRGBARow_SVE2;
+  }
+#endif
 #if defined(HAS_RAWTORGBAROW_RVV)
   if (TestCpuFlag(kCpuHasRVV)) {
     RAWToRGBARow = RAWToRGBARow_RVV;
diff --git a/source/row_sve.cc b/source/row_sve.cc
index 89a86d53b..66e1d17df 100644
--- a/source/row_sve.cc
+++ b/source/row_sve.cc
@@ -1113,6 +1113,71 @@ void UYVYToARGBRow_SVE2(const uint8_t* src_uyvy,
       : "cc", "memory", YUVTORGB_SVE_REGS, "p2");
 }
 
+static inline void RAWToWXYZRow_SVE2(const uint8_t* src_raw,
+                                     uint8_t* dst_wxyz,
+                                     int width,
+                                     uint32_t idx_start,
+                                     uint32_t idx_step,
+                                     uint32_t alpha) {
+  uint32_t vl;
+  asm("cntw %x0" : "=r"(vl));
+  uint32_t vl_mul3 = vl * 3;
+  asm volatile(
+      "index    z31.s, %w[idx_start], %w[idx_step]        \n"
+      "dup      z30.s, %w[alpha]                          \n"
+      "subs     %w[width], %w[width], %w[vl]              \n"
+      "b.lt     2f                                        \n"
+
+      // Run bulk of computation with the same predicates to avoid predicate
+      // generation overhead. We set up p1 to only load 3/4 of a vector.
+      "ptrue    p0.s                                      \n"
+      "whilelt  p1.b, wzr, %w[vl_mul3]                    \n"
+      "1:                                                 \n"
+      "ld1b     {z0.b}, p1/z, [%[src]]                    \n"
+      "add      %[src], %[src], %x[vl_mul3]               \n"
+      "tbl      z0.b, {z0.b}, z31.b                       \n"
+      "subs     %w[width], %w[width], %w[vl]              \n"
+      "orr      z0.d, z0.d, z30.d                         \n"
+      "st1w     {z0.s}, p0, [%[dst]]                      \n"
+      "incb     %[dst]                                    \n"
+      "b.ge     1b                                        \n"
+
+      "2:                                                 \n"
+      "adds     %w[width], %w[width], %w[vl]              \n"
+      "b.eq     99f                                       \n"
+
+      // Calculate a pair of predicates for the final iteration to deal with
+      // the tail.
+      "add      %w[vl_mul3], %w[width], %w[width], lsl #1 \n"
+      "whilelt  p0.s, wzr, %w[width]                      \n"
+      "whilelt  p1.b, wzr, %w[vl_mul3]                    \n"
+      "ld1b     {z0.b}, p1/z, [%[src]]                    \n"
+      "tbl      z0.b, {z0.b}, z31.b                       \n"
+      "orr      z0.d, z0.d, z30.d                         \n"
+      "st1w     {z0.s}, p0, [%[dst]]                      \n"
+
+      "99:                                                \n"
+      : [src] "+r"(src_raw),                    // %[src]
+        [dst] "+r"(dst_wxyz),                   // %[dst]
+        [width] "+r"(width),                    // %[width]
+        [vl_mul3] "+r"(vl_mul3)                 // %[vl_mul3]
+      : [idx_start] "r"(idx_start),             // %[idx_start]
+        [idx_step] "r"(idx_step),               // %[idx_step]
+        [alpha] "r"(alpha),                     // %[alpha]
+        [vl] "r"(vl)                            // %[vl]
+      : "cc", "memory", "z0", "z30", "z31", "p0", "p1");
+}
+
+void RAWToARGBRow_SVE2(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
+  RAWToWXYZRow_SVE2(src_raw, dst_argb, width, 0xff000102U, 0x00030303U,
+                    0xff000000U);
+}
+
+void RAWToRGBARow_SVE2(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
+  RAWToWXYZRow_SVE2(src_raw, dst_rgba, width, 0x000102ffU, 0x03030300U,
+                    0x000000ffU);
+}
+
 #endif  // !defined(LIBYUV_DISABLE_SVE) && defined(__aarch64__)
 
 #ifdef __cplusplus
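
For reference, here is a scalar sketch of what the TBL + ORR sequence above
computes per pixel in the ARGB case. It assumes libyuv's byte orderings (RAW
is r, g, b in memory; ARGB is b, g, r, a), and the helper name
RAWToARGBRow_Scalar is illustrative only, not part of the patch:

  #include <stdint.h>

  // Scalar model of the SVE2 path: each RAW pixel (r, g, b in memory) is
  // shuffled into libyuv ARGB byte order (b, g, r, a) and the alpha byte is
  // forced to 0xff. This mirrors TBL, where the 0xff index selects a byte
  // that is either out of range or never loaded (hence zero), followed by
  // ORR with the alpha constant.
  static void RAWToARGBRow_Scalar(const uint8_t* src_raw, uint8_t* dst_argb,
                                  int width) {
    for (int i = 0; i < width; ++i) {
      const uint8_t r = src_raw[3 * i + 0];
      const uint8_t g = src_raw[3 * i + 1];
      const uint8_t b = src_raw[3 * i + 2];
      dst_argb[4 * i + 0] = b;     // TBL byte index 0x02
      dst_argb[4 * i + 1] = g;     // TBL byte index 0x01
      dst_argb[4 * i + 2] = r;     // TBL byte index 0x00
      dst_argb[4 * i + 3] = 0xff;  // zeroed by TBL, then ORR with alpha
    }
  }

Read this way, idx_start 0xff000102 packs the byte indices (2, 1, 0,
out-of-range) for the first pixel's output word, idx_step 0x00030303 advances
the three in-range indices by one RAW pixel (3 bytes) per 32-bit lane, and the
ORR with 0xff000000 fills the zeroed alpha byte. The RGBA variant uses the
same scheme with the alpha byte in the lowest position of each output word.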