From 250e1e1ba3409647fcf8cc4b00b51df0a927ec81 Mon Sep 17 00:00:00 2001
From: George Steed
Date: Mon, 15 Apr 2024 15:11:38 +0100
Subject: [PATCH] [AArch64] Add SVE2 implementation of ARGBToRGB565DitherRow

Observed performance improvements compared to the existing Neon
implementation:

Cortex-A510: -21.7%
Cortex-A720: -49.2%
  Cortex-X2: -62.6%

Bug: libyuv:973
Change-Id: I2c7ae483c0b488a122bb3b80a745412ed44622df
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5505539
Reviewed-by: Frank Barchard
Reviewed-by: Justin Green
---
 include/libyuv/row.h        |  5 ++++
 source/convert_argb.cc      |  5 ++++
 source/convert_from_argb.cc |  5 ++++
 source/row_sve.cc           | 52 +++++++++++++++++++++++++++++++++++++
 4 files changed, 67 insertions(+)

diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index f3be39db5..8b550246f 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -575,6 +575,7 @@ extern "C" {
 #if !defined(LIBYUV_DISABLE_SVE) && defined(__aarch64__)
 #define HAS_ABGRTOUVJROW_SVE2
 #define HAS_ABGRTOUVROW_SVE2
+#define HAS_ARGBTORGB565DITHERROW_SVE2
 #define HAS_ARGBTORGB565ROW_SVE2
 #define HAS_ARGBTOUVJROW_SVE2
 #define HAS_ARGBTOUVROW_SVE2
@@ -3653,6 +3654,10 @@ void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
                                 uint8_t* dst_rgb,
                                 uint32_t dither4,
                                 int width);
+void ARGBToRGB565DitherRow_SVE2(const uint8_t* src_argb,
+                                uint8_t* dst_rgb,
+                                uint32_t dither4,
+                                int width);
 void ARGBToRGB24Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
 void ARGBToRAWRow_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
 void ARGBToRGB565Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
diff --git a/source/convert_argb.cc b/source/convert_argb.cc
index 5c710343d..bc5478184 100644
--- a/source/convert_argb.cc
+++ b/source/convert_argb.cc
@@ -6088,6 +6088,11 @@ int I420ToRGB565Dither(const uint8_t* src_y,
     }
   }
 #endif
+#if defined(HAS_ARGBTORGB565DITHERROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_SVE2;
+  }
+#endif
 #if defined(HAS_ARGBTORGB565DITHERROW_MSA)
   if (TestCpuFlag(kCpuHasMSA)) {
     ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MSA;
diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc
index daa99505a..ed2aa4496 100644
--- a/source/convert_from_argb.cc
+++ b/source/convert_from_argb.cc
@@ -1960,6 +1960,11 @@ int ARGBToRGB565Dither(const uint8_t* src_argb,
     }
   }
 #endif
+#if defined(HAS_ARGBTORGB565DITHERROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_SVE2;
+  }
+#endif
 #if defined(HAS_ARGBTORGB565DITHERROW_MSA)
   if (TestCpuFlag(kCpuHasMSA)) {
     ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MSA;
diff --git a/source/row_sve.cc b/source/row_sve.cc
index 43fc51d70..5b03e5a56 100644
--- a/source/row_sve.cc
+++ b/source/row_sve.cc
@@ -550,6 +550,58 @@ void ARGBToRGB565Row_SVE2(const uint8_t* src_argb,
       : "cc", "memory", "z0", "z1", "z3", "z4", "p0");
 }
 
+// Convert a row of ARGB to RGB565, adding a dither value to each pixel first.
+// dither4 packs four 8-bit dither values; pixel i uses byte (i % 4), counting
+// from the low byte. The dither is added with unsigned saturation to every
+// loaded byte before ARGBTORGB565_SVE (defined elsewhere in this file) runs;
+// the bytes stored afterwards come from z1. width is in pixels.
+void ARGBToRGB565DitherRow_SVE2(const uint8_t* src_argb,
+                                uint8_t* dst_rgb,
+                                uint32_t dither4,
+                                int width) {
+  unsigned bsl_mask = 0x7e0;
+  uint64_t vl;
+  width *= 2;  // Count output bytes: two per pixel (see st1b/incb below).
+  asm("mov z3.h, #3 \n"
+      "dup z4.h, %w[bsl_mask] \n"
+      "dup z2.s, %w[dither4] \n"  // four dither bytes in each 32-bit lane
+      "zip1 z2.b, z2.b, z2.b \n"  // d0,d0,d1,d1,...: one value per byte pair
+
+      "cntb %[vl] \n"  // vl = vector length in bytes
+      "subs %w[width], %w[width], %w[vl] \n"
+      "b.lt 2f \n"  // less than one full vector of output
+
+      "ptrue p0.b \n"
+      "1: \n"
+      "ld2b {z0.b, z1.b}, p0/z, [%[src]] \n"  // BR, GA
+      "incb %[src], all, mul #2 \n"
+      "uqadd z0.b, z0.b, z2.b \n"  // saturating dither add
+      "uqadd z1.b, z1.b, z2.b \n"
+      "subs %w[width], %w[width], %w[vl] \n" ARGBTORGB565_SVE
+      "st1b {z1.b}, p0, [%[dst]] \n"
+      "incb %[dst] \n"
+      "b.ge 1b \n"
+
+      "2: \n"
+      "adds %w[width], %w[width], %w[vl] \n"  // remaining output bytes
+      "b.eq 99f \n"
+
+      "whilelt p0.b, wzr, %w[width] \n"  // predicate covering the tail
+      "ld2b {z0.b, z1.b}, p0/z, [%[src]] \n"  // BR, GA
+      "uqadd z0.b, z0.b, z2.b \n"
+      "uqadd z1.b, z1.b, z2.b \n" ARGBTORGB565_SVE
+      "st1b {z1.b}, p0, [%[dst]] \n"
+
+      "99: \n"
+      : [src] "+r"(src_argb),  // %[src]
+        [dst] "+r"(dst_rgb),   // %[dst]
+        [width] "+r"(width),   // %[width]
+        [vl] "=&r"(vl)         // %[vl]
+      : [bsl_mask] "r"(bsl_mask),  // %[bsl_mask]
+        [dither4] "r"(dither4)     // %[dither4]
+      : "cc", "memory", "z0", "z1", "z2", "z3", "z4", "p0");  // z2 = dither
+}
+
 #endif  // !defined(LIBYUV_DISABLE_SVE) && defined(__aarch64__)
 
 #ifdef __cplusplus