diff --git a/include/libyuv/row.h b/include/libyuv/row.h index f3be39db5..8b550246f 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -575,6 +575,7 @@ extern "C" { #if !defined(LIBYUV_DISABLE_SVE) && defined(__aarch64__) #define HAS_ABGRTOUVJROW_SVE2 #define HAS_ABGRTOUVROW_SVE2 +#define HAS_ARGBTORGB565DITHERROW_SVE2 #define HAS_ARGBTORGB565ROW_SVE2 #define HAS_ARGBTOUVJROW_SVE2 #define HAS_ARGBTOUVROW_SVE2 @@ -3653,6 +3654,10 @@ void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, uint8_t* dst_rgb, uint32_t dither4, int width); +void ARGBToRGB565DitherRow_SVE2(const uint8_t* src_argb, + uint8_t* dst_rgb, + uint32_t dither4, + int width); void ARGBToRGB24Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width); void ARGBToRAWRow_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width); void ARGBToRGB565Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width); diff --git a/source/convert_argb.cc b/source/convert_argb.cc index 5c710343d..bc5478184 100644 --- a/source/convert_argb.cc +++ b/source/convert_argb.cc @@ -6088,6 +6088,11 @@ int I420ToRGB565Dither(const uint8_t* src_y, } } #endif +#if defined(HAS_ARGBTORGB565DITHERROW_SVE2) + if (TestCpuFlag(kCpuHasSVE2)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_SVE2; + } +#endif #if defined(HAS_ARGBTORGB565DITHERROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MSA; diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc index daa99505a..ed2aa4496 100644 --- a/source/convert_from_argb.cc +++ b/source/convert_from_argb.cc @@ -1960,6 +1960,11 @@ int ARGBToRGB565Dither(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTORGB565DITHERROW_SVE2) + if (TestCpuFlag(kCpuHasSVE2)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_SVE2; + } +#endif #if defined(HAS_ARGBTORGB565DITHERROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MSA; diff --git a/source/row_sve.cc b/source/row_sve.cc index 
43fc51d70..5b03e5a56 100644
--- a/source/row_sve.cc
+++ b/source/row_sve.cc
@@ -550,6 +550,74 @@ void ARGBToRGB565Row_SVE2(const uint8_t* src_argb,
       : "cc", "memory", "z0", "z1", "z3", "z4", "p0");
 }
 
+// Converts a row of ARGB pixels to RGB565, adding a dither value to every
+// channel byte with unsigned saturation before the pixel is packed.
+//   src_argb: input row, 4 bytes per pixel.
+//   dst_rgb:  output row, 2 bytes per pixel.
+//   dither4:  four 8-bit dither values; pixel x uses byte (x & 3), counting
+//             from the least significant byte.
+//   width:    number of pixels. Whole vectors are processed in the main loop
+//             and any remainder by a predicated tail.
+void ARGBToRGB565DitherRow_SVE2(const uint8_t* src_argb,
+                                uint8_t* dst_rgb,
+                                uint32_t dither4,
+                                int width) {
+  unsigned bsl_mask = 0x7e0;  // Bits 5..10, the green field of RGB565.
+  uint64_t vl;
+  width *= 2;  // Count output bytes (2 per pixel) instead of pixels.
+  // volatile: no output operand is read after the asm, so without it the
+  // compiler is allowed to discard the statement.
+  asm volatile(
+      // z3/z4 are not used by the instructions below; presumably they are the
+      // constants expected by ARGBTORGB565_SVE, which is defined elsewhere.
+      "mov     z3.h, #3                                  \n"
+      "dup     z4.h, %w[bsl_mask]                        \n"
+      // Expand d0 d1 d2 d3 to d0 d0 d1 d1 d2 d2 d3 d3 so that both bytes a
+      // register holds for one pixel receive the same dither value.
+      "dup     z2.s, %w[dither4]                         \n"
+      "zip1    z2.b, z2.b, z2.b                          \n"
+
+      "cntb    %[vl]                                     \n"
+      "subs    %w[width], %w[width], %w[vl]              \n"
+      "b.lt    2f                                        \n"
+
+      // Main loop: one full vector of output bytes per iteration.
+      "ptrue   p0.b                                      \n"
+      "1:                                                \n"
+      "ld2b    {z0.b, z1.b}, p0/z, [%[src]]              \n"  // BR, GA
+      "incb    %[src], all, mul #2                       \n"
+      "uqadd   z0.b, z0.b, z2.b                          \n"
+      "uqadd   z1.b, z1.b, z2.b                          \n"
+      // b.ge below consumes these flags; assumes ARGBTORGB565_SVE leaves the
+      // condition flags untouched.
+      "subs    %w[width], %w[width], %w[vl]              \n" ARGBTORGB565_SVE
+      "st1b    {z1.b}, p0, [%[dst]]                      \n"
+      "incb    %[dst]                                    \n"
+      "b.ge    1b                                        \n"
+
+      // Tail: restore the remaining byte count and finish under a predicate.
+      "2:                                                \n"
+      "adds    %w[width], %w[width], %w[vl]              \n"
+      "b.eq    99f                                       \n"
+
+      "whilelt p0.b, wzr, %w[width]                      \n"
+      "ld2b    {z0.b, z1.b}, p0/z, [%[src]]              \n"  // BR, GA
+      "uqadd   z0.b, z0.b, z2.b                          \n"
+      "uqadd   z1.b, z1.b, z2.b                          \n" ARGBTORGB565_SVE
+      "st1b    {z1.b}, p0, [%[dst]]                      \n"
+
+      "99:                                               \n"
+      : [src] "+r"(src_argb),      // %[src]
+        [dst] "+r"(dst_rgb),       // %[dst]
+        [width] "+r"(width),       // %[width]
+        [vl] "=&r"(vl)             // %[vl]
+      : [bsl_mask] "r"(bsl_mask),  // %[bsl_mask]
+        [dither4] "r"(dither4)     // %[dither4]
+      // z2 holds the dither vector, so it is listed with the other registers
+      // the asm writes.
+      : "cc", "memory", "z0", "z1", "z2", "z3", "z4", "p0");
+}
+
 #endif  // !defined(LIBYUV_DISABLE_SVE) && defined(__aarch64__)
 
 #ifdef __cplusplus