diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 98808b75e..3ec11cb8d 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -553,6 +553,7 @@ extern "C" {
 #define HAS_AYUVTOUVROW_SVE2
 #define HAS_AYUVTOVUROW_SVE2
 #define HAS_BGRATOUVROW_SVE2
+#define HAS_CONVERT8TO8ROW_SVE2
 #define HAS_DIVIDEROW_16_SVE2
 #define HAS_HALFFLOATROW_SVE2
 #define HAS_I210ALPHATOARGBROW_SVE2
@@ -595,6 +596,7 @@ extern "C" {
     defined(__aarch64__)
 #define HAS_ARGBMULTIPLYROW_SME
 #define HAS_CONVERT16TO8ROW_SME
+#define HAS_CONVERT8TO8ROW_SME
 #define HAS_COPYROW_SME
 #define HAS_I210ALPHATOARGBROW_SME
 #define HAS_I210TOAR30ROW_SME
@@ -3658,6 +3660,16 @@ void Convert8To8Row_Any_NEON(const uint8_t* src_ptr,
                              int scale,
                              int bias,
                              int width);
+void Convert8To8Row_SVE2(const uint8_t* src_y,
+                         uint8_t* dst_y,
+                         int scale,
+                         int bias,
+                         int width);
+void Convert8To8Row_SME(const uint8_t* src_y,
+                        uint8_t* dst_y,
+                        int scale,
+                        int bias,
+                        int width);
 void Convert8To8Row_AVX2(const uint8_t* src_y,
                          uint8_t* dst_y,
                          int scale,
diff --git a/include/libyuv/row_sve.h b/include/libyuv/row_sve.h
index b52a38a99..1e10c1e9a 100644
--- a/include/libyuv/row_sve.h
+++ b/include/libyuv/row_sve.h
@@ -1725,6 +1725,73 @@ static inline void I212ToARGBRow_SVE_SC(const uint16_t* src_y,
       : "cc", "memory", YUVTORGB_SVE_REGS);
 }
 
+// One step of the 8-bit scale-and-bias kernel, covering two vectors of bytes.
+// Loads under predicates p0/p1, takes the high byte of each product with the
+// scale lanes in z2 (umulh), adds the bias lanes in z3, and stores under the
+// same predicates. Also advances src and dst by two vector lengths and
+// subtracts two vector lengths from width, leaving the flags from that
+// subtraction for the caller's loop branch.
+#define CONVERT8TO8_SVE \
+  "ld1b {z0.b}, p0/z, [%[src]] \n" \
+  "ld1b {z1.b}, p1/z, [%[src], #1, mul vl] \n" \
+  "incb %[src], all, mul #2 \n" \
+  "subs %w[width], %w[width], %w[vl], lsl #1 \n" \
+  "umulh z0.b, z0.b, z2.b \n" \
+  "umulh z1.b, z1.b, z2.b \n" \
+  "prfm pldl1keep, [%[src], 448] \n" \
+  "add z0.b, z0.b, z3.b \n" \
+  "add z1.b, z1.b, z3.b \n" \
+  "st1b {z0.b}, p0, [%[dst]] \n" \
+  "st1b {z1.b}, p1, [%[dst], #1, mul vl] \n" \
+  "incb %[dst], all, mul #2 \n"
+
+// Scales and biases a row of width bytes from src_y into dst_y.
+// scale and bias are broadcast into byte lanes, so only their low 8 bits take
+// part; the add is the plain (non-saturating) SVE add.
+// Whole pairs of vectors run under all-true predicates; any remainder is
+// handled by one extra step whose predicates are built with whilelt.
+static inline void Convert8To8Row_SVE_SC(const uint8_t* src_y,
+                                         uint8_t* dst_y,
+                                         int scale,
+                                         int bias,
+                                         int width) STREAMING_COMPATIBLE {
+  uint64_t vl;
+  asm volatile(
+      "dup z2.b, %w[scale] \n"
+      "dup z3.b, %w[bias] \n"
+      "cntb %[vl] \n"
+      // Pre-subtract two vectors so the loop can branch on the subs flags.
+      "subs %w[width], %w[width], %w[vl], lsl #1 \n"
+      "b.lt 2f \n"
+
+      // Run bulk of computation with all-true predicates to avoid predicate
+      // generation overhead.
+      "ptrue p0.b \n"
+      "ptrue p1.b \n"
+      "1: \n"  //
+      CONVERT8TO8_SVE
+      "b.ge 1b \n"
+
+      "2: \n"
+      // Undo the pre-subtraction; width is now the count of leftover bytes.
+      "adds %w[width], %w[width], %w[vl], lsl #1 \n"
+      "b.eq 99f \n"
+
+      // Calculate predicates for the final iteration to deal with the tail.
+      "whilelt p0.b, wzr, %w[width] \n"
+      "whilelt p1.b, %w[vl], %w[width] \n"  //
+      CONVERT8TO8_SVE
+
+      "99: \n"
+      : [src] "+r"(src_y),    // %[src]
+        [dst] "+r"(dst_y),    // %[dst]
+        [width] "+r"(width),  // %[width]
+        [vl] "=&r"(vl)        // %[vl]
+      : [scale] "r"(scale),   // %[scale]
+        [bias] "r"(bias)      // %[bias]
+      : "cc", "memory", "z0", "z1", "z2", "z3", "p0", "p1");
+}
+
 #endif  // !defined(LIBYUV_DISABLE_SVE) && defined(__aarch64__)
 
 #ifdef __cplusplus
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index ca0bfea90..c2d4b67a4 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -271,6 +271,16 @@ void Convert8To8Plane(const uint8_t* src_y,
     }
   }
 #endif
+#if defined(HAS_CONVERT8TO8ROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    Convert8To8Row = Convert8To8Row_SVE2;
+  }
+#endif
+#if defined(HAS_CONVERT8TO8ROW_SME)
+  if (TestCpuFlag(kCpuHasSME)) {
+    Convert8To8Row = Convert8To8Row_SME;
+  }
+#endif
 #if defined(HAS_CONVERT8TO8ROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
     Convert8To8Row = Convert8To8Row_Any_AVX2;
diff --git a/source/row_sme.cc b/source/row_sme.cc
index 561b7f73d..1cbc42f3e 100644
--- a/source/row_sme.cc
+++ b/source/row_sme.cc
@@ -1075,6 +1075,15 @@ __arm_locally_streaming void InterpolateRow_16To8_SME(uint8_t* dst_ptr,
       : "cc", "memory", "z0", "z1", "z2", "z3", "z4", "z31", "p0");
 }
 
+// SME entry point: forwards to the shared streaming-compatible kernel.
+__arm_locally_streaming void Convert8To8Row_SME(const uint8_t* src_y,
+                                                uint8_t* dst_y,
+                                                int scale,
+                                                int bias,
+                                                int width) {
+  Convert8To8Row_SVE_SC(src_y, dst_y, scale, bias, width);
+}
+
 #endif  // !defined(LIBYUV_DISABLE_SME) && defined(CLANG_HAS_SME) &&
         // defined(__aarch64__)
 
diff --git a/source/row_sve.cc b/source/row_sve.cc
index 8076c9ebc..0bab8e16f 100644
--- a/source/row_sve.cc
+++ b/source/row_sve.cc
@@ -1237,6 +1237,15 @@ void I212ToARGBRow_SVE2(const uint16_t* src_y,
   I212ToARGBRow_SVE_SC(src_y, src_u, src_v, dst_argb, yuvconstants, width);
 }
 
+// SVE2 entry point: forwards to the shared streaming-compatible kernel.
+void Convert8To8Row_SVE2(const uint8_t* src_y,
+                         uint8_t* dst_y,
+                         int scale,
+                         int bias,
+                         int width) {
+  Convert8To8Row_SVE_SC(src_y, dst_y, scale, bias, width);
+}
+
 #endif  // !defined(LIBYUV_DISABLE_SVE) && defined(__aarch64__)
 
 #ifdef __cplusplus