# Patch summary: add SVE2 and SME variants of the I422ToAR30Row row function.
#
# include/libyuv/row.h: defines HAS_I422TOAR30ROW_SVE2 and
# HAS_I422TOAR30ROW_SME and declares I422ToAR30Row_SVE2 / I422ToAR30Row_SME.
# include/libyuv/row_sve.h: adds the inline-assembly kernel
# I422ToAR30Row_SVE_SC, which both new entry points call.
# source/row_sve.cc and source/row_sme.cc: add the thin wrappers that forward
# their arguments unchanged to I422ToAR30Row_SVE_SC.
# source/convert_argb.cc: in the hunk labelled I420ToAR30Matrix, selects the
# SVE2 wrapper when TestCpuFlag(kCpuHasSVE2) is set and then the SME wrapper
# when TestCpuFlag(kCpuHasSME) is set; the SME check runs last, so it wins
# when both flags are reported.

diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index e83773f11..aa2c69372 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -589,6 +589,7 @@ extern "C" {
 #define HAS_I410TOAR30ROW_SVE2
 #define HAS_I410TOARGBROW_SVE2
 #define HAS_I422ALPHATOARGBROW_SVE2
+#define HAS_I422TOAR30ROW_SVE2
 #define HAS_I422TOARGB1555ROW_SVE2
 #define HAS_I422TOARGB4444ROW_SVE2
 #define HAS_I422TOARGBROW_SVE2
@@ -632,6 +633,7 @@ extern "C" {
 #define HAS_I410TOAR30ROW_SME
 #define HAS_I410TOARGBROW_SME
 #define HAS_I422ALPHATOARGBROW_SME
+#define HAS_I422TOAR30ROW_SME
 #define HAS_I422TOARGB1555ROW_SME
 #define HAS_I422TOARGB4444ROW_SME
 #define HAS_I422TOARGBROW_SME
@@ -1363,6 +1365,18 @@ void I422ToAR30Row_NEON(const uint8_t* src_y,
                         uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width);
+void I422ToAR30Row_SVE2(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
+void I422ToAR30Row_SME(const uint8_t* src_y,
+                       const uint8_t* src_u,
+                       const uint8_t* src_v,
+                       uint8_t* dst_argb,
+                       const struct YuvConstants* yuvconstants,
+                       int width);
 void I444AlphaToARGBRow_NEON(const uint8_t* src_y,
                              const uint8_t* src_u,
                              const uint8_t* src_v,
diff --git a/include/libyuv/row_sve.h b/include/libyuv/row_sve.h
index 5c0c210df..8ae079ce2 100644
--- a/include/libyuv/row_sve.h
+++ b/include/libyuv/row_sve.h
@@ -769,6 +769,53 @@ static inline void I422ToRGBARow_SVE_SC(const uint8_t* src_y,
       : "cc", "memory", YUVTORGB_SVE_REGS);
 }
 
+static inline void I422ToAR30Row_SVE_SC(const uint8_t* src_y,
+                                        const uint8_t* src_u,
+                                        const uint8_t* src_v,
+                                        uint8_t* dst_ar30,
+                                        const struct YuvConstants* yuvconstants,
+                                        int width) STREAMING_COMPATIBLE {
+  uint64_t vl;
+  // The limit is used for saturating the 2.14 red channel in STOREAR30_SVE.
+  const uint16_t limit = 0x3ff0;
+  asm volatile(
+      "cnth %[vl] \n"
+      "ptrue p0.b \n"  //
+      YUVTORGB_SVE_SETUP
+      "dup z19.b, #255 \n"  // Alpha
+      "dup z23.h, %w[limit] \n"
+      "subs %w[width], %w[width], %w[vl] \n"
+      "b.le 2f \n"
+
+      // Run bulk of computation with an all-true predicate to avoid predicate
+      // generation overhead.
+      "ptrue p1.h \n"
+      "1: \n"  //
+      READYUV422_SVE I4XXTORGB_SVE STOREAR30_SVE
+      "subs %w[width], %w[width], %w[vl] \n"
+      "b.gt 1b \n"
+
+      // Calculate a predicate for the final iteration to deal with the tail.
+      "2: \n"
+      "adds %w[width], %w[width], %w[vl] \n"
+      "b.eq 99f \n"
+
+      "whilelt p1.h, wzr, %w[width] \n"  //
+      READYUV422_SVE I4XXTORGB_SVE STOREAR30_SVE
+
+      "99: \n"
+      : [src_y] "+r"(src_y), // %[src_y]
+        [src_u] "+r"(src_u), // %[src_u]
+        [src_v] "+r"(src_v), // %[src_v]
+        [dst_ar30] "+r"(dst_ar30), // %[dst_ar30]
+        [width] "+r"(width), // %[width]
+        [vl] "=&r"(vl) // %[vl]
+      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias]
+        [limit] "r"(limit) // %[limit]
+      : "cc", "memory", YUVTORGB_SVE_REGS);
+}
+
 static inline void I422AlphaToARGBRow_SVE_SC(
     const uint8_t* src_y,
     const uint8_t* src_u,
diff --git a/source/convert_argb.cc b/source/convert_argb.cc
index e9346f3e8..e1f78c07d 100644
--- a/source/convert_argb.cc
+++ b/source/convert_argb.cc
@@ -6710,6 +6710,16 @@ int I420ToAR30Matrix(const uint8_t* src_y,
     }
   }
 #endif
+#if defined(HAS_I422TOAR30ROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    I422ToAR30Row = I422ToAR30Row_SVE2;
+  }
+#endif
+#if defined(HAS_I422TOAR30ROW_SME)
+  if (TestCpuFlag(kCpuHasSME)) {
+    I422ToAR30Row = I422ToAR30Row_SME;
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     I422ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width);
diff --git a/source/row_sme.cc b/source/row_sme.cc
index c6917bf3c..d8f51241f 100644
--- a/source/row_sme.cc
+++ b/source/row_sme.cc
@@ -158,6 +158,16 @@ __arm_locally_streaming void I422ToRGBARow_SME(
   I422ToRGBARow_SVE_SC(src_y, src_u, src_v, dst_argb, yuvconstants, width);
 }
 
+__arm_locally_streaming void I422ToAR30Row_SME(
+    const uint8_t* src_y,
+    const uint8_t* src_u,
+    const uint8_t* src_v,
+    uint8_t* dst_argb,
+    const struct YuvConstants* yuvconstants,
+    int width) {
+  I422ToAR30Row_SVE_SC(src_y, src_u, src_v, dst_argb, yuvconstants, width);
+}
+
 __arm_locally_streaming void I422AlphaToARGBRow_SME(
     const uint8_t* src_y,
     const uint8_t* src_u,
diff --git a/source/row_sve.cc b/source/row_sve.cc
index 474c7950c..a4acb69a4 100644
--- a/source/row_sve.cc
+++ b/source/row_sve.cc
@@ -154,6 +154,15 @@ void I422ToRGBARow_SVE2(const uint8_t* src_y,
   I422ToRGBARow_SVE_SC(src_y, src_u, src_v, dst_argb, yuvconstants, width);
 }
 
+void I422ToAR30Row_SVE2(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width) {
+  I422ToAR30Row_SVE_SC(src_y, src_u, src_v, dst_argb, yuvconstants, width);
+}
+
 void I422AlphaToARGBRow_SVE2(const uint8_t* src_y,
                              const uint8_t* src_u,
                              const uint8_t* src_v,