diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 86b024321..8e3c6cb0d 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -600,6 +600,8 @@ extern "C" {
 #define HAS_I444TOARGBROW_SVE2
 #define HAS_NV12TOARGBROW_SVE2
 #define HAS_NV21TOARGBROW_SVE2
+#define HAS_RAWTOARGBROW_SVE2
+#define HAS_RAWTORGBAROW_SVE2
 #define HAS_RGBATOUVROW_SVE2
 #define HAS_UYVYTOARGBROW_SVE2
 #define HAS_YUY2TOARGBROW_SVE2
@@ -3510,7 +3512,9 @@ void RGB24ToARGBRow_LASX(const uint8_t* src_rgb24,
                          int width);
 void RGB24ToARGBRow_RVV(const uint8_t* src_rgb24, uint8_t* dst_argb, int width);
 void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width);
+void RAWToARGBRow_SVE2(const uint8_t* src_raw, uint8_t* dst_argb, int width);
 void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width);
+void RAWToRGBARow_SVE2(const uint8_t* src_raw, uint8_t* dst_rgba, int width);
 void RAWToARGBRow_MSA(const uint8_t* src_raw, uint8_t* dst_argb, int width);
 void RAWToARGBRow_LSX(const uint8_t* src_raw, uint8_t* dst_argb, int width);
 void RAWToARGBRow_LASX(const uint8_t* src_raw, uint8_t* dst_argb, int width);
diff --git a/source/convert_argb.cc b/source/convert_argb.cc
index 11948726a..73f004b81 100644
--- a/source/convert_argb.cc
+++ b/source/convert_argb.cc
@@ -3555,6 +3555,11 @@ int RAWToARGB(const uint8_t* src_raw,
     }
   }
 #endif
+#if defined(HAS_RAWTOARGBROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    RAWToARGBRow = RAWToARGBRow_SVE2;
+  }
+#endif
 #if defined(HAS_RAWTOARGBROW_MSA)
   if (TestCpuFlag(kCpuHasMSA)) {
     RAWToARGBRow = RAWToARGBRow_Any_MSA;
@@ -3635,6 +3640,11 @@ int RAWToRGBA(const uint8_t* src_raw,
     }
   }
 #endif
+#if defined(HAS_RAWTORGBAROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    RAWToRGBARow = RAWToRGBARow_SVE2;
+  }
+#endif
 #if defined(HAS_RAWTORGBAROW_RVV)
   if (TestCpuFlag(kCpuHasRVV)) {
     RAWToRGBARow = RAWToRGBARow_RVV;
diff --git a/source/row_sve.cc b/source/row_sve.cc
index 89a86d53b..66e1d17df 100644
--- a/source/row_sve.cc
+++ b/source/row_sve.cc
@@ -1113,6 +1113,73 @@ void UYVYToARGBRow_SVE2(const uint8_t* src_uyvy,
       : "cc", "memory", YUVTORGB_SVE_REGS, "p2");
 }
 
+static inline void RAWToWXYZRow_SVE2(const uint8_t* src_raw,
+                                     uint8_t* dst_wxyz,
+                                     int width,
+                                     uint32_t idx_start,
+                                     uint32_t idx_step,
+                                     uint32_t alpha) {
+  uint32_t vl;
+  asm("cntw %x0" : "=r"(vl));
+  uint32_t vl_mul3 = vl * 3;
+  asm volatile(
+      "index    z31.s, %w[idx_start], %w[idx_step]  \n"
+      "dup      z30.s, %w[alpha]                    \n"
+      "subs     %w[width], %w[width], %w[vl]        \n"
+      "b.lt     2f                                  \n"
+
+      // Run bulk of computation with the same predicates to avoid predicate
+      // generation overhead.  We set up p1 to only load 3/4 of a vector.
+      "ptrue    p0.s                                \n"
+      "whilelt  p1.b, wzr, %w[vl_mul3]              \n"
+      "1:                                           \n"
+      "ld1b     {z0.b}, p1/z, [%[src]]              \n"
+      "add      %[src], %[src], %x[vl_mul3]         \n"
+      "tbl      z0.b, {z0.b}, z31.b                 \n"
+      "subs     %w[width], %w[width], %w[vl]        \n"
+      "orr      z0.d, z0.d, z30.d                   \n"
+      "st1w     {z0.s}, p0, [%[dst]]                \n"
+      "incb     %[dst]                              \n"
+      "b.ge     1b                                  \n"
+
+      "2:                                           \n"
+      "adds     %w[width], %w[width], %w[vl]        \n"
+      "b.eq     99f                                 \n"
+
+      // Calculate a pair of predicates for the final iteration to deal with
+      // the tail.
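+      // p0 covers the width remaining word stores, p1 covers the width * 3
+      // remaining bytes of RAW input; vl_mul3 is reused as scratch here.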
+ "add %w[vl_mul3], %w[width], %w[width], lsl #1 \n" + "whilelt p0.s, wzr, %w[width] \n" + "whilelt p1.b, wzr, %w[vl_mul3] \n" + "ld1b {z0.b}, p1/z, [%[src]] \n" + "tbl z0.b, {z0.b}, z31.b \n" + "orr z0.d, z0.d, z30.d \n" + "st1w {z0.s}, p0, [%[dst]] \n" + + "99: \n" + : [src] "+r"(src_raw), // %[src] + [dst] "+r"(dst_wxyz), // %[dst] + [width] "+r"(width), // %[width] + [vl_mul3] "+r"(vl_mul3) // %[vl_mul3] + : [idx_start] "r"(idx_start), // %[idx_start] + [idx_step] "r"(idx_step), // %[idx_step] + [alpha] "r"(alpha), // %[alpha] + [vl] "r"(vl) // %[vl] + : "cc", "memory", "z0", "z30", "z31", "p0", "p1"); +} + +void RAWToARGBRow_SVE2(const uint8_t* src_raw, uint8_t* dst_argb, int width) { + RAWToWXYZRow_SVE2(src_raw, dst_argb, width, 0xff000102U, 0x00030303U, + 0xff000000U); +} + +void RAWToRGBARow_SVE2(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { + RAWToWXYZRow_SVE2(src_raw, dst_rgba, width, 0x000102ffU, 0x03030300U, + 0x000000ffU); +} + #endif // !defined(LIBYUV_DISABLE_SVE) && defined(__aarch64__) #ifdef __cplusplus