diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index f1084dd8a..6a7d30c0d 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -580,6 +580,8 @@ extern "C" {
 #define HAS_ARGBTORGB565ROW_SVE2
 #define HAS_ARGBTOUVJROW_SVE2
 #define HAS_ARGBTOUVROW_SVE2
+#define HAS_AYUVTOUVROW_SVE2
+#define HAS_AYUVTOVUROW_SVE2
 #define HAS_BGRATOUVROW_SVE2
 #define HAS_I422ALPHATOARGBROW_SVE2
 #define HAS_I422TOARGBROW_SVE2
@@ -5766,19 +5768,35 @@ void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
                       int src_stride_ayuv,
                       uint8_t* dst_uv,
                       int width);
+void AYUVToUVRow_SVE2(const uint8_t* src_ayuv,
+                      int src_stride_ayuv,
+                      uint8_t* dst_uv,
+                      int width);
 void AYUVToVURow_NEON(const uint8_t* src_ayuv,
                       int src_stride_ayuv,
                       uint8_t* dst_vu,
                       int width);
+void AYUVToVURow_SVE2(const uint8_t* src_ayuv,
+                      int src_stride_ayuv,
+                      uint8_t* dst_vu,
+                      int width);
 void AYUVToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
 void AYUVToUVRow_Any_NEON(const uint8_t* src_ptr,
                           int src_stride,
                           uint8_t* dst_vu,
                           int width);
+void AYUVToUVRow_Any_SVE2(const uint8_t* src_ptr,
+                          int src_stride,
+                          uint8_t* dst_uv,
+                          int width);
 void AYUVToVURow_Any_NEON(const uint8_t* src_ptr,
                           int src_stride,
                           uint8_t* dst_vu,
                           int width);
+void AYUVToVURow_Any_SVE2(const uint8_t* src_ptr,
+                          int src_stride,
+                          uint8_t* dst_vu,
+                          int width);
 
 void I422ToYUY2Row_C(const uint8_t* src_y,
                      const uint8_t* src_u,
diff --git a/source/convert.cc b/source/convert.cc
index fdd0cb644..2d9a24518 100644
--- a/source/convert.cc
+++ b/source/convert.cc
@@ -1784,6 +1784,14 @@ int AYUVToNV12(const uint8_t* src_ayuv,
     }
   }
 #endif
+#if defined(HAS_AYUVTOUVROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    AYUVToUVRow = AYUVToUVRow_Any_SVE2;
+    if (IS_ALIGNED(width, 2)) {
+      AYUVToUVRow = AYUVToUVRow_SVE2;
+    }
+  }
+#endif
 
   for (y = 0; y < height - 1; y += 2) {
     AYUVToUVRow(src_ayuv, src_stride_ayuv, dst_uv, width);
@@ -1853,6 +1861,14 @@ int AYUVToNV21(const uint8_t* src_ayuv,
     }
   }
 #endif
+#if defined(HAS_AYUVTOVUROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    AYUVToVURow = AYUVToVURow_Any_SVE2;
+    if (IS_ALIGNED(width, 2)) {
+      AYUVToVURow = AYUVToVURow_SVE2;
+    }
+  }
+#endif
 
   for (y = 0; y < height - 1; y += 2) {
     AYUVToVURow(src_ayuv, src_stride_ayuv, dst_vu, width);
diff --git a/source/row_any.cc b/source/row_any.cc
index 351aa4a2c..f7a5d838d 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -2446,6 +2446,12 @@ ANY12S(UYVYToUVRow_Any_LASX, UYVYToUVRow_LASX, 1, 4, 31)
 ANY11S(AYUVToUVRow_Any_NEON, AYUVToUVRow_NEON, 0, 4, 15)
 ANY11S(AYUVToVURow_Any_NEON, AYUVToVURow_NEON, 0, 4, 15)
 #endif
+#ifdef HAS_AYUVTOUVROW_SVE2
+ANY11S(AYUVToUVRow_Any_SVE2, AYUVToUVRow_SVE2, 0, 4, 1)
+#endif
+#ifdef HAS_AYUVTOVUROW_SVE2
+ANY11S(AYUVToVURow_Any_SVE2, AYUVToVURow_SVE2, 0, 4, 1)
+#endif
 #undef ANY11S
 
 #define ANYDETILE(NAMEANY, ANY_SIMD, T, BPP, MASK)                           \
diff --git a/source/row_sve.cc b/source/row_sve.cc
index b1d2cfe2f..4ef937d1c 100644
--- a/source/row_sve.cc
+++ b/source/row_sve.cc
@@ -651,6 +651,114 @@ void ARGB1555ToARGBRow_SVE2(const uint8_t* src_argb1555,
       : "cc", "memory", "z0", "z1", "z2", "z3", "z4", "p0", "p1", "p2");
 }
 
+// clang-format off
+// Loop body shared by AYUVToUVRow_SVE2 and AYUVToVURow_SVE2.
+// Each ld2h splits its input into VU halfwords (kept) and YA halfwords
+// (overwritten by the next load or by the first uaddlb). p0 governs the first
+// load of each row and p1 the second. The arguments name the addp operands,
+// so passing U sums first stores UV pairs and passing V sums first stores VU.
+#define AYUVTOUV_SVE(zU0, zV0, zU1, zV1) /* e.g. */ \
+  "ld2h    {z0.h, z1.h}, p0/z, [%[src0]]             \n" /* VUVU.. YAYA.. */ \
+  "ld2h    {z1.h, z2.h}, p1/z, [%[src0], #2, mul vl] \n" /* VUVU.. YAYA.. */ \
+  "ld2h    {z2.h, z3.h}, p0/z, [%[src1]]             \n" /* VUVU.. YAYA.. */ \
+  "ld2h    {z3.h, z4.h}, p1/z, [%[src1], #2, mul vl] \n" /* VUVU.. YAYA.. */ \
+  "incb    %[src0], all, mul #4                      \n" \
+  "incb    %[src1], all, mul #4                      \n" \
+  "uaddlb  z4.h, z0.b, z2.b                          \n" /* V */ \
+  "uaddlt  z5.h, z0.b, z2.b                          \n" /* U */ \
+  "uaddlb  z6.h, z1.b, z3.b                          \n" /* V */ \
+  "uaddlt  z7.h, z1.b, z3.b                          \n" /* U */ \
+  "addp    " #zU0 ".h, p0/m, " #zU0 ".h, " #zV0 ".h  \n" /* UV */ \
+  "addp    " #zU1 ".h, p1/m, " #zU1 ".h, " #zV1 ".h  \n" /* UV */ \
+  "subs    %w[width], %w[width], %w[vl]              \n" \
+  "urshr   " #zU0 ".h, p0/m, " #zU0 ".h, #2          \n" /* U0V0 */ \
+  "urshr   " #zU1 ".h, p1/m, " #zU1 ".h, #2          \n" /* U0V0 */ \
+  "st1b    {" #zU0 ".h}, p0, [%[dst]]                \n" \
+  "st1b    {" #zU1 ".h}, p1, [%[dst], #1, mul vl]    \n" \
+  "incb    %[dst]                                    \n"
+// clang-format on
+
+// Filter 2 rows of AYUV UV's (444) into UV (420).
+// AYUV is VUYA in memory.  UV for NV12 is UV order in memory.
+// width counts source pixels. The main loop handles cntb pixels per pass;
+// a whilelt-predicated pass handles any remainder.
+void AYUVToUVRow_SVE2(const uint8_t* src_ayuv,
+                      int src_stride_ayuv,
+                      uint8_t* dst_uv,
+                      int width) {
+  // Output a row of UV values, filtering 2x2 rows of AYUV.
+  const uint8_t* src_ayuv1 = src_ayuv + src_stride_ayuv;
+  int vl;
+  asm("cntb    %x[vl]                          \n"
+      "subs    %w[width], %w[width], %w[vl]    \n"
+      "b.lt    2f                              \n"
+
+      "ptrue   p0.h                            \n"
+      "ptrue   p1.h                            \n"
+      "1:                                      \n"
+      AYUVTOUV_SVE(z5, z4, z7, z6)
+      "b.ge    1b                              \n"
+
+      "2:                                      \n"
+      "adds    %w[width], %w[width], %w[vl]    \n"
+      "b.eq    99f                             \n"
+
+      "cnth    %x[vl]                          \n"
+      "whilelt p0.h, wzr, %w[width]            \n"  // first ld2h of each row
+      "whilelt p1.h, %w[vl], %w[width]         \n"  // second ld2h of each row
+      AYUVTOUV_SVE(z5, z4, z7, z6)
+
+      "99:                                     \n"
+      : [src0]"+r"(src_ayuv),    // %[src0]
+        [src1]"+r"(src_ayuv1),   // %[src1]
+        [dst]"+r"(dst_uv),       // %[dst]
+        [width]"+r"(width),      // %[width]
+        [vl]"=&r"(vl)            // %[vl]
+      :
+      : "cc", "memory", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "p0",
+        "p1");
+}
+
+// Filter 2 rows of AYUV UV's (444) into VU (420).
+// Same structure as AYUVToUVRow_SVE2; only the addp operand order differs,
+// so V is stored before U.
+void AYUVToVURow_SVE2(const uint8_t* src_ayuv,
+                      int src_stride_ayuv,
+                      uint8_t* dst_vu,
+                      int width) {
+  // Output a row of VU values, filtering 2x2 rows of AYUV.
+  const uint8_t* src_ayuv1 = src_ayuv + src_stride_ayuv;
+  int vl;
+  asm("cntb    %x[vl]                          \n"
+      "subs    %w[width], %w[width], %w[vl]    \n"
+      "b.lt    2f                              \n"
+
+      "ptrue   p0.h                            \n"
+      "ptrue   p1.h                            \n"
+      "1:                                      \n"
+      AYUVTOUV_SVE(z4, z5, z6, z7)
+      "b.ge    1b                              \n"
+
+      "2:                                      \n"
+      "adds    %w[width], %w[width], %w[vl]    \n"
+      "b.eq    99f                             \n"
+
+      "cnth    %x[vl]                          \n"
+      "whilelt p0.h, wzr, %w[width]            \n"  // first ld2h of each row
+      "whilelt p1.h, %w[vl], %w[width]         \n"  // second ld2h of each row
+      AYUVTOUV_SVE(z4, z5, z6, z7)
+
+      "99:                                     \n"
+      : [src0]"+r"(src_ayuv),    // %[src0]
+        [src1]"+r"(src_ayuv1),   // %[src1]
+        [dst]"+r"(dst_vu),       // %[dst]
+        [width]"+r"(width),      // %[width]
+        [vl]"=&r"(vl)            // %[vl]
+      :
+      : "cc", "memory", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "p0",
+        "p1");
+}
+
 #endif  // !defined(LIBYUV_DISABLE_SVE) && defined(__aarch64__)
 
 #ifdef __cplusplus