[AArch64] Add SVE2 implementations for AYUVTo{UV,VU}Row

These kernels are mostly identical to each other except for the order of
the results, so we can use a single macro to parameterize the pairwise
addition and use the same macro for both implementations, just with the
register order flipped.

Similar to other 2x2 kernels the implementation here differs slightly
for the last element if the problem size is odd, so use an "any" kernel
to avoid needing to handle this in the common code path.

Observed reduction in runtime compared to the existing Neon code:

            | AYUVToUVRow | AYUVToVURow
Cortex-A510 |      -33.1% |      -33.0%
Cortex-A720 |      -25.1% |      -25.1%
  Cortex-X2 |      -59.5% |      -53.9%
  Cortex-X4 |      -39.2% |      -39.4%

Bug: libyuv:973
Change-Id: I957db9ea31c8830535c243175790db0ff2a3ccae
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5522316
Reviewed-by: Justin Green <greenjustin@google.com>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
George Steed 2024-04-22 12:02:39 +01:00 committed by Frank Barchard
parent d0da5a3298
commit 004352ba16
4 changed files with 140 additions and 0 deletions

View File

@ -580,6 +580,8 @@ extern "C" {
#define HAS_ARGBTORGB565ROW_SVE2
#define HAS_ARGBTOUVJROW_SVE2
#define HAS_ARGBTOUVROW_SVE2
#define HAS_AYUVTOUVROW_SVE2
#define HAS_AYUVTOVUROW_SVE2
#define HAS_BGRATOUVROW_SVE2
#define HAS_I422ALPHATOARGBROW_SVE2
#define HAS_I422TOARGBROW_SVE2
@ -5766,19 +5768,35 @@ void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
int src_stride_ayuv,
uint8_t* dst_uv,
int width);
void AYUVToUVRow_SVE2(const uint8_t* src_ayuv,
int src_stride_ayuv,
uint8_t* dst_uv,
int width);
void AYUVToVURow_NEON(const uint8_t* src_ayuv,
int src_stride_ayuv,
uint8_t* dst_vu,
int width);
void AYUVToVURow_SVE2(const uint8_t* src_ayuv,
int src_stride_ayuv,
uint8_t* dst_vu,
int width);
void AYUVToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void AYUVToUVRow_Any_NEON(const uint8_t* src_ptr,
int src_stride,
uint8_t* dst_vu,
int width);
void AYUVToUVRow_Any_SVE2(const uint8_t* src_ptr,
int src_stride,
uint8_t* dst_vu,
int width);
void AYUVToVURow_Any_NEON(const uint8_t* src_ptr,
int src_stride,
uint8_t* dst_vu,
int width);
void AYUVToVURow_Any_SVE2(const uint8_t* src_ptr,
int src_stride,
uint8_t* dst_vu,
int width);
void I422ToYUY2Row_C(const uint8_t* src_y,
const uint8_t* src_u,

View File

@ -1784,6 +1784,14 @@ int AYUVToNV12(const uint8_t* src_ayuv,
}
}
#endif
#if defined(HAS_AYUVTOUVROW_SVE2)
if (TestCpuFlag(kCpuHasSVE2)) {
AYUVToUVRow = AYUVToUVRow_Any_SVE2;
if (IS_ALIGNED(width, 2)) {
AYUVToUVRow = AYUVToUVRow_SVE2;
}
}
#endif
for (y = 0; y < height - 1; y += 2) {
AYUVToUVRow(src_ayuv, src_stride_ayuv, dst_uv, width);
@ -1853,6 +1861,14 @@ int AYUVToNV21(const uint8_t* src_ayuv,
}
}
#endif
#if defined(HAS_AYUVTOVUROW_SVE2)
if (TestCpuFlag(kCpuHasSVE2)) {
AYUVToVURow = AYUVToVURow_Any_SVE2;
if (IS_ALIGNED(width, 2)) {
AYUVToVURow = AYUVToVURow_SVE2;
}
}
#endif
for (y = 0; y < height - 1; y += 2) {
AYUVToVURow(src_ayuv, src_stride_ayuv, dst_vu, width);

View File

@ -2446,6 +2446,12 @@ ANY12S(UYVYToUVRow_Any_LASX, UYVYToUVRow_LASX, 1, 4, 31)
ANY11S(AYUVToUVRow_Any_NEON, AYUVToUVRow_NEON, 0, 4, 15)
ANY11S(AYUVToVURow_Any_NEON, AYUVToVURow_NEON, 0, 4, 15)
#endif
#ifdef HAS_AYUVTOUVROW_SVE2
// MASK of 1: the SVE2 kernels handle arbitrary vector lengths internally,
// but (like the other 2x2 subsampling kernels) the last element differs
// when the width is odd, so odd widths are routed through the "any"
// wrapper rather than special-cased in the common SVE2 code path.
ANY11S(AYUVToUVRow_Any_SVE2, AYUVToUVRow_SVE2, 0, 4, 1)
#endif
#ifdef HAS_AYUVTOVUROW_SVE2
ANY11S(AYUVToVURow_Any_SVE2, AYUVToVURow_SVE2, 0, 4, 1)
#endif
#undef ANY11S
#define ANYDETILE(NAMEANY, ANY_SIMD, T, BPP, MASK) \

View File

@ -651,6 +651,106 @@ void ARGB1555ToARGBRow_SVE2(const uint8_t* src_argb1555,
: "cc", "memory", "z0", "z1", "z2", "z3", "z4", "p0", "p1", "p2");
}
// clang-format off
// Shared body for the SVE2 2x2 AYUV -> 420 chroma kernels. AYUV pixels are
// stored as VUYA bytes, so ld2h deinterleaves each pixel into a VU 16-bit
// lane (one destination register) and a YA 16-bit lane (the other). The YA
// registers are never read, so consecutive loads deliberately overlap: each
// load's junk YA register becomes the next load's VU destination, leaving
// z0-z3 holding the VU data for both rows after the four loads.
// uaddlb/uaddlt then sum the V (even) and U (odd) bytes across the two
// rows, and SVE ADDP interleaves the pairwise sums of its two operands, so
// the zU*/zV* parameters select the output byte order: passing the U sums
// as the first (destructive) operand yields UV output, the V sums first
// yields VU. urshr #2 turns each 2x2 sum into a rounded average.
#define AYUVTOUV_SVE(zU0, zV0, zU1, zV1) \
"ld2h {z0.h, z1.h}, p0/z, [%[src0]] \n" /* VUVU.. YAYA.. */ \
"ld2h {z1.h, z2.h}, p1/z, [%[src0], #2, mul vl] \n" /* VUVU.. YAYA.. */ \
"ld2h {z2.h, z3.h}, p0/z, [%[src1]] \n" /* VUVU.. YAYA.. */ \
"ld2h {z3.h, z4.h}, p1/z, [%[src1], #2, mul vl] \n" /* VUVU.. YAYA.. */ \
"incb %[src0], all, mul #4 \n" \
"incb %[src1], all, mul #4 \n" \
"uaddlb z4.h, z0.b, z2.b \n" /* V */ \
"uaddlt z5.h, z0.b, z2.b \n" /* U */ \
"uaddlb z6.h, z1.b, z3.b \n" /* V */ \
"uaddlt z7.h, z1.b, z3.b \n" /* U */ \
"addp " #zU0 ".h, p0/m, " #zU0 ".h, " #zV0 ".h \n" /* UV (or VU) */ \
"addp " #zU1 ".h, p1/m, " #zU1 ".h, " #zV1 ".h \n" /* UV (or VU) */ \
"subs %w[width], %w[width], %w[vl] \n" \
"urshr " #zU0 ".h, p0/m, " #zU0 ".h, #2 \n" /* rounded average */ \
"urshr " #zU1 ".h, p1/m, " #zU1 ".h, #2 \n" /* rounded average */ \
"st1b {" #zU0 ".h}, p0, [%[dst]] \n" \
"st1b {" #zU1 ".h}, p1, [%[dst], #1, mul vl] \n" \
"incb %[dst] \n"
// clang-format on
// Filter 2 rows of AYUV UV's (444) into UV (420).
// AYUV is VUYA in memory. UV for NV12 is UV order in memory.
// Averages each 2x2 block of chroma samples with rounding. width is in
// pixels; odd widths are handled via AYUVToUVRow_Any_SVE2.
void AYUVToUVRow_SVE2(const uint8_t* src_ayuv,
int src_stride_ayuv,
uint8_t* dst_uv,
int width) {
// Output a row of UV values, filtering 2x2 rows of AYUV.
const uint8_t* src_ayuv1 = src_ayuv + src_stride_ayuv;
int vl;
// vl first holds cntb: each loop iteration consumes four vectors of
// 4-byte AYUV pixels per row, i.e. cntb pixels of width.
asm("cntb %x[vl] \n"
"subs %w[width], %w[width], %w[vl] \n"
"b.lt 2f \n"
// Main loop runs on full vectors with all-true predicates.
"ptrue p0.h \n"
"ptrue p1.h \n"
"1: \n"
// U sums (z5/z7) passed first so ADDP emits UV byte order.
AYUVTOUV_SVE(z5, z4, z7, z6)
"b.ge 1b \n"
"2: \n"
// Tail: restore the remaining pixel count; if non-zero, run one final
// partial iteration under whilelt-governed predicates. Each .h predicate
// element covers one 4-byte pixel, so p0 covers the first cnth pixels of
// both rows and p1 the rest.
"adds %w[width], %w[width], %w[vl] \n"
"b.eq 99f \n"
"cnth %x[vl] \n"
"whilelt p0.h, wzr, %w[width] \n" // first cnth pixels
"whilelt p1.h, %w[vl], %w[width] \n" // remaining pixels
AYUVTOUV_SVE(z5, z4, z7, z6)
"99: \n"
: [src0]"+r"(src_ayuv), // %[src0]
[src1]"+r"(src_ayuv1), // %[src1]
[dst]"+r"(dst_uv), // %[dst]
[width]"+r"(width), // %[width]
[vl]"=&r"(vl) // %[vl]
:
: "cc", "memory", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "p0",
"p1");
}
// Filter 2 rows of AYUV UV's (444) into VU (420).
// AYUV is VUYA in memory. VU for NV21 is VU order in memory.
// Identical structure to AYUVToUVRow_SVE2; only the AYUVTOUV_SVE register
// order is flipped so the V sums are the first ADDP operand, producing VU
// rather than UV byte order. width is in pixels; odd widths are handled
// via AYUVToVURow_Any_SVE2.
void AYUVToVURow_SVE2(const uint8_t* src_ayuv,
                      int src_stride_ayuv,
                      uint8_t* dst_vu,
                      int width) {
  // Output a row of VU values, filtering 2x2 rows of AYUV.
  const uint8_t* src_ayuv1 = src_ayuv + src_stride_ayuv;
  int vl;
  // vl first holds cntb: each loop iteration consumes cntb pixels.
  // Note: the "subs" below both decrements width and sets the flags for
  // b.lt, so no separate "cmp" against vl is needed (the UV twin has none).
  asm("cntb %x[vl] \n"
      "subs %w[width], %w[width], %w[vl] \n"
      "b.lt 2f \n"
      // Main loop runs on full vectors with all-true predicates.
      "ptrue p0.h \n"
      "ptrue p1.h \n"
      "1: \n"
      // V sums (z4/z6) passed first so ADDP emits VU byte order.
      AYUVTOUV_SVE(z4, z5, z6, z7)
      "b.ge 1b \n"
      "2: \n"
      // Tail: restore the remaining pixel count; if non-zero, run one final
      // partial iteration under whilelt-governed predicates.
      "adds %w[width], %w[width], %w[vl] \n"
      "b.eq 99f \n"
      "cnth %x[vl] \n"
      "whilelt p0.h, wzr, %w[width] \n"   // first cnth pixels
      "whilelt p1.h, %w[vl], %w[width] \n"  // remaining pixels
      AYUVTOUV_SVE(z4, z5, z6, z7)
      "99: \n"
      : [src0]"+r"(src_ayuv),   // %[src0]
        [src1]"+r"(src_ayuv1),  // %[src1]
        [dst]"+r"(dst_vu),      // %[dst]
        [width]"+r"(width),     // %[width]
        [vl]"=&r"(vl)           // %[vl]
      :
      : "cc", "memory", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "p0",
        "p1");
}
#endif // !defined(LIBYUV_DISABLE_SVE) && defined(__aarch64__)
#ifdef __cplusplus