[AArch64] Add SME implementation of MergeUVRow{,_16}

Mostly just a straightforward copy of the Neon code ported to
Streaming-SVE. We can use predication to avoid needing an `Any` kernel,
and ST2 to avoid needing a separate ZIP instruction.

There is no benefit from this kernel when the SVE vector length is only
128 bits, so skip writing a non-streaming SVE implementation.
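For reference, the operation being vectorised matches the scalar C
kernels (a sketch along the lines of libyuv's MergeUVRow_C and
MergeUVRow_16_C, shown here for clarity rather than copied verbatim):

  void MergeUVRow_C(const uint8_t* src_u, const uint8_t* src_v,
                    uint8_t* dst_uv, int width) {
    for (int x = 0; x < width; ++x) {
      dst_uv[0] = src_u[x];  // interleave U and V bytes: UVUVUV...
      dst_uv[1] = src_v[x];
      dst_uv += 2;
    }
  }

  void MergeUVRow_16_C(const uint16_t* src_u, const uint16_t* src_v,
                       uint16_t* dst_uv, int depth, int width) {
    int shift = 16 - depth;  // scale depth-bit samples up to 16 bits
    for (int x = 0; x < width; ++x) {
      dst_uv[0] = (uint16_t)(src_u[x] << shift);
      dst_uv[1] = (uint16_t)(src_v[x] << shift);
      dst_uv += 2;
    }
  }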

Change-Id: I5ae36afe699b88f119dc545e49c59c5d85e98742
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6070785
Reviewed-by: Justin Green <greenjustin@google.com>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
George Steed 2024-11-16 21:00:36 +00:00 committed by Frank Barchard
parent 3e75e41e79
commit 7391559cb4
5 changed files with 153 additions and 0 deletions

include/libyuv/row.h

@@ -594,6 +594,8 @@ extern "C" {
#define HAS_ARGBMULTIPLYROW_SME
#define HAS_I422TOARGBROW_SME
#define HAS_I444TOARGBROW_SME
#define HAS_MERGEUVROW_16_SME
#define HAS_MERGEUVROW_SME
#define HAS_MULTIPLYROW_16_SME
#endif
@@ -2796,6 +2798,10 @@ void MergeUVRow_NEON(const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_uv,
int width);
void MergeUVRow_SME(const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_uv,
int width);
void MergeUVRow_MSA(const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_uv,
@@ -3339,6 +3345,11 @@ void MergeUVRow_16_Any_NEON(const uint16_t* src_u,
uint16_t* dst_uv,
int depth,
int width);
void MergeUVRow_16_SME(const uint16_t* src_u,
const uint16_t* src_v,
uint16_t* dst_uv,
int depth,
int width);
void SplitUVRow_16_C(const uint16_t* src_uv,
uint16_t* dst_u,

source/convert.cc

@@ -746,6 +746,11 @@ int I010ToNV12(const uint16_t* src_y,
}
}
#endif
#if defined(HAS_MERGEUVROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
MergeUVRow = MergeUVRow_SME;
}
#endif
#if defined(HAS_MERGEUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
MergeUVRow = MergeUVRow_Any_MSA;
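
For context (not shown in the hunk), the surrounding selection chain
follows libyuv's usual pattern: start from the C kernel, let Neon
install an Any wrapper plus an aligned fast path, then let SME override
both, since the predicated kernel handles any width directly. Roughly:

  void (*MergeUVRow)(const uint8_t*, const uint8_t*, uint8_t*, int) =
      MergeUVRow_C;
  #if defined(HAS_MERGEUVROW_NEON)
    if (TestCpuFlag(kCpuHasNEON)) {
      MergeUVRow = MergeUVRow_Any_NEON;  // handles any width
      if (IS_ALIGNED(halfwidth, 16)) {
        MergeUVRow = MergeUVRow_NEON;    // full-vector fast path
      }
    }
  #endif
  #if defined(HAS_MERGEUVROW_SME)
    if (TestCpuFlag(kCpuHasSME)) {
      MergeUVRow = MergeUVRow_SME;       // no Any wrapper needed
    }
  #endif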
@@ -1188,6 +1193,11 @@ int I422ToNV21(const uint8_t* src_y,
}
}
#endif
#if defined(HAS_MERGEUVROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
MergeUVRow = MergeUVRow_SME;
}
#endif
#if defined(HAS_MERGEUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
MergeUVRow = MergeUVRow_Any_MSA;

source/convert_from_argb.cc

@@ -486,6 +486,11 @@ int ARGBToNV12(const uint8_t* src_argb,
}
}
#endif
#if defined(HAS_MERGEUVROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
MergeUVRow_ = MergeUVRow_SME;
}
#endif
#if defined(HAS_MERGEUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
MergeUVRow_ = MergeUVRow_Any_MSA;
@@ -702,6 +707,11 @@ int ARGBToNV21(const uint8_t* src_argb,
}
}
#endif
#if defined(HAS_MERGEUVROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
MergeUVRow_ = MergeUVRow_SME;
}
#endif
#if defined(HAS_MERGEUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
MergeUVRow_ = MergeUVRow_Any_MSA;
@@ -905,6 +915,11 @@ int ABGRToNV12(const uint8_t* src_abgr,
}
}
#endif
#if defined(HAS_MERGEUVROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
MergeUVRow_ = MergeUVRow_SME;
}
#endif
#if defined(HAS_MERGEUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
MergeUVRow_ = MergeUVRow_Any_MSA;
@@ -1109,6 +1124,11 @@ int ABGRToNV21(const uint8_t* src_abgr,
}
}
#endif
#if defined(HAS_MERGEUVROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
MergeUVRow_ = MergeUVRow_SME;
}
#endif
#if defined(HAS_MERGEUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
MergeUVRow_ = MergeUVRow_Any_MSA;
@@ -3522,6 +3542,11 @@ int RAWToJNV21(const uint8_t* src_raw,
}
}
#endif
#if defined(HAS_MERGEUVROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
MergeUVRow_ = MergeUVRow_SME;
}
#endif
#if defined(HAS_MERGEUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
MergeUVRow_ = MergeUVRow_Any_MSA;

source/planar_functions.cc

@@ -635,6 +635,11 @@ void MergeUVPlane(const uint8_t* src_u,
}
}
#endif
#if defined(HAS_MERGEUVROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
MergeUVRow = MergeUVRow_SME;
}
#endif
#if defined(HAS_MERGEUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
MergeUVRow = MergeUVRow_Any_MSA;
@@ -774,6 +779,11 @@ void MergeUVPlane_16(const uint16_t* src_u,
}
}
#endif
#if defined(HAS_MERGEUVROW_16_SME)
if (TestCpuFlag(kCpuHasSME)) {
MergeUVRow_16 = MergeUVRow_16_SME;
}
#endif
for (y = 0; y < height; ++y) {
// Merge a row of U and V into a row of UV.
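
The loop this hunk trails into is unchanged by the patch; it simply
invokes whichever row kernel was selected, once per row. A sketch,
assuming the strides are in elements as elsewhere in this function:

  for (y = 0; y < height; ++y) {
    // Merge a row of U and V into a row of UV.
    MergeUVRow_16(src_u, src_v, dst_uv, depth, width);
    src_u += src_stride_u;
    src_v += src_stride_v;
    dst_uv += dst_stride_uv;
  }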

source/row_sme.cc

@@ -312,6 +312,103 @@ __arm_locally_streaming void ARGBMultiplyRow_SME(const uint8_t* src_argb,
: "memory", "cc", "z0", "z1", "z2", "p0", "p1");
}
__arm_locally_streaming void MergeUVRow_SME(const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_uv,
int width) {
// Streaming-SVE only, no use of ZA tile.
int vl;
asm volatile(
"cntb %x[vl] \n"
"subs %w[width], %w[width], %w[vl] \n"
"b.lt 2f \n"
// Run bulk of computation with an all-true predicate to avoid predicate
// generation overhead.
"ptrue p0.b \n"
"1: \n"
"ld1b {z1.b}, p0/z, [%[src_u]] \n"
"ld1b {z2.b}, p0/z, [%[src_v]] \n"
"incb %[src_u] \n"
"incb %[src_v] \n"
"subs %w[width], %w[width], %w[vl] \n"
"st2b {z1.b, z2.b}, p0, [%[dst_uv]] \n"
"incb %[dst_uv], all, mul #2 \n"
"b.ge 1b \n"
"2: \n"
"adds %w[width], %w[width], %w[vl] \n"
"b.eq 99f \n"
// Calculate a predicate for the final iteration to deal with the tail.
"whilelt p0.b, wzr, %w[width] \n"
"ld1b {z1.b}, p0/z, [%[src_u]] \n"
"ld1b {z2.b}, p0/z, [%[src_v]] \n"
"subs %w[width], %w[width], %w[vl] \n"
"st2b {z1.b, z2.b}, p0, [%[dst_uv]] \n"
"99: \n"
: [src_u] "+r"(src_u), // %[src_u]
[src_v] "+r"(src_v), // %[src_v]
[dst_uv] "+r"(dst_uv), // %[dst_uv]
[width] "+r"(width), // %[width]
[vl] "=&r"(vl) // %[vl]
:
: "memory", "cc", "z0", "z1", "z2", "p0");
}
__arm_locally_streaming void MergeUVRow_16_SME(const uint16_t* src_u,
const uint16_t* src_v,
uint16_t* dst_uv,
int depth,
int width) {
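// Left-shift to scale depth-bit samples up to the full 16-bit range,
// e.g. depth=10 gives shift=6 so samples occupy the top ten bits.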
int shift = 16 - depth;
// Streaming-SVE only, no use of ZA tile.
int vl;
asm volatile(
"cnth %x[vl] \n"
"mov z0.h, %w[shift] \n"
"subs %w[width], %w[width], %w[vl] \n"
"b.lt 2f \n"
// Run bulk of computation with an all-true predicate to avoid predicate
// generation overhead.
"ptrue p0.h \n"
"1: \n"
"ld1h {z1.h}, p0/z, [%[src_u]] \n"
"ld1h {z2.h}, p0/z, [%[src_v]] \n"
"incb %[src_u] \n"
"incb %[src_v] \n"
"lsl z1.h, p0/m, z1.h, z0.h \n"
"lsl z2.h, p0/m, z2.h, z0.h \n"
"subs %w[width], %w[width], %w[vl] \n"
"st2h {z1.h, z2.h}, p0, [%[dst_uv]] \n"
"incb %[dst_uv], all, mul #2 \n"
"b.ge 1b \n"
"2: \n"
"adds %w[width], %w[width], %w[vl] \n"
"b.eq 99f \n"
// Calculate a predicate for the final iteration to deal with the tail.
"whilelt p0.h, wzr, %w[width] \n"
"ld1h {z1.h}, p0/z, [%[src_u]] \n"
"ld1h {z2.h}, p0/z, [%[src_v]] \n"
"lsl z1.h, p0/m, z1.h, z0.h \n"
"lsl z2.h, p0/m, z2.h, z0.h \n"
"subs %w[width], %w[width], %w[vl] \n"
"st2h {z1.h, z2.h}, p0, [%[dst_uv]] \n"
"99: \n"
: [src_u] "+r"(src_u), // %[src_u]
[src_v] "+r"(src_v), // %[src_v]
[dst_uv] "+r"(dst_uv), // %[dst_uv]
[width] "+r"(width), // %[width]
[vl] "=&r"(vl) // %[vl]
: [shift] "r"(shift) // %[shift]
: "memory", "cc", "z0", "z1", "z2", "p0");
}
#endif // !defined(LIBYUV_DISABLE_SME) && defined(CLANG_HAS_SME) &&
// defined(__aarch64__)
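
For readers more familiar with ACLE intrinsics than with SVE assembly,
here is a hypothetical equivalent of the 8-bit kernel above (a sketch,
not part of this patch): bulk iterations run under an all-true
predicate, and a single whilelt-generated predicate covers the tail,
which is what makes a separate Any kernel unnecessary.

  #include <arm_sve.h>

  static void MergeUVRow_SVE_Sketch(const uint8_t* src_u,
                                    const uint8_t* src_v,
                                    uint8_t* dst_uv, int width) {
    int vl = (int)svcntb();      // vector length in bytes
    svbool_t pg = svptrue_b8();  // all-true predicate for the bulk loop
    int i = 0;
    for (; i + vl <= width; i += vl) {
      svuint8_t u = svld1_u8(pg, src_u + i);
      svuint8_t v = svld1_u8(pg, src_v + i);
      // ST2-style interleaving store: UVUVUV...
      svst2_u8(pg, dst_uv + 2 * i, svcreate2_u8(u, v));
    }
    if (i < width) {  // predicated tail handles the remainder
      pg = svwhilelt_b8_s32(i, width);
      svuint8_t u = svld1_u8(pg, src_u + i);
      svuint8_t v = svld1_u8(pg, src_v + i);
      svst2_u8(pg, dst_uv + 2 * i, svcreate2_u8(u, v));
    }
  }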