mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-06 08:46:47 +08:00
[AArch64] Add SME implementation of MergeUVRow{,_16}
Mostly just a straightforward copy of the Neon code ported to Streaming-SVE. We can use predication to avoid needing an `Any` kernel and use ST2 to avoid needing a separate ZIP instruction. There is no benefit from this kernel when the SVE vector length is only 128 bits, so skip writing a non-streaming SVE implementation. Change-Id: I5ae36afe699b88f119dc545e49c59c5d85e98742 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6070785 Reviewed-by: Justin Green <greenjustin@google.com> Reviewed-by: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
parent
3e75e41e79
commit
7391559cb4
@ -594,6 +594,8 @@ extern "C" {
|
||||
#define HAS_ARGBMULTIPLYROW_SME
|
||||
#define HAS_I422TOARGBROW_SME
|
||||
#define HAS_I444TOARGBROW_SME
|
||||
#define HAS_MERGEUVROW_16_SME
|
||||
#define HAS_MERGEUVROW_SME
|
||||
#define HAS_MULTIPLYROW_16_SME
|
||||
#endif
|
||||
|
||||
@ -2796,6 +2798,10 @@ void MergeUVRow_NEON(const uint8_t* src_u,
|
||||
const uint8_t* src_v,
|
||||
uint8_t* dst_uv,
|
||||
int width);
|
||||
void MergeUVRow_SME(const uint8_t* src_u,
|
||||
const uint8_t* src_v,
|
||||
uint8_t* dst_uv,
|
||||
int width);
|
||||
void MergeUVRow_MSA(const uint8_t* src_u,
|
||||
const uint8_t* src_v,
|
||||
uint8_t* dst_uv,
|
||||
@ -3339,6 +3345,11 @@ void MergeUVRow_16_Any_NEON(const uint16_t* src_u,
|
||||
uint16_t* dst_uv,
|
||||
int depth,
|
||||
int width);
|
||||
void MergeUVRow_16_SME(const uint16_t* src_u,
|
||||
const uint16_t* src_v,
|
||||
uint16_t* dst_uv,
|
||||
int depth,
|
||||
int width);
|
||||
|
||||
void SplitUVRow_16_C(const uint16_t* src_uv,
|
||||
uint16_t* dst_u,
|
||||
|
||||
@ -746,6 +746,11 @@ int I010ToNV12(const uint16_t* src_y,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_MERGEUVROW_SME)
|
||||
if (TestCpuFlag(kCpuHasSME)) {
|
||||
MergeUVRow = MergeUVRow_SME;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_MERGEUVROW_MSA)
|
||||
if (TestCpuFlag(kCpuHasMSA)) {
|
||||
MergeUVRow = MergeUVRow_Any_MSA;
|
||||
@ -1188,6 +1193,11 @@ int I422ToNV21(const uint8_t* src_y,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_MERGEUVROW_SME)
|
||||
if (TestCpuFlag(kCpuHasSME)) {
|
||||
MergeUVRow = MergeUVRow_SME;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_MERGEUVROW_MSA)
|
||||
if (TestCpuFlag(kCpuHasMSA)) {
|
||||
MergeUVRow = MergeUVRow_Any_MSA;
|
||||
|
||||
@ -486,6 +486,11 @@ int ARGBToNV12(const uint8_t* src_argb,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_MERGEUVROW_SME)
|
||||
if (TestCpuFlag(kCpuHasSME)) {
|
||||
MergeUVRow_ = MergeUVRow_SME;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_MERGEUVROW_MSA)
|
||||
if (TestCpuFlag(kCpuHasMSA)) {
|
||||
MergeUVRow_ = MergeUVRow_Any_MSA;
|
||||
@ -702,6 +707,11 @@ int ARGBToNV21(const uint8_t* src_argb,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_MERGEUVROW_SME)
|
||||
if (TestCpuFlag(kCpuHasSME)) {
|
||||
MergeUVRow_ = MergeUVRow_SME;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_MERGEUVROW_MSA)
|
||||
if (TestCpuFlag(kCpuHasMSA)) {
|
||||
MergeUVRow_ = MergeUVRow_Any_MSA;
|
||||
@ -905,6 +915,11 @@ int ABGRToNV12(const uint8_t* src_abgr,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_MERGEUVROW_SME)
|
||||
if (TestCpuFlag(kCpuHasSME)) {
|
||||
MergeUVRow_ = MergeUVRow_SME;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_MERGEUVROW_MSA)
|
||||
if (TestCpuFlag(kCpuHasMSA)) {
|
||||
MergeUVRow_ = MergeUVRow_Any_MSA;
|
||||
@ -1109,6 +1124,11 @@ int ABGRToNV21(const uint8_t* src_abgr,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_MERGEUVROW_SME)
|
||||
if (TestCpuFlag(kCpuHasSME)) {
|
||||
MergeUVRow_ = MergeUVRow_SME;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_MERGEUVROW_MSA)
|
||||
if (TestCpuFlag(kCpuHasMSA)) {
|
||||
MergeUVRow_ = MergeUVRow_Any_MSA;
|
||||
@ -3522,6 +3542,11 @@ int RAWToJNV21(const uint8_t* src_raw,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_MERGEUVROW_SME)
|
||||
if (TestCpuFlag(kCpuHasSME)) {
|
||||
MergeUVRow_ = MergeUVRow_SME;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_MERGEUVROW_MSA)
|
||||
if (TestCpuFlag(kCpuHasMSA)) {
|
||||
MergeUVRow_ = MergeUVRow_Any_MSA;
|
||||
|
||||
@ -635,6 +635,11 @@ void MergeUVPlane(const uint8_t* src_u,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_MERGEUVROW_SME)
|
||||
if (TestCpuFlag(kCpuHasSME)) {
|
||||
MergeUVRow = MergeUVRow_SME;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_MERGEUVROW_MSA)
|
||||
if (TestCpuFlag(kCpuHasMSA)) {
|
||||
MergeUVRow = MergeUVRow_Any_MSA;
|
||||
@ -774,6 +779,11 @@ void MergeUVPlane_16(const uint16_t* src_u,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_MERGEUVROW_16_SME)
|
||||
if (TestCpuFlag(kCpuHasSME)) {
|
||||
MergeUVRow_16 = MergeUVRow_16_SME;
|
||||
}
|
||||
#endif
|
||||
|
||||
for (y = 0; y < height; ++y) {
|
||||
// Merge a row of U and V into a row of UV.
|
||||
|
||||
@ -312,6 +312,103 @@ __arm_locally_streaming void ARGBMultiplyRow_SME(const uint8_t* src_argb,
|
||||
: "memory", "cc", "z0", "z1", "z2", "p0", "p1");
|
||||
}
|
||||
|
||||
// Interleaves one row of 8-bit U samples and one row of 8-bit V samples into
// a single row of UV pairs (U0 V0 U1 V1 ...).
//
// src_u:  row of U bytes to read; `width` elements are consumed.
// src_v:  row of V bytes to read; `width` elements are consumed.
// dst_uv: destination row; 2 * `width` bytes are written.
// width:  number of elements in each source row.
//
// The main loop handles one full vector of U and one of V per iteration; any
// remainder smaller than a vector is handled by a single predicated pass, so
// no accesses are issued for lanes beyond `width`.
__arm_locally_streaming void MergeUVRow_SME(const uint8_t* src_u,
|
||||
const uint8_t* src_v,
|
||||
uint8_t* dst_uv,
|
||||
int width) {
|
||||
// Streaming-SVE only, no use of ZA tile.
|
||||
// Receives the vector length in bytes (written by CNTB below).
int vl;
|
||||
asm volatile(
|
||||
// vl = number of byte lanes in one vector.
"cntb %x[vl] \n"
|
||||
// Pre-decrement width by one vector; if that goes negative there is not a
// full vector of input, so go straight to the tail at label 2.
"subs %w[width], %w[width], %w[vl] \n"
|
||||
"b.lt 2f \n"
|
||||
|
||||
// Run bulk of computation with an all-true predicate to avoid predicate
|
||||
// generation overhead.
|
||||
"ptrue p0.b \n"
|
||||
"1: \n"
|
||||
// Load one full vector of U into z1 and one of V into z2.
"ld1b {z1.b}, p0/z, [%[src_u]] \n"
|
||||
"ld1b {z2.b}, p0/z, [%[src_v]] \n"
|
||||
// Advance each source pointer by one vector length in bytes.
"incb %[src_u] \n"
|
||||
"incb %[src_v] \n"
|
||||
// Sets the flags consumed by the b.ge at the bottom of the loop.
"subs %w[width], %w[width], %w[vl] \n"
|
||||
// ST2B stores z1 and z2 element-interleaved, producing the UV ordering.
"st2b {z1.b, z2.b}, p0, [%[dst_uv]] \n"
|
||||
// Two vectors were written, so advance the destination by 2 * vl bytes.
"incb %[dst_uv], all, mul #2 \n"
|
||||
"b.ge 1b \n"
|
||||
|
||||
"2: \n"
|
||||
// Undo the pre-decrement: width is now the count of leftover elements
// (0 <= width < vl). Nothing left means we are done.
"adds %w[width], %w[width], %w[vl] \n"
|
||||
"b.eq 99f \n"
|
||||
|
||||
// Calculate a predicate for the final iteration to deal with the tail.
|
||||
// Lanes with index < width are active; the remaining lanes are inactive.
"whilelt p0.b, wzr, %w[width] \n"
|
||||
"ld1b {z1.b}, p0/z, [%[src_u]] \n"
|
||||
"ld1b {z2.b}, p0/z, [%[src_v]] \n"
|
||||
// NOTE(review): neither the updated width nor the flags produced here are
// read by any later instruction in this block.
"subs %w[width], %w[width], %w[vl] \n"
|
||||
// Predicated store: only the active U/V pairs are written.
"st2b {z1.b, z2.b}, p0, [%[dst_uv]] \n"
|
||||
|
||||
"99: \n"
|
||||
: [src_u] "+r"(src_u), // %[src_u]
|
||||
[src_v] "+r"(src_v), // %[src_v]
|
||||
[dst_uv] "+r"(dst_uv), // %[dst_uv]
|
||||
[width] "+r"(width), // %[width]
|
||||
[vl] "=&r"(vl) // %[vl]
|
||||
:
|
||||
// NOTE(review): z0 is declared clobbered although no instruction above
// references it.
: "memory", "cc", "z0", "z1", "z2", "p0");
|
||||
}
|
||||
|
||||
// Interleaves one row of 16-bit U samples and one row of 16-bit V samples into
// a single row of UV pairs (U0 V0 U1 V1 ...), left-shifting every sample by
// (16 - depth) bits on the way through.
//
// src_u:  row of U samples to read; `width` elements are consumed.
// src_v:  row of V samples to read; `width` elements are consumed.
// dst_uv: destination row; 2 * `width` 16-bit elements are written.
// depth:  used only to derive the shift amount (16 - depth); presumably the
//         number of significant bits per input sample - TODO confirm.
// width:  number of elements in each source row.
//
// The main loop handles one full vector of U and one of V per iteration; any
// remainder smaller than a vector is handled by a single predicated pass.
__arm_locally_streaming void MergeUVRow_16_SME(const uint16_t* src_u,
|
||||
const uint16_t* src_v,
|
||||
uint16_t* dst_uv,
|
||||
int depth,
|
||||
int width) {
|
||||
// Left-shift applied to every sample before it is stored.
int shift = 16 - depth;
|
||||
// Streaming-SVE only, no use of ZA tile.
|
||||
// Receives the number of 16-bit lanes per vector (written by CNTH below).
int vl;
|
||||
asm volatile(
|
||||
// vl = number of halfword (16-bit) lanes in one vector.
"cnth %x[vl] \n"
|
||||
// Broadcast the shift amount into every halfword lane of z0.
"mov z0.h, %w[shift] \n"
|
||||
// Pre-decrement width by one vector; if that goes negative there is not a
// full vector of input, so go straight to the tail at label 2.
"subs %w[width], %w[width], %w[vl] \n"
|
||||
"b.lt 2f \n"
|
||||
|
||||
// Run bulk of computation with an all-true predicate to avoid predicate
|
||||
// generation overhead.
|
||||
"ptrue p0.h \n"
|
||||
"1: \n"
|
||||
// Load one full vector of U into z1 and one of V into z2.
"ld1h {z1.h}, p0/z, [%[src_u]] \n"
|
||||
"ld1h {z2.h}, p0/z, [%[src_v]] \n"
|
||||
// INCB adds the vector length in bytes, i.e. vl halfword elements.
"incb %[src_u] \n"
|
||||
"incb %[src_v] \n"
|
||||
// Shift each lane left by the per-lane amount held in z0.
"lsl z1.h, p0/m, z1.h, z0.h \n"
|
||||
"lsl z2.h, p0/m, z2.h, z0.h \n"
|
||||
// Sets the flags consumed by the b.ge at the bottom of the loop.
"subs %w[width], %w[width], %w[vl] \n"
|
||||
// ST2H stores z1 and z2 element-interleaved, producing the UV ordering.
"st2h {z1.h, z2.h}, p0, [%[dst_uv]] \n"
|
||||
// Two vectors were written, so advance the destination by two vector
// lengths in bytes.
"incb %[dst_uv], all, mul #2 \n"
|
||||
"b.ge 1b \n"
|
||||
|
||||
"2: \n"
|
||||
// Undo the pre-decrement: width is now the count of leftover elements
// (0 <= width < vl). Nothing left means we are done.
"adds %w[width], %w[width], %w[vl] \n"
|
||||
"b.eq 99f \n"
|
||||
|
||||
// Calculate a predicate for the final iteration to deal with the tail.
|
||||
// Lanes with index < width are active; the remaining lanes are inactive.
"whilelt p0.h, wzr, %w[width] \n"
|
||||
"ld1h {z1.h}, p0/z, [%[src_u]] \n"
|
||||
"ld1h {z2.h}, p0/z, [%[src_v]] \n"
|
||||
"lsl z1.h, p0/m, z1.h, z0.h \n"
|
||||
"lsl z2.h, p0/m, z2.h, z0.h \n"
|
||||
// NOTE(review): neither the updated width nor the flags produced here are
// read by any later instruction in this block.
"subs %w[width], %w[width], %w[vl] \n"
|
||||
// Predicated store: only the active U/V pairs are written.
"st2h {z1.h, z2.h}, p0, [%[dst_uv]] \n"
|
||||
|
||||
"99: \n"
|
||||
: [src_u] "+r"(src_u), // %[src_u]
|
||||
[src_v] "+r"(src_v), // %[src_v]
|
||||
[dst_uv] "+r"(dst_uv), // %[dst_uv]
|
||||
[width] "+r"(width), // %[width]
|
||||
[vl] "=&r"(vl) // %[vl]
|
||||
: [shift] "r"(shift) // %[shift]
|
||||
: "memory", "cc", "z0", "z1", "z2", "p0");
|
||||
}
|
||||
|
||||
#endif // !defined(LIBYUV_DISABLE_SME) && defined(CLANG_HAS_SME) &&
|
||||
// defined(__aarch64__)
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user