mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-06 08:46:47 +08:00
[AArch64] Add SME implementation of TransposeUVWxH
We can use the ZA tile register to transpose and de-interleave the UV components without any explicit permute instructions: the tile is loaded horizontally, which places the U and V components into alternating columns, and the independent components are then simply stored vertically. Change-Id: I67bd82dc840a43888290be1c9db8a3c05f16d730 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5703588 Reviewed-by: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
parent
15ecca81f7
commit
b5f9d7cb76
@ -76,6 +76,7 @@ extern "C" {
|
||||
|
||||
// Advertise the SME transpose kernels only on AArch64 builds where
// LIBYUV_DISABLE_SME has not been defined.
#if !defined(LIBYUV_DISABLE_SME) && defined(__aarch64__)
|
||||
#define HAS_TRANSPOSEWXH_SME
|
||||
#define HAS_TRANSPOSEUVWXH_SME
|
||||
#endif
|
||||
|
||||
#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
|
||||
@ -210,6 +211,14 @@ void TransposeUVWx8_NEON(const uint8_t* src,
|
||||
uint8_t* dst_b,
|
||||
int dst_stride_b,
|
||||
int width);
|
||||
// SME version of the UV transpose: reads `height` rows of `width` interleaved
// two-byte pairs from src and writes the transposed first bytes to dst_a and
// the transposed second bytes to dst_b. Strides are in bytes.
void TransposeUVWxH_SME(const uint8_t* src,
|
||||
int src_stride,
|
||||
uint8_t* dst_a,
|
||||
int dst_stride_a,
|
||||
uint8_t* dst_b,
|
||||
int dst_stride_b,
|
||||
int width,
|
||||
int height);
|
||||
void TransposeUVWx16_MSA(const uint8_t* src,
|
||||
int src_stride,
|
||||
uint8_t* dst_a,
|
||||
|
||||
@ -273,6 +273,11 @@ void SplitTransposeUV(const uint8_t* src,
|
||||
int width,
|
||||
int height) {
|
||||
int i = height;
|
||||
#if defined(HAS_TRANSPOSEUVWXH_SME)
|
||||
void (*TransposeUVWxH)(const uint8_t* src, int src_stride, uint8_t* dst_a,
|
||||
int dst_stride_a, uint8_t* dst_b, int dst_stride_b,
|
||||
int width, int height) = TransposeUVWxH_C;
|
||||
#endif
|
||||
#if defined(HAS_TRANSPOSEUVWX16_MSA)
|
||||
void (*TransposeUVWx16)(const uint8_t* src, int src_stride, uint8_t* dst_a,
|
||||
int dst_stride_a, uint8_t* dst_b, int dst_stride_b,
|
||||
@ -310,6 +315,11 @@ void SplitTransposeUV(const uint8_t* src,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_TRANSPOSEUVWXH_SME)
|
||||
if (TestCpuFlag(kCpuHasSME)) {
|
||||
TransposeUVWxH = TransposeUVWxH_SME;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_TRANSPOSEUVWX8_SSE2)
|
||||
if (TestCpuFlag(kCpuHasSSE2)) {
|
||||
TransposeUVWx8 = TransposeUVWx8_Any_SSE2;
|
||||
@ -320,6 +330,13 @@ void SplitTransposeUV(const uint8_t* src,
|
||||
#endif
|
||||
#endif /* defined(HAS_TRANSPOSEUVWX16_MSA) */
|
||||
|
||||
#if defined(HAS_TRANSPOSEUVWXH_SME)
|
||||
if (TestCpuFlag(kCpuHasSME)) {
|
||||
TransposeUVWxH(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,
|
||||
width, i);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_TRANSPOSEUVWX16_MSA)
|
||||
// Work through the source in 8x8 tiles.
|
||||
while (i >= 16) {
|
||||
|
||||
@ -87,6 +87,83 @@ __arm_locally_streaming __arm_new("za") void TransposeWxH_SME(
|
||||
} while (height > 0);
|
||||
}
|
||||
|
||||
// Transposes a plane of interleaved two-byte (UV) pairs using the SME ZA tile,
// writing the first byte of every pair to dst_a and the second to dst_b.
//
// Each source row is loaded into one horizontal slice of the ZA0 byte tile,
// which leaves the two components in alternating tile columns. Storing the
// even vertical slices to dst_a and the odd vertical slices to dst_b then
// performs both the transpose and the de-interleave without any permute
// instructions.
//
//   src           first byte of the source; each row holds `width` pairs.
//   src_stride    distance in bytes between consecutive source rows.
//   dst_a, dst_b  destinations; each receives `width` rows of `height` bytes.
//   dst_stride_a, dst_stride_b
//                 distance in bytes between consecutive destination rows.
//   width         number of two-byte pairs per source row.
//   height        number of source rows.
//
// A non-positive width or height is treated as an empty plane: nothing is
// read or written.
__arm_locally_streaming __arm_new("za") void TransposeUVWxH_SME(
    const uint8_t* src,
    int src_stride,
    uint8_t* dst_a,
    int dst_stride_a,
    uint8_t* dst_b,
    int dst_stride_b,
    int width,
    int height) {
  // Every loop below is bottom-tested and the assembly loops only exit on an
  // exact "b.ne" match against the block size, so an empty dimension would
  // make them run far past the end of the buffers. Bail out first.
  if (width <= 0 || height <= 0) {
    return;
  }

  // Number of 16-bit elements in one streaming vector, so 2 * vl is the
  // vector length in bytes.
  int vl;
  asm("cnth %x0" : "=r"(vl));

  do {
    const uint8_t* src2 = src;
    uint8_t* dst2_a = dst_a;
    uint8_t* dst2_b = dst_b;

    // Each pass of this outer loop consumes up to 2 * vl source rows, one
    // row per horizontal byte slice of ZA0.
    int block_height = height > vl * 2 ? vl * 2 : height;

    int width2 = width;
    do {
      const uint8_t* src3 = src2;

      // Process up to VL 16-bit elements (UV pairs) per iteration of the
      // inner loop.
      int block_width = width2 > vl ? vl : width2;

      asm volatile(
          "mov     w12, #0                                    \n"

          // Create a predicate to handle loading partial rows,
          // %[block_width] is always a multiple of two here.
          "whilelt p0.b, wzr, %w[block_width]                 \n"

          // Load H <= VL rows into ZA0, such that U/V components exist in
          // alternating columns.
          "1:                                                 \n"
          "ld1b    {za0h.b[w12, 0]}, p0/z, [%[src]]           \n"
          "add     %[src], %[src], %[src_stride]              \n"
          "add     w12, w12, #1                               \n"
          "cmp     w12, %w[block_height]                      \n"
          "b.ne    1b                                         \n"

          // Create a predicate to handle storing partial columns.
          "whilelt p0.b, wzr, %w[block_height]                \n"
          "mov     w12, #0                                    \n"

          // Store alternating UV data from pairs of ZA0 columns.
          "2:                                                 \n"
          "st1b    {za0v.b[w12, 0]}, p0, [%[dst_a]]           \n"
          "st1b    {za0v.b[w12, 1]}, p0, [%[dst_b]]           \n"
          "add     %[dst_a], %[dst_a], %[dst_stride_a]        \n"
          "add     %[dst_b], %[dst_b], %[dst_stride_b]        \n"
          "add     w12, w12, #2                               \n"
          "cmp     w12, %w[block_width]                       \n"
          "b.ne    2b                                         \n"
          : [src] "+r"(src3),                             // %[src]
            [dst_a] "+r"(dst2_a),                         // %[dst_a]
            [dst_b] "+r"(dst2_b)                          // %[dst_b]
          : [src_stride] "r"((ptrdiff_t)src_stride),      // %[src_stride]
            [dst_stride_a] "r"((ptrdiff_t)dst_stride_a),  // %[dst_stride_a]
            [dst_stride_b] "r"((ptrdiff_t)dst_stride_b),  // %[dst_stride_b]
            // Passed as a byte count: two bytes per UV pair.
            [block_width] "r"(block_width * 2),           // %[block_width]
            [block_height] "r"(block_height)              // %[block_height]
          : "cc", "memory", "p0", "w12", "za");

      // Step right by vl UV pairs in the source. dst2_a/dst2_b were already
      // advanced by the assembly, one destination row per pair stored.
      src2 += 2 * vl;
      width2 -= vl;
    } while (width2 > 0);

    // Step down by a full tile of source rows. Widen before multiplying so a
    // large stride cannot overflow int.
    src += (ptrdiff_t)2 * vl * src_stride;
    // Source rows become destination columns.
    dst_a += 2 * vl;
    dst_b += 2 * vl;
    height -= 2 * vl;
  } while (height > 0);
}
|
||||
|
||||
#endif // !defined(LIBYUV_DISABLE_SME) && defined(__aarch64__)
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user