From b5f9d7cb76a1e31f1893df0d903a8a421f2fbba0 Mon Sep 17 00:00:00 2001
From: George Steed
Date: Tue, 28 May 2024 16:18:15 +0100
Subject: [PATCH] [AArch64] Add SME implementation of TransposeUVWxH

We can make use of the ZA tile register to do the transpose and
de-interleaving of UV components without any explicit permute
instructions: the tile is loaded horizontally, placing the UV components
into alternating columns, then we can simply store the independent
components vertically.

Change-Id: I67bd82dc840a43888290be1c9db8a3c05f16d730
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5703588
Reviewed-by: Frank Barchard
---
 include/libyuv/rotate_row.h |  9 +++++
 source/rotate.cc            | 17 ++++++++
 source/rotate_sme.cc        | 77 +++++++++++++++++++++++++++++++++++++
 3 files changed, 103 insertions(+)

diff --git a/include/libyuv/rotate_row.h b/include/libyuv/rotate_row.h
index a18437f4d..9ce9fad15 100644
--- a/include/libyuv/rotate_row.h
+++ b/include/libyuv/rotate_row.h
@@ -76,6 +76,7 @@ extern "C" {
 
 #if !defined(LIBYUV_DISABLE_SME) && defined(__aarch64__)
 #define HAS_TRANSPOSEWXH_SME
+#define HAS_TRANSPOSEUVWXH_SME
 #endif
 
 #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
@@ -210,6 +211,14 @@ void TransposeUVWx8_NEON(const uint8_t* src,
                          uint8_t* dst_b,
                          int dst_stride_b,
                          int width);
+void TransposeUVWxH_SME(const uint8_t* src,
+                        int src_stride,
+                        uint8_t* dst_a,
+                        int dst_stride_a,
+                        uint8_t* dst_b,
+                        int dst_stride_b,
+                        int width,
+                        int height);
 void TransposeUVWx16_MSA(const uint8_t* src,
                          int src_stride,
                          uint8_t* dst_a,
diff --git a/source/rotate.cc b/source/rotate.cc
index 16d4f0b4e..5f898fd03 100644
--- a/source/rotate.cc
+++ b/source/rotate.cc
@@ -273,6 +273,11 @@ void SplitTransposeUV(const uint8_t* src,
                       int width,
                       int height) {
   int i = height;
+#if defined(HAS_TRANSPOSEUVWXH_SME)
+  void (*TransposeUVWxH)(const uint8_t* src, int src_stride, uint8_t* dst_a,
+                         int dst_stride_a, uint8_t* dst_b, int dst_stride_b,
+                         int width, int height) = TransposeUVWxH_C;
+#endif
 #if defined(HAS_TRANSPOSEUVWX16_MSA)
   void (*TransposeUVWx16)(const uint8_t* src, int src_stride, uint8_t* dst_a,
                           int dst_stride_a, uint8_t* dst_b, int dst_stride_b,
@@ -310,6 +315,11 @@ void SplitTransposeUV(const uint8_t* src,
     }
   }
 #endif
+#if defined(HAS_TRANSPOSEUVWXH_SME)
+  if (TestCpuFlag(kCpuHasSME)) {
+    TransposeUVWxH = TransposeUVWxH_SME;
+  }
+#endif
 #if defined(HAS_TRANSPOSEUVWX8_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
     TransposeUVWx8 = TransposeUVWx8_Any_SSE2;
@@ -320,6 +330,13 @@ void SplitTransposeUV(const uint8_t* src,
 #endif
 #endif /* defined(HAS_TRANSPOSEUVWX16_MSA) */
 
+#if defined(HAS_TRANSPOSEUVWXH_SME)
+  if (TestCpuFlag(kCpuHasSME)) {
+    TransposeUVWxH(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,
+                   width, i);
+    return;
+  }
+#endif
 #if defined(HAS_TRANSPOSEUVWX16_MSA)
   // Work through the source in 8x8 tiles.
   while (i >= 16) {
diff --git a/source/rotate_sme.cc b/source/rotate_sme.cc
index 182785e95..70e2a0d40 100644
--- a/source/rotate_sme.cc
+++ b/source/rotate_sme.cc
@@ -87,6 +87,83 @@ __arm_locally_streaming __arm_new("za") void TransposeWxH_SME(
   } while (height > 0);
 }
 
+__arm_locally_streaming __arm_new("za") void TransposeUVWxH_SME(
+    const uint8_t* src,
+    int src_stride,
+    uint8_t* dst_a,
+    int dst_stride_a,
+    uint8_t* dst_b,
+    int dst_stride_b,
+    int width,
+    int height) {
+  int vl;
+  asm("cnth %x0" : "=r"(vl));
+
+  do {
+    const uint8_t* src2 = src;
+    uint8_t* dst2_a = dst_a;
+    uint8_t* dst2_b = dst_b;
+
+    // Process up to vl * 2 rows per iteration of the outer loop.
+    int block_height = height > vl * 2 ? vl * 2 : height;
+
+    int width2 = width;
+    do {
+      const uint8_t* src3 = src2;
+
+      // Process up to VL 16-bit elements per iteration of the inner loop.
+      int block_width = width2 > vl ? vl : width2;
+
+      asm volatile(
+          "mov     w12, #0                                  \n"
+
+          // Create a predicate to handle loading partial rows,
+          // %[block_width] is always a multiple of two here.
+          "whilelt p0.b, wzr, %w[block_width]               \n"
+
+          // Load H <= VL rows into ZA0, such that U/V components exist in
+          // alternating columns.
+          "1:                                               \n"
+          "ld1b    {za0h.b[w12, 0]}, p0/z, [%[src]]         \n"
+          "add     %[src], %[src], %[src_stride]            \n"
+          "add     w12, w12, #1                             \n"
+          "cmp     w12, %w[block_height]                    \n"
+          "b.ne    1b                                       \n"
+
+          // Create a predicate to handle storing partial columns.
+          "whilelt p0.b, wzr, %w[block_height]              \n"
+          "mov     w12, #0                                  \n"
+
+          // Store alternating UV data from pairs of ZA0 columns.
+          "2:                                               \n"
+          "st1b    {za0v.b[w12, 0]}, p0, [%[dst_a]]         \n"
+          "st1b    {za0v.b[w12, 1]}, p0, [%[dst_b]]         \n"
+          "add     %[dst_a], %[dst_a], %[dst_stride_a]      \n"
+          "add     %[dst_b], %[dst_b], %[dst_stride_b]      \n"
+          "add     w12, w12, #2                             \n"
+          "cmp     w12, %w[block_width]                     \n"
+          "b.ne    2b                                       \n"
+          : [src] "+r"(src3),                             // %[src]
+            [dst_a] "+r"(dst2_a),                         // %[dst_a]
+            [dst_b] "+r"(dst2_b)                          // %[dst_b]
+          : [src_stride] "r"((ptrdiff_t)src_stride),      // %[src_stride]
+            [dst_stride_a] "r"((ptrdiff_t)dst_stride_a),  // %[dst_stride_a]
+            [dst_stride_b] "r"((ptrdiff_t)dst_stride_b),  // %[dst_stride_b]
+            [block_width] "r"(block_width * 2),           // %[block_width]
+            [block_height] "r"(block_height)              // %[block_height]
+          : "cc", "memory", "p0", "w12", "za");
+
+      src2 += 2 * vl;
+      width2 -= vl;
+    } while (width2 > 0);
+
+    src += 2 * vl * src_stride;
+    dst_a += 2 * vl;
+    dst_b += 2 * vl;
+    height -= 2 * vl;
+  } while (height > 0);
+}
+
 #endif  // !defined(LIBYUV_DISABLE_SME) && defined(__aarch64__)
 
 #ifdef __cplusplus
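
Note: for readers who want the kernel's contract spelled out without the SME
details, the behaviour being accelerated can be restated in a few lines of
scalar C. This sketch is not part of the patch: the name
TransposeUVWxH_Reference is hypothetical, and the loops are intended to match
the TransposeUVWxH_C fallback that the dispatch code above selects when SME is
unavailable. Here width counts UV pairs, so each source row holds 2 * width
interleaved bytes, and source row y becomes column y of both output planes.

#include <stdint.h>

// Hypothetical scalar reference (not in libyuv): transpose a width x height
// block of interleaved UV pairs into separate, transposed U and V planes.
static void TransposeUVWxH_Reference(const uint8_t* src,
                                     int src_stride,
                                     uint8_t* dst_a,
                                     int dst_stride_a,
                                     uint8_t* dst_b,
                                     int dst_stride_b,
                                     int width,
                                     int height) {
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) {
      // UV pair x of source row y lands in row x, column y of each plane.
      dst_a[x * dst_stride_a + y] = src[y * src_stride + 2 * x + 0];  // U
      dst_b[x * dst_stride_b + y] = src[y * src_stride + 2 * x + 1];  // V
    }
  }
}

Seen against this reference, the ZA trick in the kernel is that a single
horizontal ld1b fills tile slice za0h.b[w12] with up to VL interleaved bytes
of one source row; the vertical slices za0v.b read the same tile column-wise,
so column 2k already holds the k-th transposed U row and column 2k+1 the k-th
transposed V row, and storing them to dst_a and dst_b needs no permutes.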