From b5f9d7cb76a1e31f1893df0d903a8a421f2fbba0 Mon Sep 17 00:00:00 2001
From: George Steed
Date: Tue, 28 May 2024 16:18:15 +0100
Subject: [PATCH] [AArch64] Add SME implementation of TransposeUVWxH

We can make use of the ZA tile register to do the transpose and
de-interleaving of UV components without any explicit permute
instructions: the tile is loaded horizontally, placing the UV components
into alternating columns, then we can simply store the independent
components vertically.

Change-Id: I67bd82dc840a43888290be1c9db8a3c05f16d730
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5703588
Reviewed-by: Frank Barchard
---
 include/libyuv/rotate_row.h |  9 +++++
 source/rotate.cc            | 17 ++++++++
 source/rotate_sme.cc        | 77 +++++++++++++++++++++++++++++++++++++
 3 files changed, 103 insertions(+)

diff --git a/include/libyuv/rotate_row.h b/include/libyuv/rotate_row.h
index a18437f4d..9ce9fad15 100644
--- a/include/libyuv/rotate_row.h
+++ b/include/libyuv/rotate_row.h
@@ -76,6 +76,7 @@ extern "C" {
 
 #if !defined(LIBYUV_DISABLE_SME) && defined(__aarch64__)
 #define HAS_TRANSPOSEWXH_SME
+#define HAS_TRANSPOSEUVWXH_SME
 #endif
 
 #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
@@ -210,6 +211,14 @@ void TransposeUVWx8_NEON(const uint8_t* src,
                          uint8_t* dst_b,
                          int dst_stride_b,
                          int width);
+void TransposeUVWxH_SME(const uint8_t* src,
+                        int src_stride,
+                        uint8_t* dst_a,
+                        int dst_stride_a,
+                        uint8_t* dst_b,
+                        int dst_stride_b,
+                        int width,
+                        int height);
 void TransposeUVWx16_MSA(const uint8_t* src,
                          int src_stride,
                          uint8_t* dst_a,
diff --git a/source/rotate.cc b/source/rotate.cc
index 16d4f0b4e..5f898fd03 100644
--- a/source/rotate.cc
+++ b/source/rotate.cc
@@ -273,6 +273,11 @@ void SplitTransposeUV(const uint8_t* src,
                       int width,
                       int height) {
   int i = height;
+#if defined(HAS_TRANSPOSEUVWXH_SME)
+  void (*TransposeUVWxH)(const uint8_t* src, int src_stride, uint8_t* dst_a,
+                         int dst_stride_a, uint8_t* dst_b, int dst_stride_b,
+                         int width, int height) = TransposeUVWxH_C;
+#endif
 #if defined(HAS_TRANSPOSEUVWX16_MSA)
   void (*TransposeUVWx16)(const uint8_t* src, int src_stride, uint8_t* dst_a,
                           int dst_stride_a, uint8_t* dst_b, int dst_stride_b,
@@ -310,6 +315,11 @@ void SplitTransposeUV(const uint8_t* src,
     }
   }
 #endif
+#if defined(HAS_TRANSPOSEUVWXH_SME)
+  if (TestCpuFlag(kCpuHasSME)) {
+    TransposeUVWxH = TransposeUVWxH_SME;
+  }
+#endif
 #if defined(HAS_TRANSPOSEUVWX8_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
     TransposeUVWx8 = TransposeUVWx8_Any_SSE2;
@@ -320,6 +330,13 @@ void SplitTransposeUV(const uint8_t* src,
 #endif
 #endif /* defined(HAS_TRANSPOSEUVWX16_MSA) */
 
+#if defined(HAS_TRANSPOSEUVWXH_SME)
+  if (TestCpuFlag(kCpuHasSME)) {
+    TransposeUVWxH(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,
+                   width, i);
+    return;
+  }
+#endif
 #if defined(HAS_TRANSPOSEUVWX16_MSA)
   // Work through the source in 8x8 tiles.
   while (i >= 16) {
diff --git a/source/rotate_sme.cc b/source/rotate_sme.cc
index 182785e95..70e2a0d40 100644
--- a/source/rotate_sme.cc
+++ b/source/rotate_sme.cc
@@ -87,6 +87,83 @@ __arm_locally_streaming __arm_new("za") void TransposeWxH_SME(
   } while (height > 0);
 }
 
+__arm_locally_streaming __arm_new("za") void TransposeUVWxH_SME(
+    const uint8_t* src,
+    int src_stride,
+    uint8_t* dst_a,
+    int dst_stride_a,
+    uint8_t* dst_b,
+    int dst_stride_b,
+    int width,
+    int height) {
+  int vl;
+  asm("cnth %x0" : "=r"(vl));
+
+  do {
+    const uint8_t* src2 = src;
+    uint8_t* dst2_a = dst_a;
+    uint8_t* dst2_b = dst_b;
+
+    // Process up to vl * 2 rows per iteration of the outer loop.
+    int block_height = height > vl * 2 ? vl * 2 : height;
+
+    int width2 = width;
+    do {
+      const uint8_t* src3 = src2;
+
+      // Process up to VL 16-bit elements per iteration of the inner loop.
+      int block_width = width2 > vl ? vl : width2;
+
+      asm volatile(
+          "mov     w12, #0                                  \n"
+
+          // Create a predicate to handle loading partial rows,
+          // %[block_width] is always a multiple of two here.
+          "whilelt p0.b, wzr, %w[block_width]               \n"
+
+          // Load H <= VL rows into ZA0, such that U/V components exist in
+          // alternating columns.
+          "1:                                               \n"
+          "ld1b    {za0h.b[w12, 0]}, p0/z, [%[src]]         \n"
+          "add     %[src], %[src], %[src_stride]            \n"
+          "add     w12, w12, #1                             \n"
+          "cmp     w12, %w[block_height]                    \n"
+          "b.ne    1b                                       \n"
+
+          // Create a predicate to handle storing partial columns.
+          "whilelt p0.b, wzr, %w[block_height]              \n"
+          "mov     w12, #0                                  \n"
+
+          // Store alternating UV data from pairs of ZA0 columns.
+          "2:                                               \n"
+          "st1b    {za0v.b[w12, 0]}, p0, [%[dst_a]]         \n"
+          "st1b    {za0v.b[w12, 1]}, p0, [%[dst_b]]         \n"
+          "add     %[dst_a], %[dst_a], %[dst_stride_a]      \n"
+          "add     %[dst_b], %[dst_b], %[dst_stride_b]      \n"
+          "add     w12, w12, #2                             \n"
+          "cmp     w12, %w[block_width]                     \n"
+          "b.ne    2b                                       \n"
+          : [src] "+r"(src3),                             // %[src]
+            [dst_a] "+r"(dst2_a),                         // %[dst_a]
+            [dst_b] "+r"(dst2_b)                          // %[dst_b]
+          : [src_stride] "r"((ptrdiff_t)src_stride),      // %[src_stride]
+            [dst_stride_a] "r"((ptrdiff_t)dst_stride_a),  // %[dst_stride_a]
+            [dst_stride_b] "r"((ptrdiff_t)dst_stride_b),  // %[dst_stride_b]
+            [block_width] "r"(block_width * 2),           // %[block_width]
+            [block_height] "r"(block_height)              // %[block_height]
+          : "cc", "memory", "p0", "w12", "za");
+
+      src2 += 2 * vl;
+      width2 -= vl;
+    } while (width2 > 0);
+
+    src += 2 * vl * src_stride;
+    dst_a += 2 * vl;
+    dst_b += 2 * vl;
+    height -= 2 * vl;
+  } while (height > 0);
+}
+
 #endif  // !defined(LIBYUV_DISABLE_SME) && defined(__aarch64__)
 
 #ifdef __cplusplus
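
Note: for readers who want the kernel's contract spelled out without the SME
details, the behaviour being accelerated can be restated in a few lines of
scalar C. This sketch is not part of the patch: the name
TransposeUVWxH_Reference is hypothetical, and the loops are intended to match
the TransposeUVWxH_C fallback that the dispatch code above selects when SME is
unavailable. Here width counts UV pairs, so each source row holds 2 * width
interleaved bytes, and source row y becomes column y of both output planes.

#include <stdint.h>

// Hypothetical scalar reference (not in libyuv): transpose a width x height
// block of interleaved UV pairs into separate, transposed U and V planes.
static void TransposeUVWxH_Reference(const uint8_t* src,
                                     int src_stride,
                                     uint8_t* dst_a,
                                     int dst_stride_a,
                                     uint8_t* dst_b,
                                     int dst_stride_b,
                                     int width,
                                     int height) {
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) {
      // UV pair x of source row y lands in row x, column y of each plane.
      dst_a[x * dst_stride_a + y] = src[y * src_stride + 2 * x + 0];  // U
      dst_b[x * dst_stride_b + y] = src[y * src_stride + 2 * x + 1];  // V
    }
  }
}

Seen against this reference, the ZA trick in the kernel is that a single
horizontal ld1b fills tile slice za0h.b[w12] with up to VL interleaved bytes
of one source row; the vertical slices za0v.b read the same tile column-wise,
so column 2k already holds the k-th transposed U row and column 2k+1 the k-th
transposed V row, and storing them to dst_a and dst_b needs no permutes.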