[AArch64] Add SME implementation of CopyRow

Add a streaming-SVE implementation of CopyRow using normal vector
load/store instructions.

Change-Id: Ia551413f9740a96473fa2e8a0958953be2f4b04e
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6074374
Reviewed-by: Justin Green <greenjustin@google.com>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
George Steed 2024-11-20 15:51:42 +00:00 committed by Frank Barchard
parent 418b6df0de
commit 2d8652f3e7
5 changed files with 57 additions and 1 deletions

View File

@ -592,12 +592,13 @@ extern "C" {
// Row functions implemented with Arm SME (streaming SVE).
// Enabled only when the toolchain supports SME and targets AArch64.
// Fix: HAS_CONVERT16TO8ROW_SME was defined twice (redundant duplicate at the
// end of the list); keep a single definition and alphabetical ordering.
#if !defined(LIBYUV_DISABLE_SME) && defined(CLANG_HAS_SME) && \
defined(__aarch64__)
#define HAS_ARGBMULTIPLYROW_SME
#define HAS_CONVERT16TO8ROW_SME
#define HAS_COPYROW_SME
#define HAS_I422TOARGBROW_SME
#define HAS_I444TOARGBROW_SME
#define HAS_MERGEUVROW_16_SME
#define HAS_MERGEUVROW_SME
#define HAS_MULTIPLYROW_16_SME
#endif
// The following are available on AArch64 platforms:
@ -3495,6 +3496,7 @@ void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width);
void CopyRow_AVX512BW(const uint8_t* src, uint8_t* dst, int width);
void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width);
void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width);
void CopyRow_SME(const uint8_t* src, uint8_t* dst, int width);
void CopyRow_MIPS(const uint8_t* src, uint8_t* dst, int count);
void CopyRow_RVV(const uint8_t* src, uint8_t* dst, int count);
void CopyRow_C(const uint8_t* src, uint8_t* dst, int count);

View File

@ -77,6 +77,11 @@ void CopyPlane(const uint8_t* src_y,
CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
}
#endif
#if defined(HAS_COPYROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
CopyRow = CopyRow_SME;
}
#endif
#if defined(HAS_COPYROW_RVV)
if (TestCpuFlag(kCpuHasRVV)) {
CopyRow = CopyRow_RVV;

View File

@ -249,6 +249,11 @@ void RotatePlane180(const uint8_t* src,
CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
}
#endif
#if defined(HAS_COPYROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
CopyRow = CopyRow_SME;
}
#endif
#if defined(HAS_COPYROW_RVV)
if (TestCpuFlag(kCpuHasRVV)) {
CopyRow = CopyRow_RVV;

View File

@ -204,6 +204,11 @@ static int ARGBRotate180(const uint8_t* src_argb,
CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
}
#endif
#if defined(HAS_COPYROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
CopyRow = CopyRow_SME;
}
#endif
#if defined(HAS_COPYROW_RVV)
if (TestCpuFlag(kCpuHasRVV)) {
CopyRow = CopyRow_RVV;

View File

@ -471,6 +471,45 @@ __arm_locally_streaming void Convert16To8Row_SME(const uint16_t* src_y,
: "cc", "memory", "z0", "z1", "z2", "p0", "p1", "p2");
}
// Copy `width` bytes from `src` to `dst` using streaming-SVE vector
// load/store instructions. `__arm_locally_streaming` makes the compiler
// enter/exit streaming mode around the function body, so the asm below
// executes with the streaming vector length (SVL).
// Handles width == 0 correctly (both the main loop and the tail are
// skipped). Assumes width is non-negative — TODO(review): confirm callers
// never pass a negative width.
__arm_locally_streaming void CopyRow_SME(const uint8_t* src,
uint8_t* dst,
int width) {
// Streaming-SVE only, no use of ZA tile.
int vl;
asm volatile(
// vl = streaming vector length in bytes (CNTB).
"cntb %x[vl] \n"
// Pre-decrement width by one vector; if fewer than vl bytes were
// requested, skip straight to the tail handling at 2:.
"subs %w[width], %w[width], %w[vl] \n"
"b.lt 2f \n"
// Run bulk of computation with an all-true predicate to avoid predicate
// generation overhead.
"ptrue p0.b \n"
// Main loop: copy one full vector (vl bytes) per iteration.
// INCB advances each pointer by vl bytes. The loop continues while the
// pre-decremented width stays >= 0, i.e. while a full vector remains.
"1: \n"
"ld1b {z0.b}, p0/z, [%[src]] \n"
"incb %[src] \n"
"subs %w[width], %w[width], %w[vl] \n"
"st1b {z0.b}, p0, [%[dst]] \n"
"incb %[dst] \n"
"b.ge 1b \n"
"2: \n"
// Undo the last over-subtraction; width is now the number of leftover
// bytes in [0, vl). Zero means the copy was an exact multiple of vl.
"adds %w[width], %w[width], %w[vl] \n"
"b.eq 99f \n"
// Calculate a predicate for the final iteration to deal with the tail.
// WHILELT sets the first `width` byte lanes active; inactive load lanes
// are zeroed (p0/z) and inactive store lanes are not written.
"whilelt p0.b, wzr, %w[width] \n"
"ld1b {z0.b}, p0/z, [%[src]] \n"
"st1b {z0.b}, p0, [%[dst]] \n"
"99: \n"
: [src] "+r"(src), // %[src]
[dst] "+r"(dst), // %[dst]
[width] "+r"(width), // %[width]
[vl] "=&r"(vl) // %[vl]
:
// Clobbers: condition flags (SUBS/ADDS), the copied memory, and the
// single vector/predicate register pair used for the transfer.
: "memory", "cc", "z0", "p0");
}
#endif // !defined(LIBYUV_DISABLE_SME) && defined(CLANG_HAS_SME) &&
// defined(__aarch64__)