From 9144583f22ba23900e89c03c8483d2f42c712f6c Mon Sep 17 00:00:00 2001
From: George Steed
Date: Thu, 12 Sep 2024 10:21:17 +0100
Subject: [PATCH] [AArch64] Add SME impls of MultiplyRow_16 and ARGBMultiplyRow

Mostly just a translation of the existing Neon code to SME.

Change-Id: Ic3d6b8ac774c9a1bb9204ed6c78c8802668bffe9
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6067147
Reviewed-by: Frank Barchard
---
 include/libyuv/row.h       | 10 ++++
 source/planar_functions.cc | 10 ++++
 source/row_sme.cc          | 96 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 116 insertions(+)

diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 1953f9fe0..4dafb63b8 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -589,8 +589,10 @@ extern "C" {
 // The following are available on AArch64 SME platforms:
 #if !defined(LIBYUV_DISABLE_SME) && defined(CLANG_HAS_SME) && \
     defined(__aarch64__)
+#define HAS_ARGBMULTIPLYROW_SME
 #define HAS_I422TOARGBROW_SME
 #define HAS_I444TOARGBROW_SME
+#define HAS_MULTIPLYROW_16_SME
 #endif
 
 // The following are available on AArch64 platforms:
@@ -3372,6 +3374,10 @@ void MultiplyRow_16_Any_NEON(const uint16_t* src_ptr,
                              uint16_t* dst_ptr,
                              int scale,
                              int width);
+void MultiplyRow_16_SME(const uint16_t* src_y,
+                        uint16_t* dst_y,
+                        int scale,
+                        int width);
 
 void DivideRow_16_C(const uint16_t* src_y,
                     uint16_t* dst_y,
@@ -5039,6 +5045,10 @@ void ARGBMultiplyRow_Any_NEON(const uint8_t* y_buf,
                               const uint8_t* uv_buf,
                               uint8_t* dst_ptr,
                               int width);
+void ARGBMultiplyRow_SME(const uint8_t* src_argb,
+                         const uint8_t* src_argb1,
+                         uint8_t* dst_argb,
+                         int width);
 void ARGBMultiplyRow_MSA(const uint8_t* src_argb0,
                          const uint8_t* src_argb1,
                          uint8_t* dst_argb,
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index 3c2f3f850..cc909eb73 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -829,6 +829,11 @@ void ConvertToMSBPlane_16(const uint16_t* src_y,
     }
   }
 #endif
+#if defined(HAS_MULTIPLYROW_16_SME)
+  if (TestCpuFlag(kCpuHasSME)) {
+    MultiplyRow_16 = MultiplyRow_16_SME;
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     MultiplyRow_16(src_y, dst_y, scale, width);
@@ -3134,6 +3139,11 @@ int ARGBMultiply(const uint8_t* src_argb0,
     }
   }
 #endif
+#if defined(HAS_ARGBMULTIPLYROW_SME)
+  if (TestCpuFlag(kCpuHasSME)) {
+    ARGBMultiplyRow = ARGBMultiplyRow_SME;
+  }
+#endif
 #if defined(HAS_ARGBMULTIPLYROW_MSA)
   if (TestCpuFlag(kCpuHasMSA)) {
     ARGBMultiplyRow = ARGBMultiplyRow_Any_MSA;
diff --git a/source/row_sme.cc b/source/row_sme.cc
index 7676d9e64..da94cd7be 100644
--- a/source/row_sme.cc
+++ b/source/row_sme.cc
@@ -216,6 +216,102 @@ __arm_locally_streaming void I422ToARGBRow_SME(
       : "cc", "memory", YUVTORGB_SVE_REGS);
 }
 
+__arm_locally_streaming void MultiplyRow_16_SME(const uint16_t* src_y,
+                                                uint16_t* dst_y,
+                                                int scale,
+                                                int width) {
+  // Streaming-SVE only, no use of ZA tile.
+  int vl;
+  asm volatile(
+      "cnth     %x[vl]                                \n"
+      "mov      z0.h, %w[scale]                       \n"
+      "subs     %w[width], %w[width], %w[vl]          \n"
+      "b.lt     2f                                    \n"
+
+      // Run bulk of computation with an all-true predicate to avoid predicate
+      // generation overhead.
+      "ptrue    p0.h                                  \n"
+      "1:                                             \n"
+      "ld1h     {z1.h}, p0/z, [%[src_y]]              \n"
+      "incb     %[src_y]                              \n"
+      "mul      z1.h, z0.h, z1.h                      \n"
+      "subs     %w[width], %w[width], %w[vl]          \n"
+      "st1h     {z1.h}, p0, [%[dst_y]]                \n"
+      "incb     %[dst_y]                              \n"
+      "b.ge     1b                                    \n"
+
+      "2:                                             \n"
+      "adds     %w[width], %w[width], %w[vl]          \n"
+      "b.eq     99f                                   \n"
+
+      // Calculate a predicate for the final iteration to deal with the tail.
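+      // whilelt sets the low %w[width] lanes of p0 active and the rest
+      // inactive, so the final partial vector is loaded, scaled and stored
+      // without reading or writing past the end of the row.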
+ "whilelt p0.h, wzr, %w[width] \n" + "ld1h {z1.h}, p0/z, [%[src_y]] \n" + "mul z1.h, z0.h, z1.h \n" + "st1h {z1.h}, p0, [%[dst_y]] \n" + + "99: \n" + : [src_y] "+r"(src_y), // %[src_y] + [dst_y] "+r"(dst_y), // %[dst_y] + [width] "+r"(width), // %[width] + [vl] "=&r"(vl) // %[vl] + : [scale] "r"(scale) // %[scale] + : "memory", "cc", "z0", "z1", "p0"); +} + +__arm_locally_streaming void ARGBMultiplyRow_SME(const uint8_t* src_argb, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + // Streaming-SVE only, no use of ZA tile. + width *= 4; + int vl; + asm volatile( + "cntb %x[vl] \n" + "subs %w[width], %w[width], %w[vl] \n" + "b.lt 2f \n" + + // Run bulk of computation with an all-true predicate to avoid predicate + // generation overhead. + "ptrue p0.b \n" + "1: \n" + "ld1b {z0.b}, p0/z, [%[src_argb]] \n" + "ld1b {z1.b}, p0/z, [%[src_argb1]] \n" + "incb %[src_argb] \n" + "incb %[src_argb1] \n" + "umullb z2.h, z0.b, z1.b \n" + "umullt z1.h, z0.b, z1.b \n" + "rshrnb z0.b, z2.h, #8 \n" + "rshrnt z0.b, z1.h, #8 \n" + "subs %w[width], %w[width], %w[vl] \n" + "st1b {z0.b}, p0, [%[dst_argb]] \n" + "incb %[dst_argb] \n" + "b.ge 1b \n" + + "2: \n" + "adds %w[width], %w[width], %w[vl] \n" + "b.eq 99f \n" + + // Calculate a predicate for the final iteration to deal with the tail. + "whilelt p0.b, wzr, %w[width] \n" + "ld1b {z0.b}, p0/z, [%[src_argb]] \n" + "ld1b {z1.b}, p0/z, [%[src_argb1]] \n" + "umullb z2.h, z0.b, z1.b \n" + "umullt z1.h, z0.b, z1.b \n" + "rshrnb z0.b, z2.h, #8 \n" + "rshrnt z0.b, z1.h, #8 \n" + "st1b {z0.b}, p0, [%[dst_argb]] \n" + + "99: \n" + : [src_argb] "+r"(src_argb), // %[src_argb] + [src_argb1] "+r"(src_argb1), // %[src_argb1] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width), // %[width] + [vl] "=&r"(vl) // %[vl] + : + : "memory", "cc", "z0", "z1", "z2", "p0", "p1"); +} + #endif // !defined(LIBYUV_DISABLE_SME) && defined(CLANG_HAS_SME) && // defined(__aarch64__)