[AArch64] Add SME impls of MultiplyRow_16 and ARGBMultiplyRow

Mostly just a translation of the existing Neon code to SME. Change-Id: Ic3d6b8ac774c9a1bb9204ed6c78c8802668bffe9 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6067147 Reviewed-by: Frank Barchard <fbarchard@chromium.org>
2025-12-06 16:56:55 +08:00 · 2024-09-12 10:21:17 +01:00 · 2024-09-12 10:21:17 +01:00 · 9144583f22
commit 9144583f22
parent 88a3472f52
3 changed files with 116 additions and 0 deletions
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@ -589,8 +589,10 @@ extern "C" {
 // The following are available on AArch64 SME platforms:
 #if !defined(LIBYUV_DISABLE_SME) && defined(CLANG_HAS_SME) && \
    defined(__aarch64__)
+#define HAS_ARGBMULTIPLYROW_SME
 #define HAS_I422TOARGBROW_SME
 #define HAS_I444TOARGBROW_SME
+#define HAS_MULTIPLYROW_16_SME
 #endif

 // The following are available on AArch64 platforms:
@ -3372,6 +3374,10 @@ void MultiplyRow_16_Any_NEON(const uint16_t* src_ptr,
                             uint16_t* dst_ptr,
                             int scale,
                             int width);
+void MultiplyRow_16_SME(const uint16_t* src_y,
+                        uint16_t* dst_y,
+                        int scale,
+                        int width);

 void DivideRow_16_C(const uint16_t* src_y,
                    uint16_t* dst_y,
@ -5039,6 +5045,10 @@ void ARGBMultiplyRow_Any_NEON(const uint8_t* y_buf,
                              const uint8_t* uv_buf,
                              uint8_t* dst_ptr,
                              int width);
+void ARGBMultiplyRow_SME(const uint8_t* src_argb,
+                         const uint8_t* src_argb1,
+                         uint8_t* dst_argb,
+                         int width);
 void ARGBMultiplyRow_MSA(const uint8_t* src_argb0,
                         const uint8_t* src_argb1,
                         uint8_t* dst_argb,
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@ -829,6 +829,11 @@ void ConvertToMSBPlane_16(const uint16_t* src_y,
    }
  }
 #endif
+#if defined(HAS_MULTIPLYROW_16_SME)
+  if (TestCpuFlag(kCpuHasSME)) {
+    MultiplyRow_16 = MultiplyRow_16_SME;
+  }
+#endif

  for (y = 0; y < height; ++y) {
    MultiplyRow_16(src_y, dst_y, scale, width);
@ -3134,6 +3139,11 @@ int ARGBMultiply(const uint8_t* src_argb0,
    }
  }
 #endif
+#if defined(HAS_ARGBMULTIPLYROW_SME)
+  if (TestCpuFlag(kCpuHasSME)) {
+    ARGBMultiplyRow = ARGBMultiplyRow_SME;
+  }
+#endif
 #if defined(HAS_ARGBMULTIPLYROW_MSA)
  if (TestCpuFlag(kCpuHasMSA)) {
    ARGBMultiplyRow = ARGBMultiplyRow_Any_MSA;
--- a/source/row_sme.cc
+++ b/source/row_sme.cc
@ -216,6 +216,102 @@ __arm_locally_streaming void I422ToARGBRow_SME(
      : "cc", "memory", YUVTORGB_SVE_REGS);
 }

+__arm_locally_streaming void MultiplyRow_16_SME(const uint16_t* src_y,
+                                                uint16_t* dst_y,
+                                                int scale,
+                                                int width) {
+  // Streaming-SVE only, no use of ZA tile.
+  int vl;
+  asm volatile(
+      "cnth    %x[vl]                                   \n"
+      "mov     z0.h, %w[scale]                          \n"
+      "subs    %w[width], %w[width], %w[vl]             \n"
+      "b.lt    2f                                       \n"
+
+      // Run bulk of computation with an all-true predicate to avoid predicate
+      // generation overhead.
+      "ptrue   p0.h                                     \n"
+      "1:                                               \n"
+      "ld1h    {z1.h}, p0/z, [%[src_y]]                 \n"
+      "incb    %[src_y]                                 \n"
+      "mul     z1.h, z0.h, z1.h                         \n"
+      "subs    %w[width], %w[width], %w[vl]             \n"
+      "st1h    {z1.h}, p0, [%[dst_y]]                   \n"
+      "incb    %[dst_y]                                 \n"
+      "b.ge    1b                                       \n"
+
+      "2:                                               \n"
+      "adds    %w[width], %w[width], %w[vl]             \n"
+      "b.eq    99f                                      \n"
+
+      // Calculate a predicate for the final iteration to deal with the tail.
+      "whilelt p0.h, wzr, %w[width]                     \n"
+      "ld1h    {z1.h}, p0/z, [%[src_y]]                 \n"
+      "mul     z1.h, z0.h, z1.h                         \n"
+      "st1h    {z1.h}, p0, [%[dst_y]]                   \n"
+
+      "99:                                              \n"
+      : [src_y] "+r"(src_y),  // %[src_y]
+        [dst_y] "+r"(dst_y),  // %[dst_y]
+        [width] "+r"(width),  // %[width]
+        [vl] "=&r"(vl)        // %[vl]
+      : [scale] "r"(scale)    // %[scale]
+      : "memory", "cc", "z0", "z1", "p0");
+}
+
+__arm_locally_streaming void ARGBMultiplyRow_SME(const uint8_t* src_argb,
+                                                 const uint8_t* src_argb1,
+                                                 uint8_t* dst_argb,
+                                                 int width) {
+  // Streaming-SVE only, no use of ZA tile.
+  width *= 4;
+  int vl;
+  asm volatile(
+      "cntb    %x[vl]                                   \n"
+      "subs    %w[width], %w[width], %w[vl]             \n"
+      "b.lt    2f                                       \n"
+
+      // Run bulk of computation with an all-true predicate to avoid predicate
+      // generation overhead.
+      "ptrue   p0.b                                     \n"
+      "1:                                               \n"
+      "ld1b    {z0.b}, p0/z, [%[src_argb]]              \n"
+      "ld1b    {z1.b}, p0/z, [%[src_argb1]]             \n"
+      "incb    %[src_argb]                              \n"
+      "incb    %[src_argb1]                             \n"
+      "umullb  z2.h, z0.b, z1.b                         \n"
+      "umullt  z1.h, z0.b, z1.b                         \n"
+      "rshrnb  z0.b, z2.h, #8                           \n"
+      "rshrnt  z0.b, z1.h, #8                           \n"
+      "subs    %w[width], %w[width], %w[vl]             \n"
+      "st1b    {z0.b}, p0, [%[dst_argb]]                \n"
+      "incb    %[dst_argb]                              \n"
+      "b.ge    1b                                       \n"
+
+      "2:                                               \n"
+      "adds    %w[width], %w[width], %w[vl]             \n"
+      "b.eq    99f                                      \n"
+
+      // Calculate a predicate for the final iteration to deal with the tail.
+      "whilelt p0.b, wzr, %w[width]                     \n"
+      "ld1b    {z0.b}, p0/z, [%[src_argb]]              \n"
+      "ld1b    {z1.b}, p0/z, [%[src_argb1]]             \n"
+      "umullb  z2.h, z0.b, z1.b                         \n"
+      "umullt  z1.h, z0.b, z1.b                         \n"
+      "rshrnb  z0.b, z2.h, #8                           \n"
+      "rshrnt  z0.b, z1.h, #8                           \n"
+      "st1b    {z0.b}, p0, [%[dst_argb]]                \n"
+
+      "99:                                              \n"
+      : [src_argb] "+r"(src_argb),    // %[src_argb]
+        [src_argb1] "+r"(src_argb1),  // %[src_argb1]
+        [dst_argb] "+r"(dst_argb),    // %[dst_argb]
+        [width] "+r"(width),          // %[width]
+        [vl] "=&r"(vl)                // %[vl]
+      :
+      : "memory", "cc", "z0", "z1", "z2", "p0", "p1");
+}
+
 #endif  // !defined(LIBYUV_DISABLE_SME) && defined(CLANG_HAS_SME) &&
        // defined(__aarch64__)