[AArch64] Add SME implementation of MergeUVRow{,_16}

Mostly just a straightforward copy of the Neon code ported to
Streaming-SVE. We can use predication to avoid needing an `Any` kernel,
and ST2 to avoid needing a separate ZIP instruction.

There is no benefit from this kernel when the SVE vector length is only
128 bits, so skip writing a non-streaming SVE implementation.
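For reference, the operation being vectorised matches the scalar C
kernels (a sketch along the lines of libyuv's MergeUVRow_C and
MergeUVRow_16_C, shown here for clarity rather than copied verbatim):

  void MergeUVRow_C(const uint8_t* src_u, const uint8_t* src_v,
                    uint8_t* dst_uv, int width) {
    for (int x = 0; x < width; ++x) {
      dst_uv[0] = src_u[x];  // interleave U and V bytes: UVUVUV...
      dst_uv[1] = src_v[x];
      dst_uv += 2;
    }
  }

  void MergeUVRow_16_C(const uint16_t* src_u, const uint16_t* src_v,
                       uint16_t* dst_uv, int depth, int width) {
    int shift = 16 - depth;  // scale depth-bit samples up to 16 bits
    for (int x = 0; x < width; ++x) {
      dst_uv[0] = (uint16_t)(src_u[x] << shift);
      dst_uv[1] = (uint16_t)(src_v[x] << shift);
      dst_uv += 2;
    }
  }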

Change-Id: I5ae36afe699b88f119dc545e49c59c5d85e98742
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6070785
Reviewed-by: Justin Green <greenjustin@google.com>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
George Steed 2024-11-16 21:00:36 +00:00 committed by Frank Barchard
parent 3e75e41e79
commit 7391559cb4
5 changed files with 153 additions and 0 deletions

include/libyuv/row.h

@@ -594,6 +594,8 @@ extern "C" {
#define HAS_ARGBMULTIPLYROW_SME
#define HAS_I422TOARGBROW_SME
#define HAS_I444TOARGBROW_SME
#define HAS_MERGEUVROW_16_SME
#define HAS_MERGEUVROW_SME
#define HAS_MULTIPLYROW_16_SME
#endif
@@ -2796,6 +2798,10 @@ void MergeUVRow_NEON(const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_uv,
int width);
void MergeUVRow_SME(const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_uv,
int width);
void MergeUVRow_MSA(const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_uv,
@@ -3339,6 +3345,11 @@ void MergeUVRow_16_Any_NEON(const uint16_t* src_u,
uint16_t* dst_uv,
int depth,
int width);
void MergeUVRow_16_SME(const uint16_t* src_u,
const uint16_t* src_v,
uint16_t* dst_uv,
int depth,
int width);
void SplitUVRow_16_C(const uint16_t* src_uv,
uint16_t* dst_u,

source/convert.cc

@@ -746,6 +746,11 @@ int I010ToNV12(const uint16_t* src_y,
}
}
#endif
#if defined(HAS_MERGEUVROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
MergeUVRow = MergeUVRow_SME;
}
#endif
#if defined(HAS_MERGEUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
MergeUVRow = MergeUVRow_Any_MSA;
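
For context (not shown in the hunk), the surrounding selection chain
follows libyuv's usual pattern: start from the C kernel, let Neon
install an Any wrapper plus an aligned fast path, then let SME override
both, since the predicated kernel handles any width directly. Roughly:

  void (*MergeUVRow)(const uint8_t*, const uint8_t*, uint8_t*, int) =
      MergeUVRow_C;
  #if defined(HAS_MERGEUVROW_NEON)
    if (TestCpuFlag(kCpuHasNEON)) {
      MergeUVRow = MergeUVRow_Any_NEON;  // handles any width
      if (IS_ALIGNED(halfwidth, 16)) {
        MergeUVRow = MergeUVRow_NEON;    // full-vector fast path
      }
    }
  #endif
  #if defined(HAS_MERGEUVROW_SME)
    if (TestCpuFlag(kCpuHasSME)) {
      MergeUVRow = MergeUVRow_SME;       // no Any wrapper needed
    }
  #endif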
@@ -1188,6 +1193,11 @@ int I422ToNV21(const uint8_t* src_y,
}
}
#endif
#if defined(HAS_MERGEUVROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
MergeUVRow = MergeUVRow_SME;
}
#endif
#if defined(HAS_MERGEUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
MergeUVRow = MergeUVRow_Any_MSA;

source/convert_from_argb.cc

@@ -486,6 +486,11 @@ int ARGBToNV12(const uint8_t* src_argb,
}
}
#endif
#if defined(HAS_MERGEUVROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
MergeUVRow_ = MergeUVRow_SME;
}
#endif
#if defined(HAS_MERGEUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
MergeUVRow_ = MergeUVRow_Any_MSA;
@@ -702,6 +707,11 @@ int ARGBToNV21(const uint8_t* src_argb,
}
}
#endif
#if defined(HAS_MERGEUVROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
MergeUVRow_ = MergeUVRow_SME;
}
#endif
#if defined(HAS_MERGEUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
MergeUVRow_ = MergeUVRow_Any_MSA;
@@ -905,6 +915,11 @@ int ABGRToNV12(const uint8_t* src_abgr,
}
}
#endif
#if defined(HAS_MERGEUVROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
MergeUVRow_ = MergeUVRow_SME;
}
#endif
#if defined(HAS_MERGEUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
MergeUVRow_ = MergeUVRow_Any_MSA;
@@ -1109,6 +1124,11 @@ int ABGRToNV21(const uint8_t* src_abgr,
}
}
#endif
#if defined(HAS_MERGEUVROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
MergeUVRow_ = MergeUVRow_SME;
}
#endif
#if defined(HAS_MERGEUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
MergeUVRow_ = MergeUVRow_Any_MSA;
@@ -3522,6 +3542,11 @@ int RAWToJNV21(const uint8_t* src_raw,
}
}
#endif
#if defined(HAS_MERGEUVROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
MergeUVRow_ = MergeUVRow_SME;
}
#endif
#if defined(HAS_MERGEUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
MergeUVRow_ = MergeUVRow_Any_MSA;

source/planar_functions.cc

@@ -635,6 +635,11 @@ void MergeUVPlane(const uint8_t* src_u,
}
}
#endif
#if defined(HAS_MERGEUVROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
MergeUVRow = MergeUVRow_SME;
}
#endif
#if defined(HAS_MERGEUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
MergeUVRow = MergeUVRow_Any_MSA;
@@ -774,6 +779,11 @@ void MergeUVPlane_16(const uint16_t* src_u,
}
}
#endif
#if defined(HAS_MERGEUVROW_16_SME)
if (TestCpuFlag(kCpuHasSME)) {
MergeUVRow_16 = MergeUVRow_16_SME;
}
#endif
for (y = 0; y < height; ++y) {
// Merge a row of U and V into a row of UV.
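
The loop this hunk trails into is unchanged by the patch; it simply
invokes whichever row kernel was selected, once per row. A sketch,
assuming the strides are in elements as elsewhere in this function:

  for (y = 0; y < height; ++y) {
    // Merge a row of U and V into a row of UV.
    MergeUVRow_16(src_u, src_v, dst_uv, depth, width);
    src_u += src_stride_u;
    src_v += src_stride_v;
    dst_uv += dst_stride_uv;
  }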

source/row_sme.cc

@@ -312,6 +312,103 @@ __arm_locally_streaming void ARGBMultiplyRow_SME(const uint8_t* src_argb,
: "memory", "cc", "z0", "z1", "z2", "p0", "p1");
}
__arm_locally_streaming void MergeUVRow_SME(const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_uv,
int width) {
// Streaming-SVE only, no use of ZA tile.
int vl;
asm volatile(
"cntb %x[vl] \n"
"subs %w[width], %w[width], %w[vl] \n"
"b.lt 2f \n"
// Run bulk of computation with an all-true predicate to avoid predicate
// generation overhead.
"ptrue p0.b \n"
"1: \n"
"ld1b {z1.b}, p0/z, [%[src_u]] \n"
"ld1b {z2.b}, p0/z, [%[src_v]] \n"
"incb %[src_u] \n"
"incb %[src_v] \n"
"subs %w[width], %w[width], %w[vl] \n"
"st2b {z1.b, z2.b}, p0, [%[dst_uv]] \n"
"incb %[dst_uv], all, mul #2 \n"
"b.ge 1b \n"
"2: \n"
"adds %w[width], %w[width], %w[vl] \n"
"b.eq 99f \n"
// Calculate a predicate for the final iteration to deal with the tail.
"whilelt p0.b, wzr, %w[width] \n"
"ld1b {z1.b}, p0/z, [%[src_u]] \n"
"ld1b {z2.b}, p0/z, [%[src_v]] \n"
"subs %w[width], %w[width], %w[vl] \n"
"st2b {z1.b, z2.b}, p0, [%[dst_uv]] \n"
"99: \n"
: [src_u] "+r"(src_u), // %[src_u]
[src_v] "+r"(src_v), // %[src_v]
[dst_uv] "+r"(dst_uv), // %[dst_uv]
[width] "+r"(width), // %[width]
[vl] "=&r"(vl) // %[vl]
:
: "memory", "cc", "z0", "z1", "z2", "p0");
}
__arm_locally_streaming void MergeUVRow_16_SME(const uint16_t* src_u,
const uint16_t* src_v,
uint16_t* dst_uv,
int depth,
int width) {
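// Left-shift to scale depth-bit samples up to the full 16-bit range,
// e.g. depth=10 gives shift=6 so samples occupy the top ten bits.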
int shift = 16 - depth;
// Streaming-SVE only, no use of ZA tile.
int vl;
asm volatile(
"cnth %x[vl] \n"
"mov z0.h, %w[shift] \n"
"subs %w[width], %w[width], %w[vl] \n"
"b.lt 2f \n"
// Run bulk of computation with an all-true predicate to avoid predicate
// generation overhead.
"ptrue p0.h \n"
"1: \n"
"ld1h {z1.h}, p0/z, [%[src_u]] \n"
"ld1h {z2.h}, p0/z, [%[src_v]] \n"
"incb %[src_u] \n"
"incb %[src_v] \n"
"lsl z1.h, p0/m, z1.h, z0.h \n"
"lsl z2.h, p0/m, z2.h, z0.h \n"
"subs %w[width], %w[width], %w[vl] \n"
"st2h {z1.h, z2.h}, p0, [%[dst_uv]] \n"
"incb %[dst_uv], all, mul #2 \n"
"b.ge 1b \n"
"2: \n"
"adds %w[width], %w[width], %w[vl] \n"
"b.eq 99f \n"
// Calculate a predicate for the final iteration to deal with the tail.
"whilelt p0.h, wzr, %w[width] \n"
"ld1h {z1.h}, p0/z, [%[src_u]] \n"
"ld1h {z2.h}, p0/z, [%[src_v]] \n"
"lsl z1.h, p0/m, z1.h, z0.h \n"
"lsl z2.h, p0/m, z2.h, z0.h \n"
"subs %w[width], %w[width], %w[vl] \n"
"st2h {z1.h, z2.h}, p0, [%[dst_uv]] \n"
"99: \n"
: [src_u] "+r"(src_u), // %[src_u]
[src_v] "+r"(src_v), // %[src_v]
[dst_uv] "+r"(dst_uv), // %[dst_uv]
[width] "+r"(width), // %[width]
[vl] "=&r"(vl) // %[vl]
: [shift] "r"(shift) // %[shift]
: "memory", "cc", "z0", "z1", "z2", "p0");
}
#endif // !defined(LIBYUV_DISABLE_SME) && defined(CLANG_HAS_SME) &&
// defined(__aarch64__)
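
For readers more familiar with ACLE intrinsics than with SVE assembly,
here is a hypothetical equivalent of the 8-bit kernel above (a sketch,
not part of this patch): bulk iterations run under an all-true
predicate, and a single whilelt-generated predicate covers the tail,
which is what makes a separate Any kernel unnecessary.

  #include <arm_sve.h>

  static void MergeUVRow_SVE_Sketch(const uint8_t* src_u,
                                    const uint8_t* src_v,
                                    uint8_t* dst_uv, int width) {
    int vl = (int)svcntb();      // vector length in bytes
    svbool_t pg = svptrue_b8();  // all-true predicate for the bulk loop
    int i = 0;
    for (; i + vl <= width; i += vl) {
      svuint8_t u = svld1_u8(pg, src_u + i);
      svuint8_t v = svld1_u8(pg, src_v + i);
      // ST2-style interleaving store: UVUVUV...
      svst2_u8(pg, dst_uv + 2 * i, svcreate2_u8(u, v));
    }
    if (i < width) {  // predicated tail handles the remainder
      pg = svwhilelt_b8_s32(i, width);
      svuint8_t u = svld1_u8(pg, src_u + i);
      svuint8_t v = svld1_u8(pg, src_v + i);
      svst2_u8(pg, dst_uv + 2 * i, svcreate2_u8(u, v));
    }
  }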