[AArch64] Add SME implementation of Convert16To8Row

Mostly a straightforward port of the Neon code to Streaming-SVE; since
predication lets us handle the tail within the kernel, no separate `Any`
kernel is needed. SVE does have a "multiply returning high half"
instruction (UMULH), but reusing the Neon code's technique of avoiding
the widening multiply entirely is more performant here.

There is no benefit from this kernel when the SVE vector length is only
128 bits, so skip writing a non-streaming SVE implementation.

Change-Id: Ib12699c5b8b168d004ebc74c0281ea3772ca8d32
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6070786
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Justin Green <greenjustin@google.com>
Author: George Steed, 2024-11-16 21:34:39 +00:00; committed by Frank Barchard
parent 192b8c2238
commit 418b6df0de
4 changed files with 77 additions and 0 deletions
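
As a scalar model of what this row function computes (a sketch only:
Convert16To8 is a hypothetical helper written for illustration, and the
clamping mirrors the behaviour of libyuv's C kernel):

#include <stdint.h>

// dst = clamp255((src * scale) >> 16): the multiply-returning-high-half
// formulation that UMULH would implement directly.
static uint8_t Convert16To8(uint16_t v, int scale) {
  uint32_t mulh = ((uint32_t)v * (uint32_t)scale) >> 16;
  return mulh > 255 ? 255 : (uint8_t)mulh;
}

For scale = 1 << n (all the supported scale factors are powers of two),
mulh is simply v >> (16 - n), so the kernel can instead do a saturating
left shift by n - 8 (UQSHL) and take the high byte of each lane (UZP2),
giving the same saturated result with no multiply at all.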

include/libyuv/row.h

@@ -597,6 +597,7 @@ extern "C" {
#define HAS_MERGEUVROW_16_SME
#define HAS_MERGEUVROW_SME
#define HAS_MULTIPLYROW_16_SME
#define HAS_CONVERT16TO8ROW_SME
#endif
// The following are available on AArch64 platforms:
@@ -3484,6 +3485,10 @@ void Convert16To8Row_Any_NEON(const uint16_t* src_ptr,
                              uint8_t* dst_ptr,
                              int scale,
                              int width);
void Convert16To8Row_SME(const uint16_t* src_y,
                         uint8_t* dst_y,
                         int scale,
                         int width);
void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width);

source/convert.cc

@@ -689,6 +689,11 @@ int I010ToNV12(const uint16_t* src_y,
    }
  }
#endif
#if defined(HAS_CONVERT16TO8ROW_SME)
  if (TestCpuFlag(kCpuHasSME)) {
    Convert16To8Row = Convert16To8Row_SME;
  }
#endif
#if defined(HAS_CONVERT16TO8ROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
    Convert16To8Row = Convert16To8Row_Any_SSSE3;
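
For contrast, the existing Neon dispatch in these functions needs both the
`Any` wrapper and an alignment fast path, which the predicated SME kernel
collapses into a single assignment. A sketch of that Neon pattern (the
multiple-of-16 check is assumed from the Neon kernel's block size):

#if defined(HAS_CONVERT16TO8ROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    Convert16To8Row = Convert16To8Row_Any_NEON;  // handles any width
    if (IS_ALIGNED(width, 16)) {
      Convert16To8Row = Convert16To8Row_NEON;  // full vector blocks only
    }
  }
#endif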

source/planar_functions.cc

@@ -138,6 +138,11 @@ void Convert16To8Plane(const uint16_t* src_y,
    }
  }
#endif
#if defined(HAS_CONVERT16TO8ROW_SME)
  if (TestCpuFlag(kCpuHasSME)) {
    Convert16To8Row = Convert16To8Row_SME;
  }
#endif
#if defined(HAS_CONVERT16TO8ROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
    Convert16To8Row = Convert16To8Row_Any_SSSE3;

source/row_sme.cc

@@ -409,6 +409,68 @@ __arm_locally_streaming void MergeUVRow_16_SME(const uint16_t* src_u,
      : "memory", "cc", "z0", "z1", "z2", "p0");
}

// Use scale to convert lsb formats to msb, depending how many bits there are:
// 32768 = 9 bits = shr 1
// 16384 = 10 bits = shr 2
// 4096 = 12 bits = shr 4
// 256 = 16 bits = shr 8
__arm_locally_streaming void Convert16To8Row_SME(const uint16_t* src_y,
                                                 uint8_t* dst_y,
                                                 int scale,
                                                 int width) {
  // 15 - clz(scale), + 8 to shift result into the high half of the lane to
  // saturate, then we can just use UZP2 to narrow rather than a pair of
  // saturating narrow instructions.
  int shift = 23 - __builtin_clz((int32_t)scale);
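  // For example, scale = 16384 (10-bit input): clz = 17 so shift = 6, and a
  // saturating left shift by 6 followed by taking the high byte of each lane
  // computes clamp255(v >> 2).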
  int vl;
  asm volatile(
      "cntb %x[vl] \n"
      "dup z0.h, %w[shift] \n"
      "subs %w[width], %w[width], %w[vl] \n"
      "b.lt 2f \n"

      // Run bulk of computation with an all-true predicate to avoid predicate
      // generation overhead.
      "ptrue p0.b \n"
      "1: \n"
      "ld1h {z1.h}, p0/z, [%[src_y]] \n"
      "ld1h {z2.h}, p0/z, [%[src_y], #1, mul vl] \n"
      "incb %[src_y], all, mul #2 \n"
      "uqshl z1.h, p0/m, z1.h, z0.h \n"
      "uqshl z2.h, p0/m, z2.h, z0.h \n"
      "subs %w[width], %w[width], %w[vl] \n"
      "uzp2 z1.b, z1.b, z2.b \n"
      "st1b {z1.b}, p0, [%[dst_y]] \n"
      "incb %[dst_y] \n"
      "b.ge 1b \n"

      "2: \n"
      "adds %w[width], %w[width], %w[vl] \n"
      "b.eq 99f \n"

      // Calculate a predicate for the final iteration to deal with the tail.
      // We need separate predicates for the load and store instructions since
      // they are operating on different element sizes (.b vs .h).
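      // For example, with a 16-byte vector length and width == 3, p0.h covers
      // the first three halfword lanes, p1.h is all-false (cnth gives 8, and
      // 8 >= 3), and p2.b covers the first three bytes of the store.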
"cnth %x[vl] \n"
"whilelt p0.h, wzr, %w[width] \n"
"whilelt p1.h, %w[vl], %w[width] \n"
"whilelt p2.b, wzr, %w[width] \n"
"ld1h {z1.h}, p0/z, [%[src_y]] \n"
"ld1h {z2.h}, p1/z, [%[src_y], #1, mul vl] \n"
"uqshl z1.h, p0/m, z1.h, z0.h \n"
"uqshl z2.h, p1/m, z2.h, z0.h \n"
"uzp2 z1.b, z1.b, z2.b \n"
"st1b {z1.b}, p2, [%[dst_y]] \n"
"99: \n"
: [src_y] "+r"(src_y), // %[src_y]
[dst_y] "+r"(dst_y), // %[dst_y]
[width] "+r"(width), // %[width]
[vl] "=&r"(vl) // %[vl]
: [shift] "r"(shift) // %[shift]
: "cc", "memory", "z0", "z1", "z2", "p0", "p1", "p2");
}
#endif // !defined(LIBYUV_DISABLE_SME) && defined(CLANG_HAS_SME) &&
// defined(__aarch64__)
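
In practice the kernel is reached only through libyuv's dispatching wrappers.
A minimal usage sketch (assuming the standard libyuv headers and the current
Convert16To8Plane signature; TenBitPlaneToEightBit is a made-up wrapper name):

#include <stdint.h>
#include "libyuv/planar_functions.h"

// Convert a 10-bit (LSB-aligned) plane to 8 bits; scale 16384 selects the
// "shr 2" case from the table above.
void TenBitPlaneToEightBit(const uint16_t* src, int src_stride,
                           uint8_t* dst, int dst_stride,
                           int width, int height) {
  libyuv::Convert16To8Plane(src, src_stride, dst, dst_stride,
                            16384, width, height);
}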