[AArch64] Add SME implementation of Convert16To8Row

Mostly just a straightforward copy of the Neon code ported to Streaming-SVE; we can use predication to avoid needing an `Any` kernel. SVE has a "widening multiply get high half" instruction in UMULH, however using the same technique as the Neon code to avoid the need for a widening multiply at all is more performant here. There is no benefit from this kernel when the SVE vector length is only 128 bits, so skip writing a non-streaming SVE implementation. Change-Id: Ib12699c5b8b168d004ebc74c0281ea3772ca8d32 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6070786 Reviewed-by: Frank Barchard <fbarchard@chromium.org> Reviewed-by: Justin Green <greenjustin@google.com>
2026-02-07 02:09:50 +08:00 · 2024-11-16 21:34:39 +00:00 · 2024-11-16 21:34:39 +00:00 · 418b6df0de
commit 418b6df0de
parent 192b8c2238
4 changed files with 77 additions and 0 deletions
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@ -597,6 +597,7 @@ extern "C" {
 #define HAS_MERGEUVROW_16_SME
 #define HAS_MERGEUVROW_SME
 #define HAS_MULTIPLYROW_16_SME
 #define HAS_CONVERT16TO8ROW_SME
 #endif
 // The following are available on AArch64 platforms:
@ -3484,6 +3485,10 @@ void Convert16To8Row_Any_NEON(const uint16_t* src_ptr,
                              uint8_t* dst_ptr,
                              int scale,
                              int width);
 // SME variant of the 16-bit to 8-bit row conversion (defined in
 // source/row_sme.cc). Takes the row width directly; no separate `Any`
 // wrapper is declared for it.
 void Convert16To8Row_SME(const uint16_t* src_y,
                          uint8_t* dst_y,
                          int scale,
                          int width);
 void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
 void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width);
--- a/source/convert.cc
+++ b/source/convert.cc
@ -689,6 +689,11 @@ int I010ToNV12(const uint16_t* src_y,
    }
  }
 #endif
 #if defined(HAS_CONVERT16TO8ROW_SME)
  if (TestCpuFlag(kCpuHasSME)) {
    Convert16To8Row = Convert16To8Row_SME;
  }
 #endif
 #if defined(HAS_CONVERT16TO8ROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
    Convert16To8Row = Convert16To8Row_Any_SSSE3;
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@ -138,6 +138,11 @@ void Convert16To8Plane(const uint16_t* src_y,
    }
  }
 #endif
 #if defined(HAS_CONVERT16TO8ROW_SME)
  if (TestCpuFlag(kCpuHasSME)) {
    Convert16To8Row = Convert16To8Row_SME;
  }
 #endif
 #if defined(HAS_CONVERT16TO8ROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
    Convert16To8Row = Convert16To8Row_Any_SSSE3;
--- a/source/row_sme.cc
+++ b/source/row_sme.cc
@ -409,6 +409,68 @@ __arm_locally_streaming void MergeUVRow_16_SME(const uint16_t* src_u,
      : "memory", "cc", "z0", "z1", "z2", "p0");
 }
 // Convert16To8Row_SME: narrow a row of 16-bit samples to 8-bit samples using
 // streaming-mode SVE instructions. Each 16-bit lane is shifted left (with
 // unsigned saturation) so that the wanted 8 bits land in the high byte of the
 // lane, and the high bytes are then stored to dst_y.
 //
 //   src_y: source row of 16-bit samples.
 //   dst_y: destination row of 8-bit samples.
 //   scale: selects the shift, see the table below. Only the position of the
 //          highest set bit is used (via __builtin_clz), so it must be
 //          non-zero: __builtin_clz(0) is undefined.
 //   width: number of samples to convert. A width of 0 stores nothing.
 //
 // Use scale to convert lsb formats to msb, depending how many bits there are:
 // 32768 = 9 bits = shr 1
 // 16384 = 10 bits = shr 2
 // 4096 = 12 bits = shr 4
 // 256 = 16 bits = shr 8
 __arm_locally_streaming void Convert16To8Row_SME(const uint16_t* src_y,
                                                 uint8_t* dst_y,
                                                 int scale,
                                                 int width) {
  // 15 - clz(scale), + 8 to shift result into the high half of the lane to
  // saturate, then we can just use UZP2 to narrow rather than a pair of
  // saturating narrow instructions.
  // Example: scale = 16384 -> clz = 17 -> shift = 6, which moves a 10-bit
  // sample into bits 15..6 of the lane so its top 8 bits form the high byte.
  int shift = 23 - __builtin_clz((int32_t)scale);
  // Scratch register for the asm: holds the vector length in bytes during the
  // main loop, then the vector length in 16-bit lanes for the tail.
  int vl;
  asm volatile(
      // vl = bytes per vector = number of output samples per loop iteration.
      "cntb     %x[vl]                                  \n"
      // Broadcast the shift amount to every 16-bit lane of z0.
      "dup      z0.h, %w[shift]                         \n"
      // Skip the main loop if fewer than vl samples are available.
      "subs     %w[width], %w[width], %w[vl]            \n"
      "b.lt     2f                                      \n"
      // Run bulk of computation with an all-true predicate to avoid predicate
      // generation overhead.
      "ptrue    p0.b                                    \n"
      "1:                                               \n"
      // Load two full vectors of 16-bit samples (vl samples in total) and
      // advance the source pointer by two vector lengths.
      "ld1h     {z1.h}, p0/z, [%[src_y]]                \n"
      "ld1h     {z2.h}, p0/z, [%[src_y], #1, mul vl]    \n"
      "incb     %[src_y], all, mul #2                   \n"
      // Unsigned saturating shift left by the per-lane amount in z0.
      "uqshl    z1.h, p0/m, z1.h, z0.h                  \n"
      "uqshl    z2.h, p0/m, z2.h, z0.h                  \n"
      "subs     %w[width], %w[width], %w[vl]            \n"
      // Keep the odd-numbered bytes of z1 then z2, i.e. the high byte of each
      // 16-bit lane, giving one full vector of 8-bit results.
      "uzp2     z1.b, z1.b, z2.b                        \n"
      "st1b     {z1.b}, p0, [%[dst_y]]                  \n"
      "incb     %[dst_y]                                \n"
      "b.ge     1b                                      \n"
      "2:                                               \n"
      // Undo the last subtraction to recover the number of samples left; exit
      // if there are none.
      "adds     %w[width], %w[width], %w[vl]            \n"
      "b.eq     99f                                     \n"
      // Calculate a predicate for the final iteration to deal with the tail.
      // We need separate predicates for the load and store instructions since
      // they are operating on different element sizes (.b vs .h).
      // vl is reused here as the number of 16-bit lanes per vector, so that
      // p1 covers the samples handled by the second load.
      "cnth     %x[vl]                                  \n"
      "whilelt  p0.h, wzr, %w[width]                    \n"
      "whilelt  p1.h, %w[vl], %w[width]                 \n"
      "whilelt  p2.b, wzr, %w[width]                    \n"
      // Same load / shift / narrow / store sequence as the main loop, but
      // predicated; inactive load lanes are zeroed by the /z loads.
      "ld1h     {z1.h}, p0/z, [%[src_y]]                \n"
      "ld1h     {z2.h}, p1/z, [%[src_y], #1, mul vl]    \n"
      "uqshl    z1.h, p0/m, z1.h, z0.h                  \n"
      "uqshl    z2.h, p1/m, z2.h, z0.h                  \n"
      "uzp2     z1.b, z1.b, z2.b                        \n"
      "st1b     {z1.b}, p2, [%[dst_y]]                  \n"
      "99:                                              \n"
      : [src_y] "+r"(src_y),  // %[src_y]
        [dst_y] "+r"(dst_y),  // %[dst_y]
        [width] "+r"(width),  // %[width]
        [vl] "=&r"(vl)        // %[vl]
      : [shift] "r"(shift)    // %[shift]
      : "cc", "memory", "z0", "z1", "z2", "p0", "p1", "p2");
 }
 #endif  // !defined(LIBYUV_DISABLE_SME) && defined(CLANG_HAS_SME) &&
        // defined(__aarch64__)