[AArch64] Add SME implementations of InterpolateRow{,_16,_16To8}

InterpolateRow_SME and InterpolateRow_16_SME need special cases for a source_y_fraction of 256, since that value would overflow a byte; in that case the result is simply a copy of the second source row (a memcpy-style row copy) instead of an interpolation. InterpolateRow_16To8_SME is never called with a source_y_fraction value of 256, so there is no need for a special case there. Change-Id: I67805b5db2c411acb93ada626cf414b35620f467 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6074375 Reviewed-by: Justin Green <greenjustin@google.com> Reviewed-by: Frank Barchard <fbarchard@chromium.org>
2025-12-06 16:56:55 +08:00 · 2024-11-18 11:33:16 +00:00 · 2024-11-18 11:33:16 +00:00 · c2e7f8389a
commit c2e7f8389a
parent 2d8652f3e7
8 changed files with 501 additions and 0 deletions
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@ -596,6 +596,9 @@ extern "C" {
 #define HAS_COPYROW_SME
 #define HAS_I422TOARGBROW_SME
 #define HAS_I444TOARGBROW_SME
+#define HAS_INTERPOLATEROW_16_SME
+#define HAS_INTERPOLATEROW_16TO8_SME
+#define HAS_INTERPOLATEROW_SME
 #define HAS_MERGEUVROW_16_SME
 #define HAS_MERGEUVROW_SME
 #define HAS_MULTIPLYROW_16_SME
@ -6468,6 +6471,11 @@ void InterpolateRow_NEON(uint8_t* dst_ptr,
                         ptrdiff_t src_stride,
                         int dst_width,
                         int source_y_fraction);
+void InterpolateRow_SME(uint8_t* dst_ptr,
+                        const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        int dst_width,
+                        int source_y_fraction);
 void InterpolateRow_MSA(uint8_t* dst_ptr,
                        const uint8_t* src_ptr,
                        ptrdiff_t src_stride,
@ -6524,6 +6532,11 @@ void InterpolateRow_16_Any_NEON(uint16_t* dst_ptr,
                                ptrdiff_t src_stride,
                                int width,
                                int source_y_fraction);
+void InterpolateRow_16_SME(uint16_t* dst_ptr,
+                           const uint16_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           int width,
+                           int source_y_fraction);

 void InterpolateRow_16To8_C(uint8_t* dst_ptr,
                            const uint16_t* src_ptr,
@ -6543,6 +6556,12 @@ void InterpolateRow_16To8_Any_NEON(uint8_t* dst_ptr,
                                   int scale,
                                   int width,
                                   int source_y_fraction);
+void InterpolateRow_16To8_SME(uint8_t* dst_ptr,
+                              const uint16_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              int scale,
+                              int width,
+                              int source_y_fraction);
 void InterpolateRow_16To8_AVX2(uint8_t* dst_ptr,
                               const uint16_t* src_ptr,
                               ptrdiff_t src_stride,
--- a/source/convert.cc
+++ b/source/convert.cc
@ -1248,6 +1248,11 @@ int I422ToNV21(const uint8_t* src_y,
    }
  }
 #endif
+#if defined(HAS_INTERPOLATEROW_SME)
+  if (TestCpuFlag(kCpuHasSME)) {
+    InterpolateRow = InterpolateRow_SME;
+  }
+#endif
 #if defined(HAS_INTERPOLATEROW_MSA)
  if (TestCpuFlag(kCpuHasMSA)) {
    InterpolateRow = InterpolateRow_Any_MSA;
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@ -4443,6 +4443,11 @@ int InterpolatePlane(const uint8_t* src0,
    }
  }
 #endif
+#if defined(HAS_INTERPOLATEROW_SME)
+  if (TestCpuFlag(kCpuHasSME)) {
+    InterpolateRow = InterpolateRow_SME;
+  }
+#endif
 #if defined(HAS_INTERPOLATEROW_MSA)
  if (TestCpuFlag(kCpuHasMSA)) {
    InterpolateRow = InterpolateRow_Any_MSA;
@ -4528,6 +4533,11 @@ int InterpolatePlane_16(const uint16_t* src0,
    }
  }
 #endif
+#if defined(HAS_INTERPOLATEROW_16_SME)
+  if (TestCpuFlag(kCpuHasSME)) {
+    InterpolateRow_16 = InterpolateRow_16_SME;
+  }
+#endif
 #if defined(HAS_INTERPOLATEROW_16_MSA)
  if (TestCpuFlag(kCpuHasMSA)) {
    InterpolateRow_16 = InterpolateRow_16_Any_MSA;
@ -5732,6 +5742,11 @@ int UYVYToNV12(const uint8_t* src_uyvy,
    }
  }
 #endif
+#if defined(HAS_INTERPOLATEROW_SME)
+  if (TestCpuFlag(kCpuHasSME)) {
+    InterpolateRow = InterpolateRow_SME;
+  }
+#endif
 #if defined(HAS_INTERPOLATEROW_MSA)
  if (TestCpuFlag(kCpuHasMSA)) {
    InterpolateRow = InterpolateRow_Any_MSA;
--- a/source/row_sme.cc
+++ b/source/row_sme.cc
@ -510,6 +510,408 @@ __arm_locally_streaming void CopyRow_SME(const uint8_t* src,
      : "memory", "cc", "z0", "p0");
 }

+__arm_locally_streaming static void HalfRow_SME(uint8_t* dst_ptr,
+                                                const uint8_t* src_ptr,
+                                                ptrdiff_t src_stride,
+                                                int width) {
+  const uint8_t* src_ptr1 = src_ptr + src_stride;  // Second input row.
+  // Writes the per-byte rounding average (URHADD) of both rows to dst_ptr.
+  int vl;  // Set by CNTB below: number of byte elements in one vector.
+  asm volatile(
+      "cntb     %x[vl]                                  \n"
+      "subs     %w[width], %w[width], %w[vl]            \n"
+      "b.lt     2f                                      \n"  // Fewer than vl bytes: tail only.
+
+      // Run bulk of computation with an all-true predicate to avoid predicate
+      // generation overhead.
+      "ptrue    p0.b                                    \n"
+      "1:                                               \n"
+      "ld1b     {z2.b}, p0/z, [%[src_ptr]]              \n"
+      "ld1b     {z3.b}, p0/z, [%[src_ptr1]]             \n"
+      "incb     %[src_ptr]                              \n"
+      "incb     %[src_ptr1]                             \n"
+      "urhadd   z2.b, p0/m, z2.b, z3.b                  \n"
+      "subs     %w[width], %w[width], %w[vl]            \n"
+      "st1b     {z2.b}, p0, [%[dst_ptr]]                \n"
+      "incb     %[dst_ptr]                              \n"
+      "b.ge     1b                                      \n"
+
+      "2:                                               \n"
+      "adds     %w[width], %w[width], %w[vl]            \n"  // Undo the last subs: width = leftover bytes.
+      "b.eq     99f                                     \n"  // Nothing left over.
+
+      // Calculate a predicate for the final iteration to deal with the tail.
+      "whilelt  p0.b, wzr, %w[width]                    \n"
+      "ld1b     {z2.b}, p0/z, [%[src_ptr]]              \n"
+      "ld1b     {z3.b}, p0/z, [%[src_ptr1]]             \n"
+      "urhadd   z2.b, p0/m, z2.b, z3.b                  \n"
+      "subs     %w[width], %w[width], %w[vl]            \n"  // NOTE(review): neither width nor the flags are read after this; looks redundant.
+      "st1b     {z2.b}, p0, [%[dst_ptr]]                \n"
+
+      "99:                                              \n"
+      : [src_ptr] "+r"(src_ptr),    // %[src_ptr]
+        [src_ptr1] "+r"(src_ptr1),  // %[src_ptr1]
+        [dst_ptr] "+r"(dst_ptr),    // %[dst_ptr]
+        [width] "+r"(width),        // %[width]
+        [vl] "=&r"(vl)              // %[vl]
+      :
+      : "cc", "memory", "z0", "z1", "z2", "z3", "p0");  // NOTE(review): z0/z1 are listed but not used by this asm.
+}
+
+__arm_locally_streaming void InterpolateRow_SME(uint8_t* dst_ptr,
+                                                const uint8_t* src_ptr,
+                                                ptrdiff_t src_stride,
+                                                int width,
+                                                int source_y_fraction) {
+  int y1_fraction = source_y_fraction;  // Weight applied to the second row (src_ptr1).
+  int y0_fraction = 256 - y1_fraction;  // Weight applied to the first row (src_ptr).
+  const uint8_t* src_ptr1 = src_ptr + src_stride;
+  // The weights are broadcast as bytes below, so 0/256 must be handled first.
+  if (y0_fraction == 0) {  // source_y_fraction == 256: output is the second row.
+    CopyRow_SME(src_ptr1, dst_ptr, width);
+    return;
+  }
+  if (y0_fraction == 128) {  // Equal weights: plain rounding average.
+    HalfRow_SME(dst_ptr, src_ptr, src_stride, width);
+    return;
+  }
+  if (y0_fraction == 256) {  // source_y_fraction == 0: output is the first row.
+    CopyRow_SME(src_ptr, dst_ptr, width);
+    return;
+  }
+  // General case, per byte: dst = (row0 * y0 + row1 * y1 + 128) >> 8.
+  int vl;  // Set by CNTB below: number of byte elements in one vector.
+  asm volatile(
+      "cntb     %x[vl]                                  \n"
+      "dup      z0.b, %w[y0_fraction]                   \n"
+      "dup      z1.b, %w[y1_fraction]                   \n"
+      "subs     %w[width], %w[width], %w[vl]            \n"
+      "b.lt     2f                                      \n"  // Fewer than vl bytes: tail only.
+
+      // Run bulk of computation with an all-true predicate to avoid predicate
+      // generation overhead.
+      "ptrue    p0.b                                    \n"
+      "1:                                               \n"
+      "ld1b     {z2.b}, p0/z, [%[src_ptr]]              \n"
+      "ld1b     {z3.b}, p0/z, [%[src_ptr1]]             \n"
+      "incb     %[src_ptr]                              \n"
+      "incb     %[src_ptr1]                             \n"
+      "umullb   z4.h, z2.b, z0.b                        \n"  // Even bytes: row0 * y0, widened to 16 bits.
+      "umullt   z2.h, z2.b, z0.b                        \n"  // Odd bytes: row0 * y0, widened to 16 bits.
+      "subs     %w[width], %w[width], %w[vl]            \n"
+      "umlalb   z4.h, z3.b, z1.b                        \n"  // Even bytes: += row1 * y1.
+      "umlalt   z2.h, z3.b, z1.b                        \n"  // Odd bytes: += row1 * y1.
+      "rshrnb   z3.b, z4.h, #8                          \n"  // Rounding >> 8 into the even bytes.
+      "rshrnt   z3.b, z2.h, #8                          \n"  // Rounding >> 8 into the odd bytes.
+      "st1b     {z3.b}, p0, [%[dst_ptr]]                \n"
+      "incb     %[dst_ptr]                              \n"
+      "b.ge     1b                                      \n"
+
+      "2:                                               \n"
+      "adds     %w[width], %w[width], %w[vl]            \n"  // Undo the last subs: width = leftover bytes.
+      "b.eq     99f                                     \n"  // Nothing left over.
+
+      // Calculate a predicate for the final iteration to deal with the tail.
+      "whilelt  p0.b, wzr, %w[width]                    \n"
+      "ld1b     {z2.b}, p0/z, [%[src_ptr]]              \n"
+      "ld1b     {z3.b}, p0/z, [%[src_ptr1]]             \n"
+      "umullb   z4.h, z2.b, z0.b                        \n"
+      "umullt   z2.h, z2.b, z0.b                        \n"
+      "umlalb   z4.h, z3.b, z1.b                        \n"
+      "umlalt   z2.h, z3.b, z1.b                        \n"
+      "rshrnb   z3.b, z4.h, #8                          \n"
+      "rshrnt   z3.b, z2.h, #8                          \n"
+      "st1b     {z3.b}, p0, [%[dst_ptr]]                \n"
+
+      "99:                                              \n"
+      : [src_ptr] "+r"(src_ptr),         // %[src_ptr]
+        [src_ptr1] "+r"(src_ptr1),       // %[src_ptr1]
+        [dst_ptr] "+r"(dst_ptr),         // %[dst_ptr]
+        [width] "+r"(width),             // %[width]
+        [vl] "=&r"(vl)                   // %[vl]
+      : [y0_fraction] "r"(y0_fraction),  // %[y0_fraction]
+        [y1_fraction] "r"(y1_fraction)   // %[y1_fraction]
+      : "cc", "memory", "z0", "z1", "z2", "z3", "z4", "p0");
+}
+
+__arm_locally_streaming static void HalfRow_16_SME(uint16_t* dst_ptr,
+                                                   const uint16_t* src_ptr,
+                                                   ptrdiff_t src_stride,
+                                                   int width) {
+  int y1_fraction = 128;  // NOTE(review): not referenced by the asm below.
+  int y0_fraction = 256 - y1_fraction;  // NOTE(review): not referenced by the asm below.
+  const uint16_t* src_ptr1 = src_ptr + src_stride;  // Second row; stride counts uint16_t elements.
+  // Writes the per-element rounding average (URHADD) of both rows to dst_ptr.
+  int vl;  // Set by CNTH below: number of 16-bit elements in one vector.
+  asm volatile(
+      "cnth     %x[vl]                                  \n"
+      "subs     %w[width], %w[width], %w[vl]            \n"
+      "b.lt     2f                                      \n"  // Fewer than vl elements: tail only.
+
+      // Run bulk of computation with an all-true predicate to avoid predicate
+      // generation overhead.
+      "ptrue    p0.h                                    \n"
+      "1:                                               \n"
+      "ld1h     {z2.h}, p0/z, [%[src_ptr]]              \n"
+      "ld1h     {z3.h}, p0/z, [%[src_ptr1]]             \n"
+      "incb     %[src_ptr]                              \n"  // Advance by one full vector in bytes.
+      "incb     %[src_ptr1]                             \n"
+      "urhadd   z2.h, p0/m, z2.h, z3.h                  \n"
+      "subs     %w[width], %w[width], %w[vl]            \n"
+      "st1h     {z2.h}, p0, [%[dst_ptr]]                \n"
+      "incb     %[dst_ptr]                              \n"
+      "b.ge     1b                                      \n"
+
+      "2:                                               \n"
+      "adds     %w[width], %w[width], %w[vl]            \n"  // Undo the last subs: width = leftover elements.
+      "b.eq     99f                                     \n"  // Nothing left over.
+
+      // Calculate a predicate for the final iteration to deal with the tail.
+      "whilelt  p0.h, wzr, %w[width]                    \n"
+      "ld1h     {z2.h}, p0/z, [%[src_ptr]]              \n"
+      "ld1h     {z3.h}, p0/z, [%[src_ptr1]]             \n"
+      "urhadd   z2.h, p0/m, z2.h, z3.h                  \n"
+      "st1h     {z2.h}, p0, [%[dst_ptr]]                \n"
+
+      "99:                                              \n"
+      : [src_ptr] "+r"(src_ptr),    // %[src_ptr]
+        [src_ptr1] "+r"(src_ptr1),  // %[src_ptr1]
+        [dst_ptr] "+r"(dst_ptr),    // %[dst_ptr]
+        [width] "+r"(width),        // %[width]
+        [vl] "=&r"(vl)              // %[vl]
+      :
+      : "cc", "memory", "z0", "z1", "z2", "z3", "p0");  // NOTE(review): z0/z1 are listed but not used by this asm.
+}
+
+__arm_locally_streaming void InterpolateRow_16_SME(uint16_t* dst_ptr,
+                                                   const uint16_t* src_ptr,
+                                                   ptrdiff_t src_stride,
+                                                   int width,
+                                                   int source_y_fraction) {
+  int y1_fraction = source_y_fraction;  // Weight applied to the second row (src_ptr1).
+  int y0_fraction = 256 - y1_fraction;  // Weight applied to the first row (src_ptr).
+  const uint16_t* src_ptr1 = src_ptr + src_stride;  // Stride counts uint16_t elements.
+  // Single-row and equal-weight cases are dispatched to simpler kernels.
+  if (y0_fraction == 0) {  // source_y_fraction == 256: output is the second row.
+    CopyRow_SME((const uint8_t*)src_ptr1, (uint8_t*)dst_ptr,
+                width * sizeof(uint16_t));  // CopyRow_SME takes a byte count.
+    return;
+  }
+  if (y0_fraction == 128) {  // Equal weights: plain rounding average.
+    HalfRow_16_SME(dst_ptr, src_ptr, src_stride, width);
+    return;
+  }
+  if (y0_fraction == 256) {  // source_y_fraction == 0: output is the first row.
+    CopyRow_SME((const uint8_t*)src_ptr, (uint8_t*)dst_ptr,
+                width * sizeof(uint16_t));  // CopyRow_SME takes a byte count.
+    return;
+  }
+  // General case, per element: dst = (row0 * y0 + row1 * y1 + 128) >> 8.
+  int vl;  // Set by CNTH below: number of 16-bit elements in one vector.
+  asm volatile(
+      "cnth     %x[vl]                                  \n"
+      "subs     %w[width], %w[width], %w[vl]            \n"
+      "dup      z0.h, %w[y0_fraction]                   \n"
+      "dup      z1.h, %w[y1_fraction]                   \n"
+      "b.lt     2f                                      \n"  // Uses the flags from subs above; dup leaves them untouched.
+
+      // Run bulk of computation with an all-true predicate to avoid predicate
+      // generation overhead.
+      "ptrue    p0.h                                    \n"
+      "1:                                               \n"
+      "ld1h     {z2.h}, p0/z, [%[src_ptr]]              \n"
+      "ld1h     {z3.h}, p0/z, [%[src_ptr1]]             \n"
+      "incb     %[src_ptr]                              \n"  // Advance by one full vector in bytes.
+      "incb     %[src_ptr1]                             \n"
+      "umullb   z4.s, z2.h, z0.h                        \n"  // Even elements: row0 * y0, widened to 32 bits.
+      "umullt   z2.s, z2.h, z0.h                        \n"  // Odd elements: row0 * y0, widened to 32 bits.
+      "subs     %w[width], %w[width], %w[vl]            \n"
+      "umlalb   z4.s, z3.h, z1.h                        \n"  // Even elements: += row1 * y1.
+      "umlalt   z2.s, z3.h, z1.h                        \n"  // Odd elements: += row1 * y1.
+      "rshrnb   z3.h, z4.s, #8                          \n"  // Rounding >> 8 into the even elements.
+      "rshrnt   z3.h, z2.s, #8                          \n"  // Rounding >> 8 into the odd elements.
+      "st1h     {z3.h}, p0, [%[dst_ptr]]                \n"
+      "incb     %[dst_ptr]                              \n"
+      "b.ge     1b                                      \n"
+
+      "2:                                               \n"
+      "adds     %w[width], %w[width], %w[vl]            \n"  // Undo the last subs: width = leftover elements.
+      "b.eq     99f                                     \n"  // Nothing left over.
+
+      // Calculate a predicate for the final iteration to deal with the tail.
+      "whilelt  p0.h, wzr, %w[width]                    \n"
+      "ld1h     {z2.h}, p0/z, [%[src_ptr]]              \n"
+      "ld1h     {z3.h}, p0/z, [%[src_ptr1]]             \n"
+      "umullb   z4.s, z2.h, z0.h                        \n"
+      "umullt   z2.s, z2.h, z0.h                        \n"
+      "umlalb   z4.s, z3.h, z1.h                        \n"
+      "umlalt   z2.s, z3.h, z1.h                        \n"
+      "rshrnb   z3.h, z4.s, #8                          \n"
+      "rshrnt   z3.h, z2.s, #8                          \n"
+      "st1h     {z3.h}, p0, [%[dst_ptr]]                \n"
+
+      "99:                                              \n"
+      : [src_ptr] "+r"(src_ptr),         // %[src_ptr]
+        [src_ptr1] "+r"(src_ptr1),       // %[src_ptr1]
+        [dst_ptr] "+r"(dst_ptr),         // %[dst_ptr]
+        [width] "+r"(width),             // %[width]
+        [vl] "=&r"(vl)                   // %[vl]
+      : [y0_fraction] "r"(y0_fraction),  // %[y0_fraction]
+        [y1_fraction] "r"(y1_fraction)   // %[y1_fraction]
+      : "cc", "memory", "z0", "z1", "z2", "z3", "z4", "p0");
+}
+
+__arm_locally_streaming static void HalfRow_16To8_SME(uint8_t* dst_ptr,
+                                                      const uint16_t* src_ptr,
+                                                      ptrdiff_t src_stride,
+                                                      int scale,
+                                                      int width) {
+  int y1_fraction = 128;  // NOTE(review): not referenced by the asm below.
+  int y0_fraction = 256 - y1_fraction;  // NOTE(review): not referenced by the asm below.
+  const uint16_t* src_ptr1 = src_ptr + src_stride;  // Second row; stride counts uint16_t elements.
+
+  // 15 - clz(scale), + 8 to shift result into the high half of the lane to
+  // saturate; SHRNB #8 then moves that high byte to the bottom of each 16-bit
+  // lane so the narrowing ST1B can store one byte per lane.
+  int shift = 23 - __builtin_clz((int32_t)scale);
+  // Averages both 16-bit rows (URHADD), then scales each result down to 8 bits.
+  int vl;  // Set by CNTH below: number of 16-bit elements in one vector.
+  asm volatile(
+      "cnth     %x[vl]                                  \n"
+      "dup      z31.h, %w[shift]                        \n"
+      "subs     %w[width], %w[width], %w[vl]            \n"
+      "b.lt     2f                                      \n"  // Fewer than vl elements: tail only.
+
+      // Run bulk of computation with an all-true predicate to avoid predicate
+      // generation overhead.
+      "ptrue    p0.h                                    \n"
+      "1:                                               \n"
+      "ld1h     {z2.h}, p0/z, [%[src_ptr]]              \n"
+      "ld1h     {z3.h}, p0/z, [%[src_ptr1]]             \n"
+      "incb     %[src_ptr]                              \n"  // Advance by one full vector in bytes.
+      "incb     %[src_ptr1]                             \n"
+      "urhadd   z2.h, p0/m, z2.h, z3.h                  \n"
+      "subs     %w[width], %w[width], %w[vl]            \n"
+      "uqshl    z2.h, p0/m, z2.h, z31.h                 \n"  // Saturating left shift by the per-lane shift amount.
+      "shrnb    z2.b, z2.h, #8                          \n"  // High byte of each lane moves to its low byte.
+      "st1b     {z2.h}, p0, [%[dst_ptr]]                \n"  // Stores the low byte of each 16-bit lane.
+      "inch     %[dst_ptr]                              \n"  // One output byte per 16-bit lane.
+      "b.ge     1b                                      \n"
+
+      "2:                                               \n"
+      "adds     %w[width], %w[width], %w[vl]            \n"  // Undo the last subs: width = leftover elements.
+      "b.eq     99f                                     \n"  // Nothing left over.
+
+      // Calculate a predicate for the final iteration to deal with the tail.
+      "whilelt  p0.h, wzr, %w[width]                    \n"
+      "ld1h     {z2.h}, p0/z, [%[src_ptr]]              \n"
+      "ld1h     {z3.h}, p0/z, [%[src_ptr1]]             \n"
+      "urhadd   z2.h, p0/m, z2.h, z3.h                  \n"
+      "uqshl    z2.h, p0/m, z2.h, z31.h                 \n"
+      "shrnb    z2.b, z2.h, #8                          \n"
+      "st1b     {z2.h}, p0, [%[dst_ptr]]                \n"
+
+      "99:                                              \n"
+      : [src_ptr] "+r"(src_ptr),    // %[src_ptr]
+        [src_ptr1] "+r"(src_ptr1),  // %[src_ptr1]
+        [dst_ptr] "+r"(dst_ptr),    // %[dst_ptr]
+        [width] "+r"(width),        // %[width]
+        [vl] "=&r"(vl)              // %[vl]
+      : [shift] "r"(shift)          // %[shift]
+      : "cc", "memory", "z0", "z1", "z2", "z3", "z31", "p0");  // NOTE(review): z0/z1 are listed but not used by this asm.
+}
+
+// Use scale to convert lsb formats to msb, depending how many bits there are:
+// 32768 = 9 bits
+// 16384 = 10 bits
+// 4096 = 12 bits
+// 256 = 16 bits
+// TODO(fbarchard): change scale to bits
+__arm_locally_streaming void InterpolateRow_16To8_SME(uint8_t* dst_ptr,
+                                                      const uint16_t* src_ptr,
+                                                      ptrdiff_t src_stride,
+                                                      int scale,
+                                                      int width,
+                                                      int source_y_fraction) {
+  int y1_fraction = source_y_fraction;  // Weight applied to the second row (src_ptr1).
+  int y0_fraction = 256 - y1_fraction;  // Weight applied to the first row (src_ptr).
+  const uint16_t* src_ptr1 = src_ptr + src_stride;  // Stride counts uint16_t elements.
+
+  // Callers never pass source_y_fraction == 256 (y0_fraction == 0), so that
+  // case has no dedicated path here.
+  if (y0_fraction == 128) {  // Equal weights: rounding average, then scale to 8 bits.
+    HalfRow_16To8_SME(dst_ptr, src_ptr, src_stride, scale, width);
+    return;
+  }
+  if (y0_fraction == 256) {  // source_y_fraction == 0: only the first row is converted.
+    Convert16To8Row_SME(src_ptr, dst_ptr, scale, width);
+    return;
+  }
+
+  // 15 - clz(scale), + 8 to shift result into the high half of the lane to
+  // saturate; SHRNB #8 then moves that high byte to the bottom of each 16-bit
+  // lane so the narrowing ST1B can store one byte per lane.
+  int shift = 23 - __builtin_clz((int32_t)scale);
+  // Per element: blend = (row0 * y0 + row1 * y1 + 128) >> 8, then scale to 8 bits.
+  int vl;  // Set by CNTH below: number of 16-bit elements in one vector.
+  asm volatile(
+      "cnth     %x[vl]                                  \n"
+      "dup      z31.h, %w[shift]                        \n"
+      "dup      z0.h, %w[y0_fraction]                   \n"
+      "dup      z1.h, %w[y1_fraction]                   \n"
+      "subs     %w[width], %w[width], %w[vl]            \n"
+      "b.lt     2f                                      \n"  // Fewer than vl elements: tail only.
+
+      // Run bulk of computation with an all-true predicate to avoid predicate
+      // generation overhead.
+      "ptrue    p0.h                                    \n"
+      "1:                                               \n"
+      "ld1h     {z2.h}, p0/z, [%[src_ptr]]              \n"
+      "ld1h     {z3.h}, p0/z, [%[src_ptr1]]             \n"
+      "incb     %[src_ptr]                              \n"  // Advance by one full vector in bytes.
+      "incb     %[src_ptr1]                             \n"
+      "umullb   z4.s, z2.h, z0.h                        \n"  // Even elements: row0 * y0, widened to 32 bits.
+      "umullt   z2.s, z2.h, z0.h                        \n"  // Odd elements: row0 * y0, widened to 32 bits.
+      "subs     %w[width], %w[width], %w[vl]            \n"
+      "umlalb   z4.s, z3.h, z1.h                        \n"  // Even elements: += row1 * y1.
+      "umlalt   z2.s, z3.h, z1.h                        \n"  // Odd elements: += row1 * y1.
+      "rshrnb   z3.h, z4.s, #8                          \n"  // Rounding >> 8 into the even elements.
+      "rshrnt   z3.h, z2.s, #8                          \n"  // Rounding >> 8 into the odd elements.
+      "uqshl    z3.h, p0/m, z3.h, z31.h                 \n"  // Saturating left shift by the per-lane shift amount.
+      "shrnb    z3.b, z3.h, #8                          \n"  // High byte of each lane moves to its low byte.
+      "st1b     {z3.h}, p0, [%[dst_ptr]]                \n"  // Stores the low byte of each 16-bit lane.
+      "inch     %[dst_ptr]                              \n"  // One output byte per 16-bit lane.
+      "b.ge     1b                                      \n"
+
+      "2:                                               \n"
+      "adds     %w[width], %w[width], %w[vl]            \n"  // Undo the last subs: width = leftover elements.
+      "b.eq     99f                                     \n"  // Nothing left over.
+
+      // Calculate a predicate for the final iteration to deal with the tail.
+      "whilelt  p0.h, wzr, %w[width]                    \n"
+      "ld1h     {z2.h}, p0/z, [%[src_ptr]]              \n"
+      "ld1h     {z3.h}, p0/z, [%[src_ptr1]]             \n"
+      "umullb   z4.s, z2.h, z0.h                        \n"
+      "umullt   z2.s, z2.h, z0.h                        \n"
+      "umlalb   z4.s, z3.h, z1.h                        \n"
+      "umlalt   z2.s, z3.h, z1.h                        \n"
+      "rshrnb   z3.h, z4.s, #8                          \n"
+      "rshrnt   z3.h, z2.s, #8                          \n"
+      "uqshl    z3.h, p0/m, z3.h, z31.h                 \n"
+      "shrnb    z3.b, z3.h, #8                          \n"
+      "st1b     {z3.h}, p0, [%[dst_ptr]]                \n"
+
+      "99:                                              \n"
+      : [src_ptr] "+r"(src_ptr),         // %[src_ptr]
+        [src_ptr1] "+r"(src_ptr1),       // %[src_ptr1]
+        [dst_ptr] "+r"(dst_ptr),         // %[dst_ptr]
+        [width] "+r"(width),             // %[width]
+        [vl] "=&r"(vl)                   // %[vl]
+      : [y0_fraction] "r"(y0_fraction),  // %[y0_fraction]
+        [y1_fraction] "r"(y1_fraction),  // %[y1_fraction]
+        [shift] "r"(shift)               // %[shift]
+      : "cc", "memory", "z0", "z1", "z2", "z3", "z4", "z31", "p0");
+}
+
 #endif  // !defined(LIBYUV_DISABLE_SME) && defined(CLANG_HAS_SME) &&
        // defined(__aarch64__)

--- a/source/scale.cc
+++ b/source/scale.cc
@ -1173,6 +1173,11 @@ static int ScalePlaneBilinearDown(int src_width,
    }
  }
 #endif
+#if defined(HAS_INTERPOLATEROW_SME)
+  if (TestCpuFlag(kCpuHasSME)) {
+    InterpolateRow = InterpolateRow_SME;
+  }
+#endif
 #if defined(HAS_INTERPOLATEROW_MSA)
  if (TestCpuFlag(kCpuHasMSA)) {
    InterpolateRow = InterpolateRow_Any_MSA;
@ -1312,6 +1317,11 @@ static int ScalePlaneBilinearDown_16(int src_width,
    }
  }
 #endif
+#if defined(HAS_INTERPOLATEROW_16_SME)
+  if (TestCpuFlag(kCpuHasSME)) {
+    InterpolateRow = InterpolateRow_16_SME;
+  }
+#endif

 #if defined(HAS_SCALEFILTERCOLS_16_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
@ -1393,6 +1403,11 @@ static int ScalePlaneBilinearUp(int src_width,
    }
  }
 #endif
+#if defined(HAS_INTERPOLATEROW_SME)
+  if (TestCpuFlag(kCpuHasSME)) {
+    InterpolateRow = InterpolateRow_SME;
+  }
+#endif
 #if defined(HAS_INTERPOLATEROW_RVV)
  if (TestCpuFlag(kCpuHasRVV)) {
    InterpolateRow = InterpolateRow_RVV;
@ -1893,6 +1908,11 @@ static int ScalePlaneBilinearUp_16(int src_width,
    }
  }
 #endif
+#if defined(HAS_INTERPOLATEROW_16_SME)
+  if (TestCpuFlag(kCpuHasSME)) {
+    InterpolateRow = InterpolateRow_16_SME;
+  }
+#endif

  if (filtering && src_width >= 32768) {
    ScaleFilterCols = ScaleFilterCols64_16_C;
--- a/source/scale_argb.cc
+++ b/source/scale_argb.cc
@ -379,6 +379,11 @@ static int ScaleARGBBilinearDown(int src_width,
    }
  }
 #endif
+#if defined(HAS_INTERPOLATEROW_SME)
+  if (TestCpuFlag(kCpuHasSME)) {
+    InterpolateRow = InterpolateRow_SME;
+  }
+#endif
 #if defined(HAS_INTERPOLATEROW_MSA)
  if (TestCpuFlag(kCpuHasMSA)) {
    InterpolateRow = InterpolateRow_Any_MSA;
@ -507,6 +512,11 @@ static int ScaleARGBBilinearUp(int src_width,
    }
  }
 #endif
+#if defined(HAS_INTERPOLATEROW_SME)
+  if (TestCpuFlag(kCpuHasSME)) {
+    InterpolateRow = InterpolateRow_SME;
+  }
+#endif
 #if defined(HAS_INTERPOLATEROW_MSA)
  if (TestCpuFlag(kCpuHasMSA)) {
    InterpolateRow = InterpolateRow_Any_MSA;
@ -781,6 +791,11 @@ static int ScaleYUVToARGBBilinearUp(int src_width,
    }
  }
 #endif
+#if defined(HAS_INTERPOLATEROW_SME)
+  if (TestCpuFlag(kCpuHasSME)) {
+    InterpolateRow = InterpolateRow_SME;
+  }
+#endif
 #if defined(HAS_INTERPOLATEROW_MSA)
  if (TestCpuFlag(kCpuHasMSA)) {
    InterpolateRow = InterpolateRow_Any_MSA;
--- a/source/scale_common.cc
+++ b/source/scale_common.cc
@ -1662,6 +1662,11 @@ void ScalePlaneVertical(int src_height,
    }
  }
 #endif
+#if defined(HAS_INTERPOLATEROW_SME)
+  if (TestCpuFlag(kCpuHasSME)) {
+    InterpolateRow = InterpolateRow_SME;
+  }
+#endif
 #if defined(HAS_INTERPOLATEROW_MSA)
  if (TestCpuFlag(kCpuHasMSA)) {
    InterpolateRow = InterpolateRow_Any_MSA;
@ -1754,6 +1759,11 @@ void ScalePlaneVertical_16(int src_height,
      InterpolateRow = InterpolateRow_16_NEON;
    }
  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_SME)
+  if (TestCpuFlag(kCpuHasSME)) {
+    InterpolateRow = InterpolateRow_16_SME;
+  }
 #endif
  for (j = 0; j < dst_height; ++j) {
    int yi;
@ -1811,6 +1821,11 @@ void ScalePlaneVertical_16To8(int src_height,
    }
  }
 #endif
+#if defined(HAS_INTERPOLATEROW_16TO8_SME)
+  if (TestCpuFlag(kCpuHasSME)) {
+    InterpolateRow_16To8 = InterpolateRow_16To8_SME;
+  }
+#endif
 #if defined(HAS_INTERPOLATEROW_16TO8_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
    InterpolateRow_16To8 = InterpolateRow_16To8_Any_AVX2;
--- a/source/scale_uv.cc
+++ b/source/scale_uv.cc
@ -414,6 +414,11 @@ static int ScaleUVBilinearDown(int src_width,
    }
  }
 #endif
+#if defined(HAS_INTERPOLATEROW_SME)
+  if (TestCpuFlag(kCpuHasSME)) {
+    InterpolateRow = InterpolateRow_SME;
+  }
+#endif
 #if defined(HAS_INTERPOLATEROW_MSA)
  if (TestCpuFlag(kCpuHasMSA)) {
    InterpolateRow = InterpolateRow_Any_MSA;
@ -535,6 +540,11 @@ static int ScaleUVBilinearUp(int src_width,
    }
  }
 #endif
+#if defined(HAS_INTERPOLATEROW_SME)
+  if (TestCpuFlag(kCpuHasSME)) {
+    InterpolateRow = InterpolateRow_SME;
+  }
+#endif
 #if defined(HAS_INTERPOLATEROW_MSA)
  if (TestCpuFlag(kCpuHasMSA)) {
    InterpolateRow = InterpolateRow_Any_MSA;