diff --git a/include/libyuv/row.h b/include/libyuv/row.h index d8cf92296..10946f403 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -596,6 +596,9 @@ extern "C" { #define HAS_COPYROW_SME #define HAS_I422TOARGBROW_SME #define HAS_I444TOARGBROW_SME +#define HAS_INTERPOLATEROW_16_SME +#define HAS_INTERPOLATEROW_16TO8_SME +#define HAS_INTERPOLATEROW_SME #define HAS_MERGEUVROW_16_SME #define HAS_MERGEUVROW_SME #define HAS_MULTIPLYROW_16_SME @@ -6468,6 +6471,11 @@ void InterpolateRow_NEON(uint8_t* dst_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction); +void InterpolateRow_SME(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int dst_width, + int source_y_fraction); void InterpolateRow_MSA(uint8_t* dst_ptr, const uint8_t* src_ptr, ptrdiff_t src_stride, @@ -6524,6 +6532,11 @@ void InterpolateRow_16_Any_NEON(uint16_t* dst_ptr, ptrdiff_t src_stride, int width, int source_y_fraction); +void InterpolateRow_16_SME(uint16_t* dst_ptr, + const uint16_t* src_ptr, + ptrdiff_t src_stride, + int width, + int source_y_fraction); void InterpolateRow_16To8_C(uint8_t* dst_ptr, const uint16_t* src_ptr, @@ -6543,6 +6556,12 @@ void InterpolateRow_16To8_Any_NEON(uint8_t* dst_ptr, int scale, int width, int source_y_fraction); +void InterpolateRow_16To8_SME(uint8_t* dst_ptr, + const uint16_t* src_ptr, + ptrdiff_t src_stride, + int scale, + int width, + int source_y_fraction); void InterpolateRow_16To8_AVX2(uint8_t* dst_ptr, const uint16_t* src_ptr, ptrdiff_t src_stride, diff --git a/source/convert.cc b/source/convert.cc index 50779c070..bf886bc1c 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -1248,6 +1248,11 @@ int I422ToNV21(const uint8_t* src_y, } } #endif +#if defined(HAS_INTERPOLATEROW_SME) + if (TestCpuFlag(kCpuHasSME)) { + InterpolateRow = InterpolateRow_SME; + } +#endif #if defined(HAS_INTERPOLATEROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { InterpolateRow = InterpolateRow_Any_MSA; diff --git a/source/planar_functions.cc b/source/planar_functions.cc index e22d80b3d..e2c264730 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -4443,6 +4443,11 @@ int InterpolatePlane(const uint8_t* src0, } } #endif +#if defined(HAS_INTERPOLATEROW_SME) + if (TestCpuFlag(kCpuHasSME)) { + InterpolateRow = InterpolateRow_SME; + } +#endif #if defined(HAS_INTERPOLATEROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { InterpolateRow = InterpolateRow_Any_MSA; @@ -4528,6 +4533,11 @@ int InterpolatePlane_16(const uint16_t* src0, } } #endif +#if defined(HAS_INTERPOLATEROW_16_SME) + if (TestCpuFlag(kCpuHasSME)) { + InterpolateRow_16 = InterpolateRow_16_SME; + } +#endif #if defined(HAS_INTERPOLATEROW_16_MSA) if (TestCpuFlag(kCpuHasMSA)) { InterpolateRow_16 = InterpolateRow_16_Any_MSA; @@ -5732,6 +5742,11 @@ int UYVYToNV12(const uint8_t* src_uyvy, } } #endif +#if defined(HAS_INTERPOLATEROW_SME) + if (TestCpuFlag(kCpuHasSME)) { + InterpolateRow = InterpolateRow_SME; + } +#endif #if defined(HAS_INTERPOLATEROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { InterpolateRow = InterpolateRow_Any_MSA; diff --git a/source/row_sme.cc b/source/row_sme.cc index d4b230d08..253d5c2dd 100644 --- a/source/row_sme.cc +++ b/source/row_sme.cc @@ -510,6 +510,408 @@ __arm_locally_streaming void CopyRow_SME(const uint8_t* src, : "memory", "cc", "z0", "p0"); } +__arm_locally_streaming static void HalfRow_SME(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int width) { + const uint8_t* src_ptr1 = src_ptr + src_stride; + + int vl; + asm volatile( + "cntb %x[vl] \n" + "subs %w[width], 
%w[width], %w[vl] \n" + "b.lt 2f \n" + + // Run bulk of computation with an all-true predicate to avoid predicate + // generation overhead. + "ptrue p0.b \n" + "1: \n" + "ld1b {z2.b}, p0/z, [%[src_ptr]] \n" + "ld1b {z3.b}, p0/z, [%[src_ptr1]] \n" + "incb %[src_ptr] \n" + "incb %[src_ptr1] \n" + "urhadd z2.b, p0/m, z2.b, z3.b \n" + "subs %w[width], %w[width], %w[vl] \n" + "st1b {z2.b}, p0, [%[dst_ptr]] \n" + "incb %[dst_ptr] \n" + "b.ge 1b \n" + + "2: \n" + "adds %w[width], %w[width], %w[vl] \n" + "b.eq 99f \n" + + // Calculate a predicate for the final iteration to deal with the tail. + "whilelt p0.b, wzr, %w[width] \n" + "ld1b {z2.b}, p0/z, [%[src_ptr]] \n" + "ld1b {z3.b}, p0/z, [%[src_ptr1]] \n" + "urhadd z2.b, p0/m, z2.b, z3.b \n" + "subs %w[width], %w[width], %w[vl] \n" + "st1b {z2.b}, p0, [%[dst_ptr]] \n" + + "99: \n" + : [src_ptr] "+r"(src_ptr), // %[src_ptr] + [src_ptr1] "+r"(src_ptr1), // %[src_ptr1] + [dst_ptr] "+r"(dst_ptr), // %[dst_ptr] + [width] "+r"(width), // %[width] + [vl] "=&r"(vl) // %[vl] + : + : "cc", "memory", "z0", "z1", "z2", "z3", "p0"); +} + +__arm_locally_streaming void InterpolateRow_SME(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int width, + int source_y_fraction) { + int y1_fraction = source_y_fraction; + int y0_fraction = 256 - y1_fraction; + const uint8_t* src_ptr1 = src_ptr + src_stride; + + if (y0_fraction == 0) { + CopyRow_SME(src_ptr1, dst_ptr, width); + return; + } + if (y0_fraction == 128) { + HalfRow_SME(dst_ptr, src_ptr, src_stride, width); + return; + } + if (y0_fraction == 256) { + CopyRow_SME(src_ptr, dst_ptr, width); + return; + } + + int vl; + asm volatile( + "cntb %x[vl] \n" + "dup z0.b, %w[y0_fraction] \n" + "dup z1.b, %w[y1_fraction] \n" + "subs %w[width], %w[width], %w[vl] \n" + "b.lt 2f \n" + + // Run bulk of computation with an all-true predicate to avoid predicate + // generation overhead. + "ptrue p0.b \n" + "1: \n" + "ld1b {z2.b}, p0/z, [%[src_ptr]] \n" + "ld1b {z3.b}, p0/z, [%[src_ptr1]] \n" + "incb %[src_ptr] \n" + "incb %[src_ptr1] \n" + "umullb z4.h, z2.b, z0.b \n" + "umullt z2.h, z2.b, z0.b \n" + "subs %w[width], %w[width], %w[vl] \n" + "umlalb z4.h, z3.b, z1.b \n" + "umlalt z2.h, z3.b, z1.b \n" + "rshrnb z3.b, z4.h, #8 \n" + "rshrnt z3.b, z2.h, #8 \n" + "st1b {z3.b}, p0, [%[dst_ptr]] \n" + "incb %[dst_ptr] \n" + "b.ge 1b \n" + + "2: \n" + "adds %w[width], %w[width], %w[vl] \n" + "b.eq 99f \n" + + // Calculate a predicate for the final iteration to deal with the tail. 
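+      // WHILELT sets only the first `width` byte lanes of p0 active, so the
+      // final partial vector is loaded, blended and stored without touching
+      // memory beyond the end of the row.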
+ "whilelt p0.b, wzr, %w[width] \n" + "ld1b {z2.b}, p0/z, [%[src_ptr]] \n" + "ld1b {z3.b}, p0/z, [%[src_ptr1]] \n" + "umullb z4.h, z2.b, z0.b \n" + "umullt z2.h, z2.b, z0.b \n" + "umlalb z4.h, z3.b, z1.b \n" + "umlalt z2.h, z3.b, z1.b \n" + "rshrnb z3.b, z4.h, #8 \n" + "rshrnt z3.b, z2.h, #8 \n" + "st1b {z3.b}, p0, [%[dst_ptr]] \n" + + "99: \n" + : [src_ptr] "+r"(src_ptr), // %[src_ptr] + [src_ptr1] "+r"(src_ptr1), // %[src_ptr1] + [dst_ptr] "+r"(dst_ptr), // %[dst_ptr] + [width] "+r"(width), // %[width] + [vl] "=&r"(vl) // %[vl] + : [y0_fraction] "r"(y0_fraction), // %[y0_fraction] + [y1_fraction] "r"(y1_fraction) // %[y1_fraction] + : "cc", "memory", "z0", "z1", "z2", "z3", "z4", "p0"); +} + +__arm_locally_streaming static void HalfRow_16_SME(uint16_t* dst_ptr, + const uint16_t* src_ptr, + ptrdiff_t src_stride, + int width) { + int y1_fraction = 128; + int y0_fraction = 256 - y1_fraction; + const uint16_t* src_ptr1 = src_ptr + src_stride; + + int vl; + asm volatile( + "cnth %x[vl] \n" + "subs %w[width], %w[width], %w[vl] \n" + "b.lt 2f \n" + + // Run bulk of computation with an all-true predicate to avoid predicate + // generation overhead. + "ptrue p0.h \n" + "1: \n" + "ld1h {z2.h}, p0/z, [%[src_ptr]] \n" + "ld1h {z3.h}, p0/z, [%[src_ptr1]] \n" + "incb %[src_ptr] \n" + "incb %[src_ptr1] \n" + "urhadd z2.h, p0/m, z2.h, z3.h \n" + "subs %w[width], %w[width], %w[vl] \n" + "st1h {z2.h}, p0, [%[dst_ptr]] \n" + "incb %[dst_ptr] \n" + "b.ge 1b \n" + + "2: \n" + "adds %w[width], %w[width], %w[vl] \n" + "b.eq 99f \n" + + // Calculate a predicate for the final iteration to deal with the tail. + "whilelt p0.h, wzr, %w[width] \n" + "ld1h {z2.h}, p0/z, [%[src_ptr]] \n" + "ld1h {z3.h}, p0/z, [%[src_ptr1]] \n" + "urhadd z2.h, p0/m, z2.h, z3.h \n" + "st1h {z2.h}, p0, [%[dst_ptr]] \n" + + "99: \n" + : [src_ptr] "+r"(src_ptr), // %[src_ptr] + [src_ptr1] "+r"(src_ptr1), // %[src_ptr1] + [dst_ptr] "+r"(dst_ptr), // %[dst_ptr] + [width] "+r"(width), // %[width] + [vl] "=&r"(vl) // %[vl] + : + : "cc", "memory", "z0", "z1", "z2", "z3", "p0"); +} + +__arm_locally_streaming void InterpolateRow_16_SME(uint16_t* dst_ptr, + const uint16_t* src_ptr, + ptrdiff_t src_stride, + int width, + int source_y_fraction) { + int y1_fraction = source_y_fraction; + int y0_fraction = 256 - y1_fraction; + const uint16_t* src_ptr1 = src_ptr + src_stride; + + if (y0_fraction == 0) { + CopyRow_SME((const uint8_t*)src_ptr1, (uint8_t*)dst_ptr, + width * sizeof(uint16_t)); + return; + } + if (y0_fraction == 128) { + HalfRow_16_SME(dst_ptr, src_ptr, src_stride, width); + return; + } + if (y0_fraction == 256) { + CopyRow_SME((const uint8_t*)src_ptr, (uint8_t*)dst_ptr, + width * sizeof(uint16_t)); + return; + } + + int vl; + asm volatile( + "cnth %x[vl] \n" + "subs %w[width], %w[width], %w[vl] \n" + "dup z0.h, %w[y0_fraction] \n" + "dup z1.h, %w[y1_fraction] \n" + "b.lt 2f \n" + + // Run bulk of computation with an all-true predicate to avoid predicate + // generation overhead. 
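+      // Lanes are 16-bit here: CNTH counts the halfword elements consumed
+      // per iteration, while INCB advances the pointers by the same vector
+      // length in bytes.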
+ "ptrue p0.h \n" + "1: \n" + "ld1h {z2.h}, p0/z, [%[src_ptr]] \n" + "ld1h {z3.h}, p0/z, [%[src_ptr1]] \n" + "incb %[src_ptr] \n" + "incb %[src_ptr1] \n" + "umullb z4.s, z2.h, z0.h \n" + "umullt z2.s, z2.h, z0.h \n" + "subs %w[width], %w[width], %w[vl] \n" + "umlalb z4.s, z3.h, z1.h \n" + "umlalt z2.s, z3.h, z1.h \n" + "rshrnb z3.h, z4.s, #8 \n" + "rshrnt z3.h, z2.s, #8 \n" + "st1h {z3.h}, p0, [%[dst_ptr]] \n" + "incb %[dst_ptr] \n" + "b.ge 1b \n" + + "2: \n" + "adds %w[width], %w[width], %w[vl] \n" + "b.eq 99f \n" + + // Calculate a predicate for the final iteration to deal with the tail. + "whilelt p0.h, wzr, %w[width] \n" + "ld1h {z2.h}, p0/z, [%[src_ptr]] \n" + "ld1h {z3.h}, p0/z, [%[src_ptr1]] \n" + "umullb z4.s, z2.h, z0.h \n" + "umullt z2.s, z2.h, z0.h \n" + "umlalb z4.s, z3.h, z1.h \n" + "umlalt z2.s, z3.h, z1.h \n" + "rshrnb z3.h, z4.s, #8 \n" + "rshrnt z3.h, z2.s, #8 \n" + "st1h {z3.h}, p0, [%[dst_ptr]] \n" + + "99: \n" + : [src_ptr] "+r"(src_ptr), // %[src_ptr] + [src_ptr1] "+r"(src_ptr1), // %[src_ptr1] + [dst_ptr] "+r"(dst_ptr), // %[dst_ptr] + [width] "+r"(width), // %[width] + [vl] "=&r"(vl) // %[vl] + : [y0_fraction] "r"(y0_fraction), // %[y0_fraction] + [y1_fraction] "r"(y1_fraction) // %[y1_fraction] + : "cc", "memory", "z0", "z1", "z2", "z3", "z4", "p0"); +} + +__arm_locally_streaming static void HalfRow_16To8_SME(uint8_t* dst_ptr, + const uint16_t* src_ptr, + ptrdiff_t src_stride, + int scale, + int width) { + int y1_fraction = 128; + int y0_fraction = 256 - y1_fraction; + const uint16_t* src_ptr1 = src_ptr + src_stride; + + // 15 - clz(scale), + 8 to shift result into the high half of the lane to + // saturate, then we can just use UZP2 to narrow rather than a pair of + // saturating narrow instructions. + int shift = 23 - __builtin_clz((int32_t)scale); + + int vl; + asm volatile( + "cnth %x[vl] \n" + "dup z31.h, %w[shift] \n" + "subs %w[width], %w[width], %w[vl] \n" + "b.lt 2f \n" + + // Run bulk of computation with an all-true predicate to avoid predicate + // generation overhead. + "ptrue p0.h \n" + "1: \n" + "ld1h {z2.h}, p0/z, [%[src_ptr]] \n" + "ld1h {z3.h}, p0/z, [%[src_ptr1]] \n" + "incb %[src_ptr] \n" + "incb %[src_ptr1] \n" + "urhadd z2.h, p0/m, z2.h, z3.h \n" + "subs %w[width], %w[width], %w[vl] \n" + "uqshl z2.h, p0/m, z2.h, z31.h \n" + "shrnb z2.b, z2.h, #8 \n" + "st1b {z2.h}, p0, [%[dst_ptr]] \n" + "inch %[dst_ptr] \n" + "b.ge 1b \n" + + "2: \n" + "adds %w[width], %w[width], %w[vl] \n" + "b.eq 99f \n" + + // Calculate a predicate for the final iteration to deal with the tail. 
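+      // Only the remaining `width` halfword lanes are active, so the
+      // averaged and narrowed result is stored one byte per active lane
+      // without writing past the end of the destination row.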
+ "whilelt p0.h, wzr, %w[width] \n" + "ld1h {z2.h}, p0/z, [%[src_ptr]] \n" + "ld1h {z3.h}, p0/z, [%[src_ptr1]] \n" + "urhadd z2.h, p0/m, z2.h, z3.h \n" + "uqshl z2.h, p0/m, z2.h, z31.h \n" + "shrnb z2.b, z2.h, #8 \n" + "st1b {z2.h}, p0, [%[dst_ptr]] \n" + + "99: \n" + : [src_ptr] "+r"(src_ptr), // %[src_ptr] + [src_ptr1] "+r"(src_ptr1), // %[src_ptr1] + [dst_ptr] "+r"(dst_ptr), // %[dst_ptr] + [width] "+r"(width), // %[width] + [vl] "=&r"(vl) // %[vl] + : [shift] "r"(shift) // %[shift] + : "cc", "memory", "z0", "z1", "z2", "z3", "z31", "p0"); +} + +// Use scale to convert lsb formats to msb, depending how many bits there are: +// 32768 = 9 bits +// 16384 = 10 bits +// 4096 = 12 bits +// 256 = 16 bits +// TODO(fbarchard): change scale to bits +__arm_locally_streaming void InterpolateRow_16To8_SME(uint8_t* dst_ptr, + const uint16_t* src_ptr, + ptrdiff_t src_stride, + int scale, + int width, + int source_y_fraction) { + int y1_fraction = source_y_fraction; + int y0_fraction = 256 - y1_fraction; + const uint16_t* src_ptr1 = src_ptr + src_stride; + + // y0_fraction == 0 is never called here. + if (y0_fraction == 128) { + HalfRow_16To8_SME(dst_ptr, src_ptr, src_stride, scale, width); + return; + } + if (y0_fraction == 256) { + Convert16To8Row_SME(src_ptr, dst_ptr, scale, width); + return; + } + + // 15 - clz(scale), + 8 to shift result into the high half of the lane to + // saturate, then we can just use UZP2 to narrow rather than a pair of + // saturating narrow instructions. + int shift = 23 - __builtin_clz((int32_t)scale); + + int vl; + asm volatile( + "cnth %x[vl] \n" + "dup z31.h, %w[shift] \n" + "dup z0.h, %w[y0_fraction] \n" + "dup z1.h, %w[y1_fraction] \n" + "subs %w[width], %w[width], %w[vl] \n" + "b.lt 2f \n" + + // Run bulk of computation with an all-true predicate to avoid predicate + // generation overhead. + "ptrue p0.h \n" + "1: \n" + "ld1h {z2.h}, p0/z, [%[src_ptr]] \n" + "ld1h {z3.h}, p0/z, [%[src_ptr1]] \n" + "incb %[src_ptr] \n" + "incb %[src_ptr1] \n" + "umullb z4.s, z2.h, z0.h \n" + "umullt z2.s, z2.h, z0.h \n" + "subs %w[width], %w[width], %w[vl] \n" + "umlalb z4.s, z3.h, z1.h \n" + "umlalt z2.s, z3.h, z1.h \n" + "rshrnb z3.h, z4.s, #8 \n" + "rshrnt z3.h, z2.s, #8 \n" + "uqshl z3.h, p0/m, z3.h, z31.h \n" + "shrnb z3.b, z3.h, #8 \n" + "st1b {z3.h}, p0, [%[dst_ptr]] \n" + "inch %[dst_ptr] \n" + "b.ge 1b \n" + + "2: \n" + "adds %w[width], %w[width], %w[vl] \n" + "b.eq 99f \n" + + // Calculate a predicate for the final iteration to deal with the tail. 
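+      // As in the main loop, UQSHL saturates the blended 16-bit result into
+      // the top byte of each lane and SHRNB #8 extracts that byte for the
+      // 8-bit store.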
+ "whilelt p0.h, wzr, %w[width] \n" + "ld1h {z2.h}, p0/z, [%[src_ptr]] \n" + "ld1h {z3.h}, p0/z, [%[src_ptr1]] \n" + "umullb z4.s, z2.h, z0.h \n" + "umullt z2.s, z2.h, z0.h \n" + "umlalb z4.s, z3.h, z1.h \n" + "umlalt z2.s, z3.h, z1.h \n" + "rshrnb z3.h, z4.s, #8 \n" + "rshrnt z3.h, z2.s, #8 \n" + "uqshl z3.h, p0/m, z3.h, z31.h \n" + "shrnb z3.b, z3.h, #8 \n" + "st1b {z3.h}, p0, [%[dst_ptr]] \n" + + "99: \n" + : [src_ptr] "+r"(src_ptr), // %[src_ptr] + [src_ptr1] "+r"(src_ptr1), // %[src_ptr1] + [dst_ptr] "+r"(dst_ptr), // %[dst_ptr] + [width] "+r"(width), // %[width] + [vl] "=&r"(vl) // %[vl] + : [y0_fraction] "r"(y0_fraction), // %[y0_fraction] + [y1_fraction] "r"(y1_fraction), // %[y1_fraction] + [shift] "r"(shift) // %[shift] + : "cc", "memory", "z0", "z1", "z2", "z3", "z4", "z31", "p0"); +} + #endif // !defined(LIBYUV_DISABLE_SME) && defined(CLANG_HAS_SME) && // defined(__aarch64__) diff --git a/source/scale.cc b/source/scale.cc index a59772ea2..76379fd6e 100644 --- a/source/scale.cc +++ b/source/scale.cc @@ -1173,6 +1173,11 @@ static int ScalePlaneBilinearDown(int src_width, } } #endif +#if defined(HAS_INTERPOLATEROW_SME) + if (TestCpuFlag(kCpuHasSME)) { + InterpolateRow = InterpolateRow_SME; + } +#endif #if defined(HAS_INTERPOLATEROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { InterpolateRow = InterpolateRow_Any_MSA; @@ -1312,6 +1317,11 @@ static int ScalePlaneBilinearDown_16(int src_width, } } #endif +#if defined(HAS_INTERPOLATEROW_16_SME) + if (TestCpuFlag(kCpuHasSME)) { + InterpolateRow = InterpolateRow_16_SME; + } +#endif #if defined(HAS_SCALEFILTERCOLS_16_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { @@ -1393,6 +1403,11 @@ static int ScalePlaneBilinearUp(int src_width, } } #endif +#if defined(HAS_INTERPOLATEROW_SME) + if (TestCpuFlag(kCpuHasSME)) { + InterpolateRow = InterpolateRow_SME; + } +#endif #if defined(HAS_INTERPOLATEROW_RVV) if (TestCpuFlag(kCpuHasRVV)) { InterpolateRow = InterpolateRow_RVV; @@ -1893,6 +1908,11 @@ static int ScalePlaneBilinearUp_16(int src_width, } } #endif +#if defined(HAS_INTERPOLATEROW_16_SME) + if (TestCpuFlag(kCpuHasSME)) { + InterpolateRow = InterpolateRow_16_SME; + } +#endif if (filtering && src_width >= 32768) { ScaleFilterCols = ScaleFilterCols64_16_C; diff --git a/source/scale_argb.cc b/source/scale_argb.cc index 66082be8b..9cfb17988 100644 --- a/source/scale_argb.cc +++ b/source/scale_argb.cc @@ -379,6 +379,11 @@ static int ScaleARGBBilinearDown(int src_width, } } #endif +#if defined(HAS_INTERPOLATEROW_SME) + if (TestCpuFlag(kCpuHasSME)) { + InterpolateRow = InterpolateRow_SME; + } +#endif #if defined(HAS_INTERPOLATEROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { InterpolateRow = InterpolateRow_Any_MSA; @@ -507,6 +512,11 @@ static int ScaleARGBBilinearUp(int src_width, } } #endif +#if defined(HAS_INTERPOLATEROW_SME) + if (TestCpuFlag(kCpuHasSME)) { + InterpolateRow = InterpolateRow_SME; + } +#endif #if defined(HAS_INTERPOLATEROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { InterpolateRow = InterpolateRow_Any_MSA; @@ -781,6 +791,11 @@ static int ScaleYUVToARGBBilinearUp(int src_width, } } #endif +#if defined(HAS_INTERPOLATEROW_SME) + if (TestCpuFlag(kCpuHasSME)) { + InterpolateRow = InterpolateRow_SME; + } +#endif #if defined(HAS_INTERPOLATEROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { InterpolateRow = InterpolateRow_Any_MSA; diff --git a/source/scale_common.cc b/source/scale_common.cc index d07a39af9..430afc232 100644 --- a/source/scale_common.cc +++ b/source/scale_common.cc @@ -1662,6 +1662,11 @@ void ScalePlaneVertical(int src_height, } } #endif +#if 
defined(HAS_INTERPOLATEROW_SME) + if (TestCpuFlag(kCpuHasSME)) { + InterpolateRow = InterpolateRow_SME; + } +#endif #if defined(HAS_INTERPOLATEROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { InterpolateRow = InterpolateRow_Any_MSA; @@ -1754,6 +1759,11 @@ void ScalePlaneVertical_16(int src_height, InterpolateRow = InterpolateRow_16_NEON; } } +#endif +#if defined(HAS_INTERPOLATEROW_16_SME) + if (TestCpuFlag(kCpuHasSME)) { + InterpolateRow = InterpolateRow_16_SME; + } #endif for (j = 0; j < dst_height; ++j) { int yi; @@ -1811,6 +1821,11 @@ void ScalePlaneVertical_16To8(int src_height, } } #endif +#if defined(HAS_INTERPOLATEROW_16TO8_SME) + if (TestCpuFlag(kCpuHasSME)) { + InterpolateRow_16To8 = InterpolateRow_16To8_SME; + } +#endif #if defined(HAS_INTERPOLATEROW_16TO8_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { InterpolateRow_16To8 = InterpolateRow_16To8_Any_AVX2; diff --git a/source/scale_uv.cc b/source/scale_uv.cc index 9ef2e1387..7b318cf72 100644 --- a/source/scale_uv.cc +++ b/source/scale_uv.cc @@ -414,6 +414,11 @@ static int ScaleUVBilinearDown(int src_width, } } #endif +#if defined(HAS_INTERPOLATEROW_SME) + if (TestCpuFlag(kCpuHasSME)) { + InterpolateRow = InterpolateRow_SME; + } +#endif #if defined(HAS_INTERPOLATEROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { InterpolateRow = InterpolateRow_Any_MSA; @@ -535,6 +540,11 @@ static int ScaleUVBilinearUp(int src_width, } } #endif +#if defined(HAS_INTERPOLATEROW_SME) + if (TestCpuFlag(kCpuHasSME)) { + InterpolateRow = InterpolateRow_SME; + } +#endif #if defined(HAS_INTERPOLATEROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { InterpolateRow = InterpolateRow_Any_MSA;