diff --git a/include/libyuv/row.h b/include/libyuv/row.h index d8cf92296..10946f403 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -596,6 +596,9 @@ extern "C" { #define HAS_COPYROW_SME #define HAS_I422TOARGBROW_SME #define HAS_I444TOARGBROW_SME +#define HAS_INTERPOLATEROW_16_SME +#define HAS_INTERPOLATEROW_16TO8_SME +#define HAS_INTERPOLATEROW_SME #define HAS_MERGEUVROW_16_SME #define HAS_MERGEUVROW_SME #define HAS_MULTIPLYROW_16_SME @@ -6468,6 +6471,11 @@ void InterpolateRow_NEON(uint8_t* dst_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction); +void InterpolateRow_SME(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int dst_width, + int source_y_fraction); void InterpolateRow_MSA(uint8_t* dst_ptr, const uint8_t* src_ptr, ptrdiff_t src_stride, @@ -6524,6 +6532,11 @@ void InterpolateRow_16_Any_NEON(uint16_t* dst_ptr, ptrdiff_t src_stride, int width, int source_y_fraction); +void InterpolateRow_16_SME(uint16_t* dst_ptr, + const uint16_t* src_ptr, + ptrdiff_t src_stride, + int width, + int source_y_fraction); void InterpolateRow_16To8_C(uint8_t* dst_ptr, const uint16_t* src_ptr, @@ -6543,6 +6556,12 @@ void InterpolateRow_16To8_Any_NEON(uint8_t* dst_ptr, int scale, int width, int source_y_fraction); +void InterpolateRow_16To8_SME(uint8_t* dst_ptr, + const uint16_t* src_ptr, + ptrdiff_t src_stride, + int scale, + int width, + int source_y_fraction); void InterpolateRow_16To8_AVX2(uint8_t* dst_ptr, const uint16_t* src_ptr, ptrdiff_t src_stride, diff --git a/source/convert.cc b/source/convert.cc index 50779c070..bf886bc1c 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -1248,6 +1248,11 @@ int I422ToNV21(const uint8_t* src_y, } } #endif +#if defined(HAS_INTERPOLATEROW_SME) + if (TestCpuFlag(kCpuHasSME)) { + InterpolateRow = InterpolateRow_SME; + } +#endif #if defined(HAS_INTERPOLATEROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { InterpolateRow = InterpolateRow_Any_MSA; diff --git a/source/planar_functions.cc b/source/planar_functions.cc index e22d80b3d..e2c264730 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -4443,6 +4443,11 @@ int InterpolatePlane(const uint8_t* src0, } } #endif +#if defined(HAS_INTERPOLATEROW_SME) + if (TestCpuFlag(kCpuHasSME)) { + InterpolateRow = InterpolateRow_SME; + } +#endif #if defined(HAS_INTERPOLATEROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { InterpolateRow = InterpolateRow_Any_MSA; @@ -4528,6 +4533,11 @@ int InterpolatePlane_16(const uint16_t* src0, } } #endif +#if defined(HAS_INTERPOLATEROW_16_SME) + if (TestCpuFlag(kCpuHasSME)) { + InterpolateRow_16 = InterpolateRow_16_SME; + } +#endif #if defined(HAS_INTERPOLATEROW_16_MSA) if (TestCpuFlag(kCpuHasMSA)) { InterpolateRow_16 = InterpolateRow_16_Any_MSA; @@ -5732,6 +5742,11 @@ int UYVYToNV12(const uint8_t* src_uyvy, } } #endif +#if defined(HAS_INTERPOLATEROW_SME) + if (TestCpuFlag(kCpuHasSME)) { + InterpolateRow = InterpolateRow_SME; + } +#endif #if defined(HAS_INTERPOLATEROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { InterpolateRow = InterpolateRow_Any_MSA; diff --git a/source/row_sme.cc b/source/row_sme.cc index d4b230d08..253d5c2dd 100644 --- a/source/row_sme.cc +++ b/source/row_sme.cc @@ -510,6 +510,408 @@ __arm_locally_streaming void CopyRow_SME(const uint8_t* src, : "memory", "cc", "z0", "p0"); } +__arm_locally_streaming static void HalfRow_SME(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int width) { + const uint8_t* src_ptr1 = src_ptr + src_stride; + + int vl; + asm volatile( + "cntb %x[vl] \n" + "subs %w[width], 
%w[width], %w[vl] \n" + "b.lt 2f \n" + + // Run bulk of computation with an all-true predicate to avoid predicate + // generation overhead. + "ptrue p0.b \n" + "1: \n" + "ld1b {z2.b}, p0/z, [%[src_ptr]] \n" + "ld1b {z3.b}, p0/z, [%[src_ptr1]] \n" + "incb %[src_ptr] \n" + "incb %[src_ptr1] \n" + "urhadd z2.b, p0/m, z2.b, z3.b \n" + "subs %w[width], %w[width], %w[vl] \n" + "st1b {z2.b}, p0, [%[dst_ptr]] \n" + "incb %[dst_ptr] \n" + "b.ge 1b \n" + + "2: \n" + "adds %w[width], %w[width], %w[vl] \n" + "b.eq 99f \n" + + // Calculate a predicate for the final iteration to deal with the tail. + "whilelt p0.b, wzr, %w[width] \n" + "ld1b {z2.b}, p0/z, [%[src_ptr]] \n" + "ld1b {z3.b}, p0/z, [%[src_ptr1]] \n" + "urhadd z2.b, p0/m, z2.b, z3.b \n" + "subs %w[width], %w[width], %w[vl] \n" + "st1b {z2.b}, p0, [%[dst_ptr]] \n" + + "99: \n" + : [src_ptr] "+r"(src_ptr), // %[src_ptr] + [src_ptr1] "+r"(src_ptr1), // %[src_ptr1] + [dst_ptr] "+r"(dst_ptr), // %[dst_ptr] + [width] "+r"(width), // %[width] + [vl] "=&r"(vl) // %[vl] + : + : "cc", "memory", "z0", "z1", "z2", "z3", "p0"); +} + +__arm_locally_streaming void InterpolateRow_SME(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int width, + int source_y_fraction) { + int y1_fraction = source_y_fraction; + int y0_fraction = 256 - y1_fraction; + const uint8_t* src_ptr1 = src_ptr + src_stride; + + if (y0_fraction == 0) { + CopyRow_SME(src_ptr1, dst_ptr, width); + return; + } + if (y0_fraction == 128) { + HalfRow_SME(dst_ptr, src_ptr, src_stride, width); + return; + } + if (y0_fraction == 256) { + CopyRow_SME(src_ptr, dst_ptr, width); + return; + } + + int vl; + asm volatile( + "cntb %x[vl] \n" + "dup z0.b, %w[y0_fraction] \n" + "dup z1.b, %w[y1_fraction] \n" + "subs %w[width], %w[width], %w[vl] \n" + "b.lt 2f \n" + + // Run bulk of computation with an all-true predicate to avoid predicate + // generation overhead. + "ptrue p0.b \n" + "1: \n" + "ld1b {z2.b}, p0/z, [%[src_ptr]] \n" + "ld1b {z3.b}, p0/z, [%[src_ptr1]] \n" + "incb %[src_ptr] \n" + "incb %[src_ptr1] \n" + "umullb z4.h, z2.b, z0.b \n" + "umullt z2.h, z2.b, z0.b \n" + "subs %w[width], %w[width], %w[vl] \n" + "umlalb z4.h, z3.b, z1.b \n" + "umlalt z2.h, z3.b, z1.b \n" + "rshrnb z3.b, z4.h, #8 \n" + "rshrnt z3.b, z2.h, #8 \n" + "st1b {z3.b}, p0, [%[dst_ptr]] \n" + "incb %[dst_ptr] \n" + "b.ge 1b \n" + + "2: \n" + "adds %w[width], %w[width], %w[vl] \n" + "b.eq 99f \n" + + // Calculate a predicate for the final iteration to deal with the tail. 
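+      // WHILELT sets only the first `width` byte lanes of p0 active, so the
+      // final partial vector is loaded, blended and stored without touching
+      // memory beyond the end of the row.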
+ "whilelt p0.b, wzr, %w[width] \n" + "ld1b {z2.b}, p0/z, [%[src_ptr]] \n" + "ld1b {z3.b}, p0/z, [%[src_ptr1]] \n" + "umullb z4.h, z2.b, z0.b \n" + "umullt z2.h, z2.b, z0.b \n" + "umlalb z4.h, z3.b, z1.b \n" + "umlalt z2.h, z3.b, z1.b \n" + "rshrnb z3.b, z4.h, #8 \n" + "rshrnt z3.b, z2.h, #8 \n" + "st1b {z3.b}, p0, [%[dst_ptr]] \n" + + "99: \n" + : [src_ptr] "+r"(src_ptr), // %[src_ptr] + [src_ptr1] "+r"(src_ptr1), // %[src_ptr1] + [dst_ptr] "+r"(dst_ptr), // %[dst_ptr] + [width] "+r"(width), // %[width] + [vl] "=&r"(vl) // %[vl] + : [y0_fraction] "r"(y0_fraction), // %[y0_fraction] + [y1_fraction] "r"(y1_fraction) // %[y1_fraction] + : "cc", "memory", "z0", "z1", "z2", "z3", "z4", "p0"); +} + +__arm_locally_streaming static void HalfRow_16_SME(uint16_t* dst_ptr, + const uint16_t* src_ptr, + ptrdiff_t src_stride, + int width) { + int y1_fraction = 128; + int y0_fraction = 256 - y1_fraction; + const uint16_t* src_ptr1 = src_ptr + src_stride; + + int vl; + asm volatile( + "cnth %x[vl] \n" + "subs %w[width], %w[width], %w[vl] \n" + "b.lt 2f \n" + + // Run bulk of computation with an all-true predicate to avoid predicate + // generation overhead. + "ptrue p0.h \n" + "1: \n" + "ld1h {z2.h}, p0/z, [%[src_ptr]] \n" + "ld1h {z3.h}, p0/z, [%[src_ptr1]] \n" + "incb %[src_ptr] \n" + "incb %[src_ptr1] \n" + "urhadd z2.h, p0/m, z2.h, z3.h \n" + "subs %w[width], %w[width], %w[vl] \n" + "st1h {z2.h}, p0, [%[dst_ptr]] \n" + "incb %[dst_ptr] \n" + "b.ge 1b \n" + + "2: \n" + "adds %w[width], %w[width], %w[vl] \n" + "b.eq 99f \n" + + // Calculate a predicate for the final iteration to deal with the tail. + "whilelt p0.h, wzr, %w[width] \n" + "ld1h {z2.h}, p0/z, [%[src_ptr]] \n" + "ld1h {z3.h}, p0/z, [%[src_ptr1]] \n" + "urhadd z2.h, p0/m, z2.h, z3.h \n" + "st1h {z2.h}, p0, [%[dst_ptr]] \n" + + "99: \n" + : [src_ptr] "+r"(src_ptr), // %[src_ptr] + [src_ptr1] "+r"(src_ptr1), // %[src_ptr1] + [dst_ptr] "+r"(dst_ptr), // %[dst_ptr] + [width] "+r"(width), // %[width] + [vl] "=&r"(vl) // %[vl] + : + : "cc", "memory", "z0", "z1", "z2", "z3", "p0"); +} + +__arm_locally_streaming void InterpolateRow_16_SME(uint16_t* dst_ptr, + const uint16_t* src_ptr, + ptrdiff_t src_stride, + int width, + int source_y_fraction) { + int y1_fraction = source_y_fraction; + int y0_fraction = 256 - y1_fraction; + const uint16_t* src_ptr1 = src_ptr + src_stride; + + if (y0_fraction == 0) { + CopyRow_SME((const uint8_t*)src_ptr1, (uint8_t*)dst_ptr, + width * sizeof(uint16_t)); + return; + } + if (y0_fraction == 128) { + HalfRow_16_SME(dst_ptr, src_ptr, src_stride, width); + return; + } + if (y0_fraction == 256) { + CopyRow_SME((const uint8_t*)src_ptr, (uint8_t*)dst_ptr, + width * sizeof(uint16_t)); + return; + } + + int vl; + asm volatile( + "cnth %x[vl] \n" + "subs %w[width], %w[width], %w[vl] \n" + "dup z0.h, %w[y0_fraction] \n" + "dup z1.h, %w[y1_fraction] \n" + "b.lt 2f \n" + + // Run bulk of computation with an all-true predicate to avoid predicate + // generation overhead. 
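+      // Lanes are 16-bit here: CNTH counts the halfword elements consumed
+      // per iteration, while INCB advances the pointers by the same vector
+      // length in bytes.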
+ "ptrue p0.h \n" + "1: \n" + "ld1h {z2.h}, p0/z, [%[src_ptr]] \n" + "ld1h {z3.h}, p0/z, [%[src_ptr1]] \n" + "incb %[src_ptr] \n" + "incb %[src_ptr1] \n" + "umullb z4.s, z2.h, z0.h \n" + "umullt z2.s, z2.h, z0.h \n" + "subs %w[width], %w[width], %w[vl] \n" + "umlalb z4.s, z3.h, z1.h \n" + "umlalt z2.s, z3.h, z1.h \n" + "rshrnb z3.h, z4.s, #8 \n" + "rshrnt z3.h, z2.s, #8 \n" + "st1h {z3.h}, p0, [%[dst_ptr]] \n" + "incb %[dst_ptr] \n" + "b.ge 1b \n" + + "2: \n" + "adds %w[width], %w[width], %w[vl] \n" + "b.eq 99f \n" + + // Calculate a predicate for the final iteration to deal with the tail. + "whilelt p0.h, wzr, %w[width] \n" + "ld1h {z2.h}, p0/z, [%[src_ptr]] \n" + "ld1h {z3.h}, p0/z, [%[src_ptr1]] \n" + "umullb z4.s, z2.h, z0.h \n" + "umullt z2.s, z2.h, z0.h \n" + "umlalb z4.s, z3.h, z1.h \n" + "umlalt z2.s, z3.h, z1.h \n" + "rshrnb z3.h, z4.s, #8 \n" + "rshrnt z3.h, z2.s, #8 \n" + "st1h {z3.h}, p0, [%[dst_ptr]] \n" + + "99: \n" + : [src_ptr] "+r"(src_ptr), // %[src_ptr] + [src_ptr1] "+r"(src_ptr1), // %[src_ptr1] + [dst_ptr] "+r"(dst_ptr), // %[dst_ptr] + [width] "+r"(width), // %[width] + [vl] "=&r"(vl) // %[vl] + : [y0_fraction] "r"(y0_fraction), // %[y0_fraction] + [y1_fraction] "r"(y1_fraction) // %[y1_fraction] + : "cc", "memory", "z0", "z1", "z2", "z3", "z4", "p0"); +} + +__arm_locally_streaming static void HalfRow_16To8_SME(uint8_t* dst_ptr, + const uint16_t* src_ptr, + ptrdiff_t src_stride, + int scale, + int width) { + int y1_fraction = 128; + int y0_fraction = 256 - y1_fraction; + const uint16_t* src_ptr1 = src_ptr + src_stride; + + // 15 - clz(scale), + 8 to shift result into the high half of the lane to + // saturate, then we can just use UZP2 to narrow rather than a pair of + // saturating narrow instructions. + int shift = 23 - __builtin_clz((int32_t)scale); + + int vl; + asm volatile( + "cnth %x[vl] \n" + "dup z31.h, %w[shift] \n" + "subs %w[width], %w[width], %w[vl] \n" + "b.lt 2f \n" + + // Run bulk of computation with an all-true predicate to avoid predicate + // generation overhead. + "ptrue p0.h \n" + "1: \n" + "ld1h {z2.h}, p0/z, [%[src_ptr]] \n" + "ld1h {z3.h}, p0/z, [%[src_ptr1]] \n" + "incb %[src_ptr] \n" + "incb %[src_ptr1] \n" + "urhadd z2.h, p0/m, z2.h, z3.h \n" + "subs %w[width], %w[width], %w[vl] \n" + "uqshl z2.h, p0/m, z2.h, z31.h \n" + "shrnb z2.b, z2.h, #8 \n" + "st1b {z2.h}, p0, [%[dst_ptr]] \n" + "inch %[dst_ptr] \n" + "b.ge 1b \n" + + "2: \n" + "adds %w[width], %w[width], %w[vl] \n" + "b.eq 99f \n" + + // Calculate a predicate for the final iteration to deal with the tail. 
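+      // Only the remaining `width` halfword lanes are active, so the
+      // averaged and narrowed result is stored one byte per active lane
+      // without writing past the end of the destination row.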
+ "whilelt p0.h, wzr, %w[width] \n" + "ld1h {z2.h}, p0/z, [%[src_ptr]] \n" + "ld1h {z3.h}, p0/z, [%[src_ptr1]] \n" + "urhadd z2.h, p0/m, z2.h, z3.h \n" + "uqshl z2.h, p0/m, z2.h, z31.h \n" + "shrnb z2.b, z2.h, #8 \n" + "st1b {z2.h}, p0, [%[dst_ptr]] \n" + + "99: \n" + : [src_ptr] "+r"(src_ptr), // %[src_ptr] + [src_ptr1] "+r"(src_ptr1), // %[src_ptr1] + [dst_ptr] "+r"(dst_ptr), // %[dst_ptr] + [width] "+r"(width), // %[width] + [vl] "=&r"(vl) // %[vl] + : [shift] "r"(shift) // %[shift] + : "cc", "memory", "z0", "z1", "z2", "z3", "z31", "p0"); +} + +// Use scale to convert lsb formats to msb, depending how many bits there are: +// 32768 = 9 bits +// 16384 = 10 bits +// 4096 = 12 bits +// 256 = 16 bits +// TODO(fbarchard): change scale to bits +__arm_locally_streaming void InterpolateRow_16To8_SME(uint8_t* dst_ptr, + const uint16_t* src_ptr, + ptrdiff_t src_stride, + int scale, + int width, + int source_y_fraction) { + int y1_fraction = source_y_fraction; + int y0_fraction = 256 - y1_fraction; + const uint16_t* src_ptr1 = src_ptr + src_stride; + + // y0_fraction == 0 is never called here. + if (y0_fraction == 128) { + HalfRow_16To8_SME(dst_ptr, src_ptr, src_stride, scale, width); + return; + } + if (y0_fraction == 256) { + Convert16To8Row_SME(src_ptr, dst_ptr, scale, width); + return; + } + + // 15 - clz(scale), + 8 to shift result into the high half of the lane to + // saturate, then we can just use UZP2 to narrow rather than a pair of + // saturating narrow instructions. + int shift = 23 - __builtin_clz((int32_t)scale); + + int vl; + asm volatile( + "cnth %x[vl] \n" + "dup z31.h, %w[shift] \n" + "dup z0.h, %w[y0_fraction] \n" + "dup z1.h, %w[y1_fraction] \n" + "subs %w[width], %w[width], %w[vl] \n" + "b.lt 2f \n" + + // Run bulk of computation with an all-true predicate to avoid predicate + // generation overhead. + "ptrue p0.h \n" + "1: \n" + "ld1h {z2.h}, p0/z, [%[src_ptr]] \n" + "ld1h {z3.h}, p0/z, [%[src_ptr1]] \n" + "incb %[src_ptr] \n" + "incb %[src_ptr1] \n" + "umullb z4.s, z2.h, z0.h \n" + "umullt z2.s, z2.h, z0.h \n" + "subs %w[width], %w[width], %w[vl] \n" + "umlalb z4.s, z3.h, z1.h \n" + "umlalt z2.s, z3.h, z1.h \n" + "rshrnb z3.h, z4.s, #8 \n" + "rshrnt z3.h, z2.s, #8 \n" + "uqshl z3.h, p0/m, z3.h, z31.h \n" + "shrnb z3.b, z3.h, #8 \n" + "st1b {z3.h}, p0, [%[dst_ptr]] \n" + "inch %[dst_ptr] \n" + "b.ge 1b \n" + + "2: \n" + "adds %w[width], %w[width], %w[vl] \n" + "b.eq 99f \n" + + // Calculate a predicate for the final iteration to deal with the tail. 
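+      // As in the main loop, UQSHL saturates the blended 16-bit result into
+      // the top byte of each lane and SHRNB #8 extracts that byte for the
+      // 8-bit store.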
+ "whilelt p0.h, wzr, %w[width] \n" + "ld1h {z2.h}, p0/z, [%[src_ptr]] \n" + "ld1h {z3.h}, p0/z, [%[src_ptr1]] \n" + "umullb z4.s, z2.h, z0.h \n" + "umullt z2.s, z2.h, z0.h \n" + "umlalb z4.s, z3.h, z1.h \n" + "umlalt z2.s, z3.h, z1.h \n" + "rshrnb z3.h, z4.s, #8 \n" + "rshrnt z3.h, z2.s, #8 \n" + "uqshl z3.h, p0/m, z3.h, z31.h \n" + "shrnb z3.b, z3.h, #8 \n" + "st1b {z3.h}, p0, [%[dst_ptr]] \n" + + "99: \n" + : [src_ptr] "+r"(src_ptr), // %[src_ptr] + [src_ptr1] "+r"(src_ptr1), // %[src_ptr1] + [dst_ptr] "+r"(dst_ptr), // %[dst_ptr] + [width] "+r"(width), // %[width] + [vl] "=&r"(vl) // %[vl] + : [y0_fraction] "r"(y0_fraction), // %[y0_fraction] + [y1_fraction] "r"(y1_fraction), // %[y1_fraction] + [shift] "r"(shift) // %[shift] + : "cc", "memory", "z0", "z1", "z2", "z3", "z4", "z31", "p0"); +} + #endif // !defined(LIBYUV_DISABLE_SME) && defined(CLANG_HAS_SME) && // defined(__aarch64__) diff --git a/source/scale.cc b/source/scale.cc index a59772ea2..76379fd6e 100644 --- a/source/scale.cc +++ b/source/scale.cc @@ -1173,6 +1173,11 @@ static int ScalePlaneBilinearDown(int src_width, } } #endif +#if defined(HAS_INTERPOLATEROW_SME) + if (TestCpuFlag(kCpuHasSME)) { + InterpolateRow = InterpolateRow_SME; + } +#endif #if defined(HAS_INTERPOLATEROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { InterpolateRow = InterpolateRow_Any_MSA; @@ -1312,6 +1317,11 @@ static int ScalePlaneBilinearDown_16(int src_width, } } #endif +#if defined(HAS_INTERPOLATEROW_16_SME) + if (TestCpuFlag(kCpuHasSME)) { + InterpolateRow = InterpolateRow_16_SME; + } +#endif #if defined(HAS_SCALEFILTERCOLS_16_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { @@ -1393,6 +1403,11 @@ static int ScalePlaneBilinearUp(int src_width, } } #endif +#if defined(HAS_INTERPOLATEROW_SME) + if (TestCpuFlag(kCpuHasSME)) { + InterpolateRow = InterpolateRow_SME; + } +#endif #if defined(HAS_INTERPOLATEROW_RVV) if (TestCpuFlag(kCpuHasRVV)) { InterpolateRow = InterpolateRow_RVV; @@ -1893,6 +1908,11 @@ static int ScalePlaneBilinearUp_16(int src_width, } } #endif +#if defined(HAS_INTERPOLATEROW_16_SME) + if (TestCpuFlag(kCpuHasSME)) { + InterpolateRow = InterpolateRow_16_SME; + } +#endif if (filtering && src_width >= 32768) { ScaleFilterCols = ScaleFilterCols64_16_C; diff --git a/source/scale_argb.cc b/source/scale_argb.cc index 66082be8b..9cfb17988 100644 --- a/source/scale_argb.cc +++ b/source/scale_argb.cc @@ -379,6 +379,11 @@ static int ScaleARGBBilinearDown(int src_width, } } #endif +#if defined(HAS_INTERPOLATEROW_SME) + if (TestCpuFlag(kCpuHasSME)) { + InterpolateRow = InterpolateRow_SME; + } +#endif #if defined(HAS_INTERPOLATEROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { InterpolateRow = InterpolateRow_Any_MSA; @@ -507,6 +512,11 @@ static int ScaleARGBBilinearUp(int src_width, } } #endif +#if defined(HAS_INTERPOLATEROW_SME) + if (TestCpuFlag(kCpuHasSME)) { + InterpolateRow = InterpolateRow_SME; + } +#endif #if defined(HAS_INTERPOLATEROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { InterpolateRow = InterpolateRow_Any_MSA; @@ -781,6 +791,11 @@ static int ScaleYUVToARGBBilinearUp(int src_width, } } #endif +#if defined(HAS_INTERPOLATEROW_SME) + if (TestCpuFlag(kCpuHasSME)) { + InterpolateRow = InterpolateRow_SME; + } +#endif #if defined(HAS_INTERPOLATEROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { InterpolateRow = InterpolateRow_Any_MSA; diff --git a/source/scale_common.cc b/source/scale_common.cc index d07a39af9..430afc232 100644 --- a/source/scale_common.cc +++ b/source/scale_common.cc @@ -1662,6 +1662,11 @@ void ScalePlaneVertical(int src_height, } } #endif +#if 
defined(HAS_INTERPOLATEROW_SME) + if (TestCpuFlag(kCpuHasSME)) { + InterpolateRow = InterpolateRow_SME; + } +#endif #if defined(HAS_INTERPOLATEROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { InterpolateRow = InterpolateRow_Any_MSA; @@ -1754,6 +1759,11 @@ void ScalePlaneVertical_16(int src_height, InterpolateRow = InterpolateRow_16_NEON; } } +#endif +#if defined(HAS_INTERPOLATEROW_16_SME) + if (TestCpuFlag(kCpuHasSME)) { + InterpolateRow = InterpolateRow_16_SME; + } #endif for (j = 0; j < dst_height; ++j) { int yi; @@ -1811,6 +1821,11 @@ void ScalePlaneVertical_16To8(int src_height, } } #endif +#if defined(HAS_INTERPOLATEROW_16TO8_SME) + if (TestCpuFlag(kCpuHasSME)) { + InterpolateRow_16To8 = InterpolateRow_16To8_SME; + } +#endif #if defined(HAS_INTERPOLATEROW_16TO8_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { InterpolateRow_16To8 = InterpolateRow_16To8_Any_AVX2; diff --git a/source/scale_uv.cc b/source/scale_uv.cc index 9ef2e1387..7b318cf72 100644 --- a/source/scale_uv.cc +++ b/source/scale_uv.cc @@ -414,6 +414,11 @@ static int ScaleUVBilinearDown(int src_width, } } #endif +#if defined(HAS_INTERPOLATEROW_SME) + if (TestCpuFlag(kCpuHasSME)) { + InterpolateRow = InterpolateRow_SME; + } +#endif #if defined(HAS_INTERPOLATEROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { InterpolateRow = InterpolateRow_Any_MSA; @@ -535,6 +540,11 @@ static int ScaleUVBilinearUp(int src_width, } } #endif +#if defined(HAS_INTERPOLATEROW_SME) + if (TestCpuFlag(kCpuHasSME)) { + InterpolateRow = InterpolateRow_SME; + } +#endif #if defined(HAS_INTERPOLATEROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { InterpolateRow = InterpolateRow_Any_MSA;