diff --git a/BUILD.gn b/BUILD.gn
index 50cc52937..f51d1113d 100644
--- a/BUILD.gn
+++ b/BUILD.gn
@@ -263,6 +263,7 @@ if (libyuv_use_sme) {
     sources = [
       "source/rotate_sme.cc",
       "source/row_sme.cc",
+      "source/scale_sme.cc",
     ]
     deps = [ ":libyuv_internal" ]
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7dc437231..eab5cad37 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -130,7 +130,8 @@ int main(void) { return 0; }
   # Enable AArch64 SME kernels.
   add_library(${ly_lib_name}_sme OBJECT
     ${ly_src_dir}/rotate_sme.cc
-    ${ly_src_dir}/row_sme.cc)
+    ${ly_src_dir}/row_sme.cc
+    ${ly_src_dir}/scale_sme.cc)
   target_compile_options(${ly_lib_name}_sme PRIVATE -march=armv9-a+sme)
   list(APPEND ly_lib_parts $<TARGET_OBJECTS:${ly_lib_name}_sme>)
 else()
diff --git a/include/libyuv/scale_row.h b/include/libyuv/scale_row.h
index a8ec4776a..6e488a4cd 100644
--- a/include/libyuv/scale_row.h
+++ b/include/libyuv/scale_row.h
@@ -116,6 +116,12 @@ extern "C" {
 #define HAS_SCALEUVROWUP2_BILINEAR_16_NEON
 #endif
 
+// The following are available on AArch64 SME platforms:
+#if !defined(LIBYUV_DISABLE_SME) && defined(CLANG_HAS_SME) && \
+    defined(__aarch64__)
+#define HAS_SCALEROWDOWN2_SME
+#endif
+
 #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
 #define HAS_SCALEADDROW_MSA
 #define HAS_SCALEARGBCOLS_MSA
@@ -1397,13 +1403,15 @@ void ScaleUVRowUp2_Bilinear_16_Any_NEON(const uint16_t* src_ptr,
                                         int dst_width);
 
 // ScaleRowDown2Box also used by planar functions
-// NEON downscalers with interpolation.
-
-// Note - not static due to reuse in convert for 444 to 420.
+// NEON/SME downscalers with interpolation.
 void ScaleRowDown2_NEON(const uint8_t* src_ptr,
                         ptrdiff_t src_stride,
                         uint8_t* dst,
                         int dst_width);
+void ScaleRowDown2_SME(const uint8_t* src_ptr,
+                       ptrdiff_t src_stride,
+                       uint8_t* dst,
+                       int dst_width);
 void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr,
                               ptrdiff_t src_stride,
                               uint8_t* dst,
diff --git a/source/scale.cc b/source/scale.cc
index f4d1053e4..2f9bb4990 100644
--- a/source/scale.cc
+++ b/source/scale.cc
@@ -74,6 +74,13 @@ static void ScalePlaneDown2(int src_width,
     }
   }
 #endif
+#if defined(HAS_SCALEROWDOWN2_SME)
+  if (TestCpuFlag(kCpuHasSME)) {
+    if (filtering == kFilterNone) {
+      ScaleRowDown2 = ScaleRowDown2_SME;
+    }
+  }
+#endif
 #if defined(HAS_SCALEROWDOWN2_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     ScaleRowDown2 =
diff --git a/source/scale_sme.cc b/source/scale_sme.cc
new file mode 100644
index 000000000..d4f0a827d
--- /dev/null
+++ b/source/scale_sme.cc
@@ -0,0 +1,65 @@
+/*
+ * Copyright 2024 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/scale_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if !defined(LIBYUV_DISABLE_SME) && defined(CLANG_HAS_SME) && \
+    defined(__aarch64__)
+
+__arm_locally_streaming void ScaleRowDown2_SME(const uint8_t* src_ptr,
+                                               ptrdiff_t src_stride,
+                                               uint8_t* dst,
+                                               int dst_width) {
+  // Streaming-SVE only, no use of ZA tile.
+  (void)src_stride;
+  int vl;
+  asm volatile(
+      "cntb     %x[vl]                               \n"
+      "subs     %w[dst_width], %w[dst_width], %w[vl] \n"
+      "b.lt     2f                                   \n"
+
+      "1:                                            \n"
+      "ptrue    p0.b                                 \n"
+      "ld2b     {z0.b, z1.b}, p0/z, [%[src_ptr]]     \n"
+      "incb     %[src_ptr], all, mul #2              \n"
+      "subs     %w[dst_width], %w[dst_width], %w[vl] \n"
+      "st1b     {z1.b}, p0, [%[dst_ptr]]             \n"
+      "incb     %[dst_ptr]                           \n"
+      "b.ge     1b                                   \n"
+
+      "2:                                            \n"
+      "adds     %w[dst_width], %w[dst_width], %w[vl] \n"
+      "b.eq     99f                                  \n"
+
+      "whilelt  p0.b, wzr, %w[dst_width]             \n"
+      "ld2b     {z0.b, z1.b}, p0/z, [%[src_ptr]]     \n"
+      "st1b     {z1.b}, p0, [%[dst_ptr]]             \n"
+
+      "99:                                           \n"
+      : [src_ptr] "+r"(src_ptr),      // %[src_ptr]
+        [dst_ptr] "+r"(dst),          // %[dst_ptr]
+        [dst_width] "+r"(dst_width),  // %[dst_width]
+        [vl] "=r"(vl)                 // %[vl]
+      :
+      : "memory", "cc", "z0", "z1", "p0");
+}
+
+#endif  // !defined(LIBYUV_DISABLE_SME) && defined(CLANG_HAS_SME) &&
+        // defined(__aarch64__)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
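
Usage note (illustrative, not part of the patch): callers never invoke ScaleRowDown2_SME directly. ScalePlaneDown2 selects it at runtime when TestCpuFlag(kCpuHasSME) reports SME support and filtering is kFilterNone, so the new kernel is exercised through libyuv's public scaling entry points. A minimal sketch of such a call via ScalePlane() follows; the HalvePlane helper name and the even width/height assumption are illustrative only.

// Sketch: a 2:1 planar downscale with kFilterNone, which dispatches to the
// ScaleRowDown2_* row kernels (the SME variant when kCpuHasSME is detected).
// HalvePlane is a hypothetical helper, not part of libyuv.
#include "libyuv/scale.h"

void HalvePlane(const uint8_t* src, int src_stride, int width, int height,
                uint8_t* dst, int dst_stride) {
  libyuv::ScalePlane(src, src_stride, width, height, dst, dst_stride,
                     width / 2, height / 2, libyuv::kFilterNone);
}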