From 85331e00cc37fd5c04e717c9a7490ed7c71529b6 Mon Sep 17 00:00:00 2001 From: George Steed Date: Fri, 15 Nov 2024 21:28:49 +0000 Subject: [PATCH] [AArch64] Add SME impls of ScaleRowDown2{,Linear,Box}_16 Mostly just straightforward copies of the Neon code ported to Streaming-SVE, these follow the same pattern as the prior ScaleRowDown2 SME kernels, but operating on 16-bit data rather than 8-bit. There is no benefit from this kernel when the SVE vector length is only 128 bits, so skip writing a non-streaming SVE implementation. Change-Id: I7bad0719d24cdb1760d1039c63c0e77726b28a54 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6070784 Reviewed-by: Frank Barchard Reviewed-by: Justin Green --- include/libyuv/scale_row.h | 45 ++++++++----- source/scale.cc | 7 ++ source/scale_sme.cc | 131 +++++++++++++++++++++++++++++++++++++ 3 files changed, 167 insertions(+), 16 deletions(-) diff --git a/include/libyuv/scale_row.h b/include/libyuv/scale_row.h index 7d97f541e..7da122ed1 100644 --- a/include/libyuv/scale_row.h +++ b/include/libyuv/scale_row.h @@ -124,11 +124,12 @@ extern "C" { // The following are available on AArch64 SME platforms: #if !defined(LIBYUV_DISABLE_SME) && defined(CLANG_HAS_SME) && \ defined(__aarch64__) -#define HAS_SCALEROWDOWN2_SME -#define HAS_SCALEUVROWDOWN2_SME -#define HAS_SCALEUVROWDOWN2LINEAR_SME -#define HAS_SCALEUVROWDOWN2BOX_SME #define HAS_SCALEARGBROWDOWN2_SME +#define HAS_SCALEROWDOWN2_16_SME +#define HAS_SCALEROWDOWN2_SME +#define HAS_SCALEUVROWDOWN2BOX_SME +#define HAS_SCALEUVROWDOWN2LINEAR_SME +#define HAS_SCALEUVROWDOWN2_SME #endif #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) @@ -326,10 +327,6 @@ void ScaleRowDown2Box_16_C(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst, int dst_width); -void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width); void ScaleRowDown2Box_16To8_C(const uint16_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, @@ 
-1441,26 +1438,34 @@ void ScaleRowDown2_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width); -void ScaleRowDown2_16_NEON(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width); void ScaleRowDown2_SME(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width); +void ScaleRowDown2_16_NEON(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width); +void ScaleRowDown2_16_SME(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width); void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width); -void ScaleRowDown2Linear_16_NEON(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width); void ScaleRowDown2Linear_SME(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width); +void ScaleRowDown2Linear_16_NEON(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width); +void ScaleRowDown2Linear_16_SME(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width); void ScaleRowDown2Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, @@ -1469,6 +1474,14 @@ void ScaleRowDown2Box_SME(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width); +void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width); +void ScaleRowDown2Box_16_SME(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width); void ScaleRowDown4_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, diff --git a/source/scale.cc b/source/scale.cc index 8b8315043..a4d544ab6 100644 --- a/source/scale.cc +++ b/source/scale.cc @@ -193,6 +193,13 @@ static void ScalePlaneDown2_16(int src_width, : ScaleRowDown2Box_16_NEON; } #endif +#if defined(HAS_SCALEROWDOWN2_16_SME) + if (TestCpuFlag(kCpuHasSME)) { + ScaleRowDown2 = filtering == kFilterNone ? 
ScaleRowDown2_16_SME + : filtering == kFilterLinear ? ScaleRowDown2Linear_16_SME + : ScaleRowDown2Box_16_SME; + } +#endif #if defined(HAS_SCALEROWDOWN2_16_SSE2) if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) { ScaleRowDown2 = diff --git a/source/scale_sme.cc b/source/scale_sme.cc index 9a4014f3d..6b22f24d0 100644 --- a/source/scale_sme.cc +++ b/source/scale_sme.cc @@ -57,6 +57,44 @@ __arm_locally_streaming void ScaleRowDown2_SME(const uint8_t* src_ptr, : "memory", "cc", "z0", "z1", "p0"); } +__arm_locally_streaming void ScaleRowDown2_16_SME(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width) { + // Streaming-SVE only, no use of ZA tile. + (void)src_stride; + int vl; + asm volatile( + "cnth %x[vl] \n" + "subs %w[dst_width], %w[dst_width], %w[vl] \n" + "b.lt 2f \n" + + "1: \n" + "ptrue p0.h \n" + "ld2h {z0.h, z1.h}, p0/z, [%[src_ptr]] \n" + "incb %[src_ptr], all, mul #2 \n" + "subs %w[dst_width], %w[dst_width], %w[vl] \n" + "st1h {z1.h}, p0, [%[dst_ptr]] \n" + "incb %[dst_ptr] \n" + "b.ge 1b \n" + + "2: \n" + "adds %w[dst_width], %w[dst_width], %w[vl] \n" + "b.eq 99f \n" + + "whilelt p0.h, wzr, %w[dst_width] \n" + "ld2h {z0.h, z1.h}, p0/z, [%[src_ptr]] \n" + "st1h {z1.h}, p0, [%[dst_ptr]] \n" + + "99: \n" + : [src_ptr] "+r"(src_ptr), // %[src_ptr] + [dst_ptr] "+r"(dst), // %[dst_ptr] + [dst_width] "+r"(dst_width), // %[dst_width] + [vl] "=r"(vl) // %[vl] + : + : "memory", "cc", "z0", "z1", "p0"); +} + __arm_locally_streaming void ScaleRowDown2Linear_SME(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, @@ -97,6 +135,46 @@ __arm_locally_streaming void ScaleRowDown2Linear_SME(const uint8_t* src_ptr, : "memory", "cc", "z0", "z1", "p0"); } +__arm_locally_streaming void ScaleRowDown2Linear_16_SME(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width) { + // Streaming-SVE only, no use of ZA tile. 
+ (void)src_stride; + int vl; + asm volatile( + "cnth %x[vl] \n" + "subs %w[dst_width], %w[dst_width], %w[vl] \n" + "b.lt 2f \n" + + "1: \n" + "ptrue p0.h \n" + "ld2h {z0.h, z1.h}, p0/z, [%[src_ptr]] \n" + "incb %[src_ptr], all, mul #2 \n" + "urhadd z0.h, p0/m, z0.h, z1.h \n" + "subs %w[dst_width], %w[dst_width], %w[vl] \n" + "st1h {z0.h}, p0, [%[dst_ptr]] \n" + "incb %[dst_ptr] \n" + "b.ge 1b \n" + + "2: \n" + "adds %w[dst_width], %w[dst_width], %w[vl] \n" + "b.eq 99f \n" + + "whilelt p0.h, wzr, %w[dst_width] \n" + "ld2h {z0.h, z1.h}, p0/z, [%[src_ptr]] \n" + "urhadd z0.h, p0/m, z0.h, z1.h \n" + "st1h {z0.h}, p0, [%[dst_ptr]] \n" + + "99: \n" + : [src_ptr] "+r"(src_ptr), // %[src_ptr] + [dst_ptr] "+r"(dst), // %[dst_ptr] + [dst_width] "+r"(dst_width), // %[dst_width] + [vl] "=r"(vl) // %[vl] + : + : "memory", "cc", "z0", "z1", "p0"); +} + #define SCALEROWDOWN2BOX_SVE \ "ld2b {z0.b, z1.b}, p0/z, [%[src_ptr]] \n" \ "ld2b {z2.b, z3.b}, p0/z, [%[src2_ptr]] \n" \ @@ -150,6 +228,59 @@ __arm_locally_streaming void ScaleRowDown2Box_SME(const uint8_t* src_ptr, #undef SCALEROWDOWN2BOX_SVE +#define SCALEROWDOWN2BOX_16_SVE \ + "ld2h {z0.h, z1.h}, p0/z, [%[src_ptr]] \n" \ + "ld2h {z2.h, z3.h}, p0/z, [%[src2_ptr]] \n" \ + "incb %[src_ptr], all, mul #2 \n" \ + "incb %[src2_ptr], all, mul #2 \n" \ + "uaddlb z4.s, z0.h, z1.h \n" \ + "uaddlt z5.s, z0.h, z1.h \n" \ + "uaddlb z6.s, z2.h, z3.h \n" \ + "uaddlt z7.s, z2.h, z3.h \n" \ + "add z4.s, z4.s, z6.s \n" \ + "add z5.s, z5.s, z7.s \n" \ + "rshrnb z0.h, z4.s, #2 \n" \ + "rshrnt z0.h, z5.s, #2 \n" \ + "subs %w[dst_width], %w[dst_width], %w[vl] \n" \ + "st1h {z0.h}, p0, [%[dst_ptr]] \n" \ + "incb %[dst_ptr] \n" + +__arm_locally_streaming void ScaleRowDown2Box_16_SME(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width) { + // Streaming-SVE only, no use of ZA tile. 
+ const uint16_t* src2_ptr = src_ptr + src_stride; + int vl; + asm volatile( + "cnth %x[vl] \n" + "subs %w[dst_width], %w[dst_width], %w[vl] \n" + "b.lt 2f \n" + + "ptrue p0.h \n" + "1: \n" // + SCALEROWDOWN2BOX_16_SVE + "b.ge 1b \n" + + "2: \n" + "adds %w[dst_width], %w[dst_width], %w[vl] \n" + "b.eq 99f \n" + + "whilelt p0.h, wzr, %w[dst_width] \n" // + SCALEROWDOWN2BOX_16_SVE + + "99: \n" + : [src_ptr] "+r"(src_ptr), // %[src_ptr] + [src2_ptr] "+r"(src2_ptr), // %[src2_ptr] + [dst_ptr] "+r"(dst), // %[dst_ptr] + [dst_width] "+r"(dst_width), // %[dst_width] + [vl] "=r"(vl) // %[vl] + : + : "memory", "cc", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "p0"); +} + +#undef SCALEROWDOWN2BOX_16_SVE + __arm_locally_streaming void ScaleUVRowDown2_SME(const uint8_t* src_uv, ptrdiff_t src_stride, uint8_t* dst_uv,