libyuv/source/row_sme.cc
Commit c4a0c8d34a by George Steed: [AArch64] Add SVE2 and SME implementations for Convert8To8Row
SVE can use the UMULH instruction to avoid separate widening-multiply and
narrowing steps when applying the scale.
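Illustratively (a scalar model of what UMULH gives per 16-bit lane; not the
exact fixed-point layout used by libyuv):

    // Hypothetical per-lane model: one instruction yields the high half of
    // the 16x16 product, i.e. (x * scale) >> 16, already narrowed.
    static inline uint8_t scale_u8(uint8_t x, uint16_t scale) {
      return (uint8_t)(((uint32_t)x * scale) >> 16);
    }

The Neon path instead widens to 16 bits, multiplies, and narrows back down in
separate steps.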

Observed reduction in runtime for Convert8To8Row_SVE2 compared to the
existing Neon implementation:

        Cortex-A510: -13.2%
        Cortex-A520: -16.4%
        Cortex-A710: -37.1%
        Cortex-A715: -38.5%
        Cortex-A720: -38.4%
          Cortex-X2: -33.2%
          Cortex-X3: -31.8%
          Cortex-X4: -31.8%
        Cortex-X925: -13.9%

Change-Id: I17c0cb81661c5fbce786b47cdf481549cfdcbfc7
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6207692
Reviewed-by: Wan-Teh Chang <wtc@google.com>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
2025-01-28 15:53:26 -08:00

/*
* Copyright 2024 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/row.h"
#include "libyuv/row_sve.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
#if !defined(LIBYUV_DISABLE_SME) && defined(CLANG_HAS_SME) && \
defined(__aarch64__)
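// The hand-written kernels in this file share a common structure: the bulk of
// each row is processed with an all-true predicate, and any remainder is
// handled by one extra predicated iteration generated with WHILELT. In rough
// scalar terms (illustrative sketch only, not compiled):
//
//   width -= vl;                     // "subs"
//   if (width >= 0) {                // "b.lt 2f" skips the main loop
//     do {
//       // process vl elements with an all-true predicate
//       width -= vl;
//     } while (width >= 0);          // "b.ge 1b"
//   }
//   width += vl;                     // "adds"
//   if (width != 0) {                // "b.eq 99f"
//     // p = whilelt(0, width): partial predicate covering the tail
//     // process one final, partially predicated vector
//   }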
// Read twice as much data from YUV, putting the even elements from the Y data
// in z0.h and odd elements in z1.h.
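// Duplicating each Y byte into both halves of its halfword lane gives
// y * 0x0101, so I444TORGB_SVE_2X can apply the 16-bit Y coefficient with a
// single UMULH (the high 16 bits of the 16x16 product) instead of separate
// widening-multiply and narrowing steps.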
#define READYUV444_SVE_2X \
"ld1b {z0.b}, p1/z, [%[src_y]] \n" \
"ld1b {z2.b}, p1/z, [%[src_u]] \n" \
"ld1b {z3.b}, p1/z, [%[src_v]] \n" \
"incb %[src_y] \n" \
"incb %[src_u] \n" \
"incb %[src_v] \n" \
"prfm pldl1keep, [%[src_y], 448] \n" \
"prfm pldl1keep, [%[src_u], 128] \n" \
"prfm pldl1keep, [%[src_v], 128] \n" \
"trn2 z1.b, z0.b, z0.b \n" \
"trn1 z0.b, z0.b, z0.b \n"
#define I444TORGB_SVE_2X \
"umulh z0.h, z24.h, z0.h \n" /* Y0 */ \
"umulh z1.h, z24.h, z1.h \n" /* Y1 */ \
"umullb z6.h, z30.b, z2.b \n" \
"umullt z7.h, z30.b, z2.b \n" \
"umullb z4.h, z28.b, z2.b \n" /* DB */ \
"umullt z2.h, z28.b, z2.b \n" /* DB */ \
"umlalb z6.h, z31.b, z3.b \n" /* DG */ \
"umlalt z7.h, z31.b, z3.b \n" /* DG */ \
"umullb z5.h, z29.b, z3.b \n" /* DR */ \
"umullt z3.h, z29.b, z3.b \n" /* DR */ \
"add z17.h, z0.h, z26.h \n" /* G */ \
"add z21.h, z1.h, z26.h \n" /* G */ \
"add z16.h, z0.h, z4.h \n" /* B */ \
"add z20.h, z1.h, z2.h \n" /* B */ \
"add z18.h, z0.h, z5.h \n" /* R */ \
"add z22.h, z1.h, z3.h \n" /* R */ \
"uqsub z17.h, z17.h, z6.h \n" /* G */ \
"uqsub z21.h, z21.h, z7.h \n" /* G */ \
"uqsub z16.h, z16.h, z25.h \n" /* B */ \
"uqsub z20.h, z20.h, z25.h \n" /* B */ \
"uqsub z18.h, z18.h, z27.h \n" /* R */ \
"uqsub z22.h, z22.h, z27.h \n" /* R */
#define RGBTOARGB8_SVE_2X \
/* Inputs: B: z16.h, G: z17.h, R: z18.h, A: z19.b */ \
"uqshrnb z16.b, z16.h, #6 \n" /* B0 */ \
"uqshrnb z17.b, z17.h, #6 \n" /* G0 */ \
"uqshrnb z18.b, z18.h, #6 \n" /* R0 */ \
"uqshrnt z16.b, z20.h, #6 \n" /* B1 */ \
"uqshrnt z17.b, z21.h, #6 \n" /* G1 */ \
"uqshrnt z18.b, z22.h, #6 \n" /* R1 */
__arm_locally_streaming void I444ToARGBRow_SME(
const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
// Streaming-SVE only, no use of ZA tile.
uint64_t vl;
asm volatile(
"cntb %[vl] \n"
"ptrue p0.b \n" //
YUVTORGB_SVE_SETUP
"dup z19.b, #255 \n" // A
"subs %w[width], %w[width], %w[vl] \n"
"b.lt 2f \n"
// Run bulk of computation with an all-true predicate to avoid predicate
// generation overhead.
"ptrue p1.b \n"
"1: \n" //
READYUV444_SVE_2X I444TORGB_SVE_2X RGBTOARGB8_SVE_2X
"subs %w[width], %w[width], %w[vl] \n"
"st4b {z16.b, z17.b, z18.b, z19.b}, p1, [%[dst_argb]] \n"
"incb %[dst_argb], all, mul #4 \n"
"b.ge 1b \n"
"2: \n"
"adds %w[width], %w[width], %w[vl] \n"
"b.eq 99f \n"
// Calculate a predicate for the final iteration to deal with the tail.
"whilelt p1.b, wzr, %w[width] \n" //
READYUV444_SVE_2X I444TORGB_SVE_2X RGBTOARGB8_SVE_2X
"st4b {z16.b, z17.b, z18.b, z19.b}, p1, [%[dst_argb]] \n"
"99: \n"
: [src_y] "+r"(src_y), // %[src_y]
[src_u] "+r"(src_u), // %[src_u]
[src_v] "+r"(src_v), // %[src_v]
[dst_argb] "+r"(dst_argb), // %[dst_argb]
[width] "+r"(width), // %[width]
[vl] "=&r"(vl) // %[vl]
: [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
[kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
: "cc", "memory", YUVTORGB_SVE_REGS);
}
__arm_locally_streaming void I400ToARGBRow_SME(
const uint8_t* src_y,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
// Streaming-SVE only, no use of ZA tile.
I400ToARGBRow_SVE_SC(src_y, dst_argb, yuvconstants, width);
}
__arm_locally_streaming void I422ToARGBRow_SME(
const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
// Streaming-SVE only, no use of ZA tile.
I422ToARGBRow_SVE_SC(src_y, src_u, src_v, dst_argb, yuvconstants, width);
}
__arm_locally_streaming void I422ToRGB24Row_SME(
const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
I422ToRGB24Row_SVE_SC(src_y, src_u, src_v, dst_argb, yuvconstants, width);
}
__arm_locally_streaming void I422ToRGB565Row_SME(
const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_rgb565,
const struct YuvConstants* yuvconstants,
int width) {
I422ToRGB565Row_SVE_SC(src_y, src_u, src_v, dst_rgb565, yuvconstants, width);
}
__arm_locally_streaming void I422ToARGB1555Row_SME(
const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_argb1555,
const struct YuvConstants* yuvconstants,
int width) {
I422ToARGB1555Row_SVE_SC(src_y, src_u, src_v, dst_argb1555, yuvconstants,
width);
}
__arm_locally_streaming void I422ToARGB4444Row_SME(
const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_argb4444,
const struct YuvConstants* yuvconstants,
int width) {
I422ToARGB4444Row_SVE_SC(src_y, src_u, src_v, dst_argb4444, yuvconstants,
width);
}
__arm_locally_streaming void I422ToRGBARow_SME(
const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
I422ToRGBARow_SVE_SC(src_y, src_u, src_v, dst_argb, yuvconstants, width);
}
__arm_locally_streaming void I422AlphaToARGBRow_SME(
const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
const uint8_t* src_a,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
I422AlphaToARGBRow_SVE_SC(src_y, src_u, src_v, src_a, dst_argb, yuvconstants,
width);
}
__arm_locally_streaming void I444AlphaToARGBRow_SME(
const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
const uint8_t* src_a,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
I444AlphaToARGBRow_SVE_SC(src_y, src_u, src_v, src_a, dst_argb, yuvconstants,
width);
}
__arm_locally_streaming void NV12ToARGBRow_SME(
const uint8_t* src_y,
const uint8_t* src_uv,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
NV12ToARGBRow_SVE_SC(src_y, src_uv, dst_argb, yuvconstants, width);
}
__arm_locally_streaming void NV21ToARGBRow_SME(
const uint8_t* src_y,
const uint8_t* src_vu,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
NV21ToARGBRow_SVE_SC(src_y, src_vu, dst_argb, yuvconstants, width);
}
__arm_locally_streaming void NV12ToRGB24Row_SME(
const uint8_t* src_y,
const uint8_t* src_uv,
uint8_t* dst_rgb24,
const struct YuvConstants* yuvconstants,
int width) {
NV12ToRGB24Row_SVE_SC(src_y, src_uv, dst_rgb24, yuvconstants, width);
}
__arm_locally_streaming void NV21ToRGB24Row_SME(
const uint8_t* src_y,
const uint8_t* src_vu,
uint8_t* dst_rgb24,
const struct YuvConstants* yuvconstants,
int width) {
NV21ToRGB24Row_SVE_SC(src_y, src_vu, dst_rgb24, yuvconstants, width);
}
__arm_locally_streaming void YUY2ToARGBRow_SME(
const uint8_t* src_yuy2,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
YUY2ToARGBRow_SVE_SC(src_yuy2, dst_argb, yuvconstants, width);
}
__arm_locally_streaming void UYVYToARGBRow_SME(
const uint8_t* src_uyvy,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
UYVYToARGBRow_SVE_SC(src_uyvy, dst_argb, yuvconstants, width);
}
__arm_locally_streaming void I210ToARGBRow_SME(
const uint16_t* src_y,
const uint16_t* src_u,
const uint16_t* src_v,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
I210ToARGBRow_SVE_SC(src_y, src_u, src_v, dst_argb, yuvconstants, width);
}
__arm_locally_streaming void I210AlphaToARGBRow_SME(
const uint16_t* src_y,
const uint16_t* src_u,
const uint16_t* src_v,
const uint16_t* src_a,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
I210AlphaToARGBRow_SVE_SC(src_y, src_u, src_v, src_a, dst_argb, yuvconstants,
width);
}
__arm_locally_streaming void I210ToAR30Row_SME(
const uint16_t* src_y,
const uint16_t* src_u,
const uint16_t* src_v,
uint8_t* dst_ar30,
const struct YuvConstants* yuvconstants,
int width) {
I210ToAR30Row_SVE_SC(src_y, src_u, src_v, dst_ar30, yuvconstants, width);
}
__arm_locally_streaming void P210ToARGBRow_SME(
const uint16_t* src_y,
const uint16_t* src_uv,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
P210ToARGBRow_SVE_SC(src_y, src_uv, dst_argb, yuvconstants, width);
}
__arm_locally_streaming void P210ToAR30Row_SME(
const uint16_t* src_y,
const uint16_t* src_uv,
uint8_t* dst_ar30,
const struct YuvConstants* yuvconstants,
int width) {
P210ToAR30Row_SVE_SC(src_y, src_uv, dst_ar30, yuvconstants, width);
}
__arm_locally_streaming void I410ToARGBRow_SME(
const uint16_t* src_y,
const uint16_t* src_u,
const uint16_t* src_v,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
I410ToARGBRow_SVE_SC(src_y, src_u, src_v, dst_argb, yuvconstants, width);
}
__arm_locally_streaming void I410AlphaToARGBRow_SME(
const uint16_t* src_y,
const uint16_t* src_u,
const uint16_t* src_v,
const uint16_t* src_a,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
I410AlphaToARGBRow_SVE_SC(src_y, src_u, src_v, src_a, dst_argb, yuvconstants,
width);
}
__arm_locally_streaming void I410ToAR30Row_SME(
const uint16_t* src_y,
const uint16_t* src_u,
const uint16_t* src_v,
uint8_t* dst_ar30,
const struct YuvConstants* yuvconstants,
int width) {
I410ToAR30Row_SVE_SC(src_y, src_u, src_v, dst_ar30, yuvconstants, width);
}
__arm_locally_streaming void P410ToARGBRow_SME(
const uint16_t* src_y,
const uint16_t* src_uv,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
P410ToARGBRow_SVE_SC(src_y, src_uv, dst_argb, yuvconstants, width);
}
__arm_locally_streaming void P410ToAR30Row_SME(
const uint16_t* src_y,
const uint16_t* src_uv,
uint8_t* dst_ar30,
const struct YuvConstants* yuvconstants,
int width) {
P410ToAR30Row_SVE_SC(src_y, src_uv, dst_ar30, yuvconstants, width);
}
__arm_locally_streaming void I212ToAR30Row_SME(
const uint16_t* src_y,
const uint16_t* src_u,
const uint16_t* src_v,
uint8_t* dst_ar30,
const struct YuvConstants* yuvconstants,
int width) {
I212ToAR30Row_SVE_SC(src_y, src_u, src_v, dst_ar30, yuvconstants, width);
}
__arm_locally_streaming void I212ToARGBRow_SME(
const uint16_t* src_y,
const uint16_t* src_u,
const uint16_t* src_v,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
I212ToARGBRow_SVE_SC(src_y, src_u, src_v, dst_argb, yuvconstants, width);
}
__arm_locally_streaming void MultiplyRow_16_SME(const uint16_t* src_y,
uint16_t* dst_y,
int scale,
int width) {
// Streaming-SVE only, no use of ZA tile.
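// Computes dst_y[i] = (uint16_t)(src_y[i] * scale), keeping only the low
// 16 bits of the product (plain MUL, no saturation).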
int vl;
asm volatile(
"cnth %x[vl] \n"
"mov z0.h, %w[scale] \n"
"subs %w[width], %w[width], %w[vl] \n"
"b.lt 2f \n"
// Run bulk of computation with an all-true predicate to avoid predicate
// generation overhead.
"ptrue p0.h \n"
"1: \n"
"ld1h {z1.h}, p0/z, [%[src_y]] \n"
"incb %[src_y] \n"
"mul z1.h, z0.h, z1.h \n"
"subs %w[width], %w[width], %w[vl] \n"
"st1h {z1.h}, p0, [%[dst_y]] \n"
"incb %[dst_y] \n"
"b.ge 1b \n"
"2: \n"
"adds %w[width], %w[width], %w[vl] \n"
"b.eq 99f \n"
// Calculate a predicate for the final iteration to deal with the tail.
"whilelt p0.h, wzr, %w[width] \n"
"ld1h {z1.h}, p0/z, [%[src_y]] \n"
"mul z1.h, z0.h, z1.h \n"
"st1h {z1.h}, p0, [%[dst_y]] \n"
"99: \n"
: [src_y] "+r"(src_y), // %[src_y]
[dst_y] "+r"(dst_y), // %[dst_y]
[width] "+r"(width), // %[width]
[vl] "=&r"(vl) // %[vl]
: [scale] "r"(scale) // %[scale]
: "memory", "cc", "z0", "z1", "p0");
}
__arm_locally_streaming void ARGBMultiplyRow_SME(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
// Streaming-SVE only, no use of ZA tile.
width *= 4;
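// width is now in bytes (4 bytes per ARGB pixel). Each output byte is the
// rounding product of the corresponding input bytes, (a * b + 128) >> 8,
// applied to all four channels including alpha.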
int vl;
asm volatile(
"cntb %x[vl] \n"
"subs %w[width], %w[width], %w[vl] \n"
"b.lt 2f \n"
// Run bulk of computation with an all-true predicate to avoid predicate
// generation overhead.
"ptrue p0.b \n"
"1: \n"
"ld1b {z0.b}, p0/z, [%[src_argb]] \n"
"ld1b {z1.b}, p0/z, [%[src_argb1]] \n"
"incb %[src_argb] \n"
"incb %[src_argb1] \n"
"umullb z2.h, z0.b, z1.b \n"
"umullt z1.h, z0.b, z1.b \n"
"rshrnb z0.b, z2.h, #8 \n"
"rshrnt z0.b, z1.h, #8 \n"
"subs %w[width], %w[width], %w[vl] \n"
"st1b {z0.b}, p0, [%[dst_argb]] \n"
"incb %[dst_argb] \n"
"b.ge 1b \n"
"2: \n"
"adds %w[width], %w[width], %w[vl] \n"
"b.eq 99f \n"
// Calculate a predicate for the final iteration to deal with the tail.
"whilelt p0.b, wzr, %w[width] \n"
"ld1b {z0.b}, p0/z, [%[src_argb]] \n"
"ld1b {z1.b}, p0/z, [%[src_argb1]] \n"
"umullb z2.h, z0.b, z1.b \n"
"umullt z1.h, z0.b, z1.b \n"
"rshrnb z0.b, z2.h, #8 \n"
"rshrnt z0.b, z1.h, #8 \n"
"st1b {z0.b}, p0, [%[dst_argb]] \n"
"99: \n"
: [src_argb] "+r"(src_argb), // %[src_argb]
[src_argb1] "+r"(src_argb1), // %[src_argb1]
[dst_argb] "+r"(dst_argb), // %[dst_argb]
[width] "+r"(width), // %[width]
[vl] "=&r"(vl) // %[vl]
:
: "memory", "cc", "z0", "z1", "z2", "p0", "p1");
}
__arm_locally_streaming void MergeUVRow_SME(const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_uv,
int width) {
// Streaming-SVE only, no use of ZA tile.
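// Interleaves the two planes: dst_uv[2 * i] = src_u[i] and
// dst_uv[2 * i + 1] = src_v[i]; ST2B performs the interleave on store.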
int vl;
asm volatile(
"cntb %x[vl] \n"
"subs %w[width], %w[width], %w[vl] \n"
"b.lt 2f \n"
// Run bulk of computation with an all-true predicate to avoid predicate
// generation overhead.
"ptrue p0.b \n"
"1: \n"
"ld1b {z1.b}, p0/z, [%[src_u]] \n"
"ld1b {z2.b}, p0/z, [%[src_v]] \n"
"incb %[src_u] \n"
"incb %[src_v] \n"
"subs %w[width], %w[width], %w[vl] \n"
"st2b {z1.b, z2.b}, p0, [%[dst_uv]] \n"
"incb %[dst_uv], all, mul #2 \n"
"b.ge 1b \n"
"2: \n"
"adds %w[width], %w[width], %w[vl] \n"
"b.eq 99f \n"
// Calculate a predicate for the final iteration to deal with the tail.
"whilelt p0.b, wzr, %w[width] \n"
"ld1b {z1.b}, p0/z, [%[src_u]] \n"
"ld1b {z2.b}, p0/z, [%[src_v]] \n"
"subs %w[width], %w[width], %w[vl] \n"
"st2b {z1.b, z2.b}, p0, [%[dst_uv]] \n"
"99: \n"
: [src_u] "+r"(src_u), // %[src_u]
[src_v] "+r"(src_v), // %[src_v]
[dst_uv] "+r"(dst_uv), // %[dst_uv]
[width] "+r"(width), // %[width]
[vl] "=&r"(vl) // %[vl]
:
: "memory", "cc", "z0", "z1", "z2", "p0");
}
__arm_locally_streaming void MergeUVRow_16_SME(const uint16_t* src_u,
const uint16_t* src_v,
uint16_t* dst_uv,
int depth,
int width) {
int shift = 16 - depth;
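// Each depth-bit sample is shifted left by (16 - depth) so it becomes
// MSB-aligned in its 16-bit lane before the U and V planes are interleaved,
// e.g. depth == 10 gives a shift of 6.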
// Streaming-SVE only, no use of ZA tile.
int vl;
asm volatile(
"cnth %x[vl] \n"
"mov z0.h, %w[shift] \n"
"subs %w[width], %w[width], %w[vl] \n"
"b.lt 2f \n"
// Run bulk of computation with an all-true predicate to avoid predicate
// generation overhead.
"ptrue p0.h \n"
"1: \n"
"ld1h {z1.h}, p0/z, [%[src_u]] \n"
"ld1h {z2.h}, p0/z, [%[src_v]] \n"
"incb %[src_u] \n"
"incb %[src_v] \n"
"lsl z1.h, p0/m, z1.h, z0.h \n"
"lsl z2.h, p0/m, z2.h, z0.h \n"
"subs %w[width], %w[width], %w[vl] \n"
"st2h {z1.h, z2.h}, p0, [%[dst_uv]] \n"
"incb %[dst_uv], all, mul #2 \n"
"b.ge 1b \n"
"2: \n"
"adds %w[width], %w[width], %w[vl] \n"
"b.eq 99f \n"
// Calculate a predicate for the final iteration to deal with the tail.
"whilelt p0.h, wzr, %w[width] \n"
"ld1h {z1.h}, p0/z, [%[src_u]] \n"
"ld1h {z2.h}, p0/z, [%[src_v]] \n"
"lsl z1.h, p0/m, z1.h, z0.h \n"
"lsl z2.h, p0/m, z2.h, z0.h \n"
"subs %w[width], %w[width], %w[vl] \n"
"st2h {z1.h, z2.h}, p0, [%[dst_uv]] \n"
"99: \n"
: [src_u] "+r"(src_u), // %[src_u]
[src_v] "+r"(src_v), // %[src_v]
[dst_uv] "+r"(dst_uv), // %[dst_uv]
[width] "+r"(width), // %[width]
[vl] "=&r"(vl) // %[vl]
: [shift] "r"(shift) // %[shift]
: "memory", "cc", "z0", "z1", "z2", "p0");
}
// Use scale to convert lsb formats to msb, depending on how many bits there are:
// 32768 = 9 bits = shr 1
// 16384 = 10 bits = shr 2
// 4096 = 12 bits = shr 4
// 256 = 16 bits = shr 8
__arm_locally_streaming void Convert16To8Row_SME(const uint16_t* src_y,
uint8_t* dst_y,
int scale,
int width) {
// 15 - clz(scale), + 8 to shift result into the high half of the lane to
// saturate, then we can just use UZP2 to narrow rather than a pair of
// saturating narrow instructions.
int shift = 23 - __builtin_clz((int32_t)scale);
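// For example, scale == 16384 (10-bit input): clz == 17, so shift == 6. UQSHL
// saturates the value into bits [15:6], and UZP2 then keeps the high byte of
// each halfword, which is equivalent to a saturating shift right by 2.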
int vl;
asm volatile(
"cntb %x[vl] \n"
"dup z0.h, %w[shift] \n"
"subs %w[width], %w[width], %w[vl] \n"
"b.lt 2f \n"
// Run bulk of computation with an all-true predicate to avoid predicate
// generation overhead.
"ptrue p0.b \n"
"1: \n"
"ld1h {z1.h}, p0/z, [%[src_y]] \n"
"ld1h {z2.h}, p0/z, [%[src_y], #1, mul vl] \n"
"incb %[src_y], all, mul #2 \n"
"uqshl z1.h, p0/m, z1.h, z0.h \n"
"uqshl z2.h, p0/m, z2.h, z0.h \n"
"subs %w[width], %w[width], %w[vl] \n"
"uzp2 z1.b, z1.b, z2.b \n"
"st1b {z1.b}, p0, [%[dst_y]] \n"
"incb %[dst_y] \n"
"b.ge 1b \n"
"2: \n"
"adds %w[width], %w[width], %w[vl] \n"
"b.eq 99f \n"
// Calculate a predicate for the final iteration to deal with the tail.
// We need separate predicates for the load and store instructions since
// they are operating on different element sizes (.b vs .h).
"cnth %x[vl] \n"
"whilelt p0.h, wzr, %w[width] \n"
"whilelt p1.h, %w[vl], %w[width] \n"
"whilelt p2.b, wzr, %w[width] \n"
"ld1h {z1.h}, p0/z, [%[src_y]] \n"
"ld1h {z2.h}, p1/z, [%[src_y], #1, mul vl] \n"
"uqshl z1.h, p0/m, z1.h, z0.h \n"
"uqshl z2.h, p1/m, z2.h, z0.h \n"
"uzp2 z1.b, z1.b, z2.b \n"
"st1b {z1.b}, p2, [%[dst_y]] \n"
"99: \n"
: [src_y] "+r"(src_y), // %[src_y]
[dst_y] "+r"(dst_y), // %[dst_y]
[width] "+r"(width), // %[width]
[vl] "=&r"(vl) // %[vl]
: [shift] "r"(shift) // %[shift]
: "cc", "memory", "z0", "z1", "z2", "p0", "p1", "p2");
}
__arm_locally_streaming void CopyRow_SME(const uint8_t* src,
uint8_t* dst,
int width) {
// Streaming-SVE only, no use of ZA tile.
int vl;
asm volatile(
"cntb %x[vl] \n"
"subs %w[width], %w[width], %w[vl] \n"
"b.lt 2f \n"
// Run bulk of computation with an all-true predicate to avoid predicate
// generation overhead.
"ptrue p0.b \n"
"1: \n"
"ld1b {z0.b}, p0/z, [%[src]] \n"
"incb %[src] \n"
"subs %w[width], %w[width], %w[vl] \n"
"st1b {z0.b}, p0, [%[dst]] \n"
"incb %[dst] \n"
"b.ge 1b \n"
"2: \n"
"adds %w[width], %w[width], %w[vl] \n"
"b.eq 99f \n"
// Calculate a predicate for the final iteration to deal with the tail.
"whilelt p0.b, wzr, %w[width] \n"
"ld1b {z0.b}, p0/z, [%[src]] \n"
"st1b {z0.b}, p0, [%[dst]] \n"
"99: \n"
: [src] "+r"(src), // %[src]
[dst] "+r"(dst), // %[dst]
[width] "+r"(width), // %[width]
[vl] "=&r"(vl) // %[vl]
:
: "memory", "cc", "z0", "p0");
}
__arm_locally_streaming static void HalfRow_SME(uint8_t* dst_ptr,
const uint8_t* src_ptr,
ptrdiff_t src_stride,
int width) {
const uint8_t* src_ptr1 = src_ptr + src_stride;
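// dst_ptr[i] = (src_ptr[i] + src_ptr1[i] + 1) >> 1; URHADD provides the
// rounding halving add directly.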
int vl;
asm volatile(
"cntb %x[vl] \n"
"subs %w[width], %w[width], %w[vl] \n"
"b.lt 2f \n"
// Run bulk of computation with an all-true predicate to avoid predicate
// generation overhead.
"ptrue p0.b \n"
"1: \n"
"ld1b {z2.b}, p0/z, [%[src_ptr]] \n"
"ld1b {z3.b}, p0/z, [%[src_ptr1]] \n"
"incb %[src_ptr] \n"
"incb %[src_ptr1] \n"
"urhadd z2.b, p0/m, z2.b, z3.b \n"
"subs %w[width], %w[width], %w[vl] \n"
"st1b {z2.b}, p0, [%[dst_ptr]] \n"
"incb %[dst_ptr] \n"
"b.ge 1b \n"
"2: \n"
"adds %w[width], %w[width], %w[vl] \n"
"b.eq 99f \n"
// Calculate a predicate for the final iteration to deal with the tail.
"whilelt p0.b, wzr, %w[width] \n"
"ld1b {z2.b}, p0/z, [%[src_ptr]] \n"
"ld1b {z3.b}, p0/z, [%[src_ptr1]] \n"
"urhadd z2.b, p0/m, z2.b, z3.b \n"
"subs %w[width], %w[width], %w[vl] \n"
"st1b {z2.b}, p0, [%[dst_ptr]] \n"
"99: \n"
: [src_ptr] "+r"(src_ptr), // %[src_ptr]
[src_ptr1] "+r"(src_ptr1), // %[src_ptr1]
[dst_ptr] "+r"(dst_ptr), // %[dst_ptr]
[width] "+r"(width), // %[width]
[vl] "=&r"(vl) // %[vl]
:
: "cc", "memory", "z0", "z1", "z2", "z3", "p0");
}
__arm_locally_streaming void InterpolateRow_SME(uint8_t* dst_ptr,
const uint8_t* src_ptr,
ptrdiff_t src_stride,
int width,
int source_y_fraction) {
int y1_fraction = source_y_fraction;
int y0_fraction = 256 - y1_fraction;
const uint8_t* src_ptr1 = src_ptr + src_stride;
if (y0_fraction == 0) {
CopyRow_SME(src_ptr1, dst_ptr, width);
return;
}
if (y0_fraction == 128) {
HalfRow_SME(dst_ptr, src_ptr, src_stride, width);
return;
}
if (y0_fraction == 256) {
CopyRow_SME(src_ptr, dst_ptr, width);
return;
}
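// General case: dst_ptr[i] = (src_ptr[i] * y0_fraction +
// src_ptr1[i] * y1_fraction + 128) >> 8, with the rounding provided by RSHRN.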
int vl;
asm volatile(
"cntb %x[vl] \n"
"dup z0.b, %w[y0_fraction] \n"
"dup z1.b, %w[y1_fraction] \n"
"subs %w[width], %w[width], %w[vl] \n"
"b.lt 2f \n"
// Run bulk of computation with an all-true predicate to avoid predicate
// generation overhead.
"ptrue p0.b \n"
"1: \n"
"ld1b {z2.b}, p0/z, [%[src_ptr]] \n"
"ld1b {z3.b}, p0/z, [%[src_ptr1]] \n"
"incb %[src_ptr] \n"
"incb %[src_ptr1] \n"
"umullb z4.h, z2.b, z0.b \n"
"umullt z2.h, z2.b, z0.b \n"
"subs %w[width], %w[width], %w[vl] \n"
"umlalb z4.h, z3.b, z1.b \n"
"umlalt z2.h, z3.b, z1.b \n"
"rshrnb z3.b, z4.h, #8 \n"
"rshrnt z3.b, z2.h, #8 \n"
"st1b {z3.b}, p0, [%[dst_ptr]] \n"
"incb %[dst_ptr] \n"
"b.ge 1b \n"
"2: \n"
"adds %w[width], %w[width], %w[vl] \n"
"b.eq 99f \n"
// Calculate a predicate for the final iteration to deal with the tail.
"whilelt p0.b, wzr, %w[width] \n"
"ld1b {z2.b}, p0/z, [%[src_ptr]] \n"
"ld1b {z3.b}, p0/z, [%[src_ptr1]] \n"
"umullb z4.h, z2.b, z0.b \n"
"umullt z2.h, z2.b, z0.b \n"
"umlalb z4.h, z3.b, z1.b \n"
"umlalt z2.h, z3.b, z1.b \n"
"rshrnb z3.b, z4.h, #8 \n"
"rshrnt z3.b, z2.h, #8 \n"
"st1b {z3.b}, p0, [%[dst_ptr]] \n"
"99: \n"
: [src_ptr] "+r"(src_ptr), // %[src_ptr]
[src_ptr1] "+r"(src_ptr1), // %[src_ptr1]
[dst_ptr] "+r"(dst_ptr), // %[dst_ptr]
[width] "+r"(width), // %[width]
[vl] "=&r"(vl) // %[vl]
: [y0_fraction] "r"(y0_fraction), // %[y0_fraction]
[y1_fraction] "r"(y1_fraction) // %[y1_fraction]
: "cc", "memory", "z0", "z1", "z2", "z3", "z4", "p0");
}
__arm_locally_streaming static void HalfRow_16_SME(uint16_t* dst_ptr,
const uint16_t* src_ptr,
ptrdiff_t src_stride,
int width) {
const uint16_t* src_ptr1 = src_ptr + src_stride;
int vl;
asm volatile(
"cnth %x[vl] \n"
"subs %w[width], %w[width], %w[vl] \n"
"b.lt 2f \n"
// Run bulk of computation with an all-true predicate to avoid predicate
// generation overhead.
"ptrue p0.h \n"
"1: \n"
"ld1h {z2.h}, p0/z, [%[src_ptr]] \n"
"ld1h {z3.h}, p0/z, [%[src_ptr1]] \n"
"incb %[src_ptr] \n"
"incb %[src_ptr1] \n"
"urhadd z2.h, p0/m, z2.h, z3.h \n"
"subs %w[width], %w[width], %w[vl] \n"
"st1h {z2.h}, p0, [%[dst_ptr]] \n"
"incb %[dst_ptr] \n"
"b.ge 1b \n"
"2: \n"
"adds %w[width], %w[width], %w[vl] \n"
"b.eq 99f \n"
// Calculate a predicate for the final iteration to deal with the tail.
"whilelt p0.h, wzr, %w[width] \n"
"ld1h {z2.h}, p0/z, [%[src_ptr]] \n"
"ld1h {z3.h}, p0/z, [%[src_ptr1]] \n"
"urhadd z2.h, p0/m, z2.h, z3.h \n"
"st1h {z2.h}, p0, [%[dst_ptr]] \n"
"99: \n"
: [src_ptr] "+r"(src_ptr), // %[src_ptr]
[src_ptr1] "+r"(src_ptr1), // %[src_ptr1]
[dst_ptr] "+r"(dst_ptr), // %[dst_ptr]
[width] "+r"(width), // %[width]
[vl] "=&r"(vl) // %[vl]
:
: "cc", "memory", "z0", "z1", "z2", "z3", "p0");
}
__arm_locally_streaming void InterpolateRow_16_SME(uint16_t* dst_ptr,
const uint16_t* src_ptr,
ptrdiff_t src_stride,
int width,
int source_y_fraction) {
int y1_fraction = source_y_fraction;
int y0_fraction = 256 - y1_fraction;
const uint16_t* src_ptr1 = src_ptr + src_stride;
if (y0_fraction == 0) {
CopyRow_SME((const uint8_t*)src_ptr1, (uint8_t*)dst_ptr,
width * sizeof(uint16_t));
return;
}
if (y0_fraction == 128) {
HalfRow_16_SME(dst_ptr, src_ptr, src_stride, width);
return;
}
if (y0_fraction == 256) {
CopyRow_SME((const uint8_t*)src_ptr, (uint8_t*)dst_ptr,
width * sizeof(uint16_t));
return;
}
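// General case, as in InterpolateRow_SME but on 16-bit samples with 32-bit
// intermediates: dst_ptr[i] = (src_ptr[i] * y0_fraction +
// src_ptr1[i] * y1_fraction + 128) >> 8.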
int vl;
asm volatile(
"cnth %x[vl] \n"
"subs %w[width], %w[width], %w[vl] \n"
"dup z0.h, %w[y0_fraction] \n"
"dup z1.h, %w[y1_fraction] \n"
"b.lt 2f \n"
// Run bulk of computation with an all-true predicate to avoid predicate
// generation overhead.
"ptrue p0.h \n"
"1: \n"
"ld1h {z2.h}, p0/z, [%[src_ptr]] \n"
"ld1h {z3.h}, p0/z, [%[src_ptr1]] \n"
"incb %[src_ptr] \n"
"incb %[src_ptr1] \n"
"umullb z4.s, z2.h, z0.h \n"
"umullt z2.s, z2.h, z0.h \n"
"subs %w[width], %w[width], %w[vl] \n"
"umlalb z4.s, z3.h, z1.h \n"
"umlalt z2.s, z3.h, z1.h \n"
"rshrnb z3.h, z4.s, #8 \n"
"rshrnt z3.h, z2.s, #8 \n"
"st1h {z3.h}, p0, [%[dst_ptr]] \n"
"incb %[dst_ptr] \n"
"b.ge 1b \n"
"2: \n"
"adds %w[width], %w[width], %w[vl] \n"
"b.eq 99f \n"
// Calculate a predicate for the final iteration to deal with the tail.
"whilelt p0.h, wzr, %w[width] \n"
"ld1h {z2.h}, p0/z, [%[src_ptr]] \n"
"ld1h {z3.h}, p0/z, [%[src_ptr1]] \n"
"umullb z4.s, z2.h, z0.h \n"
"umullt z2.s, z2.h, z0.h \n"
"umlalb z4.s, z3.h, z1.h \n"
"umlalt z2.s, z3.h, z1.h \n"
"rshrnb z3.h, z4.s, #8 \n"
"rshrnt z3.h, z2.s, #8 \n"
"st1h {z3.h}, p0, [%[dst_ptr]] \n"
"99: \n"
: [src_ptr] "+r"(src_ptr), // %[src_ptr]
[src_ptr1] "+r"(src_ptr1), // %[src_ptr1]
[dst_ptr] "+r"(dst_ptr), // %[dst_ptr]
[width] "+r"(width), // %[width]
[vl] "=&r"(vl) // %[vl]
: [y0_fraction] "r"(y0_fraction), // %[y0_fraction]
[y1_fraction] "r"(y1_fraction) // %[y1_fraction]
: "cc", "memory", "z0", "z1", "z2", "z3", "z4", "p0");
}
__arm_locally_streaming static void HalfRow_16To8_SME(uint8_t* dst_ptr,
const uint16_t* src_ptr,
ptrdiff_t src_stride,
int scale,
int width) {
const uint16_t* src_ptr1 = src_ptr + src_stride;
// 15 - clz(scale), + 8 to shift the result into the high half of the lane so
// that it saturates, after which a single SHRNB #8 narrows to 8 bits rather
// than needing a pair of saturating narrow instructions.
int shift = 23 - __builtin_clz((int32_t)scale);
int vl;
asm volatile(
"cnth %x[vl] \n"
"dup z31.h, %w[shift] \n"
"subs %w[width], %w[width], %w[vl] \n"
"b.lt 2f \n"
// Run bulk of computation with an all-true predicate to avoid predicate
// generation overhead.
"ptrue p0.h \n"
"1: \n"
"ld1h {z2.h}, p0/z, [%[src_ptr]] \n"
"ld1h {z3.h}, p0/z, [%[src_ptr1]] \n"
"incb %[src_ptr] \n"
"incb %[src_ptr1] \n"
"urhadd z2.h, p0/m, z2.h, z3.h \n"
"subs %w[width], %w[width], %w[vl] \n"
"uqshl z2.h, p0/m, z2.h, z31.h \n"
"shrnb z2.b, z2.h, #8 \n"
"st1b {z2.h}, p0, [%[dst_ptr]] \n"
"inch %[dst_ptr] \n"
"b.ge 1b \n"
"2: \n"
"adds %w[width], %w[width], %w[vl] \n"
"b.eq 99f \n"
// Calculate a predicate for the final iteration to deal with the tail.
"whilelt p0.h, wzr, %w[width] \n"
"ld1h {z2.h}, p0/z, [%[src_ptr]] \n"
"ld1h {z3.h}, p0/z, [%[src_ptr1]] \n"
"urhadd z2.h, p0/m, z2.h, z3.h \n"
"uqshl z2.h, p0/m, z2.h, z31.h \n"
"shrnb z2.b, z2.h, #8 \n"
"st1b {z2.h}, p0, [%[dst_ptr]] \n"
"99: \n"
: [src_ptr] "+r"(src_ptr), // %[src_ptr]
[src_ptr1] "+r"(src_ptr1), // %[src_ptr1]
[dst_ptr] "+r"(dst_ptr), // %[dst_ptr]
[width] "+r"(width), // %[width]
[vl] "=&r"(vl) // %[vl]
: [shift] "r"(shift) // %[shift]
: "cc", "memory", "z0", "z1", "z2", "z3", "z31", "p0");
}
// Use scale to convert lsb formats to msb, depending on how many bits there are:
// 32768 = 9 bits
// 16384 = 10 bits
// 4096 = 12 bits
// 256 = 16 bits
// TODO(fbarchard): change scale to bits
__arm_locally_streaming void InterpolateRow_16To8_SME(uint8_t* dst_ptr,
const uint16_t* src_ptr,
ptrdiff_t src_stride,
int scale,
int width,
int source_y_fraction) {
int y1_fraction = source_y_fraction;
int y0_fraction = 256 - y1_fraction;
const uint16_t* src_ptr1 = src_ptr + src_stride;
// The y0_fraction == 0 case never occurs here.
if (y0_fraction == 128) {
HalfRow_16To8_SME(dst_ptr, src_ptr, src_stride, scale, width);
return;
}
if (y0_fraction == 256) {
Convert16To8Row_SME(src_ptr, dst_ptr, scale, width);
return;
}
// 15 - clz(scale), + 8 to shift the result into the high half of the lane so
// that it saturates, after which a single SHRNB #8 narrows to 8 bits rather
// than needing a pair of saturating narrow instructions.
int shift = 23 - __builtin_clz((int32_t)scale);
int vl;
asm volatile(
"cnth %x[vl] \n"
"dup z31.h, %w[shift] \n"
"dup z0.h, %w[y0_fraction] \n"
"dup z1.h, %w[y1_fraction] \n"
"subs %w[width], %w[width], %w[vl] \n"
"b.lt 2f \n"
// Run bulk of computation with an all-true predicate to avoid predicate
// generation overhead.
"ptrue p0.h \n"
"1: \n"
"ld1h {z2.h}, p0/z, [%[src_ptr]] \n"
"ld1h {z3.h}, p0/z, [%[src_ptr1]] \n"
"incb %[src_ptr] \n"
"incb %[src_ptr1] \n"
"umullb z4.s, z2.h, z0.h \n"
"umullt z2.s, z2.h, z0.h \n"
"subs %w[width], %w[width], %w[vl] \n"
"umlalb z4.s, z3.h, z1.h \n"
"umlalt z2.s, z3.h, z1.h \n"
"rshrnb z3.h, z4.s, #8 \n"
"rshrnt z3.h, z2.s, #8 \n"
"uqshl z3.h, p0/m, z3.h, z31.h \n"
"shrnb z3.b, z3.h, #8 \n"
"st1b {z3.h}, p0, [%[dst_ptr]] \n"
"inch %[dst_ptr] \n"
"b.ge 1b \n"
"2: \n"
"adds %w[width], %w[width], %w[vl] \n"
"b.eq 99f \n"
// Calculate a predicate for the final iteration to deal with the tail.
"whilelt p0.h, wzr, %w[width] \n"
"ld1h {z2.h}, p0/z, [%[src_ptr]] \n"
"ld1h {z3.h}, p0/z, [%[src_ptr1]] \n"
"umullb z4.s, z2.h, z0.h \n"
"umullt z2.s, z2.h, z0.h \n"
"umlalb z4.s, z3.h, z1.h \n"
"umlalt z2.s, z3.h, z1.h \n"
"rshrnb z3.h, z4.s, #8 \n"
"rshrnt z3.h, z2.s, #8 \n"
"uqshl z3.h, p0/m, z3.h, z31.h \n"
"shrnb z3.b, z3.h, #8 \n"
"st1b {z3.h}, p0, [%[dst_ptr]] \n"
"99: \n"
: [src_ptr] "+r"(src_ptr), // %[src_ptr]
[src_ptr1] "+r"(src_ptr1), // %[src_ptr1]
[dst_ptr] "+r"(dst_ptr), // %[dst_ptr]
[width] "+r"(width), // %[width]
[vl] "=&r"(vl) // %[vl]
: [y0_fraction] "r"(y0_fraction), // %[y0_fraction]
[y1_fraction] "r"(y1_fraction), // %[y1_fraction]
[shift] "r"(shift) // %[shift]
: "cc", "memory", "z0", "z1", "z2", "z3", "z4", "z31", "p0");
}
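// Applies an 8-bit scale/bias conversion; the streaming-compatible SVE2
// helper applies the scale with UMULH, so no separate widening multiply and
// narrowing steps are needed (see the commit message above).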
__arm_locally_streaming void Convert8To8Row_SME(const uint8_t* src_y,
uint8_t* dst_y,
int scale,
int bias,
int width) {
Convert8To8Row_SVE_SC(src_y, dst_y, scale, bias, width);
}
#endif // !defined(LIBYUV_DISABLE_SME) && defined(CLANG_HAS_SME) &&
// defined(__aarch64__)
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif