mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2026-01-01 03:12:16 +08:00
[AArch64] Add SVE2 and SME implementations for Convert8To8Row
SVE can make use of the UMULH instruction to avoid needing separate
widening multiply and narrowing steps for the scale application.
Reduction in runtime for Convert8To8Row_SVE2 observed compared to the
existing Neon implementation:
Cortex-A510: -13.2%
Cortex-A520: -16.4%
Cortex-A710: -37.1%
Cortex-A715: -38.5%
Cortex-A720: -38.4%
Cortex-X2: -33.2%
Cortex-X3: -31.8%
Cortex-X4: -31.8%
Cortex-X925: -13.9%
Change-Id: I17c0cb81661c5fbce786b47cdf481549cfdcbfc7
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6207692
Reviewed-by: Wan-Teh Chang <wtc@google.com>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
parent
eacb08c83e
commit
c4a0c8d34a
@ -553,6 +553,7 @@ extern "C" {
|
||||
#define HAS_AYUVTOUVROW_SVE2
|
||||
#define HAS_AYUVTOVUROW_SVE2
|
||||
#define HAS_BGRATOUVROW_SVE2
|
||||
#define HAS_CONVERT8TO8ROW_SVE2
|
||||
#define HAS_DIVIDEROW_16_SVE2
|
||||
#define HAS_HALFFLOATROW_SVE2
|
||||
#define HAS_I210ALPHATOARGBROW_SVE2
|
||||
@ -595,6 +596,7 @@ extern "C" {
|
||||
defined(__aarch64__)
|
||||
#define HAS_ARGBMULTIPLYROW_SME
|
||||
#define HAS_CONVERT16TO8ROW_SME
|
||||
#define HAS_CONVERT8TO8ROW_SME
|
||||
#define HAS_COPYROW_SME
|
||||
#define HAS_I210ALPHATOARGBROW_SME
|
||||
#define HAS_I210TOAR30ROW_SME
|
||||
@ -3658,6 +3660,16 @@ void Convert8To8Row_Any_NEON(const uint8_t* src_ptr,
|
||||
int scale,
|
||||
int bias,
|
||||
int width);
|
||||
void Convert8To8Row_SVE2(const uint8_t* src_y,
|
||||
uint8_t* dst_y,
|
||||
int scale,
|
||||
int bias,
|
||||
int width);
|
||||
void Convert8To8Row_SME(const uint8_t* src_y,
|
||||
uint8_t* dst_y,
|
||||
int scale,
|
||||
int bias,
|
||||
int width);
|
||||
void Convert8To8Row_AVX2(const uint8_t* src_y,
|
||||
uint8_t* dst_y,
|
||||
int scale,
|
||||
|
||||
@ -1725,6 +1725,60 @@ static inline void I212ToARGBRow_SVE_SC(const uint16_t* src_y,
|
||||
: "cc", "memory", YUVTORGB_SVE_REGS);
|
||||
}
|
||||
|
||||
// One iteration of the 8-bit scale/bias kernel, shared by the main loop and
// the tail of Convert8To8Row_SVE_SC.  Expects: z2 = scale broadcast to all
// byte lanes, z3 = bias broadcast to all byte lanes, p0/p1 = load/store
// predicates for the two vectors, %[vl] = bytes per vector (CNTB).
// Processes 2*VL pixels: UMULH yields the high byte of the unsigned widening
// multiply, i.e. (src * scale) >> 8 in a single instruction (no separate
// widen/narrow steps), then the bias is added.  Also advances %[src]/%[dst]
// by 2*VL and decrements %[width] by 2*VL, leaving the condition flags set
// for the caller's loop branch.
#define CONVERT8TO8_SVE \
  "ld1b {z0.b}, p0/z, [%[src]] \n" \
  "ld1b {z1.b}, p1/z, [%[src], #1, mul vl] \n" \
  "incb %[src], all, mul #2 \n" \
  "subs %w[width], %w[width], %w[vl], lsl #1 \n" \
  "umulh z0.b, z0.b, z2.b \n" \
  "umulh z1.b, z1.b, z2.b \n" \
  "prfm pldl1keep, [%[src], 448] \n" \
  "add z0.b, z0.b, z3.b \n" \
  "add z1.b, z1.b, z3.b \n" \
  "st1b {z0.b}, p0, [%[dst]] \n" \
  "st1b {z1.b}, p1, [%[dst], #1, mul vl] \n" \
  "incb %[dst], all, mul #2 \n"
|
||||
|
||||
// Scales and biases a row of 8-bit pixels:
//   dst_y[i] = ((src_y[i] * scale) >> 8) + bias
// (byte arithmetic; the UMULH in CONVERT8TO8_SVE performs the >> 8).
// Streaming-compatible so the same body can back both the SVE2 and the SME
// (locally-streaming) entry points.
//
// src_y: source row of `width` bytes.
// dst_y: destination row of `width` bytes.
// scale: per-pixel multiplier, applied as (x * scale) >> 8.
// bias:  value added after scaling.
// width: number of pixels; may be any non-negative count — a predicated
//        final iteration handles the tail.
static inline void Convert8To8Row_SVE_SC(const uint8_t* src_y,
                                         uint8_t* dst_y,
                                         int scale,
                                         int bias,
                                         int width) STREAMING_COMPATIBLE {
  uint64_t vl;
  asm volatile(
      // Broadcast scale and bias to every byte lane.
      "dup z2.b, %w[scale] \n"
      "dup z3.b, %w[bias] \n"
      // vl = bytes per vector; each loop iteration consumes 2*vl pixels.
      "cntb %[vl] \n"
      // Pre-decrement width by 2*vl; if fewer than 2*vl pixels remain,
      // skip straight to the tail handling.
      "subs %w[width], %w[width], %w[vl], lsl #1 \n"
      "b.lt 2f \n"

      // Run bulk of computation with all-true predicates to avoid predicate
      // generation overhead.
      "ptrue p0.b \n"
      "ptrue p1.b \n"
      "1: \n" //
      CONVERT8TO8_SVE
      // CONVERT8TO8_SVE's subs set the flags: loop while width >= 0.
      "b.ge 1b \n"

      "2: \n"
      // Undo the pre-decrement; if width is now exactly zero there is no
      // tail to process.
      "adds %w[width], %w[width], %w[vl], lsl #1 \n"
      "b.eq 99f \n"

      // Calculate predicates for the final iteration to deal with the tail.
      // %w2 is the width operand (same register as %w[width]).
      "whilelt p0.b, wzr, %w2 \n"
      "whilelt p1.b, %w[vl], %w2 \n" //
      CONVERT8TO8_SVE

      "99: \n"
      : [src] "+r"(src_y),    // %[src]
        [dst] "+r"(dst_y),    // %[dst]
        [width] "+r"(width),  // %[width]
        [vl] "=&r"(vl)        // %[vl]
      : [scale] "r"(scale),   // %[scale]
        [bias] "r"(bias)      // %[bias]
      : "cc", "memory", "z0", "z1", "z2", "z3", "p0", "p1");
}
|
||||
|
||||
#endif // !defined(LIBYUV_DISABLE_SVE) && defined(__aarch64__)
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
||||
@ -271,6 +271,16 @@ void Convert8To8Plane(const uint8_t* src_y,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_CONVERT8TO8ROW_SVE2)
|
||||
if (TestCpuFlag(kCpuHasSVE2)) {
|
||||
Convert8To8Row = Convert8To8Row_SVE2;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_CONVERT8TO8ROW_SME)
|
||||
if (TestCpuFlag(kCpuHasSME)) {
|
||||
Convert8To8Row = Convert8To8Row_SME;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_CONVERT8TO8ROW_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
Convert8To8Row = Convert8To8Row_Any_AVX2;
|
||||
|
||||
@ -1075,6 +1075,14 @@ __arm_locally_streaming void InterpolateRow_16To8_SME(uint8_t* dst_ptr,
|
||||
: "cc", "memory", "z0", "z1", "z2", "z3", "z4", "z31", "p0");
|
||||
}
|
||||
|
||||
// SME entry point: dst_y[i] = ((src_y[i] * scale) >> 8) + bias.
// __arm_locally_streaming enters SVE streaming mode for the duration of
// this call; the shared streaming-compatible SVE body does the work, so
// the SVE2 and SME paths stay in lockstep.
__arm_locally_streaming void Convert8To8Row_SME(const uint8_t* src_y,
                                                uint8_t* dst_y,
                                                int scale,
                                                int bias,
                                                int width) {
  Convert8To8Row_SVE_SC(src_y, dst_y, scale, bias, width);
}
|
||||
|
||||
#endif // !defined(LIBYUV_DISABLE_SME) && defined(CLANG_HAS_SME) &&
|
||||
// defined(__aarch64__)
|
||||
|
||||
|
||||
@ -1237,6 +1237,14 @@ void I212ToARGBRow_SVE2(const uint16_t* src_y,
|
||||
I212ToARGBRow_SVE_SC(src_y, src_u, src_v, dst_argb, yuvconstants, width);
|
||||
}
|
||||
|
||||
// SVE2 entry point: dst_y[i] = ((src_y[i] * scale) >> 8) + bias.
// Thin wrapper over the streaming-compatible body shared with the SME
// implementation.
void Convert8To8Row_SVE2(const uint8_t* src_y,
                         uint8_t* dst_y,
                         int scale,
                         int bias,
                         int width) {
  Convert8To8Row_SVE_SC(src_y, dst_y, scale, bias, width);
}
|
||||
|
||||
#endif // !defined(LIBYUV_DISABLE_SVE) && defined(__aarch64__)
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user