[AArch64] Add SVE2 and SME implementations for Convert8To8Row

SVE can make use of the UMULH instruction to avoid needing separate
widening multiply and narrowing steps for the scale application.

Reduction in runtime for Convert8To8Row_SVE2 observed compared to the
existing Neon implementation:

        Cortex-A510: -13.2%
        Cortex-A520: -16.4%
        Cortex-A710: -37.1%
        Cortex-A715: -38.5%
        Cortex-A720: -38.4%
          Cortex-X2: -33.2%
          Cortex-X3: -31.8%
          Cortex-X4: -31.8%
        Cortex-X925: -13.9%

Change-Id: I17c0cb81661c5fbce786b47cdf481549cfdcbfc7
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6207692
Reviewed-by: Wan-Teh Chang <wtc@google.com>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
George Steed 2025-01-28 10:21:16 +00:00 committed by Frank Barchard
parent eacb08c83e
commit c4a0c8d34a
5 changed files with 92 additions and 0 deletions

View File

@@ -553,6 +553,7 @@ extern "C" {
#define HAS_AYUVTOUVROW_SVE2
#define HAS_AYUVTOVUROW_SVE2
#define HAS_BGRATOUVROW_SVE2
#define HAS_CONVERT8TO8ROW_SVE2
#define HAS_DIVIDEROW_16_SVE2
#define HAS_HALFFLOATROW_SVE2
#define HAS_I210ALPHATOARGBROW_SVE2
@@ -595,6 +596,7 @@ extern "C" {
defined(__aarch64__)
#define HAS_ARGBMULTIPLYROW_SME
#define HAS_CONVERT16TO8ROW_SME
#define HAS_CONVERT8TO8ROW_SME
#define HAS_COPYROW_SME
#define HAS_I210ALPHATOARGBROW_SME
#define HAS_I210TOAR30ROW_SME
@@ -3658,6 +3660,16 @@ void Convert8To8Row_Any_NEON(const uint8_t* src_ptr,
int scale,
int bias,
int width);
void Convert8To8Row_SVE2(const uint8_t* src_y,
uint8_t* dst_y,
int scale,
int bias,
int width);
void Convert8To8Row_SME(const uint8_t* src_y,
uint8_t* dst_y,
int scale,
int bias,
int width);
void Convert8To8Row_AVX2(const uint8_t* src_y,
uint8_t* dst_y,
int scale,

View File

@@ -1725,6 +1725,60 @@ static inline void I212ToARGBRow_SVE_SC(const uint16_t* src_y,
: "cc", "memory", YUVTORGB_SVE_REGS);
}
#define CONVERT8TO8_SVE \
"ld1b {z0.b}, p0/z, [%[src]] \n" \
"ld1b {z1.b}, p1/z, [%[src], #1, mul vl] \n" \
"incb %[src], all, mul #2 \n" \
"subs %w[width], %w[width], %w[vl], lsl #1 \n" \
"umulh z0.b, z0.b, z2.b \n" \
"umulh z1.b, z1.b, z2.b \n" \
"prfm pldl1keep, [%[src], 448] \n" \
"add z0.b, z0.b, z3.b \n" \
"add z1.b, z1.b, z3.b \n" \
"st1b {z0.b}, p0, [%[dst]] \n" \
"st1b {z1.b}, p1, [%[dst], #1, mul vl] \n" \
"incb %[dst], all, mul #2 \n"
static inline void Convert8To8Row_SVE_SC(const uint8_t* src_y,
uint8_t* dst_y,
int scale,
int bias,
int width) STREAMING_COMPATIBLE {
uint64_t vl;
asm volatile(
"dup z2.b, %w[scale] \n"
"dup z3.b, %w[bias] \n"
"cntb %[vl] \n"
"subs %w[width], %w[width], %w[vl], lsl #1 \n"
"b.lt 2f \n"
// Run bulk of computation with all-true predicates to avoid predicate
// generation overhead.
"ptrue p0.b \n"
"ptrue p1.b \n"
"1: \n" //
CONVERT8TO8_SVE
"b.ge 1b \n"
"2: \n"
"adds %w[width], %w[width], %w[vl], lsl #1 \n"
"b.eq 99f \n"
// Calculate predicates for the final iteration to deal with the tail.
"whilelt p0.b, wzr, %w2 \n"
"whilelt p1.b, %w[vl], %w2 \n" //
CONVERT8TO8_SVE
"99: \n"
: [src] "+r"(src_y), // %[src]
[dst] "+r"(dst_y), // %[dst]
[width] "+r"(width), // %[width]
[vl] "=&r"(vl) // %[vl]
: [scale] "r"(scale), // %[scale]
[bias] "r"(bias) // %[bias]
: "cc", "memory", "z0", "z1", "z2", "z3", "p0", "p1");
}
#endif // !defined(LIBYUV_DISABLE_SVE) && defined(__aarch64__)
#ifdef __cplusplus

View File

@@ -271,6 +271,16 @@ void Convert8To8Plane(const uint8_t* src_y,
}
}
#endif
#if defined(HAS_CONVERT8TO8ROW_SVE2)
if (TestCpuFlag(kCpuHasSVE2)) {
Convert8To8Row = Convert8To8Row_SVE2;
}
#endif
#if defined(HAS_CONVERT8TO8ROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
Convert8To8Row = Convert8To8Row_SME;
}
#endif
#if defined(HAS_CONVERT8TO8ROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
Convert8To8Row = Convert8To8Row_Any_AVX2;

View File

@@ -1075,6 +1075,14 @@ __arm_locally_streaming void InterpolateRow_16To8_SME(uint8_t* dst_ptr,
: "cc", "memory", "z0", "z1", "z2", "z3", "z4", "z31", "p0");
}
__arm_locally_streaming void Convert8To8Row_SME(const uint8_t* src_y,
uint8_t* dst_y,
int scale,
int bias,
int width) {
Convert8To8Row_SVE_SC(src_y, dst_y, scale, bias, width);
}
#endif // !defined(LIBYUV_DISABLE_SME) && defined(CLANG_HAS_SME) &&
// defined(__aarch64__)

View File

@@ -1237,6 +1237,14 @@ void I212ToARGBRow_SVE2(const uint16_t* src_y,
I212ToARGBRow_SVE_SC(src_y, src_u, src_v, dst_argb, yuvconstants, width);
}
void Convert8To8Row_SVE2(const uint8_t* src_y,
uint8_t* dst_y,
int scale,
int bias,
int width) {
Convert8To8Row_SVE_SC(src_y, dst_y, scale, bias, width);
}
#endif // !defined(LIBYUV_DISABLE_SVE) && defined(__aarch64__)
#ifdef __cplusplus