[AArch64] Unroll I422ToAR30Row_{SVE2,SME}

The existing STOREAR30_SVE macro works fine for out-of-order cores;
however, for in-order cores the number of dependent vector instructions
laid out consecutively impacts performance.

We can improve this by unrolling the loop to process two sets of vectors
at a time, allowing little cores to process two independent streams of
vector instructions at the same time to improve performance. Using one
set of ZIP instructions at the end allows us to (a) avoid ST4 which we
know is slow on some micro-architectures, and (b) enable the use of
predication and avoid the need for separate "any" kernels.

Reduction in run times of I422ToAR30Row_SVE2 observed compared to the
previous SVE2 implementation:

Cortex-A510: -37.7%
Cortex-A520: -38.8%
Cortex-A710: -14.8%
Cortex-A715: -17.1%
Cortex-A720: -16.9%
  Cortex-X2: -10.3%
  Cortex-X3:  -6.7%
  Cortex-X4:  -9.4%
Cortex-X925:  -7.1%

Change-Id: I160fb41300d2d08fce2e6eb92181324fd723a02d
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6632916
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Justin Green <greenjustin@google.com>
This commit is contained in:
George Steed 2025-06-04 10:47:48 +01:00 committed by Frank Barchard
parent 843cda7e7b
commit 867bdc51ed

View File

@ -342,6 +342,7 @@ extern "C" {
// rather than needing a pair of shifts to saturate and then insert into the
// correct position in the lane.
#define STOREAR30_SVE \
/* Inputs: B: z16.h, G: z17.h, R: z18.h */ \
"uqshl z16.h, p0/m, z16.h, #2 \n" /* bbbbbbbbbbxxxxxx */ \
"uqshl z17.h, p0/m, z17.h, #2 \n" /* ggggggggggxxxxxx */ \
"umin z18.h, p0/m, z18.h, z23.h \n" /* 00rrrrrrrrrrxxxx */ \
@ -352,6 +353,31 @@ extern "C" {
"st2h {z17.h, z18.h}, p1, [%[dst_ar30]] \n" \
"incb %[dst_ar30], all, mul #2 \n"
#define STOREAR30_SVE_2X \
/* Inputs: B: z16.h, G: z17.h, R: z18.h */ \
/* B: z20.h, G: z21.h, R: z22.h */ \
"uqshl z16.h, p0/m, z16.h, #2 \n" /* bbbbbbbbbbxxxxxx */ \
"uqshl z20.h, p0/m, z20.h, #2 \n" /* bbbbbbbbbbxxxxxx */ \
"uqshl z17.h, p0/m, z17.h, #2 \n" /* ggggggggggxxxxxx */ \
"uqshl z21.h, p0/m, z21.h, #2 \n" /* ggggggggggxxxxxx */ \
"umin z18.h, p0/m, z18.h, z23.h \n" /* 00rrrrrrrrrrxxxx */ \
"umin z22.h, p0/m, z22.h, z23.h \n" /* 00rrrrrrrrrrxxxx */ \
"orr z18.h, z18.h, #0xc000 \n" /* 11rrrrrrrrrrxxxx */ \
"orr z22.h, z22.h, #0xc000 \n" /* 11rrrrrrrrrrxxxx */ \
"sri z18.h, z17.h, #12 \n" /* 11rrrrrrrrrrgggg */ \
"sri z22.h, z21.h, #12 \n" /* 11rrrrrrrrrrgggg */ \
"lsl z17.h, z17.h, #4 \n" /* ggggggxxxxxx0000 */ \
"lsl z19.h, z21.h, #4 \n" /* ggggggxxxxxx0000 */ \
"sri z17.h, z16.h, #6 \n" /* ggggggbbbbbbbbbb */ \
"sri z19.h, z20.h, #6 \n" /* ggggggbbbbbbbbbb */ \
"zip2 z16.h, z17.h, z19.h \n" \
"zip1 z21.h, z17.h, z19.h \n" \
"zip2 z17.h, z18.h, z22.h \n" \
"zip1 z22.h, z18.h, z22.h \n" \
"st2h {z21.h, z22.h}, p2, [%[dst_ar30]] \n" \
"st2h {z16.h, z17.h}, p3, [%[dst_ar30], #2, mul vl] \n" \
"incb %[dst_ar30], all, mul #4 \n"
#define YUVTORGB_SVE_REGS \
"z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z16", "z17", "z18", "z19", \
"z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", \
@ -776,32 +802,35 @@ static inline void I422ToAR30Row_SVE_SC(const uint8_t* src_y,
const struct YuvConstants* yuvconstants,
int width) STREAMING_COMPATIBLE {
uint64_t vl;
// The limit is used for saturating the 2.14 red channel in STOREAR30_SVE.
// The limit is used for saturating the 2.14 red channel in STOREAR30_SVE_2X.
const uint16_t limit = 0x3ff0;
asm volatile(
"cnth %[vl] \n"
"ptrue p0.b \n" //
YUVTORGB_SVE_SETUP
"dup z19.b, #255 \n" // Alpha
"dup z23.h, %w[limit] \n"
"subs %w[width], %w[width], %w[vl] \n"
"subs %w[width], %w[width], %w[vl], lsl #1 \n"
"b.le 2f \n"
// Run bulk of computation with an all-true predicate to avoid predicate
// generation overhead.
"ptrue p1.h \n"
"ptrue p1.b \n"
"ptrue p2.b \n"
"ptrue p3.b \n"
"1: \n" //
READYUV422_SVE I4XXTORGB_SVE STOREAR30_SVE
"subs %w[width], %w[width], %w[vl] \n"
READYUV422_SVE_2X I422TORGB_SVE_2X STOREAR30_SVE_2X
"subs %w[width], %w[width], %w[vl], lsl #1 \n"
"b.gt 1b \n"
// Calculate a predicate for the final iteration to deal with the tail.
"2: \n"
"adds %w[width], %w[width], %w[vl] \n"
"adds %w[width], %w[width], %w[vl], lsl #1 \n"
"b.eq 99f \n"
"whilelt p1.h, wzr, %w[width] \n" //
READYUV422_SVE I4XXTORGB_SVE STOREAR30_SVE
"whilelt p1.b, wzr, %w[width] \n"
"whilelt p2.h, wzr, %w[width] \n"
"whilelt p3.h, %w[vl], %w[width] \n" //
READYUV422_SVE_2X I422TORGB_SVE_2X STOREAR30_SVE_2X
"99: \n"
: [src_y] "+r"(src_y), // %[src_y]