mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-06 16:56:55 +08:00
[AArch64] Unroll I422ToAR30Row_{SVE2,SME}
The existing STOREAR30_SVE macro works fine for out of order cores, however for in-order cores the number of dependent vector instructions laid out consecutively impacts performance. We can improve this by unrolling the loop to process two sets of vectors at a time, allowing little cores to process two independent streams of vector instructions at the same time to improve performance. Using one set of ZIP instructions at the end allows us to (a) avoid ST4 which we know is slow on some micro-architectures, and (b) enable the use of predication and avoid the need for separate "any" kernels. Reduction in run times of I422ToAR30Row_SVE2 observed compared to the previous SVE2 implementation: Cortex-A510: -37.7% Cortex-A520: -38.8% Cortex-A710: -14.8% Cortex-A715: -17.1% Cortex-A720: -16.9% Cortex-X2: -10.3% Cortex-X3: -6.7% Cortex-X4: -9.4% Cortex-X925: -7.1% Change-Id: I160fb41300d2d08fce2e6eb92181324fd723a02d Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6632916 Reviewed-by: Frank Barchard <fbarchard@chromium.org> Reviewed-by: Justin Green <greenjustin@google.com>
This commit is contained in:
parent
843cda7e7b
commit
867bdc51ed
@ -342,6 +342,7 @@ extern "C" {
|
||||
// rather than needing a pair of shifts to saturate and then insert into the
|
||||
// correct position in the lane.
|
||||
#define STOREAR30_SVE \
|
||||
/* Inputs: B: z16.h, G: z17.h, R: z18.h */ \
|
||||
"uqshl z16.h, p0/m, z16.h, #2 \n" /* bbbbbbbbbbxxxxxx */ \
|
||||
"uqshl z17.h, p0/m, z17.h, #2 \n" /* ggggggggggxxxxxx */ \
|
||||
"umin z18.h, p0/m, z18.h, z23.h \n" /* 00rrrrrrrrrrxxxx */ \
|
||||
@ -352,6 +353,31 @@ extern "C" {
|
||||
"st2h {z17.h, z18.h}, p1, [%[dst_ar30]] \n" \
|
||||
"incb %[dst_ar30], all, mul #2 \n"
|
||||
|
||||
#define STOREAR30_SVE_2X \
|
||||
/* Inputs: B: z16.h, G: z17.h, R: z18.h */ \
|
||||
/* B: z20.h, G: z21.h, R: z22.h */ \
|
||||
"uqshl z16.h, p0/m, z16.h, #2 \n" /* bbbbbbbbbbxxxxxx */ \
|
||||
"uqshl z20.h, p0/m, z20.h, #2 \n" /* bbbbbbbbbbxxxxxx */ \
|
||||
"uqshl z17.h, p0/m, z17.h, #2 \n" /* ggggggggggxxxxxx */ \
|
||||
"uqshl z21.h, p0/m, z21.h, #2 \n" /* ggggggggggxxxxxx */ \
|
||||
"umin z18.h, p0/m, z18.h, z23.h \n" /* 00rrrrrrrrrrxxxx */ \
|
||||
"umin z22.h, p0/m, z22.h, z23.h \n" /* 00rrrrrrrrrrxxxx */ \
|
||||
"orr z18.h, z18.h, #0xc000 \n" /* 11rrrrrrrrrrxxxx */ \
|
||||
"orr z22.h, z22.h, #0xc000 \n" /* 11rrrrrrrrrrxxxx */ \
|
||||
"sri z18.h, z17.h, #12 \n" /* 11rrrrrrrrrrgggg */ \
|
||||
"sri z22.h, z21.h, #12 \n" /* 11rrrrrrrrrrgggg */ \
|
||||
"lsl z17.h, z17.h, #4 \n" /* ggggggxxxxxx0000 */ \
|
||||
"lsl z19.h, z21.h, #4 \n" /* ggggggxxxxxx0000 */ \
|
||||
"sri z17.h, z16.h, #6 \n" /* ggggggbbbbbbbbbb */ \
|
||||
"sri z19.h, z20.h, #6 \n" /* ggggggbbbbbbbbbb */ \
|
||||
"zip2 z16.h, z17.h, z19.h \n" \
|
||||
"zip1 z21.h, z17.h, z19.h \n" \
|
||||
"zip2 z17.h, z18.h, z22.h \n" \
|
||||
"zip1 z22.h, z18.h, z22.h \n" \
|
||||
"st2h {z21.h, z22.h}, p2, [%[dst_ar30]] \n" \
|
||||
"st2h {z16.h, z17.h}, p3, [%[dst_ar30], #2, mul vl] \n" \
|
||||
"incb %[dst_ar30], all, mul #4 \n"
|
||||
|
||||
#define YUVTORGB_SVE_REGS \
|
||||
"z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z16", "z17", "z18", "z19", \
|
||||
"z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", \
|
||||
@ -776,32 +802,35 @@ static inline void I422ToAR30Row_SVE_SC(const uint8_t* src_y,
|
||||
const struct YuvConstants* yuvconstants,
|
||||
int width) STREAMING_COMPATIBLE {
|
||||
uint64_t vl;
|
||||
// The limit is used for saturating the 2.14 red channel in STOREAR30_SVE.
|
||||
// The limit is used for saturating the 2.14 red channel in STOREAR30_SVE_2X.
|
||||
const uint16_t limit = 0x3ff0;
|
||||
asm volatile(
|
||||
"cnth %[vl] \n"
|
||||
"ptrue p0.b \n" //
|
||||
YUVTORGB_SVE_SETUP
|
||||
"dup z19.b, #255 \n" // Alpha
|
||||
"dup z23.h, %w[limit] \n"
|
||||
"subs %w[width], %w[width], %w[vl] \n"
|
||||
"subs %w[width], %w[width], %w[vl], lsl #1 \n"
|
||||
"b.le 2f \n"
|
||||
|
||||
// Run bulk of computation with an all-true predicate to avoid predicate
|
||||
// generation overhead.
|
||||
"ptrue p1.h \n"
|
||||
"ptrue p1.b \n"
|
||||
"ptrue p2.b \n"
|
||||
"ptrue p3.b \n"
|
||||
"1: \n" //
|
||||
READYUV422_SVE I4XXTORGB_SVE STOREAR30_SVE
|
||||
"subs %w[width], %w[width], %w[vl] \n"
|
||||
READYUV422_SVE_2X I422TORGB_SVE_2X STOREAR30_SVE_2X
|
||||
"subs %w[width], %w[width], %w[vl], lsl #1 \n"
|
||||
"b.gt 1b \n"
|
||||
|
||||
// Calculate a predicate for the final iteration to deal with the tail.
|
||||
"2: \n"
|
||||
"adds %w[width], %w[width], %w[vl] \n"
|
||||
"adds %w[width], %w[width], %w[vl], lsl #1 \n"
|
||||
"b.eq 99f \n"
|
||||
|
||||
"whilelt p1.h, wzr, %w[width] \n" //
|
||||
READYUV422_SVE I4XXTORGB_SVE STOREAR30_SVE
|
||||
"whilelt p1.b, wzr, %w[width] \n"
|
||||
"whilelt p2.h, wzr, %w[width] \n"
|
||||
"whilelt p3.h, %w[vl], %w[width] \n" //
|
||||
READYUV422_SVE_2X I422TORGB_SVE_2X STOREAR30_SVE_2X
|
||||
|
||||
"99: \n"
|
||||
: [src_y] "+r"(src_y), // %[src_y]
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user