mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-08 01:36:47 +08:00
[AArch64] Add SVE2 implementation of I400ToARGBRow
This is mostly a copy of the I422ToARGBRow_SVE2 implementation, but we can pre-calculate the UV component results before the loop body. Unlike in the Neon version of the code, we can make use of MOVPRFX and USQADD to avoid needing to apply the bias separately from the UV coefficient multiply additions.

Reduction in runtime observed compared to the existing Neon code:
Cortex-A510: -26.1%
Cortex-A520: -5.9%
Cortex-A715: -49.5%
Cortex-A720: -49.4%
Cortex-X2: -22.5%
Cortex-X3: -23.5%
Cortex-X4: -21.6%

Bug: libyuv:973
Change-Id: Ib9fc52bd53a1c6a1aac8bd865ab88539aca098ea
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5598767
Reviewed-by: Justin Green <greenjustin@google.com>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
parent
34abe98fe2
commit
cd4113f4e8
@ -585,6 +585,7 @@ extern "C" {
|
||||
#define HAS_AYUVTOUVROW_SVE2
|
||||
#define HAS_AYUVTOVUROW_SVE2
|
||||
#define HAS_BGRATOUVROW_SVE2
|
||||
#define HAS_I400TOARGBROW_SVE2
|
||||
#define HAS_I422ALPHATOARGBROW_SVE2
|
||||
#define HAS_I422TOARGBROW_SVE2
|
||||
#define HAS_I422TORGBAROW_SVE2
|
||||
@ -4743,6 +4744,10 @@ void I400ToARGBRow_NEON(const uint8_t* src_y,
|
||||
uint8_t* dst_argb,
|
||||
const struct YuvConstants* yuvconstants,
|
||||
int width);
|
||||
// SVE2 variant of the I400-to-ARGB row function. It takes the same argument
// list as the NEON and MSA variants declared next to it: a source Y row, a
// destination ARGB row, the conversion constants and the row width.
void I400ToARGBRow_SVE2(const uint8_t* src_y,
|
||||
uint8_t* dst_argb,
|
||||
const struct YuvConstants* yuvconstants,
|
||||
int width);
|
||||
void I400ToARGBRow_MSA(const uint8_t* src_y,
|
||||
uint8_t* dst_argb,
|
||||
const struct YuvConstants* yuvconstants,
|
||||
|
||||
@ -2980,6 +2980,11 @@ int I400ToARGBMatrix(const uint8_t* src_y,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_I400TOARGBROW_SVE2)
|
||||
if (TestCpuFlag(kCpuHasSVE2)) {
|
||||
I400ToARGBRow = I400ToARGBRow_SVE2;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_I400TOARGBROW_MSA)
|
||||
if (TestCpuFlag(kCpuHasMSA)) {
|
||||
I400ToARGBRow = I400ToARGBRow_Any_MSA;
|
||||
|
||||
@ -43,6 +43,12 @@ extern "C" {
|
||||
"trn1 z1.h, z1.h, z1.h \n" \
|
||||
"trn1 z2.h, z2.h, z2.h \n"
|
||||
|
||||
// Load one vector of Y samples for the I400 (Y-only) path.
// - LD1B into .h lanes, governed by p1, zero-extends each Y byte to 16 bits.
// - INCH advances src_y by the number of 16-bit lanes in a vector.
// - PRFM prefetches 448 bytes ahead of the updated src_y.
// - TRN1 on bytes copies each even byte into the adjacent odd byte, so every
//   16-bit lane of z0 ends up holding its Y value in both bytes.
#define READYUV400_SVE \
|
||||
"ld1b {z0.h}, p1/z, [%[src_y]] \n" \
|
||||
"inch %[src_y] \n" \
|
||||
"prfm pldl1keep, [%[src_y], 448] \n" \
|
||||
"trn1 z0.b, z0.b, z0.b \n"
|
||||
|
||||
// We need a different predicate for the UV component to handle the tail.
|
||||
// If there is a single element remaining then we want to load one Y element
|
||||
// but two UV elements.
|
||||
@ -80,6 +86,14 @@ extern "C" {
|
||||
"uqsub z16.h, z16.h, z25.h \n" /* B */ \
|
||||
"uqsub z18.h, z18.h, z27.h \n" /* R */
|
||||
|
||||
// Turn the Y lanes in z0 into B/G/R lanes in z16/z17/z18.
// - UMULH keeps the high 16 bits of z24 * z0 as the scaled Y term in z18.
//   z24 is not written by this macro; presumably it holds the Y coefficient
//   loaded by the setup code - confirm against YUVTORGB_SVE_SETUP.
// - Each USQADD adds a signed per-channel term (z4 for B, z6 for G, z5 for R)
//   to the unsigned Y term with unsigned saturation. I400ToARGBRow_SVE2
//   computes z4, z5 and z6 once, before its loop.
// - MOVPRFX copies z18 into z16 and z17 first so that B and G can be formed
//   without overwriting the Y term; R is computed last, in place in z18.
// p0 is the governing predicate and must be set up by the caller.
#define I400TORGB_SVE \
|
||||
"umulh z18.h, z24.h, z0.h \n" /* Y */ \
|
||||
"movprfx z16, z18 \n" \
|
||||
"usqadd z16.h, p0/m, z16.h, z4.h \n" /* B */ \
|
||||
"movprfx z17, z18 \n" \
|
||||
"usqadd z17.h, p0/m, z17.h, z6.h \n" /* G */ \
|
||||
"usqadd z18.h, p0/m, z18.h, z5.h \n" /* R */
|
||||
|
||||
// Convert from 2.14 fixed point RGB to 8 bit ARGB, interleaving as BG and RA
|
||||
// pairs to allow us to use ST2 for storing rather than ST4.
|
||||
#define RGBTOARGB8_SVE \
|
||||
@ -146,6 +160,52 @@ void I444ToARGBRow_SVE2(const uint8_t* src_y,
|
||||
: "cc", "memory", YUVTORGB_SVE_REGS);
|
||||
}
|
||||
|
||||
void I400ToARGBRow_SVE2(const uint8_t* src_y,
|
||||
uint8_t* dst_argb,
|
||||
const struct YuvConstants* yuvconstants,
|
||||
int width) {
|
||||
uint64_t vl;
|
||||
asm("cnth %[vl] \n"
|
||||
"ptrue p0.b \n"
|
||||
"dup z19.b, #255 \n" // A
|
||||
YUVTORGB_SVE_SETUP
|
||||
"cmp %w[width], %w[vl] \n"
|
||||
"mov z1.h, #128 \n" // U/V
|
||||
"mul z6.h, z30.h, z1.h \n"
|
||||
"mul z4.h, z28.h, z1.h \n" // DB
|
||||
"mul z5.h, z29.h, z1.h \n" // DR
|
||||
"mla z6.h, p0/m, z31.h, z1.h \n" // DG
|
||||
"sub z4.h, z4.h, z25.h \n"
|
||||
"sub z5.h, z5.h, z27.h \n"
|
||||
"sub z6.h, z26.h, z6.h \n"
|
||||
"b.le 2f \n"
|
||||
|
||||
// Run bulk of computation with an all-true predicate to avoid predicate
|
||||
// generation overhead.
|
||||
"ptrue p1.h \n"
|
||||
"sub %w[width], %w[width], %w[vl] \n"
|
||||
"1: \n" //
|
||||
READYUV400_SVE I400TORGB_SVE RGBTOARGB8_SVE
|
||||
"subs %w[width], %w[width], %w[vl] \n"
|
||||
"st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n"
|
||||
"add %[dst_argb], %[dst_argb], %[vl], lsl #2 \n"
|
||||
"b.gt 1b \n"
|
||||
"add %w[width], %w[width], %w[vl] \n"
|
||||
|
||||
// Calculate a predicate for the final iteration to deal with the tail.
|
||||
"2: \n"
|
||||
"whilelt p1.h, wzr, %w[width] \n" //
|
||||
READYUV400_SVE I400TORGB_SVE RGBTOARGB8_SVE
|
||||
"st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n"
|
||||
: [src_y] "+r"(src_y), // %[src_y]
|
||||
[dst_argb] "+r"(dst_argb), // %[dst_argb]
|
||||
[width] "+r"(width), // %[width]
|
||||
[vl] "=&r"(vl) // %[vl]
|
||||
: [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
|
||||
[kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
|
||||
: "cc", "memory", YUVTORGB_SVE_REGS);
|
||||
}
|
||||
|
||||
void I422ToARGBRow_SVE2(const uint8_t* src_y,
|
||||
const uint8_t* src_u,
|
||||
const uint8_t* src_v,
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user