[AArch64] Add SVE2 implementation of I400ToARGBRow

This is mostly a copy of the I422ToARGBRow_SVE2 implementation, but we
can pre-calculate the UV component results before the loop body.

Unlike in the Neon version of the code, we can make use of MOVPRFX and
USQADD to avoid needing to apply the bias separately from the UV
coefficient multiply additions.

Reduction in runtime observed compared to the existing Neon code:

Cortex-A510: -26.1%
Cortex-A520:  -5.9%
Cortex-A715: -49.5%
Cortex-A720: -49.4%
  Cortex-X2: -22.5%
  Cortex-X3: -23.5%
  Cortex-X4: -21.6%

Bug: libyuv:973
Change-Id: Ib9fc52bd53a1c6a1aac8bd865ab88539aca098ea
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5598767
Reviewed-by: Justin Green <greenjustin@google.com>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
George Steed 2024-04-23 15:13:47 +01:00 committed by Frank Barchard
parent 34abe98fe2
commit cd4113f4e8
3 changed files with 70 additions and 0 deletions

View File

@ -585,6 +585,7 @@ extern "C" {
#define HAS_AYUVTOUVROW_SVE2
#define HAS_AYUVTOVUROW_SVE2
#define HAS_BGRATOUVROW_SVE2
#define HAS_I400TOARGBROW_SVE2
#define HAS_I422ALPHATOARGBROW_SVE2
#define HAS_I422TOARGBROW_SVE2
#define HAS_I422TORGBAROW_SVE2
@ -4743,6 +4744,10 @@ void I400ToARGBRow_NEON(const uint8_t* src_y,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
// SVE2 row function for Y-only (I400) input: reads Y samples from src_y and
// writes ARGB output to dst_argb, with `width` giving the number of samples
// to convert. Coefficients are taken from yuvconstants (kUVCoeff and
// kRGBCoeffBias).
void I400ToARGBRow_SVE2(const uint8_t* src_y,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void I400ToARGBRow_MSA(const uint8_t* src_y,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,

View File

@ -2980,6 +2980,11 @@ int I400ToARGBMatrix(const uint8_t* src_y,
}
}
#endif
#if defined(HAS_I400TOARGBROW_SVE2)
if (TestCpuFlag(kCpuHasSVE2)) {
I400ToARGBRow = I400ToARGBRow_SVE2;
}
#endif
#if defined(HAS_I400TOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I400ToARGBRow = I400ToARGBRow_Any_MSA;

View File

@ -43,6 +43,12 @@ extern "C" {
"trn1 z1.h, z1.h, z1.h \n" \
"trn1 z2.h, z2.h, z2.h \n"
// Read one vector of Y samples for Y-only (I400) input.
//  - ld1b loads one byte per 16-bit lane of z0 under predicate p1; inactive
//    lanes are zeroed (p1/z).
//  - inch advances src_y by the number of 16-bit lanes in a vector, i.e. by
//    the number of bytes just consumed by a full-vector load.
//  - prfm prefetches 448 bytes ahead of the updated src_y.
//  - trn1 on bytes with identical operands copies each even byte into the
//    following odd byte, so every 16-bit lane ends up holding the Y byte in
//    both halves (Y * 0x0101).
// The caller must have set up p1 before using this macro.
#define READYUV400_SVE \
"ld1b {z0.h}, p1/z, [%[src_y]] \n" \
"inch %[src_y] \n" \
"prfm pldl1keep, [%[src_y], 448] \n" \
"trn1 z0.b, z0.b, z0.b \n"
// We need a different predicate for the UV component to handle the tail.
// If there is a single element remaining then we want to load one Y element
// but two UV elements.
@ -80,6 +86,14 @@ extern "C" {
"uqsub z16.h, z16.h, z25.h \n" /* B */ \
"uqsub z18.h, z18.h, z27.h \n" /* R */
// Y-only (I400) colour conversion step.
//  - umulh keeps the high 16 bits of z24.h * z0.h in z18 (the scaled Y term).
//    z24 is presumably the Y coefficient loaded by YUVTORGB_SVE_SETUP, which
//    is defined elsewhere - TODO confirm.
//  - Each channel is then formed by an unsigned saturating add of a signed
//    per-channel constant (USQADD): z16 = Y + z4 (B), z17 = Y + z6 (G),
//    z18 = Y + z5 (R). MOVPRFX copies the Y term into the destination first
//    because USQADD is destructive; R reuses z18 in place so needs no copy.
//  - z4/z5/z6 are expected to hold the DB/DR/DG constants that
//    I400ToARGBRow_SVE2 pre-computes before its loop; p0 must be an all-true
//    predicate so that every lane is updated.
#define I400TORGB_SVE \
"umulh z18.h, z24.h, z0.h \n" /* Y */ \
"movprfx z16, z18 \n" \
"usqadd z16.h, p0/m, z16.h, z4.h \n" /* B */ \
"movprfx z17, z18 \n" \
"usqadd z17.h, p0/m, z17.h, z6.h \n" /* G */ \
"usqadd z18.h, p0/m, z18.h, z5.h \n" /* R */
// Convert from 2.14 fixed point RGB to 8 bit ARGB, interleaving as BG and RA
// pairs to allow us to use ST2 for storing rather than ST4.
#define RGBTOARGB8_SVE \
@ -146,6 +160,52 @@ void I444ToARGBRow_SVE2(const uint8_t* src_y,
: "cc", "memory", YUVTORGB_SVE_REGS);
}
// Convert a row of Y-only (I400) samples to ARGB using SVE2.
//
//   src_y        - input Y samples, one byte each.
//   dst_argb     - output pixels; the pointer is advanced by 4 bytes per
//                  converted sample.
//   yuvconstants - supplies kUVCoeff and kRGBCoeffBias, whose addresses are
//                  passed to the assembly.
//   width        - number of samples to convert.
//
// There is no chroma input, so U and V are fixed at 128 and their whole
// contribution (coefficient products combined with the bias terms) is folded
// into three per-channel constants before the loop. The loop body then only
// needs one multiply for Y and one saturating add per channel.
//
// Structure: full vectors are processed in loop "1:" with an all-true
// predicate; the last 1..vl samples (or the whole row when width <= vl) are
// handled at "2:" with a WHILELT-generated predicate.
void I400ToARGBRow_SVE2(const uint8_t* src_y,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
// Number of 16-bit lanes per SVE vector, i.e. samples handled per iteration.
uint64_t vl;
asm("cnth %[vl] \n"
// p0 is an all-true predicate used by the predicated arithmetic below.
"ptrue p0.b \n"
"dup z19.b, #255 \n" // A
// Defined elsewhere; presumably loads the coefficient/bias registers
// (z24-z31) from kUVCoeff and kRGBCoeffBias - TODO confirm.
YUVTORGB_SVE_SETUP
// The flags set here are consumed by the "b.le 2f" below; none of the
// instructions in between modify the condition flags.
"cmp %w[width], %w[vl] \n"
"mov z1.h, #128 \n" // U/V
// Pre-compute the constant chroma terms for U = V = 128:
//   z4 = z28 * 128 - z25               (DB)
//   z5 = z29 * 128 - z27               (DR)
//   z6 = z26 - (z30 * 128 + z31 * 128) (DG)
"mul z6.h, z30.h, z1.h \n"
"mul z4.h, z28.h, z1.h \n" // DB
"mul z5.h, z29.h, z1.h \n" // DR
"mla z6.h, p0/m, z31.h, z1.h \n" // DG
"sub z4.h, z4.h, z25.h \n"
"sub z5.h, z5.h, z27.h \n"
"sub z6.h, z26.h, z6.h \n"
// width <= vl: skip the bulk loop and go straight to the predicated tail.
"b.le 2f \n"
// Run bulk of computation with an all-true predicate to avoid predicate
// generation overhead.
"ptrue p1.h \n"
// Pre-decrement width by one vector so the loop exits while 1..vl samples
// still remain; those are always handled by the tail code at "2:".
"sub %w[width], %w[width], %w[vl] \n"
"1: \n" //
READYUV400_SVE I400TORGB_SVE RGBTOARGB8_SVE
"subs %w[width], %w[width], %w[vl] \n"
// Store z16/z17 interleaved: two 16-bit elements (4 bytes) per pixel.
"st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n"
// Advance the destination by vl pixels * 4 bytes.
"add %[dst_argb], %[dst_argb], %[vl], lsl #2 \n"
"b.gt 1b \n"
// Undo the pre-decrement to recover the number of remaining samples.
"add %w[width], %w[width], %w[vl] \n"
// Calculate a predicate for the final iteration to deal with the tail.
"2: \n"
// Lanes 0..width-1 of p1 become active; the rest are masked off for both
// the Y load and the final store.
"whilelt p1.h, wzr, %w[width] \n" //
READYUV400_SVE I400TORGB_SVE RGBTOARGB8_SVE
"st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n"
: [src_y] "+r"(src_y), // %[src_y]
[dst_argb] "+r"(dst_argb), // %[dst_argb]
[width] "+r"(width), // %[width]
[vl] "=&r"(vl) // %[vl]
: [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
[kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
: "cc", "memory", YUVTORGB_SVE_REGS);
}
void I422ToARGBRow_SVE2(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,