[AArch64] Add SVE2 implementation of I422ToRGBARow

This is almost identical to the existing I422ToARGBRow_SVE2 kernel; we
just need to interleave differently for the output.

The RGBA format actually saves us an instruction compared to ARGB: there
is no need to merge in the alpha component separately, since we can just
replace the odd elements of the alpha vector itself during the narrowing.

Also rename some existing macros to make more sense when distinguishing
between ARGB and RGBA.

Reductions in runtime observed compared to the existing Neon code:

Cortex-A510: -27.0%
Cortex-A720:  -5.3%
  Cortex-X2: -14.7%

Bug: libyuv:973
Change-Id: I1e12ff608ee49c25b918097007e16d87b39cb067
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5593797
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
George Steed 2024-04-23 08:22:57 +01:00 committed by Frank Barchard
parent 004352ba16
commit 96bbdb53ed
3 changed files with 90 additions and 19 deletions

View File

@ -585,6 +585,7 @@ extern "C" {
#define HAS_BGRATOUVROW_SVE2
#define HAS_I422ALPHATOARGBROW_SVE2
#define HAS_I422TOARGBROW_SVE2
#define HAS_I422TORGBAROW_SVE2
#define HAS_I444ALPHATOARGBROW_SVE2
#define HAS_I444TOARGBROW_SVE2
#define HAS_RGBATOUVROW_SVE2
@ -1154,6 +1155,12 @@ void I422ToRGBARow_NEON(const uint8_t* src_y,
uint8_t* dst_rgba,
const struct YuvConstants* yuvconstants,
int width);
// SVE2 row kernel: converts `width` pixels from the src_y/src_u/src_v planes
// to RGBA in dst_rgba, using the coefficients in `yuvconstants`.
// The kernel handles the row tail itself with a predicated final iteration,
// and the dispatch code assigns it directly without an "Any" wrapper.
void I422ToRGBARow_SVE2(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_rgba,
const struct YuvConstants* yuvconstants,
int width);
void I422ToRGB24Row_NEON(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,

View File

@ -4906,6 +4906,11 @@ int I422ToRGBAMatrix(const uint8_t* src_y,
}
}
#endif
#if defined(HAS_I422TORGBAROW_SVE2)
if (TestCpuFlag(kCpuHasSVE2)) {
I422ToRGBARow = I422ToRGBARow_SVE2;
}
#endif
#if defined(HAS_I422TORGBAROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I422ToRGBARow = I422ToRGBARow_Any_MSA;
@ -5134,6 +5139,11 @@ int I420ToRGBAMatrix(const uint8_t* src_y,
}
}
#endif
#if defined(HAS_I422TORGBAROW_SVE2)
if (TestCpuFlag(kCpuHasSVE2)) {
I422ToRGBARow = I422ToRGBARow_SVE2;
}
#endif
#if defined(HAS_I422TORGBAROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I422ToRGBARow = I422ToRGBARow_Any_MSA;

View File

@ -66,17 +66,27 @@ extern "C" {
"uqsub z16.h, z16.h, z25.h \n" /* B */ \
"uqsub z18.h, z18.h, z27.h \n" /* R */
// Convert from 2.14 fixed point RGB to 8 bit RGBA, interleaving as BG and RA
// Convert from 2.14 fixed point RGB to 8 bit ARGB, interleaving as BG and RA
// pairs to allow us to use ST2 for storing rather than ST4.
#define RGBTORGBA8_SVE \
"uqshrnb z16.b, z16.h, #6 \n" \
"uqshrnb z18.b, z18.h, #6 \n" \
"uqshrnt z16.b, z17.h, #6 \n" \
"trn1 z17.b, z18.b, z19.b \n"
#define RGBTOARGB8_SVE \
/* Inputs: B: z16.h, G: z17.h, R: z18.h, A: z19.b */ \
"uqshrnb z16.b, z16.h, #6 \n" /* B0 */ \
"uqshrnb z18.b, z18.h, #6 \n" /* R0 */ \
"uqshrnt z16.b, z17.h, #6 \n" /* BG */ \
"trn1 z17.b, z18.b, z19.b \n" /* RA */
// Convert from 2.14 fixed point RGB to 8 bit RGBA, interleaving as AB and GR
// pairs to allow us to use ST2 for storing rather than ST4.
// Outputs: AB pairs in z19, GR pairs in z20 (so z20 is clobbered).
// The first UQSHRNT writes only the odd (top) bytes of z19, so the alpha
// values held in the even bytes of z19 are kept as-is; no separate merge
// instruction is needed, unlike the TRN1 used by RGBTOARGB8_SVE.
#define RGBTORGBA8_SVE \
/* Inputs: B: z16.h, G: z17.h, R: z18.h, A: z19.b */ \
"uqshrnt z19.b, z16.h, #6 \n" /* AB */ \
"uqshrnb z20.b, z17.h, #6 \n" /* G0 */ \
"uqshrnt z20.b, z18.h, #6 \n" /* GR */
#define YUVTORGB_SVE_REGS \
"z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z16", "z17", "z18", "z19", \
"z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "p0", "p1"
"z20", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "p0", \
"p1"
void I444ToARGBRow_SVE2(const uint8_t* src_y,
const uint8_t* src_u,
@ -95,7 +105,7 @@ void I444ToARGBRow_SVE2(const uint8_t* src_y,
// generation overhead.
"ptrue p1.h \n"
"1: \n" READYUV444_SVE
I4XXTORGB_SVE RGBTORGBA8_SVE
I4XXTORGB_SVE RGBTOARGB8_SVE
"subs %w[width], %w[width], %w[vl] \n"
"st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n"
"add %[dst_argb], %[dst_argb], %[vl], lsl #2 \n"
@ -107,7 +117,7 @@ void I444ToARGBRow_SVE2(const uint8_t* src_y,
// Calculate a predicate for the final iteration to deal with the tail.
"whilelt p1.h, wzr, %w[width] \n" READYUV444_SVE
I4XXTORGB_SVE RGBTORGBA8_SVE
I4XXTORGB_SVE RGBTOARGB8_SVE
"st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n"
"99: \n"
@ -139,7 +149,7 @@ void I422ToARGBRow_SVE2(const uint8_t* src_y,
// generation overhead.
"ptrue p1.h \n"
"1: \n" READYUV422_SVE
I4XXTORGB_SVE RGBTORGBA8_SVE
I4XXTORGB_SVE RGBTOARGB8_SVE
"subs %w[width], %w[width], %w[vl] \n"
"st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n"
"add %[dst_argb], %[dst_argb], %[vl], lsl #2 \n"
@ -151,7 +161,7 @@ void I422ToARGBRow_SVE2(const uint8_t* src_y,
// Calculate a predicate for the final iteration to deal with the tail.
"whilelt p1.h, wzr, %w[width] \n" READYUV422_SVE
I4XXTORGB_SVE RGBTORGBA8_SVE
I4XXTORGB_SVE RGBTOARGB8_SVE
"st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n"
"99: \n"
@ -166,6 +176,50 @@ void I422ToARGBRow_SVE2(const uint8_t* src_y,
: "cc", "memory", YUVTORGB_SVE_REGS);
}
// Converts one row of `width` pixels from the three source planes
// (src_y, src_u, src_v) to 8-bit RGBA using SVE2, with alpha fixed at 255.
//
// The destination parameter is named dst_argb here, but the header
// declaration names it dst_rgba; the data written is RGBA (AB and GR byte
// pairs, see RGBTORGBA8_SVE).
//
// Structure:
//   - A bulk loop runs with an all-true predicate and handles `vl` pixels
//     per iteration, where `vl` is the CNTH element count.
//   - A final iteration uses a WHILELT predicate to cover the remaining
//     1..vl pixels; it is skipped when nothing remains.
//
// NOTE(review): YUVTORGB_SVE_SETUP, READYUV422_SVE and I4XXTORGB_SVE are
// defined outside this view. Presumably they load the conversion constants,
// load/advance the source pointers, and leave 2.14 fixed-point B/G/R in
// z16/z17/z18 (as the "Inputs" comment on RGBTORGBA8_SVE states) - confirm
// against their definitions.
void I422ToRGBARow_SVE2(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
uint64_t vl;
// vl = number of 16-bit elements per vector = pixels handled per iteration.
asm("cnth %[vl] \n"
"ptrue p0.b \n" YUVTORGB_SVE_SETUP
// Alpha is set once, outside the loop: RGBTORGBA8_SVE only overwrites the
// odd bytes of z19, so the 255 in the even bytes survives every iteration.
"dup z19.b, #255 \n" // A
// Pre-decrement width; if the row is no longer than one vector, go
// straight to the predicated tail.
"subs %w[width], %w[width], %w[vl] \n"
"b.le 2f \n"
// Run bulk of computation with an all-true predicate to avoid predicate
// generation overhead.
"ptrue p1.h \n"
"1: \n" //
READYUV422_SVE I4XXTORGB_SVE RGBTORGBA8_SVE
"subs %w[width], %w[width], %w[vl] \n"
// Interleaved store of the AB (z19) and GR (z20) halfword pairs.
"st2h {z19.h, z20.h}, p1, [%[dst_argb]] \n"
// Advance by vl pixels * 4 bytes per pixel.
"add %[dst_argb], %[dst_argb], %[vl], lsl #2 \n"
"b.gt 1b \n"
// Calculate a predicate for the final iteration to deal with the tail.
"2: \n"
// Undo the pre-decrement to recover the number of remaining pixels;
// zero means the row was fully handled by the bulk loop.
"adds %w[width], %w[width], %w[vl] \n"
"b.eq 99f \n"
"whilelt p1.h, wzr, %w[width] \n" //
READYUV422_SVE I4XXTORGB_SVE RGBTORGBA8_SVE
"st2h {z19.h, z20.h}, p1, [%[dst_argb]] \n"
"99: \n"
: [src_y] "+r"(src_y), // %[src_y]
[src_u] "+r"(src_u), // %[src_u]
[src_v] "+r"(src_v), // %[src_v]
[dst_argb] "+r"(dst_argb), // %[dst_argb]
[width] "+r"(width), // %[width]
[vl] "=&r"(vl) // %[vl]
: [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
[kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
: "cc", "memory", YUVTORGB_SVE_REGS);
}
void I444AlphaToARGBRow_SVE2(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
@ -183,9 +237,9 @@ void I444AlphaToARGBRow_SVE2(const uint8_t* src_y,
// generation overhead.
"ptrue p1.h \n"
"1: \n" READYUV444_SVE
"ld1b {z19.h}, p1/z, [%[src_a]] \n" // A
"add %[src_a], %[src_a], %[vl] \n" I4XXTORGB_SVE
RGBTORGBA8_SVE
"ld1b {z19.h}, p1/z, [%[src_a]] \n"
"add %[src_a], %[src_a], %[vl] \n" // A
I4XXTORGB_SVE RGBTOARGB8_SVE
"subs %w[width], %w[width], %w[vl] \n"
"st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n"
"add %[dst_argb], %[dst_argb], %[vl], lsl #2 \n"
@ -198,7 +252,7 @@ void I444AlphaToARGBRow_SVE2(const uint8_t* src_y,
// Calculate a predicate for the final iteration to deal with the tail.
"whilelt p1.h, wzr, %w[width] \n" READYUV444_SVE
"ld1b {z19.h}, p1/z, [%[src_a]] \n" // A
I4XXTORGB_SVE RGBTORGBA8_SVE
I4XXTORGB_SVE RGBTOARGB8_SVE
"st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n"
"99: \n"
@ -231,9 +285,9 @@ void I422AlphaToARGBRow_SVE2(const uint8_t* src_y,
// generation overhead.
"ptrue p1.h \n"
"1: \n" READYUV422_SVE
"ld1b {z19.h}, p1/z, [%[src_a]] \n" // A
"add %[src_a], %[src_a], %[vl] \n" I4XXTORGB_SVE
RGBTORGBA8_SVE
"ld1b {z19.h}, p1/z, [%[src_a]] \n"
"add %[src_a], %[src_a], %[vl] \n" // A
I4XXTORGB_SVE RGBTOARGB8_SVE
"subs %w[width], %w[width], %w[vl] \n"
"st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n"
"add %[dst_argb], %[dst_argb], %[vl], lsl #2 \n"
@ -246,7 +300,7 @@ void I422AlphaToARGBRow_SVE2(const uint8_t* src_y,
// Calculate a predicate for the final iteration to deal with the tail.
"whilelt p1.h, wzr, %w[width] \n" READYUV422_SVE
"ld1b {z19.h}, p1/z, [%[src_a]] \n" // A
I4XXTORGB_SVE RGBTORGBA8_SVE
I4XXTORGB_SVE RGBTOARGB8_SVE
"st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n"
"99: \n"