mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-08 01:36:47 +08:00
[AArch64] Add SVE2 implementation of I422ToRGBARow
This is almost identical to the existing I422ToARGBRow_SVE2 kernel; we just need to interleave differently for the output. The RGBA format actually saves us an instruction compared to ARGB: since there is no need to merge in the alpha component, we can just replace the odd elements of the alpha vector itself during the narrowing.

Also rename some existing macros to make more sense when distinguishing between ARGB and RGBA.

Reductions in runtime observed compared to the existing Neon code:
Cortex-A510: -27.0%
Cortex-A720: -5.3%
Cortex-X2: -14.7%

Bug: libyuv:973
Change-Id: I1e12ff608ee49c25b918097007e16d87b39cb067
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5593797
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
parent
004352ba16
commit
96bbdb53ed
@ -585,6 +585,7 @@ extern "C" {
|
||||
#define HAS_BGRATOUVROW_SVE2
|
||||
#define HAS_I422ALPHATOARGBROW_SVE2
|
||||
#define HAS_I422TOARGBROW_SVE2
|
||||
#define HAS_I422TORGBAROW_SVE2
|
||||
#define HAS_I444ALPHATOARGBROW_SVE2
|
||||
#define HAS_I444TOARGBROW_SVE2
|
||||
#define HAS_RGBATOUVROW_SVE2
|
||||
@ -1154,6 +1155,12 @@ void I422ToRGBARow_NEON(const uint8_t* src_y,
|
||||
uint8_t* dst_rgba,
|
||||
const struct YuvConstants* yuvconstants,
|
||||
int width);
|
||||
void I422ToRGBARow_SVE2(const uint8_t* src_y,
|
||||
const uint8_t* src_u,
|
||||
const uint8_t* src_v,
|
||||
uint8_t* dst_rgba,
|
||||
const struct YuvConstants* yuvconstants,
|
||||
int width);
|
||||
void I422ToRGB24Row_NEON(const uint8_t* src_y,
|
||||
const uint8_t* src_u,
|
||||
const uint8_t* src_v,
|
||||
|
||||
@ -4906,6 +4906,11 @@ int I422ToRGBAMatrix(const uint8_t* src_y,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_I422TORGBAROW_SVE2)
|
||||
if (TestCpuFlag(kCpuHasSVE2)) {
|
||||
I422ToRGBARow = I422ToRGBARow_SVE2;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_I422TORGBAROW_MSA)
|
||||
if (TestCpuFlag(kCpuHasMSA)) {
|
||||
I422ToRGBARow = I422ToRGBARow_Any_MSA;
|
||||
@ -5134,6 +5139,11 @@ int I420ToRGBAMatrix(const uint8_t* src_y,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_I422TORGBAROW_SVE2)
|
||||
if (TestCpuFlag(kCpuHasSVE2)) {
|
||||
I422ToRGBARow = I422ToRGBARow_SVE2;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_I422TORGBAROW_MSA)
|
||||
if (TestCpuFlag(kCpuHasMSA)) {
|
||||
I422ToRGBARow = I422ToRGBARow_Any_MSA;
|
||||
|
||||
@ -66,17 +66,27 @@ extern "C" {
|
||||
"uqsub z16.h, z16.h, z25.h \n" /* B */ \
|
||||
"uqsub z18.h, z18.h, z27.h \n" /* R */
|
||||
|
||||
// Convert from 2.14 fixed point RGB to 8 bit RGBA, interleaving as BG and RA
|
||||
// Convert from 2.14 fixed point RGB to 8 bit ARGB, interleaving as BG and RA
|
||||
// pairs to allow us to use ST2 for storing rather than ST4.
|
||||
#define RGBTORGBA8_SVE \
|
||||
"uqshrnb z16.b, z16.h, #6 \n" \
|
||||
"uqshrnb z18.b, z18.h, #6 \n" \
|
||||
"uqshrnt z16.b, z17.h, #6 \n" \
|
||||
"trn1 z17.b, z18.b, z19.b \n"
|
||||
#define RGBTOARGB8_SVE \
|
||||
/* Inputs: B: z16.h, G: z17.h, R: z18.h, A: z19.b */ \
|
||||
"uqshrnb z16.b, z16.h, #6 \n" /* B0 */ \
|
||||
"uqshrnb z18.b, z18.h, #6 \n" /* R0 */ \
|
||||
"uqshrnt z16.b, z17.h, #6 \n" /* BG */ \
|
||||
"trn1 z17.b, z18.b, z19.b \n" /* RA */
|
||||
|
||||
// Convert from 2.14 fixed point RGB to 8 bit RGBA, interleaving as AB and GR
|
||||
// pairs to allow us to use ST2 for storing rather than ST4.
|
||||
#define RGBTORGBA8_SVE \
|
||||
/* Inputs: B: z16.h, G: z17.h, R: z18.h, A: z19.b */ \
|
||||
"uqshrnt z19.b, z16.h, #6 \n" /* AB */ \
|
||||
"uqshrnb z20.b, z17.h, #6 \n" /* G0 */ \
|
||||
"uqshrnt z20.b, z18.h, #6 \n" /* GR */
|
||||
|
||||
#define YUVTORGB_SVE_REGS \
|
||||
"z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z16", "z17", "z18", "z19", \
|
||||
"z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "p0", "p1"
|
||||
"z20", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "p0", \
|
||||
"p1"
|
||||
|
||||
void I444ToARGBRow_SVE2(const uint8_t* src_y,
|
||||
const uint8_t* src_u,
|
||||
@ -95,7 +105,7 @@ void I444ToARGBRow_SVE2(const uint8_t* src_y,
|
||||
// generation overhead.
|
||||
"ptrue p1.h \n"
|
||||
"1: \n" READYUV444_SVE
|
||||
I4XXTORGB_SVE RGBTORGBA8_SVE
|
||||
I4XXTORGB_SVE RGBTOARGB8_SVE
|
||||
"subs %w[width], %w[width], %w[vl] \n"
|
||||
"st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n"
|
||||
"add %[dst_argb], %[dst_argb], %[vl], lsl #2 \n"
|
||||
@ -107,7 +117,7 @@ void I444ToARGBRow_SVE2(const uint8_t* src_y,
|
||||
|
||||
// Calculate a predicate for the final iteration to deal with the tail.
|
||||
"whilelt p1.h, wzr, %w[width] \n" READYUV444_SVE
|
||||
I4XXTORGB_SVE RGBTORGBA8_SVE
|
||||
I4XXTORGB_SVE RGBTOARGB8_SVE
|
||||
"st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n"
|
||||
|
||||
"99: \n"
|
||||
@ -139,7 +149,7 @@ void I422ToARGBRow_SVE2(const uint8_t* src_y,
|
||||
// generation overhead.
|
||||
"ptrue p1.h \n"
|
||||
"1: \n" READYUV422_SVE
|
||||
I4XXTORGB_SVE RGBTORGBA8_SVE
|
||||
I4XXTORGB_SVE RGBTOARGB8_SVE
|
||||
"subs %w[width], %w[width], %w[vl] \n"
|
||||
"st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n"
|
||||
"add %[dst_argb], %[dst_argb], %[vl], lsl #2 \n"
|
||||
@ -151,7 +161,7 @@ void I422ToARGBRow_SVE2(const uint8_t* src_y,
|
||||
|
||||
// Calculate a predicate for the final iteration to deal with the tail.
|
||||
"whilelt p1.h, wzr, %w[width] \n" READYUV422_SVE
|
||||
I4XXTORGB_SVE RGBTORGBA8_SVE
|
||||
I4XXTORGB_SVE RGBTOARGB8_SVE
|
||||
"st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n"
|
||||
|
||||
"99: \n"
|
||||
@ -166,6 +176,50 @@ void I422ToARGBRow_SVE2(const uint8_t* src_y,
|
||||
: "cc", "memory", YUVTORGB_SVE_REGS);
|
||||
}
|
||||
|
||||
void I422ToRGBARow_SVE2(const uint8_t* src_y,
|
||||
const uint8_t* src_u,
|
||||
const uint8_t* src_v,
|
||||
uint8_t* dst_argb,
|
||||
const struct YuvConstants* yuvconstants,
|
||||
int width) {
|
||||
uint64_t vl;
|
||||
asm("cnth %[vl] \n"
|
||||
"ptrue p0.b \n" YUVTORGB_SVE_SETUP
|
||||
"dup z19.b, #255 \n" // A
|
||||
"subs %w[width], %w[width], %w[vl] \n"
|
||||
"b.le 2f \n"
|
||||
|
||||
// Run bulk of computation with an all-true predicate to avoid predicate
|
||||
// generation overhead.
|
||||
"ptrue p1.h \n"
|
||||
"1: \n" //
|
||||
READYUV422_SVE I4XXTORGB_SVE RGBTORGBA8_SVE
|
||||
"subs %w[width], %w[width], %w[vl] \n"
|
||||
"st2h {z19.h, z20.h}, p1, [%[dst_argb]] \n"
|
||||
"add %[dst_argb], %[dst_argb], %[vl], lsl #2 \n"
|
||||
"b.gt 1b \n"
|
||||
|
||||
// Calculate a predicate for the final iteration to deal with the tail.
|
||||
"2: \n"
|
||||
"adds %w[width], %w[width], %w[vl] \n"
|
||||
"b.eq 99f \n"
|
||||
|
||||
"whilelt p1.h, wzr, %w[width] \n" //
|
||||
READYUV422_SVE I4XXTORGB_SVE RGBTORGBA8_SVE
|
||||
"st2h {z19.h, z20.h}, p1, [%[dst_argb]] \n"
|
||||
|
||||
"99: \n"
|
||||
: [src_y] "+r"(src_y), // %[src_y]
|
||||
[src_u] "+r"(src_u), // %[src_u]
|
||||
[src_v] "+r"(src_v), // %[src_v]
|
||||
[dst_argb] "+r"(dst_argb), // %[dst_argb]
|
||||
[width] "+r"(width), // %[width]
|
||||
[vl] "=&r"(vl) // %[vl]
|
||||
: [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
|
||||
[kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
|
||||
: "cc", "memory", YUVTORGB_SVE_REGS);
|
||||
}
|
||||
|
||||
void I444AlphaToARGBRow_SVE2(const uint8_t* src_y,
|
||||
const uint8_t* src_u,
|
||||
const uint8_t* src_v,
|
||||
@ -183,9 +237,9 @@ void I444AlphaToARGBRow_SVE2(const uint8_t* src_y,
|
||||
// generation overhead.
|
||||
"ptrue p1.h \n"
|
||||
"1: \n" READYUV444_SVE
|
||||
"ld1b {z19.h}, p1/z, [%[src_a]] \n" // A
|
||||
"add %[src_a], %[src_a], %[vl] \n" I4XXTORGB_SVE
|
||||
RGBTORGBA8_SVE
|
||||
"ld1b {z19.h}, p1/z, [%[src_a]] \n"
|
||||
"add %[src_a], %[src_a], %[vl] \n" // A
|
||||
I4XXTORGB_SVE RGBTOARGB8_SVE
|
||||
"subs %w[width], %w[width], %w[vl] \n"
|
||||
"st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n"
|
||||
"add %[dst_argb], %[dst_argb], %[vl], lsl #2 \n"
|
||||
@ -198,7 +252,7 @@ void I444AlphaToARGBRow_SVE2(const uint8_t* src_y,
|
||||
// Calculate a predicate for the final iteration to deal with the tail.
|
||||
"whilelt p1.h, wzr, %w[width] \n" READYUV444_SVE
|
||||
"ld1b {z19.h}, p1/z, [%[src_a]] \n" // A
|
||||
I4XXTORGB_SVE RGBTORGBA8_SVE
|
||||
I4XXTORGB_SVE RGBTOARGB8_SVE
|
||||
"st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n"
|
||||
|
||||
"99: \n"
|
||||
@ -231,9 +285,9 @@ void I422AlphaToARGBRow_SVE2(const uint8_t* src_y,
|
||||
// generation overhead.
|
||||
"ptrue p1.h \n"
|
||||
"1: \n" READYUV422_SVE
|
||||
"ld1b {z19.h}, p1/z, [%[src_a]] \n" // A
|
||||
"add %[src_a], %[src_a], %[vl] \n" I4XXTORGB_SVE
|
||||
RGBTORGBA8_SVE
|
||||
"ld1b {z19.h}, p1/z, [%[src_a]] \n"
|
||||
"add %[src_a], %[src_a], %[vl] \n" // A
|
||||
I4XXTORGB_SVE RGBTOARGB8_SVE
|
||||
"subs %w[width], %w[width], %w[vl] \n"
|
||||
"st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n"
|
||||
"add %[dst_argb], %[dst_argb], %[vl], lsl #2 \n"
|
||||
@ -246,7 +300,7 @@ void I422AlphaToARGBRow_SVE2(const uint8_t* src_y,
|
||||
// Calculate a predicate for the final iteration to deal with the tail.
|
||||
"whilelt p1.h, wzr, %w[width] \n" READYUV422_SVE
|
||||
"ld1b {z19.h}, p1/z, [%[src_a]] \n" // A
|
||||
I4XXTORGB_SVE RGBTORGBA8_SVE
|
||||
I4XXTORGB_SVE RGBTOARGB8_SVE
|
||||
"st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n"
|
||||
|
||||
"99: \n"
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user