mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-11 22:19:56 +08:00
Add SVE2 and SME implementations of I444ToRGB24Row
Move the READYUV444_SVE_2X and I444TORGB_SVE_2X macros to row_sve.h so they are usable in both SVE2 and SME implementations, and use them to add new I444ToRGB24Row implementations for SVE2 and SME. We need to use the unrolled versions here to use the ST3B interleaving store instructions, since there is no partial vector version of this store instruction.

Reduction in time taken observed for the new SVE2 implementation, compared to the existing Neon implementation:
Cortex-A510: -57.6%
Cortex-A520: -38.1%
Cortex-A710: -15.5%
Cortex-A715: -9.2%
Cortex-A720: -9.2%
Cortex-X2: -25.8%
Cortex-X3: -26.2%
Cortex-X4: -23.2%
Cortex-X925: -17.8%

Change-Id: I6acd0b798a35e5352d4fad664769f12d3d938ed7
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6530646
Reviewed-by: Justin Green <greenjustin@google.com>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
parent
951e43439c
commit
949cb623bf
@ -597,6 +597,7 @@ extern "C" {
|
||||
#define HAS_I422TORGBAROW_SVE2
|
||||
#define HAS_I444ALPHATOARGBROW_SVE2
|
||||
#define HAS_I444TOARGBROW_SVE2
|
||||
#define HAS_I444TORGB24ROW_SVE2
|
||||
#define HAS_NV12TOARGBROW_SVE2
|
||||
#define HAS_NV12TORGB24ROW_SVE2
|
||||
#define HAS_NV21TOARGBROW_SVE2
|
||||
@ -639,6 +640,7 @@ extern "C" {
|
||||
#define HAS_I422TORGBAROW_SME
|
||||
#define HAS_I444ALPHATOARGBROW_SME
|
||||
#define HAS_I444TOARGBROW_SME
|
||||
#define HAS_I444TORGB24ROW_SME
|
||||
#define HAS_INTERPOLATEROW_16_SME
|
||||
#define HAS_INTERPOLATEROW_16TO8_SME
|
||||
#define HAS_INTERPOLATEROW_SME
|
||||
@ -1217,6 +1219,18 @@ void I444ToRGB24Row_NEON(const uint8_t* src_y,
|
||||
uint8_t* dst_rgb24,
|
||||
const struct YuvConstants* yuvconstants,
|
||||
int width);
|
||||
// SVE2 row function for I444 -> RGB24. Reads `width` pixels from the three
// planes src_y / src_u / src_v and writes 3 bytes per pixel to dst_rgb24,
// using the coefficients in `yuvconstants`. The definition shown later in this
// change forwards to I444ToRGB24Row_SVE_SC.
void I444ToRGB24Row_SVE2(const uint8_t* src_y,
|
||||
const uint8_t* src_u,
|
||||
const uint8_t* src_v,
|
||||
uint8_t* dst_rgb24,
|
||||
const struct YuvConstants* yuvconstants,
|
||||
int width);
|
||||
// SME row function for I444 -> RGB24. Same parameters as the SVE2 variant; the
// definition shown later in this change is marked __arm_locally_streaming and
// forwards to the same I444ToRGB24Row_SVE_SC kernel.
void I444ToRGB24Row_SME(const uint8_t* src_y,
|
||||
const uint8_t* src_u,
|
||||
const uint8_t* src_v,
|
||||
uint8_t* dst_rgb24,
|
||||
const struct YuvConstants* yuvconstants,
|
||||
int width);
|
||||
void I210ToARGBRow_NEON(const uint16_t* src_y,
|
||||
const uint16_t* src_u,
|
||||
const uint16_t* src_v,
|
||||
|
||||
@ -51,6 +51,21 @@ extern "C" {
|
||||
"trn1 z0.b, z0.b, z0.b \n" \
|
||||
"prfm pldl1keep, [%[src_v], 448] \n"
|
||||
|
||||
// Read twice as much data from YUV, putting the even elements from the Y data
// in z0.h and odd elements in z1.h. U is left as bytes in z2.b and V as bytes
// in z3.b. All three loads are governed by p1 (inactive lanes are zeroed).
|
||||
// TRN1/TRN2 are applied with z0 as both sources, so each .h lane of z0/z1
// holds its Y byte duplicated in both halves. The three source pointers are
// advanced by one vector length in bytes (incb) and the next data is
// prefetched.
|
||||
#define READYUV444_SVE_2X \
|
||||
"ld1b {z0.b}, p1/z, [%[src_y]] \n" \
|
||||
"ld1b {z2.b}, p1/z, [%[src_u]] \n" \
|
||||
"ld1b {z3.b}, p1/z, [%[src_v]] \n" \
|
||||
"incb %[src_y] \n" \
|
||||
"incb %[src_u] \n" \
|
||||
"incb %[src_v] \n" \
|
||||
"prfm pldl1keep, [%[src_y], 448] \n" \
|
||||
"prfm pldl1keep, [%[src_u], 128] \n" \
|
||||
"prfm pldl1keep, [%[src_v], 128] \n" \
|
||||
"trn2 z1.b, z0.b, z0.b \n" \
|
||||
"trn1 z0.b, z0.b, z0.b \n"
|
||||
|
||||
#define READYUV400_SVE \
|
||||
"ld1b {z0.h}, p1/z, [%[src_y]] \n" \
|
||||
"inch %[src_y] \n" \
|
||||
@ -193,6 +208,30 @@ extern "C" {
|
||||
"uqsub z16.h, z16.h, z25.h \n" /* B */ \
|
||||
"uqsub z18.h, z18.h, z27.h \n" /* R */
|
||||
|
||||
// YUV -> RGB arithmetic for two interleaved sets of pixels (even and odd).
//
// Inputs:  z0.h / z1.h = Y for even / odd pixels, z2.b = U, z3.b = V
//          (the register layout produced by READYUV444_SVE_2X).
// Reads:   z24-z31 as constants. NOTE(review): presumably loaded by
//          YUVTORGB_SVE_SETUP, which is not visible here - confirm.
// Outputs: even pixels in z16.h (B), z17.h (G), z18.h (R);
//          odd pixels in z20.h (B), z21.h (G), z22.h (R).
// Clobbers z0-z7.
//
// UMULLB/UMULLT widen-multiply the even/odd bytes of U and V, so the even and
// odd pixel sets are computed side by side without a separate widening step:
//   B = Y + z28 * U - z25
//   G = Y + z26 - (z30 * U + z31 * V)
//   R = Y + z29 * V - z27
// where Y has already been scaled by z24 (UMULH) and every subtraction
// saturates at zero (UQSUB).
#define I444TORGB_SVE_2X \
|
||||
"umulh z0.h, z24.h, z0.h \n" /* Y0 */ \
|
||||
"umulh z1.h, z24.h, z1.h \n" /* Y1 */ \
|
||||
"umullb z6.h, z30.b, z2.b \n" \
|
||||
"umullt z7.h, z30.b, z2.b \n" \
|
||||
"umullb z4.h, z28.b, z2.b \n" /* DB */ \
|
||||
"umullt z2.h, z28.b, z2.b \n" /* DB */ \
|
||||
"umlalb z6.h, z31.b, z3.b \n" /* DG */ \
|
||||
"umlalt z7.h, z31.b, z3.b \n" /* DG */ \
|
||||
"umullb z5.h, z29.b, z3.b \n" /* DR */ \
|
||||
"umullt z3.h, z29.b, z3.b \n" /* DR */ \
|
||||
"add z17.h, z0.h, z26.h \n" /* G */ \
|
||||
"add z21.h, z1.h, z26.h \n" /* G */ \
|
||||
"add z16.h, z0.h, z4.h \n" /* B */ \
|
||||
"add z20.h, z1.h, z2.h \n" /* B */ \
|
||||
"add z18.h, z0.h, z5.h \n" /* R */ \
|
||||
"add z22.h, z1.h, z3.h \n" /* R */ \
|
||||
"uqsub z17.h, z17.h, z6.h \n" /* G */ \
|
||||
"uqsub z21.h, z21.h, z7.h \n" /* G */ \
|
||||
"uqsub z16.h, z16.h, z25.h \n" /* B */ \
|
||||
"uqsub z20.h, z20.h, z25.h \n" /* B */ \
|
||||
"uqsub z18.h, z18.h, z27.h \n" /* R */ \
|
||||
"uqsub z22.h, z22.h, z27.h \n" /* R */
|
||||
|
||||
// Like I4XXTORGB_SVE but U/V components are stored in even/odd .b lanes of z1
|
||||
// rather than widened .h elements of z1/z2.
|
||||
#define NVTORGB_SVE \
|
||||
@ -318,6 +357,53 @@ extern "C" {
|
||||
"z20", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", \
|
||||
"z31", "p0", "p1", "p2", "p3"
|
||||
|
||||
// Shared I444 -> RGB24 row kernel, called by both I444ToRGB24Row_SVE2 and the
// __arm_locally_streaming I444ToRGB24Row_SME.
//
// src_y, src_u, src_v: input planes; one byte is read from each per pixel.
// dst_rgb24:           output row; ST3B writes 3 interleaved bytes per pixel
//                      from z16, z17, z18 (labelled B, G, R by the conversion
//                      macros).
// yuvconstants:        supplies kUVCoeff and kRGBCoeffBias to the asm.
// width:               number of pixels to convert.
//
// Each iteration handles one full vector of bytes (CNTB pixels). The unrolled
// "_2X" macros are used because ST3B has no partial-vector form. A final
// predicated iteration handles any remaining pixels.
static inline void I444ToRGB24Row_SVE_SC(
|
||||
const uint8_t* src_y,
|
||||
const uint8_t* src_u,
|
||||
const uint8_t* src_v,
|
||||
uint8_t* dst_rgb24,
|
||||
const struct YuvConstants* yuvconstants,
|
||||
int width) STREAMING_COMPATIBLE {
|
||||
// Pixels processed per main-loop iteration; written by CNTB below.
uint64_t vl;
|
||||
asm volatile(
|
||||
"cntb %[vl] \n"
|
||||
"ptrue p0.b \n" //
|
||||
YUVTORGB_SVE_SETUP
|
||||
// Pre-decrement width; if fewer than vl pixels exist, go straight to the
// tail handling at label 2.
"subs %w[width], %w[width], %w[vl] \n"
|
||||
"b.lt 2f \n"
|
||||

||||
// Run bulk of computation with an all-true predicate to avoid predicate
|
||||
// generation overhead.
|
||||
"ptrue p1.b \n"
|
||||
"1: \n" //
|
||||
READYUV444_SVE_2X I444TORGB_SVE_2X RGBTOARGB8_SVE_2X
|
||||
"subs %w[width], %w[width], %w[vl] \n"
|
||||
"st3b {z16.b, z17.b, z18.b}, p1, [%[dst_rgb24]] \n"
|
||||
// Advance the destination by three vector lengths (3 bytes per pixel).
"incb %[dst_rgb24], all, mul #3 \n"
|
||||
"b.ge 1b \n"
|
||||

||||
"2: \n"
|
||||
// Undo the pre-decrement to recover the number of leftover pixels; exit if
// there are none.
"adds %w[width], %w[width], %w[vl] \n"
|
||||
"b.eq 99f \n"
|
||||

||||
// Calculate a predicate for the final iteration to deal with the tail.
|
||||
// NOTE(review): %[vl] is not referenced by any instruction visible in this
// block after this point; confirm whether this cnth is still needed.
"cnth %[vl] \n"
|
||||
"whilelt p1.b, wzr, %w[width] \n" //
|
||||
READYUV444_SVE_2X I444TORGB_SVE_2X RGBTOARGB8_SVE_2X
|
||||
"st3b {z16.b, z17.b, z18.b}, p1, [%[dst_rgb24]] \n"
|
||||

||||
"99: \n"
|
||||
: [src_y] "+r"(src_y), // %[src_y]
|
||||
[src_u] "+r"(src_u), // %[src_u]
|
||||
[src_v] "+r"(src_v), // %[src_v]
|
||||
[dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24]
|
||||
[width] "+r"(width), // %[width]
|
||||
[vl] "=&r"(vl) // %[vl]
|
||||
: [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
|
||||
[kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
|
||||
: "cc", "memory", YUVTORGB_SVE_REGS);
|
||||
}
|
||||
|
||||
static inline void I400ToARGBRow_SVE_SC(const uint8_t* src_y,
|
||||
uint8_t* dst_argb,
|
||||
const struct YuvConstants* yuvconstants,
|
||||
|
||||
@ -871,6 +871,16 @@ int I444ToRGB24Matrix(const uint8_t* src_y,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_I444TORGB24ROW_SVE2)
|
||||
if (TestCpuFlag(kCpuHasSVE2)) {
|
||||
I444ToRGB24Row = I444ToRGB24Row_SVE2;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_I444TORGB24ROW_SME)
|
||||
if (TestCpuFlag(kCpuHasSME)) {
|
||||
I444ToRGB24Row = I444ToRGB24Row_SME;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_I444TORGB24ROW_MSA)
|
||||
if (TestCpuFlag(kCpuHasMSA)) {
|
||||
I444ToRGB24Row = I444ToRGB24Row_Any_MSA;
|
||||
@ -7127,6 +7137,16 @@ static int I420ToRGB24MatrixBilinear(const uint8_t* src_y,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_I444TORGB24ROW_SVE2)
|
||||
if (TestCpuFlag(kCpuHasSVE2)) {
|
||||
I444ToRGB24Row = I444ToRGB24Row_SVE2;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_I444TORGB24ROW_SME)
|
||||
if (TestCpuFlag(kCpuHasSME)) {
|
||||
I444ToRGB24Row = I444ToRGB24Row_SME;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_I444TORGB24ROW_MSA)
|
||||
if (TestCpuFlag(kCpuHasMSA)) {
|
||||
I444ToRGB24Row = I444ToRGB24Row_Any_MSA;
|
||||
@ -8952,6 +8972,16 @@ static int I422ToRGB24MatrixLinear(const uint8_t* src_y,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_I444TORGB24ROW_SVE2)
|
||||
if (TestCpuFlag(kCpuHasSVE2)) {
|
||||
I444ToRGB24Row = I444ToRGB24Row_SVE2;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_I444TORGB24ROW_SME)
|
||||
if (TestCpuFlag(kCpuHasSME)) {
|
||||
I444ToRGB24Row = I444ToRGB24Row_SME;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_I444TORGB24ROW_RVV)
|
||||
if (TestCpuFlag(kCpuHasRVV)) {
|
||||
I444ToRGB24Row = I444ToRGB24Row_RVV;
|
||||
|
||||
@ -19,45 +19,6 @@ extern "C" {
|
||||
#if !defined(LIBYUV_DISABLE_SME) && defined(CLANG_HAS_SME) && \
|
||||
defined(__aarch64__)
|
||||
|
||||
// Read twice as much data from YUV, putting the even elements from the Y data
|
||||
// in z0.h and odd elements in z1.h.
|
||||
#define READYUV444_SVE_2X \
|
||||
"ld1b {z0.b}, p1/z, [%[src_y]] \n" \
|
||||
"ld1b {z2.b}, p1/z, [%[src_u]] \n" \
|
||||
"ld1b {z3.b}, p1/z, [%[src_v]] \n" \
|
||||
"incb %[src_y] \n" \
|
||||
"incb %[src_u] \n" \
|
||||
"incb %[src_v] \n" \
|
||||
"prfm pldl1keep, [%[src_y], 448] \n" \
|
||||
"prfm pldl1keep, [%[src_u], 128] \n" \
|
||||
"prfm pldl1keep, [%[src_v], 128] \n" \
|
||||
"trn2 z1.b, z0.b, z0.b \n" \
|
||||
"trn1 z0.b, z0.b, z0.b \n"
|
||||
|
||||
#define I444TORGB_SVE_2X \
|
||||
"umulh z0.h, z24.h, z0.h \n" /* Y0 */ \
|
||||
"umulh z1.h, z24.h, z1.h \n" /* Y1 */ \
|
||||
"umullb z6.h, z30.b, z2.b \n" \
|
||||
"umullt z7.h, z30.b, z2.b \n" \
|
||||
"umullb z4.h, z28.b, z2.b \n" /* DB */ \
|
||||
"umullt z2.h, z28.b, z2.b \n" /* DB */ \
|
||||
"umlalb z6.h, z31.b, z3.b \n" /* DG */ \
|
||||
"umlalt z7.h, z31.b, z3.b \n" /* DG */ \
|
||||
"umullb z5.h, z29.b, z3.b \n" /* DR */ \
|
||||
"umullt z3.h, z29.b, z3.b \n" /* DR */ \
|
||||
"add z17.h, z0.h, z26.h \n" /* G */ \
|
||||
"add z21.h, z1.h, z26.h \n" /* G */ \
|
||||
"add z16.h, z0.h, z4.h \n" /* B */ \
|
||||
"add z20.h, z1.h, z2.h \n" /* B */ \
|
||||
"add z18.h, z0.h, z5.h \n" /* R */ \
|
||||
"add z22.h, z1.h, z3.h \n" /* R */ \
|
||||
"uqsub z17.h, z17.h, z6.h \n" /* G */ \
|
||||
"uqsub z21.h, z21.h, z7.h \n" /* G */ \
|
||||
"uqsub z16.h, z16.h, z25.h \n" /* B */ \
|
||||
"uqsub z20.h, z20.h, z25.h \n" /* B */ \
|
||||
"uqsub z18.h, z18.h, z27.h \n" /* R */ \
|
||||
"uqsub z22.h, z22.h, z27.h \n" /* R */
|
||||
|
||||
#define RGBTOARGB8_SVE_2X \
|
||||
/* Inputs: B: z16.h, G: z17.h, R: z18.h, A: z19.b */ \
|
||||
"uqshrnb z16.b, z16.h, #6 \n" /* B0 */ \
|
||||
@ -115,6 +76,16 @@ __arm_locally_streaming void I444ToARGBRow_SME(
|
||||
: "cc", "memory", YUVTORGB_SVE_REGS);
|
||||
}
|
||||
|
||||
// SME entry point for I444 -> RGB24 row conversion. Declared
// __arm_locally_streaming; forwards every argument unchanged to the shared
// I444ToRGB24Row_SVE_SC kernel.
__arm_locally_streaming void I444ToRGB24Row_SME(
|
||||
const uint8_t* src_y,
|
||||
const uint8_t* src_u,
|
||||
const uint8_t* src_v,
|
||||
uint8_t* dst_rgb24,
|
||||
const struct YuvConstants* yuvconstants,
|
||||
int width) {
|
||||
I444ToRGB24Row_SVE_SC(src_y, src_u, src_v, dst_rgb24, yuvconstants, width);
|
||||
}
|
||||
|
||||
__arm_locally_streaming void I400ToARGBRow_SME(
|
||||
const uint8_t* src_y,
|
||||
uint8_t* dst_argb,
|
||||
|
||||
@ -82,6 +82,15 @@ void I444ToARGBRow_SVE2(const uint8_t* src_y,
|
||||
: "cc", "memory", YUVTORGB_SVE_REGS);
|
||||
}
|
||||
|
||||
// SVE2 entry point for I444 -> RGB24 row conversion. Forwards every argument
// unchanged to the shared I444ToRGB24Row_SVE_SC kernel.
void I444ToRGB24Row_SVE2(const uint8_t* src_y,
|
||||
const uint8_t* src_u,
|
||||
const uint8_t* src_v,
|
||||
uint8_t* dst_rgb24,
|
||||
const struct YuvConstants* yuvconstants,
|
||||
int width) {
|
||||
I444ToRGB24Row_SVE_SC(src_y, src_u, src_v, dst_rgb24, yuvconstants, width);
|
||||
}
|
||||
|
||||
void I400ToARGBRow_SVE2(const uint8_t* src_y,
|
||||
uint8_t* dst_argb,
|
||||
const struct YuvConstants* yuvconstants,
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user