From cd4113f4e8463b653985054e7f16699de35a3a9d Mon Sep 17 00:00:00 2001 From: George Steed Date: Tue, 23 Apr 2024 15:13:47 +0100 Subject: [PATCH] [AArch64] Add SVE2 implementation of I400ToARGBRow This is mostly a copy of the I422ToARGBRow_SVE2 implementation, but we can pre-calculate the UV component results before the loop body. Unlike in the Neon version of the code we can make use of MOVPRFX and USQADD to avoid needing to apply the bias separately from the UV coefficient multiply additions. Reduction in runtime observed compared to the existing Neon code: Cortex-A510: -26.1% Cortex-A520: -5.9% Cortex-A715: -49.5% Cortex-A720: -49.4% Cortex-X2: -22.5% Cortex-X3: -23.5% Cortex-X4: -21.6% Bug: libyuv:973 Change-Id: Ib9fc52bd53a1c6a1aac8bd865ab88539aca098ea Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5598767 Reviewed-by: Justin Green Reviewed-by: Frank Barchard --- include/libyuv/row.h | 5 ++++ source/convert_argb.cc | 5 ++++ source/row_sve.cc | 60 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 70 insertions(+) diff --git a/include/libyuv/row.h b/include/libyuv/row.h index f3e1d2eaf..0c5194b7a 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -585,6 +585,7 @@ extern "C" { #define HAS_AYUVTOUVROW_SVE2 #define HAS_AYUVTOVUROW_SVE2 #define HAS_BGRATOUVROW_SVE2 +#define HAS_I400TOARGBROW_SVE2 #define HAS_I422ALPHATOARGBROW_SVE2 #define HAS_I422TOARGBROW_SVE2 #define HAS_I422TORGBAROW_SVE2 @@ -4743,6 +4744,10 @@ void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); +void I400ToARGBRow_SVE2(const uint8_t* src_y, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); void I400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, diff --git a/source/convert_argb.cc b/source/convert_argb.cc index bc37802b0..0aae2f8b3 100644 --- a/source/convert_argb.cc +++ b/source/convert_argb.cc @@ -2980,6 
+2980,11 @@ int I400ToARGBMatrix(const uint8_t* src_y,
     }
   }
 #endif
+#if defined(HAS_I400TOARGBROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    I400ToARGBRow = I400ToARGBRow_SVE2;  // row function predicates its own tail
+  }
+#endif
 #if defined(HAS_I400TOARGBROW_MSA)
   if (TestCpuFlag(kCpuHasMSA)) {
     I400ToARGBRow = I400ToARGBRow_Any_MSA;
diff --git a/source/row_sve.cc b/source/row_sve.cc
index f06e46fbd..814023056 100644
--- a/source/row_sve.cc
+++ b/source/row_sve.cc
@@ -43,6 +43,12 @@ extern "C" {
   "trn1 z1.h, z1.h, z1.h \n" \
   "trn1 z2.h, z2.h, z2.h \n"
 
+#define READYUV400_SVE /* Y-only load; reads p1, advances %[src_y] */ \
+  "ld1b {z0.h}, p1/z, [%[src_y]] \n" /* one Y byte per 16-bit lane */ \
+  "inch %[src_y] \n" /* src_y += number of 16-bit lanes */ \
+  "prfm pldl1keep, [%[src_y], 448] \n" /* prefetch 448 bytes ahead */ \
+  "trn1 z0.b, z0.b, z0.b \n" /* copy Y into both bytes of each lane */
+
 // We need a different predicate for the UV component to handle the tail.
 // If there is a single element remaining then we want to load one Y element
 // but two UV elements.
@@ -80,6 +86,14 @@ extern "C" {
   "uqsub z16.h, z16.h, z25.h \n" /* B */ \
   "uqsub z18.h, z18.h, z27.h \n" /* R */
 
+#define I400TORGB_SVE /* reads z0, z24, z4-z6, p0; writes z16-z18 */ \
+  "umulh z18.h, z24.h, z0.h \n" /* Y: high half of z24 * z0 */ \
+  "movprfx z16, z18 \n" /* z16 = Y, prefix for the usqadd below */ \
+  "usqadd z16.h, p0/m, z16.h, z4.h \n" /* B = sat(Y + z4) */ \
+  "movprfx z17, z18 \n" /* z17 = Y, prefix for the usqadd below */ \
+  "usqadd z17.h, p0/m, z17.h, z6.h \n" /* G = sat(Y + z6) */ \
+  "usqadd z18.h, p0/m, z18.h, z5.h \n" /* R = sat(Y + z5); overwrites Y, so last */
+
 // Convert from 2.14 fixed point RGB to 8 bit ARGB, interleaving as BG and RA
 // pairs to allow us to use ST2 for storing rather than ST4.
#define RGBTOARGB8_SVE \
@@ -146,6 +160,52 @@ void I444ToARGBRow_SVE2(const uint8_t* src_y,
       : "cc", "memory", YUVTORGB_SVE_REGS);
 }
 
+void I400ToARGBRow_SVE2(const uint8_t* src_y,  // in: `width` luma bytes
+                        uint8_t* dst_argb,     // out: 4 bytes per pixel
+                        const struct YuvConstants* yuvconstants,
+                        int width) {
+  uint64_t vl;  // written by CNTH: 16-bit lanes per vector = pixels per pass
+  asm volatile("cnth %[vl] \n"  // volatile: every output operand is dead afterwards
+      "ptrue p0.b \n"  // all-true governing predicate for the arithmetic
+      "dup z19.b, #255 \n"  // A
+      YUVTORGB_SVE_SETUP
+      "cmp %w[width], %w[vl] \n"  // flags are consumed by b.le below
+      "mov z1.h, #128 \n"  // U/V: constant chroma, so UV terms are loop-invariant
+      "mul z6.h, z30.h, z1.h \n"  // DG, first term
+      "mul z4.h, z28.h, z1.h \n"  // DB
+      "mul z5.h, z29.h, z1.h \n"  // DR
+      "mla z6.h, p0/m, z31.h, z1.h \n"  // DG
+      "sub z4.h, z4.h, z25.h \n"  // DB -= z25 (presumably B bias from setup)
+      "sub z5.h, z5.h, z27.h \n"  // DR -= z27 (presumably R bias from setup)
+      "sub z6.h, z26.h, z6.h \n"  // DG = z26 - DG (presumably G bias; confirm)
+      "b.le 2f \n"  // width <= vl: only the predicated tail runs
+
+      // Run bulk of computation with an all-true predicate to avoid predicate
+      // generation overhead.
+      "ptrue p1.h \n"
+      "sub %w[width], %w[width], %w[vl] \n"  // pre-bias: loop leaves 1..vl pixels
+      "1: \n"  // main loop, vl pixels per pass
+      READYUV400_SVE I400TORGB_SVE RGBTOARGB8_SVE
+      "subs %w[width], %w[width], %w[vl] \n"
+      "st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n"
+      "add %[dst_argb], %[dst_argb], %[vl], lsl #2 \n"  // 4 bytes per pixel
+      "b.gt 1b \n"
+      "add %w[width], %w[width], %w[vl] \n"  // undo the pre-bias
+
+      // Calculate a predicate for the final iteration to deal with the tail.
+      "2: \n"
+      "whilelt p1.h, wzr, %w[width] \n"  // p1 = lanes [0, width)
+      READYUV400_SVE I400TORGB_SVE RGBTOARGB8_SVE
+      "st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n"
+      : [src_y] "+r"(src_y), // %[src_y]
+        [dst_argb] "+r"(dst_argb), // %[dst_argb]
+        [width] "+r"(width), // %[width]
+        [vl] "=&r"(vl) // %[vl]
+      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+      : "cc", "memory", YUVTORGB_SVE_REGS);
+}
+
 void I422ToARGBRow_SVE2(const uint8_t* src_y,
                         const uint8_t* src_u,
                         const uint8_t* src_v,