From 949cb623bf904c5e7a7c060ab0ae609574870fb3 Mon Sep 17 00:00:00 2001 From: George Steed Date: Sun, 23 Mar 2025 10:13:57 +0000 Subject: [PATCH] Add SVE2 and SME implementations of I444ToRGB24Row Move the READYUV444_SVE_2X and I444TORGB_SVE_2X macros to row_sve.h so they are usable in both SVE2 and SME implementations, and use them to add new I444ToRGB24Row implementations for SVE2 and SME. We need to use the unrolled versions here to use the ST3B interleaving store instructions, since there is no partial vector version of this store instruction. Reduction in time taken observed for the new SVE2 implementation, compared to the existing Neon implementation: Cortex-A510: -57.6% Cortex-A520: -38.1% Cortex-A710: -15.5% Cortex-A715: -9.2% Cortex-A720: -9.2% Cortex-X2: -25.8% Cortex-X3: -26.2% Cortex-X4: -23.2% Cortex-X925: -17.8% Change-Id: I6acd0b798a35e5352d4fad664769f12d3d938ed7 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6530646 Reviewed-by: Justin Green Reviewed-by: Frank Barchard --- include/libyuv/row.h | 14 +++++++ include/libyuv/row_sve.h | 86 ++++++++++++++++++++++++++++++++++++++++ source/convert_argb.cc | 30 ++++++++++++++ source/row_sme.cc | 49 +++++------------------ source/row_sve.cc | 9 +++++ 5 files changed, 149 insertions(+), 39 deletions(-) diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 46cec2723..e83773f11 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -597,6 +597,7 @@ extern "C" { #define HAS_I422TORGBAROW_SVE2 #define HAS_I444ALPHATOARGBROW_SVE2 #define HAS_I444TOARGBROW_SVE2 +#define HAS_I444TORGB24ROW_SVE2 #define HAS_NV12TOARGBROW_SVE2 #define HAS_NV12TORGB24ROW_SVE2 #define HAS_NV21TOARGBROW_SVE2 @@ -639,6 +640,7 @@ extern "C" { #define HAS_I422TORGBAROW_SME #define HAS_I444ALPHATOARGBROW_SME #define HAS_I444TOARGBROW_SME +#define HAS_I444TORGB24ROW_SME #define HAS_INTERPOLATEROW_16_SME #define HAS_INTERPOLATEROW_16TO8_SME #define HAS_INTERPOLATEROW_SME @@ -1217,6 +1219,18 @@ void 
I444ToRGB24Row_NEON(const uint8_t* src_y, uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width); +void I444ToRGB24Row_SVE2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width); +void I444ToRGB24Row_SME(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width); void I210ToARGBRow_NEON(const uint16_t* src_y, const uint16_t* src_u, const uint16_t* src_v, diff --git a/include/libyuv/row_sve.h b/include/libyuv/row_sve.h index 82103419c..5c0c210df 100644 --- a/include/libyuv/row_sve.h +++ b/include/libyuv/row_sve.h @@ -51,6 +51,21 @@ extern "C" { "trn1 z0.b, z0.b, z0.b \n" \ "prfm pldl1keep, [%[src_v], 448] \n" +// Read twice as much data from YUV, putting the even elements from the Y data +// in z0.h and odd elements in z1.h. +#define READYUV444_SVE_2X \ + "ld1b {z0.b}, p1/z, [%[src_y]] \n" \ + "ld1b {z2.b}, p1/z, [%[src_u]] \n" \ + "ld1b {z3.b}, p1/z, [%[src_v]] \n" \ + "incb %[src_y] \n" \ + "incb %[src_u] \n" \ + "incb %[src_v] \n" \ + "prfm pldl1keep, [%[src_y], 448] \n" \ + "prfm pldl1keep, [%[src_u], 128] \n" \ + "prfm pldl1keep, [%[src_v], 128] \n" \ + "trn2 z1.b, z0.b, z0.b \n" \ + "trn1 z0.b, z0.b, z0.b \n" + #define READYUV400_SVE \ "ld1b {z0.h}, p1/z, [%[src_y]] \n" \ "inch %[src_y] \n" \ @@ -193,6 +208,30 @@ extern "C" { "uqsub z16.h, z16.h, z25.h \n" /* B */ \ "uqsub z18.h, z18.h, z27.h \n" /* R */ +#define I444TORGB_SVE_2X \ + "umulh z0.h, z24.h, z0.h \n" /* Y0 */ \ + "umulh z1.h, z24.h, z1.h \n" /* Y1 */ \ + "umullb z6.h, z30.b, z2.b \n" \ + "umullt z7.h, z30.b, z2.b \n" \ + "umullb z4.h, z28.b, z2.b \n" /* DB */ \ + "umullt z2.h, z28.b, z2.b \n" /* DB */ \ + "umlalb z6.h, z31.b, z3.b \n" /* DG */ \ + "umlalt z7.h, z31.b, z3.b \n" /* DG */ \ + "umullb z5.h, z29.b, z3.b \n" /* DR */ \ + "umullt z3.h, z29.b, z3.b \n" /* DR */ \ + "add z17.h, z0.h, z26.h \n" 
/* G */ \ + "add z21.h, z1.h, z26.h \n" /* G */ \ + "add z16.h, z0.h, z4.h \n" /* B */ \ + "add z20.h, z1.h, z2.h \n" /* B */ \ + "add z18.h, z0.h, z5.h \n" /* R */ \ + "add z22.h, z1.h, z3.h \n" /* R */ \ + "uqsub z17.h, z17.h, z6.h \n" /* G */ \ + "uqsub z21.h, z21.h, z7.h \n" /* G */ \ + "uqsub z16.h, z16.h, z25.h \n" /* B */ \ + "uqsub z20.h, z20.h, z25.h \n" /* B */ \ + "uqsub z18.h, z18.h, z27.h \n" /* R */ \ + "uqsub z22.h, z22.h, z27.h \n" /* R */ + // Like I4XXTORGB_SVE but U/V components are stored in even/odd .b lanes of z1 // rather than widened .h elements of z1/z2. #define NVTORGB_SVE \ @@ -318,6 +357,53 @@ extern "C" { "z20", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", \ "z31", "p0", "p1", "p2", "p3" +static inline void I444ToRGB24Row_SVE_SC( + const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) STREAMING_COMPATIBLE { + uint64_t vl; + asm volatile( + "cntb %[vl] \n" + "ptrue p0.b \n" // + YUVTORGB_SVE_SETUP + "subs %w[width], %w[width], %w[vl] \n" + "b.lt 2f \n" + + // Run bulk of computation with an all-true predicate to avoid predicate + // generation overhead. + "ptrue p1.b \n" + "1: \n" // + READYUV444_SVE_2X I444TORGB_SVE_2X RGBTOARGB8_SVE_2X + "subs %w[width], %w[width], %w[vl] \n" + "st3b {z16.b, z17.b, z18.b}, p1, [%[dst_rgb24]] \n" + "incb %[dst_rgb24], all, mul #3 \n" + "b.ge 1b \n" + + "2: \n" + "adds %w[width], %w[width], %w[vl] \n" + "b.eq 99f \n" + + // Calculate a predicate for the final iteration to deal with the tail. 
+ "cnth %[vl] \n" + "whilelt p1.b, wzr, %w[width] \n" // + READYUV444_SVE_2X I444TORGB_SVE_2X RGBTOARGB8_SVE_2X + "st3b {z16.b, z17.b, z18.b}, p1, [%[dst_rgb24]] \n" + + "99: \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24] + [width] "+r"(width), // %[width] + [vl] "=&r"(vl) // %[vl] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_SVE_REGS); +} + static inline void I400ToARGBRow_SVE_SC(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, diff --git a/source/convert_argb.cc b/source/convert_argb.cc index 41997fe3b..e9346f3e8 100644 --- a/source/convert_argb.cc +++ b/source/convert_argb.cc @@ -871,6 +871,16 @@ int I444ToRGB24Matrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I444TORGB24ROW_SVE2) + if (TestCpuFlag(kCpuHasSVE2)) { + I444ToRGB24Row = I444ToRGB24Row_SVE2; + } +#endif +#if defined(HAS_I444TORGB24ROW_SME) + if (TestCpuFlag(kCpuHasSME)) { + I444ToRGB24Row = I444ToRGB24Row_SME; + } +#endif #if defined(HAS_I444TORGB24ROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { I444ToRGB24Row = I444ToRGB24Row_Any_MSA; @@ -7127,6 +7137,16 @@ static int I420ToRGB24MatrixBilinear(const uint8_t* src_y, } } #endif +#if defined(HAS_I444TORGB24ROW_SVE2) + if (TestCpuFlag(kCpuHasSVE2)) { + I444ToRGB24Row = I444ToRGB24Row_SVE2; + } +#endif +#if defined(HAS_I444TORGB24ROW_SME) + if (TestCpuFlag(kCpuHasSME)) { + I444ToRGB24Row = I444ToRGB24Row_SME; + } +#endif #if defined(HAS_I444TORGB24ROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { I444ToRGB24Row = I444ToRGB24Row_Any_MSA; @@ -8952,6 +8972,16 @@ static int I422ToRGB24MatrixLinear(const uint8_t* src_y, } } #endif +#if defined(HAS_I444TORGB24ROW_SVE2) + if (TestCpuFlag(kCpuHasSVE2)) { + I444ToRGB24Row = I444ToRGB24Row_SVE2; + } +#endif +#if defined(HAS_I444TORGB24ROW_SME) + if
(TestCpuFlag(kCpuHasSME)) { + I444ToRGB24Row = I444ToRGB24Row_SME; + } +#endif #if defined(HAS_I444TORGB24ROW_RVV) if (TestCpuFlag(kCpuHasRVV)) { I444ToRGB24Row = I444ToRGB24Row_RVV; diff --git a/source/row_sme.cc b/source/row_sme.cc index 1cbc42f3e..c6917bf3c 100644 --- a/source/row_sme.cc +++ b/source/row_sme.cc @@ -19,45 +19,6 @@ extern "C" { #if !defined(LIBYUV_DISABLE_SME) && defined(CLANG_HAS_SME) && \ defined(__aarch64__) -// Read twice as much data from YUV, putting the even elements from the Y data -// in z0.h and odd elements in z1.h. -#define READYUV444_SVE_2X \ - "ld1b {z0.b}, p1/z, [%[src_y]] \n" \ - "ld1b {z2.b}, p1/z, [%[src_u]] \n" \ - "ld1b {z3.b}, p1/z, [%[src_v]] \n" \ - "incb %[src_y] \n" \ - "incb %[src_u] \n" \ - "incb %[src_v] \n" \ - "prfm pldl1keep, [%[src_y], 448] \n" \ - "prfm pldl1keep, [%[src_u], 128] \n" \ - "prfm pldl1keep, [%[src_v], 128] \n" \ - "trn2 z1.b, z0.b, z0.b \n" \ - "trn1 z0.b, z0.b, z0.b \n" - -#define I444TORGB_SVE_2X \ - "umulh z0.h, z24.h, z0.h \n" /* Y0 */ \ - "umulh z1.h, z24.h, z1.h \n" /* Y1 */ \ - "umullb z6.h, z30.b, z2.b \n" \ - "umullt z7.h, z30.b, z2.b \n" \ - "umullb z4.h, z28.b, z2.b \n" /* DB */ \ - "umullt z2.h, z28.b, z2.b \n" /* DB */ \ - "umlalb z6.h, z31.b, z3.b \n" /* DG */ \ - "umlalt z7.h, z31.b, z3.b \n" /* DG */ \ - "umullb z5.h, z29.b, z3.b \n" /* DR */ \ - "umullt z3.h, z29.b, z3.b \n" /* DR */ \ - "add z17.h, z0.h, z26.h \n" /* G */ \ - "add z21.h, z1.h, z26.h \n" /* G */ \ - "add z16.h, z0.h, z4.h \n" /* B */ \ - "add z20.h, z1.h, z2.h \n" /* B */ \ - "add z18.h, z0.h, z5.h \n" /* R */ \ - "add z22.h, z1.h, z3.h \n" /* R */ \ - "uqsub z17.h, z17.h, z6.h \n" /* G */ \ - "uqsub z21.h, z21.h, z7.h \n" /* G */ \ - "uqsub z16.h, z16.h, z25.h \n" /* B */ \ - "uqsub z20.h, z20.h, z25.h \n" /* B */ \ - "uqsub z18.h, z18.h, z27.h \n" /* R */ \ - "uqsub z22.h, z22.h, z27.h \n" /* R */ - #define RGBTOARGB8_SVE_2X \ /* Inputs: B: z16.h, G: z17.h, R: z18.h, A: z19.b */ \ "uqshrnb z16.b, z16.h, #6 \n" /* B0 
*/ \ @@ -115,6 +76,16 @@ __arm_locally_streaming void I444ToARGBRow_SME( : "cc", "memory", YUVTORGB_SVE_REGS); } +__arm_locally_streaming void I444ToRGB24Row_SME( + const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + I444ToRGB24Row_SVE_SC(src_y, src_u, src_v, dst_rgb24, yuvconstants, width); +} + __arm_locally_streaming void I400ToARGBRow_SME( const uint8_t* src_y, uint8_t* dst_argb, diff --git a/source/row_sve.cc b/source/row_sve.cc index ba89b163a..474c7950c 100644 --- a/source/row_sve.cc +++ b/source/row_sve.cc @@ -82,6 +82,15 @@ void I444ToARGBRow_SVE2(const uint8_t* src_y, : "cc", "memory", YUVTORGB_SVE_REGS); } +void I444ToRGB24Row_SVE2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + I444ToRGB24Row_SVE_SC(src_y, src_u, src_v, dst_rgb24, yuvconstants, width); +} + void I400ToARGBRow_SVE2(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants,