Add SVE2 and SME implementations of I422ToAR30Row

This can make use of the existing load/convert/store macros that are already present for other kernels, so add I422ToAR30Row_SVE2 and I422ToAR30Row_SME to match the existing kernels.

Reduction in time taken observed for the new SVE2 implementation, compared to the existing Neon implementation:

    Cortex-A510: -9.1%
    Cortex-A520: +6.8% (!)
    Cortex-A710: -4.0%
    Cortex-A715: -1.1%
    Cortex-A720: -1.1%
    Cortex-X2: -5.7%
    Cortex-X3: -5.9%
    Cortex-X4: -2.8%
    Cortex-X925: -4.0%

Change-Id: Ibf8bfaaeaba51f426649ded621cb0c8948dd9ee1
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6592332
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
2025-12-06 16:56:55 +08:00 · 2025-03-21 16:31:23 +00:00 · 2025-03-21 16:31:23 +00:00 · 7e5863ae5a
commit 7e5863ae5a
parent 3489272e28
5 changed files with 90 additions and 0 deletions
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@ -589,6 +589,7 @@ extern "C" {
 #define HAS_I410TOAR30ROW_SVE2
 #define HAS_I410TOARGBROW_SVE2
 #define HAS_I422ALPHATOARGBROW_SVE2
 #define HAS_I422TOAR30ROW_SVE2
 #define HAS_I422TOARGB1555ROW_SVE2
 #define HAS_I422TOARGB4444ROW_SVE2
 #define HAS_I422TOARGBROW_SVE2
@ -632,6 +633,7 @@ extern "C" {
 #define HAS_I410TOAR30ROW_SME
 #define HAS_I410TOARGBROW_SME
 #define HAS_I422ALPHATOARGBROW_SME
 #define HAS_I422TOAR30ROW_SME
 #define HAS_I422TOARGB1555ROW_SME
 #define HAS_I422TOARGB4444ROW_SME
 #define HAS_I422TOARGBROW_SME
@ -1363,6 +1365,18 @@ void I422ToAR30Row_NEON(const uint8_t* src_y,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width);
 // SVE2 implementation of the I422ToAR30Row row function, declared next to
 // I422ToAR30Row_NEON. The dispatcher in convert_argb.cc selects it when
 // TestCpuFlag(kCpuHasSVE2) is set.
 void I422ToAR30Row_SVE2(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width);
 // SME implementation of the I422ToAR30Row row function. The dispatcher in
 // convert_argb.cc selects it when TestCpuFlag(kCpuHasSME) is set; that check
 // comes after the SVE2 one, so SME wins when both flags are reported.
 void I422ToAR30Row_SME(const uint8_t* src_y,
                       const uint8_t* src_u,
                       const uint8_t* src_v,
                       uint8_t* dst_argb,
                       const struct YuvConstants* yuvconstants,
                       int width);
 void I444AlphaToARGBRow_NEON(const uint8_t* src_y,
                             const uint8_t* src_u,
                             const uint8_t* src_v,
--- a/include/libyuv/row_sve.h
+++ b/include/libyuv/row_sve.h
@ -769,6 +769,53 @@ static inline void I422ToRGBARow_SVE_SC(const uint8_t* src_y,
      : "cc", "memory", YUVTORGB_SVE_REGS);
 }
 // Row kernel shared by the SVE2 and SME entry points for I422ToAR30Row
 // (I422ToAR30Row_SVE2 and I422ToAR30Row_SME both forward here).
 //
 // The per-vector work is done by assembly macros defined outside this
 // function: YUVTORGB_SVE_SETUP once up front, then READYUV422_SVE,
 // I4XXTORGB_SVE and STOREAR30_SVE for every vector.
 // NOTE(review): judging by the macro names this loads 4:2:2 YUV, converts it
 // to RGB and stores AR30 - confirm against the macro definitions.
 //
 // width is consumed in steps of vl, the number of 16-bit lanes in one SVE
 // vector. All vectors except the last run under an all-true predicate; the
 // last one (partial or exactly full) runs under a WHILELT predicate.
 static inline void I422ToAR30Row_SVE_SC(const uint8_t* src_y,
                                        const uint8_t* src_u,
                                        const uint8_t* src_v,
                                        uint8_t* dst_ar30,
                                        const struct YuvConstants* yuvconstants,
                                        int width) STREAMING_COMPATIBLE {
  uint64_t vl;
  // The limit is used for saturating the 2.14 red channel in STOREAR30_SVE.
  const uint16_t limit = 0x3ff0;
  asm volatile(
      // vl = number of 16-bit (halfword) lanes in one SVE vector.
      "cnth     %[vl]                                   \n"
      "ptrue    p0.b                                    \n"  //
      YUVTORGB_SVE_SETUP
      "dup      z19.b, #255                             \n"  // Alpha
      "dup      z23.h, %w[limit]                        \n"
      // Pre-subtract one vector. If no more than vl elements were requested,
      // skip the all-true loop; the tail code at 2: handles them.
      "subs     %w[width], %w[width], %w[vl]            \n"
      "b.le     2f                                      \n"
      // Run bulk of computation with an all-true predicate to avoid predicate
      // generation overhead.
      "ptrue    p1.h                                    \n"
      "1:                                               \n"  //
      READYUV422_SVE I4XXTORGB_SVE STOREAR30_SVE
      "subs     %w[width], %w[width], %w[vl]            \n"
      "b.gt     1b                                      \n"
      // Calculate a predicate for the final iteration to deal with the tail.
      "2:                                               \n"
      // Undo the last subtraction so width holds the number of elements still
      // to process; zero means nothing is left and the tail is skipped.
      "adds    %w[width], %w[width], %w[vl]             \n"
      "b.eq    99f                                      \n"
      "whilelt  p1.h, wzr, %w[width]                    \n"  //
      READYUV422_SVE I4XXTORGB_SVE STOREAR30_SVE
      "99:                                              \n"
      // The pointers and width are read-write operands. vl is early-clobber
      // because it is written (cnth) before the inputs are last read.
      : [src_y] "+r"(src_y),                                // %[src_y]
        [src_u] "+r"(src_u),                                // %[src_u]
        [src_v] "+r"(src_v),                                // %[src_v]
        [dst_ar30] "+r"(dst_ar30),                          // %[dst_ar30]
        [width] "+r"(width),                                // %[width]
        [vl] "=&r"(vl)                                      // %[vl]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),            // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias),  // %[kRGBCoeffBias]
        [limit] "r"(limit)                                  // %[limit]
      : "cc", "memory", YUVTORGB_SVE_REGS);
 }
 static inline void I422AlphaToARGBRow_SVE_SC(
    const uint8_t* src_y,
    const uint8_t* src_u,
--- a/source/convert_argb.cc
+++ b/source/convert_argb.cc
@ -6710,6 +6710,16 @@ int I420ToAR30Matrix(const uint8_t* src_y,
    }
  }
 #endif
 #if defined(HAS_I422TOAR30ROW_SVE2)
  if (TestCpuFlag(kCpuHasSVE2)) {
    I422ToAR30Row = I422ToAR30Row_SVE2;
  }
 #endif
 #if defined(HAS_I422TOAR30ROW_SME)
  if (TestCpuFlag(kCpuHasSME)) {
    I422ToAR30Row = I422ToAR30Row_SME;
  }
 #endif
  for (y = 0; y < height; ++y) {
    I422ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width);
--- a/source/row_sme.cc
+++ b/source/row_sme.cc
@ -158,6 +158,16 @@ __arm_locally_streaming void I422ToRGBARow_SME(
  I422ToRGBARow_SVE_SC(src_y, src_u, src_v, dst_argb, yuvconstants, width);
 }
 // SME entry point for I422ToAR30Row. __arm_locally_streaming (ACLE) makes the
 // function body execute in streaming mode. All work is delegated to the
 // streaming-compatible kernel I422ToAR30Row_SVE_SC; dst_argb is passed as
 // that kernel's dst_ar30 argument.
 __arm_locally_streaming void I422ToAR30Row_SME(
    const uint8_t* src_y,
    const uint8_t* src_u,
    const uint8_t* src_v,
    uint8_t* dst_argb,
    const struct YuvConstants* yuvconstants,
    int width) {
  I422ToAR30Row_SVE_SC(src_y, src_u, src_v, dst_argb, yuvconstants, width);
 }
 __arm_locally_streaming void I422AlphaToARGBRow_SME(
    const uint8_t* src_y,
    const uint8_t* src_u,
--- a/source/row_sve.cc
+++ b/source/row_sve.cc
@ -154,6 +154,15 @@ void I422ToRGBARow_SVE2(const uint8_t* src_y,
  I422ToRGBARow_SVE_SC(src_y, src_u, src_v, dst_argb, yuvconstants, width);
 }
 // SVE2 entry point for I422ToAR30Row. All work is delegated to the shared
 // kernel I422ToAR30Row_SVE_SC; dst_argb is passed as that kernel's dst_ar30
 // argument.
 void I422ToAR30Row_SVE2(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  I422ToAR30Row_SVE_SC(src_y, src_u, src_v, dst_argb, yuvconstants, width);
 }
 void I422AlphaToARGBRow_SVE2(const uint8_t* src_y,
                             const uint8_t* src_u,
                             const uint8_t* src_v,