From a4ccf9940e2389c3c791c71a0aec53099bc0fee0 Mon Sep 17 00:00:00 2001 From: George Steed Date: Thu, 18 Apr 2024 16:18:10 +0100 Subject: [PATCH] [AArch64] Add I8MM implementation of ARGBToUV444Row We cannot use the standard dot-product instructions since the coefficients multiplication results are both added and subtracted, but I8MM supports mixed-sign dot products which work well here. We need to add an additional variant of the coefficient structs since we need negative constants for the elements that were previously subtracted. Reduction in runtimes observed compared to the previous Neon implementation: Cortex-A510: -37.3% Cortex-A520: -31.1% Cortex-A715: -37.1% Cortex-A720: -37.0% Cortex-X2: -62.1% Cortex-X3: -62.2% Cortex-X4: -40.4% Bug: libyuv:977 Change-Id: Idc3d9a6408c30e1bce3816a1ed926ecd76792236 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5712928 Reviewed-by: Frank Barchard Reviewed-by: Justin Green --- include/libyuv/row.h | 9 +++++ source/convert_from_argb.cc | 8 +++++ source/row_any.cc | 3 ++ source/row_neon64.cc | 65 +++++++++++++++++++++++++++++++++---- 4 files changed, 79 insertions(+), 6 deletions(-) diff --git a/include/libyuv/row.h b/include/libyuv/row.h index d080bb2e8..5e0d2ae9f 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -578,6 +578,7 @@ extern "C" { #define HAS_RGBATOYROW_NEON_DOTPROD #define HAS_ARGBCOLORMATRIXROW_NEON_I8MM +#define HAS_ARGBTOUV444ROW_NEON_I8MM #endif // The following are available on AArch64 SVE platforms: @@ -1611,6 +1612,10 @@ void ARGBToUV444Row_NEON(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width); +void ARGBToUV444Row_NEON_I8MM(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void ARGBToUVRow_NEON(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, @@ -2118,6 +2123,10 @@ void ARGBToUV444Row_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); +void ARGBToUV444Row_Any_NEON_I8MM(const uint8_t* 
src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void ARGBToUVRow_Any_NEON(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u, diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc index a766beaa6..a27f9d922 100644 --- a/source/convert_from_argb.cc +++ b/source/convert_from_argb.cc @@ -68,6 +68,14 @@ int ARGBToI444(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOUV444ROW_NEON_I8MM) + if (TestCpuFlag(kCpuHasNeonI8MM)) { + ARGBToUV444Row = ARGBToUV444Row_Any_NEON_I8MM; + if (IS_ALIGNED(width, 8)) { + ARGBToUV444Row = ARGBToUV444Row_NEON_I8MM; + } + } +#endif #if defined(HAS_ARGBTOUV444ROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ARGBToUV444Row = ARGBToUV444Row_Any_MSA; diff --git a/source/row_any.cc b/source/row_any.cc index 50485bcdf..2118ad500 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -2116,6 +2116,9 @@ ANY12(ARGBToUV444Row_Any_NEON, ARGBToUV444Row_NEON, 0, 4, 0, 7) ANY12(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON, 1, 4, 1, 15) ANY12(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON, 1, 4, 1, 15) #endif +#ifdef HAS_ARGBTOUV444ROW_NEON_I8MM +ANY12(ARGBToUV444Row_Any_NEON_I8MM, ARGBToUV444Row_NEON_I8MM, 0, 4, 0, 7) +#endif #ifdef HAS_YUY2TOUV422ROW_MSA ANY12(ARGBToUV444Row_Any_MSA, ARGBToUV444Row_MSA, 0, 4, 0, 15) ANY12(YUY2ToUV422Row_Any_MSA, YUY2ToUV422Row_MSA, 1, 4, 1, 31) diff --git a/source/row_neon64.cc b/source/row_neon64.cc index 062e7fb19..e5eb353b7 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -2711,18 +2711,23 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, ); } -struct RgbUVConstants { +struct RgbUVConstantsU8 { uint8_t kRGBToU[4]; uint8_t kRGBToV[4]; }; +struct RgbUVConstantsI8 { + int8_t kRGBToU[4]; + int8_t kRGBToV[4]; +}; + // 8x1 pixels. 
void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width, - const struct RgbUVConstants* rgbuvconstants) { - asm volatile ( + const struct RgbUVConstantsU8* rgbuvconstants) { + asm volatile( "ldr d0, [%4] \n" // load rgbuvconstants "dup v24.16b, v0.b[0] \n" // UB 0.875 coefficient "dup v25.16b, v0.b[1] \n" // UG -0.5781 coefficient @@ -2758,6 +2763,42 @@ void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb, "v27", "v28", "v29"); } +void ARGBToUV444MatrixRow_NEON_I8MM( + const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width, + const struct RgbUVConstantsI8* rgbuvconstants) { + asm volatile("ld2r {v16.4s, v17.4s}, [%[rgbuvconstants]] \n" // v16 = kRGBToU, v17 = kRGBToV, replicated per lane; volatile: outputs are dead after the asm + "movi v29.16b, #0x80 \n" // 128.5 + "1: \n" + "ldp q0, q1, [%[src]], #32 \n" // load 32 bytes = 8 pixels + "movi v2.4s, #0 \n" // clear U accumulator, pixels 0-3 + "movi v3.4s, #0 \n" // clear U accumulator, pixels 4-7 + "movi v4.4s, #0 \n" // clear V accumulator, pixels 0-3 + "movi v5.4s, #0 \n" // clear V accumulator, pixels 4-7 + "usdot v2.4s, v0.16b, v16.16b \n" // u8 pixel bytes x s8 U coefficients + "usdot v3.4s, v1.16b, v16.16b \n" + "usdot v4.4s, v0.16b, v17.16b \n" // u8 pixel bytes x s8 V coefficients + "usdot v5.4s, v1.16b, v17.16b \n" + "prfm pldl1keep, [%[src], 448] \n" + "subs %w[width], %w[width], #8 \n" // 8 processed per loop. + "uzp1 v0.8h, v2.8h, v3.8h \n" // low 16 bits of each 32-bit U sum + "uzp1 v1.8h, v4.8h, v5.8h \n" // low 16 bits of each 32-bit V sum + "addhn v0.8b, v0.8h, v29.8h \n" // +128 -> unsigned + "addhn v1.8b, v1.8h, v29.8h \n" // +128 -> unsigned + "str d0, [%[dst_u]], #8 \n" // store 8 pixels U. + "str d1, [%[dst_v]], #8 \n" // store 8 pixels V. 
+ "b.gt 1b \n" + : [src] "+r"(src_argb), // %[src] + [dst_u] "+r"(dst_u), // %[dst_u] + [dst_v] "+r"(dst_v), // %[dst_v] + [width] "+r"(width) // %[width] + : [rgbuvconstants] "r"(rgbuvconstants) // %[rgbuvconstants] + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", + "v29"); +} + // RGB to bt601 coefficients // UB 0.875 coefficient = 112 // UG -0.5781 coefficient = 74 @@ -2766,15 +2807,27 @@ void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb, // VG -0.7344 coefficient = 94 // VR 0.875 coefficient = 112 (ignored) -static const struct RgbUVConstants kRgb24I601UVConstants = {{112, 74, 38, 0}, - {18, 94, 112, 0}}; +static const struct RgbUVConstantsU8 kRgb24I601UVConstantsU8 = { + {112, 74, 38, 0}, + {18, 94, 112, 0}}; +static const struct RgbUVConstantsI8 kRgb24I601UVConstantsI8 = { + {112, -74, -38, 0}, + {-18, -94, 112, 0}}; void ARGBToUV444Row_NEON(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width) { ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width, - &kRgb24I601UVConstants); + &kRgb24I601UVConstantsU8); +} + +void ARGBToUV444Row_NEON_I8MM(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + ARGBToUV444MatrixRow_NEON_I8MM(src_argb, dst_u, dst_v, width, + &kRgb24I601UVConstantsI8); } #define RGBTOUV_SETUP_REG \