From d32436e8f816198236701b3363bd418c764c13c2 Mon Sep 17 00:00:00 2001
From: George Steed
Date: Wed, 10 Apr 2024 16:36:26 +0100
Subject: [PATCH] [AArch64] Add Neon implementation for I422ToAR30Row_NEON

There is an existing x86 implementation for this kernel, but not for
AArch64, so add one.

Reduction in runtimes, compared to the existing C code compiled with
LLVM 17:

 Cortex-A55: -43.1%
Cortex-A510: -22.3%
 Cortex-A76: -54.8%

Co-authored-by: Cosmina Dunca
Bug: libyuv:976
Change-Id: Ifead36bcb8682a527136223e0dcd210e9abe744a
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5607763
Reviewed-by: Frank Barchard
Reviewed-by: Justin Green
---
 include/libyuv/row.h   | 13 +++++++++++++
 source/convert_argb.cc |  8 ++++++++
 source/row_any.cc      |  3 +++
 source/row_neon64.cc   | 39 +++++++++++++++++++++++++++++++++++++++
 4 files changed, 63 insertions(+)

diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 0340db6bf..e526eac1f 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -563,6 +563,7 @@ extern "C" {
 #define HAS_I410TOAR30ROW_NEON
 #define HAS_I212TOARGBROW_NEON
 #define HAS_I212TOAR30ROW_NEON
+#define HAS_I422TOAR30ROW_NEON
 
 #define HAS_ABGRTOYJROW_NEON_DOTPROD
 #define HAS_ABGRTOYROW_NEON_DOTPROD
@@ -1148,6 +1149,12 @@ void I422ToARGBRow_SVE2(const uint8_t* src_y,
                         uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width);
+void I422ToAR30Row_NEON(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
 void I444AlphaToARGBRow_NEON(const uint8_t* src_y,
                              const uint8_t* src_u,
                              const uint8_t* src_v,
@@ -5267,6 +5274,12 @@ void I422ToRGB565Row_Any_NEON(const uint8_t* y_buf,
                               uint8_t* dst_ptr,
                               const struct YuvConstants* yuvconstants,
                               int width);
+void I422ToAR30Row_Any_NEON(const uint8_t* y_buf,
+                            const uint8_t* u_buf,
+                            const uint8_t* v_buf,
+                            uint8_t* dst_ptr,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
 void NV12ToARGBRow_Any_NEON(const uint8_t* y_buf,
                             const uint8_t* uv_buf,
                             uint8_t* dst_ptr,
diff --git a/source/convert_argb.cc b/source/convert_argb.cc
index 5c844fde5..c70982dc2 100644
--- a/source/convert_argb.cc
+++ b/source/convert_argb.cc
@@ -6241,6 +6241,14 @@ int I420ToAR30Matrix(const uint8_t* src_y,
     }
   }
 #endif
+#if defined(HAS_I422TOAR30ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToAR30Row = I422ToAR30Row_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToAR30Row = I422ToAR30Row_NEON;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     I422ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width);
diff --git a/source/row_any.cc b/source/row_any.cc
index 46e11a556..9a3af5e6b 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -420,6 +420,9 @@ ANY31C(I422ToRGB565Row_Any_AVX2, I422ToRGB565Row_AVX2, 1, 0, 2, 15)
 #ifdef HAS_I444TORGB24ROW_NEON
 ANY31C(I444ToRGB24Row_Any_NEON, I444ToRGB24Row_NEON, 0, 0, 3, 7)
 #endif
+#ifdef HAS_I422TOAR30ROW_NEON
+ANY31C(I422ToAR30Row_Any_NEON, I422ToAR30Row_NEON, 1, 0, 4, 7)
+#endif
 #ifdef HAS_I422TOARGBROW_NEON
 ANY31C(I444ToARGBRow_Any_NEON, I444ToARGBRow_NEON, 0, 0, 4, 7)
 ANY31C(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, 1, 0, 4, 7)
diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index c17b58660..8a4193196 100644
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -440,6 +440,45 @@ void I422ToARGBRow_NEON(const uint8_t* src_y,
       : "cc", "memory", YUVTORGB_REGS, "v19");
 }
 
+// Converts one row from src_y/src_u/src_v to AR30 in dst_ar30 using the
+// coefficients in yuvconstants. width drops by 8 per iteration and the loop
+// runs while it stays positive, so a width that is not a multiple of 8 still
+// executes a whole final iteration; I420ToAR30Matrix only selects this kernel
+// when IS_ALIGNED(width, 8) and otherwise uses I422ToAR30Row_Any_NEON.
+// YUVTORGB_SETUP, READYUV422, I4XXTORGB and STOREAR30 are macros defined
+// outside this hunk.
+void I422ToAR30Row_NEON(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_ar30,
+                        const struct YuvConstants* yuvconstants,
+                        int width) {
+  const uvec8* uv_coeff = &yuvconstants->kUVCoeff;
+  const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
+  // NOTE(review): broadcast into v22 below; presumably the clamp ceiling that
+  // STOREAR30 applies - confirm against the macro.
+  const uint16_t limit = 0x3ff0;
+  // volatile: the only outputs are the "+r" pointer/counter operands, all
+  // dead after this statement, and GCC documents that an asm without volatile
+  // may be discarded when its outputs are unused.
+  asm volatile(
+      YUVTORGB_SETUP
+      "dup v22.8h, %w[limit] \n"
+      "movi v23.8h, #0xc0, lsl #8 \n"  // A: 0xc000 in every 16-bit lane
+      "1: \n" READYUV422 I4XXTORGB
+      "subs %w[width], %w[width], #8 \n" STOREAR30
+      "b.gt 1b \n"
+      : [src_y] "+r"(src_y),                // %[src_y]
+        [src_u] "+r"(src_u),                // %[src_u]
+        [src_v] "+r"(src_v),                // %[src_v]
+        [dst_ar30] "+r"(dst_ar30),          // %[dst_ar30]
+        [width] "+r"(width)                 // %[width]
+      : [kUVCoeff] "r"(uv_coeff),           // %[kUVCoeff]
+        [kRGBCoeffBias] "r"(rgb_coeff),     // %[kRGBCoeffBias]
+        [limit] "r"(limit)                  // %[limit]
+      : "cc", "memory", YUVTORGB_REGS, "v22", "v23");
+}
+
 void I444AlphaToARGBRow_NEON(const uint8_t* src_y,
                              const uint8_t* src_u,
                              const uint8_t* src_v,