From bbd9cedc4f0a474a2639fdcb8fef1c2c11c00396 Mon Sep 17 00:00:00 2001 From: George Steed Date: Wed, 10 Apr 2024 16:36:26 +0100 Subject: [PATCH] [AArch64] Add Neon impls for I212To{ARGB,AR30}Row_NEON There are existing x86 implementations for these kernels, but not for AArch64, so add them. Reduction in runtimes, compared to the existing C code compiled with LLVM 17: | I212ToAR30Row | I212ToARGBRow Cortex-A55 | -40.8% | -54.4% Cortex-A510 | -26.2% | -22.7% Cortex-A76 | -49.2% | -44.5% Co-authored-by: Cosmina Dunca Bug: libyuv:976 Change-Id: I967951a6b453ac0023a30d96b754c85c2a3bf14a Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5607762 Reviewed-by: Justin Green Reviewed-by: Frank Barchard --- include/libyuv/row.h | 26 +++++++++++++++++ source/convert_argb.cc | 16 +++++++++++ source/row_any.cc | 6 ++++ source/row_neon64.cc | 65 ++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 113 insertions(+) diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 33a304e53..0340db6bf 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -561,6 +561,8 @@ extern "C" { #define HAS_I410TOARGBROW_NEON #define HAS_I210TOAR30ROW_NEON #define HAS_I410TOAR30ROW_NEON +#define HAS_I212TOARGBROW_NEON +#define HAS_I212TOAR30ROW_NEON #define HAS_ABGRTOYJROW_NEON_DOTPROD #define HAS_ABGRTOYROW_NEON_DOTPROD @@ -1122,6 +1124,18 @@ void I410ToAR30Row_NEON(const uint16_t* src_y, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width); +void I212ToARGBRow_NEON(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width); +void I212ToAR30Row_NEON(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width); void I422ToARGBRow_NEON(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -5183,6 +5197,18 @@ void I410ToAR30Row_Any_NEON(const uint16_t* 
y_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); +void I212ToARGBRow_Any_NEON(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I212ToAR30Row_Any_NEON(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); void I444AlphaToARGBRow_Any_NEON(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, diff --git a/source/convert_argb.cc b/source/convert_argb.cc index 0670f2d67..5c844fde5 100644 --- a/source/convert_argb.cc +++ b/source/convert_argb.cc @@ -1137,6 +1137,14 @@ int I012ToAR30Matrix(const uint16_t* src_y, I212ToAR30Row = I212ToAR30Row_AVX2; } } +#endif +#if defined(HAS_I212TOAR30ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I212ToAR30Row = I212ToAR30Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I212ToAR30Row = I212ToAR30Row_NEON; + } + } #endif for (y = 0; y < height; ++y) { I212ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width); @@ -1592,6 +1600,14 @@ int I012ToARGBMatrix(const uint16_t* src_y, I212ToARGBRow = I212ToARGBRow_AVX2; } } +#endif +#if defined(HAS_I212TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I212ToARGBRow = I212ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I212ToARGBRow = I212ToARGBRow_NEON; + } + } #endif for (y = 0; y < height; ++y) { I212ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); diff --git a/source/row_any.cc b/source/row_any.cc index abd731fb2..46e11a556 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -528,6 +528,12 @@ ANY31CT(I210ToAR30Row_Any_NEON, I210ToAR30Row_NEON, 1, 0, uint16_t, 2, 4, 7) #ifdef HAS_I410TOAR30ROW_NEON ANY31CT(I410ToAR30Row_Any_NEON, I410ToAR30Row_NEON, 0, 0, uint16_t, 2, 4, 7) #endif +#ifdef HAS_I212TOARGBROW_NEON +ANY31CT(I212ToARGBRow_Any_NEON, I212ToARGBRow_NEON, 1, 0, uint16_t, 2, 4, 7) +#endif +#ifdef HAS_I212TOAR30ROW_NEON 
+ANY31CT(I212ToAR30Row_Any_NEON, I212ToAR30Row_NEON, 1, 0, uint16_t, 2, 4, 7) +#endif #undef ANY31CT // Any 3 planes to 1 plane with parameter diff --git a/source/row_neon64.cc b/source/row_neon64.cc index 70b44d226..c17b58660 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -52,6 +52,21 @@ extern "C" { "prfm pldl1keep, [%[src_u], 128] \n" \ "prfm pldl1keep, [%[src_v], 128] \n" +// Read 8 Y, 4 U and 4 V from 212 +#define READYUV212 \ + "ldr q2, [%[src_y]], #16 \n" \ + "ldr d1, [%[src_u]], #8 \n" \ + "ldr d3, [%[src_v]], #8 \n" \ + "shl v0.8h, v2.8h, #4 \n" \ + "usra v0.8h, v2.8h, #8 \n" \ + "prfm pldl1keep, [%[src_y], 448] \n" \ + "zip1 v2.8h, v3.8h, v3.8h \n" \ + "zip1 v3.8h, v1.8h, v1.8h \n" \ + "uqshrn v1.8b, v3.8h, #4 \n" \ + "uqshrn2 v1.16b, v2.8h, #4 \n" \ + "prfm pldl1keep, [%[src_u], 128] \n" \ + "prfm pldl1keep, [%[src_v], 128] \n" + // Read 8 Y, 8 U and 8 V from 410 #define READYUV410 \ "ldr q1, [%[src_y]], #16 \n" \ @@ -307,6 +322,32 @@ void I410ToAR30Row_NEON(const uint16_t* src_y, : "cc", "memory", YUVTORGB_REGS, "v22", "v23"); } +void I212ToAR30Row_NEON(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { + const uvec8* uv_coeff = &yuvconstants->kUVCoeff; + const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias; + const uint16_t limit = 0x3ff0; + asm(YUVTORGB_SETUP + "dup v22.8h, %w[limit] \n" + "movi v23.8h, #0xc0, lsl #8 \n" // A + "1: \n" READYUV212 NVTORGB + "subs %w[width], %w[width], #8 \n" STOREAR30 + "b.gt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [dst_ar30] "+r"(dst_ar30), // %[dst_ar30] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias] + [limit] "r"(limit) // %[limit] + : "cc", "memory", YUVTORGB_REGS, "v22", "v23"); +} + void I210ToARGBRow_NEON(const uint16_t* src_y, const 
uint16_t* src_u, const uint16_t* src_v, @@ -351,6 +392,30 @@ void I410ToARGBRow_NEON(const uint16_t* src_y, : "cc", "memory", YUVTORGB_REGS, "v19"); } +void I212ToARGBRow_NEON(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + const uvec8* uv_coeff = &yuvconstants->kUVCoeff; + const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias; + asm(YUVTORGB_SETUP + "movi v19.8b, #255 \n" + "1: \n" READYUV212 NVTORGB RGBTORGB8 + "subs %w[width], %w[width], #8 \n" + "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" + "b.gt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(rgb_coeff) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "v19"); +} + void I422ToARGBRow_NEON(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v,