From d1ec694ad38f9c5f1cc07e8db7b4157a44a14ae8 Mon Sep 17 00:00:00 2001 From: George Steed Date: Wed, 10 Apr 2024 16:36:26 +0100 Subject: [PATCH] [AArch64] Add P{210,410}To{ARGB,AR30}Row_NEON There are existing x86 implementations for these kernels, but not for AArch64, so add them. Reduction in runtimes, compared to the existing C code compiled with LLVM 17: | Cortex-A55 | Cortex-A510 | Cortex-A76 P210ToARGBRow | -59.8% | -16.8% | -53.2% P210ToAR30Row | -48.1% | -21.8% | -54.0% P410ToARGBRow | -56.5% | -32.2% | -54.1% P410ToAR30Row | -42.4% | -4.5% | -50.4% Co-authored-by: Cosmina Dunca Bug: libyuv:976 Change-Id: I24a5addd2c54c7fdfb9717e2a45ae5acd43d6e96 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5607764 Reviewed-by: Justin Green Reviewed-by: Frank Barchard --- include/libyuv/row.h | 44 ++++++++++++++ source/convert_argb.cc | 64 ++++++++++++++++++++ source/row_any.cc | 12 ++++ source/row_neon64.cc | 129 ++++++++++++++++++++++++++++++++++++++++- 4 files changed, 247 insertions(+), 2 deletions(-) diff --git a/include/libyuv/row.h b/include/libyuv/row.h index e526eac1f..86b024321 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -564,6 +564,10 @@ extern "C" { #define HAS_I212TOARGBROW_NEON #define HAS_I212TOAR30ROW_NEON #define HAS_I422TOAR30ROW_NEON +#define HAS_P210TOAR30ROW_NEON +#define HAS_P210TOARGBROW_NEON +#define HAS_P410TOAR30ROW_NEON +#define HAS_P410TOARGBROW_NEON #define HAS_ABGRTOYJROW_NEON_DOTPROD #define HAS_ABGRTOYROW_NEON_DOTPROD @@ -5325,6 +5329,46 @@ void ABGRToAR30Row_Any_NEON(const uint8_t* src_ptr, void ARGBToAR30Row_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void P210ToARGBRow_NEON(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void P410ToARGBRow_NEON(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void 
P210ToAR30Row_NEON(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width); +void P410ToAR30Row_NEON(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width); +void P210ToARGBRow_Any_NEON(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void P410ToARGBRow_Any_NEON(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void P210ToAR30Row_Any_NEON(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width); +void P410ToAR30Row_Any_NEON(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width); void I444ToARGBRow_Any_MSA(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, diff --git a/source/convert_argb.cc b/source/convert_argb.cc index c70982dc2..11948726a 100644 --- a/source/convert_argb.cc +++ b/source/convert_argb.cc @@ -1891,6 +1891,14 @@ int P010ToARGBMatrix(const uint16_t* src_y, P210ToARGBRow = P210ToARGBRow_AVX2; } } +#endif +#if defined(HAS_P210TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + P210ToARGBRow = P210ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + P210ToARGBRow = P210ToARGBRow_NEON; + } + } #endif for (y = 0; y < height; ++y) { P210ToARGBRow(src_y, src_uv, dst_argb, yuvconstants, width); @@ -1942,6 +1950,14 @@ int P210ToARGBMatrix(const uint16_t* src_y, P210ToARGBRow = P210ToARGBRow_AVX2; } } +#endif +#if defined(HAS_P210TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + P210ToARGBRow = P210ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + P210ToARGBRow = P210ToARGBRow_NEON; + } + } #endif for (y = 0; y < height; ++y) { P210ToARGBRow(src_y, src_uv, dst_argb, yuvconstants, width); @@ -1991,6 +2007,14 @@ 
int P010ToAR30Matrix(const uint16_t* src_y, P210ToAR30Row = P210ToAR30Row_AVX2; } } +#endif +#if defined(HAS_P210TOAR30ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + P210ToAR30Row = P210ToAR30Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + P210ToAR30Row = P210ToAR30Row_NEON; + } + } #endif for (y = 0; y < height; ++y) { P210ToAR30Row(src_y, src_uv, dst_ar30, yuvconstants, width); @@ -2042,6 +2066,14 @@ int P210ToAR30Matrix(const uint16_t* src_y, P210ToAR30Row = P210ToAR30Row_AVX2; } } +#endif +#if defined(HAS_P210TOAR30ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + P210ToAR30Row = P210ToAR30Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + P210ToAR30Row = P210ToAR30Row_NEON; + } + } #endif for (y = 0; y < height; ++y) { P210ToAR30Row(src_y, src_uv, dst_ar30, yuvconstants, width); @@ -7931,6 +7963,14 @@ static int P010ToARGBMatrixBilinear(const uint16_t* src_y, } } #endif +#if defined(HAS_P410TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + P410ToARGBRow = P410ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + P410ToARGBRow = P410ToARGBRow_NEON; + } + } +#endif #ifdef HAS_SCALEUVROWUP2_BILINEAR_16_SSE41 if (TestCpuFlag(kCpuHasSSE41)) { @@ -8024,6 +8064,14 @@ static int P210ToARGBMatrixLinear(const uint16_t* src_y, } } #endif +#if defined(HAS_P410TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + P410ToARGBRow = P410ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + P410ToARGBRow = P410ToARGBRow_NEON; + } + } +#endif #ifdef HAS_SCALEUVROWUP2_LINEAR_16_SSE41 if (TestCpuFlag(kCpuHasSSE41)) { @@ -8103,6 +8151,14 @@ static int P010ToAR30MatrixBilinear(const uint16_t* src_y, } } #endif +#if defined(HAS_P410TOAR30ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + P410ToAR30Row = P410ToAR30Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + P410ToAR30Row = P410ToAR30Row_NEON; + } + } +#endif #ifdef HAS_SCALEUVROWUP2_BILINEAR_16_SSE41 if (TestCpuFlag(kCpuHasSSE41)) { @@ -8196,6 +8252,14 @@ static int P210ToAR30MatrixLinear(const uint16_t* src_y, } } #endif +#if 
defined(HAS_P410TOAR30ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + P410ToAR30Row = P410ToAR30Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + P410ToAR30Row = P410ToAR30Row_NEON; + } + } +#endif #ifdef HAS_SCALEUVROWUP2_LINEAR_16_SSE41 if (TestCpuFlag(kCpuHasSSE41)) { diff --git a/source/row_any.cc b/source/row_any.cc index 9a3af5e6b..1fc4d958b 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -897,6 +897,12 @@ ANY21CT(P210ToARGBRow_Any_AVX2, P210ToARGBRow_AVX2, 1, 0, uint16_t, 2, 4, 15) #ifdef HAS_P210TOAR30ROW_AVX2 ANY21CT(P210ToAR30Row_Any_AVX2, P210ToAR30Row_AVX2, 1, 0, uint16_t, 2, 4, 15) #endif +#ifdef HAS_P210TOAR30ROW_NEON +ANY21CT(P210ToAR30Row_Any_NEON, P210ToAR30Row_NEON, 1, 0, uint16_t, 2, 4, 7) +#endif +#ifdef HAS_P210TOARGBROW_NEON +ANY21CT(P210ToARGBRow_Any_NEON, P210ToARGBRow_NEON, 1, 0, uint16_t, 2, 4, 7) +#endif #ifdef HAS_P410TOAR30ROW_SSSE3 ANY21CT(P410ToAR30Row_Any_SSSE3, P410ToAR30Row_SSSE3, 0, 0, uint16_t, 2, 4, 7) #endif @@ -909,6 +915,12 @@ ANY21CT(P410ToARGBRow_Any_AVX2, P410ToARGBRow_AVX2, 0, 0, uint16_t, 2, 4, 15) #ifdef HAS_P410TOAR30ROW_AVX2 ANY21CT(P410ToAR30Row_Any_AVX2, P410ToAR30Row_AVX2, 0, 0, uint16_t, 2, 4, 15) #endif +#ifdef HAS_P410TOAR30ROW_NEON +ANY21CT(P410ToAR30Row_Any_NEON, P410ToAR30Row_NEON, 0, 0, uint16_t, 2, 4, 7) +#endif +#ifdef HAS_P410TOARGBROW_NEON +ANY21CT(P410ToARGBRow_Any_NEON, P410ToARGBRow_NEON, 0, 0, uint16_t, 2, 4, 7) +#endif #undef ANY21CT diff --git a/source/row_neon64.cc b/source/row_neon64.cc index bbe33842c..ba12ac9c0 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -52,6 +52,13 @@ extern "C" { "prfm pldl1keep, [%[src_u], 128] \n" \ "prfm pldl1keep, [%[src_v], 128] \n" +// Read 8 Y, 4 U and 4 V interleaved from 210 +#define READYUVP210 \ + "ldr q0, [%[src_y]], #16 \n" \ + "ldr q1, [%[src_uv]], #16 \n" \ + "prfm pldl1keep, [%[src_y], 448] \n" \ + "tbl v1.16b, {v1.16b}, v2.16b \n" + // Read 8 Y, 4 U and 4 V from 212 #define READYUV212 \ "ldr q2, [%[src_y]], #16 \n" \ @@ -80,6 +87,13 
@@ extern "C" { "prfm pldl1keep, [%[src_u], 128] \n" \ "prfm pldl1keep, [%[src_v], 128] \n" +// Read 8 Y, 8 U and 8 V interleaved from 410 +#define READYUVP410 \ + "ldr q0, [%[src_y]], #16 \n" \ + "ldp q4, q5, [%[src_uv]], #32 \n" \ + "prfm pldl1keep, [%[src_y], 448] \n" \ + "tbl v1.16b, {v4.16b, v5.16b}, v2.16b \n" + // Read 8 Y, 8 U and 8 V from 444 #define READYUV444 \ "ldr d0, [%[src_y]], #8 \n" \ @@ -208,9 +222,9 @@ static const uvec8 kNV21InterleavedTable = {1, 1, 5, 5, 9, 9, 13, 13, */ \ "uqshl v0.8h, v16.8h, #2 \n" /* bbbbbbbbbbxxxxxx */ \ "uqshl v1.8h, v17.8h, #2 \n" /* ggggggggggxxxxxx */ \ - "umin v2.8h, v18.8h, v22.8h \n" /* 00rrrrrrrrrrxxxx */ \ + "umin v6.8h, v18.8h, v22.8h \n" /* 00rrrrrrrrrrxxxx */ \ "shl v4.8h, v1.8h, #4 \n" /* ggggggxxxxxx0000 */ \ - "orr v5.16b, v2.16b, v23.16b \n" /* 11rrrrrrrrrrxxxx */ \ + "orr v5.16b, v6.16b, v23.16b \n" /* 11rrrrrrrrrrxxxx */ \ "sri v4.8h, v0.8h, #6 \n" /* ggggggbbbbbbbbbb */ \ "sri v5.8h, v1.8h, #12 \n" /* 11rrrrrrrrrrgggg */ \ "st2 {v4.8h, v5.8h}, [%[dst_ar30]], #32 \n" @@ -442,6 +456,117 @@ void I422ToARGBRow_NEON(const uint8_t* src_y, : "cc", "memory", YUVTORGB_REGS, "v19"); } +static const uint8_t kP210LoadShuffleIndices[] = { + 1, 1, 5, 5, 9, 9, 13, 13, 3, 3, 7, 7, 11, 11, 15, 15}; // READYUVP210 tbl indices + +void P210ToARGBRow_NEON(const uint16_t* src_y, + const uint16_t* src_uv, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + const uvec8* uv_coeff = &yuvconstants->kUVCoeff; + const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias; + asm volatile( + YUVTORGB_SETUP + "movi v19.8b, #255 \n" + "ldr q2, [%[kIndices]] \n" + "1: \n" // + READYUVP210 NVTORGB RGBTORGB8 + "subs %w[width], %w[width], #8 \n" + "st4 {v16.8b, v17.8b, v18.8b, v19.8b}, [%[dst_argb]], #32 \n" + "b.gt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_uv] "+r"(src_uv), // %[src_uv] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(rgb_coeff), // 
%[kRGBCoeffBias] + [kIndices] "r"(kP210LoadShuffleIndices) // %[kIndices] + : "cc", "memory", YUVTORGB_REGS, "v19"); +} + +static const uint8_t kP410LoadShuffleIndices[] = { + 1, 5, 9, 13, 17, 21, 25, 29, 3, 7, 11, 15, 19, 23, 27, 31}; // READYUVP410 tbl indices + +void P410ToARGBRow_NEON(const uint16_t* src_y, + const uint16_t* src_uv, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + const uvec8* uv_coeff = &yuvconstants->kUVCoeff; + const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias; + asm volatile( + YUVTORGB_SETUP + "movi v19.8b, #255 \n" + "ldr q2, [%[kIndices]] \n" + "1: \n" // + READYUVP410 NVTORGB RGBTORGB8 + "subs %w[width], %w[width], #8 \n" + "st4 {v16.8b, v17.8b, v18.8b, v19.8b}, [%[dst_argb]], #32 \n" + "b.gt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_uv] "+r"(src_uv), // %[src_uv] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias] + [kIndices] "r"(kP410LoadShuffleIndices) // %[kIndices] + : "cc", "memory", YUVTORGB_REGS, "v19"); +} + +void P210ToAR30Row_NEON(const uint16_t* src_y, + const uint16_t* src_uv, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { + const uvec8* uv_coeff = &yuvconstants->kUVCoeff; + const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias; + const uint16_t limit = 0x3ff0; + asm volatile(YUVTORGB_SETUP + "dup v22.8h, %w[limit] \n" + "movi v23.8h, #0xc0, lsl #8 \n" // A + "ldr q2, [%[kIndices]] \n" + "1: \n" READYUVP210 NVTORGB + "subs %w[width], %w[width], #8 \n" STOREAR30 + "b.gt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_uv] "+r"(src_uv), // %[src_uv] + [dst_ar30] "+r"(dst_ar30), // %[dst_ar30] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias] + [limit] "r"(limit), // %[limit] + [kIndices] "r"(kP210LoadShuffleIndices) // %[kIndices] + : "cc", "memory", YUVTORGB_REGS, "v22", "v23"); +} 
+ +void P410ToAR30Row_NEON(const uint16_t* src_y, + const uint16_t* src_uv, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { + const uvec8* uv_coeff = &yuvconstants->kUVCoeff; + const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias; + // Alpha bits come from the movi into v23; only the clamp limit needs a GPR. + const uint16_t limit = 0x3ff0; + asm volatile(YUVTORGB_SETUP + "dup v22.8h, %w[limit] \n" + "movi v23.8h, #0xc0, lsl #8 \n" // A + "ldr q2, [%[kIndices]] \n" + "1: \n" READYUVP410 NVTORGB + "subs %w[width], %w[width], #8 \n" STOREAR30 + "b.gt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_uv] "+r"(src_uv), // %[src_uv] + [dst_ar30] "+r"(dst_ar30), // %[dst_ar30] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias] + [limit] "r"(limit), // %[limit] + [kIndices] "r"(kP410LoadShuffleIndices) // %[kIndices] + : "cc", "memory", YUVTORGB_REGS, "v22", "v23"); +} + void I422ToAR30Row_NEON(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v,