From 5b4160b9c322fda98e2208d80c2ea75dd7e7f25f Mon Sep 17 00:00:00 2001
From: George Steed
Date: Wed, 10 Apr 2024 16:36:26 +0100
Subject: [PATCH] [AArch64] Add Neon impls for I{210,410}AlphaToARGBRow_NEON

There are existing x86 implementations for these kernels, but not for
AArch64, so add them.

Reduction in runtimes, compared to the existing C code compiled with
LLVM 17:

            | I210AlphaToARGBRow | I410AlphaToARGBRow
 Cortex-A55 |             -55.3% |             -56.1%
Cortex-A510 |             -27.9% |             -42.6%
 Cortex-A76 |             -54.9% |             -60.3%

Co-authored-by: Cosmina Dunca
Bug: libyuv:976
Change-Id: Ieb7ad945abda72babd0cfe1020738d31e3562705
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5465593
Reviewed-by: Justin Green
Reviewed-by: Frank Barchard
---
Review notes (free-form text after the "---" separator; not part of the
commit message):
* Line breaks and leading indentation in this file were restored from a
  whitespace-collapsed copy. Spacing inside each line is kept as found and
  may differ from upstream; if context lines fail to match, apply with
  `git apply --ignore-whitespace`.
* I410AlphaToARGBRow_NEON and I210AlphaToARGBRow_NEON now pass their
  uv_coeff / rgb_coeff locals as the kUVCoeff / kRGBCoeffBias asm inputs.
  Previously the locals were declared but unused and the same addresses
  were spelled out a second time in the operand list.
* Both kernels handle 8 pixels per loop iteration (width is decremented by
  8 and one st4 writes 32 bytes). convert_argb.cc selects them directly
  only when IS_ALIGNED(width, 8) and otherwise uses the *_Any_NEON
  wrappers generated in row_any.cc.
* Alpha is loaded as eight 16-bit values and narrowed to bytes with a
  saturating right shift by 2 (uqshrn #2) before being stored as the
  fourth st4 channel.
* READYUV210 loads 4 U and 4 V values and duplicates each one (zip1) so
  they pair with 8 Y values; READYUV410 loads 8 U and 8 V values directly.

 include/libyuv/row.h | 30 +++++++++++++++
 source/convert_argb.cc | 40 ++++++++++++++++++++
 source/row_any.cc | 21 +++++++++++
 source/row_neon64.cc | 84 ++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 175 insertions(+)

diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index b0ee4bf2d..b547e8194 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -555,6 +555,8 @@ extern "C" {
 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
 #define HAS_ARGBTOAR30ROW_NEON
 #define HAS_ABGRTOAR30ROW_NEON
+#define HAS_I210ALPHATOARGBROW_NEON
+#define HAS_I410ALPHATOARGBROW_NEON
 
 #define HAS_ABGRTOYJROW_NEON_DOTPROD
 #define HAS_ABGRTOYROW_NEON_DOTPROD
@@ -1041,6 +1043,20 @@ struct YuvConstants {
     IACA_UD_BYTES \
   }
 
+void I210AlphaToARGBRow_NEON(const uint16_t* src_y,
+                             const uint16_t* src_u,
+                             const uint16_t* src_v,
+                             const uint16_t* src_a,
+                             uint8_t* rgb_buf,
+                             const struct YuvConstants* yuvconstants,
+                             int width);
+void I410AlphaToARGBRow_NEON(const uint16_t* src_y,
+                             const uint16_t* src_u,
+                             const uint16_t* src_v,
+                             const uint16_t* src_a,
+                             uint8_t* rgb_buf,
+                             const struct YuvConstants* yuvconstants,
+                             int width);
 void I444ToARGBRow_NEON(const uint8_t* src_y,
                         const uint8_t* src_u,
                         const uint8_t* src_v,
@@ -5072,6 +5088,20 @@ void I422AlphaToARGBRow_Any_NEON(const uint8_t* y_buf,
                                  uint8_t* dst_ptr,
                                  const struct YuvConstants* yuvconstants,
                                  int width);
+void I410AlphaToARGBRow_Any_NEON(const uint16_t* y_buf,
+                                 const uint16_t* u_buf,
+                                 const uint16_t* v_buf,
+                                 const uint16_t* a_buf,
+                                 uint8_t* dst_ptr,
+                                 const struct YuvConstants* yuvconstants,
+                                 int width);
+void I210AlphaToARGBRow_Any_NEON(const uint16_t* y_buf,
+                                 const uint16_t* u_buf,
+                                 const uint16_t* v_buf,
+                                 const uint16_t* a_buf,
+                                 uint8_t* dst_ptr,
+                                 const struct YuvConstants* yuvconstants,
+                                 int width);
 void I422ToRGBARow_Any_NEON(const uint8_t* y_buf,
                             const uint8_t* u_buf,
                             const uint8_t* v_buf,
diff --git a/source/convert_argb.cc b/source/convert_argb.cc
index 627a62020..c56d08c99 100644
--- a/source/convert_argb.cc
+++ b/source/convert_argb.cc
@@ -2576,6 +2576,14 @@ int I010AlphaToARGBMatrix(const uint16_t* src_y,
     dst_argb = dst_argb + (height - 1) * dst_stride_argb;
     dst_stride_argb = -dst_stride_argb;
   }
+#if defined(HAS_I210ALPHATOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I210AlphaToARGBRow = I210AlphaToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I210AlphaToARGBRow = I210AlphaToARGBRow_NEON;
+    }
+  }
+#endif
 #if defined(HAS_I210ALPHATOARGBROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     I210AlphaToARGBRow = I210AlphaToARGBRow_Any_SSSE3;
@@ -2682,6 +2690,14 @@ int I210AlphaToARGBMatrix(const uint16_t* src_y,
     dst_argb = dst_argb + (height - 1) * dst_stride_argb;
     dst_stride_argb = -dst_stride_argb;
   }
+#if defined(HAS_I210ALPHATOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I210AlphaToARGBRow = I210AlphaToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I210AlphaToARGBRow = I210AlphaToARGBRow_NEON;
+    }
+  }
+#endif
 #if defined(HAS_I210ALPHATOARGBROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     I210AlphaToARGBRow = I210AlphaToARGBRow_Any_SSSE3;
@@ -2786,6 +2802,14 @@ int I410AlphaToARGBMatrix(const uint16_t* src_y,
     dst_argb = dst_argb + (height - 1) * dst_stride_argb;
     dst_stride_argb = -dst_stride_argb;
   }
+#if defined(HAS_I410ALPHATOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I410AlphaToARGBRow = I410AlphaToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I410AlphaToARGBRow = I410AlphaToARGBRow_NEON;
+    }
+  }
+#endif
 #if defined(HAS_I410ALPHATOARGBROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     I410AlphaToARGBRow = I410AlphaToARGBRow_Any_SSSE3;
@@ -7436,6 +7460,14 @@ static int I010AlphaToARGBMatrixBilinear(
     dst_argb = dst_argb + (height - 1) * dst_stride_argb;
     dst_stride_argb = -dst_stride_argb;
   }
+#if defined(HAS_I410ALPHATOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I410AlphaToARGBRow = I410AlphaToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I410AlphaToARGBRow = I410AlphaToARGBRow_NEON;
+    }
+  }
+#endif
 #if defined(HAS_I410ALPHATOARGBROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     I410AlphaToARGBRow = I410AlphaToARGBRow_Any_SSSE3;
@@ -7604,6 +7636,14 @@ static int I210AlphaToARGBMatrixLinear(const uint16_t* src_y,
     dst_argb = dst_argb + (height - 1) * dst_stride_argb;
     dst_stride_argb = -dst_stride_argb;
   }
+#if defined(HAS_I410ALPHATOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I410AlphaToARGBRow = I410AlphaToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I410AlphaToARGBRow = I410AlphaToARGBRow_NEON;
+    }
+  }
+#endif
 #if defined(HAS_I410ALPHATOARGBROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     I410AlphaToARGBRow = I410AlphaToARGBRow_Any_SSSE3;
diff --git a/source/row_any.cc b/source/row_any.cc
index 459dad26a..af48ebbd1 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -142,6 +142,27 @@ ANY41C(I422AlphaToARGBRow_Any_LASX, I422AlphaToARGBRow_LASX, 1, 0, 4, 15)
     memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, vout, SS(r, DUVSHIFT) * BPP); \
   }
 
+#ifdef HAS_I210ALPHATOARGBROW_NEON
+ANY41CT(I210AlphaToARGBRow_Any_NEON,
+        I210AlphaToARGBRow_NEON,
+        1,
+        0,
+        uint16_t,
+        2,
+        4,
+        7);
+#endif
+#ifdef HAS_I410ALPHATOARGBROW_NEON
+ANY41CT(I410AlphaToARGBRow_Any_NEON,
+        I410AlphaToARGBRow_NEON,
+        0,
+        0,
+        uint16_t,
+        2,
+        4,
+        7);
+#endif
+
 #ifdef HAS_I210ALPHATOARGBROW_SSSE3
 ANY41CT(I210AlphaToARGBRow_Any_SSSE3,
         I210AlphaToARGBRow_SSSE3,
diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index 572c082e2..ff2472e1a 100644
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -37,6 +37,34 @@ extern "C" {
   "prfm pldl1keep, [%[src_u], 128] \n" \
   "prfm pldl1keep, [%[src_v], 128] \n"
 
+// Read 8 Y, 4 U and 4 V from 210
+#define READYUV210 \
+  "ldr q2, [%[src_y]], #16 \n" \
+  "ldr d1, [%[src_u]], #8 \n" \
+  "ldr d3, [%[src_v]], #8 \n" \
+  "shl v0.8h, v2.8h, #6 \n" \
+  "usra v0.8h, v2.8h, #4 \n" \
+  "prfm pldl1keep, [%[src_y], 448] \n" \
+  "zip1 v2.8h, v3.8h, v3.8h \n" \
+  "zip1 v3.8h, v1.8h, v1.8h \n" \
+  "uqshrn v1.8b, v3.8h, #2 \n" \
+  "uqshrn2 v1.16b, v2.8h, #2 \n" \
+  "prfm pldl1keep, [%[src_u], 128] \n" \
+  "prfm pldl1keep, [%[src_v], 128] \n"
+
+// Read 8 Y, 8 U and 8 V from 410
+#define READYUV410 \
+  "ldr q1, [%[src_y]], #16 \n" \
+  "ldr q2, [%[src_u]], #16 \n" \
+  "ldr q3, [%[src_v]], #16 \n" \
+  "shl v0.8h, v1.8h, #6 \n" \
+  "usra v0.8h, v1.8h, #4 \n" \
+  "prfm pldl1keep, [%[src_y], 448] \n" \
+  "uqshrn v1.8b, v2.8h, #2 \n" \
+  "uqshrn2 v1.16b, v3.8h, #2 \n" \
+  "prfm pldl1keep, [%[src_u], 128] \n" \
+  "prfm pldl1keep, [%[src_v], 128] \n"
+
 // Read 8 Y, 8 U and 8 V from 444
 #define READYUV444 \
   "ldr d0, [%[src_y]], #8 \n" \
@@ -255,6 +283,62 @@ void I444AlphaToARGBRow_NEON(const uint8_t* src_y,
       : "cc", "memory", YUVTORGB_REGS, "v19");
 }
 
+void I410AlphaToARGBRow_NEON(const uint16_t* src_y,
+                             const uint16_t* src_u,
+                             const uint16_t* src_v,
+                             const uint16_t* src_a,
+                             uint8_t* dst_argb,
+                             const struct YuvConstants* yuvconstants,
+                             int width) {
+  const uvec8* uv_coeff = &yuvconstants->kUVCoeff;
+  const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
+  asm volatile(
+      YUVTORGB_SETUP
+      "1: \n"
+      "ld1 {v19.16b}, [%[src_a]], #16 \n" READYUV410
+      "uqshrn v19.8b, v19.8h, #2 \n" NVTORGB RGBTORGB8
+      "subs %w[width], %w[width], #8 \n"
+      "st4 {v16.8b, v17.8b, v18.8b, v19.8b}, [%[dst_argb]], #32 \n"
+      "b.gt 1b \n"
+      : [src_y] "+r"(src_y), // %[src_y]
+        [src_u] "+r"(src_u), // %[src_u]
+        [src_v] "+r"(src_v), // %[src_v]
+        [src_a] "+r"(src_a), // %[src_a]
+        [dst_argb] "+r"(dst_argb), // %[dst_argb]
+        [width] "+r"(width) // %[width]
+      : [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff]
+        [kRGBCoeffBias] "r"(rgb_coeff) // %[kRGBCoeffBias]
+      : "cc", "memory", YUVTORGB_REGS, "v19");
+}
+
+void I210AlphaToARGBRow_NEON(const uint16_t* src_y,
+                             const uint16_t* src_u,
+                             const uint16_t* src_v,
+                             const uint16_t* src_a,
+                             uint8_t* dst_argb,
+                             const struct YuvConstants* yuvconstants,
+                             int width) {
+  const uvec8* uv_coeff = &yuvconstants->kUVCoeff;
+  const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
+  asm volatile(
+      YUVTORGB_SETUP
+      "1: \n"
+      "ld1 {v19.16b}, [%[src_a]], #16 \n" READYUV210
+      "uqshrn v19.8b, v19.8h, #2 \n" NVTORGB RGBTORGB8
+      "subs %w[width], %w[width], #8 \n"
+      "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
+      "b.gt 1b \n"
+      : [src_y] "+r"(src_y), // %[src_y]
+        [src_u] "+r"(src_u), // %[src_u]
+        [src_v] "+r"(src_v), // %[src_v]
+        [src_a] "+r"(src_a), // %[src_a]
+        [dst_argb] "+r"(dst_argb), // %[dst_argb]
+        [width] "+r"(width) // %[width]
+      : [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff]
+        [kRGBCoeffBias] "r"(rgb_coeff) // %[kRGBCoeffBias]
+      : "cc", "memory", YUVTORGB_REGS, "v19");
+}
+
 void I422AlphaToARGBRow_NEON(const uint8_t* src_y,
                              const uint8_t* src_u,
                              const uint8_t* src_v,