[AArch64] Add Neon impls for I{210,410}ToARGBRow_NEON

There are existing x86 implementations for these kernels, but not for
AArch64, so add them.

Reduction in runtimes, compared to the existing C code compiled with
LLVM 17:

            | I210ToARGBRow | I410ToARGBRow
 Cortex-A55 |        -55.6% |        -56.2%
Cortex-A510 |        -22.6% |        -35.6%
 Cortex-A76 |        -48.1% |        -57.2%

Co-authored-by: Cosmina Dunca <cosmina.dunca@arm.com>
Bug: libyuv:976
Change-Id: I2ccae1388760a129c73d2e550b32bb0b5af235d6
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5465594
Reviewed-by: Justin Green <greenjustin@google.com>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
George Steed 2024-04-10 16:36:26 +01:00 committed by Frank Barchard
parent 5b4160b9c3
commit 812b4955b2
4 changed files with 122 additions and 0 deletions

View File

@ -557,6 +557,8 @@ extern "C" {
#define HAS_ABGRTOAR30ROW_NEON
#define HAS_I210ALPHATOARGBROW_NEON
#define HAS_I410ALPHATOARGBROW_NEON
#define HAS_I210TOARGBROW_NEON
#define HAS_I410TOARGBROW_NEON
#define HAS_ABGRTOYJROW_NEON_DOTPROD
#define HAS_ABGRTOYROW_NEON_DOTPROD
@ -1075,6 +1077,18 @@ void I444ToRGB24Row_NEON(const uint8_t* src_y,
uint8_t* dst_rgb24,
const struct YuvConstants* yuvconstants,
int width);
// AArch64 Neon row function converting I210 input (Y, U and V rows of 16-bit
// samples) to 8-bit output in rgb_buf, using the coefficients in yuvconstants.
// The kernel consumes 8 pixels per loop iteration; dispatchers select it
// directly only when width is a multiple of 8.
void I210ToARGBRow_NEON(const uint16_t* src_y,
const uint16_t* src_u,
const uint16_t* src_v,
uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width);
// AArch64 Neon row function converting I410 input (Y, U and V rows of 16-bit
// samples) to 8-bit output in rgb_buf, using the coefficients in yuvconstants.
// The kernel consumes 8 pixels per loop iteration; dispatchers select it
// directly only when width is a multiple of 8.
void I410ToARGBRow_NEON(const uint16_t* src_y,
const uint16_t* src_u,
const uint16_t* src_v,
uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width);
void I422ToARGBRow_NEON(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
@ -5074,6 +5088,18 @@ void I422ToARGBRow_Any_NEON(const uint8_t* y_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
// Wrapper around I210ToARGBRow_NEON generated by the ANY31CT macro.
// Dispatchers install it by default when Neon is available and replace it with
// the plain kernel only when width is a multiple of 8.
void I210ToARGBRow_Any_NEON(const uint16_t* y_buf,
const uint16_t* u_buf,
const uint16_t* v_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
// Wrapper around I410ToARGBRow_NEON generated by the ANY31CT macro.
// Dispatchers install it by default when Neon is available and replace it with
// the plain kernel only when width is a multiple of 8.
void I410ToARGBRow_Any_NEON(const uint16_t* y_buf,
const uint16_t* u_buf,
const uint16_t* v_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
void I444AlphaToARGBRow_Any_NEON(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,

View File

@ -1389,6 +1389,14 @@ int I010ToARGBMatrix(const uint16_t* src_y,
}
}
#endif
#if defined(HAS_I210TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I210ToARGBRow = I210ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I210ToARGBRow = I210ToARGBRow_NEON;
}
}
#endif
#if defined(HAS_I210TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
I210ToARGBRow = I210ToARGBRow_Any_AVX2;
@ -1609,6 +1617,14 @@ int I210ToARGBMatrix(const uint16_t* src_y,
}
}
#endif
#if defined(HAS_I210TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I210ToARGBRow = I210ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I210ToARGBRow = I210ToARGBRow_NEON;
}
}
#endif
#if defined(HAS_I210TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
I210ToARGBRow = I210ToARGBRow_Any_AVX2;
@ -1770,6 +1786,14 @@ int I410ToARGBMatrix(const uint16_t* src_y,
}
}
#endif
#if defined(HAS_I410TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I410ToARGBRow = I410ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I410ToARGBRow = I410ToARGBRow_NEON;
}
}
#endif
#if defined(HAS_I410TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
I410ToARGBRow = I410ToARGBRow_Any_AVX2;
@ -6874,6 +6898,14 @@ static int I010ToARGBMatrixBilinear(const uint16_t* src_y,
}
}
#endif
#if defined(HAS_I410TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I410ToARGBRow = I410ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I410ToARGBRow = I410ToARGBRow_NEON;
}
}
#endif
#if defined(HAS_I410TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
I410ToARGBRow = I410ToARGBRow_Any_AVX2;
@ -6979,6 +7011,14 @@ static int I210ToARGBMatrixLinear(const uint16_t* src_y,
}
}
#endif
#if defined(HAS_I410TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I410ToARGBRow = I410ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I410ToARGBRow = I410ToARGBRow_NEON;
}
}
#endif
#if defined(HAS_I410TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
I410ToARGBRow = I410ToARGBRow_Any_AVX2;

View File

@ -516,6 +516,12 @@ ANY31CT(I212ToARGBRow_Any_AVX2, I212ToARGBRow_AVX2, 1, 0, uint16_t, 2, 4, 15)
#ifdef HAS_I212TOAR30ROW_AVX2
ANY31CT(I212ToAR30Row_Any_AVX2, I212ToAR30Row_AVX2, 1, 0, uint16_t, 2, 4, 15)
#endif
// Instantiate the _Any_ wrappers for the Neon I210/I410 to ARGB kernels.
// NOTE(review): the trailing 7 is presumably the remainder mask matching the
// kernels' 8-pixels-per-iteration loop, and the third argument (1 for I210,
// 0 for I410) presumably the horizontal chroma subsampling shift - confirm
// against the ANY31CT definition.
#ifdef HAS_I210TOARGBROW_NEON
ANY31CT(I210ToARGBRow_Any_NEON, I210ToARGBRow_NEON, 1, 0, uint16_t, 2, 4, 7)
#endif
#ifdef HAS_I410TOARGBROW_NEON
ANY31CT(I410ToARGBRow_Any_NEON, I410ToARGBRow_NEON, 0, 0, uint16_t, 2, 4, 7)
#endif
#undef ANY31CT
// Any 3 planes to 1 plane with parameter

View File

@ -233,6 +233,56 @@ void I444ToRGB24Row_NEON(const uint8_t* src_y,
: "cc", "memory", YUVTORGB_REGS);
}
// Convert one row of I210 input (Y, U and V rows of 16-bit samples) to 8-bit
// ARGB using AArch64 Neon.
//
// src_y, src_u, src_v: source rows; passed to the asm as read-write operands
//                      and consumed by the READYUV210 macro (defined
//                      elsewhere).
// dst_argb:            destination row, 4 bytes per pixel; the fourth byte of
//                      every pixel is written as 255.
// yuvconstants:        provides kUVCoeff and kRGBCoeffBias for the conversion.
// width:               pixel count. The loop body executes before the count is
//                      tested and handles 8 pixels per iteration, so callers
//                      are expected to pass a positive multiple of 8 (the
//                      dispatchers route other widths to the _Any_ wrapper).
void I210ToARGBRow_NEON(const uint16_t* src_y,
                        const uint16_t* src_u,
                        const uint16_t* src_v,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile(
      YUVTORGB_SETUP
      // v19 holds the constant fourth channel and is loop-invariant.
      "movi v19.8b, #255 \n"
      "1: \n" READYUV210 NVTORGB RGBTORGB8
      // Decrement before the store; the flags are consumed by b.gt below.
      "subs %w[width], %w[width], #8 \n"
      // Interleave four 8-byte channel vectors into 32 bytes (8 pixels).
      "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),                               // %[src_y]
        [src_u] "+r"(src_u),                               // %[src_u]
        [src_v] "+r"(src_v),                               // %[src_v]
        [dst_argb] "+r"(dst_argb),                         // %[dst_argb]
        [width] "+r"(width)                                // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
      : "cc", "memory", YUVTORGB_REGS, "v19");
}
// Convert one row of I410 input (Y, U and V rows of 16-bit samples) to 8-bit
// ARGB using AArch64 Neon.
//
// src_y, src_u, src_v: source rows; passed to the asm as read-write operands
//                      and consumed by the READYUV410 macro (defined
//                      elsewhere).
// dst_argb:            destination row, 4 bytes per pixel; the fourth byte of
//                      every pixel is written as 255.
// yuvconstants:        provides kUVCoeff and kRGBCoeffBias for the conversion.
// width:               pixel count. The loop body executes before the count is
//                      tested and handles 8 pixels per iteration, so callers
//                      are expected to pass a positive multiple of 8 (the
//                      dispatchers route other widths to the _Any_ wrapper).
void I410ToARGBRow_NEON(const uint16_t* src_y,
                        const uint16_t* src_u,
                        const uint16_t* src_v,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile(
      YUVTORGB_SETUP
      // v19 holds the constant fourth channel and is loop-invariant.
      "movi v19.8b, #255 \n"
      "1: \n" READYUV410 NVTORGB RGBTORGB8
      // Decrement before the store; the flags are consumed by b.gt below.
      "subs %w[width], %w[width], #8 \n"
      // Interleave four 8-byte channel vectors into 32 bytes (8 pixels).
      "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),                               // %[src_y]
        [src_u] "+r"(src_u),                               // %[src_u]
        [src_v] "+r"(src_v),                               // %[src_v]
        [dst_argb] "+r"(dst_argb),                         // %[dst_argb]
        [width] "+r"(width)                                // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
      : "cc", "memory", YUVTORGB_REGS, "v19");
}
void I422ToARGBRow_NEON(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,