[AArch64] Add Neon impls for I{210,410}ToAR30Row_NEON

There are existing x86 implementations for these kernels, but not for
AArch64, so add them.

Reduction in runtimes, compared to the existing C code compiled with
LLVM 17:

 I210ToAR30Row on Cortex-A55: -43.8%
I210ToAR30Row on Cortex-A510: -27.0%
 I210ToAR30Row on Cortex-A76: -50.4%
 I410ToAR30Row on Cortex-A55: -44.3%
I410ToAR30Row on Cortex-A510: -17.5%
 I410ToAR30Row on Cortex-A76: -57.2%

Co-authored-by: Cosmina Dunca <cosmina.dunca@arm.com>
Bug: libyuv:976
Change-Id: Ib5fb9b2ce6ef06ec76ecd8473be5fe76d2622fbc
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5593931
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
George Steed 2024-04-10 16:36:26 +01:00 committed by Frank Barchard
parent cc823114a1
commit 6c70eb2819
4 changed files with 146 additions and 0 deletions

View File

@ -559,6 +559,8 @@ extern "C" {
#define HAS_I410ALPHATOARGBROW_NEON #define HAS_I410ALPHATOARGBROW_NEON
#define HAS_I210TOARGBROW_NEON #define HAS_I210TOARGBROW_NEON
#define HAS_I410TOARGBROW_NEON #define HAS_I410TOARGBROW_NEON
// Neon AR30 row kernels (I210ToAR30Row_NEON / I410ToAR30Row_NEON). These flags
// guard both the dispatch code and the ANY31CT any-width wrappers.
#define HAS_I210TOAR30ROW_NEON
#define HAS_I410TOAR30ROW_NEON
#define HAS_ABGRTOYJROW_NEON_DOTPROD #define HAS_ABGRTOYJROW_NEON_DOTPROD
#define HAS_ABGRTOYROW_NEON_DOTPROD #define HAS_ABGRTOYROW_NEON_DOTPROD
@ -1090,6 +1092,18 @@ void I410ToARGBRow_NEON(const uint16_t* src_y,
uint8_t* rgb_buf, uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants, const struct YuvConstants* yuvconstants,
int width); int width);
// AArch64 Neon row kernel: converts I210 planes (src_y, src_u, src_v) to AR30,
// written to rgb_buf, using the coefficients in yuvconstants.
// The kernel steps 8 pixels at a time; the dispatch code selects it directly
// only when width is a multiple of 8.
void I210ToAR30Row_NEON(const uint16_t* src_y,
const uint16_t* src_u,
const uint16_t* src_v,
uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width);
// AArch64 Neon row kernel: converts I410 planes (src_y, src_u, src_v) to AR30,
// written to rgb_buf, using the coefficients in yuvconstants.
// The kernel steps 8 pixels at a time; the dispatch code selects it directly
// only when width is a multiple of 8.
void I410ToAR30Row_NEON(const uint16_t* src_y,
const uint16_t* src_u,
const uint16_t* src_v,
uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width);
void I422ToARGBRow_NEON(const uint8_t* src_y, void I422ToARGBRow_NEON(const uint8_t* src_y,
const uint8_t* src_u, const uint8_t* src_u,
const uint8_t* src_v, const uint8_t* src_v,
@ -5104,6 +5118,18 @@ void I410ToARGBRow_Any_NEON(const uint16_t* y_buf,
uint8_t* dst_ptr, uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants, const struct YuvConstants* yuvconstants,
int width); int width);
// Any-width wrapper around I210ToAR30Row_NEON, instantiated via ANY31CT.
// The dispatch code uses it when Neon is available but width is not a
// multiple of 8.
void I210ToAR30Row_Any_NEON(const uint16_t* y_buf,
const uint16_t* u_buf,
const uint16_t* v_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
// Any-width wrapper around I410ToAR30Row_NEON, instantiated via ANY31CT.
// The dispatch code uses it when Neon is available but width is not a
// multiple of 8.
void I410ToAR30Row_Any_NEON(const uint16_t* y_buf,
const uint16_t* u_buf,
const uint16_t* v_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
void I444AlphaToARGBRow_Any_NEON(const uint8_t* y_buf, void I444AlphaToARGBRow_Any_NEON(const uint8_t* y_buf,
const uint8_t* u_buf, const uint8_t* u_buf,
const uint8_t* v_buf, const uint8_t* v_buf,

View File

@ -954,6 +954,14 @@ int I010ToAR30Matrix(const uint16_t* src_y,
dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
dst_stride_ar30 = -dst_stride_ar30; dst_stride_ar30 = -dst_stride_ar30;
} }
#if defined(HAS_I210TOAR30ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I210ToAR30Row = I210ToAR30Row_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I210ToAR30Row = I210ToAR30Row_NEON;
}
}
#endif
#if defined(HAS_I210TOAR30ROW_SSSE3) #if defined(HAS_I210TOAR30ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) { if (TestCpuFlag(kCpuHasSSSE3)) {
I210ToAR30Row = I210ToAR30Row_Any_SSSE3; I210ToAR30Row = I210ToAR30Row_Any_SSSE3;
@ -1172,6 +1180,14 @@ int I210ToAR30Matrix(const uint16_t* src_y,
dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
dst_stride_ar30 = -dst_stride_ar30; dst_stride_ar30 = -dst_stride_ar30;
} }
#if defined(HAS_I210TOAR30ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I210ToAR30Row = I210ToAR30Row_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I210ToAR30Row = I210ToAR30Row_NEON;
}
}
#endif
#if defined(HAS_I210TOAR30ROW_SSSE3) #if defined(HAS_I210TOAR30ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) { if (TestCpuFlag(kCpuHasSSSE3)) {
I210ToAR30Row = I210ToAR30Row_Any_SSSE3; I210ToAR30Row = I210ToAR30Row_Any_SSSE3;
@ -1327,6 +1343,14 @@ int I410ToAR30Matrix(const uint16_t* src_y,
dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
dst_stride_ar30 = -dst_stride_ar30; dst_stride_ar30 = -dst_stride_ar30;
} }
#if defined(HAS_I410TOAR30ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I410ToAR30Row = I410ToAR30Row_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I410ToAR30Row = I410ToAR30Row_NEON;
}
}
#endif
#if defined(HAS_I410TOAR30ROW_SSSE3) #if defined(HAS_I410TOAR30ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) { if (TestCpuFlag(kCpuHasSSSE3)) {
I410ToAR30Row = I410ToAR30Row_Any_SSSE3; I410ToAR30Row = I410ToAR30Row_Any_SSSE3;
@ -6699,6 +6723,14 @@ static int I010ToAR30MatrixBilinear(const uint16_t* src_y,
dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
dst_stride_ar30 = -dst_stride_ar30; dst_stride_ar30 = -dst_stride_ar30;
} }
#if defined(HAS_I410TOAR30ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I410ToAR30Row = I410ToAR30Row_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I410ToAR30Row = I410ToAR30Row_NEON;
}
}
#endif
#if defined(HAS_I410TOAR30ROW_SSSE3) #if defined(HAS_I410TOAR30ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) { if (TestCpuFlag(kCpuHasSSSE3)) {
I410ToAR30Row = I410ToAR30Row_Any_SSSE3; I410ToAR30Row = I410ToAR30Row_Any_SSSE3;
@ -6805,6 +6837,14 @@ static int I210ToAR30MatrixLinear(const uint16_t* src_y,
dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
dst_stride_ar30 = -dst_stride_ar30; dst_stride_ar30 = -dst_stride_ar30;
} }
#if defined(HAS_I410TOAR30ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I410ToAR30Row = I410ToAR30Row_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I410ToAR30Row = I410ToAR30Row_NEON;
}
}
#endif
#if defined(HAS_I410TOAR30ROW_SSSE3) #if defined(HAS_I410TOAR30ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) { if (TestCpuFlag(kCpuHasSSSE3)) {
I410ToAR30Row = I410ToAR30Row_Any_SSSE3; I410ToAR30Row = I410ToAR30Row_Any_SSSE3;

View File

@ -522,6 +522,12 @@ ANY31CT(I210ToARGBRow_Any_NEON, I210ToARGBRow_NEON, 1, 0, uint16_t, 2, 4, 7)
#ifdef HAS_I410TOARGBROW_NEON #ifdef HAS_I410TOARGBROW_NEON
ANY31CT(I410ToARGBRow_Any_NEON, I410ToARGBRow_NEON, 0, 0, uint16_t, 2, 4, 7) ANY31CT(I410ToARGBRow_Any_NEON, I410ToARGBRow_NEON, 0, 0, uint16_t, 2, 4, 7)
#endif #endif
// Instantiate the any-width wrappers for the Neon AR30 row kernels.
// NOTE(review): ANY31CT is defined outside this view. The arguments mirror the
// I210/I410ToARGB instantiations above; presumably the trailing 7 is the width
// mask matching the kernels' 8-pixel step and the first numeric argument
// distinguishes the I210 and I410 chroma layouts - confirm against the macro.
#ifdef HAS_I210TOAR30ROW_NEON
ANY31CT(I210ToAR30Row_Any_NEON, I210ToAR30Row_NEON, 1, 0, uint16_t, 2, 4, 7)
#endif
#ifdef HAS_I410TOAR30ROW_NEON
ANY31CT(I410ToAR30Row_Any_NEON, I410ToAR30Row_NEON, 0, 0, uint16_t, 2, 4, 7)
#endif
#undef ANY31CT #undef ANY31CT
// Any 3 planes to 1 plane with parameter // Any 3 planes to 1 plane with parameter

View File

@ -182,6 +182,24 @@ static const uvec8 kNV21InterleavedTable = {1, 1, 5, 5, 9, 9, 13, 13,
"uqshl v16.8h, v16.8h, #2 \n" \ "uqshl v16.8h, v16.8h, #2 \n" \
"uqshl v18.8h, v18.8h, #2 \n" "uqshl v18.8h, v18.8h, #2 \n"
// Store 2.14 fixed point RGB as AR30 elements.
//
// Packs eight pixels held in v16 (blue), v17 (green) and v18 (red) into two
// halfword vectors and stores them interleaved:
//  - blue and green are moved to the top of each halfword with a saturating
//    shift (uqshl #2), so values with either of the top two bits set clamp to
//    the maximum;
//  - red is clamped with umin against v22, then the alpha bits from v23 are
//    OR'ed into the top two bits;
//  - sri (shift right and insert) merges the fields so that v4 holds the low
//    6 bits of green above the 10 bits of blue, and v5 holds alpha, the 10
//    bits of red and the top 4 bits of green;
//  - st2 writes the v4/v5 halfword pairs interleaved (the v4 element first),
//    32 bytes in total, and post-increments %[dst_ar30].
//
// The caller must have loaded v22 (limit) and v23 (alpha) beforehand.
// Overwrites v0, v1, v2, v4 and v5.
#define STOREAR30 \
/* Inputs: \
* v16.8h: xxbbbbbbbbbbxxxx \
* v17.8h: xxggggggggggxxxx \
* v18.8h: xxrrrrrrrrrrxxxx \
* v22.8h: 0011111111110000 (umin limit) \
* v23.8h: 1100000000000000 (alpha) \
*/ \
"uqshl v0.8h, v16.8h, #2 \n" /* bbbbbbbbbbxxxxxx */ \
"uqshl v1.8h, v17.8h, #2 \n" /* ggggggggggxxxxxx */ \
"umin v2.8h, v18.8h, v22.8h \n" /* 00rrrrrrrrrrxxxx */ \
"shl v4.8h, v1.8h, #4 \n" /* ggggggxxxxxx0000 */ \
"orr v5.16b, v2.16b, v23.16b \n" /* 11rrrrrrrrrrxxxx */ \
"sri v4.8h, v0.8h, #6 \n" /* ggggggbbbbbbbbbb */ \
"sri v5.8h, v1.8h, #12 \n" /* 11rrrrrrrrrrgggg */ \
"st2 {v4.8h, v5.8h}, [%[dst_ar30]], #32 \n"
#define YUVTORGB_REGS \ #define YUVTORGB_REGS \
"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v24", \ "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v24", \
"v25", "v26", "v27", "v28", "v29", "v30", "v31" "v25", "v26", "v27", "v28", "v29", "v30", "v31"
@ -233,6 +251,62 @@ void I444ToRGB24Row_NEON(const uint8_t* src_y,
: "cc", "memory", YUVTORGB_REGS); : "cc", "memory", YUVTORGB_REGS);
} }
void I210ToAR30Row_NEON(const uint16_t* src_y,
const uint16_t* src_u,
const uint16_t* src_v,
uint8_t* dst_ar30,
const struct YuvConstants* yuvconstants,
int width) {
const uvec8* uv_coeff = &yuvconstants->kUVCoeff;
const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
uint16_t limit = 0x3ff0;
uint16_t alpha = 0xc000;
asm(YUVTORGB_SETUP
"dup v22.8h, %w[limit] \n"
"dup v23.8h, %w[alpha] \n"
"1: \n" READYUV210 NVTORGB
"subs %w[width], %w[width], #8 \n" STOREAR30
"b.gt 1b \n"
: [src_y] "+r"(src_y), // %[src_y]
[src_u] "+r"(src_u), // %[src_u]
[src_v] "+r"(src_v), // %[src_v]
[dst_ar30] "+r"(dst_ar30), // %[dst_ar30]
[width] "+r"(width) // %[width]
: [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff]
[kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias]
[limit] "r"(limit), // %[limit]
[alpha] "r"(alpha) // %[alpha]
: "cc", "memory", YUVTORGB_REGS, "v22", "v23");
}
void I410ToAR30Row_NEON(const uint16_t* src_y,
const uint16_t* src_u,
const uint16_t* src_v,
uint8_t* dst_ar30,
const struct YuvConstants* yuvconstants,
int width) {
const uvec8* uv_coeff = &yuvconstants->kUVCoeff;
const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
uint16_t limit = 0x3ff0;
uint16_t alpha = 0xc000;
asm(YUVTORGB_SETUP
"dup v22.8h, %w[limit] \n"
"dup v23.8h, %w[alpha] \n"
"1: \n" READYUV410 NVTORGB
"subs %w[width], %w[width], #8 \n" STOREAR30
"b.gt 1b \n"
: [src_y] "+r"(src_y), // %[src_y]
[src_u] "+r"(src_u), // %[src_u]
[src_v] "+r"(src_v), // %[src_v]
[dst_ar30] "+r"(dst_ar30), // %[dst_ar30]
[width] "+r"(width) // %[width]
: [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff]
[kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias]
[limit] "r"(limit), // %[limit]
[alpha] "r"(alpha) // %[alpha]
: "cc", "memory", YUVTORGB_REGS, "v22", "v23");
}
void I210ToARGBRow_NEON(const uint16_t* src_y, void I210ToARGBRow_NEON(const uint16_t* src_y,
const uint16_t* src_u, const uint16_t* src_u,
const uint16_t* src_v, const uint16_t* src_v,