[AArch64] Add Neon impls for I212To{ARGB,AR30}Row_NEON

There are existing x86 implementations for these kernels, but not for
AArch64, so add them.

Reduction in runtimes, compared to the existing C code compiled with
LLVM 17:

            | I212ToAR30Row | I212ToARGBRow
 Cortex-A55 |        -40.8% |        -54.4%
Cortex-A510 |        -26.2% |        -22.7%
 Cortex-A76 |        -49.2% |        -44.5%

Co-authored-by: Cosmina Dunca <cosmina.dunca@arm.com>
Bug: libyuv:976
Change-Id: I967951a6b453ac0023a30d96b754c85c2a3bf14a
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5607762
Reviewed-by: Justin Green <greenjustin@google.com>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
George Steed 2024-04-10 16:36:26 +01:00 committed by Frank Barchard
parent fa16ddbb9f
commit bbd9cedc4f
4 changed files with 113 additions and 0 deletions

View File

@ -561,6 +561,8 @@ extern "C" {
#define HAS_I410TOARGBROW_NEON
#define HAS_I210TOAR30ROW_NEON
#define HAS_I410TOAR30ROW_NEON
#define HAS_I212TOARGBROW_NEON
#define HAS_I212TOAR30ROW_NEON
#define HAS_ABGRTOYJROW_NEON_DOTPROD
#define HAS_ABGRTOYROW_NEON_DOTPROD
@ -1122,6 +1124,18 @@ void I410ToAR30Row_NEON(const uint16_t* src_y,
uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width);
// AArch64 Neon row kernels for I212 input. Each takes three 16-bit planes
// (Y, U, V), a set of YUV conversion constants, and a pixel count, and writes
// one converted row to rgb_buf.
// The implementations consume 8 pixels per loop iteration, so callers are
// expected to pass a width that is a multiple of 8; the dispatch code selects
// the *_Any_NEON wrappers for other widths.
// NOTE(review): samples are presumably 12-bit values stored in the low bits of
// each uint16_t - confirm against the format definition.

// Convert one row of I212 to ARGB.
void I212ToARGBRow_NEON(const uint16_t* src_y,
const uint16_t* src_u,
const uint16_t* src_v,
uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width);
// Convert one row of I212 to AR30.
void I212ToAR30Row_NEON(const uint16_t* src_y,
const uint16_t* src_u,
const uint16_t* src_v,
uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width);
void I422ToARGBRow_NEON(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
@ -5183,6 +5197,18 @@ void I410ToAR30Row_Any_NEON(const uint16_t* y_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
// "Any width" wrappers around the I212 Neon row kernels. The definitions are
// produced by the ANY31CT macro rather than written out by hand.
// The dispatch code uses these by default and switches to the plain *_NEON
// kernels only when width is 8-aligned, so these presumably handle the
// leftover pixels of rows whose width is not a multiple of 8 - confirm against
// the ANY31CT definition.

// I212 to ARGB, any width.
void I212ToARGBRow_Any_NEON(const uint16_t* y_buf,
const uint16_t* u_buf,
const uint16_t* v_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
// I212 to AR30, any width.
void I212ToAR30Row_Any_NEON(const uint16_t* y_buf,
const uint16_t* u_buf,
const uint16_t* v_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
void I444AlphaToARGBRow_Any_NEON(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,

View File

@ -1137,6 +1137,14 @@ int I012ToAR30Matrix(const uint16_t* src_y,
I212ToAR30Row = I212ToAR30Row_AVX2;
}
}
#endif
#if defined(HAS_I212TOAR30ROW_NEON)
// When the CPU reports Neon support, replace the row function with the Neon
// implementation. The _Any_ wrapper is the default choice; the plain kernel is
// selected only when IS_ALIGNED(width, 8) holds, matching the 8 pixels the
// kernel consumes per loop iteration.
if (TestCpuFlag(kCpuHasNEON)) {
I212ToAR30Row = I212ToAR30Row_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I212ToAR30Row = I212ToAR30Row_NEON;
}
}
#endif
for (y = 0; y < height; ++y) {
I212ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width);
@ -1592,6 +1600,14 @@ int I012ToARGBMatrix(const uint16_t* src_y,
I212ToARGBRow = I212ToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_I212TOARGBROW_NEON)
// When the CPU reports Neon support, replace the row function with the Neon
// implementation. The _Any_ wrapper is the default choice; the plain kernel is
// selected only when IS_ALIGNED(width, 8) holds, matching the 8 pixels the
// kernel consumes per loop iteration.
if (TestCpuFlag(kCpuHasNEON)) {
I212ToARGBRow = I212ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I212ToARGBRow = I212ToARGBRow_NEON;
}
}
#endif
for (y = 0; y < height; ++y) {
I212ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);

View File

@ -528,6 +528,12 @@ ANY31CT(I210ToAR30Row_Any_NEON, I210ToAR30Row_NEON, 1, 0, uint16_t, 2, 4, 7)
#ifdef HAS_I410TOAR30ROW_NEON
ANY31CT(I410ToAR30Row_Any_NEON, I410ToAR30Row_NEON, 0, 0, uint16_t, 2, 4, 7)
#endif
// Instantiate the any-width wrappers for the I212 Neon kernels. The arguments
// are the same as those used for the I210ToAR30Row_Any_NEON instantiation.
// NOTE(review): the meaning of the numeric arguments is defined by ANY31CT,
// which is not visible here; the trailing 7 is presumably the width mask for
// an 8-pixel kernel - confirm against the macro definition.
#ifdef HAS_I212TOARGBROW_NEON
ANY31CT(I212ToARGBRow_Any_NEON, I212ToARGBRow_NEON, 1, 0, uint16_t, 2, 4, 7)
#endif
#ifdef HAS_I212TOAR30ROW_NEON
ANY31CT(I212ToAR30Row_Any_NEON, I212ToAR30Row_NEON, 1, 0, uint16_t, 2, 4, 7)
#endif
#undef ANY31CT
// Any 3 planes to 1 plane with parameter

View File

@ -52,6 +52,21 @@ extern "C" {
"prfm pldl1keep, [%[src_u], 128] \n" \
"prfm pldl1keep, [%[src_v], 128] \n"
// Read 8 Y, 4 U and 4 V from 212
//
// Assembly fragment intended to be pasted into an asm statement that provides
// the %[src_y], %[src_u] and %[src_v] operands. Each pointer is
// post-incremented: 16 bytes for Y, 8 bytes each for U and V. Prefetch hints
// are issued for all three planes.
//
// Results:
//   v0.8h  - the 8 Y samples, each computed as (y << 4) + (y >> 8). For
//            samples that occupy the low 12 bits this widens the value to 16
//            bits by replicating its top 4 bits into the low 4 bits.
//   v1.16b - low 8 bytes: the 4 U samples, each duplicated to cover two
//            pixels and narrowed to 8 bits with a saturating right shift by 4;
//            high 8 bytes: the 4 V samples processed the same way.
// v2 and v3 are overwritten as scratch registers.
// No comments are placed inside the macro body because every line ends in a
// line continuation.
#define READYUV212 \
"ldr q2, [%[src_y]], #16 \n" \
"ldr d1, [%[src_u]], #8 \n" \
"ldr d3, [%[src_v]], #8 \n" \
"shl v0.8h, v2.8h, #4 \n" \
"usra v0.8h, v2.8h, #8 \n" \
"prfm pldl1keep, [%[src_y], 448] \n" \
"zip1 v2.8h, v3.8h, v3.8h \n" \
"zip1 v3.8h, v1.8h, v1.8h \n" \
"uqshrn v1.8b, v3.8h, #4 \n" \
"uqshrn2 v1.16b, v2.8h, #4 \n" \
"prfm pldl1keep, [%[src_u], 128] \n" \
"prfm pldl1keep, [%[src_v], 128] \n"
// Read 8 Y, 8 U and 8 V from 410
#define READYUV410 \
"ldr q1, [%[src_y]], #16 \n" \
@ -307,6 +322,32 @@ void I410ToAR30Row_NEON(const uint16_t* src_y,
: "cc", "memory", YUVTORGB_REGS, "v22", "v23");
}
// Convert one row of I212 (16-bit Y, U and V planes, one U/V pair per two
// pixels) to AR30 using AArch64 Neon.
//
//   src_y, src_u, src_v - source planes; advanced by the asm as it reads.
//   dst_ar30            - destination row; a read/write asm operand,
//                         presumably advanced by STOREAR30.
//   yuvconstants        - supplies kUVCoeff and kRGBCoeffBias.
//   width               - pixels to convert. The loop consumes 8 per
//                         iteration and tests the counter only after the
//                         first pass, so the body always runs at least once
//                         and width should be a positive multiple of 8.
void I212ToAR30Row_NEON(const uint16_t* src_y,
const uint16_t* src_u,
const uint16_t* src_v,
uint8_t* dst_ar30,
const struct YuvConstants* yuvconstants,
int width) {
const uvec8* uv_coeff = &yuvconstants->kUVCoeff;
const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
// Broadcast into v22 below. NOTE(review): presumably the upper clamp applied
// by STOREAR30 when reducing channels to 10 bits - confirm.
const uint16_t limit = 0x3ff0;
asm(YUVTORGB_SETUP
// v22 = limit in every 16-bit lane; v23 = 0xc000 in every 16-bit lane.
// Both are listed as clobbers and are presumably consumed by STOREAR30.
"dup v22.8h, %w[limit] \n"
"movi v23.8h, #0xc0, lsl #8 \n" // A
// Loop body: load and widen 8 pixels, then convert. NVTORGB is defined
// elsewhere in the file.
"1: \n" READYUV212 NVTORGB
// STOREAR30 sits between the counter update and the branch, so it must
// leave the condition flags untouched for b.gt to see the subs result.
"subs %w[width], %w[width], #8 \n" STOREAR30
"b.gt 1b \n"
: [src_y] "+r"(src_y), // %[src_y]
[src_u] "+r"(src_u), // %[src_u]
[src_v] "+r"(src_v), // %[src_v]
[dst_ar30] "+r"(dst_ar30), // %[dst_ar30]
[width] "+r"(width) // %[width]
: [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff]
[kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias]
[limit] "r"(limit) // %[limit]
: "cc", "memory", YUVTORGB_REGS, "v22", "v23");
}
void I210ToARGBRow_NEON(const uint16_t* src_y,
const uint16_t* src_u,
const uint16_t* src_v,
@ -351,6 +392,30 @@ void I410ToARGBRow_NEON(const uint16_t* src_y,
: "cc", "memory", YUVTORGB_REGS, "v19");
}
// Convert one row of I212 (16-bit Y, U and V planes, one U/V pair per two
// pixels) to 8-bit-per-channel ARGB using AArch64 Neon.
//
//   src_y, src_u, src_v - source planes; advanced by the asm as it reads.
//   dst_argb            - destination row; advanced by 32 bytes (8 pixels of
//                         4 bytes) per loop iteration.
//   yuvconstants        - supplies kUVCoeff and kRGBCoeffBias.
//   width               - pixels to convert. The loop consumes 8 per
//                         iteration and tests the counter only after the
//                         first pass, so the body always runs at least once
//                         and width should be a positive multiple of 8.
void I212ToARGBRow_NEON(const uint16_t* src_y,
const uint16_t* src_u,
const uint16_t* src_v,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
const uvec8* uv_coeff = &yuvconstants->kUVCoeff;
const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
asm(YUVTORGB_SETUP
// v19 holds 255 in every byte and is stored as the fourth byte of each
// pixel, i.e. a constant, fully opaque alpha channel.
"movi v19.8b, #255 \n"
// Loop body: load and widen 8 pixels, convert, then narrow to 8 bits.
// NVTORGB and RGBTORGB8 are defined elsewhere in the file; presumably they
// leave the three colour channels in v16-v18.
"1: \n" READYUV212 NVTORGB RGBTORGB8
"subs %w[width], %w[width], #8 \n"
// Interleaving store: one byte from each of v16..v19 per pixel. The store
// does not alter the flags set by subs above.
"st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
"b.gt 1b \n"
: [src_y] "+r"(src_y), // %[src_y]
[src_u] "+r"(src_u), // %[src_u]
[src_v] "+r"(src_v), // %[src_v]
[dst_argb] "+r"(dst_argb), // %[dst_argb]
[width] "+r"(width) // %[width]
: [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff]
[kRGBCoeffBias] "r"(rgb_coeff) // %[kRGBCoeffBias]
: "cc", "memory", YUVTORGB_REGS, "v19");
}
void I422ToARGBRow_NEON(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,