[AArch64] Add Neon implementation for I422ToAR30Row_NEON

There is an existing x86 implementation for this kernel, but not for
AArch64, so add one.

Reduction in runtimes, compared to the existing C code compiled with
LLVM 17:

 Cortex-A55: -43.1%
Cortex-A510: -22.3%
 Cortex-A76: -54.8%

Co-authored-by: Cosmina Dunca <cosmina.dunca@arm.com>
Bug: libyuv:976
Change-Id: Ifead36bcb8682a527136223e0dcd210e9abe744a
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5607763
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Justin Green <greenjustin@google.com>
This commit is contained in:
George Steed 2024-04-10 16:36:26 +01:00 committed by Frank Barchard
parent bbd9cedc4f
commit d32436e8f8
4 changed files with 50 additions and 0 deletions

View File

@ -563,6 +563,7 @@ extern "C" {
#define HAS_I410TOAR30ROW_NEON
#define HAS_I212TOARGBROW_NEON
#define HAS_I212TOAR30ROW_NEON
#define HAS_I422TOAR30ROW_NEON
#define HAS_ABGRTOYJROW_NEON_DOTPROD
#define HAS_ABGRTOYROW_NEON_DOTPROD
@ -1148,6 +1149,12 @@ void I422ToARGBRow_SVE2(const uint8_t* src_y,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
// Neon row kernel converting one row of I422 (separate Y, U and V planes) to
// AR30. The destination parameter is named dst_ar30 to match the definition
// and the actual output format.
void I422ToAR30Row_NEON(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_ar30,
const struct YuvConstants* yuvconstants,
int width);
void I444AlphaToARGBRow_NEON(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
@ -5267,6 +5274,12 @@ void I422ToRGB565Row_Any_NEON(const uint8_t* y_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
// Any-width wrapper around the Neon I422 to AR30 row kernel.
// NOTE(review): presumably handles widths that are not a multiple of the
// kernel's 8-pixel step -- confirm against the ANY31C wrapper macro.
void I422ToAR30Row_Any_NEON(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
void NV12ToARGBRow_Any_NEON(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,

View File

@ -6241,6 +6241,14 @@ int I420ToAR30Matrix(const uint8_t* src_y,
}
}
#endif
#if defined(HAS_I422TOAR30ROW_NEON)
// Select the Neon row function when the CPU reports Neon support. The _Any_
// variant is the default choice; the raw kernel is used only when the
// IS_ALIGNED(width, 8) check passes, matching its 8-pixels-per-iteration loop.
if (TestCpuFlag(kCpuHasNEON)) {
I422ToAR30Row = I422ToAR30Row_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I422ToAR30Row = I422ToAR30Row_NEON;
}
}
#endif
for (y = 0; y < height; ++y) {
I422ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width);

View File

@ -420,6 +420,9 @@ ANY31C(I422ToRGB565Row_Any_AVX2, I422ToRGB565Row_AVX2, 1, 0, 2, 15)
#ifdef HAS_I444TORGB24ROW_NEON
ANY31C(I444ToRGB24Row_Any_NEON, I444ToRGB24Row_NEON, 0, 0, 3, 7)
#endif
#ifdef HAS_I422TOAR30ROW_NEON
// Instantiates I422ToAR30Row_Any_NEON on top of I422ToAR30Row_NEON.
// NOTE(review): the numeric arguments presumably mean a chroma shift of 1
// (4:2:2 subsampling), no destination chroma shift, 4 bytes per output pixel
// and a remainder mask of 7 (8-pixel kernel) -- confirm against the ANY31C
// macro definition.
ANY31C(I422ToAR30Row_Any_NEON, I422ToAR30Row_NEON, 1, 0, 4, 7)
#endif
#ifdef HAS_I422TOARGBROW_NEON
ANY31C(I444ToARGBRow_Any_NEON, I444ToARGBRow_NEON, 0, 0, 4, 7)
ANY31C(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, 1, 0, 4, 7)

View File

@ -440,6 +440,32 @@ void I422ToARGBRow_NEON(const uint8_t* src_y,
: "cc", "memory", YUVTORGB_REGS, "v19");
}
// Neon row kernel for I422 to AR30 conversion.
//
// Each loop iteration runs the READYUV422, I4XXTORGB and STOREAR30 macro
// sequences (defined elsewhere in this file) and then decrements width by 8,
// so 8 pixels are consumed per iteration. The loop body executes before the
// count is tested, so callers are expected to pass a positive multiple of 8.
//
// src_y, src_u, src_v: source plane pointers, advanced by the asm.
// dst_ar30:            destination pointer, advanced by the asm.
// yuvconstants:        supplies the kUVCoeff and kRGBCoeffBias tables.
// width:               number of pixels to convert.
//
// NOTE(review): the asm statement is not marked volatile and every output
// operand is dead after the loop; confirm that the "memory" clobber is enough
// to stop the targeted compilers from discarding the statement.
void I422ToAR30Row_NEON(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_ar30,
const struct YuvConstants* yuvconstants,
int width) {
// Addresses of the coefficient tables, passed to the asm as the kUVCoeff and
// kRGBCoeffBias operands.
const uvec8* uv_coeff = &yuvconstants->kUVCoeff;
const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
// Broadcast into v22 below. NOTE(review): presumably the 10-bit maximum
// (0x3ff) shifted left by 4, used by STOREAR30 as a clamp -- confirm.
const uint16_t limit = 0x3ff0;
asm(YUVTORGB_SETUP
// v22 = limit replicated across all eight 16-bit lanes.
"dup v22.8h, %w[limit] \n"
// v23 = 0xc000 in every 16-bit lane.
"movi v23.8h, #0xc0, lsl #8 \n" // A (presumably the alpha bits of AR30)
"1: \n" READYUV422 I4XXTORGB
// Count down 8 pixels; the flags set here are consumed by b.gt below.
"subs %w[width], %w[width], #8 \n" STOREAR30
"b.gt 1b \n"
: [src_y] "+r"(src_y), // %[src_y]
[src_u] "+r"(src_u), // %[src_u]
[src_v] "+r"(src_v), // %[src_v]
[dst_ar30] "+r"(dst_ar30), // %[dst_ar30]
[width] "+r"(width) // %[width]
: [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff]
[kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias]
[limit] "r"(limit) // %[limit]
// v22 and v23 are written directly above, in addition to the registers
// listed by YUVTORGB_REGS.
: "cc", "memory", YUVTORGB_REGS, "v22", "v23");
}
void I444AlphaToARGBRow_NEON(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,