[AArch64] Add Neon impls for I{210,410}AlphaToARGBRow_NEON

There are existing x86 implementations for these kernels, but not for
AArch64, so add them.

Reduction in runtimes, compared to the existing C code compiled with
LLVM 17:

            | I210AlphaToARGBRow | I410AlphaToARGBRow
 Cortex-A55 |             -55.3% |             -56.1%
Cortex-A510 |             -27.9% |             -42.6%
 Cortex-A76 |             -54.9% |             -60.3%

Co-authored-by: Cosmina Dunca <cosmina.dunca@arm.com>
Bug: libyuv:976
Change-Id: Ieb7ad945abda72babd0cfe1020738d31e3562705
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5465593
Reviewed-by: Justin Green <greenjustin@google.com>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
George Steed 2024-04-10 16:36:26 +01:00 committed by Frank Barchard
parent e348995a92
commit 5b4160b9c3
4 changed files with 175 additions and 0 deletions

View File

@ -555,6 +555,8 @@ extern "C" {
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#define HAS_ARGBTOAR30ROW_NEON #define HAS_ARGBTOAR30ROW_NEON
#define HAS_ABGRTOAR30ROW_NEON #define HAS_ABGRTOAR30ROW_NEON
// Enable the Neon I210Alpha/I410Alpha to ARGB row kernels. These sit in the
// section guarded by !LIBYUV_DISABLE_NEON && __aarch64__.
#define HAS_I210ALPHATOARGBROW_NEON
#define HAS_I410ALPHATOARGBROW_NEON
#define HAS_ABGRTOYJROW_NEON_DOTPROD #define HAS_ABGRTOYJROW_NEON_DOTPROD
#define HAS_ABGRTOYROW_NEON_DOTPROD #define HAS_ABGRTOYROW_NEON_DOTPROD
@ -1041,6 +1043,20 @@ struct YuvConstants {
IACA_UD_BYTES \ IACA_UD_BYTES \
} }
// Convert one row of I210 plus alpha (16-bit Y, U, V and A planes) to 8-bit
// ARGB using AArch64 Neon. The kernel reads 8 Y, 4 U, 4 V and 8 A samples and
// writes 32 output bytes per loop iteration; the dispatchers select it
// directly only when width is a multiple of 8 and otherwise use
// I210AlphaToARGBRow_Any_NEON.
void I210AlphaToARGBRow_NEON(const uint16_t* src_y,
const uint16_t* src_u,
const uint16_t* src_v,
const uint16_t* src_a,
uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width);
// Convert one row of I410 plus alpha (16-bit Y, U, V and A planes) to 8-bit
// ARGB using AArch64 Neon. The kernel reads 8 samples from each plane and
// writes 32 output bytes per loop iteration; the dispatchers select it
// directly only when width is a multiple of 8 and otherwise use
// I410AlphaToARGBRow_Any_NEON.
void I410AlphaToARGBRow_NEON(const uint16_t* src_y,
const uint16_t* src_u,
const uint16_t* src_v,
const uint16_t* src_a,
uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width);
void I444ToARGBRow_NEON(const uint8_t* src_y, void I444ToARGBRow_NEON(const uint8_t* src_y,
const uint8_t* src_u, const uint8_t* src_u,
const uint8_t* src_v, const uint8_t* src_v,
@ -5072,6 +5088,20 @@ void I422AlphaToARGBRow_Any_NEON(const uint8_t* y_buf,
uint8_t* dst_ptr, uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants, const struct YuvConstants* yuvconstants,
int width); int width);
// Any-width entry point for I410AlphaToARGBRow_NEON, generated by the
// ANY41CT macro. The dispatchers install this variant by default and switch
// to the plain _NEON kernel only when width is a multiple of 8.
// NOTE(review): presumably processes the leftover pixels through a temporary
// buffer - confirm against the ANY41CT definition.
void I410AlphaToARGBRow_Any_NEON(const uint16_t* y_buf,
const uint16_t* u_buf,
const uint16_t* v_buf,
const uint16_t* a_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
// Any-width entry point for I210AlphaToARGBRow_NEON, generated by the
// ANY41CT macro. The dispatchers install this variant by default and switch
// to the plain _NEON kernel only when width is a multiple of 8.
// NOTE(review): presumably processes the leftover pixels through a temporary
// buffer - confirm against the ANY41CT definition.
void I210AlphaToARGBRow_Any_NEON(const uint16_t* y_buf,
const uint16_t* u_buf,
const uint16_t* v_buf,
const uint16_t* a_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
void I422ToRGBARow_Any_NEON(const uint8_t* y_buf, void I422ToRGBARow_Any_NEON(const uint8_t* y_buf,
const uint8_t* u_buf, const uint8_t* u_buf,
const uint8_t* v_buf, const uint8_t* v_buf,

View File

@ -2576,6 +2576,14 @@ int I010AlphaToARGBMatrix(const uint16_t* src_y,
dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb; dst_stride_argb = -dst_stride_argb;
} }
#if defined(HAS_I210ALPHATOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I210AlphaToARGBRow = I210AlphaToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I210AlphaToARGBRow = I210AlphaToARGBRow_NEON;
}
}
#endif
#if defined(HAS_I210ALPHATOARGBROW_SSSE3) #if defined(HAS_I210ALPHATOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) { if (TestCpuFlag(kCpuHasSSSE3)) {
I210AlphaToARGBRow = I210AlphaToARGBRow_Any_SSSE3; I210AlphaToARGBRow = I210AlphaToARGBRow_Any_SSSE3;
@ -2682,6 +2690,14 @@ int I210AlphaToARGBMatrix(const uint16_t* src_y,
dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb; dst_stride_argb = -dst_stride_argb;
} }
#if defined(HAS_I210ALPHATOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I210AlphaToARGBRow = I210AlphaToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I210AlphaToARGBRow = I210AlphaToARGBRow_NEON;
}
}
#endif
#if defined(HAS_I210ALPHATOARGBROW_SSSE3) #if defined(HAS_I210ALPHATOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) { if (TestCpuFlag(kCpuHasSSSE3)) {
I210AlphaToARGBRow = I210AlphaToARGBRow_Any_SSSE3; I210AlphaToARGBRow = I210AlphaToARGBRow_Any_SSSE3;
@ -2786,6 +2802,14 @@ int I410AlphaToARGBMatrix(const uint16_t* src_y,
dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb; dst_stride_argb = -dst_stride_argb;
} }
#if defined(HAS_I410ALPHATOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I410AlphaToARGBRow = I410AlphaToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I410AlphaToARGBRow = I410AlphaToARGBRow_NEON;
}
}
#endif
#if defined(HAS_I410ALPHATOARGBROW_SSSE3) #if defined(HAS_I410ALPHATOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) { if (TestCpuFlag(kCpuHasSSSE3)) {
I410AlphaToARGBRow = I410AlphaToARGBRow_Any_SSSE3; I410AlphaToARGBRow = I410AlphaToARGBRow_Any_SSSE3;
@ -7436,6 +7460,14 @@ static int I010AlphaToARGBMatrixBilinear(
dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb; dst_stride_argb = -dst_stride_argb;
} }
#if defined(HAS_I410ALPHATOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I410AlphaToARGBRow = I410AlphaToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I410AlphaToARGBRow = I410AlphaToARGBRow_NEON;
}
}
#endif
#if defined(HAS_I410ALPHATOARGBROW_SSSE3) #if defined(HAS_I410ALPHATOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) { if (TestCpuFlag(kCpuHasSSSE3)) {
I410AlphaToARGBRow = I410AlphaToARGBRow_Any_SSSE3; I410AlphaToARGBRow = I410AlphaToARGBRow_Any_SSSE3;
@ -7604,6 +7636,14 @@ static int I210AlphaToARGBMatrixLinear(const uint16_t* src_y,
dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb; dst_stride_argb = -dst_stride_argb;
} }
#if defined(HAS_I410ALPHATOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I410AlphaToARGBRow = I410AlphaToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I410AlphaToARGBRow = I410AlphaToARGBRow_NEON;
}
}
#endif
#if defined(HAS_I410ALPHATOARGBROW_SSSE3) #if defined(HAS_I410ALPHATOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) { if (TestCpuFlag(kCpuHasSSSE3)) {
I410AlphaToARGBRow = I410AlphaToARGBRow_Any_SSSE3; I410AlphaToARGBRow = I410AlphaToARGBRow_Any_SSSE3;

View File

@ -142,6 +142,27 @@ ANY41C(I422AlphaToARGBRow_Any_LASX, I422AlphaToARGBRow_LASX, 1, 0, 4, 15)
memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, vout, SS(r, DUVSHIFT) * BPP); \ memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, vout, SS(r, DUVSHIFT) * BPP); \
} }
#ifdef HAS_I210ALPHATOARGBROW_NEON
// Any-width wrapper around the I210Alpha -> ARGB Neon kernel.
// NOTE(review): the positional arguments are presumably
// (name, kernel, UVSHIFT, DUVSHIFT, T, SBPP, BPP, MASK): chroma is shifted by
// 1 because READYUV210 reads 4 U/V per 8 Y, samples are 2-byte uint16_t,
// output is 4 bytes per pixel, and MASK 7 matches the kernel's 8-pixel step.
// Confirm against the ANY41CT definition.
// No trailing ';': the macro expands to a complete function definition.
ANY41CT(I210AlphaToARGBRow_Any_NEON,
        I210AlphaToARGBRow_NEON,
        1,
        0,
        uint16_t,
        2,
        4,
        7)
#endif
#ifdef HAS_I410ALPHATOARGBROW_NEON
// Any-width wrapper around the I410Alpha -> ARGB Neon kernel.
// NOTE(review): the positional arguments are presumably
// (name, kernel, UVSHIFT, DUVSHIFT, T, SBPP, BPP, MASK): chroma is not
// shifted because READYUV410 reads 8 U/V per 8 Y, samples are 2-byte
// uint16_t, output is 4 bytes per pixel, and MASK 7 matches the kernel's
// 8-pixel step. Confirm against the ANY41CT definition.
// No trailing ';': the macro expands to a complete function definition.
ANY41CT(I410AlphaToARGBRow_Any_NEON,
        I410AlphaToARGBRow_NEON,
        0,
        0,
        uint16_t,
        2,
        4,
        7)
#endif
#ifdef HAS_I210ALPHATOARGBROW_SSSE3 #ifdef HAS_I210ALPHATOARGBROW_SSSE3
ANY41CT(I210AlphaToARGBRow_Any_SSSE3, ANY41CT(I210AlphaToARGBRow_Any_SSSE3,
I210AlphaToARGBRow_SSSE3, I210AlphaToARGBRow_SSSE3,

View File

@ -37,6 +37,34 @@ extern "C" {
"prfm pldl1keep, [%[src_u], 128] \n" \ "prfm pldl1keep, [%[src_u], 128] \n" \
"prfm pldl1keep, [%[src_v], 128] \n" "prfm pldl1keep, [%[src_v], 128] \n"
// Read 8 Y, 4 U and 4 V from 210
// Loads 8 16-bit Y samples into v2 and 4 16-bit U / V samples into d1 / d3;
// every load post-increments its source pointer. On exit:
//   v0.8h  = (Y << 6) + (Y >> 4) for each of the 8 Y samples (shl + usra)
//   v1.16b = 8 U bytes in the low half and 8 V bytes in the high half; each
//            chroma sample is first duplicated into two adjacent lanes (zip1)
//            and then narrowed with a saturating right shift by 2
//            (uqshrn / uqshrn2).
// Overwrites v0-v3 and issues prefetch hints for the Y, U and V streams.
// NOTE(review): the shift amounts look chosen for 10-bit samples stored in
// 16-bit lanes - confirm against the I210 format definition.
#define READYUV210 \
"ldr q2, [%[src_y]], #16 \n" \
"ldr d1, [%[src_u]], #8 \n" \
"ldr d3, [%[src_v]], #8 \n" \
"shl v0.8h, v2.8h, #6 \n" \
"usra v0.8h, v2.8h, #4 \n" \
"prfm pldl1keep, [%[src_y], 448] \n" \
"zip1 v2.8h, v3.8h, v3.8h \n" \
"zip1 v3.8h, v1.8h, v1.8h \n" \
"uqshrn v1.8b, v3.8h, #2 \n" \
"uqshrn2 v1.16b, v2.8h, #2 \n" \
"prfm pldl1keep, [%[src_u], 128] \n" \
"prfm pldl1keep, [%[src_v], 128] \n"
// Read 8 Y, 8 U and 8 V from 410
// Loads 8 16-bit samples from each of src_y, src_u and src_v into v1, v2 and
// v3; every load post-increments its source pointer. On exit:
//   v0.8h  = (Y << 6) + (Y >> 4) for each of the 8 Y samples (shl + usra)
//   v1.16b = 8 U bytes in the low half and 8 V bytes in the high half, each
//            narrowed with a saturating right shift by 2 (uqshrn / uqshrn2).
// Overwrites v0-v3 and issues prefetch hints for the Y, U and V streams.
// NOTE(review): the shift amounts look chosen for 10-bit samples stored in
// 16-bit lanes - confirm against the I410 format definition.
#define READYUV410 \
"ldr q1, [%[src_y]], #16 \n" \
"ldr q2, [%[src_u]], #16 \n" \
"ldr q3, [%[src_v]], #16 \n" \
"shl v0.8h, v1.8h, #6 \n" \
"usra v0.8h, v1.8h, #4 \n" \
"prfm pldl1keep, [%[src_y], 448] \n" \
"uqshrn v1.8b, v2.8h, #2 \n" \
"uqshrn2 v1.16b, v3.8h, #2 \n" \
"prfm pldl1keep, [%[src_u], 128] \n" \
"prfm pldl1keep, [%[src_v], 128] \n"
// Read 8 Y, 8 U and 8 V from 444 // Read 8 Y, 8 U and 8 V from 444
#define READYUV444 \ #define READYUV444 \
"ldr d0, [%[src_y]], #8 \n" \ "ldr d0, [%[src_y]], #8 \n" \
@ -255,6 +283,62 @@ void I444AlphaToARGBRow_NEON(const uint8_t* src_y,
: "cc", "memory", YUVTORGB_REGS, "v19"); : "cc", "memory", YUVTORGB_REGS, "v19");
} }
// Convert a row of I410 plus alpha to ARGB, 8 pixels per loop iteration.
//
// src_y/src_u/src_v/src_a: 16-bit sample planes; each advances by 8 samples
//   (16 bytes) per iteration.
// dst_argb: output row; 32 bytes (8 pixels x 4 bytes) are stored per
//   iteration.
// yuvconstants: supplies kUVCoeff and kRGBCoeffBias, whose addresses are
//   passed to the asm as the kUVCoeff / kRGBCoeffBias operands.
// width: pixel count. The loop body executes before width is tested and
//   steps by 8, so callers are expected to pass a positive multiple of 8
//   (the dispatchers use the _Any_ variant otherwise).
void I410AlphaToARGBRow_NEON(const uint16_t* src_y,
                             const uint16_t* src_u,
                             const uint16_t* src_v,
                             const uint16_t* src_a,
                             uint8_t* dst_argb,
                             const struct YuvConstants* yuvconstants,
                             int width) {
  asm volatile(
      YUVTORGB_SETUP
      "1: \n"
      // Load 8 16-bit alpha values into v19, then Y/U/V via READYUV410.
      "ld1 {v19.16b}, [%[src_a]], #16 \n" READYUV410
      // Narrow alpha to 8 bits with a saturating right shift by 2, then run
      // the shared YUV -> RGB conversion macros.
      "uqshrn v19.8b, v19.8h, #2 \n" NVTORGB RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      // Interleave v16, v17, v18 and the alpha in v19 into 4-byte pixels.
      "st4 {v16.8b, v17.8b, v18.8b, v19.8b}, [%[dst_argb]], #32 \n"
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),                               // %[src_y]
        [src_u] "+r"(src_u),                               // %[src_u]
        [src_v] "+r"(src_v),                               // %[src_v]
        [src_a] "+r"(src_a),                               // %[src_a]
        [dst_argb] "+r"(dst_argb),                         // %[dst_argb]
        [width] "+r"(width)                                // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
      : "cc", "memory", YUVTORGB_REGS, "v19");
}
// Convert a row of I210 plus alpha to ARGB, 8 pixels per loop iteration.
//
// src_y/src_a: 16-bit sample planes; each advances by 8 samples (16 bytes)
//   per iteration.
// src_u/src_v: 16-bit chroma planes; each advances by 4 samples (8 bytes)
//   per iteration, with READYUV210 duplicating every sample across two
//   pixels.
// dst_argb: output row; 32 bytes (8 pixels x 4 bytes) are stored per
//   iteration.
// yuvconstants: supplies kUVCoeff and kRGBCoeffBias, whose addresses are
//   passed to the asm as the kUVCoeff / kRGBCoeffBias operands.
// width: pixel count. The loop body executes before width is tested and
//   steps by 8, so callers are expected to pass a positive multiple of 8
//   (the dispatchers use the _Any_ variant otherwise).
void I210AlphaToARGBRow_NEON(const uint16_t* src_y,
                             const uint16_t* src_u,
                             const uint16_t* src_v,
                             const uint16_t* src_a,
                             uint8_t* dst_argb,
                             const struct YuvConstants* yuvconstants,
                             int width) {
  asm volatile(
      YUVTORGB_SETUP
      "1: \n"
      // Load 8 16-bit alpha values into v19, then Y/U/V via READYUV210.
      "ld1 {v19.16b}, [%[src_a]], #16 \n" READYUV210
      // Narrow alpha to 8 bits with a saturating right shift by 2, then run
      // the shared YUV -> RGB conversion macros.
      "uqshrn v19.8b, v19.8h, #2 \n" NVTORGB RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      // Interleave v16, v17, v18 and the alpha in v19 into 4-byte pixels.
      "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),                               // %[src_y]
        [src_u] "+r"(src_u),                               // %[src_u]
        [src_v] "+r"(src_v),                               // %[src_v]
        [src_a] "+r"(src_a),                               // %[src_a]
        [dst_argb] "+r"(dst_argb),                         // %[dst_argb]
        [width] "+r"(width)                                // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
      : "cc", "memory", YUVTORGB_REGS, "v19");
}
void I422AlphaToARGBRow_NEON(const uint8_t* src_y, void I422AlphaToARGBRow_NEON(const uint8_t* src_y,
const uint8_t* src_u, const uint8_t* src_u,
const uint8_t* src_v, const uint8_t* src_v,