[AArch64] Add Neon implementations for {ARGB,ABGR}ToAR30Row

There are existing x86 implementations for these kernels but not for
AArch64, so add them.

Reduction in runtimes, compared to the existing C code compiled with
LLVM 17:

            | ABGRToAR30Row | ARGBToAR30Row
 Cortex-A55 |        -55.1% |        -55.1%
Cortex-A510 |        -39.3% |        -40.1%
 Cortex-A76 |        -62.3% |        -63.6%

Co-authored-by: Cosmina Dunca <cosmina.dunca@arm.com>
Bug: libyuv:976
Change-Id: I307f03bddcbe5429c2d3ab2f42aa023a3539ddd0
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5465592
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
George Steed 2024-04-10 16:36:25 +01:00 committed by Frank Barchard
parent 83c48c782a
commit 9fac9a4a82
4 changed files with 86 additions and 0 deletions

View File

@ -553,6 +553,9 @@ extern "C" {
// The following are available on AArch64 platforms:
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#define HAS_ARGBTOAR30ROW_NEON
#define HAS_ABGRTOAR30ROW_NEON
#define HAS_ABGRTOYJROW_NEON_DOTPROD
#define HAS_ABGRTOYROW_NEON_DOTPROD
#define HAS_ARGBTOYJROW_NEON_DOTPROD
@ -5136,6 +5139,14 @@ void UYVYToARGBRow_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
// AArch64 Neon row kernels converting a row of ARGB / ABGR pixels to AR30.
// src: source row, dst: destination row, width: number of pixels to convert.
// The ARGBToAR30() / ABGRToAR30() converters select these directly only when
// width is a multiple of 8.
void ARGBToAR30Row_NEON(const uint8_t* src, uint8_t* dst, int width);
void ABGRToAR30Row_NEON(const uint8_t* src, uint8_t* dst, int width);
// "Any" variants of the kernels above, generated with the ANY11 macro. The
// converters use them when width is not a multiple of 8.
void ABGRToAR30Row_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
void ARGBToAR30Row_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
void I444ToARGBRow_Any_MSA(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,

View File

@ -2268,6 +2268,14 @@ int ABGRToAR30(const uint8_t* src_abgr,
height = 1;
src_stride_abgr = dst_stride_ar30 = 0;
}
#if defined(HAS_ABGRTOAR30ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ABGRToAR30Row = ABGRToAR30Row_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ABGRToAR30Row = ABGRToAR30Row_NEON;
}
}
#endif
#if defined(HAS_ABGRTOAR30ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ABGRToAR30Row = ABGRToAR30Row_Any_SSSE3;
@ -2317,6 +2325,14 @@ int ARGBToAR30(const uint8_t* src_argb,
height = 1;
src_stride_argb = dst_stride_ar30 = 0;
}
#if defined(HAS_ARGBTOAR30ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToAR30Row = ARGBToAR30Row_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToAR30Row = ARGBToAR30Row_NEON;
}
}
#endif
#if defined(HAS_ARGBTOAR30ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToAR30Row = ARGBToAR30Row_Any_SSSE3;

View File

@ -948,6 +948,12 @@ ANY11(ARGBToARGB4444Row_Any_AVX2, ARGBToARGB4444Row_AVX2, 0, 4, 2, 7)
#if defined(HAS_ABGRTOAR30ROW_SSSE3)
ANY11(ABGRToAR30Row_Any_SSSE3, ABGRToAR30Row_SSSE3, 0, 4, 4, 3)
#endif
// Neon AR30 "Any" wrappers around the 8-pixels-per-iteration Neon kernels.
// NOTE(review): ANY11 is defined outside this view. The trailing arguments
// presumably give 4 source bytes per pixel, 4 destination bytes per pixel and
// a width mask of 7 (kernel step of 8 pixels minus one) - confirm against the
// macro definition.
#if defined(HAS_ABGRTOAR30ROW_NEON)
ANY11(ABGRToAR30Row_Any_NEON, ABGRToAR30Row_NEON, 0, 4, 4, 7)
#endif
#if defined(HAS_ARGBTOAR30ROW_NEON)
ANY11(ARGBToAR30Row_Any_NEON, ARGBToAR30Row_NEON, 0, 4, 4, 7)
#endif
#if defined(HAS_ARGBTOAR30ROW_SSSE3)
ANY11(ARGBToAR30Row_Any_SSSE3, ARGBToAR30Row_SSSE3, 0, 4, 4, 3)
#endif

View File

@ -1722,6 +1722,59 @@ void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
);
}
// Per-16-bit-lane shift counts applied with USHL in ABCDToAR30Row_NEON.
// A negative USHL count shifts right, so even lanes are left unchanged and
// odd lanes are shifted right by 6. One {even, odd} pair per 32-bit pixel.
static const int16_t kAR30Row_BoxShifts[] = {
    0, -6,  // pixel 0
    0, -6,  // pixel 1
    0, -6,  // pixel 2
    0, -6,  // pixel 3
};

// TBL byte indices used by ABCDToAR30Row_NEON when the input is ABGR.
// Each 16-byte half covers four 4-byte pixels, and every selected source byte
// is written twice so that it fills both bytes of a 16-bit lane.
static const uint8_t kABGRToAR30Row_BoxIndices[] = {
    // First half (loaded into q4): source bytes 2 and 1 of each pixel.
    2, 2, 1, 1,      // pixel 0
    6, 6, 5, 5,      // pixel 1
    10, 10, 9, 9,    // pixel 2
    14, 14, 13, 13,  // pixel 3
    // Second half (loaded into q5): source bytes 0 and 3 of each pixel.
    0, 0, 3, 3,      // pixel 0
    4, 4, 7, 7,      // pixel 1
    8, 8, 11, 11,    // pixel 2
    12, 12, 15, 15,  // pixel 3
};

// TBL byte indices used by ABCDToAR30Row_NEON when the input is ARGB.
// Same layout as the ABGR table, but source bytes 0 and 2 of each pixel swap
// roles.
static const uint8_t kARGBToAR30Row_BoxIndices[] = {
    // First half (loaded into q4): source bytes 0 and 1 of each pixel.
    0, 0, 1, 1,      // pixel 0
    4, 4, 5, 5,      // pixel 1
    8, 8, 9, 9,      // pixel 2
    12, 12, 13, 13,  // pixel 3
    // Second half (loaded into q5): source bytes 2 and 3 of each pixel.
    2, 2, 3, 3,      // pixel 0
    6, 6, 7, 7,      // pixel 1
    10, 10, 11, 11,  // pixel 2
    14, 14, 15, 15,  // pixel 3
};
// ARGB or ABGR as input, reordering based on TBL indices parameter.
//
// Packs each 4-byte source pixel into one 32-bit output word. With k0 and k1
// the source bytes selected by the first 16 entries of `indices`, and k2 and
// k3 the bytes selected by the last 16 entries:
//   bits  0-9  : k0 widened to 10 bits, (k0 << 2) | (k0 >> 6)
//   bits 10-19 : k1 widened the same way
//   bits 20-29 : k2 widened the same way
//   bits 30-31 : top two bits of k3
// NOTE(review): which of k0..k3 is R, G, B or A depends on the in-memory byte
// order of ARGB/ABGR, which is not visible here - confirm against the format
// definitions.
//
// src_abcd: source pixels, 4 bytes each; 32 bytes are read per loop pass.
// dst_ar30: destination, 4 bytes per pixel; 32 bytes are written per pass.
// width:    pixel count. 8 pixels are handled per pass and the count is only
//           tested after a pass, so one full pass always runs; the callers in
//           this file's dispatch use this kernel directly only when width is
//           a multiple of 8.
// indices:  32 TBL byte indices (see kABGRToAR30Row_BoxIndices and
//           kARGBToAR30Row_BoxIndices).
static void ABCDToAR30Row_NEON(const uint8_t* src_abcd,
uint8_t* dst_ar30,
int width,
const uint8_t* indices) {
asm volatile(
// v2 = 0x000fffff in every 32-bit lane: mask of the low 20 output bits.
"movi v2.4s, #0xf, msl 16 \n" // 0xfffff
// v3 = per-halfword shift counts {0, -6, ...}.
"ldr q3, [%[kAR30Row_BoxShifts]] \n"
// v4 = first 16 indices (k0, k1), v5 = last 16 indices (k2, k3).
"ldp q4, q5, [%[indices]] \n"
"1: \n"
// Load 8 pixels (2 x 16 bytes) and advance src by 32 bytes.
"ldp q0, q20, [%[src]], #32 \n"
// Flags set here are consumed by the b.gt at the end of the loop.
"subs %w[width], %w[width], #8 \n"
// Gather k2/k3 first: v0/v20 are still needed as the table source, so they
// are overwritten in place only by the second pair of TBLs.
"tbl v1.16b, {v0.16b}, v5.16b \n"
"tbl v21.16b, {v20.16b}, v5.16b \n"
"tbl v0.16b, {v0.16b}, v4.16b \n"
"tbl v20.16b, {v20.16b}, v4.16b \n"
// Each halfword now holds a byte duplicated (k * 0x101). Odd halfwords are
// shifted right by 6, leaving a 10-bit value; even halfwords are unchanged.
"ushl v0.8h, v0.8h, v3.8h \n"
"ushl v20.8h, v20.8h, v3.8h \n"
"ushl v1.8h, v1.8h, v3.8h \n"
"ushl v21.8h, v21.8h, v3.8h \n"
// Per 32-bit lane: k0 lands in bits 0-9 and k1 in bits 10-19.
"ushr v0.4s, v0.4s, #6 \n"
"ushr v20.4s, v20.4s, #6 \n"
// Per 32-bit lane: k2 lands in bits 20-29 and the top two bits of k3 in
// bits 30-31; leftover bits below bit 20 are discarded by the BIF below.
"shl v1.4s, v1.4s, #14 \n"
"shl v21.4s, v21.4s, #14 \n"
// Keep the low 20 bits of v0/v20 and take the upper 12 bits from v1/v21.
"bif v0.16b, v1.16b, v2.16b \n"
"bif v20.16b, v21.16b, v2.16b \n"
// Store 8 output pixels and advance dst by 32 bytes.
"stp q0, q20, [%[dst]], #32 \n"
// Loop while the remaining width is greater than zero.
"b.gt 1b \n"
: [src] "+r"(src_abcd), // %[src]
[dst] "+r"(dst_ar30), // %[dst]
[width] "+r"(width) // %[width]
: [kAR30Row_BoxShifts] "r"(kAR30Row_BoxShifts), // %[kAR30Row_BoxShifts]
[indices] "r"(indices) // %[indices]
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v20", "v21");
}
// Converts a row of ABGR pixels to AR30 by running the shared Neon kernel
// with the ABGR byte-selection table.
void ABGRToAR30Row_NEON(const uint8_t* src_abgr, uint8_t* dst_ar30, int width) {
  const uint8_t* const tbl_indices = kABGRToAR30Row_BoxIndices;
  ABCDToAR30Row_NEON(src_abgr, dst_ar30, width, tbl_indices);
}
// Converts a row of ARGB pixels to AR30 by running the shared Neon kernel
// with the ARGB byte-selection table.
void ARGBToAR30Row_NEON(const uint8_t* src_argb, uint8_t* dst_ar30, int width) {
  const uint8_t* const tbl_indices = kARGBToAR30Row_BoxIndices;
  ABCDToAR30Row_NEON(src_argb, dst_ar30, width, tbl_indices);
}
void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
uint8_t* dst_rgb24,
int width) {