mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-06 16:56:55 +08:00
[AArch64] Add Neon implementations for {ARGB,ABGR}ToAR30Row
There are existing x86 implementations for these kernels but not for
AArch64, so add them.
Reduction in runtimes, compared to the existing C code compiled with
LLVM 17:
            | ABGRToAR30Row | ARGBToAR30Row
Cortex-A55  |        -55.1% |        -55.1%
Cortex-A510 |        -39.3% |        -40.1%
Cortex-A76  |        -62.3% |        -63.6%
Co-authored-by: Cosmina Dunca <cosmina.dunca@arm.com>
Bug: libyuv:976
Change-Id: I307f03bddcbe5429c2d3ab2f42aa023a3539ddd0
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5465592
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
parent
83c48c782a
commit
9fac9a4a82
@ -553,6 +553,9 @@ extern "C" {
|
|||||||
|
|
||||||
// The following are available on AArch64 platforms:
|
// The following are available on AArch64 platforms:
|
||||||
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
|
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
|
||||||
|
#define HAS_ARGBTOAR30ROW_NEON
|
||||||
|
#define HAS_ABGRTOAR30ROW_NEON
|
||||||
|
|
||||||
#define HAS_ABGRTOYJROW_NEON_DOTPROD
|
#define HAS_ABGRTOYJROW_NEON_DOTPROD
|
||||||
#define HAS_ABGRTOYROW_NEON_DOTPROD
|
#define HAS_ABGRTOYROW_NEON_DOTPROD
|
||||||
#define HAS_ARGBTOYJROW_NEON_DOTPROD
|
#define HAS_ARGBTOYJROW_NEON_DOTPROD
|
||||||
@ -5136,6 +5139,14 @@ void UYVYToARGBRow_Any_NEON(const uint8_t* src_ptr,
|
|||||||
uint8_t* dst_ptr,
|
uint8_t* dst_ptr,
|
||||||
const struct YuvConstants* yuvconstants,
|
const struct YuvConstants* yuvconstants,
|
||||||
int width);
|
int width);
|
||||||
|
// Neon row kernels converting 4-byte ARGB / ABGR pixels to 32-bit AR30 words.
// Their implementation consumes and produces 8 pixels (32 bytes) per loop
// iteration, so the conversion code only selects them directly when width is
// a multiple of 8.
void ARGBToAR30Row_NEON(const uint8_t* src, uint8_t* dst, int width);
|
||||||
|
void ABGRToAR30Row_NEON(const uint8_t* src, uint8_t* dst, int width);
|
||||||
|
// "Any" variants, generated with the ANY11 macro around the kernels above.
// The conversion code selects these when width is not a multiple of 8.
void ABGRToAR30Row_Any_NEON(const uint8_t* src_ptr,
|
||||||
|
uint8_t* dst_ptr,
|
||||||
|
int width);
|
||||||
|
void ARGBToAR30Row_Any_NEON(const uint8_t* src_ptr,
|
||||||
|
uint8_t* dst_ptr,
|
||||||
|
int width);
|
||||||
void I444ToARGBRow_Any_MSA(const uint8_t* y_buf,
|
void I444ToARGBRow_Any_MSA(const uint8_t* y_buf,
|
||||||
const uint8_t* u_buf,
|
const uint8_t* u_buf,
|
||||||
const uint8_t* v_buf,
|
const uint8_t* v_buf,
|
||||||
|
|||||||
@ -2268,6 +2268,14 @@ int ABGRToAR30(const uint8_t* src_abgr,
|
|||||||
height = 1;
|
height = 1;
|
||||||
src_stride_abgr = dst_stride_ar30 = 0;
|
src_stride_abgr = dst_stride_ar30 = 0;
|
||||||
}
|
}
|
||||||
|
#if defined(HAS_ABGRTOAR30ROW_NEON)
|
||||||
|
if (TestCpuFlag(kCpuHasNEON)) {
|
||||||
|
ABGRToAR30Row = ABGRToAR30Row_Any_NEON;
|
||||||
|
if (IS_ALIGNED(width, 8)) {
|
||||||
|
ABGRToAR30Row = ABGRToAR30Row_NEON;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
#if defined(HAS_ABGRTOAR30ROW_SSSE3)
|
#if defined(HAS_ABGRTOAR30ROW_SSSE3)
|
||||||
if (TestCpuFlag(kCpuHasSSSE3)) {
|
if (TestCpuFlag(kCpuHasSSSE3)) {
|
||||||
ABGRToAR30Row = ABGRToAR30Row_Any_SSSE3;
|
ABGRToAR30Row = ABGRToAR30Row_Any_SSSE3;
|
||||||
@ -2317,6 +2325,14 @@ int ARGBToAR30(const uint8_t* src_argb,
|
|||||||
height = 1;
|
height = 1;
|
||||||
src_stride_argb = dst_stride_ar30 = 0;
|
src_stride_argb = dst_stride_ar30 = 0;
|
||||||
}
|
}
|
||||||
|
#if defined(HAS_ARGBTOAR30ROW_NEON)
|
||||||
|
if (TestCpuFlag(kCpuHasNEON)) {
|
||||||
|
ARGBToAR30Row = ARGBToAR30Row_Any_NEON;
|
||||||
|
if (IS_ALIGNED(width, 8)) {
|
||||||
|
ARGBToAR30Row = ARGBToAR30Row_NEON;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
#if defined(HAS_ARGBTOAR30ROW_SSSE3)
|
#if defined(HAS_ARGBTOAR30ROW_SSSE3)
|
||||||
if (TestCpuFlag(kCpuHasSSSE3)) {
|
if (TestCpuFlag(kCpuHasSSSE3)) {
|
||||||
ARGBToAR30Row = ARGBToAR30Row_Any_SSSE3;
|
ARGBToAR30Row = ARGBToAR30Row_Any_SSSE3;
|
||||||
|
|||||||
@ -948,6 +948,12 @@ ANY11(ARGBToARGB4444Row_Any_AVX2, ARGBToARGB4444Row_AVX2, 0, 4, 2, 7)
|
|||||||
#if defined(HAS_ABGRTOAR30ROW_SSSE3)
|
#if defined(HAS_ABGRTOAR30ROW_SSSE3)
|
||||||
ANY11(ABGRToAR30Row_Any_SSSE3, ABGRToAR30Row_SSSE3, 0, 4, 4, 3)
|
ANY11(ABGRToAR30Row_Any_SSSE3, ABGRToAR30Row_SSSE3, 0, 4, 4, 3)
|
||||||
#endif
|
#endif
|
||||||
|
// Any-width wrappers for the Neon AR30 kernels.
// NOTE(review): the trailing arguments (0, 4, 4, 7) presumably mean: no UV
// shift, 4 source bytes per pixel, 4 destination bytes per pixel, and a width
// mask of 7 (matching the kernels' 8-pixel step) - confirm against the ANY11
// macro definition, which is not visible here.
#if defined(HAS_ABGRTOAR30ROW_NEON)
|
||||||
|
ANY11(ABGRToAR30Row_Any_NEON, ABGRToAR30Row_NEON, 0, 4, 4, 7)
|
||||||
|
#endif
|
||||||
|
#if defined(HAS_ARGBTOAR30ROW_NEON)
|
||||||
|
ANY11(ARGBToAR30Row_Any_NEON, ARGBToAR30Row_NEON, 0, 4, 4, 7)
|
||||||
|
#endif
|
||||||
#if defined(HAS_ARGBTOAR30ROW_SSSE3)
|
#if defined(HAS_ARGBTOAR30ROW_SSSE3)
|
||||||
ANY11(ARGBToAR30Row_Any_SSSE3, ARGBToAR30Row_SSSE3, 0, 4, 4, 3)
|
ANY11(ARGBToAR30Row_Any_SSSE3, ARGBToAR30Row_SSSE3, 0, 4, 4, 3)
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@ -1722,6 +1722,59 @@ void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Per-16-bit-lane shift counts loaded into v3 and applied with USHL in
// ABCDToAR30Row_NEON. USHL shifts right when the count is negative, so even
// lanes are left unchanged and odd lanes are shifted right by 6, reducing the
// odd lanes to 10 significant bits.
static const int16_t kAR30Row_BoxShifts[] = {0, -6, 0, -6, 0, -6, 0, -6};
|
||||||
|
|
||||||
|
// TBL byte indices for ABGR input: two 16-byte vectors, each covering four
// 4-byte pixels. Every index is listed twice so the selected source byte
// fills both halves of a 16-bit lane.
//   First 16 entries:  bytes 2 and 1 of each pixel -> low 20 bits of output.
//   Second 16 entries: bytes 0 and 3 of each pixel -> upper 12 bits of output.
// NOTE(review): presumably bytes 0..3 of an ABGR pixel are R, G, B, A in
// memory - confirm against the format definition.
static const uint8_t kABGRToAR30Row_BoxIndices[] = {
|
||||||
|
2, 2, 1, 1, 6, 6, 5, 5, 10, 10, 9, 9, 14, 14, 13, 13,
|
||||||
|
0, 0, 3, 3, 4, 4, 7, 7, 8, 8, 11, 11, 12, 12, 15, 15};
|
||||||
|
// TBL byte indices for ARGB input: two 16-byte vectors, each covering four
// 4-byte pixels. Every index is listed twice so the selected source byte
// fills both halves of a 16-bit lane.
//   First 16 entries:  bytes 0 and 1 of each pixel -> low 20 bits of output.
//   Second 16 entries: bytes 2 and 3 of each pixel -> upper 12 bits of output.
// NOTE(review): presumably bytes 0..3 of an ARGB pixel are B, G, R, A in
// memory - confirm against the format definition.
static const uint8_t kARGBToAR30Row_BoxIndices[] = {
|
||||||
|
0, 0, 1, 1, 4, 4, 5, 5, 8, 8, 9, 9, 12, 12, 13, 13,
|
||||||
|
2, 2, 3, 3, 6, 6, 7, 7, 10, 10, 11, 11, 14, 14, 15, 15};
|
||||||
|
|
||||||
|
// ARGB or ABGR as input, reordering based on TBL indices parameter.
//
// Converts 4-byte pixels to 32-bit AR30 words, 8 pixels (32 bytes in, 32 bytes
// out) per loop iteration.
//   src_abcd: source pixels; post-incremented by 32 bytes per iteration.
//   dst_ar30: destination words; post-incremented by 32 bytes per iteration.
//   width:    pixel count; decremented by 8 per iteration, looping while the
//             result is > 0. The loop body runs before the first test, so a
//             width that is not a positive multiple of 8 still reads and
//             writes whole 32-byte groups.
//   indices:  32 TBL byte indices. The first 16 select the bytes that form the
//             low 20 bits of each output word, the second 16 select the bytes
//             that form the upper 12 bits.
|
||||||
|
static void ABCDToAR30Row_NEON(const uint8_t* src_abcd,
|
||||||
|
uint8_t* dst_ar30,
|
||||||
|
int width,
|
||||||
|
const uint8_t* indices) {
|
||||||
|
asm volatile(
|
||||||
|
// v2: per-32-bit-lane mask selecting the low 20 bits, used by BIF below.
"movi v2.4s, #0xf, msl 16 \n" // 0xfffff
|
||||||
|
// v3: per-lane USHL shift counts (0 for even lanes, -6 for odd lanes).
"ldr q3, [%[kAR30Row_BoxShifts]] \n"
|
||||||
|
// v4: indices for the low-bits bytes, v5: indices for the high-bits bytes.
"ldp q4, q5, [%[indices]] \n"
|
||||||
|
"1: \n"
|
||||||
|
// Load 8 pixels: v0 = pixels 0-3, v20 = pixels 4-7.
"ldp q0, q20, [%[src]], #32 \n"
|
||||||
|
// Sets the flags tested by b.gt at the end of the loop.
"subs %w[width], %w[width], #8 \n"
|
||||||
|
// Extract the high-bits bytes first: v0/v20 are overwritten in place by the
// next pair of TBLs. Each selected byte is duplicated into both bytes of a
// 16-bit lane.
"tbl v1.16b, {v0.16b}, v5.16b \n"
|
||||||
|
"tbl v21.16b, {v20.16b}, v5.16b \n"
|
||||||
|
"tbl v0.16b, {v0.16b}, v4.16b \n"
|
||||||
|
"tbl v20.16b, {v20.16b}, v4.16b \n"
|
||||||
|
// Odd 16-bit lanes are shifted right by 6 (10 bits remain); even lanes keep
// all 16 bits.
"ushl v0.8h, v0.8h, v3.8h \n"
|
||||||
|
"ushl v20.8h, v20.8h, v3.8h \n"
|
||||||
|
"ushl v1.8h, v1.8h, v3.8h \n"
|
||||||
|
"ushl v21.8h, v21.8h, v3.8h \n"
|
||||||
|
// Per 32-bit lane: the top 10 bits of the even lane land in bits 0-9 and the
// 10-bit odd lane lands in bits 10-19.
"ushr v0.4s, v0.4s, #6 \n"
|
||||||
|
"ushr v20.4s, v20.4s, #6 \n"
|
||||||
|
// Per 32-bit lane: the top 10 bits of the even lane land in bits 20-29 and
// the low 2 bits of the odd lane land in bits 30-31. Bits below 20 are
// discarded by the BIF.
"shl v1.4s, v1.4s, #14 \n"
|
||||||
|
"shl v21.4s, v21.4s, #14 \n"
|
||||||
|
// Keep v0/v20 where the mask is set (low 20 bits) and insert v1/v21 where it
// is clear (upper 12 bits).
"bif v0.16b, v1.16b, v2.16b \n"
|
||||||
|
"bif v20.16b, v21.16b, v2.16b \n"
|
||||||
|
// Store 8 packed 32-bit words.
"stp q0, q20, [%[dst]], #32 \n"
|
||||||
|
// Loop while the remaining width is greater than zero.
"b.gt 1b \n"
|
||||||
|
: [src] "+r"(src_abcd), // %[src]
|
||||||
|
[dst] "+r"(dst_ar30), // %[dst]
|
||||||
|
[width] "+r"(width) // %[width]
|
||||||
|
: [kAR30Row_BoxShifts] "r"(kAR30Row_BoxShifts), // %[kAR30Row_BoxShifts]
|
||||||
|
[indices] "r"(indices) // %[indices]
|
||||||
|
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v20", "v21");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Converts a row of ABGR pixels to AR30 by forwarding to ABCDToAR30Row_NEON
// with the ABGR byte-selection table. That kernel handles 8 pixels per
// iteration, so width should be a positive multiple of 8.
void ABGRToAR30Row_NEON(const uint8_t* src_abgr, uint8_t* dst_ar30, int width) {
|
||||||
|
ABCDToAR30Row_NEON(src_abgr, dst_ar30, width, kABGRToAR30Row_BoxIndices);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Converts a row of ARGB pixels to AR30 by forwarding to ABCDToAR30Row_NEON
// with the ARGB byte-selection table. That kernel handles 8 pixels per
// iteration, so width should be a positive multiple of 8.
void ARGBToAR30Row_NEON(const uint8_t* src_argb, uint8_t* dst_ar30, int width) {
|
||||||
|
ABCDToAR30Row_NEON(src_argb, dst_ar30, width, kARGBToAR30Row_BoxIndices);
|
||||||
|
}
|
||||||
|
|
||||||
void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
|
void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
|
||||||
uint8_t* dst_rgb24,
|
uint8_t* dst_rgb24,
|
||||||
int width) {
|
int width) {
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user