mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-08 01:36:47 +08:00
[AArch64] Unroll ARGB1555ToARGBRow_NEON to use full Neon vectors
Processing more data per loop iteration means that we can use the full 128-bit Neon vectors and also allows us to use e.g. UZP1 to perform XTN + XTN2 in a single instruction. The early Cortex-X cores do not perform well with ST4 .16b with a post-increment, so split out the pointer increment to a separate instruction to avoid this bottleneck. Reductions in runtime observed for ARGB1555ToARGBRow_NEON: Cortex-A55: -18.1%, Cortex-A510: -11.2%, Cortex-A520: -39.5%, Cortex-A76: -18.0%, Cortex-A715: -34.8%, Cortex-A720: -34.8%, Cortex-X1: -0.9%, Cortex-X2: -4.6%, Cortex-X3: -3.6%, Cortex-X4: -20.8%. Bug: libyuv:976 Change-Id: Iae2ac24ffdbc718cd1e05bb77191f8d1df3fcf6f Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5790975 Reviewed-by: Frank Barchard <fbarchard@chromium.org> Reviewed-by: Justin Green <greenjustin@google.com>
This commit is contained in:
parent
772f0fde1c
commit
d5303f4f77
@ -3792,7 +3792,7 @@ int ARGB1555ToARGB(const uint8_t* src_argb1555,
|
||||
#if defined(HAS_ARGB1555TOARGBROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_NEON;
|
||||
if (IS_ALIGNED(width, 8)) {
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
ARGB1555ToARGBRow = ARGB1555ToARGBRow_NEON;
|
||||
}
|
||||
}
|
||||
|
||||
@ -1393,7 +1393,7 @@ ANY11(RGB565ToARGBRow_Any_LSX, RGB565ToARGBRow_LSX, 0, 2, 4, 15)
|
||||
ANY11(RGB565ToARGBRow_Any_LASX, RGB565ToARGBRow_LASX, 0, 2, 4, 31)
|
||||
#endif
|
||||
#ifdef HAS_ARGB1555TOARGBROW_NEON
|
||||
ANY11(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 0, 2, 4, 7)
|
||||
ANY11(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 0, 2, 4, 15)
|
||||
#endif
|
||||
#ifdef HAS_ARGB1555TOARGBROW_MSA
|
||||
ANY11(ARGB1555ToARGBRow_Any_MSA, ARGB1555ToARGBRow_MSA, 0, 2, 4, 15)
|
||||
|
||||
@ -2083,17 +2083,19 @@ void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
|
||||
);
|
||||
}
|
||||
|
||||
// ARGB1555TOARGB (8-pixel form): asm fragment that expands ARGB1555 pixels
// into separate 8-bit B, G, R and A channels.
//   In:      v0.8h  - eight 16-bit pixels, bit layout ARRRRRGGGGGBBBBB.
//   Out:     v0.8b = B, v1.8b = G, v2.8b = R, v3.8b = A.
//   Scratch: v29.
// The narrowing steps (xtn/shrn, then shl for blue) leave each 5-bit colour
// field in the top bits of its byte; the following "sri ..., #5" turns
// CCCCCxxx into CCCCCCCC as noted on each line. "sshr ..., #7" spreads the
// single alpha bit across the whole byte.
#define ARGB1555TOARGB \
|
||||
/* Input: ARRRRRGGGGGBBBBB */ \
|
||||
"xtn v29.8b, v0.8h \n" /* xxxBBBBB */ \
|
||||
"shrn v3.8b, v0.8h, #8 \n" /* Axxxxxxx */ \
|
||||
"shrn v2.8b, v0.8h, #7 \n" /* RRRRRxxx */ \
|
||||
"shrn v1.8b, v0.8h, #2 \n" /* GGGGGxxx */ \
|
||||
"shl v0.8b, v29.8b, #3 \n" /* BBBBB000 */ \
|
||||
"sshr v3.8b, v3.8b, #7 \n" /* AAAAAAAA */ \
|
||||
"sri v2.8b, v2.8b, #5 \n" /* RRRRRRRR */ \
|
||||
"sri v1.8b, v1.8b, #5 \n" /* GGGGGGGG */ \
|
||||
"sri v0.8b, v0.8b, #5 \n" /* BBBBBBBB */
|
||||
// ARGB1555TOARGB (16-pixel form): asm fragment that expands ARGB1555 pixels
// into separate 8-bit B, G, R and A channels using full 128-bit vectors.
//   In:      v0.8h and v4.8h - sixteen 16-bit pixels, bit layout
//            ARRRRRGGGGGBBBBB.
//   Out:     v0.16b = B, v1.16b = G, v2.16b = R, v3.16b = A.
//   Scratch: v29.
// uzp1/uzp2 across v0 and v4 gather the blue-carrying and alpha-carrying
// byte of every pixel in one instruction each; red and green are narrowed
// with a shrn (from v0) / shrn2 (from v4) pair. Each 5-bit colour field then
// sits in the top bits of its byte (blue after the shl), and
// "sri ..., #5" turns CCCCCxxx into CCCCCCCC as noted on each line.
// "sshr ..., #7" spreads the single alpha bit across the whole byte.
#define ARGB1555TOARGB \
|
||||
/* Input: ARRRRRGGGGGBBBBB */ \
|
||||
"shrn v2.8b, v0.8h, #7 \n" /* RRRRRxxx */ \
|
||||
"uzp1 v29.16b, v0.16b, v4.16b \n" /* xxxBBBBB */ \
|
||||
"shrn v1.8b, v0.8h, #2 \n" /* GGGGGxxx */ \
|
||||
"uzp2 v3.16b, v0.16b, v4.16b \n" /* Axxxxxxx */ \
|
||||
"shrn2 v2.16b, v4.8h, #7 \n" /* RRRRRxxx */ \
|
||||
"shl v0.16b, v29.16b, #3 \n" /* BBBBB000 */ \
|
||||
"shrn2 v1.16b, v4.8h, #2 \n" /* GGGGGxxx */ \
|
||||
"sshr v3.16b, v3.16b, #7 \n" /* AAAAAAAA */ \
|
||||
"sri v2.16b, v2.16b, #5 \n" /* RRRRRRRR */ \
|
||||
"sri v1.16b, v1.16b, #5 \n" /* GGGGGGGG */ \
|
||||
"sri v0.16b, v0.16b, #5 \n" /* BBBBBBBB */
|
||||
|
||||
// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha.
|
||||
#define RGB555TOARGB \
|
||||
@ -2112,14 +2114,15 @@ void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
|
||||
void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
|
||||
uint8_t* dst_argb,
|
||||
int width) {
|
||||
asm volatile (
|
||||
"1: \n"
|
||||
"ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
|
||||
"prfm pldl1keep, [%0, 448] \n"
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"ldp q0, q4, [%0], #32 \n" // load 16 ARGB1555 pixels
|
||||
"prfm pldl1keep, [%0, 448] \n"
|
||||
"subs %w2, %w2, #16 \n" // 16 processed per loop
|
||||
ARGB1555TOARGB
|
||||
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
|
||||
"b.gt 1b \n"
|
||||
"st4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%1] \n" // store 16 ARGB
|
||||
"add %1, %1, #64 \n"
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_argb1555), // %0
|
||||
"+r"(dst_argb), // %1
|
||||
"+r"(width) // %2
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user