[AArch64] Unroll ARGB1555ToARGBRow_NEON to use full Neon vectors

Processing more data per loop iteration means that we can use the full
128-bit Neon vectors and also allows us to use e.g. UZP1 to perform XTN
+ XTN2 in a single instruction.

Early Cortex-X cores perform poorly on ST4 .16b with a post-increment,
so split the pointer increment out into a separate instruction to avoid
this bottleneck.

Reductions in runtime observed for ARGB1555ToARGBRow_NEON:

 Cortex-A55: -18.1%
Cortex-A510: -11.2%
Cortex-A520: -39.5%
 Cortex-A76: -18.0%
Cortex-A715: -34.8%
Cortex-A720: -34.8%
  Cortex-X1:  -0.9%
  Cortex-X2:  -4.6%
  Cortex-X3:  -3.6%
  Cortex-X4: -20.8%

Bug: libyuv:976
Change-Id: Iae2ac24ffdbc718cd1e05bb77191f8d1df3fcf6f
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5790975
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Justin Green <greenjustin@google.com>
This commit is contained in:
George Steed 2024-04-22 10:03:28 +01:00 committed by Frank Barchard
parent 772f0fde1c
commit d5303f4f77
3 changed files with 23 additions and 20 deletions

View File

@ -3792,7 +3792,7 @@ int ARGB1555ToARGB(const uint8_t* src_argb1555,
#if defined(HAS_ARGB1555TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
if (IS_ALIGNED(width, 16)) {
ARGB1555ToARGBRow = ARGB1555ToARGBRow_NEON;
}
}

View File

@ -1393,7 +1393,7 @@ ANY11(RGB565ToARGBRow_Any_LSX, RGB565ToARGBRow_LSX, 0, 2, 4, 15)
ANY11(RGB565ToARGBRow_Any_LASX, RGB565ToARGBRow_LASX, 0, 2, 4, 31)
#endif
#ifdef HAS_ARGB1555TOARGBROW_NEON
ANY11(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 0, 2, 4, 7)
ANY11(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 0, 2, 4, 15)
#endif
#ifdef HAS_ARGB1555TOARGBROW_MSA
ANY11(ARGB1555ToARGBRow_Any_MSA, ARGB1555ToARGBRow_MSA, 0, 2, 4, 15)

View File

@ -2083,17 +2083,19 @@ void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
);
}
// Assembly fragment: expands eight ARGB1555 pixels into 8-bit channels.
// Input:  v0 holds eight 16-bit pixels laid out as ARRRRRGGGGGBBBBB.
// Output: low 8 bytes of v0 = B, v1 = G, v2 = R, v3 = A.
// Uses v29 as scratch.
// The narrowing shifts move each field to the top of a byte; SRI #5 then
// copies the top three bits of each 5-bit colour field into the low three
// bits, and SSHR #7 spreads the single alpha bit across the whole byte.
#define ARGB1555TOARGB \
/* Input: ARRRRRGGGGGBBBBB */ \
"xtn v29.8b, v0.8h \n" /* xxxBBBBB */ \
"shrn v3.8b, v0.8h, #8 \n" /* Axxxxxxx */ \
"shrn v2.8b, v0.8h, #7 \n" /* RRRRRxxx */ \
"shrn v1.8b, v0.8h, #2 \n" /* GGGGGxxx */ \
"shl v0.8b, v29.8b, #3 \n" /* BBBBB000 */ \
"sshr v3.8b, v3.8b, #7 \n" /* AAAAAAAA */ \
"sri v2.8b, v2.8b, #5 \n" /* RRRRRRRR */ \
"sri v1.8b, v1.8b, #5 \n" /* GGGGGGGG */ \
"sri v0.8b, v0.8b, #5 \n" /* BBBBBBBB */
// Assembly fragment: expands sixteen ARGB1555 pixels into 8-bit channels.
// Input:  v0 and v4 each hold eight 16-bit pixels laid out as
//         ARRRRRGGGGGBBBBB (v0 = first eight pixels, v4 = next eight).
// Output: v0 = B, v1 = G, v2 = R, v3 = A, sixteen bytes each.
// Uses v29 as scratch.
// UZP1/UZP2 collect one byte of every 16-bit pixel from both input vectors
// in a single instruction (the blue-carrying and alpha-carrying bytes
// respectively), while SHRN/SHRN2 pairs narrow the red and green fields
// into the low and high halves of the destination. SRI #5 then copies the
// top three bits of each 5-bit colour field into the low three bits, and
// SSHR #7 spreads the single alpha bit across the whole byte.
#define ARGB1555TOARGB \
/* Input: ARRRRRGGGGGBBBBB */ \
"shrn v2.8b, v0.8h, #7 \n" /* RRRRRxxx */ \
"uzp1 v29.16b, v0.16b, v4.16b \n" /* xxxBBBBB */ \
"shrn v1.8b, v0.8h, #2 \n" /* GGGGGxxx */ \
"uzp2 v3.16b, v0.16b, v4.16b \n" /* Axxxxxxx */ \
"shrn2 v2.16b, v4.8h, #7 \n" /* RRRRRxxx */ \
"shl v0.16b, v29.16b, #3 \n" /* BBBBB000 */ \
"shrn2 v1.16b, v4.8h, #2 \n" /* GGGGGxxx */ \
"sshr v3.16b, v3.16b, #7 \n" /* AAAAAAAA */ \
"sri v2.16b, v2.16b, #5 \n" /* RRRRRRRR */ \
"sri v1.16b, v1.16b, #5 \n" /* GGGGGGGG */ \
"sri v0.16b, v0.16b, #5 \n" /* BBBBBBBB */
// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha.
#define RGB555TOARGB \
@ -2112,14 +2114,15 @@ void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
uint8_t* dst_argb,
int width) {
asm volatile (
"1: \n"
"ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
"prfm pldl1keep, [%0, 448] \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop.
asm volatile(
"1: \n"
"ldp q0, q4, [%0], #32 \n" // load 16 ARGB1555 pixels
"prfm pldl1keep, [%0, 448] \n"
"subs %w2, %w2, #16 \n" // 16 processed per loop
ARGB1555TOARGB
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
"b.gt 1b \n"
"st4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%1] \n" // store 16 ARGB
"add %1, %1, #64 \n"
"b.gt 1b \n"
: "+r"(src_argb1555), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2