From d5303f4f779a6baf66bbac8a97937e507ffaaaba Mon Sep 17 00:00:00 2001 From: George Steed Date: Mon, 22 Apr 2024 10:03:28 +0100 Subject: [PATCH] [AArch64] Unroll ARGB1555ToARGBRow_NEON to use full Neon vectors Processing more data per loop iteration means that we can use the full 128-bit Neon vectors and also allows us to use e.g. UZP1 to perform XTN + XTN2 in a single instruction. The early Cortex-X cores are not a fan of ST4 .16b with a post-increment, so split out the pointer increment to a separate instruction to avoid this bottleneck. Reductions in runtime observed for ARGB1555ToARGBRow_NEON: Cortex-A55: -18.1% Cortex-A510: -11.2% Cortex-A520: -39.5% Cortex-A76: -18.0% Cortex-A715: -34.8% Cortex-A720: -34.8% Cortex-X1: -0.9% Cortex-X2: -4.6% Cortex-X3: -3.6% Cortex-X4: -20.8% Bug: libyuv:976 Change-Id: Iae2ac24ffdbc718cd1e05bb77191f8d1df3fcf6f Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5790975 Reviewed-by: Frank Barchard Reviewed-by: Justin Green --- source/convert_argb.cc | 2 +- source/row_any.cc | 2 +- source/row_neon64.cc | 39 +++++++++++++++++++++------------------ 3 files changed, 23 insertions(+), 20 deletions(-) diff --git a/source/convert_argb.cc b/source/convert_argb.cc index dd94c1b77..a77358df8 100644 --- a/source/convert_argb.cc +++ b/source/convert_argb.cc @@ -3792,7 +3792,7 @@ int ARGB1555ToARGB(const uint8_t* src_argb1555, #if defined(HAS_ARGB1555TOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { + if (IS_ALIGNED(width, 16)) { ARGB1555ToARGBRow = ARGB1555ToARGBRow_NEON; } } diff --git a/source/row_any.cc b/source/row_any.cc index b9eaa0a3c..d2f8a5419 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -1393,7 +1393,7 @@ ANY11(RGB565ToARGBRow_Any_LSX, RGB565ToARGBRow_LSX, 0, 2, 4, 15) ANY11(RGB565ToARGBRow_Any_LASX, RGB565ToARGBRow_LASX, 0, 2, 4, 31) #endif #ifdef HAS_ARGB1555TOARGBROW_NEON -ANY11(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 0, 2, 4, 7) +ANY11(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 0, 2, 4, 15) #endif #ifdef HAS_ARGB1555TOARGBROW_MSA ANY11(ARGB1555ToARGBRow_Any_MSA, ARGB1555ToARGBRow_MSA, 0, 2, 4, 15) diff --git a/source/row_neon64.cc b/source/row_neon64.cc index 6d50b2760..7ad54b430 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -2083,17 +2083,19 @@ void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, ); } -#define ARGB1555TOARGB \ - /* Input: ARRRRRGGGGGBBBBB */ \ - "xtn v29.8b, v0.8h \n" /* xxxBBBBB */ \ - "shrn v3.8b, v0.8h, #8 \n" /* Axxxxxxx */ \ - "shrn v2.8b, v0.8h, #7 \n" /* RRRRRxxx */ \ - "shrn v1.8b, v0.8h, #2 \n" /* GGGGGxxx */ \ - "shl v0.8b, v29.8b, #3 \n" /* BBBBB000 */ \ - "sshr v3.8b, v3.8b, #7 \n" /* AAAAAAAA */ \ - "sri v2.8b, v2.8b, #5 \n" /* RRRRRRRR */ \ - "sri v1.8b, v1.8b, #5 \n" /* GGGGGGGG */ \ - "sri v0.8b, v0.8b, #5 \n" /* BBBBBBBB */ +#define ARGB1555TOARGB \ + /* Input: ARRRRRGGGGGBBBBB */ \ + "shrn v2.8b, v0.8h, #7 \n" /* RRRRRxxx */ \ + "uzp1 v29.16b, v0.16b, v4.16b \n" /* xxxBBBBB */ \ + "shrn v1.8b, v0.8h, #2 \n" /* GGGGGxxx */ \ + "uzp2 v3.16b, v0.16b, v4.16b \n" /* Axxxxxxx */ \ + "shrn2 v2.16b, v4.8h, #7 \n" /* RRRRRxxx */ \ + "shl v0.16b, v29.16b, #3 \n" /* BBBBB000 */ \ + "shrn2 v1.16b, v4.8h, #2 \n" /* GGGGGxxx */ \ + "sshr v3.16b, v3.16b, #7 \n" /* AAAAAAAA */ \ + "sri v2.16b, v2.16b, #5 \n" /* RRRRRRRR */ \ + "sri v1.16b, v1.16b, #5 \n" /* GGGGGGGG */ \ + "sri v0.16b, v0.16b, #5 \n" /* BBBBBBBB */ // RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha. #define RGB555TOARGB \ @@ -2112,14 +2114,15 @@ void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555, uint8_t* dst_argb, int width) { - asm volatile ( - "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. - "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop. + asm volatile( + "1: \n" + "ldp q0, q4, [%0], #32 \n" // load 16 ARGB1555 pixels + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #16 \n" // 16 processed per loop ARGB1555TOARGB - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB - "b.gt 1b \n" + "st4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%1] \n" // store 16 ARGB + "add %1, %1, #64 \n" + "b.gt 1b \n" : "+r"(src_argb1555), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2