From 90070986aeac1129aa7632d986d636d3d29d5859 Mon Sep 17 00:00:00 2001 From: George Steed Date: Wed, 13 Mar 2024 06:36:29 +0000 Subject: [PATCH] [AArch64] Improve RGB565TOARGB using SRI instructions The existing code performs a lot of shifts and combines the R and B components into a single vector unnecessarily. We can express this much more cleanly by making use of the SRI instruction to insert and replace shifted bits into the original data, performing the 5/6-bit to 8-bit expansion in a single instruction if the source bits are already in the high bits of the byte. We still need a single separate XTN instruction to narrow the B component before the left shift since Neon does not have a narrowing left shift instruction. Reduction in runtime for selected kernels: Kernel | Cortex-A55 | Cortex-A76 | Cortex-X2 RGB565ToYRow_NEON | -22.1% | -23.4% | -25.1% RGB565ToUVRow_NEON | -26.8% | -20.5% | -18.8% RGB565ToARGBRow_NEON | -38.9% | -32.0% | -23.5% Bug: libyuv:976 Change-Id: I77b8d58287b70dbb9549451fc15ed3dd0d2a4dda Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5374286 Reviewed-by: Frank Barchard Reviewed-by: Justin Green --- source/row_neon64.cc | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/source/row_neon64.cc b/source/row_neon64.cc index 2836ee94a..f9b34a491 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -1611,18 +1611,15 @@ void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { ); } -#define RGB565TOARGB \ - "shrn v6.8b, v0.8h, #5 \n" /* G xxGGGGGG */ \ - "shl v6.8b, v6.8b, #2 \n" /* G GGGGGG00 upper 6 */ \ - "ushr v4.8b, v6.8b, #6 \n" /* G 000000GG lower 2 */ \ - "orr v1.8b, v4.8b, v6.8b \n" /* G */ \ - "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ - "ushr v0.8h, v0.8h, #11 \n" /* R 000RRRRR */ \ - "xtn2 v2.16b,v0.8h \n" /* R in upper part */ \ - "shl v2.16b, v2.16b, #3 \n" /* R,B BBBBB000 upper 5 */ \ - "ushr v0.16b, v2.16b, #5 \n" /* R,B 00000BBB lower 3 */ 
\ - "orr v0.16b, v0.16b, v2.16b \n" /* R,B */ \ - "dup v2.2D, v0.D[1] \n" /* R */ +#define RGB565TOARGB \ + /* Input: v0.8h: RRRRRGGGGGGBBBBB */ \ + "shrn v1.8b, v0.8h, #3 \n" /* G GGGGGGxx */ \ + "shrn v2.8b, v0.8h, #8 \n" /* R RRRRRxxx */ \ + "xtn v0.8b, v0.8h \n" /* B xxxBBBBB */ \ + "sri v1.8b, v1.8b, #6 \n" /* G GGGGGGGG, fill 2 */ \ + "shl v0.8b, v0.8b, #3 \n" /* B BBBBB000 */ \ + "sri v2.8b, v2.8b, #5 \n" /* R RRRRRRRR, fill 3 */ \ + "sri v0.8b, v0.8b, #5 \n" /* B BBBBBBBB, fill 3 */ void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_argb,