From dff7bad43def35d09b6b1666b9645ec78c0cb0da Mon Sep 17 00:00:00 2001 From: George Steed Date: Sat, 23 Mar 2024 20:02:41 +0000 Subject: [PATCH] [AArch64] Use full Neon vectors in ARGB4444ToARGBRow_NEON The existing Neon code narrows the input 16-bit packed data to 8-bit elements and separates the color channels, causing us to only process half a Neon vector per instruction for the channel widening from 4-bit color data to 8 bits. We can note that the processing being done is identical for all color channels and therefore we can keep them partially interleaved during the widening step. This allows us to use full Neon vectors for the whole loop body. Reductions in runtimes observed for ARGB4444ToARGBRow_NEON: Cortex-A55: -30.7% Cortex-A510: -44.3% Cortex-A76: -51.6% Cortex-X2: -54.2% Bug: libyuv:976 Change-Id: I9d9cda7e16eb07619c6d7f1de2e6b8c0fb6d64cf Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5594389 Reviewed-by: Frank Barchard --- source/row_neon64.cc | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/source/row_neon64.cc b/source/row_neon64.cc index 41c386201..231ab9af3 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -1911,16 +1911,11 @@ void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555, ); } -#define ARGB4444TOARGB \ - /* Input: v0.8h = AAAARRRRGGGGBBBB */ \ - "xtn v1.8b, v0.8h \n" /* GGGGBBBB */ \ - "shrn v2.8b, v0.8h, #4 \n" /* RRRRxxxx */ \ - "shrn v3.8b, v0.8h, #8 \n" /* AAAAxxxx */ \ - "shl v0.8b, v1.8b, #4 \n" /* BBBB0000 */ \ - "sri v1.8b, v1.8b, #4 \n" /* GGGGGGGG */ \ - "sri v2.8b, v2.8b, #4 \n" /* RRRRRRRR */ \ - "sri v3.8b, v3.8b, #4 \n" /* AAAAAAAA */ \ - "sri v0.8b, v0.8b, #4 \n" /* BBBBBBBB */ +#define ARGB4444TOARGB \ + /* Input: v1.8h = AAAARRRR_GGGGBBBB */ \ + "shl v0.16b, v1.16b, #4 \n" /* RRRR0000_BBBB0000 */ \ + "sri v1.16b, v1.16b, #4 \n" /* AAAAAAAA_GGGGGGGG */ \ + "sri v0.16b, v0.16b, #4 \n" /* RRRRRRRR_BBBBBBBB */ #define ARGB4444TORGB \ /* Input: v0.8h 
= xxxxRRRRGGGGBBBB */ \ @@ -1936,10 +1931,10 @@ void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444, int width) { asm volatile( "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. + "ld1 {v1.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. "subs %w2, %w2, #8 \n" // 8 processed per loop. "prfm pldl1keep, [%0, 448] \n" ARGB4444TOARGB - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB + "st2 {v0.16b, v1.16b}, [%1], #32 \n" // store 8 ARGB. "b.gt 1b \n" : "+r"(src_argb4444), // %0 "+r"(dst_argb), // %1