[AArch64] Avoid unnecessary widening in I422ToARGB1555Row_NEON

The existing code first widens the component vectors from 8-bit elements
to 16-bit elements to construct the final ARGB1555 result. However, this
is unnecessary since the inputs to the widening are themselves the result
of having just been narrowed in the RGBTORGB8 macro.

By making use of the new RGBTORGB8_TOP macro we can get rid of both the
widening as well as the prior narrowing step.

Also remove volatile from the asm, as it is unnecessary.

Reduction in runtime observed for I422ToARGB1555Row_NEON:

 Cortex-A55:  -7.8%
 Cortex-A76: -15.0%
Cortex-A720: -20.3%
  Cortex-X1: -20.2%
  Cortex-X2: -20.3%

Bug: libyuv:976
Change-Id: Id031c5d4d788828297adcc2fe2c2cd8d99b45433
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5616050
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
George Steed 2024-04-19 15:06:35 +01:00 committed by Frank Barchard
parent e6c4b9ad2e
commit 89cf221baa

View File

@ -566,14 +566,24 @@ void I422ToRGB565Row_NEON(const uint8_t* src_y,
: "cc", "memory", YUVTORGB_REGS);
}
/* Pack 8 pixels into ARGB1555, leaving the result in v0.8h.
 * Reads the 8-bit lanes of v16, v17, v18 and v19 (B, G, R and A, per the
 * per-line notes below). Each shll widens a channel and moves its byte to
 * the top of a 16-bit lane; each sri then inserts the next channel below
 * the bits already placed. v16-v18 are overwritten with their widened
 * values; v19 is only read. */
#define ARGBTOARGB1555 \
"shll v0.8h, v19.8b, #8 \n" /* A */ \
"shll v18.8h, v18.8b, #8 \n" /* R */ \
"shll v17.8h, v17.8b, #8 \n" /* G */ \
"shll v16.8h, v16.8b, #8 \n" /* B */ \
"sri v0.8h, v18.8h, #1 \n" /* AR */ \
"sri v0.8h, v17.8h, #6 \n" /* ARG */ \
"sri v0.8h, v16.8h, #11 \n" /* ARGB */
/* Pack 8 pixels into ARGB1555, leaving the result in v0.8h.
 * Each 8-bit input lane is first widened with shll so that its significant
 * bits sit at the top of a 16-bit lane; the sri steps then insert R, G and B
 * below the bits already placed in v0. v16-v18 are overwritten with their
 * widened values; v19 is only read.
 * In the bit diagrams, 'x' marks bits that do not reach the final result. */
#define ARGBTOARGB1555 \
/* Inputs: \
* v16: bbbbbxxx v17: gggggxxx v18: rrrrrxxx v19: axxxxxxx */ \
"shll v0.8h, v19.8b, #8 \n" /* axxxxxxx00000000 */ \
"shll v18.8h, v18.8b, #8 \n" /* rrrrrxxx00000000 */ \
"shll v17.8h, v17.8b, #8 \n" /* gggggxxx00000000 */ \
"shll v16.8h, v16.8b, #8 \n" /* bbbbbxxx00000000 */ \
"sri v0.8h, v18.8h, #1 \n" /* arrrrrxxx0000000 */ \
"sri v0.8h, v17.8h, #6 \n" /* arrrrrgggggxxx00 */ \
"sri v0.8h, v16.8h, #11 \n" /* arrrrrgggggbbbbb */
/* Pack 8 pixels into ARGB1555 in place in v19.8h.
 * The inputs are already 16-bit lanes whose significant bits sit at the top,
 * so no widening step is needed before the inserts. v19 is both the alpha
 * input and the output: the first sri keeps only the top bit of each v19
 * lane, so the lower 15 bits it held on entry are overwritten and do not
 * affect the result. v16-v18 are only read.
 * In the bit diagrams, 'x' marks bits that do not reach the final result. */
#define ARGBTOARGB1555_FROM_TOP \
/* Inputs: \
* v16: bbbbbxxxxxxxxxxx v17: gggggxxxxxxxxxxx \
* v18: rrrrrxxxxxxxxxxx v19: axxxxxxxxxxxxxxx */ \
"sri v19.8h, v18.8h, #1 \n" /* arrrrrxxxxxxxxxx */ \
"sri v19.8h, v17.8h, #6 \n" /* arrrrrgggggxxxxx */ \
"sri v19.8h, v16.8h, #11 \n" /* arrrrrgggggbbbbb */
void I422ToARGB1555Row_NEON(const uint8_t* src_y,
const uint8_t* src_u,
@ -581,15 +591,14 @@ void I422ToARGB1555Row_NEON(const uint8_t* src_y,
uint8_t* dst_argb1555,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile(
YUVTORGB_SETUP
"movi v19.8b, #255 \n"
"1: \n" READYUV422 I4XXTORGB
RGBTORGB8
"subs %w[width], %w[width], #8 \n" ARGBTOARGB1555
"st1 {v0.8h}, [%[dst_argb1555]], #16 \n" // store 8 pixels
// RGB565.
"b.gt 1b \n"
asm(YUVTORGB_SETUP
"movi v19.8h, #0x80, lsl #8 \n"
"1: \n" //
READYUV422 I4XXTORGB RGBTORGB8_TOP
"subs %w[width], %w[width], #8 \n" //
ARGBTOARGB1555_FROM_TOP
"st1 {v19.8h}, [%[dst_argb1555]], #16 \n" // store 8 pixels RGB1555.
"b.gt 1b \n"
: [src_y] "+r"(src_y), // %[src_y]
[src_u] "+r"(src_u), // %[src_u]
[src_v] "+r"(src_v), // %[src_v]