[AArch64] Unroll to use full vectors in ARGBToARGB1555Row_NEON

By loading packed 16-bit AR/GB data and operating on that directly, we
avoid the need to perform a separate widening step before the
conversion.

Reduction in runtime observed compared to the existing Neon code:

 Cortex-A55: -13.2%
Cortex-A510:  -5.4%
 Cortex-A76: -21.5%
Cortex-A720: -25.2%
  Cortex-X1: -50.6%
  Cortex-X2: -36.8%

Bug: b/42280945
Change-Id: I780c71fdff1d017464c6e4e38f86979dda0e43ad
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5790973
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Justin Green <greenjustin@google.com>
This commit is contained in:
George Steed 2024-04-22 11:21:48 +01:00 committed by Frank Barchard
parent 432d186116
commit 2dfb84b311

View File

@ -786,14 +786,12 @@ void I422ToRGB565Row_NEON(const uint8_t* src_y,
// Assembly fragment that packs alpha/red/green/blue bits into 16-bit
// ARGB1555 lanes (1 alpha bit, then 5 bits each of red, green and blue).
// The trailing comments show the bit layout of one 16-bit lane after each
// instruction; 'x' marks source bits that are dropped by the final result.
#define ARGBTOARGB1555 \
/* Inputs: \
* v16: bbbbbxxx v17: gggggxxx v18: rrrrrxxx v19: axxxxxxx */ \
"shll v0.8h, v19.8b, #8 \n" /* axxxxxxx00000000 */ \
"shll v18.8h, v18.8b, #8 \n" /* rrrrrxxx00000000 */ \
"shll v17.8h, v17.8b, #8 \n" /* gggggxxx00000000 */ \
"shll v16.8h, v16.8b, #8 \n" /* bbbbbxxx00000000 */ \
"sri v0.8h, v18.8h, #1 \n" /* arrrrrxxx0000000 */ \
"sri v0.8h, v17.8h, #6 \n" /* arrrrrgggggxxx00 */ \
"sri v0.8h, v16.8h, #11 \n" /* arrrrrgggggbbbbb */
* v16: gggggxxxbbbbbxxx v17: axxxxxxxrrrrrxxx */ \
"shl v1.8h, v16.8h, #8 \n" /* bbbbbxxx00000000 */ \
"shl v2.8h, v17.8h, #8 \n" /* rrrrrxxx00000000 */ \
"sri v17.8h, v2.8h, #1 \n" /* arrrrrxxx0000000 */ \
"sri v17.8h, v16.8h, #6 \n" /* arrrrrgggggxxxbb */ \
"sri v17.8h, v1.8h, #11 \n" /* arrrrrgggggbbbbb */
#define ARGBTOARGB1555_FROM_TOP \
/* Inputs: \
@ -2517,19 +2515,18 @@ void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
// Converts a row of pixels from src_argb to 16-bit ARGB1555 in dst_argb1555
// using AArch64 NEON inline assembly.
//
// Each loop iteration reads 32 bytes from src_argb (8 pixels), writes 16
// bytes to dst_argb1555, and subtracts 8 from width. The loop body executes
// before the remaining count is tested, so at least 8 pixels are always
// processed.
// NOTE(review): presumably callers pass width as a positive multiple of 8 -
// confirm against the call sites.
void ARGBToARGB1555Row_NEON(const uint8_t* src_argb,
uint8_t* dst_argb1555,
int width) {
asm volatile (
asm volatile(
"1: \n"
"ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8
// pixels
"ld2 {v16.8h,v17.8h}, [%0], #32 \n" // load 8 pixels
"subs %w2, %w2, #8 \n" // 8 processed per loop.
// Prefetch 448 bytes ahead of the advanced source pointer, then pack the
// loaded channels with the ARGBTOARGB1555 instruction sequence.
"prfm pldl1keep, [%0, 448] \n" ARGBTOARGB1555
"st1 {v0.16b}, [%1], #16 \n" // store 8 pixels
"st1 {v17.16b}, [%1], #16 \n" // store 8 pixels
// Branch on the flags set by the subs above: repeat while the remaining
// width is still greater than zero.
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb1555), // %1
"+r"(width) // %2
:
// Clobbers: condition flags (subs), memory (st1), and every vector
// register written by the loads and by ARGBTOARGB1555.
: "cc", "memory", "v0", "v16", "v17", "v18", "v19");
: "cc", "memory", "v1", "v2", "v16", "v17");
}
void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,