[AArch64] Use full vectors in ARGB1555To{Y,UV}Row_NEON

The existing RGB555TOARGB macro only makes use of 64-bit wide vectors
rather than the full 128 bits available, so unroll it to allow us to
process more data per instruction.

For ARGB1555ToUVRow_NEON we already have enough data available each
iteration to make use of full vectors, but for ARGB1555ToYRow_NEON we
also need to adjust the "any" kernel to allow us to process 16 elements
per iteration.

Reduction in runtimes observed compared to the existing Neon kernels:

            | ARGB1555ToUVRow | ARGB1555ToYRow
 Cortex-A55 |          -28.8% |         -35.3%
Cortex-A510 |          -34.0% |         -48.5%
 Cortex-A76 |          -36.7% |         -25.1%
Cortex-A720 |          -29.7% |         -31.1%
  Cortex-X1 |          -31.6% |         -19.7%
  Cortex-X2 |          -27.6% |         -22.7%

Bug: libyuv:976
Change-Id: Idd745c133b5fb65001652a59f01ac1aa3bb42067
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5631540
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Justin Green <greenjustin@google.com>
This commit is contained in:
George Steed 2024-04-17 11:27:48 +01:00 committed by Frank Barchard
parent 3b3c7f0b81
commit a425b559bd
2 changed files with 58 additions and 61 deletions

View File

@@ -1295,8 +1295,12 @@ ANY11(RGB565ToYRow_Any_LSX, RGB565ToYRow_LSX, 0, 2, 1, 15)
ANY11(RGB565ToYRow_Any_LASX, RGB565ToYRow_LASX, 0, 2, 1, 31)
#endif
// Any-width wrapper for ARGB1555ToYRow_NEON. Only the final ANY11 argument
// differs by architecture: 15 when building for AArch64, 7 otherwise.
// NOTE(review): ANY11 is defined outside this view; the last argument is
// presumably the remainder mask (pixels per kernel iteration minus one), which
// would match an AArch64 kernel that consumes 16 pixels per loop -- confirm
// against the ANY11 definition.
#ifdef HAS_ARGB1555TOYROW_NEON
#ifdef __aarch64__
ANY11(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 0, 2, 1, 15)
#else
ANY11(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 0, 2, 1, 7)
#endif  // __aarch64__
#endif  // HAS_ARGB1555TOYROW_NEON
#ifdef HAS_ARGB1555TOYROW_MSA
ANY11(ARGB1555ToYRow_Any_MSA, ARGB1555ToYRow_MSA, 0, 2, 1, 15)
#endif

View File

@@ -2096,16 +2096,18 @@ void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
"sri v0.8b, v0.8b, #5 \n" /* BBBBBBBB */
// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha.
// 64-bit-vector form: expands the 8 pixels held in v0.8h into 8-bit channels.
// Result: B in v0.8b, G in v1.8b, R in v2.8b. v29 is used as scratch.
// Each 5-bit channel is first moved to the top of its byte; the following
// "sri #5" then copies the top three bits into the low three bits so the
// value spans the full 8-bit range.
#define RGB555TOARGB \
/* Input: xRRRRRGGGGGBBBBB */ \
"xtn v29.8b, v0.8h \n" /* xxxBBBBB */ \
"shrn v2.8b, v0.8h, #7 \n" /* RRRRRxxx */ \
"shrn v1.8b, v0.8h, #2 \n" /* GGGGGxxx */ \
"shl v0.8b, v29.8b, #3 \n" /* BBBBB000 */ \
\
"sri v2.8b, v2.8b, #5 \n" /* RRRRRRRR */ \
"sri v1.8b, v1.8b, #5 \n" /* GGGGGGGG */ \
"sri v0.8b, v0.8b, #5 \n" /* BBBBBBBB */
// 128-bit-vector form: expands 16 pixels into 8-bit channels. The first 8
// pixels are read from v0.8h and the next 8 from v3.8h, so users of the macro
// must load both registers beforehand.
// Result: B in v0.16b, G in v1.16b, R in v2.16b. v29 is used as scratch.
// uzp1 gathers the low byte of every pixel from both inputs; shrn fills the
// low half and shrn2 the high half of the R and G vectors. The final "sri #5"
// copies the top three bits of each channel into its low three bits so the
// 5-bit value spans the full 8-bit range.
#define RGB555TOARGB \
/* Input: xRRRRRGGGGGBBBBB */ \
"uzp1 v29.16b, v0.16b, v3.16b \n" /* xxxBBBBB */ \
"shrn v2.8b, v0.8h, #7 \n" /* RRRRRxxx */ \
"shrn v1.8b, v0.8h, #2 \n" /* GGGGGxxx */ \
"shl v0.16b, v29.16b, #3 \n" /* BBBBB000 */ \
"shrn2 v2.16b, v3.8h, #7 \n" /* RRRRRxxx */ \
"shrn2 v1.16b, v3.8h, #2 \n" /* GGGGGxxx */ \
\
"sri v0.16b, v0.16b, #5 \n" /* BBBBBBBB */ \
"sri v2.16b, v2.16b, #5 \n" /* RRRRRRRR */ \
"sri v1.16b, v1.16b, #5 \n" /* GGGGGGGG */
void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
uint8_t* dst_argb,
@@ -3285,46 +3287,32 @@ void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555,
uint8_t* dst_v,
int width) {
const uint8_t* src_argb1555_1 = src_argb1555 + src_stride_argb1555;
asm volatile (
asm volatile(
RGBTOUV_SETUP_REG
"1: \n"
"ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
"1: \n"
"ldp q0, q3, [%0], #32 \n" // load 16 ARGB1555 pixels.
RGB555TOARGB
"uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
"prfm pldl1keep, [%0, 448] \n"
"uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
"uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
"ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB1555 pixels.
"uaddlp v16.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
"prfm pldl1keep, [%0, 448] \n"
"uaddlp v17.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
"uaddlp v18.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
"ldp q0, q3, [%1], #32 \n" // load 16 ARGB1555 pixels.
RGB555TOARGB
"uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
"uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
"uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
"uadalp v16.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
"prfm pldl1keep, [%1, 448] \n"
"uadalp v17.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
"uadalp v18.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
"ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB1555 pixels.
RGB555TOARGB
"uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
"prfm pldl1keep, [%1, 448] \n"
"uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
"uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
"ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB1555 pixels.
RGB555TOARGB
"uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
"uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
"uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
"urshr v0.8h, v16.8h, #1 \n" // 2x average
"urshr v1.8h, v17.8h, #1 \n"
"urshr v2.8h, v18.8h, #1 \n"
"ins v16.D[1], v26.D[0] \n"
"ins v17.D[1], v27.D[0] \n"
"ins v18.D[1], v28.D[0] \n"
"urshr v0.8h, v16.8h, #1 \n" // 2x average
"urshr v1.8h, v17.8h, #1 \n"
"urshr v2.8h, v18.8h, #1 \n"
"subs %w4, %w4, #16 \n" // 16 processed per loop.
"subs %w4, %w4, #16 \n" // 16 processed per loop.
RGBTOUV(v0.8h, v1.8h, v2.8h)
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
"b.gt 1b \n"
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
"b.gt 1b \n"
: "+r"(src_argb1555), // %0
"+r"(src_argb1555_1), // %1
"+r"(dst_u), // %2
@@ -3425,28 +3413,33 @@ void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
// Converts a row of ARGB1555 pixels read from src_argb1555 into 8-bit Y
// values written to dst_y.
// Channels are widened to 8 bits by RGB555TOARGB, then per pixel:
//   Y = sat_u8(sat_u8((25 * B + 129 * G + 66 * R + 128) >> 8) + 16)
// The loop decrements width by a fixed step and repeats while the result is
// greater than zero, so the body always runs at least once.
// NOTE(review): presumably callers pass a width that is a positive multiple of
// the per-iteration step (the "Any" wrapper handling remainders) -- confirm.
// NOTE(review): this listing is a diff view without +/- markers, so both the
// superseded 8-pixel body and its 16-pixel replacement appear below.
void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
uint8_t* dst_y,
int width) {
asm volatile (
"movi v4.8b, #25 \n" // B * 0.1016 coefficient
"movi v5.8b, #129 \n" // G * 0.5078 coefficient
"movi v6.8b, #66 \n" // R * 0.2578 coefficient
"movi v7.8b, #16 \n" // Add 16 constant
"1: \n"
"ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
asm volatile(
"movi v4.16b, #25 \n" // B * 0.1016 coefficient
"movi v5.16b, #129 \n" // G * 0.5078 coefficient
"movi v6.16b, #66 \n" // R * 0.2578 coefficient
"movi v7.16b, #16 \n" // Add 16 constant
"1: \n"
"ldp q0, q3, [%0], #32 \n" // load 16 ARGB1555 pixels.
"subs %w2, %w2, #16 \n" // 16 processed per loop.
RGB555TOARGB
"umull v3.8h, v0.8b, v4.8b \n" // B
"prfm pldl1keep, [%0, 448] \n"
"umlal v3.8h, v1.8b, v5.8b \n" // G
"umlal v3.8h, v2.8b, v6.8b \n" // R
"uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
"uqadd v0.8b, v0.8b, v7.8b \n"
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
"b.gt 1b \n"
"umull v16.8h, v0.8b, v4.8b \n" // B
"umull2 v17.8h, v0.16b, v4.16b \n" // B
"prfm pldl1keep, [%0, 448] \n"
"umlal v16.8h, v1.8b, v5.8b \n" // G
"umlal2 v17.8h, v1.16b, v5.16b \n" // G
"umlal v16.8h, v2.8b, v6.8b \n" // R
"umlal2 v17.8h, v2.16b, v6.16b \n" // R
"uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y
"uqrshrn2 v0.16b, v17.8h, #8 \n" // 16 bit to 8 bit Y
"uqadd v0.16b, v0.16b, v7.16b \n"
"str q0, [%1], #16 \n" // store pixels Y.
"b.gt 1b \n"
: "+r"(src_argb1555), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v29");
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17", "v29");
}
void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,