mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2026-02-13 13:49:53 +08:00
[AArch64] Use full vectors in ARGB1555To{Y,UV}Row_NEON
The existing RGB555TOARGB macro only makes use of 64-bit-wide vectors
rather than the full 128 bits available, so unroll it to allow us to
process more data per instruction.
For ARGB1555ToUVRow_NEON we already have enough data available each
iteration to make use of full vectors, but for ARGB1555ToYRow_NEON we
also need to adjust the "any" kernel to allow us to process 16 elements
per iteration.
Reduction in runtimes observed compared to the existing Neon kernels:
            | ARGB1555ToUVRow | ARGB1555ToYRow
 Cortex-A55 |          -28.8% |         -35.3%
Cortex-A510 |          -34.0% |         -48.5%
 Cortex-A76 |          -36.7% |         -25.1%
Cortex-A720 |          -29.7% |         -31.1%
  Cortex-X1 |          -31.6% |         -19.7%
  Cortex-X2 |          -27.6% |         -22.7%
Bug: libyuv:976
Change-Id: Idd745c133b5fb65001652a59f01ac1aa3bb42067
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5631540
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Justin Green <greenjustin@google.com>
This commit is contained in:
parent
3b3c7f0b81
commit
a425b559bd
@ -1295,8 +1295,12 @@ ANY11(RGB565ToYRow_Any_LSX, RGB565ToYRow_LSX, 0, 2, 1, 15)
|
|||||||
ANY11(RGB565ToYRow_Any_LASX, RGB565ToYRow_LASX, 0, 2, 1, 31)
|
ANY11(RGB565ToYRow_Any_LASX, RGB565ToYRow_LASX, 0, 2, 1, 31)
|
||||||
#endif
|
#endif
|
||||||
#ifdef HAS_ARGB1555TOYROW_NEON
|
#ifdef HAS_ARGB1555TOYROW_NEON
|
||||||
|
#ifdef __aarch64__
|
||||||
|
ANY11(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 0, 2, 1, 15)
|
||||||
|
#else
|
||||||
ANY11(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 0, 2, 1, 7)
|
ANY11(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 0, 2, 1, 7)
|
||||||
#endif
|
#endif
|
||||||
|
#endif
|
||||||
#ifdef HAS_ARGB1555TOYROW_MSA
|
#ifdef HAS_ARGB1555TOYROW_MSA
|
||||||
ANY11(ARGB1555ToYRow_Any_MSA, ARGB1555ToYRow_MSA, 0, 2, 1, 15)
|
ANY11(ARGB1555ToYRow_Any_MSA, ARGB1555ToYRow_MSA, 0, 2, 1, 15)
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@ -2096,16 +2096,18 @@ void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
|
|||||||
"sri v0.8b, v0.8b, #5 \n" /* BBBBBBBB */
|
"sri v0.8b, v0.8b, #5 \n" /* BBBBBBBB */
|
||||||
|
|
||||||
// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha.
|
// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha.
|
||||||
#define RGB555TOARGB \
|
#define RGB555TOARGB \
|
||||||
/* Input: xRRRRRGGGGGBBBBB */ \
|
/* Input: xRRRRRGGGGGBBBBB */ \
|
||||||
"xtn v29.8b, v0.8h \n" /* xxxBBBBB */ \
|
"uzp1 v29.16b, v0.16b, v3.16b \n" /* xxxBBBBB */ \
|
||||||
"shrn v2.8b, v0.8h, #7 \n" /* RRRRRxxx */ \
|
"shrn v2.8b, v0.8h, #7 \n" /* RRRRRxxx */ \
|
||||||
"shrn v1.8b, v0.8h, #2 \n" /* GGGGGxxx */ \
|
"shrn v1.8b, v0.8h, #2 \n" /* GGGGGxxx */ \
|
||||||
"shl v0.8b, v29.8b, #3 \n" /* BBBBB000 */ \
|
"shl v0.16b, v29.16b, #3 \n" /* BBBBB000 */ \
|
||||||
\
|
"shrn2 v2.16b, v3.8h, #7 \n" /* RRRRRxxx */ \
|
||||||
"sri v2.8b, v2.8b, #5 \n" /* RRRRRRRR */ \
|
"shrn2 v1.16b, v3.8h, #2 \n" /* GGGGGxxx */ \
|
||||||
"sri v1.8b, v1.8b, #5 \n" /* GGGGGGGG */ \
|
\
|
||||||
"sri v0.8b, v0.8b, #5 \n" /* BBBBBBBB */
|
"sri v0.16b, v0.16b, #5 \n" /* BBBBBBBB */ \
|
||||||
|
"sri v2.16b, v2.16b, #5 \n" /* RRRRRRRR */ \
|
||||||
|
"sri v1.16b, v1.16b, #5 \n" /* GGGGGGGG */
|
||||||
|
|
||||||
void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
|
void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
|
||||||
uint8_t* dst_argb,
|
uint8_t* dst_argb,
|
||||||
@ -3285,46 +3287,32 @@ void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555,
|
|||||||
uint8_t* dst_v,
|
uint8_t* dst_v,
|
||||||
int width) {
|
int width) {
|
||||||
const uint8_t* src_argb1555_1 = src_argb1555 + src_stride_argb1555;
|
const uint8_t* src_argb1555_1 = src_argb1555 + src_stride_argb1555;
|
||||||
asm volatile (
|
asm volatile(
|
||||||
RGBTOUV_SETUP_REG
|
RGBTOUV_SETUP_REG
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
|
"ldp q0, q3, [%0], #32 \n" // load 16 ARGB1555 pixels.
|
||||||
RGB555TOARGB
|
RGB555TOARGB
|
||||||
"uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
|
"uaddlp v16.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
|
||||||
"prfm pldl1keep, [%0, 448] \n"
|
"prfm pldl1keep, [%0, 448] \n"
|
||||||
"uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
|
"uaddlp v17.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
|
||||||
"uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
|
"uaddlp v18.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
|
||||||
"ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB1555 pixels.
|
|
||||||
|
"ldp q0, q3, [%1], #32 \n" // load 16 ARGB1555 pixels.
|
||||||
RGB555TOARGB
|
RGB555TOARGB
|
||||||
"uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
|
"uadalp v16.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
|
||||||
"uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
|
"prfm pldl1keep, [%1, 448] \n"
|
||||||
"uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
|
"uadalp v17.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
|
||||||
|
"uadalp v18.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
|
||||||
|
|
||||||
"ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB1555 pixels.
|
"urshr v0.8h, v16.8h, #1 \n" // 2x average
|
||||||
RGB555TOARGB
|
"urshr v1.8h, v17.8h, #1 \n"
|
||||||
"uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
|
"urshr v2.8h, v18.8h, #1 \n"
|
||||||
"prfm pldl1keep, [%1, 448] \n"
|
|
||||||
"uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
|
|
||||||
"uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
|
|
||||||
"ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB1555 pixels.
|
|
||||||
RGB555TOARGB
|
|
||||||
"uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
|
|
||||||
"uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
|
|
||||||
"uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
|
|
||||||
|
|
||||||
"ins v16.D[1], v26.D[0] \n"
|
"subs %w4, %w4, #16 \n" // 16 processed per loop.
|
||||||
"ins v17.D[1], v27.D[0] \n"
|
|
||||||
"ins v18.D[1], v28.D[0] \n"
|
|
||||||
|
|
||||||
"urshr v0.8h, v16.8h, #1 \n" // 2x average
|
|
||||||
"urshr v1.8h, v17.8h, #1 \n"
|
|
||||||
"urshr v2.8h, v18.8h, #1 \n"
|
|
||||||
|
|
||||||
"subs %w4, %w4, #16 \n" // 16 processed per loop.
|
|
||||||
RGBTOUV(v0.8h, v1.8h, v2.8h)
|
RGBTOUV(v0.8h, v1.8h, v2.8h)
|
||||||
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
|
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
|
||||||
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
|
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
|
||||||
"b.gt 1b \n"
|
"b.gt 1b \n"
|
||||||
: "+r"(src_argb1555), // %0
|
: "+r"(src_argb1555), // %0
|
||||||
"+r"(src_argb1555_1), // %1
|
"+r"(src_argb1555_1), // %1
|
||||||
"+r"(dst_u), // %2
|
"+r"(dst_u), // %2
|
||||||
@ -3425,28 +3413,33 @@ void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
|
|||||||
void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
|
void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
|
||||||
uint8_t* dst_y,
|
uint8_t* dst_y,
|
||||||
int width) {
|
int width) {
|
||||||
asm volatile (
|
asm volatile(
|
||||||
"movi v4.8b, #25 \n" // B * 0.1016 coefficient
|
"movi v4.16b, #25 \n" // B * 0.1016 coefficient
|
||||||
"movi v5.8b, #129 \n" // G * 0.5078 coefficient
|
"movi v5.16b, #129 \n" // G * 0.5078 coefficient
|
||||||
"movi v6.8b, #66 \n" // R * 0.2578 coefficient
|
"movi v6.16b, #66 \n" // R * 0.2578 coefficient
|
||||||
"movi v7.8b, #16 \n" // Add 16 constant
|
"movi v7.16b, #16 \n" // Add 16 constant
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
|
"ldp q0, q3, [%0], #32 \n" // load 16 ARGB1555 pixels.
|
||||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
"subs %w2, %w2, #16 \n" // 16 processed per loop.
|
||||||
RGB555TOARGB
|
RGB555TOARGB
|
||||||
"umull v3.8h, v0.8b, v4.8b \n" // B
|
"umull v16.8h, v0.8b, v4.8b \n" // B
|
||||||
"prfm pldl1keep, [%0, 448] \n"
|
"umull2 v17.8h, v0.16b, v4.16b \n" // B
|
||||||
"umlal v3.8h, v1.8b, v5.8b \n" // G
|
"prfm pldl1keep, [%0, 448] \n"
|
||||||
"umlal v3.8h, v2.8b, v6.8b \n" // R
|
"umlal v16.8h, v1.8b, v5.8b \n" // G
|
||||||
"uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
|
"umlal2 v17.8h, v1.16b, v5.16b \n" // G
|
||||||
"uqadd v0.8b, v0.8b, v7.8b \n"
|
"umlal v16.8h, v2.8b, v6.8b \n" // R
|
||||||
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
|
"umlal2 v17.8h, v2.16b, v6.16b \n" // R
|
||||||
"b.gt 1b \n"
|
"uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y
|
||||||
|
"uqrshrn2 v0.16b, v17.8h, #8 \n" // 16 bit to 8 bit Y
|
||||||
|
"uqadd v0.16b, v0.16b, v7.16b \n"
|
||||||
|
"str q0, [%1], #16 \n" // store pixels Y.
|
||||||
|
"b.gt 1b \n"
|
||||||
: "+r"(src_argb1555), // %0
|
: "+r"(src_argb1555), // %0
|
||||||
"+r"(dst_y), // %1
|
"+r"(dst_y), // %1
|
||||||
"+r"(width) // %2
|
"+r"(width) // %2
|
||||||
:
|
:
|
||||||
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v29");
|
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
|
||||||
|
"v17", "v29");
|
||||||
}
|
}
|
||||||
|
|
||||||
void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
|
void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user