mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-07 17:26:49 +08:00
[AArch64] Use full vectors in ARGB4444To{Y,UV}Row_NEON
The existing ARGB4444TORGB macro only makes use of 64 bit wide vectors
rather than the full 128 bits available, so unroll it to allow us to
process more data per instruction.
For ARGB4444ToUVRow_NEON we already have enough data available each
iteration to make use of full vectors, but for ARGB4444ToYRow_NEON we
also need to adjust the "any" kernel to allow us to process 16 elements
per iteration.
Reduction in runtimes observed compared to the existing Neon kernels:
| Core | ARGB4444ToUVRow | ARGB4444ToYRow |
| --- | --- | --- |
| Cortex-A55 | -27.8% | -34.6% |
| Cortex-A510 | -37.0% | -44.4% |
| Cortex-A76 | -40.2% | -22.0% |
| Cortex-A720 | -33.4% | -35.5% |
| Cortex-X1 | -34.1% | -19.7% |
| Cortex-X2 | -32.1% | -26.3% |
Bug: libyuv:976
Change-Id: I08f6286bab0ebf5e24d5d5803f8c45ec6ba776ee
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5631541
Reviewed-by: Justin Green <greenjustin@google.com>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
parent
5bac99fe09
commit
c1fe5663f5
@ -1311,8 +1311,12 @@ ANY11(ARGB1555ToYRow_Any_LSX, ARGB1555ToYRow_LSX, 0, 2, 1, 15)
|
|||||||
ANY11(ARGB1555ToYRow_Any_LASX, ARGB1555ToYRow_LASX, 0, 2, 1, 31)
|
ANY11(ARGB1555ToYRow_Any_LASX, ARGB1555ToYRow_LASX, 0, 2, 1, 31)
|
||||||
#endif
|
#endif
|
||||||
#ifdef HAS_ARGB4444TOYROW_NEON
|
#ifdef HAS_ARGB4444TOYROW_NEON
|
||||||
|
#ifdef __aarch64__
|
||||||
|
ANY11(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 0, 2, 1, 15)
|
||||||
|
#else
|
||||||
ANY11(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 0, 2, 1, 7)
|
ANY11(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 0, 2, 1, 7)
|
||||||
#endif
|
#endif
|
||||||
|
#endif
|
||||||
#ifdef HAS_YUY2TOYROW_NEON
|
#ifdef HAS_YUY2TOYROW_NEON
|
||||||
ANY11(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 1, 4, 1, 15)
|
ANY11(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 1, 4, 1, 15)
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@ -2134,14 +2134,15 @@ void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
|
|||||||
"sri v1.16b, v1.16b, #4 \n" /* AAAAAAAA_GGGGGGGG */ \
|
"sri v1.16b, v1.16b, #4 \n" /* AAAAAAAA_GGGGGGGG */ \
|
||||||
"sri v0.16b, v0.16b, #4 \n" /* RRRRRRRR_BBBBBBBB */
|
"sri v0.16b, v0.16b, #4 \n" /* RRRRRRRR_BBBBBBBB */
|
||||||
|
|
||||||
#define ARGB4444TORGB \
|
#define ARGB4444TORGB \
|
||||||
/* Input: v0.8h = xxxxRRRRGGGGBBBB */ \
|
/* Input: v0.8h, v3.8h = xxxxRRRRGGGGBBBB */ \
|
||||||
"xtn v1.8b, v0.8h \n" /* GGGGBBBB */ \
|
"uzp1 v1.16b, v0.16b, v3.16b \n" /* GGGGBBBB */ \
|
||||||
"shrn v2.8b, v0.8h, #4 \n" /* RRRRxxxx */ \
|
"shrn v2.8b, v0.8h, #4 \n" /* RRRRxxxx */ \
|
||||||
"shl v0.8b, v1.8b, #4 \n" /* BBBB0000 */ \
|
"shl v0.16b, v1.16b, #4 \n" /* BBBB0000 */ \
|
||||||
"sri v1.8b, v1.8b, #4 \n" /* GGGGGGGG */ \
|
"shrn2 v2.16b, v3.8h, #4 \n" /* RRRRxxxx */ \
|
||||||
"sri v2.8b, v2.8b, #4 \n" /* RRRRRRRR */ \
|
"sri v1.16b, v1.16b, #4 \n" /* GGGGGGGG */ \
|
||||||
"sri v0.8b, v0.8b, #4 \n" /* BBBBBBBB */
|
"sri v2.16b, v2.16b, #4 \n" /* RRRRRRRR */ \
|
||||||
|
"sri v0.16b, v0.16b, #4 \n" /* BBBBBBBB */
|
||||||
|
|
||||||
void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
|
void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
|
||||||
uint8_t* dst_argb,
|
uint8_t* dst_argb,
|
||||||
@ -3331,46 +3332,32 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
|
|||||||
uint8_t* dst_v,
|
uint8_t* dst_v,
|
||||||
int width) {
|
int width) {
|
||||||
const uint8_t* src_argb4444_1 = src_argb4444 + src_stride_argb4444;
|
const uint8_t* src_argb4444_1 = src_argb4444 + src_stride_argb4444;
|
||||||
asm volatile (
|
asm volatile(
|
||||||
RGBTOUV_SETUP_REG // sets v20-v25
|
RGBTOUV_SETUP_REG // sets v20-v25
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
|
"ldp q0, q3, [%0], #32 \n" // load 16 ARGB4444 pixels.
|
||||||
ARGB4444TORGB
|
ARGB4444TORGB
|
||||||
"uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
|
"uaddlp v16.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
|
||||||
"prfm pldl1keep, [%0, 448] \n"
|
"prfm pldl1keep, [%0, 448] \n"
|
||||||
"uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
|
"uaddlp v17.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
|
||||||
"uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
|
"uaddlp v18.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
|
||||||
"ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB4444 pixels.
|
|
||||||
|
"ldp q0, q3, [%1], #32 \n" // load 16 ARGB4444 pixels.
|
||||||
ARGB4444TORGB
|
ARGB4444TORGB
|
||||||
"uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
|
"uadalp v16.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
|
||||||
"uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
|
"prfm pldl1keep, [%1, 448] \n"
|
||||||
"uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
|
"uadalp v17.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
|
||||||
|
"uadalp v18.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
|
||||||
|
|
||||||
"ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB4444 pixels.
|
"urshr v0.8h, v16.8h, #1 \n" // 2x average
|
||||||
ARGB4444TORGB
|
"urshr v1.8h, v17.8h, #1 \n"
|
||||||
"uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
|
"urshr v2.8h, v18.8h, #1 \n"
|
||||||
"prfm pldl1keep, [%1, 448] \n"
|
|
||||||
"uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
|
|
||||||
"uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
|
|
||||||
"ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB4444 pixels.
|
|
||||||
ARGB4444TORGB
|
|
||||||
"uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
|
|
||||||
"uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
|
|
||||||
"uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
|
|
||||||
|
|
||||||
"ins v16.D[1], v26.D[0] \n"
|
"subs %w4, %w4, #16 \n" // 16 processed per loop.
|
||||||
"ins v17.D[1], v27.D[0] \n"
|
|
||||||
"ins v18.D[1], v28.D[0] \n"
|
|
||||||
|
|
||||||
"urshr v0.8h, v16.8h, #1 \n" // 2x average
|
|
||||||
"urshr v1.8h, v17.8h, #1 \n"
|
|
||||||
"urshr v2.8h, v18.8h, #1 \n"
|
|
||||||
|
|
||||||
"subs %w4, %w4, #16 \n" // 16 processed per loop.
|
|
||||||
RGBTOUV(v0.8h, v1.8h, v2.8h)
|
RGBTOUV(v0.8h, v1.8h, v2.8h)
|
||||||
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
|
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
|
||||||
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
|
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
|
||||||
"b.gt 1b \n"
|
"b.gt 1b \n"
|
||||||
: "+r"(src_argb4444), // %0
|
: "+r"(src_argb4444), // %0
|
||||||
"+r"(src_argb4444_1), // %1
|
"+r"(src_argb4444_1), // %1
|
||||||
"+r"(dst_u), // %2
|
"+r"(dst_u), // %2
|
||||||
@ -3445,23 +3432,27 @@ void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
|
|||||||
void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
|
void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
|
||||||
uint8_t* dst_y,
|
uint8_t* dst_y,
|
||||||
int width) {
|
int width) {
|
||||||
asm volatile (
|
asm volatile(
|
||||||
"movi v24.8b, #25 \n" // B * 0.1016 coefficient
|
"movi v24.16b, #25 \n" // B * 0.1016 coefficient
|
||||||
"movi v25.8b, #129 \n" // G * 0.5078 coefficient
|
"movi v25.16b, #129 \n" // G * 0.5078 coefficient
|
||||||
"movi v26.8b, #66 \n" // R * 0.2578 coefficient
|
"movi v26.16b, #66 \n" // R * 0.2578 coefficient
|
||||||
"movi v27.8b, #16 \n" // Add 16 constant
|
"movi v27.16b, #16 \n" // Add 16 constant
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
|
"ldp q0, q3, [%0], #32 \n" // load 16 ARGB4444 pixels.
|
||||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
"subs %w2, %w2, #16 \n" // 16 processed per loop.
|
||||||
ARGB4444TORGB
|
ARGB4444TORGB
|
||||||
"umull v3.8h, v0.8b, v24.8b \n" // B
|
"umull v16.8h, v0.8b, v24.8b \n" // B
|
||||||
"prfm pldl1keep, [%0, 448] \n"
|
"umull2 v17.8h, v0.16b, v24.16b \n" // B
|
||||||
"umlal v3.8h, v1.8b, v25.8b \n" // G
|
"prfm pldl1keep, [%0, 448] \n"
|
||||||
"umlal v3.8h, v2.8b, v26.8b \n" // R
|
"umlal v16.8h, v1.8b, v25.8b \n" // G
|
||||||
"uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
|
"umlal2 v17.8h, v1.16b, v25.16b \n" // G
|
||||||
"uqadd v0.8b, v0.8b, v27.8b \n"
|
"umlal v16.8h, v2.8b, v26.8b \n" // R
|
||||||
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
|
"umlal2 v17.8h, v2.16b, v26.16b \n" // R
|
||||||
"b.gt 1b \n"
|
"uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y
|
||||||
|
"uqrshrn2 v0.16b, v17.8h, #8 \n" // 16 bit to 8 bit Y
|
||||||
|
"uqadd v0.16b, v0.16b, v27.16b \n"
|
||||||
|
"str q0, [%1], #16 \n" // store 16 pixels Y.
|
||||||
|
"b.gt 1b \n"
|
||||||
: "+r"(src_argb4444), // %0
|
: "+r"(src_argb4444), // %0
|
||||||
"+r"(dst_y), // %1
|
"+r"(dst_y), // %1
|
||||||
"+r"(width) // %2
|
"+r"(width) // %2
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user