mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-06 16:56:55 +08:00
[AArch64] Use full Neon vectors in RGB565To{ARGB,UV,Y}Row_NEON
The existing code uses only half of the vector lanes in the
RGB565TOARGB macro. In the RGB565To{ARGB,Y} kernels we can load more
data per iteration to make use of full vectors, adjusting the "any"
kernel macros to match. The RGB565ToUVRow kernel already loads plenty
of data but currently invokes the macro twice as often as needed, so
refactor it to invoke the macro only once per source row, operating on
full vectors instead.
Reduction in runtimes observed for selected micro-architectures:
| Micro-architecture | RGB565ToARGBRow | RGB565ToUVRow | RGB565ToYRow |
|--------------------|-----------------|---------------|--------------|
| Cortex-A53         | -35.2%          | -28.8%        | -31.1%       |
| Cortex-A55         | -32.5%          | -34.4%        | -42.9%       |
| Cortex-A510        | -21.6%          | -27.7%        | -47.2%       |
| Cortex-A76         | -0.9%           | -42.0%        | -21.4%       |
| Cortex-A720        | -28.6%          | -37.2%        | -26.1%       |
| Cortex-X1          | -3.2%           | -42.3%        | -23.4%       |
Bug: b/42280945
Change-Id: Ib1f68e5b87cc05a1485bbe96cfef87e6ac119fc3
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5790974
Reviewed-by: Justin Green <greenjustin@google.com>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
parent
2dfb84b311
commit
772f0fde1c
@ -3565,11 +3565,9 @@ int RGB565ToI420(const uint8_t* src_rgb565,
|
|||||||
if (TestCpuFlag(kCpuHasNEON)) {
|
if (TestCpuFlag(kCpuHasNEON)) {
|
||||||
RGB565ToUVRow = RGB565ToUVRow_Any_NEON;
|
RGB565ToUVRow = RGB565ToUVRow_Any_NEON;
|
||||||
RGB565ToYRow = RGB565ToYRow_Any_NEON;
|
RGB565ToYRow = RGB565ToYRow_Any_NEON;
|
||||||
if (IS_ALIGNED(width, 8)) {
|
if (IS_ALIGNED(width, 16)) {
|
||||||
RGB565ToYRow = RGB565ToYRow_NEON;
|
RGB565ToYRow = RGB565ToYRow_NEON;
|
||||||
if (IS_ALIGNED(width, 16)) {
|
RGB565ToUVRow = RGB565ToUVRow_NEON;
|
||||||
RGB565ToUVRow = RGB565ToUVRow_NEON;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// MSA version does direct RGB565 to YUV.
|
// MSA version does direct RGB565 to YUV.
|
||||||
|
|||||||
@ -3709,7 +3709,7 @@ int RGB565ToARGB(const uint8_t* src_rgb565,
|
|||||||
#if defined(HAS_RGB565TOARGBROW_NEON)
|
#if defined(HAS_RGB565TOARGBROW_NEON)
|
||||||
if (TestCpuFlag(kCpuHasNEON)) {
|
if (TestCpuFlag(kCpuHasNEON)) {
|
||||||
RGB565ToARGBRow = RGB565ToARGBRow_Any_NEON;
|
RGB565ToARGBRow = RGB565ToARGBRow_Any_NEON;
|
||||||
if (IS_ALIGNED(width, 8)) {
|
if (IS_ALIGNED(width, 16)) {
|
||||||
RGB565ToARGBRow = RGB565ToARGBRow_NEON;
|
RGB565ToARGBRow = RGB565ToARGBRow_NEON;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1283,7 +1283,7 @@ ANY11(RAWToYJRow_Any_LSX, RAWToYJRow_LSX, 0, 3, 1, 15)
|
|||||||
ANY11(RAWToYJRow_Any_LASX, RAWToYJRow_LASX, 0, 3, 1, 31)
|
ANY11(RAWToYJRow_Any_LASX, RAWToYJRow_LASX, 0, 3, 1, 31)
|
||||||
#endif
|
#endif
|
||||||
#ifdef HAS_RGB565TOYROW_NEON
|
#ifdef HAS_RGB565TOYROW_NEON
|
||||||
ANY11(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 0, 2, 1, 7)
|
ANY11(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 0, 2, 1, 15)
|
||||||
#endif
|
#endif
|
||||||
#ifdef HAS_RGB565TOYROW_MSA
|
#ifdef HAS_RGB565TOYROW_MSA
|
||||||
ANY11(RGB565ToYRow_Any_MSA, RGB565ToYRow_MSA, 0, 2, 1, 15)
|
ANY11(RGB565ToYRow_Any_MSA, RGB565ToYRow_MSA, 0, 2, 1, 15)
|
||||||
@ -1381,7 +1381,7 @@ ANY11(RAWToARGBRow_Any_LSX, RAWToARGBRow_LSX, 0, 3, 4, 15)
|
|||||||
ANY11(RAWToARGBRow_Any_LASX, RAWToARGBRow_LASX, 0, 3, 4, 31)
|
ANY11(RAWToARGBRow_Any_LASX, RAWToARGBRow_LASX, 0, 3, 4, 31)
|
||||||
#endif
|
#endif
|
||||||
#ifdef HAS_RGB565TOARGBROW_NEON
|
#ifdef HAS_RGB565TOARGBROW_NEON
|
||||||
ANY11(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 0, 2, 4, 7)
|
ANY11(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 0, 2, 4, 15)
|
||||||
#endif
|
#endif
|
||||||
#ifdef HAS_RGB565TOARGBROW_MSA
|
#ifdef HAS_RGB565TOARGBROW_MSA
|
||||||
ANY11(RGB565ToARGBRow_Any_MSA, RGB565ToARGBRow_MSA, 0, 2, 4, 15)
|
ANY11(RGB565ToARGBRow_Any_MSA, RGB565ToARGBRow_MSA, 0, 2, 4, 15)
|
||||||
|
|||||||
@ -2053,26 +2053,28 @@ void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#define RGB565TOARGB \
|
#define RGB565TOARGB \
|
||||||
/* Input: v0.8h: RRRRRGGGGGGBBBBB */ \
|
/* Input: v0/v4.8h: RRRRRGGGGGGBBBBB */ \
|
||||||
"shrn v1.8b, v0.8h, #3 \n" /* G GGGGGGxx */ \
|
"shrn v1.8b, v0.8h, #3 \n" /* G GGGGGGxx */ \
|
||||||
"shrn v2.8b, v0.8h, #8 \n" /* R RRRRRxxx */ \
|
"shrn2 v1.16b, v4.8h, #3 \n" /* G GGGGGGxx */ \
|
||||||
"xtn v0.8b, v0.8h \n" /* B xxxBBBBB */ \
|
"uzp2 v2.16b, v0.16b, v4.16b \n" /* R RRRRRxxx */ \
|
||||||
"sri v1.8b, v1.8b, #6 \n" /* G GGGGGGGG, fill 2 */ \
|
"uzp1 v0.16b, v0.16b, v4.16b \n" /* B xxxBBBBB */ \
|
||||||
"shl v0.8b, v0.8b, #3 \n" /* B BBBBB000 */ \
|
"sri v1.16b, v1.16b, #6 \n" /* G GGGGGGGG, fill 2 */ \
|
||||||
"sri v2.8b, v2.8b, #5 \n" /* R RRRRRRRR, fill 3 */ \
|
"shl v0.16b, v0.16b, #3 \n" /* B BBBBB000 */ \
|
||||||
"sri v0.8b, v0.8b, #5 \n" /* R BBBBBBBB, fill 3 */
|
"sri v2.16b, v2.16b, #5 \n" /* R RRRRRRRR, fill 3 */ \
|
||||||
|
"sri v0.16b, v0.16b, #5 \n" /* R BBBBBBBB, fill 3 */
|
||||||
|
|
||||||
void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
|
void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
|
||||||
uint8_t* dst_argb,
|
uint8_t* dst_argb,
|
||||||
int width) {
|
int width) {
|
||||||
asm volatile (
|
asm volatile(
|
||||||
"movi v3.8b, #255 \n" // Alpha
|
"movi v3.16b, #255 \n" // Alpha
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
|
"ldp q0, q4, [%0], #32 \n" // load 16 RGB565 pixels
|
||||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
"subs %w2, %w2, #16 \n" // 16 processed per loop
|
||||||
"prfm pldl1keep, [%0, 448] \n" RGB565TOARGB
|
"prfm pldl1keep, [%0, 448] \n" RGB565TOARGB
|
||||||
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
|
"st4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%1] \n" // store 16 ARGB
|
||||||
"b.gt 1b \n"
|
"add %1, %1, #64 \n"
|
||||||
|
"b.gt 1b \n"
|
||||||
: "+r"(src_rgb565), // %0
|
: "+r"(src_rgb565), // %0
|
||||||
"+r"(dst_argb), // %1
|
"+r"(dst_argb), // %1
|
||||||
"+r"(width) // %2
|
"+r"(width) // %2
|
||||||
@ -3281,46 +3283,32 @@ void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
|
|||||||
uint8_t* dst_v,
|
uint8_t* dst_v,
|
||||||
int width) {
|
int width) {
|
||||||
const uint8_t* src_rgb565_1 = src_rgb565 + src_stride_rgb565;
|
const uint8_t* src_rgb565_1 = src_rgb565 + src_stride_rgb565;
|
||||||
asm volatile (
|
asm volatile(
|
||||||
RGBTOUV_SETUP_REG
|
RGBTOUV_SETUP_REG
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
|
"ldp q0, q4, [%0], #32 \n" // load 16 RGB565 pixels.
|
||||||
RGB565TOARGB
|
RGB565TOARGB
|
||||||
"uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
|
"uaddlp v16.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
|
||||||
"prfm pldl1keep, [%0, 448] \n"
|
"prfm pldl1keep, [%0, 448] \n"
|
||||||
"uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
|
"uaddlp v17.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
|
||||||
"uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
|
"uaddlp v18.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
|
||||||
"ld1 {v0.16b}, [%0], #16 \n" // next 8 RGB565 pixels.
|
|
||||||
|
"ldp q0, q4, [%1], #32 \n" // load 16 RGB565 pixels.
|
||||||
RGB565TOARGB
|
RGB565TOARGB
|
||||||
"uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
|
"uadalp v16.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
|
||||||
"uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
|
"prfm pldl1keep, [%1, 448] \n"
|
||||||
"uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
|
"uadalp v17.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
|
||||||
|
"uadalp v18.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
|
||||||
|
|
||||||
"ld1 {v0.16b}, [%1], #16 \n" // load 8 RGB565 pixels.
|
"urshr v0.8h, v16.8h, #1 \n" // 2x average
|
||||||
RGB565TOARGB
|
"urshr v1.8h, v17.8h, #1 \n"
|
||||||
"uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
|
"urshr v2.8h, v18.8h, #1 \n"
|
||||||
"prfm pldl1keep, [%1, 448] \n"
|
|
||||||
"uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
|
|
||||||
"uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
|
|
||||||
"ld1 {v0.16b}, [%1], #16 \n" // next 8 RGB565 pixels.
|
|
||||||
RGB565TOARGB
|
|
||||||
"uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
|
|
||||||
"uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
|
|
||||||
"uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
|
|
||||||
|
|
||||||
"ins v16.D[1], v26.D[0] \n"
|
"subs %w4, %w4, #16 \n" // 16 processed per loop.
|
||||||
"ins v17.D[1], v27.D[0] \n"
|
|
||||||
"ins v18.D[1], v28.D[0] \n"
|
|
||||||
|
|
||||||
"urshr v0.8h, v16.8h, #1 \n" // 2x average
|
|
||||||
"urshr v1.8h, v17.8h, #1 \n"
|
|
||||||
"urshr v2.8h, v18.8h, #1 \n"
|
|
||||||
|
|
||||||
"subs %w4, %w4, #16 \n" // 16 processed per loop.
|
|
||||||
RGBTOUV(v0.8h, v1.8h, v2.8h)
|
RGBTOUV(v0.8h, v1.8h, v2.8h)
|
||||||
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
|
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
|
||||||
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
|
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
|
||||||
"b.gt 1b \n"
|
"b.gt 1b \n"
|
||||||
: "+r"(src_rgb565), // %0
|
: "+r"(src_rgb565), // %0
|
||||||
"+r"(src_rgb565_1), // %1
|
"+r"(src_rgb565_1), // %1
|
||||||
"+r"(dst_u), // %2
|
"+r"(dst_u), // %2
|
||||||
@ -3423,22 +3411,27 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
|
|||||||
}
|
}
|
||||||
|
|
||||||
void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
|
void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
|
||||||
asm volatile (
|
asm volatile(
|
||||||
"movi v24.8b, #25 \n" // B * 0.1016 coefficient
|
"movi v24.16b, #25 \n" // B * 0.1016 coefficient
|
||||||
"movi v25.8b, #129 \n" // G * 0.5078 coefficient
|
"movi v25.16b, #129 \n" // G * 0.5078 coefficient
|
||||||
"movi v26.8b, #66 \n" // R * 0.2578 coefficient
|
"movi v26.16b, #66 \n" // R * 0.2578 coefficient
|
||||||
"movi v27.8b, #16 \n" // Add 16 constant
|
"movi v27.16b, #16 \n" // Add 16 constant
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
|
"ldp q0, q4, [%0], #32 \n" // load 16 RGB565 pixels.
|
||||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
"subs %w2, %w2, #16 \n" // 16 processed per loop.
|
||||||
RGB565TOARGB
|
RGB565TOARGB
|
||||||
"umull v3.8h, v0.8b, v24.8b \n" // B
|
"umull v3.8h, v0.8b, v24.8b \n" // B
|
||||||
|
"umull2 v4.8h, v0.16b, v24.16b \n" // B
|
||||||
"prfm pldl1keep, [%0, 448] \n"
|
"prfm pldl1keep, [%0, 448] \n"
|
||||||
"umlal v3.8h, v1.8b, v25.8b \n" // G
|
"umlal v3.8h, v1.8b, v25.8b \n" // G
|
||||||
|
"umlal2 v4.8h, v1.16b, v25.16b \n" // G
|
||||||
"umlal v3.8h, v2.8b, v26.8b \n" // R
|
"umlal v3.8h, v2.8b, v26.8b \n" // R
|
||||||
|
"umlal2 v4.8h, v2.16b, v26.16b \n" // R
|
||||||
"uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
|
"uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
|
||||||
|
"uqrshrn v1.8b, v4.8h, #8 \n" // 16 bit to 8 bit Y
|
||||||
"uqadd v0.8b, v0.8b, v27.8b \n"
|
"uqadd v0.8b, v0.8b, v27.8b \n"
|
||||||
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
|
"uqadd v1.8b, v1.8b, v27.8b \n"
|
||||||
|
"stp d0, d1, [%1], #16 \n" // store 8 pixels Y.
|
||||||
"b.gt 1b \n"
|
"b.gt 1b \n"
|
||||||
: "+r"(src_rgb565), // %0
|
: "+r"(src_rgb565), // %0
|
||||||
"+r"(dst_y), // %1
|
"+r"(dst_y), // %1
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user