mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-06 08:46:47 +08:00
[AArch64] Unroll to use full vectors in ARGBToARGB1555Row_NEON
By loading packed 16-bit AR/GB data and operating on that directly, we avoid the need to perform a separate widening step before the conversion.

Reduction in runtime observed compared to the existing Neon code:

Cortex-A55: -13.2%
Cortex-A510: -5.4%
Cortex-A76: -21.5%
Cortex-A720: -25.2%
Cortex-X1: -50.6%
Cortex-X2: -36.8%

Bug: b/42280945
Change-Id: I780c71fdff1d017464c6e4e38f86979dda0e43ad
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5790973
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Justin Green <greenjustin@google.com>
This commit is contained in:
parent
432d186116
commit
2dfb84b311
@ -786,14 +786,12 @@ void I422ToRGB565Row_NEON(const uint8_t* src_y,
|
||||
|
||||
#define ARGBTOARGB1555 \
|
||||
/* Inputs: \
|
||||
* v16: bbbbbxxx v17: gggggxxx v18: rrrrrxxx v19: axxxxxxx */ \
|
||||
"shll v0.8h, v19.8b, #8 \n" /* axxxxxxx00000000 */ \
|
||||
"shll v18.8h, v18.8b, #8 \n" /* rrrrrxxx00000000 */ \
|
||||
"shll v17.8h, v17.8b, #8 \n" /* gggggxxx00000000 */ \
|
||||
"shll v16.8h, v16.8b, #8 \n" /* bbbbbxxx00000000 */ \
|
||||
"sri v0.8h, v18.8h, #1 \n" /* arrrrrxxx0000000 */ \
|
||||
"sri v0.8h, v17.8h, #6 \n" /* arrrrrgggggxxx00 */ \
|
||||
"sri v0.8h, v16.8h, #11 \n" /* arrrrrgggggbbbbb */
|
||||
* v16: gggggxxxbbbbbxxx v17: axxxxxxxrrrrrxxx */ \
|
||||
"shl v1.8h, v16.8h, #8 \n" /* bbbbbxxx00000000 */ \
|
||||
"shl v2.8h, v17.8h, #8 \n" /* rrrrrxxx00000000 */ \
|
||||
"sri v17.8h, v2.8h, #1 \n" /* arrrrrxxx0000000 */ \
|
||||
"sri v17.8h, v16.8h, #6 \n" /* arrrrrgggggxxxbb */ \
|
||||
"sri v17.8h, v1.8h, #11 \n" /* arrrrrgggggbbbbb */
|
||||
|
||||
#define ARGBTOARGB1555_FROM_TOP \
|
||||
/* Inputs: \
|
||||
@ -2517,19 +2515,18 @@ void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
|
||||
void ARGBToARGB1555Row_NEON(const uint8_t* src_argb,
|
||||
uint8_t* dst_argb1555,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8
|
||||
// pixels
|
||||
"ld2 {v16.8h,v17.8h}, [%0], #32 \n" // load 8 pixels
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
"prfm pldl1keep, [%0, 448] \n" ARGBTOARGB1555
|
||||
"st1 {v0.16b}, [%1], #16 \n" // store 8 pixels
|
||||
"st1 {v17.16b}, [%1], #16 \n" // store 8 pixels
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_argb1555), // %1
|
||||
"+r"(width) // %2
|
||||
:
|
||||
: "cc", "memory", "v0", "v16", "v17", "v18", "v19");
|
||||
: "cc", "memory", "v1", "v2", "v16", "v17");
|
||||
}
|
||||
|
||||
void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user