mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-06 16:56:55 +08:00
[AArch64] Optimize Merge{ARGB,XRGB}16To8Row_NEON
Rather than shifting the data into the low half of each lane and then
using a saturating narrowing operation, we can do the saturation as part
of a shift into the highest half of the lane and then use a simpler TRN2
instruction to extract pairs of high halves into full vectors. This also
has the nice advantage of allowing us to use ST2 rather than ST4 for
storing the result, since ST4 is known to be slow on some
micro-architectures.
Reduction in runtimes observed for the two kernels:
            | MergeARGB16To8Row_NEON | MergeXRGB16To8Row_NEON
Cortex-A55  |                  -8.0% |                 -12.2%
Cortex-A510 |                 -29.9% |                 -31.4%
Cortex-A76  |                 -29.0% |                 -32.0%
Cortex-X2   |                 -33.5% |                 -43.4%
Bug: libyuv:976
Change-Id: I9da3beedc27ab43527b3642aa6d4decf3b5b6683
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5509198
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Justin Green <greenjustin@google.com>
This commit is contained in:
parent
4f7fd808b7
commit
d0c28db56c
@ -1293,33 +1293,31 @@ void MergeARGB16To8Row_NEON(const uint16_t* src_r,
|
|||||||
uint8_t* dst_argb,
|
uint8_t* dst_argb,
|
||||||
int depth,
|
int depth,
|
||||||
int width) {
|
int width) {
|
||||||
int shift = 8 - depth;
|
// Shift is 8 - depth, +8 so the result is in the top half of each lane.
|
||||||
|
int shift = 16 - depth;
|
||||||
asm volatile(
|
asm volatile(
|
||||||
|
|
||||||
"dup v31.8h, %w6 \n"
|
"dup v31.8h, %w6 \n"
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"ldr q2, [%0], #16 \n" // R
|
"ldr q0, [%0], #16 \n" // B
|
||||||
"ldr q1, [%1], #16 \n" // G
|
"ldr q1, [%1], #16 \n" // G
|
||||||
"ldr q0, [%2], #16 \n" // B
|
"ldr q2, [%2], #16 \n" // R
|
||||||
"ldr q3, [%3], #16 \n" // A
|
"ldr q3, [%3], #16 \n" // A
|
||||||
"ushl v2.8h, v2.8h, v31.8h \n"
|
"uqshl v0.8h, v0.8h, v31.8h \n"
|
||||||
"prfm pldl1keep, [%0, 448] \n"
|
"prfm pldl1keep, [%0, 448] \n"
|
||||||
"ushl v1.8h, v1.8h, v31.8h \n"
|
"uqshl v1.8h, v1.8h, v31.8h \n"
|
||||||
"prfm pldl1keep, [%1, 448] \n"
|
"prfm pldl1keep, [%1, 448] \n"
|
||||||
"ushl v0.8h, v0.8h, v31.8h \n"
|
"uqshl v2.8h, v2.8h, v31.8h \n"
|
||||||
"prfm pldl1keep, [%2, 448] \n"
|
"prfm pldl1keep, [%2, 448] \n"
|
||||||
"ushl v3.8h, v3.8h, v31.8h \n"
|
"uqshl v3.8h, v3.8h, v31.8h \n"
|
||||||
"prfm pldl1keep, [%3, 448] \n"
|
"prfm pldl1keep, [%3, 448] \n"
|
||||||
"uqxtn v2.8b, v2.8h \n"
|
"trn2 v0.16b, v0.16b, v1.16b \n"
|
||||||
"uqxtn v1.8b, v1.8h \n"
|
"trn2 v1.16b, v2.16b, v3.16b \n"
|
||||||
"uqxtn v0.8b, v0.8h \n"
|
|
||||||
"uqxtn v3.8b, v3.8h \n"
|
|
||||||
"subs %w5, %w5, #8 \n"
|
"subs %w5, %w5, #8 \n"
|
||||||
"st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%4], #32 \n"
|
"st2 {v0.8h, v1.8h}, [%4], #32 \n"
|
||||||
"b.gt 1b \n"
|
"b.gt 1b \n"
|
||||||
: "+r"(src_r), // %0
|
: "+r"(src_b), // %0
|
||||||
"+r"(src_g), // %1
|
"+r"(src_g), // %1
|
||||||
"+r"(src_b), // %2
|
"+r"(src_r), // %2
|
||||||
"+r"(src_a), // %3
|
"+r"(src_a), // %3
|
||||||
"+r"(dst_argb), // %4
|
"+r"(dst_argb), // %4
|
||||||
"+r"(width) // %5
|
"+r"(width) // %5
|
||||||
@ -1333,30 +1331,29 @@ void MergeXRGB16To8Row_NEON(const uint16_t* src_r,
|
|||||||
uint8_t* dst_argb,
|
uint8_t* dst_argb,
|
||||||
int depth,
|
int depth,
|
||||||
int width) {
|
int width) {
|
||||||
int shift = 8 - depth;
|
// Shift is 8 - depth, +8 so the result is in the top half of each lane.
|
||||||
|
int shift = 16 - depth;
|
||||||
asm volatile(
|
asm volatile(
|
||||||
|
|
||||||
"dup v31.8h, %w5 \n"
|
"dup v31.8h, %w5 \n"
|
||||||
"movi v3.8b, #0xff \n" // A (0xff)
|
"movi v3.16b, #0xff \n" // A (0xff)
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"ldr q2, [%0], #16 \n" // R
|
"ldr q0, [%0], #16 \n" // B
|
||||||
"ldr q1, [%1], #16 \n" // G
|
"ldr q1, [%1], #16 \n" // G
|
||||||
"ldr q0, [%2], #16 \n" // B
|
"ldr q2, [%2], #16 \n" // R
|
||||||
"ushl v2.8h, v2.8h, v31.8h \n"
|
"uqshl v0.8h, v0.8h, v31.8h \n"
|
||||||
"prfm pldl1keep, [%0, 448] \n"
|
"prfm pldl1keep, [%0, 448] \n"
|
||||||
"ushl v1.8h, v1.8h, v31.8h \n"
|
"uqshl v1.8h, v1.8h, v31.8h \n"
|
||||||
"prfm pldl1keep, [%1, 448] \n"
|
"prfm pldl1keep, [%1, 448] \n"
|
||||||
"ushl v0.8h, v0.8h, v31.8h \n"
|
"uqshl v2.8h, v2.8h, v31.8h \n"
|
||||||
"prfm pldl1keep, [%2, 448] \n"
|
"prfm pldl1keep, [%2, 448] \n"
|
||||||
"uqxtn v2.8b, v2.8h \n"
|
"trn2 v0.16b, v0.16b, v1.16b \n"
|
||||||
"uqxtn v1.8b, v1.8h \n"
|
"trn2 v1.16b, v2.16b, v3.16b \n"
|
||||||
"uqxtn v0.8b, v0.8h \n"
|
|
||||||
"subs %w4, %w4, #8 \n"
|
"subs %w4, %w4, #8 \n"
|
||||||
"st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%3], #32 \n"
|
"st2 {v0.8h, v1.8h}, [%3], #32 \n"
|
||||||
"b.gt 1b \n"
|
"b.gt 1b \n"
|
||||||
: "+r"(src_r), // %0
|
: "+r"(src_b), // %0
|
||||||
"+r"(src_g), // %1
|
"+r"(src_g), // %1
|
||||||
"+r"(src_b), // %2
|
"+r"(src_r), // %2
|
||||||
"+r"(dst_argb), // %3
|
"+r"(dst_argb), // %3
|
||||||
"+r"(width) // %4
|
"+r"(width) // %4
|
||||||
: "r"(shift) // %5
|
: "r"(shift) // %5
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user