mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-06 08:46:47 +08:00
[AArch64] Avoid unnecessary MOVs in ScaleARGBRowDownEvenBox_NEON
The existing code uses three MOV instructions through a temporary register to swap the low and high halves of a vector register; however, this can be done with a pair of ZIP instructions instead.

Also use a pair of RSHRN rather than RSHRN2 to allow these to execute in parallel on little cores.

Reduction in runtime observed compared to the existing Neon implementation:

Cortex-A55: -8.3%
Cortex-A510: -20.6%
Cortex-A520: -16.6%
Cortex-A76: -6.8%
Cortex-A715: -6.2%
Cortex-A720: -6.2%
Cortex-X1: -22.0%
Cortex-X2: -18.7%
Cortex-X3: -21.1%
Cortex-X4: -25.8%
Cortex-X925: -21.9%

Change-Id: I87ae133be86c3c9f850d5848ec19d9b71ebda4d9
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5872801
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
parent
23a6a412e5
commit
7eb552c891
@ -1284,7 +1284,7 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb,
|
|||||||
int src_stepx,
|
int src_stepx,
|
||||||
uint8_t* dst_argb,
|
uint8_t* dst_argb,
|
||||||
int dst_width) {
|
int dst_width) {
|
||||||
asm volatile (
|
asm volatile(
|
||||||
"add %1, %1, %0 \n"
|
"add %1, %1, %0 \n"
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 -> 2x1
|
"ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 -> 2x1
|
||||||
@ -1300,26 +1300,24 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb,
|
|||||||
"uaddl v4.8h, v4.8b, v5.8b \n"
|
"uaddl v4.8h, v4.8b, v5.8b \n"
|
||||||
"uaddl v6.8h, v6.8b, v7.8b \n"
|
"uaddl v6.8h, v6.8b, v7.8b \n"
|
||||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||||
"mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd
|
"zip1 v1.2d, v0.2d, v2.2d \n"
|
||||||
"mov v0.d[1], v2.d[0] \n"
|
"zip2 v2.2d, v0.2d, v2.2d \n"
|
||||||
"mov v2.d[0], v16.d[1] \n"
|
"zip1 v5.2d, v4.2d, v6.2d \n"
|
||||||
"mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh
|
"zip2 v6.2d, v4.2d, v6.2d \n"
|
||||||
"mov v4.d[1], v6.d[0] \n"
|
|
||||||
"mov v6.d[0], v16.d[1] \n"
|
|
||||||
"prfm pldl1keep, [%1, 448] \n"
|
"prfm pldl1keep, [%1, 448] \n"
|
||||||
"add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d)
|
"add v0.8h, v1.8h, v2.8h \n" // (a+b)_(c+d)
|
||||||
"add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h)
|
"add v4.8h, v5.8h, v6.8h \n" // (e+f)_(g+h)
|
||||||
"rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels.
|
"rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels.
|
||||||
"rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels.
|
"rshrn v1.8b, v4.8h, #2 \n" // next 2 pixels.
|
||||||
"subs %w3, %w3, #4 \n" // 4 pixels per loop.
|
"subs %w3, %w3, #4 \n" // 4 pixels per loop.
|
||||||
"st1 {v0.16b}, [%2], #16 \n"
|
"stp d0, d1, [%2], #16 \n"
|
||||||
"b.gt 1b \n"
|
"b.gt 1b \n"
|
||||||
: "+r"(src_argb), // %0
|
: "+r"(src_argb), // %0
|
||||||
"+r"(src_stride), // %1
|
"+r"(src_stride), // %1
|
||||||
"+r"(dst_argb), // %2
|
"+r"(dst_argb), // %2
|
||||||
"+r"(dst_width) // %3
|
"+r"(dst_width) // %3
|
||||||
: "r"((int64_t)(src_stepx * 4)) // %4
|
: "r"((int64_t)(src_stepx * 4)) // %4
|
||||||
: "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
|
: "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO(Yang Zhang): Investigate less load instructions for
|
// TODO(Yang Zhang): Investigate less load instructions for
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user