mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-06 16:56:55 +08:00
[AArch64] Unroll ScaleRowDown4Box_NEON
We can use wider load/store instructions and avoid wasting half of the ADDP/RSHRN vector data. The duplicated UADDLP and UADALP instructions also provide a good improvement on little cores due to their limited out-of-order capability.

The mask in the "any" kernel definition is already set up to handle an unrolling of eight, so no change to scale_any.cc is needed.

Reduction in runtimes observed compared to the existing Neon implementation:

Cortex-A55: -19.5%
Cortex-A520: -38.3%
Cortex-A76: -36.0%
Cortex-A715: -18.1%
Cortex-A720: -17.9%
Cortex-X1: -25.4%
Cortex-X2: -18.5%
Cortex-X3: -8.2%
Cortex-X4: -3.8%

Bug: b/42280945
Change-Id: Iebba5da4db5e25af4b9fa5651c7396364dedffba
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5725172
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
parent
dc392094fc
commit
8f039f639c
@ -122,24 +122,28 @@ void ScaleRowDown4Box_NEON(const uint8_t* src_ptr,
|
|||||||
const uint8_t* src_ptr1 = src_ptr + src_stride;
|
const uint8_t* src_ptr1 = src_ptr + src_stride;
|
||||||
const uint8_t* src_ptr2 = src_ptr + src_stride * 2;
|
const uint8_t* src_ptr2 = src_ptr + src_stride * 2;
|
||||||
const uint8_t* src_ptr3 = src_ptr + src_stride * 3;
|
const uint8_t* src_ptr3 = src_ptr + src_stride * 3;
|
||||||
asm volatile (
|
asm volatile(
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"ld1 {v0.16b}, [%0], #16 \n" // load up 16x4
|
"ldp q0, q4, [%0], #32 \n" // load up 16x8
|
||||||
"ld1 {v1.16b}, [%2], #16 \n"
|
"ldp q1, q5, [%2], #32 \n"
|
||||||
"ld1 {v2.16b}, [%3], #16 \n"
|
"ldp q2, q6, [%3], #32 \n"
|
||||||
"ld1 {v3.16b}, [%4], #16 \n"
|
"ldp q3, q7, [%4], #32 \n"
|
||||||
"subs %w5, %w5, #4 \n"
|
"subs %w5, %w5, #8 \n"
|
||||||
"uaddlp v0.8h, v0.16b \n"
|
"uaddlp v0.8h, v0.16b \n"
|
||||||
|
"uaddlp v4.8h, v4.16b \n"
|
||||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||||
"uadalp v0.8h, v1.16b \n"
|
"uadalp v0.8h, v1.16b \n"
|
||||||
|
"uadalp v4.8h, v5.16b \n"
|
||||||
"prfm pldl1keep, [%2, 448] \n"
|
"prfm pldl1keep, [%2, 448] \n"
|
||||||
"uadalp v0.8h, v2.16b \n"
|
"uadalp v0.8h, v2.16b \n"
|
||||||
|
"uadalp v4.8h, v6.16b \n"
|
||||||
"prfm pldl1keep, [%3, 448] \n"
|
"prfm pldl1keep, [%3, 448] \n"
|
||||||
"uadalp v0.8h, v3.16b \n"
|
"uadalp v0.8h, v3.16b \n"
|
||||||
|
"uadalp v4.8h, v7.16b \n"
|
||||||
"prfm pldl1keep, [%4, 448] \n"
|
"prfm pldl1keep, [%4, 448] \n"
|
||||||
"addp v0.8h, v0.8h, v0.8h \n"
|
"addp v0.8h, v0.8h, v4.8h \n"
|
||||||
"rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding
|
"rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding
|
||||||
"st1 {v0.s}[0], [%1], #4 \n"
|
"str d0, [%1], #8 \n"
|
||||||
"b.gt 1b \n"
|
"b.gt 1b \n"
|
||||||
: "+r"(src_ptr), // %0
|
: "+r"(src_ptr), // %0
|
||||||
"+r"(dst_ptr), // %1
|
"+r"(dst_ptr), // %1
|
||||||
@ -148,7 +152,7 @@ void ScaleRowDown4Box_NEON(const uint8_t* src_ptr,
|
|||||||
"+r"(src_ptr3), // %4
|
"+r"(src_ptr3), // %4
|
||||||
"+r"(dst_width) // %5
|
"+r"(dst_width) // %5
|
||||||
:
|
:
|
||||||
: "memory", "cc", "v0", "v1", "v2", "v3");
|
: "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
|
||||||
}
|
}
|
||||||
|
|
||||||
// Down scale from 4 to 3 pixels. Use the neon multilane read/write
|
// Down scale from 4 to 3 pixels. Use the neon multilane read/write
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user