mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-07 17:26:49 +08:00
[AArch64] Optimize ScaleARGBRowDown2Box_NEON
Use a pair of LD2s to load data interleaved and perform a couple of additions on the registers in order to avoid needing LD4 and ST4 instructions, since these are costly on some micro-architectures.

Reduction in run times:
  Cortex-A55: -20.5%
  Cortex-A510: -28.3%
  Cortex-A76: -21.5%

Bug: libyuv:976
Change-Id: If66e1e148b031c2cd288ff412f351d7a0b9b91e7
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5371774
Reviewed-by: Justin Green <greenjustin@google.com>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
parent
9441ddd883
commit
9d200b704f
@@ -1168,35 +1168,26 @@ void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint8_t* dst,
|
||||
int dst_width) {
|
||||
const uint8_t* src_ptr1 = src_ptr + src_stride;
|
||||
asm volatile(
|
||||
// change the stride to row 2 pointer
|
||||
"add %1, %1, %0 \n"
|
||||
"1: \n"
|
||||
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB
|
||||
"subs %w3, %w3, #8 \n" // 8 processed per loop.
|
||||
"uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
|
||||
"uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
|
||||
"uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
|
||||
"uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts.
|
||||
"ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8
|
||||
"uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts.
|
||||
"uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts.
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts.
|
||||
"uadalp v3.8h, v19.16b \n" // A 16 bytes -> 8 shorts.
|
||||
"prfm pldl1keep, [%1, 448] \n"
|
||||
"rshrn v0.8b, v0.8h, #2 \n" // round and pack
|
||||
"rshrn v1.8b, v1.8h, #2 \n"
|
||||
"rshrn v2.8b, v2.8h, #2 \n"
|
||||
"rshrn v3.8b, v3.8h, #2 \n"
|
||||
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_ptr), // %0
|
||||
"+r"(src_stride), // %1
|
||||
"+r"(dst), // %2
|
||||
"+r"(dst_width) // %3
|
||||
"1: \n"
|
||||
"ld2 {v0.4s, v1.4s}, [%[src]], #32 \n"
|
||||
"ld2 {v20.4s, v21.4s}, [%[src1]], #32 \n"
|
||||
"uaddl v2.8h, v0.8b, v1.8b \n"
|
||||
"uaddl2 v3.8h, v0.16b, v1.16b \n"
|
||||
"uaddl v22.8h, v20.8b, v21.8b \n"
|
||||
"uaddl2 v23.8h, v20.16b, v21.16b \n"
|
||||
"add v0.8h, v2.8h, v22.8h \n"
|
||||
"add v1.8h, v3.8h, v23.8h \n"
|
||||
"rshrn v0.8b, v0.8h, #2 \n"
|
||||
"rshrn v1.8b, v1.8h, #2 \n"
|
||||
"subs %w[width], %w[width], #4 \n"
|
||||
"stp d0, d1, [%[dst]], #16 \n"
|
||||
"b.gt 1b \n"
|
||||
: [src] "+r"(src_ptr), [src1] "+r"(src_ptr1), [dst] "+r"(dst),
|
||||
[width] "+r"(dst_width)
|
||||
:
|
||||
: "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
|
||||
: "memory", "cc", "v0", "v1", "v2", "v3", "v20", "v21", "v22", "v23");
|
||||
}
|
||||
|
||||
void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb,
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user