[AArch64] Optimize ScaleARGBRowDown2Box_NEON

Use a pair of LD2 instructions, one per source row, to load the pixels
de-interleaved into even and odd columns, then combine the registers
with widening additions. This avoids the LD4 and ST4 instructions,
which are costly on some micro-architectures.

Reduction in run times:

 Cortex-A55: -20.5%
Cortex-A510: -28.3%
 Cortex-A76: -21.5%

Bug: libyuv:976
Change-Id: If66e1e148b031c2cd288ff412f351d7a0b9b91e7
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5371774
Reviewed-by: Justin Green <greenjustin@google.com>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
Cosmina Dunca 2024-02-14 15:11:21 +00:00 committed by Frank Barchard
parent 9441ddd883
commit 9d200b704f

View File

@@ -1168,35 +1168,26 @@ void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8_t* dst, uint8_t* dst,
int dst_width) { int dst_width) {
const uint8_t* src_ptr1 = src_ptr + src_stride;
asm volatile( asm volatile(
// change the stride to row 2 pointer "1: \n"
"add %1, %1, %0 \n" "ld2 {v0.4s, v1.4s}, [%[src]], #32 \n"
"1: \n" "ld2 {v20.4s, v21.4s}, [%[src1]], #32 \n"
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB "uaddl v2.8h, v0.8b, v1.8b \n"
"subs %w3, %w3, #8 \n" // 8 processed per loop. "uaddl2 v3.8h, v0.16b, v1.16b \n"
"uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. "uaddl v22.8h, v20.8b, v21.8b \n"
"uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. "uaddl2 v23.8h, v20.16b, v21.16b \n"
"uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. "add v0.8h, v2.8h, v22.8h \n"
"uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts. "add v1.8h, v3.8h, v23.8h \n"
"ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8 "rshrn v0.8b, v0.8h, #2 \n"
"uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts. "rshrn v1.8b, v1.8h, #2 \n"
"uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts. "subs %w[width], %w[width], #4 \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead "stp d0, d1, [%[dst]], #16 \n"
"uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts. "b.gt 1b \n"
"uadalp v3.8h, v19.16b \n" // A 16 bytes -> 8 shorts. : [src] "+r"(src_ptr), [src1] "+r"(src_ptr1), [dst] "+r"(dst),
"prfm pldl1keep, [%1, 448] \n" [width] "+r"(dst_width)
"rshrn v0.8b, v0.8h, #2 \n" // round and pack
"rshrn v1.8b, v1.8h, #2 \n"
"rshrn v2.8b, v2.8h, #2 \n"
"rshrn v3.8b, v3.8h, #2 \n"
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1
"+r"(dst), // %2
"+r"(dst_width) // %3
: :
: "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"); : "memory", "cc", "v0", "v1", "v2", "v3", "v20", "v21", "v22", "v23");
} }
void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb, void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb,