Scale NEON: move PRFM (prefetch) instructions so they co-issue with math instructions

Bug: libyuv:838, b/151375918
Change-Id: Ib0013fd971d700d2981b58e0aa1dd666e68fedd4
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2443953
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
Frank Barchard 2020-10-01 16:05:04 -07:00 committed by Commit Bot
parent 6866adbec5
commit 0b1e6ea6c9

View File

@ -54,8 +54,8 @@ void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr,
"ld2 {v0.16b,v1.16b}, [%0], #32 \n" "ld2 {v0.16b,v1.16b}, [%0], #32 \n"
"subs %w2, %w2, #16 \n" // 16 processed per loop "subs %w2, %w2, #16 \n" // 16 processed per loop
"urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add
"st1 {v0.16b}, [%1], #16 \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"st1 {v0.16b}, [%1], #16 \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(dst), // %1 "+r"(dst), // %1
@ -78,14 +78,14 @@ void ScaleRowDown2Box_NEON(const uint8_t* src_ptr,
"ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc "ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc
"subs %w3, %w3, #16 \n" // 16 processed per loop "subs %w3, %w3, #16 \n" // 16 processed per loop
"uaddlp v0.8h, v0.16b \n" // row 1 add adjacent "uaddlp v0.8h, v0.16b \n" // row 1 add adjacent
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"uaddlp v1.8h, v1.16b \n" "uaddlp v1.8h, v1.16b \n"
"prfm pldl1keep, [%1, 448] \n"
"uadalp v0.8h, v2.16b \n" // += row 2 add adjacent "uadalp v0.8h, v2.16b \n" // += row 2 add adjacent
"uadalp v1.8h, v3.16b \n" "uadalp v1.8h, v3.16b \n"
"rshrn v0.8b, v0.8h, #2 \n" // round and pack "rshrn v0.8b, v0.8h, #2 \n" // round and pack
"rshrn2 v0.16b, v1.8h, #2 \n" "rshrn2 v0.16b, v1.8h, #2 \n"
"st1 {v0.16b}, [%2], #16 \n" "st1 {v0.16b}, [%2], #16 \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"prfm pldl1keep, [%1, 448] \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(src_stride), // %1 "+r"(src_stride), // %1
@ -105,8 +105,8 @@ void ScaleRowDown4_NEON(const uint8_t* src_ptr,
"1: \n" "1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
"subs %w2, %w2, #8 \n" // 8 processed per loop "subs %w2, %w2, #8 \n" // 8 processed per loop
"st1 {v2.8b}, [%1], #8 \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"st1 {v2.8b}, [%1], #8 \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1 "+r"(dst_ptr), // %1
@ -130,16 +130,16 @@ void ScaleRowDown4Box_NEON(const uint8_t* src_ptr,
"ld1 {v3.16b}, [%4], #16 \n" "ld1 {v3.16b}, [%4], #16 \n"
"subs %w5, %w5, #4 \n" "subs %w5, %w5, #4 \n"
"uaddlp v0.8h, v0.16b \n" "uaddlp v0.8h, v0.16b \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"uadalp v0.8h, v1.16b \n" "uadalp v0.8h, v1.16b \n"
"prfm pldl1keep, [%2, 448] \n"
"uadalp v0.8h, v2.16b \n" "uadalp v0.8h, v2.16b \n"
"prfm pldl1keep, [%3, 448] \n"
"uadalp v0.8h, v3.16b \n" "uadalp v0.8h, v3.16b \n"
"prfm pldl1keep, [%4, 448] \n"
"addp v0.8h, v0.8h, v0.8h \n" "addp v0.8h, v0.8h, v0.8h \n"
"rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding "rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding
"st1 {v0.s}[0], [%1], #4 \n" "st1 {v0.s}[0], [%1], #4 \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"prfm pldl1keep, [%2, 448] \n"
"prfm pldl1keep, [%3, 448] \n"
"prfm pldl1keep, [%4, 448] \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1 "+r"(dst_ptr), // %1
@ -164,8 +164,8 @@ void ScaleRowDown34_NEON(const uint8_t* src_ptr,
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
"subs %w2, %w2, #24 \n" "subs %w2, %w2, #24 \n"
"orr v2.16b, v3.16b, v3.16b \n" // order v0,v1,v2 "orr v2.16b, v3.16b, v3.16b \n" // order v0,v1,v2
"st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1 "+r"(dst_ptr), // %1
@ -199,12 +199,14 @@ void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr,
"umlal v17.8h, v1.8b, v20.8b \n" "umlal v17.8h, v1.8b, v20.8b \n"
"umlal v18.8h, v2.8b, v20.8b \n" "umlal v18.8h, v2.8b, v20.8b \n"
"umlal v19.8h, v3.8b, v20.8b \n" "umlal v19.8h, v3.8b, v20.8b \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
// (3 * line_0 + line_1) >> 2 // (3 * line_0 + line_1) >> 2
"uqrshrn v0.8b, v16.8h, #2 \n" "uqrshrn v0.8b, v16.8h, #2 \n"
"uqrshrn v1.8b, v17.8h, #2 \n" "uqrshrn v1.8b, v17.8h, #2 \n"
"uqrshrn v2.8b, v18.8h, #2 \n" "uqrshrn v2.8b, v18.8h, #2 \n"
"uqrshrn v3.8b, v19.8h, #2 \n" "uqrshrn v3.8b, v19.8h, #2 \n"
"prfm pldl1keep, [%3, 448] \n"
// a0 = (src[0] * 3 + s[1] * 1) >> 2 // a0 = (src[0] * 3 + s[1] * 1) >> 2
"ushll v16.8h, v1.8b, #0 \n" "ushll v16.8h, v1.8b, #0 \n"
@ -221,8 +223,6 @@ void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr,
"st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"prfm pldl1keep, [%3, 448] \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1 "+r"(dst_ptr), // %1
@ -249,11 +249,13 @@ void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr,
"urhadd v1.8b, v1.8b, v5.8b \n" "urhadd v1.8b, v1.8b, v5.8b \n"
"urhadd v2.8b, v2.8b, v6.8b \n" "urhadd v2.8b, v2.8b, v6.8b \n"
"urhadd v3.8b, v3.8b, v7.8b \n" "urhadd v3.8b, v3.8b, v7.8b \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
// a0 = (src[0] * 3 + s[1] * 1) >> 2 // a0 = (src[0] * 3 + s[1] * 1) >> 2
"ushll v4.8h, v1.8b, #0 \n" "ushll v4.8h, v1.8b, #0 \n"
"umlal v4.8h, v0.8b, v20.8b \n" "umlal v4.8h, v0.8b, v20.8b \n"
"uqrshrn v0.8b, v4.8h, #2 \n" "uqrshrn v0.8b, v4.8h, #2 \n"
"prfm pldl1keep, [%3, 448] \n"
// a1 = (src[1] * 1 + s[2] * 1) >> 1 // a1 = (src[1] * 1 + s[2] * 1) >> 1
"urhadd v1.8b, v1.8b, v2.8b \n" "urhadd v1.8b, v1.8b, v2.8b \n"
@ -264,8 +266,6 @@ void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr,
"uqrshrn v2.8b, v4.8h, #2 \n" "uqrshrn v2.8b, v4.8h, #2 \n"
"st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"prfm pldl1keep, [%3, 448] \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1 "+r"(dst_ptr), // %1
@ -298,9 +298,9 @@ void ScaleRowDown38_NEON(const uint8_t* src_ptr,
"ld1 {v0.16b,v1.16b}, [%0], #32 \n" "ld1 {v0.16b,v1.16b}, [%0], #32 \n"
"subs %w2, %w2, #12 \n" "subs %w2, %w2, #12 \n"
"tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n" "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"st1 {v2.8b}, [%1], #8 \n" "st1 {v2.8b}, [%1], #8 \n"
"st1 {v2.s}[2], [%1], #4 \n" "st1 {v2.s}[2], [%1], #4 \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1 "+r"(dst_ptr), // %1
@ -399,25 +399,25 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
"trn2 v4.8h, v0.8h, v0.8h \n" "trn2 v4.8h, v0.8h, v0.8h \n"
"xtn v0.4h, v1.4s \n" "xtn v0.4h, v1.4s \n"
"xtn v4.4h, v4.4s \n" "xtn v4.4h, v4.4s \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
// 0+1+2, 3+4+5 // 0+1+2, 3+4+5
"add v20.8h, v20.8h, v0.8h \n" "add v20.8h, v20.8h, v0.8h \n"
"add v21.8h, v21.8h, v4.8h \n" "add v21.8h, v21.8h, v4.8h \n"
"prfm pldl1keep, [%2, 448] \n"
// Need to divide, but can't downshift as the value // Need to divide, but can't downshift as the value
// isn't a power of 2. So multiply by 65536 / n // isn't a power of 2. So multiply by 65536 / n
// and take the upper 16 bits. // and take the upper 16 bits.
"sqrdmulh v0.8h, v20.8h, v31.8h \n" "sqrdmulh v0.8h, v20.8h, v31.8h \n"
"sqrdmulh v1.8h, v21.8h, v31.8h \n" "sqrdmulh v1.8h, v21.8h, v31.8h \n"
"prfm pldl1keep, [%3, 448] \n"
// Align for table lookup, vtbl requires registers to be adjacent // Align for table lookup, vtbl requires registers to be adjacent
"tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n" "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n"
"st1 {v3.8b}, [%1], #8 \n" "st1 {v3.8b}, [%1], #8 \n"
"st1 {v3.s}[2], [%1], #4 \n" "st1 {v3.s}[2], [%1], #4 \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"prfm pldl1keep, [%2, 448] \n"
"prfm pldl1keep, [%3, 448] \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1 "+r"(dst_ptr), // %1
@ -504,10 +504,12 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
"trn2 v4.8h, v0.8h, v0.8h \n" "trn2 v4.8h, v0.8h, v0.8h \n"
"xtn v0.4h, v1.4s \n" "xtn v0.4h, v1.4s \n"
"xtn v4.4h, v4.4s \n" "xtn v4.4h, v4.4s \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
// 0+1+2, 3+4+5 // 0+1+2, 3+4+5
"add v16.8h, v16.8h, v0.8h \n" "add v16.8h, v16.8h, v0.8h \n"
"add v17.8h, v17.8h, v4.8h \n" "add v17.8h, v17.8h, v4.8h \n"
"prfm pldl1keep, [%2, 448] \n"
// Need to divide, but can't downshift as the value // Need to divide, but can't downshift as the value
// isn't a power of 2. So multiply by 65536 / n // isn't a power of 2. So multiply by 65536 / n
@ -522,8 +524,6 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
"st1 {v3.8b}, [%1], #8 \n" "st1 {v3.8b}, [%1], #8 \n"
"st1 {v3.s}[2], [%1], #4 \n" "st1 {v3.s}[2], [%1], #4 \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"prfm pldl1keep, [%2, 448] \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1 "+r"(dst_ptr), // %1
@ -545,10 +545,10 @@ void ScaleAddRow_NEON(const uint8_t* src_ptr,
"ld1 {v1.8h, v2.8h}, [%1] \n" // load accumulator "ld1 {v1.8h, v2.8h}, [%1] \n" // load accumulator
"ld1 {v0.16b}, [%0], #16 \n" // load 16 bytes "ld1 {v0.16b}, [%0], #16 \n" // load 16 bytes
"uaddw2 v2.8h, v2.8h, v0.16b \n" // add "uaddw2 v2.8h, v2.8h, v0.16b \n" // add
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"uaddw v1.8h, v1.8h, v0.8b \n" "uaddw v1.8h, v1.8h, v0.8b \n"
"st1 {v1.8h, v2.8h}, [%1], #32 \n" // store accumulator "st1 {v1.8h, v2.8h}, [%1], #32 \n" // store accumulator
"subs %w2, %w2, #16 \n" // 16 processed per loop "subs %w2, %w2, #16 \n" // 16 processed per loop
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1 "+r"(dst_ptr), // %1
@ -663,13 +663,13 @@ void ScaleFilterRows_NEON(uint8_t* dst_ptr,
"subs %w3, %w3, #16 \n" "subs %w3, %w3, #16 \n"
"umull v6.8h, v0.8b, v4.8b \n" "umull v6.8h, v0.8b, v4.8b \n"
"umull2 v7.8h, v0.16b, v4.16b \n" "umull2 v7.8h, v0.16b, v4.16b \n"
"prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
"umlal v6.8h, v1.8b, v5.8b \n" "umlal v6.8h, v1.8b, v5.8b \n"
"umlal2 v7.8h, v1.16b, v5.16b \n" "umlal2 v7.8h, v1.16b, v5.16b \n"
"prfm pldl1keep, [%2, 448] \n"
"rshrn v0.8b, v6.8h, #8 \n" "rshrn v0.8b, v6.8h, #8 \n"
"rshrn2 v0.16b, v7.8h, #8 \n" "rshrn2 v0.16b, v7.8h, #8 \n"
"st1 {v0.16b}, [%0], #16 \n" "st1 {v0.16b}, [%0], #16 \n"
"prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
"prfm pldl1keep, [%2, 448] \n"
"b.gt 1b \n" "b.gt 1b \n"
"b 99f \n" "b 99f \n"
@ -679,10 +679,10 @@ void ScaleFilterRows_NEON(uint8_t* dst_ptr,
"ld1 {v1.16b}, [%2], #16 \n" "ld1 {v1.16b}, [%2], #16 \n"
"subs %w3, %w3, #16 \n" "subs %w3, %w3, #16 \n"
"urhadd v0.16b, v0.16b, v1.16b \n" "urhadd v0.16b, v0.16b, v1.16b \n"
"urhadd v0.16b, v0.16b, v1.16b \n"
"st1 {v0.16b}, [%0], #16 \n"
"prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
"urhadd v0.16b, v0.16b, v1.16b \n"
"prfm pldl1keep, [%2, 448] \n" "prfm pldl1keep, [%2, 448] \n"
"st1 {v0.16b}, [%0], #16 \n"
"b.gt 25b \n" "b.gt 25b \n"
"b 99f \n" "b 99f \n"
@ -691,10 +691,10 @@ void ScaleFilterRows_NEON(uint8_t* dst_ptr,
"ld1 {v0.16b}, [%1], #16 \n" "ld1 {v0.16b}, [%1], #16 \n"
"ld1 {v1.16b}, [%2], #16 \n" "ld1 {v1.16b}, [%2], #16 \n"
"subs %w3, %w3, #16 \n" "subs %w3, %w3, #16 \n"
"urhadd v0.16b, v0.16b, v1.16b \n"
"st1 {v0.16b}, [%0], #16 \n"
"prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
"urhadd v0.16b, v0.16b, v1.16b \n"
"prfm pldl1keep, [%2, 448] \n" "prfm pldl1keep, [%2, 448] \n"
"st1 {v0.16b}, [%0], #16 \n"
"b.gt 50b \n" "b.gt 50b \n"
"b 99f \n" "b 99f \n"
@ -704,10 +704,10 @@ void ScaleFilterRows_NEON(uint8_t* dst_ptr,
"ld1 {v0.16b}, [%2], #16 \n" "ld1 {v0.16b}, [%2], #16 \n"
"subs %w3, %w3, #16 \n" "subs %w3, %w3, #16 \n"
"urhadd v0.16b, v0.16b, v1.16b \n" "urhadd v0.16b, v0.16b, v1.16b \n"
"urhadd v0.16b, v0.16b, v1.16b \n"
"st1 {v0.16b}, [%0], #16 \n"
"prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
"urhadd v0.16b, v0.16b, v1.16b \n"
"prfm pldl1keep, [%2, 448] \n" "prfm pldl1keep, [%2, 448] \n"
"st1 {v0.16b}, [%0], #16 \n"
"b.gt 75b \n" "b.gt 75b \n"
"b 99f \n" "b 99f \n"
@ -715,8 +715,8 @@ void ScaleFilterRows_NEON(uint8_t* dst_ptr,
"100: \n" "100: \n"
"ld1 {v0.16b}, [%1], #16 \n" "ld1 {v0.16b}, [%1], #16 \n"
"subs %w3, %w3, #16 \n" "subs %w3, %w3, #16 \n"
"st1 {v0.16b}, [%0], #16 \n"
"prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
"st1 {v0.16b}, [%0], #16 \n"
"b.gt 100b \n" "b.gt 100b \n"
"99: \n" "99: \n"
@ -742,8 +742,8 @@ void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr,
"ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n" "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop "subs %w2, %w2, #8 \n" // 8 processed per loop
"mov v2.16b, v3.16b \n" "mov v2.16b, v3.16b \n"
"st2 {v1.4s,v2.4s}, [%1], #32 \n" // store 8 odd pixels
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"st2 {v1.4s,v2.4s}, [%1], #32 \n" // store 8 odd pixels
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(dst), // %1 "+r"(dst), // %1
@ -765,9 +765,9 @@ void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb,
"subs %w2, %w2, #8 \n" // 8 processed per loop "subs %w2, %w2, #8 \n" // 8 processed per loop
"urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"urhadd v1.16b, v2.16b, v3.16b \n" "urhadd v1.16b, v2.16b, v3.16b \n"
"st2 {v0.4s,v1.4s}, [%1], #32 \n" // store 8 pixels "st2 {v0.4s,v1.4s}, [%1], #32 \n" // store 8 pixels
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
@ -785,7 +785,7 @@ void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
// change the stride to row 2 pointer // change the stride to row 2 pointer
"add %1, %1, %0 \n" "add %1, %1, %0 \n"
"1: \n" "1: \n"
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 8 ARGB "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB
"subs %w3, %w3, #8 \n" // 8 processed per loop. "subs %w3, %w3, #8 \n" // 8 processed per loop.
"uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
"uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
@ -794,15 +794,15 @@ void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
"ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8 "ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8
"uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts. "uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts.
"uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts. "uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts. "uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts.
"uadalp v3.8h, v19.16b \n" // A 16 bytes -> 8 shorts. "uadalp v3.8h, v19.16b \n" // A 16 bytes -> 8 shorts.
"prfm pldl1keep, [%1, 448] \n"
"rshrn v0.8b, v0.8h, #2 \n" // round and pack "rshrn v0.8b, v0.8h, #2 \n" // round and pack
"rshrn v1.8b, v1.8h, #2 \n" "rshrn v1.8b, v1.8h, #2 \n"
"rshrn v2.8b, v2.8h, #2 \n" "rshrn v2.8b, v2.8h, #2 \n"
"rshrn v3.8b, v3.8h, #2 \n" "rshrn v3.8b, v3.8h, #2 \n"
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"prfm pldl1keep, [%1, 448] \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(src_stride), // %1 "+r"(src_stride), // %1
@ -827,8 +827,8 @@ void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb,
"ld1 {v0.s}[2], [%0], %3 \n" "ld1 {v0.s}[2], [%0], %3 \n"
"ld1 {v0.s}[3], [%0], %3 \n" "ld1 {v0.s}[3], [%0], %3 \n"
"subs %w2, %w2, #4 \n" // 4 pixels per loop. "subs %w2, %w2, #4 \n" // 4 pixels per loop.
"st1 {v0.16b}, [%1], #16 \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"st1 {v0.16b}, [%1], #16 \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
@ -861,20 +861,20 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb,
"uaddl v2.8h, v2.8b, v3.8b \n" "uaddl v2.8h, v2.8b, v3.8b \n"
"uaddl v4.8h, v4.8b, v5.8b \n" "uaddl v4.8h, v4.8b, v5.8b \n"
"uaddl v6.8h, v6.8b, v7.8b \n" "uaddl v6.8h, v6.8b, v7.8b \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd "mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd
"mov v0.d[1], v2.d[0] \n" "mov v0.d[1], v2.d[0] \n"
"mov v2.d[0], v16.d[1] \n" "mov v2.d[0], v16.d[1] \n"
"mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh "mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh
"mov v4.d[1], v6.d[0] \n" "mov v4.d[1], v6.d[0] \n"
"mov v6.d[0], v16.d[1] \n" "mov v6.d[0], v16.d[1] \n"
"prfm pldl1keep, [%1, 448] \n"
"add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d) "add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d)
"add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h) "add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h)
"rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels. "rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels.
"rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels. "rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels.
"subs %w3, %w3, #4 \n" // 4 pixels per loop. "subs %w3, %w3, #4 \n" // 4 pixels per loop.
"st1 {v0.16b}, [%2], #16 \n" "st1 {v0.16b}, [%2], #16 \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"prfm pldl1keep, [%1, 448] \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(src_stride), // %1 "+r"(src_stride), // %1
@ -912,10 +912,10 @@ void ScaleARGBCols_NEON(uint8_t* dst_argb,
LOAD1_DATA32_LANE(v1, 1) LOAD1_DATA32_LANE(v1, 1)
LOAD1_DATA32_LANE(v1, 2) LOAD1_DATA32_LANE(v1, 2)
LOAD1_DATA32_LANE(v1, 3) LOAD1_DATA32_LANE(v1, 3)
"prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
// clang-format on // clang-format on
"st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels "st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels
"subs %w2, %w2, #8 \n" // 8 processed per loop "subs %w2, %w2, #8 \n" // 8 processed per loop
"prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(dst_argb), // %0 : "+r"(dst_argb), // %0
"+r"(src_argb), // %1 "+r"(src_argb), // %1
@ -979,15 +979,14 @@ void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
"umull2 v17.8h, v0.16b, v7.16b \n" "umull2 v17.8h, v0.16b, v7.16b \n"
"umull v18.8h, v1.8b, v2.8b \n" "umull v18.8h, v1.8b, v2.8b \n"
"umull2 v19.8h, v1.16b, v2.16b \n" "umull2 v19.8h, v1.16b, v2.16b \n"
"prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
"add v16.8h, v16.8h, v18.8h \n" "add v16.8h, v16.8h, v18.8h \n"
"add v17.8h, v17.8h, v19.8h \n" "add v17.8h, v17.8h, v19.8h \n"
"shrn v0.8b, v16.8h, #7 \n" "shrn v0.8b, v16.8h, #7 \n"
"shrn2 v0.16b, v17.8h, #7 \n" "shrn2 v0.16b, v17.8h, #7 \n"
"st1 {v0.4s}, [%0], #16 \n" // store pixels "st1 {v0.4s}, [%0], #16 \n" // store pixels
"add v5.4s, v5.4s, v6.4s \n" "add v5.4s, v5.4s, v6.4s \n"
"subs %w2, %w2, #4 \n" // 4 processed per loop "subs %w2, %w2, #4 \n" // 4 processed per loop
"prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(dst_argb), // %0 : "+r"(dst_argb), // %0
"+r"(src_argb), // %1 "+r"(src_argb), // %1
@ -1018,13 +1017,13 @@ void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr,
"subs %w3, %w3, #8 \n" // 8 processed per loop "subs %w3, %w3, #8 \n" // 8 processed per loop
"uaddlp v0.4s, v0.8h \n" // row 1 add adjacent "uaddlp v0.4s, v0.8h \n" // row 1 add adjacent
"uaddlp v1.4s, v1.8h \n" "uaddlp v1.4s, v1.8h \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"uadalp v0.4s, v2.8h \n" // +row 2 add adjacent "uadalp v0.4s, v2.8h \n" // +row 2 add adjacent
"uadalp v1.4s, v3.8h \n" "uadalp v1.4s, v3.8h \n"
"prfm pldl1keep, [%1, 448] \n"
"rshrn v0.4h, v0.4s, #2 \n" // round and pack "rshrn v0.4h, v0.4s, #2 \n" // round and pack
"rshrn2 v0.8h, v1.4s, #2 \n" "rshrn2 v0.8h, v1.4s, #2 \n"
"st1 {v0.8h}, [%2], #16 \n" "st1 {v0.8h}, [%2], #16 \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"prfm pldl1keep, [%1, 448] \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(src_stride), // %1 "+r"(src_stride), // %1
@ -1056,6 +1055,7 @@ void ScaleRowUp2_16_NEON(const uint16_t* src_ptr,
"umull2 v7.4s, v3.8h, v0.8h \n" "umull2 v7.4s, v3.8h, v0.8h \n"
"umull v18.4s, v4.4h, v0.4h \n" "umull v18.4s, v4.4h, v0.4h \n"
"umull2 v17.4s, v4.8h, v0.8h \n" "umull2 v17.4s, v4.8h, v0.8h \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"uaddw v16.4s, v16.4s, v6.4h \n" "uaddw v16.4s, v16.4s, v6.4h \n"
"uaddl2 v19.4s, v6.8h, v3.8h \n" "uaddl2 v19.4s, v6.8h, v3.8h \n"
"uaddl v3.4s, v6.4h, v3.4h \n" "uaddl v3.4s, v6.4h, v3.4h \n"
@ -1063,6 +1063,7 @@ void ScaleRowUp2_16_NEON(const uint16_t* src_ptr,
"uaddl2 v7.4s, v5.8h, v4.8h \n" "uaddl2 v7.4s, v5.8h, v4.8h \n"
"uaddl v4.4s, v5.4h, v4.4h \n" "uaddl v4.4s, v5.4h, v4.4h \n"
"uaddw v18.4s, v18.4s, v5.4h \n" "uaddw v18.4s, v18.4s, v5.4h \n"
"prfm pldl1keep, [%1, 448] \n"
"mla v16.4s, v4.4s, v1.4s \n" "mla v16.4s, v4.4s, v1.4s \n"
"mla v18.4s, v3.4s, v1.4s \n" "mla v18.4s, v3.4s, v1.4s \n"
"mla v6.4s, v7.4s, v1.4s \n" "mla v6.4s, v7.4s, v1.4s \n"
@ -1073,8 +1074,6 @@ void ScaleRowUp2_16_NEON(const uint16_t* src_ptr,
"uqrshrn v17.4h, v18.4s, #4 \n" "uqrshrn v17.4h, v18.4s, #4 \n"
"uqrshrn2 v17.8h, v4.4s, #4 \n" "uqrshrn2 v17.8h, v4.4s, #4 \n"
"st2 {v16.8h-v17.8h}, [%2], #32 \n" "st2 {v16.8h-v17.8h}, [%2], #32 \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"prfm pldl1keep, [%1, 448] \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(src_stride), // %1 "+r"(src_stride), // %1