Scale NEON: reposition PRFM instructions to co-issue with math instructions

Bug: libyuv:838, b/151375918
Change-Id: Ib0013fd971d700d2981b58e0aa1dd666e68fedd4
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2443953
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
Frank Barchard 2020-10-01 16:05:04 -07:00 committed by Commit Bot
parent 6866adbec5
commit 0b1e6ea6c9

View File

@ -54,8 +54,8 @@ void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr,
"ld2 {v0.16b,v1.16b}, [%0], #32 \n"
"subs %w2, %w2, #16 \n" // 16 processed per loop
"urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add
"st1 {v0.16b}, [%1], #16 \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"st1 {v0.16b}, [%1], #16 \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst), // %1
@ -78,14 +78,14 @@ void ScaleRowDown2Box_NEON(const uint8_t* src_ptr,
"ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc
"subs %w3, %w3, #16 \n" // 16 processed per loop
"uaddlp v0.8h, v0.16b \n" // row 1 add adjacent
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"uaddlp v1.8h, v1.16b \n"
"prfm pldl1keep, [%1, 448] \n"
"uadalp v0.8h, v2.16b \n" // += row 2 add adjacent
"uadalp v1.8h, v3.16b \n"
"rshrn v0.8b, v0.8h, #2 \n" // round and pack
"rshrn2 v0.16b, v1.8h, #2 \n"
"st1 {v0.16b}, [%2], #16 \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"prfm pldl1keep, [%1, 448] \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1
@ -105,8 +105,8 @@ void ScaleRowDown4_NEON(const uint8_t* src_ptr,
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
"subs %w2, %w2, #8 \n" // 8 processed per loop
"st1 {v2.8b}, [%1], #8 \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"st1 {v2.8b}, [%1], #8 \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
@ -130,16 +130,16 @@ void ScaleRowDown4Box_NEON(const uint8_t* src_ptr,
"ld1 {v3.16b}, [%4], #16 \n"
"subs %w5, %w5, #4 \n"
"uaddlp v0.8h, v0.16b \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"uadalp v0.8h, v1.16b \n"
"prfm pldl1keep, [%2, 448] \n"
"uadalp v0.8h, v2.16b \n"
"prfm pldl1keep, [%3, 448] \n"
"uadalp v0.8h, v3.16b \n"
"prfm pldl1keep, [%4, 448] \n"
"addp v0.8h, v0.8h, v0.8h \n"
"rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding
"st1 {v0.s}[0], [%1], #4 \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"prfm pldl1keep, [%2, 448] \n"
"prfm pldl1keep, [%3, 448] \n"
"prfm pldl1keep, [%4, 448] \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
@ -164,8 +164,8 @@ void ScaleRowDown34_NEON(const uint8_t* src_ptr,
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
"subs %w2, %w2, #24 \n"
"orr v2.16b, v3.16b, v3.16b \n" // order v0,v1,v2
"st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
@ -199,12 +199,14 @@ void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr,
"umlal v17.8h, v1.8b, v20.8b \n"
"umlal v18.8h, v2.8b, v20.8b \n"
"umlal v19.8h, v3.8b, v20.8b \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
// (3 * line_0 + line_1) >> 2
"uqrshrn v0.8b, v16.8h, #2 \n"
"uqrshrn v1.8b, v17.8h, #2 \n"
"uqrshrn v2.8b, v18.8h, #2 \n"
"uqrshrn v3.8b, v19.8h, #2 \n"
"prfm pldl1keep, [%3, 448] \n"
// a0 = (src[0] * 3 + s[1] * 1) >> 2
"ushll v16.8h, v1.8b, #0 \n"
@ -221,8 +223,6 @@ void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr,
"st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"prfm pldl1keep, [%3, 448] \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
@ -249,11 +249,13 @@ void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr,
"urhadd v1.8b, v1.8b, v5.8b \n"
"urhadd v2.8b, v2.8b, v6.8b \n"
"urhadd v3.8b, v3.8b, v7.8b \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
// a0 = (src[0] * 3 + s[1] * 1) >> 2
"ushll v4.8h, v1.8b, #0 \n"
"umlal v4.8h, v0.8b, v20.8b \n"
"uqrshrn v0.8b, v4.8h, #2 \n"
"prfm pldl1keep, [%3, 448] \n"
// a1 = (src[1] * 1 + s[2] * 1) >> 1
"urhadd v1.8b, v1.8b, v2.8b \n"
@ -264,8 +266,6 @@ void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr,
"uqrshrn v2.8b, v4.8h, #2 \n"
"st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"prfm pldl1keep, [%3, 448] \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
@ -298,9 +298,9 @@ void ScaleRowDown38_NEON(const uint8_t* src_ptr,
"ld1 {v0.16b,v1.16b}, [%0], #32 \n"
"subs %w2, %w2, #12 \n"
"tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"st1 {v2.8b}, [%1], #8 \n"
"st1 {v2.s}[2], [%1], #4 \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
@ -399,25 +399,25 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
"trn2 v4.8h, v0.8h, v0.8h \n"
"xtn v0.4h, v1.4s \n"
"xtn v4.4h, v4.4s \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
// 0+1+2, 3+4+5
"add v20.8h, v20.8h, v0.8h \n"
"add v21.8h, v21.8h, v4.8h \n"
"prfm pldl1keep, [%2, 448] \n"
// Need to divide, but can't downshift as the value
// isn't a power of 2. So multiply by 65536 / n
// and take the upper 16 bits.
"sqrdmulh v0.8h, v20.8h, v31.8h \n"
"sqrdmulh v1.8h, v21.8h, v31.8h \n"
"prfm pldl1keep, [%3, 448] \n"
// Align for table lookup, vtbl requires registers to be adjacent
"tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n"
"st1 {v3.8b}, [%1], #8 \n"
"st1 {v3.s}[2], [%1], #4 \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"prfm pldl1keep, [%2, 448] \n"
"prfm pldl1keep, [%3, 448] \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
@ -504,10 +504,12 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
"trn2 v4.8h, v0.8h, v0.8h \n"
"xtn v0.4h, v1.4s \n"
"xtn v4.4h, v4.4s \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
// 0+1+2, 3+4+5
"add v16.8h, v16.8h, v0.8h \n"
"add v17.8h, v17.8h, v4.8h \n"
"prfm pldl1keep, [%2, 448] \n"
// Need to divide, but can't downshift as the value
// isn't a power of 2. So multiply by 65536 / n
@ -522,8 +524,6 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
"st1 {v3.8b}, [%1], #8 \n"
"st1 {v3.s}[2], [%1], #4 \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"prfm pldl1keep, [%2, 448] \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
@ -545,10 +545,10 @@ void ScaleAddRow_NEON(const uint8_t* src_ptr,
"ld1 {v1.8h, v2.8h}, [%1] \n" // load accumulator
"ld1 {v0.16b}, [%0], #16 \n" // load 16 bytes
"uaddw2 v2.8h, v2.8h, v0.16b \n" // add
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"uaddw v1.8h, v1.8h, v0.8b \n"
"st1 {v1.8h, v2.8h}, [%1], #32 \n" // store accumulator
"subs %w2, %w2, #16 \n" // 16 processed per loop
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
@ -620,7 +620,7 @@ void ScaleFilterCols_NEON(uint8_t* dst_ptr,
"add v1.4s, v1.4s, v0.4s \n"
"add v2.4s, v2.4s, v0.4s \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop
"b.gt 1b \n"
"b.gt 1b \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+r"(dst_width), // %2
@ -663,13 +663,13 @@ void ScaleFilterRows_NEON(uint8_t* dst_ptr,
"subs %w3, %w3, #16 \n"
"umull v6.8h, v0.8b, v4.8b \n"
"umull2 v7.8h, v0.16b, v4.16b \n"
"prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
"umlal v6.8h, v1.8b, v5.8b \n"
"umlal2 v7.8h, v1.16b, v5.16b \n"
"prfm pldl1keep, [%2, 448] \n"
"rshrn v0.8b, v6.8h, #8 \n"
"rshrn2 v0.16b, v7.8h, #8 \n"
"st1 {v0.16b}, [%0], #16 \n"
"prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
"prfm pldl1keep, [%2, 448] \n"
"b.gt 1b \n"
"b 99f \n"
@ -679,10 +679,10 @@ void ScaleFilterRows_NEON(uint8_t* dst_ptr,
"ld1 {v1.16b}, [%2], #16 \n"
"subs %w3, %w3, #16 \n"
"urhadd v0.16b, v0.16b, v1.16b \n"
"urhadd v0.16b, v0.16b, v1.16b \n"
"st1 {v0.16b}, [%0], #16 \n"
"prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
"urhadd v0.16b, v0.16b, v1.16b \n"
"prfm pldl1keep, [%2, 448] \n"
"st1 {v0.16b}, [%0], #16 \n"
"b.gt 25b \n"
"b 99f \n"
@ -691,10 +691,10 @@ void ScaleFilterRows_NEON(uint8_t* dst_ptr,
"ld1 {v0.16b}, [%1], #16 \n"
"ld1 {v1.16b}, [%2], #16 \n"
"subs %w3, %w3, #16 \n"
"urhadd v0.16b, v0.16b, v1.16b \n"
"st1 {v0.16b}, [%0], #16 \n"
"prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
"urhadd v0.16b, v0.16b, v1.16b \n"
"prfm pldl1keep, [%2, 448] \n"
"st1 {v0.16b}, [%0], #16 \n"
"b.gt 50b \n"
"b 99f \n"
@ -704,10 +704,10 @@ void ScaleFilterRows_NEON(uint8_t* dst_ptr,
"ld1 {v0.16b}, [%2], #16 \n"
"subs %w3, %w3, #16 \n"
"urhadd v0.16b, v0.16b, v1.16b \n"
"urhadd v0.16b, v0.16b, v1.16b \n"
"st1 {v0.16b}, [%0], #16 \n"
"prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
"urhadd v0.16b, v0.16b, v1.16b \n"
"prfm pldl1keep, [%2, 448] \n"
"st1 {v0.16b}, [%0], #16 \n"
"b.gt 75b \n"
"b 99f \n"
@ -715,8 +715,8 @@ void ScaleFilterRows_NEON(uint8_t* dst_ptr,
"100: \n"
"ld1 {v0.16b}, [%1], #16 \n"
"subs %w3, %w3, #16 \n"
"st1 {v0.16b}, [%0], #16 \n"
"prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
"st1 {v0.16b}, [%0], #16 \n"
"b.gt 100b \n"
"99: \n"
@ -742,8 +742,8 @@ void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr,
"ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop
"mov v2.16b, v3.16b \n"
"st2 {v1.4s,v2.4s}, [%1], #32 \n" // store 8 odd pixels
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"st2 {v1.4s,v2.4s}, [%1], #32 \n" // store 8 odd pixels
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst), // %1
@ -765,9 +765,9 @@ void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb,
"subs %w2, %w2, #8 \n" // 8 processed per loop
"urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"urhadd v1.16b, v2.16b, v3.16b \n"
"st2 {v0.4s,v1.4s}, [%1], #32 \n" // store 8 pixels
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
@ -785,7 +785,7 @@ void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
// change the stride to row 2 pointer
"add %1, %1, %0 \n"
"1: \n"
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 8 ARGB
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB
"subs %w3, %w3, #8 \n" // 8 processed per loop.
"uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
"uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
@ -794,15 +794,15 @@ void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
"ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8
"uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts.
"uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts.
"uadalp v3.8h, v19.16b \n" // A 16 bytes -> 8 shorts.
"prfm pldl1keep, [%1, 448] \n"
"rshrn v0.8b, v0.8h, #2 \n" // round and pack
"rshrn v1.8b, v1.8h, #2 \n"
"rshrn v2.8b, v2.8h, #2 \n"
"rshrn v3.8b, v3.8h, #2 \n"
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"prfm pldl1keep, [%1, 448] \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1
@ -827,8 +827,8 @@ void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb,
"ld1 {v0.s}[2], [%0], %3 \n"
"ld1 {v0.s}[3], [%0], %3 \n"
"subs %w2, %w2, #4 \n" // 4 pixels per loop.
"st1 {v0.16b}, [%1], #16 \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"st1 {v0.16b}, [%1], #16 \n"
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
@ -861,20 +861,20 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb,
"uaddl v2.8h, v2.8b, v3.8b \n"
"uaddl v4.8h, v4.8b, v5.8b \n"
"uaddl v6.8h, v6.8b, v7.8b \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd
"mov v0.d[1], v2.d[0] \n"
"mov v2.d[0], v16.d[1] \n"
"mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh
"mov v4.d[1], v6.d[0] \n"
"mov v6.d[0], v16.d[1] \n"
"prfm pldl1keep, [%1, 448] \n"
"add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d)
"add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h)
"rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels.
"rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels.
"subs %w3, %w3, #4 \n" // 4 pixels per loop.
"st1 {v0.16b}, [%2], #16 \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"prfm pldl1keep, [%1, 448] \n"
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(src_stride), // %1
@ -912,10 +912,10 @@ void ScaleARGBCols_NEON(uint8_t* dst_argb,
LOAD1_DATA32_LANE(v1, 1)
LOAD1_DATA32_LANE(v1, 2)
LOAD1_DATA32_LANE(v1, 3)
"prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
// clang-format on
"st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels
"subs %w2, %w2, #8 \n" // 8 processed per loop
"prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(dst_argb), // %0
"+r"(src_argb), // %1
@ -979,16 +979,15 @@ void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
"umull2 v17.8h, v0.16b, v7.16b \n"
"umull v18.8h, v1.8b, v2.8b \n"
"umull2 v19.8h, v1.16b, v2.16b \n"
"prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
"add v16.8h, v16.8h, v18.8h \n"
"add v17.8h, v17.8h, v19.8h \n"
"shrn v0.8b, v16.8h, #7 \n"
"shrn2 v0.16b, v17.8h, #7 \n"
"st1 {v0.4s}, [%0], #16 \n" // store pixels
"add v5.4s, v5.4s, v6.4s \n"
"subs %w2, %w2, #4 \n" // 4 processed per loop
"prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
"b.gt 1b \n"
: "+r"(dst_argb), // %0
"+r"(src_argb), // %1
"+r"(dst_width), // %2
@ -1018,13 +1017,13 @@ void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr,
"subs %w3, %w3, #8 \n" // 8 processed per loop
"uaddlp v0.4s, v0.8h \n" // row 1 add adjacent
"uaddlp v1.4s, v1.8h \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"uadalp v0.4s, v2.8h \n" // +row 2 add adjacent
"uadalp v1.4s, v3.8h \n"
"prfm pldl1keep, [%1, 448] \n"
"rshrn v0.4h, v0.4s, #2 \n" // round and pack
"rshrn2 v0.8h, v1.4s, #2 \n"
"st1 {v0.8h}, [%2], #16 \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"prfm pldl1keep, [%1, 448] \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1
@ -1056,6 +1055,7 @@ void ScaleRowUp2_16_NEON(const uint16_t* src_ptr,
"umull2 v7.4s, v3.8h, v0.8h \n"
"umull v18.4s, v4.4h, v0.4h \n"
"umull2 v17.4s, v4.8h, v0.8h \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"uaddw v16.4s, v16.4s, v6.4h \n"
"uaddl2 v19.4s, v6.8h, v3.8h \n"
"uaddl v3.4s, v6.4h, v3.4h \n"
@ -1063,6 +1063,7 @@ void ScaleRowUp2_16_NEON(const uint16_t* src_ptr,
"uaddl2 v7.4s, v5.8h, v4.8h \n"
"uaddl v4.4s, v5.4h, v4.4h \n"
"uaddw v18.4s, v18.4s, v5.4h \n"
"prfm pldl1keep, [%1, 448] \n"
"mla v16.4s, v4.4s, v1.4s \n"
"mla v18.4s, v3.4s, v1.4s \n"
"mla v6.4s, v7.4s, v1.4s \n"
@ -1073,8 +1074,6 @@ void ScaleRowUp2_16_NEON(const uint16_t* src_ptr,
"uqrshrn v17.4h, v18.4s, #4 \n"
"uqrshrn2 v17.8h, v4.4s, #4 \n"
"st2 {v16.8h-v17.8h}, [%2], #32 \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"prfm pldl1keep, [%1, 448] \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1