[AArch64] Avoid unnecessary MOVs in ScaleARGBRowDownEvenBox_NEON

The existing code uses three MOV instructions through a temporary
register to swap the high half of one vector register with the low half
of another; however, this can be done with a pair of ZIP instructions
(ZIP1/ZIP2) instead.

Also use a pair of RSHRN instructions writing to separate registers,
rather than RSHRN followed by RSHRN2 into the same register, to allow
these to execute in parallel on little cores.

Reduction in runtime observed compared to the existing Neon
implementation:

 Cortex-A55:  -8.3%
Cortex-A510: -20.6%
Cortex-A520: -16.6%
 Cortex-A76:  -6.8%
Cortex-A715:  -6.2%
Cortex-A720:  -6.2%
  Cortex-X1: -22.0%
  Cortex-X2: -18.7%
  Cortex-X3: -21.1%
  Cortex-X4: -25.8%
Cortex-X925: -21.9%

Change-Id: I87ae133be86c3c9f850d5848ec19d9b71ebda4d9
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5872801
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
George Steed 2024-09-16 16:56:18 +01:00 committed by Frank Barchard
parent 23a6a412e5
commit 7eb552c891

View File

@ -1300,26 +1300,24 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb,
"uaddl v4.8h, v4.8b, v5.8b \n" "uaddl v4.8h, v4.8b, v5.8b \n"
"uaddl v6.8h, v6.8b, v7.8b \n" "uaddl v6.8h, v6.8b, v7.8b \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd "zip1 v1.2d, v0.2d, v2.2d \n"
"mov v0.d[1], v2.d[0] \n" "zip2 v2.2d, v0.2d, v2.2d \n"
"mov v2.d[0], v16.d[1] \n" "zip1 v5.2d, v4.2d, v6.2d \n"
"mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh "zip2 v6.2d, v4.2d, v6.2d \n"
"mov v4.d[1], v6.d[0] \n"
"mov v6.d[0], v16.d[1] \n"
"prfm pldl1keep, [%1, 448] \n" "prfm pldl1keep, [%1, 448] \n"
"add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d) "add v0.8h, v1.8h, v2.8h \n" // (a+b)_(c+d)
"add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h) "add v4.8h, v5.8h, v6.8h \n" // (e+f)_(g+h)
"rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels. "rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels.
"rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels. "rshrn v1.8b, v4.8h, #2 \n" // next 2 pixels.
"subs %w3, %w3, #4 \n" // 4 pixels per loop. "subs %w3, %w3, #4 \n" // 4 pixels per loop.
"st1 {v0.16b}, [%2], #16 \n" "stp d0, d1, [%2], #16 \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(src_stride), // %1 "+r"(src_stride), // %1
"+r"(dst_argb), // %2 "+r"(dst_argb), // %2
"+r"(dst_width) // %3 "+r"(dst_width) // %3
: "r"((int64_t)(src_stepx * 4)) // %4 : "r"((int64_t)(src_stepx * 4)) // %4
: "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
} }
// TODO(Yang Zhang): Investigate less load instructions for // TODO(Yang Zhang): Investigate less load instructions for