[AArch64] Load full vectors in ARGB{Add,Subtract}Row

Using full vectors for Add and Subtract is a win across the board. Using
full vectors for the multiply is less obviously a win, especially on
smaller cores like Cortex-A53 or Cortex-A57, so it is not considered in
this change.

Observed performance changes relative to the existing Neon code
(negative values are improvements):

            | ARGBAddRow_NEON | ARGBSubtractRow_NEON
 Cortex-A55 |           -5.1% |                -5.1%
Cortex-A510 |          -18.4% |               -18.4%
 Cortex-A76 |          -28.9% |               -28.7%
Cortex-A720 |          -36.1% |               -36.2%
  Cortex-X1 |          -14.2% |               -14.4%
  Cortex-X2 |          -12.5% |               -12.5%

Bug: libyuv:976
Change-Id: I85316d4399c93b53baa62d0d43b2fa453517f5b4
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5457433
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
George Steed 2024-04-16 10:13:39 +01:00 committed by Frank Barchard
parent 90070986ae
commit 4838e7a194

View File

@@ -3819,16 +3819,14 @@ void ARGBAddRow_NEON(const uint8_t* src_argb,
asm volatile(
// 8 pixel loop.
"1: \n"
"ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
"ld1 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
"ldp q0, q1, [%0], #32 \n" // load 8 ARGB
"ldp q4, q5, [%1], #32 \n" // load 8 more
"subs %w3, %w3, #8 \n" // 8 processed per loop.
"uqadd v0.8b, v0.8b, v4.8b \n"
"prfm pldl1keep, [%0, 448] \n"
"uqadd v1.8b, v1.8b, v5.8b \n"
"prfm pldl1keep, [%1, 448] \n"
"uqadd v2.8b, v2.8b, v6.8b \n"
"uqadd v3.8b, v3.8b, v7.8b \n"
"st1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
"uqadd v0.16b, v0.16b, v4.16b \n"
"uqadd v1.16b, v1.16b, v5.16b \n"
"stp q0, q1, [%2], #32 \n" // store 8 ARGB
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(src_argb1), // %1
@@ -3846,16 +3844,14 @@ void ARGBSubtractRow_NEON(const uint8_t* src_argb,
asm volatile(
// 8 pixel loop.
"1: \n"
"ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
"ld1 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
"ldp q0, q1, [%0], #32 \n" // load 8 ARGB
"ldp q4, q5, [%1], #32 \n" // load 8 more
"subs %w3, %w3, #8 \n" // 8 processed per loop.
"uqsub v0.8b, v0.8b, v4.8b \n"
"prfm pldl1keep, [%0, 448] \n"
"uqsub v1.8b, v1.8b, v5.8b \n"
"prfm pldl1keep, [%1, 448] \n"
"uqsub v2.8b, v2.8b, v6.8b \n"
"uqsub v3.8b, v3.8b, v7.8b \n"
"st1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
"uqsub v0.16b, v0.16b, v4.16b \n"
"uqsub v1.16b, v1.16b, v5.16b \n"
"stp q0, q1, [%2], #32 \n" // store 8 ARGB
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(src_argb1), // %1