From 2c32b689e4941290835b7e7c66341edeb6c837b1 Mon Sep 17 00:00:00 2001
From: George Steed
Date: Fri, 29 Nov 2024 14:51:21 +0000
Subject: [PATCH] [AArch64] Improve instruction interleaving in READI210_SVE and READI212_SVE

The existing instruction arrangement is sub-optimal on little cores
since it has instructions with dependencies next to each other, so
spread them out to improve performance.

No significant change observed on bigger cores, but little cores do show
some small improvements except for the *Alpha* kernels which regress
slightly.

Runtimes observed compared to the previous SVE implementation:

                   | Cortex-A510 | Cortex-A520
I210AlphaToARGBRow |   (!) +7.0% |   (!) +6.8%
I210ToAR30Row      |      -10.3% |       -9.9%
I210ToARGBRow      |       -2.4% |       -2.3%
I212ToAR30Row      |      -10.3% |       -9.9%
I212ToARGBRow      |       -2.4% |       -2.3%

Change-Id: I626942ce02c4610cfac1ea4f8e7890653ee4324f
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6067150
Reviewed-by: Frank Barchard
---
 source/row_sve.cc | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/source/row_sve.cc b/source/row_sve.cc
index 92b7b23c5..cf764b174 100644
--- a/source/row_sve.cc
+++ b/source/row_sve.cc
@@ -80,35 +80,35 @@ extern "C" {
 
 #define READI210_SVE \
   "ld1h {z3.h}, p1/z, [%[src_y]] \n" \
-  "lsl z0.h, z3.h, #6 \n" \
-  "usra z0.h, z3.h, #4 \n" \
   "ld1h {z1.s}, p1/z, [%[src_u]] \n" \
   "ld1h {z2.s}, p1/z, [%[src_v]] \n" \
   "incb %[src_y] \n" \
   "inch %[src_u] \n" \
   "inch %[src_v] \n" \
+  "lsl z0.h, z3.h, #6 \n" \
+  "trn1 z1.h, z1.h, z1.h \n" \
+  "trn1 z2.h, z2.h, z2.h \n" \
   "prfm pldl1keep, [%[src_y], 448] \n" \
   "prfm pldl1keep, [%[src_u], 128] \n" \
   "prfm pldl1keep, [%[src_v], 128] \n" \
-  "trn1 z1.h, z1.h, z1.h \n" \
-  "trn1 z2.h, z2.h, z2.h \n" \
+  "usra z0.h, z3.h, #4 \n" \
   "uqshrnb z1.b, z1.h, #2 \n" \
   "uqshrnb z2.b, z2.h, #2 \n"
 
 #define READI212_SVE \
   "ld1h {z3.h}, p1/z, [%[src_y]] \n" \
-  "lsl z0.h, z3.h, #4 \n" \
-  "usra z0.h, z3.h, #8 \n" \
   "ld1h {z1.s}, p1/z, [%[src_u]] \n" \
   "ld1h {z2.s}, p1/z, [%[src_v]] \n" \
   "incb %[src_y] \n" \
   "inch %[src_u] \n" \
   "inch %[src_v] \n" \
+  "lsl z0.h, z3.h, #4 \n" \
+  "trn1 z1.h, z1.h, z1.h \n" \
+  "trn1 z2.h, z2.h, z2.h \n" \
   "prfm pldl1keep, [%[src_y], 448] \n" \
   "prfm pldl1keep, [%[src_u], 128] \n" \
   "prfm pldl1keep, [%[src_v], 128] \n" \
-  "trn1 z1.h, z1.h, z1.h \n" \
-  "trn1 z2.h, z2.h, z2.h \n" \
+  "usra z0.h, z3.h, #8 \n" \
   "uqshrnb z1.b, z1.h, #4 \n" \
   "uqshrnb z2.b, z2.h, #4 \n"