[AArch64] Improve ARGBToUVRow_SVE2 and related kernels

This commit reworks the implementation of ARGBToUVMatrixRow_SVE2, using
an approach similar to that recently used in
61bdaee13a701d2b52c6dc943ccc5c888077a591.

In particular we can rework these SVE2 implementations to use 8-bit
dot-product instructions instead of 16-bit, allowing us to process more
data in a single vector.

To ensure that the input values fit in 8 bits, negate the UV constant
arrays passed to the kernel and undo the now-unnecessary flipping of the
middle two component values.

This commit mostly reverses the performance inversion where the Neon
I8MM implementation was previously faster than the SVE2 implementation.
The reduction in runtime observed compared to the existing Neon I8MM
implementation is now:

Cortex-A510:  +5.6% (!)
Cortex-A520:  -3.0%
Cortex-A710: -12.6%
Cortex-A715: -10.9%
Cortex-A720: -10.8%
  Cortex-X2:  -3.8%
  Cortex-X3: -10.3%
  Cortex-X4:  -9.5%
Cortex-X925:  -6.7%

Change-Id: I30253976dc8e3651cfb5fd39b63a6763975d41e3
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6640990
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Justin Green <greenjustin@google.com>
This commit is contained in:
George Steed 2025-03-26 19:44:24 +00:00 committed by Frank Barchard
parent 1b2f6cdbe8
commit 3d66e94fb5

View File

@ -217,9 +217,7 @@ void NV21ToRGB24Row_SVE2(const uint8_t* src_y,
NV21ToRGB24Row_SVE_SC(src_y, src_vu, dst_rgb24, yuvconstants, width);
}
// Dot-product constants are stored as four-tuples with the two innermost
// elements flipped to account for the interleaving nature of the widening
// addition instructions.
// SVE constants are stored negated such that we can store 128 in int8_t.
// RGB to BT601 coefficients
// UB 0.875 coefficient = 112
@ -229,25 +227,24 @@ void NV21ToRGB24Row_SVE2(const uint8_t* src_y,
// VG -0.7344 coefficient = -94
// VR 0.875 coefficient = 112
// SVE constants are not negated
static const int16_t kARGBToUVCoefficients[] = {
// UB, -UR, -UG, 0, -VB, VR, -VG, 0
112, -38, -74, 0, -18, 112, -94, 0,
static const int8_t kARGBToUVCoefficients[] = {
// -UB, -UG, -UR, 0, -VB, -VG, -VR, 0
-112, 74, 38, 0, 18, 94, -112, 0,
};
static const int16_t kRGBAToUVCoefficients[] = {
// 0, -UG, UB, -UR, 0, -VG, -VB, VR
0, -74, 112, -38, 0, -94, -18, 112,
static const int8_t kABGRToUVCoefficients[] = {
// -UR, -UG, -UB, 0, -VR, -VG, -VB, 0
38, 74, -112, 0, -112, 94, 18, 0,
};
static const int16_t kBGRAToUVCoefficients[] = {
// 0, -UG, -UR, UB, 0, -VG, VR, -VB
0, -74, -38, 112, 0, -94, 112, -18,
static const int8_t kBGRAToUVCoefficients[] = {
// 0, -UR, -UG, -UB, 0, -VR, -VG, -VB
0, 38, 74, -112, 0, -112, 94, 18,
};
static const int16_t kABGRToUVCoefficients[] = {
// -UR, UB, -UG, 0, VR, -VB, -VG, 0
-38, 112, -74, 0, 112, -18, -94, 0,
static const int8_t kRGBAToUVCoefficients[] = {
// 0, -UB, -UG, -UR, 0, -VB, -VG, -VR
0, -112, 74, 38, 0, 18, 94, -112,
};
// RGB to JPEG coefficients
@ -258,169 +255,138 @@ static const int16_t kABGRToUVCoefficients[] = {
// VG -0.41869 coefficient = -107
// VR 0.500 coefficient = 128
static const int16_t kARGBToUVJCoefficients[] = {
// UB, -UR, -UG, 0, -VB, VR, -VG, 0
128, -43, -85, 0, -21, 128, -107, 0,
static const int8_t kARGBToUVJCoefficients[] = {
// -UB, -UG, -UR, 0, -VB, -VG, -VR, 0
-128, 85, 43, 0, 21, 107, -128, 0,
};
static const int16_t kABGRToUVJCoefficients[] = {
// -UR, UB, -UG, 0, VR, -VB, -VG, 0
-43, 128, -85, 0, 128, -21, -107, 0,
static const int8_t kABGRToUVJCoefficients[] = {
// -UR, -UG, -UB, 0, -VR, -VG, -VB, 0
43, 85, -128, 0, -128, 107, 21, 0,
};
#define ABCDTOUVMATRIX_SVE \
"ld1d {z0.d}, p1/z, [%[src0]] \n" /* ABCD(bgra) */ \
"ld1d {z1.d}, p2/z, [%[src0], #1, mul vl] \n" /* EFGH(bgra) */ \
"ld1d {z2.d}, p3/z, [%[src0], #2, mul vl] \n" /* IJKL(bgra) */ \
"ld1d {z3.d}, p4/z, [%[src0], #3, mul vl] \n" /* MNOP(bgra) */ \
"ld1d {z4.d}, p1/z, [%[src1]] \n" /* ABCD(bgra) */ \
"ld1d {z5.d}, p2/z, [%[src1], #1, mul vl] \n" /* EFGH(bgra) */ \
"ld1d {z6.d}, p3/z, [%[src1], #2, mul vl] \n" /* IJKL(bgra) */ \
"ld1d {z7.d}, p4/z, [%[src1], #3, mul vl] \n" /* MNOP(bgra) */ \
"incb %[src0], all, mul #4 \n" \
"incb %[src1], all, mul #4 \n" \
\
"uaddlb z16.h, z0.b, z4.b \n" /* ABCD(br) */ \
"uaddlb z18.h, z1.b, z5.b \n" /* EFGH(br) */ \
"uaddlb z20.h, z2.b, z6.b \n" /* IJKL(br) */ \
"uaddlb z22.h, z3.b, z7.b \n" /* MNOP(br) */ \
"uaddlt z17.h, z0.b, z4.b \n" /* ABCD(ga) */ \
"uaddlt z19.h, z1.b, z5.b \n" /* EFGH(ga) */ \
"uaddlt z21.h, z2.b, z6.b \n" /* IJKL(ga) */ \
"uaddlt z23.h, z3.b, z7.b \n" /* MNOP(ga) */ \
\
/* Use ADDP on 32-bit elements to add adjacent pairs of 9-bit unsigned */ \
"addp z16.s, p0/m, z16.s, z18.s \n" /* ABEFCDGH(br) */ \
"addp z17.s, p0/m, z17.s, z19.s \n" /* ABEFCDGH(ga) */ \
"addp z20.s, p0/m, z20.s, z22.s \n" /* IJMNKLOP(br) */ \
"addp z21.s, p0/m, z21.s, z23.s \n" /* IJMNKLOP(ga) */ \
\
"rshrnb z0.b, z16.h, #2 \n" /* ABEFCDGH(b0r0) */ \
"rshrnb z1.b, z20.h, #2 \n" /* IJMNKLOP(b0r0) */ \
"rshrnt z0.b, z17.h, #2 \n" /* ABEFCDGH(bgra) */ \
"rshrnt z1.b, z21.h, #2 \n" /* IJMNKLOP(bgra) */ \
\
"tbl z0.s, {z0.s}, z27.s \n" /* ABCDEFGH */ \
"tbl z1.s, {z1.s}, z27.s \n" /* IJKLMNOP */ \
\
"subs %w[width], %w[width], %w[vl], lsl #2 \n" /* VL per loop */ \
\
"movi v16.8h, #0 \n" \
"movi v17.8h, #0 \n" \
"movi v20.8h, #0 \n" \
"movi v21.8h, #0 \n" \
\
"usdot z16.s, z0.b, z24.b \n" \
"usdot z17.s, z1.b, z24.b \n" \
"usdot z20.s, z0.b, z25.b \n" \
"usdot z21.s, z1.b, z25.b \n" \
\
"subhnb z16.b, z26.h, z16.h \n" /* U */ \
"subhnb z20.b, z26.h, z20.h \n" /* V */ \
"subhnb z17.b, z26.h, z17.h \n" /* U */ \
"subhnb z21.b, z26.h, z21.h \n" /* V */ \
\
"uzp1 z16.h, z16.h, z17.h \n" \
"uzp1 z20.h, z20.h, z21.h \n" \
\
"st1b {z16.h}, p5, [%[dst_u]] \n" /* U */ \
"st1b {z20.h}, p5, [%[dst_v]] \n" /* V */ \
"inch %[dst_u] \n" \
"inch %[dst_v] \n"
static void ARGBToUVMatrixRow_SVE2(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width,
const int16_t* uvconstants) {
const int8_t* uvconstants) {
const uint8_t* src_argb_1 = src_argb + src_stride_argb;
uint64_t vl;
asm("cntd %x0" : "=r"(vl));
// Width is a multiple of two here, so halve it.
width >>= 1;
asm volatile(
"ptrue p0.b \n"
"ld1rd {z24.d}, p0/z, [%[uvconstants]] \n"
"ld1rd {z25.d}, p0/z, [%[uvconstants], #8] \n"
"mov z26.h, #0x8000 \n" // 128.0 (0x8000)
"cntb %[vl] \n"
"subs %w[width], %w[width], %w[vl] \n"
"b.lt 2f \n"
"ptrue p0.b \n"
"ld1rw {z24.s}, p0/z, [%[uvconstants]] \n"
"ld1rw {z25.s}, p0/z, [%[uvconstants], #4] \n"
"mov z26.h, #0x8000 \n" // 128.0 (0x8000)
// Process 4x vectors from each input row per iteration.
// Cannot use predication here due to unrolling.
"1: \n" // e.g.
"ld1b {z0.b}, p0/z, [%[src0], #0, mul vl] \n" // bgrabgra
"ld1b {z4.b}, p0/z, [%[src1], #0, mul vl] \n" // bgrabgra
"ld1b {z1.b}, p0/z, [%[src0], #1, mul vl] \n" // bgrabgra
"ld1b {z5.b}, p0/z, [%[src1], #1, mul vl] \n" // bgrabgra
"ld1b {z2.b}, p0/z, [%[src0], #2, mul vl] \n" // bgrabgra
"ld1b {z6.b}, p0/z, [%[src1], #2, mul vl] \n" // bgrabgra
"ld1b {z3.b}, p0/z, [%[src0], #3, mul vl] \n" // bgrabgra
"ld1b {z7.b}, p0/z, [%[src1], #3, mul vl] \n" // bgrabgra
"incb %[src0], all, mul #4 \n"
"incb %[src1], all, mul #4 \n"
// Generate some TBL indices to undo the interleaving from ADDP.
"index z0.s, #0, #1 \n"
"index z1.s, #1, #1 \n"
"uzp1 z27.s, z0.s, z1.s \n"
"uaddlb z16.h, z0.b, z4.b \n" // brbrbrbr
"uaddlt z17.h, z0.b, z4.b \n" // gagagaga
"uaddlb z18.h, z1.b, z5.b \n" // brbrbrbr
"uaddlt z19.h, z1.b, z5.b \n" // gagagaga
"uaddlb z20.h, z2.b, z6.b \n" // brbrbrbr
"uaddlt z21.h, z2.b, z6.b \n" // gagagaga
"uaddlb z22.h, z3.b, z7.b \n" // brbrbrbr
"uaddlt z23.h, z3.b, z7.b \n" // gagagaga
"subs %w[width], %w[width], %w[vl], lsl #2 \n"
"b.lt 2f \n"
"trn1 z0.s, z16.s, z17.s \n" // brgabgra
"trn2 z1.s, z16.s, z17.s \n" // brgabgra
"trn1 z2.s, z18.s, z19.s \n" // brgabgra
"trn2 z3.s, z18.s, z19.s \n" // brgabgra
"trn1 z4.s, z20.s, z21.s \n" // brgabgra
"trn2 z5.s, z20.s, z21.s \n" // brgabgra
"trn1 z6.s, z22.s, z23.s \n" // brgabgra
"trn2 z7.s, z22.s, z23.s \n" // brgabgra
"ptrue p1.d \n"
"ptrue p2.d \n"
"ptrue p3.d \n"
"ptrue p4.d \n"
"ptrue p5.h \n"
"1: \n" //
ABCDTOUVMATRIX_SVE
"b.gt 1b \n"
"subs %w[width], %w[width], %w[vl] \n" // 4*VL per loop
"2: \n"
"adds %w[width], %w[width], %w[vl], lsl #2 \n"
"b.eq 99f \n"
"add z0.h, p0/m, z0.h, z1.h \n" // brgabrga
"add z2.h, p0/m, z2.h, z3.h \n" // brgabrga
"add z4.h, p0/m, z4.h, z5.h \n" // brgabrga
"add z6.h, p0/m, z6.h, z7.h \n" // brgabrga
"3: \n"
"whilelt p1.d, wzr, %w[width] \n"
"whilelt p2.d, %w[vl], %w[width] \n"
"whilelt p3.d, %w[vl2], %w[width] \n"
"whilelt p4.d, %w[vl3], %w[width] \n"
"whilelt p5.h, wzr, %w[width] \n" //
ABCDTOUVMATRIX_SVE
"b.gt 3b \n"
"urshr z0.h, p0/m, z0.h, #2 \n" // brgabrga
"urshr z2.h, p0/m, z2.h, #2 \n" // brgabrga
"urshr z4.h, p0/m, z4.h, #2 \n" // brgabrga
"urshr z6.h, p0/m, z6.h, #2 \n" // brgabrga
"movi v16.8h, #0 \n"
"movi v17.8h, #0 \n"
"movi v18.8h, #0 \n"
"movi v19.8h, #0 \n"
"movi v20.8h, #0 \n"
"movi v21.8h, #0 \n"
"movi v22.8h, #0 \n"
"movi v23.8h, #0 \n"
"sdot z16.d, z0.h, z24.h \n" // UUxxxxxx
"sdot z17.d, z2.h, z24.h \n" // UUxxxxxx
"sdot z18.d, z4.h, z24.h \n" // UUxxxxxx
"sdot z19.d, z6.h, z24.h \n" // UUxxxxxx
"sdot z20.d, z0.h, z25.h \n" // VVxxxxxx
"sdot z21.d, z2.h, z25.h \n" // VVxxxxxx
"sdot z22.d, z4.h, z25.h \n" // VVxxxxxx
"sdot z23.d, z6.h, z25.h \n" // VVxxxxxx
"uzp1 z16.s, z16.s, z17.s \n" // UUxx
"uzp1 z18.s, z18.s, z19.s \n" // UUxx
"uzp1 z20.s, z20.s, z21.s \n" // VVxx
"uzp1 z22.s, z22.s, z23.s \n" // VVxx
"uzp1 z16.h, z16.h, z18.h \n" // UU
"uzp1 z20.h, z20.h, z22.h \n" // VV
"addhnb z16.b, z16.h, z26.h \n" // U
"addhnb z20.b, z20.h, z26.h \n" // V
"st1b {z16.h}, p0, [%[dst_u]] \n" // U
"st1b {z20.h}, p0, [%[dst_v]] \n" // V
"inch %[dst_u] \n"
"inch %[dst_v] \n"
"b.ge 1b \n"
"2: \n"
"adds %w[width], %w[width], %w[vl] \n" // VL per loop
"b.le 99f \n"
// Process remaining pixels from each input row.
// Use predication to do one vector from each input array, so may loop up
// to three iterations.
"cntw %x[vl] \n"
"3: \n"
"whilelt p1.s, wzr, %w[width] \n"
"ld1d {z0.d}, p1/z, [%[src0]] \n" // bgrabgra
"ld1d {z4.d}, p1/z, [%[src1]] \n" // bgrabgra
"incb %[src0] \n"
"incb %[src1] \n"
"uaddlb z16.h, z0.b, z4.b \n" // brbrbrbr
"uaddlt z17.h, z0.b, z4.b \n" // gagagaga
"trn1 z0.s, z16.s, z17.s \n" // brgabgra
"trn2 z1.s, z16.s, z17.s \n" // brgabgra
"add z0.h, p0/m, z0.h, z1.h \n" // brgabrga
"urshr z0.h, p0/m, z0.h, #2 \n" // brgabrga
"subs %w[width], %w[width], %w[vl] \n" // VL per loop
"movi v16.8h, #0 \n"
"movi v20.8h, #0 \n"
"sdot z16.d, z0.h, z24.h \n"
"sdot z20.d, z0.h, z25.h \n"
"addhnb z16.b, z16.h, z26.h \n" // U
"addhnb z20.b, z20.h, z26.h \n" // V
"st1b {z16.d}, p0, [%[dst_u]] \n" // U
"st1b {z20.d}, p0, [%[dst_v]] \n" // V
"incd %[dst_u] \n"
"incd %[dst_v] \n"
"b.gt 3b \n"
"99: \n"
: [src0] "+r"(src_argb), // %[src0]
[src1] "+r"(src_argb_1), // %[src1]
[dst_u] "+r"(dst_u), // %[dst_u]
[dst_v] "+r"(dst_v), // %[dst_v]
[width] "+r"(width), // %[width]
[vl] "=&r"(vl) // %[vl]
: [uvconstants] "r"(uvconstants)
"99: \n"
: [src0] "+r"(src_argb), // %[src0]
[src1] "+r"(src_argb_1), // %[src1]
[dst_u] "+r"(dst_u), // %[dst_u]
[dst_v] "+r"(dst_v), // %[dst_v]
[width] "+r"(width) // %[width]
: [uvconstants] "r"(uvconstants), // %[uvconstants]
[vl] "r"(vl), // %[vl]
[vl2] "r"(vl * 2), // %[vl2]
[vl3] "r"(vl * 3) // %[vl3]
: "cc", "memory", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z16",
"z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26",
"p0");
"z27", "p0", "p1", "p2", "p3", "p4", "p5");
}
void ARGBToUVRow_SVE2(const uint8_t* src_argb,