mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-06 16:56:55 +08:00
[AArch64] Improve ARGBToUVRow_SVE2 and related kernels
This commit reworks the implementation of ARGBToUVMatrixRow_SVE2, using an approach similar to that recently used in 61bdaee13a701d2b52c6dc943ccc5c888077a591. In particular, we can rework these SVE2 implementations to use 8-bit dot-product instructions instead of 16-bit, allowing us to process more data in a single vector. To ensure that the input values fit in 8 bits, negate the UV constant arrays passed to the kernel and undo the now-unnecessary flipping of the middle two component values. This commit mostly reverses the performance inversion where the Neon I8MM implementation was previously faster than the SVE2 implementation. The change in runtime observed compared to the existing Neon I8MM implementation (negative values are faster) is now: Cortex-A510: +5.6% (!) Cortex-A520: -3.0% Cortex-A710: -12.6% Cortex-A715: -10.9% Cortex-A720: -10.8% Cortex-X2: -3.8% Cortex-X3: -10.3% Cortex-X4: -9.5% Cortex-X925: -6.7% Change-Id: I30253976dc8e3651cfb5fd39b63a6763975d41e3 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6640990 Reviewed-by: Frank Barchard <fbarchard@chromium.org> Reviewed-by: Justin Green <greenjustin@google.com>
This commit is contained in:
parent
1b2f6cdbe8
commit
3d66e94fb5
@ -217,9 +217,7 @@ void NV21ToRGB24Row_SVE2(const uint8_t* src_y,
|
|||||||
NV21ToRGB24Row_SVE_SC(src_y, src_vu, dst_rgb24, yuvconstants, width);
|
NV21ToRGB24Row_SVE_SC(src_y, src_vu, dst_rgb24, yuvconstants, width);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Dot-product constants are stored as four-tuples with the two innermost
|
// SVE constants are stored negated such that we can store 128 in int8_t.
|
||||||
// elements flipped to account for the interleaving nature of the widening
|
|
||||||
// addition instructions.
|
|
||||||
|
|
||||||
// RGB to BT601 coefficients
|
// RGB to BT601 coefficients
|
||||||
// UB 0.875 coefficient = 112
|
// UB 0.875 coefficient = 112
|
||||||
@ -229,25 +227,24 @@ void NV21ToRGB24Row_SVE2(const uint8_t* src_y,
|
|||||||
// VG -0.7344 coefficient = -94
|
// VG -0.7344 coefficient = -94
|
||||||
// VR 0.875 coefficient = 112
|
// VR 0.875 coefficient = 112
|
||||||
|
|
||||||
// SVE constants are not negated
|
static const int8_t kARGBToUVCoefficients[] = {
|
||||||
static const int16_t kARGBToUVCoefficients[] = {
|
// -UB, -UG, -UR, 0, -VB, -VG, -VR, 0
|
||||||
// UB, -UR, -UG, 0, -VB, VR, -VG, 0
|
-112, 74, 38, 0, 18, 94, -112, 0,
|
||||||
112, -38, -74, 0, -18, 112, -94, 0,
|
|
||||||
};
|
};
|
||||||
|
|
||||||
static const int16_t kRGBAToUVCoefficients[] = {
|
static const int8_t kABGRToUVCoefficients[] = {
|
||||||
// 0, -UG, UB, -UR, 0, -VG, -VB, VR
|
// -UR, -UG, -UB, 0, -VR, -VG, -VB, 0
|
||||||
0, -74, 112, -38, 0, -94, -18, 112,
|
38, 74, -112, 0, -112, 94, 18, 0,
|
||||||
};
|
};
|
||||||
|
|
||||||
static const int16_t kBGRAToUVCoefficients[] = {
|
static const int8_t kBGRAToUVCoefficients[] = {
|
||||||
// 0, -UG, -UR, UB, 0, -VG, VR, -VB
|
// 0, -UR, -UG, -UB, 0, -VR, -VG, -VB
|
||||||
0, -74, -38, 112, 0, -94, 112, -18,
|
0, 38, 74, -112, 0, -112, 94, 18,
|
||||||
};
|
};
|
||||||
|
|
||||||
static const int16_t kABGRToUVCoefficients[] = {
|
static const int8_t kRGBAToUVCoefficients[] = {
|
||||||
// -UR, UB, -UG, 0, VR, -VB, -VG, 0
|
// 0, -UB, -UG, -UR, 0, -VB, -VG, -VR
|
||||||
-38, 112, -74, 0, 112, -18, -94, 0,
|
0, -112, 74, 38, 0, 18, 94, -112,
|
||||||
};
|
};
|
||||||
|
|
||||||
// RGB to JPEG coefficients
|
// RGB to JPEG coefficients
|
||||||
@ -258,169 +255,138 @@ static const int16_t kABGRToUVCoefficients[] = {
|
|||||||
// VG -0.41869 coefficient = -107
|
// VG -0.41869 coefficient = -107
|
||||||
// VR 0.500 coefficient = 128
|
// VR 0.500 coefficient = 128
|
||||||
|
|
||||||
static const int16_t kARGBToUVJCoefficients[] = {
|
static const int8_t kARGBToUVJCoefficients[] = {
|
||||||
// UB, -UR, -UG, 0, -VB, VR, -VG, 0
|
// -UB, -UG, -UR, 0, -VB, -VG, -VR, 0
|
||||||
128, -43, -85, 0, -21, 128, -107, 0,
|
-128, 85, 43, 0, 21, 107, -128, 0,
|
||||||
};
|
};
|
||||||
|
|
||||||
static const int16_t kABGRToUVJCoefficients[] = {
|
static const int8_t kABGRToUVJCoefficients[] = {
|
||||||
// -UR, UB, -UG, 0, VR, -VB, -VG, 0
|
// -UR, -UG, -UB, 0, -VR, -VG, -VB, 0
|
||||||
-43, 128, -85, 0, 128, -21, -107, 0,
|
43, 85, -128, 0, -128, 107, 21, 0,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
#define ABCDTOUVMATRIX_SVE \
|
||||||
|
"ld1d {z0.d}, p1/z, [%[src0]] \n" /* ABCD(bgra) */ \
|
||||||
|
"ld1d {z1.d}, p2/z, [%[src0], #1, mul vl] \n" /* EFGH(bgra) */ \
|
||||||
|
"ld1d {z2.d}, p3/z, [%[src0], #2, mul vl] \n" /* IJKL(bgra) */ \
|
||||||
|
"ld1d {z3.d}, p4/z, [%[src0], #3, mul vl] \n" /* MNOP(bgra) */ \
|
||||||
|
"ld1d {z4.d}, p1/z, [%[src1]] \n" /* ABCD(bgra) */ \
|
||||||
|
"ld1d {z5.d}, p2/z, [%[src1], #1, mul vl] \n" /* EFGH(bgra) */ \
|
||||||
|
"ld1d {z6.d}, p3/z, [%[src1], #2, mul vl] \n" /* IJKL(bgra) */ \
|
||||||
|
"ld1d {z7.d}, p4/z, [%[src1], #3, mul vl] \n" /* MNOP(bgra) */ \
|
||||||
|
"incb %[src0], all, mul #4 \n" \
|
||||||
|
"incb %[src1], all, mul #4 \n" \
|
||||||
|
\
|
||||||
|
"uaddlb z16.h, z0.b, z4.b \n" /* ABCD(br) */ \
|
||||||
|
"uaddlb z18.h, z1.b, z5.b \n" /* EFGH(br) */ \
|
||||||
|
"uaddlb z20.h, z2.b, z6.b \n" /* IJKL(br) */ \
|
||||||
|
"uaddlb z22.h, z3.b, z7.b \n" /* MNOP(br) */ \
|
||||||
|
"uaddlt z17.h, z0.b, z4.b \n" /* ABCD(ga) */ \
|
||||||
|
"uaddlt z19.h, z1.b, z5.b \n" /* EFGH(ga) */ \
|
||||||
|
"uaddlt z21.h, z2.b, z6.b \n" /* IJKL(ga) */ \
|
||||||
|
"uaddlt z23.h, z3.b, z7.b \n" /* MNOP(ga) */ \
|
||||||
|
\
|
||||||
|
/* Use ADDP on 32-bit elements to add adjacent pairs of 9-bit unsigned */ \
|
||||||
|
"addp z16.s, p0/m, z16.s, z18.s \n" /* ABEFCDGH(br) */ \
|
||||||
|
"addp z17.s, p0/m, z17.s, z19.s \n" /* ABEFCDGH(ga) */ \
|
||||||
|
"addp z20.s, p0/m, z20.s, z22.s \n" /* IJMNKLOP(br) */ \
|
||||||
|
"addp z21.s, p0/m, z21.s, z23.s \n" /* IJMNKLOP(ga) */ \
|
||||||
|
\
|
||||||
|
"rshrnb z0.b, z16.h, #2 \n" /* ABEFCDGH(b0r0) */ \
|
||||||
|
"rshrnb z1.b, z20.h, #2 \n" /* IJMNKLOP(b0r0) */ \
|
||||||
|
"rshrnt z0.b, z17.h, #2 \n" /* ABEFCDGH(bgra) */ \
|
||||||
|
"rshrnt z1.b, z21.h, #2 \n" /* IJMNKLOP(bgra) */ \
|
||||||
|
\
|
||||||
|
"tbl z0.s, {z0.s}, z27.s \n" /* ABCDEFGH */ \
|
||||||
|
"tbl z1.s, {z1.s}, z27.s \n" /* IJKLMNOP */ \
|
||||||
|
\
|
||||||
|
"subs %w[width], %w[width], %w[vl], lsl #2 \n" /* VL per loop */ \
|
||||||
|
\
|
||||||
|
"movi v16.8h, #0 \n" \
|
||||||
|
"movi v17.8h, #0 \n" \
|
||||||
|
"movi v20.8h, #0 \n" \
|
||||||
|
"movi v21.8h, #0 \n" \
|
||||||
|
\
|
||||||
|
"usdot z16.s, z0.b, z24.b \n" \
|
||||||
|
"usdot z17.s, z1.b, z24.b \n" \
|
||||||
|
"usdot z20.s, z0.b, z25.b \n" \
|
||||||
|
"usdot z21.s, z1.b, z25.b \n" \
|
||||||
|
\
|
||||||
|
"subhnb z16.b, z26.h, z16.h \n" /* U */ \
|
||||||
|
"subhnb z20.b, z26.h, z20.h \n" /* V */ \
|
||||||
|
"subhnb z17.b, z26.h, z17.h \n" /* U */ \
|
||||||
|
"subhnb z21.b, z26.h, z21.h \n" /* V */ \
|
||||||
|
\
|
||||||
|
"uzp1 z16.h, z16.h, z17.h \n" \
|
||||||
|
"uzp1 z20.h, z20.h, z21.h \n" \
|
||||||
|
\
|
||||||
|
"st1b {z16.h}, p5, [%[dst_u]] \n" /* U */ \
|
||||||
|
"st1b {z20.h}, p5, [%[dst_v]] \n" /* V */ \
|
||||||
|
"inch %[dst_u] \n" \
|
||||||
|
"inch %[dst_v] \n"
|
||||||
|
|
||||||
static void ARGBToUVMatrixRow_SVE2(const uint8_t* src_argb,
|
static void ARGBToUVMatrixRow_SVE2(const uint8_t* src_argb,
|
||||||
int src_stride_argb,
|
int src_stride_argb,
|
||||||
uint8_t* dst_u,
|
uint8_t* dst_u,
|
||||||
uint8_t* dst_v,
|
uint8_t* dst_v,
|
||||||
int width,
|
int width,
|
||||||
const int16_t* uvconstants) {
|
const int8_t* uvconstants) {
|
||||||
const uint8_t* src_argb_1 = src_argb + src_stride_argb;
|
const uint8_t* src_argb_1 = src_argb + src_stride_argb;
|
||||||
uint64_t vl;
|
uint64_t vl;
|
||||||
|
asm("cntd %x0" : "=r"(vl));
|
||||||
|
|
||||||
|
// Width is a multiple of two here, so halve it.
|
||||||
|
width >>= 1;
|
||||||
|
|
||||||
asm volatile(
|
asm volatile(
|
||||||
"ptrue p0.b \n"
|
"ptrue p0.b \n"
|
||||||
"ld1rd {z24.d}, p0/z, [%[uvconstants]] \n"
|
"ld1rw {z24.s}, p0/z, [%[uvconstants]] \n"
|
||||||
"ld1rd {z25.d}, p0/z, [%[uvconstants], #8] \n"
|
"ld1rw {z25.s}, p0/z, [%[uvconstants], #4] \n"
|
||||||
"mov z26.h, #0x8000 \n" // 128.0 (0x8000)
|
"mov z26.h, #0x8000 \n" // 128.0 (0x8000)
|
||||||
"cntb %[vl] \n"
|
|
||||||
"subs %w[width], %w[width], %w[vl] \n"
|
|
||||||
"b.lt 2f \n"
|
|
||||||
|
|
||||||
// Process 4x vectors from each input row per iteration.
|
// Generate some TBL indices to undo the interleaving from ADDP.
|
||||||
// Cannot use predication here due to unrolling.
|
"index z0.s, #0, #1 \n"
|
||||||
"1: \n" // e.g.
|
"index z1.s, #1, #1 \n"
|
||||||
"ld1b {z0.b}, p0/z, [%[src0], #0, mul vl] \n" // bgrabgra
|
"uzp1 z27.s, z0.s, z1.s \n"
|
||||||
"ld1b {z4.b}, p0/z, [%[src1], #0, mul vl] \n" // bgrabgra
|
|
||||||
"ld1b {z1.b}, p0/z, [%[src0], #1, mul vl] \n" // bgrabgra
|
|
||||||
"ld1b {z5.b}, p0/z, [%[src1], #1, mul vl] \n" // bgrabgra
|
|
||||||
"ld1b {z2.b}, p0/z, [%[src0], #2, mul vl] \n" // bgrabgra
|
|
||||||
"ld1b {z6.b}, p0/z, [%[src1], #2, mul vl] \n" // bgrabgra
|
|
||||||
"ld1b {z3.b}, p0/z, [%[src0], #3, mul vl] \n" // bgrabgra
|
|
||||||
"ld1b {z7.b}, p0/z, [%[src1], #3, mul vl] \n" // bgrabgra
|
|
||||||
"incb %[src0], all, mul #4 \n"
|
|
||||||
"incb %[src1], all, mul #4 \n"
|
|
||||||
|
|
||||||
"uaddlb z16.h, z0.b, z4.b \n" // brbrbrbr
|
"subs %w[width], %w[width], %w[vl], lsl #2 \n"
|
||||||
"uaddlt z17.h, z0.b, z4.b \n" // gagagaga
|
"b.lt 2f \n"
|
||||||
"uaddlb z18.h, z1.b, z5.b \n" // brbrbrbr
|
|
||||||
"uaddlt z19.h, z1.b, z5.b \n" // gagagaga
|
|
||||||
"uaddlb z20.h, z2.b, z6.b \n" // brbrbrbr
|
|
||||||
"uaddlt z21.h, z2.b, z6.b \n" // gagagaga
|
|
||||||
"uaddlb z22.h, z3.b, z7.b \n" // brbrbrbr
|
|
||||||
"uaddlt z23.h, z3.b, z7.b \n" // gagagaga
|
|
||||||
|
|
||||||
"trn1 z0.s, z16.s, z17.s \n" // brgabgra
|
"ptrue p1.d \n"
|
||||||
"trn2 z1.s, z16.s, z17.s \n" // brgabgra
|
"ptrue p2.d \n"
|
||||||
"trn1 z2.s, z18.s, z19.s \n" // brgabgra
|
"ptrue p3.d \n"
|
||||||
"trn2 z3.s, z18.s, z19.s \n" // brgabgra
|
"ptrue p4.d \n"
|
||||||
"trn1 z4.s, z20.s, z21.s \n" // brgabgra
|
"ptrue p5.h \n"
|
||||||
"trn2 z5.s, z20.s, z21.s \n" // brgabgra
|
"1: \n" //
|
||||||
"trn1 z6.s, z22.s, z23.s \n" // brgabgra
|
ABCDTOUVMATRIX_SVE
|
||||||
"trn2 z7.s, z22.s, z23.s \n" // brgabgra
|
"b.gt 1b \n"
|
||||||
|
|
||||||
"subs %w[width], %w[width], %w[vl] \n" // 4*VL per loop
|
"2: \n"
|
||||||
|
"adds %w[width], %w[width], %w[vl], lsl #2 \n"
|
||||||
|
"b.eq 99f \n"
|
||||||
|
|
||||||
"add z0.h, p0/m, z0.h, z1.h \n" // brgabrga
|
"3: \n"
|
||||||
"add z2.h, p0/m, z2.h, z3.h \n" // brgabrga
|
"whilelt p1.d, wzr, %w[width] \n"
|
||||||
"add z4.h, p0/m, z4.h, z5.h \n" // brgabrga
|
"whilelt p2.d, %w[vl], %w[width] \n"
|
||||||
"add z6.h, p0/m, z6.h, z7.h \n" // brgabrga
|
"whilelt p3.d, %w[vl2], %w[width] \n"
|
||||||
|
"whilelt p4.d, %w[vl3], %w[width] \n"
|
||||||
|
"whilelt p5.h, wzr, %w[width] \n" //
|
||||||
|
ABCDTOUVMATRIX_SVE
|
||||||
|
"b.gt 3b \n"
|
||||||
|
|
||||||
"urshr z0.h, p0/m, z0.h, #2 \n" // brgabrga
|
"99: \n"
|
||||||
"urshr z2.h, p0/m, z2.h, #2 \n" // brgabrga
|
: [src0] "+r"(src_argb), // %[src0]
|
||||||
"urshr z4.h, p0/m, z4.h, #2 \n" // brgabrga
|
[src1] "+r"(src_argb_1), // %[src1]
|
||||||
"urshr z6.h, p0/m, z6.h, #2 \n" // brgabrga
|
[dst_u] "+r"(dst_u), // %[dst_u]
|
||||||
|
[dst_v] "+r"(dst_v), // %[dst_v]
|
||||||
"movi v16.8h, #0 \n"
|
[width] "+r"(width) // %[width]
|
||||||
"movi v17.8h, #0 \n"
|
: [uvconstants] "r"(uvconstants), // %[uvconstants]
|
||||||
"movi v18.8h, #0 \n"
|
[vl] "r"(vl), // %[vl]
|
||||||
"movi v19.8h, #0 \n"
|
[vl2] "r"(vl * 2), // %[vl2]
|
||||||
|
[vl3] "r"(vl * 3) // %[vl3]
|
||||||
"movi v20.8h, #0 \n"
|
|
||||||
"movi v21.8h, #0 \n"
|
|
||||||
"movi v22.8h, #0 \n"
|
|
||||||
"movi v23.8h, #0 \n"
|
|
||||||
|
|
||||||
"sdot z16.d, z0.h, z24.h \n" // UUxxxxxx
|
|
||||||
"sdot z17.d, z2.h, z24.h \n" // UUxxxxxx
|
|
||||||
"sdot z18.d, z4.h, z24.h \n" // UUxxxxxx
|
|
||||||
"sdot z19.d, z6.h, z24.h \n" // UUxxxxxx
|
|
||||||
|
|
||||||
"sdot z20.d, z0.h, z25.h \n" // VVxxxxxx
|
|
||||||
"sdot z21.d, z2.h, z25.h \n" // VVxxxxxx
|
|
||||||
"sdot z22.d, z4.h, z25.h \n" // VVxxxxxx
|
|
||||||
"sdot z23.d, z6.h, z25.h \n" // VVxxxxxx
|
|
||||||
|
|
||||||
"uzp1 z16.s, z16.s, z17.s \n" // UUxx
|
|
||||||
"uzp1 z18.s, z18.s, z19.s \n" // UUxx
|
|
||||||
"uzp1 z20.s, z20.s, z21.s \n" // VVxx
|
|
||||||
"uzp1 z22.s, z22.s, z23.s \n" // VVxx
|
|
||||||
|
|
||||||
"uzp1 z16.h, z16.h, z18.h \n" // UU
|
|
||||||
"uzp1 z20.h, z20.h, z22.h \n" // VV
|
|
||||||
|
|
||||||
"addhnb z16.b, z16.h, z26.h \n" // U
|
|
||||||
"addhnb z20.b, z20.h, z26.h \n" // V
|
|
||||||
|
|
||||||
"st1b {z16.h}, p0, [%[dst_u]] \n" // U
|
|
||||||
"st1b {z20.h}, p0, [%[dst_v]] \n" // V
|
|
||||||
"inch %[dst_u] \n"
|
|
||||||
"inch %[dst_v] \n"
|
|
||||||
|
|
||||||
"b.ge 1b \n"
|
|
||||||
|
|
||||||
"2: \n"
|
|
||||||
"adds %w[width], %w[width], %w[vl] \n" // VL per loop
|
|
||||||
"b.le 99f \n"
|
|
||||||
|
|
||||||
// Process remaining pixels from each input row.
|
|
||||||
// Use predication to do one vector from each input array, so may loop up
|
|
||||||
// to three iterations.
|
|
||||||
"cntw %x[vl] \n"
|
|
||||||
|
|
||||||
"3: \n"
|
|
||||||
"whilelt p1.s, wzr, %w[width] \n"
|
|
||||||
"ld1d {z0.d}, p1/z, [%[src0]] \n" // bgrabgra
|
|
||||||
"ld1d {z4.d}, p1/z, [%[src1]] \n" // bgrabgra
|
|
||||||
"incb %[src0] \n"
|
|
||||||
"incb %[src1] \n"
|
|
||||||
|
|
||||||
"uaddlb z16.h, z0.b, z4.b \n" // brbrbrbr
|
|
||||||
"uaddlt z17.h, z0.b, z4.b \n" // gagagaga
|
|
||||||
|
|
||||||
"trn1 z0.s, z16.s, z17.s \n" // brgabgra
|
|
||||||
"trn2 z1.s, z16.s, z17.s \n" // brgabgra
|
|
||||||
|
|
||||||
"add z0.h, p0/m, z0.h, z1.h \n" // brgabrga
|
|
||||||
|
|
||||||
"urshr z0.h, p0/m, z0.h, #2 \n" // brgabrga
|
|
||||||
|
|
||||||
"subs %w[width], %w[width], %w[vl] \n" // VL per loop
|
|
||||||
|
|
||||||
"movi v16.8h, #0 \n"
|
|
||||||
"movi v20.8h, #0 \n"
|
|
||||||
|
|
||||||
"sdot z16.d, z0.h, z24.h \n"
|
|
||||||
"sdot z20.d, z0.h, z25.h \n"
|
|
||||||
|
|
||||||
"addhnb z16.b, z16.h, z26.h \n" // U
|
|
||||||
"addhnb z20.b, z20.h, z26.h \n" // V
|
|
||||||
|
|
||||||
"st1b {z16.d}, p0, [%[dst_u]] \n" // U
|
|
||||||
"st1b {z20.d}, p0, [%[dst_v]] \n" // V
|
|
||||||
"incd %[dst_u] \n"
|
|
||||||
"incd %[dst_v] \n"
|
|
||||||
"b.gt 3b \n"
|
|
||||||
|
|
||||||
"99: \n"
|
|
||||||
: [src0] "+r"(src_argb), // %[src0]
|
|
||||||
[src1] "+r"(src_argb_1), // %[src1]
|
|
||||||
[dst_u] "+r"(dst_u), // %[dst_u]
|
|
||||||
[dst_v] "+r"(dst_v), // %[dst_v]
|
|
||||||
[width] "+r"(width), // %[width]
|
|
||||||
[vl] "=&r"(vl) // %[vl]
|
|
||||||
: [uvconstants] "r"(uvconstants)
|
|
||||||
: "cc", "memory", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z16",
|
: "cc", "memory", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z16",
|
||||||
"z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26",
|
"z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26",
|
||||||
"p0");
|
"z27", "p0", "p1", "p2", "p3", "p4", "p5");
|
||||||
}
|
}
|
||||||
|
|
||||||
void ARGBToUVRow_SVE2(const uint8_t* src_argb,
|
void ARGBToUVRow_SVE2(const uint8_t* src_argb,
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user