[AArch64] Add SVE2 impls for {UYVY,YUY2}ToARGBRow

This is mostly similar to the existing NV{12,21}ToARGBRow_SVE2 kernels
except reading the YUV components all from the same interleaved input
array. We load four-byte elements and then use TBL to de-interleave the
UV components.

Unlike the NV{12,21} cases we need to de-interleave bytes rather than
widened 16-bit elements. Since we need a TBL instruction already it
would ordinarily be possible to perform the zero-extension from bytes to
16-bit elements by setting the index for every other byte to be out of
range. Such an approach does not work in SVE at a vector length of
2048 bits, since all possible byte values (0-255) are valid indices into
the vector. We instead get around this by rewriting the I4XXTORGB_SVE
macro to perform widening multiplies, operating on the low byte of each
16-bit UV element instead of the full value and therefore eliminating
the need for a zero-extension.

Observed reductions in runtimes compared to the existing Neon code:

            | UYVYToARGBRow | YUY2ToARGBRow
Cortex-A510 |        -30.2% |        -30.2%
Cortex-A720 |         -4.8% |         -4.7%
  Cortex-X2 |         -9.6% |        -10.1%

Bug: libyuv:973
Change-Id: I841a049aba020d0517563d24d2f14f4d1221ebc6
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5622132
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
George Steed 2024-04-24 14:23:16 +01:00 committed by Frank Barchard
parent cd4113f4e8
commit 367dd50755
3 changed files with 162 additions and 11 deletions

View File

@ -594,6 +594,8 @@ extern "C" {
#define HAS_NV12TOARGBROW_SVE2
#define HAS_NV21TOARGBROW_SVE2
#define HAS_RGBATOUVROW_SVE2
#define HAS_UYVYTOARGBROW_SVE2
#define HAS_YUY2TOARGBROW_SVE2
#endif
// The following are available on AArch64 platforms:
@ -1233,10 +1235,18 @@ void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void YUY2ToARGBRow_SVE2(const uint8_t* src_yuy2,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void UYVYToARGBRow_NEON(const uint8_t* src_uyvy,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void UYVYToARGBRow_SVE2(const uint8_t* src_uyvy,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void I444ToARGBRow_RVV(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,

View File

@ -4626,6 +4626,11 @@ int YUY2ToARGBMatrix(const uint8_t* src_yuy2,
}
}
#endif
#if defined(HAS_YUY2TOARGBROW_SVE2)
if (TestCpuFlag(kCpuHasSVE2)) {
YUY2ToARGBRow = YUY2ToARGBRow_SVE2;
}
#endif
#if defined(HAS_YUY2TOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
YUY2ToARGBRow = YUY2ToARGBRow_Any_MSA;
@ -4714,6 +4719,11 @@ int UYVYToARGBMatrix(const uint8_t* src_uyvy,
}
}
#endif
#if defined(HAS_UYVYTOARGBROW_SVE2)
if (TestCpuFlag(kCpuHasSVE2)) {
UYVYToARGBRow = UYVYToARGBRow_SVE2;
}
#endif
#if defined(HAS_UYVYTOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
UYVYToARGBRow = UYVYToARGBRow_Any_MSA;

View File

@ -63,11 +63,27 @@ extern "C" {
"tbl z2.h, {z1.h}, z23.h \n" /* V0V0 */ \
"tbl z1.h, {z1.h}, z22.h \n" /* U0U0 */
// Load one vector of interleaved YUY2 bytes (Y0 U0 Y1 V0 ...) and split it:
//   z1 = U duplicated into both bytes of each 16-bit lane (U0U0)
//   z2 = V duplicated likewise (V0V0)
//   z0 = Y duplicated likewise (YYYY); only the low byte of each lane is
//        consumed by the widening multiplies in I4XXTORGB_SVE, which is why
//        no explicit zero-extension of U/V/Y is needed.
// Relies on the caller having set up the TBL byte-index vectors z22 (U
// positions) and z23 (V positions) and the load predicate p2. Advances
// src_yuy2 by one vector of bytes (incb).
#define READYUY2_SVE \
"ld1w {z0.s}, p2/z, [%[src_yuy2]] \n" /* YUYV */ \
"incb %[src_yuy2] \n" \
"prfm pldl1keep, [%[src_yuy2], 448] \n" \
"tbl z2.b, {z0.b}, z23.b \n" /* V0V0 */ \
"tbl z1.b, {z0.b}, z22.b \n" /* U0U0 */ \
"trn1 z0.b, z0.b, z0.b \n" /* YYYY */
// Load one vector of interleaved UYVY bytes (U0 Y0 V0 Y1 ...) and split it
// into z1 = U0U0, z2 = V0V0 and z0 = YYYY, mirroring READYUY2_SVE. The only
// differences from the YUY2 variant are the caller-provided TBL indices in
// z22/z23 (U at byte 0, V at byte 2 of each group) and trn2 instead of trn1,
// because Y occupies the odd bytes in UYVY. Advances src_uyvy by one vector
// of bytes.
#define READUYVY_SVE \
"ld1w {z0.s}, p2/z, [%[src_uyvy]] \n" /* UYVY */ \
"incb %[src_uyvy] \n" \
"prfm pldl1keep, [%[src_uyvy], 448] \n" \
"tbl z2.b, {z0.b}, z23.b \n" /* V0V0 */ \
"tbl z1.b, {z0.b}, z22.b \n" /* U0U0 */ \
"trn2 z0.b, z0.b, z0.b \n" /* YYYY */
#define YUVTORGB_SVE_SETUP \
"ld1rb {z28.h}, p0/z, [%[kUVCoeff], #0] \n" \
"ld1rb {z29.h}, p0/z, [%[kUVCoeff], #1] \n" \
"ld1rb {z30.h}, p0/z, [%[kUVCoeff], #2] \n" \
"ld1rb {z31.h}, p0/z, [%[kUVCoeff], #3] \n" \
"ld1rb {z28.b}, p0/z, [%[kUVCoeff], #0] \n" \
"ld1rb {z29.b}, p0/z, [%[kUVCoeff], #1] \n" \
"ld1rb {z30.b}, p0/z, [%[kUVCoeff], #2] \n" \
"ld1rb {z31.b}, p0/z, [%[kUVCoeff], #3] \n" \
"ld1rh {z24.h}, p0/z, [%[kRGBCoeffBias], #0] \n" \
"ld1rh {z25.h}, p0/z, [%[kRGBCoeffBias], #2] \n" \
"ld1rh {z26.h}, p0/z, [%[kRGBCoeffBias], #4] \n" \
@ -75,10 +91,10 @@ extern "C" {
#define I4XXTORGB_SVE \
"umulh z0.h, z24.h, z0.h \n" /* Y */ \
"mul z6.h, z30.h, z1.h \n" \
"mul z4.h, z28.h, z1.h \n" /* DB */ \
"mul z5.h, z29.h, z2.h \n" /* DR */ \
"mla z6.h, p0/m, z31.h, z2.h \n" /* DG */ \
"umullb z6.h, z30.b, z1.b \n" \
"umullb z4.h, z28.b, z1.b \n" /* DB */ \
"umullb z5.h, z29.b, z2.b \n" /* DR */ \
"umlalb z6.h, z31.b, z2.b \n" /* DG */ \
"add z17.h, z0.h, z26.h \n" /* G */ \
"add z16.h, z0.h, z4.h \n" /* B */ \
"add z18.h, z0.h, z5.h \n" /* R */ \
@ -171,9 +187,9 @@ void I400ToARGBRow_SVE2(const uint8_t* src_y,
YUVTORGB_SVE_SETUP
"cmp %w[width], %w[vl] \n"
"mov z1.h, #128 \n" // U/V
"mul z6.h, z30.h, z1.h \n"
"mul z4.h, z28.h, z1.h \n" // DB
"mul z5.h, z29.h, z1.h \n" // DR
"umullb z6.h, z30.b, z1.b \n"
"umullb z4.h, z28.b, z1.b \n" // DB
"umullb z5.h, z29.b, z1.b \n" // DR
"mla z6.h, p0/m, z31.h, z1.h \n" // DG
"sub z4.h, z4.h, z25.h \n"
"sub z5.h, z5.h, z27.h \n"
@ -965,6 +981,121 @@ void AYUVToVURow_SVE2(const uint8_t* src_ayuv,
"p1");
}
// Convert one row of interleaved YUY2 (Y0 U0 Y1 V0 ...) pixels to ARGB using
// SVE2. Processes vl (= CNTH, i.e. one 16-bit lane per pixel) pixels per
// iteration with an all-true predicate, then a single predicated tail
// iteration for the remainder. Assumes width > 0 — TODO(review): confirm the
// dispatcher never calls row functions with width == 0.
void YUY2ToARGBRow_SVE2(const uint8_t* src_yuy2,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
// Byte offsets of U (1) and V (3) within each four-byte YUYV group,
// replicated into both halves of each 32-bit element; stepping by 4 per
// .s element makes INDEX produce the TBL tables that extract U/V and
// duplicate each into both bytes of a 16-bit lane (U0U0 / V0V0).
uint32_t nv_u_start = 0x0001'0001U;
uint32_t nv_u_step = 0x0004'0004U;
uint32_t nv_v_start = 0x0003'0003U;
uint32_t nv_v_step = 0x0004'0004U;
uint64_t vl;
// vl = number of 16-bit lanes in a vector = pixels handled per iteration.
asm("cnth %0" : "=r"(vl));
// Tail lengths: width_last_y counts leftover Y samples; width_last_uv is
// rounded up to even so the predicated word load still covers the UV bytes
// of the final (possibly half) pixel pair.
int width_last_y = width & (vl - 1);
int width_last_uv = width_last_y + (width_last_y & 1);
asm("ptrue p0.b \n"
// Build the U/V extraction tables consumed by READYUY2_SVE.
"index z22.s, %w[nv_u_start], %w[nv_u_step] \n"
"index z23.s, %w[nv_v_start], %w[nv_v_step] \n"
"dup z19.b, #255 \n" // A
YUVTORGB_SVE_SETUP
"subs %w[width], %w[width], %w[vl] \n"
"b.lt 2f \n"
// Run bulk of computation with an all-true predicate to avoid predicate
// generation overhead.
"ptrue p1.h \n"
"ptrue p2.h \n"
"1: \n" //
READYUY2_SVE I4XXTORGB_SVE RGBTOARGB8_SVE
"subs %w[width], %w[width], %w[vl] \n"
"st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n"
"add %[dst_argb], %[dst_argb], %[vl], lsl #2 \n"
"b.ge 1b \n"
"2: \n"
"adds %w[width], %w[width], %w[vl] \n"
"b.eq 99f \n"
// Calculate a predicate for the final iteration to deal with the tail.
"whilelt p1.h, wzr, %w[width_last_y] \n"
"whilelt p2.h, wzr, %w[width_last_uv] \n" //
READYUY2_SVE I4XXTORGB_SVE RGBTOARGB8_SVE
"st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n"
"99: \n"
: [src_yuy2] "+r"(src_yuy2), // %[src_yuy2]
[dst_argb] "+r"(dst_argb), // %[dst_argb]
[width] "+r"(width) // %[width]
: [vl] "r"(vl), // %[vl]
[kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
[kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias]
[nv_u_start] "r"(nv_u_start), // %[nv_u_start]
[nv_u_step] "r"(nv_u_step), // %[nv_u_step]
[nv_v_start] "r"(nv_v_start), // %[nv_v_start]
[nv_v_step] "r"(nv_v_step), // %[nv_v_step]
[width_last_y] "r"(width_last_y), // %[width_last_y]
[width_last_uv] "r"(width_last_uv) // %[width_last_uv]
: "cc", "memory", YUVTORGB_SVE_REGS, "p2");
}
// Convert one row of interleaved UYVY (U0 Y0 V0 Y1 ...) pixels to ARGB using
// SVE2. Identical structure to YUY2ToARGBRow_SVE2, differing only in the
// U/V byte offsets within each four-byte group and in the read macro
// (READUYVY_SVE). Processes vl (= CNTH) pixels per iteration with an
// all-true predicate, then one predicated tail iteration.
// Fixes vs. previous revision: removed a stray duplicate "2:" local label
// before the tail WHILELTs (the branch target is the "2:" after the main
// loop; the second definition was dead) and corrected the operand comment
// for src_uyvy.
void UYVYToARGBRow_SVE2(const uint8_t* src_uyvy,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
// Byte offsets of U (0) and V (2) within each four-byte UYVY group,
// replicated into both halves of each 32-bit element for the INDEX/TBL
// de-interleave (see READUYVY_SVE).
uint32_t nv_u_start = 0x0000'0000U;
uint32_t nv_u_step = 0x0004'0004U;
uint32_t nv_v_start = 0x0002'0002U;
uint32_t nv_v_step = 0x0004'0004U;
uint64_t vl;
// vl = number of 16-bit lanes in a vector = pixels handled per iteration.
asm("cnth %0" : "=r"(vl));
// Tail lengths: leftover Y samples, and the UV count rounded up to even so
// the predicated word load covers the final pixel pair.
int width_last_y = width & (vl - 1);
int width_last_uv = width_last_y + (width_last_y & 1);
asm("ptrue p0.b \n"
"index z22.s, %w[nv_u_start], %w[nv_u_step] \n"
"index z23.s, %w[nv_v_start], %w[nv_v_step] \n"
"dup z19.b, #255 \n" // A
YUVTORGB_SVE_SETUP
"subs %w[width], %w[width], %w[vl] \n"
"b.lt 2f \n"
// Run bulk of computation with an all-true predicate to avoid predicate
// generation overhead.
"ptrue p1.h \n"
"ptrue p2.h \n"
"1: \n" //
READUYVY_SVE I4XXTORGB_SVE RGBTOARGB8_SVE
"subs %w[width], %w[width], %w[vl] \n"
"st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n"
"add %[dst_argb], %[dst_argb], %[vl], lsl #2 \n"
"b.ge 1b \n"
"2: \n"
"adds %w[width], %w[width], %w[vl] \n"
"b.eq 99f \n"
// Calculate a predicate for the final iteration to deal with the tail.
"whilelt p1.h, wzr, %w[width_last_y] \n"
"whilelt p2.h, wzr, %w[width_last_uv] \n" //
READUYVY_SVE I4XXTORGB_SVE RGBTOARGB8_SVE
"st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n"
"99: \n"
: [src_uyvy] "+r"(src_uyvy), // %[src_uyvy]
[dst_argb] "+r"(dst_argb), // %[dst_argb]
[width] "+r"(width) // %[width]
: [vl] "r"(vl), // %[vl]
[kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
[kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias]
[nv_u_start] "r"(nv_u_start), // %[nv_u_start]
[nv_u_step] "r"(nv_u_step), // %[nv_u_step]
[nv_v_start] "r"(nv_v_start), // %[nv_v_start]
[nv_v_step] "r"(nv_v_step), // %[nv_v_step]
[width_last_y] "r"(width_last_y), // %[width_last_y]
[width_last_uv] "r"(width_last_uv) // %[width_last_uv]
: "cc", "memory", YUVTORGB_SVE_REGS, "p2");
}
#endif // !defined(LIBYUV_DISABLE_SVE) && defined(__aarch64__)
#ifdef __cplusplus