[AArch64] Add SVE2 impls for {UYVY,YUY2}ToARGBRow
This is mostly similar to the existing NV{12,21}ToARGBRow_SVE2 kernels,
except that the Y, U and V components are all read from the same
interleaved input array. We load four-byte elements and then use TBL to
de-interleave the UV components.
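
For illustration only (not part of this change), the byte selection that the
TBL index vectors encode corresponds to the following scalar C; the function
name is made up for this sketch:

  #include <stdint.h>

  // YUY2 stores two pixels per four bytes as Y0 U Y1 V (UYVY as U Y0 V Y1).
  // The low byte of each 16-bit TBL index (1 and 3 for YUY2, 0 and 2 for
  // UYVY, stepping by 4) picks the U or V byte out of each four-byte group.
  static void Yuy2SplitUV_C(const uint8_t* src_yuy2, int width, uint8_t* dst_u,
                            uint8_t* dst_v) {
    for (int i = 0; i < width / 2; ++i) {
      dst_u[i] = src_yuy2[4 * i + 1];  // U shared by the pixel pair
      dst_v[i] = src_yuy2[4 * i + 3];  // V shared by the pixel pair
    }
  }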
Unlike the NV{12,21} cases we need to de-interleave bytes rather than
widened 16-bit elements. Since we already need a TBL instruction, it
would ordinarily be possible to perform the zero-extension from bytes to
16-bit elements by setting the index for every other byte to be out of
range. That approach does not work in SVE: at a vector length of 2048
bits every possible byte value (0-255) is a valid index into the vector,
so no index is guaranteed to be out of range. We instead get around this
by rewriting the I4XXTORGB_SVE macro to perform widening multiplies,
operating on the low byte of each 16-bit UV element instead of the full
value and therefore eliminating the need for a zero-extension.
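
As a standalone sanity check (illustrative C, not libyuv code): after the
byte-wise TBL the U/V value sits in the low byte of each 16-bit lane with a
leftover Y byte above it, and a bottom-lane widening multiply such as UMULLB
reads only that low byte, so it produces the same product as zero-extending
the byte and doing a full 16-bit multiply:

  #include <assert.h>
  #include <stdint.h>

  int main(void) {
    for (unsigned coeff = 0; coeff < 256; ++coeff) {
      for (unsigned uv = 0; uv < 256; ++uv) {
        uint16_t lane = (uint16_t)(0xAB00u | uv);  // junk Y byte above UV
        uint16_t bottom_widening = (uint16_t)((lane & 0xFFu) * coeff);  // UMULLB-style
        uint16_t zero_extended = (uint16_t)(uv * coeff);  // UXTB, then 16-bit MUL
        assert(bottom_widening == zero_extended);
      }
    }
    return 0;
  }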
Observed reductions in runtime compared to the existing Neon code:

              | UYVYToARGBRow | YUY2ToARGBRow
  Cortex-A510 |        -30.2% |        -30.2%
  Cortex-A720 |         -4.8% |         -4.7%
  Cortex-X2   |         -9.6% |        -10.1%
Bug: libyuv:973
Change-Id: I841a049aba020d0517563d24d2f14f4d1221ebc6
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5622132
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
commit 367dd50755
parent cd4113f4e8
@@ -594,6 +594,8 @@ extern "C" {
 #define HAS_NV12TOARGBROW_SVE2
 #define HAS_NV21TOARGBROW_SVE2
 #define HAS_RGBATOUVROW_SVE2
+#define HAS_UYVYTOARGBROW_SVE2
+#define HAS_YUY2TOARGBROW_SVE2
 #endif
 
 // The following are available on AArch64 platforms:
@@ -1233,10 +1235,18 @@ void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2,
                         uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width);
+void YUY2ToARGBRow_SVE2(const uint8_t* src_yuy2,
+                        uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
 void UYVYToARGBRow_NEON(const uint8_t* src_uyvy,
                         uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width);
+void UYVYToARGBRow_SVE2(const uint8_t* src_uyvy,
+                        uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
 void I444ToARGBRow_RVV(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
@@ -4626,6 +4626,11 @@ int YUY2ToARGBMatrix(const uint8_t* src_yuy2,
     }
   }
 #endif
+#if defined(HAS_YUY2TOARGBROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    YUY2ToARGBRow = YUY2ToARGBRow_SVE2;
+  }
+#endif
 #if defined(HAS_YUY2TOARGBROW_MSA)
   if (TestCpuFlag(kCpuHasMSA)) {
     YUY2ToARGBRow = YUY2ToARGBRow_Any_MSA;
@@ -4714,6 +4719,11 @@ int UYVYToARGBMatrix(const uint8_t* src_uyvy,
     }
   }
 #endif
+#if defined(HAS_UYVYTOARGBROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    UYVYToARGBRow = UYVYToARGBRow_SVE2;
+  }
+#endif
 #if defined(HAS_UYVYTOARGBROW_MSA)
   if (TestCpuFlag(kCpuHasMSA)) {
    UYVYToARGBRow = UYVYToARGBRow_Any_MSA;
@@ -63,11 +63,27 @@ extern "C" {
   "tbl z2.h, {z1.h}, z23.h \n" /* V0V0 */ \
   "tbl z1.h, {z1.h}, z22.h \n" /* U0U0 */
 
+#define READYUY2_SVE \
+  "ld1w {z0.s}, p2/z, [%[src_yuy2]] \n" /* YUYV */ \
+  "incb %[src_yuy2] \n" \
+  "prfm pldl1keep, [%[src_yuy2], 448] \n" \
+  "tbl z2.b, {z0.b}, z23.b \n" /* V0V0 */ \
+  "tbl z1.b, {z0.b}, z22.b \n" /* U0U0 */ \
+  "trn1 z0.b, z0.b, z0.b \n" /* YYYY */
+
+#define READUYVY_SVE \
+  "ld1w {z0.s}, p2/z, [%[src_uyvy]] \n" /* UYVY */ \
+  "incb %[src_uyvy] \n" \
+  "prfm pldl1keep, [%[src_uyvy], 448] \n" \
+  "tbl z2.b, {z0.b}, z23.b \n" /* V0V0 */ \
+  "tbl z1.b, {z0.b}, z22.b \n" /* U0U0 */ \
+  "trn2 z0.b, z0.b, z0.b \n" /* YYYY */
+
 #define YUVTORGB_SVE_SETUP \
-  "ld1rb {z28.h}, p0/z, [%[kUVCoeff], #0] \n" \
-  "ld1rb {z29.h}, p0/z, [%[kUVCoeff], #1] \n" \
-  "ld1rb {z30.h}, p0/z, [%[kUVCoeff], #2] \n" \
-  "ld1rb {z31.h}, p0/z, [%[kUVCoeff], #3] \n" \
+  "ld1rb {z28.b}, p0/z, [%[kUVCoeff], #0] \n" \
+  "ld1rb {z29.b}, p0/z, [%[kUVCoeff], #1] \n" \
+  "ld1rb {z30.b}, p0/z, [%[kUVCoeff], #2] \n" \
+  "ld1rb {z31.b}, p0/z, [%[kUVCoeff], #3] \n" \
   "ld1rh {z24.h}, p0/z, [%[kRGBCoeffBias], #0] \n" \
   "ld1rh {z25.h}, p0/z, [%[kRGBCoeffBias], #2] \n" \
   "ld1rh {z26.h}, p0/z, [%[kRGBCoeffBias], #4] \n" \
@@ -75,10 +91,10 @@
 
 #define I4XXTORGB_SVE \
   "umulh z0.h, z24.h, z0.h \n" /* Y */ \
-  "mul z6.h, z30.h, z1.h \n" \
-  "mul z4.h, z28.h, z1.h \n" /* DB */ \
-  "mul z5.h, z29.h, z2.h \n" /* DR */ \
-  "mla z6.h, p0/m, z31.h, z2.h \n" /* DG */ \
+  "umullb z6.h, z30.b, z1.b \n" \
+  "umullb z4.h, z28.b, z1.b \n" /* DB */ \
+  "umullb z5.h, z29.b, z2.b \n" /* DR */ \
+  "umlalb z6.h, z31.b, z2.b \n" /* DG */ \
   "add z17.h, z0.h, z26.h \n" /* G */ \
   "add z16.h, z0.h, z4.h \n" /* B */ \
   "add z18.h, z0.h, z5.h \n" /* R */ \
@@ -171,9 +187,9 @@ void I400ToARGBRow_SVE2(const uint8_t* src_y,
       YUVTORGB_SVE_SETUP
       "cmp %w[width], %w[vl] \n"
       "mov z1.h, #128 \n"  // U/V
-      "mul z6.h, z30.h, z1.h \n"
-      "mul z4.h, z28.h, z1.h \n"  // DB
-      "mul z5.h, z29.h, z1.h \n"  // DR
+      "umullb z6.h, z30.b, z1.b \n"
+      "umullb z4.h, z28.b, z1.b \n"  // DB
+      "umullb z5.h, z29.b, z1.b \n"  // DR
       "mla z6.h, p0/m, z31.h, z1.h \n"  // DG
       "sub z4.h, z4.h, z25.h \n"
       "sub z5.h, z5.h, z27.h \n"
@@ -965,6 +981,121 @@ void AYUVToVURow_SVE2(const uint8_t* src_ayuv,
        "p1");
 }
 
+void YUY2ToARGBRow_SVE2(const uint8_t* src_yuy2,
+                        uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width) {
+  uint32_t nv_u_start = 0x0001'0001U;
+  uint32_t nv_u_step = 0x0004'0004U;
+  uint32_t nv_v_start = 0x0003'0003U;
+  uint32_t nv_v_step = 0x0004'0004U;
+  uint64_t vl;
+  asm("cnth %0" : "=r"(vl));
+  int width_last_y = width & (vl - 1);
+  int width_last_uv = width_last_y + (width_last_y & 1);
+  asm("ptrue p0.b \n"
+      "index z22.s, %w[nv_u_start], %w[nv_u_step] \n"
+      "index z23.s, %w[nv_v_start], %w[nv_v_step] \n"
+      "dup z19.b, #255 \n"  // A
+      YUVTORGB_SVE_SETUP
+      "subs %w[width], %w[width], %w[vl] \n"
+      "b.lt 2f \n"
+
+      // Run bulk of computation with an all-true predicate to avoid predicate
+      // generation overhead.
+      "ptrue p1.h \n"
+      "ptrue p2.h \n"
+      "1: \n"  //
+      READYUY2_SVE I4XXTORGB_SVE RGBTOARGB8_SVE
+      "subs %w[width], %w[width], %w[vl] \n"
+      "st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n"
+      "add %[dst_argb], %[dst_argb], %[vl], lsl #2 \n"
+      "b.ge 1b \n"
+
+      "2: \n"
+      "adds %w[width], %w[width], %w[vl] \n"
+      "b.eq 99f \n"
+
+      // Calculate a predicate for the final iteration to deal with the tail.
+      "whilelt p1.h, wzr, %w[width_last_y] \n"
+      "whilelt p2.h, wzr, %w[width_last_uv] \n"  //
+      READYUY2_SVE I4XXTORGB_SVE RGBTOARGB8_SVE
+      "st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n"
+
+      "99: \n"
+      : [src_yuy2] "+r"(src_yuy2),                          // %[src_yuy2]
+        [dst_argb] "+r"(dst_argb),                          // %[dst_argb]
+        [width] "+r"(width)                                 // %[width]
+      : [vl] "r"(vl),                                       // %[vl]
+        [kUVCoeff] "r"(&yuvconstants->kUVCoeff),            // %[kUVCoeff]
+        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias),  // %[kRGBCoeffBias]
+        [nv_u_start] "r"(nv_u_start),                       // %[nv_u_start]
+        [nv_u_step] "r"(nv_u_step),                         // %[nv_u_step]
+        [nv_v_start] "r"(nv_v_start),                       // %[nv_v_start]
+        [nv_v_step] "r"(nv_v_step),                         // %[nv_v_step]
+        [width_last_y] "r"(width_last_y),                   // %[width_last_y]
+        [width_last_uv] "r"(width_last_uv)                  // %[width_last_uv]
+      : "cc", "memory", YUVTORGB_SVE_REGS, "p2");
+}
+
+void UYVYToARGBRow_SVE2(const uint8_t* src_uyvy,
+                        uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width) {
+  uint32_t nv_u_start = 0x0000'0000U;
+  uint32_t nv_u_step = 0x0004'0004U;
+  uint32_t nv_v_start = 0x0002'0002U;
+  uint32_t nv_v_step = 0x0004'0004U;
+  uint64_t vl;
+  asm("cnth %0" : "=r"(vl));
+  int width_last_y = width & (vl - 1);
+  int width_last_uv = width_last_y + (width_last_y & 1);
+  asm("ptrue p0.b \n"
+      "index z22.s, %w[nv_u_start], %w[nv_u_step] \n"
+      "index z23.s, %w[nv_v_start], %w[nv_v_step] \n"
+      "dup z19.b, #255 \n"  // A
+      YUVTORGB_SVE_SETUP
+      "subs %w[width], %w[width], %w[vl] \n"
+      "b.lt 2f \n"
+
+      // Run bulk of computation with an all-true predicate to avoid predicate
+      // generation overhead.
+      "ptrue p1.h \n"
+      "ptrue p2.h \n"
+      "1: \n"  //
+      READUYVY_SVE I4XXTORGB_SVE RGBTOARGB8_SVE
+      "subs %w[width], %w[width], %w[vl] \n"
+      "st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n"
+      "add %[dst_argb], %[dst_argb], %[vl], lsl #2 \n"
+      "b.ge 1b \n"
+
+      "2: \n"
+      "adds %w[width], %w[width], %w[vl] \n"
+      "b.eq 99f \n"
+
+      // Calculate a predicate for the final iteration to deal with the tail.
+      "2: \n"
+      "whilelt p1.h, wzr, %w[width_last_y] \n"
+      "whilelt p2.h, wzr, %w[width_last_uv] \n"  //
+      READUYVY_SVE I4XXTORGB_SVE RGBTOARGB8_SVE
+      "st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n"
+
+      "99: \n"
+      : [src_uyvy] "+r"(src_uyvy),                          // %[src_uyvy]
+        [dst_argb] "+r"(dst_argb),                          // %[dst_argb]
+        [width] "+r"(width)                                 // %[width]
+      : [vl] "r"(vl),                                       // %[vl]
+        [kUVCoeff] "r"(&yuvconstants->kUVCoeff),            // %[kUVCoeff]
+        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias),  // %[kRGBCoeffBias]
+        [nv_u_start] "r"(nv_u_start),                       // %[nv_u_start]
+        [nv_u_step] "r"(nv_u_step),                         // %[nv_u_step]
+        [nv_v_start] "r"(nv_v_start),                       // %[nv_v_start]
+        [nv_v_step] "r"(nv_v_step),                         // %[nv_v_step]
+        [width_last_y] "r"(width_last_y),                   // %[width_last_y]
+        [width_last_uv] "r"(width_last_uv)                  // %[width_last_uv]
+      : "cc", "memory", YUVTORGB_SVE_REGS, "p2");
+}
+
 #endif  // !defined(LIBYUV_DISABLE_SVE) && defined(__aarch64__)
 
 #ifdef __cplusplus