[AArch64] Avoid LD2 in YUY2ToARGBRow_NEON

YUY2ToARGBRow_NEON and UYVYToARGBRow_NEON each load their input with an
LD2 instruction followed by a pair of permutes (ZIP1 and TBL). On some
micro-architectures LD2 itself makes use of the vector pipelines, so in
these kernels it is preferable to do an LD1 followed by a different pair
of permutes (TRN1 or TRN2, plus TBL) to avoid the extra vector pipeline
usage.

Reduction in runtime on selected kernels (no observed performance delta
on Cortex-A55):

            Kernel | Cortex-A76 | Cortex-X2
UYVYToARGBRow_NEON |      -2.6% |     -8.8%
YUY2ToARGBRow_NEON |      -6.2% |     -4.9%

Bug: libyuv:976
Change-Id: I7ca45e0c7bf7cb50cc5ab37c6a01215d9689039a
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5366652
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
George Steed 2024-03-12 22:08:57 +00:00 committed by libyuv LUCI CQ
parent 188e4e3afb
commit 8d0d885c2f

View File

@ -55,8 +55,12 @@ extern "C" {
// Byte-index vectors for the TBL instruction. Every table lists each selected
// source index twice, so each selected byte is duplicated into two adjacent
// output lanes.
//
// kNV12Table: even source indices (0, 2, 4, 6) fill the first eight lanes,
// odd source indices (1, 3, 5, 7) fill the last eight lanes.
static const uvec8 kNV12Table = {0, 0, 2, 2, 4, 4, 6, 6,
1, 1, 3, 3, 5, 5, 7, 7};
// kNV12InterleavedTable: indices 0, 4, 8, 12 fill the first eight lanes and
// indices 2, 6, 10, 14 fill the last eight, i.e. only the even bytes of a
// 16-byte source are read. UYVYToARGBRow_NEON loads this table into v2 for
// the TBL in READUYVY.
static const uvec8 kNV12InterleavedTable = {0, 0, 4, 4, 8, 8, 12, 12,
2, 2, 6, 6, 10, 10, 14, 14};
// kNV21Table: the two halves of kNV12Table swapped; odd source indices
// (1, 3, 5, 7) come first, even source indices (0, 2, 4, 6) second.
static const uvec8 kNV21Table = {1, 1, 3, 3, 5, 5, 7, 7,
0, 0, 2, 2, 4, 4, 6, 6};
// kNV21InterleavedTable: indices 1, 5, 9, 13 fill the first eight lanes and
// indices 3, 7, 11, 15 fill the last eight, i.e. only the odd bytes of a
// 16-byte source are read. YUY2ToARGBRow_NEON loads this table into v2 for
// the TBL in READYUY2.
static const uvec8 kNV21InterleavedTable = {1, 1, 5, 5, 9, 9, 13, 13,
3, 3, 7, 7, 11, 11, 15, 15};
// Read 8 Y and 4 UV from NV12 or NV21
#define READNV12 \
@ -68,17 +72,17 @@ static const uvec8 kNV21Table = {1, 1, 3, 3, 5, 5, 7, 7,
"prfm pldl1keep, [%[src_uv], 448] \n"
// Read 8 YUY2: consumes 16 source bytes and post-increments src_yuy2 by 16.
// Both forms below leave the even-indexed source bytes, each duplicated into
// two adjacent lanes, in v0, and the TBL result (indexed by v2) in v1.
// v2 must hold the TBL index vector before the macro is used.
//  - LD2 form: LD2 de-interleaves even bytes into v0 and odd bytes into v1;
//    ZIP1 duplicates v0 and TBL permutes the 8 odd bytes held in v1.
//  - LD1 form: LD1 loads all 16 bytes into v3; TRN1 duplicates the even
//    bytes into v0 and TBL permutes the raw 16 bytes of v3, so v3 is
//    overwritten as well.
//    NOTE(review): v3 is presumably covered by YUVTORGB_REGS in the callers'
//    clobber lists - confirm.
#define READYUY2 \
"ld2 {v0.8b, v1.8b}, [%[src_yuy2]], #16 \n" \
"zip1 v0.16b, v0.16b, v0.16b \n" \
"prfm pldl1keep, [%[src_yuy2], 448] \n" \
"tbl v1.16b, {v1.16b}, v2.16b \n"
#define READYUY2 \
"ld1 {v3.16b}, [%[src_yuy2]], #16 \n" \
"trn1 v0.16b, v3.16b, v3.16b \n" \
"prfm pldl1keep, [%[src_yuy2], 448] \n" \
"tbl v1.16b, {v3.16b}, v2.16b \n"
// Read 8 UYVY: consumes 16 source bytes and post-increments src_uyvy by 16.
// Both forms below leave the odd-indexed source bytes, each duplicated into
// two adjacent lanes, in v0, and the result of TBL on v3 (indexed by v2) in
// v1. v2 must hold the TBL index vector before the macro is used.
//  - LD2 form: LD2 de-interleaves even bytes into v3 and odd bytes into v4;
//    ZIP1 duplicates v4 into v0.
//  - LD1 form: LD1 loads all 16 bytes into v3; TRN2 duplicates the odd bytes
//    into v0.
// The final TBL line appears once and closes whichever form is in effect.
#define READUYVY \
"ld2 {v3.8b, v4.8b}, [%[src_uyvy]], #16 \n" \
"zip1 v0.16b, v4.16b, v4.16b \n" \
"prfm pldl1keep, [%[src_uyvy], 448] \n" \
#define READUYVY \
"ld1 {v3.16b}, [%[src_uyvy]], #16 \n" \
"trn2 v0.16b, v3.16b, v3.16b \n" \
"prfm pldl1keep, [%[src_uyvy], 448] \n" \
"tbl v1.16b, {v3.16b}, v2.16b \n"
// UB VR UG VG
@ -569,18 +573,19 @@ void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2,
int width) {
asm volatile(
YUVTORGB_SETUP
"movi v19.8b, #255 \n"
"ldr q2, [%[kNV12Table]] \n"
"1: \n" READYUY2 YUVTORGB RGBTORGB8
"subs %w[width], %w[width], #8 \n"
"movi v19.8b, #255 \n"
"ldr q2, [%[kNV21InterleavedTable]] \n"
"1: \n" READYUY2 YUVTORGB
RGBTORGB8
"subs %w[width], %w[width], #8 \n"
"st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
"b.gt 1b \n"
"b.gt 1b \n"
: [src_yuy2] "+r"(src_yuy2), // %[src_yuy2]
[dst_argb] "+r"(dst_argb), // %[dst_argb]
[width] "+r"(width) // %[width]
: [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
[kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias]
[kNV12Table] "r"(&kNV12Table)
[kNV21InterleavedTable] "r"(&kNV21InterleavedTable)
: "cc", "memory", YUVTORGB_REGS, "v2", "v19");
}
@ -590,18 +595,19 @@ void UYVYToARGBRow_NEON(const uint8_t* src_uyvy,
int width) {
asm volatile(
YUVTORGB_SETUP
"movi v19.8b, #255 \n"
"ldr q2, [%[kNV12Table]] \n"
"1: \n" READUYVY YUVTORGB RGBTORGB8
"subs %w[width], %w[width], #8 \n"
"movi v19.8b, #255 \n"
"ldr q2, [%[kNV12InterleavedTable]] \n"
"1: \n" READUYVY YUVTORGB
RGBTORGB8
"subs %w[width], %w[width], #8 \n"
"st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
"b.gt 1b \n"
"b.gt 1b \n"
: [src_uyvy] "+r"(src_uyvy), // %[src_uyvy]
[dst_argb] "+r"(dst_argb), // %[dst_argb]
[width] "+r"(width) // %[width]
: [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
[kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias]
[kNV12Table] "r"(&kNV12Table)
[kNV12InterleavedTable] "r"(&kNV12InterleavedTable)
: "cc", "memory", YUVTORGB_REGS, "v2", "v19");
}