From 8d0d885c2f4bc2479949b3e182ca9f649806b15b Mon Sep 17 00:00:00 2001 From: George Steed Date: Tue, 12 Mar 2024 22:08:57 +0000 Subject: [PATCH] [AArch64] Avoid LD2 in YUY2ToARGBRow_NEON In this case we have an LD2 instruction followed by a pair of permutes (ZIP1 and TBL). On some micro-architectures LD2 involves use of the vector pipelines, so in these cases it is preferable to do an LD1 and then a different pair of permutes (TRN + TBL) instead to avoid the extra vector pipeline usage. Reduction in runtime on selected kernels (no observed performance delta on Cortex-A55): Kernel | Cortex-A76 | Cortex-X2 UYVYToARGBRow_NEON | -2.6% | -8.8% YUY2ToARGBRow_NEON | -6.2% | -4.9% Bug: libyuv:976 Change-Id: I7ca45e0c7bf7cb50cc5ab37c6a01215d9689039a Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5366652 Reviewed-by: Frank Barchard Commit-Queue: Frank Barchard --- source/row_neon64.cc | 48 +++++++++++++++++++++++++------------------- 1 file changed, 27 insertions(+), 21 deletions(-) diff --git a/source/row_neon64.cc b/source/row_neon64.cc index a51b51aa8..d37b8dadd 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -55,8 +55,12 @@ extern "C" { static const uvec8 kNV12Table = {0, 0, 2, 2, 4, 4, 6, 6, 1, 1, 3, 3, 5, 5, 7, 7}; +static const uvec8 kNV12InterleavedTable = {0, 0, 4, 4, 8, 8, 12, 12, + 2, 2, 6, 6, 10, 10, 14, 14}; static const uvec8 kNV21Table = {1, 1, 3, 3, 5, 5, 7, 7, 0, 0, 2, 2, 4, 4, 6, 6}; +static const uvec8 kNV21InterleavedTable = {1, 1, 5, 5, 9, 9, 13, 13, + 3, 3, 7, 7, 11, 11, 15, 15}; // Read 8 Y and 4 UV from NV12 or NV21 #define READNV12 \ @@ -68,17 +72,17 @@ static const uvec8 kNV21Table = {1, 1, 3, 3, 5, 5, 7, 7, "prfm pldl1keep, [%[src_uv], 448] \n" // Read 8 YUY2 -#define READYUY2 \ - "ld2 {v0.8b, v1.8b}, [%[src_yuy2]], #16 \n" \ - "zip1 v0.16b, v0.16b, v0.16b \n" \ - "prfm pldl1keep, [%[src_yuy2], 448] \n" \ - "tbl v1.16b, {v1.16b}, v2.16b \n" +#define READYUY2 \ + "ld1 {v3.16b}, [%[src_yuy2]], #16 \n" \ + "trn1 v0.16b, v3.16b, v3.16b \n" \ + "prfm pldl1keep, [%[src_yuy2], 448] \n" \ + "tbl v1.16b, {v3.16b}, v2.16b \n" // Read 8 UYVY -#define READUYVY \ - "ld2 {v3.8b, v4.8b}, [%[src_uyvy]], #16 \n" \ - "zip1 v0.16b, v4.16b, v4.16b \n" \ - "prfm pldl1keep, [%[src_uyvy], 448] \n" \ +#define READUYVY \ + "ld1 {v3.16b}, [%[src_uyvy]], #16 \n" \ + "trn2 v0.16b, v3.16b, v3.16b \n" \ + "prfm pldl1keep, [%[src_uyvy], 448] \n" \ "tbl v1.16b, {v3.16b}, v2.16b \n" // UB VR UG VG @@ -569,18 +573,19 @@ void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2, int width) { asm volatile( YUVTORGB_SETUP - "movi v19.8b, #255 \n" - "ldr q2, [%[kNV12Table]] \n" - "1: \n" READYUY2 YUVTORGB RGBTORGB8 - "subs %w[width], %w[width], #8 \n" + "movi v19.8b, #255 \n" + "ldr q2, [%[kNV21InterleavedTable]] \n" + "1: \n" READYUY2 YUVTORGB + RGBTORGB8 + "subs %w[width], %w[width], #8 \n" "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" - "b.gt 1b \n" + "b.gt 1b \n" : [src_yuy2] "+r"(src_yuy2), // %[src_yuy2] [dst_argb] "+r"(dst_argb), // %[dst_argb] [width] "+r"(width) // %[width] : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias] - [kNV12Table] "r"(&kNV12Table) + [kNV21InterleavedTable] "r"(&kNV21InterleavedTable) : "cc", "memory", YUVTORGB_REGS, "v2", "v19"); } @@ -590,18 +595,19 @@ void UYVYToARGBRow_NEON(const uint8_t* src_uyvy, int width) { asm volatile( YUVTORGB_SETUP - "movi v19.8b, #255 \n" - "ldr q2, [%[kNV12Table]] \n" - "1: \n" READUYVY YUVTORGB RGBTORGB8 - "subs %w[width], %w[width], #8 \n" + "movi v19.8b, #255 \n" + "ldr q2, [%[kNV12InterleavedTable]] \n" + "1: \n" READUYVY YUVTORGB + RGBTORGB8 + "subs %w[width], %w[width], #8 \n" "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" - "b.gt 1b \n" + "b.gt 1b \n" : [src_uyvy] "+r"(src_uyvy), // %[src_yuy2] [dst_argb] "+r"(dst_argb), // %[dst_argb] [width] "+r"(width) // %[width] : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias] - [kNV12Table] "r"(&kNV12Table) + [kNV12InterleavedTable] "r"(&kNV12InterleavedTable) : "cc", "memory", YUVTORGB_REGS, "v2", "v19"); }