From 5236846b648418089d9d88b797ed1b7a5e03e907 Mon Sep 17 00:00:00 2001 From: George Steed Date: Wed, 24 Apr 2024 18:03:20 +0100 Subject: [PATCH] [AArch64] Keep UV interleaved in some *ToARGBRow_SVE2 kernels The existing I4XXTORGB_SVE macro operates only on even byte lanes of the loaded U/V vectors. This is sub-optimal since we are effectively wasting half of the vector in any pre-processing steps before the conversion. In particular, where the UV components are loaded from interleaved data we can save a TBL instruction by maintaining the interleaved format. This commit introduces a new NVTORGB_SVE macro to handle the case where U/V components are interleaved into even/odd bytes of a vector, mirroring a similar macro in the AArch64 Neon implementation. Reduction in runtimes observed compared to the existing SVE2 code: | Cortex-A510 | Cortex-A720 | Cortex-X2 NV12ToARGBRow_SVE2 | -5.3% | -0.2% | -4.4% NV21ToARGBRow_SVE2 | -5.3% | -0.2% | -4.4% UYVYToARGBRow_SVE2 | -5.6% | 0.0% | -4.6% YUY2ToARGBRow_SVE2 | -5.5% | -0.1% | -4.2% Bug: libyuv:973 Change-Id: I418de2e684e0b6b0b9e41c39b564438531e44671 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5622133 Reviewed-by: Justin Green Reviewed-by: Frank Barchard --- source/row_sve.cc | 120 ++++++++++++++++++++++------------------------ 1 file changed, 57 insertions(+), 63 deletions(-) diff --git a/source/row_sve.cc b/source/row_sve.cc index dca5bc653..2180312ba 100644 --- a/source/row_sve.cc +++ b/source/row_sve.cc @@ -58,25 +58,22 @@ extern "C" { "inch %[src_y] \n" \ "inch %[src_uv] \n" \ "prfm pldl1keep, [%[src_y], 448] \n" \ - "prfm pldl1keep, [%[src_uv], 448] \n" \ + "prfm pldl1keep, [%[src_uv], 256] \n" \ "trn1 z0.b, z0.b, z0.b \n" /* YYYY */ \ - "tbl z2.h, {z1.h}, z23.h \n" /* V0V0 */ \ - "tbl z1.h, {z1.h}, z22.h \n" /* U0U0 */ + "tbl z1.b, {z1.b}, z22.b \n" /* UVUV */ #define READYUY2_SVE \ "ld1w {z0.s}, p2/z, [%[src_yuy2]] \n" /* YUYV */ \ "incb %[src_yuy2] \n" \ "prfm pldl1keep, [%[src_yuy2], 
448] \n" \ - "tbl z2.b, {z0.b}, z23.b \n" /* V0V0 */ \ - "tbl z1.b, {z0.b}, z22.b \n" /* U0U0 */ \ + "tbl z1.b, {z0.b}, z22.b \n" /* UVUV */ \ "trn1 z0.b, z0.b, z0.b \n" /* YYYY */ #define READUYVY_SVE \ "ld1w {z0.s}, p2/z, [%[src_uyvy]] \n" /* UYVY */ \ "incb %[src_uyvy] \n" \ "prfm pldl1keep, [%[src_uyvy], 448] \n" \ - "tbl z2.b, {z0.b}, z23.b \n" /* V0V0 */ \ - "tbl z1.b, {z0.b}, z22.b \n" /* U0U0 */ \ + "tbl z1.b, {z0.b}, z22.b \n" /* UVUV */ \ "trn2 z0.b, z0.b, z0.b \n" /* YYYY */ #define YUVTORGB_SVE_SETUP \ @@ -89,6 +86,23 @@ extern "C" { "ld1rh {z26.h}, p0/z, [%[kRGBCoeffBias], #4] \n" \ "ld1rh {z27.h}, p0/z, [%[kRGBCoeffBias], #6] \n" +// Like I4XXTORGB_SVE but U/V components are stored in even/odd .b lanes of z1 +// rather than widened .h elements of z1/z2. +#define NVTORGB_SVE \ + "umulh z0.h, z24.h, z0.h \n" /* Y */ \ + "umullb z6.h, z30.b, z1.b \n" \ + "umullb z4.h, z28.b, z1.b \n" /* DB */ \ + "umullt z5.h, z29.b, z1.b \n" /* DR */ \ + "umlalt z6.h, z31.b, z1.b \n" /* DG */ \ + "add z17.h, z0.h, z26.h \n" /* G */ \ + "add z16.h, z0.h, z4.h \n" /* B */ \ + "add z18.h, z0.h, z5.h \n" /* R */ \ + "uqsub z17.h, z17.h, z6.h \n" /* G */ \ + "uqsub z16.h, z16.h, z25.h \n" /* B */ \ + "uqsub z18.h, z18.h, z27.h \n" /* R */ + +// Like NVTORGB_SVE but U/V components are stored in widened .h elements of +// z1/z2 rather than even/odd .b lanes of z1. #define I4XXTORGB_SVE \ "umulh z0.h, z24.h, z0.h \n" /* Y */ \ "umullb z6.h, z30.b, z1.b \n" \ @@ -417,20 +431,18 @@ static inline void NVToARGBRow_SVE2(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width, - uint32_t nv_u_start, - uint32_t nv_u_step, - uint32_t nv_v_start, - uint32_t nv_v_step) { + uint32_t nv_uv_start, + uint32_t nv_uv_step) { uint64_t vl; asm volatile ( "cnth %0" : "=r"(vl)); int width_last_y = width & (vl - 1); width_last_y = width_last_y == 0 ? 
vl : width_last_y; int width_last_uv = width_last_y + (width_last_y & 1); - asm volatile ( - "ptrue p0.b \n" YUVTORGB_SVE_SETUP - "index z22.s, %w[nv_u_start], %w[nv_u_step] \n" - "index z23.s, %w[nv_v_start], %w[nv_v_step] \n" + asm volatile( + "ptrue p0.b \n" /* p0 governs the ld1rh loads in YUVTORGB_SVE_SETUP */ + YUVTORGB_SVE_SETUP + "index z22.s, %w[nv_uv_start], %w[nv_uv_step] \n" "dup z19.b, #255 \n" // A "subs %w[width], %w[width], %w[vl] \n" "b.le 2f \n" @@ -440,7 +452,7 @@ static inline void NVToARGBRow_SVE2(const uint8_t* src_y, "ptrue p1.h \n" "ptrue p2.h \n" "1: \n" READNV_SVE - I4XXTORGB_SVE RGBTOARGB8_SVE + NVTORGB_SVE RGBTOARGB8_SVE "subs %w[width], %w[width], %w[vl] \n" "st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n" "add %[dst_argb], %[dst_argb], %[vl], lsl #2 \n" @@ -454,7 +466,7 @@ static inline void NVToARGBRow_SVE2(const uint8_t* src_y, "3: \n" "whilelt p1.h, wzr, %w[width_last_y] \n" "whilelt p2.h, wzr, %w[width_last_uv] \n" READNV_SVE - I4XXTORGB_SVE RGBTOARGB8_SVE + NVTORGB_SVE RGBTOARGB8_SVE "st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n" "99: \n" @@ -465,10 +477,8 @@ static inline void NVToARGBRow_SVE2(const uint8_t* src_y, : [vl] "r"(vl), // %[vl] [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias] - [nv_u_start] "r"(nv_u_start), // %[nv_u_start] - [nv_u_step] "r"(nv_u_step), // %[nv_u_step] - [nv_v_start] "r"(nv_v_start), // %[nv_v_start] - [nv_v_step] "r"(nv_v_step), // %[nv_v_step] + [nv_uv_start] "r"(nv_uv_start), // %[nv_uv_start] + [nv_uv_step] "r"(nv_uv_step), // %[nv_uv_step] [width_last_y] "r"(width_last_y), // %[width_last_y] [width_last_uv] "r"(width_last_uv) // %[width_last_uv] : "cc", "memory", YUVTORGB_SVE_REGS); @@ -479,12 +489,10 @@ void NV12ToARGBRow_SVE2(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - uint32_t nv_u_start = 0x0000'0000U; - uint32_t nv_u_step = 0x0002'0002U; - uint32_t nv_v_start = 0x0001'0001U; - uint32_t nv_v_step = 0x0002'0002U; - 
NVToARGBRow_SVE2(src_y, src_uv, dst_argb, yuvconstants, width, nv_u_start, - nv_u_step, nv_v_start, nv_v_step); + uint32_t nv_uv_start = 0x0200'0200U; + uint32_t nv_uv_step = 0x0404'0404U; + NVToARGBRow_SVE2(src_y, src_uv, dst_argb, yuvconstants, width, nv_uv_start, + nv_uv_step); } void NV21ToARGBRow_SVE2(const uint8_t* src_y, @@ -492,12 +500,10 @@ void NV21ToARGBRow_SVE2(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - uint32_t nv_u_start = 0x0001'0001U; - uint32_t nv_u_step = 0x0002'0002U; - uint32_t nv_v_start = 0x0000'0000U; - uint32_t nv_v_step = 0x0002'0002U; - NVToARGBRow_SVE2(src_y, src_vu, dst_argb, yuvconstants, width, nv_u_start, - nv_u_step, nv_v_start, nv_v_step); + uint32_t nv_uv_start = 0x0002'0002U; + uint32_t nv_uv_step = 0x0404'0404U; + NVToARGBRow_SVE2(src_y, src_vu, dst_argb, yuvconstants, width, nv_uv_start, + nv_uv_step); } // Dot-product constants are stored as four-tuples with the two innermost @@ -998,19 +1004,15 @@ void YUY2ToARGBRow_SVE2(const uint8_t* src_yuy2, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - uint32_t nv_u_start = 0x0001'0001U; - uint32_t nv_u_step = 0x0004'0004U; - uint32_t nv_v_start = 0x0003'0003U; - uint32_t nv_v_step = 0x0004'0004U; + uint32_t nv_uv_start = 0x0301'0301U; + uint32_t nv_uv_step = 0x0404'0404U; uint64_t vl; - asm volatile ( - "cnth %0" : "=r"(vl)); + asm("cnth %0" : "=r"(vl)); int width_last_y = width & (vl - 1); int width_last_uv = width_last_y + (width_last_y & 1); - asm volatile ( + asm volatile( "ptrue p0.b \n" - "index z22.s, %w[nv_u_start], %w[nv_u_step] \n" - "index z23.s, %w[nv_v_start], %w[nv_v_step] \n" + "index z22.s, %w[nv_uv_start], %w[nv_uv_step] \n" "dup z19.b, #255 \n" // A YUVTORGB_SVE_SETUP "subs %w[width], %w[width], %w[vl] \n" @@ -1021,7 +1023,7 @@ void YUY2ToARGBRow_SVE2(const uint8_t* src_yuy2, "ptrue p1.h \n" "ptrue p2.h \n" "1: \n" // - READYUY2_SVE I4XXTORGB_SVE RGBTOARGB8_SVE + READYUY2_SVE NVTORGB_SVE 
RGBTOARGB8_SVE "subs %w[width], %w[width], %w[vl] \n" "st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n" "add %[dst_argb], %[dst_argb], %[vl], lsl #2 \n" @@ -1034,7 +1036,7 @@ void YUY2ToARGBRow_SVE2(const uint8_t* src_yuy2, // Calculate a predicate for the final iteration to deal with the tail. "whilelt p1.h, wzr, %w[width_last_y] \n" "whilelt p2.h, wzr, %w[width_last_uv] \n" // - READYUY2_SVE I4XXTORGB_SVE RGBTOARGB8_SVE + READYUY2_SVE NVTORGB_SVE RGBTOARGB8_SVE "st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n" "99: \n" @@ -1044,10 +1046,8 @@ void YUY2ToARGBRow_SVE2(const uint8_t* src_yuy2, : [vl] "r"(vl), // %[vl] [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias] - [nv_u_start] "r"(nv_u_start), // %[nv_u_start] - [nv_u_step] "r"(nv_u_step), // %[nv_u_step] - [nv_v_start] "r"(nv_v_start), // %[nv_v_start] - [nv_v_step] "r"(nv_v_step), // %[nv_v_step] + [nv_uv_start] "r"(nv_uv_start), // %[nv_uv_start] + [nv_uv_step] "r"(nv_uv_step), // %[nv_uv_step] [width_last_y] "r"(width_last_y), // %[width_last_y] [width_last_uv] "r"(width_last_uv) // %[width_last_uv] : "cc", "memory", YUVTORGB_SVE_REGS, "p2"); @@ -1057,19 +1057,15 @@ void UYVYToARGBRow_SVE2(const uint8_t* src_uyvy, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - uint32_t nv_u_start = 0x0000'0000U; - uint32_t nv_u_step = 0x0004'0004U; - uint32_t nv_v_start = 0x0002'0002U; - uint32_t nv_v_step = 0x0004'0004U; + uint32_t nv_uv_start = 0x0200'0200U; + uint32_t nv_uv_step = 0x0404'0404U; uint64_t vl; - asm volatile ( - "cnth %0" : "=r"(vl)); + asm("cnth %0" : "=r"(vl)); int width_last_y = width & (vl - 1); int width_last_uv = width_last_y + (width_last_y & 1); - asm volatile ( + asm volatile( "ptrue p0.b \n" - "index z22.s, %w[nv_u_start], %w[nv_u_step] \n" - "index z23.s, %w[nv_v_start], %w[nv_v_step] \n" + "index z22.s, %w[nv_uv_start], %w[nv_uv_step] \n" "dup z19.b, #255 \n" // A YUVTORGB_SVE_SETUP "subs %w[width], 
%w[width], %w[vl] \n" @@ -1080,7 +1076,7 @@ void UYVYToARGBRow_SVE2(const uint8_t* src_uyvy, "ptrue p1.h \n" "ptrue p2.h \n" "1: \n" // - READUYVY_SVE I4XXTORGB_SVE RGBTOARGB8_SVE + READUYVY_SVE NVTORGB_SVE RGBTOARGB8_SVE "subs %w[width], %w[width], %w[vl] \n" "st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n" "add %[dst_argb], %[dst_argb], %[vl], lsl #2 \n" @@ -1094,7 +1090,7 @@ void UYVYToARGBRow_SVE2(const uint8_t* src_uyvy, "2: \n" "whilelt p1.h, wzr, %w[width_last_y] \n" "whilelt p2.h, wzr, %w[width_last_uv] \n" // - READUYVY_SVE I4XXTORGB_SVE RGBTOARGB8_SVE + READUYVY_SVE NVTORGB_SVE RGBTOARGB8_SVE "st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n" "99: \n" @@ -1104,10 +1100,8 @@ void UYVYToARGBRow_SVE2(const uint8_t* src_uyvy, : [vl] "r"(vl), // %[vl] [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias] - [nv_u_start] "r"(nv_u_start), // %[nv_u_start] - [nv_u_step] "r"(nv_u_step), // %[nv_u_step] - [nv_v_start] "r"(nv_v_start), // %[nv_v_start] - [nv_v_step] "r"(nv_v_step), // %[nv_v_step] + [nv_uv_start] "r"(nv_uv_start), // %[nv_uv_start] + [nv_uv_step] "r"(nv_uv_step), // %[nv_uv_step] [width_last_y] "r"(width_last_y), // %[width_last_y] [width_last_uv] "r"(width_last_uv) // %[width_last_uv] : "cc", "memory", YUVTORGB_SVE_REGS, "p2");