From a64fffe632101caa83c2571fe6c6481802aa2e04 Mon Sep 17 00:00:00 2001 From: George Steed Date: Fri, 12 Jul 2024 19:02:39 +0100 Subject: [PATCH] Revert "Disable NV12ToARGB_SVE2 which fails the 'any' test" This reverts commit f480fa1c4a4af0ce3c34cd7b1ab0d85f1a36ce17. This code has a number of small issues: * The YUVTORGB_SVE_SETUP macro requires p0 to be initialized to all-true, however the existing kernel does not initialise p0 until after this macro is called, so flip the order. * The p2 register is missing from the clobber list, so add it. * The existing code uses the wrong condition flags when determining whether to do the tail iteration using WHILE instructions or not. Additionally the number of tail iterations is incorrect, as it was incorrectly not changed from when the tail code was always executed. While we are here, make another few small improvements: * Remove the single-quote digit separators as requested here: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5622133 * Remove "volatile" from the asm block counting the vector length. This particular asm block cannot be removed by the compiler since the output register is consumed by subsequent code, so "volatile" is unnecessary here and we remove it. * Add some additional empty comments to force clang-format to put macros into the next line rather than on the same line as other asm. Bug: b/352371649 Change-Id: I45676fab95343f588cf11ce2cf9186ffbe87489e Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5703586 Reviewed-by: Frank Barchard --- include/libyuv/row.h | 5 ++--- source/row_sve.cc | 28 +++++++++++++--------------- 2 files changed, 15 insertions(+), 18 deletions(-) diff --git a/include/libyuv/row.h b/include/libyuv/row.h index c1c405d63..d080bb2e8 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -600,9 +600,8 @@ extern "C" { #define HAS_I422TORGBAROW_SVE2 #define HAS_I444ALPHATOARGBROW_SVE2 #define HAS_I444TOARGBROW_SVE2 -// Any support for NV12 SVE2 fails -//#define HAS_NV12TOARGBROW_SVE2 -//#define HAS_NV21TOARGBROW_SVE2 +#define HAS_NV12TOARGBROW_SVE2 +#define HAS_NV21TOARGBROW_SVE2 #define HAS_RAWTOARGBROW_SVE2 #define HAS_RAWTORGB24ROW_SVE2 #define HAS_RAWTORGBAROW_SVE2 diff --git a/source/row_sve.cc b/source/row_sve.cc index 200433fec..97ba9cbac 100644 --- a/source/row_sve.cc +++ b/source/row_sve.cc @@ -434,29 +434,27 @@ static inline void NVToARGBRow_SVE2(const uint8_t* src_y, uint32_t nv_uv_start, uint32_t nv_uv_step) { uint64_t vl; - asm volatile ( - "cnth %0" : "=r"(vl)); + asm("cnth %0" : "=r"(vl)); int width_last_y = width & (vl - 1); - width_last_y = width_last_y == 0 ? vl : width_last_y; int width_last_uv = width_last_y + (width_last_y & 1); asm volatile( + "ptrue p0.b \n" // YUVTORGB_SVE_SETUP - "ptrue p0.b \n" "index z22.s, %w[nv_uv_start], %w[nv_uv_step] \n" "dup z19.b, #255 \n" // A "subs %w[width], %w[width], %w[vl] \n" - "b.le 2f \n" + "b.lt 2f \n" // Run bulk of computation with an all-true predicate to avoid predicate // generation overhead. "ptrue p1.h \n" "ptrue p2.h \n" - "1: \n" READNV_SVE - NVTORGB_SVE RGBTOARGB8_SVE + "1: \n" // + READNV_SVE NVTORGB_SVE RGBTOARGB8_SVE "subs %w[width], %w[width], %w[vl] \n" "st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n" "add %[dst_argb], %[dst_argb], %[vl], lsl #2 \n" - "b.gt 1b \n" + "b.ge 1b \n" "2: \n" "adds %w[width], %w[width], %w[vl] \n" @@ -465,8 +463,8 @@ static inline void NVToARGBRow_SVE2(const uint8_t* src_y, // Calculate a predicate for the final iteration to deal with the tail. "3: \n" "whilelt p1.h, wzr, %w[width_last_y] \n" - "whilelt p2.h, wzr, %w[width_last_uv] \n" READNV_SVE - NVTORGB_SVE RGBTOARGB8_SVE + "whilelt p2.h, wzr, %w[width_last_uv] \n" // + READNV_SVE NVTORGB_SVE RGBTOARGB8_SVE "st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n" "99: \n" @@ -481,7 +479,7 @@ static inline void NVToARGBRow_SVE2(const uint8_t* src_y, [nv_uv_step] "r"(nv_uv_step), // %[nv_uv_step] [width_last_y] "r"(width_last_y), // %[width_last_y] [width_last_uv] "r"(width_last_uv) // %[width_last_uv] - : "cc", "memory", YUVTORGB_SVE_REGS); + : "cc", "memory", YUVTORGB_SVE_REGS, "p2"); } void NV12ToARGBRow_SVE2(const uint8_t* src_y, @@ -489,8 +487,8 @@ void NV12ToARGBRow_SVE2(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - uint32_t nv_uv_start = 0x0200'0200U; - uint32_t nv_uv_step = 0x0404'0404U; + uint32_t nv_uv_start = 0x02000200U; + uint32_t nv_uv_step = 0x04040404U; NVToARGBRow_SVE2(src_y, src_uv, dst_argb, yuvconstants, width, nv_uv_start, nv_uv_step); } @@ -500,8 +498,8 @@ void NV21ToARGBRow_SVE2(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - uint32_t nv_uv_start = 0x0002'0002U; - uint32_t nv_uv_step = 0x0404'0404U; + uint32_t nv_uv_start = 0x00020002U; + uint32_t nv_uv_step = 0x04040404U; NVToARGBRow_SVE2(src_y, src_vu, dst_argb, yuvconstants, width, nv_uv_start, nv_uv_step); }