diff --git a/include/libyuv/row.h b/include/libyuv/row.h index c1c405d63..d080bb2e8 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -600,9 +600,8 @@ extern "C" { #define HAS_I422TORGBAROW_SVE2 #define HAS_I444ALPHATOARGBROW_SVE2 #define HAS_I444TOARGBROW_SVE2 -// Any support for NV12 SVE2 fails -//#define HAS_NV12TOARGBROW_SVE2 -//#define HAS_NV21TOARGBROW_SVE2 +#define HAS_NV12TOARGBROW_SVE2 +#define HAS_NV21TOARGBROW_SVE2 #define HAS_RAWTOARGBROW_SVE2 #define HAS_RAWTORGB24ROW_SVE2 #define HAS_RAWTORGBAROW_SVE2 diff --git a/source/row_sve.cc b/source/row_sve.cc index 200433fec..97ba9cbac 100644 --- a/source/row_sve.cc +++ b/source/row_sve.cc @@ -434,29 +434,27 @@ static inline void NVToARGBRow_SVE2(const uint8_t* src_y, uint32_t nv_uv_start, uint32_t nv_uv_step) { uint64_t vl; - asm volatile ( - "cnth %0" : "=r"(vl)); + asm("cnth %0" : "=r"(vl)); int width_last_y = width & (vl - 1); - width_last_y = width_last_y == 0 ? vl : width_last_y; int width_last_uv = width_last_y + (width_last_y & 1); asm volatile( + "ptrue p0.b \n" // YUVTORGB_SVE_SETUP - "ptrue p0.b \n" "index z22.s, %w[nv_uv_start], %w[nv_uv_step] \n" "dup z19.b, #255 \n" // A "subs %w[width], %w[width], %w[vl] \n" - "b.le 2f \n" + "b.lt 2f \n" // Run bulk of computation with an all-true predicate to avoid predicate // generation overhead. "ptrue p1.h \n" "ptrue p2.h \n" - "1: \n" READNV_SVE - NVTORGB_SVE RGBTOARGB8_SVE + "1: \n" // + READNV_SVE NVTORGB_SVE RGBTOARGB8_SVE "subs %w[width], %w[width], %w[vl] \n" "st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n" "add %[dst_argb], %[dst_argb], %[vl], lsl #2 \n" - "b.gt 1b \n" + "b.ge 1b \n" "2: \n" "adds %w[width], %w[width], %w[vl] \n" @@ -465,8 +463,8 @@ static inline void NVToARGBRow_SVE2(const uint8_t* src_y, // Calculate a predicate for the final iteration to deal with the tail. "3: \n" "whilelt p1.h, wzr, %w[width_last_y] \n" - "whilelt p2.h, wzr, %w[width_last_uv] \n" READNV_SVE - NVTORGB_SVE RGBTOARGB8_SVE + "whilelt p2.h, wzr, %w[width_last_uv] \n" // + READNV_SVE NVTORGB_SVE RGBTOARGB8_SVE "st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n" "99: \n" @@ -481,7 +479,7 @@ static inline void NVToARGBRow_SVE2(const uint8_t* src_y, [nv_uv_step] "r"(nv_uv_step), // %[nv_uv_step] [width_last_y] "r"(width_last_y), // %[width_last_y] [width_last_uv] "r"(width_last_uv) // %[width_last_uv] - : "cc", "memory", YUVTORGB_SVE_REGS); + : "cc", "memory", YUVTORGB_SVE_REGS, "p2"); } void NV12ToARGBRow_SVE2(const uint8_t* src_y, @@ -489,8 +487,8 @@ void NV12ToARGBRow_SVE2(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - uint32_t nv_uv_start = 0x0200'0200U; - uint32_t nv_uv_step = 0x0404'0404U; + uint32_t nv_uv_start = 0x02000200U; + uint32_t nv_uv_step = 0x04040404U; NVToARGBRow_SVE2(src_y, src_uv, dst_argb, yuvconstants, width, nv_uv_start, nv_uv_step); } @@ -500,8 +498,8 @@ void NV21ToARGBRow_SVE2(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - uint32_t nv_uv_start = 0x0002'0002U; - uint32_t nv_uv_step = 0x0404'0404U; + uint32_t nv_uv_start = 0x00020002U; + uint32_t nv_uv_step = 0x04040404U; NVToARGBRow_SVE2(src_y, src_vu, dst_argb, yuvconstants, width, nv_uv_start, nv_uv_step); }