Revert "Disable NV12ToARGB_SVE2 which fails the 'any' test"

This reverts commit f480fa1c4a4af0ce3c34cd7b1ab0d85f1a36ce17.

This code has a number of small issues:

* The YUVTORGB_SVE_SETUP macro requires p0 to be initialized to
  all-true, however the existing kernel does not initialise p0 until
  after this macro is called, so flip the order.

* The p2 register is missing from the clobber list, so add it.

* The existing code uses the wrong condition flags when determining
  whether to do the tail iteration using WHILE instructions or not.
  Additionally the number of tail iterations is incorrect, as it was
  incorrectly not changed from when the tail code was always executed.

While we are here, make another few small improvements:

* Remove the single-quote digit separators as requested here:
  https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5622133

* Remove "volatile" from the asm block counting the vector length.  This
  particular asm block cannot be removed by the compiler since the
  output register is consumed by subsequent code, so "volatile" is
  unnecessary here and we remove it.

* Add some additional empty comments to force clang-format to put macros
  into the next line rather than on the same line as other asm.

Bug: b/352371649
Change-Id: I45676fab95343f588cf11ce2cf9186ffbe87489e
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5703586
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
George Steed 2024-07-12 19:02:39 +01:00 committed by Frank Barchard
parent e1a93c79fc
commit a64fffe632
2 changed files with 15 additions and 18 deletions

View File

@ -600,9 +600,8 @@ extern "C" {
#define HAS_I422TORGBAROW_SVE2
#define HAS_I444ALPHATOARGBROW_SVE2
#define HAS_I444TOARGBROW_SVE2
// Any support for NV12 SVE2 fails
//#define HAS_NV12TOARGBROW_SVE2
//#define HAS_NV21TOARGBROW_SVE2
#define HAS_NV12TOARGBROW_SVE2
#define HAS_NV21TOARGBROW_SVE2
#define HAS_RAWTOARGBROW_SVE2
#define HAS_RAWTORGB24ROW_SVE2
#define HAS_RAWTORGBAROW_SVE2

View File

@ -434,29 +434,27 @@ static inline void NVToARGBRow_SVE2(const uint8_t* src_y,
uint32_t nv_uv_start,
uint32_t nv_uv_step) {
uint64_t vl;
asm volatile (
"cnth %0" : "=r"(vl));
asm("cnth %0" : "=r"(vl));
int width_last_y = width & (vl - 1);
width_last_y = width_last_y == 0 ? vl : width_last_y;
int width_last_uv = width_last_y + (width_last_y & 1);
asm volatile(
"ptrue p0.b \n" //
YUVTORGB_SVE_SETUP
"ptrue p0.b \n"
"index z22.s, %w[nv_uv_start], %w[nv_uv_step] \n"
"dup z19.b, #255 \n" // A
"subs %w[width], %w[width], %w[vl] \n"
"b.le 2f \n"
"b.lt 2f \n"
// Run bulk of computation with an all-true predicate to avoid predicate
// generation overhead.
"ptrue p1.h \n"
"ptrue p2.h \n"
"1: \n" READNV_SVE
NVTORGB_SVE RGBTOARGB8_SVE
"1: \n" //
READNV_SVE NVTORGB_SVE RGBTOARGB8_SVE
"subs %w[width], %w[width], %w[vl] \n"
"st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n"
"add %[dst_argb], %[dst_argb], %[vl], lsl #2 \n"
"b.gt 1b \n"
"b.ge 1b \n"
"2: \n"
"adds %w[width], %w[width], %w[vl] \n"
@ -465,8 +463,8 @@ static inline void NVToARGBRow_SVE2(const uint8_t* src_y,
// Calculate a predicate for the final iteration to deal with the tail.
"3: \n"
"whilelt p1.h, wzr, %w[width_last_y] \n"
"whilelt p2.h, wzr, %w[width_last_uv] \n" READNV_SVE
NVTORGB_SVE RGBTOARGB8_SVE
"whilelt p2.h, wzr, %w[width_last_uv] \n" //
READNV_SVE NVTORGB_SVE RGBTOARGB8_SVE
"st2h {z16.h, z17.h}, p1, [%[dst_argb]] \n"
"99: \n"
@ -481,7 +479,7 @@ static inline void NVToARGBRow_SVE2(const uint8_t* src_y,
[nv_uv_step] "r"(nv_uv_step), // %[nv_uv_step]
[width_last_y] "r"(width_last_y), // %[width_last_y]
[width_last_uv] "r"(width_last_uv) // %[width_last_uv]
: "cc", "memory", YUVTORGB_SVE_REGS);
: "cc", "memory", YUVTORGB_SVE_REGS, "p2");
}
void NV12ToARGBRow_SVE2(const uint8_t* src_y,
@ -489,8 +487,8 @@ void NV12ToARGBRow_SVE2(const uint8_t* src_y,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
uint32_t nv_uv_start = 0x0200'0200U;
uint32_t nv_uv_step = 0x0404'0404U;
uint32_t nv_uv_start = 0x02000200U;
uint32_t nv_uv_step = 0x04040404U;
NVToARGBRow_SVE2(src_y, src_uv, dst_argb, yuvconstants, width, nv_uv_start,
nv_uv_step);
}
@ -500,8 +498,8 @@ void NV21ToARGBRow_SVE2(const uint8_t* src_y,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
uint32_t nv_uv_start = 0x0002'0002U;
uint32_t nv_uv_step = 0x0404'0404U;
uint32_t nv_uv_start = 0x00020002U;
uint32_t nv_uv_step = 0x04040404U;
NVToARGBRow_SVE2(src_y, src_vu, dst_argb, yuvconstants, width, nv_uv_start,
nv_uv_step);
}