From a64fffe632101caa83c2571fe6c6481802aa2e04 Mon Sep 17 00:00:00 2001
From: George Steed <george.steed@arm.com>
Date: Fri, 12 Jul 2024 19:02:39 +0100
Subject: [PATCH] Revert "Disable NV12ToARGB_SVE2 which fails the 'any' test"

This reverts commit f480fa1c4a4af0ce3c34cd7b1ab0d85f1a36ce17.

This code has a number of small issues:

* The YUVTORGB_SVE_SETUP macro requires p0 to be initialized to
  all-true, however the existing kernel does not initialise p0 until
  after this macro is called, so flip the order.

* The p2 register is missing from the clobber list, so add it.

* The existing code uses the wrong condition flags when determining
  whether to do the tail iteration using WHILE instructions or not.
  Additionally the number of tail iterations is incorrect, as it was
  incorrectly not changed from when the tail code was always executed.

While we are here, make another few small improvements:

* Remove the single-quote digit separators as requested here:
  https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5622133

* Remove "volatile" from the asm block counting the vector length.  This
  particular asm block cannot be removed by the compiler since the
  output register is consumed by subsequent code, so "volatile" is
  unnecessary here and we remove it.

* Add some additional empty comments to force clang-format to put macros
  into the next line rather than on the same line as other asm.

Bug: b/352371649
Change-Id: I45676fab95343f588cf11ce2cf9186ffbe87489e
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5703586
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
---
 include/libyuv/row.h |  5 ++---
 source/row_sve.cc    | 28 +++++++++++++---------------
 2 files changed, 15 insertions(+), 18 deletions(-)

diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index c1c405d63..d080bb2e8 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -600,9 +600,8 @@ extern "C" {
 #define HAS_I422TORGBAROW_SVE2
 #define HAS_I444ALPHATOARGBROW_SVE2
 #define HAS_I444TOARGBROW_SVE2
-// Any support for NV12 SVE2 fails
-//#define HAS_NV12TOARGBROW_SVE2
-//#define HAS_NV21TOARGBROW_SVE2
+#define HAS_NV12TOARGBROW_SVE2
+#define HAS_NV21TOARGBROW_SVE2
 #define HAS_RAWTOARGBROW_SVE2
 #define HAS_RAWTORGB24ROW_SVE2
 #define HAS_RAWTORGBAROW_SVE2
diff --git a/source/row_sve.cc b/source/row_sve.cc
index 200433fec..97ba9cbac 100644
--- a/source/row_sve.cc
+++ b/source/row_sve.cc
@@ -434,29 +434,27 @@ static inline void NVToARGBRow_SVE2(const uint8_t* src_y,
                                     uint32_t nv_uv_start,
                                     uint32_t nv_uv_step) {
   uint64_t vl;
-  asm volatile (
-      "cnth %0" : "=r"(vl));
+  asm("cnth %0" : "=r"(vl));
   int width_last_y = width & (vl - 1);
-  width_last_y = width_last_y == 0 ? vl : width_last_y;
   int width_last_uv = width_last_y + (width_last_y & 1);
   asm volatile(
+      "ptrue    p0.b                                    \n"  //
       YUVTORGB_SVE_SETUP
-      "ptrue    p0.b                                    \n"
       "index    z22.s, %w[nv_uv_start], %w[nv_uv_step]  \n"
       "dup      z19.b, #255                             \n"  // A
       "subs     %w[width], %w[width], %w[vl]            \n"
-      "b.le     2f                                      \n"
+      "b.lt     2f                                      \n"
 
       // Run bulk of computation with an all-true predicate to avoid predicate
       // generation overhead.
       "ptrue    p1.h                                    \n"
       "ptrue    p2.h                                    \n"
-      "1:                                               \n" READNV_SVE
-          NVTORGB_SVE RGBTOARGB8_SVE
+      "1:                                               \n"  //
+      READNV_SVE NVTORGB_SVE RGBTOARGB8_SVE
       "subs     %w[width], %w[width], %w[vl]            \n"
       "st2h     {z16.h, z17.h}, p1, [%[dst_argb]]       \n"
       "add      %[dst_argb], %[dst_argb], %[vl], lsl #2 \n"
-      "b.gt     1b                                      \n"
+      "b.ge     1b                                      \n"
 
       "2:                                               \n"
       "adds     %w[width], %w[width], %w[vl]            \n"
@@ -465,8 +463,8 @@ static inline void NVToARGBRow_SVE2(const uint8_t* src_y,
       // Calculate a predicate for the final iteration to deal with the tail.
       "3:                                               \n"
       "whilelt  p1.h, wzr, %w[width_last_y]             \n"
-      "whilelt  p2.h, wzr, %w[width_last_uv]            \n" READNV_SVE
-          NVTORGB_SVE RGBTOARGB8_SVE
+      "whilelt  p2.h, wzr, %w[width_last_uv]            \n"  //
+      READNV_SVE NVTORGB_SVE RGBTOARGB8_SVE
       "st2h     {z16.h, z17.h}, p1, [%[dst_argb]]       \n"
 
       "99:                                              \n"
@@ -481,7 +479,7 @@ static inline void NVToARGBRow_SVE2(const uint8_t* src_y,
         [nv_uv_step] "r"(nv_uv_step),                       // %[nv_uv_step]
         [width_last_y] "r"(width_last_y),                   // %[width_last_y]
         [width_last_uv] "r"(width_last_uv)                  // %[width_last_uv]
-      : "cc", "memory", YUVTORGB_SVE_REGS);
+      : "cc", "memory", YUVTORGB_SVE_REGS, "p2");
 }
 
 void NV12ToARGBRow_SVE2(const uint8_t* src_y,
@@ -489,8 +487,8 @@ void NV12ToARGBRow_SVE2(const uint8_t* src_y,
                         uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width) {
-  uint32_t nv_uv_start = 0x0200'0200U;
-  uint32_t nv_uv_step = 0x0404'0404U;
+  uint32_t nv_uv_start = 0x02000200U;
+  uint32_t nv_uv_step = 0x04040404U;
   NVToARGBRow_SVE2(src_y, src_uv, dst_argb, yuvconstants, width, nv_uv_start,
                    nv_uv_step);
 }
@@ -500,8 +498,8 @@ void NV21ToARGBRow_SVE2(const uint8_t* src_y,
                         uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width) {
-  uint32_t nv_uv_start = 0x0002'0002U;
-  uint32_t nv_uv_step = 0x0404'0404U;
+  uint32_t nv_uv_start = 0x00020002U;
+  uint32_t nv_uv_step = 0x04040404U;
   NVToARGBRow_SVE2(src_y, src_vu, dst_argb, yuvconstants, width, nv_uv_start,
                    nv_uv_step);
 }