Revert "Disable NV12ToARGB_SVE2 which fails the 'any' test"

This reverts commit f480fa1c4a4af0ce3c34cd7b1ab0d85f1a36ce17.

This code has a number of small issues:

* The YUVTORGB_SVE_SETUP macro requires p0 to be initialized to all-true, however the existing kernel does not initialize p0 until after this macro is called, so flip the order.
* The p2 register is missing from the clobber list, so add it.
* The existing code uses the wrong condition flags when determining whether to do the tail iteration using WHILE instructions or not. Additionally, the number of tail iterations is incorrect: it was not updated from when the tail code was always executed.

While we are here, make another few small improvements:

* Remove the single-quote digit separators as requested here: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5622133
* Remove "volatile" from the asm block counting the vector length. This particular asm block cannot be removed by the compiler since the output register is consumed by subsequent code, so "volatile" is unnecessary here and we remove it.
* Add some additional empty comments to force clang-format to put macros onto the next line rather than on the same line as other asm.

Bug: b/352371649
Change-Id: I45676fab95343f588cf11ce2cf9186ffbe87489e
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5703586
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
2025-12-06 16:56:55 +08:00 · 2024-07-12 19:02:39 +01:00 · 2024-07-12 19:02:39 +01:00 · a64fffe632
commit a64fffe632
parent e1a93c79fc
2 changed files with 15 additions and 18 deletions
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@ -600,9 +600,8 @@ extern "C" {
 #define HAS_I422TORGBAROW_SVE2
 #define HAS_I444ALPHATOARGBROW_SVE2
 #define HAS_I444TOARGBROW_SVE2
-// Any support for NV12 SVE2 fails
-//#define HAS_NV12TOARGBROW_SVE2
-//#define HAS_NV21TOARGBROW_SVE2
+#define HAS_NV12TOARGBROW_SVE2
+#define HAS_NV21TOARGBROW_SVE2
 #define HAS_RAWTOARGBROW_SVE2
 #define HAS_RAWTORGB24ROW_SVE2
 #define HAS_RAWTORGBAROW_SVE2
--- a/source/row_sve.cc
+++ b/source/row_sve.cc
@ -434,29 +434,27 @@ static inline void NVToARGBRow_SVE2(const uint8_t* src_y,
                                    uint32_t nv_uv_start,
                                    uint32_t nv_uv_step) {
  uint64_t vl;
-  asm volatile (
-      "cnth %0" : "=r"(vl));
+  asm("cnth %0" : "=r"(vl));
  int width_last_y = width & (vl - 1);
-  width_last_y = width_last_y == 0 ? vl : width_last_y;
  int width_last_uv = width_last_y + (width_last_y & 1);
  asm volatile(
+      "ptrue    p0.b                                    \n"  //
      YUVTORGB_SVE_SETUP
-      "ptrue    p0.b                                    \n"
      "index    z22.s, %w[nv_uv_start], %w[nv_uv_step]  \n"
      "dup      z19.b, #255                             \n"  // A
      "subs     %w[width], %w[width], %w[vl]            \n"
-      "b.le     2f                                      \n"
+      "b.lt     2f                                      \n"

      // Run bulk of computation with an all-true predicate to avoid predicate
      // generation overhead.
      "ptrue    p1.h                                    \n"
      "ptrue    p2.h                                    \n"
-      "1:                                               \n" READNV_SVE
-          NVTORGB_SVE RGBTOARGB8_SVE
+      "1:                                               \n"  //
+      READNV_SVE NVTORGB_SVE RGBTOARGB8_SVE
      "subs     %w[width], %w[width], %w[vl]            \n"
      "st2h     {z16.h, z17.h}, p1, [%[dst_argb]]       \n"
      "add      %[dst_argb], %[dst_argb], %[vl], lsl #2 \n"
-      "b.gt     1b                                      \n"
+      "b.ge     1b                                      \n"

      "2:                                               \n"
      "adds     %w[width], %w[width], %w[vl]            \n"
@ -465,8 +463,8 @@ static inline void NVToARGBRow_SVE2(const uint8_t* src_y,
      // Calculate a predicate for the final iteration to deal with the tail.
      "3:                                               \n"
      "whilelt  p1.h, wzr, %w[width_last_y]             \n"
-      "whilelt  p2.h, wzr, %w[width_last_uv]            \n" READNV_SVE
-          NVTORGB_SVE RGBTOARGB8_SVE
+      "whilelt  p2.h, wzr, %w[width_last_uv]            \n"  //
+      READNV_SVE NVTORGB_SVE RGBTOARGB8_SVE
      "st2h     {z16.h, z17.h}, p1, [%[dst_argb]]       \n"

      "99:                                              \n"
@ -481,7 +479,7 @@ static inline void NVToARGBRow_SVE2(const uint8_t* src_y,
        [nv_uv_step] "r"(nv_uv_step),                       // %[nv_uv_step]
        [width_last_y] "r"(width_last_y),                   // %[width_last_y]
        [width_last_uv] "r"(width_last_uv)                  // %[width_last_uv]
-      : "cc", "memory", YUVTORGB_SVE_REGS);
+      : "cc", "memory", YUVTORGB_SVE_REGS, "p2");
 }

 void NV12ToARGBRow_SVE2(const uint8_t* src_y,
@ -489,8 +487,8 @@ void NV12ToARGBRow_SVE2(const uint8_t* src_y,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
-  uint32_t nv_uv_start = 0x0200'0200U;
-  uint32_t nv_uv_step = 0x0404'0404U;
+  uint32_t nv_uv_start = 0x02000200U;
+  uint32_t nv_uv_step = 0x04040404U;
  NVToARGBRow_SVE2(src_y, src_uv, dst_argb, yuvconstants, width, nv_uv_start,
                   nv_uv_step);
 }
@ -500,8 +498,8 @@ void NV21ToARGBRow_SVE2(const uint8_t* src_y,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
-  uint32_t nv_uv_start = 0x0002'0002U;
-  uint32_t nv_uv_step = 0x0404'0404U;
+  uint32_t nv_uv_start = 0x00020002U;
+  uint32_t nv_uv_step = 0x04040404U;
  NVToARGBRow_SVE2(src_y, src_vu, dst_argb, yuvconstants, width, nv_uv_start,
                   nv_uv_step);
 }