From cd4113f4e8463b653985054e7f16699de35a3a9d Mon Sep 17 00:00:00 2001
From: George Steed <george.steed@arm.com>
Date: Tue, 23 Apr 2024 15:13:47 +0100
Subject: [PATCH] [AArch64] Add SVE2 implementation of I400ToARGBRow

This is mostly a copy of the I422ToARGBRow_SVE2 implementation, but we
can pre-calculate the UV component results before the loop body.

Unlike in the Neon version of the code we can make use of MOVPRFX and
USQADD to avoid needing to apply the bias separately from the UV
coefficient multiply additions.

Reduction in runtime observed compared to the existing Neon code:

Cortex-A510: -26.1%
Cortex-A520:  -5.9%
Cortex-A715: -49.5%
Cortex-A720: -49.4%
  Cortex-X2: -22.5%
  Cortex-X3: -23.5%
  Cortex-X4: -21.6%

Bug: libyuv:973
Change-Id: Ib9fc52bd53a1c6a1aac8bd865ab88539aca098ea
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5598767
Reviewed-by: Justin Green <greenjustin@google.com>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
---
 include/libyuv/row.h   |  5 ++++
 source/convert_argb.cc |  5 ++++
 source/row_sve.cc      | 60 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 70 insertions(+)

diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index f3e1d2eaf..0c5194b7a 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -585,6 +585,7 @@ extern "C" {
 #define HAS_AYUVTOUVROW_SVE2
 #define HAS_AYUVTOVUROW_SVE2
 #define HAS_BGRATOUVROW_SVE2
+#define HAS_I400TOARGBROW_SVE2
 #define HAS_I422ALPHATOARGBROW_SVE2
 #define HAS_I422TOARGBROW_SVE2
 #define HAS_I422TORGBAROW_SVE2
@@ -4743,6 +4744,10 @@ void I400ToARGBRow_NEON(const uint8_t* src_y,
                         uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width);
+void I400ToARGBRow_SVE2(const uint8_t* src_y,
+                        uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
 void I400ToARGBRow_MSA(const uint8_t* src_y,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
diff --git a/source/convert_argb.cc b/source/convert_argb.cc
index bc37802b0..0aae2f8b3 100644
--- a/source/convert_argb.cc
+++ b/source/convert_argb.cc
@@ -2980,6 +2980,11 @@ int I400ToARGBMatrix(const uint8_t* src_y,
     }
   }
 #endif
+#if defined(HAS_I400TOARGBROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    I400ToARGBRow = I400ToARGBRow_SVE2;
+  }
+#endif
 #if defined(HAS_I400TOARGBROW_MSA)
   if (TestCpuFlag(kCpuHasMSA)) {
     I400ToARGBRow = I400ToARGBRow_Any_MSA;
diff --git a/source/row_sve.cc b/source/row_sve.cc
index f06e46fbd..814023056 100644
--- a/source/row_sve.cc
+++ b/source/row_sve.cc
@@ -43,6 +43,12 @@ extern "C" {
   "trn1       z1.h, z1.h, z1.h               \n" \
   "trn1       z2.h, z2.h, z2.h               \n"
 
+#define READYUV400_SVE                           \
+  "ld1b       {z0.h}, p1/z, [%[src_y]]       \n" \
+  "inch       %[src_y]                       \n" \
+  "prfm       pldl1keep, [%[src_y], 448]     \n" \
+  "trn1       z0.b, z0.b, z0.b               \n"
+
 // We need a different predicate for the UV component to handle the tail.
 // If there is a single element remaining then we want to load one Y element
 // but two UV elements.
@@ -80,6 +86,14 @@ extern "C" {
   "uqsub      z16.h, z16.h, z25.h            \n" /* B */  \
   "uqsub      z18.h, z18.h, z27.h            \n" /* R */
 
+#define I400TORGB_SVE                                    \
+  "umulh      z18.h, z24.h, z0.h             \n" /* Y */ \
+  "movprfx    z16, z18                       \n"         \
+  "usqadd     z16.h, p0/m, z16.h, z4.h       \n" /* B */ \
+  "movprfx    z17, z18                       \n"         \
+  "usqadd     z17.h, p0/m, z17.h, z6.h       \n" /* G */ \
+  "usqadd     z18.h, p0/m, z18.h, z5.h       \n" /* R */
+
 // Convert from 2.14 fixed point RGB to 8 bit ARGB, interleaving as BG and RA
 // pairs to allow us to use ST2 for storing rather than ST4.
 #define RGBTOARGB8_SVE                                    \
@@ -146,6 +160,52 @@ void I444ToARGBRow_SVE2(const uint8_t* src_y,
       : "cc", "memory", YUVTORGB_SVE_REGS);
 }
 
+void I400ToARGBRow_SVE2(const uint8_t* src_y,
+                        uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width) {
+  uint64_t vl;
+  asm("cnth     %[vl]                                   \n"
+      "ptrue    p0.b                                    \n"
+      "dup      z19.b, #255                             \n"  // A
+      YUVTORGB_SVE_SETUP
+      "cmp      %w[width], %w[vl]                       \n"
+      "mov      z1.h, #128                              \n"  // U/V
+      "mul      z6.h, z30.h, z1.h                       \n"
+      "mul      z4.h, z28.h, z1.h                       \n"  // DB
+      "mul      z5.h, z29.h, z1.h                       \n"  // DR
+      "mla      z6.h, p0/m, z31.h, z1.h                 \n"  // DG
+      "sub      z4.h, z4.h, z25.h                       \n"
+      "sub      z5.h, z5.h, z27.h                       \n"
+      "sub      z6.h, z26.h, z6.h                       \n"
+      "b.le     2f                                      \n"
+
+      // Run bulk of computation with an all-true predicate to avoid predicate
+      // generation overhead.
+      "ptrue    p1.h                                    \n"
+      "sub      %w[width], %w[width], %w[vl]            \n"
+      "1:                                               \n"  //
+      READYUV400_SVE I400TORGB_SVE RGBTOARGB8_SVE
+      "subs     %w[width], %w[width], %w[vl]            \n"
+      "st2h     {z16.h, z17.h}, p1, [%[dst_argb]]       \n"
+      "add      %[dst_argb], %[dst_argb], %[vl], lsl #2 \n"
+      "b.gt     1b                                      \n"
+      "add      %w[width], %w[width], %w[vl]            \n"
+
+      // Calculate a predicate for the final iteration to deal with the tail.
+      "2:                                               \n"
+      "whilelt  p1.h, wzr, %w[width]                    \n"  //
+      READYUV400_SVE I400TORGB_SVE RGBTOARGB8_SVE
+      "st2h     {z16.h, z17.h}, p1, [%[dst_argb]]       \n"
+      : [src_y] "+r"(src_y),                               // %[src_y]
+        [dst_argb] "+r"(dst_argb),                         // %[dst_argb]
+        [width] "+r"(width),                               // %[width]
+        [vl] "=&r"(vl)                                     // %[vl]
+      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
+        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
+      : "cc", "memory", YUVTORGB_SVE_REGS);
+}
+
 void I422ToARGBRow_SVE2(const uint8_t* src_y,
                         const uint8_t* src_u,
                         const uint8_t* src_v,