From 1b2f6cdbe81afd651da102e28ed3a1cf7daf06f9 Mon Sep 17 00:00:00 2001
From: George Steed <george.steed@arm.com>
Date: Fri, 6 Jun 2025 11:20:17 +0100
Subject: [PATCH] [AArch64] Unroll I210ToAR30Row_{SVE2,SME}

Now that we have a STOREAR30_SVE_2X implementation, we can use it to
unroll other kernels. The predication in I210ToAR30Row needs adjusting
so that two vectors of Y are loaded for each vector of U and V, and
UZP is additionally needed so that the arrangement of the Y data in
vector lanes matches the U/V layout. LD2H could also be used; however,
this provides no performance improvement on most cores and would
necessitate the addition of an "any" kernel to handle the case where
width % 2 != 0.

Change in run time of I210ToAR30Row_SVE2 compared to the previous SVE2
implementation (negative is faster; note that even where a slowdown is
observed, the SVE2 implementation still outperforms the existing Neon
code):

Cortex-A510: -37.1%
Cortex-A520: -39.1%
Cortex-A710: +1.6% (!)
Cortex-A715: +6.5% (!)
Cortex-A720: +6.5% (!)
  Cortex-X2: -2.9%
  Cortex-X3: -2.2%
  Cortex-X4: -8.8%
Cortex-X925: -3.5%

Change-Id: I2ff285b48105883526eceb8be1fcbe0e033a553b
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6640989
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Justin Green <greenjustin@google.com>
---
 include/libyuv/row_sve.h | 42 ++++++++++++++++++++++++++++++++--------
 1 file changed, 34 insertions(+), 8 deletions(-)

diff --git a/include/libyuv/row_sve.h b/include/libyuv/row_sve.h
index 9ed0e4efb..ece62bcc5 100644
--- a/include/libyuv/row_sve.h
+++ b/include/libyuv/row_sve.h
@@ -119,6 +119,26 @@ extern "C" {
   "uqshrnb    z1.b, z1.h, #2                 \n" \
   "uqshrnb    z2.b, z2.h, #2                 \n"
 
+#define READI210_SVE_2X /* Load 2 vectors of Y and 1 each of U and V; Y ends up in z0/z1, U/V in z2/z3. */ \
+  "ld1h       {z4.h}, p2/z, [%[src_y]]                   \n" /* first Y vector; inactive p2 lanes read as zero */ \
+  "ld1h       {z5.h}, p3/z, [%[src_y], #1, mul vl]       \n" /* second Y vector, one vector length further on, under p3 */ \
+  "ld1h       {z2.h}, p1/z, [%[src_u]]                   \n" /* U vector under p1 */ \
+  "ld1h       {z3.h}, p1/z, [%[src_v]]                   \n" /* V vector under p1 */ \
+  "incb       %[src_y], all, mul #2                      \n" /* src_y += 2 vector lengths (in bytes) */ \
+  "uzp1       z6.h, z4.h, z5.h                           \n" /* z6 = even-indexed Y elements of z4:z5 */ \
+  "uzp2       z5.h, z4.h, z5.h                           \n" /* z5 = odd-indexed Y elements of z4:z5 */ \
+  "incb       %[src_u]                                   \n" /* src_u += 1 vector length (in bytes) */ \
+  "incb       %[src_v]                                   \n" /* src_v += 1 vector length (in bytes) */ \
+  "lsl        z0.h, z6.h, #6                             \n" /* z0 = even Y << 6 */ \
+  "lsl        z1.h, z5.h, #6                             \n" /* z1 = odd Y << 6 */ \
+  "prfm       pldl1keep, [%[src_y], 448]                 \n" /* prefetch for load at src_y + 448 bytes */ \
+  "prfm       pldl1keep, [%[src_u], 128]                 \n" /* prefetch for load at src_u + 128 bytes */ \
+  "prfm       pldl1keep, [%[src_v], 128]                 \n" /* prefetch for load at src_v + 128 bytes */ \
+  "usra       z0.h, z6.h, #4                             \n" /* z0 += even Y >> 4, i.e. (y << 6) + (y >> 4) */ \
+  "usra       z1.h, z5.h, #4                             \n" /* z1 += odd Y >> 4, i.e. (y << 6) + (y >> 4) */ \
+  "uqshrnb    z2.b, z2.h, #2                             \n" /* U >> 2, saturated to 8 bits, into the even bytes */ \
+  "uqshrnb    z3.b, z3.h, #2                             \n" /* V >> 2, saturated to 8 bits, into the even bytes */
+
 #define READP210_SVE                             \
   "ld1h       {z0.h}, p1/z, [%[src_y]]       \n" \
   "ld1h       {z1.h}, p2/z, [%[src_uv]]      \n" \
@@ -1387,31 +1407,36 @@ static inline void I210ToAR30Row_SVE_SC(const uint16_t* src_y,
                                         int width) STREAMING_COMPATIBLE {
   uint64_t vl;
   asm("cnth %0" : "=r"(vl));
-  int width_last_y = width & (vl - 1);
-  // The limit is used for saturating the 2.14 red channel in STOREAR30_SVE.
+  int width_last_y = width & (2 * vl - 1);
+  int width_last_uv = (width_last_y + 1) / 2;
+  // The limit is used for saturating the 2.14 red channel in STOREAR30_SVE_2X.
   uint16_t limit = 0x3ff0;
   asm volatile(
       "ptrue    p0.b                                    \n"  //
       YUVTORGB_SVE_SETUP
       "dup      z23.h, %w[limit]                        \n"
-      "subs     %w[width], %w[width], %w[vl]            \n"
+      "subs     %w[width], %w[width], %w[vl], lsl #1    \n"
       "b.lt     2f                                      \n"
 
       // Run bulk of computation with an all-true predicate to avoid predicate
       // generation overhead.
       "ptrue    p1.h                                    \n"
+      "ptrue    p2.h                                    \n"
+      "ptrue    p3.h                                    \n"
       "1:                                               \n"  //
-      READI210_SVE I4XXTORGB_SVE STOREAR30_SVE
-      "subs     %w[width], %w[width], %w[vl]            \n"
+      READI210_SVE_2X I422TORGB_SVE_2X STOREAR30_SVE_2X
+      "subs     %w[width], %w[width], %w[vl], lsl #1    \n"
       "b.ge     1b                                      \n"
 
       "2:                                               \n"
-      "adds     %w[width], %w[width], %w[vl]            \n"
+      "adds     %w[width], %w[width], %w[vl], lsl #1    \n"
       "b.eq     99f                                     \n"
 
       // Calculate a predicate for the final iteration to deal with the tail.
-      "whilelt  p1.h, wzr, %w[width_last_y]             \n"  //
-      READI210_SVE I4XXTORGB_SVE STOREAR30_SVE
+      "whilelt  p1.h, wzr, %w[width_last_uv]            \n"
+      "whilelt  p2.h, wzr, %w[width_last_y]             \n"
+      "whilelt  p3.h, %w[vl], %w[width_last_y]          \n"  //
+      READI210_SVE_2X I422TORGB_SVE_2X STOREAR30_SVE_2X
 
       "99:                                              \n"
       : [src_y] "+r"(src_y),                                // %[src_y]
@@ -1423,6 +1448,7 @@ static inline void I210ToAR30Row_SVE_SC(const uint16_t* src_y,
         [kUVCoeff] "r"(&yuvconstants->kUVCoeff),            // %[kUVCoeff]
         [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias),  // %[kRGBCoeffBias]
         [width_last_y] "r"(width_last_y),                   // %[width_last_y]
+        [width_last_uv] "r"(width_last_uv),                 // %[width_last_uv]
         [limit] "r"(limit)                                  // %[limit]
       : "cc", "memory", YUVTORGB_SVE_REGS);
 }