From f4eaeca22aa82f1b69eef4aca7f04b90f1584fb0 Mon Sep 17 00:00:00 2001
From: George Steed <george.steed@arm.com>
Date: Sun, 5 May 2024 19:26:06 +0100
Subject: [PATCH] [AArch64] Add SVE2 implementation of I422ToARGB1555Row

This makes use of the same approach as the Neon code to avoid redundant
narrowing and then widening shifts by instead placing the values in the
top portion of the lanes and then shifting down from there.

Observed reduction in runtime compared to the existing Neon code:

Cortex-A510: -41.8%
Cortex-A520: -42.6%
Cortex-A715: -22.5%
Cortex-A720: -22.6%
  Cortex-X2: -22.7%
  Cortex-X3: -22.4%
  Cortex-X4: -19.4%
Cortex-X925: -27.0%

Bug: b/42280942
Change-Id: I24b092bb352d9858e3d969d82b55940bb00ac7e0
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5802967
Reviewed-by: Justin Green <greenjustin@google.com>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
---
 include/libyuv/row.h   |  7 ++++++
 source/convert_argb.cc |  5 ++++
 source/row_sve.cc      | 56 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 68 insertions(+)

diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 8b1328048..a7b1ecff9 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -550,6 +550,7 @@ extern "C" {
 #define HAS_BGRATOUVROW_SVE2
 #define HAS_I400TOARGBROW_SVE2
 #define HAS_I422ALPHATOARGBROW_SVE2
+#define HAS_I422TOARGB1555ROW_SVE2
 #define HAS_I422TOARGBROW_SVE2
 #define HAS_I422TORGB24ROW_SVE2
 #define HAS_I422TORGB565ROW_SVE2
@@ -1194,6 +1195,12 @@ void I422ToARGB1555Row_NEON(const uint8_t* src_y,
                             uint8_t* dst_argb1555,
                             const struct YuvConstants* yuvconstants,
                             int width);
+void I422ToARGB1555Row_SVE2(const uint8_t* src_y,
+                            const uint8_t* src_u,
+                            const uint8_t* src_v,
+                            uint8_t* dst_argb1555,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
 void I422ToARGB4444Row_NEON(const uint8_t* src_y,
                             const uint8_t* src_u,
                             const uint8_t* src_v,
diff --git a/source/convert_argb.cc b/source/convert_argb.cc
index 58ec386c8..eefe3fe27 100644
--- a/source/convert_argb.cc
+++ b/source/convert_argb.cc
@@ -5701,6 +5701,11 @@ int I420ToARGB1555(const uint8_t* src_y,
     }
   }
 #endif
+#if defined(HAS_I422TOARGB1555ROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    I422ToARGB1555Row = I422ToARGB1555Row_SVE2;
+  }
+#endif
 #if defined(HAS_I422TOARGB1555ROW_MSA)
   if (TestCpuFlag(kCpuHasMSA)) {
     I422ToARGB1555Row = I422ToARGB1555Row_Any_MSA;
diff --git a/source/row_sve.cc b/source/row_sve.cc
index b16b298dc..5d4d79ff3 100644
--- a/source/row_sve.cc
+++ b/source/row_sve.cc
@@ -439,6 +439,62 @@ void I422ToRGB565Row_SVE2(const uint8_t* src_y,
       : "cc", "memory", YUVTORGB_SVE_REGS);
 }
 
+#define RGB8TOARGB1555_SVE_FROM_TOP_2X                      \
+  "dup      z0.h, #0x8000        \n" /* 1000000000000000 */ \
+  "dup      z1.h, #0x8000        \n" /* 1000000000000000 */ \
+  "sri      z0.h, z18.h, #1      \n" /* 1rrrrrxxxxxxxxxx */ \
+  "sri      z1.h, z22.h, #1      \n" /* 1rrrrrxxxxxxxxxx */ \
+  "sri      z0.h, z17.h, #6      \n" /* 1rrrrrgggggxxxxx */ \
+  "sri      z1.h, z21.h, #6      \n" /* 1rrrrrgggggxxxxx */ \
+  "sri      z0.h, z16.h, #11     \n" /* 1rrrrrgggggbbbbb */ \
+  "sri      z1.h, z20.h, #11     \n" /* 1rrrrrgggggbbbbb */
+
+void I422ToARGB1555Row_SVE2(const uint8_t* src_y,
+                            const uint8_t* src_u,
+                            const uint8_t* src_v,
+                            uint8_t* dst_argb1555,
+                            const struct YuvConstants* yuvconstants,
+                            int width) {
+  uint64_t vl;
+  asm volatile(
+      "cntb     %[vl]                                   \n"
+      "ptrue    p0.b                                    \n" YUVTORGB_SVE_SETUP
+      "subs     %w[width], %w[width], %w[vl]            \n"
+      "b.lt     2f                                      \n"
+
+      // Run bulk of computation with an all-true predicate to avoid predicate
+      // generation overhead.
+      "ptrue    p1.b                                    \n"
+      "1:                                               \n" READYUV422_SVE_2X
+          I422TORGB_SVE_2X RGBTOARGB8_SVE_TOP_2X
+      "subs     %w[width], %w[width], %w[vl]            \n"  //
+      RGB8TOARGB1555_SVE_FROM_TOP_2X
+      "st2h     {z0.h, z1.h}, p1, [%[dst]] \n"
+      "incb     %[dst], all, mul #2                     \n"
+      "b.ge     1b                                      \n"
+
+      "2:                                               \n"
+      "adds     %w[width], %w[width], %w[vl]            \n"
+      "b.eq     99f                                     \n"
+
+      // Calculate a predicate for the final iteration to deal with the tail.
+      "cnth     %[vl]                                   \n"
+      "whilelt  p1.b, wzr, %w[width]                    \n" READYUV422_SVE_2X
+          I422TORGB_SVE_2X RGBTOARGB8_SVE_TOP_2X RGB8TOARGB1555_SVE_FROM_TOP_2X
+      "st2h     {z0.h, z1.h}, p1, [%[dst]] \n"
+
+      "99:                                              \n"
+      : [src_y] "+r"(src_y),                               // %[src_y]
+        [src_u] "+r"(src_u),                               // %[src_u]
+        [src_v] "+r"(src_v),                               // %[src_v]
+        [dst] "+r"(dst_argb1555),                          // %[dst]
+        [width] "+r"(width),                               // %[width]
+        [vl] "=&r"(vl)                                     // %[vl]
+      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
+        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
+      : "cc", "memory", YUVTORGB_SVE_REGS);
+}
+
 void I422ToRGBARow_SVE2(const uint8_t* src_y,
                         const uint8_t* src_u,
                         const uint8_t* src_v,