From f27b983f382be8d49b1d473562918820aa124ed1 Mon Sep 17 00:00:00 2001
From: George Steed
Date: Sun, 5 May 2024 20:51:51 +0100
Subject: [PATCH] [AArch64] Add SVE2 implementation of DivideRow_16

SVE contains the UMULH instruction, which allows us to multiply and take
the high half of the result in a single instruction rather than needing
separate widening-multiply and narrowing-shift steps.

Observed reduction in runtime compared to the existing Neon code:

Cortex-A510: -21.2%
Cortex-A520: -20.9%
Cortex-A715: -47.9%
Cortex-A720: -47.6%
Cortex-X2:   -5.2%
Cortex-X3:   -2.6%
Cortex-X4:   -32.4%
Cortex-X925: -1.5%

Bug: b/42280942
Change-Id: I25154699b17772db1fb5cb84c049919181d86f4b
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5975318
Reviewed-by: Justin Green
Reviewed-by: Frank Barchard
---
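
For context, the kernel computes per lane what the scalar loop below does:
UMULH of two unsigned .h elements yields exactly (a * b) >> 16, so the
multiply and the shift collapse into one instruction. The second helper is
a sketch of the two-step widening-multiply / narrowing-shift sequence that
Neon needs for the same result (the in-tree Neon kernel is hand-written
assembly, not intrinsics). Both function names below are illustrative
only, not from the libyuv tree.

#include <arm_neon.h>
#include <stdint.h>

// Scalar reference: dst[x] = (src[x] * scale) >> 16, with scale truncated
// to 16 bits to match "dup z0.h, %w[scale]" in the SVE2 kernel.
static void divide_row_16_ref(const uint16_t* src_y,
                              uint16_t* dst_y,
                              int scale,
                              int width) {
  for (int x = 0; x < width; ++x) {
    dst_y[x] = (uint16_t)(((uint32_t)src_y[x] * (uint16_t)scale) >> 16);
  }
}

// Neon contrast: a widening multiply (16x16 -> 32) plus a narrowing shift
// per four lanes, versus a single UMULH per vector in SVE2.
static uint16x4_t divide4_neon(uint16x4_t v, uint16x4_t vscale) {
  uint32x4_t wide = vmull_u16(v, vscale);  // widening multiply
  return vshrn_n_u32(wide, 16);            // take high half, narrow to u16
}
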
+ "whilelt p0.h, wzr, %w[width] \n" + "whilelt p1.h, %w[vl], %w[width] \n" + "ld1h {z1.h}, p0/z, [%[src]] \n" + "ld1h {z2.h}, p1/z, [%[src], #1, mul vl] \n" + "umulh z1.h, z1.h, z0.h \n" + "umulh z2.h, z2.h, z0.h \n" + "st1h {z1.h}, p0, [%[dst]] \n" + "st1h {z2.h}, p1, [%[dst], #1, mul vl] \n" + + "99: \n" + : [src] "+r"(src_y), // %[src] + [dst] "+r"(dst_y), // %[dst] + [width] "+r"(width), // %[width] + [vl] "=&r"(vl) // %[vl] + : [scale] "r"(scale) // %[scale] + : "cc", "memory", "z0", "z1", "z2", "p0", "p1"); +} + #endif // !defined(LIBYUV_DISABLE_SVE) && defined(__aarch64__) #ifdef __cplusplus