From 5c12e0b2de33e9a3031526c1f392cc0d11d49f5f Mon Sep 17 00:00:00 2001
From: George Steed
Date: Tue, 7 May 2024 13:26:07 +0100
Subject: [PATCH] [AArch64] Add SVE2 implementations of HalfFloat{,1}Row

For HalfFloat1Row, SVE has direct 16-bit integer to half-float
conversion instructions, so there is no need to widen to 32 bits.

For HalfFloatRow, SVE zero-extending loads avoid the need for separate
UXTL(2) instructions.

Observed reductions in runtime compared to the existing Neon code:

            | HalfFloat1Row | HalfFloatRow
Cortex-A510 |        -38.3% |       -17.3%
Cortex-A520 |        -37.6% |       -18.8%
Cortex-A720 |        -50.1% |        -7.8%
Cortex-X2   |        -50.2% |        -0.4%
Cortex-X4   |        -51.5% |       -12.5%

Bug: b/42280942
Change-Id: I445071ccd453113144ce42d465ba03c9ee89ec9e
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5975319
Reviewed-by: Justin Green
Reviewed-by: Frank Barchard
---
 include/libyuv/row.h       |   9 +++
 source/planar_functions.cc |   5 ++
 source/row_sve.cc          | 119 +++++++++++++++++++++++++++++++++++++
 3 files changed, 133 insertions(+)

diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index da7bab8f3..ccb849862 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -549,6 +549,7 @@ extern "C" {
 #define HAS_AYUVTOVUROW_SVE2
 #define HAS_BGRATOUVROW_SVE2
 #define HAS_DIVIDEROW_16_SVE2
+#define HAS_HALFFLOATROW_SVE2
 #define HAS_I400TOARGBROW_SVE2
 #define HAS_I422ALPHATOARGBROW_SVE2
 #define HAS_I422TOARGB1555ROW_SVE2
@@ -6609,6 +6610,10 @@ void HalfFloatRow_Any_NEON(const uint16_t* src_ptr,
                            uint16_t* dst_ptr,
                            float param,
                            int width);
+void HalfFloatRow_SVE2(const uint16_t* src,
+                       uint16_t* dst,
+                       float scale,
+                       int width);
 void HalfFloat1Row_NEON(const uint16_t* src,
                         uint16_t* dst,
                         float scale,
@@ -6617,6 +6622,10 @@ void HalfFloat1Row_Any_NEON(const uint16_t* src_ptr,
                             uint16_t* dst_ptr,
                             float param,
                             int width);
+void HalfFloat1Row_SVE2(const uint16_t* src,
+                        uint16_t* dst,
+                        float scale,
+                        int width);
 void HalfFloatRow_MSA(const uint16_t* src,
                       uint16_t* dst,
                       float scale,
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index e13626f81..7e81b5cc2 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -5195,6 +5195,11 @@ int HalfFloatPlane(const uint16_t* src_y,
     }
   }
 #endif
+#if defined(HAS_HALFFLOATROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    HalfFloatRow = scale == 1.0f ? HalfFloat1Row_SVE2 : HalfFloatRow_SVE2;
+  }
+#endif
 #if defined(HAS_HALFFLOATROW_MSA)
   if (TestCpuFlag(kCpuHasMSA)) {
     HalfFloatRow = HalfFloatRow_Any_MSA;
diff --git a/source/row_sve.cc b/source/row_sve.cc
index b70824444..4920fef3d 100644
--- a/source/row_sve.cc
+++ b/source/row_sve.cc
@@ -1670,6 +1670,125 @@ void DivideRow_16_SVE2(const uint16_t* src_y,
       : "cc", "memory", "z0", "z1", "z2", "p0", "p1");
 }
 
+#define HALFFLOAT_SVE                  \
+  "scvtf z0.s, p0/m, z0.s \n"          \
+  "scvtf z1.s, p0/m, z1.s \n"          \
+  "scvtf z2.s, p0/m, z2.s \n"          \
+  "scvtf z3.s, p0/m, z3.s \n"          \
+  "fmul z0.s, z0.s, z4.s \n"           \
+  "fmul z1.s, z1.s, z4.s \n"           \
+  "fmul z2.s, z2.s, z4.s \n"           \
+  "fmul z3.s, z3.s, z4.s \n"           \
+  "uqshrnb z0.h, z0.s, #13 \n"         \
+  "uqshrnb z1.h, z1.s, #13 \n"         \
+  "uqshrnb z2.h, z2.s, #13 \n"         \
+  "uqshrnb z3.h, z3.s, #13 \n"
+
+void HalfFloatRow_SVE2(const uint16_t* src,
+                       uint16_t* dst,
+                       float scale,
+                       int width) {
+  uint64_t vl;
+  asm("cntw %x0" : "=r"(vl));
+  asm volatile(
+      "mov z4.s, %s[scale] \n"
+      "subs %w[width], %w[width], %w[vl], lsl #2 \n"
+      "b.lt 2f \n"
+
+      // Run bulk of computation with all-true predicates to avoid predicate
+      // generation overhead.
+ "ptrue p0.s \n" + "1: \n" + "ld1h {z0.s}, p0/z, [%[src]] \n" + "ld1h {z1.s}, p0/z, [%[src], #1, mul vl] \n" + "ld1h {z2.s}, p0/z, [%[src], #2, mul vl] \n" + "ld1h {z3.s}, p0/z, [%[src], #3, mul vl] \n" + "incb %[src], all, mul #2 \n" HALFFLOAT_SVE + "subs %w[width], %w[width], %w[vl], lsl #2 \n" + "st1h {z0.s}, p0, [%[dst]] \n" + "st1h {z1.s}, p0, [%[dst], #1, mul vl] \n" + "st1h {z2.s}, p0, [%[dst], #2, mul vl] \n" + "st1h {z3.s}, p0, [%[dst], #3, mul vl] \n" + "incb %[dst], all, mul #2 \n" + "b.ge 1b \n" + + "2: \n" + "adds %w[width], %w[width], %w[vl], lsl #2 \n" + "b.eq 99f \n" + + // Calculate predicates for the final iteration to deal with the tail. + "whilelt p0.s, wzr, %w[width] \n" + "whilelt p1.s, %w[vl], %w[width] \n" + "whilelt p2.s, %w[vl2], %w[width] \n" + "whilelt p3.s, %w[vl3], %w[width] \n" + "ld1h {z0.s}, p0/z, [%[src]] \n" + "ld1h {z1.s}, p1/z, [%[src], #1, mul vl] \n" + "ld1h {z2.s}, p2/z, [%[src], #2, mul vl] \n" + "ld1h {z3.s}, p3/z, [%[src], #3, mul vl] \n" HALFFLOAT_SVE + "st1h {z0.s}, p0, [%[dst]] \n" + "st1h {z1.s}, p1, [%[dst], #1, mul vl] \n" + "st1h {z2.s}, p2, [%[dst], #2, mul vl] \n" + "st1h {z3.s}, p3, [%[dst], #3, mul vl] \n" + + "99: \n" + : [src] "+r"(src), // %[src] + [dst] "+r"(dst), // %[dst] + [width] "+r"(width) // %[width] + : [vl] "r"(vl), // %[vl] + [vl2] "r"(vl * 2), // %[vl2] + [vl3] "r"(vl * 3), // %[vl3] + [scale] "w"(scale * 1.9259299444e-34f) // %[scale] + : "cc", "memory", "z0", "z1", "z2", "z3", "z4", "p0", "p1", "p2", "p3"); +} + +void HalfFloat1Row_SVE2(const uint16_t* src, + uint16_t* dst, + float scale, + int width) { + uint64_t vl; + asm volatile( + "cnth %x[vl] \n" + "subs %w[width], %w[width], %w[vl], lsl #1 \n" + "b.lt 2f \n" + + // Run bulk of computation with all-true predicates to avoid predicate + // generation overhead. + "ptrue p0.h \n" + "1: \n" + "ld1h {z0.h}, p0/z, [%[src]] \n" + "ld1h {z1.h}, p0/z, [%[src], #1, mul vl] \n" + "incb %[src], all, mul #2 \n" + "ucvtf z0.h, p0/m, z0.h \n" + "ucvtf z1.h, p0/m, z1.h \n" + "subs %w[width], %w[width], %w[vl], lsl #1 \n" + "st1h {z0.h}, p0, [%[dst]] \n" + "st1h {z1.h}, p0, [%[dst], #1, mul vl] \n" + "incb %[dst], all, mul #2 \n" + "b.ge 1b \n" + + "2: \n" + "adds %w[width], %w[width], %w[vl], lsl #1 \n" + "b.eq 99f \n" + + // Calculate predicates for the final iteration to deal with the tail. + "whilelt p0.h, wzr, %w[width] \n" + "whilelt p1.h, %w[vl], %w[width] \n" + "ld1h {z0.h}, p0/z, [%[src]] \n" + "ld1h {z1.h}, p1/z, [%[src], #1, mul vl] \n" + "ucvtf z0.h, p0/m, z0.h \n" + "ucvtf z1.h, p0/m, z1.h \n" + "st1h {z0.h}, p0, [%[dst]] \n" + "st1h {z1.h}, p1, [%[dst], #1, mul vl] \n" + + "99: \n" + : [src] "+r"(src), // %[src] + [dst] "+r"(dst), // %[dst] + [width] "+r"(width), // %[width] + [vl] "=&r"(vl) // %[vl] + : + : "cc", "memory", "z0", "z1", "p0", "p1"); +} + #endif // !defined(LIBYUV_DISABLE_SVE) && defined(__aarch64__) #ifdef __cplusplus