From a758a15dbf80565a69518d99f1fa8bea0acdc58d Mon Sep 17 00:00:00 2001
From: George Steed <george.steed@arm.com>
Date: Wed, 10 Apr 2024 15:46:42 +0100
Subject: [PATCH] [AArch64] Add I8MM implementation of ARGBColorMatrixRow

We cannot use the standard dot-product instructions since the matrix of
coefficients is signed, but I8MM supports mixed-sign dot products, which
work well here.

Reduction in runtimes observed compared to the previous Neon
implementation:

Cortex-A510: -50.8%
Cortex-A520: -33.3%
Cortex-A715: -38.6%
Cortex-A720: -38.5%
  Cortex-X2: -43.2%
  Cortex-X3: -40.0%
  Cortex-X4: -55.0%

Change-Id: Ia4fe486faf8f43d0b837ad21bb37e2159f3bdb77
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5621577
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
---
 include/libyuv/row.h       |  6 ++++
 source/planar_functions.cc |  5 ++++
 source/row_neon64.cc       | 56 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 67 insertions(+)

diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 5625a9f25..d51e242b0 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -569,6 +569,8 @@ extern "C" {
 #define HAS_BGRATOYROW_NEON_DOTPROD
 #define HAS_RGBATOYJROW_NEON_DOTPROD
 #define HAS_RGBATOYROW_NEON_DOTPROD
+
+#define HAS_ARGBCOLORMATRIXROW_NEON_I8MM
 #endif
 
 // The following are available on AArch64 SVE platforms:
@@ -6022,6 +6024,10 @@ void ARGBColorMatrixRow_NEON(const uint8_t* src_argb,
                              uint8_t* dst_argb,
                              const int8_t* matrix_argb,
                              int width);
+void ARGBColorMatrixRow_NEON_I8MM(const uint8_t* src_argb,
+                                  uint8_t* dst_argb,
+                                  const int8_t* matrix_argb,
+                                  int width);  // 8 pixels per loop iteration.
 void ARGBColorMatrixRow_MSA(const uint8_t* src_argb,
                             uint8_t* dst_argb,
                             const int8_t* matrix_argb,
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index bbb7d44cb..789978472 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -3907,6 +3907,11 @@ int ARGBColorMatrix(const uint8_t* src_argb,
     ARGBColorMatrixRow = ARGBColorMatrixRow_NEON;
   }
 #endif
+#if defined(HAS_ARGBCOLORMATRIXROW_NEON_I8MM)
+  if (TestCpuFlag(kCpuHasNeonI8MM) && IS_ALIGNED(width, 8)) {
+    ARGBColorMatrixRow = ARGBColorMatrixRow_NEON_I8MM;
+  }
+#endif
 #if defined(HAS_ARGBCOLORMATRIXROW_MSA)
   if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) {
     ARGBColorMatrixRow = ARGBColorMatrixRow_MSA;
diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index 5afae14f9..372b1efc2 100644
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -4039,6 +4039,62 @@ void ARGBColorMatrixRow_NEON(const uint8_t* src_argb,
         "v17", "v18", "v19", "v22", "v23", "v24", "v25");
 }
 
+void ARGBColorMatrixRow_NEON_I8MM(const uint8_t* src_argb,
+                                  uint8_t* dst_argb,
+                                  const int8_t* matrix_argb,
+                                  int width) {
+  // Per pixel: dst[j] = clamp((matrix_argb[4 * j + 0..3] . src[0..3]) >> 6).
+  asm("ld1        {v31.16b}, [%[matrix_argb]]           \n"
+      "1:                                               \n"
+      "ld1        {v0.16b, v1.16b}, [%[src_argb]], #32  \n"
+      // Zero the per-pixel accumulators v16-v23; sudot adds into them.
+      "movi       v16.4s, #0                            \n"
+      "movi       v17.4s, #0                            \n"
+      "movi       v18.4s, #0                            \n"
+      "movi       v19.4s, #0                            \n"
+      "movi       v20.4s, #0                            \n"
+      "movi       v21.4s, #0                            \n"
+      "movi       v22.4s, #0                            \n"
+      "movi       v23.4s, #0                            \n"
+      // 8 pixels per pass, and the loop always runs at least once. Only b.gt
+      // reads the flags set here, so subs can sit well ahead of the branch.
+      "subs       %w[width], %w[width], #8              \n"
+      "prfm       pldl1keep, [%[src_argb], 448]         \n"
+      // Lane j = signed matrix row j . unsigned bytes of one pixel.
+      "sudot      v16.4s, v31.16b, v0.4b[0]             \n"
+      "sudot      v17.4s, v31.16b, v0.4b[1]             \n"
+      "sudot      v18.4s, v31.16b, v0.4b[2]             \n"
+      "sudot      v19.4s, v31.16b, v0.4b[3]             \n"
+      "sudot      v20.4s, v31.16b, v1.4b[0]             \n"
+      "sudot      v21.4s, v31.16b, v1.4b[1]             \n"
+      "sudot      v22.4s, v31.16b, v1.4b[2]             \n"
+      "sudot      v23.4s, v31.16b, v1.4b[3]             \n"
+      // Shift right by 6 and narrow to 16 bits, two pixels per register.
+      "shrn       v16.4h, v16.4s, #6                    \n"
+      "shrn       v18.4h, v18.4s, #6                    \n"
+      "shrn       v20.4h, v20.4s, #6                    \n"
+      "shrn       v22.4h, v22.4s, #6                    \n"
+      "shrn2      v16.8h, v17.4s, #6                    \n"
+      "shrn2      v18.8h, v19.4s, #6                    \n"
+      "shrn2      v20.8h, v21.4s, #6                    \n"
+      "shrn2      v22.8h, v23.4s, #6                    \n"
+      // sqxtun treats lanes as signed: negative sums clamp to 0, large to 255.
+      "sqxtun     v16.8b, v16.8h                        \n"
+      "sqxtun     v18.8b, v18.8h                        \n"
+      "sqxtun     v20.8b, v20.8h                        \n"
+      "sqxtun     v22.8b, v22.8h                        \n"
+      // d16/d18/d20/d22 each hold two finished pixels (8 bytes).
+      "stp        d16, d18, [%[dst_argb]], #16          \n"
+      "stp        d20, d22, [%[dst_argb]], #16          \n"
+      "b.gt       1b                                    \n"
+      : [src_argb] "+r"(src_argb),      // %[src_argb]
+        [dst_argb] "+r"(dst_argb),      // %[dst_argb]
+        [width] "+r"(width)             // %[width]
+      : [matrix_argb] "r"(matrix_argb)  // %[matrix_argb]
+      : "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
+        "v22", "v23", "v31");
+}
+
 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
 void ARGBMultiplyRow_NEON(const uint8_t* src_argb,
                           const uint8_t* src_argb1,