[AArch64] Add I8MM implementation of ARGBColorMatrixRow

We cannot use the standard dot-product instructions since the matrix of coefficients are signed, but I8MM supports mixed-sign products which work well here. Reduction in runtimes observed compared to the previous Neon implementation: Cortex-A510: -50.8% Cortex-A520: -33.3% Cortex-A715: -38.6% Cortex-A720: -38.5% Cortex-X2: -43.2% Cortex-X3: -40.0% Cortex-X4: -55.0% Change-Id: Ia4fe486faf8f43d0b837ad21bb37e2159f3bdb77 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5621577 Reviewed-by: Frank Barchard <fbarchard@chromium.org>
2026-01-01 03:12:16 +08:00 · 2024-04-10 15:46:42 +01:00 · 2024-04-10 15:46:42 +01:00 · a758a15dbf
commit a758a15dbf
parent 89cf221baa
3 changed files with 67 additions and 0 deletions
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@ -569,6 +569,8 @@ extern "C" {
 #define HAS_BGRATOYROW_NEON_DOTPROD
 #define HAS_RGBATOYJROW_NEON_DOTPROD
 #define HAS_RGBATOYROW_NEON_DOTPROD
+
+#define HAS_ARGBCOLORMATRIXROW_NEON_I8MM
 #endif

 // The following are available on AArch64 SVE platforms:
@ -6022,6 +6024,10 @@ void ARGBColorMatrixRow_NEON(const uint8_t* src_argb,
                             uint8_t* dst_argb,
                             const int8_t* matrix_argb,
                             int width);
+void ARGBColorMatrixRow_NEON_I8MM(const uint8_t* src_argb,
+                                  uint8_t* dst_argb,
+                                  const int8_t* matrix_argb,
+                                  int width);
 void ARGBColorMatrixRow_MSA(const uint8_t* src_argb,
                            uint8_t* dst_argb,
                            const int8_t* matrix_argb,
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@ -3907,6 +3907,11 @@ int ARGBColorMatrix(const uint8_t* src_argb,
    ARGBColorMatrixRow = ARGBColorMatrixRow_NEON;
  }
 #endif
+#if defined(HAS_ARGBCOLORMATRIXROW_NEON_I8MM)
+  if (TestCpuFlag(kCpuHasNeonI8MM) && IS_ALIGNED(width, 8)) {
+    ARGBColorMatrixRow = ARGBColorMatrixRow_NEON_I8MM;
+  }
+#endif
 #if defined(HAS_ARGBCOLORMATRIXROW_MSA)
  if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) {
    ARGBColorMatrixRow = ARGBColorMatrixRow_MSA;
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@ -4039,6 +4039,62 @@ void ARGBColorMatrixRow_NEON(const uint8_t* src_argb,
        "v17", "v18", "v19", "v22", "v23", "v24", "v25");
 }

+// Applies a 4x4 signed coefficient matrix to a row of 4-byte pixels using the
+// I8MM mixed-sign dot product (SUDOT).
+//
+// src_argb:    input pixels, 4 unsigned bytes each; 32 bytes read per loop.
+// dst_argb:    output pixels, 4 unsigned bytes each; 32 bytes written per loop.
+// matrix_argb: 16 int8_t coefficients, read once as four groups of four.
+// width:       pixel count; decremented by 8 per iteration.
+//
+// For every pixel, output byte j is the dot product of coefficient group j
+// with the pixel's four input bytes, shifted right by 6 and clamped to
+// [0, 255]. The loop body always executes at least once and handles whole
+// groups of 8 pixels only, so callers are expected to pass a positive width
+// that is a multiple of 8.
+void ARGBColorMatrixRow_NEON_I8MM(const uint8_t* src_argb,
+                                  uint8_t* dst_argb,
+                                  const int8_t* matrix_argb,
+                                  int width) {
+  asm(
+      // v31 = all 16 signed matrix coefficients (loop invariant).
+      "ld1        {v31.16b}, [%[matrix_argb]]           \n"
+
+      "1:                                               \n"
+      // v0 = pixels 0-3, v1 = pixels 4-7.
+      "ld1        {v0.16b, v1.16b}, [%[src_argb]], #32  \n"
+
+      // SUDOT accumulates into its destination, so clear one 4x32-bit
+      // accumulator per pixel first.
+      "movi       v16.4s, #0                            \n"
+      "movi       v17.4s, #0                            \n"
+      "movi       v18.4s, #0                            \n"
+      "movi       v19.4s, #0                            \n"
+      "movi       v20.4s, #0                            \n"
+      "movi       v21.4s, #0                            \n"
+      "movi       v22.4s, #0                            \n"
+      "movi       v23.4s, #0                            \n"
+
+      // 8 processed per loop.
+      "subs       %w[width], %w[width], #8              \n"
+      "prfm       pldl1keep, [%[src_argb], 448]         \n"
+
+      // Lane j of each accumulator = signed coefficient group j dotted with
+      // the four unsigned bytes of the selected pixel.
+      "sudot      v16.4s, v31.16b, v0.4b[0]             \n"
+      "sudot      v17.4s, v31.16b, v0.4b[1]             \n"
+      "sudot      v18.4s, v31.16b, v0.4b[2]             \n"
+      "sudot      v19.4s, v31.16b, v0.4b[3]             \n"
+      "sudot      v20.4s, v31.16b, v1.4b[0]             \n"
+      "sudot      v21.4s, v31.16b, v1.4b[1]             \n"
+      "sudot      v22.4s, v31.16b, v1.4b[2]             \n"
+      "sudot      v23.4s, v31.16b, v1.4b[3]             \n"
+
+      // Shift right by 6 and narrow to 16 bits, packing two pixels per
+      // register (even pixel in the low half, odd pixel in the high half).
+      "shrn       v16.4h, v16.4s, #6                    \n"
+      "shrn       v18.4h, v18.4s, #6                    \n"
+      "shrn       v20.4h, v20.4s, #6                    \n"
+      "shrn       v22.4h, v22.4s, #6                    \n"
+      "shrn2      v16.8h, v17.4s, #6                    \n"
+      "shrn2      v18.8h, v19.4s, #6                    \n"
+      "shrn2      v20.8h, v21.4s, #6                    \n"
+      "shrn2      v22.8h, v23.4s, #6                    \n"
+
+      // The sums are signed, so narrow with signed-to-unsigned saturation:
+      // negative results clamp to 0 and results above 255 clamp to 255. An
+      // unsigned narrow (uqxtn) would turn every negative result into 255.
+      "sqxtun     v16.8b, v16.8h                        \n"
+      "sqxtun     v18.8b, v18.8h                        \n"
+      "sqxtun     v20.8b, v20.8h                        \n"
+      "sqxtun     v22.8b, v22.8h                        \n"
+
+      // Store 8 pixels: two per D register.
+      "stp        d16, d18, [%[dst_argb]], #16          \n"
+      "stp        d20, d22, [%[dst_argb]], #16          \n"
+      "b.gt       1b                                    \n"
+      : [src_argb] "+r"(src_argb),      // %[src_argb]
+        [dst_argb] "+r"(dst_argb),      // %[dst_argb]
+        [width] "+r"(width)             // %[width]
+      : [matrix_argb] "r"(matrix_argb)  // %[matrix_argb]
+      : "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
+        "v22", "v23", "v31");
+}
+
 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
 void ARGBMultiplyRow_NEON(const uint8_t* src_argb,
                          const uint8_t* src_argb1,