[AArch64] Add Neon dot-product implementation for ARGBSepiaRow

We can use the dot product instructions to apply the coefficients directly without the need for LD4 de-interleaving load instructions, since these are known to be slow on some micro-architectures. ST4 is also known to be slow on more modern micro-architectures, however avoiding this is left for a future SVE implementation where we can make use of interleaving-narrowing instructions. Reduction in cycle counts observed compared to existing Neon code: Cortex-A55: -5.8% Cortex-A510: -18.9% Cortex-A76: -21.8% Cortex-A720: -30.2% Cortex-X1: -28.6% Cortex-X2: -23.4% Bug: b/42280946 Change-Id: I5887559649cc805a810d867b652c85d48285657d Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5790970 Reviewed-by: Justin Green <greenjustin@google.com> Reviewed-by: Frank Barchard <fbarchard@chromium.org>
2026-01-01 03:12:16 +08:00 · 2024-04-18 14:01:21 +01:00 · 2024-04-18 14:01:21 +01:00 · 432d186116
commit 432d186116
parent 1c31461771
3 changed files with 48 additions and 0 deletions
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@ -578,6 +578,7 @@ extern "C" {
 #define HAS_RGBATOYJROW_NEON_DOTPROD
 #define HAS_RGBATOYROW_NEON_DOTPROD
 #define HAS_ARGBGRAYROW_NEON_DOTPROD
+#define HAS_ARGBSEPIAROW_NEON_DOTPROD

 #define HAS_ARGBCOLORMATRIXROW_NEON_I8MM
 #define HAS_ARGBTOUV444ROW_NEON_I8MM
@ -6162,6 +6163,7 @@ void ARGBGrayRow_LASX(const uint8_t* src_argb, uint8_t* dst_argb, int width);
 void ARGBSepiaRow_C(uint8_t* dst_argb, int width);
 void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width);
 void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width);
+void ARGBSepiaRow_NEON_DotProd(uint8_t* dst_argb, int width);
 void ARGBSepiaRow_MSA(uint8_t* dst_argb, int width);
 void ARGBSepiaRow_LSX(uint8_t* dst_argb, int width);
 void ARGBSepiaRow_LASX(uint8_t* dst_argb, int width);
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@ -3869,6 +3869,11 @@ int ARGBSepia(uint8_t* dst_argb,
    ARGBSepiaRow = ARGBSepiaRow_NEON;
  }
 #endif
+#if defined(HAS_ARGBSEPIAROW_NEON_DOTPROD)
+  if (TestCpuFlag(kCpuHasNeonDotProd) && IS_ALIGNED(width, 8)) {
+    ARGBSepiaRow = ARGBSepiaRow_NEON_DotProd;
+  }
+#endif
 #if defined(HAS_ARGBSEPIAROW_MSA)
  if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) {
    ARGBSepiaRow = ARGBSepiaRow_MSA;
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@ -4265,6 +4265,47 @@ void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) {
        "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30");
 }

+// Sepia weights consumed by ARGBSepiaRow_NEON_DotProd, laid out as three
+// 4-byte groups. The kernel broadcasts each group across a vector (ld3r) and
+// dots it against every 4-byte pixel; one group produces one output channel.
+// The fourth weight of each group is 0, so byte 3 of a pixel never contributes
+// to the weighted sums.
+static const uvec8 kARGBSepiaRowCoeffs = {17, 68, 35, 0,  22, 88,
+                                          45, 0,  24, 98, 50, 0};
+// TBL indices selecting byte 3 of each of the 8 pixels held in a 32-byte
+// register pair, so that channel (alpha, per the name) can be stored back
+// unmodified.
+static const uvec8 kARGBSepiaRowAlphaIndices = {3, 7, 11, 15, 19, 23, 27, 31};
+
+// Applies the sepia weights in kARGBSepiaRowCoeffs to a row of ARGB pixels in
+// place, using the Neon dot-product (UDOT) instruction instead of an LD4
+// de-interleaving load.
+//
+// dst_argb: pixels to convert; read and then overwritten, 4 bytes per pixel.
+// width:    pixel count. Each loop pass handles 8 pixels (32 bytes) and the
+//           body runs before the count is tested, so this must be a positive
+//           multiple of 8.
+//
+// Byte 3 of every pixel (selected by kARGBSepiaRowAlphaIndices) is copied
+// through unchanged. Each of the other three channels becomes
+// (weighted sum of the pixel's bytes) >> 7, saturated to 8 bits.
+void ARGBSepiaRow_NEON_DotProd(uint8_t* dst_argb, int width) {
+  asm volatile(
+      // Broadcast one 4-byte weight group to every lane of v20, v21 and v22.
+      "ld3r        {v20.4s, v21.4s, v22.4s}, [%[coeffs]] \n"
+      // 8 TBL indices in the low half of v23; ldr d zeroes the high half.
+      "ldr         d23, [%[indices]]            \n"
+      "1:                                       \n"
+      // Load 8 pixels; the pointer only advances at the st4 below.
+      "ldp         q0, q1, [%[dst]]             \n"
+      // UDOT accumulates, so the six accumulators start from zero each pass.
+      "movi        v2.4s, #0                    \n"
+      "movi        v3.4s, #0                    \n"
+      "movi        v4.4s, #0                    \n"
+      "movi        v5.4s, #0                    \n"
+      "movi        v6.4s, #0                    \n"
+      "movi        v7.4s, #0                    \n"
+      // One 32-bit weighted sum per pixel, per output channel.
+      "udot        v2.4s, v0.16b, v20.16b       \n"
+      "udot        v3.4s, v1.16b, v20.16b       \n"
+      "udot        v4.4s, v0.16b, v21.16b       \n"
+      "udot        v5.4s, v1.16b, v21.16b       \n"
+      "udot        v6.4s, v0.16b, v22.16b       \n"
+      "udot        v7.4s, v1.16b, v22.16b       \n"
+      // Flags set here are consumed by b.gt; nothing in between alters NZCV.
+      "subs        %w[width], %w[width], #8     \n"
+      "prfm        pldl1keep, [%[dst], 448]     \n"
+      // Keep the low 16 bits of each 32-bit sum (8 pixels per register).
+      "uzp1        v6.8h, v6.8h, v7.8h          \n"
+      "uzp1        v5.8h, v4.8h, v5.8h          \n"
+      "uzp1        v4.8h, v2.8h, v3.8h          \n"
+      // Gather the untouched byte 3 of each pixel before v0/v1 are reused.
+      "tbl         v3.16b, {v0.16b, v1.16b}, v23.16b \n"
+      // Scale by >> 7 and saturate to 8 bits.
+      "uqshrn      v0.8b, v4.8h, #7             \n"
+      "uqshrn      v1.8b, v5.8h, #7             \n"
+      "uqshrn      v2.8b, v6.8h, #7             \n"
+      // Re-interleave the four channels and advance to the next 8 pixels.
+      "st4         {v0.8b, v1.8b, v2.8b, v3.8b}, [%[dst]], #32 \n"
+      "b.gt        1b                           \n"
+      : [dst] "+r"(dst_argb),                      // %[dst]
+        [width] "+r"(width)                        // %[width]
+      : [coeffs] "r"(&kARGBSepiaRowCoeffs),        // %[coeffs]
+        [indices] "r"(&kARGBSepiaRowAlphaIndices)  // %[indices]
+      // Every vector register written above must be listed, including v23.
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+        "v21", "v22", "v23");
+}
+
 // Transform 8 ARGB pixels (32 bytes) with color matrix.
 // TODO(fbarchard): Was same as Sepia except matrix is provided.  This function
 // needs to saturate.  Consider doing a non-saturating version.