[AArch64] Add Neon dot-product implementation for ARGBGrayRow

We can use dot product instructions to apply the coefficients without
needing to use LD4 deinterleaving load instructions, and then TBL to mix
in the original alpha component. This is significantly faster on some
micro-architectures where LD4 instructions are known to be slow compared
to normal loads.

Reduction in cycle counts observed compared to existing Neon code:

 Cortex-A55: -12.6%
Cortex-A510: -48.6%
 Cortex-A76: -39.7%
Cortex-A720: -52.3%
  Cortex-X1: -63.5%
  Cortex-X2: -67.0%

Bug: b/42280946
Change-Id: I3641785e74873438acc00d675f5bc490dfa95b50
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5785972
Reviewed-by: Justin Green <greenjustin@google.com>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
George Steed 2024-04-18 10:40:41 +01:00 committed by Frank Barchard
parent 2d62d8d22a
commit 1c31461771
3 changed files with 46 additions and 0 deletions

View File

@ -577,6 +577,7 @@ extern "C" {
#define HAS_BGRATOYROW_NEON_DOTPROD
#define HAS_RGBATOYJROW_NEON_DOTPROD
#define HAS_RGBATOYROW_NEON_DOTPROD
#define HAS_ARGBGRAYROW_NEON_DOTPROD
#define HAS_ARGBCOLORMATRIXROW_NEON_I8MM
#define HAS_ARGBTOUV444ROW_NEON_I8MM
@ -6151,6 +6152,9 @@ void ARGBUnattenuateRow_Any_AVX2(const uint8_t* src_ptr,
void ARGBGrayRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width);
void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width);
void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width);
void ARGBGrayRow_NEON_DotProd(const uint8_t* src_argb,
uint8_t* dst_argb,
int width);
void ARGBGrayRow_MSA(const uint8_t* src_argb, uint8_t* dst_argb, int width);
void ARGBGrayRow_LSX(const uint8_t* src_argb, uint8_t* dst_argb, int width);
void ARGBGrayRow_LASX(const uint8_t* src_argb, uint8_t* dst_argb, int width);

View File

@ -3751,6 +3751,11 @@ int ARGBGrayTo(const uint8_t* src_argb,
ARGBGrayRow = ARGBGrayRow_NEON;
}
#endif
#if defined(HAS_ARGBGRAYROW_NEON_DOTPROD)
if (TestCpuFlag(kCpuHasNeonDotProd) && IS_ALIGNED(width, 8)) {
ARGBGrayRow = ARGBGrayRow_NEON_DotProd;
}
#endif
#if defined(HAS_ARGBGRAYROW_MSA)
if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) {
ARGBGrayRow = ARGBGrayRow_MSA;
@ -3806,6 +3811,11 @@ int ARGBGray(uint8_t* dst_argb,
ARGBGrayRow = ARGBGrayRow_NEON;
}
#endif
#if defined(HAS_ARGBGRAYROW_NEON_DOTPROD)
if (TestCpuFlag(kCpuHasNeonDotProd) && IS_ALIGNED(width, 8)) {
ARGBGrayRow = ARGBGrayRow_NEON_DotProd;
}
#endif
#if defined(HAS_ARGBGRAYROW_MSA)
if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) {
ARGBGrayRow = ARGBGrayRow_MSA;

View File

@ -4192,6 +4192,38 @@ void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26");
}
// Per-byte dot-product weights, one set per 32-bit (one-pixel) lane:
// gray = (29*b0 + 150*b1 + 77*b2 + 0*b3 + 128) >> 8  (rounding via UQRSHRN).
// Weights sum to 256, so the result stays in [0, 255]. The byte order
// presumably matches libyuv's in-memory ARGB layout (B, G, R, A) with the
// BT.601-style luma weights on B/G/R and 0 on alpha — confirm against
// ARGBGrayRow_C.
static const uvec8 kARGBGrayRowCoeffs = {29, 150, 77, 0};
// TBL indices over the register pair {narrowed gray, original pixels}:
// indices 0-15 address the first register, 16-31 the second. 0/2/4/6 pick
// the gray bytes (the dot-product results land in even bytes after the
// .8h narrowing below); 19/23/27/31 pick bytes 3/7/11/15 of the original
// pixels, i.e. the untouched alpha of each of the four pixels.
static const uvec8 kARGBGrayRowIndices = {0, 0, 0, 19, 2, 2, 2, 23,
4, 4, 4, 27, 6, 6, 6, 31};
// Convert 8 ARGB pixels (32 bytes) per loop iteration to grayscale ARGB,
// preserving each pixel's original alpha. Uses UDOT on interleaved data so
// no LD4 deinterleaving load is needed (LD4 is slow on some cores).
// width is assumed to be a positive multiple of 8 — the visible callers
// guard with IS_ALIGNED(width, 8).
void ARGBGrayRow_NEON_DotProd(const uint8_t* src_argb,
uint8_t* dst_argb,
int width) {
asm volatile(
// v24 = coefficient quadruple replicated into all four 32-bit lanes.
"ld1r {v24.4s}, [%[coeffs]] \n"
// v25 = TBL shuffle pattern (constant across the loop).
"ldr q25, [%[indices]] \n"
"1: \n"
"ldp q1, q3, [%[src]], #32 \n" // load 8 ARGB
// Zero the UDOT accumulators each iteration (UDOT accumulates).
"movi v0.4s, #0 \n"
"movi v2.4s, #0 \n"
"subs %w[width], %w[width], #8 \n" // 8 processed per loop
// Each 32-bit lane becomes the weighted sum of that pixel's 4 bytes.
"udot v0.4s, v1.16b, v24.16b \n"
"udot v2.4s, v3.16b, v24.16b \n"
"prfm pldl1keep, [%[src], 448] \n"
// Every sum fits in 16 bits (max 255*256), so the upper half of each
// 32-bit lane is zero; viewing the accumulator as .8h, the values sit
// in even halfwords. Rounding-narrow by 8 puts the final gray bytes in
// even byte positions 0/2/4/6 (odd bytes become zero).
"uqrshrn v0.8b, v0.8h, #8 \n"
"uqrshrn v2.8b, v2.8h, #8 \n"
"tbl v0.16b, {v0.16b, v1.16b}, v25.16b \n" // merge in alpha
"tbl v1.16b, {v2.16b, v3.16b}, v25.16b \n"
"stp q0, q1, [%[dst]], #32 \n" // store 8 pixels
"b.gt 1b \n"
: [src] "+r"(src_argb), // %[src]
[dst] "+r"(dst_argb), // %[dst]
[width] "+r"(width) // %[width]
: [coeffs] "r"(&kARGBGrayRowCoeffs), // %[coeffs]
[indices] "r"(&kARGBGrayRowIndices) // %[indices]
// cc: SUBS sets flags; memory: loads/stores through src/dst pointers.
: "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25");
}
// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
// b = (r * 35 + g * 68 + b * 17) >> 7
// g = (r * 45 + g * 88 + b * 22) >> 7