From 67e5e79dbe6745bf2b0c25c1a56acf97e2b8966a Mon Sep 17 00:00:00 2001
From: George Steed <george.steed@arm.com>
Date: Thu, 25 Apr 2024 10:30:29 +0100
Subject: [PATCH] [AArch64] Add Neon implementation of HashDjb2

Reduction in runtime observed compared to the existing C code compiled
with LLVM 18:

 Cortex-A55: -46.2%
Cortex-A510: -60.4%
 Cortex-A76: -82.9%
Cortex-A720: -87.4%
  Cortex-X1: -90.0%
  Cortex-X2: -91.7%

Change-Id: I39a4479f78299508043a864e64fb40578c66ce19
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5494094
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
---
 include/libyuv/compare_row.h |  3 ++
 source/compare.cc            |  5 +++
 source/compare_neon64.cc     | 67 ++++++++++++++++++++++++++++++++++++
 3 files changed, 75 insertions(+)

diff --git a/include/libyuv/compare_row.h b/include/libyuv/compare_row.h
index 4ab4b49ad..8a8358db3 100644
--- a/include/libyuv/compare_row.h
+++ b/include/libyuv/compare_row.h
@@ -84,6 +84,8 @@ extern "C" {
 
 // The following are available for AArch64 Neon:
 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+#define HAS_HASHDJB2_NEON
+
 #define HAS_HAMMINGDISTANCE_NEON_DOTPROD
 #define HAS_SUMSQUAREERROR_NEON_DOTPROD
 #endif
@@ -136,6 +138,7 @@ uint32_t SumSquareError_MSA(const uint8_t* src_a,
 uint32_t HashDjb2_C(const uint8_t* src, int count, uint32_t seed);
 uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed);
 uint32_t HashDjb2_AVX2(const uint8_t* src, int count, uint32_t seed);
+uint32_t HashDjb2_NEON(const uint8_t* src, int count, uint32_t seed);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/source/compare.cc b/source/compare.cc
index 3783baea2..3ce4cfea9 100644
--- a/source/compare.cc
+++ b/source/compare.cc
@@ -44,6 +44,11 @@ uint32_t HashDjb2(const uint8_t* src, uint64_t count, uint32_t seed) {
     HashDjb2_SSE = HashDjb2_AVX2;
   }
 #endif
+#if defined(HAS_HASHDJB2_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    HashDjb2_SSE = HashDjb2_NEON;
+  }
+#endif
 
   while (count >= (uint64_t)kBlockSize) {
     seed = HashDjb2_SSE(src, kBlockSize, seed);
diff --git a/source/compare_neon64.cc b/source/compare_neon64.cc
index 0b378c632..b61b9f7ac 100644
--- a/source/compare_neon64.cc
+++ b/source/compare_neon64.cc
@@ -86,6 +86,73 @@ uint32_t SumSquareError_NEON(const uint8_t* src_a,
   return sse;
 }
 
+static const uvec32 kDjb2Multiplicands[] = {  // Entry i: 33^(15-i) mod 2^32.
+    {0x0c3525e1,   // 33^15
+     0xa3476dc1,   // 33^14
+     0x3b4039a1,   // 33^13
+     0x4f5f0981},  // 33^12
+    {0x30f35d61,   // 33^11
+     0x855cb541,   // 33^10
+     0x040a9121,   // 33^9
+     0x747c7101},  // 33^8
+    {0xec41d4e1,   // 33^7
+     0x4cfa3cc1,   // 33^6
+     0x025528a1,   // 33^5
+     0x00121881},  // 33^4
+    {0x00008c61,   // 33^3
+     0x00000441,   // 33^2
+     0x00000021,   // 33^1
+     0x00000001},  // 33^0
+};
+
+static const uvec32 kDjb2WidenIndices[] = {  // TBL indices; 0xff yields zero.
+    {0xffffff00U, 0xffffff01U, 0xffffff02U, 0xffffff03U},
+    {0xffffff04U, 0xffffff05U, 0xffffff06U, 0xffffff07U},
+    {0xffffff08U, 0xffffff09U, 0xffffff0aU, 0xffffff0bU},
+    {0xffffff0cU, 0xffffff0dU, 0xffffff0eU, 0xffffff0fU},
+};
+
+// Djb2 hash (hash = hash * 33 + byte) over count bytes of src, from seed.
+uint32_t HashDjb2_NEON(const uint8_t* src, int count, uint32_t seed) {
+  uint32_t hash = seed;
+  const uint32_t c16 = 0x92d9e201;  // 33^16
+  uint32_t tmp, tmp2;
+  asm("ld1   {v16.4s, v17.4s, v18.4s, v19.4s}, [%[kIdx]] \n"
+      "ld1   {v4.4s, v5.4s, v6.4s, v7.4s}, [%[kMuls]]    \n"
+      // count must be a positive multiple of 16; each pass consumes 16 bytes.
+      // maintain two accumulators, reduce and then final sum in scalar since
+      // this has better performance on little cores.
+      "1:                                \n"
+      "ldr   q0, [%[src]], #16           \n"
+      "subs  %w[count], %w[count], #16   \n"
+      "tbl   v3.16b, {v0.16b}, v19.16b   \n"
+      "tbl   v2.16b, {v0.16b}, v18.16b   \n"
+      "tbl   v1.16b, {v0.16b}, v17.16b   \n"
+      "tbl   v0.16b, {v0.16b}, v16.16b   \n"
+      "mul   v3.4s, v3.4s, v7.4s         \n"
+      "mul   v2.4s, v2.4s, v6.4s         \n"
+      "mla   v3.4s, v1.4s, v5.4s         \n"
+      "mla   v2.4s, v0.4s, v4.4s         \n"
+      "addv  s1, v3.4s                   \n"
+      "addv  s0, v2.4s                   \n"
+      "fmov  %w[tmp2], s1                \n"
+      "fmov  %w[tmp], s0                 \n"
+      "add   %w[tmp], %w[tmp], %w[tmp2]  \n"
+      "madd  %w[hash], %w[hash], %w[c16], %w[tmp] \n"
+      "b.gt  1b                          \n"
+      : [hash] "+r"(hash),                // %[hash]
+        [count] "+r"(count),              // %[count]
+        [src] "+r"(src),                  // %[src]
+        [tmp] "=&r"(tmp),                 // %[tmp]
+        [tmp2] "=&r"(tmp2)                // %[tmp2]
+      : [kMuls] "r"(kDjb2Multiplicands),  // %[kMuls]
+        [kIdx] "r"(kDjb2WidenIndices),    // %[kIdx]
+        [c16] "r"(c16)                    // %[c16]
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
+        "v17", "v18", "v19");
+  return hash;
+}
+
 uint32_t HammingDistance_NEON_DotProd(const uint8_t* src_a,
                                       const uint8_t* src_b,
                                       int count) {