From 67e5e79dbe6745bf2b0c25c1a56acf97e2b8966a Mon Sep 17 00:00:00 2001 From: George Steed Date: Thu, 25 Apr 2024 10:30:29 +0100 Subject: [PATCH] [AArch64] Add Neon implementation of HashDjb2 Reduction in runtime observed compared to the existing C code compiled with LLVM 18: Cortex-A55: -46.2% Cortex-A510: -60.4% Cortex-A76: -82.9% Cortex-A720: -87.4% Cortex-X1: -90.0% Cortex-X2: -91.7% Change-Id: I39a4479f78299508043a864e64fb40578c66ce19 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5494094 Reviewed-by: Frank Barchard --- include/libyuv/compare_row.h | 3 ++ source/compare.cc | 5 +++ source/compare_neon64.cc | 67 ++++++++++++++++++++++++++++++++++++ 3 files changed, 75 insertions(+) diff --git a/include/libyuv/compare_row.h b/include/libyuv/compare_row.h index 4ab4b49ad..8a8358db3 100644 --- a/include/libyuv/compare_row.h +++ b/include/libyuv/compare_row.h @@ -84,6 +84,8 @@ extern "C" { // The following are available for AArch64 Neon: #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) +#define HAS_HASHDJB2_NEON + #define HAS_HAMMINGDISTANCE_NEON_DOTPROD #define HAS_SUMSQUAREERROR_NEON_DOTPROD #endif @@ -136,6 +138,7 @@ uint32_t SumSquareError_MSA(const uint8_t* src_a, uint32_t HashDjb2_C(const uint8_t* src, int count, uint32_t seed); uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed); uint32_t HashDjb2_AVX2(const uint8_t* src, int count, uint32_t seed); +uint32_t HashDjb2_NEON(const uint8_t* src, int count, uint32_t seed); #ifdef __cplusplus } // extern "C" diff --git a/source/compare.cc b/source/compare.cc index 3783baea2..3ce4cfea9 100644 --- a/source/compare.cc +++ b/source/compare.cc @@ -44,6 +44,11 @@ uint32_t HashDjb2(const uint8_t* src, uint64_t count, uint32_t seed) { HashDjb2_SSE = HashDjb2_AVX2; } #endif +#if defined(HAS_HASHDJB2_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + HashDjb2_SSE = HashDjb2_NEON; + } +#endif while (count >= (uint64_t)kBlockSize) { seed = HashDjb2_SSE(src, kBlockSize, seed); diff 
--git a/source/compare_neon64.cc b/source/compare_neon64.cc
index 0b378c632..b61b9f7ac 100644
--- a/source/compare_neon64.cc
+++ b/source/compare_neon64.cc
@@ -86,6 +86,73 @@ uint32_t SumSquareError_NEON(const uint8_t* src_a,
   return sse;
 }
 
+static const uvec32 kDjb2Multiplicands[] = {
+    {0x0c3525e1,   // 33^15
+     0xa3476dc1,   // 33^14
+     0x3b4039a1,   // 33^13
+     0x4f5f0981},  // 33^12
+    {0x30f35d61,   // 33^11
+     0x855cb541,   // 33^10
+     0x040a9121,   // 33^9
+     0x747c7101},  // 33^8
+    {0xec41d4e1,   // 33^7
+     0x4cfa3cc1,   // 33^6
+     0x025528a1,   // 33^5
+     0x00121881},  // 33^4
+    {0x00008c61,   // 33^3
+     0x00000441,   // 33^2
+     0x00000021,   // 33^1
+     0x00000001},  // 33^0
+};
+
+static const uvec32 kDjb2WidenIndices[] = {
+    {0xffffff00U, 0xffffff01U, 0xffffff02U, 0xffffff03U},
+    {0xffffff04U, 0xffffff05U, 0xffffff06U, 0xffffff07U},
+    {0xffffff08U, 0xffffff09U, 0xffffff0aU, 0xffffff0bU},
+    {0xffffff0cU, 0xffffff0dU, 0xffffff0eU, 0xffffff0fU},
+};
+
+uint32_t HashDjb2_NEON(const uint8_t* src, int count, uint32_t seed) {
+  uint32_t hash = seed;
+  const uint32_t c16 = 0x92d9e201;  // 33^16
+  uint32_t tmp, tmp2;
+  asm("ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%[kIdx]] \n"
+      "ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%[kMuls]] \n"
+      // count must be a positive multiple of 16: the loop tests it at the end.
+      // Per 16 bytes: hash = hash * 33^16 + sum(src[i] * 33^(15 - i)).
+      // Keep two accumulators, reduce each and do the final sum in scalar
+      // code since this has better performance on little cores.
+      "1: \n"
+      "ldr q0, [%[src]], #16 \n"           // load 16 bytes, src += 16
+      "subs %w[count], %w[count], #16 \n"  // sets the flags used by b.gt
+      "tbl v3.16b, {v0.16b}, v19.16b \n"   // src[12..15] -> 4 x u32 (0xff -> 0)
+      "tbl v2.16b, {v0.16b}, v18.16b \n"   // src[8..11]  -> 4 x u32
+      "tbl v1.16b, {v0.16b}, v17.16b \n"   // src[4..7]   -> 4 x u32
+      "tbl v0.16b, {v0.16b}, v16.16b \n"   // src[0..3]   -> 4 x u32
+      "mul v3.4s, v3.4s, v7.4s \n"         // src[12..15] * 33^3..33^0
+      "mul v2.4s, v2.4s, v6.4s \n"         // src[8..11] * 33^7..33^4
+      "mla v3.4s, v1.4s, v5.4s \n"         // += src[4..7] * 33^11..33^8
+      "mla v2.4s, v0.4s, v4.4s \n"         // += src[0..3] * 33^15..33^12
+      "addv s1, v3.4s \n"                  // sum the lanes of each accumulator
+      "addv s0, v2.4s \n"
+      "fmov %w[tmp2], s1 \n"
+      "fmov %w[tmp], s0 \n"
+      "add %w[tmp], %w[tmp], %w[tmp2] \n"
+      "madd %w[hash], %w[hash], %w[c16], %w[tmp] \n"  // hash * 33^16 + tmp
+      "b.gt 1b \n"                         // loop while count > 0
+      : [hash] "+r"(hash),                // %[hash]
+        [count] "+r"(count),              // %[count]
+        [src] "+r"(src),                  // %[src], written by post-index ldr
+        [tmp] "=&r"(tmp),                 // %[tmp]
+        [tmp2] "=&r"(tmp2)                // %[tmp2]
+      : [kMuls] "r"(kDjb2Multiplicands),  // %[kMuls]
+        [kIdx] "r"(kDjb2WidenIndices),    // %[kIdx]
+        [c16] "r"(c16)                    // %[c16]
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
+        "v17", "v18", "v19");
+  return hash;
+}
+
 uint32_t HammingDistance_NEON_DotProd(const uint8_t* src_a,
                                       const uint8_t* src_b,
                                       int count) {