[AArch64] Add Neon implementation of HashDjb2

Reduction in runtime observed compared to the existing C code compiled
with LLVM 18:

 Cortex-A55: -46.2%
Cortex-A510: -60.4%
 Cortex-A76: -82.9%
Cortex-A720: -87.4%
  Cortex-X1: -90.0%
  Cortex-X2: -91.7%

Change-Id: I39a4479f78299508043a864e64fb40578c66ce19
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5494094
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
George Steed 2024-04-25 10:30:29 +01:00 committed by Frank Barchard
parent 1eae2efbc7
commit 67e5e79dbe
3 changed files with 75 additions and 0 deletions

View File

@ -84,6 +84,8 @@ extern "C" {
// The following are available for AArch64 Neon:
// These HAS_* feature macros are only defined when building for AArch64
// (__aarch64__) and LIBYUV_DISABLE_NEON has not been defined. Callers test
// them with #if defined(...) before referencing the matching kernel, e.g.
// HAS_HASHDJB2_NEON guards the use of HashDjb2_NEON.
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#define HAS_HASHDJB2_NEON
#define HAS_HAMMINGDISTANCE_NEON_DOTPROD
#define HAS_SUMSQUAREERROR_NEON_DOTPROD
#endif
@ -136,6 +138,7 @@ uint32_t SumSquareError_MSA(const uint8_t* src_a,
// Djb2 hash kernels. All variants share one signature: they take a pointer to
// the input bytes (src), the number of bytes to hash (count) and the starting
// hash value (seed), and return the updated hash. The suffix identifies the
// implementation (portable C, SSE4.1, AVX2, Neon).
// NOTE(review): the Neon variant assumes count is a multiple of 16; whether
// the other variants share that requirement is not visible here - confirm.
uint32_t HashDjb2_C(const uint8_t* src, int count, uint32_t seed);
uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed);
uint32_t HashDjb2_AVX2(const uint8_t* src, int count, uint32_t seed);
uint32_t HashDjb2_NEON(const uint8_t* src, int count, uint32_t seed);
#ifdef __cplusplus
} // extern "C"

View File

@ -44,6 +44,11 @@ uint32_t HashDjb2(const uint8_t* src, uint64_t count, uint32_t seed) {
HashDjb2_SSE = HashDjb2_AVX2;
}
#endif
#if defined(HAS_HASHDJB2_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
HashDjb2_SSE = HashDjb2_NEON;
}
#endif
while (count >= (uint64_t)kBlockSize) {
seed = HashDjb2_SSE(src, kBlockSize, seed);

View File

@ -86,6 +86,73 @@ uint32_t SumSquareError_NEON(const uint8_t* src_a,
return sse;
}
// Per-byte weights for one 16-byte block of the Djb2 hash, used by
// HashDjb2_NEON. Entry i (counting across the four vectors) is 33^(15 - i)
// truncated to 32 bits, so the first byte of a block receives the largest
// power and the last byte is multiplied by 1.
static const uvec32 kDjb2Multiplicands[] = {
    // 33^15,    33^14,      33^13,      33^12
    {0x0c3525e1, 0xa3476dc1, 0x3b4039a1, 0x4f5f0981},
    // 33^11,    33^10,      33^9,       33^8
    {0x30f35d61, 0x855cb541, 0x040a9121, 0x747c7101},
    // 33^7,     33^6,       33^5,       33^4
    {0xec41d4e1, 0x4cfa3cc1, 0x025528a1, 0x00121881},
    // 33^3,     33^2,       33^1,       33^0
    {0x00008c61, 0x00000441, 0x00000021, 0x00000001},
};
// TBL index vectors used by HashDjb2_NEON to widen 16 source bytes into four
// vectors of 32-bit lanes. In each 32-bit entry the low byte is the index of
// the source byte to pick (0x00..0x0f); the three 0xff bytes are out of range
// for a single-register table lookup, which TBL resolves to zero, so every
// lane ends up holding one zero-extended source byte.
static const uvec32 kDjb2WidenIndices[] = {
    {0xFFFFFF00u, 0xFFFFFF01u, 0xFFFFFF02u, 0xFFFFFF03u},  // bytes 0..3
    {0xFFFFFF04u, 0xFFFFFF05u, 0xFFFFFF06u, 0xFFFFFF07u},  // bytes 4..7
    {0xFFFFFF08u, 0xFFFFFF09u, 0xFFFFFF0Au, 0xFFFFFF0Bu},  // bytes 8..11
    {0xFFFFFF0Cu, 0xFFFFFF0Du, 0xFFFFFF0Eu, 0xFFFFFF0Fu},  // bytes 12..15
};
// AArch64 Neon implementation of the Djb2 hash (hash = hash * 33 + byte).
//
// Sixteen bytes are consumed per loop iteration. For a block b[0..15] the
// scalar recurrence unrolls to
//   hash = hash * 33^16 + sum(b[i] * 33^(15 - i)),  i = 0..15
// with all arithmetic wrapping at 32 bits. The per-byte weights come from
// kDjb2Multiplicands and the byte-to-32-bit widening is done with TBL using
// kDjb2WidenIndices.
//
// src:   input bytes; exactly `count` bytes are read.
// count: number of bytes to hash. Must be a positive multiple of 16: the loop
//        body executes before the remaining count is tested, so 16 bytes are
//        always read at least once.
// seed:  initial hash value.
// Returns the updated hash.
uint32_t HashDjb2_NEON(const uint8_t* src, int count, uint32_t seed) {
  uint32_t hash = seed;
  const uint32_t c16 = 0x92d9e201;  // 33^16
  uint32_t tmp, tmp2;
  asm("ld1         {v16.4s, v17.4s, v18.4s, v19.4s}, [%[kIdx]] \n"
      "ld1         {v4.4s, v5.4s, v6.4s, v7.4s}, [%[kMuls]]    \n"

      // count is always a multiple of 16.
      // maintain two accumulators, reduce and then final sum in scalar since
      // this has better performance on little cores.
      "1:                                                      \n"
      "ldr         q0, [%[src]], #16                           \n"
      // Sets the flags consumed by b.gt below; nothing in between alters them.
      "subs        %w[count], %w[count], #16                   \n"
      // Zero-extend each group of four bytes into 32-bit lanes. v0 is
      // overwritten last because it is the table for all four lookups.
      "tbl         v3.16b, {v0.16b}, v19.16b                   \n"
      "tbl         v2.16b, {v0.16b}, v18.16b                   \n"
      "tbl         v1.16b, {v0.16b}, v17.16b                   \n"
      "tbl         v0.16b, {v0.16b}, v16.16b                   \n"
      // Weight every byte by its power of 33.
      "mul         v3.4s, v3.4s, v7.4s                         \n"
      "mul         v2.4s, v2.4s, v6.4s                         \n"
      "mla         v3.4s, v1.4s, v5.4s                         \n"
      "mla         v2.4s, v0.4s, v4.4s                         \n"
      // Horizontal sums of both accumulators, combined in scalar registers.
      "addv        s1, v3.4s                                   \n"
      "addv        s0, v2.4s                                   \n"
      "fmov        %w[tmp2], s1                                \n"
      "fmov        %w[tmp], s0                                 \n"
      "add         %w[tmp], %w[tmp], %w[tmp2]                  \n"
      // hash = hash * 33^16 + weighted block sum.
      "madd        %w[hash], %w[hash], %w[c16], %w[tmp]        \n"
      "b.gt        1b                                          \n"
      : [hash] "+r"(hash),                // %[hash]
        [count] "+r"(count),              // %[count]
        // src is advanced by the post-indexed load, so it has to be a
        // read-write operand; an input-only operand must not be modified.
        [src] "+r"(src),                  // %[src]
        [tmp] "=&r"(tmp),                 // %[tmp]
        [tmp2] "=&r"(tmp2)                // %[tmp2]
      : [kMuls] "r"(kDjb2Multiplicands),  // %[kMuls]
        [kIdx] "r"(kDjb2WidenIndices),    // %[kIdx]
        [c16] "r"(c16)                    // %[c16]
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
        "v17", "v18", "v19");
  return hash;
}
uint32_t HammingDistance_NEON_DotProd(const uint8_t* src_a,
const uint8_t* src_b,
int count) {