mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-07 17:26:49 +08:00
[AArch64] Add Neon implementation of HashDjb2
Reduction in runtime observed compared to the existing C code compiled with LLVM 18:

- Cortex-A55: -46.2%
- Cortex-A510: -60.4%
- Cortex-A76: -82.9%
- Cortex-A720: -87.4%
- Cortex-X1: -90.0%
- Cortex-X2: -91.7%

Change-Id: I39a4479f78299508043a864e64fb40578c66ce19
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5494094
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
parent
1eae2efbc7
commit
67e5e79dbe
@ -84,6 +84,8 @@ extern "C" {
|
||||
|
||||
// The following are available for AArch64 Neon:
|
||||
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
|
||||
#define HAS_HASHDJB2_NEON
|
||||
|
||||
#define HAS_HAMMINGDISTANCE_NEON_DOTPROD
|
||||
#define HAS_SUMSQUAREERROR_NEON_DOTPROD
|
||||
#endif
|
||||
@ -136,6 +138,7 @@ uint32_t SumSquareError_MSA(const uint8_t* src_a,
|
||||
uint32_t HashDjb2_C(const uint8_t* src, int count, uint32_t seed);
|
||||
uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed);
|
||||
uint32_t HashDjb2_AVX2(const uint8_t* src, int count, uint32_t seed);
|
||||
uint32_t HashDjb2_NEON(const uint8_t* src, int count, uint32_t seed);
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
|
||||
@ -44,6 +44,11 @@ uint32_t HashDjb2(const uint8_t* src, uint64_t count, uint32_t seed) {
|
||||
HashDjb2_SSE = HashDjb2_AVX2;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_HASHDJB2_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
HashDjb2_SSE = HashDjb2_NEON;
|
||||
}
|
||||
#endif
|
||||
|
||||
while (count >= (uint64_t)kBlockSize) {
|
||||
seed = HashDjb2_SSE(src, kBlockSize, seed);
|
||||
|
||||
@ -86,6 +86,73 @@ uint32_t SumSquareError_NEON(const uint8_t* src_a,
|
||||
return sse;
|
||||
}
|
||||
|
||||
// Per-byte weights for one 16-byte block of the djb2 hash: byte i of the block
// is multiplied by 33^(15 - i), with each power truncated to 32 bits.
// One row per group of four consecutive bytes; HashDjb2_NEON loads the four
// rows into v4..v7.
// NOTE(review): assumes uvec32 is a vector of four 32-bit unsigned lanes;
// confirm against its declaration.
static const uvec32 kDjb2Multiplicands[] = {
    // Bytes 0..3: 33^15, 33^14, 33^13, 33^12.
    {0x0c3525e1, 0xa3476dc1, 0x3b4039a1, 0x4f5f0981},
    // Bytes 4..7: 33^11, 33^10, 33^9, 33^8.
    {0x30f35d61, 0x855cb541, 0x040a9121, 0x747c7101},
    // Bytes 8..11: 33^7, 33^6, 33^5, 33^4.
    {0xec41d4e1, 0x4cfa3cc1, 0x025528a1, 0x00121881},
    // Bytes 12..15: 33^3, 33^2, 33^1, 33^0.
    {0x00008c61, 0x00000441, 0x00000021, 0x00000001},
};
|
||||
|
||||
// TBL index vectors used by HashDjb2_NEON to zero-extend source bytes to
// 32-bit lanes. In each lane the low byte selects one source byte (0x00..0x0f)
// and the remaining three index bytes are 0xff, which is out of range for a
// single-register TBL and therefore produces zero.
// Row k widens source bytes 4k .. 4k+3.
// NOTE(review): the low-byte-first lane layout assumes a little-endian target.
static const uvec32 kDjb2WidenIndices[] = {
    {0xffffff00U, 0xffffff01U, 0xffffff02U, 0xffffff03U},  // bytes 0..3
    {0xffffff04U, 0xffffff05U, 0xffffff06U, 0xffffff07U},  // bytes 4..7
    {0xffffff08U, 0xffffff09U, 0xffffff0aU, 0xffffff0bU},  // bytes 8..11
    {0xffffff0cU, 0xffffff0dU, 0xffffff0eU, 0xffffff0fU},  // bytes 12..15
};
|
||||
|
||||
uint32_t HashDjb2_NEON(const uint8_t* src, int count, uint32_t seed) {
|
||||
uint32_t hash = seed;
|
||||
const uint32_t c16 = 0x92d9e201; // 33^16
|
||||
uint32_t tmp, tmp2;
|
||||
asm("ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%[kIdx]] \n"
|
||||
"ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%[kMuls]] \n"
|
||||
|
||||
// count is always a multiple of 16.
|
||||
// maintain two accumulators, reduce and then final sum in scalar since
|
||||
// this has better performance on little cores.
|
||||
"1: \n"
|
||||
"ldr q0, [%[src]], #16 \n"
|
||||
"subs %w[count], %w[count], #16 \n"
|
||||
"tbl v3.16b, {v0.16b}, v19.16b \n"
|
||||
"tbl v2.16b, {v0.16b}, v18.16b \n"
|
||||
"tbl v1.16b, {v0.16b}, v17.16b \n"
|
||||
"tbl v0.16b, {v0.16b}, v16.16b \n"
|
||||
"mul v3.4s, v3.4s, v7.4s \n"
|
||||
"mul v2.4s, v2.4s, v6.4s \n"
|
||||
"mla v3.4s, v1.4s, v5.4s \n"
|
||||
"mla v2.4s, v0.4s, v4.4s \n"
|
||||
"addv s1, v3.4s \n"
|
||||
"addv s0, v2.4s \n"
|
||||
"fmov %w[tmp2], s1 \n"
|
||||
"fmov %w[tmp], s0 \n"
|
||||
"add %w[tmp], %w[tmp], %w[tmp2] \n"
|
||||
"madd %w[hash], %w[hash], %w[c16], %w[tmp] \n"
|
||||
"b.gt 1b \n"
|
||||
: [hash] "+r"(hash), // %[hash]
|
||||
[count] "+r"(count), // %[count]
|
||||
[tmp] "=&r"(tmp), // %[tmp]
|
||||
[tmp2] "=&r"(tmp2) // %[tmp2]
|
||||
: [src] "r"(src), // %[src]
|
||||
[kMuls] "r"(kDjb2Multiplicands), // %[kMuls]
|
||||
[kIdx] "r"(kDjb2WidenIndices), // %[kIdx]
|
||||
[c16] "r"(c16) // %[c16]
|
||||
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
|
||||
"v17", "v18", "v19");
|
||||
return hash;
|
||||
}
|
||||
|
||||
uint32_t HammingDistance_NEON_DotProd(const uint8_t* src_a,
|
||||
const uint8_t* src_b,
|
||||
int count) {
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user