mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-06 16:56:55 +08:00
Port HammingDistance_NEON 32 bit code to 64 bit
The 32-bit version of HammingDistance_NEON accumulates using a vertical add followed by pairwise adds, which takes 3 instructions per loop iteration instead of 4. These instructions are also portable between 32-bit and 64-bit NEON.

Was: BenchmarkHammingDistance_Opt (105 ms)
Now: BenchmarkHammingDistance_Opt (90 ms)

TBR=kjellander@chromium.org
BUG=libyuv:701
TEST=BenchmarkHammingDistance BenchmarkHammingDistance_Opt (90 ms)

Change-Id: If9e621e0bd2fe2492a1532056f8a1b451ba53d7e
Reviewed-on: https://chromium-review.googlesource.com/526365
Reviewed-by: Frank Barchard <fbarchard@google.com>
Commit-Queue: Frank Barchard <fbarchard@google.com>
This commit is contained in:
parent
47d6eaa377
commit
790e0634a8
@ -24,24 +24,22 @@ extern "C" {
|
||||
uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count) {
|
||||
uint32 diff;
|
||||
asm volatile (
|
||||
"movi d4, #0 \n"
|
||||
"movi v4.4s, #0 \n"
|
||||
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld1 {v0.16b, v1.16b}, [%0], #32 \n"
|
||||
MEMACCESS(1)
|
||||
"ld1 {v2.16b, v3.16b}, [%1], #32 \n"
|
||||
"subs %w2, %w2, #32 \n"
|
||||
"eor v0.16b, v0.16b, v2.16b \n"
|
||||
"eor v1.16b, v1.16b, v3.16b \n"
|
||||
"eor v1.16b, v1.16b, v3.16b \n"
|
||||
"cnt v0.16b, v0.16b \n"
|
||||
"cnt v1.16b, v1.16b \n"
|
||||
"uaddlv h0, v0.16b \n"
|
||||
"uaddlv h1, v1.16b \n"
|
||||
"add d4, d4, d0 \n"
|
||||
"add d4, d4, d1 \n"
|
||||
"subs %w2, %w2, #32 \n"
|
||||
"add v0.16b, v0.16b, v1.16b \n"
|
||||
"uaddlp v0.8h, v0.16b \n"
|
||||
"uadalp v4.4s, v0.8h \n"
|
||||
"b.gt 1b \n"
|
||||
|
||||
"addv s4, v4.4s \n"
|
||||
"fmov %w3, s4 \n"
|
||||
: "+r"(src_a),
|
||||
"+r"(src_b),
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user