diff --git a/source/compare_neon64.cc b/source/compare_neon64.cc index fdd9e0f87..2c0b68b2d 100644 --- a/source/compare_neon64.cc +++ b/source/compare_neon64.cc @@ -24,24 +24,22 @@ extern "C" { uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count) { uint32 diff; asm volatile ( - "movi d4, #0 \n" + "movi v4.4s, #0 \n" "1: \n" - MEMACCESS(0) "ld1 {v0.16b, v1.16b}, [%0], #32 \n" - MEMACCESS(1) "ld1 {v2.16b, v3.16b}, [%1], #32 \n" - "subs %w2, %w2, #32 \n" "eor v0.16b, v0.16b, v2.16b \n" - "eor v1.16b, v1.16b, v3.16b \n" + "eor v1.16b, v1.16b, v3.16b \n" "cnt v0.16b, v0.16b \n" "cnt v1.16b, v1.16b \n" - "uaddlv h0, v0.16b \n" - "uaddlv h1, v1.16b \n" - "add d4, d4, d0 \n" - "add d4, d4, d1 \n" + "subs %w2, %w2, #32 \n" + "add v0.16b, v0.16b, v1.16b \n" + "uaddlp v0.8h, v0.16b \n" + "uadalp v4.4s, v0.8h \n" "b.gt 1b \n" + "addv s4, v4.4s \n" "fmov %w3, s4 \n" : "+r"(src_a), "+r"(src_b),