From 790e0634a8a974cdc4721e5de34a06dc4961f7fa Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Tue, 6 Jun 2017 17:50:32 -0700 Subject: [PATCH] Port HammingDistance_NEON 32 bit code to 64 bit The 32 bit version of HammingDistance_NEON accumulates using vertical add and paired adds, which takes 3 instructions instead of 4. The instructions are also portable between 32 and 64 bit. Was BenchmarkHammingDistance_Opt (105 ms) Now BenchmarkHammingDistance_Opt (90 ms) TBR=kjellander@chromium.org BUG=libyuv:701 TEST=BenchmarkHammingDistance BenchmarkHammingDistance_Opt (90 ms) Change-Id: If9e621e0bd2fe2492a1532056f8a1b451ba53d7e Reviewed-on: https://chromium-review.googlesource.com/526365 Reviewed-by: Frank Barchard Commit-Queue: Frank Barchard --- source/compare_neon64.cc | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/source/compare_neon64.cc b/source/compare_neon64.cc index fdd9e0f87..2c0b68b2d 100644 --- a/source/compare_neon64.cc +++ b/source/compare_neon64.cc @@ -24,24 +24,22 @@ extern "C" { uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count) { uint32 diff; asm volatile ( - "movi d4, #0 \n" + "movi v4.4s, #0 \n" "1: \n" - MEMACCESS(0) "ld1 {v0.16b, v1.16b}, [%0], #32 \n" - MEMACCESS(1) "ld1 {v2.16b, v3.16b}, [%1], #32 \n" - "subs %w2, %w2, #32 \n" "eor v0.16b, v0.16b, v2.16b \n" - "eor v1.16b, v1.16b, v3.16b \n" + "eor v1.16b, v1.16b, v3.16b \n" "cnt v0.16b, v0.16b \n" "cnt v1.16b, v1.16b \n" - "uaddlv h0, v0.16b \n" - "uaddlv h1, v1.16b \n" - "add d4, d4, d0 \n" - "add d4, d4, d1 \n" + "subs %w2, %w2, #32 \n" + "add v0.16b, v0.16b, v1.16b \n" + "uaddlp v0.8h, v0.16b \n" + "uadalp v4.4s, v0.8h \n" "b.gt 1b \n" + "addv s4, v4.4s \n" "fmov %w3, s4 \n" : "+r"(src_a), "+r"(src_b),