From baf5248242e812ec173a785caad5238ab3c89a4a Mon Sep 17 00:00:00 2001
From: Frank Barchard
Date: Mon, 5 Jun 2017 18:44:55 -0700
Subject: [PATCH] HammingDistance_NEON ported to 32 bit

TBR=kjellander@chromium.org
BUG=libyuv:701
TEST=BenchmarkHammingDistance

Change-Id: I252efd8a27aa11a0fe7d8030d7c8b57f20f04760
Reviewed-on: https://chromium-review.googlesource.com/525232
Reviewed-by: Frank Barchard
Commit-Queue: Frank Barchard
---
 include/libyuv/compare_row.h |  4 --
 source/compare_neon.cc       | 79 ++++++++++++++++++++++++++++++++++++
 2 files changed, 79 insertions(+), 4 deletions(-)

diff --git a/include/libyuv/compare_row.h b/include/libyuv/compare_row.h
index a29c4c09e..971aecf3c 100644
--- a/include/libyuv/compare_row.h
+++ b/include/libyuv/compare_row.h
@@ -66,10 +66,6 @@ extern "C" {
 #if !defined(LIBYUV_DISABLE_NEON) && \
     (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
 #define HAS_SUMSQUAREERROR_NEON
-#endif
-
-// The following are available for Neon 64 bit:
-#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
 #define HAS_HAMMINGDISTANCE_NEON
 #endif
 
diff --git a/source/compare_neon.cc b/source/compare_neon.cc
index b7991c171..660f138a5 100644
--- a/source/compare_neon.cc
+++ b/source/compare_neon.cc
@@ -21,6 +21,85 @@ extern "C" {
 #if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
     !defined(__aarch64__)
 
+// Counts the bits that differ between src_a and src_b, 256 bits at a time.
+// Every pass loads a full 32 bytes from each buffer, so the last pass reads
+// beyond count bytes unless count is a multiple of 32.
+uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count) {
+  uint32 diff;
+  uint32 total_diff = 0;
+
+  for (int i = 0; i < count; i += 32, src_a += 32, src_b += 32) {
+    __asm__ volatile(
+        // Load constants.
+        "vmov.u8   q12, #0x55      \n\t"  // m1.
+        "vmov.u8   q13, #0x33      \n\t"  // m2.
+        "vmov.u8   q14, #0x0f      \n\t"  // m4.
+        "vmov.u8   q15, #0x01      \n\t"  // h01.
+
+        // Load 32 bytes from each source.
+        "vld1.32   {q0, q1}, [%1]  \n\t"  // load d1.
+        "vld1.32   {q2, q3}, [%2]  \n\t"  // load d2.
+
+        // xor
+        "veor.32   q0, q0, q2      \n\t"  // xor left side.
+        "veor.32   q3, q1, q3      \n\t"  // xor right side.
+
+        // x -= (x >> 1) & m1;
+        "vshr.u32  q1, q0, #1      \n\t"
+        "vshr.u32  q4, q3, #1      \n\t"
+        "vand.32   q1, q1, q12     \n\t"
+        "vand.32   q4, q4, q12     \n\t"
+        "vsub.u32  q0, q0, q1      \n\t"
+        "vsub.u32  q3, q3, q4      \n\t"
+
+        // x = (x & m2) + ((x >> 2) & m2);
+        "vand.32   q1, q0, q13     \n\t"
+        "vand.32   q4, q3, q13     \n\t"
+        "vshr.u32  q2, q0, #2      \n\t"
+        "vshr.u32  q5, q3, #2      \n\t"
+        "vand.32   q2, q2, q13     \n\t"
+        "vand.32   q5, q5, q13     \n\t"
+        "vadd.u32  q0, q1, q2      \n\t"
+        "vadd.u32  q3, q4, q5      \n\t"
+
+        // x = (x + (x >> 4)) & m4;
+        "vshr.u32  q1, q0, #4      \n\t"
+        "vshr.u32  q4, q3, #4      \n\t"
+        "vadd.u32  q0, q0, q1      \n\t"
+        "vadd.u32  q3, q3, q4      \n\t"
+        "vand.32   q0, q0, q14     \n\t"
+        "vand.32   q3, q3, q14     \n\t"
+
+        // (x * h01) >> 24;
+        "vmul.u32  q0, q0, q15     \n\t"
+        "vmul.u32  q3, q3, q15     \n\t"
+        "vshr.u32  q0, q0, #24     \n\t"
+        "vshr.u32  q3, q3, #24     \n\t"
+
+        // sum distances
+        "vpadd.u32 d0, d0, d1      \n\t"
+        "vpadd.u32 d6, d6, d7      \n\t"
+        "vpadd.u32 d0, d0, d0      \n\t"
+        "vpadd.u32 d6, d6, d6      \n\t"
+
+        // add d0,d6.
+        "vadd.u32  d0, d0, d6      \n\t"
+
+        // Move distance to return register.
+        "vmov.32   %0, d0[0]       \n\t"
+
+        // Output: %0 is written only by the final vmov, after every input read.
+        : "=r"(diff)
+        // Input: the pointers are read but never advanced inside the asm.
+        : "r"(src_a), "r"(src_b)
+        // Clobbers: "memory" because the asm loads through %1 and %2.
+        : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q12", "q13", "q14",
+          "q15");
+    total_diff += diff;
+  }
+  return total_diff;
+}
+
 uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
   uint32 sse;
   asm volatile (