mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-07 17:26:49 +08:00
HammingDistance_NEON ported to 32 bit
TBR=kjellander@chromium.org BUG=libyuv:701 TEST=BenchmarkHammingDistance Change-Id: I252efd8a27aa11a0fe7d8030d7c8b57f20f04760 Reviewed-on: https://chromium-review.googlesource.com/525232 Reviewed-by: Frank Barchard <fbarchard@google.com> Commit-Queue: Frank Barchard <fbarchard@google.com>
This commit is contained in:
parent
44abf70187
commit
baf5248242
@ -66,10 +66,6 @@ extern "C" {
|
||||
#if !defined(LIBYUV_DISABLE_NEON) && \
|
||||
(defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
|
||||
#define HAS_SUMSQUAREERROR_NEON
|
||||
#endif
|
||||
|
||||
// The following are available for Neon 64 bit:
|
||||
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
|
||||
#define HAS_HAMMINGDISTANCE_NEON
|
||||
#endif
|
||||
|
||||
|
||||
@ -21,6 +21,85 @@ extern "C" {
|
||||
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
|
||||
!defined(__aarch64__)
|
||||
|
||||
// 256 bits at a time
|
||||
uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count) {
|
||||
uint32 diff;
|
||||
uint32 total_diff = 0;
|
||||
|
||||
for (int i = 0; i < count; i += 32, src_a += 32, src_b += 32) {
|
||||
|
||||
__asm__ volatile(
|
||||
// Load constants.
|
||||
"vmov.u8 q12, #0x55 \n\t" // m1.
|
||||
"vmov.u8 q13, #0x33 \n\t" // m2.
|
||||
"vmov.u8 q14, #0x0f \n\t" // m4.
|
||||
"vmov.u8 q15, #0x01 \n\t" // h01.
|
||||
|
||||
// Load d1
|
||||
"vld1.32 {q0,q1}, [%1] \n\t" // load d1.
|
||||
|
||||
// Load d2
|
||||
"vld1.32 {q2, q3}, [%2] \n\t" // load d2.
|
||||
|
||||
// xor
|
||||
"veor.32 q0, q0, q2 \n\t" // xor left side.
|
||||
"veor.32 q3, q1, q3 \n\t" // xor right side.
|
||||
|
||||
// x -= (x >> 1) & m1;
|
||||
"vshr.u32 q1, q0, #1 \n\t"
|
||||
"vshr.u32 q4, q3, #1 \n\t"
|
||||
"vand.32 q1, q1, q12 \n\t"
|
||||
"vand.32 q4, q4, q12 \n\t"
|
||||
"vsub.u32 q0, q0, q1 \n\t"
|
||||
"vsub.u32 q3, q3, q4 \n\t"
|
||||
|
||||
// x = (x & m2) + ((x >> 2) & m2);
|
||||
"vand.32 q1, q0, q13 \n\t"
|
||||
"vand.32 q4, q3, q13 \n\t"
|
||||
"vshr.u32 q2, q0, #2 \n\t"
|
||||
"vshr.u32 q5, q3, #2 \n\t"
|
||||
"vand.32 q2, q2, q13 \n\t"
|
||||
"vand.32 q5, q5, q13 \n\t"
|
||||
"vadd.u32 q0, q1, q2 \n\t"
|
||||
"vadd.u32 q3, q4, q5 \n\t"
|
||||
|
||||
// x = (x + (x >> 4)) & m4;
|
||||
"vshr.u32 q1, q0, #4 \n\t"
|
||||
"vshr.u32 q4, q3, #4 \n\t"
|
||||
"vadd.u32 q0, q0, q1 \n\t"
|
||||
"vadd.u32 q3, q3, q4 \n\t"
|
||||
"vand.32 q0, q0, q14 \n\t"
|
||||
"vand.32 q3, q3, q14 \n\t"
|
||||
|
||||
// (x * h01) >> 24;
|
||||
"vmul.u32 q0, q0, q15 \n\t"
|
||||
"vmul.u32 q3, q3, q15 \n\t"
|
||||
"vshr.u32 q0, q0, #24 \n\t"
|
||||
"vshr.u32 q3, q3, #24 \n\t"
|
||||
|
||||
// sum distances
|
||||
"vpadd.u32 d0, d0, d1 \n\t"
|
||||
"vpadd.u32 d6, d6, d7 \n\t"
|
||||
"vpadd.u32 d0, d0, d0 \n\t"
|
||||
"vpadd.u32 d6, d6, d6 \n\t"
|
||||
|
||||
// add d0,d6.
|
||||
"vadd.u32 d0, d0, d6 \n\t"
|
||||
|
||||
// Move distance to return register.
|
||||
"vmov.32 %0, d0[0] \n\t"
|
||||
|
||||
// Output.
|
||||
: "=r"(diff), "+r"(src_a), "+r"(src_b)
|
||||
// input
|
||||
:
|
||||
// Clobber list.
|
||||
: "q0", "q1", "q2", "q3", "q4", "q5", "q12", "q13", "q14", "q15");
|
||||
total_diff += diff;
|
||||
}
|
||||
return total_diff;
|
||||
}
|
||||
|
||||
uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
|
||||
uint32 sse;
|
||||
asm volatile (
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user