mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-06 16:56:55 +08:00
HammingDistance_NEON optimized looping
BenchmarkHammingDistance_Opt (93 ms) BenchmarkHammingDistance_C (389 ms) TBR=kjellander@chromium.org BUG=libyuv:701 TEST=BenchmarkHammingDistance Change-Id: I4ba920751eb130cac6a276e441a7c309c495554a Reviewed-on: https://chromium-review.googlesource.com/526401 Reviewed-by: Frank Barchard <fbarchard@google.com> Reviewed-by: Cheng Wang <wangcheng@google.com> Commit-Queue: Frank Barchard <fbarchard@google.com>
This commit is contained in:
parent
baf5248242
commit
47d6eaa377
@ -24,80 +24,36 @@ extern "C" {
|
||||
// 256 bits at a time
|
||||
uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count) {
|
||||
uint32 diff;
|
||||
uint32 total_diff = 0;
|
||||
|
||||
for (int i = 0; i < count; i += 32, src_a += 32, src_b += 32) {
|
||||
|
||||
__asm__ volatile(
|
||||
asm volatile (
|
||||
// Load constants.
|
||||
"vmov.u8 q12, #0x55 \n\t" // m1.
|
||||
"vmov.u8 q13, #0x33 \n\t" // m2.
|
||||
"vmov.u8 q14, #0x0f \n\t" // m4.
|
||||
"vmov.u8 q15, #0x01 \n\t" // h01.
|
||||
"vmov.u8 q4, #0 \n" // accumulator
|
||||
|
||||
// Load d1
|
||||
"vld1.32 {q0,q1}, [%1] \n\t" // load d1.
|
||||
|
||||
// Load d2
|
||||
"vld1.32 {q2, q3}, [%2] \n\t" // load d2.
|
||||
|
||||
// xor
|
||||
"veor.32 q0, q0, q2 \n\t" // xor left side.
|
||||
"veor.32 q3, q1, q3 \n\t" // xor right side.
|
||||
|
||||
// x -= (x >> 1) & m1;
|
||||
"vshr.u32 q1, q0, #1 \n\t"
|
||||
"vshr.u32 q4, q3, #1 \n\t"
|
||||
"vand.32 q1, q1, q12 \n\t"
|
||||
"vand.32 q4, q4, q12 \n\t"
|
||||
"vsub.u32 q0, q0, q1 \n\t"
|
||||
"vsub.u32 q3, q3, q4 \n\t"
|
||||
|
||||
// x = (x & m2) + ((x >> 2) & m2);
|
||||
"vand.32 q1, q0, q13 \n\t"
|
||||
"vand.32 q4, q3, q13 \n\t"
|
||||
"vshr.u32 q2, q0, #2 \n\t"
|
||||
"vshr.u32 q5, q3, #2 \n\t"
|
||||
"vand.32 q2, q2, q13 \n\t"
|
||||
"vand.32 q5, q5, q13 \n\t"
|
||||
"vadd.u32 q0, q1, q2 \n\t"
|
||||
"vadd.u32 q3, q4, q5 \n\t"
|
||||
|
||||
// x = (x + (x >> 4)) & m4;
|
||||
"vshr.u32 q1, q0, #4 \n\t"
|
||||
"vshr.u32 q4, q3, #4 \n\t"
|
||||
"vadd.u32 q0, q0, q1 \n\t"
|
||||
"vadd.u32 q3, q3, q4 \n\t"
|
||||
"vand.32 q0, q0, q14 \n\t"
|
||||
"vand.32 q3, q3, q14 \n\t"
|
||||
|
||||
// (x * h01) >> 24;
|
||||
"vmul.u32 q0, q0, q15 \n\t"
|
||||
"vmul.u32 q3, q3, q15 \n\t"
|
||||
"vshr.u32 q0, q0, #24 \n\t"
|
||||
"vshr.u32 q3, q3, #24 \n\t"
|
||||
|
||||
// sum distances
|
||||
"vpadd.u32 d0, d0, d1 \n\t"
|
||||
"vpadd.u32 d6, d6, d7 \n\t"
|
||||
"vpadd.u32 d0, d0, d0 \n\t"
|
||||
"vpadd.u32 d6, d6, d6 \n\t"
|
||||
|
||||
// add d0,d6.
|
||||
"vadd.u32 d0, d0, d6 \n\t"
|
||||
"1: \n"
|
||||
"vld1.8 {q0, q1}, [%0]! \n"
|
||||
"vld1.8 {q2, q3}, [%1]! \n"
|
||||
"veor.32 q0, q0, q2 \n"
|
||||
"veor.32 q1, q1, q3 \n"
|
||||
"vcnt.i8 q0, q0 \n"
|
||||
"vcnt.i8 q1, q1 \n"
|
||||
"subs %2, %2, #32 \n"
|
||||
"vadd.u8 q0, q0, q1 \n" // 16 byte counts
|
||||
"vpaddl.u8 q0, q0 \n" // 8 shorts
|
||||
"vpadal.u16 q4, q0 \n" // 4 ints
|
||||
"bgt 1b \n"
|
||||
|
||||
"vpadd.u32 d0, d8, d9 \n"
|
||||
"vpadd.u32 d0, d0, d0 \n"
|
||||
// Move distance to return register.
|
||||
"vmov.32 %0, d0[0] \n\t"
|
||||
"vmov.32 %3, d0[0] \n"
|
||||
|
||||
// Output.
|
||||
: "=r"(diff), "+r"(src_a), "+r"(src_b)
|
||||
// input
|
||||
: "+r"(src_a),
|
||||
"+r"(src_b),
|
||||
"+r"(count),
|
||||
"=r"(diff)
|
||||
:
|
||||
// Clobber list.
|
||||
: "q0", "q1", "q2", "q3", "q4", "q5", "q12", "q13", "q14", "q15");
|
||||
total_diff += diff;
|
||||
}
|
||||
return total_diff;
|
||||
: "cc", "q0", "q1", "q2", "q3", "q4");
|
||||
return diff;
|
||||
}
|
||||
|
||||
uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user