mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-07 01:06:46 +08:00
Hamming Distance using 16 bit accumulators
Summing 16 bit hamming codes restricts the maximum length, but saves an inner loop instruction. The outer loop can sum the values.

32 bit Neon
Now BenchmarkHammingDistance_Opt (78 ms)
Was BenchmarkHammingDistance_Opt (92 ms)

64 bit Neon
Now BenchmarkHammingDistance_Opt (85 ms)
Was BenchmarkHammingDistance_Opt (92 ms)

R=wangcheng@google.com
TBR=kjellander@chromium.org
BUG=libyuv:701
TEST=BenchmarkHammingDistance

Change-Id: Ie40f0eac2f3339c33b833b42af5d394b122066ae
Reviewed-on: https://chromium-review.googlesource.com/526932
Reviewed-by: Frank Barchard <fbarchard@google.com>
Reviewed-by: Cheng Wang <wangcheng@google.com>
Commit-Queue: Frank Barchard <fbarchard@google.com>
This commit is contained in:
parent
790e0634a8
commit
d981495b42
@ -1,6 +1,6 @@
|
|||||||
Name: libyuv
|
Name: libyuv
|
||||||
URL: http://code.google.com/p/libyuv/
|
URL: http://code.google.com/p/libyuv/
|
||||||
Version: 1659
|
Version: 1660
|
||||||
License: BSD
|
License: BSD
|
||||||
License File: LICENSE
|
License File: LICENSE
|
||||||
|
|
||||||
|
|||||||
@ -11,6 +11,6 @@
|
|||||||
#ifndef INCLUDE_LIBYUV_VERSION_H_
|
#ifndef INCLUDE_LIBYUV_VERSION_H_
|
||||||
#define INCLUDE_LIBYUV_VERSION_H_
|
#define INCLUDE_LIBYUV_VERSION_H_
|
||||||
|
|
||||||
#define LIBYUV_VERSION 1659
|
#define LIBYUV_VERSION 1660
|
||||||
|
|
||||||
#endif // INCLUDE_LIBYUV_VERSION_H_
|
#endif // INCLUDE_LIBYUV_VERSION_H_
|
||||||
|
|||||||
@ -22,12 +22,12 @@ extern "C" {
|
|||||||
!defined(__aarch64__)
|
!defined(__aarch64__)
|
||||||
|
|
||||||
// 256 bits at a time
|
// 256 bits at a time
|
||||||
|
// uses short accumulator which restricts count to 131 KB
|
||||||
uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count) {
|
uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count) {
|
||||||
uint32 diff;
|
uint32 diff;
|
||||||
|
|
||||||
asm volatile (
|
asm volatile (
|
||||||
// Load constants.
|
"vmov.u16 q4, #0 \n" // accumulator
|
||||||
"vmov.u8 q4, #0 \n" // accumulator
|
|
||||||
|
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"vld1.8 {q0, q1}, [%0]! \n"
|
"vld1.8 {q0, q1}, [%0]! \n"
|
||||||
@ -38,13 +38,12 @@ uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count) {
|
|||||||
"vcnt.i8 q1, q1 \n"
|
"vcnt.i8 q1, q1 \n"
|
||||||
"subs %2, %2, #32 \n"
|
"subs %2, %2, #32 \n"
|
||||||
"vadd.u8 q0, q0, q1 \n" // 16 byte counts
|
"vadd.u8 q0, q0, q1 \n" // 16 byte counts
|
||||||
"vpaddl.u8 q0, q0 \n" // 8 shorts
|
"vpadal.u8 q4, q0 \n" // 8 shorts
|
||||||
"vpadal.u16 q4, q0 \n" // 4 ints
|
|
||||||
"bgt 1b \n"
|
"bgt 1b \n"
|
||||||
|
|
||||||
"vpadd.u32 d0, d8, d9 \n"
|
"vpaddl.u16 q0, q4 \n" // 4 ints
|
||||||
|
"vpadd.u32 d0, d0, d1 \n"
|
||||||
"vpadd.u32 d0, d0, d0 \n"
|
"vpadd.u32 d0, d0, d0 \n"
|
||||||
// Move distance to return register.
|
|
||||||
"vmov.32 %3, d0[0] \n"
|
"vmov.32 %3, d0[0] \n"
|
||||||
|
|
||||||
: "+r"(src_a),
|
: "+r"(src_a),
|
||||||
|
|||||||
@ -21,10 +21,11 @@ extern "C" {
|
|||||||
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
|
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
|
||||||
|
|
||||||
// 256 bits at a time
|
// 256 bits at a time
|
||||||
|
// uses short accumulator which restricts count to 131 KB
|
||||||
uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count) {
|
uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count) {
|
||||||
uint32 diff;
|
uint32 diff;
|
||||||
asm volatile (
|
asm volatile (
|
||||||
"movi v4.4s, #0 \n"
|
"movi v4.8h, #0 \n"
|
||||||
|
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"ld1 {v0.16b, v1.16b}, [%0], #32 \n"
|
"ld1 {v0.16b, v1.16b}, [%0], #32 \n"
|
||||||
@ -35,11 +36,10 @@ uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count) {
|
|||||||
"cnt v1.16b, v1.16b \n"
|
"cnt v1.16b, v1.16b \n"
|
||||||
"subs %w2, %w2, #32 \n"
|
"subs %w2, %w2, #32 \n"
|
||||||
"add v0.16b, v0.16b, v1.16b \n"
|
"add v0.16b, v0.16b, v1.16b \n"
|
||||||
"uaddlp v0.8h, v0.16b \n"
|
"uadalp v4.8h, v0.16b \n"
|
||||||
"uadalp v4.4s, v0.8h \n"
|
|
||||||
"b.gt 1b \n"
|
"b.gt 1b \n"
|
||||||
|
|
||||||
"addv s4, v4.4s \n"
|
"uaddlv s4, v4.8h \n"
|
||||||
"fmov %w3, s4 \n"
|
"fmov %w3, s4 \n"
|
||||||
: "+r"(src_a),
|
: "+r"(src_a),
|
||||||
"+r"(src_b),
|
"+r"(src_b),
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user