Hamming distance (count of differing bits) between 2 memory blocks

BUG=libyuv:701
TEST=built and disassembled for aarch64
R=kjellander@chromium.org

Change-Id: I7712b1c7934e5dfb55fda1fa7c8405c32d6964ce
Reviewed-on: https://chromium-review.googlesource.com/495327
Reviewed-by: Henrik Kjellander <kjellander@chromium.org>
Reviewed-by: Cheng Wang <wangcheng@google.com>
This commit is contained in:
Frank Barchard 2017-05-05 16:08:30 -07:00
parent 945ea1b746
commit 2136e349da
6 changed files with 189 additions and 2 deletions

View File

@ -33,10 +33,10 @@ group("default") {
if (libyuv_include_tests) {
deps += [
":compare",
":yuvconvert",
":cpuid",
":libyuv_unittest",
":psnr",
":yuvconvert",
]
}
}
@ -158,7 +158,8 @@ static_library("libyuv_internal") {
}
# To enable AVX2 or other cpu optimization, pass flag here
# cflags = [ "-mavx2" ]
# cflags = [ "-mavx2" ]
# cflags = [ "-mpopcnt" ]
}
if (libyuv_use_neon) {

View File

@ -67,6 +67,10 @@ extern "C" {
#define HAS_SUMSQUAREERROR_NEON
#endif
// Hamming distance row functions: return the number of bit positions in which
// the buffers src_a and src_b differ over count bytes.
// _C is the portable implementation; _X86 and _NEON are the
// architecture-specific variants.
// NOTE(review): the variants consume their input in fixed-size groups of
// bytes; confirm with each implementation how a count that is not a multiple
// of its group size is handled before calling with arbitrary lengths.
uint32 HammingDistance_C(const uint8* src_a, const uint8* src_b, int count);
uint32 HammingDistance_X86(const uint8* src_a, const uint8* src_b, int count);
uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count);
uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count);
uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count);
uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count);

View File

@ -17,6 +17,41 @@ namespace libyuv {
extern "C" {
#endif
#if ORIGINAL_C
// Reference Hamming distance: XOR each pair of bytes and test every one of
// the 8 bit positions of the result individually.
// Returns the total number of differing bits over the first count bytes.
uint32 HammingDistance_C(const uint8* src_a, const uint8* src_b, int count) {
  volatile uint32 bits_set = 0u;
  int pos = 0;
  while (pos < count) {
    const int delta = src_a[pos] ^ src_b[pos];
    int mask;
    // Walk the bit positions from least to most significant (1, 2, ... 128).
    for (mask = 1; mask < 256; mask <<= 1) {
      if (delta & mask) {
        ++bits_set;
      }
    }
    ++pos;
  }
  return bits_set;
}
#endif
// Population count of a 32-bit value using the HAKMEM item 169 method:
// form 3-bit field counts with two masked subtractions, fold adjacent fields
// into 6-bit sums, then add all the sums together with a modulo by 63.
static uint32 HammingPopCount32_C(uint32 x) {
  uint32 u = x - ((x >> 1) & 033333333333) - ((x >> 2) & 011111111111);
  return ((u + (u >> 3)) & 030707070707) % 63;
}

// Hakmem method for hamming distance.
// Returns the number of bit positions in which src_a and src_b differ over
// the first count bytes. A count of zero or less returns 0.
//  - src_a, src_b: buffers of at least count bytes; no alignment required.
//  - count: number of bytes to compare; any value, not only multiples of 4.
uint32 HammingDistance_C(const uint8* src_a, const uint8* src_b, int count) {
  uint32 diff = 0u;
  int i;
  // Whole 32-bit groups. The word is assembled from individual bytes instead
  // of dereferencing a casted uint32*, so unaligned buffers are safe and no
  // aliasing rule is violated. A bit count does not depend on byte order.
  for (i = 0; i < count - 3; i += 4) {
    uint32 x = (uint32)(src_a[i] ^ src_b[i]) |
               ((uint32)(src_a[i + 1] ^ src_b[i + 1]) << 8) |
               ((uint32)(src_a[i + 2] ^ src_b[i + 2]) << 16) |
               ((uint32)(src_a[i + 3] ^ src_b[i + 3]) << 24);
    diff += HammingPopCount32_C(x);
  }
  // Remaining 1 to 3 bytes, which the word loop cannot cover.
  for (; i < count; ++i) {
    diff += HammingPopCount32_C((uint32)(src_a[i] ^ src_b[i]));
  }
  return diff;
}
uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count) {
uint32 sse = 0u;
int i;

View File

@ -22,6 +22,19 @@ extern "C" {
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
// Hamming distance using the compiler population-count builtins.
// Returns the number of bit positions in which src_a and src_b differ over
// the first count bytes. A count of zero or less returns 0.
//  - src_a, src_b: buffers of at least count bytes; no alignment required.
//  - count: number of bytes to compare; any value, not only multiples of 8.
uint32 HammingDistance_X86(const uint8* src_a, const uint8* src_b, int count) {
  uint32 diff = 0u;
  int i;
  // Whole 64-bit groups. The builtin memcpy gives an unaligned, aliasing-safe
  // load; compilers lower a fixed 8-byte copy to a single move.
  for (i = 0; i < count - 7; i += 8) {
    uint64 a;
    uint64 b;
    __builtin_memcpy(&a, src_a + i, sizeof(a));
    __builtin_memcpy(&b, src_b + i, sizeof(b));
    diff += (uint32)__builtin_popcountll(a ^ b);
  }
  // Remaining 1 to 7 bytes, which the 64-bit loop cannot cover.
  for (; i < count; ++i) {
    diff += (uint32)__builtin_popcount((unsigned int)(src_a[i] ^ src_b[i]));
  }
  return diff;
}
}
uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
uint32 sse;
asm volatile (

View File

@ -20,6 +20,67 @@ extern "C" {
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#if 0
// Hamming distance for aarch64, 16 bytes per loop iteration.
// Returns the number of differing bits between src_a and src_b.
// The loop body runs before count is tested and repeats while the decremented
// count is still greater than zero, so at least 16 bytes are always read and
// the input is consumed in whole 16-byte groups.
// NOTE(review): the asm reads memory through src_a/src_b but lists no
// "memory" clobber and no memory input operands; confirm this is intended.
uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count) {
volatile uint32 diff;
asm volatile (
"eor v4.16b, v4.16b, v4.16b \n"  // v4 = 0 (per-iteration sum)
"eor v5.16b, v5.16b, v5.16b \n"  // v5 = 0 (running total)
"1: \n"
MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n"  // load 16 bytes of src_a, advance pointer
MEMACCESS(1)
"ld1 {v1.16b}, [%1], #16 \n"  // load 16 bytes of src_b, advance pointer
"subs %w2, %w2, #16 \n"  // count -= 16, sets flags for b.gt
"eor v2.16b, v0.16b, v1.16b \n"  // differing bits
"cnt v3.16b, v2.16b \n"  // set-bit count of each byte
"addv b4, v3.16b \n"  // sum the 16 byte counts
"add d5, d5, d4 \n"  // accumulate into running total
"b.gt 1b \n"
"fmov %w3, s5 \n"  // low 32 bits of the total -> diff
: "+r"(src_a),
"+r"(src_b),
"+r"(count),
"=r"(diff)
:
: "cc", "v0", "v1", "v2", "v3", "v4", "v5");
return diff;
}
#endif
// Hamming distance for aarch64, 32 bytes per loop iteration.
// Returns the number of differing bits between src_a and src_b.
// The loop body runs before count is tested and repeats while the decremented
// count is still greater than zero, so at least 32 bytes are always read and
// the input is consumed in whole 32-byte groups.
// NOTE(review): the asm reads memory through src_a/src_b but lists no
// "memory" clobber and no memory input operands; confirm this is intended.
uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count) {
volatile uint32 diff;
asm volatile (
"movi d6, #0 \n"  // d6 = 0 (running total)
"1: \n"
MEMACCESS(0)
"ld1 {v0.16b, v1.16b}, [%0], #32 \n"  // load 32 bytes of src_a, advance
MEMACCESS(1)
"ld1 {v2.16b, v3.16b}, [%1], #32 \n"  // load 32 bytes of src_b, advance
"subs %w2, %w2, #32 \n"  // count -= 32, sets flags for b.gt
"eor v0.16b, v0.16b, v2.16b \n"  // differing bits, first 16 bytes
"eor v1.16b, v1.16b, v3.16b \n"  // differing bits, second 16 bytes
"cnt v0.16b, v0.16b \n"  // set-bit count of each byte
"cnt v1.16b, v1.16b \n"
"addv b4, v0.16b \n"  // sum the 16 byte counts of each half
"addv b5, v1.16b \n"
"add d6, d6, d4 \n"  // accumulate both halves into the total
"add d6, d6, d5 \n"
"b.gt 1b \n"
"fmov %w3, s6 \n"  // low 32 bits of the total -> diff
: "+r"(src_a),
"+r"(src_b),
"+r"(count),
"=r"(diff)
:
: "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
return diff;
}
uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
volatile uint32 sse;
asm volatile (

View File

@ -15,6 +15,7 @@
#include "../unit_test/unit_test.h"
#include "libyuv/basic_types.h"
#include "libyuv/compare.h"
#include "libyuv/compare_row.h" /* For HammingDistance_C */
#include "libyuv/cpu_id.h"
#include "libyuv/video_common.h"
@ -202,6 +203,78 @@ TEST_F(LibYUVBaseTest, BenchmarkARGBDetect_Unaligned) {
free_aligned_buffer_page_end(src_a);
}
// Runs the architecture-specific HammingDistance row function (NEON on
// aarch64, X86 on x86 with GCC-style compilers, otherwise the C version)
// repeatedly and checks that its result equals HammingDistance_C.
TEST_F(LibYUVBaseTest, BenchmarkHammingDistance_Opt) {
  const int kMaxWidth = 4096 * 3;
  align_buffer_page_end(src_a, kMaxWidth);
  align_buffer_page_end(src_b, kMaxWidth);
  memset(src_a, 0, kMaxWidth);
  memset(src_b, 0, kMaxWidth);

  // Sanity check on a short input with a known answer.
  memcpy(src_a, "test0123test4567", 16);
  memcpy(src_b, "tick0123tock4567", 16);
  uint32 actual = HammingDistance_C(src_a, src_b, 16);
  EXPECT_EQ(16u, actual);

  // Reference value from the C version on random data.
  MemRandomize(src_a, kMaxWidth);
  MemRandomize(src_b, kMaxWidth);
  uint32 expected = HammingDistance_C(src_a, src_b, kMaxWidth);

  // Scale the repeat count so the bytes processed match the benchmark frame.
  int repeats =
      benchmark_iterations_ *
      ((benchmark_width_ * benchmark_height_ + kMaxWidth - 1) / kMaxWidth);
  for (int pass = 0; pass < repeats; ++pass) {
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
    actual = HammingDistance_NEON(src_a, src_b, kMaxWidth);
#elif !defined(LIBYUV_DISABLE_X86) && \
    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
    actual = HammingDistance_X86(src_a, src_b, kMaxWidth);
#else
    actual = HammingDistance_C(src_a, src_b, kMaxWidth);
#endif
  }

  EXPECT_EQ(expected, actual);

  free_aligned_buffer_page_end(src_a);
  free_aligned_buffer_page_end(src_b);
}
// Runs HammingDistance_C repeatedly as the baseline for the _Opt benchmark and
// checks that repeated calls keep returning the same value.
TEST_F(LibYUVBaseTest, BenchmarkHammingDistance_C) {
  const int kMaxWidth = 4096 * 3;
  align_buffer_page_end(src_a, kMaxWidth);
  align_buffer_page_end(src_b, kMaxWidth);
  memset(src_a, 0, kMaxWidth);
  memset(src_b, 0, kMaxWidth);

  // Sanity check on a short input with a known answer.
  memcpy(src_a, "test0123test4567", 16);
  memcpy(src_b, "tick0123tock4567", 16);
  uint32 actual = HammingDistance_C(src_a, src_b, 16);
  EXPECT_EQ(16u, actual);

  // First result on random data; every later call must reproduce it.
  MemRandomize(src_a, kMaxWidth);
  MemRandomize(src_b, kMaxWidth);
  uint32 expected = HammingDistance_C(src_a, src_b, kMaxWidth);

  // Scale the repeat count so the bytes processed match the benchmark frame.
  int repeats =
      benchmark_iterations_ *
      ((benchmark_width_ * benchmark_height_ + kMaxWidth - 1) / kMaxWidth);
  for (int pass = 0; pass < repeats; ++pass) {
    actual = HammingDistance_C(src_a, src_b, kMaxWidth);
  }

  EXPECT_EQ(expected, actual);

  free_aligned_buffer_page_end(src_a);
  free_aligned_buffer_page_end(src_b);
}
TEST_F(LibYUVBaseTest, BenchmarkSumSquareError_Opt) {
const int kMaxWidth = 4096 * 3;
align_buffer_page_end(src_a, kMaxWidth);