mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-07 09:16:48 +08:00
Optimize Hamming Distance C code to do 64 bits at a time.
BUG=libyuv:701
TEST=LibYUVBaseTest.BenchmarkHammingDistance_C
R=wangcheng@google.com
Change-Id: I243003b098bea8ef3809298bbec349ed52a43d8c
Reviewed-on: https://chromium-review.googlesource.com/499487
Reviewed-by: Cheng Wang <wangcheng@google.com>
This commit is contained in:
parent
bbbf30eecd
commit
e0615c0e69
@@ -1,6 +1,6 @@
|
||||
Name: libyuv
|
||||
URL: http://code.google.com/p/libyuv/
|
||||
Version: 1655
|
||||
Version: 1656
|
||||
License: BSD
|
||||
License File: LICENSE
|
||||
|
||||
|
||||
@@ -22,6 +22,12 @@ extern "C" {
|
||||
LIBYUV_API
|
||||
uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed);
|
||||
|
||||
// Hamming Distance: number of bit positions that differ between the
// count-byte buffers src_a and src_b, returned as a 64 bit total.
|
||||
LIBYUV_API
|
||||
uint64 ComputeHammingDistance(const uint8* src_a,
|
||||
const uint8* src_b,
|
||||
int count);
|
||||
|
||||
// Scan an opaque argb image and return fourcc based on alpha offset.
|
||||
// Returns FOURCC_ARGB, FOURCC_BGRA, or 0 if unknown.
|
||||
LIBYUV_API
|
||||
|
||||
@@ -49,11 +49,16 @@ extern "C" {
|
||||
|
||||
// The following are available for Visual C and GCC:
|
||||
#if !defined(LIBYUV_DISABLE_X86) && \
|
||||
(defined(__x86_64__) || (defined(__i386__) || defined(_M_IX86)))
|
||||
(defined(__x86_64__) || defined(__i386__) || defined(_M_IX86))
|
||||
#define HAS_HASHDJB2_SSE41
|
||||
#define HAS_SUMSQUAREERROR_SSE2
|
||||
#endif
|
||||
|
||||
// The following are available for GCC:
|
||||
#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
|
||||
#define HAS_HAMMINGDISTANCE_X86
|
||||
#endif
|
||||
|
||||
// The following are available for Visual C and clangcl 32 bit:
|
||||
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
|
||||
(defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2))
|
||||
@@ -67,6 +72,11 @@ extern "C" {
|
||||
#define HAS_SUMSQUAREERROR_NEON
|
||||
#endif
|
||||
|
||||
// The following are available for Neon 64 bit:
|
||||
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
|
||||
#define HAS_HAMMINGDISTANCE_NEON
|
||||
#endif
|
||||
|
||||
uint32 HammingDistance_C(const uint8* src_a, const uint8* src_b, int count);
|
||||
uint32 HammingDistance_X86(const uint8* src_a, const uint8* src_b, int count);
|
||||
uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count);
|
||||
|
||||
@@ -11,6 +11,6 @@
|
||||
#ifndef INCLUDE_LIBYUV_VERSION_H_
|
||||
#define INCLUDE_LIBYUV_VERSION_H_
|
||||
|
||||
#define LIBYUV_VERSION 1655
|
||||
#define LIBYUV_VERSION 1656
|
||||
|
||||
#endif // INCLUDE_LIBYUV_VERSION_H_
|
||||
|
||||
@@ -110,6 +110,51 @@ uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height) {
|
||||
return fourcc;
|
||||
}
|
||||
|
||||
// Computes the Hamming distance (count of differing bits) between src_a and
// src_b over count bytes.
//
// The buffers are walked in three stages:
//   1. full 65536-byte blocks, each handed to the selected row kernel;
//   2. the bytes after the last full block, rounded down to a multiple of 32,
//      handed to the same kernel;
//   3. the final count & 31 bytes, always handed to HammingDistance_C.
// Each kernel call returns a uint32; the results are accumulated in a uint64.
//
// NOTE(review): the 4-byte-stride HammingDistance_C variant in this change
// loops while i < count - 3, so if that variant is the one compiled in, a tail
// of count & 3 bytes would not be counted - confirm.
LIBYUV_API
|
||||
uint64 ComputeHammingDistance(const uint8* src_a,
|
||||
const uint8* src_b,
|
||||
int count) {
|
||||
// Bytes per kernel call in stage 1: 65536 bytes is 524288 bits, which fits
// in the kernel's uint32 return value.
const int kBlockSize = 65536;
|
||||
// Stage 2 length: bytes past the last full block, rounded down to a multiple
// of 32.
int remainder = count & (kBlockSize - 1) & ~31;
|
||||
uint64 diff = 0;
|
||||
int i;
|
||||
// Kernel dispatch: start with the C version; each later check that passes
// overrides the earlier choice.
uint32 (*HammingDistance)(const uint8* src_a, const uint8* src_b, int count) =
|
||||
HammingDistance_C;
|
||||
#if defined(HAS_HAMMINGDISTANCE_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
HammingDistance = HammingDistance_NEON;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_HAMMINGDISTANCE_X86)
|
||||
if (TestCpuFlag(kCpuHasX86)) {
|
||||
HammingDistance = HammingDistance_X86;
|
||||
}
|
||||
#endif
|
||||
// NOTE(review): neither a HAS_HAMMINGDISTANCE_AVX2 define nor a
// HammingDistance_AVX2 prototype appears in the visible hunks - confirm both
// exist before this branch is enabled.
#if defined(HAS_HAMMINGDISTANCE_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
HammingDistance = HammingDistance_AVX2;
|
||||
}
|
||||
#endif
|
||||
// When built with OpenMP, the block loop is parallelized and the per-thread
// partial sums are combined into diff by the + reduction.
#ifdef _OPENMP
|
||||
#pragma omp parallel for reduction(+ : diff)
|
||||
#endif
|
||||
// Stage 1: one kernel call per full kBlockSize block.
for (i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) {
|
||||
diff += HammingDistance(src_a + i, src_b + i, kBlockSize);
|
||||
}
|
||||
// Skip both pointers past the full blocks handled above.
src_a += count & ~(kBlockSize - 1);
|
||||
src_b += count & ~(kBlockSize - 1);
|
||||
// Stage 2: the multiple-of-32 part of the last partial block.
if (remainder) {
|
||||
diff += HammingDistance(src_a, src_b, remainder);
|
||||
src_a += remainder;
|
||||
src_b += remainder;
|
||||
}
|
||||
// Stage 3: the last 0..31 bytes always use the C kernel (presumably because
// the optimized kernels expect multiples of 32 bytes - confirm).
remainder = count & 31;
|
||||
if (remainder) {
|
||||
diff += HammingDistance_C(src_a, src_b, remainder);
|
||||
}
|
||||
return diff;
|
||||
}
|
||||
|
||||
// TODO(fbarchard): Refactor into row function.
|
||||
LIBYUV_API
|
||||
uint64 ComputeSumSquareError(const uint8* src_a,
|
||||
|
||||
@@ -17,21 +17,29 @@ namespace libyuv {
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#if ORIGINAL_C
|
||||
#if ORIGINAL_OPT
|
||||
uint32 HammingDistance_C(const uint8* src_a, const uint8* src_b, int count) {
|
||||
uint32 diff = 0u;
|
||||
|
||||
int i;
|
||||
for (i = 0; i < count; ++i) {
|
||||
int x = src_a[i] ^ src_b[i];
|
||||
if (x & 1) ++diff;
|
||||
if (x & 2) ++diff;
|
||||
if (x & 4) ++diff;
|
||||
if (x & 8) ++diff;
|
||||
if (x & 16) ++diff;
|
||||
if (x & 32) ++diff;
|
||||
if (x & 64) ++diff;
|
||||
if (x & 128) ++diff;
|
||||
if (x & 1)
|
||||
++diff;
|
||||
if (x & 2)
|
||||
++diff;
|
||||
if (x & 4)
|
||||
++diff;
|
||||
if (x & 8)
|
||||
++diff;
|
||||
if (x & 16)
|
||||
++diff;
|
||||
if (x & 32)
|
||||
++diff;
|
||||
if (x & 64)
|
||||
++diff;
|
||||
if (x & 128)
|
||||
++diff;
|
||||
}
|
||||
return diff;
|
||||
}
|
||||
@@ -44,10 +52,11 @@ uint32 HammingDistance_C(const uint8* src_a, const uint8* src_b, int count) {
|
||||
int i;
|
||||
for (i = 0; i < count - 3; i += 4) {
|
||||
uint32 x = *((uint32*)src_a) ^ *((uint32*)src_b);
|
||||
uint32 u = x - ((x >> 1) & 0x55555555);
|
||||
u = ((u >> 2) & 0x33333333) + (u & 0x33333333);
|
||||
diff += ((((u + (u >> 4)) & 0x0f0f0f0f) * 0x01010101) >> 24);
|
||||
src_a += 4;
|
||||
src_b += 4;
|
||||
uint32 u = x - ((x >> 1) & 033333333333) - ((x >> 2) & 011111111111);
|
||||
diff += ((u + (u >> 3)) & 030707070707) % 63;
|
||||
}
|
||||
return diff;
|
||||
}
|
||||
|
||||
@@ -20,40 +20,11 @@ extern "C" {
|
||||
|
||||
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
|
||||
|
||||
#if 0
|
||||
// 256 bits at a time
|
||||
uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count) {
|
||||
uint32 diff;
|
||||
asm volatile (
|
||||
"eor v4.16b, v4.16b, v4.16b \n"
|
||||
"eor v5.16b, v5.16b, v5.16b \n"
|
||||
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld1 {v0.16b}, [%0], #16 \n"
|
||||
MEMACCESS(1)
|
||||
"ld1 {v1.16b}, [%1], #16 \n"
|
||||
"subs %w2, %w2, #16 \n"
|
||||
"eor v2.16b, v0.16b, v1.16b \n"
|
||||
"cnt v3.16b, v2.16b \n"
|
||||
"addv b4, v3.16b \n"
|
||||
"add d5, d5, d4 \n"
|
||||
"b.gt 1b \n"
|
||||
|
||||
"fmov %w3, s5 \n"
|
||||
: "+r"(src_a),
|
||||
"+r"(src_b),
|
||||
"+r"(count),
|
||||
"=r"(diff)
|
||||
:
|
||||
: "cc", "v0", "v1", "v2", "v3", "v4", "v5");
|
||||
return diff;
|
||||
}
|
||||
#endif
|
||||
|
||||
uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count) {
|
||||
uint32 diff;
|
||||
asm volatile (
|
||||
"movi d6, #0 \n"
|
||||
"movi d4, #0 \n"
|
||||
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
@@ -65,19 +36,19 @@ uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count) {
|
||||
"eor v1.16b, v1.16b, v3.16b \n"
|
||||
"cnt v0.16b, v0.16b \n"
|
||||
"cnt v1.16b, v1.16b \n"
|
||||
"addv b4, v0.16b \n"
|
||||
"addv b5, v1.16b \n"
|
||||
"add d6, d6, d4 \n"
|
||||
"add d6, d6, d5 \n"
|
||||
"uaddlv h0, v0.16b \n"
|
||||
"uaddlv h1, v1.16b \n"
|
||||
"add d4, d4, d0 \n"
|
||||
"add d4, d4, d1 \n"
|
||||
"b.gt 1b \n"
|
||||
|
||||
"fmov %w3, s6 \n"
|
||||
"fmov %w3, s4 \n"
|
||||
: "+r"(src_a),
|
||||
"+r"(src_b),
|
||||
"+r"(count),
|
||||
"=r"(diff)
|
||||
:
|
||||
: "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
|
||||
: "cc", "v0", "v1", "v2", "v3", "v4");
|
||||
return diff;
|
||||
}
|
||||
|
||||
|
||||
@@ -275,6 +275,37 @@ TEST_F(LibYUVBaseTest, BenchmarkHammingDistance_C) {
|
||||
free_aligned_buffer_page_end(src_b);
|
||||
}
|
||||
|
||||
// Checks ComputeHammingDistance against a known answer and against the
// HammingDistance_C kernel on random data, repeating the call so the test
// doubles as a benchmark.
TEST_F(LibYUVBaseTest, BenchmarkHammingDistance) {
|
||||
// Size in bytes of each of the two test buffers.
const int kMaxWidth = 4096 * 3;
|
||||
align_buffer_page_end(src_a, kMaxWidth);
|
||||
align_buffer_page_end(src_b, kMaxWidth);
|
||||
memset(src_a, 0, kMaxWidth);
|
||||
memset(src_b, 0, kMaxWidth);
|
||||
|
||||
// Known-answer check: "test" vs "tick" and "test" vs "tock" each differ in
// 8 bits, and the digit runs are identical, so 16 bits differ in total.
memcpy(src_a, "test0123test4567", 16);
|
||||
memcpy(src_b, "tick0123tock4567", 16);
|
||||
uint64 h1 = ComputeHammingDistance(src_a, src_b, 16);
|
||||
EXPECT_EQ(16u, h1);
|
||||
|
||||
// Test C vs OPT on random buffer
|
||||
MemRandomize(src_a, kMaxWidth);
|
||||
MemRandomize(src_b, kMaxWidth);
|
||||
|
||||
// Reference result from the C kernel over the whole buffer.
uint32 h0 = HammingDistance_C(src_a, src_b, kMaxWidth);
|
||||
|
||||
// Repeat count: benchmark_iterations_ times the number of kMaxWidth-sized
// chunks (rounded up) in a benchmark_width_ x benchmark_height_ area.
int count =
|
||||
benchmark_iterations_ *
|
||||
((benchmark_width_ * benchmark_height_ + kMaxWidth - 1) / kMaxWidth);
|
||||
for (int i = 0; i < count; ++i) {
|
||||
h1 = ComputeHammingDistance(src_a, src_b, kMaxWidth);
|
||||
}
|
||||
|
||||
// The dispatched implementation must agree with the C reference.
EXPECT_EQ(h0, h1);
|
||||
|
||||
free_aligned_buffer_page_end(src_a);
|
||||
free_aligned_buffer_page_end(src_b);
|
||||
}
|
||||
|
||||
TEST_F(LibYUVBaseTest, BenchmarkSumSquareError_Opt) {
|
||||
const int kMaxWidth = 4096 * 3;
|
||||
align_buffer_page_end(src_a, kMaxWidth);
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user