diff --git a/README.chromium b/README.chromium index c048819a3..61b74f5bc 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1655 +Version: 1656 License: BSD License File: LICENSE diff --git a/include/libyuv/compare.h b/include/libyuv/compare.h index 4deca97f6..a06eff206 100644 --- a/include/libyuv/compare.h +++ b/include/libyuv/compare.h @@ -22,6 +22,12 @@ extern "C" { LIBYUV_API uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed); +// Hamming Distance +LIBYUV_API +uint64 ComputeHammingDistance(const uint8* src_a, + const uint8* src_b, + int count); + // Scan an opaque argb image and return fourcc based on alpha offset. // Returns FOURCC_ARGB, FOURCC_BGRA, or 0 if unknown. LIBYUV_API diff --git a/include/libyuv/compare_row.h b/include/libyuv/compare_row.h index 8b04b2b07..3ec95593f 100644 --- a/include/libyuv/compare_row.h +++ b/include/libyuv/compare_row.h @@ -49,11 +49,16 @@ extern "C" { // The following are available for Visual C and GCC: #if !defined(LIBYUV_DISABLE_X86) && \ - (defined(__x86_64__) || (defined(__i386__) || defined(_M_IX86))) + (defined(__x86_64__) || defined(__i386__) || defined(_M_IX86)) #define HAS_HASHDJB2_SSE41 #define HAS_SUMSQUAREERROR_SSE2 #endif +// The following are available for GCC: +#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) +#define HAS_HAMMINGDISTANCE_X86 +#endif + // The following are available for Visual C and clangcl 32 bit: #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \ (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2)) @@ -67,6 +72,11 @@ extern "C" { #define HAS_SUMSQUAREERROR_NEON #endif +// The following are available for Neon 64 bit: +#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) +#define HAS_HAMMINGDISTANCE_NEON +#endif + uint32 HammingDistance_C(const uint8* src_a, const uint8* src_b, int count); uint32 HammingDistance_X86(const uint8* src_a, const uint8* src_b, int count); 
uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count); diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 6eacae219..0dc150ca1 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1655 +#define LIBYUV_VERSION 1656 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/compare.cc b/source/compare.cc index 1facd27b1..3f7f14751 100644 --- a/source/compare.cc +++ b/source/compare.cc @@ -110,6 +110,51 @@ uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height) { return fourcc; } +LIBYUV_API +uint64 ComputeHammingDistance(const uint8* src_a, + const uint8* src_b, + int count) { + const int kBlockSize = 65536; + int remainder = count & (kBlockSize - 1) & ~31; + uint64 diff = 0; + int i; + uint32 (*HammingDistance)(const uint8* src_a, const uint8* src_b, int count) = + HammingDistance_C; +#if defined(HAS_HAMMINGDISTANCE_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + HammingDistance = HammingDistance_NEON; + } +#endif +#if defined(HAS_HAMMINGDISTANCE_X86) + if (TestCpuFlag(kCpuHasX86)) { + HammingDistance = HammingDistance_X86; + } +#endif +#if defined(HAS_HAMMINGDISTANCE_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + HammingDistance = HammingDistance_AVX2; + } +#endif +#ifdef _OPENMP +#pragma omp parallel for reduction(+ : diff) +#endif + for (i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) { + diff += HammingDistance(src_a + i, src_b + i, kBlockSize); + } + src_a += count & ~(kBlockSize - 1); + src_b += count & ~(kBlockSize - 1); + if (remainder) { + diff += HammingDistance(src_a, src_b, remainder); + src_a += remainder; + src_b += remainder; + } + remainder = count & 31; + if (remainder) { + diff += HammingDistance_C(src_a, src_b, remainder); + } + return diff; +} + // TODO(fbarchard): Refactor into row function. 
// Portable Hamming distance: returns the number of bit positions at which the
// first |count| bytes of src_a and src_b differ.
//
// src_a, src_b: buffers holding at least |count| readable bytes each; no
//               particular alignment is required.
// count:        number of bytes to compare; a count <= 0 yields 0.
//
// Every byte is counted, including a trailing group of 1..3 bytes when count
// is not a multiple of 4.
uint32 HammingDistance_C(const uint8* src_a, const uint8* src_b, int count) {
  uint32 diff = 0u;
  int i;

  // Main loop: 4 bytes per iteration. The XOR word is assembled from single
  // bytes instead of dereferencing a casted uint32 pointer, so unaligned
  // buffers are read safely. Byte order within the word is irrelevant for a
  // population count.
  for (i = 0; i < count - 3; i += 4) {
    uint32 x = (uint32)(src_a[i] ^ src_b[i]) |
               ((uint32)(src_a[i + 1] ^ src_b[i + 1]) << 8) |
               ((uint32)(src_a[i + 2] ^ src_b[i + 2]) << 16) |
               ((uint32)(src_a[i + 3] ^ src_b[i + 3]) << 24);
    // Parallel bit count: sums of adjacent bits, then of 2-bit fields, then
    // of nibbles; the multiply adds the four byte sums into the top byte.
    uint32 u = x - ((x >> 1) & 0x55555555);
    u = ((u >> 2) & 0x33333333) + (u & 0x33333333);
    diff += (((u + (u >> 4)) & 0x0f0f0f0f) * 0x01010101) >> 24;
  }

  // Tail: the remaining 0..3 bytes, same bit-count scheme on a single byte.
  for (; i < count; ++i) {
    uint32 x = (uint32)(src_a[i] ^ src_b[i]);
    uint32 u = x - ((x >> 1) & 0x55);
    u = ((u >> 2) & 0x33) + (u & 0x33);
    diff += (u + (u >> 4)) & 0x0f;
  }
  return diff;
}
v3.16b, v2.16b \n" - "addv b4, v3.16b \n" - "add d5, d5, d4 \n" - "b.gt 1b \n" - - "fmov %w3, s5 \n" - : "+r"(src_a), - "+r"(src_b), - "+r"(count), - "=r"(diff) - : - : "cc", "v0", "v1", "v2", "v3", "v4", "v5"); - return diff; -} -#endif - -uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count) { - uint32 diff; - asm volatile ( - "movi d6, #0 \n" + "movi d4, #0 \n" "1: \n" MEMACCESS(0) @@ -65,19 +36,19 @@ uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count) { "eor v1.16b, v1.16b, v3.16b \n" "cnt v0.16b, v0.16b \n" "cnt v1.16b, v1.16b \n" - "addv b4, v0.16b \n" - "addv b5, v1.16b \n" - "add d6, d6, d4 \n" - "add d6, d6, d5 \n" + "uaddlv h0, v0.16b \n" + "uaddlv h1, v1.16b \n" + "add d4, d4, d0 \n" + "add d4, d4, d1 \n" "b.gt 1b \n" - "fmov %w3, s6 \n" + "fmov %w3, s4 \n" : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff) : - : "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6"); + : "cc", "v0", "v1", "v2", "v3", "v4"); return diff; } diff --git a/unit_test/compare_test.cc b/unit_test/compare_test.cc index 7af6524ec..f664bfd4c 100644 --- a/unit_test/compare_test.cc +++ b/unit_test/compare_test.cc @@ -275,6 +275,37 @@ TEST_F(LibYUVBaseTest, BenchmarkHammingDistance_C) { free_aligned_buffer_page_end(src_b); } +TEST_F(LibYUVBaseTest, BenchmarkHammingDistance) { + const int kMaxWidth = 4096 * 3; + align_buffer_page_end(src_a, kMaxWidth); + align_buffer_page_end(src_b, kMaxWidth); + memset(src_a, 0, kMaxWidth); + memset(src_b, 0, kMaxWidth); + + memcpy(src_a, "test0123test4567", 16); + memcpy(src_b, "tick0123tock4567", 16); + uint64 h1 = ComputeHammingDistance(src_a, src_b, 16); + EXPECT_EQ(16u, h1); + + // Test C vs OPT on random buffer + MemRandomize(src_a, kMaxWidth); + MemRandomize(src_b, kMaxWidth); + + uint32 h0 = HammingDistance_C(src_a, src_b, kMaxWidth); + + int count = + benchmark_iterations_ * + ((benchmark_width_ * benchmark_height_ + kMaxWidth - 1) / kMaxWidth); + for (int i = 0; i < count; ++i) { + h1 = 
ComputeHammingDistance(src_a, src_b, kMaxWidth); + } + + EXPECT_EQ(h0, h1); + + free_aligned_buffer_page_end(src_a); + free_aligned_buffer_page_end(src_b); +} + TEST_F(LibYUVBaseTest, BenchmarkSumSquareError_Opt) { const int kMaxWidth = 4096 * 3; align_buffer_page_end(src_a, kMaxWidth);