Add HammingDistance_C, HammingDistance_X86 and HammingDistance_NEON.

Review notes on this copy of the patch:
 1. Line breaks and indentation were rebuilt from a whitespace-collapsed copy,
    so context lines may need `git apply --ignore-whitespace` to match.
 2. HammingDistance_C and HammingDistance_X86 no longer load through casted
    uint32/uint64 pointers, count the trailing bytes when count is not a
    multiple of the word size, and accumulate in a plain uint32.
 3. The reference C version and the Hakmem version are mutually exclusive
    (#if/#else), so defining ORIGINAL_C cannot produce two definitions.
 4. Hunk line counts were updated for the changed hunks; the index lines are
    those of the submitted patch.

diff --git a/BUILD.gn b/BUILD.gn
index f8a9dcc13..1971161ab 100644
--- a/BUILD.gn
+++ b/BUILD.gn
@@ -33,10 +33,10 @@ group("default") {
   if (libyuv_include_tests) {
     deps += [
       ":compare",
-      ":yuvconvert",
       ":cpuid",
       ":libyuv_unittest",
       ":psnr",
+      ":yuvconvert",
     ]
   }
 }
@@ -158,7 +158,8 @@ static_library("libyuv_internal") {
   }
 
   # To enable AVX2 or other cpu optimization, pass flag here
-  #  cflags = [ "-mavx2" ]
+  #  cflags = [ "-mavx2" ]
+  #  cflags = [ "-mpopcnt" ]
 }
 
 if (libyuv_use_neon) {
diff --git a/include/libyuv/compare_row.h b/include/libyuv/compare_row.h
index 7abc2d4ae..8b04b2b07 100644
--- a/include/libyuv/compare_row.h
+++ b/include/libyuv/compare_row.h
@@ -67,6 +67,10 @@ extern "C" {
 #define HAS_SUMSQUAREERROR_NEON
 #endif
 
+uint32 HammingDistance_C(const uint8* src_a, const uint8* src_b, int count);
+uint32 HammingDistance_X86(const uint8* src_a, const uint8* src_b, int count);
+uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count);
+
 uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count);
 uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count);
 uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count);
diff --git a/source/compare_common.cc b/source/compare_common.cc
index 42fc58935..bca095f33 100644
--- a/source/compare_common.cc
+++ b/source/compare_common.cc
@@ -17,6 +17,53 @@ namespace libyuv {
 extern "C" {
 #endif
 
+#if defined(ORIGINAL_C)
+// Reference implementation: tests every bit of each XORed byte pair.
+uint32 HammingDistance_C(const uint8* src_a, const uint8* src_b, int count) {
+  uint32 diff = 0u;
+
+  int i;
+  for (i = 0; i < count; ++i) {
+    int x = src_a[i] ^ src_b[i];
+    if (x & 1) ++diff;
+    if (x & 2) ++diff;
+    if (x & 4) ++diff;
+    if (x & 8) ++diff;
+    if (x & 16) ++diff;
+    if (x & 32) ++diff;
+    if (x & 64) ++diff;
+    if (x & 128) ++diff;
+  }
+  return diff;
+}
+#else
+// Hakmem method for hamming distance.
+// Returns the number of bits that differ between the first count bytes of
+// src_a and src_b.
+uint32 HammingDistance_C(const uint8* src_a, const uint8* src_b, int count) {
+  uint32 diff = 0u;
+
+  int i;
+  // 4 bytes per step. The XORed bytes are packed into a word with shifts, so
+  // the buffers need no alignment; bit counting ignores the byte order.
+  for (i = 0; i < count - 3; i += 4) {
+    uint32 x = (uint32)(src_a[i] ^ src_b[i]) |
+               ((uint32)(src_a[i + 1] ^ src_b[i + 1]) << 8) |
+               ((uint32)(src_a[i + 2] ^ src_b[i + 2]) << 16) |
+               ((uint32)(src_a[i + 3] ^ src_b[i + 3]) << 24);
+    uint32 u = x - ((x >> 1) & 033333333333) - ((x >> 2) & 011111111111);
+    diff += ((u + (u >> 3)) & 030707070707) % 63;
+  }
+  // Remaining 0 to 3 bytes when count is not a multiple of 4.
+  for (; i < count; ++i) {
+    uint32 x = (uint32)(src_a[i] ^ src_b[i]);
+    uint32 u = x - ((x >> 1) & 033333333333) - ((x >> 2) & 011111111111);
+    diff += ((u + (u >> 3)) & 030707070707) % 63;
+  }
+  return diff;
+}
+#endif
+
 uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count) {
   uint32 sse = 0u;
   int i;
diff --git a/source/compare_gcc.cc b/source/compare_gcc.cc
index 64522aaa3..cab1fe9e9 100644
--- a/source/compare_gcc.cc
+++ b/source/compare_gcc.cc
@@ -22,6 +22,27 @@ extern "C" {
 #if !defined(LIBYUV_DISABLE_X86) && \
     (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
 
+// Returns the number of bits that differ between the first count bytes of
+// src_a and src_b, using the popcount builtin on 8 bytes at a time.
+uint32 HammingDistance_X86(const uint8* src_a, const uint8* src_b, int count) {
+  uint32 diff = 0u;
+
+  int i;
+  for (i = 0; i < count - 7; i += 8) {
+    uint64 a;
+    uint64 b;
+    // Copy instead of casting the pointers: the buffers may be unaligned.
+    __builtin_memcpy(&a, src_a + i, 8);
+    __builtin_memcpy(&b, src_b + i, 8);
+    diff += __builtin_popcountll(a ^ b);
+  }
+  // Remaining 0 to 7 bytes when count is not a multiple of 8.
+  for (; i < count; ++i) {
+    diff += __builtin_popcount(src_a[i] ^ src_b[i]);
+  }
+  return diff;
+}
+
 uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
   uint32 sse;
   asm volatile (
diff --git a/source/compare_neon64.cc b/source/compare_neon64.cc
index f9c7df98c..f7d88d37a 100644
--- a/source/compare_neon64.cc
+++ b/source/compare_neon64.cc
@@ -20,6 +20,69 @@ extern "C" {
 
 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
 
+#if 0
+uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count) {
+  volatile uint32 diff;
+  asm volatile (
+    "eor        v4.16b, v4.16b, v4.16b         \n"
+    "eor        v5.16b, v5.16b, v5.16b         \n"
+
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v0.16b}, [%0], #16            \n"
+    MEMACCESS(1)
+    "ld1        {v1.16b}, [%1], #16            \n"
+    "subs       %w2, %w2, #16                  \n"
+    "eor        v2.16b, v0.16b, v1.16b         \n"
+    "cnt        v3.16b, v2.16b                 \n"
+    "addv       b4, v3.16b                     \n"
+    "add        d5, d5, d4                     \n"
+    "b.gt       1b                             \n"
+
+    "fmov       %w3, s5                        \n"
+    : "+r"(src_a),
+      "+r"(src_b),
+      "+r"(count),
+      "=r"(diff)
+    :
+    : "cc", "v0", "v1", "v2", "v3", "v4", "v5");
+  return diff;
+}
+#endif
+
+// Each loop iteration consumes 32 bytes and the loop repeats while the
+// remaining count is positive, so count should be a positive multiple of 32.
+uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count) {
+  volatile uint32 diff;
+  asm volatile (
+    "movi       d6, #0                         \n"
+
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v0.16b, v1.16b}, [%0], #32    \n"
+    MEMACCESS(1)
+    "ld1        {v2.16b, v3.16b}, [%1], #32    \n"
+    "subs       %w2, %w2, #32                  \n"
+    "eor        v0.16b, v0.16b, v2.16b         \n"
+    "eor        v1.16b, v1.16b, v3.16b         \n"
+    "cnt        v0.16b, v0.16b                 \n"
+    "cnt        v1.16b, v1.16b                 \n"
+    "addv       b4, v0.16b                     \n"
+    "addv       b5, v1.16b                     \n"
+    "add        d6, d6, d4                     \n"
+    "add        d6, d6, d5                     \n"
+    "b.gt       1b                             \n"
+
+    "fmov       %w3, s6                        \n"
+    : "+r"(src_a),
+      "+r"(src_b),
+      "+r"(count),
+      "=r"(diff)
+    :
+    : "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
+  return diff;
+}
+
 uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
   volatile uint32 sse;
   asm volatile (
diff --git a/unit_test/compare_test.cc b/unit_test/compare_test.cc
index 13f747051..7af6524ec 100644
--- a/unit_test/compare_test.cc
+++ b/unit_test/compare_test.cc
@@ -15,6 +15,7 @@
 #include "../unit_test/unit_test.h"
 #include "libyuv/basic_types.h"
 #include "libyuv/compare.h"
+#include "libyuv/compare_row.h" /* For HammingDistance_C */
 #include "libyuv/cpu_id.h"
 #include "libyuv/video_common.h"
 
@@ -202,6 +203,80 @@ TEST_F(LibYUVBaseTest, BenchmarkARGBDetect_Unaligned) {
   free_aligned_buffer_page_end(src_a);
 }
 
+
+TEST_F(LibYUVBaseTest, BenchmarkHammingDistance_Opt) {
+  const int kMaxWidth = 4096 * 3;
+  align_buffer_page_end(src_a, kMaxWidth);
+  align_buffer_page_end(src_b, kMaxWidth);
+  memset(src_a, 0, kMaxWidth);
+  memset(src_b, 0, kMaxWidth);
+
+  // Test known value
+  memcpy(src_a, "test0123test4567", 16);
+  memcpy(src_b, "tick0123tock4567", 16);
+  uint32 h1 = HammingDistance_C(src_a, src_b, 16);
+  EXPECT_EQ(16u, h1);
+
+  // Test C vs OPT on random buffer
+  MemRandomize(src_a, kMaxWidth);
+  MemRandomize(src_b, kMaxWidth);
+
+  uint32 h0 = HammingDistance_C(src_a, src_b, kMaxWidth);
+
+  int count =
+      benchmark_iterations_ *
+      ((benchmark_width_ * benchmark_height_ + kMaxWidth - 1) / kMaxWidth);
+  for (int i = 0; i < count; ++i) {
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+    h1 = HammingDistance_NEON(src_a, src_b, kMaxWidth);
+#elif !defined(LIBYUV_DISABLE_X86) && \
+    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
+    h1 = HammingDistance_X86(src_a, src_b, kMaxWidth);
+#else
+    h1 = HammingDistance_C(src_a, src_b, kMaxWidth);
+#endif
+  }
+
+  EXPECT_EQ(h0, h1);
+
+  free_aligned_buffer_page_end(src_a);
+  free_aligned_buffer_page_end(src_b);
+}
+
+TEST_F(LibYUVBaseTest, BenchmarkHammingDistance_C) {
+  const int kMaxWidth = 4096 * 3;
+  align_buffer_page_end(src_a, kMaxWidth);
+  align_buffer_page_end(src_b, kMaxWidth);
+  memset(src_a, 0, kMaxWidth);
+  memset(src_b, 0, kMaxWidth);
+
+  // Test known value
+  memcpy(src_a, "test0123test4567", 16);
+  memcpy(src_b, "tick0123tock4567", 16);
+  uint32 h1 = HammingDistance_C(src_a, src_b, 16);
+  EXPECT_EQ(16u, h1);
+  // A count that is not a multiple of 4 must still include every byte.
+  EXPECT_EQ(3u, HammingDistance_C(src_a, src_b, 3));
+
+  // Test C vs OPT on random buffer
+  MemRandomize(src_a, kMaxWidth);
+  MemRandomize(src_b, kMaxWidth);
+
+  uint32 h0 = HammingDistance_C(src_a, src_b, kMaxWidth);
+
+  int count =
+      benchmark_iterations_ *
+      ((benchmark_width_ * benchmark_height_ + kMaxWidth - 1) / kMaxWidth);
+  for (int i = 0; i < count; ++i) {
+    h1 = HammingDistance_C(src_a, src_b, kMaxWidth);
+  }
+
+  EXPECT_EQ(h0, h1);
+
+  free_aligned_buffer_page_end(src_a);
+  free_aligned_buffer_page_end(src_b);
+}
+
 TEST_F(LibYUVBaseTest, BenchmarkSumSquareError_Opt) {
   const int kMaxWidth = 4096 * 3;
   align_buffer_page_end(src_a, kMaxWidth);