Rename the popcnt-based HammingDistance_X86 to HammingDistance_SSE42.

* compare_row.h: HAS_HAMMINGDISTANCE_X86 becomes HAS_HAMMINGDISTANCE_SSE42 and
  the prototype is renamed to match.
* compare.cc: ComputeHammingDistance selects the SSE42 kernel when
  TestCpuFlag(kCpuHasSSE42) is set. The check runs after SSSE3 and before
  AVX2, so a later match overrides an earlier one.
* compare_gcc.cc: the C loop built on __builtin_popcountll is replaced by
  inline asm around popcnt. The x86-64 body consumes 0x20 bytes per iteration
  into four accumulators; the 32-bit body consumes 0x10 bytes per iteration.
  Both loops test count only at the bottom (sub/jg), so the first iteration
  always runs.
* compare_win.cc: in the visible hunk only the function signature changes.
* compare_test.cc: the tests dispatch AVX2, then SSE42, then SSSE3, then C
  using TestCpuFlag, and TestHammingDistance_Opt hoists the buffer size into
  kMaxWidth.

diff --git a/include/libyuv/compare_row.h b/include/libyuv/compare_row.h
index 4eccfccd0..9b7013a24 100644
--- a/include/libyuv/compare_row.h
+++ b/include/libyuv/compare_row.h
@@ -60,7 +60,7 @@ extern "C" {
     (defined(__x86_64__) || defined(__i386__) || defined(_M_IX86))
 #define HAS_HASHDJB2_SSE41
 #define HAS_SUMSQUAREERROR_SSE2
-#define HAS_HAMMINGDISTANCE_X86
+#define HAS_HAMMINGDISTANCE_SSE42
 #endif
 
 // The following are available for Visual C and clangcl 32 bit:
@@ -98,7 +98,7 @@ extern "C" {
 #endif
 
 uint32 HammingDistance_C(const uint8* src_a, const uint8* src_b, int count);
-uint32 HammingDistance_X86(const uint8* src_a, const uint8* src_b, int count);
+uint32 HammingDistance_SSE42(const uint8* src_a, const uint8* src_b, int count);
 uint32 HammingDistance_SSSE3(const uint8* src_a, const uint8* src_b, int count);
 uint32 HammingDistance_AVX2(const uint8* src_a, const uint8* src_b, int count);
 uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count);
diff --git a/source/compare.cc b/source/compare.cc
index bfb8c080d..8c379b59c 100644
--- a/source/compare.cc
+++ b/source/compare.cc
@@ -130,16 +130,16 @@ uint64 ComputeHammingDistance(const uint8* src_a,
     HammingDistance = HammingDistance_NEON;
   }
 #endif
-#if defined(HAS_HAMMINGDISTANCE_X86)
-  if (TestCpuFlag(kCpuHasX86)) {
-    HammingDistance = HammingDistance_X86;
-  }
-#endif
 #if defined(HAS_HAMMINGDISTANCE_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     HammingDistance = HammingDistance_SSSE3;
   }
 #endif
+#if defined(HAS_HAMMINGDISTANCE_SSE42)
+  if (TestCpuFlag(kCpuHasSSE42)) {
+    HammingDistance = HammingDistance_SSE42;
+  }
+#endif
 #if defined(HAS_HAMMINGDISTANCE_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
     HammingDistance = HammingDistance_AVX2;
diff --git a/source/compare_gcc.cc b/source/compare_gcc.cc
index 6b2b2d85e..8e673291b 100644
--- a/source/compare_gcc.cc
+++ b/source/compare_gcc.cc
@@ -22,18 +22,92 @@ extern "C" {
 #if !defined(LIBYUV_DISABLE_X86) && \
     (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
 
-uint32 HammingDistance_X86(const uint8* src_a, const uint8* src_b, int count) {
+#if defined(__x86_64__)
+uint32 HammingDistance_SSE42(const uint8* src_a,
+                             const uint8* src_b,
+                             int count) {
+  uint64 diff = 0u;
+
+  asm volatile(
+      "xor        %%r15,%%r15                    \n"
+      "xor        %%r14,%%r14                    \n"
+      "xor        %%r13,%%r13                    \n"
+      "xor        %%r12,%%r12                    \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "mov        (%0),%%rax                     \n"
+      "mov        0x8(%0),%%rdx                  \n"
+      "xor        (%1),%%rax                     \n"
+      "xor        0x8(%1),%%rdx                  \n"
+      "popcnt     %%rax,%%rax                    \n"
+      "popcnt     %%rdx,%%rdx                    \n"
+      "mov        0x10(%0),%%rcx                 \n"
+      "mov        0x18(%0),%%rsi                 \n"
+      "xor        0x10(%1),%%rcx                 \n"
+      "xor        0x18(%1),%%rsi                 \n"
+      "popcnt     %%rcx,%%rcx                    \n"
+      "popcnt     %%rsi,%%rsi                    \n"
+      "add        $0x20,%0                       \n"
+      "add        $0x20,%1                       \n"
+      "add        %%rax,%%r15                    \n"
+      "add        %%rdx,%%r14                    \n"
+      "add        %%rcx,%%r13                    \n"
+      "add        %%rsi,%%r12                    \n"
+      "sub        $0x20,%2                       \n"
+      "jg         1b                             \n"
+
+      "add        %%r15, %%r14                   \n"
+      "add        %%r13, %%r12                   \n"
+      "add        %%r14, %%r12                   \n"
+      "mov        %%r12, %3                      \n"
+      : "+r"(src_a),  // %0
+        "+r"(src_b),  // %1
+        "+r"(count),  // %2
+        "=r"(diff)    // %3
+      :
+      : "memory", "cc", "rax", "rdx", "rcx", "rsi", "r12", "r13", "r14", "r15");
+
+  return static_cast<uint32>(diff);
+}
+#else
+uint32 HammingDistance_SSE42(const uint8* src_a,
+                             const uint8* src_b,
+                             int count) {
   uint32 diff = 0u;
 
-  int i;
-  for (i = 0; i < count - 7; i += 8) {
-    uint64 x = *((uint64*)src_a) ^ *((uint64*)src_b);
-    src_a += 8;
-    src_b += 8;
-    diff += __builtin_popcountll(x);
-  }
+  asm volatile(LABELALIGN
+               "1:                                        \n"
+               "mov        (%0),%%eax                     \n"
+               "mov        0x4(%0),%%edx                  \n"
+               "xor        (%1),%%eax                     \n"
+               "xor        0x4(%1),%%edx                  \n"
+               "popcnt     %%eax,%%eax                    \n"
+               "add        %%eax,%3                       \n"
+               "popcnt     %%edx,%%edx                    \n"
+               "add        %%edx,%3                       \n"
+               "mov        0x8(%0),%%eax                  \n"
+               "mov        0xc(%0),%%edx                  \n"
+               "xor        0x8(%1),%%eax                  \n"
+               "xor        0xc(%1),%%edx                  \n"
+               "popcnt     %%eax,%%eax                    \n"
+               "add        %%eax,%3                       \n"
+               "popcnt     %%edx,%%edx                    \n"
+               "add        %%edx,%3                       \n"
+               "add        $0x10,%0                       \n"
+               "add        $0x10,%1                       \n"
+               "sub        $0x10,%2                       \n"
+               "jg         1b                             \n"
+               : "+r"(src_a),  // %0
+                 "+r"(src_b),  // %1
+                 "+r"(count),  // %2
+                 "+r"(diff)    // %3
+               :
+               : "memory", "cc", "eax", "edx");
+
   return diff;
 }
+#endif
 
 static vec8 kNibbleMask = {15, 15, 15, 15, 15, 15, 15, 15,
                            15, 15, 15, 15, 15, 15, 15, 15};
diff --git a/source/compare_win.cc b/source/compare_win.cc
index 0cf3f989c..bcd6a88eb 100644
--- a/source/compare_win.cc
+++ b/source/compare_win.cc
@@ -25,7 +25,9 @@ extern "C" {
 // This module is for 32 bit Visual C x86 and clangcl
 #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
 
-uint32 HammingDistance_X86(const uint8* src_a, const uint8* src_b, int count) {
+uint32 HammingDistance_SSE42(const uint8* src_a,
+                             const uint8* src_b,
+                             int count) {
   uint32 diff = 0u;
 
   int i;
diff --git a/unit_test/compare_test.cc b/unit_test/compare_test.cc
index fbcebd8b9..7258809a8 100644
--- a/unit_test/compare_test.cc
+++ b/unit_test/compare_test.cc
@@ -234,18 +234,29 @@ TEST_F(LibYUVCompareTest, BenchmarkHammingDistance_Opt) {
     if (has_avx2) {
       h1 = HammingDistance_AVX2(src_a, src_b, kMaxWidth);
     } else {
-      int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
-      if (has_ssse3) {
-        h1 = HammingDistance_SSSE3(src_a, src_b, kMaxWidth);
+      int has_sse42 = TestCpuFlag(kCpuHasSSE42);
+      if (has_sse42) {
+        h1 = HammingDistance_SSE42(src_a, src_b, kMaxWidth);
       } else {
-        h1 = HammingDistance_X86(src_a, src_b, kMaxWidth);
+        int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
+        if (has_ssse3) {
+          h1 = HammingDistance_SSSE3(src_a, src_b, kMaxWidth);
+        } else {
+          h1 = HammingDistance_C(src_a, src_b, kMaxWidth);
+        }
       }
     }
-#elif defined(HAS_HAMMINGDISTANCE_X86)
-    h1 = HammingDistance_X86(src_a, src_b, kMaxWidth);
-#else
+#elif defined(HAS_HAMMINGDISTANCE_SSE42)
+    int has_sse42 = TestCpuFlag(kCpuHasSSE42);
+    if (has_sse42) {
+      h1 = HammingDistance_SSE42(src_a, src_b, kMaxWidth);
+    } else {
+      h1 = HammingDistance_C(src_a, src_b, kMaxWidth);
+    }
+#else
     h1 = HammingDistance_C(src_a, src_b, kMaxWidth);
 #endif
+
   }
   EXPECT_EQ(h0, h1);
 
@@ -328,59 +339,63 @@ static const int kMaxOptCount = (1 << (32 - 3)) - 64;  // 536870848
 
 TEST_F(LibYUVCompareTest, TestHammingDistance_Opt) {
   uint32 h1 = 0;
-  align_buffer_page_end(src_a, benchmark_width_ * benchmark_height_);
-  align_buffer_page_end(src_b, benchmark_width_ * benchmark_height_);
-  memset(src_a, 255u, benchmark_width_ * benchmark_height_);
-  memset(src_b, 0, benchmark_width_ * benchmark_height_);
+  const int kMaxWidth = benchmark_width_ * benchmark_height_;
+  align_buffer_page_end(src_a, kMaxWidth);
+  align_buffer_page_end(src_b, kMaxWidth);
+  memset(src_a, 255u, kMaxWidth);
+  memset(src_b, 0u, kMaxWidth);
 
-  uint64 h0 = ComputeHammingDistance(src_a, src_b,
-                                     benchmark_width_ * benchmark_height_);
-  EXPECT_EQ(benchmark_width_ * benchmark_height_ * 8ULL, h0);
+  uint64 h0 = ComputeHammingDistance(src_a, src_b, kMaxWidth);
+  EXPECT_EQ(kMaxWidth * 8ULL, h0);
 
   for (int i = 0; i < benchmark_iterations_; ++i) {
 #if defined(HAS_HAMMINGDISTANCE_NEON)
-    h1 = HammingDistance_NEON(src_a, src_b,
-                              benchmark_width_ * benchmark_height_);
+    h1 = HammingDistance_NEON(src_a, src_b, kMaxWidth);
 #elif defined(HAS_HAMMINGDISTANCE_AVX2)
     int has_avx2 = TestCpuFlag(kCpuHasAVX2);
     if (has_avx2) {
-      h1 = HammingDistance_AVX2(src_a, src_b,
-                                benchmark_width_ * benchmark_height_);
+      h1 = HammingDistance_AVX2(src_a, src_b, kMaxWidth);
     } else {
-      int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
-      if (has_ssse3) {
-        h1 = HammingDistance_SSSE3(src_a, src_b,
-                                   benchmark_width_ * benchmark_height_);
+      int has_sse42 = TestCpuFlag(kCpuHasSSE42);
+      if (has_sse42) {
+        h1 = HammingDistance_SSE42(src_a, src_b, kMaxWidth);
       } else {
-        h1 = HammingDistance_X86(src_a, src_b,
-                                 benchmark_width_ * benchmark_height_);
+        int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
+        if (has_ssse3) {
+          h1 = HammingDistance_SSSE3(src_a, src_b, kMaxWidth);
+        } else {
+          h1 = HammingDistance_C(src_a, src_b, kMaxWidth);
+        }
       }
     }
-#elif defined(HAS_HAMMINGDISTANCE_X86)
-    h1 =
-        HammingDistance_X86(src_a, src_b, benchmark_width_ * benchmark_height_);
-#else
-    h1 = HammingDistance_C(src_a, src_b, benchmark_width_ * benchmark_height_);
+#elif defined(HAS_HAMMINGDISTANCE_SSE42)
+    int has_sse42 = TestCpuFlag(kCpuHasSSE42);
+    if (has_sse42) {
+      h1 = HammingDistance_SSE42(src_a, src_b, kMaxWidth);
+    } else {
+      h1 = HammingDistance_C(src_a, src_b, kMaxWidth);
+    }
+#else
+    h1 = HammingDistance_C(src_a, src_b, kMaxWidth);
 #endif
   }
+
   // A large count will cause the low level to potentially overflow so the
   // result can not be expected to be correct.
   // TODO(fbarchard): Consider expecting the low 16 bits to match.
-  if ((benchmark_width_ * benchmark_height_) <= kMaxOptCount) {
-    EXPECT_EQ(benchmark_width_ * benchmark_height_ * 8U, h1);
+  if (kMaxWidth <= kMaxOptCount) {
+    EXPECT_EQ(kMaxWidth * 8U, h1);
   } else {
-    if (benchmark_width_ * benchmark_height_ * 8ULL !=
-        static_cast<uint64>(h1)) {
+    if (kMaxWidth * 8ULL != static_cast<uint64>(h1)) {
       printf(
           "warning - HammingDistance_Opt %u does not match %llu "
           "but length of %u is longer than guaranteed.\n",
-          h1, benchmark_width_ * benchmark_height_ * 8ULL,
-          benchmark_width_ * benchmark_height_);
+          h1, kMaxWidth * 8ULL, kMaxWidth);
     } else {
       printf(
           "warning - HammingDistance_Opt %u matches but length of %u "
           "is longer than guaranteed.\n",
-          h1, benchmark_width_ * benchmark_height_);
+          h1, kMaxWidth);
     }
   }
 