diff --git a/BUILD.gn b/BUILD.gn
index b0c43725d..52301d122 100644
--- a/BUILD.gn
+++ b/BUILD.gn
@@ -158,9 +158,13 @@ static_library("libyuv_internal") {
   }
 
   # To enable AVX2 or other cpu optimization, pass flag here
-  # cflags = [ "-mavx2", "-mpopcnt", "-mavx2", "-mfma" ]
   if (!is_win) {
-    cflags = [ "-ffp-contract=fast" ]  # Enable fma vectorization for NEON.
+    cflags = [
+      # "-mpopcnt",
+      # "-mavx2",
+      # "-mfma",
+      "-ffp-contract=fast",  # Enable fma vectorization for NEON.
+    ]
   }
 }
 if (libyuv_use_neon) {
diff --git a/include/libyuv/compare_row.h b/include/libyuv/compare_row.h
index 9316dc22b..16d8c4bb3 100644
--- a/include/libyuv/compare_row.h
+++ b/include/libyuv/compare_row.h
@@ -49,6 +49,7 @@ extern "C" {
 // #define DISABLE_CLANG_MSA 1
 #endif
 
+// The following are available for Visual C:
 #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
     (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2))
 #define HAS_HASHDJB2_AVX2
@@ -69,6 +70,12 @@ extern "C" {
 #define HAS_SUMSQUAREERROR_AVX2
 #endif
 
+// The following are available for GCC and clangcl 64 bit:
+#if !defined(LIBYUV_DISABLE_X86) && defined(CLANG_HAS_AVX2) && \
+    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
+#define HAS_HAMMINGDISTANCE_AVX2
+#endif
+
 // The following are available for Neon:
 #if !defined(LIBYUV_DISABLE_NEON) && \
     (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
@@ -86,6 +93,8 @@ extern "C" {
 
 uint32 HammingDistance_C(const uint8* src_a, const uint8* src_b, int count);
 uint32 HammingDistance_X86(const uint8* src_a, const uint8* src_b, int count);
+uint32 HammingDistance_SSE2(const uint8* src_a, const uint8* src_b, int count);
+uint32 HammingDistance_AVX2(const uint8* src_a, const uint8* src_b, int count);
 uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count);
 uint32 HammingDistance_MSA(const uint8* src_a, const uint8* src_b, int count);
 
diff --git a/source/compare.cc b/source/compare.cc
index 20afa0cef..9f4403d7e 100644
--- a/source/compare.cc
+++ b/source/compare.cc
@@ -115,7 +115,8 @@ uint64 ComputeHammingDistance(const uint8* src_a,
                               const uint8* src_b,
                               int count) {
   const int kBlockSize = 65536;
-  int remainder = count & (kBlockSize - 1) & ~31;
+  // SIMD for multiple of 64, and C for remainder
+  int remainder = count & (kBlockSize - 1) & ~63;
   uint64 diff = 0;
   int i;
   uint32 (*HammingDistance)(const uint8* src_a, const uint8* src_b, int count) =
diff --git a/source/compare_gcc.cc b/source/compare_gcc.cc
index 994fb10fd..7432eb446 100644
--- a/source/compare_gcc.cc
+++ b/source/compare_gcc.cc
@@ -35,6 +35,63 @@ uint32 HammingDistance_X86(const uint8* src_a, const uint8* src_b, int count) {
   return diff;
 }
 
+#ifdef HAS_HAMMINGDISTANCE_AVX2
+static uint32 kNibbleMask = 0x0f0f0f0fu;
+static vec8 kBitCount = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
+
+uint32 HammingDistance_AVX2(const uint8* src_a, const uint8* src_b, int count) {
+  uint32 diff = 0u;
+
+  asm volatile(
+      "vbroadcastss %4,%%ymm2                    \n"
+      "vbroadcastf128 %5,%%ymm3                  \n"
+      "vpxor      %%ymm0,%%ymm0,%%ymm0           \n"
+      "vpxor      %%ymm1,%%ymm1,%%ymm1           \n"
+      "sub        %0,%1                          \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqa    (%0),%%ymm4                    \n"
+      "vmovdqa    0x20(%0), %%ymm5               \n"
+      "vpxor      (%0,%1), %%ymm4, %%ymm4        \n"
+      "vpand      %%ymm2,%%ymm4,%%ymm6           \n"
+      "vpsrlw     $0x4,%%ymm4,%%ymm4             \n"
+      "vpshufb    %%ymm6,%%ymm3,%%ymm6           \n"
+      "vpand      %%ymm2,%%ymm4,%%ymm4           \n"
+      "vpshufb    %%ymm4,%%ymm3,%%ymm4           \n"
+      "vpaddb     %%ymm4,%%ymm6,%%ymm6           \n"
+      "vpxor      0x20(%0,%1),%%ymm5,%%ymm4      \n"
+      "add        $0x40,%0                       \n"
+      "vpand      %%ymm2,%%ymm4,%%ymm5           \n"
+      "vpsrlw     $0x4,%%ymm4,%%ymm4             \n"
+      "vpshufb    %%ymm5,%%ymm3,%%ymm5           \n"
+      "vpand      %%ymm2,%%ymm4,%%ymm4           \n"
+      "vpshufb    %%ymm4,%%ymm3,%%ymm4           \n"
+      "vpaddb     %%ymm5,%%ymm4,%%ymm4           \n"
+      "vpaddb     %%ymm6,%%ymm4,%%ymm4           \n"
+      "vpsadbw    %%ymm1,%%ymm4,%%ymm4           \n"
+      "vpaddd     %%ymm0,%%ymm4,%%ymm0           \n"
+      "sub        $0x40,%2                       \n"
+      "jg         1b                             \n"
+
+      "vpermq     $0xb1,%%ymm0,%%ymm1            \n"
+      "vpaddd     %%ymm1,%%ymm0,%%ymm0           \n"
+      "vpermq     $0xaa,%%ymm0,%%ymm1            \n"
+      "vpaddd     %%ymm1,%%ymm0,%%ymm0           \n"
+      "vmovd      %%xmm0, %3                     \n"
+      "vzeroupper                                \n"
+      : "+r"(src_a),       // %0
+        "+r"(src_b),       // %1
+        "+r"(count),       // %2
+        "=g"(diff)         // %3
+      : "m"(kNibbleMask),  // %4
+        "m"(kBitCount)     // %5
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+
+  return diff;
+}
+#endif  // HAS_HAMMINGDISTANCE_AVX2
+
 uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
   uint32 sse;
   asm volatile (
diff --git a/unit_test/compare_test.cc b/unit_test/compare_test.cc
index 149a9a13a..dabc61851 100644
--- a/unit_test/compare_test.cc
+++ b/unit_test/compare_test.cc
@@ -229,13 +229,19 @@ TEST_F(LibYUVCompareTest, BenchmarkHammingDistance_Opt) {
   for (int i = 0; i < count; ++i) {
 #if defined(HAS_HAMMINGDISTANCE_NEON)
     h1 = HammingDistance_NEON(src_a, src_b, kMaxWidth);
+#elif defined(HAS_HAMMINGDISTANCE_AVX2)
+    int has_avx2 = TestCpuFlag(kCpuHasAVX2);
+    if (has_avx2) {
+      h1 = HammingDistance_AVX2(src_a, src_b, kMaxWidth);
+    } else {
+      h1 = HammingDistance_X86(src_a, src_b, kMaxWidth);
+    }
 #elif defined(HAS_HAMMINGDISTANCE_X86)
     h1 = HammingDistance_X86(src_a, src_b, kMaxWidth);
 #else
     h1 = HammingDistance_C(src_a, src_b, kMaxWidth);
 #endif
   }
-
   EXPECT_EQ(h0, h1);
 
   free_aligned_buffer_page_end(src_a);
diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc
index 6ef7059f4..04591fbcf 100644
--- a/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
@@ -2878,13 +2878,13 @@ float TestCopySamples(int benchmark_width,
 
 TEST_F(LibYUVPlanarTest, TestCopySamples_C) {
   float diff = TestCopySamples(benchmark_width_, benchmark_height_,
-                              benchmark_iterations_, false);
+                               benchmark_iterations_, false);
   EXPECT_EQ(0, diff);
 }
 
 TEST_F(LibYUVPlanarTest, TestCopySamples_Opt) {
   float diff = TestCopySamples(benchmark_width_, benchmark_height_,
-                              benchmark_iterations_, true);
+                               benchmark_iterations_, true);
   EXPECT_EQ(0, diff);
 }
 
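
Note on the compare.cc change (commentary, not part of the patch): the mask widens from ~31 to ~63 because the AVX2 kernel consumes 64 bytes per loop iteration. Within each 64 KB block, the SIMD path is handed the largest multiple of 64 and HammingDistance_C picks up the rest. A small worked example of the masking, using assumed values:

  #include <assert.h>

  int main(void) {
    const int kBlockSize = 65536;
    int count = kBlockSize + 200;  // one whole block plus 200 tail bytes
    // Bytes past the last whole block, rounded down to a multiple of 64:
    int remainder = count & (kBlockSize - 1) & ~63;
    assert(remainder == 192);  // SIMD kernel takes 192 of the 200 tail bytes
    assert(((count & (kBlockSize - 1)) - remainder) == 8);  // C takes the last 8
    return 0;
  }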
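The asm added to compare_gcc.cc computes popcount(a ^ b) with the classic nibble-lookup method: each byte of the XOR is split into low and high 4-bit halves (vpand with kNibbleMask, vpsrlw by 4), each half indexes the 16-entry kBitCount table via vpshufb, and vpsadbw against zero sums the per-byte counts into 64-bit lanes. A minimal scalar sketch of the same idea (illustrative names, not from the patch):

  #include <stdint.h>

  // kBitCount4[n] is the number of set bits in the 4-bit value n,
  // matching the kBitCount table fed to vpshufb above.
  static const uint8_t kBitCount4[16] = {0, 1, 1, 2, 1, 2, 2, 3,
                                         1, 2, 2, 3, 2, 3, 3, 4};

  uint32_t HammingDistanceSketch(const uint8_t* src_a, const uint8_t* src_b,
                                 int count) {
    uint32_t diff = 0u;
    for (int i = 0; i < count; ++i) {
      uint8_t x = src_a[i] ^ src_b[i];  // bits that differ
      diff += kBitCount4[x & 0x0f];     // low nibble: vpand + vpshufb
      diff += kBitCount4[x >> 4];       // high nibble: vpsrlw + vpand + vpshufb
    }
    return diff;
  }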
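After the loop, ymm0 holds four 64-bit partial sums produced by vpsadbw; the two vpermq/vpaddd pairs fold those lanes together so the grand total lands in the low dword read out by vmovd. The same reduction rendered with AVX2 intrinsics, as a hedged sketch (function name is illustrative):

  #include <immintrin.h>
  #include <stdint.h>

  // Fold the four 64-bit partial sums in 'acc' (as left by vpsadbw against
  // zero) into one 32-bit total, mirroring the asm tail.
  static uint32_t ReduceSadAccumulator(__m256i acc) {
    // vpermq $0xb1: reorder 64-bit lanes to (1,0,3,2), then add pairwise.
    acc = _mm256_add_epi32(acc, _mm256_permute4x64_epi64(acc, 0xb1));
    // vpermq $0xaa: broadcast lane 2 (now holding lanes 2+3), then add.
    acc = _mm256_add_epi32(acc, _mm256_permute4x64_epi64(acc, 0xaa));
    // vmovd: the total is in the low dword.
    return (uint32_t)_mm_cvtsi128_si32(_mm256_castsi256_si128(acc));
  }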