diff --git a/BUILD.gn b/BUILD.gn
index 52301d122..9badf08c8 100644
--- a/BUILD.gn
+++ b/BUILD.gn
@@ -160,9 +160,9 @@ static_library("libyuv_internal") {
   # To enable AVX2 or other cpu optimization, pass flag here
   if (!is_win) {
     cflags = [
-    # "-mpopcnt",
-    # "-mavx2",
-    # "-mfma",
+      # "-mpopcnt",
+      # "-mavx2",
+      # "-mfma",
       "-ffp-contract=fast",  # Enable fma vectorization for NEON.
     ]
   }
diff --git a/README.chromium b/README.chromium
index 192264554..9d05c559e 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1671
+Version: 1672
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/compare_row.h b/include/libyuv/compare_row.h
index 16d8c4bb3..4eccfccd0 100644
--- a/include/libyuv/compare_row.h
+++ b/include/libyuv/compare_row.h
@@ -70,7 +70,13 @@ extern "C" {
 #define HAS_SUMSQUAREERROR_AVX2
 #endif
 
-// The following are available for VGCC and clangcl 64 bit:
+// The following are available for GCC and clangcl 64 bit:
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
+#define HAS_HAMMINGDISTANCE_SSSE3
+#endif
+
+// The following are available for GCC and clangcl 64 bit:
 #if !defined(LIBYUV_DISABLE_X86) && defined(CLANG_HAS_AVX2) && \
     (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
 #define HAS_HAMMINGDISTANCE_AVX2
@@ -93,7 +99,7 @@ extern "C" {
 
 uint32 HammingDistance_C(const uint8* src_a, const uint8* src_b, int count);
 uint32 HammingDistance_X86(const uint8* src_a, const uint8* src_b, int count);
-uint32 HammingDistance_SSE2(const uint8* src_a, const uint8* src_b, int count);
+uint32 HammingDistance_SSSE3(const uint8* src_a, const uint8* src_b, int count);
 uint32 HammingDistance_AVX2(const uint8* src_a, const uint8* src_b, int count);
 uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count);
 uint32 HammingDistance_MSA(const uint8* src_a, const uint8* src_b, int count);
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 6d614557a..2314ef0a0 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 1671
+#define LIBYUV_VERSION 1672
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_
diff --git a/source/compare.cc b/source/compare.cc
index 9f4403d7e..63ec526e4 100644
--- a/source/compare.cc
+++ b/source/compare.cc
@@ -131,6 +131,11 @@ uint64 ComputeHammingDistance(const uint8* src_a,
     HammingDistance = HammingDistance_X86;
   }
 #endif
+#if defined(HAS_HAMMINGDISTANCE_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    HammingDistance = HammingDistance_SSSE3;
+  }
+#endif
 #if defined(HAS_HAMMINGDISTANCE_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
     HammingDistance = HammingDistance_AVX2;
diff --git a/source/compare_gcc.cc b/source/compare_gcc.cc
index 7432eb446..6b2b2d85e 100644
--- a/source/compare_gcc.cc
+++ b/source/compare_gcc.cc
@@ -35,15 +35,79 @@ uint32 HammingDistance_X86(const uint8* src_a, const uint8* src_b, int count) {
   return diff;
 }
 
-#ifdef HAS_HAMMINGDISTANCE_AVX2
-static uint32 kNibbleMask = 0x0f0f0f0fu;
+static vec8 kNibbleMask = {15, 15, 15, 15, 15, 15, 15, 15,
+                           15, 15, 15, 15, 15, 15, 15, 15};
 static vec8 kBitCount = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
 
+// Counts the bits that differ between src_a and src_b, 32 bytes per loop
+// iteration, using pshufb as a 16 entry nibble bit count table (kBitCount).
+// NOTE(review): the movdqa loads and pxor memory operands fault on unaligned
+// addresses, so this presumably expects 16 byte aligned src_a/src_b and a
+// count that is a multiple of 32 - confirm against callers.
+uint32 HammingDistance_SSSE3(const uint8* src_a,
+                             const uint8* src_b,
+                             int count) {
+  uint32 diff = 0u;
+
+  asm volatile(
+      "movdqa %4,%%xmm2 \n"
+      "movdqa %5,%%xmm3 \n"
+      "pxor %%xmm0,%%xmm0 \n"
+      "pxor %%xmm1,%%xmm1 \n"
+      "sub %0,%1 \n"
+
+      LABELALIGN
+      "1: \n"
+      "movdqa (%0),%%xmm4 \n"
+      "movdqa 0x10(%0), %%xmm5 \n"
+      "pxor (%0,%1), %%xmm4 \n"
+      "movdqa %%xmm4,%%xmm6 \n"
+      "pand %%xmm2,%%xmm6 \n"
+      "psrlw $0x4,%%xmm4 \n"
+      "movdqa %%xmm3,%%xmm7 \n"
+      "pshufb %%xmm6,%%xmm7 \n"
+      "pand %%xmm2,%%xmm4 \n"
+      "movdqa %%xmm3,%%xmm6 \n"
+      "pshufb %%xmm4,%%xmm6 \n"
+      "paddb %%xmm7,%%xmm6 \n"
+      "pxor 0x10(%0,%1),%%xmm5 \n"
+      "add $0x20,%0 \n"
+      "movdqa %%xmm5,%%xmm4 \n"
+      "pand %%xmm2,%%xmm5 \n"
+      "psrlw $0x4,%%xmm4 \n"
+      "movdqa %%xmm3,%%xmm7 \n"
+      "pshufb %%xmm5,%%xmm7 \n"
+      "pand %%xmm2,%%xmm4 \n"
+      "movdqa %%xmm3,%%xmm5 \n"
+      "pshufb %%xmm4,%%xmm5 \n"
+      "paddb %%xmm7,%%xmm5 \n"
+      "paddb %%xmm5,%%xmm6 \n"
+      "psadbw %%xmm1,%%xmm6 \n"
+      "paddd %%xmm6,%%xmm0 \n"
+      "sub $0x20,%2 \n"
+      "jg 1b \n"
+
+      "pshufd $0xaa,%%xmm0,%%xmm1 \n"
+      "paddd %%xmm1,%%xmm0 \n"
+      "movd %%xmm0, %3 \n"  // Legacy SSE2 movd; VEX vmovd would need AVX.
+      : "+r"(src_a),  // %0
+        "+r"(src_b),  // %1
+        "+r"(count),  // %2
+        "=r"(diff)    // %3
+      : "m"(kNibbleMask),  // %4
+        "m"(kBitCount)     // %5
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
+
+  return diff;
+}
+
+#ifdef HAS_HAMMINGDISTANCE_AVX2
 uint32 HammingDistance_AVX2(const uint8* src_a, const uint8* src_b, int count) {
   uint32 diff = 0u;
 
   asm volatile(
-      "vbroadcastss %4,%%ymm2 \n"
+      "vbroadcastf128 %4,%%ymm2 \n"
       "vbroadcastf128 %5,%%ymm3 \n"
       "vpxor %%ymm0,%%ymm0,%%ymm0 \n"
       "vpxor %%ymm1,%%ymm1,%%ymm1 \n"
@@ -83,7 +147,7 @@ uint32 HammingDistance_AVX2(const uint8* src_a, const uint8* src_b, int count) {
       : "+r"(src_a),  // %0
         "+r"(src_b),  // %1
         "+r"(count),  // %2
-        "=g"(diff)    // %3
+        "=r"(diff)    // %3
       : "m"(kNibbleMask),  // %4
         "m"(kBitCount)     // %5
       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index f7cc789ce..5616d8a5b 100644
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -636,10 +636,10 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
       "subs %w2, %w2, #32 \n"  // 32 processed per loop
       "stp q0, q1, [%1], #32 \n"
       "b.gt 1b \n"
-  : "+r"(src),   // %0
-    "+r"(dst),   // %1
-    "+r"(count)  // %2  // Output registers
-  :              // Input registers
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(count)  // %2  // Output registers
+      :              // Input registers
       : "cc", "memory", "v0", "v1"  // Clobber List
       );
 }
diff --git a/unit_test/compare_test.cc b/unit_test/compare_test.cc
index dabc61851..74e155008 100644
--- a/unit_test/compare_test.cc
+++ b/unit_test/compare_test.cc
@@ -234,7 +234,12 @@ TEST_F(LibYUVCompareTest, BenchmarkHammingDistance_Opt) {
     if (has_avx2) {
       h1 = HammingDistance_AVX2(src_a, src_b, kMaxWidth);
     } else {
-      h1 = HammingDistance_X86(src_a, src_b, kMaxWidth);
+      int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
+      if (has_ssse3) {
+        h1 = HammingDistance_SSSE3(src_a, src_b, kMaxWidth);
+      } else {
+        h1 = HammingDistance_X86(src_a, src_b, kMaxWidth);
+      }
     }
 #elif defined(HAS_HAMMINGDISTANCE_X86)
     h1 = HammingDistance_X86(src_a, src_b, kMaxWidth);