diff --git a/README.chromium b/README.chromium index e1cde060d..1647a093b 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 791 +Version: 792 License: BSD License File: LICENSE diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 3b5f932d0..672514940 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 791 +#define LIBYUV_VERSION 792 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/compare.cc b/source/compare.cc index 1759a9336..78ef80a02 100644 --- a/source/compare.cc +++ b/source/compare.cc @@ -10,6 +10,8 @@ #include "libyuv/compare.h" +#include // printf + #include #include #ifdef _OPENMP @@ -34,9 +36,13 @@ uint32 HashDjb2_C(const uint8* src, int count, uint32 seed); (defined(_M_IX86) || \ (defined(__x86_64__) || (defined(__i386__) && !defined(__pic__)))) #define HAS_HASHDJB2_SSE41 - uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed); +#if _MSC_VER >= 1700 +#define HAS_HASHDJB2_AVX2 +uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed); +#endif + #endif // HAS_HASHDJB2_SSE41 // hash seed of 5381 recommended. @@ -48,6 +54,11 @@ uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) { HashDjb2_SSE = HashDjb2_SSE41; } #endif +#if defined(HAS_HASHDJB2_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + HashDjb2_SSE = HashDjb2_AVX2; + } +#endif const int kBlockSize = 1 << 15; // 32768; while (count >= static_cast(kBlockSize)) { diff --git a/source/compare_win.cc b/source/compare_win.cc index dd2661073..8bc3f7400 100644 --- a/source/compare_win.cc +++ b/source/compare_win.cc @@ -184,6 +184,46 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { ret } } + +// Visual C 2012 required for AVX2. 
+#if _MSC_VER >= 1700  // 1700 is Visual C++ 2012.
+// Folds `count` bytes starting at `src` into a running 32 bit hash that
+// starts at `seed`, and returns the updated hash in eax.
+//
+// Each loop pass consumes 16 bytes and computes, in 32 bit lanes:
+//   hash = hash * kHash16x33 + sum(src[i] * weight[i]), i = 0..15
+// where the weights are read from kHashMul0..kHashMul3 (defined elsewhere in
+// this file). Only lane 0 of xmm0 is meaningful; it is the value returned.
+//
+// The loop body always runs at least once and always reads 16 bytes, so
+// callers must pass count >= 16 and a multiple of 16, otherwise bytes past
+// `count` are read.
+//
+// The function is naked: there is no compiler generated prologue, so the
+// arguments are fetched directly from the stack relative to esp.
+//
+// Every SIMD instruction uses its VEX encoding. Alternating VEX and legacy
+// SSE encodings inside the loop can cost a state transition on each switch
+// when the caller left the upper ymm halves dirty.
+__declspec(naked) __declspec(align(16))
+uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
+  __asm {
+    mov        eax, [esp + 4]                     // src
+    mov        ecx, [esp + 8]                     // count
+    vmovd      xmm0, dword ptr [esp + 12]         // seed -> lane 0 of hash
+    vmovdqa    xmm6, xmmword ptr kHash16x33       // per-pass hash multiplier
+
+    align      16
+  wloop:
+    vpmovzxbd  xmm3, dword ptr [eax]              // src[0-3] as 4 dwords
+    vpmulld    xmm0, xmm0, xmm6                   // hash *= 33 ^ 16
+    vpmovzxbd  xmm4, dword ptr [eax + 4]          // src[4-7]
+    vpmulld    xmm3, xmm3, xmmword ptr kHashMul0
+    vpmovzxbd  xmm2, dword ptr [eax + 8]          // src[8-11]
+    vpmulld    xmm4, xmm4, xmmword ptr kHashMul1
+    vpmovzxbd  xmm1, dword ptr [eax + 12]         // src[12-15]
+    vpmulld    xmm2, xmm2, xmmword ptr kHashMul2
+    lea        eax, [eax + 16]                    // advance src by 16 bytes
+    vpmulld    xmm1, xmm1, xmmword ptr kHashMul3
+    vpaddd     xmm3, xmm3, xmm4                   // add 16 results
+    vpaddd     xmm1, xmm1, xmm2
+    vpaddd     xmm1, xmm1, xmm3
+    vpshufd    xmm2, xmm1, 0x0e                   // upper 2 dwords
+    vpaddd     xmm1, xmm1, xmm2
+    vpshufd    xmm2, xmm1, 0x01                   // dword 1 -> lane 0
+    vpaddd     xmm1, xmm1, xmm2                   // lane 0 = sum of 4 lanes
+    vpaddd     xmm0, xmm0, xmm1                   // hash += weighted bytes
+    sub        ecx, 16                            // sets flags for jg below
+    jg         wloop                              // loop while bytes remain
+
+    vmovd      eax, xmm0                          // return hash (lane 0)
+    vzeroupper                                    // leave upper ymm state clean
+    ret
+  }
+}
+#endif  // _MSC_VER >= 1700
+
 #endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
 
 #ifdef __cplusplus