diff --git a/README.chromium b/README.chromium index 2263bf9c1..5beb92e5d 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 572 +Version: 573 License: BSD License File: LICENSE diff --git a/include/libyuv/version.h b/include/libyuv/version.h index cf4b72a5b..f5f4ad4c6 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 572 +#define LIBYUV_VERSION 573 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/compare.cc b/source/compare.cc index 2dbf311b5..b829eb035 100644 --- a/source/compare.cc +++ b/source/compare.cc @@ -102,7 +102,9 @@ uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b, } #endif #if defined(HAS_SUMSQUAREERROR_AVX2) + bool clear = false; if (TestCpuFlag(kCpuHasAVX2)) { + clear = true; // Note only used for multiples of 32 so count is not checked. SumSquareError = SumSquareError_AVX2; } @@ -130,6 +132,12 @@ uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b, if (remainder) { sse += SumSquareError_C(src_a, src_b, remainder); } + +#if defined(HAS_SUMSQUAREERROR_AVX2) + if (clear) { + __asm vzeroupper; + } +#endif return sse; } @@ -157,7 +165,9 @@ uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a, } #endif #if defined(HAS_SUMSQUAREERROR_AVX2) + bool clear = false; if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) { + clear = true; SumSquareError = SumSquareError_AVX2; } #endif @@ -168,6 +178,11 @@ uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a, src_b += stride_b; } +#if defined(HAS_SUMSQUAREERROR_AVX2) + if (clear) { + __asm vzeroupper; + } +#endif return sse; } diff --git a/source/compare_win.cc b/source/compare_win.cc index 1a4ad1985..720152fd4 100644 --- a/source/compare_win.cc +++ b/source/compare_win.cc @@ -47,9 +47,9 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) { paddd xmm0, xmm2 jg wloop - pshufd xmm1, xmm0, 0EEh + pshufd xmm1, xmm0, 0xee paddd xmm0, xmm1 - pshufd xmm1, xmm0, 01h + pshufd xmm1, xmm0, 0x01 paddd xmm0, xmm1 movd eax, xmm0 ret @@ -67,7 +67,7 @@ uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) { mov edx, [esp + 8] // src_b mov ecx, [esp + 12] // count vpxor ymm0, ymm0, ymm0 // sum - vpxor ymm5, ymm5, ymm5 // for unpack. + vpxor ymm5, ymm5, ymm5 // constant 0 for unpck sub edx, eax align 16 @@ -92,9 +92,8 @@ uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) { vpshufd ymm1, ymm0, 0x01 // 1 + 0 both lanes. vpaddd ymm0, ymm0, ymm1 vpermq ymm1, ymm0, 0x02 // high + low lane. - vpaddd ymm4, ymm0, ymm1 - vzeroupper // TODO(fbarchard): Remove. - movd eax, xmm4 + vpaddd ymm0, ymm0, ymm1 + vmovd eax, xmm0 ret } } @@ -173,14 +172,14 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { sub ecx, 16 paddd xmm1, xmm3 - pshufd xmm2, xmm1, 14 // upper 2 dwords + pshufd xmm2, xmm1, 0x0e // upper 2 dwords paddd xmm1, xmm2 - pshufd xmm2, xmm1, 1 + pshufd xmm2, xmm1, 0x01 paddd xmm1, xmm2 paddd xmm0, xmm1 jg wloop - movd eax, xmm0 // return hash + movd eax, xmm0 // return hash ret } }