diff --git a/README.chromium b/README.chromium index 6a7da8db4..2ce0a625a 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 846 +Version: 847 License: BSD License File: LICENSE diff --git a/include/libyuv/version.h b/include/libyuv/version.h index ad6b5253d..19fe8dbd3 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 846 +#define LIBYUV_VERSION 847 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/row_posix.cc b/source/row_posix.cc index 2d7b63eac..7122f48f1 100644 --- a/source/row_posix.cc +++ b/source/row_posix.cc @@ -5090,19 +5090,20 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, int width, int area, uint8* dst, int count) { asm volatile ( - "movd %5,%%xmm4 \n" - "cvtdq2ps %%xmm4,%%xmm4 \n" - "rcpss %%xmm4,%%xmm4 \n" + "movd %5,%%xmm5 \n" + "cvtdq2ps %%xmm5,%%xmm5 \n" + "rcpss %%xmm5,%%xmm4 \n" "pshufd $0x0,%%xmm4,%%xmm4 \n" "sub $0x4,%3 \n" "jl 49f \n" "cmpl $0x80,%5 \n" "ja 40f \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrld $0x1f,%%xmm5 \n" - "pslld $0x10,%%xmm5 \n" - "cvtdq2ps %%xmm5,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrld $0x10,%%xmm6 \n" + "cvtdq2ps %%xmm6,%%xmm6 \n" + "addps %%xmm6,%%xmm5 \n" "mulps %%xmm4,%%xmm5 \n" "cvtps2dq %%xmm5,%%xmm5 \n" "packssdw %%xmm5,%%xmm5 \n" @@ -5222,7 +5223,7 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, , "r14" #endif #if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" #endif ); } diff --git a/source/row_win.cc b/source/row_win.cc index 0b8258b03..429f907b7 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -5763,11 +5763,11 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, mov eax, topleft // eax topleft mov esi, botleft // esi botleft mov edx, width - movd xmm4, area + movd xmm5, area mov edi, dst mov ecx, count - cvtdq2ps xmm4, xmm4 - rcpss xmm4, xmm4 // 1.0f / area + cvtdq2ps xmm5, xmm5 + rcpss xmm4, xmm5 // 1.0f / area pshufd xmm4, xmm4, 0 sub ecx, 4 jl l4b @@ -5775,13 +5775,14 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, cmp area, 128 // 128 pixels will not overflow 15 bits. ja l4 - pcmpeqb xmm5, xmm5 // constant of 65536.0 - psrld xmm5, 31 - pslld xmm5, 16 - cvtdq2ps xmm5, xmm5 - mulps xmm5, xmm4 // 65536.0 * 1 / area + pshufd xmm5, xmm5, 0 // area + pcmpeqb xmm6, xmm6 // constant of 65536.0 - 1 = 65535.0 + psrld xmm6, 16 + cvtdq2ps xmm6, xmm6 + addps xmm5, xmm6 // (65536.0 + area - 1) + mulps xmm5, xmm4 // (65536.0 + area - 1) * 1 / area cvtps2dq xmm5, xmm5 // 0.16 fixed point - packssdw xmm5, xmm5 + packssdw xmm5, xmm5 // 16 bit shorts // 4 pixel loop small blocks. align 4