diff --git a/README.chromium b/README.chromium index cf227896d..766b50a13 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 781 +Version: 782 License: BSD License File: LICENSE diff --git a/include/libyuv/version.h b/include/libyuv/version.h index d5dfc9b85..0f24bd848 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 781 +#define LIBYUV_VERSION 782 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/planar_functions.cc b/source/planar_functions.cc index 823dc2ca5..24f03dfa3 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -2052,7 +2052,7 @@ int ARGBPolynomial(const uint8* src_argb, int src_stride_argb, uint8* dst_argb, const float* poly, int width) = ARGBPolynomialRow_C; #if defined(HAS_ARGBPOLYNOMIALROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 2)) { ARGBPolynomialRow = ARGBPolynomialRow_SSE2; } #endif diff --git a/source/row_win.cc b/source/row_win.cc index a5ac8abd9..3e1aabd1b 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -6774,42 +6774,53 @@ void ARGBPolynomialRow_SSE2(const uint8* src_argb, uint8* dst_argb, const float* poly, int width) { __asm { - mov eax, [esp + 12] /* poly */ - movdqu xmm4, [eax] - movdqu xmm5, [eax + 16] - movdqu xmm6, [eax + 32] - movdqu xmm7, [eax + 48] - - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_argb */ - mov ecx, [esp + 16] /* width */ + push esi + mov eax, [esp + 4 + 4] /* src_argb */ + mov edx, [esp + 4 + 8] /* dst_argb */ + mov esi, [esp + 4 + 12] /* poly */ + mov ecx, [esp + 4 + 16] /* width */ pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints. align 16 convertloop: -// (slow) vpmovzxbd xmm0, dword ptr [eax] // BGRA pixel - movd xmm0, [eax] // BGRA - lea eax, [eax + 4] +// (slow) pmovzxbd xmm0, dword ptr [eax] // BGRA pixel + movq xmm0, qword ptr [eax] // BGRABGRA + lea eax, [eax + 8] punpcklbw xmm0, xmm3 - punpcklwd xmm0, xmm3 + movdqa xmm4, xmm0 + punpcklwd xmm0, xmm3 // pixel 0 + punpckhwd xmm4, xmm3 // pixel 1 cvtdq2ps xmm0, xmm0 // 4 floats + cvtdq2ps xmm4, xmm4 movdqa xmm1, xmm0 // X - mulps xmm0, xmm5 // C1 * X - addps xmm0, xmm4 // result = C0 + C1 * X + movdqa xmm5, xmm4 + mulps xmm0, [esi + 16] // C1 * X + mulps xmm4, [esi + 16] + addps xmm0, [esi] // result = C0 + C1 * X + addps xmm4, [esi] movdqa xmm2, xmm1 + movdqa xmm6, xmm5 mulps xmm2, xmm1 // X * X + mulps xmm6, xmm5 mulps xmm1, xmm2 // X * X * X - mulps xmm2, xmm6 // C2 * X * X - mulps xmm1, xmm7 // C3 * X * X * X + mulps xmm5, xmm6 + mulps xmm2, [esi + 32] // C2 * X * X + mulps xmm6, [esi + 32] + mulps xmm1, [esi + 48] // C3 * X * X * X + mulps xmm5, [esi + 48] addps xmm0, xmm2 // result += C2 * X * X + addps xmm4, xmm6 addps xmm0, xmm1 // result += C3 * X * X * X + addps xmm4, xmm5 cvttps2dq xmm0, xmm0 + cvttps2dq xmm4, xmm4 + packuswb xmm0, xmm4 packuswb xmm0, xmm0 - packuswb xmm0, xmm0 - sub ecx, 1 - movd [edx], xmm0 - lea edx, [edx + 4] + sub ecx, 2 + movq qword ptr [edx], xmm0 + lea edx, [edx + 8] jg convertloop + pop esi ret } } diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc index 6baecefe0..117d6d276 100644 --- a/unit_test/planar_test.cc +++ b/unit_test/planar_test.cc @@ -1660,7 +1660,7 @@ TEST_F(libyuvTest, TestARGBPolynomial) { SIMD_ALIGNED(uint8 orig_pixels[1280][4]); SIMD_ALIGNED(uint8 dst_pixels[1280][4]); - static const float kWarmifyPolynomial[16] = { + SIMD_ALIGNED(static const float kWarmifyPolynomial[16]) = { 0.94230f, -3.03300f, -2.92500f, 0.f, // C0 0.584500f, 1.112000f, 1.535000f, 1.f, // C1 x 0.001313f, -0.002503f, -0.004496f, 0.f, // C2 x * x