diff --git a/README.chromium b/README.chromium
index cf227896d..766b50a13 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 781
+Version: 782
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index d5dfc9b85..0f24bd848 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 781
+#define LIBYUV_VERSION 782
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index 823dc2ca5..24f03dfa3 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -2052,7 +2052,7 @@ int ARGBPolynomial(const uint8* src_argb, int src_stride_argb,
                             uint8* dst_argb, const float* poly,
                             int width) = ARGBPolynomialRow_C;
 #if defined(HAS_ARGBPOLYNOMIALROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 2)) {
     ARGBPolynomialRow = ARGBPolynomialRow_SSE2;
   }
 #endif
diff --git a/source/row_win.cc b/source/row_win.cc
index a5ac8abd9..3e1aabd1b 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -6774,42 +6774,53 @@ void ARGBPolynomialRow_SSE2(const uint8* src_argb,
                             uint8* dst_argb, const float* poly,
                             int width) {
   __asm {
-    mov        eax, [esp + 12]   /* poly */
-    movdqu     xmm4, [eax]
-    movdqu     xmm5, [eax + 16]
-    movdqu     xmm6, [eax + 32]
-    movdqu     xmm7, [eax + 48]
-
-    mov        eax, [esp + 4]   /* src_argb */
-    mov        edx, [esp + 8]   /* dst_argb */
-    mov        ecx, [esp + 16]  /* width */
+    push       esi
+    mov        eax, [esp + 4 + 4]   /* src_argb */
+    mov        edx, [esp + 4 + 8]   /* dst_argb */
+    mov        esi, [esp + 4 + 12]  /* poly */
+    mov        ecx, [esp + 4 + 16]  /* width */
     pxor       xmm3, xmm3  // 0 constant for zero extending bytes to ints.
 
     align      16
  convertloop:
-// (slow)   vpmovzxbd  xmm0, dword ptr [eax]  // BGRA pixel
-    movd       xmm0, [eax]  // BGRA
-    lea        eax, [eax + 4]
+// (slow)   pmovzxbd  xmm0, dword ptr [eax]  // BGRA pixel
+    movq       xmm0, qword ptr [eax]  // BGRABGRA
+    lea        eax, [eax + 8]
     punpcklbw  xmm0, xmm3
-    punpcklwd  xmm0, xmm3
+    movdqa     xmm4, xmm0
+    punpcklwd  xmm0, xmm3  // pixel 0
+    punpckhwd  xmm4, xmm3  // pixel 1
     cvtdq2ps   xmm0, xmm0  // 4 floats
+    cvtdq2ps   xmm4, xmm4
     movdqa     xmm1, xmm0  // X
-    mulps      xmm0, xmm5  // C1 * X
-    addps      xmm0, xmm4  // result = C0 + C1 * X
+    movdqa     xmm5, xmm4
+    mulps      xmm0, [esi + 16]  // C1 * X
+    mulps      xmm4, [esi + 16]
+    addps      xmm0, [esi]  // result = C0 + C1 * X
+    addps      xmm4, [esi]
     movdqa     xmm2, xmm1
+    movdqa     xmm6, xmm5
     mulps      xmm2, xmm1  // X * X
+    mulps      xmm6, xmm5
     mulps      xmm1, xmm2  // X * X * X
-    mulps      xmm2, xmm6  // C2 * X * X
-    mulps      xmm1, xmm7  // C3 * X * X * X
+    mulps      xmm5, xmm6
+    mulps      xmm2, [esi + 32]  // C2 * X * X
+    mulps      xmm6, [esi + 32]
+    mulps      xmm1, [esi + 48]  // C3 * X * X * X
+    mulps      xmm5, [esi + 48]
     addps      xmm0, xmm2  // result += C2 * X * X
+    addps      xmm4, xmm6
     addps      xmm0, xmm1  // result += C3 * X * X * X
+    addps      xmm4, xmm5
     cvttps2dq  xmm0, xmm0
+    cvttps2dq  xmm4, xmm4
+    packuswb   xmm0, xmm4
     packuswb   xmm0, xmm0
-    packuswb   xmm0, xmm0
-    sub        ecx, 1
-    movd       [edx], xmm0
-    lea        edx, [edx + 4]
+    sub        ecx, 2
+    movq       qword ptr [edx], xmm0
+    lea        edx, [edx + 8]
     jg         convertloop
+    pop        esi
     ret
   }
 }
diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc
index 6baecefe0..117d6d276 100644
--- a/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
@@ -1660,7 +1660,7 @@ TEST_F(libyuvTest, TestARGBPolynomial) {
   SIMD_ALIGNED(uint8 orig_pixels[1280][4]);
   SIMD_ALIGNED(uint8 dst_pixels[1280][4]);
 
-  static const float kWarmifyPolynomial[16] = {
+  SIMD_ALIGNED(static const float kWarmifyPolynomial[16]) = {
     0.94230f,  -3.03300f,    -2.92500f,  0.f,  // C0
     0.584500f,  1.112000f,    1.535000f, 1.f,  // C1 x
     0.001313f, -0.002503f,   -0.004496f, 0.f,  // C2 x * x