diff --git a/README.chromium b/README.chromium
index 23439308d..2c4937a49 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 104
+Version: 106
 License: BSD
 License File: LICENSE
 
diff --git a/source/row_posix.cc b/source/row_posix.cc
index fad20b2a8..eadde7818 100644
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -558,47 +558,49 @@ void OMITFP FastConvertYUV444ToARGBRow_SSSE3(const uint8* y_buf,  // rdi
 #endif
 
 #ifdef HAS_FASTCONVERTYTOARGBROW_SSE2
+// Expands 8 Y bytes per loop pass into 8 gray ARGB pixels (alpha = 0xff).
 void FastConvertYToARGBRow_SSE2(const uint8* y_buf,  // rdi
                                 uint8* rgb_buf,      // rcx
                                 int width) {         // r8
   asm volatile (
-  "pcmpeqb     %%xmm5,%%xmm5                   \n"
-  "pslld       $0x18,%%xmm5                    \n"
-  "pxor        %%xmm4,%%xmm4                   \n"
-  "movdqa      %3,%%xmm3                       \n"
-  "movdqa      %4,%%xmm2                       \n"
+  "pcmpeqb     %%xmm4,%%xmm4                   \n"  // xmm4 = all ones
+  "pslld       $0x18,%%xmm4                    \n"  // xmm4 = 0xff000000 x 4 (alpha mask)
+  "mov         $0x10001000,%%eax               \n"  // 0x1000 (16 << 8) in both 16-bit halves
+  "movd        %%eax,%%xmm3                    \n"
+  "pshufd      $0x0,%%xmm3,%%xmm3              \n"  // xmm3 = 0x1000 in all 8 words
+  "mov         $0x012a012a,%%eax               \n"  // 0x012a = 298; 298 / 256 ~= 1.164
+  "movd        %%eax,%%xmm2                    \n"
+  "pshufd      $0x0,%%xmm2,%%xmm2              \n"  // xmm2 = 298 in all 8 words
 
   "1:                                          \n"
-  // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
-  "movq        (%0),%%xmm0                     \n"
-  "lea         0x8(%0),%0                      \n"
-  "punpcklbw   %%xmm4,%%xmm0                   \n"
-  "psubsw      %%xmm3,%%xmm0                   \n"
-  "pmullw      %%xmm2,%%xmm0                   \n"
-  "psraw       $0x6,%%xmm0                     \n"
-  "packuswb    %%xmm0,%%xmm0                   \n"
+    // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
+    "movq        (%0),%%xmm0                   \n"  // load 8 Y bytes
+    "lea         0x8(%0),%0                    \n"  // y_buf += 8
+    "punpcklbw   %%xmm0,%%xmm0                 \n"  // each word = Y.Y (Y * 0x0101)
+    "psubusw     %%xmm3,%%xmm0                 \n"  // subtract 16 << 8, clamping at 0
+    "pmulhuw     %%xmm2,%%xmm0                 \n"  // keep high 16 bits of word * 298
+    "packuswb    %%xmm0,%%xmm0                 \n"  // saturate each word to a byte: G
 
-  // Step 2: Weave into ARGB
-  "punpcklbw   %%xmm0,%%xmm0                   \n"
-  "movdqa      %%xmm0,%%xmm1                   \n"
-  "punpcklwd   %%xmm0,%%xmm0                   \n"
-  "por         %%xmm5,%%xmm0                   \n"
-  "movdqa      %%xmm0,(%1)                     \n"
-  "punpckhwd   %%xmm1,%%xmm1                   \n"
-  "por         %%xmm5,%%xmm1                   \n"
-  "movdqa      %%xmm1,16(%1)                   \n"
-  "lea         32(%1),%1                       \n"
+    // Step 2: Weave into ARGB
+    "punpcklbw   %%xmm0,%%xmm0                 \n"  // duplicate each G byte: GG
+    "movdqa      %%xmm0,%%xmm1                 \n"  // copy used for the upper 4 pixels
+    "punpcklwd   %%xmm0,%%xmm0                 \n"  // GGGG for the first 4 pixels
+    "punpckhwd   %%xmm1,%%xmm1                 \n"  // GGGG for the next 4 pixels
+    "por         %%xmm4,%%xmm0                 \n"  // force alpha byte to 0xff
+    "por         %%xmm4,%%xmm1                 \n"  // force alpha byte to 0xff
+    "movdqa      %%xmm0,(%1)                   \n"  // aligned store: movdqa needs 16-byte alignment
+    "movdqa      %%xmm1,16(%1)                 \n"
+    "lea         32(%1),%1                     \n"  // rgb_buf += 32 (8 pixels)
 
-  "sub         $0x8,%2                         \n"
-  "ja          1b                              \n"
+    "sub         $0x8,%2                       \n"  // width -= 8
+    "ja          1b                            \n"  // repeat while pixels remain
   : "+r"(y_buf),    // %0
     "+r"(rgb_buf),  // %1
     "+rm"(width)    // %2
-  : "m"(kYuvConstants.kYSub16),  // %3
-    "m"(kYuvConstants.kYToRgb)   // %4
-  : "memory", "cc"
+  :  // no input-only operands: constants are built in registers above
+  : "memory", "cc", "eax"  // eax is scratch for materializing the constants
 #if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
 #endif
   );
 }
diff --git a/source/row_win.cc b/source/row_win.cc
index 217985a57..9acd70764 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -780,23 +780,25 @@ void FastConvertYToARGBRow_SSE2(const uint8* y_buf,
                                 uint8* rgb_buf,
                                 int width) {
   __asm {
+    pcmpeqb    xmm4, xmm4           // generate mask 0xff000000
+    pslld      xmm4, 24
+    mov        eax,0x10001000
+    movd       xmm3,eax
+    pshufd     xmm3,xmm3,0
+    mov        eax,0x012a012a
+    movd       xmm2,eax
+    pshufd     xmm2,xmm2,0
     mov        eax, [esp + 4]       // Y
     mov        edx, [esp + 8]       // rgb
     mov        ecx, [esp + 12]      // width
-    pcmpeqb    xmm5, xmm5           // generate mask 0xff000000
-    pslld      xmm5, 24
-    pxor       xmm4, xmm4
-    movdqa     xmm3, kYSub16
-    movdqa     xmm2, kYToRgb
 
  convertloop:
     // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
-    movq       xmm0, qword ptr [eax]
+    movq       xmm0, [eax]
     lea        eax, [eax + 8]
-    punpcklbw  xmm0, xmm4
-    psubsw     xmm0, xmm3
-    pmullw     xmm0, xmm2
-    psraw      xmm0, 6
+    punpcklbw  xmm0, xmm0           // Y.Y
+    psubusw    xmm0, xmm3
+    pmulhuw    xmm0, xmm2
     packuswb   xmm0, xmm0           // G
 
     // Step 2: Weave into ARGB
@@ -804,8 +806,8 @@ void FastConvertYToARGBRow_SSE2(const uint8* y_buf,
     movdqa     xmm1, xmm0
     punpcklwd  xmm0, xmm0           // BGRA first 4 pixels
     punpckhwd  xmm1, xmm1           // BGRA next 4 pixels
-    por        xmm0, xmm5
-    por        xmm1, xmm5
+    por        xmm0, xmm4
+    por        xmm1, xmm4
     movdqa     [edx], xmm0
     movdqa     [edx + 16], xmm1
     lea        edx,  [edx + 32]