diff --git a/README.chromium b/README.chromium
index f891fcca1..2564fcc8d 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 137
+Version: 140
 License: BSD
 License File: LICENSE
 
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index 018b9c11a..9469dbb6a 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -1653,7 +1653,7 @@ int I420ToRGB24(const uint8* src_y, int src_stride_y,
 
   SIMD_ALIGNED(uint8 row[kMaxStride]);
   void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int pix);
-#if defined(HAS_ARGBTORGB24ROW_SSSE3_DISABLED)
+#if defined(HAS_ARGBTORGB24ROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) &&
       IS_ALIGNED(width, 16) &&
       IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
@@ -1709,7 +1709,7 @@ int I420ToRAW(const uint8* src_y, int src_stride_y,
 
   SIMD_ALIGNED(uint8 row[kMaxStride]);
   void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int pix);
-#if defined(HAS_ARGBTORAWROW_SSSE3_DISABLED)
+#if defined(HAS_ARGBTORAWROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) &&
       IS_ALIGNED(width, 16) &&
       IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
@@ -1765,7 +1765,7 @@ int I420ToRGB565(const uint8* src_y, int src_stride_y,
 
   SIMD_ALIGNED(uint8 row[kMaxStride]);
   void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix);
-#if defined(HAS_ARGBTORGB565ROW_SSE2_DISABLED)
+#if defined(HAS_ARGBTORGB565ROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2) &&
       IS_ALIGNED(width, 16) &&
       IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
@@ -1821,7 +1821,7 @@ int I420ToARGB1555(const uint8* src_y, int src_stride_y,
 
   SIMD_ALIGNED(uint8 row[kMaxStride]);
   void (*ARGBToARGB1555Row)(const uint8* src_argb, uint8* dst_rgb, int pix);
-#if defined(HAS_ARGBTOARGB1555ROW_SSE2_DISABLED)
+#if defined(HAS_ARGBTOARGB1555ROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2) &&
       IS_ALIGNED(width, 16) &&
       IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
@@ -2195,7 +2195,7 @@ int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
     src_stride_argb = -src_stride_argb;
   }
   void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int pix);
-#if defined(HAS_ARGBTORGB24ROW_SSSE3_DISABLED)
+#if defined(HAS_ARGBTORGB24ROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) &&
       IS_ALIGNED(width, 16) &&
       IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
@@ -2225,7 +2225,7 @@ int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
     src_stride_argb = -src_stride_argb;
   }
   void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int pix);
-#if defined(HAS_ARGBTORAWROW_SSSE3_DISABLED)
+#if defined(HAS_ARGBTORAWROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) &&
       IS_ALIGNED(width, 16) &&
       IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
@@ -2276,10 +2276,10 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y,
 
   SIMD_ALIGNED(uint8 row[kMaxStride]);
   void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix);
-#if defined(HAS_ARGBTORGB565ROW_SSE2_DISABLED)
+#if defined(HAS_ARGBTORGB565ROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2) &&
       IS_ALIGNED(width, 16) &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+      IS_ALIGNED(dst_rgb, 16) && IS_ALIGNED(dst_stride_rgb, 16)) {
     ARGBToRGB565Row = ARGBToRGB565Row_SSE2;
   } else
 #endif
diff --git a/source/row_win.cc b/source/row_win.cc
index ecd9a82e9..f47ea4b83 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -523,16 +523,16 @@ __asm {
     pshufb    xmm2, xmm5
     pshufb    xmm3, xmm5
     movdqa    xmm4, xmm1
-    psllq     xmm4, 12
+    pslldq    xmm4, 12
     por       xmm4, xmm0
     movdqa    [edx], xmm4   // first 16 bytes
     movdqa    xmm4, xmm2
-    psrlq     xmm1, 4
-    psllq     xmm4, 8
+    psrldq    xmm1, 4
+    pslldq    xmm4, 8
     por       xmm1, xmm4
     movdqa    [edx + 16], xmm1   // middle 16 bytes
-    psrlq     xmm2, 8
-    psllq     xmm3, 4
+    psrldq    xmm2, 8
+    pslldq    xmm3, 4
     por       xmm2, xmm3
     movdqa    [edx + 32], xmm2   // last 16 bytes
     lea       edx, [edx + 48]
@@ -562,16 +562,16 @@ __asm {
     pshufb    xmm2, xmm5
     pshufb    xmm3, xmm5
     movdqa    xmm4, xmm1
-    psllq     xmm4, 12
+    pslldq    xmm4, 12
     por       xmm4, xmm0
     movdqa    [edx], xmm4   // first 16 bytes
     movdqa    xmm4, xmm2
-    psrlq     xmm1, 4
-    psllq     xmm4, 8
+    psrldq    xmm1, 4
+    pslldq    xmm4, 8
     por       xmm1, xmm4
     movdqa    [edx + 16], xmm1   // middle 16 bytes
-    psrlq     xmm2, 8
-    psllq     xmm3, 4
+    psrldq    xmm2, 8
+    pslldq    xmm3, 4
     por       xmm2, xmm3
     movdqa    [edx + 32], xmm2   // last 16 bytes
     lea       edx, [edx + 48]
@@ -582,6 +582,7 @@ __asm {
 }
 
 // TODO(fbarchard): Port to gcc
+// TODO(fbarchard): Improve sign extension/packing
 __declspec(naked)
 void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
 __asm {
@@ -591,7 +592,7 @@ __asm {
     psrlw     xmm4, 10
     psllw     xmm4, 5
     pcmpeqb   xmm5, xmm5       // generate mask 0xf800f800
-    psrlw     xmm5, 11
+    psllw     xmm5, 11
 
     mov       eax, [esp + 4]   // src_argb
     mov       edx, [esp + 8]   // dst_rgb
@@ -599,20 +600,20 @@ __asm {
 
  convertloop:
     movdqa    xmm0, [eax]   // fetch 4 pixels of argb
-    lea       eax, [eax + 16]
     movdqa    xmm1, xmm0    // B
-    psrlw     xmm1, 3
+    psrld     xmm1, 3
     pand      xmm1, xmm3
     movdqa    xmm2, xmm0    // G
-    psrlw     xmm2, 5
+    psrld     xmm2, 5
     pand      xmm2, xmm4
     por       xmm1, xmm2
-    psrlw     xmm0, 8       // R
+    psrld     xmm0, 8       // R
     pand      xmm0, xmm5
     por       xmm0, xmm1
     pslld     xmm0, 16
     psrad     xmm0, 16
     packssdw  xmm0, xmm0
+    lea       eax, [eax + 16]
-    movq      qword ptr [edx], xmm0  // store 4 pixels of ARGB1555
+    movq      qword ptr [edx], xmm0  // store 4 pixels of RGB565
     lea       edx, [edx + 8]
     sub       ecx, 4
@@ -622,6 +623,7 @@ __asm {
 }
 
 // TODO(fbarchard): Port to gcc
+// TODO(fbarchard): Improve sign extension/packing
 __declspec(naked)
 void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
 __asm {
@@ -629,10 +631,10 @@ __asm {
     psrlw     xmm3, 11
     movdqa    xmm4, xmm3       // generate mask 0x03e003e0
     psllw     xmm4, 5
-    movdqa    xmm5, xmm3       // generate mask 0x7c007c00
-    psllw     xmm5, 10
+    movdqa    xmm5, xmm4       // generate mask 0x7c007c00
+    psllw     xmm5, 5
     pcmpeqb   xmm6, xmm6       // generate mask 0x80008000
-    psrlw     xmm6, 15
+    psllw     xmm6, 15
 
     mov       eax, [esp + 4]   // src_argb
     mov       edx, [esp + 8]   // dst_rgb
@@ -640,26 +642,25 @@ __asm {
 
  convertloop:
     movdqa    xmm0, [eax]   // fetch 4 pixels of argb
-    lea       eax, [eax + 16]
     movdqa    xmm1, xmm0    // B
-    psrlw     xmm1, 3
+    psrld     xmm1, 3
     pand      xmm1, xmm3
     movdqa    xmm2, xmm0    // G
-    psrlw     xmm2, 6
+    psrld     xmm2, 6
     pand      xmm2, xmm4
     por       xmm1, xmm2
     movdqa    xmm2, xmm0    // R
-    psrlw     xmm2, 9
+    psrld     xmm2, 9
     pand      xmm2, xmm5
     por       xmm1, xmm2
-    movdqa    xmm2, xmm0    // A
-    psrlw     xmm2, 16
-    pand      xmm2, xmm6
-    por       xmm1, xmm2
+    psrld     xmm0, 16      // A
+    pand      xmm0, xmm6
+    por       xmm0, xmm1
     pslld     xmm0, 16
     psrad     xmm0, 16
-    packssdw  xmm1, xmm1
-    movq      qword ptr [edx], xmm1  // store 4 pixels of ARGB1555
+    packssdw  xmm0, xmm0
+    lea       eax, [eax + 16]
+    movq      qword ptr [edx], xmm0  // store 4 pixels of ARGB1555
     lea       edx, [edx + 8]
     sub       ecx, 4
     ja        convertloop
@@ -682,7 +683,6 @@ __asm {
 
  convertloop:
     movdqa    xmm0, [eax]   // fetch 4 pixels of argb
-    lea       eax, [eax + 16]
     movdqa    xmm1, xmm0
     pand      xmm0, xmm3    // low nibble
     pand      xmm1, xmm4    // high nibble
@@ -690,6 +690,7 @@ __asm {
     psrl      xmm1, 8
     por       xmm0, xmm1
     packuswb  xmm0, xmm0
+    lea       eax, [eax + 16]
     movq      qword ptr [edx], xmm0  // store 4 pixels of ARGB4444
     lea       edx, [edx + 8]
     sub       ecx, 4