diff --git a/README.chromium b/README.chromium
index 7688ae57b..b60adf7c7 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1788
+Version: 1789
 License: BSD
 License File: LICENSE
 
diff --git a/docs/formats.md b/docs/formats.md
index d628f7f96..12ea9465e 100644
--- a/docs/formats.md
+++ b/docs/formats.md
@@ -189,7 +189,6 @@ In memory R is the lowest and A is the highest.
 Each channel has value ranges from 0 to 65535.
 AR64 is similar to ARGB.
 
-
 # NV12 and NV21
 
 NV12 is a biplanar format with a full sized Y plane followed by a single
@@ -200,3 +199,10 @@ height chroma channel, and therefore is a 420 subsampling.
 NV16 is 16 bits per pixel, with half width and full height.  aka 422.
 NV24 is 24 bits per pixel with full sized chroma channel. aka 444.
 Most NV12 functions allow the destination Y pointer to be NULL.
+
+# YUY2 and UYVY
+
+YUY2 and UYVY are packed YUV formats with half width, full height chroma.  aka 422.
+
+YUY2 is stored as Y, U, Y, V bytes (YUYV) in memory.
+UYVY is stored as U, Y, V, Y bytes (UYVY) in memory.
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index f713c4770..8b06777fc 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 1788
+#define LIBYUV_VERSION 1789
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_
diff --git a/source/convert_to_i420.cc b/source/convert_to_i420.cc
index ac6eeab24..5869ecd7b 100644
--- a/source/convert_to_i420.cc
+++ b/source/convert_to_i420.cc
@@ -89,18 +89,26 @@ int ConvertToI420(const uint8_t* sample,
 
   switch (format) {
     // Single plane formats
-    case FOURCC_YUY2:
+    case FOURCC_YUY2: {  // TODO(fbarchard): Find better odd crop fix.
+      uint8_t* u = (crop_x & 1) ? dst_v : dst_u;
+      uint8_t* v = (crop_x & 1) ? dst_u : dst_v;
+      int stride_u = (crop_x & 1) ? dst_stride_v : dst_stride_u;
+      int stride_v = (crop_x & 1) ? dst_stride_u : dst_stride_v;
       src = sample + (aligned_src_width * crop_y + crop_x) * 2;
-      r = YUY2ToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, dst_u,
-                     dst_stride_u, dst_v, dst_stride_v, crop_width,
-                     inv_crop_height);
+      r = YUY2ToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, u,
+                     stride_u, v, stride_v, crop_width, inv_crop_height);
       break;
-    case FOURCC_UYVY:
+    }
+    case FOURCC_UYVY: {
+      uint8_t* u = (crop_x & 1) ? dst_v : dst_u;
+      uint8_t* v = (crop_x & 1) ? dst_u : dst_v;
+      int stride_u = (crop_x & 1) ? dst_stride_v : dst_stride_u;
+      int stride_v = (crop_x & 1) ? dst_stride_u : dst_stride_v;
       src = sample + (aligned_src_width * crop_y + crop_x) * 2;
-      r = UYVYToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, dst_u,
-                     dst_stride_u, dst_v, dst_stride_v, crop_width,
-                     inv_crop_height);
+      r = UYVYToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, u,
+                     stride_u, v, stride_v, crop_width, inv_crop_height);
       break;
+    }
     case FOURCC_RGBP:
       src = sample + (src_width * crop_y + crop_x) * 2;
       r = RGB565ToI420(src, src_width * 2, dst_y, dst_stride_y, dst_u,
diff --git a/source/row_win.cc b/source/row_win.cc
index 5203b57c6..7dccacc7f 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -2584,7 +2584,7 @@ __declspec(naked) void I422ToRGBARow_AVX2(
     __asm psraw      xmm2, 6                                                   \
     __asm packuswb   xmm0, xmm0 /* B */                                        \
     __asm packuswb   xmm1, xmm1 /* G */                                        \
-    __asm packuswb   xmm2, xmm2 /* R */                                        \
+    __asm packuswb   xmm2, xmm2 /* R */             \
   }
 
 // Store 8 ARGB values.
@@ -4746,22 +4746,22 @@ __declspec(naked) void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) {
     pmaddubsw  xmm6, xmm2
     phaddw     xmm0, xmm6
     psrlw      xmm0, 7
-    packuswb   xmm0, xmm0   // 8 B values
+    packuswb   xmm0, xmm0  // 8 B values
     movdqu     xmm5, [eax]  // G
     movdqu     xmm1, [eax + 16]
     pmaddubsw  xmm5, xmm3
     pmaddubsw  xmm1, xmm3
     phaddw     xmm5, xmm1
     psrlw      xmm5, 7
-    packuswb   xmm5, xmm5   // 8 G values
-    punpcklbw  xmm0, xmm5   // 8 BG values
+    packuswb   xmm5, xmm5  // 8 G values
+    punpcklbw  xmm0, xmm5  // 8 BG values
     movdqu     xmm5, [eax]  // R
     movdqu     xmm1, [eax + 16]
     pmaddubsw  xmm5, xmm4
     pmaddubsw  xmm1, xmm4
     phaddw     xmm5, xmm1
     psrlw      xmm5, 7
-    packuswb   xmm5, xmm5   // 8 R values
+    packuswb   xmm5, xmm5  // 8 R values
     movdqu     xmm6, [eax]  // A
     movdqu     xmm1, [eax + 16]
     psrld      xmm6, 24
@@ -4811,25 +4811,25 @@ __declspec(naked) void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb,
     movdqu     xmm1, [eax + 16]
     pmaddubsw  xmm6, xmm3
     pmaddubsw  xmm1, xmm3
-    phaddsw    xmm0, xmm7   // B
-    phaddsw    xmm6, xmm1   // G
-    psraw      xmm0, 6      // B
-    psraw      xmm6, 6      // G
-    packuswb   xmm0, xmm0   // 8 B values
-    packuswb   xmm6, xmm6   // 8 G values
-    punpcklbw  xmm0, xmm6   // 8 BG values
+    phaddsw    xmm0, xmm7  // B
+    phaddsw    xmm6, xmm1  // G
+    psraw      xmm0, 6  // B
+    psraw      xmm6, 6  // G
+    packuswb   xmm0, xmm0  // 8 B values
+    packuswb   xmm6, xmm6  // 8 G values
+    punpcklbw  xmm0, xmm6  // 8 BG values
     movdqu     xmm1, [eax]  // R
     movdqu     xmm7, [eax + 16]
     pmaddubsw  xmm1, xmm4
     pmaddubsw  xmm7, xmm4
-    phaddsw    xmm1, xmm7   // R
+    phaddsw    xmm1, xmm7  // R
     movdqu     xmm6, [eax]  // A
     movdqu     xmm7, [eax + 16]
     pmaddubsw  xmm6, xmm5
     pmaddubsw  xmm7, xmm5
     phaddsw    xmm6, xmm7  // A
-    psraw      xmm1, 6     // R
-    psraw      xmm6, 6     // A
+    psraw      xmm1, 6  // R
+    psraw      xmm6, 6  // A
     packuswb   xmm1, xmm1  // 8 R values
     packuswb   xmm6, xmm6  // 8 A values
     punpcklbw  xmm1, xmm6  // 8 RA values
@@ -4872,16 +4872,16 @@ __declspec(naked) void ARGBQuantizeRow_SSE2(uint8_t* dst_argb,
 
  convertloop:
     movdqu     xmm0, [eax]  // read 4 pixels
-    punpcklbw  xmm0, xmm5   // first 2 pixels
-    pmulhuw    xmm0, xmm2   // pixel * scale >> 16
+    punpcklbw  xmm0, xmm5  // first 2 pixels
+    pmulhuw    xmm0, xmm2  // pixel * scale >> 16
     movdqu     xmm1, [eax]  // read 4 pixels
-    punpckhbw  xmm1, xmm5   // next 2 pixels
+    punpckhbw  xmm1, xmm5  // next 2 pixels
     pmulhuw    xmm1, xmm2
-    pmullw     xmm0, xmm3   // * interval_size
+    pmullw     xmm0, xmm3  // * interval_size
     movdqu     xmm7, [eax]  // read 4 pixels
     pmullw     xmm1, xmm3
-    pand       xmm7, xmm6   // mask alpha
-    paddw      xmm0, xmm4   // + interval_size / 2
+    pand       xmm7, xmm6  // mask alpha
+    paddw      xmm0, xmm4  // + interval_size / 2
     paddw      xmm1, xmm4
     packuswb   xmm0, xmm1
     por        xmm0, xmm7
@@ -4901,9 +4901,9 @@ __declspec(naked) void ARGBShadeRow_SSE2(const uint8_t* src_argb,
                                          int width,
                                          uint32_t value) {
   __asm {
-    mov        eax, [esp + 4]    // src_argb
-    mov        edx, [esp + 8]    // dst_argb
-    mov        ecx, [esp + 12]   // width
+    mov        eax, [esp + 4]  // src_argb
+    mov        edx, [esp + 8]  // dst_argb
+    mov        ecx, [esp + 12]  // width
     movd       xmm2, [esp + 16]  // value
     punpcklbw  xmm2, xmm2
     punpcklqdq xmm2, xmm2
@@ -4912,10 +4912,10 @@ __declspec(naked) void ARGBShadeRow_SSE2(const uint8_t* src_argb,
     movdqu     xmm0, [eax]  // read 4 pixels
     lea        eax, [eax + 16]
     movdqa     xmm1, xmm0
-    punpcklbw  xmm0, xmm0   // first 2
-    punpckhbw  xmm1, xmm1   // next 2
-    pmulhuw    xmm0, xmm2   // argb * value
-    pmulhuw    xmm1, xmm2   // argb * value
+    punpcklbw  xmm0, xmm0  // first 2
+    punpckhbw  xmm1, xmm1  // next 2
+    pmulhuw    xmm0, xmm2  // argb * value
+    pmulhuw    xmm1, xmm2  // argb * value
     psrlw      xmm0, 8
     psrlw      xmm1, 8
     packuswb   xmm0, xmm1
@@ -4937,23 +4937,23 @@ __declspec(naked) void ARGBMultiplyRow_SSE2(const uint8_t* src_argb,
                                             int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]   // src_argb
-    mov        esi, [esp + 4 + 8]   // src_argb1
+    mov        eax, [esp + 4 + 4]  // src_argb
+    mov        esi, [esp + 4 + 8]  // src_argb1
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
-    pxor       xmm5, xmm5   // constant 0
+    pxor       xmm5, xmm5  // constant 0
 
  convertloop:
     movdqu     xmm0, [eax]  // read 4 pixels from src_argb
     movdqu     xmm2, [esi]  // read 4 pixels from src_argb1
     movdqu     xmm1, xmm0
     movdqu     xmm3, xmm2
-    punpcklbw  xmm0, xmm0   // first 2
-    punpckhbw  xmm1, xmm1   // next 2
-    punpcklbw  xmm2, xmm5   // first 2
-    punpckhbw  xmm3, xmm5   // next 2
-    pmulhuw    xmm0, xmm2   // src_argb * src_argb1 first 2
-    pmulhuw    xmm1, xmm3   // src_argb * src_argb1 next 2
+    punpcklbw  xmm0, xmm0  // first 2
+    punpckhbw  xmm1, xmm1  // next 2
+    punpcklbw  xmm2, xmm5  // first 2
+    punpckhbw  xmm3, xmm5  // next 2
+    pmulhuw    xmm0, xmm2  // src_argb * src_argb1 first 2
+    pmulhuw    xmm1, xmm3  // src_argb * src_argb1 next 2
     lea        eax, [eax + 16]
     lea        esi, [esi + 16]
     packuswb   xmm0, xmm1
@@ -4977,8 +4977,8 @@ __declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb,
                                        int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]   // src_argb
-    mov        esi, [esp + 4 + 8]   // src_argb1
+    mov        eax, [esp + 4 + 4]  // src_argb
+    mov        esi, [esp + 4 + 8]  // src_argb1
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
 
@@ -4990,7 +4990,7 @@ __declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb,
     lea        eax, [eax + 16]
     movdqu     xmm1, [esi]  // read 4 pixels from src_argb1
     lea        esi, [esi + 16]
-    paddusb    xmm0, xmm1   // src_argb + src_argb1
+    paddusb    xmm0, xmm1  // src_argb + src_argb1
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
     sub        ecx, 4
@@ -5005,7 +5005,7 @@ __declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb,
     lea        eax, [eax + 4]
     movd       xmm1, [esi]  // read 1 pixels from src_argb1
     lea        esi, [esi + 4]
-    paddusb    xmm0, xmm1   // src_argb + src_argb1
+    paddusb    xmm0, xmm1  // src_argb + src_argb1
     movd       [edx], xmm0
     lea        edx, [edx + 4]
     sub        ecx, 1
@@ -5026,8 +5026,8 @@ __declspec(naked) void ARGBSubtractRow_SSE2(const uint8_t* src_argb,
                                             int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]   // src_argb
-    mov        esi, [esp + 4 + 8]   // src_argb1
+    mov        eax, [esp + 4 + 4]  // src_argb
+    mov        esi, [esp + 4 + 8]  // src_argb1
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
 
@@ -5036,7 +5036,7 @@ __declspec(naked) void ARGBSubtractRow_SSE2(const uint8_t* src_argb,
     lea        eax, [eax + 16]
     movdqu     xmm1, [esi]  // read 4 pixels from src_argb1
     lea        esi, [esi + 16]
-    psubusb    xmm0, xmm1   // src_argb - src_argb1
+    psubusb    xmm0, xmm1  // src_argb - src_argb1
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
     sub        ecx, 4
@@ -5056,11 +5056,11 @@ __declspec(naked) void ARGBMultiplyRow_AVX2(const uint8_t* src_argb,
                                             int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]   // src_argb
-    mov        esi, [esp + 4 + 8]   // src_argb1
+    mov        eax, [esp + 4 + 4]  // src_argb
+    mov        esi, [esp + 4 + 8]  // src_argb1
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
-    vpxor      ymm5, ymm5, ymm5     // constant 0
+    vpxor      ymm5, ymm5, ymm5  // constant 0
 
  convertloop:
     vmovdqu    ymm1, [eax]  // read 8 pixels from src_argb
@@ -5094,8 +5094,8 @@ __declspec(naked) void ARGBAddRow_AVX2(const uint8_t* src_argb,
                                        int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]   // src_argb
-    mov        esi, [esp + 4 + 8]   // src_argb1
+    mov        eax, [esp + 4 + 4]  // src_argb
+    mov        esi, [esp + 4 + 8]  // src_argb1
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
 
@@ -5124,8 +5124,8 @@ __declspec(naked) void ARGBSubtractRow_AVX2(const uint8_t* src_argb,
                                             int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]   // src_argb
-    mov        esi, [esp + 4 + 8]   // src_argb1
+    mov        eax, [esp + 4 + 4]  // src_argb
+    mov        esi, [esp + 4 + 8]  // src_argb1
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
 
@@ -5159,8 +5159,8 @@ __declspec(naked) void SobelXRow_SSE2(const uint8_t* src_y0,
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]   // src_y0
-    mov        esi, [esp + 8 + 8]   // src_y1
+    mov        eax, [esp + 8 + 4]  // src_y0
+    mov        esi, [esp + 8 + 8]  // src_y1
     mov        edi, [esp + 8 + 12]  // src_y2
     mov        edx, [esp + 8 + 16]  // dst_sobelx
     mov        ecx, [esp + 8 + 20]  // width
@@ -5170,17 +5170,17 @@ __declspec(naked) void SobelXRow_SSE2(const uint8_t* src_y0,
     pxor       xmm5, xmm5  // constant 0
 
  convertloop:
-    movq       xmm0, qword ptr [eax]      // read 8 pixels from src_y0[0]
+    movq       xmm0, qword ptr [eax]  // read 8 pixels from src_y0[0]
     movq       xmm1, qword ptr [eax + 2]  // read 8 pixels from src_y0[2]
     punpcklbw  xmm0, xmm5
     punpcklbw  xmm1, xmm5
     psubw      xmm0, xmm1
-    movq       xmm1, qword ptr [eax + esi]      // read 8 pixels from src_y1[0]
+    movq       xmm1, qword ptr [eax + esi]  // read 8 pixels from src_y1[0]
     movq       xmm2, qword ptr [eax + esi + 2]  // read 8 pixels from src_y1[2]
     punpcklbw  xmm1, xmm5
     punpcklbw  xmm2, xmm5
     psubw      xmm1, xmm2
-    movq       xmm2, qword ptr [eax + edi]      // read 8 pixels from src_y2[0]
+    movq       xmm2, qword ptr [eax + edi]  // read 8 pixels from src_y2[0]
     movq       xmm3, qword ptr [eax + edi + 2]  // read 8 pixels from src_y2[2]
     punpcklbw  xmm2, xmm5
     punpcklbw  xmm3, xmm5
@@ -5215,8 +5215,8 @@ __declspec(naked) void SobelYRow_SSE2(const uint8_t* src_y0,
                                       int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]   // src_y0
-    mov        esi, [esp + 4 + 8]   // src_y1
+    mov        eax, [esp + 4 + 4]  // src_y0
+    mov        esi, [esp + 4 + 8]  // src_y1
     mov        edx, [esp + 4 + 12]  // dst_sobely
     mov        ecx, [esp + 4 + 16]  // width
     sub        esi, eax
@@ -5224,17 +5224,17 @@ __declspec(naked) void SobelYRow_SSE2(const uint8_t* src_y0,
     pxor       xmm5, xmm5  // constant 0
 
  convertloop:
-    movq       xmm0, qword ptr [eax]        // read 8 pixels from src_y0[0]
+    movq       xmm0, qword ptr [eax]  // read 8 pixels from src_y0[0]
     movq       xmm1, qword ptr [eax + esi]  // read 8 pixels from src_y1[0]
     punpcklbw  xmm0, xmm5
     punpcklbw  xmm1, xmm5
     psubw      xmm0, xmm1
-    movq       xmm1, qword ptr [eax + 1]        // read 8 pixels from src_y0[1]
+    movq       xmm1, qword ptr [eax + 1]  // read 8 pixels from src_y0[1]
     movq       xmm2, qword ptr [eax + esi + 1]  // read 8 pixels from src_y1[1]
     punpcklbw  xmm1, xmm5
     punpcklbw  xmm2, xmm5
     psubw      xmm1, xmm2
-    movq       xmm2, qword ptr [eax + 2]        // read 8 pixels from src_y0[2]
+    movq       xmm2, qword ptr [eax + 2]  // read 8 pixels from src_y0[2]
     movq       xmm3, qword ptr [eax + esi + 2]  // read 8 pixels from src_y1[2]
     punpcklbw  xmm2, xmm5
     punpcklbw  xmm3, xmm5
@@ -5269,8 +5269,8 @@ __declspec(naked) void SobelRow_SSE2(const uint8_t* src_sobelx,
                                      int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]   // src_sobelx
-    mov        esi, [esp + 4 + 8]   // src_sobely
+    mov        eax, [esp + 4 + 4]  // src_sobelx
+    mov        esi, [esp + 4 + 8]  // src_sobely
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
     sub        esi, eax
@@ -5278,7 +5278,7 @@ __declspec(naked) void SobelRow_SSE2(const uint8_t* src_sobelx,
     pslld      xmm5, 24  // 0xff000000
 
  convertloop:
-    movdqu     xmm0, [eax]        // read 16 pixels src_sobelx
+    movdqu     xmm0, [eax]  // read 16 pixels src_sobelx
     movdqu     xmm1, [eax + esi]  // read 16 pixels src_sobely
     lea        eax, [eax + 16]
     paddusb    xmm0, xmm1  // sobel = sobelx + sobely
@@ -5317,8 +5317,8 @@ __declspec(naked) void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx,
                                             int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]   // src_sobelx
-    mov        esi, [esp + 4 + 8]   // src_sobely
+    mov        eax, [esp + 4 + 4]  // src_sobelx
+    mov        esi, [esp + 4 + 8]  // src_sobely
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
     sub        esi, eax
@@ -5351,15 +5351,15 @@ __declspec(naked) void SobelXYRow_SSE2(const uint8_t* src_sobelx,
                                        int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]   // src_sobelx
-    mov        esi, [esp + 4 + 8]   // src_sobely
+    mov        eax, [esp + 4 + 4]  // src_sobelx
+    mov        esi, [esp + 4 + 8]  // src_sobely
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
     sub        esi, eax
     pcmpeqb    xmm5, xmm5  // alpha 255
 
  convertloop:
-    movdqu     xmm0, [eax]        // read 16 pixels src_sobelx
+    movdqu     xmm0, [eax]  // read 16 pixels src_sobelx
     movdqu     xmm1, [eax + esi]  // read 16 pixels src_sobely
     lea        eax, [eax + 16]
     movdqa     xmm2, xmm0
@@ -5529,7 +5529,7 @@ void CumulativeSumToAverageRow_SSE2(const int32_t* topleft,
     add        ecx, 4 - 1
     jl         l1b
 
-    // 1 pixel loop
+            // 1 pixel loop
   l1:
     movdqu     xmm0, [eax]
     psubd      xmm0, [eax + edx * 4]
@@ -5571,7 +5571,7 @@ void ComputeCumulativeSumRow_SSE2(const uint8_t* row,
     test       edx, 15
     jne        l4b
 
-    // 4 pixel loop
+        // 4 pixel loop
   l4:
     movdqu     xmm2, [eax]  // 4 argb pixels 16 bytes.
     lea        eax, [eax + 16]
@@ -5617,7 +5617,7 @@ void ComputeCumulativeSumRow_SSE2(const uint8_t* row,
     add        ecx, 4 - 1
     jl         l1b
 
-    // 1 pixel loop
+            // 1 pixel loop
   l1:
     movd       xmm2, dword ptr [eax]  // 1 argb pixel
     lea        eax, [eax + 4]
@@ -5651,7 +5651,7 @@ __declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8_t* src_argb,
     mov        esi, [esp + 16]  // stride
     mov        edx, [esp + 20]  // dst_argb
     mov        ecx, [esp + 24]  // pointer to uv_dudv
-    movq       xmm2, qword ptr [ecx]      // uv
+    movq       xmm2, qword ptr [ecx]  // uv
     movq       xmm7, qword ptr [ecx + 8]  // dudv
     mov        ecx, [esp + 28]  // width
     shl        esi, 16  // 4, stride
@@ -5660,7 +5660,7 @@ __declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8_t* src_argb,
     sub        ecx, 4
     jl         l4b
 
-    // setup for 4 pixel loop
+        // setup for 4 pixel loop
     pshufd     xmm7, xmm7, 0x44  // dup dudv
     pshufd     xmm5, xmm5, 0  // dup 4, stride
     movdqa     xmm0, xmm2  // x0, y0, x1, y1
@@ -5672,16 +5672,16 @@ __declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8_t* src_argb,
     addps      xmm3, xmm4
     addps      xmm4, xmm4  // dudv *= 4
 
-    // 4 pixel loop
+        // 4 pixel loop
   l4:
     cvttps2dq  xmm0, xmm2  // x, y float to int first 2
     cvttps2dq  xmm1, xmm3  // x, y float to int next 2
     packssdw   xmm0, xmm1  // x, y as 8 shorts
     pmaddwd    xmm0, xmm5  // offsets = x * 4 + y * stride.
     movd       esi, xmm0
-    pshufd     xmm0, xmm0, 0x39   // shift right
+    pshufd     xmm0, xmm0, 0x39  // shift right
     movd       edi, xmm0
-    pshufd     xmm0, xmm0, 0x39   // shift right
+    pshufd     xmm0, xmm0, 0x39  // shift right
     movd       xmm1, [eax + esi]  // read pixel 0
     movd       xmm6, [eax + edi]  // read pixel 1
     punpckldq  xmm1, xmm6  // combine pixel 0 and 1
@@ -5733,8 +5733,8 @@ __declspec(naked) void InterpolateRow_AVX2(uint8_t* dst_ptr,
   __asm {
     push       esi
     push       edi
-    mov        edi, [esp + 8 + 4]   // dst_ptr
-    mov        esi, [esp + 8 + 8]   // src_ptr
+    mov        edi, [esp + 8 + 4]  // dst_ptr
+    mov        esi, [esp + 8 + 8]  // src_ptr
     mov        edx, [esp + 8 + 12]  // src_stride
     mov        ecx, [esp + 8 + 16]  // dst_width
     mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
@@ -5743,7 +5743,7 @@ __declspec(naked) void InterpolateRow_AVX2(uint8_t* dst_ptr,
     je         xloop100  // 0 / 256.  Blend 100 / 0.
     sub        edi, esi
     cmp        eax, 128
-    je         xloop50   // 128 /256 is 0.50.  Blend 50 / 50.
+    je         xloop50  // 128 /256 is 0.50.  Blend 50 / 50.
 
     vmovd      xmm0, eax  // high fraction 0..255
     neg        eax
@@ -5770,7 +5770,7 @@ __declspec(naked) void InterpolateRow_AVX2(uint8_t* dst_ptr,
     vpaddw     ymm0, ymm0, ymm4
     vpsrlw     ymm1, ymm1, 8
     vpsrlw     ymm0, ymm0, 8
-    vpackuswb  ymm0, ymm0, ymm1  // unmutates
+    vpackuswb  ymm0, ymm0, ymm1            // unmutates
     vmovdqu    [esi + edi], ymm0
     lea        esi, [esi + 32]
     sub        ecx, 32
@@ -5811,17 +5811,17 @@ __declspec(naked) void InterpolateRow_SSSE3(uint8_t* dst_ptr,
     push       esi
     push       edi
 
-    mov        edi, [esp + 8 + 4]   // dst_ptr
-    mov        esi, [esp + 8 + 8]   // src_ptr
+    mov        edi, [esp + 8 + 4]  // dst_ptr
+    mov        esi, [esp + 8 + 8]  // src_ptr
     mov        edx, [esp + 8 + 12]  // src_stride
     mov        ecx, [esp + 8 + 16]  // dst_width
     mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
     sub        edi, esi
-    // Dispatch to specialized filters if applicable.
+        // Dispatch to specialized filters if applicable.
     cmp        eax, 0
     je         xloop100  // 0 /256.  Blend 100 / 0.
     cmp        eax, 128
-    je         xloop50   // 128 / 256 is 0.50.  Blend 50 / 50.
+    je         xloop50  // 128 / 256 is 0.50.  Blend 50 / 50.
 
     movd       xmm0, eax  // high fraction 0..255
     neg        eax
@@ -5840,7 +5840,7 @@ __declspec(naked) void InterpolateRow_SSSE3(uint8_t* dst_ptr,
     movdqu     xmm1, xmm0
     punpcklbw  xmm0, xmm2
     punpckhbw  xmm1, xmm2
-    psubb      xmm0, xmm4  // bias image by -128
+    psubb      xmm0, xmm4            // bias image by -128
     psubb      xmm1, xmm4
     movdqa     xmm2, xmm5
     movdqa     xmm3, xmm5
@@ -5889,8 +5889,8 @@ __declspec(naked) void ARGBShuffleRow_SSSE3(const uint8_t* src_argb,
                                             const uint8_t* shuffler,
                                             int width) {
   __asm {
-    mov        eax, [esp + 4]   // src_argb
-    mov        edx, [esp + 8]   // dst_argb
+    mov        eax, [esp + 4]  // src_argb
+    mov        edx, [esp + 8]  // dst_argb
     mov        ecx, [esp + 12]  // shuffler
     movdqu     xmm5, [ecx]
     mov        ecx, [esp + 16]  // width
@@ -5916,8 +5916,8 @@ __declspec(naked) void ARGBShuffleRow_AVX2(const uint8_t* src_argb,
                                            const uint8_t* shuffler,
                                            int width) {
   __asm {
-    mov        eax, [esp + 4]   // src_argb
-    mov        edx, [esp + 8]   // dst_argb
+    mov        eax, [esp + 4]  // src_argb
+    mov        edx, [esp + 8]  // dst_argb
     mov        ecx, [esp + 12]  // shuffler
     vbroadcastf128 ymm5, [ecx]  // same shuffle in high as low.
     mov        ecx, [esp + 16]  // width
@@ -5954,18 +5954,18 @@ __declspec(naked) void I422ToYUY2Row_SSE2(const uint8_t* src_y,
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]   // src_y
-    mov        esi, [esp + 8 + 8]   // src_u
+    mov        eax, [esp + 8 + 4]  // src_y
+    mov        esi, [esp + 8 + 8]  // src_u
     mov        edx, [esp + 8 + 12]  // src_v
     mov        edi, [esp + 8 + 16]  // dst_frame
     mov        ecx, [esp + 8 + 20]  // width
     sub        edx, esi
 
   convertloop:
-    movq       xmm2, qword ptr [esi]        // U
+    movq       xmm2, qword ptr [esi]  // U
     movq       xmm3, qword ptr [esi + edx]  // V
     lea        esi, [esi + 8]
-    punpcklbw  xmm2, xmm3   // UV
+    punpcklbw  xmm2, xmm3  // UV
     movdqu     xmm0, [eax]  // Y
     lea        eax, [eax + 16]
     movdqa     xmm1, xmm0
@@ -5991,22 +5991,22 @@ __declspec(naked) void I422ToUYVYRow_SSE2(const uint8_t* src_y,
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]   // src_y
-    mov        esi, [esp + 8 + 8]   // src_u
+    mov        eax, [esp + 8 + 4]  // src_y
+    mov        esi, [esp + 8 + 8]  // src_u
     mov        edx, [esp + 8 + 12]  // src_v
     mov        edi, [esp + 8 + 16]  // dst_frame
     mov        ecx, [esp + 8 + 20]  // width
     sub        edx, esi
 
   convertloop:
-    movq       xmm2, qword ptr [esi]        // U
+    movq       xmm2, qword ptr [esi]  // U
     movq       xmm3, qword ptr [esi + edx]  // V
     lea        esi, [esi + 8]
-    punpcklbw  xmm2, xmm3   // UV
+    punpcklbw  xmm2, xmm3  // UV
     movdqu     xmm0, [eax]  // Y
     movdqa     xmm1, xmm2
     lea        eax, [eax + 16]
-    punpcklbw  xmm1, xmm0   // UYVY
+    punpcklbw  xmm1, xmm0  // UYVY
     punpckhbw  xmm2, xmm0
     movdqu     [edi], xmm1
     movdqu     [edi + 16], xmm2
@@ -6033,10 +6033,10 @@ __declspec(naked) void ARGBPolynomialRow_SSE2(const uint8_t* src_argb,
     mov        ecx, [esp + 4 + 16] /* width */
     pxor       xmm3, xmm3  // 0 constant for zero extending bytes to ints.
 
-    // 2 pixel loop.
+        // 2 pixel loop.
  convertloop:
-    //    pmovzxbd  xmm0, dword ptr [eax]  // BGRA pixel
-    //    pmovzxbd  xmm4, dword ptr [eax + 4]  // BGRA pixel
+        //    pmovzxbd  xmm0, dword ptr [eax]  // BGRA pixel
+        //    pmovzxbd  xmm4, dword ptr [eax + 4]  // BGRA pixel
     movq       xmm0, qword ptr [eax]  // BGRABGRA
     lea        eax, [eax + 8]
     punpcklbw  xmm0, xmm3
@@ -6085,8 +6085,8 @@ __declspec(naked) void ARGBPolynomialRow_AVX2(const uint8_t* src_argb,
                                               const float* poly,
                                               int width) {
   __asm {
-    mov        eax, [esp + 4]  /* src_argb */
-    mov        edx, [esp + 8]  /* dst_argb */
+    mov        eax, [esp + 4] /* src_argb */
+    mov        edx, [esp + 8] /* dst_argb */
     mov        ecx, [esp + 12] /* poly */
     vbroadcastf128 ymm4, [ecx]  // C0
     vbroadcastf128 ymm5, [ecx + 16]  // C1
@@ -6125,8 +6125,8 @@ __declspec(naked) void HalfFloatRow_SSE2(const uint16_t* src,
                                          float scale,
                                          int width) {
   __asm {
-    mov        eax, [esp + 4]  /* src */
-    mov        edx, [esp + 8]  /* dst */
+    mov        eax, [esp + 4] /* src */
+    mov        edx, [esp + 8] /* dst */
     movd       xmm4, dword ptr [esp + 12] /* scale */
     mov        ecx, [esp + 16] /* width */
     mulss      xmm4, kExpBias
@@ -6134,7 +6134,7 @@ __declspec(naked) void HalfFloatRow_SSE2(const uint16_t* src,
     pxor       xmm5, xmm5
     sub        edx, eax
 
-    // 8 pixel loop.
+        // 8 pixel loop.
  convertloop:
     movdqu      xmm2, xmmword ptr [eax]  // 8 shorts
     add         eax, 16
@@ -6172,7 +6172,7 @@ __declspec(naked) void HalfFloatRow_AVX2(const uint16_t* src,
     vpxor      ymm5, ymm5, ymm5
     sub        edx, eax
 
-    // 16 pixel loop.
+        // 16 pixel loop.
  convertloop:
     vmovdqu     ymm2, [eax]  // 16 shorts
     add         eax, 32
@@ -6182,7 +6182,7 @@ __declspec(naked) void HalfFloatRow_AVX2(const uint16_t* src,
     vcvtdq2ps   ymm2, ymm2
     vmulps      ymm3, ymm3, ymm4  // scale to adjust exponent for 5 bit range.
     vmulps      ymm2, ymm2, ymm4
-    vpsrld      ymm3, ymm3, 13    // float convert to 8 half floats truncate
+    vpsrld      ymm3, ymm3, 13  // float convert to 8 half floats truncate
     vpsrld      ymm2, ymm2, 13
     vpackssdw   ymm2, ymm2, ymm3
     vmovdqu     [eax + edx - 32], ymm2
@@ -6200,22 +6200,22 @@ __declspec(naked) void HalfFloatRow_F16C(const uint16_t* src,
                                          float scale,
                                          int width) {
   __asm {
-    mov        eax, [esp + 4]     /* src */
-    mov        edx, [esp + 8]     /* dst */
+    mov        eax, [esp + 4] /* src */
+    mov        edx, [esp + 8] /* dst */
     vbroadcastss ymm4, [esp + 12] /* scale */
-    mov        ecx, [esp + 16]    /* width */
+    mov        ecx, [esp + 16] /* width */
     sub        edx, eax
 
-    // 16 pixel loop.
+        // 16 pixel loop.
  convertloop:
     vpmovzxwd   ymm2, xmmword ptr [eax]  // 8 shorts -> 8 ints
     vpmovzxwd   ymm3, xmmword ptr [eax + 16]  // 8 more shorts
     add         eax, 32
-    vcvtdq2ps   ymm2, ymm2        // convert 8 ints to floats
+    vcvtdq2ps   ymm2, ymm2  // convert 8 ints to floats
     vcvtdq2ps   ymm3, ymm3
     vmulps      ymm2, ymm2, ymm4  // scale to normalized range 0 to 1
     vmulps      ymm3, ymm3, ymm4
-    vcvtps2ph   xmm2, ymm2, 3     // float convert to 8 half floats truncate
+    vcvtps2ph   xmm2, ymm2, 3  // float convert to 8 half floats truncate
     vcvtps2ph   xmm3, ymm3, 3
     vmovdqu     [eax + edx + 32], xmm2
     vmovdqu     [eax + edx + 32 + 16], xmm3
@@ -6234,8 +6234,8 @@ __declspec(naked) void ARGBColorTableRow_X86(uint8_t* dst_argb,
                                              int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]  /* dst_argb */
-    mov        esi, [esp + 4 + 8]  /* table_argb */
+    mov        eax, [esp + 4 + 4] /* dst_argb */
+    mov        esi, [esp + 4 + 8] /* table_argb */
     mov        ecx, [esp + 4 + 12] /* width */
 
     // 1 pixel loop.
@@ -6268,8 +6268,8 @@ __declspec(naked) void RGBColorTableRow_X86(uint8_t* dst_argb,
                                             int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]  /* dst_argb */
-    mov        esi, [esp + 4 + 8]  /* table_argb */
+    mov        eax, [esp + 4 + 4] /* dst_argb */
+    mov        esi, [esp + 4 + 8] /* table_argb */
     mov        ecx, [esp + 4 + 12] /* width */
 
     // 1 pixel loop.
@@ -6303,8 +6303,8 @@ __declspec(naked) void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb,
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]  /* src_argb */
-    mov        edi, [esp + 8 + 8]  /* dst_argb */
+    mov        eax, [esp + 8 + 4] /* src_argb */
+    mov        edi, [esp + 8 + 8] /* dst_argb */
     mov        ecx, [esp + 8 + 12] /* width */
     movd       xmm2, dword ptr [esp + 8 + 16]  // luma table
     movd       xmm3, dword ptr [esp + 8 + 20]  // lumacoeff
@@ -6314,7 +6314,7 @@ __declspec(naked) void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb,
     psllw      xmm4, 8
     pxor       xmm5, xmm5
 
-    // 4 pixel loop.
+        // 4 pixel loop.
   convertloop:
     movdqu     xmm0, xmmword ptr [eax]  // generate luma ptr
     pmaddubsw  xmm0, xmm3