Fix ConvertToI420 when using YUY2 or UYVY with odd crop_x.

- swap U and V planes when crop_x is odd
- document YUY2 and UYVY formats
- apply clang-format

Bug: libyuv:902
Change-Id: I045e44c907f4a9eb625d7c024b669bb308055f32
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/3039549
Reviewed-by: Mirko Bonadei <mbonadei@chromium.org>
Frank Barchard, 2021-07-19 12:04:32 -07:00
commit 639dd4ea76 (parent 0572e0a0b1)
5 changed files with 147 additions and 133 deletions

README.chromium

@@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
-Version: 1788
+Version: 1789
License: BSD
License File: LICENSE

docs/formats.md

@@ -189,7 +189,6 @@ In memory R is the lowest and A is the highest.
Each channel has value ranges from 0 to 65535.
AR64 is similar to ARGB.
# NV12 and NV21
NV12 is a biplanar format with a full sized Y plane followed by a single
@@ -200,3 +199,10 @@ height chroma channel, and therefore is a 420 subsampling.
NV16 is 16 bits per pixel, with half width and full height. aka 422.
NV24 is 24 bits per pixel with full sized chroma channel. aka 444.
Most NV12 functions allow the destination Y pointer to be NULL.
+# YUY2 and UYVY
+YUY2 is a packed YUV format with half width, full height chroma (4:2:2).
+YUY2 is ordered Y0 U Y1 V in memory.
+UYVY is ordered U Y0 V Y1 in memory.
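As an aside for readers new to these FOURCCs, here is a standalone sketch of the byte order the new docs describe (an editor's illustration, not part of the committed change); two luma samples share one U and one V sample:

// Sketch: YUY2 vs UYVY byte order for one two-pixel macropixel (4:2:2).
#include <cstdint>
#include <cstdio>

int main() {
  uint8_t y0 = 0x10, y1 = 0x20, u = 0x80, v = 0x90;
  uint8_t yuy2[4] = {y0, u, y1, v};  // YUY2: Y0 U Y1 V
  uint8_t uyvy[4] = {u, y0, v, y1};  // UYVY: U Y0 V Y1
  std::printf("YUY2: %02X %02X %02X %02X\n", yuy2[0], yuy2[1], yuy2[2], yuy2[3]);
  std::printf("UYVY: %02X %02X %02X %02X\n", uyvy[0], uyvy[1], uyvy[2], uyvy[3]);
  return 0;
}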

include/libyuv/version.h

@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1788
+#define LIBYUV_VERSION 1789
#endif // INCLUDE_LIBYUV_VERSION_H_

source/convert_to_i420.cc

@@ -89,18 +89,26 @@ int ConvertToI420(const uint8_t* sample,
   switch (format) {
     // Single plane formats
-    case FOURCC_YUY2:
+    case FOURCC_YUY2: {  // TODO(fbarchard): Find better odd crop fix.
+      uint8_t* u = (crop_x & 1) ? dst_v : dst_u;
+      uint8_t* v = (crop_x & 1) ? dst_u : dst_v;
+      int stride_u = (crop_x & 1) ? dst_stride_v : dst_stride_u;
+      int stride_v = (crop_x & 1) ? dst_stride_u : dst_stride_v;
       src = sample + (aligned_src_width * crop_y + crop_x) * 2;
-      r = YUY2ToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, dst_u,
-                     dst_stride_u, dst_v, dst_stride_v, crop_width,
-                     inv_crop_height);
+      r = YUY2ToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, u,
+                     stride_u, v, stride_v, crop_width, inv_crop_height);
       break;
-    case FOURCC_UYVY:
+    }
+    case FOURCC_UYVY: {
+      uint8_t* u = (crop_x & 1) ? dst_v : dst_u;
+      uint8_t* v = (crop_x & 1) ? dst_u : dst_v;
+      int stride_u = (crop_x & 1) ? dst_stride_v : dst_stride_u;
+      int stride_v = (crop_x & 1) ? dst_stride_u : dst_stride_v;
       src = sample + (aligned_src_width * crop_y + crop_x) * 2;
-      r = UYVYToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, dst_u,
-                     dst_stride_u, dst_v, dst_stride_v, crop_width,
-                     inv_crop_height);
+      r = UYVYToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, u,
+                     stride_u, v, stride_v, crop_width, inv_crop_height);
       break;
+    }
     case FOURCC_RGBP:
       src = sample + (src_width * crop_y + crop_x) * 2;
       r = RGB565ToI420(src, src_width * 2, dst_y, dst_stride_y, dst_u,
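The swap exists because YUY2/UYVY chroma repeats every two luma pixels: cropping an odd number of luma pixels means the byte the converter reads as U is really V, and vice versa. A small standalone demo (an editor's illustration, not part of the commit):

// Demo: why an odd crop_x swaps chroma in YUY2 (Y0 U Y1 V per 2 pixels).
#include <cstdint>
#include <cstdio>

int main() {
  // One row of 4 pixels. Y = 10..13, chroma pairs (U0=20, V0=30), (U1=21, V1=31).
  uint8_t row[8] = {10, 20, 11, 30, 12, 21, 13, 31};
  int crop_x = 1;                          // odd crop, 2 bytes per luma pixel
  const uint8_t* src = row + crop_x * 2;
  uint8_t u = src[1], v = src[3];          // naive YUYV read from the crop
  std::printf("read U=%d V=%d (actually V0=30 and U1=21)\n", u, v);
  uint8_t fixed_u = (crop_x & 1) ? v : u;  // the fix: route each sample to the
  uint8_t fixed_v = (crop_x & 1) ? u : v;  // opposite plane when crop_x is odd
  std::printf("after swap U=%d V=%d\n", fixed_u, fixed_v);
  return 0;
}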

source/row_win.cc

@@ -2584,7 +2584,7 @@ __declspec(naked) void I422ToRGBARow_AVX2(
__asm psraw xmm2, 6 \
__asm packuswb xmm0, xmm0 /* B */ \
__asm packuswb xmm1, xmm1 /* G */ \
__asm packuswb xmm2, xmm2 /* R */ \
}
// Store 8 ARGB values.
@@ -4746,22 +4746,22 @@ __declspec(naked) void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) {
pmaddubsw xmm6, xmm2
phaddw xmm0, xmm6
psrlw xmm0, 7
packuswb xmm0, xmm0 // 8 B values
movdqu xmm5, [eax] // G
movdqu xmm1, [eax + 16]
pmaddubsw xmm5, xmm3
pmaddubsw xmm1, xmm3
phaddw xmm5, xmm1
psrlw xmm5, 7
packuswb xmm5, xmm5 // 8 G values
punpcklbw xmm0, xmm5 // 8 BG values
movdqu xmm5, [eax] // R
movdqu xmm1, [eax + 16]
pmaddubsw xmm5, xmm4
pmaddubsw xmm1, xmm4
phaddw xmm5, xmm1
psrlw xmm5, 7
packuswb xmm5, xmm5 // 8 R values
movdqu xmm6, [eax] // A
movdqu xmm1, [eax + 16]
psrld xmm6, 24
@@ -4811,25 +4811,25 @@ __declspec(naked) void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb,
movdqu xmm1, [eax + 16]
pmaddubsw xmm6, xmm3
pmaddubsw xmm1, xmm3
phaddsw xmm0, xmm7 // B
phaddsw xmm6, xmm1 // G
psraw xmm0, 6 // B
psraw xmm6, 6 // G
packuswb xmm0, xmm0 // 8 B values
packuswb xmm6, xmm6 // 8 G values
punpcklbw xmm0, xmm6 // 8 BG values
movdqu xmm1, [eax] // R
movdqu xmm7, [eax + 16]
pmaddubsw xmm1, xmm4
pmaddubsw xmm7, xmm4
phaddsw xmm1, xmm7 // R
movdqu xmm6, [eax] // A
movdqu xmm7, [eax + 16]
pmaddubsw xmm6, xmm5
pmaddubsw xmm7, xmm5
phaddsw xmm6, xmm7 // A
psraw xmm1, 6 // R
psraw xmm6, 6 // A
packuswb xmm1, xmm1 // 8 R values
packuswb xmm6, xmm6 // 8 A values
punpcklbw xmm1, xmm6 // 8 RA values
@@ -4872,16 +4872,16 @@ __declspec(naked) void ARGBQuantizeRow_SSE2(uint8_t* dst_argb,
convertloop:
movdqu xmm0, [eax] // read 4 pixels
punpcklbw xmm0, xmm5 // first 2 pixels
pmulhuw xmm0, xmm2 // pixel * scale >> 16
movdqu xmm1, [eax] // read 4 pixels
punpckhbw xmm1, xmm5 // next 2 pixels
pmulhuw xmm1, xmm2
pmullw xmm0, xmm3 // * interval_size
movdqu xmm7, [eax] // read 4 pixels
pmullw xmm1, xmm3
pand xmm7, xmm6 // mask alpha
paddw xmm0, xmm4 // + interval_size / 2
paddw xmm1, xmm4
packuswb xmm0, xmm1
por xmm0, xmm7
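
This hunk only reflows comments; the math those comments describe is roughly the following scalar sketch (an editor's paraphrase, not the library's C path; it assumes scale and interval_size are chosen so values stay in range, where the SSE2 code saturates via packuswb):

// Posterize B, G, R to interval_size steps; alpha (argb[i + 3]) is kept.
#include <cstdint>

void QuantizeSketch(uint8_t* argb, int scale, int interval_size, int width) {
  for (int i = 0; i < width * 4; i += 4) {
    for (int c = 0; c < 3; ++c) {            // B, G, R
      int q = (argb[i + c] * scale) >> 16;   // pmulhuw: pixel * scale >> 16
      argb[i + c] = (uint8_t)(q * interval_size        // pmullw
                              + interval_size / 2);    // paddw
    }
  }
}
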
@@ -4901,9 +4901,9 @@ __declspec(naked) void ARGBShadeRow_SSE2(const uint8_t* src_argb,
int width,
uint32_t value) {
__asm {
mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // width
movd xmm2, [esp + 16] // value
punpcklbw xmm2, xmm2
punpcklqdq xmm2, xmm2
@@ -4912,10 +4912,10 @@ __declspec(naked) void ARGBShadeRow_SSE2(const uint8_t* src_argb,
movdqu xmm0, [eax] // read 4 pixels
lea eax, [eax + 16]
movdqa xmm1, xmm0
punpcklbw xmm0, xmm0 // first 2
punpckhbw xmm1, xmm1 // next 2
pmulhuw xmm0, xmm2 // argb * value
pmulhuw xmm1, xmm2 // argb * value
psrlw xmm0, 8
psrlw xmm1, 8
packuswb xmm0, xmm1
@@ -4937,23 +4937,23 @@ __declspec(naked) void ARGBMultiplyRow_SSE2(const uint8_t* src_argb,
int width) {
__asm {
push esi
mov eax, [esp + 4 + 4] // src_argb
mov esi, [esp + 4 + 8] // src_argb1
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
pxor xmm5, xmm5 // constant 0
convertloop:
movdqu xmm0, [eax] // read 4 pixels from src_argb
movdqu xmm2, [esi] // read 4 pixels from src_argb1
movdqu xmm1, xmm0
movdqu xmm3, xmm2
punpcklbw xmm0, xmm0 // first 2
punpckhbw xmm1, xmm1 // next 2
punpcklbw xmm2, xmm5 // first 2
punpckhbw xmm3, xmm5 // next 2
pmulhuw xmm0, xmm2 // src_argb * src_argb1 first 2
pmulhuw xmm1, xmm3 // src_argb * src_argb1 next 2
lea eax, [eax + 16]
lea esi, [esi + 16]
packuswb xmm0, xmm1
@@ -4977,8 +4977,8 @@ __declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb,
int width) {
__asm {
push esi
mov eax, [esp + 4 + 4] // src_argb
mov esi, [esp + 4 + 8] // src_argb1
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
@@ -4990,7 +4990,7 @@ __declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb,
lea eax, [eax + 16]
movdqu xmm1, [esi] // read 4 pixels from src_argb1
lea esi, [esi + 16]
paddusb xmm0, xmm1 // src_argb + src_argb1
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 4
@@ -5005,7 +5005,7 @@ __declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb,
lea eax, [eax + 4]
movd xmm1, [esi] // read 1 pixels from src_argb1
lea esi, [esi + 4]
paddusb xmm0, xmm1 // src_argb + src_argb1
movd [edx], xmm0
lea edx, [edx + 4]
sub ecx, 1
@@ -5026,8 +5026,8 @@ __declspec(naked) void ARGBSubtractRow_SSE2(const uint8_t* src_argb,
int width) {
__asm {
push esi
mov eax, [esp + 4 + 4] // src_argb
mov esi, [esp + 4 + 8] // src_argb1
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
@@ -5036,7 +5036,7 @@ __declspec(naked) void ARGBSubtractRow_SSE2(const uint8_t* src_argb,
lea eax, [eax + 16]
movdqu xmm1, [esi] // read 4 pixels from src_argb1
lea esi, [esi + 16]
psubusb xmm0, xmm1 // src_argb - src_argb1
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 4
@@ -5056,11 +5056,11 @@ __declspec(naked) void ARGBMultiplyRow_AVX2(const uint8_t* src_argb,
int width) {
__asm {
push esi
mov eax, [esp + 4 + 4] // src_argb
mov esi, [esp + 4 + 8] // src_argb1
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
vpxor ymm5, ymm5, ymm5 // constant 0
convertloop:
vmovdqu ymm1, [eax] // read 8 pixels from src_argb
@@ -5094,8 +5094,8 @@ __declspec(naked) void ARGBAddRow_AVX2(const uint8_t* src_argb,
int width) {
__asm {
push esi
mov eax, [esp + 4 + 4] // src_argb
mov esi, [esp + 4 + 8] // src_argb1
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
@@ -5124,8 +5124,8 @@ __declspec(naked) void ARGBSubtractRow_AVX2(const uint8_t* src_argb,
int width) {
__asm {
push esi
mov eax, [esp + 4 + 4] // src_argb
mov esi, [esp + 4 + 8] // src_argb1
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
@@ -5159,8 +5159,8 @@ __declspec(naked) void SobelXRow_SSE2(const uint8_t* src_y0,
__asm {
push esi
push edi
mov eax, [esp + 8 + 4] // src_y0
mov esi, [esp + 8 + 8] // src_y1
mov edi, [esp + 8 + 12] // src_y2
mov edx, [esp + 8 + 16] // dst_sobelx
mov ecx, [esp + 8 + 20] // width
@@ -5170,17 +5170,17 @@ __declspec(naked) void SobelXRow_SSE2(const uint8_t* src_y0,
pxor xmm5, xmm5 // constant 0
convertloop:
movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
movq xmm1, qword ptr [eax + 2] // read 8 pixels from src_y0[2]
punpcklbw xmm0, xmm5
punpcklbw xmm1, xmm5
psubw xmm0, xmm1
movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0]
movq xmm2, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2]
punpcklbw xmm1, xmm5
punpcklbw xmm2, xmm5
psubw xmm1, xmm2
movq xmm2, qword ptr [eax + edi] // read 8 pixels from src_y2[0]
movq xmm3, qword ptr [eax + edi + 2] // read 8 pixels from src_y2[2]
punpcklbw xmm2, xmm5
punpcklbw xmm3, xmm5
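
The loads commented above give away SobelX's tap pattern; in scalar form it is roughly this sketch (an editor's reconstruction from the comments, not code from this commit): a horizontal difference of columns 0 and 2 on three rows, the middle row weighted twice, then saturation to a byte.

#include <cstdint>
#include <cstdlib>

void SobelXSketch(const uint8_t* src_y0, const uint8_t* src_y1,
                  const uint8_t* src_y2, uint8_t* dst_sobelx, int width) {
  for (int i = 0; i < width; ++i) {
    int a = src_y0[i] - src_y0[i + 2];
    int b = src_y1[i] - src_y1[i + 2];
    int c = src_y2[i] - src_y2[i + 2];
    int sobel = std::abs(a + b + b + c);
    dst_sobelx[i] = (uint8_t)(sobel > 255 ? 255 : sobel);  // saturate like packuswb
  }
}
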
@@ -5215,8 +5215,8 @@ __declspec(naked) void SobelYRow_SSE2(const uint8_t* src_y0,
int width) {
__asm {
push esi
mov eax, [esp + 4 + 4] // src_y0
mov esi, [esp + 4 + 8] // src_y1
mov edx, [esp + 4 + 12] // dst_sobely
mov ecx, [esp + 4 + 16] // width
sub esi, eax
@@ -5224,17 +5224,17 @@ __declspec(naked) void SobelYRow_SSE2(const uint8_t* src_y0,
pxor xmm5, xmm5 // constant 0
convertloop:
movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0]
punpcklbw xmm0, xmm5
punpcklbw xmm1, xmm5
psubw xmm0, xmm1
movq xmm1, qword ptr [eax + 1] // read 8 pixels from src_y0[1]
movq xmm2, qword ptr [eax + esi + 1] // read 8 pixels from src_y1[1]
punpcklbw xmm1, xmm5
punpcklbw xmm2, xmm5
psubw xmm1, xmm2
movq xmm2, qword ptr [eax + 2] // read 8 pixels from src_y0[2]
movq xmm3, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2]
punpcklbw xmm2, xmm5
punpcklbw xmm3, xmm5
@@ -5269,8 +5269,8 @@ __declspec(naked) void SobelRow_SSE2(const uint8_t* src_sobelx,
int width) {
__asm {
push esi
mov eax, [esp + 4 + 4] // src_sobelx
mov esi, [esp + 4 + 8] // src_sobely
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
sub esi, eax
@@ -5278,7 +5278,7 @@ __declspec(naked) void SobelRow_SSE2(const uint8_t* src_sobelx,
pslld xmm5, 24 // 0xff000000
convertloop:
movdqu xmm0, [eax] // read 16 pixels src_sobelx
movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
lea eax, [eax + 16]
paddusb xmm0, xmm1 // sobel = sobelx + sobely
@@ -5317,8 +5317,8 @@ __declspec(naked) void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx,
int width) {
__asm {
push esi
mov eax, [esp + 4 + 4] // src_sobelx
mov esi, [esp + 4 + 8] // src_sobely
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
sub esi, eax
@@ -5351,15 +5351,15 @@ __declspec(naked) void SobelXYRow_SSE2(const uint8_t* src_sobelx,
int width) {
__asm {
push esi
mov eax, [esp + 4 + 4] // src_sobelx
mov esi, [esp + 4 + 8] // src_sobely
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
sub esi, eax
pcmpeqb xmm5, xmm5 // alpha 255
convertloop:
movdqu xmm0, [eax] // read 16 pixels src_sobelx
movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
lea eax, [eax + 16]
movdqa xmm2, xmm0
@@ -5529,7 +5529,7 @@ void CumulativeSumToAverageRow_SSE2(const int32_t* topleft,
add ecx, 4 - 1
jl l1b
// 1 pixel loop
l1:
movdqu xmm0, [eax]
psubd xmm0, [eax + edx * 4]
@@ -5571,7 +5571,7 @@ void ComputeCumulativeSumRow_SSE2(const uint8_t* row,
test edx, 15
jne l4b
// 4 pixel loop
l4:
movdqu xmm2, [eax] // 4 argb pixels 16 bytes.
lea eax, [eax + 16]
@@ -5617,7 +5617,7 @@ void ComputeCumulativeSumRow_SSE2(const uint8_t* row,
add ecx, 4 - 1
jl l1b
// 1 pixel loop
l1:
movd xmm2, dword ptr [eax] // 1 argb pixel
lea eax, [eax + 4]
@@ -5651,7 +5651,7 @@ __declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8_t* src_argb,
mov esi, [esp + 16] // stride
mov edx, [esp + 20] // dst_argb
mov ecx, [esp + 24] // pointer to uv_dudv
movq xmm2, qword ptr [ecx] // uv
movq xmm7, qword ptr [ecx + 8] // dudv
mov ecx, [esp + 28] // width
shl esi, 16 // 4, stride
@@ -5660,7 +5660,7 @@ __declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8_t* src_argb,
sub ecx, 4
jl l4b
// setup for 4 pixel loop
pshufd xmm7, xmm7, 0x44 // dup dudv
pshufd xmm5, xmm5, 0 // dup 4, stride
movdqa xmm0, xmm2 // x0, y0, x1, y1
@@ -5672,16 +5672,16 @@ __declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8_t* src_argb,
addps xmm3, xmm4
addps xmm4, xmm4 // dudv *= 4
// 4 pixel loop
l4:
cvttps2dq xmm0, xmm2 // x, y float to int first 2
cvttps2dq xmm1, xmm3 // x, y float to int next 2
packssdw xmm0, xmm1 // x, y as 8 shorts
pmaddwd xmm0, xmm5 // offsets = x * 4 + y * stride.
movd esi, xmm0
pshufd xmm0, xmm0, 0x39 // shift right
movd edi, xmm0
pshufd xmm0, xmm0, 0x39 // shift right
movd xmm1, [eax + esi] // read pixel 0
movd xmm6, [eax + edi] // read pixel 1
punpckldq xmm1, xmm6 // combine pixel 0 and 1
@@ -5733,8 +5733,8 @@ __declspec(naked) void InterpolateRow_AVX2(uint8_t* dst_ptr,
__asm {
push esi
push edi
mov edi, [esp + 8 + 4] // dst_ptr
mov esi, [esp + 8 + 8] // src_ptr
mov edx, [esp + 8 + 12] // src_stride
mov ecx, [esp + 8 + 16] // dst_width
mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
@@ -5743,7 +5743,7 @@ __declspec(naked) void InterpolateRow_AVX2(uint8_t* dst_ptr,
je xloop100 // 0 / 256. Blend 100 / 0.
sub edi, esi
cmp eax, 128
je xloop50 // 128 / 256 is 0.50. Blend 50 / 50.
vmovd xmm0, eax // high fraction 0..255
neg eax
@@ -5770,7 +5770,7 @@ __declspec(naked) void InterpolateRow_AVX2(uint8_t* dst_ptr,
vpaddw ymm0, ymm0, ymm4
vpsrlw ymm1, ymm1, 8
vpsrlw ymm0, ymm0, 8
vpackuswb ymm0, ymm0, ymm1 // unmutates
vmovdqu [esi + edi], ymm0
lea esi, [esi + 32]
sub ecx, 32
@@ -5811,17 +5811,17 @@ __declspec(naked) void InterpolateRow_SSSE3(uint8_t* dst_ptr,
push esi
push edi
mov edi, [esp + 8 + 4] // dst_ptr
mov esi, [esp + 8 + 8] // src_ptr
mov edx, [esp + 8 + 12] // src_stride
mov ecx, [esp + 8 + 16] // dst_width
mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
sub edi, esi
// Dispatch to specialized filters if applicable.
cmp eax, 0
je xloop100 // 0 /256. Blend 100 / 0.
cmp eax, 128
je xloop50 // 128 / 256 is 0.50. Blend 50 / 50.
movd xmm0, eax // high fraction 0..255
neg eax
@@ -5840,7 +5840,7 @@ __declspec(naked) void InterpolateRow_SSSE3(uint8_t* dst_ptr,
movdqu xmm1, xmm0
punpcklbw xmm0, xmm2
punpckhbw xmm1, xmm2
psubb xmm0, xmm4 // bias image by -128
psubb xmm1, xmm4
movdqa xmm2, xmm5
movdqa xmm3, xmm5
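
The InterpolateRow hunks only realign comments; per those comments (fraction 0 copies row 0 via xloop100, 128 blends 50/50 via xloop50), the blend is roughly this scalar sketch (an editor's assumption-level sketch, not the library's exact C path):

#include <cstdint>

void InterpolateRowSketch(uint8_t* dst, const uint8_t* src0,
                          const uint8_t* src1, int width, int fraction) {
  int f1 = fraction;  // weight of the second row, 0..255
  int f0 = 256 - f1;  // weight of the first row
  for (int x = 0; x < width; ++x) {
    // +128 rounds before the >> 8, matching the paddw/psrlw pair above.
    dst[x] = (uint8_t)((src0[x] * f0 + src1[x] * f1 + 128) >> 8);
  }
}
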
@@ -5889,8 +5889,8 @@ __declspec(naked) void ARGBShuffleRow_SSSE3(const uint8_t* src_argb,
const uint8_t* shuffler,
int width) {
__asm {
mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // shuffler
movdqu xmm5, [ecx]
mov ecx, [esp + 16] // width
@@ -5916,8 +5916,8 @@ __declspec(naked) void ARGBShuffleRow_AVX2(const uint8_t* src_argb,
const uint8_t* shuffler,
int width) {
__asm {
mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // shuffler
vbroadcastf128 ymm5, [ecx] // same shuffle in high as low.
mov ecx, [esp + 16] // width
@@ -5954,18 +5954,18 @@ __declspec(naked) void I422ToYUY2Row_SSE2(const uint8_t* src_y,
__asm {
push esi
push edi
mov eax, [esp + 8 + 4] // src_y
mov esi, [esp + 8 + 8] // src_u
mov edx, [esp + 8 + 12] // src_v
mov edi, [esp + 8 + 16] // dst_frame
mov ecx, [esp + 8 + 20] // width
sub edx, esi
convertloop:
movq xmm2, qword ptr [esi] // U
movq xmm3, qword ptr [esi + edx] // V
lea esi, [esi + 8]
punpcklbw xmm2, xmm3 // UV
movdqu xmm0, [eax] // Y
lea eax, [eax + 16]
movdqa xmm1, xmm0
@@ -5991,22 +5991,22 @@ __declspec(naked) void I422ToUYVYRow_SSE2(const uint8_t* src_y,
__asm {
push esi
push edi
mov eax, [esp + 8 + 4] // src_y
mov esi, [esp + 8 + 8] // src_u
mov edx, [esp + 8 + 12] // src_v
mov edi, [esp + 8 + 16] // dst_frame
mov ecx, [esp + 8 + 20] // width
sub edx, esi
convertloop:
movq xmm2, qword ptr [esi] // U
movq xmm3, qword ptr [esi + edx] // V
lea esi, [esi + 8]
punpcklbw xmm2, xmm3 // UV
movdqu xmm0, [eax] // Y
movdqa xmm1, xmm2
lea eax, [eax + 16]
punpcklbw xmm1, xmm0 // UYVY
punpckhbw xmm2, xmm0
movdqu [edi], xmm1
movdqu [edi + 16], xmm2
@@ -6033,10 +6033,10 @@ __declspec(naked) void ARGBPolynomialRow_SSE2(const uint8_t* src_argb,
mov ecx, [esp + 4 + 16] /* width */
pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints.
// 2 pixel loop.
convertloop:
// pmovzxbd xmm0, dword ptr [eax] // BGRA pixel
// pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel
movq xmm0, qword ptr [eax] // BGRABGRA
lea eax, [eax + 8]
punpcklbw xmm0, xmm3
@@ -6085,8 +6085,8 @@ __declspec(naked) void ARGBPolynomialRow_AVX2(const uint8_t* src_argb,
const float* poly,
int width) {
__asm {
mov eax, [esp + 4] /* src_argb */
mov edx, [esp + 8] /* dst_argb */
mov ecx, [esp + 12] /* poly */
vbroadcastf128 ymm4, [ecx] // C0
vbroadcastf128 ymm5, [ecx + 16] // C1
@@ -6125,8 +6125,8 @@ __declspec(naked) void HalfFloatRow_SSE2(const uint16_t* src,
float scale,
int width) {
__asm {
mov eax, [esp + 4] /* src */
mov edx, [esp + 8] /* dst */
movd xmm4, dword ptr [esp + 12] /* scale */
mov ecx, [esp + 16] /* width */
mulss xmm4, kExpBias
@@ -6134,7 +6134,7 @@ __declspec(naked) void HalfFloatRow_SSE2(const uint16_t* src,
pxor xmm5, xmm5
sub edx, eax
// 8 pixel loop.
convertloop:
movdqu xmm2, xmmword ptr [eax] // 8 shorts
add eax, 16
@@ -6172,7 +6172,7 @@ __declspec(naked) void HalfFloatRow_AVX2(const uint16_t* src,
vpxor ymm5, ymm5, ymm5
sub edx, eax
// 16 pixel loop.
convertloop:
vmovdqu ymm2, [eax] // 16 shorts
add eax, 32
@@ -6182,7 +6182,7 @@ __declspec(naked) void HalfFloatRow_AVX2(const uint16_t* src,
vcvtdq2ps ymm2, ymm2
vmulps ymm3, ymm3, ymm4 // scale to adjust exponent for 5 bit range.
vmulps ymm2, ymm2, ymm4
vpsrld ymm3, ymm3, 13 // float convert to 8 half floats truncate
vpsrld ymm2, ymm2, 13
vpackssdw ymm2, ymm2, ymm3
vmovdqu [eax + edx - 32], ymm2
@@ -6200,22 +6200,22 @@ __declspec(naked) void HalfFloatRow_F16C(const uint16_t* src,
float scale,
int width) {
__asm {
mov eax, [esp + 4] /* src */
mov edx, [esp + 8] /* dst */
vbroadcastss ymm4, [esp + 12] /* scale */
mov ecx, [esp + 16] /* width */
sub edx, eax
// 16 pixel loop.
convertloop:
vpmovzxwd ymm2, xmmword ptr [eax] // 8 shorts -> 8 ints
vpmovzxwd ymm3, xmmword ptr [eax + 16] // 8 more shorts
add eax, 32
vcvtdq2ps ymm2, ymm2 // convert 8 ints to floats
vcvtdq2ps ymm3, ymm3
vmulps ymm2, ymm2, ymm4 // scale to normalized range 0 to 1
vmulps ymm3, ymm3, ymm4
vcvtps2ph xmm2, ymm2, 3 // float convert to 8 half floats truncate
vcvtps2ph xmm3, ymm3, 3
vmovdqu [eax + edx + 32], xmm2
vmovdqu [eax + edx + 32 + 16], xmm3
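
Per element, the F16C loop above amounts to roughly one intrinsic (an editor's sketch, not from the commit): normalize the 16-bit sample by scale, then convert float to IEEE half. _cvtss_sh is the scalar F16C intrinsic behind vcvtps2ph, and the rounding immediate 3 truncates, matching the comment.

#include <cstdint>
#include <immintrin.h>

void HalfFloatSketch(const uint16_t* src, uint16_t* dst, float scale,
                     int width) {
  for (int i = 0; i < width; ++i) {
    dst[i] = _cvtss_sh((float)src[i] * scale, 3);  // 3 = round toward zero
  }
}
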
@@ -6234,8 +6234,8 @@ __declspec(naked) void ARGBColorTableRow_X86(uint8_t* dst_argb,
int width) {
__asm {
push esi
mov eax, [esp + 4 + 4] /* dst_argb */
mov esi, [esp + 4 + 8] /* table_argb */
mov ecx, [esp + 4 + 12] /* width */
// 1 pixel loop.
@@ -6268,8 +6268,8 @@ __declspec(naked) void RGBColorTableRow_X86(uint8_t* dst_argb,
int width) {
__asm {
push esi
mov eax, [esp + 4 + 4] /* dst_argb */
mov esi, [esp + 4 + 8] /* table_argb */
mov ecx, [esp + 4 + 12] /* width */
// 1 pixel loop.
@@ -6303,8 +6303,8 @@ __declspec(naked) void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb,
__asm {
push esi
push edi
mov eax, [esp + 8 + 4] /* src_argb */
mov edi, [esp + 8 + 8] /* dst_argb */
mov ecx, [esp + 8 + 12] /* width */
movd xmm2, dword ptr [esp + 8 + 16] // luma table
movd xmm3, dword ptr [esp + 8 + 20] // lumacoeff
@@ -6314,7 +6314,7 @@ __declspec(naked) void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb,
psllw xmm4, 8
pxor xmm5, xmm5
// 4 pixel loop.
convertloop:
movdqu xmm0, xmmword ptr [eax] // generate luma ptr
pmaddubsw xmm0, xmm3