From 92e22cf5b66173d5d5056751ca62bc2254e4ff86 Mon Sep 17 00:00:00 2001
From: Frank Barchard
Date: Wed, 24 Jan 2018 10:52:17 -0800
Subject: [PATCH] Lint cleanup after C99 change CL

TBR=braveyao@chromium.org
Bug: libyuv:774
Test: git cl lint
Change-Id: I51cf8107a8db17fbc9952d610f3e4d7aac5aa743
Reviewed-on: https://chromium-review.googlesource.com/882217
Reviewed-by: Frank Barchard
---
 include/libyuv/compare.h      |   23 +-
 include/libyuv/compare_row.h  |   44 +-
 include/libyuv/convert.h      |    5 +-
 include/libyuv/macros_msa.h   |   84 +-
 include/libyuv/row.h          |  283 +-
 include/libyuv/scale_row.h    |   16 +-
 include/libyuv/video_common.h |    4 +-
 source/compare.cc             |   44 +-
 source/compare_common.cc      |   14 +-
 source/compare_gcc.cc         |  187 +-
 source/compare_msa.cc         |    8 +-
 source/compare_neon.cc        |    8 +-
 source/compare_neon64.cc      |    8 +-
 source/compare_win.cc         |    6 +-
 source/convert.cc             |   78 +-
 source/convert_argb.cc        |   26 +-
 source/convert_from.cc        |    2 +-
 source/convert_from_argb.cc   |   57 +-
 source/convert_jpeg.cc        |    5 +-
 source/planar_functions.cc    |   57 +-
 source/rotate.cc              |    4 +-
 source/rotate_any.cc          |    9 +-
 source/row_any.cc             |  359 +-
 source/row_common.cc          |  183 +-
 source/row_gcc.cc             | 7664 ++++++++++++++++-----------------
 source/row_msa.cc             |   53 +-
 source/row_neon.cc            |   37 +-
 source/row_neon64.cc          |   37 +-
 source/row_win.cc             |   48 +-
 source/scale.cc               |   25 +-
 source/scale_any.cc           |   88 +-
 source/scale_argb.cc          |    4 +-
 source/scale_common.cc        |    4 +-
 source/scale_gcc.cc           | 1825 ++++----
 source/scale_neon64.cc        |   14 +-
 source/video_common.cc        |    7 +-
 unit_test/color_test.cc       |   10 +-
 unit_test/compare_test.cc     |    4 +-
 unit_test/convert_test.cc     |  105 +-
 unit_test/planar_test.cc      |   19 +-
 unit_test/scale_argb_test.cc  |   18 +-
 unit_test/unit_test.h         |    4 +-
 util/psnr.cc                  |   24 +-
 util/ssim.cc                  |    2 +-
 44 files changed, 5881 insertions(+), 5625 deletions(-)

diff --git a/include/libyuv/compare.h b/include/libyuv/compare.h
index 79405a218..3353ad71c 100644
--- a/include/libyuv/compare.h
+++ b/include/libyuv/compare.h
@@ -25,25 +25,30 @@ uint32_t HashDjb2(const uint8_t* src, uint64_t count, uint32_t seed);
 
 // Hamming Distance
 LIBYUV_API
 uint64_t ComputeHammingDistance(const uint8_t* src_a,
-                                const uint8_t* src_b,
-                                int count);
+                                const uint8_t* src_b,
+                                int count);
 
 // Scan an opaque argb image and return fourcc based on alpha offset.
 // Returns FOURCC_ARGB, FOURCC_BGRA, or 0 if unknown.
 LIBYUV_API
-uint32_t ARGBDetect(const uint8_t* argb, int stride_argb, int width, int height);
+uint32_t ARGBDetect(const uint8_t* argb,
+                    int stride_argb,
+                    int width,
+                    int height);
 
 // Sum Square Error - used to compute Mean Square Error or PSNR.
LIBYUV_API -uint64_t ComputeSumSquareError(const uint8_t* src_a, const uint8_t* src_b, int count); +uint64_t ComputeSumSquareError(const uint8_t* src_a, + const uint8_t* src_b, + int count); LIBYUV_API uint64_t ComputeSumSquareErrorPlane(const uint8_t* src_a, - int stride_a, - const uint8_t* src_b, - int stride_b, - int width, - int height); + int stride_a, + const uint8_t* src_b, + int stride_b, + int width, + int height); static const int kMaxPsnr = 128; diff --git a/include/libyuv/compare_row.h b/include/libyuv/compare_row.h index 2fb451ca2..72ee74060 100644 --- a/include/libyuv/compare_row.h +++ b/include/libyuv/compare_row.h @@ -90,18 +90,40 @@ extern "C" { #define HAS_SUMSQUAREERROR_MSA #endif -uint32_t HammingDistance_C(const uint8_t* src_a, const uint8_t* src_b, int count); -uint32_t HammingDistance_SSE42(const uint8_t* src_a, const uint8_t* src_b, int count); -uint32_t HammingDistance_SSSE3(const uint8_t* src_a, const uint8_t* src_b, int count); -uint32_t HammingDistance_AVX2(const uint8_t* src_a, const uint8_t* src_b, int count); -uint32_t HammingDistance_NEON(const uint8_t* src_a, const uint8_t* src_b, int count); -uint32_t HammingDistance_MSA(const uint8_t* src_a, const uint8_t* src_b, int count); +uint32_t HammingDistance_C(const uint8_t* src_a, + const uint8_t* src_b, + int count); +uint32_t HammingDistance_SSE42(const uint8_t* src_a, + const uint8_t* src_b, + int count); +uint32_t HammingDistance_SSSE3(const uint8_t* src_a, + const uint8_t* src_b, + int count); +uint32_t HammingDistance_AVX2(const uint8_t* src_a, + const uint8_t* src_b, + int count); +uint32_t HammingDistance_NEON(const uint8_t* src_a, + const uint8_t* src_b, + int count); +uint32_t HammingDistance_MSA(const uint8_t* src_a, + const uint8_t* src_b, + int count); -uint32_t SumSquareError_C(const uint8_t* src_a, const uint8_t* src_b, int count); -uint32_t SumSquareError_SSE2(const uint8_t* src_a, const uint8_t* src_b, int count); -uint32_t SumSquareError_AVX2(const uint8_t* src_a, const uint8_t* src_b, int count); -uint32_t SumSquareError_NEON(const uint8_t* src_a, const uint8_t* src_b, int count); -uint32_t SumSquareError_MSA(const uint8_t* src_a, const uint8_t* src_b, int count); +uint32_t SumSquareError_C(const uint8_t* src_a, + const uint8_t* src_b, + int count); +uint32_t SumSquareError_SSE2(const uint8_t* src_a, + const uint8_t* src_b, + int count); +uint32_t SumSquareError_AVX2(const uint8_t* src_a, + const uint8_t* src_b, + int count); +uint32_t SumSquareError_NEON(const uint8_t* src_a, + const uint8_t* src_b, + int count); +uint32_t SumSquareError_MSA(const uint8_t* src_a, + const uint8_t* src_b, + int count); uint32_t HashDjb2_C(const uint8_t* src, int count, uint32_t seed); uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed); diff --git a/include/libyuv/convert.h b/include/libyuv/convert.h index 66df12200..e8c5f355d 100644 --- a/include/libyuv/convert.h +++ b/include/libyuv/convert.h @@ -352,7 +352,10 @@ int MJPGToI420(const uint8_t* sample, // Query size of MJPG in pixels. LIBYUV_API -int MJPGSize(const uint8_t* sample, size_t sample_size, int* width, int* height); +int MJPGSize(const uint8_t* sample, + size_t sample_size, + int* width, + int* height); #endif // Convert camera sample to I420 with cropping, rotation and vertical flip. 
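
Note: the macros_msa.h hunks below are pure backslash-column realignment, but the LD() macro they touch is easy to misread: on 32-bit MIPS it synthesizes a 64-bit load from two 32-bit LW() loads. A minimal C sketch of just that combining step, with plain memcpy loads standing in for the LW() inline asm (a sketch only, not part of the patch):

#include <stdint.h>
#include <string.h>

/* Build a 64-bit value from two 32-bit loads, low word first, mirroring
 * the shift/mask/or sequence in the 32-bit LD() macro. */
static uint64_t load64_via_two_lw(const uint8_t* psrc) {
  uint32_t val0, val1;
  memcpy(&val0, psrc, 4);     /* stands in for LW(psrc)     */
  memcpy(&val1, psrc + 4, 4); /* stands in for LW(psrc + 4) */
  uint64_t val = (uint64_t)val1;
  val = (val << 32) & 0xFFFFFFFF00000000ull; /* mask kept to match macro */
  return val | (uint64_t)val0;
}
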
diff --git a/include/libyuv/macros_msa.h b/include/libyuv/macros_msa.h index abd9f26b7..921eb0714 100644 --- a/include/libyuv/macros_msa.h +++ b/include/libyuv/macros_msa.h @@ -16,38 +16,38 @@ #include #if (__mips_isa_rev >= 6) -#define LW(psrc) \ - ({ \ +#define LW(psrc) \ + ({ \ uint8_t* psrc_lw_m = (uint8_t*)(psrc); /* NOLINT */ \ - uint32_t val_m; \ - asm volatile("lw %[val_m], %[psrc_lw_m] \n" \ - : [val_m] "=r"(val_m) \ - : [psrc_lw_m] "m"(*psrc_lw_m)); \ - val_m; \ + uint32_t val_m; \ + asm volatile("lw %[val_m], %[psrc_lw_m] \n" \ + : [val_m] "=r"(val_m) \ + : [psrc_lw_m] "m"(*psrc_lw_m)); \ + val_m; \ }) #if (__mips == 64) -#define LD(psrc) \ - ({ \ +#define LD(psrc) \ + ({ \ uint8_t* psrc_ld_m = (uint8_t*)(psrc); /* NOLINT */ \ - uint64_t val_m = 0; \ - asm volatile("ld %[val_m], %[psrc_ld_m] \n" \ - : [val_m] "=r"(val_m) \ - : [psrc_ld_m] "m"(*psrc_ld_m)); \ - val_m; \ + uint64_t val_m = 0; \ + asm volatile("ld %[val_m], %[psrc_ld_m] \n" \ + : [val_m] "=r"(val_m) \ + : [psrc_ld_m] "m"(*psrc_ld_m)); \ + val_m; \ }) #else // !(__mips == 64) -#define LD(psrc) \ - ({ \ - uint8_t* psrc_ld_m = (uint8_t*)(psrc); /* NOLINT */ \ +#define LD(psrc) \ + ({ \ + uint8_t* psrc_ld_m = (uint8_t*)(psrc); /* NOLINT */ \ uint32_t val0_m, val1_m; \ uint64_t val_m = 0; \ - val0_m = LW(psrc_ld_m); \ - val1_m = LW(psrc_ld_m + 4); \ + val0_m = LW(psrc_ld_m); \ + val1_m = LW(psrc_ld_m + 4); \ val_m = (uint64_t)(val1_m); /* NOLINT */ \ val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); /* NOLINT */ \ - val_m = (uint64_t)(val_m | (uint64_t)val0_m); /* NOLINT */ \ - val_m; \ + val_m = (uint64_t)(val_m | (uint64_t)val0_m); /* NOLINT */ \ + val_m; \ }) #endif // (__mips == 64) @@ -81,38 +81,38 @@ }) #endif // !(__mips == 64) #else // !(__mips_isa_rev >= 6) -#define LW(psrc) \ - ({ \ +#define LW(psrc) \ + ({ \ uint8_t* psrc_lw_m = (uint8_t*)(psrc); /* NOLINT */ \ - uint32_t val_m; \ - asm volatile("ulw %[val_m], %[psrc_lw_m] \n" \ - : [val_m] "=r"(val_m) \ - : [psrc_lw_m] "m"(*psrc_lw_m)); \ - val_m; \ + uint32_t val_m; \ + asm volatile("ulw %[val_m], %[psrc_lw_m] \n" \ + : [val_m] "=r"(val_m) \ + : [psrc_lw_m] "m"(*psrc_lw_m)); \ + val_m; \ }) #if (__mips == 64) -#define LD(psrc) \ - ({ \ +#define LD(psrc) \ + ({ \ uint8_t* psrc_ld_m = (uint8_t*)(psrc); /* NOLINT */ \ - uint64_t val_m = 0; \ - asm volatile("uld %[val_m], %[psrc_ld_m] \n" \ - : [val_m] "=r"(val_m) \ - : [psrc_ld_m] "m"(*psrc_ld_m)); \ - val_m; \ + uint64_t val_m = 0; \ + asm volatile("uld %[val_m], %[psrc_ld_m] \n" \ + : [val_m] "=r"(val_m) \ + : [psrc_ld_m] "m"(*psrc_ld_m)); \ + val_m; \ }) #else // !(__mips == 64) -#define LD(psrc) \ - ({ \ - uint8_t* psrc_ld_m = (uint8_t*)(psrc); /* NOLINT */ \ +#define LD(psrc) \ + ({ \ + uint8_t* psrc_ld_m = (uint8_t*)(psrc); /* NOLINT */ \ uint32_t val0_m, val1_m; \ uint64_t val_m = 0; \ - val0_m = LW(psrc_ld_m); \ - val1_m = LW(psrc_ld_m + 4); \ + val0_m = LW(psrc_ld_m); \ + val1_m = LW(psrc_ld_m + 4); \ val_m = (uint64_t)(val1_m); /* NOLINT */ \ val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); /* NOLINT */ \ - val_m = (uint64_t)(val_m | (uint64_t)val0_m); /* NOLINT */ \ - val_m; \ + val_m = (uint64_t)(val_m | (uint64_t)val0_m); /* NOLINT */ \ + val_m; \ }) #endif // (__mips == 64) diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 0da54e87d..7f5fe9f0f 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -555,7 +555,7 @@ extern const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants); // BT.709 #define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a)-1))) -#define 
align_buffer_64(var, size) \ +#define align_buffer_64(var, size) \ uint8_t* var##_mem = (uint8_t*)(malloc((size) + 63)); /* NOLINT */ \ uint8_t* var = (uint8_t*)(((intptr_t)(var##_mem) + 63) & ~63) /* NOLINT */ @@ -903,8 +903,12 @@ void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width); void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width); void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width); void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width); -void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, uint8_t* dst_y, int width); -void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, uint8_t* dst_y, int width); +void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, + uint8_t* dst_y, + int width); +void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, + uint8_t* dst_y, + int width); void BGRAToYRow_MSA(const uint8_t* src_bgra, uint8_t* dst_y, int width); void ABGRToYRow_MSA(const uint8_t* src_abgr, uint8_t* dst_y, int width); void RGBAToYRow_MSA(const uint8_t* src_rgba, uint8_t* dst_y, int width); @@ -936,7 +940,9 @@ void ABGRToYRow_Any_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width); void RGBAToYRow_Any_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width); void RGB24ToYRow_Any_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width); void RAWToYRow_Any_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width); -void RGB565ToYRow_Any_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width); +void RGB565ToYRow_Any_NEON(const uint8_t* src_rgb565, + uint8_t* dst_y, + int width); void ARGB1555ToYRow_Any_NEON(const uint8_t* src_argb1555, uint8_t* dst_y, int width); @@ -951,7 +957,9 @@ void ARGBToYRow_Any_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width); void RGB24ToYRow_Any_MSA(const uint8_t* src_rgb24, uint8_t* dst_y, int width); void RAWToYRow_Any_MSA(const uint8_t* src_raw, uint8_t* dst_y, int width); void RGB565ToYRow_Any_MSA(const uint8_t* src_rgb565, uint8_t* dst_y, int width); -void ARGB1555ToYRow_Any_MSA(const uint8_t* src_argb1555, uint8_t* dst_y, int width); +void ARGB1555ToYRow_Any_MSA(const uint8_t* src_argb1555, + uint8_t* dst_y, + int width); void ARGBToUVRow_AVX2(const uint8_t* src_argb, int src_stride_argb, @@ -1224,7 +1232,10 @@ void MirrorUVRow_MSA(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width); -void MirrorUVRow_C(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width); +void MirrorUVRow_C(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width); void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width); @@ -1236,7 +1247,10 @@ void ARGBMirrorRow_Any_SSE2(const uint8_t* src, uint8_t* dst, int width); void ARGBMirrorRow_Any_NEON(const uint8_t* src, uint8_t* dst, int width); void ARGBMirrorRow_Any_MSA(const uint8_t* src, uint8_t* dst, int width); -void SplitUVRow_C(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width); +void SplitUVRow_C(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void SplitUVRow_SSE2(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, @@ -1249,7 +1263,10 @@ void SplitUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width); -void SplitUVRow_MSA(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width); +void SplitUVRow_MSA(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void SplitUVRow_Any_SSE2(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* 
dst_v, @@ -1371,9 +1388,15 @@ void MultiplyRow_16_AVX2(const uint16_t* src_y, uint16_t* dst_y, int scale, int width); -void MultiplyRow_16_C(const uint16_t* src_y, uint16_t* dst_y, int scale, int width); +void MultiplyRow_16_C(const uint16_t* src_y, + uint16_t* dst_y, + int scale, + int width); -void Convert8To16Row_C(const uint8_t* src_y, uint16_t* dst_y, int scale, int width); +void Convert8To16Row_C(const uint8_t* src_y, + uint16_t* dst_y, + int scale, + int width); void Convert8To16Row_SSE2(const uint8_t* src_y, uint16_t* dst_y, int scale, @@ -1391,7 +1414,10 @@ void Convert8To16Row_Any_AVX2(const uint8_t* src_y, int scale, int width); -void Convert16To8Row_C(const uint16_t* src_y, uint8_t* dst_y, int scale, int width); +void Convert16To8Row_C(const uint16_t* src_y, + uint8_t* dst_y, + int scale, + int width); void Convert16To8Row_SSSE3(const uint16_t* src_y, uint8_t* dst_y, int scale, @@ -1422,8 +1448,12 @@ void CopyRow_Any_NEON(const uint8_t* src, uint8_t* dst, int count); void CopyRow_16_C(const uint16_t* src, uint16_t* dst, int count); void ARGBCopyAlphaRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width); -void ARGBCopyAlphaRow_SSE2(const uint8_t* src_argb, uint8_t* dst_argb, int width); -void ARGBCopyAlphaRow_AVX2(const uint8_t* src_argb, uint8_t* dst_argb, int width); +void ARGBCopyAlphaRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_argb, + int width); +void ARGBCopyAlphaRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, + int width); void ARGBCopyAlphaRow_Any_SSE2(const uint8_t* src_argb, uint8_t* dst_argb, int width); @@ -1432,10 +1462,18 @@ void ARGBCopyAlphaRow_Any_AVX2(const uint8_t* src_argb, int width); void ARGBExtractAlphaRow_C(const uint8_t* src_argb, uint8_t* dst_a, int width); -void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb, uint8_t* dst_a, int width); -void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb, uint8_t* dst_a, int width); -void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, uint8_t* dst_a, int width); -void ARGBExtractAlphaRow_MSA(const uint8_t* src_argb, uint8_t* dst_a, int width); +void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_a, + int width); +void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_a, + int width); +void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, + uint8_t* dst_a, + int width); +void ARGBExtractAlphaRow_MSA(const uint8_t* src_argb, + uint8_t* dst_a, + int width); void ARGBExtractAlphaRow_Any_SSE2(const uint8_t* src_argb, uint8_t* dst_a, int width); @@ -1450,8 +1488,12 @@ void ARGBExtractAlphaRow_Any_MSA(const uint8_t* src_argb, int width); void ARGBCopyYToAlphaRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width); -void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width); -void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src_y, uint8_t* dst_argb, int width); +void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src_y, + uint8_t* dst_argb, + int width); +void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src_y, + uint8_t* dst_argb, + int width); void ARGBCopyYToAlphaRow_Any_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width); @@ -1512,17 +1554,23 @@ void ARGBShuffleRow_Any_MSA(const uint8_t* src_argb, const uint8_t* shuffler, int width); -void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_argb, int width); +void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24, + uint8_t* dst_argb, + int width); void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width); void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgb24, 
int width); -void RGB565ToARGBRow_SSE2(const uint8_t* src_rgb565, uint8_t* dst_argb, int width); +void RGB565ToARGBRow_SSE2(const uint8_t* src_rgb565, + uint8_t* dst_argb, + int width); void ARGB1555ToARGBRow_SSE2(const uint8_t* src_argb1555, uint8_t* dst_argb, int width); void ARGB4444ToARGBRow_SSE2(const uint8_t* src_argb4444, uint8_t* dst_argb, int width); -void RGB565ToARGBRow_AVX2(const uint8_t* src_rgb565, uint8_t* dst_argb, int width); +void RGB565ToARGBRow_AVX2(const uint8_t* src_rgb565, + uint8_t* dst_argb, + int width); void ARGB1555ToARGBRow_AVX2(const uint8_t* src_argb1555, uint8_t* dst_argb, int width); @@ -1530,14 +1578,20 @@ void ARGB4444ToARGBRow_AVX2(const uint8_t* src_argb4444, uint8_t* dst_argb, int width); -void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_argb, int width); +void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, + uint8_t* dst_argb, + int width); void RGB24ToARGBRow_MSA(const uint8_t* src_rgb24, uint8_t* dst_argb, int width); void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width); void RAWToARGBRow_MSA(const uint8_t* src_raw, uint8_t* dst_argb, int width); void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width); void RAWToRGB24Row_MSA(const uint8_t* src_raw, uint8_t* dst_rgb24, int width); -void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_argb, int width); -void RGB565ToARGBRow_MSA(const uint8_t* src_rgb565, uint8_t* dst_argb, int width); +void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, + uint8_t* dst_argb, + int width); +void RGB565ToARGBRow_MSA(const uint8_t* src_rgb565, + uint8_t* dst_argb, + int width); void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555, uint8_t* dst_argb, int width); @@ -1560,8 +1614,12 @@ void AR30ToARGBRow_C(const uint8_t* src_ar30, uint8_t* dst_argb, int width); void RGB24ToARGBRow_Any_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_argb, int width); -void RAWToARGBRow_Any_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width); -void RAWToRGB24Row_Any_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgb24, int width); +void RAWToARGBRow_Any_SSSE3(const uint8_t* src_raw, + uint8_t* dst_argb, + int width); +void RAWToRGB24Row_Any_SSSE3(const uint8_t* src_raw, + uint8_t* dst_rgb24, + int width); void RGB565ToARGBRow_Any_SSE2(const uint8_t* src_rgb565, uint8_t* dst_argb, @@ -1585,11 +1643,19 @@ void ARGB4444ToARGBRow_Any_AVX2(const uint8_t* src_argb4444, void RGB24ToARGBRow_Any_NEON(const uint8_t* src_rgb24, uint8_t* dst_argb, int width); -void RGB24ToARGBRow_Any_MSA(const uint8_t* src_rgb24, uint8_t* dst_argb, int width); -void RAWToARGBRow_Any_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width); +void RGB24ToARGBRow_Any_MSA(const uint8_t* src_rgb24, + uint8_t* dst_argb, + int width); +void RAWToARGBRow_Any_NEON(const uint8_t* src_raw, + uint8_t* dst_argb, + int width); void RAWToARGBRow_Any_MSA(const uint8_t* src_raw, uint8_t* dst_argb, int width); -void RAWToRGB24Row_Any_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width); -void RAWToRGB24Row_Any_MSA(const uint8_t* src_raw, uint8_t* dst_rgb24, int width); +void RAWToRGB24Row_Any_NEON(const uint8_t* src_raw, + uint8_t* dst_rgb24, + int width); +void RAWToRGB24Row_Any_MSA(const uint8_t* src_raw, + uint8_t* dst_rgb24, + int width); void RGB565ToARGBRow_Any_NEON(const uint8_t* src_rgb565, uint8_t* dst_argb, int width); @@ -1613,8 +1679,12 @@ void ARGB4444ToARGBRow_Any_MSA(const uint8_t* src_argb4444, void ARGBToRGB24Row_SSSE3(const uint8_t* src_argb, uint8_t* dst_rgb, int width); void 
ARGBToRAWRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_rgb, int width); void ARGBToRGB565Row_SSE2(const uint8_t* src_argb, uint8_t* dst_rgb, int width); -void ARGBToARGB1555Row_SSE2(const uint8_t* src_argb, uint8_t* dst_rgb, int width); -void ARGBToARGB4444Row_SSE2(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ARGBToARGB1555Row_SSE2(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width); +void ARGBToARGB4444Row_SSE2(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width); void ARGBToAR30Row_SSSE3(const uint8_t* src_argb, uint8_t* dst_rgb, int width); void ARGBToRGB565DitherRow_C(const uint8_t* src_argb, @@ -1631,15 +1701,23 @@ void ARGBToRGB565DitherRow_AVX2(const uint8_t* src_argb, int width); void ARGBToRGB565Row_AVX2(const uint8_t* src_argb, uint8_t* dst_rgb, int width); -void ARGBToARGB1555Row_AVX2(const uint8_t* src_argb, uint8_t* dst_rgb, int width); -void ARGBToARGB4444Row_AVX2(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ARGBToARGB1555Row_AVX2(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width); +void ARGBToARGB4444Row_AVX2(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width); void ARGBToAR30Row_AVX2(const uint8_t* src_argb, uint8_t* dst_rgb, int width); void ARGBToRGB24Row_NEON(const uint8_t* src_argb, uint8_t* dst_rgb, int width); void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_rgb, int width); void ARGBToRGB565Row_NEON(const uint8_t* src_argb, uint8_t* dst_rgb, int width); -void ARGBToARGB1555Row_NEON(const uint8_t* src_argb, uint8_t* dst_rgb, int width); -void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ARGBToARGB1555Row_NEON(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width); +void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width); void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, uint8_t* dst_rgb, const uint32_t dither4, @@ -1647,8 +1725,12 @@ void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, void ARGBToRGB24Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width); void ARGBToRAWRow_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width); void ARGBToRGB565Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width); -void ARGBToARGB1555Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width); -void ARGBToARGB4444Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ARGBToARGB1555Row_MSA(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width); +void ARGBToARGB4444Row_MSA(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width); void ARGBToRGB565DitherRow_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, const uint32_t dither4, @@ -2283,16 +2365,24 @@ void ARGBSubtractRow_Any_MSA(const uint8_t* src_argb, uint8_t* dst_argb, int width); -void ARGBToRGB24Row_Any_SSSE3(const uint8_t* src_argb, uint8_t* dst_rgb, int width); -void ARGBToRAWRow_Any_SSSE3(const uint8_t* src_argb, uint8_t* dst_rgb, int width); -void ARGBToRGB565Row_Any_SSE2(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ARGBToRGB24Row_Any_SSSE3(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width); +void ARGBToRAWRow_Any_SSSE3(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width); +void ARGBToRGB565Row_Any_SSE2(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width); void ARGBToARGB1555Row_Any_SSE2(const uint8_t* src_argb, uint8_t* dst_rgb, int width); void ARGBToARGB4444Row_Any_SSE2(const uint8_t* src_argb, uint8_t* dst_rgb, int width); -void ARGBToAR30Row_Any_SSSE3(const uint8_t* src_argb, uint8_t* dst_rgb, int 
width); +void ARGBToAR30Row_Any_SSSE3(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width); void ARGBToRGB565DitherRow_Any_SSE2(const uint8_t* src_argb, uint8_t* dst_rgb, @@ -2303,18 +2393,28 @@ void ARGBToRGB565DitherRow_Any_AVX2(const uint8_t* src_argb, const uint32_t dither4, int width); -void ARGBToRGB565Row_Any_AVX2(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ARGBToRGB565Row_Any_AVX2(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width); void ARGBToARGB1555Row_Any_AVX2(const uint8_t* src_argb, uint8_t* dst_rgb, int width); void ARGBToARGB4444Row_Any_AVX2(const uint8_t* src_argb, uint8_t* dst_rgb, int width); -void ARGBToAR30Row_Any_AVX2(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ARGBToAR30Row_Any_AVX2(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width); -void ARGBToRGB24Row_Any_NEON(const uint8_t* src_argb, uint8_t* dst_rgb, int width); -void ARGBToRAWRow_Any_NEON(const uint8_t* src_argb, uint8_t* dst_rgb, int width); -void ARGBToRGB565Row_Any_NEON(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ARGBToRGB24Row_Any_NEON(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width); +void ARGBToRAWRow_Any_NEON(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width); +void ARGBToRGB565Row_Any_NEON(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width); void ARGBToARGB1555Row_Any_NEON(const uint8_t* src_argb, uint8_t* dst_rgb, int width); @@ -2325,9 +2425,13 @@ void ARGBToRGB565DitherRow_Any_NEON(const uint8_t* src_argb, uint8_t* dst_rgb, const uint32_t dither4, int width); -void ARGBToRGB24Row_Any_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ARGBToRGB24Row_Any_MSA(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width); void ARGBToRAWRow_Any_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width); -void ARGBToRGB565Row_Any_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ARGBToRGB565Row_Any_MSA(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width); void ARGBToARGB1555Row_Any_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width); @@ -2749,10 +2853,18 @@ void I422ToUYVYRow_Any_MSA(const uint8_t* src_y, // Effects related row functions. void ARGBAttenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width); -void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width); -void ARGBAttenuateRow_AVX2(const uint8_t* src_argb, uint8_t* dst_argb, int width); -void ARGBAttenuateRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width); -void ARGBAttenuateRow_MSA(const uint8_t* src_argb, uint8_t* dst_argb, int width); +void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, + int width); +void ARGBAttenuateRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, + int width); +void ARGBAttenuateRow_NEON(const uint8_t* src_argb, + uint8_t* dst_argb, + int width); +void ARGBAttenuateRow_MSA(const uint8_t* src_argb, + uint8_t* dst_argb, + int width); void ARGBAttenuateRow_Any_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width); @@ -2768,9 +2880,15 @@ void ARGBAttenuateRow_Any_MSA(const uint8_t* src_argb, // Inverse table for unattenuate, shared by C and SSE2. 
extern const uint32_t fixed_invtbl8[256]; -void ARGBUnattenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width); -void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb, uint8_t* dst_argb, int width); -void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb, uint8_t* dst_argb, int width); +void ARGBUnattenuateRow_C(const uint8_t* src_argb, + uint8_t* dst_argb, + int width); +void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_argb, + int width); +void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, + int width); void ARGBUnattenuateRow_Any_SSE2(const uint8_t* src_argb, uint8_t* dst_argb, int width); @@ -2805,11 +2923,19 @@ void ARGBColorMatrixRow_MSA(const uint8_t* src_argb, const int8_t* matrix_argb, int width); -void ARGBColorTableRow_C(uint8_t* dst_argb, const uint8_t* table_argb, int width); -void ARGBColorTableRow_X86(uint8_t* dst_argb, const uint8_t* table_argb, int width); +void ARGBColorTableRow_C(uint8_t* dst_argb, + const uint8_t* table_argb, + int width); +void ARGBColorTableRow_X86(uint8_t* dst_argb, + const uint8_t* table_argb, + int width); -void RGBColorTableRow_C(uint8_t* dst_argb, const uint8_t* table_argb, int width); -void RGBColorTableRow_X86(uint8_t* dst_argb, const uint8_t* table_argb, int width); +void RGBColorTableRow_C(uint8_t* dst_argb, + const uint8_t* table_argb, + int width); +void RGBColorTableRow_X86(uint8_t* dst_argb, + const uint8_t* table_argb, + int width); void ARGBQuantizeRow_C(uint8_t* dst_argb, int scale, @@ -3075,37 +3201,58 @@ void ARGBPolynomialRow_AVX2(const uint8_t* src_argb, // Scale and convert to half float. void HalfFloatRow_C(const uint16_t* src, uint16_t* dst, float scale, int width); -void HalfFloatRow_SSE2(const uint16_t* src, uint16_t* dst, float scale, int width); +void HalfFloatRow_SSE2(const uint16_t* src, + uint16_t* dst, + float scale, + int width); void HalfFloatRow_Any_SSE2(const uint16_t* src, uint16_t* dst, float scale, int width); -void HalfFloatRow_AVX2(const uint16_t* src, uint16_t* dst, float scale, int width); +void HalfFloatRow_AVX2(const uint16_t* src, + uint16_t* dst, + float scale, + int width); void HalfFloatRow_Any_AVX2(const uint16_t* src, uint16_t* dst, float scale, int width); -void HalfFloatRow_F16C(const uint16_t* src, uint16_t* dst, float scale, int width); +void HalfFloatRow_F16C(const uint16_t* src, + uint16_t* dst, + float scale, + int width); void HalfFloatRow_Any_F16C(const uint16_t* src, uint16_t* dst, float scale, int width); -void HalfFloat1Row_F16C(const uint16_t* src, uint16_t* dst, float scale, int width); +void HalfFloat1Row_F16C(const uint16_t* src, + uint16_t* dst, + float scale, + int width); void HalfFloat1Row_Any_F16C(const uint16_t* src, uint16_t* dst, float scale, int width); -void HalfFloatRow_NEON(const uint16_t* src, uint16_t* dst, float scale, int width); +void HalfFloatRow_NEON(const uint16_t* src, + uint16_t* dst, + float scale, + int width); void HalfFloatRow_Any_NEON(const uint16_t* src, uint16_t* dst, float scale, int width); -void HalfFloat1Row_NEON(const uint16_t* src, uint16_t* dst, float scale, int width); +void HalfFloat1Row_NEON(const uint16_t* src, + uint16_t* dst, + float scale, + int width); void HalfFloat1Row_Any_NEON(const uint16_t* src, uint16_t* dst, float scale, int width); -void HalfFloatRow_MSA(const uint16_t* src, uint16_t* dst, float scale, int width); +void HalfFloatRow_MSA(const uint16_t* src, + uint16_t* dst, + float scale, + int width); void HalfFloatRow_Any_MSA(const uint16_t* src, uint16_t* dst, float scale, diff 
--git a/include/libyuv/scale_row.h b/include/libyuv/scale_row.h index 42c08d8c1..c8265eac2 100644 --- a/include/libyuv/scale_row.h +++ b/include/libyuv/scale_row.h @@ -302,7 +302,9 @@ void ScaleRowDown38_2_Box_16_C(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width); void ScaleAddRow_C(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width); -void ScaleAddRow_16_C(const uint16_t* src_ptr, uint32_t* dst_ptr, int src_width); +void ScaleAddRow_16_C(const uint16_t* src_ptr, + uint32_t* dst_ptr, + int src_width); void ScaleARGBRowDown2_C(const uint8_t* src_argb, ptrdiff_t src_stride, uint8_t* dst_argb, @@ -493,8 +495,12 @@ void ScaleRowDown38_2_Box_Any_SSSE3(const uint8_t* src_ptr, void ScaleAddRow_SSE2(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width); void ScaleAddRow_AVX2(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width); -void ScaleAddRow_Any_SSE2(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width); -void ScaleAddRow_Any_AVX2(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width); +void ScaleAddRow_Any_SSE2(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int src_width); +void ScaleAddRow_Any_AVX2(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int src_width); void ScaleFilterCols_SSSE3(uint8_t* dst_ptr, const uint8_t* src_ptr, @@ -810,7 +816,9 @@ void ScaleRowDown38_2_Box_Any_NEON(const uint8_t* src_ptr, int dst_width); void ScaleAddRow_NEON(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width); -void ScaleAddRow_Any_NEON(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width); +void ScaleAddRow_Any_NEON(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int src_width); void ScaleFilterCols_NEON(uint8_t* dst_ptr, const uint8_t* src_ptr, diff --git a/include/libyuv/video_common.h b/include/libyuv/video_common.h index b427359ad..d2dbde75d 100644 --- a/include/libyuv/video_common.h +++ b/include/libyuv/video_common.h @@ -28,11 +28,11 @@ extern "C" { // Needs to be a macro otherwise the OS X compiler complains when the kFormat* // constants are used in a switch. #ifdef __cplusplus -#define FOURCC(a, b, c, d) \ +#define FOURCC(a, b, c, d) \ ((static_cast(a)) | (static_cast(b) << 8) | \ (static_cast(c) << 16) | (static_cast(d) << 24)) #else -#define FOURCC(a, b, c, d) \ +#define FOURCC(a, b, c, d) \ (((uint32_t)(a)) | ((uint32_t)(b) << 8) | /* NOLINT */ \ ((uint32_t)(c) << 16) | ((uint32_t)(d) << 24)) /* NOLINT */ #endif diff --git a/source/compare.cc b/source/compare.cc index aebab4ce1..02dd5f7a8 100644 --- a/source/compare.cc +++ b/source/compare.cc @@ -32,7 +32,8 @@ LIBYUV_API uint32_t HashDjb2(const uint8_t* src, uint64_t count, uint32_t seed) { const int kBlockSize = 1 << 15; // 32768; int remainder; - uint32_t (*HashDjb2_SSE)(const uint8_t* src, int count, uint32_t seed) = HashDjb2_C; + uint32_t (*HashDjb2_SSE)(const uint8_t* src, int count, uint32_t seed) = + HashDjb2_C; #if defined(HAS_HASHDJB2_SSE41) if (TestCpuFlag(kCpuHasSSE41)) { HashDjb2_SSE = HashDjb2_SSE41; @@ -93,7 +94,10 @@ static uint32_t ARGBDetectRow_C(const uint8_t* argb, int width) { // Scan an opaque argb image and return fourcc based on alpha offset. // Returns FOURCC_ARGB, FOURCC_BGRA, or 0 if unknown. 
LIBYUV_API -uint32_t ARGBDetect(const uint8_t* argb, int stride_argb, int width, int height) { +uint32_t ARGBDetect(const uint8_t* argb, + int stride_argb, + int width, + int height) { uint32_t fourcc = 0; int h; @@ -115,16 +119,16 @@ uint32_t ARGBDetect(const uint8_t* argb, int stride_argb, int width, int height) LIBYUV_API uint64_t ComputeHammingDistance(const uint8_t* src_a, - const uint8_t* src_b, - int count) { + const uint8_t* src_b, + int count) { const int kBlockSize = 1 << 15; // 32768; const int kSimdSize = 64; // SIMD for multiple of 64, and C for remainder int remainder = count & (kBlockSize - 1) & ~(kSimdSize - 1); uint64_t diff = 0; int i; - uint32_t (*HammingDistance)(const uint8_t* src_a, const uint8_t* src_b, int count) = - HammingDistance_C; + uint32_t (*HammingDistance)(const uint8_t* src_a, const uint8_t* src_b, + int count) = HammingDistance_C; #if defined(HAS_HAMMINGDISTANCE_NEON) if (TestCpuFlag(kCpuHasNEON)) { HammingDistance = HammingDistance_NEON; @@ -173,8 +177,8 @@ uint64_t ComputeHammingDistance(const uint8_t* src_a, // TODO(fbarchard): Refactor into row function. LIBYUV_API uint64_t ComputeSumSquareError(const uint8_t* src_a, - const uint8_t* src_b, - int count) { + const uint8_t* src_b, + int count) { // SumSquareError returns values 0 to 65535 for each squared difference. // Up to 65536 of those can be summed and remain within a uint32_t. // After each block of 65536 pixels, accumulate into a uint64_t. @@ -182,8 +186,8 @@ uint64_t ComputeSumSquareError(const uint8_t* src_a, int remainder = count & (kBlockSize - 1) & ~31; uint64_t sse = 0; int i; - uint32_t (*SumSquareError)(const uint8_t* src_a, const uint8_t* src_b, int count) = - SumSquareError_C; + uint32_t (*SumSquareError)(const uint8_t* src_a, const uint8_t* src_b, + int count) = SumSquareError_C; #if defined(HAS_SUMSQUAREERROR_NEON) if (TestCpuFlag(kCpuHasNEON)) { SumSquareError = SumSquareError_NEON; @@ -228,11 +232,11 @@ uint64_t ComputeSumSquareError(const uint8_t* src_a, LIBYUV_API uint64_t ComputeSumSquareErrorPlane(const uint8_t* src_a, - int stride_a, - const uint8_t* src_b, - int stride_b, - int width, - int height) { + int stride_a, + const uint8_t* src_b, + int stride_b, + int width, + int height) { uint64_t sse = 0; int h; // Coalesce rows. 
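
Note: the compare.cc hunks above all share one overflow-safety pattern: a row kernel returning uint32_t is run over bounded blocks, and the block results are accumulated into a uint64_t (a squared byte difference is at most 65025, so 65536 of them still fit in 32 bits). A compact sketch of that loop shape, plus the PSNR conversion CalcFramePsnr feeds the total into; the helper names are illustrative, and the formula is the standard 10*log10(255^2 * samples / sse) with the kMaxPsnr = 128 cap declared in compare.h:

#include <math.h>
#include <stdint.h>

/* Accumulate squared differences blockwise so the 32-bit kernel sum
 * cannot overflow; compare.cc handles the remainder in plain C. */
static uint64_t sse_blockwise(const uint8_t* a, const uint8_t* b, int count,
                              uint32_t (*kernel)(const uint8_t*,
                                                 const uint8_t*, int)) {
  const int kBlockSize = 65536;
  uint64_t sse = 0;
  int i;
  for (i = 0; i + kBlockSize <= count; i += kBlockSize) {
    sse += kernel(a + i, b + i, kBlockSize);
  }
  return sse;
}

/* PSNR in dB from the total SSE over the sampled plane(s). */
static double psnr_from_sse(uint64_t sse, uint64_t samples) {
  if (sse == 0) {
    return 128.0; /* kMaxPsnr: the planes are identical */
  }
  return 10.0 * log10(255.0 * 255.0 * (double)samples / (double)sse);
}
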
@@ -274,7 +278,7 @@ double CalcFramePsnr(const uint8_t* src_a, int height) { const uint64_t samples = (uint64_t)width * (uint64_t)height; const uint64_t sse = ComputeSumSquareErrorPlane(src_a, stride_a, src_b, - stride_b, width, height); + stride_b, width, height); return SumSquareErrorToPsnr(sse, samples); } @@ -293,8 +297,8 @@ double I420Psnr(const uint8_t* src_y_a, int stride_v_b, int width, int height) { - const uint64_t sse_y = ComputeSumSquareErrorPlane(src_y_a, stride_y_a, src_y_b, - stride_y_b, width, height); + const uint64_t sse_y = ComputeSumSquareErrorPlane( + src_y_a, stride_y_a, src_y_b, stride_y_b, width, height); const int width_uv = (width + 1) >> 1; const int height_uv = (height + 1) >> 1; const uint64_t sse_u = ComputeSumSquareErrorPlane( @@ -302,7 +306,7 @@ double I420Psnr(const uint8_t* src_y_a, const uint64_t sse_v = ComputeSumSquareErrorPlane( src_v_a, stride_v_a, src_v_b, stride_v_b, width_uv, height_uv); const uint64_t samples = (uint64_t)width * (uint64_t)height + - 2 * ((uint64_t)width_uv * (uint64_t)height_uv); + 2 * ((uint64_t)width_uv * (uint64_t)height_uv); const uint64_t sse = sse_y + sse_u + sse_v; return SumSquareErrorToPsnr(sse, samples); } @@ -344,7 +348,7 @@ static double Ssim8x8_C(const uint8_t* src_a, const int64_t sum_a_x_sum_b = sum_a * sum_b; const int64_t ssim_n = (2 * sum_a_x_sum_b + c1) * - (2 * count * sum_axb - 2 * sum_a_x_sum_b + c2); + (2 * count * sum_axb - 2 * sum_a_x_sum_b + c2); const int64_t sum_a_sq = sum_a * sum_a; const int64_t sum_b_sq = sum_b * sum_b; diff --git a/source/compare_common.cc b/source/compare_common.cc index 92852e15c..633466add 100644 --- a/source/compare_common.cc +++ b/source/compare_common.cc @@ -18,7 +18,9 @@ extern "C" { #endif #if ORIGINAL_OPT -uint32_t HammingDistance_C1(const uint8_t* src_a, const uint8_t* src_b, int count) { +uint32_t HammingDistance_C1(const uint8_t* src_a, + const uint8_t* src_b, + int count) { uint32_t diff = 0u; int i; @@ -46,12 +48,14 @@ uint32_t HammingDistance_C1(const uint8_t* src_a, const uint8_t* src_b, int coun #endif // Hakmem method for hamming distance. 
-uint32_t HammingDistance_C(const uint8_t* src_a, const uint8_t* src_b, int count) { +uint32_t HammingDistance_C(const uint8_t* src_a, + const uint8_t* src_b, + int count) { uint32_t diff = 0u; int i; for (i = 0; i < count - 3; i += 4) { - uint32_t x = *((uint32_t*)src_a) ^ *((uint32_t*)src_b); + uint32_t x = *((uint32_t*)src_a) ^ *((uint32_t*)src_b); // NOLINT uint32_t u = x - ((x >> 1) & 0x55555555); u = ((u >> 2) & 0x33333333) + (u & 0x33333333); diff += ((((u + (u >> 4)) & 0x0f0f0f0f) * 0x01010101) >> 24); @@ -71,7 +75,9 @@ uint32_t HammingDistance_C(const uint8_t* src_a, const uint8_t* src_b, int count return diff; } -uint32_t SumSquareError_C(const uint8_t* src_a, const uint8_t* src_b, int count) { +uint32_t SumSquareError_C(const uint8_t* src_a, + const uint8_t* src_b, + int count) { uint32_t sse = 0u; int i; for (i = 0; i < count; ++i) { diff --git a/source/compare_gcc.cc b/source/compare_gcc.cc index 39d3d0fa2..676527c1b 100644 --- a/source/compare_gcc.cc +++ b/source/compare_gcc.cc @@ -24,8 +24,8 @@ extern "C" { #if defined(__x86_64__) uint32_t HammingDistance_SSE42(const uint8_t* src_a, - const uint8_t* src_b, - int count) { + const uint8_t* src_b, + int count) { uint64_t diff = 0u; asm volatile( @@ -72,8 +72,8 @@ uint32_t HammingDistance_SSE42(const uint8_t* src_a, } #else uint32_t HammingDistance_SSE42(const uint8_t* src_a, - const uint8_t* src_b, - int count) { + const uint8_t* src_b, + int count) { uint32_t diff = 0u; asm volatile( @@ -116,8 +116,8 @@ static const vec8 kNibbleMask = {15, 15, 15, 15, 15, 15, 15, 15, static const vec8 kBitCount = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4}; uint32_t HammingDistance_SSSE3(const uint8_t* src_a, - const uint8_t* src_b, - int count) { + const uint8_t* src_b, + int count) { uint32_t diff = 0u; asm volatile( @@ -174,7 +174,9 @@ uint32_t HammingDistance_SSSE3(const uint8_t* src_a, } #ifdef HAS_HAMMINGDISTANCE_AVX2 -uint32_t HammingDistance_AVX2(const uint8_t* src_a, const uint8_t* src_b, int count) { +uint32_t HammingDistance_AVX2(const uint8_t* src_a, + const uint8_t* src_b, + int count) { uint32_t diff = 0u; asm volatile( @@ -227,43 +229,46 @@ uint32_t HammingDistance_AVX2(const uint8_t* src_a, const uint8_t* src_b, int co } #endif // HAS_HAMMINGDISTANCE_AVX2 -uint32_t SumSquareError_SSE2(const uint8_t* src_a, const uint8_t* src_b, int count) { +uint32_t SumSquareError_SSE2(const uint8_t* src_a, + const uint8_t* src_b, + int count) { uint32_t sse; - asm volatile ( - "pxor %%xmm0,%%xmm0 \n" - "pxor %%xmm5,%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm1 \n" - "lea 0x10(%0),%0 \n" - "movdqu (%1),%%xmm2 \n" - "lea 0x10(%1),%1 \n" - "movdqa %%xmm1,%%xmm3 \n" - "psubusb %%xmm2,%%xmm1 \n" - "psubusb %%xmm3,%%xmm2 \n" - "por %%xmm2,%%xmm1 \n" - "movdqa %%xmm1,%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm1 \n" - "punpckhbw %%xmm5,%%xmm2 \n" - "pmaddwd %%xmm1,%%xmm1 \n" - "pmaddwd %%xmm2,%%xmm2 \n" - "paddd %%xmm1,%%xmm0 \n" - "paddd %%xmm2,%%xmm0 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + asm volatile( + "pxor %%xmm0,%%xmm0 \n" + "pxor %%xmm5,%%xmm5 \n" - "pshufd $0xee,%%xmm0,%%xmm1 \n" - "paddd %%xmm1,%%xmm0 \n" - "pshufd $0x1,%%xmm0,%%xmm1 \n" - "paddd %%xmm1,%%xmm0 \n" - "movd %%xmm0,%3 \n" + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "movdqu (%1),%%xmm2 \n" + "lea 0x10(%1),%1 \n" + "movdqa %%xmm1,%%xmm3 \n" + "psubusb %%xmm2,%%xmm1 \n" + "psubusb %%xmm3,%%xmm2 \n" + "por %%xmm2,%%xmm1 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "punpckhbw %%xmm5,%%xmm2 \n" + "pmaddwd %%xmm1,%%xmm1 \n" + 
"pmaddwd %%xmm2,%%xmm2 \n" + "paddd %%xmm1,%%xmm0 \n" + "paddd %%xmm2,%%xmm0 \n" + "sub $0x10,%2 \n" + "jg 1b \n" - : "+r"(src_a), // %0 - "+r"(src_b), // %1 - "+r"(count), // %2 - "=g"(sse) // %3 - :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" - ); + "pshufd $0xee,%%xmm0,%%xmm1 \n" + "paddd %%xmm1,%%xmm0 \n" + "pshufd $0x1,%%xmm0,%%xmm1 \n" + "paddd %%xmm1,%%xmm0 \n" + "movd %%xmm0,%3 \n" + + : "+r"(src_a), // %0 + "+r"(src_b), // %1 + "+r"(count), // %2 + "=g"(sse) // %3 + ::"memory", + "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); return sse; } @@ -295,56 +300,56 @@ static const uvec32 kHashMul3 = { uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) { uint32_t hash; - asm volatile ( - "movd %2,%%xmm0 \n" - "pxor %%xmm7,%%xmm7 \n" - "movdqa %4,%%xmm6 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm1 \n" - "lea 0x10(%0),%0 \n" - "pmulld %%xmm6,%%xmm0 \n" - "movdqa %5,%%xmm5 \n" - "movdqa %%xmm1,%%xmm2 \n" - "punpcklbw %%xmm7,%%xmm2 \n" - "movdqa %%xmm2,%%xmm3 \n" - "punpcklwd %%xmm7,%%xmm3 \n" - "pmulld %%xmm5,%%xmm3 \n" - "movdqa %6,%%xmm5 \n" - "movdqa %%xmm2,%%xmm4 \n" - "punpckhwd %%xmm7,%%xmm4 \n" - "pmulld %%xmm5,%%xmm4 \n" - "movdqa %7,%%xmm5 \n" - "punpckhbw %%xmm7,%%xmm1 \n" - "movdqa %%xmm1,%%xmm2 \n" - "punpcklwd %%xmm7,%%xmm2 \n" - "pmulld %%xmm5,%%xmm2 \n" - "movdqa %8,%%xmm5 \n" - "punpckhwd %%xmm7,%%xmm1 \n" - "pmulld %%xmm5,%%xmm1 \n" - "paddd %%xmm4,%%xmm3 \n" - "paddd %%xmm2,%%xmm1 \n" - "paddd %%xmm3,%%xmm1 \n" - "pshufd $0xe,%%xmm1,%%xmm2 \n" - "paddd %%xmm2,%%xmm1 \n" - "pshufd $0x1,%%xmm1,%%xmm2 \n" - "paddd %%xmm2,%%xmm1 \n" - "paddd %%xmm1,%%xmm0 \n" - "sub $0x10,%1 \n" - "jg 1b \n" - "movd %%xmm0,%3 \n" - : "+r"(src), // %0 - "+r"(count), // %1 - "+rm"(seed), // %2 - "=g"(hash) // %3 - : "m"(kHash16x33), // %4 - "m"(kHashMul0), // %5 - "m"(kHashMul1), // %6 - "m"(kHashMul2), // %7 - "m"(kHashMul3) // %8 - : "memory", "cc" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); + asm volatile( + "movd %2,%%xmm0 \n" + "pxor %%xmm7,%%xmm7 \n" + "movdqa %4,%%xmm6 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "pmulld %%xmm6,%%xmm0 \n" + "movdqa %5,%%xmm5 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm7,%%xmm2 \n" + "movdqa %%xmm2,%%xmm3 \n" + "punpcklwd %%xmm7,%%xmm3 \n" + "pmulld %%xmm5,%%xmm3 \n" + "movdqa %6,%%xmm5 \n" + "movdqa %%xmm2,%%xmm4 \n" + "punpckhwd %%xmm7,%%xmm4 \n" + "pmulld %%xmm5,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "punpckhbw %%xmm7,%%xmm1 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklwd %%xmm7,%%xmm2 \n" + "pmulld %%xmm5,%%xmm2 \n" + "movdqa %8,%%xmm5 \n" + "punpckhwd %%xmm7,%%xmm1 \n" + "pmulld %%xmm5,%%xmm1 \n" + "paddd %%xmm4,%%xmm3 \n" + "paddd %%xmm2,%%xmm1 \n" + "paddd %%xmm3,%%xmm1 \n" + "pshufd $0xe,%%xmm1,%%xmm2 \n" + "paddd %%xmm2,%%xmm1 \n" + "pshufd $0x1,%%xmm1,%%xmm2 \n" + "paddd %%xmm2,%%xmm1 \n" + "paddd %%xmm1,%%xmm0 \n" + "sub $0x10,%1 \n" + "jg 1b \n" + "movd %%xmm0,%3 \n" + : "+r"(src), // %0 + "+r"(count), // %1 + "+rm"(seed), // %2 + "=g"(hash) // %3 + : "m"(kHash16x33), // %4 + "m"(kHashMul0), // %5 + "m"(kHashMul1), // %6 + "m"(kHashMul2), // %7 + "m"(kHashMul3) // %8 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); return hash; } #endif // defined(__x86_64__) || (defined(__i386__) && !defined(__pic__))) diff --git a/source/compare_msa.cc b/source/compare_msa.cc index 2f3592eee..e944235f0 100644 --- a/source/compare_msa.cc +++ b/source/compare_msa.cc @@ -22,7 +22,9 @@ namespace libyuv { extern "C" { #endif 
-uint32_t HammingDistance_MSA(const uint8_t* src_a, const uint8_t* src_b, int count) { +uint32_t HammingDistance_MSA(const uint8_t* src_a, + const uint8_t* src_b, + int count) { uint32_t diff = 0u; int i; v16u8 src0, src1, src2, src3; @@ -47,7 +49,9 @@ uint32_t HammingDistance_MSA(const uint8_t* src_a, const uint8_t* src_b, int cou return diff; } -uint32_t SumSquareError_MSA(const uint8_t* src_a, const uint8_t* src_b, int count) { +uint32_t SumSquareError_MSA(const uint8_t* src_a, + const uint8_t* src_b, + int count) { uint32_t sse = 0u; int i; v16u8 src0, src1, src2, src3; diff --git a/source/compare_neon.cc b/source/compare_neon.cc index 690ba6fda..2a2181e0c 100644 --- a/source/compare_neon.cc +++ b/source/compare_neon.cc @@ -23,7 +23,9 @@ extern "C" { // 256 bits at a time // uses short accumulator which restricts count to 131 KB -uint32_t HammingDistance_NEON(const uint8_t* src_a, const uint8_t* src_b, int count) { +uint32_t HammingDistance_NEON(const uint8_t* src_a, + const uint8_t* src_b, + int count) { uint32_t diff; asm volatile( @@ -52,7 +54,9 @@ uint32_t HammingDistance_NEON(const uint8_t* src_a, const uint8_t* src_b, int co return diff; } -uint32_t SumSquareError_NEON(const uint8_t* src_a, const uint8_t* src_b, int count) { +uint32_t SumSquareError_NEON(const uint8_t* src_a, + const uint8_t* src_b, + int count) { uint32_t sse; asm volatile( "vmov.u8 q8, #0 \n" diff --git a/source/compare_neon64.cc b/source/compare_neon64.cc index 8a74fbf16..6e8f672ab 100644 --- a/source/compare_neon64.cc +++ b/source/compare_neon64.cc @@ -22,7 +22,9 @@ extern "C" { // 256 bits at a time // uses short accumulator which restricts count to 131 KB -uint32_t HammingDistance_NEON(const uint8_t* src_a, const uint8_t* src_b, int count) { +uint32_t HammingDistance_NEON(const uint8_t* src_a, + const uint8_t* src_b, + int count) { uint32_t diff; asm volatile( "movi v4.8h, #0 \n" @@ -47,7 +49,9 @@ uint32_t HammingDistance_NEON(const uint8_t* src_a, const uint8_t* src_b, int co return diff; } -uint32_t SumSquareError_NEON(const uint8_t* src_a, const uint8_t* src_b, int count) { +uint32_t SumSquareError_NEON(const uint8_t* src_a, + const uint8_t* src_b, + int count) { uint32_t sse; asm volatile( "eor v16.16b, v16.16b, v16.16b \n" diff --git a/source/compare_win.cc b/source/compare_win.cc index ec45412f6..d57d3d9d1 100644 --- a/source/compare_win.cc +++ b/source/compare_win.cc @@ -26,13 +26,13 @@ extern "C" { #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) uint32_t HammingDistance_SSE42(const uint8_t* src_a, - const uint8_t* src_b, - int count) { + const uint8_t* src_b, + int count) { uint32_t diff = 0u; int i; for (i = 0; i < count - 3; i += 4) { - uint32_t x = *((uint32_t*)src_a) ^ *((uint32_t*)src_b); + uint32_t x = *((uint32_t*)src_a) ^ *((uint32_t*)src_b); // NOLINT src_a += 4; src_b += 4; diff += __popcnt(x); diff --git a/source/convert.cc b/source/convert.cc index 4d54f927b..b71ba012e 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -451,8 +451,9 @@ int YUY2ToI420(const uint8_t* src_yuy2, int width, int height) { int y; - void (*YUY2ToUVRow)(const uint8_t* src_yuy2, int src_stride_yuy2, uint8_t* dst_u, - uint8_t* dst_v, int width) = YUY2ToUVRow_C; + void (*YUY2ToUVRow)(const uint8_t* src_yuy2, int src_stride_yuy2, + uint8_t* dst_u, uint8_t* dst_v, int width) = + YUY2ToUVRow_C; void (*YUY2ToYRow)(const uint8_t* src_yuy2, uint8_t* dst_y, int width) = YUY2ToYRow_C; // Negative height means invert the image. 
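
Note: every conversion in convert.cc re-wraps the same runtime-dispatch idiom: each row-function pointer starts at the portable _C kernel and is promoted when TestCpuFlag() reports SIMD support, with the _Any_ wrapper covering odd widths and the bare kernel taken only for aligned widths. A sketch using the NEON path of YUY2ToYRow (the _Any_NEON name follows the convention declared in row.h; treat this as illustrative, not the exact file contents):

  void (*YUY2ToYRow)(const uint8_t* src_yuy2, uint8_t* dst_y, int width) =
      YUY2ToYRow_C;
#if defined(HAS_YUY2TOYROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    YUY2ToYRow = YUY2ToYRow_Any_NEON; /* any width */
    if (IS_ALIGNED(width, 16)) {
      YUY2ToYRow = YUY2ToYRow_NEON; /* multiple-of-16 fast path */
    }
  }
#endif
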
@@ -531,8 +532,9 @@ int UYVYToI420(const uint8_t* src_uyvy, int width, int height) { int y; - void (*UYVYToUVRow)(const uint8_t* src_uyvy, int src_stride_uyvy, uint8_t* dst_u, - uint8_t* dst_v, int width) = UYVYToUVRow_C; + void (*UYVYToUVRow)(const uint8_t* src_uyvy, int src_stride_uyvy, + uint8_t* dst_u, uint8_t* dst_v, int width) = + UYVYToUVRow_C; void (*UYVYToYRow)(const uint8_t* src_uyvy, uint8_t* dst_y, int width) = UYVYToYRow_C; // Negative height means invert the image. @@ -611,8 +613,9 @@ int ARGBToI420(const uint8_t* src_argb, int width, int height) { int y; - void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, uint8_t* dst_u, - uint8_t* dst_v, int width) = ARGBToUVRow_C; + void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYRow_C; if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { @@ -706,8 +709,9 @@ int BGRAToI420(const uint8_t* src_bgra, int width, int height) { int y; - void (*BGRAToUVRow)(const uint8_t* src_bgra0, int src_stride_bgra, uint8_t* dst_u, - uint8_t* dst_v, int width) = BGRAToUVRow_C; + void (*BGRAToUVRow)(const uint8_t* src_bgra0, int src_stride_bgra, + uint8_t* dst_u, uint8_t* dst_v, int width) = + BGRAToUVRow_C; void (*BGRAToYRow)(const uint8_t* src_bgra, uint8_t* dst_y, int width) = BGRAToYRow_C; if (!src_bgra || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { @@ -791,8 +795,9 @@ int ABGRToI420(const uint8_t* src_abgr, int width, int height) { int y; - void (*ABGRToUVRow)(const uint8_t* src_abgr0, int src_stride_abgr, uint8_t* dst_u, - uint8_t* dst_v, int width) = ABGRToUVRow_C; + void (*ABGRToUVRow)(const uint8_t* src_abgr0, int src_stride_abgr, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ABGRToUVRow_C; void (*ABGRToYRow)(const uint8_t* src_abgr, uint8_t* dst_y, int width) = ABGRToYRow_C; if (!src_abgr || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { @@ -876,8 +881,9 @@ int RGBAToI420(const uint8_t* src_rgba, int width, int height) { int y; - void (*RGBAToUVRow)(const uint8_t* src_rgba0, int src_stride_rgba, uint8_t* dst_u, - uint8_t* dst_v, int width) = RGBAToUVRow_C; + void (*RGBAToUVRow)(const uint8_t* src_rgba0, int src_stride_rgba, + uint8_t* dst_u, uint8_t* dst_v, int width) = + RGBAToUVRow_C; void (*RGBAToYRow)(const uint8_t* src_rgba, uint8_t* dst_y, int width) = RGBAToYRow_C; if (!src_rgba || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { @@ -963,14 +969,16 @@ int RGB24ToI420(const uint8_t* src_rgb24, int y; #if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA)) void (*RGB24ToUVRow)(const uint8_t* src_rgb24, int src_stride_rgb24, - uint8_t* dst_u, uint8_t* dst_v, int width) = RGB24ToUVRow_C; + uint8_t* dst_u, uint8_t* dst_v, int width) = + RGB24ToUVRow_C; void (*RGB24ToYRow)(const uint8_t* src_rgb24, uint8_t* dst_y, int width) = RGB24ToYRow_C; #else void (*RGB24ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) = RGB24ToARGBRow_C; - void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, uint8_t* dst_u, - uint8_t* dst_v, int width) = ARGBToUVRow_C; + void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYRow_C; #endif @@ -1099,8 +1107,9 @@ int RAWToI420(const uint8_t* src_raw, #else void (*RAWToARGBRow)(const uint8_t* 
src_rgb, uint8_t* dst_argb, int width) = RAWToARGBRow_C; - void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, uint8_t* dst_u, - uint8_t* dst_v, int width) = ARGBToUVRow_C; + void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYRow_C; #endif @@ -1228,10 +1237,11 @@ int RGB565ToI420(const uint8_t* src_rgb565, void (*RGB565ToYRow)(const uint8_t* src_rgb565, uint8_t* dst_y, int width) = RGB565ToYRow_C; #else - void (*RGB565ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) = - RGB565ToARGBRow_C; - void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, uint8_t* dst_u, - uint8_t* dst_v, int width) = ARGBToUVRow_C; + void (*RGB565ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, + int width) = RGB565ToARGBRow_C; + void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYRow_C; #endif @@ -1362,13 +1372,14 @@ int ARGB1555ToI420(const uint8_t* src_argb1555, void (*ARGB1555ToUVRow)(const uint8_t* src_argb1555, int src_stride_argb1555, uint8_t* dst_u, uint8_t* dst_v, int width) = ARGB1555ToUVRow_C; - void (*ARGB1555ToYRow)(const uint8_t* src_argb1555, uint8_t* dst_y, int width) = - ARGB1555ToYRow_C; + void (*ARGB1555ToYRow)(const uint8_t* src_argb1555, uint8_t* dst_y, + int width) = ARGB1555ToYRow_C; #else - void (*ARGB1555ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) = - ARGB1555ToARGBRow_C; - void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, uint8_t* dst_u, - uint8_t* dst_v, int width) = ARGBToUVRow_C; + void (*ARGB1555ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, + int width) = ARGB1555ToARGBRow_C; + void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYRow_C; #endif @@ -1503,13 +1514,14 @@ int ARGB4444ToI420(const uint8_t* src_argb4444, void (*ARGB4444ToUVRow)(const uint8_t* src_argb4444, int src_stride_argb4444, uint8_t* dst_u, uint8_t* dst_v, int width) = ARGB4444ToUVRow_C; - void (*ARGB4444ToYRow)(const uint8_t* src_argb4444, uint8_t* dst_y, int width) = - ARGB4444ToYRow_C; + void (*ARGB4444ToYRow)(const uint8_t* src_argb4444, uint8_t* dst_y, + int width) = ARGB4444ToYRow_C; #else - void (*ARGB4444ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) = - ARGB4444ToARGBRow_C; - void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, uint8_t* dst_u, - uint8_t* dst_v, int width) = ARGBToUVRow_C; + void (*ARGB4444ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, + int width) = ARGB4444ToARGBRow_C; + void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYRow_C; #endif diff --git a/source/convert_argb.cc b/source/convert_argb.cc index 513dc1813..94240f296 100644 --- a/source/convert_argb.cc +++ b/source/convert_argb.cc @@ -776,8 +776,8 @@ static int I420AlphaToARGBMatrix(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) = I422AlphaToARGBRow_C; - void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = - 
ARGBAttenuateRow_C; + void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, + int width) = ARGBAttenuateRow_C; if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -1244,8 +1244,8 @@ int RGB565ToARGB(const uint8_t* src_rgb565, int width, int height) { int y; - void (*RGB565ToARGBRow)(const uint8_t* src_rgb565, uint8_t* dst_argb, int width) = - RGB565ToARGBRow_C; + void (*RGB565ToARGBRow)(const uint8_t* src_rgb565, uint8_t* dst_argb, + int width) = RGB565ToARGBRow_C; if (!src_rgb565 || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -1481,9 +1481,9 @@ static int NV12ToARGBMatrix(const uint8_t* src_y, int width, int height) { int y; - void (*NV12ToARGBRow)(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = - NV12ToARGBRow_C; + void (*NV12ToARGBRow)( + const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = NV12ToARGBRow_C; if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -1548,9 +1548,9 @@ static int NV21ToARGBMatrix(const uint8_t* src_y, int width, int height) { int y; - void (*NV21ToARGBRow)(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = - NV21ToARGBRow_C; + void (*NV21ToARGBRow)( + const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = NV21ToARGBRow_C; if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -1670,9 +1670,9 @@ int M420ToARGB(const uint8_t* src_m420, int width, int height) { int y; - void (*NV12ToARGBRow)(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, int width) = - NV12ToARGBRow_C; + void (*NV12ToARGBRow)( + const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = NV12ToARGBRow_C; if (!src_m420 || !dst_argb || width <= 0 || height == 0) { return -1; } diff --git a/source/convert_from.cc b/source/convert_from.cc index 0a38a27cf..eb4631352 100644 --- a/source/convert_from.cc +++ b/source/convert_from.cc @@ -1123,7 +1123,7 @@ int I420ToRGB565Dither(const uint8_t* src_y, I422ToARGBRow(src_y, src_u, src_v, row_argb, &kYuvI601Constants, width); ARGBToRGB565DitherRow(row_argb, dst_rgb565, *(uint32_t*)(dither4x4 + ((y & 3) << 2)), // NOLINT - width); // NOLINT + width); // NOLINT dst_rgb565 += dst_stride_rgb565; src_y += src_stride_y; if (y & 1) { diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc index a94df42ac..1fc4289a8 100644 --- a/source/convert_from_argb.cc +++ b/source/convert_from_argb.cc @@ -35,8 +35,8 @@ int ARGBToI444(const uint8_t* src_argb, int y; void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYRow_C; - void (*ARGBToUV444Row)(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, - int width) = ARGBToUV444Row_C; + void (*ARGBToUV444Row)(const uint8_t* src_argb, uint8_t* dst_u, + uint8_t* dst_v, int width) = ARGBToUV444Row_C; if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } @@ -133,8 +133,9 @@ int ARGBToI422(const uint8_t* src_argb, int width, int height) { int y; - void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, uint8_t* dst_u, - uint8_t* dst_v, int width) = ARGBToUVRow_C; + void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* 
dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYRow_C; if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { @@ -229,12 +230,13 @@ int ARGBToNV12(const uint8_t* src_argb, int height) { int y; int halfwidth = (width + 1) >> 1; - void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, uint8_t* dst_u, - uint8_t* dst_v, int width) = ARGBToUVRow_C; + void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYRow_C; - void (*MergeUVRow_)(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, - int width) = MergeUVRow_C; + void (*MergeUVRow_)(const uint8_t* src_u, const uint8_t* src_v, + uint8_t* dst_uv, int width) = MergeUVRow_C; if (!src_argb || !dst_y || !dst_uv || width <= 0 || height == 0) { return -1; } @@ -364,12 +366,13 @@ int ARGBToNV21(const uint8_t* src_argb, int height) { int y; int halfwidth = (width + 1) >> 1; - void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, uint8_t* dst_u, - uint8_t* dst_v, int width) = ARGBToUVRow_C; + void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYRow_C; - void (*MergeUVRow_)(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, - int width) = MergeUVRow_C; + void (*MergeUVRow_)(const uint8_t* src_u, const uint8_t* src_v, + uint8_t* dst_uv, int width) = MergeUVRow_C; if (!src_argb || !dst_y || !dst_uv || width <= 0 || height == 0) { return -1; } @@ -496,8 +499,9 @@ int ARGBToYUY2(const uint8_t* src_argb, int width, int height) { int y; - void (*ARGBToUVRow)(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, - uint8_t* dst_v, int width) = ARGBToUVRow_C; + void (*ARGBToUVRow)(const uint8_t* src_argb, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYRow_C; void (*I422ToYUY2Row)(const uint8_t* src_y, const uint8_t* src_u, @@ -624,8 +628,9 @@ int ARGBToUYVY(const uint8_t* src_argb, int width, int height) { int y; - void (*ARGBToUVRow)(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, - uint8_t* dst_v, int width) = ARGBToUVRow_C; + void (*ARGBToUVRow)(const uint8_t* src_argb, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYRow_C; void (*I422ToUYVYRow)(const uint8_t* src_y, const uint8_t* src_u, @@ -1005,7 +1010,7 @@ int ARGBToRGB565Dither(const uint8_t* src_argb, for (y = 0; y < height; ++y) { ARGBToRGB565DitherRow(src_argb, dst_rgb565, - *(uint32_t*)(dither4x4 + ((y & 3) << 2)), + *(uint32_t*)(dither4x4 + ((y & 3) << 2)), // NOLINT width); /* NOLINT */ src_argb += src_stride_argb; dst_rgb565 += dst_stride_rgb565; @@ -1023,8 +1028,8 @@ int ARGBToRGB565(const uint8_t* src_argb, int width, int height) { int y; - void (*ARGBToRGB565Row)(const uint8_t* src_argb, uint8_t* dst_rgb, int width) = - ARGBToRGB565Row_C; + void (*ARGBToRGB565Row)(const uint8_t* src_argb, uint8_t* dst_rgb, + int width) = ARGBToRGB565Row_C; if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) { return -1; } @@ -1089,8 +1094,8 @@ int ARGBToARGB1555(const uint8_t* src_argb, int width, 
int height) { int y; - void (*ARGBToARGB1555Row)(const uint8_t* src_argb, uint8_t* dst_rgb, int width) = - ARGBToARGB1555Row_C; + void (*ARGBToARGB1555Row)(const uint8_t* src_argb, uint8_t* dst_rgb, + int width) = ARGBToARGB1555Row_C; if (!src_argb || !dst_argb1555 || width <= 0 || height == 0) { return -1; } @@ -1155,8 +1160,8 @@ int ARGBToARGB4444(const uint8_t* src_argb, int width, int height) { int y; - void (*ARGBToARGB4444Row)(const uint8_t* src_argb, uint8_t* dst_rgb, int width) = - ARGBToARGB4444Row_C; + void (*ARGBToARGB4444Row)(const uint8_t* src_argb, uint8_t* dst_rgb, + int width) = ARGBToARGB4444Row_C; if (!src_argb || !dst_argb4444 || width <= 0 || height == 0) { return -1; } @@ -1275,7 +1280,8 @@ int ARGBToJ420(const uint8_t* src_argb, int height) { int y; void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb, - uint8_t* dst_u, uint8_t* dst_v, int width) = ARGBToUVJRow_C; + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVJRow_C; void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_yj, int width) = ARGBToYJRow_C; if (!src_argb || !dst_yj || !dst_u || !dst_v || width <= 0 || height == 0) { @@ -1368,7 +1374,8 @@ int ARGBToJ422(const uint8_t* src_argb, int height) { int y; void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb, - uint8_t* dst_u, uint8_t* dst_v, int width) = ARGBToUVJRow_C; + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVJRow_C; void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_yj, int width) = ARGBToYJRow_C; if (!src_argb || !dst_yj || !dst_u || !dst_v || width <= 0 || height == 0) { diff --git a/source/convert_jpeg.cc b/source/convert_jpeg.cc index 12e006b90..c91b43dc2 100644 --- a/source/convert_jpeg.cc +++ b/source/convert_jpeg.cc @@ -89,7 +89,10 @@ static void JpegI400ToI420(void* opaque, // Query size of MJPG in pixels. LIBYUV_API -int MJPGSize(const uint8_t* sample, size_t sample_size, int* width, int* height) { +int MJPGSize(const uint8_t* sample, + size_t sample_size, + int* width, + int* height) { MJpegDecoder mjpeg_decoder; LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size); if (ret) { diff --git a/source/planar_functions.cc b/source/planar_functions.cc index 5094607bb..09e9c336d 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -430,8 +430,8 @@ void MergeUVPlane(const uint8_t* src_u, int width, int height) { int y; - void (*MergeUVRow)(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, - int width) = MergeUVRow_C; + void (*MergeUVRow)(const uint8_t* src_u, const uint8_t* src_v, + uint8_t* dst_uv, int width) = MergeUVRow_C; // Coalesce rows. // Negative height means invert the image. 
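The "Coalesce rows" step above is a recurring libyuv optimization worth spelling out: when every stride equals the row width in bytes, the plane is one contiguous block, so the whole image can be processed as a single long row and the per-row loop collapses to one kernel call. A minimal sketch of the idea, assuming a 1-byte-per-pixel plane with hypothetical variable names (for interleaved UV the width is doubled first):

  // Sketch: collapse a contiguous plane into a single long row.
  // Assumes strides are in bytes and exactly equal to width.
  if (src_stride == width && dst_stride == width) {
    width *= height;              // process height * width pixels as one row
    height = 1;
    src_stride = dst_stride = 0;  // strides are never advanced again
  }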
if (height < 0) { @@ -673,8 +673,8 @@ int YUY2ToI422(const uint8_t* src_yuy2, int width, int height) { int y; - void (*YUY2ToUV422Row)(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, - int width) = YUY2ToUV422Row_C; + void (*YUY2ToUV422Row)(const uint8_t* src_yuy2, uint8_t* dst_u, + uint8_t* dst_v, int width) = YUY2ToUV422Row_C; void (*YUY2ToYRow)(const uint8_t* src_yuy2, uint8_t* dst_y, int width) = YUY2ToYRow_C; if (!src_yuy2 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { @@ -759,8 +759,8 @@ int UYVYToI422(const uint8_t* src_uyvy, int width, int height) { int y; - void (*UYVYToUV422Row)(const uint8_t* src_uyvy, uint8_t* dst_u, uint8_t* dst_v, - int width) = UYVYToUV422Row_C; + void (*UYVYToUV422Row)(const uint8_t* src_uyvy, uint8_t* dst_u, + uint8_t* dst_v, int width) = UYVYToUV422Row_C; void (*UYVYToYRow)(const uint8_t* src_uyvy, uint8_t* dst_y, int width) = UYVYToYRow_C; if (!src_uyvy || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { @@ -1287,8 +1287,8 @@ int ARGBMultiply(const uint8_t* src_argb0, int width, int height) { int y; - void (*ARGBMultiplyRow)(const uint8_t* src0, const uint8_t* src1, uint8_t* dst, - int width) = ARGBMultiplyRow_C; + void (*ARGBMultiplyRow)(const uint8_t* src0, const uint8_t* src1, + uint8_t* dst, int width) = ARGBMultiplyRow_C; if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -1436,8 +1436,8 @@ int ARGBSubtract(const uint8_t* src_argb0, int width, int height) { int y; - void (*ARGBSubtractRow)(const uint8_t* src0, const uint8_t* src1, uint8_t* dst, - int width) = ARGBSubtractRow_C; + void (*ARGBSubtractRow)(const uint8_t* src0, const uint8_t* src1, + uint8_t* dst, int width) = ARGBSubtractRow_C; if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -1822,7 +1822,8 @@ int ARGBRect(uint8_t* dst_argb, int height, uint32_t value) { int y; - void (*ARGBSetRow)(uint8_t * dst_argb, uint32_t value, int width) = ARGBSetRow_C; + void (*ARGBSetRow)(uint8_t * dst_argb, uint32_t value, int width) = + ARGBSetRow_C; if (!dst_argb || width <= 0 || height == 0 || dst_x < 0 || dst_y < 0) { return -1; } @@ -1890,8 +1891,8 @@ int ARGBAttenuate(const uint8_t* src_argb, int width, int height) { int y; - void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = - ARGBAttenuateRow_C; + void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, + int width) = ARGBAttenuateRow_C; if (!src_argb || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -2399,9 +2400,9 @@ int ARGBBlur(const uint8_t* src_argb, void (*ComputeCumulativeSumRow)(const uint8_t* row, int32_t* cumsum, const int32_t* previous_cumsum, int width) = ComputeCumulativeSumRow_C; - void (*CumulativeSumToAverageRow)(const int32_t* topleft, const int32_t* botleft, - int width, int area, uint8_t* dst, - int count) = CumulativeSumToAverageRow_C; + void (*CumulativeSumToAverageRow)( + const int32_t* topleft, const int32_t* botleft, int width, int area, + uint8_t* dst, int count) = CumulativeSumToAverageRow_C; int32_t* cumsum_bot_row; int32_t* max_cumsum_bot_row; int32_t* cumsum_top_row; @@ -2752,8 +2753,8 @@ static int ARGBSobelize(const uint8_t* src_argb, int y; void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_g, int width) = ARGBToYJRow_C; - void (*SobelYRow)(const uint8_t* src_y0, const uint8_t* src_y1, uint8_t* dst_sobely, - int width) = SobelYRow_C; + void (*SobelYRow)(const uint8_t* src_y0, const uint8_t* src_y1, + uint8_t* dst_sobely, int width) = SobelYRow_C; 
void (*SobelXRow)(const uint8_t* src_y0, const uint8_t* src_y1, const uint8_t* src_y2, uint8_t* dst_sobely, int width) = SobelXRow_C; @@ -3052,8 +3053,8 @@ int HalfFloatPlane(const uint16_t* src_y, int width, int height) { int y; - void (*HalfFloatRow)(const uint16_t* src, uint16_t* dst, float scale, int width) = - HalfFloatRow_C; + void (*HalfFloatRow)(const uint16_t* src, uint16_t* dst, float scale, + int width) = HalfFloatRow_C; if (!src_y || !dst_y || width <= 0 || height == 0) { return -1; } @@ -3133,8 +3134,8 @@ int ARGBLumaColorTable(const uint8_t* src_argb, int height) { int y; void (*ARGBLumaColorTableRow)( - const uint8_t* src_argb, uint8_t* dst_argb, int width, const uint8_t* luma, - const uint32_t lumacoeff) = ARGBLumaColorTableRow_C; + const uint8_t* src_argb, uint8_t* dst_argb, int width, + const uint8_t* luma, const uint32_t lumacoeff) = ARGBLumaColorTableRow_C; if (!src_argb || !dst_argb || !luma || width <= 0 || height == 0) { return -1; } @@ -3173,8 +3174,8 @@ int ARGBCopyAlpha(const uint8_t* src_argb, int width, int height) { int y; - void (*ARGBCopyAlphaRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = - ARGBCopyAlphaRow_C; + void (*ARGBCopyAlphaRow)(const uint8_t* src_argb, uint8_t* dst_argb, + int width) = ARGBCopyAlphaRow_C; if (!src_argb || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -3238,8 +3239,8 @@ int ARGBExtractAlpha(const uint8_t* src_argb, height = 1; src_stride = dst_stride = 0; } - void (*ARGBExtractAlphaRow)(const uint8_t* src_argb, uint8_t* dst_a, int width) = - ARGBExtractAlphaRow_C; + void (*ARGBExtractAlphaRow)(const uint8_t* src_argb, uint8_t* dst_a, + int width) = ARGBExtractAlphaRow_C; #if defined(HAS_ARGBEXTRACTALPHAROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { ARGBExtractAlphaRow = IS_ALIGNED(width, 8) ? 
ARGBExtractAlphaRow_SSE2 @@ -3282,8 +3283,8 @@ int ARGBCopyYToAlpha(const uint8_t* src_y, int width, int height) { int y; - void (*ARGBCopyYToAlphaRow)(const uint8_t* src_y, uint8_t* dst_argb, int width) = - ARGBCopyYToAlphaRow_C; + void (*ARGBCopyYToAlphaRow)(const uint8_t* src_y, uint8_t* dst_argb, + int width) = ARGBCopyYToAlphaRow_C; if (!src_y || !dst_argb || width <= 0 || height == 0) { return -1; } diff --git a/source/rotate.cc b/source/rotate.cc index ceec134d5..f2bed85b7 100644 --- a/source/rotate.cc +++ b/source/rotate.cc @@ -314,8 +314,8 @@ void RotateUV180(const uint8_t* src, int width, int height) { int i; - void (*MirrorUVRow)(const uint8_t* src, uint8_t* dst_u, uint8_t* dst_v, int width) = - MirrorUVRow_C; + void (*MirrorUVRow)(const uint8_t* src, uint8_t* dst_u, uint8_t* dst_v, + int width) = MirrorUVRow_C; #if defined(HAS_MIRRORUVROW_NEON) if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { MirrorUVRow = MirrorUVRow_NEON; diff --git a/source/rotate_any.cc b/source/rotate_any.cc index 40dcc4bb1..c2752e622 100644 --- a/source/rotate_any.cc +++ b/source/rotate_any.cc @@ -19,8 +19,8 @@ extern "C" { #endif #define TANY(NAMEANY, TPOS_SIMD, MASK) \ - void NAMEANY(const uint8_t* src, int src_stride, uint8_t* dst, int dst_stride, \ - int width) { \ + void NAMEANY(const uint8_t* src, int src_stride, uint8_t* dst, \ + int dst_stride, int width) { \ int r = width & MASK; \ int n = width - r; \ if (n > 0) { \ @@ -44,8 +44,9 @@ TANY(TransposeWx16_Any_MSA, TransposeWx16_MSA, 15) #undef TANY #define TUVANY(NAMEANY, TPOS_SIMD, MASK) \ - void NAMEANY(const uint8_t* src, int src_stride, uint8_t* dst_a, \ - int dst_stride_a, uint8_t* dst_b, int dst_stride_b, int width) { \ + void NAMEANY(const uint8_t* src, int src_stride, uint8_t* dst_a, \ + int dst_stride_a, uint8_t* dst_b, int dst_stride_b, \ + int width) { \ int r = width & MASK; \ int n = width - r; \ if (n > 0) { \ diff --git a/source/row_any.cc b/source/row_any.cc index 3936254d1..2b3b85272 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -31,25 +31,25 @@ extern "C" { #define SS(width, shift) (((width) + (1 << (shift)) - 1) >> (shift)) // Any 4 planes to 1 with yuvconstants -#define ANY41C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ - void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, \ - const uint8_t* a_buf, uint8_t* dst_ptr, \ - const struct YuvConstants* yuvconstants, int width) { \ +#define ANY41C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ + void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \ + const uint8_t* v_buf, const uint8_t* a_buf, uint8_t* dst_ptr, \ + const struct YuvConstants* yuvconstants, int width) { \ SIMD_ALIGNED(uint8_t temp[64 * 5]); \ - memset(temp, 0, 64 * 4); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, yuvconstants, n); \ - } \ - memcpy(temp, y_buf + n, r); \ - memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ - memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ - memcpy(temp + 192, a_buf + n, r); \ - ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, temp + 256, \ - yuvconstants, MASK + 1); \ - memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 256, \ - SS(r, DUVSHIFT) * BPP); \ + memset(temp, 0, 64 * 4); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, yuvconstants, n); \ + } \ + memcpy(temp, y_buf + n, r); \ + memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, 
UVSHIFT)); \ + memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + memcpy(temp + 192, a_buf + n, r); \ + ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, temp + 256, \ + yuvconstants, MASK + 1); \ + memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 256, \ + SS(r, DUVSHIFT) * BPP); \ } #ifdef HAS_I422ALPHATOARGBROW_SSSE3 @@ -67,22 +67,22 @@ ANY41C(I422AlphaToARGBRow_Any_MSA, I422AlphaToARGBRow_MSA, 1, 0, 4, 7) #undef ANY41C // Any 3 planes to 1. -#define ANY31(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ - void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, \ - uint8_t* dst_ptr, int width) { \ - SIMD_ALIGNED(uint8_t temp[64 * 4]); \ - memset(temp, 0, 64 * 3); /* for YUY2 and msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, n); \ - } \ - memcpy(temp, y_buf + n, r); \ - memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ - memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ - ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, MASK + 1); \ - memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192, \ - SS(r, DUVSHIFT) * BPP); \ +#define ANY31(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ + void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \ + const uint8_t* v_buf, uint8_t* dst_ptr, int width) { \ + SIMD_ALIGNED(uint8_t temp[64 * 4]); \ + memset(temp, 0, 64 * 3); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, n); \ + } \ + memcpy(temp, y_buf + n, r); \ + memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, MASK + 1); \ + memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192, \ + SS(r, DUVSHIFT) * BPP); \ } // Merge functions. @@ -120,10 +120,10 @@ ANY31(BlendPlaneRow_Any_SSSE3, BlendPlaneRow_SSSE3, 0, 0, 1, 7) // on arm that subsamples 444 to 422 internally. 
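All of the ANY* wrappers in this file share one remainder-handling scheme: run the SIMD kernel over n = width & ~MASK pixels (the largest multiple of the vector width), copy the r = width & MASK leftover pixels into a zeroed, aligned scratch buffer, run the kernel once more on a full MASK + 1 block, and copy back only the r valid outputs. A standalone sketch of the pattern for a one-plane op, with illustrative names (FooRow_* is not a libyuv function):

  #include <stdint.h>
  #include <string.h>

  // Hypothetical kernel that requires width to be a multiple of 16.
  void FooRow_SIMD(const uint8_t* src, uint8_t* dst, int width);

  void FooRow_Any(const uint8_t* src, uint8_t* dst, int width) {
    SIMD_ALIGNED(uint8_t temp[128 * 2]);
    memset(temp, 0, 128);         // keep the tail read deterministic (msan)
    int r = width & 15;           // leftover pixels
    int n = width & ~15;          // bulk the kernel can handle directly
    if (n > 0) {
      FooRow_SIMD(src, dst, n);
    }
    memcpy(temp, src + n, r);           // stage the tail safely
    FooRow_SIMD(temp, temp + 128, 16);  // one full vector over the tail
    memcpy(dst + n, temp + 128, r);     // commit only the valid pixels
  }

The staging buffer is what lets the SIMD kernels stay branch-free: they never have to handle a partial vector themselves.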
// Any 3 planes to 1 with yuvconstants #define ANY31C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ - void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, \ - uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, \ - int width) { \ - SIMD_ALIGNED(uint8_t temp[64 * 4]); \ + void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \ + const uint8_t* v_buf, uint8_t* dst_ptr, \ + const struct YuvConstants* yuvconstants, int width) { \ + SIMD_ALIGNED(uint8_t temp[64 * 4]); \ memset(temp, 0, 64 * 3); /* for YUY2 and msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ @@ -199,22 +199,23 @@ ANY31C(I422ToRGB565Row_Any_MSA, I422ToRGB565Row_MSA, 1, 0, 2, 7) // Any 3 planes of 16 bit to 1 with yuvconstants // TODO(fbarchard): consider sharing this code with ANY31C -#define ANY31CT(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, T, SBPP, BPP, MASK) \ - void NAMEANY(const T* y_buf, const T* u_buf, const T* v_buf, uint8_t* dst_ptr, \ - const struct YuvConstants* yuvconstants, int width) { \ - SIMD_ALIGNED(T temp[16 * 3]); \ - SIMD_ALIGNED(uint8_t out[64]); \ - memset(temp, 0, 16 * 3 * SBPP); /* for YUY2 and msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n); \ - } \ - memcpy(temp, y_buf + n, r * SBPP); \ - memcpy(temp + 16, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \ - memcpy(temp + 32, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \ - ANY_SIMD(temp, temp + 16, temp + 32, out, yuvconstants, MASK + 1); \ - memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, out, SS(r, DUVSHIFT) * BPP); \ +#define ANY31CT(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, T, SBPP, BPP, MASK) \ + void NAMEANY(const T* y_buf, const T* u_buf, const T* v_buf, \ + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, \ + int width) { \ + SIMD_ALIGNED(T temp[16 * 3]); \ + SIMD_ALIGNED(uint8_t out[64]); \ + memset(temp, 0, 16 * 3 * SBPP); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n); \ + } \ + memcpy(temp, y_buf + n, r * SBPP); \ + memcpy(temp + 16, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \ + memcpy(temp + 32, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \ + ANY_SIMD(temp, temp + 16, temp + 32, out, yuvconstants, MASK + 1); \ + memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, out, SS(r, DUVSHIFT) * BPP); \ } #ifdef HAS_I210TOAR30ROW_SSSE3 @@ -229,21 +230,21 @@ ANY31CT(I210ToARGBRow_Any_AVX2, I210ToARGBRow_AVX2, 1, 0, uint16_t, 2, 4, 15) #undef ANY31CT // Any 2 planes to 1. 
-#define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \ +#define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \ void NAMEANY(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, \ - int width) { \ - SIMD_ALIGNED(uint8_t temp[64 * 3]); \ - memset(temp, 0, 64 * 2); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(y_buf, uv_buf, dst_ptr, n); \ - } \ - memcpy(temp, y_buf + n * SBPP, r * SBPP); \ - memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2, \ - SS(r, UVSHIFT) * SBPP2); \ - ANY_SIMD(temp, temp + 64, temp + 128, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ + int width) { \ + SIMD_ALIGNED(uint8_t temp[64 * 3]); \ + memset(temp, 0, 64 * 2); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, uv_buf, dst_ptr, n); \ + } \ + memcpy(temp, y_buf + n * SBPP, r * SBPP); \ + memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2, \ + SS(r, UVSHIFT) * SBPP2); \ + ANY_SIMD(temp, temp + 64, temp + 128, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ } // Merge functions. @@ -327,21 +328,21 @@ ANY21(SobelXYRow_Any_MSA, SobelXYRow_MSA, 0, 1, 1, 4, 15) #undef ANY21 // Any 2 planes to 1 with yuvconstants -#define ANY21C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \ +#define ANY21C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \ void NAMEANY(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, \ - const struct YuvConstants* yuvconstants, int width) { \ - SIMD_ALIGNED(uint8_t temp[64 * 3]); \ - memset(temp, 0, 64 * 2); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(y_buf, uv_buf, dst_ptr, yuvconstants, n); \ - } \ - memcpy(temp, y_buf + n * SBPP, r * SBPP); \ - memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2, \ - SS(r, UVSHIFT) * SBPP2); \ - ANY_SIMD(temp, temp + 64, temp + 128, yuvconstants, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ + const struct YuvConstants* yuvconstants, int width) { \ + SIMD_ALIGNED(uint8_t temp[64 * 3]); \ + memset(temp, 0, 64 * 2); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, uv_buf, dst_ptr, yuvconstants, n); \ + } \ + memcpy(temp, y_buf + n * SBPP, r * SBPP); \ + memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2, \ + SS(r, UVSHIFT) * SBPP2); \ + ANY_SIMD(temp, temp + 64, temp + 128, yuvconstants, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ } // Biplanar to RGB. @@ -385,8 +386,8 @@ ANY21C(NV12ToRGB565Row_Any_MSA, NV12ToRGB565Row_MSA, 1, 1, 2, 2, 7) // Any 1 to 1. #define ANY11(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \ - SIMD_ALIGNED(uint8_t temp[128 * 2]); \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \ + SIMD_ALIGNED(uint8_t temp[128 * 2]); \ memset(temp, 0, 128); /* for YUY2 and msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ @@ -640,8 +641,8 @@ ANY11(ARGBExtractAlphaRow_Any_MSA, ARGBExtractAlphaRow_MSA, 0, 4, 1, 15) // Any 1 to 1 blended. Destination is read, modify, write. 
#define ANY11B(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \ - SIMD_ALIGNED(uint8_t temp[64 * 2]); \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \ + SIMD_ALIGNED(uint8_t temp[64 * 2]); \ memset(temp, 0, 64 * 2); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ @@ -669,18 +670,18 @@ ANY11B(ARGBCopyYToAlphaRow_Any_SSE2, ARGBCopyYToAlphaRow_SSE2, 0, 1, 4, 7) #undef ANY11B // Any 1 to 1 with parameter. -#define ANY11P(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK) \ +#define ANY11P(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK) \ void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, T param, int width) { \ - SIMD_ALIGNED(uint8_t temp[64 * 2]); \ - memset(temp, 0, 64); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, dst_ptr, param, n); \ - } \ - memcpy(temp, src_ptr + n * SBPP, r * SBPP); \ - ANY_SIMD(temp, temp + 64, param, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp + 64, r * BPP); \ + SIMD_ALIGNED(uint8_t temp[64 * 2]); \ + memset(temp, 0, 64); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_ptr, param, n); \ + } \ + memcpy(temp, src_ptr + n * SBPP, r * SBPP); \ + ANY_SIMD(temp, temp + 64, param, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp + 64, r * BPP); \ } #if defined(HAS_ARGBTORGB565DITHERROW_SSE2) @@ -755,29 +756,48 @@ ANY11C(Convert16To8Row_Any_SSSE3, 15) #endif #ifdef HAS_CONVERT16TO8ROW_AVX2 -ANY11C(Convert16To8Row_Any_AVX2, Convert16To8Row_AVX2, 2, 1, uint16_t, uint8_t, 31) +ANY11C(Convert16To8Row_Any_AVX2, + Convert16To8Row_AVX2, + 2, + 1, + uint16_t, + uint8_t, + 31) #endif #ifdef HAS_CONVERT8TO16ROW_SSE2 -ANY11C(Convert8To16Row_Any_SSE2, Convert8To16Row_SSE2, 1, 2, uint8_t, uint16_t, 15) +ANY11C(Convert8To16Row_Any_SSE2, + Convert8To16Row_SSE2, + 1, + 2, + uint8_t, + uint16_t, + 15) #endif #ifdef HAS_CONVERT8TO16ROW_AVX2 -ANY11C(Convert8To16Row_Any_AVX2, Convert8To16Row_AVX2, 1, 2, uint8_t, uint16_t, 31) +ANY11C(Convert8To16Row_Any_AVX2, + Convert8To16Row_AVX2, + 1, + 2, + uint8_t, + uint16_t, + 31) #endif #undef ANY11C // Any 1 to 1 with parameter and shorts to byte. BPP measures in shorts. 
-#define ANY11P16(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK) \ - void NAMEANY(const uint16_t* src_ptr, uint16_t* dst_ptr, T param, int width) { \ - SIMD_ALIGNED(uint16_t temp[32 * 2]); \ - memset(temp, 0, 64); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, dst_ptr, param, n); \ - } \ - memcpy(temp, src_ptr + n, r * SBPP); \ - ANY_SIMD(temp, temp + 16, param, MASK + 1); \ - memcpy(dst_ptr + n, temp + 16, r * BPP); \ +#define ANY11P16(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK) \ + void NAMEANY(const uint16_t* src_ptr, uint16_t* dst_ptr, T param, \ + int width) { \ + SIMD_ALIGNED(uint16_t temp[32 * 2]); \ + memset(temp, 0, 64); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_ptr, param, n); \ + } \ + memcpy(temp, src_ptr + n, r * SBPP); \ + ANY_SIMD(temp, temp + 16, param, MASK + 1); \ + memcpy(dst_ptr + n, temp + 16, r * BPP); \ } #ifdef HAS_HALFFLOATROW_SSE2 @@ -801,9 +821,9 @@ ANY11P16(HalfFloatRow_Any_MSA, HalfFloatRow_MSA, float, 2, 2, 31) // Any 1 to 1 with yuvconstants #define ANY11C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, \ const struct YuvConstants* yuvconstants, int width) { \ - SIMD_ALIGNED(uint8_t temp[128 * 2]); \ + SIMD_ALIGNED(uint8_t temp[128 * 2]); \ memset(temp, 0, 128); /* for YUY2 and msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ @@ -833,20 +853,20 @@ ANY11C(UYVYToARGBRow_Any_MSA, UYVYToARGBRow_MSA, 1, 4, 4, 7) #undef ANY11C // Any 1 to 1 interpolate. Takes 2 rows of source via stride. -#define ANY11T(NAMEANY, ANY_SIMD, SBPP, BPP, MASK) \ - void NAMEANY(uint8_t* dst_ptr, const uint8_t* src_ptr, ptrdiff_t src_stride_ptr, \ - int width, int source_y_fraction) { \ - SIMD_ALIGNED(uint8_t temp[64 * 3]); \ - memset(temp, 0, 64 * 2); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(dst_ptr, src_ptr, src_stride_ptr, n, source_y_fraction); \ - } \ - memcpy(temp, src_ptr + n * SBPP, r * SBPP); \ - memcpy(temp + 64, src_ptr + src_stride_ptr + n * SBPP, r * SBPP); \ - ANY_SIMD(temp + 128, temp, 64, MASK + 1, source_y_fraction); \ - memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ +#define ANY11T(NAMEANY, ANY_SIMD, SBPP, BPP, MASK) \ + void NAMEANY(uint8_t* dst_ptr, const uint8_t* src_ptr, \ + ptrdiff_t src_stride_ptr, int width, int source_y_fraction) { \ + SIMD_ALIGNED(uint8_t temp[64 * 3]); \ + memset(temp, 0, 64 * 2); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(dst_ptr, src_ptr, src_stride_ptr, n, source_y_fraction); \ + } \ + memcpy(temp, src_ptr + n * SBPP, r * SBPP); \ + memcpy(temp + 64, src_ptr + src_stride_ptr + n * SBPP, r * SBPP); \ + ANY_SIMD(temp + 128, temp, 64, MASK + 1, source_y_fraction); \ + memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ } #ifdef HAS_INTERPOLATEROW_AVX2 @@ -865,8 +885,8 @@ ANY11T(InterpolateRow_Any_MSA, InterpolateRow_MSA, 1, 1, 31) // Any 1 to 1 mirror. 
#define ANY11M(NAMEANY, ANY_SIMD, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \ - SIMD_ALIGNED(uint8_t temp[64 * 2]); \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \ + SIMD_ALIGNED(uint8_t temp[64 * 2]); \ memset(temp, 0, 64); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ @@ -905,16 +925,16 @@ ANY11M(ARGBMirrorRow_Any_MSA, ARGBMirrorRow_MSA, 4, 15) #undef ANY11M // Any 1 plane. (memset) -#define ANY1(NAMEANY, ANY_SIMD, T, BPP, MASK) \ +#define ANY1(NAMEANY, ANY_SIMD, T, BPP, MASK) \ void NAMEANY(uint8_t* dst_ptr, T v32, int width) { \ SIMD_ALIGNED(uint8_t temp[64]); \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(dst_ptr, v32, n); \ - } \ - ANY_SIMD(temp, v32, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp, r * BPP); \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(dst_ptr, v32, n); \ + } \ + ANY_SIMD(temp, v32, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp, r * BPP); \ } #ifdef HAS_SETROW_X86 @@ -932,19 +952,20 @@ ANY1(ARGBSetRow_Any_MSA, ARGBSetRow_MSA, uint32_t, 4, 3) #undef ANY1 // Any 1 to 2. Outputs UV planes. -#define ANY12(NAMEANY, ANY_SIMD, UVSHIFT, BPP, DUVSHIFT, MASK) \ - void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width) { \ - SIMD_ALIGNED(uint8_t temp[128 * 3]); \ - memset(temp, 0, 128); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, dst_u, dst_v, n); \ - } \ - memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ - ANY_SIMD(temp, temp + 128, temp + 256, MASK + 1); \ - memcpy(dst_u + (n >> DUVSHIFT), temp + 128, SS(r, DUVSHIFT)); \ - memcpy(dst_v + (n >> DUVSHIFT), temp + 256, SS(r, DUVSHIFT)); \ +#define ANY12(NAMEANY, ANY_SIMD, UVSHIFT, BPP, DUVSHIFT, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, \ + int width) { \ + SIMD_ALIGNED(uint8_t temp[128 * 3]); \ + memset(temp, 0, 128); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_u, dst_v, n); \ + } \ + memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ + ANY_SIMD(temp, temp + 128, temp + 256, MASK + 1); \ + memcpy(dst_u + (n >> DUVSHIFT), temp + 128, SS(r, DUVSHIFT)); \ + memcpy(dst_v + (n >> DUVSHIFT), temp + 256, SS(r, DUVSHIFT)); \ } #ifdef HAS_SPLITUVROW_SSE2 @@ -983,21 +1004,21 @@ ANY12(UYVYToUV422Row_Any_MSA, UYVYToUV422Row_MSA, 1, 4, 1, 31) #undef ANY12 // Any 1 to 3. Outputs RGB planes. 
-#define ANY13(NAMEANY, ANY_SIMD, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, \ - int width) { \ - SIMD_ALIGNED(uint8_t temp[16 * 6]); \ - memset(temp, 0, 16 * 3); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, dst_r, dst_g, dst_b, n); \ - } \ - memcpy(temp, src_ptr + n * BPP, r * BPP); \ - ANY_SIMD(temp, temp + 16 * 3, temp + 16 * 4, temp + 16 * 5, MASK + 1); \ - memcpy(dst_r + n, temp + 16 * 3, r); \ - memcpy(dst_g + n, temp + 16 * 4, r); \ - memcpy(dst_b + n, temp + 16 * 5, r); \ +#define ANY13(NAMEANY, ANY_SIMD, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, \ + uint8_t* dst_b, int width) { \ + SIMD_ALIGNED(uint8_t temp[16 * 6]); \ + memset(temp, 0, 16 * 3); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_r, dst_g, dst_b, n); \ + } \ + memcpy(temp, src_ptr + n * BPP, r * BPP); \ + ANY_SIMD(temp, temp + 16 * 3, temp + 16 * 4, temp + 16 * 5, MASK + 1); \ + memcpy(dst_r + n, temp + 16 * 3, r); \ + memcpy(dst_g + n, temp + 16 * 4, r); \ + memcpy(dst_b + n, temp + 16 * 5, r); \ } #ifdef HAS_SPLITRGBROW_SSSE3 @@ -1010,9 +1031,9 @@ ANY13(SplitRGBRow_Any_NEON, SplitRGBRow_NEON, 3, 15) // Any 1 to 2 with source stride (2 rows of source). Outputs UV planes. // 128 byte row allows for 32 avx ARGB pixels. #define ANY12S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_u, \ - uint8_t* dst_v, int width) { \ - SIMD_ALIGNED(uint8_t temp[128 * 4]); \ + void NAMEANY(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_u, \ + uint8_t* dst_v, int width) { \ + SIMD_ALIGNED(uint8_t temp[128 * 4]); \ memset(temp, 0, 128 * 2); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ diff --git a/source/row_common.cc b/source/row_common.cc index ae3cd3291..737d55979 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -10,8 +10,8 @@ #include "libyuv/row.h" -#include <string.h>  // For memcpy and memset. #include <stdlib.h> +#include <string.h>  // For memcpy and memset. 
#include "libyuv/basic_types.h" @@ -125,7 +125,9 @@ void RAWToRGB24Row_C(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { } } -void RGB565ToARGBRow_C(const uint8_t* src_rgb565, uint8_t* dst_argb, int width) { +void RGB565ToARGBRow_C(const uint8_t* src_rgb565, + uint8_t* dst_argb, + int width) { int x; for (x = 0; x < width; ++x) { uint8_t b = src_rgb565[0] & 0x1f; @@ -291,7 +293,7 @@ void ARGBToARGB1555Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { uint8_t r1 = src_argb[6] >> 3; uint8_t a1 = src_argb[7] >> 7; *(uint32_t*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 10) | (a0 << 15) | - (b1 << 16) | (g1 << 21) | (r1 << 26) | (a1 << 31); + (b1 << 16) | (g1 << 21) | (r1 << 26) | (a1 << 31); dst_rgb += 4; src_argb += 8; } @@ -315,8 +317,8 @@ void ARGBToARGB4444Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { uint8_t g1 = src_argb[5] >> 4; uint8_t r1 = src_argb[6] >> 4; uint8_t a1 = src_argb[7] >> 4; - *(uint32_t*)(dst_rgb) = b0 | (g0 << 4) | (r0 << 8) | (a0 << 12) | (b1 << 16) | - (g1 << 20) | (r1 << 24) | (a1 << 28); + *(uint32_t*)(dst_rgb) = b0 | (g0 << 4) | (r0 << 8) | (a0 << 12) | + (b1 << 16) | (g1 << 20) | (r1 << 24) | (a1 << 28); dst_rgb += 4; src_argb += 8; } @@ -354,43 +356,43 @@ static __inline int RGBToV(uint8_t r, uint8_t g, uint8_t b) { } // ARGBToY_C and ARGBToUV_C -#define MAKEROWY(NAME, R, G, B, BPP) \ +#define MAKEROWY(NAME, R, G, B, BPP) \ void NAME##ToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \ - int x; \ - for (x = 0; x < width; ++x) { \ - dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]); \ - src_argb0 += BPP; \ - dst_y += 1; \ - } \ - } \ - void NAME##ToUVRow_C(const uint8_t* src_rgb0, int src_stride_rgb, \ + int x; \ + for (x = 0; x < width; ++x) { \ + dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]); \ + src_argb0 += BPP; \ + dst_y += 1; \ + } \ + } \ + void NAME##ToUVRow_C(const uint8_t* src_rgb0, int src_stride_rgb, \ uint8_t* dst_u, uint8_t* dst_v, int width) { \ - const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb; \ - int x; \ - for (x = 0; x < width - 1; x += 2) { \ - uint8_t ab = (src_rgb0[B] + src_rgb0[B + BPP] + src_rgb1[B] + \ - src_rgb1[B + BPP]) >> \ - 2; \ - uint8_t ag = (src_rgb0[G] + src_rgb0[G + BPP] + src_rgb1[G] + \ - src_rgb1[G + BPP]) >> \ - 2; \ - uint8_t ar = (src_rgb0[R] + src_rgb0[R + BPP] + src_rgb1[R] + \ - src_rgb1[R + BPP]) >> \ - 2; \ - dst_u[0] = RGBToU(ar, ag, ab); \ - dst_v[0] = RGBToV(ar, ag, ab); \ - src_rgb0 += BPP * 2; \ - src_rgb1 += BPP * 2; \ - dst_u += 1; \ - dst_v += 1; \ - } \ - if (width & 1) { \ - uint8_t ab = (src_rgb0[B] + src_rgb1[B]) >> 1; \ - uint8_t ag = (src_rgb0[G] + src_rgb1[G]) >> 1; \ - uint8_t ar = (src_rgb0[R] + src_rgb1[R]) >> 1; \ - dst_u[0] = RGBToU(ar, ag, ab); \ - dst_v[0] = RGBToV(ar, ag, ab); \ - } \ + const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb; \ + int x; \ + for (x = 0; x < width - 1; x += 2) { \ + uint8_t ab = (src_rgb0[B] + src_rgb0[B + BPP] + src_rgb1[B] + \ + src_rgb1[B + BPP]) >> \ + 2; \ + uint8_t ag = (src_rgb0[G] + src_rgb0[G + BPP] + src_rgb1[G] + \ + src_rgb1[G + BPP]) >> \ + 2; \ + uint8_t ar = (src_rgb0[R] + src_rgb0[R + BPP] + src_rgb1[R] + \ + src_rgb1[R + BPP]) >> \ + 2; \ + dst_u[0] = RGBToU(ar, ag, ab); \ + dst_v[0] = RGBToV(ar, ag, ab); \ + src_rgb0 += BPP * 2; \ + src_rgb1 += BPP * 2; \ + dst_u += 1; \ + dst_v += 1; \ + } \ + if (width & 1) { \ + uint8_t ab = (src_rgb0[B] + src_rgb1[B]) >> 1; \ + uint8_t ag = (src_rgb0[G] + src_rgb1[G]) >> 1; \ + uint8_t ar = (src_rgb0[R] + src_rgb1[R]) >> 1; \ + dst_u[0] = 
RGBToU(ar, ag, ab); \ + dst_v[0] = RGBToV(ar, ag, ab); \ + } \ } MAKEROWY(ARGB, 2, 1, 0, 4) @@ -440,40 +442,40 @@ static __inline int RGBToVJ(uint8_t r, uint8_t g, uint8_t b) { #define AVGB(a, b) (((a) + (b) + 1) >> 1) // ARGBToYJ_C and ARGBToUVJ_C -#define MAKEROWYJ(NAME, R, G, B, BPP) \ +#define MAKEROWYJ(NAME, R, G, B, BPP) \ void NAME##ToYJRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \ - int x; \ - for (x = 0; x < width; ++x) { \ - dst_y[0] = RGBToYJ(src_argb0[R], src_argb0[G], src_argb0[B]); \ - src_argb0 += BPP; \ - dst_y += 1; \ - } \ - } \ - void NAME##ToUVJRow_C(const uint8_t* src_rgb0, int src_stride_rgb, \ + int x; \ + for (x = 0; x < width; ++x) { \ + dst_y[0] = RGBToYJ(src_argb0[R], src_argb0[G], src_argb0[B]); \ + src_argb0 += BPP; \ + dst_y += 1; \ + } \ + } \ + void NAME##ToUVJRow_C(const uint8_t* src_rgb0, int src_stride_rgb, \ uint8_t* dst_u, uint8_t* dst_v, int width) { \ - const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb; \ - int x; \ - for (x = 0; x < width - 1; x += 2) { \ - uint8_t ab = AVGB(AVGB(src_rgb0[B], src_rgb1[B]), \ - AVGB(src_rgb0[B + BPP], src_rgb1[B + BPP])); \ - uint8_t ag = AVGB(AVGB(src_rgb0[G], src_rgb1[G]), \ - AVGB(src_rgb0[G + BPP], src_rgb1[G + BPP])); \ - uint8_t ar = AVGB(AVGB(src_rgb0[R], src_rgb1[R]), \ - AVGB(src_rgb0[R + BPP], src_rgb1[R + BPP])); \ - dst_u[0] = RGBToUJ(ar, ag, ab); \ - dst_v[0] = RGBToVJ(ar, ag, ab); \ - src_rgb0 += BPP * 2; \ - src_rgb1 += BPP * 2; \ - dst_u += 1; \ - dst_v += 1; \ - } \ - if (width & 1) { \ - uint8_t ab = AVGB(src_rgb0[B], src_rgb1[B]); \ - uint8_t ag = AVGB(src_rgb0[G], src_rgb1[G]); \ - uint8_t ar = AVGB(src_rgb0[R], src_rgb1[R]); \ - dst_u[0] = RGBToUJ(ar, ag, ab); \ - dst_v[0] = RGBToVJ(ar, ag, ab); \ - } \ + const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb; \ + int x; \ + for (x = 0; x < width - 1; x += 2) { \ + uint8_t ab = AVGB(AVGB(src_rgb0[B], src_rgb1[B]), \ + AVGB(src_rgb0[B + BPP], src_rgb1[B + BPP])); \ + uint8_t ag = AVGB(AVGB(src_rgb0[G], src_rgb1[G]), \ + AVGB(src_rgb0[G + BPP], src_rgb1[G + BPP])); \ + uint8_t ar = AVGB(AVGB(src_rgb0[R], src_rgb1[R]), \ + AVGB(src_rgb0[R + BPP], src_rgb1[R + BPP])); \ + dst_u[0] = RGBToUJ(ar, ag, ab); \ + dst_v[0] = RGBToVJ(ar, ag, ab); \ + src_rgb0 += BPP * 2; \ + src_rgb1 += BPP * 2; \ + dst_u += 1; \ + dst_v += 1; \ + } \ + if (width & 1) { \ + uint8_t ab = AVGB(src_rgb0[B], src_rgb1[B]); \ + uint8_t ag = AVGB(src_rgb0[G], src_rgb1[G]); \ + uint8_t ar = AVGB(src_rgb0[R], src_rgb1[R]); \ + dst_u[0] = RGBToUJ(ar, ag, ab); \ + dst_v[0] = RGBToVJ(ar, ag, ab); \ + } \ } MAKEROWYJ(ARGB, 2, 1, 0, 4) @@ -756,7 +758,9 @@ void ARGBColorMatrixRow_C(const uint8_t* src_argb, } // Apply color table to a row of image. -void ARGBColorTableRow_C(uint8_t* dst_argb, const uint8_t* table_argb, int width) { +void ARGBColorTableRow_C(uint8_t* dst_argb, + const uint8_t* table_argb, + int width) { int x; for (x = 0; x < width; ++x) { int b = dst_argb[0]; @@ -772,7 +776,9 @@ void ARGBColorTableRow_C(uint8_t* dst_argb, const uint8_t* table_argb, int width } // Apply color table to a row of image. 
-void RGBColorTableRow_C(uint8_t* dst_argb, const uint8_t* table_argb, int width) { +void RGBColorTableRow_C(uint8_t* dst_argb, + const uint8_t* table_argb, + int width) { int x; for (x = 0; x < width; ++x) { int b = dst_argb[0]; @@ -1535,10 +1541,7 @@ void I210ToARGBRow_C(const uint16_t* src_y, } } -static void StoreAR30(uint8_t* rgb_buf, - int b, - int g, - int r) { +static void StoreAR30(uint8_t* rgb_buf, int b, int g, int r) { uint32_t ar30; b = b >> 4; // convert 10.6 to 10 bit. g = g >> 4; @@ -1577,7 +1580,6 @@ void I210ToAR30Row_C(const uint16_t* src_y, } } - // 8 bit YUV to 10 bit AR30 // Uses same code as 10 bit YUV bit shifts the 8 bit values up to 10 bits. void I422ToAR30Row_C(const uint8_t* src_y, @@ -1680,7 +1682,7 @@ void I422ToARGB4444Row_C(const uint8_t* src_y, g1 = g1 >> 4; r1 = r1 >> 4; *(uint32_t*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) | (b1 << 16) | - (g1 << 20) | (r1 << 24) | 0xf000f000; + (g1 << 20) | (r1 << 24) | 0xf000f000; src_y += 2; src_u += 1; src_v += 1; @@ -1718,7 +1720,7 @@ void I422ToARGB1555Row_C(const uint8_t* src_y, g1 = g1 >> 3; r1 = r1 >> 3; *(uint32_t*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) | (b1 << 16) | - (g1 << 21) | (r1 << 26) | 0x80008000; + (g1 << 21) | (r1 << 26) | 0x80008000; src_y += 2; src_u += 1; src_v += 1; @@ -1954,7 +1956,10 @@ void MirrorRow_C(const uint8_t* src, uint8_t* dst, int width) { } } -void MirrorUVRow_C(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) { +void MirrorUVRow_C(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { int x; src_uv += (width - 1) << 1; for (x = 0; x < width - 1; x += 2) { @@ -1985,7 +1990,10 @@ void ARGBMirrorRow_C(const uint8_t* src, uint8_t* dst, int width) { } } -void SplitUVRow_C(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) { +void SplitUVRow_C(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { int x; for (x = 0; x < width - 1; x += 2) { dst_u[x] = src_uv[0]; @@ -2385,7 +2393,9 @@ const uint32_t fixed_invtbl8[256] = { T(0xfc), T(0xfd), T(0xfe), 0x01000100}; #undef T -void ARGBUnattenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width) { +void ARGBUnattenuateRow_C(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { int i; for (i = 0; i < width; ++i) { uint32_t b = src_argb[0]; @@ -2673,7 +2683,10 @@ void ARGBPolynomialRow_C(const uint8_t* src_argb, // simply extract the low bits of the exponent and the high // bits of the mantissa from our float and we're done. 
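The constant 1.9259299444e-34f in the C fallback below is 2^-112, i.e. 2^-(127 - 15): the difference between the IEEE single-precision exponent bias (127) and the half-float bias (15). Multiplying by it rebiases the exponent using ordinary float arithmetic, after which dropping the 13 low mantissa bits (23 - 10) yields the half-float encoding directly. A sketch of the extraction step, assuming a non-negative input that has already been scaled into half-float range:

  #include <stdint.h>
  #include <string.h>

  // value must already be multiplied by 2^-112 (times any user scale).
  static uint16_t FloatToHalfBits(float value) {
    uint32_t bits;
    memcpy(&bits, &value, sizeof(bits));  // well-defined type pun
    return (uint16_t)(bits >> 13);        // 10 mantissa bits + rebiased exponent
  }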
-void HalfFloatRow_C(const uint16_t* src, uint16_t* dst, float scale, int width) { +void HalfFloatRow_C(const uint16_t* src, + uint16_t* dst, + float scale, + int width) { int i; float mult = 1.9259299444e-34f * scale; for (i = 0; i < width; ++i) { diff --git a/source/row_gcc.cc b/source/row_gcc.cc index a17a3a300..656801ac7 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -153,386 +153,393 @@ static const lvec8 kShuffleNV21 = { #ifdef HAS_J400TOARGBROW_SSE2 void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "pslld $0x18,%%xmm5 \n" - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" - "lea 0x8(%0),%0 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm0,%%xmm0 \n" - "punpckhwd %%xmm1,%%xmm1 \n" - "por %%xmm5,%%xmm0 \n" - "por %%xmm5,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_y), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - :: "memory", "cc", "xmm0", "xmm1", "xmm5" - ); + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0x18,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" + "lea 0x8(%0),%0 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm0,%%xmm0 \n" + "punpckhwd %%xmm1,%%xmm1 \n" + "por %%xmm5,%%xmm0 \n" + "por %%xmm5,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm5"); } #endif // HAS_J400TOARGBROW_SSE2 #ifdef HAS_RGB24TOARGBROW_SSSE3 -void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000 - "pslld $0x18,%%xmm5 \n" - "movdqa %3,%%xmm4 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm3 \n" - "lea 0x30(%0),%0 \n" - "movdqa %%xmm3,%%xmm2 \n" - "palignr $0x8,%%xmm1,%%xmm2 \n" - "pshufb %%xmm4,%%xmm2 \n" - "por %%xmm5,%%xmm2 \n" - "palignr $0xc,%%xmm0,%%xmm1 \n" - "pshufb %%xmm4,%%xmm0 \n" - "movdqu %%xmm2,0x20(%1) \n" - "por %%xmm5,%%xmm0 \n" - "pshufb %%xmm4,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "por %%xmm5,%%xmm1 \n" - "palignr $0x4,%%xmm3,%%xmm3 \n" - "pshufb %%xmm4,%%xmm3 \n" - "movdqu %%xmm1,0x10(%1) \n" - "por %%xmm5,%%xmm3 \n" - "movdqu %%xmm3,0x30(%1) \n" - "lea 0x40(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_rgb24), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kShuffleMaskRGB24ToARGB) // %3 - : "memory", "cc" , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); +void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24, + uint8_t* dst_argb, + int width) { + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000 + "pslld $0x18,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm3 \n" + "lea 0x30(%0),%0 \n" + "movdqa %%xmm3,%%xmm2 \n" + "palignr $0x8,%%xmm1,%%xmm2 \n" + "pshufb %%xmm4,%%xmm2 \n" + "por %%xmm5,%%xmm2 \n" + "palignr $0xc,%%xmm0,%%xmm1 \n" + "pshufb %%xmm4,%%xmm0 \n" + "movdqu %%xmm2,0x20(%1) \n" + "por %%xmm5,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "por %%xmm5,%%xmm1 \n" + "palignr $0x4,%%xmm3,%%xmm3 \n" + "pshufb %%xmm4,%%xmm3 \n" + "movdqu %%xmm1,0x10(%1) \n" + "por %%xmm5,%%xmm3 \n" + "movdqu %%xmm3,0x30(%1) \n" + "lea 0x40(%1),%1 \n" + "sub 
$0x10,%2 \n" + "jg 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kShuffleMaskRGB24ToARGB) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000 - "pslld $0x18,%%xmm5 \n" - "movdqa %3,%%xmm4 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm3 \n" - "lea 0x30(%0),%0 \n" - "movdqa %%xmm3,%%xmm2 \n" - "palignr $0x8,%%xmm1,%%xmm2 \n" - "pshufb %%xmm4,%%xmm2 \n" - "por %%xmm5,%%xmm2 \n" - "palignr $0xc,%%xmm0,%%xmm1 \n" - "pshufb %%xmm4,%%xmm0 \n" - "movdqu %%xmm2,0x20(%1) \n" - "por %%xmm5,%%xmm0 \n" - "pshufb %%xmm4,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "por %%xmm5,%%xmm1 \n" - "palignr $0x4,%%xmm3,%%xmm3 \n" - "pshufb %%xmm4,%%xmm3 \n" - "movdqu %%xmm1,0x10(%1) \n" - "por %%xmm5,%%xmm3 \n" - "movdqu %%xmm3,0x30(%1) \n" - "lea 0x40(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kShuffleMaskRAWToARGB) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000 + "pslld $0x18,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm3 \n" + "lea 0x30(%0),%0 \n" + "movdqa %%xmm3,%%xmm2 \n" + "palignr $0x8,%%xmm1,%%xmm2 \n" + "pshufb %%xmm4,%%xmm2 \n" + "por %%xmm5,%%xmm2 \n" + "palignr $0xc,%%xmm0,%%xmm1 \n" + "pshufb %%xmm4,%%xmm0 \n" + "movdqu %%xmm2,0x20(%1) \n" + "por %%xmm5,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "por %%xmm5,%%xmm1 \n" + "palignr $0x4,%%xmm3,%%xmm3 \n" + "pshufb %%xmm4,%%xmm3 \n" + "movdqu %%xmm1,0x10(%1) \n" + "por %%xmm5,%%xmm3 \n" + "movdqu %%xmm3,0x30(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kShuffleMaskRAWToARGB) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } -void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { - asm volatile ( - "movdqa %3,%%xmm3 \n" - "movdqa %4,%%xmm4 \n" - "movdqa %5,%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x4(%0),%%xmm1 \n" - "movdqu 0x8(%0),%%xmm2 \n" - "lea 0x18(%0),%0 \n" - "pshufb %%xmm3,%%xmm0 \n" - "pshufb %%xmm4,%%xmm1 \n" - "pshufb %%xmm5,%%xmm2 \n" - "movq %%xmm0,(%1) \n" - "movq %%xmm1,0x8(%1) \n" - "movq %%xmm2,0x10(%1) \n" - "lea 0x18(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_rgb24), // %1 - "+r"(width) // %2 - : "m"(kShuffleMaskRAWToRGB24_0), // %3 - "m"(kShuffleMaskRAWToRGB24_1), // %4 - "m"(kShuffleMaskRAWToRGB24_2) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); +void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, + uint8_t* dst_rgb24, + int width) { + asm volatile( + "movdqa %3,%%xmm3 \n" + "movdqa %4,%%xmm4 \n" + "movdqa %5,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x4(%0),%%xmm1 \n" + "movdqu 0x8(%0),%%xmm2 \n" + "lea 0x18(%0),%0 \n" + "pshufb %%xmm3,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "pshufb %%xmm5,%%xmm2 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,0x8(%1) \n" + "movq %%xmm2,0x10(%1) \n" + "lea 0x18(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_rgb24), // %1 + "+r"(width) // %2 + : 
"m"(kShuffleMaskRAWToRGB24_0), // %3 + "m"(kShuffleMaskRAWToRGB24_1), // %4 + "m"(kShuffleMaskRAWToRGB24_2) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile ( - "mov $0x1080108,%%eax \n" - "movd %%eax,%%xmm5 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "mov $0x20802080,%%eax \n" - "movd %%eax,%%xmm6 \n" - "pshufd $0x0,%%xmm6,%%xmm6 \n" - "pcmpeqb %%xmm3,%%xmm3 \n" - "psllw $0xb,%%xmm3 \n" - "pcmpeqb %%xmm4,%%xmm4 \n" - "psllw $0xa,%%xmm4 \n" - "psrlw $0x5,%%xmm4 \n" - "pcmpeqb %%xmm7,%%xmm7 \n" - "psllw $0x8,%%xmm7 \n" - "sub %0,%1 \n" - "sub %0,%1 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "pand %%xmm3,%%xmm1 \n" - "psllw $0xb,%%xmm2 \n" - "pmulhuw %%xmm5,%%xmm1 \n" - "pmulhuw %%xmm5,%%xmm2 \n" - "psllw $0x8,%%xmm1 \n" - "por %%xmm2,%%xmm1 \n" - "pand %%xmm4,%%xmm0 \n" - "pmulhuw %%xmm6,%%xmm0 \n" - "por %%xmm7,%%xmm0 \n" - "movdqa %%xmm1,%%xmm2 \n" - "punpcklbw %%xmm0,%%xmm1 \n" - "punpckhbw %%xmm0,%%xmm2 \n" - "movdqu %%xmm1,0x00(%1,%0,2) \n" - "movdqu %%xmm2,0x10(%1,%0,2) \n" - "lea 0x10(%0),%0 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "eax", - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); + asm volatile( + "mov $0x1080108,%%eax \n" + "movd %%eax,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "mov $0x20802080,%%eax \n" + "movd %%eax,%%xmm6 \n" + "pshufd $0x0,%%xmm6,%%xmm6 \n" + "pcmpeqb %%xmm3,%%xmm3 \n" + "psllw $0xb,%%xmm3 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psllw $0xa,%%xmm4 \n" + "psrlw $0x5,%%xmm4 \n" + "pcmpeqb %%xmm7,%%xmm7 \n" + "psllw $0x8,%%xmm7 \n" + "sub %0,%1 \n" + "sub %0,%1 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "pand %%xmm3,%%xmm1 \n" + "psllw $0xb,%%xmm2 \n" + "pmulhuw %%xmm5,%%xmm1 \n" + "pmulhuw %%xmm5,%%xmm2 \n" + "psllw $0x8,%%xmm1 \n" + "por %%xmm2,%%xmm1 \n" + "pand %%xmm4,%%xmm0 \n" + "pmulhuw %%xmm6,%%xmm0 \n" + "por %%xmm7,%%xmm0 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm0,%%xmm1 \n" + "punpckhbw %%xmm0,%%xmm2 \n" + "movdqu %%xmm1,0x00(%1,%0,2) \n" + "movdqu %%xmm2,0x10(%1,%0,2) \n" + "lea 0x10(%0),%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", + "xmm6", "xmm7"); } void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile ( - "mov $0x1080108,%%eax \n" - "movd %%eax,%%xmm5 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "mov $0x42004200,%%eax \n" - "movd %%eax,%%xmm6 \n" - "pshufd $0x0,%%xmm6,%%xmm6 \n" - "pcmpeqb %%xmm3,%%xmm3 \n" - "psllw $0xb,%%xmm3 \n" - "movdqa %%xmm3,%%xmm4 \n" - "psrlw $0x6,%%xmm4 \n" - "pcmpeqb %%xmm7,%%xmm7 \n" - "psllw $0x8,%%xmm7 \n" - "sub %0,%1 \n" - "sub %0,%1 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "psllw $0x1,%%xmm1 \n" - "psllw $0xb,%%xmm2 \n" - "pand %%xmm3,%%xmm1 \n" - "pmulhuw %%xmm5,%%xmm2 \n" - "pmulhuw %%xmm5,%%xmm1 \n" - "psllw $0x8,%%xmm1 \n" - "por %%xmm2,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "pand %%xmm4,%%xmm0 \n" - "psraw $0x8,%%xmm2 \n" - "pmulhuw %%xmm6,%%xmm0 \n" - "pand %%xmm7,%%xmm2 \n" - "por %%xmm2,%%xmm0 \n" - "movdqa %%xmm1,%%xmm2 \n" - "punpcklbw %%xmm0,%%xmm1 \n" - "punpckhbw %%xmm0,%%xmm2 \n" - "movdqu %%xmm1,0x00(%1,%0,2) \n" - 
"movdqu %%xmm2,0x10(%1,%0,2) \n" - "lea 0x10(%0),%0 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "eax", - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); + asm volatile( + "mov $0x1080108,%%eax \n" + "movd %%eax,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "mov $0x42004200,%%eax \n" + "movd %%eax,%%xmm6 \n" + "pshufd $0x0,%%xmm6,%%xmm6 \n" + "pcmpeqb %%xmm3,%%xmm3 \n" + "psllw $0xb,%%xmm3 \n" + "movdqa %%xmm3,%%xmm4 \n" + "psrlw $0x6,%%xmm4 \n" + "pcmpeqb %%xmm7,%%xmm7 \n" + "psllw $0x8,%%xmm7 \n" + "sub %0,%1 \n" + "sub %0,%1 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psllw $0x1,%%xmm1 \n" + "psllw $0xb,%%xmm2 \n" + "pand %%xmm3,%%xmm1 \n" + "pmulhuw %%xmm5,%%xmm2 \n" + "pmulhuw %%xmm5,%%xmm1 \n" + "psllw $0x8,%%xmm1 \n" + "por %%xmm2,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "pand %%xmm4,%%xmm0 \n" + "psraw $0x8,%%xmm2 \n" + "pmulhuw %%xmm6,%%xmm0 \n" + "pand %%xmm7,%%xmm2 \n" + "por %%xmm2,%%xmm0 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm0,%%xmm1 \n" + "punpckhbw %%xmm0,%%xmm2 \n" + "movdqu %%xmm1,0x00(%1,%0,2) \n" + "movdqu %%xmm2,0x10(%1,%0,2) \n" + "lea 0x10(%0),%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", + "xmm6", "xmm7"); } void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile ( - "mov $0xf0f0f0f,%%eax \n" - "movd %%eax,%%xmm4 \n" - "pshufd $0x0,%%xmm4,%%xmm4 \n" - "movdqa %%xmm4,%%xmm5 \n" - "pslld $0x4,%%xmm5 \n" - "sub %0,%1 \n" - "sub %0,%1 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "pand %%xmm4,%%xmm0 \n" - "pand %%xmm5,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm3 \n" - "psllw $0x4,%%xmm1 \n" - "psrlw $0x4,%%xmm3 \n" - "por %%xmm1,%%xmm0 \n" - "por %%xmm3,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm2,%%xmm0 \n" - "punpckhbw %%xmm2,%%xmm1 \n" - "movdqu %%xmm0,0x00(%1,%0,2) \n" - "movdqu %%xmm1,0x10(%1,%0,2) \n" - "lea 0x10(%0),%0 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "eax", - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); + asm volatile( + "mov $0xf0f0f0f,%%eax \n" + "movd %%eax,%%xmm4 \n" + "pshufd $0x0,%%xmm4,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "pslld $0x4,%%xmm5 \n" + "sub %0,%1 \n" + "sub %0,%1 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "pand %%xmm4,%%xmm0 \n" + "pand %%xmm5,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm3 \n" + "psllw $0x4,%%xmm1 \n" + "psrlw $0x4,%%xmm3 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm3,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm0 \n" + "punpckhbw %%xmm2,%%xmm1 \n" + "movdqu %%xmm0,0x00(%1,%0,2) \n" + "movdqu %%xmm1,0x10(%1,%0,2) \n" + "lea 0x10(%0),%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { - asm volatile ( - "movdqa %3,%%xmm6 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm3 \n" - "lea 0x40(%0),%0 \n" - "pshufb %%xmm6,%%xmm0 \n" - "pshufb %%xmm6,%%xmm1 \n" - "pshufb 
%%xmm6,%%xmm2 \n" - "pshufb %%xmm6,%%xmm3 \n" - "movdqa %%xmm1,%%xmm4 \n" - "psrldq $0x4,%%xmm1 \n" - "pslldq $0xc,%%xmm4 \n" - "movdqa %%xmm2,%%xmm5 \n" - "por %%xmm4,%%xmm0 \n" - "pslldq $0x8,%%xmm5 \n" - "movdqu %%xmm0,(%1) \n" - "por %%xmm5,%%xmm1 \n" - "psrldq $0x8,%%xmm2 \n" - "pslldq $0x4,%%xmm3 \n" - "por %%xmm3,%%xmm2 \n" - "movdqu %%xmm1,0x10(%1) \n" - "movdqu %%xmm2,0x20(%1) \n" - "lea 0x30(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "m"(kShuffleMaskARGBToRGB24) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" - ); + asm volatile( + + "movdqa %3,%%xmm6 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "pshufb %%xmm6,%%xmm0 \n" + "pshufb %%xmm6,%%xmm1 \n" + "pshufb %%xmm6,%%xmm2 \n" + "pshufb %%xmm6,%%xmm3 \n" + "movdqa %%xmm1,%%xmm4 \n" + "psrldq $0x4,%%xmm1 \n" + "pslldq $0xc,%%xmm4 \n" + "movdqa %%xmm2,%%xmm5 \n" + "por %%xmm4,%%xmm0 \n" + "pslldq $0x8,%%xmm5 \n" + "movdqu %%xmm0,(%1) \n" + "por %%xmm5,%%xmm1 \n" + "psrldq $0x8,%%xmm2 \n" + "pslldq $0x4,%%xmm3 \n" + "por %%xmm3,%%xmm2 \n" + "movdqu %%xmm1,0x10(%1) \n" + "movdqu %%xmm2,0x20(%1) \n" + "lea 0x30(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(kShuffleMaskARGBToRGB24) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) { - asm volatile ( - "movdqa %3,%%xmm6 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm3 \n" - "lea 0x40(%0),%0 \n" - "pshufb %%xmm6,%%xmm0 \n" - "pshufb %%xmm6,%%xmm1 \n" - "pshufb %%xmm6,%%xmm2 \n" - "pshufb %%xmm6,%%xmm3 \n" - "movdqa %%xmm1,%%xmm4 \n" - "psrldq $0x4,%%xmm1 \n" - "pslldq $0xc,%%xmm4 \n" - "movdqa %%xmm2,%%xmm5 \n" - "por %%xmm4,%%xmm0 \n" - "pslldq $0x8,%%xmm5 \n" - "movdqu %%xmm0,(%1) \n" - "por %%xmm5,%%xmm1 \n" - "psrldq $0x8,%%xmm2 \n" - "pslldq $0x4,%%xmm3 \n" - "por %%xmm3,%%xmm2 \n" - "movdqu %%xmm1,0x10(%1) \n" - "movdqu %%xmm2,0x20(%1) \n" - "lea 0x30(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "m"(kShuffleMaskARGBToRAW) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" - ); + asm volatile( + + "movdqa %3,%%xmm6 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "pshufb %%xmm6,%%xmm0 \n" + "pshufb %%xmm6,%%xmm1 \n" + "pshufb %%xmm6,%%xmm2 \n" + "pshufb %%xmm6,%%xmm3 \n" + "movdqa %%xmm1,%%xmm4 \n" + "psrldq $0x4,%%xmm1 \n" + "pslldq $0xc,%%xmm4 \n" + "movdqa %%xmm2,%%xmm5 \n" + "por %%xmm4,%%xmm0 \n" + "pslldq $0x8,%%xmm5 \n" + "movdqu %%xmm0,(%1) \n" + "por %%xmm5,%%xmm1 \n" + "psrldq $0x8,%%xmm2 \n" + "pslldq $0x4,%%xmm3 \n" + "por %%xmm3,%%xmm2 \n" + "movdqu %%xmm1,0x10(%1) \n" + "movdqu %%xmm2,0x20(%1) \n" + "lea 0x30(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(kShuffleMaskARGBToRAW) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile ( - "pcmpeqb %%xmm3,%%xmm3 \n" - "psrld $0x1b,%%xmm3 \n" - "pcmpeqb 
%%xmm4,%%xmm4 \n" - "psrld $0x1a,%%xmm4 \n" - "pslld $0x5,%%xmm4 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pslld $0xb,%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "pslld $0x8,%%xmm0 \n" - "psrld $0x3,%%xmm1 \n" - "psrld $0x5,%%xmm2 \n" - "psrad $0x10,%%xmm0 \n" - "pand %%xmm3,%%xmm1 \n" - "pand %%xmm4,%%xmm2 \n" - "pand %%xmm5,%%xmm0 \n" - "por %%xmm2,%%xmm1 \n" - "por %%xmm1,%%xmm0 \n" - "packssdw %%xmm0,%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); + asm volatile( + "pcmpeqb %%xmm3,%%xmm3 \n" + "psrld $0x1b,%%xmm3 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrld $0x1a,%%xmm4 \n" + "pslld $0x5,%%xmm4 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0xb,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "pslld $0x8,%%xmm0 \n" + "psrld $0x3,%%xmm1 \n" + "psrld $0x5,%%xmm2 \n" + "psrad $0x10,%%xmm0 \n" + "pand %%xmm3,%%xmm1 \n" + "pand %%xmm4,%%xmm2 \n" + "pand %%xmm5,%%xmm0 \n" + "por %%xmm2,%%xmm1 \n" + "por %%xmm1,%%xmm0 \n" + "packssdw %%xmm0,%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } void ARGBToRGB565DitherRow_SSE2(const uint8_t* src, @@ -629,74 +636,73 @@ void ARGBToRGB565DitherRow_AVX2(const uint8_t* src, #endif // HAS_ARGBTORGB565DITHERROW_AVX2 void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile ( - "pcmpeqb %%xmm4,%%xmm4 \n" - "psrld $0x1b,%%xmm4 \n" - "movdqa %%xmm4,%%xmm5 \n" - "pslld $0x5,%%xmm5 \n" - "movdqa %%xmm4,%%xmm6 \n" - "pslld $0xa,%%xmm6 \n" - "pcmpeqb %%xmm7,%%xmm7 \n" - "pslld $0xf,%%xmm7 \n" + asm volatile( + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrld $0x1b,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "pslld $0x5,%%xmm5 \n" + "movdqa %%xmm4,%%xmm6 \n" + "pslld $0xa,%%xmm6 \n" + "pcmpeqb %%xmm7,%%xmm7 \n" + "pslld $0xf,%%xmm7 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm0,%%xmm3 \n" - "psrad $0x10,%%xmm0 \n" - "psrld $0x3,%%xmm1 \n" - "psrld $0x6,%%xmm2 \n" - "psrld $0x9,%%xmm3 \n" - "pand %%xmm7,%%xmm0 \n" - "pand %%xmm4,%%xmm1 \n" - "pand %%xmm5,%%xmm2 \n" - "pand %%xmm6,%%xmm3 \n" - "por %%xmm1,%%xmm0 \n" - "por %%xmm3,%%xmm2 \n" - "por %%xmm2,%%xmm0 \n" - "packssdw %%xmm0,%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - :: "memory", "cc", - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm0,%%xmm3 \n" + "psrad $0x10,%%xmm0 \n" + "psrld $0x3,%%xmm1 \n" + "psrld $0x6,%%xmm2 \n" + "psrld $0x9,%%xmm3 \n" + "pand %%xmm7,%%xmm0 \n" + "pand %%xmm4,%%xmm1 \n" + "pand %%xmm5,%%xmm2 \n" + "pand %%xmm6,%%xmm3 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm3,%%xmm2 \n" + "por %%xmm2,%%xmm0 \n" + "packssdw %%xmm0,%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + ::"memory", + "cc", 
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } void ARGBToARGB4444Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile ( - "pcmpeqb %%xmm4,%%xmm4 \n" - "psllw $0xc,%%xmm4 \n" - "movdqa %%xmm4,%%xmm3 \n" - "psrlw $0x8,%%xmm3 \n" + asm volatile( + "pcmpeqb %%xmm4,%%xmm4 \n" + "psllw $0xc,%%xmm4 \n" + "movdqa %%xmm4,%%xmm3 \n" + "psrlw $0x8,%%xmm3 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pand %%xmm3,%%xmm0 \n" - "pand %%xmm4,%%xmm1 \n" - "psrlq $0x4,%%xmm0 \n" - "psrlq $0x8,%%xmm1 \n" - "por %%xmm1,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" - ); + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm3,%%xmm0 \n" + "pand %%xmm4,%%xmm1 \n" + "psrlq $0x4,%%xmm0 \n" + "psrlq $0x8,%%xmm1 \n" + "por %%xmm1,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); } #endif // HAS_RGB24TOARGBROW_SSSE3 @@ -809,38 +815,37 @@ void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { #ifdef HAS_ARGBTOYROW_SSSE3 // Convert 16 ARGB pixels (64 bytes) to 16 Y values. void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { - asm volatile ( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" + asm volatile( + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm3 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "lea 0x40(%0),%0 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm3,%%xmm2 \n" - "psrlw $0x7,%%xmm0 \n" - "psrlw $0x7,%%xmm2 \n" - "packuswb %%xmm2,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kARGBToY), // %3 - "m"(kAddY16) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kARGBToY), // %3 + "m"(kAddY16) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif // HAS_ARGBTOYROW_SSSE3 @@ -848,39 +853,38 @@ void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { // Convert 16 ARGB pixels (64 bytes) to 16 YJ values. // Same as ARGBToYRow but different coefficients, no add 16, but do rounding. 
void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { - asm volatile ( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" + asm volatile( + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm3 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "lea 0x40(%0),%0 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm3,%%xmm2 \n" - "paddw %%xmm5,%%xmm0 \n" - "paddw %%xmm5,%%xmm2 \n" - "psrlw $0x7,%%xmm0 \n" - "psrlw $0x7,%%xmm2 \n" - "packuswb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kARGBToYJ), // %3 - "m"(kAddYJ64) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "paddw %%xmm5,%%xmm0 \n" + "paddw %%xmm5,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kARGBToYJ), // %3 + "m"(kAddYJ64) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif // HAS_ARGBTOYJROW_SSSE3 @@ -890,85 +894,83 @@ static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7}; // Convert 32 ARGB pixels (128 bytes) to 32 Y values. void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { - asm volatile ( - "vbroadcastf128 %3,%%ymm4 \n" - "vbroadcastf128 %4,%%ymm5 \n" - "vmovdqu %5,%%ymm6 \n" + asm volatile( + "vbroadcastf128 %3,%%ymm4 \n" + "vbroadcastf128 %4,%%ymm5 \n" + "vmovdqu %5,%%ymm6 \n" - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vmovdqu 0x40(%0),%%ymm2 \n" - "vmovdqu 0x60(%0),%%ymm3 \n" - "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" - "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" - "lea 0x80(%0),%0 \n" - "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates. - "vphaddw %%ymm3,%%ymm2,%%ymm2 \n" - "vpsrlw $0x7,%%ymm0,%%ymm0 \n" - "vpsrlw $0x7,%%ymm2,%%ymm2 \n" - "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates. - "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate. - "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" // add 16 for Y - "vmovdqu %%ymm0,(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kARGBToY), // %3 - "m"(kAddY16), // %4 - "m"(kPermdARGBToY_AVX) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" - ); + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates. 
+ "vphaddw %%ymm3,%%ymm2,%%ymm2 \n" + "vpsrlw $0x7,%%ymm0,%%ymm0 \n" + "vpsrlw $0x7,%%ymm2,%%ymm2 \n" + "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates. + "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate. + "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" // add 16 for Y + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kARGBToY), // %3 + "m"(kAddY16), // %4 + "m"(kPermdARGBToY_AVX) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } #endif // HAS_ARGBTOYROW_AVX2 #ifdef HAS_ARGBTOYJROW_AVX2 // Convert 32 ARGB pixels (128 bytes) to 32 Y values. void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { - asm volatile ( - "vbroadcastf128 %3,%%ymm4 \n" - "vbroadcastf128 %4,%%ymm5 \n" - "vmovdqu %5,%%ymm6 \n" + asm volatile( + "vbroadcastf128 %3,%%ymm4 \n" + "vbroadcastf128 %4,%%ymm5 \n" + "vmovdqu %5,%%ymm6 \n" - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vmovdqu 0x40(%0),%%ymm2 \n" - "vmovdqu 0x60(%0),%%ymm3 \n" - "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" - "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" - "lea 0x80(%0),%0 \n" - "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates. - "vphaddw %%ymm3,%%ymm2,%%ymm2 \n" - "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" // Add .5 for rounding. - "vpaddw %%ymm5,%%ymm2,%%ymm2 \n" - "vpsrlw $0x7,%%ymm0,%%ymm0 \n" - "vpsrlw $0x7,%%ymm2,%%ymm2 \n" - "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates. - "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate. - "vmovdqu %%ymm0,(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kARGBToYJ), // %3 - "m"(kAddYJ64), // %4 - "m"(kPermdARGBToY_AVX) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" - ); + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates. + "vphaddw %%ymm3,%%ymm2,%%ymm2 \n" + "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" // Add .5 for rounding. + "vpaddw %%ymm5,%%ymm2,%%ymm2 \n" + "vpsrlw $0x7,%%ymm0,%%ymm0 \n" + "vpsrlw $0x7,%%ymm2,%%ymm2 \n" + "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates. + "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate. 
+ "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kARGBToYJ), // %3 + "m"(kAddYJ64), // %4 + "m"(kPermdARGBToY_AVX) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } #endif // HAS_ARGBTOYJROW_AVX2 @@ -978,64 +980,62 @@ void ARGBToUVRow_SSSE3(const uint8_t* src_argb0, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( - "movdqa %5,%%xmm3 \n" - "movdqa %6,%%xmm4 \n" - "movdqa %7,%%xmm5 \n" - "sub %1,%2 \n" + asm volatile( + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x10(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x20(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" - "movdqu 0x30(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x10(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x20(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "movdqu 0x30(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" - "lea 0x40(%0),%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movlps %%xmm0,(%1) \n" - "movhps %%xmm0,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_argb0), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_argb)), // %4 - "m"(kARGBToV), // %5 - "m"(kARGBToU), // %6 - "m"(kAddUV128) // %7 - : "memory", "cc", - "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" - ); + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_argb0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_argb)), // %4 + "m"(kARGBToV), // %5 + "m"(kARGBToU), // %6 + "m"(kAddUV128) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); } #endif // HAS_ARGBTOUVROW_SSSE3 @@ -1049,61 +1049,60 @@ void ARGBToUVRow_AVX2(const uint8_t* src_argb0, 
uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( - "vbroadcastf128 %5,%%ymm5 \n" - "vbroadcastf128 %6,%%ymm6 \n" - "vbroadcastf128 %7,%%ymm7 \n" - "sub %1,%2 \n" + asm volatile( + "vbroadcastf128 %5,%%ymm5 \n" + "vbroadcastf128 %6,%%ymm6 \n" + "vbroadcastf128 %7,%%ymm7 \n" + "sub %1,%2 \n" - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vmovdqu 0x40(%0),%%ymm2 \n" - "vmovdqu 0x60(%0),%%ymm3 \n" - "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" - "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" - "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n" - "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n" - "lea 0x80(%0),%0 \n" - "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" - "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" - "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" - "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" - "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" - "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" + "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" + "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n" + "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" + "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" + "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" + "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" + "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" + "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" - "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" - "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" - "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" - "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpsraw $0x8,%%ymm1,%%ymm1 \n" - "vpsraw $0x8,%%ymm0,%%ymm0 \n" - "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpshufb %8,%%ymm0,%%ymm0 \n" - "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" + "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" + "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" + "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpsraw $0x8,%%ymm1,%%ymm1 \n" + "vpsraw $0x8,%%ymm0,%%ymm0 \n" + "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpshufb %8,%%ymm0,%%ymm0 \n" + "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" - "vextractf128 $0x0,%%ymm0,(%1) \n" - "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb0), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_argb)), // %4 - "m"(kAddUV128), // %5 - "m"(kARGBToV), // %6 - "m"(kARGBToU), // %7 - "m"(kShufARGBToUV_AVX) // %8 - : "memory", "cc", - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); + "vextractf128 $0x0,%%ymm0,(%1) \n" + "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_argb)), // %4 + "m"(kAddUV128), // %5 + "m"(kARGBToV), // %6 + "m"(kARGBToU), // %7 + "m"(kShufARGBToUV_AVX) // %8 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #endif // HAS_ARGBTOUVROW_AVX2 @@ -1113,62 +1112,61 @@ void ARGBToUVJRow_AVX2(const uint8_t* src_argb0, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( - "vbroadcastf128 %5,%%ymm5 \n" - "vbroadcastf128 %6,%%ymm6 \n" - "vbroadcastf128 %7,%%ymm7 \n" - "sub %1,%2 \n" 
+ asm volatile( + "vbroadcastf128 %5,%%ymm5 \n" + "vbroadcastf128 %6,%%ymm6 \n" + "vbroadcastf128 %7,%%ymm7 \n" + "sub %1,%2 \n" - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vmovdqu 0x40(%0),%%ymm2 \n" - "vmovdqu 0x60(%0),%%ymm3 \n" - "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" - "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" - "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n" - "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n" - "lea 0x80(%0),%0 \n" - "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" - "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" - "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" - "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" - "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" - "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" + "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" + "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n" + "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" + "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" + "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" + "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" + "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" + "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" - "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" - "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" - "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" - "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm5,%%ymm1,%%ymm1 \n" - "vpsraw $0x8,%%ymm1,%%ymm1 \n" - "vpsraw $0x8,%%ymm0,%%ymm0 \n" - "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpshufb %8,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" + "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" + "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" + "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm5,%%ymm1,%%ymm1 \n" + "vpsraw $0x8,%%ymm1,%%ymm1 \n" + "vpsraw $0x8,%%ymm0,%%ymm0 \n" + "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpshufb %8,%%ymm0,%%ymm0 \n" - "vextractf128 $0x0,%%ymm0,(%1) \n" - "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb0), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_argb)), // %4 - "m"(kAddUVJ128), // %5 - "m"(kARGBToVJ), // %6 - "m"(kARGBToUJ), // %7 - "m"(kShufARGBToUV_AVX) // %8 - : "memory", "cc", - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); + "vextractf128 $0x0,%%ymm0,(%1) \n" + "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_argb)), // %4 + "m"(kAddUVJ128), // %5 + "m"(kARGBToVJ), // %6 + "m"(kARGBToUJ), // %7 + "m"(kShufARGBToUV_AVX) // %8 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #endif // HAS_ARGBTOUVJROW_AVX2 @@ -1178,65 +1176,63 @@ void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( - "movdqa %5,%%xmm3 \n" - "movdqa %6,%%xmm4 \n" - "movdqa %7,%%xmm5 \n" - "sub %1,%2 \n" + asm volatile( + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "sub %1,%2 \n" - LABELALIGN - 
"1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x10(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x20(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" - "movdqu 0x30(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x10(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x20(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "movdqu 0x30(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" - "lea 0x40(%0),%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "paddw %%xmm5,%%xmm0 \n" - "paddw %%xmm5,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "movlps %%xmm0,(%1) \n" - "movhps %%xmm0,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_argb0), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_argb)), // %4 - "m"(kARGBToVJ), // %5 - "m"(kARGBToUJ), // %6 - "m"(kAddUVJ128) // %7 - : "memory", "cc", - "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" - ); + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "paddw %%xmm5,%%xmm0 \n" + "paddw %%xmm5,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_argb0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_argb)), // %4 + "m"(kARGBToVJ), // %5 + "m"(kARGBToUJ), // %6 + "m"(kAddUVJ128) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); } #endif // HAS_ARGBTOUVJROW_SSSE3 @@ -1245,94 +1241,91 @@ void ARGBToUV444Row_SSSE3(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( - "movdqa %4,%%xmm3 \n" - "movdqa %5,%%xmm4 \n" - "movdqa %6,%%xmm5 \n" - "sub %1,%2 \n" + asm volatile( + "movdqa %4,%%xmm3 \n" + "movdqa %5,%%xmm4 \n" + "movdqa %6,%%xmm5 \n" + "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm6 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm6,%%xmm2 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm2 \n" - "packsswb 
%%xmm2,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" - "pmaddubsw %%xmm3,%%xmm0 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm6,%%xmm2 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm2 \n" - "packsswb %%xmm2,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "lea 0x40(%0),%0 \n" - "movdqu %%xmm0,0x00(%1,%2,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "m"(kARGBToV), // %4 - "m"(kARGBToU), // %5 - "m"(kAddUV128) // %6 - : "memory", "cc", - "xmm0", "xmm1", "xmm2", "xmm6" - ); + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm6 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm6,%%xmm2 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm2 \n" + "packsswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "pmaddubsw %%xmm3,%%xmm0 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm6,%%xmm2 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm2 \n" + "packsswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "lea 0x40(%0),%0 \n" + "movdqu %%xmm0,0x00(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "m"(kARGBToV), // %4 + "m"(kARGBToU), // %5 + "m"(kAddUV128) // %6 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6"); } #endif // HAS_ARGBTOUV444ROW_SSSE3 void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width) { - asm volatile ( - "movdqa %4,%%xmm5 \n" - "movdqa %3,%%xmm4 \n" + asm volatile( + "movdqa %4,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm3 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "lea 0x40(%0),%0 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm3,%%xmm2 \n" - "psrlw $0x7,%%xmm0 \n" - "psrlw $0x7,%%xmm2 \n" - "packuswb %%xmm2,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_bgra), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kBGRAToY), // %3 - "m"(kAddY16) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_bgra), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : 
"m"(kBGRAToY), // %3 + "m"(kAddY16) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } void BGRAToUVRow_SSSE3(const uint8_t* src_bgra0, @@ -1340,134 +1333,130 @@ void BGRAToUVRow_SSSE3(const uint8_t* src_bgra0, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( - "movdqa %5,%%xmm3 \n" - "movdqa %6,%%xmm4 \n" - "movdqa %7,%%xmm5 \n" - "sub %1,%2 \n" + asm volatile( + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x10(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x20(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" - "movdqu 0x30(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x10(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x20(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "movdqu 0x30(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" - "lea 0x40(%0),%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movlps %%xmm0,(%1) \n" - "movhps %%xmm0,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_bgra0), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_bgra)), // %4 - "m"(kBGRAToV), // %5 - "m"(kBGRAToU), // %6 - "m"(kAddUV128) // %7 - : "memory", "cc", - "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" - ); + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_bgra0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_bgra)), // %4 + "m"(kBGRAToV), // %5 + "m"(kBGRAToU), // %6 + "m"(kAddUV128) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); } void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) { - asm volatile ( - "movdqa %4,%%xmm5 \n" - "movdqa %3,%%xmm4 \n" + asm volatile( + "movdqa %4,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 
0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm3 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "lea 0x40(%0),%0 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm3,%%xmm2 \n" - "psrlw $0x7,%%xmm0 \n" - "psrlw $0x7,%%xmm2 \n" - "packuswb %%xmm2,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_abgr), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kABGRToY), // %3 - "m"(kAddY16) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_abgr), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kABGRToY), // %3 + "m"(kAddY16) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) { - asm volatile ( - "movdqa %4,%%xmm5 \n" - "movdqa %3,%%xmm4 \n" + asm volatile( + "movdqa %4,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm3 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "lea 0x40(%0),%0 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm3,%%xmm2 \n" - "psrlw $0x7,%%xmm0 \n" - "psrlw $0x7,%%xmm2 \n" - "packuswb %%xmm2,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_rgba), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kRGBAToY), // %3 - "m"(kAddY16) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_rgba), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kRGBAToY), // %3 + "m"(kAddY16) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } void ABGRToUVRow_SSSE3(const uint8_t* src_abgr0, @@ -1475,64 +1464,62 @@ void ABGRToUVRow_SSSE3(const uint8_t* src_abgr0, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( - "movdqa %5,%%xmm3 \n" - "movdqa %6,%%xmm4 \n" - "movdqa %7,%%xmm5 \n" - "sub %1,%2 \n" + asm volatile( + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x10(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm1 
\n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x20(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" - "movdqu 0x30(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x10(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x20(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "movdqu 0x30(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" - "lea 0x40(%0),%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movlps %%xmm0,(%1) \n" - "movhps %%xmm0,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_abgr0), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_abgr)), // %4 - "m"(kABGRToV), // %5 - "m"(kABGRToU), // %6 - "m"(kAddUV128) // %7 - : "memory", "cc", - "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" - ); + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_abgr0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_abgr)), // %4 + "m"(kABGRToV), // %5 + "m"(kABGRToU), // %6 + "m"(kAddUV128) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); } void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0, @@ -1540,161 +1527,159 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( - "movdqa %5,%%xmm3 \n" - "movdqa %6,%%xmm4 \n" - "movdqa %7,%%xmm5 \n" - "sub %1,%2 \n" + asm volatile( + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x10(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x20(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" - "movdqu 0x30(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x10(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm1 \n" + "movdqu 
0x20(%0),%%xmm2 \n" + "movdqu 0x20(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "movdqu 0x30(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" - "lea 0x40(%0),%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movlps %%xmm0,(%1) \n" - "movhps %%xmm0,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_rgba0), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_rgba)), // %4 - "m"(kRGBAToV), // %5 - "m"(kRGBAToU), // %6 - "m"(kAddUV128) // %7 - : "memory", "cc", - "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" - ); + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_rgba0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_rgba)), // %4 + "m"(kRGBAToV), // %5 + "m"(kRGBAToU), // %6 + "m"(kAddUV128) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); } #if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2) // Read 8 UV from 444 -#define READYUV444 \ - "movq (%[u_buf]),%%xmm0 \n" \ - "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ - "lea 0x8(%[u_buf]),%[u_buf] \n" \ - "punpcklbw %%xmm1,%%xmm0 \n" \ - "movq (%[y_buf]),%%xmm4 \n" \ - "punpcklbw %%xmm4,%%xmm4 \n" \ - "lea 0x8(%[y_buf]),%[y_buf] \n" +#define READYUV444 \ + "movq (%[u_buf]),%%xmm0 \n" \ + "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x8(%[u_buf]),%[u_buf] \n" \ + "punpcklbw %%xmm1,%%xmm0 \n" \ + "movq (%[y_buf]),%%xmm4 \n" \ + "punpcklbw %%xmm4,%%xmm4 \n" \ + "lea 0x8(%[y_buf]),%[y_buf] \n" // Read 4 UV from 422, upsample to 8 UV -#define READYUV422 \ - "movd (%[u_buf]),%%xmm0 \n" \ - "movd 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ - "lea 0x4(%[u_buf]),%[u_buf] \n" \ - "punpcklbw %%xmm1,%%xmm0 \n" \ - "punpcklwd %%xmm0,%%xmm0 \n" \ - "movq (%[y_buf]),%%xmm4 \n" \ - "punpcklbw %%xmm4,%%xmm4 \n" \ - "lea 0x8(%[y_buf]),%[y_buf] \n" +#define READYUV422 \ + "movd (%[u_buf]),%%xmm0 \n" \ + "movd 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x4(%[u_buf]),%[u_buf] \n" \ + "punpcklbw %%xmm1,%%xmm0 \n" \ + "punpcklwd %%xmm0,%%xmm0 \n" \ + "movq (%[y_buf]),%%xmm4 \n" \ + "punpcklbw %%xmm4,%%xmm4 \n" \ + "lea 0x8(%[y_buf]),%[y_buf] \n" // Read 4 UV from 422 10 bit, upsample to 8 UV // TODO(fbarchard): Consider shufb to replace pack/unpack // 
TODO(fbarchard): Consider pmulhuw to replace psraw // TODO(fbarchard): Consider pmullw to replace psllw and allow different bits. -#define READYUV210 \ - "movq (%[u_buf]),%%xmm0 \n" \ - "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ - "lea 0x8(%[u_buf]),%[u_buf] \n" \ - "punpcklwd %%xmm1,%%xmm0 \n" \ - "psraw $0x2,%%xmm0 \n" \ - "packuswb %%xmm0,%%xmm0 \n" \ - "punpcklwd %%xmm0,%%xmm0 \n" \ - "movdqu (%[y_buf]),%%xmm4 \n" \ - "psllw $0x6,%%xmm4 \n" \ - "lea 0x10(%[y_buf]),%[y_buf] \n" +#define READYUV210 \ + "movq (%[u_buf]),%%xmm0 \n" \ + "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x8(%[u_buf]),%[u_buf] \n" \ + "punpcklwd %%xmm1,%%xmm0 \n" \ + "psraw $0x2,%%xmm0 \n" \ + "packuswb %%xmm0,%%xmm0 \n" \ + "punpcklwd %%xmm0,%%xmm0 \n" \ + "movdqu (%[y_buf]),%%xmm4 \n" \ + "psllw $0x6,%%xmm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" // Read 4 UV from 422, upsample to 8 UV. With 8 Alpha. -#define READYUVA422 \ - "movd (%[u_buf]),%%xmm0 \n" \ - "movd 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ - "lea 0x4(%[u_buf]),%[u_buf] \n" \ - "punpcklbw %%xmm1,%%xmm0 \n" \ - "punpcklwd %%xmm0,%%xmm0 \n" \ - "movq (%[y_buf]),%%xmm4 \n" \ - "punpcklbw %%xmm4,%%xmm4 \n" \ - "lea 0x8(%[y_buf]),%[y_buf] \n" \ - "movq (%[a_buf]),%%xmm5 \n" \ - "lea 0x8(%[a_buf]),%[a_buf] \n" +#define READYUVA422 \ + "movd (%[u_buf]),%%xmm0 \n" \ + "movd 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x4(%[u_buf]),%[u_buf] \n" \ + "punpcklbw %%xmm1,%%xmm0 \n" \ + "punpcklwd %%xmm0,%%xmm0 \n" \ + "movq (%[y_buf]),%%xmm4 \n" \ + "punpcklbw %%xmm4,%%xmm4 \n" \ + "lea 0x8(%[y_buf]),%[y_buf] \n" \ + "movq (%[a_buf]),%%xmm5 \n" \ + "lea 0x8(%[a_buf]),%[a_buf] \n" // Read 4 UV from NV12, upsample to 8 UV -#define READNV12 \ - "movq (%[uv_buf]),%%xmm0 \n" \ - "lea 0x8(%[uv_buf]),%[uv_buf] \n" \ - "punpcklwd %%xmm0,%%xmm0 \n" \ - "movq (%[y_buf]),%%xmm4 \n" \ - "punpcklbw %%xmm4,%%xmm4 \n" \ - "lea 0x8(%[y_buf]),%[y_buf] \n" +#define READNV12 \ + "movq (%[uv_buf]),%%xmm0 \n" \ + "lea 0x8(%[uv_buf]),%[uv_buf] \n" \ + "punpcklwd %%xmm0,%%xmm0 \n" \ + "movq (%[y_buf]),%%xmm4 \n" \ + "punpcklbw %%xmm4,%%xmm4 \n" \ + "lea 0x8(%[y_buf]),%[y_buf] \n" // Read 4 VU from NV21, upsample to 8 UV -#define READNV21 \ - "movq (%[vu_buf]),%%xmm0 \n" \ - "lea 0x8(%[vu_buf]),%[vu_buf] \n" \ - "pshufb %[kShuffleNV21], %%xmm0 \n" \ - "movq (%[y_buf]),%%xmm4 \n" \ - "punpcklbw %%xmm4,%%xmm4 \n" \ - "lea 0x8(%[y_buf]),%[y_buf] \n" +#define READNV21 \ + "movq (%[vu_buf]),%%xmm0 \n" \ + "lea 0x8(%[vu_buf]),%[vu_buf] \n" \ + "pshufb %[kShuffleNV21], %%xmm0 \n" \ + "movq (%[y_buf]),%%xmm4 \n" \ + "punpcklbw %%xmm4,%%xmm4 \n" \ + "lea 0x8(%[y_buf]),%[y_buf] \n" // Read 4 YUY2 with 8 Y and update 4 UV to 8 UV. -#define READYUY2 \ - "movdqu (%[yuy2_buf]),%%xmm4 \n" \ - "pshufb %[kShuffleYUY2Y], %%xmm4 \n" \ - "movdqu (%[yuy2_buf]),%%xmm0 \n" \ - "pshufb %[kShuffleYUY2UV], %%xmm0 \n" \ - "lea 0x10(%[yuy2_buf]),%[yuy2_buf] \n" +#define READYUY2 \ + "movdqu (%[yuy2_buf]),%%xmm4 \n" \ + "pshufb %[kShuffleYUY2Y], %%xmm4 \n" \ + "movdqu (%[yuy2_buf]),%%xmm0 \n" \ + "pshufb %[kShuffleYUY2UV], %%xmm0 \n" \ + "lea 0x10(%[yuy2_buf]),%[yuy2_buf] \n" // Read 4 UYVY with 8 Y and update 4 UV to 8 UV. 
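// A scalar model of the READUYVY shuffles, for illustration. UYVY packs a
// pixel pair into 32 bits as U0 Y0 V0 Y1, so one 16-byte load carries 8 Y
// samples and 4 shared UV pairs; the two pshufb masks split Y from UV and
// duplicate each UV pair across its two pixels.
#include <stdint.h>
static void ReadUYVY_Sketch(const uint8_t* uyvy, uint8_t y[8], uint8_t u[8],
                            uint8_t v[8]) {
  for (int i = 0; i < 8; ++i) {
    y[i] = uyvy[2 * i + 1];        // Y sits at odd bytes.
    u[i] = uyvy[(i / 2) * 4 + 0];  // Each U is shared by a pixel pair.
    v[i] = uyvy[(i / 2) * 4 + 2];  // Each V is shared by the same pair.
  }
}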
-#define READUYVY \ - "movdqu (%[uyvy_buf]),%%xmm4 \n" \ - "pshufb %[kShuffleUYVYY], %%xmm4 \n" \ - "movdqu (%[uyvy_buf]),%%xmm0 \n" \ - "pshufb %[kShuffleUYVYUV], %%xmm0 \n" \ - "lea 0x10(%[uyvy_buf]),%[uyvy_buf] \n" +#define READUYVY \ + "movdqu (%[uyvy_buf]),%%xmm4 \n" \ + "pshufb %[kShuffleUYVYY], %%xmm4 \n" \ + "movdqu (%[uyvy_buf]),%%xmm0 \n" \ + "pshufb %[kShuffleUYVYUV], %%xmm0 \n" \ + "lea 0x10(%[uyvy_buf]),%[uyvy_buf] \n" #if defined(__x86_64__) -#define YUVTORGB_SETUP(yuvconstants) \ - "movdqa (%[yuvconstants]),%%xmm8 \n" \ - "movdqa 32(%[yuvconstants]),%%xmm9 \n" \ - "movdqa 64(%[yuvconstants]),%%xmm10 \n" \ - "movdqa 96(%[yuvconstants]),%%xmm11 \n" \ - "movdqa 128(%[yuvconstants]),%%xmm12 \n" \ - "movdqa 160(%[yuvconstants]),%%xmm13 \n" \ - "movdqa 192(%[yuvconstants]),%%xmm14 \n" +#define YUVTORGB_SETUP(yuvconstants) \ + "movdqa (%[yuvconstants]),%%xmm8 \n" \ + "movdqa 32(%[yuvconstants]),%%xmm9 \n" \ + "movdqa 64(%[yuvconstants]),%%xmm10 \n" \ + "movdqa 96(%[yuvconstants]),%%xmm11 \n" \ + "movdqa 128(%[yuvconstants]),%%xmm12 \n" \ + "movdqa 160(%[yuvconstants]),%%xmm13 \n" \ + "movdqa 192(%[yuvconstants]),%%xmm14 \n" // Convert 8 pixels: 8 UV and 8 Y #define YUVTORGB16(yuvconstants) \ "movdqa %%xmm0,%%xmm1 \n" \ @@ -1719,28 +1704,28 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0, #else #define YUVTORGB_SETUP(yuvconstants) // Convert 8 pixels: 8 UV and 8 Y -#define YUVTORGB16(yuvconstants) \ - "movdqa %%xmm0,%%xmm1 \n" \ - "movdqa %%xmm0,%%xmm2 \n" \ - "movdqa %%xmm0,%%xmm3 \n" \ - "movdqa 96(%[yuvconstants]),%%xmm0 \n" \ - "pmaddubsw (%[yuvconstants]),%%xmm1 \n" \ - "psubw %%xmm1,%%xmm0 \n" \ - "movdqa 128(%[yuvconstants]),%%xmm1 \n" \ - "pmaddubsw 32(%[yuvconstants]),%%xmm2 \n" \ - "psubw %%xmm2,%%xmm1 \n" \ - "movdqa 160(%[yuvconstants]),%%xmm2 \n" \ - "pmaddubsw 64(%[yuvconstants]),%%xmm3 \n" \ - "psubw %%xmm3,%%xmm2 \n" \ - "pmulhuw 192(%[yuvconstants]),%%xmm4 \n" \ - "paddsw %%xmm4,%%xmm0 \n" \ - "paddsw %%xmm4,%%xmm1 \n" \ +#define YUVTORGB16(yuvconstants) \ + "movdqa %%xmm0,%%xmm1 \n" \ + "movdqa %%xmm0,%%xmm2 \n" \ + "movdqa %%xmm0,%%xmm3 \n" \ + "movdqa 96(%[yuvconstants]),%%xmm0 \n" \ + "pmaddubsw (%[yuvconstants]),%%xmm1 \n" \ + "psubw %%xmm1,%%xmm0 \n" \ + "movdqa 128(%[yuvconstants]),%%xmm1 \n" \ + "pmaddubsw 32(%[yuvconstants]),%%xmm2 \n" \ + "psubw %%xmm2,%%xmm1 \n" \ + "movdqa 160(%[yuvconstants]),%%xmm2 \n" \ + "pmaddubsw 64(%[yuvconstants]),%%xmm3 \n" \ + "psubw %%xmm3,%%xmm2 \n" \ + "pmulhuw 192(%[yuvconstants]),%%xmm4 \n" \ + "paddsw %%xmm4,%%xmm0 \n" \ + "paddsw %%xmm4,%%xmm1 \n" \ "paddsw %%xmm4,%%xmm2 \n" #define YUVTORGB_REGS #endif -#define YUVTORGB(yuvconstants) \ - YUVTORGB16(yuvconstants) \ +#define YUVTORGB(yuvconstants) \ + YUVTORGB16(yuvconstants) \ "psraw $0x6,%%xmm0 \n" \ "psraw $0x6,%%xmm1 \n" \ "psraw $0x6,%%xmm2 \n" \ @@ -1749,30 +1734,30 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0, "packuswb %%xmm2,%%xmm2 \n" // Store 8 ARGB values. 
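// A scalar model of STOREARGB, for illustration. Assuming xmm0/xmm1/xmm2
// hold the packed B/G/R lanes produced by YUVTORGB and xmm5 the alpha, the
// two rounds of punpck interleave them into the little-endian ARGB memory
// order B,G,R,A ahead of the two movdqu stores.
#include <stdint.h>
static void StoreARGB_Sketch(const uint8_t* b, const uint8_t* g,
                             const uint8_t* r, uint8_t a, uint8_t* dst_argb) {
  for (int i = 0; i < 8; ++i) {  // 8 pixels -> 32 bytes.
    dst_argb[4 * i + 0] = b[i];
    dst_argb[4 * i + 1] = g[i];
    dst_argb[4 * i + 2] = r[i];
    dst_argb[4 * i + 3] = a;
  }
}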
-#define STOREARGB \ - "punpcklbw %%xmm1,%%xmm0 \n" \ - "punpcklbw %%xmm5,%%xmm2 \n" \ - "movdqa %%xmm0,%%xmm1 \n" \ - "punpcklwd %%xmm2,%%xmm0 \n" \ - "punpckhwd %%xmm2,%%xmm1 \n" \ - "movdqu %%xmm0,(%[dst_argb]) \n" \ - "movdqu %%xmm1,0x10(%[dst_argb]) \n" \ - "lea 0x20(%[dst_argb]), %[dst_argb] \n" +#define STOREARGB \ + "punpcklbw %%xmm1,%%xmm0 \n" \ + "punpcklbw %%xmm5,%%xmm2 \n" \ + "movdqa %%xmm0,%%xmm1 \n" \ + "punpcklwd %%xmm2,%%xmm0 \n" \ + "punpckhwd %%xmm2,%%xmm1 \n" \ + "movdqu %%xmm0,(%[dst_argb]) \n" \ + "movdqu %%xmm1,0x10(%[dst_argb]) \n" \ + "lea 0x20(%[dst_argb]), %[dst_argb] \n" // Store 8 RGBA values. -#define STORERGBA \ - "pcmpeqb %%xmm5,%%xmm5 \n" \ - "punpcklbw %%xmm2,%%xmm1 \n" \ - "punpcklbw %%xmm0,%%xmm5 \n" \ - "movdqa %%xmm5,%%xmm0 \n" \ - "punpcklwd %%xmm1,%%xmm5 \n" \ - "punpckhwd %%xmm1,%%xmm0 \n" \ - "movdqu %%xmm5,(%[dst_rgba]) \n" \ - "movdqu %%xmm0,0x10(%[dst_rgba]) \n" \ - "lea 0x20(%[dst_rgba]),%[dst_rgba] \n" +#define STORERGBA \ + "pcmpeqb %%xmm5,%%xmm5 \n" \ + "punpcklbw %%xmm2,%%xmm1 \n" \ + "punpcklbw %%xmm0,%%xmm5 \n" \ + "movdqa %%xmm5,%%xmm0 \n" \ + "punpcklwd %%xmm1,%%xmm5 \n" \ + "punpckhwd %%xmm1,%%xmm0 \n" \ + "movdqu %%xmm5,(%[dst_rgba]) \n" \ + "movdqu %%xmm0,0x10(%[dst_rgba]) \n" \ + "lea 0x20(%[dst_rgba]),%[dst_rgba] \n" // Store 8 AR30 values. -#define STOREAR30 \ +#define STOREAR30 \ "psraw $0x4,%%xmm0 \n" \ "psraw $0x4,%%xmm1 \n" \ "psraw $0x4,%%xmm2 \n" \ @@ -2183,111 +2168,111 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, #endif // HAS_I422TOARGBROW_SSSE3 // Read 16 UV from 444 -#define READYUV444_AVX2 \ - "vmovdqu (%[u_buf]),%%xmm0 \n" \ - "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ - "lea 0x10(%[u_buf]),%[u_buf] \n" \ - "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ - "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ - "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ - "vmovdqu (%[y_buf]),%%xmm4 \n" \ - "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ - "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ - "lea 0x10(%[y_buf]),%[y_buf] \n" +#define READYUV444_AVX2 \ + "vmovdqu (%[u_buf]),%%xmm0 \n" \ + "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x10(%[u_buf]),%[u_buf] \n" \ + "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ + "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ + "vmovdqu (%[y_buf]),%%xmm4 \n" \ + "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ + "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" // Read 8 UV from 422, upsample to 16 UV. -#define READYUV422_AVX2 \ - "vmovq (%[u_buf]),%%xmm0 \n" \ - "vmovq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ - "lea 0x8(%[u_buf]),%[u_buf] \n" \ - "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ - "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ - "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ - "vmovdqu (%[y_buf]),%%xmm4 \n" \ - "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ - "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ - "lea 0x10(%[y_buf]),%[y_buf] \n" +#define READYUV422_AVX2 \ + "vmovq (%[u_buf]),%%xmm0 \n" \ + "vmovq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x8(%[u_buf]),%[u_buf] \n" \ + "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ + "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ + "vmovdqu (%[y_buf]),%%xmm4 \n" \ + "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ + "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" // Read 8 UV from 210 10 bit, upsample to 16 UV // TODO(fbarchard): Consider vshufb to replace pack/unpack // TODO(fbarchard): Consider vunpcklpd to combine the 2 registers into 1. 
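// A scalar model of the 10-bit narrowing in the macro that follows, for
// illustration. UV arrives in 16-bit lanes holding 10-bit samples; vpsraw
// by 2 drops them to 8 bits and vpackuswb saturates, while Y is shifted
// left by 6 so the later vpmulhuw Y-gain step sees the same scale as the
// 8-bit path.
#include <stdint.h>
static uint8_t NarrowUV10_Sketch(uint16_t uv10) {
  int v = uv10 >> 2;                    // 10-bit -> 8-bit.
  return (uint8_t)(v > 255 ? 255 : v);  // packuswb-style saturation.
}
static uint16_t ScaleY10_Sketch(uint16_t y10) {
  return (uint16_t)(y10 << 6);  // 10-bit -> 16-bit working range.
}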
-#define READYUV210_AVX2 \ - "vmovdqu (%[u_buf]),%%xmm0 \n" \ - "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ - "lea 0x10(%[u_buf]),%[u_buf] \n" \ - "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ - "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ - "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" \ - "vpsraw $0x2,%%ymm0,%%ymm0 \n" \ - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \ - "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ - "vmovdqu (%[y_buf]),%%ymm4 \n" \ - "vpsllw $0x6,%%ymm4,%%ymm4 \n" \ - "lea 0x20(%[y_buf]),%[y_buf] \n" +#define READYUV210_AVX2 \ + "vmovdqu (%[u_buf]),%%xmm0 \n" \ + "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x10(%[u_buf]),%[u_buf] \n" \ + "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ + "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" \ + "vpsraw $0x2,%%ymm0,%%ymm0 \n" \ + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \ + "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ + "vmovdqu (%[y_buf]),%%ymm4 \n" \ + "vpsllw $0x6,%%ymm4,%%ymm4 \n" \ + "lea 0x20(%[y_buf]),%[y_buf] \n" // Read 8 UV from 422, upsample to 16 UV. With 16 Alpha. -#define READYUVA422_AVX2 \ - "vmovq (%[u_buf]),%%xmm0 \n" \ - "vmovq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ - "lea 0x8(%[u_buf]),%[u_buf] \n" \ - "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ - "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ - "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ - "vmovdqu (%[y_buf]),%%xmm4 \n" \ - "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ - "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ - "lea 0x10(%[y_buf]),%[y_buf] \n" \ - "vmovdqu (%[a_buf]),%%xmm5 \n" \ - "vpermq $0xd8,%%ymm5,%%ymm5 \n" \ - "lea 0x10(%[a_buf]),%[a_buf] \n" +#define READYUVA422_AVX2 \ + "vmovq (%[u_buf]),%%xmm0 \n" \ + "vmovq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x8(%[u_buf]),%[u_buf] \n" \ + "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ + "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ + "vmovdqu (%[y_buf]),%%xmm4 \n" \ + "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ + "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" \ + "vmovdqu (%[a_buf]),%%xmm5 \n" \ + "vpermq $0xd8,%%ymm5,%%ymm5 \n" \ + "lea 0x10(%[a_buf]),%[a_buf] \n" // Read 8 UV from NV12, upsample to 16 UV. -#define READNV12_AVX2 \ - "vmovdqu (%[uv_buf]),%%xmm0 \n" \ - "lea 0x10(%[uv_buf]),%[uv_buf] \n" \ - "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ - "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ - "vmovdqu (%[y_buf]),%%xmm4 \n" \ - "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ - "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ - "lea 0x10(%[y_buf]),%[y_buf] \n" +#define READNV12_AVX2 \ + "vmovdqu (%[uv_buf]),%%xmm0 \n" \ + "lea 0x10(%[uv_buf]),%[uv_buf] \n" \ + "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ + "vmovdqu (%[y_buf]),%%xmm4 \n" \ + "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ + "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" // Read 8 VU from NV21, upsample to 16 UV. -#define READNV21_AVX2 \ - "vmovdqu (%[vu_buf]),%%xmm0 \n" \ - "lea 0x10(%[vu_buf]),%[vu_buf] \n" \ - "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ - "vpshufb %[kShuffleNV21], %%ymm0, %%ymm0 \n" \ - "vmovdqu (%[y_buf]),%%xmm4 \n" \ - "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ - "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ - "lea 0x10(%[y_buf]),%[y_buf] \n" +#define READNV21_AVX2 \ + "vmovdqu (%[vu_buf]),%%xmm0 \n" \ + "lea 0x10(%[vu_buf]),%[vu_buf] \n" \ + "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpshufb %[kShuffleNV21], %%ymm0, %%ymm0 \n" \ + "vmovdqu (%[y_buf]),%%xmm4 \n" \ + "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ + "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" // Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV. 
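// YUY2 is the byte-swapped cousin of UYVY: a pixel pair packs as
// Y0 U0 Y1 V0, so Y sits at even bytes and the shared U/V at offsets 1 and
// 3 of each 4-byte group. A scalar model mirroring the UYVY sketch above:
#include <stdint.h>
static void ReadYUY2_Sketch(const uint8_t* yuy2, uint8_t y[8], uint8_t u[8],
                            uint8_t v[8]) {
  for (int i = 0; i < 8; ++i) {
    y[i] = yuy2[2 * i + 0];
    u[i] = yuy2[(i / 2) * 4 + 1];
    v[i] = yuy2[(i / 2) * 4 + 3];
  }
}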
-#define READYUY2_AVX2 \ - "vmovdqu (%[yuy2_buf]),%%ymm4 \n" \ - "vpshufb %[kShuffleYUY2Y], %%ymm4, %%ymm4 \n" \ - "vmovdqu (%[yuy2_buf]),%%ymm0 \n" \ - "vpshufb %[kShuffleYUY2UV], %%ymm0, %%ymm0 \n" \ - "lea 0x20(%[yuy2_buf]),%[yuy2_buf] \n" +#define READYUY2_AVX2 \ + "vmovdqu (%[yuy2_buf]),%%ymm4 \n" \ + "vpshufb %[kShuffleYUY2Y], %%ymm4, %%ymm4 \n" \ + "vmovdqu (%[yuy2_buf]),%%ymm0 \n" \ + "vpshufb %[kShuffleYUY2UV], %%ymm0, %%ymm0 \n" \ + "lea 0x20(%[yuy2_buf]),%[yuy2_buf] \n" // Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV. -#define READUYVY_AVX2 \ - "vmovdqu (%[uyvy_buf]),%%ymm4 \n" \ - "vpshufb %[kShuffleUYVYY], %%ymm4, %%ymm4 \n" \ - "vmovdqu (%[uyvy_buf]),%%ymm0 \n" \ - "vpshufb %[kShuffleUYVYUV], %%ymm0, %%ymm0 \n" \ - "lea 0x20(%[uyvy_buf]),%[uyvy_buf] \n" +#define READUYVY_AVX2 \ + "vmovdqu (%[uyvy_buf]),%%ymm4 \n" \ + "vpshufb %[kShuffleUYVYY], %%ymm4, %%ymm4 \n" \ + "vmovdqu (%[uyvy_buf]),%%ymm0 \n" \ + "vpshufb %[kShuffleUYVYUV], %%ymm0, %%ymm0 \n" \ + "lea 0x20(%[uyvy_buf]),%[uyvy_buf] \n" #if defined(__x86_64__) -#define YUVTORGB_SETUP_AVX2(yuvconstants) \ - "vmovdqa (%[yuvconstants]),%%ymm8 \n" \ - "vmovdqa 32(%[yuvconstants]),%%ymm9 \n" \ - "vmovdqa 64(%[yuvconstants]),%%ymm10 \n" \ - "vmovdqa 96(%[yuvconstants]),%%ymm11 \n" \ - "vmovdqa 128(%[yuvconstants]),%%ymm12 \n" \ - "vmovdqa 160(%[yuvconstants]),%%ymm13 \n" \ - "vmovdqa 192(%[yuvconstants]),%%ymm14 \n" +#define YUVTORGB_SETUP_AVX2(yuvconstants) \ + "vmovdqa (%[yuvconstants]),%%ymm8 \n" \ + "vmovdqa 32(%[yuvconstants]),%%ymm9 \n" \ + "vmovdqa 64(%[yuvconstants]),%%ymm10 \n" \ + "vmovdqa 96(%[yuvconstants]),%%ymm11 \n" \ + "vmovdqa 128(%[yuvconstants]),%%ymm12 \n" \ + "vmovdqa 160(%[yuvconstants]),%%ymm13 \n" \ + "vmovdqa 192(%[yuvconstants]),%%ymm14 \n" #define YUVTORGB_AVX2(yuvconstants) \ "vpmaddubsw %%ymm10,%%ymm0,%%ymm2 \n" \ @@ -2313,40 +2298,40 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, #else // Convert 16 pixels: 16 UV and 16 Y. 
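// Both YUVTORGB_AVX2 variants implement the same 6-bit fixed-point math,
// modeled per pixel by this scalar sketch (UB/UG/VG/VR, the biases BB/BG/BR
// and the luma gain YG are the yuvconstants entries at offsets 0/32/64,
// 96/128/160 and 192; the final packuswb clamps to [0,255]):
//   int y1 = ((uint32_t)(y * 0x0101) * YG) >> 16;  // vpmulhuw
//   int b = (BB - (u * UB) + y1) >> 6;             // vpmaddubsw/vpsubw/vpaddsw
//   int g = (BG - (u * UG + v * VG) + y1) >> 6;
//   int r = (BR - (v * VR) + y1) >> 6;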
#define YUVTORGB_SETUP_AVX2(yuvconstants) -#define YUVTORGB_AVX2(yuvconstants) \ - "vpmaddubsw 64(%[yuvconstants]),%%ymm0,%%ymm2 \n" \ - "vpmaddubsw 32(%[yuvconstants]),%%ymm0,%%ymm1 \n" \ - "vpmaddubsw (%[yuvconstants]),%%ymm0,%%ymm0 \n" \ - "vmovdqu 160(%[yuvconstants]),%%ymm3 \n" \ - "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \ - "vmovdqu 128(%[yuvconstants]),%%ymm3 \n" \ - "vpsubw %%ymm1,%%ymm3,%%ymm1 \n" \ - "vmovdqu 96(%[yuvconstants]),%%ymm3 \n" \ - "vpsubw %%ymm0,%%ymm3,%%ymm0 \n" \ - "vpmulhuw 192(%[yuvconstants]),%%ymm4,%%ymm4 \n" \ - "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \ - "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \ - "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" \ - "vpsraw $0x6,%%ymm0,%%ymm0 \n" \ - "vpsraw $0x6,%%ymm1,%%ymm1 \n" \ - "vpsraw $0x6,%%ymm2,%%ymm2 \n" \ - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \ - "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \ - "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n" +#define YUVTORGB_AVX2(yuvconstants) \ + "vpmaddubsw 64(%[yuvconstants]),%%ymm0,%%ymm2 \n" \ + "vpmaddubsw 32(%[yuvconstants]),%%ymm0,%%ymm1 \n" \ + "vpmaddubsw (%[yuvconstants]),%%ymm0,%%ymm0 \n" \ + "vmovdqu 160(%[yuvconstants]),%%ymm3 \n" \ + "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \ + "vmovdqu 128(%[yuvconstants]),%%ymm3 \n" \ + "vpsubw %%ymm1,%%ymm3,%%ymm1 \n" \ + "vmovdqu 96(%[yuvconstants]),%%ymm3 \n" \ + "vpsubw %%ymm0,%%ymm3,%%ymm0 \n" \ + "vpmulhuw 192(%[yuvconstants]),%%ymm4,%%ymm4 \n" \ + "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \ + "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \ + "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" \ + "vpsraw $0x6,%%ymm0,%%ymm0 \n" \ + "vpsraw $0x6,%%ymm1,%%ymm1 \n" \ + "vpsraw $0x6,%%ymm2,%%ymm2 \n" \ + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \ + "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \ + "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n" #define YUVTORGB_REGS_AVX2 #endif // Store 16 ARGB values. -#define STOREARGB_AVX2 \ - "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ - "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ - "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" \ - "vpermq $0xd8,%%ymm2,%%ymm2 \n" \ - "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" \ - "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" \ - "vmovdqu %%ymm1,(%[dst_argb]) \n" \ - "vmovdqu %%ymm0,0x20(%[dst_argb]) \n" \ - "lea 0x40(%[dst_argb]), %[dst_argb] \n" +#define STOREARGB_AVX2 \ + "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ + "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" \ + "vpermq $0xd8,%%ymm2,%%ymm2 \n" \ + "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" \ + "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" \ + "vmovdqu %%ymm1,(%[dst_argb]) \n" \ + "vmovdqu %%ymm0,0x20(%[dst_argb]) \n" \ + "lea 0x40(%[dst_argb]), %[dst_argb] \n" #ifdef HAS_I444TOARGBROW_AVX2 // 16 pixels @@ -2519,11 +2504,11 @@ void OMITFP I422ToRGBARow_AVX2(const uint8_t* y_buf, "vpermq $0xd8,%%ymm2,%%ymm2 \n" "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n" "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n" - "vmovdqu %%ymm0,(%[dst_argb]) \n" - "vmovdqu %%ymm1,0x20(%[dst_argb]) \n" - "lea 0x40(%[dst_argb]),%[dst_argb] \n" - "sub $0x10,%[width] \n" - "jg 1b \n" + "vmovdqu %%ymm0,(%[dst_argb]) \n" + "vmovdqu %%ymm1,0x20(%[dst_argb]) \n" + "lea 0x40(%[dst_argb]),%[dst_argb] \n" + "sub $0x10,%[width] \n" + "jg 1b \n" "vzeroupper \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] @@ -2672,47 +2657,46 @@ void OMITFP UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf, #ifdef HAS_I400TOARGBROW_SSE2 void I400ToARGBRow_SSE2(const uint8_t* y_buf, uint8_t* dst_argb, int width) { - asm volatile ( - "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164 - "movd %%eax,%%xmm2 \n" - "pshufd $0x0,%%xmm2,%%xmm2 \n" - "mov $0x04880488,%%eax \n" // 0488 = 1160 = 1.164 * 16 - "movd %%eax,%%xmm3 \n" 
- "pshufd $0x0,%%xmm3,%%xmm3 \n" - "pcmpeqb %%xmm4,%%xmm4 \n" - "pslld $0x18,%%xmm4 \n" + asm volatile( + "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164 + "movd %%eax,%%xmm2 \n" + "pshufd $0x0,%%xmm2,%%xmm2 \n" + "mov $0x04880488,%%eax \n" // 0488 = 1160 = 1.164 * + // 16 + "movd %%eax,%%xmm3 \n" + "pshufd $0x0,%%xmm3,%%xmm3 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "pslld $0x18,%%xmm4 \n" - LABELALIGN - "1: \n" - // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164 - "movq (%0),%%xmm0 \n" - "lea 0x8(%0),%0 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "psubusw %%xmm3,%%xmm0 \n" - "psrlw $6, %%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" + LABELALIGN + "1: \n" + // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164 + "movq (%0),%%xmm0 \n" + "lea 0x8(%0),%0 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "psubusw %%xmm3,%%xmm0 \n" + "psrlw $6, %%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" - // Step 2: Weave into ARGB - "punpcklbw %%xmm0,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm0,%%xmm0 \n" - "punpckhwd %%xmm1,%%xmm1 \n" - "por %%xmm4,%%xmm0 \n" - "por %%xmm4,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" + // Step 2: Weave into ARGB + "punpcklbw %%xmm0,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm0,%%xmm0 \n" + "punpckhwd %%xmm1,%%xmm1 \n" + "por %%xmm4,%%xmm0 \n" + "por %%xmm4,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(y_buf), // %0 - "+r"(dst_argb), // %1 - "+rm"(width) // %2 - : - : "memory", "cc", "eax" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" - ); + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(y_buf), // %0 + "+r"(dst_argb), // %1 + "+rm"(width) // %2 + : + : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); } #endif // HAS_I400TOARGBROW_SSE2 @@ -2720,46 +2704,45 @@ void I400ToARGBRow_SSE2(const uint8_t* y_buf, uint8_t* dst_argb, int width) { // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes). // note: vpunpcklbw mutates and vpackuswb unmutates. void I400ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, int width) { - asm volatile ( - "mov $0x4a354a35,%%eax \n" // 0488 = 1160 = 1.164 * 16 - "vmovd %%eax,%%xmm2 \n" - "vbroadcastss %%xmm2,%%ymm2 \n" - "mov $0x4880488,%%eax \n" // 4a35 = 18997 = 1.164 - "vmovd %%eax,%%xmm3 \n" - "vbroadcastss %%xmm3,%%ymm3 \n" - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" - "vpslld $0x18,%%ymm4,%%ymm4 \n" + asm volatile( + "mov $0x4a354a35,%%eax \n" // 0488 = 1160 = 1.164 * + // 16 + "vmovd %%eax,%%xmm2 \n" + "vbroadcastss %%xmm2,%%ymm2 \n" + "mov $0x4880488,%%eax \n" // 4a35 = 18997 = 1.164 + "vmovd %%eax,%%xmm3 \n" + "vbroadcastss %%xmm3,%%ymm3 \n" + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + "vpslld $0x18,%%ymm4,%%ymm4 \n" - LABELALIGN - "1: \n" - // Step 1: Scale Y contribution to 16 G values. 
G = (y - 16) * 1.164 - "vmovdqu (%0),%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" - "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" - "vpsubusw %%ymm3,%%ymm0,%%ymm0 \n" - "vpsrlw $0x6,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" - "vpunpcklbw %%ymm0,%%ymm0,%%ymm1 \n" - "vpermq $0xd8,%%ymm1,%%ymm1 \n" - "vpunpcklwd %%ymm1,%%ymm1,%%ymm0 \n" - "vpunpckhwd %%ymm1,%%ymm1,%%ymm1 \n" - "vpor %%ymm4,%%ymm0,%%ymm0 \n" - "vpor %%ymm4,%%ymm1,%%ymm1 \n" - "vmovdqu %%ymm0,(%1) \n" - "vmovdqu %%ymm1,0x20(%1) \n" - "lea 0x40(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(y_buf), // %0 - "+r"(dst_argb), // %1 - "+rm"(width) // %2 - : - : "memory", "cc", "eax" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" - ); + LABELALIGN + "1: \n" + // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164 + "vmovdqu (%0),%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" + "vpsubusw %%ymm3,%%ymm0,%%ymm0 \n" + "vpsrlw $0x6,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpunpcklbw %%ymm0,%%ymm0,%%ymm1 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpunpcklwd %%ymm1,%%ymm1,%%ymm0 \n" + "vpunpckhwd %%ymm1,%%ymm1,%%ymm1 \n" + "vpor %%ymm4,%%ymm0,%%ymm0 \n" + "vpor %%ymm4,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(y_buf), // %0 + "+r"(dst_argb), // %1 + "+rm"(width) // %2 + : + : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); } #endif // HAS_I400TOARGBROW_AVX2 @@ -2770,50 +2753,48 @@ static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) { intptr_t temp_width = (intptr_t)(width); - asm volatile ( - "movdqa %3,%%xmm5 \n" + asm volatile( - LABELALIGN - "1: \n" - "movdqu -0x10(%0,%2,1),%%xmm0 \n" - "pshufb %%xmm5,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(temp_width) // %2 - : "m"(kShuffleMirror) // %3 - : "memory", "cc", - "xmm0", "xmm5" - ); + "movdqa %3,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu -0x10(%0,%2,1),%%xmm0 \n" + "pshufb %%xmm5,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : "m"(kShuffleMirror) // %3 + : "memory", "cc", "xmm0", "xmm5"); } #endif // HAS_MIRRORROW_SSSE3 #ifdef HAS_MIRRORROW_AVX2 void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { intptr_t temp_width = (intptr_t)(width); - asm volatile ( - "vbroadcastf128 %3,%%ymm5 \n" + asm volatile( - LABELALIGN - "1: \n" - "vmovdqu -0x20(%0,%2,1),%%ymm0 \n" - "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" - "vpermq $0x4e,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(temp_width) // %2 - : "m"(kShuffleMirror) // %3 - : "memory", "cc", - "xmm0", "xmm5" - ); + "vbroadcastf128 %3,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu -0x20(%0,%2,1),%%ymm0 \n" + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" + "vpermq $0x4e,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : "m"(kShuffleMirror) // %3 + : "memory", "cc", "xmm0", 
"xmm5"); } #endif // HAS_MIRRORROW_AVX2 @@ -2826,29 +2807,27 @@ void MirrorUVRow_SSSE3(const uint8_t* src, uint8_t* dst_v, int width) { intptr_t temp_width = (intptr_t)(width); - asm volatile ( - "movdqa %4,%%xmm1 \n" - "lea -0x10(%0,%3,2),%0 \n" - "sub %1,%2 \n" + asm volatile( + "movdqa %4,%%xmm1 \n" + "lea -0x10(%0,%3,2),%0 \n" + "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "lea -0x10(%0),%0 \n" - "pshufb %%xmm1,%%xmm0 \n" - "movlpd %%xmm0,(%1) \n" - "movhpd %%xmm0,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $8,%3 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(temp_width) // %3 - : "m"(kShuffleMirrorUV) // %4 - : "memory", "cc", - "xmm0", "xmm1" - ); + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "lea -0x10(%0),%0 \n" + "pshufb %%xmm1,%%xmm0 \n" + "movlpd %%xmm0,(%1) \n" + "movhpd %%xmm0,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $8,%3 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(temp_width) // %3 + : "m"(kShuffleMirrorUV) // %4 + : "memory", "cc", "xmm0", "xmm1"); } #endif // HAS_MIRRORUVROW_SSSE3 @@ -2856,25 +2835,24 @@ void MirrorUVRow_SSSE3(const uint8_t* src, void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { intptr_t temp_width = (intptr_t)(width); - asm volatile ( - "lea -0x10(%0,%2,4),%0 \n" + asm volatile( - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "pshufd $0x1b,%%xmm0,%%xmm0 \n" - "lea -0x10(%0),%0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(temp_width) // %2 - : - : "memory", "cc" - , "xmm0" - ); + "lea -0x10(%0,%2,4),%0 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "pshufd $0x1b,%%xmm0,%%xmm0 \n" + "lea -0x10(%0),%0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : + : "memory", "cc", "xmm0"); } #endif // HAS_ARGBMIRRORROW_SSE2 @@ -2883,24 +2861,23 @@ void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { intptr_t temp_width = (intptr_t)(width); - asm volatile ( - "vmovdqu %3,%%ymm5 \n" + asm volatile( - LABELALIGN - "1: \n" - "vpermd -0x20(%0,%2,4),%%ymm5,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(temp_width) // %2 - : "m"(kARGBShuffleMirror_AVX2) // %3 - : "memory", "cc", - "xmm0", "xmm5" - ); + "vmovdqu %3,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vpermd -0x20(%0,%2,4),%%ymm5,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : "m"(kARGBShuffleMirror_AVX2) // %3 + : "memory", "cc", "xmm0", "xmm5"); } #endif // HAS_ARGBMIRRORROW_AVX2 @@ -2909,38 +2886,36 @@ void SplitUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsrlw $0x8,%%ymm5,%%ymm5 \n" - "sub %1,%2 \n" + asm volatile( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + "sub %1,%2 \n" - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpsrlw $0x8,%%ymm0,%%ymm2 \n" - "vpsrlw $0x8,%%ymm1,%%ymm3 \n" - "vpand 
%%ymm5,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm2,%%ymm2 \n" - "vmovdqu %%ymm0,(%1) \n" - "vmovdqu %%ymm2,0x00(%1,%2,1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_uv), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "memory", "cc", - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" - ); + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm2 \n" + "vpsrlw $0x8,%%ymm1,%%ymm3 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm2,%%ymm2 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm2,0x00(%1,%2,1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #endif // HAS_SPLITUVROW_AVX2 @@ -2949,37 +2924,35 @@ void SplitUVRow_SSE2(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "psrlw $0x8,%%xmm2 \n" - "psrlw $0x8,%%xmm3 \n" - "packuswb %%xmm3,%%xmm2 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm2,0x00(%1,%2,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "memory", "cc", - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" - ); + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "psrlw $0x8,%%xmm2 \n" + "psrlw $0x8,%%xmm3 \n" + "packuswb %%xmm3,%%xmm2 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm2,0x00(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #endif // HAS_SPLITUVROW_SSE2 @@ -2988,32 +2961,31 @@ void MergeUVRow_AVX2(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width) { - asm volatile ( - "sub %0,%1 \n" + asm volatile( - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x00(%0,%1,1),%%ymm1 \n" - "lea 0x20(%0),%0 \n" - "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n" - "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n" - "vextractf128 $0x0,%%ymm2,(%2) \n" - "vextractf128 $0x0,%%ymm0,0x10(%2) \n" - "vextractf128 $0x1,%%ymm2,0x20(%2) \n" - "vextractf128 $0x1,%%ymm0,0x30(%2) \n" - "lea 0x40(%2),%2 \n" - "sub $0x20,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_u), // %0 - "+r"(src_v), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 - : - : "memory", "cc", - "xmm0", "xmm1", "xmm2" - ); + "sub %0,%1 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x00(%0,%1,1),%%ymm1 \n" + "lea 0x20(%0),%0 
\n" + "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n" + "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n" + "vextractf128 $0x0,%%ymm2,(%2) \n" + "vextractf128 $0x0,%%ymm0,0x10(%2) \n" + "vextractf128 $0x1,%%ymm2,0x20(%2) \n" + "vextractf128 $0x1,%%ymm0,0x30(%2) \n" + "lea 0x40(%2),%2 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_MERGEUVROW_AVX2 @@ -3022,30 +2994,29 @@ void MergeUVRow_SSE2(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width) { - asm volatile ( - "sub %0,%1 \n" + asm volatile( - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%1,1),%%xmm1 \n" - "lea 0x10(%0),%0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "punpcklbw %%xmm1,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm2 \n" - "movdqu %%xmm0,(%2) \n" - "movdqu %%xmm2,0x10(%2) \n" - "lea 0x20(%2),%2 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_u), // %0 - "+r"(src_v), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 - : - : "memory", "cc", - "xmm0", "xmm1", "xmm2" - ); + "sub %0,%1 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%1,1),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm2 \n" + "movdqu %%xmm0,(%2) \n" + "movdqu %%xmm2,0x10(%2) \n" + "lea 0x20(%2),%2 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_MERGEUVROW_SSE2 @@ -3318,61 +3289,60 @@ void SplitRGBRow_SSSE3(const uint8_t* src_rgb, uint8_t* dst_g, uint8_t* dst_b, int width) { - asm volatile ( - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "pshufb %5, %%xmm0 \n" - "pshufb %6, %%xmm1 \n" - "pshufb %7, %%xmm2 \n" - "por %%xmm1,%%xmm0 \n" - "por %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" + asm volatile( - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "pshufb %8, %%xmm0 \n" - "pshufb %9, %%xmm1 \n" - "pshufb %10, %%xmm2 \n" - "por %%xmm1,%%xmm0 \n" - "por %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "pshufb %5, %%xmm0 \n" + "pshufb %6, %%xmm1 \n" + "pshufb %7, %%xmm2 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "pshufb %11, %%xmm0 \n" - "pshufb %12, %%xmm1 \n" - "pshufb %13, %%xmm2 \n" - "por %%xmm1,%%xmm0 \n" - "por %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%3) \n" - "lea 0x10(%3),%3 \n" - "lea 0x30(%0),%0 \n" - "sub $0x10,%4 \n" - "jg 1b \n" - : "+r"(src_rgb), // %0 - "+r"(dst_r), // %1 - "+r"(dst_g), // %2 - "+r"(dst_b), // %3 - "+r"(width) // %4 - : "m"(kShuffleMaskRGBToR0), // %5 - "m"(kShuffleMaskRGBToR1), // %6 - "m"(kShuffleMaskRGBToR2), // %7 - "m"(kShuffleMaskRGBToG0), // %8 - "m"(kShuffleMaskRGBToG1), // %9 - "m"(kShuffleMaskRGBToG2), // %10 - "m"(kShuffleMaskRGBToB0), // %11 - "m"(kShuffleMaskRGBToB1), // %12 - "m"(kShuffleMaskRGBToB2) // %13 - : "memory", "cc", - "xmm0", "xmm1", "xmm2" - ); + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "pshufb %8, %%xmm0 \n" + "pshufb %9, %%xmm1 \n" + "pshufb %10, %%xmm2 \n" + "por %%xmm1,%%xmm0 \n" + "por 
%%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "pshufb %11, %%xmm0 \n" + "pshufb %12, %%xmm1 \n" + "pshufb %13, %%xmm2 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%3) \n" + "lea 0x10(%3),%3 \n" + "lea 0x30(%0),%0 \n" + "sub $0x10,%4 \n" + "jg 1b \n" + : "+r"(src_rgb), // %0 + "+r"(dst_r), // %1 + "+r"(dst_g), // %2 + "+r"(dst_b), // %3 + "+r"(width) // %4 + : "m"(kShuffleMaskRGBToR0), // %5 + "m"(kShuffleMaskRGBToR1), // %6 + "m"(kShuffleMaskRGBToR2), // %7 + "m"(kShuffleMaskRGBToG0), // %8 + "m"(kShuffleMaskRGBToG1), // %9 + "m"(kShuffleMaskRGBToG2), // %10 + "m"(kShuffleMaskRGBToB0), // %11 + "m"(kShuffleMaskRGBToB1), // %12 + "m"(kShuffleMaskRGBToB2) // %13 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_SPLITRGBROW_SSSE3 @@ -3414,126 +3384,123 @@ void MergeRGBRow_SSSE3(const uint8_t* src_r, const uint8_t* src_b, uint8_t* dst_rgb, int width) { - asm volatile ( - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu (%1),%%xmm1 \n" - "movdqu (%2),%%xmm2 \n" - "pshufb %5, %%xmm0 \n" - "pshufb %6, %%xmm1 \n" - "pshufb %7, %%xmm2 \n" - "por %%xmm1,%%xmm0 \n" - "por %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%3) \n" + asm volatile( - "movdqu (%0),%%xmm0 \n" - "movdqu (%1),%%xmm1 \n" - "movdqu (%2),%%xmm2 \n" - "pshufb %8, %%xmm0 \n" - "pshufb %9, %%xmm1 \n" - "pshufb %10, %%xmm2 \n" - "por %%xmm1,%%xmm0 \n" - "por %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,16(%3) \n" + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu (%1),%%xmm1 \n" + "movdqu (%2),%%xmm2 \n" + "pshufb %5, %%xmm0 \n" + "pshufb %6, %%xmm1 \n" + "pshufb %7, %%xmm2 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%3) \n" - "movdqu (%0),%%xmm0 \n" - "movdqu (%1),%%xmm1 \n" - "movdqu (%2),%%xmm2 \n" - "pshufb %11, %%xmm0 \n" - "pshufb %12, %%xmm1 \n" - "pshufb %13, %%xmm2 \n" - "por %%xmm1,%%xmm0 \n" - "por %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,32(%3) \n" + "movdqu (%0),%%xmm0 \n" + "movdqu (%1),%%xmm1 \n" + "movdqu (%2),%%xmm2 \n" + "pshufb %8, %%xmm0 \n" + "pshufb %9, %%xmm1 \n" + "pshufb %10, %%xmm2 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,16(%3) \n" - "lea 0x10(%0),%0 \n" - "lea 0x10(%1),%1 \n" - "lea 0x10(%2),%2 \n" - "lea 0x30(%3),%3 \n" - "sub $0x10,%4 \n" - "jg 1b \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(dst_rgb), // %3 - "+r"(width) // %4 - : "m"(kShuffleMaskRToRGB0), // %5 - "m"(kShuffleMaskGToRGB0), // %6 - "m"(kShuffleMaskBToRGB0), // %7 - "m"(kShuffleMaskRToRGB1), // %8 - "m"(kShuffleMaskGToRGB1), // %9 - "m"(kShuffleMaskBToRGB1), // %10 - "m"(kShuffleMaskRToRGB2), // %11 - "m"(kShuffleMaskGToRGB2), // %12 - "m"(kShuffleMaskBToRGB2) // %13 - : "memory", "cc", - "xmm0", "xmm1", "xmm2" - ); + "movdqu (%0),%%xmm0 \n" + "movdqu (%1),%%xmm1 \n" + "movdqu (%2),%%xmm2 \n" + "pshufb %11, %%xmm0 \n" + "pshufb %12, %%xmm1 \n" + "pshufb %13, %%xmm2 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,32(%3) \n" + + "lea 0x10(%0),%0 \n" + "lea 0x10(%1),%1 \n" + "lea 0x10(%2),%2 \n" + "lea 0x30(%3),%3 \n" + "sub $0x10,%4 \n" + "jg 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_rgb), // %3 + "+r"(width) // %4 + : "m"(kShuffleMaskRToRGB0), // %5 + "m"(kShuffleMaskGToRGB0), // %6 + "m"(kShuffleMaskBToRGB0), // %7 + "m"(kShuffleMaskRToRGB1), // %8 + "m"(kShuffleMaskGToRGB1), // %9 + "m"(kShuffleMaskBToRGB1), // %10 + "m"(kShuffleMaskRToRGB2), // %11 + 
"m"(kShuffleMaskGToRGB2), // %12 + "m"(kShuffleMaskBToRGB2) // %13 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_MERGERGBROW_SSSE3 #ifdef HAS_COPYROW_SSE2 void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int count) { - asm volatile ( - "test $0xf,%0 \n" - "jne 2f \n" - "test $0xf,%1 \n" - "jne 2f \n" + asm volatile( + "test $0xf,%0 \n" + "jne 2f \n" + "test $0xf,%1 \n" + "jne 2f \n" - LABELALIGN - "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "movdqa %%xmm0,(%1) \n" - "movdqa %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "jmp 9f \n" + LABELALIGN + "1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "movdqa %%xmm0,(%1) \n" + "movdqa %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "jmp 9f \n" - LABELALIGN - "2: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 2b \n" - "9: \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(count) // %2 - : - : "memory", "cc" - , "xmm0", "xmm1" - ); + LABELALIGN + "2: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 2b \n" + + LABELALIGN "9: \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(count) // %2 + : + : "memory", "cc", "xmm0", "xmm1"); } #endif // HAS_COPYROW_SSE2 #ifdef HAS_COPYROW_AVX void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int count) { - asm volatile ( - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vmovdqu %%ymm0,(%1) \n" - "vmovdqu %%ymm1,0x20(%1) \n" - "lea 0x40(%1),%1 \n" - "sub $0x40,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(count) // %2 - : - : "memory", "cc" - , "xmm0", "xmm1" - ); + asm volatile( + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x40,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(count) // %2 + : + : "memory", "cc", "xmm0", "xmm1"); } #endif // HAS_COPYROW_AVX @@ -3541,106 +3508,105 @@ void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int count) { // Multiple of 1. 
void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width) { size_t width_tmp = (size_t)(width); - asm volatile("rep movsb \n" - : "+S"(src), // %0 - "+D"(dst), // %1 - "+c"(width_tmp) // %2 - : - : "memory", "cc"); + asm volatile( + + "rep movsb \n" + : "+S"(src), // %0 + "+D"(dst), // %1 + "+c"(width_tmp) // %2 + : + : "memory", "cc"); } #endif // HAS_COPYROW_ERMS #ifdef HAS_ARGBCOPYALPHAROW_SSE2 // width in pixels void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile ( - "pcmpeqb %%xmm0,%%xmm0 \n" - "pslld $0x18,%%xmm0 \n" - "pcmpeqb %%xmm1,%%xmm1 \n" - "psrld $0x8,%%xmm1 \n" + asm volatile( + "pcmpeqb %%xmm0,%%xmm0 \n" + "pslld $0x18,%%xmm0 \n" + "pcmpeqb %%xmm1,%%xmm1 \n" + "psrld $0x8,%%xmm1 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm2 \n" - "movdqu 0x10(%0),%%xmm3 \n" - "lea 0x20(%0),%0 \n" - "movdqu (%1),%%xmm4 \n" - "movdqu 0x10(%1),%%xmm5 \n" - "pand %%xmm0,%%xmm2 \n" - "pand %%xmm0,%%xmm3 \n" - "pand %%xmm1,%%xmm4 \n" - "pand %%xmm1,%%xmm5 \n" - "por %%xmm4,%%xmm2 \n" - "por %%xmm5,%%xmm3 \n" - "movdqu %%xmm2,(%1) \n" - "movdqu %%xmm3,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm2 \n" + "movdqu 0x10(%0),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "movdqu (%1),%%xmm4 \n" + "movdqu 0x10(%1),%%xmm5 \n" + "pand %%xmm0,%%xmm2 \n" + "pand %%xmm0,%%xmm3 \n" + "pand %%xmm1,%%xmm4 \n" + "pand %%xmm1,%%xmm5 \n" + "por %%xmm4,%%xmm2 \n" + "por %%xmm5,%%xmm3 \n" + "movdqu %%xmm2,(%1) \n" + "movdqu %%xmm3,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif // HAS_ARGBCOPYALPHAROW_SSE2 #ifdef HAS_ARGBCOPYALPHAROW_AVX2 // width in pixels void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile ( - "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" - "vpsrld $0x8,%%ymm0,%%ymm0 \n" + asm volatile( + "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" + "vpsrld $0x8,%%ymm0,%%ymm0 \n" - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm1 \n" - "vmovdqu 0x20(%0),%%ymm2 \n" - "lea 0x40(%0),%0 \n" - "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n" - "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n" - "vmovdqu %%ymm1,(%1) \n" - "vmovdqu %%ymm2,0x20(%1) \n" - "lea 0x40(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc" - , "xmm0", "xmm1", "xmm2" - ); + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm1 \n" + "vmovdqu 0x20(%0),%%ymm2 \n" + "lea 0x40(%0),%0 \n" + "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n" + "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n" + "vmovdqu %%ymm1,(%1) \n" + "vmovdqu %%ymm2,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_ARGBCOPYALPHAROW_AVX2 #ifdef HAS_ARGBEXTRACTALPHAROW_SSE2 // width in pixels -void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb, uint8_t* dst_a, int width) { - asm volatile ( - LABELALIGN - "1: \n" - "movdqu (%0), %%xmm0 \n" - "movdqu 0x10(%0), %%xmm1 \n" - "lea 0x20(%0), %0 \n" - "psrld $0x18, %%xmm0 \n" - "psrld $0x18, %%xmm1 \n" - "packssdw %%xmm1, %%xmm0 \n" - "packuswb %%xmm0, %%xmm0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1), %1 \n" - "sub 
$0x8, %2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_a), // %1 - "+rm"(width) // %2 - : - : "memory", "cc" - , "xmm0", "xmm1" - ); +void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_a, + int width) { + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0), %%xmm0 \n" + "movdqu 0x10(%0), %%xmm1 \n" + "lea 0x20(%0), %0 \n" + "psrld $0x18, %%xmm0 \n" + "psrld $0x18, %%xmm1 \n" + "packssdw %%xmm1, %%xmm0 \n" + "packuswb %%xmm0, %%xmm0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1), %1 \n" + "sub $0x8, %2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_a), // %1 + "+rm"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1"); } #endif // HAS_ARGBEXTRACTALPHAROW_SSE2 @@ -3649,110 +3615,106 @@ static const uvec8 kShuffleAlphaShort_AVX2 = { 3u, 128u, 128u, 128u, 7u, 128u, 128u, 128u, 11u, 128u, 128u, 128u, 15u, 128u, 128u, 128u}; -void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb, uint8_t* dst_a, int width) { - asm volatile ( - "vmovdqa %3,%%ymm4 \n" - "vbroadcastf128 %4,%%ymm5 \n" +void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_a, + int width) { + asm volatile( + "vmovdqa %3,%%ymm4 \n" + "vbroadcastf128 %4,%%ymm5 \n" - LABELALIGN - "1: \n" - "vmovdqu (%0), %%ymm0 \n" - "vmovdqu 0x20(%0), %%ymm1 \n" - "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // vpsrld $0x18, %%ymm0 - "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" - "vmovdqu 0x40(%0), %%ymm2 \n" - "vmovdqu 0x60(%0), %%ymm3 \n" - "lea 0x80(%0), %0 \n" - "vpackssdw %%ymm1, %%ymm0, %%ymm0 \n" // mutates - "vpshufb %%ymm5,%%ymm2,%%ymm2 \n" - "vpshufb %%ymm5,%%ymm3,%%ymm3 \n" - "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // mutates - "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates. - "vpermd %%ymm0,%%ymm4,%%ymm0 \n" // unmutate. - "vmovdqu %%ymm0,(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20, %2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(dst_a), // %1 - "+rm"(width) // %2 - : "m"(kPermdARGBToY_AVX), // %3 - "m"(kShuffleAlphaShort_AVX2) // %4 - : "memory", "cc" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); + LABELALIGN + "1: \n" + "vmovdqu (%0), %%ymm0 \n" + "vmovdqu 0x20(%0), %%ymm1 \n" + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // vpsrld $0x18, %%ymm0 + "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" + "vmovdqu 0x40(%0), %%ymm2 \n" + "vmovdqu 0x60(%0), %%ymm3 \n" + "lea 0x80(%0), %0 \n" + "vpackssdw %%ymm1, %%ymm0, %%ymm0 \n" // mutates + "vpshufb %%ymm5,%%ymm2,%%ymm2 \n" + "vpshufb %%ymm5,%%ymm3,%%ymm3 \n" + "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // mutates + "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates. + "vpermd %%ymm0,%%ymm4,%%ymm0 \n" // unmutate. 
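+      // AVX2 vpackssdw/vpackuswb interleave per 128-bit lane, so the vpermd
+      // above restores the original pixel order before the store.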
+ "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20, %2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_a), // %1 + "+rm"(width) // %2 + : "m"(kPermdARGBToY_AVX), // %3 + "m"(kShuffleAlphaShort_AVX2) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif // HAS_ARGBEXTRACTALPHAROW_AVX2 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 // width in pixels void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile ( - "pcmpeqb %%xmm0,%%xmm0 \n" - "pslld $0x18,%%xmm0 \n" - "pcmpeqb %%xmm1,%%xmm1 \n" - "psrld $0x8,%%xmm1 \n" + asm volatile( + "pcmpeqb %%xmm0,%%xmm0 \n" + "pslld $0x18,%%xmm0 \n" + "pcmpeqb %%xmm1,%%xmm1 \n" + "psrld $0x8,%%xmm1 \n" - LABELALIGN - "1: \n" - "movq (%0),%%xmm2 \n" - "lea 0x8(%0),%0 \n" - "punpcklbw %%xmm2,%%xmm2 \n" - "punpckhwd %%xmm2,%%xmm3 \n" - "punpcklwd %%xmm2,%%xmm2 \n" - "movdqu (%1),%%xmm4 \n" - "movdqu 0x10(%1),%%xmm5 \n" - "pand %%xmm0,%%xmm2 \n" - "pand %%xmm0,%%xmm3 \n" - "pand %%xmm1,%%xmm4 \n" - "pand %%xmm1,%%xmm5 \n" - "por %%xmm4,%%xmm2 \n" - "por %%xmm5,%%xmm3 \n" - "movdqu %%xmm2,(%1) \n" - "movdqu %%xmm3,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); + LABELALIGN + "1: \n" + "movq (%0),%%xmm2 \n" + "lea 0x8(%0),%0 \n" + "punpcklbw %%xmm2,%%xmm2 \n" + "punpckhwd %%xmm2,%%xmm3 \n" + "punpcklwd %%xmm2,%%xmm2 \n" + "movdqu (%1),%%xmm4 \n" + "movdqu 0x10(%1),%%xmm5 \n" + "pand %%xmm0,%%xmm2 \n" + "pand %%xmm0,%%xmm3 \n" + "pand %%xmm1,%%xmm4 \n" + "pand %%xmm1,%%xmm5 \n" + "por %%xmm4,%%xmm2 \n" + "por %%xmm5,%%xmm3 \n" + "movdqu %%xmm2,(%1) \n" + "movdqu %%xmm3,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif // HAS_ARGBCOPYYTOALPHAROW_SSE2 #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 // width in pixels void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile ( - "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" - "vpsrld $0x8,%%ymm0,%%ymm0 \n" + asm volatile( + "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" + "vpsrld $0x8,%%ymm0,%%ymm0 \n" - LABELALIGN - "1: \n" - "vpmovzxbd (%0),%%ymm1 \n" - "vpmovzxbd 0x8(%0),%%ymm2 \n" - "lea 0x10(%0),%0 \n" - "vpslld $0x18,%%ymm1,%%ymm1 \n" - "vpslld $0x18,%%ymm2,%%ymm2 \n" - "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n" - "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n" - "vmovdqu %%ymm1,(%1) \n" - "vmovdqu %%ymm2,0x20(%1) \n" - "lea 0x40(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc" - , "xmm0", "xmm1", "xmm2" - ); + LABELALIGN + "1: \n" + "vpmovzxbd (%0),%%ymm1 \n" + "vpmovzxbd 0x8(%0),%%ymm2 \n" + "lea 0x10(%0),%0 \n" + "vpslld $0x18,%%ymm1,%%ymm1 \n" + "vpslld $0x18,%%ymm2,%%ymm2 \n" + "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n" + "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n" + "vmovdqu %%ymm1,(%1) \n" + "vmovdqu %%ymm2,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2 @@ -3760,57 +3722,61 @@ void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { void SetRow_X86(uint8_t* dst, uint8_t v8, int width) { size_t width_tmp = 
(size_t)(width >> 2); const uint32_t v32 = v8 * 0x01010101u; // Duplicate byte to all bytes. - asm volatile("rep stosl \n" - : "+D"(dst), // %0 - "+c"(width_tmp) // %1 - : "a"(v32) // %2 - : "memory", "cc"); + asm volatile( + + "rep stosl \n" + : "+D"(dst), // %0 + "+c"(width_tmp) // %1 + : "a"(v32) // %2 + : "memory", "cc"); } void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) { size_t width_tmp = (size_t)(width); - asm volatile("rep stosb \n" - : "+D"(dst), // %0 - "+c"(width_tmp) // %1 - : "a"(v8) // %2 - : "memory", "cc"); + asm volatile( + + "rep stosb \n" + : "+D"(dst), // %0 + "+c"(width_tmp) // %1 + : "a"(v8) // %2 + : "memory", "cc"); } void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width) { size_t width_tmp = (size_t)(width); - asm volatile("rep stosl \n" - : "+D"(dst_argb), // %0 - "+c"(width_tmp) // %1 - : "a"(v32) // %2 - : "memory", "cc"); + asm volatile( + + "rep stosl \n" + : "+D"(dst_argb), // %0 + "+c"(width_tmp) // %1 + : "a"(v32) // %2 + : "memory", "cc"); } #endif // HAS_SETROW_X86 #ifdef HAS_YUY2TOYROW_SSE2 void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "memory", "cc" - , "xmm0", "xmm1", "xmm5" - ); + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm5"); } void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2, @@ -3818,101 +3784,96 @@ void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x00(%0,%4,1),%%xmm2 \n" - "movdqu 0x10(%0,%4,1),%%xmm3 \n" - "lea 0x20(%0),%0 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0,(%1) \n" - "movq %%xmm1,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : "r"((intptr_t)(stride_yuy2)) // %4 - : "memory", "cc", - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" - ); + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x00(%0,%4,1),%%xmm2 \n" + "movdqu 0x10(%0,%4,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + 
"packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"((intptr_t)(stride_yuy2)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0,(%1) \n" - "movq %%xmm1,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "memory", "cc", - "xmm0", "xmm1", "xmm5" - ); + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm5"); } void UYVYToYRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { - asm volatile ( - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "memory", "cc" - , "xmm0", "xmm1" - ); + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1"); } void UYVYToUVRow_SSE2(const uint8_t* src_uyvy, @@ -3920,108 +3881,102 @@ void UYVYToUVRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x00(%0,%4,1),%%xmm2 \n" - "movdqu 0x10(%0,%4,1),%%xmm3 \n" - "lea 0x20(%0),%0 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0,(%1) \n" - "movq %%xmm1,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_u), // %1 
- "+r"(dst_v), // %2 - "+r"(width) // %3 - : "r"((intptr_t)(stride_uyvy)) // %4 - : "memory", "cc", - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" - ); + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x00(%0,%4,1),%%xmm2 \n" + "movdqu 0x10(%0,%4,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"((intptr_t)(stride_uyvy)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0,(%1) \n" - "movq %%xmm1,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "memory", "cc", - "xmm0", "xmm1", "xmm5" - ); + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm5"); } #endif // HAS_YUY2TOYROW_SSE2 #ifdef HAS_YUY2TOYROW_AVX2 void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { - asm volatile ( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + asm volatile( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpand %%ymm5,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "memory", "cc" - , "xmm0", "xmm1", "xmm5" - ); + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm5"); } 
void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2, @@ -4029,189 +3984,180 @@ void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsrlw $0x8,%%ymm5,%%ymm5 \n" - "sub %1,%2 \n" + asm volatile( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + "sub %1,%2 \n" - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" - "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm0,%%ymm1 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm1,%%ymm1 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vextractf128 $0x0,%%ymm1,(%1) \n" - "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : "r"((intptr_t)(stride_yuy2)) // %4 - : "memory", "cc", - "xmm0", "xmm1", "xmm5" - ); + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" + "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm0,%%ymm1 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vextractf128 $0x0,%%ymm1,(%1) \n" + "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"((intptr_t)(stride_yuy2)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm5"); } void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsrlw $0x8,%%ymm5,%%ymm5 \n" - "sub %1,%2 \n" + asm volatile( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + "sub %1,%2 \n" - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm0,%%ymm1 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm1,%%ymm1 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vextractf128 $0x0,%%ymm1,(%1) \n" - "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "memory", "cc", - "xmm0", "xmm1", "xmm5" - ); + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm0,%%ymm1 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + 
"vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vextractf128 $0x0,%%ymm1,(%1) \n" + "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm5"); } void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { - asm volatile ( - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "memory", "cc" - , "xmm0", "xmm1", "xmm5" - ); + asm volatile( + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm5"); } void UYVYToUVRow_AVX2(const uint8_t* src_uyvy, int stride_uyvy, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsrlw $0x8,%%ymm5,%%ymm5 \n" - "sub %1,%2 \n" + asm volatile( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + "sub %1,%2 \n" - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" - "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpand %%ymm5,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm0,%%ymm1 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm1,%%ymm1 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vextractf128 $0x0,%%ymm1,(%1) \n" - "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : "r"((intptr_t)(stride_uyvy)) // %4 - : "memory", "cc", - "xmm0", "xmm1", "xmm5" - ); + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" + "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm0,%%ymm1 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vextractf128 $0x0,%%ymm1,(%1) \n" + "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"((intptr_t)(stride_uyvy)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm5"); } void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy, uint8_t* dst_u, uint8_t* dst_v, int width) { - 
asm volatile ( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsrlw $0x8,%%ymm5,%%ymm5 \n" - "sub %1,%2 \n" + asm volatile( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + "sub %1,%2 \n" - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpand %%ymm5,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm0,%%ymm1 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm1,%%ymm1 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vextractf128 $0x0,%%ymm1,(%1) \n" - "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "memory", "cc", - "xmm0", "xmm1", "xmm5" - ); + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm0,%%ymm1 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vextractf128 $0x0,%%ymm1,(%1) \n" + "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm5"); } #endif // HAS_YUY2TOYROW_AVX2 @@ -4225,81 +4171,80 @@ void ARGBBlendRow_SSSE3(const uint8_t* src_argb0, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm volatile ( - "pcmpeqb %%xmm7,%%xmm7 \n" - "psrlw $0xf,%%xmm7 \n" - "pcmpeqb %%xmm6,%%xmm6 \n" - "psrlw $0x8,%%xmm6 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "psllw $0x8,%%xmm5 \n" - "pcmpeqb %%xmm4,%%xmm4 \n" - "pslld $0x18,%%xmm4 \n" - "sub $0x4,%3 \n" - "jl 49f \n" + asm volatile( + "pcmpeqb %%xmm7,%%xmm7 \n" + "psrlw $0xf,%%xmm7 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrlw $0x8,%%xmm6 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "psllw $0x8,%%xmm5 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "pslld $0x18,%%xmm4 \n" + "sub $0x4,%3 \n" + "jl 49f \n" - // 4 pixel loop. - LABELALIGN - "40: \n" - "movdqu (%0),%%xmm3 \n" - "lea 0x10(%0),%0 \n" - "movdqa %%xmm3,%%xmm0 \n" - "pxor %%xmm4,%%xmm3 \n" - "movdqu (%1),%%xmm2 \n" - "pshufb %4,%%xmm3 \n" - "pand %%xmm6,%%xmm2 \n" - "paddw %%xmm7,%%xmm3 \n" - "pmullw %%xmm3,%%xmm2 \n" - "movdqu (%1),%%xmm1 \n" - "lea 0x10(%1),%1 \n" - "psrlw $0x8,%%xmm1 \n" - "por %%xmm4,%%xmm0 \n" - "pmullw %%xmm3,%%xmm1 \n" - "psrlw $0x8,%%xmm2 \n" - "paddusb %%xmm2,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%3 \n" - "jge 40b \n" + // 4 pixel loop. 
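+      // Per pixel (sketch): dst = src0 + src1 * (256 - a0) / 256, where a0
+      // is the src_argb0 alpha; paddusb saturates the result alpha to 255.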
+ LABELALIGN + "40: \n" + "movdqu (%0),%%xmm3 \n" + "lea 0x10(%0),%0 \n" + "movdqa %%xmm3,%%xmm0 \n" + "pxor %%xmm4,%%xmm3 \n" + "movdqu (%1),%%xmm2 \n" + "pshufb %4,%%xmm3 \n" + "pand %%xmm6,%%xmm2 \n" + "paddw %%xmm7,%%xmm3 \n" + "pmullw %%xmm3,%%xmm2 \n" + "movdqu (%1),%%xmm1 \n" + "lea 0x10(%1),%1 \n" + "psrlw $0x8,%%xmm1 \n" + "por %%xmm4,%%xmm0 \n" + "pmullw %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm2 \n" + "paddusb %%xmm2,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jge 40b \n" - "49: \n" - "add $0x3,%3 \n" - "jl 99f \n" + "49: \n" + "add $0x3,%3 \n" + "jl 99f \n" - // 1 pixel loop. - "91: \n" - "movd (%0),%%xmm3 \n" - "lea 0x4(%0),%0 \n" - "movdqa %%xmm3,%%xmm0 \n" - "pxor %%xmm4,%%xmm3 \n" - "movd (%1),%%xmm2 \n" - "pshufb %4,%%xmm3 \n" - "pand %%xmm6,%%xmm2 \n" - "paddw %%xmm7,%%xmm3 \n" - "pmullw %%xmm3,%%xmm2 \n" - "movd (%1),%%xmm1 \n" - "lea 0x4(%1),%1 \n" - "psrlw $0x8,%%xmm1 \n" - "por %%xmm4,%%xmm0 \n" - "pmullw %%xmm3,%%xmm1 \n" - "psrlw $0x8,%%xmm2 \n" - "paddusb %%xmm2,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movd %%xmm0,(%2) \n" - "lea 0x4(%2),%2 \n" - "sub $0x1,%3 \n" - "jge 91b \n" - "99: \n" - : "+r"(src_argb0), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : "m"(kShuffleAlpha) // %4 - : "memory", "cc" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); + // 1 pixel loop. + "91: \n" + "movd (%0),%%xmm3 \n" + "lea 0x4(%0),%0 \n" + "movdqa %%xmm3,%%xmm0 \n" + "pxor %%xmm4,%%xmm3 \n" + "movd (%1),%%xmm2 \n" + "pshufb %4,%%xmm3 \n" + "pand %%xmm6,%%xmm2 \n" + "paddw %%xmm7,%%xmm3 \n" + "pmullw %%xmm3,%%xmm2 \n" + "movd (%1),%%xmm1 \n" + "lea 0x4(%1),%1 \n" + "psrlw $0x8,%%xmm1 \n" + "por %%xmm4,%%xmm0 \n" + "pmullw %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm2 \n" + "paddusb %%xmm2,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movd %%xmm0,(%2) \n" + "lea 0x4(%2),%2 \n" + "sub $0x1,%3 \n" + "jge 91b \n" + "99: \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : "m"(kShuffleAlpha) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #endif // HAS_ARGBBLENDROW_SSSE3 @@ -4423,45 +4368,45 @@ static const uvec8 kShuffleAlpha0 = {3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, static const uvec8 kShuffleAlpha1 = {11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u}; // Attenuate 4 pixels at a time. -void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - asm volatile ( - "pcmpeqb %%xmm3,%%xmm3 \n" - "pslld $0x18,%%xmm3 \n" - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" +void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + asm volatile( + "pcmpeqb %%xmm3,%%xmm3 \n" + "pslld $0x18,%%xmm3 \n" + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" - // 4 pixel loop. 
- LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "pshufb %%xmm4,%%xmm0 \n" - "movdqu (%0),%%xmm1 \n" - "punpcklbw %%xmm1,%%xmm1 \n" - "pmulhuw %%xmm1,%%xmm0 \n" - "movdqu (%0),%%xmm1 \n" - "pshufb %%xmm5,%%xmm1 \n" - "movdqu (%0),%%xmm2 \n" - "punpckhbw %%xmm2,%%xmm2 \n" - "pmulhuw %%xmm2,%%xmm1 \n" - "movdqu (%0),%%xmm2 \n" - "lea 0x10(%0),%0 \n" - "pand %%xmm3,%%xmm2 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "por %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kShuffleAlpha0), // %3 - "m"(kShuffleAlpha1) // %4 - : "memory", "cc" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); + // 4 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "pshufb %%xmm4,%%xmm0 \n" + "movdqu (%0),%%xmm1 \n" + "punpcklbw %%xmm1,%%xmm1 \n" + "pmulhuw %%xmm1,%%xmm0 \n" + "movdqu (%0),%%xmm1 \n" + "pshufb %%xmm5,%%xmm1 \n" + "movdqu (%0),%%xmm2 \n" + "punpckhbw %%xmm2,%%xmm2 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "movdqu (%0),%%xmm2 \n" + "lea 0x10(%0),%0 \n" + "pand %%xmm3,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kShuffleAlpha0), // %3 + "m"(kShuffleAlpha1) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif // HAS_ARGBATTENUATEROW_SSSE3 @@ -4471,40 +4416,40 @@ static const uvec8 kShuffleAlpha_AVX2 = {6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u}; // Attenuate 8 pixels at a time. -void ARGBAttenuateRow_AVX2(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - asm volatile ( - "vbroadcastf128 %3,%%ymm4 \n" - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpslld $0x18,%%ymm5,%%ymm5 \n" - "sub %0,%1 \n" +void ARGBAttenuateRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + asm volatile( + "vbroadcastf128 %3,%%ymm4 \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpslld $0x18,%%ymm5,%%ymm5 \n" + "sub %0,%1 \n" - // 8 pixel loop. - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm6 \n" - "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n" - "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n" - "vpshufb %%ymm4,%%ymm0,%%ymm2 \n" - "vpshufb %%ymm4,%%ymm1,%%ymm3 \n" - "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" - "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" - "vpand %%ymm5,%%ymm6,%%ymm6 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpor %%ymm6,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,0x00(%0,%1,1) \n" - "lea 0x20(%0),%0 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kShuffleAlpha_AVX2) // %3 - : "memory", "cc" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" - ); + // 8 pixel loop. 
+ LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm6 \n" + "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n" + "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n" + "vpshufb %%ymm4,%%ymm0,%%ymm2 \n" + "vpshufb %%ymm4,%%ymm1,%%ymm3 \n" + "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" + "vpand %%ymm5,%%ymm6,%%ymm6 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpor %%ymm6,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,0x00(%0,%1,1) \n" + "lea 0x20(%0),%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kShuffleAlpha_AVX2) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } #endif // HAS_ARGBATTENUATEROW_AVX2 @@ -4514,44 +4459,42 @@ void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb, uint8_t* dst_argb, int width) { uintptr_t alpha; - asm volatile ( - // 4 pixel loop. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movzb 0x03(%0),%3 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "movd 0x00(%4,%3,4),%%xmm2 \n" - "movzb 0x07(%0),%3 \n" - "movd 0x00(%4,%3,4),%%xmm3 \n" - "pshuflw $0x40,%%xmm2,%%xmm2 \n" - "pshuflw $0x40,%%xmm3,%%xmm3 \n" - "movlhps %%xmm3,%%xmm2 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "movdqu (%0),%%xmm1 \n" - "movzb 0x0b(%0),%3 \n" - "punpckhbw %%xmm1,%%xmm1 \n" - "movd 0x00(%4,%3,4),%%xmm2 \n" - "movzb 0x0f(%0),%3 \n" - "movd 0x00(%4,%3,4),%%xmm3 \n" - "pshuflw $0x40,%%xmm2,%%xmm2 \n" - "pshuflw $0x40,%%xmm3,%%xmm3 \n" - "movlhps %%xmm3,%%xmm2 \n" - "pmulhuw %%xmm2,%%xmm1 \n" - "lea 0x10(%0),%0 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width), // %2 - "=&r"(alpha) // %3 - : "r"(fixed_invtbl8) // %4 - : "memory", "cc", - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); + asm volatile( + // 4 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movzb 0x03(%0),%3 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "movd 0x00(%4,%3,4),%%xmm2 \n" + "movzb 0x07(%0),%3 \n" + "movd 0x00(%4,%3,4),%%xmm3 \n" + "pshuflw $0x40,%%xmm2,%%xmm2 \n" + "pshuflw $0x40,%%xmm3,%%xmm3 \n" + "movlhps %%xmm3,%%xmm2 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "movdqu (%0),%%xmm1 \n" + "movzb 0x0b(%0),%3 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "movd 0x00(%4,%3,4),%%xmm2 \n" + "movzb 0x0f(%0),%3 \n" + "movd 0x00(%4,%3,4),%%xmm3 \n" + "pshuflw $0x40,%%xmm2,%%xmm2 \n" + "pshuflw $0x40,%%xmm3,%%xmm3 \n" + "movlhps %%xmm3,%%xmm2 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width), // %2 + "=&r"(alpha) // %3 + : "r"(fixed_invtbl8) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif // HAS_ARGBUNATTENUATEROW_SSE2 @@ -4564,110 +4507,107 @@ void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb, uint8_t* dst_argb, int width) { uintptr_t alpha; - asm volatile ( - "sub %0,%1 \n" - "vbroadcastf128 %5,%%ymm5 \n" + asm volatile( + "sub %0,%1 \n" + "vbroadcastf128 %5,%%ymm5 \n" - // 8 pixel loop. 
- LABELALIGN - "1: \n" - // replace VPGATHER - "movzb 0x03(%0),%3 \n" - "vmovd 0x00(%4,%3,4),%%xmm0 \n" - "movzb 0x07(%0),%3 \n" - "vmovd 0x00(%4,%3,4),%%xmm1 \n" - "movzb 0x0b(%0),%3 \n" - "vpunpckldq %%xmm1,%%xmm0,%%xmm6 \n" - "vmovd 0x00(%4,%3,4),%%xmm2 \n" - "movzb 0x0f(%0),%3 \n" - "vmovd 0x00(%4,%3,4),%%xmm3 \n" - "movzb 0x13(%0),%3 \n" - "vpunpckldq %%xmm3,%%xmm2,%%xmm7 \n" - "vmovd 0x00(%4,%3,4),%%xmm0 \n" - "movzb 0x17(%0),%3 \n" - "vmovd 0x00(%4,%3,4),%%xmm1 \n" - "movzb 0x1b(%0),%3 \n" - "vpunpckldq %%xmm1,%%xmm0,%%xmm0 \n" - "vmovd 0x00(%4,%3,4),%%xmm2 \n" - "movzb 0x1f(%0),%3 \n" - "vmovd 0x00(%4,%3,4),%%xmm3 \n" - "vpunpckldq %%xmm3,%%xmm2,%%xmm2 \n" - "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3 \n" - "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0 \n" - "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3 \n" - // end of VPGATHER + // 8 pixel loop. + LABELALIGN + "1: \n" + // replace VPGATHER + "movzb 0x03(%0),%3 \n" + "vmovd 0x00(%4,%3,4),%%xmm0 \n" + "movzb 0x07(%0),%3 \n" + "vmovd 0x00(%4,%3,4),%%xmm1 \n" + "movzb 0x0b(%0),%3 \n" + "vpunpckldq %%xmm1,%%xmm0,%%xmm6 \n" + "vmovd 0x00(%4,%3,4),%%xmm2 \n" + "movzb 0x0f(%0),%3 \n" + "vmovd 0x00(%4,%3,4),%%xmm3 \n" + "movzb 0x13(%0),%3 \n" + "vpunpckldq %%xmm3,%%xmm2,%%xmm7 \n" + "vmovd 0x00(%4,%3,4),%%xmm0 \n" + "movzb 0x17(%0),%3 \n" + "vmovd 0x00(%4,%3,4),%%xmm1 \n" + "movzb 0x1b(%0),%3 \n" + "vpunpckldq %%xmm1,%%xmm0,%%xmm0 \n" + "vmovd 0x00(%4,%3,4),%%xmm2 \n" + "movzb 0x1f(%0),%3 \n" + "vmovd 0x00(%4,%3,4),%%xmm3 \n" + "vpunpckldq %%xmm3,%%xmm2,%%xmm2 \n" + "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3 \n" + "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0 \n" + "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3 \n" + // end of VPGATHER - "vmovdqu (%0),%%ymm6 \n" - "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n" - "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n" - "vpunpcklwd %%ymm3,%%ymm3,%%ymm2 \n" - "vpunpckhwd %%ymm3,%%ymm3,%%ymm3 \n" - "vpshufb %%ymm5,%%ymm2,%%ymm2 \n" - "vpshufb %%ymm5,%%ymm3,%%ymm3 \n" - "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" - "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,0x00(%0,%1,1) \n" - "lea 0x20(%0),%0 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width), // %2 - "=&r"(alpha) // %3 - : "r"(fixed_invtbl8), // %4 - "m"(kUnattenShuffleAlpha_AVX2) // %5 - : "memory", "cc", - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); + "vmovdqu (%0),%%ymm6 \n" + "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n" + "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n" + "vpunpcklwd %%ymm3,%%ymm3,%%ymm2 \n" + "vpunpckhwd %%ymm3,%%ymm3,%%ymm3 \n" + "vpshufb %%ymm5,%%ymm2,%%ymm2 \n" + "vpshufb %%ymm5,%%ymm3,%%ymm3 \n" + "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,0x00(%0,%1,1) \n" + "lea 0x20(%0),%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width), // %2 + "=&r"(alpha) // %3 + : "r"(fixed_invtbl8), // %4 + "m"(kUnattenShuffleAlpha_AVX2) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #endif // HAS_ARGBUNATTENUATEROW_AVX2 #ifdef HAS_ARGBGRAYROW_SSSE3 // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - asm volatile ( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" + asm volatile( + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" - // 8 pixel loop. 
- LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "phaddw %%xmm1,%%xmm0 \n" - "paddw %%xmm5,%%xmm0 \n" - "psrlw $0x7,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movdqu (%0),%%xmm2 \n" - "movdqu 0x10(%0),%%xmm3 \n" - "lea 0x20(%0),%0 \n" - "psrld $0x18,%%xmm2 \n" - "psrld $0x18,%%xmm3 \n" - "packuswb %%xmm3,%%xmm2 \n" - "packuswb %%xmm2,%%xmm2 \n" - "movdqa %%xmm0,%%xmm3 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "punpcklbw %%xmm2,%%xmm3 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm3,%%xmm0 \n" - "punpckhwd %%xmm3,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kARGBToYJ), // %3 - "m"(kAddYJ64) // %4 - : "memory", "cc" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); + // 8 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "phaddw %%xmm1,%%xmm0 \n" + "paddw %%xmm5,%%xmm0 \n" + "psrlw $0x7,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movdqu (%0),%%xmm2 \n" + "movdqu 0x10(%0),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "psrld $0x18,%%xmm2 \n" + "psrld $0x18,%%xmm3 \n" + "packuswb %%xmm3,%%xmm2 \n" + "packuswb %%xmm2,%%xmm2 \n" + "movdqa %%xmm0,%%xmm3 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "punpcklbw %%xmm2,%%xmm3 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm3,%%xmm0 \n" + "punpckhwd %%xmm3,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kARGBToYJ), // %3 + "m"(kAddYJ64) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif // HAS_ARGBGRAYROW_SSSE3 @@ -4687,59 +4627,57 @@ static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0, // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) { - asm volatile ( - "movdqa %2,%%xmm2 \n" - "movdqa %3,%%xmm3 \n" - "movdqa %4,%%xmm4 \n" + asm volatile( + "movdqa %2,%%xmm2 \n" + "movdqa %3,%%xmm3 \n" + "movdqa %4,%%xmm4 \n" - // 8 pixel loop. 
- LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm6 \n" - "pmaddubsw %%xmm2,%%xmm0 \n" - "pmaddubsw %%xmm2,%%xmm6 \n" - "phaddw %%xmm6,%%xmm0 \n" - "psrlw $0x7,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movdqu (%0),%%xmm5 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm5 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "phaddw %%xmm1,%%xmm5 \n" - "psrlw $0x7,%%xmm5 \n" - "packuswb %%xmm5,%%xmm5 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "movdqu (%0),%%xmm5 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm5 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "phaddw %%xmm1,%%xmm5 \n" - "psrlw $0x7,%%xmm5 \n" - "packuswb %%xmm5,%%xmm5 \n" - "movdqu (%0),%%xmm6 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "psrld $0x18,%%xmm6 \n" - "psrld $0x18,%%xmm1 \n" - "packuswb %%xmm1,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "punpcklbw %%xmm6,%%xmm5 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm5,%%xmm0 \n" - "punpckhwd %%xmm5,%%xmm1 \n" - "movdqu %%xmm0,(%0) \n" - "movdqu %%xmm1,0x10(%0) \n" - "lea 0x20(%0),%0 \n" - "sub $0x8,%1 \n" - "jg 1b \n" - : "+r"(dst_argb), // %0 - "+r"(width) // %1 - : "m"(kARGBToSepiaB), // %2 - "m"(kARGBToSepiaG), // %3 - "m"(kARGBToSepiaR) // %4 - : "memory", "cc" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" - ); + // 8 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm6 \n" + "pmaddubsw %%xmm2,%%xmm0 \n" + "pmaddubsw %%xmm2,%%xmm6 \n" + "phaddw %%xmm6,%%xmm0 \n" + "psrlw $0x7,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movdqu (%0),%%xmm5 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm5 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "phaddw %%xmm1,%%xmm5 \n" + "psrlw $0x7,%%xmm5 \n" + "packuswb %%xmm5,%%xmm5 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "movdqu (%0),%%xmm5 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm5 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "phaddw %%xmm1,%%xmm5 \n" + "psrlw $0x7,%%xmm5 \n" + "packuswb %%xmm5,%%xmm5 \n" + "movdqu (%0),%%xmm6 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "psrld $0x18,%%xmm6 \n" + "psrld $0x18,%%xmm1 \n" + "packuswb %%xmm1,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "punpcklbw %%xmm6,%%xmm5 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm5,%%xmm0 \n" + "punpckhwd %%xmm5,%%xmm1 \n" + "movdqu %%xmm0,(%0) \n" + "movdqu %%xmm1,0x10(%0) \n" + "lea 0x20(%0),%0 \n" + "sub $0x8,%1 \n" + "jg 1b \n" + : "+r"(dst_argb), // %0 + "+r"(width) // %1 + : "m"(kARGBToSepiaB), // %2 + "m"(kARGBToSepiaG), // %3 + "m"(kARGBToSepiaR) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } #endif // HAS_ARGBSEPIAROW_SSSE3 @@ -4750,62 +4688,61 @@ void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, const int8_t* matrix_argb, int width) { - asm volatile ( - "movdqu (%3),%%xmm5 \n" - "pshufd $0x00,%%xmm5,%%xmm2 \n" - "pshufd $0x55,%%xmm5,%%xmm3 \n" - "pshufd $0xaa,%%xmm5,%%xmm4 \n" - "pshufd $0xff,%%xmm5,%%xmm5 \n" + asm volatile( + "movdqu (%3),%%xmm5 \n" + "pshufd $0x00,%%xmm5,%%xmm2 \n" + "pshufd $0x55,%%xmm5,%%xmm3 \n" + "pshufd $0xaa,%%xmm5,%%xmm4 \n" + "pshufd $0xff,%%xmm5,%%xmm5 \n" - // 8 pixel loop. 
- LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm7 \n" - "pmaddubsw %%xmm2,%%xmm0 \n" - "pmaddubsw %%xmm2,%%xmm7 \n" - "movdqu (%0),%%xmm6 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "phaddsw %%xmm7,%%xmm0 \n" - "phaddsw %%xmm1,%%xmm6 \n" - "psraw $0x6,%%xmm0 \n" - "psraw $0x6,%%xmm6 \n" - "packuswb %%xmm0,%%xmm0 \n" - "packuswb %%xmm6,%%xmm6 \n" - "punpcklbw %%xmm6,%%xmm0 \n" - "movdqu (%0),%%xmm1 \n" - "movdqu 0x10(%0),%%xmm7 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm7 \n" - "phaddsw %%xmm7,%%xmm1 \n" - "movdqu (%0),%%xmm6 \n" - "movdqu 0x10(%0),%%xmm7 \n" - "pmaddubsw %%xmm5,%%xmm6 \n" - "pmaddubsw %%xmm5,%%xmm7 \n" - "phaddsw %%xmm7,%%xmm6 \n" - "psraw $0x6,%%xmm1 \n" - "psraw $0x6,%%xmm6 \n" - "packuswb %%xmm1,%%xmm1 \n" - "packuswb %%xmm6,%%xmm6 \n" - "punpcklbw %%xmm6,%%xmm1 \n" - "movdqa %%xmm0,%%xmm6 \n" - "punpcklwd %%xmm1,%%xmm0 \n" - "punpckhwd %%xmm1,%%xmm6 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm6,0x10(%1) \n" - "lea 0x20(%0),%0 \n" - "lea 0x20(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(matrix_argb) // %3 - : "memory", "cc" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); + // 8 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm7 \n" + "pmaddubsw %%xmm2,%%xmm0 \n" + "pmaddubsw %%xmm2,%%xmm7 \n" + "movdqu (%0),%%xmm6 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "phaddsw %%xmm7,%%xmm0 \n" + "phaddsw %%xmm1,%%xmm6 \n" + "psraw $0x6,%%xmm0 \n" + "psraw $0x6,%%xmm6 \n" + "packuswb %%xmm0,%%xmm0 \n" + "packuswb %%xmm6,%%xmm6 \n" + "punpcklbw %%xmm6,%%xmm0 \n" + "movdqu (%0),%%xmm1 \n" + "movdqu 0x10(%0),%%xmm7 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm7 \n" + "phaddsw %%xmm7,%%xmm1 \n" + "movdqu (%0),%%xmm6 \n" + "movdqu 0x10(%0),%%xmm7 \n" + "pmaddubsw %%xmm5,%%xmm6 \n" + "pmaddubsw %%xmm5,%%xmm7 \n" + "phaddsw %%xmm7,%%xmm6 \n" + "psraw $0x6,%%xmm1 \n" + "psraw $0x6,%%xmm6 \n" + "packuswb %%xmm1,%%xmm1 \n" + "packuswb %%xmm6,%%xmm6 \n" + "punpcklbw %%xmm6,%%xmm1 \n" + "movdqa %%xmm0,%%xmm6 \n" + "punpcklwd %%xmm1,%%xmm0 \n" + "punpckhwd %%xmm1,%%xmm6 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm6,0x10(%1) \n" + "lea 0x20(%0),%0 \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(matrix_argb) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #endif // HAS_ARGBCOLORMATRIXROW_SSSE3 @@ -4816,49 +4753,48 @@ void ARGBQuantizeRow_SSE2(uint8_t* dst_argb, int interval_size, int interval_offset, int width) { - asm volatile ( - "movd %2,%%xmm2 \n" - "movd %3,%%xmm3 \n" - "movd %4,%%xmm4 \n" - "pshuflw $0x40,%%xmm2,%%xmm2 \n" - "pshufd $0x44,%%xmm2,%%xmm2 \n" - "pshuflw $0x40,%%xmm3,%%xmm3 \n" - "pshufd $0x44,%%xmm3,%%xmm3 \n" - "pshuflw $0x40,%%xmm4,%%xmm4 \n" - "pshufd $0x44,%%xmm4,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" - "pcmpeqb %%xmm6,%%xmm6 \n" - "pslld $0x18,%%xmm6 \n" + asm volatile( + "movd %2,%%xmm2 \n" + "movd %3,%%xmm3 \n" + "movd %4,%%xmm4 \n" + "pshuflw $0x40,%%xmm2,%%xmm2 \n" + "pshufd $0x44,%%xmm2,%%xmm2 \n" + "pshuflw $0x40,%%xmm3,%%xmm3 \n" + "pshufd $0x44,%%xmm3,%%xmm3 \n" + "pshuflw $0x40,%%xmm4,%%xmm4 \n" + "pshufd $0x44,%%xmm4,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "pslld $0x18,%%xmm6 \n" - // 4 pixel loop. 
- LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "movdqu (%0),%%xmm1 \n" - "punpckhbw %%xmm5,%%xmm1 \n" - "pmulhuw %%xmm2,%%xmm1 \n" - "pmullw %%xmm3,%%xmm0 \n" - "movdqu (%0),%%xmm7 \n" - "pmullw %%xmm3,%%xmm1 \n" - "pand %%xmm6,%%xmm7 \n" - "paddw %%xmm4,%%xmm0 \n" - "paddw %%xmm4,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "por %%xmm7,%%xmm0 \n" - "movdqu %%xmm0,(%0) \n" - "lea 0x10(%0),%0 \n" - "sub $0x4,%1 \n" - "jg 1b \n" - : "+r"(dst_argb), // %0 - "+r"(width) // %1 - : "r"(scale), // %2 - "r"(interval_size), // %3 - "r"(interval_offset) // %4 - : "memory", "cc" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); + // 4 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "movdqu (%0),%%xmm1 \n" + "punpckhbw %%xmm5,%%xmm1 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "pmullw %%xmm3,%%xmm0 \n" + "movdqu (%0),%%xmm7 \n" + "pmullw %%xmm3,%%xmm1 \n" + "pand %%xmm6,%%xmm7 \n" + "paddw %%xmm4,%%xmm0 \n" + "paddw %%xmm4,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "por %%xmm7,%%xmm0 \n" + "movdqu %%xmm0,(%0) \n" + "lea 0x10(%0),%0 \n" + "sub $0x4,%1 \n" + "jg 1b \n" + : "+r"(dst_argb), // %0 + "+r"(width) // %1 + : "r"(scale), // %2 + "r"(interval_size), // %3 + "r"(interval_offset) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #endif // HAS_ARGBQUANTIZEROW_SSE2 @@ -4868,35 +4804,33 @@ void ARGBShadeRow_SSE2(const uint8_t* src_argb, uint8_t* dst_argb, int width, uint32_t value) { - asm volatile ( - "movd %3,%%xmm2 \n" - "punpcklbw %%xmm2,%%xmm2 \n" - "punpcklqdq %%xmm2,%%xmm2 \n" + asm volatile( + "movd %3,%%xmm2 \n" + "punpcklbw %%xmm2,%%xmm2 \n" + "punpcklqdq %%xmm2,%%xmm2 \n" - // 4 pixel loop. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm1 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "pmulhuw %%xmm2,%%xmm1 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(value) // %3 - : "memory", "cc" - , "xmm0", "xmm1", "xmm2" - ); + // 4 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(value) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_ARGBSHADEROW_SSE2 @@ -4906,37 +4840,36 @@ void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm volatile ( - "pxor %%xmm5,%%xmm5 \n" + asm volatile( - // 4 pixel loop. 
- LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movdqu (%1),%%xmm2 \n" - "lea 0x10(%1),%1 \n" - "movdqu %%xmm0,%%xmm1 \n" - "movdqu %%xmm2,%%xmm3 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "punpckhbw %%xmm5,%%xmm3 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "pmulhuw %%xmm3,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%3 \n" - "jg 1b \n" - : "+r"(src_argb0), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" - ); + "pxor %%xmm5,%%xmm5 \n" + + // 4 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movdqu (%1),%%xmm2 \n" + "lea 0x10(%1),%1 \n" + "movdqu %%xmm0,%%xmm1 \n" + "movdqu %%xmm2,%%xmm3 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "punpckhbw %%xmm5,%%xmm3 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "pmulhuw %%xmm3,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jg 1b \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #endif // HAS_ARGBMULTIPLYROW_SSE2 @@ -4946,38 +4879,40 @@ void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm volatile ( - "vpxor %%ymm5,%%ymm5,%%ymm5 \n" + asm volatile( - // 4 pixel loop. - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm1 \n" - "lea 0x20(%0),%0 \n" - "vmovdqu (%1),%%ymm3 \n" - "lea 0x20(%1),%1 \n" - "vpunpcklbw %%ymm1,%%ymm1,%%ymm0 \n" - "vpunpckhbw %%ymm1,%%ymm1,%%ymm1 \n" - "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n" - "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n" - "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" - "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%2) \n" - "lea 0x20(%2),%2 \n" - "sub $0x8,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb0), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc" + "vpxor %%ymm5,%%ymm5,%%ymm5 \n" + + // 4 pixel loop. + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm1 \n" + "lea 0x20(%0),%0 \n" + "vmovdqu (%1),%%ymm3 \n" + "lea 0x20(%1),%1 \n" + "vpunpcklbw %%ymm1,%%ymm1,%%ymm0 \n" + "vpunpckhbw %%ymm1,%%ymm1,%%ymm1 \n" + "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n" + "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n" + "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%2) \n" + "lea 0x20(%2),%2 \n" + "sub $0x8,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc" #if defined(__AVX2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + , + "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" #endif - ); + ); } #endif // HAS_ARGBMULTIPLYROW_AVX2 @@ -4987,27 +4922,25 @@ void ARGBAddRow_SSE2(const uint8_t* src_argb0, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm volatile ( - // 4 pixel loop. 
- LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movdqu (%1),%%xmm1 \n" - "lea 0x10(%1),%1 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%3 \n" - "jg 1b \n" - : "+r"(src_argb0), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc" - , "xmm0", "xmm1" - ); + asm volatile( + // 4 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movdqu (%1),%%xmm1 \n" + "lea 0x10(%1),%1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jg 1b \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1"); } #endif // HAS_ARGBADDROW_SSE2 @@ -5017,27 +4950,25 @@ void ARGBAddRow_AVX2(const uint8_t* src_argb0, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm volatile ( - // 4 pixel loop. - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "lea 0x20(%0),%0 \n" - "vpaddusb (%1),%%ymm0,%%ymm0 \n" - "lea 0x20(%1),%1 \n" - "vmovdqu %%ymm0,(%2) \n" - "lea 0x20(%2),%2 \n" - "sub $0x8,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb0), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc" - , "xmm0" - ); + asm volatile( + // 4 pixel loop. + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "lea 0x20(%0),%0 \n" + "vpaddusb (%1),%%ymm0,%%ymm0 \n" + "lea 0x20(%1),%1 \n" + "vmovdqu %%ymm0,(%2) \n" + "lea 0x20(%2),%2 \n" + "sub $0x8,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0"); } #endif // HAS_ARGBADDROW_AVX2 @@ -5047,27 +4978,25 @@ void ARGBSubtractRow_SSE2(const uint8_t* src_argb0, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm volatile ( - // 4 pixel loop. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movdqu (%1),%%xmm1 \n" - "lea 0x10(%1),%1 \n" - "psubusb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%3 \n" - "jg 1b \n" - : "+r"(src_argb0), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc" - , "xmm0", "xmm1" - ); + asm volatile( + // 4 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movdqu (%1),%%xmm1 \n" + "lea 0x10(%1),%1 \n" + "psubusb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jg 1b \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1"); } #endif // HAS_ARGBSUBTRACTROW_SSE2 @@ -5077,27 +5006,25 @@ void ARGBSubtractRow_AVX2(const uint8_t* src_argb0, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm volatile ( - // 4 pixel loop. - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "lea 0x20(%0),%0 \n" - "vpsubusb (%1),%%ymm0,%%ymm0 \n" - "lea 0x20(%1),%1 \n" - "vmovdqu %%ymm0,(%2) \n" - "lea 0x20(%2),%2 \n" - "sub $0x8,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb0), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc" - , "xmm0" - ); + asm volatile( + // 4 pixel loop. 
+ LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "lea 0x20(%0),%0 \n" + "vpsubusb (%1),%%ymm0,%%ymm0 \n" + "lea 0x20(%1),%1 \n" + "vmovdqu %%ymm0,(%2) \n" + "lea 0x20(%2),%2 \n" + "sub $0x8,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0"); } #endif // HAS_ARGBSUBTRACTROW_AVX2 @@ -5111,50 +5038,48 @@ void SobelXRow_SSE2(const uint8_t* src_y0, const uint8_t* src_y2, uint8_t* dst_sobelx, int width) { - asm volatile ( - "sub %0,%1 \n" - "sub %0,%2 \n" - "sub %0,%3 \n" - "pxor %%xmm5,%%xmm5 \n" + asm volatile( + "sub %0,%1 \n" + "sub %0,%2 \n" + "sub %0,%3 \n" + "pxor %%xmm5,%%xmm5 \n" - // 8 pixel loop. - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" - "movq 0x2(%0),%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm1 \n" - "psubw %%xmm1,%%xmm0 \n" - "movq 0x00(%0,%1,1),%%xmm1 \n" - "movq 0x02(%0,%1,1),%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "psubw %%xmm2,%%xmm1 \n" - "movq 0x00(%0,%2,1),%%xmm2 \n" - "movq 0x02(%0,%2,1),%%xmm3 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm3 \n" - "psubw %%xmm3,%%xmm2 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm1,%%xmm0 \n" - "paddw %%xmm1,%%xmm0 \n" - "pxor %%xmm1,%%xmm1 \n" - "psubw %%xmm0,%%xmm1 \n" - "pmaxsw %%xmm1,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,0x00(%0,%3,1) \n" - "lea 0x8(%0),%0 \n" - "sub $0x8,%4 \n" - "jg 1b \n" - : "+r"(src_y0), // %0 - "+r"(src_y1), // %1 - "+r"(src_y2), // %2 - "+r"(dst_sobelx), // %3 - "+r"(width) // %4 - : - : "memory", "cc", - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" - ); + // 8 pixel loop. + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" + "movq 0x2(%0),%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "psubw %%xmm1,%%xmm0 \n" + "movq 0x00(%0,%1,1),%%xmm1 \n" + "movq 0x02(%0,%1,1),%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "psubw %%xmm2,%%xmm1 \n" + "movq 0x00(%0,%2,1),%%xmm2 \n" + "movq 0x02(%0,%2,1),%%xmm3 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm3 \n" + "psubw %%xmm3,%%xmm2 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm1,%%xmm0 \n" + "paddw %%xmm1,%%xmm0 \n" + "pxor %%xmm1,%%xmm1 \n" + "psubw %%xmm0,%%xmm1 \n" + "pmaxsw %%xmm1,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,0x00(%0,%3,1) \n" + "lea 0x8(%0),%0 \n" + "sub $0x8,%4 \n" + "jg 1b \n" + : "+r"(src_y0), // %0 + "+r"(src_y1), // %1 + "+r"(src_y2), // %2 + "+r"(dst_sobelx), // %3 + "+r"(width) // %4 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #endif // HAS_SOBELXROW_SSE2 @@ -5167,48 +5092,46 @@ void SobelYRow_SSE2(const uint8_t* src_y0, const uint8_t* src_y1, uint8_t* dst_sobely, int width) { - asm volatile ( - "sub %0,%1 \n" - "sub %0,%2 \n" - "pxor %%xmm5,%%xmm5 \n" + asm volatile( + "sub %0,%1 \n" + "sub %0,%2 \n" + "pxor %%xmm5,%%xmm5 \n" - // 8 pixel loop. 
- LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" - "movq 0x00(%0,%1,1),%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm1 \n" - "psubw %%xmm1,%%xmm0 \n" - "movq 0x1(%0),%%xmm1 \n" - "movq 0x01(%0,%1,1),%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "psubw %%xmm2,%%xmm1 \n" - "movq 0x2(%0),%%xmm2 \n" - "movq 0x02(%0,%1,1),%%xmm3 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm3 \n" - "psubw %%xmm3,%%xmm2 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm1,%%xmm0 \n" - "paddw %%xmm1,%%xmm0 \n" - "pxor %%xmm1,%%xmm1 \n" - "psubw %%xmm0,%%xmm1 \n" - "pmaxsw %%xmm1,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,0x00(%0,%2,1) \n" - "lea 0x8(%0),%0 \n" - "sub $0x8,%3 \n" - "jg 1b \n" - : "+r"(src_y0), // %0 - "+r"(src_y1), // %1 - "+r"(dst_sobely), // %2 - "+r"(width) // %3 - : - : "memory", "cc", - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" - ); + // 8 pixel loop. + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" + "movq 0x00(%0,%1,1),%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "psubw %%xmm1,%%xmm0 \n" + "movq 0x1(%0),%%xmm1 \n" + "movq 0x01(%0,%1,1),%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "psubw %%xmm2,%%xmm1 \n" + "movq 0x2(%0),%%xmm2 \n" + "movq 0x02(%0,%1,1),%%xmm3 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm3 \n" + "psubw %%xmm3,%%xmm2 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm1,%%xmm0 \n" + "paddw %%xmm1,%%xmm0 \n" + "pxor %%xmm1,%%xmm1 \n" + "psubw %%xmm0,%%xmm1 \n" + "pmaxsw %%xmm1,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,0x00(%0,%2,1) \n" + "lea 0x8(%0),%0 \n" + "sub $0x8,%3 \n" + "jg 1b \n" + : "+r"(src_y0), // %0 + "+r"(src_y1), // %1 + "+r"(dst_sobely), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #endif // HAS_SOBELYROW_SSE2 @@ -5222,46 +5145,44 @@ void SobelRow_SSE2(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_argb, int width) { - asm volatile ( - "sub %0,%1 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pslld $0x18,%%xmm5 \n" + asm volatile( + "sub %0,%1 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0x18,%%xmm5 \n" - // 8 pixel loop. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%1,1),%%xmm1 \n" - "lea 0x10(%0),%0 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "punpcklbw %%xmm0,%%xmm2 \n" - "punpckhbw %%xmm0,%%xmm0 \n" - "movdqa %%xmm2,%%xmm1 \n" - "punpcklwd %%xmm2,%%xmm1 \n" - "punpckhwd %%xmm2,%%xmm2 \n" - "por %%xmm5,%%xmm1 \n" - "por %%xmm5,%%xmm2 \n" - "movdqa %%xmm0,%%xmm3 \n" - "punpcklwd %%xmm0,%%xmm3 \n" - "punpckhwd %%xmm0,%%xmm0 \n" - "por %%xmm5,%%xmm3 \n" - "por %%xmm5,%%xmm0 \n" - "movdqu %%xmm1,(%2) \n" - "movdqu %%xmm2,0x10(%2) \n" - "movdqu %%xmm3,0x20(%2) \n" - "movdqu %%xmm0,0x30(%2) \n" - "lea 0x40(%2),%2 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_sobelx), // %0 - "+r"(src_sobely), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc", - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" - ); + // 8 pixel loop. 
+ LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%1,1),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "punpcklbw %%xmm0,%%xmm2 \n" + "punpckhbw %%xmm0,%%xmm0 \n" + "movdqa %%xmm2,%%xmm1 \n" + "punpcklwd %%xmm2,%%xmm1 \n" + "punpckhwd %%xmm2,%%xmm2 \n" + "por %%xmm5,%%xmm1 \n" + "por %%xmm5,%%xmm2 \n" + "movdqa %%xmm0,%%xmm3 \n" + "punpcklwd %%xmm0,%%xmm3 \n" + "punpckhwd %%xmm0,%%xmm0 \n" + "por %%xmm5,%%xmm3 \n" + "por %%xmm5,%%xmm0 \n" + "movdqu %%xmm1,(%2) \n" + "movdqu %%xmm2,0x10(%2) \n" + "movdqu %%xmm3,0x20(%2) \n" + "movdqu %%xmm0,0x30(%2) \n" + "lea 0x40(%2),%2 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #endif // HAS_SOBELROW_SSE2 @@ -5271,30 +5192,28 @@ void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_y, int width) { - asm volatile ( - "sub %0,%1 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pslld $0x18,%%xmm5 \n" + asm volatile( + "sub %0,%1 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0x18,%%xmm5 \n" - // 8 pixel loop. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%1,1),%%xmm1 \n" - "lea 0x10(%0),%0 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_sobelx), // %0 - "+r"(src_sobely), // %1 - "+r"(dst_y), // %2 - "+r"(width) // %3 - : - : "memory", "cc", - "xmm0", "xmm1" - ); + // 8 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%1,1),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_y), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1"); } #endif // HAS_SOBELTOPLANEROW_SSE2 @@ -5308,45 +5227,44 @@ void SobelXYRow_SSE2(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_argb, int width) { - asm volatile ( - "sub %0,%1 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" + asm volatile( + "sub %0,%1 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" - // 8 pixel loop. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%1,1),%%xmm1 \n" - "lea 0x10(%0),%0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "paddusb %%xmm1,%%xmm2 \n" - "movdqa %%xmm0,%%xmm3 \n" - "punpcklbw %%xmm5,%%xmm3 \n" - "punpckhbw %%xmm5,%%xmm0 \n" - "movdqa %%xmm1,%%xmm4 \n" - "punpcklbw %%xmm2,%%xmm4 \n" - "punpckhbw %%xmm2,%%xmm1 \n" - "movdqa %%xmm4,%%xmm6 \n" - "punpcklwd %%xmm3,%%xmm6 \n" - "punpckhwd %%xmm3,%%xmm4 \n" - "movdqa %%xmm1,%%xmm7 \n" - "punpcklwd %%xmm0,%%xmm7 \n" - "punpckhwd %%xmm0,%%xmm1 \n" - "movdqu %%xmm6,(%2) \n" - "movdqu %%xmm4,0x10(%2) \n" - "movdqu %%xmm7,0x20(%2) \n" - "movdqu %%xmm1,0x30(%2) \n" - "lea 0x40(%2),%2 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_sobelx), // %0 - "+r"(src_sobely), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc", - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); + // 8 pixel loop. 
+ LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%1,1),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "paddusb %%xmm1,%%xmm2 \n" + "movdqa %%xmm0,%%xmm3 \n" + "punpcklbw %%xmm5,%%xmm3 \n" + "punpckhbw %%xmm5,%%xmm0 \n" + "movdqa %%xmm1,%%xmm4 \n" + "punpcklbw %%xmm2,%%xmm4 \n" + "punpckhbw %%xmm2,%%xmm1 \n" + "movdqa %%xmm4,%%xmm6 \n" + "punpcklwd %%xmm3,%%xmm6 \n" + "punpckhwd %%xmm3,%%xmm4 \n" + "movdqa %%xmm1,%%xmm7 \n" + "punpcklwd %%xmm0,%%xmm7 \n" + "punpckhwd %%xmm0,%%xmm1 \n" + "movdqu %%xmm6,(%2) \n" + "movdqu %%xmm4,0x10(%2) \n" + "movdqu %%xmm7,0x20(%2) \n" + "movdqu %%xmm1,0x30(%2) \n" + "lea 0x40(%2),%2 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #endif // HAS_SOBELXYROW_SSE2 @@ -5357,78 +5275,76 @@ void ComputeCumulativeSumRow_SSE2(const uint8_t* row, int32_t* cumsum, const int32_t* previous_cumsum, int width) { - asm volatile ( - "pxor %%xmm0,%%xmm0 \n" - "pxor %%xmm1,%%xmm1 \n" - "sub $0x4,%3 \n" - "jl 49f \n" - "test $0xf,%1 \n" - "jne 49f \n" + asm volatile( + "pxor %%xmm0,%%xmm0 \n" + "pxor %%xmm1,%%xmm1 \n" + "sub $0x4,%3 \n" + "jl 49f \n" + "test $0xf,%1 \n" + "jne 49f \n" - // 4 pixel loop. - LABELALIGN - "40: \n" - "movdqu (%0),%%xmm2 \n" - "lea 0x10(%0),%0 \n" - "movdqa %%xmm2,%%xmm4 \n" - "punpcklbw %%xmm1,%%xmm2 \n" - "movdqa %%xmm2,%%xmm3 \n" - "punpcklwd %%xmm1,%%xmm2 \n" - "punpckhwd %%xmm1,%%xmm3 \n" - "punpckhbw %%xmm1,%%xmm4 \n" - "movdqa %%xmm4,%%xmm5 \n" - "punpcklwd %%xmm1,%%xmm4 \n" - "punpckhwd %%xmm1,%%xmm5 \n" - "paddd %%xmm2,%%xmm0 \n" - "movdqu (%2),%%xmm2 \n" - "paddd %%xmm0,%%xmm2 \n" - "paddd %%xmm3,%%xmm0 \n" - "movdqu 0x10(%2),%%xmm3 \n" - "paddd %%xmm0,%%xmm3 \n" - "paddd %%xmm4,%%xmm0 \n" - "movdqu 0x20(%2),%%xmm4 \n" - "paddd %%xmm0,%%xmm4 \n" - "paddd %%xmm5,%%xmm0 \n" - "movdqu 0x30(%2),%%xmm5 \n" - "lea 0x40(%2),%2 \n" - "paddd %%xmm0,%%xmm5 \n" - "movdqu %%xmm2,(%1) \n" - "movdqu %%xmm3,0x10(%1) \n" - "movdqu %%xmm4,0x20(%1) \n" - "movdqu %%xmm5,0x30(%1) \n" - "lea 0x40(%1),%1 \n" - "sub $0x4,%3 \n" - "jge 40b \n" + // 4 pixel loop. + LABELALIGN + "40: \n" + "movdqu (%0),%%xmm2 \n" + "lea 0x10(%0),%0 \n" + "movdqa %%xmm2,%%xmm4 \n" + "punpcklbw %%xmm1,%%xmm2 \n" + "movdqa %%xmm2,%%xmm3 \n" + "punpcklwd %%xmm1,%%xmm2 \n" + "punpckhwd %%xmm1,%%xmm3 \n" + "punpckhbw %%xmm1,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "punpcklwd %%xmm1,%%xmm4 \n" + "punpckhwd %%xmm1,%%xmm5 \n" + "paddd %%xmm2,%%xmm0 \n" + "movdqu (%2),%%xmm2 \n" + "paddd %%xmm0,%%xmm2 \n" + "paddd %%xmm3,%%xmm0 \n" + "movdqu 0x10(%2),%%xmm3 \n" + "paddd %%xmm0,%%xmm3 \n" + "paddd %%xmm4,%%xmm0 \n" + "movdqu 0x20(%2),%%xmm4 \n" + "paddd %%xmm0,%%xmm4 \n" + "paddd %%xmm5,%%xmm0 \n" + "movdqu 0x30(%2),%%xmm5 \n" + "lea 0x40(%2),%2 \n" + "paddd %%xmm0,%%xmm5 \n" + "movdqu %%xmm2,(%1) \n" + "movdqu %%xmm3,0x10(%1) \n" + "movdqu %%xmm4,0x20(%1) \n" + "movdqu %%xmm5,0x30(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x4,%3 \n" + "jge 40b \n" - "49: \n" - "add $0x3,%3 \n" - "jl 19f \n" + "49: \n" + "add $0x3,%3 \n" + "jl 19f \n" - // 1 pixel loop. - LABELALIGN - "10: \n" - "movd (%0),%%xmm2 \n" - "lea 0x4(%0),%0 \n" - "punpcklbw %%xmm1,%%xmm2 \n" - "punpcklwd %%xmm1,%%xmm2 \n" - "paddd %%xmm2,%%xmm0 \n" - "movdqu (%2),%%xmm2 \n" - "lea 0x10(%2),%2 \n" - "paddd %%xmm0,%%xmm2 \n" - "movdqu %%xmm2,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x1,%3 \n" - "jge 10b \n" + // 1 pixel loop. 
+ LABELALIGN + "10: \n" + "movd (%0),%%xmm2 \n" + "lea 0x4(%0),%0 \n" + "punpcklbw %%xmm1,%%xmm2 \n" + "punpcklwd %%xmm1,%%xmm2 \n" + "paddd %%xmm2,%%xmm0 \n" + "movdqu (%2),%%xmm2 \n" + "lea 0x10(%2),%2 \n" + "paddd %%xmm0,%%xmm2 \n" + "movdqu %%xmm2,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x1,%3 \n" + "jge 10b \n" - "19: \n" - : "+r"(row), // %0 - "+r"(cumsum), // %1 - "+r"(previous_cumsum), // %2 - "+r"(width) // %3 - : - : "memory", "cc" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); + "19: \n" + : "+r"(row), // %0 + "+r"(cumsum), // %1 + "+r"(previous_cumsum), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif // HAS_COMPUTECUMULATIVESUMROW_SSE2 @@ -5439,130 +5355,128 @@ void CumulativeSumToAverageRow_SSE2(const int32_t* topleft, int area, uint8_t* dst, int count) { - asm volatile ( - "movd %5,%%xmm5 \n" - "cvtdq2ps %%xmm5,%%xmm5 \n" - "rcpss %%xmm5,%%xmm4 \n" - "pshufd $0x0,%%xmm4,%%xmm4 \n" - "sub $0x4,%3 \n" - "jl 49f \n" - "cmpl $0x80,%5 \n" - "ja 40f \n" + asm volatile( + "movd %5,%%xmm5 \n" + "cvtdq2ps %%xmm5,%%xmm5 \n" + "rcpss %%xmm5,%%xmm4 \n" + "pshufd $0x0,%%xmm4,%%xmm4 \n" + "sub $0x4,%3 \n" + "jl 49f \n" + "cmpl $0x80,%5 \n" + "ja 40f \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "pcmpeqb %%xmm6,%%xmm6 \n" - "psrld $0x10,%%xmm6 \n" - "cvtdq2ps %%xmm6,%%xmm6 \n" - "addps %%xmm6,%%xmm5 \n" - "mulps %%xmm4,%%xmm5 \n" - "cvtps2dq %%xmm5,%%xmm5 \n" - "packssdw %%xmm5,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrld $0x10,%%xmm6 \n" + "cvtdq2ps %%xmm6,%%xmm6 \n" + "addps %%xmm6,%%xmm5 \n" + "mulps %%xmm4,%%xmm5 \n" + "cvtps2dq %%xmm5,%%xmm5 \n" + "packssdw %%xmm5,%%xmm5 \n" - // 4 pixel small loop. - LABELALIGN - "4: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm3 \n" - "psubd 0x00(%0,%4,4),%%xmm0 \n" - "psubd 0x10(%0,%4,4),%%xmm1 \n" - "psubd 0x20(%0,%4,4),%%xmm2 \n" - "psubd 0x30(%0,%4,4),%%xmm3 \n" - "lea 0x40(%0),%0 \n" - "psubd (%1),%%xmm0 \n" - "psubd 0x10(%1),%%xmm1 \n" - "psubd 0x20(%1),%%xmm2 \n" - "psubd 0x30(%1),%%xmm3 \n" - "paddd 0x00(%1,%4,4),%%xmm0 \n" - "paddd 0x10(%1,%4,4),%%xmm1 \n" - "paddd 0x20(%1,%4,4),%%xmm2 \n" - "paddd 0x30(%1,%4,4),%%xmm3 \n" - "lea 0x40(%1),%1 \n" - "packssdw %%xmm1,%%xmm0 \n" - "packssdw %%xmm3,%%xmm2 \n" - "pmulhuw %%xmm5,%%xmm0 \n" - "pmulhuw %%xmm5,%%xmm2 \n" - "packuswb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%3 \n" - "jge 4b \n" - "jmp 49f \n" + // 4 pixel small loop. 
+ LABELALIGN + "4: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "psubd 0x00(%0,%4,4),%%xmm0 \n" + "psubd 0x10(%0,%4,4),%%xmm1 \n" + "psubd 0x20(%0,%4,4),%%xmm2 \n" + "psubd 0x30(%0,%4,4),%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "psubd (%1),%%xmm0 \n" + "psubd 0x10(%1),%%xmm1 \n" + "psubd 0x20(%1),%%xmm2 \n" + "psubd 0x30(%1),%%xmm3 \n" + "paddd 0x00(%1,%4,4),%%xmm0 \n" + "paddd 0x10(%1,%4,4),%%xmm1 \n" + "paddd 0x20(%1,%4,4),%%xmm2 \n" + "paddd 0x30(%1,%4,4),%%xmm3 \n" + "lea 0x40(%1),%1 \n" + "packssdw %%xmm1,%%xmm0 \n" + "packssdw %%xmm3,%%xmm2 \n" + "pmulhuw %%xmm5,%%xmm0 \n" + "pmulhuw %%xmm5,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jge 4b \n" + "jmp 49f \n" - // 4 pixel loop \n" - LABELALIGN - "40: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm3 \n" - "psubd 0x00(%0,%4,4),%%xmm0 \n" - "psubd 0x10(%0,%4,4),%%xmm1 \n" - "psubd 0x20(%0,%4,4),%%xmm2 \n" - "psubd 0x30(%0,%4,4),%%xmm3 \n" - "lea 0x40(%0),%0 \n" - "psubd (%1),%%xmm0 \n" - "psubd 0x10(%1),%%xmm1 \n" - "psubd 0x20(%1),%%xmm2 \n" - "psubd 0x30(%1),%%xmm3 \n" - "paddd 0x00(%1,%4,4),%%xmm0 \n" - "paddd 0x10(%1,%4,4),%%xmm1 \n" - "paddd 0x20(%1,%4,4),%%xmm2 \n" - "paddd 0x30(%1,%4,4),%%xmm3 \n" - "lea 0x40(%1),%1 \n" - "cvtdq2ps %%xmm0,%%xmm0 \n" - "cvtdq2ps %%xmm1,%%xmm1 \n" - "mulps %%xmm4,%%xmm0 \n" - "mulps %%xmm4,%%xmm1 \n" - "cvtdq2ps %%xmm2,%%xmm2 \n" - "cvtdq2ps %%xmm3,%%xmm3 \n" - "mulps %%xmm4,%%xmm2 \n" - "mulps %%xmm4,%%xmm3 \n" - "cvtps2dq %%xmm0,%%xmm0 \n" - "cvtps2dq %%xmm1,%%xmm1 \n" - "cvtps2dq %%xmm2,%%xmm2 \n" - "cvtps2dq %%xmm3,%%xmm3 \n" - "packssdw %%xmm1,%%xmm0 \n" - "packssdw %%xmm3,%%xmm2 \n" - "packuswb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%3 \n" - "jge 40b \n" + // 4 pixel loop + LABELALIGN + "40: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "psubd 0x00(%0,%4,4),%%xmm0 \n" + "psubd 0x10(%0,%4,4),%%xmm1 \n" + "psubd 0x20(%0,%4,4),%%xmm2 \n" + "psubd 0x30(%0,%4,4),%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "psubd (%1),%%xmm0 \n" + "psubd 0x10(%1),%%xmm1 \n" + "psubd 0x20(%1),%%xmm2 \n" + "psubd 0x30(%1),%%xmm3 \n" + "paddd 0x00(%1,%4,4),%%xmm0 \n" + "paddd 0x10(%1,%4,4),%%xmm1 \n" + "paddd 0x20(%1,%4,4),%%xmm2 \n" + "paddd 0x30(%1,%4,4),%%xmm3 \n" + "lea 0x40(%1),%1 \n" + "cvtdq2ps %%xmm0,%%xmm0 \n" + "cvtdq2ps %%xmm1,%%xmm1 \n" + "mulps %%xmm4,%%xmm0 \n" + "mulps %%xmm4,%%xmm1 \n" + "cvtdq2ps %%xmm2,%%xmm2 \n" + "cvtdq2ps %%xmm3,%%xmm3 \n" + "mulps %%xmm4,%%xmm2 \n" + "mulps %%xmm4,%%xmm3 \n" + "cvtps2dq %%xmm0,%%xmm0 \n" + "cvtps2dq %%xmm1,%%xmm1 \n" + "cvtps2dq %%xmm2,%%xmm2 \n" + "cvtps2dq %%xmm3,%%xmm3 \n" + "packssdw %%xmm1,%%xmm0 \n" + "packssdw %%xmm3,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jge 40b \n" - "49: \n" - "add $0x3,%3 \n" - "jl 19f \n" + "49: \n" + "add $0x3,%3 \n" + "jl 19f \n" - // 1 pixel loop \n" - LABELALIGN - "10: \n" - "movdqu (%0),%%xmm0 \n" - "psubd 0x00(%0,%4,4),%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "psubd (%1),%%xmm0 \n" - "paddd 0x00(%1,%4,4),%%xmm0 \n" - "lea 0x10(%1),%1 \n" - "cvtdq2ps %%xmm0,%%xmm0 \n" - "mulps %%xmm4,%%xmm0 \n" - "cvtps2dq %%xmm0,%%xmm0 \n" - "packssdw %%xmm0,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movd %%xmm0,(%2) \n" - "lea 0x4(%2),%2 \n" - "sub $0x1,%3 \n" - "jge 
10b \n" - "19: \n" - : "+r"(topleft), // %0 - "+r"(botleft), // %1 - "+r"(dst), // %2 - "+rm"(count) // %3 - : "r"((intptr_t)(width)), // %4 - "rm"(area) // %5 - : "memory", "cc", - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" - ); + // 1 pixel loop + LABELALIGN + "10: \n" + "movdqu (%0),%%xmm0 \n" + "psubd 0x00(%0,%4,4),%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "psubd (%1),%%xmm0 \n" + "paddd 0x00(%1,%4,4),%%xmm0 \n" + "lea 0x10(%1),%1 \n" + "cvtdq2ps %%xmm0,%%xmm0 \n" + "mulps %%xmm4,%%xmm0 \n" + "cvtps2dq %%xmm0,%%xmm0 \n" + "packssdw %%xmm0,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movd %%xmm0,(%2) \n" + "lea 0x4(%2),%2 \n" + "sub $0x1,%3 \n" + "jge 10b \n" + "19: \n" + : "+r"(topleft), // %0 + "+r"(botleft), // %1 + "+r"(dst), // %2 + "+rm"(count) // %3 + : "r"((intptr_t)(width)), // %4 + "rm"(area) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } #endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 @@ -5576,82 +5490,81 @@ void ARGBAffineRow_SSE2(const uint8_t* src_argb, int width) { intptr_t src_argb_stride_temp = src_argb_stride; intptr_t temp; - asm volatile ( - "movq (%3),%%xmm2 \n" - "movq 0x08(%3),%%xmm7 \n" - "shl $0x10,%1 \n" - "add $0x4,%1 \n" - "movd %1,%%xmm5 \n" - "sub $0x4,%4 \n" - "jl 49f \n" + asm volatile( + "movq (%3),%%xmm2 \n" + "movq 0x08(%3),%%xmm7 \n" + "shl $0x10,%1 \n" + "add $0x4,%1 \n" + "movd %1,%%xmm5 \n" + "sub $0x4,%4 \n" + "jl 49f \n" - "pshufd $0x44,%%xmm7,%%xmm7 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "movdqa %%xmm2,%%xmm0 \n" - "addps %%xmm7,%%xmm0 \n" - "movlhps %%xmm0,%%xmm2 \n" - "movdqa %%xmm7,%%xmm4 \n" - "addps %%xmm4,%%xmm4 \n" - "movdqa %%xmm2,%%xmm3 \n" - "addps %%xmm4,%%xmm3 \n" - "addps %%xmm4,%%xmm4 \n" + "pshufd $0x44,%%xmm7,%%xmm7 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "movdqa %%xmm2,%%xmm0 \n" + "addps %%xmm7,%%xmm0 \n" + "movlhps %%xmm0,%%xmm2 \n" + "movdqa %%xmm7,%%xmm4 \n" + "addps %%xmm4,%%xmm4 \n" + "movdqa %%xmm2,%%xmm3 \n" + "addps %%xmm4,%%xmm3 \n" + "addps %%xmm4,%%xmm4 \n" - // 4 pixel loop \n" - LABELALIGN - "40: \n" - "cvttps2dq %%xmm2,%%xmm0 \n" // x, y float to int first 2 - "cvttps2dq %%xmm3,%%xmm1 \n" // x, y float to int next 2 - "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts - "pmaddwd %%xmm5,%%xmm0 \n" // off = x * 4 + y * stride - "movd %%xmm0,%k1 \n" - "pshufd $0x39,%%xmm0,%%xmm0 \n" - "movd %%xmm0,%k5 \n" - "pshufd $0x39,%%xmm0,%%xmm0 \n" - "movd 0x00(%0,%1,1),%%xmm1 \n" - "movd 0x00(%0,%5,1),%%xmm6 \n" - "punpckldq %%xmm6,%%xmm1 \n" - "addps %%xmm4,%%xmm2 \n" - "movq %%xmm1,(%2) \n" - "movd %%xmm0,%k1 \n" - "pshufd $0x39,%%xmm0,%%xmm0 \n" - "movd %%xmm0,%k5 \n" - "movd 0x00(%0,%1,1),%%xmm0 \n" - "movd 0x00(%0,%5,1),%%xmm6 \n" - "punpckldq %%xmm6,%%xmm0 \n" - "addps %%xmm4,%%xmm3 \n" - "movq %%xmm0,0x08(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%4 \n" - "jge 40b \n" + // 4 pixel loop + LABELALIGN + "40: \n" + "cvttps2dq %%xmm2,%%xmm0 \n" // x,y float->int first 2 + "cvttps2dq %%xmm3,%%xmm1 \n" // x,y float->int next 2 + "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts + "pmaddwd %%xmm5,%%xmm0 \n" // off = x*4 + y*stride + "movd %%xmm0,%k1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + "movd %%xmm0,%k5 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + "movd 0x00(%0,%1,1),%%xmm1 \n" + "movd 0x00(%0,%5,1),%%xmm6 \n" + "punpckldq %%xmm6,%%xmm1 \n" + "addps %%xmm4,%%xmm2 \n" + "movq %%xmm1,(%2) \n" + "movd %%xmm0,%k1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + "movd %%xmm0,%k5 \n" + "movd 0x00(%0,%1,1),%%xmm0 \n" + "movd 0x00(%0,%5,1),%%xmm6 \n" + "punpckldq %%xmm6,%%xmm0 \n" + "addps %%xmm4,%%xmm3 \n" + 
"movq %%xmm0,0x08(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%4 \n" + "jge 40b \n" - "49: \n" - "add $0x3,%4 \n" - "jl 19f \n" + "49: \n" + "add $0x3,%4 \n" + "jl 19f \n" - // 1 pixel loop \n" - LABELALIGN - "10: \n" - "cvttps2dq %%xmm2,%%xmm0 \n" - "packssdw %%xmm0,%%xmm0 \n" - "pmaddwd %%xmm5,%%xmm0 \n" - "addps %%xmm7,%%xmm2 \n" - "movd %%xmm0,%k1 \n" - "movd 0x00(%0,%1,1),%%xmm0 \n" - "movd %%xmm0,(%2) \n" - "lea 0x04(%2),%2 \n" - "sub $0x1,%4 \n" - "jge 10b \n" - "19: \n" - : "+r"(src_argb), // %0 - "+r"(src_argb_stride_temp), // %1 - "+r"(dst_argb), // %2 - "+r"(src_dudv), // %3 - "+rm"(width), // %4 - "=&r"(temp) // %5 - : - : "memory", "cc", - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); + // 1 pixel loop + LABELALIGN + "10: \n" + "cvttps2dq %%xmm2,%%xmm0 \n" + "packssdw %%xmm0,%%xmm0 \n" + "pmaddwd %%xmm5,%%xmm0 \n" + "addps %%xmm7,%%xmm2 \n" + "movd %%xmm0,%k1 \n" + "movd 0x00(%0,%1,1),%%xmm0 \n" + "movd %%xmm0,(%2) \n" + "lea 0x04(%2),%2 \n" + "sub $0x1,%4 \n" + "jge 10b \n" + "19: \n" + : "+r"(src_argb), // %0 + "+r"(src_argb_stride_temp), // %1 + "+r"(dst_argb), // %2 + "+r"(src_dudv), // %3 + "+rm"(width), // %4 + "=&r"(temp) // %5 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #endif // HAS_ARGBAFFINEROW_SSE2 @@ -5662,79 +5575,77 @@ void InterpolateRow_SSSE3(uint8_t* dst_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) { - asm volatile ( - "sub %1,%0 \n" - "cmp $0x0,%3 \n" - "je 100f \n" - "cmp $0x80,%3 \n" - "je 50f \n" + asm volatile( + "sub %1,%0 \n" + "cmp $0x0,%3 \n" + "je 100f \n" + "cmp $0x80,%3 \n" + "je 50f \n" - "movd %3,%%xmm0 \n" - "neg %3 \n" - "add $0x100,%3 \n" - "movd %3,%%xmm5 \n" - "punpcklbw %%xmm0,%%xmm5 \n" - "punpcklwd %%xmm5,%%xmm5 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "mov $0x80808080,%%eax \n" - "movd %%eax,%%xmm4 \n" - "pshufd $0x0,%%xmm4,%%xmm4 \n" + "movd %3,%%xmm0 \n" + "neg %3 \n" + "add $0x100,%3 \n" + "movd %3,%%xmm5 \n" + "punpcklbw %%xmm0,%%xmm5 \n" + "punpcklwd %%xmm5,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "mov $0x80808080,%%eax \n" + "movd %%eax,%%xmm4 \n" + "pshufd $0x0,%%xmm4,%%xmm4 \n" - // General purpose row blend. - LABELALIGN - "1: \n" - "movdqu (%1),%%xmm0 \n" - "movdqu 0x00(%1,%4,1),%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm2,%%xmm0 \n" - "punpckhbw %%xmm2,%%xmm1 \n" - "psubb %%xmm4,%%xmm0 \n" - "psubb %%xmm4,%%xmm1 \n" - "movdqa %%xmm5,%%xmm2 \n" - "movdqa %%xmm5,%%xmm3 \n" - "pmaddubsw %%xmm0,%%xmm2 \n" - "pmaddubsw %%xmm1,%%xmm3 \n" - "paddw %%xmm4,%%xmm2 \n" - "paddw %%xmm4,%%xmm3 \n" - "psrlw $0x8,%%xmm2 \n" - "psrlw $0x8,%%xmm3 \n" - "packuswb %%xmm3,%%xmm2 \n" - "movdqu %%xmm2,0x00(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "jmp 99f \n" + // General purpose row blend. + LABELALIGN + "1: \n" + "movdqu (%1),%%xmm0 \n" + "movdqu 0x00(%1,%4,1),%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm0 \n" + "punpckhbw %%xmm2,%%xmm1 \n" + "psubb %%xmm4,%%xmm0 \n" + "psubb %%xmm4,%%xmm1 \n" + "movdqa %%xmm5,%%xmm2 \n" + "movdqa %%xmm5,%%xmm3 \n" + "pmaddubsw %%xmm0,%%xmm2 \n" + "pmaddubsw %%xmm1,%%xmm3 \n" + "paddw %%xmm4,%%xmm2 \n" + "paddw %%xmm4,%%xmm3 \n" + "psrlw $0x8,%%xmm2 \n" + "psrlw $0x8,%%xmm3 \n" + "packuswb %%xmm3,%%xmm2 \n" + "movdqu %%xmm2,0x00(%1,%0,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "jmp 99f \n" - // Blend 50 / 50. 
- LABELALIGN - "50: \n" - "movdqu (%1),%%xmm0 \n" - "movdqu 0x00(%1,%4,1),%%xmm1 \n" - "pavgb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,0x00(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 50b \n" - "jmp 99f \n" + // Blend 50 / 50. + LABELALIGN + "50: \n" + "movdqu (%1),%%xmm0 \n" + "movdqu 0x00(%1,%4,1),%%xmm1 \n" + "pavgb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,0x00(%1,%0,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 50b \n" + "jmp 99f \n" - // Blend 100 / 0 - Copy row unchanged. - LABELALIGN - "100: \n" - "movdqu (%1),%%xmm0 \n" - "movdqu %%xmm0,0x00(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 100b \n" + // Blend 100 / 0 - Copy row unchanged. + LABELALIGN + "100: \n" + "movdqu (%1),%%xmm0 \n" + "movdqu %%xmm0,0x00(%1,%0,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 100b \n" - "99: \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+rm"(dst_width), // %2 - "+r"(source_y_fraction) // %3 - : "r"((intptr_t)(src_stride)) // %4 - : "memory", "cc", "eax", - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); + "99: \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+rm"(dst_width), // %2 + "+r"(source_y_fraction) // %3 + : "r"((intptr_t)(src_stride)) // %4 + : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif // HAS_INTERPOLATEROW_SSSE3 @@ -5745,74 +5656,72 @@ void InterpolateRow_AVX2(uint8_t* dst_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) { - asm volatile ( - "cmp $0x0,%3 \n" - "je 100f \n" - "sub %1,%0 \n" - "cmp $0x80,%3 \n" - "je 50f \n" + asm volatile( + "cmp $0x0,%3 \n" + "je 100f \n" + "sub %1,%0 \n" + "cmp $0x80,%3 \n" + "je 50f \n" - "vmovd %3,%%xmm0 \n" - "neg %3 \n" - "add $0x100,%3 \n" - "vmovd %3,%%xmm5 \n" - "vpunpcklbw %%xmm0,%%xmm5,%%xmm5 \n" - "vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n" - "vbroadcastss %%xmm5,%%ymm5 \n" - "mov $0x80808080,%%eax \n" - "vmovd %%eax,%%xmm4 \n" - "vbroadcastss %%xmm4,%%ymm4 \n" + "vmovd %3,%%xmm0 \n" + "neg %3 \n" + "add $0x100,%3 \n" + "vmovd %3,%%xmm5 \n" + "vpunpcklbw %%xmm0,%%xmm5,%%xmm5 \n" + "vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n" + "vbroadcastss %%xmm5,%%ymm5 \n" + "mov $0x80808080,%%eax \n" + "vmovd %%eax,%%xmm4 \n" + "vbroadcastss %%xmm4,%%ymm4 \n" - // General purpose row blend. - LABELALIGN - "1: \n" - "vmovdqu (%1),%%ymm0 \n" - "vmovdqu 0x00(%1,%4,1),%%ymm2 \n" - "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n" - "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n" - "vpsubb %%ymm4,%%ymm1,%%ymm1 \n" - "vpsubb %%ymm4,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm1,%%ymm5,%%ymm1 \n" - "vpmaddubsw %%ymm0,%%ymm5,%%ymm0 \n" - "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" - "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm1,%%ymm1 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,0x00(%1,%0,1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "jmp 99f \n" + // General purpose row blend. + LABELALIGN + "1: \n" + "vmovdqu (%1),%%ymm0 \n" + "vmovdqu 0x00(%1,%4,1),%%ymm2 \n" + "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n" + "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n" + "vpsubb %%ymm4,%%ymm1,%%ymm1 \n" + "vpsubb %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm1,%%ymm5,%%ymm1 \n" + "vpmaddubsw %%ymm0,%%ymm5,%%ymm0 \n" + "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" + "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,0x00(%1,%0,1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "jmp 99f \n" - // Blend 50 / 50. 
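// Scalar sketch of the row blend the SSSE3/AVX2 interpolators implement,
// including the two fast paths the asm branches to: fraction 0 is a plain
// copy and fraction 128 is a rounding average (pavgb). Names are
// illustrative; the general-case rounding follows libyuv's C reference form.
#include <stddef.h>
#include <stdint.h>
#include <string.h>
static void InterpolateRowSketch(uint8_t* dst, const uint8_t* src,
                                 ptrdiff_t src_stride, int width, int frac) {
  const uint8_t* src1 = src + src_stride;
  int i;
  if (frac == 0) {  // 100/0: copy row unchanged (the "100:" label).
    memcpy(dst, src, (size_t)width);
  } else if (frac == 128) {  // 50/50: rounding average (the "50:" label).
    for (i = 0; i < width; ++i) {
      dst[i] = (uint8_t)((src[i] + src1[i] + 1) >> 1);
    }
  } else {  // General blend in 8.8 fixed point (the "1:" label).
    for (i = 0; i < width; ++i) {
      dst[i] = (uint8_t)((src[i] * (256 - frac) + src1[i] * frac + 128) >> 8);
    }
  }
}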
- LABELALIGN - "50: \n" - "vmovdqu (%1),%%ymm0 \n" - "vpavgb 0x00(%1,%4,1),%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,0x00(%1,%0,1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 50b \n" - "jmp 99f \n" + // Blend 50 / 50. + LABELALIGN + "50: \n" + "vmovdqu (%1),%%ymm0 \n" + "vpavgb 0x00(%1,%4,1),%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,0x00(%1,%0,1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 50b \n" + "jmp 99f \n" - // Blend 100 / 0 - Copy row unchanged. - LABELALIGN - "100: \n" - "rep movsb \n" - "jmp 999f \n" + // Blend 100 / 0 - Copy row unchanged. + LABELALIGN + "100: \n" + "rep movsb \n" + "jmp 999f \n" - "99: \n" - "vzeroupper \n" - "999: \n" - : "+D"(dst_ptr), // %0 - "+S"(src_ptr), // %1 - "+cm"(dst_width), // %2 - "+r"(source_y_fraction) // %3 - : "r"((intptr_t)(src_stride)) // %4 - : "memory", "cc", "eax", - "xmm0", "xmm1", "xmm2", "xmm4", "xmm5" - ); + "99: \n" + "vzeroupper \n" + "999: \n" + : "+D"(dst_ptr), // %0 + "+S"(src_ptr), // %1 + "+cm"(dst_width), // %2 + "+r"(source_y_fraction) // %3 + : "r"((intptr_t)(src_stride)) // %4 + : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm4", "xmm5"); } #endif // HAS_INTERPOLATEROW_AVX2 @@ -5822,27 +5731,27 @@ void ARGBShuffleRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, const uint8_t* shuffler, int width) { - asm volatile ( - "movdqu (%3),%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pshufb %%xmm5,%%xmm0 \n" - "pshufb %%xmm5,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(shuffler) // %3 - : "memory", "cc" - , "xmm0", "xmm1", "xmm5" - ); + asm volatile( + + "movdqu (%3),%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pshufb %%xmm5,%%xmm0 \n" + "pshufb %%xmm5,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(shuffler) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm5"); } #endif // HAS_ARGBSHUFFLEROW_SSSE3 @@ -5852,28 +5761,28 @@ void ARGBShuffleRow_AVX2(const uint8_t* src_argb, uint8_t* dst_argb, const uint8_t* shuffler, int width) { - asm volatile ( - "vbroadcastf128 (%3),%%ymm5 \n" - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" - "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" - "vmovdqu %%ymm0,(%1) \n" - "vmovdqu %%ymm1,0x20(%1) \n" - "lea 0x40(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(shuffler) // %3 - : "memory", "cc" - , "xmm0", "xmm1", "xmm5" - ); + asm volatile( + + "vbroadcastf128 (%3),%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" + "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(shuffler) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm5"); } #endif // HAS_ARGBSHUFFLEROW_AVX2 @@ -5883,33 +5792,33 @@ void I422ToYUY2Row_SSE2(const uint8_t* src_y, const uint8_t* src_v, uint8_t* dst_frame, int width) { - asm 
volatile ( - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movq (%1),%%xmm2 \n" - "movq 0x00(%1,%2,1),%%xmm3 \n" - "lea 0x8(%1),%1 \n" - "punpcklbw %%xmm3,%%xmm2 \n" - "movdqu (%0),%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm2,%%xmm0 \n" - "punpckhbw %%xmm2,%%xmm1 \n" - "movdqu %%xmm0,(%3) \n" - "movdqu %%xmm1,0x10(%3) \n" - "lea 0x20(%3),%3 \n" - "sub $0x10,%4 \n" - "jg 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_frame), // %3 - "+rm"(width) // %4 - : - : "memory", "cc", - "xmm0", "xmm1", "xmm2", "xmm3" - ); + asm volatile( + + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movq (%1),%%xmm2 \n" + "movq 0x00(%1,%2,1),%%xmm3 \n" + "lea 0x8(%1),%1 \n" + "punpcklbw %%xmm3,%%xmm2 \n" + "movdqu (%0),%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm0 \n" + "punpckhbw %%xmm2,%%xmm1 \n" + "movdqu %%xmm0,(%3) \n" + "movdqu %%xmm1,0x10(%3) \n" + "lea 0x20(%3),%3 \n" + "sub $0x10,%4 \n" + "jg 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_frame), // %3 + "+rm"(width) // %4 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); } #endif // HAS_I422TOYUY2ROW_SSE2 @@ -5919,33 +5828,33 @@ void I422ToUYVYRow_SSE2(const uint8_t* src_y, const uint8_t* src_v, uint8_t* dst_frame, int width) { - asm volatile ( - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movq (%1),%%xmm2 \n" - "movq 0x00(%1,%2,1),%%xmm3 \n" - "lea 0x8(%1),%1 \n" - "punpcklbw %%xmm3,%%xmm2 \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm2,%%xmm1 \n" - "lea 0x10(%0),%0 \n" - "punpcklbw %%xmm0,%%xmm1 \n" - "punpckhbw %%xmm0,%%xmm2 \n" - "movdqu %%xmm1,(%3) \n" - "movdqu %%xmm2,0x10(%3) \n" - "lea 0x20(%3),%3 \n" - "sub $0x10,%4 \n" - "jg 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_frame), // %3 - "+rm"(width) // %4 - : - : "memory", "cc", - "xmm0", "xmm1", "xmm2", "xmm3" - ); + asm volatile( + + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movq (%1),%%xmm2 \n" + "movq 0x00(%1,%2,1),%%xmm3 \n" + "lea 0x8(%1),%1 \n" + "punpcklbw %%xmm3,%%xmm2 \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm2,%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "punpcklbw %%xmm0,%%xmm1 \n" + "punpckhbw %%xmm0,%%xmm2 \n" + "movdqu %%xmm1,(%3) \n" + "movdqu %%xmm2,0x10(%3) \n" + "lea 0x20(%3),%3 \n" + "sub $0x10,%4 \n" + "jg 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_frame), // %3 + "+rm"(width) // %4 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); } #endif // HAS_I422TOUYVYROW_SSE2 @@ -5954,55 +5863,54 @@ void ARGBPolynomialRow_SSE2(const uint8_t* src_argb, uint8_t* dst_argb, const float* poly, int width) { - asm volatile ( - "pxor %%xmm3,%%xmm3 \n" + asm volatile( - // 2 pixel loop. 
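// Scalar sketch of the 4:2:2 packing the two SSE2 routines above perform.
// YUY2 macropixels are ordered Y0 U Y1 V; UYVY swaps to U Y0 V Y1. The
// odd-width tail handling is elided here; names are illustrative.
#include <stdint.h>
static void I422ToYUY2Sketch(const uint8_t* src_y, const uint8_t* src_u,
                             const uint8_t* src_v, uint8_t* dst, int width) {
  int x;
  for (x = 0; x < width - 1; x += 2) {
    dst[0] = src_y[0];
    dst[1] = src_u[0];
    dst[2] = src_y[1];
    dst[3] = src_v[0];
    src_y += 2;
    src_u += 1;
    src_v += 1;
    dst += 4;
  }
}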
- LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" - "lea 0x8(%0),%0 \n" - "punpcklbw %%xmm3,%%xmm0 \n" - "movdqa %%xmm0,%%xmm4 \n" - "punpcklwd %%xmm3,%%xmm0 \n" - "punpckhwd %%xmm3,%%xmm4 \n" - "cvtdq2ps %%xmm0,%%xmm0 \n" - "cvtdq2ps %%xmm4,%%xmm4 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm4,%%xmm5 \n" - "mulps 0x10(%3),%%xmm0 \n" - "mulps 0x10(%3),%%xmm4 \n" - "addps (%3),%%xmm0 \n" - "addps (%3),%%xmm4 \n" - "movdqa %%xmm1,%%xmm2 \n" - "movdqa %%xmm5,%%xmm6 \n" - "mulps %%xmm1,%%xmm2 \n" - "mulps %%xmm5,%%xmm6 \n" - "mulps %%xmm2,%%xmm1 \n" - "mulps %%xmm6,%%xmm5 \n" - "mulps 0x20(%3),%%xmm2 \n" - "mulps 0x20(%3),%%xmm6 \n" - "mulps 0x30(%3),%%xmm1 \n" - "mulps 0x30(%3),%%xmm5 \n" - "addps %%xmm2,%%xmm0 \n" - "addps %%xmm6,%%xmm4 \n" - "addps %%xmm1,%%xmm0 \n" - "addps %%xmm5,%%xmm4 \n" - "cvttps2dq %%xmm0,%%xmm0 \n" - "cvttps2dq %%xmm4,%%xmm4 \n" - "packuswb %%xmm4,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x2,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(poly) // %3 - : "memory", "cc" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" - ); + "pxor %%xmm3,%%xmm3 \n" + + // 2 pixel loop. + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" + "lea 0x8(%0),%0 \n" + "punpcklbw %%xmm3,%%xmm0 \n" + "movdqa %%xmm0,%%xmm4 \n" + "punpcklwd %%xmm3,%%xmm0 \n" + "punpckhwd %%xmm3,%%xmm4 \n" + "cvtdq2ps %%xmm0,%%xmm0 \n" + "cvtdq2ps %%xmm4,%%xmm4 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm4,%%xmm5 \n" + "mulps 0x10(%3),%%xmm0 \n" + "mulps 0x10(%3),%%xmm4 \n" + "addps (%3),%%xmm0 \n" + "addps (%3),%%xmm4 \n" + "movdqa %%xmm1,%%xmm2 \n" + "movdqa %%xmm5,%%xmm6 \n" + "mulps %%xmm1,%%xmm2 \n" + "mulps %%xmm5,%%xmm6 \n" + "mulps %%xmm2,%%xmm1 \n" + "mulps %%xmm6,%%xmm5 \n" + "mulps 0x20(%3),%%xmm2 \n" + "mulps 0x20(%3),%%xmm6 \n" + "mulps 0x30(%3),%%xmm1 \n" + "mulps 0x30(%3),%%xmm5 \n" + "addps %%xmm2,%%xmm0 \n" + "addps %%xmm6,%%xmm4 \n" + "addps %%xmm1,%%xmm0 \n" + "addps %%xmm5,%%xmm4 \n" + "cvttps2dq %%xmm0,%%xmm0 \n" + "cvttps2dq %%xmm4,%%xmm4 \n" + "packuswb %%xmm4,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x2,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(poly) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } #endif // HAS_ARGBPOLYNOMIALROW_SSE2 @@ -6011,184 +5919,185 @@ void ARGBPolynomialRow_AVX2(const uint8_t* src_argb, uint8_t* dst_argb, const float* poly, int width) { - asm volatile ( - "vbroadcastf128 (%3),%%ymm4 \n" - "vbroadcastf128 0x10(%3),%%ymm5 \n" - "vbroadcastf128 0x20(%3),%%ymm6 \n" - "vbroadcastf128 0x30(%3),%%ymm7 \n" + asm volatile( + "vbroadcastf128 (%3),%%ymm4 \n" + "vbroadcastf128 0x10(%3),%%ymm5 \n" + "vbroadcastf128 0x20(%3),%%ymm6 \n" + "vbroadcastf128 0x30(%3),%%ymm7 \n" - // 2 pixel loop. 
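// Scalar form of the cubic both ARGBPolynomialRow variants evaluate per
// channel. The coefficient layout is inferred from the asm: poly[] is 16
// floats, four vectors C0..C3 with one coefficient per ARGB channel, and
// results saturate to [0, 255] as packuswb does. Name is illustrative.
#include <stdint.h>
static void ARGBPolynomialSketch(const uint8_t* src_argb, uint8_t* dst_argb,
                                 const float* poly, int width) {
  int i, c;
  for (i = 0; i < width; ++i) {
    for (c = 0; c < 4; ++c) {
      float x = (float)src_argb[i * 4 + c];
      float v = poly[c] + poly[c + 4] * x + poly[c + 8] * x * x +
                poly[c + 12] * x * x * x;
      if (v < 0.f) v = 0.f;
      if (v > 255.f) v = 255.f;
      dst_argb[i * 4 + c] = (uint8_t)v;
    }
  }
}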
- LABELALIGN - "1: \n" - "vpmovzxbd (%0),%%ymm0 \n" // 2 ARGB pixels - "lea 0x8(%0),%0 \n" - "vcvtdq2ps %%ymm0,%%ymm0 \n" // X 8 floats - "vmulps %%ymm0,%%ymm0,%%ymm2 \n" // X * X - "vmulps %%ymm7,%%ymm0,%%ymm3 \n" // C3 * X - "vfmadd132ps %%ymm5,%%ymm4,%%ymm0 \n" // result = C0 + C1 * X - "vfmadd231ps %%ymm6,%%ymm2,%%ymm0 \n" // result += C2 * X * X - "vfmadd231ps %%ymm3,%%ymm2,%%ymm0 \n" // result += C3 * X * X * X - "vcvttps2dq %%ymm0,%%ymm0 \n" - "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpackuswb %%xmm0,%%xmm0,%%xmm0 \n" - "vmovq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x2,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(poly) // %3 - : "memory", "cc", - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); + // 2 pixel loop. + LABELALIGN + "1: \n" + "vpmovzxbd (%0),%%ymm0 \n" // 2 ARGB pixels + "lea 0x8(%0),%0 \n" + "vcvtdq2ps %%ymm0,%%ymm0 \n" // X 8 floats + "vmulps %%ymm0,%%ymm0,%%ymm2 \n" // X * X + "vmulps %%ymm7,%%ymm0,%%ymm3 \n" // C3 * X + "vfmadd132ps %%ymm5,%%ymm4,%%ymm0 \n" // result = C0 + C1 * X + "vfmadd231ps %%ymm6,%%ymm2,%%ymm0 \n" // result += C2 * X * X + "vfmadd231ps %%ymm3,%%ymm2,%%ymm0 \n" // result += C3 * X * X * + // X + "vcvttps2dq %%ymm0,%%ymm0 \n" + "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpackuswb %%xmm0,%%xmm0,%%xmm0 \n" + "vmovq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x2,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(poly) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #endif // HAS_ARGBPOLYNOMIALROW_AVX2 #ifdef HAS_HALFFLOATROW_SSE2 static float kScaleBias = 1.9259299444e-34f; -void HalfFloatRow_SSE2(const uint16_t* src, uint16_t* dst, float scale, int width) { +void HalfFloatRow_SSE2(const uint16_t* src, + uint16_t* dst, + float scale, + int width) { scale *= kScaleBias; - asm volatile ( - "movd %3,%%xmm4 \n" - "pshufd $0x0,%%xmm4,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" - "sub %0,%1 \n" + asm volatile( + "movd %3,%%xmm4 \n" + "pshufd $0x0,%%xmm4,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" + "sub %0,%1 \n" - // 16 pixel loop. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm2 \n" // 8 shorts - "add $0x10,%0 \n" - "movdqa %%xmm2,%%xmm3 \n" - "punpcklwd %%xmm5,%%xmm2 \n" // 8 ints in xmm2/1 - "cvtdq2ps %%xmm2,%%xmm2 \n" // 8 floats - "punpckhwd %%xmm5,%%xmm3 \n" - "cvtdq2ps %%xmm3,%%xmm3 \n" - "mulps %%xmm4,%%xmm2 \n" - "mulps %%xmm4,%%xmm3 \n" - "psrld $0xd,%%xmm2 \n" - "psrld $0xd,%%xmm3 \n" - "packssdw %%xmm3,%%xmm2 \n" - "movdqu %%xmm2,-0x10(%0,%1,1) \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "m"(scale) // %3 - : "memory", "cc", - "xmm2", "xmm3", "xmm4", "xmm5" - ); + // 16 pixel loop. 
+ LABELALIGN + "1: \n" + "movdqu (%0),%%xmm2 \n" // 8 shorts + "add $0x10,%0 \n" + "movdqa %%xmm2,%%xmm3 \n" + "punpcklwd %%xmm5,%%xmm2 \n" // 8 ints in xmm2/1 + "cvtdq2ps %%xmm2,%%xmm2 \n" // 8 floats + "punpckhwd %%xmm5,%%xmm3 \n" + "cvtdq2ps %%xmm3,%%xmm3 \n" + "mulps %%xmm4,%%xmm2 \n" + "mulps %%xmm4,%%xmm3 \n" + "psrld $0xd,%%xmm2 \n" + "psrld $0xd,%%xmm3 \n" + "packssdw %%xmm3,%%xmm2 \n" + "movdqu %%xmm2,-0x10(%0,%1,1) \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(scale) // %3 + : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif // HAS_HALFFLOATROW_SSE2 #ifdef HAS_HALFFLOATROW_AVX2 -void HalfFloatRow_AVX2(const uint16_t* src, uint16_t* dst, float scale, int width) { +void HalfFloatRow_AVX2(const uint16_t* src, + uint16_t* dst, + float scale, + int width) { scale *= kScaleBias; - asm volatile ( - "vbroadcastss %3, %%ymm4 \n" - "vpxor %%ymm5,%%ymm5,%%ymm5 \n" - "sub %0,%1 \n" + asm volatile( + "vbroadcastss %3, %%ymm4 \n" + "vpxor %%ymm5,%%ymm5,%%ymm5 \n" + "sub %0,%1 \n" - // 16 pixel loop. - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm2 \n" // 16 shorts - "add $0x20,%0 \n" - "vpunpckhwd %%ymm5,%%ymm2,%%ymm3 \n" // mutates - "vpunpcklwd %%ymm5,%%ymm2,%%ymm2 \n" - "vcvtdq2ps %%ymm3,%%ymm3 \n" - "vcvtdq2ps %%ymm2,%%ymm2 \n" - "vmulps %%ymm3,%%ymm4,%%ymm3 \n" - "vmulps %%ymm2,%%ymm4,%%ymm2 \n" - "vpsrld $0xd,%%ymm3,%%ymm3 \n" - "vpsrld $0xd,%%ymm2,%%ymm2 \n" - "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // unmutates - "vmovdqu %%ymm2,-0x20(%0,%1,1) \n" - "sub $0x10,%2 \n" - "jg 1b \n" + // 16 pixel loop. + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm2 \n" // 16 shorts + "add $0x20,%0 \n" + "vpunpckhwd %%ymm5,%%ymm2,%%ymm3 \n" // mutates + "vpunpcklwd %%ymm5,%%ymm2,%%ymm2 \n" + "vcvtdq2ps %%ymm3,%%ymm3 \n" + "vcvtdq2ps %%ymm2,%%ymm2 \n" + "vmulps %%ymm3,%%ymm4,%%ymm3 \n" + "vmulps %%ymm2,%%ymm4,%%ymm2 \n" + "vpsrld $0xd,%%ymm3,%%ymm3 \n" + "vpsrld $0xd,%%ymm2,%%ymm2 \n" + "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // unmutates + "vmovdqu %%ymm2,-0x20(%0,%1,1) \n" + "sub $0x10,%2 \n" + "jg 1b \n" - "vzeroupper \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 #if defined(__x86_64__) - : "x"(scale) // %3 + : "x"(scale) // %3 #else - : "m"(scale) // %3 + : "m"(scale) // %3 #endif - : "memory", "cc", - "xmm2", "xmm3", "xmm4", "xmm5" - ); + : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif // HAS_HALFFLOATROW_AVX2 #ifdef HAS_HALFFLOATROW_F16C -void HalfFloatRow_F16C(const uint16_t* src, uint16_t* dst, float scale, int width) { - asm volatile ( - "vbroadcastss %3, %%ymm4 \n" - "sub %0,%1 \n" +void HalfFloatRow_F16C(const uint16_t* src, + uint16_t* dst, + float scale, + int width) { + asm volatile( + "vbroadcastss %3, %%ymm4 \n" + "sub %0,%1 \n" - // 16 pixel loop. - LABELALIGN - "1: \n" - "vpmovzxwd (%0),%%ymm2 \n" // 16 shorts -> 16 ints - "vpmovzxwd 0x10(%0),%%ymm3 \n" - "vcvtdq2ps %%ymm2,%%ymm2 \n" - "vcvtdq2ps %%ymm3,%%ymm3 \n" - "vmulps %%ymm2,%%ymm4,%%ymm2 \n" - "vmulps %%ymm3,%%ymm4,%%ymm3 \n" - "vcvtps2ph $3, %%ymm2, %%xmm2 \n" - "vcvtps2ph $3, %%ymm3, %%xmm3 \n" - "vmovdqu %%xmm2,0x00(%0,%1,1) \n" - "vmovdqu %%xmm3,0x10(%0,%1,1) \n" - "add $0x20,%0 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 + // 16 pixel loop. 
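// Why multiplying by kScaleBias works: kScaleBias is 2^-112, which rebases
// the binary32 exponent bias (127) to the binary16 bias (15), so after the
// multiply the top of the float bit pattern, shifted right by 13, is the
// half-float bit pattern (the psrld $0xd above). A scalar sketch, ignoring
// the NaN/denormal edge cases the asm also glosses over:
#include <stdint.h>
#include <string.h>
static uint16_t ToHalfSketch(uint16_t v, float scale) {
  float f = (float)v * (scale * 1.9259299444e-34f);  // scale * 2^-112
  uint32_t bits;
  memcpy(&bits, &f, sizeof(bits));  // type-pun without aliasing issues
  return (uint16_t)(bits >> 13);    // drop 13 mantissa bits: 23 -> 10
}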
+ LABELALIGN + "1: \n" + "vpmovzxwd (%0),%%ymm2 \n" // 16 shorts -> 16 ints + "vpmovzxwd 0x10(%0),%%ymm3 \n" + "vcvtdq2ps %%ymm2,%%ymm2 \n" + "vcvtdq2ps %%ymm3,%%ymm3 \n" + "vmulps %%ymm2,%%ymm4,%%ymm2 \n" + "vmulps %%ymm3,%%ymm4,%%ymm3 \n" + "vcvtps2ph $3, %%ymm2, %%xmm2 \n" + "vcvtps2ph $3, %%ymm3, %%xmm3 \n" + "vmovdqu %%xmm2,0x00(%0,%1,1) \n" + "vmovdqu %%xmm3,0x10(%0,%1,1) \n" + "add $0x20,%0 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 #if defined(__x86_64__) - : "x"(scale) // %3 + : "x"(scale) // %3 #else - : "m"(scale) // %3 + : "m"(scale) // %3 #endif - : "memory", "cc", - "xmm2", "xmm3", "xmm4" - ); + : "memory", "cc", "xmm2", "xmm3", "xmm4"); } #endif // HAS_HALFFLOATROW_F16C #ifdef HAS_HALFFLOATROW_F16C void HalfFloat1Row_F16C(const uint16_t* src, uint16_t* dst, float, int width) { - asm volatile ( - "sub %0,%1 \n" - // 16 pixel loop. - LABELALIGN - "1: \n" - "vpmovzxwd (%0),%%ymm2 \n" // 16 shorts -> 16 ints - "vpmovzxwd 0x10(%0),%%ymm3 \n" - "vcvtdq2ps %%ymm2,%%ymm2 \n" - "vcvtdq2ps %%ymm3,%%ymm3 \n" - "vcvtps2ph $3, %%ymm2, %%xmm2 \n" - "vcvtps2ph $3, %%ymm3, %%xmm3 \n" - "vmovdqu %%xmm2,0x00(%0,%1,1) \n" - "vmovdqu %%xmm3,0x10(%0,%1,1) \n" - "add $0x20,%0 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc", - "xmm2", "xmm3" - ); + asm volatile( + "sub %0,%1 \n" + // 16 pixel loop. + LABELALIGN + "1: \n" + "vpmovzxwd (%0),%%ymm2 \n" // 16 shorts -> 16 ints + "vpmovzxwd 0x10(%0),%%ymm3 \n" + "vcvtdq2ps %%ymm2,%%ymm2 \n" + "vcvtdq2ps %%ymm3,%%ymm3 \n" + "vcvtps2ph $3, %%ymm2, %%xmm2 \n" + "vcvtps2ph $3, %%ymm3, %%xmm3 \n" + "vmovdqu %%xmm2,0x00(%0,%1,1) \n" + "vmovdqu %%xmm3,0x10(%0,%1,1) \n" + "add $0x20,%0 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm2", "xmm3"); } #endif // HAS_HALFFLOATROW_F16C @@ -6198,58 +6107,60 @@ void ARGBColorTableRow_X86(uint8_t* dst_argb, const uint8_t* table_argb, int width) { uintptr_t pixel_temp; - asm volatile ( - // 1 pixel loop. - LABELALIGN - "1: \n" - "movzb (%0),%1 \n" - "lea 0x4(%0),%0 \n" - "movzb 0x00(%3,%1,4),%1 \n" - "mov %b1,-0x4(%0) \n" - "movzb -0x3(%0),%1 \n" - "movzb 0x01(%3,%1,4),%1 \n" - "mov %b1,-0x3(%0) \n" - "movzb -0x2(%0),%1 \n" - "movzb 0x02(%3,%1,4),%1 \n" - "mov %b1,-0x2(%0) \n" - "movzb -0x1(%0),%1 \n" - "movzb 0x03(%3,%1,4),%1 \n" - "mov %b1,-0x1(%0) \n" - "dec %2 \n" - "jg 1b \n" - : "+r"(dst_argb), // %0 - "=&d"(pixel_temp), // %1 - "+r"(width) // %2 - : "r"(table_argb) // %3 - : "memory", "cc"); + asm volatile( + // 1 pixel loop. + LABELALIGN + "1: \n" + "movzb (%0),%1 \n" + "lea 0x4(%0),%0 \n" + "movzb 0x00(%3,%1,4),%1 \n" + "mov %b1,-0x4(%0) \n" + "movzb -0x3(%0),%1 \n" + "movzb 0x01(%3,%1,4),%1 \n" + "mov %b1,-0x3(%0) \n" + "movzb -0x2(%0),%1 \n" + "movzb 0x02(%3,%1,4),%1 \n" + "mov %b1,-0x2(%0) \n" + "movzb -0x1(%0),%1 \n" + "movzb 0x03(%3,%1,4),%1 \n" + "mov %b1,-0x1(%0) \n" + "dec %2 \n" + "jg 1b \n" + : "+r"(dst_argb), // %0 + "=&d"(pixel_temp), // %1 + "+r"(width) // %2 + : "r"(table_argb) // %3 + : "memory", "cc"); } #endif // HAS_ARGBCOLORTABLEROW_X86 #ifdef HAS_RGBCOLORTABLEROW_X86 // Tranform RGB pixels with color table. -void RGBColorTableRow_X86(uint8_t* dst_argb, const uint8_t* table_argb, int width) { +void RGBColorTableRow_X86(uint8_t* dst_argb, + const uint8_t* table_argb, + int width) { uintptr_t pixel_temp; - asm volatile ( - // 1 pixel loop. 
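// Scalar form of the per-channel lookup both color-table rows perform,
// mirroring the movzb/mov sequence above: table_argb holds 256 four-byte
// entries, and channel c of each pixel indexes byte c of its entry. The
// RGB variant leaves alpha untouched. Name is illustrative.
#include <stdint.h>
static void ARGBColorTableSketch(uint8_t* dst_argb, const uint8_t* table_argb,
                                 int width) {
  int i;
  for (i = 0; i < width; ++i) {
    dst_argb[0] = table_argb[dst_argb[0] * 4 + 0];  // B
    dst_argb[1] = table_argb[dst_argb[1] * 4 + 1];  // G
    dst_argb[2] = table_argb[dst_argb[2] * 4 + 2];  // R
    dst_argb[3] = table_argb[dst_argb[3] * 4 + 3];  // A (skipped in RGB form)
    dst_argb += 4;
  }
}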
- LABELALIGN - "1: \n" - "movzb (%0),%1 \n" - "lea 0x4(%0),%0 \n" - "movzb 0x00(%3,%1,4),%1 \n" - "mov %b1,-0x4(%0) \n" - "movzb -0x3(%0),%1 \n" - "movzb 0x01(%3,%1,4),%1 \n" - "mov %b1,-0x3(%0) \n" - "movzb -0x2(%0),%1 \n" - "movzb 0x02(%3,%1,4),%1 \n" - "mov %b1,-0x2(%0) \n" - "dec %2 \n" - "jg 1b \n" - : "+r"(dst_argb), // %0 - "=&d"(pixel_temp), // %1 - "+r"(width) // %2 - : "r"(table_argb) // %3 - : "memory", "cc"); + asm volatile( + // 1 pixel loop. + LABELALIGN + "1: \n" + "movzb (%0),%1 \n" + "lea 0x4(%0),%0 \n" + "movzb 0x00(%3,%1,4),%1 \n" + "mov %b1,-0x4(%0) \n" + "movzb -0x3(%0),%1 \n" + "movzb 0x01(%3,%1,4),%1 \n" + "mov %b1,-0x3(%0) \n" + "movzb -0x2(%0),%1 \n" + "movzb 0x02(%3,%1,4),%1 \n" + "mov %b1,-0x2(%0) \n" + "dec %2 \n" + "jg 1b \n" + : "+r"(dst_argb), // %0 + "=&d"(pixel_temp), // %1 + "+r"(width) // %2 + : "r"(table_argb) // %3 + : "memory", "cc"); } #endif // HAS_RGBCOLORTABLEROW_X86 @@ -6262,96 +6173,95 @@ void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb, uint32_t lumacoeff) { uintptr_t pixel_temp; uintptr_t table_temp; - asm volatile ( - "movd %6,%%xmm3 \n" - "pshufd $0x0,%%xmm3,%%xmm3 \n" - "pcmpeqb %%xmm4,%%xmm4 \n" - "psllw $0x8,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" + asm volatile( + "movd %6,%%xmm3 \n" + "pshufd $0x0,%%xmm3,%%xmm3 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psllw $0x8,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" - // 4 pixel loop. - LABELALIGN - "1: \n" - "movdqu (%2),%%xmm0 \n" - "pmaddubsw %%xmm3,%%xmm0 \n" - "phaddw %%xmm0,%%xmm0 \n" - "pand %%xmm4,%%xmm0 \n" - "punpcklwd %%xmm5,%%xmm0 \n" - "movd %%xmm0,%k1 \n" // 32 bit offset - "add %5,%1 \n" - "pshufd $0x39,%%xmm0,%%xmm0 \n" + // 4 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%2),%%xmm0 \n" + "pmaddubsw %%xmm3,%%xmm0 \n" + "phaddw %%xmm0,%%xmm0 \n" + "pand %%xmm4,%%xmm0 \n" + "punpcklwd %%xmm5,%%xmm0 \n" + "movd %%xmm0,%k1 \n" // 32 bit offset + "add %5,%1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" - "movzb (%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,(%3) \n" - "movzb 0x1(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0x1(%3) \n" - "movzb 0x2(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0x2(%3) \n" - "movzb 0x3(%2),%0 \n" - "mov %b0,0x3(%3) \n" + "movzb (%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,(%3) \n" + "movzb 0x1(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0x1(%3) \n" + "movzb 0x2(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0x2(%3) \n" + "movzb 0x3(%2),%0 \n" + "mov %b0,0x3(%3) \n" - "movd %%xmm0,%k1 \n" // 32 bit offset - "add %5,%1 \n" - "pshufd $0x39,%%xmm0,%%xmm0 \n" + "movd %%xmm0,%k1 \n" // 32 bit offset + "add %5,%1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" - "movzb 0x4(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0x4(%3) \n" - "movzb 0x5(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0x5(%3) \n" - "movzb 0x6(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0x6(%3) \n" - "movzb 0x7(%2),%0 \n" - "mov %b0,0x7(%3) \n" + "movzb 0x4(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0x4(%3) \n" + "movzb 0x5(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0x5(%3) \n" + "movzb 0x6(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0x6(%3) \n" + "movzb 0x7(%2),%0 \n" + "mov %b0,0x7(%3) \n" - "movd %%xmm0,%k1 \n" // 32 bit offset - "add %5,%1 \n" - "pshufd $0x39,%%xmm0,%%xmm0 \n" + "movd %%xmm0,%k1 \n" // 32 bit offset + "add %5,%1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" - "movzb 0x8(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0x8(%3) \n" - "movzb 0x9(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0x9(%3) \n" - "movzb 
0xa(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0xa(%3) \n" - "movzb 0xb(%2),%0 \n" - "mov %b0,0xb(%3) \n" + "movzb 0x8(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0x8(%3) \n" + "movzb 0x9(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0x9(%3) \n" + "movzb 0xa(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0xa(%3) \n" + "movzb 0xb(%2),%0 \n" + "mov %b0,0xb(%3) \n" - "movd %%xmm0,%k1 \n" // 32 bit offset - "add %5,%1 \n" + "movd %%xmm0,%k1 \n" // 32 bit offset + "add %5,%1 \n" - "movzb 0xc(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0xc(%3) \n" - "movzb 0xd(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0xd(%3) \n" - "movzb 0xe(%2),%0 \n" - "movzb 0x00(%1,%0,1),%0 \n" - "mov %b0,0xe(%3) \n" - "movzb 0xf(%2),%0 \n" - "mov %b0,0xf(%3) \n" - "lea 0x10(%2),%2 \n" - "lea 0x10(%3),%3 \n" - "sub $0x4,%4 \n" - "jg 1b \n" - : "=&d"(pixel_temp), // %0 - "=&a"(table_temp), // %1 - "+r"(src_argb), // %2 - "+r"(dst_argb), // %3 - "+rm"(width) // %4 - : "r"(luma), // %5 - "rm"(lumacoeff) // %6 - : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5" - ); + "movzb 0xc(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0xc(%3) \n" + "movzb 0xd(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0xd(%3) \n" + "movzb 0xe(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0xe(%3) \n" + "movzb 0xf(%2),%0 \n" + "mov %b0,0xf(%3) \n" + "lea 0x10(%2),%2 \n" + "lea 0x10(%3),%3 \n" + "sub $0x4,%4 \n" + "jg 1b \n" + : "=&d"(pixel_temp), // %0 + "=&a"(table_temp), // %1 + "+r"(src_argb), // %2 + "+r"(dst_argb), // %3 + "+rm"(width) // %4 + : "r"(luma), // %5 + "rm"(lumacoeff) // %6 + : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5"); } #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 diff --git a/source/row_msa.cc b/source/row_msa.cc index 0bf719c46..ba9a84328 100644 --- a/source/row_msa.cc +++ b/source/row_msa.cc @@ -37,14 +37,14 @@ extern "C" { } // Load YUV 422 pixel data -#define READYUV422(psrc_y, psrc_u, psrc_v, out_y, out_u, out_v) \ - { \ +#define READYUV422(psrc_y, psrc_u, psrc_v, out_y, out_u, out_v) \ + { \ uint64_t y_m; \ uint32_t u_m, v_m; \ - v4i32 zero_m = {0}; \ - y_m = LD(psrc_y); \ - u_m = LW(psrc_u); \ - v_m = LW(psrc_v); \ + v4i32 zero_m = {0}; \ + y_m = LD(psrc_y); \ + u_m = LW(psrc_u); \ + v_m = LW(psrc_v); \ out_y = (v16u8)__msa_insert_d((v2i64)zero_m, 0, (int64_t)y_m); \ out_u = (v16u8)__msa_insert_w(zero_m, 0, (int32_t)u_m); \ out_v = (v16u8)__msa_insert_w(zero_m, 0, (int32_t)v_m); \ @@ -275,14 +275,14 @@ extern "C" { // Load I444 pixel data #define READI444(psrc_y, psrc_u, psrc_v, out_y, out_u, out_v) \ { \ - uint64_t y_m, u_m, v_m; \ + uint64_t y_m, u_m, v_m; \ v2i64 zero_m = {0}; \ y_m = LD(psrc_y); \ u_m = LD(psrc_u); \ v_m = LD(psrc_v); \ - out_y = (v16u8)__msa_insert_d(zero_m, 0, (int64_t)y_m); \ - out_u = (v16u8)__msa_insert_d(zero_m, 0, (int64_t)u_m); \ - out_v = (v16u8)__msa_insert_d(zero_m, 0, (int64_t)v_m); \ + out_y = (v16u8)__msa_insert_d(zero_m, 0, (int64_t)y_m); \ + out_u = (v16u8)__msa_insert_d(zero_m, 0, (int64_t)u_m); \ + out_v = (v16u8)__msa_insert_d(zero_m, 0, (int64_t)v_m); \ } void MirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width) { @@ -1014,7 +1014,9 @@ void ARGBToRGB565Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { } } -void ARGBToARGB1555Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { +void ARGBToARGB1555Row_MSA(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { int x; v16u8 src0, src1, dst0; v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; @@ -1054,7 +1056,9 @@ void 
ARGBToARGB1555Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width) } } -void ARGBToARGB4444Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { +void ARGBToARGB4444Row_MSA(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { int x; v16u8 src0, src1; v16u8 vec0, vec1; @@ -1230,7 +1234,9 @@ void ARGBSubtractRow_MSA(const uint8_t* src_argb0, } } -void ARGBAttenuateRow_MSA(const uint8_t* src_argb, uint8_t* dst_argb, int width) { +void ARGBAttenuateRow_MSA(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { int x; v16u8 src0, src1, dst0, dst1; v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; @@ -1547,7 +1553,9 @@ void ARGB1555ToARGBRow_MSA(const uint8_t* src_argb1555, } } -void RGB565ToARGBRow_MSA(const uint8_t* src_rgb565, uint8_t* dst_argb, int width) { +void RGB565ToARGBRow_MSA(const uint8_t* src_rgb565, + uint8_t* dst_argb, + int width) { int x; v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5; v8u16 reg0, reg1, reg2, reg3, reg4, reg5; @@ -1592,7 +1600,9 @@ void RGB565ToARGBRow_MSA(const uint8_t* src_rgb565, uint8_t* dst_argb, int width } } -void RGB24ToARGBRow_MSA(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) { +void RGB24ToARGBRow_MSA(const uint8_t* src_rgb24, + uint8_t* dst_argb, + int width) { int x; v16u8 src0, src1, src2; v16u8 vec0, vec1, vec2; @@ -1642,7 +1652,9 @@ void RAWToARGBRow_MSA(const uint8_t* src_raw, uint8_t* dst_argb, int width) { } } -void ARGB1555ToYRow_MSA(const uint8_t* src_argb1555, uint8_t* dst_y, int width) { +void ARGB1555ToYRow_MSA(const uint8_t* src_argb1555, + uint8_t* dst_y, + int width) { int x; v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5; v8u16 reg0, reg1, reg2, reg3, reg4, reg5; @@ -2969,7 +2981,9 @@ void MergeUVRow_MSA(const uint8_t* src_u, } } -void ARGBExtractAlphaRow_MSA(const uint8_t* src_argb, uint8_t* dst_a, int width) { +void ARGBExtractAlphaRow_MSA(const uint8_t* src_argb, + uint8_t* dst_a, + int width) { int i; v16u8 src0, src1, src2, src3, vec0, vec1, dst0; @@ -3429,7 +3443,10 @@ void SobelYRow_MSA(const uint8_t* src_y0, } } -void HalfFloatRow_MSA(const uint16_t* src, uint16_t* dst, float scale, int width) { +void HalfFloatRow_MSA(const uint16_t* src, + uint16_t* dst, + float scale, + int width) { int i; v8u16 src0, src1, src2, src3, dst0, dst1, dst2, dst3; v4u32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; diff --git a/source/row_neon.cc b/source/row_neon.cc index eea0598e8..32491a47a 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -694,7 +694,9 @@ void ARGBMirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) { : "cc", "memory", "r3", "q0"); } -void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) { +void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, + uint8_t* dst_argb, + int width) { asm volatile( "vmov.u8 d4, #255 \n" // Alpha "1: \n" @@ -756,7 +758,9 @@ void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { "vorr.u8 d2, d1, d5 \n" /* R */ \ "vorr.u8 d1, d4, d6 \n" /* G */ -void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_argb, int width) { +void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, + uint8_t* dst_argb, + int width) { asm volatile( "vmov.u8 d3, #255 \n" // Alpha "1: \n" @@ -848,7 +852,9 @@ void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444, ); } -void ARGBToRGB24Row_NEON(const uint8_t* src_argb, uint8_t* dst_rgb24, int width) { +void ARGBToRGB24Row_NEON(const uint8_t* src_argb, + uint8_t* dst_rgb24, + int width) { asm volatile( "1: \n" "vld4.8 {d1, d2, d3, d4}, 
[%0]! \n" // load 8 pixels of ARGB. @@ -1070,7 +1076,9 @@ void I422ToUYVYRow_NEON(const uint8_t* src_y, : "cc", "memory", "d0", "d1", "d2", "d3"); } -void ARGBToRGB565Row_NEON(const uint8_t* src_argb, uint8_t* dst_rgb565, int width) { +void ARGBToRGB565Row_NEON(const uint8_t* src_argb, + uint8_t* dst_rgb565, + int width) { asm volatile( "1: \n" "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. @@ -1166,7 +1174,9 @@ void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { : "cc", "memory", "q0", "q1", "q2", "q12", "q13"); } -void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, uint8_t* dst_a, int width) { +void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, + uint8_t* dst_a, + int width) { asm volatile( "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels @@ -1798,7 +1808,9 @@ void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"); } -void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, uint8_t* dst_y, int width) { +void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, + uint8_t* dst_y, + int width) { asm volatile( "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient @@ -1822,7 +1834,9 @@ void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, uint8_t* dst_y, int width) : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"); } -void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, uint8_t* dst_y, int width) { +void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, + uint8_t* dst_y, + int width) { asm volatile( "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient @@ -2081,7 +2095,9 @@ void ARGBBlendRow_NEON(const uint8_t* src_argb0, } // Attenuate 8 pixels at a time. -void ARGBAttenuateRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { +void ARGBAttenuateRow_NEON(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { asm volatile( // Attenuate 8 pixels. "1: \n" @@ -2561,7 +2577,10 @@ void HalfFloat1Row_NEON(const uint16_t* src, uint16_t* dst, float, int width) { } // TODO(fbarchard): multiply by element. 
-void HalfFloatRow_NEON(const uint16_t* src, uint16_t* dst, float scale, int width) { +void HalfFloatRow_NEON(const uint16_t* src, + uint16_t* dst, + float scale, + int width) { asm volatile( "vdup.32 q0, %3 \n" diff --git a/source/row_neon64.cc b/source/row_neon64.cc index 3b06d880a..936709667 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -733,7 +733,9 @@ void ARGBMirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) { : "cc", "memory", "v0"); } -void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) { +void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, + uint8_t* dst_argb, + int width) { asm volatile( "movi v4.8b, #255 \n" // Alpha "1: \n" @@ -797,7 +799,9 @@ void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { "orr v0.16b, v0.16b, v2.16b \n" /* R,B */ \ "dup v2.2D, v0.D[1] \n" /* R */ -void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_argb, int width) { +void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, + uint8_t* dst_argb, + int width) { asm volatile( "movi v3.8b, #255 \n" // Alpha "1: \n" @@ -902,7 +906,9 @@ void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444, ); } -void ARGBToRGB24Row_NEON(const uint8_t* src_argb, uint8_t* dst_rgb24, int width) { +void ARGBToRGB24Row_NEON(const uint8_t* src_argb, + uint8_t* dst_rgb24, + int width) { asm volatile( "1: \n" "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB @@ -1126,7 +1132,9 @@ void I422ToUYVYRow_NEON(const uint8_t* src_y, : "cc", "memory", "v0", "v1", "v2", "v3"); } -void ARGBToRGB565Row_NEON(const uint8_t* src_argb, uint8_t* dst_rgb565, int width) { +void ARGBToRGB565Row_NEON(const uint8_t* src_argb, + uint8_t* dst_rgb565, + int width) { asm volatile( "1: \n" "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels @@ -1223,7 +1231,9 @@ void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); } -void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, uint8_t* dst_a, int width) { +void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, + uint8_t* dst_a, + int width) { asm volatile( "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load row 16 @@ -1829,7 +1839,9 @@ void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { "v27"); } -void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, uint8_t* dst_y, int width) { +void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, + uint8_t* dst_y, + int width) { asm volatile( "movi v4.8b, #13 \n" // B * 0.1016 coefficient "movi v5.8b, #65 \n" // G * 0.5078 coefficient @@ -1853,7 +1865,9 @@ void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, uint8_t* dst_y, int width) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); } -void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, uint8_t* dst_y, int width) { +void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, + uint8_t* dst_y, + int width) { asm volatile( "movi v24.8b, #13 \n" // B * 0.1016 coefficient "movi v25.8b, #65 \n" // G * 0.5078 coefficient @@ -2121,7 +2135,9 @@ void ARGBBlendRow_NEON(const uint8_t* src_argb0, } // Attenuate 8 pixels at a time. -void ARGBAttenuateRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { +void ARGBAttenuateRow_NEON(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { asm volatile( // Attenuate 8 pixels. 
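// Scalar sketch of the attenuation the NEON paths implement: premultiply
// each color channel by alpha. The (x * a + 255) >> 8 rounding follows
// libyuv's C reference; the NEON widening-multiply path may differ by one
// LSB. Name is illustrative.
#include <stdint.h>
static void ARGBAttenuateSketch(const uint8_t* src, uint8_t* dst, int width) {
  int i;
  for (i = 0; i < width; ++i) {
    uint32_t a = src[i * 4 + 3];
    dst[i * 4 + 0] = (uint8_t)((src[i * 4 + 0] * a + 255) >> 8);
    dst[i * 4 + 1] = (uint8_t)((src[i * 4 + 1] * a + 255) >> 8);
    dst[i * 4 + 2] = (uint8_t)((src[i * 4 + 2] * a + 255) >> 8);
    dst[i * 4 + 3] = (uint8_t)a;
  }
}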
"1: \n" @@ -2604,7 +2620,10 @@ void HalfFloat1Row_NEON(const uint16_t* src, uint16_t* dst, float, int width) { : "cc", "memory", "v1", "v2", "v3"); } -void HalfFloatRow_NEON(const uint16_t* src, uint16_t* dst, float scale, int width) { +void HalfFloatRow_NEON(const uint16_t* src, + uint16_t* dst, + float scale, + int width) { asm volatile( "1: \n" "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts diff --git a/source/row_win.cc b/source/row_win.cc index cd5cd565b..e94bb2270 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -28,27 +28,27 @@ extern "C" { #if defined(_M_X64) // Read 4 UV from 422, upsample to 8 UV. -#define READYUV422 \ +#define READYUV422 \ xmm0 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \ xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset)); \ - xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ - xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \ - u_buf += 4; \ - xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \ - xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ + xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ + xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \ + u_buf += 4; \ + xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \ + xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ y_buf += 8; // Read 4 UV from 422, upsample to 8 UV. With 8 Alpha. -#define READYUVA422 \ +#define READYUVA422 \ xmm0 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \ xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset)); \ - xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ - xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \ - u_buf += 4; \ - xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \ - xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ - y_buf += 8; \ - xmm5 = _mm_loadl_epi64((__m128i*)a_buf); \ + xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ + xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \ + u_buf += 4; \ + xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \ + xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ + y_buf += 8; \ + xmm5 = _mm_loadl_epi64((__m128i*)a_buf); \ a_buf += 8; // Convert 8 pixels: 8 UV and 8 Y. @@ -3022,7 +3022,9 @@ __declspec(naked) void MirrorRow_SSSE3(const uint8_t* src, #endif // HAS_MIRRORROW_SSSE3 #ifdef HAS_MIRRORROW_AVX2 -__declspec(naked) void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { +__declspec(naked) void MirrorRow_AVX2(const uint8_t* src, + uint8_t* dst, + int width) { __asm { mov eax, [esp + 4] // src mov edx, [esp + 8] // dst @@ -3274,7 +3276,9 @@ __declspec(naked) void MergeUVRow_AVX2(const uint8_t* src_u, #ifdef HAS_COPYROW_SSE2 // CopyRow copys 'count' bytes using a 16 byte load/store, 32 bytes at time. -__declspec(naked) void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int count) { +__declspec(naked) void CopyRow_SSE2(const uint8_t* src, + uint8_t* dst, + int count) { __asm { mov eax, [esp + 4] // src mov edx, [esp + 8] // dst @@ -3311,7 +3315,9 @@ __declspec(naked) void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int count) #ifdef HAS_COPYROW_AVX // CopyRow copys 'count' bytes using a 32 byte load/store, 64 bytes at time. -__declspec(naked) void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int count) { +__declspec(naked) void CopyRow_AVX(const uint8_t* src, + uint8_t* dst, + int count) { __asm { mov eax, [esp + 4] // src mov edx, [esp + 8] // dst @@ -3334,7 +3340,9 @@ __declspec(naked) void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int count) #endif // HAS_COPYROW_AVX // Multiple of 1. 
-__declspec(naked) void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int count) { +__declspec(naked) void CopyRow_ERMS(const uint8_t* src, + uint8_t* dst, + int count) { __asm { mov eax, esi mov edx, edi @@ -3582,7 +3590,9 @@ __declspec(naked) void SetRow_ERMS(uint8_t* dst, uint8_t v8, int count) { } // Write 'count' 32 bit values. -__declspec(naked) void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int count) { +__declspec(naked) void ARGBSetRow_X86(uint8_t* dst_argb, + uint32_t v32, + int count) { __asm { mov edx, edi mov edi, [esp + 4] // dst diff --git a/source/scale.cc b/source/scale.cc index 9b0a40e4d..510372beb 100644 --- a/source/scale.cc +++ b/source/scale.cc @@ -815,8 +815,8 @@ static void ScalePlaneBox(int src_width, const uint16_t* src_ptr, uint8_t* dst_ptr) = (dx & 0xffff) ? ScaleAddCols2_C : ((dx != 0x10000) ? ScaleAddCols1_C : ScaleAddCols0_C); - void (*ScaleAddRow)(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) = - ScaleAddRow_C; + void (*ScaleAddRow)(const uint8_t* src_ptr, uint16_t* dst_ptr, + int src_width) = ScaleAddRow_C; #if defined(HAS_SCALEADDROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { ScaleAddRow = ScaleAddRow_Any_SSE2; @@ -895,8 +895,8 @@ static void ScalePlaneBox_16(int src_width, void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx, const uint32_t* src_ptr, uint16_t* dst_ptr) = (dx & 0xffff) ? ScaleAddCols2_16_C : ScaleAddCols1_16_C; - void (*ScaleAddRow)(const uint16_t* src_ptr, uint32_t* dst_ptr, int src_width) = - ScaleAddRow_16_C; + void (*ScaleAddRow)(const uint16_t* src_ptr, uint32_t* dst_ptr, + int src_width) = ScaleAddRow_16_C; #if defined(HAS_SCALEADDROW_16_SSE2) if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(src_width, 16)) { @@ -946,8 +946,8 @@ void ScalePlaneBilinearDown(int src_width, const int max_y = (src_height - 1) << 16; int j; - void (*ScaleFilterCols)(uint8_t * dst_ptr, const uint8_t* src_ptr, int dst_width, - int x, int dx) = + void (*ScaleFilterCols)(uint8_t * dst_ptr, const uint8_t* src_ptr, + int dst_width, int x, int dx) = (src_width >= 32768) ? ScaleFilterCols64_C : ScaleFilterCols_C; void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr, ptrdiff_t src_stride, int dst_width, @@ -1144,8 +1144,8 @@ void ScalePlaneBilinearUp(int src_width, void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; - void (*ScaleFilterCols)(uint8_t * dst_ptr, const uint8_t* src_ptr, int dst_width, - int x, int dx) = + void (*ScaleFilterCols)(uint8_t * dst_ptr, const uint8_t* src_ptr, + int dst_width, int x, int dx) = filtering ? ScaleFilterCols_C : ScaleCols_C; ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, &dx, &dy); @@ -1401,8 +1401,8 @@ static void ScalePlaneSimple(int src_width, const uint8_t* src_ptr, uint8_t* dst_ptr) { int i; - void (*ScaleCols)(uint8_t * dst_ptr, const uint8_t* src_ptr, int dst_width, int x, - int dx) = ScaleCols_C; + void (*ScaleCols)(uint8_t * dst_ptr, const uint8_t* src_ptr, int dst_width, + int x, int dx) = ScaleCols_C; // Initial source x/y coordinate and step values as 16.16 fixed point. 
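// Illustration of the 16.16 fixed-point stepping noted above: x advances by
// dx per output pixel and x >> 16 selects the source sample, as the
// nearest-neighbor ScaleCols_C path does. Name is illustrative.
#include <stdint.h>
static void ScaleColsSketch(uint8_t* dst, const uint8_t* src, int dst_width,
                            int x, int dx) {
  int j;
  for (j = 0; j < dst_width; ++j) {
    dst[j] = src[x >> 16];  // integer part of the 16.16 coordinate
    x += dx;
  }
}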
int x = 0; int y = 0; @@ -1759,8 +1759,9 @@ int ScaleOffset(const uint8_t* src, uint8_t* dst_y = dst + dst_yoffset_even * dst_width; uint8_t* dst_u = dst + dst_width * dst_height + (dst_yoffset_even >> 1) * dst_halfwidth; - uint8_t* dst_v = dst + dst_width * dst_height + dst_halfwidth * dst_halfheight + - (dst_yoffset_even >> 1) * dst_halfwidth; + uint8_t* dst_v = dst + dst_width * dst_height + + dst_halfwidth * dst_halfheight + + (dst_yoffset_even >> 1) * dst_halfwidth; if (!src || src_width <= 0 || src_height <= 0 || !dst || dst_width <= 0 || dst_height <= 0 || dst_yoffset_even < 0 || dst_yoffset_even >= dst_height) { diff --git a/source/scale_any.cc b/source/scale_any.cc index 19f1531f5..53ad13640 100644 --- a/source/scale_any.cc +++ b/source/scale_any.cc @@ -19,15 +19,15 @@ extern "C" { #endif // Definition for ScaleFilterCols, ScaleARGBCols and ScaleARGBFilterCols -#define CANY(NAMEANY, TERP_SIMD, TERP_C, BPP, MASK) \ +#define CANY(NAMEANY, TERP_SIMD, TERP_C, BPP, MASK) \ void NAMEANY(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, int x, \ - int dx) { \ - int r = dst_width & MASK; \ - int n = dst_width & ~MASK; \ - if (n > 0) { \ - TERP_SIMD(dst_ptr, src_ptr, n, x, dx); \ - } \ - TERP_C(dst_ptr + n * BPP, src_ptr, r, x + n * dx, dx); \ + int dx) { \ + int r = dst_width & MASK; \ + int n = dst_width & ~MASK; \ + if (n > 0) { \ + TERP_SIMD(dst_ptr, src_ptr, n, x, dx); \ + } \ + TERP_C(dst_ptr + n * BPP, src_ptr, r, x + n * dx, dx); \ } #ifdef HAS_SCALEFILTERCOLS_NEON @@ -60,31 +60,31 @@ CANY(ScaleARGBFilterCols_Any_MSA, // Fixed scale down. // Mask may be non-power of 2, so use MOD -#define SDANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, \ - int dst_width) { \ - int r = (int)((unsigned int)dst_width % (MASK + 1)); /* NOLINT */ \ - int n = dst_width - r; \ - if (n > 0) { \ - SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \ - } \ - SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \ - dst_ptr + n * BPP, r); \ +#define SDANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, \ + int dst_width) { \ + int r = (int)((unsigned int)dst_width % (MASK + 1)); /* NOLINT */ \ + int n = dst_width - r; \ + if (n > 0) { \ + SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \ + } \ + SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \ + dst_ptr + n * BPP, r); \ } // Fixed scale down for odd source width. Used by I420Blend subsampling. // Since dst_width is (width + 1) / 2, this function scales one less pixel // and copies the last pixel. 
-#define SDODD(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, \ - int dst_width) { \ - int r = (int)((unsigned int)(dst_width - 1) % (MASK + 1)); /* NOLINT */ \ - int n = (dst_width - 1) - r; \ - if (n > 0) { \ - SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \ - } \ - SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \ - dst_ptr + n * BPP, r + 1); \ +#define SDODD(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, \ + int dst_width) { \ + int r = (int)((unsigned int)(dst_width - 1) % (MASK + 1)); /* NOLINT */ \ + int n = (dst_width - 1) - r; \ + if (n > 0) { \ + SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \ + } \ + SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \ + dst_ptr + n * BPP, r + 1); \ } #ifdef HAS_SCALEROWDOWN2_SSSE3 @@ -385,16 +385,16 @@ SDANY(ScaleARGBRowDown2Box_Any_MSA, #undef SDANY // Scale down by even scale factor. -#define SDAANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, int src_stepx, \ - uint8_t* dst_ptr, int dst_width) { \ - int r = dst_width & MASK; \ - int n = dst_width & ~MASK; \ - if (n > 0) { \ - SCALEROWDOWN_SIMD(src_ptr, src_stride, src_stepx, dst_ptr, n); \ - } \ - SCALEROWDOWN_C(src_ptr + (n * src_stepx) * BPP, src_stride, src_stepx, \ - dst_ptr + n * BPP, r); \ +#define SDAANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, int src_stepx, \ + uint8_t* dst_ptr, int dst_width) { \ + int r = dst_width & MASK; \ + int n = dst_width & ~MASK; \ + if (n > 0) { \ + SCALEROWDOWN_SIMD(src_ptr, src_stride, src_stepx, dst_ptr, n); \ + } \ + SCALEROWDOWN_C(src_ptr + (n * src_stepx) * BPP, src_stride, src_stepx, \ + dst_ptr + n * BPP, r); \ } #ifdef HAS_SCALEARGBROWDOWNEVEN_SSE2 @@ -435,13 +435,13 @@ SDAANY(ScaleARGBRowDownEvenBox_Any_MSA, #endif // Add rows box filter scale down. -#define SAANY(NAMEANY, SCALEADDROW_SIMD, SCALEADDROW_C, MASK) \ +#define SAANY(NAMEANY, SCALEADDROW_SIMD, SCALEADDROW_C, MASK) \ void NAMEANY(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { \ - int n = src_width & ~MASK; \ - if (n > 0) { \ - SCALEADDROW_SIMD(src_ptr, dst_ptr, n); \ - } \ - SCALEADDROW_C(src_ptr + n, dst_ptr + n, src_width & MASK); \ + int n = src_width & ~MASK; \ + if (n > 0) { \ + SCALEADDROW_SIMD(src_ptr, dst_ptr, n); \ + } \ + SCALEADDROW_C(src_ptr + n, dst_ptr + n, src_width & MASK); \ } #ifdef HAS_SCALEADDROW_SSE2 diff --git a/source/scale_argb.cc b/source/scale_argb.cc index 698464e11..53a22e8b4 100644 --- a/source/scale_argb.cc +++ b/source/scale_argb.cc @@ -772,8 +772,8 @@ static void ScaleARGBSimple(int src_width, int y, int dy) { int j; - void (*ScaleARGBCols)(uint8_t * dst_argb, const uint8_t* src_argb, int dst_width, - int x, int dx) = + void (*ScaleARGBCols)(uint8_t * dst_argb, const uint8_t* src_argb, + int dst_width, int x, int dx) = (src_width >= 32768) ? 
ScaleARGBCols64_C : ScaleARGBCols_C; (void)src_height; #if defined(HAS_SCALEARGBCOLS_SSE2) diff --git a/source/scale_common.cc b/source/scale_common.cc index 5d9b57977..b28d7da41 100644 --- a/source/scale_common.cc +++ b/source/scale_common.cc @@ -758,7 +758,9 @@ void ScaleAddRow_C(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { } } -void ScaleAddRow_16_C(const uint16_t* src_ptr, uint32_t* dst_ptr, int src_width) { +void ScaleAddRow_16_C(const uint16_t* src_ptr, + uint32_t* dst_ptr, + int src_width) { int x; assert(src_width > 0); for (x = 0; x < src_width - 1; x += 2) { diff --git a/source/scale_gcc.cc b/source/scale_gcc.cc index c55091169..b2b415940 100644 --- a/source/scale_gcc.cc +++ b/source/scale_gcc.cc @@ -98,24 +98,25 @@ void ScaleRowDown2_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm volatile ( - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - :: "memory", "cc", "xmm0", "xmm1" - ); + asm volatile( + // 16 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1"); } void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr, @@ -123,72 +124,70 @@ void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm volatile ( - "pcmpeqb %%xmm4,%%xmm4 \n" - "psrlw $0xf,%%xmm4 \n" - "packuswb %%xmm4,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" + asm volatile( + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrlw $0xf,%%xmm4 \n" + "packuswb %%xmm4,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pavgw %%xmm5,%%xmm0 \n" - "pavgw %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - :: "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5" - ); + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pavgw %%xmm5,%%xmm0 \n" + "pavgw %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm4", "xmm5"); } void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { - asm volatile ( - "pcmpeqb %%xmm4,%%xmm4 \n" - "psrlw $0xf,%%xmm4 \n" - "packuswb %%xmm4,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" + asm volatile( + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrlw $0xf,%%xmm4 \n" + "packuswb %%xmm4,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x00(%0,%3,1),%%xmm2 \n" - "movdqu 0x10(%0,%3,1),%%xmm3 \n" - "lea 0x20(%0),%0 \n" - 
"pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm3,%%xmm1 \n" - "psrlw $0x1,%%xmm0 \n" - "psrlw $0x1,%%xmm1 \n" - "pavgw %%xmm5,%%xmm0 \n" - "pavgw %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)) // %3 - : "memory", "cc", - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" - ); + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x00(%0,%3,1),%%xmm2 \n" + "movdqu 0x10(%0,%3,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm3,%%xmm1 \n" + "psrlw $0x1,%%xmm0 \n" + "psrlw $0x1,%%xmm1 \n" + "pavgw %%xmm5,%%xmm0 \n" + "pavgw %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #ifdef HAS_SCALEROWDOWN2_AVX2 @@ -197,26 +196,27 @@ void ScaleRowDown2_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm volatile ( - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - :: "memory", "cc", "xmm0", "xmm1" - ); + asm volatile( + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1"); } void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr, @@ -224,76 +224,74 @@ void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm volatile ( - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" - "vpsrlw $0xf,%%ymm4,%%ymm4 \n" - "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" - "vpxor %%ymm5,%%ymm5,%%ymm5 \n" + asm volatile( + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrlw $0xf,%%ymm4,%%ymm4 \n" + "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" + "vpxor %%ymm5,%%ymm5,%%ymm5 \n" - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" - "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" - "vpavgw %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - :: "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5" - ); + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 
0x40(%0),%0 \n" + "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" + "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" + "vpavgw %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm4", "xmm5"); } void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { - asm volatile ( - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" - "vpsrlw $0xf,%%ymm4,%%ymm4 \n" - "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" - "vpxor %%ymm5,%%ymm5,%%ymm5 \n" + asm volatile( + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrlw $0xf,%%ymm4,%%ymm4 \n" + "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" + "vpxor %%ymm5,%%ymm5,%%ymm5 \n" - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" - "vmovdqu 0x20(%0,%3,1),%%ymm3 \n" - "lea 0x40(%0),%0 \n" - "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" - "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" - "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" - "vpsrlw $0x1,%%ymm0,%%ymm0 \n" - "vpsrlw $0x1,%%ymm1,%%ymm1 \n" - "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" - "vpavgw %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)) // %3 - : "memory", "cc", - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" - ); + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" + "vmovdqu 0x20(%0,%3,1),%%ymm3 \n" + "lea 0x40(%0),%0 \n" + "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vpsrlw $0x1,%%ymm0,%%ymm0 \n" + "vpsrlw $0x1,%%ymm1,%%ymm1 \n" + "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" + "vpavgw %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #endif // HAS_SCALEROWDOWN2_AVX2 @@ -302,30 +300,30 @@ void ScaleRowDown4_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrld $0x18,%%xmm5 \n" - "pslld $0x10,%%xmm5 \n" + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrld $0x18,%%xmm5 \n" + "pslld $0x10,%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "psrlw $0x8,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - :: "memory", "cc", "xmm0", "xmm1", "xmm5" - ); + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" 
+ "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "psrlw $0x8,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm5"); } void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr, @@ -333,55 +331,53 @@ void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { intptr_t stridex3; - asm volatile ( - "pcmpeqb %%xmm4,%%xmm4 \n" - "psrlw $0xf,%%xmm4 \n" - "movdqa %%xmm4,%%xmm5 \n" - "packuswb %%xmm4,%%xmm4 \n" - "psllw $0x3,%%xmm5 \n" - "lea 0x00(%4,%4,2),%3 \n" + asm volatile( + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrlw $0xf,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "packuswb %%xmm4,%%xmm4 \n" + "psllw $0x3,%%xmm5 \n" + "lea 0x00(%4,%4,2),%3 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x00(%0,%4,1),%%xmm2 \n" - "movdqu 0x10(%0,%4,1),%%xmm3 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm3,%%xmm1 \n" - "movdqu 0x00(%0,%4,2),%%xmm2 \n" - "movdqu 0x10(%0,%4,2),%%xmm3 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm3,%%xmm1 \n" - "movdqu 0x00(%0,%3,1),%%xmm2 \n" - "movdqu 0x10(%0,%3,1),%%xmm3 \n" - "lea 0x20(%0),%0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm3,%%xmm1 \n" - "phaddw %%xmm1,%%xmm0 \n" - "paddw %%xmm5,%%xmm0 \n" - "psrlw $0x4,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "=&r"(stridex3) // %3 - : "r"((intptr_t)(src_stride)) // %4 - : "memory", "cc", - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x00(%0,%4,1),%%xmm2 \n" + "movdqu 0x10(%0,%4,1),%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm3,%%xmm1 \n" + "movdqu 0x00(%0,%4,2),%%xmm2 \n" + "movdqu 0x10(%0,%4,2),%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm3,%%xmm1 \n" + "movdqu 0x00(%0,%3,1),%%xmm2 \n" + "movdqu 0x10(%0,%3,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm3,%%xmm1 \n" + "phaddw %%xmm1,%%xmm0 \n" + "paddw %%xmm5,%%xmm0 \n" + "psrlw $0x4,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "=&r"(stridex3) // %3 + : "r"((intptr_t)(src_stride)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #ifdef HAS_SCALEROWDOWN4_AVX2 @@ -390,88 +386,87 @@ void ScaleRowDown4_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm volatile ( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsrld $0x18,%%ymm5,%%ymm5 \n" - "vpslld $0x10,%%ymm5,%%ymm5 \n" - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpand %%ymm5,%%ymm0,%%ymm0 \n" - "vpand 
%%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - :: "memory", "cc", "xmm0", "xmm1", "xmm5" - ); + asm volatile( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrld $0x18,%%ymm5,%%ymm5 \n" + "vpslld $0x10,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm5"); } void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { - asm volatile ( - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" - "vpsrlw $0xf,%%ymm4,%%ymm4 \n" - "vpsllw $0x3,%%ymm4,%%ymm5 \n" - "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" + asm volatile( + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrlw $0xf,%%ymm4,%%ymm4 \n" + "vpsllw $0x3,%%ymm4,%%ymm5 \n" + "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" - "vmovdqu 0x20(%0,%3,1),%%ymm3 \n" - "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" - "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" - "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" - "vmovdqu 0x00(%0,%3,2),%%ymm2 \n" - "vmovdqu 0x20(%0,%3,2),%%ymm3 \n" - "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" - "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" - "vmovdqu 0x00(%0,%4,1),%%ymm2 \n" - "vmovdqu 0x20(%0,%4,1),%%ymm3 \n" - "lea 0x40(%0),%0 \n" - "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" - "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" - "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" - "vpsrlw $0x4,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)), // %3 - "r"((intptr_t)(src_stride * 3)) // %4 - : "memory", "cc", - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" + "vmovdqu 0x20(%0,%3,1),%%ymm3 \n" + "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vmovdqu 0x00(%0,%3,2),%%ymm2 \n" + "vmovdqu 0x20(%0,%3,2),%%ymm3 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vmovdqu 0x00(%0,%4,1),%%ymm2 \n" 
+ "vmovdqu 0x20(%0,%4,1),%%ymm3 \n" + "lea 0x40(%0),%0 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" + "vpsrlw $0x4,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(src_stride * 3)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif // HAS_SCALEROWDOWN4_AVX2 @@ -489,28 +484,29 @@ void ScaleRowDown34_SSSE3(const uint8_t* src_ptr, "m"(kShuf1), // %1 "m"(kShuf2) // %2 ); - asm volatile ( - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm2 \n" - "lea 0x20(%0),%0 \n" - "movdqa %%xmm2,%%xmm1 \n" - "palignr $0x8,%%xmm0,%%xmm1 \n" - "pshufb %%xmm3,%%xmm0 \n" - "pshufb %%xmm4,%%xmm1 \n" - "pshufb %%xmm5,%%xmm2 \n" - "movq %%xmm0,(%1) \n" - "movq %%xmm1,0x8(%1) \n" - "movq %%xmm2,0x10(%1) \n" - "lea 0x18(%1),%1 \n" - "sub $0x18,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm2 \n" + "lea 0x20(%0),%0 \n" + "movdqa %%xmm2,%%xmm1 \n" + "palignr $0x8,%%xmm0,%%xmm1 \n" + "pshufb %%xmm3,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "pshufb %%xmm5,%%xmm2 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,0x8(%1) \n" + "movq %%xmm2,0x10(%1) \n" + "lea 0x18(%1),%1 \n" + "sub $0x18,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr, @@ -535,48 +531,48 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr, "m"(kMadd11), // %1 "m"(kRound34) // %2 ); - asm volatile ( - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm6 \n" - "movdqu 0x00(%0,%3,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm5,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6,(%1) \n" - "movdqu 0x8(%0),%%xmm6 \n" - "movdqu 0x8(%0,%3,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm3,%%xmm6 \n" - "pmaddubsw %%xmm0,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6,0x8(%1) \n" - "movdqu 0x10(%0),%%xmm6 \n" - "movdqu 0x10(%0,%3,1),%%xmm7 \n" - "lea 0x20(%0),%0 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm4,%%xmm6 \n" - "pmaddubsw %4,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6,0x10(%1) \n" - "lea 0x18(%1),%1 \n" - "sub $0x18,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)), // %3 - "m"(kMadd21) // %4 - : "memory", "cc", - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm6 \n" + "movdqu 0x00(%0,%3,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm5,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq 
%%xmm6,(%1) \n" + "movdqu 0x8(%0),%%xmm6 \n" + "movdqu 0x8(%0,%3,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm3,%%xmm6 \n" + "pmaddubsw %%xmm0,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,0x8(%1) \n" + "movdqu 0x10(%0),%%xmm6 \n" + "movdqu 0x10(%0,%3,1),%%xmm7 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm4,%%xmm6 \n" + "pmaddubsw %4,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,0x10(%1) \n" + "lea 0x18(%1),%1 \n" + "sub $0x18,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "m"(kMadd21) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr, @@ -602,51 +598,51 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr, "m"(kRound34) // %2 ); - asm volatile ( - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm6 \n" - "movdqu 0x00(%0,%3,1),%%xmm7 \n" - "pavgb %%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm5,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6,(%1) \n" - "movdqu 0x8(%0),%%xmm6 \n" - "movdqu 0x8(%0,%3,1),%%xmm7 \n" - "pavgb %%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm3,%%xmm6 \n" - "pmaddubsw %%xmm0,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6,0x8(%1) \n" - "movdqu 0x10(%0),%%xmm6 \n" - "movdqu 0x10(%0,%3,1),%%xmm7 \n" - "lea 0x20(%0),%0 \n" - "pavgb %%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm4,%%xmm6 \n" - "pmaddubsw %4,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6,0x10(%1) \n" - "lea 0x18(%1),%1 \n" - "sub $0x18,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)), // %3 - "m"(kMadd21) // %4 - : "memory", "cc", - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm6 \n" + "movdqu 0x00(%0,%3,1),%%xmm7 \n" + "pavgb %%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm5,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,(%1) \n" + "movdqu 0x8(%0),%%xmm6 \n" + "movdqu 0x8(%0,%3,1),%%xmm7 \n" + "pavgb %%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm3,%%xmm6 \n" + "pmaddubsw %%xmm0,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,0x8(%1) \n" + "movdqu 0x10(%0),%%xmm6 \n" + "movdqu 0x10(%0,%3,1),%%xmm7 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm4,%%xmm6 \n" + "pmaddubsw %4,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,0x10(%1) \n" + "lea 0x18(%1),%1 \n" + "sub $0x18,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "m"(kMadd21) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } void ScaleRowDown38_SSSE3(const uint8_t* src_ptr, @@ -654,31 +650,30 @@ void ScaleRowDown38_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - 
asm volatile ( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" + asm volatile( + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pshufb %%xmm4,%%xmm0 \n" - "pshufb %%xmm5,%%xmm1 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "movhlps %%xmm0,%%xmm1 \n" - "movd %%xmm1,0x8(%1) \n" - "lea 0xc(%1),%1 \n" - "sub $0xc,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "m"(kShuf38a), // %3 - "m"(kShuf38b) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5" - ); + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pshufb %%xmm4,%%xmm0 \n" + "pshufb %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "movhlps %%xmm0,%%xmm1 \n" + "movd %%xmm1,0x8(%1) \n" + "lea 0xc(%1),%1 \n" + "sub $0xc,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "m"(kShuf38a), // %3 + "m"(kShuf38b) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"); } void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr, @@ -696,35 +691,34 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr, "m"(kShufAb2), // %2 "m"(kScaleAb2) // %3 ); - asm volatile ( - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%3,1),%%xmm1 \n" - "lea 0x10(%0),%0 \n" - "pavgb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pshufb %%xmm2,%%xmm1 \n" - "movdqa %%xmm0,%%xmm6 \n" - "pshufb %%xmm3,%%xmm6 \n" - "paddusw %%xmm6,%%xmm1 \n" - "pshufb %%xmm4,%%xmm0 \n" - "paddusw %%xmm0,%%xmm1 \n" - "pmulhuw %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movd %%xmm1,(%1) \n" - "psrlq $0x10,%%xmm1 \n" - "movd %%xmm1,0x2(%1) \n" - "lea 0x6(%1),%1 \n" - "sub $0x6,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)) // %3 - : "memory", "cc", - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" - ); + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%3,1),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "pavgb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pshufb %%xmm2,%%xmm1 \n" + "movdqa %%xmm0,%%xmm6 \n" + "pshufb %%xmm3,%%xmm6 \n" + "paddusw %%xmm6,%%xmm1 \n" + "pshufb %%xmm4,%%xmm0 \n" + "paddusw %%xmm0,%%xmm1 \n" + "pmulhuw %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movd %%xmm1,(%1) \n" + "psrlq $0x10,%%xmm1 \n" + "movd %%xmm1,0x2(%1) \n" + "lea 0x6(%1),%1 \n" + "sub $0x6,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, @@ -741,112 +735,117 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, "m"(kShufAc3), // %1 "m"(kScaleAc33) // %2 ); - asm volatile ( - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%3,1),%%xmm6 \n" - "movhlps %%xmm0,%%xmm1 \n" - "movhlps %%xmm6,%%xmm7 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm6 \n" - "punpcklbw %%xmm5,%%xmm7 \n" - "paddusw %%xmm6,%%xmm0 \n" - "paddusw %%xmm7,%%xmm1 \n" - "movdqu 0x00(%0,%3,2),%%xmm6 \n" - "lea 0x10(%0),%0 \n" - "movhlps %%xmm6,%%xmm7 \n" - "punpcklbw %%xmm5,%%xmm6 \n" - "punpcklbw %%xmm5,%%xmm7 \n" - "paddusw %%xmm6,%%xmm0 \n" - "paddusw %%xmm7,%%xmm1 \n" - "movdqa %%xmm0,%%xmm6 \n" - "psrldq $0x2,%%xmm0 \n" - "paddusw 
%%xmm0,%%xmm6 \n" - "psrldq $0x2,%%xmm0 \n" - "paddusw %%xmm0,%%xmm6 \n" - "pshufb %%xmm2,%%xmm6 \n" - "movdqa %%xmm1,%%xmm7 \n" - "psrldq $0x2,%%xmm1 \n" - "paddusw %%xmm1,%%xmm7 \n" - "psrldq $0x2,%%xmm1 \n" - "paddusw %%xmm1,%%xmm7 \n" - "pshufb %%xmm3,%%xmm7 \n" - "paddusw %%xmm7,%%xmm6 \n" - "pmulhuw %%xmm4,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movd %%xmm6,(%1) \n" - "psrlq $0x10,%%xmm6 \n" - "movd %%xmm6,0x2(%1) \n" - "lea 0x6(%1),%1 \n" - "sub $0x6,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)) // %3 - : "memory", "cc", - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%3,1),%%xmm6 \n" + "movhlps %%xmm0,%%xmm1 \n" + "movhlps %%xmm6,%%xmm7 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm6 \n" + "punpcklbw %%xmm5,%%xmm7 \n" + "paddusw %%xmm6,%%xmm0 \n" + "paddusw %%xmm7,%%xmm1 \n" + "movdqu 0x00(%0,%3,2),%%xmm6 \n" + "lea 0x10(%0),%0 \n" + "movhlps %%xmm6,%%xmm7 \n" + "punpcklbw %%xmm5,%%xmm6 \n" + "punpcklbw %%xmm5,%%xmm7 \n" + "paddusw %%xmm6,%%xmm0 \n" + "paddusw %%xmm7,%%xmm1 \n" + "movdqa %%xmm0,%%xmm6 \n" + "psrldq $0x2,%%xmm0 \n" + "paddusw %%xmm0,%%xmm6 \n" + "psrldq $0x2,%%xmm0 \n" + "paddusw %%xmm0,%%xmm6 \n" + "pshufb %%xmm2,%%xmm6 \n" + "movdqa %%xmm1,%%xmm7 \n" + "psrldq $0x2,%%xmm1 \n" + "paddusw %%xmm1,%%xmm7 \n" + "psrldq $0x2,%%xmm1 \n" + "paddusw %%xmm1,%%xmm7 \n" + "pshufb %%xmm3,%%xmm7 \n" + "paddusw %%xmm7,%%xmm6 \n" + "pmulhuw %%xmm4,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movd %%xmm6,(%1) \n" + "psrlq $0x10,%%xmm6 \n" + "movd %%xmm6,0x2(%1) \n" + "lea 0x6(%1),%1 \n" + "sub $0x6,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } // Reads 16xN bytes and produces 16 shorts at a time. -void ScaleAddRow_SSE2(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { - asm volatile ( - "pxor %%xmm5,%%xmm5 \n" +void ScaleAddRow_SSE2(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int src_width) { + asm volatile( - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm3 \n" - "lea 0x10(%0),%0 \n" // src_ptr += 16 - "movdqu (%1),%%xmm0 \n" - "movdqu 0x10(%1),%%xmm1 \n" - "movdqa %%xmm3,%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "punpckhbw %%xmm5,%%xmm3 \n" - "paddusw %%xmm2,%%xmm0 \n" - "paddusw %%xmm3,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(src_width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" - ); + "pxor %%xmm5,%%xmm5 \n" + + // 16 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm3 \n" + "lea 0x10(%0),%0 \n" // src_ptr += 16 + "movdqu (%1),%%xmm0 \n" + "movdqu 0x10(%1),%%xmm1 \n" + "movdqa %%xmm3,%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "punpckhbw %%xmm5,%%xmm3 \n" + "paddusw %%xmm2,%%xmm0 \n" + "paddusw %%xmm3,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(src_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #ifdef HAS_SCALEADDROW_AVX2 // Reads 32 bytes and accumulates to 32 shorts at a time. 
-void ScaleAddRow_AVX2(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { - asm volatile ( - "vpxor %%ymm5,%%ymm5,%%ymm5 \n" +void ScaleAddRow_AVX2(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int src_width) { + asm volatile( - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm3 \n" - "lea 0x20(%0),%0 \n" // src_ptr += 32 - "vpermq $0xd8,%%ymm3,%%ymm3 \n" - "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n" - "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n" - "vpaddusw (%1),%%ymm2,%%ymm0 \n" - "vpaddusw 0x20(%1),%%ymm3,%%ymm1 \n" - "vmovdqu %%ymm0,(%1) \n" - "vmovdqu %%ymm1,0x20(%1) \n" - "lea 0x40(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(src_width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" - ); + "vpxor %%ymm5,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm3 \n" + "lea 0x20(%0),%0 \n" // src_ptr += 32 + "vpermq $0xd8,%%ymm3,%%ymm3 \n" + "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n" + "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n" + "vpaddusw (%1),%%ymm2,%%ymm0 \n" + "vpaddusw 0x20(%1),%%ymm3,%%ymm1 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(src_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #endif // HAS_SCALEADDROW_AVX2 @@ -866,92 +865,92 @@ void ScaleFilterCols_SSSE3(uint8_t* dst_ptr, int x, int dx) { intptr_t x0, x1, temp_pixel; - asm volatile ( - "movd %6,%%xmm2 \n" - "movd %7,%%xmm3 \n" - "movl $0x04040000,%k2 \n" - "movd %k2,%%xmm5 \n" - "pcmpeqb %%xmm6,%%xmm6 \n" - "psrlw $0x9,%%xmm6 \n" // 0x007f007f - "pcmpeqb %%xmm7,%%xmm7 \n" - "psrlw $15,%%xmm7 \n" // 0x00010001 + asm volatile( + "movd %6,%%xmm2 \n" + "movd %7,%%xmm3 \n" + "movl $0x04040000,%k2 \n" + "movd %k2,%%xmm5 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrlw $0x9,%%xmm6 \n" // 0x007f007f + "pcmpeqb %%xmm7,%%xmm7 \n" + "psrlw $15,%%xmm7 \n" // 0x00010001 - "pextrw $0x1,%%xmm2,%k3 \n" - "subl $0x2,%5 \n" - "jl 29f \n" - "movdqa %%xmm2,%%xmm0 \n" - "paddd %%xmm3,%%xmm0 \n" - "punpckldq %%xmm0,%%xmm2 \n" - "punpckldq %%xmm3,%%xmm3 \n" - "paddd %%xmm3,%%xmm3 \n" - "pextrw $0x3,%%xmm2,%k4 \n" + "pextrw $0x1,%%xmm2,%k3 \n" + "subl $0x2,%5 \n" + "jl 29f \n" + "movdqa %%xmm2,%%xmm0 \n" + "paddd %%xmm3,%%xmm0 \n" + "punpckldq %%xmm0,%%xmm2 \n" + "punpckldq %%xmm3,%%xmm3 \n" + "paddd %%xmm3,%%xmm3 \n" + "pextrw $0x3,%%xmm2,%k4 \n" - LABELALIGN - "2: \n" - "movdqa %%xmm2,%%xmm1 \n" - "paddd %%xmm3,%%xmm2 \n" - "movzwl 0x00(%1,%3,1),%k2 \n" - "movd %k2,%%xmm0 \n" - "psrlw $0x9,%%xmm1 \n" - "movzwl 0x00(%1,%4,1),%k2 \n" - "movd %k2,%%xmm4 \n" - "pshufb %%xmm5,%%xmm1 \n" - "punpcklwd %%xmm4,%%xmm0 \n" - "psubb %8,%%xmm0 \n" // make pixels signed. - "pxor %%xmm6,%%xmm1 \n" // 128 - f = (f ^ 127 ) + 1 - "paddusb %%xmm7,%%xmm1 \n" - "pmaddubsw %%xmm0,%%xmm1 \n" - "pextrw $0x1,%%xmm2,%k3 \n" - "pextrw $0x3,%%xmm2,%k4 \n" - "paddw %9,%%xmm1 \n" // make pixels unsigned. - "psrlw $0x7,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movd %%xmm1,%k2 \n" - "mov %w2,(%0) \n" - "lea 0x2(%0),%0 \n" - "subl $0x2,%5 \n" - "jge 2b \n" + LABELALIGN + "2: \n" + "movdqa %%xmm2,%%xmm1 \n" + "paddd %%xmm3,%%xmm2 \n" + "movzwl 0x00(%1,%3,1),%k2 \n" + "movd %k2,%%xmm0 \n" + "psrlw $0x9,%%xmm1 \n" + "movzwl 0x00(%1,%4,1),%k2 \n" + "movd %k2,%%xmm4 \n" + "pshufb %%xmm5,%%xmm1 \n" + "punpcklwd %%xmm4,%%xmm0 \n" + "psubb %8,%%xmm0 \n" // make pixels signed. 
+ "pxor %%xmm6,%%xmm1 \n" // 128 - f = (f ^ 127 ) + + // 1 + "paddusb %%xmm7,%%xmm1 \n" + "pmaddubsw %%xmm0,%%xmm1 \n" + "pextrw $0x1,%%xmm2,%k3 \n" + "pextrw $0x3,%%xmm2,%k4 \n" + "paddw %9,%%xmm1 \n" // make pixels unsigned. + "psrlw $0x7,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movd %%xmm1,%k2 \n" + "mov %w2,(%0) \n" + "lea 0x2(%0),%0 \n" + "subl $0x2,%5 \n" + "jge 2b \n" - LABELALIGN - "29: \n" - "addl $0x1,%5 \n" - "jl 99f \n" - "movzwl 0x00(%1,%3,1),%k2 \n" - "movd %k2,%%xmm0 \n" - "psrlw $0x9,%%xmm2 \n" - "pshufb %%xmm5,%%xmm2 \n" - "psubb %8,%%xmm0 \n" // make pixels signed. - "pxor %%xmm6,%%xmm2 \n" - "paddusb %%xmm7,%%xmm2 \n" - "pmaddubsw %%xmm0,%%xmm2 \n" - "paddw %9,%%xmm2 \n" // make pixels unsigned. - "psrlw $0x7,%%xmm2 \n" - "packuswb %%xmm2,%%xmm2 \n" - "movd %%xmm2,%k2 \n" - "mov %b2,(%0) \n" - "99: \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "=&a"(temp_pixel), // %2 - "=&r"(x0), // %3 - "=&r"(x1), // %4 + LABELALIGN + "29: \n" + "addl $0x1,%5 \n" + "jl 99f \n" + "movzwl 0x00(%1,%3,1),%k2 \n" + "movd %k2,%%xmm0 \n" + "psrlw $0x9,%%xmm2 \n" + "pshufb %%xmm5,%%xmm2 \n" + "psubb %8,%%xmm0 \n" // make pixels signed. + "pxor %%xmm6,%%xmm2 \n" + "paddusb %%xmm7,%%xmm2 \n" + "pmaddubsw %%xmm0,%%xmm2 \n" + "paddw %9,%%xmm2 \n" // make pixels unsigned. + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm2 \n" + "movd %%xmm2,%k2 \n" + "mov %b2,(%0) \n" + "99: \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "=&a"(temp_pixel), // %2 + "=&r"(x0), // %3 + "=&r"(x1), // %4 #if defined(__x86_64__) - "+rm"(dst_width) // %5 + "+rm"(dst_width) // %5 #else - "+m"(dst_width) // %5 + "+m"(dst_width) // %5 #endif - : "rm"(x), // %6 - "rm"(dx), // %7 + : "rm"(x), // %6 + "rm"(dx), // %7 #if defined(__x86_64__) - "x"(kFsub80), // %8 - "x"(kFadd40) // %9 + "x"(kFsub80), // %8 + "x"(kFadd40) // %9 #else - "m"(kFsub80), // %8 - "m"(kFadd40) // %9 + "m"(kFsub80), // %8 + "m"(kFadd40) // %9 #endif - : "memory", "cc", - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } // Reads 4 pixels, duplicates them and writes 8 pixels. 
@@ -963,25 +962,26 @@ void ScaleColsUp2_SSE2(uint8_t* dst_ptr, int dx) { (void)x; (void)dx; - asm volatile ( - LABELALIGN - "1: \n" - "movdqu (%1),%%xmm0 \n" - "lea 0x10(%1),%1 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm1 \n" - "movdqu %%xmm0,(%0) \n" - "movdqu %%xmm1,0x10(%0) \n" - "lea 0x20(%0),%0 \n" - "sub $0x20,%2 \n" - "jg 1b \n" + asm volatile( - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(dst_width) // %2 - :: "memory", "cc", "xmm0", "xmm1" - ); + LABELALIGN + "1: \n" + "movdqu (%1),%%xmm0 \n" + "lea 0x10(%1),%1 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "movdqu %%xmm0,(%0) \n" + "movdqu %%xmm1,0x10(%0) \n" + "lea 0x20(%0),%0 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1"); } void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb, @@ -989,22 +989,23 @@ void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb, uint8_t* dst_argb, int dst_width) { (void)src_stride; - asm volatile ( - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "shufps $0xdd,%%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - :: "memory", "cc", "xmm0", "xmm1" - ); + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "shufps $0xdd,%%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1"); } void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb, @@ -1012,56 +1013,56 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb, uint8_t* dst_argb, int dst_width) { (void)src_stride; - asm volatile ( - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm2 \n" - "pavgb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - :: "memory", "cc", "xmm0", "xmm1" - ); + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm2 \n" + "pavgb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1"); } void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb, ptrdiff_t src_stride, uint8_t* dst_argb, int dst_width) { - asm volatile ( - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x00(%0,%3,1),%%xmm2 \n" - "movdqu 0x10(%0,%3,1),%%xmm3 \n" - "lea 0x20(%0),%0 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm2 \n" - "pavgb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)) // %3 - : "memory", "cc", - 
"xmm0", "xmm1", "xmm2", "xmm3" - ); + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x00(%0,%3,1),%%xmm2 \n" + "movdqu 0x10(%0,%3,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm2 \n" + "pavgb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); } // Reads 4 pixels at a time. @@ -1074,31 +1075,31 @@ void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb, intptr_t src_stepx_x4 = (intptr_t)(src_stepx); intptr_t src_stepx_x12; (void)src_stride; - asm volatile ( - "lea 0x00(,%1,4),%1 \n" - "lea 0x00(%1,%1,2),%4 \n" - LABELALIGN - "1: \n" - "movd (%0),%%xmm0 \n" - "movd 0x00(%0,%1,1),%%xmm1 \n" - "punpckldq %%xmm1,%%xmm0 \n" - "movd 0x00(%0,%1,2),%%xmm2 \n" - "movd 0x00(%0,%4,1),%%xmm3 \n" - "lea 0x00(%0,%1,4),%0 \n" - "punpckldq %%xmm3,%%xmm2 \n" - "punpcklqdq %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%3 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_stepx_x4), // %1 - "+r"(dst_argb), // %2 - "+r"(dst_width), // %3 - "=&r"(src_stepx_x12) // %4 - :: "memory", "cc", - "xmm0", "xmm1", "xmm2", "xmm3" - ); + asm volatile( + "lea 0x00(,%1,4),%1 \n" + "lea 0x00(%1,%1,2),%4 \n" + + LABELALIGN + "1: \n" + "movd (%0),%%xmm0 \n" + "movd 0x00(%0,%1,1),%%xmm1 \n" + "punpckldq %%xmm1,%%xmm0 \n" + "movd 0x00(%0,%1,2),%%xmm2 \n" + "movd 0x00(%0,%4,1),%%xmm3 \n" + "lea 0x00(%0,%1,4),%0 \n" + "punpckldq %%xmm3,%%xmm2 \n" + "punpcklqdq %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stepx_x4), // %1 + "+r"(dst_argb), // %2 + "+r"(dst_width), // %3 + "=&r"(src_stepx_x12) // %4 + ::"memory", + "cc", "xmm0", "xmm1", "xmm2", "xmm3"); } // Blends four 2x2 to 4x1. 
@@ -1111,42 +1112,41 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb, intptr_t src_stepx_x4 = (intptr_t)(src_stepx); intptr_t src_stepx_x12; intptr_t row1 = (intptr_t)(src_stride); - asm volatile ( - "lea 0x00(,%1,4),%1 \n" - "lea 0x00(%1,%1,2),%4 \n" - "lea 0x00(%0,%5,1),%5 \n" + asm volatile( + "lea 0x00(,%1,4),%1 \n" + "lea 0x00(%1,%1,2),%4 \n" + "lea 0x00(%0,%5,1),%5 \n" - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" - "movhps 0x00(%0,%1,1),%%xmm0 \n" - "movq 0x00(%0,%1,2),%%xmm1 \n" - "movhps 0x00(%0,%4,1),%%xmm1 \n" - "lea 0x00(%0,%1,4),%0 \n" - "movq (%5),%%xmm2 \n" - "movhps 0x00(%5,%1,1),%%xmm2 \n" - "movq 0x00(%5,%1,2),%%xmm3 \n" - "movhps 0x00(%5,%4,1),%%xmm3 \n" - "lea 0x00(%5,%1,4),%5 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm2 \n" - "pavgb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%3 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_stepx_x4), // %1 - "+r"(dst_argb), // %2 - "+rm"(dst_width), // %3 - "=&r"(src_stepx_x12), // %4 - "+r"(row1) // %5 - :: "memory", "cc", - "xmm0", "xmm1", "xmm2", "xmm3" - ); + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" + "movhps 0x00(%0,%1,1),%%xmm0 \n" + "movq 0x00(%0,%1,2),%%xmm1 \n" + "movhps 0x00(%0,%4,1),%%xmm1 \n" + "lea 0x00(%0,%1,4),%0 \n" + "movq (%5),%%xmm2 \n" + "movhps 0x00(%5,%1,1),%%xmm2 \n" + "movq 0x00(%5,%1,2),%%xmm3 \n" + "movhps 0x00(%5,%4,1),%%xmm3 \n" + "lea 0x00(%5,%1,4),%5 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm2 \n" + "pavgb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stepx_x4), // %1 + "+r"(dst_argb), // %2 + "+rm"(dst_width), // %3 + "=&r"(src_stepx_x12), // %4 + "+r"(row1) // %5 + ::"memory", + "cc", "xmm0", "xmm1", "xmm2", "xmm3"); } void ScaleARGBCols_SSE2(uint8_t* dst_argb, @@ -1155,68 +1155,66 @@ void ScaleARGBCols_SSE2(uint8_t* dst_argb, int x, int dx) { intptr_t x0, x1; - asm volatile ( - "movd %5,%%xmm2 \n" - "movd %6,%%xmm3 \n" - "pshufd $0x0,%%xmm2,%%xmm2 \n" - "pshufd $0x11,%%xmm3,%%xmm0 \n" - "paddd %%xmm0,%%xmm2 \n" - "paddd %%xmm3,%%xmm3 \n" - "pshufd $0x5,%%xmm3,%%xmm0 \n" - "paddd %%xmm0,%%xmm2 \n" - "paddd %%xmm3,%%xmm3 \n" - "pshufd $0x0,%%xmm3,%%xmm3 \n" - "pextrw $0x1,%%xmm2,%k0 \n" - "pextrw $0x3,%%xmm2,%k1 \n" - "cmp $0x0,%4 \n" - "jl 99f \n" - "sub $0x4,%4 \n" - "jl 49f \n" + asm volatile( + "movd %5,%%xmm2 \n" + "movd %6,%%xmm3 \n" + "pshufd $0x0,%%xmm2,%%xmm2 \n" + "pshufd $0x11,%%xmm3,%%xmm0 \n" + "paddd %%xmm0,%%xmm2 \n" + "paddd %%xmm3,%%xmm3 \n" + "pshufd $0x5,%%xmm3,%%xmm0 \n" + "paddd %%xmm0,%%xmm2 \n" + "paddd %%xmm3,%%xmm3 \n" + "pshufd $0x0,%%xmm3,%%xmm3 \n" + "pextrw $0x1,%%xmm2,%k0 \n" + "pextrw $0x3,%%xmm2,%k1 \n" + "cmp $0x0,%4 \n" + "jl 99f \n" + "sub $0x4,%4 \n" + "jl 49f \n" - LABELALIGN - "40: \n" - "movd 0x00(%3,%0,4),%%xmm0 \n" - "movd 0x00(%3,%1,4),%%xmm1 \n" - "pextrw $0x5,%%xmm2,%k0 \n" - "pextrw $0x7,%%xmm2,%k1 \n" - "paddd %%xmm3,%%xmm2 \n" - "punpckldq %%xmm1,%%xmm0 \n" - "movd 0x00(%3,%0,4),%%xmm1 \n" - "movd 0x00(%3,%1,4),%%xmm4 \n" - "pextrw $0x1,%%xmm2,%k0 \n" - "pextrw $0x3,%%xmm2,%k1 \n" - "punpckldq %%xmm4,%%xmm1 \n" - "punpcklqdq %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%4 \n" - "jge 40b \n" + LABELALIGN + "40: \n" + "movd 0x00(%3,%0,4),%%xmm0 \n" + "movd 
0x00(%3,%1,4),%%xmm1 \n" + "pextrw $0x5,%%xmm2,%k0 \n" + "pextrw $0x7,%%xmm2,%k1 \n" + "paddd %%xmm3,%%xmm2 \n" + "punpckldq %%xmm1,%%xmm0 \n" + "movd 0x00(%3,%0,4),%%xmm1 \n" + "movd 0x00(%3,%1,4),%%xmm4 \n" + "pextrw $0x1,%%xmm2,%k0 \n" + "pextrw $0x3,%%xmm2,%k1 \n" + "punpckldq %%xmm4,%%xmm1 \n" + "punpcklqdq %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%4 \n" + "jge 40b \n" - "49: \n" - "test $0x2,%4 \n" - "je 29f \n" - "movd 0x00(%3,%0,4),%%xmm0 \n" - "movd 0x00(%3,%1,4),%%xmm1 \n" - "pextrw $0x5,%%xmm2,%k0 \n" - "punpckldq %%xmm1,%%xmm0 \n" - "movq %%xmm0,(%2) \n" - "lea 0x8(%2),%2 \n" - "29: \n" - "test $0x1,%4 \n" - "je 99f \n" - "movd 0x00(%3,%0,4),%%xmm0 \n" - "movd %%xmm0,(%2) \n" - "99: \n" - : "=&a"(x0), // %0 - "=&d"(x1), // %1 - "+r"(dst_argb), // %2 - "+r"(src_argb), // %3 - "+r"(dst_width) // %4 - : "rm"(x), // %5 - "rm"(dx) // %6 - : "memory", "cc", - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" - ); + "49: \n" + "test $0x2,%4 \n" + "je 29f \n" + "movd 0x00(%3,%0,4),%%xmm0 \n" + "movd 0x00(%3,%1,4),%%xmm1 \n" + "pextrw $0x5,%%xmm2,%k0 \n" + "punpckldq %%xmm1,%%xmm0 \n" + "movq %%xmm0,(%2) \n" + "lea 0x8(%2),%2 \n" + "29: \n" + "test $0x1,%4 \n" + "je 99f \n" + "movd 0x00(%3,%0,4),%%xmm0 \n" + "movd %%xmm0,(%2) \n" + "99: \n" + : "=&a"(x0), // %0 + "=&d"(x1), // %1 + "+r"(dst_argb), // %2 + "+r"(src_argb), // %3 + "+r"(dst_width) // %4 + : "rm"(x), // %5 + "rm"(dx) // %6 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); } // Reads 4 pixels, duplicates them and writes 8 pixels. @@ -1228,26 +1226,26 @@ void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb, int dx) { (void)x; (void)dx; - asm volatile ( - LABELALIGN - "1: \n" - "movdqu (%1),%%xmm0 \n" - "lea 0x10(%1),%1 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpckldq %%xmm0,%%xmm0 \n" - "punpckhdq %%xmm1,%%xmm1 \n" - "movdqu %%xmm0,(%0) \n" - "movdqu %%xmm1,0x10(%0) \n" - "lea 0x20(%0),%0 \n" - "sub $0x8,%2 \n" - "jg 1b \n" + asm volatile( - : "+r"(dst_argb), // %0 - "+r"(src_argb), // %1 - "+r"(dst_width) // %2 - :: "memory", "cc", - "xmm0", "xmm1" - ); + LABELALIGN + "1: \n" + "movdqu (%1),%%xmm0 \n" + "lea 0x10(%1),%1 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpckldq %%xmm0,%%xmm0 \n" + "punpckhdq %%xmm1,%%xmm1 \n" + "movdqu %%xmm0,(%0) \n" + "movdqu %%xmm1,0x10(%0) \n" + "lea 0x20(%0),%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + + : "+r"(dst_argb), // %0 + "+r"(src_argb), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1"); } // Shuffle table for arranging 2 pixels into pairs for pmaddubsw @@ -1276,67 +1274,64 @@ void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb, "m"(kShuffleFractions) // %1 ); - asm volatile ( - "movd %5,%%xmm2 \n" - "movd %6,%%xmm3 \n" - "pcmpeqb %%xmm6,%%xmm6 \n" - "psrlw $0x9,%%xmm6 \n" - "pextrw $0x1,%%xmm2,%k3 \n" - "sub $0x2,%2 \n" - "jl 29f \n" - "movdqa %%xmm2,%%xmm0 \n" - "paddd %%xmm3,%%xmm0 \n" - "punpckldq %%xmm0,%%xmm2 \n" - "punpckldq %%xmm3,%%xmm3 \n" - "paddd %%xmm3,%%xmm3 \n" - "pextrw $0x3,%%xmm2,%k4 \n" + asm volatile( + "movd %5,%%xmm2 \n" + "movd %6,%%xmm3 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrlw $0x9,%%xmm6 \n" + "pextrw $0x1,%%xmm2,%k3 \n" + "sub $0x2,%2 \n" + "jl 29f \n" + "movdqa %%xmm2,%%xmm0 \n" + "paddd %%xmm3,%%xmm0 \n" + "punpckldq %%xmm0,%%xmm2 \n" + "punpckldq %%xmm3,%%xmm3 \n" + "paddd %%xmm3,%%xmm3 \n" + "pextrw $0x3,%%xmm2,%k4 \n" - LABELALIGN - "2: \n" - "movdqa %%xmm2,%%xmm1 \n" - "paddd %%xmm3,%%xmm2 \n" - "movq 0x00(%1,%3,4),%%xmm0 \n" - "psrlw $0x9,%%xmm1 \n" - "movhps 0x00(%1,%4,4),%%xmm0 \n" - "pshufb %%xmm5,%%xmm1 \n" - "pshufb 
%%xmm4,%%xmm0 \n" - "pxor %%xmm6,%%xmm1 \n" - "pmaddubsw %%xmm1,%%xmm0 \n" - "psrlw $0x7,%%xmm0 \n" - "pextrw $0x1,%%xmm2,%k3 \n" - "pextrw $0x3,%%xmm2,%k4 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,(%0) \n" - "lea 0x8(%0),%0 \n" - "sub $0x2,%2 \n" - "jge 2b \n" + LABELALIGN + "2: \n" + "movdqa %%xmm2,%%xmm1 \n" + "paddd %%xmm3,%%xmm2 \n" + "movq 0x00(%1,%3,4),%%xmm0 \n" + "psrlw $0x9,%%xmm1 \n" + "movhps 0x00(%1,%4,4),%%xmm0 \n" + "pshufb %%xmm5,%%xmm1 \n" + "pshufb %%xmm4,%%xmm0 \n" + "pxor %%xmm6,%%xmm1 \n" + "pmaddubsw %%xmm1,%%xmm0 \n" + "psrlw $0x7,%%xmm0 \n" + "pextrw $0x1,%%xmm2,%k3 \n" + "pextrw $0x3,%%xmm2,%k4 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%0) \n" + "lea 0x8(%0),%0 \n" + "sub $0x2,%2 \n" + "jge 2b \n" - LABELALIGN - "29: \n" - "add $0x1,%2 \n" - "jl 99f \n" - "psrlw $0x9,%%xmm2 \n" - "movq 0x00(%1,%3,4),%%xmm0 \n" - "pshufb %%xmm5,%%xmm2 \n" - "pshufb %%xmm4,%%xmm0 \n" - "pxor %%xmm6,%%xmm2 \n" - "pmaddubsw %%xmm2,%%xmm0 \n" - "psrlw $0x7,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movd %%xmm0,(%0) \n" + LABELALIGN + "29: \n" + "add $0x1,%2 \n" + "jl 99f \n" + "psrlw $0x9,%%xmm2 \n" + "movq 0x00(%1,%3,4),%%xmm0 \n" + "pshufb %%xmm5,%%xmm2 \n" + "pshufb %%xmm4,%%xmm0 \n" + "pxor %%xmm6,%%xmm2 \n" + "pmaddubsw %%xmm2,%%xmm0 \n" + "psrlw $0x7,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movd %%xmm0,(%0) \n" - LABELALIGN - "99: \n" - : "+r"(dst_argb), // %0 - "+r"(src_argb), // %1 - "+rm"(dst_width), // %2 - "=&r"(x0), // %3 - "=&r"(x1) // %4 - : "rm"(x), // %5 - "rm"(dx) // %6 - : "memory", "cc", - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" - ); + LABELALIGN "99: \n" // clang-format error. + : "+r"(dst_argb), // %0 + "+r"(src_argb), // %1 + "+rm"(dst_width), // %2 + "=&r"(x0), // %3 + "=&r"(x1) // %4 + : "rm"(x), // %5 + "rm"(dx) // %6 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } // Divide num by div and return as 16.16 fixed point result. diff --git a/source/scale_neon64.cc b/source/scale_neon64.cc index 3ce2e2aa2..494a9cfbf 100644 --- a/source/scale_neon64.cc +++ b/source/scale_neon64.cc @@ -807,9 +807,9 @@ void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb, "subs %w2, %w2, #4 \n" // 4 pixels per loop. "st1 {v0.16b}, [%1], #16 \n" "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 : "r"((int64_t)(src_stepx * 4)) // %3 : "memory", "cc", "v0"); } @@ -851,10 +851,10 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb, "subs %w3, %w3, #4 \n" // 4 pixels per loop. 
"st1 {v0.16b}, [%2], #16 \n" "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_stride), // %1 - "+r"(dst_argb), // %2 - "+r"(dst_width) // %3 + : "+r"(src_argb), // %0 + "+r"(src_stride), // %1 + "+r"(dst_argb), // %2 + "+r"(dst_width) // %3 : "r"((int64_t)(src_stepx * 4)) // %4 : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); } diff --git a/source/video_common.cc b/source/video_common.cc index bbefcdfbf..92384c050 100644 --- a/source/video_common.cc +++ b/source/video_common.cc @@ -15,14 +15,13 @@ namespace libyuv { extern "C" { #endif -#define ARRAY_SIZE(x) (int)(sizeof(x) / sizeof(x[0])) - struct FourCCAliasEntry { uint32_t alias; uint32_t canonical; }; -static const struct FourCCAliasEntry kFourCCAliases[] = { +#define NUM_ALIASES 18 +static const struct FourCCAliasEntry kFourCCAliases[NUM_ALIASES] = { {FOURCC_IYUV, FOURCC_I420}, {FOURCC_YU12, FOURCC_I420}, {FOURCC_YU16, FOURCC_I422}, @@ -48,7 +47,7 @@ static const struct FourCCAliasEntry kFourCCAliases[] = { LIBYUV_API uint32_t CanonicalFourCC(uint32_t fourcc) { int i; - for (i = 0; i < ARRAY_SIZE(kFourCCAliases); ++i) { + for (i = 0; i < NUM_ALIASES; ++i) { if (kFourCCAliases[i].alias == fourcc) { return kFourCCAliases[i].canonical; } diff --git a/unit_test/color_test.cc b/unit_test/color_test.cc index 46adf0ef3..4bb448d56 100644 --- a/unit_test/color_test.cc +++ b/unit_test/color_test.cc @@ -63,10 +63,10 @@ namespace libyuv { \ /* The test is overall for color conversion matrix being reversible, so */ \ /* this initializes the pixel with 2x2 blocks to eliminate subsampling. */ \ - uint8_t* p = orig_y; \ + uint8_t* p = orig_y; \ for (int y = 0; y < benchmark_height_ - HS1; y += HS) { \ for (int x = 0; x < benchmark_width_ - 1; x += 2) { \ - uint8_t r = static_cast(fastrand()); \ + uint8_t r = static_cast(fastrand()); \ p[0] = r; \ p[1] = r; \ p[HN] = r; \ @@ -74,7 +74,7 @@ namespace libyuv { p += 2; \ } \ if (benchmark_width_ & 1) { \ - uint8_t r = static_cast(fastrand()); \ + uint8_t r = static_cast(fastrand()); \ p[0] = r; \ p[HN] = r; \ p += 1; \ @@ -83,13 +83,13 @@ namespace libyuv { } \ if ((benchmark_height_ & 1) && HS == 2) { \ for (int x = 0; x < benchmark_width_ - 1; x += 2) { \ - uint8_t r = static_cast(fastrand()); \ + uint8_t r = static_cast(fastrand()); \ p[0] = r; \ p[1] = r; \ p += 2; \ } \ if (benchmark_width_ & 1) { \ - uint8_t r = static_cast(fastrand()); \ + uint8_t r = static_cast(fastrand()); \ p[0] = r; \ p += 1; \ } \ diff --git a/unit_test/compare_test.cc b/unit_test/compare_test.cc index 61b288c80..c8546b023 100644 --- a/unit_test/compare_test.cc +++ b/unit_test/compare_test.cc @@ -22,7 +22,9 @@ namespace libyuv { // hash seed of 5381 recommended. -static uint32_t ReferenceHashDjb2(const uint8_t* src, uint64_t count, uint32_t seed) { +static uint32_t ReferenceHashDjb2(const uint8_t* src, + uint64_t count, + uint32_t seed) { uint32_t hash = seed; if (count > 0) { do { diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc index a4ab809f8..e8ef2349e 100644 --- a/unit_test/convert_test.cc +++ b/unit_test/convert_test.cc @@ -173,8 +173,8 @@ TESTPLANARTOP(H420, uint8_t, 1, 2, 2, H010, uint16_t, 2, 2, 2) SUBSAMPLE(kHeight, SUBSAMP_Y)); \ align_buffer_page_end(dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * \ SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - uint8_t* src_u = src_uv + OFF_U; \ - uint8_t* src_v = src_uv + (PIXEL_STRIDE == 1 ? kSizeUV : OFF_V); \ + uint8_t* src_u = src_uv + OFF_U; \ + uint8_t* src_v = src_uv + (PIXEL_STRIDE == 1 ? 
kSizeUV : OFF_V); \ int src_stride_uv = SUBSAMPLE(kWidth, SUBSAMP_X) * PIXEL_STRIDE; \ for (int i = 0; i < kHeight; ++i) \ for (int j = 0; j < kWidth; ++j) \ @@ -2016,56 +2016,57 @@ TEST_F(LibYUVConvertTest, ARGBToAR30Row_Opt) { #endif // HAS_ARGBTOAR30ROW_AVX2 // TODO(fbarchard): Fix clamping issue affected by U channel. -#define TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \ - ALIGN, YALIGN, W1280, DIFF, N, NEG, SOFF, DOFF) \ - TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \ - const int kWidth = ((W1280) > 0) ? (W1280) : 1; \ - const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \ - const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \ - const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \ - const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \ - const int kBpc = 2; \ - align_buffer_page_end(src_y, kWidth* kHeight* kBpc + SOFF); \ - align_buffer_page_end(src_u, kSizeUV* kBpc + SOFF); \ - align_buffer_page_end(src_v, kSizeUV* kBpc + SOFF); \ - align_buffer_page_end(dst_argb_c, kStrideB* kHeight + DOFF); \ - align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + DOFF); \ - for (int i = 0; i < kWidth * kHeight; ++i) { \ - reinterpret_cast(src_y + SOFF)[i] = (fastrand() & 0x3ff); \ - } \ - for (int i = 0; i < kSizeUV; ++i) { \ - reinterpret_cast(src_u + SOFF)[i] = (fastrand() & 0x3ff); \ - reinterpret_cast(src_v + SOFF)[i] = (fastrand() & 0x3ff); \ - } \ - memset(dst_argb_c + DOFF, 1, kStrideB * kHeight); \ - memset(dst_argb_opt + DOFF, 101, kStrideB * kHeight); \ - MaskCpuFlags(disable_cpu_flags_); \ - FMT_PLANAR##To##FMT_B(reinterpret_cast(src_y + SOFF), kWidth, \ - reinterpret_cast(src_u + SOFF), kStrideUV, \ - reinterpret_cast(src_v + SOFF), kStrideUV, \ - dst_argb_c + DOFF, kStrideB, kWidth, NEG kHeight); \ - MaskCpuFlags(benchmark_cpu_info_); \ - for (int i = 0; i < benchmark_iterations_; ++i) { \ - FMT_PLANAR##To##FMT_B( \ - reinterpret_cast(src_y + SOFF), kWidth, \ - reinterpret_cast(src_u + SOFF), kStrideUV, \ - reinterpret_cast(src_v + SOFF), kStrideUV, \ - dst_argb_opt + DOFF, kStrideB, kWidth, NEG kHeight); \ - } \ - int max_diff = 0; \ - for (int i = 0; i < kWidth * BPP_B * kHeight; ++i) { \ - int abs_diff = abs(static_cast(dst_argb_c[i + DOFF]) - \ - static_cast(dst_argb_opt[i + DOFF])); \ - if (abs_diff > max_diff) { \ - max_diff = abs_diff; \ - } \ - } \ - EXPECT_LE(max_diff, DIFF); \ - free_aligned_buffer_page_end(src_y); \ - free_aligned_buffer_page_end(src_u); \ - free_aligned_buffer_page_end(src_v); \ - free_aligned_buffer_page_end(dst_argb_c); \ - free_aligned_buffer_page_end(dst_argb_opt); \ +#define TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \ + ALIGN, YALIGN, W1280, DIFF, N, NEG, SOFF, DOFF) \ + TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \ + const int kWidth = ((W1280) > 0) ? 
(W1280) : 1; \ + const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \ + const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \ + const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \ + const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \ + const int kBpc = 2; \ + align_buffer_page_end(src_y, kWidth* kHeight* kBpc + SOFF); \ + align_buffer_page_end(src_u, kSizeUV* kBpc + SOFF); \ + align_buffer_page_end(src_v, kSizeUV* kBpc + SOFF); \ + align_buffer_page_end(dst_argb_c, kStrideB* kHeight + DOFF); \ + align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + DOFF); \ + for (int i = 0; i < kWidth * kHeight; ++i) { \ + reinterpret_cast(src_y + SOFF)[i] = (fastrand() & 0x3ff); \ + } \ + for (int i = 0; i < kSizeUV; ++i) { \ + reinterpret_cast(src_u + SOFF)[i] = (fastrand() & 0x3ff); \ + reinterpret_cast(src_v + SOFF)[i] = (fastrand() & 0x3ff); \ + } \ + memset(dst_argb_c + DOFF, 1, kStrideB * kHeight); \ + memset(dst_argb_opt + DOFF, 101, kStrideB * kHeight); \ + MaskCpuFlags(disable_cpu_flags_); \ + FMT_PLANAR##To##FMT_B( \ + reinterpret_cast(src_y + SOFF), kWidth, \ + reinterpret_cast(src_u + SOFF), kStrideUV, \ + reinterpret_cast(src_v + SOFF), kStrideUV, \ + dst_argb_c + DOFF, kStrideB, kWidth, NEG kHeight); \ + MaskCpuFlags(benchmark_cpu_info_); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + FMT_PLANAR##To##FMT_B( \ + reinterpret_cast(src_y + SOFF), kWidth, \ + reinterpret_cast(src_u + SOFF), kStrideUV, \ + reinterpret_cast(src_v + SOFF), kStrideUV, \ + dst_argb_opt + DOFF, kStrideB, kWidth, NEG kHeight); \ + } \ + int max_diff = 0; \ + for (int i = 0; i < kWidth * BPP_B * kHeight; ++i) { \ + int abs_diff = abs(static_cast(dst_argb_c[i + DOFF]) - \ + static_cast(dst_argb_opt[i + DOFF])); \ + if (abs_diff > max_diff) { \ + max_diff = abs_diff; \ + } \ + } \ + EXPECT_LE(max_diff, DIFF); \ + free_aligned_buffer_page_end(src_y); \ + free_aligned_buffer_page_end(src_u); \ + free_aligned_buffer_page_end(src_v); \ + free_aligned_buffer_page_end(dst_argb_c); \ + free_aligned_buffer_page_end(dst_argb_opt); \ } #define TESTPLANAR16TOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc index f5a392c8e..c55234ba2 100644 --- a/unit_test/planar_test.cc +++ b/unit_test/planar_test.cc @@ -2061,8 +2061,8 @@ int TestHalfFloatPlane(int benchmark_width, MaskCpuFlags(disable_cpu_flags); for (j = 0; j < benchmark_iterations; j++) { HalfFloatPlane(reinterpret_cast(orig_y), benchmark_width * 2, - reinterpret_cast(dst_c), benchmark_width * 2, scale, - benchmark_width, benchmark_height); + reinterpret_cast(dst_c), benchmark_width * 2, + scale, benchmark_width, benchmark_height); } // Enable optimizations. 
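The TestHalfFloatPlane hunks here exercise HalfFloatPlane, which scales 16-bit samples and stores them as IEEE binary16. A minimal sketch of that conversion, assuming non-negative inputs that land in the normal half range after scaling and truncating rather than rounding the mantissa; the names are illustrative and this is not libyuv's implementation:

#include <stdint.h>
#include <string.h>

// Truncating float -> binary16 for values in [0, 65504); zero is handled
// explicitly, subnormals/NaN/Inf omitted for brevity.
static uint16_t FloatToHalfTrunc(float value) {
  uint32_t bits;
  memcpy(&bits, &value, sizeof(bits));
  if ((bits & 0x7fffffffu) == 0) {
    return (uint16_t)(bits >> 16);  // +/- zero
  }
  uint16_t sign = (uint16_t)((bits >> 16) & 0x8000u);
  int exponent = (int)((bits >> 23) & 0xff) - 127 + 15;  // rebias 8 -> 5 bits
  uint16_t mantissa = (uint16_t)((bits >> 13) & 0x3ffu);  // keep top 10 bits
  return (uint16_t)(sign | (exponent << 10) | mantissa);
}

// Reference for what the test exercises: scale each sample, store as half.
static void HalfFloatPlaneRef(const uint16_t* src, uint16_t* dst, float scale,
                              int count) {
  for (int i = 0; i < count; ++i) {
    dst[i] = FloatToHalfTrunc((float)src[i] * scale);
  }
}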
@@ -2075,8 +2075,9 @@ int TestHalfFloatPlane(int benchmark_width, int max_diff = 0; for (i = 0; i < y_plane_size / 2; ++i) { - int abs_diff = abs(static_cast(reinterpret_cast(dst_c)[i]) - - static_cast(reinterpret_cast(dst_opt)[i])); + int abs_diff = + abs(static_cast(reinterpret_cast(dst_c)[i]) - + static_cast(reinterpret_cast(dst_opt)[i])); if (abs_diff > max_diff) { max_diff = abs_diff; } @@ -2788,8 +2789,9 @@ TEST_F(LibYUVPlanarTest, Convert8To16Plane) { MaskCpuFlags(disable_cpu_flags_); Convert8To16Plane(src_pixels_y, benchmark_width_, - reinterpret_cast(dst_pixels_y_c), benchmark_width_, - 1024, benchmark_width_, benchmark_height_); + reinterpret_cast(dst_pixels_y_c), + benchmark_width_, 1024, benchmark_width_, + benchmark_height_); MaskCpuFlags(benchmark_cpu_info_); for (int i = 0; i < benchmark_iterations_; ++i) { @@ -3214,8 +3216,9 @@ TEST_F(LibYUVPlanarTest, TestGaussCol_Opt) { EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); } - EXPECT_EQ(dst_pixels_c[0], static_cast(0 * 1 + 640 * 4 + 640 * 2 * 6 + - 640 * 3 * 4 + 640 * 4 * 1)); + EXPECT_EQ(dst_pixels_c[0], + static_cast(0 * 1 + 640 * 4 + 640 * 2 * 6 + 640 * 3 * 4 + + 640 * 4 * 1)); EXPECT_EQ(dst_pixels_c[639], static_cast(30704)); } diff --git a/unit_test/scale_argb_test.cc b/unit_test/scale_argb_test.cc index 0125254a4..a1be85b8d 100644 --- a/unit_test/scale_argb_test.cc +++ b/unit_test/scale_argb_test.cc @@ -48,7 +48,8 @@ static int ARGBTestFilter(int src_width, } MemRandomize(src_argb, src_argb_plane_size); - int64_t dst_argb_plane_size = (dst_width + b * 2) * (dst_height + b * 2) * 4LL; + int64_t dst_argb_plane_size = + (dst_width + b * 2) * (dst_height + b * 2) * 4LL; int dst_stride_argb = (b * 2 + dst_width) * 4; align_buffer_page_end(dst_argb_c, dst_argb_plane_size); @@ -310,6 +311,7 @@ TEST_SCALETO(ARGBScale, 1280, 720) #undef TEST_SCALETO // Scale with YUV conversion to ARGB and clipping. +// TODO(fbarchard): Add fourcc support. All 4 ARGB formats is easy to support. LIBYUV_API int YUVToARGBScaleReference2(const uint8_t* src_y, int src_stride_y, @@ -317,12 +319,12 @@ int YUVToARGBScaleReference2(const uint8_t* src_y, int src_stride_u, const uint8_t* src_v, int src_stride_v, - uint32_t /* src_fourcc */, // TODO: Add support. + uint32 /* src_fourcc */, int src_width, int src_height, uint8_t* dst_argb, int dst_stride_argb, - uint32_t /* dst_fourcc */, // TODO: Add support. 
diff --git a/unit_test/scale_argb_test.cc b/unit_test/scale_argb_test.cc
index 0125254a4..a1be85b8d 100644
--- a/unit_test/scale_argb_test.cc
+++ b/unit_test/scale_argb_test.cc
@@ -48,7 +48,8 @@ static int ARGBTestFilter(int src_width,
   }
   MemRandomize(src_argb, src_argb_plane_size);
 
-  int64_t dst_argb_plane_size = (dst_width + b * 2) * (dst_height + b * 2) * 4LL;
+  int64_t dst_argb_plane_size =
+      (dst_width + b * 2) * (dst_height + b * 2) * 4LL;
   int dst_stride_argb = (b * 2 + dst_width) * 4;
 
   align_buffer_page_end(dst_argb_c, dst_argb_plane_size);
@@ -310,6 +311,7 @@ TEST_SCALETO(ARGBScale, 1280, 720)
 #undef TEST_SCALETO
 
 // Scale with YUV conversion to ARGB and clipping.
+// TODO(fbarchard): Add fourcc support. All 4 ARGB formats are easy to support.
 LIBYUV_API
 int YUVToARGBScaleReference2(const uint8_t* src_y,
                              int src_stride_y,
@@ -317,12 +319,12 @@ int YUVToARGBScaleReference2(const uint8_t* src_y,
                              int src_stride_u,
                              const uint8_t* src_v,
                              int src_stride_v,
-                             uint32 /* src_fourcc */,  // TODO: Add support.
+                             uint32_t /* src_fourcc */,
                              int src_width,
                              int src_height,
                              uint8_t* dst_argb,
                              int dst_stride_argb,
-                             uint32 /* dst_fourcc */,  // TODO: Add support.
+                             uint32_t /* dst_fourcc */,
                              int dst_width,
                              int dst_height,
                              int clip_x,
@@ -330,7 +332,8 @@ int YUVToARGBScaleReference2(const uint8_t* src_y,
                              int clip_width,
                              int clip_height,
                              enum FilterMode filtering) {
-  uint8_t* argb_buffer = static_cast<uint8_t*>(malloc(src_width * src_height * 4));
+  uint8_t* argb_buffer =
+      static_cast<uint8_t*>(malloc(src_width * src_height * 4));
   int r;
   I420ToARGB(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
              argb_buffer, src_width * 4, src_width, src_height);
@@ -342,7 +345,12 @@ int YUVToARGBScaleReference2(const uint8_t* src_y,
   return r;
 }
 
-static void FillRamp(uint8_t* buf, int width, int height, int v, int dx, int dy) {
+static void FillRamp(uint8_t* buf,
+                     int width,
+                     int height,
+                     int v,
+                     int dx,
+                     int dy) {
   int rv = v;
   for (int y = 0; y < height; ++y) {
     for (int x = 0; x < width; ++x) {
diff --git a/unit_test/unit_test.h b/unit_test/unit_test.h
index 91f6b4cbf..aec5b8d25 100644
--- a/unit_test/unit_test.h
+++ b/unit_test/unit_test.h
@@ -69,10 +69,10 @@ static inline bool SizeValid(int src_width,
   return true;
 }
 
-#define align_buffer_page_end(var, size) \
+#define align_buffer_page_end(var, size)                                \
   uint8_t* var##_mem =                                                  \
       reinterpret_cast<uint8_t*>(malloc(((size) + 4095 + 63) & ~4095)); \
-  uint8_t* var = reinterpret_cast<uint8_t*>( \
+  uint8_t* var = reinterpret_cast<uint8_t*>(                            \
       (intptr_t)(var##_mem + (((size) + 4095 + 63) & ~4095) - (size)) & ~63)
 
 #define free_aligned_buffer_page_end(var) \
diff --git a/util/psnr.cc b/util/psnr.cc
index e9ffa48b8..0196a3939 100644
--- a/util/psnr.cc
+++ b/util/psnr.cc
@@ -39,8 +39,8 @@ typedef unsigned long long uint64_t;  // NOLINT
     !defined(__aarch64__)
 #define HAS_SUMSQUAREERROR_NEON
 static uint32_t SumSquareError_NEON(const uint8_t* src_a,
-                                  const uint8_t* src_b,
-                                  int count) {
+                                    const uint8_t* src_b,
+                                    int count) {
   volatile uint32_t sse;
   asm volatile(
       "vmov.u8 q7, #0 \n"
@@ -74,8 +74,8 @@ static uint32_t SumSquareError_NEON(const uint8_t* src_a,
 #elif !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
 #define HAS_SUMSQUAREERROR_NEON
 static uint32_t SumSquareError_NEON(const uint8_t* src_a,
-                                  const uint8_t* src_b,
-                                  int count) {
+                                    const uint8_t* src_b,
+                                    int count) {
   volatile uint32_t sse;
   asm volatile(
       "eor v16.16b, v16.16b, v16.16b \n"
@@ -108,8 +108,8 @@ static uint32_t SumSquareError_NEON(const uint8_t* src_a,
 #elif !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
 #define HAS_SUMSQUAREERROR_SSE2
 __declspec(naked) static uint32_t SumSquareError_SSE2(const uint8_t* /*src_a*/,
-                                                    const uint8_t* /*src_b*/,
-                                                    int /*count*/) {
+                                                      const uint8_t* /*src_b*/,
+                                                      int /*count*/) {
   __asm {
     mov        eax, [esp + 4]  // src_a
     mov        edx, [esp + 8]  // src_b
@@ -147,8 +147,8 @@ __declspec(naked) static uint32_t SumSquareError_SSE2(const uint8_t* /*src_a*/,
 #elif !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
 #define HAS_SUMSQUAREERROR_SSE2
 static uint32_t SumSquareError_SSE2(const uint8_t* src_a,
-                                  const uint8_t* src_b,
-                                  int count) {
+                                    const uint8_t* src_b,
+                                    int count) {
   uint32_t sse;
   asm volatile(  // NOLINT
       "pxor %%xmm0,%%xmm0 \n"
@@ -229,8 +229,8 @@ static int CpuHasSSE2() {
 #endif  // HAS_SUMSQUAREERROR_SSE2
 
 static uint32_t SumSquareError_C(const uint8_t* src_a,
-                               const uint8_t* src_b,
-                               int count) {
+                                 const uint8_t* src_b,
+                                 int count) {
   uint32_t sse = 0u;
   for (int x = 0; x < count; ++x) {
     int diff = src_a[x] - src_b[x];
@@ -242,8 +242,8 @@ static uint32_t SumSquareError_C(const uint8_t* src_a,
 double ComputeSumSquareError(const uint8_t* src_a,
                              const uint8_t* src_b,
                              int count) {
-  uint32_t (*SumSquareError)(const uint8_t* src_a, const uint8_t* src_b, int count) =
-      SumSquareError_C;
+  uint32_t (*SumSquareError)(const uint8_t* src_a, const uint8_t* src_b,
+                             int count) = SumSquareError_C;
 #if defined(HAS_SUMSQUAREERROR_NEON)
   SumSquareError = SumSquareError_NEON;
 #endif
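The last psnr.cc hunk above shows the file's dispatch idiom: a function pointer starts at the portable C routine and is upgraded when a SIMD variant was compiled in (plus a runtime CPU check for SSE2). A condensed sketch of that idiom follows; MeanSquareError_Sketch is a hypothetical wrapper name, and the HAS_* macros and CpuHasSSE2() refer to the definitions in util/psnr.cc itself.

// Condensed version of the psnr.cc dispatch pattern. If neither HAS_*
// macro is defined, this compiles to the plain C path.
#include <cstdint>

static uint32_t SumSquareError_C(const uint8_t* src_a, const uint8_t* src_b,
                                 int count) {
  uint32_t sse = 0u;
  for (int x = 0; x < count; ++x) {
    int diff = src_a[x] - src_b[x];
    sse += static_cast<uint32_t>(diff * diff);
  }
  return sse;
}

double MeanSquareError_Sketch(const uint8_t* src_a, const uint8_t* src_b,
                              int count) {
  uint32_t (*SumSquareError)(const uint8_t* src_a, const uint8_t* src_b,
                             int count) = SumSquareError_C;
#if defined(HAS_SUMSQUAREERROR_NEON)
  SumSquareError = SumSquareError_NEON;  // compile-time: NEON builds only
#endif
#if defined(HAS_SUMSQUAREERROR_SSE2)
  if (CpuHasSSE2()) {  // runtime check, mirroring psnr.cc
    SumSquareError = SumSquareError_SSE2;
  }
#endif
  return static_cast<double>(SumSquareError(src_a, src_b, count)) / count;
}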
diff --git a/util/ssim.cc b/util/ssim.cc
index 9dce2f251..126c436a7 100644
--- a/util/ssim.cc
+++ b/util/ssim.cc
@@ -262,7 +262,7 @@ double GetSSIMFullKernel(const uint8_t* org,
 
 #define ADD_AND_STORE_FOUR_EPI32(M, OUT)                    \
   do {                                                      \
-    uint32_t tmp[4]; \
+    uint32_t tmp[4];                                        \
     _mm_storeu_si128(reinterpret_cast<__m128i*>(tmp), (M)); \
     (OUT) = tmp[3] + tmp[2] + tmp[1] + tmp[0];              \
   } while (0)
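For reference, the ssim.cc macro touched above is a portable SSE2 horizontal add: it spills the four 32-bit lanes of a __m128i to memory and sums them, with no SSSE3 phaddd required. A small usage sketch follows; the main() driver is illustrative only.

// Usage sketch for ADD_AND_STORE_FOUR_EPI32 (macro copied from the hunk
// above). Sums the four 32-bit lanes of an SSE2 register.
#include <emmintrin.h>  // SSE2 intrinsics
#include <cstdint>
#include <cstdio>

#define ADD_AND_STORE_FOUR_EPI32(M, OUT)                    \
  do {                                                      \
    uint32_t tmp[4];                                        \
    _mm_storeu_si128(reinterpret_cast<__m128i*>(tmp), (M)); \
    (OUT) = tmp[3] + tmp[2] + tmp[1] + tmp[0];              \
  } while (0)

int main() {
  __m128i v = _mm_set_epi32(4, 3, 2, 1);  // lanes hold 1, 2, 3, 4
  uint32_t sum;
  ADD_AND_STORE_FOUR_EPI32(v, sum);
  printf("%u\n", sum);  // prints 10
  return 0;
}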