diff --git a/source/convert.cc b/source/convert.cc
index 8a4fcf06e..06d312f96 100644
--- a/source/convert.cc
+++ b/source/convert.cc
@@ -15,8 +15,8 @@
 #include "libyuv/planar_functions.h"
 #include "libyuv/rotate.h"
 #include "libyuv/row.h"
-#include "libyuv/scale.h"  // For ScalePlane()
-#include "libyuv/scale_uv.h"  // For UVScale()
+#include "libyuv/scale.h"     // For ScalePlane()
+#include "libyuv/scale_uv.h"  // For UVScale()
 
 #ifdef __cplusplus
 namespace libyuv {
diff --git a/source/row_gcc.cc b/source/row_gcc.cc
index f91110034..1aea6db9e 100644
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -4374,15 +4374,15 @@ void SplitARGBRow_SSSE3(const uint8_t* src_argb,
       "lea        8(%1),%1                       \n"
      "subl       $0x8,%5                        \n"
       "jg         1b                             \n"
-      : "+r"(src_argb),             // %0
-        "+r"(dst_r),                // %1
-        "+r"(dst_g),                // %2
-        "+r"(dst_b),                // %3
-        "+r"(dst_a),                // %4
+      : "+r"(src_argb),  // %0
+        "+r"(dst_r),     // %1
+        "+r"(dst_g),     // %2
+        "+r"(dst_b),     // %3
+        "+r"(dst_a),     // %4
 #if defined(__i386__)
-        "+m"(width)                 // %5
+        "+m"(width)  // %5
 #else
-        "+rm"(width)                // %5
+        "+rm"(width)  // %5
 #endif
       : "m"(kShuffleMaskARGBSplit)  // %6
       : "memory", "cc", "xmm0", "xmm1", "xmm2");
@@ -4465,15 +4465,15 @@ void SplitARGBRow_AVX2(const uint8_t* src_argb,
       "lea        16(%1),%1                      \n"
       "subl       $0x10,%5                       \n"
       "jg         1b                             \n"
-      : "+r"(src_argb),                    // %0
-        "+r"(dst_r),                       // %1
-        "+r"(dst_g),                       // %2
-        "+r"(dst_b),                       // %3
-        "+r"(dst_a),                       // %4
+      : "+r"(src_argb),  // %0
+        "+r"(dst_r),     // %1
+        "+r"(dst_g),     // %2
+        "+r"(dst_b),     // %3
+        "+r"(dst_a),     // %4
 #if defined(__i386__)
-        "+m"(width)                        // %5
+        "+m"(width)  // %5
 #else
-        "+rm"(width)                       // %5
+        "+rm"(width)  // %5
 #endif
       : "m"(kShuffleMaskARGBSplit_AVX2),   // %6
         "m"(kShuffleMaskARGBPermute_AVX2)  // %7
@@ -7186,7 +7186,7 @@ void HalfFloatRow_AVX2(const uint16_t* src,
 #if defined(__x86_64__)
       : "x"(scale)  // %3
 #else
-      : "m"(scale)    // %3
+      : "m"(scale)  // %3
 #endif
       : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5");
 }
@@ -7224,7 +7224,7 @@ void HalfFloatRow_F16C(const uint16_t* src,
 #if defined(__x86_64__)
       : "x"(scale)  // %3
 #else
-      : "m"(scale)    // %3
+      : "m"(scale)  // %3
 #endif
       : "memory", "cc", "xmm2", "xmm3", "xmm4");
 }
diff --git a/source/scale_any.cc b/source/scale_any.cc
index 4257d17b9..47b283863 100644
--- a/source/scale_any.cc
+++ b/source/scale_any.cc
@@ -700,28 +700,28 @@ SUH2LANY(ScaleRowUp2_Linear_16_Any_NEON,
 
 // Scale up 2 times using bilinear filter.
 // This function produces 2 rows at a time.
-#define SU2BLANY(NAME, SIMD, C, MASK, PTYPE)                               \
-  void NAME(const PTYPE* src_ptr, ptrdiff_t src_stride, PTYPE* dst_ptr,    \
-            ptrdiff_t dst_stride, int dst_width) {                         \
-    int work_width = (dst_width - 1) & ~1;                                 \
-    int r = work_width & MASK;                                             \
-    int n = work_width & ~MASK;                                            \
-    const PTYPE* sa = src_ptr;                                             \
-    const PTYPE* sb = src_ptr + src_stride;                                \
-    PTYPE* da = dst_ptr;                                                   \
-    PTYPE* db = dst_ptr + dst_stride;                                      \
-    da[0] = (3 * sa[0] + sb[0]) >> 2;                                      \
-    db[0] = (sa[0] + 3 * sb[0]) >> 2;                                      \
-    if (work_width > 0) {                                                  \
-      if (n != 0) {                                                        \
-        SIMD(sa, sb - sa, da + 1, db - da, n);                             \
-      }                                                                    \
-      C(sa + (n / 2), sb - sa, da + n + 1, db - da, r);                    \
-    }                                                                      \
-    da[dst_width - 1] =                                                    \
-        (3 * sa[(dst_width - 1) / 2] + sb[(dst_width - 1) / 2]) >> 2;      \
-    db[dst_width - 1] =                                                    \
-        (sa[(dst_width - 1) / 2] + 3 * sb[(dst_width - 1) / 2]) >> 2;      \
+#define SU2BLANY(NAME, SIMD, C, MASK, PTYPE)                               \
+  void NAME(const PTYPE* src_ptr, ptrdiff_t src_stride, PTYPE* dst_ptr,    \
+            ptrdiff_t dst_stride, int dst_width) {                         \
+    int work_width = (dst_width - 1) & ~1;                                 \
+    int r = work_width & MASK;                                             \
+    int n = work_width & ~MASK;                                            \
+    const PTYPE* sa = src_ptr;                                             \
+    const PTYPE* sb = src_ptr + src_stride;                                \
+    PTYPE* da = dst_ptr;                                                   \
+    PTYPE* db = dst_ptr + dst_stride;                                      \
+    da[0] = (3 * sa[0] + sb[0] + 2) >> 2;                                  \
+    db[0] = (sa[0] + 3 * sb[0] + 2) >> 2;                                  \
+    if (work_width > 0) {                                                  \
+      if (n != 0) {                                                        \
+        SIMD(sa, sb - sa, da + 1, db - da, n);                             \
+      }                                                                    \
+      C(sa + (n / 2), sb - sa, da + n + 1, db - da, r);                    \
+    }                                                                      \
+    da[dst_width - 1] =                                                    \
+        (3 * sa[(dst_width - 1) / 2] + sb[(dst_width - 1) / 2] + 2) >> 2;  \
+    db[dst_width - 1] =                                                    \
+        (sa[(dst_width - 1) / 2] + 3 * sb[(dst_width - 1) / 2] + 2) >> 2;  \
   }
 
 SU2BLANY(ScaleRowUp2_Bilinear_Any_C,
@@ -856,10 +856,10 @@ SBUH2LANY(ScaleUVRowUp2_Linear_Any_NEON,
     const PTYPE* sb = src_ptr + src_stride;                                \
     PTYPE* da = dst_ptr;                                                   \
     PTYPE* db = dst_ptr + dst_stride;                                      \
-    da[0] = (3 * sa[0] + sb[0]) >> 2;                                      \
-    db[0] = (sa[0] + 3 * sb[0]) >> 2;                                      \
-    da[1] = (3 * sa[1] + sb[1]) >> 2;                                      \
-    db[1] = (sa[1] + 3 * sb[1]) >> 2;                                      \
+    da[0] = (3 * sa[0] + sb[0] + 2) >> 2;                                  \
+    db[0] = (sa[0] + 3 * sb[0] + 2) >> 2;                                  \
+    da[1] = (3 * sa[1] + sb[1] + 2) >> 2;                                  \
+    db[1] = (sa[1] + 3 * sb[1] + 2) >> 2;                                  \
     if (work_width > 0) {                                                  \
       if (n != 0) {                                                        \
         SIMD(sa, sb - sa, da + 2, db - da, n);                             \
@@ -867,13 +867,17 @@ SBUH2LANY(ScaleUVRowUp2_Linear_Any_NEON,
       C(sa + n, sb - sa, da + 2 * n + 2, db - da, r);                      \
     }                                                                      \
     da[2 * dst_width - 2] = (3 * sa[((dst_width + 1) & ~1) - 2] +          \
-                             sb[((dst_width + 1) & ~1) - 2]) >> 2;         \
+                             sb[((dst_width + 1) & ~1) - 2] + 2) >>        \
+                            2;                                             \
     db[2 * dst_width - 2] = (sa[((dst_width + 1) & ~1) - 2] +              \
-                             3 * sb[((dst_width + 1) & ~1) - 2]) >> 2;     \
+                             3 * sb[((dst_width + 1) & ~1) - 2] + 2) >>    \
+                            2;                                             \
     da[2 * dst_width - 1] = (3 * sa[((dst_width + 1) & ~1) - 1] +          \
-                             sb[((dst_width + 1) & ~1) - 1]) >> 2;         \
+                             sb[((dst_width + 1) & ~1) - 1] + 2) >>        \
+                            2;                                             \
     db[2 * dst_width - 1] = (sa[((dst_width + 1) & ~1) - 1] +              \
-                             3 * sb[((dst_width + 1) & ~1) - 1]) >> 2;     \
+                             3 * sb[((dst_width + 1) & ~1) - 1] + 2) >>    \
+                            2;                                             \
   }
 
 SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_C,
diff --git a/source/scale_common.cc b/source/scale_common.cc
index 4af843216..f4f233973 100644
--- a/source/scale_common.cc
+++ b/source/scale_common.cc
@@ -1232,21 +1232,29 @@ void ScaleUVRowUp2_Bilinear_C(const uint8_t* src_ptr,
   assert((dst_width % 2 == 0) && (dst_width >= 0));
   for (x = 0; x < src_width; ++x) {
     d[4 * x + 0] = (s[2 * x + 0] * 9 + s[2 * x + 2] * 3 + t[2 * x + 0] * 3 +
-                    t[2 * x + 2] * 1 + 8) >> 4;
+                    t[2 * x + 2] * 1 + 8) >>
+                   4;
     d[4 * x + 1] = (s[2 * x + 1] * 9 + s[2 * x + 3] * 3 + t[2 * x + 1] * 3 +
-                    t[2 * x + 3] * 1 + 8) >> 4;
+                    t[2 * x + 3] * 1 + 8) >>
+                   4;
     d[4 * x + 2] = (s[2 * x + 0] * 3 + s[2 * x + 2] * 9 + t[2 * x + 0] * 1 +
-                    t[2 * x + 2] * 3 + 8) >> 4;
+                    t[2 * x + 2] * 3 + 8) >>
+                   4;
     d[4 * x + 3] = (s[2 * x + 1] * 3 + s[2 * x + 3] * 9 + t[2 * x + 1] * 1 +
-                    t[2 * x + 3] * 3 + 8) >> 4;
+                    t[2 * x + 3] * 3 + 8) >>
+                   4;
     e[4 * x + 0] = (s[2 * x + 0] * 3 + s[2 * x + 2] * 1 + t[2 * x + 0] * 9 +
-                    t[2 * x + 2] * 3 + 8) >> 4;
+                    t[2 * x + 2] * 3 + 8) >>
+                   4;
     e[4 * x + 1] = (s[2 * x + 1] * 3 + s[2 * x + 3] * 1 + t[2 * x + 1] * 9 +
-                    t[2 * x + 3] * 3 + 8) >> 4;
+                    t[2 * x + 3] * 3 + 8) >>
+                   4;
     e[4 * x + 2] = (s[2 * x + 0] * 1 + s[2 * x + 2] * 3 + t[2 * x + 0] * 3 +
-                    t[2 * x + 2] * 9 + 8) >> 4;
+                    t[2 * x + 2] * 9 + 8) >>
+                   4;
     e[4 * x + 3] = (s[2 * x + 1] * 1 + s[2 * x + 3] * 3 + t[2 * x + 1] * 3 +
-                    t[2 * x + 3] * 9 + 8) >> 4;
+                    t[2 * x + 3] * 9 + 8) >>
+                   4;
   }
 }
 
diff --git a/source/scale_gcc.cc b/source/scale_gcc.cc
index 226e0a956..9563e5bb6 100644
--- a/source/scale_gcc.cc
+++ b/source/scale_gcc.cc
@@ -196,8 +196,7 @@ void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
                         uint8_t* dst_ptr,
                         int dst_width) {
   (void)src_stride;
-  asm volatile(
-      LABELALIGN
+  asm volatile(LABELALIGN
       "1:                                        \n"
       "vmovdqu    (%0),%%ymm0                    \n"
       "vmovdqu    0x20(%0),%%ymm1                \n"
@@ -211,11 +210,11 @@ void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
       "sub        $0x20,%2                       \n"
       "jg         1b                             \n"
       "vzeroupper                                \n"
-      : "+r"(src_ptr),   // %0
-        "+r"(dst_ptr),   // %1
-        "+r"(dst_width)  // %2
-        ::"memory",
-        "cc", "xmm0", "xmm1");
+               : "+r"(src_ptr),   // %0
+                 "+r"(dst_ptr),   // %1
+                 "+r"(dst_width)  // %2
+                 ::"memory",
+                 "cc", "xmm0", "xmm1");
 }
 
 void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr,
@@ -483,8 +482,7 @@ void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
         "m"(kShuf1),  // %1
         "m"(kShuf2)   // %2
       );
-  asm volatile(
-      LABELALIGN
+  asm volatile(LABELALIGN
       "1:                                        \n"
       "movdqu     (%0),%%xmm0                    \n"
       "movdqu     0x10(%0),%%xmm2                \n"
@@ -500,11 +498,11 @@ void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
       "lea        0x18(%1),%1                    \n"
       "sub        $0x18,%2                       \n"
       "jg         1b                             \n"
-      : "+r"(src_ptr),   // %0
-        "+r"(dst_ptr),   // %1
-        "+r"(dst_width)  // %2
-        ::"memory",
-        "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+               : "+r"(src_ptr),   // %0
+                 "+r"(dst_ptr),   // %1
+                 "+r"(dst_width)  // %2
+                 ::"memory",
+                 "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
 }
 
 void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
@@ -529,8 +527,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
         "m"(kMadd11),  // %1
         "m"(kRound34)  // %2
       );
-  asm volatile(
-      LABELALIGN
+  asm volatile(LABELALIGN
       "1:                                        \n"
       "movdqu     (%0),%%xmm6                    \n"
       "movdqu     0x00(%0,%3,1),%%xmm7           \n"
@@ -563,13 +560,13 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
       "lea        0x18(%1),%1                    \n"
       "sub        $0x18,%2                       \n"
       "jg         1b                             \n"
-      : "+r"(src_ptr),                // %0
-        "+r"(dst_ptr),                // %1
-        "+r"(dst_width)               // %2
-      : "r"((intptr_t)(src_stride)),  // %3
-        "m"(kMadd21)                  // %4
-      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
-        "xmm7");
+               : "+r"(src_ptr),                // %0
+                 "+r"(dst_ptr),                // %1
+                 "+r"(dst_width)               // %2
+               : "r"((intptr_t)(src_stride)),  // %3
+                 "m"(kMadd21)                  // %4
+               : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4",
+                 "xmm5", "xmm6", "xmm7");
 }
 
 void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
@@ -595,8 +592,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
         "m"(kRound34)  // %2
       );
 
-  asm volatile(
-      LABELALIGN
+  asm volatile(LABELALIGN
       "1:                                        \n"
       "movdqu     (%0),%%xmm6                    \n"
       "movdqu     0x00(%0,%3,1),%%xmm7           \n"
@@ -632,13 +628,13 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
       "lea        0x18(%1),%1                    \n"
       "sub        $0x18,%2                       \n"
"jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)), // %3 - "m"(kMadd21) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "m"(kMadd21) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", + "xmm6", "xmm7"); } void ScaleRowDown38_SSSE3(const uint8_t* src_ptr, @@ -687,8 +683,7 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr, "m"(kShufAb2), // %2 "m"(kScaleAb2) // %3 ); - asm volatile( - LABELALIGN + asm volatile(LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x00(%0,%3,1),%%xmm1 \n" @@ -709,11 +704,12 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr, "lea 0x6(%1),%1 \n" "sub $0x6,%2 \n" "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", + "xmm6"); } void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, @@ -730,8 +726,7 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, "m"(kShufAc3), // %1 "m"(kScaleAc33) // %2 ); - asm volatile( - LABELALIGN + asm volatile(LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x00(%0,%3,1),%%xmm6 \n" @@ -771,12 +766,12 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, "lea 0x6(%1),%1 \n" "sub $0x6,%2 \n" "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", + "xmm6", "xmm7"); } #ifdef HAS_SCALEROWUP2LINEAR_SSE2 @@ -1601,11 +1596,10 @@ void ScaleRowUp2_Bilinear_16_AVX2_Full(const uint16_t* src_ptr, void ScaleAddRow_SSE2(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { - asm volatile( - "pxor %%xmm5,%%xmm5 \n" + asm volatile("pxor %%xmm5,%%xmm5 \n" - // 16 pixel loop. - LABELALIGN + // 16 pixel loop. 
+ LABELALIGN "1: \n" "movdqu (%0),%%xmm3 \n" "lea 0x10(%0),%0 \n" // src_ptr += 16 @@ -1621,11 +1615,11 @@ void ScaleAddRow_SSE2(const uint8_t* src_ptr, "lea 0x20(%1),%1 \n" "sub $0x10,%2 \n" "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(src_width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(src_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #ifdef HAS_SCALEADDROW_AVX2 @@ -1633,10 +1627,9 @@ void ScaleAddRow_SSE2(const uint8_t* src_ptr, void ScaleAddRow_AVX2(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { - asm volatile( - "vpxor %%ymm5,%%ymm5,%%ymm5 \n" + asm volatile("vpxor %%ymm5,%%ymm5,%%ymm5 \n" - LABELALIGN + LABELALIGN "1: \n" "vmovdqu (%0),%%ymm3 \n" "lea 0x20(%0),%0 \n" // src_ptr += 32 @@ -1651,11 +1644,11 @@ void ScaleAddRow_AVX2(const uint8_t* src_ptr, "sub $0x20,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(src_width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(src_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #endif // HAS_SCALEADDROW_AVX2 @@ -1772,8 +1765,7 @@ void ScaleColsUp2_SSE2(uint8_t* dst_ptr, int dx) { (void)x; (void)dx; - asm volatile( - LABELALIGN + asm volatile(LABELALIGN "1: \n" "movdqu (%1),%%xmm0 \n" "lea 0x10(%1),%1 \n" @@ -1786,11 +1778,11 @@ void ScaleColsUp2_SSE2(uint8_t* dst_ptr, "sub $0x20,%2 \n" "jg 1b \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(dst_width) // %2 - ::"memory", - "cc", "xmm0", "xmm1"); + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1"); } void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb, @@ -1798,8 +1790,7 @@ void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb, uint8_t* dst_argb, int dst_width) { (void)src_stride; - asm volatile( - LABELALIGN + asm volatile(LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -1809,11 +1800,11 @@ void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb, "lea 0x10(%1),%1 \n" "sub $0x4,%2 \n" "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - ::"memory", - "cc", "xmm0", "xmm1"); + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1"); } void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb, @@ -1821,8 +1812,7 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb, uint8_t* dst_argb, int dst_width) { (void)src_stride; - asm volatile( - LABELALIGN + asm volatile(LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -1835,19 +1825,18 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb, "lea 0x10(%1),%1 \n" "sub $0x4,%2 \n" "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - ::"memory", - "cc", "xmm0", "xmm1"); + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1"); } void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb, ptrdiff_t src_stride, uint8_t* dst_argb, int dst_width) { - asm volatile( - LABELALIGN + asm volatile(LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -1864,11 +1853,11 @@ void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb, "lea 0x10(%1),%1 \n" "sub $0x4,%2 \n" "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - : 
"r"((intptr_t)(src_stride)) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); } // Reads 4 pixels at a time. @@ -2032,8 +2021,7 @@ void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb, int dx) { (void)x; (void)dx; - asm volatile( - LABELALIGN + asm volatile(LABELALIGN "1: \n" "movdqu (%1),%%xmm0 \n" "lea 0x10(%1),%1 \n" @@ -2046,11 +2034,11 @@ void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb, "sub $0x8,%2 \n" "jg 1b \n" - : "+r"(dst_argb), // %0 - "+r"(src_argb), // %1 - "+r"(dst_width) // %2 - ::"memory", - "cc", "xmm0", "xmm1"); + : "+r"(dst_argb), // %0 + "+r"(src_argb), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1"); } // Shuffle table for arranging 2 pixels into pairs for pmaddubsw @@ -2381,7 +2369,7 @@ void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr, "+r"(dst_width) // %2 : "r"((intptr_t)(src_stride)), // %3 "r"((intptr_t)(dst_stride)), // %4 - "m"(kUVLinearMadd31_SSSE3) // %5 + "m"(kUVLinearMadd31_SSSE3) // %5 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } diff --git a/source/scale_neon.cc b/source/scale_neon.cc index fea3e64e1..e65654d92 100644 --- a/source/scale_neon.cc +++ b/source/scale_neon.cc @@ -194,21 +194,21 @@ void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr, "vmlal.u8 q10, d2, d24 \n" "vmlal.u8 q11, d3, d24 \n" - // (3 * line_0 + line_1) >> 2 + // (3 * line_0 + line_1 + 2) >> 2 "vqrshrn.u16 d0, q8, #2 \n" "vqrshrn.u16 d1, q9, #2 \n" "vqrshrn.u16 d2, q10, #2 \n" "vqrshrn.u16 d3, q11, #2 \n" - // a0 = (src[0] * 3 + s[1] * 1) >> 2 + // a0 = (src[0] * 3 + s[1] * 1 + 2) >> 2 "vmovl.u8 q8, d1 \n" "vmlal.u8 q8, d0, d24 \n" "vqrshrn.u16 d0, q8, #2 \n" - // a1 = (src[1] * 1 + s[2] * 1) >> 1 + // a1 = (src[1] * 1 + s[2] * 1 + 1) >> 1 "vrhadd.u8 d1, d1, d2 \n" - // a2 = (src[2] * 1 + s[3] * 3) >> 2 + // a2 = (src[2] * 1 + s[3] * 3 + 2) >> 2 "vmovl.u8 q8, d2 \n" "vmlal.u8 q8, d3, d24 \n" "vqrshrn.u16 d2, q8, #2 \n" @@ -240,15 +240,15 @@ void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr, "vrhadd.u8 q0, q0, q2 \n" "vrhadd.u8 q1, q1, q3 \n" - // a0 = (src[0] * 3 + s[1] * 1) >> 2 + // a0 = (src[0] * 3 + s[1] * 1 + 2) >> 2 "vmovl.u8 q3, d1 \n" "vmlal.u8 q3, d0, d24 \n" "vqrshrn.u16 d0, q3, #2 \n" - // a1 = (src[1] * 1 + s[2] * 1) >> 1 + // a1 = (src[1] * 1 + s[2] * 1 + 1) >> 1 "vrhadd.u8 d1, d1, d2 \n" - // a2 = (src[2] * 1 + s[3] * 3) >> 2 + // a2 = (src[2] * 1 + s[3] * 3 + 2) >> 2 "vmovl.u8 q3, d2 \n" "vmlal.u8 q3, d3, d24 \n" "vqrshrn.u16 d2, q3, #2 \n" diff --git a/source/scale_neon64.cc b/source/scale_neon64.cc index 3a3d499dc..03a798cd4 100644 --- a/source/scale_neon64.cc +++ b/source/scale_neon64.cc @@ -201,22 +201,22 @@ void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr, "umlal v19.8h, v3.8b, v20.8b \n" "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - // (3 * line_0 + line_1) >> 2 + // (3 * line_0 + line_1 + 2) >> 2 "uqrshrn v0.8b, v16.8h, #2 \n" "uqrshrn v1.8b, v17.8h, #2 \n" "uqrshrn v2.8b, v18.8h, #2 \n" "uqrshrn v3.8b, v19.8h, #2 \n" "prfm pldl1keep, [%3, 448] \n" - // a0 = (src[0] * 3 + s[1] * 1) >> 2 + // a0 = (src[0] * 3 + s[1] * 1 + 2) >> 2 "ushll v16.8h, v1.8b, #0 \n" "umlal v16.8h, v0.8b, v20.8b \n" "uqrshrn v0.8b, v16.8h, #2 \n" - // a1 = (src[1] * 1 + s[2] * 1) >> 1 + // a1 = (src[1] * 1 + s[2] * 1 + 1) >> 1 "urhadd v1.8b, v1.8b, v2.8b \n" - // a2 = (src[2] * 1 + s[3] * 3) >> 2 + // a2 = (src[2] * 1 + s[3] * 3 + 2) >> 2 
"ushll v16.8h, v2.8b, #0 \n" "umlal v16.8h, v3.8b, v20.8b \n" "uqrshrn v2.8b, v16.8h, #2 \n" @@ -251,16 +251,16 @@ void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr, "urhadd v3.8b, v3.8b, v7.8b \n" "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - // a0 = (src[0] * 3 + s[1] * 1) >> 2 + // a0 = (src[0] * 3 + s[1] * 1 + 2) >> 2 "ushll v4.8h, v1.8b, #0 \n" "umlal v4.8h, v0.8b, v20.8b \n" "uqrshrn v0.8b, v4.8h, #2 \n" "prfm pldl1keep, [%3, 448] \n" - // a1 = (src[1] * 1 + s[2] * 1) >> 1 + // a1 = (src[1] * 1 + s[2] * 1 + 1) >> 1 "urhadd v1.8b, v1.8b, v2.8b \n" - // a2 = (src[2] * 1 + s[3] * 3) >> 2 + // a2 = (src[2] * 1 + s[3] * 3 + 2) >> 2 "ushll v4.8h, v2.8b, #0 \n" "umlal v4.8h, v3.8b, v20.8b \n" "uqrshrn v2.8b, v4.8h, #2 \n" diff --git a/source/scale_uv.cc b/source/scale_uv.cc index ab58966d5..003ad2a17 100644 --- a/source/scale_uv.cc +++ b/source/scale_uv.cc @@ -690,8 +690,7 @@ void ScaleUVLinearUp2(int src_width, #endif if (dst_height == 1) { - ScaleRowUp(src_uv + ((src_height - 1) / 2) * src_stride, dst_uv, - dst_width); + ScaleRowUp(src_uv + ((src_height - 1) / 2) * src_stride, dst_uv, dst_width); } else { dy = FixedDiv(src_height - 1, dst_height - 1); y = (1 << 15) - 1; diff --git a/unit_test/color_test.cc b/unit_test/color_test.cc index 60bdfdd68..a81ab19a8 100644 --- a/unit_test/color_test.cc +++ b/unit_test/color_test.cc @@ -470,7 +470,7 @@ static void YUVUToRGBReference(int y, int u, int v, int* r, int* g, int* b) { // BT.2020 full range YUV to RGB reference static void YUVVToRGBReference(int y, int u, int v, int* r, int* g, int* b) { - *r = RoundToByte(y + (v - 128) * 1.474600); + *r = RoundToByte(y + (v - 128) * 1.474600); *g = RoundToByte(y - (u - 128) * 0.164553 - (v - 128) * 0.571353); *b = RoundToByte(y + (u - 128) * 1.881400); } @@ -609,9 +609,15 @@ static void PrintHistogram(int rh[256], int gh[256], int bh[256]) { // BT.601 limited range. TEST_F(LibYUVColorTest, TestFullYUV) { - int rh[256] = { 0, }; - int gh[256] = { 0, }; - int bh[256] = { 0, }; + int rh[256] = { + 0, + }; + int gh[256] = { + 0, + }; + int bh[256] = { + 0, + }; for (int u = 0; u < 256; ++u) { for (int v = 0; v < 256; ++v) { for (int y2 = 0; y2 < 256; y2 += FASTSTEP) { @@ -633,9 +639,15 @@ TEST_F(LibYUVColorTest, TestFullYUV) { // BT.601 full range. TEST_F(LibYUVColorTest, TestFullYUVJ) { - int rh[256] = { 0, }; - int gh[256] = { 0, }; - int bh[256] = { 0, }; + int rh[256] = { + 0, + }; + int gh[256] = { + 0, + }; + int bh[256] = { + 0, + }; for (int u = 0; u < 256; ++u) { for (int v = 0; v < 256; ++v) { for (int y2 = 0; y2 < 256; y2 += FASTSTEP) { @@ -657,9 +669,15 @@ TEST_F(LibYUVColorTest, TestFullYUVJ) { // BT.709 limited range. TEST_F(LibYUVColorTest, TestFullYUVH) { - int rh[256] = { 0, }; - int gh[256] = { 0, }; - int bh[256] = { 0, }; + int rh[256] = { + 0, + }; + int gh[256] = { + 0, + }; + int bh[256] = { + 0, + }; for (int u = 0; u < 256; ++u) { for (int v = 0; v < 256; ++v) { for (int y2 = 0; y2 < 256; y2 += FASTSTEP) { @@ -682,9 +700,15 @@ TEST_F(LibYUVColorTest, TestFullYUVH) { // BT.709 full range. TEST_F(LibYUVColorTest, TestFullYUVF) { - int rh[256] = { 0, }; - int gh[256] = { 0, }; - int bh[256] = { 0, }; + int rh[256] = { + 0, + }; + int gh[256] = { + 0, + }; + int bh[256] = { + 0, + }; for (int u = 0; u < 256; ++u) { for (int v = 0; v < 256; ++v) { for (int y2 = 0; y2 < 256; y2 += FASTSTEP) { @@ -706,9 +730,15 @@ TEST_F(LibYUVColorTest, TestFullYUVF) { // BT.2020 limited range. 
 TEST_F(LibYUVColorTest, TestFullYUVU) {
-  int rh[256] = { 0, };
-  int gh[256] = { 0, };
-  int bh[256] = { 0, };
+  int rh[256] = {
+      0,
+  };
+  int gh[256] = {
+      0,
+  };
+  int bh[256] = {
+      0,
+  };
   for (int u = 0; u < 256; ++u) {
     for (int v = 0; v < 256; ++v) {
       for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
@@ -731,9 +761,15 @@
 
 // BT.2020 full range.
 TEST_F(LibYUVColorTest, TestFullYUVV) {
-  int rh[256] = { 0, };
-  int gh[256] = { 0, };
-  int bh[256] = { 0, };
+  int rh[256] = {
+      0,
+  };
+  int gh[256] = {
+      0,
+  };
+  int bh[256] = {
+      0,
+  };
   for (int u = 0; u < 256; ++u) {
     for (int v = 0; v < 256; ++v) {
       for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc
index c7c5daffe..18b910e58 100644
--- a/unit_test/convert_test.cc
+++ b/unit_test/convert_test.cc
@@ -794,10 +794,10 @@ TESTPLANARTOB(H420, 2, 2, AR30, 4, 4, 1)
 #define U420AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m)               \
   I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \
                         l, m)
-#define V420AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m)                \
+#define V420AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m)               \
   I420AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
                         l, m)
-#define V420AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m)                \
+#define V420AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m)               \
   I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
                         l, m)
 #define J422AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m)               \
@@ -824,10 +824,10 @@ TESTPLANARTOB(H420, 2, 2, AR30, 4, 4, 1)
 #define U422AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m)               \
   I422AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \
                         l, m)
-#define V422AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m)                \
+#define V422AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m)               \
   I422AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
                         l, m)
-#define V422AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m)                \
+#define V422AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m)               \
   I422AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
                         l, m)
 #define J444AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m)               \
@@ -854,10 +854,10 @@ TESTPLANARTOB(H420, 2, 2, AR30, 4, 4, 1)
 #define U444AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m)               \
   I444AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \
                         l, m)
-#define V444AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m)                \
+#define V444AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m)               \
  I444AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
                         l, m)
-#define V444AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m)                \
+#define V444AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m)               \
   I444AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
                         l, m)
 
diff --git a/util/color.cc b/util/color.cc
index 2333276ba..8c3bbefd2 100644
--- a/util/color.cc
+++ b/util/color.cc
@@ -18,11 +18,15 @@
 
 // For those MCs that can be represented as kr and kb:
 // Full range
-// float M[3][3] {{1,0,2*(1-kr)},{1,-((2*kb)/((2-kb)*(1-kb-kr))),-((2*kr)/((2-kr)*(1-kb-kr)))},{1,2*(1-kb),0}};
-// float B[3] {1+(256*(1-kr))/255,1-(256*kb)/(255*(2-kb)*(1-kb-kr))-(256*kr)/(255*(2-kr)*(1-kb-kr)),1+(256*(1-kb))/255};
+// float M[3][3]
+// {{1,0,2*(1-kr)},{1,-((2*kb)/((2-kb)*(1-kb-kr))),-((2*kr)/((2-kr)*(1-kb-kr)))},{1,2*(1-kb),0}};
+// float B[3]
+// {1+(256*(1-kr))/255,1-(256*kb)/(255*(2-kb)*(1-kb-kr))-(256*kr)/(255*(2-kr)*(1-kb-kr)),1+(256*(1-kb))/255};
 // Limited range
-// float M[3][3] {{85/73,0,255/112-(255*kr)/112},{85/73,-((255*kb)/(112*(2-kb)*(1-kb-kr))),-((255*kr)/(112*(2-kr)*(1-kb-kr)))},{85/73,255/112-(255*kb)/112,0}};
-// float B[3] {77662/43435-(1537*kr)/1785,203/219-(1537*kb)/(1785*(2-kb)*(1-kb-kr))-(1537*kr)/(1785*(2-kr)*(1-kb-kr)),77662/43435-(1537*kb)/1785};
+// float M[3][3]
+// {{85/73,0,255/112-(255*kr)/112},{85/73,-((255*kb)/(112*(2-kb)*(1-kb-kr))),-((255*kr)/(112*(2-kr)*(1-kb-kr)))},{85/73,255/112-(255*kb)/112,0}};
+// float B[3]
+// {77662/43435-(1537*kr)/1785,203/219-(1537*kb)/(1785*(2-kb)*(1-kb-kr))-(1537*kr)/(1785*(2-kr)*(1-kb-kr)),77662/43435-(1537*kb)/1785};
 
 // mc bt
 // 1 bt.709 KR = 0.2126; KB = 0.0722
@@ -56,11 +60,10 @@
 // #define BR (-VR * 128 + YB)
 
 int round(float v) {
-  return (int) (v + 0.5);
+  return (int)(v + 0.5);
 }
 
 int main(int argc, const char* argv[]) {
-
   if (argc < 2) {
     printf("color kr kb\n");
     return -1;
@@ -81,11 +84,11 @@ int main(int argc, const char* argv[]) {
 
   printf("KR = %4f; ", kr);
   printf("KB = %4f\n", kb);
-// printf("KG = %4f\n", kg);
-// #define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
-// #define YB 32 /* 64 / 2 */
-//
-// // U and V contributions to R,G,B.
+  // printf("KG = %4f\n", kg);
+  // #define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
+  // #define YB 32 /* 64 / 2 */
+  //
+  // // U and V contributions to R,G,B.
 
   printf("UB %-3d /* round(%f * 64) */\n", round(ub * 64), ub);
   printf("UG %-3d /* round(%f * 64) */\n", round(ug * 64), ug);
@@ -102,11 +105,11 @@ int main(int argc, const char* argv[]) {
 
   printf("G = (Y - 16) * 1.164 - U * %6f - V * %6f\n", ug, vg);
   printf("B = (Y - 16) * 1.164 + U * %5f\n", ub);
-// printf("KG = %4f\n", kg);
-// #define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
-// #define YB 32 /* 64 / 2 */
-//
-// // U and V contributions to R,G,B.
+  // printf("KG = %4f\n", kg);
+  // #define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
+  // #define YB 32 /* 64 / 2 */
+  //
+  // // U and V contributions to R,G,B.
 
   printf("UB %-3d /* round(%f * 64) */\n", round(ub * 64), ub);
   printf("UG %-3d /* round(%f * 64) */\n", round(ug * 64), ug);
@@ -115,4 +118,3 @@ int main(int argc, const char* argv[]) {
 
   return 0;
 }
-
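
Aside from comment and whitespace reformatting, the one functional change in this patch is the `+ 2` bias added before each `>> 2` in the C edge-pixel paths of the 2x bilinear upscalers (with matching comment updates in the NEON files): the scalar code previously truncated, while the SIMD lanes already round to nearest (NEON `vqrshrn`/`uqrshrn` are rounding shifts and `vrhadd`/`urhadd` are rounding halving adds). Below is a minimal standalone sketch of that difference; `Blend31Trunc` and `Blend31Round` are illustrative names for this example only, not libyuv APIs.

// Standalone sketch (not part of the patch): the 3:1 edge blend used by the
// SU2BLANY/SBU2BLANY macros, before and after the rounding fix.
#include <cstdint>
#include <cstdio>

// Before the patch: floor((3*a + b) / 4) -- truncates.
static uint8_t Blend31Trunc(uint8_t a, uint8_t b) {
  return static_cast<uint8_t>((3 * a + b) >> 2);
}

// After the patch: floor((3*a + b + 2) / 4) -- rounds to nearest,
// matching a NEON rounding narrowing shift by 2 (vqrshrn/uqrshrn #2).
static uint8_t Blend31Round(uint8_t a, uint8_t b) {
  return static_cast<uint8_t>((3 * a + b + 2) >> 2);
}

int main() {
  // a=1, b=4: exact value 7/4 = 1.75 -> trunc gives 1, round gives 2.
  // a=0, b=255: exact value 63.75 -> trunc gives 63, round gives 64.
  const uint8_t pairs[][2] = {{1, 2}, {1, 4}, {0, 255}, {255, 0}};
  for (const auto& p : pairs) {
    std::printf("a=%3d b=%3d  trunc=%3d  round=%3d\n", p[0], p[1],
                Blend31Trunc(p[0], p[1]), Blend31Round(p[0], p[1]));
  }
  return 0;
}

Without the bias, the scalar first/last columns could differ by one from the SIMD-computed interior, which shows up as an off-by-one seam in unit tests comparing the Any-width wrappers against the pure C reference.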