diff --git a/include/libyuv/cpu_id.h b/include/libyuv/cpu_id.h index 57a6a12e9..26845b602 100644 --- a/include/libyuv/cpu_id.h +++ b/include/libyuv/cpu_id.h @@ -67,7 +67,6 @@ static const int kCpuHasLOONGARCH = 0x20; static const int kCpuHasLSX = 0x100; static const int kCpuHasLASX = 0x200; - // Optional init function. TestCpuFlag does an auto-init. // Returns cpu_info flags. LIBYUV_API diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 2b0845c16..fee2d2481 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -3613,9 +3613,9 @@ void Convert16To8Row_AVX2(const uint16_t* src_y, int scale, int width); void Convert16To8Row_AVX512BW(const uint16_t* src_y, - uint8_t* dst_y, - int scale, - int width); + uint8_t* dst_y, + int scale, + int width); void Convert16To8Row_Any_SSSE3(const uint16_t* src_ptr, uint8_t* dst_ptr, int scale, diff --git a/include/libyuv/row_sve.h b/include/libyuv/row_sve.h index a2eb82db8..b52a38a99 100644 --- a/include/libyuv/row_sve.h +++ b/include/libyuv/row_sve.h @@ -499,8 +499,8 @@ static inline void I422ToRGB565Row_SVE_SC( // Calculate a predicate for the final iteration to deal with the tail. "cnth %[vl] \n" "whilelt p1.b, wzr, %w[width] \n" // - READYUV422_SVE_2X I422TORGB_SVE_2X - RGBTOARGB8_SVE_TOP_2X RGB8TORGB565_SVE_FROM_TOP_2X + READYUV422_SVE_2X I422TORGB_SVE_2X RGBTOARGB8_SVE_TOP_2X + RGB8TORGB565_SVE_FROM_TOP_2X "st2h {z18.h, z19.h}, p1, [%[dst]] \n" "99: \n" @@ -558,8 +558,8 @@ static inline void I422ToARGB1555Row_SVE_SC( // Calculate a predicate for the final iteration to deal with the tail. "cnth %[vl] \n" "whilelt p1.b, wzr, %w[width] \n" // - READYUV422_SVE_2X I422TORGB_SVE_2X - RGBTOARGB8_SVE_TOP_2X RGB8TOARGB1555_SVE_FROM_TOP_2X + READYUV422_SVE_2X I422TORGB_SVE_2X RGBTOARGB8_SVE_TOP_2X + RGB8TOARGB1555_SVE_FROM_TOP_2X "st2h {z0.h, z1.h}, p1, [%[dst]] \n" "99: \n" @@ -617,8 +617,8 @@ static inline void I422ToARGB4444Row_SVE_SC( // Calculate a predicate for the final iteration to deal with the tail. "cnth %[vl] \n" "whilelt p1.b, wzr, %w[width] \n" // - READYUV422_SVE_2X I422TORGB_SVE_2X - RGBTOARGB8_SVE_TOP_2X RGB8TOARGB4444_SVE_FROM_TOP_2X + READYUV422_SVE_2X I422TORGB_SVE_2X RGBTOARGB8_SVE_TOP_2X + RGB8TOARGB4444_SVE_FROM_TOP_2X "st2h {z0.h, z1.h}, p1, [%[dst]] \n" "99: \n" diff --git a/source/compare_gcc.cc b/source/compare_gcc.cc index 492969259..3838abd72 100644 --- a/source/compare_gcc.cc +++ b/source/compare_gcc.cc @@ -29,7 +29,7 @@ uint32_t HammingDistance_SSE42(const uint8_t* src_a, int count) { uint64_t diff; - asm volatile ( + asm volatile( "xor %3,%3 \n" "xor %%r8,%%r8 \n" "xor %%r9,%%r9 \n" @@ -77,7 +77,7 @@ uint32_t HammingDistance_SSE42(const uint8_t* src_a, int count) { uint32_t diff = 0u; - asm volatile ( + asm volatile( // Process 16 bytes per loop. 
LABELALIGN "1: \n" @@ -121,7 +121,7 @@ uint32_t HammingDistance_SSSE3(const uint8_t* src_a, int count) { uint32_t diff; - asm volatile ( + asm volatile( "movdqa %4,%%xmm2 \n" "movdqa %5,%%xmm3 \n" "pxor %%xmm0,%%xmm0 \n" @@ -180,7 +180,7 @@ uint32_t HammingDistance_AVX2(const uint8_t* src_a, int count) { uint32_t diff; - asm volatile ( + asm volatile( "vbroadcastf128 %4,%%ymm2 \n" "vbroadcastf128 %5,%%ymm3 \n" "vpxor %%ymm0,%%ymm0,%%ymm0 \n" @@ -234,7 +234,7 @@ uint32_t SumSquareError_SSE2(const uint8_t* src_a, const uint8_t* src_b, int count) { uint32_t sse; - asm volatile ( + asm volatile( "pxor %%xmm0,%%xmm0 \n" "pxor %%xmm5,%%xmm5 \n" @@ -300,7 +300,7 @@ static const uvec32 kHashMul3 = { uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) { uint32_t hash; - asm volatile ( + asm volatile( "movd %2,%%xmm0 \n" "pxor %%xmm7,%%xmm7 \n" "movdqa %4,%%xmm6 \n" diff --git a/source/compare_neon.cc b/source/compare_neon.cc index c2aea6074..afdd60121 100644 --- a/source/compare_neon.cc +++ b/source/compare_neon.cc @@ -28,7 +28,7 @@ uint32_t HammingDistance_NEON(const uint8_t* src_a, int count) { uint32_t diff; - asm volatile ( + asm volatile( "vmov.u16 q4, #0 \n" // accumulator "1: \n" @@ -58,7 +58,7 @@ uint32_t SumSquareError_NEON(const uint8_t* src_a, const uint8_t* src_b, int count) { uint32_t sse; - asm volatile ( + asm volatile( "vmov.u8 q8, #0 \n" "vmov.u8 q10, #0 \n" "vmov.u8 q9, #0 \n" diff --git a/source/compare_neon64.cc b/source/compare_neon64.cc index 07292deff..49246aaeb 100644 --- a/source/compare_neon64.cc +++ b/source/compare_neon64.cc @@ -26,7 +26,7 @@ uint32_t HammingDistance_NEON(const uint8_t* src_a, const uint8_t* src_b, int count) { uint32_t diff; - asm volatile ( + asm volatile( "movi v4.8h, #0 \n" "1: \n" @@ -55,7 +55,7 @@ uint32_t SumSquareError_NEON(const uint8_t* src_a, const uint8_t* src_b, int count) { uint32_t sse; - asm volatile ( + asm volatile( "movi v16.16b, #0 \n" "movi v17.16b, #0 \n" "movi v18.16b, #0 \n" @@ -116,30 +116,30 @@ uint32_t HashDjb2_NEON(const uint8_t* src, int count, uint32_t seed) { uint32_t hash = seed; const uint32_t c16 = 0x92d9e201; // 33^16 uint32_t tmp, tmp2; - asm("ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%[kIdx]] \n" - "ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%[kMuls]] \n" + asm("ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%[kIdx]] \n" + "ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%[kMuls]] \n" // count is always a multiple of 16. // maintain two accumulators, reduce and then final sum in scalar since // this has better performance on little cores. 
- "1: \n" - "ldr q0, [%[src]], #16 \n" - "subs %w[count], %w[count], #16 \n" - "tbl v3.16b, {v0.16b}, v19.16b \n" - "tbl v2.16b, {v0.16b}, v18.16b \n" - "tbl v1.16b, {v0.16b}, v17.16b \n" - "tbl v0.16b, {v0.16b}, v16.16b \n" - "mul v3.4s, v3.4s, v7.4s \n" - "mul v2.4s, v2.4s, v6.4s \n" - "mla v3.4s, v1.4s, v5.4s \n" - "mla v2.4s, v0.4s, v4.4s \n" - "addv s1, v3.4s \n" - "addv s0, v2.4s \n" - "fmov %w[tmp2], s1 \n" - "fmov %w[tmp], s0 \n" - "add %w[tmp], %w[tmp], %w[tmp2] \n" - "madd %w[hash], %w[hash], %w[c16], %w[tmp] \n" - "b.gt 1b \n" + "1: \n" + "ldr q0, [%[src]], #16 \n" + "subs %w[count], %w[count], #16 \n" + "tbl v3.16b, {v0.16b}, v19.16b \n" + "tbl v2.16b, {v0.16b}, v18.16b \n" + "tbl v1.16b, {v0.16b}, v17.16b \n" + "tbl v0.16b, {v0.16b}, v16.16b \n" + "mul v3.4s, v3.4s, v7.4s \n" + "mul v2.4s, v2.4s, v6.4s \n" + "mla v3.4s, v1.4s, v5.4s \n" + "mla v2.4s, v0.4s, v4.4s \n" + "addv s1, v3.4s \n" + "addv s0, v2.4s \n" + "fmov %w[tmp2], s1 \n" + "fmov %w[tmp], s0 \n" + "add %w[tmp], %w[tmp], %w[tmp2] \n" + "madd %w[hash], %w[hash], %w[c16], %w[tmp] \n" + "b.gt 1b \n" : [hash] "+r"(hash), // %[hash] [count] "+r"(count), // %[count] [tmp] "=&r"(tmp), // %[tmp] @@ -157,7 +157,7 @@ uint32_t HammingDistance_NEON_DotProd(const uint8_t* src_a, const uint8_t* src_b, int count) { uint32_t diff; - asm volatile ( + asm volatile( "movi v4.4s, #0 \n" "movi v5.4s, #0 \n" "movi v6.16b, #1 \n" @@ -190,7 +190,7 @@ uint32_t SumSquareError_NEON_DotProd(const uint8_t* src_a, int count) { // count is guaranteed to be a multiple of 32. uint32_t sse; - asm volatile ( + asm volatile( "movi v4.4s, #0 \n" "movi v5.4s, #0 \n" diff --git a/source/convert.cc b/source/convert.cc index bf886bc1c..0bdb8998f 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -665,7 +665,7 @@ int I010ToNV12(const uint16_t* src_y, void (*Convert16To8Row)(const uint16_t* src_y, uint8_t* dst_y, int scale, int width) = Convert16To8Row_C; void (*MergeUVRow)(const uint8_t* src_u, const uint8_t* src_v, - uint8_t* dst_uv, int width) = MergeUVRow_C; + uint8_t* dst_uv, int width) = MergeUVRow_C; if ((!src_y && dst_y) || !src_u || !src_v || !dst_uv || width <= 0 || height == 0) { return -1; diff --git a/source/convert_to_argb.cc b/source/convert_to_argb.cc index d0ff18a5e..72d21b042 100644 --- a/source/convert_to_argb.cc +++ b/source/convert_to_argb.cc @@ -70,9 +70,8 @@ int ConvertToARGB(const uint8_t* sample, uint8_t* rotate_buffer = NULL; int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height; - if (dst_argb == NULL || sample == NULL || - src_width <= 0 || src_width > INT_MAX / 4 || - crop_width <= 0 || crop_width > INT_MAX / 4 || + if (dst_argb == NULL || sample == NULL || src_width <= 0 || + src_width > INT_MAX / 4 || crop_width <= 0 || crop_width > INT_MAX / 4 || src_height == 0 || crop_height == 0) { return -1; } @@ -81,7 +80,8 @@ int ConvertToARGB(const uint8_t* sample, } if (need_buf) { - const uint64_t rotate_buffer_size = (uint64_t)crop_width * 4 * abs_crop_height; + const uint64_t rotate_buffer_size = + (uint64_t)crop_width * 4 * abs_crop_height; if (rotate_buffer_size > SIZE_MAX) { return -1; // Invalid size. } diff --git a/source/convert_to_i420.cc b/source/convert_to_i420.cc index a2bc189be..aab071e1a 100644 --- a/source/convert_to_i420.cc +++ b/source/convert_to_i420.cc @@ -65,8 +65,9 @@ int ConvertToI420(const uint8_t* sample, const int inv_crop_height = (src_height < 0) ? 
-abs_crop_height : abs_crop_height; - if (!dst_y || !dst_u || !dst_v || !sample || src_width <= 0 || src_width > INT_MAX / 4 || - crop_width <= 0 || src_height == 0 || crop_height == 0) { + if (!dst_y || !dst_u || !dst_v || !sample || src_width <= 0 || + src_width > INT_MAX / 4 || crop_width <= 0 || src_height == 0 || + crop_height == 0) { return -1; } @@ -78,7 +79,8 @@ int ConvertToI420(const uint8_t* sample, if (need_buf) { int y_size = crop_width * abs_crop_height; int uv_size = ((crop_width + 1) / 2) * ((abs_crop_height + 1) / 2); - const uint64_t rotate_buffer_size = (uint64_t)y_size + (uint64_t)uv_size * 2; + const uint64_t rotate_buffer_size = + (uint64_t)y_size + (uint64_t)uv_size * 2; if (rotate_buffer_size > SIZE_MAX) { return -1; // Invalid size. } diff --git a/source/rotate_argb.cc b/source/rotate_argb.cc index e253797e8..7fda09d43 100644 --- a/source/rotate_argb.cc +++ b/source/rotate_argb.cc @@ -191,7 +191,8 @@ static int ARGBRotate180(const uint8_t* src_argb, #endif #if defined(HAS_COPYROW_AVX512BW) if (TestCpuFlag(kCpuHasAVX512BW)) { - CopyRow = IS_ALIGNED(width * 4, 128) ? CopyRow_AVX512BW : CopyRow_Any_AVX512BW; + CopyRow = + IS_ALIGNED(width * 4, 128) ? CopyRow_AVX512BW : CopyRow_Any_AVX512BW; } #endif #if defined(HAS_COPYROW_ERMS) diff --git a/source/rotate_gcc.cc b/source/rotate_gcc.cc index 48926b687..fd5eee05f 100644 --- a/source/rotate_gcc.cc +++ b/source/rotate_gcc.cc @@ -26,7 +26,7 @@ void TransposeWx8_SSSE3(const uint8_t* src, uint8_t* dst, int dst_stride, int width) { - asm volatile ( + asm volatile( // Read in the data from the source pointer. // First round of bit swap. LABELALIGN @@ -116,7 +116,7 @@ void TransposeWx8_Fast_SSSE3(const uint8_t* src, uint8_t* dst, int dst_stride, int width) { - asm volatile ( + asm volatile( // Read in the data from the source pointer. // First round of bit swap. LABELALIGN @@ -261,7 +261,7 @@ void TransposeUVWx8_SSE2(const uint8_t* src, uint8_t* dst_b, int dst_stride_b, int width) { - asm volatile ( + asm volatile( // Read in the data from the source pointer. // First round of bit swap. LABELALIGN @@ -391,7 +391,7 @@ void Transpose4x4_32_SSE2(const uint8_t* src, uint8_t* dst, int dst_stride, int width) { - asm volatile ( + asm volatile( // Main loop transpose 4x4. Read a column, write a row. "1: \n" "movdqu (%0),%%xmm0 \n" // a b c d @@ -447,7 +447,7 @@ void Transpose4x4_32_AVX2(const uint8_t* src, uint8_t* dst, int dst_stride, int width) { - asm volatile ( + asm volatile( // Main loop transpose 2 blocks of 4x4. Read a column, write a row. "1: \n" "vmovdqu (%0),%%xmm0 \n" // a b c d diff --git a/source/rotate_neon.cc b/source/rotate_neon.cc index 334a9f998..a16ef7266 100644 --- a/source/rotate_neon.cc +++ b/source/rotate_neon.cc @@ -27,57 +27,57 @@ void TransposeWx8_NEON(const uint8_t* src, int dst_stride, int width) { const uint8_t* temp; - asm volatile ( + asm volatile( // loops are on blocks of 8. loop will stop when // counter gets to or below 0. 
starting the counter // at w-8 allow for this - "sub %[width], #8 \n" + "sub %[width], #8 \n" - "1: \n" - "mov %[temp], %[src] \n" - "vld1.8 {d0}, [%[temp]], %[src_stride] \n" - "vld1.8 {d1}, [%[temp]], %[src_stride] \n" - "vld1.8 {d2}, [%[temp]], %[src_stride] \n" - "vld1.8 {d3}, [%[temp]], %[src_stride] \n" - "vld1.8 {d4}, [%[temp]], %[src_stride] \n" - "vld1.8 {d5}, [%[temp]], %[src_stride] \n" - "vld1.8 {d6}, [%[temp]], %[src_stride] \n" - "vld1.8 {d7}, [%[temp]] \n" - "add %[src], #8 \n" + "1: \n" + "mov %[temp], %[src] \n" + "vld1.8 {d0}, [%[temp]], %[src_stride] \n" + "vld1.8 {d1}, [%[temp]], %[src_stride] \n" + "vld1.8 {d2}, [%[temp]], %[src_stride] \n" + "vld1.8 {d3}, [%[temp]], %[src_stride] \n" + "vld1.8 {d4}, [%[temp]], %[src_stride] \n" + "vld1.8 {d5}, [%[temp]], %[src_stride] \n" + "vld1.8 {d6}, [%[temp]], %[src_stride] \n" + "vld1.8 {d7}, [%[temp]] \n" + "add %[src], #8 \n" - "vtrn.8 d1, d0 \n" - "vtrn.8 d3, d2 \n" - "vtrn.8 d5, d4 \n" - "vtrn.8 d7, d6 \n" - "subs %[width], #8 \n" + "vtrn.8 d1, d0 \n" + "vtrn.8 d3, d2 \n" + "vtrn.8 d5, d4 \n" + "vtrn.8 d7, d6 \n" + "subs %[width], #8 \n" - "vtrn.16 d1, d3 \n" - "vtrn.16 d0, d2 \n" - "vtrn.16 d5, d7 \n" - "vtrn.16 d4, d6 \n" + "vtrn.16 d1, d3 \n" + "vtrn.16 d0, d2 \n" + "vtrn.16 d5, d7 \n" + "vtrn.16 d4, d6 \n" - "vtrn.32 d1, d5 \n" - "vtrn.32 d0, d4 \n" - "vtrn.32 d3, d7 \n" - "vtrn.32 d2, d6 \n" + "vtrn.32 d1, d5 \n" + "vtrn.32 d0, d4 \n" + "vtrn.32 d3, d7 \n" + "vtrn.32 d2, d6 \n" - "vrev16.8 q0, q0 \n" - "vrev16.8 q1, q1 \n" - "vrev16.8 q2, q2 \n" - "vrev16.8 q3, q3 \n" + "vrev16.8 q0, q0 \n" + "vrev16.8 q1, q1 \n" + "vrev16.8 q2, q2 \n" + "vrev16.8 q3, q3 \n" - "mov %[temp], %[dst] \n" - "vst1.8 {d1}, [%[temp]], %[dst_stride] \n" - "vst1.8 {d0}, [%[temp]], %[dst_stride] \n" - "vst1.8 {d3}, [%[temp]], %[dst_stride] \n" - "vst1.8 {d2}, [%[temp]], %[dst_stride] \n" - "vst1.8 {d5}, [%[temp]], %[dst_stride] \n" - "vst1.8 {d4}, [%[temp]], %[dst_stride] \n" - "vst1.8 {d7}, [%[temp]], %[dst_stride] \n" - "vst1.8 {d6}, [%[temp]] \n" - "add %[dst], %[dst], %[dst_stride], lsl #3 \n" + "mov %[temp], %[dst] \n" + "vst1.8 {d1}, [%[temp]], %[dst_stride] \n" + "vst1.8 {d0}, [%[temp]], %[dst_stride] \n" + "vst1.8 {d3}, [%[temp]], %[dst_stride] \n" + "vst1.8 {d2}, [%[temp]], %[dst_stride] \n" + "vst1.8 {d5}, [%[temp]], %[dst_stride] \n" + "vst1.8 {d4}, [%[temp]], %[dst_stride] \n" + "vst1.8 {d7}, [%[temp]], %[dst_stride] \n" + "vst1.8 {d6}, [%[temp]] \n" + "add %[dst], %[dst], %[dst_stride], lsl #3 \n" - "bge 1b \n" + "bge 1b \n" : [temp] "=&r"(temp), // %[temp] [src] "+r"(src), // %[src] [dst] "+r"(dst), // %[dst] @@ -95,72 +95,72 @@ void TransposeUVWx8_NEON(const uint8_t* src, int dst_stride_b, int width) { const uint8_t* temp; - asm volatile ( + asm volatile( // loops are on blocks of 8. loop will stop when // counter gets to or below 0. 
starting the counter // at w-8 allow for this - "sub %[width], #8 \n" + "sub %[width], #8 \n" - "1: \n" - "mov %[temp], %[src] \n" - "vld2.8 {d0, d1}, [%[temp]], %[src_stride] \n" - "vld2.8 {d2, d3}, [%[temp]], %[src_stride] \n" - "vld2.8 {d4, d5}, [%[temp]], %[src_stride] \n" - "vld2.8 {d6, d7}, [%[temp]], %[src_stride] \n" - "vld2.8 {d16, d17}, [%[temp]], %[src_stride] \n" - "vld2.8 {d18, d19}, [%[temp]], %[src_stride] \n" - "vld2.8 {d20, d21}, [%[temp]], %[src_stride] \n" - "vld2.8 {d22, d23}, [%[temp]] \n" - "add %[src], #8*2 \n" + "1: \n" + "mov %[temp], %[src] \n" + "vld2.8 {d0, d1}, [%[temp]], %[src_stride] \n" + "vld2.8 {d2, d3}, [%[temp]], %[src_stride] \n" + "vld2.8 {d4, d5}, [%[temp]], %[src_stride] \n" + "vld2.8 {d6, d7}, [%[temp]], %[src_stride] \n" + "vld2.8 {d16, d17}, [%[temp]], %[src_stride] \n" + "vld2.8 {d18, d19}, [%[temp]], %[src_stride] \n" + "vld2.8 {d20, d21}, [%[temp]], %[src_stride] \n" + "vld2.8 {d22, d23}, [%[temp]] \n" + "add %[src], #8*2 \n" - "vtrn.8 q1, q0 \n" - "vtrn.8 q3, q2 \n" - "vtrn.8 q9, q8 \n" - "vtrn.8 q11, q10 \n" - "subs %[width], #8 \n" + "vtrn.8 q1, q0 \n" + "vtrn.8 q3, q2 \n" + "vtrn.8 q9, q8 \n" + "vtrn.8 q11, q10 \n" + "subs %[width], #8 \n" - "vtrn.16 q1, q3 \n" - "vtrn.16 q0, q2 \n" - "vtrn.16 q9, q11 \n" - "vtrn.16 q8, q10 \n" + "vtrn.16 q1, q3 \n" + "vtrn.16 q0, q2 \n" + "vtrn.16 q9, q11 \n" + "vtrn.16 q8, q10 \n" - "vtrn.32 q1, q9 \n" - "vtrn.32 q0, q8 \n" - "vtrn.32 q3, q11 \n" - "vtrn.32 q2, q10 \n" + "vtrn.32 q1, q9 \n" + "vtrn.32 q0, q8 \n" + "vtrn.32 q3, q11 \n" + "vtrn.32 q2, q10 \n" - "vrev16.8 q0, q0 \n" - "vrev16.8 q1, q1 \n" - "vrev16.8 q2, q2 \n" - "vrev16.8 q3, q3 \n" - "vrev16.8 q8, q8 \n" - "vrev16.8 q9, q9 \n" - "vrev16.8 q10, q10 \n" - "vrev16.8 q11, q11 \n" + "vrev16.8 q0, q0 \n" + "vrev16.8 q1, q1 \n" + "vrev16.8 q2, q2 \n" + "vrev16.8 q3, q3 \n" + "vrev16.8 q8, q8 \n" + "vrev16.8 q9, q9 \n" + "vrev16.8 q10, q10 \n" + "vrev16.8 q11, q11 \n" - "mov %[temp], %[dst_a] \n" - "vst1.8 {d2}, [%[temp]], %[dst_stride_a] \n" - "vst1.8 {d0}, [%[temp]], %[dst_stride_a] \n" - "vst1.8 {d6}, [%[temp]], %[dst_stride_a] \n" - "vst1.8 {d4}, [%[temp]], %[dst_stride_a] \n" - "vst1.8 {d18}, [%[temp]], %[dst_stride_a] \n" - "vst1.8 {d16}, [%[temp]], %[dst_stride_a] \n" - "vst1.8 {d22}, [%[temp]], %[dst_stride_a] \n" - "vst1.8 {d20}, [%[temp]] \n" - "add %[dst_a], %[dst_a], %[dst_stride_a], lsl #3 \n" + "mov %[temp], %[dst_a] \n" + "vst1.8 {d2}, [%[temp]], %[dst_stride_a] \n" + "vst1.8 {d0}, [%[temp]], %[dst_stride_a] \n" + "vst1.8 {d6}, [%[temp]], %[dst_stride_a] \n" + "vst1.8 {d4}, [%[temp]], %[dst_stride_a] \n" + "vst1.8 {d18}, [%[temp]], %[dst_stride_a] \n" + "vst1.8 {d16}, [%[temp]], %[dst_stride_a] \n" + "vst1.8 {d22}, [%[temp]], %[dst_stride_a] \n" + "vst1.8 {d20}, [%[temp]] \n" + "add %[dst_a], %[dst_a], %[dst_stride_a], lsl #3 \n" - "mov %[temp], %[dst_b] \n" - "vst1.8 {d3}, [%[temp]], %[dst_stride_b] \n" - "vst1.8 {d1}, [%[temp]], %[dst_stride_b] \n" - "vst1.8 {d7}, [%[temp]], %[dst_stride_b] \n" - "vst1.8 {d5}, [%[temp]], %[dst_stride_b] \n" - "vst1.8 {d19}, [%[temp]], %[dst_stride_b] \n" - "vst1.8 {d17}, [%[temp]], %[dst_stride_b] \n" - "vst1.8 {d23}, [%[temp]], %[dst_stride_b] \n" - "vst1.8 {d21}, [%[temp]] \n" - "add %[dst_b], %[dst_b], %[dst_stride_b], lsl #3 \n" + "mov %[temp], %[dst_b] \n" + "vst1.8 {d3}, [%[temp]], %[dst_stride_b] \n" + "vst1.8 {d1}, [%[temp]], %[dst_stride_b] \n" + "vst1.8 {d7}, [%[temp]], %[dst_stride_b] \n" + "vst1.8 {d5}, [%[temp]], %[dst_stride_b] \n" + "vst1.8 {d19}, [%[temp]], %[dst_stride_b] \n" + 
"vst1.8 {d17}, [%[temp]], %[dst_stride_b] \n" + "vst1.8 {d23}, [%[temp]], %[dst_stride_b] \n" + "vst1.8 {d21}, [%[temp]] \n" + "add %[dst_b], %[dst_b], %[dst_stride_b], lsl #3 \n" - "bge 1b \n" + "bge 1b \n" : [temp] "=&r"(temp), // %[temp] [src] "+r"(src), // %[src] [dst_a] "+r"(dst_a), // %[dst_a] @@ -184,7 +184,7 @@ void Transpose4x4_32_NEON(const uint8_t* src, uint8_t* dst1 = dst + dst_stride; uint8_t* dst2 = dst1 + dst_stride; uint8_t* dst3 = dst2 + dst_stride; - asm volatile ( + asm volatile( // Main loop transpose 4x4. Read a column, write a row. "1: \n" "vld4.32 {d0[0], d2[0], d4[0], d6[0]}, [%0], %9 \n" diff --git a/source/rotate_neon64.cc b/source/rotate_neon64.cc index dbf08edac..4a5e181a6 100644 --- a/source/rotate_neon64.cc +++ b/source/rotate_neon64.cc @@ -27,104 +27,104 @@ void TransposeWx16_NEON(const uint8_t* src, int dst_stride, int width) { const uint8_t* src_temp; - asm volatile ( - "1: \n" - "mov %[src_temp], %[src] \n" + asm volatile( + "1: \n" + "mov %[src_temp], %[src] \n" - "ld1 {v16.16b}, [%[src_temp]], %[src_stride] \n" - "ld1 {v17.16b}, [%[src_temp]], %[src_stride] \n" - "ld1 {v18.16b}, [%[src_temp]], %[src_stride] \n" - "ld1 {v19.16b}, [%[src_temp]], %[src_stride] \n" - "ld1 {v20.16b}, [%[src_temp]], %[src_stride] \n" - "ld1 {v21.16b}, [%[src_temp]], %[src_stride] \n" - "ld1 {v22.16b}, [%[src_temp]], %[src_stride] \n" - "ld1 {v23.16b}, [%[src_temp]], %[src_stride] \n" - "ld1 {v24.16b}, [%[src_temp]], %[src_stride] \n" - "ld1 {v25.16b}, [%[src_temp]], %[src_stride] \n" - "ld1 {v26.16b}, [%[src_temp]], %[src_stride] \n" - "ld1 {v27.16b}, [%[src_temp]], %[src_stride] \n" - "ld1 {v28.16b}, [%[src_temp]], %[src_stride] \n" - "ld1 {v29.16b}, [%[src_temp]], %[src_stride] \n" - "ld1 {v30.16b}, [%[src_temp]], %[src_stride] \n" - "ld1 {v31.16b}, [%[src_temp]], %[src_stride] \n" + "ld1 {v16.16b}, [%[src_temp]], %[src_stride] \n" + "ld1 {v17.16b}, [%[src_temp]], %[src_stride] \n" + "ld1 {v18.16b}, [%[src_temp]], %[src_stride] \n" + "ld1 {v19.16b}, [%[src_temp]], %[src_stride] \n" + "ld1 {v20.16b}, [%[src_temp]], %[src_stride] \n" + "ld1 {v21.16b}, [%[src_temp]], %[src_stride] \n" + "ld1 {v22.16b}, [%[src_temp]], %[src_stride] \n" + "ld1 {v23.16b}, [%[src_temp]], %[src_stride] \n" + "ld1 {v24.16b}, [%[src_temp]], %[src_stride] \n" + "ld1 {v25.16b}, [%[src_temp]], %[src_stride] \n" + "ld1 {v26.16b}, [%[src_temp]], %[src_stride] \n" + "ld1 {v27.16b}, [%[src_temp]], %[src_stride] \n" + "ld1 {v28.16b}, [%[src_temp]], %[src_stride] \n" + "ld1 {v29.16b}, [%[src_temp]], %[src_stride] \n" + "ld1 {v30.16b}, [%[src_temp]], %[src_stride] \n" + "ld1 {v31.16b}, [%[src_temp]], %[src_stride] \n" - "add %[src], %[src], #16 \n" + "add %[src], %[src], #16 \n" // Transpose bytes within each 2x2 block. 
- "trn1 v0.16b, v16.16b, v17.16b \n" - "trn2 v1.16b, v16.16b, v17.16b \n" - "trn1 v2.16b, v18.16b, v19.16b \n" - "trn2 v3.16b, v18.16b, v19.16b \n" - "trn1 v4.16b, v20.16b, v21.16b \n" - "trn2 v5.16b, v20.16b, v21.16b \n" - "trn1 v6.16b, v22.16b, v23.16b \n" - "trn2 v7.16b, v22.16b, v23.16b \n" - "trn1 v8.16b, v24.16b, v25.16b \n" - "trn2 v9.16b, v24.16b, v25.16b \n" - "trn1 v10.16b, v26.16b, v27.16b \n" - "trn2 v11.16b, v26.16b, v27.16b \n" - "trn1 v12.16b, v28.16b, v29.16b \n" - "trn2 v13.16b, v28.16b, v29.16b \n" - "trn1 v14.16b, v30.16b, v31.16b \n" - "trn2 v15.16b, v30.16b, v31.16b \n" + "trn1 v0.16b, v16.16b, v17.16b \n" + "trn2 v1.16b, v16.16b, v17.16b \n" + "trn1 v2.16b, v18.16b, v19.16b \n" + "trn2 v3.16b, v18.16b, v19.16b \n" + "trn1 v4.16b, v20.16b, v21.16b \n" + "trn2 v5.16b, v20.16b, v21.16b \n" + "trn1 v6.16b, v22.16b, v23.16b \n" + "trn2 v7.16b, v22.16b, v23.16b \n" + "trn1 v8.16b, v24.16b, v25.16b \n" + "trn2 v9.16b, v24.16b, v25.16b \n" + "trn1 v10.16b, v26.16b, v27.16b \n" + "trn2 v11.16b, v26.16b, v27.16b \n" + "trn1 v12.16b, v28.16b, v29.16b \n" + "trn2 v13.16b, v28.16b, v29.16b \n" + "trn1 v14.16b, v30.16b, v31.16b \n" + "trn2 v15.16b, v30.16b, v31.16b \n" // Transpose 2x2-byte blocks within each 4x4 block. - "trn1 v16.8h, v0.8h, v2.8h \n" - "trn1 v17.8h, v1.8h, v3.8h \n" - "trn2 v18.8h, v0.8h, v2.8h \n" - "trn2 v19.8h, v1.8h, v3.8h \n" - "trn1 v20.8h, v4.8h, v6.8h \n" - "trn1 v21.8h, v5.8h, v7.8h \n" - "trn2 v22.8h, v4.8h, v6.8h \n" - "trn2 v23.8h, v5.8h, v7.8h \n" - "trn1 v24.8h, v8.8h, v10.8h \n" - "trn1 v25.8h, v9.8h, v11.8h \n" - "trn2 v26.8h, v8.8h, v10.8h \n" - "trn2 v27.8h, v9.8h, v11.8h \n" - "trn1 v28.8h, v12.8h, v14.8h \n" - "trn1 v29.8h, v13.8h, v15.8h \n" - "trn2 v30.8h, v12.8h, v14.8h \n" - "trn2 v31.8h, v13.8h, v15.8h \n" + "trn1 v16.8h, v0.8h, v2.8h \n" + "trn1 v17.8h, v1.8h, v3.8h \n" + "trn2 v18.8h, v0.8h, v2.8h \n" + "trn2 v19.8h, v1.8h, v3.8h \n" + "trn1 v20.8h, v4.8h, v6.8h \n" + "trn1 v21.8h, v5.8h, v7.8h \n" + "trn2 v22.8h, v4.8h, v6.8h \n" + "trn2 v23.8h, v5.8h, v7.8h \n" + "trn1 v24.8h, v8.8h, v10.8h \n" + "trn1 v25.8h, v9.8h, v11.8h \n" + "trn2 v26.8h, v8.8h, v10.8h \n" + "trn2 v27.8h, v9.8h, v11.8h \n" + "trn1 v28.8h, v12.8h, v14.8h \n" + "trn1 v29.8h, v13.8h, v15.8h \n" + "trn2 v30.8h, v12.8h, v14.8h \n" + "trn2 v31.8h, v13.8h, v15.8h \n" - "subs %w[width], %w[width], #16 \n" + "subs %w[width], %w[width], #16 \n" // Transpose 4x4-byte blocks within each 8x8 block. 
- "trn1 v0.4s, v16.4s, v20.4s \n" - "trn1 v2.4s, v17.4s, v21.4s \n" - "trn1 v4.4s, v18.4s, v22.4s \n" - "trn1 v6.4s, v19.4s, v23.4s \n" - "trn2 v8.4s, v16.4s, v20.4s \n" - "trn2 v10.4s, v17.4s, v21.4s \n" - "trn2 v12.4s, v18.4s, v22.4s \n" - "trn2 v14.4s, v19.4s, v23.4s \n" - "trn1 v1.4s, v24.4s, v28.4s \n" - "trn1 v3.4s, v25.4s, v29.4s \n" - "trn1 v5.4s, v26.4s, v30.4s \n" - "trn1 v7.4s, v27.4s, v31.4s \n" - "trn2 v9.4s, v24.4s, v28.4s \n" - "trn2 v11.4s, v25.4s, v29.4s \n" - "trn2 v13.4s, v26.4s, v30.4s \n" - "trn2 v15.4s, v27.4s, v31.4s \n" + "trn1 v0.4s, v16.4s, v20.4s \n" + "trn1 v2.4s, v17.4s, v21.4s \n" + "trn1 v4.4s, v18.4s, v22.4s \n" + "trn1 v6.4s, v19.4s, v23.4s \n" + "trn2 v8.4s, v16.4s, v20.4s \n" + "trn2 v10.4s, v17.4s, v21.4s \n" + "trn2 v12.4s, v18.4s, v22.4s \n" + "trn2 v14.4s, v19.4s, v23.4s \n" + "trn1 v1.4s, v24.4s, v28.4s \n" + "trn1 v3.4s, v25.4s, v29.4s \n" + "trn1 v5.4s, v26.4s, v30.4s \n" + "trn1 v7.4s, v27.4s, v31.4s \n" + "trn2 v9.4s, v24.4s, v28.4s \n" + "trn2 v11.4s, v25.4s, v29.4s \n" + "trn2 v13.4s, v26.4s, v30.4s \n" + "trn2 v15.4s, v27.4s, v31.4s \n" // Transpose 8x8-byte blocks and store. - "st2 {v0.d, v1.d}[0], [%[dst]], %[dst_stride] \n" - "st2 {v2.d, v3.d}[0], [%[dst]], %[dst_stride] \n" - "st2 {v4.d, v5.d}[0], [%[dst]], %[dst_stride] \n" - "st2 {v6.d, v7.d}[0], [%[dst]], %[dst_stride] \n" - "st2 {v8.d, v9.d}[0], [%[dst]], %[dst_stride] \n" - "st2 {v10.d, v11.d}[0], [%[dst]], %[dst_stride] \n" - "st2 {v12.d, v13.d}[0], [%[dst]], %[dst_stride] \n" - "st2 {v14.d, v15.d}[0], [%[dst]], %[dst_stride] \n" - "st2 {v0.d, v1.d}[1], [%[dst]], %[dst_stride] \n" - "st2 {v2.d, v3.d}[1], [%[dst]], %[dst_stride] \n" - "st2 {v4.d, v5.d}[1], [%[dst]], %[dst_stride] \n" - "st2 {v6.d, v7.d}[1], [%[dst]], %[dst_stride] \n" - "st2 {v8.d, v9.d}[1], [%[dst]], %[dst_stride] \n" - "st2 {v10.d, v11.d}[1], [%[dst]], %[dst_stride] \n" - "st2 {v12.d, v13.d}[1], [%[dst]], %[dst_stride] \n" - "st2 {v14.d, v15.d}[1], [%[dst]], %[dst_stride] \n" + "st2 {v0.d, v1.d}[0], [%[dst]], %[dst_stride] \n" + "st2 {v2.d, v3.d}[0], [%[dst]], %[dst_stride] \n" + "st2 {v4.d, v5.d}[0], [%[dst]], %[dst_stride] \n" + "st2 {v6.d, v7.d}[0], [%[dst]], %[dst_stride] \n" + "st2 {v8.d, v9.d}[0], [%[dst]], %[dst_stride] \n" + "st2 {v10.d, v11.d}[0], [%[dst]], %[dst_stride] \n" + "st2 {v12.d, v13.d}[0], [%[dst]], %[dst_stride] \n" + "st2 {v14.d, v15.d}[0], [%[dst]], %[dst_stride] \n" + "st2 {v0.d, v1.d}[1], [%[dst]], %[dst_stride] \n" + "st2 {v2.d, v3.d}[1], [%[dst]], %[dst_stride] \n" + "st2 {v4.d, v5.d}[1], [%[dst]], %[dst_stride] \n" + "st2 {v6.d, v7.d}[1], [%[dst]], %[dst_stride] \n" + "st2 {v8.d, v9.d}[1], [%[dst]], %[dst_stride] \n" + "st2 {v10.d, v11.d}[1], [%[dst]], %[dst_stride] \n" + "st2 {v12.d, v13.d}[1], [%[dst]], %[dst_stride] \n" + "st2 {v14.d, v15.d}[1], [%[dst]], %[dst_stride] \n" - "b.gt 1b \n" + "b.gt 1b \n" : [src] "+r"(src), // %[src] [src_temp] "=&r"(src_temp), // %[src_temp] [dst] "+r"(dst), // %[dst] @@ -145,76 +145,76 @@ void TransposeUVWx8_NEON(const uint8_t* src, int dst_stride_b, int width) { const uint8_t* temp; - asm volatile ( + asm volatile( // loops are on blocks of 8. loop will stop when // counter gets to or below 0. 
starting the counter // at w-8 allow for this - "sub %w[width], %w[width], #8 \n" + "sub %w[width], %w[width], #8 \n" - "1: \n" - "mov %[temp], %[src] \n" - "ld1 {v0.16b}, [%[temp]], %[src_stride] \n" - "ld1 {v1.16b}, [%[temp]], %[src_stride] \n" - "ld1 {v2.16b}, [%[temp]], %[src_stride] \n" - "ld1 {v3.16b}, [%[temp]], %[src_stride] \n" - "ld1 {v4.16b}, [%[temp]], %[src_stride] \n" - "ld1 {v5.16b}, [%[temp]], %[src_stride] \n" - "ld1 {v6.16b}, [%[temp]], %[src_stride] \n" - "ld1 {v7.16b}, [%[temp]] \n" - "add %[src], %[src], #16 \n" + "1: \n" + "mov %[temp], %[src] \n" + "ld1 {v0.16b}, [%[temp]], %[src_stride] \n" + "ld1 {v1.16b}, [%[temp]], %[src_stride] \n" + "ld1 {v2.16b}, [%[temp]], %[src_stride] \n" + "ld1 {v3.16b}, [%[temp]], %[src_stride] \n" + "ld1 {v4.16b}, [%[temp]], %[src_stride] \n" + "ld1 {v5.16b}, [%[temp]], %[src_stride] \n" + "ld1 {v6.16b}, [%[temp]], %[src_stride] \n" + "ld1 {v7.16b}, [%[temp]] \n" + "add %[src], %[src], #16 \n" - "trn1 v16.16b, v0.16b, v1.16b \n" - "trn2 v17.16b, v0.16b, v1.16b \n" - "trn1 v18.16b, v2.16b, v3.16b \n" - "trn2 v19.16b, v2.16b, v3.16b \n" - "trn1 v20.16b, v4.16b, v5.16b \n" - "trn2 v21.16b, v4.16b, v5.16b \n" - "trn1 v22.16b, v6.16b, v7.16b \n" - "trn2 v23.16b, v6.16b, v7.16b \n" + "trn1 v16.16b, v0.16b, v1.16b \n" + "trn2 v17.16b, v0.16b, v1.16b \n" + "trn1 v18.16b, v2.16b, v3.16b \n" + "trn2 v19.16b, v2.16b, v3.16b \n" + "trn1 v20.16b, v4.16b, v5.16b \n" + "trn2 v21.16b, v4.16b, v5.16b \n" + "trn1 v22.16b, v6.16b, v7.16b \n" + "trn2 v23.16b, v6.16b, v7.16b \n" - "subs %w[width], %w[width], #8 \n" + "subs %w[width], %w[width], #8 \n" - "trn1 v0.8h, v16.8h, v18.8h \n" - "trn2 v1.8h, v16.8h, v18.8h \n" - "trn1 v2.8h, v20.8h, v22.8h \n" - "trn2 v3.8h, v20.8h, v22.8h \n" - "trn1 v4.8h, v17.8h, v19.8h \n" - "trn2 v5.8h, v17.8h, v19.8h \n" - "trn1 v6.8h, v21.8h, v23.8h \n" - "trn2 v7.8h, v21.8h, v23.8h \n" + "trn1 v0.8h, v16.8h, v18.8h \n" + "trn2 v1.8h, v16.8h, v18.8h \n" + "trn1 v2.8h, v20.8h, v22.8h \n" + "trn2 v3.8h, v20.8h, v22.8h \n" + "trn1 v4.8h, v17.8h, v19.8h \n" + "trn2 v5.8h, v17.8h, v19.8h \n" + "trn1 v6.8h, v21.8h, v23.8h \n" + "trn2 v7.8h, v21.8h, v23.8h \n" - "trn1 v16.4s, v0.4s, v2.4s \n" - "trn2 v17.4s, v0.4s, v2.4s \n" - "trn1 v18.4s, v1.4s, v3.4s \n" - "trn2 v19.4s, v1.4s, v3.4s \n" - "trn1 v20.4s, v4.4s, v6.4s \n" - "trn2 v21.4s, v4.4s, v6.4s \n" - "trn1 v22.4s, v5.4s, v7.4s \n" - "trn2 v23.4s, v5.4s, v7.4s \n" + "trn1 v16.4s, v0.4s, v2.4s \n" + "trn2 v17.4s, v0.4s, v2.4s \n" + "trn1 v18.4s, v1.4s, v3.4s \n" + "trn2 v19.4s, v1.4s, v3.4s \n" + "trn1 v20.4s, v4.4s, v6.4s \n" + "trn2 v21.4s, v4.4s, v6.4s \n" + "trn1 v22.4s, v5.4s, v7.4s \n" + "trn2 v23.4s, v5.4s, v7.4s \n" - "mov %[temp], %[dst_a] \n" - "st1 {v16.d}[0], [%[temp]], %[dst_stride_a] \n" - "st1 {v18.d}[0], [%[temp]], %[dst_stride_a] \n" - "st1 {v17.d}[0], [%[temp]], %[dst_stride_a] \n" - "st1 {v19.d}[0], [%[temp]], %[dst_stride_a] \n" - "st1 {v16.d}[1], [%[temp]], %[dst_stride_a] \n" - "st1 {v18.d}[1], [%[temp]], %[dst_stride_a] \n" - "st1 {v17.d}[1], [%[temp]], %[dst_stride_a] \n" - "st1 {v19.d}[1], [%[temp]] \n" - "add %[dst_a], %[dst_a], %[dst_stride_a], lsl #3 \n" + "mov %[temp], %[dst_a] \n" + "st1 {v16.d}[0], [%[temp]], %[dst_stride_a] \n" + "st1 {v18.d}[0], [%[temp]], %[dst_stride_a] \n" + "st1 {v17.d}[0], [%[temp]], %[dst_stride_a] \n" + "st1 {v19.d}[0], [%[temp]], %[dst_stride_a] \n" + "st1 {v16.d}[1], [%[temp]], %[dst_stride_a] \n" + "st1 {v18.d}[1], [%[temp]], %[dst_stride_a] \n" + "st1 {v17.d}[1], [%[temp]], %[dst_stride_a] \n" + "st1 {v19.d}[1], 
[%[temp]] \n" + "add %[dst_a], %[dst_a], %[dst_stride_a], lsl #3 \n" - "mov %[temp], %[dst_b] \n" - "st1 {v20.d}[0], [%[temp]], %[dst_stride_b] \n" - "st1 {v22.d}[0], [%[temp]], %[dst_stride_b] \n" - "st1 {v21.d}[0], [%[temp]], %[dst_stride_b] \n" - "st1 {v23.d}[0], [%[temp]], %[dst_stride_b] \n" - "st1 {v20.d}[1], [%[temp]], %[dst_stride_b] \n" - "st1 {v22.d}[1], [%[temp]], %[dst_stride_b] \n" - "st1 {v21.d}[1], [%[temp]], %[dst_stride_b] \n" - "st1 {v23.d}[1], [%[temp]] \n" - "add %[dst_b], %[dst_b], %[dst_stride_b], lsl #3 \n" + "mov %[temp], %[dst_b] \n" + "st1 {v20.d}[0], [%[temp]], %[dst_stride_b] \n" + "st1 {v22.d}[0], [%[temp]], %[dst_stride_b] \n" + "st1 {v21.d}[0], [%[temp]], %[dst_stride_b] \n" + "st1 {v23.d}[0], [%[temp]], %[dst_stride_b] \n" + "st1 {v20.d}[1], [%[temp]], %[dst_stride_b] \n" + "st1 {v22.d}[1], [%[temp]], %[dst_stride_b] \n" + "st1 {v21.d}[1], [%[temp]], %[dst_stride_b] \n" + "st1 {v23.d}[1], [%[temp]] \n" + "add %[dst_b], %[dst_b], %[dst_stride_b], lsl #3 \n" - "b.ge 1b \n" + "b.ge 1b \n" : [temp] "=&r"(temp), // %[temp] [src] "+r"(src), // %[src] [dst_a] "+r"(dst_a), // %[dst_a] @@ -239,7 +239,7 @@ void Transpose4x4_32_NEON(const uint8_t* src, uint8_t* dst1 = dst + dst_stride; uint8_t* dst2 = dst1 + dst_stride; uint8_t* dst3 = dst2 + dst_stride; - asm volatile ( + asm volatile( // Main loop transpose 4x4. Read a column, write a row. "1: \n" "ld4 {v0.s, v1.s, v2.s, v3.s}[0], [%0], %9 \n" diff --git a/source/row_gcc.cc b/source/row_gcc.cc index 2ec59759f..c2ad5b8f5 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -161,7 +161,7 @@ static const lvec8 kShuffleNV21 = { #ifdef HAS_J400TOARGBROW_SSE2 void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "pcmpeqb %%xmm5,%%xmm5 \n" "pslld $0x18,%%xmm5 \n" @@ -192,7 +192,7 @@ void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width) { void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000 "pslld $0x18,%%xmm5 \n" "movdqa %3,%%xmm4 \n" @@ -230,7 +230,7 @@ void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24, } void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000 "pslld $0x18,%%xmm5 \n" "movdqa %3,%%xmm4 \n" @@ -269,7 +269,7 @@ void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width) { // Same code as RAWToARGB with different shuffler and A in low bits void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { - asm volatile ( + asm volatile( "pcmpeqb %%xmm5,%%xmm5 \n" // 0x000000ff "psrld $0x18,%%xmm5 \n" "movdqa %3,%%xmm4 \n" @@ -309,7 +309,7 @@ void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { - asm volatile ( + asm volatile( "movdqa %3,%%xmm3 \n" "movdqa %4,%%xmm4 \n" "movdqa %5,%%xmm5 \n" @@ -339,7 +339,7 @@ void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, } void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile ( + asm volatile( "mov $0x1080108,%%eax \n" "movd %%eax,%%xmm5 \n" "pshufd $0x0,%%xmm5,%%xmm5 \n" @@ -387,7 +387,7 @@ void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { } void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile ( + asm volatile( "mov $0x1080108,%%eax \n" "movd %%eax,%%xmm5 \n" 
"pshufd $0x0,%%xmm5,%%xmm5 \n" @@ -438,7 +438,7 @@ void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { } void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile ( + asm volatile( "mov $0xf0f0f0f,%%eax \n" "movd %%eax,%%xmm4 \n" "pshufd $0x0,%%xmm4,%%xmm4 \n" @@ -475,10 +475,9 @@ void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { } void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { - asm volatile ( - "movdqa %3,%%xmm6 \n" + asm volatile("movdqa %3,%%xmm6 \n" - LABELALIGN + LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -505,18 +504,18 @@ void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { "lea 0x30(%1),%1 \n" "sub $0x10,%2 \n" "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "m"(kShuffleMaskARGBToRGB24) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(kShuffleMaskARGBToRGB24) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", + "xmm6"); } void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) { - asm volatile ( - "movdqa %3,%%xmm6 \n" + asm volatile("movdqa %3,%%xmm6 \n" - LABELALIGN + LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -543,11 +542,12 @@ void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) { "lea 0x30(%1),%1 \n" "sub $0x10,%2 \n" "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "m"(kShuffleMaskARGBToRAW) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(kShuffleMaskARGBToRAW) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", + "xmm6"); } #ifdef HAS_ARGBTORGB24ROW_AVX2 @@ -555,7 +555,7 @@ void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) { static const lvec32 kPermdRGB24_AVX = {0, 1, 2, 4, 5, 6, 3, 7}; void ARGBToRGB24Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile ( + asm volatile( "vbroadcastf128 %3,%%ymm6 \n" "vmovdqa %4,%%ymm7 \n" @@ -615,7 +615,7 @@ static const ulvec8 kPermARGBToRGB24_2 = { 50u, 52u, 53u, 54u, 56u, 57u, 58u, 60u, 61u, 62u}; void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width) { - asm volatile ( + asm volatile( "vmovdqa %3,%%ymm5 \n" "vmovdqa %4,%%ymm6 \n" "vmovdqa %5,%%ymm7 \n" @@ -649,7 +649,7 @@ void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width) { #ifdef HAS_ARGBTORAWROW_AVX2 void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile ( + asm volatile( "vbroadcastf128 %3,%%ymm6 \n" "vmovdqa %4,%%ymm7 \n" @@ -694,7 +694,7 @@ void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { #endif void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile ( + asm volatile( "pcmpeqb %%xmm3,%%xmm3 \n" "psrld $0x1b,%%xmm3 \n" "pcmpeqb %%xmm4,%%xmm4 \n" @@ -734,7 +734,7 @@ void ARGBToRGB565DitherRow_SSE2(const uint8_t* src, uint8_t* dst, uint32_t dither4, int width) { - asm volatile ( + asm volatile( "movd %3,%%xmm6 \n" "punpcklbw %%xmm6,%%xmm6 \n" "movdqa %%xmm6,%%xmm7 \n" @@ -782,7 +782,7 @@ void ARGBToRGB565DitherRow_AVX2(const uint8_t* src, uint8_t* dst, uint32_t dither4, int width) { - asm volatile ( + asm volatile( "vbroadcastss %3,%%xmm6 \n" "vpunpcklbw %%xmm6,%%xmm6,%%xmm6 \n" "vpermq $0xd8,%%ymm6,%%ymm6 
\n" @@ -824,7 +824,7 @@ void ARGBToRGB565DitherRow_AVX2(const uint8_t* src, #endif // HAS_ARGBTORGB565DITHERROW_AVX2 void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile ( + asm volatile( "pcmpeqb %%xmm4,%%xmm4 \n" "psrld $0x1b,%%xmm4 \n" "movdqa %%xmm4,%%xmm5 \n" @@ -865,7 +865,7 @@ void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { } void ARGBToARGB4444Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile ( + asm volatile( "pcmpeqb %%xmm4,%%xmm4 \n" "psllw $0xc,%%xmm4 \n" "movdqa %%xmm4,%%xmm3 \n" @@ -928,7 +928,7 @@ static const uint32_t kMaskAG10 = 0xc000ff00; static const uint32_t kMulAG10 = 64 * 65536 + 1028; void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { - asm volatile ( + asm volatile( "movdqa %3,%%xmm2 \n" // shuffler for RB "movd %4,%%xmm3 \n" // multipler for RB "movd %5,%%xmm4 \n" // mask for R10 B10 @@ -967,7 +967,7 @@ void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { } void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { - asm volatile ( + asm volatile( "movdqa %3,%%xmm2 \n" // shuffler for RB "movd %4,%%xmm3 \n" // multipler for RB "movd %5,%%xmm4 \n" // mask for R10 B10 @@ -1007,7 +1007,7 @@ void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { #ifdef HAS_ARGBTOAR30ROW_AVX2 void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile ( + asm volatile( "vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB "vbroadcastss %4,%%ymm3 \n" // multipler for RB "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10 @@ -1044,7 +1044,7 @@ void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { #ifdef HAS_ABGRTOAR30ROW_AVX2 void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile ( + asm volatile( "vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB "vbroadcastss %4,%%ymm3 \n" // multipler for RB "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10 @@ -1090,7 +1090,7 @@ static const uvec8 kShuffleARGBToAB64Hi = {10, 10, 9, 9, 8, 8, 11, 11, void ARGBToAR64Row_SSSE3(const uint8_t* src_argb, uint16_t* dst_ar64, int width) { - asm volatile ( + asm volatile( "1: \n" "movdqu (%0),%%xmm0 \n" "movdqa %%xmm0,%%xmm1 \n" @@ -1112,7 +1112,7 @@ void ARGBToAR64Row_SSSE3(const uint8_t* src_argb, void ARGBToAB64Row_SSSE3(const uint8_t* src_argb, uint16_t* dst_ab64, int width) { - asm volatile ( + asm volatile( "movdqa %3,%%xmm2 \n" "movdqa %4,%%xmm3 \n" LABELALIGN "1: \n" @@ -1137,7 +1137,7 @@ void ARGBToAB64Row_SSSE3(const uint8_t* src_argb, void AR64ToARGBRow_SSSE3(const uint16_t* src_ar64, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -1159,10 +1159,9 @@ void AR64ToARGBRow_SSSE3(const uint16_t* src_ar64, void AB64ToARGBRow_SSSE3(const uint16_t* src_ab64, uint8_t* dst_argb, int width) { - asm volatile ( - "movdqa %3,%%xmm2 \n" + asm volatile("movdqa %3,%%xmm2 \n" - LABELALIGN + LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -1175,18 +1174,18 @@ void AB64ToARGBRow_SSSE3(const uint16_t* src_ab64, "lea 0x10(%1),%1 \n" "sub $0x4,%2 \n" "jg 1b \n" - : "+r"(src_ab64), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kShuffleARGBToABGR) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2"); + : "+r"(src_ab64), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kShuffleARGBToABGR) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #ifdef HAS_ARGBTOAR64ROW_AVX2 void 
ARGBToAR64Row_AVX2(const uint8_t* src_argb, uint16_t* dst_ar64, int width) { - asm volatile ( + asm volatile( "1: \n" "vmovdqu (%0),%%ymm0 \n" "vpermq $0xd8,%%ymm0,%%ymm0 \n" @@ -1211,7 +1210,7 @@ void ARGBToAR64Row_AVX2(const uint8_t* src_argb, void ARGBToAB64Row_AVX2(const uint8_t* src_argb, uint16_t* dst_ab64, int width) { - asm volatile ( + asm volatile( "vbroadcastf128 %3,%%ymm2 \n" "vbroadcastf128 %4,%%ymm3 \n" LABELALIGN "1: \n" @@ -1239,7 +1238,7 @@ void ARGBToAB64Row_AVX2(const uint8_t* src_argb, void AR64ToARGBRow_AVX2(const uint16_t* src_ar64, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" @@ -1265,8 +1264,7 @@ void AR64ToARGBRow_AVX2(const uint16_t* src_ar64, void AB64ToARGBRow_AVX2(const uint16_t* src_ab64, uint8_t* dst_argb, int width) { - asm volatile ( - "vbroadcastf128 %3,%%ymm2 \n" LABELALIGN + asm volatile("vbroadcastf128 %3,%%ymm2 \n" LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" @@ -1281,11 +1279,11 @@ void AB64ToARGBRow_AVX2(const uint16_t* src_ab64, "sub $0x8,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_ab64), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kShuffleARGBToABGR) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2"); + : "+r"(src_ab64), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kShuffleARGBToABGR) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif @@ -1360,7 +1358,7 @@ void AB64ToARGBRow_AVX2(const uint16_t* src_ab64, #ifdef HAS_ARGBTOYROW_SSSE3 // Convert 16 ARGB pixels (64 bytes) to 16 Y values. void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { - asm volatile ( + asm volatile( "movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" "movdqa %5,%%xmm7 \n" @@ -1381,7 +1379,7 @@ void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { // Convert 16 ARGB pixels (64 bytes) to 16 YJ values. // Same as ARGBToYRow but different coefficients, no add 16. void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { - asm volatile ( + asm volatile( "movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" @@ -1399,7 +1397,7 @@ void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { // Convert 16 ABGR pixels (64 bytes) to 16 YJ values. // Same as ABGRToYRow but different coefficients, no add 16. void ABGRToYJRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) { - asm volatile ( + asm volatile( "movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" @@ -1417,7 +1415,7 @@ void ABGRToYJRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) { // Convert 16 ARGB pixels (64 bytes) to 16 YJ values. // Same as ARGBToYRow but different coefficients, no add 16. void RGBAToYJRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) { - asm volatile ( + asm volatile( "movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" @@ -1441,7 +1439,7 @@ static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7}; // Convert 32 ARGB pixels (128 bytes) to 32 Y values. void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { - asm volatile ( + asm volatile( "vbroadcastf128 %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" "vbroadcastf128 %5,%%ymm7 \n" @@ -1462,7 +1460,7 @@ void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { #ifdef HAS_ABGRTOYROW_AVX2 // Convert 32 ABGR pixels (128 bytes) to 32 Y values. 
void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) { - asm volatile ( + asm volatile( "vbroadcastf128 %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" "vbroadcastf128 %5,%%ymm7 \n" @@ -1483,7 +1481,7 @@ void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) { #ifdef HAS_ARGBTOYJROW_AVX2 // Convert 32 ARGB pixels (128 bytes) to 32 Y values. void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { - asm volatile ( + asm volatile( "vbroadcastf128 %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" "vmovdqu %5,%%ymm6 \n" LABELALIGN RGBTOY_AVX2( @@ -1502,7 +1500,7 @@ void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { #ifdef HAS_ABGRTOYJROW_AVX2 // Convert 32 ABGR pixels (128 bytes) to 32 Y values. void ABGRToYJRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) { - asm volatile ( + asm volatile( "vbroadcastf128 %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" "vmovdqu %5,%%ymm6 \n" LABELALIGN RGBTOY_AVX2( @@ -1521,7 +1519,7 @@ void ABGRToYJRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) { #ifdef HAS_RGBATOYJROW_AVX2 // Convert 32 ARGB pixels (128 bytes) to 32 Y values. void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width) { - asm volatile ( + asm volatile( "vbroadcastf128 %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" "vmovdqu %5,%%ymm6 \n" LABELALIGN RGBTOY_AVX2( @@ -1542,7 +1540,7 @@ void ARGBToUVRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( + asm volatile( "movdqa %5,%%xmm3 \n" "movdqa %6,%%xmm4 \n" "movdqa %7,%%xmm5 \n" @@ -1615,7 +1613,7 @@ void ARGBToUVRow_AVX2(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( + asm volatile( "vbroadcastf128 %5,%%ymm5 \n" "vbroadcastf128 %6,%%ymm6 \n" "vbroadcastf128 %7,%%ymm7 \n" @@ -1678,7 +1676,7 @@ void ABGRToUVRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( + asm volatile( "vbroadcastf128 %5,%%ymm5 \n" "vbroadcastf128 %6,%%ymm6 \n" "vbroadcastf128 %7,%%ymm7 \n" @@ -1741,7 +1739,7 @@ void ARGBToUVJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( + asm volatile( "vbroadcastf128 %5,%%ymm5 \n" "vbroadcastf128 %6,%%ymm6 \n" "vbroadcastf128 %7,%%ymm7 \n" @@ -1806,7 +1804,7 @@ void ABGRToUVJRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( + asm volatile( "vbroadcastf128 %5,%%ymm5 \n" "vbroadcastf128 %6,%%ymm6 \n" "vbroadcastf128 %7,%%ymm7 \n" @@ -1870,7 +1868,7 @@ void ARGBToUVJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( + asm volatile( "movdqa %5,%%xmm3 \n" "movdqa %6,%%xmm4 \n" "movdqa %7,%%xmm5 \n" @@ -1936,7 +1934,7 @@ void ABGRToUVJRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( + asm volatile( "movdqa %5,%%xmm3 \n" "movdqa %6,%%xmm4 \n" "movdqa %7,%%xmm5 \n" @@ -2001,7 +1999,7 @@ void ARGBToUV444Row_SSSE3(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( + asm volatile( "movdqa %4,%%xmm3 \n" "movdqa %5,%%xmm4 \n" "movdqa %6,%%xmm5 \n" @@ -2055,7 +2053,7 @@ void ARGBToUV444Row_SSSE3(const uint8_t* src_argb, #endif // HAS_ARGBTOUV444ROW_SSSE3 void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width) { - asm volatile ( + asm volatile( "movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" "movdqa %5,%%xmm7 \n" @@ -2076,7 +2074,7 @@ void BGRAToUVRow_SSSE3(const uint8_t* src_bgra, 
uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( + asm volatile( "movdqa %5,%%xmm3 \n" "movdqa %6,%%xmm4 \n" "movdqa %7,%%xmm5 \n" @@ -2135,7 +2133,7 @@ void BGRAToUVRow_SSSE3(const uint8_t* src_bgra, } void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) { - asm volatile ( + asm volatile( "movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" "movdqa %5,%%xmm7 \n" @@ -2152,7 +2150,7 @@ void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) { } void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) { - asm volatile ( + asm volatile( "movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" "movdqa %5,%%xmm7 \n" @@ -2173,7 +2171,7 @@ void ABGRToUVRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( + asm volatile( "movdqa %5,%%xmm3 \n" "movdqa %6,%%xmm4 \n" "movdqa %7,%%xmm5 \n" @@ -2236,7 +2234,7 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( + asm volatile( "movdqa %5,%%xmm3 \n" "movdqa %6,%%xmm4 \n" "movdqa %7,%%xmm5 \n" @@ -2657,7 +2655,8 @@ void OMITFP I444AlphaToARGBRow_SSSE3(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile (YUVTORGB_SETUP( + asm volatile( + YUVTORGB_SETUP( yuvconstants) "sub %[u_buf],%[v_buf] \n" LABELALIGN "1: \n" READYUVA444 @@ -2983,7 +2982,8 @@ void OMITFP I210AlphaToARGBRow_SSSE3(const uint16_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile (YUVTORGB_SETUP( + asm volatile( + YUVTORGB_SETUP( yuvconstants) "sub %[u_buf],%[v_buf] \n" LABELALIGN "1: \n" READYUVA210 @@ -3015,7 +3015,8 @@ void OMITFP I410AlphaToARGBRow_SSSE3(const uint16_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile (YUVTORGB_SETUP( + asm volatile( + YUVTORGB_SETUP( yuvconstants) "sub %[u_buf],%[v_buf] \n" LABELALIGN "1: \n" READYUVA410 @@ -3081,7 +3082,8 @@ void OMITFP I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile (YUVTORGB_SETUP( + asm volatile( + YUVTORGB_SETUP( yuvconstants) "sub %[u_buf],%[v_buf] \n" LABELALIGN "1: \n" READYUVA422 @@ -3109,7 +3111,8 @@ void OMITFP NV12ToARGBRow_SSSE3(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile (YUVTORGB_SETUP( + asm volatile( + YUVTORGB_SETUP( yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" LABELALIGN "1: \n" READNV12 @@ -3130,7 +3133,8 @@ void OMITFP NV21ToARGBRow_SSSE3(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile (YUVTORGB_SETUP( + asm volatile( + YUVTORGB_SETUP( yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" LABELALIGN "1: \n" READNV21 @@ -3151,7 +3155,7 @@ void OMITFP YUY2ToARGBRow_SSSE3(const uint8_t* yuy2_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( "movdqa %[kShuffleYUY2Y],%%xmm6 \n" "movdqa %[kShuffleYUY2UV],%%xmm7 \n" YUVTORGB_SETUP( yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" @@ -3173,7 +3177,7 @@ void OMITFP UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( "movdqa %[kShuffleUYVYY],%%xmm6 \n" "movdqa %[kShuffleUYVYUV],%%xmm7 \n" YUVTORGB_SETUP( yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" @@ -3196,7 +3200,8 @@ void OMITFP P210ToARGBRow_SSSE3(const uint16_t* y_buf, uint8_t* 
dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile (YUVTORGB_SETUP( + asm volatile( + YUVTORGB_SETUP( yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" LABELALIGN "1: \n" READP210 @@ -3217,7 +3222,8 @@ void OMITFP P410ToARGBRow_SSSE3(const uint16_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile (YUVTORGB_SETUP( + asm volatile( + YUVTORGB_SETUP( yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" LABELALIGN "1: \n" READP410 @@ -4051,7 +4057,8 @@ void OMITFP I210AlphaToARGBRow_AVX2(const uint16_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile (YUVTORGB_SETUP_AVX2( + asm volatile( + YUVTORGB_SETUP_AVX2( yuvconstants) "sub %[u_buf],%[v_buf] \n" LABELALIGN "1: \n" READYUVA210_AVX2 @@ -4086,7 +4093,8 @@ void OMITFP I410AlphaToARGBRow_AVX2(const uint16_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile (YUVTORGB_SETUP_AVX2( + asm volatile( + YUVTORGB_SETUP_AVX2( yuvconstants) "sub %[u_buf],%[v_buf] \n" LABELALIGN "1: \n" READYUVA410_AVX2 @@ -4161,7 +4169,8 @@ void OMITFP I444AlphaToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile (YUVTORGB_SETUP_AVX2( + asm volatile( + YUVTORGB_SETUP_AVX2( yuvconstants) "sub %[u_buf],%[v_buf] \n" LABELALIGN "1: \n" READYUVA444_AVX2 @@ -4195,7 +4204,8 @@ void OMITFP I422AlphaToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile (YUVTORGB_SETUP_AVX2( + asm volatile( + YUVTORGB_SETUP_AVX2( yuvconstants) "sub %[u_buf],%[v_buf] \n" LABELALIGN "1: \n" READYUVA422_AVX2 @@ -4271,7 +4281,8 @@ void OMITFP NV12ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile (YUVTORGB_SETUP_AVX2( + asm volatile( + YUVTORGB_SETUP_AVX2( yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN "1: \n" READNV12_AVX2 @@ -4297,7 +4308,8 @@ void OMITFP NV21ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile (YUVTORGB_SETUP_AVX2( + asm volatile( + YUVTORGB_SETUP_AVX2( yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN "1: \n" READNV21_AVX2 @@ -4323,7 +4335,7 @@ void OMITFP YUY2ToARGBRow_AVX2(const uint8_t* yuy2_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( "vbroadcastf128 %[kShuffleYUY2Y],%%ymm6 \n" "vbroadcastf128 %[kShuffleYUY2UV],%%ymm7 \n" YUVTORGB_SETUP_AVX2( yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" @@ -4350,7 +4362,7 @@ void OMITFP UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( "vbroadcastf128 %[kShuffleUYVYY],%%ymm6 \n" "vbroadcastf128 %[kShuffleUYVYUV],%%ymm7 \n" YUVTORGB_SETUP_AVX2( yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" @@ -4378,7 +4390,8 @@ void OMITFP P210ToARGBRow_AVX2(const uint16_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile (YUVTORGB_SETUP_AVX2( + asm volatile( + YUVTORGB_SETUP_AVX2( yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN "1: \n" READP210_AVX2 @@ -4404,7 +4417,8 @@ void OMITFP P410ToARGBRow_AVX2(const uint16_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile (YUVTORGB_SETUP_AVX2( + asm volatile( + YUVTORGB_SETUP_AVX2( yuvconstants) 
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN "1: \n" READP410_AVX2 @@ -4501,7 +4515,7 @@ void I400ToARGBRow_SSE2(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( "movdqa 96(%3),%%xmm2 \n" // yg = 18997 = 1.164 "movdqa 128(%3),%%xmm3 \n" // ygb = 1160 = 1.164 * 16 "pcmpeqb %%xmm4,%%xmm4 \n" // 0xff000000 @@ -4546,7 +4560,7 @@ void I400ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( "vmovdqa 96(%3),%%ymm2 \n" // yg = 18997 = 1.164 "vmovdqa 128(%3),%%ymm3 \n" // ygb = -1160 = 1.164*16 "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 0xff000000 @@ -4590,10 +4604,9 @@ static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) { intptr_t temp_width = (intptr_t)(width); - asm volatile ( - "movdqa %3,%%xmm5 \n" + asm volatile("movdqa %3,%%xmm5 \n" - LABELALIGN + LABELALIGN "1: \n" "movdqu -0x10(%0,%2,1),%%xmm0 \n" "pshufb %%xmm5,%%xmm0 \n" @@ -4601,21 +4614,20 @@ void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) { "lea 0x10(%1),%1 \n" "sub $0x10,%2 \n" "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(temp_width) // %2 - : "m"(kShuffleMirror) // %3 - : "memory", "cc", "xmm0", "xmm5"); + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : "m"(kShuffleMirror) // %3 + : "memory", "cc", "xmm0", "xmm5"); } #endif // HAS_MIRRORROW_SSSE3 #ifdef HAS_MIRRORROW_AVX2 void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { intptr_t temp_width = (intptr_t)(width); - asm volatile ( - "vbroadcastf128 %3,%%ymm5 \n" + asm volatile("vbroadcastf128 %3,%%ymm5 \n" - LABELALIGN + LABELALIGN "1: \n" "vmovdqu -0x20(%0,%2,1),%%ymm0 \n" "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" @@ -4625,11 +4637,11 @@ void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { "sub $0x20,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(temp_width) // %2 - : "m"(kShuffleMirror) // %3 - : "memory", "cc", "xmm0", "xmm5"); + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : "m"(kShuffleMirror) // %3 + : "memory", "cc", "xmm0", "xmm5"); } #endif // HAS_MIRRORROW_AVX2 @@ -4640,10 +4652,9 @@ static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u, void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width) { intptr_t temp_width = (intptr_t)(width); - asm volatile ( - "movdqa %3,%%xmm5 \n" + asm volatile("movdqa %3,%%xmm5 \n" - LABELALIGN + LABELALIGN "1: \n" "movdqu -0x10(%0,%2,2),%%xmm0 \n" "pshufb %%xmm5,%%xmm0 \n" @@ -4651,21 +4662,20 @@ void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width) { "lea 0x10(%1),%1 \n" "sub $0x8,%2 \n" "jg 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_uv), // %1 - "+r"(temp_width) // %2 - : "m"(kShuffleMirrorUV) // %3 - : "memory", "cc", "xmm0", "xmm5"); + : "+r"(src_uv), // %0 + "+r"(dst_uv), // %1 + "+r"(temp_width) // %2 + : "m"(kShuffleMirrorUV) // %3 + : "memory", "cc", "xmm0", "xmm5"); } #endif // HAS_MIRRORUVROW_SSSE3 #ifdef HAS_MIRRORUVROW_AVX2 void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width) { intptr_t temp_width = (intptr_t)(width); - asm volatile ( - "vbroadcastf128 %3,%%ymm5 \n" + asm volatile("vbroadcastf128 %3,%%ymm5 \n" - LABELALIGN + LABELALIGN "1: \n" "vmovdqu -0x20(%0,%2,2),%%ymm0 \n" "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" @@ -4675,11 +4685,11 @@ void MirrorUVRow_AVX2(const uint8_t* src_uv, 
uint8_t* dst_uv, int width) { "sub $0x10,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_uv), // %0 - "+r"(dst_uv), // %1 - "+r"(temp_width) // %2 - : "m"(kShuffleMirrorUV) // %3 - : "memory", "cc", "xmm0", "xmm5"); + : "+r"(src_uv), // %0 + "+r"(dst_uv), // %1 + "+r"(temp_width) // %2 + : "m"(kShuffleMirrorUV) // %3 + : "memory", "cc", "xmm0", "xmm5"); } #endif // HAS_MIRRORUVROW_AVX2 @@ -4692,7 +4702,7 @@ void MirrorSplitUVRow_SSSE3(const uint8_t* src, uint8_t* dst_v, int width) { intptr_t temp_width = (intptr_t)(width); - asm volatile ( + asm volatile( "movdqa %4,%%xmm1 \n" "lea -0x10(%0,%3,2),%0 \n" "sub %1,%2 \n" @@ -4732,7 +4742,7 @@ void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24, int width) { intptr_t temp_width = (intptr_t)(width); src_rgb24 += width * 3 - 48; - asm volatile ( + asm volatile( "movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" @@ -4767,10 +4777,9 @@ void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24, void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { intptr_t temp_width = (intptr_t)(width); - asm volatile ( - "lea -0x10(%0,%2,4),%0 \n" + asm volatile("lea -0x10(%0,%2,4),%0 \n" - LABELALIGN + LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "pshufd $0x1b,%%xmm0,%%xmm0 \n" @@ -4779,11 +4788,11 @@ void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { "lea 0x10(%1),%1 \n" "sub $0x4,%2 \n" "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(temp_width) // %2 - : - : "memory", "cc", "xmm0"); + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : + : "memory", "cc", "xmm0"); } #endif // HAS_ARGBMIRRORROW_SSE2 @@ -4792,10 +4801,9 @@ void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { intptr_t temp_width = (intptr_t)(width); - asm volatile ( - "vmovdqu %3,%%ymm5 \n" + asm volatile("vmovdqu %3,%%ymm5 \n" - LABELALIGN + LABELALIGN "1: \n" "vpermd -0x20(%0,%2,4),%%ymm5,%%ymm0 \n" "vmovdqu %%ymm0,(%1) \n" @@ -4803,11 +4811,11 @@ void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { "sub $0x8,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(temp_width) // %2 - : "m"(kARGBShuffleMirror_AVX2) // %3 - : "memory", "cc", "xmm0", "xmm5"); + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : "m"(kARGBShuffleMirror_AVX2) // %3 + : "memory", "cc", "xmm0", "xmm5"); } #endif // HAS_ARGBMIRRORROW_AVX2 @@ -4816,7 +4824,7 @@ void SplitUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( + asm volatile( "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpsrlw $0x8,%%ymm5,%%ymm5 \n" "sub %1,%2 \n" @@ -4854,7 +4862,7 @@ void SplitUVRow_SSE2(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( + asm volatile( "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" "sub %1,%2 \n" @@ -4891,7 +4899,7 @@ void DetileRow_SSE2(const uint8_t* src, ptrdiff_t src_tile_stride, uint8_t* dst, int width) { - asm volatile ( + asm volatile( "1: \n" "movdqu (%0),%%xmm0 \n" "sub $0x10,%2 \n" @@ -4912,7 +4920,7 @@ void DetileRow_16_SSE2(const uint16_t* src, ptrdiff_t src_tile_stride, uint16_t* dst, int width) { - asm volatile ( + asm volatile( "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -4935,7 +4943,7 @@ void DetileRow_16_AVX(const uint16_t* src, ptrdiff_t src_tile_stride, uint16_t* dst, int width) { - asm volatile ( + asm volatile( "1: \n" "vmovdqu (%0),%%ymm0 \n" 
"lea (%0,%3,2),%0 \n" @@ -4960,7 +4968,7 @@ void DetileToYUY2_SSE2(const uint8_t* src_y, ptrdiff_t src_uv_tile_stride, uint8_t* dst_yuy2, int width) { - asm volatile ( + asm volatile( "1: \n" "movdqu (%0),%%xmm0 \n" // Load 16 Y "sub $0x10,%3 \n" @@ -4999,7 +5007,7 @@ void DetileSplitUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( + asm volatile( "movdqu %4,%%xmm1 \n" "1: \n" "movdqu (%0),%%xmm0 \n" @@ -5026,10 +5034,9 @@ void MergeUVRow_AVX512BW(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width) { - asm volatile ( - "sub %0,%1 \n" + asm volatile("sub %0,%1 \n" - LABELALIGN + LABELALIGN "1: \n" "vpmovzxbw (%0),%%zmm0 \n" "vpmovzxbw 0x00(%0,%1,1),%%zmm1 \n" @@ -5041,12 +5048,12 @@ void MergeUVRow_AVX512BW(const uint8_t* src_u, "sub $0x20,%3 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_u), // %0 - "+r"(src_v), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_MERGEUVROW_AVX512BW @@ -5055,10 +5062,9 @@ void MergeUVRow_AVX2(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width) { - asm volatile ( - "sub %0,%1 \n" + asm volatile("sub %0,%1 \n" - LABELALIGN + LABELALIGN "1: \n" "vpmovzxbw (%0),%%ymm0 \n" "vpmovzxbw 0x00(%0,%1,1),%%ymm1 \n" @@ -5070,12 +5076,12 @@ void MergeUVRow_AVX2(const uint8_t* src_u, "sub $0x10,%3 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_u), // %0 - "+r"(src_v), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_MERGEUVROW_AVX2 @@ -5084,10 +5090,9 @@ void MergeUVRow_SSE2(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width) { - asm volatile ( - "sub %0,%1 \n" + asm volatile("sub %0,%1 \n" - LABELALIGN + LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x00(%0,%1,1),%%xmm1 \n" @@ -5100,12 +5105,12 @@ void MergeUVRow_SSE2(const uint8_t* src_u, "lea 0x20(%2),%2 \n" "sub $0x10,%3 \n" "jg 1b \n" - : "+r"(src_u), // %0 - "+r"(src_v), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_MERGEUVROW_SSE2 @@ -5115,7 +5120,7 @@ void MergeUVRow_16_AVX2(const uint16_t* src_u, uint16_t* dst_uv, int depth, int width) { - asm volatile ( + asm volatile( "vmovd %4,%%xmm3 \n" "vmovd %5,%%xmm4 \n" @@ -5154,7 +5159,7 @@ void SplitUVRow_16_AVX2(const uint16_t* src_uv, int depth, int width) { depth = 16 - depth; - asm volatile ( + asm volatile( "vmovd %4,%%xmm3 \n" "vbroadcastf128 %5,%%ymm4 \n" "sub %1,%2 \n" @@ -5200,7 +5205,7 @@ void MultiplyRow_16_AVX2(const uint16_t* src_y, uint16_t* dst_y, int scale, int width) { - asm volatile ( + asm volatile( "vmovd %3,%%xmm3 \n" "vpbroadcastw %%xmm3,%%ymm3 \n" "sub %0,%1 \n" @@ -5236,7 +5241,7 @@ void DivideRow_16_AVX2(const uint16_t* src_y, uint16_t* dst_y, int scale, int width) { - asm volatile ( + asm volatile( "vmovd %3,%%xmm3 \n" "vpbroadcastw %%xmm3,%%ymm3 \n" "sub %0,%1 \n" @@ -5272,7 +5277,7 @@ void Convert16To8Row_SSSE3(const uint16_t* src_y, uint8_t* dst_y, int scale, int width) { - asm volatile ( + asm volatile( "movd %3,%%xmm2 \n" "punpcklwd %%xmm2,%%xmm2 
\n" "pshufd $0x0,%%xmm2,%%xmm2 \n" @@ -5302,7 +5307,7 @@ void Convert16To8Row_AVX2(const uint16_t* src_y, uint8_t* dst_y, int scale, int width) { - asm volatile ( + asm volatile( "vmovd %3,%%xmm2 \n" "vpbroadcastw %%xmm2,%%ymm2 \n" @@ -5334,11 +5339,10 @@ void Convert16To8Row_AVX512BW(const uint16_t* src_y, uint8_t* dst_y, int scale, int width) { - asm volatile ( - "vpbroadcastw %3,%%zmm2 \n" + asm volatile("vpbroadcastw %3,%%zmm2 \n" - // 64 pixels per loop. - LABELALIGN + // 64 pixels per loop. + LABELALIGN "1: \n" "vmovups (%0),%%zmm0 \n" "vmovups 0x40(%0),%%zmm1 \n" @@ -5353,11 +5357,11 @@ void Convert16To8Row_AVX512BW(const uint16_t* src_y, "sub $0x40,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_y), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "r"(scale) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2"); + : "+r"(src_y), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(scale) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_CONVERT16TO8ROW_AVX2 @@ -5369,7 +5373,7 @@ void Convert8To16Row_SSE2(const uint8_t* src_y, uint16_t* dst_y, int scale, int width) { - asm volatile ( + asm volatile( "movd %3,%%xmm2 \n" "punpcklwd %%xmm2,%%xmm2 \n" "pshufd $0x0,%%xmm2,%%xmm2 \n" @@ -5401,7 +5405,7 @@ void Convert8To16Row_AVX2(const uint8_t* src_y, uint16_t* dst_y, int scale, int width) { - asm volatile ( + asm volatile( "vmovd %3,%%xmm2 \n" "vpbroadcastw %%xmm2,%%ymm2 \n" @@ -5456,7 +5460,7 @@ void SplitRGBRow_SSSE3(const uint8_t* src_rgb, uint8_t* dst_g, uint8_t* dst_b, int width) { - asm volatile ( + asm volatile( "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -5514,35 +5518,38 @@ static const uvec8 kSplitRGBShuffleSSE41[5] = { {0u, 0u, 128u, 0u, 0u, 128u, 0u, 0u, 128u, 0u, 0u, 128u, 0u, 0u, 128u, 0u}, }; -void SplitRGBRow_SSE41(const uint8_t* src_rgb, uint8_t* dst_r, - uint8_t* dst_g, uint8_t* dst_b, int width) { +void SplitRGBRow_SSE41(const uint8_t* src_rgb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width) { asm volatile( - "movdqa 48(%5), %%xmm0 \n" - "1: \n" - "movdqu (%0),%%xmm1 \n" - "movdqu 0x10(%0),%%xmm2 \n" - "movdqu 0x20(%0),%%xmm3 \n" - "lea 0x30(%0),%0 \n" - "movdqa %%xmm1, %%xmm4 \n" - "pblendvb %%xmm3, %%xmm1 \n" - "pblendvb %%xmm2, %%xmm3 \n" - "pblendvb %%xmm4, %%xmm2 \n" - "palignr $0xF, %%xmm0, %%xmm0 \n" - "pblendvb %%xmm2, %%xmm1 \n" - "pblendvb %%xmm3, %%xmm2 \n" - "pblendvb %%xmm4, %%xmm3 \n" - "palignr $0x1, %%xmm0, %%xmm0 \n" - "pshufb 0(%5), %%xmm1 \n" - "pshufb 16(%5), %%xmm2 \n" - "pshufb 32(%5), %%xmm3 \n" - "movdqu %%xmm1,(%1) \n" - "lea 0x10(%1),%1 \n" - "movdqu %%xmm2,(%2) \n" - "lea 0x10(%2),%2 \n" - "movdqu %%xmm3,(%3) \n" - "lea 0x10(%3),%3 \n" - "sub $0x10,%4 \n" - "jg 1b \n" + "movdqa 48(%5), %%xmm0 \n" + "1: \n" + "movdqu (%0),%%xmm1 \n" + "movdqu 0x10(%0),%%xmm2 \n" + "movdqu 0x20(%0),%%xmm3 \n" + "lea 0x30(%0),%0 \n" + "movdqa %%xmm1, %%xmm4 \n" + "pblendvb %%xmm3, %%xmm1 \n" + "pblendvb %%xmm2, %%xmm3 \n" + "pblendvb %%xmm4, %%xmm2 \n" + "palignr $0xF, %%xmm0, %%xmm0 \n" + "pblendvb %%xmm2, %%xmm1 \n" + "pblendvb %%xmm3, %%xmm2 \n" + "pblendvb %%xmm4, %%xmm3 \n" + "palignr $0x1, %%xmm0, %%xmm0 \n" + "pshufb 0(%5), %%xmm1 \n" + "pshufb 16(%5), %%xmm2 \n" + "pshufb 32(%5), %%xmm3 \n" + "movdqu %%xmm1,(%1) \n" + "lea 0x10(%1),%1 \n" + "movdqu %%xmm2,(%2) \n" + "lea 0x10(%2),%2 \n" + "movdqu %%xmm3,(%3) \n" + "lea 0x10(%3),%3 \n" + "sub $0x10,%4 \n" + "jg 1b \n" : "+r"(src_rgb), // %0 "+r"(dst_r), // %1 "+r"(dst_g), // %2 @@ -5554,50 +5561,53 @@ void SplitRGBRow_SSE41(const uint8_t* src_rgb, 
uint8_t* dst_r, #endif // HAS_SPLITRGBROW_SSE41 #ifdef HAS_SPLITRGBROW_AVX2 -void SplitRGBRow_AVX2(const uint8_t* src_rgb, uint8_t* dst_r, - uint8_t* dst_g, uint8_t* dst_b, int width) { +void SplitRGBRow_AVX2(const uint8_t* src_rgb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width) { asm volatile( - "vbroadcasti128 48(%5), %%ymm0 \n" - "vbroadcasti128 64(%5), %%ymm7 \n" + "vbroadcasti128 48(%5), %%ymm0 \n" + "vbroadcasti128 64(%5), %%ymm7 \n" #if defined(__x86_64__) - "vbroadcasti128 0(%5), %%ymm8 \n" - "vbroadcasti128 16(%5), %%ymm9 \n" - "vbroadcasti128 32(%5), %%ymm10 \n" + "vbroadcasti128 0(%5), %%ymm8 \n" + "vbroadcasti128 16(%5), %%ymm9 \n" + "vbroadcasti128 32(%5), %%ymm10 \n" #endif - "1: \n" - "vmovdqu (%0),%%ymm4 \n" - "vmovdqu 0x20(%0),%%ymm5 \n" - "vmovdqu 0x40(%0),%%ymm6 \n" - "lea 0x60(%0),%0 \n" - "vpblendd $240, %%ymm5, %%ymm4, %%ymm1 \n" - "vperm2i128 $33, %%ymm6, %%ymm4, %%ymm2 \n" - "vpblendd $240, %%ymm6, %%ymm5, %%ymm3 \n" - "vpblendvb %%ymm0, %%ymm3, %%ymm1, %%ymm4 \n" - "vpblendvb %%ymm0, %%ymm1, %%ymm2, %%ymm5 \n" - "vpblendvb %%ymm0, %%ymm2, %%ymm3, %%ymm6 \n" - "vpblendvb %%ymm7, %%ymm5, %%ymm4, %%ymm1 \n" - "vpblendvb %%ymm7, %%ymm6, %%ymm5, %%ymm2 \n" - "vpblendvb %%ymm7, %%ymm4, %%ymm6, %%ymm3 \n" + "1: \n" + "vmovdqu (%0),%%ymm4 \n" + "vmovdqu 0x20(%0),%%ymm5 \n" + "vmovdqu 0x40(%0),%%ymm6 \n" + "lea 0x60(%0),%0 \n" + "vpblendd $240, %%ymm5, %%ymm4, %%ymm1 \n" + "vperm2i128 $33, %%ymm6, %%ymm4, %%ymm2 \n" + "vpblendd $240, %%ymm6, %%ymm5, %%ymm3 \n" + "vpblendvb %%ymm0, %%ymm3, %%ymm1, %%ymm4 \n" + "vpblendvb %%ymm0, %%ymm1, %%ymm2, %%ymm5 \n" + "vpblendvb %%ymm0, %%ymm2, %%ymm3, %%ymm6 \n" + "vpblendvb %%ymm7, %%ymm5, %%ymm4, %%ymm1 \n" + "vpblendvb %%ymm7, %%ymm6, %%ymm5, %%ymm2 \n" + "vpblendvb %%ymm7, %%ymm4, %%ymm6, %%ymm3 \n" #if defined(__x86_64__) - "vpshufb %%ymm8, %%ymm1, %%ymm1 \n" - "vpshufb %%ymm9, %%ymm2, %%ymm2 \n" - "vpshufb %%ymm10, %%ymm3, %%ymm3 \n" + "vpshufb %%ymm8, %%ymm1, %%ymm1 \n" + "vpshufb %%ymm9, %%ymm2, %%ymm2 \n" + "vpshufb %%ymm10, %%ymm3, %%ymm3 \n" #else - "vbroadcasti128 0(%5), %%ymm4 \n" - "vbroadcasti128 16(%5), %%ymm5 \n" - "vbroadcasti128 32(%5), %%ymm6 \n" - "vpshufb %%ymm4, %%ymm1, %%ymm1 \n" - "vpshufb %%ymm5, %%ymm2, %%ymm2 \n" - "vpshufb %%ymm6, %%ymm3, %%ymm3 \n" + "vbroadcasti128 0(%5), %%ymm4 \n" + "vbroadcasti128 16(%5), %%ymm5 \n" + "vbroadcasti128 32(%5), %%ymm6 \n" + "vpshufb %%ymm4, %%ymm1, %%ymm1 \n" + "vpshufb %%ymm5, %%ymm2, %%ymm2 \n" + "vpshufb %%ymm6, %%ymm3, %%ymm3 \n" #endif - "vmovdqu %%ymm1,(%1) \n" - "lea 0x20(%1),%1 \n" - "vmovdqu %%ymm2,(%2) \n" - "lea 0x20(%2),%2 \n" - "vmovdqu %%ymm3,(%3) \n" - "lea 0x20(%3),%3 \n" - "sub $0x20,%4 \n" - "jg 1b \n" + "vmovdqu %%ymm1,(%1) \n" + "lea 0x20(%1),%1 \n" + "vmovdqu %%ymm2,(%2) \n" + "lea 0x20(%2),%2 \n" + "vmovdqu %%ymm3,(%3) \n" + "lea 0x20(%3),%3 \n" + "sub $0x20,%4 \n" + "jg 1b \n" : "+r"(src_rgb), // %0 "+r"(dst_r), // %1 "+r"(dst_g), // %2 @@ -5607,7 +5617,8 @@ void SplitRGBRow_AVX2(const uint8_t* src_rgb, uint8_t* dst_r, : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" #if defined(__x86_64__) - , "xmm8", "xmm9", "xmm10" + , + "xmm8", "xmm9", "xmm10" #endif ); } @@ -5640,7 +5651,7 @@ void MergeRGBRow_SSSE3(const uint8_t* src_r, const uint8_t* src_b, uint8_t* dst_rgb, int width) { - asm volatile ( + asm volatile( "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu (%1),%%xmm1 \n" @@ -5695,7 +5706,7 @@ void MergeARGBRow_SSE2(const uint8_t* src_r, const uint8_t* src_a, uint8_t* dst_argb, int width) { - asm volatile 
( + asm volatile( "sub %0,%1 \n" "sub %0,%2 \n" "sub %0,%3 \n" @@ -5736,7 +5747,7 @@ void MergeXRGBRow_SSE2(const uint8_t* src_r, const uint8_t* src_b, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "1: \n" "movq (%2),%%xmm0 \n" // B @@ -5774,7 +5785,7 @@ void MergeARGBRow_AVX2(const uint8_t* src_r, const uint8_t* src_a, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "sub %0,%1 \n" "sub %0,%2 \n" "sub %0,%3 \n" @@ -5819,7 +5830,7 @@ void MergeXRGBRow_AVX2(const uint8_t* src_r, const uint8_t* src_b, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "1: \n" "vmovdqu (%2),%%xmm0 \n" // B @@ -5861,7 +5872,7 @@ void SplitARGBRow_SSE2(const uint8_t* src_argb, uint8_t* dst_b, uint8_t* dst_a, int width) { - asm volatile ( + asm volatile( "sub %1,%2 \n" "sub %1,%3 \n" "sub %1,%4 \n" @@ -5912,7 +5923,7 @@ void SplitXRGBRow_SSE2(const uint8_t* src_argb, uint8_t* dst_g, uint8_t* dst_b, int width) { - asm volatile ( + asm volatile( "1: \n" "movdqu (%0),%%xmm0 \n" // 00-0F @@ -5961,7 +5972,7 @@ void SplitARGBRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_b, uint8_t* dst_a, int width) { - asm volatile ( + asm volatile( "movdqa %6,%%xmm3 \n" "sub %1,%2 \n" "sub %1,%3 \n" @@ -5994,7 +6005,7 @@ void SplitARGBRow_SSSE3(const uint8_t* src_argb, #if defined(__i386__) "+m"(width) // %5 #else - "+rm"(width) // %5 + "+rm"(width) // %5 #endif : "m"(kShuffleMaskARGBSplit) // %6 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); @@ -6007,7 +6018,7 @@ void SplitXRGBRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_g, uint8_t* dst_b, int width) { - asm volatile ( + asm volatile( "movdqa %5,%%xmm3 \n" LABELALIGN @@ -6048,7 +6059,7 @@ void SplitARGBRow_AVX2(const uint8_t* src_argb, uint8_t* dst_b, uint8_t* dst_a, int width) { - asm volatile ( + asm volatile( "sub %1,%2 \n" "sub %1,%3 \n" "sub %1,%4 \n" @@ -6085,7 +6096,7 @@ void SplitARGBRow_AVX2(const uint8_t* src_argb, #if defined(__i386__) "+m"(width) // %5 #else - "+rm"(width) // %5 + "+rm"(width) // %5 #endif : "m"(kShuffleMaskARGBSplit), // %6 "m"(kShuffleMaskARGBPermute) // %7 @@ -6099,7 +6110,7 @@ void SplitXRGBRow_AVX2(const uint8_t* src_argb, uint8_t* dst_g, uint8_t* dst_b, int width) { - asm volatile ( + asm volatile( "vmovdqa %6,%%ymm3 \n" "vbroadcastf128 %5,%%ymm4 \n" @@ -6146,7 +6157,7 @@ void MergeXR30Row_AVX2(const uint16_t* src_r, int depth, int width) { int shift = depth - 10; - asm volatile ( + asm volatile( "sub %0,%1 \n" "sub %0,%2 \n" "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants @@ -6194,7 +6205,7 @@ void MergeXR30Row_AVX2(const uint16_t* src_r, #if defined(__i386__) : "m"(shift) // %5 #else - : "rm"(shift) // %5 + : "rm"(shift) // %5 #endif : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } @@ -6212,7 +6223,7 @@ void MergeAR64Row_AVX2(const uint16_t* src_r, int shift = 16 - depth; int mask = (1 << depth) - 1; mask = (mask << 16) + mask; - asm volatile ( + asm volatile( "sub %0,%1 \n" "sub %0,%2 \n" "sub %0,%3 \n" @@ -6263,7 +6274,7 @@ void MergeAR64Row_AVX2(const uint16_t* src_r, #if defined(__i386__) "+m"(width) // %5 #else - "+rm"(width) // %5 + "+rm"(width) // %5 #endif : "m"(shift), // %6 "m"(mask), // %7 @@ -6283,7 +6294,7 @@ void MergeXR64Row_AVX2(const uint16_t* src_r, int shift = 16 - depth; int mask = (1 << depth) - 1; mask = (mask << 16) + mask; - asm volatile ( + asm volatile( "sub %0,%1 \n" "sub %0,%2 \n" "vmovdqa %7,%%ymm5 \n" @@ -6346,7 +6357,7 @@ void MergeARGB16To8Row_AVX2(const uint16_t* src_r, int depth, int width) { int shift = depth - 8; - asm volatile 
( + asm volatile( "sub %0,%1 \n" "sub %0,%2 \n" "sub %0,%3 \n" @@ -6386,7 +6397,7 @@ void MergeARGB16To8Row_AVX2(const uint16_t* src_r, #if defined(__i386__) "+m"(width) // %5 #else - "+rm"(width) // %5 + "+rm"(width) // %5 #endif : "m"(shift), // %6 "m"(MergeARGB16To8Shuffle) // %7 @@ -6402,7 +6413,7 @@ void MergeXRGB16To8Row_AVX2(const uint16_t* src_r, int depth, int width) { int shift = depth - 8; - asm volatile ( + asm volatile( "sub %0,%1 \n" "sub %0,%2 \n" "vbroadcastf128 %6,%%ymm5 \n" @@ -6446,7 +6457,7 @@ void MergeXRGB16To8Row_AVX2(const uint16_t* src_r, #ifdef HAS_COPYROW_SSE2 void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile ( + asm volatile( "test $0xf,%0 \n" "jne 2f \n" "test $0xf,%1 \n" @@ -6486,7 +6497,7 @@ void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { #ifdef HAS_COPYROW_AVX void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width) { - asm volatile ( + asm volatile( "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" @@ -6507,7 +6518,7 @@ void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width) { #ifdef HAS_COPYROW_AVX512BW void CopyRow_AVX512BW(const uint8_t* src, uint8_t* dst, int width) { - asm volatile ( + asm volatile( "1: \n" "vmovups (%0),%%zmm0 \n" "vmovups 0x40(%0),%%zmm1 \n" @@ -6530,20 +6541,19 @@ void CopyRow_AVX512BW(const uint8_t* src, uint8_t* dst, int width) { // Multiple of 1. void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width) { size_t width_tmp = (size_t)(width); - asm volatile ( - "rep movsb \n" - : "+S"(src), // %0 - "+D"(dst), // %1 - "+c"(width_tmp) // %2 - : - : "memory", "cc"); + asm volatile("rep movsb \n" + : "+S"(src), // %0 + "+D"(dst), // %1 + "+c"(width_tmp) // %2 + : + : "memory", "cc"); } #endif // HAS_COPYROW_ERMS #ifdef HAS_ARGBCOPYALPHAROW_SSE2 // width in pixels void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile ( + asm volatile( "pcmpeqb %%xmm0,%%xmm0 \n" "pslld $0x18,%%xmm0 \n" "pcmpeqb %%xmm1,%%xmm1 \n" @@ -6578,7 +6588,7 @@ void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { #ifdef HAS_ARGBCOPYALPHAROW_AVX2 // width in pixels void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile ( + asm volatile( "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" "vpsrld $0x8,%%ymm0,%%ymm0 \n" @@ -6608,7 +6618,7 @@ void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb, uint8_t* dst_a, int width) { - asm volatile ( + asm volatile( "1: \n" "movdqu (%0), %%xmm0 \n" "movdqu 0x10(%0), %%xmm1 \n" @@ -6637,7 +6647,7 @@ static const uvec8 kShuffleAlphaShort_AVX2 = { void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb, uint8_t* dst_a, int width) { - asm volatile ( + asm volatile( "vmovdqa %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" @@ -6673,7 +6683,7 @@ void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb, #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 // width in pixels void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile ( + asm volatile( "pcmpeqb %%xmm0,%%xmm0 \n" "pslld $0x18,%%xmm0 \n" "pcmpeqb %%xmm1,%%xmm1 \n" @@ -6710,7 +6720,7 @@ void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 // width in pixels void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile ( + asm volatile( "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" "vpsrld $0x8,%%ymm0,%%ymm0 \n" @@ -6741,38 +6751,35 @@ void ARGBCopyYToAlphaRow_AVX2(const 
uint8_t* src, uint8_t* dst, int width) { void SetRow_X86(uint8_t* dst, uint8_t v8, int width) { size_t width_tmp = (size_t)(width >> 2); const uint32_t v32 = v8 * 0x01010101u; // Duplicate byte to all bytes. - asm volatile ( - "rep stosl \n" - : "+D"(dst), // %0 - "+c"(width_tmp) // %1 - : "a"(v32) // %2 - : "memory", "cc"); + asm volatile("rep stosl \n" + : "+D"(dst), // %0 + "+c"(width_tmp) // %1 + : "a"(v32) // %2 + : "memory", "cc"); } void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) { size_t width_tmp = (size_t)(width); - asm volatile ( - "rep stosb \n" - : "+D"(dst), // %0 - "+c"(width_tmp) // %1 - : "a"(v8) // %2 - : "memory", "cc"); + asm volatile("rep stosb \n" + : "+D"(dst), // %0 + "+c"(width_tmp) // %1 + : "a"(v8) // %2 + : "memory", "cc"); } void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width) { size_t width_tmp = (size_t)(width); - asm volatile ( - "rep stosl \n" - : "+D"(dst_argb), // %0 - "+c"(width_tmp) // %1 - : "a"(v32) // %2 - : "memory", "cc"); + asm volatile("rep stosl \n" + : "+D"(dst_argb), // %0 + "+c"(width_tmp) // %1 + : "a"(v32) // %2 + : "memory", "cc"); } #endif // HAS_SETROW_X86 #ifdef HAS_YUY2TOYROW_SSE2 void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { - asm volatile ( + asm volatile( "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" @@ -6799,7 +6806,7 @@ void YUY2ToNVUVRow_SSE2(const uint8_t* src_yuy2, int stride_yuy2, uint8_t* dst_uv, int width) { - asm volatile ( + asm volatile( "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -6827,7 +6834,7 @@ void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( + asm volatile( "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" "sub %1,%2 \n" @@ -6866,7 +6873,7 @@ void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( + asm volatile( "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" "sub %1,%2 \n" @@ -6898,7 +6905,7 @@ void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2, } void UYVYToYRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { - asm volatile ( + asm volatile( "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -6922,7 +6929,7 @@ void UYVYToUVRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( + asm volatile( "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" "sub %1,%2 \n" @@ -6961,7 +6968,7 @@ void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( + asm volatile( "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" "sub %1,%2 \n" @@ -6995,7 +7002,7 @@ void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy, #ifdef HAS_YUY2TOYROW_AVX2 void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { - asm volatile ( + asm volatile( "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpsrlw $0x8,%%ymm5,%%ymm5 \n" @@ -7024,7 +7031,7 @@ void YUY2ToNVUVRow_AVX2(const uint8_t* src_yuy2, int stride_yuy2, uint8_t* dst_uv, int width) { - asm volatile ( + asm volatile( "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" @@ -7052,7 +7059,7 @@ void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( + asm volatile( "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpsrlw $0x8,%%ymm5,%%ymm5 \n" "sub %1,%2 \n" @@ -7092,7 +7099,7 @@ void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( + asm volatile( "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpsrlw 
$0x8,%%ymm5,%%ymm5 \n" "sub %1,%2 \n" @@ -7127,7 +7134,7 @@ void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2, } void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { - asm volatile ( + asm volatile( "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" @@ -7152,7 +7159,7 @@ void UYVYToUVRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( + asm volatile( "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpsrlw $0x8,%%ymm5,%%ymm5 \n" "sub %1,%2 \n" @@ -7192,7 +7199,7 @@ void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( + asm volatile( "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpsrlw $0x8,%%ymm5,%%ymm5 \n" "sub %1,%2 \n" @@ -7237,7 +7244,7 @@ void ARGBBlendRow_SSSE3(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "pcmpeqb %%xmm7,%%xmm7 \n" "psrlw $0xf,%%xmm7 \n" "pcmpeqb %%xmm6,%%xmm6 \n" @@ -7325,7 +7332,7 @@ void BlendPlaneRow_SSSE3(const uint8_t* src0, const uint8_t* alpha, uint8_t* dst, int width) { - asm volatile ( + asm volatile( "pcmpeqb %%xmm5,%%xmm5 \n" "psllw $0x8,%%xmm5 \n" "mov $0x80808080,%%eax \n" @@ -7377,7 +7384,7 @@ void BlendPlaneRow_AVX2(const uint8_t* src0, const uint8_t* alpha, uint8_t* dst, int width) { - asm volatile ( + asm volatile( "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpsllw $0x8,%%ymm5,%%ymm5 \n" "mov $0x80808080,%%eax \n" @@ -7437,7 +7444,7 @@ static const vec8 kAttenuateShuffle = {6, -128, 6, -128, 6, -128, void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "movdqa %3,%%xmm4 \n" "pcmpeqb %%xmm5,%%xmm5 \n" "pslld $0x18,%%xmm5 \n" @@ -7492,7 +7499,7 @@ static const lvec8 kAttenuateShuffle_AVX2 = { void ARGBAttenuateRow_AVX2(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "vmovdqa %3,%%ymm4 \n" "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpslld $0x18,%%ymm5,%%ymm5 \n" @@ -7538,7 +7545,7 @@ void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb, uint8_t* dst_argb, int width) { uintptr_t alpha; - asm volatile ( + asm volatile( // 4 pixel loop. LABELALIGN "1: \n" @@ -7586,7 +7593,7 @@ void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb, uint8_t* dst_argb, int width) { uintptr_t alpha; - asm volatile ( + asm volatile( "sub %0,%1 \n" "vbroadcastf128 %5,%%ymm5 \n" @@ -7648,7 +7655,7 @@ void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb, #ifdef HAS_ARGBGRAYROW_SSSE3 // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" @@ -7710,7 +7717,7 @@ static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0, // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. 
void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "movdqa %2,%%xmm2 \n" "movdqa %3,%%xmm3 \n" "movdqa %4,%%xmm4 \n" @@ -7771,7 +7778,7 @@ void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, const int8_t* matrix_argb, int width) { - asm volatile ( + asm volatile( "movdqu (%3),%%xmm5 \n" "pshufd $0x00,%%xmm5,%%xmm2 \n" "pshufd $0x55,%%xmm5,%%xmm3 \n" @@ -7836,7 +7843,7 @@ void ARGBQuantizeRow_SSE2(uint8_t* dst_argb, int interval_size, int interval_offset, int width) { - asm volatile ( + asm volatile( "movd %2,%%xmm2 \n" "movd %3,%%xmm3 \n" "movd %4,%%xmm4 \n" @@ -7887,7 +7894,7 @@ void ARGBShadeRow_SSE2(const uint8_t* src_argb, uint8_t* dst_argb, int width, uint32_t value) { - asm volatile ( + asm volatile( "movd %3,%%xmm2 \n" "punpcklbw %%xmm2,%%xmm2 \n" "punpcklqdq %%xmm2,%%xmm2 \n" @@ -7923,11 +7930,10 @@ void ARGBMultiplyRow_SSE2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm volatile ( - "pxor %%xmm5,%%xmm5 \n" + asm volatile("pxor %%xmm5,%%xmm5 \n" - // 4 pixel loop. - LABELALIGN + // 4 pixel loop. + LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "lea 0x10(%0),%0 \n" @@ -7946,12 +7952,12 @@ void ARGBMultiplyRow_SSE2(const uint8_t* src_argb, "lea 0x10(%2),%2 \n" "sub $0x4,%3 \n" "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); + : "+r"(src_argb), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #endif // HAS_ARGBMULTIPLYROW_SSE2 @@ -7961,11 +7967,10 @@ void ARGBMultiplyRow_AVX2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm volatile ( - "vpxor %%ymm5,%%ymm5,%%ymm5 \n" + asm volatile("vpxor %%ymm5,%%ymm5,%%ymm5 \n" - // 4 pixel loop. - LABELALIGN + // 4 pixel loop. + LABELALIGN "1: \n" "vmovdqu (%0),%%ymm1 \n" "lea 0x20(%0),%0 \n" @@ -7983,12 +7988,12 @@ void ARGBMultiplyRow_AVX2(const uint8_t* src_argb, "sub $0x8,%3 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); + : "+r"(src_argb), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #endif // HAS_ARGBMULTIPLYROW_AVX2 @@ -7998,7 +8003,7 @@ void ARGBAddRow_SSE2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( // 4 pixel loop. LABELALIGN "1: \n" @@ -8026,7 +8031,7 @@ void ARGBAddRow_AVX2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( // 4 pixel loop. LABELALIGN "1: \n" @@ -8054,7 +8059,7 @@ void ARGBSubtractRow_SSE2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( // 4 pixel loop. LABELALIGN "1: \n" @@ -8082,7 +8087,7 @@ void ARGBSubtractRow_AVX2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( // 4 pixel loop. 
LABELALIGN "1: \n" @@ -8114,7 +8119,7 @@ void SobelXRow_SSE2(const uint8_t* src_y0, const uint8_t* src_y2, uint8_t* dst_sobelx, int width) { - asm volatile ( + asm volatile( "sub %0,%1 \n" "sub %0,%2 \n" "sub %0,%3 \n" @@ -8168,7 +8173,7 @@ void SobelYRow_SSE2(const uint8_t* src_y0, const uint8_t* src_y1, uint8_t* dst_sobely, int width) { - asm volatile ( + asm volatile( "sub %0,%1 \n" "sub %0,%2 \n" "pxor %%xmm5,%%xmm5 \n" @@ -8221,7 +8226,7 @@ void SobelRow_SSE2(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "sub %0,%1 \n" "pcmpeqb %%xmm5,%%xmm5 \n" "pslld $0x18,%%xmm5 \n" @@ -8268,7 +8273,7 @@ void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_y, int width) { - asm volatile ( + asm volatile( "sub %0,%1 \n" "pcmpeqb %%xmm5,%%xmm5 \n" "pslld $0x18,%%xmm5 \n" @@ -8303,7 +8308,7 @@ void SobelXYRow_SSE2(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "sub %0,%1 \n" "pcmpeqb %%xmm5,%%xmm5 \n" @@ -8351,7 +8356,7 @@ void ComputeCumulativeSumRow_SSE2(const uint8_t* row, int32_t* cumsum, const int32_t* previous_cumsum, int width) { - asm volatile ( + asm volatile( "pxor %%xmm0,%%xmm0 \n" "pxor %%xmm1,%%xmm1 \n" "sub $0x4,%3 \n" @@ -8431,7 +8436,7 @@ void CumulativeSumToAverageRow_SSE2(const int32_t* topleft, int area, uint8_t* dst, int count) { - asm volatile ( + asm volatile( "movd %5,%%xmm5 \n" "cvtdq2ps %%xmm5,%%xmm5 \n" "rcpss %%xmm5,%%xmm4 \n" @@ -8566,7 +8571,7 @@ void ARGBAffineRow_SSE2(const uint8_t* src_argb, int width) { intptr_t src_argb_stride_temp = src_argb_stride; intptr_t temp; - asm volatile ( + asm volatile( "movq (%3),%%xmm2 \n" "movq 0x08(%3),%%xmm7 \n" "shl $0x10,%1 \n" @@ -8651,7 +8656,7 @@ void InterpolateRow_SSSE3(uint8_t* dst_ptr, ptrdiff_t src_stride, int width, int source_y_fraction) { - asm volatile ( + asm volatile( "sub %1,%0 \n" "cmp $0x0,%3 \n" "je 100f \n" @@ -8732,7 +8737,7 @@ void InterpolateRow_AVX2(uint8_t* dst_ptr, ptrdiff_t src_stride, int width, int source_y_fraction) { - asm volatile ( + asm volatile( "sub %1,%0 \n" "cmp $0x0,%3 \n" "je 100f \n" @@ -8809,10 +8814,9 @@ void ARGBShuffleRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, const uint8_t* shuffler, int width) { - asm volatile ( - "movdqu (%3),%%xmm5 \n" + asm volatile("movdqu (%3),%%xmm5 \n" - LABELALIGN + LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -8824,11 +8828,11 @@ void ARGBShuffleRow_SSSE3(const uint8_t* src_argb, "lea 0x20(%1),%1 \n" "sub $0x8,%2 \n" "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(shuffler) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm5"); + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(shuffler) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm5"); } #endif // HAS_ARGBSHUFFLEROW_SSSE3 @@ -8838,10 +8842,9 @@ void ARGBShuffleRow_AVX2(const uint8_t* src_argb, uint8_t* dst_argb, const uint8_t* shuffler, int width) { - asm volatile ( - "vbroadcastf128 (%3),%%ymm5 \n" + asm volatile("vbroadcastf128 (%3),%%ymm5 \n" - LABELALIGN + LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" @@ -8854,11 +8857,11 @@ void ARGBShuffleRow_AVX2(const uint8_t* src_argb, "sub $0x10,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(shuffler) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm5"); + : "+r"(src_argb), // %0 + "+r"(dst_argb), 
// %1 + "+r"(width) // %2 + : "r"(shuffler) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm5"); } #endif // HAS_ARGBSHUFFLEROW_AVX2 @@ -8868,10 +8871,9 @@ void I422ToYUY2Row_SSE2(const uint8_t* src_y, const uint8_t* src_v, uint8_t* dst_yuy2, int width) { - asm volatile ( - "sub %1,%2 \n" + asm volatile("sub %1,%2 \n" - LABELALIGN + LABELALIGN "1: \n" "movq (%1),%%xmm2 \n" "movq 0x00(%1,%2,1),%%xmm1 \n" @@ -8887,13 +8889,13 @@ void I422ToYUY2Row_SSE2(const uint8_t* src_y, "lea 0x20(%3),%3 \n" "sub $0x10,%4 \n" "jg 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_yuy2), // %3 - "+rm"(width) // %4 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_yuy2), // %3 + "+rm"(width) // %4 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_I422TOYUY2ROW_SSE2 @@ -8903,10 +8905,9 @@ void I422ToUYVYRow_SSE2(const uint8_t* src_y, const uint8_t* src_v, uint8_t* dst_uyvy, int width) { - asm volatile ( - "sub %1,%2 \n" + asm volatile("sub %1,%2 \n" - LABELALIGN + LABELALIGN "1: \n" "movq (%1),%%xmm2 \n" "movq 0x00(%1,%2,1),%%xmm1 \n" @@ -8922,13 +8923,13 @@ void I422ToUYVYRow_SSE2(const uint8_t* src_y, "lea 0x20(%3),%3 \n" "sub $0x10,%4 \n" "jg 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_uyvy), // %3 - "+rm"(width) // %4 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_uyvy), // %3 + "+rm"(width) // %4 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_I422TOUYVYROW_SSE2 @@ -8938,10 +8939,9 @@ void I422ToYUY2Row_AVX2(const uint8_t* src_y, const uint8_t* src_v, uint8_t* dst_yuy2, int width) { - asm volatile ( - "sub %1,%2 \n" + asm volatile("sub %1,%2 \n" - LABELALIGN + LABELALIGN "1: \n" "vpmovzxbw (%1),%%ymm1 \n" "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n" @@ -8960,13 +8960,13 @@ void I422ToYUY2Row_AVX2(const uint8_t* src_y, "sub $0x20,%4 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_yuy2), // %3 - "+rm"(width) // %4 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_yuy2), // %3 + "+rm"(width) // %4 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_I422TOYUY2ROW_AVX2 @@ -8976,10 +8976,9 @@ void I422ToUYVYRow_AVX2(const uint8_t* src_y, const uint8_t* src_v, uint8_t* dst_uyvy, int width) { - asm volatile ( - "sub %1,%2 \n" + asm volatile("sub %1,%2 \n" - LABELALIGN + LABELALIGN "1: \n" "vpmovzxbw (%1),%%ymm1 \n" "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n" @@ -8998,13 +8997,13 @@ void I422ToUYVYRow_AVX2(const uint8_t* src_y, "sub $0x20,%4 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_uyvy), // %3 - "+rm"(width) // %4 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_uyvy), // %3 + "+rm"(width) // %4 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_I422TOUYVYROW_AVX2 @@ -9013,11 +9012,10 @@ void ARGBPolynomialRow_SSE2(const uint8_t* src_argb, uint8_t* dst_argb, const float* poly, int width) { - asm volatile ( - "pxor %%xmm3,%%xmm3 \n" + asm volatile("pxor %%xmm3,%%xmm3 \n" - // 2 pixel loop. - LABELALIGN + // 2 pixel loop. 
+ LABELALIGN "1: \n" "movq (%0),%%xmm0 \n" "lea 0x8(%0),%0 \n" @@ -9055,11 +9053,12 @@ void ARGBPolynomialRow_SSE2(const uint8_t* src_argb, "lea 0x8(%1),%1 \n" "sub $0x2,%2 \n" "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(poly) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(poly) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", + "xmm6"); } #endif // HAS_ARGBPOLYNOMIALROW_SSE2 @@ -9068,7 +9067,7 @@ void ARGBPolynomialRow_AVX2(const uint8_t* src_argb, uint8_t* dst_argb, const float* poly, int width) { - asm volatile ( + asm volatile( "vbroadcastf128 (%3),%%ymm4 \n" "vbroadcastf128 0x10(%3),%%ymm5 \n" "vbroadcastf128 0x20(%3),%%ymm6 \n" @@ -9111,7 +9110,7 @@ void HalfFloatRow_SSE2(const uint16_t* src, float scale, int width) { scale *= kScaleBias; - asm volatile ( + asm volatile( "movd %3,%%xmm4 \n" "pshufd $0x0,%%xmm4,%%xmm4 \n" "pxor %%xmm5,%%xmm5 \n" @@ -9149,7 +9148,7 @@ void HalfFloatRow_AVX2(const uint16_t* src, float scale, int width) { scale *= kScaleBias; - asm volatile ( + asm volatile( "vbroadcastss %3, %%ymm4 \n" "vpxor %%ymm5,%%ymm5,%%ymm5 \n" "sub %0,%1 \n" @@ -9179,7 +9178,7 @@ void HalfFloatRow_AVX2(const uint16_t* src, #if defined(__x86_64__) : "x"(scale) // %3 #else - : "m"(scale) // %3 + : "m"(scale) // %3 #endif : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5"); } @@ -9190,7 +9189,7 @@ void HalfFloatRow_F16C(const uint16_t* src, uint16_t* dst, float scale, int width) { - asm volatile ( + asm volatile( "vbroadcastss %3, %%ymm4 \n" "sub %0,%1 \n" @@ -9217,7 +9216,7 @@ void HalfFloatRow_F16C(const uint16_t* src, #if defined(__x86_64__) : "x"(scale) // %3 #else - : "m"(scale) // %3 + : "m"(scale) // %3 #endif : "memory", "cc", "xmm2", "xmm3", "xmm4"); } @@ -9225,7 +9224,7 @@ void HalfFloatRow_F16C(const uint16_t* src, #ifdef HAS_HALFFLOATROW_F16C void HalfFloat1Row_F16C(const uint16_t* src, uint16_t* dst, float, int width) { - asm volatile ( + asm volatile( "sub %0,%1 \n" // 16 pixel loop. LABELALIGN @@ -9256,7 +9255,7 @@ void ARGBColorTableRow_X86(uint8_t* dst_argb, const uint8_t* table_argb, int width) { uintptr_t pixel_temp; - asm volatile ( + asm volatile( // 1 pixel loop. LABELALIGN "1: \n" @@ -9289,7 +9288,7 @@ void RGBColorTableRow_X86(uint8_t* dst_argb, const uint8_t* table_argb, int width) { uintptr_t pixel_temp; - asm volatile ( + asm volatile( // 1 pixel loop. 
LABELALIGN "1: \n" @@ -9322,7 +9321,7 @@ void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb, uint32_t lumacoeff) { uintptr_t pixel_temp; uintptr_t table_temp; - asm volatile ( + asm volatile( "movd %6,%%xmm3 \n" "pshufd $0x0,%%xmm3,%%xmm3 \n" "pcmpeqb %%xmm4,%%xmm4 \n" @@ -9426,7 +9425,7 @@ void NV21ToYUV24Row_SSSE3(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_yuv24, int width) { - asm volatile ( + asm volatile( "sub %0,%1 \n" "movdqa (%4),%%xmm4 \n" // 3 shuffler constants "movdqa 16(%4),%%xmm5 \n" @@ -9464,7 +9463,7 @@ void NV21ToYUV24Row_AVX2(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_yuv24, int width) { - asm volatile ( + asm volatile( "sub %0,%1 \n" "vbroadcastf128 (%4),%%ymm4 \n" // 3 shuffler constants "vbroadcastf128 16(%4),%%ymm5 \n" @@ -9512,7 +9511,7 @@ void NV21ToYUV24Row_AVX512(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_yuv24, int width) { - asm volatile ( + asm volatile( "sub %0,%1 \n" "vmovdqa (%4),%%ymm4 \n" // 3 shuffler constants "vmovdqa 32(%4),%%ymm5 \n" @@ -9551,10 +9550,9 @@ static const uvec8 kShuffleUVToVU = {1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u, // Convert UV plane of NV12 to VU of NV21. void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width) { - asm volatile ( - "movdqu %3,%%xmm5 \n" + asm volatile("movdqu %3,%%xmm5 \n" - LABELALIGN + LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -9566,20 +9564,19 @@ void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width) { "lea 0x20(%1),%1 \n" "sub $0x10,%2 \n" "jg 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_vu), // %1 - "+r"(width) // %2 - : "m"(kShuffleUVToVU) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm5"); + : "+r"(src_uv), // %0 + "+r"(dst_vu), // %1 + "+r"(width) // %2 + : "m"(kShuffleUVToVU) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm5"); } #endif // HAS_SWAPUVROW_SSSE3 #ifdef HAS_SWAPUVROW_AVX2 void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width) { - asm volatile ( - "vbroadcastf128 %3,%%ymm5 \n" + asm volatile("vbroadcastf128 %3,%%ymm5 \n" - LABELALIGN + LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" @@ -9592,11 +9589,11 @@ void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width) { "sub $0x20,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_uv), // %0 - "+r"(dst_vu), // %1 - "+r"(width) // %2 - : "m"(kShuffleUVToVU) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm5"); + : "+r"(src_uv), // %0 + "+r"(dst_vu), // %1 + "+r"(width) // %2 + : "m"(kShuffleUVToVU) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm5"); } #endif // HAS_SWAPUVROW_AVX2 @@ -9606,7 +9603,7 @@ void HalfMergeUVRow_SSSE3(const uint8_t* src_u, int src_stride_v, uint8_t* dst_uv, int width) { - asm volatile ( + asm volatile( "pcmpeqb %%xmm4,%%xmm4 \n" "psrlw $0xf,%%xmm4 \n" "packuswb %%xmm4,%%xmm4 \n" @@ -9652,7 +9649,7 @@ void HalfMergeUVRow_AVX2(const uint8_t* src_u, int src_stride_v, uint8_t* dst_uv, int width) { - asm volatile ( + asm volatile( "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" "vpsrlw $0xf,%%ymm4,%%ymm4 \n" "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" @@ -9694,7 +9691,7 @@ void HalfMergeUVRow_AVX2(const uint8_t* src_u, } void ClampFloatToZero_SSE2(const float* src_x, float* dst_y, int width) { - asm volatile ( + asm volatile( "pxor %%xmm1,%%xmm1 \n" LABELALIGN diff --git a/source/row_lasx.cc b/source/row_lasx.cc index 734d7ee29..3613b0adc 100644 --- a/source/row_lasx.cc +++ b/source/row_lasx.cc @@ -2039,7 +2039,7 @@ static void ARGBToYMatrixRow_LASX(const uint8_t* src_argb, int width, const struct 
RgbConstants* rgbconstants) { int32_t shuff[8] = {0, 4, 1, 5, 2, 6, 3, 7}; - asm volatile ( + asm volatile( "xvldrepl.b $xr0, %3, 0 \n\t" // load rgbconstants "xvldrepl.b $xr1, %3, 1 \n\t" // load rgbconstants "xvldrepl.b $xr2, %3, 2 \n\t" // load rgbconstants @@ -2101,7 +2101,7 @@ static void RGBAToYMatrixRow_LASX(const uint8_t* src_rgba, int width, const struct RgbConstants* rgbconstants) { int32_t shuff[8] = {0, 4, 1, 5, 2, 6, 3, 7}; - asm volatile ( + asm volatile( "xvldrepl.b $xr0, %3, 0 \n\t" // load rgbconstants "xvldrepl.b $xr1, %3, 1 \n\t" // load rgbconstants "xvldrepl.b $xr2, %3, 2 \n\t" // load rgbconstants @@ -2165,7 +2165,7 @@ static void RGBToYMatrixRow_LASX(const uint8_t* src_rgba, 1, 0, 4, 0, 7, 0, 10, 0, 13, 0, 16, 0, 19, 0, 22, 0, 25, 0, 28, 0, 31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0, 25, 0, 28, 0, 31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0}; - asm volatile ( + asm volatile( "xvldrepl.b $xr0, %3, 0 \n\t" // load rgbconstants "xvldrepl.b $xr1, %3, 1 \n\t" // load rgbconstants "xvldrepl.b $xr2, %3, 2 \n\t" // load rgbconstants diff --git a/source/row_lsx.cc b/source/row_lsx.cc index 50d5ba6a0..10546a90d 100644 --- a/source/row_lsx.cc +++ b/source/row_lsx.cc @@ -2807,7 +2807,7 @@ static void ARGBToYMatrixRow_LSX(const uint8_t* src_argb, uint8_t* dst_y, int width, const struct RgbConstants* rgbconstants) { - asm volatile ( + asm volatile( "vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants "vldrepl.b $vr1, %3, 1 \n\t" // load rgbconstants "vldrepl.b $vr2, %3, 2 \n\t" // load rgbconstants @@ -2866,7 +2866,7 @@ static void RGBAToYMatrixRow_LSX(const uint8_t* src_rgba, uint8_t* dst_y, int width, const struct RgbConstants* rgbconstants) { - asm volatile ( + asm volatile( "vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants "vldrepl.b $vr1, %3, 1 \n\t" // load rgbconstants "vldrepl.b $vr2, %3, 2 \n\t" // load rgbconstants @@ -2924,7 +2924,7 @@ static void RGBToYMatrixRow_LSX(const uint8_t* src_rgba, 7, 9, 10, 12, 13, 15, 1, 0, 4, 0, 7, 0, 10, 0, 13, 0, 16, 0, 19, 0, 22, 0, 25, 0, 28, 0, 31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0}; - asm volatile ( + asm volatile( "vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants "vldrepl.b $vr1, %3, 1 \n\t" // load rgbconstants "vldrepl.b $vr2, %3, 2 \n\t" // load rgbconstants diff --git a/source/row_neon.cc b/source/row_neon.cc index cfbb364d1..8c51b6bb3 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -140,7 +140,7 @@ void I444ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" "1: \n" READYUV444 YUVTORGB @@ -164,7 +164,7 @@ void I444ToRGB24Row_NEON(const uint8_t* src_y, uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( YUVTORGB_SETUP "1: \n" READYUV444 YUVTORGB RGBTORGB8 @@ -187,7 +187,7 @@ void I422ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" "1: \n" READYUV422 YUVTORGB @@ -212,7 +212,7 @@ void I444AlphaToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( YUVTORGB_SETUP "1: \n" READYUV444 YUVTORGB RGBTORGB8 @@ -238,7 +238,7 @@ void I422AlphaToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( YUVTORGB_SETUP "1: \n" READYUV422 YUVTORGB RGBTORGB8 @@ -263,7 
+263,7 @@ void I422ToRGBARow_NEON(const uint8_t* src_y, uint8_t* dst_rgba, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" "1: \n" READYUV422 YUVTORGB @@ -285,7 +285,7 @@ void I422ToRGB24Row_NEON(const uint8_t* src_y, uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" "1: \n" READYUV422 YUVTORGB @@ -316,7 +316,7 @@ void I422ToRGB565Row_NEON(const uint8_t* src_y, uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" "1: \n" READYUV422 YUVTORGB @@ -348,7 +348,7 @@ void I422ToARGB1555Row_NEON(const uint8_t* src_y, uint8_t* dst_argb1555, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( YUVTORGB_SETUP "1: \n" READYUV422 YUVTORGB RGBTORGB8 @@ -381,7 +381,7 @@ void I422ToARGB4444Row_NEON(const uint8_t* src_y, uint8_t* dst_argb4444, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" "vmov.u8 d7, #0x0f \n" // vbic bits to clear @@ -404,7 +404,7 @@ void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" "1: \n" READYUV400 YUVTORGB @@ -421,7 +421,7 @@ void I400ToARGBRow_NEON(const uint8_t* src_y, } void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "vmov.u8 d23, #255 \n" "1: \n" "vld1.8 {d20}, [%0]! \n" @@ -442,7 +442,7 @@ void NV12ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" "1: \n" READNV12 YUVTORGB RGBTORGB8 @@ -463,7 +463,7 @@ void NV21ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" "1: \n" READNV21 YUVTORGB RGBTORGB8 @@ -484,7 +484,7 @@ void NV12ToRGB24Row_NEON(const uint8_t* src_y, uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" "1: \n" READNV12 YUVTORGB RGBTORGB8 @@ -505,7 +505,7 @@ void NV21ToRGB24Row_NEON(const uint8_t* src_y, uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" "1: \n" READNV21 YUVTORGB RGBTORGB8 @@ -526,7 +526,7 @@ void NV12ToRGB565Row_NEON(const uint8_t* src_y, uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" "1: \n" READNV12 YUVTORGB RGBTORGB8 @@ -546,7 +546,7 @@ void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" "1: \n" READYUY2 YUVTORGB RGBTORGB8 @@ -565,7 +565,7 @@ void UYVYToARGBRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" "1: \n" READUYVY YUVTORGB RGBTORGB8 @@ -585,7 +585,7 @@ void SplitUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( + asm volatile( "1: \n" 
"vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV "subs %3, %3, #16 \n" // 16 processed per loop @@ -609,7 +609,7 @@ void DetileRow_NEON(const uint8_t* src, ptrdiff_t src_tile_stride, uint8_t* dst, int width) { - asm volatile ( + asm volatile( "1: \n" "vld1.8 {q0}, [%0], %3 \n" // load 16 bytes "subs %2, %2, #16 \n" // 16 processed per loop @@ -629,7 +629,7 @@ void DetileRow_16_NEON(const uint16_t* src, ptrdiff_t src_tile_stride, uint16_t* dst, int width) { - asm volatile ( + asm volatile( "1: \n" "vld1.16 {q0, q1}, [%0], %3 \n" // load 16 pixels "subs %2, %2, #16 \n" // 16 processed per loop @@ -650,7 +650,7 @@ void DetileSplitUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( + asm volatile( "1: \n" "vld2.8 {d0, d1}, [%0], %4 \n" "subs %3, %3, #16 \n" @@ -675,7 +675,7 @@ void DetileToYUY2_NEON(const uint8_t* src_y, ptrdiff_t src_uv_tile_stride, uint8_t* dst_yuy2, int width) { - asm volatile ( + asm volatile( "1: \n" "vld1.8 {q0}, [%0], %4 \n" // Load 16 Y "pld [%0, #1792] \n" @@ -701,7 +701,7 @@ void DetileToYUY2_NEON(const uint8_t* src_y, ptrdiff_t src_uv_tile_stride, uint8_t* dst_yuy2, int width) { - asm volatile ( + asm volatile( "1: \n" "vld1.8 {q0}, [%0], %4 \n" // Load 16 Y "vld1.8 {q1}, [%1], %5 \n" // Load 8 UV @@ -723,7 +723,7 @@ void DetileToYUY2_NEON(const uint8_t* src_y, #endif void UnpackMT2T_NEON(const uint8_t* src, uint16_t* dst, size_t size) { - asm volatile ( + asm volatile( "1: \n" "vld1.8 {q14}, [%0]! \n" // Load lower bits. "vld1.8 {q9}, [%0]! \n" // Load upper bits row @@ -767,7 +767,7 @@ void MergeUVRow_NEON(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width) { - asm volatile ( + asm volatile( "1: \n" "vld1.8 {q0}, [%0]! \n" // load U "vld1.8 {q1}, [%1]! \n" // load V @@ -789,7 +789,7 @@ void SplitRGBRow_NEON(const uint8_t* src_rgb, uint8_t* dst_g, uint8_t* dst_b, int width) { - asm volatile ( + asm volatile( "1: \n" "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB "vld3.8 {d1, d3, d5}, [%0]! \n" // next 8 RGB @@ -814,7 +814,7 @@ void MergeRGBRow_NEON(const uint8_t* src_r, const uint8_t* src_b, uint8_t* dst_rgb, int width) { - asm volatile ( + asm volatile( "1: \n" "vld1.8 {q0}, [%0]! \n" // load R "vld1.8 {q1}, [%1]! \n" // load G @@ -840,7 +840,7 @@ void SplitARGBRow_NEON(const uint8_t* src_argb, uint8_t* dst_b, uint8_t* dst_a, int width) { - asm volatile ( + asm volatile( "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // next 8 ARGB @@ -868,7 +868,7 @@ void MergeARGBRow_NEON(const uint8_t* src_r, const uint8_t* src_a, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "1: \n" "vld1.8 {q2}, [%0]! \n" // load R "vld1.8 {q1}, [%1]! \n" // load G @@ -895,7 +895,7 @@ void SplitXRGBRow_NEON(const uint8_t* src_argb, uint8_t* dst_g, uint8_t* dst_b, int width) { - asm volatile ( + asm volatile( "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // next 8 ARGB @@ -920,7 +920,7 @@ void MergeXRGBRow_NEON(const uint8_t* src_r, const uint8_t* src_b, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "vmov.u8 q3, #255 \n" // load A(255) "1: \n" "vld1.8 {q2}, [%0]! 
\n" // load R @@ -947,7 +947,7 @@ void MergeXR30Row_NEON(const uint16_t* src_r, int depth, int width) { int shift = 10 - depth; - asm volatile ( + asm volatile( "vmov.u32 q14, #1023 \n" "vdup.32 q15, %5 \n" "1: \n" @@ -984,7 +984,7 @@ void MergeXR30Row_10_NEON(const uint16_t* src_r, uint8_t* dst_ar30, int /* depth */, int width) { - asm volatile ( + asm volatile( "vmov.u32 q14, #1023 \n" "1: \n" "vld1.16 {d4}, [%2]! \n" // B @@ -1021,7 +1021,7 @@ void MergeAR64Row_NEON(const uint16_t* src_r, int width) { int shift = 16 - depth; int mask = (1 << depth) - 1; - asm volatile ( + asm volatile( "vdup.u16 q15, %6 \n" "vdup.u16 q14, %7 \n" @@ -1061,7 +1061,7 @@ void MergeXR64Row_NEON(const uint16_t* src_r, int width) { int shift = 16 - depth; int mask = (1 << depth) - 1; - asm volatile ( + asm volatile( "vmov.u8 q3, #0xff \n" // A (0xffff) "vdup.u16 q15, %5 \n" @@ -1098,7 +1098,7 @@ void MergeARGB16To8Row_NEON(const uint16_t* src_r, int depth, int width) { int shift = 8 - depth; - asm volatile ( + asm volatile( "vdup.16 q15, %6 \n" "1: \n" @@ -1134,7 +1134,7 @@ void MergeXRGB16To8Row_NEON(const uint16_t* src_r, int depth, int width) { int shift = 8 - depth; - asm volatile ( + asm volatile( "vdup.16 q15, %5 \n" "vmov.u8 d6, #0xff \n" // A (0xff) @@ -1162,7 +1162,7 @@ void MergeXRGB16To8Row_NEON(const uint16_t* src_r, // Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15. void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) { - asm volatile ( + asm volatile( "1: \n" "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32 "subs %2, %2, #32 \n" // 32 processed per loop @@ -1178,7 +1178,7 @@ void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) { // SetRow writes 'width' bytes using an 8 bit value repeated. void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) { - asm volatile ( + asm volatile( "vdup.8 q0, %2 \n" // duplicate 16 bytes "1: \n" "subs %1, %1, #16 \n" // 16 bytes per loop @@ -1192,7 +1192,7 @@ void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) { // ARGBSetRow writes 'width' pixels using an 32 bit value repeated. void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) { - asm volatile ( + asm volatile( "vdup.u32 q0, %2 \n" // duplicate 4 ints "1: \n" "subs %1, %1, #4 \n" // 4 pixels per loop @@ -1205,7 +1205,7 @@ void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) { } void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) { - asm volatile ( + asm volatile( // Start at end of source row. "add %0, %0, %2 \n" "sub %0, %0, #32 \n" // 32 bytes per loop @@ -1227,7 +1227,7 @@ void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) { } void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width) { - asm volatile ( + asm volatile( // Start at end of source row. "mov r12, #-16 \n" "add %0, %0, %2, lsl #1 \n" @@ -1250,7 +1250,7 @@ void MirrorSplitUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( + asm volatile( // Start at end of source row. "mov r12, #-16 \n" "add %0, %0, %3, lsl #1 \n" @@ -1272,7 +1272,7 @@ void MirrorSplitUVRow_NEON(const uint8_t* src_uv, } void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "add %0, %0, %2, lsl #2 \n" "sub %0, #32 \n" @@ -1296,7 +1296,7 @@ void RGB24MirrorRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_rgb24, int width) { src_rgb24 += width * 3 - 24; - asm volatile ( + asm volatile( "1: \n" "vld3.8 {d0, d1, d2}, [%0], %3 \n" // src -= 24 "subs %2, #8 \n" // 8 pixels per loop. 
@@ -1315,7 +1315,7 @@ void RGB24MirrorRow_NEON(const uint8_t* src_rgb24, void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "vmov.u8 d4, #255 \n" // Alpha "1: \n" "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24. @@ -1331,7 +1331,7 @@ void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, } void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "vmov.u8 d4, #255 \n" // Alpha "1: \n" "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. @@ -1348,7 +1348,7 @@ void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) { } void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { - asm volatile ( + asm volatile( "vmov.u8 d0, #255 \n" // Alpha "1: \n" "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. @@ -1364,7 +1364,7 @@ void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { ); } void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { - asm volatile ( + asm volatile( "1: \n" "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. "subs %2, %2, #8 \n" // 8 processed per loop. @@ -1395,7 +1395,7 @@ void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "vmov.u8 d3, #255 \n" // Alpha "1: \n" "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. @@ -1441,7 +1441,7 @@ void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "vmov.u8 d3, #255 \n" // Alpha "1: \n" "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. @@ -1470,7 +1470,7 @@ void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555, void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "vmov.u8 d3, #255 \n" // Alpha "1: \n" "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. @@ -1489,7 +1489,7 @@ void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444, void ARGBToRGB24Row_NEON(const uint8_t* src_argb, uint8_t* dst_rgb24, int width) { - asm volatile ( + asm volatile( "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 16 pixels of ARGB. "vld4.8 {d1, d3, d5, d7}, [%0]! \n" @@ -1506,7 +1506,7 @@ void ARGBToRGB24Row_NEON(const uint8_t* src_argb, } void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) { - asm volatile ( + asm volatile( "1: \n" "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. @@ -1522,7 +1522,7 @@ void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) { } void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { - asm volatile ( + asm volatile( "1: \n" "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2. "subs %2, %2, #16 \n" // 16 processed per loop. @@ -1537,7 +1537,7 @@ void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { } void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { - asm volatile ( + asm volatile( "1: \n" "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY. "subs %2, %2, #16 \n" // 16 processed per loop. @@ -1555,7 +1555,7 @@ void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( + asm volatile( "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! 
\n" // load 16 pixels of YUY2. "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. @@ -1575,7 +1575,7 @@ void UYVYToUV422Row_NEON(const uint8_t* src_uyvy, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( + asm volatile( "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. @@ -1596,7 +1596,7 @@ void YUY2ToUVRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( + asm volatile( "add %1, %0, %1 \n" // stride + src_yuy2 "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. @@ -1623,7 +1623,7 @@ void UYVYToUVRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( + asm volatile( "add %1, %0, %1 \n" // stride + src_uyvy "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. @@ -1649,7 +1649,7 @@ void YUY2ToNVUVRow_NEON(const uint8_t* src_yuy2, int stride_yuy2, uint8_t* dst_uv, int width) { - asm volatile ( + asm volatile( "add %1, %0, %1 \n" // stride + src_yuy2 "1: \n" "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2. @@ -1673,7 +1673,7 @@ void ARGBShuffleRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, const uint8_t* shuffler, int width) { - asm volatile ( + asm volatile( "vld1.8 {q2}, [%3] \n" // shuffler "1: \n" "vld1.8 {q0}, [%0]! \n" // load 4 pixels. @@ -1695,7 +1695,7 @@ void I422ToYUY2Row_NEON(const uint8_t* src_y, const uint8_t* src_v, uint8_t* dst_yuy2, int width) { - asm volatile ( + asm volatile( "1: \n" "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys "vld1.8 {d1}, [%1]! \n" // load 8 Us @@ -1717,7 +1717,7 @@ void I422ToUYVYRow_NEON(const uint8_t* src_y, const uint8_t* src_v, uint8_t* dst_uyvy, int width) { - asm volatile ( + asm volatile( "1: \n" "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys "vld1.8 {d0}, [%1]! \n" // load 8 Us @@ -1737,7 +1737,7 @@ void I422ToUYVYRow_NEON(const uint8_t* src_y, void ARGBToRGB565Row_NEON(const uint8_t* src_argb, uint8_t* dst_rgb565, int width) { - asm volatile ( + asm volatile( "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. @@ -1755,7 +1755,7 @@ void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, uint8_t* dst_rgb, uint32_t dither4, int width) { - asm volatile ( + asm volatile( "vdup.32 d7, %2 \n" // dither4 "1: \n" "vld4.8 {d0, d2, d4, d6}, [%1]! \n" // load 8 pixels of ARGB. @@ -1776,7 +1776,7 @@ void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, void ARGBToARGB1555Row_NEON(const uint8_t* src_argb, uint8_t* dst_argb1555, int width) { - asm volatile ( + asm volatile( "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. @@ -1793,7 +1793,7 @@ void ARGBToARGB1555Row_NEON(const uint8_t* src_argb, void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, uint8_t* dst_argb4444, int width) { - asm volatile ( + asm volatile( "vmov.u8 d7, #0x0f \n" // bits to clear with // vbic. "1: \n" @@ -1812,7 +1812,7 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, uint8_t* dst_a, int width) { - asm volatile ( + asm volatile( "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels "vld4.8 {d1, d3, d5, d7}, [%0]! 
\n" // load next 8 ARGB pixels @@ -1839,7 +1839,7 @@ static void ARGBToUV444MatrixRow_NEON( uint8_t* dst_v, int width, const struct RgbUVConstants* rgbuvconstants) { - asm volatile ( + asm volatile( "vld1.8 {d0}, [%4] \n" // load rgbuvconstants "vdup.u8 d24, d0[0] \n" // UB 0.875 coefficient @@ -2367,7 +2367,7 @@ void RGB565ToUVRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( + asm volatile( "add %1, %0, %1 \n" // src_stride + src_argb "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 // coefficient @@ -2433,7 +2433,7 @@ void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( + asm volatile( "add %1, %0, %1 \n" // src_stride + src_argb "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 // coefficient @@ -2551,7 +2551,7 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, } void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { - asm volatile ( + asm volatile( "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient @@ -2577,7 +2577,7 @@ void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, uint8_t* dst_y, int width) { - asm volatile ( + asm volatile( "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient @@ -2603,7 +2603,7 @@ void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, uint8_t* dst_y, int width) { - asm volatile ( + asm volatile( "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient @@ -2629,7 +2629,7 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, void ARGBToAR64Row_NEON(const uint8_t* src_argb, uint16_t* dst_ar64, int width) { - asm volatile ( + asm volatile( "1: \n" "vld1.8 {q0}, [%0]! \n" "vld1.8 {q2}, [%0]! \n" @@ -2652,7 +2652,7 @@ static const uvec8 kShuffleARGBToABGR = {2, 1, 0, 3, 6, 5, 4, 7, void ARGBToAB64Row_NEON(const uint8_t* src_argb, uint16_t* dst_ab64, int width) { - asm volatile ( + asm volatile( "vld1.8 {q4}, [%3] \n" // shuffler "1: \n" @@ -2678,7 +2678,7 @@ void ARGBToAB64Row_NEON(const uint8_t* src_argb, void AR64ToARGBRow_NEON(const uint16_t* src_ar64, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "1: \n" "vld1.16 {q0}, [%0]! \n" "vld1.16 {q1}, [%0]! 
\n" @@ -2704,7 +2704,7 @@ static const uvec8 kShuffleAB64ToARGB = {5, 3, 1, 7, 13, 11, 9, 15}; void AB64ToARGBRow_NEON(const uint16_t* src_ab64, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "vld1.8 {d8}, [%3] \n" // shuffler "1: \n" @@ -2757,7 +2757,7 @@ static void ARGBToYMatrixRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width, const struct RgbConstants* rgbconstants) { - asm volatile ( + asm volatile( "vld1.8 {d0}, [%3] \n" // load rgbconstants "vdup.u8 d20, d0[0] \n" "vdup.u8 d21, d0[1] \n" @@ -2807,7 +2807,7 @@ static void RGBAToYMatrixRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width, const struct RgbConstants* rgbconstants) { - asm volatile ( + asm volatile( "vld1.8 {d0}, [%3] \n" // load rgbconstants "vdup.u8 d20, d0[0] \n" "vdup.u8 d21, d0[1] \n" @@ -2851,7 +2851,7 @@ static void RGBToYMatrixRow_NEON(const uint8_t* src_rgb, uint8_t* dst_y, int width, const struct RgbConstants* rgbconstants) { - asm volatile ( + asm volatile( "vld1.8 {d0}, [%3] \n" // load rgbconstants "vdup.u8 d20, d0[0] \n" "vdup.u8 d21, d0[1] \n" @@ -2903,7 +2903,7 @@ void InterpolateRow_NEON(uint8_t* dst_ptr, int dst_width, int source_y_fraction) { int y1_fraction = source_y_fraction; - asm volatile ( + asm volatile( "cmp %4, #0 \n" "beq 100f \n" "add %2, %1 \n" @@ -2965,7 +2965,7 @@ void InterpolateRow_16_NEON(uint16_t* dst_ptr, int y0_fraction = 256 - y1_fraction; const uint16_t* src_ptr1 = src_ptr + src_stride; - asm volatile ( + asm volatile( "cmp %4, #0 \n" "beq 100f \n" "cmp %4, #128 \n" @@ -3020,7 +3020,7 @@ void ARGBBlendRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "subs %3, #8 \n" "blt 89f \n" // Blend 8 pixels. @@ -3079,7 +3079,7 @@ void ARGBBlendRow_NEON(const uint8_t* src_argb, void ARGBAttenuateRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "vmov.u16 q15, #0x00ff \n" // 255 for rounding up // Attenuate 8 pixels. @@ -3108,7 +3108,7 @@ void ARGBQuantizeRow_NEON(uint8_t* dst_argb, int interval_size, int interval_offset, int width) { - asm volatile ( + asm volatile( "vdup.u16 q8, %2 \n" "vshr.u16 q8, q8, #1 \n" // scale >>= 1 "vdup.u16 q9, %3 \n" // interval multiply. @@ -3150,7 +3150,7 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width, uint32_t value) { - asm volatile ( + asm volatile( "vdup.u32 q0, %3 \n" // duplicate scale value. "vzip.u8 d0, d1 \n" // d0 aarrggbb. "vshr.u16 q0, q0, #1 \n" // scale / 2. @@ -3184,7 +3184,7 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb, // Similar to ARGBToYJ but stores ARGB. 
// C code is (29 * b + 150 * g + 77 * r + 128) >> 8; void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "vmov.u8 d24, #29 \n" // B * 0.1140 coefficient "vmov.u8 d25, #150 \n" // G * 0.5870 coefficient "vmov.u8 d26, #77 \n" // R * 0.2990 coefficient @@ -3211,7 +3211,7 @@ void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { // g = (r * 45 + g * 88 + b * 22) >> 7 // r = (r * 50 + g * 98 + b * 24) >> 7 void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "vmov.u8 d20, #17 \n" // BB coefficient "vmov.u8 d21, #68 \n" // BG coefficient "vmov.u8 d22, #35 \n" // BR coefficient @@ -3252,7 +3252,7 @@ void ARGBColorMatrixRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, const int8_t* matrix_argb, int width) { - asm volatile ( + asm volatile( "vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors. "vmovl.s8 q0, d4 \n" // B,G coefficients s16. "vmovl.s8 q1, d5 \n" // R,A coefficients s16. @@ -3311,7 +3311,7 @@ void ARGBMultiplyRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( // 8 pixel loop. "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. @@ -3340,7 +3340,7 @@ void ARGBAddRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( // 8 pixel loop. "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. @@ -3363,7 +3363,7 @@ void ARGBSubtractRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( // 8 pixel loop. "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. @@ -3390,7 +3390,7 @@ void SobelRow_NEON(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "vmov.u8 d3, #255 \n" // alpha // 8 pixel loop. "1: \n" @@ -3415,7 +3415,7 @@ void SobelToPlaneRow_NEON(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_y, int width) { - asm volatile ( + asm volatile( // 16 pixel loop. "1: \n" "vld1.8 {q0}, [%0]! \n" // load 16 sobelx. @@ -3441,7 +3441,7 @@ void SobelXYRow_NEON(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "vmov.u8 d3, #255 \n" // alpha // 8 pixel loop. "1: \n" @@ -3468,7 +3468,7 @@ void SobelXRow_NEON(const uint8_t* src_y0, const uint8_t* src_y2, uint8_t* dst_sobelx, int width) { - asm volatile ( + asm volatile( "1: \n" "vld1.8 {d0}, [%0],%5 \n" // top "vld1.8 {d1}, [%0],%6 \n" @@ -3506,7 +3506,7 @@ void SobelYRow_NEON(const uint8_t* src_y0, const uint8_t* src_y1, uint8_t* dst_sobely, int width) { - asm volatile ( + asm volatile( "1: \n" "vld1.8 {d0}, [%0],%4 \n" // left "vld1.8 {d1}, [%1],%4 \n" @@ -3543,7 +3543,7 @@ void HalfFloatRow_NEON(const uint16_t* src, uint16_t* dst, float scale, int width) { - asm volatile ( + asm volatile( "1: \n" "vld1.16 {q0, q1}, [%0]! \n" // load 16 shorts @@ -3564,11 +3564,11 @@ void HalfFloatRow_NEON(const uint16_t* src, "vqshrn.u32 d1, q9, #13 \n" "vqshrn.u32 d2, q10, #13 \n" "vqshrn.u32 d3, q11, #13 \n" - "vst1.16 {q0, q1}, [%1]! \n" // store 16 fp16 + "vst1.16 {q0, q1}, [%1]! 
\n" // store 16 fp16 "bgt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 : "w"(scale * 1.9259299444e-34f) // %3 : "cc", "memory", "q0", "q1", "q8", "q9", "q10", "q11"); } @@ -3577,7 +3577,7 @@ void ByteToFloatRow_NEON(const uint8_t* src, float* dst, float scale, int width) { - asm volatile ( + asm volatile( "1: \n" "vld1.8 {d2}, [%0]! \n" // load 8 bytes @@ -3606,7 +3606,7 @@ void GaussCol_NEON(const uint16_t* src0, const uint16_t* src4, uint32_t* dst, int width) { - asm volatile ( + asm volatile( "vmov.u16 d6, #4 \n" // constant 4 "vmov.u16 d7, #6 \n" // constant 6 @@ -3643,7 +3643,7 @@ void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) { const uint32_t* src1 = src + 1; const uint32_t* src2 = src + 2; const uint32_t* src3 = src + 3; - asm volatile ( + asm volatile( "vmov.u32 q10, #4 \n" // constant 4 "vmov.u32 q11, #6 \n" // constant 6 @@ -3681,7 +3681,7 @@ void NV21ToYUV24Row_NEON(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_yuv24, int width) { - asm volatile ( + asm volatile( "1: \n" "vld1.8 {q2}, [%0]! \n" // load 16 Y values "vld2.8 {d0, d2}, [%1]! \n" // load 8 VU values @@ -3705,7 +3705,7 @@ void AYUVToUVRow_NEON(const uint8_t* src_ayuv, int src_stride_ayuv, uint8_t* dst_uv, int width) { - asm volatile ( + asm volatile( "add %1, %0, %1 \n" // src_stride + src_AYUV "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels. @@ -3736,7 +3736,7 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv, int src_stride_ayuv, uint8_t* dst_vu, int width) { - asm volatile ( + asm volatile( "add %1, %0, %1 \n" // src_stride + src_AYUV "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels. @@ -3766,7 +3766,7 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv, // Copy row of AYUV Y's into Y. // Similar to ARGBExtractAlphaRow_NEON void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) { - asm volatile ( + asm volatile( "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV pixels @@ -3782,7 +3782,7 @@ void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) { // Convert UV plane of NV12 to VU of NV21. void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) { - asm volatile ( + asm volatile( "1: \n" "vld2.8 {d0, d2}, [%0]! \n" // load 16 UV values "vld2.8 {d1, d3}, [%0]! \n" @@ -3805,7 +3805,7 @@ void HalfMergeUVRow_NEON(const uint8_t* src_u, int width) { const uint8_t* src_u_1 = src_u + src_stride_u; const uint8_t* src_v_1 = src_v + src_stride_v; - asm volatile ( + asm volatile( "1: \n" "vld1.8 {q0}, [%0]! \n" // load 16 U values "vld1.8 {q1}, [%2]! \n" // load 16 V values @@ -3836,7 +3836,7 @@ void SplitUVRow_16_NEON(const uint16_t* src_uv, int depth, int width) { int shift = depth - 16; // Negative for right shift. - asm volatile ( + asm volatile( "vdup.16 q2, %4 \n" "1: \n" "vld2.16 {q0, q1}, [%0]! \n" // load 8 UV @@ -3860,7 +3860,7 @@ void MergeUVRow_16_NEON(const uint16_t* src_u, int depth, int width) { int shift = 16 - depth; - asm volatile ( + asm volatile( "vdup.16 q2, %4 \n" "1: \n" "vld1.16 {q0}, [%0]! \n" // load 8 U @@ -3882,7 +3882,7 @@ void MultiplyRow_16_NEON(const uint16_t* src_y, uint16_t* dst_y, int scale, int width) { - asm volatile ( + asm volatile( "vdup.16 q2, %3 \n" "1: \n" "vld1.16 {q0}, [%0]! 
\n" @@ -3904,7 +3904,7 @@ void DivideRow_16_NEON(const uint16_t* src_y, uint16_t* dst_y, int scale, int width) { - asm volatile ( + asm volatile( "vdup.16 d8, %3 \n" "1: \n" "vld1.16 {q2, q3}, [%0]! \n" @@ -3936,7 +3936,7 @@ void Convert16To8Row_NEON(const uint16_t* src_y, int scale, int width) { int shift = 15 - __builtin_clz((int32_t)scale); // Negative shl is shr - asm volatile ( + asm volatile( "vdup.16 q2, %3 \n" "1: \n" "vld1.16 {q0}, [%0]! \n" diff --git a/source/row_neon64.cc b/source/row_neon64.cc index 55f686766..a8ba41357 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -239,7 +239,7 @@ void I444ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( YUVTORGB_SETUP "movi v19.8b, #255 \n" /* A */ "1: \n" READYUV444 I4XXTORGB @@ -263,7 +263,7 @@ void I444ToRGB24Row_NEON(const uint8_t* src_y, uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( YUVTORGB_SETUP "1: \n" READYUV444 I4XXTORGB RGBTORGB8 @@ -290,12 +290,13 @@ void I210ToAR30Row_NEON(const uint16_t* src_y, const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias; uint16_t limit = 0x3ff0; uint16_t alpha = 0xc000; - asm volatile (YUVTORGB_SETUP - "dup v22.8h, %w[limit] \n" - "dup v23.8h, %w[alpha] \n" - "1: \n" READYUV210 NVTORGB - "subs %w[width], %w[width], #8 \n" STOREAR30 - "b.gt 1b \n" + asm volatile( + YUVTORGB_SETUP + "dup v22.8h, %w[limit] \n" + "dup v23.8h, %w[alpha] \n" + "1: \n" READYUV210 NVTORGB + "subs %w[width], %w[width], #8 \n" STOREAR30 + "b.gt 1b \n" : [src_y] "+r"(src_y), // %[src_y] [src_u] "+r"(src_u), // %[src_u] [src_v] "+r"(src_v), // %[src_v] @@ -318,12 +319,13 @@ void I410ToAR30Row_NEON(const uint16_t* src_y, const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias; uint16_t limit = 0x3ff0; uint16_t alpha = 0xc000; - asm volatile (YUVTORGB_SETUP - "dup v22.8h, %w[limit] \n" - "dup v23.8h, %w[alpha] \n" - "1: \n" READYUV410 NVTORGB - "subs %w[width], %w[width], #8 \n" STOREAR30 - "b.gt 1b \n" + asm volatile( + YUVTORGB_SETUP + "dup v22.8h, %w[limit] \n" + "dup v23.8h, %w[alpha] \n" + "1: \n" READYUV410 NVTORGB + "subs %w[width], %w[width], #8 \n" STOREAR30 + "b.gt 1b \n" : [src_y] "+r"(src_y), // %[src_y] [src_u] "+r"(src_u), // %[src_u] [src_v] "+r"(src_v), // %[src_v] @@ -345,13 +347,13 @@ void I212ToAR30Row_NEON(const uint16_t* src_y, const uvec8* uv_coeff = &yuvconstants->kUVCoeff; const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias; const uint16_t limit = 0x3ff0; - asm volatile ( + asm volatile( YUVTORGB_SETUP - "dup v22.8h, %w[limit] \n" - "movi v23.8h, #0xc0, lsl #8 \n" // A - "1: \n" READYUV212 NVTORGB - "subs %w[width], %w[width], #8 \n" STOREAR30 - "b.gt 1b \n" + "dup v22.8h, %w[limit] \n" + "movi v23.8h, #0xc0, lsl #8 \n" // A + "1: \n" READYUV212 NVTORGB + "subs %w[width], %w[width], #8 \n" STOREAR30 + "b.gt 1b \n" : [src_y] "+r"(src_y), // %[src_y] [src_u] "+r"(src_u), // %[src_u] [src_v] "+r"(src_v), // %[src_v] @@ -369,12 +371,13 @@ void I210ToARGBRow_NEON(const uint16_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile (YUVTORGB_SETUP - "movi v19.8b, #255 \n" - "1: \n" READYUV210 NVTORGB RGBTORGB8 - "subs %w[width], %w[width], #8 \n" + asm volatile( + YUVTORGB_SETUP + "movi v19.8b, #255 \n" + "1: \n" READYUV210 NVTORGB RGBTORGB8 + "subs %w[width], %w[width], #8 \n" "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" - "b.gt 1b \n" + "b.gt 1b \n" : [src_y] "+r"(src_y), // %[src_y] 
[src_u] "+r"(src_u), // %[src_u] [src_v] "+r"(src_v), // %[src_v] @@ -391,12 +394,13 @@ void I410ToARGBRow_NEON(const uint16_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile (YUVTORGB_SETUP - "movi v19.8b, #255 \n" - "1: \n" READYUV410 NVTORGB RGBTORGB8 - "subs %w[width], %w[width], #8 \n" + asm volatile( + YUVTORGB_SETUP + "movi v19.8b, #255 \n" + "1: \n" READYUV410 NVTORGB RGBTORGB8 + "subs %w[width], %w[width], #8 \n" "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" - "b.gt 1b \n" + "b.gt 1b \n" : [src_y] "+r"(src_y), // %[src_y] [src_u] "+r"(src_u), // %[src_u] [src_v] "+r"(src_v), // %[src_v] @@ -415,13 +419,13 @@ void I212ToARGBRow_NEON(const uint16_t* src_y, int width) { const uvec8* uv_coeff = &yuvconstants->kUVCoeff; const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias; - asm volatile ( + asm volatile( YUVTORGB_SETUP - "movi v19.8b, #255 \n" - "1: \n" READYUV212 NVTORGB RGBTORGB8 - "subs %w[width], %w[width], #8 \n" + "movi v19.8b, #255 \n" + "1: \n" READYUV212 NVTORGB RGBTORGB8 + "subs %w[width], %w[width], #8 \n" "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" - "b.gt 1b \n" + "b.gt 1b \n" : [src_y] "+r"(src_y), // %[src_y] [src_u] "+r"(src_u), // %[src_u] [src_v] "+r"(src_v), // %[src_v] @@ -438,7 +442,7 @@ void I422ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( YUVTORGB_SETUP "movi v19.8b, #255 \n" /* A */ "1: \n" READYUV422 I4XXTORGB @@ -468,13 +472,13 @@ void P210ToARGBRow_NEON(const uint16_t* src_y, const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias; asm volatile( YUVTORGB_SETUP - "movi v19.8b, #255 \n" - "ldr q2, [%[kIndices]] \n" - "1: \n" // + "movi v19.8b, #255 \n" + "ldr q2, [%[kIndices]] \n" + "1: \n" // READYUVP210 NVTORGB RGBTORGB8 - "subs %w[width], %w[width], #8 \n" - "st4 {v16.8b, v17.8b, v18.8b, v19.8b}, [%[dst_argb]], #32 \n" - "b.gt 1b \n" + "subs %w[width], %w[width], #8 \n" + "st4 {v16.8b, v17.8b, v18.8b, v19.8b}, [%[dst_argb]], #32 \n" + "b.gt 1b \n" : [src_y] "+r"(src_y), // %[src_y] [src_uv] "+r"(src_uv), // %[src_uv] [dst_argb] "+r"(dst_argb), // %[dst_argb] @@ -497,13 +501,13 @@ void P410ToARGBRow_NEON(const uint16_t* src_y, const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias; asm volatile( YUVTORGB_SETUP - "movi v19.8b, #255 \n" - "ldr q2, [%[kIndices]] \n" - "1: \n" // + "movi v19.8b, #255 \n" + "ldr q2, [%[kIndices]] \n" + "1: \n" // READYUVP410 NVTORGB RGBTORGB8 - "subs %w[width], %w[width], #8 \n" - "st4 {v16.8b, v17.8b, v18.8b, v19.8b}, [%[dst_argb]], #32 \n" - "b.gt 1b \n" + "subs %w[width], %w[width], #8 \n" + "st4 {v16.8b, v17.8b, v18.8b, v19.8b}, [%[dst_argb]], #32 \n" + "b.gt 1b \n" : [src_y] "+r"(src_y), // %[src_y] [src_uv] "+r"(src_uv), // %[src_uv] [dst_argb] "+r"(dst_argb), // %[dst_argb] @@ -523,12 +527,12 @@ void P210ToAR30Row_NEON(const uint16_t* src_y, const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias; const uint16_t limit = 0x3ff0; asm volatile(YUVTORGB_SETUP - "dup v22.8h, %w[limit] \n" - "movi v23.8h, #0xc0, lsl #8 \n" // A - "ldr q2, [%[kIndices]] \n" - "1: \n" READYUVP210 NVTORGB - "subs %w[width], %w[width], #8 \n" STOREAR30 - "b.gt 1b \n" + "dup v22.8h, %w[limit] \n" + "movi v23.8h, #0xc0, lsl #8 \n" // A + "ldr q2, [%[kIndices]] \n" + "1: \n" READYUVP210 NVTORGB + "subs %w[width], %w[width], #8 \n" STOREAR30 + "b.gt 1b \n" : [src_y] "+r"(src_y), // %[src_y] [src_uv] "+r"(src_uv), // %[src_uv] [dst_ar30] "+r"(dst_ar30), // %[dst_ar30] @@ -549,12 +553,12 
@@ void P410ToAR30Row_NEON(const uint16_t* src_y, const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias; uint16_t limit = 0x3ff0; asm volatile(YUVTORGB_SETUP - "dup v22.8h, %w[limit] \n" - "movi v23.8h, #0xc0, lsl #8 \n" // A - "ldr q2, [%[kIndices]] \n" - "1: \n" READYUVP410 NVTORGB - "subs %w[width], %w[width], #8 \n" STOREAR30 - "b.gt 1b \n" + "dup v22.8h, %w[limit] \n" + "movi v23.8h, #0xc0, lsl #8 \n" // A + "ldr q2, [%[kIndices]] \n" + "1: \n" READYUVP410 NVTORGB + "subs %w[width], %w[width], #8 \n" STOREAR30 + "b.gt 1b \n" : [src_y] "+r"(src_y), // %[src_y] [src_uv] "+r"(src_uv), // %[src_uv] [dst_ar30] "+r"(dst_ar30), // %[dst_ar30] @@ -575,13 +579,13 @@ void I422ToAR30Row_NEON(const uint8_t* src_y, const uvec8* uv_coeff = &yuvconstants->kUVCoeff; const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias; const uint16_t limit = 0x3ff0; - asm volatile ( + asm volatile( YUVTORGB_SETUP - "dup v22.8h, %w[limit] \n" - "movi v23.8h, #0xc0, lsl #8 \n" // A - "1: \n" READYUV422 I4XXTORGB - "subs %w[width], %w[width], #8 \n" STOREAR30 - "b.gt 1b \n" + "dup v22.8h, %w[limit] \n" + "movi v23.8h, #0xc0, lsl #8 \n" // A + "1: \n" READYUV422 I4XXTORGB + "subs %w[width], %w[width], #8 \n" STOREAR30 + "b.gt 1b \n" : [src_y] "+r"(src_y), // %[src_y] [src_u] "+r"(src_u), // %[src_u] [src_v] "+r"(src_v), // %[src_v] @@ -600,7 +604,7 @@ void I444AlphaToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( YUVTORGB_SETUP "1: \n" "ld1 {v19.8b}, [%[src_a]], #8 \n" READYUV444 @@ -626,13 +630,14 @@ void I410AlphaToARGBRow_NEON(const uint16_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile (YUVTORGB_SETUP - "1: \n" - "ld1 {v19.16b}, [%[src_a]], #16 \n" READYUV410 - "uqshrn v19.8b, v19.8h, #2 \n" NVTORGB RGBTORGB8 - "subs %w[width], %w[width], #8 \n" - "st4 {v16.8b, v17.8b, v18.8b, v19.8b}, [%[dst_argb]], #32 \n" - "b.gt 1b \n" + asm volatile( + YUVTORGB_SETUP + "1: \n" + "ld1 {v19.16b}, [%[src_a]], #16 \n" READYUV410 + "uqshrn v19.8b, v19.8h, #2 \n" NVTORGB RGBTORGB8 + "subs %w[width], %w[width], #8 \n" + "st4 {v16.8b, v17.8b, v18.8b, v19.8b}, [%[dst_argb]], #32 \n" + "b.gt 1b \n" : [src_y] "+r"(src_y), // %[src_y] [src_u] "+r"(src_u), // %[src_u] [src_v] "+r"(src_v), // %[src_v] @@ -651,7 +656,8 @@ void I210AlphaToARGBRow_NEON(const uint16_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile (YUVTORGB_SETUP + asm volatile( + YUVTORGB_SETUP "1: \n" "ld1 {v19.16b}, [%[src_a]], #16 \n" READYUV210 "uqshrn v19.8b, v19.8h, #2 \n" NVTORGB RGBTORGB8 @@ -676,7 +682,7 @@ void I422AlphaToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( YUVTORGB_SETUP "1: \n" "ld1 {v19.8b}, [%[src_a]], #8 \n" READYUV422 @@ -701,7 +707,7 @@ void I422ToRGBARow_NEON(const uint8_t* src_y, uint8_t* dst_rgba, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( YUVTORGB_SETUP "movi v15.8b, #255 \n" /* A */ "1: \n" READYUV422 I4XXTORGB @@ -725,7 +731,7 @@ void I422ToRGB24Row_NEON(const uint8_t* src_y, uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( YUVTORGB_SETUP "1: \n" READYUV422 I4XXTORGB RGBTORGB8 @@ -767,7 +773,7 @@ void I422ToRGB565Row_NEON(const uint8_t* src_y, uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( 
YUVTORGB_SETUP "1: \n" READYUV422 I4XXTORGB RGBTORGB8_TOP @@ -807,14 +813,15 @@ void I422ToARGB1555Row_NEON(const uint8_t* src_y, uint8_t* dst_argb1555, const struct YuvConstants* yuvconstants, int width) { - asm volatile (YUVTORGB_SETUP - "movi v19.8h, #0x80, lsl #8 \n" + asm volatile( + YUVTORGB_SETUP + "movi v19.8h, #0x80, lsl #8 \n" "1: \n" // READYUV422 I4XXTORGB RGBTORGB8_TOP - "subs %w[width], %w[width], #8 \n" // + "subs %w[width], %w[width], #8 \n" // ARGBTOARGB1555_FROM_TOP - "st1 {v19.8h}, [%[dst_argb1555]], #16 \n" // store 8 pixels RGB1555. - "b.gt 1b \n" + "st1 {v19.8h}, [%[dst_argb1555]], #16 \n" // store 8 pixels RGB1555. + "b.gt 1b \n" : [src_y] "+r"(src_y), // %[src_y] [src_u] "+r"(src_u), // %[src_u] [src_v] "+r"(src_v), // %[src_v] @@ -837,7 +844,7 @@ void I422ToARGB4444Row_NEON(const uint8_t* src_y, uint8_t* dst_argb4444, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( YUVTORGB_SETUP "1: \n" READYUV422 I4XXTORGB RGBTORGB8 @@ -861,7 +868,7 @@ void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( YUVTORGB_SETUP "movi v1.16b, #128 \n" "movi v19.8b, #255 \n" @@ -884,7 +891,7 @@ void I400ToARGBRow_NEON(const uint8_t* src_y, #if defined(LIBYUV_USE_ST4) void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "movi v23.8b, #255 \n" "1: \n" "ld1 {v20.8b}, [%0], #8 \n" @@ -902,7 +909,7 @@ void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { } #else void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "movi v20.8b, #255 \n" "1: \n" "ldr d16, [%0], #8 \n" @@ -927,7 +934,7 @@ void NV12ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( YUVTORGB_SETUP "movi v19.8b, #255 \n" "ldr q2, [%[kNV12Table]] \n" @@ -950,7 +957,7 @@ void NV21ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( YUVTORGB_SETUP "movi v19.8b, #255 \n" "ldr q2, [%[kNV12Table]] \n" @@ -973,7 +980,7 @@ void NV12ToRGB24Row_NEON(const uint8_t* src_y, uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( YUVTORGB_SETUP "ldr q2, [%[kNV12Table]] \n" "1: \n" READNV12 NVTORGB RGBTORGB8 @@ -995,7 +1002,7 @@ void NV21ToRGB24Row_NEON(const uint8_t* src_y, uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( YUVTORGB_SETUP "ldr q2, [%[kNV12Table]] \n" "1: \n" READNV12 NVTORGB RGBTORGB8 @@ -1017,7 +1024,7 @@ void NV12ToRGB565Row_NEON(const uint8_t* src_y, uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( YUVTORGB_SETUP "ldr q2, [%[kNV12Table]] \n" "1: \n" READNV12 NVTORGB @@ -1041,14 +1048,14 @@ void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( YUVTORGB_SETUP - "movi v19.8b, #255 \n" + "movi v19.8b, #255 \n" "ldr q2, [%[kNV21InterleavedTable]] \n" - "1: \n" READYUY2 NVTORGB RGBTORGB8 - "subs %w[width], %w[width], #8 \n" + "1: \n" READYUY2 NVTORGB RGBTORGB8 + "subs %w[width], %w[width], #8 \n" "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" - "b.gt 1b \n" + "b.gt 1b \n" : [src_yuy2] "+r"(src_yuy2), // 
%[src_yuy2] [dst_argb] "+r"(dst_argb), // %[dst_argb] [width] "+r"(width) // %[width] @@ -1062,14 +1069,14 @@ void UYVYToARGBRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( YUVTORGB_SETUP - "movi v19.8b, #255 \n" + "movi v19.8b, #255 \n" "ldr q2, [%[kNV12InterleavedTable]] \n" - "1: \n" READUYVY NVTORGB RGBTORGB8 - "subs %w[width], %w[width], #8 \n" + "1: \n" READUYVY NVTORGB RGBTORGB8 + "subs %w[width], %w[width], #8 \n" "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" - "b.gt 1b \n" + "b.gt 1b \n" : [src_uyvy] "+r"(src_uyvy), // %[src_yuy2] [dst_argb] "+r"(dst_argb), // %[dst_argb] [width] "+r"(width) // %[width] @@ -1084,7 +1091,7 @@ void SplitUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( + asm volatile( "1: \n" "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV "subs %w3, %w3, #16 \n" // 16 processed per loop @@ -1109,7 +1116,7 @@ void DetileRow_NEON(const uint8_t* src, ptrdiff_t src_tile_stride, uint8_t* dst, int width) { - asm volatile ( + asm volatile( "1: \n" "ld1 {v0.16b}, [%0], %3 \n" // load 16 bytes "subs %w2, %w2, #16 \n" // 16 processed per loop @@ -1129,7 +1136,7 @@ void DetileRow_16_NEON(const uint16_t* src, ptrdiff_t src_tile_stride, uint16_t* dst, int width) { - asm volatile ( + asm volatile( "1: \n" "ld1 {v0.8h,v1.8h}, [%0], %3 \n" // load 16 pixels "subs %w2, %w2, #16 \n" // 16 processed per loop @@ -1150,7 +1157,7 @@ void DetileSplitUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( + asm volatile( "1: \n" "ld2 {v0.8b,v1.8b}, [%0], %4 \n" "subs %w3, %w3, #16 \n" @@ -1175,7 +1182,7 @@ void DetileToYUY2_NEON(const uint8_t* src_y, ptrdiff_t src_uv_tile_stride, uint8_t* dst_yuy2, int width) { - asm volatile ( + asm volatile( "1: \n" "ld1 {v0.16b}, [%0], %4 \n" // load 16 Ys "prfm pldl1keep, [%0, 1792] \n" @@ -1201,7 +1208,7 @@ void DetileToYUY2_NEON(const uint8_t* src_y, ptrdiff_t src_uv_tile_stride, uint8_t* dst_yuy2, int width) { - asm volatile ( + asm volatile( "1: \n" "ld1 {v0.16b}, [%0], %4 \n" // load 16 Ys "ld1 {v1.16b}, [%1], %5 \n" // load 8 UVs @@ -1226,7 +1233,7 @@ void DetileToYUY2_NEON(const uint8_t* src_y, // Unpack MT2T into tiled P010 64 pixels at a time. See // tinyurl.com/mtk-10bit-video-format for format documentation. 
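// [Editor's sketch, not part of the patch] A scalar reading of the MT2T ->
// P010 unpack below: each 80-byte MT2T block carries 64 samples as 16 bytes
// of packed 2-bit low bits followed by 64 bytes of 8-bit high bits, and P010
// wants the 10-bit sample left-justified in a 16-bit lane. The grouping of
// the low bits here is an assumption for illustration; the essential part is
// the (msb << 8) | (lsb << 6) packing.
static void UnpackMT2T_Sketch(const uint8_t* src, uint16_t* dst, size_t size) {
  for (size_t i = 0; i < size; i += 80) {
    const uint8_t* lo = src + i;       // 16 bytes of packed 2-bit LSBs
    const uint8_t* hi = src + i + 16;  // 64 bytes of 8-bit MSBs
    for (int j = 0; j < 4; ++j) {      // assumed: group j uses bits 2j..2j+1
      for (int k = 0; k < 16; ++k) {
        uint16_t lsb = (lo[k] >> (j * 2)) & 0x3;
        *dst++ = (uint16_t)((*hi++ << 8) | (lsb << 6));  // top 10 bits valid
      }
    }
  }
}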
void UnpackMT2T_NEON(const uint8_t* src, uint16_t* dst, size_t size) { - asm volatile ( + asm volatile( "1: \n" "ld1 {v7.16b}, [%0], #16 \n" "ld1 {v0.16b-v3.16b}, [%0], #64 \n" @@ -1267,7 +1274,7 @@ void MergeUVRow_NEON(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width) { - asm volatile ( + asm volatile( "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load U "ld1 {v1.16b}, [%1], #16 \n" // load V @@ -1291,7 +1298,7 @@ void MergeUVRow_16_NEON(const uint16_t* src_u, int depth, int width) { int shift = 16 - depth; - asm volatile ( + asm volatile( "dup v2.8h, %w4 \n" "1: \n" "ld1 {v0.8h}, [%0], #16 \n" // load 8 U @@ -1316,7 +1323,7 @@ void MergeUVRow_NEON(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width) { - asm volatile ( + asm volatile( "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load U "ld1 {v1.16b}, [%1], #16 \n" // load V @@ -1342,7 +1349,7 @@ void MergeUVRow_16_NEON(const uint16_t* src_u, int depth, int width) { int shift = 16 - depth; - asm volatile ( + asm volatile( "dup v4.8h, %w4 \n" "1: \n" "ld1 {v0.8h}, [%0], #16 \n" // load 8 U @@ -1371,7 +1378,7 @@ void SplitRGBRow_NEON(const uint8_t* src_rgb, uint8_t* dst_g, uint8_t* dst_b, int width) { - asm volatile ( + asm volatile( "1: \n" "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RGB "subs %w4, %w4, #16 \n" // 16 processed per loop @@ -1396,7 +1403,7 @@ void MergeRGBRow_NEON(const uint8_t* src_r, const uint8_t* src_b, uint8_t* dst_rgb, int width) { - asm volatile ( + asm volatile( "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load R "ld1 {v1.16b}, [%1], #16 \n" // load G @@ -1424,7 +1431,7 @@ void SplitARGBRow_NEON(const uint8_t* src_rgba, uint8_t* dst_b, uint8_t* dst_a, int width) { - asm volatile ( + asm volatile( "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB "subs %w5, %w5, #16 \n" // 16 processed per loop @@ -1453,7 +1460,7 @@ void MergeARGBRow_NEON(const uint8_t* src_r, const uint8_t* src_a, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "1: \n" "ld1 {v0.16b}, [%2], #16 \n" // load B "ld1 {v1.16b}, [%1], #16 \n" // load G @@ -1484,7 +1491,7 @@ void MergeARGBRow_NEON(const uint8_t* src_r, const uint8_t* src_a, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "1: \n" "ld1 {v0.16b}, [%2], #16 \n" // load B "ld1 {v1.16b}, [%1], #16 \n" // load G @@ -1524,7 +1531,7 @@ void SplitXRGBRow_NEON(const uint8_t* src_rgba, uint8_t* dst_g, uint8_t* dst_b, int width) { - asm volatile ( + asm volatile( "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB "subs %w4, %w4, #16 \n" // 16 processed per loop @@ -1549,7 +1556,7 @@ void MergeXRGBRow_NEON(const uint8_t* src_r, const uint8_t* src_b, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "movi v3.16b, #255 \n" // load A(255) "1: \n" "ld1 {v2.16b}, [%0], #16 \n" // load R @@ -1578,7 +1585,7 @@ void MergeXR30Row_NEON(const uint16_t* src_r, int depth, int width) { int shift = 10 - depth; - asm volatile ( + asm volatile( "movi v30.16b, #255 \n" "ushr v30.4s, v30.4s, #22 \n" // 1023 "dup v31.4s, %w5 \n" @@ -1619,24 +1626,24 @@ void MergeXR30Row_10_NEON(const uint16_t* src_r, // Neon has no "shift left and accumulate/orr", so use a multiply-add to // perform the shift instead. 
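// [Editor's note, not part of the patch] The MLAs in the block below stand in
// for "shift left then OR": with the destination bits known to be zero,
//   acc |= r << 4    ==  acc += r * 16    (v6 holds 1 << 4)
//   acc |= g << 10   ==  acc += g * 1024  (v7 holds 1 << 10)
// which is why the constants 16 and 1024 are loaded instead of shift counts;
// the remaining high bits of G are then merged with a shift-right-and-
// accumulate (usra).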
int limit = 1023; - asm volatile ( - "dup v5.8h, %w[limit] \n" - "movi v6.8h, #16 \n" // 1 << 4 - "movi v7.8h, #4, lsl #8 \n" // 1 << 10 - "1: \n" - "ldr q0, [%0], #16 \n" // xxxxxxRrrrrrrrrr - "ldr q1, [%1], #16 \n" // xxxxxxGggggggggg - "ldr q2, [%2], #16 \n" // xxxxxxBbbbbbbbbb - "umin v0.8h, v0.8h, v5.8h \n" // 000000Rrrrrrrrrr - "umin v1.8h, v1.8h, v5.8h \n" // 000000Gggggggggg - "movi v4.8h, #0xc0, lsl #8 \n" // 1100000000000000 - "umin v3.8h, v2.8h, v5.8h \n" // 000000Bbbbbbbbbb - "mla v4.8h, v0.8h, v6.8h \n" // 11Rrrrrrrrrr0000 - "mla v3.8h, v1.8h, v7.8h \n" // ggggggBbbbbbbbbb - "usra v4.8h, v1.8h, #6 \n" // 11RrrrrrrrrrGggg - "subs %w4, %w4, #8 \n" - "st2 {v3.8h, v4.8h}, [%3], #32 \n" - "b.gt 1b \n" + asm volatile( + "dup v5.8h, %w[limit] \n" + "movi v6.8h, #16 \n" // 1 << 4 + "movi v7.8h, #4, lsl #8 \n" // 1 << 10 + "1: \n" + "ldr q0, [%0], #16 \n" // xxxxxxRrrrrrrrrr + "ldr q1, [%1], #16 \n" // xxxxxxGggggggggg + "ldr q2, [%2], #16 \n" // xxxxxxBbbbbbbbbb + "umin v0.8h, v0.8h, v5.8h \n" // 000000Rrrrrrrrrr + "umin v1.8h, v1.8h, v5.8h \n" // 000000Gggggggggg + "movi v4.8h, #0xc0, lsl #8 \n" // 1100000000000000 + "umin v3.8h, v2.8h, v5.8h \n" // 000000Bbbbbbbbbb + "mla v4.8h, v0.8h, v6.8h \n" // 11Rrrrrrrrrr0000 + "mla v3.8h, v1.8h, v7.8h \n" // ggggggBbbbbbbbbb + "usra v4.8h, v1.8h, #6 \n" // 11RrrrrrrrrrGggg + "subs %w4, %w4, #8 \n" + "st2 {v3.8h, v4.8h}, [%3], #32 \n" + "b.gt 1b \n" : "+r"(src_r), // %0 "+r"(src_g), // %1 "+r"(src_b), // %2 @@ -1655,7 +1662,7 @@ void MergeAR64Row_NEON(const uint16_t* src_r, int width) { int shift = 16 - depth; int mask = (1 << depth) - 1; - asm volatile ( + asm volatile( "dup v30.8h, %w7 \n" "dup v31.8h, %w6 \n" @@ -1698,7 +1705,7 @@ void MergeXR64Row_NEON(const uint16_t* src_r, int width) { int shift = 16 - depth; int mask = (1 << depth) - 1; - asm volatile ( + asm volatile( "movi v3.16b, #0xff \n" // A (0xffff) "dup v30.8h, %w6 \n" @@ -1739,7 +1746,7 @@ void MergeARGB16To8Row_NEON(const uint16_t* src_r, int width) { // Shift is 8 - depth, +8 so the result is in the top half of each lane. int shift = 16 - depth; - asm volatile ( + asm volatile( "dup v31.8h, %w6 \n" "1: \n" "ldr q0, [%0], #16 \n" // B @@ -1777,7 +1784,7 @@ void MergeXRGB16To8Row_NEON(const uint16_t* src_r, int width) { // Shift is 8 - depth, +8 so the result is in the top half of each lane. int shift = 16 - depth; - asm volatile ( + asm volatile( "dup v31.8h, %w5 \n" "movi v3.16b, #0xff \n" // A (0xff) "1: \n" @@ -1806,7 +1813,7 @@ void MergeXRGB16To8Row_NEON(const uint16_t* src_r, // Copy multiple of 32. void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) { - asm volatile ( + asm volatile( "1: \n" "ldp q0, q1, [%0], #32 \n" "prfm pldl1keep, [%0, 448] \n" @@ -1823,7 +1830,7 @@ void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) { // SetRow writes 'width' bytes using an 8 bit value repeated. 
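// [Editor's note, not part of the patch] Behaviourally SetRow_NEON is a
// vectorised memset(dst, v8, width), 16 bytes per iteration, and
// ARGBSetRow_NEON below it is the same idea with a replicated 32-bit pixel,
// 4 pixels per iteration.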
void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) { - asm volatile ( + asm volatile( "dup v0.16b, %w2 \n" // duplicate 16 bytes "1: \n" "subs %w1, %w1, #16 \n" // 16 bytes per loop @@ -1836,7 +1843,7 @@ void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) { } void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) { - asm volatile ( + asm volatile( "dup v0.4s, %w2 \n" // duplicate 4 ints "1: \n" "subs %w1, %w1, #4 \n" // 4 ints per loop @@ -1853,7 +1860,7 @@ static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) { - asm volatile ( + asm volatile( // Start at end of source row. "ld1 {v3.16b}, [%3] \n" // shuffler "add %0, %0, %w2, sxtw \n" @@ -1878,7 +1885,7 @@ static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u, 6u, 7u, 4u, 5u, 2u, 3u, 0u, 1u}; void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width) { - asm volatile ( + asm volatile( // Start at end of source row. "ld1 {v4.16b}, [%3] \n" // shuffler "add %0, %0, %w2, sxtw #1 \n" @@ -1902,7 +1909,7 @@ void MirrorSplitUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( + asm volatile( // Start at end of source row. "ld1 {v4.16b}, [%4] \n" // shuffler "add %0, %0, %w3, sxtw #1 \n" @@ -1931,7 +1938,7 @@ static const uvec8 kShuffleMirrorARGB = {12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u}; void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( // Start at end of source row. "ld1 {v4.16b}, [%3] \n" // shuffler "add %0, %0, %w2, sxtw #2 \n" @@ -1954,7 +1961,7 @@ void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { void RGB24MirrorRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_rgb24, int width) { - asm volatile ( + asm volatile( "ld1 {v3.16b}, [%4] \n" // shuffler "add %0, %0, %w2, sxtw #1 \n" // Start at end of row. "add %0, %0, %w2, sxtw \n" @@ -1979,7 +1986,7 @@ void RGB24MirrorRow_NEON(const uint8_t* src_rgb24, void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "movi v4.8b, #255 \n" // Alpha "1: \n" "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of @@ -1997,7 +2004,7 @@ void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, } void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "movi v5.8b, #255 \n" // Alpha "1: \n" "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b @@ -2016,7 +2023,7 @@ void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) { } void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { - asm volatile ( + asm volatile( "movi v0.8b, #255 \n" // Alpha "1: \n" "ld3 {v3.8b,v4.8b,v5.8b}, [%0], #24 \n" // read r g b @@ -2035,7 +2042,7 @@ void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { } void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { - asm volatile ( + asm volatile( "1: \n" "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b "subs %w2, %w2, #8 \n" // 8 processed per loop. 
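// [Editor's note, not part of the patch] In libyuv's byte-order naming RGB24
// is b,g,r in memory and RAW is r,g,b, so RAWToRGB24 is a per-pixel swap of
// bytes 0 and 2; a scalar equivalent (illustrative only) is:
//   for (int x = 0; x < width; ++x) {
//     dst_rgb24[0] = src_raw[2];
//     dst_rgb24[1] = src_raw[1];
//     dst_rgb24[2] = src_raw[0];
//     src_raw += 3;
//     dst_rgb24 += 3;
//   }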
@@ -2067,14 +2074,14 @@ void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_argb, int width) { asm volatile( - "movi v3.16b, #255 \n" // Alpha - "1: \n" - "ldp q0, q4, [%0], #32 \n" // load 16 RGB565 pixels - "subs %w2, %w2, #16 \n" // 16 processed per loop - "prfm pldl1keep, [%0, 448] \n" RGB565TOARGB - "st4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%1] \n" // store 16 ARGB - "add %1, %1, #64 \n" - "b.gt 1b \n" + "movi v3.16b, #255 \n" // Alpha + "1: \n" + "ldp q0, q4, [%0], #32 \n" // load 16 RGB565 pixels + "subs %w2, %w2, #16 \n" // 16 processed per loop + "prfm pldl1keep, [%0, 448] \n" RGB565TOARGB + "st4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%1] \n" // store 16 ARGB + "add %1, %1, #64 \n" + "b.gt 1b \n" : "+r"(src_rgb565), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -2115,14 +2122,14 @@ void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555, uint8_t* dst_argb, int width) { asm volatile( - "1: \n" - "ldp q0, q4, [%0], #32 \n" // load 16 ARGB1555 pixels - "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #16 \n" // 16 processed per loop + "1: \n" + "ldp q0, q4, [%0], #32 \n" // load 16 ARGB1555 pixels + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #16 \n" // 16 processed per loop ARGB1555TOARGB - "st4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%1] \n" // store 16 ARGB - "add %1, %1, #64 \n" - "b.gt 1b \n" + "st4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%1] \n" // store 16 ARGB + "add %1, %1, #64 \n" + "b.gt 1b \n" : "+r"(src_argb1555), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -2150,7 +2157,7 @@ void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555, void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "1: \n" "ld1 {v1.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. "subs %w2, %w2, #8 \n" // 8 processed per loop. 
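// [Editor's note, not part of the patch] For reference, ARGB4444 expands to
// 8-bit ARGB by nibble replication in the scalar path, v8 = (v4 << 4) | v4
// (so 0xF -> 0xFF and 0x0 -> 0x00); the NEON hunks here operate on the packed
// 4444 data and perform the equivalent expansion with shifts and inserts.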
@@ -2179,29 +2186,29 @@ static void ABCDToAR30Row_NEON(const uint8_t* src_abcd, uint8_t* dst_ar30, int width, const uint8_t* indices) { - asm volatile ( - "movi v2.4s, #0xf, msl 16 \n" // 0xfffff - "ldr q3, [%[kAR30Row_BoxShifts]] \n" - "ldp q4, q5, [%[indices]] \n" + asm volatile( + "movi v2.4s, #0xf, msl 16 \n" // 0xfffff + "ldr q3, [%[kAR30Row_BoxShifts]] \n" + "ldp q4, q5, [%[indices]] \n" "1: \n" - "ldp q0, q20, [%[src]], #32 \n" - "subs %w[width], %w[width], #8 \n" - "tbl v1.16b, {v0.16b}, v5.16b \n" - "tbl v21.16b, {v20.16b}, v5.16b \n" - "tbl v0.16b, {v0.16b}, v4.16b \n" - "tbl v20.16b, {v20.16b}, v4.16b \n" - "ushl v0.8h, v0.8h, v3.8h \n" - "ushl v20.8h, v20.8h, v3.8h \n" - "ushl v1.8h, v1.8h, v3.8h \n" - "ushl v21.8h, v21.8h, v3.8h \n" - "ushr v0.4s, v0.4s, #6 \n" - "ushr v20.4s, v20.4s, #6 \n" - "shl v1.4s, v1.4s, #14 \n" - "shl v21.4s, v21.4s, #14 \n" - "bif v0.16b, v1.16b, v2.16b \n" - "bif v20.16b, v21.16b, v2.16b \n" - "stp q0, q20, [%[dst]], #32 \n" - "b.gt 1b \n" + "ldp q0, q20, [%[src]], #32 \n" + "subs %w[width], %w[width], #8 \n" + "tbl v1.16b, {v0.16b}, v5.16b \n" + "tbl v21.16b, {v20.16b}, v5.16b \n" + "tbl v0.16b, {v0.16b}, v4.16b \n" + "tbl v20.16b, {v20.16b}, v4.16b \n" + "ushl v0.8h, v0.8h, v3.8h \n" + "ushl v20.8h, v20.8h, v3.8h \n" + "ushl v1.8h, v1.8h, v3.8h \n" + "ushl v21.8h, v21.8h, v3.8h \n" + "ushr v0.4s, v0.4s, #6 \n" + "ushr v20.4s, v20.4s, #6 \n" + "shl v1.4s, v1.4s, #14 \n" + "shl v21.4s, v21.4s, #14 \n" + "bif v0.16b, v1.16b, v2.16b \n" + "bif v20.16b, v21.16b, v2.16b \n" + "stp q0, q20, [%[dst]], #32 \n" + "b.gt 1b \n" : [src] "+r"(src_abcd), // %[src] [dst] "+r"(dst_ar30), // %[dst] [width] "+r"(width) // %[width] @@ -2221,7 +2228,7 @@ void ARGBToAR30Row_NEON(const uint8_t* src_argb, uint8_t* dst_ar30, int width) { void ARGBToRGB24Row_NEON(const uint8_t* src_argb, uint8_t* dst_rgb24, int width) { - asm volatile ( + asm volatile( "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB "subs %w2, %w2, #16 \n" // 16 pixels per loop. @@ -2237,7 +2244,7 @@ void ARGBToRGB24Row_NEON(const uint8_t* src_argb, } void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) { - asm volatile ( + asm volatile( "1: \n" "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a "subs %w2, %w2, #8 \n" // 8 processed per loop. @@ -2255,7 +2262,7 @@ void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) { } void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { - asm volatile ( + asm volatile( "1: \n" "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2. "subs %w2, %w2, #16 \n" // 16 processed per loop. @@ -2271,7 +2278,7 @@ void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { } void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { - asm volatile ( + asm volatile( "1: \n" "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY. "subs %w2, %w2, #16 \n" // 16 processed per loop. @@ -2290,7 +2297,7 @@ void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( + asm volatile( "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. @@ -2311,7 +2318,7 @@ void UYVYToUV422Row_NEON(const uint8_t* src_uyvy, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( + asm volatile( "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. 
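// [Editor's note, not part of the patch] YUY2 packs pixels as Y0,U,Y1,V and
// UYVY as U,Y0,V,Y1, so the ld4 of four byte-planes in these hunks leaves the
// chroma samples in their own registers: for YUY2 U/V land in v1/v3, for UYVY
// in v0/v2, ready to be stored directly as planar UV422.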
@@ -2334,7 +2341,7 @@ void YUY2ToUVRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_v, int width) { const uint8_t* src_yuy2b = src_yuy2 + stride_yuy2; - asm volatile ( + asm volatile( "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. @@ -2362,7 +2369,7 @@ void UYVYToUVRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_v, int width) { const uint8_t* src_uyvyb = src_uyvy + stride_uyvy; - asm volatile ( + asm volatile( "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. @@ -2389,7 +2396,7 @@ void YUY2ToNVUVRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_uv, int width) { const uint8_t* src_yuy2b = src_yuy2 + stride_yuy2; - asm volatile ( + asm volatile( "1: \n" "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. @@ -2412,7 +2419,7 @@ void ARGBShuffleRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, const uint8_t* shuffler, int width) { - asm volatile ( + asm volatile( "ld1 {v2.16b}, [%3] \n" // shuffler "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels. @@ -2434,7 +2441,7 @@ void I422ToYUY2Row_NEON(const uint8_t* src_y, const uint8_t* src_v, uint8_t* dst_yuy2, int width) { - asm volatile ( + asm volatile( "1: \n" "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys "subs %w4, %w4, #16 \n" // 16 pixels @@ -2458,7 +2465,7 @@ void I422ToUYVYRow_NEON(const uint8_t* src_y, const uint8_t* src_v, uint8_t* dst_uyvy, int width) { - asm volatile ( + asm volatile( "1: \n" "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys "mov v3.8b, v2.8b \n" @@ -2480,7 +2487,7 @@ void I422ToUYVYRow_NEON(const uint8_t* src_y, void ARGBToRGB565Row_NEON(const uint8_t* src_argb, uint8_t* dst_rgb565, int width) { - asm volatile ( + asm volatile( "1: \n" "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 // pixels @@ -2499,7 +2506,7 @@ void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, uint8_t* dst_rgb, uint32_t dither4, int width) { - asm volatile ( + asm volatile( "dup v1.4s, %w3 \n" // dither4 "1: \n" "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 ARGB @@ -2537,7 +2544,7 @@ void ARGBToARGB1555Row_NEON(const uint8_t* src_argb, void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, uint8_t* dst_argb4444, int width) { - asm volatile ( + asm volatile( "1: \n" "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 // pixels @@ -2556,7 +2563,7 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, void ARGBToAR64Row_NEON(const uint8_t* src_argb, uint16_t* dst_ar64, int width) { - asm volatile ( + asm volatile( "1: \n" "ldp q0, q2, [%0], #32 \n" // load 8 pixels "mov v1.16b, v0.16b \n" @@ -2579,7 +2586,7 @@ static const uvec8 kShuffleARGBToABGR = {2, 1, 0, 3, 6, 5, 4, 7, void ARGBToAB64Row_NEON(const uint8_t* src_argb, uint16_t* dst_ab64, int width) { - asm volatile ( + asm volatile( "ldr q4, [%3] \n" // shuffler "1: \n" "ldp q0, q2, [%0], #32 \n" // load 8 pixels @@ -2602,7 +2609,7 @@ void ARGBToAB64Row_NEON(const uint8_t* src_argb, void ARGBToAR64Row_NEON(const uint8_t* src_argb, uint16_t* dst_ar64, int width) { - asm volatile ( + asm volatile( "1: \n" "ldp q0, q1, [%0], #32 \n" // load 8 ARGB pixels "subs %w2, %w2, #8 \n" // 8 processed per loop. 
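// [Editor's note, not part of the patch] AR64 is ARGB at 16 bits per channel,
// and widening from 8 bits duplicates each byte into both halves of the lane
// (v16 = v8 * 0x0101), so 0xFF maps to 0xFFFF; the register copies followed
// by interleaves in these ARGBToAR64/ARGBToAB64 hunks implement that
// duplication.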
@@ -2627,7 +2634,7 @@ static const uvec8 kShuffleARGBToAB64[2] = { void ARGBToAB64Row_NEON(const uint8_t* src_argb, uint16_t* dst_ab64, int width) { - asm volatile ( + asm volatile( "ldp q6, q7, [%3] \n" // 2 shufflers "1: \n" "ldp q0, q1, [%0], #32 \n" // load 8 pixels @@ -2653,7 +2660,7 @@ static const uvec8 kShuffleAR64ToARGB = {1, 3, 5, 7, 9, 11, 13, 15, void AR64ToARGBRow_NEON(const uint16_t* src_ar64, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "ldr q4, [%3] \n" // shuffler "1: \n" "ldp q0, q1, [%0], #32 \n" // load 4 pixels @@ -2677,7 +2684,7 @@ static const uvec8 kShuffleAB64ToARGB = {5, 3, 1, 7, 13, 11, 9, 15, void AB64ToARGBRow_NEON(const uint16_t* src_ab64, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "ldr q4, [%3] \n" // shuffler "1: \n" "ldp q0, q1, [%0], #32 \n" // load 4 pixels @@ -2698,7 +2705,7 @@ void AB64ToARGBRow_NEON(const uint16_t* src_ab64, void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, uint8_t* dst_a, int width) { - asm volatile ( + asm volatile( "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 "prfm pldl1keep, [%0, 448] \n" @@ -2772,7 +2779,7 @@ static void ARGBToUV444MatrixRow_NEON_I8MM( uint8_t* dst_v, int width, const struct RgbUVConstantsI8* rgbuvconstants) { - asm("ld2r {v16.4s, v17.4s}, [%[rgbuvconstants]] \n" + asm("ld2r {v16.4s, v17.4s}, [%[rgbuvconstants]] \n" "movi v29.16b, #0x80 \n" // 128.5 "1: \n" "ldp q0, q1, [%[src]], #32 \n" @@ -3288,30 +3295,30 @@ void RGB565ToUVRow_NEON(const uint8_t* src_rgb565, const uint8_t* src_rgb565_1 = src_rgb565 + src_stride_rgb565; asm volatile( RGBTOUV_SETUP_REG - "1: \n" - "ldp q0, q4, [%0], #32 \n" // load 16 RGB565 pixels. + "1: \n" + "ldp q0, q4, [%0], #32 \n" // load 16 RGB565 pixels. RGB565TOARGB - "uaddlp v16.8h, v0.16b \n" // B 16 bytes -> 8 shorts. - "prfm pldl1keep, [%0, 448] \n" - "uaddlp v17.8h, v1.16b \n" // G 16 bytes -> 8 shorts. - "uaddlp v18.8h, v2.16b \n" // R 16 bytes -> 8 shorts. + "uaddlp v16.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%0, 448] \n" + "uaddlp v17.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v18.8h, v2.16b \n" // R 16 bytes -> 8 shorts. - "ldp q0, q4, [%1], #32 \n" // load 16 RGB565 pixels. + "ldp q0, q4, [%1], #32 \n" // load 16 RGB565 pixels. RGB565TOARGB - "uadalp v16.8h, v0.16b \n" // B 16 bytes -> 8 shorts. - "prfm pldl1keep, [%1, 448] \n" - "uadalp v17.8h, v1.16b \n" // G 16 bytes -> 8 shorts. - "uadalp v18.8h, v2.16b \n" // R 16 bytes -> 8 shorts. + "uadalp v16.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%1, 448] \n" + "uadalp v17.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v18.8h, v2.16b \n" // R 16 bytes -> 8 shorts. - "urshr v0.8h, v16.8h, #1 \n" // 2x average - "urshr v1.8h, v17.8h, #1 \n" - "urshr v2.8h, v18.8h, #1 \n" + "urshr v0.8h, v16.8h, #1 \n" // 2x average + "urshr v1.8h, v17.8h, #1 \n" + "urshr v2.8h, v18.8h, #1 \n" - "subs %w4, %w4, #16 \n" // 16 processed per loop. + "subs %w4, %w4, #16 \n" // 16 processed per loop. RGBTOUV(v0.8h, v1.8h, v2.8h) - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. - "b.gt 1b \n" + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 
+ "b.gt 1b \n" : "+r"(src_rgb565), // %0 "+r"(src_rgb565_1), // %1 "+r"(dst_u), // %2 @@ -3332,30 +3339,30 @@ void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555, const uint8_t* src_argb1555_1 = src_argb1555 + src_stride_argb1555; asm volatile( RGBTOUV_SETUP_REG - "1: \n" - "ldp q0, q3, [%0], #32 \n" // load 16 ARGB1555 pixels. + "1: \n" + "ldp q0, q3, [%0], #32 \n" // load 16 ARGB1555 pixels. RGB555TOARGB - "uaddlp v16.8h, v0.16b \n" // B 16 bytes -> 8 shorts. - "prfm pldl1keep, [%0, 448] \n" - "uaddlp v17.8h, v1.16b \n" // G 16 bytes -> 8 shorts. - "uaddlp v18.8h, v2.16b \n" // R 16 bytes -> 8 shorts. + "uaddlp v16.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%0, 448] \n" + "uaddlp v17.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v18.8h, v2.16b \n" // R 16 bytes -> 8 shorts. - "ldp q0, q3, [%1], #32 \n" // load 16 ARGB1555 pixels. + "ldp q0, q3, [%1], #32 \n" // load 16 ARGB1555 pixels. RGB555TOARGB - "uadalp v16.8h, v0.16b \n" // B 16 bytes -> 8 shorts. - "prfm pldl1keep, [%1, 448] \n" - "uadalp v17.8h, v1.16b \n" // G 16 bytes -> 8 shorts. - "uadalp v18.8h, v2.16b \n" // R 16 bytes -> 8 shorts. + "uadalp v16.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%1, 448] \n" + "uadalp v17.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v18.8h, v2.16b \n" // R 16 bytes -> 8 shorts. - "urshr v0.8h, v16.8h, #1 \n" // 2x average - "urshr v1.8h, v17.8h, #1 \n" - "urshr v2.8h, v18.8h, #1 \n" + "urshr v0.8h, v16.8h, #1 \n" // 2x average + "urshr v1.8h, v17.8h, #1 \n" + "urshr v2.8h, v18.8h, #1 \n" - "subs %w4, %w4, #16 \n" // 16 processed per loop. + "subs %w4, %w4, #16 \n" // 16 processed per loop. RGBTOUV(v0.8h, v1.8h, v2.8h) - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. - "b.gt 1b \n" + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" : "+r"(src_argb1555), // %0 "+r"(src_argb1555_1), // %1 "+r"(dst_u), // %2 @@ -3376,30 +3383,30 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, const uint8_t* src_argb4444_1 = src_argb4444 + src_stride_argb4444; asm volatile( RGBTOUV_SETUP_REG // sets v20-v25 - "1: \n" - "ldp q0, q3, [%0], #32 \n" // load 16 ARGB4444 pixels. + "1: \n" + "ldp q0, q3, [%0], #32 \n" // load 16 ARGB4444 pixels. ARGB4444TORGB - "uaddlp v16.8h, v0.16b \n" // B 16 bytes -> 8 shorts. - "prfm pldl1keep, [%0, 448] \n" - "uaddlp v17.8h, v1.16b \n" // G 16 bytes -> 8 shorts. - "uaddlp v18.8h, v2.16b \n" // R 16 bytes -> 8 shorts. + "uaddlp v16.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%0, 448] \n" + "uaddlp v17.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v18.8h, v2.16b \n" // R 16 bytes -> 8 shorts. - "ldp q0, q3, [%1], #32 \n" // load 16 ARGB4444 pixels. + "ldp q0, q3, [%1], #32 \n" // load 16 ARGB4444 pixels. ARGB4444TORGB - "uadalp v16.8h, v0.16b \n" // B 16 bytes -> 8 shorts. - "prfm pldl1keep, [%1, 448] \n" - "uadalp v17.8h, v1.16b \n" // G 16 bytes -> 8 shorts. - "uadalp v18.8h, v2.16b \n" // R 16 bytes -> 8 shorts. + "uadalp v16.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%1, 448] \n" + "uadalp v17.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v18.8h, v2.16b \n" // R 16 bytes -> 8 shorts. - "urshr v0.8h, v16.8h, #1 \n" // 2x average - "urshr v1.8h, v17.8h, #1 \n" - "urshr v2.8h, v18.8h, #1 \n" + "urshr v0.8h, v16.8h, #1 \n" // 2x average + "urshr v1.8h, v17.8h, #1 \n" + "urshr v2.8h, v18.8h, #1 \n" - "subs %w4, %w4, #16 \n" // 16 processed per loop. 
+ "subs %w4, %w4, #16 \n" // 16 processed per loop. RGBTOUV(v0.8h, v1.8h, v2.8h) - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. - "b.gt 1b \n" + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" : "+r"(src_argb4444), // %0 "+r"(src_argb4444_1), // %1 "+r"(dst_u), // %2 @@ -3448,26 +3455,26 @@ void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, uint8_t* dst_y, int width) { asm volatile( - "movi v4.16b, #25 \n" // B * 0.1016 coefficient - "movi v5.16b, #129 \n" // G * 0.5078 coefficient - "movi v6.16b, #66 \n" // R * 0.2578 coefficient - "movi v7.16b, #16 \n" // Add 16 constant - "1: \n" - "ldp q0, q3, [%0], #32 \n" // load 16 ARGB1555 pixels. - "subs %w2, %w2, #16 \n" // 16 processed per loop. + "movi v4.16b, #25 \n" // B * 0.1016 coefficient + "movi v5.16b, #129 \n" // G * 0.5078 coefficient + "movi v6.16b, #66 \n" // R * 0.2578 coefficient + "movi v7.16b, #16 \n" // Add 16 constant + "1: \n" + "ldp q0, q3, [%0], #32 \n" // load 16 ARGB1555 pixels. + "subs %w2, %w2, #16 \n" // 16 processed per loop. RGB555TOARGB - "umull v16.8h, v0.8b, v4.8b \n" // B - "umull2 v17.8h, v0.16b, v4.16b \n" // B - "prfm pldl1keep, [%0, 448] \n" - "umlal v16.8h, v1.8b, v5.8b \n" // G - "umlal2 v17.8h, v1.16b, v5.16b \n" // G - "umlal v16.8h, v2.8b, v6.8b \n" // R - "umlal2 v17.8h, v2.16b, v6.16b \n" // R - "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y - "uqrshrn2 v0.16b, v17.8h, #8 \n" // 16 bit to 8 bit Y - "uqadd v0.16b, v0.16b, v7.16b \n" - "str q0, [%1], #16 \n" // store pixels Y. - "b.gt 1b \n" + "umull v16.8h, v0.8b, v4.8b \n" // B + "umull2 v17.8h, v0.16b, v4.16b \n" // B + "prfm pldl1keep, [%0, 448] \n" + "umlal v16.8h, v1.8b, v5.8b \n" // G + "umlal2 v17.8h, v1.16b, v5.16b \n" // G + "umlal v16.8h, v2.8b, v6.8b \n" // R + "umlal2 v17.8h, v2.16b, v6.16b \n" // R + "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y + "uqrshrn2 v0.16b, v17.8h, #8 \n" // 16 bit to 8 bit Y + "uqadd v0.16b, v0.16b, v7.16b \n" + "str q0, [%1], #16 \n" // store pixels Y. + "b.gt 1b \n" : "+r"(src_argb1555), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -3480,26 +3487,26 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, uint8_t* dst_y, int width) { asm volatile( - "movi v24.16b, #25 \n" // B * 0.1016 coefficient - "movi v25.16b, #129 \n" // G * 0.5078 coefficient - "movi v26.16b, #66 \n" // R * 0.2578 coefficient - "movi v27.16b, #16 \n" // Add 16 constant - "1: \n" - "ldp q0, q3, [%0], #32 \n" // load 16 ARGB4444 pixels. - "subs %w2, %w2, #16 \n" // 16 processed per loop. + "movi v24.16b, #25 \n" // B * 0.1016 coefficient + "movi v25.16b, #129 \n" // G * 0.5078 coefficient + "movi v26.16b, #66 \n" // R * 0.2578 coefficient + "movi v27.16b, #16 \n" // Add 16 constant + "1: \n" + "ldp q0, q3, [%0], #32 \n" // load 16 ARGB4444 pixels. + "subs %w2, %w2, #16 \n" // 16 processed per loop. ARGB4444TORGB - "umull v16.8h, v0.8b, v24.8b \n" // B - "umull2 v17.8h, v0.16b, v24.16b \n" // B - "prfm pldl1keep, [%0, 448] \n" - "umlal v16.8h, v1.8b, v25.8b \n" // G - "umlal2 v17.8h, v1.16b, v25.16b \n" // G - "umlal v16.8h, v2.8b, v26.8b \n" // R - "umlal2 v17.8h, v2.16b, v26.16b \n" // R - "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y - "uqrshrn2 v0.16b, v17.8h, #8 \n" // 16 bit to 8 bit Y - "uqadd v0.16b, v0.16b, v27.16b \n" - "str q0, [%1], #16 \n" // store 8 pixels Y. 
- "b.gt 1b \n" + "umull v16.8h, v0.8b, v24.8b \n" // B + "umull2 v17.8h, v0.16b, v24.16b \n" // B + "prfm pldl1keep, [%0, 448] \n" + "umlal v16.8h, v1.8b, v25.8b \n" // G + "umlal2 v17.8h, v1.16b, v25.16b \n" // G + "umlal v16.8h, v2.8b, v26.8b \n" // R + "umlal2 v17.8h, v2.16b, v26.16b \n" // R + "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y + "uqrshrn2 v0.16b, v17.8h, #8 \n" // 16 bit to 8 bit Y + "uqadd v0.16b, v0.16b, v27.16b \n" + "str q0, [%1], #16 \n" // store 8 pixels Y. + "b.gt 1b \n" : "+r"(src_argb4444), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -3517,7 +3524,7 @@ static void ARGBToYMatrixRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width, const struct RgbConstants* rgbconstants) { - asm volatile ( + asm volatile( "ldr d0, [%3] \n" // load rgbconstants "dup v6.16b, v0.b[0] \n" "dup v7.16b, v0.b[1] \n" @@ -3551,7 +3558,7 @@ static void ARGBToYMatrixRow_NEON_DotProd( uint8_t* dst_y, int width, const struct RgbConstants* rgbconstants) { - asm volatile ( + asm volatile( "ldr d0, [%3] \n" // load rgbconstants "dup v16.4s, v0.s[0] \n" "dup v17.8h, v0.h[2] \n" @@ -3653,7 +3660,7 @@ static void RGBAToYMatrixRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width, const struct RgbConstants* rgbconstants) { - asm volatile ( + asm volatile( "ldr d0, [%3] \n" // load rgbconstants "dup v6.16b, v0.b[0] \n" "dup v7.16b, v0.b[1] \n" @@ -3725,7 +3732,7 @@ static void RGBToYMatrixRow_NEON(const uint8_t* src_rgb, uint8_t* dst_y, int width, const struct RgbConstants* rgbconstants) { - asm volatile ( + asm volatile( "ldr d0, [%3] \n" // load rgbconstants "dup v5.16b, v0.b[0] \n" "dup v6.16b, v0.b[1] \n" @@ -3777,7 +3784,7 @@ void InterpolateRow_NEON(uint8_t* dst_ptr, int y1_fraction = source_y_fraction; int y0_fraction = 256 - y1_fraction; const uint8_t* src_ptr1 = src_ptr + src_stride; - asm volatile ( + asm volatile( "cmp %w4, #0 \n" "b.eq 100f \n" "cmp %w4, #128 \n" @@ -3843,7 +3850,7 @@ void InterpolateRow_16_NEON(uint16_t* dst_ptr, int y0_fraction = 256 - y1_fraction; const uint16_t* src_ptr1 = src_ptr + src_stride; - asm volatile ( + asm volatile( "cmp %w4, #0 \n" "b.eq 100f \n" "cmp %w4, #128 \n" @@ -3915,7 +3922,7 @@ void InterpolateRow_16To8_NEON(uint8_t* dst_ptr, const uint16_t* src_ptr1 = src_ptr + src_stride; int shift = 15 - __builtin_clz((int32_t)scale); // Negative shl is shr - asm volatile ( + asm volatile( "dup v6.8h, %w6 \n" "cmp %w4, #0 \n" "b.eq 100f \n" @@ -3983,7 +3990,7 @@ void ARGBBlendRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "subs %w3, %w3, #8 \n" "b.lt 89f \n" // Blend 8 pixels. @@ -4054,7 +4061,7 @@ void ARGBBlendRow_NEON(const uint8_t* src_argb, void ARGBAttenuateRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "movi v7.8h, #0x00ff \n" // 255 for rounding up // Attenuate 8 pixels. @@ -4084,7 +4091,7 @@ void ARGBQuantizeRow_NEON(uint8_t* dst_argb, int interval_size, int interval_offset, int width) { - asm volatile ( + asm volatile( "dup v4.8h, %w2 \n" "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1 "dup v5.8h, %w3 \n" // interval multiply. @@ -4127,30 +4134,30 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width, uint32_t value) { - asm volatile ( - "dup v0.4s, %w3 \n" // duplicate scale value. - "zip1 v0.16b, v0.16b, v0.16b \n" // v0.16b aarrggbbaarrggbb. - "ushr v0.8h, v0.8h, #1 \n" // scale / 2. + asm volatile( + "dup v0.4s, %w3 \n" // duplicate scale value. 
+ "zip1 v0.16b, v0.16b, v0.16b \n" // v0.16b aarrggbbaarrggbb. + "ushr v0.8h, v0.8h, #1 \n" // scale / 2. // 8 pixel loop. - "1: \n" + "1: \n" "ld1 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "uxtl v4.8h, v4.8b \n" - "prfm pldl1keep, [%0, 448] \n" - "uxtl v5.8h, v5.8b \n" - "uxtl v6.8h, v6.8b \n" - "uxtl v7.8h, v7.8b \n" - "sqrdmulh v4.8h, v4.8h, v0.8h \n" // argb * scale * 2 - "sqrdmulh v5.8h, v5.8h, v0.8h \n" - "sqrdmulh v6.8h, v6.8h, v0.8h \n" - "sqrdmulh v7.8h, v7.8h, v0.8h \n" - "uqxtn v4.8b, v4.8h \n" - "uqxtn v5.8b, v5.8h \n" - "uqxtn v6.8b, v6.8h \n" - "uqxtn v7.8b, v7.8h \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "uxtl v4.8h, v4.8b \n" + "prfm pldl1keep, [%0, 448] \n" + "uxtl v5.8h, v5.8b \n" + "uxtl v6.8h, v6.8b \n" + "uxtl v7.8h, v7.8b \n" + "sqrdmulh v4.8h, v4.8h, v0.8h \n" // argb * scale * 2 + "sqrdmulh v5.8h, v5.8h, v0.8h \n" + "sqrdmulh v6.8h, v6.8h, v0.8h \n" + "sqrdmulh v7.8h, v7.8h, v0.8h \n" + "uqxtn v4.8b, v4.8h \n" + "uqxtn v5.8b, v5.8h \n" + "uqxtn v6.8b, v6.8h \n" + "uqxtn v7.8b, v7.8h \n" "st1 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB - "b.gt 1b \n" + "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -4162,7 +4169,7 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb, // Similar to ARGBToYJ but stores ARGB. // C code is (29 * b + 150 * g + 77 * r + 128) >> 8; void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "movi v24.8b, #29 \n" // B * 0.1140 coefficient "movi v25.8b, #150 \n" // G * 0.5870 coefficient "movi v26.8b, #77 \n" // R * 0.2990 coefficient @@ -4193,22 +4200,22 @@ void ARGBGrayRow_NEON_DotProd(const uint8_t* src_argb, uint8_t* dst_argb, int width) { asm volatile( - "ld1r {v24.4s}, [%[coeffs]] \n" - "ldr q25, [%[indices]] \n" - "1: \n" - "ldp q1, q3, [%[src]], #32 \n" // load 8 ARGB - "movi v0.4s, #0 \n" - "movi v2.4s, #0 \n" - "subs %w[width], %w[width], #8 \n" // 8 processed per loop - "udot v0.4s, v1.16b, v24.16b \n" - "udot v2.4s, v3.16b, v24.16b \n" - "prfm pldl1keep, [%[src], 448] \n" - "uqrshrn v0.8b, v0.8h, #8 \n" - "uqrshrn v2.8b, v2.8h, #8 \n" - "tbl v0.16b, {v0.16b, v1.16b}, v25.16b \n" // merge in alpha - "tbl v1.16b, {v2.16b, v3.16b}, v25.16b \n" - "stp q0, q1, [%[dst]], #32 \n" // store 8 pixels - "b.gt 1b \n" + "ld1r {v24.4s}, [%[coeffs]] \n" + "ldr q25, [%[indices]] \n" + "1: \n" + "ldp q1, q3, [%[src]], #32 \n" // load 8 ARGB + "movi v0.4s, #0 \n" + "movi v2.4s, #0 \n" + "subs %w[width], %w[width], #8 \n" // 8 processed per loop + "udot v0.4s, v1.16b, v24.16b \n" + "udot v2.4s, v3.16b, v24.16b \n" + "prfm pldl1keep, [%[src], 448] \n" + "uqrshrn v0.8b, v0.8h, #8 \n" + "uqrshrn v2.8b, v2.8h, #8 \n" + "tbl v0.16b, {v0.16b, v1.16b}, v25.16b \n" // merge in alpha + "tbl v1.16b, {v2.16b, v3.16b}, v25.16b \n" + "stp q0, q1, [%[dst]], #32 \n" // store 8 pixels + "b.gt 1b \n" : [src] "+r"(src_argb), // %[src] [dst] "+r"(dst_argb), // %[dst] [width] "+r"(width) // %[width] @@ -4223,7 +4230,7 @@ void ARGBGrayRow_NEON_DotProd(const uint8_t* src_argb, // r = (r * 50 + g * 98 + b * 24) >> 7 void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "movi v20.8b, #17 \n" // BB coefficient "movi v21.8b, #68 \n" // BG coefficient "movi v22.8b, #35 \n" // BR coefficient @@ -4265,32 +4272,32 @@ static const uvec8 kARGBSepiaRowAlphaIndices = {3, 7, 11, 15, 19, 23, 27, 31}; void ARGBSepiaRow_NEON_DotProd(uint8_t* dst_argb, int width) { asm volatile( 
"ld3r {v20.4s, v21.4s, v22.4s}, [%[coeffs]] \n" - "ldr d23, [%[indices]] \n" - "1: \n" - "ldp q0, q1, [%[dst]] \n" - "movi v2.4s, #0 \n" - "movi v3.4s, #0 \n" - "movi v4.4s, #0 \n" - "movi v5.4s, #0 \n" - "movi v6.4s, #0 \n" - "movi v7.4s, #0 \n" - "udot v2.4s, v0.16b, v20.16b \n" - "udot v3.4s, v1.16b, v20.16b \n" - "udot v4.4s, v0.16b, v21.16b \n" - "udot v5.4s, v1.16b, v21.16b \n" - "udot v6.4s, v0.16b, v22.16b \n" - "udot v7.4s, v1.16b, v22.16b \n" - "subs %w1, %w1, #8 \n" - "prfm pldl1keep, [%[dst], 448] \n" - "uzp1 v6.8h, v6.8h, v7.8h \n" - "uzp1 v5.8h, v4.8h, v5.8h \n" - "uzp1 v4.8h, v2.8h, v3.8h \n" + "ldr d23, [%[indices]] \n" + "1: \n" + "ldp q0, q1, [%[dst]] \n" + "movi v2.4s, #0 \n" + "movi v3.4s, #0 \n" + "movi v4.4s, #0 \n" + "movi v5.4s, #0 \n" + "movi v6.4s, #0 \n" + "movi v7.4s, #0 \n" + "udot v2.4s, v0.16b, v20.16b \n" + "udot v3.4s, v1.16b, v20.16b \n" + "udot v4.4s, v0.16b, v21.16b \n" + "udot v5.4s, v1.16b, v21.16b \n" + "udot v6.4s, v0.16b, v22.16b \n" + "udot v7.4s, v1.16b, v22.16b \n" + "subs %w1, %w1, #8 \n" + "prfm pldl1keep, [%[dst], 448] \n" + "uzp1 v6.8h, v6.8h, v7.8h \n" + "uzp1 v5.8h, v4.8h, v5.8h \n" + "uzp1 v4.8h, v2.8h, v3.8h \n" "tbl v3.16b, {v0.16b, v1.16b}, v23.16b \n" - "uqshrn v0.8b, v4.8h, #7 \n" - "uqshrn v1.8b, v5.8h, #7 \n" - "uqshrn v2.8b, v6.8h, #7 \n" + "uqshrn v0.8b, v4.8h, #7 \n" + "uqshrn v1.8b, v5.8h, #7 \n" + "uqshrn v2.8b, v6.8h, #7 \n" "st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%[dst]], #32 \n" - "b.gt 1b \n" + "b.gt 1b \n" : [dst] "+r"(dst_argb), // %[dst] [width] "+r"(width) // %[width] : [coeffs] "r"(&kARGBSepiaRowCoeffs), // %[coeffs] @@ -4306,7 +4313,7 @@ void ARGBColorMatrixRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, const int8_t* matrix_argb, int width) { - asm volatile ( + asm volatile( "ld1 {v2.16b}, [%3] \n" // load 3 ARGB vectors. "sxtl v0.8h, v2.8b \n" // B,G coefficients s16. "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16. @@ -4365,51 +4372,51 @@ void ARGBColorMatrixRow_NEON_I8MM(const uint8_t* src_argb, uint8_t* dst_argb, const int8_t* matrix_argb, int width) { - asm volatile ( - "ld1 {v31.16b}, [%[matrix_argb]] \n" + asm volatile( + "ld1 {v31.16b}, [%[matrix_argb]] \n" - "1: \n" - "ld1 {v0.16b, v1.16b}, [%[src_argb]], #32 \n" + "1: \n" + "ld1 {v0.16b, v1.16b}, [%[src_argb]], #32 \n" - "movi v16.4s, #0 \n" - "movi v17.4s, #0 \n" - "movi v18.4s, #0 \n" - "movi v19.4s, #0 \n" - "movi v20.4s, #0 \n" - "movi v21.4s, #0 \n" - "movi v22.4s, #0 \n" - "movi v23.4s, #0 \n" + "movi v16.4s, #0 \n" + "movi v17.4s, #0 \n" + "movi v18.4s, #0 \n" + "movi v19.4s, #0 \n" + "movi v20.4s, #0 \n" + "movi v21.4s, #0 \n" + "movi v22.4s, #0 \n" + "movi v23.4s, #0 \n" // 8 processed per loop. 
- "subs %w2, %w2, #8 \n" - "prfm pldl1keep, [%[src_argb], 448] \n" + "subs %w2, %w2, #8 \n" + "prfm pldl1keep, [%[src_argb], 448] \n" - "sudot v16.4s, v31.16b, v0.4b[0] \n" - "sudot v17.4s, v31.16b, v0.4b[1] \n" - "sudot v18.4s, v31.16b, v0.4b[2] \n" - "sudot v19.4s, v31.16b, v0.4b[3] \n" - "sudot v20.4s, v31.16b, v1.4b[0] \n" - "sudot v21.4s, v31.16b, v1.4b[1] \n" - "sudot v22.4s, v31.16b, v1.4b[2] \n" - "sudot v23.4s, v31.16b, v1.4b[3] \n" + "sudot v16.4s, v31.16b, v0.4b[0] \n" + "sudot v17.4s, v31.16b, v0.4b[1] \n" + "sudot v18.4s, v31.16b, v0.4b[2] \n" + "sudot v19.4s, v31.16b, v0.4b[3] \n" + "sudot v20.4s, v31.16b, v1.4b[0] \n" + "sudot v21.4s, v31.16b, v1.4b[1] \n" + "sudot v22.4s, v31.16b, v1.4b[2] \n" + "sudot v23.4s, v31.16b, v1.4b[3] \n" - "shrn v16.4h, v16.4s, #6 \n" - "shrn v18.4h, v18.4s, #6 \n" - "shrn v20.4h, v20.4s, #6 \n" - "shrn v22.4h, v22.4s, #6 \n" - "shrn2 v16.8h, v17.4s, #6 \n" - "shrn2 v18.8h, v19.4s, #6 \n" - "shrn2 v20.8h, v21.4s, #6 \n" - "shrn2 v22.8h, v23.4s, #6 \n" + "shrn v16.4h, v16.4s, #6 \n" + "shrn v18.4h, v18.4s, #6 \n" + "shrn v20.4h, v20.4s, #6 \n" + "shrn v22.4h, v22.4s, #6 \n" + "shrn2 v16.8h, v17.4s, #6 \n" + "shrn2 v18.8h, v19.4s, #6 \n" + "shrn2 v20.8h, v21.4s, #6 \n" + "shrn2 v22.8h, v23.4s, #6 \n" - "uqxtn v16.8b, v16.8h \n" - "uqxtn v18.8b, v18.8h \n" - "uqxtn v20.8b, v20.8h \n" - "uqxtn v22.8b, v22.8h \n" + "uqxtn v16.8b, v16.8h \n" + "uqxtn v18.8b, v18.8h \n" + "uqxtn v20.8b, v20.8h \n" + "uqxtn v22.8b, v22.8h \n" - "stp d16, d18, [%[dst_argb]], #16 \n" - "stp d20, d22, [%[dst_argb]], #16 \n" - "b.gt 1b \n" + "stp d16, d18, [%[dst_argb]], #16 \n" + "stp d20, d22, [%[dst_argb]], #16 \n" + "b.gt 1b \n" : [src_argb] "+r"(src_argb), // %[src_argb] [dst_argb] "+r"(dst_argb), // %[dst_argb] [width] "+r"(width) // %[width] @@ -4423,7 +4430,7 @@ void ARGBMultiplyRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( // 8 pixel loop. "1: \n" "ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB @@ -4454,7 +4461,7 @@ void ARGBAddRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( // 8 pixel loop. "1: \n" "ldp q0, q1, [%0], #32 \n" // load 8 ARGB @@ -4479,7 +4486,7 @@ void ARGBSubtractRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( // 8 pixel loop. "1: \n" "ldp q0, q1, [%0], #32 \n" // load 8 ARGB @@ -4508,7 +4515,7 @@ void SobelRow_NEON(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "movi v3.8b, #255 \n" // alpha // 8 pixel loop. "1: \n" @@ -4535,7 +4542,7 @@ void SobelToPlaneRow_NEON(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_y, int width) { - asm volatile ( + asm volatile( // 16 pixel loop. "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx. @@ -4563,7 +4570,7 @@ void SobelXYRow_NEON(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "movi v3.8b, #255 \n" // alpha // 8 pixel loop. 
"1: \n" @@ -4592,7 +4599,7 @@ void SobelXRow_NEON(const uint8_t* src_y0, const uint8_t* src_y2, uint8_t* dst_sobelx, int width) { - asm volatile ( + asm volatile( "1: \n" "ld1 {v0.8b}, [%0],%5 \n" // top "ld1 {v1.8b}, [%0],%6 \n" @@ -4633,7 +4640,7 @@ void SobelYRow_NEON(const uint8_t* src_y0, const uint8_t* src_y1, uint8_t* dst_sobely, int width) { - asm volatile ( + asm volatile( "1: \n" "ld1 {v0.8b}, [%0],%4 \n" // left "ld1 {v1.8b}, [%1],%4 \n" @@ -4702,7 +4709,7 @@ void ByteToFloatRow_NEON(const uint8_t* src, float* dst, float scale, int width) { - asm volatile ( + asm volatile( "1: \n" "ld1 {v1.8b}, [%0], #8 \n" // load 8 bytes "subs %w2, %w2, #8 \n" // 8 pixels per loop @@ -4727,7 +4734,7 @@ void ByteToFloatRow_NEON(const uint8_t* src, void ConvertFP16ToFP32Row_NEON(const uint16_t* src, // fp16 float* dst, int width) { - asm volatile ( + asm volatile( "1: \n" "ld1 {v1.8h}, [%0], #16 \n" // load 8 halffloats "subs %w2, %w2, #8 \n" // 8 floats per loop @@ -4749,7 +4756,7 @@ void ConvertFP16ToFP32Column_NEON(const uint16_t* src, // fp16 int src_stride, // stride in elements float* dst, int width) { - asm volatile ( + asm volatile( "cmp %w2, #8 \n" // Is there 8 rows? "b.lo 2f \n" "1: \n" @@ -4787,7 +4794,7 @@ void ConvertFP16ToFP32Column_NEON(const uint16_t* src, // fp16 void ConvertFP32ToFP16Row_NEON(const float* src, uint16_t* dst, // fp16 int width) { - asm volatile ( + asm volatile( "1: \n" "ldp q2, q3, [%0], #32 \n" // load 8 floats "subs %w2, %w2, #8 \n" // 8 floats per loop @@ -4808,7 +4815,7 @@ float ScaleMaxSamples_NEON(const float* src, float scale, int width) { float fmax; - asm volatile ( + asm volatile( "movi v5.4s, #0 \n" // max "movi v6.4s, #0 \n" @@ -4838,7 +4845,7 @@ float ScaleSumSamples_NEON(const float* src, float scale, int width) { float fsum; - asm volatile ( + asm volatile( "movi v5.4s, #0 \n" // max "movi v6.4s, #0 \n" // max @@ -4865,7 +4872,7 @@ float ScaleSumSamples_NEON(const float* src, } void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) { - asm volatile ( + asm volatile( "1: \n" "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples "prfm pldl1keep, [%0, 448] \n" @@ -4889,7 +4896,7 @@ void GaussCol_NEON(const uint16_t* src0, const uint16_t* src4, uint32_t* dst, int width) { - asm volatile ( + asm volatile( "movi v6.8h, #4 \n" // constant 4 "movi v7.8h, #6 \n" // constant 6 @@ -4931,7 +4938,7 @@ void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) { const uint32_t* src1 = src + 1; const uint32_t* src2 = src + 2; const uint32_t* src3 = src + 3; - asm volatile ( + asm volatile( "movi v6.4s, #4 \n" // constant 4 "movi v7.4s, #6 \n" // constant 6 @@ -4974,7 +4981,7 @@ void GaussCol_F32_NEON(const float* src0, const float* src4, float* dst, int width) { - asm volatile ( + asm volatile( "ld2r {v6.4s, v7.4s}, [%7] \n" // constants 4 and 6 "1: \n" @@ -5012,7 +5019,7 @@ void GaussCol_F32_NEON(const float* src0, // filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. 
void GaussRow_F32_NEON(const float* src, float* dst, int width) { - asm volatile ( + asm volatile( "ld3r {v6.4s, v7.4s, v8.4s}, [%3] \n" // constants 4, 6, 1/256 "1: \n" @@ -5051,7 +5058,7 @@ void NV21ToYUV24Row_NEON(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_yuv24, int width) { - asm volatile ( + asm volatile( "1: \n" "ld1 {v2.16b}, [%0], #16 \n" // load 16 Y values "ld2 {v0.8b, v1.8b}, [%1], #16 \n" // load 8 VU values @@ -5082,7 +5089,7 @@ void NV21ToYUV24Row_NEON(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_yuv24, int width) { - asm volatile ( + asm volatile( "ld1 {v5.16b,v6.16b,v7.16b}, [%4] \n" // 3 shuffler constants "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 16 Y values @@ -5112,7 +5119,7 @@ void AYUVToUVRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_uv, int width) { const uint8_t* src_ayuv_1 = src_ayuv + src_stride_ayuv; - asm volatile ( + asm volatile( "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ayuv @@ -5141,7 +5148,7 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv, uint8_t* dst_vu, int width) { const uint8_t* src_ayuv_1 = src_ayuv + src_stride_ayuv; - asm volatile ( + asm volatile( "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ayuv @@ -5167,7 +5174,7 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv, // Copy row of AYUV Y's into Y void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) { - asm volatile ( + asm volatile( "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 "subs %w2, %w2, #16 \n" // 16 pixels per loop @@ -5183,7 +5190,7 @@ void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) { // Convert UV plane of NV12 to VU of NV21. void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) { - asm volatile ( + asm volatile( "1: \n" "ld1 {v0.16b}, [%0], 16 \n" // load 16 UV values "ld1 {v1.16b}, [%0], 16 \n" @@ -5208,7 +5215,7 @@ void HalfMergeUVRow_NEON(const uint8_t* src_u, int width) { const uint8_t* src_u_1 = src_u + src_stride_u; const uint8_t* src_v_1 = src_v + src_stride_v; - asm volatile ( + asm volatile( "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 16 U values "ld1 {v1.16b}, [%2], #16 \n" // load 16 V values @@ -5243,7 +5250,7 @@ void SplitUVRow_16_NEON(const uint16_t* src_uv, int depth, int width) { int shift = depth - 16; // Negative for right shift. - asm volatile ( + asm volatile( "dup v2.8h, %w4 \n" "1: \n" "ld2 {v0.8h, v1.8h}, [%0], #32 \n" // load 8 UV @@ -5266,7 +5273,7 @@ void MultiplyRow_16_NEON(const uint16_t* src_y, uint16_t* dst_y, int scale, int width) { - asm volatile ( + asm volatile( "dup v2.8h, %w3 \n" "1: \n" "ldp q0, q1, [%0], #32 \n" @@ -5287,7 +5294,7 @@ void DivideRow_16_NEON(const uint16_t* src_y, uint16_t* dst_y, int scale, int width) { - asm volatile ( + asm volatile( "dup v4.8h, %w3 \n" "1: \n" "ldp q2, q3, [%0], #32 \n" @@ -5321,7 +5328,7 @@ void Convert16To8Row_NEON(const uint16_t* src_y, // saturate, then we can just use UZP2 to narrow rather than a pair of // saturating narrow instructions. int shift = 23 - __builtin_clz((int32_t)scale); - asm volatile ( + asm volatile( "dup v2.8h, %w3 \n" "1: \n" "ldp q0, q1, [%0], #32 \n" diff --git a/source/row_rvv.cc b/source/row_rvv.cc index 62c6b2631..07606d7a8 100644 --- a/source/row_rvv.cc +++ b/source/row_rvv.cc @@ -47,7 +47,7 @@ extern "C" { // register) is set to round-to-nearest-up mode(0). 
#define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, yg, bb, bg, br) \ { \ - asm volatile ("csrwi vxrm, 0"); \ + asm volatile("csrwi vxrm, 0"); \ ub = yuvconst->kUVCoeff[0]; \ vr = yuvconst->kUVCoeff[1]; \ ug = yuvconst->kUVCoeff[2]; \ @@ -1238,7 +1238,7 @@ void I400ToARGBRow_RVV(const uint8_t* src_y, vuint16m4_t v_yg = __riscv_vmv_v_x_u16m4(yuvconstants->kRGBCoeffBias[0], vl); // To match behavior on other platforms, vxrm (fixed-point rounding mode // register) sets to round-to-nearest-up mode(0). - asm volatile ("csrwi vxrm, 0"); + asm volatile("csrwi vxrm, 0"); if (is_yb_positive) { v_yb = __riscv_vmv_v_x_u16m4(yuvconstants->kRGBCoeffBias[4] - 32, vl); } else { @@ -1632,7 +1632,7 @@ void InterpolateRow_RVV(uint8_t* dst_ptr, } // To match behavior on other platforms, vxrm (fixed-point rounding mode // register) is set to round-to-nearest-up(0). - asm volatile ("csrwi vxrm, 0"); + asm volatile("csrwi vxrm, 0"); // Blend 50 / 50. if (y1_fraction == 128) { do { diff --git a/source/row_sve.cc b/source/row_sve.cc index 20b9c4bde..8076c9ebc 100644 --- a/source/row_sve.cc +++ b/source/row_sve.cc @@ -241,7 +241,7 @@ static void ARGBToUVMatrixRow_SVE2(const uint8_t* src_argb, const int16_t* uvconstants) { const uint8_t* src_argb_1 = src_argb + src_stride_argb; uint64_t vl; - asm volatile ( + asm volatile( "ptrue p0.b \n" "ld1rd {z24.d}, p0/z, [%[uvconstants]] \n" "ld1rd {z25.d}, p0/z, [%[uvconstants], #8] \n" diff --git a/source/scale_argb.cc b/source/scale_argb.cc index da99febb9..c009e0574 100644 --- a/source/scale_argb.cc +++ b/source/scale_argb.cc @@ -10,8 +10,8 @@ #include "libyuv/scale.h" -#include #include +#include #include #include #include @@ -1233,10 +1233,9 @@ int YUVToARGBScaleClip(const uint8_t* src_y, (void)src_fourcc; // TODO(fbarchard): implement and/or assert. (void)dst_fourcc; const int abs_src_height = (src_height < 0) ? -src_height : src_height; - if (!src_y || !src_u || !src_v || !dst_argb || - src_width <= 0 || src_width > INT_MAX / 4 || src_height == 0 || - dst_width <= 0 || dst_height <= 0 || - clip_width <= 0 || clip_height <= 0) { + if (!src_y || !src_u || !src_v || !dst_argb || src_width <= 0 || + src_width > INT_MAX / 4 || src_height == 0 || dst_width <= 0 || + dst_height <= 0 || clip_width <= 0 || clip_height <= 0) { return -1; } const uint64_t argb_buffer_size = (uint64_t)src_width * abs_src_height * 4; @@ -1250,9 +1249,9 @@ int YUVToARGBScaleClip(const uint8_t* src_y, I420ToARGB(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, argb_buffer, src_width * 4, src_width, src_height); - r = ARGBScaleClip(argb_buffer, src_width * 4, src_width, abs_src_height, dst_argb, - dst_stride_argb, dst_width, dst_height, clip_x, clip_y, - clip_width, clip_height, filtering); + r = ARGBScaleClip(argb_buffer, src_width * 4, src_width, abs_src_height, + dst_argb, dst_stride_argb, dst_width, dst_height, clip_x, + clip_y, clip_width, clip_height, filtering); free(argb_buffer); return r; } diff --git a/source/scale_gcc.cc b/source/scale_gcc.cc index 27cdc17aa..c5dabd409 100644 --- a/source/scale_gcc.cc +++ b/source/scale_gcc.cc @@ -97,7 +97,7 @@ void ScaleRowDown2_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm volatile ( + asm volatile( // 16 pixel loop. 
LABELALIGN "1: \n" @@ -123,7 +123,7 @@ void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm volatile ( + asm volatile( "pcmpeqb %%xmm4,%%xmm4 \n" "psrlw $0xf,%%xmm4 \n" "packuswb %%xmm4,%%xmm4 \n" @@ -154,7 +154,7 @@ void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { - asm volatile ( + asm volatile( "pcmpeqb %%xmm4,%%xmm4 \n" "psrlw $0xf,%%xmm4 \n" "packuswb %%xmm4,%%xmm4 \n" @@ -195,7 +195,7 @@ void ScaleRowDown2_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm volatile ( + asm volatile( "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" @@ -221,7 +221,7 @@ void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm volatile ( + asm volatile( "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" "vpsrlw $0xf,%%ymm4,%%ymm4 \n" "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" @@ -254,7 +254,7 @@ void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { - asm volatile ( + asm volatile( "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" "vpsrlw $0xf,%%ymm4,%%ymm4 \n" "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" @@ -297,7 +297,7 @@ void ScaleRowDown4_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm volatile ( + asm volatile( "pcmpeqb %%xmm5,%%xmm5 \n" "psrld $0x18,%%xmm5 \n" "pslld $0x10,%%xmm5 \n" @@ -328,7 +328,7 @@ void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { intptr_t stridex3; - asm volatile ( + asm volatile( "pcmpeqb %%xmm4,%%xmm4 \n" "psrlw $0xf,%%xmm4 \n" "movdqa %%xmm4,%%xmm5 \n" @@ -383,7 +383,7 @@ void ScaleRowDown4_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm volatile ( + asm volatile( "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpsrld $0x18,%%ymm5,%%ymm5 \n" "vpslld $0x10,%%ymm5,%%ymm5 \n" @@ -416,7 +416,7 @@ void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { - asm volatile ( + asm volatile( "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" "vpsrlw $0xf,%%ymm4,%%ymm4 \n" "vpsllw $0x3,%%ymm4,%%ymm5 \n" @@ -472,7 +472,7 @@ void ScaleRowDown34_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm volatile ( + asm volatile( "movdqa %0,%%xmm3 \n" "movdqa %1,%%xmm4 \n" "movdqa %2,%%xmm5 \n" @@ -481,7 +481,7 @@ void ScaleRowDown34_SSSE3(const uint8_t* src_ptr, "m"(kShuf1), // %1 "m"(kShuf2) // %2 ); - asm volatile ( + asm volatile( "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm2 \n" @@ -508,7 +508,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { - asm volatile ( + asm volatile( "movdqa %0,%%xmm2 \n" // kShuf01 "movdqa %1,%%xmm3 \n" // kShuf11 "movdqa %2,%%xmm4 \n" // kShuf21 @@ -517,7 +517,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr, "m"(kShuf11), // %1 "m"(kShuf21) // %2 ); - asm volatile ( + asm volatile( "movdqa %0,%%xmm5 \n" // kMadd01 "movdqa %1,%%xmm0 \n" // kMadd11 "movdqa %2,%%xmm1 \n" // kRound34 @@ -526,7 +526,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr, "m"(kMadd11), // %1 "m"(kRound34) // %2 ); - asm volatile ( + asm volatile( "1: \n" "movdqu (%0),%%xmm6 \n" "movdqu 0x00(%0,%3,1),%%xmm7 \n" @@ -572,7 +572,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { - asm volatile ( + asm volatile( "movdqa 
%0,%%xmm2 \n" // kShuf01 "movdqa %1,%%xmm3 \n" // kShuf11 "movdqa %2,%%xmm4 \n" // kShuf21 @@ -581,7 +581,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr, "m"(kShuf11), // %1 "m"(kShuf21) // %2 ); - asm volatile ( + asm volatile( "movdqa %0,%%xmm5 \n" // kMadd01 "movdqa %1,%%xmm0 \n" // kMadd11 "movdqa %2,%%xmm1 \n" // kRound34 @@ -591,7 +591,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr, "m"(kRound34) // %2 ); - asm volatile ( + asm volatile( "1: \n" "movdqu (%0),%%xmm6 \n" "movdqu 0x00(%0,%3,1),%%xmm7 \n" @@ -641,7 +641,7 @@ void ScaleRowDown38_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm volatile ( + asm volatile( "movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" @@ -671,7 +671,7 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { - asm volatile ( + asm volatile( "movdqa %0,%%xmm2 \n" "movdqa %1,%%xmm3 \n" "movdqa %2,%%xmm4 \n" @@ -682,7 +682,7 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr, "m"(kShufAb2), // %2 "m"(kScaleAb2) // %3 ); - asm volatile ( + asm volatile( "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x00(%0,%3,1),%%xmm1 \n" @@ -714,7 +714,7 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { - asm volatile ( + asm volatile( "movdqa %0,%%xmm2 \n" "movdqa %1,%%xmm3 \n" "movdqa %2,%%xmm4 \n" @@ -724,7 +724,7 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, "m"(kShufAc3), // %1 "m"(kScaleAc33) // %2 ); - asm volatile ( + asm volatile( "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x00(%0,%3,1),%%xmm6 \n" @@ -782,7 +782,7 @@ static const uvec8 kLinearMadd31 = {3, 1, 1, 3, 3, 1, 1, 3, void ScaleRowUp2_Linear_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { - asm volatile ( + asm volatile( "pxor %%xmm0,%%xmm0 \n" // 0 "pcmpeqw %%xmm6,%%xmm6 \n" "psrlw $15,%%xmm6 \n" @@ -838,7 +838,7 @@ void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { - asm volatile ( + asm volatile( "1: \n" "pxor %%xmm0,%%xmm0 \n" // 0 // above line @@ -951,7 +951,7 @@ void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr, void ScaleRowUp2_Linear_12_SSSE3(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { - asm volatile ( + asm volatile( "movdqa %3,%%xmm5 \n" "pcmpeqw %%xmm4,%%xmm4 \n" "psrlw $15,%%xmm4 \n" @@ -1003,7 +1003,7 @@ void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t* src_ptr, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { - asm volatile ( + asm volatile( "pcmpeqw %%xmm7,%%xmm7 \n" "psrlw $15,%%xmm7 \n" "psllw $3,%%xmm7 \n" // all 8 @@ -1101,7 +1101,7 @@ void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t* src_ptr, void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { - asm volatile ( + asm volatile( "pxor %%xmm5,%%xmm5 \n" "pcmpeqd %%xmm4,%%xmm4 \n" "psrld $31,%%xmm4 \n" @@ -1154,7 +1154,7 @@ void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { - asm volatile ( + asm volatile( "pxor %%xmm7,%%xmm7 \n" "pcmpeqd %%xmm6,%%xmm6 \n" "psrld $31,%%xmm6 \n" @@ -1262,7 +1262,7 @@ void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr, void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { - asm volatile ( + asm volatile( "pcmpeqw %%xmm4,%%xmm4 \n" "psrlw $15,%%xmm4 \n" "psllw $1,%%xmm4 \n" // all 2 @@ -1303,7 +1303,7 @@ void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr, 
uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { - asm volatile ( + asm volatile( "pcmpeqw %%xmm6,%%xmm6 \n" "psrlw $15,%%xmm6 \n" "psllw $3,%%xmm6 \n" // all 8 @@ -1388,7 +1388,7 @@ void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr, void ScaleRowUp2_Linear_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { - asm volatile ( + asm volatile( "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" "vpsrlw $15,%%ymm4,%%ymm4 \n" "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2 @@ -1432,7 +1432,7 @@ void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { - asm volatile ( + asm volatile( "vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n" "vpsrlw $15,%%ymm6,%%ymm6 \n" "vpsllw $3,%%ymm6,%%ymm6 \n" // all 8 @@ -1514,7 +1514,7 @@ void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr, void ScaleRowUp2_Linear_12_AVX2(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { - asm volatile ( + asm volatile( "vbroadcastf128 %3,%%ymm5 \n" "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" "vpsrlw $15,%%ymm4,%%ymm4 \n" @@ -1566,7 +1566,7 @@ void ScaleRowUp2_Bilinear_12_AVX2(const uint16_t* src_ptr, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { - asm volatile ( + asm volatile( "vbroadcastf128 %5,%%ymm5 \n" "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" "vpsrlw $15,%%ymm4,%%ymm4 \n" @@ -1628,7 +1628,7 @@ void ScaleRowUp2_Bilinear_12_AVX2(const uint16_t* src_ptr, void ScaleRowUp2_Linear_16_AVX2(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { - asm volatile ( + asm volatile( "vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n" "vpsrld $31,%%ymm4,%%ymm4 \n" "vpslld $1,%%ymm4,%%ymm4 \n" // all 2 @@ -1678,7 +1678,7 @@ void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { - asm volatile ( + asm volatile( "vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n" "vpsrld $31,%%ymm6,%%ymm6 \n" "vpslld $3,%%ymm6,%%ymm6 \n" // all 8 @@ -1761,11 +1761,10 @@ void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, void ScaleAddRow_SSE2(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { - asm volatile ( - "pxor %%xmm5,%%xmm5 \n" + asm volatile("pxor %%xmm5,%%xmm5 \n" - // 16 pixel loop. - LABELALIGN + // 16 pixel loop. 
+ LABELALIGN "1: \n" "movdqu (%0),%%xmm3 \n" "lea 0x10(%0),%0 \n" // src_ptr += 16 @@ -1781,11 +1780,11 @@ void ScaleAddRow_SSE2(const uint8_t* src_ptr, "lea 0x20(%1),%1 \n" "sub $0x10,%2 \n" "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(src_width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(src_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #ifdef HAS_SCALEADDROW_AVX2 @@ -1793,10 +1792,9 @@ void ScaleAddRow_SSE2(const uint8_t* src_ptr, void ScaleAddRow_AVX2(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { - asm volatile ( - "vpxor %%ymm5,%%ymm5,%%ymm5 \n" + asm volatile("vpxor %%ymm5,%%ymm5,%%ymm5 \n" - LABELALIGN + LABELALIGN "1: \n" "vmovdqu (%0),%%ymm3 \n" "lea 0x20(%0),%0 \n" // src_ptr += 32 @@ -1811,11 +1809,11 @@ void ScaleAddRow_AVX2(const uint8_t* src_ptr, "sub $0x20,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(src_width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(src_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #endif // HAS_SCALEADDROW_AVX2 @@ -1835,7 +1833,7 @@ void ScaleFilterCols_SSSE3(uint8_t* dst_ptr, int x, int dx) { intptr_t x0, x1, temp_pixel; - asm volatile ( + asm volatile( "movd %6,%%xmm2 \n" "movd %7,%%xmm3 \n" "movl $0x04040000,%k2 \n" @@ -1932,7 +1930,7 @@ void ScaleColsUp2_SSE2(uint8_t* dst_ptr, int dx) { (void)x; (void)dx; - asm volatile ( + asm volatile( "1: \n" "movdqu (%1),%%xmm0 \n" "lea 0x10(%1),%1 \n" @@ -1957,7 +1955,7 @@ void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb, uint8_t* dst_argb, int dst_width) { (void)src_stride; - asm volatile ( + asm volatile( "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -1979,7 +1977,7 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb, uint8_t* dst_argb, int dst_width) { (void)src_stride; - asm volatile ( + asm volatile( "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -2003,7 +2001,7 @@ void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb, ptrdiff_t src_stride, uint8_t* dst_argb, int dst_width) { - asm volatile ( + asm volatile( "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -2037,7 +2035,7 @@ void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb, intptr_t src_stepx_x4 = (intptr_t)(src_stepx); intptr_t src_stepx_x12; (void)src_stride; - asm volatile ( + asm volatile( "lea 0x00(,%1,4),%1 \n" "lea 0x00(%1,%1,2),%4 \n" @@ -2074,7 +2072,7 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb, intptr_t src_stepx_x4 = (intptr_t)(src_stepx); intptr_t src_stepx_x12; intptr_t row1 = (intptr_t)(src_stride); - asm volatile ( + asm volatile( "lea 0x00(,%1,4),%1 \n" "lea 0x00(%1,%1,2),%4 \n" "lea 0x00(%0,%5,1),%5 \n" @@ -2117,7 +2115,7 @@ void ScaleARGBCols_SSE2(uint8_t* dst_argb, int x, int dx) { intptr_t x0, x1; - asm volatile ( + asm volatile( "movd %5,%%xmm2 \n" "movd %6,%%xmm3 \n" "pshufd $0x0,%%xmm2,%%xmm2 \n" @@ -2188,7 +2186,7 @@ void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb, int dx) { (void)x; (void)dx; - asm volatile ( + asm volatile( "1: \n" "movdqu (%1),%%xmm0 \n" "lea 0x10(%1),%1 \n" @@ -2226,7 +2224,7 @@ void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb, int x, int dx) { intptr_t x0, x1; - asm volatile ( + asm volatile( "movdqa %0,%%xmm4 \n" "movdqa %1,%%xmm5 \n" : @@ -2234,7 +2232,7 @@ void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb, 
"m"(kShuffleFractions) // %1 ); - asm volatile ( + asm volatile( "movd %5,%%xmm2 \n" "movd %6,%%xmm3 \n" "pcmpeqb %%xmm6,%%xmm6 \n" @@ -2297,7 +2295,7 @@ void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb, // Divide num by div and return as 16.16 fixed point result. int FixedDiv_X86(int num, int div) { - asm volatile ( + asm volatile( "cdq \n" "shld $0x10,%%eax,%%edx \n" "shl $0x10,%%eax \n" @@ -2311,7 +2309,7 @@ int FixedDiv_X86(int num, int div) { // Divide num - 1 by div - 1 and return as 16.16 fixed point result. int FixedDiv1_X86(int num, int div) { - asm volatile ( + asm volatile( "cdq \n" "shld $0x10,%%eax,%%edx \n" "shl $0x10,%%eax \n" @@ -2343,7 +2341,7 @@ void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { - asm volatile ( + asm volatile( "pcmpeqb %%xmm4,%%xmm4 \n" // 01010101 "psrlw $0xf,%%xmm4 \n" "packuswb %%xmm4,%%xmm4 \n" @@ -2383,7 +2381,7 @@ void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { - asm volatile ( + asm volatile( "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 01010101 "vpsrlw $0xf,%%ymm4,%%ymm4 \n" "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" @@ -2427,7 +2425,7 @@ static const uvec8 kUVLinearMadd31 = {3, 1, 3, 1, 1, 3, 1, 3, void ScaleUVRowUp2_Linear_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { - asm volatile ( + asm volatile( "pcmpeqw %%xmm4,%%xmm4 \n" "psrlw $15,%%xmm4 \n" "psllw $1,%%xmm4 \n" // all 2 @@ -2468,7 +2466,7 @@ void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { - asm volatile ( + asm volatile( "pcmpeqw %%xmm6,%%xmm6 \n" "psrlw $15,%%xmm6 \n" "psllw $3,%%xmm6 \n" // all 8 @@ -2552,7 +2550,7 @@ void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr, void ScaleUVRowUp2_Linear_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { - asm volatile ( + asm volatile( "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" "vpsrlw $15,%%ymm4,%%ymm4 \n" "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2 @@ -2595,7 +2593,7 @@ void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { - asm volatile ( + asm volatile( "vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n" "vpsrlw $15,%%ymm6,%%ymm6 \n" "vpsllw $3,%%ymm6,%%ymm6 \n" // all 8 @@ -2675,7 +2673,7 @@ void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr, void ScaleUVRowUp2_Linear_16_SSE41(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { - asm volatile ( + asm volatile( "pxor %%xmm5,%%xmm5 \n" "pcmpeqd %%xmm4,%%xmm4 \n" "psrld $31,%%xmm4 \n" @@ -2727,7 +2725,7 @@ void ScaleUVRowUp2_Bilinear_16_SSE41(const uint16_t* src_ptr, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { - asm volatile ( + asm volatile( "pxor %%xmm7,%%xmm7 \n" "pcmpeqd %%xmm6,%%xmm6 \n" "psrld $31,%%xmm6 \n" @@ -2818,7 +2816,7 @@ void ScaleUVRowUp2_Bilinear_16_SSE41(const uint16_t* src_ptr, void ScaleUVRowUp2_Linear_16_AVX2(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { - asm volatile ( + asm volatile( "vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n" "vpsrld $31,%%ymm4,%%ymm4 \n" "vpslld $1,%%ymm4,%%ymm4 \n" // all 2 @@ -2867,7 +2865,7 @@ void ScaleUVRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { - asm volatile ( + asm volatile( "vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n" "vpsrld $31,%%ymm6,%%ymm6 \n" "vpslld $3,%%ymm6,%%ymm6 \n" // all 8 diff --git a/source/scale_neon.cc b/source/scale_neon.cc index ba25fc6ec..88378c575 100644 --- a/source/scale_neon.cc +++ 
b/source/scale_neon.cc @@ -29,7 +29,7 @@ void ScaleRowDown2_NEON(const uint8_t* src_ptr, uint8_t* dst, int dst_width) { (void)src_stride; - asm volatile ( + asm volatile( "1: \n" // load even pixels into q0, odd into q1 "vld2.8 {q0, q1}, [%0]! \n" @@ -50,7 +50,7 @@ void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr, uint8_t* dst, int dst_width) { (void)src_stride; - asm volatile ( + asm volatile( "1: \n" "vld2.8 {q0, q1}, [%0]! \n" // load 32 pixels "subs %2, %2, #16 \n" // 16 processed per loop @@ -70,7 +70,7 @@ void ScaleRowDown2Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width) { - asm volatile ( + asm volatile( // change the stride to row 2 pointer "add %1, %0 \n" "1: \n" @@ -101,7 +101,7 @@ void ScaleRowDown4_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm volatile ( + asm volatile( "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 "subs %2, %2, #8 \n" // 8 processed per loop @@ -121,7 +121,7 @@ void ScaleRowDown4Box_NEON(const uint8_t* src_ptr, const uint8_t* src_ptr1 = src_ptr + src_stride; const uint8_t* src_ptr2 = src_ptr + src_stride * 2; const uint8_t* src_ptr3 = src_ptr + src_stride * 3; - asm volatile ( + asm volatile( "1: \n" "vld1.8 {q0}, [%0]! \n" // load up 16x4 "vld1.8 {q1}, [%3]! \n" @@ -155,7 +155,7 @@ void ScaleRowDown34_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm volatile ( + asm volatile( "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 "subs %2, %2, #24 \n" @@ -173,7 +173,7 @@ void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { - asm volatile ( + asm volatile( "vmov.u8 d24, #3 \n" "add %3, %0 \n" "1: \n" @@ -230,7 +230,7 @@ void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { - asm volatile ( + asm volatile( "vmov.u8 d24, #3 \n" "add %3, %0 \n" "1: \n" @@ -282,7 +282,7 @@ void ScaleRowDown38_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm volatile ( + asm volatile( "vld1.8 {q3}, [%3] \n" "1: \n" "vld1.8 {d0, d1, d2, d3}, [%0]! 
\n" @@ -306,7 +306,7 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr, int dst_width) { const uint8_t* src_ptr1 = src_ptr + src_stride * 2; - asm volatile ( + asm volatile( "vld1.16 {q13}, [%5] \n" "vld1.8 {q14}, [%6] \n" "vld1.8 {q15}, [%7] \n" @@ -416,7 +416,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { - asm volatile ( + asm volatile( "vld1.16 {q13}, [%4] \n" "vld1.8 {q14}, [%5] \n" "add %3, %0 \n" @@ -509,7 +509,7 @@ void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { const uint8_t* src_temp = src_ptr + 1; - asm volatile ( + asm volatile( "vmov.u8 d30, #3 \n" "1: \n" @@ -546,7 +546,7 @@ void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr, const uint8_t* src_temp = src_ptr + 1; const uint8_t* src_temp1 = src_ptr1 + 1; - asm volatile ( + asm volatile( "vmov.u16 q15, #3 \n" "vmov.u8 d28, #3 \n" @@ -608,7 +608,7 @@ void ScaleRowUp2_Linear_12_NEON(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { const uint16_t* src_temp = src_ptr + 1; - asm volatile ( + asm volatile( "vmov.u16 q15, #3 \n" "1: \n" @@ -644,7 +644,7 @@ void ScaleRowUp2_Bilinear_12_NEON(const uint16_t* src_ptr, const uint16_t* src_temp = src_ptr + 1; const uint16_t* src_temp1 = src_ptr1 + 1; - asm volatile ( + asm volatile( "vmov.u16 q15, #3 \n" "1: \n" @@ -695,7 +695,7 @@ void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { const uint16_t* src_temp = src_ptr + 1; - asm volatile ( + asm volatile( "vmov.u16 d31, #3 \n" "1: \n" @@ -739,7 +739,7 @@ void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, const uint16_t* src_temp = src_ptr + 1; const uint16_t* src_temp1 = src_ptr1 + 1; - asm volatile ( + asm volatile( "vmov.u16 d31, #3 \n" "vmov.u32 q14, #3 \n" @@ -791,7 +791,7 @@ void ScaleUVRowUp2_Linear_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { const uint8_t* src_temp = src_ptr + 2; - asm volatile ( + asm volatile( "vmov.u8 d30, #3 \n" "1: \n" @@ -828,7 +828,7 @@ void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr, const uint8_t* src_temp = src_ptr + 2; const uint8_t* src_temp1 = src_ptr1 + 2; - asm volatile ( + asm volatile( "vmov.u16 q15, #3 \n" "vmov.u8 d28, #3 \n" @@ -890,7 +890,7 @@ void ScaleUVRowUp2_Linear_16_NEON(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { const uint16_t* src_temp = src_ptr + 2; - asm volatile ( + asm volatile( "vmov.u16 d30, #3 \n" "1: \n" @@ -935,7 +935,7 @@ void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, const uint16_t* src_temp = src_ptr + 2; const uint16_t* src_temp1 = src_ptr1 + 2; - asm volatile ( + asm volatile( "vmov.u16 d30, #3 \n" "vmov.u32 q14, #3 \n" @@ -988,7 +988,7 @@ void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, void ScaleAddRow_NEON(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { - asm volatile ( + asm volatile( "1: \n" "vld1.16 {q1, q2}, [%1] \n" // load accumulator "vld1.8 {q0}, [%0]! \n" // load 16 bytes @@ -1086,7 +1086,7 @@ void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr, uint8_t* dst, int dst_width) { (void)src_stride; - asm volatile ( + asm volatile( "1: \n" "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB @@ -1114,7 +1114,7 @@ void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int dst_width) { (void)src_stride; - asm volatile ( + asm volatile( "1: \n" "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. 
"vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB @@ -1135,7 +1135,7 @@ void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width) { - asm volatile ( + asm volatile( // change the stride to row 2 pointer "add %1, %1, %0 \n" "1: \n" @@ -1174,7 +1174,7 @@ void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int dst_width) { (void)src_stride; - asm volatile ( + asm volatile( "mov r12, %3, lsl #2 \n" "1: \n" "vld1.32 {d0[0]}, [%0], r12 \n" @@ -1198,7 +1198,7 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb, int src_stepx, uint8_t* dst_argb, int dst_width) { - asm volatile ( + asm volatile( "mov r12, %4, lsl #2 \n" "add %1, %1, %0 \n" "1: \n" @@ -1246,7 +1246,7 @@ void ScaleARGBCols_NEON(uint8_t* dst_argb, int dx) { int tmp; const uint8_t* src_tmp = src_argb; - asm volatile ( + asm volatile( "1: \n" // clang-format off LOAD1_DATA32_LANE(d0, 0) @@ -1349,7 +1349,7 @@ void ScaleUVRowDown2_NEON(const uint8_t* src_ptr, uint8_t* dst, int dst_width) { (void)src_stride; - asm volatile ( + asm volatile( "1: \n" "vld2.16 {d0, d2}, [%0]! \n" // load 8 UV pixels. "vld2.16 {d1, d3}, [%0]! \n" // load next 8 UV @@ -1368,7 +1368,7 @@ void ScaleUVRowDown2Linear_NEON(const uint8_t* src_ptr, uint8_t* dst, int dst_width) { (void)src_stride; - asm volatile ( + asm volatile( "1: \n" "vld2.16 {d0, d2}, [%0]! \n" // load 8 UV pixels. "vld2.16 {d1, d3}, [%0]! \n" // load next 8 UV @@ -1387,7 +1387,7 @@ void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width) { - asm volatile ( + asm volatile( // change the stride to row 2 pointer "add %1, %1, %0 \n" "1: \n" @@ -1422,7 +1422,7 @@ void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr, const uint8_t* src2_ptr = src_ptr + src_stepx * 4; const uint8_t* src3_ptr = src_ptr + src_stepx * 6; (void)src_stride; - asm volatile ( + asm volatile( "1: \n" "vld1.16 {d0[0]}, [%0], %6 \n" "vld1.16 {d0[1]}, [%1], %6 \n" diff --git a/source/scale_neon64.cc b/source/scale_neon64.cc index 69c51b1bb..de19989fc 100644 --- a/source/scale_neon64.cc +++ b/source/scale_neon64.cc @@ -26,7 +26,7 @@ void ScaleRowDown2_NEON(const uint8_t* src_ptr, uint8_t* dst, int dst_width) { (void)src_stride; - asm volatile ( + asm volatile( "1: \n" // load even pixels into v0, odd into v1 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" @@ -48,7 +48,7 @@ void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr, uint8_t* dst, int dst_width) { (void)src_stride; - asm volatile ( + asm volatile( "1: \n" // load even pixels into v0, odd into v1 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" @@ -70,7 +70,7 @@ void ScaleRowDown2Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width) { - asm volatile ( + asm volatile( // change the stride to row 2 pointer "add %1, %1, %0 \n" "1: \n" @@ -172,18 +172,18 @@ void ScaleRowDown34_NEON(const uint8_t* src_ptr, int dst_width) { (void)src_stride; asm volatile( - "ld1 {v29.16b}, [%[kShuf34_0]] \n" - "ld1 {v30.16b}, [%[kShuf34_1]] \n" - "ld1 {v31.16b}, [%[kShuf34_2]] \n" - "1: \n" + "ld1 {v29.16b}, [%[kShuf34_0]] \n" + "ld1 {v30.16b}, [%[kShuf34_1]] \n" + "ld1 {v31.16b}, [%[kShuf34_2]] \n" + "1: \n" "ld1 {v0.16b,v1.16b,v2.16b,v3.16b}, [%[src_ptr]], #64 \n" - "subs %w[width], %w[width], #48 \n" - "tbl v0.16b, {v0.16b, v1.16b}, v29.16b \n" - "prfm pldl1keep, [%[src_ptr], 448] \n" - "tbl v1.16b, {v1.16b, v2.16b}, v30.16b \n" - "tbl v2.16b, {v2.16b, v3.16b}, v31.16b \n" - "st1 {v0.16b,v1.16b,v2.16b}, [%[dst_ptr]], #48 \n" - "b.gt 1b \n" + "subs %w[width], 
%w[width], #48 \n" + "tbl v0.16b, {v0.16b, v1.16b}, v29.16b \n" + "prfm pldl1keep, [%[src_ptr], 448] \n" + "tbl v1.16b, {v1.16b, v2.16b}, v30.16b \n" + "tbl v2.16b, {v2.16b, v3.16b}, v31.16b \n" + "st1 {v0.16b,v1.16b,v2.16b}, [%[dst_ptr]], #48 \n" + "b.gt 1b \n" : [src_ptr] "+r"(src_ptr), // %[src_ptr] [dst_ptr] "+r"(dst_ptr), // %[dst_ptr] [width] "+r"(dst_width) // %[width] @@ -326,7 +326,7 @@ static const vec16 kMult38_Div664 = { 65536 / 12, 65536 / 12, 65536 / 8, 65536 / 12, 65536 / 12, 65536 / 8, 0, 0}; static const vec16 kMult38_Div996 = {65536 / 18, 65536 / 18, 65536 / 12, 65536 / 18, 65536 / 18, 65536 / 12, - 0, 0}; + 0, 0}; // 32 -> 12 void ScaleRowDown38_NEON(const uint8_t* src_ptr, @@ -335,26 +335,26 @@ void ScaleRowDown38_NEON(const uint8_t* src_ptr, int dst_width) { (void)src_stride; asm volatile( - "ld1 {v3.16b}, [%[kShuf38]] \n" - "subs %w[width], %w[width], #12 \n" - "b.eq 2f \n" + "ld1 {v3.16b}, [%[kShuf38]] \n" + "subs %w[width], %w[width], #12 \n" + "b.eq 2f \n" - "1: \n" - "ldp q0, q1, [%[src_ptr]], #32 \n" - "subs %w[width], %w[width], #12 \n" - "tbl v2.16b, {v0.16b, v1.16b}, v3.16b \n" - "prfm pldl1keep, [%[src_ptr], 448] \n" // prefetch 7 lines ahead - "str q2, [%[dst_ptr]] \n" - "add %[dst_ptr], %[dst_ptr], #12 \n" - "b.gt 1b \n" + "1: \n" + "ldp q0, q1, [%[src_ptr]], #32 \n" + "subs %w[width], %w[width], #12 \n" + "tbl v2.16b, {v0.16b, v1.16b}, v3.16b \n" + "prfm pldl1keep, [%[src_ptr], 448] \n" // prefetch 7 lines ahead + "str q2, [%[dst_ptr]] \n" + "add %[dst_ptr], %[dst_ptr], #12 \n" + "b.gt 1b \n" // Store exactly 12 bytes on the final iteration to avoid writing past // the end of the array. - "2: \n" - "ldp q0, q1, [%[src_ptr]] \n" - "tbl v2.16b, {v0.16b, v1.16b}, v3.16b \n" - "st1 {v2.8b}, [%[dst_ptr]], #8 \n" - "st1 {v2.s}[2], [%[dst_ptr]] \n" + "2: \n" + "ldp q0, q1, [%[src_ptr]] \n" + "tbl v2.16b, {v0.16b, v1.16b}, v3.16b \n" + "st1 {v2.8b}, [%[dst_ptr]], #8 \n" + "st1 {v2.s}[2], [%[dst_ptr]] \n" : [src_ptr] "+r"(src_ptr), // %[src_ptr] [dst_ptr] "+r"(dst_ptr), // %[dst_ptr] [width] "+r"(dst_width) // %[width] @@ -378,49 +378,49 @@ void ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr, const uint8_t* src_ptr1 = src_ptr + src_stride; const uint8_t* src_ptr2 = src_ptr + src_stride * 2; asm volatile( - "ld1 {v27.16b}, [%[tblArray1]] \n" - "ld1 {v28.16b}, [%[tblArray2]] \n" - "ld1 {v29.16b}, [%[tblArray3]] \n" - "ld1 {v31.16b}, [%[tblArray4]] \n" - "ld1 {v30.16b}, [%[div996]] \n" + "ld1 {v27.16b}, [%[tblArray1]] \n" + "ld1 {v28.16b}, [%[tblArray2]] \n" + "ld1 {v29.16b}, [%[tblArray3]] \n" + "ld1 {v31.16b}, [%[tblArray4]] \n" + "ld1 {v30.16b}, [%[div996]] \n" - "1: \n" - "ldp q20, q0, [%[src_ptr]], #32 \n" - "ldp q21, q1, [%[src_ptr1]], #32 \n" - "ldp q22, q2, [%[src_ptr2]], #32 \n" + "1: \n" + "ldp q20, q0, [%[src_ptr]], #32 \n" + "ldp q21, q1, [%[src_ptr1]], #32 \n" + "ldp q22, q2, [%[src_ptr2]], #32 \n" - "subs %w[width], %w[width], #12 \n" + "subs %w[width], %w[width], #12 \n" // Add across strided rows first. - "uaddl v23.8h, v20.8b, v21.8b \n" - "uaddl v3.8h, v0.8b, v1.8b \n" - "uaddl2 v24.8h, v20.16b, v21.16b \n" - "uaddl2 v4.8h, v0.16b, v1.16b \n" + "uaddl v23.8h, v20.8b, v21.8b \n" + "uaddl v3.8h, v0.8b, v1.8b \n" + "uaddl2 v24.8h, v20.16b, v21.16b \n" + "uaddl2 v4.8h, v0.16b, v1.16b \n" - "uaddw v23.8h, v23.8h, v22.8b \n" - "uaddw v3.8h, v3.8h, v2.8b \n" - "uaddw2 v24.8h, v24.8h, v22.16b \n" // abcdefgh ... 
- "uaddw2 v4.8h, v4.8h, v2.16b \n" + "uaddw v23.8h, v23.8h, v22.8b \n" + "uaddw v3.8h, v3.8h, v2.8b \n" + "uaddw2 v24.8h, v24.8h, v22.16b \n" // abcdefgh ... + "uaddw2 v4.8h, v4.8h, v2.16b \n" // Permute groups of {three,three,two} into separate vectors to sum. - "tbl v20.16b, {v23.16b, v24.16b}, v27.16b \n" // a d g ... - "tbl v0.16b, {v3.16b, v4.16b}, v27.16b \n" - "tbl v21.16b, {v23.16b, v24.16b}, v28.16b \n" // b e h ... - "tbl v1.16b, {v3.16b, v4.16b}, v28.16b \n" - "tbl v22.16b, {v23.16b, v24.16b}, v29.16b \n" // c f 0... - "tbl v2.16b, {v3.16b, v4.16b}, v29.16b \n" + "tbl v20.16b, {v23.16b, v24.16b}, v27.16b \n" // a d g ... + "tbl v0.16b, {v3.16b, v4.16b}, v27.16b \n" + "tbl v21.16b, {v23.16b, v24.16b}, v28.16b \n" // b e h ... + "tbl v1.16b, {v3.16b, v4.16b}, v28.16b \n" + "tbl v22.16b, {v23.16b, v24.16b}, v29.16b \n" // c f 0... + "tbl v2.16b, {v3.16b, v4.16b}, v29.16b \n" - "add v23.8h, v20.8h, v21.8h \n" - "add v3.8h, v0.8h, v1.8h \n" - "add v24.8h, v23.8h, v22.8h \n" // a+b+c d+e+f g+h - "add v4.8h, v3.8h, v2.8h \n" + "add v23.8h, v20.8h, v21.8h \n" + "add v3.8h, v0.8h, v1.8h \n" + "add v24.8h, v23.8h, v22.8h \n" // a+b+c d+e+f g+h + "add v4.8h, v3.8h, v2.8h \n" - "sqrdmulh v24.8h, v24.8h, v30.8h \n" // v /= {9,9,6} - "sqrdmulh v25.8h, v4.8h, v30.8h \n" - "tbl v21.16b, {v24.16b, v25.16b}, v31.16b \n" // Narrow. - "st1 {v21.d}[0], [%[dst_ptr]], #8 \n" - "st1 {v21.s}[2], [%[dst_ptr]], #4 \n" - "b.gt 1b \n" + "sqrdmulh v24.8h, v24.8h, v30.8h \n" // v /= {9,9,6} + "sqrdmulh v25.8h, v4.8h, v30.8h \n" + "tbl v21.16b, {v24.16b, v25.16b}, v31.16b \n" // Narrow. + "st1 {v21.d}[0], [%[dst_ptr]], #8 \n" + "st1 {v21.s}[2], [%[dst_ptr]], #4 \n" + "b.gt 1b \n" : [src_ptr] "+r"(src_ptr), // %[src_ptr] [dst_ptr] "+r"(dst_ptr), // %[dst_ptr] [src_ptr1] "+r"(src_ptr1), // %[src_ptr1] @@ -446,41 +446,41 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr, int dst_width) { const uint8_t* src_ptr1 = src_ptr + src_stride; asm volatile( - "ld1 {v28.16b}, [%[tblArray1]] \n" - "ld1 {v29.16b}, [%[tblArray2]] \n" - "ld1 {v31.16b}, [%[tblArray3]] \n" - "ld1 {v30.8h}, [%[div664]] \n" + "ld1 {v28.16b}, [%[tblArray1]] \n" + "ld1 {v29.16b}, [%[tblArray2]] \n" + "ld1 {v31.16b}, [%[tblArray3]] \n" + "ld1 {v30.8h}, [%[div664]] \n" - "1: \n" - "ldp q20, q0, [%[src_ptr]], #32 \n" // abcdefgh ... - "ldp q21, q1, [%[src_ptr1]], #32 \n" // ijklmnop ... - "subs %w[width], %w[width], #12 \n" + "1: \n" + "ldp q20, q0, [%[src_ptr]], #32 \n" // abcdefgh ... + "ldp q21, q1, [%[src_ptr1]], #32 \n" // ijklmnop ... + "subs %w[width], %w[width], #12 \n" // Permute into groups of six values (three pairs) to be summed. - "tbl v22.16b, {v20.16b}, v28.16b \n" // abdegh ... - "tbl v2.16b, {v0.16b}, v28.16b \n" - "tbl v23.16b, {v21.16b}, v28.16b \n" // ijlmop ... - "tbl v3.16b, {v1.16b}, v28.16b \n" - "tbl v24.16b, {v20.16b, v21.16b}, v29.16b \n" // ckfn00 ... - "tbl v4.16b, {v0.16b, v1.16b}, v29.16b \n" + "tbl v22.16b, {v20.16b}, v28.16b \n" // abdegh ... + "tbl v2.16b, {v0.16b}, v28.16b \n" + "tbl v23.16b, {v21.16b}, v28.16b \n" // ijlmop ... + "tbl v3.16b, {v1.16b}, v28.16b \n" + "tbl v24.16b, {v20.16b, v21.16b}, v29.16b \n" // ckfn00 ... + "tbl v4.16b, {v0.16b, v1.16b}, v29.16b \n" - "uaddlp v22.8h, v22.16b \n" // a+b d+e g+h ... - "uaddlp v2.8h, v2.16b \n" - "uaddlp v23.8h, v23.16b \n" // i+j l+m o+p ... - "uaddlp v3.8h, v3.16b \n" - "uaddlp v24.8h, v24.16b \n" // c+k f+n 0 ... - "uaddlp v4.8h, v4.16b \n" - "add v20.8h, v22.8h, v23.8h \n" - "add v0.8h, v2.8h, v3.8h \n" - "add v21.8h, v20.8h, v24.8h \n" // a+b+i+j+c+k ... 
- "add v1.8h, v0.8h, v4.8h \n" + "uaddlp v22.8h, v22.16b \n" // a+b d+e g+h ... + "uaddlp v2.8h, v2.16b \n" + "uaddlp v23.8h, v23.16b \n" // i+j l+m o+p ... + "uaddlp v3.8h, v3.16b \n" + "uaddlp v24.8h, v24.16b \n" // c+k f+n 0 ... + "uaddlp v4.8h, v4.16b \n" + "add v20.8h, v22.8h, v23.8h \n" + "add v0.8h, v2.8h, v3.8h \n" + "add v21.8h, v20.8h, v24.8h \n" // a+b+i+j+c+k ... + "add v1.8h, v0.8h, v4.8h \n" - "sqrdmulh v21.8h, v21.8h, v30.8h \n" // v /= {6,6,4} - "sqrdmulh v22.8h, v1.8h, v30.8h \n" - "tbl v21.16b, {v21.16b, v22.16b}, v31.16b \n" // Narrow. - "st1 {v21.d}[0], [%[dst_ptr]], #8 \n" - "st1 {v21.s}[2], [%[dst_ptr]], #4 \n" - "b.gt 1b \n" + "sqrdmulh v21.8h, v21.8h, v30.8h \n" // v /= {6,6,4} + "sqrdmulh v22.8h, v1.8h, v30.8h \n" + "tbl v21.16b, {v21.16b, v22.16b}, v31.16b \n" // Narrow. + "st1 {v21.d}[0], [%[dst_ptr]], #8 \n" + "st1 {v21.s}[2], [%[dst_ptr]], #4 \n" + "b.gt 1b \n" : [src_ptr] "+r"(src_ptr), // %[src_ptr] [dst_ptr] "+r"(dst_ptr), // %[dst_ptr] [src_ptr1] "+r"(src_ptr1), // %[src_ptr1] @@ -543,7 +543,7 @@ void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr, const uint8_t* src_temp = src_ptr + 1; const uint8_t* src_temp1 = src_ptr1 + 1; - asm volatile ( + asm volatile( "movi v31.8b, #3 \n" "movi v30.8h, #3 \n" @@ -599,7 +599,7 @@ void ScaleRowUp2_Linear_12_NEON(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { const uint16_t* src_temp = src_ptr + 1; - asm volatile ( + asm volatile( "movi v31.8h, #3 \n" "1: \n" @@ -636,7 +636,7 @@ void ScaleRowUp2_Bilinear_12_NEON(const uint16_t* src_ptr, const uint16_t* src_temp = src_ptr + 1; const uint16_t* src_temp1 = src_ptr1 + 1; - asm volatile ( + asm volatile( "movi v31.8h, #3 \n" "1: \n" @@ -690,7 +690,7 @@ void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { const uint16_t* src_temp = src_ptr + 1; - asm volatile ( + asm volatile( "movi v31.8h, #3 \n" "1: \n" @@ -735,7 +735,7 @@ void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, const uint16_t* src_temp = src_ptr + 1; const uint16_t* src_temp1 = src_ptr1 + 1; - asm volatile ( + asm volatile( "movi v31.4h, #3 \n" "movi v30.4s, #3 \n" @@ -790,7 +790,7 @@ void ScaleUVRowUp2_Linear_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { const uint8_t* src_temp = src_ptr + 2; - asm volatile ( + asm volatile( "movi v31.8b, #3 \n" "1: \n" @@ -829,7 +829,7 @@ void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr, const uint8_t* src_temp = src_ptr + 2; const uint8_t* src_temp1 = src_ptr1 + 2; - asm volatile ( + asm volatile( "movi v31.8b, #3 \n" "movi v30.8h, #3 \n" @@ -885,7 +885,7 @@ void ScaleUVRowUp2_Linear_16_NEON(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { const uint16_t* src_temp = src_ptr + 2; - asm volatile ( + asm volatile( "movi v31.8h, #3 \n" "1: \n" @@ -932,7 +932,7 @@ void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, const uint16_t* src_temp = src_ptr + 2; const uint16_t* src_temp1 = src_ptr1 + 2; - asm volatile ( + asm volatile( "movi v31.4h, #3 \n" "movi v30.4s, #3 \n" @@ -987,7 +987,7 @@ void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, void ScaleAddRow_NEON(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { - asm volatile ( + asm volatile( "1: \n" "ld1 {v1.8h, v2.8h}, [%1] \n" // load accumulator "ld1 {v0.16b}, [%0], #16 \n" // load 16 bytes @@ -1043,14 +1043,14 @@ void ScaleFilterCols_NEON(uint8_t* dst_ptr, "trn1 v21.8h, v2.8h, v0.8h \n" "1: \n" SCALE_FILTER_COLS_STEP_ADDR - "ldr h6, [%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR - "ld1 {v6.h}[1], 
[%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR - "ld1 {v6.h}[2], [%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR - "ld1 {v6.h}[3], [%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR - "ld1 {v6.h}[4], [%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR - "ld1 {v6.h}[5], [%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR - "ld1 {v6.h}[6], [%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR - "ld1 {v6.h}[7], [%[tmp_ptr]] \n" + "ldr h6, [%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR + "ld1 {v6.h}[1], [%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR + "ld1 {v6.h}[2], [%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR + "ld1 {v6.h}[3], [%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR + "ld1 {v6.h}[4], [%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR + "ld1 {v6.h}[5], [%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR + "ld1 {v6.h}[6], [%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR + "ld1 {v6.h}[7], [%[tmp_ptr]] \n" "subs %w[width], %w[width], #8 \n" // 8 processed per loop "trn1 v4.16b, v6.16b, v0.16b \n" @@ -1090,14 +1090,14 @@ void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr, int dst_width) { (void)src_stride; asm volatile( - "1: \n" + "1: \n" "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[src]], #64 \n" - "subs %w[width], %w[width], #8 \n" - "prfm pldl1keep, [%[src], 448] \n" - "uzp2 v0.4s, v0.4s, v1.4s \n" - "uzp2 v1.4s, v2.4s, v3.4s \n" - "st1 {v0.4s, v1.4s}, [%[dst]], #32 \n" - "b.gt 1b \n" + "subs %w[width], %w[width], #8 \n" + "prfm pldl1keep, [%[src], 448] \n" + "uzp2 v0.4s, v0.4s, v1.4s \n" + "uzp2 v1.4s, v2.4s, v3.4s \n" + "st1 {v0.4s, v1.4s}, [%[dst]], #32 \n" + "b.gt 1b \n" : [src] "+r"(src_ptr), // %[src] [dst] "+r"(dst), // %[dst] [width] "+r"(dst_width) // %[width] @@ -1113,15 +1113,15 @@ void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb, const uint8_t* src_argb1 = src_argb + 32; asm volatile( "1: \n" - "ld2 {v0.4s, v1.4s}, [%[src]] \n" - "add %[src], %[src], #64 \n" - "ld2 {v2.4s, v3.4s}, [%[src1]] \n" - "add %[src1], %[src1], #64 \n" - "urhadd v0.16b, v0.16b, v1.16b \n" - "urhadd v1.16b, v2.16b, v3.16b \n" - "subs %w[width], %w[width], #8 \n" - "st1 {v0.16b, v1.16b}, [%[dst]], #32 \n" - "b.gt 1b \n" + "ld2 {v0.4s, v1.4s}, [%[src]] \n" + "add %[src], %[src], #64 \n" + "ld2 {v2.4s, v3.4s}, [%[src1]] \n" + "add %[src1], %[src1], #64 \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "urhadd v1.16b, v2.16b, v3.16b \n" + "subs %w[width], %w[width], #8 \n" + "st1 {v0.16b, v1.16b}, [%[dst]], #32 \n" + "b.gt 1b \n" : [src] "+r"(src_argb), // %[src] [src1] "+r"(src_argb1), // %[src1] [dst] "+r"(dst_argb), // %[dst] @@ -1135,21 +1135,21 @@ void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr, uint8_t* dst, int dst_width) { const uint8_t* src_ptr1 = src_ptr + src_stride; - asm volatile ( - "1: \n" - "ld2 {v0.4s, v1.4s}, [%[src]], #32 \n" - "ld2 {v20.4s, v21.4s}, [%[src1]], #32 \n" - "uaddl v2.8h, v0.8b, v1.8b \n" - "uaddl2 v3.8h, v0.16b, v1.16b \n" - "uaddl v22.8h, v20.8b, v21.8b \n" - "uaddl2 v23.8h, v20.16b, v21.16b \n" - "add v0.8h, v2.8h, v22.8h \n" - "add v1.8h, v3.8h, v23.8h \n" - "rshrn v0.8b, v0.8h, #2 \n" - "rshrn v1.8b, v1.8h, #2 \n" - "subs %w[width], %w[width], #4 \n" - "stp d0, d1, [%[dst]], #16 \n" - "b.gt 1b \n" + asm volatile( + "1: \n" + "ld2 {v0.4s, v1.4s}, [%[src]], #32 \n" + "ld2 {v20.4s, v21.4s}, [%[src1]], #32 \n" + "uaddl v2.8h, v0.8b, v1.8b \n" + "uaddl2 v3.8h, v0.16b, v1.16b \n" + "uaddl v22.8h, v20.8b, v21.8b \n" + "uaddl2 v23.8h, v20.16b, v21.16b \n" + "add v0.8h, v2.8h, v22.8h \n" + "add v1.8h, v3.8h, v23.8h \n" + "rshrn v0.8b, v0.8h, #2 \n" + "rshrn v1.8b, v1.8h, #2 \n" + "subs %w[width], %w[width], #4 \n" + "stp d0, d1, 
[%[dst]], #16 \n" + "b.gt 1b \n" : [src] "+r"(src_ptr), [src1] "+r"(src_ptr1), [dst] "+r"(dst), [width] "+r"(dst_width) : @@ -1166,26 +1166,22 @@ void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb, const uint8_t* src_argb3 = src_argb + src_stepx * 12; int64_t i = 0; (void)src_stride; - asm volatile ( - "1: \n" - "ldr w10, [%[src], %[i]] \n" - "ldr w11, [%[src1], %[i]] \n" - "ldr w12, [%[src2], %[i]] \n" - "ldr w13, [%[src3], %[i]] \n" - "add %[i], %[i], %[step] \n" - "subs %w[width], %w[width], #4 \n" - "prfm pldl1keep, [%[src], 448] \n" - "stp w10, w11, [%[dst]], #8 \n" - "stp w12, w13, [%[dst]], #8 \n" - "b.gt 1b \n" - : [src]"+r"(src_argb), - [src1]"+r"(src_argb1), - [src2]"+r"(src_argb2), - [src3]"+r"(src_argb3), - [dst]"+r"(dst_argb), - [width]"+r"(dst_width), - [i]"+r"(i) - : [step]"r"((int64_t)(src_stepx * 16)) + asm volatile( + "1: \n" + "ldr w10, [%[src], %[i]] \n" + "ldr w11, [%[src1], %[i]] \n" + "ldr w12, [%[src2], %[i]] \n" + "ldr w13, [%[src3], %[i]] \n" + "add %[i], %[i], %[step] \n" + "subs %w[width], %w[width], #4 \n" + "prfm pldl1keep, [%[src], 448] \n" + "stp w10, w11, [%[dst]], #8 \n" + "stp w12, w13, [%[dst]], #8 \n" + "b.gt 1b \n" + : [src] "+r"(src_argb), [src1] "+r"(src_argb1), [src2] "+r"(src_argb2), + [src3] "+r"(src_argb3), [dst] "+r"(dst_argb), [width] "+r"(dst_width), + [i] "+r"(i) + : [step] "r"((int64_t)(src_stepx * 16)) : "memory", "cc", "w10", "w11", "w12", "w13"); } @@ -1312,33 +1308,33 @@ void ScaleARGBFilterCols_NEON(uint8_t* dst_argb, "1: \n" // SCALE_ARGB_FILTER_COLS_STEP_ADDR - "ldr d1, [%6] \n" // + "ldr d1, [%6] \n" // SCALE_ARGB_FILTER_COLS_STEP_ADDR - "ldr d2, [%6] \n" - "shrn v4.4h, v5.4s, #9 \n" // + "ldr d2, [%6] \n" + "shrn v4.4h, v5.4s, #9 \n" // SCALE_ARGB_FILTER_COLS_STEP_ADDR - "ld1 {v1.d}[1], [%6] \n" // + "ld1 {v1.d}[1], [%6] \n" // SCALE_ARGB_FILTER_COLS_STEP_ADDR - "ld1 {v2.d}[1], [%6] \n" + "ld1 {v2.d}[1], [%6] \n" - "subs %w2, %w2, #4 \n" // 4 processed per loop - "and v4.8b, v4.8b, v3.8b \n" - "trn1 v0.4s, v1.4s, v2.4s \n" - "tbl v4.16b, {v4.16b}, v18.16b \n" // f - "trn2 v1.4s, v1.4s, v2.4s \n" - "eor v7.16b, v4.16b, v3.16b \n" // 0x7f ^ f + "subs %w2, %w2, #4 \n" // 4 processed per loop + "and v4.8b, v4.8b, v3.8b \n" + "trn1 v0.4s, v1.4s, v2.4s \n" + "tbl v4.16b, {v4.16b}, v18.16b \n" // f + "trn2 v1.4s, v1.4s, v2.4s \n" + "eor v7.16b, v4.16b, v3.16b \n" // 0x7f ^ f - "umull v16.8h, v1.8b, v4.8b \n" - "umull2 v17.8h, v1.16b, v4.16b \n" - "umlal v16.8h, v0.8b, v7.8b \n" - "umlal2 v17.8h, v0.16b, v7.16b \n" + "umull v16.8h, v1.8b, v4.8b \n" + "umull2 v17.8h, v1.16b, v4.16b \n" + "umlal v16.8h, v0.8b, v7.8b \n" + "umlal2 v17.8h, v0.16b, v7.16b \n" - "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead - "shrn v0.8b, v16.8h, #7 \n" - "shrn v1.8b, v17.8h, #7 \n" - "add v5.4s, v5.4s, v6.4s \n" - "stp d0, d1, [%0], #16 \n" // store pixels - "b.gt 1b \n" + "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead + "shrn v0.8b, v16.8h, #7 \n" + "shrn v1.8b, v17.8h, #7 \n" + "add v5.4s, v5.4s, v6.4s \n" + "stp d0, d1, [%0], #16 \n" // store pixels + "b.gt 1b \n" : "+r"(dst_argb), // %0 "+r"(src_argb), // %1 "+r"(dst_width), // %2 @@ -1360,34 +1356,34 @@ void ScaleRowDown2_16_NEON(const uint16_t* src_ptr, int dst_width) { (void)src_stride; asm volatile( - "subs %w[dst_width], %w[dst_width], #32 \n" - "b.lt 2f \n" + "subs %w[dst_width], %w[dst_width], #32 \n" + "b.lt 2f \n" "1: \n" - "ldp q0, q1, [%[src_ptr]] \n" - "ldp q2, q3, [%[src_ptr], #32] \n" - "ldp q4, q5, [%[src_ptr], #64] \n" - "ldp q6, q7, [%[src_ptr], #96] \n" - "add 
%[src_ptr], %[src_ptr], #128 \n" - "uzp2 v0.8h, v0.8h, v1.8h \n" - "uzp2 v1.8h, v2.8h, v3.8h \n" - "uzp2 v2.8h, v4.8h, v5.8h \n" - "uzp2 v3.8h, v6.8h, v7.8h \n" - "subs %w[dst_width], %w[dst_width], #32 \n" // 32 elems per iteration. - "stp q0, q1, [%[dst_ptr]] \n" - "stp q2, q3, [%[dst_ptr], #32] \n" - "add %[dst_ptr], %[dst_ptr], #64 \n" - "b.ge 1b \n" + "ldp q0, q1, [%[src_ptr]] \n" + "ldp q2, q3, [%[src_ptr], #32] \n" + "ldp q4, q5, [%[src_ptr], #64] \n" + "ldp q6, q7, [%[src_ptr], #96] \n" + "add %[src_ptr], %[src_ptr], #128 \n" + "uzp2 v0.8h, v0.8h, v1.8h \n" + "uzp2 v1.8h, v2.8h, v3.8h \n" + "uzp2 v2.8h, v4.8h, v5.8h \n" + "uzp2 v3.8h, v6.8h, v7.8h \n" + "subs %w[dst_width], %w[dst_width], #32 \n" // 32 elems per iteration. + "stp q0, q1, [%[dst_ptr]] \n" + "stp q2, q3, [%[dst_ptr], #32] \n" + "add %[dst_ptr], %[dst_ptr], #64 \n" + "b.ge 1b \n" "2: \n" - "adds %w[dst_width], %w[dst_width], #32 \n" - "b.eq 99f \n" + "adds %w[dst_width], %w[dst_width], #32 \n" + "b.eq 99f \n" - "ldp q0, q1, [%[src_ptr]] \n" - "ldp q2, q3, [%[src_ptr], #32] \n" - "uzp2 v0.8h, v0.8h, v1.8h \n" - "uzp2 v1.8h, v2.8h, v3.8h \n" - "stp q0, q1, [%[dst_ptr]] \n" + "ldp q0, q1, [%[src_ptr]] \n" + "ldp q2, q3, [%[src_ptr], #32] \n" + "uzp2 v0.8h, v0.8h, v1.8h \n" + "uzp2 v1.8h, v2.8h, v3.8h \n" + "stp q0, q1, [%[dst_ptr]] \n" "99: \n" : [src_ptr] "+r"(src_ptr), // %[src_ptr] @@ -1403,15 +1399,15 @@ void ScaleRowDown2Linear_16_NEON(const uint16_t* src_ptr, int dst_width) { (void)src_stride; asm volatile( - "1: \n" - "ld2 {v0.8h, v1.8h}, [%[src_ptr]], #32 \n" - "ld2 {v2.8h, v3.8h}, [%[src_ptr]], #32 \n" - "subs %w[dst_width], %w[dst_width], #16 \n" - "urhadd v0.8h, v0.8h, v1.8h \n" - "urhadd v1.8h, v2.8h, v3.8h \n" - "prfm pldl1keep, [%[src_ptr], 448] \n" - "stp q0, q1, [%[dst_ptr]], #32 \n" - "b.gt 1b \n" + "1: \n" + "ld2 {v0.8h, v1.8h}, [%[src_ptr]], #32 \n" + "ld2 {v2.8h, v3.8h}, [%[src_ptr]], #32 \n" + "subs %w[dst_width], %w[dst_width], #16 \n" + "urhadd v0.8h, v0.8h, v1.8h \n" + "urhadd v1.8h, v2.8h, v3.8h \n" + "prfm pldl1keep, [%[src_ptr], 448] \n" + "stp q0, q1, [%[dst_ptr]], #32 \n" + "b.gt 1b \n" : [src_ptr] "+r"(src_ptr), // %[src_ptr] [dst_ptr] "+r"(dst), // %[dst_ptr] [dst_width] "+r"(dst_width) // %[dst_width] @@ -1424,7 +1420,7 @@ void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst, int dst_width) { - asm volatile ( + asm volatile( // change the stride to row 2 pointer "add %1, %0, %1, lsl #1 \n" // ptr + stide * 2 "1: \n" @@ -1455,7 +1451,7 @@ void ScaleUVRowDown2_NEON(const uint8_t* src_ptr, uint8_t* dst, int dst_width) { (void)src_stride; - asm volatile ( + asm volatile( "1: \n" "ld2 {v0.8h,v1.8h}, [%0], #32 \n" // load 16 UV "subs %w2, %w2, #8 \n" // 8 processed per loop. @@ -1474,7 +1470,7 @@ void ScaleUVRowDown2Linear_NEON(const uint8_t* src_ptr, uint8_t* dst, int dst_width) { (void)src_stride; - asm volatile ( + asm volatile( "1: \n" "ld2 {v0.8h,v1.8h}, [%0], #32 \n" // load 16 UV "subs %w2, %w2, #8 \n" // 8 processed per loop. 
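As a plain-C reference (not from the patch), the 2x down-scalers in this file compute the following per output sample: the Linear variants average two horizontal neighbours with rounding (the urhadd above), and the Box variants average the full 2x2 block with rounding (widening adds followed by rshrn #2). Shown here for 8-bit data; the _16 variants do the same arithmetic in 16 bits, and the UV/ARGB variants apply it per channel.

static inline uint8_t RoundAvg2(uint8_t a, uint8_t b) {
  return (uint8_t)((a + b + 1) >> 1);  // matches urhadd
}

static inline uint8_t RoundAvg4(uint8_t a, uint8_t b, uint8_t c, uint8_t d) {
  return (uint8_t)((a + b + c + d + 2) >> 2);  // matches uaddl/add + rshrn #2
}

static void ScaleRowDown2Box_Ref(const uint8_t* row0, const uint8_t* row1,
                                 uint8_t* dst, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    dst[x] = RoundAvg4(row0[2 * x], row0[2 * x + 1],  // 2x2 box average
                       row1[2 * x], row1[2 * x + 1]);
  }
}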
@@ -1493,7 +1489,7 @@ void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width) { - asm volatile ( + asm volatile( // change the stride to row 2 pointer "add %1, %1, %0 \n" "1: \n" @@ -1528,7 +1524,7 @@ void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr, const uint8_t* src2_ptr = src_ptr + src_stepx * 4; const uint8_t* src3_ptr = src_ptr + src_stepx * 6; (void)src_stride; - asm volatile ( + asm volatile( "1: \n" "ld1 {v0.h}[0], [%0], %6 \n" "ld1 {v1.h}[0], [%1], %6 \n" diff --git a/source/scale_rgb.cc b/source/scale_rgb.cc index 225fd21ec..5e69fe379 100644 --- a/source/scale_rgb.cc +++ b/source/scale_rgb.cc @@ -10,8 +10,8 @@ #include "libyuv/scale.h" /* For FilterMode */ -#include #include +#include #include #include #include @@ -41,9 +41,9 @@ int RGBScale(const uint8_t* src_rgb, int dst_height, enum FilterMode filtering) { int r; - if (!src_rgb || !dst_rgb || - src_width <= 0 || src_width > INT_MAX / 4 || src_height == 0 || - dst_width <= 0 || dst_width > INT_MAX / 4 || dst_height <= 0) { + if (!src_rgb || !dst_rgb || src_width <= 0 || src_width > INT_MAX / 4 || + src_height == 0 || dst_width <= 0 || dst_width > INT_MAX / 4 || + dst_height <= 0) { return -1; } const int abs_src_height = (src_height < 0) ? -src_height : src_height; diff --git a/source/scale_rvv.cc b/source/scale_rvv.cc index 9fe2b2773..4617e1a96 100644 --- a/source/scale_rvv.cc +++ b/source/scale_rvv.cc @@ -149,7 +149,7 @@ void ScaleARGBRowDown2Linear_RVV(const uint8_t* src_argb, const uint32_t* src = (const uint32_t*)(src_argb); // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode // register) is set to round-to-nearest-up mode(0). - asm volatile ("csrwi vxrm, 0"); + asm volatile("csrwi vxrm, 0"); do { vuint8m4_t v_odd, v_even, v_dst; vuint32m4_t v_odd_32, v_even_32; @@ -214,7 +214,7 @@ void ScaleARGBRowDown2Box_RVV(const uint8_t* src_argb, const uint32_t* src1 = (const uint32_t*)(src_argb + src_stride); // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode // register) is set to round-to-nearest-up mode(0). - asm volatile ("csrwi vxrm, 0"); + asm volatile("csrwi vxrm, 0"); do { vuint8m4_t v_row0_odd, v_row0_even, v_row1_odd, v_row1_even, v_dst; vuint16m8_t v_row0_sum, v_row1_sum, v_dst_16; @@ -311,7 +311,7 @@ void ScaleARGBRowDownEvenBox_RVV(const uint8_t* src_argb, const int stride_byte = src_stepx * 4; // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode // register) is set to round-to-nearest-up mode(0). - asm volatile ("csrwi vxrm, 0"); + asm volatile("csrwi vxrm, 0"); do { vuint8m4_t v_row0_low, v_row0_high, v_row1_low, v_row1_high, v_dst; vuint16m8_t v_row0_sum, v_row1_sum, v_sum; @@ -389,7 +389,7 @@ void ScaleRowDown2Linear_RVV(const uint8_t* src_ptr, (void)src_stride; // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode // register) is set to round-to-nearest-up mode(0). - asm volatile ("csrwi vxrm, 0"); + asm volatile("csrwi vxrm, 0"); do { vuint8m4_t v_s0, v_s1, v_dst; size_t vl = __riscv_vsetvl_e8m4(w); @@ -444,7 +444,7 @@ void ScaleRowDown2Box_RVV(const uint8_t* src_ptr, size_t w = (size_t)dst_width; // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode // register) is set to round-to-nearest-up mode(0). 
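A short aside on the vxrm writes that recur throughout scale_rvv.cc (commentary, not patch content): RVV fixed-point instructions, including the averaging adds these kernels rely on, take their rounding behaviour from the vxrm CSR, and mode 0 (round-to-nearest-up) reproduces the rounded averages of the NEON and x86 paths. In scalar terms:

// Round-to-nearest-up halving add, the behaviour selected by
//   asm volatile("csrwi vxrm, 0");
// before the vector loops in this file.
static inline uint8_t HalvingAddRnu(uint8_t a, uint8_t b) {
  return (uint8_t)((a + b + 1) >> 1);  // add the discarded bit back in
}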
- asm volatile ("csrwi vxrm, 0"); + asm volatile("csrwi vxrm, 0"); do { size_t vl = __riscv_vsetvl_e8m4(w); vuint8m4_t v_s0, v_s1, v_t0, v_t1; @@ -577,7 +577,7 @@ void ScaleRowDown4Box_RVV(const uint8_t* src_ptr, size_t w = (size_t)dst_width; // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode // register) is set to round-to-nearest-up mode(0). - asm volatile ("csrwi vxrm, 0"); + asm volatile("csrwi vxrm, 0"); do { vuint8m2_t v_s0, v_s1, v_s2, v_s3; vuint8m2_t v_t0, v_t1, v_t2, v_t3; @@ -747,7 +747,7 @@ void ScaleRowDown34_0_Box_RVV(const uint8_t* src_ptr, const uint8_t* t = src_ptr + src_stride; // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode // register) is set to round-to-nearest-up mode(0). - asm volatile ("csrwi vxrm, 0"); + asm volatile("csrwi vxrm, 0"); do { vuint8m2_t v_s0, v_s1, v_s2, v_s3; vuint16m4_t v_t0_u16, v_t1_u16, v_t2_u16, v_t3_u16; @@ -876,7 +876,7 @@ void ScaleRowDown34_1_Box_RVV(const uint8_t* src_ptr, const uint8_t* t = src_ptr + src_stride; // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode // register) is set to round-to-nearest-up mode(0). - asm volatile ("csrwi vxrm, 0"); + asm volatile("csrwi vxrm, 0"); do { vuint8m2_t v_s0, v_s1, v_s2, v_s3; vuint8m2_t v_ave0, v_ave1, v_ave2, v_ave3; @@ -1539,7 +1539,7 @@ void ScaleUVRowDown2Linear_RVV(const uint8_t* src_uv, (void)src_stride; // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode // register) is set to round-to-nearest-up mode(0). - asm volatile ("csrwi vxrm, 0"); + asm volatile("csrwi vxrm, 0"); do { vuint8m4_t v_u0v0, v_u1v1, v_avg; vuint16m4_t v_u0v0_16, v_u1v1_16; @@ -1608,7 +1608,7 @@ void ScaleUVRowDown2Box_RVV(const uint8_t* src_uv, size_t w = (size_t)dst_width; // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode // register) is set to round-to-nearest-up mode(0). 
- asm volatile ("csrwi vxrm, 0"); + asm volatile("csrwi vxrm, 0"); do { vuint8m2_t v_u0_row0, v_v0_row0, v_u1_row0, v_v1_row0; vuint8m2_t v_u0_row1, v_v0_row1, v_u1_row1, v_v1_row1; diff --git a/source/scale_sme.cc b/source/scale_sme.cc index 6b22f24d0..fa74569d1 100644 --- a/source/scale_sme.cc +++ b/source/scale_sme.cc @@ -15,7 +15,6 @@ namespace libyuv { extern "C" { #endif - #if !defined(LIBYUV_DISABLE_SME) && defined(CLANG_HAS_SME) && \ defined(__aarch64__) diff --git a/source/scale_uv.cc b/source/scale_uv.cc index 7b318cf72..700d1b2b6 100644 --- a/source/scale_uv.cc +++ b/source/scale_uv.cc @@ -333,14 +333,14 @@ static void ScaleUVDownEven(int src_width, #endif #if defined(HAS_SCALEUVROWDOWNEVEN_RVV) || defined(HAS_SCALEUVROWDOWN4_RVV) if (TestCpuFlag(kCpuHasRVV) && !filtering) { - #if defined(HAS_SCALEUVROWDOWNEVEN_RVV) - ScaleUVRowDownEven = ScaleUVRowDownEven_RVV; - #endif - #if defined(HAS_SCALEUVROWDOWN4_RVV) - if (col_step == 4) { - ScaleUVRowDownEven = ScaleUVRowDown4_RVV; - } - #endif +#if defined(HAS_SCALEUVROWDOWNEVEN_RVV) + ScaleUVRowDownEven = ScaleUVRowDownEven_RVV; +#endif +#if defined(HAS_SCALEUVROWDOWN4_RVV) + if (col_step == 4) { + ScaleUVRowDownEven = ScaleUVRowDown4_RVV; + } +#endif } #endif diff --git a/unit_test/convert_argb_test.cc b/unit_test/convert_argb_test.cc index 8eadba39a..718afec36 100644 --- a/unit_test/convert_argb_test.cc +++ b/unit_test/convert_argb_test.cc @@ -12,6 +12,7 @@ #include #include +#include "../unit_test/unit_test.h" #include "libyuv/basic_types.h" #include "libyuv/compare.h" #include "libyuv/convert.h" @@ -19,7 +20,6 @@ #include "libyuv/convert_from.h" #include "libyuv/convert_from_argb.h" #include "libyuv/cpu_id.h" -#include "../unit_test/unit_test.h" #include "libyuv/planar_functions.h" #include "libyuv/rotate.h" #include "libyuv/video_common.h" diff --git a/unit_test/cpu_test.cc b/unit_test/cpu_test.cc index 427614420..cb9bf1c40 100644 --- a/unit_test/cpu_test.cc +++ b/unit_test/cpu_test.cc @@ -67,16 +67,16 @@ TEST_F(LibYUVBaseTest, TestCpuId) { #endif #ifdef __linux__ -static void KernelVersion(int *version) { +static void KernelVersion(int* version) { struct utsname buffer; int i = 0; version[0] = version[1] = 0; if (uname(&buffer) == 0) { - char *v = buffer.release; + char* v = buffer.release; for (i = 0; *v && i < 2; ++v) { if (isdigit(*v)) { - version[i++] = (int) strtol(v, &v, 10); + version[i++] = (int)strtol(v, &v, 10); } } } @@ -142,8 +142,8 @@ TEST_F(LibYUVBaseTest, TestCpuHas) { // Read and print the RVV vector length. 
if (has_rvv) { - register uint32_t vlenb __asm__ ("t0"); - __asm__(".word 0xC22022F3" /* CSRR t0, vlenb */ : "=r" (vlenb)); + register uint32_t vlenb __asm__("t0"); + __asm__(".word 0xC22022F3" /* CSRR t0, vlenb */ : "=r"(vlenb)); printf("RVV vector length: %d bytes\n", vlenb); } } @@ -161,7 +161,7 @@ TEST_F(LibYUVBaseTest, TestCpuHas) { #if defined(__loongarch__) int has_loongarch = TestCpuFlag(kCpuHasLOONGARCH); if (has_loongarch) { - int has_lsx = TestCpuFlag(kCpuHasLSX); + int has_lsx = TestCpuFlag(kCpuHasLSX); int has_lasx = TestCpuFlag(kCpuHasLASX); printf("Has LOONGARCH 0x%x\n", has_loongarch); printf("Has LSX 0x%x\n", has_lsx); @@ -169,8 +169,8 @@ TEST_F(LibYUVBaseTest, TestCpuHas) { } #endif // defined(__loongarch__) -#if defined(__i386__) || defined(__x86_64__) || \ - defined(_M_IX86) || defined(_M_X64) +#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || \ + defined(_M_X64) int has_x86 = TestCpuFlag(kCpuHasX86); if (has_x86) { int has_sse2 = TestCpuFlag(kCpuHasSSE2); @@ -215,7 +215,8 @@ TEST_F(LibYUVBaseTest, TestCpuHas) { printf("Has AVXVNNIINT8 0x%x\n", has_avxvnniint8); printf("Has AMXINT8 0x%x\n", has_amxint8); } -#endif // defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) +#endif // defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || + // defined(_M_X64) } TEST_F(LibYUVBaseTest, TestCompilerMacros) { diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc index 30d660e4b..ca3cbe769 100644 --- a/unit_test/planar_test.cc +++ b/unit_test/planar_test.cc @@ -1570,18 +1570,21 @@ static int TestCopyPlane(int benchmark_width, // Disable all optimizations. MaskCpuFlags(disable_cpu_flags); for (int i = 0; i < benchmark_iterations; i++) { - CopyPlane(orig_y + off, benchmark_width, dst_c, benchmark_width, benchmark_width, benchmark_height * invert); + CopyPlane(orig_y + off, benchmark_width, dst_c, benchmark_width, + benchmark_width, benchmark_height * invert); } // Enable optimizations. 
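Stepping back for a moment (commentary, not patch content): TestCopyPlane and the other tests in this file all follow the same verification pattern, running each API once with SIMD masked off and once with it enabled, then comparing the outputs. A standalone sketch, assuming the usual MaskCpuFlags conventions (1 disables all optimizations, -1 re-detects and enables everything):

#include <string.h>

#include "libyuv/cpu_id.h"
#include "libyuv/planar_functions.h"

// Returns true when the optimized CopyPlane path matches the C path exactly.
static bool CopyPlaneMatchesCPath(const uint8_t* src, uint8_t* dst_c,
                                  uint8_t* dst_opt, int width, int height) {
  libyuv::MaskCpuFlags(1);   // force the portable C implementation
  libyuv::CopyPlane(src, width, dst_c, width, width, height);
  libyuv::MaskCpuFlags(-1);  // re-enable SIMD kernels
  libyuv::CopyPlane(src, width, dst_opt, width, width, height);
  return memcmp(dst_c, dst_opt, (size_t)width * height) == 0;
}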
MaskCpuFlags(benchmark_cpu_info); for (int i = 0; i < benchmark_iterations; i++) { - CopyPlane(orig_y + off, benchmark_width, dst_opt, benchmark_width, benchmark_width, benchmark_height * invert); + CopyPlane(orig_y + off, benchmark_width, dst_opt, benchmark_width, + benchmark_width, benchmark_height * invert); } int max_diff = 0; for (int i = 0; i < y_plane_size; ++i) { - int abs_diff = abs(static_cast(dst_c[i]) - static_cast(dst_opt[i])); + int abs_diff = + abs(static_cast(dst_c[i]) - static_cast(dst_opt[i])); if (abs_diff > max_diff) { max_diff = abs_diff; } @@ -1596,29 +1599,29 @@ static int TestCopyPlane(int benchmark_width, TEST_F(LibYUVPlanarTest, CopyPlane_Any) { int max_diff = TestCopyPlane(benchmark_width_ + 1, benchmark_height_, - benchmark_iterations_, disable_cpu_flags_, - benchmark_cpu_info_, +1, 0); + benchmark_iterations_, disable_cpu_flags_, + benchmark_cpu_info_, +1, 0); EXPECT_LE(max_diff, 0); } TEST_F(LibYUVPlanarTest, CopyPlane_Unaligned) { int max_diff = TestCopyPlane(benchmark_width_, benchmark_height_, benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_, +1, 1); + disable_cpu_flags_, benchmark_cpu_info_, +1, 1); EXPECT_LE(max_diff, 0); } TEST_F(LibYUVPlanarTest, CopyPlane_Invert) { int max_diff = TestCopyPlane(benchmark_width_, benchmark_height_, benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_, -1, 0); + disable_cpu_flags_, benchmark_cpu_info_, -1, 0); EXPECT_LE(max_diff, 0); } TEST_F(LibYUVPlanarTest, CopyPlane_Opt) { int max_diff = TestCopyPlane(benchmark_width_, benchmark_height_, benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_, +1, 0); + disable_cpu_flags_, benchmark_cpu_info_, +1, 0); EXPECT_LE(max_diff, 0); } @@ -2499,17 +2502,19 @@ static int TestHalfFloatPlane(int benchmark_width, // Disable all optimizations. MaskCpuFlags(disable_cpu_flags); for (j = 0; j < benchmark_iterations; j++) { - HalfFloatPlane(reinterpret_cast(orig_y + off), benchmark_width * 2, - reinterpret_cast(dst_c), benchmark_width * 2, - scale, benchmark_width, benchmark_height * invert); + HalfFloatPlane(reinterpret_cast(orig_y + off), + benchmark_width * 2, reinterpret_cast(dst_c), + benchmark_width * 2, scale, benchmark_width, + benchmark_height * invert); } // Enable optimizations. 
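For context on what TestHalfFloatPlane exercises (commentary, not patch content): HalfFloatPlane scales each 16-bit sample and stores it as an IEEE binary16 bit pattern. One common way to express the per-row conversion is the 2^-112 exponent re-bias shown below; this is only a sketch, it truncates rather than rounds the mantissa, and it ignores the binary16 denormal corner that the flush-denormal tests further down are about.

#include <stdint.h>
#include <string.h>

static inline uint16_t FloatToHalfBits(float f) {
  uint32_t u;
  f *= 1.9259299444e-34f;  // 2^-112 re-biases the float exponent for binary16
  memcpy(&u, &f, sizeof(u));
  return (uint16_t)(u >> 13);  // exponent + top 10 mantissa bits, truncated
}

static void HalfFloatRowRef(const uint16_t* src, uint16_t* dst, float scale,
                            int width) {
  for (int i = 0; i < width; ++i) {
    dst[i] = FloatToHalfBits((float)src[i] * scale);  // src is non-negative
  }
}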
MaskCpuFlags(benchmark_cpu_info); for (j = 0; j < benchmark_iterations; j++) { - HalfFloatPlane(reinterpret_cast(orig_y + off), benchmark_width * 2, - reinterpret_cast(dst_opt), benchmark_width * 2, - scale, benchmark_width, benchmark_height * invert); + HalfFloatPlane(reinterpret_cast(orig_y + off), + benchmark_width * 2, reinterpret_cast(dst_opt), + benchmark_width * 2, scale, benchmark_width, + benchmark_height * invert); } int max_diff = 0; @@ -2536,23 +2541,23 @@ TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_16bit_One) { } TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_16bit_Opt) { - int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_, - benchmark_iterations_, disable_cpu_flags_, - benchmark_cpu_info_, 1.0f / 65535.0f, 65535, +1, 0); + int diff = TestHalfFloatPlane( + benchmark_width_, benchmark_height_, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_, 1.0f / 65535.0f, 65535, +1, 0); EXPECT_EQ(0, diff); } TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_12bit_Opt) { - int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_, - benchmark_iterations_, disable_cpu_flags_, - benchmark_cpu_info_, 1.0f / 4095.0f, 4095, +1, 0); + int diff = TestHalfFloatPlane( + benchmark_width_, benchmark_height_, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_, 1.0f / 4095.0f, 4095, +1, 0); EXPECT_EQ(0, diff); } TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_10bit_Opt) { - int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_, - benchmark_iterations_, disable_cpu_flags_, - benchmark_cpu_info_, 1.0f / 1023.0f, 1023, +1, 0); + int diff = TestHalfFloatPlane( + benchmark_width_, benchmark_height_, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_, 1.0f / 1023.0f, 1023, +1, 0); EXPECT_EQ(0, diff); } @@ -2564,59 +2569,57 @@ TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_9bit_Opt) { } TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_Any) { - int diff = TestHalfFloatPlane(benchmark_width_ + 1, benchmark_height_, - benchmark_iterations_, disable_cpu_flags_, - benchmark_cpu_info_, 1.0f / 4096.0f, 4095, +1, 0); + int diff = TestHalfFloatPlane( + benchmark_width_ + 1, benchmark_height_, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_, 1.0f / 4096.0f, 4095, +1, 0); EXPECT_EQ(0, diff); } TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_Unaligned) { - int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_, - benchmark_iterations_, disable_cpu_flags_, - benchmark_cpu_info_, 1.0f / 4096.0f, 4095, +1, 2); + int diff = TestHalfFloatPlane( + benchmark_width_, benchmark_height_, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_, 1.0f / 4096.0f, 4095, +1, 2); EXPECT_EQ(0, diff); } TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_Invert) { - int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_, - benchmark_iterations_, disable_cpu_flags_, - benchmark_cpu_info_, 1.0f / 4096.0f, 4095, -1, 0); + int diff = TestHalfFloatPlane( + benchmark_width_, benchmark_height_, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_, 1.0f / 4096.0f, 4095, -1, 0); EXPECT_EQ(0, diff); } TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_Opt) { - int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_, - benchmark_iterations_, disable_cpu_flags_, - benchmark_cpu_info_, 1.0f / 4096.0f, 4095, +1, 0); + int diff = TestHalfFloatPlane( + benchmark_width_, benchmark_height_, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_, 1.0f / 4096.0f, 4095, +1, 0); EXPECT_EQ(0, diff); } TEST_F(LibYUVPlanarTest, 
TestHalfFloatPlane_16bit_denormal) { - int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_, - benchmark_iterations_, disable_cpu_flags_, - benchmark_cpu_info_, 1.0f / 65535.0f, 65535, +1, 0); + int diff = TestHalfFloatPlane( + benchmark_width_, benchmark_height_, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_, 1.0f / 65535.0f, 65535, +1, 0); EXPECT_EQ(0, diff); } #if defined(__arm__) static void EnableFlushDenormalToZero(void) { uint32_t cw; - asm volatile ( - "vmrs %0, fpscr \n" - "orr %0, %0, #0x1000000 \n" - "vmsr fpscr, %0 \n" - : "=r"(cw) - ::"memory", "cc"); // Clobber List + asm volatile( + "vmrs %0, fpscr \n" + "orr %0, %0, #0x1000000 \n" + "vmsr fpscr, %0 \n" + : "=r"(cw)::"memory", "cc"); // Clobber List } static void DisableFlushDenormalToZero(void) { uint32_t cw; - asm volatile ( - "vmrs %0, fpscr \n" - "bic %0, %0, #0x1000000 \n" - "vmsr fpscr, %0 \n" - : "=r"(cw) - ::"memory", "cc"); // Clobber List + asm volatile( + "vmrs %0, fpscr \n" + "bic %0, %0, #0x1000000 \n" + "vmsr fpscr, %0 \n" + : "=r"(cw)::"memory", "cc"); // Clobber List } // 5 bit exponent with bias of 15 will underflow to a denormal if scale causes @@ -2626,18 +2629,18 @@ static void DisableFlushDenormalToZero(void) { TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_16bit_flush_denormal) { // 32 bit arm rounding on denormal case is off by 1 compared to C. EnableFlushDenormalToZero(); - int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_, - benchmark_iterations_, disable_cpu_flags_, - benchmark_cpu_info_, 1.0f / 65535.0f, 65535, +1, 0); + int diff = TestHalfFloatPlane( + benchmark_width_, benchmark_height_, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_, 1.0f / 65535.0f, 65535, +1, 0); DisableFlushDenormalToZero(); EXPECT_EQ(0, diff); } TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_10bit_flush_denormal) { EnableFlushDenormalToZero(); - int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_, - benchmark_iterations_, disable_cpu_flags_, - benchmark_cpu_info_, 1.0f / 1023.0f, 1023, +1, 0); + int diff = TestHalfFloatPlane( + benchmark_width_, benchmark_height_, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_, 1.0f / 1023.0f, 1023, +1, 0); DisableFlushDenormalToZero(); EXPECT_EQ(0, diff); } @@ -3184,8 +3187,9 @@ TEST_F(LibYUVPlanarTest, MergeRGBPlane_Opt) { tmp_pixels_c_b, benchmark_width_, benchmark_width_, benchmark_height_); MergeRGBPlane(tmp_pixels_c_r, benchmark_width_, tmp_pixels_c_g, - benchmark_width_, tmp_pixels_c_b, benchmark_width_, dst_pixels_c, - benchmark_width_ * 3, benchmark_width_, benchmark_height_); + benchmark_width_, tmp_pixels_c_b, benchmark_width_, + dst_pixels_c, benchmark_width_ * 3, benchmark_width_, + benchmark_height_); MaskCpuFlags(benchmark_cpu_info_); SplitRGBPlane(src_pixels, benchmark_width_ * 3, tmp_pixels_opt_r, @@ -3244,8 +3248,9 @@ TEST_F(LibYUVPlanarTest, SplitRGBPlane_Opt) { tmp_pixels_c_b, benchmark_width_, benchmark_width_, benchmark_height_); MergeRGBPlane(tmp_pixels_c_r, benchmark_width_, tmp_pixels_c_g, - benchmark_width_, tmp_pixels_c_b, benchmark_width_, dst_pixels_c, - benchmark_width_ * 3, benchmark_width_, benchmark_height_); + benchmark_width_, tmp_pixels_c_b, benchmark_width_, + dst_pixels_c, benchmark_width_ * 3, benchmark_width_, + benchmark_height_); MaskCpuFlags(benchmark_cpu_info_); for (int i = 0; i < benchmark_iterations_; ++i) { @@ -3446,8 +3451,8 @@ TEST_F(LibYUVPlanarTest, MergeXRGBPlane_Opt) { for (int i = 0; i < benchmark_iterations_; ++i) { 
MergeARGBPlane(tmp_pixels_opt_r, benchmark_width_, tmp_pixels_opt_g, - benchmark_width_, tmp_pixels_opt_b, benchmark_width_, NULL, 0, - dst_pixels_opt, benchmark_width_ * 4, benchmark_width_, + benchmark_width_, tmp_pixels_opt_b, benchmark_width_, NULL, + 0, dst_pixels_opt, benchmark_width_ * 4, benchmark_width_, benchmark_height_); } @@ -3502,8 +3507,8 @@ TEST_F(LibYUVPlanarTest, SplitXRGBPlane_Opt) { for (int i = 0; i < benchmark_iterations_; ++i) { SplitARGBPlane(src_pixels, benchmark_width_ * 4, tmp_pixels_opt_r, benchmark_width_, tmp_pixels_opt_g, benchmark_width_, - tmp_pixels_opt_b, benchmark_width_, NULL, 0, benchmark_width_, - benchmark_height_); + tmp_pixels_opt_b, benchmark_width_, NULL, 0, + benchmark_width_, benchmark_height_); } MergeARGBPlane(tmp_pixels_opt_r, benchmark_width_, tmp_pixels_opt_g, diff --git a/unit_test/scale_argb_test.cc b/unit_test/scale_argb_test.cc index 8959addde..66fd4cf31 100644 --- a/unit_test/scale_argb_test.cc +++ b/unit_test/scale_argb_test.cc @@ -320,16 +320,16 @@ TEST_FACTOR(3, 1, 3) #ifndef DISABLE_SLOW_TESTS // Test scale to a specified size with all 4 filters. -#define TEST_SCALETO(name, width, height) \ - TEST_SCALETO1(, name, width, height, None, 0) \ - TEST_SCALETO1(, name, width, height, Linear, 3) \ +#define TEST_SCALETO(name, width, height) \ + TEST_SCALETO1(, name, width, height, None, 0) \ + TEST_SCALETO1(, name, width, height, Linear, 3) \ TEST_SCALETO1(, name, width, height, Bilinear, 3) \ TEST_SCALETO1(, name, width, height, Box, 3) #else #if defined(ENABLE_FULL_TESTS) -#define TEST_SCALETO(name, width, height) \ - TEST_SCALETO1(DISABLED_, name, width, height, None, 0) \ - TEST_SCALETO1(DISABLED_, name, width, height, Linear, 3) \ +#define TEST_SCALETO(name, width, height) \ + TEST_SCALETO1(DISABLED_, name, width, height, None, 0) \ + TEST_SCALETO1(DISABLED_, name, width, height, Linear, 3) \ TEST_SCALETO1(DISABLED_, name, width, height, Bilinear, 3) \ TEST_SCALETO1(DISABLED_, name, width, height, Box, 3) #else diff --git a/unit_test/scale_test.cc b/unit_test/scale_test.cc index 088cd29a9..fd8fff802 100644 --- a/unit_test/scale_test.cc +++ b/unit_test/scale_test.cc @@ -1058,7 +1058,7 @@ TEST_SCALETO(Scale, 320, 240) TEST_SCALETO(Scale, 1280, 720) TEST_SCALETO(Scale, 1920, 1080) TEST_SCALETO(Scale, 1080, 1920) // for rotated phones -#endif // DISABLE_SLOW_TESTS +#endif // DISABLE_SLOW_TESTS #undef TEST_SCALETO1 #undef TEST_SCALETO
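One closing note on the scale test macros reformatted above (commentary, not patch content): TEST_SCALETO and TEST_SCALETO1 are test generators, producing one gtest case per target size and filter mode, with the trailing argument roughly acting as the allowed pixel difference (0 for None, 3 for the filtering modes). Illustratively, with ScaleToSpecifiedSize standing in for the real helper:

#define TEST_SCALETO1_SKETCH(DISABLED_, name, width, height, filter, max_diff) \
  TEST_F(LibYUVScaleTest,                                                      \
         DISABLED_##name##To##width##x##height##_##filter) {                   \
    int diff = ScaleToSpecifiedSize(benchmark_width_, benchmark_height_,       \
                                    width, height, kFilter##filter,            \
                                    benchmark_iterations_, disable_cpu_flags_, \
                                    benchmark_cpu_info_);                      \
    EXPECT_LE(diff, max_diff);                                                 \
  }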