diff --git a/README.chromium b/README.chromium index 4c281834f..6412e18ae 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: https://chromium.googlesource.com/libyuv/libyuv/ -Version: 1888 +Version: 1889 License: BSD License File: LICENSE Shipped: yes diff --git a/include/libyuv/macros_msa.h b/include/libyuv/macros_msa.h index b9a44fcce..6434a4da0 100644 --- a/include/libyuv/macros_msa.h +++ b/include/libyuv/macros_msa.h @@ -20,9 +20,9 @@ ({ \ const uint8_t* psrc_lw_m = (const uint8_t*)(psrc); \ uint32_t val_m; \ - asm volatile("lw %[val_m], %[psrc_lw_m] \n" \ - : [val_m] "=r"(val_m) \ - : [psrc_lw_m] "m"(*psrc_lw_m)); \ + asm("lw %[val_m], %[psrc_lw_m] \n" \ + : [val_m] "=r"(val_m) \ + : [psrc_lw_m] "m"(*psrc_lw_m)); \ val_m; \ }) @@ -31,9 +31,9 @@ ({ \ const uint8_t* psrc_ld_m = (const uint8_t*)(psrc); \ uint64_t val_m = 0; \ - asm volatile("ld %[val_m], %[psrc_ld_m] \n" \ - : [val_m] "=r"(val_m) \ - : [psrc_ld_m] "m"(*psrc_ld_m)); \ + asm("ld %[val_m], %[psrc_ld_m] \n" \ + : [val_m] "=r"(val_m) \ + : [psrc_ld_m] "m"(*psrc_ld_m)); \ val_m; \ }) #else // !(__mips == 64) @@ -55,9 +55,9 @@ ({ \ uint8_t* pdst_sw_m = (uint8_t*)(pdst); /* NOLINT */ \ uint32_t val_m = (val); \ - asm volatile("sw %[val_m], %[pdst_sw_m] \n" \ - : [pdst_sw_m] "=m"(*pdst_sw_m) \ - : [val_m] "r"(val_m)); \ + asm("sw %[val_m], %[pdst_sw_m] \n" \ + : [pdst_sw_m] "=m"(*pdst_sw_m) \ + : [val_m] "r"(val_m)); \ }) #if (__mips == 64) @@ -65,9 +65,9 @@ ({ \ uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */ \ uint64_t val_m = (val); \ - asm volatile("sd %[val_m], %[pdst_sd_m] \n" \ - : [pdst_sd_m] "=m"(*pdst_sd_m) \ - : [val_m] "r"(val_m)); \ + asm("sd %[val_m], %[pdst_sd_m] \n" \ + : [pdst_sd_m] "=m"(*pdst_sd_m) \ + : [val_m] "r"(val_m)); \ }) #else // !(__mips == 64) #define SD(val, pdst) \ @@ -86,8 +86,7 @@ uint8_t* psrc_lw_m = (uint8_t*)(psrc); \ uint32_t val_lw_m; \ \ - __asm__ volatile( \ - "lwr %[val_lw_m], 0(%[psrc_lw_m]) \n\t" \ + asm("lwr %[val_lw_m], 0(%[psrc_lw_m]) \n\t" \ "lwl %[val_lw_m], 3(%[psrc_lw_m]) \n\t" \ \ : [val_lw_m] "=&r"(val_lw_m) \ @@ -102,8 +101,7 @@ uint8_t* psrc_ld_m = (uint8_t*)(psrc); \ uint64_t val_ld_m = 0; \ \ - __asm__ volatile( \ - "ldr %[val_ld_m], 0(%[psrc_ld_m]) \n\t" \ + asm("ldr %[val_ld_m], 0(%[psrc_ld_m]) \n\t" \ "ldl %[val_ld_m], 7(%[psrc_ld_m]) \n\t" \ \ : [val_ld_m] "=&r"(val_ld_m) \ @@ -130,9 +128,9 @@ ({ \ uint8_t* pdst_sw_m = (uint8_t*)(pdst); /* NOLINT */ \ uint32_t val_m = (val); \ - asm volatile("usw %[val_m], %[pdst_sw_m] \n" \ - : [pdst_sw_m] "=m"(*pdst_sw_m) \ - : [val_m] "r"(val_m)); \ + asm("usw %[val_m], %[pdst_sw_m] \n" \ + : [pdst_sw_m] "=m"(*pdst_sw_m) \ + : [val_m] "r"(val_m)); \ }) #define SD(val, pdst) \ diff --git a/include/libyuv/version.h b/include/libyuv/version.h index d3099004a..001600c90 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1888 +#define LIBYUV_VERSION 1889 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/compare_gcc.cc b/source/compare_gcc.cc index 970f950f4..492969259 100644 --- a/source/compare_gcc.cc +++ b/source/compare_gcc.cc @@ -29,7 +29,8 @@ uint32_t HammingDistance_SSE42(const uint8_t* src_a, int count) { uint64_t diff; - asm("xor %3,%3 \n" + asm volatile ( + "xor %3,%3 \n" "xor %%r8,%%r8 \n" "xor %%r9,%%r9 \n" "xor %%r10,%%r10 \n" @@ -76,7 +77,7 @@ uint32_t HammingDistance_SSE42(const uint8_t* src_a, int count) { uint32_t diff = 0u; - asm( + asm volatile ( // 
Process 16 bytes per loop. LABELALIGN "1: \n" @@ -120,7 +121,8 @@ uint32_t HammingDistance_SSSE3(const uint8_t* src_a, int count) { uint32_t diff; - asm("movdqa %4,%%xmm2 \n" + asm volatile ( + "movdqa %4,%%xmm2 \n" "movdqa %5,%%xmm3 \n" "pxor %%xmm0,%%xmm0 \n" "pxor %%xmm1,%%xmm1 \n" @@ -178,7 +180,8 @@ uint32_t HammingDistance_AVX2(const uint8_t* src_a, int count) { uint32_t diff; - asm("vbroadcastf128 %4,%%ymm2 \n" + asm volatile ( + "vbroadcastf128 %4,%%ymm2 \n" "vbroadcastf128 %5,%%ymm3 \n" "vpxor %%ymm0,%%ymm0,%%ymm0 \n" "vpxor %%ymm1,%%ymm1,%%ymm1 \n" @@ -231,7 +234,8 @@ uint32_t SumSquareError_SSE2(const uint8_t* src_a, const uint8_t* src_b, int count) { uint32_t sse; - asm("pxor %%xmm0,%%xmm0 \n" + asm volatile ( + "pxor %%xmm0,%%xmm0 \n" "pxor %%xmm5,%%xmm5 \n" LABELALIGN @@ -296,7 +300,8 @@ static const uvec32 kHashMul3 = { uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) { uint32_t hash; - asm("movd %2,%%xmm0 \n" + asm volatile ( + "movd %2,%%xmm0 \n" "pxor %%xmm7,%%xmm7 \n" "movdqa %4,%%xmm6 \n" diff --git a/source/compare_neon.cc b/source/compare_neon.cc index afdd60121..c2aea6074 100644 --- a/source/compare_neon.cc +++ b/source/compare_neon.cc @@ -28,7 +28,7 @@ uint32_t HammingDistance_NEON(const uint8_t* src_a, int count) { uint32_t diff; - asm volatile( + asm volatile ( "vmov.u16 q4, #0 \n" // accumulator "1: \n" @@ -58,7 +58,7 @@ uint32_t SumSquareError_NEON(const uint8_t* src_a, const uint8_t* src_b, int count) { uint32_t sse; - asm volatile( + asm volatile ( "vmov.u8 q8, #0 \n" "vmov.u8 q10, #0 \n" "vmov.u8 q9, #0 \n" diff --git a/source/compare_neon64.cc b/source/compare_neon64.cc index b61b9f7ac..07292deff 100644 --- a/source/compare_neon64.cc +++ b/source/compare_neon64.cc @@ -26,7 +26,7 @@ uint32_t HammingDistance_NEON(const uint8_t* src_a, const uint8_t* src_b, int count) { uint32_t diff; - asm volatile( + asm volatile ( "movi v4.8h, #0 \n" "1: \n" @@ -55,7 +55,7 @@ uint32_t SumSquareError_NEON(const uint8_t* src_a, const uint8_t* src_b, int count) { uint32_t sse; - asm volatile( + asm volatile ( "movi v16.16b, #0 \n" "movi v17.16b, #0 \n" "movi v18.16b, #0 \n" @@ -157,7 +157,7 @@ uint32_t HammingDistance_NEON_DotProd(const uint8_t* src_a, const uint8_t* src_b, int count) { uint32_t diff; - asm volatile( + asm volatile ( "movi v4.4s, #0 \n" "movi v5.4s, #0 \n" "movi v6.16b, #1 \n" @@ -190,7 +190,7 @@ uint32_t SumSquareError_NEON_DotProd(const uint8_t* src_a, int count) { // count is guaranteed to be a multiple of 32. uint32_t sse; - asm volatile( + asm volatile ( "movi v4.4s, #0 \n" "movi v5.4s, #0 \n" diff --git a/source/rotate_gcc.cc b/source/rotate_gcc.cc index 034161421..48926b687 100644 --- a/source/rotate_gcc.cc +++ b/source/rotate_gcc.cc @@ -26,7 +26,7 @@ void TransposeWx8_SSSE3(const uint8_t* src, uint8_t* dst, int dst_stride, int width) { - asm( + asm volatile ( // Read in the data from the source pointer. // First round of bit swap. LABELALIGN @@ -116,7 +116,7 @@ void TransposeWx8_Fast_SSSE3(const uint8_t* src, uint8_t* dst, int dst_stride, int width) { - asm( + asm volatile ( // Read in the data from the source pointer. // First round of bit swap. LABELALIGN @@ -261,7 +261,7 @@ void TransposeUVWx8_SSE2(const uint8_t* src, uint8_t* dst_b, int dst_stride_b, int width) { - asm( + asm volatile ( // Read in the data from the source pointer. // First round of bit swap. LABELALIGN @@ -391,7 +391,7 @@ void Transpose4x4_32_SSE2(const uint8_t* src, uint8_t* dst, int dst_stride, int width) { - asm( + asm volatile ( // Main loop transpose 4x4. 
Read a column, write a row. "1: \n" "movdqu (%0),%%xmm0 \n" // a b c d @@ -447,7 +447,7 @@ void Transpose4x4_32_AVX2(const uint8_t* src, uint8_t* dst, int dst_stride, int width) { - asm( + asm volatile ( // Main loop transpose 2 blocks of 4x4. Read a column, write a row. "1: \n" "vmovdqu (%0),%%xmm0 \n" // a b c d diff --git a/source/rotate_msa.cc b/source/rotate_msa.cc index d4e62b12e..99bdca65b 100644 --- a/source/rotate_msa.cc +++ b/source/rotate_msa.cc @@ -51,6 +51,16 @@ extern "C" { out3 = (v16u8)__msa_ilvl_d((v2i64)in3, (v2i64)in2); \ } +void TransposeWx16_C(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width) { + TransposeWx8_C(src, src_stride, dst, dst_stride, width); + TransposeWx8_C((src + 8 * src_stride), src_stride, (dst + 8), dst_stride, + width); +} + void TransposeUVWx16_C(const uint8_t* src, int src_stride, uint8_t* dst_a, diff --git a/source/rotate_neon.cc b/source/rotate_neon.cc index e0defb480..334a9f998 100644 --- a/source/rotate_neon.cc +++ b/source/rotate_neon.cc @@ -27,7 +27,7 @@ void TransposeWx8_NEON(const uint8_t* src, int dst_stride, int width) { const uint8_t* temp; - asm( + asm volatile ( // loops are on blocks of 8. loop will stop when // counter gets to or below 0. starting the counter // at w-8 allow for this @@ -95,7 +95,7 @@ void TransposeUVWx8_NEON(const uint8_t* src, int dst_stride_b, int width) { const uint8_t* temp; - asm( + asm volatile ( // loops are on blocks of 8. loop will stop when // counter gets to or below 0. starting the counter // at w-8 allow for this @@ -184,7 +184,7 @@ void Transpose4x4_32_NEON(const uint8_t* src, uint8_t* dst1 = dst + dst_stride; uint8_t* dst2 = dst1 + dst_stride; uint8_t* dst3 = dst2 + dst_stride; - asm volatile( + asm volatile ( // Main loop transpose 4x4. Read a column, write a row. "1: \n" "vld4.32 {d0[0], d2[0], d4[0], d6[0]}, [%0], %9 \n" diff --git a/source/rotate_neon64.cc b/source/rotate_neon64.cc index 0062d6746..dbf08edac 100644 --- a/source/rotate_neon64.cc +++ b/source/rotate_neon64.cc @@ -27,7 +27,8 @@ void TransposeWx16_NEON(const uint8_t* src, int dst_stride, int width) { const uint8_t* src_temp; - asm("1: \n" + asm volatile ( + "1: \n" "mov %[src_temp], %[src] \n" "ld1 {v16.16b}, [%[src_temp]], %[src_stride] \n" @@ -144,7 +145,7 @@ void TransposeUVWx8_NEON(const uint8_t* src, int dst_stride_b, int width) { const uint8_t* temp; - asm( + asm volatile ( // loops are on blocks of 8. loop will stop when // counter gets to or below 0. starting the counter // at w-8 allow for this @@ -238,7 +239,7 @@ void Transpose4x4_32_NEON(const uint8_t* src, uint8_t* dst1 = dst + dst_stride; uint8_t* dst2 = dst1 + dst_stride; uint8_t* dst3 = dst2 + dst_stride; - asm volatile( + asm volatile ( // Main loop transpose 4x4. Read a column, write a row. 
"1: \n" "ld4 {v0.s, v1.s, v2.s, v3.s}[0], [%0], %9 \n" diff --git a/source/row_gcc.cc b/source/row_gcc.cc index b601b60b4..f8f41860a 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -161,7 +161,8 @@ static const lvec8 kShuffleNV21 = { #ifdef HAS_J400TOARGBROW_SSE2 void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width) { - asm("pcmpeqb %%xmm5,%%xmm5 \n" + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" "pslld $0x18,%%xmm5 \n" LABELALIGN @@ -191,7 +192,8 @@ void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width) { void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) { - asm("pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000 + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000 "pslld $0x18,%%xmm5 \n" "movdqa %3,%%xmm4 \n" @@ -228,7 +230,8 @@ void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24, } void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width) { - asm("pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000 + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000 "pslld $0x18,%%xmm5 \n" "movdqa %3,%%xmm4 \n" @@ -266,7 +269,8 @@ void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width) { // Same code as RAWToARGB with different shuffler and A in low bits void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { - asm("pcmpeqb %%xmm5,%%xmm5 \n" // 0x000000ff + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" // 0x000000ff "psrld $0x18,%%xmm5 \n" "movdqa %3,%%xmm4 \n" @@ -305,7 +309,8 @@ void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { - asm("movdqa %3,%%xmm3 \n" + asm volatile ( + "movdqa %3,%%xmm3 \n" "movdqa %4,%%xmm4 \n" "movdqa %5,%%xmm5 \n" @@ -334,7 +339,8 @@ void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, } void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm("mov $0x1080108,%%eax \n" + asm volatile ( + "mov $0x1080108,%%eax \n" "movd %%eax,%%xmm5 \n" "pshufd $0x0,%%xmm5,%%xmm5 \n" "mov $0x20802080,%%eax \n" @@ -381,7 +387,8 @@ void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { } void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm("mov $0x1080108,%%eax \n" + asm volatile ( + "mov $0x1080108,%%eax \n" "movd %%eax,%%xmm5 \n" "pshufd $0x0,%%xmm5,%%xmm5 \n" "mov $0x42004200,%%eax \n" @@ -431,7 +438,8 @@ void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { } void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm("mov $0xf0f0f0f,%%eax \n" + asm volatile ( + "mov $0xf0f0f0f,%%eax \n" "movd %%eax,%%xmm4 \n" "pshufd $0x0,%%xmm4,%%xmm4 \n" "movdqa %%xmm4,%%xmm5 \n" @@ -467,7 +475,8 @@ void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { } void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { - asm("movdqa %3,%%xmm6 \n" + asm volatile ( + "movdqa %3,%%xmm6 \n" LABELALIGN "1: \n" @@ -504,7 +513,8 @@ void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { } void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) { - asm("movdqa %3,%%xmm6 \n" + asm volatile ( + "movdqa %3,%%xmm6 \n" LABELALIGN "1: \n" @@ -545,7 +555,8 @@ void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) { static const lvec32 kPermdRGB24_AVX = {0, 1, 2, 4, 5, 6, 3, 7}; void ARGBToRGB24Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { - asm("vbroadcastf128 %3,%%ymm6 \n" + asm volatile ( 
+ "vbroadcastf128 %3,%%ymm6 \n" "vmovdqa %4,%%ymm7 \n" LABELALIGN @@ -604,7 +615,8 @@ static const ulvec8 kPermARGBToRGB24_2 = { 50u, 52u, 53u, 54u, 56u, 57u, 58u, 60u, 61u, 62u}; void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width) { - asm("vmovdqa %3,%%ymm5 \n" + asm volatile ( + "vmovdqa %3,%%ymm5 \n" "vmovdqa %4,%%ymm6 \n" "vmovdqa %5,%%ymm7 \n" @@ -637,7 +649,8 @@ void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width) { #ifdef HAS_ARGBTORAWROW_AVX2 void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { - asm("vbroadcastf128 %3,%%ymm6 \n" + asm volatile ( + "vbroadcastf128 %3,%%ymm6 \n" "vmovdqa %4,%%ymm7 \n" LABELALIGN @@ -681,7 +694,8 @@ void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { #endif void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm("pcmpeqb %%xmm3,%%xmm3 \n" + asm volatile ( + "pcmpeqb %%xmm3,%%xmm3 \n" "psrld $0x1b,%%xmm3 \n" "pcmpeqb %%xmm4,%%xmm4 \n" "psrld $0x1a,%%xmm4 \n" @@ -720,7 +734,8 @@ void ARGBToRGB565DitherRow_SSE2(const uint8_t* src, uint8_t* dst, uint32_t dither4, int width) { - asm("movd %3,%%xmm6 \n" + asm volatile ( + "movd %3,%%xmm6 \n" "punpcklbw %%xmm6,%%xmm6 \n" "movdqa %%xmm6,%%xmm7 \n" "punpcklwd %%xmm6,%%xmm6 \n" @@ -767,7 +782,8 @@ void ARGBToRGB565DitherRow_AVX2(const uint8_t* src, uint8_t* dst, uint32_t dither4, int width) { - asm("vbroadcastss %3,%%xmm6 \n" + asm volatile ( + "vbroadcastss %3,%%xmm6 \n" "vpunpcklbw %%xmm6,%%xmm6,%%xmm6 \n" "vpermq $0xd8,%%ymm6,%%ymm6 \n" "vpunpcklwd %%ymm6,%%ymm6,%%ymm6 \n" @@ -808,7 +824,8 @@ void ARGBToRGB565DitherRow_AVX2(const uint8_t* src, #endif // HAS_ARGBTORGB565DITHERROW_AVX2 void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm("pcmpeqb %%xmm4,%%xmm4 \n" + asm volatile ( + "pcmpeqb %%xmm4,%%xmm4 \n" "psrld $0x1b,%%xmm4 \n" "movdqa %%xmm4,%%xmm5 \n" "pslld $0x5,%%xmm5 \n" @@ -848,7 +865,8 @@ void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { } void ARGBToARGB4444Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm("pcmpeqb %%xmm4,%%xmm4 \n" + asm volatile ( + "pcmpeqb %%xmm4,%%xmm4 \n" "psllw $0xc,%%xmm4 \n" "movdqa %%xmm4,%%xmm3 \n" "psrlw $0x8,%%xmm3 \n" @@ -910,7 +928,8 @@ static const uint32_t kMaskAG10 = 0xc000ff00; static const uint32_t kMulAG10 = 64 * 65536 + 1028; void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { - asm("movdqa %3,%%xmm2 \n" // shuffler for RB + asm volatile ( + "movdqa %3,%%xmm2 \n" // shuffler for RB "movd %4,%%xmm3 \n" // multipler for RB "movd %5,%%xmm4 \n" // mask for R10 B10 "movd %6,%%xmm5 \n" // mask for AG @@ -948,7 +967,8 @@ void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { } void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { - asm("movdqa %3,%%xmm2 \n" // shuffler for RB + asm volatile ( + "movdqa %3,%%xmm2 \n" // shuffler for RB "movd %4,%%xmm3 \n" // multipler for RB "movd %5,%%xmm4 \n" // mask for R10 B10 "movd %6,%%xmm5 \n" // mask for AG @@ -987,7 +1007,8 @@ void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { #ifdef HAS_ARGBTOAR30ROW_AVX2 void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { - asm("vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB + asm volatile ( + "vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB "vbroadcastss %4,%%ymm3 \n" // multipler for RB "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10 "vbroadcastss %6,%%ymm5 \n" // mask for AG @@ -1023,7 +1044,8 @@ void 
ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { #ifdef HAS_ABGRTOAR30ROW_AVX2 void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { - asm("vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB + asm volatile ( + "vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB "vbroadcastss %4,%%ymm3 \n" // multipler for RB "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10 "vbroadcastss %6,%%ymm5 \n" // mask for AG @@ -1068,7 +1090,7 @@ static const uvec8 kShuffleARGBToAB64Hi = {10, 10, 9, 9, 8, 8, 11, 11, void ARGBToAR64Row_SSSE3(const uint8_t* src_argb, uint16_t* dst_ar64, int width) { - asm(LABELALIGN + asm volatile (LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqa %%xmm0,%%xmm1 \n" @@ -1090,7 +1112,8 @@ void ARGBToAR64Row_SSSE3(const uint8_t* src_argb, void ARGBToAB64Row_SSSE3(const uint8_t* src_argb, uint16_t* dst_ab64, int width) { - asm("movdqa %3,%%xmm2 \n" + asm volatile ( + "movdqa %3,%%xmm2 \n" "movdqa %4,%%xmm3 \n" LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" @@ -1114,7 +1137,7 @@ void ARGBToAB64Row_SSSE3(const uint8_t* src_argb, void AR64ToARGBRow_SSSE3(const uint16_t* src_ar64, uint8_t* dst_argb, int width) { - asm(LABELALIGN + asm volatile (LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -1136,7 +1159,8 @@ void AR64ToARGBRow_SSSE3(const uint16_t* src_ar64, void AB64ToARGBRow_SSSE3(const uint16_t* src_ab64, uint8_t* dst_argb, int width) { - asm("movdqa %3,%%xmm2 \n" + asm volatile ( + "movdqa %3,%%xmm2 \n" LABELALIGN "1: \n" @@ -1162,7 +1186,7 @@ void AB64ToARGBRow_SSSE3(const uint16_t* src_ab64, void ARGBToAR64Row_AVX2(const uint8_t* src_argb, uint16_t* dst_ar64, int width) { - asm(LABELALIGN + asm volatile (LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" "vpermq $0xd8,%%ymm0,%%ymm0 \n" @@ -1187,7 +1211,8 @@ void ARGBToAR64Row_AVX2(const uint8_t* src_argb, void ARGBToAB64Row_AVX2(const uint8_t* src_argb, uint16_t* dst_ab64, int width) { - asm("vbroadcastf128 %3,%%ymm2 \n" + asm volatile ( + "vbroadcastf128 %3,%%ymm2 \n" "vbroadcastf128 %4,%%ymm3 \n" LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" @@ -1214,7 +1239,7 @@ void ARGBToAB64Row_AVX2(const uint8_t* src_argb, void AR64ToARGBRow_AVX2(const uint16_t* src_ar64, uint8_t* dst_argb, int width) { - asm(LABELALIGN + asm volatile (LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" @@ -1240,7 +1265,8 @@ void AR64ToARGBRow_AVX2(const uint16_t* src_ar64, void AB64ToARGBRow_AVX2(const uint16_t* src_ab64, uint8_t* dst_argb, int width) { - asm("vbroadcastf128 %3,%%ymm2 \n" LABELALIGN + asm volatile ( + "vbroadcastf128 %3,%%ymm2 \n" LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" @@ -1334,7 +1360,8 @@ void AB64ToARGBRow_AVX2(const uint16_t* src_ab64, #ifdef HAS_ARGBTOYROW_SSSE3 // Convert 16 ARGB pixels (64 bytes) to 16 Y values. void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { - asm("movdqa %3,%%xmm4 \n" + asm volatile ( + "movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" "movdqa %5,%%xmm7 \n" @@ -1354,7 +1381,8 @@ void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { // Convert 16 ARGB pixels (64 bytes) to 16 YJ values. // Same as ARGBToYRow but different coefficients, no add 16. 
void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { - asm("movdqa %3,%%xmm4 \n" + asm volatile ( + "movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" LABELALIGN RGBTOY(xmm5) @@ -1371,7 +1399,8 @@ void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { // Convert 16 ABGR pixels (64 bytes) to 16 YJ values. // Same as ABGRToYRow but different coefficients, no add 16. void ABGRToYJRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) { - asm("movdqa %3,%%xmm4 \n" + asm volatile ( + "movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" LABELALIGN RGBTOY(xmm5) @@ -1388,7 +1417,8 @@ void ABGRToYJRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) { // Convert 16 ARGB pixels (64 bytes) to 16 YJ values. // Same as ARGBToYRow but different coefficients, no add 16. void RGBAToYJRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) { - asm("movdqa %3,%%xmm4 \n" + asm volatile ( + "movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" LABELALIGN RGBTOY(xmm5) @@ -1411,7 +1441,8 @@ static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7}; // Convert 32 ARGB pixels (128 bytes) to 32 Y values. void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { - asm("vbroadcastf128 %3,%%ymm4 \n" + asm volatile ( + "vbroadcastf128 %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" "vbroadcastf128 %5,%%ymm7 \n" "vmovdqu %6,%%ymm6 \n" LABELALIGN RGBTOY_AVX2( @@ -1431,7 +1462,8 @@ void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { #ifdef HAS_ABGRTOYROW_AVX2 // Convert 32 ABGR pixels (128 bytes) to 32 Y values. void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) { - asm("vbroadcastf128 %3,%%ymm4 \n" + asm volatile ( + "vbroadcastf128 %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" "vbroadcastf128 %5,%%ymm7 \n" "vmovdqu %6,%%ymm6 \n" LABELALIGN RGBTOY_AVX2( @@ -1451,7 +1483,8 @@ void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) { #ifdef HAS_ARGBTOYJROW_AVX2 // Convert 32 ARGB pixels (128 bytes) to 32 Y values. void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { - asm("vbroadcastf128 %3,%%ymm4 \n" + asm volatile ( + "vbroadcastf128 %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" "vmovdqu %5,%%ymm6 \n" LABELALIGN RGBTOY_AVX2( ymm5) "vzeroupper \n" @@ -1469,7 +1502,8 @@ void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { #ifdef HAS_ABGRTOYJROW_AVX2 // Convert 32 ABGR pixels (128 bytes) to 32 Y values. void ABGRToYJRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) { - asm("vbroadcastf128 %3,%%ymm4 \n" + asm volatile ( + "vbroadcastf128 %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" "vmovdqu %5,%%ymm6 \n" LABELALIGN RGBTOY_AVX2( ymm5) "vzeroupper \n" @@ -1487,7 +1521,8 @@ void ABGRToYJRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) { #ifdef HAS_RGBATOYJROW_AVX2 // Convert 32 ARGB pixels (128 bytes) to 32 Y values. 
void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width) { - asm("vbroadcastf128 %3,%%ymm4 \n" + asm volatile ( + "vbroadcastf128 %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" "vmovdqu %5,%%ymm6 \n" LABELALIGN RGBTOY_AVX2( ymm5) "vzeroupper \n" @@ -1507,7 +1542,8 @@ void ARGBToUVRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm("movdqa %5,%%xmm3 \n" + asm volatile ( + "movdqa %5,%%xmm3 \n" "movdqa %6,%%xmm4 \n" "movdqa %7,%%xmm5 \n" "sub %1,%2 \n" @@ -1579,7 +1615,8 @@ void ARGBToUVRow_AVX2(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm("vbroadcastf128 %5,%%ymm5 \n" + asm volatile ( + "vbroadcastf128 %5,%%ymm5 \n" "vbroadcastf128 %6,%%ymm6 \n" "vbroadcastf128 %7,%%ymm7 \n" "sub %1,%2 \n" @@ -1641,7 +1678,8 @@ void ABGRToUVRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm("vbroadcastf128 %5,%%ymm5 \n" + asm volatile ( + "vbroadcastf128 %5,%%ymm5 \n" "vbroadcastf128 %6,%%ymm6 \n" "vbroadcastf128 %7,%%ymm7 \n" "sub %1,%2 \n" @@ -1703,7 +1741,8 @@ void ARGBToUVJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm("vbroadcastf128 %5,%%ymm5 \n" + asm volatile ( + "vbroadcastf128 %5,%%ymm5 \n" "vbroadcastf128 %6,%%ymm6 \n" "vbroadcastf128 %7,%%ymm7 \n" "sub %1,%2 \n" @@ -1767,7 +1806,8 @@ void ABGRToUVJRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm("vbroadcastf128 %5,%%ymm5 \n" + asm volatile ( + "vbroadcastf128 %5,%%ymm5 \n" "vbroadcastf128 %6,%%ymm6 \n" "vbroadcastf128 %7,%%ymm7 \n" "sub %1,%2 \n" @@ -1830,7 +1870,8 @@ void ARGBToUVJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm("movdqa %5,%%xmm3 \n" + asm volatile ( + "movdqa %5,%%xmm3 \n" "movdqa %6,%%xmm4 \n" "movdqa %7,%%xmm5 \n" "sub %1,%2 \n" @@ -1895,7 +1936,8 @@ void ABGRToUVJRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm("movdqa %5,%%xmm3 \n" + asm volatile ( + "movdqa %5,%%xmm3 \n" "movdqa %6,%%xmm4 \n" "movdqa %7,%%xmm5 \n" "sub %1,%2 \n" @@ -1959,7 +2001,8 @@ void ARGBToUV444Row_SSSE3(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm("movdqa %4,%%xmm3 \n" + asm volatile ( + "movdqa %4,%%xmm3 \n" "movdqa %5,%%xmm4 \n" "movdqa %6,%%xmm5 \n" "sub %1,%2 \n" @@ -2012,7 +2055,8 @@ void ARGBToUV444Row_SSSE3(const uint8_t* src_argb, #endif // HAS_ARGBTOUV444ROW_SSSE3 void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width) { - asm("movdqa %3,%%xmm4 \n" + asm volatile ( + "movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" "movdqa %5,%%xmm7 \n" @@ -2032,7 +2076,8 @@ void BGRAToUVRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm("movdqa %5,%%xmm3 \n" + asm volatile ( + "movdqa %5,%%xmm3 \n" "movdqa %6,%%xmm4 \n" "movdqa %7,%%xmm5 \n" "sub %1,%2 \n" @@ -2090,7 +2135,8 @@ void BGRAToUVRow_SSSE3(const uint8_t* src_bgra, } void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) { - asm("movdqa %3,%%xmm4 \n" + asm volatile ( + "movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" "movdqa %5,%%xmm7 \n" @@ -2106,7 +2152,8 @@ void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) { } void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) { - asm("movdqa %3,%%xmm4 \n" + asm volatile ( + "movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" "movdqa %5,%%xmm7 \n" @@ -2126,7 +2173,8 @@ void ABGRToUVRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm("movdqa 
%5,%%xmm3 \n" + asm volatile ( + "movdqa %5,%%xmm3 \n" "movdqa %6,%%xmm4 \n" "movdqa %7,%%xmm5 \n" "sub %1,%2 \n" @@ -2188,7 +2236,8 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm("movdqa %5,%%xmm3 \n" + asm volatile ( + "movdqa %5,%%xmm3 \n" "movdqa %6,%%xmm4 \n" "movdqa %7,%%xmm5 \n" "sub %1,%2 \n" @@ -2608,7 +2657,7 @@ void OMITFP I444AlphaToARGBRow_SSSE3(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm(YUVTORGB_SETUP( + asm volatile (YUVTORGB_SETUP( yuvconstants) "sub %[u_buf],%[v_buf] \n" LABELALIGN "1: \n" READYUVA444 @@ -2934,7 +2983,7 @@ void OMITFP I210AlphaToARGBRow_SSSE3(const uint16_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm(YUVTORGB_SETUP( + asm volatile (YUVTORGB_SETUP( yuvconstants) "sub %[u_buf],%[v_buf] \n" LABELALIGN "1: \n" READYUVA210 @@ -2966,7 +3015,7 @@ void OMITFP I410AlphaToARGBRow_SSSE3(const uint16_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm(YUVTORGB_SETUP( + asm volatile (YUVTORGB_SETUP( yuvconstants) "sub %[u_buf],%[v_buf] \n" LABELALIGN "1: \n" READYUVA410 @@ -3032,7 +3081,7 @@ void OMITFP I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm(YUVTORGB_SETUP( + asm volatile (YUVTORGB_SETUP( yuvconstants) "sub %[u_buf],%[v_buf] \n" LABELALIGN "1: \n" READYUVA422 @@ -3060,7 +3109,7 @@ void OMITFP NV12ToARGBRow_SSSE3(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm(YUVTORGB_SETUP( + asm volatile (YUVTORGB_SETUP( yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" LABELALIGN "1: \n" READNV12 @@ -3081,7 +3130,7 @@ void OMITFP NV21ToARGBRow_SSSE3(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm(YUVTORGB_SETUP( + asm volatile (YUVTORGB_SETUP( yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" LABELALIGN "1: \n" READNV21 @@ -3102,7 +3151,8 @@ void OMITFP YUY2ToARGBRow_SSSE3(const uint8_t* yuy2_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm("movdqa %[kShuffleYUY2Y],%%xmm6 \n" + asm volatile ( + "movdqa %[kShuffleYUY2Y],%%xmm6 \n" "movdqa %[kShuffleYUY2UV],%%xmm7 \n" YUVTORGB_SETUP( yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" @@ -3123,7 +3173,8 @@ void OMITFP UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm("movdqa %[kShuffleUYVYY],%%xmm6 \n" + asm volatile ( + "movdqa %[kShuffleUYVYY],%%xmm6 \n" "movdqa %[kShuffleUYVYUV],%%xmm7 \n" YUVTORGB_SETUP( yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" @@ -3145,7 +3196,7 @@ void OMITFP P210ToARGBRow_SSSE3(const uint16_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm(YUVTORGB_SETUP( + asm volatile (YUVTORGB_SETUP( yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" LABELALIGN "1: \n" READP210 @@ -3166,7 +3217,7 @@ void OMITFP P410ToARGBRow_SSSE3(const uint16_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm(YUVTORGB_SETUP( + asm volatile (YUVTORGB_SETUP( yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" LABELALIGN "1: \n" READP410 @@ -4000,7 +4051,7 @@ void OMITFP I210AlphaToARGBRow_AVX2(const uint16_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm(YUVTORGB_SETUP_AVX2( + asm volatile (YUVTORGB_SETUP_AVX2( yuvconstants) "sub %[u_buf],%[v_buf] \n" LABELALIGN "1: \n" READYUVA210_AVX2 @@ 
-4035,7 +4086,7 @@ void OMITFP I410AlphaToARGBRow_AVX2(const uint16_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm(YUVTORGB_SETUP_AVX2( + asm volatile (YUVTORGB_SETUP_AVX2( yuvconstants) "sub %[u_buf],%[v_buf] \n" LABELALIGN "1: \n" READYUVA410_AVX2 @@ -4110,7 +4161,7 @@ void OMITFP I444AlphaToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm(YUVTORGB_SETUP_AVX2( + asm volatile (YUVTORGB_SETUP_AVX2( yuvconstants) "sub %[u_buf],%[v_buf] \n" LABELALIGN "1: \n" READYUVA444_AVX2 @@ -4144,7 +4195,7 @@ void OMITFP I422AlphaToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm(YUVTORGB_SETUP_AVX2( + asm volatile (YUVTORGB_SETUP_AVX2( yuvconstants) "sub %[u_buf],%[v_buf] \n" LABELALIGN "1: \n" READYUVA422_AVX2 @@ -4220,7 +4271,7 @@ void OMITFP NV12ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm(YUVTORGB_SETUP_AVX2( + asm volatile (YUVTORGB_SETUP_AVX2( yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN "1: \n" READNV12_AVX2 @@ -4246,7 +4297,7 @@ void OMITFP NV21ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm(YUVTORGB_SETUP_AVX2( + asm volatile (YUVTORGB_SETUP_AVX2( yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN "1: \n" READNV21_AVX2 @@ -4272,7 +4323,8 @@ void OMITFP YUY2ToARGBRow_AVX2(const uint8_t* yuy2_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm("vbroadcastf128 %[kShuffleYUY2Y],%%ymm6 \n" + asm volatile ( + "vbroadcastf128 %[kShuffleYUY2Y],%%ymm6 \n" "vbroadcastf128 %[kShuffleYUY2UV],%%ymm7 \n" YUVTORGB_SETUP_AVX2( yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" @@ -4298,7 +4350,8 @@ void OMITFP UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm("vbroadcastf128 %[kShuffleUYVYY],%%ymm6 \n" + asm volatile ( + "vbroadcastf128 %[kShuffleUYVYY],%%ymm6 \n" "vbroadcastf128 %[kShuffleUYVYUV],%%ymm7 \n" YUVTORGB_SETUP_AVX2( yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" @@ -4325,7 +4378,7 @@ void OMITFP P210ToARGBRow_AVX2(const uint16_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm(YUVTORGB_SETUP_AVX2( + asm volatile (YUVTORGB_SETUP_AVX2( yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN "1: \n" READP210_AVX2 @@ -4351,7 +4404,7 @@ void OMITFP P410ToARGBRow_AVX2(const uint16_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm(YUVTORGB_SETUP_AVX2( + asm volatile (YUVTORGB_SETUP_AVX2( yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN "1: \n" READP410_AVX2 @@ -4448,7 +4501,8 @@ void I400ToARGBRow_SSE2(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm("movdqa 96(%3),%%xmm2 \n" // yg = 18997 = 1.164 + asm volatile ( + "movdqa 96(%3),%%xmm2 \n" // yg = 18997 = 1.164 "movdqa 128(%3),%%xmm3 \n" // ygb = 1160 = 1.164 * 16 "pcmpeqb %%xmm4,%%xmm4 \n" // 0xff000000 "pslld $0x18,%%xmm4 \n" @@ -4492,7 +4546,8 @@ void I400ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm("vmovdqa 96(%3),%%ymm2 \n" // yg = 18997 = 1.164 + asm volatile ( + "vmovdqa 96(%3),%%ymm2 \n" // yg = 18997 = 1.164 "vmovdqa 128(%3),%%ymm3 \n" // ygb = -1160 = 1.164*16 "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 
0xff000000 "vpslld $0x18,%%ymm4,%%ymm4 \n" @@ -4535,7 +4590,8 @@ static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) { intptr_t temp_width = (intptr_t)(width); - asm("movdqa %3,%%xmm5 \n" + asm volatile ( + "movdqa %3,%%xmm5 \n" LABELALIGN "1: \n" @@ -4556,7 +4612,8 @@ void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) { #ifdef HAS_MIRRORROW_AVX2 void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { intptr_t temp_width = (intptr_t)(width); - asm("vbroadcastf128 %3,%%ymm5 \n" + asm volatile ( + "vbroadcastf128 %3,%%ymm5 \n" LABELALIGN "1: \n" @@ -4583,7 +4640,8 @@ static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u, void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width) { intptr_t temp_width = (intptr_t)(width); - asm("movdqa %3,%%xmm5 \n" + asm volatile ( + "movdqa %3,%%xmm5 \n" LABELALIGN "1: \n" @@ -4604,7 +4662,8 @@ void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width) { #ifdef HAS_MIRRORUVROW_AVX2 void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width) { intptr_t temp_width = (intptr_t)(width); - asm("vbroadcastf128 %3,%%ymm5 \n" + asm volatile ( + "vbroadcastf128 %3,%%ymm5 \n" LABELALIGN "1: \n" @@ -4633,7 +4692,8 @@ void MirrorSplitUVRow_SSSE3(const uint8_t* src, uint8_t* dst_v, int width) { intptr_t temp_width = (intptr_t)(width); - asm("movdqa %4,%%xmm1 \n" + asm volatile ( + "movdqa %4,%%xmm1 \n" "lea -0x10(%0,%3,2),%0 \n" "sub %1,%2 \n" @@ -4672,7 +4732,8 @@ void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24, int width) { intptr_t temp_width = (intptr_t)(width); src_rgb24 += width * 3 - 48; - asm("movdqa %3,%%xmm4 \n" + asm volatile ( + "movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" LABELALIGN @@ -4706,7 +4767,8 @@ void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24, void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { intptr_t temp_width = (intptr_t)(width); - asm("lea -0x10(%0,%2,4),%0 \n" + asm volatile ( + "lea -0x10(%0,%2,4),%0 \n" LABELALIGN "1: \n" @@ -4730,7 +4792,8 @@ void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { intptr_t temp_width = (intptr_t)(width); - asm("vmovdqu %3,%%ymm5 \n" + asm volatile ( + "vmovdqu %3,%%ymm5 \n" LABELALIGN "1: \n" @@ -4753,7 +4816,8 @@ void SplitUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm("vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + asm volatile ( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpsrlw $0x8,%%ymm5,%%ymm5 \n" "sub %1,%2 \n" @@ -4790,7 +4854,8 @@ void SplitUVRow_SSE2(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm("pcmpeqb %%xmm5,%%xmm5 \n" + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" "sub %1,%2 \n" @@ -4826,7 +4891,8 @@ void DetileRow_SSE2(const uint8_t* src, ptrdiff_t src_tile_stride, uint8_t* dst, int width) { - asm("1: \n" + asm volatile ( + "1: \n" "movdqu (%0),%%xmm0 \n" "sub $0x10,%2 \n" "lea (%0,%3),%0 \n" @@ -4846,7 +4912,8 @@ void DetileRow_16_SSE2(const uint16_t* src, ptrdiff_t src_tile_stride, uint16_t* dst, int width) { - asm("1: \n" + asm volatile ( + "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" "lea (%0,%3,2),%0 \n" @@ -4868,7 +4935,8 @@ void DetileRow_16_AVX(const uint16_t* src, ptrdiff_t src_tile_stride, uint16_t* dst, int width) { - asm("1: \n" 
+ asm volatile ( + "1: \n" "vmovdqu (%0),%%ymm0 \n" "lea (%0,%3,2),%0 \n" "vmovdqu %%ymm0,(%1) \n" @@ -4892,7 +4960,8 @@ void DetileToYUY2_SSE2(const uint8_t* src_y, ptrdiff_t src_uv_tile_stride, uint8_t* dst_yuy2, int width) { - asm("1: \n" + asm volatile ( + "1: \n" "movdqu (%0),%%xmm0 \n" // Load 16 Y "sub $0x10,%3 \n" "lea (%0,%4),%0 \n" @@ -4930,7 +4999,8 @@ void DetileSplitUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm("movdqu %4,%%xmm1 \n" + asm volatile ( + "movdqu %4,%%xmm1 \n" "1: \n" "movdqu (%0),%%xmm0 \n" "lea (%0, %5),%0 \n" @@ -4956,7 +5026,8 @@ void MergeUVRow_AVX512BW(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width) { - asm("sub %0,%1 \n" + asm volatile ( + "sub %0,%1 \n" LABELALIGN "1: \n" @@ -4984,7 +5055,8 @@ void MergeUVRow_AVX2(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width) { - asm("sub %0,%1 \n" + asm volatile ( + "sub %0,%1 \n" LABELALIGN "1: \n" @@ -5012,7 +5084,8 @@ void MergeUVRow_SSE2(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width) { - asm("sub %0,%1 \n" + asm volatile ( + "sub %0,%1 \n" LABELALIGN "1: \n" @@ -5042,7 +5115,8 @@ void MergeUVRow_16_AVX2(const uint16_t* src_u, uint16_t* dst_uv, int depth, int width) { - asm("vmovd %4,%%xmm3 \n" + asm volatile ( + "vmovd %4,%%xmm3 \n" "vmovd %5,%%xmm4 \n" "sub %0,%1 \n" @@ -5080,7 +5154,8 @@ void SplitUVRow_16_AVX2(const uint16_t* src_uv, int depth, int width) { depth = 16 - depth; - asm("vmovd %4,%%xmm3 \n" + asm volatile ( + "vmovd %4,%%xmm3 \n" "vbroadcastf128 %5,%%ymm4 \n" "sub %1,%2 \n" @@ -5125,7 +5200,8 @@ void MultiplyRow_16_AVX2(const uint16_t* src_y, uint16_t* dst_y, int scale, int width) { - asm("vmovd %3,%%xmm3 \n" + asm volatile ( + "vmovd %3,%%xmm3 \n" "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n" "vbroadcastss %%xmm3,%%ymm3 \n" "sub %0,%1 \n" @@ -5161,7 +5237,8 @@ void DivideRow_16_AVX2(const uint16_t* src_y, uint16_t* dst_y, int scale, int width) { - asm("vmovd %3,%%xmm3 \n" + asm volatile ( + "vmovd %3,%%xmm3 \n" "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n" "vbroadcastss %%xmm3,%%ymm3 \n" "sub %0,%1 \n" @@ -5197,7 +5274,8 @@ void Convert16To8Row_SSSE3(const uint16_t* src_y, uint8_t* dst_y, int scale, int width) { - asm("movd %3,%%xmm2 \n" + asm volatile ( + "movd %3,%%xmm2 \n" "punpcklwd %%xmm2,%%xmm2 \n" "pshufd $0x0,%%xmm2,%%xmm2 \n" @@ -5226,7 +5304,8 @@ void Convert16To8Row_AVX2(const uint16_t* src_y, uint8_t* dst_y, int scale, int width) { - asm("vmovd %3,%%xmm2 \n" + asm volatile ( + "vmovd %3,%%xmm2 \n" "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n" "vbroadcastss %%xmm2,%%ymm2 \n" @@ -5261,7 +5340,8 @@ void Convert8To16Row_SSE2(const uint8_t* src_y, uint16_t* dst_y, int scale, int width) { - asm("movd %3,%%xmm2 \n" + asm volatile ( + "movd %3,%%xmm2 \n" "punpcklwd %%xmm2,%%xmm2 \n" "pshufd $0x0,%%xmm2,%%xmm2 \n" @@ -5292,7 +5372,8 @@ void Convert8To16Row_AVX2(const uint8_t* src_y, uint16_t* dst_y, int scale, int width) { - asm("vmovd %3,%%xmm2 \n" + asm volatile ( + "vmovd %3,%%xmm2 \n" "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n" "vbroadcastss %%xmm2,%%ymm2 \n" @@ -5347,7 +5428,7 @@ void SplitRGBRow_SSSE3(const uint8_t* src_rgb, uint8_t* dst_g, uint8_t* dst_b, int width) { - asm(LABELALIGN + asm volatile (LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -5421,7 +5502,7 @@ void MergeRGBRow_SSSE3(const uint8_t* src_r, const uint8_t* src_b, uint8_t* dst_rgb, int width) { - asm(LABELALIGN + asm volatile (LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu (%1),%%xmm1 \n" @@ -5476,7 +5557,8 @@ void 
MergeARGBRow_SSE2(const uint8_t* src_r, const uint8_t* src_a, uint8_t* dst_argb, int width) { - asm("sub %0,%1 \n" + asm volatile ( + "sub %0,%1 \n" "sub %0,%2 \n" "sub %0,%3 \n" @@ -5516,7 +5598,7 @@ void MergeXRGBRow_SSE2(const uint8_t* src_r, const uint8_t* src_b, uint8_t* dst_argb, int width) { - asm(LABELALIGN + asm volatile (LABELALIGN "1: \n" "movq (%2),%%xmm0 \n" // B @@ -5554,7 +5636,8 @@ void MergeARGBRow_AVX2(const uint8_t* src_r, const uint8_t* src_a, uint8_t* dst_argb, int width) { - asm("sub %0,%1 \n" + asm volatile ( + "sub %0,%1 \n" "sub %0,%2 \n" "sub %0,%3 \n" @@ -5642,7 +5725,8 @@ void SplitARGBRow_SSE2(const uint8_t* src_argb, uint8_t* dst_b, uint8_t* dst_a, int width) { - asm("sub %1,%2 \n" + asm volatile ( + "sub %1,%2 \n" "sub %1,%3 \n" "sub %1,%4 \n" @@ -5692,7 +5776,7 @@ void SplitXRGBRow_SSE2(const uint8_t* src_argb, uint8_t* dst_g, uint8_t* dst_b, int width) { - asm(LABELALIGN + asm volatile (LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" // 00-0F @@ -5741,7 +5825,8 @@ void SplitARGBRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_b, uint8_t* dst_a, int width) { - asm("movdqa %6,%%xmm3 \n" + asm volatile ( + "movdqa %6,%%xmm3 \n" "sub %1,%2 \n" "sub %1,%3 \n" "sub %1,%4 \n" @@ -5786,7 +5871,8 @@ void SplitXRGBRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_g, uint8_t* dst_b, int width) { - asm("movdqa %5,%%xmm3 \n" + asm volatile ( + "movdqa %5,%%xmm3 \n" LABELALIGN "1: \n" @@ -5826,7 +5912,8 @@ void SplitARGBRow_AVX2(const uint8_t* src_argb, uint8_t* dst_b, uint8_t* dst_a, int width) { - asm("sub %1,%2 \n" + asm volatile ( + "sub %1,%2 \n" "sub %1,%3 \n" "sub %1,%4 \n" "vmovdqa %7,%%ymm3 \n" @@ -5876,7 +5963,8 @@ void SplitXRGBRow_AVX2(const uint8_t* src_argb, uint8_t* dst_g, uint8_t* dst_b, int width) { - asm("vmovdqa %6,%%ymm3 \n" + asm volatile ( + "vmovdqa %6,%%ymm3 \n" "vbroadcastf128 %5,%%ymm4 \n" LABELALIGN @@ -5922,7 +6010,8 @@ void MergeXR30Row_AVX2(const uint16_t* src_r, int depth, int width) { int shift = depth - 10; - asm("sub %0,%1 \n" + asm volatile ( + "sub %0,%1 \n" "sub %0,%2 \n" "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants "vpsrlw $14,%%ymm5,%%ymm5 \n" @@ -5987,7 +6076,8 @@ void MergeAR64Row_AVX2(const uint16_t* src_r, int shift = 16 - depth; int mask = (1 << depth) - 1; mask = (mask << 16) + mask; - asm("sub %0,%1 \n" + asm volatile ( + "sub %0,%1 \n" "sub %0,%2 \n" "sub %0,%3 \n" "vmovdqa %8,%%ymm5 \n" @@ -6057,7 +6147,8 @@ void MergeXR64Row_AVX2(const uint16_t* src_r, int shift = 16 - depth; int mask = (1 << depth) - 1; mask = (mask << 16) + mask; - asm("sub %0,%1 \n" + asm volatile ( + "sub %0,%1 \n" "sub %0,%2 \n" "vmovdqa %7,%%ymm5 \n" "vmovd %5,%%xmm6 \n" @@ -6119,7 +6210,8 @@ void MergeARGB16To8Row_AVX2(const uint16_t* src_r, int depth, int width) { int shift = depth - 8; - asm("sub %0,%1 \n" + asm volatile ( + "sub %0,%1 \n" "sub %0,%2 \n" "sub %0,%3 \n" "vbroadcastf128 %7,%%ymm5 \n" @@ -6174,7 +6266,8 @@ void MergeXRGB16To8Row_AVX2(const uint16_t* src_r, int depth, int width) { int shift = depth - 8; - asm("sub %0,%1 \n" + asm volatile ( + "sub %0,%1 \n" "sub %0,%2 \n" "vbroadcastf128 %6,%%ymm5 \n" "vmovd %5,%%xmm6 \n" @@ -6217,7 +6310,8 @@ void MergeXRGB16To8Row_AVX2(const uint16_t* src_r, #ifdef HAS_COPYROW_SSE2 void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm("test $0xf,%0 \n" + asm volatile ( + "test $0xf,%0 \n" "jne 2f \n" "test $0xf,%1 \n" "jne 2f \n" @@ -6256,7 +6350,7 @@ void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { #ifdef HAS_COPYROW_AVX void CopyRow_AVX(const uint8_t* src, uint8_t* 
dst, int width) { - asm(LABELALIGN + asm volatile (LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" @@ -6279,7 +6373,8 @@ void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width) { // Multiple of 1. void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width) { size_t width_tmp = (size_t)(width); - asm("rep movsb \n" + asm volatile ( + "rep movsb \n" : "+S"(src), // %0 "+D"(dst), // %1 "+c"(width_tmp) // %2 @@ -6291,7 +6386,8 @@ void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width) { #ifdef HAS_ARGBCOPYALPHAROW_SSE2 // width in pixels void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm("pcmpeqb %%xmm0,%%xmm0 \n" + asm volatile ( + "pcmpeqb %%xmm0,%%xmm0 \n" "pslld $0x18,%%xmm0 \n" "pcmpeqb %%xmm1,%%xmm1 \n" "psrld $0x8,%%xmm1 \n" @@ -6325,7 +6421,8 @@ void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { #ifdef HAS_ARGBCOPYALPHAROW_AVX2 // width in pixels void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { - asm("vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" + asm volatile ( + "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" "vpsrld $0x8,%%ymm0,%%ymm0 \n" LABELALIGN @@ -6354,7 +6451,7 @@ void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb, uint8_t* dst_a, int width) { - asm(LABELALIGN + asm volatile (LABELALIGN "1: \n" "movdqu (%0), %%xmm0 \n" "movdqu 0x10(%0), %%xmm1 \n" @@ -6383,7 +6480,8 @@ static const uvec8 kShuffleAlphaShort_AVX2 = { void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb, uint8_t* dst_a, int width) { - asm("vmovdqa %3,%%ymm4 \n" + asm volatile ( + "vmovdqa %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" LABELALIGN @@ -6418,7 +6516,8 @@ void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb, #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 // width in pixels void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm("pcmpeqb %%xmm0,%%xmm0 \n" + asm volatile ( + "pcmpeqb %%xmm0,%%xmm0 \n" "pslld $0x18,%%xmm0 \n" "pcmpeqb %%xmm1,%%xmm1 \n" "psrld $0x8,%%xmm1 \n" @@ -6454,7 +6553,8 @@ void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 // width in pixels void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { - asm("vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" + asm volatile ( + "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" "vpsrld $0x8,%%ymm0,%%ymm0 \n" LABELALIGN @@ -6484,7 +6584,8 @@ void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { void SetRow_X86(uint8_t* dst, uint8_t v8, int width) { size_t width_tmp = (size_t)(width >> 2); const uint32_t v32 = v8 * 0x01010101u; // Duplicate byte to all bytes. 
- asm("rep stosl \n" + asm volatile ( + "rep stosl \n" : "+D"(dst), // %0 "+c"(width_tmp) // %1 : "a"(v32) // %2 @@ -6493,7 +6594,8 @@ void SetRow_X86(uint8_t* dst, uint8_t v8, int width) { void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) { size_t width_tmp = (size_t)(width); - asm("rep stosb \n" + asm volatile ( + "rep stosb \n" : "+D"(dst), // %0 "+c"(width_tmp) // %1 : "a"(v8) // %2 @@ -6502,7 +6604,8 @@ void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) { void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width) { size_t width_tmp = (size_t)(width); - asm("rep stosl \n" + asm volatile ( + "rep stosl \n" : "+D"(dst_argb), // %0 "+c"(width_tmp) // %1 : "a"(v32) // %2 @@ -6512,7 +6615,8 @@ void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width) { #ifdef HAS_YUY2TOYROW_SSE2 void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { - asm("pcmpeqb %%xmm5,%%xmm5 \n" + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" LABELALIGN @@ -6538,7 +6642,7 @@ void YUY2ToNVUVRow_SSE2(const uint8_t* src_yuy2, int stride_yuy2, uint8_t* dst_uv, int width) { - asm(LABELALIGN + asm volatile (LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -6566,7 +6670,8 @@ void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm("pcmpeqb %%xmm5,%%xmm5 \n" + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" "sub %1,%2 \n" @@ -6604,7 +6709,8 @@ void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm("pcmpeqb %%xmm5,%%xmm5 \n" + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" "sub %1,%2 \n" @@ -6635,7 +6741,7 @@ void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2, } void UYVYToYRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { - asm(LABELALIGN + asm volatile (LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -6659,7 +6765,8 @@ void UYVYToUVRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm("pcmpeqb %%xmm5,%%xmm5 \n" + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" "sub %1,%2 \n" @@ -6697,7 +6804,8 @@ void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm("pcmpeqb %%xmm5,%%xmm5 \n" + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" "sub %1,%2 \n" @@ -6730,7 +6838,8 @@ void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy, #ifdef HAS_YUY2TOYROW_AVX2 void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { - asm("vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + asm volatile ( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpsrlw $0x8,%%ymm5,%%ymm5 \n" LABELALIGN @@ -6758,7 +6867,7 @@ void YUY2ToNVUVRow_AVX2(const uint8_t* src_yuy2, int stride_yuy2, uint8_t* dst_uv, int width) { - asm(LABELALIGN + asm volatile (LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" @@ -6786,7 +6895,8 @@ void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm("vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + asm volatile ( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpsrlw $0x8,%%ymm5,%%ymm5 \n" "sub %1,%2 \n" @@ -6825,7 +6935,8 @@ void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm("vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + asm volatile ( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpsrlw $0x8,%%ymm5,%%ymm5 \n" "sub %1,%2 \n" @@ -6859,7 +6970,7 @@ void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2, } void UYVYToYRow_AVX2(const 
uint8_t* src_uyvy, uint8_t* dst_y, int width) { - asm(LABELALIGN + asm volatile (LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" @@ -6884,7 +6995,8 @@ void UYVYToUVRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm("vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + asm volatile ( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpsrlw $0x8,%%ymm5,%%ymm5 \n" "sub %1,%2 \n" @@ -6923,7 +7035,8 @@ void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm("vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + asm volatile ( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpsrlw $0x8,%%ymm5,%%ymm5 \n" "sub %1,%2 \n" @@ -6967,7 +7080,8 @@ void ARGBBlendRow_SSSE3(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm("pcmpeqb %%xmm7,%%xmm7 \n" + asm volatile ( + "pcmpeqb %%xmm7,%%xmm7 \n" "psrlw $0xf,%%xmm7 \n" "pcmpeqb %%xmm6,%%xmm6 \n" "psrlw $0x8,%%xmm6 \n" @@ -7054,7 +7168,8 @@ void BlendPlaneRow_SSSE3(const uint8_t* src0, const uint8_t* alpha, uint8_t* dst, int width) { - asm("pcmpeqb %%xmm5,%%xmm5 \n" + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" "psllw $0x8,%%xmm5 \n" "mov $0x80808080,%%eax \n" "movd %%eax,%%xmm6 \n" @@ -7105,7 +7220,8 @@ void BlendPlaneRow_AVX2(const uint8_t* src0, const uint8_t* alpha, uint8_t* dst, int width) { - asm("vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + asm volatile ( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpsllw $0x8,%%ymm5,%%ymm5 \n" "mov $0x80808080,%%eax \n" "vmovd %%eax,%%xmm6 \n" @@ -7164,7 +7280,8 @@ static const vec8 kAttenuateShuffle = {6, -128, 6, -128, 6, -128, void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - asm("movdqa %3,%%xmm4 \n" + asm volatile ( + "movdqa %3,%%xmm4 \n" "pcmpeqb %%xmm5,%%xmm5 \n" "pslld $0x18,%%xmm5 \n" "pxor %%xmm6,%%xmm6 \n" @@ -7218,7 +7335,8 @@ static const lvec8 kAttenuateShuffle_AVX2 = { void ARGBAttenuateRow_AVX2(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - asm("vmovdqa %3,%%ymm4 \n" + asm volatile ( + "vmovdqa %3,%%ymm4 \n" "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpslld $0x18,%%ymm5,%%ymm5 \n" "vpxor %%ymm6,%%ymm6,%%ymm6 \n" @@ -7311,7 +7429,8 @@ void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb, uint8_t* dst_argb, int width) { uintptr_t alpha; - asm("sub %0,%1 \n" + asm volatile ( + "sub %0,%1 \n" "vbroadcastf128 %5,%%ymm5 \n" // 8 pixel loop. @@ -7372,7 +7491,8 @@ void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb, #ifdef HAS_ARGBGRAYROW_SSSE3 // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - asm("movdqa %3,%%xmm4 \n" + asm volatile ( + "movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" // 8 pixel loop. @@ -7433,7 +7553,8 @@ static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0, // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. 
void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) { - asm("movdqa %2,%%xmm2 \n" + asm volatile ( + "movdqa %2,%%xmm2 \n" "movdqa %3,%%xmm3 \n" "movdqa %4,%%xmm4 \n" @@ -7493,7 +7614,8 @@ void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, const int8_t* matrix_argb, int width) { - asm("movdqu (%3),%%xmm5 \n" + asm volatile ( + "movdqu (%3),%%xmm5 \n" "pshufd $0x00,%%xmm5,%%xmm2 \n" "pshufd $0x55,%%xmm5,%%xmm3 \n" "pshufd $0xaa,%%xmm5,%%xmm4 \n" @@ -7557,7 +7679,8 @@ void ARGBQuantizeRow_SSE2(uint8_t* dst_argb, int interval_size, int interval_offset, int width) { - asm("movd %2,%%xmm2 \n" + asm volatile ( + "movd %2,%%xmm2 \n" "movd %3,%%xmm3 \n" "movd %4,%%xmm4 \n" "pshuflw $0x40,%%xmm2,%%xmm2 \n" @@ -7607,7 +7730,8 @@ void ARGBShadeRow_SSE2(const uint8_t* src_argb, uint8_t* dst_argb, int width, uint32_t value) { - asm("movd %3,%%xmm2 \n" + asm volatile ( + "movd %3,%%xmm2 \n" "punpcklbw %%xmm2,%%xmm2 \n" "punpcklqdq %%xmm2,%%xmm2 \n" @@ -7642,7 +7766,8 @@ void ARGBMultiplyRow_SSE2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm("pxor %%xmm5,%%xmm5 \n" + asm volatile ( + "pxor %%xmm5,%%xmm5 \n" // 4 pixel loop. LABELALIGN @@ -7679,7 +7804,8 @@ void ARGBMultiplyRow_AVX2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm("vpxor %%ymm5,%%ymm5,%%ymm5 \n" + asm volatile ( + "vpxor %%ymm5,%%ymm5,%%ymm5 \n" // 4 pixel loop. LABELALIGN @@ -7831,7 +7957,8 @@ void SobelXRow_SSE2(const uint8_t* src_y0, const uint8_t* src_y2, uint8_t* dst_sobelx, int width) { - asm("sub %0,%1 \n" + asm volatile ( + "sub %0,%1 \n" "sub %0,%2 \n" "sub %0,%3 \n" "pxor %%xmm5,%%xmm5 \n" @@ -7884,7 +8011,8 @@ void SobelYRow_SSE2(const uint8_t* src_y0, const uint8_t* src_y1, uint8_t* dst_sobely, int width) { - asm("sub %0,%1 \n" + asm volatile ( + "sub %0,%1 \n" "sub %0,%2 \n" "pxor %%xmm5,%%xmm5 \n" @@ -7936,7 +8064,8 @@ void SobelRow_SSE2(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_argb, int width) { - asm("sub %0,%1 \n" + asm volatile ( + "sub %0,%1 \n" "pcmpeqb %%xmm5,%%xmm5 \n" "pslld $0x18,%%xmm5 \n" @@ -7982,7 +8111,8 @@ void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_y, int width) { - asm("sub %0,%1 \n" + asm volatile ( + "sub %0,%1 \n" "pcmpeqb %%xmm5,%%xmm5 \n" "pslld $0x18,%%xmm5 \n" @@ -8016,7 +8146,8 @@ void SobelXYRow_SSE2(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_argb, int width) { - asm("sub %0,%1 \n" + asm volatile ( + "sub %0,%1 \n" "pcmpeqb %%xmm5,%%xmm5 \n" // 8 pixel loop. 
@@ -8063,7 +8194,8 @@ void ComputeCumulativeSumRow_SSE2(const uint8_t* row, int32_t* cumsum, const int32_t* previous_cumsum, int width) { - asm("pxor %%xmm0,%%xmm0 \n" + asm volatile ( + "pxor %%xmm0,%%xmm0 \n" "pxor %%xmm1,%%xmm1 \n" "sub $0x4,%3 \n" "jl 49f \n" @@ -8142,7 +8274,8 @@ void CumulativeSumToAverageRow_SSE2(const int32_t* topleft, int area, uint8_t* dst, int count) { - asm("movd %5,%%xmm5 \n" + asm volatile ( + "movd %5,%%xmm5 \n" "cvtdq2ps %%xmm5,%%xmm5 \n" "rcpss %%xmm5,%%xmm4 \n" "pshufd $0x0,%%xmm4,%%xmm4 \n" @@ -8276,7 +8409,8 @@ void ARGBAffineRow_SSE2(const uint8_t* src_argb, int width) { intptr_t src_argb_stride_temp = src_argb_stride; intptr_t temp; - asm("movq (%3),%%xmm2 \n" + asm volatile ( + "movq (%3),%%xmm2 \n" "movq 0x08(%3),%%xmm7 \n" "shl $0x10,%1 \n" "add $0x4,%1 \n" @@ -8360,7 +8494,8 @@ void InterpolateRow_SSSE3(uint8_t* dst_ptr, ptrdiff_t src_stride, int width, int source_y_fraction) { - asm("sub %1,%0 \n" + asm volatile ( + "sub %1,%0 \n" "cmp $0x0,%3 \n" "je 100f \n" "cmp $0x80,%3 \n" @@ -8440,7 +8575,8 @@ void InterpolateRow_AVX2(uint8_t* dst_ptr, ptrdiff_t src_stride, int width, int source_y_fraction) { - asm("sub %1,%0 \n" + asm volatile ( + "sub %1,%0 \n" "cmp $0x0,%3 \n" "je 100f \n" "cmp $0x80,%3 \n" @@ -8516,7 +8652,8 @@ void ARGBShuffleRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, const uint8_t* shuffler, int width) { - asm("movdqu (%3),%%xmm5 \n" + asm volatile ( + "movdqu (%3),%%xmm5 \n" LABELALIGN "1: \n" @@ -8544,7 +8681,8 @@ void ARGBShuffleRow_AVX2(const uint8_t* src_argb, uint8_t* dst_argb, const uint8_t* shuffler, int width) { - asm("vbroadcastf128 (%3),%%ymm5 \n" + asm volatile ( + "vbroadcastf128 (%3),%%ymm5 \n" LABELALIGN "1: \n" @@ -8573,7 +8711,8 @@ void I422ToYUY2Row_SSE2(const uint8_t* src_y, const uint8_t* src_v, uint8_t* dst_yuy2, int width) { - asm("sub %1,%2 \n" + asm volatile ( + "sub %1,%2 \n" LABELALIGN "1: \n" @@ -8607,7 +8746,8 @@ void I422ToUYVYRow_SSE2(const uint8_t* src_y, const uint8_t* src_v, uint8_t* dst_uyvy, int width) { - asm("sub %1,%2 \n" + asm volatile ( + "sub %1,%2 \n" LABELALIGN "1: \n" @@ -8641,7 +8781,8 @@ void I422ToYUY2Row_AVX2(const uint8_t* src_y, const uint8_t* src_v, uint8_t* dst_yuy2, int width) { - asm("sub %1,%2 \n" + asm volatile ( + "sub %1,%2 \n" LABELALIGN "1: \n" @@ -8678,7 +8819,8 @@ void I422ToUYVYRow_AVX2(const uint8_t* src_y, const uint8_t* src_v, uint8_t* dst_uyvy, int width) { - asm("sub %1,%2 \n" + asm volatile ( + "sub %1,%2 \n" LABELALIGN "1: \n" @@ -8714,7 +8856,8 @@ void ARGBPolynomialRow_SSE2(const uint8_t* src_argb, uint8_t* dst_argb, const float* poly, int width) { - asm("pxor %%xmm3,%%xmm3 \n" + asm volatile ( + "pxor %%xmm3,%%xmm3 \n" // 2 pixel loop. 
LABELALIGN @@ -8768,7 +8911,8 @@ void ARGBPolynomialRow_AVX2(const uint8_t* src_argb, uint8_t* dst_argb, const float* poly, int width) { - asm("vbroadcastf128 (%3),%%ymm4 \n" + asm volatile ( + "vbroadcastf128 (%3),%%ymm4 \n" "vbroadcastf128 0x10(%3),%%ymm5 \n" "vbroadcastf128 0x20(%3),%%ymm6 \n" "vbroadcastf128 0x30(%3),%%ymm7 \n" @@ -8810,7 +8954,8 @@ void HalfFloatRow_SSE2(const uint16_t* src, float scale, int width) { scale *= kScaleBias; - asm("movd %3,%%xmm4 \n" + asm volatile ( + "movd %3,%%xmm4 \n" "pshufd $0x0,%%xmm4,%%xmm4 \n" "pxor %%xmm5,%%xmm5 \n" "sub %0,%1 \n" @@ -8847,7 +8992,8 @@ void HalfFloatRow_AVX2(const uint16_t* src, float scale, int width) { scale *= kScaleBias; - asm("vbroadcastss %3, %%ymm4 \n" + asm volatile ( + "vbroadcastss %3, %%ymm4 \n" "vpxor %%ymm5,%%ymm5,%%ymm5 \n" "sub %0,%1 \n" @@ -8887,7 +9033,8 @@ void HalfFloatRow_F16C(const uint16_t* src, uint16_t* dst, float scale, int width) { - asm("vbroadcastss %3, %%ymm4 \n" + asm volatile ( + "vbroadcastss %3, %%ymm4 \n" "sub %0,%1 \n" // 16 pixel loop. @@ -8921,7 +9068,8 @@ void HalfFloatRow_F16C(const uint16_t* src, #ifdef HAS_HALFFLOATROW_F16C void HalfFloat1Row_F16C(const uint16_t* src, uint16_t* dst, float, int width) { - asm("sub %0,%1 \n" + asm volatile ( + "sub %0,%1 \n" // 16 pixel loop. LABELALIGN "1: \n" @@ -9017,7 +9165,8 @@ void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb, uint32_t lumacoeff) { uintptr_t pixel_temp; uintptr_t table_temp; - asm("movd %6,%%xmm3 \n" + asm volatile ( + "movd %6,%%xmm3 \n" "pshufd $0x0,%%xmm3,%%xmm3 \n" "pcmpeqb %%xmm4,%%xmm4 \n" "psllw $0x8,%%xmm4 \n" @@ -9120,7 +9269,8 @@ void NV21ToYUV24Row_SSSE3(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_yuv24, int width) { - asm("sub %0,%1 \n" + asm volatile ( + "sub %0,%1 \n" "movdqa (%4),%%xmm4 \n" // 3 shuffler constants "movdqa 16(%4),%%xmm5 \n" "movdqa 32(%4),%%xmm6 \n" @@ -9157,7 +9307,8 @@ void NV21ToYUV24Row_AVX2(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_yuv24, int width) { - asm("sub %0,%1 \n" + asm volatile ( + "sub %0,%1 \n" "vbroadcastf128 (%4),%%ymm4 \n" // 3 shuffler constants "vbroadcastf128 16(%4),%%ymm5 \n" "vbroadcastf128 32(%4),%%ymm6 \n" @@ -9204,7 +9355,8 @@ void NV21ToYUV24Row_AVX512(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_yuv24, int width) { - asm("sub %0,%1 \n" + asm volatile ( + "sub %0,%1 \n" "vmovdqa (%4),%%ymm4 \n" // 3 shuffler constants "vmovdqa 32(%4),%%ymm5 \n" "vmovdqa 64(%4),%%ymm6 \n" LABELALIGN @@ -9242,7 +9394,8 @@ static const uvec8 kShuffleUVToVU = {1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u, // Convert UV plane of NV12 to VU of NV21. 
void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width) { - asm("movdqu %3,%%xmm5 \n" + asm volatile ( + "movdqu %3,%%xmm5 \n" LABELALIGN "1: \n" @@ -9266,7 +9419,8 @@ void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width) { #ifdef HAS_SWAPUVROW_AVX2 void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width) { - asm("vbroadcastf128 %3,%%ymm5 \n" + asm volatile ( + "vbroadcastf128 %3,%%ymm5 \n" LABELALIGN "1: \n" @@ -9295,7 +9449,8 @@ void HalfMergeUVRow_SSSE3(const uint8_t* src_u, int src_stride_v, uint8_t* dst_uv, int width) { - asm("pcmpeqb %%xmm4,%%xmm4 \n" + asm volatile ( + "pcmpeqb %%xmm4,%%xmm4 \n" "psrlw $0xf,%%xmm4 \n" "packuswb %%xmm4,%%xmm4 \n" "pxor %%xmm5,%%xmm5 \n" @@ -9340,7 +9495,8 @@ void HalfMergeUVRow_AVX2(const uint8_t* src_u, int src_stride_v, uint8_t* dst_uv, int width) { - asm("vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + asm volatile ( + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" "vpsrlw $0xf,%%ymm4,%%ymm4 \n" "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" "vpxor %%ymm5,%%ymm5,%%ymm5 \n" @@ -9381,7 +9537,8 @@ void HalfMergeUVRow_AVX2(const uint8_t* src_u, } void ClampFloatToZero_SSE2(const float* src_x, float* dst_y, int width) { - asm("pxor %%xmm1,%%xmm1 \n" + asm volatile ( + "pxor %%xmm1,%%xmm1 \n" LABELALIGN "1: \n" diff --git a/source/row_lasx.cc b/source/row_lasx.cc index be85022e8..6d49aa5e8 100644 --- a/source/row_lasx.cc +++ b/source/row_lasx.cc @@ -2037,7 +2037,7 @@ static void ARGBToYMatrixRow_LASX(const uint8_t* src_argb, int width, const struct RgbConstants* rgbconstants) { int32_t shuff[8] = {0, 4, 1, 5, 2, 6, 3, 7}; - asm volatile( + asm volatile ( "xvldrepl.b $xr0, %3, 0 \n\t" // load rgbconstants "xvldrepl.b $xr1, %3, 1 \n\t" // load rgbconstants "xvldrepl.b $xr2, %3, 2 \n\t" // load rgbconstants @@ -2099,7 +2099,7 @@ static void RGBAToYMatrixRow_LASX(const uint8_t* src_rgba, int width, const struct RgbConstants* rgbconstants) { int32_t shuff[8] = {0, 4, 1, 5, 2, 6, 3, 7}; - asm volatile( + asm volatile ( "xvldrepl.b $xr0, %3, 0 \n\t" // load rgbconstants "xvldrepl.b $xr1, %3, 1 \n\t" // load rgbconstants "xvldrepl.b $xr2, %3, 2 \n\t" // load rgbconstants @@ -2163,7 +2163,7 @@ static void RGBToYMatrixRow_LASX(const uint8_t* src_rgba, 1, 0, 4, 0, 7, 0, 10, 0, 13, 0, 16, 0, 19, 0, 22, 0, 25, 0, 28, 0, 31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0, 25, 0, 28, 0, 31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0}; - asm volatile( + asm volatile ( "xvldrepl.b $xr0, %3, 0 \n\t" // load rgbconstants "xvldrepl.b $xr1, %3, 1 \n\t" // load rgbconstants "xvldrepl.b $xr2, %3, 2 \n\t" // load rgbconstants diff --git a/source/row_lsx.cc b/source/row_lsx.cc index fa088c9e7..09f206cab 100644 --- a/source/row_lsx.cc +++ b/source/row_lsx.cc @@ -2805,8 +2805,7 @@ static void ARGBToYMatrixRow_LSX(const uint8_t* src_argb, uint8_t* dst_y, int width, const struct RgbConstants* rgbconstants) { - asm volatile( - "vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants + asm("vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants "vldrepl.b $vr1, %3, 1 \n\t" // load rgbconstants "vldrepl.b $vr2, %3, 2 \n\t" // load rgbconstants "vldrepl.h $vr3, %3, 4 \n\t" // load rgbconstants @@ -2864,8 +2863,7 @@ static void RGBAToYMatrixRow_LSX(const uint8_t* src_rgba, uint8_t* dst_y, int width, const struct RgbConstants* rgbconstants) { - asm volatile( - "vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants + asm("vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants "vldrepl.b $vr1, %3, 1 \n\t" // load rgbconstants "vldrepl.b $vr2, %3, 2 \n\t" // load rgbconstants "vldrepl.h $vr3, %3, 4 \n\t" // load 
rgbconstants @@ -2922,8 +2920,7 @@ static void RGBToYMatrixRow_LSX(const uint8_t* src_rgba, 7, 9, 10, 12, 13, 15, 1, 0, 4, 0, 7, 0, 10, 0, 13, 0, 16, 0, 19, 0, 22, 0, 25, 0, 28, 0, 31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0}; - asm volatile( - "vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants + asm("vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants "vldrepl.b $vr1, %3, 1 \n\t" // load rgbconstants "vldrepl.b $vr2, %3, 2 \n\t" // load rgbconstants "vldrepl.h $vr3, %3, 4 \n\t" // load rgbconstants diff --git a/source/row_neon.cc b/source/row_neon.cc index 8adbbbb5d..ef9e1c3c5 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -140,7 +140,7 @@ void I444ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile( + asm volatile ( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" "1: \n" READYUV444 YUVTORGB @@ -164,7 +164,7 @@ void I444ToRGB24Row_NEON(const uint8_t* src_y, uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width) { - asm volatile( + asm volatile ( YUVTORGB_SETUP "1: \n" READYUV444 YUVTORGB RGBTORGB8 @@ -187,7 +187,7 @@ void I422ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile( + asm volatile ( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" "1: \n" READYUV422 YUVTORGB @@ -212,7 +212,7 @@ void I444AlphaToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile( + asm volatile ( YUVTORGB_SETUP "1: \n" READYUV444 YUVTORGB RGBTORGB8 @@ -238,7 +238,7 @@ void I422AlphaToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile( + asm volatile ( YUVTORGB_SETUP "1: \n" READYUV422 YUVTORGB RGBTORGB8 @@ -263,7 +263,7 @@ void I422ToRGBARow_NEON(const uint8_t* src_y, uint8_t* dst_rgba, const struct YuvConstants* yuvconstants, int width) { - asm volatile( + asm volatile ( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" "1: \n" READYUV422 YUVTORGB @@ -285,7 +285,7 @@ void I422ToRGB24Row_NEON(const uint8_t* src_y, uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width) { - asm volatile( + asm volatile ( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" "1: \n" READYUV422 YUVTORGB @@ -316,7 +316,7 @@ void I422ToRGB565Row_NEON(const uint8_t* src_y, uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width) { - asm volatile( + asm volatile ( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" "1: \n" READYUV422 YUVTORGB @@ -348,7 +348,7 @@ void I422ToARGB1555Row_NEON(const uint8_t* src_y, uint8_t* dst_argb1555, const struct YuvConstants* yuvconstants, int width) { - asm volatile( + asm volatile ( YUVTORGB_SETUP "1: \n" READYUV422 YUVTORGB RGBTORGB8 @@ -381,7 +381,7 @@ void I422ToARGB4444Row_NEON(const uint8_t* src_y, uint8_t* dst_argb4444, const struct YuvConstants* yuvconstants, int width) { - asm volatile( + asm volatile ( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" "vmov.u8 d7, #0x0f \n" // vbic bits to clear @@ -404,7 +404,7 @@ void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile( + asm volatile ( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" "1: \n" READYUV400 YUVTORGB @@ -421,7 +421,7 @@ void I400ToARGBRow_NEON(const uint8_t* src_y, } void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( "vmov.u8 d23, #255 \n" "1: \n" "vld1.8 {d20}, [%0]! 
\n" @@ -442,7 +442,7 @@ void NV12ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile( + asm volatile ( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" "1: \n" READNV12 YUVTORGB RGBTORGB8 @@ -463,7 +463,7 @@ void NV21ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile( + asm volatile ( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" "1: \n" READNV21 YUVTORGB RGBTORGB8 @@ -484,7 +484,7 @@ void NV12ToRGB24Row_NEON(const uint8_t* src_y, uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width) { - asm volatile( + asm volatile ( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" "1: \n" READNV12 YUVTORGB RGBTORGB8 @@ -505,7 +505,7 @@ void NV21ToRGB24Row_NEON(const uint8_t* src_y, uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width) { - asm volatile( + asm volatile ( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" "1: \n" READNV21 YUVTORGB RGBTORGB8 @@ -526,7 +526,7 @@ void NV12ToRGB565Row_NEON(const uint8_t* src_y, uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width) { - asm volatile( + asm volatile ( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" "1: \n" READNV12 YUVTORGB RGBTORGB8 @@ -546,7 +546,7 @@ void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile( + asm volatile ( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" "1: \n" READYUY2 YUVTORGB RGBTORGB8 @@ -565,7 +565,7 @@ void UYVYToARGBRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile( + asm volatile ( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" "1: \n" READUYVY YUVTORGB RGBTORGB8 @@ -585,7 +585,7 @@ void SplitUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile( + asm volatile ( "1: \n" "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV "subs %3, %3, #16 \n" // 16 processed per loop @@ -609,7 +609,7 @@ void DetileRow_NEON(const uint8_t* src, ptrdiff_t src_tile_stride, uint8_t* dst, int width) { - asm volatile( + asm volatile ( "1: \n" "vld1.8 {q0}, [%0], %3 \n" // load 16 bytes "subs %2, %2, #16 \n" // 16 processed per loop @@ -629,7 +629,7 @@ void DetileRow_16_NEON(const uint16_t* src, ptrdiff_t src_tile_stride, uint16_t* dst, int width) { - asm volatile( + asm volatile ( "1: \n" "vld1.16 {q0, q1}, [%0], %3 \n" // load 16 pixels "subs %2, %2, #16 \n" // 16 processed per loop @@ -650,7 +650,7 @@ void DetileSplitUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile( + asm volatile ( "1: \n" "vld2.8 {d0, d1}, [%0], %4 \n" "subs %3, %3, #16 \n" @@ -675,7 +675,7 @@ void DetileToYUY2_NEON(const uint8_t* src_y, ptrdiff_t src_uv_tile_stride, uint8_t* dst_yuy2, int width) { - asm volatile( + asm volatile ( "1: \n" "vld1.8 {q0}, [%0], %4 \n" // Load 16 Y "pld [%0, #1792] \n" @@ -701,7 +701,7 @@ void DetileToYUY2_NEON(const uint8_t* src_y, ptrdiff_t src_uv_tile_stride, uint8_t* dst_yuy2, int width) { - asm volatile( + asm volatile ( "1: \n" "vld1.8 {q0}, [%0], %4 \n" // Load 16 Y "vld1.8 {q1}, [%1], %5 \n" // Load 8 UV @@ -723,7 +723,7 @@ void DetileToYUY2_NEON(const uint8_t* src_y, #endif void UnpackMT2T_NEON(const uint8_t* src, uint16_t* dst, size_t size) { - asm volatile( + asm volatile ( "1: \n" "vld1.8 {q14}, [%0]! \n" // Load lower bits. "vld1.8 {q9}, [%0]! 
\n" // Load upper bits row @@ -767,7 +767,7 @@ void MergeUVRow_NEON(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width) { - asm volatile( + asm volatile ( "1: \n" "vld1.8 {q0}, [%0]! \n" // load U "vld1.8 {q1}, [%1]! \n" // load V @@ -789,7 +789,7 @@ void SplitRGBRow_NEON(const uint8_t* src_rgb, uint8_t* dst_g, uint8_t* dst_b, int width) { - asm volatile( + asm volatile ( "1: \n" "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB "vld3.8 {d1, d3, d5}, [%0]! \n" // next 8 RGB @@ -814,7 +814,7 @@ void MergeRGBRow_NEON(const uint8_t* src_r, const uint8_t* src_b, uint8_t* dst_rgb, int width) { - asm volatile( + asm volatile ( "1: \n" "vld1.8 {q0}, [%0]! \n" // load R "vld1.8 {q1}, [%1]! \n" // load G @@ -840,7 +840,7 @@ void SplitARGBRow_NEON(const uint8_t* src_argb, uint8_t* dst_b, uint8_t* dst_a, int width) { - asm volatile( + asm volatile ( "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // next 8 ARGB @@ -868,7 +868,7 @@ void MergeARGBRow_NEON(const uint8_t* src_r, const uint8_t* src_a, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( "1: \n" "vld1.8 {q2}, [%0]! \n" // load R "vld1.8 {q1}, [%1]! \n" // load G @@ -895,7 +895,7 @@ void SplitXRGBRow_NEON(const uint8_t* src_argb, uint8_t* dst_g, uint8_t* dst_b, int width) { - asm volatile( + asm volatile ( "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // next 8 ARGB @@ -920,7 +920,7 @@ void MergeXRGBRow_NEON(const uint8_t* src_r, const uint8_t* src_b, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( "vmov.u8 q3, #255 \n" // load A(255) "1: \n" "vld1.8 {q2}, [%0]! \n" // load R @@ -947,7 +947,7 @@ void MergeXR30Row_NEON(const uint16_t* src_r, int depth, int width) { int shift = 10 - depth; - asm volatile( + asm volatile ( "vmov.u32 q14, #1023 \n" "vdup.32 q15, %5 \n" "1: \n" @@ -984,7 +984,7 @@ void MergeXR30Row_10_NEON(const uint16_t* src_r, uint8_t* dst_ar30, int /* depth */, int width) { - asm volatile( + asm volatile ( "vmov.u32 q14, #1023 \n" "1: \n" "vld1.16 {d4}, [%2]! \n" // B @@ -1021,7 +1021,7 @@ void MergeAR64Row_NEON(const uint16_t* src_r, int width) { int shift = 16 - depth; int mask = (1 << depth) - 1; - asm volatile( + asm volatile ( "vdup.u16 q15, %6 \n" "vdup.u16 q14, %7 \n" @@ -1061,7 +1061,7 @@ void MergeXR64Row_NEON(const uint16_t* src_r, int width) { int shift = 16 - depth; int mask = (1 << depth) - 1; - asm volatile( + asm volatile ( "vmov.u8 q3, #0xff \n" // A (0xffff) "vdup.u16 q15, %5 \n" @@ -1098,7 +1098,7 @@ void MergeARGB16To8Row_NEON(const uint16_t* src_r, int depth, int width) { int shift = 8 - depth; - asm volatile( + asm volatile ( "vdup.16 q15, %6 \n" "1: \n" @@ -1134,7 +1134,7 @@ void MergeXRGB16To8Row_NEON(const uint16_t* src_r, int depth, int width) { int shift = 8 - depth; - asm volatile( + asm volatile ( "vdup.16 q15, %5 \n" "vmov.u8 d6, #0xff \n" // A (0xff) @@ -1162,7 +1162,7 @@ void MergeXRGB16To8Row_NEON(const uint16_t* src_r, // Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15. void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( + asm volatile ( "1: \n" "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32 "subs %2, %2, #32 \n" // 32 processed per loop @@ -1178,7 +1178,7 @@ void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) { // SetRow writes 'width' bytes using an 8 bit value repeated. 
void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) { - asm volatile( + asm volatile ( "vdup.8 q0, %2 \n" // duplicate 16 bytes "1: \n" "subs %1, %1, #16 \n" // 16 bytes per loop @@ -1192,7 +1192,7 @@ void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) { // ARGBSetRow writes 'width' pixels using an 32 bit value repeated. void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) { - asm volatile( + asm volatile ( "vdup.u32 q0, %2 \n" // duplicate 4 ints "1: \n" "subs %1, %1, #4 \n" // 4 pixels per loop @@ -1205,7 +1205,7 @@ void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) { } void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( + asm volatile ( // Start at end of source row. "add %0, %0, %2 \n" "sub %0, %0, #32 \n" // 32 bytes per loop @@ -1227,7 +1227,7 @@ void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) { } void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width) { - asm volatile( + asm volatile ( // Start at end of source row. "mov r12, #-16 \n" "add %0, %0, %2, lsl #1 \n" @@ -1250,7 +1250,7 @@ void MirrorSplitUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile( + asm volatile ( // Start at end of source row. "mov r12, #-16 \n" "add %0, %0, %3, lsl #1 \n" @@ -1272,7 +1272,7 @@ void MirrorSplitUVRow_NEON(const uint8_t* src_uv, } void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( "add %0, %0, %2, lsl #2 \n" "sub %0, #32 \n" @@ -1296,7 +1296,7 @@ void RGB24MirrorRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_rgb24, int width) { src_rgb24 += width * 3 - 24; - asm volatile( + asm volatile ( "1: \n" "vld3.8 {d0, d1, d2}, [%0], %3 \n" // src -= 24 "subs %2, #8 \n" // 8 pixels per loop. @@ -1315,7 +1315,7 @@ void RGB24MirrorRow_NEON(const uint8_t* src_rgb24, void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( "vmov.u8 d4, #255 \n" // Alpha "1: \n" "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24. @@ -1331,7 +1331,7 @@ void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, } void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( "vmov.u8 d4, #255 \n" // Alpha "1: \n" "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. @@ -1348,7 +1348,7 @@ void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) { } void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { - asm volatile( + asm volatile ( "vmov.u8 d0, #255 \n" // Alpha "1: \n" "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. @@ -1364,7 +1364,7 @@ void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { ); } void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { - asm volatile( + asm volatile ( "1: \n" "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. "subs %2, %2, #8 \n" // 8 processed per loop. @@ -1395,7 +1395,7 @@ void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( "vmov.u8 d3, #255 \n" // Alpha "1: \n" "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. 
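Not every hunk adds the qualifier: the row_lsx.cc conversions above drop volatile instead, on the assumption that everything those statements do is already described by their operands. A sketch of that case, again hypothetical rather than taken from libyuv:

#include <cstdint>

// A pure load: the "m" input names exactly what is read and the "=r" output
// names exactly what is produced, so the statement has no hidden effects.
static inline uint32_t LoadU32(const uint32_t* p) {
  uint32_t v;
  asm("movl %1,%0" : "=r"(v) : "m"(*p));
  return v;
}

Left non-volatile, duplicate loads of the same location can be combined or hoisted out of loops; marking such a statement volatile would only inhibit those optimizations without adding correctness.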
@@ -1441,7 +1441,7 @@ void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( "vmov.u8 d3, #255 \n" // Alpha "1: \n" "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. @@ -1470,7 +1470,7 @@ void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555, void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( "vmov.u8 d3, #255 \n" // Alpha "1: \n" "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. @@ -1489,7 +1489,7 @@ void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444, void ARGBToRGB24Row_NEON(const uint8_t* src_argb, uint8_t* dst_rgb24, int width) { - asm volatile( + asm volatile ( "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 16 pixels of ARGB. "vld4.8 {d1, d3, d5, d7}, [%0]! \n" @@ -1506,7 +1506,7 @@ void ARGBToRGB24Row_NEON(const uint8_t* src_argb, } void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) { - asm volatile( + asm volatile ( "1: \n" "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. @@ -1522,7 +1522,7 @@ void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) { } void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { - asm volatile( + asm volatile ( "1: \n" "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2. "subs %2, %2, #16 \n" // 16 processed per loop. @@ -1537,7 +1537,7 @@ void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { } void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { - asm volatile( + asm volatile ( "1: \n" "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY. "subs %2, %2, #16 \n" // 16 processed per loop. @@ -1555,7 +1555,7 @@ void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile( + asm volatile ( "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. @@ -1575,7 +1575,7 @@ void UYVYToUV422Row_NEON(const uint8_t* src_uyvy, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile( + asm volatile ( "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. @@ -1596,7 +1596,7 @@ void YUY2ToUVRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile( + asm volatile ( "add %1, %0, %1 \n" // stride + src_yuy2 "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. @@ -1623,7 +1623,7 @@ void UYVYToUVRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile( + asm volatile ( "add %1, %0, %1 \n" // stride + src_uyvy "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. @@ -1649,7 +1649,7 @@ void YUY2ToNVUVRow_NEON(const uint8_t* src_yuy2, int stride_yuy2, uint8_t* dst_uv, int width) { - asm volatile( + asm volatile ( "add %1, %0, %1 \n" // stride + src_yuy2 "1: \n" "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2. @@ -1673,7 +1673,7 @@ void ARGBShuffleRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, const uint8_t* shuffler, int width) { - asm volatile( + asm volatile ( "vld1.8 {q2}, [%3] \n" // shuffler "1: \n" "vld1.8 {q0}, [%0]! \n" // load 4 pixels. 
@@ -1695,7 +1695,7 @@ void I422ToYUY2Row_NEON(const uint8_t* src_y, const uint8_t* src_v, uint8_t* dst_yuy2, int width) { - asm volatile( + asm volatile ( "1: \n" "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys "vld1.8 {d1}, [%1]! \n" // load 8 Us @@ -1717,7 +1717,7 @@ void I422ToUYVYRow_NEON(const uint8_t* src_y, const uint8_t* src_v, uint8_t* dst_uyvy, int width) { - asm volatile( + asm volatile ( "1: \n" "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys "vld1.8 {d0}, [%1]! \n" // load 8 Us @@ -1737,7 +1737,7 @@ void I422ToUYVYRow_NEON(const uint8_t* src_y, void ARGBToRGB565Row_NEON(const uint8_t* src_argb, uint8_t* dst_rgb565, int width) { - asm volatile( + asm volatile ( "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. @@ -1755,7 +1755,7 @@ void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, uint8_t* dst_rgb, uint32_t dither4, int width) { - asm volatile( + asm volatile ( "vdup.32 d7, %2 \n" // dither4 "1: \n" "vld4.8 {d0, d2, d4, d6}, [%1]! \n" // load 8 pixels of ARGB. @@ -1776,7 +1776,7 @@ void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, void ARGBToARGB1555Row_NEON(const uint8_t* src_argb, uint8_t* dst_argb1555, int width) { - asm volatile( + asm volatile ( "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. @@ -1793,7 +1793,7 @@ void ARGBToARGB1555Row_NEON(const uint8_t* src_argb, void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, uint8_t* dst_argb4444, int width) { - asm volatile( + asm volatile ( "vmov.u8 d7, #0x0f \n" // bits to clear with // vbic. "1: \n" @@ -1812,7 +1812,7 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, uint8_t* dst_a, int width) { - asm volatile( + asm volatile ( "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels "vld4.8 {d1, d3, d5, d7}, [%0]! 
\n" // load next 8 ARGB pixels @@ -1838,7 +1838,7 @@ void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb, uint8_t* dst_v, int width, const struct RgbUVConstants* rgbuvconstants) { - asm volatile( + asm volatile ( "vld1.8 {d0}, [%4] \n" // load rgbuvconstants "vdup.u8 d24, d0[0] \n" // UB 0.875 coefficient @@ -2366,7 +2366,7 @@ void RGB565ToUVRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile( + asm volatile ( "add %1, %0, %1 \n" // src_stride + src_argb "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 // coefficient @@ -2432,7 +2432,7 @@ void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile( + asm volatile ( "add %1, %0, %1 \n" // src_stride + src_argb "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 // coefficient @@ -2498,7 +2498,7 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile( + asm volatile ( "add %1, %0, %1 \n" // src_stride + src_argb "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 // coefficient @@ -2550,7 +2550,7 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, } void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { - asm volatile( + asm volatile ( "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient @@ -2576,7 +2576,7 @@ void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, uint8_t* dst_y, int width) { - asm volatile( + asm volatile ( "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient @@ -2602,7 +2602,7 @@ void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, uint8_t* dst_y, int width) { - asm volatile( + asm volatile ( "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient @@ -2628,7 +2628,7 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, void ARGBToAR64Row_NEON(const uint8_t* src_argb, uint16_t* dst_ar64, int width) { - asm volatile( + asm volatile ( "1: \n" "vld1.8 {q0}, [%0]! \n" "vld1.8 {q2}, [%0]! \n" @@ -2651,7 +2651,7 @@ static const uvec8 kShuffleARGBToABGR = {2, 1, 0, 3, 6, 5, 4, 7, void ARGBToAB64Row_NEON(const uint8_t* src_argb, uint16_t* dst_ab64, int width) { - asm volatile( + asm volatile ( "vld1.8 {q4}, [%3] \n" // shuffler "1: \n" @@ -2677,7 +2677,7 @@ void ARGBToAB64Row_NEON(const uint8_t* src_argb, void AR64ToARGBRow_NEON(const uint16_t* src_ar64, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( "1: \n" "vld1.16 {q0}, [%0]! \n" "vld1.16 {q1}, [%0]! 
\n" @@ -2703,7 +2703,7 @@ static const uvec8 kShuffleAB64ToARGB = {5, 3, 1, 7, 13, 11, 9, 15}; void AB64ToARGBRow_NEON(const uint16_t* src_ab64, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( "vld1.8 {d8}, [%3] \n" // shuffler "1: \n" @@ -2756,7 +2756,7 @@ void ARGBToYMatrixRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width, const struct RgbConstants* rgbconstants) { - asm volatile( + asm volatile ( "vld1.8 {d0}, [%3] \n" // load rgbconstants "vdup.u8 d20, d0[0] \n" "vdup.u8 d21, d0[1] \n" @@ -2806,7 +2806,7 @@ void RGBAToYMatrixRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width, const struct RgbConstants* rgbconstants) { - asm volatile( + asm volatile ( "vld1.8 {d0}, [%3] \n" // load rgbconstants "vdup.u8 d20, d0[0] \n" "vdup.u8 d21, d0[1] \n" @@ -2850,7 +2850,7 @@ void RGBToYMatrixRow_NEON(const uint8_t* src_rgb, uint8_t* dst_y, int width, const struct RgbConstants* rgbconstants) { - asm volatile( + asm volatile ( "vld1.8 {d0}, [%3] \n" // load rgbconstants "vdup.u8 d20, d0[0] \n" "vdup.u8 d21, d0[1] \n" @@ -2902,7 +2902,7 @@ void InterpolateRow_NEON(uint8_t* dst_ptr, int dst_width, int source_y_fraction) { int y1_fraction = source_y_fraction; - asm volatile( + asm volatile ( "cmp %4, #0 \n" "beq 100f \n" "add %2, %1 \n" @@ -2964,7 +2964,7 @@ void InterpolateRow_16_NEON(uint16_t* dst_ptr, int y0_fraction = 256 - y1_fraction; const uint16_t* src_ptr1 = src_ptr + src_stride; - asm volatile( + asm volatile ( "cmp %4, #0 \n" "beq 100f \n" "cmp %4, #128 \n" @@ -3019,7 +3019,7 @@ void ARGBBlendRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( "subs %3, #8 \n" "blt 89f \n" // Blend 8 pixels. @@ -3078,7 +3078,7 @@ void ARGBBlendRow_NEON(const uint8_t* src_argb, void ARGBAttenuateRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( "vmov.u16 q15, #0x00ff \n" // 255 for rounding up // Attenuate 8 pixels. @@ -3107,7 +3107,7 @@ void ARGBQuantizeRow_NEON(uint8_t* dst_argb, int interval_size, int interval_offset, int width) { - asm volatile( + asm volatile ( "vdup.u16 q8, %2 \n" "vshr.u16 q8, q8, #1 \n" // scale >>= 1 "vdup.u16 q9, %3 \n" // interval multiply. @@ -3149,7 +3149,7 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width, uint32_t value) { - asm volatile( + asm volatile ( "vdup.u32 q0, %3 \n" // duplicate scale value. "vzip.u8 d0, d1 \n" // d0 aarrggbb. "vshr.u16 q0, q0, #1 \n" // scale / 2. @@ -3183,7 +3183,7 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb, // Similar to ARGBToYJ but stores ARGB. // C code is (29 * b + 150 * g + 77 * r + 128) >> 8; void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( "vmov.u8 d24, #29 \n" // B * 0.1140 coefficient "vmov.u8 d25, #150 \n" // G * 0.5870 coefficient "vmov.u8 d26, #77 \n" // R * 0.2990 coefficient @@ -3210,7 +3210,7 @@ void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { // g = (r * 45 + g * 88 + b * 22) >> 7 // r = (r * 50 + g * 98 + b * 24) >> 7 void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( "vmov.u8 d20, #17 \n" // BB coefficient "vmov.u8 d21, #68 \n" // BG coefficient "vmov.u8 d22, #35 \n" // BR coefficient @@ -3251,7 +3251,7 @@ void ARGBColorMatrixRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, const int8_t* matrix_argb, int width) { - asm volatile( + asm volatile ( "vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors. 
"vmovl.s8 q0, d4 \n" // B,G coefficients s16. "vmovl.s8 q1, d5 \n" // R,A coefficients s16. @@ -3310,7 +3310,7 @@ void ARGBMultiplyRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( // 8 pixel loop. "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. @@ -3339,7 +3339,7 @@ void ARGBAddRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( // 8 pixel loop. "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. @@ -3362,7 +3362,7 @@ void ARGBSubtractRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( // 8 pixel loop. "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. @@ -3389,7 +3389,7 @@ void SobelRow_NEON(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( "vmov.u8 d3, #255 \n" // alpha // 8 pixel loop. "1: \n" @@ -3414,7 +3414,7 @@ void SobelToPlaneRow_NEON(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_y, int width) { - asm volatile( + asm volatile ( // 16 pixel loop. "1: \n" "vld1.8 {q0}, [%0]! \n" // load 16 sobelx. @@ -3440,7 +3440,7 @@ void SobelXYRow_NEON(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( "vmov.u8 d3, #255 \n" // alpha // 8 pixel loop. "1: \n" @@ -3467,7 +3467,7 @@ void SobelXRow_NEON(const uint8_t* src_y0, const uint8_t* src_y2, uint8_t* dst_sobelx, int width) { - asm volatile( + asm volatile ( "1: \n" "vld1.8 {d0}, [%0],%5 \n" // top "vld1.8 {d1}, [%0],%6 \n" @@ -3505,7 +3505,7 @@ void SobelYRow_NEON(const uint8_t* src_y0, const uint8_t* src_y1, uint8_t* dst_sobely, int width) { - asm volatile( + asm volatile ( "1: \n" "vld1.8 {d0}, [%0],%4 \n" // left "vld1.8 {d1}, [%1],%4 \n" @@ -3542,7 +3542,7 @@ void HalfFloat1Row_NEON(const uint16_t* src, uint16_t* dst, float /*unused*/, int width) { - asm volatile( + asm volatile ( "1: \n" "vld1.8 {q1}, [%0]! \n" // load 8 shorts @@ -3568,7 +3568,7 @@ void HalfFloatRow_NEON(const uint16_t* src, uint16_t* dst, float scale, int width) { - asm volatile( + asm volatile ( "1: \n" "vld1.8 {q1}, [%0]! \n" // load 8 shorts @@ -3594,7 +3594,7 @@ void ByteToFloatRow_NEON(const uint8_t* src, float* dst, float scale, int width) { - asm volatile( + asm volatile ( "1: \n" "vld1.8 {d2}, [%0]! \n" // load 8 bytes @@ -3623,7 +3623,7 @@ void GaussCol_NEON(const uint16_t* src0, const uint16_t* src4, uint32_t* dst, int width) { - asm volatile( + asm volatile ( "vmov.u16 d6, #4 \n" // constant 4 "vmov.u16 d7, #6 \n" // constant 6 @@ -3660,7 +3660,7 @@ void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) { const uint32_t* src1 = src + 1; const uint32_t* src2 = src + 2; const uint32_t* src3 = src + 3; - asm volatile( + asm volatile ( "vmov.u32 q10, #4 \n" // constant 4 "vmov.u32 q11, #6 \n" // constant 6 @@ -3698,7 +3698,7 @@ void NV21ToYUV24Row_NEON(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_yuv24, int width) { - asm volatile( + asm volatile ( "1: \n" "vld1.8 {q2}, [%0]! \n" // load 16 Y values "vld2.8 {d0, d2}, [%1]! \n" // load 8 VU values @@ -3722,7 +3722,7 @@ void AYUVToUVRow_NEON(const uint8_t* src_ayuv, int src_stride_ayuv, uint8_t* dst_uv, int width) { - asm volatile( + asm volatile ( "add %1, %0, %1 \n" // src_stride + src_AYUV "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels. 
@@ -3753,7 +3753,7 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv, int src_stride_ayuv, uint8_t* dst_vu, int width) { - asm volatile( + asm volatile ( "add %1, %0, %1 \n" // src_stride + src_AYUV "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels. @@ -3783,7 +3783,7 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv, // Copy row of AYUV Y's into Y. // Similar to ARGBExtractAlphaRow_NEON void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) { - asm volatile( + asm volatile ( "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV pixels @@ -3799,7 +3799,7 @@ void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) { // Convert UV plane of NV12 to VU of NV21. void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) { - asm volatile( + asm volatile ( "1: \n" "vld2.8 {d0, d2}, [%0]! \n" // load 16 UV values "vld2.8 {d1, d3}, [%0]! \n" @@ -3822,7 +3822,7 @@ void HalfMergeUVRow_NEON(const uint8_t* src_u, int width) { const uint8_t* src_u_1 = src_u + src_stride_u; const uint8_t* src_v_1 = src_v + src_stride_v; - asm volatile( + asm volatile ( "1: \n" "vld1.8 {q0}, [%0]! \n" // load 16 U values "vld1.8 {q1}, [%2]! \n" // load 16 V values @@ -3853,7 +3853,7 @@ void SplitUVRow_16_NEON(const uint16_t* src_uv, int depth, int width) { int shift = depth - 16; // Negative for right shift. - asm volatile( + asm volatile ( "vdup.16 q2, %4 \n" "1: \n" "vld2.16 {q0, q1}, [%0]! \n" // load 8 UV @@ -3877,7 +3877,7 @@ void MergeUVRow_16_NEON(const uint16_t* src_u, int depth, int width) { int shift = 16 - depth; - asm volatile( + asm volatile ( "vdup.16 q2, %4 \n" "1: \n" "vld1.16 {q0}, [%0]! \n" // load 8 U @@ -3899,7 +3899,7 @@ void MultiplyRow_16_NEON(const uint16_t* src_y, uint16_t* dst_y, int scale, int width) { - asm volatile( + asm volatile ( "vdup.16 q2, %3 \n" "1: \n" "vld1.16 {q0}, [%0]! \n" @@ -3921,7 +3921,7 @@ void DivideRow_16_NEON(const uint16_t* src_y, uint16_t* dst_y, int scale, int width) { - asm volatile( + asm volatile ( "vdup.16 d8, %3 \n" "1: \n" "vld1.16 {q2, q3}, [%0]! \n" @@ -3953,7 +3953,7 @@ void Convert16To8Row_NEON(const uint16_t* src_y, int scale, int width) { int shift = 15 - __builtin_clz((int32_t)scale); // Negative shl is shr - asm volatile( + asm volatile ( "vdup.16 q2, %3 \n" "1: \n" "vld1.16 {q0}, [%0]! 
\n" diff --git a/source/row_neon64.cc b/source/row_neon64.cc index 372b1efc2..70b44d226 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -210,7 +210,7 @@ void I444ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile( + asm volatile ( YUVTORGB_SETUP "movi v19.8b, #255 \n" /* A */ "1: \n" READYUV444 I4XXTORGB @@ -234,7 +234,7 @@ void I444ToRGB24Row_NEON(const uint8_t* src_y, uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width) { - asm volatile( + asm volatile ( YUVTORGB_SETUP "1: \n" READYUV444 I4XXTORGB RGBTORGB8 @@ -261,7 +261,7 @@ void I210ToAR30Row_NEON(const uint16_t* src_y, const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias; uint16_t limit = 0x3ff0; uint16_t alpha = 0xc000; - asm(YUVTORGB_SETUP + asm volatile (YUVTORGB_SETUP "dup v22.8h, %w[limit] \n" "dup v23.8h, %w[alpha] \n" "1: \n" READYUV210 NVTORGB @@ -289,7 +289,7 @@ void I410ToAR30Row_NEON(const uint16_t* src_y, const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias; uint16_t limit = 0x3ff0; uint16_t alpha = 0xc000; - asm(YUVTORGB_SETUP + asm volatile (YUVTORGB_SETUP "dup v22.8h, %w[limit] \n" "dup v23.8h, %w[alpha] \n" "1: \n" READYUV410 NVTORGB @@ -313,7 +313,7 @@ void I210ToARGBRow_NEON(const uint16_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm(YUVTORGB_SETUP + asm volatile (YUVTORGB_SETUP "movi v19.8b, #255 \n" "1: \n" READYUV210 NVTORGB RGBTORGB8 "subs %w[width], %w[width], #8 \n" @@ -335,7 +335,7 @@ void I410ToARGBRow_NEON(const uint16_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm(YUVTORGB_SETUP + asm volatile (YUVTORGB_SETUP "movi v19.8b, #255 \n" "1: \n" READYUV410 NVTORGB RGBTORGB8 "subs %w[width], %w[width], #8 \n" @@ -357,7 +357,7 @@ void I422ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile( + asm volatile ( YUVTORGB_SETUP "movi v19.8b, #255 \n" /* A */ "1: \n" READYUV422 I4XXTORGB @@ -382,7 +382,7 @@ void I444AlphaToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile( + asm volatile ( YUVTORGB_SETUP "1: \n" "ld1 {v19.8b}, [%[src_a]], #8 \n" READYUV444 @@ -408,7 +408,7 @@ void I410AlphaToARGBRow_NEON(const uint16_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm(YUVTORGB_SETUP + asm volatile (YUVTORGB_SETUP "1: \n" "ld1 {v19.16b}, [%[src_a]], #16 \n" READYUV410 "uqshrn v19.8b, v19.8h, #2 \n" NVTORGB RGBTORGB8 @@ -433,7 +433,7 @@ void I210AlphaToARGBRow_NEON(const uint16_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm(YUVTORGB_SETUP + asm volatile (YUVTORGB_SETUP "1: \n" "ld1 {v19.16b}, [%[src_a]], #16 \n" READYUV210 "uqshrn v19.8b, v19.8h, #2 \n" NVTORGB RGBTORGB8 @@ -458,7 +458,7 @@ void I422AlphaToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile( + asm volatile ( YUVTORGB_SETUP "1: \n" "ld1 {v19.8b}, [%[src_a]], #8 \n" READYUV422 @@ -483,7 +483,7 @@ void I422ToRGBARow_NEON(const uint8_t* src_y, uint8_t* dst_rgba, const struct YuvConstants* yuvconstants, int width) { - asm volatile( + asm volatile ( YUVTORGB_SETUP "movi v15.8b, #255 \n" /* A */ "1: \n" READYUV422 I4XXTORGB @@ -507,7 +507,7 @@ void I422ToRGB24Row_NEON(const uint8_t* src_y, uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width) { - asm volatile( + 
asm volatile ( YUVTORGB_SETUP "1: \n" READYUV422 I4XXTORGB RGBTORGB8 @@ -549,7 +549,7 @@ void I422ToRGB565Row_NEON(const uint8_t* src_y, uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width) { - asm volatile( + asm volatile ( YUVTORGB_SETUP "1: \n" READYUV422 I4XXTORGB RGBTORGB8_TOP @@ -591,7 +591,7 @@ void I422ToARGB1555Row_NEON(const uint8_t* src_y, uint8_t* dst_argb1555, const struct YuvConstants* yuvconstants, int width) { - asm(YUVTORGB_SETUP + asm volatile (YUVTORGB_SETUP "movi v19.8h, #0x80, lsl #8 \n" "1: \n" // READYUV422 I4XXTORGB RGBTORGB8_TOP @@ -621,7 +621,7 @@ void I422ToARGB4444Row_NEON(const uint8_t* src_y, uint8_t* dst_argb4444, const struct YuvConstants* yuvconstants, int width) { - asm volatile( + asm volatile ( YUVTORGB_SETUP "1: \n" READYUV422 I4XXTORGB RGBTORGB8 @@ -645,7 +645,7 @@ void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile( + asm volatile ( YUVTORGB_SETUP "movi v1.16b, #128 \n" "movi v19.8b, #255 \n" @@ -668,7 +668,7 @@ void I400ToARGBRow_NEON(const uint8_t* src_y, #if LIBYUV_USE_ST4 void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( "movi v23.8b, #255 \n" "1: \n" "ld1 {v20.8b}, [%0], #8 \n" @@ -686,7 +686,7 @@ void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { } #else void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( "movi v20.8b, #255 \n" "1: \n" "ldr d16, [%0], #8 \n" @@ -711,7 +711,7 @@ void NV12ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile( + asm volatile ( YUVTORGB_SETUP "movi v19.8b, #255 \n" "ldr q2, [%[kNV12Table]] \n" @@ -734,7 +734,7 @@ void NV21ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile( + asm volatile ( YUVTORGB_SETUP "movi v19.8b, #255 \n" "ldr q2, [%[kNV12Table]] \n" @@ -757,7 +757,7 @@ void NV12ToRGB24Row_NEON(const uint8_t* src_y, uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width) { - asm volatile( + asm volatile ( YUVTORGB_SETUP "ldr q2, [%[kNV12Table]] \n" "1: \n" READNV12 NVTORGB RGBTORGB8 @@ -779,7 +779,7 @@ void NV21ToRGB24Row_NEON(const uint8_t* src_y, uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width) { - asm volatile( + asm volatile ( YUVTORGB_SETUP "ldr q2, [%[kNV12Table]] \n" "1: \n" READNV12 NVTORGB RGBTORGB8 @@ -801,7 +801,7 @@ void NV12ToRGB565Row_NEON(const uint8_t* src_y, uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width) { - asm volatile( + asm volatile ( YUVTORGB_SETUP "ldr q2, [%[kNV12Table]] \n" "1: \n" READNV12 NVTORGB @@ -825,7 +825,7 @@ void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile( + asm volatile ( YUVTORGB_SETUP "movi v19.8b, #255 \n" "ldr q2, [%[kNV21InterleavedTable]] \n" @@ -846,7 +846,7 @@ void UYVYToARGBRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile( + asm volatile ( YUVTORGB_SETUP "movi v19.8b, #255 \n" "ldr q2, [%[kNV12InterleavedTable]] \n" @@ -868,7 +868,7 @@ void SplitUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile( + asm volatile ( "1: \n" "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV "subs %w3, %w3, #16 \n" // 16 
processed per loop @@ -893,7 +893,7 @@ void DetileRow_NEON(const uint8_t* src, ptrdiff_t src_tile_stride, uint8_t* dst, int width) { - asm volatile( + asm volatile ( "1: \n" "ld1 {v0.16b}, [%0], %3 \n" // load 16 bytes "subs %w2, %w2, #16 \n" // 16 processed per loop @@ -913,7 +913,7 @@ void DetileRow_16_NEON(const uint16_t* src, ptrdiff_t src_tile_stride, uint16_t* dst, int width) { - asm volatile( + asm volatile ( "1: \n" "ld1 {v0.8h,v1.8h}, [%0], %3 \n" // load 16 pixels "subs %w2, %w2, #16 \n" // 16 processed per loop @@ -934,7 +934,7 @@ void DetileSplitUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile( + asm volatile ( "1: \n" "ld2 {v0.8b,v1.8b}, [%0], %4 \n" "subs %w3, %w3, #16 \n" @@ -959,7 +959,7 @@ void DetileToYUY2_NEON(const uint8_t* src_y, ptrdiff_t src_uv_tile_stride, uint8_t* dst_yuy2, int width) { - asm volatile( + asm volatile ( "1: \n" "ld1 {v0.16b}, [%0], %4 \n" // load 16 Ys "prfm pldl1keep, [%0, 1792] \n" @@ -985,7 +985,7 @@ void DetileToYUY2_NEON(const uint8_t* src_y, ptrdiff_t src_uv_tile_stride, uint8_t* dst_yuy2, int width) { - asm volatile( + asm volatile ( "1: \n" "ld1 {v0.16b}, [%0], %4 \n" // load 16 Ys "ld1 {v1.16b}, [%1], %5 \n" // load 8 UVs @@ -1010,7 +1010,7 @@ void DetileToYUY2_NEON(const uint8_t* src_y, // Unpack MT2T into tiled P010 64 pixels at a time. See // tinyurl.com/mtk-10bit-video-format for format documentation. void UnpackMT2T_NEON(const uint8_t* src, uint16_t* dst, size_t size) { - asm volatile( + asm volatile ( "1: \n" "ld1 {v7.16b}, [%0], #16 \n" "ld1 {v0.16b-v3.16b}, [%0], #64 \n" @@ -1051,7 +1051,7 @@ void MergeUVRow_NEON(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width) { - asm volatile( + asm volatile ( "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load U "ld1 {v1.16b}, [%1], #16 \n" // load V @@ -1075,7 +1075,7 @@ void MergeUVRow_16_NEON(const uint16_t* src_u, int depth, int width) { int shift = 16 - depth; - asm volatile( + asm volatile ( "dup v2.8h, %w4 \n" "1: \n" "ld1 {v0.8h}, [%0], #16 \n" // load 8 U @@ -1100,7 +1100,7 @@ void MergeUVRow_NEON(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width) { - asm volatile( + asm volatile ( "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load U "ld1 {v1.16b}, [%1], #16 \n" // load V @@ -1126,7 +1126,7 @@ void MergeUVRow_16_NEON(const uint16_t* src_u, int depth, int width) { int shift = 16 - depth; - asm volatile( + asm volatile ( "dup v4.8h, %w4 \n" "1: \n" "ld1 {v0.8h}, [%0], #16 \n" // load 8 U @@ -1155,7 +1155,7 @@ void SplitRGBRow_NEON(const uint8_t* src_rgb, uint8_t* dst_g, uint8_t* dst_b, int width) { - asm volatile( + asm volatile ( "1: \n" "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RGB "subs %w4, %w4, #16 \n" // 16 processed per loop @@ -1180,7 +1180,7 @@ void MergeRGBRow_NEON(const uint8_t* src_r, const uint8_t* src_b, uint8_t* dst_rgb, int width) { - asm volatile( + asm volatile ( "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load R "ld1 {v1.16b}, [%1], #16 \n" // load G @@ -1208,7 +1208,7 @@ void SplitARGBRow_NEON(const uint8_t* src_rgba, uint8_t* dst_b, uint8_t* dst_a, int width) { - asm volatile( + asm volatile ( "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB "subs %w5, %w5, #16 \n" // 16 processed per loop @@ -1237,7 +1237,7 @@ void MergeARGBRow_NEON(const uint8_t* src_r, const uint8_t* src_a, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( "1: \n" "ld1 {v0.16b}, [%2], #16 \n" // load B "ld1 {v1.16b}, [%1], #16 \n" // load G @@ -1268,7 +1268,7 @@ void 
MergeARGBRow_NEON(const uint8_t* src_r, const uint8_t* src_a, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( "1: \n" "ld1 {v0.16b}, [%2], #16 \n" // load B "ld1 {v1.16b}, [%1], #16 \n" // load G @@ -1308,7 +1308,7 @@ void SplitXRGBRow_NEON(const uint8_t* src_rgba, uint8_t* dst_g, uint8_t* dst_b, int width) { - asm volatile( + asm volatile ( "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB "subs %w4, %w4, #16 \n" // 16 processed per loop @@ -1333,7 +1333,7 @@ void MergeXRGBRow_NEON(const uint8_t* src_r, const uint8_t* src_b, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( "movi v3.16b, #255 \n" // load A(255) "1: \n" "ld1 {v2.16b}, [%0], #16 \n" // load R @@ -1362,7 +1362,7 @@ void MergeXR30Row_NEON(const uint16_t* src_r, int depth, int width) { int shift = 10 - depth; - asm volatile( + asm volatile ( "movi v30.16b, #255 \n" "ushr v30.4s, v30.4s, #22 \n" // 1023 "dup v31.4s, %w5 \n" @@ -1403,7 +1403,7 @@ void MergeXR30Row_10_NEON(const uint16_t* src_r, // Neon has no "shift left and accumulate/orr", so use a multiply-add to // perform the shift instead. int limit = 1023; - asm volatile( + asm volatile ( "dup v5.8h, %w[limit] \n" "movi v6.8h, #16 \n" // 1 << 4 "movi v7.8h, #4, lsl #8 \n" // 1 << 10 @@ -1439,7 +1439,7 @@ void MergeAR64Row_NEON(const uint16_t* src_r, int width) { int shift = 16 - depth; int mask = (1 << depth) - 1; - asm volatile( + asm volatile ( "dup v30.8h, %w7 \n" "dup v31.8h, %w6 \n" @@ -1482,7 +1482,7 @@ void MergeXR64Row_NEON(const uint16_t* src_r, int width) { int shift = 16 - depth; int mask = (1 << depth) - 1; - asm volatile( + asm volatile ( "movi v3.16b, #0xff \n" // A (0xffff) "dup v30.8h, %w6 \n" @@ -1523,7 +1523,7 @@ void MergeARGB16To8Row_NEON(const uint16_t* src_r, int width) { // Shift is 8 - depth, +8 so the result is in the top half of each lane. int shift = 16 - depth; - asm volatile( + asm volatile ( "dup v31.8h, %w6 \n" "1: \n" "ldr q0, [%0], #16 \n" // B @@ -1561,7 +1561,7 @@ void MergeXRGB16To8Row_NEON(const uint16_t* src_r, int width) { // Shift is 8 - depth, +8 so the result is in the top half of each lane. int shift = 16 - depth; - asm volatile( + asm volatile ( "dup v31.8h, %w5 \n" "movi v3.16b, #0xff \n" // A (0xff) "1: \n" @@ -1590,7 +1590,7 @@ void MergeXRGB16To8Row_NEON(const uint16_t* src_r, // Copy multiple of 32. void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( + asm volatile ( "1: \n" "ldp q0, q1, [%0], #32 \n" "prfm pldl1keep, [%0, 448] \n" @@ -1607,7 +1607,7 @@ void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) { // SetRow writes 'width' bytes using an 8 bit value repeated. void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) { - asm volatile( + asm volatile ( "dup v0.16b, %w2 \n" // duplicate 16 bytes "1: \n" "subs %w1, %w1, #16 \n" // 16 bytes per loop @@ -1620,7 +1620,7 @@ void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) { } void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) { - asm volatile( + asm volatile ( "dup v0.4s, %w2 \n" // duplicate 4 ints "1: \n" "subs %w1, %w1, #4 \n" // 4 ints per loop @@ -1637,7 +1637,7 @@ static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( + asm volatile ( // Start at end of source row. 
"ld1 {v3.16b}, [%3] \n" // shuffler "add %0, %0, %w2, sxtw \n" @@ -1662,7 +1662,7 @@ static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u, 6u, 7u, 4u, 5u, 2u, 3u, 0u, 1u}; void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width) { - asm volatile( + asm volatile ( // Start at end of source row. "ld1 {v4.16b}, [%3] \n" // shuffler "add %0, %0, %w2, sxtw #1 \n" @@ -1686,7 +1686,7 @@ void MirrorSplitUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile( + asm volatile ( // Start at end of source row. "ld1 {v4.16b}, [%4] \n" // shuffler "add %0, %0, %w3, sxtw #1 \n" @@ -1715,7 +1715,7 @@ static const uvec8 kShuffleMirrorARGB = {12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u}; void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( // Start at end of source row. "ld1 {v4.16b}, [%3] \n" // shuffler "add %0, %0, %w2, sxtw #2 \n" @@ -1738,7 +1738,7 @@ void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { void RGB24MirrorRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_rgb24, int width) { - asm volatile( + asm volatile ( "ld1 {v3.16b}, [%4] \n" // shuffler "add %0, %0, %w2, sxtw #1 \n" // Start at end of row. "add %0, %0, %w2, sxtw \n" @@ -1763,7 +1763,7 @@ void RGB24MirrorRow_NEON(const uint8_t* src_rgb24, void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( "movi v4.8b, #255 \n" // Alpha "1: \n" "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of @@ -1781,7 +1781,7 @@ void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, } void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( "movi v5.8b, #255 \n" // Alpha "1: \n" "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b @@ -1800,7 +1800,7 @@ void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) { } void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { - asm volatile( + asm volatile ( "movi v0.8b, #255 \n" // Alpha "1: \n" "ld3 {v3.8b,v4.8b,v5.8b}, [%0], #24 \n" // read r g b @@ -1819,7 +1819,7 @@ void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { } void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { - asm volatile( + asm volatile ( "1: \n" "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b "subs %w2, %w2, #8 \n" // 8 processed per loop. @@ -1849,7 +1849,7 @@ void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( "movi v3.8b, #255 \n" // Alpha "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. @@ -1892,7 +1892,7 @@ void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. "prfm pldl1keep, [%0, 448] \n" @@ -1926,7 +1926,7 @@ void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555, void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( "1: \n" "ld1 {v1.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. "subs %w2, %w2, #8 \n" // 8 processed per loop. 
@@ -1955,7 +1955,7 @@ static void ABCDToAR30Row_NEON(const uint8_t* src_abcd, uint8_t* dst_ar30, int width, const uint8_t* indices) { - asm volatile( + asm volatile ( "movi v2.4s, #0xf, msl 16 \n" // 0xfffff "ldr q3, [%[kAR30Row_BoxShifts]] \n" "ldp q4, q5, [%[indices]] \n" @@ -1997,7 +1997,7 @@ void ARGBToAR30Row_NEON(const uint8_t* src_argb, uint8_t* dst_ar30, int width) { void ARGBToRGB24Row_NEON(const uint8_t* src_argb, uint8_t* dst_rgb24, int width) { - asm volatile( + asm volatile ( "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB "subs %w2, %w2, #16 \n" // 16 pixels per loop. @@ -2013,7 +2013,7 @@ void ARGBToRGB24Row_NEON(const uint8_t* src_argb, } void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) { - asm volatile( + asm volatile ( "1: \n" "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a "subs %w2, %w2, #8 \n" // 8 processed per loop. @@ -2031,7 +2031,7 @@ void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) { } void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { - asm volatile( + asm volatile ( "1: \n" "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2. "subs %w2, %w2, #16 \n" // 16 processed per loop. @@ -2047,7 +2047,7 @@ void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { } void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { - asm volatile( + asm volatile ( "1: \n" "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY. "subs %w2, %w2, #16 \n" // 16 processed per loop. @@ -2066,7 +2066,7 @@ void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile( + asm volatile ( "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. @@ -2087,7 +2087,7 @@ void UYVYToUV422Row_NEON(const uint8_t* src_uyvy, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile( + asm volatile ( "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. @@ -2110,7 +2110,7 @@ void YUY2ToUVRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_v, int width) { const uint8_t* src_yuy2b = src_yuy2 + stride_yuy2; - asm volatile( + asm volatile ( "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. @@ -2138,7 +2138,7 @@ void UYVYToUVRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_v, int width) { const uint8_t* src_uyvyb = src_uyvy + stride_uyvy; - asm volatile( + asm volatile ( "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. @@ -2165,7 +2165,7 @@ void YUY2ToNVUVRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_uv, int width) { const uint8_t* src_yuy2b = src_yuy2 + stride_yuy2; - asm volatile( + asm volatile ( "1: \n" "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. @@ -2188,7 +2188,7 @@ void ARGBShuffleRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, const uint8_t* shuffler, int width) { - asm volatile( + asm volatile ( "ld1 {v2.16b}, [%3] \n" // shuffler "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels. 
@@ -2210,7 +2210,7 @@ void I422ToYUY2Row_NEON(const uint8_t* src_y, const uint8_t* src_v, uint8_t* dst_yuy2, int width) { - asm volatile( + asm volatile ( "1: \n" "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys "subs %w4, %w4, #16 \n" // 16 pixels @@ -2234,7 +2234,7 @@ void I422ToUYVYRow_NEON(const uint8_t* src_y, const uint8_t* src_v, uint8_t* dst_uyvy, int width) { - asm volatile( + asm volatile ( "1: \n" "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys "mov v3.8b, v2.8b \n" @@ -2256,7 +2256,7 @@ void I422ToUYVYRow_NEON(const uint8_t* src_y, void ARGBToRGB565Row_NEON(const uint8_t* src_argb, uint8_t* dst_rgb565, int width) { - asm volatile( + asm volatile ( "1: \n" "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 // pixels @@ -2275,7 +2275,7 @@ void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, uint8_t* dst_rgb, uint32_t dither4, int width) { - asm volatile( + asm volatile ( "dup v1.4s, %w3 \n" // dither4 "1: \n" "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 ARGB @@ -2296,7 +2296,7 @@ void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, void ARGBToARGB1555Row_NEON(const uint8_t* src_argb, uint8_t* dst_argb1555, int width) { - asm volatile( + asm volatile ( "1: \n" "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 // pixels @@ -2314,7 +2314,7 @@ void ARGBToARGB1555Row_NEON(const uint8_t* src_argb, void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, uint8_t* dst_argb4444, int width) { - asm volatile( + asm volatile ( "1: \n" "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 // pixels @@ -2333,7 +2333,7 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, void ARGBToAR64Row_NEON(const uint8_t* src_argb, uint16_t* dst_ar64, int width) { - asm volatile( + asm volatile ( "1: \n" "ldp q0, q2, [%0], #32 \n" // load 8 pixels "mov v1.16b, v0.16b \n" @@ -2356,7 +2356,7 @@ static const uvec8 kShuffleARGBToABGR = {2, 1, 0, 3, 6, 5, 4, 7, void ARGBToAB64Row_NEON(const uint8_t* src_argb, uint16_t* dst_ab64, int width) { - asm volatile( + asm volatile ( "ldr q4, [%3] \n" // shuffler "1: \n" "ldp q0, q2, [%0], #32 \n" // load 8 pixels @@ -2379,7 +2379,7 @@ void ARGBToAB64Row_NEON(const uint8_t* src_argb, void ARGBToAR64Row_NEON(const uint8_t* src_argb, uint16_t* dst_ar64, int width) { - asm volatile( + asm volatile ( "1: \n" "ldp q0, q1, [%0], #32 \n" // load 8 ARGB pixels "subs %w2, %w2, #8 \n" // 8 processed per loop. 
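Note: the AR64 path widens each 8-bit channel to 16 bits; the mov-then-interleave sequences above duplicate every byte into both halves of the 16-bit result (v * 0x101), so 0x00 maps to 0x0000 and 0xff to 0xffff. A scalar sketch (hypothetical name):

    #include <stdint.h>

    // ARGB (8 bits/channel) -> AR64 (16 bits/channel) by byte replication,
    // covering the full 16-bit range without changing relative intensity.
    static void ARGBToAR64Row_Sketch(const uint8_t* src_argb, uint16_t* dst_ar64,
                                     int width) {
      for (int i = 0; i < width * 4; ++i) {       // 4 channels per pixel
        dst_ar64[i] = (uint16_t)(src_argb[i] * 0x101);
      }
    }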
@@ -2404,7 +2404,7 @@ static const uvec8 kShuffleARGBToAB64[2] = { void ARGBToAB64Row_NEON(const uint8_t* src_argb, uint16_t* dst_ab64, int width) { - asm volatile( + asm volatile ( "ldp q6, q7, [%3] \n" // 2 shufflers "1: \n" "ldp q0, q1, [%0], #32 \n" // load 8 pixels @@ -2430,7 +2430,7 @@ static const uvec8 kShuffleAR64ToARGB = {1, 3, 5, 7, 9, 11, 13, 15, void AR64ToARGBRow_NEON(const uint16_t* src_ar64, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( "ldr q4, [%3] \n" // shuffler "1: \n" "ldp q0, q1, [%0], #32 \n" // load 4 pixels @@ -2454,7 +2454,7 @@ static const uvec8 kShuffleAB64ToARGB = {5, 3, 1, 7, 13, 11, 9, 15, void AB64ToARGBRow_NEON(const uint16_t* src_ab64, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( "ldr q4, [%3] \n" // shuffler "1: \n" "ldp q0, q1, [%0], #32 \n" // load 4 pixels @@ -2475,7 +2475,7 @@ void AB64ToARGBRow_NEON(const uint16_t* src_ab64, void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, uint8_t* dst_a, int width) { - asm volatile( + asm volatile ( "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 "prfm pldl1keep, [%0, 448] \n" @@ -2501,7 +2501,7 @@ void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb, uint8_t* dst_v, int width, const struct RgbUVConstants* rgbuvconstants) { - asm volatile( + asm volatile ( "ldr d0, [%4] \n" // load rgbuvconstants "dup v24.16b, v0.b[0] \n" // UB 0.875 coefficient "dup v25.16b, v0.b[1] \n" // UG -0.5781 coefficient @@ -3009,7 +3009,7 @@ void RGB565ToUVRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_v, int width) { const uint8_t* src_rgb565_1 = src_rgb565 + src_stride_rgb565; - asm volatile( + asm volatile ( RGBTOUV_SETUP_REG "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. @@ -3067,7 +3067,7 @@ void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555, uint8_t* dst_v, int width) { const uint8_t* src_argb1555_1 = src_argb1555 + src_stride_argb1555; - asm volatile( + asm volatile ( RGBTOUV_SETUP_REG "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. @@ -3125,7 +3125,7 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, uint8_t* dst_v, int width) { const uint8_t* src_argb4444_1 = src_argb4444 + src_stride_argb4444; - asm volatile( + asm volatile ( RGBTOUV_SETUP_REG // sets v20-v25 "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. 
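Note: the ToUVRow kernels above all share one shape: average a 2x2 block (two rows via src_stride, two adjacent columns), then apply the U/V matrix with a 0x8080 bias that rounds and re-centers on 128. A scalar sketch of the matrix step, assuming the classic BT.601 constants that the 0.875 / -0.5781 style comments reference (the matrix-row variants load these from rgbuvconstants instead):

    #include <stdint.h>

    // Coefficients scaled so the >>8 plus the 0x8080 bias lands U/V in 16..240.
    static inline uint8_t RGBToU_Sketch(int r, int g, int b) {
      return (uint8_t)((112 * b - 74 * g - 38 * r + 0x8080) >> 8);
    }
    static inline uint8_t RGBToV_Sketch(int r, int g, int b) {
      return (uint8_t)((112 * r - 94 * g - 18 * b + 0x8080) >> 8);
    }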
@@ -3179,7 +3179,7 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, } void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { - asm volatile( + asm volatile ( "movi v24.8b, #25 \n" // B * 0.1016 coefficient "movi v25.8b, #129 \n" // G * 0.5078 coefficient "movi v26.8b, #66 \n" // R * 0.2578 coefficient @@ -3207,7 +3207,7 @@ void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, uint8_t* dst_y, int width) { - asm volatile( + asm volatile ( "movi v4.8b, #25 \n" // B * 0.1016 coefficient "movi v5.8b, #129 \n" // G * 0.5078 coefficient "movi v6.8b, #66 \n" // R * 0.2578 coefficient @@ -3234,7 +3234,7 @@ void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, uint8_t* dst_y, int width) { - asm volatile( + asm volatile ( "movi v24.8b, #25 \n" // B * 0.1016 coefficient "movi v25.8b, #129 \n" // G * 0.5078 coefficient "movi v26.8b, #66 \n" // R * 0.2578 coefficient @@ -3268,7 +3268,7 @@ void ARGBToYMatrixRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width, const struct RgbConstants* rgbconstants) { - asm volatile( + asm volatile ( "ldr d0, [%3] \n" // load rgbconstants "dup v6.16b, v0.b[0] \n" "dup v7.16b, v0.b[1] \n" @@ -3302,7 +3302,7 @@ ARGBToYMatrixRow_NEON_DotProd(const uint8_t* src_argb, uint8_t* dst_y, int width, const struct RgbConstants* rgbconstants) { - asm volatile( + asm volatile ( "ldr d0, [%3] \n" // load rgbconstants "dup v16.4s, v0.s[0] \n" "dup v17.8h, v0.h[2] \n" @@ -3404,7 +3404,7 @@ void RGBAToYMatrixRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width, const struct RgbConstants* rgbconstants) { - asm volatile( + asm volatile ( "ldr d0, [%3] \n" // load rgbconstants "dup v6.16b, v0.b[0] \n" "dup v7.16b, v0.b[1] \n" @@ -3476,7 +3476,7 @@ void RGBToYMatrixRow_NEON(const uint8_t* src_rgb, uint8_t* dst_y, int width, const struct RgbConstants* rgbconstants) { - asm volatile( + asm volatile ( "ldr d0, [%3] \n" // load rgbconstants "dup v5.16b, v0.b[0] \n" "dup v6.16b, v0.b[1] \n" @@ -3528,7 +3528,7 @@ void InterpolateRow_NEON(uint8_t* dst_ptr, int y1_fraction = source_y_fraction; int y0_fraction = 256 - y1_fraction; const uint8_t* src_ptr1 = src_ptr + src_stride; - asm volatile( + asm volatile ( "cmp %w4, #0 \n" "b.eq 100f \n" "cmp %w4, #128 \n" @@ -3594,7 +3594,7 @@ void InterpolateRow_16_NEON(uint16_t* dst_ptr, int y0_fraction = 256 - y1_fraction; const uint16_t* src_ptr1 = src_ptr + src_stride; - asm volatile( + asm volatile ( "cmp %w4, #0 \n" "b.eq 100f \n" "cmp %w4, #128 \n" @@ -3666,7 +3666,7 @@ void InterpolateRow_16To8_NEON(uint8_t* dst_ptr, const uint16_t* src_ptr1 = src_ptr + src_stride; int shift = 15 - __builtin_clz((int32_t)scale); // Negative shl is shr - asm volatile( + asm volatile ( "dup v6.8h, %w6 \n" "cmp %w4, #0 \n" "b.eq 100f \n" @@ -3734,7 +3734,7 @@ void ARGBBlendRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( "subs %w3, %w3, #8 \n" "b.lt 89f \n" // Blend 8 pixels. @@ -3805,7 +3805,7 @@ void ARGBBlendRow_NEON(const uint8_t* src_argb, void ARGBAttenuateRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( "movi v7.8h, #0x00ff \n" // 255 for rounding up // Attenuate 8 pixels. 
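Note: ARGBAttenuateRow premultiplies each color channel by the pixel's alpha; the v7 constant commented as "255 for rounding up" suggests the rounding form below, which biases upward so a channel of 255 attenuates exactly to a. A per-pixel sketch (exact rounding has varied across libyuv revisions, so treat this as illustrative):

    #include <stdint.h>

    // Premultiply one ARGB pixel's B, G, R by its alpha; alpha is unchanged.
    static inline void AttenuatePixel_Sketch(uint8_t* bgra) {
      uint32_t a = bgra[3];
      bgra[0] = (uint8_t)((bgra[0] * a + 255) >> 8);  // B
      bgra[1] = (uint8_t)((bgra[1] * a + 255) >> 8);  // G
      bgra[2] = (uint8_t)((bgra[2] * a + 255) >> 8);  // R
    }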
@@ -3835,7 +3835,7 @@ void ARGBQuantizeRow_NEON(uint8_t* dst_argb, int interval_size, int interval_offset, int width) { - asm volatile( + asm volatile ( "dup v4.8h, %w2 \n" "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1 "dup v5.8h, %w3 \n" // interval multiply. @@ -3878,7 +3878,7 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width, uint32_t value) { - asm volatile( + asm volatile ( "dup v0.4s, %w3 \n" // duplicate scale value. "zip1 v0.16b, v0.16b, v0.16b \n" // v0.16b aarrggbbaarrggbb. "ushr v0.8h, v0.8h, #1 \n" // scale / 2. @@ -3913,7 +3913,7 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb, // Similar to ARGBToYJ but stores ARGB. // C code is (29 * b + 150 * g + 77 * r + 128) >> 8; void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( "movi v24.8b, #29 \n" // B * 0.1140 coefficient "movi v25.8b, #150 \n" // G * 0.5870 coefficient "movi v26.8b, #77 \n" // R * 0.2990 coefficient @@ -3942,7 +3942,7 @@ void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { // r = (r * 50 + g * 98 + b * 24) >> 7 void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( "movi v20.8b, #17 \n" // BB coefficient "movi v21.8b, #68 \n" // BG coefficient "movi v22.8b, #35 \n" // BR coefficient @@ -3984,7 +3984,7 @@ void ARGBColorMatrixRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, const int8_t* matrix_argb, int width) { - asm volatile( + asm volatile ( "ld1 {v2.16b}, [%3] \n" // load 3 ARGB vectors. "sxtl v0.8h, v2.8b \n" // B,G coefficients s16. "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16. @@ -4100,7 +4100,7 @@ void ARGBMultiplyRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( // 8 pixel loop. "1: \n" "ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB @@ -4131,7 +4131,7 @@ void ARGBAddRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( // 8 pixel loop. "1: \n" "ldp q0, q1, [%0], #32 \n" // load 8 ARGB @@ -4156,7 +4156,7 @@ void ARGBSubtractRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( // 8 pixel loop. "1: \n" "ldp q0, q1, [%0], #32 \n" // load 8 ARGB @@ -4185,7 +4185,7 @@ void SobelRow_NEON(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( "movi v3.8b, #255 \n" // alpha // 8 pixel loop. "1: \n" @@ -4212,7 +4212,7 @@ void SobelToPlaneRow_NEON(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_y, int width) { - asm volatile( + asm volatile ( // 16 pixel loop. "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx. @@ -4240,7 +4240,7 @@ void SobelXYRow_NEON(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_argb, int width) { - asm volatile( + asm volatile ( "movi v3.8b, #255 \n" // alpha // 8 pixel loop. 
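Note: the gray and sepia kernels above carry their math in comments, e.g. "(29 * b + 150 * g + 77 * r + 128) >> 8" and "r = (r * 50 + g * 98 + b * 24) >> 7". A scalar transcription (hypothetical helpers; the sepia green-row constants are not visible in this hunk and are assumed here, so treat them as illustrative):

    #include <stdint.h>

    static inline uint8_t Clamp255(int v) { return (uint8_t)(v > 255 ? 255 : v); }

    // Gray: luma from the commented formula, replicated into B, G and R.
    static inline void GrayPixel_Sketch(uint8_t* bgra) {
      uint8_t y =
          (uint8_t)((29 * bgra[0] + 150 * bgra[1] + 77 * bgra[2] + 128) >> 8);
      bgra[0] = bgra[1] = bgra[2] = y;  // alpha (bgra[3]) left as-is
    }

    // Sepia: per-channel dot products scaled by 128. The blue row (17/68/35)
    // and red row (24/98/50) come from the comments above; the green row
    // (22/88/45) is assumed. Sums can exceed 255, hence the clamp.
    static inline void SepiaPixel_Sketch(uint8_t* bgra) {
      int b = bgra[0], g = bgra[1], r = bgra[2];
      bgra[0] = Clamp255((b * 17 + g * 68 + r * 35) >> 7);
      bgra[1] = Clamp255((b * 22 + g * 88 + r * 45) >> 7);  // assumed row
      bgra[2] = Clamp255((b * 24 + g * 98 + r * 50) >> 7);
    }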
"1: \n" @@ -4269,7 +4269,7 @@ void SobelXRow_NEON(const uint8_t* src_y0, const uint8_t* src_y2, uint8_t* dst_sobelx, int width) { - asm volatile( + asm volatile ( "1: \n" "ld1 {v0.8b}, [%0],%5 \n" // top "ld1 {v1.8b}, [%0],%6 \n" @@ -4310,7 +4310,7 @@ void SobelYRow_NEON(const uint8_t* src_y0, const uint8_t* src_y1, uint8_t* dst_sobely, int width) { - asm volatile( + asm volatile ( "1: \n" "ld1 {v0.8b}, [%0],%4 \n" // left "ld1 {v1.8b}, [%1],%4 \n" @@ -4346,7 +4346,7 @@ void HalfFloat1Row_NEON(const uint16_t* src, uint16_t* dst, float /*unused*/, int width) { - asm volatile( + asm volatile ( "1: \n" "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts "subs %w2, %w2, #8 \n" // 8 pixels per loop @@ -4370,7 +4370,7 @@ void HalfFloatRow_NEON(const uint16_t* src, uint16_t* dst, float scale, int width) { - asm volatile( + asm volatile ( "1: \n" "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts "subs %w2, %w2, #8 \n" // 8 pixels per loop @@ -4396,7 +4396,7 @@ void ByteToFloatRow_NEON(const uint8_t* src, float* dst, float scale, int width) { - asm volatile( + asm volatile ( "1: \n" "ld1 {v1.8b}, [%0], #8 \n" // load 8 bytes "subs %w2, %w2, #8 \n" // 8 pixels per loop @@ -4421,7 +4421,7 @@ void ByteToFloatRow_NEON(const uint8_t* src, void ConvertFP16ToFP32Row_NEON(const uint16_t* src, // fp16 float* dst, int width) { - asm volatile( + asm volatile ( "1: \n" "ld1 {v1.8h}, [%0], #16 \n" // load 8 halffloats "subs %w2, %w2, #8 \n" // 8 floats per loop @@ -4443,7 +4443,7 @@ void ConvertFP16ToFP32Column_NEON(const uint16_t* src, // fp16 int src_stride, // stride in elements float* dst, int width) { - asm volatile( + asm volatile ( "cmp %w2, #8 \n" // Is there 8 rows? "b.lo 2f \n" "1: \n" @@ -4481,7 +4481,7 @@ void ConvertFP16ToFP32Column_NEON(const uint16_t* src, // fp16 void ConvertFP32ToFP16Row_NEON(const float* src, uint16_t* dst, // fp16 int width) { - asm volatile( + asm volatile ( "1: \n" "ldp q2, q3, [%0], #32 \n" // load 8 floats "subs %w2, %w2, #8 \n" // 8 floats per loop @@ -4502,7 +4502,7 @@ float ScaleMaxSamples_NEON(const float* src, float scale, int width) { float fmax; - asm volatile( + asm volatile ( "movi v5.4s, #0 \n" // max "movi v6.4s, #0 \n" @@ -4532,7 +4532,7 @@ float ScaleSumSamples_NEON(const float* src, float scale, int width) { float fsum; - asm volatile( + asm volatile ( "movi v5.4s, #0 \n" // max "movi v6.4s, #0 \n" // max @@ -4559,7 +4559,7 @@ float ScaleSumSamples_NEON(const float* src, } void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) { - asm volatile( + asm volatile ( "1: \n" "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples "prfm pldl1keep, [%0, 448] \n" @@ -4583,7 +4583,7 @@ void GaussCol_NEON(const uint16_t* src0, const uint16_t* src4, uint32_t* dst, int width) { - asm volatile( + asm volatile ( "movi v6.8h, #4 \n" // constant 4 "movi v7.8h, #6 \n" // constant 6 @@ -4625,7 +4625,7 @@ void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) { const uint32_t* src1 = src + 1; const uint32_t* src2 = src + 2; const uint32_t* src3 = src + 3; - asm volatile( + asm volatile ( "movi v6.4s, #4 \n" // constant 4 "movi v7.4s, #6 \n" // constant 6 @@ -4668,7 +4668,7 @@ void GaussCol_F32_NEON(const float* src0, const float* src4, float* dst, int width) { - asm volatile( + asm volatile ( "ld2r {v6.4s, v7.4s}, [%7] \n" // constants 4 and 6 "1: \n" @@ -4706,7 +4706,7 @@ void GaussCol_F32_NEON(const float* src0, // filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. 
void GaussRow_F32_NEON(const float* src, float* dst, int width) { - asm volatile( + asm volatile ( "ld3r {v6.4s, v7.4s, v8.4s}, [%3] \n" // constants 4, 6, 1/256 "1: \n" @@ -4745,7 +4745,7 @@ void NV21ToYUV24Row_NEON(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_yuv24, int width) { - asm volatile( + asm volatile ( "1: \n" "ld1 {v2.16b}, [%0], #16 \n" // load 16 Y values "ld2 {v0.8b, v1.8b}, [%1], #16 \n" // load 8 VU values @@ -4776,7 +4776,7 @@ void NV21ToYUV24Row_NEON(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_yuv24, int width) { - asm volatile( + asm volatile ( "ld1 {v5.16b,v6.16b,v7.16b}, [%4] \n" // 3 shuffler constants "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 16 Y values @@ -4806,7 +4806,7 @@ void AYUVToUVRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_uv, int width) { const uint8_t* src_ayuv_1 = src_ayuv + src_stride_ayuv; - asm volatile( + asm volatile ( "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ayuv @@ -4835,7 +4835,7 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv, uint8_t* dst_vu, int width) { const uint8_t* src_ayuv_1 = src_ayuv + src_stride_ayuv; - asm volatile( + asm volatile ( "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ayuv @@ -4861,7 +4861,7 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv, // Copy row of AYUV Y's into Y void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) { - asm volatile( + asm volatile ( "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 "subs %w2, %w2, #16 \n" // 16 pixels per loop @@ -4877,7 +4877,7 @@ void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) { // Convert UV plane of NV12 to VU of NV21. void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) { - asm volatile( + asm volatile ( "1: \n" "ld1 {v0.16b}, [%0], 16 \n" // load 16 UV values "ld1 {v1.16b}, [%0], 16 \n" @@ -4902,7 +4902,7 @@ void HalfMergeUVRow_NEON(const uint8_t* src_u, int width) { const uint8_t* src_u_1 = src_u + src_stride_u; const uint8_t* src_v_1 = src_v + src_stride_v; - asm volatile( + asm volatile ( "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 16 U values "ld1 {v1.16b}, [%2], #16 \n" // load 16 V values @@ -4937,7 +4937,7 @@ void SplitUVRow_16_NEON(const uint16_t* src_uv, int depth, int width) { int shift = depth - 16; // Negative for right shift. - asm volatile( + asm volatile ( "dup v2.8h, %w4 \n" "1: \n" "ld2 {v0.8h, v1.8h}, [%0], #32 \n" // load 8 UV @@ -4960,7 +4960,7 @@ void MultiplyRow_16_NEON(const uint16_t* src_y, uint16_t* dst_y, int scale, int width) { - asm volatile( + asm volatile ( "dup v2.8h, %w3 \n" "1: \n" "ldp q0, q1, [%0], #32 \n" @@ -4981,7 +4981,7 @@ void DivideRow_16_NEON(const uint16_t* src_y, uint16_t* dst_y, int scale, int width) { - asm volatile( + asm volatile ( "dup v4.8h, %w3 \n" "1: \n" "ldp q2, q3, [%0], #32 \n" @@ -5015,7 +5015,7 @@ void Convert16To8Row_NEON(const uint16_t* src_y, // saturate, then we can just use UZP2 to narrow rather than a pair of // saturating narrow instructions. int shift = 23 - __builtin_clz((int32_t)scale); - asm volatile( + asm volatile ( "dup v2.8h, %w3 \n" "1: \n" "ldp q0, q1, [%0], #32 \n" diff --git a/source/row_rvv.cc b/source/row_rvv.cc index 3b04ec5f7..0533866c0 100644 --- a/source/row_rvv.cc +++ b/source/row_rvv.cc @@ -47,7 +47,7 @@ extern "C" { // register) is set to round-to-nearest-up mode(0). 
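Note on the csrwi hunks below: they only adjust spacing, since a GNU asm statement with no output operands is already implicitly volatile. The substantive edits in this patch are the ones adding volatile to asm blocks that do have outputs, which stops the optimizer from deleting or reordering them when their results look unused. A small RISC-V illustration (hypothetical helpers; rdcycle is a standard counter-read instruction, not from this patch):

    #include <stdint.h>

    // No outputs: implicitly volatile, so the explicit keyword is documentation.
    static inline void SetVxrmRoundToNearestUp(void) {
      asm volatile("csrwi vxrm, 0");
    }

    // Has an output: without volatile the compiler may treat the asm as a pure
    // function of its inputs and delete or hoist it; volatile forbids that.
    static inline uint32_t ReadCycleCounter(void) {
      uint32_t cycles;
      asm volatile("rdcycle %0" : "=r"(cycles));
      return cycles;
    }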
#define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, yg, bb, bg, br) \ { \ - asm volatile("csrwi vxrm, 0"); \ + asm volatile ("csrwi vxrm, 0"); \ ub = yuvconst->kUVCoeff[0]; \ vr = yuvconst->kUVCoeff[1]; \ ug = yuvconst->kUVCoeff[2]; \ @@ -1238,7 +1238,7 @@ void I400ToARGBRow_RVV(const uint8_t* src_y, vuint16m4_t v_yg = __riscv_vmv_v_x_u16m4(yuvconstants->kRGBCoeffBias[0], vl); // To match behavior on other platforms, vxrm (fixed-point rounding mode // register) sets to round-to-nearest-up mode(0). - asm volatile("csrwi vxrm, 0"); + asm volatile ("csrwi vxrm, 0"); if (is_yb_positive) { v_yb = __riscv_vmv_v_x_u16m4(yuvconstants->kRGBCoeffBias[4] - 32, vl); } else { @@ -1632,7 +1632,7 @@ void InterpolateRow_RVV(uint8_t* dst_ptr, } // To match behavior on other platforms, vxrm (fixed-point rounding mode // register) is set to round-to-nearest-up(0). - asm volatile("csrwi vxrm, 0"); + asm volatile ("csrwi vxrm, 0"); // Blend 50 / 50. if (y1_fraction == 128) { do { diff --git a/source/row_sve.cc b/source/row_sve.cc index f6c1fe624..89a86d53b 100644 --- a/source/row_sve.cc +++ b/source/row_sve.cc @@ -139,7 +139,8 @@ void I444ToARGBRow_SVE2(const uint8_t* src_y, const struct YuvConstants* yuvconstants, int width) { uint64_t vl; - asm("cnth %[vl] \n" + asm volatile ( + "cnth %[vl] \n" "ptrue p0.b \n" YUVTORGB_SVE_SETUP "dup z19.b, #255 \n" /* A */ "subs %w[width], %w[width], %w[vl] \n" @@ -181,7 +182,8 @@ void I400ToARGBRow_SVE2(const uint8_t* src_y, const struct YuvConstants* yuvconstants, int width) { uint64_t vl; - asm("cnth %[vl] \n" + asm volatile ( + "cnth %[vl] \n" "ptrue p0.b \n" "dup z19.b, #255 \n" // A YUVTORGB_SVE_SETUP @@ -229,7 +231,8 @@ void I422ToARGBRow_SVE2(const uint8_t* src_y, const struct YuvConstants* yuvconstants, int width) { uint64_t vl; - asm("cnth %[vl] \n" + asm volatile ( + "cnth %[vl] \n" "ptrue p0.b \n" YUVTORGB_SVE_SETUP "dup z19.b, #255 \n" /* A */ "subs %w[width], %w[width], %w[vl] \n" @@ -273,7 +276,8 @@ void I422ToRGBARow_SVE2(const uint8_t* src_y, const struct YuvConstants* yuvconstants, int width) { uint64_t vl; - asm("cnth %[vl] \n" + asm volatile ( + "cnth %[vl] \n" "ptrue p0.b \n" YUVTORGB_SVE_SETUP "dup z19.b, #255 \n" // A "subs %w[width], %w[width], %w[vl] \n" @@ -318,7 +322,8 @@ void I444AlphaToARGBRow_SVE2(const uint8_t* src_y, const struct YuvConstants* yuvconstants, int width) { uint64_t vl; - asm("cnth %[vl] \n" + asm volatile ( + "cnth %[vl] \n" "ptrue p0.b \n" YUVTORGB_SVE_SETUP "subs %w[width], %w[width], %w[vl] \n" "b.lt 2f \n" @@ -366,7 +371,8 @@ void I422AlphaToARGBRow_SVE2(const uint8_t* src_y, const struct YuvConstants* yuvconstants, int width) { uint64_t vl; - asm("cnth %[vl] \n" + asm volatile ( + "cnth %[vl] \n" "ptrue p0.b \n" YUVTORGB_SVE_SETUP "subs %w[width], %w[width], %w[vl] \n" "b.lt 2f \n" @@ -416,11 +422,13 @@ static inline void NVToARGBRow_SVE2(const uint8_t* src_y, uint32_t nv_v_start, uint32_t nv_v_step) { uint64_t vl; - asm("cnth %0" : "=r"(vl)); + asm volatile ( + "cnth %0" : "=r"(vl)); int width_last_y = width & (vl - 1); width_last_y = width_last_y == 0 ? 
vl : width_last_y; int width_last_uv = width_last_y + (width_last_y & 1); - asm("ptrue p0.b \n" YUVTORGB_SVE_SETUP + asm volatile ( + "ptrue p0.b \n" YUVTORGB_SVE_SETUP "index z22.s, %w[nv_u_start], %w[nv_u_step] \n" "index z23.s, %w[nv_v_start], %w[nv_v_step] \n" "dup z19.b, #255 \n" // A @@ -534,7 +542,7 @@ void ARGBToUVMatrixRow_SVE2(const uint8_t* src_argb, const int16_t* uvconstants) { const uint8_t* src_argb_1 = src_argb + src_stride_argb; uint64_t vl; - asm volatile( + asm volatile ( "ptrue p0.b \n" "ld1rd {z24.d}, p0/z, [%[uvconstants]] \n" "ld1rd {z25.d}, p0/z, [%[uvconstants], #8] \n" @@ -746,7 +754,8 @@ void ARGBToRGB565Row_SVE2(const uint8_t* src_argb, unsigned bsl_mask = 0x7e0; uint64_t vl; width *= 2; - asm("mov z3.h, #3 \n" + asm volatile ( + "mov z3.h, #3 \n" "dup z4.h, %w[bsl_mask] \n" "cntb %[vl] \n" @@ -787,7 +796,8 @@ void ARGBToRGB565DitherRow_SVE2(const uint8_t* src_argb, unsigned bsl_mask = 0x7e0; uint64_t vl; width *= 2; - asm("mov z3.h, #3 \n" + asm volatile ( + "mov z3.h, #3 \n" "dup z4.h, %w[bsl_mask] \n" "dup z2.s, %w[dither4] \n" "zip1 z2.b, z2.b, z2.b \n" @@ -844,7 +854,8 @@ void ARGB1555ToARGBRow_SVE2(const uint8_t* src_argb1555, uint8_t* dst_argb, int width) { uint64_t vl; - asm("mov z4.h, #0x0300 \n" + asm volatile ( + "mov z4.h, #0x0300 \n" "ptrue p0.b \n" "cnth %x[vl] \n" @@ -912,7 +923,8 @@ void AYUVToUVRow_SVE2(const uint8_t* src_ayuv, // Output a row of UV values, filtering 2x2 rows of AYUV. const uint8_t* src_ayuv1 = src_ayuv + src_stride_ayuv; int vl; - asm("cntb %x[vl] \n" + asm volatile ( + "cntb %x[vl] \n" "subs %w[width], %w[width], %w[vl] \n" "b.lt 2f \n" @@ -950,7 +962,8 @@ void AYUVToVURow_SVE2(const uint8_t* src_ayuv, // Output a row of VU values, filtering 2x2 rows of AYUV. const uint8_t* src_ayuv1 = src_ayuv + src_stride_ayuv; int vl; - asm("cntb %x[vl] \n" + asm volatile ( + "cntb %x[vl] \n" "cmp %w[width], %w[vl] \n" "subs %w[width], %w[width], %w[vl] \n" "b.lt 2f \n" @@ -990,10 +1003,12 @@ void YUY2ToARGBRow_SVE2(const uint8_t* src_yuy2, uint32_t nv_v_start = 0x0003'0003U; uint32_t nv_v_step = 0x0004'0004U; uint64_t vl; - asm("cnth %0" : "=r"(vl)); + asm volatile ( + "cnth %0" : "=r"(vl)); int width_last_y = width & (vl - 1); int width_last_uv = width_last_y + (width_last_y & 1); - asm("ptrue p0.b \n" + asm volatile ( + "ptrue p0.b \n" "index z22.s, %w[nv_u_start], %w[nv_u_step] \n" "index z23.s, %w[nv_v_start], %w[nv_v_step] \n" "dup z19.b, #255 \n" // A @@ -1047,10 +1062,12 @@ void UYVYToARGBRow_SVE2(const uint8_t* src_uyvy, uint32_t nv_v_start = 0x0002'0002U; uint32_t nv_v_step = 0x0004'0004U; uint64_t vl; - asm("cnth %0" : "=r"(vl)); + asm volatile ( + "cnth %0" : "=r"(vl)); int width_last_y = width & (vl - 1); int width_last_uv = width_last_y + (width_last_y & 1); - asm("ptrue p0.b \n" + asm volatile ( + "ptrue p0.b \n" "index z22.s, %w[nv_u_start], %w[nv_u_step] \n" "index z23.s, %w[nv_v_start], %w[nv_v_step] \n" "dup z19.b, #255 \n" // A diff --git a/source/scale_gcc.cc b/source/scale_gcc.cc index 304770d0c..9dfe64a93 100644 --- a/source/scale_gcc.cc +++ b/source/scale_gcc.cc @@ -193,7 +193,7 @@ void ScaleRowDown2_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm(LABELALIGN + asm volatile (LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" @@ -472,7 +472,7 @@ void ScaleRowDown34_SSSE3(const uint8_t* src_ptr, "m"(kShuf1), // %1 "m"(kShuf2) // %2 ); - asm(LABELALIGN + asm volatile (LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm2 \n" @@ -515,7 +515,7 @@ 
void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr, "m"(kMadd11), // %1 "m"(kRound34) // %2 ); - asm(LABELALIGN + asm volatile (LABELALIGN "1: \n" "movdqu (%0),%%xmm6 \n" "movdqu 0x00(%0,%3,1),%%xmm7 \n" @@ -578,7 +578,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr, "m"(kRound34) // %2 ); - asm(LABELALIGN + asm volatile (LABELALIGN "1: \n" "movdqu (%0),%%xmm6 \n" "movdqu 0x00(%0,%3,1),%%xmm7 \n" @@ -667,7 +667,7 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr, "m"(kShufAb2), // %2 "m"(kScaleAb2) // %3 ); - asm(LABELALIGN + asm volatile (LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x00(%0,%3,1),%%xmm1 \n" @@ -708,7 +708,7 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, "m"(kShufAc3), // %1 "m"(kScaleAc33) // %2 ); - asm(LABELALIGN + asm volatile (LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x00(%0,%3,1),%%xmm6 \n" @@ -821,7 +821,7 @@ void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { - asm(LABELALIGN + asm volatile (LABELALIGN "1: \n" "pxor %%xmm0,%%xmm0 \n" // 0 // above line @@ -1900,7 +1900,7 @@ void ScaleColsUp2_SSE2(uint8_t* dst_ptr, int dx) { (void)x; (void)dx; - asm(LABELALIGN + asm volatile (LABELALIGN "1: \n" "movdqu (%1),%%xmm0 \n" "lea 0x10(%1),%1 \n" @@ -1925,7 +1925,7 @@ void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb, uint8_t* dst_argb, int dst_width) { (void)src_stride; - asm(LABELALIGN + asm volatile (LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -1947,7 +1947,7 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb, uint8_t* dst_argb, int dst_width) { (void)src_stride; - asm(LABELALIGN + asm volatile (LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -1971,7 +1971,7 @@ void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb, ptrdiff_t src_stride, uint8_t* dst_argb, int dst_width) { - asm(LABELALIGN + asm volatile (LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -2153,7 +2153,7 @@ void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb, int dx) { (void)x; (void)dx; - asm(LABELALIGN + asm volatile (LABELALIGN "1: \n" "movdqu (%1),%%xmm0 \n" "lea 0x10(%1),%1 \n" diff --git a/source/scale_neon.cc b/source/scale_neon.cc index ccc751062..309d7b0bf 100644 --- a/source/scale_neon.cc +++ b/source/scale_neon.cc @@ -28,7 +28,7 @@ void ScaleRowDown2_NEON(const uint8_t* src_ptr, uint8_t* dst, int dst_width) { (void)src_stride; - asm volatile( + asm volatile ( "1: \n" // load even pixels into q0, odd into q1 "vld2.8 {q0, q1}, [%0]! \n" @@ -49,7 +49,7 @@ void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr, uint8_t* dst, int dst_width) { (void)src_stride; - asm volatile( + asm volatile ( "1: \n" "vld2.8 {q0, q1}, [%0]! \n" // load 32 pixels "subs %2, %2, #16 \n" // 16 processed per loop @@ -69,7 +69,7 @@ void ScaleRowDown2Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width) { - asm volatile( + asm volatile ( // change the stride to row 2 pointer "add %1, %0 \n" "1: \n" @@ -100,7 +100,7 @@ void ScaleRowDown4_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm volatile( + asm volatile ( "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! 
\n" // src line 0 "subs %2, %2, #8 \n" // 8 processed per loop @@ -120,7 +120,7 @@ void ScaleRowDown4Box_NEON(const uint8_t* src_ptr, const uint8_t* src_ptr1 = src_ptr + src_stride; const uint8_t* src_ptr2 = src_ptr + src_stride * 2; const uint8_t* src_ptr3 = src_ptr + src_stride * 3; - asm volatile( + asm volatile ( "1: \n" "vld1.8 {q0}, [%0]! \n" // load up 16x4 "vld1.8 {q1}, [%3]! \n" @@ -154,7 +154,7 @@ void ScaleRowDown34_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm volatile( + asm volatile ( "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 "subs %2, %2, #24 \n" @@ -172,7 +172,7 @@ void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { - asm volatile( + asm volatile ( "vmov.u8 d24, #3 \n" "add %3, %0 \n" "1: \n" @@ -229,7 +229,7 @@ void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { - asm volatile( + asm volatile ( "vmov.u8 d24, #3 \n" "add %3, %0 \n" "1: \n" @@ -281,7 +281,7 @@ void ScaleRowDown38_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm volatile( + asm volatile ( "vld1.8 {q3}, [%3] \n" "1: \n" "vld1.8 {d0, d1, d2, d3}, [%0]! \n" @@ -305,7 +305,7 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr, int dst_width) { const uint8_t* src_ptr1 = src_ptr + src_stride * 2; - asm volatile( + asm volatile ( "vld1.16 {q13}, [%5] \n" "vld1.8 {q14}, [%6] \n" "vld1.8 {q15}, [%7] \n" @@ -415,7 +415,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { - asm volatile( + asm volatile ( "vld1.16 {q13}, [%4] \n" "vld1.8 {q14}, [%5] \n" "add %3, %0 \n" @@ -508,7 +508,7 @@ void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { const uint8_t* src_temp = src_ptr + 1; - asm volatile( + asm volatile ( "vmov.u8 d30, #3 \n" "1: \n" @@ -545,7 +545,7 @@ void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr, const uint8_t* src_temp = src_ptr + 1; const uint8_t* src_temp1 = src_ptr1 + 1; - asm volatile( + asm volatile ( "vmov.u16 q15, #3 \n" "vmov.u8 d28, #3 \n" @@ -607,7 +607,7 @@ void ScaleRowUp2_Linear_12_NEON(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { const uint16_t* src_temp = src_ptr + 1; - asm volatile( + asm volatile ( "vmov.u16 q15, #3 \n" "1: \n" @@ -643,7 +643,7 @@ void ScaleRowUp2_Bilinear_12_NEON(const uint16_t* src_ptr, const uint16_t* src_temp = src_ptr + 1; const uint16_t* src_temp1 = src_ptr1 + 1; - asm volatile( + asm volatile ( "vmov.u16 q15, #3 \n" "1: \n" @@ -694,7 +694,7 @@ void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { const uint16_t* src_temp = src_ptr + 1; - asm volatile( + asm volatile ( "vmov.u16 d31, #3 \n" "1: \n" @@ -738,7 +738,7 @@ void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, const uint16_t* src_temp = src_ptr + 1; const uint16_t* src_temp1 = src_ptr1 + 1; - asm volatile( + asm volatile ( "vmov.u16 d31, #3 \n" "vmov.u32 q14, #3 \n" @@ -790,7 +790,7 @@ void ScaleUVRowUp2_Linear_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { const uint8_t* src_temp = src_ptr + 2; - asm volatile( + asm volatile ( "vmov.u8 d30, #3 \n" "1: \n" @@ -827,7 +827,7 @@ void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr, const uint8_t* src_temp = src_ptr + 2; const uint8_t* src_temp1 = src_ptr1 + 2; - asm volatile( + asm volatile ( "vmov.u16 q15, #3 \n" "vmov.u8 d28, #3 \n" @@ -889,7 +889,7 @@ 
void ScaleUVRowUp2_Linear_16_NEON(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { const uint16_t* src_temp = src_ptr + 2; - asm volatile( + asm volatile ( "vmov.u16 d30, #3 \n" "1: \n" @@ -934,7 +934,7 @@ void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, const uint16_t* src_temp = src_ptr + 2; const uint16_t* src_temp1 = src_ptr1 + 2; - asm volatile( + asm volatile ( "vmov.u16 d30, #3 \n" "vmov.u32 q14, #3 \n" @@ -987,7 +987,7 @@ void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, void ScaleAddRow_NEON(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { - asm volatile( + asm volatile ( "1: \n" "vld1.16 {q1, q2}, [%1] \n" // load accumulator "vld1.8 {q0}, [%0]! \n" // load 16 bytes @@ -1086,7 +1086,7 @@ void ScaleFilterRows_NEON(uint8_t* dst_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) { - asm volatile( + asm volatile ( "cmp %4, #0 \n" "beq 100f \n" "add %2, %1 \n" @@ -1170,7 +1170,7 @@ void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr, uint8_t* dst, int dst_width) { (void)src_stride; - asm volatile( + asm volatile ( "1: \n" "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB @@ -1198,7 +1198,7 @@ void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int dst_width) { (void)src_stride; - asm volatile( + asm volatile ( "1: \n" "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB @@ -1219,7 +1219,7 @@ void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width) { - asm volatile( + asm volatile ( // change the stride to row 2 pointer "add %1, %1, %0 \n" "1: \n" @@ -1258,7 +1258,7 @@ void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int dst_width) { (void)src_stride; - asm volatile( + asm volatile ( "mov r12, %3, lsl #2 \n" "1: \n" "vld1.32 {d0[0]}, [%0], r12 \n" @@ -1282,7 +1282,7 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb, int src_stepx, uint8_t* dst_argb, int dst_width) { - asm volatile( + asm volatile ( "mov r12, %4, lsl #2 \n" "add %1, %1, %0 \n" "1: \n" @@ -1330,7 +1330,7 @@ void ScaleARGBCols_NEON(uint8_t* dst_argb, int dx) { int tmp; const uint8_t* src_tmp = src_argb; - asm volatile( + asm volatile ( "1: \n" // clang-format off LOAD1_DATA32_LANE(d0, 0) @@ -1433,7 +1433,7 @@ void ScaleUVRowDown2_NEON(const uint8_t* src_ptr, uint8_t* dst, int dst_width) { (void)src_stride; - asm volatile( + asm volatile ( "1: \n" "vld2.16 {d0, d2}, [%0]! \n" // load 8 UV pixels. "vld2.16 {d1, d3}, [%0]! \n" // load next 8 UV @@ -1452,7 +1452,7 @@ void ScaleUVRowDown2Linear_NEON(const uint8_t* src_ptr, uint8_t* dst, int dst_width) { (void)src_stride; - asm volatile( + asm volatile ( "1: \n" "vld2.16 {d0, d2}, [%0]! \n" // load 8 UV pixels. "vld2.16 {d1, d3}, [%0]! 
\n" // load next 8 UV @@ -1471,7 +1471,7 @@ void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width) { - asm volatile( + asm volatile ( // change the stride to row 2 pointer "add %1, %1, %0 \n" "1: \n" @@ -1506,7 +1506,7 @@ void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr, const uint8_t* src2_ptr = src_ptr + src_stepx * 4; const uint8_t* src3_ptr = src_ptr + src_stepx * 6; (void)src_stride; - asm volatile( + asm volatile ( "1: \n" "vld1.16 {d0[0]}, [%0], %6 \n" "vld1.16 {d0[1]}, [%1], %6 \n" diff --git a/source/scale_neon64.cc b/source/scale_neon64.cc index 12b4b4d09..da1e3d436 100644 --- a/source/scale_neon64.cc +++ b/source/scale_neon64.cc @@ -26,7 +26,7 @@ void ScaleRowDown2_NEON(const uint8_t* src_ptr, uint8_t* dst, int dst_width) { (void)src_stride; - asm volatile( + asm volatile ( "1: \n" // load even pixels into v0, odd into v1 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" @@ -48,7 +48,7 @@ void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr, uint8_t* dst, int dst_width) { (void)src_stride; - asm volatile( + asm volatile ( "1: \n" // load even pixels into v0, odd into v1 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" @@ -70,7 +70,7 @@ void ScaleRowDown2Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width) { - asm volatile( + asm volatile ( // change the stride to row 2 pointer "add %1, %1, %0 \n" "1: \n" @@ -101,7 +101,7 @@ void ScaleRowDown4_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm volatile( + asm volatile ( "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 "subs %w2, %w2, #8 \n" // 8 processed per loop @@ -122,7 +122,7 @@ void ScaleRowDown4Box_NEON(const uint8_t* src_ptr, const uint8_t* src_ptr1 = src_ptr + src_stride; const uint8_t* src_ptr2 = src_ptr + src_stride * 2; const uint8_t* src_ptr3 = src_ptr + src_stride * 3; - asm volatile( + asm volatile ( "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load up 16x4 "ld1 {v1.16b}, [%2], #16 \n" @@ -159,7 +159,7 @@ void ScaleRowDown34_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm volatile( + asm volatile ( "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 "subs %w2, %w2, #24 \n" @@ -178,7 +178,7 @@ void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { - asm volatile( + asm volatile ( "movi v20.8b, #3 \n" "add %3, %3, %0 \n" "1: \n" @@ -237,7 +237,7 @@ void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { - asm volatile( + asm volatile ( "movi v20.8b, #3 \n" "add %3, %3, %0 \n" "1: \n" @@ -292,7 +292,7 @@ void ScaleRowDown38_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm volatile( + asm volatile ( "ld1 {v3.16b}, [%3] \n" "1: \n" "ld1 {v0.16b,v1.16b}, [%0], #32 \n" @@ -317,7 +317,7 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr, const uint8_t* src_ptr1 = src_ptr + src_stride * 2; ptrdiff_t tmp_src_stride = src_stride; - asm volatile( + asm volatile ( "ld1 {v29.8h}, [%5] \n" "ld1 {v30.16b}, [%6] \n" "ld1 {v31.8h}, [%7] \n" @@ -439,7 +439,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr, int dst_width) { // TODO(fbarchard): use src_stride directly for clang 3.5+. 
ptrdiff_t tmp_src_stride = src_stride; - asm volatile( + asm volatile ( "ld1 {v30.8h}, [%4] \n" "ld1 {v31.16b}, [%5] \n" "add %2, %2, %0 \n" @@ -539,7 +539,7 @@ void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { const uint8_t* src_temp = src_ptr + 1; - asm volatile( + asm volatile ( "movi v31.8b, #3 \n" "1: \n" @@ -578,7 +578,7 @@ void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr, const uint8_t* src_temp = src_ptr + 1; const uint8_t* src_temp1 = src_ptr1 + 1; - asm volatile( + asm volatile ( "movi v31.8b, #3 \n" "movi v30.8h, #3 \n" @@ -634,7 +634,7 @@ void ScaleRowUp2_Linear_12_NEON(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { const uint16_t* src_temp = src_ptr + 1; - asm volatile( + asm volatile ( "movi v31.8h, #3 \n" "1: \n" @@ -671,7 +671,7 @@ void ScaleRowUp2_Bilinear_12_NEON(const uint16_t* src_ptr, const uint16_t* src_temp = src_ptr + 1; const uint16_t* src_temp1 = src_ptr1 + 1; - asm volatile( + asm volatile ( "movi v31.8h, #3 \n" "1: \n" @@ -725,7 +725,7 @@ void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { const uint16_t* src_temp = src_ptr + 1; - asm volatile( + asm volatile ( "movi v31.8h, #3 \n" "1: \n" @@ -770,7 +770,7 @@ void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, const uint16_t* src_temp = src_ptr + 1; const uint16_t* src_temp1 = src_ptr1 + 1; - asm volatile( + asm volatile ( "movi v31.4h, #3 \n" "movi v30.4s, #3 \n" @@ -825,7 +825,7 @@ void ScaleUVRowUp2_Linear_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { const uint8_t* src_temp = src_ptr + 2; - asm volatile( + asm volatile ( "movi v31.8b, #3 \n" "1: \n" @@ -864,7 +864,7 @@ void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr, const uint8_t* src_temp = src_ptr + 2; const uint8_t* src_temp1 = src_ptr1 + 2; - asm volatile( + asm volatile ( "movi v31.8b, #3 \n" "movi v30.8h, #3 \n" @@ -920,7 +920,7 @@ void ScaleUVRowUp2_Linear_16_NEON(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { const uint16_t* src_temp = src_ptr + 2; - asm volatile( + asm volatile ( "movi v31.8h, #3 \n" "1: \n" @@ -967,7 +967,7 @@ void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, const uint16_t* src_temp = src_ptr + 2; const uint16_t* src_temp1 = src_ptr1 + 2; - asm volatile( + asm volatile ( "movi v31.4h, #3 \n" "movi v30.4s, #3 \n" @@ -1022,7 +1022,7 @@ void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, void ScaleAddRow_NEON(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { - asm volatile( + asm volatile ( "1: \n" "ld1 {v1.8h, v2.8h}, [%1] \n" // load accumulator "ld1 {v0.16b}, [%0], #16 \n" // load 16 bytes @@ -1123,7 +1123,7 @@ void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr, uint8_t* dst, int dst_width) { (void)src_stride; - asm volatile( + asm volatile ( "1: \n" // load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3 "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n" @@ -1145,7 +1145,7 @@ void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int dst_width) { (void)src_stride; - asm volatile( + asm volatile ( "1: \n" // load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3 "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n" @@ -1169,7 +1169,7 @@ void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr, uint8_t* dst, int dst_width) { const uint8_t* src_ptr1 = src_ptr + src_stride; - asm volatile( + asm volatile ( "1: \n" "ld2 {v0.4s, v1.4s}, [%[src]], #32 \n" "ld2 {v20.4s, v21.4s}, [%[src1]], #32 \n" @@ -1200,7 +1200,7 @@ void 
ScaleARGBRowDownEven_NEON(const uint8_t* src_argb, const uint8_t* src_argb3 = src_argb + src_stepx * 12; int64_t i = 0; (void)src_stride; - asm volatile( + asm volatile ( "1: \n" "ldr w10, [%[src], %[i]] \n" "ldr w11, [%[src1], %[i]] \n" @@ -1232,7 +1232,7 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb, int src_stepx, uint8_t* dst_argb, int dst_width) { - asm volatile( + asm volatile ( "add %1, %1, %0 \n" "1: \n" "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 -> 2x1 @@ -1287,7 +1287,7 @@ void ScaleARGBCols_NEON(uint8_t* dst_argb, int64_t x64 = (int64_t)x; // NOLINT int64_t dx64 = (int64_t)dx; // NOLINT int64_t tmp64; - asm volatile( + asm volatile ( "1: \n" // clang-format off LOAD1_DATA32_LANE(v0, 0) @@ -1394,7 +1394,7 @@ void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst, int dst_width) { - asm volatile( + asm volatile ( // change the stride to row 2 pointer "add %1, %0, %1, lsl #1 \n" // ptr + stide * 2 "1: \n" @@ -1426,7 +1426,7 @@ void ScaleRowUp2_16_NEON(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst, int dst_width) { - asm volatile( + asm volatile ( "add %1, %0, %1, lsl #1 \n" // ptr + stide * 2 "movi v0.8h, #9 \n" // constants "movi v1.4s, #3 \n" @@ -1477,7 +1477,7 @@ void ScaleUVRowDown2_NEON(const uint8_t* src_ptr, uint8_t* dst, int dst_width) { (void)src_stride; - asm volatile( + asm volatile ( "1: \n" "ld2 {v0.8h,v1.8h}, [%0], #32 \n" // load 16 UV "subs %w2, %w2, #8 \n" // 8 processed per loop. @@ -1496,7 +1496,7 @@ void ScaleUVRowDown2Linear_NEON(const uint8_t* src_ptr, uint8_t* dst, int dst_width) { (void)src_stride; - asm volatile( + asm volatile ( "1: \n" "ld2 {v0.8h,v1.8h}, [%0], #32 \n" // load 16 UV "subs %w2, %w2, #8 \n" // 8 processed per loop. @@ -1515,7 +1515,7 @@ void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width) { - asm volatile( + asm volatile ( // change the stride to row 2 pointer "add %1, %1, %0 \n" "1: \n" @@ -1550,7 +1550,7 @@ void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr, const uint8_t* src2_ptr = src_ptr + src_stepx * 4; const uint8_t* src3_ptr = src_ptr + src_stepx * 6; (void)src_stride; - asm volatile( + asm volatile ( "1: \n" "ld1 {v0.h}[0], [%0], %6 \n" "ld1 {v1.h}[0], [%1], %6 \n" diff --git a/source/scale_rvv.cc b/source/scale_rvv.cc index 5a6f6e5fc..6ed58de2f 100644 --- a/source/scale_rvv.cc +++ b/source/scale_rvv.cc @@ -100,7 +100,7 @@ void ScaleARGBRowDown2Linear_RVV(const uint8_t* src_argb, const uint32_t* src = (const uint32_t*)(src_argb); // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode // register) is set to round-to-nearest-up mode(0). - asm volatile("csrwi vxrm, 0"); + asm volatile ("csrwi vxrm, 0"); do { vuint8m4_t v_odd, v_even, v_dst; vuint32m4_t v_odd_32, v_even_32; @@ -165,7 +165,7 @@ void ScaleARGBRowDown2Box_RVV(const uint8_t* src_argb, const uint32_t* src1 = (const uint32_t*)(src_argb + src_stride); // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode // register) is set to round-to-nearest-up mode(0). - asm volatile("csrwi vxrm, 0"); + asm volatile ("csrwi vxrm, 0"); do { vuint8m4_t v_row0_odd, v_row0_even, v_row1_odd, v_row1_even, v_dst; vuint16m8_t v_row0_sum, v_row1_sum, v_dst_16; @@ -262,7 +262,7 @@ void ScaleARGBRowDownEvenBox_RVV(const uint8_t* src_argb, const int stride_byte = src_stepx * 4; // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode // register) is set to round-to-nearest-up mode(0). 
- asm volatile("csrwi vxrm, 0"); + asm volatile ("csrwi vxrm, 0"); do { vuint8m4_t v_row0_low, v_row0_high, v_row1_low, v_row1_high, v_dst; vuint16m8_t v_row0_sum, v_row1_sum, v_sum; @@ -340,7 +340,7 @@ void ScaleRowDown2Linear_RVV(const uint8_t* src_ptr, (void)src_stride; // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode // register) is set to round-to-nearest-up mode(0). - asm volatile("csrwi vxrm, 0"); + asm volatile ("csrwi vxrm, 0"); do { vuint8m4_t v_s0, v_s1, v_dst; size_t vl = __riscv_vsetvl_e8m4(w); @@ -395,7 +395,7 @@ void ScaleRowDown2Box_RVV(const uint8_t* src_ptr, size_t w = (size_t)dst_width; // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode // register) is set to round-to-nearest-up mode(0). - asm volatile("csrwi vxrm, 0"); + asm volatile ("csrwi vxrm, 0"); do { size_t vl = __riscv_vsetvl_e8m4(w); vuint8m4_t v_s0, v_s1, v_t0, v_t1; @@ -528,7 +528,7 @@ void ScaleRowDown4Box_RVV(const uint8_t* src_ptr, size_t w = (size_t)dst_width; // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode // register) is set to round-to-nearest-up mode(0). - asm volatile("csrwi vxrm, 0"); + asm volatile ("csrwi vxrm, 0"); do { vuint8m2_t v_s0, v_s1, v_s2, v_s3; vuint8m2_t v_t0, v_t1, v_t2, v_t3; @@ -698,7 +698,7 @@ void ScaleRowDown34_0_Box_RVV(const uint8_t* src_ptr, const uint8_t* t = src_ptr + src_stride; // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode // register) is set to round-to-nearest-up mode(0). - asm volatile("csrwi vxrm, 0"); + asm volatile ("csrwi vxrm, 0"); do { vuint8m2_t v_s0, v_s1, v_s2, v_s3; vuint16m4_t v_t0_u16, v_t1_u16, v_t2_u16, v_t3_u16; @@ -827,7 +827,7 @@ void ScaleRowDown34_1_Box_RVV(const uint8_t* src_ptr, const uint8_t* t = src_ptr + src_stride; // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode // register) is set to round-to-nearest-up mode(0). - asm volatile("csrwi vxrm, 0"); + asm volatile ("csrwi vxrm, 0"); do { vuint8m2_t v_s0, v_s1, v_s2, v_s3; vuint8m2_t v_ave0, v_ave1, v_ave2, v_ave3; @@ -1490,7 +1490,7 @@ void ScaleUVRowDown2Linear_RVV(const uint8_t* src_uv, (void)src_stride; // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode // register) is set to round-to-nearest-up mode(0). - asm volatile("csrwi vxrm, 0"); + asm volatile ("csrwi vxrm, 0"); do { vuint8m4_t v_u0v0, v_u1v1, v_avg; vuint16m4_t v_u0v0_16, v_u1v1_16; @@ -1559,7 +1559,7 @@ void ScaleUVRowDown2Box_RVV(const uint8_t* src_uv, size_t w = (size_t)dst_width; // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode // register) is set to round-to-nearest-up mode(0). - asm volatile("csrwi vxrm, 0"); + asm volatile ("csrwi vxrm, 0"); do { vuint8m2_t v_u0_row0, v_v0_row0, v_u1_row0, v_v1_row0; vuint8m2_t v_u0_row1, v_v0_row1, v_u1_row1, v_v1_row1;