From b0dfa70114d607fba655d5af6f481cc5d0a559b9 Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Mon, 17 Jun 2024 12:51:24 -0700 Subject: [PATCH] RVV remove unused variables - ARM Planar test use regular asm volatile syntax - x86 row functions remove volatile from asm Bug: 347111119, 347112532 Change-Id: I535b3dfa1a7a19824503bd95584a63b047b0e9a1 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5637058 Reviewed-by: Justin Green --- source/compare_gcc.cc | 48 +- source/rotate_gcc.cc | 10 +- source/row_gcc.cc | 1677 ++++++++++++++++---------------------- source/row_rvv.cc | 3 - source/scale_gcc.cc | 350 ++++---- unit_test/planar_test.cc | 11 +- 6 files changed, 890 insertions(+), 1209 deletions(-) diff --git a/source/compare_gcc.cc b/source/compare_gcc.cc index 33cbe25de..970f950f4 100644 --- a/source/compare_gcc.cc +++ b/source/compare_gcc.cc @@ -21,14 +21,15 @@ extern "C" { // This module is for GCC x86 and x64. #if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) +// "memory" clobber prevents the reads from being removed + #if defined(__x86_64__) uint32_t HammingDistance_SSE42(const uint8_t* src_a, const uint8_t* src_b, int count) { - uint64_t diff = 0u; + uint64_t diff; - asm volatile( - "xor %3,%3 \n" + asm("xor %3,%3 \n" "xor %%r8,%%r8 \n" "xor %%r9,%%r9 \n" "xor %%r10,%%r10 \n" @@ -63,9 +64,9 @@ uint32_t HammingDistance_SSE42(const uint8_t* src_a, : "+r"(src_a), // %0 "+r"(src_b), // %1 "+r"(count), // %2 - "=r"(diff) // %3 + "=&r"(diff) // %3 : - : "memory", "cc", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10"); + : "cc", "memory", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10"); return (uint32_t)(diff); } @@ -75,7 +76,7 @@ uint32_t HammingDistance_SSE42(const uint8_t* src_a, int count) { uint32_t diff = 0u; - asm volatile( + asm( // Process 16 bytes per loop. 
LABELALIGN "1: \n" @@ -104,7 +105,7 @@ uint32_t HammingDistance_SSE42(const uint8_t* src_a, "+r"(count), // %2 "+r"(diff) // %3 : - : "memory", "cc", "ecx", "edx"); + : "cc", "memory", "ecx", "edx"); return diff; } @@ -117,10 +118,9 @@ static const vec8 kBitCount = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4}; uint32_t HammingDistance_SSSE3(const uint8_t* src_a, const uint8_t* src_b, int count) { - uint32_t diff = 0u; + uint32_t diff; - asm volatile( - "movdqa %4,%%xmm2 \n" + asm("movdqa %4,%%xmm2 \n" "movdqa %5,%%xmm3 \n" "pxor %%xmm0,%%xmm0 \n" "pxor %%xmm1,%%xmm1 \n" @@ -166,7 +166,7 @@ uint32_t HammingDistance_SSSE3(const uint8_t* src_a, "=r"(diff) // %3 : "m"(kNibbleMask), // %4 "m"(kBitCount) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + : "cc", "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); return diff; @@ -176,10 +176,9 @@ uint32_t HammingDistance_SSSE3(const uint8_t* src_a, uint32_t HammingDistance_AVX2(const uint8_t* src_a, const uint8_t* src_b, int count) { - uint32_t diff = 0u; + uint32_t diff; - asm volatile( - "vbroadcastf128 %4,%%ymm2 \n" + asm("vbroadcastf128 %4,%%ymm2 \n" "vbroadcastf128 %5,%%ymm3 \n" "vpxor %%ymm0,%%ymm0,%%ymm0 \n" "vpxor %%ymm1,%%ymm1,%%ymm1 \n" @@ -214,7 +213,7 @@ uint32_t HammingDistance_AVX2(const uint8_t* src_a, "vpaddd %%ymm1,%%ymm0,%%ymm0 \n" "vpermq $0xaa,%%ymm0,%%ymm1 \n" "vpaddd %%ymm1,%%ymm0,%%ymm0 \n" - "vmovd %%xmm0, %3 \n" + "vmovd %%xmm0,%3 \n" "vzeroupper \n" : "+r"(src_a), // %0 "+r"(src_b), // %1 @@ -222,7 +221,7 @@ uint32_t HammingDistance_AVX2(const uint8_t* src_a, "=r"(diff) // %3 : "m"(kNibbleMask), // %4 "m"(kBitCount) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); + : "cc", "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); return diff; } @@ -232,8 +231,7 @@ uint32_t SumSquareError_SSE2(const uint8_t* src_a, const uint8_t* src_b, int count) { uint32_t sse; - asm volatile( - "pxor %%xmm0,%%xmm0 \n" + asm("pxor %%xmm0,%%xmm0 \n" "pxor %%xmm5,%%xmm5 \n" LABELALIGN @@ -261,13 +259,12 @@ uint32_t SumSquareError_SSE2(const uint8_t* src_a, "pshufd $0x1,%%xmm0,%%xmm1 \n" "paddd %%xmm1,%%xmm0 \n" "movd %%xmm0,%3 \n" - : "+r"(src_a), // %0 "+r"(src_b), // %1 "+r"(count), // %2 - "=g"(sse) // %3 - ::"memory", - "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); + "=r"(sse) // %3 + : + : "cc", "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); return sse; } @@ -299,8 +296,7 @@ static const uvec32 kHashMul3 = { uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) { uint32_t hash; - asm volatile( - "movd %2,%%xmm0 \n" + asm("movd %2,%%xmm0 \n" "pxor %%xmm7,%%xmm7 \n" "movdqa %4,%%xmm6 \n" @@ -341,13 +337,13 @@ uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) { : "+r"(src), // %0 "+r"(count), // %1 "+rm"(seed), // %2 - "=g"(hash) // %3 + "=r"(hash) // %3 : "m"(kHash16x33), // %4 "m"(kHashMul0), // %5 "m"(kHashMul1), // %6 "m"(kHashMul2), // %7 "m"(kHashMul3) // %8 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + : "cc", "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); return hash; } diff --git a/source/rotate_gcc.cc b/source/rotate_gcc.cc index fd5eee05f..034161421 100644 --- a/source/rotate_gcc.cc +++ b/source/rotate_gcc.cc @@ -26,7 +26,7 @@ void TransposeWx8_SSSE3(const uint8_t* src, uint8_t* dst, int dst_stride, int width) { - asm volatile( + asm( // Read in the data from the source pointer. // First round of bit swap. 
LABELALIGN @@ -116,7 +116,7 @@ void TransposeWx8_Fast_SSSE3(const uint8_t* src, uint8_t* dst, int dst_stride, int width) { - asm volatile( + asm( // Read in the data from the source pointer. // First round of bit swap. LABELALIGN @@ -261,7 +261,7 @@ void TransposeUVWx8_SSE2(const uint8_t* src, uint8_t* dst_b, int dst_stride_b, int width) { - asm volatile( + asm( // Read in the data from the source pointer. // First round of bit swap. LABELALIGN @@ -391,7 +391,7 @@ void Transpose4x4_32_SSE2(const uint8_t* src, uint8_t* dst, int dst_stride, int width) { - asm volatile( + asm( // Main loop transpose 4x4. Read a column, write a row. "1: \n" "movdqu (%0),%%xmm0 \n" // a b c d @@ -447,7 +447,7 @@ void Transpose4x4_32_AVX2(const uint8_t* src, uint8_t* dst, int dst_stride, int width) { - asm volatile( + asm( // Main loop transpose 2 blocks of 4x4. Read a column, write a row. "1: \n" "vmovdqu (%0),%%xmm0 \n" // a b c d diff --git a/source/row_gcc.cc b/source/row_gcc.cc index a8ff21344..b601b60b4 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -161,8 +161,7 @@ static const lvec8 kShuffleNV21 = { #ifdef HAS_J400TOARGBROW_SSE2 void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" + asm("pcmpeqb %%xmm5,%%xmm5 \n" "pslld $0x18,%%xmm5 \n" LABELALIGN @@ -192,8 +191,7 @@ void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width) { void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000 + asm("pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000 "pslld $0x18,%%xmm5 \n" "movdqa %3,%%xmm4 \n" @@ -230,8 +228,7 @@ void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24, } void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000 + asm("pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000 "pslld $0x18,%%xmm5 \n" "movdqa %3,%%xmm4 \n" @@ -269,8 +266,7 @@ void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width) { // Same code as RAWToARGB with different shuffler and A in low bits void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" // 0x000000ff + asm("pcmpeqb %%xmm5,%%xmm5 \n" // 0x000000ff "psrld $0x18,%%xmm5 \n" "movdqa %3,%%xmm4 \n" @@ -309,8 +305,7 @@ void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { - asm volatile( - "movdqa %3,%%xmm3 \n" + asm("movdqa %3,%%xmm3 \n" "movdqa %4,%%xmm4 \n" "movdqa %5,%%xmm5 \n" @@ -339,8 +334,7 @@ void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, } void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "mov $0x1080108,%%eax \n" + asm("mov $0x1080108,%%eax \n" "movd %%eax,%%xmm5 \n" "pshufd $0x0,%%xmm5,%%xmm5 \n" "mov $0x20802080,%%eax \n" @@ -387,8 +381,7 @@ void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { } void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "mov $0x1080108,%%eax \n" + asm("mov $0x1080108,%%eax \n" "movd %%eax,%%xmm5 \n" "pshufd $0x0,%%xmm5,%%xmm5 \n" "mov $0x42004200,%%eax \n" @@ -438,8 +431,7 @@ void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { } void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "mov $0xf0f0f0f,%%eax \n" + asm("mov $0xf0f0f0f,%%eax \n" "movd 
%%eax,%%xmm4 \n" "pshufd $0x0,%%xmm4,%%xmm4 \n" "movdqa %%xmm4,%%xmm5 \n" @@ -475,9 +467,9 @@ void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { } void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { - asm volatile("movdqa %3,%%xmm6 \n" + asm("movdqa %3,%%xmm6 \n" - LABELALIGN + LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -504,18 +496,17 @@ void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { "lea 0x30(%1),%1 \n" "sub $0x10,%2 \n" "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "m"(kShuffleMaskARGBToRGB24) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", - "xmm6"); + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(kShuffleMaskARGBToRGB24) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) { - asm volatile("movdqa %3,%%xmm6 \n" + asm("movdqa %3,%%xmm6 \n" - LABELALIGN + LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -542,12 +533,11 @@ void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) { "lea 0x30(%1),%1 \n" "sub $0x10,%2 \n" "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "m"(kShuffleMaskARGBToRAW) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", - "xmm6"); + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(kShuffleMaskARGBToRAW) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } #ifdef HAS_ARGBTORGB24ROW_AVX2 @@ -555,8 +545,7 @@ void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) { static const lvec32 kPermdRGB24_AVX = {0, 1, 2, 4, 5, 6, 3, 7}; void ARGBToRGB24Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "vbroadcastf128 %3,%%ymm6 \n" + asm("vbroadcastf128 %3,%%ymm6 \n" "vmovdqa %4,%%ymm7 \n" LABELALIGN @@ -615,8 +604,7 @@ static const ulvec8 kPermARGBToRGB24_2 = { 50u, 52u, 53u, 54u, 56u, 57u, 58u, 60u, 61u, 62u}; void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "vmovdqa %3,%%ymm5 \n" + asm("vmovdqa %3,%%ymm5 \n" "vmovdqa %4,%%ymm6 \n" "vmovdqa %5,%%ymm7 \n" @@ -649,8 +637,7 @@ void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width) { #ifdef HAS_ARGBTORAWROW_AVX2 void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "vbroadcastf128 %3,%%ymm6 \n" + asm("vbroadcastf128 %3,%%ymm6 \n" "vmovdqa %4,%%ymm7 \n" LABELALIGN @@ -694,8 +681,7 @@ void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { #endif void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "pcmpeqb %%xmm3,%%xmm3 \n" + asm("pcmpeqb %%xmm3,%%xmm3 \n" "psrld $0x1b,%%xmm3 \n" "pcmpeqb %%xmm4,%%xmm4 \n" "psrld $0x1a,%%xmm4 \n" @@ -734,8 +720,7 @@ void ARGBToRGB565DitherRow_SSE2(const uint8_t* src, uint8_t* dst, uint32_t dither4, int width) { - asm volatile( - "movd %3,%%xmm6 \n" + asm("movd %3,%%xmm6 \n" "punpcklbw %%xmm6,%%xmm6 \n" "movdqa %%xmm6,%%xmm7 \n" "punpcklwd %%xmm6,%%xmm6 \n" @@ -782,8 +767,7 @@ void ARGBToRGB565DitherRow_AVX2(const uint8_t* src, uint8_t* dst, uint32_t dither4, int width) { - asm volatile( - "vbroadcastss %3,%%xmm6 \n" + asm("vbroadcastss %3,%%xmm6 \n" "vpunpcklbw %%xmm6,%%xmm6,%%xmm6 \n" "vpermq $0xd8,%%ymm6,%%ymm6 \n" "vpunpcklwd %%ymm6,%%ymm6,%%ymm6 \n" @@ -824,8 +808,7 @@ void 
ARGBToRGB565DitherRow_AVX2(const uint8_t* src, #endif // HAS_ARGBTORGB565DITHERROW_AVX2 void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "pcmpeqb %%xmm4,%%xmm4 \n" + asm("pcmpeqb %%xmm4,%%xmm4 \n" "psrld $0x1b,%%xmm4 \n" "movdqa %%xmm4,%%xmm5 \n" "pslld $0x5,%%xmm5 \n" @@ -865,8 +848,7 @@ void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { } void ARGBToARGB4444Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "pcmpeqb %%xmm4,%%xmm4 \n" + asm("pcmpeqb %%xmm4,%%xmm4 \n" "psllw $0xc,%%xmm4 \n" "movdqa %%xmm4,%%xmm3 \n" "psrlw $0x8,%%xmm3 \n" @@ -928,8 +910,7 @@ static const uint32_t kMaskAG10 = 0xc000ff00; static const uint32_t kMulAG10 = 64 * 65536 + 1028; void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "movdqa %3,%%xmm2 \n" // shuffler for RB + asm("movdqa %3,%%xmm2 \n" // shuffler for RB "movd %4,%%xmm3 \n" // multipler for RB "movd %5,%%xmm4 \n" // mask for R10 B10 "movd %6,%%xmm5 \n" // mask for AG @@ -967,8 +948,7 @@ void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { } void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "movdqa %3,%%xmm2 \n" // shuffler for RB + asm("movdqa %3,%%xmm2 \n" // shuffler for RB "movd %4,%%xmm3 \n" // multipler for RB "movd %5,%%xmm4 \n" // mask for R10 B10 "movd %6,%%xmm5 \n" // mask for AG @@ -1007,8 +987,7 @@ void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { #ifdef HAS_ARGBTOAR30ROW_AVX2 void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB + asm("vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB "vbroadcastss %4,%%ymm3 \n" // multipler for RB "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10 "vbroadcastss %6,%%ymm5 \n" // mask for AG @@ -1044,8 +1023,7 @@ void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { #ifdef HAS_ABGRTOAR30ROW_AVX2 void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB + asm("vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB "vbroadcastss %4,%%ymm3 \n" // multipler for RB "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10 "vbroadcastss %6,%%ymm5 \n" // mask for AG @@ -1090,7 +1068,7 @@ static const uvec8 kShuffleARGBToAB64Hi = {10, 10, 9, 9, 8, 8, 11, 11, void ARGBToAR64Row_SSSE3(const uint8_t* src_argb, uint16_t* dst_ar64, int width) { - asm volatile(LABELALIGN + asm(LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqa %%xmm0,%%xmm1 \n" @@ -1102,18 +1080,17 @@ void ARGBToAR64Row_SSSE3(const uint8_t* src_argb, "lea 0x20(%1),%1 \n" "sub $0x4,%2 \n" "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_ar64), // %1 - "+r"(width) // %2 - ::"memory", - "cc", "xmm0", "xmm1"); + : "+r"(src_argb), // %0 + "+r"(dst_ar64), // %1 + "+r"(width) // %2 + ::"memory", + "cc", "xmm0", "xmm1"); } void ARGBToAB64Row_SSSE3(const uint8_t* src_argb, uint16_t* dst_ab64, int width) { - asm volatile( - "movdqa %3,%%xmm2 \n" + asm("movdqa %3,%%xmm2 \n" "movdqa %4,%%xmm3 \n" LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" @@ -1137,7 +1114,7 @@ void ARGBToAB64Row_SSSE3(const uint8_t* src_argb, void AR64ToARGBRow_SSSE3(const uint16_t* src_ar64, uint8_t* dst_argb, int width) { - asm volatile(LABELALIGN + asm(LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -1149,19 +1126,19 @@ void AR64ToARGBRow_SSSE3(const uint16_t* src_ar64, "lea 0x10(%1),%1 \n" "sub $0x4,%2 \n" "jg 
1b \n" - : "+r"(src_ar64), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - ::"memory", - "cc", "xmm0", "xmm1"); + : "+r"(src_ar64), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + ::"memory", + "cc", "xmm0", "xmm1"); } void AB64ToARGBRow_SSSE3(const uint16_t* src_ab64, uint8_t* dst_argb, int width) { - asm volatile("movdqa %3,%%xmm2 \n" + asm("movdqa %3,%%xmm2 \n" - LABELALIGN + LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -1174,18 +1151,18 @@ void AB64ToARGBRow_SSSE3(const uint16_t* src_ab64, "lea 0x10(%1),%1 \n" "sub $0x4,%2 \n" "jg 1b \n" - : "+r"(src_ab64), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kShuffleARGBToABGR) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2"); + : "+r"(src_ab64), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kShuffleARGBToABGR) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #ifdef HAS_ARGBTOAR64ROW_AVX2 void ARGBToAR64Row_AVX2(const uint8_t* src_argb, uint16_t* dst_ar64, int width) { - asm volatile(LABELALIGN + asm(LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" "vpermq $0xd8,%%ymm0,%%ymm0 \n" @@ -1198,11 +1175,11 @@ void ARGBToAR64Row_AVX2(const uint8_t* src_argb, "sub $0x8,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(dst_ar64), // %1 - "+r"(width) // %2 - ::"memory", - "cc", "xmm0", "xmm1"); + : "+r"(src_argb), // %0 + "+r"(dst_ar64), // %1 + "+r"(width) // %2 + ::"memory", + "cc", "xmm0", "xmm1"); } #endif @@ -1210,8 +1187,7 @@ void ARGBToAR64Row_AVX2(const uint8_t* src_argb, void ARGBToAB64Row_AVX2(const uint8_t* src_argb, uint16_t* dst_ab64, int width) { - asm volatile( - "vbroadcastf128 %3,%%ymm2 \n" + asm("vbroadcastf128 %3,%%ymm2 \n" "vbroadcastf128 %4,%%ymm3 \n" LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" @@ -1238,7 +1214,7 @@ void ARGBToAB64Row_AVX2(const uint8_t* src_argb, void AR64ToARGBRow_AVX2(const uint16_t* src_ar64, uint8_t* dst_argb, int width) { - asm volatile(LABELALIGN + asm(LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" @@ -1252,11 +1228,11 @@ void AR64ToARGBRow_AVX2(const uint16_t* src_ar64, "sub $0x8,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_ar64), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - ::"memory", - "cc", "xmm0", "xmm1"); + : "+r"(src_ar64), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + ::"memory", + "cc", "xmm0", "xmm1"); } #endif @@ -1264,7 +1240,7 @@ void AR64ToARGBRow_AVX2(const uint16_t* src_ar64, void AB64ToARGBRow_AVX2(const uint16_t* src_ab64, uint8_t* dst_argb, int width) { - asm volatile("vbroadcastf128 %3,%%ymm2 \n" LABELALIGN + asm("vbroadcastf128 %3,%%ymm2 \n" LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" @@ -1279,16 +1255,14 @@ void AB64ToARGBRow_AVX2(const uint16_t* src_ab64, "sub $0x8,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_ab64), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kShuffleARGBToABGR) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2"); + : "+r"(src_ab64), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kShuffleARGBToABGR) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif -// clang-format off - // TODO(mraptis): Consider passing R, G, B multipliers as parameter. // round parameter is register containing value to add before shift. 
#define RGBTOY(round) \ @@ -1313,8 +1287,10 @@ void AB64ToARGBRow_AVX2(const uint16_t* src_ab64, "phaddw %%xmm0,%%xmm6 \n" \ "phaddw %%xmm2,%%xmm1 \n" \ "prefetcht0 1280(%0) \n" \ - "paddw %%" #round ",%%xmm6 \n" \ - "paddw %%" #round ",%%xmm1 \n" \ + "paddw %%" #round \ + ",%%xmm6 \n" \ + "paddw %%" #round \ + ",%%xmm1 \n" \ "psrlw $0x8,%%xmm6 \n" \ "psrlw $0x8,%%xmm1 \n" \ "packuswb %%xmm1,%%xmm6 \n" \ @@ -1341,8 +1317,10 @@ void AB64ToARGBRow_AVX2(const uint16_t* src_ab64, "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" /* mutates. */ \ "vphaddw %%ymm3,%%ymm2,%%ymm2 \n" \ "prefetcht0 1280(%0) \n" \ - "vpaddw %%" #round ",%%ymm0,%%ymm0 \n" /* Add .5 for rounding. */ \ - "vpaddw %%" #round ",%%ymm2,%%ymm2 \n" \ + "vpaddw %%" #round \ + ",%%ymm0,%%ymm0 \n" /* Add .5 for rounding. */ \ + "vpaddw %%" #round \ + ",%%ymm2,%%ymm2 \n" \ "vpsrlw $0x8,%%ymm0,%%ymm0 \n" \ "vpsrlw $0x8,%%ymm2,%%ymm2 \n" \ "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" /* mutates. */ \ @@ -1353,13 +1331,10 @@ void AB64ToARGBRow_AVX2(const uint16_t* src_ab64, "jg 1b \n" \ "vzeroupper \n" -// clang-format on - #ifdef HAS_ARGBTOYROW_SSSE3 // Convert 16 ARGB pixels (64 bytes) to 16 Y values. void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { - asm volatile( - "movdqa %3,%%xmm4 \n" + asm("movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" "movdqa %5,%%xmm7 \n" @@ -1379,8 +1354,7 @@ void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { // Convert 16 ARGB pixels (64 bytes) to 16 YJ values. // Same as ARGBToYRow but different coefficients, no add 16. void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { - asm volatile( - "movdqa %3,%%xmm4 \n" + asm("movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" LABELALIGN RGBTOY(xmm5) @@ -1397,8 +1371,7 @@ void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { // Convert 16 ABGR pixels (64 bytes) to 16 YJ values. // Same as ABGRToYRow but different coefficients, no add 16. void ABGRToYJRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) { - asm volatile( - "movdqa %3,%%xmm4 \n" + asm("movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" LABELALIGN RGBTOY(xmm5) @@ -1415,8 +1388,7 @@ void ABGRToYJRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) { // Convert 16 ARGB pixels (64 bytes) to 16 YJ values. // Same as ARGBToYRow but different coefficients, no add 16. void RGBAToYJRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) { - asm volatile( - "movdqa %3,%%xmm4 \n" + asm("movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" LABELALIGN RGBTOY(xmm5) @@ -1439,8 +1411,7 @@ static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7}; // Convert 32 ARGB pixels (128 bytes) to 32 Y values. void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { - asm volatile( - "vbroadcastf128 %3,%%ymm4 \n" + asm("vbroadcastf128 %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" "vbroadcastf128 %5,%%ymm7 \n" "vmovdqu %6,%%ymm6 \n" LABELALIGN RGBTOY_AVX2( @@ -1460,8 +1431,7 @@ void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { #ifdef HAS_ABGRTOYROW_AVX2 // Convert 32 ABGR pixels (128 bytes) to 32 Y values. 
void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) { - asm volatile( - "vbroadcastf128 %3,%%ymm4 \n" + asm("vbroadcastf128 %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" "vbroadcastf128 %5,%%ymm7 \n" "vmovdqu %6,%%ymm6 \n" LABELALIGN RGBTOY_AVX2( @@ -1481,8 +1451,7 @@ void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) { #ifdef HAS_ARGBTOYJROW_AVX2 // Convert 32 ARGB pixels (128 bytes) to 32 Y values. void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { - asm volatile( - "vbroadcastf128 %3,%%ymm4 \n" + asm("vbroadcastf128 %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" "vmovdqu %5,%%ymm6 \n" LABELALIGN RGBTOY_AVX2( ymm5) "vzeroupper \n" @@ -1500,8 +1469,7 @@ void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { #ifdef HAS_ABGRTOYJROW_AVX2 // Convert 32 ABGR pixels (128 bytes) to 32 Y values. void ABGRToYJRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) { - asm volatile( - "vbroadcastf128 %3,%%ymm4 \n" + asm("vbroadcastf128 %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" "vmovdqu %5,%%ymm6 \n" LABELALIGN RGBTOY_AVX2( ymm5) "vzeroupper \n" @@ -1519,8 +1487,7 @@ void ABGRToYJRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) { #ifdef HAS_RGBATOYJROW_AVX2 // Convert 32 ARGB pixels (128 bytes) to 32 Y values. void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width) { - asm volatile( - "vbroadcastf128 %3,%%ymm4 \n" + asm("vbroadcastf128 %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" "vmovdqu %5,%%ymm6 \n" LABELALIGN RGBTOY_AVX2( ymm5) "vzeroupper \n" @@ -1540,8 +1507,7 @@ void ARGBToUVRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile( - "movdqa %5,%%xmm3 \n" + asm("movdqa %5,%%xmm3 \n" "movdqa %6,%%xmm4 \n" "movdqa %7,%%xmm5 \n" "sub %1,%2 \n" @@ -1613,8 +1579,7 @@ void ARGBToUVRow_AVX2(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile( - "vbroadcastf128 %5,%%ymm5 \n" + asm("vbroadcastf128 %5,%%ymm5 \n" "vbroadcastf128 %6,%%ymm6 \n" "vbroadcastf128 %7,%%ymm7 \n" "sub %1,%2 \n" @@ -1676,8 +1641,7 @@ void ABGRToUVRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile( - "vbroadcastf128 %5,%%ymm5 \n" + asm("vbroadcastf128 %5,%%ymm5 \n" "vbroadcastf128 %6,%%ymm6 \n" "vbroadcastf128 %7,%%ymm7 \n" "sub %1,%2 \n" @@ -1739,8 +1703,7 @@ void ARGBToUVJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile( - "vbroadcastf128 %5,%%ymm5 \n" + asm("vbroadcastf128 %5,%%ymm5 \n" "vbroadcastf128 %6,%%ymm6 \n" "vbroadcastf128 %7,%%ymm7 \n" "sub %1,%2 \n" @@ -1804,8 +1767,7 @@ void ABGRToUVJRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile( - "vbroadcastf128 %5,%%ymm5 \n" + asm("vbroadcastf128 %5,%%ymm5 \n" "vbroadcastf128 %6,%%ymm6 \n" "vbroadcastf128 %7,%%ymm7 \n" "sub %1,%2 \n" @@ -1868,8 +1830,7 @@ void ARGBToUVJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile( - "movdqa %5,%%xmm3 \n" + asm("movdqa %5,%%xmm3 \n" "movdqa %6,%%xmm4 \n" "movdqa %7,%%xmm5 \n" "sub %1,%2 \n" @@ -1934,8 +1895,7 @@ void ABGRToUVJRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile( - "movdqa %5,%%xmm3 \n" + asm("movdqa %5,%%xmm3 \n" "movdqa %6,%%xmm4 \n" "movdqa %7,%%xmm5 \n" "sub %1,%2 \n" @@ -1999,8 +1959,7 @@ void ARGBToUV444Row_SSSE3(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile( - "movdqa 
%4,%%xmm3 \n" + asm("movdqa %4,%%xmm3 \n" "movdqa %5,%%xmm4 \n" "movdqa %6,%%xmm5 \n" "sub %1,%2 \n" @@ -2053,8 +2012,7 @@ void ARGBToUV444Row_SSSE3(const uint8_t* src_argb, #endif // HAS_ARGBTOUV444ROW_SSSE3 void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width) { - asm volatile( - "movdqa %3,%%xmm4 \n" + asm("movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" "movdqa %5,%%xmm7 \n" @@ -2074,8 +2032,7 @@ void BGRAToUVRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile( - "movdqa %5,%%xmm3 \n" + asm("movdqa %5,%%xmm3 \n" "movdqa %6,%%xmm4 \n" "movdqa %7,%%xmm5 \n" "sub %1,%2 \n" @@ -2133,8 +2090,7 @@ void BGRAToUVRow_SSSE3(const uint8_t* src_bgra, } void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) { - asm volatile( - "movdqa %3,%%xmm4 \n" + asm("movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" "movdqa %5,%%xmm7 \n" @@ -2150,8 +2106,7 @@ void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) { } void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) { - asm volatile( - "movdqa %3,%%xmm4 \n" + asm("movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" "movdqa %5,%%xmm7 \n" @@ -2171,8 +2126,7 @@ void ABGRToUVRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile( - "movdqa %5,%%xmm3 \n" + asm("movdqa %5,%%xmm3 \n" "movdqa %6,%%xmm4 \n" "movdqa %7,%%xmm5 \n" "sub %1,%2 \n" @@ -2234,8 +2188,7 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile( - "movdqa %5,%%xmm3 \n" + asm("movdqa %5,%%xmm3 \n" "movdqa %6,%%xmm4 \n" "movdqa %7,%%xmm5 \n" "sub %1,%2 \n" @@ -2624,7 +2577,7 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm ( YUVTORGB_SETUP(yuvconstants) "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" @@ -2655,33 +2608,26 @@ void OMITFP I444AlphaToARGBRow_SSSE3(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - // clang-format off - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" + asm(YUVTORGB_SETUP( + yuvconstants) "sub %[u_buf],%[v_buf] \n" - LABELALIGN - "1: \n" - READYUVA444 - YUVTORGB(yuvconstants) - STOREARGB + LABELALIGN "1: \n" READYUVA444 + YUVTORGB(yuvconstants) STOREARGB "subl $0x8,%[width] \n" "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [a_buf]"+r"(a_buf), // %[a_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] + : [y_buf] "+r"(y_buf), // %[y_buf] + [u_buf] "+r"(u_buf), // %[u_buf] + [v_buf] "+r"(v_buf), // %[v_buf] + [a_buf] "+r"(a_buf), // %[a_buf] + [dst_argb] "+r"(dst_argb), // %[dst_argb] #if defined(__i386__) - [width]"+m"(width) // %[width] + [width] "+m"(width) // %[width] #else - [width]"+rm"(width) // %[width] + [width] "+rm"(width) // %[width] #endif - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); - // clang-format on + : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", + "xmm5"); } #endif // HAS_I444ALPHATOARGBROW_SSSE3 @@ -2691,7 +2637,7 @@ void OMITFP I422ToRGB24Row_SSSE3(const uint8_t* y_buf, uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm ( YUVTORGB_SETUP(yuvconstants) "movdqa 
%[kShuffleMaskARGBToRGB24_0],%%xmm5 \n" "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n" @@ -2727,7 +2673,7 @@ void OMITFP I444ToRGB24Row_SSSE3(const uint8_t* y_buf, uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm ( YUVTORGB_SETUP(yuvconstants) "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n" "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n" @@ -2763,7 +2709,7 @@ void OMITFP I422ToARGBRow_SSSE3(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm ( YUVTORGB_SETUP(yuvconstants) "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" @@ -2792,7 +2738,7 @@ void OMITFP I422ToAR30Row_SSSE3(const uint8_t* y_buf, uint8_t* dst_ar30, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm ( YUVTORGB_SETUP(yuvconstants) "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" // AR30 constants @@ -2827,7 +2773,7 @@ void OMITFP I210ToARGBRow_SSSE3(const uint16_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm ( YUVTORGB_SETUP(yuvconstants) "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" @@ -2857,7 +2803,7 @@ void OMITFP I212ToARGBRow_SSSE3(const uint16_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm ( YUVTORGB_SETUP(yuvconstants) "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" @@ -2887,7 +2833,7 @@ void OMITFP I210ToAR30Row_SSSE3(const uint16_t* y_buf, uint8_t* dst_ar30, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm ( YUVTORGB_SETUP(yuvconstants) "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" @@ -2922,7 +2868,7 @@ void OMITFP I212ToAR30Row_SSSE3(const uint16_t* y_buf, uint8_t* dst_ar30, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm ( YUVTORGB_SETUP(yuvconstants) "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" @@ -2957,7 +2903,7 @@ void OMITFP I410ToARGBRow_SSSE3(const uint16_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm ( YUVTORGB_SETUP(yuvconstants) "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" @@ -2988,8 +2934,7 @@ void OMITFP I210AlphaToARGBRow_SSSE3(const uint16_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile( - YUVTORGB_SETUP( + asm(YUVTORGB_SETUP( yuvconstants) "sub %[u_buf],%[v_buf] \n" LABELALIGN "1: \n" READYUVA210 @@ -3021,32 +2966,26 @@ void OMITFP I410AlphaToARGBRow_SSSE3(const uint16_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - // clang-format off - asm volatile( - YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" + asm(YUVTORGB_SETUP( + yuvconstants) "sub %[u_buf],%[v_buf] \n" - LABELALIGN - "1: \n" - READYUVA410 - YUVTORGB(yuvconstants) - STOREARGB + LABELALIGN "1: \n" READYUVA410 + YUVTORGB(yuvconstants) STOREARGB "subl $0x8,%[width] \n" "jg 1b \n" - : [y_buf] "+r"(y_buf), // %[y_buf] - [u_buf] "+r"(u_buf), // %[u_buf] - [v_buf] "+r"(v_buf), // %[v_buf] - [a_buf] "+r"(a_buf), - [dst_argb] "+r"(dst_argb), // %[dst_argb] + : [y_buf] "+r"(y_buf), // %[y_buf] + [u_buf] "+r"(u_buf), // %[u_buf] + [v_buf] "+r"(v_buf), // %[v_buf] + [a_buf] "+r"(a_buf), + [dst_argb] "+r"(dst_argb), // %[dst_argb] #if defined(__i386__) - [width] "+m"(width) // %[width] + [width] "+m"(width) // %[width] #else - [width] "+rm"(width) // %[width] + [width] "+rm"(width) // %[width] #endif - : [yuvconstants] "r"(yuvconstants) // 
%[yuvconstants] - : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", - "xmm5"); - // clang-format on + : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", + "xmm5"); } #endif @@ -3057,7 +2996,7 @@ void OMITFP I410ToAR30Row_SSSE3(const uint16_t* y_buf, uint8_t* dst_ar30, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm ( YUVTORGB_SETUP(yuvconstants) "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" @@ -3093,33 +3032,26 @@ void OMITFP I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - // clang-format off - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" + asm(YUVTORGB_SETUP( + yuvconstants) "sub %[u_buf],%[v_buf] \n" - LABELALIGN - "1: \n" - READYUVA422 - YUVTORGB(yuvconstants) - STOREARGB + LABELALIGN "1: \n" READYUVA422 + YUVTORGB(yuvconstants) STOREARGB "subl $0x8,%[width] \n" "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [a_buf]"+r"(a_buf), // %[a_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] + : [y_buf] "+r"(y_buf), // %[y_buf] + [u_buf] "+r"(u_buf), // %[u_buf] + [v_buf] "+r"(v_buf), // %[v_buf] + [a_buf] "+r"(a_buf), // %[a_buf] + [dst_argb] "+r"(dst_argb), // %[dst_argb] #if defined(__i386__) - [width]"+m"(width) // %[width] + [width] "+m"(width) // %[width] #else - [width]"+rm"(width) // %[width] + [width] "+rm"(width) // %[width] #endif - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); - // clang-format on + : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", + "xmm5"); } #endif // HAS_I422ALPHATOARGBROW_SSSE3 @@ -3128,27 +3060,20 @@ void OMITFP NV12ToARGBRow_SSSE3(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - // clang-format off - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "pcmpeqb %%xmm5,%%xmm5 \n" + asm(YUVTORGB_SETUP( + yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" - LABELALIGN - "1: \n" - READNV12 - YUVTORGB(yuvconstants) - STOREARGB + LABELALIGN "1: \n" READNV12 + YUVTORGB(yuvconstants) STOREARGB "sub $0x8,%[width] \n" "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [uv_buf]"+r"(uv_buf), // %[uv_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); - // clang-format on + : [y_buf] "+r"(y_buf), // %[y_buf] + [uv_buf] "+r"(uv_buf), // %[uv_buf] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+rm"(width) // %[width] + : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", + "xmm5"); } void OMITFP NV21ToARGBRow_SSSE3(const uint8_t* y_buf, @@ -3156,88 +3081,63 @@ void OMITFP NV21ToARGBRow_SSSE3(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - // clang-format off - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "pcmpeqb %%xmm5,%%xmm5 \n" + asm(YUVTORGB_SETUP( + yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" - LABELALIGN - "1: \n" - READNV21 - YUVTORGB(yuvconstants) - STOREARGB + LABELALIGN "1: \n" READNV21 + YUVTORGB(yuvconstants) STOREARGB "sub $0x8,%[width] \n" "jg 1b \n" - : 
[y_buf]"+r"(y_buf), // %[y_buf] - [vu_buf]"+r"(vu_buf), // %[vu_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] - [kShuffleNV21]"m"(kShuffleNV21) - : "memory", "cc", YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); - // clang-format on + : [y_buf] "+r"(y_buf), // %[y_buf] + [vu_buf] "+r"(vu_buf), // %[vu_buf] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+rm"(width) // %[width] + : [yuvconstants] "r"(yuvconstants), // %[yuvconstants] + [kShuffleNV21] "m"(kShuffleNV21) + : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", + "xmm5"); } void OMITFP YUY2ToARGBRow_SSSE3(const uint8_t* yuy2_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - // clang-format off - asm volatile ( - "movdqa %[kShuffleYUY2Y],%%xmm6 \n" - "movdqa %[kShuffleYUY2UV],%%xmm7 \n" - YUVTORGB_SETUP(yuvconstants) - "pcmpeqb %%xmm5,%%xmm5 \n" + asm("movdqa %[kShuffleYUY2Y],%%xmm6 \n" + "movdqa %[kShuffleYUY2UV],%%xmm7 \n" YUVTORGB_SETUP( + yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" - LABELALIGN - "1: \n" - READYUY2 - YUVTORGB(yuvconstants) - STOREARGB + LABELALIGN "1: \n" READYUY2 + YUVTORGB(yuvconstants) STOREARGB "sub $0x8,%[width] \n" "jg 1b \n" - : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] - [kShuffleYUY2Y]"m"(kShuffleYUY2Y), - [kShuffleYUY2UV]"m"(kShuffleYUY2UV) - : "memory", "cc", YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); - // clang-format on + : [yuy2_buf] "+r"(yuy2_buf), // %[yuy2_buf] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+rm"(width) // %[width] + : [yuvconstants] "r"(yuvconstants), // %[yuvconstants] + [kShuffleYUY2Y] "m"(kShuffleYUY2Y), [kShuffleYUY2UV] "m"(kShuffleYUY2UV) + : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", + "xmm5", "xmm6", "xmm7"); } void OMITFP UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - // clang-format off - asm volatile ( - "movdqa %[kShuffleUYVYY],%%xmm6 \n" - "movdqa %[kShuffleUYVYUV],%%xmm7 \n" - YUVTORGB_SETUP(yuvconstants) - "pcmpeqb %%xmm5,%%xmm5 \n" + asm("movdqa %[kShuffleUYVYY],%%xmm6 \n" + "movdqa %[kShuffleUYVYUV],%%xmm7 \n" YUVTORGB_SETUP( + yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" - LABELALIGN - "1: \n" - READUYVY - YUVTORGB(yuvconstants) - STOREARGB + LABELALIGN "1: \n" READUYVY + YUVTORGB(yuvconstants) STOREARGB "sub $0x8,%[width] \n" "jg 1b \n" - : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] - [kShuffleUYVYY]"m"(kShuffleUYVYY), - [kShuffleUYVYUV]"m"(kShuffleUYVYUV) - : "memory", "cc", YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); - // clang-format on + : [uyvy_buf] "+r"(uyvy_buf), // %[uyvy_buf] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+rm"(width) // %[width] + : [yuvconstants] "r"(yuvconstants), // %[yuvconstants] + [kShuffleUYVYY] "m"(kShuffleUYVYY), [kShuffleUYVYUV] "m"(kShuffleUYVYUV) + : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", + "xmm5"); } void OMITFP P210ToARGBRow_SSSE3(const uint16_t* y_buf, @@ -3245,8 +3145,7 @@ void OMITFP P210ToARGBRow_SSSE3(const uint16_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int 
width) { - asm volatile( - YUVTORGB_SETUP( + asm(YUVTORGB_SETUP( yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" LABELALIGN "1: \n" READP210 @@ -3267,8 +3166,7 @@ void OMITFP P410ToARGBRow_SSSE3(const uint16_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile( - YUVTORGB_SETUP( + asm(YUVTORGB_SETUP( yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" LABELALIGN "1: \n" READP410 @@ -3289,7 +3187,7 @@ void OMITFP P210ToAR30Row_SSSE3(const uint16_t* y_buf, uint8_t* dst_ar30, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm ( YUVTORGB_SETUP(yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $14,%%xmm5 \n" @@ -3320,7 +3218,7 @@ void OMITFP P410ToAR30Row_SSSE3(const uint16_t* y_buf, uint8_t* dst_ar30, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm ( YUVTORGB_SETUP(yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $14,%%xmm5 \n" @@ -3352,7 +3250,7 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, uint8_t* dst_rgba, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm ( YUVTORGB_SETUP(yuvconstants) "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" @@ -3763,7 +3661,7 @@ void OMITFP I444ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm ( YUVTORGB_SETUP_AVX2(yuvconstants) "sub %[u_buf],%[v_buf] \n" "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" @@ -3797,7 +3695,7 @@ void OMITFP I422ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm ( YUVTORGB_SETUP_AVX2(yuvconstants) "sub %[u_buf],%[v_buf] \n" "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" @@ -3837,7 +3735,7 @@ void OMITFP I422ToARGBRow_AVX512BW(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm ( YUVTORGB_SETUP_AVX512BW(yuvconstants) "sub %[u_buf],%[v_buf] \n" "vpcmpeqb %%xmm5,%%xmm5,%%xmm5 \n" @@ -3876,7 +3774,7 @@ void OMITFP I422ToAR30Row_AVX2(const uint8_t* y_buf, uint8_t* dst_ar30, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm ( YUVTORGB_SETUP_AVX2(yuvconstants) "sub %[u_buf],%[v_buf] \n" "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants @@ -3916,7 +3814,7 @@ void OMITFP I210ToARGBRow_AVX2(const uint16_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm ( YUVTORGB_SETUP_AVX2(yuvconstants) "sub %[u_buf],%[v_buf] \n" "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" @@ -3951,7 +3849,7 @@ void OMITFP I212ToARGBRow_AVX2(const uint16_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm ( YUVTORGB_SETUP_AVX2(yuvconstants) "sub %[u_buf],%[v_buf] \n" "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" @@ -3986,7 +3884,7 @@ void OMITFP I210ToAR30Row_AVX2(const uint16_t* y_buf, uint8_t* dst_ar30, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm ( YUVTORGB_SETUP_AVX2(yuvconstants) "sub %[u_buf],%[v_buf] \n" "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants @@ -4026,7 +3924,7 @@ void OMITFP I212ToAR30Row_AVX2(const uint16_t* y_buf, uint8_t* dst_ar30, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm ( YUVTORGB_SETUP_AVX2(yuvconstants) "sub %[u_buf],%[v_buf] \n" "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants @@ -4066,7 +3964,7 @@ void OMITFP I410ToARGBRow_AVX2(const uint16_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile 
( + asm ( YUVTORGB_SETUP_AVX2(yuvconstants) "sub %[u_buf],%[v_buf] \n" "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" @@ -4102,8 +4000,7 @@ void OMITFP I210AlphaToARGBRow_AVX2(const uint16_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile( - YUVTORGB_SETUP_AVX2( + asm(YUVTORGB_SETUP_AVX2( yuvconstants) "sub %[u_buf],%[v_buf] \n" LABELALIGN "1: \n" READYUVA210_AVX2 @@ -4138,8 +4035,7 @@ void OMITFP I410AlphaToARGBRow_AVX2(const uint16_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile( - YUVTORGB_SETUP_AVX2( + asm(YUVTORGB_SETUP_AVX2( yuvconstants) "sub %[u_buf],%[v_buf] \n" LABELALIGN "1: \n" READYUVA410_AVX2 @@ -4173,7 +4069,7 @@ void OMITFP I410ToAR30Row_AVX2(const uint16_t* y_buf, uint8_t* dst_ar30, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm ( YUVTORGB_SETUP_AVX2(yuvconstants) "sub %[u_buf],%[v_buf] \n" "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants @@ -4214,34 +4110,27 @@ void OMITFP I444AlphaToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - // clang-format off - asm volatile ( - YUVTORGB_SETUP_AVX2(yuvconstants) - "sub %[u_buf],%[v_buf] \n" + asm(YUVTORGB_SETUP_AVX2( + yuvconstants) "sub %[u_buf],%[v_buf] \n" - LABELALIGN - "1: \n" - READYUVA444_AVX2 - YUVTORGB_AVX2(yuvconstants) - STOREARGB_AVX2 + LABELALIGN "1: \n" READYUVA444_AVX2 + YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 "subl $0x10,%[width] \n" "jg 1b \n" "vzeroupper \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [a_buf]"+r"(a_buf), // %[a_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] + : [y_buf] "+r"(y_buf), // %[y_buf] + [u_buf] "+r"(u_buf), // %[u_buf] + [v_buf] "+r"(v_buf), // %[v_buf] + [a_buf] "+r"(a_buf), // %[a_buf] + [dst_argb] "+r"(dst_argb), // %[dst_argb] #if defined(__i386__) - [width]"+m"(width) // %[width] + [width] "+m"(width) // %[width] #else - [width]"+rm"(width) // %[width] + [width] "+rm"(width) // %[width] #endif - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS_AVX2 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); - // clang-format on + : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5"); } #endif // HAS_I444ALPHATOARGBROW_AVX2 @@ -4255,34 +4144,27 @@ void OMITFP I422AlphaToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - // clang-format off - asm volatile ( - YUVTORGB_SETUP_AVX2(yuvconstants) - "sub %[u_buf],%[v_buf] \n" + asm(YUVTORGB_SETUP_AVX2( + yuvconstants) "sub %[u_buf],%[v_buf] \n" - LABELALIGN - "1: \n" - READYUVA422_AVX2 - YUVTORGB_AVX2(yuvconstants) - STOREARGB_AVX2 + LABELALIGN "1: \n" READYUVA422_AVX2 + YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 "subl $0x10,%[width] \n" "jg 1b \n" "vzeroupper \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [a_buf]"+r"(a_buf), // %[a_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] + : [y_buf] "+r"(y_buf), // %[y_buf] + [u_buf] "+r"(u_buf), // %[u_buf] + [v_buf] "+r"(v_buf), // %[v_buf] + [a_buf] "+r"(a_buf), // %[a_buf] + [dst_argb] "+r"(dst_argb), // %[dst_argb] #if defined(__i386__) - [width]"+m"(width) // %[width] + [width] "+m"(width) // %[width] #else - [width]"+rm"(width) // %[width] + [width] "+rm"(width) // %[width] #endif - : 
[yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS_AVX2 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); - // clang-format on + : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5"); } #endif // HAS_I422ALPHATOARGBROW_AVX2 @@ -4295,7 +4177,7 @@ void OMITFP I422ToRGBARow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm ( YUVTORGB_SETUP_AVX2(yuvconstants) "sub %[u_buf],%[v_buf] \n" "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" @@ -4338,28 +4220,21 @@ void OMITFP NV12ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - // clang-format off - asm volatile ( - YUVTORGB_SETUP_AVX2(yuvconstants) - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + asm(YUVTORGB_SETUP_AVX2( + yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - LABELALIGN - "1: \n" - READNV12_AVX2 - YUVTORGB_AVX2(yuvconstants) - STOREARGB_AVX2 + LABELALIGN "1: \n" READNV12_AVX2 + YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 "sub $0x10,%[width] \n" "jg 1b \n" "vzeroupper \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [uv_buf]"+r"(uv_buf), // %[uv_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS_AVX2 - "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); - // clang-format on + : [y_buf] "+r"(y_buf), // %[y_buf] + [uv_buf] "+r"(uv_buf), // %[uv_buf] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+rm"(width) // %[width] + : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm0", "xmm1", "xmm2", + "xmm3", "xmm4", "xmm5"); } #endif // HAS_NV12TOARGBROW_AVX2 @@ -4371,29 +4246,22 @@ void OMITFP NV21ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - // clang-format off - asm volatile ( - YUVTORGB_SETUP_AVX2(yuvconstants) - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + asm(YUVTORGB_SETUP_AVX2( + yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - LABELALIGN - "1: \n" - READNV21_AVX2 - YUVTORGB_AVX2(yuvconstants) - STOREARGB_AVX2 + LABELALIGN "1: \n" READNV21_AVX2 + YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 "sub $0x10,%[width] \n" "jg 1b \n" "vzeroupper \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [vu_buf]"+r"(vu_buf), // %[vu_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] - [kShuffleNV21]"m"(kShuffleNV21) - : "memory", "cc", YUVTORGB_REGS_AVX2 - "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); - // clang-format on + : [y_buf] "+r"(y_buf), // %[y_buf] + [vu_buf] "+r"(vu_buf), // %[vu_buf] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+rm"(width) // %[width] + : [yuvconstants] "r"(yuvconstants), // %[yuvconstants] + [kShuffleNV21] "m"(kShuffleNV21) + : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm0", "xmm1", "xmm2", + "xmm3", "xmm4", "xmm5"); } #endif // HAS_NV21TOARGBROW_AVX2 @@ -4404,31 +4272,22 @@ void OMITFP YUY2ToARGBRow_AVX2(const uint8_t* yuy2_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - // clang-format off - asm volatile ( - "vbroadcastf128 %[kShuffleYUY2Y],%%ymm6 \n" - "vbroadcastf128 %[kShuffleYUY2UV],%%ymm7 \n" - YUVTORGB_SETUP_AVX2(yuvconstants) - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + asm("vbroadcastf128 %[kShuffleYUY2Y],%%ymm6 \n" + 
"vbroadcastf128 %[kShuffleYUY2UV],%%ymm7 \n" YUVTORGB_SETUP_AVX2( + yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - LABELALIGN - "1: \n" - READYUY2_AVX2 - YUVTORGB_AVX2(yuvconstants) - STOREARGB_AVX2 + LABELALIGN "1: \n" READYUY2_AVX2 + YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 "sub $0x10,%[width] \n" "jg 1b \n" "vzeroupper \n" - : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] - [kShuffleYUY2Y]"m"(kShuffleYUY2Y), - [kShuffleYUY2UV]"m"(kShuffleYUY2UV) - : "memory", "cc", YUVTORGB_REGS_AVX2 - "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); - // clang-format on + : [yuy2_buf] "+r"(yuy2_buf), // %[yuy2_buf] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+rm"(width) // %[width] + : [yuvconstants] "r"(yuvconstants), // %[yuvconstants] + [kShuffleYUY2Y] "m"(kShuffleYUY2Y), [kShuffleYUY2UV] "m"(kShuffleYUY2UV) + : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm0", "xmm1", "xmm2", + "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } #endif // HAS_YUY2TOARGBROW_AVX2 @@ -4439,31 +4298,22 @@ void OMITFP UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - // clang-format off - asm volatile ( - "vbroadcastf128 %[kShuffleUYVYY],%%ymm6 \n" - "vbroadcastf128 %[kShuffleUYVYUV],%%ymm7 \n" - YUVTORGB_SETUP_AVX2(yuvconstants) - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + asm("vbroadcastf128 %[kShuffleUYVYY],%%ymm6 \n" + "vbroadcastf128 %[kShuffleUYVYUV],%%ymm7 \n" YUVTORGB_SETUP_AVX2( + yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - LABELALIGN - "1: \n" - READUYVY_AVX2 - YUVTORGB_AVX2(yuvconstants) - STOREARGB_AVX2 + LABELALIGN "1: \n" READUYVY_AVX2 + YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 "sub $0x10,%[width] \n" "jg 1b \n" "vzeroupper \n" - : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] - [kShuffleUYVYY]"m"(kShuffleUYVYY), - [kShuffleUYVYUV]"m"(kShuffleUYVYUV) - : "memory", "cc", YUVTORGB_REGS_AVX2 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); - // clang-format on + : [uyvy_buf] "+r"(uyvy_buf), // %[uyvy_buf] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+rm"(width) // %[width] + : [yuvconstants] "r"(yuvconstants), // %[yuvconstants] + [kShuffleUYVYY] "m"(kShuffleUYVYY), [kShuffleUYVYUV] "m"(kShuffleUYVYUV) + : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7"); } #endif // HAS_UYVYTOARGBROW_AVX2 @@ -4475,28 +4325,21 @@ void OMITFP P210ToARGBRow_AVX2(const uint16_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - // clang-format off - asm volatile ( - YUVTORGB_SETUP_AVX2(yuvconstants) - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + asm(YUVTORGB_SETUP_AVX2( + yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - LABELALIGN - "1: \n" - READP210_AVX2 - YUVTORGB_AVX2(yuvconstants) - STOREARGB_AVX2 + LABELALIGN "1: \n" READP210_AVX2 + YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 "sub $0x10,%[width] \n" "jg 1b \n" "vzeroupper \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [uv_buf]"+r"(uv_buf), // %[uv_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS_AVX2 - "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); - // clang-format on + : [y_buf] 
"+r"(y_buf), // %[y_buf] + [uv_buf] "+r"(uv_buf), // %[uv_buf] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+rm"(width) // %[width] + : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm0", "xmm1", "xmm2", + "xmm3", "xmm4", "xmm5"); } #endif // HAS_P210TOARGBROW_AVX2 @@ -4508,28 +4351,21 @@ void OMITFP P410ToARGBRow_AVX2(const uint16_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - // clang-format off - asm volatile ( - YUVTORGB_SETUP_AVX2(yuvconstants) - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + asm(YUVTORGB_SETUP_AVX2( + yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - LABELALIGN - "1: \n" - READP410_AVX2 - YUVTORGB_AVX2(yuvconstants) - STOREARGB_AVX2 + LABELALIGN "1: \n" READP410_AVX2 + YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 "sub $0x10,%[width] \n" "jg 1b \n" "vzeroupper \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [uv_buf]"+r"(uv_buf), // %[uv_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS_AVX2 - "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); - // clang-format on + : [y_buf] "+r"(y_buf), // %[y_buf] + [uv_buf] "+r"(uv_buf), // %[uv_buf] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+rm"(width) // %[width] + : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm0", "xmm1", "xmm2", + "xmm3", "xmm4", "xmm5"); } #endif // HAS_P410TOARGBROW_AVX2 @@ -4541,7 +4377,7 @@ void OMITFP P210ToAR30Row_AVX2(const uint16_t* y_buf, uint8_t* dst_ar30, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm ( YUVTORGB_SETUP_AVX2(yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants "vpsrlw $14,%%ymm5,%%ymm5 \n" @@ -4578,7 +4414,7 @@ void OMITFP P410ToAR30Row_AVX2(const uint16_t* y_buf, uint8_t* dst_ar30, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm ( YUVTORGB_SETUP_AVX2(yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants "vpsrlw $14,%%ymm5,%%ymm5 \n" @@ -4612,8 +4448,7 @@ void I400ToARGBRow_SSE2(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile( - "movdqa 96(%3),%%xmm2 \n" // yg = 18997 = 1.164 + asm("movdqa 96(%3),%%xmm2 \n" // yg = 18997 = 1.164 "movdqa 128(%3),%%xmm3 \n" // ygb = 1160 = 1.164 * 16 "pcmpeqb %%xmm4,%%xmm4 \n" // 0xff000000 "pslld $0x18,%%xmm4 \n" @@ -4657,8 +4492,7 @@ void I400ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile( - "vmovdqa 96(%3),%%ymm2 \n" // yg = 18997 = 1.164 + asm("vmovdqa 96(%3),%%ymm2 \n" // yg = 18997 = 1.164 "vmovdqa 128(%3),%%ymm3 \n" // ygb = -1160 = 1.164*16 "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 0xff000000 "vpslld $0x18,%%ymm4,%%ymm4 \n" @@ -4701,9 +4535,9 @@ static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) { intptr_t temp_width = (intptr_t)(width); - asm volatile("movdqa %3,%%xmm5 \n" + asm("movdqa %3,%%xmm5 \n" - LABELALIGN + LABELALIGN "1: \n" "movdqu -0x10(%0,%2,1),%%xmm0 \n" "pshufb %%xmm5,%%xmm0 \n" @@ -4711,20 +4545,20 @@ void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) { "lea 0x10(%1),%1 \n" "sub $0x10,%2 \n" "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(temp_width) // %2 - : "m"(kShuffleMirror) // %3 - : "memory", "cc", "xmm0", 
"xmm5"); + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : "m"(kShuffleMirror) // %3 + : "memory", "cc", "xmm0", "xmm5"); } #endif // HAS_MIRRORROW_SSSE3 #ifdef HAS_MIRRORROW_AVX2 void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { intptr_t temp_width = (intptr_t)(width); - asm volatile("vbroadcastf128 %3,%%ymm5 \n" + asm("vbroadcastf128 %3,%%ymm5 \n" - LABELALIGN + LABELALIGN "1: \n" "vmovdqu -0x20(%0,%2,1),%%ymm0 \n" "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" @@ -4734,11 +4568,11 @@ void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { "sub $0x20,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(temp_width) // %2 - : "m"(kShuffleMirror) // %3 - : "memory", "cc", "xmm0", "xmm5"); + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : "m"(kShuffleMirror) // %3 + : "memory", "cc", "xmm0", "xmm5"); } #endif // HAS_MIRRORROW_AVX2 @@ -4749,9 +4583,9 @@ static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u, void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width) { intptr_t temp_width = (intptr_t)(width); - asm volatile("movdqa %3,%%xmm5 \n" + asm("movdqa %3,%%xmm5 \n" - LABELALIGN + LABELALIGN "1: \n" "movdqu -0x10(%0,%2,2),%%xmm0 \n" "pshufb %%xmm5,%%xmm0 \n" @@ -4759,20 +4593,20 @@ void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width) { "lea 0x10(%1),%1 \n" "sub $0x8,%2 \n" "jg 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_uv), // %1 - "+r"(temp_width) // %2 - : "m"(kShuffleMirrorUV) // %3 - : "memory", "cc", "xmm0", "xmm5"); + : "+r"(src_uv), // %0 + "+r"(dst_uv), // %1 + "+r"(temp_width) // %2 + : "m"(kShuffleMirrorUV) // %3 + : "memory", "cc", "xmm0", "xmm5"); } #endif // HAS_MIRRORUVROW_SSSE3 #ifdef HAS_MIRRORUVROW_AVX2 void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width) { intptr_t temp_width = (intptr_t)(width); - asm volatile("vbroadcastf128 %3,%%ymm5 \n" + asm("vbroadcastf128 %3,%%ymm5 \n" - LABELALIGN + LABELALIGN "1: \n" "vmovdqu -0x20(%0,%2,2),%%ymm0 \n" "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" @@ -4782,11 +4616,11 @@ void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width) { "sub $0x10,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_uv), // %0 - "+r"(dst_uv), // %1 - "+r"(temp_width) // %2 - : "m"(kShuffleMirrorUV) // %3 - : "memory", "cc", "xmm0", "xmm5"); + : "+r"(src_uv), // %0 + "+r"(dst_uv), // %1 + "+r"(temp_width) // %2 + : "m"(kShuffleMirrorUV) // %3 + : "memory", "cc", "xmm0", "xmm5"); } #endif // HAS_MIRRORUVROW_AVX2 @@ -4799,8 +4633,7 @@ void MirrorSplitUVRow_SSSE3(const uint8_t* src, uint8_t* dst_v, int width) { intptr_t temp_width = (intptr_t)(width); - asm volatile( - "movdqa %4,%%xmm1 \n" + asm("movdqa %4,%%xmm1 \n" "lea -0x10(%0,%3,2),%0 \n" "sub %1,%2 \n" @@ -4839,8 +4672,7 @@ void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24, int width) { intptr_t temp_width = (intptr_t)(width); src_rgb24 += width * 3 - 48; - asm volatile( - "movdqa %3,%%xmm4 \n" + asm("movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" LABELALIGN @@ -4874,9 +4706,9 @@ void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24, void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { intptr_t temp_width = (intptr_t)(width); - asm volatile("lea -0x10(%0,%2,4),%0 \n" + asm("lea -0x10(%0,%2,4),%0 \n" - LABELALIGN + LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "pshufd $0x1b,%%xmm0,%%xmm0 \n" @@ -4885,11 +4717,11 @@ void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { "lea 0x10(%1),%1 \n" "sub $0x4,%2 \n" "jg 1b \n" - : 
"+r"(src), // %0 - "+r"(dst), // %1 - "+r"(temp_width) // %2 - : - : "memory", "cc", "xmm0"); + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : + : "memory", "cc", "xmm0"); } #endif // HAS_ARGBMIRRORROW_SSE2 @@ -4898,9 +4730,9 @@ void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { intptr_t temp_width = (intptr_t)(width); - asm volatile("vmovdqu %3,%%ymm5 \n" + asm("vmovdqu %3,%%ymm5 \n" - LABELALIGN + LABELALIGN "1: \n" "vpermd -0x20(%0,%2,4),%%ymm5,%%ymm0 \n" "vmovdqu %%ymm0,(%1) \n" @@ -4908,11 +4740,11 @@ void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { "sub $0x8,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(temp_width) // %2 - : "m"(kARGBShuffleMirror_AVX2) // %3 - : "memory", "cc", "xmm0", "xmm5"); + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : "m"(kARGBShuffleMirror_AVX2) // %3 + : "memory", "cc", "xmm0", "xmm5"); } #endif // HAS_ARGBMIRRORROW_AVX2 @@ -4921,8 +4753,7 @@ void SplitUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + asm("vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpsrlw $0x8,%%ymm5,%%ymm5 \n" "sub %1,%2 \n" @@ -4959,8 +4790,7 @@ void SplitUVRow_SSE2(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" + asm("pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" "sub %1,%2 \n" @@ -4996,8 +4826,7 @@ void DetileRow_SSE2(const uint8_t* src, ptrdiff_t src_tile_stride, uint8_t* dst, int width) { - asm volatile( - "1: \n" + asm("1: \n" "movdqu (%0),%%xmm0 \n" "sub $0x10,%2 \n" "lea (%0,%3),%0 \n" @@ -5017,8 +4846,7 @@ void DetileRow_16_SSE2(const uint16_t* src, ptrdiff_t src_tile_stride, uint16_t* dst, int width) { - asm volatile( - "1: \n" + asm("1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" "lea (%0,%3,2),%0 \n" @@ -5040,8 +4868,7 @@ void DetileRow_16_AVX(const uint16_t* src, ptrdiff_t src_tile_stride, uint16_t* dst, int width) { - asm volatile( - "1: \n" + asm("1: \n" "vmovdqu (%0),%%ymm0 \n" "lea (%0,%3,2),%0 \n" "vmovdqu %%ymm0,(%1) \n" @@ -5065,8 +4892,7 @@ void DetileToYUY2_SSE2(const uint8_t* src_y, ptrdiff_t src_uv_tile_stride, uint8_t* dst_yuy2, int width) { - asm volatile( - "1: \n" + asm("1: \n" "movdqu (%0),%%xmm0 \n" // Load 16 Y "sub $0x10,%3 \n" "lea (%0,%4),%0 \n" @@ -5104,8 +4930,7 @@ void DetileSplitUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile( - "movdqu %4,%%xmm1 \n" + asm("movdqu %4,%%xmm1 \n" "1: \n" "movdqu (%0),%%xmm0 \n" "lea (%0, %5),%0 \n" @@ -5131,9 +4956,9 @@ void MergeUVRow_AVX512BW(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width) { - asm volatile("sub %0,%1 \n" + asm("sub %0,%1 \n" - LABELALIGN + LABELALIGN "1: \n" "vpmovzxbw (%0),%%zmm0 \n" "vpmovzxbw 0x00(%0,%1,1),%%zmm1 \n" @@ -5145,12 +4970,12 @@ void MergeUVRow_AVX512BW(const uint8_t* src_u, "sub $0x20,%3 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_u), // %0 - "+r"(src_v), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_MERGEUVROW_AVX512BW @@ -5159,9 +4984,9 @@ void MergeUVRow_AVX2(const uint8_t* src_u, const uint8_t* 
src_v, uint8_t* dst_uv, int width) { - asm volatile("sub %0,%1 \n" + asm("sub %0,%1 \n" - LABELALIGN + LABELALIGN "1: \n" "vpmovzxbw (%0),%%ymm0 \n" "vpmovzxbw 0x00(%0,%1,1),%%ymm1 \n" @@ -5173,12 +4998,12 @@ void MergeUVRow_AVX2(const uint8_t* src_u, "sub $0x10,%3 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_u), // %0 - "+r"(src_v), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_MERGEUVROW_AVX2 @@ -5187,9 +5012,9 @@ void MergeUVRow_SSE2(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width) { - asm volatile("sub %0,%1 \n" + asm("sub %0,%1 \n" - LABELALIGN + LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x00(%0,%1,1),%%xmm1 \n" @@ -5202,12 +5027,12 @@ void MergeUVRow_SSE2(const uint8_t* src_u, "lea 0x20(%2),%2 \n" "sub $0x10,%3 \n" "jg 1b \n" - : "+r"(src_u), // %0 - "+r"(src_v), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_MERGEUVROW_SSE2 @@ -5217,12 +5042,9 @@ void MergeUVRow_16_AVX2(const uint16_t* src_u, uint16_t* dst_uv, int depth, int width) { - // clang-format off - asm volatile ( - "vmovd %4,%%xmm3 \n" + asm("vmovd %4,%%xmm3 \n" "vmovd %5,%%xmm4 \n" - "sub %0,%1 \n" // 8 pixels per loop. @@ -5239,14 +5061,13 @@ void MergeUVRow_16_AVX2(const uint16_t* src_u, "sub $0x8,%3 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_u), // %0 - "+r"(src_v), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 - : "r"(16 - depth), // %4 - "r"(32 - depth) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); - // clang-format on + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : "r"(16 - depth), // %4 + "r"(32 - depth) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); } #endif // HAS_MERGEUVROW_AVX2 @@ -5259,14 +5080,12 @@ void SplitUVRow_16_AVX2(const uint16_t* src_uv, int depth, int width) { depth = 16 - depth; - // clang-format off - asm volatile ( - "vmovd %4,%%xmm3 \n" + asm("vmovd %4,%%xmm3 \n" "vbroadcastf128 %5,%%ymm4 \n" "sub %1,%2 \n" - // 16 pixels per loop. - LABELALIGN + // 16 pixels per loop. + LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" @@ -5286,14 +5105,13 @@ void SplitUVRow_16_AVX2(const uint16_t* src_uv, "sub $0x10,%3 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_uv), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : "r"(depth), // %4 - "m"(kSplitUVShuffle16) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); - // clang-format on + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"(depth), // %4 + "m"(kSplitUVShuffle16) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); } #endif // HAS_SPLITUVROW_16_AVX2 @@ -5307,15 +5125,13 @@ void MultiplyRow_16_AVX2(const uint16_t* src_y, uint16_t* dst_y, int scale, int width) { - // clang-format off - asm volatile ( - "vmovd %3,%%xmm3 \n" + asm("vmovd %3,%%xmm3 \n" "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n" "vbroadcastss %%xmm3,%%ymm3 \n" "sub %0,%1 \n" - // 32 pixels per loop. - LABELALIGN + // 32 pixels per loop. 
+ LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" @@ -5327,12 +5143,11 @@ void MultiplyRow_16_AVX2(const uint16_t* src_y, "sub $0x20,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_y), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "r"(scale) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm3"); - // clang-format on + : "+r"(src_y), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(scale) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm3"); } #endif // HAS_MULTIPLYROW_16_AVX2 @@ -5346,15 +5161,13 @@ void DivideRow_16_AVX2(const uint16_t* src_y, uint16_t* dst_y, int scale, int width) { - // clang-format off - asm volatile ( - "vmovd %3,%%xmm3 \n" + asm("vmovd %3,%%xmm3 \n" "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n" "vbroadcastss %%xmm3,%%ymm3 \n" "sub %0,%1 \n" - // 32 pixels per loop. - LABELALIGN + // 32 pixels per loop. + LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" @@ -5366,13 +5179,12 @@ void DivideRow_16_AVX2(const uint16_t* src_y, "sub $0x20,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_y), // %0 - "+r"(dst_y), // %1 - "+r"(width), // %2 - "+r"(scale) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm3"); - // clang-format on + : "+r"(src_y), // %0 + "+r"(dst_y), // %1 + "+r"(width), // %2 + "+r"(scale) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm3"); } #endif // HAS_MULTIPLYROW_16_AVX2 @@ -5385,14 +5197,12 @@ void Convert16To8Row_SSSE3(const uint16_t* src_y, uint8_t* dst_y, int scale, int width) { - // clang-format off - asm volatile ( - "movd %3,%%xmm2 \n" + asm("movd %3,%%xmm2 \n" "punpcklwd %%xmm2,%%xmm2 \n" "pshufd $0x0,%%xmm2,%%xmm2 \n" - // 32 pixels per loop. - LABELALIGN + // 32 pixels per loop. + LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -5404,12 +5214,11 @@ void Convert16To8Row_SSSE3(const uint16_t* src_y, "add $0x10,%1 \n" "sub $0x10,%2 \n" "jg 1b \n" - : "+r"(src_y), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "r"(scale) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2"); - // clang-format on + : "+r"(src_y), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(scale) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #ifdef HAS_CONVERT16TO8ROW_AVX2 @@ -5417,14 +5226,12 @@ void Convert16To8Row_AVX2(const uint16_t* src_y, uint8_t* dst_y, int scale, int width) { - // clang-format off - asm volatile ( - "vmovd %3,%%xmm2 \n" + asm("vmovd %3,%%xmm2 \n" "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n" "vbroadcastss %%xmm2,%%ymm2 \n" - // 32 pixels per loop. - LABELALIGN + // 32 pixels per loop. + LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" @@ -5438,12 +5245,11 @@ void Convert16To8Row_AVX2(const uint16_t* src_y, "sub $0x20,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_y), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "r"(scale) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2"); - // clang-format on + : "+r"(src_y), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(scale) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_CONVERT16TO8ROW_AVX2 @@ -5455,14 +5261,12 @@ void Convert8To16Row_SSE2(const uint8_t* src_y, uint16_t* dst_y, int scale, int width) { - // clang-format off - asm volatile ( - "movd %3,%%xmm2 \n" + asm("movd %3,%%xmm2 \n" "punpcklwd %%xmm2,%%xmm2 \n" "pshufd $0x0,%%xmm2,%%xmm2 \n" - // 32 pixels per loop. - LABELALIGN + // 32 pixels per loop. 
+ LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqa %%xmm0,%%xmm1 \n" @@ -5476,12 +5280,11 @@ void Convert8To16Row_SSE2(const uint8_t* src_y, "add $0x20,%1 \n" "sub $0x10,%2 \n" "jg 1b \n" - : "+r"(src_y), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "r"(scale) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2"); - // clang-format on + : "+r"(src_y), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(scale) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #ifdef HAS_CONVERT8TO16ROW_AVX2 @@ -5489,14 +5292,12 @@ void Convert8To16Row_AVX2(const uint8_t* src_y, uint16_t* dst_y, int scale, int width) { - // clang-format off - asm volatile ( - "vmovd %3,%%xmm2 \n" + asm("vmovd %3,%%xmm2 \n" "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n" "vbroadcastss %%xmm2,%%ymm2 \n" - // 32 pixels per loop. - LABELALIGN + // 32 pixels per loop. + LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" "vpermq $0xd8,%%ymm0,%%ymm0 \n" @@ -5511,12 +5312,11 @@ void Convert8To16Row_AVX2(const uint8_t* src_y, "sub $0x20,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_y), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "r"(scale) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2"); - // clang-format on + : "+r"(src_y), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(scale) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_CONVERT8TO16ROW_AVX2 @@ -5547,7 +5347,7 @@ void SplitRGBRow_SSSE3(const uint8_t* src_rgb, uint8_t* dst_g, uint8_t* dst_b, int width) { - asm volatile(LABELALIGN + asm(LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -5584,13 +5384,13 @@ void SplitRGBRow_SSSE3(const uint8_t* src_rgb, "lea 0x30(%0),%0 \n" "sub $0x10,%4 \n" "jg 1b \n" - : "+r"(src_rgb), // %0 - "+r"(dst_r), // %1 - "+r"(dst_g), // %2 - "+r"(dst_b), // %3 - "+r"(width) // %4 - : "r"(&kSplitRGBShuffle[0]) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2"); + : "+r"(src_rgb), // %0 + "+r"(dst_r), // %1 + "+r"(dst_g), // %2 + "+r"(dst_b), // %3 + "+r"(width) // %4 + : "r"(&kSplitRGBShuffle[0]) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_SPLITRGBROW_SSSE3 @@ -5621,7 +5421,7 @@ void MergeRGBRow_SSSE3(const uint8_t* src_r, const uint8_t* src_b, uint8_t* dst_rgb, int width) { - asm volatile(LABELALIGN + asm(LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu (%1),%%xmm1 \n" @@ -5659,13 +5459,13 @@ void MergeRGBRow_SSSE3(const uint8_t* src_r, "lea 0x30(%3),%3 \n" "sub $0x10,%4 \n" "jg 1b \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(dst_rgb), // %3 - "+r"(width) // %4 - : "r"(&kMergeRGBShuffle[0]) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2"); + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_rgb), // %3 + "+r"(width) // %4 + : "r"(&kMergeRGBShuffle[0]) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_MERGERGBROW_SSSE3 @@ -5676,8 +5476,7 @@ void MergeARGBRow_SSE2(const uint8_t* src_r, const uint8_t* src_a, uint8_t* dst_argb, int width) { - asm volatile( - "sub %0,%1 \n" + asm("sub %0,%1 \n" "sub %0,%2 \n" "sub %0,%3 \n" @@ -5717,7 +5516,7 @@ void MergeXRGBRow_SSE2(const uint8_t* src_r, const uint8_t* src_b, uint8_t* dst_argb, int width) { - asm volatile(LABELALIGN + asm(LABELALIGN "1: \n" "movq (%2),%%xmm0 \n" // B @@ -5738,13 +5537,13 @@ void MergeXRGBRow_SSE2(const uint8_t* src_r, "lea 32(%3),%3 \n" "sub $0x8,%4 \n" "jg 1b \n" - : "+r"(src_r), // %0 - "+r"(src_g), // %1 - "+r"(src_b), // %2 - "+r"(dst_argb), // %3 - "+r"(width) // %4 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); + : "+r"(src_r), // %0 + 
"+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_MERGEARGBROW_SSE2 @@ -5755,8 +5554,7 @@ void MergeARGBRow_AVX2(const uint8_t* src_r, const uint8_t* src_a, uint8_t* dst_argb, int width) { - asm volatile( - "sub %0,%1 \n" + asm("sub %0,%1 \n" "sub %0,%2 \n" "sub %0,%3 \n" @@ -5800,7 +5598,7 @@ void MergeXRGBRow_AVX2(const uint8_t* src_r, const uint8_t* src_b, uint8_t* dst_argb, int width) { - asm volatile( + asm( LABELALIGN "1: \n" @@ -5844,8 +5642,7 @@ void SplitARGBRow_SSE2(const uint8_t* src_argb, uint8_t* dst_b, uint8_t* dst_a, int width) { - asm volatile( - "sub %1,%2 \n" + asm("sub %1,%2 \n" "sub %1,%3 \n" "sub %1,%4 \n" @@ -5895,8 +5692,7 @@ void SplitXRGBRow_SSE2(const uint8_t* src_argb, uint8_t* dst_g, uint8_t* dst_b, int width) { - asm volatile( - LABELALIGN + asm(LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" // 00-0F @@ -5945,8 +5741,7 @@ void SplitARGBRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_b, uint8_t* dst_a, int width) { - asm volatile( - "movdqa %6,%%xmm3 \n" + asm("movdqa %6,%%xmm3 \n" "sub %1,%2 \n" "sub %1,%3 \n" "sub %1,%4 \n" @@ -5991,8 +5786,7 @@ void SplitXRGBRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_g, uint8_t* dst_b, int width) { - asm volatile( - "movdqa %5,%%xmm3 \n" + asm("movdqa %5,%%xmm3 \n" LABELALIGN "1: \n" @@ -6032,8 +5826,7 @@ void SplitARGBRow_AVX2(const uint8_t* src_argb, uint8_t* dst_b, uint8_t* dst_a, int width) { - asm volatile( - "sub %1,%2 \n" + asm("sub %1,%2 \n" "sub %1,%3 \n" "sub %1,%4 \n" "vmovdqa %7,%%ymm3 \n" @@ -6083,8 +5876,7 @@ void SplitXRGBRow_AVX2(const uint8_t* src_argb, uint8_t* dst_g, uint8_t* dst_b, int width) { - asm volatile( - "vmovdqa %6,%%ymm3 \n" + asm("vmovdqa %6,%%ymm3 \n" "vbroadcastf128 %5,%%ymm4 \n" LABELALIGN @@ -6130,8 +5922,7 @@ void MergeXR30Row_AVX2(const uint16_t* src_r, int depth, int width) { int shift = depth - 10; - asm volatile( - "sub %0,%1 \n" + asm("sub %0,%1 \n" "sub %0,%2 \n" "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants "vpsrlw $14,%%ymm5,%%ymm5 \n" @@ -6196,8 +5987,7 @@ void MergeAR64Row_AVX2(const uint16_t* src_r, int shift = 16 - depth; int mask = (1 << depth) - 1; mask = (mask << 16) + mask; - asm volatile( - "sub %0,%1 \n" + asm("sub %0,%1 \n" "sub %0,%2 \n" "sub %0,%3 \n" "vmovdqa %8,%%ymm5 \n" @@ -6267,8 +6057,7 @@ void MergeXR64Row_AVX2(const uint16_t* src_r, int shift = 16 - depth; int mask = (1 << depth) - 1; mask = (mask << 16) + mask; - asm volatile( - "sub %0,%1 \n" + asm("sub %0,%1 \n" "sub %0,%2 \n" "vmovdqa %7,%%ymm5 \n" "vmovd %5,%%xmm6 \n" @@ -6330,8 +6119,7 @@ void MergeARGB16To8Row_AVX2(const uint16_t* src_r, int depth, int width) { int shift = depth - 8; - asm volatile( - "sub %0,%1 \n" + asm("sub %0,%1 \n" "sub %0,%2 \n" "sub %0,%3 \n" "vbroadcastf128 %7,%%ymm5 \n" @@ -6386,8 +6174,7 @@ void MergeXRGB16To8Row_AVX2(const uint16_t* src_r, int depth, int width) { int shift = depth - 8; - asm volatile( - "sub %0,%1 \n" + asm("sub %0,%1 \n" "sub %0,%2 \n" "vbroadcastf128 %6,%%ymm5 \n" "vmovd %5,%%xmm6 \n" @@ -6430,8 +6217,7 @@ void MergeXRGB16To8Row_AVX2(const uint16_t* src_r, #ifdef HAS_COPYROW_SSE2 void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "test $0xf,%0 \n" + asm("test $0xf,%0 \n" "jne 2f \n" "test $0xf,%1 \n" "jne 2f \n" @@ -6470,7 +6256,7 @@ void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { #ifdef HAS_COPYROW_AVX void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width) { - asm volatile(LABELALIGN + 
asm(LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" @@ -6481,11 +6267,11 @@ void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width) { "sub $0x40,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "xmm0", "xmm1"); + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1"); } #endif // HAS_COPYROW_AVX @@ -6493,20 +6279,19 @@ void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width) { // Multiple of 1. void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width) { size_t width_tmp = (size_t)(width); - asm volatile("rep movsb \n" - : "+S"(src), // %0 - "+D"(dst), // %1 - "+c"(width_tmp) // %2 - : - : "memory", "cc"); + asm("rep movsb \n" + : "+S"(src), // %0 + "+D"(dst), // %1 + "+c"(width_tmp) // %2 + : + : "memory", "cc"); } #endif // HAS_COPYROW_ERMS #ifdef HAS_ARGBCOPYALPHAROW_SSE2 // width in pixels void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "pcmpeqb %%xmm0,%%xmm0 \n" + asm("pcmpeqb %%xmm0,%%xmm0 \n" "pslld $0x18,%%xmm0 \n" "pcmpeqb %%xmm1,%%xmm1 \n" "psrld $0x8,%%xmm1 \n" @@ -6540,8 +6325,7 @@ void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { #ifdef HAS_ARGBCOPYALPHAROW_AVX2 // width in pixels void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" + asm("vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" "vpsrld $0x8,%%ymm0,%%ymm0 \n" LABELALIGN @@ -6570,7 +6354,7 @@ void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb, uint8_t* dst_a, int width) { - asm volatile(LABELALIGN + asm(LABELALIGN "1: \n" "movdqu (%0), %%xmm0 \n" "movdqu 0x10(%0), %%xmm1 \n" @@ -6583,11 +6367,11 @@ void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb, "lea 0x8(%1), %1 \n" "sub $0x8, %2 \n" "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_a), // %1 - "+rm"(width) // %2 - : - : "memory", "cc", "xmm0", "xmm1"); + : "+r"(src_argb), // %0 + "+r"(dst_a), // %1 + "+rm"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1"); } #endif // HAS_ARGBEXTRACTALPHAROW_SSE2 @@ -6599,8 +6383,7 @@ static const uvec8 kShuffleAlphaShort_AVX2 = { void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb, uint8_t* dst_a, int width) { - asm volatile( - "vmovdqa %3,%%ymm4 \n" + asm("vmovdqa %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" LABELALIGN @@ -6635,8 +6418,7 @@ void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb, #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 // width in pixels void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "pcmpeqb %%xmm0,%%xmm0 \n" + asm("pcmpeqb %%xmm0,%%xmm0 \n" "pslld $0x18,%%xmm0 \n" "pcmpeqb %%xmm1,%%xmm1 \n" "psrld $0x8,%%xmm1 \n" @@ -6672,8 +6454,7 @@ void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 // width in pixels void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" + asm("vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" "vpsrld $0x8,%%ymm0,%%ymm0 \n" LABELALIGN @@ -6703,36 +6484,35 @@ void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { void SetRow_X86(uint8_t* dst, uint8_t v8, int width) { size_t width_tmp = (size_t)(width >> 2); const uint32_t v32 = v8 * 0x01010101u; // Duplicate byte to all bytes. 
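For reference, the "rep stosl" statement below stores the replicated 32-bit pattern width >> 2 times; a rough C equivalent is sketched here (hypothetical helper for illustration only, not part of this change).

#include <stdint.h>
#include <stddef.h>

// Illustration: each "rep stosl" iteration stores the 4-byte pattern and
// advances the destination; the count register holds width >> 2.
static void SetRow32_Sketch(uint8_t* dst, uint32_t v32, size_t count) {
  uint32_t* d = (uint32_t*)dst;
  while (count--) {
    *d++ = v32;
  }
}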
- asm volatile("rep stosl \n" - : "+D"(dst), // %0 - "+c"(width_tmp) // %1 - : "a"(v32) // %2 - : "memory", "cc"); + asm("rep stosl \n" + : "+D"(dst), // %0 + "+c"(width_tmp) // %1 + : "a"(v32) // %2 + : "memory", "cc"); } void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) { size_t width_tmp = (size_t)(width); - asm volatile("rep stosb \n" - : "+D"(dst), // %0 - "+c"(width_tmp) // %1 - : "a"(v8) // %2 - : "memory", "cc"); + asm("rep stosb \n" + : "+D"(dst), // %0 + "+c"(width_tmp) // %1 + : "a"(v8) // %2 + : "memory", "cc"); } void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width) { size_t width_tmp = (size_t)(width); - asm volatile("rep stosl \n" - : "+D"(dst_argb), // %0 - "+c"(width_tmp) // %1 - : "a"(v32) // %2 - : "memory", "cc"); + asm("rep stosl \n" + : "+D"(dst_argb), // %0 + "+c"(width_tmp) // %1 + : "a"(v32) // %2 + : "memory", "cc"); } #endif // HAS_SETROW_X86 #ifdef HAS_YUY2TOYROW_SSE2 void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" + asm("pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" LABELALIGN @@ -6758,7 +6538,7 @@ void YUY2ToNVUVRow_SSE2(const uint8_t* src_yuy2, int stride_yuy2, uint8_t* dst_uv, int width) { - asm volatile(LABELALIGN + asm(LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -6774,11 +6554,11 @@ void YUY2ToNVUVRow_SSE2(const uint8_t* src_yuy2, "lea 0x10(%1),%1 \n" "sub $0x10,%2 \n" "jg 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_uv), // %1 - "+r"(width) // %2 - : "r"((intptr_t)(stride_yuy2)) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); + : "+r"(src_yuy2), // %0 + "+r"(dst_uv), // %1 + "+r"(width) // %2 + : "r"((intptr_t)(stride_yuy2)) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); } void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2, @@ -6786,8 +6566,7 @@ void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" + asm("pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" "sub %1,%2 \n" @@ -6825,8 +6604,7 @@ void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" + asm("pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" "sub %1,%2 \n" @@ -6857,7 +6635,7 @@ void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2, } void UYVYToYRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { - asm volatile(LABELALIGN + asm(LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -6869,11 +6647,11 @@ void UYVYToYRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { "lea 0x10(%1),%1 \n" "sub $0x10,%2 \n" "jg 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "xmm0", "xmm1"); + : "+r"(src_uyvy), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1"); } void UYVYToUVRow_SSE2(const uint8_t* src_uyvy, @@ -6881,8 +6659,7 @@ void UYVYToUVRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" + asm("pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" "sub %1,%2 \n" @@ -6920,8 +6697,7 @@ void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" + asm("pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" "sub %1,%2 \n" @@ -6954,8 +6730,7 @@ void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy, #ifdef HAS_YUY2TOYROW_AVX2 void YUY2ToYRow_AVX2(const uint8_t* 
src_yuy2, uint8_t* dst_y, int width) { - asm volatile( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + asm("vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpsrlw $0x8,%%ymm5,%%ymm5 \n" LABELALIGN @@ -6983,7 +6758,7 @@ void YUY2ToNVUVRow_AVX2(const uint8_t* src_yuy2, int stride_yuy2, uint8_t* dst_uv, int width) { - asm volatile(LABELALIGN + asm(LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" @@ -6999,11 +6774,11 @@ void YUY2ToNVUVRow_AVX2(const uint8_t* src_yuy2, "sub $0x20,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_uv), // %1 - "+r"(width) // %2 - : "r"((intptr_t)(stride_yuy2)) // %3 - : "memory", "cc", "xmm0", "xmm1"); + : "+r"(src_yuy2), // %0 + "+r"(dst_uv), // %1 + "+r"(width) // %2 + : "r"((intptr_t)(stride_yuy2)) // %3 + : "memory", "cc", "xmm0", "xmm1"); } void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2, @@ -7011,8 +6786,7 @@ void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + asm("vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpsrlw $0x8,%%ymm5,%%ymm5 \n" "sub %1,%2 \n" @@ -7051,8 +6825,7 @@ void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + asm("vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpsrlw $0x8,%%ymm5,%%ymm5 \n" "sub %1,%2 \n" @@ -7086,7 +6859,7 @@ void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2, } void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { - asm volatile(LABELALIGN + asm(LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" @@ -7100,19 +6873,18 @@ void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { "sub $0x20,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm5"); + : "+r"(src_uyvy), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm5"); } void UYVYToUVRow_AVX2(const uint8_t* src_uyvy, int stride_uyvy, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + asm("vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpsrlw $0x8,%%ymm5,%%ymm5 \n" "sub %1,%2 \n" @@ -7151,8 +6923,7 @@ void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + asm("vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpsrlw $0x8,%%ymm5,%%ymm5 \n" "sub %1,%2 \n" @@ -7196,8 +6967,7 @@ void ARGBBlendRow_SSSE3(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm volatile( - "pcmpeqb %%xmm7,%%xmm7 \n" + asm("pcmpeqb %%xmm7,%%xmm7 \n" "psrlw $0xf,%%xmm7 \n" "pcmpeqb %%xmm6,%%xmm6 \n" "psrlw $0x8,%%xmm6 \n" @@ -7284,8 +7054,7 @@ void BlendPlaneRow_SSSE3(const uint8_t* src0, const uint8_t* alpha, uint8_t* dst, int width) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" + asm("pcmpeqb %%xmm5,%%xmm5 \n" "psllw $0x8,%%xmm5 \n" "mov $0x80808080,%%eax \n" "movd %%eax,%%xmm6 \n" @@ -7336,8 +7105,7 @@ void BlendPlaneRow_AVX2(const uint8_t* src0, const uint8_t* alpha, uint8_t* dst, int width) { - asm volatile( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + asm("vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpsllw $0x8,%%ymm5,%%ymm5 \n" "mov $0x80808080,%%eax \n" "vmovd %%eax,%%xmm6 \n" @@ -7396,8 +7164,7 @@ static const vec8 kAttenuateShuffle = {6, -128, 6, -128, 6, -128, void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - asm volatile( - "movdqa %3,%%xmm4 \n" 
+ asm("movdqa %3,%%xmm4 \n" "pcmpeqb %%xmm5,%%xmm5 \n" "pslld $0x18,%%xmm5 \n" "pxor %%xmm6,%%xmm6 \n" @@ -7451,8 +7218,7 @@ static const lvec8 kAttenuateShuffle_AVX2 = { void ARGBAttenuateRow_AVX2(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - asm volatile( - "vmovdqa %3,%%ymm4 \n" + asm("vmovdqa %3,%%ymm4 \n" "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpslld $0x18,%%ymm5,%%ymm5 \n" "vpxor %%ymm6,%%ymm6,%%ymm6 \n" @@ -7497,7 +7263,7 @@ void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb, uint8_t* dst_argb, int width) { uintptr_t alpha; - asm volatile( + asm( // 4 pixel loop. LABELALIGN "1: \n" @@ -7545,8 +7311,7 @@ void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb, uint8_t* dst_argb, int width) { uintptr_t alpha; - asm volatile( - "sub %0,%1 \n" + asm("sub %0,%1 \n" "vbroadcastf128 %5,%%ymm5 \n" // 8 pixel loop. @@ -7607,8 +7372,7 @@ void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb, #ifdef HAS_ARGBGRAYROW_SSSE3 // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - asm volatile( - "movdqa %3,%%xmm4 \n" + asm("movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" // 8 pixel loop. @@ -7669,8 +7433,7 @@ static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0, // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) { - asm volatile( - "movdqa %2,%%xmm2 \n" + asm("movdqa %2,%%xmm2 \n" "movdqa %3,%%xmm3 \n" "movdqa %4,%%xmm4 \n" @@ -7730,8 +7493,7 @@ void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, const int8_t* matrix_argb, int width) { - asm volatile( - "movdqu (%3),%%xmm5 \n" + asm("movdqu (%3),%%xmm5 \n" "pshufd $0x00,%%xmm5,%%xmm2 \n" "pshufd $0x55,%%xmm5,%%xmm3 \n" "pshufd $0xaa,%%xmm5,%%xmm4 \n" @@ -7795,8 +7557,7 @@ void ARGBQuantizeRow_SSE2(uint8_t* dst_argb, int interval_size, int interval_offset, int width) { - asm volatile( - "movd %2,%%xmm2 \n" + asm("movd %2,%%xmm2 \n" "movd %3,%%xmm3 \n" "movd %4,%%xmm4 \n" "pshuflw $0x40,%%xmm2,%%xmm2 \n" @@ -7846,8 +7607,7 @@ void ARGBShadeRow_SSE2(const uint8_t* src_argb, uint8_t* dst_argb, int width, uint32_t value) { - asm volatile( - "movd %3,%%xmm2 \n" + asm("movd %3,%%xmm2 \n" "punpcklbw %%xmm2,%%xmm2 \n" "punpcklqdq %%xmm2,%%xmm2 \n" @@ -7882,10 +7642,10 @@ void ARGBMultiplyRow_SSE2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm volatile("pxor %%xmm5,%%xmm5 \n" + asm("pxor %%xmm5,%%xmm5 \n" - // 4 pixel loop. - LABELALIGN + // 4 pixel loop. + LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "lea 0x10(%0),%0 \n" @@ -7904,12 +7664,12 @@ void ARGBMultiplyRow_SSE2(const uint8_t* src_argb, "lea 0x10(%2),%2 \n" "sub $0x4,%3 \n" "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); + : "+r"(src_argb), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #endif // HAS_ARGBMULTIPLYROW_SSE2 @@ -7919,10 +7679,10 @@ void ARGBMultiplyRow_AVX2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm volatile("vpxor %%ymm5,%%ymm5,%%ymm5 \n" + asm("vpxor %%ymm5,%%ymm5,%%ymm5 \n" - // 4 pixel loop. - LABELALIGN + // 4 pixel loop. 
+ LABELALIGN "1: \n" "vmovdqu (%0),%%ymm1 \n" "lea 0x20(%0),%0 \n" @@ -7940,12 +7700,12 @@ void ARGBMultiplyRow_AVX2(const uint8_t* src_argb, "sub $0x8,%3 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); + : "+r"(src_argb), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #endif // HAS_ARGBMULTIPLYROW_AVX2 @@ -7955,7 +7715,7 @@ void ARGBAddRow_SSE2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm volatile( + asm( // 4 pixel loop. LABELALIGN "1: \n" @@ -7983,7 +7743,7 @@ void ARGBAddRow_AVX2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm volatile( + asm( // 4 pixel loop. LABELALIGN "1: \n" @@ -8011,7 +7771,7 @@ void ARGBSubtractRow_SSE2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm volatile( + asm( // 4 pixel loop. LABELALIGN "1: \n" @@ -8039,7 +7799,7 @@ void ARGBSubtractRow_AVX2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm volatile( + asm( // 4 pixel loop. LABELALIGN "1: \n" @@ -8071,8 +7831,7 @@ void SobelXRow_SSE2(const uint8_t* src_y0, const uint8_t* src_y2, uint8_t* dst_sobelx, int width) { - asm volatile( - "sub %0,%1 \n" + asm("sub %0,%1 \n" "sub %0,%2 \n" "sub %0,%3 \n" "pxor %%xmm5,%%xmm5 \n" @@ -8125,8 +7884,7 @@ void SobelYRow_SSE2(const uint8_t* src_y0, const uint8_t* src_y1, uint8_t* dst_sobely, int width) { - asm volatile( - "sub %0,%1 \n" + asm("sub %0,%1 \n" "sub %0,%2 \n" "pxor %%xmm5,%%xmm5 \n" @@ -8178,8 +7936,7 @@ void SobelRow_SSE2(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_argb, int width) { - asm volatile( - "sub %0,%1 \n" + asm("sub %0,%1 \n" "pcmpeqb %%xmm5,%%xmm5 \n" "pslld $0x18,%%xmm5 \n" @@ -8225,8 +7982,7 @@ void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_y, int width) { - asm volatile( - "sub %0,%1 \n" + asm("sub %0,%1 \n" "pcmpeqb %%xmm5,%%xmm5 \n" "pslld $0x18,%%xmm5 \n" @@ -8260,8 +8016,7 @@ void SobelXYRow_SSE2(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_argb, int width) { - asm volatile( - "sub %0,%1 \n" + asm("sub %0,%1 \n" "pcmpeqb %%xmm5,%%xmm5 \n" // 8 pixel loop. 
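A note on the qualifier dropped throughout these hunks: an extended asm statement with no output operands is implicitly volatile, while one with outputs stays in place as long as its results are actually used, so the explicit qualifier is redundant for these row functions. A minimal stand-alone sketch of that behaviour (assumed example, not code from this tree):

#include <stdint.h>

// No volatile needed: the output operand ties the statement to its use,
// so the compiler keeps it while y is consumed by the caller.
static inline uint32_t BSwap32_Sketch(uint32_t x) {
  uint32_t y = x;
  asm("bswap %0" : "+r"(y));
  return y;
}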
@@ -8308,8 +8063,7 @@ void ComputeCumulativeSumRow_SSE2(const uint8_t* row, int32_t* cumsum, const int32_t* previous_cumsum, int width) { - asm volatile( - "pxor %%xmm0,%%xmm0 \n" + asm("pxor %%xmm0,%%xmm0 \n" "pxor %%xmm1,%%xmm1 \n" "sub $0x4,%3 \n" "jl 49f \n" @@ -8388,8 +8142,7 @@ void CumulativeSumToAverageRow_SSE2(const int32_t* topleft, int area, uint8_t* dst, int count) { - asm volatile( - "movd %5,%%xmm5 \n" + asm("movd %5,%%xmm5 \n" "cvtdq2ps %%xmm5,%%xmm5 \n" "rcpss %%xmm5,%%xmm4 \n" "pshufd $0x0,%%xmm4,%%xmm4 \n" @@ -8523,8 +8276,7 @@ void ARGBAffineRow_SSE2(const uint8_t* src_argb, int width) { intptr_t src_argb_stride_temp = src_argb_stride; intptr_t temp; - asm volatile( - "movq (%3),%%xmm2 \n" + asm("movq (%3),%%xmm2 \n" "movq 0x08(%3),%%xmm7 \n" "shl $0x10,%1 \n" "add $0x4,%1 \n" @@ -8608,8 +8360,7 @@ void InterpolateRow_SSSE3(uint8_t* dst_ptr, ptrdiff_t src_stride, int width, int source_y_fraction) { - asm volatile( - "sub %1,%0 \n" + asm("sub %1,%0 \n" "cmp $0x0,%3 \n" "je 100f \n" "cmp $0x80,%3 \n" @@ -8689,8 +8440,7 @@ void InterpolateRow_AVX2(uint8_t* dst_ptr, ptrdiff_t src_stride, int width, int source_y_fraction) { - asm volatile( - "sub %1,%0 \n" + asm("sub %1,%0 \n" "cmp $0x0,%3 \n" "je 100f \n" "cmp $0x80,%3 \n" @@ -8766,9 +8516,9 @@ void ARGBShuffleRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, const uint8_t* shuffler, int width) { - asm volatile("movdqu (%3),%%xmm5 \n" + asm("movdqu (%3),%%xmm5 \n" - LABELALIGN + LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -8780,11 +8530,11 @@ void ARGBShuffleRow_SSSE3(const uint8_t* src_argb, "lea 0x20(%1),%1 \n" "sub $0x8,%2 \n" "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(shuffler) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm5"); + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(shuffler) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm5"); } #endif // HAS_ARGBSHUFFLEROW_SSSE3 @@ -8794,9 +8544,9 @@ void ARGBShuffleRow_AVX2(const uint8_t* src_argb, uint8_t* dst_argb, const uint8_t* shuffler, int width) { - asm volatile("vbroadcastf128 (%3),%%ymm5 \n" + asm("vbroadcastf128 (%3),%%ymm5 \n" - LABELALIGN + LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" @@ -8809,11 +8559,11 @@ void ARGBShuffleRow_AVX2(const uint8_t* src_argb, "sub $0x10,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(shuffler) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm5"); + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(shuffler) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm5"); } #endif // HAS_ARGBSHUFFLEROW_AVX2 @@ -8823,9 +8573,9 @@ void I422ToYUY2Row_SSE2(const uint8_t* src_y, const uint8_t* src_v, uint8_t* dst_yuy2, int width) { - asm volatile("sub %1,%2 \n" + asm("sub %1,%2 \n" - LABELALIGN + LABELALIGN "1: \n" "movq (%1),%%xmm2 \n" "movq 0x00(%1,%2,1),%%xmm1 \n" @@ -8841,13 +8591,13 @@ void I422ToYUY2Row_SSE2(const uint8_t* src_y, "lea 0x20(%3),%3 \n" "sub $0x10,%4 \n" "jg 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_yuy2), // %3 - "+rm"(width) // %4 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_yuy2), // %3 + "+rm"(width) // %4 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_I422TOYUY2ROW_SSE2 @@ -8857,9 +8607,9 @@ void I422ToUYVYRow_SSE2(const uint8_t* src_y, const uint8_t* src_v, uint8_t* 
dst_uyvy, int width) { - asm volatile("sub %1,%2 \n" + asm("sub %1,%2 \n" - LABELALIGN + LABELALIGN "1: \n" "movq (%1),%%xmm2 \n" "movq 0x00(%1,%2,1),%%xmm1 \n" @@ -8875,13 +8625,13 @@ void I422ToUYVYRow_SSE2(const uint8_t* src_y, "lea 0x20(%3),%3 \n" "sub $0x10,%4 \n" "jg 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_uyvy), // %3 - "+rm"(width) // %4 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_uyvy), // %3 + "+rm"(width) // %4 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_I422TOUYVYROW_SSE2 @@ -8891,9 +8641,9 @@ void I422ToYUY2Row_AVX2(const uint8_t* src_y, const uint8_t* src_v, uint8_t* dst_yuy2, int width) { - asm volatile("sub %1,%2 \n" + asm("sub %1,%2 \n" - LABELALIGN + LABELALIGN "1: \n" "vpmovzxbw (%1),%%ymm1 \n" "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n" @@ -8912,13 +8662,13 @@ void I422ToYUY2Row_AVX2(const uint8_t* src_y, "sub $0x20,%4 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_yuy2), // %3 - "+rm"(width) // %4 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_yuy2), // %3 + "+rm"(width) // %4 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_I422TOYUY2ROW_AVX2 @@ -8928,9 +8678,9 @@ void I422ToUYVYRow_AVX2(const uint8_t* src_y, const uint8_t* src_v, uint8_t* dst_uyvy, int width) { - asm volatile("sub %1,%2 \n" + asm("sub %1,%2 \n" - LABELALIGN + LABELALIGN "1: \n" "vpmovzxbw (%1),%%ymm1 \n" "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n" @@ -8949,13 +8699,13 @@ void I422ToUYVYRow_AVX2(const uint8_t* src_y, "sub $0x20,%4 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_uyvy), // %3 - "+rm"(width) // %4 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_uyvy), // %3 + "+rm"(width) // %4 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_I422TOUYVYROW_AVX2 @@ -8964,10 +8714,10 @@ void ARGBPolynomialRow_SSE2(const uint8_t* src_argb, uint8_t* dst_argb, const float* poly, int width) { - asm volatile("pxor %%xmm3,%%xmm3 \n" + asm("pxor %%xmm3,%%xmm3 \n" - // 2 pixel loop. - LABELALIGN + // 2 pixel loop. 
+ LABELALIGN "1: \n" "movq (%0),%%xmm0 \n" "lea 0x8(%0),%0 \n" @@ -9005,12 +8755,11 @@ void ARGBPolynomialRow_SSE2(const uint8_t* src_argb, "lea 0x8(%1),%1 \n" "sub $0x2,%2 \n" "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(poly) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", - "xmm6"); + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(poly) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } #endif // HAS_ARGBPOLYNOMIALROW_SSE2 @@ -9019,8 +8768,7 @@ void ARGBPolynomialRow_AVX2(const uint8_t* src_argb, uint8_t* dst_argb, const float* poly, int width) { - asm volatile( - "vbroadcastf128 (%3),%%ymm4 \n" + asm("vbroadcastf128 (%3),%%ymm4 \n" "vbroadcastf128 0x10(%3),%%ymm5 \n" "vbroadcastf128 0x20(%3),%%ymm6 \n" "vbroadcastf128 0x30(%3),%%ymm7 \n" @@ -9062,8 +8810,7 @@ void HalfFloatRow_SSE2(const uint16_t* src, float scale, int width) { scale *= kScaleBias; - asm volatile( - "movd %3,%%xmm4 \n" + asm("movd %3,%%xmm4 \n" "pshufd $0x0,%%xmm4,%%xmm4 \n" "pxor %%xmm5,%%xmm5 \n" "sub %0,%1 \n" @@ -9100,8 +8847,7 @@ void HalfFloatRow_AVX2(const uint16_t* src, float scale, int width) { scale *= kScaleBias; - asm volatile( - "vbroadcastss %3, %%ymm4 \n" + asm("vbroadcastss %3, %%ymm4 \n" "vpxor %%ymm5,%%ymm5,%%ymm5 \n" "sub %0,%1 \n" @@ -9141,8 +8887,7 @@ void HalfFloatRow_F16C(const uint16_t* src, uint16_t* dst, float scale, int width) { - asm volatile( - "vbroadcastss %3, %%ymm4 \n" + asm("vbroadcastss %3, %%ymm4 \n" "sub %0,%1 \n" // 16 pixel loop. @@ -9176,8 +8921,7 @@ void HalfFloatRow_F16C(const uint16_t* src, #ifdef HAS_HALFFLOATROW_F16C void HalfFloat1Row_F16C(const uint16_t* src, uint16_t* dst, float, int width) { - asm volatile( - "sub %0,%1 \n" + asm("sub %0,%1 \n" // 16 pixel loop. LABELALIGN "1: \n" @@ -9207,7 +8951,7 @@ void ARGBColorTableRow_X86(uint8_t* dst_argb, const uint8_t* table_argb, int width) { uintptr_t pixel_temp; - asm volatile( + asm( // 1 pixel loop. LABELALIGN "1: \n" @@ -9240,7 +8984,7 @@ void RGBColorTableRow_X86(uint8_t* dst_argb, const uint8_t* table_argb, int width) { uintptr_t pixel_temp; - asm volatile( + asm( // 1 pixel loop. LABELALIGN "1: \n" @@ -9273,8 +9017,7 @@ void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb, uint32_t lumacoeff) { uintptr_t pixel_temp; uintptr_t table_temp; - asm volatile( - "movd %6,%%xmm3 \n" + asm("movd %6,%%xmm3 \n" "pshufd $0x0,%%xmm3,%%xmm3 \n" "pcmpeqb %%xmm4,%%xmm4 \n" "psllw $0x8,%%xmm4 \n" @@ -9377,8 +9120,7 @@ void NV21ToYUV24Row_SSSE3(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_yuv24, int width) { - asm volatile( - "sub %0,%1 \n" + asm("sub %0,%1 \n" "movdqa (%4),%%xmm4 \n" // 3 shuffler constants "movdqa 16(%4),%%xmm5 \n" "movdqa 32(%4),%%xmm6 \n" @@ -9415,8 +9157,7 @@ void NV21ToYUV24Row_AVX2(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_yuv24, int width) { - asm volatile( - "sub %0,%1 \n" + asm("sub %0,%1 \n" "vbroadcastf128 (%4),%%ymm4 \n" // 3 shuffler constants "vbroadcastf128 16(%4),%%ymm5 \n" "vbroadcastf128 32(%4),%%ymm6 \n" @@ -9463,8 +9204,7 @@ void NV21ToYUV24Row_AVX512(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_yuv24, int width) { - asm volatile( - "sub %0,%1 \n" + asm("sub %0,%1 \n" "vmovdqa (%4),%%ymm4 \n" // 3 shuffler constants "vmovdqa 32(%4),%%ymm5 \n" "vmovdqa 64(%4),%%ymm6 \n" LABELALIGN @@ -9502,9 +9242,9 @@ static const uvec8 kShuffleUVToVU = {1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u, // Convert UV plane of NV12 to VU of NV21. 
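The shuffle mask above swaps each UV byte pair in place; a scalar sketch of the same transform (illustration only, name assumed):

#include <stdint.h>

// Scalar form of the UV -> VU byte-pair swap that kShuffleUVToVU applies
// 16 bytes at a time via pshufb; width counts UV pairs.
static void SwapUVRow_Sketch(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
  int i;
  for (i = 0; i < width; ++i) {
    dst_vu[2 * i + 0] = src_uv[2 * i + 1];  // V first in the output
    dst_vu[2 * i + 1] = src_uv[2 * i + 0];  // then U
  }
}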
void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width) { - asm volatile("movdqu %3,%%xmm5 \n" + asm("movdqu %3,%%xmm5 \n" - LABELALIGN + LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -9516,19 +9256,19 @@ void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width) { "lea 0x20(%1),%1 \n" "sub $0x10,%2 \n" "jg 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_vu), // %1 - "+r"(width) // %2 - : "m"(kShuffleUVToVU) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm5"); + : "+r"(src_uv), // %0 + "+r"(dst_vu), // %1 + "+r"(width) // %2 + : "m"(kShuffleUVToVU) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm5"); } #endif // HAS_SWAPUVROW_SSSE3 #ifdef HAS_SWAPUVROW_AVX2 void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width) { - asm volatile("vbroadcastf128 %3,%%ymm5 \n" + asm("vbroadcastf128 %3,%%ymm5 \n" - LABELALIGN + LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" @@ -9541,11 +9281,11 @@ void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width) { "sub $0x20,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_uv), // %0 - "+r"(dst_vu), // %1 - "+r"(width) // %2 - : "m"(kShuffleUVToVU) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm5"); + : "+r"(src_uv), // %0 + "+r"(dst_vu), // %1 + "+r"(width) // %2 + : "m"(kShuffleUVToVU) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm5"); } #endif // HAS_SWAPUVROW_AVX2 @@ -9555,8 +9295,7 @@ void HalfMergeUVRow_SSSE3(const uint8_t* src_u, int src_stride_v, uint8_t* dst_uv, int width) { - asm volatile( - "pcmpeqb %%xmm4,%%xmm4 \n" + asm("pcmpeqb %%xmm4,%%xmm4 \n" "psrlw $0xf,%%xmm4 \n" "packuswb %%xmm4,%%xmm4 \n" "pxor %%xmm5,%%xmm5 \n" @@ -9601,8 +9340,7 @@ void HalfMergeUVRow_AVX2(const uint8_t* src_u, int src_stride_v, uint8_t* dst_uv, int width) { - asm volatile( - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + asm("vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" "vpsrlw $0xf,%%ymm4,%%ymm4 \n" "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" "vpxor %%ymm5,%%ymm5,%%ymm5 \n" @@ -9643,8 +9381,7 @@ void HalfMergeUVRow_AVX2(const uint8_t* src_u, } void ClampFloatToZero_SSE2(const float* src_x, float* dst_y, int width) { - asm volatile( - "pxor %%xmm1,%%xmm1 \n" + asm("pxor %%xmm1,%%xmm1 \n" LABELALIGN "1: \n" diff --git a/source/row_rvv.cc b/source/row_rvv.cc index 39a5c0dbc..3b04ec5f7 100644 --- a/source/row_rvv.cc +++ b/source/row_rvv.cc @@ -497,7 +497,6 @@ void ARGBToRAWRow_RVV(const uint8_t* src_argb, uint8_t* dst_raw, int width) { vuint8m2_t v_b = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 0); vuint8m2_t v_g = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 1); vuint8m2_t v_r = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 2); - vuint8m2_t v_a = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 3); vuint8m2x3_t v_dst_bgr = __riscv_vcreate_v_u8m2x3(v_r, v_g, v_b); __riscv_vsseg3e8_v_u8m2x3(dst_raw, v_dst_bgr, vl); w -= vl; @@ -2101,7 +2100,6 @@ void ARGBToYMatrixRow_RVV(const uint8_t* src_argb, vuint8m2_t v_b = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 0); vuint8m2_t v_g = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 1); vuint8m2_t v_r = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 2); - vuint8m2_t v_a = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 3); v_y_u16 = __riscv_vwmulu_vv_u16m4(v_r, v_ry, vl); v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_gy, v_g, vl); v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_by, v_b, vl); @@ -2191,7 +2189,6 @@ void RGBAToYMatrixRow_RVV(const uint8_t* src_rgba, vuint16m4_t v_y_u16; size_t vl = __riscv_vsetvl_e8m2(w); vuint8m2x4_t v_src_rgba = __riscv_vlseg4e8_v_u8m2x4(src_rgba, vl); - vuint8m2_t v_a = 
__riscv_vget_v_u8m2x4_u8m2(v_src_rgba, 0); vuint8m2_t v_b = __riscv_vget_v_u8m2x4_u8m2(v_src_rgba, 1); vuint8m2_t v_g = __riscv_vget_v_u8m2x4_u8m2(v_src_rgba, 2); vuint8m2_t v_r = __riscv_vget_v_u8m2x4_u8m2(v_src_rgba, 3); diff --git a/source/scale_gcc.cc b/source/scale_gcc.cc index 17eeffadf..304770d0c 100644 --- a/source/scale_gcc.cc +++ b/source/scale_gcc.cc @@ -97,7 +97,7 @@ void ScaleRowDown2_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm volatile( + asm( // 16 pixel loop. LABELALIGN "1: \n" @@ -114,8 +114,8 @@ void ScaleRowDown2_SSSE3(const uint8_t* src_ptr, : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 - ::"memory", - "cc", "xmm0", "xmm1"); + : + : "memory", "cc", "xmm0", "xmm1"); } void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr, @@ -123,8 +123,7 @@ void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm volatile( - "pcmpeqb %%xmm4,%%xmm4 \n" + asm("pcmpeqb %%xmm4,%%xmm4 \n" "psrlw $0xf,%%xmm4 \n" "packuswb %%xmm4,%%xmm4 \n" "pxor %%xmm5,%%xmm5 \n" @@ -146,16 +145,15 @@ void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr, : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 - ::"memory", - "cc", "xmm0", "xmm1", "xmm4", "xmm5"); + : + : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"); } void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { - asm volatile( - "pcmpeqb %%xmm4,%%xmm4 \n" + asm("pcmpeqb %%xmm4,%%xmm4 \n" "psrlw $0xf,%%xmm4 \n" "packuswb %%xmm4,%%xmm4 \n" "pxor %%xmm5,%%xmm5 \n" @@ -195,7 +193,7 @@ void ScaleRowDown2_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm volatile(LABELALIGN + asm(LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" @@ -209,11 +207,11 @@ void ScaleRowDown2_AVX2(const uint8_t* src_ptr, "sub $0x20,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - ::"memory", - "cc", "xmm0", "xmm1"); + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1"); } void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr, @@ -221,8 +219,7 @@ void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm volatile( - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + asm("vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" "vpsrlw $0xf,%%ymm4,%%ymm4 \n" "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" "vpxor %%ymm5,%%ymm5,%%ymm5 \n" @@ -246,16 +243,15 @@ void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr, : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 - ::"memory", - "cc", "xmm0", "xmm1", "xmm4", "xmm5"); + : + : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"); } void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { - asm volatile( - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + asm("vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" "vpsrlw $0xf,%%ymm4,%%ymm4 \n" "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" "vpxor %%ymm5,%%ymm5,%%ymm5 \n" @@ -297,8 +293,7 @@ void ScaleRowDown4_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm volatile( - "pcmpeqb %%xmm5,%%xmm5 \n" + asm("pcmpeqb %%xmm5,%%xmm5 \n" "psrld $0x18,%%xmm5 \n" "pslld $0x10,%%xmm5 \n" @@ -319,8 +314,8 @@ void ScaleRowDown4_SSSE3(const uint8_t* src_ptr, : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 - ::"memory", - "cc", "xmm0", "xmm1", "xmm5"); + : + : 
"memory", "cc", "xmm0", "xmm1", "xmm5"); } void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr, @@ -328,8 +323,7 @@ void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { intptr_t stridex3; - asm volatile( - "pcmpeqb %%xmm4,%%xmm4 \n" + asm("pcmpeqb %%xmm4,%%xmm4 \n" "psrlw $0xf,%%xmm4 \n" "movdqa %%xmm4,%%xmm5 \n" "packuswb %%xmm4,%%xmm4 \n" @@ -383,8 +377,7 @@ void ScaleRowDown4_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm volatile( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + asm("vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpsrld $0x18,%%ymm5,%%ymm5 \n" "vpslld $0x10,%%ymm5,%%ymm5 \n" @@ -408,16 +401,15 @@ void ScaleRowDown4_AVX2(const uint8_t* src_ptr, : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 - ::"memory", - "cc", "xmm0", "xmm1", "xmm5"); + : + : "memory", "cc", "xmm0", "xmm1", "xmm5"); } void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { - asm volatile( - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + asm("vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" "vpsrlw $0xf,%%ymm4,%%ymm4 \n" "vpsllw $0x3,%%ymm4,%%ymm5 \n" "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" @@ -472,8 +464,7 @@ void ScaleRowDown34_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm volatile( - "movdqa %0,%%xmm3 \n" + asm("movdqa %0,%%xmm3 \n" "movdqa %1,%%xmm4 \n" "movdqa %2,%%xmm5 \n" : @@ -481,7 +472,7 @@ void ScaleRowDown34_SSSE3(const uint8_t* src_ptr, "m"(kShuf1), // %1 "m"(kShuf2) // %2 ); - asm volatile(LABELALIGN + asm(LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm2 \n" @@ -497,19 +488,18 @@ void ScaleRowDown34_SSSE3(const uint8_t* src_ptr, "lea 0x18(%1),%1 \n" "sub $0x18,%2 \n" "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - ::"memory", - "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { - asm volatile( - "movdqa %0,%%xmm2 \n" // kShuf01 + asm("movdqa %0,%%xmm2 \n" // kShuf01 "movdqa %1,%%xmm3 \n" // kShuf11 "movdqa %2,%%xmm4 \n" // kShuf21 : @@ -517,8 +507,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr, "m"(kShuf11), // %1 "m"(kShuf21) // %2 ); - asm volatile( - "movdqa %0,%%xmm5 \n" // kMadd01 + asm("movdqa %0,%%xmm5 \n" // kMadd01 "movdqa %1,%%xmm0 \n" // kMadd11 "movdqa %2,%%xmm1 \n" // kRound34 : @@ -526,7 +515,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr, "m"(kMadd11), // %1 "m"(kRound34) // %2 ); - asm volatile(LABELALIGN + asm(LABELALIGN "1: \n" "movdqu (%0),%%xmm6 \n" "movdqu 0x00(%0,%3,1),%%xmm7 \n" @@ -559,21 +548,20 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr, "lea 0x18(%1),%1 \n" "sub $0x18,%2 \n" "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)), // %3 - "m"(kMadd21) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", - "xmm6", "xmm7"); + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "m"(kMadd21) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { - asm volatile( - "movdqa %0,%%xmm2 \n" // 
kShuf01 + asm("movdqa %0,%%xmm2 \n" // kShuf01 "movdqa %1,%%xmm3 \n" // kShuf11 "movdqa %2,%%xmm4 \n" // kShuf21 : @@ -581,8 +569,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr, "m"(kShuf11), // %1 "m"(kShuf21) // %2 ); - asm volatile( - "movdqa %0,%%xmm5 \n" // kMadd01 + asm("movdqa %0,%%xmm5 \n" // kMadd01 "movdqa %1,%%xmm0 \n" // kMadd11 "movdqa %2,%%xmm1 \n" // kRound34 : @@ -591,7 +578,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr, "m"(kRound34) // %2 ); - asm volatile(LABELALIGN + asm(LABELALIGN "1: \n" "movdqu (%0),%%xmm6 \n" "movdqu 0x00(%0,%3,1),%%xmm7 \n" @@ -627,13 +614,13 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr, "lea 0x18(%1),%1 \n" "sub $0x18,%2 \n" "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)), // %3 - "m"(kMadd21) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", - "xmm6", "xmm7"); + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "m"(kMadd21) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } void ScaleRowDown38_SSSE3(const uint8_t* src_ptr, @@ -641,8 +628,7 @@ void ScaleRowDown38_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm volatile( - "movdqa %3,%%xmm4 \n" + asm("movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" LABELALIGN @@ -671,8 +657,7 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { - asm volatile( - "movdqa %0,%%xmm2 \n" + asm("movdqa %0,%%xmm2 \n" "movdqa %1,%%xmm3 \n" "movdqa %2,%%xmm4 \n" "movdqa %3,%%xmm5 \n" @@ -682,7 +667,7 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr, "m"(kShufAb2), // %2 "m"(kScaleAb2) // %3 ); - asm volatile(LABELALIGN + asm(LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x00(%0,%3,1),%%xmm1 \n" @@ -703,20 +688,18 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr, "lea 0x6(%1),%1 \n" "sub $0x6,%2 \n" "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", - "xmm6"); + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { - asm volatile( - "movdqa %0,%%xmm2 \n" + asm("movdqa %0,%%xmm2 \n" "movdqa %1,%%xmm3 \n" "movdqa %2,%%xmm4 \n" "pxor %%xmm5,%%xmm5 \n" @@ -725,7 +708,7 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, "m"(kShufAc3), // %1 "m"(kScaleAc33) // %2 ); - asm volatile(LABELALIGN + asm(LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x00(%0,%3,1),%%xmm6 \n" @@ -765,12 +748,12 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, "lea 0x6(%1),%1 \n" "sub $0x6,%2 \n" "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", - "xmm6", "xmm7"); + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } static const uvec8 kLinearShuffleFar = {2, 3, 0, 1, 6, 7, 4, 5, @@ -783,8 +766,7 @@ static const uvec8 
kLinearMadd31 = {3, 1, 1, 3, 3, 1, 1, 3, void ScaleRowUp2_Linear_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { - asm volatile( - "pxor %%xmm0,%%xmm0 \n" // 0 + asm("pxor %%xmm0,%%xmm0 \n" // 0 "pcmpeqw %%xmm6,%%xmm6 \n" "psrlw $15,%%xmm6 \n" "psllw $1,%%xmm6 \n" // all 2 @@ -839,8 +821,7 @@ void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { - asm volatile( - LABELALIGN + asm(LABELALIGN "1: \n" "pxor %%xmm0,%%xmm0 \n" // 0 // above line @@ -953,8 +934,7 @@ void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr, void ScaleRowUp2_Linear_12_SSSE3(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { - asm volatile( - "movdqa %3,%%xmm5 \n" + asm("movdqa %3,%%xmm5 \n" "pcmpeqw %%xmm4,%%xmm4 \n" "psrlw $15,%%xmm4 \n" "psllw $1,%%xmm4 \n" // all 2 @@ -1005,8 +985,7 @@ void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t* src_ptr, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { - asm volatile( - "pcmpeqw %%xmm7,%%xmm7 \n" + asm("pcmpeqw %%xmm7,%%xmm7 \n" "psrlw $15,%%xmm7 \n" "psllw $3,%%xmm7 \n" // all 8 "movdqa %5,%%xmm6 \n" @@ -1103,8 +1082,7 @@ void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t* src_ptr, void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { - asm volatile( - "pxor %%xmm5,%%xmm5 \n" + asm("pxor %%xmm5,%%xmm5 \n" "pcmpeqd %%xmm4,%%xmm4 \n" "psrld $31,%%xmm4 \n" "pslld $1,%%xmm4 \n" // all 2 @@ -1156,8 +1134,7 @@ void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { - asm volatile( - "pxor %%xmm7,%%xmm7 \n" + asm("pxor %%xmm7,%%xmm7 \n" "pcmpeqd %%xmm6,%%xmm6 \n" "psrld $31,%%xmm6 \n" "pslld $3,%%xmm6 \n" // all 8 @@ -1264,8 +1241,7 @@ void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr, void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { - asm volatile( - "pcmpeqw %%xmm4,%%xmm4 \n" + asm("pcmpeqw %%xmm4,%%xmm4 \n" "psrlw $15,%%xmm4 \n" "psllw $1,%%xmm4 \n" // all 2 "movdqa %3,%%xmm3 \n" @@ -1305,8 +1281,7 @@ void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { - asm volatile( - "pcmpeqw %%xmm6,%%xmm6 \n" + asm("pcmpeqw %%xmm6,%%xmm6 \n" "psrlw $15,%%xmm6 \n" "psllw $3,%%xmm6 \n" // all 8 "movdqa %5,%%xmm7 \n" @@ -1390,8 +1365,7 @@ void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr, void ScaleRowUp2_Linear_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { - asm volatile( - "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" + asm("vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" "vpsrlw $15,%%ymm4,%%ymm4 \n" "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2 "vbroadcastf128 %3,%%ymm3 \n" @@ -1434,8 +1408,7 @@ void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { - asm volatile( - "vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n" + asm("vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n" "vpsrlw $15,%%ymm6,%%ymm6 \n" "vpsllw $3,%%ymm6,%%ymm6 \n" // all 8 "vbroadcastf128 %5,%%ymm7 \n" @@ -1516,8 +1489,7 @@ void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr, void ScaleRowUp2_Linear_12_AVX2(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { - asm volatile( - "vbroadcastf128 %3,%%ymm5 \n" + asm("vbroadcastf128 %3,%%ymm5 \n" "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" "vpsrlw $15,%%ymm4,%%ymm4 \n" "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2 @@ -1568,8 +1540,7 @@ void ScaleRowUp2_Bilinear_12_AVX2(const uint16_t* src_ptr, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { - asm volatile( - 
"vbroadcastf128 %5,%%ymm5 \n" + asm("vbroadcastf128 %5,%%ymm5 \n" "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" "vpsrlw $15,%%ymm4,%%ymm4 \n" "vpsllw $3,%%ymm4,%%ymm4 \n" // all 8 @@ -1630,8 +1601,7 @@ void ScaleRowUp2_Bilinear_12_AVX2(const uint16_t* src_ptr, void ScaleRowUp2_Linear_16_AVX2(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { - asm volatile( - "vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n" + asm("vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n" "vpsrld $31,%%ymm4,%%ymm4 \n" "vpslld $1,%%ymm4,%%ymm4 \n" // all 2 @@ -1680,8 +1650,7 @@ void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { - asm volatile( - "vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n" + asm("vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n" "vpsrld $31,%%ymm6,%%ymm6 \n" "vpslld $3,%%ymm6,%%ymm6 \n" // all 8 @@ -1763,10 +1732,10 @@ void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, void ScaleAddRow_SSE2(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { - asm volatile("pxor %%xmm5,%%xmm5 \n" + asm("pxor %%xmm5,%%xmm5 \n" - // 16 pixel loop. - LABELALIGN + // 16 pixel loop. + LABELALIGN "1: \n" "movdqu (%0),%%xmm3 \n" "lea 0x10(%0),%0 \n" // src_ptr += 16 @@ -1782,11 +1751,11 @@ void ScaleAddRow_SSE2(const uint8_t* src_ptr, "lea 0x20(%1),%1 \n" "sub $0x10,%2 \n" "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(src_width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(src_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #ifdef HAS_SCALEADDROW_AVX2 @@ -1794,9 +1763,9 @@ void ScaleAddRow_SSE2(const uint8_t* src_ptr, void ScaleAddRow_AVX2(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { - asm volatile("vpxor %%ymm5,%%ymm5,%%ymm5 \n" + asm("vpxor %%ymm5,%%ymm5,%%ymm5 \n" - LABELALIGN + LABELALIGN "1: \n" "vmovdqu (%0),%%ymm3 \n" "lea 0x20(%0),%0 \n" // src_ptr += 32 @@ -1811,11 +1780,11 @@ void ScaleAddRow_AVX2(const uint8_t* src_ptr, "sub $0x20,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(src_width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(src_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #endif // HAS_SCALEADDROW_AVX2 @@ -1835,8 +1804,7 @@ void ScaleFilterCols_SSSE3(uint8_t* dst_ptr, int x, int dx) { intptr_t x0, x1, temp_pixel; - asm volatile( - "movd %6,%%xmm2 \n" + asm("movd %6,%%xmm2 \n" "movd %7,%%xmm3 \n" "movl $0x04040000,%k2 \n" "movd %k2,%%xmm5 \n" @@ -1932,7 +1900,7 @@ void ScaleColsUp2_SSE2(uint8_t* dst_ptr, int dx) { (void)x; (void)dx; - asm volatile(LABELALIGN + asm(LABELALIGN "1: \n" "movdqu (%1),%%xmm0 \n" "lea 0x10(%1),%1 \n" @@ -1945,11 +1913,11 @@ void ScaleColsUp2_SSE2(uint8_t* dst_ptr, "sub $0x20,%2 \n" "jg 1b \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(dst_width) // %2 - ::"memory", - "cc", "xmm0", "xmm1"); + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1"); } void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb, @@ -1957,7 +1925,7 @@ void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb, uint8_t* dst_argb, int dst_width) { (void)src_stride; - asm volatile(LABELALIGN + asm(LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -1967,11 +1935,11 @@ void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb, "lea 0x10(%1),%1 \n" "sub $0x4,%2 \n" "jg 1b \n" - : "+r"(src_argb), // %0 - 
"+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - ::"memory", - "cc", "xmm0", "xmm1"); + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1"); } void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb, @@ -1979,7 +1947,7 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb, uint8_t* dst_argb, int dst_width) { (void)src_stride; - asm volatile(LABELALIGN + asm(LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -1992,18 +1960,18 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb, "lea 0x10(%1),%1 \n" "sub $0x4,%2 \n" "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - ::"memory", - "cc", "xmm0", "xmm1"); + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1"); } void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb, ptrdiff_t src_stride, uint8_t* dst_argb, int dst_width) { - asm volatile(LABELALIGN + asm(LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -2020,11 +1988,11 @@ void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb, "lea 0x10(%1),%1 \n" "sub $0x4,%2 \n" "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); } // Reads 4 pixels at a time. @@ -2037,8 +2005,7 @@ void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb, intptr_t src_stepx_x4 = (intptr_t)(src_stepx); intptr_t src_stepx_x12; (void)src_stride; - asm volatile( - "lea 0x00(,%1,4),%1 \n" + asm("lea 0x00(,%1,4),%1 \n" "lea 0x00(%1,%1,2),%4 \n" LABELALIGN @@ -2060,8 +2027,8 @@ void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb, "+r"(dst_argb), // %2 "+r"(dst_width), // %3 "=&r"(src_stepx_x12) // %4 - ::"memory", - "cc", "xmm0", "xmm1", "xmm2", "xmm3"); + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); } // Blends four 2x2 to 4x1. 
@@ -2074,8 +2041,7 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb, intptr_t src_stepx_x4 = (intptr_t)(src_stepx); intptr_t src_stepx_x12; intptr_t row1 = (intptr_t)(src_stride); - asm volatile( - "lea 0x00(,%1,4),%1 \n" + asm("lea 0x00(,%1,4),%1 \n" "lea 0x00(%1,%1,2),%4 \n" "lea 0x00(%0,%5,1),%5 \n" @@ -2107,8 +2073,8 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb, "+rm"(dst_width), // %3 "=&r"(src_stepx_x12), // %4 "+r"(row1) // %5 - ::"memory", - "cc", "xmm0", "xmm1", "xmm2", "xmm3"); + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); } void ScaleARGBCols_SSE2(uint8_t* dst_argb, @@ -2117,8 +2083,7 @@ void ScaleARGBCols_SSE2(uint8_t* dst_argb, int x, int dx) { intptr_t x0, x1; - asm volatile( - "movd %5,%%xmm2 \n" + asm("movd %5,%%xmm2 \n" "movd %6,%%xmm3 \n" "pshufd $0x0,%%xmm2,%%xmm2 \n" "pshufd $0x11,%%xmm3,%%xmm0 \n" @@ -2188,7 +2153,7 @@ void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb, int dx) { (void)x; (void)dx; - asm volatile(LABELALIGN + asm(LABELALIGN "1: \n" "movdqu (%1),%%xmm0 \n" "lea 0x10(%1),%1 \n" @@ -2201,11 +2166,11 @@ void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb, "sub $0x8,%2 \n" "jg 1b \n" - : "+r"(dst_argb), // %0 - "+r"(src_argb), // %1 - "+r"(dst_width) // %2 - ::"memory", - "cc", "xmm0", "xmm1"); + : "+r"(dst_argb), // %0 + "+r"(src_argb), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1"); } // Shuffle table for arranging 2 pixels into pairs for pmaddubsw @@ -2226,16 +2191,14 @@ void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb, int x, int dx) { intptr_t x0, x1; - asm volatile( - "movdqa %0,%%xmm4 \n" + asm("movdqa %0,%%xmm4 \n" "movdqa %1,%%xmm5 \n" : : "m"(kShuffleColARGB), // %0 "m"(kShuffleFractions) // %1 ); - asm volatile( - "movd %5,%%xmm2 \n" + asm("movd %5,%%xmm2 \n" "movd %6,%%xmm3 \n" "pcmpeqb %%xmm6,%%xmm6 \n" "psrlw $0x9,%%xmm6 \n" @@ -2283,8 +2246,7 @@ void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb, "packuswb %%xmm0,%%xmm0 \n" "movd %%xmm0,(%0) \n" - LABELALIGN - "99: \n" // clang-format error. + LABELALIGN "99: \n" : "+r"(dst_argb), // %0 "+r"(src_argb), // %1 @@ -2298,8 +2260,7 @@ void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb, // Divide num by div and return as 16.16 fixed point result. int FixedDiv_X86(int num, int div) { - asm volatile( - "cdq \n" + asm("cdq \n" "shld $0x10,%%eax,%%edx \n" "shl $0x10,%%eax \n" "idiv %1 \n" @@ -2312,8 +2273,7 @@ int FixedDiv_X86(int num, int div) { // Divide num - 1 by div - 1 and return as 16.16 fixed point result. 
int FixedDiv1_X86(int num, int div) { - asm volatile( - "cdq \n" + asm("cdq \n" "shld $0x10,%%eax,%%edx \n" "shl $0x10,%%eax \n" "sub $0x10001,%%eax \n" @@ -2344,8 +2304,7 @@ void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { - asm volatile( - "pcmpeqb %%xmm4,%%xmm4 \n" // 01010101 + asm("pcmpeqb %%xmm4,%%xmm4 \n" // 01010101 "psrlw $0xf,%%xmm4 \n" "packuswb %%xmm4,%%xmm4 \n" "pxor %%xmm5, %%xmm5 \n" // zero @@ -2384,8 +2343,7 @@ void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { - asm volatile( - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 01010101 + asm("vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 01010101 "vpsrlw $0xf,%%ymm4,%%ymm4 \n" "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" "vpxor %%ymm5,%%ymm5,%%ymm5 \n" // zero @@ -2428,8 +2386,7 @@ static const uvec8 kUVLinearMadd31 = {3, 1, 3, 1, 1, 3, 1, 3, void ScaleUVRowUp2_Linear_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { - asm volatile( - "pcmpeqw %%xmm4,%%xmm4 \n" + asm("pcmpeqw %%xmm4,%%xmm4 \n" "psrlw $15,%%xmm4 \n" "psllw $1,%%xmm4 \n" // all 2 "movdqa %3,%%xmm3 \n" @@ -2469,8 +2426,7 @@ void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { - asm volatile( - "pcmpeqw %%xmm6,%%xmm6 \n" + asm("pcmpeqw %%xmm6,%%xmm6 \n" "psrlw $15,%%xmm6 \n" "psllw $3,%%xmm6 \n" // all 8 "movdqa %5,%%xmm7 \n" @@ -2553,8 +2509,7 @@ void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr, void ScaleUVRowUp2_Linear_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { - asm volatile( - "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" + asm("vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" "vpsrlw $15,%%ymm4,%%ymm4 \n" "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2 "vbroadcastf128 %3,%%ymm3 \n" @@ -2596,8 +2551,7 @@ void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { - asm volatile( - "vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n" + asm("vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n" "vpsrlw $15,%%ymm6,%%ymm6 \n" "vpsllw $3,%%ymm6,%%ymm6 \n" // all 8 "vbroadcastf128 %5,%%ymm7 \n" @@ -2676,8 +2630,7 @@ void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr, void ScaleUVRowUp2_Linear_16_SSE41(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { - asm volatile( - "pxor %%xmm5,%%xmm5 \n" + asm("pxor %%xmm5,%%xmm5 \n" "pcmpeqd %%xmm4,%%xmm4 \n" "psrld $31,%%xmm4 \n" "pslld $1,%%xmm4 \n" // all 2 @@ -2728,8 +2681,7 @@ void ScaleUVRowUp2_Bilinear_16_SSE41(const uint16_t* src_ptr, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { - asm volatile( - "pxor %%xmm7,%%xmm7 \n" + asm("pxor %%xmm7,%%xmm7 \n" "pcmpeqd %%xmm6,%%xmm6 \n" "psrld $31,%%xmm6 \n" "pslld $3,%%xmm6 \n" // all 8 @@ -2819,8 +2771,7 @@ void ScaleUVRowUp2_Bilinear_16_SSE41(const uint16_t* src_ptr, void ScaleUVRowUp2_Linear_16_AVX2(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { - asm volatile( - "vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n" + asm("vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n" "vpsrld $31,%%ymm4,%%ymm4 \n" "vpslld $1,%%ymm4,%%ymm4 \n" // all 2 @@ -2868,8 +2819,7 @@ void ScaleUVRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { - asm volatile( - "vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n" + asm("vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n" "vpsrld $31,%%ymm6,%%ymm6 \n" "vpslld $3,%%ymm6,%%ymm6 \n" // all 8 diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc index ec1d72eb8..9973318f5 100644 --- a/unit_test/planar_test.cc +++ 
b/unit_test/planar_test.cc
@@ -2551,11 +2551,12 @@ int TestHalfFloatPlane(int benchmark_width,
 #if defined(__arm__)
 static void EnableFlushDenormalToZero(void) {
   uint32_t cw;
-  __asm__ __volatile__(
-      "vmrs %0, fpscr \n"
-      "orr %0, %0, #0x1000000 \n"
-      "vmsr fpscr, %0 \n"
-      : "=r"(cw)::"memory");
+  asm volatile (
+      "vmrs %0, fpscr \n"
+      "orr %0, %0, #0x1000000 \n"
+      "vmsr fpscr, %0 \n"
+      : "=r"(cw)
+      ::"memory", "cc");  // Clobber List
 }
 #endif
 
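For context, the x86 changes in this patch rely on two GCC inline-asm rules: a plain asm statement whose only effects are its output operands may be deleted, hoisted, or reordered by the compiler when those outputs are unused, whereas asm volatile must always be emitted; and an asm that reads or writes memory not named as an operand still needs a "memory" clobber so surrounding loads and stores are not cached across it. A minimal sketch under those assumptions; the helper names below are hypothetical and are not libyuv code:

// Hypothetical illustration only, not part of the patch.
#if defined(__x86_64__) || defined(__i386__)
#include <stdint.h>

// Plain asm with an output operand: if 'sum' is never used, GCC may drop the
// statement entirely, and it may be moved or combined with identical copies.
// That is the behavior the row/scale functions above opt into by dropping
// 'volatile'.
static inline uint32_t add_u32(uint32_t a, uint32_t b) {
  uint32_t sum = a;
  asm("add %1,%0" : "+r"(sum) : "r"(b) : "cc");
  return sum;
}

// This asm reads memory through a pointer passed only as a register operand,
// so a "memory" clobber is still required to keep the compiler from caching
// or reordering those reads around the statement.
static inline uint32_t load_first_byte(const uint8_t* p) {
  uint32_t v;
  asm("movzbl (%1),%0" : "=r"(v) : "r"(p) : "memory");
  return v;
}
#endif

By contrast, an asm statement with no output operands, or one marked volatile such as the fpscr write in the ARM test fixup above, is treated as having side effects and is never removed.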