RVV: remove unused variables

- ARM Planar test: use regular asm volatile syntax
- x86 row functions: remove volatile from asm (see the note below)

Bug: 347111119, 347112532
Change-Id: I535b3dfa1a7a19824503bd95584a63b047b0e9a1
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5637058
Reviewed-by: Justin Green <greenjustin@google.com>
Author: Frank Barchard
Date: 2024-06-17 12:51:24 -07:00
Parent: 7758c961c5
Commit: b0dfa70114
6 changed files with 890 additions and 1209 deletions
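
Note on the x86 change (a minimal sketch, not libyuv code): GCC only has to keep an
asm statement if it is marked volatile or has no output operands; when the outputs
are actually consumed, plain asm() is sufficient and gives the optimizer more freedom.
The helper below is a hypothetical example and assumes an x86-64 target whose
assembler accepts POPCNT:

    #include <stdint.h>

    // Plain asm() is kept because its output %0 feeds the return value.
    // "cc" is clobbered since popcnt writes the flags.
    static inline uint32_t popcount64(uint64_t x) {
      uint64_t r;
      asm("popcnt %1,%0" : "=r"(r) : "r"(x) : "cc");
      return (uint32_t)r;
    }

The ARM test change goes the other way: writing fpscr is a side effect the compiler
cannot see through the operands, so that statement keeps asm volatile and its
"memory" clobber.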

@ -21,14 +21,15 @@ extern "C" {
// This module is for GCC x86 and x64.
#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
// "memory" clobber prevents the reads from being removed
#if defined(__x86_64__)
uint32_t HammingDistance_SSE42(const uint8_t* src_a,
const uint8_t* src_b,
int count) {
uint64_t diff = 0u;
uint64_t diff;
asm volatile(
"xor %3,%3 \n"
asm("xor %3,%3 \n"
"xor %%r8,%%r8 \n"
"xor %%r9,%%r9 \n"
"xor %%r10,%%r10 \n"
@ -63,9 +64,9 @@ uint32_t HammingDistance_SSE42(const uint8_t* src_a,
: "+r"(src_a), // %0
"+r"(src_b), // %1
"+r"(count), // %2
"=r"(diff) // %3
"=&r"(diff) // %3
:
: "memory", "cc", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10");
: "cc", "memory", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10");
return (uint32_t)(diff);
}
@ -75,7 +76,7 @@ uint32_t HammingDistance_SSE42(const uint8_t* src_a,
int count) {
uint32_t diff = 0u;
asm volatile(
asm(
// Process 16 bytes per loop.
LABELALIGN
"1: \n"
@ -104,7 +105,7 @@ uint32_t HammingDistance_SSE42(const uint8_t* src_a,
"+r"(count), // %2
"+r"(diff) // %3
:
: "memory", "cc", "ecx", "edx");
: "cc", "memory", "ecx", "edx");
return diff;
}
@ -117,10 +118,9 @@ static const vec8 kBitCount = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
uint32_t HammingDistance_SSSE3(const uint8_t* src_a,
const uint8_t* src_b,
int count) {
uint32_t diff = 0u;
uint32_t diff;
asm volatile(
"movdqa %4,%%xmm2 \n"
asm("movdqa %4,%%xmm2 \n"
"movdqa %5,%%xmm3 \n"
"pxor %%xmm0,%%xmm0 \n"
"pxor %%xmm1,%%xmm1 \n"
@ -166,7 +166,7 @@ uint32_t HammingDistance_SSSE3(const uint8_t* src_a,
"=r"(diff) // %3
: "m"(kNibbleMask), // %4
"m"(kBitCount) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
: "cc", "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
return diff;
@ -176,10 +176,9 @@ uint32_t HammingDistance_SSSE3(const uint8_t* src_a,
uint32_t HammingDistance_AVX2(const uint8_t* src_a,
const uint8_t* src_b,
int count) {
uint32_t diff = 0u;
uint32_t diff;
asm volatile(
"vbroadcastf128 %4,%%ymm2 \n"
asm("vbroadcastf128 %4,%%ymm2 \n"
"vbroadcastf128 %5,%%ymm3 \n"
"vpxor %%ymm0,%%ymm0,%%ymm0 \n"
"vpxor %%ymm1,%%ymm1,%%ymm1 \n"
@ -214,7 +213,7 @@ uint32_t HammingDistance_AVX2(const uint8_t* src_a,
"vpaddd %%ymm1,%%ymm0,%%ymm0 \n"
"vpermq $0xaa,%%ymm0,%%ymm1 \n"
"vpaddd %%ymm1,%%ymm0,%%ymm0 \n"
"vmovd %%xmm0, %3 \n"
"vmovd %%xmm0,%3 \n"
"vzeroupper \n"
: "+r"(src_a), // %0
"+r"(src_b), // %1
@ -222,7 +221,7 @@ uint32_t HammingDistance_AVX2(const uint8_t* src_a,
"=r"(diff) // %3
: "m"(kNibbleMask), // %4
"m"(kBitCount) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
: "cc", "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
return diff;
}
@ -232,8 +231,7 @@ uint32_t SumSquareError_SSE2(const uint8_t* src_a,
const uint8_t* src_b,
int count) {
uint32_t sse;
asm volatile(
"pxor %%xmm0,%%xmm0 \n"
asm("pxor %%xmm0,%%xmm0 \n"
"pxor %%xmm5,%%xmm5 \n"
LABELALIGN
@ -261,13 +259,12 @@ uint32_t SumSquareError_SSE2(const uint8_t* src_a,
"pshufd $0x1,%%xmm0,%%xmm1 \n"
"paddd %%xmm1,%%xmm0 \n"
"movd %%xmm0,%3 \n"
: "+r"(src_a), // %0
"+r"(src_b), // %1
"+r"(count), // %2
"=g"(sse) // %3
::"memory",
"cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
"=r"(sse) // %3
:
: "cc", "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
return sse;
}
@ -299,8 +296,7 @@ static const uvec32 kHashMul3 = {
uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) {
uint32_t hash;
asm volatile(
"movd %2,%%xmm0 \n"
asm("movd %2,%%xmm0 \n"
"pxor %%xmm7,%%xmm7 \n"
"movdqa %4,%%xmm6 \n"
@ -341,13 +337,13 @@ uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) {
: "+r"(src), // %0
"+r"(count), // %1
"+rm"(seed), // %2
"=g"(hash) // %3
"=r"(hash) // %3
: "m"(kHash16x33), // %4
"m"(kHashMul0), // %5
"m"(kHashMul1), // %6
"m"(kHashMul2), // %7
"m"(kHashMul3) // %8
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
: "cc", "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
return hash;
}

@ -26,7 +26,7 @@ void TransposeWx8_SSSE3(const uint8_t* src,
uint8_t* dst,
int dst_stride,
int width) {
asm volatile(
asm(
// Read in the data from the source pointer.
// First round of bit swap.
LABELALIGN
@ -116,7 +116,7 @@ void TransposeWx8_Fast_SSSE3(const uint8_t* src,
uint8_t* dst,
int dst_stride,
int width) {
asm volatile(
asm(
// Read in the data from the source pointer.
// First round of bit swap.
LABELALIGN
@ -261,7 +261,7 @@ void TransposeUVWx8_SSE2(const uint8_t* src,
uint8_t* dst_b,
int dst_stride_b,
int width) {
asm volatile(
asm(
// Read in the data from the source pointer.
// First round of bit swap.
LABELALIGN
@ -391,7 +391,7 @@ void Transpose4x4_32_SSE2(const uint8_t* src,
uint8_t* dst,
int dst_stride,
int width) {
asm volatile(
asm(
// Main loop transpose 4x4. Read a column, write a row.
"1: \n"
"movdqu (%0),%%xmm0 \n" // a b c d
@ -447,7 +447,7 @@ void Transpose4x4_32_AVX2(const uint8_t* src,
uint8_t* dst,
int dst_stride,
int width) {
asm volatile(
asm(
// Main loop transpose 2 blocks of 4x4. Read a column, write a row.
"1: \n"
"vmovdqu (%0),%%xmm0 \n" // a b c d

File diff suppressed because it is too large.

@ -497,7 +497,6 @@ void ARGBToRAWRow_RVV(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
vuint8m2_t v_b = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 0);
vuint8m2_t v_g = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 1);
vuint8m2_t v_r = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 2);
vuint8m2_t v_a = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 3);
vuint8m2x3_t v_dst_bgr = __riscv_vcreate_v_u8m2x3(v_r, v_g, v_b);
__riscv_vsseg3e8_v_u8m2x3(dst_raw, v_dst_bgr, vl);
w -= vl;
@ -2101,7 +2100,6 @@ void ARGBToYMatrixRow_RVV(const uint8_t* src_argb,
vuint8m2_t v_b = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 0);
vuint8m2_t v_g = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 1);
vuint8m2_t v_r = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 2);
vuint8m2_t v_a = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 3);
v_y_u16 = __riscv_vwmulu_vv_u16m4(v_r, v_ry, vl);
v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_gy, v_g, vl);
v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_by, v_b, vl);
@ -2191,7 +2189,6 @@ void RGBAToYMatrixRow_RVV(const uint8_t* src_rgba,
vuint16m4_t v_y_u16;
size_t vl = __riscv_vsetvl_e8m2(w);
vuint8m2x4_t v_src_rgba = __riscv_vlseg4e8_v_u8m2x4(src_rgba, vl);
vuint8m2_t v_a = __riscv_vget_v_u8m2x4_u8m2(v_src_rgba, 0);
vuint8m2_t v_b = __riscv_vget_v_u8m2x4_u8m2(v_src_rgba, 1);
vuint8m2_t v_g = __riscv_vget_v_u8m2x4_u8m2(v_src_rgba, 2);
vuint8m2_t v_r = __riscv_vget_v_u8m2x4_u8m2(v_src_rgba, 3);

@ -97,7 +97,7 @@ void ScaleRowDown2_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
asm(
// 16 pixel loop.
LABELALIGN
"1: \n"
@ -114,8 +114,8 @@ void ScaleRowDown2_SSSE3(const uint8_t* src_ptr,
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
::"memory",
"cc", "xmm0", "xmm1");
:
: "memory", "cc", "xmm0", "xmm1");
}
void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr,
@ -123,8 +123,7 @@ void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
"pcmpeqb %%xmm4,%%xmm4 \n"
asm("pcmpeqb %%xmm4,%%xmm4 \n"
"psrlw $0xf,%%xmm4 \n"
"packuswb %%xmm4,%%xmm4 \n"
"pxor %%xmm5,%%xmm5 \n"
@ -146,16 +145,15 @@ void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr,
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
::"memory",
"cc", "xmm0", "xmm1", "xmm4", "xmm5");
:
: "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5");
}
void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
"pcmpeqb %%xmm4,%%xmm4 \n"
asm("pcmpeqb %%xmm4,%%xmm4 \n"
"psrlw $0xf,%%xmm4 \n"
"packuswb %%xmm4,%%xmm4 \n"
"pxor %%xmm5,%%xmm5 \n"
@ -195,7 +193,7 @@ void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
(void)src_stride;
asm volatile(LABELALIGN
asm(LABELALIGN
"1: \n"
"vmovdqu (%0),%%ymm0 \n"
"vmovdqu 0x20(%0),%%ymm1 \n"
@ -209,11 +207,11 @@ void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
"sub $0x20,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
::"memory",
"cc", "xmm0", "xmm1");
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
:
: "memory", "cc", "xmm0", "xmm1");
}
void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr,
@ -221,8 +219,7 @@ void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
"vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
asm("vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $0xf,%%ymm4,%%ymm4 \n"
"vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
"vpxor %%ymm5,%%ymm5,%%ymm5 \n"
@ -246,16 +243,15 @@ void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr,
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
::"memory",
"cc", "xmm0", "xmm1", "xmm4", "xmm5");
:
: "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5");
}
void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
"vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
asm("vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $0xf,%%ymm4,%%ymm4 \n"
"vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
"vpxor %%ymm5,%%ymm5,%%ymm5 \n"
@ -297,8 +293,7 @@ void ScaleRowDown4_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
"pcmpeqb %%xmm5,%%xmm5 \n"
asm("pcmpeqb %%xmm5,%%xmm5 \n"
"psrld $0x18,%%xmm5 \n"
"pslld $0x10,%%xmm5 \n"
@ -319,8 +314,8 @@ void ScaleRowDown4_SSSE3(const uint8_t* src_ptr,
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
::"memory",
"cc", "xmm0", "xmm1", "xmm5");
:
: "memory", "cc", "xmm0", "xmm1", "xmm5");
}
void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr,
@ -328,8 +323,7 @@ void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
intptr_t stridex3;
asm volatile(
"pcmpeqb %%xmm4,%%xmm4 \n"
asm("pcmpeqb %%xmm4,%%xmm4 \n"
"psrlw $0xf,%%xmm4 \n"
"movdqa %%xmm4,%%xmm5 \n"
"packuswb %%xmm4,%%xmm4 \n"
@ -383,8 +377,7 @@ void ScaleRowDown4_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
asm("vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
"vpsrld $0x18,%%ymm5,%%ymm5 \n"
"vpslld $0x10,%%ymm5,%%ymm5 \n"
@ -408,16 +401,15 @@ void ScaleRowDown4_AVX2(const uint8_t* src_ptr,
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
::"memory",
"cc", "xmm0", "xmm1", "xmm5");
:
: "memory", "cc", "xmm0", "xmm1", "xmm5");
}
void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
"vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
asm("vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $0xf,%%ymm4,%%ymm4 \n"
"vpsllw $0x3,%%ymm4,%%ymm5 \n"
"vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
@ -472,8 +464,7 @@ void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
"movdqa %0,%%xmm3 \n"
asm("movdqa %0,%%xmm3 \n"
"movdqa %1,%%xmm4 \n"
"movdqa %2,%%xmm5 \n"
:
@ -481,7 +472,7 @@ void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
"m"(kShuf1), // %1
"m"(kShuf2) // %2
);
asm volatile(LABELALIGN
asm(LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm2 \n"
@ -497,19 +488,18 @@ void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
"lea 0x18(%1),%1 \n"
"sub $0x18,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
::"memory",
"cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
:
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
"movdqa %0,%%xmm2 \n" // kShuf01
asm("movdqa %0,%%xmm2 \n" // kShuf01
"movdqa %1,%%xmm3 \n" // kShuf11
"movdqa %2,%%xmm4 \n" // kShuf21
:
@ -517,8 +507,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
"m"(kShuf11), // %1
"m"(kShuf21) // %2
);
asm volatile(
"movdqa %0,%%xmm5 \n" // kMadd01
asm("movdqa %0,%%xmm5 \n" // kMadd01
"movdqa %1,%%xmm0 \n" // kMadd11
"movdqa %2,%%xmm1 \n" // kRound34
:
@ -526,7 +515,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
"m"(kMadd11), // %1
"m"(kRound34) // %2
);
asm volatile(LABELALIGN
asm(LABELALIGN
"1: \n"
"movdqu (%0),%%xmm6 \n"
"movdqu 0x00(%0,%3,1),%%xmm7 \n"
@ -559,21 +548,20 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
"lea 0x18(%1),%1 \n"
"sub $0x18,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)), // %3
"m"(kMadd21) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
"xmm6", "xmm7");
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)), // %3
"m"(kMadd21) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
"movdqa %0,%%xmm2 \n" // kShuf01
asm("movdqa %0,%%xmm2 \n" // kShuf01
"movdqa %1,%%xmm3 \n" // kShuf11
"movdqa %2,%%xmm4 \n" // kShuf21
:
@ -581,8 +569,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
"m"(kShuf11), // %1
"m"(kShuf21) // %2
);
asm volatile(
"movdqa %0,%%xmm5 \n" // kMadd01
asm("movdqa %0,%%xmm5 \n" // kMadd01
"movdqa %1,%%xmm0 \n" // kMadd11
"movdqa %2,%%xmm1 \n" // kRound34
:
@ -591,7 +578,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
"m"(kRound34) // %2
);
asm volatile(LABELALIGN
asm(LABELALIGN
"1: \n"
"movdqu (%0),%%xmm6 \n"
"movdqu 0x00(%0,%3,1),%%xmm7 \n"
@ -627,13 +614,13 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
"lea 0x18(%1),%1 \n"
"sub $0x18,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)), // %3
"m"(kMadd21) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
"xmm6", "xmm7");
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)), // %3
"m"(kMadd21) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
void ScaleRowDown38_SSSE3(const uint8_t* src_ptr,
@ -641,8 +628,7 @@ void ScaleRowDown38_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
"movdqa %3,%%xmm4 \n"
asm("movdqa %3,%%xmm4 \n"
"movdqa %4,%%xmm5 \n"
LABELALIGN
@ -671,8 +657,7 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
"movdqa %0,%%xmm2 \n"
asm("movdqa %0,%%xmm2 \n"
"movdqa %1,%%xmm3 \n"
"movdqa %2,%%xmm4 \n"
"movdqa %3,%%xmm5 \n"
@ -682,7 +667,7 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
"m"(kShufAb2), // %2
"m"(kScaleAb2) // %3
);
asm volatile(LABELALIGN
asm(LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x00(%0,%3,1),%%xmm1 \n"
@ -703,20 +688,18 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
"lea 0x6(%1),%1 \n"
"sub $0x6,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)) // %3
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
"xmm6");
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)) // %3
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
"movdqa %0,%%xmm2 \n"
asm("movdqa %0,%%xmm2 \n"
"movdqa %1,%%xmm3 \n"
"movdqa %2,%%xmm4 \n"
"pxor %%xmm5,%%xmm5 \n"
@ -725,7 +708,7 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
"m"(kShufAc3), // %1
"m"(kScaleAc33) // %2
);
asm volatile(LABELALIGN
asm(LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x00(%0,%3,1),%%xmm6 \n"
@ -765,12 +748,12 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
"lea 0x6(%1),%1 \n"
"sub $0x6,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)) // %3
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
"xmm6", "xmm7");
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)) // %3
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
static const uvec8 kLinearShuffleFar = {2, 3, 0, 1, 6, 7, 4, 5,
@ -783,8 +766,7 @@ static const uvec8 kLinearMadd31 = {3, 1, 1, 3, 3, 1, 1, 3,
void ScaleRowUp2_Linear_SSE2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
"pxor %%xmm0,%%xmm0 \n" // 0
asm("pxor %%xmm0,%%xmm0 \n" // 0
"pcmpeqw %%xmm6,%%xmm6 \n"
"psrlw $15,%%xmm6 \n"
"psllw $1,%%xmm6 \n" // all 2
@ -839,8 +821,7 @@ void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile(
LABELALIGN
asm(LABELALIGN
"1: \n"
"pxor %%xmm0,%%xmm0 \n" // 0
// above line
@ -953,8 +934,7 @@ void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr,
void ScaleRowUp2_Linear_12_SSSE3(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
asm volatile(
"movdqa %3,%%xmm5 \n"
asm("movdqa %3,%%xmm5 \n"
"pcmpeqw %%xmm4,%%xmm4 \n"
"psrlw $15,%%xmm4 \n"
"psllw $1,%%xmm4 \n" // all 2
@ -1005,8 +985,7 @@ void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t* src_ptr,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile(
"pcmpeqw %%xmm7,%%xmm7 \n"
asm("pcmpeqw %%xmm7,%%xmm7 \n"
"psrlw $15,%%xmm7 \n"
"psllw $3,%%xmm7 \n" // all 8
"movdqa %5,%%xmm6 \n"
@ -1103,8 +1082,7 @@ void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t* src_ptr,
void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
asm volatile(
"pxor %%xmm5,%%xmm5 \n"
asm("pxor %%xmm5,%%xmm5 \n"
"pcmpeqd %%xmm4,%%xmm4 \n"
"psrld $31,%%xmm4 \n"
"pslld $1,%%xmm4 \n" // all 2
@ -1156,8 +1134,7 @@ void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile(
"pxor %%xmm7,%%xmm7 \n"
asm("pxor %%xmm7,%%xmm7 \n"
"pcmpeqd %%xmm6,%%xmm6 \n"
"psrld $31,%%xmm6 \n"
"pslld $3,%%xmm6 \n" // all 8
@ -1264,8 +1241,7 @@ void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr,
void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
"pcmpeqw %%xmm4,%%xmm4 \n"
asm("pcmpeqw %%xmm4,%%xmm4 \n"
"psrlw $15,%%xmm4 \n"
"psllw $1,%%xmm4 \n" // all 2
"movdqa %3,%%xmm3 \n"
@ -1305,8 +1281,7 @@ void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile(
"pcmpeqw %%xmm6,%%xmm6 \n"
asm("pcmpeqw %%xmm6,%%xmm6 \n"
"psrlw $15,%%xmm6 \n"
"psllw $3,%%xmm6 \n" // all 8
"movdqa %5,%%xmm7 \n"
@ -1390,8 +1365,7 @@ void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
void ScaleRowUp2_Linear_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
"vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
asm("vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $15,%%ymm4,%%ymm4 \n"
"vpsllw $1,%%ymm4,%%ymm4 \n" // all 2
"vbroadcastf128 %3,%%ymm3 \n"
@ -1434,8 +1408,7 @@ void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile(
"vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n"
asm("vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n"
"vpsrlw $15,%%ymm6,%%ymm6 \n"
"vpsllw $3,%%ymm6,%%ymm6 \n" // all 8
"vbroadcastf128 %5,%%ymm7 \n"
@ -1516,8 +1489,7 @@ void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
void ScaleRowUp2_Linear_12_AVX2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
asm volatile(
"vbroadcastf128 %3,%%ymm5 \n"
asm("vbroadcastf128 %3,%%ymm5 \n"
"vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $15,%%ymm4,%%ymm4 \n"
"vpsllw $1,%%ymm4,%%ymm4 \n" // all 2
@ -1568,8 +1540,7 @@ void ScaleRowUp2_Bilinear_12_AVX2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile(
"vbroadcastf128 %5,%%ymm5 \n"
asm("vbroadcastf128 %5,%%ymm5 \n"
"vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $15,%%ymm4,%%ymm4 \n"
"vpsllw $3,%%ymm4,%%ymm4 \n" // all 8
@ -1630,8 +1601,7 @@ void ScaleRowUp2_Bilinear_12_AVX2(const uint16_t* src_ptr,
void ScaleRowUp2_Linear_16_AVX2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
asm volatile(
"vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n"
asm("vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrld $31,%%ymm4,%%ymm4 \n"
"vpslld $1,%%ymm4,%%ymm4 \n" // all 2
@ -1680,8 +1650,7 @@ void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile(
"vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n"
asm("vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n"
"vpsrld $31,%%ymm6,%%ymm6 \n"
"vpslld $3,%%ymm6,%%ymm6 \n" // all 8
@ -1763,10 +1732,10 @@ void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
void ScaleAddRow_SSE2(const uint8_t* src_ptr,
uint16_t* dst_ptr,
int src_width) {
asm volatile("pxor %%xmm5,%%xmm5 \n"
asm("pxor %%xmm5,%%xmm5 \n"
// 16 pixel loop.
LABELALIGN
// 16 pixel loop.
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm3 \n"
"lea 0x10(%0),%0 \n" // src_ptr += 16
@ -1782,11 +1751,11 @@ void ScaleAddRow_SSE2(const uint8_t* src_ptr,
"lea 0x20(%1),%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(src_width) // %2
:
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(src_width) // %2
:
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
#ifdef HAS_SCALEADDROW_AVX2
@ -1794,9 +1763,9 @@ void ScaleAddRow_SSE2(const uint8_t* src_ptr,
void ScaleAddRow_AVX2(const uint8_t* src_ptr,
uint16_t* dst_ptr,
int src_width) {
asm volatile("vpxor %%ymm5,%%ymm5,%%ymm5 \n"
asm("vpxor %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
LABELALIGN
"1: \n"
"vmovdqu (%0),%%ymm3 \n"
"lea 0x20(%0),%0 \n" // src_ptr += 32
@ -1811,11 +1780,11 @@ void ScaleAddRow_AVX2(const uint8_t* src_ptr,
"sub $0x20,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(src_width) // %2
:
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(src_width) // %2
:
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
#endif // HAS_SCALEADDROW_AVX2
@ -1835,8 +1804,7 @@ void ScaleFilterCols_SSSE3(uint8_t* dst_ptr,
int x,
int dx) {
intptr_t x0, x1, temp_pixel;
asm volatile(
"movd %6,%%xmm2 \n"
asm("movd %6,%%xmm2 \n"
"movd %7,%%xmm3 \n"
"movl $0x04040000,%k2 \n"
"movd %k2,%%xmm5 \n"
@ -1932,7 +1900,7 @@ void ScaleColsUp2_SSE2(uint8_t* dst_ptr,
int dx) {
(void)x;
(void)dx;
asm volatile(LABELALIGN
asm(LABELALIGN
"1: \n"
"movdqu (%1),%%xmm0 \n"
"lea 0x10(%1),%1 \n"
@ -1945,11 +1913,11 @@ void ScaleColsUp2_SSE2(uint8_t* dst_ptr,
"sub $0x20,%2 \n"
"jg 1b \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+r"(dst_width) // %2
::"memory",
"cc", "xmm0", "xmm1");
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+r"(dst_width) // %2
:
: "memory", "cc", "xmm0", "xmm1");
}
void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
@ -1957,7 +1925,7 @@ void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
uint8_t* dst_argb,
int dst_width) {
(void)src_stride;
asm volatile(LABELALIGN
asm(LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
@ -1967,11 +1935,11 @@ void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
"lea 0x10(%1),%1 \n"
"sub $0x4,%2 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(dst_width) // %2
::"memory",
"cc", "xmm0", "xmm1");
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(dst_width) // %2
:
: "memory", "cc", "xmm0", "xmm1");
}
void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
@ -1979,7 +1947,7 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
uint8_t* dst_argb,
int dst_width) {
(void)src_stride;
asm volatile(LABELALIGN
asm(LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
@ -1992,18 +1960,18 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
"lea 0x10(%1),%1 \n"
"sub $0x4,%2 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(dst_width) // %2
::"memory",
"cc", "xmm0", "xmm1");
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(dst_width) // %2
:
: "memory", "cc", "xmm0", "xmm1");
}
void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb,
ptrdiff_t src_stride,
uint8_t* dst_argb,
int dst_width) {
asm volatile(LABELALIGN
asm(LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
@ -2020,11 +1988,11 @@ void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb,
"lea 0x10(%1),%1 \n"
"sub $0x4,%2 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)) // %3
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)) // %3
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
}
// Reads 4 pixels at a time.
@ -2037,8 +2005,7 @@ void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb,
intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
intptr_t src_stepx_x12;
(void)src_stride;
asm volatile(
"lea 0x00(,%1,4),%1 \n"
asm("lea 0x00(,%1,4),%1 \n"
"lea 0x00(%1,%1,2),%4 \n"
LABELALIGN
@ -2060,8 +2027,8 @@ void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb,
"+r"(dst_argb), // %2
"+r"(dst_width), // %3
"=&r"(src_stepx_x12) // %4
::"memory",
"cc", "xmm0", "xmm1", "xmm2", "xmm3");
:
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
}
// Blends four 2x2 to 4x1.
@ -2074,8 +2041,7 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb,
intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
intptr_t src_stepx_x12;
intptr_t row1 = (intptr_t)(src_stride);
asm volatile(
"lea 0x00(,%1,4),%1 \n"
asm("lea 0x00(,%1,4),%1 \n"
"lea 0x00(%1,%1,2),%4 \n"
"lea 0x00(%0,%5,1),%5 \n"
@ -2107,8 +2073,8 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb,
"+rm"(dst_width), // %3
"=&r"(src_stepx_x12), // %4
"+r"(row1) // %5
::"memory",
"cc", "xmm0", "xmm1", "xmm2", "xmm3");
:
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
}
void ScaleARGBCols_SSE2(uint8_t* dst_argb,
@ -2117,8 +2083,7 @@ void ScaleARGBCols_SSE2(uint8_t* dst_argb,
int x,
int dx) {
intptr_t x0, x1;
asm volatile(
"movd %5,%%xmm2 \n"
asm("movd %5,%%xmm2 \n"
"movd %6,%%xmm3 \n"
"pshufd $0x0,%%xmm2,%%xmm2 \n"
"pshufd $0x11,%%xmm3,%%xmm0 \n"
@ -2188,7 +2153,7 @@ void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb,
int dx) {
(void)x;
(void)dx;
asm volatile(LABELALIGN
asm(LABELALIGN
"1: \n"
"movdqu (%1),%%xmm0 \n"
"lea 0x10(%1),%1 \n"
@ -2201,11 +2166,11 @@ void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb,
"sub $0x8,%2 \n"
"jg 1b \n"
: "+r"(dst_argb), // %0
"+r"(src_argb), // %1
"+r"(dst_width) // %2
::"memory",
"cc", "xmm0", "xmm1");
: "+r"(dst_argb), // %0
"+r"(src_argb), // %1
"+r"(dst_width) // %2
:
: "memory", "cc", "xmm0", "xmm1");
}
// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
@ -2226,16 +2191,14 @@ void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
int x,
int dx) {
intptr_t x0, x1;
asm volatile(
"movdqa %0,%%xmm4 \n"
asm("movdqa %0,%%xmm4 \n"
"movdqa %1,%%xmm5 \n"
:
: "m"(kShuffleColARGB), // %0
"m"(kShuffleFractions) // %1
);
asm volatile(
"movd %5,%%xmm2 \n"
asm("movd %5,%%xmm2 \n"
"movd %6,%%xmm3 \n"
"pcmpeqb %%xmm6,%%xmm6 \n"
"psrlw $0x9,%%xmm6 \n"
@ -2283,8 +2246,7 @@ void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
"packuswb %%xmm0,%%xmm0 \n"
"movd %%xmm0,(%0) \n"
LABELALIGN
"99: \n" // clang-format error.
LABELALIGN "99: \n"
: "+r"(dst_argb), // %0
"+r"(src_argb), // %1
@ -2298,8 +2260,7 @@ void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
// Divide num by div and return as 16.16 fixed point result.
int FixedDiv_X86(int num, int div) {
asm volatile(
"cdq \n"
asm("cdq \n"
"shld $0x10,%%eax,%%edx \n"
"shl $0x10,%%eax \n"
"idiv %1 \n"
@ -2312,8 +2273,7 @@ int FixedDiv_X86(int num, int div) {
// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
int FixedDiv1_X86(int num, int div) {
asm volatile(
"cdq \n"
asm("cdq \n"
"shld $0x10,%%eax,%%edx \n"
"shl $0x10,%%eax \n"
"sub $0x10001,%%eax \n"
@ -2344,8 +2304,7 @@ void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
"pcmpeqb %%xmm4,%%xmm4 \n" // 01010101
asm("pcmpeqb %%xmm4,%%xmm4 \n" // 01010101
"psrlw $0xf,%%xmm4 \n"
"packuswb %%xmm4,%%xmm4 \n"
"pxor %%xmm5, %%xmm5 \n" // zero
@ -2384,8 +2343,7 @@ void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
"vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 01010101
asm("vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 01010101
"vpsrlw $0xf,%%ymm4,%%ymm4 \n"
"vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
"vpxor %%ymm5,%%ymm5,%%ymm5 \n" // zero
@ -2428,8 +2386,7 @@ static const uvec8 kUVLinearMadd31 = {3, 1, 3, 1, 1, 3, 1, 3,
void ScaleUVRowUp2_Linear_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
"pcmpeqw %%xmm4,%%xmm4 \n"
asm("pcmpeqw %%xmm4,%%xmm4 \n"
"psrlw $15,%%xmm4 \n"
"psllw $1,%%xmm4 \n" // all 2
"movdqa %3,%%xmm3 \n"
@ -2469,8 +2426,7 @@ void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile(
"pcmpeqw %%xmm6,%%xmm6 \n"
asm("pcmpeqw %%xmm6,%%xmm6 \n"
"psrlw $15,%%xmm6 \n"
"psllw $3,%%xmm6 \n" // all 8
"movdqa %5,%%xmm7 \n"
@ -2553,8 +2509,7 @@ void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
void ScaleUVRowUp2_Linear_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
"vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
asm("vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $15,%%ymm4,%%ymm4 \n"
"vpsllw $1,%%ymm4,%%ymm4 \n" // all 2
"vbroadcastf128 %3,%%ymm3 \n"
@ -2596,8 +2551,7 @@ void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile(
"vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n"
asm("vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n"
"vpsrlw $15,%%ymm6,%%ymm6 \n"
"vpsllw $3,%%ymm6,%%ymm6 \n" // all 8
"vbroadcastf128 %5,%%ymm7 \n"
@ -2676,8 +2630,7 @@ void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
void ScaleUVRowUp2_Linear_16_SSE41(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
asm volatile(
"pxor %%xmm5,%%xmm5 \n"
asm("pxor %%xmm5,%%xmm5 \n"
"pcmpeqd %%xmm4,%%xmm4 \n"
"psrld $31,%%xmm4 \n"
"pslld $1,%%xmm4 \n" // all 2
@ -2728,8 +2681,7 @@ void ScaleUVRowUp2_Bilinear_16_SSE41(const uint16_t* src_ptr,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile(
"pxor %%xmm7,%%xmm7 \n"
asm("pxor %%xmm7,%%xmm7 \n"
"pcmpeqd %%xmm6,%%xmm6 \n"
"psrld $31,%%xmm6 \n"
"pslld $3,%%xmm6 \n" // all 8
@ -2819,8 +2771,7 @@ void ScaleUVRowUp2_Bilinear_16_SSE41(const uint16_t* src_ptr,
void ScaleUVRowUp2_Linear_16_AVX2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
asm volatile(
"vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n"
asm("vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrld $31,%%ymm4,%%ymm4 \n"
"vpslld $1,%%ymm4,%%ymm4 \n" // all 2
@ -2868,8 +2819,7 @@ void ScaleUVRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile(
"vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n"
asm("vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n"
"vpsrld $31,%%ymm6,%%ymm6 \n"
"vpslld $3,%%ymm6,%%ymm6 \n" // all 8

@ -2551,11 +2551,12 @@ int TestHalfFloatPlane(int benchmark_width,
#if defined(__arm__)
static void EnableFlushDenormalToZero(void) {
uint32_t cw;
__asm__ __volatile__(
"vmrs %0, fpscr \n"
"orr %0, %0, #0x1000000 \n"
"vmsr fpscr, %0 \n"
: "=r"(cw)::"memory");
asm volatile (
"vmrs %0, fpscr \n"
"orr %0, %0, #0x1000000 \n"
"vmsr fpscr, %0 \n"
: "=r"(cw)
::"memory", "cc"); // Clobber List
}
#endif