mirror of https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-07 01:06:46 +08:00

Add volatile to gcc inline asm to avoid it being removed

Bug: b/42280943
Change-Id: I4439077a92ffa6dff91d2d10accd5251b76f7544
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5671187
Reviewed-by: David Gao <davidgao@google.com>

This commit is contained in:
parent efd164d64e
commit 616bee5420
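
Background on the change: GCC treats a non-volatile extended asm that has output
operands as a pure function of its inputs, so the whole statement may be deleted if
the outputs appear unused (for example after inlining), and it may be hoisted out of
loops or reordered. This commit therefore marks the large compare/rotate/row kernels
asm volatile, and conversely drops volatile from load/store macros whose "m" operands
already describe every byte they touch. A minimal sketch of the failure mode, with
hypothetical names (not code from this commit):

  #include <stdint.h>

  /* Without "volatile", GCC may remove this asm entirely when "lo" ends up
   * unused, or move it across surrounding memory operations. */
  static inline uint32_t low_word(uint64_t v) {
    uint32_t lo;
    asm volatile("movl %k1, %0" : "=r"(lo) : "r"(v));
    return lo;
  }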
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: https://chromium.googlesource.com/libyuv/libyuv/
-Version: 1888
+Version: 1889
 License: BSD
 License File: LICENSE
 Shipped: yes
@@ -20,9 +20,9 @@
 ({ \
   const uint8_t* psrc_lw_m = (const uint8_t*)(psrc); \
   uint32_t val_m; \
-  asm volatile("lw %[val_m], %[psrc_lw_m] \n" \
+  asm("lw %[val_m], %[psrc_lw_m] \n" \
       : [val_m] "=r"(val_m) \
       : [psrc_lw_m] "m"(*psrc_lw_m)); \
   val_m; \
 })
 

@@ -31,9 +31,9 @@
 ({ \
   const uint8_t* psrc_ld_m = (const uint8_t*)(psrc); \
   uint64_t val_m = 0; \
-  asm volatile("ld %[val_m], %[psrc_ld_m] \n" \
+  asm("ld %[val_m], %[psrc_ld_m] \n" \
       : [val_m] "=r"(val_m) \
       : [psrc_ld_m] "m"(*psrc_ld_m)); \
   val_m; \
 })
 #else  // !(__mips == 64)

@@ -55,9 +55,9 @@
 ({ \
   uint8_t* pdst_sw_m = (uint8_t*)(pdst); /* NOLINT */ \
   uint32_t val_m = (val); \
-  asm volatile("sw %[val_m], %[pdst_sw_m] \n" \
+  asm("sw %[val_m], %[pdst_sw_m] \n" \
       : [pdst_sw_m] "=m"(*pdst_sw_m) \
       : [val_m] "r"(val_m)); \
 })
 
 #if (__mips == 64)

@@ -65,9 +65,9 @@
 ({ \
   uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */ \
   uint64_t val_m = (val); \
-  asm volatile("sd %[val_m], %[pdst_sd_m] \n" \
+  asm("sd %[val_m], %[pdst_sd_m] \n" \
       : [pdst_sd_m] "=m"(*pdst_sd_m) \
       : [val_m] "r"(val_m)); \
 })
 #else  // !(__mips == 64)
 #define SD(val, pdst) \

@@ -86,8 +86,7 @@
   uint8_t* psrc_lw_m = (uint8_t*)(psrc); \
   uint32_t val_lw_m; \
   \
-  __asm__ volatile( \
-      "lwr %[val_lw_m], 0(%[psrc_lw_m]) \n\t" \
+  asm("lwr %[val_lw_m], 0(%[psrc_lw_m]) \n\t" \
       "lwl %[val_lw_m], 3(%[psrc_lw_m]) \n\t" \
   \
       : [val_lw_m] "=&r"(val_lw_m) \

@@ -102,8 +101,7 @@
   uint8_t* psrc_ld_m = (uint8_t*)(psrc); \
   uint64_t val_ld_m = 0; \
   \
-  __asm__ volatile( \
-      "ldr %[val_ld_m], 0(%[psrc_ld_m]) \n\t" \
+  asm("ldr %[val_ld_m], 0(%[psrc_ld_m]) \n\t" \
       "ldl %[val_ld_m], 7(%[psrc_ld_m]) \n\t" \
   \
       : [val_ld_m] "=&r"(val_ld_m) \

@@ -130,9 +128,9 @@
 ({ \
   uint8_t* pdst_sw_m = (uint8_t*)(pdst); /* NOLINT */ \
   uint32_t val_m = (val); \
-  asm volatile("usw %[val_m], %[pdst_sw_m] \n" \
+  asm("usw %[val_m], %[pdst_sw_m] \n" \
       : [pdst_sw_m] "=m"(*pdst_sw_m) \
       : [val_m] "r"(val_m)); \
 })
 
 #define SD(val, pdst) \
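
Note on the macro hunks above: the LW/LD/SW/SD helpers name the exact bytes they
access through "m" operands, so normal data-flow already keeps the asm alive and
correctly ordered, and the volatile qualifier was redundant there. A sketch of the
same pattern, modeled on the SW macro (identifiers illustrative):

  #include <stdint.h>

  /* The "=m" output tells GCC this asm writes *pdst, so the statement takes
   * part in ordinary dependency analysis; no volatile is needed. */
  static inline void store_u32(uint32_t val, void* pdst) {
    asm("sw %[val_m], %[pdst_sw_m] \n"
        : [pdst_sw_m] "=m"(*(uint32_t*)pdst)
        : [val_m] "r"(val));
  }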
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 1888
+#define LIBYUV_VERSION 1889
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_
@@ -29,7 +29,8 @@ uint32_t HammingDistance_SSE42(const uint8_t* src_a,
                                int count) {
   uint64_t diff;
 
-  asm("xor %3,%3 \n"
+  asm volatile (
+      "xor %3,%3 \n"
       "xor %%r8,%%r8 \n"
       "xor %%r9,%%r9 \n"
       "xor %%r10,%%r10 \n"

@@ -76,7 +77,7 @@ uint32_t HammingDistance_SSE42(const uint8_t* src_a,
                                int count) {
   uint32_t diff = 0u;
 
-  asm(
+  asm volatile (
       // Process 16 bytes per loop.
       LABELALIGN
       "1: \n"

@@ -120,7 +121,8 @@ uint32_t HammingDistance_SSSE3(const uint8_t* src_a,
                                int count) {
   uint32_t diff;
 
-  asm("movdqa %4,%%xmm2 \n"
+  asm volatile (
+      "movdqa %4,%%xmm2 \n"
       "movdqa %5,%%xmm3 \n"
       "pxor %%xmm0,%%xmm0 \n"
       "pxor %%xmm1,%%xmm1 \n"

@@ -178,7 +180,8 @@ uint32_t HammingDistance_AVX2(const uint8_t* src_a,
                               int count) {
   uint32_t diff;
 
-  asm("vbroadcastf128 %4,%%ymm2 \n"
+  asm volatile (
+      "vbroadcastf128 %4,%%ymm2 \n"
       "vbroadcastf128 %5,%%ymm3 \n"
       "vpxor %%ymm0,%%ymm0,%%ymm0 \n"
       "vpxor %%ymm1,%%ymm1,%%ymm1 \n"

@@ -231,7 +234,8 @@ uint32_t SumSquareError_SSE2(const uint8_t* src_a,
                              const uint8_t* src_b,
                              int count) {
   uint32_t sse;
-  asm("pxor %%xmm0,%%xmm0 \n"
+  asm volatile (
+      "pxor %%xmm0,%%xmm0 \n"
       "pxor %%xmm5,%%xmm5 \n"
 
       LABELALIGN

@@ -296,7 +300,8 @@ static const uvec32 kHashMul3 = {
 
 uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) {
   uint32_t hash;
-  asm("movd %2,%%xmm0 \n"
+  asm volatile (
+      "movd %2,%%xmm0 \n"
       "pxor %%xmm7,%%xmm7 \n"
       "movdqa %4,%%xmm6 \n"
 
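
Conversely, the x86 kernels above read src_a/src_b through pointers passed in
registers, with no "m" operand covering the buffers, so the memory dependency is
invisible to the compiler; adding volatile keeps these blocks from being deleted or
moved when their results look dead. A reduced sketch of that situation (hypothetical
function, not from this file):

  #include <stdint.h>

  /* "r"(p) passes only the pointer; the load itself is invisible to the
   * optimizer, so volatile pins the asm in place. */
  static inline uint32_t first_byte(const uint8_t* p) {
    uint32_t v;
    asm volatile("movzbl (%1), %0" : "=r"(v) : "r"(p));
    return v;
  }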
@@ -28,7 +28,7 @@ uint32_t HammingDistance_NEON(const uint8_t* src_a,
                               int count) {
   uint32_t diff;
 
-  asm volatile(
+  asm volatile (
       "vmov.u16 q4, #0 \n"  // accumulator
 
       "1: \n"

@@ -58,7 +58,7 @@ uint32_t SumSquareError_NEON(const uint8_t* src_a,
                              const uint8_t* src_b,
                              int count) {
   uint32_t sse;
-  asm volatile(
+  asm volatile (
       "vmov.u8 q8, #0 \n"
       "vmov.u8 q10, #0 \n"
       "vmov.u8 q9, #0 \n"

@@ -26,7 +26,7 @@ uint32_t HammingDistance_NEON(const uint8_t* src_a,
                               const uint8_t* src_b,
                               int count) {
   uint32_t diff;
-  asm volatile(
+  asm volatile (
       "movi v4.8h, #0 \n"
 
       "1: \n"

@@ -55,7 +55,7 @@ uint32_t SumSquareError_NEON(const uint8_t* src_a,
                              const uint8_t* src_b,
                              int count) {
   uint32_t sse;
-  asm volatile(
+  asm volatile (
       "movi v16.16b, #0 \n"
       "movi v17.16b, #0 \n"
       "movi v18.16b, #0 \n"

@@ -157,7 +157,7 @@ uint32_t HammingDistance_NEON_DotProd(const uint8_t* src_a,
                                       const uint8_t* src_b,
                                       int count) {
   uint32_t diff;
-  asm volatile(
+  asm volatile (
       "movi v4.4s, #0 \n"
       "movi v5.4s, #0 \n"
       "movi v6.16b, #1 \n"

@@ -190,7 +190,7 @@ uint32_t SumSquareError_NEON_DotProd(const uint8_t* src_a,
                                      int count) {
   // count is guaranteed to be a multiple of 32.
   uint32_t sse;
-  asm volatile(
+  asm volatile (
       "movi v4.4s, #0 \n"
       "movi v5.4s, #0 \n"
 
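
The NEON compare hunks above are whitespace-only: "asm volatile(" becomes
"asm volatile (", presumably so that the existing blocks match the formatting of
the kernels that gained volatile elsewhere in this change.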
@@ -26,7 +26,7 @@ void TransposeWx8_SSSE3(const uint8_t* src,
                         uint8_t* dst,
                         int dst_stride,
                         int width) {
-  asm(
+  asm volatile (
       // Read in the data from the source pointer.
       // First round of bit swap.
       LABELALIGN

@@ -116,7 +116,7 @@ void TransposeWx8_Fast_SSSE3(const uint8_t* src,
                              uint8_t* dst,
                              int dst_stride,
                              int width) {
-  asm(
+  asm volatile (
       // Read in the data from the source pointer.
       // First round of bit swap.
       LABELALIGN

@@ -261,7 +261,7 @@ void TransposeUVWx8_SSE2(const uint8_t* src,
                          uint8_t* dst_b,
                          int dst_stride_b,
                          int width) {
-  asm(
+  asm volatile (
       // Read in the data from the source pointer.
       // First round of bit swap.
       LABELALIGN

@@ -391,7 +391,7 @@ void Transpose4x4_32_SSE2(const uint8_t* src,
                           uint8_t* dst,
                           int dst_stride,
                           int width) {
-  asm(
+  asm volatile (
       // Main loop transpose 4x4. Read a column, write a row.
       "1: \n"
       "movdqu (%0),%%xmm0 \n"  // a b c d

@@ -447,7 +447,7 @@ void Transpose4x4_32_AVX2(const uint8_t* src,
                           uint8_t* dst,
                           int dst_stride,
                           int width) {
-  asm(
+  asm volatile (
       // Main loop transpose 2 blocks of 4x4. Read a column, write a row.
       "1: \n"
       "vmovdqu (%0),%%xmm0 \n"  // a b c d
@@ -51,6 +51,16 @@ extern "C" {
     out3 = (v16u8)__msa_ilvl_d((v2i64)in3, (v2i64)in2); \
   }
 
+void TransposeWx16_C(const uint8_t* src,
+                     int src_stride,
+                     uint8_t* dst,
+                     int dst_stride,
+                     int width) {
+  TransposeWx8_C(src, src_stride, dst, dst_stride, width);
+  TransposeWx8_C((src + 8 * src_stride), src_stride, (dst + 8), dst_stride,
+                 width);
+}
+
 void TransposeUVWx16_C(const uint8_t* src,
                        int src_stride,
                        uint8_t* dst_a,
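
The TransposeWx16_C added above is a plain C fallback that builds a Wx16 transpose
from two Wx8 transposes: the first TransposeWx8_C call handles rows 0-7, and the
second handles rows 8-15 by offsetting the source by 8 * src_stride and the
destination by 8 columns.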
@@ -27,7 +27,7 @@ void TransposeWx8_NEON(const uint8_t* src,
                        int dst_stride,
                        int width) {
   const uint8_t* temp;
-  asm(
+  asm volatile (
       // loops are on blocks of 8. loop will stop when
       // counter gets to or below 0. starting the counter
       // at w-8 allow for this

@@ -95,7 +95,7 @@ void TransposeUVWx8_NEON(const uint8_t* src,
                          int dst_stride_b,
                          int width) {
   const uint8_t* temp;
-  asm(
+  asm volatile (
       // loops are on blocks of 8. loop will stop when
       // counter gets to or below 0. starting the counter
       // at w-8 allow for this

@@ -184,7 +184,7 @@ void Transpose4x4_32_NEON(const uint8_t* src,
   uint8_t* dst1 = dst + dst_stride;
   uint8_t* dst2 = dst1 + dst_stride;
   uint8_t* dst3 = dst2 + dst_stride;
-  asm volatile(
+  asm volatile (
       // Main loop transpose 4x4. Read a column, write a row.
       "1: \n"
       "vld4.32 {d0[0], d2[0], d4[0], d6[0]}, [%0], %9 \n"

@@ -27,7 +27,8 @@ void TransposeWx16_NEON(const uint8_t* src,
                         int dst_stride,
                         int width) {
   const uint8_t* src_temp;
-  asm("1: \n"
+  asm volatile (
+      "1: \n"
       "mov %[src_temp], %[src] \n"
 
       "ld1 {v16.16b}, [%[src_temp]], %[src_stride] \n"

@@ -144,7 +145,7 @@ void TransposeUVWx8_NEON(const uint8_t* src,
                          int dst_stride_b,
                          int width) {
   const uint8_t* temp;
-  asm(
+  asm volatile (
       // loops are on blocks of 8. loop will stop when
       // counter gets to or below 0. starting the counter
       // at w-8 allow for this

@@ -238,7 +239,7 @@ void Transpose4x4_32_NEON(const uint8_t* src,
   uint8_t* dst1 = dst + dst_stride;
   uint8_t* dst2 = dst1 + dst_stride;
   uint8_t* dst3 = dst2 + dst_stride;
-  asm volatile(
+  asm volatile (
       // Main loop transpose 4x4. Read a column, write a row.
       "1: \n"
       "ld4 {v0.s, v1.s, v2.s, v3.s}[0], [%0], %9 \n"
(File diff suppressed because it is too large.)
@@ -2037,7 +2037,7 @@ static void ARGBToYMatrixRow_LASX(const uint8_t* src_argb,
                                   int width,
                                   const struct RgbConstants* rgbconstants) {
   int32_t shuff[8] = {0, 4, 1, 5, 2, 6, 3, 7};
-  asm volatile(
+  asm volatile (
       "xvldrepl.b $xr0, %3, 0 \n\t"  // load rgbconstants
       "xvldrepl.b $xr1, %3, 1 \n\t"  // load rgbconstants
       "xvldrepl.b $xr2, %3, 2 \n\t"  // load rgbconstants

@@ -2099,7 +2099,7 @@ static void RGBAToYMatrixRow_LASX(const uint8_t* src_rgba,
                                   int width,
                                   const struct RgbConstants* rgbconstants) {
   int32_t shuff[8] = {0, 4, 1, 5, 2, 6, 3, 7};
-  asm volatile(
+  asm volatile (
       "xvldrepl.b $xr0, %3, 0 \n\t"  // load rgbconstants
       "xvldrepl.b $xr1, %3, 1 \n\t"  // load rgbconstants
       "xvldrepl.b $xr2, %3, 2 \n\t"  // load rgbconstants

@@ -2163,7 +2163,7 @@ static void RGBToYMatrixRow_LASX(const uint8_t* src_rgba,
       1, 0, 4, 0, 7, 0, 10, 0, 13, 0, 16, 0, 19, 0, 22, 0,
       25, 0, 28, 0, 31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0,
       25, 0, 28, 0, 31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0};
-  asm volatile(
+  asm volatile (
       "xvldrepl.b $xr0, %3, 0 \n\t"  // load rgbconstants
       "xvldrepl.b $xr1, %3, 1 \n\t"  // load rgbconstants
       "xvldrepl.b $xr2, %3, 2 \n\t"  // load rgbconstants

@@ -2805,8 +2805,7 @@ static void ARGBToYMatrixRow_LSX(const uint8_t* src_argb,
                                  uint8_t* dst_y,
                                  int width,
                                  const struct RgbConstants* rgbconstants) {
-  asm volatile(
-      "vldrepl.b $vr0, %3, 0 \n\t"  // load rgbconstants
+  asm("vldrepl.b $vr0, %3, 0 \n\t"  // load rgbconstants
       "vldrepl.b $vr1, %3, 1 \n\t"  // load rgbconstants
       "vldrepl.b $vr2, %3, 2 \n\t"  // load rgbconstants
       "vldrepl.h $vr3, %3, 4 \n\t"  // load rgbconstants

@@ -2864,8 +2863,7 @@ static void RGBAToYMatrixRow_LSX(const uint8_t* src_rgba,
                                  uint8_t* dst_y,
                                  int width,
                                  const struct RgbConstants* rgbconstants) {
-  asm volatile(
-      "vldrepl.b $vr0, %3, 0 \n\t"  // load rgbconstants
+  asm("vldrepl.b $vr0, %3, 0 \n\t"  // load rgbconstants
       "vldrepl.b $vr1, %3, 1 \n\t"  // load rgbconstants
       "vldrepl.b $vr2, %3, 2 \n\t"  // load rgbconstants
       "vldrepl.h $vr3, %3, 4 \n\t"  // load rgbconstants

@@ -2922,8 +2920,7 @@ static void RGBToYMatrixRow_LSX(const uint8_t* src_rgba,
       7, 9, 10, 12, 13, 15, 1, 0, 4, 0, 7, 0, 10,
       0, 13, 0, 16, 0, 19, 0, 22, 0, 25, 0, 28, 0,
       31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0};
-  asm volatile(
-      "vldrepl.b $vr0, %3, 0 \n\t"  // load rgbconstants
+  asm("vldrepl.b $vr0, %3, 0 \n\t"  // load rgbconstants
       "vldrepl.b $vr1, %3, 1 \n\t"  // load rgbconstants
       "vldrepl.b $vr2, %3, 2 \n\t"  // load rgbconstants
       "vldrepl.h $vr3, %3, 4 \n\t"  // load rgbconstants
@@ -140,7 +140,7 @@ void I444ToARGBRow_NEON(const uint8_t* src_y,
                         uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width) {
-  asm volatile(
+  asm volatile (
      YUVTORGB_SETUP
      "vmov.u8 d6, #255 \n"
      "1: \n" READYUV444 YUVTORGB

@@ -164,7 +164,7 @@ void I444ToRGB24Row_NEON(const uint8_t* src_y,
                          uint8_t* dst_rgb24,
                          const struct YuvConstants* yuvconstants,
                          int width) {
-  asm volatile(
+  asm volatile (
      YUVTORGB_SETUP
      "1: \n" READYUV444 YUVTORGB
      RGBTORGB8

@@ -187,7 +187,7 @@ void I422ToARGBRow_NEON(const uint8_t* src_y,
                         uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width) {
-  asm volatile(
+  asm volatile (
      YUVTORGB_SETUP
      "vmov.u8 d6, #255 \n"
      "1: \n" READYUV422 YUVTORGB

@@ -212,7 +212,7 @@ void I444AlphaToARGBRow_NEON(const uint8_t* src_y,
                              uint8_t* dst_argb,
                              const struct YuvConstants* yuvconstants,
                              int width) {
-  asm volatile(
+  asm volatile (
      YUVTORGB_SETUP
      "1: \n" READYUV444 YUVTORGB
      RGBTORGB8

@@ -238,7 +238,7 @@ void I422AlphaToARGBRow_NEON(const uint8_t* src_y,
                              uint8_t* dst_argb,
                              const struct YuvConstants* yuvconstants,
                              int width) {
-  asm volatile(
+  asm volatile (
      YUVTORGB_SETUP
      "1: \n" READYUV422 YUVTORGB
      RGBTORGB8

@@ -263,7 +263,7 @@ void I422ToRGBARow_NEON(const uint8_t* src_y,
                         uint8_t* dst_rgba,
                         const struct YuvConstants* yuvconstants,
                         int width) {
-  asm volatile(
+  asm volatile (
      YUVTORGB_SETUP
      "vmov.u8 d6, #255 \n"
      "1: \n" READYUV422 YUVTORGB

@@ -285,7 +285,7 @@ void I422ToRGB24Row_NEON(const uint8_t* src_y,
                          uint8_t* dst_rgb24,
                          const struct YuvConstants* yuvconstants,
                          int width) {
-  asm volatile(
+  asm volatile (
      YUVTORGB_SETUP
      "vmov.u8 d6, #255 \n"
      "1: \n" READYUV422 YUVTORGB

@@ -316,7 +316,7 @@ void I422ToRGB565Row_NEON(const uint8_t* src_y,
                           uint8_t* dst_rgb565,
                           const struct YuvConstants* yuvconstants,
                           int width) {
-  asm volatile(
+  asm volatile (
      YUVTORGB_SETUP
      "vmov.u8 d6, #255 \n"
      "1: \n" READYUV422 YUVTORGB

@@ -348,7 +348,7 @@ void I422ToARGB1555Row_NEON(const uint8_t* src_y,
                             uint8_t* dst_argb1555,
                             const struct YuvConstants* yuvconstants,
                             int width) {
-  asm volatile(
+  asm volatile (
      YUVTORGB_SETUP
      "1: \n" READYUV422 YUVTORGB
      RGBTORGB8

@@ -381,7 +381,7 @@ void I422ToARGB4444Row_NEON(const uint8_t* src_y,
                             uint8_t* dst_argb4444,
                             const struct YuvConstants* yuvconstants,
                             int width) {
-  asm volatile(
+  asm volatile (
      YUVTORGB_SETUP
      "vmov.u8 d6, #255 \n"
      "vmov.u8 d7, #0x0f \n"  // vbic bits to clear

@@ -404,7 +404,7 @@ void I400ToARGBRow_NEON(const uint8_t* src_y,
                         uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width) {
-  asm volatile(
+  asm volatile (
      YUVTORGB_SETUP
      "vmov.u8 d6, #255 \n"
      "1: \n" READYUV400 YUVTORGB

@@ -421,7 +421,7 @@ void I400ToARGBRow_NEON(const uint8_t* src_y,
 }
 
 void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
-  asm volatile(
+  asm volatile (
      "vmov.u8 d23, #255 \n"
      "1: \n"
      "vld1.8 {d20}, [%0]! \n"

@@ -442,7 +442,7 @@ void NV12ToARGBRow_NEON(const uint8_t* src_y,
                         uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width) {
-  asm volatile(
+  asm volatile (
      YUVTORGB_SETUP
      "vmov.u8 d6, #255 \n"
      "1: \n" READNV12 YUVTORGB RGBTORGB8

@@ -463,7 +463,7 @@ void NV21ToARGBRow_NEON(const uint8_t* src_y,
                         uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width) {
-  asm volatile(
+  asm volatile (
      YUVTORGB_SETUP
      "vmov.u8 d6, #255 \n"
      "1: \n" READNV21 YUVTORGB RGBTORGB8

@@ -484,7 +484,7 @@ void NV12ToRGB24Row_NEON(const uint8_t* src_y,
                          uint8_t* dst_rgb24,
                          const struct YuvConstants* yuvconstants,
                          int width) {
-  asm volatile(
+  asm volatile (
      YUVTORGB_SETUP
      "vmov.u8 d6, #255 \n"
      "1: \n" READNV12 YUVTORGB RGBTORGB8

@@ -505,7 +505,7 @@ void NV21ToRGB24Row_NEON(const uint8_t* src_y,
                          uint8_t* dst_rgb24,
                          const struct YuvConstants* yuvconstants,
                          int width) {
-  asm volatile(
+  asm volatile (
      YUVTORGB_SETUP
      "vmov.u8 d6, #255 \n"
      "1: \n" READNV21 YUVTORGB RGBTORGB8

@@ -526,7 +526,7 @@ void NV12ToRGB565Row_NEON(const uint8_t* src_y,
                           uint8_t* dst_rgb565,
                           const struct YuvConstants* yuvconstants,
                           int width) {
-  asm volatile(
+  asm volatile (
      YUVTORGB_SETUP
      "vmov.u8 d6, #255 \n"
      "1: \n" READNV12 YUVTORGB RGBTORGB8

@@ -546,7 +546,7 @@ void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2,
                         uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width) {
-  asm volatile(
+  asm volatile (
      YUVTORGB_SETUP
      "vmov.u8 d6, #255 \n"
      "1: \n" READYUY2 YUVTORGB RGBTORGB8

@@ -565,7 +565,7 @@ void UYVYToARGBRow_NEON(const uint8_t* src_uyvy,
                         uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width) {
-  asm volatile(
+  asm volatile (
      YUVTORGB_SETUP
      "vmov.u8 d6, #255 \n"
      "1: \n" READUYVY YUVTORGB RGBTORGB8
@@ -585,7 +585,7 @@ void SplitUVRow_NEON(const uint8_t* src_uv,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
-  asm volatile(
+  asm volatile (
      "1: \n"
      "vld2.8 {q0, q1}, [%0]! \n"  // load 16 pairs of UV
      "subs %3, %3, #16 \n"  // 16 processed per loop

@@ -609,7 +609,7 @@ void DetileRow_NEON(const uint8_t* src,
                     ptrdiff_t src_tile_stride,
                     uint8_t* dst,
                     int width) {
-  asm volatile(
+  asm volatile (
      "1: \n"
      "vld1.8 {q0}, [%0], %3 \n"  // load 16 bytes
      "subs %2, %2, #16 \n"  // 16 processed per loop

@@ -629,7 +629,7 @@ void DetileRow_16_NEON(const uint16_t* src,
                        ptrdiff_t src_tile_stride,
                        uint16_t* dst,
                        int width) {
-  asm volatile(
+  asm volatile (
      "1: \n"
      "vld1.16 {q0, q1}, [%0], %3 \n"  // load 16 pixels
      "subs %2, %2, #16 \n"  // 16 processed per loop

@@ -650,7 +650,7 @@ void DetileSplitUVRow_NEON(const uint8_t* src_uv,
                            uint8_t* dst_u,
                            uint8_t* dst_v,
                            int width) {
-  asm volatile(
+  asm volatile (
      "1: \n"
      "vld2.8 {d0, d1}, [%0], %4 \n"
      "subs %3, %3, #16 \n"

@@ -675,7 +675,7 @@ void DetileToYUY2_NEON(const uint8_t* src_y,
                        ptrdiff_t src_uv_tile_stride,
                        uint8_t* dst_yuy2,
                        int width) {
-  asm volatile(
+  asm volatile (
      "1: \n"
      "vld1.8 {q0}, [%0], %4 \n"  // Load 16 Y
      "pld [%0, #1792] \n"

@@ -701,7 +701,7 @@ void DetileToYUY2_NEON(const uint8_t* src_y,
                        ptrdiff_t src_uv_tile_stride,
                        uint8_t* dst_yuy2,
                        int width) {
-  asm volatile(
+  asm volatile (
      "1: \n"
      "vld1.8 {q0}, [%0], %4 \n"  // Load 16 Y
      "vld1.8 {q1}, [%1], %5 \n"  // Load 8 UV

@@ -723,7 +723,7 @@ void DetileToYUY2_NEON(const uint8_t* src_y,
 #endif
 
 void UnpackMT2T_NEON(const uint8_t* src, uint16_t* dst, size_t size) {
-  asm volatile(
+  asm volatile (
      "1: \n"
      "vld1.8 {q14}, [%0]! \n"  // Load lower bits.
      "vld1.8 {q9}, [%0]! \n"  // Load upper bits row

@@ -767,7 +767,7 @@ void MergeUVRow_NEON(const uint8_t* src_u,
                      const uint8_t* src_v,
                      uint8_t* dst_uv,
                      int width) {
-  asm volatile(
+  asm volatile (
      "1: \n"
      "vld1.8 {q0}, [%0]! \n"  // load U
      "vld1.8 {q1}, [%1]! \n"  // load V

@@ -789,7 +789,7 @@ void SplitRGBRow_NEON(const uint8_t* src_rgb,
                       uint8_t* dst_g,
                       uint8_t* dst_b,
                       int width) {
-  asm volatile(
+  asm volatile (
      "1: \n"
      "vld3.8 {d0, d2, d4}, [%0]! \n"  // load 8 RGB
      "vld3.8 {d1, d3, d5}, [%0]! \n"  // next 8 RGB

@@ -814,7 +814,7 @@ void MergeRGBRow_NEON(const uint8_t* src_r,
                       const uint8_t* src_b,
                       uint8_t* dst_rgb,
                       int width) {
-  asm volatile(
+  asm volatile (
      "1: \n"
      "vld1.8 {q0}, [%0]! \n"  // load R
      "vld1.8 {q1}, [%1]! \n"  // load G

@@ -840,7 +840,7 @@ void SplitARGBRow_NEON(const uint8_t* src_argb,
                        uint8_t* dst_b,
                        uint8_t* dst_a,
                        int width) {
-  asm volatile(
+  asm volatile (
      "1: \n"
      "vld4.8 {d0, d2, d4, d6}, [%0]! \n"  // load 8 ARGB
      "vld4.8 {d1, d3, d5, d7}, [%0]! \n"  // next 8 ARGB

@@ -868,7 +868,7 @@ void MergeARGBRow_NEON(const uint8_t* src_r,
                        const uint8_t* src_a,
                        uint8_t* dst_argb,
                        int width) {
-  asm volatile(
+  asm volatile (
      "1: \n"
      "vld1.8 {q2}, [%0]! \n"  // load R
      "vld1.8 {q1}, [%1]! \n"  // load G

@@ -895,7 +895,7 @@ void SplitXRGBRow_NEON(const uint8_t* src_argb,
                        uint8_t* dst_g,
                        uint8_t* dst_b,
                        int width) {
-  asm volatile(
+  asm volatile (
      "1: \n"
      "vld4.8 {d0, d2, d4, d6}, [%0]! \n"  // load 8 ARGB
      "vld4.8 {d1, d3, d5, d7}, [%0]! \n"  // next 8 ARGB

@@ -920,7 +920,7 @@ void MergeXRGBRow_NEON(const uint8_t* src_r,
                        const uint8_t* src_b,
                        uint8_t* dst_argb,
                        int width) {
-  asm volatile(
+  asm volatile (
      "vmov.u8 q3, #255 \n"  // load A(255)
      "1: \n"
      "vld1.8 {q2}, [%0]! \n"  // load R

@@ -947,7 +947,7 @@ void MergeXR30Row_NEON(const uint16_t* src_r,
                        int depth,
                        int width) {
   int shift = 10 - depth;
-  asm volatile(
+  asm volatile (
      "vmov.u32 q14, #1023 \n"
      "vdup.32 q15, %5 \n"
      "1: \n"

@@ -984,7 +984,7 @@ void MergeXR30Row_10_NEON(const uint16_t* src_r,
                           uint8_t* dst_ar30,
                           int /* depth */,
                           int width) {
-  asm volatile(
+  asm volatile (
      "vmov.u32 q14, #1023 \n"
      "1: \n"
      "vld1.16 {d4}, [%2]! \n"  // B

@@ -1021,7 +1021,7 @@ void MergeAR64Row_NEON(const uint16_t* src_r,
                        int width) {
   int shift = 16 - depth;
   int mask = (1 << depth) - 1;
-  asm volatile(
+  asm volatile (
 
      "vdup.u16 q15, %6 \n"
      "vdup.u16 q14, %7 \n"

@@ -1061,7 +1061,7 @@ void MergeXR64Row_NEON(const uint16_t* src_r,
                        int width) {
   int shift = 16 - depth;
   int mask = (1 << depth) - 1;
-  asm volatile(
+  asm volatile (
 
      "vmov.u8 q3, #0xff \n"  // A (0xffff)
      "vdup.u16 q15, %5 \n"

@@ -1098,7 +1098,7 @@ void MergeARGB16To8Row_NEON(const uint16_t* src_r,
                             int depth,
                             int width) {
   int shift = 8 - depth;
-  asm volatile(
+  asm volatile (
 
      "vdup.16 q15, %6 \n"
      "1: \n"

@@ -1134,7 +1134,7 @@ void MergeXRGB16To8Row_NEON(const uint16_t* src_r,
                             int depth,
                             int width) {
   int shift = 8 - depth;
-  asm volatile(
+  asm volatile (
 
      "vdup.16 q15, %5 \n"
      "vmov.u8 d6, #0xff \n"  // A (0xff)
@@ -1162,7 +1162,7 @@ void MergeXRGB16To8Row_NEON(const uint16_t* src_r,
 
 // Copy multiple of 32.  vld4.8 allow unaligned and is fastest on a15.
 void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
-  asm volatile(
+  asm volatile (
      "1: \n"
      "vld1.8 {d0, d1, d2, d3}, [%0]! \n"  // load 32
      "subs %2, %2, #32 \n"  // 32 processed per loop

@@ -1178,7 +1178,7 @@ void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
 
 // SetRow writes 'width' bytes using an 8 bit value repeated.
 void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) {
-  asm volatile(
+  asm volatile (
      "vdup.8 q0, %2 \n"  // duplicate 16 bytes
      "1: \n"
      "subs %1, %1, #16 \n"  // 16 bytes per loop

@@ -1192,7 +1192,7 @@ void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) {
 
 // ARGBSetRow writes 'width' pixels using an 32 bit value repeated.
 void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) {
-  asm volatile(
+  asm volatile (
      "vdup.u32 q0, %2 \n"  // duplicate 4 ints
      "1: \n"
      "subs %1, %1, #4 \n"  // 4 pixels per loop

@@ -1205,7 +1205,7 @@ void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) {
 }
 
 void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
-  asm volatile(
+  asm volatile (
      // Start at end of source row.
      "add %0, %0, %2 \n"
      "sub %0, %0, #32 \n"  // 32 bytes per loop

@@ -1227,7 +1227,7 @@ void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
 }
 
 void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
-  asm volatile(
+  asm volatile (
      // Start at end of source row.
      "mov r12, #-16 \n"
      "add %0, %0, %2, lsl #1 \n"

@@ -1250,7 +1250,7 @@ void MirrorSplitUVRow_NEON(const uint8_t* src_uv,
                            uint8_t* dst_u,
                            uint8_t* dst_v,
                            int width) {
-  asm volatile(
+  asm volatile (
      // Start at end of source row.
      "mov r12, #-16 \n"
      "add %0, %0, %3, lsl #1 \n"

@@ -1272,7 +1272,7 @@ void MirrorSplitUVRow_NEON(const uint8_t* src_uv,
 }
 
 void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
-  asm volatile(
+  asm volatile (
      "add %0, %0, %2, lsl #2 \n"
      "sub %0, #32 \n"
 

@@ -1296,7 +1296,7 @@ void RGB24MirrorRow_NEON(const uint8_t* src_rgb24,
                          uint8_t* dst_rgb24,
                          int width) {
   src_rgb24 += width * 3 - 24;
-  asm volatile(
+  asm volatile (
      "1: \n"
      "vld3.8 {d0, d1, d2}, [%0], %3 \n"  // src -= 24
      "subs %2, #8 \n"  // 8 pixels per loop.

@@ -1315,7 +1315,7 @@
 void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
                          uint8_t* dst_argb,
                          int width) {
-  asm volatile(
+  asm volatile (
      "vmov.u8 d4, #255 \n"  // Alpha
      "1: \n"
      "vld3.8 {d1, d2, d3}, [%0]! \n"  // load 8 pixels of RGB24.

@@ -1331,7 +1331,7 @@ void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
 }
 
 void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
-  asm volatile(
+  asm volatile (
      "vmov.u8 d4, #255 \n"  // Alpha
      "1: \n"
      "vld3.8 {d1, d2, d3}, [%0]! \n"  // load 8 pixels of RAW.

@@ -1348,7 +1348,7 @@ void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
 }
 
 void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
-  asm volatile(
+  asm volatile (
      "vmov.u8 d0, #255 \n"  // Alpha
      "1: \n"
      "vld3.8 {d1, d2, d3}, [%0]! \n"  // load 8 pixels of RAW.

@@ -1364,7 +1364,7 @@ void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
   );
 }
 void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
-  asm volatile(
+  asm volatile (
      "1: \n"
      "vld3.8 {d1, d2, d3}, [%0]! \n"  // load 8 pixels of RAW.
      "subs %2, %2, #8 \n"  // 8 processed per loop.

@@ -1395,7 +1395,7 @@ void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
 void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
                           uint8_t* dst_argb,
                           int width) {
-  asm volatile(
+  asm volatile (
      "vmov.u8 d3, #255 \n"  // Alpha
      "1: \n"
      "vld1.8 {q0}, [%0]! \n"  // load 8 RGB565 pixels.

@@ -1441,7 +1441,7 @@ void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
 void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
                             uint8_t* dst_argb,
                             int width) {
-  asm volatile(
+  asm volatile (
      "vmov.u8 d3, #255 \n"  // Alpha
      "1: \n"
      "vld1.8 {q0}, [%0]! \n"  // load 8 ARGB1555 pixels.

@@ -1470,7 +1470,7 @@ void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
 void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
                             uint8_t* dst_argb,
                             int width) {
-  asm volatile(
+  asm volatile (
      "vmov.u8 d3, #255 \n"  // Alpha
      "1: \n"
      "vld1.8 {q0}, [%0]! \n"  // load 8 ARGB4444 pixels.

@@ -1489,7 +1489,7 @@ void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
 void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
                          uint8_t* dst_rgb24,
                          int width) {
-  asm volatile(
+  asm volatile (
      "1: \n"
      "vld4.8 {d0, d2, d4, d6}, [%0]! \n"  // load 16 pixels of ARGB.
      "vld4.8 {d1, d3, d5, d7}, [%0]! \n"

@@ -1506,7 +1506,7 @@ void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
 }
 
 void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
-  asm volatile(
+  asm volatile (
      "1: \n"
      "vld4.8 {d1, d2, d3, d4}, [%0]! \n"  // load 8 pixels of ARGB.
      "subs %2, %2, #8 \n"  // 8 processed per loop.

@@ -1522,7 +1522,7 @@ void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
 }
 
 void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
-  asm volatile(
+  asm volatile (
      "1: \n"
      "vld2.8 {q0, q1}, [%0]! \n"  // load 16 pixels of YUY2.
      "subs %2, %2, #16 \n"  // 16 processed per loop.

@@ -1537,7 +1537,7 @@ void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
 }
 
 void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
-  asm volatile(
+  asm volatile (
      "1: \n"
      "vld2.8 {q0, q1}, [%0]! \n"  // load 16 pixels of UYVY.
      "subs %2, %2, #16 \n"  // 16 processed per loop.

@@ -1555,7 +1555,7 @@ void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
                          uint8_t* dst_u,
                          uint8_t* dst_v,
                          int width) {
-  asm volatile(
+  asm volatile (
      "1: \n"
      "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // load 16 pixels of YUY2.
      "subs %3, %3, #16 \n"  // 16 pixels = 8 UVs.

@@ -1575,7 +1575,7 @@ void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
                          uint8_t* dst_u,
                          uint8_t* dst_v,
                          int width) {
-  asm volatile(
+  asm volatile (
      "1: \n"
      "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // load 16 pixels of UYVY.
      "subs %3, %3, #16 \n"  // 16 pixels = 8 UVs.
@@ -1596,7 +1596,7 @@ void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width) {
-  asm volatile(
+  asm volatile (
      "add %1, %0, %1 \n"  // stride + src_yuy2
      "1: \n"
      "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // load 16 pixels of YUY2.

@@ -1623,7 +1623,7 @@ void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width) {
-  asm volatile(
+  asm volatile (
      "add %1, %0, %1 \n"  // stride + src_uyvy
      "1: \n"
      "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // load 16 pixels of UYVY.

@@ -1649,7 +1649,7 @@ void YUY2ToNVUVRow_NEON(const uint8_t* src_yuy2,
                         int stride_yuy2,
                         uint8_t* dst_uv,
                         int width) {
-  asm volatile(
+  asm volatile (
      "add %1, %0, %1 \n"  // stride + src_yuy2
      "1: \n"
      "vld2.8 {q0, q1}, [%0]! \n"  // load 16 pixels of YUY2.

@@ -1673,7 +1673,7 @@ void ARGBShuffleRow_NEON(const uint8_t* src_argb,
                          uint8_t* dst_argb,
                          const uint8_t* shuffler,
                          int width) {
-  asm volatile(
+  asm volatile (
      "vld1.8 {q2}, [%3] \n"  // shuffler
      "1: \n"
      "vld1.8 {q0}, [%0]! \n"  // load 4 pixels.

@@ -1695,7 +1695,7 @@ void I422ToYUY2Row_NEON(const uint8_t* src_y,
                         const uint8_t* src_v,
                         uint8_t* dst_yuy2,
                         int width) {
-  asm volatile(
+  asm volatile (
      "1: \n"
      "vld2.8 {d0, d2}, [%0]! \n"  // load 16 Ys
      "vld1.8 {d1}, [%1]! \n"  // load 8 Us

@@ -1717,7 +1717,7 @@ void I422ToUYVYRow_NEON(const uint8_t* src_y,
                         const uint8_t* src_v,
                         uint8_t* dst_uyvy,
                         int width) {
-  asm volatile(
+  asm volatile (
      "1: \n"
      "vld2.8 {d1, d3}, [%0]! \n"  // load 16 Ys
      "vld1.8 {d0}, [%1]! \n"  // load 8 Us

@@ -1737,7 +1737,7 @@ void I422ToUYVYRow_NEON(const uint8_t* src_y,
 void ARGBToRGB565Row_NEON(const uint8_t* src_argb,
                           uint8_t* dst_rgb565,
                           int width) {
-  asm volatile(
+  asm volatile (
      "1: \n"
      "vld4.8 {d0, d2, d4, d6}, [%0]! \n"  // load 8 pixels of ARGB.
      "subs %2, %2, #8 \n"  // 8 processed per loop.

@@ -1755,7 +1755,7 @@ void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
                                 uint8_t* dst_rgb,
                                 uint32_t dither4,
                                 int width) {
-  asm volatile(
+  asm volatile (
      "vdup.32 d7, %2 \n"  // dither4
      "1: \n"
      "vld4.8 {d0, d2, d4, d6}, [%1]! \n"  // load 8 pixels of ARGB.

@@ -1776,7 +1776,7 @@ void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
 void ARGBToARGB1555Row_NEON(const uint8_t* src_argb,
                             uint8_t* dst_argb1555,
                             int width) {
-  asm volatile(
+  asm volatile (
      "1: \n"
      "vld4.8 {d0, d2, d4, d6}, [%0]! \n"  // load 8 pixels of ARGB.
      "subs %2, %2, #8 \n"  // 8 processed per loop.

@@ -1793,7 +1793,7 @@ void ARGBToARGB1555Row_NEON(const uint8_t* src_argb,
 void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
                             uint8_t* dst_argb4444,
                             int width) {
-  asm volatile(
+  asm volatile (
      "vmov.u8 d7, #0x0f \n"  // bits to clear with
                              // vbic.
      "1: \n"

@@ -1812,7 +1812,7 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
 void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
                               uint8_t* dst_a,
                               int width) {
-  asm volatile(
+  asm volatile (
      "1: \n"
      "vld4.8 {d0, d2, d4, d6}, [%0]! \n"  // load 8 ARGB pixels
      "vld4.8 {d1, d3, d5, d7}, [%0]! \n"  // load next 8 ARGB pixels

@@ -1838,7 +1838,7 @@ void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb,
                                uint8_t* dst_v,
                                int width,
                                const struct RgbUVConstants* rgbuvconstants) {
-  asm volatile(
+  asm volatile (
 
      "vld1.8 {d0}, [%4] \n"  // load rgbuvconstants
      "vdup.u8 d24, d0[0] \n"  // UB 0.875 coefficient

@@ -2366,7 +2366,7 @@ void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
                         int width) {
-  asm volatile(
+  asm volatile (
      "add %1, %0, %1 \n"  // src_stride + src_argb
      "vmov.s16 q10, #112 / 2 \n"  // UB / VR 0.875
                                   // coefficient

@@ -2432,7 +2432,7 @@ void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555,
                           uint8_t* dst_u,
                           uint8_t* dst_v,
                           int width) {
-  asm volatile(
+  asm volatile (
      "add %1, %0, %1 \n"  // src_stride + src_argb
      "vmov.s16 q10, #112 / 2 \n"  // UB / VR 0.875
                                   // coefficient

@@ -2498,7 +2498,7 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
                           uint8_t* dst_u,
                           uint8_t* dst_v,
                           int width) {
-  asm volatile(
+  asm volatile (
      "add %1, %0, %1 \n"  // src_stride + src_argb
      "vmov.s16 q10, #112 / 2 \n"  // UB / VR 0.875
                                   // coefficient

@@ -2550,7 +2550,7 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
 }
 
 void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
-  asm volatile(
+  asm volatile (
      "vmov.u8 d24, #25 \n"  // B * 0.1016 coefficient
      "vmov.u8 d25, #129 \n"  // G * 0.5078 coefficient
      "vmov.u8 d26, #66 \n"  // R * 0.2578 coefficient

@@ -2576,7 +2576,7 @@ void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
 void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
                          uint8_t* dst_y,
                          int width) {
-  asm volatile(
+  asm volatile (
      "vmov.u8 d24, #25 \n"  // B * 0.1016 coefficient
      "vmov.u8 d25, #129 \n"  // G * 0.5078 coefficient
      "vmov.u8 d26, #66 \n"  // R * 0.2578 coefficient

@@ -2602,7 +2602,7 @@ void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
 void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
                          uint8_t* dst_y,
                          int width) {
-  asm volatile(
+  asm volatile (
      "vmov.u8 d24, #25 \n"  // B * 0.1016 coefficient
      "vmov.u8 d25, #129 \n"  // G * 0.5078 coefficient
      "vmov.u8 d26, #66 \n"  // R * 0.2578 coefficient

@@ -2628,7 +2628,7 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
 void ARGBToAR64Row_NEON(const uint8_t* src_argb,
                         uint16_t* dst_ar64,
                         int width) {
-  asm volatile(
+  asm volatile (
      "1: \n"
      "vld1.8 {q0}, [%0]! \n"
      "vld1.8 {q2}, [%0]! \n"

@@ -2651,7 +2651,7 @@ static const uvec8 kShuffleARGBToABGR = {2, 1, 0, 3, 6, 5, 4, 7,
 void ARGBToAB64Row_NEON(const uint8_t* src_argb,
                         uint16_t* dst_ab64,
                         int width) {
-  asm volatile(
+  asm volatile (
      "vld1.8 {q4}, [%3] \n"  // shuffler
|
||||||
|
|
||||||
"1: \n"
|
"1: \n"
|
||||||
@ -2677,7 +2677,7 @@ void ARGBToAB64Row_NEON(const uint8_t* src_argb,
|
|||||||
void AR64ToARGBRow_NEON(const uint16_t* src_ar64,
|
void AR64ToARGBRow_NEON(const uint16_t* src_ar64,
|
||||||
uint8_t* dst_argb,
|
uint8_t* dst_argb,
|
||||||
int width) {
|
int width) {
|
||||||
asm volatile(
|
asm volatile (
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"vld1.16 {q0}, [%0]! \n"
|
"vld1.16 {q0}, [%0]! \n"
|
||||||
"vld1.16 {q1}, [%0]! \n"
|
"vld1.16 {q1}, [%0]! \n"
|
||||||
@ -2703,7 +2703,7 @@ static const uvec8 kShuffleAB64ToARGB = {5, 3, 1, 7, 13, 11, 9, 15};
|
|||||||
void AB64ToARGBRow_NEON(const uint16_t* src_ab64,
|
void AB64ToARGBRow_NEON(const uint16_t* src_ab64,
|
||||||
uint8_t* dst_argb,
|
uint8_t* dst_argb,
|
||||||
int width) {
|
int width) {
|
||||||
asm volatile(
|
asm volatile (
|
||||||
"vld1.8 {d8}, [%3] \n" // shuffler
|
"vld1.8 {d8}, [%3] \n" // shuffler
|
||||||
|
|
||||||
"1: \n"
|
"1: \n"
|
||||||
@ -2756,7 +2756,7 @@ void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
|
|||||||
uint8_t* dst_y,
|
uint8_t* dst_y,
|
||||||
int width,
|
int width,
|
||||||
const struct RgbConstants* rgbconstants) {
|
const struct RgbConstants* rgbconstants) {
|
||||||
asm volatile(
|
asm volatile (
|
||||||
"vld1.8 {d0}, [%3] \n" // load rgbconstants
|
"vld1.8 {d0}, [%3] \n" // load rgbconstants
|
||||||
"vdup.u8 d20, d0[0] \n"
|
"vdup.u8 d20, d0[0] \n"
|
||||||
"vdup.u8 d21, d0[1] \n"
|
"vdup.u8 d21, d0[1] \n"
|
||||||
@ -2806,7 +2806,7 @@ void RGBAToYMatrixRow_NEON(const uint8_t* src_rgba,
|
|||||||
uint8_t* dst_y,
|
uint8_t* dst_y,
|
||||||
int width,
|
int width,
|
||||||
const struct RgbConstants* rgbconstants) {
|
const struct RgbConstants* rgbconstants) {
|
||||||
asm volatile(
|
asm volatile (
|
||||||
"vld1.8 {d0}, [%3] \n" // load rgbconstants
|
"vld1.8 {d0}, [%3] \n" // load rgbconstants
|
||||||
"vdup.u8 d20, d0[0] \n"
|
"vdup.u8 d20, d0[0] \n"
|
||||||
"vdup.u8 d21, d0[1] \n"
|
"vdup.u8 d21, d0[1] \n"
|
||||||
@ -2850,7 +2850,7 @@ void RGBToYMatrixRow_NEON(const uint8_t* src_rgb,
|
|||||||
uint8_t* dst_y,
|
uint8_t* dst_y,
|
||||||
int width,
|
int width,
|
||||||
const struct RgbConstants* rgbconstants) {
|
const struct RgbConstants* rgbconstants) {
|
||||||
asm volatile(
|
asm volatile (
|
||||||
"vld1.8 {d0}, [%3] \n" // load rgbconstants
|
"vld1.8 {d0}, [%3] \n" // load rgbconstants
|
||||||
"vdup.u8 d20, d0[0] \n"
|
"vdup.u8 d20, d0[0] \n"
|
||||||
"vdup.u8 d21, d0[1] \n"
|
"vdup.u8 d21, d0[1] \n"
|
||||||
@ -2902,7 +2902,7 @@ void InterpolateRow_NEON(uint8_t* dst_ptr,
|
|||||||
int dst_width,
|
int dst_width,
|
||||||
int source_y_fraction) {
|
int source_y_fraction) {
|
||||||
int y1_fraction = source_y_fraction;
|
int y1_fraction = source_y_fraction;
|
||||||
asm volatile(
|
asm volatile (
|
||||||
"cmp %4, #0 \n"
|
"cmp %4, #0 \n"
|
||||||
"beq 100f \n"
|
"beq 100f \n"
|
||||||
"add %2, %1 \n"
|
"add %2, %1 \n"
|
||||||
@ -2964,7 +2964,7 @@ void InterpolateRow_16_NEON(uint16_t* dst_ptr,
|
|||||||
int y0_fraction = 256 - y1_fraction;
|
int y0_fraction = 256 - y1_fraction;
|
||||||
const uint16_t* src_ptr1 = src_ptr + src_stride;
|
const uint16_t* src_ptr1 = src_ptr + src_stride;
|
||||||
|
|
||||||
asm volatile(
|
asm volatile (
|
||||||
"cmp %4, #0 \n"
|
"cmp %4, #0 \n"
|
||||||
"beq 100f \n"
|
"beq 100f \n"
|
||||||
"cmp %4, #128 \n"
|
"cmp %4, #128 \n"
|
||||||
@ -3019,7 +3019,7 @@ void ARGBBlendRow_NEON(const uint8_t* src_argb,
|
|||||||
const uint8_t* src_argb1,
|
const uint8_t* src_argb1,
|
||||||
uint8_t* dst_argb,
|
uint8_t* dst_argb,
|
||||||
int width) {
|
int width) {
|
||||||
asm volatile(
|
asm volatile (
|
||||||
"subs %3, #8 \n"
|
"subs %3, #8 \n"
|
||||||
"blt 89f \n"
|
"blt 89f \n"
|
||||||
// Blend 8 pixels.
|
// Blend 8 pixels.
|
||||||
@ -3078,7 +3078,7 @@ void ARGBBlendRow_NEON(const uint8_t* src_argb,
|
|||||||
void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
|
void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
|
||||||
uint8_t* dst_argb,
|
uint8_t* dst_argb,
|
||||||
int width) {
|
int width) {
|
||||||
asm volatile(
|
asm volatile (
|
||||||
"vmov.u16 q15, #0x00ff \n" // 255 for rounding up
|
"vmov.u16 q15, #0x00ff \n" // 255 for rounding up
|
||||||
|
|
||||||
// Attenuate 8 pixels.
|
// Attenuate 8 pixels.
|
||||||
@ -3107,7 +3107,7 @@ void ARGBQuantizeRow_NEON(uint8_t* dst_argb,
|
|||||||
int interval_size,
|
int interval_size,
|
||||||
int interval_offset,
|
int interval_offset,
|
||||||
int width) {
|
int width) {
|
||||||
asm volatile(
|
asm volatile (
|
||||||
"vdup.u16 q8, %2 \n"
|
"vdup.u16 q8, %2 \n"
|
||||||
"vshr.u16 q8, q8, #1 \n" // scale >>= 1
|
"vshr.u16 q8, q8, #1 \n" // scale >>= 1
|
||||||
"vdup.u16 q9, %3 \n" // interval multiply.
|
"vdup.u16 q9, %3 \n" // interval multiply.
|
||||||
@ -3149,7 +3149,7 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb,
|
|||||||
uint8_t* dst_argb,
|
uint8_t* dst_argb,
|
||||||
int width,
|
int width,
|
||||||
uint32_t value) {
|
uint32_t value) {
|
||||||
asm volatile(
|
asm volatile (
|
||||||
"vdup.u32 q0, %3 \n" // duplicate scale value.
|
"vdup.u32 q0, %3 \n" // duplicate scale value.
|
||||||
"vzip.u8 d0, d1 \n" // d0 aarrggbb.
|
"vzip.u8 d0, d1 \n" // d0 aarrggbb.
|
||||||
"vshr.u16 q0, q0, #1 \n" // scale / 2.
|
"vshr.u16 q0, q0, #1 \n" // scale / 2.
|
||||||
@ -3183,7 +3183,7 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb,
|
|||||||
// Similar to ARGBToYJ but stores ARGB.
|
// Similar to ARGBToYJ but stores ARGB.
|
||||||
// C code is (29 * b + 150 * g + 77 * r + 128) >> 8;
|
// C code is (29 * b + 150 * g + 77 * r + 128) >> 8;
|
||||||
void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
|
void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
|
||||||
asm volatile(
|
asm volatile (
|
||||||
"vmov.u8 d24, #29 \n" // B * 0.1140 coefficient
|
"vmov.u8 d24, #29 \n" // B * 0.1140 coefficient
|
||||||
"vmov.u8 d25, #150 \n" // G * 0.5870 coefficient
|
"vmov.u8 d25, #150 \n" // G * 0.5870 coefficient
|
||||||
"vmov.u8 d26, #77 \n" // R * 0.2990 coefficient
|
"vmov.u8 d26, #77 \n" // R * 0.2990 coefficient
|
||||||
@ -3210,7 +3210,7 @@ void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
|
|||||||
// g = (r * 45 + g * 88 + b * 22) >> 7
|
// g = (r * 45 + g * 88 + b * 22) >> 7
|
||||||
// r = (r * 50 + g * 98 + b * 24) >> 7
|
// r = (r * 50 + g * 98 + b * 24) >> 7
|
||||||
void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) {
|
void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) {
|
||||||
asm volatile(
|
asm volatile (
|
||||||
"vmov.u8 d20, #17 \n" // BB coefficient
|
"vmov.u8 d20, #17 \n" // BB coefficient
|
||||||
"vmov.u8 d21, #68 \n" // BG coefficient
|
"vmov.u8 d21, #68 \n" // BG coefficient
|
||||||
"vmov.u8 d22, #35 \n" // BR coefficient
|
"vmov.u8 d22, #35 \n" // BR coefficient
|
||||||
@ -3251,7 +3251,7 @@ void ARGBColorMatrixRow_NEON(const uint8_t* src_argb,
|
|||||||
uint8_t* dst_argb,
|
uint8_t* dst_argb,
|
||||||
const int8_t* matrix_argb,
|
const int8_t* matrix_argb,
|
||||||
int width) {
|
int width) {
|
||||||
asm volatile(
|
asm volatile (
|
||||||
"vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors.
|
"vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors.
|
||||||
"vmovl.s8 q0, d4 \n" // B,G coefficients s16.
|
"vmovl.s8 q0, d4 \n" // B,G coefficients s16.
|
||||||
"vmovl.s8 q1, d5 \n" // R,A coefficients s16.
|
"vmovl.s8 q1, d5 \n" // R,A coefficients s16.
|
||||||
@ -3310,7 +3310,7 @@ void ARGBMultiplyRow_NEON(const uint8_t* src_argb,
|
|||||||
const uint8_t* src_argb1,
|
const uint8_t* src_argb1,
|
||||||
uint8_t* dst_argb,
|
uint8_t* dst_argb,
|
||||||
int width) {
|
int width) {
|
||||||
asm volatile(
|
asm volatile (
|
||||||
// 8 pixel loop.
|
// 8 pixel loop.
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
|
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
|
||||||
@ -3339,7 +3339,7 @@ void ARGBAddRow_NEON(const uint8_t* src_argb,
|
|||||||
const uint8_t* src_argb1,
|
const uint8_t* src_argb1,
|
||||||
uint8_t* dst_argb,
|
uint8_t* dst_argb,
|
||||||
int width) {
|
int width) {
|
||||||
asm volatile(
|
asm volatile (
|
||||||
// 8 pixel loop.
|
// 8 pixel loop.
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
|
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
|
||||||
@ -3362,7 +3362,7 @@ void ARGBSubtractRow_NEON(const uint8_t* src_argb,
|
|||||||
const uint8_t* src_argb1,
|
const uint8_t* src_argb1,
|
||||||
uint8_t* dst_argb,
|
uint8_t* dst_argb,
|
||||||
int width) {
|
int width) {
|
||||||
asm volatile(
|
asm volatile (
|
||||||
// 8 pixel loop.
|
// 8 pixel loop.
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
|
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
|
||||||
@ -3389,7 +3389,7 @@ void SobelRow_NEON(const uint8_t* src_sobelx,
|
|||||||
const uint8_t* src_sobely,
|
const uint8_t* src_sobely,
|
||||||
uint8_t* dst_argb,
|
uint8_t* dst_argb,
|
||||||
int width) {
|
int width) {
|
||||||
asm volatile(
|
asm volatile (
|
||||||
"vmov.u8 d3, #255 \n" // alpha
|
"vmov.u8 d3, #255 \n" // alpha
|
||||||
// 8 pixel loop.
|
// 8 pixel loop.
|
||||||
"1: \n"
|
"1: \n"
|
||||||
@ -3414,7 +3414,7 @@ void SobelToPlaneRow_NEON(const uint8_t* src_sobelx,
|
|||||||
const uint8_t* src_sobely,
|
const uint8_t* src_sobely,
|
||||||
uint8_t* dst_y,
|
uint8_t* dst_y,
|
||||||
int width) {
|
int width) {
|
||||||
asm volatile(
|
asm volatile (
|
||||||
// 16 pixel loop.
|
// 16 pixel loop.
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"vld1.8 {q0}, [%0]! \n" // load 16 sobelx.
|
"vld1.8 {q0}, [%0]! \n" // load 16 sobelx.
|
||||||
@ -3440,7 +3440,7 @@ void SobelXYRow_NEON(const uint8_t* src_sobelx,
|
|||||||
const uint8_t* src_sobely,
|
const uint8_t* src_sobely,
|
||||||
uint8_t* dst_argb,
|
uint8_t* dst_argb,
|
||||||
int width) {
|
int width) {
|
||||||
asm volatile(
|
asm volatile (
|
||||||
"vmov.u8 d3, #255 \n" // alpha
|
"vmov.u8 d3, #255 \n" // alpha
|
||||||
// 8 pixel loop.
|
// 8 pixel loop.
|
||||||
"1: \n"
|
"1: \n"
|
||||||
@ -3467,7 +3467,7 @@ void SobelXRow_NEON(const uint8_t* src_y0,
|
|||||||
const uint8_t* src_y2,
|
const uint8_t* src_y2,
|
||||||
uint8_t* dst_sobelx,
|
uint8_t* dst_sobelx,
|
||||||
int width) {
|
int width) {
|
||||||
asm volatile(
|
asm volatile (
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"vld1.8 {d0}, [%0],%5 \n" // top
|
"vld1.8 {d0}, [%0],%5 \n" // top
|
||||||
"vld1.8 {d1}, [%0],%6 \n"
|
"vld1.8 {d1}, [%0],%6 \n"
|
||||||
@ -3505,7 +3505,7 @@ void SobelYRow_NEON(const uint8_t* src_y0,
|
|||||||
const uint8_t* src_y1,
|
const uint8_t* src_y1,
|
||||||
uint8_t* dst_sobely,
|
uint8_t* dst_sobely,
|
||||||
int width) {
|
int width) {
|
||||||
asm volatile(
|
asm volatile (
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"vld1.8 {d0}, [%0],%4 \n" // left
|
"vld1.8 {d0}, [%0],%4 \n" // left
|
||||||
"vld1.8 {d1}, [%1],%4 \n"
|
"vld1.8 {d1}, [%1],%4 \n"
|
||||||
@ -3542,7 +3542,7 @@ void HalfFloat1Row_NEON(const uint16_t* src,
|
|||||||
uint16_t* dst,
|
uint16_t* dst,
|
||||||
float /*unused*/,
|
float /*unused*/,
|
||||||
int width) {
|
int width) {
|
||||||
asm volatile(
|
asm volatile (
|
||||||
|
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"vld1.8 {q1}, [%0]! \n" // load 8 shorts
|
"vld1.8 {q1}, [%0]! \n" // load 8 shorts
|
||||||
@ -3568,7 +3568,7 @@ void HalfFloatRow_NEON(const uint16_t* src,
|
|||||||
uint16_t* dst,
|
uint16_t* dst,
|
||||||
float scale,
|
float scale,
|
||||||
int width) {
|
int width) {
|
||||||
asm volatile(
|
asm volatile (
|
||||||
|
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"vld1.8 {q1}, [%0]! \n" // load 8 shorts
|
"vld1.8 {q1}, [%0]! \n" // load 8 shorts
|
||||||
@ -3594,7 +3594,7 @@ void ByteToFloatRow_NEON(const uint8_t* src,
|
|||||||
float* dst,
|
float* dst,
|
||||||
float scale,
|
float scale,
|
||||||
int width) {
|
int width) {
|
||||||
asm volatile(
|
asm volatile (
|
||||||
|
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"vld1.8 {d2}, [%0]! \n" // load 8 bytes
|
"vld1.8 {d2}, [%0]! \n" // load 8 bytes
|
||||||
@ -3623,7 +3623,7 @@ void GaussCol_NEON(const uint16_t* src0,
|
|||||||
const uint16_t* src4,
|
const uint16_t* src4,
|
||||||
uint32_t* dst,
|
uint32_t* dst,
|
||||||
int width) {
|
int width) {
|
||||||
asm volatile(
|
asm volatile (
|
||||||
"vmov.u16 d6, #4 \n" // constant 4
|
"vmov.u16 d6, #4 \n" // constant 4
|
||||||
"vmov.u16 d7, #6 \n" // constant 6
|
"vmov.u16 d7, #6 \n" // constant 6
|
||||||
|
|
||||||
@ -3660,7 +3660,7 @@ void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) {
|
|||||||
const uint32_t* src1 = src + 1;
|
const uint32_t* src1 = src + 1;
|
||||||
const uint32_t* src2 = src + 2;
|
const uint32_t* src2 = src + 2;
|
||||||
const uint32_t* src3 = src + 3;
|
const uint32_t* src3 = src + 3;
|
||||||
asm volatile(
|
asm volatile (
|
||||||
"vmov.u32 q10, #4 \n" // constant 4
|
"vmov.u32 q10, #4 \n" // constant 4
|
||||||
"vmov.u32 q11, #6 \n" // constant 6
|
"vmov.u32 q11, #6 \n" // constant 6
|
||||||
|
|
||||||
@ -3698,7 +3698,7 @@ void NV21ToYUV24Row_NEON(const uint8_t* src_y,
|
|||||||
const uint8_t* src_vu,
|
const uint8_t* src_vu,
|
||||||
uint8_t* dst_yuv24,
|
uint8_t* dst_yuv24,
|
||||||
int width) {
|
int width) {
|
||||||
asm volatile(
|
asm volatile (
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"vld1.8 {q2}, [%0]! \n" // load 16 Y values
|
"vld1.8 {q2}, [%0]! \n" // load 16 Y values
|
||||||
"vld2.8 {d0, d2}, [%1]! \n" // load 8 VU values
|
"vld2.8 {d0, d2}, [%1]! \n" // load 8 VU values
|
||||||
@ -3722,7 +3722,7 @@ void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
|
|||||||
int src_stride_ayuv,
|
int src_stride_ayuv,
|
||||||
uint8_t* dst_uv,
|
uint8_t* dst_uv,
|
||||||
int width) {
|
int width) {
|
||||||
asm volatile(
|
asm volatile (
|
||||||
"add %1, %0, %1 \n" // src_stride + src_AYUV
|
"add %1, %0, %1 \n" // src_stride + src_AYUV
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels.
|
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels.
|
||||||
@ -3753,7 +3753,7 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv,
|
|||||||
int src_stride_ayuv,
|
int src_stride_ayuv,
|
||||||
uint8_t* dst_vu,
|
uint8_t* dst_vu,
|
||||||
int width) {
|
int width) {
|
||||||
asm volatile(
|
asm volatile (
|
||||||
"add %1, %0, %1 \n" // src_stride + src_AYUV
|
"add %1, %0, %1 \n" // src_stride + src_AYUV
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels.
|
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels.
|
||||||
@ -3783,7 +3783,7 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv,
|
|||||||
// Copy row of AYUV Y's into Y.
|
// Copy row of AYUV Y's into Y.
|
||||||
// Similar to ARGBExtractAlphaRow_NEON
|
// Similar to ARGBExtractAlphaRow_NEON
|
||||||
void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
|
void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
|
||||||
asm volatile(
|
asm volatile (
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels
|
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels
|
||||||
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV pixels
|
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV pixels
|
||||||
@ -3799,7 +3799,7 @@ void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
|
|||||||
|
|
||||||
// Convert UV plane of NV12 to VU of NV21.
|
// Convert UV plane of NV12 to VU of NV21.
|
||||||
void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
|
void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
|
||||||
asm volatile(
|
asm volatile (
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"vld2.8 {d0, d2}, [%0]! \n" // load 16 UV values
|
"vld2.8 {d0, d2}, [%0]! \n" // load 16 UV values
|
||||||
"vld2.8 {d1, d3}, [%0]! \n"
|
"vld2.8 {d1, d3}, [%0]! \n"
|
||||||
@ -3822,7 +3822,7 @@ void HalfMergeUVRow_NEON(const uint8_t* src_u,
|
|||||||
int width) {
|
int width) {
|
||||||
const uint8_t* src_u_1 = src_u + src_stride_u;
|
const uint8_t* src_u_1 = src_u + src_stride_u;
|
||||||
const uint8_t* src_v_1 = src_v + src_stride_v;
|
const uint8_t* src_v_1 = src_v + src_stride_v;
|
||||||
asm volatile(
|
asm volatile (
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"vld1.8 {q0}, [%0]! \n" // load 16 U values
|
"vld1.8 {q0}, [%0]! \n" // load 16 U values
|
||||||
"vld1.8 {q1}, [%2]! \n" // load 16 V values
|
"vld1.8 {q1}, [%2]! \n" // load 16 V values
|
||||||
@ -3853,7 +3853,7 @@ void SplitUVRow_16_NEON(const uint16_t* src_uv,
|
|||||||
int depth,
|
int depth,
|
||||||
int width) {
|
int width) {
|
||||||
int shift = depth - 16; // Negative for right shift.
|
int shift = depth - 16; // Negative for right shift.
|
||||||
asm volatile(
|
asm volatile (
|
||||||
"vdup.16 q2, %4 \n"
|
"vdup.16 q2, %4 \n"
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"vld2.16 {q0, q1}, [%0]! \n" // load 8 UV
|
"vld2.16 {q0, q1}, [%0]! \n" // load 8 UV
|
||||||
@ -3877,7 +3877,7 @@ void MergeUVRow_16_NEON(const uint16_t* src_u,
|
|||||||
int depth,
|
int depth,
|
||||||
int width) {
|
int width) {
|
||||||
int shift = 16 - depth;
|
int shift = 16 - depth;
|
||||||
asm volatile(
|
asm volatile (
|
||||||
"vdup.16 q2, %4 \n"
|
"vdup.16 q2, %4 \n"
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"vld1.16 {q0}, [%0]! \n" // load 8 U
|
"vld1.16 {q0}, [%0]! \n" // load 8 U
|
||||||
@ -3899,7 +3899,7 @@ void MultiplyRow_16_NEON(const uint16_t* src_y,
|
|||||||
uint16_t* dst_y,
|
uint16_t* dst_y,
|
||||||
int scale,
|
int scale,
|
||||||
int width) {
|
int width) {
|
||||||
asm volatile(
|
asm volatile (
|
||||||
"vdup.16 q2, %3 \n"
|
"vdup.16 q2, %3 \n"
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"vld1.16 {q0}, [%0]! \n"
|
"vld1.16 {q0}, [%0]! \n"
|
||||||
@ -3921,7 +3921,7 @@ void DivideRow_16_NEON(const uint16_t* src_y,
|
|||||||
uint16_t* dst_y,
|
uint16_t* dst_y,
|
||||||
int scale,
|
int scale,
|
||||||
int width) {
|
int width) {
|
||||||
asm volatile(
|
asm volatile (
|
||||||
"vdup.16 d8, %3 \n"
|
"vdup.16 d8, %3 \n"
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"vld1.16 {q2, q3}, [%0]! \n"
|
"vld1.16 {q2, q3}, [%0]! \n"
|
||||||
@ -3953,7 +3953,7 @@ void Convert16To8Row_NEON(const uint16_t* src_y,
|
|||||||
int scale,
|
int scale,
|
||||||
int width) {
|
int width) {
|
||||||
int shift = 15 - __builtin_clz((int32_t)scale); // Negative shl is shr
|
int shift = 15 - __builtin_clz((int32_t)scale); // Negative shl is shr
|
||||||
asm volatile(
|
asm volatile (
|
||||||
"vdup.16 q2, %3 \n"
|
"vdup.16 q2, %3 \n"
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"vld1.16 {q0}, [%0]! \n"
|
"vld1.16 {q0}, [%0]! \n"
|
||||||
|
|||||||
(File diff suppressed because it is too large.)
@@ -47,7 +47,7 @@ extern "C" {
 // register) is set to round-to-nearest-up mode(0).
 #define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, yg, bb, bg, br) \
 { \
-  asm volatile("csrwi vxrm, 0"); \
+  asm volatile ("csrwi vxrm, 0"); \
 ub = yuvconst->kUVCoeff[0]; \
 vr = yuvconst->kUVCoeff[1]; \
 ug = yuvconst->kUVCoeff[2]; \
@@ -1238,7 +1238,7 @@ void I400ToARGBRow_RVV(const uint8_t* src_y,
 vuint16m4_t v_yg = __riscv_vmv_v_x_u16m4(yuvconstants->kRGBCoeffBias[0], vl);
 // To match behavior on other platforms, vxrm (fixed-point rounding mode
 // register) sets to round-to-nearest-up mode(0).
-  asm volatile("csrwi vxrm, 0");
+  asm volatile ("csrwi vxrm, 0");
 if (is_yb_positive) {
 v_yb = __riscv_vmv_v_x_u16m4(yuvconstants->kRGBCoeffBias[4] - 32, vl);
 } else {
@@ -1632,7 +1632,7 @@ void InterpolateRow_RVV(uint8_t* dst_ptr,
 }
 // To match behavior on other platforms, vxrm (fixed-point rounding mode
 // register) is set to round-to-nearest-up(0).
-  asm volatile("csrwi vxrm, 0");
+  asm volatile ("csrwi vxrm, 0");
 // Blend 50 / 50.
 if (y1_fraction == 128) {
 do {
@@ -139,7 +139,8 @@ void I444ToARGBRow_SVE2(const uint8_t* src_y,
 const struct YuvConstants* yuvconstants,
 int width) {
 uint64_t vl;
-  asm("cnth %[vl] \n"
+  asm volatile (
+ "cnth %[vl] \n"
 "ptrue p0.b \n" YUVTORGB_SVE_SETUP
 "dup z19.b, #255 \n" /* A */
 "subs %w[width], %w[width], %w[vl] \n"
@@ -181,7 +182,8 @@ void I400ToARGBRow_SVE2(const uint8_t* src_y,
 const struct YuvConstants* yuvconstants,
 int width) {
 uint64_t vl;
-  asm("cnth %[vl] \n"
+  asm volatile (
+ "cnth %[vl] \n"
 "ptrue p0.b \n"
 "dup z19.b, #255 \n" // A
 YUVTORGB_SVE_SETUP
@@ -229,7 +231,8 @@ void I422ToARGBRow_SVE2(const uint8_t* src_y,
 const struct YuvConstants* yuvconstants,
 int width) {
 uint64_t vl;
-  asm("cnth %[vl] \n"
+  asm volatile (
+ "cnth %[vl] \n"
 "ptrue p0.b \n" YUVTORGB_SVE_SETUP
 "dup z19.b, #255 \n" /* A */
 "subs %w[width], %w[width], %w[vl] \n"
@@ -273,7 +276,8 @@ void I422ToRGBARow_SVE2(const uint8_t* src_y,
 const struct YuvConstants* yuvconstants,
 int width) {
 uint64_t vl;
-  asm("cnth %[vl] \n"
+  asm volatile (
+ "cnth %[vl] \n"
 "ptrue p0.b \n" YUVTORGB_SVE_SETUP
 "dup z19.b, #255 \n" // A
 "subs %w[width], %w[width], %w[vl] \n"
@@ -318,7 +322,8 @@ void I444AlphaToARGBRow_SVE2(const uint8_t* src_y,
 const struct YuvConstants* yuvconstants,
 int width) {
 uint64_t vl;
-  asm("cnth %[vl] \n"
+  asm volatile (
+ "cnth %[vl] \n"
 "ptrue p0.b \n" YUVTORGB_SVE_SETUP
 "subs %w[width], %w[width], %w[vl] \n"
 "b.lt 2f \n"
@@ -366,7 +371,8 @@ void I422AlphaToARGBRow_SVE2(const uint8_t* src_y,
 const struct YuvConstants* yuvconstants,
 int width) {
 uint64_t vl;
-  asm("cnth %[vl] \n"
+  asm volatile (
+ "cnth %[vl] \n"
 "ptrue p0.b \n" YUVTORGB_SVE_SETUP
 "subs %w[width], %w[width], %w[vl] \n"
 "b.lt 2f \n"
@@ -416,11 +422,13 @@ static inline void NVToARGBRow_SVE2(const uint8_t* src_y,
 uint32_t nv_v_start,
 uint32_t nv_v_step) {
 uint64_t vl;
-  asm("cnth %0" : "=r"(vl));
+  asm volatile (
+ "cnth %0" : "=r"(vl));
 int width_last_y = width & (vl - 1);
 width_last_y = width_last_y == 0 ? vl : width_last_y;
 int width_last_uv = width_last_y + (width_last_y & 1);
-  asm("ptrue p0.b \n" YUVTORGB_SVE_SETUP
+  asm volatile (
+ "ptrue p0.b \n" YUVTORGB_SVE_SETUP
 "index z22.s, %w[nv_u_start], %w[nv_u_step] \n"
 "index z23.s, %w[nv_v_start], %w[nv_v_step] \n"
 "dup z19.b, #255 \n" // A
@@ -534,7 +542,7 @@ void ARGBToUVMatrixRow_SVE2(const uint8_t* src_argb,
 const int16_t* uvconstants) {
 const uint8_t* src_argb_1 = src_argb + src_stride_argb;
 uint64_t vl;
-  asm volatile(
+  asm volatile (
 "ptrue p0.b \n"
 "ld1rd {z24.d}, p0/z, [%[uvconstants]] \n"
 "ld1rd {z25.d}, p0/z, [%[uvconstants], #8] \n"
@@ -746,7 +754,8 @@ void ARGBToRGB565Row_SVE2(const uint8_t* src_argb,
 unsigned bsl_mask = 0x7e0;
 uint64_t vl;
 width *= 2;
-  asm("mov z3.h, #3 \n"
+  asm volatile (
+ "mov z3.h, #3 \n"
 "dup z4.h, %w[bsl_mask] \n"

 "cntb %[vl] \n"
@@ -787,7 +796,8 @@ void ARGBToRGB565DitherRow_SVE2(const uint8_t* src_argb,
 unsigned bsl_mask = 0x7e0;
 uint64_t vl;
 width *= 2;
-  asm("mov z3.h, #3 \n"
+  asm volatile (
+ "mov z3.h, #3 \n"
 "dup z4.h, %w[bsl_mask] \n"
 "dup z2.s, %w[dither4] \n"
 "zip1 z2.b, z2.b, z2.b \n"
@@ -844,7 +854,8 @@ void ARGB1555ToARGBRow_SVE2(const uint8_t* src_argb1555,
 uint8_t* dst_argb,
 int width) {
 uint64_t vl;
-  asm("mov z4.h, #0x0300 \n"
+  asm volatile (
+ "mov z4.h, #0x0300 \n"
 "ptrue p0.b \n"

 "cnth %x[vl] \n"
@@ -912,7 +923,8 @@ void AYUVToUVRow_SVE2(const uint8_t* src_ayuv,
 // Output a row of UV values, filtering 2x2 rows of AYUV.
 const uint8_t* src_ayuv1 = src_ayuv + src_stride_ayuv;
 int vl;
-  asm("cntb %x[vl] \n"
+  asm volatile (
+ "cntb %x[vl] \n"
 "subs %w[width], %w[width], %w[vl] \n"
 "b.lt 2f \n"

@@ -950,7 +962,8 @@ void AYUVToVURow_SVE2(const uint8_t* src_ayuv,
 // Output a row of VU values, filtering 2x2 rows of AYUV.
 const uint8_t* src_ayuv1 = src_ayuv + src_stride_ayuv;
 int vl;
-  asm("cntb %x[vl] \n"
+  asm volatile (
+ "cntb %x[vl] \n"
 "cmp %w[width], %w[vl] \n"
 "subs %w[width], %w[width], %w[vl] \n"
 "b.lt 2f \n"
@@ -990,10 +1003,12 @@ void YUY2ToARGBRow_SVE2(const uint8_t* src_yuy2,
 uint32_t nv_v_start = 0x0003'0003U;
 uint32_t nv_v_step = 0x0004'0004U;
 uint64_t vl;
-  asm("cnth %0" : "=r"(vl));
+  asm volatile (
+ "cnth %0" : "=r"(vl));
 int width_last_y = width & (vl - 1);
 int width_last_uv = width_last_y + (width_last_y & 1);
-  asm("ptrue p0.b \n"
+  asm volatile (
+ "ptrue p0.b \n"
 "index z22.s, %w[nv_u_start], %w[nv_u_step] \n"
 "index z23.s, %w[nv_v_start], %w[nv_v_step] \n"
 "dup z19.b, #255 \n" // A
@@ -1047,10 +1062,12 @@ void UYVYToARGBRow_SVE2(const uint8_t* src_uyvy,
 uint32_t nv_v_start = 0x0002'0002U;
 uint32_t nv_v_step = 0x0004'0004U;
 uint64_t vl;
-  asm("cnth %0" : "=r"(vl));
+  asm volatile (
+ "cnth %0" : "=r"(vl));
 int width_last_y = width & (vl - 1);
 int width_last_uv = width_last_y + (width_last_y & 1);
-  asm("ptrue p0.b \n"
+  asm volatile (
+ "ptrue p0.b \n"
 "index z22.s, %w[nv_u_start], %w[nv_u_step] \n"
 "index z23.s, %w[nv_v_start], %w[nv_v_step] \n"
 "dup z19.b, #255 \n" // A
@@ -193,7 +193,7 @@ void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
 uint8_t* dst_ptr,
 int dst_width) {
 (void)src_stride;
-  asm(LABELALIGN
+  asm volatile (LABELALIGN
 "1: \n"
 "vmovdqu (%0),%%ymm0 \n"
 "vmovdqu 0x20(%0),%%ymm1 \n"
@@ -472,7 +472,7 @@ void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
 "m"(kShuf1), // %1
 "m"(kShuf2) // %2
 );
-  asm(LABELALIGN
+  asm volatile (LABELALIGN
 "1: \n"
 "movdqu (%0),%%xmm0 \n"
 "movdqu 0x10(%0),%%xmm2 \n"
@@ -515,7 +515,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
 "m"(kMadd11), // %1
 "m"(kRound34) // %2
 );
-  asm(LABELALIGN
+  asm volatile (LABELALIGN
 "1: \n"
 "movdqu (%0),%%xmm6 \n"
 "movdqu 0x00(%0,%3,1),%%xmm7 \n"
@@ -578,7 +578,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
 "m"(kRound34) // %2
 );

-  asm(LABELALIGN
+  asm volatile (LABELALIGN
 "1: \n"
 "movdqu (%0),%%xmm6 \n"
 "movdqu 0x00(%0,%3,1),%%xmm7 \n"
@@ -667,7 +667,7 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
 "m"(kShufAb2), // %2
 "m"(kScaleAb2) // %3
 );
-  asm(LABELALIGN
+  asm volatile (LABELALIGN
 "1: \n"
 "movdqu (%0),%%xmm0 \n"
 "movdqu 0x00(%0,%3,1),%%xmm1 \n"
@@ -708,7 +708,7 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
 "m"(kShufAc3), // %1
 "m"(kScaleAc33) // %2
 );
-  asm(LABELALIGN
+  asm volatile (LABELALIGN
 "1: \n"
 "movdqu (%0),%%xmm0 \n"
 "movdqu 0x00(%0,%3,1),%%xmm6 \n"
@@ -821,7 +821,7 @@ void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr,
 uint8_t* dst_ptr,
 ptrdiff_t dst_stride,
 int dst_width) {
-  asm(LABELALIGN
+  asm volatile (LABELALIGN
 "1: \n"
 "pxor %%xmm0,%%xmm0 \n" // 0
 // above line
@@ -1900,7 +1900,7 @@ void ScaleColsUp2_SSE2(uint8_t* dst_ptr,
 int dx) {
 (void)x;
 (void)dx;
-  asm(LABELALIGN
+  asm volatile (LABELALIGN
 "1: \n"
 "movdqu (%1),%%xmm0 \n"
 "lea 0x10(%1),%1 \n"
@@ -1925,7 +1925,7 @@ void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
 uint8_t* dst_argb,
 int dst_width) {
 (void)src_stride;
-  asm(LABELALIGN
+  asm volatile (LABELALIGN
 "1: \n"
 "movdqu (%0),%%xmm0 \n"
 "movdqu 0x10(%0),%%xmm1 \n"
@@ -1947,7 +1947,7 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
 uint8_t* dst_argb,
 int dst_width) {
 (void)src_stride;
-  asm(LABELALIGN
+  asm volatile (LABELALIGN
 "1: \n"
 "movdqu (%0),%%xmm0 \n"
 "movdqu 0x10(%0),%%xmm1 \n"
@@ -1971,7 +1971,7 @@ void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb,
 ptrdiff_t src_stride,
 uint8_t* dst_argb,
 int dst_width) {
-  asm(LABELALIGN
+  asm volatile (LABELALIGN
 "1: \n"
 "movdqu (%0),%%xmm0 \n"
 "movdqu 0x10(%0),%%xmm1 \n"
@@ -2153,7 +2153,7 @@ void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb,
 int dx) {
 (void)x;
 (void)dx;
-  asm(LABELALIGN
+  asm volatile (LABELALIGN
 "1: \n"
 "movdqu (%1),%%xmm0 \n"
 "lea 0x10(%1),%1 \n"
@@ -28,7 +28,7 @@ void ScaleRowDown2_NEON(const uint8_t* src_ptr,
 uint8_t* dst,
 int dst_width) {
 (void)src_stride;
-  asm volatile(
+  asm volatile (
 "1: \n"
 // load even pixels into q0, odd into q1
 "vld2.8 {q0, q1}, [%0]! \n"
@@ -49,7 +49,7 @@ void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr,
 uint8_t* dst,
 int dst_width) {
 (void)src_stride;
-  asm volatile(
+  asm volatile (
 "1: \n"
 "vld2.8 {q0, q1}, [%0]! \n" // load 32 pixels
 "subs %2, %2, #16 \n" // 16 processed per loop
@@ -69,7 +69,7 @@ void ScaleRowDown2Box_NEON(const uint8_t* src_ptr,
 ptrdiff_t src_stride,
 uint8_t* dst,
 int dst_width) {
-  asm volatile(
+  asm volatile (
 // change the stride to row 2 pointer
 "add %1, %0 \n"
 "1: \n"
@@ -100,7 +100,7 @@ void ScaleRowDown4_NEON(const uint8_t* src_ptr,
 uint8_t* dst_ptr,
 int dst_width) {
 (void)src_stride;
-  asm volatile(
+  asm volatile (
 "1: \n"
 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
 "subs %2, %2, #8 \n" // 8 processed per loop
@@ -120,7 +120,7 @@ void ScaleRowDown4Box_NEON(const uint8_t* src_ptr,
 const uint8_t* src_ptr1 = src_ptr + src_stride;
 const uint8_t* src_ptr2 = src_ptr + src_stride * 2;
 const uint8_t* src_ptr3 = src_ptr + src_stride * 3;
-  asm volatile(
+  asm volatile (
 "1: \n"
 "vld1.8 {q0}, [%0]! \n" // load up 16x4
 "vld1.8 {q1}, [%3]! \n"
@@ -154,7 +154,7 @@ void ScaleRowDown34_NEON(const uint8_t* src_ptr,
 uint8_t* dst_ptr,
 int dst_width) {
 (void)src_stride;
-  asm volatile(
+  asm volatile (
 "1: \n"
 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
 "subs %2, %2, #24 \n"
@@ -172,7 +172,7 @@ void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr,
 ptrdiff_t src_stride,
 uint8_t* dst_ptr,
 int dst_width) {
-  asm volatile(
+  asm volatile (
 "vmov.u8 d24, #3 \n"
 "add %3, %0 \n"
 "1: \n"
@@ -229,7 +229,7 @@ void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr,
 ptrdiff_t src_stride,
 uint8_t* dst_ptr,
 int dst_width) {
-  asm volatile(
+  asm volatile (
 "vmov.u8 d24, #3 \n"
 "add %3, %0 \n"
 "1: \n"
@@ -281,7 +281,7 @@ void ScaleRowDown38_NEON(const uint8_t* src_ptr,
 uint8_t* dst_ptr,
 int dst_width) {
 (void)src_stride;
-  asm volatile(
+  asm volatile (
 "vld1.8 {q3}, [%3] \n"
 "1: \n"
 "vld1.8 {d0, d1, d2, d3}, [%0]! \n"
@@ -305,7 +305,7 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
 int dst_width) {
 const uint8_t* src_ptr1 = src_ptr + src_stride * 2;

-  asm volatile(
+  asm volatile (
 "vld1.16 {q13}, [%5] \n"
 "vld1.8 {q14}, [%6] \n"
 "vld1.8 {q15}, [%7] \n"
@@ -415,7 +415,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
 ptrdiff_t src_stride,
 uint8_t* dst_ptr,
 int dst_width) {
-  asm volatile(
+  asm volatile (
 "vld1.16 {q13}, [%4] \n"
 "vld1.8 {q14}, [%5] \n"
 "add %3, %0 \n"
@@ -508,7 +508,7 @@ void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr,
 uint8_t* dst_ptr,
 int dst_width) {
 const uint8_t* src_temp = src_ptr + 1;
-  asm volatile(
+  asm volatile (
 "vmov.u8 d30, #3 \n"

 "1: \n"
@@ -545,7 +545,7 @@ void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
 const uint8_t* src_temp = src_ptr + 1;
 const uint8_t* src_temp1 = src_ptr1 + 1;

-  asm volatile(
+  asm volatile (
 "vmov.u16 q15, #3 \n"
 "vmov.u8 d28, #3 \n"

@@ -607,7 +607,7 @@ void ScaleRowUp2_Linear_12_NEON(const uint16_t* src_ptr,
 uint16_t* dst_ptr,
 int dst_width) {
 const uint16_t* src_temp = src_ptr + 1;
-  asm volatile(
+  asm volatile (
 "vmov.u16 q15, #3 \n"

 "1: \n"
@@ -643,7 +643,7 @@ void ScaleRowUp2_Bilinear_12_NEON(const uint16_t* src_ptr,
 const uint16_t* src_temp = src_ptr + 1;
 const uint16_t* src_temp1 = src_ptr1 + 1;

-  asm volatile(
+  asm volatile (
 "vmov.u16 q15, #3 \n"

 "1: \n"
@@ -694,7 +694,7 @@ void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
 uint16_t* dst_ptr,
 int dst_width) {
 const uint16_t* src_temp = src_ptr + 1;
-  asm volatile(
+  asm volatile (
 "vmov.u16 d31, #3 \n"

 "1: \n"
@@ -738,7 +738,7 @@ void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
 const uint16_t* src_temp = src_ptr + 1;
 const uint16_t* src_temp1 = src_ptr1 + 1;

-  asm volatile(
+  asm volatile (
 "vmov.u16 d31, #3 \n"
 "vmov.u32 q14, #3 \n"

@@ -790,7 +790,7 @@ void ScaleUVRowUp2_Linear_NEON(const uint8_t* src_ptr,
 uint8_t* dst_ptr,
 int dst_width) {
 const uint8_t* src_temp = src_ptr + 2;
-  asm volatile(
+  asm volatile (
 "vmov.u8 d30, #3 \n"

 "1: \n"
@@ -827,7 +827,7 @@ void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
 const uint8_t* src_temp = src_ptr + 2;
 const uint8_t* src_temp1 = src_ptr1 + 2;

-  asm volatile(
+  asm volatile (
 "vmov.u16 q15, #3 \n"
 "vmov.u8 d28, #3 \n"

@@ -889,7 +889,7 @@ void ScaleUVRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
 uint16_t* dst_ptr,
 int dst_width) {
 const uint16_t* src_temp = src_ptr + 2;
-  asm volatile(
+  asm volatile (
 "vmov.u16 d30, #3 \n"

 "1: \n"
@@ -934,7 +934,7 @@ void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
 const uint16_t* src_temp = src_ptr + 2;
 const uint16_t* src_temp1 = src_ptr1 + 2;

-  asm volatile(
+  asm volatile (
 "vmov.u16 d30, #3 \n"
 "vmov.u32 q14, #3 \n"

@@ -987,7 +987,7 @@ void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
 void ScaleAddRow_NEON(const uint8_t* src_ptr,
 uint16_t* dst_ptr,
 int src_width) {
-  asm volatile(
+  asm volatile (
 "1: \n"
 "vld1.16 {q1, q2}, [%1] \n" // load accumulator
 "vld1.8 {q0}, [%0]! \n" // load 16 bytes
@@ -1086,7 +1086,7 @@ void ScaleFilterRows_NEON(uint8_t* dst_ptr,
 ptrdiff_t src_stride,
 int dst_width,
 int source_y_fraction) {
-  asm volatile(
+  asm volatile (
 "cmp %4, #0 \n"
 "beq 100f \n"
 "add %2, %1 \n"
@@ -1170,7 +1170,7 @@ void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr,
 uint8_t* dst,
 int dst_width) {
 (void)src_stride;
-  asm volatile(
+  asm volatile (
 "1: \n"
 "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
 "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
@@ -1198,7 +1198,7 @@ void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb,
 uint8_t* dst_argb,
 int dst_width) {
 (void)src_stride;
-  asm volatile(
+  asm volatile (
 "1: \n"
 "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
 "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
@@ -1219,7 +1219,7 @@ void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
 ptrdiff_t src_stride,
 uint8_t* dst,
 int dst_width) {
-  asm volatile(
+  asm volatile (
 // change the stride to row 2 pointer
 "add %1, %1, %0 \n"
 "1: \n"
@@ -1258,7 +1258,7 @@ void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb,
 uint8_t* dst_argb,
 int dst_width) {
 (void)src_stride;
-  asm volatile(
+  asm volatile (
 "mov r12, %3, lsl #2 \n"
 "1: \n"
 "vld1.32 {d0[0]}, [%0], r12 \n"
@@ -1282,7 +1282,7 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb,
 int src_stepx,
 uint8_t* dst_argb,
 int dst_width) {
-  asm volatile(
+  asm volatile (
 "mov r12, %4, lsl #2 \n"
 "add %1, %1, %0 \n"
 "1: \n"
@@ -1330,7 +1330,7 @@ void ScaleARGBCols_NEON(uint8_t* dst_argb,
 int dx) {
 int tmp;
 const uint8_t* src_tmp = src_argb;
-  asm volatile(
+  asm volatile (
 "1: \n"
 // clang-format off
 LOAD1_DATA32_LANE(d0, 0)
@@ -1433,7 +1433,7 @@ void ScaleUVRowDown2_NEON(const uint8_t* src_ptr,
 uint8_t* dst,
 int dst_width) {
 (void)src_stride;
-  asm volatile(
+  asm volatile (
 "1: \n"
 "vld2.16 {d0, d2}, [%0]! \n" // load 8 UV pixels.
 "vld2.16 {d1, d3}, [%0]! \n" // load next 8 UV
@@ -1452,7 +1452,7 @@ void ScaleUVRowDown2Linear_NEON(const uint8_t* src_ptr,
 uint8_t* dst,
 int dst_width) {
 (void)src_stride;
-  asm volatile(
+  asm volatile (
 "1: \n"
 "vld2.16 {d0, d2}, [%0]! \n" // load 8 UV pixels.
 "vld2.16 {d1, d3}, [%0]! \n" // load next 8 UV
@@ -1471,7 +1471,7 @@ void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr,
 ptrdiff_t src_stride,
 uint8_t* dst,
 int dst_width) {
-  asm volatile(
+  asm volatile (
 // change the stride to row 2 pointer
 "add %1, %1, %0 \n"
 "1: \n"
@@ -1506,7 +1506,7 @@ void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr,
 const uint8_t* src2_ptr = src_ptr + src_stepx * 4;
 const uint8_t* src3_ptr = src_ptr + src_stepx * 6;
 (void)src_stride;
-  asm volatile(
+  asm volatile (
 "1: \n"
 "vld1.16 {d0[0]}, [%0], %6 \n"
 "vld1.16 {d0[1]}, [%1], %6 \n"
@ -26,7 +26,7 @@ void ScaleRowDown2_NEON(const uint8_t* src_ptr,
|
|||||||
uint8_t* dst,
|
uint8_t* dst,
|
||||||
int dst_width) {
|
int dst_width) {
|
||||||
(void)src_stride;
|
(void)src_stride;
|
||||||
asm volatile(
|
asm volatile (
|
||||||
"1: \n"
|
"1: \n"
|
||||||
// load even pixels into v0, odd into v1
|
// load even pixels into v0, odd into v1
|
||||||
"ld2 {v0.16b,v1.16b}, [%0], #32 \n"
|
"ld2 {v0.16b,v1.16b}, [%0], #32 \n"
|
||||||
@ -48,7 +48,7 @@ void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr,
|
|||||||
uint8_t* dst,
|
uint8_t* dst,
|
||||||
int dst_width) {
|
int dst_width) {
|
||||||
(void)src_stride;
|
(void)src_stride;
|
||||||
asm volatile(
|
asm volatile (
|
||||||
"1: \n"
|
"1: \n"
|
||||||
// load even pixels into v0, odd into v1
|
// load even pixels into v0, odd into v1
|
||||||
"ld2 {v0.16b,v1.16b}, [%0], #32 \n"
|
"ld2 {v0.16b,v1.16b}, [%0], #32 \n"
|
||||||
@ -70,7 +70,7 @@ void ScaleRowDown2Box_NEON(const uint8_t* src_ptr,
|
|||||||
ptrdiff_t src_stride,
|
ptrdiff_t src_stride,
|
||||||
uint8_t* dst,
|
uint8_t* dst,
|
||||||
int dst_width) {
|
int dst_width) {
|
||||||
asm volatile(
|
asm volatile (
|
||||||
// change the stride to row 2 pointer
|
// change the stride to row 2 pointer
|
||||||
"add %1, %1, %0 \n"
|
"add %1, %1, %0 \n"
|
||||||
"1: \n"
|
"1: \n"
|
||||||
@ -101,7 +101,7 @@ void ScaleRowDown4_NEON(const uint8_t* src_ptr,
|
|||||||
uint8_t* dst_ptr,
|
uint8_t* dst_ptr,
|
||||||
int dst_width) {
|
int dst_width) {
|
||||||
(void)src_stride;
|
(void)src_stride;
|
||||||
asm volatile(
|
asm volatile (
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
|
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
|
||||||
"subs %w2, %w2, #8 \n" // 8 processed per loop
|
"subs %w2, %w2, #8 \n" // 8 processed per loop
|
||||||
@ -122,7 +122,7 @@ void ScaleRowDown4Box_NEON(const uint8_t* src_ptr,
|
|||||||
const uint8_t* src_ptr1 = src_ptr + src_stride;
|
const uint8_t* src_ptr1 = src_ptr + src_stride;
|
||||||
const uint8_t* src_ptr2 = src_ptr + src_stride * 2;
|
const uint8_t* src_ptr2 = src_ptr + src_stride * 2;
|
||||||
const uint8_t* src_ptr3 = src_ptr + src_stride * 3;
|
const uint8_t* src_ptr3 = src_ptr + src_stride * 3;
|
||||||
asm volatile(
|
asm volatile (
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"ld1 {v0.16b}, [%0], #16 \n" // load up 16x4
|
"ld1 {v0.16b}, [%0], #16 \n" // load up 16x4
|
||||||
"ld1 {v1.16b}, [%2], #16 \n"
|
"ld1 {v1.16b}, [%2], #16 \n"
|
||||||
@ -159,7 +159,7 @@ void ScaleRowDown34_NEON(const uint8_t* src_ptr,
|
|||||||
uint8_t* dst_ptr,
|
uint8_t* dst_ptr,
|
||||||
int dst_width) {
|
int dst_width) {
|
||||||
(void)src_stride;
|
(void)src_stride;
|
||||||
asm volatile(
|
asm volatile (
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
|
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
|
||||||
"subs %w2, %w2, #24 \n"
|
"subs %w2, %w2, #24 \n"
|
@@ -178,7 +178,7 @@ void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
                                uint8_t* dst_ptr,
                                int dst_width) {
-  asm volatile(
+  asm volatile (
       "movi v20.8b, #3 \n"
       "add %3, %3, %0 \n"
       "1: \n"
@@ -237,7 +237,7 @@ void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
                                uint8_t* dst_ptr,
                                int dst_width) {
-  asm volatile(
+  asm volatile (
       "movi v20.8b, #3 \n"
       "add %3, %3, %0 \n"
       "1: \n"
@@ -292,7 +292,7 @@ void ScaleRowDown38_NEON(const uint8_t* src_ptr,
                          uint8_t* dst_ptr,
                          int dst_width) {
   (void)src_stride;
-  asm volatile(
+  asm volatile (
       "ld1 {v3.16b}, [%3] \n"
       "1: \n"
       "ld1 {v0.16b,v1.16b}, [%0], #32 \n"
@@ -317,7 +317,7 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
   const uint8_t* src_ptr1 = src_ptr + src_stride * 2;
   ptrdiff_t tmp_src_stride = src_stride;
 
-  asm volatile(
+  asm volatile (
       "ld1 {v29.8h}, [%5] \n"
       "ld1 {v30.16b}, [%6] \n"
       "ld1 {v31.8h}, [%7] \n"
@@ -439,7 +439,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
                                int dst_width) {
   // TODO(fbarchard): use src_stride directly for clang 3.5+.
   ptrdiff_t tmp_src_stride = src_stride;
-  asm volatile(
+  asm volatile (
       "ld1 {v30.8h}, [%4] \n"
       "ld1 {v31.16b}, [%5] \n"
       "add %2, %2, %0 \n"
@@ -539,7 +539,7 @@ void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr,
                              uint8_t* dst_ptr,
                              int dst_width) {
   const uint8_t* src_temp = src_ptr + 1;
-  asm volatile(
+  asm volatile (
       "movi v31.8b, #3 \n"
 
       "1: \n"
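The movi of #3 above is the 3:1 blend weight of the 2x linear upsampler. A scalar sketch, assuming the usual (3*near + far + 2) >> 2 rounding for this kernel family; the edge pixels that the real kernels treat specially are omitted:

#include <stdint.h>

// 2x horizontal upsample: each source pair (a, b) yields two outputs
// weighted 3:1 and 1:3, with +2 for round-to-nearest.
static void ScaleRowUp2_Linear_sketch(const uint8_t* src, uint8_t* dst,
                                      int dst_width) {
  for (int x = 0; x < dst_width / 2; ++x) {
    uint8_t a = src[x];
    uint8_t b = src[x + 1];
    dst[2 * x + 0] = (uint8_t)((3 * a + b + 2) >> 2);
    dst[2 * x + 1] = (uint8_t)((a + 3 * b + 2) >> 2);
  }
}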
@@ -578,7 +578,7 @@ void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
   const uint8_t* src_temp = src_ptr + 1;
   const uint8_t* src_temp1 = src_ptr1 + 1;
 
-  asm volatile(
+  asm volatile (
       "movi v31.8b, #3 \n"
       "movi v30.8h, #3 \n"
 
@@ -634,7 +634,7 @@ void ScaleRowUp2_Linear_12_NEON(const uint16_t* src_ptr,
                                 uint16_t* dst_ptr,
                                 int dst_width) {
   const uint16_t* src_temp = src_ptr + 1;
-  asm volatile(
+  asm volatile (
       "movi v31.8h, #3 \n"
 
       "1: \n"
@@ -671,7 +671,7 @@ void ScaleRowUp2_Bilinear_12_NEON(const uint16_t* src_ptr,
   const uint16_t* src_temp = src_ptr + 1;
   const uint16_t* src_temp1 = src_ptr1 + 1;
 
-  asm volatile(
+  asm volatile (
       "movi v31.8h, #3 \n"
 
       "1: \n"
@@ -725,7 +725,7 @@ void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
                                 uint16_t* dst_ptr,
                                 int dst_width) {
   const uint16_t* src_temp = src_ptr + 1;
-  asm volatile(
+  asm volatile (
       "movi v31.8h, #3 \n"
 
       "1: \n"
@@ -770,7 +770,7 @@ void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
   const uint16_t* src_temp = src_ptr + 1;
   const uint16_t* src_temp1 = src_ptr1 + 1;
 
-  asm volatile(
+  asm volatile (
       "movi v31.4h, #3 \n"
       "movi v30.4s, #3 \n"
 
@@ -825,7 +825,7 @@ void ScaleUVRowUp2_Linear_NEON(const uint8_t* src_ptr,
                                uint8_t* dst_ptr,
                                int dst_width) {
   const uint8_t* src_temp = src_ptr + 2;
-  asm volatile(
+  asm volatile (
       "movi v31.8b, #3 \n"
 
       "1: \n"
@@ -864,7 +864,7 @@ void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
   const uint8_t* src_temp = src_ptr + 2;
   const uint8_t* src_temp1 = src_ptr1 + 2;
 
-  asm volatile(
+  asm volatile (
       "movi v31.8b, #3 \n"
       "movi v30.8h, #3 \n"
 
@@ -920,7 +920,7 @@ void ScaleUVRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
                                   uint16_t* dst_ptr,
                                   int dst_width) {
   const uint16_t* src_temp = src_ptr + 2;
-  asm volatile(
+  asm volatile (
       "movi v31.8h, #3 \n"
 
       "1: \n"
@@ -967,7 +967,7 @@ void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
   const uint16_t* src_temp = src_ptr + 2;
   const uint16_t* src_temp1 = src_ptr1 + 2;
 
-  asm volatile(
+  asm volatile (
       "movi v31.4h, #3 \n"
       "movi v30.4s, #3 \n"
 
@@ -1022,7 +1022,7 @@ void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
 void ScaleAddRow_NEON(const uint8_t* src_ptr,
                       uint16_t* dst_ptr,
                       int src_width) {
-  asm volatile(
+  asm volatile (
       "1: \n"
       "ld1 {v1.8h, v2.8h}, [%1] \n" // load accumulator
       "ld1 {v0.16b}, [%0], #16 \n" // load 16 bytes
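ScaleAddRow is the accumulation primitive behind the box-filtered downscalers: as the "load accumulator" comment above indicates, it widens each source byte and adds it into a 16-bit accumulator row. A minimal scalar sketch of that contract:

#include <stdint.h>

// Add one source row into a 16-bit accumulator row, widening each byte.
static void ScaleAddRow_sketch(const uint8_t* src_ptr, uint16_t* dst_ptr,
                               int src_width) {
  for (int x = 0; x < src_width; ++x) {
    dst_ptr[x] = (uint16_t)(dst_ptr[x] + src_ptr[x]);
  }
}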
@@ -1123,7 +1123,7 @@ void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr,
                             uint8_t* dst,
                             int dst_width) {
   (void)src_stride;
-  asm volatile(
+  asm volatile (
       "1: \n"
       // load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3
       "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n"
@@ -1145,7 +1145,7 @@ void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb,
                                   uint8_t* dst_argb,
                                   int dst_width) {
   (void)src_stride;
-  asm volatile(
+  asm volatile (
       "1: \n"
       // load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3
       "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n"
@@ -1169,7 +1169,7 @@ void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
                                uint8_t* dst,
                                int dst_width) {
   const uint8_t* src_ptr1 = src_ptr + src_stride;
-  asm volatile(
+  asm volatile (
       "1: \n"
       "ld2 {v0.4s, v1.4s}, [%[src]], #32 \n"
       "ld2 {v20.4s, v21.4s}, [%[src1]], #32 \n"
@@ -1200,7 +1200,7 @@ void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb,
   const uint8_t* src_argb3 = src_argb + src_stepx * 12;
   int64_t i = 0;
   (void)src_stride;
-  asm volatile(
+  asm volatile (
       "1: \n"
       "ldr w10, [%[src], %[i]] \n"
       "ldr w11, [%[src1], %[i]] \n"
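ScaleARGBRowDownEven treats each ARGB pixel as one 32-bit word and keeps every src_stepx-th one, which is what the ldr w10/w11 word loads above implement. A scalar sketch, illustration only:

#include <stdint.h>
#include <string.h>

// Copy every src_stepx-th 32-bit ARGB pixel to the destination row.
static void ScaleARGBRowDownEven_sketch(const uint8_t* src_argb, int src_stepx,
                                        uint8_t* dst_argb, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    memcpy(dst_argb + 4 * x, src_argb + 4 * (x * src_stepx), 4);
  }
}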
@@ -1232,7 +1232,7 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb,
                                   int src_stepx,
                                   uint8_t* dst_argb,
                                   int dst_width) {
-  asm volatile(
+  asm volatile (
       "add %1, %1, %0 \n"
       "1: \n"
       "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 -> 2x1
@@ -1287,7 +1287,7 @@ void ScaleARGBCols_NEON(uint8_t* dst_argb,
   int64_t x64 = (int64_t)x; // NOLINT
   int64_t dx64 = (int64_t)dx; // NOLINT
   int64_t tmp64;
-  asm volatile(
+  asm volatile (
       "1: \n"
       // clang-format off
       LOAD1_DATA32_LANE(v0, 0)
@@ -1394,7 +1394,7 @@ void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr,
                               ptrdiff_t src_stride,
                               uint16_t* dst,
                               int dst_width) {
-  asm volatile(
+  asm volatile (
       // change the stride to row 2 pointer
       "add %1, %0, %1, lsl #1 \n" // ptr + stide * 2
       "1: \n"
@@ -1426,7 +1426,7 @@ void ScaleRowUp2_16_NEON(const uint16_t* src_ptr,
                          ptrdiff_t src_stride,
                          uint16_t* dst,
                          int dst_width) {
-  asm volatile(
+  asm volatile (
       "add %1, %0, %1, lsl #1 \n" // ptr + stide * 2
       "movi v0.8h, #9 \n" // constants
       "movi v1.4s, #3 \n"
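The constants 9 and 3 loaded just above are the weights of the 2x bilinear upsampler, where each output mixes the nearest sample, its horizontal and vertical neighbours, and the diagonal as (9, 3, 3, 1)/16. A sketch of that blend, inferred from the constants rather than copied from the kernel:

#include <stdint.h>

// (9*near + 3*horiz + 3*vert + diag + 8) >> 4: the rounded 9:3:3:1 mix.
static inline uint16_t Blend9331_sketch(uint16_t near_px, uint16_t horiz,
                                        uint16_t vert, uint16_t diag) {
  return (uint16_t)((9 * near_px + 3 * horiz + 3 * vert + diag + 8) >> 4);
}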
@@ -1477,7 +1477,7 @@ void ScaleUVRowDown2_NEON(const uint8_t* src_ptr,
                           uint8_t* dst,
                           int dst_width) {
   (void)src_stride;
-  asm volatile(
+  asm volatile (
       "1: \n"
       "ld2 {v0.8h,v1.8h}, [%0], #32 \n" // load 16 UV
       "subs %w2, %w2, #8 \n" // 8 processed per loop.
@@ -1496,7 +1496,7 @@ void ScaleUVRowDown2Linear_NEON(const uint8_t* src_ptr,
                                 uint8_t* dst,
                                 int dst_width) {
   (void)src_stride;
-  asm volatile(
+  asm volatile (
       "1: \n"
       "ld2 {v0.8h,v1.8h}, [%0], #32 \n" // load 16 UV
       "subs %w2, %w2, #8 \n" // 8 processed per loop.
@@ -1515,7 +1515,7 @@ void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr,
                              ptrdiff_t src_stride,
                              uint8_t* dst,
                              int dst_width) {
-  asm volatile(
+  asm volatile (
       // change the stride to row 2 pointer
       "add %1, %1, %0 \n"
       "1: \n"
@@ -1550,7 +1550,7 @@ void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr,
   const uint8_t* src2_ptr = src_ptr + src_stepx * 4;
   const uint8_t* src3_ptr = src_ptr + src_stepx * 6;
   (void)src_stride;
-  asm volatile(
+  asm volatile (
       "1: \n"
       "ld1 {v0.h}[0], [%0], %6 \n"
       "ld1 {v1.h}[0], [%1], %6 \n"
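The remaining hunks are in the RVV scaling path, where each kernel first programs the fixed-point rounding CSR with a bare asm volatile("csrwi vxrm, 0"). For context: in GCC/Clang extended asm, a statement with no output operands is already implicitly volatile, but writing the qualifier out documents that the instruction has a side effect the compiler cannot see and must not delete as dead code. A minimal sketch:

#include <stdint.h>

// A CSR write has no C-visible result, so nothing ties it into the data
// flow; "volatile" is what keeps the optimizer from discarding it.
static inline void SetRoundToNearestUp_sketch(void) {
#if defined(__riscv) && defined(__riscv_vector)
  asm volatile("csrwi vxrm, 0");  // 0 selects round-to-nearest-up (RNU)
#endif
}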
@@ -100,7 +100,7 @@ void ScaleARGBRowDown2Linear_RVV(const uint8_t* src_argb,
   const uint32_t* src = (const uint32_t*)(src_argb);
   // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
   // register) is set to round-to-nearest-up mode(0).
-  asm volatile("csrwi vxrm, 0");
+  asm volatile ("csrwi vxrm, 0");
   do {
     vuint8m4_t v_odd, v_even, v_dst;
     vuint32m4_t v_odd_32, v_even_32;
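The NOTE in this hunk refers to the RNU (round-to-nearest-up) encoding, value 0 of vxrm. In scalar terms, RNU adds the rounding bit before the shift, so the fixed-point averaging these kernels rely on behaves like the following sketch:

#include <stdint.h>

// vxrm = 0 (RNU): the +1 is the rounding increment applied before the
// 1-bit right shift, i.e. an average rounded half-up.
static inline uint8_t AveragingAddRnu_sketch(uint8_t a, uint8_t b) {
  return (uint8_t)(((uint32_t)a + b + 1) >> 1);
}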
@@ -165,7 +165,7 @@ void ScaleARGBRowDown2Box_RVV(const uint8_t* src_argb,
   const uint32_t* src1 = (const uint32_t*)(src_argb + src_stride);
   // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
   // register) is set to round-to-nearest-up mode(0).
-  asm volatile("csrwi vxrm, 0");
+  asm volatile ("csrwi vxrm, 0");
   do {
     vuint8m4_t v_row0_odd, v_row0_even, v_row1_odd, v_row1_even, v_dst;
     vuint16m8_t v_row0_sum, v_row1_sum, v_dst_16;
@@ -262,7 +262,7 @@ void ScaleARGBRowDownEvenBox_RVV(const uint8_t* src_argb,
   const int stride_byte = src_stepx * 4;
   // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
   // register) is set to round-to-nearest-up mode(0).
-  asm volatile("csrwi vxrm, 0");
+  asm volatile ("csrwi vxrm, 0");
   do {
     vuint8m4_t v_row0_low, v_row0_high, v_row1_low, v_row1_high, v_dst;
     vuint16m8_t v_row0_sum, v_row1_sum, v_sum;
@@ -340,7 +340,7 @@ void ScaleRowDown2Linear_RVV(const uint8_t* src_ptr,
   (void)src_stride;
   // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
   // register) is set to round-to-nearest-up mode(0).
-  asm volatile("csrwi vxrm, 0");
+  asm volatile ("csrwi vxrm, 0");
   do {
     vuint8m4_t v_s0, v_s1, v_dst;
     size_t vl = __riscv_vsetvl_e8m4(w);
@@ -395,7 +395,7 @@ void ScaleRowDown2Box_RVV(const uint8_t* src_ptr,
   size_t w = (size_t)dst_width;
   // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
   // register) is set to round-to-nearest-up mode(0).
-  asm volatile("csrwi vxrm, 0");
+  asm volatile ("csrwi vxrm, 0");
   do {
     size_t vl = __riscv_vsetvl_e8m4(w);
     vuint8m4_t v_s0, v_s1, v_t0, v_t1;
@@ -528,7 +528,7 @@ void ScaleRowDown4Box_RVV(const uint8_t* src_ptr,
   size_t w = (size_t)dst_width;
   // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
   // register) is set to round-to-nearest-up mode(0).
-  asm volatile("csrwi vxrm, 0");
+  asm volatile ("csrwi vxrm, 0");
   do {
     vuint8m2_t v_s0, v_s1, v_s2, v_s3;
     vuint8m2_t v_t0, v_t1, v_t2, v_t3;
@@ -698,7 +698,7 @@ void ScaleRowDown34_0_Box_RVV(const uint8_t* src_ptr,
   const uint8_t* t = src_ptr + src_stride;
   // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
   // register) is set to round-to-nearest-up mode(0).
-  asm volatile("csrwi vxrm, 0");
+  asm volatile ("csrwi vxrm, 0");
   do {
     vuint8m2_t v_s0, v_s1, v_s2, v_s3;
     vuint16m4_t v_t0_u16, v_t1_u16, v_t2_u16, v_t3_u16;
@@ -827,7 +827,7 @@ void ScaleRowDown34_1_Box_RVV(const uint8_t* src_ptr,
   const uint8_t* t = src_ptr + src_stride;
   // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
   // register) is set to round-to-nearest-up mode(0).
-  asm volatile("csrwi vxrm, 0");
+  asm volatile ("csrwi vxrm, 0");
   do {
     vuint8m2_t v_s0, v_s1, v_s2, v_s3;
     vuint8m2_t v_ave0, v_ave1, v_ave2, v_ave3;
@@ -1490,7 +1490,7 @@ void ScaleUVRowDown2Linear_RVV(const uint8_t* src_uv,
   (void)src_stride;
   // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
   // register) is set to round-to-nearest-up mode(0).
-  asm volatile("csrwi vxrm, 0");
+  asm volatile ("csrwi vxrm, 0");
   do {
     vuint8m4_t v_u0v0, v_u1v1, v_avg;
     vuint16m4_t v_u0v0_16, v_u1v1_16;
@@ -1559,7 +1559,7 @@ void ScaleUVRowDown2Box_RVV(const uint8_t* src_uv,
   size_t w = (size_t)dst_width;
   // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
   // register) is set to round-to-nearest-up mode(0).
-  asm volatile("csrwi vxrm, 0");
+  asm volatile ("csrwi vxrm, 0");
   do {
     vuint8m2_t v_u0_row0, v_v0_row0, v_u1_row0, v_v1_row0;
     vuint8m2_t v_u0_row1, v_v0_row1, v_u1_row1, v_v1_row1;