Add volatile to gcc inline assembly to avoid it being removed by the optimizer

Bug: b/42280943
Change-Id: I4439077a92ffa6dff91d2d10accd5251b76f7544
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5671187
Reviewed-by: David Gao <davidgao@google.com>
This commit is contained in:
Frank Barchard 2024-07-01 18:18:10 -07:00
parent efd164d64e
commit 616bee5420
21 changed files with 795 additions and 610 deletions

View File

@ -1,6 +1,6 @@
Name: libyuv Name: libyuv
URL: https://chromium.googlesource.com/libyuv/libyuv/ URL: https://chromium.googlesource.com/libyuv/libyuv/
Version: 1888 Version: 1889
License: BSD License: BSD
License File: LICENSE License File: LICENSE
Shipped: yes Shipped: yes

View File

@ -20,7 +20,7 @@
({ \ ({ \
const uint8_t* psrc_lw_m = (const uint8_t*)(psrc); \ const uint8_t* psrc_lw_m = (const uint8_t*)(psrc); \
uint32_t val_m; \ uint32_t val_m; \
asm volatile("lw %[val_m], %[psrc_lw_m] \n" \ asm("lw %[val_m], %[psrc_lw_m] \n" \
: [val_m] "=r"(val_m) \ : [val_m] "=r"(val_m) \
: [psrc_lw_m] "m"(*psrc_lw_m)); \ : [psrc_lw_m] "m"(*psrc_lw_m)); \
val_m; \ val_m; \
@ -31,7 +31,7 @@
({ \ ({ \
const uint8_t* psrc_ld_m = (const uint8_t*)(psrc); \ const uint8_t* psrc_ld_m = (const uint8_t*)(psrc); \
uint64_t val_m = 0; \ uint64_t val_m = 0; \
asm volatile("ld %[val_m], %[psrc_ld_m] \n" \ asm("ld %[val_m], %[psrc_ld_m] \n" \
: [val_m] "=r"(val_m) \ : [val_m] "=r"(val_m) \
: [psrc_ld_m] "m"(*psrc_ld_m)); \ : [psrc_ld_m] "m"(*psrc_ld_m)); \
val_m; \ val_m; \
@ -55,7 +55,7 @@
({ \ ({ \
uint8_t* pdst_sw_m = (uint8_t*)(pdst); /* NOLINT */ \ uint8_t* pdst_sw_m = (uint8_t*)(pdst); /* NOLINT */ \
uint32_t val_m = (val); \ uint32_t val_m = (val); \
asm volatile("sw %[val_m], %[pdst_sw_m] \n" \ asm("sw %[val_m], %[pdst_sw_m] \n" \
: [pdst_sw_m] "=m"(*pdst_sw_m) \ : [pdst_sw_m] "=m"(*pdst_sw_m) \
: [val_m] "r"(val_m)); \ : [val_m] "r"(val_m)); \
}) })
@ -65,7 +65,7 @@
({ \ ({ \
uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */ \ uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */ \
uint64_t val_m = (val); \ uint64_t val_m = (val); \
asm volatile("sd %[val_m], %[pdst_sd_m] \n" \ asm("sd %[val_m], %[pdst_sd_m] \n" \
: [pdst_sd_m] "=m"(*pdst_sd_m) \ : [pdst_sd_m] "=m"(*pdst_sd_m) \
: [val_m] "r"(val_m)); \ : [val_m] "r"(val_m)); \
}) })
@ -86,8 +86,7 @@
uint8_t* psrc_lw_m = (uint8_t*)(psrc); \ uint8_t* psrc_lw_m = (uint8_t*)(psrc); \
uint32_t val_lw_m; \ uint32_t val_lw_m; \
\ \
__asm__ volatile( \ asm("lwr %[val_lw_m], 0(%[psrc_lw_m]) \n\t" \
"lwr %[val_lw_m], 0(%[psrc_lw_m]) \n\t" \
"lwl %[val_lw_m], 3(%[psrc_lw_m]) \n\t" \ "lwl %[val_lw_m], 3(%[psrc_lw_m]) \n\t" \
\ \
: [val_lw_m] "=&r"(val_lw_m) \ : [val_lw_m] "=&r"(val_lw_m) \
@ -102,8 +101,7 @@
uint8_t* psrc_ld_m = (uint8_t*)(psrc); \ uint8_t* psrc_ld_m = (uint8_t*)(psrc); \
uint64_t val_ld_m = 0; \ uint64_t val_ld_m = 0; \
\ \
__asm__ volatile( \ asm("ldr %[val_ld_m], 0(%[psrc_ld_m]) \n\t" \
"ldr %[val_ld_m], 0(%[psrc_ld_m]) \n\t" \
"ldl %[val_ld_m], 7(%[psrc_ld_m]) \n\t" \ "ldl %[val_ld_m], 7(%[psrc_ld_m]) \n\t" \
\ \
: [val_ld_m] "=&r"(val_ld_m) \ : [val_ld_m] "=&r"(val_ld_m) \
@ -130,7 +128,7 @@
({ \ ({ \
uint8_t* pdst_sw_m = (uint8_t*)(pdst); /* NOLINT */ \ uint8_t* pdst_sw_m = (uint8_t*)(pdst); /* NOLINT */ \
uint32_t val_m = (val); \ uint32_t val_m = (val); \
asm volatile("usw %[val_m], %[pdst_sw_m] \n" \ asm("usw %[val_m], %[pdst_sw_m] \n" \
: [pdst_sw_m] "=m"(*pdst_sw_m) \ : [pdst_sw_m] "=m"(*pdst_sw_m) \
: [val_m] "r"(val_m)); \ : [val_m] "r"(val_m)); \
}) })

View File

@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ #ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1888 #define LIBYUV_VERSION 1889
#endif // INCLUDE_LIBYUV_VERSION_H_ #endif // INCLUDE_LIBYUV_VERSION_H_

View File

@ -29,7 +29,8 @@ uint32_t HammingDistance_SSE42(const uint8_t* src_a,
int count) { int count) {
uint64_t diff; uint64_t diff;
asm("xor %3,%3 \n" asm volatile (
"xor %3,%3 \n"
"xor %%r8,%%r8 \n" "xor %%r8,%%r8 \n"
"xor %%r9,%%r9 \n" "xor %%r9,%%r9 \n"
"xor %%r10,%%r10 \n" "xor %%r10,%%r10 \n"
@ -76,7 +77,7 @@ uint32_t HammingDistance_SSE42(const uint8_t* src_a,
int count) { int count) {
uint32_t diff = 0u; uint32_t diff = 0u;
asm( asm volatile (
// Process 16 bytes per loop. // Process 16 bytes per loop.
LABELALIGN LABELALIGN
"1: \n" "1: \n"
@ -120,7 +121,8 @@ uint32_t HammingDistance_SSSE3(const uint8_t* src_a,
int count) { int count) {
uint32_t diff; uint32_t diff;
asm("movdqa %4,%%xmm2 \n" asm volatile (
"movdqa %4,%%xmm2 \n"
"movdqa %5,%%xmm3 \n" "movdqa %5,%%xmm3 \n"
"pxor %%xmm0,%%xmm0 \n" "pxor %%xmm0,%%xmm0 \n"
"pxor %%xmm1,%%xmm1 \n" "pxor %%xmm1,%%xmm1 \n"
@ -178,7 +180,8 @@ uint32_t HammingDistance_AVX2(const uint8_t* src_a,
int count) { int count) {
uint32_t diff; uint32_t diff;
asm("vbroadcastf128 %4,%%ymm2 \n" asm volatile (
"vbroadcastf128 %4,%%ymm2 \n"
"vbroadcastf128 %5,%%ymm3 \n" "vbroadcastf128 %5,%%ymm3 \n"
"vpxor %%ymm0,%%ymm0,%%ymm0 \n" "vpxor %%ymm0,%%ymm0,%%ymm0 \n"
"vpxor %%ymm1,%%ymm1,%%ymm1 \n" "vpxor %%ymm1,%%ymm1,%%ymm1 \n"
@ -231,7 +234,8 @@ uint32_t SumSquareError_SSE2(const uint8_t* src_a,
const uint8_t* src_b, const uint8_t* src_b,
int count) { int count) {
uint32_t sse; uint32_t sse;
asm("pxor %%xmm0,%%xmm0 \n" asm volatile (
"pxor %%xmm0,%%xmm0 \n"
"pxor %%xmm5,%%xmm5 \n" "pxor %%xmm5,%%xmm5 \n"
LABELALIGN LABELALIGN
@ -296,7 +300,8 @@ static const uvec32 kHashMul3 = {
uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) { uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) {
uint32_t hash; uint32_t hash;
asm("movd %2,%%xmm0 \n" asm volatile (
"movd %2,%%xmm0 \n"
"pxor %%xmm7,%%xmm7 \n" "pxor %%xmm7,%%xmm7 \n"
"movdqa %4,%%xmm6 \n" "movdqa %4,%%xmm6 \n"

View File

@ -26,7 +26,7 @@ void TransposeWx8_SSSE3(const uint8_t* src,
uint8_t* dst, uint8_t* dst,
int dst_stride, int dst_stride,
int width) { int width) {
asm( asm volatile (
// Read in the data from the source pointer. // Read in the data from the source pointer.
// First round of bit swap. // First round of bit swap.
LABELALIGN LABELALIGN
@ -116,7 +116,7 @@ void TransposeWx8_Fast_SSSE3(const uint8_t* src,
uint8_t* dst, uint8_t* dst,
int dst_stride, int dst_stride,
int width) { int width) {
asm( asm volatile (
// Read in the data from the source pointer. // Read in the data from the source pointer.
// First round of bit swap. // First round of bit swap.
LABELALIGN LABELALIGN
@ -261,7 +261,7 @@ void TransposeUVWx8_SSE2(const uint8_t* src,
uint8_t* dst_b, uint8_t* dst_b,
int dst_stride_b, int dst_stride_b,
int width) { int width) {
asm( asm volatile (
// Read in the data from the source pointer. // Read in the data from the source pointer.
// First round of bit swap. // First round of bit swap.
LABELALIGN LABELALIGN
@ -391,7 +391,7 @@ void Transpose4x4_32_SSE2(const uint8_t* src,
uint8_t* dst, uint8_t* dst,
int dst_stride, int dst_stride,
int width) { int width) {
asm( asm volatile (
// Main loop transpose 4x4. Read a column, write a row. // Main loop transpose 4x4. Read a column, write a row.
"1: \n" "1: \n"
"movdqu (%0),%%xmm0 \n" // a b c d "movdqu (%0),%%xmm0 \n" // a b c d
@ -447,7 +447,7 @@ void Transpose4x4_32_AVX2(const uint8_t* src,
uint8_t* dst, uint8_t* dst,
int dst_stride, int dst_stride,
int width) { int width) {
asm( asm volatile (
// Main loop transpose 2 blocks of 4x4. Read a column, write a row. // Main loop transpose 2 blocks of 4x4. Read a column, write a row.
"1: \n" "1: \n"
"vmovdqu (%0),%%xmm0 \n" // a b c d "vmovdqu (%0),%%xmm0 \n" // a b c d

View File

@ -51,6 +51,16 @@ extern "C" {
out3 = (v16u8)__msa_ilvl_d((v2i64)in3, (v2i64)in2); \ out3 = (v16u8)__msa_ilvl_d((v2i64)in3, (v2i64)in2); \
} }
void TransposeWx16_C(const uint8_t* src,
int src_stride,
uint8_t* dst,
int dst_stride,
int width) {
TransposeWx8_C(src, src_stride, dst, dst_stride, width);
TransposeWx8_C((src + 8 * src_stride), src_stride, (dst + 8), dst_stride,
width);
}
void TransposeUVWx16_C(const uint8_t* src, void TransposeUVWx16_C(const uint8_t* src,
int src_stride, int src_stride,
uint8_t* dst_a, uint8_t* dst_a,

View File

@ -27,7 +27,7 @@ void TransposeWx8_NEON(const uint8_t* src,
int dst_stride, int dst_stride,
int width) { int width) {
const uint8_t* temp; const uint8_t* temp;
asm( asm volatile (
// loops are on blocks of 8. loop will stop when // loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter // counter gets to or below 0. starting the counter
// at w-8 allow for this // at w-8 allow for this
@ -95,7 +95,7 @@ void TransposeUVWx8_NEON(const uint8_t* src,
int dst_stride_b, int dst_stride_b,
int width) { int width) {
const uint8_t* temp; const uint8_t* temp;
asm( asm volatile (
// loops are on blocks of 8. loop will stop when // loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter // counter gets to or below 0. starting the counter
// at w-8 allow for this // at w-8 allow for this

View File

@ -27,7 +27,8 @@ void TransposeWx16_NEON(const uint8_t* src,
int dst_stride, int dst_stride,
int width) { int width) {
const uint8_t* src_temp; const uint8_t* src_temp;
asm("1: \n" asm volatile (
"1: \n"
"mov %[src_temp], %[src] \n" "mov %[src_temp], %[src] \n"
"ld1 {v16.16b}, [%[src_temp]], %[src_stride] \n" "ld1 {v16.16b}, [%[src_temp]], %[src_stride] \n"
@ -144,7 +145,7 @@ void TransposeUVWx8_NEON(const uint8_t* src,
int dst_stride_b, int dst_stride_b,
int width) { int width) {
const uint8_t* temp; const uint8_t* temp;
asm( asm volatile (
// loops are on blocks of 8. loop will stop when // loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter // counter gets to or below 0. starting the counter
// at w-8 allow for this // at w-8 allow for this

File diff suppressed because it is too large Load Diff

View File

@ -2805,8 +2805,7 @@ static void ARGBToYMatrixRow_LSX(const uint8_t* src_argb,
uint8_t* dst_y, uint8_t* dst_y,
int width, int width,
const struct RgbConstants* rgbconstants) { const struct RgbConstants* rgbconstants) {
asm volatile( asm("vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants
"vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants
"vldrepl.b $vr1, %3, 1 \n\t" // load rgbconstants "vldrepl.b $vr1, %3, 1 \n\t" // load rgbconstants
"vldrepl.b $vr2, %3, 2 \n\t" // load rgbconstants "vldrepl.b $vr2, %3, 2 \n\t" // load rgbconstants
"vldrepl.h $vr3, %3, 4 \n\t" // load rgbconstants "vldrepl.h $vr3, %3, 4 \n\t" // load rgbconstants
@ -2864,8 +2863,7 @@ static void RGBAToYMatrixRow_LSX(const uint8_t* src_rgba,
uint8_t* dst_y, uint8_t* dst_y,
int width, int width,
const struct RgbConstants* rgbconstants) { const struct RgbConstants* rgbconstants) {
asm volatile( asm("vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants
"vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants
"vldrepl.b $vr1, %3, 1 \n\t" // load rgbconstants "vldrepl.b $vr1, %3, 1 \n\t" // load rgbconstants
"vldrepl.b $vr2, %3, 2 \n\t" // load rgbconstants "vldrepl.b $vr2, %3, 2 \n\t" // load rgbconstants
"vldrepl.h $vr3, %3, 4 \n\t" // load rgbconstants "vldrepl.h $vr3, %3, 4 \n\t" // load rgbconstants
@ -2922,8 +2920,7 @@ static void RGBToYMatrixRow_LSX(const uint8_t* src_rgba,
7, 9, 10, 12, 13, 15, 1, 0, 4, 0, 7, 0, 10, 7, 9, 10, 12, 13, 15, 1, 0, 4, 0, 7, 0, 10,
0, 13, 0, 16, 0, 19, 0, 22, 0, 25, 0, 28, 0, 0, 13, 0, 16, 0, 19, 0, 22, 0, 25, 0, 28, 0,
31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0}; 31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0};
asm volatile( asm("vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants
"vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants
"vldrepl.b $vr1, %3, 1 \n\t" // load rgbconstants "vldrepl.b $vr1, %3, 1 \n\t" // load rgbconstants
"vldrepl.b $vr2, %3, 2 \n\t" // load rgbconstants "vldrepl.b $vr2, %3, 2 \n\t" // load rgbconstants
"vldrepl.h $vr3, %3, 4 \n\t" // load rgbconstants "vldrepl.h $vr3, %3, 4 \n\t" // load rgbconstants

View File

@ -261,7 +261,7 @@ void I210ToAR30Row_NEON(const uint16_t* src_y,
const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias; const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
uint16_t limit = 0x3ff0; uint16_t limit = 0x3ff0;
uint16_t alpha = 0xc000; uint16_t alpha = 0xc000;
asm(YUVTORGB_SETUP asm volatile (YUVTORGB_SETUP
"dup v22.8h, %w[limit] \n" "dup v22.8h, %w[limit] \n"
"dup v23.8h, %w[alpha] \n" "dup v23.8h, %w[alpha] \n"
"1: \n" READYUV210 NVTORGB "1: \n" READYUV210 NVTORGB
@ -289,7 +289,7 @@ void I410ToAR30Row_NEON(const uint16_t* src_y,
const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias; const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
uint16_t limit = 0x3ff0; uint16_t limit = 0x3ff0;
uint16_t alpha = 0xc000; uint16_t alpha = 0xc000;
asm(YUVTORGB_SETUP asm volatile (YUVTORGB_SETUP
"dup v22.8h, %w[limit] \n" "dup v22.8h, %w[limit] \n"
"dup v23.8h, %w[alpha] \n" "dup v23.8h, %w[alpha] \n"
"1: \n" READYUV410 NVTORGB "1: \n" READYUV410 NVTORGB
@ -313,7 +313,7 @@ void I210ToARGBRow_NEON(const uint16_t* src_y,
uint8_t* dst_argb, uint8_t* dst_argb,
const struct YuvConstants* yuvconstants, const struct YuvConstants* yuvconstants,
int width) { int width) {
asm(YUVTORGB_SETUP asm volatile (YUVTORGB_SETUP
"movi v19.8b, #255 \n" "movi v19.8b, #255 \n"
"1: \n" READYUV210 NVTORGB RGBTORGB8 "1: \n" READYUV210 NVTORGB RGBTORGB8
"subs %w[width], %w[width], #8 \n" "subs %w[width], %w[width], #8 \n"
@ -335,7 +335,7 @@ void I410ToARGBRow_NEON(const uint16_t* src_y,
uint8_t* dst_argb, uint8_t* dst_argb,
const struct YuvConstants* yuvconstants, const struct YuvConstants* yuvconstants,
int width) { int width) {
asm(YUVTORGB_SETUP asm volatile (YUVTORGB_SETUP
"movi v19.8b, #255 \n" "movi v19.8b, #255 \n"
"1: \n" READYUV410 NVTORGB RGBTORGB8 "1: \n" READYUV410 NVTORGB RGBTORGB8
"subs %w[width], %w[width], #8 \n" "subs %w[width], %w[width], #8 \n"
@ -408,7 +408,7 @@ void I410AlphaToARGBRow_NEON(const uint16_t* src_y,
uint8_t* dst_argb, uint8_t* dst_argb,
const struct YuvConstants* yuvconstants, const struct YuvConstants* yuvconstants,
int width) { int width) {
asm(YUVTORGB_SETUP asm volatile (YUVTORGB_SETUP
"1: \n" "1: \n"
"ld1 {v19.16b}, [%[src_a]], #16 \n" READYUV410 "ld1 {v19.16b}, [%[src_a]], #16 \n" READYUV410
"uqshrn v19.8b, v19.8h, #2 \n" NVTORGB RGBTORGB8 "uqshrn v19.8b, v19.8h, #2 \n" NVTORGB RGBTORGB8
@ -433,7 +433,7 @@ void I210AlphaToARGBRow_NEON(const uint16_t* src_y,
uint8_t* dst_argb, uint8_t* dst_argb,
const struct YuvConstants* yuvconstants, const struct YuvConstants* yuvconstants,
int width) { int width) {
asm(YUVTORGB_SETUP asm volatile (YUVTORGB_SETUP
"1: \n" "1: \n"
"ld1 {v19.16b}, [%[src_a]], #16 \n" READYUV210 "ld1 {v19.16b}, [%[src_a]], #16 \n" READYUV210
"uqshrn v19.8b, v19.8h, #2 \n" NVTORGB RGBTORGB8 "uqshrn v19.8b, v19.8h, #2 \n" NVTORGB RGBTORGB8
@ -591,7 +591,7 @@ void I422ToARGB1555Row_NEON(const uint8_t* src_y,
uint8_t* dst_argb1555, uint8_t* dst_argb1555,
const struct YuvConstants* yuvconstants, const struct YuvConstants* yuvconstants,
int width) { int width) {
asm(YUVTORGB_SETUP asm volatile (YUVTORGB_SETUP
"movi v19.8h, #0x80, lsl #8 \n" "movi v19.8h, #0x80, lsl #8 \n"
"1: \n" // "1: \n" //
READYUV422 I4XXTORGB RGBTORGB8_TOP READYUV422 I4XXTORGB RGBTORGB8_TOP

View File

@ -139,7 +139,8 @@ void I444ToARGBRow_SVE2(const uint8_t* src_y,
const struct YuvConstants* yuvconstants, const struct YuvConstants* yuvconstants,
int width) { int width) {
uint64_t vl; uint64_t vl;
asm("cnth %[vl] \n" asm volatile (
"cnth %[vl] \n"
"ptrue p0.b \n" YUVTORGB_SVE_SETUP "ptrue p0.b \n" YUVTORGB_SVE_SETUP
"dup z19.b, #255 \n" /* A */ "dup z19.b, #255 \n" /* A */
"subs %w[width], %w[width], %w[vl] \n" "subs %w[width], %w[width], %w[vl] \n"
@ -181,7 +182,8 @@ void I400ToARGBRow_SVE2(const uint8_t* src_y,
const struct YuvConstants* yuvconstants, const struct YuvConstants* yuvconstants,
int width) { int width) {
uint64_t vl; uint64_t vl;
asm("cnth %[vl] \n" asm volatile (
"cnth %[vl] \n"
"ptrue p0.b \n" "ptrue p0.b \n"
"dup z19.b, #255 \n" // A "dup z19.b, #255 \n" // A
YUVTORGB_SVE_SETUP YUVTORGB_SVE_SETUP
@ -229,7 +231,8 @@ void I422ToARGBRow_SVE2(const uint8_t* src_y,
const struct YuvConstants* yuvconstants, const struct YuvConstants* yuvconstants,
int width) { int width) {
uint64_t vl; uint64_t vl;
asm("cnth %[vl] \n" asm volatile (
"cnth %[vl] \n"
"ptrue p0.b \n" YUVTORGB_SVE_SETUP "ptrue p0.b \n" YUVTORGB_SVE_SETUP
"dup z19.b, #255 \n" /* A */ "dup z19.b, #255 \n" /* A */
"subs %w[width], %w[width], %w[vl] \n" "subs %w[width], %w[width], %w[vl] \n"
@ -273,7 +276,8 @@ void I422ToRGBARow_SVE2(const uint8_t* src_y,
const struct YuvConstants* yuvconstants, const struct YuvConstants* yuvconstants,
int width) { int width) {
uint64_t vl; uint64_t vl;
asm("cnth %[vl] \n" asm volatile (
"cnth %[vl] \n"
"ptrue p0.b \n" YUVTORGB_SVE_SETUP "ptrue p0.b \n" YUVTORGB_SVE_SETUP
"dup z19.b, #255 \n" // A "dup z19.b, #255 \n" // A
"subs %w[width], %w[width], %w[vl] \n" "subs %w[width], %w[width], %w[vl] \n"
@ -318,7 +322,8 @@ void I444AlphaToARGBRow_SVE2(const uint8_t* src_y,
const struct YuvConstants* yuvconstants, const struct YuvConstants* yuvconstants,
int width) { int width) {
uint64_t vl; uint64_t vl;
asm("cnth %[vl] \n" asm volatile (
"cnth %[vl] \n"
"ptrue p0.b \n" YUVTORGB_SVE_SETUP "ptrue p0.b \n" YUVTORGB_SVE_SETUP
"subs %w[width], %w[width], %w[vl] \n" "subs %w[width], %w[width], %w[vl] \n"
"b.lt 2f \n" "b.lt 2f \n"
@ -366,7 +371,8 @@ void I422AlphaToARGBRow_SVE2(const uint8_t* src_y,
const struct YuvConstants* yuvconstants, const struct YuvConstants* yuvconstants,
int width) { int width) {
uint64_t vl; uint64_t vl;
asm("cnth %[vl] \n" asm volatile (
"cnth %[vl] \n"
"ptrue p0.b \n" YUVTORGB_SVE_SETUP "ptrue p0.b \n" YUVTORGB_SVE_SETUP
"subs %w[width], %w[width], %w[vl] \n" "subs %w[width], %w[width], %w[vl] \n"
"b.lt 2f \n" "b.lt 2f \n"
@ -416,11 +422,13 @@ static inline void NVToARGBRow_SVE2(const uint8_t* src_y,
uint32_t nv_v_start, uint32_t nv_v_start,
uint32_t nv_v_step) { uint32_t nv_v_step) {
uint64_t vl; uint64_t vl;
asm("cnth %0" : "=r"(vl)); asm volatile (
"cnth %0" : "=r"(vl));
int width_last_y = width & (vl - 1); int width_last_y = width & (vl - 1);
width_last_y = width_last_y == 0 ? vl : width_last_y; width_last_y = width_last_y == 0 ? vl : width_last_y;
int width_last_uv = width_last_y + (width_last_y & 1); int width_last_uv = width_last_y + (width_last_y & 1);
asm("ptrue p0.b \n" YUVTORGB_SVE_SETUP asm volatile (
"ptrue p0.b \n" YUVTORGB_SVE_SETUP
"index z22.s, %w[nv_u_start], %w[nv_u_step] \n" "index z22.s, %w[nv_u_start], %w[nv_u_step] \n"
"index z23.s, %w[nv_v_start], %w[nv_v_step] \n" "index z23.s, %w[nv_v_start], %w[nv_v_step] \n"
"dup z19.b, #255 \n" // A "dup z19.b, #255 \n" // A
@ -746,7 +754,8 @@ void ARGBToRGB565Row_SVE2(const uint8_t* src_argb,
unsigned bsl_mask = 0x7e0; unsigned bsl_mask = 0x7e0;
uint64_t vl; uint64_t vl;
width *= 2; width *= 2;
asm("mov z3.h, #3 \n" asm volatile (
"mov z3.h, #3 \n"
"dup z4.h, %w[bsl_mask] \n" "dup z4.h, %w[bsl_mask] \n"
"cntb %[vl] \n" "cntb %[vl] \n"
@ -787,7 +796,8 @@ void ARGBToRGB565DitherRow_SVE2(const uint8_t* src_argb,
unsigned bsl_mask = 0x7e0; unsigned bsl_mask = 0x7e0;
uint64_t vl; uint64_t vl;
width *= 2; width *= 2;
asm("mov z3.h, #3 \n" asm volatile (
"mov z3.h, #3 \n"
"dup z4.h, %w[bsl_mask] \n" "dup z4.h, %w[bsl_mask] \n"
"dup z2.s, %w[dither4] \n" "dup z2.s, %w[dither4] \n"
"zip1 z2.b, z2.b, z2.b \n" "zip1 z2.b, z2.b, z2.b \n"
@ -844,7 +854,8 @@ void ARGB1555ToARGBRow_SVE2(const uint8_t* src_argb1555,
uint8_t* dst_argb, uint8_t* dst_argb,
int width) { int width) {
uint64_t vl; uint64_t vl;
asm("mov z4.h, #0x0300 \n" asm volatile (
"mov z4.h, #0x0300 \n"
"ptrue p0.b \n" "ptrue p0.b \n"
"cnth %x[vl] \n" "cnth %x[vl] \n"
@ -912,7 +923,8 @@ void AYUVToUVRow_SVE2(const uint8_t* src_ayuv,
// Output a row of UV values, filtering 2x2 rows of AYUV. // Output a row of UV values, filtering 2x2 rows of AYUV.
const uint8_t* src_ayuv1 = src_ayuv + src_stride_ayuv; const uint8_t* src_ayuv1 = src_ayuv + src_stride_ayuv;
int vl; int vl;
asm("cntb %x[vl] \n" asm volatile (
"cntb %x[vl] \n"
"subs %w[width], %w[width], %w[vl] \n" "subs %w[width], %w[width], %w[vl] \n"
"b.lt 2f \n" "b.lt 2f \n"
@ -950,7 +962,8 @@ void AYUVToVURow_SVE2(const uint8_t* src_ayuv,
// Output a row of VU values, filtering 2x2 rows of AYUV. // Output a row of VU values, filtering 2x2 rows of AYUV.
const uint8_t* src_ayuv1 = src_ayuv + src_stride_ayuv; const uint8_t* src_ayuv1 = src_ayuv + src_stride_ayuv;
int vl; int vl;
asm("cntb %x[vl] \n" asm volatile (
"cntb %x[vl] \n"
"cmp %w[width], %w[vl] \n" "cmp %w[width], %w[vl] \n"
"subs %w[width], %w[width], %w[vl] \n" "subs %w[width], %w[width], %w[vl] \n"
"b.lt 2f \n" "b.lt 2f \n"
@ -990,10 +1003,12 @@ void YUY2ToARGBRow_SVE2(const uint8_t* src_yuy2,
uint32_t nv_v_start = 0x0003'0003U; uint32_t nv_v_start = 0x0003'0003U;
uint32_t nv_v_step = 0x0004'0004U; uint32_t nv_v_step = 0x0004'0004U;
uint64_t vl; uint64_t vl;
asm("cnth %0" : "=r"(vl)); asm volatile (
"cnth %0" : "=r"(vl));
int width_last_y = width & (vl - 1); int width_last_y = width & (vl - 1);
int width_last_uv = width_last_y + (width_last_y & 1); int width_last_uv = width_last_y + (width_last_y & 1);
asm("ptrue p0.b \n" asm volatile (
"ptrue p0.b \n"
"index z22.s, %w[nv_u_start], %w[nv_u_step] \n" "index z22.s, %w[nv_u_start], %w[nv_u_step] \n"
"index z23.s, %w[nv_v_start], %w[nv_v_step] \n" "index z23.s, %w[nv_v_start], %w[nv_v_step] \n"
"dup z19.b, #255 \n" // A "dup z19.b, #255 \n" // A
@ -1047,10 +1062,12 @@ void UYVYToARGBRow_SVE2(const uint8_t* src_uyvy,
uint32_t nv_v_start = 0x0002'0002U; uint32_t nv_v_start = 0x0002'0002U;
uint32_t nv_v_step = 0x0004'0004U; uint32_t nv_v_step = 0x0004'0004U;
uint64_t vl; uint64_t vl;
asm("cnth %0" : "=r"(vl)); asm volatile (
"cnth %0" : "=r"(vl));
int width_last_y = width & (vl - 1); int width_last_y = width & (vl - 1);
int width_last_uv = width_last_y + (width_last_y & 1); int width_last_uv = width_last_y + (width_last_y & 1);
asm("ptrue p0.b \n" asm volatile (
"ptrue p0.b \n"
"index z22.s, %w[nv_u_start], %w[nv_u_step] \n" "index z22.s, %w[nv_u_start], %w[nv_u_step] \n"
"index z23.s, %w[nv_v_start], %w[nv_v_step] \n" "index z23.s, %w[nv_v_start], %w[nv_v_step] \n"
"dup z19.b, #255 \n" // A "dup z19.b, #255 \n" // A

View File

@ -193,7 +193,7 @@ void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr, uint8_t* dst_ptr,
int dst_width) { int dst_width) {
(void)src_stride; (void)src_stride;
asm(LABELALIGN asm volatile (LABELALIGN
"1: \n" "1: \n"
"vmovdqu (%0),%%ymm0 \n" "vmovdqu (%0),%%ymm0 \n"
"vmovdqu 0x20(%0),%%ymm1 \n" "vmovdqu 0x20(%0),%%ymm1 \n"
@ -472,7 +472,7 @@ void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
"m"(kShuf1), // %1 "m"(kShuf1), // %1
"m"(kShuf2) // %2 "m"(kShuf2) // %2
); );
asm(LABELALIGN asm volatile (LABELALIGN
"1: \n" "1: \n"
"movdqu (%0),%%xmm0 \n" "movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm2 \n" "movdqu 0x10(%0),%%xmm2 \n"
@ -515,7 +515,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
"m"(kMadd11), // %1 "m"(kMadd11), // %1
"m"(kRound34) // %2 "m"(kRound34) // %2
); );
asm(LABELALIGN asm volatile (LABELALIGN
"1: \n" "1: \n"
"movdqu (%0),%%xmm6 \n" "movdqu (%0),%%xmm6 \n"
"movdqu 0x00(%0,%3,1),%%xmm7 \n" "movdqu 0x00(%0,%3,1),%%xmm7 \n"
@ -578,7 +578,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
"m"(kRound34) // %2 "m"(kRound34) // %2
); );
asm(LABELALIGN asm volatile (LABELALIGN
"1: \n" "1: \n"
"movdqu (%0),%%xmm6 \n" "movdqu (%0),%%xmm6 \n"
"movdqu 0x00(%0,%3,1),%%xmm7 \n" "movdqu 0x00(%0,%3,1),%%xmm7 \n"
@ -667,7 +667,7 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
"m"(kShufAb2), // %2 "m"(kShufAb2), // %2
"m"(kScaleAb2) // %3 "m"(kScaleAb2) // %3
); );
asm(LABELALIGN asm volatile (LABELALIGN
"1: \n" "1: \n"
"movdqu (%0),%%xmm0 \n" "movdqu (%0),%%xmm0 \n"
"movdqu 0x00(%0,%3,1),%%xmm1 \n" "movdqu 0x00(%0,%3,1),%%xmm1 \n"
@ -708,7 +708,7 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
"m"(kShufAc3), // %1 "m"(kShufAc3), // %1
"m"(kScaleAc33) // %2 "m"(kScaleAc33) // %2
); );
asm(LABELALIGN asm volatile (LABELALIGN
"1: \n" "1: \n"
"movdqu (%0),%%xmm0 \n" "movdqu (%0),%%xmm0 \n"
"movdqu 0x00(%0,%3,1),%%xmm6 \n" "movdqu 0x00(%0,%3,1),%%xmm6 \n"
@ -821,7 +821,7 @@ void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr,
uint8_t* dst_ptr, uint8_t* dst_ptr,
ptrdiff_t dst_stride, ptrdiff_t dst_stride,
int dst_width) { int dst_width) {
asm(LABELALIGN asm volatile (LABELALIGN
"1: \n" "1: \n"
"pxor %%xmm0,%%xmm0 \n" // 0 "pxor %%xmm0,%%xmm0 \n" // 0
// above line // above line
@ -1900,7 +1900,7 @@ void ScaleColsUp2_SSE2(uint8_t* dst_ptr,
int dx) { int dx) {
(void)x; (void)x;
(void)dx; (void)dx;
asm(LABELALIGN asm volatile (LABELALIGN
"1: \n" "1: \n"
"movdqu (%1),%%xmm0 \n" "movdqu (%1),%%xmm0 \n"
"lea 0x10(%1),%1 \n" "lea 0x10(%1),%1 \n"
@ -1925,7 +1925,7 @@ void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
uint8_t* dst_argb, uint8_t* dst_argb,
int dst_width) { int dst_width) {
(void)src_stride; (void)src_stride;
asm(LABELALIGN asm volatile (LABELALIGN
"1: \n" "1: \n"
"movdqu (%0),%%xmm0 \n" "movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n" "movdqu 0x10(%0),%%xmm1 \n"
@ -1947,7 +1947,7 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
uint8_t* dst_argb, uint8_t* dst_argb,
int dst_width) { int dst_width) {
(void)src_stride; (void)src_stride;
asm(LABELALIGN asm volatile (LABELALIGN
"1: \n" "1: \n"
"movdqu (%0),%%xmm0 \n" "movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n" "movdqu 0x10(%0),%%xmm1 \n"
@ -1971,7 +1971,7 @@ void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8_t* dst_argb, uint8_t* dst_argb,
int dst_width) { int dst_width) {
asm(LABELALIGN asm volatile (LABELALIGN
"1: \n" "1: \n"
"movdqu (%0),%%xmm0 \n" "movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n" "movdqu 0x10(%0),%%xmm1 \n"
@ -2153,7 +2153,7 @@ void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb,
int dx) { int dx) {
(void)x; (void)x;
(void)dx; (void)dx;
asm(LABELALIGN asm volatile (LABELALIGN
"1: \n" "1: \n"
"movdqu (%1),%%xmm0 \n" "movdqu (%1),%%xmm0 \n"
"lea 0x10(%1),%1 \n" "lea 0x10(%1),%1 \n"