YUY2ToARGB use ymm6/7 for shuffle constants

- 1 load and 2 register shuffles replace 2 loads and 2 shuffles with memory operands (see the intrinsics sketch after the benchmark numbers)
- a vbroadcastf128 of a single 16-byte shuffle mask replaces the 32-byte shufflers
- bump version and apply clang-format

libyuv_test '--gunit_filter=*.???2ToARGB_Opt' --libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=1000 --libyuv_flags=-1 --libyuv_cpu_info=-1

AMD Zen2
I422ToARGB_Opt (272 ms)
NV12ToARGB_Opt (255 ms)
YUY2ToARGB_Opt (208 ms)

Was:
YUY2ToARGB_Opt (214 ms)
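
To illustrate the first two points, here is a minimal AVX2 intrinsics sketch of the new READYUY2_AVX2 structure (the committed code is GCC inline asm; this standalone helper and its output buffers are hypothetical). The two 16-byte masks are broadcast into registers once, ahead of the loop, so each 32-byte step is one load plus two register shuffles; since vpshufb shuffles each 128-bit lane independently, a broadcast 16-byte mask does the work of the old 32-byte lvec8 tables:

#include <immintrin.h>
#include <stdint.h>

// 16-byte masks from this commit.
static const int8_t kShuffleYUY2Y[16] = {0, 0, 2, 2, 4, 4, 6, 6,
                                         8, 8, 10, 10, 12, 12, 14, 14};
static const int8_t kShuffleYUY2UV[16] = {1, 3, 1, 3, 5, 7, 5, 7,
                                          9, 11, 9, 11, 13, 15, 13, 15};

// Hypothetical helper: produces the doubled-Y and upsampled-UV vectors that
// READYUY2_AVX2 feeds into the YUV-to-RGB math.
void Yuy2ReadSketch(const uint8_t* yuy2, uint8_t* y2, uint8_t* uv, int blocks) {
  // One 16-byte mask broadcast to both lanes (the asm uses vbroadcastf128).
  const __m256i shuf_y = _mm256_broadcastsi128_si256(
      _mm_loadu_si128((const __m128i*)kShuffleYUY2Y));
  const __m256i shuf_uv = _mm256_broadcastsi128_si256(
      _mm_loadu_si128((const __m128i*)kShuffleYUY2UV));
  for (int i = 0; i < blocks; ++i) {  // one block = 32 YUY2 bytes = 16 pixels
    __m256i v = _mm256_loadu_si256((const __m256i*)(yuy2 + 32 * i));  // 1 load
    // 2 register shuffles replace 2 loads plus 2 memory-operand shuffles.
    _mm256_storeu_si256((__m256i*)(y2 + 32 * i),
                        _mm256_shuffle_epi8(v, shuf_y));   // 16 Y, each doubled
    _mm256_storeu_si256((__m256i*)(uv + 32 * i),
                        _mm256_shuffle_epi8(v, shuf_uv));  // 8 UV -> 16 UV
  }
}

In the committed kernels the shuffled vectors feed YUVTORGB directly; the stores above only make the sketch self-contained.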

Change-Id: I1fa4d462d04536c877d1cab1a14586be8ed1b2f2
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5218447
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: richard winterton <rrwinterton@gmail.com>
Frank Barchard 2024-01-21 08:52:02 -08:00 committed by libyuv LUCI CQ
parent 914624f0b8
commit 3e435fe6d4
3 changed files with 315 additions and 389 deletions

README.chromium

@@ -1,6 +1,6 @@
 Name: libyuv
 URL: https://chromium.googlesource.com/libyuv/libyuv/
-Version: 1885
+Version: 1886
 License: BSD
 License File: LICENSE
 Shipped: yes

include/libyuv/version.h

@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 1885
+#define LIBYUV_VERSION 1886
 #endif  // INCLUDE_LIBYUV_VERSION_H_

source/row_gcc.cc

@@ -137,24 +137,20 @@ static const uvec8 kShuffleMaskARGBToRGB24_0 = {
     0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u};
 
 // YUY2 shuf 16 Y to 32 Y.
-static const lvec8 kShuffleYUY2Y = {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10,
-                                    10, 12, 12, 14, 14, 0, 0, 2, 2, 4, 4,
-                                    6, 6, 8, 8, 10, 10, 12, 12, 14, 14};
+static const vec8 kShuffleYUY2Y = {0, 0, 2, 2, 4, 4, 6, 6,
+                                   8, 8, 10, 10, 12, 12, 14, 14};
 
 // YUY2 shuf 8 UV to 16 UV.
-static const lvec8 kShuffleYUY2UV = {1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9,
-                                     11, 13, 15, 13, 15, 1, 3, 1, 3, 5, 7,
-                                     5, 7, 9, 11, 9, 11, 13, 15, 13, 15};
+static const vec8 kShuffleYUY2UV = {1, 3, 1, 3, 5, 7, 5, 7,
+                                    9, 11, 9, 11, 13, 15, 13, 15};
 
 // UYVY shuf 16 Y to 32 Y.
-static const lvec8 kShuffleUYVYY = {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11,
-                                    11, 13, 13, 15, 15, 1, 1, 3, 3, 5, 5,
-                                    7, 7, 9, 9, 11, 11, 13, 13, 15, 15};
+static const vec8 kShuffleUYVYY = {1, 1, 3, 3, 5, 5, 7, 7,
+                                   9, 9, 11, 11, 13, 13, 15, 15};
 
 // UYVY shuf 8 UV to 16 UV.
-static const lvec8 kShuffleUYVYUV = {0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8,
-                                     10, 12, 14, 12, 14, 0, 2, 0, 2, 4, 6,
-                                     4, 6, 8, 10, 8, 10, 12, 14, 12, 14};
+static const vec8 kShuffleUYVYUV = {0, 2, 0, 2, 4, 6, 4, 6,
+                                    8, 10, 8, 10, 12, 14, 12, 14};
 
 // NV21 shuf 8 VU to 16 UV.
 static const lvec8 kShuffleNV21 = {
@@ -479,9 +475,7 @@ void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
 }
 
 void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
-  asm volatile(
-
-      "movdqa %3,%%xmm6 \n"
+  asm volatile("movdqa %3,%%xmm6 \n"
 
       LABELALIGN
       "1: \n"
@@ -514,13 +508,12 @@ void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
         "+r"(dst),    // %1
         "+r"(width)   // %2
       : "m"(kShuffleMaskARGBToRGB24)  // %3
-      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
+        "xmm6");
 }
 
 void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
-  asm volatile(
-
-      "movdqa %3,%%xmm6 \n"
+  asm volatile("movdqa %3,%%xmm6 \n"
 
       LABELALIGN
      "1: \n"
@@ -553,7 +546,8 @@ void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
         "+r"(dst),    // %1
         "+r"(width)   // %2
       : "m"(kShuffleMaskARGBToRAW)  // %3
-      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
+        "xmm6");
 }
 
 #ifdef HAS_ARGBTORGB24ROW_AVX2
@@ -1096,9 +1090,7 @@ static const uvec8 kShuffleARGBToAB64Hi = {10, 10, 9, 9, 8, 8, 11, 11,
 void ARGBToAR64Row_SSSE3(const uint8_t* src_argb,
                          uint16_t* dst_ar64,
                          int width) {
-  asm volatile(
-
-      LABELALIGN
+  asm volatile(LABELALIGN
       "1: \n"
       "movdqu (%0),%%xmm0 \n"
       "movdqa %%xmm0,%%xmm1 \n"
@@ -1113,15 +1105,14 @@ void ARGBToAR64Row_SSSE3(const uint8_t* src_argb,
       : "+r"(src_argb),  // %0
         "+r"(dst_ar64),  // %1
         "+r"(width)      // %2
-      :
-      : "memory", "cc", "xmm0", "xmm1");
+      ::"memory",
+        "cc", "xmm0", "xmm1");
 }
 
 void ARGBToAB64Row_SSSE3(const uint8_t* src_argb,
                          uint16_t* dst_ab64,
                          int width) {
   asm volatile(
-
       "movdqa %3,%%xmm2 \n"
       "movdqa %4,%%xmm3 \n" LABELALIGN
       "1: \n"
@@ -1146,9 +1137,7 @@ void ARGBToAB64Row_SSSE3(const uint8_t* src_argb,
 void AR64ToARGBRow_SSSE3(const uint16_t* src_ar64,
                          uint8_t* dst_argb,
                          int width) {
-  asm volatile(
-
-      LABELALIGN
+  asm volatile(LABELALIGN
       "1: \n"
       "movdqu (%0),%%xmm0 \n"
       "movdqu 0x10(%0),%%xmm1 \n"
@@ -1163,16 +1152,16 @@ void AR64ToARGBRow_SSSE3(const uint16_t* src_ar64,
       : "+r"(src_ar64),  // %0
         "+r"(dst_argb),  // %1
         "+r"(width)      // %2
-      :
-      : "memory", "cc", "xmm0", "xmm1");
+      ::"memory",
+        "cc", "xmm0", "xmm1");
 }
 
 void AB64ToARGBRow_SSSE3(const uint16_t* src_ab64,
                          uint8_t* dst_argb,
                          int width) {
-  asm volatile(
-
-      "movdqa %3,%%xmm2 \n" LABELALIGN
+  asm volatile("movdqa %3,%%xmm2 \n"
+
+      LABELALIGN
       "1: \n"
       "movdqu (%0),%%xmm0 \n"
       "movdqu 0x10(%0),%%xmm1 \n"
@@ -1196,9 +1185,7 @@ void AB64ToARGBRow_SSSE3(const uint16_t* src_ab64,
 void ARGBToAR64Row_AVX2(const uint8_t* src_argb,
                         uint16_t* dst_ar64,
                         int width) {
-  asm volatile(
-
-      LABELALIGN
+  asm volatile(LABELALIGN
       "1: \n"
       "vmovdqu (%0),%%ymm0 \n"
       "vpermq $0xd8,%%ymm0,%%ymm0 \n"
@@ -1214,8 +1201,8 @@ void ARGBToAR64Row_AVX2(const uint8_t* src_argb,
       : "+r"(src_argb),  // %0
         "+r"(dst_ar64),  // %1
         "+r"(width)      // %2
-      :
-      : "memory", "cc", "xmm0", "xmm1");
+      ::"memory",
+        "cc", "xmm0", "xmm1");
 }
 
 #endif
@@ -1224,7 +1211,6 @@ void ARGBToAB64Row_AVX2(const uint8_t* src_argb,
                         uint16_t* dst_ab64,
                         int width) {
   asm volatile(
-
       "vbroadcastf128 %3,%%ymm2 \n"
       "vbroadcastf128 %4,%%ymm3 \n" LABELALIGN
       "1: \n"
@@ -1252,9 +1238,7 @@ void ARGBToAB64Row_AVX2(const uint8_t* src_argb,
 void AR64ToARGBRow_AVX2(const uint16_t* src_ar64,
                         uint8_t* dst_argb,
                         int width) {
-  asm volatile(
-
-      LABELALIGN
+  asm volatile(LABELALIGN
       "1: \n"
       "vmovdqu (%0),%%ymm0 \n"
       "vmovdqu 0x20(%0),%%ymm1 \n"
@@ -1271,8 +1255,8 @@ void AR64ToARGBRow_AVX2(const uint16_t* src_ar64,
       : "+r"(src_ar64),  // %0
         "+r"(dst_argb),  // %1
         "+r"(width)      // %2
-      :
-      : "memory", "cc", "xmm0", "xmm1");
+      ::"memory",
+        "cc", "xmm0", "xmm1");
 }
 
 #endif
@@ -1280,9 +1264,7 @@ void AR64ToARGBRow_AVX2(const uint16_t* src_ar64,
 void AB64ToARGBRow_AVX2(const uint16_t* src_ab64,
                         uint8_t* dst_argb,
                         int width) {
-  asm volatile(
-
-      "vbroadcastf128 %3,%%ymm2 \n" LABELALIGN
+  asm volatile("vbroadcastf128 %3,%%ymm2 \n" LABELALIGN
       "1: \n"
       "vmovdqu (%0),%%ymm0 \n"
       "vmovdqu 0x20(%0),%%ymm1 \n"
@@ -2467,21 +2449,25 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba,
   "punpcklbw %%xmm4,%%xmm4 \n" \
   "lea 0x8(%[y_buf]),%[y_buf] \n"
 
-// Read 4 YUY2 with 8 Y and update 4 UV to 8 UV.
+// Read 4 YUY2 with 8 Y and upsample 4 UV to 8 UV.
+// xmm6 kShuffleYUY2Y,
+// xmm7 kShuffleYUY2UV
 #define READYUY2 \
   "movdqu (%[yuy2_buf]),%%xmm4 \n" \
-  "pshufb %[kShuffleYUY2Y], %%xmm4 \n" \
-  "movdqu (%[yuy2_buf]),%%xmm3 \n" \
-  "pshufb %[kShuffleYUY2UV], %%xmm3 \n" \
-  "lea 0x10(%[yuy2_buf]),%[yuy2_buf] \n"
+  "lea 0x10(%[yuy2_buf]),%[yuy2_buf] \n" \
+  "movdqa %%xmm4,%%xmm3 \n" \
+  "pshufb %%xmm6,%%xmm4 \n" \
+  "pshufb %%xmm7,%%xmm3 \n"
 
-// Read 4 UYVY with 8 Y and update 4 UV to 8 UV.
+// Read 4 UYVY with 8 Y and upsample 4 UV to 8 UV.
+// xmm6 kShuffleUYVYY,
+// xmm7 kShuffleUYVYUV
 #define READUYVY \
   "movdqu (%[uyvy_buf]),%%xmm4 \n" \
-  "pshufb %[kShuffleUYVYY], %%xmm4 \n" \
-  "movdqu (%[uyvy_buf]),%%xmm3 \n" \
-  "pshufb %[kShuffleUYVYUV], %%xmm3 \n" \
-  "lea 0x10(%[uyvy_buf]),%[uyvy_buf] \n"
+  "lea 0x10(%[uyvy_buf]),%[uyvy_buf] \n" \
+  "movdqa %%xmm4,%%xmm3 \n" \
+  "pshufb %%xmm6,%%xmm4 \n" \
+  "pshufb %%xmm7,%%xmm3 \n"
 
 // Read 4 UV from P210, upsample to 8 UV
 #define READP210 \
@@ -3200,6 +3186,8 @@ void OMITFP YUY2ToARGBRow_SSSE3(const uint8_t* yuy2_buf,
                                 int width) {
   // clang-format off
   asm volatile (
+    "movdqa %[kShuffleYUY2Y],%%xmm6 \n"
+    "movdqa %[kShuffleYUY2UV],%%xmm7 \n"
     YUVTORGB_SETUP(yuvconstants)
     "pcmpeqb %%xmm5,%%xmm5 \n"
@@ -3217,7 +3205,7 @@ void OMITFP YUY2ToARGBRow_SSSE3(const uint8_t* yuy2_buf,
     [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
     [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
   : "memory", "cc", YUVTORGB_REGS
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
   );
   // clang-format on
 }
@@ -3228,6 +3216,8 @@ void OMITFP UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf,
                                 int width) {
   // clang-format off
   asm volatile (
+    "movdqa %[kShuffleUYVYY],%%xmm6 \n"
+    "movdqa %[kShuffleUYVYUV],%%xmm7 \n"
     YUVTORGB_SETUP(yuvconstants)
     "pcmpeqb %%xmm5,%%xmm5 \n"
@@ -3598,19 +3588,21 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
   "lea 0x20(%[y_buf]),%[y_buf] \n"
 
 // Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
+// ymm6 kShuffleYUY2Y,
+// ymm7 kShuffleYUY2UV
 #define READYUY2_AVX2 \
-  "vmovdqu (%[yuy2_buf]),%%ymm4 \n" \
-  "vpshufb %[kShuffleYUY2Y], %%ymm4, %%ymm4 \n" \
-  "vmovdqu (%[yuy2_buf]),%%ymm3 \n" \
-  "vpshufb %[kShuffleYUY2UV], %%ymm3, %%ymm3 \n" \
+  "vmovdqu (%[yuy2_buf]),%%ymm1 \n" \
+  "vpshufb %%ymm6,%%ymm1,%%ymm4 \n" \
+  "vpshufb %%ymm7,%%ymm1,%%ymm3 \n" \
   "lea 0x20(%[yuy2_buf]),%[yuy2_buf] \n"
 
 // Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
+// ymm6 kShuffleUYVYY,
+// ymm7 kShuffleUYVYUV
 #define READUYVY_AVX2 \
-  "vmovdqu (%[uyvy_buf]),%%ymm4 \n" \
-  "vpshufb %[kShuffleUYVYY], %%ymm4, %%ymm4 \n" \
-  "vmovdqu (%[uyvy_buf]),%%ymm3 \n" \
-  "vpshufb %[kShuffleUYVYUV], %%ymm3, %%ymm3 \n" \
+  "vmovdqu (%[uyvy_buf]),%%ymm1 \n" \
+  "vpshufb %%ymm6,%%ymm1,%%ymm4 \n" \
+  "vpshufb %%ymm7,%%ymm1,%%ymm3 \n" \
   "lea 0x20(%[uyvy_buf]),%[uyvy_buf] \n"
 
 // TODO(fbarchard): Remove broadcastb
@@ -4414,6 +4406,8 @@ void OMITFP YUY2ToARGBRow_AVX2(const uint8_t* yuy2_buf,
                                int width) {
   // clang-format off
   asm volatile (
+    "vbroadcastf128 %[kShuffleYUY2Y],%%ymm6 \n"
+    "vbroadcastf128 %[kShuffleYUY2UV],%%ymm7 \n"
     YUVTORGB_SETUP_AVX2(yuvconstants)
     "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
@@ -4432,7 +4426,7 @@ void OMITFP YUY2ToARGBRow_AVX2(const uint8_t* yuy2_buf,
     [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
     [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
   : "memory", "cc", YUVTORGB_REGS_AVX2
-    "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+    "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
   );
   // clang-format on
 }
@@ -4447,6 +4441,8 @@ void OMITFP UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf,
                                int width) {
   // clang-format off
   asm volatile (
+    "vbroadcastf128 %[kShuffleUYVYY],%%ymm6 \n"
+    "vbroadcastf128 %[kShuffleUYVYUV],%%ymm7 \n"
     YUVTORGB_SETUP_AVX2(yuvconstants)
     "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
@@ -4465,7 +4461,7 @@ void OMITFP UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf,
     [kShuffleUYVYY]"m"(kShuffleUYVYY),
     [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
   : "memory", "cc", YUVTORGB_REGS_AVX2
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
   );
   // clang-format on
 }
@@ -4705,9 +4701,7 @@ static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
 void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
   intptr_t temp_width = (intptr_t)(width);
-  asm volatile(
-
-      "movdqa %3,%%xmm5 \n"
+  asm volatile("movdqa %3,%%xmm5 \n"
 
       LABELALIGN
       "1: \n"
@@ -4728,9 +4722,7 @@ void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
 #ifdef HAS_MIRRORROW_AVX2
 void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
   intptr_t temp_width = (intptr_t)(width);
-  asm volatile(
-
-      "vbroadcastf128 %3,%%ymm5 \n"
+  asm volatile("vbroadcastf128 %3,%%ymm5 \n"
 
       LABELALIGN
       "1: \n"
@@ -4757,9 +4749,7 @@ static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u,
 void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
   intptr_t temp_width = (intptr_t)(width);
-  asm volatile(
-
-      "movdqa %3,%%xmm5 \n"
+  asm volatile("movdqa %3,%%xmm5 \n"
 
       LABELALIGN
       "1: \n"
@@ -4780,9 +4770,7 @@ void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
 #ifdef HAS_MIRRORUVROW_AVX2
 void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
   intptr_t temp_width = (intptr_t)(width);
-  asm volatile(
-
-      "vbroadcastf128 %3,%%ymm5 \n"
+  asm volatile("vbroadcastf128 %3,%%ymm5 \n"
 
       LABELALIGN
       "1: \n"
@@ -4886,9 +4874,7 @@ void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24,
 void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
   intptr_t temp_width = (intptr_t)(width);
-  asm volatile(
-
-      "lea -0x10(%0,%2,4),%0 \n"
+  asm volatile("lea -0x10(%0,%2,4),%0 \n"
 
       LABELALIGN
       "1: \n"
@@ -4912,9 +4898,7 @@ void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
 static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
 
 void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
   intptr_t temp_width = (intptr_t)(width);
-  asm volatile(
-
-      "vmovdqu %3,%%ymm5 \n"
+  asm volatile("vmovdqu %3,%%ymm5 \n"
 
       LABELALIGN
       "1: \n"
@@ -5563,9 +5547,7 @@ void SplitRGBRow_SSSE3(const uint8_t* src_rgb,
                        uint8_t* dst_g,
                        uint8_t* dst_b,
                        int width) {
-  asm volatile(
-
-      LABELALIGN
+  asm volatile(LABELALIGN
       "1: \n"
       "movdqu (%0),%%xmm0 \n"
       "movdqu 0x10(%0),%%xmm1 \n"
@@ -5639,9 +5621,7 @@ void MergeRGBRow_SSSE3(const uint8_t* src_r,
                        const uint8_t* src_b,
                        uint8_t* dst_rgb,
                        int width) {
-  asm volatile(
-
-      LABELALIGN
+  asm volatile(LABELALIGN
       "1: \n"
       "movdqu (%0),%%xmm0 \n"
       "movdqu (%1),%%xmm1 \n"
@@ -5697,7 +5677,6 @@ void MergeARGBRow_SSE2(const uint8_t* src_r,
                        uint8_t* dst_argb,
                        int width) {
   asm volatile(
-
       "sub %0,%1 \n"
       "sub %0,%2 \n"
       "sub %0,%3 \n"
@@ -5738,9 +5717,7 @@ void MergeXRGBRow_SSE2(const uint8_t* src_r,
                        const uint8_t* src_b,
                        uint8_t* dst_argb,
                        int width) {
-  asm volatile(
-
-      LABELALIGN
+  asm volatile(LABELALIGN
       "1: \n"
       "movq (%2),%%xmm0 \n"  // B
@@ -5779,7 +5756,6 @@ void MergeARGBRow_AVX2(const uint8_t* src_r,
                        uint8_t* dst_argb,
                        int width) {
   asm volatile(
-
       "sub %0,%1 \n"
       "sub %0,%2 \n"
       "sub %0,%3 \n"
@@ -5830,7 +5806,7 @@ void MergeXRGBRow_AVX2(const uint8_t* src_r,
       "1: \n"
       "vmovdqu (%2),%%xmm0 \n"  // B
-      "vpcmpeqd %%ymm1,%%ymm1,%%ymm1 \n"  // A(255)
+      "vpcmpeqb %%ymm1,%%ymm1,%%ymm1 \n"  // A(255)
       "vinserti128 $0,(%1),%%ymm1,%%ymm1 \n"  // R
       "vinserti128 $1,(%0),%%ymm0,%%ymm0 \n"  // G
       "vpunpckhbw %%ymm1,%%ymm0,%%ymm2 \n"
@@ -5856,8 +5832,8 @@ void MergeXRGBRow_AVX2(const uint8_t* src_r,
         "+r"(src_b),     // %2
         "+r"(dst_argb),  // %3
         "+rm"(width)     // %4
-      :
-      : "memory", "cc", "xmm0", "xmm1", "xmm2");
+      ::"memory",
+        "cc", "xmm0", "xmm1", "xmm2");
 }
 
 #endif  // HAS_MERGEARGBROW_AVX2
@@ -5869,7 +5845,6 @@ void SplitARGBRow_SSE2(const uint8_t* src_argb,
                        uint8_t* dst_a,
                        int width) {
   asm volatile(
-
       "sub %1,%2 \n"
       "sub %1,%3 \n"
       "sub %1,%4 \n"
@@ -5921,7 +5896,6 @@ void SplitXRGBRow_SSE2(const uint8_t* src_argb,
                        uint8_t* dst_b,
                        int width) {
   asm volatile(
-
       LABELALIGN
       "1: \n"
@@ -5972,7 +5946,6 @@ void SplitARGBRow_SSSE3(const uint8_t* src_argb,
                        uint8_t* dst_a,
                        int width) {
   asm volatile(
-
       "movdqa %6,%%xmm3 \n"
       "sub %1,%2 \n"
       "sub %1,%3 \n"
@@ -6019,7 +5992,6 @@ void SplitXRGBRow_SSSE3(const uint8_t* src_argb,
                        uint8_t* dst_b,
                        int width) {
   asm volatile(
-
       "movdqa %5,%%xmm3 \n"
 
       LABELALIGN
@@ -6061,7 +6033,6 @@ void SplitARGBRow_AVX2(const uint8_t* src_argb,
                        uint8_t* dst_a,
                        int width) {
   asm volatile(
-
       "sub %1,%2 \n"
      "sub %1,%3 \n"
       "sub %1,%4 \n"
@@ -6113,7 +6084,6 @@ void SplitXRGBRow_AVX2(const uint8_t* src_argb,
                        uint8_t* dst_b,
                        int width) {
   asm volatile(
-
       "vmovdqa %6,%%ymm3 \n"
       "vbroadcastf128 %5,%%ymm4 \n"
@@ -6161,7 +6131,6 @@ void MergeXR30Row_AVX2(const uint16_t* src_r,
                        int width) {
   int shift = depth - 10;
   asm volatile(
-
       "sub %0,%1 \n"
       "sub %0,%2 \n"
       "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"  // AR30 constants
@@ -6228,7 +6197,6 @@ void MergeAR64Row_AVX2(const uint16_t* src_r,
   int mask = (1 << depth) - 1;
   mask = (mask << 16) + mask;
   asm volatile(
-
       "sub %0,%1 \n"
       "sub %0,%2 \n"
       "sub %0,%3 \n"
@@ -6300,7 +6268,6 @@ void MergeXR64Row_AVX2(const uint16_t* src_r,
   int mask = (1 << depth) - 1;
   mask = (mask << 16) + mask;
   asm volatile(
-
       "sub %0,%1 \n"
       "sub %0,%2 \n"
       "vmovdqa %7,%%ymm5 \n"
@@ -6364,7 +6331,6 @@ void MergeARGB16To8Row_AVX2(const uint16_t* src_r,
                             int width) {
   int shift = depth - 8;
   asm volatile(
-
       "sub %0,%1 \n"
       "sub %0,%2 \n"
       "sub %0,%3 \n"
@@ -6421,7 +6387,6 @@ void MergeXRGB16To8Row_AVX2(const uint16_t* src_r,
                             int width) {
   int shift = depth - 8;
   asm volatile(
-
       "sub %0,%1 \n"
       "sub %0,%2 \n"
       "vbroadcastf128 %6,%%ymm5 \n"
@@ -6505,9 +6470,7 @@ void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
 
 #ifdef HAS_COPYROW_AVX
 void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width) {
-  asm volatile(
-
-      LABELALIGN
+  asm volatile(LABELALIGN
       "1: \n"
       "vmovdqu (%0),%%ymm0 \n"
       "vmovdqu 0x20(%0),%%ymm1 \n"
@@ -6530,9 +6493,7 @@ void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width) {
 // Multiple of 1.
 void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width) {
   size_t width_tmp = (size_t)(width);
-  asm volatile(
-
-      "rep movsb \n"
+  asm volatile("rep movsb \n"
       : "+S"(src),       // %0
         "+D"(dst),       // %1
         "+c"(width_tmp)  // %2
@@ -6609,9 +6570,7 @@ void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
 void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb,
                               uint8_t* dst_a,
                               int width) {
-  asm volatile(
-
-      LABELALIGN
+  asm volatile(LABELALIGN
       "1: \n"
       "movdqu (%0), %%xmm0 \n"
       "movdqu 0x10(%0), %%xmm1 \n"
@@ -6744,9 +6703,7 @@ void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
 void SetRow_X86(uint8_t* dst, uint8_t v8, int width) {
   size_t width_tmp = (size_t)(width >> 2);
   const uint32_t v32 = v8 * 0x01010101u;  // Duplicate byte to all bytes.
-  asm volatile(
-
-      "rep stosl \n"
+  asm volatile("rep stosl \n"
       : "+D"(dst),       // %0
         "+c"(width_tmp)  // %1
       : "a"(v32)         // %2
@@ -6755,9 +6712,7 @@ void SetRow_X86(uint8_t* dst, uint8_t v8, int width) {
 void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) {
   size_t width_tmp = (size_t)(width);
-  asm volatile(
-
-      "rep stosb \n"
+  asm volatile("rep stosb \n"
       : "+D"(dst),       // %0
         "+c"(width_tmp)  // %1
       : "a"(v8)          // %2
@@ -6766,9 +6721,7 @@ void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) {
 void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width) {
   size_t width_tmp = (size_t)(width);
-  asm volatile(
-
-      "rep stosl \n"
+  asm volatile("rep stosl \n"
       : "+D"(dst_argb),  // %0
         "+c"(width_tmp)  // %1
       : "a"(v32)         // %2
@@ -6904,9 +6857,7 @@ void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2,
 }
 
 void UYVYToYRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
-  asm volatile(
-
-      LABELALIGN
+  asm volatile(LABELALIGN
       "1: \n"
       "movdqu (%0),%%xmm0 \n"
       "movdqu 0x10(%0),%%xmm1 \n"
@@ -7032,9 +6983,7 @@ void YUY2ToNVUVRow_AVX2(const uint8_t* src_yuy2,
                         int stride_yuy2,
                         uint8_t* dst_uv,
                         int width) {
-  asm volatile(
-
-      LABELALIGN
+  asm volatile(LABELALIGN
       "1: \n"
       "vmovdqu (%0),%%ymm0 \n"
       "vmovdqu 0x20(%0),%%ymm1 \n"
@@ -7137,9 +7086,7 @@ void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2,
 }
 
 void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
-  asm volatile(
-
-      LABELALIGN
+  asm volatile(LABELALIGN
       "1: \n"
       "vmovdqu (%0),%%ymm0 \n"
       "vmovdqu 0x20(%0),%%ymm1 \n"
@@ -7935,9 +7882,7 @@ void ARGBMultiplyRow_SSE2(const uint8_t* src_argb,
                           const uint8_t* src_argb1,
                           uint8_t* dst_argb,
                           int width) {
-  asm volatile(
-
-      "pxor %%xmm5,%%xmm5 \n"
+  asm volatile("pxor %%xmm5,%%xmm5 \n"
 
       // 4 pixel loop.
       LABELALIGN
@@ -7974,9 +7919,7 @@ void ARGBMultiplyRow_AVX2(const uint8_t* src_argb,
                           const uint8_t* src_argb1,
                           uint8_t* dst_argb,
                           int width) {
-  asm volatile(
-
-      "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+  asm volatile("vpxor %%ymm5,%%ymm5,%%ymm5 \n"
 
       // 4 pixel loop.
       LABELALIGN
@@ -8823,9 +8766,7 @@ void ARGBShuffleRow_SSSE3(const uint8_t* src_argb,
                           uint8_t* dst_argb,
                           const uint8_t* shuffler,
                           int width) {
-  asm volatile(
-
-      "movdqu (%3),%%xmm5 \n"
+  asm volatile("movdqu (%3),%%xmm5 \n"
 
       LABELALIGN
       "1: \n"
@@ -8853,9 +8794,7 @@ void ARGBShuffleRow_AVX2(const uint8_t* src_argb,
                          uint8_t* dst_argb,
                          const uint8_t* shuffler,
                          int width) {
-  asm volatile(
-
-      "vbroadcastf128 (%3),%%ymm5 \n"
+  asm volatile("vbroadcastf128 (%3),%%ymm5 \n"
 
       LABELALIGN
       "1: \n"
@@ -8884,9 +8823,7 @@ void I422ToYUY2Row_SSE2(const uint8_t* src_y,
                         const uint8_t* src_v,
                         uint8_t* dst_yuy2,
                         int width) {
-  asm volatile(
-
-      "sub %1,%2 \n"
+  asm volatile("sub %1,%2 \n"
 
       LABELALIGN
       "1: \n"
@@ -8920,9 +8857,7 @@ void I422ToUYVYRow_SSE2(const uint8_t* src_y,
                         const uint8_t* src_v,
                         uint8_t* dst_uyvy,
                         int width) {
-  asm volatile(
-
-      "sub %1,%2 \n"
+  asm volatile("sub %1,%2 \n"
 
       LABELALIGN
       "1: \n"
@@ -8956,9 +8891,7 @@ void I422ToYUY2Row_AVX2(const uint8_t* src_y,
                         const uint8_t* src_v,
                         uint8_t* dst_yuy2,
                         int width) {
-  asm volatile(
-
-      "sub %1,%2 \n"
+  asm volatile("sub %1,%2 \n"
 
       LABELALIGN
       "1: \n"
@@ -8995,9 +8928,7 @@ void I422ToUYVYRow_AVX2(const uint8_t* src_y,
                         const uint8_t* src_v,
                         uint8_t* dst_uyvy,
                         int width) {
-  asm volatile(
-
-      "sub %1,%2 \n"
+  asm volatile("sub %1,%2 \n"
 
       LABELALIGN
       "1: \n"
@@ -9033,9 +8964,7 @@ void ARGBPolynomialRow_SSE2(const uint8_t* src_argb,
                             uint8_t* dst_argb,
                             const float* poly,
                             int width) {
-  asm volatile(
-
-      "pxor %%xmm3,%%xmm3 \n"
+  asm volatile("pxor %%xmm3,%%xmm3 \n"
 
       // 2 pixel loop.
       LABELALIGN
@@ -9080,7 +9009,8 @@ void ARGBPolynomialRow_SSE2(const uint8_t* src_argb,
         "+r"(dst_argb),  // %1
         "+r"(width)      // %2
       : "r"(poly)        // %3
-      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
+        "xmm6");
 }
 
 #endif  // HAS_ARGBPOLYNOMIALROW_SSE2
@@ -9572,9 +9502,7 @@ static const uvec8 kShuffleUVToVU = {1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u,
 
 // Convert UV plane of NV12 to VU of NV21.
 void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
-  asm volatile(
-
-      "movdqu %3,%%xmm5 \n"
+  asm volatile("movdqu %3,%%xmm5 \n"
 
       LABELALIGN
       "1: \n"
@@ -9598,9 +9526,7 @@ void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
 
 #ifdef HAS_SWAPUVROW_AVX2
 void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
-  asm volatile(
-
-      "vbroadcastf128 %3,%%ymm5 \n"
+  asm volatile("vbroadcastf128 %3,%%ymm5 \n"
 
       LABELALIGN
       "1: \n"