diff --git a/BUILD.gn b/BUILD.gn
index 2196be415..8a6f96dba 100644
--- a/BUILD.gn
+++ b/BUILD.gn
@@ -73,7 +73,7 @@ group("libyuv") {
     deps += [ ":libyuv_mmi" ]
   }
 
-  if (!is_ios) {
+  if (!is_ios && !libyuv_disable_jpeg) {
     # Make sure that clients of libyuv link with libjpeg. This can't go in
     # libyuv_internal because in Windows x64 builds that will generate a clang
     # build of libjpeg, and we don't want two copies.
@@ -150,7 +150,7 @@ static_library("libyuv_internal") {
     configs += [ "//build/config/gcc:symbol_visibility_default" ]
   }
 
-  if (!is_ios) {
+  if (!is_ios && !libyuv_disable_jpeg) {
     defines += [ "HAVE_JPEG" ]
 
     # Needed to pull in libjpeg headers. Can't add //third_party:jpeg to deps
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index 1aa151b62..f7cb4f8fe 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -734,7 +734,7 @@ void MirrorPlane(const uint8_t* src_y,
 #if defined(HAS_MIRRORROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     MirrorRow = MirrorRow_Any_NEON;
-    if (IS_ALIGNED(width, 16)) {
+    if (IS_ALIGNED(width, 32)) {
       MirrorRow = MirrorRow_NEON;
     }
   }
diff --git a/source/rotate.cc b/source/rotate.cc
index d414186a5..cea835d10 100644
--- a/source/rotate.cc
+++ b/source/rotate.cc
@@ -142,7 +142,7 @@ void RotatePlane180(const uint8_t* src,
 #if defined(HAS_MIRRORROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     MirrorRow = MirrorRow_Any_NEON;
-    if (IS_ALIGNED(width, 16)) {
+    if (IS_ALIGNED(width, 32)) {
       MirrorRow = MirrorRow_NEON;
     }
   }
@@ -207,11 +207,11 @@ void RotatePlane180(const uint8_t* src,
 
   // Odd height will harmlessly mirror the middle row twice.
   for (y = 0; y < half_height; ++y) {
-    MirrorRow(src, row, width);  // Mirror first row into a buffer
-    src += src_stride;
+    CopyRow(src, row, width);        // Copy first row into buffer
     MirrorRow(src_bot, dst, width);  // Mirror last row into first row
+    MirrorRow(row, dst_bot, width);  // Mirror buffer into last row
+    src += src_stride;
     dst += dst_stride;
-    CopyRow(row, dst_bot, width);  // Copy first mirrored row into last
     src_bot -= src_stride;
     dst_bot -= dst_stride;
   }
diff --git a/source/row_any.cc b/source/row_any.cc
index 9b29b2bfb..3592ffb6c 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -1156,7 +1156,7 @@ ANY11M(MirrorRow_Any_AVX2, MirrorRow_AVX2, 1, 31)
 ANY11M(MirrorRow_Any_SSSE3, MirrorRow_SSSE3, 1, 15)
 #endif
 #ifdef HAS_MIRRORROW_NEON
-ANY11M(MirrorRow_Any_NEON, MirrorRow_NEON, 1, 15)
+ANY11M(MirrorRow_Any_NEON, MirrorRow_NEON, 1, 31)
 #endif
 #ifdef HAS_MIRRORROW_MSA
 ANY11M(MirrorRow_Any_MSA, MirrorRow_MSA, 1, 63)
diff --git a/source/row_gcc.cc b/source/row_gcc.cc
index 3088bb755..fa7b8cb31 100644
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -84,7 +84,7 @@ static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
                                 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
 
 static const uvec16 kSub128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
-                               0x8080u, 0x8080u, 0x8080u, 0x8080u};
+                               0x8080u, 0x8080u, 0x8080u, 0x8080u};
 
 #endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
 
@@ -1101,10 +1101,8 @@ void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
   "lea       0x40(%0),%0                     \n" \
   "phaddw    %%xmm0,%%xmm6                   \n" \
   "phaddw    %%xmm2,%%xmm1                   \n" \
-  "paddw     %%" #round                          \
-  ",%%xmm6             \n"                       \
-  "paddw     %%" #round                          \
-  ",%%xmm1             \n"                       \
+  "paddw     %%" #round ",%%xmm6             \n" \
+  "paddw     %%" #round ",%%xmm1             \n" \
   "psrlw     $0x8,%%xmm6                     \n" \
   "psrlw     $0x8,%%xmm1                     \n" \
   "packuswb  %%xmm1,%%xmm6                   \n" \
@@ -1113,35 +1111,33 @@ void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
   "sub       $0x10,%2                        \n" \
   "jg        1b                              \n"
 
-#define RGBTOY_AVX2(round)                                       \
-  "1:                                        \n"                 \
-  "vmovdqu    (%0),%%ymm0                    \n"                 \
-  "vmovdqu    0x20(%0),%%ymm1                \n"                 \
-  "vmovdqu    0x40(%0),%%ymm2                \n"                 \
-  "vmovdqu    0x60(%0),%%ymm3                \n"                 \
-  "vpsubb     %%ymm5, %%ymm0, %%ymm0         \n"                 \
-  "vpsubb     %%ymm5, %%ymm1, %%ymm1         \n"                 \
-  "vpsubb     %%ymm5, %%ymm2, %%ymm2         \n"                 \
-  "vpsubb     %%ymm5, %%ymm3, %%ymm3         \n"                 \
-  "vpmaddubsw %%ymm0,%%ymm4,%%ymm0           \n"                 \
-  "vpmaddubsw %%ymm1,%%ymm4,%%ymm1           \n"                 \
-  "vpmaddubsw %%ymm2,%%ymm4,%%ymm2           \n"                 \
-  "vpmaddubsw %%ymm3,%%ymm4,%%ymm3           \n"                 \
-  "lea       0x80(%0),%0                     \n"                 \
-  "vphaddw    %%ymm1,%%ymm0,%%ymm0           \n" /* mutates. */  \
-  "vphaddw    %%ymm3,%%ymm2,%%ymm2           \n"                 \
-  "vpaddw     %%" #round                                         \
-  ",%%ymm0,%%ymm0     \n" /* Add .5 for rounding. */             \
-  "vpaddw     %%" #round                                         \
-  ",%%ymm2,%%ymm2     \n"                                        \
-  "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"                 \
-  "vpsrlw     $0x8,%%ymm2,%%ymm2             \n"                 \
-  "vpackuswb  %%ymm2,%%ymm0,%%ymm0           \n" /* mutates. */  \
-  "vpermd     %%ymm0,%%ymm6,%%ymm0           \n" /* unmutate. */ \
-  "vmovdqu    %%ymm0,(%1)                    \n"                 \
-  "lea       0x20(%1),%1                     \n"                 \
-  "sub       $0x20,%2                        \n"                 \
-  "jg        1b                              \n"                 \
+#define RGBTOY_AVX2(round)                                                  \
+  "1:                                        \n"                            \
+  "vmovdqu    (%0),%%ymm0                    \n"                            \
+  "vmovdqu    0x20(%0),%%ymm1                \n"                            \
+  "vmovdqu    0x40(%0),%%ymm2                \n"                            \
+  "vmovdqu    0x60(%0),%%ymm3                \n"                            \
+  "vpsubb     %%ymm5, %%ymm0, %%ymm0         \n"                            \
+  "vpsubb     %%ymm5, %%ymm1, %%ymm1         \n"                            \
+  "vpsubb     %%ymm5, %%ymm2, %%ymm2         \n"                            \
+  "vpsubb     %%ymm5, %%ymm3, %%ymm3         \n"                            \
+  "vpmaddubsw %%ymm0,%%ymm4,%%ymm0           \n"                            \
+  "vpmaddubsw %%ymm1,%%ymm4,%%ymm1           \n"                            \
+  "vpmaddubsw %%ymm2,%%ymm4,%%ymm2           \n"                            \
+  "vpmaddubsw %%ymm3,%%ymm4,%%ymm3           \n"                            \
+  "lea       0x80(%0),%0                     \n"                            \
+  "vphaddw    %%ymm1,%%ymm0,%%ymm0           \n" /* mutates. */             \
+  "vphaddw    %%ymm3,%%ymm2,%%ymm2           \n"                            \
+  "vpaddw     %%" #round ",%%ymm0,%%ymm0     \n" /* Add .5 for rounding. */ \
+  "vpaddw     %%" #round ",%%ymm2,%%ymm2     \n"                            \
+  "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"                            \
+  "vpsrlw     $0x8,%%ymm2,%%ymm2             \n"                            \
+  "vpackuswb  %%ymm2,%%ymm0,%%ymm0           \n" /* mutates. */             \
+  "vpermd     %%ymm0,%%ymm6,%%ymm0           \n" /* unmutate. */            \
+  "vmovdqu    %%ymm0,(%1)                    \n"                            \
+  "lea       0x20(%1),%1                     \n"                            \
+  "sub       $0x20,%2                        \n"                            \
+  "jg        1b                              \n"                            \
   "vzeroupper                                \n"
 
 #ifdef HAS_ARGBTOYROW_SSSE3
@@ -1152,15 +1148,15 @@ void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
       "movdqa    %4,%%xmm5                       \n"
       "movdqa    %5,%%xmm7                       \n"
 
-      LABELALIGN RGBTOY(xmm7)
+      LABELALIGN
+      RGBTOY(xmm7)
       : "+r"(src_argb),  // %0
         "+r"(dst_y),     // %1
         "+r"(width)      // %2
       : "m"(kARGBToY),   // %3
         "m"(kSub128),    // %4
         "m"(kAddY16)     // %5
-      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
-        "xmm7");
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
 }
 #endif  // HAS_ARGBTOYROW_SSSE3
 
@@ -1172,7 +1168,8 @@ void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
       "movdqa    %3,%%xmm4                       \n"
       "movdqa    %4,%%xmm5                       \n"
 
-      LABELALIGN RGBTOY(xmm5)
+      LABELALIGN
+      RGBTOY(xmm5)
       : "+r"(src_argb),  // %0
         "+r"(dst_y),     // %1
         "+r"(width)      // %2
@@ -1190,7 +1187,8 @@ void RGBAToYJRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
       "movdqa    %3,%%xmm4                       \n"
       "movdqa    %4,%%xmm5                       \n"
 
-      LABELALIGN RGBTOY(xmm5)
+      LABELALIGN
+      RGBTOY(xmm5)
       : "+r"(src_rgba),  // %0
         "+r"(dst_y),     // %1
         "+r"(width)      // %2
@@ -1212,7 +1210,8 @@ void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
       "vbroadcastf128 %5,%%ymm7                  \n"
       "vmovdqu    %6,%%ymm6                      \n"
 
-      LABELALIGN RGBTOY_AVX2(ymm7)
+      LABELALIGN
+      RGBTOY_AVX2(ymm7)
       : "+r"(src_argb),         // %0
         "+r"(dst_y),            // %1
         "+r"(width)             // %2
@@ -1220,8 +1219,7 @@ void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
         "m"(kSub128),           // %4
         "m"(kAddY16),           // %5
         "m"(kPermdARGBToY_AVX)  // %6
-      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
-        "xmm7");
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
 }
 #endif  // HAS_ARGBTOYROW_AVX2
 
@@ -1234,7 +1232,8 @@ void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
       "vbroadcastf128 %5,%%ymm7                  \n"
       "vmovdqu    %6,%%ymm6                      \n"
 
-      LABELALIGN RGBTOY_AVX2(ymm7)
+      LABELALIGN
+      RGBTOY_AVX2(ymm7)
       : "+r"(src_abgr),         // %0
         "+r"(dst_y),            // %1
         "+r"(width)             // %2
@@ -1242,8 +1241,7 @@ void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
         "m"(kSub128),           // %4
         "m"(kAddY16),           // %5
         "m"(kPermdARGBToY_AVX)  // %6
-      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
-        "xmm7");
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
 }
 #endif  // HAS_ABGRTOYROW_AVX2
 
@@ -1255,15 +1253,15 @@ void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
       "vbroadcastf128 %4,%%ymm5                  \n"
       "vmovdqu    %5,%%ymm6                      \n"
 
-      LABELALIGN RGBTOY_AVX2(ymm5)
+      LABELALIGN
+      RGBTOY_AVX2(ymm5)
       : "+r"(src_argb),         // %0
         "+r"(dst_y),            // %1
         "+r"(width)             // %2
       : "m"(kARGBToYJ),         // %3
         "m"(kSub128),           // %4
         "m"(kPermdARGBToY_AVX)  // %5
-      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
-        "xmm7");
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
 }
 #endif  // HAS_ARGBTOYJROW_AVX2
 
@@ -1275,8 +1273,9 @@ void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
       "vbroadcastf128 %4,%%ymm5                  \n"
       "vmovdqu    %5,%%ymm6                      \n"
 
-      LABELALIGN RGBTOY_AVX2(
-          ymm5) "vzeroupper                                \n"
+      LABELALIGN
+      RGBTOY_AVX2(ymm5)
+      "vzeroupper                                \n"
       : "+r"(src_rgba),         // %0
         "+r"(dst_y),            // %1
         "+r"(width)             // %2
@@ -1537,7 +1536,7 @@ void ARGBToUVJRow_AVX2(const uint8_t* src_argb0,
         "+r"(dst_v),                       // %2
         "+rm"(width)                       // %3
       : "r"((intptr_t)(src_stride_argb)),  // %4
-        "m"(kSub128),                      // %5
+        "m"(kSub128),                      // %5
         "m"(kARGBToVJ),                    // %6
         "m"(kARGBToUJ),                    // %7
         "m"(kShufARGBToUV_AVX)             // %8
@@ -1607,7 +1606,7 @@ void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0,
       : "r"((intptr_t)(src_stride_argb)),  // %4
         "m"(kARGBToVJ),                    // %5
         "m"(kARGBToUJ),                    // %6
-        "m"(kSub128)                       // %7
+        "m"(kSub128)                       // %7
       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
 }
 #endif  // HAS_ARGBTOUVJROW_SSSE3
@@ -1676,15 +1675,15 @@ void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
       "movdqa    %4,%%xmm5                       \n"
       "movdqa    %5,%%xmm7                       \n"
 
-      LABELALIGN RGBTOY(xmm7)
+      LABELALIGN
+      RGBTOY(xmm7)
       : "+r"(src_bgra),  // %0
         "+r"(dst_y),     // %1
         "+r"(width)      // %2
       : "m"(kBGRAToY),   // %3
         "m"(kSub128),    // %4
         "m"(kAddY16)     // %5
-      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
-        "xmm7");
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
 }
 
 void BGRAToUVRow_SSSE3(const uint8_t* src_bgra0,
@@ -1756,15 +1755,15 @@ void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
       "movdqa    %4,%%xmm5                       \n"
       "movdqa    %5,%%xmm7                       \n"
 
-      LABELALIGN RGBTOY(xmm7)
+      LABELALIGN
+      RGBTOY(xmm7)
       : "+r"(src_abgr),  // %0
         "+r"(dst_y),     // %1
         "+r"(width)      // %2
       : "m"(kABGRToY),   // %3
         "m"(kSub128),    // %4
         "m"(kAddY16)     // %5
-      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
-        "xmm7");
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
 }
 
 void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
@@ -1773,15 +1772,15 @@ void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
       "movdqa    %4,%%xmm5                       \n"
       "movdqa    %5,%%xmm7                       \n"
 
-      LABELALIGN RGBTOY(xmm7)
+      LABELALIGN
+      RGBTOY(xmm7)
       : "+r"(src_rgba),  // %0
         "+r"(dst_y),     // %1
         "+r"(width)      // %2
       : "m"(kRGBAToY),   // %3
         "m"(kSub128),    // %4
         "m"(kAddY16)     // %5
-      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
-        "xmm7");
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
 }
 
 void ABGRToUVRow_SSSE3(const uint8_t* src_abgr0,
diff --git a/source/row_mmi.cc b/source/row_mmi.cc
index 50cfca726..d7d34e47f 100644
--- a/source/row_mmi.cc
+++ b/source/row_mmi.cc
@@ -6040,90 +6040,93 @@ void I444ToARGBRow_MMI(const uint8_t* src_y,
                        uint8_t* rgb_buf,
                        const struct YuvConstants* yuvconstants,
                        int width) {
-  uint64_t y, u, v;
-  uint64_t b_vec[2], g_vec[2], r_vec[2];
+  uint64_t y,u,v;
+  uint64_t b_vec[2],g_vec[2],r_vec[2];
   uint64_t mask = 0xff00ff00ff00ff00ULL;
-  uint64_t ub, ug, vg, vr, bb, bg, br, yg;
-  __asm__ volatile(
-      "ldc1       %[yg],           0xc0(%[yuvcons_ptr])             \n\t"  // yg
-      "ldc1       %[bb],           0x60(%[yuvcons_ptr])             \n\t"  // bb
-      "ldc1       %[ub],           0x00(%[yuvcons_ptr])             \n\t"  // ub
-      "or         %[ub],           %[ub],             %[mask]       \n\t"  // must
-                                                                           // sign
-                                                                           // extension
-      "ldc1       %[bg],           0x80(%[yuvcons_ptr])             \n\t"  // bg
-      "ldc1       %[ug],           0x20(%[yuvcons_ptr])             \n\t"  // ug
-      "punpcklbh  %[ug],           %[ug],             %[zero]       \n\t"
-      "pshufh     %[ug],           %[ug],             %[zero]       \n\t"
-      "ldc1       %[vg],           0x20(%[yuvcons_ptr])             \n\t"  // vg
-      "punpcklbh  %[vg],           %[vg],             %[zero]       \n\t"
-      "pshufh     %[vg],           %[vg],             %[five]       \n\t"
-      "ldc1       %[br],           0xa0(%[yuvcons_ptr])             \n\t"  // br
-      "ldc1       %[vr],           0x40(%[yuvcons_ptr])             \n\t"  // vr
-      "punpcklbh  %[vr],           %[vr],             %[zero]       \n\t"
-      "pshufh     %[vr],           %[vr],             %[five]       \n\t"
-      "or         %[vr],           %[vr],             %[mask]       \n\t"  // sign
-                                                                           // extension
+  uint64_t ub,ug,vg,vr,bb,bg,br,yg;
+  __asm__ volatile (
+    "ldc1       %[yg],           0xc0(%[yuvcons_ptr])             \n\t"//yg
+    "ldc1       %[bb],           0x60(%[yuvcons_ptr])             \n\t"//bb
+    "ldc1       %[ub],           0x00(%[yuvcons_ptr])             \n\t"//ub
+    "or         %[ub],           %[ub],             %[mask]       \n\t"//must sign extension
+    "ldc1       %[bg],           0x80(%[yuvcons_ptr])             \n\t"//bg
+    "ldc1       %[ug],           0x20(%[yuvcons_ptr])             \n\t"//ug
+    "punpcklbh  %[ug],           %[ug],             %[zero]       \n\t"
+    "pshufh     %[ug],           %[ug],             %[zero]       \n\t"
+    "ldc1       %[vg],           0x20(%[yuvcons_ptr])             \n\t"//vg
+    "punpcklbh  %[vg],           %[vg],             %[zero]       \n\t"
+    "pshufh     %[vg],           %[vg],             %[five]       \n\t"
+    "ldc1       %[br],           0xa0(%[yuvcons_ptr])             \n\t"//br
+    "ldc1       %[vr],           0x40(%[yuvcons_ptr])             \n\t"//vr
+    "punpcklbh  %[vr],           %[vr],             %[zero]       \n\t"
+    "pshufh     %[vr],           %[vr],             %[five]       \n\t"
+    "or         %[vr],           %[vr],             %[mask]       \n\t"//sign extension
 
-      "1:                                                           \n\t"
-      "gslwlc1    %[y],            0x03(%[y_ptr])                   \n\t"
-      "gslwrc1    %[y],            0x00(%[y_ptr])                   \n\t"
-      "gslwlc1    %[u],            0x03(%[u_ptr])                   \n\t"
-      "gslwrc1    %[u],            0x00(%[u_ptr])                   \n\t"
-      "gslwlc1    %[v],            0x03(%[v_ptr])                   \n\t"
-      "gslwrc1    %[v],            0x00(%[v_ptr])                   \n\t"
+    "1:                                                           \n\t"
+    "gslwlc1    %[y],            0x03(%[y_ptr])                   \n\t"
+    "gslwrc1    %[y],            0x00(%[y_ptr])                   \n\t"
+    "gslwlc1    %[u],            0x03(%[u_ptr])                   \n\t"
+    "gslwrc1    %[u],            0x00(%[u_ptr])                   \n\t"
+    "gslwlc1    %[v],            0x03(%[v_ptr])                   \n\t"
+    "gslwrc1    %[v],            0x00(%[v_ptr])                   \n\t"
 
-      "punpcklbh  %[y],            %[y],              %[y]          \n\t"  // y*0x0101
-      "pmulhuh    %[y],            %[y],              %[yg]         \n\t"  // y1
+    "punpcklbh  %[y],            %[y],              %[y]          \n\t"//y*0x0101
+    "pmulhuh    %[y],            %[y],              %[yg]         \n\t"//y1
 
-      "punpcklbh  %[u],            %[u],              %[zero]       \n\t"  // u
-      "paddsh     %[b_vec0],       %[y],              %[bb]         \n\t"
-      "pmullh     %[b_vec1],       %[u],              %[ub]         \n\t"
-      "psubsh     %[b_vec0],       %[b_vec0],         %[b_vec1]     \n\t"
-      "psrah      %[b_vec0],       %[b_vec0],         %[six]        \n\t"
+    "punpcklbh  %[u],            %[u],              %[zero]       \n\t"//u
+    "paddsh     %[b_vec0],       %[y],              %[bb]         \n\t"
+    "pmullh     %[b_vec1],       %[u],              %[ub]         \n\t"
+    "psubsh     %[b_vec0],       %[b_vec0],         %[b_vec1]     \n\t"
+    "psrah      %[b_vec0],       %[b_vec0],         %[six]        \n\t"
 
-      "punpcklbh  %[v],            %[v],              %[zero]       \n\t"  // v
-      "paddsh     %[g_vec0],       %[y],              %[bg]         \n\t"
-      "pmullh     %[g_vec1],       %[u],              %[ug]         \n\t"  // u*ug
-      "psubsh     %[g_vec0],       %[g_vec0],         %[g_vec1]     \n\t"
-      "pmullh     %[g_vec1],       %[v],              %[vg]         \n\t"  // v*vg
-      "psubsh     %[g_vec0],       %[g_vec0],         %[g_vec1]     \n\t"
-      "psrah      %[g_vec0],       %[g_vec0],         %[six]        \n\t"
+    "punpcklbh  %[v],            %[v],              %[zero]       \n\t"//v
+    "paddsh     %[g_vec0],       %[y],              %[bg]         \n\t"
+    "pmullh     %[g_vec1],       %[u],              %[ug]         \n\t"//u*ug
+    "psubsh     %[g_vec0],       %[g_vec0],         %[g_vec1]     \n\t"
+    "pmullh     %[g_vec1],       %[v],              %[vg]         \n\t"//v*vg
+    "psubsh     %[g_vec0],       %[g_vec0],         %[g_vec1]     \n\t"
+    "psrah      %[g_vec0],       %[g_vec0],         %[six]        \n\t"
 
-      "paddsh     %[r_vec0],       %[y],              %[br]         \n\t"
-      "pmullh     %[r_vec1],       %[v],              %[vr]         \n\t"  // v*vr
-      "psubsh     %[r_vec0],       %[r_vec0],         %[r_vec1]     \n\t"
-      "psrah      %[r_vec0],       %[r_vec0],         %[six]        \n\t"
+    "paddsh     %[r_vec0],       %[y],              %[br]         \n\t"
+    "pmullh     %[r_vec1],       %[v],              %[vr]         \n\t"//v*vr
+    "psubsh     %[r_vec0],       %[r_vec0],         %[r_vec1]     \n\t"
+    "psrah      %[r_vec0],       %[r_vec0],         %[six]        \n\t"
 
-      "packushb   %[r_vec0],       %[b_vec0],         %[r_vec0]     \n\t"  // rrrrbbbb
-      "packushb   %[g_vec0],       %[g_vec0],         %[alpha]      \n\t"  // ffffgggg
-      "punpcklwd  %[g_vec0],       %[g_vec0],         %[alpha]      \n\t"
-      "punpcklbh  %[b_vec0],       %[r_vec0],         %[g_vec0]     \n\t"  // gbgbgbgb
-      "punpckhbh  %[r_vec0],       %[r_vec0],         %[g_vec0]     \n\t"  // frfrfrfr
-      "punpcklhw  %[g_vec0],       %[b_vec0],         %[r_vec0]     \n\t"  // frgbfrgb
-      "punpckhhw  %[g_vec1],       %[b_vec0],         %[r_vec0]     \n\t"  // frgbfrgb
-      "gssdlc1    %[g_vec0],       0x07(%[rgbbuf_ptr])              \n\t"
-      "gssdrc1    %[g_vec0],       0x00(%[rgbbuf_ptr])              \n\t"
-      "gssdlc1    %[g_vec1],       0x0f(%[rgbbuf_ptr])              \n\t"
-      "gssdrc1    %[g_vec1],       0x08(%[rgbbuf_ptr])              \n\t"
+    "packushb   %[r_vec0],       %[b_vec0],         %[r_vec0]     \n\t"//rrrrbbbb
+    "packushb   %[g_vec0],       %[g_vec0],         %[alpha]      \n\t"//ffffgggg
+    "punpcklwd  %[g_vec0],       %[g_vec0],         %[alpha]      \n\t"
+    "punpcklbh  %[b_vec0],       %[r_vec0],         %[g_vec0]     \n\t"//gbgbgbgb
+    "punpckhbh  %[r_vec0],       %[r_vec0],         %[g_vec0]     \n\t"//frfrfrfr
+    "punpcklhw  %[g_vec0],       %[b_vec0],         %[r_vec0]     \n\t"//frgbfrgb
+    "punpckhhw  %[g_vec1],       %[b_vec0],         %[r_vec0]     \n\t"//frgbfrgb
+    "gssdlc1    %[g_vec0],       0x07(%[rgbbuf_ptr])              \n\t"
+    "gssdrc1    %[g_vec0],       0x00(%[rgbbuf_ptr])              \n\t"
+    "gssdlc1    %[g_vec1],       0x0f(%[rgbbuf_ptr])              \n\t"
+    "gssdrc1    %[g_vec1],       0x08(%[rgbbuf_ptr])              \n\t"
 
-      "daddiu     %[y_ptr],        %[y_ptr],          0x04          \n\t"
-      "daddiu     %[u_ptr],        %[u_ptr],          0x04          \n\t"
-      "daddiu     %[v_ptr],        %[v_ptr],          0x04          \n\t"
-      "daddiu     %[rgbbuf_ptr],   %[rgbbuf_ptr],     0x10          \n\t"
-      "daddi      %[width],        %[width],          -0x04         \n\t"
-      "bnez       %[width],        1b                               \n\t"
-      : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec0] "=&f"(b_vec[0]),
-        [b_vec1] "=&f"(b_vec[1]), [g_vec0] "=&f"(g_vec[0]),
-        [g_vec1] "=&f"(g_vec[1]), [r_vec0] "=&f"(r_vec[0]),
-        [r_vec1] "=&f"(r_vec[1]), [ub] "=&f"(ub), [ug] "=&f"(ug),
-        [vg] "=&f"(vg), [vr] "=&f"(vr), [bb] "=&f"(bb), [bg] "=&f"(bg),
-        [br] "=&f"(br), [yg] "=&f"(yg)
-      : [y_ptr] "r"(src_y), [u_ptr] "r"(src_u), [v_ptr] "r"(src_v),
-        [rgbbuf_ptr] "r"(rgb_buf), [yuvcons_ptr] "r"(yuvconstants),
-        [width] "r"(width), [zero] "f"(0x00), [alpha] "f"(-1), [six] "f"(0x6),
-        [five] "f"(0x55), [mask] "f"(mask)
-      : "memory");
+    "daddiu     %[y_ptr],        %[y_ptr],          0x04          \n\t"
+    "daddiu     %[u_ptr],        %[u_ptr],          0x04          \n\t"
+    "daddiu     %[v_ptr],        %[v_ptr],          0x04          \n\t"
+    "daddiu     %[rgbbuf_ptr],   %[rgbbuf_ptr],     0x10          \n\t"
+    "daddi      %[width],        %[width],          -0x04         \n\t"
+    "bnez       %[width],        1b                               \n\t"
+    : [y]"=&f"(y),
+      [u]"=&f"(u),                         [v]"=&f"(v),
+      [b_vec0]"=&f"(b_vec[0]),             [b_vec1]"=&f"(b_vec[1]),
+      [g_vec0]"=&f"(g_vec[0]),             [g_vec1]"=&f"(g_vec[1]),
+      [r_vec0]"=&f"(r_vec[0]),             [r_vec1]"=&f"(r_vec[1]),
+      [ub]"=&f"(ub),                       [ug]"=&f"(ug),
+      [vg]"=&f"(vg),                       [vr]"=&f"(vr),
+      [bb]"=&f"(bb),                       [bg]"=&f"(bg),
+      [br]"=&f"(br),                       [yg]"=&f"(yg)
+    : [y_ptr]"r"(src_y),                   [u_ptr]"r"(src_u),
+      [v_ptr]"r"(src_v),                   [rgbbuf_ptr]"r"(rgb_buf),
+      [yuvcons_ptr]"r"(yuvconstants),      [width]"r"(width),
+      [zero]"f"(0x00),                     [alpha]"f"(-1),
+      [six]"f"(0x6),                       [five]"f"(0x55),
+      [mask]"f"(mask)
+    : "memory"
+  );
 }
 
 // Also used for 420
@@ -6133,96 +6136,99 @@ void I422ToARGBRow_MMI(const uint8_t* src_y,
                        uint8_t* rgb_buf,
                        const struct YuvConstants* yuvconstants,
                        int width) {
-  uint64_t y, u, v;
-  uint64_t b_vec[2], g_vec[2], r_vec[2];
+  uint64_t y,u,v;
+  uint64_t b_vec[2],g_vec[2],r_vec[2];
   uint64_t mask = 0xff00ff00ff00ff00ULL;
-  uint64_t ub, ug, vg, vr, bb, bg, br, yg;
+  uint64_t ub,ug,vg,vr,bb,bg,br,yg;
 
   __asm__ volatile(
-      "ldc1       %[yg],           0xc0(%[yuvcons_ptr])             \n\t"  // yg
-      "ldc1       %[bb],           0x60(%[yuvcons_ptr])             \n\t"  // bb
-      "ldc1       %[ub],           0x00(%[yuvcons_ptr])             \n\t"  // ub
-      "or         %[ub],           %[ub],             %[mask]       \n\t"  // must
-                                                                           // sign
-                                                                           // extension
-      "ldc1       %[bg],           0x80(%[yuvcons_ptr])             \n\t"  // bg
-      "ldc1       %[ug],           0x20(%[yuvcons_ptr])             \n\t"  // ug
-      "punpcklbh  %[ug],           %[ug],             %[zero]       \n\t"
-      "pshufh     %[ug],           %[ug],             %[zero]       \n\t"
-      "ldc1       %[vg],           0x20(%[yuvcons_ptr])             \n\t"  // vg
-      "punpcklbh  %[vg],           %[vg],             %[zero]       \n\t"
-      "pshufh     %[vg],           %[vg],             %[five]       \n\t"
-      "ldc1       %[br],           0xa0(%[yuvcons_ptr])             \n\t"  // br
-      "ldc1       %[vr],           0x40(%[yuvcons_ptr])             \n\t"  // vr
-      "punpcklbh  %[vr],           %[vr],             %[zero]       \n\t"
-      "pshufh     %[vr],           %[vr],             %[five]       \n\t"
-      "or         %[vr],           %[vr],             %[mask]       \n\t"  // sign
-                                                                           // extension
+    "ldc1       %[yg],           0xc0(%[yuvcons_ptr])             \n\t"//yg
+    "ldc1       %[bb],           0x60(%[yuvcons_ptr])             \n\t"//bb
+    "ldc1       %[ub],           0x00(%[yuvcons_ptr])             \n\t"//ub
+    "or         %[ub],           %[ub],             %[mask]       \n\t"//must be sign-extended
+    "ldc1       %[bg],           0x80(%[yuvcons_ptr])             \n\t"//bg
+    "ldc1       %[ug],           0x20(%[yuvcons_ptr])             \n\t"//ug
+    "punpcklbh  %[ug],           %[ug],             %[zero]       \n\t"
+    "pshufh     %[ug],           %[ug],             %[zero]       \n\t"
+    "ldc1       %[vg],           0x20(%[yuvcons_ptr])             \n\t"//vg
+    "punpcklbh  %[vg],           %[vg],             %[zero]       \n\t"
+    "pshufh     %[vg],           %[vg],             %[five]       \n\t"
+    "ldc1       %[br],           0xa0(%[yuvcons_ptr])             \n\t"//br
+    "ldc1       %[vr],           0x40(%[yuvcons_ptr])             \n\t"//vr
+    "punpcklbh  %[vr],           %[vr],             %[zero]       \n\t"
+    "pshufh     %[vr],           %[vr],             %[five]       \n\t"
+    "or         %[vr],           %[vr],             %[mask]       \n\t"//sign extension
 
-      "1:                                                           \n\t"
-      "gslwlc1    %[y],            0x03(%[y_ptr])                   \n\t"
-      "gslwrc1    %[y],            0x00(%[y_ptr])                   \n\t"
-      "gslwlc1    %[u],            0x03(%[u_ptr])                   \n\t"
-      "gslwrc1    %[u],            0x00(%[u_ptr])                   \n\t"
-      "gslwlc1    %[v],            0x03(%[v_ptr])                   \n\t"
-      "gslwrc1    %[v],            0x00(%[v_ptr])                   \n\t"
+    "1:                                                           \n\t"
+    "gslwlc1    %[y],            0x03(%[y_ptr])                   \n\t"
+    "gslwrc1    %[y],            0x00(%[y_ptr])                   \n\t"
+    "gslwlc1    %[u],            0x03(%[u_ptr])                   \n\t"
+    "gslwrc1    %[u],            0x00(%[u_ptr])                   \n\t"
+    "gslwlc1    %[v],            0x03(%[v_ptr])                   \n\t"
+    "gslwrc1    %[v],            0x00(%[v_ptr])                   \n\t"
 
-      "punpcklbh  %[y],            %[y],              %[y]          \n\t"  // y*0x0101
-      "pmulhuh    %[y],            %[y],              %[yg]         \n\t"  // y1
+    "punpcklbh  %[y],            %[y],              %[y]          \n\t"//y*0x0101
+    "pmulhuh    %[y],            %[y],              %[yg]         \n\t"//y1
 
-      // u3|u2|u1|u0 --> u1|u1|u0|u0
-      "punpcklbh  %[u],            %[u],              %[u]          \n\t"  // u
-      "punpcklbh  %[u],            %[u],              %[zero]       \n\t"
-      "paddsh     %[b_vec0],       %[y],              %[bb]         \n\t"
-      "pmullh     %[b_vec1],       %[u],              %[ub]         \n\t"
-      "psubsh     %[b_vec0],       %[b_vec0],         %[b_vec1]     \n\t"
-      "psrah      %[b_vec0],       %[b_vec0],         %[six]        \n\t"
+    //u3|u2|u1|u0 --> u1|u1|u0|u0
+    "punpcklbh  %[u],            %[u],              %[u]          \n\t"//u
+    "punpcklbh  %[u],            %[u],              %[zero]       \n\t"
+    "paddsh     %[b_vec0],       %[y],              %[bb]         \n\t"
+    "pmullh     %[b_vec1],       %[u],              %[ub]         \n\t"
+    "psubsh     %[b_vec0],       %[b_vec0],         %[b_vec1]     \n\t"
+    "psrah      %[b_vec0],       %[b_vec0],         %[six]        \n\t"
 
-      // v3|v2|v1|v0 --> v1|v1|v0|v0
-      "punpcklbh  %[v],            %[v],              %[v]          \n\t"  // v
-      "punpcklbh  %[v],            %[v],              %[zero]       \n\t"
-      "paddsh     %[g_vec0],       %[y],              %[bg]         \n\t"
-      "pmullh     %[g_vec1],       %[u],              %[ug]         \n\t"  // u*ug
-      "psubsh     %[g_vec0],       %[g_vec0],         %[g_vec1]     \n\t"
-      "pmullh     %[g_vec1],       %[v],              %[vg]         \n\t"  // v*vg
-      "psubsh     %[g_vec0],       %[g_vec0],         %[g_vec1]     \n\t"
-      "psrah      %[g_vec0],       %[g_vec0],         %[six]        \n\t"
+    //v3|v2|v1|v0 --> v1|v1|v0|v0
+    "punpcklbh  %[v],            %[v],              %[v]          \n\t"//v
+    "punpcklbh  %[v],            %[v],              %[zero]       \n\t"
+    "paddsh     %[g_vec0],       %[y],              %[bg]         \n\t"
+    "pmullh     %[g_vec1],       %[u],              %[ug]         \n\t"//u*ug
+    "psubsh     %[g_vec0],       %[g_vec0],         %[g_vec1]     \n\t"
+    "pmullh     %[g_vec1],       %[v],              %[vg]         \n\t"//v*vg
+    "psubsh     %[g_vec0],       %[g_vec0],         %[g_vec1]     \n\t"
+    "psrah      %[g_vec0],       %[g_vec0],         %[six]        \n\t"
 
-      "paddsh     %[r_vec0],       %[y],              %[br]         \n\t"
-      "pmullh     %[r_vec1],       %[v],              %[vr]         \n\t"  // v*vr
-      "psubsh     %[r_vec0],       %[r_vec0],         %[r_vec1]     \n\t"
-      "psrah      %[r_vec0],       %[r_vec0],         %[six]        \n\t"
+    "paddsh     %[r_vec0],       %[y],              %[br]         \n\t"
+    "pmullh     %[r_vec1],       %[v],              %[vr]         \n\t"//v*vr
+    "psubsh     %[r_vec0],       %[r_vec0],         %[r_vec1]     \n\t"
+    "psrah      %[r_vec0],       %[r_vec0],         %[six]        \n\t"
 
-      "packushb   %[r_vec0],       %[b_vec0],         %[r_vec0]     \n\t"  // rrrrbbbb
-      "packushb   %[g_vec0],       %[g_vec0],         %[alpha]      \n\t"  // ffffgggg
-      "punpcklwd  %[g_vec0],       %[g_vec0],         %[alpha]      \n\t"
-      "punpcklbh  %[b_vec0],       %[r_vec0],         %[g_vec0]     \n\t"  // gbgbgbgb
-      "punpckhbh  %[r_vec0],       %[r_vec0],         %[g_vec0]     \n\t"  // frfrfrfr
-      "punpcklhw  %[g_vec0],       %[b_vec0],         %[r_vec0]     \n\t"  // frgbfrgb
-      "punpckhhw  %[g_vec1],       %[b_vec0],         %[r_vec0]     \n\t"  // frgbfrgb
-      "gssdlc1    %[g_vec0],       0x07(%[rgbbuf_ptr])              \n\t"
-      "gssdrc1    %[g_vec0],       0x00(%[rgbbuf_ptr])              \n\t"
-      "gssdlc1    %[g_vec1],       0x0f(%[rgbbuf_ptr])              \n\t"
-      "gssdrc1    %[g_vec1],       0x08(%[rgbbuf_ptr])              \n\t"
+    "packushb   %[r_vec0],       %[b_vec0],         %[r_vec0]     \n\t"//rrrrbbbb
+    "packushb   %[g_vec0],       %[g_vec0],         %[alpha]      \n\t"//ffffgggg
+    "punpcklwd  %[g_vec0],       %[g_vec0],         %[alpha]      \n\t"
+    "punpcklbh  %[b_vec0],       %[r_vec0],         %[g_vec0]     \n\t"//gbgbgbgb
+    "punpckhbh  %[r_vec0],       %[r_vec0],         %[g_vec0]     \n\t"//frfrfrfr
+    "punpcklhw  %[g_vec0],       %[b_vec0],         %[r_vec0]     \n\t"//frgbfrgb
+    "punpckhhw  %[g_vec1],       %[b_vec0],         %[r_vec0]     \n\t"//frgbfrgb
+    "gssdlc1    %[g_vec0],       0x07(%[rgbbuf_ptr])              \n\t"
+    "gssdrc1    %[g_vec0],       0x00(%[rgbbuf_ptr])              \n\t"
+    "gssdlc1    %[g_vec1],       0x0f(%[rgbbuf_ptr])              \n\t"
+    "gssdrc1    %[g_vec1],       0x08(%[rgbbuf_ptr])              \n\t"
 
-      "daddiu     %[y_ptr],        %[y_ptr],          0x04          \n\t"
-      "daddiu     %[u_ptr],        %[u_ptr],          0x02          \n\t"
-      "daddiu     %[v_ptr],        %[v_ptr],          0x02          \n\t"
-      "daddiu     %[rgbbuf_ptr],   %[rgbbuf_ptr],     0x10          \n\t"
-      "daddi      %[width],        %[width],          -0x04         \n\t"
-      "bnez       %[width],        1b                               \n\t"
+    "daddiu     %[y_ptr],        %[y_ptr],          0x04          \n\t"
+    "daddiu     %[u_ptr],        %[u_ptr],          0x02          \n\t"
+    "daddiu     %[v_ptr],        %[v_ptr],          0x02          \n\t"
+    "daddiu     %[rgbbuf_ptr],   %[rgbbuf_ptr],     0x10          \n\t"
+    "daddi      %[width],        %[width],          -0x04         \n\t"
+    "bnez       %[width],        1b                               \n\t"
 
-      : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec0] "=&f"(b_vec[0]),
-        [b_vec1] "=&f"(b_vec[1]), [g_vec0] "=&f"(g_vec[0]),
-        [g_vec1] "=&f"(g_vec[1]), [r_vec0] "=&f"(r_vec[0]),
-        [r_vec1] "=&f"(r_vec[1]), [ub] "=&f"(ub), [ug] "=&f"(ug),
-        [vg] "=&f"(vg), [vr] "=&f"(vr), [bb] "=&f"(bb), [bg] "=&f"(bg),
-        [br] "=&f"(br), [yg] "=&f"(yg)
-      : [y_ptr] "r"(src_y), [u_ptr] "r"(src_u), [v_ptr] "r"(src_v),
-        [rgbbuf_ptr] "r"(rgb_buf), [yuvcons_ptr] "r"(yuvconstants),
-        [width] "r"(width), [zero] "f"(0x00), [alpha] "f"(-1), [six] "f"(0x6),
-        [five] "f"(0x55), [mask] "f"(mask)
-      : "memory");
+    : [y]"=&f"(y),
+      [u]"=&f"(u),                         [v]"=&f"(v),
+      [b_vec0]"=&f"(b_vec[0]),             [b_vec1]"=&f"(b_vec[1]),
+      [g_vec0]"=&f"(g_vec[0]),             [g_vec1]"=&f"(g_vec[1]),
+      [r_vec0]"=&f"(r_vec[0]),             [r_vec1]"=&f"(r_vec[1]),
+      [ub]"=&f"(ub),                       [ug]"=&f"(ug),
+      [vg]"=&f"(vg),                       [vr]"=&f"(vr),
+      [bb]"=&f"(bb),                       [bg]"=&f"(bg),
+      [br]"=&f"(br),                       [yg]"=&f"(yg)
+    : [y_ptr]"r"(src_y),                   [u_ptr]"r"(src_u),
+      [v_ptr]"r"(src_v),                   [rgbbuf_ptr]"r"(rgb_buf),
+      [yuvcons_ptr]"r"(yuvconstants),      [width]"r"(width),
+      [zero]"f"(0x00),                     [alpha]"f"(-1),
+      [six]"f"(0x6),                       [five]"f"(0x55),
+      [mask]"f"(mask)
+    : "memory"
+  );
 }
 
 // 10 bit YUV to ARGB
@@ -6232,96 +6238,102 @@ void I210ToARGBRow_MMI(const uint16_t* src_y,
                        uint8_t* rgb_buf,
                        const struct YuvConstants* yuvconstants,
                        int width) {
-  uint64_t y, u, v;
-  uint64_t b_vec[2], g_vec[2], r_vec[2];
+  uint64_t y,u,v;
+  uint64_t b_vec[2],g_vec[2],r_vec[2];
   uint64_t mask = 0xff00ff00ff00ff00ULL;
-  uint64_t ub, ug, vg, vr, bb, bg, br, yg;
+  uint64_t ub,ug,vg,vr,bb,bg,br,yg;
 
   __asm__ volatile(
-      "ldc1       %[yg],           0xc0(%[yuvcons_ptr])             \n\t"
-      "ldc1       %[bb],           0x60(%[yuvcons_ptr])             \n\t"
-      "ldc1       %[ub],           0x00(%[yuvcons_ptr])             \n\t"
-      "or         %[ub],           %[ub],             %[mask]       \n\t"
-      "ldc1       %[bg],           0x80(%[yuvcons_ptr])             \n\t"
-      "ldc1       %[ug],           0x20(%[yuvcons_ptr])             \n\t"
-      "punpcklbh  %[ug],           %[ug],             %[zero]       \n\t"
-      "pshufh     %[ug],           %[ug],             %[zero]       \n\t"
-      "ldc1       %[vg],           0x20(%[yuvcons_ptr])             \n\t"
-      "punpcklbh  %[vg],           %[vg],             %[zero]       \n\t"
-      "pshufh     %[vg],           %[vg],             %[five]       \n\t"
-      "ldc1       %[br],           0xa0(%[yuvcons_ptr])             \n\t"
-      "ldc1       %[vr],           0x40(%[yuvcons_ptr])             \n\t"
-      "punpcklbh  %[vr],           %[vr],             %[zero]       \n\t"
-      "pshufh     %[vr],           %[vr],             %[five]       \n\t"
-      "or         %[vr],           %[vr],             %[mask]       \n\t"
+    "ldc1       %[yg],           0xc0(%[yuvcons_ptr])             \n\t"
+    "ldc1       %[bb],           0x60(%[yuvcons_ptr])             \n\t"
+    "ldc1       %[ub],           0x00(%[yuvcons_ptr])             \n\t"
+    "or         %[ub],           %[ub],             %[mask]       \n\t"
+    "ldc1       %[bg],           0x80(%[yuvcons_ptr])             \n\t"
+    "ldc1       %[ug],           0x20(%[yuvcons_ptr])             \n\t"
+    "punpcklbh  %[ug],           %[ug],             %[zero]       \n\t"
+    "pshufh     %[ug],           %[ug],             %[zero]       \n\t"
+    "ldc1       %[vg],           0x20(%[yuvcons_ptr])             \n\t"
+    "punpcklbh  %[vg],           %[vg],             %[zero]       \n\t"
+    "pshufh     %[vg],           %[vg],             %[five]       \n\t"
+    "ldc1       %[br],           0xa0(%[yuvcons_ptr])             \n\t"
+    "ldc1       %[vr],           0x40(%[yuvcons_ptr])             \n\t"
+    "punpcklbh  %[vr],           %[vr],             %[zero]       \n\t"
+    "pshufh     %[vr],           %[vr],             %[five]       \n\t"
+    "or         %[vr],           %[vr],             %[mask]       \n\t"
 
-      "1:                                                           \n\t"
-      "gsldlc1    %[y],            0x07(%[y_ptr])                   \n\t"
-      "gsldrc1    %[y],            0x00(%[y_ptr])                   \n\t"
-      "gslwlc1    %[u],            0x03(%[u_ptr])                   \n\t"
-      "gslwrc1    %[u],            0x00(%[u_ptr])                   \n\t"
-      "gslwlc1    %[v],            0x03(%[v_ptr])                   \n\t"
-      "gslwrc1    %[v],            0x00(%[v_ptr])                   \n\t"
+    "1:                                                           \n\t"
+    "gsldlc1    %[y],            0x07(%[y_ptr])                   \n\t"
+    "gsldrc1    %[y],            0x00(%[y_ptr])                   \n\t"
+    "gslwlc1    %[u],            0x03(%[u_ptr])                   \n\t"
+    "gslwrc1    %[u],            0x00(%[u_ptr])                   \n\t"
+    "gslwlc1    %[v],            0x03(%[v_ptr])                   \n\t"
+    "gslwrc1    %[v],            0x00(%[v_ptr])                   \n\t"
 
-      "psllh      %[y],            %[y],              %[six]        \n\t"
-      "pmulhuh    %[y],            %[y],              %[yg]         \n\t"
+    "psllh      %[y],            %[y],              %[six]        \n\t"
+    "pmulhuh    %[y],            %[y],              %[yg]         \n\t"
 
-      "punpcklhw  %[u],            %[u],              %[u]          \n\t"
-      "psrah      %[u],            %[u],              %[two]        \n\t"
-      "punpcklhw  %[v],            %[v],              %[v]          \n\t"
-      "psrah      %[v],            %[v],              %[two]        \n\t"
-      "pminsh     %[u],            %[u],              %[mask1]      \n\t"
-      "pminsh     %[v],            %[v],              %[mask1]      \n\t"
+    "punpcklhw  %[u],            %[u],              %[u]          \n\t"
+    "psrah      %[u],            %[u],              %[two]        \n\t"
+    "punpcklhw  %[v],            %[v],              %[v]          \n\t"
+    "psrah      %[v],            %[v],              %[two]        \n\t"
+    "pminsh     %[u],            %[u],              %[mask1]      \n\t"
+    "pminsh     %[v],            %[v],              %[mask1]      \n\t"
 
-      "paddsh     %[b_vec0],       %[y],              %[bb]         \n\t"
-      "pmullh     %[b_vec1],       %[u],              %[ub]         \n\t"
-      "psubsh     %[b_vec0],       %[b_vec0],         %[b_vec1]     \n\t"
+    "paddsh     %[b_vec0],       %[y],              %[bb]         \n\t"
+    "pmullh     %[b_vec1],       %[u],              %[ub]         \n\t"
+    "psubsh     %[b_vec0],       %[b_vec0],         %[b_vec1]     \n\t"
 
-      "paddsh     %[g_vec0],       %[y],              %[bg]         \n\t"
-      "pmullh     %[g_vec1],       %[u],              %[ug]         \n\t"
-      "psubsh     %[g_vec0],       %[g_vec0],         %[g_vec1]     \n\t"
-      "pmullh     %[g_vec1],       %[v],              %[vg]         \n\t"
-      "psubsh     %[g_vec0],       %[g_vec0],         %[g_vec1]     \n\t"
+    "paddsh     %[g_vec0],       %[y],              %[bg]         \n\t"
+    "pmullh     %[g_vec1],       %[u],              %[ug]         \n\t"
+    "psubsh     %[g_vec0],       %[g_vec0],         %[g_vec1]     \n\t"
+    "pmullh     %[g_vec1],       %[v],              %[vg]         \n\t"
+    "psubsh     %[g_vec0],       %[g_vec0],         %[g_vec1]     \n\t"
 
-      "paddsh     %[r_vec0],       %[y],              %[br]         \n\t"
-      "pmullh     %[r_vec1],       %[v],              %[vr]         \n\t"
-      "psubsh     %[r_vec0],       %[r_vec0],         %[r_vec1]     \n\t"
+    "paddsh     %[r_vec0],       %[y],              %[br]         \n\t"
+    "pmullh     %[r_vec1],       %[v],              %[vr]         \n\t"
+    "psubsh     %[r_vec0],       %[r_vec0],         %[r_vec1]     \n\t"
 
-      "psrah      %[b_vec0],       %[b_vec0],         %[six]        \n\t"
-      "psrah      %[g_vec0],       %[g_vec0],         %[six]        \n\t"
-      "psrah      %[r_vec0],       %[r_vec0],         %[six]        \n\t"
+    "psrah      %[b_vec0],       %[b_vec0],         %[six]        \n\t"
+    "psrah      %[g_vec0],       %[g_vec0],         %[six]        \n\t"
+    "psrah      %[r_vec0],       %[r_vec0],         %[six]        \n\t"
 
-      "packushb   %[r_vec0],       %[b_vec0],         %[r_vec0]     \n\t"
-      "packushb   %[g_vec0],       %[g_vec0],         %[alpha]      \n\t"
-      "punpcklwd  %[g_vec0],       %[g_vec0],         %[alpha]      \n\t"
-      "punpcklbh  %[b_vec0],       %[r_vec0],         %[g_vec0]     \n\t"
-      "punpckhbh  %[r_vec0],       %[r_vec0],         %[g_vec0]     \n\t"
-      "punpcklhw  %[g_vec0],       %[b_vec0],         %[r_vec0]     \n\t"
-      "punpckhhw  %[g_vec1],       %[b_vec0],         %[r_vec0]     \n\t"
-      "gssdlc1    %[g_vec0],       0x07(%[rgbbuf_ptr])              \n\t"
-      "gssdrc1    %[g_vec0],       0x00(%[rgbbuf_ptr])              \n\t"
-      "gssdlc1    %[g_vec1],       0x0f(%[rgbbuf_ptr])              \n\t"
-      "gssdrc1    %[g_vec1],       0x08(%[rgbbuf_ptr])              \n\t"
+    "packushb   %[r_vec0],       %[b_vec0],         %[r_vec0]     \n\t"
+    "packushb   %[g_vec0],       %[g_vec0],         %[alpha]      \n\t"
+    "punpcklwd  %[g_vec0],       %[g_vec0],         %[alpha]      \n\t"
+    "punpcklbh  %[b_vec0],       %[r_vec0],         %[g_vec0]     \n\t"
+    "punpckhbh  %[r_vec0],       %[r_vec0],         %[g_vec0]     \n\t"
+    "punpcklhw  %[g_vec0],       %[b_vec0],         %[r_vec0]     \n\t"
+    "punpckhhw  %[g_vec1],       %[b_vec0],         %[r_vec0]     \n\t"
+    "gssdlc1    %[g_vec0],       0x07(%[rgbbuf_ptr])              \n\t"
+    "gssdrc1    %[g_vec0],       0x00(%[rgbbuf_ptr])              \n\t"
+    "gssdlc1    %[g_vec1],       0x0f(%[rgbbuf_ptr])              \n\t"
+    "gssdrc1    %[g_vec1],       0x08(%[rgbbuf_ptr])              \n\t"
 
-      "daddiu     %[y_ptr],        %[y_ptr],          0x08          \n\t"
-      "daddiu     %[u_ptr],        %[u_ptr],          0x04          \n\t"
-      "daddiu     %[v_ptr],        %[v_ptr],          0x04          \n\t"
-      "daddiu     %[rgbbuf_ptr],   %[rgbbuf_ptr],     0x10          \n\t"
-      "daddi      %[width],        %[width],          -0x04         \n\t"
-      "bnez       %[width],        1b                               \n\t"
+    "daddiu     %[y_ptr],        %[y_ptr],          0x08          \n\t"
+    "daddiu     %[u_ptr],        %[u_ptr],          0x04          \n\t"
+    "daddiu     %[v_ptr],        %[v_ptr],          0x04          \n\t"
+    "daddiu     %[rgbbuf_ptr],   %[rgbbuf_ptr],     0x10          \n\t"
+    "daddi      %[width],        %[width],          -0x04         \n\t"
+    "bnez       %[width],        1b                               \n\t"
 
-      : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec0] "=&f"(b_vec[0]),
-        [b_vec1] "=&f"(b_vec[1]), [g_vec0] "=&f"(g_vec[0]),
-        [g_vec1] "=&f"(g_vec[1]), [r_vec0] "=&f"(r_vec[0]),
-        [r_vec1] "=&f"(r_vec[1]), [ub] "=&f"(ub), [ug] "=&f"(ug),
-        [vg] "=&f"(vg), [vr] "=&f"(vr), [bb] "=&f"(bb), [bg] "=&f"(bg),
-        [br] "=&f"(br), [yg] "=&f"(yg)
-      : [y_ptr] "r"(src_y), [u_ptr] "r"(src_u), [v_ptr] "r"(src_v),
-        [rgbbuf_ptr] "r"(rgb_buf), [yuvcons_ptr] "r"(yuvconstants),
-        [width] "r"(width), [zero] "f"(0x00), [alpha] "f"(-1), [six] "f"(0x6),
-        [five] "f"(0x55), [mask] "f"(mask), [two] "f"(0x02),
-        [mask1] "f"(0x00ff00ff00ff00ff)
-      : "memory");
+    : [y]"=&f"(y),
+      [u]"=&f"(u),                         [v]"=&f"(v),
+      [b_vec0]"=&f"(b_vec[0]),             [b_vec1]"=&f"(b_vec[1]),
+      [g_vec0]"=&f"(g_vec[0]),             [g_vec1]"=&f"(g_vec[1]),
+      [r_vec0]"=&f"(r_vec[0]),             [r_vec1]"=&f"(r_vec[1]),
+      [ub]"=&f"(ub),                       [ug]"=&f"(ug),
+      [vg]"=&f"(vg),                       [vr]"=&f"(vr),
+      [bb]"=&f"(bb),                       [bg]"=&f"(bg),
+      [br]"=&f"(br),                       [yg]"=&f"(yg)
+    : [y_ptr]"r"(src_y),                   [u_ptr]"r"(src_u),
+      [v_ptr]"r"(src_v),                   [rgbbuf_ptr]"r"(rgb_buf),
+      [yuvcons_ptr]"r"(yuvconstants),      [width]"r"(width),
+      [zero]"f"(0x00),                     [alpha]"f"(-1),
+      [six]"f"(0x6),                       [five]"f"(0x55),
+      [mask]"f"(mask),                     [two]"f"(0x02),
+      [mask1]"f"(0x00ff00ff00ff00ff)
+    : "memory"
+  );
 }
 
 void I422AlphaToARGBRow_MMI(const uint8_t* src_y,
@@ -6331,96 +6343,102 @@ void I422AlphaToARGBRow_MMI(const uint8_t* src_y,
                             uint8_t* rgb_buf,
                             const struct YuvConstants* yuvconstants,
                             int width) {
-  uint64_t y, u, v, a;
-  uint64_t b_vec[2], g_vec[2], r_vec[2];
+  uint64_t y,u,v,a;
+  uint64_t b_vec[2],g_vec[2],r_vec[2];
   uint64_t mask = 0xff00ff00ff00ff00ULL;
-  uint64_t ub, ug, vg, vr, bb, bg, br, yg;
+  uint64_t ub,ug,vg,vr,bb,bg,br,yg;
 
   __asm__ volatile(
-      "ldc1       %[yg],           0xc0(%[yuvcons_ptr])             \n\t"
-      "ldc1       %[bb],           0x60(%[yuvcons_ptr])             \n\t"
-      "ldc1       %[ub],           0x00(%[yuvcons_ptr])             \n\t"
-      "or         %[ub],           %[ub],             %[mask]       \n\t"
-      "ldc1       %[bg],           0x80(%[yuvcons_ptr])             \n\t"
-      "ldc1       %[ug],           0x20(%[yuvcons_ptr])             \n\t"
-      "punpcklbh  %[ug],           %[ug],             %[zero]       \n\t"
-      "pshufh     %[ug],           %[ug],             %[zero]       \n\t"
-      "ldc1       %[vg],           0x20(%[yuvcons_ptr])             \n\t"
-      "punpcklbh  %[vg],           %[vg],             %[zero]       \n\t"
-      "pshufh     %[vg],           %[vg],             %[five]       \n\t"
-      "ldc1       %[br],           0xa0(%[yuvcons_ptr])             \n\t"
-      "ldc1       %[vr],           0x40(%[yuvcons_ptr])             \n\t"
-      "punpcklbh  %[vr],           %[vr],             %[zero]       \n\t"
-      "pshufh     %[vr],           %[vr],             %[five]       \n\t"
-      "or         %[vr],           %[vr],             %[mask]       \n\t"
+    "ldc1       %[yg],           0xc0(%[yuvcons_ptr])             \n\t"
+    "ldc1       %[bb],           0x60(%[yuvcons_ptr])             \n\t"
+    "ldc1       %[ub],           0x00(%[yuvcons_ptr])             \n\t"
+    "or         %[ub],           %[ub],             %[mask]       \n\t"
+    "ldc1       %[bg],           0x80(%[yuvcons_ptr])             \n\t"
+    "ldc1       %[ug],           0x20(%[yuvcons_ptr])             \n\t"
+    "punpcklbh  %[ug],           %[ug],             %[zero]       \n\t"
+    "pshufh     %[ug],           %[ug],             %[zero]       \n\t"
+    "ldc1       %[vg],           0x20(%[yuvcons_ptr])             \n\t"
+    "punpcklbh  %[vg],           %[vg],             %[zero]       \n\t"
+    "pshufh     %[vg],           %[vg],             %[five]       \n\t"
+    "ldc1       %[br],           0xa0(%[yuvcons_ptr])             \n\t"
+    "ldc1       %[vr],           0x40(%[yuvcons_ptr])             \n\t"
+    "punpcklbh  %[vr],           %[vr],             %[zero]       \n\t"
+    "pshufh     %[vr],           %[vr],             %[five]       \n\t"
+    "or         %[vr],           %[vr],             %[mask]       \n\t"
 
-      "1:                                                           \n\t"
-      "gslwlc1    %[y],            0x03(%[y_ptr])                   \n\t"
-      "gslwrc1    %[y],            0x00(%[y_ptr])                   \n\t"
-      "gslwlc1    %[u],            0x03(%[u_ptr])                   \n\t"
-      "gslwrc1    %[u],            0x00(%[u_ptr])                   \n\t"
-      "gslwlc1    %[v],            0x03(%[v_ptr])                   \n\t"
-      "gslwrc1    %[v],            0x00(%[v_ptr])                   \n\t"
-      "gslwlc1    %[a],            0x03(%[a_ptr])                   \n\t"
-      "gslwrc1    %[a],            0x00(%[a_ptr])                   \n\t"
+    "1:                                                           \n\t"
+    "gslwlc1    %[y],            0x03(%[y_ptr])                   \n\t"
+    "gslwrc1    %[y],            0x00(%[y_ptr])                   \n\t"
+    "gslwlc1    %[u],            0x03(%[u_ptr])                   \n\t"
+    "gslwrc1    %[u],            0x00(%[u_ptr])                   \n\t"
+    "gslwlc1    %[v],            0x03(%[v_ptr])                   \n\t"
+    "gslwrc1    %[v],            0x00(%[v_ptr])                   \n\t"
+    "gslwlc1    %[a],            0x03(%[a_ptr])                   \n\t"
+    "gslwrc1    %[a],            0x00(%[a_ptr])                   \n\t"
 
-      "punpcklbh  %[y],            %[y],              %[y]          \n\t"  // y*0x0101
-      "pmulhuh    %[y],            %[y],              %[yg]         \n\t"  // y1
+    "punpcklbh  %[y],            %[y],              %[y]          \n\t"//y*0x0101
+    "pmulhuh    %[y],            %[y],              %[yg]         \n\t"//y1
 
-      // u3|u2|u1|u0 --> u1|u1|u0|u0
-      "punpcklbh  %[u],            %[u],              %[u]          \n\t"  // u
-      "punpcklbh  %[u],            %[u],              %[zero]       \n\t"
-      "paddsh     %[b_vec0],       %[y],              %[bb]         \n\t"
-      "pmullh     %[b_vec1],       %[u],              %[ub]         \n\t"
-      "psubsh     %[b_vec0],       %[b_vec0],         %[b_vec1]     \n\t"
-      "psrah      %[b_vec0],       %[b_vec0],         %[six]        \n\t"
+    //u3|u2|u1|u0 --> u1|u1|u0|u0
+    "punpcklbh  %[u],            %[u],              %[u]          \n\t"//u
+    "punpcklbh  %[u],            %[u],              %[zero]       \n\t"
+    "paddsh     %[b_vec0],       %[y],              %[bb]         \n\t"
+    "pmullh     %[b_vec1],       %[u],              %[ub]         \n\t"
+    "psubsh     %[b_vec0],       %[b_vec0],         %[b_vec1]     \n\t"
+    "psrah      %[b_vec0],       %[b_vec0],         %[six]        \n\t"
 
-      // v3|v2|v1|v0 --> v1|v1|v0|v0
-      "punpcklbh  %[v],            %[v],              %[v]          \n\t"
-      "punpcklbh  %[v],            %[v],              %[zero]       \n\t"
-      "paddsh     %[g_vec0],       %[y],              %[bg]         \n\t"
-      "pmullh     %[g_vec1],       %[u],              %[ug]         \n\t"
-      "psubsh     %[g_vec0],       %[g_vec0],         %[g_vec1]     \n\t"
-      "pmullh     %[g_vec1],       %[v],              %[vg]         \n\t"
-      "psubsh     %[g_vec0],       %[g_vec0],         %[g_vec1]     \n\t"
-      "psrah      %[g_vec0],       %[g_vec0],         %[six]        \n\t"
+    //v3|v2|v1|v0 --> v1|v1|v0|v0
+    "punpcklbh  %[v],            %[v],              %[v]          \n\t"
+    "punpcklbh  %[v],            %[v],              %[zero]       \n\t"
+    "paddsh     %[g_vec0],       %[y],              %[bg]         \n\t"
+    "pmullh     %[g_vec1],       %[u],              %[ug]         \n\t"
+    "psubsh     %[g_vec0],       %[g_vec0],         %[g_vec1]     \n\t"
+    "pmullh     %[g_vec1],       %[v],              %[vg]         \n\t"
+    "psubsh     %[g_vec0],       %[g_vec0],         %[g_vec1]     \n\t"
+    "psrah      %[g_vec0],       %[g_vec0],         %[six]        \n\t"
 
-      "paddsh     %[r_vec0],       %[y],              %[br]         \n\t"
-      "pmullh     %[r_vec1],       %[v],              %[vr]         \n\t"
-      "psubsh     %[r_vec0],       %[r_vec0],         %[r_vec1]     \n\t"
-      "psrah      %[r_vec0],       %[r_vec0],         %[six]        \n\t"
+    "paddsh     %[r_vec0],       %[y],              %[br]         \n\t"
+    "pmullh     %[r_vec1],       %[v],              %[vr]         \n\t"
+    "psubsh     %[r_vec0],       %[r_vec0],         %[r_vec1]     \n\t"
+    "psrah      %[r_vec0],       %[r_vec0],         %[six]        \n\t"
 
-      "packushb   %[r_vec0],       %[b_vec0],         %[r_vec0]     \n\t"  // rrrrbbbb
-      "packushb   %[g_vec0],       %[g_vec0],         %[a]          \n\t"
-      "punpcklwd  %[g_vec0],       %[g_vec0],         %[a]          \n\t"  // aaaagggg
-      "punpcklbh  %[b_vec0],       %[r_vec0],         %[g_vec0]     \n\t"
-      "punpckhbh  %[r_vec0],       %[r_vec0],         %[g_vec0]     \n\t"
-      "punpcklhw  %[g_vec0],       %[b_vec0],         %[r_vec0]     \n\t"
-      "punpckhhw  %[g_vec1],       %[b_vec0],         %[r_vec0]     \n\t"
-      "gssdlc1    %[g_vec0],       0x07(%[rgbbuf_ptr])              \n\t"
-      "gssdrc1    %[g_vec0],       0x00(%[rgbbuf_ptr])              \n\t"
-      "gssdlc1    %[g_vec1],       0x0f(%[rgbbuf_ptr])              \n\t"
-      "gssdrc1    %[g_vec1],       0x08(%[rgbbuf_ptr])              \n\t"
+    "packushb   %[r_vec0],       %[b_vec0],         %[r_vec0]     \n\t"//rrrrbbbb
+    "packushb   %[g_vec0],       %[g_vec0],         %[a]          \n\t"
+    "punpcklwd  %[g_vec0],       %[g_vec0],         %[a]          \n\t"//aaaagggg
+    "punpcklbh  %[b_vec0],       %[r_vec0],         %[g_vec0]     \n\t"
+    "punpckhbh  %[r_vec0],       %[r_vec0],         %[g_vec0]     \n\t"
+    "punpcklhw  %[g_vec0],       %[b_vec0],         %[r_vec0]     \n\t"
+    "punpckhhw  %[g_vec1],       %[b_vec0],         %[r_vec0]     \n\t"
+    "gssdlc1    %[g_vec0],       0x07(%[rgbbuf_ptr])              \n\t"
+    "gssdrc1    %[g_vec0],       0x00(%[rgbbuf_ptr])              \n\t"
+    "gssdlc1    %[g_vec1],       0x0f(%[rgbbuf_ptr])              \n\t"
+    "gssdrc1    %[g_vec1],       0x08(%[rgbbuf_ptr])              \n\t"
 
-      "daddiu     %[y_ptr],        %[y_ptr],          0x04          \n\t"
-      "daddiu     %[a_ptr],        %[a_ptr],          0x04          \n\t"
-      "daddiu     %[u_ptr],        %[u_ptr],          0x02          \n\t"
-      "daddiu     %[v_ptr],        %[v_ptr],          0x02          \n\t"
-      "daddiu     %[rgbbuf_ptr],   %[rgbbuf_ptr],     0x10          \n\t"
-      "daddi      %[width],        %[width],          -0x04         \n\t"
-      "bnez       %[width],        1b                               \n\t"
+    "daddiu     %[y_ptr],        %[y_ptr],          0x04          \n\t"
+    "daddiu     %[a_ptr],        %[a_ptr],          0x04          \n\t"
+    "daddiu     %[u_ptr],        %[u_ptr],          0x02          \n\t"
+    "daddiu     %[v_ptr],        %[v_ptr],          0x02          \n\t"
+    "daddiu     %[rgbbuf_ptr],   %[rgbbuf_ptr],     0x10          \n\t"
+    "daddi      %[width],        %[width],          -0x04         \n\t"
+    "bnez       %[width],        1b                               \n\t"
 
-      : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [a] "=&f"(a),
-        [b_vec0] "=&f"(b_vec[0]), [b_vec1] "=&f"(b_vec[1]),
-        [g_vec0] "=&f"(g_vec[0]), [g_vec1] "=&f"(g_vec[1]),
-        [r_vec0] "=&f"(r_vec[0]), [r_vec1] "=&f"(r_vec[1]), [ub] "=&f"(ub),
-        [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr), [bb] "=&f"(bb),
-        [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg)
-      : [y_ptr] "r"(src_y), [u_ptr] "r"(src_u), [v_ptr] "r"(src_v),
-        [rgbbuf_ptr] "r"(rgb_buf), [yuvcons_ptr] "r"(yuvconstants),
-        [width] "r"(width), [a_ptr] "r"(src_a), [zero] "f"(0x00),
-        [six] "f"(0x6), [five] "f"(0x55), [mask] "f"(mask)
-      : "memory");
+    : [y]"=&f"(y),                         [u]"=&f"(u),
+      [v]"=&f"(v),                         [a]"=&f"(a),
+      [b_vec0]"=&f"(b_vec[0]),             [b_vec1]"=&f"(b_vec[1]),
+      [g_vec0]"=&f"(g_vec[0]),             [g_vec1]"=&f"(g_vec[1]),
+      [r_vec0]"=&f"(r_vec[0]),             [r_vec1]"=&f"(r_vec[1]),
+      [ub]"=&f"(ub),                       [ug]"=&f"(ug),
+      [vg]"=&f"(vg),                       [vr]"=&f"(vr),
+      [bb]"=&f"(bb),                       [bg]"=&f"(bg),
+      [br]"=&f"(br),                       [yg]"=&f"(yg)
+    : [y_ptr]"r"(src_y),                   [u_ptr]"r"(src_u),
+      [v_ptr]"r"(src_v),                   [rgbbuf_ptr]"r"(rgb_buf),
+      [yuvcons_ptr]"r"(yuvconstants),      [width]"r"(width),
+      [a_ptr]"r"(src_a),                   [zero]"f"(0x00),
+      [six]"f"(0x6),                       [five]"f"(0x55),
+      [mask]"f"(mask)
+    : "memory"
+  );
 }
 
 void I422ToRGB24Row_MMI(const uint8_t* src_y,
@@ -6429,105 +6447,113 @@ void I422ToRGB24Row_MMI(const uint8_t* src_y,
                         uint8_t* rgb_buf,
                         const struct YuvConstants* yuvconstants,
                         int width) {
-  uint64_t y, u, v;
-  uint64_t b_vec[2], g_vec[2], r_vec[2];
+  uint64_t y,u,v;
+  uint64_t b_vec[2],g_vec[2],r_vec[2];
   uint64_t mask = 0xff00ff00ff00ff00ULL;
-  uint64_t ub, ug, vg, vr, bb, bg, br, yg;
+  uint64_t ub,ug,vg,vr,bb,bg,br,yg;
 
   __asm__ volatile(
-      "ldc1       %[yg],           0xc0(%[yuvcons_ptr])             \n\t"
-      "ldc1       %[bb],           0x60(%[yuvcons_ptr])             \n\t"
-      "ldc1       %[ub],           0x00(%[yuvcons_ptr])             \n\t"
-      "or         %[ub],           %[ub],             %[mask]       \n\t"
-      "ldc1       %[bg],           0x80(%[yuvcons_ptr])             \n\t"
-      "ldc1       %[ug],           0x20(%[yuvcons_ptr])             \n\t"
-      "punpcklbh  %[ug],           %[ug],             %[zero]       \n\t"
-      "pshufh     %[ug],           %[ug],             %[zero]       \n\t"
-      "ldc1       %[vg],           0x20(%[yuvcons_ptr])             \n\t"
-      "punpcklbh  %[vg],           %[vg],             %[zero]       \n\t"
-      "pshufh     %[vg],           %[vg],             %[five]       \n\t"
-      "ldc1       %[br],           0xa0(%[yuvcons_ptr])             \n\t"
-      "ldc1       %[vr],           0x40(%[yuvcons_ptr])             \n\t"
-      "punpcklbh  %[vr],           %[vr],             %[zero]       \n\t"
-      "pshufh     %[vr],           %[vr],             %[five]       \n\t"
-      "or         %[vr],           %[vr],             %[mask]       \n\t"
+    "ldc1       %[yg],           0xc0(%[yuvcons_ptr])             \n\t"
+    "ldc1       %[bb],           0x60(%[yuvcons_ptr])             \n\t"
+    "ldc1       %[ub],           0x00(%[yuvcons_ptr])             \n\t"
+    "or         %[ub],           %[ub],             %[mask]       \n\t"
+    "ldc1       %[bg],           0x80(%[yuvcons_ptr])             \n\t"
+    "ldc1       %[ug],           0x20(%[yuvcons_ptr])             \n\t"
+    "punpcklbh  %[ug],           %[ug],             %[zero]       \n\t"
+    "pshufh     %[ug],           %[ug],             %[zero]       \n\t"
+    "ldc1       %[vg],           0x20(%[yuvcons_ptr])             \n\t"
+    "punpcklbh  %[vg],           %[vg],             %[zero]       \n\t"
+    "pshufh     %[vg],           %[vg],             %[five]       \n\t"
+    "ldc1       %[br],           0xa0(%[yuvcons_ptr])             \n\t"
+    "ldc1       %[vr],           0x40(%[yuvcons_ptr])             \n\t"
+    "punpcklbh  %[vr],           %[vr],             %[zero]       \n\t"
+    "pshufh     %[vr],           %[vr],             %[five]       \n\t"
+    "or         %[vr],           %[vr],             %[mask]       \n\t"
 
-      "1:                                                           \n\t"
-      "gslwlc1    %[y],            0x03(%[y_ptr])                   \n\t"
-      "gslwrc1    %[y],            0x00(%[y_ptr])                   \n\t"
-      "gslwlc1    %[u],            0x03(%[u_ptr])                   \n\t"
-      "gslwrc1    %[u],            0x00(%[u_ptr])                   \n\t"
-      "gslwlc1    %[v],            0x03(%[v_ptr])                   \n\t"
-      "gslwrc1    %[v],            0x00(%[v_ptr])                   \n\t"
+    "1:                                                           \n\t"
+    "gslwlc1    %[y],            0x03(%[y_ptr])                   \n\t"
+    "gslwrc1    %[y],            0x00(%[y_ptr])                   \n\t"
+    "gslwlc1    %[u],            0x03(%[u_ptr])                   \n\t"
+    "gslwrc1    %[u],            0x00(%[u_ptr])                   \n\t"
+    "gslwlc1    %[v],            0x03(%[v_ptr])                   \n\t"
+    "gslwrc1    %[v],            0x00(%[v_ptr])                   \n\t"
 
-      "punpcklbh  %[y],            %[y],              %[y]          \n\t"  // y*0x0101
-      "pmulhuh    %[y],            %[y],              %[yg]         \n\t"  // y1
+    "punpcklbh  %[y],            %[y],              %[y]          \n\t"  // y * 0x0101 (each Y byte duplicated into a 16-bit lane)
+    "pmulhuh    %[y],            %[y],              %[yg]         \n\t"  // y1 = high 16 bits of (y * 0x0101) * yg
 
-      // u3|u2|u1|u0 --> u1|u1|u0|u0
-      "punpcklbh  %[u],            %[u],              %[u]          \n\t"  // u
-      "punpcklbh  %[u],            %[u],              %[zero]       \n\t"
-      "paddsh     %[b_vec0],       %[y],              %[bb]         \n\t"
-      "pmullh     %[b_vec1],       %[u],              %[ub]         \n\t"
-      "psubsh     %[b_vec0],       %[b_vec0],         %[b_vec1]     \n\t"
-      "psrah      %[b_vec0],       %[b_vec0],         %[six]        \n\t"
+    // u3|u2|u1|u0 --> u1|u1|u0|u0
+    "punpcklbh  %[u],            %[u],              %[u]          \n\t"  // duplicate each U byte: one U sample covers two pixels
+    "punpcklbh  %[u],            %[u],              %[zero]       \n\t"
+    "paddsh     %[b_vec0],       %[y],              %[bb]         \n\t"
+    "pmullh     %[b_vec1],       %[u],              %[ub]         \n\t"
+    "psubsh     %[b_vec0],       %[b_vec0],         %[b_vec1]     \n\t"
+    "psrah      %[b_vec0],       %[b_vec0],         %[six]        \n\t"
 
-      // v3|v2|v1|v0 --> v1|v1|v0|v0
-      "punpcklbh  %[v],            %[v],              %[v]          \n\t"
-      "punpcklbh  %[v],            %[v],              %[zero]       \n\t"
-      "paddsh     %[g_vec0],       %[y],              %[bg]         \n\t"
-      "pmullh     %[g_vec1],       %[u],              %[ug]         \n\t"
-      "psubsh     %[g_vec0],       %[g_vec0],         %[g_vec1]     \n\t"
-      "pmullh     %[g_vec1],       %[v],              %[vg]         \n\t"
-      "psubsh     %[g_vec0],       %[g_vec0],         %[g_vec1]     \n\t"
-      "psrah      %[g_vec0],       %[g_vec0],         %[six]        \n\t"
+    // v3|v2|v1|v0 --> v1|v1|v0|v0
+    "punpcklbh  %[v],            %[v],              %[v]          \n\t"
+    "punpcklbh  %[v],            %[v],              %[zero]       \n\t"
+    "paddsh     %[g_vec0],       %[y],              %[bg]         \n\t"
+    "pmullh     %[g_vec1],       %[u],              %[ug]         \n\t"
+    "psubsh     %[g_vec0],       %[g_vec0],         %[g_vec1]     \n\t"
+    "pmullh     %[g_vec1],       %[v],              %[vg]         \n\t"
+    "psubsh     %[g_vec0],       %[g_vec0],         %[g_vec1]     \n\t"
+    "psrah      %[g_vec0],       %[g_vec0],         %[six]        \n\t"
 
-      "paddsh     %[r_vec0],       %[y],              %[br]         \n\t"
-      "pmullh     %[r_vec1],       %[v],              %[vr]         \n\t"
-      "psubsh     %[r_vec0],       %[r_vec0],         %[r_vec1]     \n\t"
-      "psrah      %[r_vec0],       %[r_vec0],         %[six]        \n\t"
+    "paddsh     %[r_vec0],       %[y],              %[br]         \n\t"
+    "pmullh     %[r_vec1],       %[v],              %[vr]         \n\t"
+    "psubsh     %[r_vec0],       %[r_vec0],         %[r_vec1]     \n\t"
+    "psrah      %[r_vec0],       %[r_vec0],         %[six]        \n\t"
 
-      "packushb   %[r_vec0],       %[b_vec0],         %[r_vec0]     \n\t"
-      "packushb   %[g_vec0],       %[g_vec0],         %[zero]       \n\t"
-      "punpcklbh  %[b_vec0],       %[r_vec0],         %[g_vec0]     \n\t"
-      "punpckhbh  %[r_vec0],       %[r_vec0],         %[g_vec0]     \n\t"
-      "punpcklhw  %[g_vec0],       %[b_vec0],         %[r_vec0]     \n\t"
-      "punpckhhw  %[g_vec1],       %[b_vec0],         %[r_vec0]     \n\t"
+    "packushb   %[r_vec0],       %[b_vec0],         %[r_vec0]     \n\t"
+    "packushb   %[g_vec0],       %[g_vec0],         %[zero]       \n\t"
+    "punpcklbh  %[b_vec0],       %[r_vec0],         %[g_vec0]     \n\t"
+    "punpckhbh  %[r_vec0],       %[r_vec0],         %[g_vec0]     \n\t"
+    "punpcklhw  %[g_vec0],       %[b_vec0],         %[r_vec0]     \n\t"
+    "punpckhhw  %[g_vec1],       %[b_vec0],         %[r_vec0]     \n\t"
 
-      "punpckhwd  %[r_vec0],       %[g_vec0],         %[g_vec0]     \n\t"
-      "psllw      %[r_vec1],       %[r_vec0],         %[lmove1]     \n\t"
-      "or         %[g_vec0],       %[g_vec0],         %[r_vec1]     \n\t"
-      "psrlw      %[r_vec1],       %[r_vec0],         %[rmove1]     \n\t"
-      "pextrh     %[r_vec1],       %[r_vec1],         %[zero]       \n\t"
-      "pinsrh_2   %[g_vec0],       %[g_vec0],         %[r_vec1]     \n\t"
-      "pextrh     %[r_vec1],       %[g_vec1],         %[zero]       \n\t"
-      "pinsrh_3   %[g_vec0],       %[g_vec0],         %[r_vec1]     \n\t"
-      "pextrh     %[r_vec1],       %[g_vec1],         %[one]        \n\t"
-      "punpckhwd  %[g_vec1],       %[g_vec1],         %[g_vec1]     \n\t"
-      "psllw      %[g_vec1],       %[g_vec1],         %[rmove1]     \n\t"
-      "or         %[g_vec1],       %[g_vec1],         %[r_vec1]     \n\t"
-      "gssdlc1    %[g_vec0],       0x07(%[rgbbuf_ptr])              \n\t"
-      "gssdrc1    %[g_vec0],       0x00(%[rgbbuf_ptr])              \n\t"
-      "gsswlc1    %[g_vec1],       0x0b(%[rgbbuf_ptr])              \n\t"
-      "gsswrc1    %[g_vec1],       0x08(%[rgbbuf_ptr])              \n\t"
+    "punpckhwd  %[r_vec0],       %[g_vec0],         %[g_vec0]     \n\t"
+    "psllw      %[r_vec1],       %[r_vec0],         %[lmove1]     \n\t"
+    "or         %[g_vec0],       %[g_vec0],         %[r_vec1]     \n\t"
+    "psrlw      %[r_vec1],       %[r_vec0],         %[rmove1]     \n\t"
+    "pextrh     %[r_vec1],       %[r_vec1],         %[zero]       \n\t"
+    "pinsrh_2   %[g_vec0],       %[g_vec0],         %[r_vec1]     \n\t"
+    "pextrh     %[r_vec1],       %[g_vec1],         %[zero]       \n\t"
+    "pinsrh_3   %[g_vec0],       %[g_vec0],         %[r_vec1]     \n\t"
+    "pextrh     %[r_vec1],       %[g_vec1],         %[one]        \n\t"
+    "punpckhwd  %[g_vec1],       %[g_vec1],         %[g_vec1]     \n\t"
+    "psllw      %[g_vec1],       %[g_vec1],         %[rmove1]     \n\t"
+    "or         %[g_vec1],       %[g_vec1],         %[r_vec1]     \n\t"
+    "gssdlc1    %[g_vec0],       0x07(%[rgbbuf_ptr])              \n\t"
+    "gssdrc1    %[g_vec0],       0x00(%[rgbbuf_ptr])              \n\t"
+    "gsswlc1    %[g_vec1],       0x0b(%[rgbbuf_ptr])              \n\t"
+    "gsswrc1    %[g_vec1],       0x08(%[rgbbuf_ptr])              \n\t"
 
-      "daddiu     %[y_ptr],        %[y_ptr],          0x04          \n\t"
-      "daddiu     %[u_ptr],        %[u_ptr],          0x02          \n\t"
-      "daddiu     %[v_ptr],        %[v_ptr],          0x02          \n\t"
-      "daddiu     %[rgbbuf_ptr],   %[rgbbuf_ptr],     0x0c          \n\t"
-      "daddi      %[width],        %[width],          -0x04         \n\t"
-      "bnez       %[width],        1b                               \n\t"
 
-      : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec0] "=&f"(b_vec[0]),
-        [b_vec1] "=&f"(b_vec[1]), [g_vec0] "=&f"(g_vec[0]),
-        [g_vec1] "=&f"(g_vec[1]), [r_vec0] "=&f"(r_vec[0]),
-        [r_vec1] "=&f"(r_vec[1]), [ub] "=&f"(ub), [ug] "=&f"(ug),
-        [vg] "=&f"(vg), [vr] "=&f"(vr), [bb] "=&f"(bb), [bg] "=&f"(bg),
-        [br] "=&f"(br), [yg] "=&f"(yg)
-      : [y_ptr] "r"(src_y), [u_ptr] "r"(src_u), [v_ptr] "r"(src_v),
-        [rgbbuf_ptr] "r"(rgb_buf), [yuvcons_ptr] "r"(yuvconstants),
-        [width] "r"(width), [zero] "f"(0x00), [five] "f"(0x55), [six] "f"(0x6),
-        [mask] "f"(mask), [lmove1] "f"(0x18), [rmove1] "f"(0x8), [one] "f"(0x1)
-      : "memory");
+    "daddiu     %[y_ptr],        %[y_ptr],          0x04          \n\t"
+    "daddiu     %[u_ptr],        %[u_ptr],          0x02          \n\t"
+    "daddiu     %[v_ptr],        %[v_ptr],          0x02          \n\t"
+    "daddiu     %[rgbbuf_ptr],   %[rgbbuf_ptr],     0x0c          \n\t"
+    "daddi      %[width],        %[width],          -0x04         \n\t"
+    "bnez       %[width],        1b                               \n\t"
+
+    : [y]"=&f"(y),                         [u]"=&f"(u),
+      [v]"=&f"(v),
+      [b_vec0]"=&f"(b_vec[0]),             [b_vec1]"=&f"(b_vec[1]),
+      [g_vec0]"=&f"(g_vec[0]),             [g_vec1]"=&f"(g_vec[1]),
+      [r_vec0]"=&f"(r_vec[0]),             [r_vec1]"=&f"(r_vec[1]),
+      [ub]"=&f"(ub),                       [ug]"=&f"(ug),
+      [vg]"=&f"(vg),                       [vr]"=&f"(vr),
+      [bb]"=&f"(bb),                       [bg]"=&f"(bg),
+      [br]"=&f"(br),                       [yg]"=&f"(yg)
+    : [y_ptr]"r"(src_y),                   [u_ptr]"r"(src_u),
+      [v_ptr]"r"(src_v),                   [rgbbuf_ptr]"r"(rgb_buf),
+      [yuvcons_ptr]"r"(yuvconstants),      [width]"r"(width),
+      [zero]"f"(0x00),                     [five]"f"(0x55),
+      [six]"f"(0x6),                       [mask]"f"(mask),
+      [lmove1]"f"(0x18),                   [rmove1]"f"(0x8),
+      [one]"f"(0x1)
+    : "memory"
+  );
 }
 
 void I422ToARGB4444Row_MMI(const uint8_t* src_y,
@@ -6538,103 +6564,110 @@ void I422ToARGB4444Row_MMI(const uint8_t* src_y,
                            int width) {
   uint64_t y, u, v;
   uint64_t b_vec, g_vec, r_vec, temp;
-  uint64_t ub, ug, vg, vr, bb, bg, br, yg;
+  uint64_t ub,ug,vg,vr,bb,bg,br,yg;
 
   __asm__ volatile(
-      "ldc1       %[yg],           0xc0(%[yuvcons_ptr])             \n\t"
-      "ldc1       %[bb],           0x60(%[yuvcons_ptr])             \n\t"
-      "ldc1       %[ub],           0x00(%[yuvcons_ptr])             \n\t"
-      "or         %[ub],           %[ub],             %[mask]       \n\t"
-      "ldc1       %[bg],           0x80(%[yuvcons_ptr])             \n\t"
-      "ldc1       %[ug],           0x20(%[yuvcons_ptr])             \n\t"
-      "punpcklbh  %[ug],           %[ug],             %[zero]       \n\t"
-      "pshufh     %[ug],           %[ug],             %[zero]       \n\t"
-      "ldc1       %[vg],           0x20(%[yuvcons_ptr])             \n\t"
-      "punpcklbh  %[vg],           %[vg],             %[zero]       \n\t"
-      "pshufh     %[vg],           %[vg],             %[five]       \n\t"
-      "ldc1       %[br],           0xa0(%[yuvcons_ptr])             \n\t"
-      "ldc1       %[vr],           0x40(%[yuvcons_ptr])             \n\t"
-      "punpcklbh  %[vr],           %[vr],             %[zero]       \n\t"
-      "pshufh     %[vr],           %[vr],             %[five]       \n\t"
-      "or         %[vr],           %[vr],             %[mask]       \n\t"
+    "ldc1       %[yg],           0xc0(%[yuvcons_ptr])             \n\t"
+    "ldc1       %[bb],           0x60(%[yuvcons_ptr])             \n\t"
+    "ldc1       %[ub],           0x00(%[yuvcons_ptr])             \n\t"
+    "or         %[ub],           %[ub],             %[mask]       \n\t"
+    "ldc1       %[bg],           0x80(%[yuvcons_ptr])             \n\t"
+    "ldc1       %[ug],           0x20(%[yuvcons_ptr])             \n\t"
+    "punpcklbh  %[ug],           %[ug],             %[zero]       \n\t"
+    "pshufh     %[ug],           %[ug],             %[zero]       \n\t"
+    "ldc1       %[vg],           0x20(%[yuvcons_ptr])             \n\t"
+    "punpcklbh  %[vg],           %[vg],             %[zero]       \n\t"
+    "pshufh     %[vg],           %[vg],             %[five]       \n\t"
+    "ldc1       %[br],           0xa0(%[yuvcons_ptr])             \n\t"
+    "ldc1       %[vr],           0x40(%[yuvcons_ptr])             \n\t"
+    "punpcklbh  %[vr],           %[vr],             %[zero]       \n\t"
+    "pshufh     %[vr],           %[vr],             %[five]       \n\t"
+    "or         %[vr],           %[vr],             %[mask]       \n\t"
 
-      "1:                                                           \n\t"
-      "gslwlc1    %[y],            0x03(%[y_ptr])                   \n\t"
-      "gslwrc1    %[y],            0x00(%[y_ptr])                   \n\t"
-      "gslwlc1    %[u],            0x03(%[u_ptr])                   \n\t"
-      "gslwrc1    %[u],            0x00(%[u_ptr])                   \n\t"
-      "gslwlc1    %[v],            0x03(%[v_ptr])                   \n\t"
-      "gslwrc1    %[v],            0x00(%[v_ptr])                   \n\t"
+    "1:                                                           \n\t"
+    "gslwlc1    %[y],            0x03(%[y_ptr])                   \n\t"
+    "gslwrc1    %[y],            0x00(%[y_ptr])                   \n\t"
+    "gslwlc1    %[u],            0x03(%[u_ptr])                   \n\t"
+    "gslwrc1    %[u],            0x00(%[u_ptr])                   \n\t"
+    "gslwlc1    %[v],            0x03(%[v_ptr])                   \n\t"
+    "gslwrc1    %[v],            0x00(%[v_ptr])                   \n\t"
 
-      "punpcklbh  %[y],            %[y],              %[y]          \n\t"  // y*0x0101
-      "pmulhuh    %[y],            %[y],              %[yg]         \n\t"  // y1
+    "punpcklbh  %[y],            %[y],              %[y]          \n\t"  // y * 0x0101 (each Y byte duplicated into a 16-bit lane)
+    "pmulhuh    %[y],            %[y],              %[yg]         \n\t"  // y1 = high 16 bits of (y * 0x0101) * yg
 
-      // u3|u2|u1|u0 --> u1|u1|u0|u0
-      "punpcklbh  %[u],            %[u],              %[u]          \n\t"  // u
-      "punpcklbh  %[u],            %[u],              %[zero]       \n\t"
-      "paddsh     %[b_vec],        %[y],              %[bb]         \n\t"
-      "pmullh     %[temp],         %[u],              %[ub]         \n\t"
-      "psubsh     %[b_vec],        %[b_vec],          %[temp]       \n\t"
-      "psrah      %[b_vec],        %[b_vec],          %[six]        \n\t"
+    // u3|u2|u1|u0 --> u1|u1|u0|u0
+    "punpcklbh  %[u],            %[u],              %[u]          \n\t"  // duplicate each U byte: one U sample covers two pixels
+    "punpcklbh  %[u],            %[u],              %[zero]       \n\t"
+    "paddsh     %[b_vec],        %[y],              %[bb]         \n\t"
+    "pmullh     %[temp],         %[u],              %[ub]         \n\t"
+    "psubsh     %[b_vec],        %[b_vec],          %[temp]       \n\t"
+    "psrah      %[b_vec],        %[b_vec],          %[six]        \n\t"
 
-      // v3|v2|v1|v0 --> v1|v1|v0|v0
-      "punpcklbh  %[v],            %[v],              %[v]          \n\t"
-      "punpcklbh  %[v],            %[v],              %[zero]       \n\t"
-      "paddsh     %[g_vec],        %[y],              %[bg]         \n\t"
-      "pmullh     %[temp],         %[u],              %[ug]         \n\t"
-      "psubsh     %[g_vec],        %[g_vec],          %[temp]       \n\t"
-      "pmullh     %[temp],         %[v],              %[vg]         \n\t"
-      "psubsh     %[g_vec],        %[g_vec],          %[temp]       \n\t"
-      "psrah      %[g_vec],        %[g_vec],          %[six]        \n\t"
+    // v3|v2|v1|v0 --> v1|v1|v0|v0
+    "punpcklbh  %[v],            %[v],              %[v]          \n\t"
+    "punpcklbh  %[v],            %[v],              %[zero]       \n\t"
+    "paddsh     %[g_vec],        %[y],              %[bg]         \n\t"
+    "pmullh     %[temp],         %[u],              %[ug]         \n\t"
+    "psubsh     %[g_vec],        %[g_vec],          %[temp]       \n\t"
+    "pmullh     %[temp],         %[v],              %[vg]         \n\t"
+    "psubsh     %[g_vec],        %[g_vec],          %[temp]       \n\t"
+    "psrah      %[g_vec],        %[g_vec],          %[six]        \n\t"
 
-      "paddsh     %[r_vec],        %[y],              %[br]         \n\t"
-      "pmullh     %[temp],         %[v],              %[vr]         \n\t"
-      "psubsh     %[r_vec],        %[r_vec],          %[temp]       \n\t"
-      "psrah      %[r_vec],        %[r_vec],          %[six]        \n\t"
+    "paddsh     %[r_vec],        %[y],              %[br]         \n\t"
+    "pmullh     %[temp],         %[v],              %[vr]         \n\t"
+    "psubsh     %[r_vec],        %[r_vec],          %[temp]       \n\t"
+    "psrah      %[r_vec],        %[r_vec],          %[six]        \n\t"
 
-      "packushb   %[r_vec],        %[b_vec],          %[r_vec]      \n\t"
-      "packushb   %[g_vec],        %[g_vec],          %[zero]       \n\t"
-      "punpcklwd  %[g_vec],        %[g_vec],          %[alpha]      \n\t"
-      "punpcklbh  %[b_vec],        %[r_vec],          %[g_vec]      \n\t"
-      "punpckhbh  %[r_vec],        %[r_vec],          %[g_vec]      \n\t"
-      "punpcklhw  %[g_vec],        %[b_vec],          %[r_vec]      \n\t"
-      "punpckhhw  %[b_vec],        %[b_vec],          %[r_vec]      \n\t"
+    "packushb   %[r_vec],        %[b_vec],          %[r_vec]      \n\t"
+    "packushb   %[g_vec],        %[g_vec],          %[zero]       \n\t"
+    "punpcklwd  %[g_vec],        %[g_vec],          %[alpha]      \n\t"
+    "punpcklbh  %[b_vec],        %[r_vec],          %[g_vec]      \n\t"
+    "punpckhbh  %[r_vec],        %[r_vec],          %[g_vec]      \n\t"
+    "punpcklhw  %[g_vec],        %[b_vec],          %[r_vec]      \n\t"
+    "punpckhhw  %[b_vec],        %[b_vec],          %[r_vec]      \n\t"
 
-      "and        %[g_vec],        %[g_vec],          %[mask1]      \n\t"
-      "psrlw      %[g_vec],        %[g_vec],          %[four]       \n\t"
-      "psrlw      %[r_vec],        %[g_vec],          %[four]       \n\t"
-      "or         %[g_vec],        %[g_vec],          %[r_vec]      \n\t"
-      "punpcklbh  %[r_vec],        %[alpha],          %[zero]       \n\t"
-      "and        %[g_vec],        %[g_vec],          %[r_vec]      \n\t"
+    "and        %[g_vec],        %[g_vec],          %[mask1]      \n\t"
+    "psrlw      %[g_vec],        %[g_vec],          %[four]       \n\t"
+    "psrlw      %[r_vec],        %[g_vec],          %[four]       \n\t"
+    "or         %[g_vec],        %[g_vec],          %[r_vec]      \n\t"
+    "punpcklbh  %[r_vec],        %[alpha],          %[zero]       \n\t"
+    "and        %[g_vec],        %[g_vec],          %[r_vec]      \n\t"
 
-      "and        %[b_vec],        %[b_vec],          %[mask1]      \n\t"
-      "psrlw      %[b_vec],        %[b_vec],          %[four]       \n\t"
-      "psrlw      %[r_vec],        %[b_vec],          %[four]       \n\t"
-      "or         %[b_vec],        %[b_vec],          %[r_vec]      \n\t"
-      "punpcklbh  %[r_vec],        %[alpha],          %[zero]       \n\t"
-      "and        %[b_vec],        %[b_vec],          %[r_vec]      \n\t"
-      "packushb   %[g_vec],        %[g_vec],          %[b_vec]      \n\t"
+    "and        %[b_vec],        %[b_vec],          %[mask1]      \n\t"
+    "psrlw      %[b_vec],        %[b_vec],          %[four]       \n\t"
+    "psrlw      %[r_vec],        %[b_vec],          %[four]       \n\t"
+    "or         %[b_vec],        %[b_vec],          %[r_vec]      \n\t"
+    "punpcklbh  %[r_vec],        %[alpha],          %[zero]       \n\t"
+    "and        %[b_vec],        %[b_vec],          %[r_vec]      \n\t"
+    "packushb   %[g_vec],        %[g_vec],          %[b_vec]      \n\t"
 
-      "gssdlc1    %[g_vec],        0x07(%[dst_argb4444])            \n\t"
-      "gssdrc1    %[g_vec],        0x00(%[dst_argb4444])            \n\t"
+    "gssdlc1    %[g_vec],        0x07(%[dst_argb4444])            \n\t"
+    "gssdrc1    %[g_vec],        0x00(%[dst_argb4444])            \n\t"
 
-      "daddiu     %[y_ptr],        %[y_ptr],          0x04          \n\t"
-      "daddiu     %[u_ptr],        %[u_ptr],          0x02          \n\t"
-      "daddiu     %[v_ptr],        %[v_ptr],          0x02          \n\t"
-      "daddiu     %[dst_argb4444], %[dst_argb4444],   0x08          \n\t"
-      "daddi      %[width],        %[width],          -0x04         \n\t"
-      "bnez       %[width],        1b                               \n\t"
+    "daddiu     %[y_ptr],        %[y_ptr],          0x04          \n\t"
+    "daddiu     %[u_ptr],        %[u_ptr],          0x02          \n\t"
+    "daddiu     %[v_ptr],        %[v_ptr],          0x02          \n\t"
+    "daddiu     %[dst_argb4444], %[dst_argb4444],   0x08          \n\t"
+    "daddi      %[width],        %[width],          -0x04         \n\t"
+    "bnez       %[width],        1b                               \n\t"
 
-      : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec),
-        [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp),
-        [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr),
-        [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg)
-      : [y_ptr] "r"(src_y), [u_ptr] "r"(src_u), [v_ptr] "r"(src_v),
-        [dst_argb4444] "r"(dst_argb4444), [yuvcons_ptr] "r"(yuvconstants),
-        [width] "r"(width), [zero] "f"(0x00), [five] "f"(0x55), [six] "f"(0x6),
-        [mask] "f"(0xff00ff00ff00ff00), [four] "f"(0x4),
-        [mask1] "f"(0xf0f0f0f0f0f0f0f0), [alpha] "f"(-1)
-      : "memory");
+    : [y]"=&f"(y),                         [u]"=&f"(u),
+      [v]"=&f"(v),
+      [b_vec]"=&f"(b_vec),                 [g_vec]"=&f"(g_vec),
+      [r_vec]"=&f"(r_vec),                 [temp]"=&f"(temp),
+      [ub]"=&f"(ub),                       [ug]"=&f"(ug),
+      [vg]"=&f"(vg),                       [vr]"=&f"(vr),
+      [bb]"=&f"(bb),                       [bg]"=&f"(bg),
+      [br]"=&f"(br),                       [yg]"=&f"(yg)
+    : [y_ptr]"r"(src_y),                   [u_ptr]"r"(src_u),
+      [v_ptr]"r"(src_v),                   [dst_argb4444]"r"(dst_argb4444),
+      [yuvcons_ptr]"r"(yuvconstants),      [width]"r"(width),
+      [zero]"f"(0x00),                     [five]"f"(0x55),
+      [six]"f"(0x6),                       [mask]"f"(0xff00ff00ff00ff00),
+      [four]"f"(0x4),                      [mask1]"f"(0xf0f0f0f0f0f0f0f0),
+      [alpha]"f"(-1)
+    : "memory"
+  );
 }
 
 void I422ToARGB1555Row_MMI(const uint8_t* src_y,
@@ -6645,118 +6678,125 @@ void I422ToARGB1555Row_MMI(const uint8_t* src_y,
                            int width) {
   uint64_t y, u, v;
   uint64_t b_vec, g_vec, r_vec, temp;
-  uint64_t ub, ug, vg, vr, bb, bg, br, yg;
+  uint64_t ub,ug,vg,vr,bb,bg,br,yg;  // all eight are loaded from yuvconstants by the ldc1 preamble
 
   __asm__ volatile(
-      "ldc1       %[yg],           0xc0(%[yuvcons_ptr])             \n\t"
-      "ldc1       %[bb],           0x60(%[yuvcons_ptr])             \n\t"
-      "ldc1       %[ub],           0x00(%[yuvcons_ptr])             \n\t"
-      "or         %[ub],           %[ub],             %[mask1]      \n\t"
-      "ldc1       %[bg],           0x80(%[yuvcons_ptr])             \n\t"
-      "ldc1       %[ug],           0x20(%[yuvcons_ptr])             \n\t"
-      "punpcklbh  %[ug],           %[ug],             %[zero]       \n\t"
-      "pshufh     %[ug],           %[ug],             %[zero]       \n\t"
-      "ldc1       %[vg],           0x20(%[yuvcons_ptr])             \n\t"
-      "punpcklbh  %[vg],           %[vg],             %[zero]       \n\t"
-      "pshufh     %[vg],           %[vg],             %[five]       \n\t"
-      "ldc1       %[br],           0xa0(%[yuvcons_ptr])             \n\t"
-      "ldc1       %[vr],           0x40(%[yuvcons_ptr])             \n\t"
-      "punpcklbh  %[vr],           %[vr],             %[zero]       \n\t"
-      "pshufh     %[vr],           %[vr],             %[five]       \n\t"
-      "or         %[vr],           %[vr],             %[mask1]      \n\t"
+    "ldc1       %[yg],           0xc0(%[yuvcons_ptr])             \n\t"  // Preamble: fetch conversion constants at fixed byte offsets from yuvconstants.
+    "ldc1       %[bb],           0x60(%[yuvcons_ptr])             \n\t"
+    "ldc1       %[ub],           0x00(%[yuvcons_ptr])             \n\t"
+    "or         %[ub],           %[ub],             %[mask1]      \n\t"
+    "ldc1       %[bg],           0x80(%[yuvcons_ptr])             \n\t"
+    "ldc1       %[ug],           0x20(%[yuvcons_ptr])             \n\t"
+    "punpcklbh  %[ug],           %[ug],             %[zero]       \n\t"
+    "pshufh     %[ug],           %[ug],             %[zero]       \n\t"
+    "ldc1       %[vg],           0x20(%[yuvcons_ptr])             \n\t"  // same 0x20 offset as ug; only the pshufh selector differs (five=0x55 vs zero)
+    "punpcklbh  %[vg],           %[vg],             %[zero]       \n\t"
+    "pshufh     %[vg],           %[vg],             %[five]       \n\t"
+    "ldc1       %[br],           0xa0(%[yuvcons_ptr])             \n\t"
+    "ldc1       %[vr],           0x40(%[yuvcons_ptr])             \n\t"
+    "punpcklbh  %[vr],           %[vr],             %[zero]       \n\t"
+    "pshufh     %[vr],           %[vr],             %[five]       \n\t"
+    "or         %[vr],           %[vr],             %[mask1]      \n\t"
 
-      "1:                                                           \n\t"
-      "gslwlc1    %[y],            0x03(%[y_ptr])                   \n\t"
-      "gslwrc1    %[y],            0x00(%[y_ptr])                   \n\t"
-      "gslwlc1    %[u],            0x03(%[u_ptr])                   \n\t"
-      "gslwrc1    %[u],            0x00(%[u_ptr])                   \n\t"
-      "gslwlc1    %[v],            0x03(%[v_ptr])                   \n\t"
-      "gslwrc1    %[v],            0x00(%[v_ptr])                   \n\t"
+    "1:                                                           \n\t"  // loop head, target of the bnez below
+    "gslwlc1    %[y],            0x03(%[y_ptr])                   \n\t"  // NOTE(review): lwl/lwr-style pairs, presumably unaligned 32-bit loads - confirm against Loongson docs
+    "gslwrc1    %[y],            0x00(%[y_ptr])                   \n\t"
+    "gslwlc1    %[u],            0x03(%[u_ptr])                   \n\t"
+    "gslwrc1    %[u],            0x00(%[u_ptr])                   \n\t"
+    "gslwlc1    %[v],            0x03(%[v_ptr])                   \n\t"
+    "gslwrc1    %[v],            0x00(%[v_ptr])                   \n\t"
 
-      "punpcklbh  %[y],            %[y],              %[y]          \n\t"
-      "pmulhuh    %[y],            %[y],              %[yg]         \n\t"
+    "punpcklbh  %[y],            %[y],              %[y]          \n\t"
+    "pmulhuh    %[y],            %[y],              %[yg]         \n\t"
 
-      // u3|u2|u1|u0 --> u1|u1|u0|u0
-      "punpcklbh  %[u],            %[u],              %[u]          \n\t"
-      "punpcklbh  %[u],            %[u],              %[zero]       \n\t"
-      "paddsh     %[b_vec],        %[y],              %[bb]         \n\t"
-      "pmullh     %[temp],         %[u],              %[ub]         \n\t"
-      "psubsh     %[b_vec],        %[b_vec],          %[temp]       \n\t"
-      "psrah      %[b_vec],        %[b_vec],          %[six]        \n\t"
+    // u3|u2|u1|u0 --> u1|u1|u0|u0 (u is interleaved with itself, then with zero)
+    "punpcklbh  %[u],            %[u],              %[u]          \n\t"
+    "punpcklbh  %[u],            %[u],              %[zero]       \n\t"
+    "paddsh     %[b_vec],        %[y],              %[bb]         \n\t"
+    "pmullh     %[temp],         %[u],              %[ub]         \n\t"
+    "psubsh     %[b_vec],        %[b_vec],          %[temp]       \n\t"
+    "psrah      %[b_vec],        %[b_vec],          %[six]        \n\t"
 
-      // v3|v2|v1|v0 --> v1|v1|v0|v0
-      "punpcklbh  %[v],            %[v],              %[v]          \n\t"
-      "punpcklbh  %[v],            %[v],              %[zero]       \n\t"
-      "paddsh     %[g_vec],        %[y],              %[bg]         \n\t"
-      "pmullh     %[temp],         %[u],              %[ug]         \n\t"
-      "psubsh     %[g_vec],        %[g_vec],          %[temp]       \n\t"
-      "pmullh     %[temp],         %[v],              %[vg]         \n\t"
-      "psubsh     %[g_vec],        %[g_vec],          %[temp]       \n\t"
-      "psrah      %[g_vec],        %[g_vec],          %[six]        \n\t"
+    // v3|v2|v1|v0 --> v1|v1|v0|v0 (v is interleaved with itself, then with zero)
+    "punpcklbh  %[v],            %[v],              %[v]          \n\t"
+    "punpcklbh  %[v],            %[v],              %[zero]       \n\t"
+    "paddsh     %[g_vec],        %[y],              %[bg]         \n\t"
+    "pmullh     %[temp],         %[u],              %[ug]         \n\t"
+    "psubsh     %[g_vec],        %[g_vec],          %[temp]       \n\t"
+    "pmullh     %[temp],         %[v],              %[vg]         \n\t"
+    "psubsh     %[g_vec],        %[g_vec],          %[temp]       \n\t"
+    "psrah      %[g_vec],        %[g_vec],          %[six]        \n\t"
 
-      "paddsh     %[r_vec],        %[y],              %[br]         \n\t"
-      "pmullh     %[temp],         %[v],              %[vr]         \n\t"
-      "psubsh     %[r_vec],        %[r_vec],          %[temp]       \n\t"
-      "psrah      %[r_vec],        %[r_vec],          %[six]        \n\t"
+    "paddsh     %[r_vec],        %[y],              %[br]         \n\t"
+    "pmullh     %[temp],         %[v],              %[vr]         \n\t"
+    "psubsh     %[r_vec],        %[r_vec],          %[temp]       \n\t"
+    "psrah      %[r_vec],        %[r_vec],          %[six]        \n\t"
 
-      "packushb   %[r_vec],        %[b_vec],          %[r_vec]      \n\t"
-      "packushb   %[g_vec],        %[g_vec],          %[zero]       \n\t"
-      "punpcklbh  %[b_vec],        %[r_vec],          %[g_vec]      \n\t"
-      "punpckhbh  %[r_vec],        %[r_vec],          %[g_vec]      \n\t"
-      "punpcklhw  %[g_vec],        %[b_vec],          %[r_vec]      \n\t"
-      "punpckhhw  %[b_vec],        %[b_vec],          %[r_vec]      \n\t"
+    "packushb   %[r_vec],        %[b_vec],          %[r_vec]      \n\t"
+    "packushb   %[g_vec],        %[g_vec],          %[zero]       \n\t"
+    "punpcklbh  %[b_vec],        %[r_vec],          %[g_vec]      \n\t"
+    "punpckhbh  %[r_vec],        %[r_vec],          %[g_vec]      \n\t"
+    "punpcklhw  %[g_vec],        %[b_vec],          %[r_vec]      \n\t"
+    "punpckhhw  %[b_vec],        %[b_vec],          %[r_vec]      \n\t"
 
-      "psrlw      %[temp],         %[g_vec],          %[three]      \n\t"
-      "and        %[g_vec],        %[temp],           %[mask2]      \n\t"
-      "psrlw      %[temp],         %[temp],           %[eight]      \n\t"
-      "and        %[r_vec],        %[temp],           %[mask2]      \n\t"
-      "psllw      %[r_vec],        %[r_vec],          %[lmove5]     \n\t"
-      "or         %[g_vec],        %[g_vec],          %[r_vec]      \n\t"
-      "psrlw      %[temp],         %[temp],           %[eight]      \n\t"
-      "and        %[r_vec],        %[temp],           %[mask2]      \n\t"
-      "psllw      %[r_vec],        %[r_vec],          %[lmove5]     \n\t"
-      "psllw      %[r_vec],        %[r_vec],          %[lmove5]     \n\t"
-      "or         %[g_vec],        %[g_vec],          %[r_vec]      \n\t"
-      "or         %[g_vec],        %[g_vec],          %[mask3]      \n\t"
+    "psrlw      %[temp],         %[g_vec],          %[three]      \n\t"  // NOTE(review): looks like the 5-bit field packing - mask2 keeps 0x1f per 32-bit word, lmove5 shifts by 5 - confirm
+    "and        %[g_vec],        %[temp],           %[mask2]      \n\t"
+    "psrlw      %[temp],         %[temp],           %[eight]      \n\t"
+    "and        %[r_vec],        %[temp],           %[mask2]      \n\t"
+    "psllw      %[r_vec],        %[r_vec],          %[lmove5]     \n\t"
+    "or         %[g_vec],        %[g_vec],          %[r_vec]      \n\t"
+    "psrlw      %[temp],         %[temp],           %[eight]      \n\t"
+    "and        %[r_vec],        %[temp],           %[mask2]      \n\t"
+    "psllw      %[r_vec],        %[r_vec],          %[lmove5]     \n\t"
+    "psllw      %[r_vec],        %[r_vec],          %[lmove5]     \n\t"
+    "or         %[g_vec],        %[g_vec],          %[r_vec]      \n\t"
+    "or         %[g_vec],        %[g_vec],          %[mask3]      \n\t"  // ORs in mask3 (0x8000 per 32-bit word); presumably the 1-bit alpha of ARGB1555 - confirm
 
-      "psrlw      %[temp],         %[b_vec],          %[three]      \n\t"
-      "and        %[b_vec],        %[temp],           %[mask2]      \n\t"
-      "psrlw      %[temp],         %[temp],           %[eight]      \n\t"
-      "and        %[r_vec],        %[temp],           %[mask2]      \n\t"
-      "psllw      %[r_vec],        %[r_vec],          %[lmove5]     \n\t"
-      "or         %[b_vec],        %[b_vec],          %[r_vec]      \n\t"
-      "psrlw      %[temp],         %[temp],           %[eight]      \n\t"
-      "and        %[r_vec],        %[temp],           %[mask2]      \n\t"
-      "psllw      %[r_vec],        %[r_vec],          %[lmove5]     \n\t"
-      "psllw      %[r_vec],        %[r_vec],          %[lmove5]     \n\t"
-      "or         %[b_vec],        %[b_vec],          %[r_vec]      \n\t"
-      "or         %[b_vec],        %[b_vec],          %[mask3]      \n\t"
+    "psrlw      %[temp],         %[b_vec],          %[three]      \n\t"  // same instruction sequence as the g_vec group above, applied to b_vec
+    "and        %[b_vec],        %[temp],           %[mask2]      \n\t"
+    "psrlw      %[temp],         %[temp],           %[eight]      \n\t"
+    "and        %[r_vec],        %[temp],           %[mask2]      \n\t"
+    "psllw      %[r_vec],        %[r_vec],          %[lmove5]     \n\t"
+    "or         %[b_vec],        %[b_vec],          %[r_vec]      \n\t"
+    "psrlw      %[temp],         %[temp],           %[eight]      \n\t"
+    "and        %[r_vec],        %[temp],           %[mask2]      \n\t"
+    "psllw      %[r_vec],        %[r_vec],          %[lmove5]     \n\t"
+    "psllw      %[r_vec],        %[r_vec],          %[lmove5]     \n\t"
+    "or         %[b_vec],        %[b_vec],          %[r_vec]      \n\t"
+    "or         %[b_vec],        %[b_vec],          %[mask3]      \n\t"
 
-      "punpcklhw  %[r_vec],        %[g_vec],          %[b_vec]      \n\t"
-      "punpckhhw  %[b_vec],        %[g_vec],          %[b_vec]      \n\t"
-      "punpcklhw  %[g_vec],        %[r_vec],          %[b_vec]      \n\t"
+    "punpcklhw  %[r_vec],        %[g_vec],          %[b_vec]      \n\t"
+    "punpckhhw  %[b_vec],        %[g_vec],          %[b_vec]      \n\t"
+    "punpcklhw  %[g_vec],        %[r_vec],          %[b_vec]      \n\t"
 
-      "gssdlc1    %[g_vec],        0x07(%[dst_argb1555])            \n\t"
-      "gssdrc1    %[g_vec],        0x00(%[dst_argb1555])            \n\t"
+    "gssdlc1    %[g_vec],        0x07(%[dst_argb1555])            \n\t"  // NOTE(review): sdl/sdr-style pair, presumably an unaligned 64-bit store - confirm
+    "gssdrc1    %[g_vec],        0x00(%[dst_argb1555])            \n\t"
 
-      "daddiu     %[y_ptr],        %[y_ptr],          0x04          \n\t"
-      "daddiu     %[u_ptr],        %[u_ptr],          0x02          \n\t"
-      "daddiu     %[v_ptr],        %[v_ptr],          0x02          \n\t"
-      "daddiu     %[dst_argb1555], %[dst_argb1555],   0x08          \n\t"
-      "daddi      %[width],        %[width],          -0x04         \n\t"
-      "bnez       %[width],        1b                               \n\t"
+    "daddiu     %[y_ptr],        %[y_ptr],          0x04          \n\t"  // each pass advances y by 4 bytes, u and v by 2 bytes each, dst by 8 bytes
+    "daddiu     %[u_ptr],        %[u_ptr],          0x02          \n\t"
+    "daddiu     %[v_ptr],        %[v_ptr],          0x02          \n\t"
+    "daddiu     %[dst_argb1555], %[dst_argb1555],   0x08          \n\t"
+    "daddi      %[width],        %[width],          -0x04         \n\t"  // width -= 4
+    "bnez       %[width],        1b                               \n\t"  // exits only when width is exactly 0, so width must be a positive multiple of 4 - NOTE(review): confirm callers ensure this
 
-      : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec),
-        [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp),
-        [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr),
-        [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg)
-      : [y_ptr] "r"(src_y), [u_ptr] "r"(src_u), [v_ptr] "r"(src_v),
-        [dst_argb1555] "r"(dst_argb1555), [yuvcons_ptr] "r"(yuvconstants),
-        [width] "r"(width), [zero] "f"(0x00), [five] "f"(0x55), [six] "f"(0x6),
-        [mask1] "f"(0xff00ff00ff00ff00), [three] "f"(0x3),
-        [mask2] "f"(0x1f0000001f), [eight] "f"(0x8),
-        [mask3] "f"(0x800000008000), [lmove5] "f"(0x5)
-      : "memory");
+    : [y]"=&f"(y),                         [u]"=&f"(u),
+      [v]"=&f"(v),
+      [b_vec]"=&f"(b_vec),                 [g_vec]"=&f"(g_vec),
+      [r_vec]"=&f"(r_vec),                 [temp]"=&f"(temp),
+      [ub]"=&f"(ub),                       [ug]"=&f"(ug),
+      [vg]"=&f"(vg),                       [vr]"=&f"(vr),
+      [bb]"=&f"(bb),                       [bg]"=&f"(bg),
+      [br]"=&f"(br),                       [yg]"=&f"(yg)
+    : [y_ptr]"r"(src_y),                   [u_ptr]"r"(src_u),  // NOTE(review): y_ptr, u_ptr, v_ptr, dst_argb1555 and width are listed as inputs ("r"),
+      [v_ptr]"r"(src_v),                   [dst_argb1555]"r"(dst_argb1555),  // but the daddiu/daddi lines above write to them; GCC's Extended Asm docs say
+      [yuvcons_ptr]"r"(yuvconstants),      [width]"r"(width),  // input-only operands must not be modified. This statement already has 15 outputs + 15 inputs.
+      [zero]"f"(0x00),                     [five]"f"(0x55),
+      [six]"f"(0x6),                       [mask1]"f"(0xff00ff00ff00ff00),
+      [three]"f"(0x3),                     [mask2]"f"(0x1f0000001f),
+      [eight]"f"(0x8),                     [mask3]"f"(0x800000008000),
+      [lmove5]"f"(0x5)
+    : "memory"
+  );
 }
 
 void I422ToRGB565Row_MMI(const uint8_t* src_y,
@@ -6767,120 +6807,127 @@ void I422ToRGB565Row_MMI(const uint8_t* src_y,
                          int width) {
   uint64_t y, u, v;
   uint64_t b_vec, g_vec, r_vec, temp;
-  uint64_t ub, ug, vg, vr, bb, bg, br, yg;
+  uint64_t ub,ug,vg,vr,bb,bg,br,yg;  // all eight are loaded from yuvconstants by the ldc1 preamble
 
   __asm__ volatile(
-      "ldc1       %[yg],           0xc0(%[yuvcons_ptr])             \n\t"
-      "ldc1       %[bb],           0x60(%[yuvcons_ptr])             \n\t"
-      "ldc1       %[ub],           0x00(%[yuvcons_ptr])             \n\t"
-      "or         %[ub],           %[ub],             %[mask1]      \n\t"
-      "ldc1       %[bg],           0x80(%[yuvcons_ptr])             \n\t"
-      "ldc1       %[ug],           0x20(%[yuvcons_ptr])             \n\t"
-      "punpcklbh  %[ug],           %[ug],             %[zero]       \n\t"
-      "pshufh     %[ug],           %[ug],             %[zero]       \n\t"
-      "ldc1       %[vg],           0x20(%[yuvcons_ptr])             \n\t"
-      "punpcklbh  %[vg],           %[vg],             %[zero]       \n\t"
-      "pshufh     %[vg],           %[vg],             %[five]       \n\t"
-      "ldc1       %[br],           0xa0(%[yuvcons_ptr])             \n\t"
-      "ldc1       %[vr],           0x40(%[yuvcons_ptr])             \n\t"
-      "punpcklbh  %[vr],           %[vr],             %[zero]       \n\t"
-      "pshufh     %[vr],           %[vr],             %[five]       \n\t"
-      "or         %[vr],           %[vr],             %[mask1]      \n\t"
+    "ldc1       %[yg],           0xc0(%[yuvcons_ptr])             \n\t"  // Preamble: fetch conversion constants at fixed byte offsets from yuvconstants.
+    "ldc1       %[bb],           0x60(%[yuvcons_ptr])             \n\t"
+    "ldc1       %[ub],           0x00(%[yuvcons_ptr])             \n\t"
+    "or         %[ub],           %[ub],             %[mask1]      \n\t"
+    "ldc1       %[bg],           0x80(%[yuvcons_ptr])             \n\t"
+    "ldc1       %[ug],           0x20(%[yuvcons_ptr])             \n\t"
+    "punpcklbh  %[ug],           %[ug],             %[zero]       \n\t"
+    "pshufh     %[ug],           %[ug],             %[zero]       \n\t"
+    "ldc1       %[vg],           0x20(%[yuvcons_ptr])             \n\t"  // same 0x20 offset as ug; only the pshufh selector differs (five=0x55 vs zero)
+    "punpcklbh  %[vg],           %[vg],             %[zero]       \n\t"
+    "pshufh     %[vg],           %[vg],             %[five]       \n\t"
+    "ldc1       %[br],           0xa0(%[yuvcons_ptr])             \n\t"
+    "ldc1       %[vr],           0x40(%[yuvcons_ptr])             \n\t"
+    "punpcklbh  %[vr],           %[vr],             %[zero]       \n\t"
+    "pshufh     %[vr],           %[vr],             %[five]       \n\t"
+    "or         %[vr],           %[vr],             %[mask1]      \n\t"
 
-      "1:                                                           \n\t"
-      "gslwlc1    %[y],            0x03(%[y_ptr])                   \n\t"
-      "gslwrc1    %[y],            0x00(%[y_ptr])                   \n\t"
-      "gslwlc1    %[u],            0x03(%[u_ptr])                   \n\t"
-      "gslwrc1    %[u],            0x00(%[u_ptr])                   \n\t"
-      "gslwlc1    %[v],            0x03(%[v_ptr])                   \n\t"
-      "gslwrc1    %[v],            0x00(%[v_ptr])                   \n\t"
+    "1:                                                           \n\t"  // loop head, target of the bnez below
+    "gslwlc1    %[y],            0x03(%[y_ptr])                   \n\t"  // NOTE(review): lwl/lwr-style pairs, presumably unaligned 32-bit loads - confirm against Loongson docs
+    "gslwrc1    %[y],            0x00(%[y_ptr])                   \n\t"
+    "gslwlc1    %[u],            0x03(%[u_ptr])                   \n\t"
+    "gslwrc1    %[u],            0x00(%[u_ptr])                   \n\t"
+    "gslwlc1    %[v],            0x03(%[v_ptr])                   \n\t"
+    "gslwrc1    %[v],            0x00(%[v_ptr])                   \n\t"
 
-      "punpcklbh  %[y],            %[y],              %[y]          \n\t"
-      "pmulhuh    %[y],            %[y],              %[yg]         \n\t"
+    "punpcklbh  %[y],            %[y],              %[y]          \n\t"
+    "pmulhuh    %[y],            %[y],              %[yg]         \n\t"
 
-      // u3|u2|u1|u0 --> u1|u1|u0|u0
-      "punpcklbh  %[u],            %[u],              %[u]          \n\t"
-      "punpcklbh  %[u],            %[u],              %[zero]       \n\t"
-      "paddsh     %[b_vec],        %[y],              %[bb]         \n\t"
-      "pmullh     %[temp],         %[u],              %[ub]         \n\t"
-      "psubsh     %[b_vec],        %[b_vec],          %[temp]       \n\t"
-      "psrah      %[b_vec],        %[b_vec],          %[six]        \n\t"
+    // u3|u2|u1|u0 --> u1|u1|u0|u0 (u is interleaved with itself, then with zero)
+    "punpcklbh  %[u],            %[u],              %[u]          \n\t"
+    "punpcklbh  %[u],            %[u],              %[zero]       \n\t"
+    "paddsh     %[b_vec],        %[y],              %[bb]         \n\t"
+    "pmullh     %[temp],         %[u],              %[ub]         \n\t"
+    "psubsh     %[b_vec],        %[b_vec],          %[temp]       \n\t"
+    "psrah      %[b_vec],        %[b_vec],          %[six]        \n\t"
 
-      // v3|v2|v1|v0 --> v1|v1|v0|v0
-      "punpcklbh  %[v],            %[v],              %[v]          \n\t"
-      "punpcklbh  %[v],            %[v],              %[zero]       \n\t"
-      "paddsh     %[g_vec],        %[y],              %[bg]         \n\t"
-      "pmullh     %[temp],         %[u],              %[ug]         \n\t"
-      "psubsh     %[g_vec],        %[g_vec],          %[temp]       \n\t"
-      "pmullh     %[temp],         %[v],              %[vg]         \n\t"
-      "psubsh     %[g_vec],        %[g_vec],          %[temp]       \n\t"
-      "psrah      %[g_vec],        %[g_vec],          %[six]        \n\t"
+    // v3|v2|v1|v0 --> v1|v1|v0|v0 (v is interleaved with itself, then with zero)
+    "punpcklbh  %[v],            %[v],              %[v]          \n\t"
+    "punpcklbh  %[v],            %[v],              %[zero]       \n\t"
+    "paddsh     %[g_vec],        %[y],              %[bg]         \n\t"
+    "pmullh     %[temp],         %[u],              %[ug]         \n\t"
+    "psubsh     %[g_vec],        %[g_vec],          %[temp]       \n\t"
+    "pmullh     %[temp],         %[v],              %[vg]         \n\t"
+    "psubsh     %[g_vec],        %[g_vec],          %[temp]       \n\t"
+    "psrah      %[g_vec],        %[g_vec],          %[six]        \n\t"
 
-      "paddsh     %[r_vec],        %[y],              %[br]         \n\t"
-      "pmullh     %[temp],         %[v],              %[vr]         \n\t"
-      "psubsh     %[r_vec],        %[r_vec],          %[temp]       \n\t"
-      "psrah      %[r_vec],        %[r_vec],          %[six]        \n\t"
+    "paddsh     %[r_vec],        %[y],              %[br]         \n\t"
+    "pmullh     %[temp],         %[v],              %[vr]         \n\t"
+    "psubsh     %[r_vec],        %[r_vec],          %[temp]       \n\t"
+    "psrah      %[r_vec],        %[r_vec],          %[six]        \n\t"
 
-      "packushb   %[r_vec],        %[b_vec],          %[r_vec]      \n\t"
-      "packushb   %[g_vec],        %[g_vec],          %[zero]       \n\t"
-      "punpcklbh  %[b_vec],        %[r_vec],          %[g_vec]      \n\t"
-      "punpckhbh  %[r_vec],        %[r_vec],          %[g_vec]      \n\t"
-      "punpcklhw  %[g_vec],        %[b_vec],          %[r_vec]      \n\t"
-      "punpckhhw  %[b_vec],        %[b_vec],          %[r_vec]      \n\t"
+    "packushb   %[r_vec],        %[b_vec],          %[r_vec]      \n\t"
+    "packushb   %[g_vec],        %[g_vec],          %[zero]       \n\t"
+    "punpcklbh  %[b_vec],        %[r_vec],          %[g_vec]      \n\t"
+    "punpckhbh  %[r_vec],        %[r_vec],          %[g_vec]      \n\t"
+    "punpcklhw  %[g_vec],        %[b_vec],          %[r_vec]      \n\t"
+    "punpckhhw  %[b_vec],        %[b_vec],          %[r_vec]      \n\t"
 
-      "psrlh      %[temp],         %[g_vec],          %[three]      \n\t"
-      "and        %[g_vec],        %[temp],           %[mask2]      \n\t"
-      "psrlw      %[temp],         %[temp],           %[seven]      \n\t"
-      "psrlw      %[r_vec],        %[mask1],          %[eight]      \n\t"
-      "and        %[r_vec],        %[temp],           %[r_vec]      \n\t"
-      "psllw      %[r_vec],        %[r_vec],          %[lmove5]     \n\t"
-      "or         %[g_vec],        %[g_vec],          %[r_vec]      \n\t"
-      "paddb      %[r_vec],        %[three],          %[six]        \n\t"
-      "psrlw      %[temp],         %[temp],           %[r_vec]      \n\t"
-      "and        %[r_vec],        %[temp],           %[mask2]      \n\t"
-      "paddb      %[temp],         %[three],          %[eight]      \n\t"
-      "psllw      %[r_vec],        %[r_vec],          %[temp]       \n\t"
-      "or         %[g_vec],        %[g_vec],          %[r_vec]      \n\t"
+    "psrlh      %[temp],         %[g_vec],          %[three]      \n\t"  // NOTE(review): looks like the RGB565 bit-field packing of the pixels held in g_vec - confirm
+    "and        %[g_vec],        %[temp],           %[mask2]      \n\t"
+    "psrlw      %[temp],         %[temp],           %[seven]      \n\t"
+    "psrlw      %[r_vec],        %[mask1],          %[eight]      \n\t"  // builds a second mask by shifting mask1 right by [eight]
+    "and        %[r_vec],        %[temp],           %[r_vec]      \n\t"
+    "psllw      %[r_vec],        %[r_vec],          %[lmove5]     \n\t"
+    "or         %[g_vec],        %[g_vec],          %[r_vec]      \n\t"
+    "paddb      %[r_vec],        %[three],          %[six]        \n\t"  // r_vec = three + six (0x3 + 0x6), used as the shift count on the next line
+    "psrlw      %[temp],         %[temp],           %[r_vec]      \n\t"
+    "and        %[r_vec],        %[temp],           %[mask2]      \n\t"
+    "paddb      %[temp],         %[three],          %[eight]      \n\t"  // temp = three + eight (0x3 + 0x8), used as the left-shift count on the next line
+    "psllw      %[r_vec],        %[r_vec],          %[temp]       \n\t"
+    "or         %[g_vec],        %[g_vec],          %[r_vec]      \n\t"
 
-      "psrlh      %[temp],         %[b_vec],          %[three]      \n\t"
-      "and        %[b_vec],        %[temp],           %[mask2]      \n\t"
-      "psrlw      %[temp],         %[temp],           %[seven]      \n\t"
-      "psrlw      %[r_vec],        %[mask1],          %[eight]      \n\t"
-      "and        %[r_vec],        %[temp],           %[r_vec]      \n\t"
-      "psllw      %[r_vec],        %[r_vec],          %[lmove5]     \n\t"
-      "or         %[b_vec],        %[b_vec],          %[r_vec]      \n\t"
-      "paddb      %[r_vec],        %[three],          %[six]        \n\t"
-      "psrlw      %[temp],         %[temp],           %[r_vec]      \n\t"
-      "and        %[r_vec],        %[temp],           %[mask2]      \n\t"
-      "paddb      %[temp],         %[three],          %[eight]      \n\t"
-      "psllw      %[r_vec],        %[r_vec],          %[temp]       \n\t"
-      "or         %[b_vec],        %[b_vec],          %[r_vec]      \n\t"
+    "psrlh      %[temp],         %[b_vec],          %[three]      \n\t"  // same instruction sequence as the g_vec group above, applied to b_vec
+    "and        %[b_vec],        %[temp],           %[mask2]      \n\t"
+    "psrlw      %[temp],         %[temp],           %[seven]      \n\t"
+    "psrlw      %[r_vec],        %[mask1],          %[eight]      \n\t"
+    "and        %[r_vec],        %[temp],           %[r_vec]      \n\t"
+    "psllw      %[r_vec],        %[r_vec],          %[lmove5]     \n\t"
+    "or         %[b_vec],        %[b_vec],          %[r_vec]      \n\t"
+    "paddb      %[r_vec],        %[three],          %[six]        \n\t"
+    "psrlw      %[temp],         %[temp],           %[r_vec]      \n\t"
+    "and        %[r_vec],        %[temp],           %[mask2]      \n\t"
+    "paddb      %[temp],         %[three],          %[eight]      \n\t"
+    "psllw      %[r_vec],        %[r_vec],          %[temp]       \n\t"
+    "or         %[b_vec],        %[b_vec],          %[r_vec]      \n\t"
 
-      "punpcklhw  %[r_vec],        %[g_vec],          %[b_vec]      \n\t"
-      "punpckhhw  %[b_vec],        %[g_vec],          %[b_vec]      \n\t"
-      "punpcklhw  %[g_vec],        %[r_vec],          %[b_vec]      \n\t"
+    "punpcklhw  %[r_vec],        %[g_vec],          %[b_vec]      \n\t"
+    "punpckhhw  %[b_vec],        %[g_vec],          %[b_vec]      \n\t"
+    "punpcklhw  %[g_vec],        %[r_vec],          %[b_vec]      \n\t"
 
-      "gssdlc1    %[g_vec],        0x07(%[dst_rgb565])             \n\t"
-      "gssdrc1    %[g_vec],        0x00(%[dst_rgb565])             \n\t"
+    "gssdlc1    %[g_vec],        0x07(%[dst_rgb565])             \n\t"  // NOTE(review): sdl/sdr-style pair, presumably an unaligned 64-bit store - confirm
+    "gssdrc1    %[g_vec],        0x00(%[dst_rgb565])             \n\t"
 
-      "daddiu     %[y_ptr],        %[y_ptr],          0x04          \n\t"
-      "daddiu     %[u_ptr],        %[u_ptr],          0x02          \n\t"
-      "daddiu     %[v_ptr],        %[v_ptr],          0x02          \n\t"
-      "daddiu     %[dst_rgb565],   %[dst_rgb565],     0x08          \n\t"
-      "daddi      %[width],        %[width],          -0x04         \n\t"
-      "bnez       %[width],        1b                               \n\t"
+    "daddiu     %[y_ptr],        %[y_ptr],          0x04          \n\t"  // each pass advances y by 4 bytes, u and v by 2 bytes each, dst by 8 bytes
+    "daddiu     %[u_ptr],        %[u_ptr],          0x02          \n\t"
+    "daddiu     %[v_ptr],        %[v_ptr],          0x02          \n\t"
+    "daddiu     %[dst_rgb565],   %[dst_rgb565],     0x08          \n\t"
+    "daddi      %[width],        %[width],          -0x04         \n\t"  // width -= 4
+    "bnez       %[width],        1b                               \n\t"  // exits only when width is exactly 0, so width must be a positive multiple of 4 - NOTE(review): confirm callers ensure this
 
-      : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec),
-        [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp),
-        [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr),
-        [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg)
-      : [y_ptr] "r"(src_y), [u_ptr] "r"(src_u), [v_ptr] "r"(src_v),
-        [dst_rgb565] "r"(dst_rgb565), [yuvcons_ptr] "r"(yuvconstants),
-        [width] "r"(width), [zero] "f"(0x00), [five] "f"(0x55), [six] "f"(0x6),
-        [mask1] "f"(0xff00ff00ff00ff00), [three] "f"(0x3),
-        [mask2] "f"(0x1f0000001f), [eight] "f"(0x8), [seven] "f"(0x7),
-        [lmove5] "f"(0x5)
-      : "memory");
+    : [y]"=&f"(y),                         [u]"=&f"(u),
+      [v]"=&f"(v),
+      [b_vec]"=&f"(b_vec),                 [g_vec]"=&f"(g_vec),
+      [r_vec]"=&f"(r_vec),                 [temp]"=&f"(temp),
+      [ub]"=&f"(ub),                       [ug]"=&f"(ug),
+      [vg]"=&f"(vg),                       [vr]"=&f"(vr),
+      [bb]"=&f"(bb),                       [bg]"=&f"(bg),
+      [br]"=&f"(br),                       [yg]"=&f"(yg)
+    : [y_ptr]"r"(src_y),                   [u_ptr]"r"(src_u),  // NOTE(review): y_ptr, u_ptr, v_ptr, dst_rgb565 and width are listed as inputs ("r"),
+      [v_ptr]"r"(src_v),                   [dst_rgb565]"r"(dst_rgb565),  // but the daddiu/daddi lines above write to them; GCC's Extended Asm docs say
+      [yuvcons_ptr]"r"(yuvconstants),      [width]"r"(width),  // input-only operands must not be modified. This statement already has 15 outputs + 15 inputs.
+      [zero]"f"(0x00),                     [five]"f"(0x55),
+      [six]"f"(0x6),                       [mask1]"f"(0xff00ff00ff00ff00),
+      [three]"f"(0x3),                     [mask2]"f"(0x1f0000001f),
+      [eight]"f"(0x8),                     [seven]"f"(0x7),
+      [lmove5]"f"(0x5)
+    : "memory"
+  );
 }
 
 void NV12ToARGBRow_MMI(const uint8_t* src_y,
@@ -6890,83 +6937,91 @@ void NV12ToARGBRow_MMI(const uint8_t* src_y,
                        int width) {
   uint64_t y, u, v;
   uint64_t b_vec, g_vec, r_vec, temp;
-  uint64_t ub, ug, vg, vr, bb, bg, br, yg;
+  uint64_t ub,ug,vg,vr,bb,bg,br,yg;  // coefficient/bias registers, filled by the ldc1 loads below
 
   __asm__ volatile(
-      "ldc1       %[yg],           0xc0(%[yuvcons_ptr])             \n\t"
-      "ldc1       %[bb],           0x60(%[yuvcons_ptr])             \n\t"
-      "ldc1       %[ub],           0x00(%[yuvcons_ptr])             \n\t"
-      "or         %[ub],           %[ub],             %[mask1]      \n\t"
-      "ldc1       %[bg],           0x80(%[yuvcons_ptr])             \n\t"
-      "ldc1       %[ug],           0x20(%[yuvcons_ptr])             \n\t"
-      "punpcklbh  %[ug],           %[ug],             %[zero]       \n\t"
-      "pshufh     %[ug],           %[ug],             %[zero]       \n\t"
-      "ldc1       %[vg],           0x20(%[yuvcons_ptr])             \n\t"
-      "punpcklbh  %[vg],           %[vg],             %[zero]       \n\t"
-      "pshufh     %[vg],           %[vg],             %[five]       \n\t"
-      "ldc1       %[br],           0xa0(%[yuvcons_ptr])             \n\t"
-      "ldc1       %[vr],           0x40(%[yuvcons_ptr])             \n\t"
-      "punpcklbh  %[vr],           %[vr],             %[zero]       \n\t"
-      "pshufh     %[vr],           %[vr],             %[five]       \n\t"
-      "or         %[vr],           %[vr],             %[mask1]      \n\t"
+    "ldc1       %[yg],           0xc0(%[yuvcons_ptr])             \n\t"  // yg <- 8 bytes at yuvconstants+0xc0
+    "ldc1       %[bb],           0x60(%[yuvcons_ptr])             \n\t"  // bb <- 8 bytes at +0x60
+    "ldc1       %[ub],           0x00(%[yuvcons_ptr])             \n\t"  // ub <- 8 bytes at +0x00
+    "or         %[ub],           %[ub],             %[mask1]      \n\t"  // force the high byte of each halfword of ub to 0xff
+    "ldc1       %[bg],           0x80(%[yuvcons_ptr])             \n\t"  // bg <- 8 bytes at +0x80
+    "ldc1       %[ug],           0x20(%[yuvcons_ptr])             \n\t"  // ug <- 8 bytes at +0x20
+    "punpcklbh  %[ug],           %[ug],             %[zero]       \n\t"  // widen the low 4 bytes to halfwords
+    "pshufh     %[ug],           %[ug],             %[zero]       \n\t"  // broadcast halfword 0 (selector 0x00)
+    "ldc1       %[vg],           0x20(%[yuvcons_ptr])             \n\t"  // vg <- same +0x20 field as ug
+    "punpcklbh  %[vg],           %[vg],             %[zero]       \n\t"  // widen the low 4 bytes to halfwords
+    "pshufh     %[vg],           %[vg],             %[five]       \n\t"  // broadcast halfword 1 (selector 0x55)
+    "ldc1       %[br],           0xa0(%[yuvcons_ptr])             \n\t"  // br <- 8 bytes at +0xa0
+    "ldc1       %[vr],           0x40(%[yuvcons_ptr])             \n\t"  // vr <- 8 bytes at +0x40
+    "punpcklbh  %[vr],           %[vr],             %[zero]       \n\t"  // widen the low 4 bytes to halfwords
+    "pshufh     %[vr],           %[vr],             %[five]       \n\t"  // broadcast halfword 1 (selector 0x55)
+    "or         %[vr],           %[vr],             %[mask1]      \n\t"  // force the high byte of each halfword of vr to 0xff
 
-      "1:                                                           \n\t"
-      "gslwlc1    %[y],            0x03(%[y_ptr])                   \n\t"
-      "gslwrc1    %[y],            0x00(%[y_ptr])                   \n\t"
-      "gslwlc1    %[u],            0x03(%[uv_ptr])                  \n\t"
-      "gslwrc1    %[u],            0x00(%[uv_ptr])                  \n\t"
-      "punpcklbh  %[u],            %[u],              %[zero]       \n\t"
-      "pshufh     %[v],            %[u],              %[vshu]       \n\t"
-      "pshufh     %[u],            %[u],              %[ushu]       \n\t"
+    "1:                                                           \n\t"  // loop body: 4 pixels per pass
+    "gslwlc1    %[y],            0x03(%[y_ptr])                   \n\t"  // unaligned 32-bit load of 4 bytes from src_y
+    "gslwrc1    %[y],            0x00(%[y_ptr])                   \n\t"
+    "gslwlc1    %[u],            0x03(%[uv_ptr])                  \n\t"  // unaligned 32-bit load of 4 bytes from src_uv
+    "gslwrc1    %[u],            0x00(%[uv_ptr])                  \n\t"
+    "punpcklbh  %[u],            %[u],              %[zero]       \n\t"  // widen the 4 bytes to halfwords
+    "pshufh     %[v],            %[u],              %[vshu]       \n\t"  // v = halfwords 1,1,3,3 (selector 0xf5)
+    "pshufh     %[u],            %[u],              %[ushu]       \n\t"  // u = halfwords 0,0,2,2 (selector 0xA0)
 
-      "punpcklbh  %[y],            %[y],              %[y]          \n\t"
-      "pmulhuh    %[y],            %[y],              %[yg]         \n\t"
+    "punpcklbh  %[y],            %[y],              %[y]          \n\t"  // each Y byte duplicated into both bytes of a halfword
+    "pmulhuh    %[y],            %[y],              %[yg]         \n\t"  // y = high 16 bits of the unsigned product y * yg
 
-      "paddsh     %[b_vec],        %[y],              %[bb]         \n\t"
-      "pmullh     %[temp],         %[u],              %[ub]         \n\t"
-      "psubsh     %[b_vec],        %[b_vec],          %[temp]       \n\t"
-      "psrah      %[b_vec],        %[b_vec],          %[six]        \n\t"
+    "paddsh     %[b_vec],        %[y],              %[bb]         \n\t"  // b = (y + bb - u * ub) >> 6, saturating signed halfwords
+    "pmullh     %[temp],         %[u],              %[ub]         \n\t"
+    "psubsh     %[b_vec],        %[b_vec],          %[temp]       \n\t"
+    "psrah      %[b_vec],        %[b_vec],          %[six]        \n\t"
 
-      "paddsh     %[g_vec],        %[y],              %[bg]         \n\t"
-      "pmullh     %[temp],         %[u],              %[ug]         \n\t"
-      "psubsh     %[g_vec],        %[g_vec],          %[temp]       \n\t"
-      "pmullh     %[temp],         %[v],              %[vg]         \n\t"
-      "psubsh     %[g_vec],        %[g_vec],          %[temp]       \n\t"
-      "psrah      %[g_vec],        %[g_vec],          %[six]        \n\t"
+    "paddsh     %[g_vec],        %[y],              %[bg]         \n\t"  // g = (y + bg - u * ug - v * vg) >> 6
+    "pmullh     %[temp],         %[u],              %[ug]         \n\t"
+    "psubsh     %[g_vec],        %[g_vec],          %[temp]       \n\t"
+    "pmullh     %[temp],         %[v],              %[vg]         \n\t"
+    "psubsh     %[g_vec],        %[g_vec],          %[temp]       \n\t"
+    "psrah      %[g_vec],        %[g_vec],          %[six]        \n\t"
 
-      "paddsh     %[r_vec],        %[y],              %[br]         \n\t"
-      "pmullh     %[temp],         %[v],              %[vr]         \n\t"
-      "psubsh     %[r_vec],        %[r_vec],          %[temp]       \n\t"
-      "psrah      %[r_vec],        %[r_vec],          %[six]        \n\t"
+    "paddsh     %[r_vec],        %[y],              %[br]         \n\t"  // r = (y + br - v * vr) >> 6
+    "pmullh     %[temp],         %[v],              %[vr]         \n\t"
+    "psubsh     %[r_vec],        %[r_vec],          %[temp]       \n\t"
+    "psrah      %[r_vec],        %[r_vec],          %[six]        \n\t"
 
-      "packushb   %[r_vec],        %[b_vec],          %[r_vec]      \n\t"
-      "packushb   %[g_vec],        %[g_vec],          %[zero]       \n\t"
-      "punpcklwd  %[g_vec],        %[g_vec],          %[alpha]      \n\t"
-      "punpcklbh  %[b_vec],        %[r_vec],          %[g_vec]      \n\t"
-      "punpckhbh  %[r_vec],        %[r_vec],          %[g_vec]      \n\t"
-      "punpcklhw  %[g_vec],        %[b_vec],          %[r_vec]      \n\t"
-      "punpckhhw  %[b_vec],        %[b_vec],          %[r_vec]      \n\t"
+    "packushb   %[r_vec],        %[b_vec],          %[r_vec]      \n\t"  // clamp to unsigned bytes: r_vec = B0..B3,R0..R3
+    "packushb   %[g_vec],        %[g_vec],          %[zero]       \n\t"  // g_vec = G0..G3,0,0,0,0
+    "punpcklwd  %[g_vec],        %[g_vec],          %[alpha]      \n\t"  // high word <- low word of alpha (-1, i.e. four 0xff bytes)
+    "punpcklbh  %[b_vec],        %[r_vec],          %[g_vec]      \n\t"  // B0 G0 B1 G1 B2 G2 B3 G3
+    "punpckhbh  %[r_vec],        %[r_vec],          %[g_vec]      \n\t"  // R0 A  R1 A  R2 A  R3 A
+    "punpcklhw  %[g_vec],        %[b_vec],          %[r_vec]      \n\t"  // pixels 0-1, bytes ordered B,G,R,A
+    "punpckhhw  %[b_vec],        %[b_vec],          %[r_vec]      \n\t"  // pixels 2-3, bytes ordered B,G,R,A
 
-      "gssdlc1    %[g_vec],       0x07(%[rgbbuf_ptr])               \n\t"
-      "gssdrc1    %[g_vec],       0x00(%[rgbbuf_ptr])               \n\t"
-      "gssdlc1    %[b_vec],       0x0f(%[rgbbuf_ptr])               \n\t"
-      "gssdrc1    %[b_vec],       0x08(%[rgbbuf_ptr])               \n\t"
+    "gssdlc1    %[g_vec],       0x07(%[rgbbuf_ptr])               \n\t"  // unaligned 64-bit store of pixels 0-1
+    "gssdrc1    %[g_vec],       0x00(%[rgbbuf_ptr])               \n\t"
+    "gssdlc1    %[b_vec],       0x0f(%[rgbbuf_ptr])               \n\t"  // unaligned 64-bit store of pixels 2-3
+    "gssdrc1    %[b_vec],       0x08(%[rgbbuf_ptr])               \n\t"
 
-      "daddiu     %[y_ptr],        %[y_ptr],          0x04          \n\t"
-      "daddiu     %[uv_ptr],       %[uv_ptr],         0x04          \n\t"
-      "daddiu     %[rgbbuf_ptr],   %[rgbbuf_ptr],     0x10          \n\t"
-      "daddi      %[width],        %[width],          -0x04         \n\t"
-      "bnez       %[width],        1b                               \n\t"
+    "daddiu     %[y_ptr],        %[y_ptr],          0x04          \n\t"  // advance past 4 bytes of src_y
+    "daddiu     %[uv_ptr],       %[uv_ptr],         0x04          \n\t"  // advance past 4 bytes of src_uv
+    "daddiu     %[rgbbuf_ptr],   %[rgbbuf_ptr],     0x10          \n\t"  // advance past 16 output bytes
+    "daddi      %[width],        %[width],          -0x04         \n\t"  // 4 pixels consumed
+    "bnez       %[width],        1b                               \n\t"  // repeats until width is exactly 0, so width must be a positive multiple of 4
 
-      : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec),
-        [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp),
-        [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr),
-        [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg)
-      : [y_ptr] "r"(src_y), [uv_ptr] "r"(src_uv), [rgbbuf_ptr] "r"(rgb_buf),
-        [yuvcons_ptr] "r"(yuvconstants), [width] "r"(width), [zero] "f"(0x00),
-        [five] "f"(0x55), [six] "f"(0x6), [mask1] "f"(0xff00ff00ff00ff00),
-        [ushu] "f"(0xA0), [vshu] "f"(0xf5), [alpha] "f"(-1)
-      : "memory");
+    : [y]"=&f"(y),                         [u]"=&f"(u),  // outputs: early-clobber FP scratch registers
+      [v]"=&f"(v),
+      [b_vec]"=&f"(b_vec),                 [g_vec]"=&f"(g_vec),
+      [r_vec]"=&f"(r_vec),                 [temp]"=&f"(temp),
+      [ub]"=&f"(ub),                       [ug]"=&f"(ug),
+      [vg]"=&f"(vg),                       [vr]"=&f"(vr),
+      [bb]"=&f"(bb),                       [bg]"=&f"(bg),
+      [br]"=&f"(br),                       [yg]"=&f"(yg)
+    : [y_ptr]"r"(src_y),                   [uv_ptr]"r"(src_uv),  // NOTE(review): y_ptr, uv_ptr, rgbbuf_ptr and width are input-only ("r") yet the loop updates them - confirm against GCC extended-asm rules
+      [rgbbuf_ptr]"r"(rgb_buf),
+      [yuvcons_ptr]"r"(yuvconstants),      [width]"r"(width),
+      [zero]"f"(0x00),                     [five]"f"(0x55),  // pshufh selectors: 0x00 picks halfword 0, 0x55 picks halfword 1
+      [six]"f"(0x6),                       [mask1]"f"(0xff00ff00ff00ff00),  // shift count for psrah; high-byte mask for ub and vr
+      [ushu]"f"(0xA0),                     [vshu]"f"(0xf5),  // pshufh selectors for halfwords 0,0,2,2 and 1,1,3,3
+      [alpha]"f"(-1)
+    : "memory"
+  );
 }
 
 void NV21ToARGBRow_MMI(const uint8_t* src_y,
@@ -6976,83 +7031,91 @@ void NV21ToARGBRow_MMI(const uint8_t* src_y,
                        int width) {
   uint64_t y, u, v;
   uint64_t b_vec, g_vec, r_vec, temp;
-  uint64_t ub, ug, vg, vr, bb, bg, br, yg;
+  uint64_t ub,ug,vg,vr,bb,bg,br,yg;  // coefficient/bias registers, filled by the ldc1 loads below
 
   __asm__ volatile(
-      "ldc1       %[yg],           0xc0(%[yuvcons_ptr])             \n\t"
-      "ldc1       %[bb],           0x60(%[yuvcons_ptr])             \n\t"
-      "ldc1       %[ub],           0x00(%[yuvcons_ptr])             \n\t"
-      "or         %[ub],           %[ub],             %[mask1]      \n\t"
-      "ldc1       %[bg],           0x80(%[yuvcons_ptr])             \n\t"
-      "ldc1       %[ug],           0x20(%[yuvcons_ptr])             \n\t"
-      "punpcklbh  %[ug],           %[ug],             %[zero]       \n\t"
-      "pshufh     %[ug],           %[ug],             %[zero]       \n\t"
-      "ldc1       %[vg],           0x20(%[yuvcons_ptr])             \n\t"
-      "punpcklbh  %[vg],           %[vg],             %[zero]       \n\t"
-      "pshufh     %[vg],           %[vg],             %[five]       \n\t"
-      "ldc1       %[br],           0xa0(%[yuvcons_ptr])             \n\t"
-      "ldc1       %[vr],           0x40(%[yuvcons_ptr])             \n\t"
-      "punpcklbh  %[vr],           %[vr],             %[zero]       \n\t"
-      "pshufh     %[vr],           %[vr],             %[five]       \n\t"
-      "or         %[vr],           %[vr],             %[mask1]      \n\t"
+    "ldc1       %[yg],           0xc0(%[yuvcons_ptr])             \n\t"  // yg <- 8 bytes at yuvconstants+0xc0
+    "ldc1       %[bb],           0x60(%[yuvcons_ptr])             \n\t"  // bb <- 8 bytes at +0x60
+    "ldc1       %[ub],           0x00(%[yuvcons_ptr])             \n\t"  // ub <- 8 bytes at +0x00
+    "or         %[ub],           %[ub],             %[mask1]      \n\t"  // force the high byte of each halfword of ub to 0xff
+    "ldc1       %[bg],           0x80(%[yuvcons_ptr])             \n\t"  // bg <- 8 bytes at +0x80
+    "ldc1       %[ug],           0x20(%[yuvcons_ptr])             \n\t"  // ug <- 8 bytes at +0x20
+    "punpcklbh  %[ug],           %[ug],             %[zero]       \n\t"  // widen the low 4 bytes to halfwords
+    "pshufh     %[ug],           %[ug],             %[zero]       \n\t"  // broadcast halfword 0 (selector 0x00)
+    "ldc1       %[vg],           0x20(%[yuvcons_ptr])             \n\t"  // vg <- same +0x20 field as ug
+    "punpcklbh  %[vg],           %[vg],             %[zero]       \n\t"  // widen the low 4 bytes to halfwords
+    "pshufh     %[vg],           %[vg],             %[five]       \n\t"  // broadcast halfword 1 (selector 0x55)
+    "ldc1       %[br],           0xa0(%[yuvcons_ptr])             \n\t"  // br <- 8 bytes at +0xa0
+    "ldc1       %[vr],           0x40(%[yuvcons_ptr])             \n\t"  // vr <- 8 bytes at +0x40
+    "punpcklbh  %[vr],           %[vr],             %[zero]       \n\t"  // widen the low 4 bytes to halfwords
+    "pshufh     %[vr],           %[vr],             %[five]       \n\t"  // broadcast halfword 1 (selector 0x55)
+    "or         %[vr],           %[vr],             %[mask1]      \n\t"  // force the high byte of each halfword of vr to 0xff
 
-      "1:                                                           \n\t"
-      "gslwlc1    %[y],            0x03(%[y_ptr])                   \n\t"
-      "gslwrc1    %[y],            0x00(%[y_ptr])                   \n\t"
-      "gslwlc1    %[u],            0x03(%[vu_ptr])                  \n\t"
-      "gslwrc1    %[u],            0x00(%[vu_ptr])                  \n\t"
-      "punpcklbh  %[u],            %[u],              %[zero]       \n\t"
-      "pshufh     %[v],            %[u],              %[ushu]       \n\t"
-      "pshufh     %[u],            %[u],              %[vshu]       \n\t"
+    "1:                                                           \n\t"  // loop body: 4 pixels per pass
+    "gslwlc1    %[y],            0x03(%[y_ptr])                   \n\t"  // unaligned 32-bit load of 4 bytes from src_y
+    "gslwrc1    %[y],            0x00(%[y_ptr])                   \n\t"
+    "gslwlc1    %[u],            0x03(%[vu_ptr])                  \n\t"  // unaligned 32-bit load of 4 bytes from src_vu
+    "gslwrc1    %[u],            0x00(%[vu_ptr])                  \n\t"
+    "punpcklbh  %[u],            %[u],              %[zero]       \n\t"  // widen the 4 bytes to halfwords
+    "pshufh     %[v],            %[u],              %[ushu]       \n\t"  // v = halfwords 0,0,2,2 (selector 0xA0)
+    "pshufh     %[u],            %[u],              %[vshu]       \n\t"  // u = halfwords 1,1,3,3 (selector 0xf5)
 
-      "punpcklbh  %[y],            %[y],              %[y]          \n\t"
-      "pmulhuh    %[y],            %[y],              %[yg]         \n\t"
+    "punpcklbh  %[y],            %[y],              %[y]          \n\t"  // each Y byte duplicated into both bytes of a halfword
+    "pmulhuh    %[y],            %[y],              %[yg]         \n\t"  // y = high 16 bits of the unsigned product y * yg
 
-      "paddsh     %[b_vec],        %[y],              %[bb]         \n\t"
-      "pmullh     %[temp],         %[u],              %[ub]         \n\t"
-      "psubsh     %[b_vec],        %[b_vec],          %[temp]       \n\t"
-      "psrah      %[b_vec],        %[b_vec],          %[six]        \n\t"
+    "paddsh     %[b_vec],        %[y],              %[bb]         \n\t"  // b = (y + bb - u * ub) >> 6, saturating signed halfwords
+    "pmullh     %[temp],         %[u],              %[ub]         \n\t"
+    "psubsh     %[b_vec],        %[b_vec],          %[temp]       \n\t"
+    "psrah      %[b_vec],        %[b_vec],          %[six]        \n\t"
 
-      "paddsh     %[g_vec],        %[y],              %[bg]         \n\t"
-      "pmullh     %[temp],         %[u],              %[ug]         \n\t"
-      "psubsh     %[g_vec],        %[g_vec],          %[temp]       \n\t"
-      "pmullh     %[temp],         %[v],              %[vg]         \n\t"
-      "psubsh     %[g_vec],        %[g_vec],          %[temp]       \n\t"
-      "psrah      %[g_vec],        %[g_vec],          %[six]        \n\t"
+    "paddsh     %[g_vec],        %[y],              %[bg]         \n\t"  // g = (y + bg - u * ug - v * vg) >> 6
+    "pmullh     %[temp],         %[u],              %[ug]         \n\t"
+    "psubsh     %[g_vec],        %[g_vec],          %[temp]       \n\t"
+    "pmullh     %[temp],         %[v],              %[vg]         \n\t"
+    "psubsh     %[g_vec],        %[g_vec],          %[temp]       \n\t"
+    "psrah      %[g_vec],        %[g_vec],          %[six]        \n\t"
 
-      "paddsh     %[r_vec],        %[y],              %[br]         \n\t"
-      "pmullh     %[temp],         %[v],              %[vr]         \n\t"
-      "psubsh     %[r_vec],        %[r_vec],          %[temp]       \n\t"
-      "psrah      %[r_vec],        %[r_vec],          %[six]        \n\t"
+    "paddsh     %[r_vec],        %[y],              %[br]         \n\t"  // r = (y + br - v * vr) >> 6
+    "pmullh     %[temp],         %[v],              %[vr]         \n\t"
+    "psubsh     %[r_vec],        %[r_vec],          %[temp]       \n\t"
+    "psrah      %[r_vec],        %[r_vec],          %[six]        \n\t"
 
-      "packushb   %[r_vec],        %[b_vec],          %[r_vec]      \n\t"
-      "packushb   %[g_vec],        %[g_vec],          %[zero]       \n\t"
-      "punpcklwd  %[g_vec],        %[g_vec],          %[alpha]      \n\t"
-      "punpcklbh  %[b_vec],        %[r_vec],          %[g_vec]      \n\t"
-      "punpckhbh  %[r_vec],        %[r_vec],          %[g_vec]      \n\t"
-      "punpcklhw  %[g_vec],        %[b_vec],          %[r_vec]      \n\t"
-      "punpckhhw  %[b_vec],        %[b_vec],          %[r_vec]      \n\t"
+    "packushb   %[r_vec],        %[b_vec],          %[r_vec]      \n\t"  // clamp to unsigned bytes: r_vec = B0..B3,R0..R3
+    "packushb   %[g_vec],        %[g_vec],          %[zero]       \n\t"  // g_vec = G0..G3,0,0,0,0
+    "punpcklwd  %[g_vec],        %[g_vec],          %[alpha]      \n\t"  // high word <- low word of alpha (-1, i.e. four 0xff bytes)
+    "punpcklbh  %[b_vec],        %[r_vec],          %[g_vec]      \n\t"  // B0 G0 B1 G1 B2 G2 B3 G3
+    "punpckhbh  %[r_vec],        %[r_vec],          %[g_vec]      \n\t"  // R0 A  R1 A  R2 A  R3 A
+    "punpcklhw  %[g_vec],        %[b_vec],          %[r_vec]      \n\t"  // pixels 0-1, bytes ordered B,G,R,A
+    "punpckhhw  %[b_vec],        %[b_vec],          %[r_vec]      \n\t"  // pixels 2-3, bytes ordered B,G,R,A
 
-      "gssdlc1    %[g_vec],       0x07(%[rgbbuf_ptr])               \n\t"
-      "gssdrc1    %[g_vec],       0x00(%[rgbbuf_ptr])               \n\t"
-      "gssdlc1    %[b_vec],       0x0f(%[rgbbuf_ptr])               \n\t"
-      "gssdrc1    %[b_vec],       0x08(%[rgbbuf_ptr])               \n\t"
+    "gssdlc1    %[g_vec],       0x07(%[rgbbuf_ptr])               \n\t"  // unaligned 64-bit store of pixels 0-1
+    "gssdrc1    %[g_vec],       0x00(%[rgbbuf_ptr])               \n\t"
+    "gssdlc1    %[b_vec],       0x0f(%[rgbbuf_ptr])               \n\t"  // unaligned 64-bit store of pixels 2-3
+    "gssdrc1    %[b_vec],       0x08(%[rgbbuf_ptr])               \n\t"
 
-      "daddiu     %[y_ptr],        %[y_ptr],          0x04          \n\t"
-      "daddiu     %[vu_ptr],       %[vu_ptr],         0x04          \n\t"
-      "daddiu     %[rgbbuf_ptr],   %[rgbbuf_ptr],     0x10          \n\t"
-      "daddi      %[width],        %[width],          -0x04         \n\t"
-      "bnez       %[width],        1b                               \n\t"
+    "daddiu     %[y_ptr],        %[y_ptr],          0x04          \n\t"  // advance past 4 bytes of src_y
+    "daddiu     %[vu_ptr],       %[vu_ptr],         0x04          \n\t"  // advance past 4 bytes of src_vu
+    "daddiu     %[rgbbuf_ptr],   %[rgbbuf_ptr],     0x10          \n\t"  // advance past 16 output bytes
+    "daddi      %[width],        %[width],          -0x04         \n\t"  // 4 pixels consumed
+    "bnez       %[width],        1b                               \n\t"  // repeats until width is exactly 0, so width must be a positive multiple of 4
 
-      : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec),
-        [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp),
-        [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr),
-        [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg)
-      : [y_ptr] "r"(src_y), [vu_ptr] "r"(src_vu), [rgbbuf_ptr] "r"(rgb_buf),
-        [yuvcons_ptr] "r"(yuvconstants), [width] "r"(width), [zero] "f"(0x00),
-        [five] "f"(0x55), [six] "f"(0x6), [mask1] "f"(0xff00ff00ff00ff00),
-        [ushu] "f"(0xA0), [vshu] "f"(0xf5), [alpha] "f"(-1)
-      : "memory");
+    : [y]"=&f"(y),                         [u]"=&f"(u),  // outputs: early-clobber FP scratch registers
+      [v]"=&f"(v),
+      [b_vec]"=&f"(b_vec),                 [g_vec]"=&f"(g_vec),
+      [r_vec]"=&f"(r_vec),                 [temp]"=&f"(temp),
+      [ub]"=&f"(ub),                       [ug]"=&f"(ug),
+      [vg]"=&f"(vg),                       [vr]"=&f"(vr),
+      [bb]"=&f"(bb),                       [bg]"=&f"(bg),
+      [br]"=&f"(br),                       [yg]"=&f"(yg)
+    : [y_ptr]"r"(src_y),                   [vu_ptr]"r"(src_vu),  // NOTE(review): y_ptr, vu_ptr, rgbbuf_ptr and width are input-only ("r") yet the loop updates them - confirm against GCC extended-asm rules
+      [rgbbuf_ptr]"r"(rgb_buf),
+      [yuvcons_ptr]"r"(yuvconstants),      [width]"r"(width),
+      [zero]"f"(0x00),                     [five]"f"(0x55),  // pshufh selectors: 0x00 picks halfword 0, 0x55 picks halfword 1
+      [six]"f"(0x6),                       [mask1]"f"(0xff00ff00ff00ff00),  // shift count for psrah; high-byte mask for ub and vr
+      [ushu]"f"(0xA0),                     [vshu]"f"(0xf5),  // pshufh selectors for halfwords 0,0,2,2 and 1,1,3,3
+      [alpha]"f"(-1)
+    : "memory"
+  );
 }
 
 void NV12ToRGB24Row_MMI(const uint8_t* src_y,
@@ -7062,95 +7125,103 @@ void NV12ToRGB24Row_MMI(const uint8_t* src_y,
                         int width) {
   uint64_t y, u, v;
   uint64_t b_vec, g_vec, r_vec, temp;
-  uint64_t ub, ug, vg, vr, bb, bg, br, yg;
+  uint64_t ub,ug,vg,vr,bb,bg,br,yg;  // coefficient/bias registers, filled by the ldc1 loads below
 
   __asm__ volatile(
-      "ldc1       %[yg],           0xc0(%[yuvcons_ptr])             \n\t"
-      "ldc1       %[bb],           0x60(%[yuvcons_ptr])             \n\t"
-      "ldc1       %[ub],           0x00(%[yuvcons_ptr])             \n\t"
-      "or         %[ub],           %[ub],             %[mask1]      \n\t"
-      "ldc1       %[bg],           0x80(%[yuvcons_ptr])             \n\t"
-      "ldc1       %[ug],           0x20(%[yuvcons_ptr])             \n\t"
-      "punpcklbh  %[ug],           %[ug],             %[zero]       \n\t"
-      "pshufh     %[ug],           %[ug],             %[zero]       \n\t"
-      "ldc1       %[vg],           0x20(%[yuvcons_ptr])             \n\t"
-      "punpcklbh  %[vg],           %[vg],             %[zero]       \n\t"
-      "pshufh     %[vg],           %[vg],             %[five]       \n\t"
-      "ldc1       %[br],           0xa0(%[yuvcons_ptr])             \n\t"
-      "ldc1       %[vr],           0x40(%[yuvcons_ptr])             \n\t"
-      "punpcklbh  %[vr],           %[vr],             %[zero]       \n\t"
-      "pshufh     %[vr],           %[vr],             %[five]       \n\t"
-      "or         %[vr],           %[vr],             %[mask1]      \n\t"
+    "ldc1       %[yg],           0xc0(%[yuvcons_ptr])             \n\t"  // yg <- 8 bytes at yuvconstants+0xc0
+    "ldc1       %[bb],           0x60(%[yuvcons_ptr])             \n\t"  // bb <- 8 bytes at +0x60
+    "ldc1       %[ub],           0x00(%[yuvcons_ptr])             \n\t"  // ub <- 8 bytes at +0x00
+    "or         %[ub],           %[ub],             %[mask1]      \n\t"  // force the high byte of each halfword of ub to 0xff
+    "ldc1       %[bg],           0x80(%[yuvcons_ptr])             \n\t"  // bg <- 8 bytes at +0x80
+    "ldc1       %[ug],           0x20(%[yuvcons_ptr])             \n\t"  // ug <- 8 bytes at +0x20
+    "punpcklbh  %[ug],           %[ug],             %[zero]       \n\t"  // widen the low 4 bytes to halfwords
+    "pshufh     %[ug],           %[ug],             %[zero]       \n\t"  // broadcast halfword 0 (selector 0x00)
+    "ldc1       %[vg],           0x20(%[yuvcons_ptr])             \n\t"  // vg <- same +0x20 field as ug
+    "punpcklbh  %[vg],           %[vg],             %[zero]       \n\t"  // widen the low 4 bytes to halfwords
+    "pshufh     %[vg],           %[vg],             %[five]       \n\t"  // broadcast halfword 1 (selector 0x55)
+    "ldc1       %[br],           0xa0(%[yuvcons_ptr])             \n\t"  // br <- 8 bytes at +0xa0
+    "ldc1       %[vr],           0x40(%[yuvcons_ptr])             \n\t"  // vr <- 8 bytes at +0x40
+    "punpcklbh  %[vr],           %[vr],             %[zero]       \n\t"  // widen the low 4 bytes to halfwords
+    "pshufh     %[vr],           %[vr],             %[five]       \n\t"  // broadcast halfword 1 (selector 0x55)
+    "or         %[vr],           %[vr],             %[mask1]      \n\t"  // force the high byte of each halfword of vr to 0xff
 
-      "1:                                                           \n\t"
-      "gslwlc1    %[y],            0x03(%[y_ptr])                   \n\t"
-      "gslwrc1    %[y],            0x00(%[y_ptr])                   \n\t"
-      "gslwlc1    %[u],            0x03(%[uv_ptr])                  \n\t"
-      "gslwrc1    %[u],            0x00(%[uv_ptr])                  \n\t"
-      "punpcklbh  %[u],            %[u],              %[zero]       \n\t"
-      "pshufh     %[v],            %[u],              %[vshu]       \n\t"
-      "pshufh     %[u],            %[u],              %[ushu]       \n\t"
+    "1:                                                           \n\t"  // loop body: 4 pixels per pass
+    "gslwlc1    %[y],            0x03(%[y_ptr])                   \n\t"  // unaligned 32-bit load of 4 bytes from src_y
+    "gslwrc1    %[y],            0x00(%[y_ptr])                   \n\t"
+    "gslwlc1    %[u],            0x03(%[uv_ptr])                  \n\t"  // unaligned 32-bit load of 4 bytes from src_uv
+    "gslwrc1    %[u],            0x00(%[uv_ptr])                  \n\t"
+    "punpcklbh  %[u],            %[u],              %[zero]       \n\t"  // widen the 4 bytes to halfwords
+    "pshufh     %[v],            %[u],              %[vshu]       \n\t"  // v = halfwords 1,1,3,3 (selector 0xf5)
+    "pshufh     %[u],            %[u],              %[ushu]       \n\t"  // u = halfwords 0,0,2,2 (selector 0xA0)
 
-      "punpcklbh  %[y],            %[y],              %[y]          \n\t"
-      "pmulhuh    %[y],            %[y],              %[yg]         \n\t"
+    "punpcklbh  %[y],            %[y],              %[y]          \n\t"  // each Y byte duplicated into both bytes of a halfword
+    "pmulhuh    %[y],            %[y],              %[yg]         \n\t"  // y = high 16 bits of the unsigned product y * yg
 
-      "paddsh     %[b_vec],        %[y],              %[bb]         \n\t"
-      "pmullh     %[temp],         %[u],              %[ub]         \n\t"
-      "psubsh     %[b_vec],        %[b_vec],          %[temp]       \n\t"
-      "psrah      %[b_vec],        %[b_vec],          %[six]        \n\t"
+    "paddsh     %[b_vec],        %[y],              %[bb]         \n\t"  // b = (y + bb - u * ub) >> 6, saturating signed halfwords
+    "pmullh     %[temp],         %[u],              %[ub]         \n\t"
+    "psubsh     %[b_vec],        %[b_vec],          %[temp]       \n\t"
+    "psrah      %[b_vec],        %[b_vec],          %[six]        \n\t"
 
-      "paddsh     %[g_vec],        %[y],              %[bg]         \n\t"
-      "pmullh     %[temp],         %[u],              %[ug]         \n\t"
-      "psubsh     %[g_vec],        %[g_vec],          %[temp]       \n\t"
-      "pmullh     %[temp],         %[v],              %[vg]         \n\t"
-      "psubsh     %[g_vec],        %[g_vec],          %[temp]       \n\t"
-      "psrah      %[g_vec],        %[g_vec],          %[six]        \n\t"
+    "paddsh     %[g_vec],        %[y],              %[bg]         \n\t"  // g = (y + bg - u * ug - v * vg) >> 6
+    "pmullh     %[temp],         %[u],              %[ug]         \n\t"
+    "psubsh     %[g_vec],        %[g_vec],          %[temp]       \n\t"
+    "pmullh     %[temp],         %[v],              %[vg]         \n\t"
+    "psubsh     %[g_vec],        %[g_vec],          %[temp]       \n\t"
+    "psrah      %[g_vec],        %[g_vec],          %[six]        \n\t"
 
-      "paddsh     %[r_vec],        %[y],              %[br]         \n\t"
-      "pmullh     %[temp],         %[v],              %[vr]         \n\t"
-      "psubsh     %[r_vec],        %[r_vec],          %[temp]       \n\t"
-      "psrah      %[r_vec],        %[r_vec],          %[six]        \n\t"
+    "paddsh     %[r_vec],        %[y],              %[br]         \n\t"  // r = (y + br - v * vr) >> 6
+    "pmullh     %[temp],         %[v],              %[vr]         \n\t"
+    "psubsh     %[r_vec],        %[r_vec],          %[temp]       \n\t"
+    "psrah      %[r_vec],        %[r_vec],          %[six]        \n\t"
 
-      "packushb   %[r_vec],        %[b_vec],          %[r_vec]      \n\t"
-      "packushb   %[g_vec],        %[g_vec],          %[zero]       \n\t"
-      "punpcklbh  %[b_vec],        %[r_vec],          %[g_vec]      \n\t"
-      "punpckhbh  %[r_vec],        %[r_vec],          %[g_vec]      \n\t"
-      "punpcklhw  %[g_vec],        %[b_vec],          %[r_vec]      \n\t"
-      "punpckhhw  %[b_vec],        %[b_vec],          %[r_vec]      \n\t"
+    "packushb   %[r_vec],        %[b_vec],          %[r_vec]      \n\t"  // clamp to unsigned bytes: r_vec = B0..B3,R0..R3
+    "packushb   %[g_vec],        %[g_vec],          %[zero]       \n\t"  // g_vec = G0..G3,0,0,0,0
+    "punpcklbh  %[b_vec],        %[r_vec],          %[g_vec]      \n\t"  // B0 G0 B1 G1 B2 G2 B3 G3
+    "punpckhbh  %[r_vec],        %[r_vec],          %[g_vec]      \n\t"  // R0 0  R1 0  R2 0  R3 0
+    "punpcklhw  %[g_vec],        %[b_vec],          %[r_vec]      \n\t"  // pixels 0-1 as B,G,R,0
+    "punpckhhw  %[b_vec],        %[b_vec],          %[r_vec]      \n\t"  // pixels 2-3 as B,G,R,0
 
-      "punpckhwd  %[r_vec],        %[g_vec],          %[g_vec]      \n\t"
-      "psllw      %[temp],         %[r_vec],          %[lmove1]     \n\t"
-      "or         %[g_vec],        %[g_vec],          %[temp]       \n\t"
-      "psrlw      %[temp],         %[r_vec],          %[rmove1]     \n\t"
-      "pextrh     %[temp],         %[temp],           %[zero]       \n\t"
-      "pinsrh_2   %[g_vec],        %[g_vec],          %[temp]       \n\t"
-      "pextrh     %[temp],         %[b_vec],          %[zero]       \n\t"
-      "pinsrh_3   %[g_vec],        %[g_vec],          %[temp]       \n\t"
-      "pextrh     %[temp],         %[b_vec],          %[one]        \n\t"
-      "punpckhwd  %[b_vec],        %[b_vec],          %[b_vec]      \n\t"
-      "psllw      %[b_vec],        %[b_vec],          %[rmove1]     \n\t"
-      "or         %[b_vec],        %[b_vec],          %[temp]       \n\t"
-      "gssdlc1    %[g_vec],        0x07(%[rgbbuf_ptr])              \n\t"
-      "gssdrc1    %[g_vec],        0x00(%[rgbbuf_ptr])              \n\t"
-      "gsswlc1    %[b_vec],        0x0b(%[rgbbuf_ptr])              \n\t"
-      "gsswrc1    %[b_vec],        0x08(%[rgbbuf_ptr])              \n\t"
+    "punpckhwd  %[r_vec],        %[g_vec],          %[g_vec]      \n\t"  // squeeze 16 bytes to 12: r_vec = pixel 1 in both words
+    "psllw      %[temp],         %[r_vec],          %[lmove1]     \n\t"  // each word << 24: B1 moves to the top byte
+    "or         %[g_vec],        %[g_vec],          %[temp]       \n\t"  // low word becomes B0 G0 R0 B1
+    "psrlw      %[temp],         %[r_vec],          %[rmove1]     \n\t"  // each word >> 8: low halfword becomes G1 R1
+    "pextrh     %[temp],         %[temp],           %[zero]       \n\t"  // temp = halfword 0 (G1 R1)
+    "pinsrh_2   %[g_vec],        %[g_vec],          %[temp]       \n\t"  // halfword 2 of g_vec <- G1 R1
+    "pextrh     %[temp],         %[b_vec],          %[zero]       \n\t"  // temp = halfword 0 of b_vec (B2 G2)
+    "pinsrh_3   %[g_vec],        %[g_vec],          %[temp]       \n\t"  // halfword 3 <- B2 G2; g_vec = B0 G0 R0 B1 G1 R1 B2 G2
+    "pextrh     %[temp],         %[b_vec],          %[one]        \n\t"  // temp = halfword 1 of b_vec (R2 0)
+    "punpckhwd  %[b_vec],        %[b_vec],          %[b_vec]      \n\t"  // b_vec = pixel 3 in both words
+    "psllw      %[b_vec],        %[b_vec],          %[rmove1]     \n\t"  // each word << 8: 0 B3 G3 R3
+    "or         %[b_vec],        %[b_vec],          %[temp]       \n\t"  // low word becomes R2 B3 G3 R3 (assumes pextrh zero-extends - confirm)
+    "gssdlc1    %[g_vec],        0x07(%[rgbbuf_ptr])              \n\t"  // unaligned 64-bit store: output bytes 0-7
+    "gssdrc1    %[g_vec],        0x00(%[rgbbuf_ptr])              \n\t"
+    "gsswlc1    %[b_vec],        0x0b(%[rgbbuf_ptr])              \n\t"  // unaligned 32-bit store: output bytes 8-11
+    "gsswrc1    %[b_vec],        0x08(%[rgbbuf_ptr])              \n\t"
 
-      "daddiu     %[y_ptr],        %[y_ptr],          0x04          \n\t"
-      "daddiu     %[uv_ptr],       %[uv_ptr],         0x04          \n\t"
-      "daddiu     %[rgbbuf_ptr],   %[rgbbuf_ptr],     0x0C          \n\t"
-      "daddi      %[width],        %[width],          -0x04         \n\t"
-      "bnez       %[width],        1b                               \n\t"
+    "daddiu     %[y_ptr],        %[y_ptr],          0x04          \n\t"  // advance past 4 bytes of src_y
+    "daddiu     %[uv_ptr],       %[uv_ptr],         0x04          \n\t"  // advance past 4 bytes of src_uv
+    "daddiu     %[rgbbuf_ptr],   %[rgbbuf_ptr],     0x0C          \n\t"  // advance past 12 output bytes
+    "daddi      %[width],        %[width],          -0x04         \n\t"  // 4 pixels consumed
+    "bnez       %[width],        1b                               \n\t"  // repeats until width is exactly 0, so width must be a positive multiple of 4
 
-      : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec),
-        [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp),
-        [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr),
-        [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg)
-      : [y_ptr] "r"(src_y), [uv_ptr] "r"(src_uv), [rgbbuf_ptr] "r"(rgb_buf),
-        [yuvcons_ptr] "r"(yuvconstants), [width] "r"(width), [zero] "f"(0x00),
-        [five] "f"(0x55), [six] "f"(0x6), [mask1] "f"(0xff00ff00ff00ff00),
-        [ushu] "f"(0xA0), [vshu] "f"(0xf5), [alpha] "f"(-1), [lmove1] "f"(0x18),
-        [one] "f"(0x1), [rmove1] "f"(0x8)
-      : "memory");
+    : [y]"=&f"(y),                         [u]"=&f"(u),  // outputs: early-clobber FP scratch registers
+      [v]"=&f"(v),
+      [b_vec]"=&f"(b_vec),                 [g_vec]"=&f"(g_vec),
+      [r_vec]"=&f"(r_vec),                 [temp]"=&f"(temp),
+      [ub]"=&f"(ub),                       [ug]"=&f"(ug),
+      [vg]"=&f"(vg),                       [vr]"=&f"(vr),
+      [bb]"=&f"(bb),                       [bg]"=&f"(bg),
+      [br]"=&f"(br),                       [yg]"=&f"(yg)
+    : [y_ptr]"r"(src_y),                   [uv_ptr]"r"(src_uv),  // NOTE(review): y_ptr, uv_ptr, rgbbuf_ptr and width are input-only ("r") yet the loop updates them - confirm against GCC extended-asm rules
+      [rgbbuf_ptr]"r"(rgb_buf),
+      [yuvcons_ptr]"r"(yuvconstants),      [width]"r"(width),
+      [zero]"f"(0x00),                     [five]"f"(0x55),  // pshufh selectors: 0x00 picks halfword 0, 0x55 picks halfword 1
+      [six]"f"(0x6),                       [mask1]"f"(0xff00ff00ff00ff00),  // shift count for psrah; high-byte mask for ub and vr
+      [ushu]"f"(0xA0),                     [vshu]"f"(0xf5),  // pshufh selectors for halfwords 0,0,2,2 and 1,1,3,3
+      [alpha]"f"(-1),                      [lmove1]"f"(0x18),  // NOTE(review): %[alpha] is never referenced by the asm body; lmove1 is the 24-bit shift count
+      [one]"f"(0x1),                       [rmove1]"f"(0x8)  // pextrh index 1; 8-bit shift count
+    : "memory"
+  );
 }
 
 void NV21ToRGB24Row_MMI(const uint8_t* src_y,
@@ -7160,95 +7231,103 @@ void NV21ToRGB24Row_MMI(const uint8_t* src_y,
                         int width) {
   uint64_t y, u, v;
   uint64_t b_vec, g_vec, r_vec, temp;
-  uint64_t ub, ug, vg, vr, bb, bg, br, yg;
+  uint64_t ub,ug,vg,vr,bb,bg,br,yg;
 
   __asm__ volatile(
-      "ldc1       %[yg],           0xc0(%[yuvcons_ptr])             \n\t"
-      "ldc1       %[bb],           0x60(%[yuvcons_ptr])             \n\t"
-      "ldc1       %[ub],           0x00(%[yuvcons_ptr])             \n\t"
-      "or         %[ub],           %[ub],             %[mask1]      \n\t"
-      "ldc1       %[bg],           0x80(%[yuvcons_ptr])             \n\t"
-      "ldc1       %[ug],           0x20(%[yuvcons_ptr])             \n\t"
-      "punpcklbh  %[ug],           %[ug],             %[zero]       \n\t"
-      "pshufh     %[ug],           %[ug],             %[zero]       \n\t"
-      "ldc1       %[vg],           0x20(%[yuvcons_ptr])             \n\t"
-      "punpcklbh  %[vg],           %[vg],             %[zero]       \n\t"
-      "pshufh     %[vg],           %[vg],             %[five]       \n\t"
-      "ldc1       %[br],           0xa0(%[yuvcons_ptr])             \n\t"
-      "ldc1       %[vr],           0x40(%[yuvcons_ptr])             \n\t"
-      "punpcklbh  %[vr],           %[vr],             %[zero]       \n\t"
-      "pshufh     %[vr],           %[vr],             %[five]       \n\t"
-      "or         %[vr],           %[vr],             %[mask1]      \n\t"
+    "ldc1       %[yg],           0xc0(%[yuvcons_ptr])             \n\t"
+    "ldc1       %[bb],           0x60(%[yuvcons_ptr])             \n\t"
+    "ldc1       %[ub],           0x00(%[yuvcons_ptr])             \n\t"
+    "or         %[ub],           %[ub],             %[mask1]      \n\t"
+    "ldc1       %[bg],           0x80(%[yuvcons_ptr])             \n\t"
+    "ldc1       %[ug],           0x20(%[yuvcons_ptr])             \n\t"
+    "punpcklbh  %[ug],           %[ug],             %[zero]       \n\t"
+    "pshufh     %[ug],           %[ug],             %[zero]       \n\t"
+    "ldc1       %[vg],           0x20(%[yuvcons_ptr])             \n\t"
+    "punpcklbh  %[vg],           %[vg],             %[zero]       \n\t"
+    "pshufh     %[vg],           %[vg],             %[five]       \n\t"
+    "ldc1       %[br],           0xa0(%[yuvcons_ptr])             \n\t"
+    "ldc1       %[vr],           0x40(%[yuvcons_ptr])             \n\t"
+    "punpcklbh  %[vr],           %[vr],             %[zero]       \n\t"
+    "pshufh     %[vr],           %[vr],             %[five]       \n\t"
+    "or         %[vr],           %[vr],             %[mask1]      \n\t"
 
-      "1:                                                           \n\t"
-      "gslwlc1    %[y],            0x03(%[y_ptr])                   \n\t"
-      "gslwrc1    %[y],            0x00(%[y_ptr])                   \n\t"
-      "gslwlc1    %[u],            0x03(%[vu_ptr])                  \n\t"
-      "gslwrc1    %[u],            0x00(%[vu_ptr])                  \n\t"
-      "punpcklbh  %[u],            %[u],              %[zero]       \n\t"
-      "pshufh     %[v],            %[u],              %[ushu]       \n\t"
-      "pshufh     %[u],            %[u],              %[vshu]       \n\t"
+    "1:                                                           \n\t"
+    "gslwlc1    %[y],            0x03(%[y_ptr])                   \n\t"
+    "gslwrc1    %[y],            0x00(%[y_ptr])                   \n\t"
+    "gslwlc1    %[u],            0x03(%[vu_ptr])                  \n\t"
+    "gslwrc1    %[u],            0x00(%[vu_ptr])                  \n\t"
+    "punpcklbh  %[u],            %[u],              %[zero]       \n\t"
+    "pshufh     %[v],            %[u],              %[ushu]       \n\t"
+    "pshufh     %[u],            %[u],              %[vshu]       \n\t"
 
-      "punpcklbh  %[y],            %[y],              %[y]          \n\t"
-      "pmulhuh    %[y],            %[y],              %[yg]         \n\t"
+    "punpcklbh  %[y],            %[y],              %[y]          \n\t"
+    "pmulhuh    %[y],            %[y],              %[yg]         \n\t"
 
-      "paddsh     %[b_vec],        %[y],              %[bb]         \n\t"
-      "pmullh     %[temp],         %[u],              %[ub]         \n\t"
-      "psubsh     %[b_vec],        %[b_vec],          %[temp]       \n\t"
-      "psrah      %[b_vec],        %[b_vec],          %[six]        \n\t"
+    "paddsh     %[b_vec],        %[y],              %[bb]         \n\t"
+    "pmullh     %[temp],         %[u],              %[ub]         \n\t"
+    "psubsh     %[b_vec],        %[b_vec],          %[temp]       \n\t"
+    "psrah      %[b_vec],        %[b_vec],          %[six]        \n\t"
 
-      "paddsh     %[g_vec],        %[y],              %[bg]         \n\t"
-      "pmullh     %[temp],         %[u],              %[ug]         \n\t"
-      "psubsh     %[g_vec],        %[g_vec],          %[temp]       \n\t"
-      "pmullh     %[temp],         %[v],              %[vg]         \n\t"
-      "psubsh     %[g_vec],        %[g_vec],          %[temp]       \n\t"
-      "psrah      %[g_vec],        %[g_vec],          %[six]        \n\t"
+    "paddsh     %[g_vec],        %[y],              %[bg]         \n\t"
+    "pmullh     %[temp],         %[u],              %[ug]         \n\t"
+    "psubsh     %[g_vec],        %[g_vec],          %[temp]       \n\t"
+    "pmullh     %[temp],         %[v],              %[vg]         \n\t"
+    "psubsh     %[g_vec],        %[g_vec],          %[temp]       \n\t"
+    "psrah      %[g_vec],        %[g_vec],          %[six]        \n\t"
 
-      "paddsh     %[r_vec],        %[y],              %[br]         \n\t"
-      "pmullh     %[temp],         %[v],              %[vr]         \n\t"
-      "psubsh     %[r_vec],        %[r_vec],          %[temp]       \n\t"
-      "psrah      %[r_vec],        %[r_vec],          %[six]        \n\t"
+    "paddsh     %[r_vec],        %[y],              %[br]         \n\t"
+    "pmullh     %[temp],         %[v],              %[vr]         \n\t"
+    "psubsh     %[r_vec],        %[r_vec],          %[temp]       \n\t"
+    "psrah      %[r_vec],        %[r_vec],          %[six]        \n\t"
 
-      "packushb   %[r_vec],        %[b_vec],          %[r_vec]      \n\t"
-      "packushb   %[g_vec],        %[g_vec],          %[zero]       \n\t"
-      "punpcklbh  %[b_vec],        %[r_vec],          %[g_vec]      \n\t"
-      "punpckhbh  %[r_vec],        %[r_vec],          %[g_vec]      \n\t"
-      "punpcklhw  %[g_vec],        %[b_vec],          %[r_vec]      \n\t"
-      "punpckhhw  %[b_vec],        %[b_vec],          %[r_vec]      \n\t"
+    "packushb   %[r_vec],        %[b_vec],          %[r_vec]      \n\t"
+    "packushb   %[g_vec],        %[g_vec],          %[zero]       \n\t"
+    "punpcklbh  %[b_vec],        %[r_vec],          %[g_vec]      \n\t"
+    "punpckhbh  %[r_vec],        %[r_vec],          %[g_vec]      \n\t"
+    "punpcklhw  %[g_vec],        %[b_vec],          %[r_vec]      \n\t"
+    "punpckhhw  %[b_vec],        %[b_vec],          %[r_vec]      \n\t"
 
-      "punpckhwd  %[r_vec],        %[g_vec],          %[g_vec]      \n\t"
-      "psllw      %[temp],         %[r_vec],          %[lmove1]     \n\t"
-      "or         %[g_vec],        %[g_vec],          %[temp]       \n\t"
-      "psrlw      %[temp],         %[r_vec],          %[rmove1]     \n\t"
-      "pextrh     %[temp],         %[temp],           %[zero]       \n\t"
-      "pinsrh_2   %[g_vec],        %[g_vec],          %[temp]       \n\t"
-      "pextrh     %[temp],         %[b_vec],          %[zero]       \n\t"
-      "pinsrh_3   %[g_vec],        %[g_vec],          %[temp]       \n\t"
-      "pextrh     %[temp],         %[b_vec],          %[one]        \n\t"
-      "punpckhwd  %[b_vec],        %[b_vec],          %[b_vec]      \n\t"
-      "psllw      %[b_vec],        %[b_vec],          %[rmove1]     \n\t"
-      "or         %[b_vec],        %[b_vec],          %[temp]       \n\t"
-      "gssdlc1    %[g_vec],        0x07(%[rgbbuf_ptr])              \n\t"
-      "gssdrc1    %[g_vec],        0x00(%[rgbbuf_ptr])              \n\t"
-      "gsswlc1    %[b_vec],        0x0b(%[rgbbuf_ptr])              \n\t"
-      "gsswrc1    %[b_vec],        0x08(%[rgbbuf_ptr])              \n\t"
+    "punpckhwd  %[r_vec],        %[g_vec],          %[g_vec]      \n\t"
+    "psllw      %[temp],         %[r_vec],          %[lmove1]     \n\t"
+    "or         %[g_vec],        %[g_vec],          %[temp]       \n\t"
+    "psrlw      %[temp],         %[r_vec],          %[rmove1]     \n\t"
+    "pextrh     %[temp],         %[temp],           %[zero]       \n\t"
+    "pinsrh_2   %[g_vec],        %[g_vec],          %[temp]       \n\t"
+    "pextrh     %[temp],         %[b_vec],          %[zero]       \n\t"
+    "pinsrh_3   %[g_vec],        %[g_vec],          %[temp]       \n\t"
+    "pextrh     %[temp],         %[b_vec],          %[one]        \n\t"
+    "punpckhwd  %[b_vec],        %[b_vec],          %[b_vec]      \n\t"
+    "psllw      %[b_vec],        %[b_vec],          %[rmove1]     \n\t"
+    "or         %[b_vec],        %[b_vec],          %[temp]       \n\t"
+    "gssdlc1    %[g_vec],        0x07(%[rgbbuf_ptr])              \n\t"
+    "gssdrc1    %[g_vec],        0x00(%[rgbbuf_ptr])              \n\t"
+    "gsswlc1    %[b_vec],        0x0b(%[rgbbuf_ptr])              \n\t"
+    "gsswrc1    %[b_vec],        0x08(%[rgbbuf_ptr])              \n\t"
 
-      "daddiu     %[y_ptr],        %[y_ptr],          0x04          \n\t"
-      "daddiu     %[vu_ptr],       %[vu_ptr],         0x04          \n\t"
-      "daddiu     %[rgbbuf_ptr],   %[rgbbuf_ptr],     0x0C          \n\t"
-      "daddi      %[width],        %[width],          -0x04         \n\t"
-      "bnez       %[width],        1b                               \n\t"
+    "daddiu     %[y_ptr],        %[y_ptr],          0x04          \n\t"
+    "daddiu     %[vu_ptr],       %[vu_ptr],         0x04          \n\t"
+    "daddiu     %[rgbbuf_ptr],   %[rgbbuf_ptr],     0x0C          \n\t"
+    "daddi      %[width],        %[width],          -0x04         \n\t"
+    "bnez       %[width],        1b                               \n\t"
 
-      : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec),
-        [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp),
-        [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr),
-        [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg)
-      : [y_ptr] "r"(src_y), [vu_ptr] "r"(src_vu), [rgbbuf_ptr] "r"(rgb_buf),
-        [yuvcons_ptr] "r"(yuvconstants), [width] "r"(width), [zero] "f"(0x00),
-        [five] "f"(0x55), [six] "f"(0x6), [mask1] "f"(0xff00ff00ff00ff00),
-        [ushu] "f"(0xA0), [vshu] "f"(0xf5), [lmove1] "f"(0x18),
-        [rmove1] "f"(0x8), [one] "f"(0x1)
-      : "memory");
+    : [y]"=&f"(y),                         [u]"=&f"(u),
+      [v]"=&f"(v),
+      [b_vec]"=&f"(b_vec),                 [g_vec]"=&f"(g_vec),
+      [r_vec]"=&f"(r_vec),                 [temp]"=&f"(temp),
+      [ub]"=&f"(ub),                       [ug]"=&f"(ug),
+      [vg]"=&f"(vg),                       [vr]"=&f"(vr),
+      [bb]"=&f"(bb),                       [bg]"=&f"(bg),
+      [br]"=&f"(br),                       [yg]"=&f"(yg)
+    : [y_ptr]"r"(src_y),                   [vu_ptr]"r"(src_vu),
+      [rgbbuf_ptr]"r"(rgb_buf),
+      [yuvcons_ptr]"r"(yuvconstants),      [width]"r"(width),
+      [zero]"f"(0x00),                     [five]"f"(0x55),
+      [six]"f"(0x6),                       [mask1]"f"(0xff00ff00ff00ff00),
+      [ushu]"f"(0xA0),                     [vshu]"f"(0xf5),
+      [lmove1]"f"(0x18),                   [rmove1]"f"(0x8),
+      [one]"f"(0x1)
+    : "memory"
+  );
 }
 
 void NV12ToRGB565Row_MMI(const uint8_t* src_y,
@@ -7258,115 +7337,123 @@ void NV12ToRGB565Row_MMI(const uint8_t* src_y,
                          int width) {
   uint64_t y, u, v;
   uint64_t b_vec, g_vec, r_vec, temp;
-  uint64_t ub, ug, vg, vr, bb, bg, br, yg;
+  uint64_t ub,ug,vg,vr,bb,bg,br,yg;
 
   __asm__ volatile(
-      "ldc1       %[yg],           0xc0(%[yuvcons_ptr])             \n\t"
-      "ldc1       %[bb],           0x60(%[yuvcons_ptr])             \n\t"
-      "ldc1       %[ub],           0x00(%[yuvcons_ptr])             \n\t"
-      "or         %[ub],           %[ub],             %[mask1]      \n\t"
-      "ldc1       %[bg],           0x80(%[yuvcons_ptr])             \n\t"
-      "ldc1       %[ug],           0x20(%[yuvcons_ptr])             \n\t"
-      "punpcklbh  %[ug],           %[ug],             %[zero]       \n\t"
-      "pshufh     %[ug],           %[ug],             %[zero]       \n\t"
-      "ldc1       %[vg],           0x20(%[yuvcons_ptr])             \n\t"
-      "punpcklbh  %[vg],           %[vg],             %[zero]       \n\t"
-      "pshufh     %[vg],           %[vg],             %[five]       \n\t"
-      "ldc1       %[br],           0xa0(%[yuvcons_ptr])             \n\t"
-      "ldc1       %[vr],           0x40(%[yuvcons_ptr])             \n\t"
-      "punpcklbh  %[vr],           %[vr],             %[zero]       \n\t"
-      "pshufh     %[vr],           %[vr],             %[five]       \n\t"
-      "or         %[vr],           %[vr],             %[mask1]      \n\t"
+    "ldc1       %[yg],           0xc0(%[yuvcons_ptr])             \n\t"
+    "ldc1       %[bb],           0x60(%[yuvcons_ptr])             \n\t"
+    "ldc1       %[ub],           0x00(%[yuvcons_ptr])             \n\t"
+    "or         %[ub],           %[ub],             %[mask1]      \n\t"
+    "ldc1       %[bg],           0x80(%[yuvcons_ptr])             \n\t"
+    "ldc1       %[ug],           0x20(%[yuvcons_ptr])             \n\t"
+    "punpcklbh  %[ug],           %[ug],             %[zero]       \n\t"
+    "pshufh     %[ug],           %[ug],             %[zero]       \n\t"
+    "ldc1       %[vg],           0x20(%[yuvcons_ptr])             \n\t"
+    "punpcklbh  %[vg],           %[vg],             %[zero]       \n\t"
+    "pshufh     %[vg],           %[vg],             %[five]       \n\t"
+    "ldc1       %[br],           0xa0(%[yuvcons_ptr])             \n\t"
+    "ldc1       %[vr],           0x40(%[yuvcons_ptr])             \n\t"
+    "punpcklbh  %[vr],           %[vr],             %[zero]       \n\t"
+    "pshufh     %[vr],           %[vr],             %[five]       \n\t"
+    "or         %[vr],           %[vr],             %[mask1]      \n\t"
 
-      "1:                                                           \n\t"
-      "gslwlc1    %[y],            0x03(%[y_ptr])                   \n\t"
-      "gslwrc1    %[y],            0x00(%[y_ptr])                   \n\t"
-      "gslwlc1    %[u],            0x03(%[uv_ptr])                  \n\t"
-      "gslwrc1    %[u],            0x00(%[uv_ptr])                  \n\t"
-      "punpcklbh  %[u],            %[u],              %[zero]       \n\t"
-      "pshufh     %[v],            %[u],              %[vshu]       \n\t"
-      "pshufh     %[u],            %[u],              %[ushu]       \n\t"
+    "1:                                                           \n\t"
+    "gslwlc1    %[y],            0x03(%[y_ptr])                   \n\t"
+    "gslwrc1    %[y],            0x00(%[y_ptr])                   \n\t"
+    "gslwlc1    %[u],            0x03(%[uv_ptr])                  \n\t"
+    "gslwrc1    %[u],            0x00(%[uv_ptr])                  \n\t"
+    "punpcklbh  %[u],            %[u],              %[zero]       \n\t"
+    "pshufh     %[v],            %[u],              %[vshu]       \n\t"
+    "pshufh     %[u],            %[u],              %[ushu]       \n\t"
 
-      "punpcklbh  %[y],            %[y],              %[y]          \n\t"
-      "pmulhuh    %[y],            %[y],              %[yg]         \n\t"
+    "punpcklbh  %[y],            %[y],              %[y]          \n\t"
+    "pmulhuh    %[y],            %[y],              %[yg]         \n\t"
 
-      "paddsh     %[b_vec],        %[y],              %[bb]         \n\t"
-      "pmullh     %[temp],         %[u],              %[ub]         \n\t"
-      "psubsh     %[b_vec],        %[b_vec],          %[temp]       \n\t"
-      "psrah      %[b_vec],        %[b_vec],          %[six]        \n\t"
+    "paddsh     %[b_vec],        %[y],              %[bb]         \n\t"
+    "pmullh     %[temp],         %[u],              %[ub]         \n\t"
+    "psubsh     %[b_vec],        %[b_vec],          %[temp]       \n\t"
+    "psrah      %[b_vec],        %[b_vec],          %[six]        \n\t"
 
-      "paddsh     %[g_vec],        %[y],              %[bg]         \n\t"
-      "pmullh     %[temp],         %[u],              %[ug]         \n\t"
-      "psubsh     %[g_vec],        %[g_vec],          %[temp]       \n\t"
-      "pmullh     %[temp],         %[v],              %[vg]         \n\t"
-      "psubsh     %[g_vec],        %[g_vec],          %[temp]       \n\t"
-      "psrah      %[g_vec],        %[g_vec],          %[six]        \n\t"
+    "paddsh     %[g_vec],        %[y],              %[bg]         \n\t"
+    "pmullh     %[temp],         %[u],              %[ug]         \n\t"
+    "psubsh     %[g_vec],        %[g_vec],          %[temp]       \n\t"
+    "pmullh     %[temp],         %[v],              %[vg]         \n\t"
+    "psubsh     %[g_vec],        %[g_vec],          %[temp]       \n\t"
+    "psrah      %[g_vec],        %[g_vec],          %[six]        \n\t"
 
-      "paddsh     %[r_vec],        %[y],              %[br]         \n\t"
-      "pmullh     %[temp],         %[v],              %[vr]         \n\t"
-      "psubsh     %[r_vec],        %[r_vec],          %[temp]       \n\t"
-      "psrah      %[r_vec],        %[r_vec],          %[six]        \n\t"
+    "paddsh     %[r_vec],        %[y],              %[br]         \n\t"
+    "pmullh     %[temp],         %[v],              %[vr]         \n\t"
+    "psubsh     %[r_vec],        %[r_vec],          %[temp]       \n\t"
+    "psrah      %[r_vec],        %[r_vec],          %[six]        \n\t"
 
-      "packushb   %[r_vec],        %[b_vec],          %[r_vec]      \n\t"
-      "packushb   %[g_vec],        %[g_vec],          %[zero]       \n\t"
-      "punpcklbh  %[b_vec],        %[r_vec],          %[g_vec]      \n\t"
-      "punpckhbh  %[r_vec],        %[r_vec],          %[g_vec]      \n\t"
-      "punpcklhw  %[g_vec],        %[b_vec],          %[r_vec]      \n\t"
-      "punpckhhw  %[b_vec],        %[b_vec],          %[r_vec]      \n\t"
+    "packushb   %[r_vec],        %[b_vec],          %[r_vec]      \n\t"
+    "packushb   %[g_vec],        %[g_vec],          %[zero]       \n\t"
+    "punpcklbh  %[b_vec],        %[r_vec],          %[g_vec]      \n\t"
+    "punpckhbh  %[r_vec],        %[r_vec],          %[g_vec]      \n\t"
+    "punpcklhw  %[g_vec],        %[b_vec],          %[r_vec]      \n\t"
+    "punpckhhw  %[b_vec],        %[b_vec],          %[r_vec]      \n\t"
 
-      "psrlh      %[temp],         %[g_vec],          %[three]      \n\t"
-      "and        %[g_vec],        %[temp],           %[mask2]      \n\t"
-      "psrlw      %[temp],         %[temp],           %[seven]      \n\t"
-      "psrlw      %[r_vec],        %[mask1],          %[eight]      \n\t"
-      "and        %[r_vec],        %[temp],           %[r_vec]      \n\t"
-      "psubb      %[y],            %[eight],          %[three]      \n\t"  // 5
-      "psllw      %[r_vec],        %[r_vec],          %[y]          \n\t"
-      "or         %[g_vec],        %[g_vec],          %[r_vec]      \n\t"
-      "paddb      %[r_vec],        %[three],          %[six]        \n\t"
-      "psrlw      %[temp],         %[temp],           %[r_vec]      \n\t"
-      "and        %[r_vec],        %[temp],           %[mask2]      \n\t"
-      "paddb      %[temp],         %[three],          %[eight]      \n\t"
-      "psllw      %[r_vec],        %[r_vec],          %[temp]       \n\t"
-      "or         %[g_vec],        %[g_vec],          %[r_vec]      \n\t"
+    "psrlh      %[temp],         %[g_vec],          %[three]      \n\t"
+    "and        %[g_vec],        %[temp],           %[mask2]      \n\t"
+    "psrlw      %[temp],         %[temp],           %[seven]      \n\t"
+    "psrlw      %[r_vec],        %[mask1],          %[eight]      \n\t"
+    "and        %[r_vec],        %[temp],           %[r_vec]      \n\t"
+    "psubb      %[y],            %[eight],          %[three]      \n\t"  // y = 8 - 3 = 5: shift count for the psllw below
+    "psllw      %[r_vec],        %[r_vec],          %[y]          \n\t"
+    "or         %[g_vec],        %[g_vec],          %[r_vec]      \n\t"
+    "paddb      %[r_vec],        %[three],          %[six]        \n\t"
+    "psrlw      %[temp],         %[temp],           %[r_vec]      \n\t"
+    "and        %[r_vec],        %[temp],           %[mask2]      \n\t"
+    "paddb      %[temp],         %[three],          %[eight]      \n\t"
+    "psllw      %[r_vec],        %[r_vec],          %[temp]       \n\t"
+    "or         %[g_vec],        %[g_vec],          %[r_vec]      \n\t"
 
-      "psrlh      %[temp],         %[b_vec],          %[three]      \n\t"
-      "and        %[b_vec],        %[temp],           %[mask2]      \n\t"
-      "psrlw      %[temp],         %[temp],           %[seven]      \n\t"
-      "psrlw      %[r_vec],        %[mask1],          %[eight]      \n\t"
-      "and        %[r_vec],        %[temp],           %[r_vec]      \n\t"
-      "psubb      %[y],            %[eight],          %[three]      \n\t"  // 5
-      "psllw      %[r_vec],        %[r_vec],          %[y]          \n\t"
-      "or         %[b_vec],        %[b_vec],          %[r_vec]      \n\t"
-      "paddb      %[r_vec],        %[three],          %[six]        \n\t"
-      "psrlw      %[temp],         %[temp],           %[r_vec]      \n\t"
-      "and        %[r_vec],        %[temp],           %[mask2]      \n\t"
-      "paddb      %[temp],         %[three],          %[eight]      \n\t"
-      "psllw      %[r_vec],        %[r_vec],          %[temp]       \n\t"
-      "or         %[b_vec],        %[b_vec],          %[r_vec]      \n\t"
+    "psrlh      %[temp],         %[b_vec],          %[three]      \n\t"
+    "and        %[b_vec],        %[temp],           %[mask2]      \n\t"
+    "psrlw      %[temp],         %[temp],           %[seven]      \n\t"
+    "psrlw      %[r_vec],        %[mask1],          %[eight]      \n\t"
+    "and        %[r_vec],        %[temp],           %[r_vec]      \n\t"
+    "psubb      %[y],            %[eight],          %[three]      \n\t"  // y = 8 - 3 = 5: shift count for the psllw below
+    "psllw      %[r_vec],        %[r_vec],          %[y]          \n\t"
+    "or         %[b_vec],        %[b_vec],          %[r_vec]      \n\t"
+    "paddb      %[r_vec],        %[three],          %[six]        \n\t"
+    "psrlw      %[temp],         %[temp],           %[r_vec]      \n\t"
+    "and        %[r_vec],        %[temp],           %[mask2]      \n\t"
+    "paddb      %[temp],         %[three],          %[eight]      \n\t"
+    "psllw      %[r_vec],        %[r_vec],          %[temp]       \n\t"
+    "or         %[b_vec],        %[b_vec],          %[r_vec]      \n\t"
 
-      "punpcklhw  %[r_vec],        %[g_vec],          %[b_vec]      \n\t"
-      "punpckhhw  %[b_vec],        %[g_vec],          %[b_vec]      \n\t"
-      "punpcklhw  %[g_vec],        %[r_vec],          %[b_vec]      \n\t"
+    "punpcklhw  %[r_vec],        %[g_vec],          %[b_vec]      \n\t"
+    "punpckhhw  %[b_vec],        %[g_vec],          %[b_vec]      \n\t"
+    "punpcklhw  %[g_vec],        %[r_vec],          %[b_vec]      \n\t"
 
-      "gssdlc1    %[g_vec],        0x07(%[dst_rgb565])             \n\t"
-      "gssdrc1    %[g_vec],        0x00(%[dst_rgb565])             \n\t"
+    "gssdlc1    %[g_vec],        0x07(%[dst_rgb565])             \n\t"
+    "gssdrc1    %[g_vec],        0x00(%[dst_rgb565])             \n\t"
 
-      "daddiu     %[y_ptr],        %[y_ptr],          0x04          \n\t"
-      "daddiu     %[uv_ptr],       %[uv_ptr],         0x04          \n\t"
-      "daddiu     %[dst_rgb565],   %[dst_rgb565],     0x08          \n\t"
-      "daddi      %[width],        %[width],          -0x04         \n\t"
-      "bnez       %[width],        1b                               \n\t"
+    "daddiu     %[y_ptr],        %[y_ptr],          0x04          \n\t"
+    "daddiu     %[uv_ptr],       %[uv_ptr],         0x04          \n\t"
+    "daddiu     %[dst_rgb565],   %[dst_rgb565],     0x08          \n\t"
+    "daddi      %[width],        %[width],          -0x04         \n\t"
+    "bnez       %[width],        1b                               \n\t"
 
-      : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec),
-        [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp),
-        [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr),
-        [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg)
-      : [y_ptr] "r"(src_y), [uv_ptr] "r"(src_uv), [dst_rgb565] "r"(dst_rgb565),
-        [yuvcons_ptr] "r"(yuvconstants), [width] "r"(width), [zero] "f"(0x00),
-        [five] "f"(0x55), [six] "f"(0x6), [mask1] "f"(0xff00ff00ff00ff00),
-        [ushu] "f"(0xA0), [vshu] "f"(0xf5), [three] "f"(0x3),
-        [mask2] "f"(0x1f0000001f), [eight] "f"(0x8), [seven] "f"(0x7)
-      : "memory");
+    : [y]"=&f"(y),                         [u]"=&f"(u),
+      [v]"=&f"(v),
+      [b_vec]"=&f"(b_vec),                 [g_vec]"=&f"(g_vec),
+      [r_vec]"=&f"(r_vec),                 [temp]"=&f"(temp),
+      [ub]"=&f"(ub),                       [ug]"=&f"(ug),
+      [vg]"=&f"(vg),                       [vr]"=&f"(vr),
+      [bb]"=&f"(bb),                       [bg]"=&f"(bg),
+      [br]"=&f"(br),                       [yg]"=&f"(yg)
+    : [y_ptr]"r"(src_y),                   [uv_ptr]"r"(src_uv),
+      [dst_rgb565]"r"(dst_rgb565),
+      [yuvcons_ptr]"r"(yuvconstants),      [width]"r"(width),
+      [zero]"f"(0x00),                     [five]"f"(0x55),
+      [six]"f"(0x6),                       [mask1]"f"(0xff00ff00ff00ff00),
+      [ushu]"f"(0xA0),                     [vshu]"f"(0xf5),
+      [three]"f"(0x3),                     [mask2]"f"(0x1f0000001f),
+      [eight]"f"(0x8),                     [seven]"f"(0x7)
+    : "memory"
+  );
 }
 
 void YUY2ToARGBRow_MMI(const uint8_t* src_yuy2,
@@ -7375,83 +7462,90 @@ void YUY2ToARGBRow_MMI(const uint8_t* src_yuy2,
                        int width) {
   uint64_t y, u, v;
   uint64_t b_vec, g_vec, r_vec, temp;
-  uint64_t ub, ug, vg, vr, bb, bg, br, yg;
+  uint64_t ub,ug,vg,vr,bb,bg,br,yg;
 
   __asm__ volatile(
-      "ldc1       %[yg],           0xc0(%[yuvcons_ptr])             \n\t"
-      "ldc1       %[bb],           0x60(%[yuvcons_ptr])             \n\t"
-      "ldc1       %[ub],           0x00(%[yuvcons_ptr])             \n\t"
-      "or         %[ub],           %[ub],             %[mask1]      \n\t"
-      "ldc1       %[bg],           0x80(%[yuvcons_ptr])             \n\t"
-      "ldc1       %[ug],           0x20(%[yuvcons_ptr])             \n\t"
-      "punpcklbh  %[ug],           %[ug],             %[zero]       \n\t"
-      "pshufh     %[ug],           %[ug],             %[zero]       \n\t"
-      "ldc1       %[vg],           0x20(%[yuvcons_ptr])             \n\t"
-      "punpcklbh  %[vg],           %[vg],             %[zero]       \n\t"
-      "pshufh     %[vg],           %[vg],             %[five]       \n\t"
-      "ldc1       %[br],           0xa0(%[yuvcons_ptr])             \n\t"
-      "ldc1       %[vr],           0x40(%[yuvcons_ptr])             \n\t"
-      "punpcklbh  %[vr],           %[vr],             %[zero]       \n\t"
-      "pshufh     %[vr],           %[vr],             %[five]       \n\t"
-      "or         %[vr],           %[vr],             %[mask1]      \n\t"
+    "ldc1       %[yg],           0xc0(%[yuvcons_ptr])             \n\t"
+    "ldc1       %[bb],           0x60(%[yuvcons_ptr])             \n\t"
+    "ldc1       %[ub],           0x00(%[yuvcons_ptr])             \n\t"
+    "or         %[ub],           %[ub],             %[mask1]      \n\t"
+    "ldc1       %[bg],           0x80(%[yuvcons_ptr])             \n\t"
+    "ldc1       %[ug],           0x20(%[yuvcons_ptr])             \n\t"
+    "punpcklbh  %[ug],           %[ug],             %[zero]       \n\t"
+    "pshufh     %[ug],           %[ug],             %[zero]       \n\t"
+    "ldc1       %[vg],           0x20(%[yuvcons_ptr])             \n\t"
+    "punpcklbh  %[vg],           %[vg],             %[zero]       \n\t"
+    "pshufh     %[vg],           %[vg],             %[five]       \n\t"
+    "ldc1       %[br],           0xa0(%[yuvcons_ptr])             \n\t"
+    "ldc1       %[vr],           0x40(%[yuvcons_ptr])             \n\t"
+    "punpcklbh  %[vr],           %[vr],             %[zero]       \n\t"
+    "pshufh     %[vr],           %[vr],             %[five]       \n\t"
+    "or         %[vr],           %[vr],             %[mask1]      \n\t"
 
-      "1:                                                           \n\t"
-      "gsldlc1    %[y],            0x07(%[yuy2_ptr])                \n\t"
-      "gsldrc1    %[y],            0x00(%[yuy2_ptr])                \n\t"
-      "psrlh      %[temp],         %[y],              %[eight]      \n\t"
-      "pshufh     %[u],            %[temp],           %[ushu]       \n\t"
-      "pshufh     %[v],            %[temp],           %[vshu]       \n\t"
+    "1:                                                           \n\t"
+    "gsldlc1    %[y],            0x07(%[yuy2_ptr])                \n\t"
+    "gsldrc1    %[y],            0x00(%[yuy2_ptr])                \n\t"
+    "psrlh      %[temp],         %[y],              %[eight]      \n\t"
+    "pshufh     %[u],            %[temp],           %[ushu]       \n\t"
+    "pshufh     %[v],            %[temp],           %[vshu]       \n\t"
 
-      "psrlh      %[temp],         %[mask1],          %[eight]      \n\t"
-      "and        %[y],            %[y],              %[temp]       \n\t"
-      "psllh      %[temp],         %[y],              %[eight]      \n\t"
-      "or         %[y],            %[y],              %[temp]       \n\t"
-      "pmulhuh    %[y],            %[y],              %[yg]         \n\t"
+    "psrlh      %[temp],         %[mask1],          %[eight]      \n\t"
+    "and        %[y],            %[y],              %[temp]       \n\t"
+    "psllh      %[temp],         %[y],              %[eight]      \n\t"
+    "or         %[y],            %[y],              %[temp]       \n\t"
+    "pmulhuh    %[y],            %[y],              %[yg]         \n\t"
 
-      "paddsh     %[b_vec],        %[y],              %[bb]         \n\t"
-      "pmullh     %[temp],         %[u],              %[ub]         \n\t"
-      "psubsh     %[b_vec],        %[b_vec],          %[temp]       \n\t"
-      "psrah      %[b_vec],        %[b_vec],          %[six]        \n\t"
+    "paddsh     %[b_vec],        %[y],              %[bb]         \n\t"
+    "pmullh     %[temp],         %[u],              %[ub]         \n\t"
+    "psubsh     %[b_vec],        %[b_vec],          %[temp]       \n\t"
+    "psrah      %[b_vec],        %[b_vec],          %[six]        \n\t"
 
-      "paddsh     %[g_vec],        %[y],              %[bg]         \n\t"
-      "pmullh     %[temp],         %[u],              %[ug]         \n\t"
-      "psubsh     %[g_vec],        %[g_vec],          %[temp]       \n\t"
-      "pmullh     %[temp],         %[v],              %[vg]         \n\t"
-      "psubsh     %[g_vec],        %[g_vec],          %[temp]       \n\t"
-      "psrah      %[g_vec],        %[g_vec],          %[six]        \n\t"
+    "paddsh     %[g_vec],        %[y],              %[bg]         \n\t"
+    "pmullh     %[temp],         %[u],              %[ug]         \n\t"
+    "psubsh     %[g_vec],        %[g_vec],          %[temp]       \n\t"
+    "pmullh     %[temp],         %[v],              %[vg]         \n\t"
+    "psubsh     %[g_vec],        %[g_vec],          %[temp]       \n\t"
+    "psrah      %[g_vec],        %[g_vec],          %[six]        \n\t"
 
-      "paddsh     %[r_vec],        %[y],              %[br]         \n\t"
-      "pmullh     %[temp],         %[v],              %[vr]         \n\t"
-      "psubsh     %[r_vec],        %[r_vec],          %[temp]       \n\t"
-      "psrah      %[r_vec],        %[r_vec],          %[six]        \n\t"
+    "paddsh     %[r_vec],        %[y],              %[br]         \n\t"
+    "pmullh     %[temp],         %[v],              %[vr]         \n\t"
+    "psubsh     %[r_vec],        %[r_vec],          %[temp]       \n\t"
+    "psrah      %[r_vec],        %[r_vec],          %[six]        \n\t"
 
-      "packushb   %[r_vec],        %[b_vec],          %[r_vec]      \n\t"
-      "packushb   %[g_vec],        %[g_vec],          %[zero]       \n\t"
-      "punpcklwd  %[g_vec],        %[g_vec],          %[alpha]      \n\t"
-      "punpcklbh  %[b_vec],        %[r_vec],          %[g_vec]      \n\t"
-      "punpckhbh  %[r_vec],        %[r_vec],          %[g_vec]      \n\t"
-      "punpcklhw  %[g_vec],        %[b_vec],          %[r_vec]      \n\t"
-      "punpckhhw  %[b_vec],        %[b_vec],          %[r_vec]      \n\t"
+    "packushb   %[r_vec],        %[b_vec],          %[r_vec]      \n\t"
+    "packushb   %[g_vec],        %[g_vec],          %[zero]       \n\t"
+    "punpcklwd  %[g_vec],        %[g_vec],          %[alpha]      \n\t"
+    "punpcklbh  %[b_vec],        %[r_vec],          %[g_vec]      \n\t"
+    "punpckhbh  %[r_vec],        %[r_vec],          %[g_vec]      \n\t"
+    "punpcklhw  %[g_vec],        %[b_vec],          %[r_vec]      \n\t"
+    "punpckhhw  %[b_vec],        %[b_vec],          %[r_vec]      \n\t"
 
-      "gssdlc1    %[g_vec],        0x07(%[rgbbuf_ptr])              \n\t"
-      "gssdrc1    %[g_vec],        0x00(%[rgbbuf_ptr])              \n\t"
-      "gssdlc1    %[b_vec],        0x0f(%[rgbbuf_ptr])              \n\t"
-      "gssdrc1    %[b_vec],        0x08(%[rgbbuf_ptr])              \n\t"
+    "gssdlc1    %[g_vec],        0x07(%[rgbbuf_ptr])              \n\t"
+    "gssdrc1    %[g_vec],        0x00(%[rgbbuf_ptr])              \n\t"
+    "gssdlc1    %[b_vec],        0x0f(%[rgbbuf_ptr])              \n\t"
+    "gssdrc1    %[b_vec],        0x08(%[rgbbuf_ptr])              \n\t"
 
-      "daddiu     %[yuy2_ptr],     %[yuy2_ptr],       0x08          \n\t"
-      "daddiu     %[rgbbuf_ptr],   %[rgbbuf_ptr],     0x10          \n\t"
-      "daddi      %[width],        %[width],          -0x04         \n\t"
-      "bnez       %[width],        1b                               \n\t"
+    "daddiu     %[yuy2_ptr],     %[yuy2_ptr],       0x08          \n\t"
+    "daddiu     %[rgbbuf_ptr],   %[rgbbuf_ptr],     0x10          \n\t"
+    "daddi      %[width],        %[width],          -0x04         \n\t"
+    "bnez       %[width],        1b                               \n\t"
 
-      : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec),
-        [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp),
-        [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr),
-        [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg)
-      : [yuy2_ptr] "r"(src_yuy2), [rgbbuf_ptr] "r"(rgb_buf),
-        [yuvcons_ptr] "r"(yuvconstants), [width] "r"(width), [zero] "f"(0x00),
-        [five] "f"(0x55), [six] "f"(0x6), [mask1] "f"(0xff00ff00ff00ff00),
-        [ushu] "f"(0xA0), [vshu] "f"(0xf5), [alpha] "f"(-1), [eight] "f"(0x8)
-      : "memory");
+    : [y]"=&f"(y),                         [u]"=&f"(u),
+      [v]"=&f"(v),
+      [b_vec]"=&f"(b_vec),                 [g_vec]"=&f"(g_vec),
+      [r_vec]"=&f"(r_vec),                 [temp]"=&f"(temp),
+      [ub]"=&f"(ub),                       [ug]"=&f"(ug),
+      [vg]"=&f"(vg),                       [vr]"=&f"(vr),
+      [bb]"=&f"(bb),                       [bg]"=&f"(bg),
+      [br]"=&f"(br),                       [yg]"=&f"(yg)
+    : [yuy2_ptr]"r"(src_yuy2),             [rgbbuf_ptr]"r"(rgb_buf),
+      [yuvcons_ptr]"r"(yuvconstants),      [width]"r"(width),
+      [zero]"f"(0x00),                     [five]"f"(0x55),
+      [six]"f"(0x6),                       [mask1]"f"(0xff00ff00ff00ff00),
+      [ushu]"f"(0xA0),                     [vshu]"f"(0xf5),
+      [alpha]"f"(-1),                      [eight]"f"(0x8)
+    : "memory"
+  );
 }
 
 void UYVYToARGBRow_MMI(const uint8_t* src_uyvy,
@@ -7460,83 +7554,90 @@ void UYVYToARGBRow_MMI(const uint8_t* src_uyvy,
                        int width) {
   uint64_t y, u, v;
   uint64_t b_vec, g_vec, r_vec, temp;
-  uint64_t ub, ug, vg, vr, bb, bg, br, yg;
+  uint64_t ub,ug,vg,vr,bb,bg,br,yg;
 
   __asm__ volatile(
-      "ldc1       %[yg],           0xc0(%[yuvcons_ptr])             \n\t"
-      "ldc1       %[bb],           0x60(%[yuvcons_ptr])             \n\t"
-      "ldc1       %[ub],           0x00(%[yuvcons_ptr])             \n\t"
-      "or         %[ub],           %[ub],             %[mask1]      \n\t"
-      "ldc1       %[bg],           0x80(%[yuvcons_ptr])             \n\t"
-      "ldc1       %[ug],           0x20(%[yuvcons_ptr])             \n\t"
-      "punpcklbh  %[ug],           %[ug],             %[zero]       \n\t"
-      "pshufh     %[ug],           %[ug],             %[zero]       \n\t"
-      "ldc1       %[vg],           0x20(%[yuvcons_ptr])             \n\t"
-      "punpcklbh  %[vg],           %[vg],             %[zero]       \n\t"
-      "pshufh     %[vg],           %[vg],             %[five]       \n\t"
-      "ldc1       %[br],           0xa0(%[yuvcons_ptr])             \n\t"
-      "ldc1       %[vr],           0x40(%[yuvcons_ptr])             \n\t"
-      "punpcklbh  %[vr],           %[vr],             %[zero]       \n\t"
-      "pshufh     %[vr],           %[vr],             %[five]       \n\t"
-      "or         %[vr],           %[vr],             %[mask1]      \n\t"
+    "ldc1       %[yg],           0xc0(%[yuvcons_ptr])             \n\t"
+    "ldc1       %[bb],           0x60(%[yuvcons_ptr])             \n\t"
+    "ldc1       %[ub],           0x00(%[yuvcons_ptr])             \n\t"
+    "or         %[ub],           %[ub],             %[mask1]      \n\t"
+    "ldc1       %[bg],           0x80(%[yuvcons_ptr])             \n\t"
+    "ldc1       %[ug],           0x20(%[yuvcons_ptr])             \n\t"
+    "punpcklbh  %[ug],           %[ug],             %[zero]       \n\t"
+    "pshufh     %[ug],           %[ug],             %[zero]       \n\t"
+    "ldc1       %[vg],           0x20(%[yuvcons_ptr])             \n\t"
+    "punpcklbh  %[vg],           %[vg],             %[zero]       \n\t"
+    "pshufh     %[vg],           %[vg],             %[five]       \n\t"
+    "ldc1       %[br],           0xa0(%[yuvcons_ptr])             \n\t"
+    "ldc1       %[vr],           0x40(%[yuvcons_ptr])             \n\t"
+    "punpcklbh  %[vr],           %[vr],             %[zero]       \n\t"
+    "pshufh     %[vr],           %[vr],             %[five]       \n\t"
+    "or         %[vr],           %[vr],             %[mask1]      \n\t"
 
-      "1:                                                           \n\t"
-      "gsldlc1    %[y],            0x07(%[uyvy_ptr])                \n\t"
-      "gsldrc1    %[y],            0x00(%[uyvy_ptr])                \n\t"
-      "psrlh      %[temp],         %[mask1],          %[eight]      \n\t"
-      "and        %[temp],         %[y],              %[temp]       \n\t"
-      "pshufh     %[u],            %[temp],           %[ushu]       \n\t"
-      "pshufh     %[v],            %[temp],           %[vshu]       \n\t"
+    "1:                                                           \n\t"
+    "gsldlc1    %[y],            0x07(%[uyvy_ptr])                \n\t"
+    "gsldrc1    %[y],            0x00(%[uyvy_ptr])                \n\t"
+    "psrlh      %[temp],         %[mask1],          %[eight]      \n\t"
+    "and        %[temp],         %[y],              %[temp]       \n\t"
+    "pshufh     %[u],            %[temp],           %[ushu]       \n\t"
+    "pshufh     %[v],            %[temp],           %[vshu]       \n\t"
 
-      "psrlh      %[y],            %[y],              %[eight]      \n\t"
-      "psllh      %[temp],         %[y],              %[eight]      \n\t"
-      "or         %[y],            %[y],              %[temp]       \n\t"
-      "pmulhuh    %[y],            %[y],              %[yg]         \n\t"
+    "psrlh      %[y],            %[y],              %[eight]      \n\t"
+    "psllh      %[temp],         %[y],              %[eight]      \n\t"
+    "or         %[y],            %[y],              %[temp]       \n\t"
+    "pmulhuh    %[y],            %[y],              %[yg]         \n\t"
 
-      "paddsh     %[b_vec],        %[y],              %[bb]         \n\t"
-      "pmullh     %[temp],         %[u],              %[ub]         \n\t"
-      "psubsh     %[b_vec],        %[b_vec],          %[temp]       \n\t"
-      "psrah      %[b_vec],        %[b_vec],          %[six]        \n\t"
+    "paddsh     %[b_vec],        %[y],              %[bb]         \n\t"
+    "pmullh     %[temp],         %[u],              %[ub]         \n\t"
+    "psubsh     %[b_vec],        %[b_vec],          %[temp]       \n\t"
+    "psrah      %[b_vec],        %[b_vec],          %[six]        \n\t"
 
-      "paddsh     %[g_vec],        %[y],              %[bg]         \n\t"
-      "pmullh     %[temp],         %[u],              %[ug]         \n\t"
-      "psubsh     %[g_vec],        %[g_vec],          %[temp]       \n\t"
-      "pmullh     %[temp],         %[v],              %[vg]         \n\t"
-      "psubsh     %[g_vec],        %[g_vec],          %[temp]       \n\t"
-      "psrah      %[g_vec],        %[g_vec],          %[six]        \n\t"
+    "paddsh     %[g_vec],        %[y],              %[bg]         \n\t"
+    "pmullh     %[temp],         %[u],              %[ug]         \n\t"
+    "psubsh     %[g_vec],        %[g_vec],          %[temp]       \n\t"
+    "pmullh     %[temp],         %[v],              %[vg]         \n\t"
+    "psubsh     %[g_vec],        %[g_vec],          %[temp]       \n\t"
+    "psrah      %[g_vec],        %[g_vec],          %[six]        \n\t"
 
-      "paddsh     %[r_vec],        %[y],              %[br]         \n\t"
-      "pmullh     %[temp],         %[v],              %[vr]         \n\t"
-      "psubsh     %[r_vec],        %[r_vec],          %[temp]       \n\t"
-      "psrah      %[r_vec],        %[r_vec],          %[six]        \n\t"
+    "paddsh     %[r_vec],        %[y],              %[br]         \n\t"
+    "pmullh     %[temp],         %[v],              %[vr]         \n\t"
+    "psubsh     %[r_vec],        %[r_vec],          %[temp]       \n\t"
+    "psrah      %[r_vec],        %[r_vec],          %[six]        \n\t"
 
-      "packushb   %[r_vec],        %[b_vec],          %[r_vec]      \n\t"
-      "packushb   %[g_vec],        %[g_vec],          %[zero]       \n\t"
-      "punpcklwd  %[g_vec],        %[g_vec],          %[alpha]      \n\t"
-      "punpcklbh  %[b_vec],        %[r_vec],          %[g_vec]      \n\t"
-      "punpckhbh  %[r_vec],        %[r_vec],          %[g_vec]      \n\t"
-      "punpcklhw  %[g_vec],        %[b_vec],          %[r_vec]      \n\t"
-      "punpckhhw  %[b_vec],        %[b_vec],          %[r_vec]      \n\t"
+    "packushb   %[r_vec],        %[b_vec],          %[r_vec]      \n\t"
+    "packushb   %[g_vec],        %[g_vec],          %[zero]       \n\t"
+    "punpcklwd  %[g_vec],        %[g_vec],          %[alpha]      \n\t"
+    "punpcklbh  %[b_vec],        %[r_vec],          %[g_vec]      \n\t"
+    "punpckhbh  %[r_vec],        %[r_vec],          %[g_vec]      \n\t"
+    "punpcklhw  %[g_vec],        %[b_vec],          %[r_vec]      \n\t"
+    "punpckhhw  %[b_vec],        %[b_vec],          %[r_vec]      \n\t"
 
-      "gssdlc1    %[g_vec],        0x07(%[rgbbuf_ptr])              \n\t"
-      "gssdrc1    %[g_vec],        0x00(%[rgbbuf_ptr])              \n\t"
-      "gssdlc1    %[b_vec],        0x0f(%[rgbbuf_ptr])              \n\t"
-      "gssdrc1    %[b_vec],        0x08(%[rgbbuf_ptr])              \n\t"
+    "gssdlc1    %[g_vec],        0x07(%[rgbbuf_ptr])              \n\t"
+    "gssdrc1    %[g_vec],        0x00(%[rgbbuf_ptr])              \n\t"
+    "gssdlc1    %[b_vec],        0x0f(%[rgbbuf_ptr])              \n\t"
+    "gssdrc1    %[b_vec],        0x08(%[rgbbuf_ptr])              \n\t"
 
-      "daddiu     %[uyvy_ptr],     %[uyvy_ptr],       0x08          \n\t"
-      "daddiu     %[rgbbuf_ptr],   %[rgbbuf_ptr],     0x10          \n\t"
-      "daddi      %[width],        %[width],          -0x04         \n\t"
-      "bnez       %[width],        1b                               \n\t"
+    "daddiu     %[uyvy_ptr],     %[uyvy_ptr],       0x08          \n\t"
+    "daddiu     %[rgbbuf_ptr],   %[rgbbuf_ptr],     0x10          \n\t"
+    "daddi      %[width],        %[width],          -0x04         \n\t"
+    "bnez       %[width],        1b                               \n\t"
 
-      : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec),
-        [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp),
-        [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr),
-        [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg)
-      : [uyvy_ptr] "r"(src_uyvy), [rgbbuf_ptr] "r"(rgb_buf),
-        [yuvcons_ptr] "r"(yuvconstants), [width] "r"(width), [zero] "f"(0x00),
-        [five] "f"(0x55), [six] "f"(0x6), [mask1] "f"(0xff00ff00ff00ff00),
-        [ushu] "f"(0xA0), [vshu] "f"(0xf5), [alpha] "f"(-1), [eight] "f"(0x8)
-      : "memory");
+    : [y]"=&f"(y),                         [u]"=&f"(u),
+      [v]"=&f"(v),
+      [b_vec]"=&f"(b_vec),                 [g_vec]"=&f"(g_vec),
+      [r_vec]"=&f"(r_vec),                 [temp]"=&f"(temp),
+      [ub]"=&f"(ub),                       [ug]"=&f"(ug),
+      [vg]"=&f"(vg),                       [vr]"=&f"(vr),
+      [bb]"=&f"(bb),                       [bg]"=&f"(bg),
+      [br]"=&f"(br),                       [yg]"=&f"(yg)
+    : [uyvy_ptr]"r"(src_uyvy),             [rgbbuf_ptr]"r"(rgb_buf),
+      [yuvcons_ptr]"r"(yuvconstants),      [width]"r"(width),
+      [zero]"f"(0x00),                     [five]"f"(0x55),
+      [six]"f"(0x6),                       [mask1]"f"(0xff00ff00ff00ff00),
+      [ushu]"f"(0xA0),                     [vshu]"f"(0xf5),
+      [alpha]"f"(-1),                      [eight]"f"(0x8)
+    : "memory"
+  );
 }
 
 void I422ToRGBARow_MMI(const uint8_t* src_y,
@@ -7547,104 +7648,112 @@ void I422ToRGBARow_MMI(const uint8_t* src_y,
                        int width) {
   uint64_t y, u, v;
   uint64_t b_vec, g_vec, r_vec, temp;
-  uint64_t ub, ug, vg, vr, bb, bg, br, yg;
+  uint64_t ub,ug,vg,vr,bb,bg,br,yg;
 
   __asm__ volatile(
-      "ldc1       %[yg],           0xc0(%[yuvcons_ptr])             \n\t"
-      "ldc1       %[bb],           0x60(%[yuvcons_ptr])             \n\t"
-      "ldc1       %[ub],           0x00(%[yuvcons_ptr])             \n\t"
-      "or         %[ub],           %[ub],             %[mask1]      \n\t"
-      "ldc1       %[bg],           0x80(%[yuvcons_ptr])             \n\t"
-      "ldc1       %[ug],           0x20(%[yuvcons_ptr])             \n\t"
-      "punpcklbh  %[ug],           %[ug],             %[zero]       \n\t"
-      "pshufh     %[ug],           %[ug],             %[zero]       \n\t"
-      "ldc1       %[vg],           0x20(%[yuvcons_ptr])             \n\t"
-      "punpcklbh  %[vg],           %[vg],             %[zero]       \n\t"
-      "pshufh     %[vg],           %[vg],             %[five]       \n\t"
-      "ldc1       %[br],           0xa0(%[yuvcons_ptr])             \n\t"
-      "ldc1       %[vr],           0x40(%[yuvcons_ptr])             \n\t"
-      "punpcklbh  %[vr],           %[vr],             %[zero]       \n\t"
-      "pshufh     %[vr],           %[vr],             %[five]       \n\t"
-      "or         %[vr],           %[vr],             %[mask1]      \n\t"
+    "ldc1       %[yg],           0xc0(%[yuvcons_ptr])             \n\t"
+    "ldc1       %[bb],           0x60(%[yuvcons_ptr])             \n\t"
+    "ldc1       %[ub],           0x00(%[yuvcons_ptr])             \n\t"
+    "or         %[ub],           %[ub],             %[mask1]      \n\t"
+    "ldc1       %[bg],           0x80(%[yuvcons_ptr])             \n\t"
+    "ldc1       %[ug],           0x20(%[yuvcons_ptr])             \n\t"
+    "punpcklbh  %[ug],           %[ug],             %[zero]       \n\t"
+    "pshufh     %[ug],           %[ug],             %[zero]       \n\t"
+    "ldc1       %[vg],           0x20(%[yuvcons_ptr])             \n\t"
+    "punpcklbh  %[vg],           %[vg],             %[zero]       \n\t"
+    "pshufh     %[vg],           %[vg],             %[five]       \n\t"
+    "ldc1       %[br],           0xa0(%[yuvcons_ptr])             \n\t"
+    "ldc1       %[vr],           0x40(%[yuvcons_ptr])             \n\t"
+    "punpcklbh  %[vr],           %[vr],             %[zero]       \n\t"
+    "pshufh     %[vr],           %[vr],             %[five]       \n\t"
+    "or         %[vr],           %[vr],             %[mask1]      \n\t"
 
-      "1:                                                           \n\t"
-      "gslwlc1    %[y],            0x03(%[y_ptr])                   \n\t"
-      "gslwrc1    %[y],            0x00(%[y_ptr])                   \n\t"
-      "gslwlc1    %[u],            0x03(%[u_ptr])                   \n\t"
-      "gslwrc1    %[u],            0x00(%[u_ptr])                   \n\t"
-      "gslwlc1    %[v],            0x03(%[v_ptr])                   \n\t"
-      "gslwrc1    %[v],            0x00(%[v_ptr])                   \n\t"
+    "1:                                                           \n\t"
+    "gslwlc1    %[y],            0x03(%[y_ptr])                   \n\t"
+    "gslwrc1    %[y],            0x00(%[y_ptr])                   \n\t"
+    "gslwlc1    %[u],            0x03(%[u_ptr])                   \n\t"
+    "gslwrc1    %[u],            0x00(%[u_ptr])                   \n\t"
+    "gslwlc1    %[v],            0x03(%[v_ptr])                   \n\t"
+    "gslwrc1    %[v],            0x00(%[v_ptr])                   \n\t"
 
-      "punpcklbh  %[y],            %[y],              %[y]          \n\t"
-      "pmulhuh    %[y],            %[y],              %[yg]         \n\t"
+    "punpcklbh  %[y],            %[y],              %[y]          \n\t"
+    "pmulhuh    %[y],            %[y],              %[yg]         \n\t"
 
-      "punpcklbh  %[u],            %[u],              %[u]          \n\t"
-      "punpcklbh  %[u],            %[u],              %[zero]       \n\t"
-      "paddsh     %[b_vec],        %[y],              %[bb]         \n\t"
-      "pmullh     %[temp],         %[u],              %[ub]         \n\t"
-      "psubsh     %[b_vec],        %[b_vec],          %[temp]       \n\t"
-      "psrah      %[b_vec],        %[b_vec],          %[six]        \n\t"
+    "punpcklbh  %[u],            %[u],              %[u]          \n\t"
+    "punpcklbh  %[u],            %[u],              %[zero]       \n\t"
+    "paddsh     %[b_vec],        %[y],              %[bb]         \n\t"
+    "pmullh     %[temp],         %[u],              %[ub]         \n\t"
+    "psubsh     %[b_vec],        %[b_vec],          %[temp]       \n\t"
+    "psrah      %[b_vec],        %[b_vec],          %[six]        \n\t"
 
-      "punpcklbh  %[v],            %[v],              %[v]          \n\t"
-      "punpcklbh  %[v],            %[v],              %[zero]       \n\t"
-      "paddsh     %[g_vec],        %[y],              %[bg]         \n\t"
-      "pmullh     %[temp],         %[u],              %[ug]         \n\t"
-      "psubsh     %[g_vec],        %[g_vec],          %[temp]       \n\t"
-      "pmullh     %[temp],         %[v],              %[vg]         \n\t"
-      "psubsh     %[g_vec],        %[g_vec],          %[temp]       \n\t"
-      "psrah      %[g_vec],        %[g_vec],          %[six]        \n\t"
+    "punpcklbh  %[v],            %[v],              %[v]          \n\t"
+    "punpcklbh  %[v],            %[v],              %[zero]       \n\t"
+    "paddsh     %[g_vec],        %[y],              %[bg]         \n\t"
+    "pmullh     %[temp],         %[u],              %[ug]         \n\t"
+    "psubsh     %[g_vec],        %[g_vec],          %[temp]       \n\t"
+    "pmullh     %[temp],         %[v],              %[vg]         \n\t"
+    "psubsh     %[g_vec],        %[g_vec],          %[temp]       \n\t"
+    "psrah      %[g_vec],        %[g_vec],          %[six]        \n\t"
 
-      "paddsh     %[r_vec],        %[y],              %[br]         \n\t"
-      "pmullh     %[temp],         %[v],              %[vr]         \n\t"
-      "psubsh     %[r_vec],        %[r_vec],          %[temp]       \n\t"
-      "psrah      %[r_vec],        %[r_vec],          %[six]        \n\t"
+    "paddsh     %[r_vec],        %[y],              %[br]         \n\t"
+    "pmullh     %[temp],         %[v],              %[vr]         \n\t"
+    "psubsh     %[r_vec],        %[r_vec],          %[temp]       \n\t"
+    "psrah      %[r_vec],        %[r_vec],          %[six]        \n\t"
 
-      "packushb   %[r_vec],        %[b_vec],          %[r_vec]      \n\t"
-      "packushb   %[g_vec],        %[g_vec],          %[zero]       \n\t"
-      "punpcklwd  %[g_vec],        %[alpha],          %[g_vec]      \n\t"
-      "punpcklbh  %[b_vec],        %[g_vec],          %[r_vec]      \n\t"
-      "punpckhbh  %[r_vec],        %[g_vec],          %[r_vec]      \n\t"
-      "punpcklhw  %[g_vec],        %[b_vec],          %[r_vec]      \n\t"
-      "punpckhhw  %[b_vec],        %[b_vec],          %[r_vec]      \n\t"
+    "packushb   %[r_vec],        %[b_vec],          %[r_vec]      \n\t"
+    "packushb   %[g_vec],        %[g_vec],          %[zero]       \n\t"
+    "punpcklwd  %[g_vec],        %[alpha],          %[g_vec]      \n\t"
+    "punpcklbh  %[b_vec],        %[g_vec],          %[r_vec]      \n\t"
+    "punpckhbh  %[r_vec],        %[g_vec],          %[r_vec]      \n\t"
+    "punpcklhw  %[g_vec],        %[b_vec],          %[r_vec]      \n\t"
+    "punpckhhw  %[b_vec],        %[b_vec],          %[r_vec]      \n\t"
 
-      "gssdlc1    %[g_vec],       0x07(%[rgbbuf_ptr])               \n\t"
-      "gssdrc1    %[g_vec],       0x00(%[rgbbuf_ptr])               \n\t"
-      "gssdlc1    %[b_vec],       0x0f(%[rgbbuf_ptr])               \n\t"
-      "gssdrc1    %[b_vec],       0x08(%[rgbbuf_ptr])               \n\t"
+    "gssdlc1    %[g_vec],       0x07(%[rgbbuf_ptr])               \n\t"
+    "gssdrc1    %[g_vec],       0x00(%[rgbbuf_ptr])               \n\t"
+    "gssdlc1    %[b_vec],       0x0f(%[rgbbuf_ptr])               \n\t"
+    "gssdrc1    %[b_vec],       0x08(%[rgbbuf_ptr])               \n\t"
 
-      "daddiu     %[y_ptr],        %[y_ptr],          0x04          \n\t"
-      "daddiu     %[u_ptr],        %[u_ptr],          0x02          \n\t"
-      "daddiu     %[v_ptr],        %[v_ptr],          0x02          \n\t"
-      "daddiu     %[rgbbuf_ptr],   %[rgbbuf_ptr],     0x10          \n\t"
-      "daddi      %[width],        %[width],          -0x04         \n\t"
-      "bnez       %[width],        1b                               \n\t"
+    "daddiu     %[y_ptr],        %[y_ptr],          0x04          \n\t"
+    "daddiu     %[u_ptr],        %[u_ptr],          0x02          \n\t"
+    "daddiu     %[v_ptr],        %[v_ptr],          0x02          \n\t"
+    "daddiu     %[rgbbuf_ptr],   %[rgbbuf_ptr],     0x10          \n\t"
+    "daddi      %[width],        %[width],          -0x04         \n\t"
+    "bnez       %[width],        1b                               \n\t"
 
-      : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec),
-        [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp),
-        [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr),
-        [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg)
-      : [y_ptr] "r"(src_y), [u_ptr] "r"(src_u), [v_ptr] "r"(src_v),
-        [rgbbuf_ptr] "r"(rgb_buf), [yuvcons_ptr] "r"(yuvconstants),
-        [width] "r"(width), [zero] "f"(0x00), [five] "f"(0x55), [six] "f"(0x6),
-        [mask1] "f"(0xff00ff00ff00ff00), [alpha] "f"(-1)
-      : "memory");
+    : [y]"=&f"(y),                         [u]"=&f"(u),
+      [v]"=&f"(v),
+      [b_vec]"=&f"(b_vec),                 [g_vec]"=&f"(g_vec),
+      [r_vec]"=&f"(r_vec),                 [temp]"=&f"(temp),
+      [ub]"=&f"(ub),                       [ug]"=&f"(ug),
+      [vg]"=&f"(vg),                       [vr]"=&f"(vr),
+      [bb]"=&f"(bb),                       [bg]"=&f"(bg),
+      [br]"=&f"(br),                       [yg]"=&f"(yg)
+    : [y_ptr]"r"(src_y),                   [u_ptr]"r"(src_u),
+      [v_ptr]"r"(src_v),                   [rgbbuf_ptr]"r"(rgb_buf),
+      [yuvcons_ptr]"r"(yuvconstants),      [width]"r"(width),
+      [zero]"f"(0x00),                     [five]"f"(0x55),
+      [six]"f"(0x6),                       [mask1]"f"(0xff00ff00ff00ff00),
+      [alpha]"f"(-1)
+    : "memory"
+  );
 }
 
 void ARGBSetRow_MMI(uint8_t* dst_argb, uint32_t v32, int width) {
-  __asm__ volatile(
-      "punpcklwd  %[v32],          %[v32],            %[v32]        \n\t"
-      "1:                                                           \n\t"
-      "gssdlc1    %[v32],          0x07(%[dst_ptr])                 \n\t"
-      "gssdrc1    %[v32],          0x00(%[dst_ptr])                 \n\t"
-      "gssdlc1    %[v32],          0x0f(%[dst_ptr])                 \n\t"
-      "gssdrc1    %[v32],          0x08(%[dst_ptr])                 \n\t"
+  __asm__ volatile (
+    "punpcklwd  %[v32],          %[v32],            %[v32]        \n\t"
+    "1:                                                           \n\t"
+    "gssdlc1    %[v32],          0x07(%[dst_ptr])                 \n\t"
+    "gssdrc1    %[v32],          0x00(%[dst_ptr])                 \n\t"
+    "gssdlc1    %[v32],          0x0f(%[dst_ptr])                 \n\t"
+    "gssdrc1    %[v32],          0x08(%[dst_ptr])                 \n\t"
 
-      "daddi      %[width],        %[width],         -0x04          \n\t"
-      "daddiu     %[dst_ptr],      %[dst_ptr],        0x10          \n\t"
-      "bnez       %[width],        1b                               \n\t"
-      : [v32] "+&f"(v32)
-      : [dst_ptr] "r"(dst_argb), [width] "r"(width)
-      : "memory");
+    "daddi      %[width],        %[width],         -0x04          \n\t"
+    "daddiu     %[dst_ptr],      %[dst_ptr],        0x10          \n\t"
+    "bnez       %[width],        1b                               \n\t"
+    : [v32]"+&f"(v32)
+    : [dst_ptr]"r"(dst_argb),           [width]"r"(width)
+    : "memory"
+  );
 }
 
 // 10 bit YUV to ARGB
diff --git a/source/row_neon.cc b/source/row_neon.cc
index 1cf8eefea..99e7db97f 100644
--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@@ -682,22 +682,23 @@ void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) {
 void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
   asm volatile(
       // Start at end of source row.
-      "mov        r3, #-16                       \n"
       "add        %0, %0, %2                     \n"
-      "sub        %0, #16                        \n"
+      "sub        %0, %0, #32                    \n"  // 32 bytes per loop
 
       "1:                                        \n"
-      "vld1.8     {q0}, [%0], r3                 \n"  // src -= 16
-      "subs       %2, #16                        \n"  // 16 pixels per loop.
-      "vrev64.8   q0, q0                         \n"
-      "vst1.8     {d1}, [%1]!                    \n"  // dst += 16
-      "vst1.8     {d0}, [%1]!                    \n"
+      "vld1.8     {q1, q2}, [%0], %3             \n"  // src -= 32
+      "subs       %2, #32                        \n"  // 32 pixels per loop.
+      "vrev64.8   q0, q2                         \n"
+      "vrev64.8   q1, q1                         \n"
+      "vswp       d0, d1                         \n"
+      "vswp       d2, d3                         \n"
+      "vst1.8     {q0, q1}, [%1]!                \n"  // dst += 32
       "bgt        1b                             \n"
       : "+r"(src),   // %0
         "+r"(dst),   // %1
         "+r"(width)  // %2
-      :
-      : "cc", "memory", "r3", "q0");
+      : "r"(-32)     // %3
+      : "cc", "memory", "q0", "q1", "q2");
 }
 
 void MirrorUVRow_NEON(const uint8_t* src_uv,
diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index 866e7bfc6..5646da8a2 100644
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -723,23 +723,29 @@ void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) {
       : "cc", "memory", "v0");
 }
 
+// Shuffle table for reversing the bytes.
+static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
+                                     7u,  6u,  5u,  4u,  3u,  2u,  1u, 0u};
+
 void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
   asm volatile(
       // Start at end of source row.
+      "ld1        {v3.16b}, [%4]                 \n"  // shuffler
       "add        %0, %0, %w2, sxtw              \n"
-      "sub        %0, %0, #16                    \n"
+      "sub        %0, %0, #32                    \n"
       "1:                                        \n"
-      "ld1        {v0.16b}, [%0], %3             \n"  // src -= 16
-      "subs       %w2, %w2, #16                  \n"  // 16 pixels per loop.
-      "rev64      v0.16b, v0.16b                 \n"
-      "st1        {v0.D}[1], [%1], #8            \n"  // dst += 16
-      "st1        {v0.D}[0], [%1], #8            \n"
+      "ld1        {v1.16b,v2.16b}, [%0], %3      \n"  // src -= 32
+      "subs       %w2, %w2, #32                  \n"  // 32 pixels per loop.
+      "tbl        v1.16b, {v1.16b}, v3.16b       \n"
+      "tbl        v0.16b, {v2.16b}, v3.16b       \n"
+      "st1        {v0.16b, v1.16b}, [%1], #32    \n"  // store 32 pixels
       "b.gt       1b                             \n"
       : "+r"(src),           // %0
         "+r"(dst),           // %1
         "+r"(width)          // %2
-      : "r"((ptrdiff_t)-16)  // %3
-      : "cc", "memory", "v0");
+      : "r"((ptrdiff_t)-32), // %3
+        "r"(&kShuffleMirror) // %4
+      : "cc", "memory", "v0", "v1", "v2", "v3");
 }
 
 void MirrorUVRow_NEON(const uint8_t* src_uv,
diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc
index fb9632a94..698cd9562 100644
--- a/unit_test/convert_test.cc
+++ b/unit_test/convert_test.cc
@@ -40,9 +40,9 @@
 #endif
 
 // Some functions fail on big endian. Enable these tests on all cpus except
-// PowerPC
-#if !defined(__powerpc__)
-#define LITTLE_ENDIAN_TEST 1
+// PowerPC, but they are not optimized, so they are disabled by default.
+#if !defined(__powerpc__) && defined(ENABLE_SLOW_TESTS)
+#define INTEL_TEST 1
 #endif
 
 namespace libyuv {
@@ -691,7 +691,7 @@ TESTPLANARTOB(J420, 2, 2, RAW, 3, 3, 1)
 TESTPLANARTOB(J420, 2, 2, RGB24, 3, 3, 1)
 TESTPLANARTOB(H420, 2, 2, RAW, 3, 3, 1)
 TESTPLANARTOB(H420, 2, 2, RGB24, 3, 3, 1)
-#ifdef LITTLE_ENDIAN_TEST
+#ifdef INTEL_TEST
 TESTPLANARTOB(I420, 2, 2, RGB565, 2, 2, 1)
 TESTPLANARTOB(J420, 2, 2, RGB565, 2, 2, 1)
 TESTPLANARTOB(H420, 2, 2, RGB565, 2, 2, 1)
@@ -723,7 +723,7 @@ TESTPLANARTOB(I422, 2, 1, YUY2, 2, 4, 1)
 TESTPLANARTOB(I422, 2, 1, UYVY, 2, 4, 1)
 TESTPLANARTOB(I420, 2, 2, I400, 1, 1, 1)
 TESTPLANARTOB(J420, 2, 2, J400, 1, 1, 1)
-#ifdef LITTLE_ENDIAN_TEST
+#ifdef INTEL_TEST
 TESTPLANARTOB(I420, 2, 2, AR30, 4, 4, 1)
 TESTPLANARTOB(H420, 2, 2, AR30, 4, 4, 1)
 #endif
@@ -876,7 +876,7 @@ TESTBIPLANARTOB(NV12, 2, 2, RGB24, RGB24, 3, 2)
 TESTBIPLANARTOB(NV21, 2, 2, RGB24, RGB24, 3, 2)
 TESTBIPLANARTOB(NV12, 2, 2, RAW, RAW, 3, 2)
 TESTBIPLANARTOB(NV21, 2, 2, RAW, RAW, 3, 2)
-#ifdef LITTLE_ENDIAN_TEST
+#ifdef INTEL_TEST
 TESTBIPLANARTOB(NV12, 2, 2, RGB565, RGB565, 2, 9)
 #endif
 TESTBIPLANARTOB(NV21, 2, 2, YUV24, RAW, 3, 2)
@@ -1012,7 +1012,7 @@ TESTATOPLANAR(ARGB, 4, 1, I422, 2, 1, 2)
 TESTATOPLANAR(ARGB, 4, 1, I444, 1, 1, 2)
 TESTATOPLANAR(ARGB, 4, 1, J420, 2, 2, ARM_YUV_ERROR)
 TESTATOPLANAR(ARGB, 4, 1, J422, 2, 1, ARM_YUV_ERROR)
-#ifdef LITTLE_ENDIAN_TEST
+#ifdef INTEL_TEST
 TESTATOPLANAR(ARGB1555, 2, 1, I420, 2, 2, 15)
 TESTATOPLANAR(ARGB4444, 2, 1, I420, 2, 2, 17)
 #endif
@@ -1022,7 +1022,7 @@ TESTATOPLANAR(J400, 1, 1, J420, 2, 2, 2)
 TESTATOPLANAR(RAW, 3, 1, I420, 2, 2, 4)
 TESTATOPLANAR(RGB24, 3, 1, I420, 2, 2, 4)
 TESTATOPLANAR(RGB24, 3, 1, J420, 2, 2, ARM_YUV_ERROR)
-#ifdef LITTLE_ENDIAN_TEST
+#ifdef INTEL_TEST
 TESTATOPLANAR(RGB565, 2, 1, I420, 2, 2, 5)
 #endif
 TESTATOPLANAR(RGBA, 4, 1, I420, 2, 2, 4)
@@ -1200,20 +1200,20 @@ TESTATOBIPLANAR(AYUV, 1, 4, NV21, 2, 2)
 // TODO(fbarchard): make ARM version of C code that matches NEON.
 TESTATOB(AB30, 4, 4, 1, ABGR, 4, 4, 1, 0)
 TESTATOB(AB30, 4, 4, 1, ARGB, 4, 4, 1, 0)
-#ifdef LITTLE_ENDIAN_TEST
+#ifdef INTEL_TEST
 TESTATOB(ABGR, 4, 4, 1, AR30, 4, 4, 1, 0)
 #endif
 TESTATOB(ABGR, 4, 4, 1, ARGB, 4, 4, 1, 0)
-#ifdef LITTLE_ENDIAN_TEST
+#ifdef INTEL_TEST
 TESTATOB(AR30, 4, 4, 1, AB30, 4, 4, 1, 0)
 #endif
 TESTATOB(AR30, 4, 4, 1, ABGR, 4, 4, 1, 0)
-#ifdef LITTLE_ENDIAN_TEST
+#ifdef INTEL_TEST
 TESTATOB(AR30, 4, 4, 1, AR30, 4, 4, 1, 0)
 TESTATOB(AR30, 4, 4, 1, ARGB, 4, 4, 1, 0)
 #endif
 TESTATOB(ARGB, 4, 4, 1, ABGR, 4, 4, 1, 0)
-#ifdef LITTLE_ENDIAN_TEST
+#ifdef INTEL_TEST
 TESTATOB(ARGB, 4, 4, 1, AR30, 4, 4, 1, 0)
 #endif
 TESTATOB(ARGB, 4, 4, 1, ARGB, 4, 4, 1, 0)
@@ -1226,7 +1226,7 @@ TESTATOB(ARGB, 4, 4, 1, J400, 1, 1, 1, 0)
 TESTATOB(RGBA, 4, 4, 1, J400, 1, 1, 1, 0)
 TESTATOB(ARGB, 4, 4, 1, RAW, 3, 3, 1, 0)
 TESTATOB(ARGB, 4, 4, 1, RGB24, 3, 3, 1, 0)
-#ifdef LITTLE_ENDIAN_TEST
+#ifdef INTEL_TEST
 TESTATOB(ARGB, 4, 4, 1, RGB565, 2, 2, 1, 0)
 #endif
 TESTATOB(ARGB, 4, 4, 1, RGBA, 4, 4, 1, 0)
@@ -1245,7 +1245,7 @@ TESTATOB(RAW, 3, 3, 1, RGBA, 4, 4, 1, 0)
 TESTATOB(RAW, 3, 3, 1, RGB24, 3, 3, 1, 0)
 TESTATOB(RGB24, 3, 3, 1, ARGB, 4, 4, 1, 0)
 TESTATOB(RGB24, 3, 3, 1, J400, 1, 1, 1, 0)
-#ifdef LITTLE_ENDIAN_TEST
+#ifdef INTEL_TEST
 TESTATOB(RGB565, 2, 2, 1, ARGB, 4, 4, 1, 0)
 #endif
 TESTATOB(RGBA, 4, 4, 1, ARGB, 4, 4, 1, 0)
@@ -1348,7 +1348,7 @@ TESTATOB(YUY2, 2, 4, 1, Y, 1, 1, 1, 0)
   TESTATOBDRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
                   HEIGHT_B, DIFF)
 
-#ifdef LITTLE_ENDIAN_TEST
+#ifdef INTEL_TEST
 TESTATOBD(ARGB, 4, 4, 1, RGB565, 2, 2, 1, 0)
 #endif
 
@@ -2447,7 +2447,7 @@ TEST_F(LibYUVConvertTest, TestDither) {
   TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN,       \
                   YALIGN, benchmark_width_, DIFF, _Opt, +, 0, FMT_C, BPP_C)
 
-#ifdef LITTLE_ENDIAN_TEST
+#ifdef INTEL_TEST
 TESTPLANARTOBD(I420, 2, 2, RGB565, 2, 2, 1, 9, ARGB, 4)
 #endif
 #define TESTPTOB(NAME, UYVYTOI420, UYVYTONV12)                                \
@@ -2591,7 +2591,7 @@ TESTPLANARTOE(H420, 2, 2, RAW, 1, 3, RGB24, 3)
 TESTPLANARTOE(H420, 2, 2, RGB24, 1, 3, RAW, 3)
 TESTPLANARTOE(H420, 2, 2, ARGB, 1, 4, RAW, 3)
 TESTPLANARTOE(H420, 2, 2, RAW, 1, 3, ARGB, 4)
-#ifdef LITTLE_ENDIAN_TEST
+#ifdef INTEL_TEST
 TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, RGB565, 2)
 TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, ARGB1555, 2)
 TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, ARGB4444, 2)
@@ -2738,7 +2738,7 @@ TESTQPLANARTOE(I420Alpha, 2, 2, ABGR, 1, 4, ARGB, 4)
                 _Opt, +, 0, FMT_C, BPP_C)
 
 // Caveat: Destination needs to be 4 bytes
-#ifdef LITTLE_ENDIAN_TEST
+#ifdef INTEL_TEST
 TESTPLANETOE(ARGB, 1, 4, AR30, 1, 4, ARGB, 4)
 TESTPLANETOE(ABGR, 1, 4, AR30, 1, 4, ABGR, 4)
 TESTPLANETOE(AR30, 1, 4, ARGB, 1, 4, ABGR, 4)
@@ -2929,7 +2929,7 @@ TESTPLANAR16TOB(H210, 2, 1, ARGB, 4, 4, 1, 2)
 TESTPLANAR16TOB(H210, 2, 1, ABGR, 4, 4, 1, 2)
 TESTPLANAR16TOB(U210, 2, 1, ARGB, 4, 4, 1, 2)
 TESTPLANAR16TOB(U210, 2, 1, ABGR, 4, 4, 1, 2)
-#ifdef LITTLE_ENDIAN_TEST
+#ifdef INTEL_TEST
 TESTPLANAR16TOB(I010, 2, 2, AR30, 4, 4, 1, 2)
 TESTPLANAR16TOB(I010, 2, 2, AB30, 4, 4, 1, 2)
 TESTPLANAR16TOB(H010, 2, 2, AR30, 4, 4, 1, 2)
diff --git a/unit_test/scale_argb_test.cc b/unit_test/scale_argb_test.cc
index 93b77f56a..1dfa47f86 100644
--- a/unit_test/scale_argb_test.cc
+++ b/unit_test/scale_argb_test.cc
@@ -306,7 +306,9 @@ TEST_SCALETO(ARGBScale, 320, 240)
 TEST_SCALETO(ARGBScale, 569, 480)
 TEST_SCALETO(ARGBScale, 640, 360)
 TEST_SCALETO(ARGBScale, 1280, 720)
+#ifdef ENABLE_SLOW_TESTS
 TEST_SCALETO(ARGBScale, 1920, 1080)
+#endif  // ENABLE_SLOW_TESTS
 #undef TEST_SCALETO1
 #undef TEST_SCALETO
 
diff --git a/unit_test/scale_test.cc b/unit_test/scale_test.cc
index 94a785900..ac1e26ea3 100644
--- a/unit_test/scale_test.cc
+++ b/unit_test/scale_test.cc
@@ -500,7 +500,7 @@ static int I444TestFilter_16(int src_width,
 #define DX(x, nom, denom) static_cast<int>(((Abs(x) / nom + 1) / 2) * nom * 2)
 #define SX(x, nom, denom) static_cast<int>(((x / nom + 1) / 2) * denom * 2)
 
-#define TEST_FACTOR1(name, filter, nom, denom, max_diff)                     \
+#define TEST_FACTOR1(DISABLED_, name, filter, nom, denom, max_diff)                     \
   TEST_F(LibYUVScaleTest, I420ScaleDownBy##name##_##filter) {                \
     int diff = I420TestFilter(                                               \
         SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
@@ -517,7 +517,7 @@ static int I444TestFilter_16(int src_width,
         benchmark_cpu_info_);                                                \
     EXPECT_LE(diff, max_diff);                                               \
   }                                                                          \
-  TEST_F(LibYUVScaleTest, I420ScaleDownBy##name##_##filter##_16) {           \
+  TEST_F(LibYUVScaleTest, DISABLED_##I420ScaleDownBy##name##_##filter##_16) { \
     int diff = I420TestFilter_16(                                            \
         SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
         DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
@@ -525,7 +525,7 @@ static int I444TestFilter_16(int src_width,
         benchmark_cpu_info_);                                                \
     EXPECT_LE(diff, max_diff);                                               \
   }                                                                          \
-  TEST_F(LibYUVScaleTest, I444ScaleDownBy##name##_##filter##_16) {           \
+  TEST_F(LibYUVScaleTest, DISABLED_##I444ScaleDownBy##name##_##filter##_16) { \
     int diff = I444TestFilter_16(                                            \
         SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
         DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
@@ -536,11 +536,19 @@ static int I444TestFilter_16(int src_width,
 
 // Test a scale factor with all 4 filters.  Expect unfiltered to be exact, but
 // filtering is different fixed point implementations for SSSE3, Neon and C.
+#ifdef ENABLE_SLOW_TESTS
 #define TEST_FACTOR(name, nom, denom, boxdiff) \
-  TEST_FACTOR1(name, None, nom, denom, 0)      \
-  TEST_FACTOR1(name, Linear, nom, denom, 3)    \
-  TEST_FACTOR1(name, Bilinear, nom, denom, 3)  \
-  TEST_FACTOR1(name, Box, nom, denom, boxdiff)
+  TEST_FACTOR1(, name, None, nom, denom, 0)      \
+  TEST_FACTOR1(, name, Linear, nom, denom, 3)    \
+  TEST_FACTOR1(, name, Bilinear, nom, denom, 3)  \
+  TEST_FACTOR1(, name, Box, nom, denom, boxdiff)
+#else
+#define TEST_FACTOR(name, nom, denom, boxdiff) \
+  TEST_FACTOR1(DISABLED_, name, None, nom, denom, 0)      \
+  TEST_FACTOR1(DISABLED_, name, Linear, nom, denom, 3)    \
+  TEST_FACTOR1(DISABLED_, name, Bilinear, nom, denom, 3)  \
+  TEST_FACTOR1(DISABLED_, name, Box, nom, denom, boxdiff)
+#endif
 
 TEST_FACTOR(2, 1, 2, 0)
 TEST_FACTOR(4, 1, 4, 0)
@@ -553,7 +561,7 @@ TEST_FACTOR(3, 1, 3, 0)
 #undef SX
 #undef DX
 
-#define TEST_SCALETO1(name, width, height, filter, max_diff)                  \
+#define TEST_SCALETO1(DISABLED_, name, width, height, filter, max_diff)                  \
   TEST_F(LibYUVScaleTest, I420##name##To##width##x##height##_##filter) {      \
     int diff = I420TestFilter(benchmark_width_, benchmark_height_, width,     \
                               height, kFilter##filter, benchmark_iterations_, \
@@ -566,13 +574,13 @@ TEST_FACTOR(3, 1, 3, 0)
                               disable_cpu_flags_, benchmark_cpu_info_);       \
     EXPECT_LE(diff, max_diff);                                                \
   }                                                                           \
-  TEST_F(LibYUVScaleTest, I420##name##To##width##x##height##_##filter##_16) { \
+  TEST_F(LibYUVScaleTest, DISABLED_##I420##name##To##width##x##height##_##filter##_16) { \
     int diff = I420TestFilter_16(                                             \
         benchmark_width_, benchmark_height_, width, height, kFilter##filter,  \
         benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_);      \
     EXPECT_LE(diff, max_diff);                                                \
   }                                                                           \
-  TEST_F(LibYUVScaleTest, I444##name##To##width##x##height##_##filter##_16) { \
+  TEST_F(LibYUVScaleTest, DISABLED_##I444##name##To##width##x##height##_##filter##_16) { \
     int diff = I444TestFilter_16(                                             \
         benchmark_width_, benchmark_height_, width, height, kFilter##filter,  \
         benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_);      \
@@ -593,7 +601,7 @@ TEST_FACTOR(3, 1, 3, 0)
     EXPECT_LE(diff, max_diff);                                                \
   }                                                                           \
   TEST_F(LibYUVScaleTest,                                                     \
-         I420##name##From##width##x##height##_##filter##_16) {                \
+         DISABLED_##I420##name##From##width##x##height##_##filter##_16) {                \
     int diff = I420TestFilter_16(width, height, Abs(benchmark_width_),        \
                                  Abs(benchmark_height_), kFilter##filter,     \
                                  benchmark_iterations_, disable_cpu_flags_,   \
@@ -601,7 +609,7 @@ TEST_FACTOR(3, 1, 3, 0)
     EXPECT_LE(diff, max_diff);                                                \
   }                                                                           \
   TEST_F(LibYUVScaleTest,                                                     \
-         I444##name##From##width##x##height##_##filter##_16) {                \
+         DISABLED_##I444##name##From##width##x##height##_##filter##_16) {                \
     int diff = I444TestFilter_16(width, height, Abs(benchmark_width_),        \
                                  Abs(benchmark_height_), kFilter##filter,     \
                                  benchmark_iterations_, disable_cpu_flags_,   \
@@ -609,19 +617,30 @@ TEST_FACTOR(3, 1, 3, 0)
     EXPECT_LE(diff, max_diff);                                                \
   }
 
+#ifdef ENABLE_SLOW_TESTS
 // Test scale to a specified size with all 4 filters.
 #define TEST_SCALETO(name, width, height)         \
-  TEST_SCALETO1(name, width, height, None, 0)     \
-  TEST_SCALETO1(name, width, height, Linear, 3)   \
-  TEST_SCALETO1(name, width, height, Bilinear, 3) \
-  TEST_SCALETO1(name, width, height, Box, 3)
+  TEST_SCALETO1(, name, width, height, None, 0)     \
+  TEST_SCALETO1(, name, width, height, Linear, 3)   \
+  TEST_SCALETO1(, name, width, height, Bilinear, 3) \
+  TEST_SCALETO1(, name, width, height, Box, 3)
+#else
+// Test scale to a specified size with all 4 filters.
+#define TEST_SCALETO(name, width, height)         \
+  TEST_SCALETO1(DISABLED_, name, width, height, None, 0)     \
+  TEST_SCALETO1(DISABLED_, name, width, height, Linear, 3)   \
+  TEST_SCALETO1(DISABLED_, name, width, height, Bilinear, 3) \
+  TEST_SCALETO1(DISABLED_, name, width, height, Box, 3)
+#endif
 
 TEST_SCALETO(Scale, 1, 1)
 TEST_SCALETO(Scale, 320, 240)
 TEST_SCALETO(Scale, 569, 480)
 TEST_SCALETO(Scale, 640, 360)
 TEST_SCALETO(Scale, 1280, 720)
+#ifdef ENABLE_SLOW_TESTS
 TEST_SCALETO(Scale, 1920, 1080)
+#endif  // ENABLE_SLOW_TESTS
 #undef TEST_SCALETO1
 #undef TEST_SCALETO
 
@@ -879,7 +898,7 @@ static int TestPlaneFilter_16(int src_width,
 #define SX(x, nom, denom) static_cast<int>(((x / nom + 1) / 2) * denom * 2)
 
 #define TEST_FACTOR1(name, filter, nom, denom, max_diff)                     \
-  TEST_F(LibYUVScaleTest, ScalePlaneDownBy##name##_##filter##_16) {          \
+  TEST_F(LibYUVScaleTest, DISABLED_##ScalePlaneDownBy##name##_##filter##_16) {\
     int diff = TestPlaneFilter_16(                                           \
         SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
         DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \