From 6c1b2d38c685e769cf7db2806e27c8ec4c028fe3 Mon Sep 17 00:00:00 2001
From: "fbarchard@google.com"
Date: Fri, 26 Oct 2012 22:49:18 +0000
Subject: [PATCH] Mips port of libyuv. Includes functionality for convert, rotate, scale and memcpy.

BUG=126
TESTED=tested by mips
Review URL: https://webrtc-codereview.appspot.com/930005

git-svn-id: http://libyuv.googlecode.com/svn/trunk@449 16f28f9a-4ce2-e073-06de-1de4eb20be90
---
 README.chromium            |   2 +-
 include/libyuv.h           |   2 +
 include/libyuv/row.h       |  25 ++
 include/libyuv/version.h   |   2 +-
 libyuv.gyp                 |   3 +-
 source/convert_argb.cc     |  18 ++
 source/convert_from.cc     |  39 +++
 source/cpu_id.cc           |   6 +-
 source/memcpy_mips.S       | 514 +++++++++++++++++++------------------
 source/planar_functions.cc |  13 +
 source/rotate.cc           |  44 ++++
 source/rotate_mips.cc      | 485 ++++++++++++++++++++++++++++++++++
 source/row_mips.cc         | 401 +++++++++++++++++++++++++++++
 source/scale.cc            |  54 +++-
 source/scale_mips.cc       | 454 ++++++++++++++++++++++++++++++++
 unit_test/planar_test.cc   |  66 +++++
 16 files changed, 1867 insertions(+), 261 deletions(-)
 create mode 100644 source/rotate_mips.cc

diff --git a/README.chromium b/README.chromium
index 0936990f3..d8a9a5d86 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 447
+Version: 449
 License: BSD
 License File: LICENSE
diff --git a/include/libyuv.h b/include/libyuv.h
index 1c57a41dd..bd45c8e1c 100644
--- a/include/libyuv.h
+++ b/include/libyuv.h
@@ -19,9 +19,11 @@
 #include "libyuv/convert_from_argb.h"
 #include "libyuv/cpu_id.h"
 #include "libyuv/format_conversion.h"
+#include "libyuv/mjpeg_decoder.h"
 #include "libyuv/planar_functions.h"
 #include "libyuv/rotate.h"
 #include "libyuv/rotate_argb.h"
+#include "libyuv/row.h"
 #include "libyuv/scale.h"
 #include "libyuv/scale_argb.h"
 #include "libyuv/version.h"
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 00b20b1de..f68f6ddb4 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -175,8 +175,14 @@ extern "C" {
 // The following are available on Mips platforms
 #if !defined(YUV_DISABLE_ASM) && defined(__mips__)
+#define HAS_COPYROW_MIPS
 #if defined(__mips_dsp) && (__mips_dsp_rev >= 2)
 #define HAS_SPLITUV_MIPS_DSPR2
+#define HAS_MIRRORROW_MIPS_DSPR2
+#define HAS_MIRRORROWUV_MIPS_DSPR2
+#define HAS_I422TOARGBROW_MIPS_DSPR2
+#define HAS_I422TOBGRAROW_MIPS_DSPR2
+#define HAS_I422TOABGRROW_MIPS_DSPR2
 #endif
 #endif
@@ -282,6 +288,9 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix);
 void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width);
 void MirrorRow_SSE2(const uint8* src, uint8* dst, int width);
 void MirrorRow_NEON(const uint8* src, uint8* dst, int width);
+void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width);
+void MirrorRowUV_MIPS_DSPR2(const uint8* src, uint8* dst_u, uint8* dst_v,
+                            int width);
 void MirrorRow_C(const uint8* src, uint8* dst, int width);
 void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
                        int width);
@@ -321,6 +330,7 @@ void MergeUV_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
 void CopyRow_SSE2(const uint8* src, uint8* dst, int count);
 void CopyRow_X86(const uint8* src, uint8* dst, int count);
 void CopyRow_NEON(const uint8* src, uint8* dst, int count);
+void CopyRow_MIPS(const uint8* src, uint8* dst, int count);
 void CopyRow_C(const uint8* src, uint8* dst, int count);
 void SetRow8_X86(uint8* dst, uint32 v32, int count);
@@ -694,6 +704,21 @@ void NV21ToARGBRow_Any_NEON(const uint8* y_buf, const uint8* uv_buf,
                             uint8* argb_buf, int
width); +void I422ToARGBRow_MIPS_DSPR2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); +void I422ToBGRARow_MIPS_DSPR2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); +void I422ToABGRRow_MIPS_DSPR2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix); void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 06aee083e..75411b9ed 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 447 +#define LIBYUV_VERSION 449 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/libyuv.gyp b/libyuv.gyp index 0fd33b35a..9607bb3a5 100644 --- a/libyuv.gyp +++ b/libyuv.gyp @@ -75,11 +75,12 @@ 'source/convert_from_argb.cc', 'source/cpu_id.cc', 'source/format_conversion.cc', - 'source/memcpy_mips.S', + 'source/memcpy_mips.S', # TODO(fbarchard): Move into row_mips.cc 'source/mjpeg_decoder.cc', 'source/planar_functions.cc', 'source/rotate.cc', 'source/rotate_argb.cc', + 'source/rotate_mips.cc', 'source/rotate_neon.cc', 'source/row_common.cc', 'source/row_mips.cc', diff --git a/source/convert_argb.cc b/source/convert_argb.cc index 74d04dbee..5b8d285cd 100644 --- a/source/convert_argb.cc +++ b/source/convert_argb.cc @@ -132,6 +132,14 @@ int I422ToARGB(const uint8* src_y, int src_stride_y, I422ToARGBRow = I422ToARGBRow_NEON; } } +#elif defined(HAS_I422TOARGBROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) && + IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && + IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && + IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && + IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { + I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2; + } #endif for (int y = 0; y < height; ++y) { @@ -756,6 +764,11 @@ int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2, I422ToARGBRow = I422ToARGBRow_NEON; } } +#elif defined(HAS_I422TOARGBROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) && + IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { + I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2; + } #endif SIMD_ALIGNED(uint8 rowy[kMaxStride]); @@ -829,6 +842,11 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy, I422ToARGBRow = I422ToARGBRow_NEON; } } +#elif defined(HAS_I422TOARGBROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) && + IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { + I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2; + } #endif SIMD_ALIGNED(uint8 rowy[kMaxStride]); diff --git a/source/convert_from.cc b/source/convert_from.cc index 73ed900bd..cb00480a5 100644 --- a/source/convert_from.cc +++ b/source/convert_from.cc @@ -599,6 +599,14 @@ int I420ToARGB(const uint8* src_y, int src_stride_y, I422ToARGBRow = I422ToARGBRow_NEON; } } +#elif defined(HAS_I422TOARGBROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) && + IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && + IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && + IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && + IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { + I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2; + } #endif for (int y = 0; y < height; ++y) { @@ -652,6 
+660,14 @@ int I420ToBGRA(const uint8* src_y, int src_stride_y, I422ToBGRARow = I422ToBGRARow_NEON; } } +#elif defined(HAS_I422TOBGRAROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) && + IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && + IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && + IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && + IS_ALIGNED(dst_bgra, 4) && IS_ALIGNED(dst_stride_bgra, 4)) { + I422ToBGRARow = I422ToBGRARow_MIPS_DSPR2; + } #endif for (int y = 0; y < height; ++y) { @@ -909,6 +925,13 @@ int I420ToRGB565(const uint8* src_y, int src_stride_y, I422ToARGBRow = I422ToARGBRow_SSSE3; } } +#elif defined(HAS_I422TOARGBROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) && + IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && + IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && + IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2)) { + I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2; + } #endif SIMD_ALIGNED(uint8 row[kMaxStride]); @@ -975,6 +998,14 @@ int I420ToARGB1555(const uint8* src_y, int src_stride_y, I422ToARGBRow = I422ToARGBRow_SSSE3; } } +#elif defined(HAS_I422TOARGBROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) && + IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && + IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && + IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && + IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { + I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2; + } #endif SIMD_ALIGNED(uint8 row[kMaxStride]); @@ -1041,6 +1072,14 @@ int I420ToARGB4444(const uint8* src_y, int src_stride_y, I422ToARGBRow = I422ToARGBRow_SSSE3; } } +#elif defined(HAS_I422TOARGBROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) && + IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && + IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && + IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && + IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { + I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2; + } #endif SIMD_ALIGNED(uint8 row[kMaxStride]); diff --git a/source/cpu_id.cc b/source/cpu_id.cc index a75739cbe..4032080f9 100644 --- a/source/cpu_id.cc +++ b/source/cpu_id.cc @@ -174,7 +174,7 @@ int InitCpuFlags(void) { } } #endif - // environment variable overrides for testing. + // Environment variable overrides for testing. if (TestEnv("LIBYUV_DISABLE_X86")) { cpu_info_ &= ~kCpuHasX86; } @@ -197,7 +197,7 @@ int InitCpuFlags(void) { cpu_info_ &= ~kCpuHasAVX2; } #elif defined(__mips__) && defined(__linux__) - // linux mips parse text file for dsp detect. + // Linux mips parse text file for dsp detect. cpu_info_ = MipsCpuCaps("dsp"); // set kCpuHasMIPS_DSP. #if defined(__mips_dspr2) cpu_info_ |= kCpuHasMIPS_DSPR2; @@ -215,7 +215,7 @@ int InitCpuFlags(void) { } #elif defined(__arm__) #if defined(__linux__) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) - // linux arm parse text file for neon detect. + // Linux arm parse text file for neon detect. cpu_info_ = ArmCpuCaps("/proc/cpuinfo"); #elif defined(__ARM_NEON__) // gcc -mfpu=neon defines __ARM_NEON__ diff --git a/source/memcpy_mips.S b/source/memcpy_mips.S index 83292d2f9..722ef4fcf 100644 --- a/source/memcpy_mips.S +++ b/source/memcpy_mips.S @@ -1,171 +1,179 @@ #if defined (__mips__) - - .globl memcpy_MIPS; - .align 2; - .type memcpy_MIPS,@function; - .ent memcpy_MIPS,0; +# +# Copyright (c) 2012 The LibYuv project authors. All Rights Reserved. 
+# +# Use of this source code is governed by a BSD-style license +# that can be found in the LICENSE file in the root of the source +# tree. An additional intellectual property rights grant can be found +# in the file PATENTS. All contributing project authors may +# be found in the AUTHORS file in the root of the source tree. +# + .globl memcpy_MIPS; + .align 2; + .type memcpy_MIPS,@function; + .ent memcpy_MIPS,0; memcpy_MIPS: - .frame $sp,0,$ra - .set noreorder - .set noat + .frame $sp,0,$ra + .set noreorder + .set noat - slti $at,$a2,8 + slti $at,$a2,8 bne $at,$zero,last8 - move $v0,$a0 # memcpy returns the dst pointer + move $v0,$a0 # memcpy returns the dst pointer # Test if the src and dst are word-aligned, or can be made word-aligned - xor $t8,$a1,$a0 - andi $t8,$t8,0x3 # t8 is a0/a1 word-displacement + xor $t8,$a1,$a0 + andi $t8,$t8,0x3 # t8 is a0/a1 word-displacement - bne $t8,$zero,unaligned - negu $a3,$a0 + bne $t8,$zero,unaligned + negu $a3,$a0 - andi $a3,$a3,0x3 # we need to copy a3 bytes to make a0/a1 aligned - beq $a3,$zero,chk16w # when a3=0 then the dst (a0) is word-aligned - subu $a2,$a2,$a3 # now a2 is the remining bytes count + andi $a3,$a3,0x3 # we need to copy a3 bytes to make a0/a1 aligned + beq $a3,$zero,chk16w # when a3=0 then the dst (a0) is + subu $a2,$a2,$a3 # word-aligned now a2 is the remining bytes count - lwr $t8,0($a1) - addu $a1,$a1,$a3 - swr $t8,0($a0) - addu $a0,$a0,$a3 + lwr $t8,0($a1) + addu $a1,$a1,$a3 + swr $t8,0($a0) + addu $a0,$a0,$a3 # Now the dst/src are mutually word-aligned with word-aligned addresses -chk16w: andi $t8,$a2,0x3f # any whole 64-byte chunks? - # t8 is the byte count after 64-byte chunks - - beq $a2,$t8,chk8w # if a2==t8, no 64-byte chunks - # There will be at most 1 32-byte chunk after it - subu $a3,$a2,$t8 # subtract from a2 the reminder - # Here a3 counts bytes in 16w chunks - addu $a3,$a0,$a3 # Now a3 is the final dst after 64-byte chunks - - addu $t0,$a0,$a2 # t0 is the "past the end" address +chk16w: + andi $t8,$a2,0x3f # any whole 64-byte chunks? 
+ # t8 is the byte count after 64-byte chunks + beq $a2,$t8,chk8w # if a2==t8, no 64-byte chunks + # There will be at most 1 32-byte chunk after it + subu $a3,$a2,$t8 # subtract from a2 the reminder + # Here a3 counts bytes in 16w chunks + addu $a3,$a0,$a3 # Now a3 is the final dst after 64-byte chunks + addu $t0,$a0,$a2 # t0 is the "past the end" address # When in the loop we exercise "pref 30,x(a0)", the a0+x should not be past # the "t0-32" address # This means: for x=128 the last "safe" a0 address is "t0-160" # Alternatively, for x=64 the last "safe" a0 address is "t0-96" # In the current version we will use "pref 30,128(a0)", so "t0-160" is the limit - subu $t9,$t0,160 # t9 is the "last safe pref 30,128(a0)" address + subu $t9,$t0,160 # t9 is the "last safe pref 30,128(a0)" address pref 0,0($a1) # bring the first line of src, addr 0 pref 0,32($a1) # bring the second line of src, addr 32 pref 0,64($a1) # bring the third line of src, addr 64 - pref 30,32($a0) # safe, as we have at least 64 bytes ahead + pref 30,32($a0) # safe, as we have at least 64 bytes ahead # In case the a0 > t9 don't use "pref 30" at all - sgtu $v1,$a0,$t9 - bgtz $v1,loop16w # skip "pref 30,64(a0)" for too short arrays - nop + sgtu $v1,$a0,$t9 + bgtz $v1,loop16w # skip "pref 30,64(a0)" for too short arrays + nop # otherwise, start with using pref30 - pref 30,64($a0) + pref 30,64($a0) loop16w: - pref 0,96($a1) - lw $t0,0($a1) - bgtz $v1,skip_pref30_96 # skip "pref 30,96(a0)" - lw $t1,4($a1) + pref 0,96($a1) + lw $t0,0($a1) + bgtz $v1,skip_pref30_96 # skip "pref 30,96(a0)" + lw $t1,4($a1) pref 30,96($a0) # continue setting up the dest, addr 96 skip_pref30_96: - lw $t2,8($a1) - lw $t3,12($a1) - lw $t4,16($a1) - lw $t5,20($a1) - lw $t6,24($a1) - lw $t7,28($a1) - pref 0,128($a1) # bring the next lines of src, addr 128 + lw $t2,8($a1) + lw $t3,12($a1) + lw $t4,16($a1) + lw $t5,20($a1) + lw $t6,24($a1) + lw $t7,28($a1) + pref 0,128($a1) # bring the next lines of src, addr 128 - sw $t0,0($a0) - sw $t1,4($a0) - sw $t2,8($a0) - sw $t3,12($a0) - sw $t4,16($a0) - sw $t5,20($a0) - sw $t6,24($a0) - sw $t7,28($a0) + sw $t0,0($a0) + sw $t1,4($a0) + sw $t2,8($a0) + sw $t3,12($a0) + sw $t4,16($a0) + sw $t5,20($a0) + sw $t6,24($a0) + sw $t7,28($a0) - lw $t0,32($a1) - bgtz $v1,skip_pref30_128 # skip "pref 30,128(a0)" - lw $t1,36($a1) + lw $t0,32($a1) + bgtz $v1,skip_pref30_128 # skip "pref 30,128(a0)" + lw $t1,36($a1) pref 30,128($a0) # continue setting up the dest, addr 128 skip_pref30_128: - lw $t2,40($a1) - lw $t3,44($a1) - lw $t4,48($a1) - lw $t5,52($a1) - lw $t6,56($a1) - lw $t7,60($a1) - pref 0, 160($a1) # bring the next lines of src, addr 160 + lw $t2,40($a1) + lw $t3,44($a1) + lw $t4,48($a1) + lw $t5,52($a1) + lw $t6,56($a1) + lw $t7,60($a1) + pref 0, 160($a1) # bring the next lines of src, addr 160 - sw $t0,32($a0) - sw $t1,36($a0) - sw $t2,40($a0) - sw $t3,44($a0) - sw $t4,48($a0) - sw $t5,52($a0) - sw $t6,56($a0) - sw $t7,60($a0) + sw $t0,32($a0) + sw $t1,36($a0) + sw $t2,40($a0) + sw $t3,44($a0) + sw $t4,48($a0) + sw $t5,52($a0) + sw $t6,56($a0) + sw $t7,60($a0) - addiu $a0,$a0,64 # adding 64 to dest - sgtu $v1,$a0,$t9 - bne $a0,$a3,loop16w - addiu $a1,$a1,64 # adding 64 to src - move $a2,$t8 + addiu $a0,$a0,64 # adding 64 to dest + sgtu $v1,$a0,$t9 + bne $a0,$a3,loop16w + addiu $a1,$a1,64 # adding 64 to src + move $a2,$t8 # Here we have src and dest word-aligned but less than 64-bytes to go chk8w: pref 0, 0x0($a1) - andi $t8,$a2,0x1f # is there a 32-byte chunk? 
- # the t8 is the reminder count past 32-bytes - beq $a2,$t8,chk1w # when a2=t8, no 32-byte chunk + andi $t8,$a2,0x1f # is there a 32-byte chunk? + # the t8 is the reminder count past 32-bytes + beq $a2,$t8,chk1w # when a2=t8, no 32-byte chunk nop - lw $t0,0($a1) - lw $t1,4($a1) - lw $t2,8($a1) - lw $t3,12($a1) - lw $t4,16($a1) - lw $t5,20($a1) - lw $t6,24($a1) - lw $t7,28($a1) - addiu $a1,$a1,32 + lw $t0,0($a1) + lw $t1,4($a1) + lw $t2,8($a1) + lw $t3,12($a1) + lw $t4,16($a1) + lw $t5,20($a1) + lw $t6,24($a1) + lw $t7,28($a1) + addiu $a1,$a1,32 - sw $t0,0($a0) - sw $t1,4($a0) - sw $t2,8($a0) - sw $t3,12($a0) - sw $t4,16($a0) - sw $t5,20($a0) - sw $t6,24($a0) - sw $t7,28($a0) - addiu $a0,$a0,32 + sw $t0,0($a0) + sw $t1,4($a0) + sw $t2,8($a0) + sw $t3,12($a0) + sw $t4,16($a0) + sw $t5,20($a0) + sw $t6,24($a0) + sw $t7,28($a0) + addiu $a0,$a0,32 chk1w: - andi $a2,$t8,0x3 # now a2 is the reminder past 1w chunks - beq $a2,$t8,last8 - subu $a3,$t8,$a2 # a3 is count of bytes in 1w chunks - addu $a3,$a0,$a3 # now a3 is the dst address past the 1w chunks + andi $a2,$t8,0x3 # now a2 is the reminder past 1w chunks + beq $a2,$t8,last8 + subu $a3,$t8,$a2 # a3 is count of bytes in 1w chunks + addu $a3,$a0,$a3 # now a3 is the dst address past the 1w chunks # copying in words (4-byte chunks) wordCopy_loop: - lw $t3,0($a1) # the first t3 may be equal t0 ... optimize? - addiu $a1,$a1,4 - addiu $a0,$a0,4 - bne $a0,$a3,wordCopy_loop - sw $t3,-4($a0) + lw $t3,0($a1) # the first t3 may be equal t0 ... optimize? + addiu $a1,$a1,4 + addiu $a0,$a0,4 + bne $a0,$a3,wordCopy_loop + sw $t3,-4($a0) # For the last (<8) bytes last8: - blez $a2,leave - addu $a3,$a0,$a2 # a3 is the last dst address + blez $a2,leave + addu $a3,$a0,$a2 # a3 is the last dst address last8loop: - lb $v1,0($a1) - addiu $a1,$a1,1 - addiu $a0,$a0,1 - bne $a0,$a3,last8loop - sb $v1,-1($a0) + lb $v1,0($a1) + addiu $a1,$a1,1 + addiu $a0,$a0,1 + bne $a0,$a3,last8loop + sb $v1,-1($a0) -leave: j $ra - nop +leave: + j $ra + nop # # UNALIGNED case @@ -173,174 +181,172 @@ leave: j $ra unaligned: # got here with a3="negu a0" - andi $a3,$a3,0x3 # test if the a0 is word aligned - beqz $a3,ua_chk16w - subu $a2,$a2,$a3 # bytes left after initial a3 bytes + andi $a3,$a3,0x3 # test if the a0 is word aligned + beqz $a3,ua_chk16w + subu $a2,$a2,$a3 # bytes left after initial a3 bytes - lwr $v1,0($a1) - lwl $v1,3($a1) - addu $a1,$a1,$a3 # a3 may be here 1, 2 or 3 - swr $v1,0($a0) - addu $a0,$a0,$a3 # below the dst will be word aligned (NOTE1) - -ua_chk16w: andi $t8,$a2,0x3f # any whole 64-byte chunks? - # t8 is the byte count after 64-byte chunks - beq $a2,$t8,ua_chk8w # if a2==t8, no 64-byte chunks - # There will be at most 1 32-byte chunk after it - subu $a3,$a2,$t8 # subtract from a2 the reminder - # Here a3 counts bytes in 16w chunks - addu $a3,$a0,$a3 # Now a3 is the final dst after 64-byte chunks - - addu $t0,$a0,$a2 # t0 is the "past the end" address - - subu $t9,$t0,160 # t9 is the "last safe pref 30,128(a0)" address + lwr $v1,0($a1) + lwl $v1,3($a1) + addu $a1,$a1,$a3 # a3 may be here 1, 2 or 3 + swr $v1,0($a0) + addu $a0,$a0,$a3 # below the dst will be word aligned (NOTE1) +ua_chk16w: + andi $t8,$a2,0x3f # any whole 64-byte chunks? 
+ # t8 is the byte count after 64-byte chunks + beq $a2,$t8,ua_chk8w # if a2==t8, no 64-byte chunks + # There will be at most 1 32-byte chunk after it + subu $a3,$a2,$t8 # subtract from a2 the reminder + # Here a3 counts bytes in 16w chunks + addu $a3,$a0,$a3 # Now a3 is the final dst after 64-byte chunks + addu $t0,$a0,$a2 # t0 is the "past the end" address + subu $t9,$t0,160 # t9 is the "last safe pref 30,128(a0)" address pref 0,0($a1) # bring the first line of src, addr 0 pref 0,32($a1) # bring the second line of src, addr 32 pref 0,64($a1) # bring the third line of src, addr 64 - pref 30,32($a0) # safe, as we have at least 64 bytes ahead + pref 30,32($a0) # safe, as we have at least 64 bytes ahead # In case the a0 > t9 don't use "pref 30" at all - sgtu $v1,$a0,$t9 - bgtz $v1,ua_loop16w # skip "pref 30,64(a0)" for too short arrays - nop + sgtu $v1,$a0,$t9 + bgtz $v1,ua_loop16w # skip "pref 30,64(a0)" for too short arrays + nop # otherwise, start with using pref30 - pref 30,64($a0) + pref 30,64($a0) ua_loop16w: - pref 0,96($a1) - lwr $t0,0($a1) - lwl $t0,3($a1) - lwr $t1,4($a1) - bgtz $v1,ua_skip_pref30_96 - lwl $t1,7($a1) + pref 0,96($a1) + lwr $t0,0($a1) + lwl $t0,3($a1) + lwr $t1,4($a1) + bgtz $v1,ua_skip_pref30_96 + lwl $t1,7($a1) pref 30,96($a0) # continue setting up the dest, addr 96 ua_skip_pref30_96: - lwr $t2,8($a1) - lwl $t2,11($a1) - lwr $t3,12($a1) - lwl $t3,15($a1) - lwr $t4,16($a1) - lwl $t4,19($a1) - lwr $t5,20($a1) - lwl $t5,23($a1) - lwr $t6,24($a1) - lwl $t6,27($a1) - lwr $t7,28($a1) - lwl $t7,31($a1) - pref 0,128($a1) # bring the next lines of src, addr 128 + lwr $t2,8($a1) + lwl $t2,11($a1) + lwr $t3,12($a1) + lwl $t3,15($a1) + lwr $t4,16($a1) + lwl $t4,19($a1) + lwr $t5,20($a1) + lwl $t5,23($a1) + lwr $t6,24($a1) + lwl $t6,27($a1) + lwr $t7,28($a1) + lwl $t7,31($a1) + pref 0,128($a1) # bring the next lines of src, addr 128 - sw $t0,0($a0) - sw $t1,4($a0) - sw $t2,8($a0) - sw $t3,12($a0) - sw $t4,16($a0) - sw $t5,20($a0) - sw $t6,24($a0) - sw $t7,28($a0) + sw $t0,0($a0) + sw $t1,4($a0) + sw $t2,8($a0) + sw $t3,12($a0) + sw $t4,16($a0) + sw $t5,20($a0) + sw $t6,24($a0) + sw $t7,28($a0) - lwr $t0,32($a1) - lwl $t0,35($a1) - lwr $t1,36($a1) - bgtz $v1,ua_skip_pref30_128 - lwl $t1,39($a1) + lwr $t0,32($a1) + lwl $t0,35($a1) + lwr $t1,36($a1) + bgtz $v1,ua_skip_pref30_128 + lwl $t1,39($a1) pref 30,128($a0) # continue setting up the dest, addr 128 ua_skip_pref30_128: - lwr $t2,40($a1) - lwl $t2,43($a1) - lwr $t3,44($a1) - lwl $t3,47($a1) - lwr $t4,48($a1) - lwl $t4,51($a1) - lwr $t5,52($a1) - lwl $t5,55($a1) - lwr $t6,56($a1) - lwl $t6,59($a1) - lwr $t7,60($a1) - lwl $t7,63($a1) - pref 0, 160($a1) # bring the next lines of src, addr 160 + lwr $t2,40($a1) + lwl $t2,43($a1) + lwr $t3,44($a1) + lwl $t3,47($a1) + lwr $t4,48($a1) + lwl $t4,51($a1) + lwr $t5,52($a1) + lwl $t5,55($a1) + lwr $t6,56($a1) + lwl $t6,59($a1) + lwr $t7,60($a1) + lwl $t7,63($a1) + pref 0, 160($a1) # bring the next lines of src, addr 160 - sw $t0,32($a0) - sw $t1,36($a0) - sw $t2,40($a0) - sw $t3,44($a0) - sw $t4,48($a0) - sw $t5,52($a0) - sw $t6,56($a0) - sw $t7,60($a0) + sw $t0,32($a0) + sw $t1,36($a0) + sw $t2,40($a0) + sw $t3,44($a0) + sw $t4,48($a0) + sw $t5,52($a0) + sw $t6,56($a0) + sw $t7,60($a0) - addiu $a0,$a0,64 # adding 64 to dest - sgtu $v1,$a0,$t9 - bne $a0,$a3,ua_loop16w - addiu $a1,$a1,64 # adding 64 to src - move $a2,$t8 + addiu $a0,$a0,64 # adding 64 to dest + sgtu $v1,$a0,$t9 + bne $a0,$a3,ua_loop16w + addiu $a1,$a1,64 # adding 64 to src + move $a2,$t8 # Here we have src and dest 
word-aligned but less than 64-bytes to go ua_chk8w: - pref 0, 0x0($a1) - andi $t8,$a2,0x1f # is there a 32-byte chunk? - # the t8 is the reminder count - beq $a2,$t8,ua_chk1w # when a2=t8, no 32-byte chunk + pref 0, 0x0($a1) + andi $t8,$a2,0x1f # is there a 32-byte chunk? + # the t8 is the reminder count + beq $a2,$t8,ua_chk1w # when a2=t8, no 32-byte chunk - lwr $t0,0($a1) - lwl $t0,3($a1) - lwr $t1,4($a1) - lwl $t1,7($a1) - lwr $t2,8($a1) - lwl $t2,11($a1) - lwr $t3,12($a1) - lwl $t3,15($a1) - lwr $t4,16($a1) - lwl $t4,19($a1) - lwr $t5,20($a1) - lwl $t5,23($a1) - lwr $t6,24($a1) - lwl $t6,27($a1) - lwr $t7,28($a1) - lwl $t7,31($a1) - addiu $a1,$a1,32 + lwr $t0,0($a1) + lwl $t0,3($a1) + lwr $t1,4($a1) + lwl $t1,7($a1) + lwr $t2,8($a1) + lwl $t2,11($a1) + lwr $t3,12($a1) + lwl $t3,15($a1) + lwr $t4,16($a1) + lwl $t4,19($a1) + lwr $t5,20($a1) + lwl $t5,23($a1) + lwr $t6,24($a1) + lwl $t6,27($a1) + lwr $t7,28($a1) + lwl $t7,31($a1) + addiu $a1,$a1,32 - sw $t0,0($a0) - sw $t1,4($a0) - sw $t2,8($a0) - sw $t3,12($a0) - sw $t4,16($a0) - sw $t5,20($a0) - sw $t6,24($a0) - sw $t7,28($a0) - addiu $a0,$a0,32 + sw $t0,0($a0) + sw $t1,4($a0) + sw $t2,8($a0) + sw $t3,12($a0) + sw $t4,16($a0) + sw $t5,20($a0) + sw $t6,24($a0) + sw $t7,28($a0) + addiu $a0,$a0,32 ua_chk1w: - andi $a2,$t8,0x3 # now a2 is the reminder past 1w chunks - beq $a2,$t8,ua_smallCopy - subu $a3,$t8,$a2 # a3 is count of bytes in 1w chunks - addu $a3,$a0,$a3 # now a3 is the dst address past the 1w chunks + andi $a2,$t8,0x3 # now a2 is the reminder past 1w chunks + beq $a2,$t8,ua_smallCopy + subu $a3,$t8,$a2 # a3 is count of bytes in 1w chunks + addu $a3,$a0,$a3 # now a3 is the dst address past the 1w chunks # copying in words (4-byte chunks) ua_wordCopy_loop: - lwr $v1,0($a1) - lwl $v1,3($a1) - addiu $a1,$a1,4 - addiu $a0,$a0,4 # note: dst=a0 is word aligned here, see NOTE1 - bne $a0,$a3,ua_wordCopy_loop - sw $v1,-4($a0) + lwr $v1,0($a1) + lwl $v1,3($a1) + addiu $a1,$a1,4 + addiu $a0,$a0,4 # note: dst=a0 is word aligned here, see NOTE1 + bne $a0,$a3,ua_wordCopy_loop + sw $v1,-4($a0) # Now less than 4 bytes (value in a2) left to copy ua_smallCopy: - beqz $a2,leave - addu $a3,$a0,$a2 # a3 is the last dst address + beqz $a2,leave + addu $a3,$a0,$a2 # a3 is the last dst address ua_smallCopy_loop: - lb $v1,0($a1) - addiu $a1,$a1,1 - addiu $a0,$a0,1 - bne $a0,$a3,ua_smallCopy_loop - sb $v1,-1($a0) + lb $v1,0($a1) + addiu $a1,$a1,1 + addiu $a0,$a0,1 + bne $a0,$a3,ua_smallCopy_loop + sb $v1,-1($a0) - j $ra - nop + j $ra + nop - .set at - .set reorder - .end memcpy_MIPS; - .size memcpy_MIPS,.-memcpy_MIPS + .set at + .set reorder + .end memcpy_MIPS; + .size memcpy_MIPS,.-memcpy_MIPS #endif // if defined (__mips__) diff --git a/source/planar_functions.cc b/source/planar_functions.cc index 6de5f0342..58b6ace2f 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -46,6 +46,11 @@ void CopyPlane(const uint8* src_y, int src_stride_y, CopyRow = CopyRow_SSE2; } #endif +#if defined(HAS_COPYROW_MIPS) + if (TestCpuFlag(kCpuHasMIPS)) { + CopyRow = CopyRow_MIPS; + } +#endif // Copy plane for (int y = 0; y < height; ++y) { @@ -424,6 +429,14 @@ int I422ToBGRA(const uint8* src_y, int src_stride_y, } } } +#elif defined(HAS_I422TOBGRAROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) && + IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && + IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && + IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && + IS_ALIGNED(dst_bgra, 4) && IS_ALIGNED(dst_stride_bgra, 
4)) { + I422ToBGRARow = I422ToBGRARow_MIPS_DSPR2; + } #endif for (int y = 0; y < height; ++y) { diff --git a/source/rotate.cc b/source/rotate.cc index 8f9883f47..0601dec07 100644 --- a/source/rotate.cc +++ b/source/rotate.cc @@ -56,6 +56,23 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride, int width); #endif // defined(__ARM_NEON__) +#if !defined(YUV_DISABLE_ASM) && defined(__mips__) +#if defined(__mips_dsp) && (__mips_dsp_rev >= 2) +#define HAS_TRANSPOSE_WX8_MIPS_DSPR2 +void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width); + +void TransposeWx8_FAST_MIPS_DSPR2(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width); +#define HAS_TRANSPOSE_UVWx8_MIPS_DSPR2 +void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width); +#endif +#endif + + #if !defined(YUV_DISABLE_ASM) && defined(_M_IX86) #define HAS_TRANSPOSE_WX8_SSSE3 __declspec(naked) __declspec(align(16)) @@ -794,6 +811,16 @@ void TransposePlane(const uint8* src, int src_stride, TransposeWx8 = TransposeWx8_FAST_SSSE3; } #endif +#if defined(HAS_TRANSPOSE_WX8_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2)) { + if (IS_ALIGNED(width, 4) && + IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) { + TransposeWx8 = TransposeWx8_FAST_MIPS_DSPR2; + } else { + TransposeWx8 = TransposeWx8_MIPS_DSPR2; + } + } +#endif // Work across the source in 8x8 tiles int i = height; @@ -856,6 +883,13 @@ void RotatePlane180(const uint8* src, int src_stride, IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) { MirrorRow = MirrorRow_SSSE3; } +#endif +#if defined(HAS_MIRRORROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && + IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4) && + IS_ALIGNED(dst, 4) && IS_ALIGNED(dst_stride, 4)) { + MirrorRow = MirrorRow_MIPS_DSPR2; + } #endif void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; #if defined(HAS_COPYROW_NEON) @@ -952,6 +986,11 @@ void TransposeUV(const uint8* src, int src_stride, IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) { TransposeUVWx8 = TransposeUVWx8_SSE2; } +#elif defined(HAS_TRANSPOSE_UVWx8_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 2) && + IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) { + TransposeUVWx8 = TransposeUVWx8_MIPS_DSPR2; + } #endif // Work through the source in 8x8 tiles. @@ -1021,6 +1060,11 @@ void RotateUV180(const uint8* src, int src_stride, IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) { MirrorRowUV = MirrorRowUV_SSSE3; } +#elif defined(HAS_MIRRORROWUV_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && + IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) { + MirrorRowUV = MirrorRowUV_MIPS_DSPR2; + } #endif dst_a += dst_stride_a * (height - 1); diff --git a/source/rotate_mips.cc b/source/rotate_mips.cc new file mode 100644 index 000000000..430e953ee --- /dev/null +++ b/source/rotate_mips.cc @@ -0,0 +1,485 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "libyuv/row.h" + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#if !defined(YUV_DISABLE_ASM) && defined(__mips_dsp) && (__mips_dsp_rev >= 2) + +void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width) { + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + "sll $t2, %[src_stride], 0x1 \n" // src_stride x 2 + "sll $t4, %[src_stride], 0x2 \n" // src_stride x 4 + "sll $t9, %[src_stride], 0x3 \n" // src_stride x 8 + "addu $t3, $t2, %[src_stride] \n" + "addu $t5, $t4, %[src_stride] \n" + "addu $t6, $t2, $t4 \n" + "andi $t0, %[dst], 0x3 \n" + "andi $t1, %[dst_stride], 0x3 \n" + "or $t0, $t0, $t1 \n" + "bnez $t0, 11f \n" + " subu $t7, $t9, %[src_stride] \n" +//dst + dst_stride word aligned + "1: \n" + "lbu $t0, 0(%[src]) \n" + "lbux $t1, %[src_stride](%[src]) \n" + "lbux $t8, $t2(%[src]) \n" + "lbux $t9, $t3(%[src]) \n" + "sll $t1, $t1, 16 \n" + "sll $t9, $t9, 16 \n" + "or $t0, $t0, $t1 \n" + "or $t8, $t8, $t9 \n" + "precr.qb.ph $s0, $t8, $t0 \n" + "lbux $t0, $t4(%[src]) \n" + "lbux $t1, $t5(%[src]) \n" + "lbux $t8, $t6(%[src]) \n" + "lbux $t9, $t7(%[src]) \n" + "sll $t1, $t1, 16 \n" + "sll $t9, $t9, 16 \n" + "or $t0, $t0, $t1 \n" + "or $t8, $t8, $t9 \n" + "precr.qb.ph $s1, $t8, $t0 \n" + "sw $s0, 0(%[dst]) \n" + "addiu %[width], -1 \n" + "addiu %[src], 1 \n" + "sw $s1, 4(%[dst]) \n" + "bnez %[width], 1b \n" + " addu %[dst], %[dst], %[dst_stride] \n" + "b 2f \n" +//dst + dst_stride unaligned + "11: \n" + "lbu $t0, 0(%[src]) \n" + "lbux $t1, %[src_stride](%[src]) \n" + "lbux $t8, $t2(%[src]) \n" + "lbux $t9, $t3(%[src]) \n" + "sll $t1, $t1, 16 \n" + "sll $t9, $t9, 16 \n" + "or $t0, $t0, $t1 \n" + "or $t8, $t8, $t9 \n" + "precr.qb.ph $s0, $t8, $t0 \n" + "lbux $t0, $t4(%[src]) \n" + "lbux $t1, $t5(%[src]) \n" + "lbux $t8, $t6(%[src]) \n" + "lbux $t9, $t7(%[src]) \n" + "sll $t1, $t1, 16 \n" + "sll $t9, $t9, 16 \n" + "or $t0, $t0, $t1 \n" + "or $t8, $t8, $t9 \n" + "precr.qb.ph $s1, $t8, $t0 \n" + "swr $s0, 0(%[dst]) \n" + "swl $s0, 3(%[dst]) \n" + "addiu %[width], -1 \n" + "addiu %[src], 1 \n" + "swr $s1, 4(%[dst]) \n" + "swl $s1, 7(%[dst]) \n" + "bnez %[width], 11b \n" + "addu %[dst], %[dst], %[dst_stride] \n" + "2: \n" + ".set pop \n" + :[src] "+r" (src), + [dst] "+r" (dst), + [width] "+r" (width) + :[src_stride] "r" (src_stride), + [dst_stride] "r" (dst_stride) + : "t0", "t1", "t2", "t3", "t4", "t5", + "t6", "t7", "t8", "t9", + "s0", "s1" + ); +} + +void TransposeWx8_FAST_MIPS_DSPR2(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width) { + __asm__ __volatile__ ( + ".set noat \n" + ".set push \n" + ".set noreorder \n" + "beqz %[width], 2f \n" + " sll $t2, %[src_stride], 0x1 \n" // src_stride x 2 + "sll $t4, %[src_stride], 0x2 \n" // src_stride x 4 + "sll $t9, %[src_stride], 0x3 \n" // src_stride x 8 + "addu $t3, $t2, %[src_stride] \n" + "addu $t5, $t4, %[src_stride] \n" + "addu $t6, $t2, $t4 \n" + + "srl $AT, %[width], 0x2 \n" + "andi $t0, %[dst], 0x3 \n" + "andi $t1, %[dst_stride], 0x3 \n" + "or $t0, $t0, $t1 \n" + "bnez $t0, 11f \n" + " subu $t7, $t9, %[src_stride] \n" +//dst + dst_stride word aligned + "1: \n" + "lw $t0, 0(%[src]) \n" + "lwx $t1, %[src_stride](%[src]) \n" + "lwx $t8, $t2(%[src]) \n" + "lwx $t9, $t3(%[src]) \n" + +// t0 = | 30 | 20 | 10 | 00 | +// t1 = | 31 | 21 | 11 | 01 | +// t8 = | 32 | 22 | 12 | 02 | +// t9 = | 33 | 23 | 13 | 03 | + + "precr.qb.ph $s0, $t1, $t0 \n" + "precr.qb.ph $s1, $t9, $t8 \n" + "precrq.qb.ph $s2, $t1, 
$t0 \n" + "precrq.qb.ph $s3, $t9, $t8 \n" + + // s0 = | 21 | 01 | 20 | 00 | + // s1 = | 23 | 03 | 22 | 02 | + // s2 = | 31 | 11 | 30 | 10 | + // s3 = | 33 | 13 | 32 | 12 | + + "precr.qb.ph $s4, $s1, $s0 \n" + "precrq.qb.ph $s5, $s1, $s0 \n" + "precr.qb.ph $s6, $s3, $s2 \n" + "precrq.qb.ph $s7, $s3, $s2 \n" + + // s4 = | 03 | 02 | 01 | 00 | + // s5 = | 23 | 22 | 21 | 20 | + // s6 = | 13 | 12 | 11 | 10 | + // s7 = | 33 | 32 | 31 | 30 | + + "lwx $t0, $t4(%[src]) \n" + "lwx $t1, $t5(%[src]) \n" + "lwx $t8, $t6(%[src]) \n" + "lwx $t9, $t7(%[src]) \n" + +// t0 = | 34 | 24 | 14 | 04 | +// t1 = | 35 | 25 | 15 | 05 | +// t8 = | 36 | 26 | 16 | 06 | +// t9 = | 37 | 27 | 17 | 07 | + + "precr.qb.ph $s0, $t1, $t0 \n" + "precr.qb.ph $s1, $t9, $t8 \n" + "precrq.qb.ph $s2, $t1, $t0 \n" + "precrq.qb.ph $s3, $t9, $t8 \n" + + // s0 = | 25 | 05 | 24 | 04 | + // s1 = | 27 | 07 | 26 | 06 | + // s2 = | 35 | 15 | 34 | 14 | + // s3 = | 37 | 17 | 36 | 16 | + + "precr.qb.ph $t0, $s1, $s0 \n" + "precrq.qb.ph $t1, $s1, $s0 \n" + "precr.qb.ph $t8, $s3, $s2 \n" + "precrq.qb.ph $t9, $s3, $s2 \n" + + // t0 = | 07 | 06 | 05 | 04 | + // t1 = | 27 | 26 | 25 | 24 | + // t8 = | 17 | 16 | 15 | 14 | + // t9 = | 37 | 36 | 35 | 34 | + + "addu $s0, %[dst], %[dst_stride] \n" + "addu $s1, $s0, %[dst_stride] \n" + "addu $s2, $s1, %[dst_stride] \n" + + "sw $s4, 0(%[dst]) \n" + "sw $t0, 4(%[dst]) \n" + "sw $s6, 0($s0) \n" + "sw $t8, 4($s0) \n" + "sw $s5, 0($s1) \n" + "sw $t1, 4($s1) \n" + "sw $s7, 0($s2) \n" + "sw $t9, 4($s2) \n" + + "addiu $AT, -1 \n" + "addiu %[src], 4 \n" + + "bnez $AT, 1b \n" + " addu %[dst], $s2, %[dst_stride] \n" + "b 2f \n" +//dst + dst_stride unaligned + "11: \n" + "lw $t0, 0(%[src]) \n" + "lwx $t1, %[src_stride](%[src]) \n" + "lwx $t8, $t2(%[src]) \n" + "lwx $t9, $t3(%[src]) \n" + +// t0 = | 30 | 20 | 10 | 00 | +// t1 = | 31 | 21 | 11 | 01 | +// t8 = | 32 | 22 | 12 | 02 | +// t9 = | 33 | 23 | 13 | 03 | + + "precr.qb.ph $s0, $t1, $t0 \n" + "precr.qb.ph $s1, $t9, $t8 \n" + "precrq.qb.ph $s2, $t1, $t0 \n" + "precrq.qb.ph $s3, $t9, $t8 \n" + + // s0 = | 21 | 01 | 20 | 00 | + // s1 = | 23 | 03 | 22 | 02 | + // s2 = | 31 | 11 | 30 | 10 | + // s3 = | 33 | 13 | 32 | 12 | + + "precr.qb.ph $s4, $s1, $s0 \n" + "precrq.qb.ph $s5, $s1, $s0 \n" + "precr.qb.ph $s6, $s3, $s2 \n" + "precrq.qb.ph $s7, $s3, $s2 \n" + + // s4 = | 03 | 02 | 01 | 00 | + // s5 = | 23 | 22 | 21 | 20 | + // s6 = | 13 | 12 | 11 | 10 | + // s7 = | 33 | 32 | 31 | 30 | + + "lwx $t0, $t4(%[src]) \n" + "lwx $t1, $t5(%[src]) \n" + "lwx $t8, $t6(%[src]) \n" + "lwx $t9, $t7(%[src]) \n" + +// t0 = | 34 | 24 | 14 | 04 | +// t1 = | 35 | 25 | 15 | 05 | +// t8 = | 36 | 26 | 16 | 06 | +// t9 = | 37 | 27 | 17 | 07 | + + "precr.qb.ph $s0, $t1, $t0 \n" + "precr.qb.ph $s1, $t9, $t8 \n" + "precrq.qb.ph $s2, $t1, $t0 \n" + "precrq.qb.ph $s3, $t9, $t8 \n" + + // s0 = | 25 | 05 | 24 | 04 | + // s1 = | 27 | 07 | 26 | 06 | + // s2 = | 35 | 15 | 34 | 14 | + // s3 = | 37 | 17 | 36 | 16 | + + "precr.qb.ph $t0, $s1, $s0 \n" + "precrq.qb.ph $t1, $s1, $s0 \n" + "precr.qb.ph $t8, $s3, $s2 \n" + "precrq.qb.ph $t9, $s3, $s2 \n" + + // t0 = | 07 | 06 | 05 | 04 | + // t1 = | 27 | 26 | 25 | 24 | + // t8 = | 17 | 16 | 15 | 14 | + // t9 = | 37 | 36 | 35 | 34 | + + "addu $s0, %[dst], %[dst_stride] \n" + "addu $s1, $s0, %[dst_stride] \n" + "addu $s2, $s1, %[dst_stride] \n" + + "swr $s4, 0(%[dst]) \n" + "swl $s4, 3(%[dst]) \n" + "swr $t0, 4(%[dst]) \n" + "swl $t0, 7(%[dst]) \n" + "swr $s6, 0($s0) \n" + "swl $s6, 3($s0) \n" + "swr $t8, 4($s0) \n" + "swl $t8, 7($s0) \n" + "swr $s5, 0($s1) \n" + 
"swl $s5, 3($s1) \n" + "swr $t1, 4($s1) \n" + "swl $t1, 7($s1) \n" + "swr $s7, 0($s2) \n" + "swl $s7, 3($s2) \n" + "swr $t9, 4($s2) \n" + "swl $t9, 7($s2) \n" + + "addiu $AT, -1 \n" + "addiu %[src], 4 \n" + + "bnez $AT, 11b \n" + " addu %[dst], $s2, %[dst_stride] \n" + "2: \n" + ".set pop \n" + ".set at \n" + :[src] "+r" (src), + [dst] "+r" (dst), + [width] "+r" (width) + :[src_stride] "r" (src_stride), + [dst_stride] "r" (dst_stride) + : "t0", "t1", "t2", "t3", "t4", "t5", + "t6", "t7", "t8", "t9", + "s0", "s1", "s2", "s3", "s4", + "s5", "s6", "s7" + ); +} + +void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width) { + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + "beqz %[width], 2f \n" + " sll $t2, %[src_stride], 0x1 \n" // src_stride x 2 + "sll $t4, %[src_stride], 0x2 \n" // src_stride x 4 + "sll $t9, %[src_stride], 0x3 \n" // src_stride x 8 + "addu $t3, $t2, %[src_stride] \n" + "addu $t5, $t4, %[src_stride] \n" + "addu $t6, $t2, $t4 \n" + "subu $t7, $t9, %[src_stride] \n" + "srl $t1, %[width], 1 \n" + +// check word aligment for dst_a, dst_b, dst_stride_a and dst_stride_b + "andi $t0, %[dst_a], 0x3 \n" + "andi $t8, %[dst_b], 0x3 \n" + "or $t0, $t0, $t8 \n" + "andi $t8, %[dst_stride_a], 0x3 \n" + "andi $s5, %[dst_stride_b], 0x3 \n" + "or $t8, $t8, $s5 \n" + "or $t0, $t0, $t8 \n" + "bnez $t0, 11f \n" + " nop \n" +// dst + dst_stride word aligned (both, a & b dst addresses) + "1: \n" + "lw $t0, 0(%[src]) \n" // |B0|A0|b0|a0| + "lwx $t8, %[src_stride](%[src]) \n" // |B1|A1|b1|a1| + "addu $s5, %[dst_a], %[dst_stride_a] \n" + "lwx $t9, $t2(%[src]) \n" // |B2|A2|b2|a2| + "lwx $s0, $t3(%[src]) \n" // |B3|A3|b3|a3| + "addu $s6, %[dst_b], %[dst_stride_b] \n" + + "precrq.ph.w $s1, $t8, $t0 \n" // |B1|A1|B0|A0| + "precrq.ph.w $s2, $s0, $t9 \n" // |B3|A3|B2|A2| + "precr.qb.ph $s3, $s2, $s1 \n" // |A3|A2|A1|A0| + "precrq.qb.ph $s4, $s2, $s1 \n" // |B3|B2|B1|B0| + + "sll $t0, $t0, 16 \n" + "packrl.ph $s1, $t8, $t0 \n" // |b1|a1|b0|a0| + "sll $t9, $t9, 16 \n" + "packrl.ph $s2, $s0, $t9 \n" // |b3|a3|b2|a2| + + "sw $s3, 0($s5) \n" + "sw $s4, 0($s6) \n" + + "precr.qb.ph $s3, $s2, $s1 \n" // |a3|a2|a1|a0| + "precrq.qb.ph $s4, $s2, $s1 \n" // |b3|b2|b1|b0| + + "lwx $t0, $t4(%[src]) \n" // |B4|A4|b4|a4| + "lwx $t8, $t5(%[src]) \n" // |B5|A5|b5|a5| + "lwx $t9, $t6(%[src]) \n" // |B6|A6|b6|a6| + "lwx $s0, $t7(%[src]) \n" // |B7|A7|b7|a7| + "sw $s3, 0(%[dst_a]) \n" + "sw $s4, 0(%[dst_b]) \n" + + "precrq.ph.w $s1, $t8, $t0 \n" // |B5|A5|B4|A4| + "precrq.ph.w $s2, $s0, $t9 \n" // |B6|A6|B7|A7| + "precr.qb.ph $s3, $s2, $s1 \n" // |A7|A6|A5|A4| + "precrq.qb.ph $s4, $s2, $s1 \n" // |B7|B6|B5|B4| + + "sll $t0, $t0, 16 \n" + "packrl.ph $s1, $t8, $t0 \n" // |b5|a5|b4|a4| + "sll $t9, $t9, 16 \n" + "packrl.ph $s2, $s0, $t9 \n" // |b7|a7|b6|a6| + "sw $s3, 4($s5) \n" + "sw $s4, 4($s6) \n" + + "precr.qb.ph $s3, $s2, $s1 \n" // |a7|a6|a5|a4| + "precrq.qb.ph $s4, $s2, $s1 \n" // |b7|b6|b5|b4| + + "addiu %[src], 4 \n" + "addiu $t1, -1 \n" + "sll $t0, %[dst_stride_a], 1 \n" + "sll $t8, %[dst_stride_b], 1 \n" + "sw $s3, 4(%[dst_a]) \n" + "sw $s4, 4(%[dst_b]) \n" + "addu %[dst_a], %[dst_a], $t0 \n" + "bnez $t1, 1b \n" + " addu %[dst_b], %[dst_b], $t8 \n" + "b 2f \n" + " nop \n" + +// dst_a or dst_b or dst_stride_a or dst_stride_b not word aligned + "11: \n" + "lw $t0, 0(%[src]) \n" // |B0|A0|b0|a0| + "lwx $t8, %[src_stride](%[src]) \n" // |B1|A1|b1|a1| + "addu $s5, %[dst_a], %[dst_stride_a] \n" + "lwx $t9, $t2(%[src]) \n" 
// |B2|A2|b2|a2| + "lwx $s0, $t3(%[src]) \n" // |B3|A3|b3|a3| + "addu $s6, %[dst_b], %[dst_stride_b] \n" + + "precrq.ph.w $s1, $t8, $t0 \n" // |B1|A1|B0|A0| + "precrq.ph.w $s2, $s0, $t9 \n" // |B3|A3|B2|A2| + "precr.qb.ph $s3, $s2, $s1 \n" // |A3|A2|A1|A0| + "precrq.qb.ph $s4, $s2, $s1 \n" // |B3|B2|B1|B0| + + "sll $t0, $t0, 16 \n" + "packrl.ph $s1, $t8, $t0 \n" // |b1|a1|b0|a0| + "sll $t9, $t9, 16 \n" + "packrl.ph $s2, $s0, $t9 \n" // |b3|a3|b2|a2| + + "swr $s3, 0($s5) \n" + "swl $s3, 3($s5) \n" + "swr $s4, 0($s6) \n" + "swl $s4, 3($s6) \n" + + "precr.qb.ph $s3, $s2, $s1 \n" // |a3|a2|a1|a0| + "precrq.qb.ph $s4, $s2, $s1 \n" // |b3|b2|b1|b0| + + "lwx $t0, $t4(%[src]) \n" // |B4|A4|b4|a4| + "lwx $t8, $t5(%[src]) \n" // |B5|A5|b5|a5| + "lwx $t9, $t6(%[src]) \n" // |B6|A6|b6|a6| + "lwx $s0, $t7(%[src]) \n" // |B7|A7|b7|a7| + "swr $s3, 0(%[dst_a]) \n" + "swl $s3, 3(%[dst_a]) \n" + "swr $s4, 0(%[dst_b]) \n" + "swl $s4, 3(%[dst_b]) \n" + + "precrq.ph.w $s1, $t8, $t0 \n" // |B5|A5|B4|A4| + "precrq.ph.w $s2, $s0, $t9 \n" // |B6|A6|B7|A7| + "precr.qb.ph $s3, $s2, $s1 \n" // |A7|A6|A5|A4| + "precrq.qb.ph $s4, $s2, $s1 \n" // |B7|B6|B5|B4| + + "sll $t0, $t0, 16 \n" + "packrl.ph $s1, $t8, $t0 \n" // |b5|a5|b4|a4| + "sll $t9, $t9, 16 \n" + "packrl.ph $s2, $s0, $t9 \n" // |b7|a7|b6|a6| + + "swr $s3, 4($s5) \n" + "swl $s3, 7($s5) \n" + "swr $s4, 4($s6) \n" + "swl $s4, 7($s6) \n" + + "precr.qb.ph $s3, $s2, $s1 \n" // |a7|a6|a5|a4| + "precrq.qb.ph $s4, $s2, $s1 \n" // |b7|b6|b5|b4| + + "addiu %[src], 4 \n" + "addiu $t1, -1 \n" + "sll $t0, %[dst_stride_a], 1 \n" + "sll $t8, %[dst_stride_b], 1 \n" + "swr $s3, 4(%[dst_a]) \n" + "swl $s3, 7(%[dst_a]) \n" + "swr $s4, 4(%[dst_b]) \n" + "swl $s4, 7(%[dst_b]) \n" + "addu %[dst_a], %[dst_a], $t0 \n" + "bnez $t1, 11b \n" + " addu %[dst_b], %[dst_b], $t8 \n" + + "2: \n" + ".set pop \n" + : [src] "+r" (src), + [dst_a] "+r" (dst_a), + [dst_b] "+r" (dst_b), + [width] "+r" (width), + [src_stride] "+r" (src_stride) + : [dst_stride_a] "r" (dst_stride_a), + [dst_stride_b] "r" (dst_stride_b) + : "t0", "t1", "t2", "t3", "t4", "t5", + "t6", "t7", "t8", "t9", + "s0", "s1", "s2", "s3", + "s4", "s5", "s6" + ); +} + +#endif // defined(__mips_dsp) && (__mips_dsp_rev >= 2) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/source/row_mips.cc b/source/row_mips.cc index 20f5a4fa5..df4542fbf 100644 --- a/source/row_mips.cc +++ b/source/row_mips.cc @@ -16,6 +16,13 @@ extern "C" { #endif #if !defined(YUV_DISABLE_ASM) && defined(__mips__) +#if defined HAS_COPYROW_MIPS +extern "C" void memcpy_MIPS(uint8* dst, const uint8* src, int count); +void CopyRow_MIPS(const uint8* src, uint8* dst, int count) { + memcpy_MIPS(dst, src, count); +} +#endif + #ifdef HAS_SPLITUV_MIPS_DSPR2 void SplitUV_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) { @@ -166,6 +173,400 @@ void SplitUV_Unaligned_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, } #endif // HAS_SPLITUV_MIPS_DSPR2 +#ifdef HAS_MIRRORROW_MIPS_DSPR2 +void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width) { + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + + "srl $t4, %[width], 4 \n" // multiplies of 16 + "andi $t5, %[width], 0xf \n" + "blez $t4, 2f \n" + " addu %[src], %[src], %[width] \n" // src += width + + "1: \n" + "lw $t0, -16(%[src]) \n" // |3|2|1|0| + "lw $t1, -12(%[src]) \n" // |7|6|5|4| + "lw $t2, -8(%[src]) \n" // |11|10|9|8| + "lw $t3, -4(%[src]) \n" // |15|14|13|12| + "wsbh $t0, $t0 \n" // |2|3|0|1| + "wsbh $t1, $t1 \n" // |6|7|4|5| + "wsbh $t2, $t2 \n" 
// |10|11|8|9| + "wsbh $t3, $t3 \n" // |14|15|12|13| + "rotr $t0, $t0, 16 \n" // |0|1|2|3| + "rotr $t1, $t1, 16 \n" // |4|5|6|7| + "rotr $t2, $t2, 16 \n" // |8|9|10|11| + "rotr $t3, $t3, 16 \n" // |12|13|14|15| + "addiu %[src], %[src], -16 \n" + "addiu $t4, $t4, -1 \n" + "sw $t3, 0(%[dst]) \n" // |15|14|13|12| + "sw $t2, 4(%[dst]) \n" // |11|10|9|8| + "sw $t1, 8(%[dst]) \n" // |7|6|5|4| + "sw $t0, 12(%[dst]) \n" // |3|2|1|0| + "bgtz $t4, 1b \n" + " addiu %[dst], %[dst], 16 \n" + "beqz $t5, 3f \n" + " nop \n" + + "2: \n" + "lbu $t0, -1(%[src]) \n" + "addiu $t5, $t5, -1 \n" + "addiu %[src], %[src], -1 \n" + "sb $t0, 0(%[dst]) \n" + "bgez $t5, 2b \n" + " addiu %[dst], %[dst], 1 \n" + + "3: \n" + ".set pop \n" + : [src] "+r" (src), [dst] "+r" (dst) + : [width] "r" (width) + : "t0", "t1", "t2", "t3", "t4", "t5" + ); +} +#endif // HAS_MIRRORROW_MIPS_DSPR2 + +#ifdef HAS_MIRRORROWUV_MIPS_DSPR2 +void MirrorRowUV_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int width) { + int x = 0; + int y = 0; + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + + "addu $t4, %[width], %[width] \n" + "srl %[x], %[width], 4 \n" + "andi %[y], %[width], 0xf \n" + "blez %[x], 2f \n" + " addu %[src_uv], %[src_uv], $t4 \n" + + "1: \n" + "lw $t0, -32(%[src_uv]) \n" // |3|2|1|0| + "lw $t1, -28(%[src_uv]) \n" // |7|6|5|4| + "lw $t2, -24(%[src_uv]) \n" // |11|10|9|8| + "lw $t3, -20(%[src_uv]) \n" // |15|14|13|12| + "lw $t4, -16(%[src_uv]) \n" // |19|18|17|16| + "lw $t6, -12(%[src_uv]) \n" // |23|22|21|20| + "lw $t7, -8(%[src_uv]) \n" // |27|26|25|24| + "lw $t8, -4(%[src_uv]) \n" // |31|30|29|28| + + "rotr $t0, $t0, 16 \n" // |1|0|3|2| + "rotr $t1, $t1, 16 \n" // |5|4|7|6| + "rotr $t2, $t2, 16 \n" // |9|8|11|10| + "rotr $t3, $t3, 16 \n" // |13|12|15|14| + "rotr $t4, $t4, 16 \n" // |17|16|19|18| + "rotr $t6, $t6, 16 \n" // |21|20|23|22| + "rotr $t7, $t7, 16 \n" // |25|24|27|26| + "rotr $t8, $t8, 16 \n" // |29|28|31|30| + "precr.qb.ph $t9, $t0, $t1 \n" // |0|2|4|6| + "precrq.qb.ph $t5, $t0, $t1 \n" // |1|3|5|7| + "precr.qb.ph $t0, $t2, $t3 \n" // |8|10|12|14| + "precrq.qb.ph $t1, $t2, $t3 \n" // |9|11|13|15| + "precr.qb.ph $t2, $t4, $t6 \n" // |16|18|20|22| + "precrq.qb.ph $t3, $t4, $t6 \n" // |17|19|21|23| + "precr.qb.ph $t4, $t7, $t8 \n" // |24|26|28|30| + "precrq.qb.ph $t6, $t7, $t8 \n" // |25|27|29|31| + "addiu %[src_uv], %[src_uv], -32 \n" + "addiu %[x], %[x], -1 \n" + "swr $t4, 0(%[dst_u]) \n" + "swl $t4, 3(%[dst_u]) \n" // |30|28|26|24| + "swr $t6, 0(%[dst_v]) \n" + "swl $t6, 3(%[dst_v]) \n" // |31|29|27|25| + "swr $t2, 4(%[dst_u]) \n" + "swl $t2, 7(%[dst_u]) \n" // |22|20|18|16| + "swr $t3, 4(%[dst_v]) \n" + "swl $t3, 7(%[dst_v]) \n" // |23|21|19|17| + "swr $t0, 8(%[dst_u]) \n" + "swl $t0, 11(%[dst_u]) \n" // |14|12|10|8| + "swr $t1, 8(%[dst_v]) \n" + "swl $t1, 11(%[dst_v]) \n" // |15|13|11|9| + "swr $t9, 12(%[dst_u]) \n" + "swl $t9, 15(%[dst_u]) \n" // |6|4|2|0| + "swr $t5, 12(%[dst_v]) \n" + "swl $t5, 15(%[dst_v]) \n" // |7|5|3|1| + "addiu %[dst_v], %[dst_v], 16 \n" + "bgtz %[x], 1b \n" + " addiu %[dst_u], %[dst_u], 16 \n" + "beqz %[y], 3f \n" + " nop \n" + "b 2f \n" + " nop \n" + + "2: \n" + "lbu $t0, -2(%[src_uv]) \n" + "lbu $t1, -1(%[src_uv]) \n" + "addiu %[src_uv], %[src_uv], -2 \n" + "addiu %[y], %[y], -1 \n" + "sb $t0, 0(%[dst_u]) \n" + "sb $t1, 0(%[dst_v]) \n" + "addiu %[dst_u], %[dst_u], 1 \n" + "bgtz %[y], 2b \n" + " addiu %[dst_v], %[dst_v], 1 \n" + + "3: \n" + ".set pop \n" + : [src_uv] "+r" (src_uv), + [dst_u] "+r" (dst_u), + [dst_v] "+r" (dst_v), + [x] "=&r" (x), + [y] "+r" 
(y) + : [width] "r" (width) + : "t0", "t1", "t2", "t3", "t4", + "t5", "t7", "t8", "t9" + ); +} +#endif // HAS_MIRRORROWUV_MIPS_DSPR2 + + + +// Convert (4 Y and 2 VU) I422 and arrange RGB values into +// t5 = | 0 | B0 | 0 | b0 | +// t4 = | 0 | B1 | 0 | b1 | +// t9 = | 0 | G0 | 0 | g0 | +// t8 = | 0 | G1 | 0 | g1 | +// t2 = | 0 | R0 | 0 | r0 | +// t1 = | 0 | R1 | 0 | r1 | +#define I422ToTransientMipsRGB \ + "lw $t0, 0(%[y_buf]) \n" \ + "lhu $t1, 0(%[u_buf]) \n" \ + "lhu $t2, 0(%[v_buf]) \n" \ + "preceu.ph.qbr $t1, $t1 \n" \ + "preceu.ph.qbr $t2, $t2 \n" \ + "preceu.ph.qbra $t3, $t0 \n" \ + "preceu.ph.qbla $t0, $t0 \n" \ + "subu.ph $t1, $t1, $s5 \n" \ + "subu.ph $t2, $t2, $s5 \n" \ + "subu.ph $t3, $t3, $s4 \n" \ + "subu.ph $t0, $t0, $s4 \n" \ + "mul.ph $t3, $t3, $s0 \n" \ + "mul.ph $t0, $t0, $s0 \n" \ + "shll.ph $t4, $t1, 0x7 \n" \ + "subu.ph $t4, $t4, $t1 \n" \ + "mul.ph $t6, $t1, $s1 \n" \ + "mul.ph $t1, $t2, $s2 \n" \ + "addq_s.ph $t5, $t4, $t3 \n" \ + "addq_s.ph $t4, $t4, $t0 \n" \ + "shra.ph $t5, $t5, 6 \n" \ + "shra.ph $t4, $t4, 6 \n" \ + "addiu %[u_buf], 2 \n" \ + "addiu %[v_buf], 2 \n" \ + "addu.ph $t6, $t6, $t1 \n" \ + "mul.ph $t1, $t2, $s3 \n" \ + "addu.ph $t9, $t6, $t3 \n" \ + "addu.ph $t8, $t6, $t0 \n" \ + "shra.ph $t9, $t9, 6 \n" \ + "shra.ph $t8, $t8, 6 \n" \ + "addu.ph $t2, $t1, $t3 \n" \ + "addu.ph $t1, $t1, $t0 \n" \ + "shra.ph $t2, $t2, 6 \n" \ + "shra.ph $t1, $t1, 6 \n" \ + "subu.ph $t5, $t5, $s5 \n" \ + "subu.ph $t4, $t4, $s5 \n" \ + "subu.ph $t9, $t9, $s5 \n" \ + "subu.ph $t8, $t8, $s5 \n" \ + "subu.ph $t2, $t2, $s5 \n" \ + "subu.ph $t1, $t1, $s5 \n" \ + "shll_s.ph $t5, $t5, 8 \n" \ + "shll_s.ph $t4, $t4, 8 \n" \ + "shll_s.ph $t9, $t9, 8 \n" \ + "shll_s.ph $t8, $t8, 8 \n" \ + "shll_s.ph $t2, $t2, 8 \n" \ + "shll_s.ph $t1, $t1, 8 \n" \ + "shra.ph $t5, $t5, 8 \n" \ + "shra.ph $t4, $t4, 8 \n" \ + "shra.ph $t9, $t9, 8 \n" \ + "shra.ph $t8, $t8, 8 \n" \ + "shra.ph $t2, $t2, 8 \n" \ + "shra.ph $t1, $t1, 8 \n" \ + "addu.ph $t5, $t5, $s5 \n" \ + "addu.ph $t4, $t4, $s5 \n" \ + "addu.ph $t9, $t9, $s5 \n" \ + "addu.ph $t8, $t8, $s5 \n" \ + "addu.ph $t2, $t2, $s5 \n" \ + "addu.ph $t1, $t1, $s5 \n" + +void I422ToARGBRow_MIPS_DSPR2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + "beqz %[width], 2f \n" + " repl.ph $s0, 74 \n" // |YG|YG| = |74|74| + "repl.ph $s1, -25 \n" // |UG|UG| = |-25|-25| + "repl.ph $s2, -52 \n" // |VG|VG| = |-52|-52| + "repl.ph $s3, 102 \n" // |VR|VR| = |102|102| + "repl.ph $s4, 16 \n" // |0|16|0|16| + "repl.ph $s5, 128 \n" // |128|128| // clipping + "lui $s6, 0xff00 \n" + "ori $s6, 0xff00 \n" // |ff|00|ff|00|ff| + "1: \n" + I422ToTransientMipsRGB +// Arranging into argb format + "precr.qb.ph $t4, $t8, $t4 \n" // |G1|g1|B1|b1| + "precr.qb.ph $t5, $t9, $t5 \n" // |G0|g0|B0|b0| + "addiu %[width], -4 \n" + "precrq.qb.ph $t8, $t4, $t5 \n" // |G1|B1|G0|B0| + "precr.qb.ph $t9, $t4, $t5 \n" // |g1|b1|g0|b0| + "precr.qb.ph $t2, $t1, $t2 \n" // |R1|r1|R0|r0| + + "addiu %[y_buf], 4 \n" + "preceu.ph.qbla $t1, $t2 \n" // |0 |R1|0 |R0| + "preceu.ph.qbra $t2, $t2 \n" // |0 |r1|0 |r0| + "or $t1, $t1, $s6 \n" // |ff|R1|ff|R0| + "or $t2, $t2, $s6 \n" // |ff|r1|ff|r0| + "precrq.ph.w $t0, $t2, $t9 \n" // |ff|r1|g1|b1| + "precrq.ph.w $t3, $t1, $t8 \n" // |ff|R1|G1|B1| + "sll $t9, $t9, 16 \n" + "sll $t8, $t8, 16 \n" + "packrl.ph $t2, $t2, $t9 \n" // |ff|r0|g0|b0| + "packrl.ph $t1, $t1, $t8 \n" // |ff|R0|G0|B0| +// Store results. 
+ "sw $t2, 0(%[rgb_buf]) \n" + "sw $t0, 4(%[rgb_buf]) \n" + "sw $t1, 8(%[rgb_buf]) \n" + "sw $t3, 12(%[rgb_buf]) \n" + "bnez %[width], 1b \n" + " addiu %[rgb_buf], 16 \n" + "2: \n" + ".set pop \n" + :[y_buf] "+r" (y_buf), + [u_buf] "+r" (u_buf), + [v_buf] "+r" (v_buf), + [width] "+r" (width), + [rgb_buf] "+r" (rgb_buf) + : + : "t0", "t1", "t2", "t3", "t4", "t5", + "t6", "t7", "t8", "t9", + "s0", "s1", "s2", "s3", + "s4", "s5", "s6" + ); +} + +void I422ToABGRRow_MIPS_DSPR2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + __asm__ __volatile__ ( + ".set push \n\t" + ".set noreorder \n\t" + "beqz %[width], 2f \n\t" + " repl.ph $s0, 74 \n\t" // |YG|YG| = |74|74| + "repl.ph $s1, -25 \n\t" // |UG|UG| = |-25|-25| + "repl.ph $s2, -52 \n\t" // |VG|VG| = |-52|-52| + "repl.ph $s3, 102 \n\t" // |VR|VR| = |102|102| + "repl.ph $s4, 16 \n\t" // |0|16|0|16| + "repl.ph $s5, 128 \n\t" // |128|128| + "lui $s6, 0xff00 \n\t" + "ori $s6, 0xff00 \n\t" // |ff|00|ff|00| + "1: \n" + I422ToTransientMipsRGB +// Arranging into abgr format + "precr.qb.ph $t0, $t8, $t1 \n\t" // |G1|g1|R1|r1| + "precr.qb.ph $t3, $t9, $t2 \n\t" // |G0|g0|R0|r0| + "precrq.qb.ph $t8, $t0, $t3 \n\t" // |G1|R1|G0|R0| + "precr.qb.ph $t9, $t0, $t3 \n\t" // |g1|r1|g0|r0| + + "precr.qb.ph $t2, $t4, $t5 \n\t" // |B1|b1|B0|b0| + "addiu %[width], -4 \n\t" + "addiu %[y_buf], 4 \n\t" + "preceu.ph.qbla $t1, $t2 \n\t" // |0 |B1|0 |B0| + "preceu.ph.qbra $t2, $t2 \n\t" // |0 |b1|0 |b0| + "or $t1, $t1, $s6 \n\t" // |ff|B1|ff|B0| + "or $t2, $t2, $s6 \n\t" // |ff|b1|ff|b0| + "precrq.ph.w $t0, $t2, $t9 \n\t" // |ff|b1|g1|r1| + "precrq.ph.w $t3, $t1, $t8 \n\t" // |ff|B1|G1|R1| + "sll $t9, $t9, 16 \n\t" + "sll $t8, $t8, 16 \n\t" + "packrl.ph $t2, $t2, $t9 \n\t" // |ff|b0|g0|r0| + "packrl.ph $t1, $t1, $t8 \n\t" // |ff|B0|G0|R0| +// Store results. 
+ "sw $t2, 0(%[rgb_buf]) \n\t" + "sw $t0, 4(%[rgb_buf]) \n\t" + "sw $t1, 8(%[rgb_buf]) \n\t" + "sw $t3, 12(%[rgb_buf]) \n\t" + "bnez %[width], 1b \n\t" + " addiu %[rgb_buf], 16 \n\t" + "2: \n\t" + ".set pop \n\t" + :[y_buf] "+r" (y_buf), + [u_buf] "+r" (u_buf), + [v_buf] "+r" (v_buf), + [width] "+r" (width), + [rgb_buf] "+r" (rgb_buf) + : + : "t0", "t1", "t2", "t3", "t4", "t5", + "t6", "t7", "t8", "t9", + "s0", "s1", "s2", "s3", + "s4", "s5", "s6" + ); +} + +void I422ToBGRARow_MIPS_DSPR2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + "beqz %[width], 2f \n" + " repl.ph $s0, 74 \n" // |YG|YG| = |74 |74 | + "repl.ph $s1, -25 \n" // |UG|UG| = |-25|-25| + "repl.ph $s2, -52 \n" // |VG|VG| = |-52|-52| + "repl.ph $s3, 102 \n" // |VR|VR| = |102|102| + "repl.ph $s4, 16 \n" // |0|16|0|16| + "repl.ph $s5, 128 \n" // |128|128| + "lui $s6, 0xff \n" + "ori $s6, 0xff \n" // |00|ff|00|ff| + "1: \n" + I422ToTransientMipsRGB + // Arranging into bgra format + "precr.qb.ph $t4, $t4, $t8 \n" // |B1|b1|G1|g1| + "precr.qb.ph $t5, $t5, $t9 \n" // |B0|b0|G0|g0| + "precrq.qb.ph $t8, $t4, $t5 \n" // |B1|G1|B0|G0| + "precr.qb.ph $t9, $t4, $t5 \n" // |b1|g1|b0|g0| + + "precr.qb.ph $t2, $t1, $t2 \n" // |R1|r1|R0|r0| + "addiu %[width], -4 \n" + "addiu %[y_buf], 4 \n" + "preceu.ph.qbla $t1, $t2 \n" // |0 |R1|0 |R0| + "preceu.ph.qbra $t2, $t2 \n" // |0 |r1|0 |r0| + "sll $t1, $t1, 8 \n" // |R1|0 |R0|0 | + "sll $t2, $t2, 8 \n" // |r1|0 |r0|0 | + "or $t1, $t1, $s6 \n" // |R1|ff|R0|ff| + "or $t2, $t2, $s6 \n" // |r1|ff|r0|ff| + "precrq.ph.w $t0, $t9, $t2 \n" // |b1|g1|r1|ff| + "precrq.ph.w $t3, $t8, $t1 \n" // |B1|G1|R1|ff| + "sll $t1, $t1, 16 \n" + "sll $t2, $t2, 16 \n" + "packrl.ph $t2, $t9, $t2 \n" // |b0|g0|r0|ff| + "packrl.ph $t1, $t8, $t1 \n" // |B0|G0|R0|ff| +// Store results. 
+ "sw $t2, 0(%[rgb_buf]) \n" + "sw $t0, 4(%[rgb_buf]) \n" + "sw $t1, 8(%[rgb_buf]) \n" + "sw $t3, 12(%[rgb_buf]) \n" + "bnez %[width], 1b \n" + " addiu %[rgb_buf], 16 \n" + "2: \n" + ".set pop \n" + :[y_buf] "+r" (y_buf), + [u_buf] "+r" (u_buf), + [v_buf] "+r" (v_buf), + [width] "+r" (width), + [rgb_buf] "+r" (rgb_buf) + : + : "t0", "t1", "t2", "t3", "t4", "t5", + "t6", "t7", "t8", "t9", + "s0", "s1", "s2", "s3", + "s4", "s5", "s6" + ); +} + #endif // __mips__ #ifdef __cplusplus diff --git a/source/scale.cc b/source/scale.cc index a7732c4bc..1793b6f19 100644 --- a/source/scale.cc +++ b/source/scale.cc @@ -1957,6 +1957,26 @@ void ScaleFilterRows_MIPS_DSPR2(unsigned char *dst_ptr, const unsigned char* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction); +#define HAS_SCALEROWDOWN4_MIPS_DSPR2 +void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */, + uint8* dst, int dst_width); +void ScaleRowDown4Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +#define HAS_SCALEROWDOWN34_MIPS_DSPR2 +void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */, + uint8* dst, int dst_width); +void ScaleRowDown34_0_Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* d, int dst_width); +void ScaleRowDown34_1_Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* d, int dst_width); +#define HAS_SCALEROWDOWN38_MIPS_DSPR2 +void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */, + uint8* dst, int dst_width); +void ScaleRowDown38_2_Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown38_3_Int_MIPS_DSPR2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); #endif // defined(__mips_dsp) && (__mips_dsp_rev >= 2) // CPU agnostic row functions @@ -2331,7 +2351,7 @@ static void ScalePlaneDown2(int /* src_width */, int /* src_height */, IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { ScaleRowDown2 = filtering ? - ScaleRowDown2Int_MIPS_DSPR2 : ScaleRowDown2_MIPS_DSPR2; + ScaleRowDown2Int_MIPS_DSPR2 : ScaleRowDown2_MIPS_DSPR2; } #endif @@ -2368,6 +2388,13 @@ static void ScalePlaneDown4(int /* src_width */, int /* src_height */, IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { ScaleRowDown4 = filtering ? ScaleRowDown4Int_SSE2 : ScaleRowDown4_SSE2; } +#elif defined(HAS_SCALEROWDOWN4_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && + IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && + IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { + ScaleRowDown4 = filtering ? 
+ ScaleRowDown4Int_MIPS_DSPR2 : ScaleRowDown4_MIPS_DSPR2; + } #endif for (int y = 0; y < dst_height; ++y) { @@ -2461,6 +2488,19 @@ static void ScalePlaneDown34(int /* src_width */, int /* src_height */, } } #endif +#if defined(HAS_SCALEROWDOWN34_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 24 == 0) && + IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && + IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_MIPS_DSPR2; + ScaleRowDown34_1 = ScaleRowDown34_MIPS_DSPR2; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Int_MIPS_DSPR2; + ScaleRowDown34_1 = ScaleRowDown34_1_Int_MIPS_DSPR2; + } + } +#endif for (int y = 0; y < dst_height - 2; y += 3) { ScaleRowDown34_0(src_ptr, src_stride, dst_ptr, dst_width); @@ -2541,6 +2581,18 @@ static void ScalePlaneDown38(int /* src_width */, int /* src_height */, ScaleRowDown38_2 = ScaleRowDown38_2_Int_SSSE3; } } +#elif defined(HAS_SCALEROWDOWN38_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 12 == 0) && + IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && + IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_MIPS_DSPR2; + ScaleRowDown38_2 = ScaleRowDown38_MIPS_DSPR2; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Int_MIPS_DSPR2; + ScaleRowDown38_2 = ScaleRowDown38_2_Int_MIPS_DSPR2; + } + } #endif for (int y = 0; y < dst_height - 2; y += 3) { diff --git a/source/scale_mips.cc b/source/scale_mips.cc index ce7241662..8f380d475 100644 --- a/source/scale_mips.cc +++ b/source/scale_mips.cc @@ -173,6 +173,460 @@ void ScaleRowDown2Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, ); } +void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */, + uint8* dst, int dst_width) { + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + + "srl $t9, %[dst_width], 3 \n" + "beqz $t9, 2f \n" + " nop \n" + + "1: \n" + "lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0| + "lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4| + "lw $t3, 8(%[src_ptr]) \n" // |11|10|9|8| + "lw $t4, 12(%[src_ptr]) \n" // |15|14|13|12| + "lw $t5, 16(%[src_ptr]) \n" // |19|18|17|16| + "lw $t6, 20(%[src_ptr]) \n" // |23|22|21|20| + "lw $t7, 24(%[src_ptr]) \n" // |27|26|25|24| + "lw $t8, 28(%[src_ptr]) \n" // |31|30|29|28| + "precr.qb.ph $t1, $t2, $t1 \n" // |6|4|2|0| + "precr.qb.ph $t2, $t4, $t3 \n" // |14|12|10|8| + "precr.qb.ph $t5, $t6, $t5 \n" // |22|20|18|16| + "precr.qb.ph $t6, $t8, $t7 \n" // |30|28|26|24| + "precr.qb.ph $t1, $t2, $t1 \n" // |12|8|4|0| + "precr.qb.ph $t5, $t6, $t5 \n" // |28|24|20|16| + "addiu %[src_ptr], %[src_ptr], 32 \n" + "addiu $t9, $t9, -1 \n" + "sw $t1, 0(%[dst]) \n" + "sw $t5, 4(%[dst]) \n" + "bgtz $t9, 1b \n" + " addiu %[dst], %[dst], 8 \n" + + "2: \n" + "andi $t9, %[dst_width], 7 \n" // residue + "beqz $t9, 3f \n" + " nop \n" + + "21: \n" + "lbu $t1, 0(%[src_ptr]) \n" + "addiu %[src_ptr], %[src_ptr], 4 \n" + "addiu $t9, $t9, -1 \n" + "sb $t1, 0(%[dst]) \n" + "bgtz $t9, 21b \n" + " addiu %[dst], %[dst], 1 \n" + + "3: \n" + ".set pop \n" + : [src_ptr] "+r" (src_ptr), + [dst] "+r" (dst) + : [dst_width] "r" (dst_width) + : "t1", "t2", "t3", "t4", "t5", + "t6", "t7", "t8", "t9" + ); +} + +void ScaleRowDown4Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + intptr_t stride = src_stride; + const uint8* s1 = src_ptr + stride; + const uint8* s2 = s1 + stride; + const uint8* s3 = s2 + stride; + + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + + 
"srl $t9, %[dst_width], 1 \n" + "andi $t8, %[dst_width], 1 \n" + + "1: \n" + "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| + "lw $t1, 0(%[s1]) \n" // |7|6|5|4| + "lw $t2, 0(%[s2]) \n" // |11|10|9|8| + "lw $t3, 0(%[s3]) \n" // |15|14|13|12| + "lw $t4, 4(%[src_ptr]) \n" // |19|18|17|16| + "lw $t5, 4(%[s1]) \n" // |23|22|21|20| + "lw $t6, 4(%[s2]) \n" // |27|26|25|24| + "lw $t7, 4(%[s3]) \n" // |31|30|29|28| + "raddu.w.qb $t0, $t0 \n" // |3 + 2 + 1 + 0| + "raddu.w.qb $t1, $t1 \n" // |7 + 6 + 5 + 4| + "raddu.w.qb $t2, $t2 \n" // |11 + 10 + 9 + 8| + "raddu.w.qb $t3, $t3 \n" // |15 + 14 + 13 + 12| + "raddu.w.qb $t4, $t4 \n" // |19 + 18 + 17 + 16| + "raddu.w.qb $t5, $t5 \n" // |23 + 22 + 21 + 20| + "raddu.w.qb $t6, $t6 \n" // |27 + 26 + 25 + 24| + "raddu.w.qb $t7, $t7 \n" // |31 + 30 + 29 + 28| + "add $t0, $t0, $t1 \n" + "add $t1, $t2, $t3 \n" + "add $t0, $t0, $t1 \n" + "add $t4, $t4, $t5 \n" + "add $t6, $t6, $t7 \n" + "add $t4, $t4, $t6 \n" + "shra_r.w $t0, $t0, 4 \n" + "shra_r.w $t4, $t4, 4 \n" + "sb $t0, 0(%[dst]) \n" + "sb $t4, 1(%[dst]) \n" + "addiu %[src_ptr], %[src_ptr], 8 \n" + "addiu %[s1], %[s1], 8 \n" + "addiu %[s2], %[s2], 8 \n" + "addiu %[s3], %[s3], 8 \n" + "addiu $t9, $t9, -1 \n" + "bgtz $t9, 1b \n" + " addiu %[dst], %[dst], 2 \n" + "beqz $t8, 2f \n" + " nop \n" + + "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| + "lw $t1, 0(%[s1]) \n" // |7|6|5|4| + "lw $t2, 0(%[s2]) \n" // |11|10|9|8| + "lw $t3, 0(%[s3]) \n" // |15|14|13|12| + "raddu.w.qb $t0, $t0 \n" // |3 + 2 + 1 + 0| + "raddu.w.qb $t1, $t1 \n" // |7 + 6 + 5 + 4| + "raddu.w.qb $t2, $t2 \n" // |11 + 10 + 9 + 8| + "raddu.w.qb $t3, $t3 \n" // |15 + 14 + 13 + 12| + "add $t0, $t0, $t1 \n" + "add $t1, $t2, $t3 \n" + "add $t0, $t0, $t1 \n" + "shra_r.w $t0, $t0, 4 \n" + "sb $t0, 0(%[dst]) \n" + + "2: \n" + ".set pop \n" + + : [src_ptr] "+r" (src_ptr), + [dst] "+r" (dst), + [s1] "+r" (s1), + [s2] "+r" (s2), + [s3] "+r" (s3) + : [dst_width] "r" (dst_width) + : "t0", "t1", "t2", "t3", "t4", "t5", + "t6","t7", "t8", "t9" + ); +} + +void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */, + uint8* dst, int dst_width) { + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + "1: \n" + "lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0| + "lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4| + "lw $t3, 8(%[src_ptr]) \n" // |11|10|9|8| + "lw $t4, 12(%[src_ptr]) \n" // |15|14|13|12| + "lw $t5, 16(%[src_ptr]) \n" // |19|18|17|16| + "lw $t6, 20(%[src_ptr]) \n" // |23|22|21|20| + "lw $t7, 24(%[src_ptr]) \n" // |27|26|25|24| + "lw $t8, 28(%[src_ptr]) \n" // |31|30|29|28| + "precrq.qb.ph $t0, $t2, $t4 \n" // |7|5|15|13| + "precrq.qb.ph $t9, $t6, $t8 \n" // |23|21|31|30| + "addiu %[dst_width], %[dst_width], -24 \n" + "ins $t1, $t1, 8, 16 \n" // |3|1|0|X| + "ins $t4, $t0, 8, 16 \n" // |X|15|13|12| + "ins $t5, $t5, 8, 16 \n" // |19|17|16|X| + "ins $t8, $t9, 8, 16 \n" // |X|31|29|28| + "addiu %[src_ptr], %[src_ptr], 32 \n" + "packrl.ph $t0, $t3, $t0 \n" // |9|8|7|5| + "packrl.ph $t9, $t7, $t9 \n" // |25|24|23|21| + "prepend $t1, $t2, 8 \n" // |4|3|1|0| + "prepend $t3, $t4, 24 \n" // |15|13|12|11| + "prepend $t5, $t6, 8 \n" // |20|19|17|16| + "prepend $t7, $t8, 24 \n" // |31|29|28|27| + "sw $t1, 0(%[dst]) \n" + "sw $t0, 4(%[dst]) \n" + "sw $t3, 8(%[dst]) \n" + "sw $t5, 12(%[dst]) \n" + "sw $t9, 16(%[dst]) \n" + "sw $t7, 20(%[dst]) \n" + "bnez %[dst_width], 1b \n" + " addiu %[dst], %[dst], 24 \n" + ".set pop \n" + : [src_ptr] "+r" (src_ptr), + [dst] "+r" (dst), + [dst_width] "+r" (dst_width) + : + : "t0", "t1", "t2", "t3", "t4", "t5", + "t6","t7", "t8", "t9" + 
); +} + +void ScaleRowDown34_0_Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* d, int dst_width) { + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + "repl.ph $t3, 3 \n" // 0x00030003 + "1: \n" + "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0| + "lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0| + "rotr $t2, $t0, 8 \n" // |S0|S3|S2|S1| + "rotr $t6, $t1, 8 \n" // |T0|T3|T2|T1| + "muleu_s.ph.qbl $t4, $t2, $t3 \n" // |S0*3|S3*3| + "muleu_s.ph.qbl $t5, $t6, $t3 \n" // |T0*3|T3*3| + "andi $t0, $t2, 0xFFFF \n" // |0|0|S2|S1| + "andi $t1, $t6, 0xFFFF \n" // |0|0|T2|T1| + "raddu.w.qb $t0, $t0 \n" + "raddu.w.qb $t1, $t1 \n" + "shra_r.w $t0, $t0, 1 \n" + "shra_r.w $t1, $t1, 1 \n" + "preceu.ph.qbr $t2, $t2 \n" // |0|S2|0|S1| + "preceu.ph.qbr $t6, $t6 \n" // |0|T2|0|T1| + "rotr $t2, $t2, 16 \n" // |0|S1|0|S2| + "rotr $t6, $t6, 16 \n" // |0|T1|0|T2| + "addu.ph $t2, $t2, $t4 \n" + "addu.ph $t6, $t6, $t5 \n" + "sll $t5, $t0, 1 \n" + "add $t0, $t5, $t0 \n" + "shra_r.ph $t2, $t2, 2 \n" + "shra_r.ph $t6, $t6, 2 \n" + "shll.ph $t4, $t2, 1 \n" + "addq.ph $t4, $t4, $t2 \n" + "addu $t0, $t0, $t1 \n" + "addiu %[src_ptr], %[src_ptr], 4 \n" + "shra_r.w $t0, $t0, 2 \n" + "addu.ph $t6, $t6, $t4 \n" + "shra_r.ph $t6, $t6, 2 \n" + "srl $t1, $t6, 16 \n" + "addiu %[dst_width], %[dst_width], -3 \n" + "sb $t1, 0(%[d]) \n" + "sb $t0, 1(%[d]) \n" + "sb $t6, 2(%[d]) \n" + "bgtz %[dst_width], 1b \n" + " addiu %[d], %[d], 3 \n" + "3: \n" + ".set pop \n" + : [src_ptr] "+r" (src_ptr), + [src_stride] "+r" (src_stride), + [d] "+r" (d), + [dst_width] "+r" (dst_width) + : + : "t0", "t1", "t2", "t3", + "t4", "t5", "t6" + ); +} + +void ScaleRowDown34_1_Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* d, int dst_width) { + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + "repl.ph $t2, 3 \n" // 0x00030003 + "1: \n" + "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0| + "lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0| + "rotr $t4, $t0, 8 \n" // |S0|S3|S2|S1| + "rotr $t6, $t1, 8 \n" // |T0|T3|T2|T1| + "muleu_s.ph.qbl $t3, $t4, $t2 \n" // |S0*3|S3*3| + "muleu_s.ph.qbl $t5, $t6, $t2 \n" // |T0*3|T3*3| + "andi $t0, $t4, 0xFFFF \n" // |0|0|S2|S1| + "andi $t1, $t6, 0xFFFF \n" // |0|0|T2|T1| + "raddu.w.qb $t0, $t0 \n" + "raddu.w.qb $t1, $t1 \n" + "shra_r.w $t0, $t0, 1 \n" + "shra_r.w $t1, $t1, 1 \n" + "preceu.ph.qbr $t4, $t4 \n" // |0|S2|0|S1| + "preceu.ph.qbr $t6, $t6 \n" // |0|T2|0|T1| + "rotr $t4, $t4, 16 \n" // |0|S1|0|S2| + "rotr $t6, $t6, 16 \n" // |0|T1|0|T2| + "addu.ph $t4, $t4, $t3 \n" + "addu.ph $t6, $t6, $t5 \n" + "shra_r.ph $t6, $t6, 2 \n" + "shra_r.ph $t4, $t4, 2 \n" + "addu.ph $t6, $t6, $t4 \n" + "addiu %[src_ptr], %[src_ptr], 4 \n" + "shra_r.ph $t6, $t6, 1 \n" + "addu $t0, $t0, $t1 \n" + "addiu %[dst_width], %[dst_width], -3 \n" + "shra_r.w $t0, $t0, 1 \n" + "srl $t1, $t6, 16 \n" + "sb $t1, 0(%[d]) \n" + "sb $t0, 1(%[d]) \n" + "sb $t6, 2(%[d]) \n" + "bgtz %[dst_width], 1b \n" + " addiu %[d], %[d], 3 \n" + "3: \n" + ".set pop \n" + : [src_ptr] "+r" (src_ptr), + [src_stride] "+r" (src_stride), + [d] "+r" (d), + [dst_width] "+r" (dst_width) + : + : "t0", "t1", "t2", "t3", + "t4", "t5", "t6" + ); +} + +void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */, + uint8* dst, int dst_width) { + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + "1: \n" + "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| + "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4| + "lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8| + "lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12| + "lw 
$t4, 16(%[src_ptr]) \n" // |19|18|17|16| + "lw $t5, 20(%[src_ptr]) \n" // |23|22|21|20| + "lw $t6, 24(%[src_ptr]) \n" // |27|26|25|24| + "lw $t7, 28(%[src_ptr]) \n" // |31|30|29|28| + "wsbh $t0, $t0 \n" // |2|3|0|1| + "wsbh $t6, $t6 \n" // |26|27|24|25| + "srl $t0, $t0, 8 \n" // |X|2|3|0| + "srl $t3, $t3, 16 \n" // |X|X|15|14| + "srl $t5, $t5, 16 \n" // |X|X|23|22| + "srl $t7, $t7, 16 \n" // |X|X|31|30| + "ins $t1, $t2, 24, 8 \n" // |8|6|5|4| + "ins $t6, $t5, 0, 8 \n" // |26|27|24|22| + "ins $t1, $t0, 0, 16 \n" // |8|6|3|0| + "ins $t6, $t7, 24, 8 \n" // |30|27|24|22| + "prepend $t2, $t3, 24 \n" // |X|15|14|11| + "ins $t4, $t4, 16, 8 \n" // |19|16|17|X| + "ins $t4, $t2, 0, 16 \n" // |19|16|14|11| + "addiu %[src_ptr], %[src_ptr], 32 \n" + "addiu %[dst_width], %[dst_width], -12 \n" + "addiu $t8,%[dst_width], -12 \n" + "sw $t1, 0(%[dst]) \n" + "sw $t4, 4(%[dst]) \n" + "sw $t6, 8(%[dst]) \n" + "bgez $t8, 1b \n" + " addiu %[dst], %[dst], 12 \n" + ".set pop \n" + : [src_ptr] "+r" (src_ptr), + [dst] "+r" (dst), + [dst_width] "+r" (dst_width) + : + : "t0", "t1", "t2", "t3", "t4", + "t5", "t6", "t7", "t8" + ); +} + +void ScaleRowDown38_2_Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + intptr_t stride = src_stride; + const uint8* t = src_ptr + stride; + const int c = 0x2AAA; + + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + "1: \n" + "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0| + "lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4| + "lw $t2, 0(%[t]) \n" // |T3|T2|T1|T0| + "lw $t3, 4(%[t]) \n" // |T7|T6|T5|T4| + "rotr $t1, $t1, 16 \n" // |S5|S4|S7|S6| + "packrl.ph $t4, $t1, $t3 \n" // |S7|S6|T7|T6| + "packrl.ph $t5, $t3, $t1 \n" // |T5|T4|S5|S4| + "raddu.w.qb $t4, $t4 \n" // S7+S6+T7+T6 + "raddu.w.qb $t5, $t5 \n" // T5+T4+S5+S4 + "precrq.qb.ph $t6, $t0, $t2 \n" // |S3|S1|T3|T1| + "precrq.qb.ph $t6, $t6, $t6 \n" // |S3|T3|S3|T3| + "srl $t4, $t4, 2 \n" // t4 / 4 + "srl $t6, $t6, 16 \n" // |0|0|S3|T3| + "raddu.w.qb $t6, $t6 \n" // 0+0+S3+T3 + "addu $t6, $t5, $t6 \n" + "mul $t6, $t6, %[c] \n" // t6 * 0x2AAA + "sll $t0, $t0, 8 \n" // |S2|S1|S0|0| + "sll $t2, $t2, 8 \n" // |T2|T1|T0|0| + "raddu.w.qb $t0, $t0 \n" // S2+S1+S0+0 + "raddu.w.qb $t2, $t2 \n" // T2+T1+T0+0 + "addu $t0, $t0, $t2 \n" + "mul $t0, $t0, %[c] \n" // t0 * 0x2AAA + "addiu %[src_ptr], %[src_ptr], 8 \n" + "addiu %[t], %[t], 8 \n" + "addiu %[dst_width], %[dst_width], -3 \n" + "addiu %[dst_ptr], %[dst_ptr], 3 \n" + "srl $t6, $t6, 16 \n" + "srl $t0, $t0, 16 \n" + "sb $t4, -1(%[dst_ptr]) \n" + "sb $t6, -2(%[dst_ptr]) \n" + "bgtz %[dst_width], 1b \n" + " sb $t0, -3(%[dst_ptr]) \n" + ".set pop \n" + : [src_ptr] "+r" (src_ptr), + [dst_ptr] "+r" (dst_ptr), + [t] "+r" (t), + [dst_width] "+r" (dst_width) + : [c] "r" (c) + : "t0", "t1", "t2", "t3", "t4", "t5", "t6" + ); +} + +void ScaleRowDown38_3_Int_MIPS_DSPR2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + intptr_t stride = src_stride; + const uint8* s1 = src_ptr + stride; + stride += stride; + const uint8* s2 = src_ptr + stride; + const int c1 = 0x1C71; + const int c2 = 0x2AAA; + + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + "1: \n" + "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0| + "lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4| + "lw $t2, 0(%[s1]) \n" // |T3|T2|T1|T0| + "lw $t3, 4(%[s1]) \n" // |T7|T6|T5|T4| + "lw $t4, 0(%[s2]) \n" // |R3|R2|R1|R0| + "lw $t5, 4(%[s2]) \n" // |R7|R6|R5|R4| + "rotr $t1, $t1, 16 \n" // |S5|S4|S7|S6| + "packrl.ph $t6, $t1, $t3 \n" // |S7|S6|T7|T6| + "raddu.w.qb 
$t6, $t6 \n" // S7+S6+T7+T6 + "packrl.ph $t7, $t3, $t1 \n" // |T5|T4|S5|S4| + "raddu.w.qb $t7, $t7 \n" // T5+T4+S5+S4 + "sll $t8, $t5, 16 \n" // |R5|R4|0|0| + "raddu.w.qb $t8, $t8 \n" // R5+R4 + "addu $t7, $t7, $t8 \n" + "srl $t8, $t5, 16 \n" // |0|0|R7|R6| + "raddu.w.qb $t8, $t8 \n" // R7 + R6 + "addu $t6, $t6, $t8 \n" + "mul $t6, $t6, %[c2] \n" // t6 * 0x2AAA + "precrq.qb.ph $t8, $t0, $t2 \n" // |S3|S1|T3|T1| + "precrq.qb.ph $t8, $t8, $t4 \n" // |S3|T3|R3|R1| + "srl $t8, $t8, 8 \n" // |0|S3|T3|R3| + "raddu.w.qb $t8, $t8 \n" // S3 + T3 + R3 + "addu $t7, $t7, $t8 \n" + "mul $t7, $t7, %[c1] \n" // t7 * 0x1C71 + "sll $t0, $t0, 8 \n" // |S2|S1|S0|0| + "sll $t2, $t2, 8 \n" // |T2|T1|T0|0| + "sll $t4, $t4, 8 \n" // |R2|R1|R0|0| + "raddu.w.qb $t0, $t0 \n" + "raddu.w.qb $t2, $t2 \n" + "raddu.w.qb $t4, $t4 \n" + "addu $t0, $t0, $t2 \n" + "addu $t0, $t0, $t4 \n" + "mul $t0, $t0, %[c1] \n" // t0 * 0x1C71 + "addiu %[src_ptr], %[src_ptr], 8 \n" + "addiu %[s1], %[s1], 8 \n" + "addiu %[s2], %[s2], 8 \n" + "addiu %[dst_width], %[dst_width], -3 \n" + "addiu %[dst_ptr], %[dst_ptr], 3 \n" + "srl $t6, $t6, 16 \n" + "srl $t7, $t7, 16 \n" + "srl $t0, $t0, 16 \n" + "sb $t6, -1(%[dst_ptr]) \n" + "sb $t7, -2(%[dst_ptr]) \n" + "bgtz %[dst_width], 1b \n" + " sb $t0, -3(%[dst_ptr]) \n" + ".set pop \n" + : [src_ptr] "+r" (src_ptr), + [dst_ptr] "+r" (dst_ptr), + [s1] "+r" (s1), + [s2] "+r" (s2), + [dst_width] "+r" (dst_width) + : [c1] "r" (c1), [c2] "r" (c2) + : "t0", "t1", "t2", "t3", "t4", + "t5", "t6", "t7", "t8" + ); +} + void ScaleFilterRows_MIPS_DSPR2(unsigned char *dst_ptr, const unsigned char* src_ptr, ptrdiff_t src_stride, diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc index a13fcbabf..7ed9bd0fa 100644 --- a/unit_test/planar_test.cc +++ b/unit_test/planar_test.cc @@ -630,4 +630,70 @@ TEST_F(libyuvTest, TestAffine) { #endif } +TEST_F(libyuvTest, TestCopyPlane) { + int err = 0; + int yw = benchmark_width_; + int yh = benchmark_height_; + int b = 12; + int i, j; + + int y_plane_size = (yw + b * 2) * (yh + b * 2); + srandom(time(NULL)); + align_buffer_16(orig_y, y_plane_size) + align_buffer_16(dst_c, y_plane_size) + align_buffer_16(dst_opt, y_plane_size); + + memset(orig_y, 0, y_plane_size); + memset(dst_c, 0, y_plane_size); + memset(dst_opt, 0, y_plane_size); + + // Fill image buffers with random data. + for (i = b; i < (yh + b); ++i) { + for (j = b; j < (yw + b); ++j) { + orig_y[i * (yw + b * 2) + j] = random() & 0xff; + } + } + + // Fill destination buffers with random data. + for (i = 0; i < y_plane_size; ++i) { + uint8 random_number = random() & 0x7f; + dst_c[i] = random_number; + dst_opt[i] = dst_c[i]; + } + + int y_off = b * (yw + b * 2) + b; + + int y_st = yw + b * 2; + int stride = 8; + + // Disable all optimizations. + MaskCpuFlags(0); + double c_time = get_time(); + for (j = 0; j < benchmark_iterations_; j++) { + CopyPlane(orig_y + y_off, y_st, dst_c + y_off, stride, yw, yh); + } + c_time = (get_time() - c_time) / benchmark_iterations_; + + // Enable optimizations. 
+ MaskCpuFlags(-1); + double opt_time = get_time(); + for (j = 0; j < benchmark_iterations_; j++) { + CopyPlane(orig_y + y_off, y_st, dst_opt + y_off, stride, yw, yh); + } + opt_time = (get_time() - opt_time) / benchmark_iterations_; + printf(" %8d us C - %8d us OPT\n", + static_cast<int>(c_time * 1e6), static_cast<int>(opt_time * 1e6)); + + for (i = 0; i < y_plane_size; ++i) { + if (dst_c[i] != dst_opt[i]) + ++err; + } + + free_aligned_buffer_16(orig_y) + free_aligned_buffer_16(dst_c) + free_aligned_buffer_16(dst_opt) + + EXPECT_EQ(0, err); +} + } // namespace libyuv
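The I422To*Row_MIPS_DSPR2 functions in row_mips.cc pack the results of a 6-bit fixed-point YUV-to-RGB conversion: the repl.ph constants visible above (YG 74, UG -25, VG -52, VR 102, a 16 luma bias and a 128 chroma bias) are the usual BT.601 weights scaled by 64, and the 0x00ff00ff mask in $s6 forces alpha to 0xff. A minimal scalar sketch of that per-pixel arithmetic, assuming the >> 6 and the saturation happen inside the shared I422ToTransientMipsRGB macro; the U-to-blue coefficient is defined in that macro rather than in this hunk, so it is left as a parameter (ub) in the sketch:

#include <stdint.h>

static inline uint8_t Clamp255(int v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

// One pixel of the fixed-point conversion set up by the repl.ph constants.
// b/g/r are returned separately; the asm then shuffles them (precr/packrl)
// into the destination byte order and ORs in the 0xff alpha from $s6.
static void YuvPixelSketch(uint8_t y, uint8_t u, uint8_t v, int ub,
                           uint8_t* b, uint8_t* g, uint8_t* r) {
  int y1 = ((int)y - 16) * 74;   // repl.ph $s0, 74 and $s4, 16
  int du = (int)u - 128;         // repl.ph $s5, 128
  int dv = (int)v - 128;
  *b = Clamp255((y1 + du * ub) >> 6);
  *g = Clamp255((y1 + du * (-25) + dv * (-52)) >> 6);  // $s1, $s2
  *r = Clamp255((y1 + dv * 102) >> 6);                 // $s3
}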
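ScaleRowDown4_MIPS_DSPR2 reads 32 source bytes per iteration and keeps every fourth byte with two rounds of precr.qb.ph, while the Int variant box-filters a 4x4 block and rounds (shra_r.w by 4 is (sum + 8) >> 4). A minimal portable sketch of the same arithmetic for reference; the _Ref names are illustrative only, and the unfiltered path keeps the first pixel of each group of four, as the residue loop above does:

#include <stddef.h>
#include <stdint.h>

// 1/4 horizontal subsample: keep byte 0 of every group of 4.
static void ScaleRowDown4_Ref(const uint8_t* src_ptr, ptrdiff_t /* src_stride */,
                              uint8_t* dst, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    dst[x] = src_ptr[x * 4];
  }
}

// 1/4 in both directions: rounded average of each 4x4 block of source pixels.
static void ScaleRowDown4Int_Ref(const uint8_t* src_ptr, ptrdiff_t src_stride,
                                 uint8_t* dst, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    const uint8_t* s = src_ptr + x * 4;
    uint32_t sum = 0;
    for (int row = 0; row < 4; ++row) {
      for (int col = 0; col < 4; ++col) {
        sum += s[row * src_stride + col];
      }
    }
    dst[x] = (uint8_t)((sum + 8) >> 4);  // same rounding as shra_r.w ..., 4
  }
}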
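ScaleRowDown38_2_Int_MIPS_DSPR2 and ScaleRowDown38_3_Int_MIPS_DSPR2 avoid integer division by multiplying small pixel sums by 0x2AAA (about 65536/6, for the 2x3 and 3x2 sums) or 0x1C71 (about 65536/9, for the 3x3 sums) and taking the high 16 bits; for the sums that occur here (at most 9 * 255) the result matches the exact quotient to within one. A small standalone sketch of that multiply-high trick:

#include <stdint.h>
#include <stdio.h>

// (sum * k) >> 16 with k ~= 65536 / N approximates sum / N without a divide.
static inline uint8_t MulHiDiv(unsigned sum, unsigned k) {
  return (uint8_t)((sum * k) >> 16);
}

int main(void) {
  const unsigned kDiv6 = 0x2AAA;  // ~65536 / 6
  const unsigned kDiv9 = 0x1C71;  // ~65536 / 9
  for (unsigned sum = 0; sum <= 9 * 255; sum += 255) {
    printf("sum=%4u  /6 exact=%3u approx=%3u   /9 exact=%3u approx=%3u\n",
           sum, sum / 6, MulHiDiv(sum, kDiv6), sum / 9, MulHiDiv(sum, kDiv9));
  }
  return 0;
}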