mirror of https://chromium.googlesource.com/libyuv/libyuv

commit 6c1b2d38c6 (parent 1f399dfaf8)

Mips port of libyuv. Includes functionality for convert, rotate, scale and memcpy.

BUG=126
TESTED=tested by mips
Review URL: https://webrtc-codereview.appspot.com/930005

git-svn-id: http://libyuv.googlecode.com/svn/trunk@449 16f28f9a-4ce2-e073-06de-1de4eb20be90
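Note: the new MIPS paths below are wired in with libyuv's usual runtime-dispatch pattern. A row-function pointer starts at the portable C version and is swapped for the DSPR2 kernel only when the CPU flag is set and the pointers and strides meet the kernel's alignment requirements. A minimal sketch of the pattern (the surrounding function is illustrative; the real call sites follow in the diff):

void (*I422ToARGBRow)(const uint8* y_buf, const uint8* u_buf,
                      const uint8* v_buf, uint8* rgb_buf, int width) =
    I422ToARGBRow_C;  // portable fallback
#if defined(HAS_I422TOARGBROW_MIPS_DSPR2)
// Upgrade only when the DSPR2 flag is present and the buffers are aligned.
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
    IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4)) {
  I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
}
#endif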
@@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 447
Version: 449
License: BSD
License File: LICENSE


@@ -19,9 +19,11 @@
#include "libyuv/convert_from_argb.h"
#include "libyuv/cpu_id.h"
#include "libyuv/format_conversion.h"
#include "libyuv/mjpeg_decoder.h"
#include "libyuv/planar_functions.h"
#include "libyuv/rotate.h"
#include "libyuv/rotate_argb.h"
#include "libyuv/row.h"
#include "libyuv/scale.h"
#include "libyuv/scale_argb.h"
#include "libyuv/version.h"

@@ -175,8 +175,14 @@ extern "C" {

// The following are available on Mips platforms
#if !defined(YUV_DISABLE_ASM) && defined(__mips__)
#define HAS_COPYROW_MIPS
#if defined(__mips_dsp) && (__mips_dsp_rev >= 2)
#define HAS_SPLITUV_MIPS_DSPR2
#define HAS_MIRRORROW_MIPS_DSPR2
#define HAS_MIRRORROWUV_MIPS_DSPR2
#define HAS_I422TOARGBROW_MIPS_DSPR2
#define HAS_I422TOBGRAROW_MIPS_DSPR2
#define HAS_I422TOABGRROW_MIPS_DSPR2
#endif
#endif

@@ -282,6 +288,9 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix);
void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width);
void MirrorRow_SSE2(const uint8* src, uint8* dst, int width);
void MirrorRow_NEON(const uint8* src, uint8* dst, int width);
void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width);
void MirrorRowUV_MIPS_DSPR2(const uint8* src, uint8* dst_u, uint8* dst_v,
int width);
void MirrorRow_C(const uint8* src, uint8* dst, int width);

void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, int width);
@@ -321,6 +330,7 @@ void MergeUV_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
void CopyRow_SSE2(const uint8* src, uint8* dst, int count);
void CopyRow_X86(const uint8* src, uint8* dst, int count);
void CopyRow_NEON(const uint8* src, uint8* dst, int count);
void CopyRow_MIPS(const uint8* src, uint8* dst, int count);
void CopyRow_C(const uint8* src, uint8* dst, int count);

void SetRow8_X86(uint8* dst, uint32 v32, int count);
@@ -694,6 +704,21 @@ void NV21ToARGBRow_Any_NEON(const uint8* y_buf,
const uint8* uv_buf,
uint8* argb_buf,
int width);
void I422ToARGBRow_MIPS_DSPR2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void I422ToBGRARow_MIPS_DSPR2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void I422ToABGRRow_MIPS_DSPR2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);

void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix);
void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,

@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_

#define LIBYUV_VERSION 447
#define LIBYUV_VERSION 449

#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT

@@ -75,11 +75,12 @@
'source/convert_from_argb.cc',
'source/cpu_id.cc',
'source/format_conversion.cc',
'source/memcpy_mips.S',
'source/memcpy_mips.S', # TODO(fbarchard): Move into row_mips.cc
'source/mjpeg_decoder.cc',
'source/planar_functions.cc',
'source/rotate.cc',
'source/rotate_argb.cc',
'source/rotate_mips.cc',
'source/rotate_neon.cc',
'source/row_common.cc',
'source/row_mips.cc',

@@ -132,6 +132,14 @@ int I422ToARGB(const uint8* src_y, int src_stride_y,
I422ToARGBRow = I422ToARGBRow_NEON;
}
}
#elif defined(HAS_I422TOARGBROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
}
#endif

for (int y = 0; y < height; ++y) {
@@ -756,6 +764,11 @@ int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
I422ToARGBRow = I422ToARGBRow_NEON;
}
}
#elif defined(HAS_I422TOARGBROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
}
#endif

SIMD_ALIGNED(uint8 rowy[kMaxStride]);
@@ -829,6 +842,11 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
I422ToARGBRow = I422ToARGBRow_NEON;
}
}
#elif defined(HAS_I422TOARGBROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
}
#endif

SIMD_ALIGNED(uint8 rowy[kMaxStride]);

@@ -599,6 +599,14 @@ int I420ToARGB(const uint8* src_y, int src_stride_y,
I422ToARGBRow = I422ToARGBRow_NEON;
}
}
#elif defined(HAS_I422TOARGBROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
}
#endif

for (int y = 0; y < height; ++y) {
@@ -652,6 +660,14 @@ int I420ToBGRA(const uint8* src_y, int src_stride_y,
I422ToBGRARow = I422ToBGRARow_NEON;
}
}
#elif defined(HAS_I422TOBGRAROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
IS_ALIGNED(dst_bgra, 4) && IS_ALIGNED(dst_stride_bgra, 4)) {
I422ToBGRARow = I422ToBGRARow_MIPS_DSPR2;
}
#endif

for (int y = 0; y < height; ++y) {
@@ -909,6 +925,13 @@ int I420ToRGB565(const uint8* src_y, int src_stride_y,
I422ToARGBRow = I422ToARGBRow_SSSE3;
}
}
#elif defined(HAS_I422TOARGBROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2)) {
I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
}
#endif

SIMD_ALIGNED(uint8 row[kMaxStride]);
@@ -975,6 +998,14 @@ int I420ToARGB1555(const uint8* src_y, int src_stride_y,
I422ToARGBRow = I422ToARGBRow_SSSE3;
}
}
#elif defined(HAS_I422TOARGBROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
}
#endif

SIMD_ALIGNED(uint8 row[kMaxStride]);
@@ -1041,6 +1072,14 @@ int I420ToARGB4444(const uint8* src_y, int src_stride_y,
I422ToARGBRow = I422ToARGBRow_SSSE3;
}
}
#elif defined(HAS_I422TOARGBROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
}
#endif

SIMD_ALIGNED(uint8 row[kMaxStride]);

@@ -174,7 +174,7 @@ int InitCpuFlags(void) {
}
}
#endif
// environment variable overrides for testing.
// Environment variable overrides for testing.
if (TestEnv("LIBYUV_DISABLE_X86")) {
cpu_info_ &= ~kCpuHasX86;
}
@@ -197,7 +197,7 @@ int InitCpuFlags(void) {
cpu_info_ &= ~kCpuHasAVX2;
}
#elif defined(__mips__) && defined(__linux__)
// linux mips parse text file for dsp detect.
// Linux mips parse text file for dsp detect.
cpu_info_ = MipsCpuCaps("dsp"); // set kCpuHasMIPS_DSP.
#if defined(__mips_dspr2)
cpu_info_ |= kCpuHasMIPS_DSPR2;
@@ -215,7 +215,7 @@ int InitCpuFlags(void) {
}
#elif defined(__arm__)
#if defined(__linux__) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
// linux arm parse text file for neon detect.
// Linux arm parse text file for neon detect.
cpu_info_ = ArmCpuCaps("/proc/cpuinfo");
#elif defined(__ARM_NEON__)
// gcc -mfpu=neon defines __ARM_NEON__

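Note: MipsCpuCaps is only referenced here, not defined in this diff. A plausible shape for that probe (a hypothetical sketch, not the actual cpu_id.cc implementation) scans /proc/cpuinfo for the requested ASE token:

#include <stdio.h>
#include <string.h>

// Hypothetical sketch of MipsCpuCaps("dsp"): scan /proc/cpuinfo for the
// token and report kCpuHasMIPS_DSP when it is present.
static int MipsCpuCapsSketch(const char* ase) {
  FILE* f = fopen("/proc/cpuinfo", "r");
  if (!f) {
    return 0;  // Unreadable cpuinfo: assume no DSP ASE.
  }
  char line[512];
  int flags = 0;
  while (fgets(line, sizeof(line), f)) {
    if (strstr(line, ase)) {
      flags = kCpuHasMIPS_DSP;
      break;
    }
  }
  fclose(f);
  return flags;
}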
@@ -1,5 +1,13 @@
#if defined (__mips__)

#
# Copyright (c) 2012 The LibYuv project authors. All Rights Reserved.
#
# Use of this source code is governed by a BSD-style license
# that can be found in the LICENSE file in the root of the source
# tree. An additional intellectual property rights grant can be found
# in the file PATENTS. All contributing project authors may
# be found in the AUTHORS file in the root of the source tree.
#
.globl memcpy_MIPS;
.align 2;
.type memcpy_MIPS,@function;
@@ -21,8 +29,8 @@ memcpy_MIPS:
negu $a3,$a0

andi $a3,$a3,0x3 # we need to copy a3 bytes to make a0/a1 aligned
beq $a3,$zero,chk16w # when a3=0 then the dst (a0) is word-aligned
subu $a2,$a2,$a3 # now a2 is the remaining bytes count
beq $a3,$zero,chk16w # when a3=0 then the dst (a0) is
subu $a2,$a2,$a3 # word-aligned now a2 is the remaining bytes count

lwr $t8,0($a1)
addu $a1,$a1,$a3
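Note: the negu/andi pair above computes how many bytes must be copied before the destination becomes word-aligned. In portable C the same head step looks roughly like this (a sketch of the idea, not the hand-scheduled routine):

#include <stdint.h>
#include <stddef.h>

// Sketch of the head-alignment step: copy 0-3 bytes so that dst becomes
// 4-byte aligned, mirroring "negu $a3,$a0; andi $a3,$a3,0x3" above.
static void CopyHeadToAlign(uint8_t** dst, const uint8_t** src, size_t* count) {
  size_t head = (size_t)(0 - (uintptr_t)*dst) & 3;  // bytes until alignment
  if (head > *count) {
    head = *count;
  }
  *count -= head;  // what remains feeds the word-aligned bulk loop (chk16w)
  while (head--) {
    *(*dst)++ = *(*src)++;
  }
}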
@@ -30,15 +38,14 @@ memcpy_MIPS:
addu $a0,$a0,$a3

# Now the dst/src are mutually word-aligned with word-aligned addresses
chk16w: andi $t8,$a2,0x3f # any whole 64-byte chunks?
chk16w:
andi $t8,$a2,0x3f # any whole 64-byte chunks?
# t8 is the byte count after 64-byte chunks

beq $a2,$t8,chk8w # if a2==t8, no 64-byte chunks
# There will be at most 1 32-byte chunk after it
subu $a3,$a2,$t8 # subtract from a2 the remainder
# Here a3 counts bytes in 16w chunks
addu $a3,$a0,$a3 # Now a3 is the final dst after 64-byte chunks

addu $t0,$a0,$a2 # t0 is the "past the end" address

# When in the loop we exercise "pref 30,x(a0)", the a0+x should not be past
@@ -164,7 +171,8 @@ last8loop:
bne $a0,$a3,last8loop
sb $v1,-1($a0)

leave: j $ra
leave:
j $ra
nop

#
@@ -183,18 +191,16 @@ unaligned:
swr $v1,0($a0)
addu $a0,$a0,$a3 # below the dst will be word aligned (NOTE1)

ua_chk16w: andi $t8,$a2,0x3f # any whole 64-byte chunks?
ua_chk16w:
andi $t8,$a2,0x3f # any whole 64-byte chunks?
# t8 is the byte count after 64-byte chunks
beq $a2,$t8,ua_chk8w # if a2==t8, no 64-byte chunks
# There will be at most 1 32-byte chunk after it
subu $a3,$a2,$t8 # subtract from a2 the remainder
# Here a3 counts bytes in 16w chunks
addu $a3,$a0,$a3 # Now a3 is the final dst after 64-byte chunks

addu $t0,$a0,$a2 # t0 is the "past the end" address

subu $t9,$t0,160 # t9 is the "last safe pref 30,128(a0)" address

pref 0,0($a1) # bring the first line of src, addr 0
pref 0,32($a1) # bring the second line of src, addr 32
pref 0,64($a1) # bring the third line of src, addr 64

@@ -46,6 +46,11 @@ void CopyPlane(const uint8* src_y, int src_stride_y,
CopyRow = CopyRow_SSE2;
}
#endif
#if defined(HAS_COPYROW_MIPS)
if (TestCpuFlag(kCpuHasMIPS)) {
CopyRow = CopyRow_MIPS;
}
#endif

// Copy plane
for (int y = 0; y < height; ++y) {
@@ -424,6 +429,14 @@ int I422ToBGRA(const uint8* src_y, int src_stride_y,
}
}
}
#elif defined(HAS_I422TOBGRAROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
IS_ALIGNED(dst_bgra, 4) && IS_ALIGNED(dst_stride_bgra, 4)) {
I422ToBGRARow = I422ToBGRARow_MIPS_DSPR2;
}
#endif

for (int y = 0; y < height; ++y) {

@@ -56,6 +56,23 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
int width);
#endif // defined(__ARM_NEON__)

#if !defined(YUV_DISABLE_ASM) && defined(__mips__)
#if defined(__mips_dsp) && (__mips_dsp_rev >= 2)
#define HAS_TRANSPOSE_WX8_MIPS_DSPR2
void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);

void TransposeWx8_FAST_MIPS_DSPR2(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);
#define HAS_TRANSPOSE_UVWx8_MIPS_DSPR2
void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width);
#endif
#endif


#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
#define HAS_TRANSPOSE_WX8_SSSE3
__declspec(naked) __declspec(align(16))
@@ -794,6 +811,16 @@ void TransposePlane(const uint8* src, int src_stride,
TransposeWx8 = TransposeWx8_FAST_SSSE3;
}
#endif
#if defined(HAS_TRANSPOSE_WX8_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {
if (IS_ALIGNED(width, 4) &&
IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
TransposeWx8 = TransposeWx8_FAST_MIPS_DSPR2;
} else {
TransposeWx8 = TransposeWx8_MIPS_DSPR2;
}
}
#endif

// Work across the source in 8x8 tiles
int i = height;
@@ -856,6 +883,13 @@ void RotatePlane180(const uint8* src, int src_stride,
IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
MirrorRow = MirrorRow_SSSE3;
}
#endif
#if defined(HAS_MIRRORROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4) &&
IS_ALIGNED(dst, 4) && IS_ALIGNED(dst_stride, 4)) {
MirrorRow = MirrorRow_MIPS_DSPR2;
}
#endif
void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
#if defined(HAS_COPYROW_NEON)
@@ -952,6 +986,11 @@ void TransposeUV(const uint8* src, int src_stride,
IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
TransposeUVWx8 = TransposeUVWx8_SSE2;
}
#elif defined(HAS_TRANSPOSE_UVWx8_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 2) &&
IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
TransposeUVWx8 = TransposeUVWx8_MIPS_DSPR2;
}
#endif

// Work through the source in 8x8 tiles.
@@ -1021,6 +1060,11 @@ void RotateUV180(const uint8* src, int src_stride,
IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
MirrorRowUV = MirrorRowUV_SSSE3;
}
#elif defined(HAS_MIRRORROWUV_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
MirrorRowUV = MirrorRowUV_MIPS_DSPR2;
}
#endif

dst_a += dst_stride_a * (height - 1);

source/rotate_mips.cc (new file, 485 lines)
@@ -0,0 +1,485 @@
/*
* Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/

#include "libyuv/row.h"

#include "libyuv/basic_types.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

#if !defined(YUV_DISABLE_ASM) && defined(__mips_dsp) && (__mips_dsp_rev >= 2)

void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"sll $t2, %[src_stride], 0x1 \n" // src_stride x 2
"sll $t4, %[src_stride], 0x2 \n" // src_stride x 4
"sll $t9, %[src_stride], 0x3 \n" // src_stride x 8
"addu $t3, $t2, %[src_stride] \n"
"addu $t5, $t4, %[src_stride] \n"
"addu $t6, $t2, $t4 \n"
"andi $t0, %[dst], 0x3 \n"
"andi $t1, %[dst_stride], 0x3 \n"
"or $t0, $t0, $t1 \n"
"bnez $t0, 11f \n"
" subu $t7, $t9, %[src_stride] \n"
//dst + dst_stride word aligned
"1: \n"
"lbu $t0, 0(%[src]) \n"
"lbux $t1, %[src_stride](%[src]) \n"
"lbux $t8, $t2(%[src]) \n"
"lbux $t9, $t3(%[src]) \n"
"sll $t1, $t1, 16 \n"
"sll $t9, $t9, 16 \n"
"or $t0, $t0, $t1 \n"
"or $t8, $t8, $t9 \n"
"precr.qb.ph $s0, $t8, $t0 \n"
"lbux $t0, $t4(%[src]) \n"
"lbux $t1, $t5(%[src]) \n"
"lbux $t8, $t6(%[src]) \n"
"lbux $t9, $t7(%[src]) \n"
"sll $t1, $t1, 16 \n"
"sll $t9, $t9, 16 \n"
"or $t0, $t0, $t1 \n"
"or $t8, $t8, $t9 \n"
"precr.qb.ph $s1, $t8, $t0 \n"
"sw $s0, 0(%[dst]) \n"
"addiu %[width], -1 \n"
"addiu %[src], 1 \n"
"sw $s1, 4(%[dst]) \n"
"bnez %[width], 1b \n"
" addu %[dst], %[dst], %[dst_stride] \n"
"b 2f \n"
//dst + dst_stride unaligned
"11: \n"
"lbu $t0, 0(%[src]) \n"
"lbux $t1, %[src_stride](%[src]) \n"
"lbux $t8, $t2(%[src]) \n"
"lbux $t9, $t3(%[src]) \n"
"sll $t1, $t1, 16 \n"
"sll $t9, $t9, 16 \n"
"or $t0, $t0, $t1 \n"
"or $t8, $t8, $t9 \n"
"precr.qb.ph $s0, $t8, $t0 \n"
"lbux $t0, $t4(%[src]) \n"
"lbux $t1, $t5(%[src]) \n"
"lbux $t8, $t6(%[src]) \n"
"lbux $t9, $t7(%[src]) \n"
"sll $t1, $t1, 16 \n"
"sll $t9, $t9, 16 \n"
"or $t0, $t0, $t1 \n"
"or $t8, $t8, $t9 \n"
"precr.qb.ph $s1, $t8, $t0 \n"
"swr $s0, 0(%[dst]) \n"
"swl $s0, 3(%[dst]) \n"
"addiu %[width], -1 \n"
"addiu %[src], 1 \n"
"swr $s1, 4(%[dst]) \n"
"swl $s1, 7(%[dst]) \n"
"bnez %[width], 11b \n"
"addu %[dst], %[dst], %[dst_stride] \n"
"2: \n"
".set pop \n"
:[src] "+r" (src),
[dst] "+r" (dst),
[width] "+r" (width)
:[src_stride] "r" (src_stride),
[dst_stride] "r" (dst_stride)
: "t0", "t1", "t2", "t3", "t4", "t5",
"t6", "t7", "t8", "t9",
"s0", "s1"
);
}

void TransposeWx8_FAST_MIPS_DSPR2(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width) {
__asm__ __volatile__ (
".set noat \n"
".set push \n"
".set noreorder \n"
"beqz %[width], 2f \n"
" sll $t2, %[src_stride], 0x1 \n" // src_stride x 2
"sll $t4, %[src_stride], 0x2 \n" // src_stride x 4
"sll $t9, %[src_stride], 0x3 \n" // src_stride x 8
"addu $t3, $t2, %[src_stride] \n"
"addu $t5, $t4, %[src_stride] \n"
"addu $t6, $t2, $t4 \n"

"srl $AT, %[width], 0x2 \n"
"andi $t0, %[dst], 0x3 \n"
"andi $t1, %[dst_stride], 0x3 \n"
"or $t0, $t0, $t1 \n"
"bnez $t0, 11f \n"
" subu $t7, $t9, %[src_stride] \n"
//dst + dst_stride word aligned
"1: \n"
"lw $t0, 0(%[src]) \n"
"lwx $t1, %[src_stride](%[src]) \n"
"lwx $t8, $t2(%[src]) \n"
"lwx $t9, $t3(%[src]) \n"

// t0 = | 30 | 20 | 10 | 00 |
// t1 = | 31 | 21 | 11 | 01 |
// t8 = | 32 | 22 | 12 | 02 |
// t9 = | 33 | 23 | 13 | 03 |

"precr.qb.ph $s0, $t1, $t0 \n"
"precr.qb.ph $s1, $t9, $t8 \n"
"precrq.qb.ph $s2, $t1, $t0 \n"
"precrq.qb.ph $s3, $t9, $t8 \n"

// s0 = | 21 | 01 | 20 | 00 |
// s1 = | 23 | 03 | 22 | 02 |
// s2 = | 31 | 11 | 30 | 10 |
// s3 = | 33 | 13 | 32 | 12 |

"precr.qb.ph $s4, $s1, $s0 \n"
"precrq.qb.ph $s5, $s1, $s0 \n"
"precr.qb.ph $s6, $s3, $s2 \n"
"precrq.qb.ph $s7, $s3, $s2 \n"

// s4 = | 03 | 02 | 01 | 00 |
// s5 = | 23 | 22 | 21 | 20 |
// s6 = | 13 | 12 | 11 | 10 |
// s7 = | 33 | 32 | 31 | 30 |

"lwx $t0, $t4(%[src]) \n"
"lwx $t1, $t5(%[src]) \n"
"lwx $t8, $t6(%[src]) \n"
"lwx $t9, $t7(%[src]) \n"

// t0 = | 34 | 24 | 14 | 04 |
// t1 = | 35 | 25 | 15 | 05 |
// t8 = | 36 | 26 | 16 | 06 |
// t9 = | 37 | 27 | 17 | 07 |

"precr.qb.ph $s0, $t1, $t0 \n"
"precr.qb.ph $s1, $t9, $t8 \n"
"precrq.qb.ph $s2, $t1, $t0 \n"
"precrq.qb.ph $s3, $t9, $t8 \n"

// s0 = | 25 | 05 | 24 | 04 |
// s1 = | 27 | 07 | 26 | 06 |
// s2 = | 35 | 15 | 34 | 14 |
// s3 = | 37 | 17 | 36 | 16 |

"precr.qb.ph $t0, $s1, $s0 \n"
"precrq.qb.ph $t1, $s1, $s0 \n"
"precr.qb.ph $t8, $s3, $s2 \n"
"precrq.qb.ph $t9, $s3, $s2 \n"

// t0 = | 07 | 06 | 05 | 04 |
// t1 = | 27 | 26 | 25 | 24 |
// t8 = | 17 | 16 | 15 | 14 |
// t9 = | 37 | 36 | 35 | 34 |

"addu $s0, %[dst], %[dst_stride] \n"
"addu $s1, $s0, %[dst_stride] \n"
"addu $s2, $s1, %[dst_stride] \n"

"sw $s4, 0(%[dst]) \n"
"sw $t0, 4(%[dst]) \n"
"sw $s6, 0($s0) \n"
"sw $t8, 4($s0) \n"
"sw $s5, 0($s1) \n"
"sw $t1, 4($s1) \n"
"sw $s7, 0($s2) \n"
"sw $t9, 4($s2) \n"

"addiu $AT, -1 \n"
"addiu %[src], 4 \n"

"bnez $AT, 1b \n"
" addu %[dst], $s2, %[dst_stride] \n"
"b 2f \n"
//dst + dst_stride unaligned
"11: \n"
"lw $t0, 0(%[src]) \n"
"lwx $t1, %[src_stride](%[src]) \n"
"lwx $t8, $t2(%[src]) \n"
"lwx $t9, $t3(%[src]) \n"

// t0 = | 30 | 20 | 10 | 00 |
// t1 = | 31 | 21 | 11 | 01 |
// t8 = | 32 | 22 | 12 | 02 |
// t9 = | 33 | 23 | 13 | 03 |

"precr.qb.ph $s0, $t1, $t0 \n"
"precr.qb.ph $s1, $t9, $t8 \n"
"precrq.qb.ph $s2, $t1, $t0 \n"
"precrq.qb.ph $s3, $t9, $t8 \n"

// s0 = | 21 | 01 | 20 | 00 |
// s1 = | 23 | 03 | 22 | 02 |
// s2 = | 31 | 11 | 30 | 10 |
// s3 = | 33 | 13 | 32 | 12 |

"precr.qb.ph $s4, $s1, $s0 \n"
"precrq.qb.ph $s5, $s1, $s0 \n"
"precr.qb.ph $s6, $s3, $s2 \n"
"precrq.qb.ph $s7, $s3, $s2 \n"

// s4 = | 03 | 02 | 01 | 00 |
// s5 = | 23 | 22 | 21 | 20 |
// s6 = | 13 | 12 | 11 | 10 |
// s7 = | 33 | 32 | 31 | 30 |

"lwx $t0, $t4(%[src]) \n"
"lwx $t1, $t5(%[src]) \n"
"lwx $t8, $t6(%[src]) \n"
"lwx $t9, $t7(%[src]) \n"

// t0 = | 34 | 24 | 14 | 04 |
// t1 = | 35 | 25 | 15 | 05 |
// t8 = | 36 | 26 | 16 | 06 |
// t9 = | 37 | 27 | 17 | 07 |

"precr.qb.ph $s0, $t1, $t0 \n"
"precr.qb.ph $s1, $t9, $t8 \n"
"precrq.qb.ph $s2, $t1, $t0 \n"
"precrq.qb.ph $s3, $t9, $t8 \n"

// s0 = | 25 | 05 | 24 | 04 |
// s1 = | 27 | 07 | 26 | 06 |
// s2 = | 35 | 15 | 34 | 14 |
// s3 = | 37 | 17 | 36 | 16 |

"precr.qb.ph $t0, $s1, $s0 \n"
"precrq.qb.ph $t1, $s1, $s0 \n"
"precr.qb.ph $t8, $s3, $s2 \n"
"precrq.qb.ph $t9, $s3, $s2 \n"

// t0 = | 07 | 06 | 05 | 04 |
// t1 = | 27 | 26 | 25 | 24 |
// t8 = | 17 | 16 | 15 | 14 |
// t9 = | 37 | 36 | 35 | 34 |

"addu $s0, %[dst], %[dst_stride] \n"
"addu $s1, $s0, %[dst_stride] \n"
"addu $s2, $s1, %[dst_stride] \n"

"swr $s4, 0(%[dst]) \n"
"swl $s4, 3(%[dst]) \n"
"swr $t0, 4(%[dst]) \n"
"swl $t0, 7(%[dst]) \n"
"swr $s6, 0($s0) \n"
"swl $s6, 3($s0) \n"
"swr $t8, 4($s0) \n"
"swl $t8, 7($s0) \n"
"swr $s5, 0($s1) \n"
"swl $s5, 3($s1) \n"
"swr $t1, 4($s1) \n"
"swl $t1, 7($s1) \n"
"swr $s7, 0($s2) \n"
"swl $s7, 3($s2) \n"
"swr $t9, 4($s2) \n"
"swl $t9, 7($s2) \n"

"addiu $AT, -1 \n"
"addiu %[src], 4 \n"

"bnez $AT, 11b \n"
" addu %[dst], $s2, %[dst_stride] \n"
"2: \n"
".set pop \n"
".set at \n"
:[src] "+r" (src),
[dst] "+r" (dst),
[width] "+r" (width)
:[src_stride] "r" (src_stride),
[dst_stride] "r" (dst_stride)
: "t0", "t1", "t2", "t3", "t4", "t5",
"t6", "t7", "t8", "t9",
"s0", "s1", "s2", "s3", "s4",
"s5", "s6", "s7"
);
}

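Note: the lane diagrams above follow directly from the two DSPR2 pack instructions used. precr.qb.ph keeps the low byte of each 16-bit lane of its two operands; precrq.qb.ph keeps the high byte. A C model of both, for reading the diagrams (a sketch; lane order written low byte on the right, as in the comments):

#include <stdint.h>

// precr.qb.ph rd, rs, rt -> |rs.b2|rs.b0|rt.b2|rt.b0| (low byte of each lane)
static uint32_t precr_qb_ph(uint32_t rs, uint32_t rt) {
  return ((rs & 0x00ff0000u) << 8) | ((rs & 0x000000ffu) << 16) |
         ((rt & 0x00ff0000u) >> 8) | (rt & 0x000000ffu);
}

// precrq.qb.ph rd, rs, rt -> |rs.b3|rs.b1|rt.b3|rt.b1| (high byte of each lane)
static uint32_t precrq_qb_ph(uint32_t rs, uint32_t rt) {
  return (rs & 0xff000000u) | ((rs & 0x0000ff00u) << 8) |
         ((rt & 0xff000000u) >> 8) | ((rt & 0x0000ff00u) >> 8);
}

With t0 = |30|20|10|00| and t1 = |31|21|11|01|, precr_qb_ph(t1, t0) yields |21|01|20|00|, matching the first diagram above; two rounds of these packs complete a 4x4 byte transpose.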
void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"beqz %[width], 2f \n"
" sll $t2, %[src_stride], 0x1 \n" // src_stride x 2
"sll $t4, %[src_stride], 0x2 \n" // src_stride x 4
"sll $t9, %[src_stride], 0x3 \n" // src_stride x 8
"addu $t3, $t2, %[src_stride] \n"
"addu $t5, $t4, %[src_stride] \n"
"addu $t6, $t2, $t4 \n"
"subu $t7, $t9, %[src_stride] \n"
"srl $t1, %[width], 1 \n"

// check word alignment for dst_a, dst_b, dst_stride_a and dst_stride_b
"andi $t0, %[dst_a], 0x3 \n"
"andi $t8, %[dst_b], 0x3 \n"
"or $t0, $t0, $t8 \n"
"andi $t8, %[dst_stride_a], 0x3 \n"
"andi $s5, %[dst_stride_b], 0x3 \n"
"or $t8, $t8, $s5 \n"
"or $t0, $t0, $t8 \n"
"bnez $t0, 11f \n"
" nop \n"
// dst + dst_stride word aligned (both, a & b dst addresses)
"1: \n"
"lw $t0, 0(%[src]) \n" // |B0|A0|b0|a0|
"lwx $t8, %[src_stride](%[src]) \n" // |B1|A1|b1|a1|
"addu $s5, %[dst_a], %[dst_stride_a] \n"
"lwx $t9, $t2(%[src]) \n" // |B2|A2|b2|a2|
"lwx $s0, $t3(%[src]) \n" // |B3|A3|b3|a3|
"addu $s6, %[dst_b], %[dst_stride_b] \n"

"precrq.ph.w $s1, $t8, $t0 \n" // |B1|A1|B0|A0|
"precrq.ph.w $s2, $s0, $t9 \n" // |B3|A3|B2|A2|
"precr.qb.ph $s3, $s2, $s1 \n" // |A3|A2|A1|A0|
"precrq.qb.ph $s4, $s2, $s1 \n" // |B3|B2|B1|B0|

"sll $t0, $t0, 16 \n"
"packrl.ph $s1, $t8, $t0 \n" // |b1|a1|b0|a0|
"sll $t9, $t9, 16 \n"
"packrl.ph $s2, $s0, $t9 \n" // |b3|a3|b2|a2|

"sw $s3, 0($s5) \n"
"sw $s4, 0($s6) \n"

"precr.qb.ph $s3, $s2, $s1 \n" // |a3|a2|a1|a0|
"precrq.qb.ph $s4, $s2, $s1 \n" // |b3|b2|b1|b0|

"lwx $t0, $t4(%[src]) \n" // |B4|A4|b4|a4|
"lwx $t8, $t5(%[src]) \n" // |B5|A5|b5|a5|
"lwx $t9, $t6(%[src]) \n" // |B6|A6|b6|a6|
"lwx $s0, $t7(%[src]) \n" // |B7|A7|b7|a7|
"sw $s3, 0(%[dst_a]) \n"
"sw $s4, 0(%[dst_b]) \n"

"precrq.ph.w $s1, $t8, $t0 \n" // |B5|A5|B4|A4|
"precrq.ph.w $s2, $s0, $t9 \n" // |B6|A6|B7|A7|
"precr.qb.ph $s3, $s2, $s1 \n" // |A7|A6|A5|A4|
"precrq.qb.ph $s4, $s2, $s1 \n" // |B7|B6|B5|B4|

"sll $t0, $t0, 16 \n"
"packrl.ph $s1, $t8, $t0 \n" // |b5|a5|b4|a4|
"sll $t9, $t9, 16 \n"
"packrl.ph $s2, $s0, $t9 \n" // |b7|a7|b6|a6|
"sw $s3, 4($s5) \n"
"sw $s4, 4($s6) \n"

"precr.qb.ph $s3, $s2, $s1 \n" // |a7|a6|a5|a4|
"precrq.qb.ph $s4, $s2, $s1 \n" // |b7|b6|b5|b4|

"addiu %[src], 4 \n"
"addiu $t1, -1 \n"
"sll $t0, %[dst_stride_a], 1 \n"
"sll $t8, %[dst_stride_b], 1 \n"
"sw $s3, 4(%[dst_a]) \n"
"sw $s4, 4(%[dst_b]) \n"
"addu %[dst_a], %[dst_a], $t0 \n"
"bnez $t1, 1b \n"
" addu %[dst_b], %[dst_b], $t8 \n"
"b 2f \n"
" nop \n"

// dst_a or dst_b or dst_stride_a or dst_stride_b not word aligned
"11: \n"
"lw $t0, 0(%[src]) \n" // |B0|A0|b0|a0|
"lwx $t8, %[src_stride](%[src]) \n" // |B1|A1|b1|a1|
"addu $s5, %[dst_a], %[dst_stride_a] \n"
"lwx $t9, $t2(%[src]) \n" // |B2|A2|b2|a2|
"lwx $s0, $t3(%[src]) \n" // |B3|A3|b3|a3|
"addu $s6, %[dst_b], %[dst_stride_b] \n"

"precrq.ph.w $s1, $t8, $t0 \n" // |B1|A1|B0|A0|
"precrq.ph.w $s2, $s0, $t9 \n" // |B3|A3|B2|A2|
"precr.qb.ph $s3, $s2, $s1 \n" // |A3|A2|A1|A0|
"precrq.qb.ph $s4, $s2, $s1 \n" // |B3|B2|B1|B0|

"sll $t0, $t0, 16 \n"
"packrl.ph $s1, $t8, $t0 \n" // |b1|a1|b0|a0|
"sll $t9, $t9, 16 \n"
"packrl.ph $s2, $s0, $t9 \n" // |b3|a3|b2|a2|

"swr $s3, 0($s5) \n"
"swl $s3, 3($s5) \n"
"swr $s4, 0($s6) \n"
"swl $s4, 3($s6) \n"

"precr.qb.ph $s3, $s2, $s1 \n" // |a3|a2|a1|a0|
"precrq.qb.ph $s4, $s2, $s1 \n" // |b3|b2|b1|b0|

"lwx $t0, $t4(%[src]) \n" // |B4|A4|b4|a4|
"lwx $t8, $t5(%[src]) \n" // |B5|A5|b5|a5|
"lwx $t9, $t6(%[src]) \n" // |B6|A6|b6|a6|
"lwx $s0, $t7(%[src]) \n" // |B7|A7|b7|a7|
"swr $s3, 0(%[dst_a]) \n"
"swl $s3, 3(%[dst_a]) \n"
"swr $s4, 0(%[dst_b]) \n"
"swl $s4, 3(%[dst_b]) \n"

"precrq.ph.w $s1, $t8, $t0 \n" // |B5|A5|B4|A4|
"precrq.ph.w $s2, $s0, $t9 \n" // |B6|A6|B7|A7|
"precr.qb.ph $s3, $s2, $s1 \n" // |A7|A6|A5|A4|
"precrq.qb.ph $s4, $s2, $s1 \n" // |B7|B6|B5|B4|

"sll $t0, $t0, 16 \n"
"packrl.ph $s1, $t8, $t0 \n" // |b5|a5|b4|a4|
"sll $t9, $t9, 16 \n"
"packrl.ph $s2, $s0, $t9 \n" // |b7|a7|b6|a6|

"swr $s3, 4($s5) \n"
"swl $s3, 7($s5) \n"
"swr $s4, 4($s6) \n"
"swl $s4, 7($s6) \n"

"precr.qb.ph $s3, $s2, $s1 \n" // |a7|a6|a5|a4|
"precrq.qb.ph $s4, $s2, $s1 \n" // |b7|b6|b5|b4|

"addiu %[src], 4 \n"
"addiu $t1, -1 \n"
"sll $t0, %[dst_stride_a], 1 \n"
"sll $t8, %[dst_stride_b], 1 \n"
"swr $s3, 4(%[dst_a]) \n"
"swl $s3, 7(%[dst_a]) \n"
"swr $s4, 4(%[dst_b]) \n"
"swl $s4, 7(%[dst_b]) \n"
"addu %[dst_a], %[dst_a], $t0 \n"
"bnez $t1, 11b \n"
" addu %[dst_b], %[dst_b], $t8 \n"

"2: \n"
".set pop \n"
: [src] "+r" (src),
[dst_a] "+r" (dst_a),
[dst_b] "+r" (dst_b),
[width] "+r" (width),
[src_stride] "+r" (src_stride)
: [dst_stride_a] "r" (dst_stride_a),
[dst_stride_b] "r" (dst_stride_b)
: "t0", "t1", "t2", "t3", "t4", "t5",
"t6", "t7", "t8", "t9",
"s0", "s1", "s2", "s3",
"s4", "s5", "s6"
);
}

#endif // defined(__mips_dsp) && (__mips_dsp_rev >= 2)

#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif

@@ -16,6 +16,13 @@ extern "C" {
#endif

#if !defined(YUV_DISABLE_ASM) && defined(__mips__)
#if defined HAS_COPYROW_MIPS
extern "C" void memcpy_MIPS(uint8* dst, const uint8* src, int count);
void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
memcpy_MIPS(dst, src, count);
}
#endif

#ifdef HAS_SPLITUV_MIPS_DSPR2
void SplitUV_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
int width) {
@@ -166,6 +173,400 @@ void SplitUV_Unaligned_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u,
}
#endif // HAS_SPLITUV_MIPS_DSPR2

#ifdef HAS_MIRRORROW_MIPS_DSPR2
void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"

"srl $t4, %[width], 4 \n" // multiples of 16
"andi $t5, %[width], 0xf \n"
"blez $t4, 2f \n"
" addu %[src], %[src], %[width] \n" // src += width

"1: \n"
"lw $t0, -16(%[src]) \n" // |3|2|1|0|
"lw $t1, -12(%[src]) \n" // |7|6|5|4|
"lw $t2, -8(%[src]) \n" // |11|10|9|8|
"lw $t3, -4(%[src]) \n" // |15|14|13|12|
"wsbh $t0, $t0 \n" // |2|3|0|1|
"wsbh $t1, $t1 \n" // |6|7|4|5|
"wsbh $t2, $t2 \n" // |10|11|8|9|
"wsbh $t3, $t3 \n" // |14|15|12|13|
"rotr $t0, $t0, 16 \n" // |0|1|2|3|
"rotr $t1, $t1, 16 \n" // |4|5|6|7|
"rotr $t2, $t2, 16 \n" // |8|9|10|11|
"rotr $t3, $t3, 16 \n" // |12|13|14|15|
"addiu %[src], %[src], -16 \n"
"addiu $t4, $t4, -1 \n"
"sw $t3, 0(%[dst]) \n" // |15|14|13|12|
"sw $t2, 4(%[dst]) \n" // |11|10|9|8|
"sw $t1, 8(%[dst]) \n" // |7|6|5|4|
"sw $t0, 12(%[dst]) \n" // |3|2|1|0|
"bgtz $t4, 1b \n"
" addiu %[dst], %[dst], 16 \n"
"beqz $t5, 3f \n"
" nop \n"

"2: \n"
"lbu $t0, -1(%[src]) \n"
"addiu $t5, $t5, -1 \n"
"addiu %[src], %[src], -1 \n"
"sb $t0, 0(%[dst]) \n"
"bgez $t5, 2b \n"
" addiu %[dst], %[dst], 1 \n"

"3: \n"
".set pop \n"
: [src] "+r" (src), [dst] "+r" (dst)
: [width] "r" (width)
: "t0", "t1", "t2", "t3", "t4", "t5"
);
}
#endif // HAS_MIRRORROW_MIPS_DSPR2

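Note: wsbh swaps the bytes within each halfword, and rotr by 16 then swaps the two halfwords, so the pair reverses all four bytes of a word. The 16-bytes-per-iteration main loop above is therefore equivalent to this portable sketch (which assumes the 4-byte alignment the dispatch code already guarantees):

#include <stdint.h>

static uint32_t ByteReverse(uint32_t v) {
  v = ((v & 0x00ff00ffu) << 8) | ((v >> 8) & 0x00ff00ffu);  // wsbh
  return (v << 16) | (v >> 16);                             // rotr 16
}

// Sketch of the main loop: mirror 16 bytes per iteration.
static void MirrorRow16(const uint8_t* src, uint8_t* dst, int chunks16) {
  const uint32_t* s = (const uint32_t*)(src + chunks16 * 16);
  uint32_t* d = (uint32_t*)dst;
  while (chunks16--) {
    s -= 4;  // step back 16 bytes
    d[0] = ByteReverse(s[3]);
    d[1] = ByteReverse(s[2]);
    d[2] = ByteReverse(s[1]);
    d[3] = ByteReverse(s[0]);
    d += 4;
  }
}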
#ifdef HAS_MIRRORROWUV_MIPS_DSPR2
void MirrorRowUV_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
int width) {
int x = 0;
int y = 0;
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"

"addu $t4, %[width], %[width] \n"
"srl %[x], %[width], 4 \n"
"andi %[y], %[width], 0xf \n"
"blez %[x], 2f \n"
" addu %[src_uv], %[src_uv], $t4 \n"

"1: \n"
"lw $t0, -32(%[src_uv]) \n" // |3|2|1|0|
"lw $t1, -28(%[src_uv]) \n" // |7|6|5|4|
"lw $t2, -24(%[src_uv]) \n" // |11|10|9|8|
"lw $t3, -20(%[src_uv]) \n" // |15|14|13|12|
"lw $t4, -16(%[src_uv]) \n" // |19|18|17|16|
"lw $t6, -12(%[src_uv]) \n" // |23|22|21|20|
"lw $t7, -8(%[src_uv]) \n" // |27|26|25|24|
"lw $t8, -4(%[src_uv]) \n" // |31|30|29|28|

"rotr $t0, $t0, 16 \n" // |1|0|3|2|
"rotr $t1, $t1, 16 \n" // |5|4|7|6|
"rotr $t2, $t2, 16 \n" // |9|8|11|10|
"rotr $t3, $t3, 16 \n" // |13|12|15|14|
"rotr $t4, $t4, 16 \n" // |17|16|19|18|
"rotr $t6, $t6, 16 \n" // |21|20|23|22|
"rotr $t7, $t7, 16 \n" // |25|24|27|26|
"rotr $t8, $t8, 16 \n" // |29|28|31|30|
"precr.qb.ph $t9, $t0, $t1 \n" // |0|2|4|6|
"precrq.qb.ph $t5, $t0, $t1 \n" // |1|3|5|7|
"precr.qb.ph $t0, $t2, $t3 \n" // |8|10|12|14|
"precrq.qb.ph $t1, $t2, $t3 \n" // |9|11|13|15|
"precr.qb.ph $t2, $t4, $t6 \n" // |16|18|20|22|
"precrq.qb.ph $t3, $t4, $t6 \n" // |17|19|21|23|
"precr.qb.ph $t4, $t7, $t8 \n" // |24|26|28|30|
"precrq.qb.ph $t6, $t7, $t8 \n" // |25|27|29|31|
"addiu %[src_uv], %[src_uv], -32 \n"
"addiu %[x], %[x], -1 \n"
"swr $t4, 0(%[dst_u]) \n"
"swl $t4, 3(%[dst_u]) \n" // |30|28|26|24|
"swr $t6, 0(%[dst_v]) \n"
"swl $t6, 3(%[dst_v]) \n" // |31|29|27|25|
"swr $t2, 4(%[dst_u]) \n"
"swl $t2, 7(%[dst_u]) \n" // |22|20|18|16|
"swr $t3, 4(%[dst_v]) \n"
"swl $t3, 7(%[dst_v]) \n" // |23|21|19|17|
"swr $t0, 8(%[dst_u]) \n"
"swl $t0, 11(%[dst_u]) \n" // |14|12|10|8|
"swr $t1, 8(%[dst_v]) \n"
"swl $t1, 11(%[dst_v]) \n" // |15|13|11|9|
"swr $t9, 12(%[dst_u]) \n"
"swl $t9, 15(%[dst_u]) \n" // |6|4|2|0|
"swr $t5, 12(%[dst_v]) \n"
"swl $t5, 15(%[dst_v]) \n" // |7|5|3|1|
"addiu %[dst_v], %[dst_v], 16 \n"
"bgtz %[x], 1b \n"
" addiu %[dst_u], %[dst_u], 16 \n"
"beqz %[y], 3f \n"
" nop \n"
"b 2f \n"
" nop \n"

"2: \n"
"lbu $t0, -2(%[src_uv]) \n"
"lbu $t1, -1(%[src_uv]) \n"
"addiu %[src_uv], %[src_uv], -2 \n"
"addiu %[y], %[y], -1 \n"
"sb $t0, 0(%[dst_u]) \n"
"sb $t1, 0(%[dst_v]) \n"
"addiu %[dst_u], %[dst_u], 1 \n"
"bgtz %[y], 2b \n"
" addiu %[dst_v], %[dst_v], 1 \n"

"3: \n"
".set pop \n"
: [src_uv] "+r" (src_uv),
[dst_u] "+r" (dst_u),
[dst_v] "+r" (dst_v),
[x] "=&r" (x),
[y] "+r" (y)
: [width] "r" (width)
: "t0", "t1", "t2", "t3", "t4",
"t5", "t6", "t7", "t8", "t9"
);
}
#endif // HAS_MIRRORROWUV_MIPS_DSPR2



// Convert (4 Y and 2 VU) I422 and arrange RGB values into
// t5 = | 0 | B0 | 0 | b0 |
// t4 = | 0 | B1 | 0 | b1 |
// t9 = | 0 | G0 | 0 | g0 |
// t8 = | 0 | G1 | 0 | g1 |
// t2 = | 0 | R0 | 0 | r0 |
// t1 = | 0 | R1 | 0 | r1 |
#define I422ToTransientMipsRGB \
"lw $t0, 0(%[y_buf]) \n" \
"lhu $t1, 0(%[u_buf]) \n" \
"lhu $t2, 0(%[v_buf]) \n" \
"preceu.ph.qbr $t1, $t1 \n" \
"preceu.ph.qbr $t2, $t2 \n" \
"preceu.ph.qbra $t3, $t0 \n" \
"preceu.ph.qbla $t0, $t0 \n" \
"subu.ph $t1, $t1, $s5 \n" \
"subu.ph $t2, $t2, $s5 \n" \
"subu.ph $t3, $t3, $s4 \n" \
"subu.ph $t0, $t0, $s4 \n" \
"mul.ph $t3, $t3, $s0 \n" \
"mul.ph $t0, $t0, $s0 \n" \
"shll.ph $t4, $t1, 0x7 \n" \
"subu.ph $t4, $t4, $t1 \n" \
"mul.ph $t6, $t1, $s1 \n" \
"mul.ph $t1, $t2, $s2 \n" \
"addq_s.ph $t5, $t4, $t3 \n" \
"addq_s.ph $t4, $t4, $t0 \n" \
"shra.ph $t5, $t5, 6 \n" \
"shra.ph $t4, $t4, 6 \n" \
"addiu %[u_buf], 2 \n" \
"addiu %[v_buf], 2 \n" \
"addu.ph $t6, $t6, $t1 \n" \
"mul.ph $t1, $t2, $s3 \n" \
"addu.ph $t9, $t6, $t3 \n" \
"addu.ph $t8, $t6, $t0 \n" \
"shra.ph $t9, $t9, 6 \n" \
"shra.ph $t8, $t8, 6 \n" \
"addu.ph $t2, $t1, $t3 \n" \
"addu.ph $t1, $t1, $t0 \n" \
"shra.ph $t2, $t2, 6 \n" \
"shra.ph $t1, $t1, 6 \n" \
"subu.ph $t5, $t5, $s5 \n" \
"subu.ph $t4, $t4, $s5 \n" \
"subu.ph $t9, $t9, $s5 \n" \
"subu.ph $t8, $t8, $s5 \n" \
"subu.ph $t2, $t2, $s5 \n" \
"subu.ph $t1, $t1, $s5 \n" \
"shll_s.ph $t5, $t5, 8 \n" \
"shll_s.ph $t4, $t4, 8 \n" \
"shll_s.ph $t9, $t9, 8 \n" \
"shll_s.ph $t8, $t8, 8 \n" \
"shll_s.ph $t2, $t2, 8 \n" \
"shll_s.ph $t1, $t1, 8 \n" \
"shra.ph $t5, $t5, 8 \n" \
"shra.ph $t4, $t4, 8 \n" \
"shra.ph $t9, $t9, 8 \n" \
"shra.ph $t8, $t8, 8 \n" \
"shra.ph $t2, $t2, 8 \n" \
"shra.ph $t1, $t1, 8 \n" \
"addu.ph $t5, $t5, $s5 \n" \
"addu.ph $t4, $t4, $s5 \n" \
"addu.ph $t9, $t9, $s5 \n" \
"addu.ph $t8, $t8, $s5 \n" \
"addu.ph $t2, $t2, $s5 \n" \
"addu.ph $t1, $t1, $s5 \n"

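Note: the constants loaded in the functions below (74, -25, -52, 102, plus the 16 and 128 bias values) are 6-bit fixed-point BT.601 coefficients, and the subu/shll_s/shra/addu tail of the macro is a saturating clamp to [0, 255]. Per pixel, the macro computes in effect (a scalar sketch with the clamp written out):

#include <stdint.h>

static uint8_t Clamp255(int v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

// Scalar model of I422ToTransientMipsRGB for a single pixel (sketch).
static void YuvPixelSketch(uint8_t y, uint8_t u, uint8_t v,
                           uint8_t* b, uint8_t* g, uint8_t* r) {
  int y1 = (y - 16) * 74;              // $s0 = |74|74|, bias $s4 = 16
  int ud = u - 128;                    // bias $s5 = 128
  int vd = v - 128;
  *b = Clamp255((ud * 127 + y1) >> 6);             // shll by 7 minus u = *127
  *g = Clamp255((ud * -25 + vd * -52 + y1) >> 6);  // $s1, $s2
  *r = Clamp255((vd * 102 + y1) >> 6);             // $s3
}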
void I422ToARGBRow_MIPS_DSPR2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"beqz %[width], 2f \n"
" repl.ph $s0, 74 \n" // |YG|YG| = |74|74|
"repl.ph $s1, -25 \n" // |UG|UG| = |-25|-25|
"repl.ph $s2, -52 \n" // |VG|VG| = |-52|-52|
"repl.ph $s3, 102 \n" // |VR|VR| = |102|102|
"repl.ph $s4, 16 \n" // |0|16|0|16|
"repl.ph $s5, 128 \n" // |128|128| // clipping
"lui $s6, 0xff00 \n"
"ori $s6, 0xff00 \n" // |ff|00|ff|00|
"1: \n"
I422ToTransientMipsRGB
// Arranging into argb format
"precr.qb.ph $t4, $t8, $t4 \n" // |G1|g1|B1|b1|
"precr.qb.ph $t5, $t9, $t5 \n" // |G0|g0|B0|b0|
"addiu %[width], -4 \n"
"precrq.qb.ph $t8, $t4, $t5 \n" // |G1|B1|G0|B0|
"precr.qb.ph $t9, $t4, $t5 \n" // |g1|b1|g0|b0|
"precr.qb.ph $t2, $t1, $t2 \n" // |R1|r1|R0|r0|

"addiu %[y_buf], 4 \n"
"preceu.ph.qbla $t1, $t2 \n" // |0 |R1|0 |R0|
"preceu.ph.qbra $t2, $t2 \n" // |0 |r1|0 |r0|
"or $t1, $t1, $s6 \n" // |ff|R1|ff|R0|
"or $t2, $t2, $s6 \n" // |ff|r1|ff|r0|
"precrq.ph.w $t0, $t2, $t9 \n" // |ff|r1|g1|b1|
"precrq.ph.w $t3, $t1, $t8 \n" // |ff|R1|G1|B1|
"sll $t9, $t9, 16 \n"
"sll $t8, $t8, 16 \n"
"packrl.ph $t2, $t2, $t9 \n" // |ff|r0|g0|b0|
"packrl.ph $t1, $t1, $t8 \n" // |ff|R0|G0|B0|
// Store results.
"sw $t2, 0(%[rgb_buf]) \n"
"sw $t0, 4(%[rgb_buf]) \n"
"sw $t1, 8(%[rgb_buf]) \n"
"sw $t3, 12(%[rgb_buf]) \n"
"bnez %[width], 1b \n"
" addiu %[rgb_buf], 16 \n"
"2: \n"
".set pop \n"
:[y_buf] "+r" (y_buf),
[u_buf] "+r" (u_buf),
[v_buf] "+r" (v_buf),
[width] "+r" (width),
[rgb_buf] "+r" (rgb_buf)
:
: "t0", "t1", "t2", "t3", "t4", "t5",
"t6", "t7", "t8", "t9",
"s0", "s1", "s2", "s3",
"s4", "s5", "s6"
);
}

void I422ToABGRRow_MIPS_DSPR2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
__asm__ __volatile__ (
".set push \n\t"
".set noreorder \n\t"
"beqz %[width], 2f \n\t"
" repl.ph $s0, 74 \n\t" // |YG|YG| = |74|74|
"repl.ph $s1, -25 \n\t" // |UG|UG| = |-25|-25|
"repl.ph $s2, -52 \n\t" // |VG|VG| = |-52|-52|
"repl.ph $s3, 102 \n\t" // |VR|VR| = |102|102|
"repl.ph $s4, 16 \n\t" // |0|16|0|16|
"repl.ph $s5, 128 \n\t" // |128|128|
"lui $s6, 0xff00 \n\t"
"ori $s6, 0xff00 \n\t" // |ff|00|ff|00|
"1: \n"
I422ToTransientMipsRGB
// Arranging into abgr format
"precr.qb.ph $t0, $t8, $t1 \n\t" // |G1|g1|R1|r1|
"precr.qb.ph $t3, $t9, $t2 \n\t" // |G0|g0|R0|r0|
"precrq.qb.ph $t8, $t0, $t3 \n\t" // |G1|R1|G0|R0|
"precr.qb.ph $t9, $t0, $t3 \n\t" // |g1|r1|g0|r0|

"precr.qb.ph $t2, $t4, $t5 \n\t" // |B1|b1|B0|b0|
"addiu %[width], -4 \n\t"
"addiu %[y_buf], 4 \n\t"
"preceu.ph.qbla $t1, $t2 \n\t" // |0 |B1|0 |B0|
"preceu.ph.qbra $t2, $t2 \n\t" // |0 |b1|0 |b0|
"or $t1, $t1, $s6 \n\t" // |ff|B1|ff|B0|
"or $t2, $t2, $s6 \n\t" // |ff|b1|ff|b0|
"precrq.ph.w $t0, $t2, $t9 \n\t" // |ff|b1|g1|r1|
"precrq.ph.w $t3, $t1, $t8 \n\t" // |ff|B1|G1|R1|
"sll $t9, $t9, 16 \n\t"
"sll $t8, $t8, 16 \n\t"
"packrl.ph $t2, $t2, $t9 \n\t" // |ff|b0|g0|r0|
"packrl.ph $t1, $t1, $t8 \n\t" // |ff|B0|G0|R0|
// Store results.
"sw $t2, 0(%[rgb_buf]) \n\t"
"sw $t0, 4(%[rgb_buf]) \n\t"
"sw $t1, 8(%[rgb_buf]) \n\t"
"sw $t3, 12(%[rgb_buf]) \n\t"
"bnez %[width], 1b \n\t"
" addiu %[rgb_buf], 16 \n\t"
"2: \n\t"
".set pop \n\t"
:[y_buf] "+r" (y_buf),
[u_buf] "+r" (u_buf),
[v_buf] "+r" (v_buf),
[width] "+r" (width),
[rgb_buf] "+r" (rgb_buf)
:
: "t0", "t1", "t2", "t3", "t4", "t5",
"t6", "t7", "t8", "t9",
"s0", "s1", "s2", "s3",
"s4", "s5", "s6"
);
}

void I422ToBGRARow_MIPS_DSPR2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"beqz %[width], 2f \n"
" repl.ph $s0, 74 \n" // |YG|YG| = |74 |74 |
"repl.ph $s1, -25 \n" // |UG|UG| = |-25|-25|
"repl.ph $s2, -52 \n" // |VG|VG| = |-52|-52|
"repl.ph $s3, 102 \n" // |VR|VR| = |102|102|
"repl.ph $s4, 16 \n" // |0|16|0|16|
"repl.ph $s5, 128 \n" // |128|128|
"lui $s6, 0xff \n"
"ori $s6, 0xff \n" // |00|ff|00|ff|
"1: \n"
I422ToTransientMipsRGB
// Arranging into bgra format
"precr.qb.ph $t4, $t4, $t8 \n" // |B1|b1|G1|g1|
"precr.qb.ph $t5, $t5, $t9 \n" // |B0|b0|G0|g0|
"precrq.qb.ph $t8, $t4, $t5 \n" // |B1|G1|B0|G0|
"precr.qb.ph $t9, $t4, $t5 \n" // |b1|g1|b0|g0|

"precr.qb.ph $t2, $t1, $t2 \n" // |R1|r1|R0|r0|
"addiu %[width], -4 \n"
"addiu %[y_buf], 4 \n"
"preceu.ph.qbla $t1, $t2 \n" // |0 |R1|0 |R0|
"preceu.ph.qbra $t2, $t2 \n" // |0 |r1|0 |r0|
"sll $t1, $t1, 8 \n" // |R1|0 |R0|0 |
"sll $t2, $t2, 8 \n" // |r1|0 |r0|0 |
"or $t1, $t1, $s6 \n" // |R1|ff|R0|ff|
"or $t2, $t2, $s6 \n" // |r1|ff|r0|ff|
"precrq.ph.w $t0, $t9, $t2 \n" // |b1|g1|r1|ff|
"precrq.ph.w $t3, $t8, $t1 \n" // |B1|G1|R1|ff|
"sll $t1, $t1, 16 \n"
"sll $t2, $t2, 16 \n"
"packrl.ph $t2, $t9, $t2 \n" // |b0|g0|r0|ff|
"packrl.ph $t1, $t8, $t1 \n" // |B0|G0|R0|ff|
// Store results.
"sw $t2, 0(%[rgb_buf]) \n"
"sw $t0, 4(%[rgb_buf]) \n"
"sw $t1, 8(%[rgb_buf]) \n"
"sw $t3, 12(%[rgb_buf]) \n"
"bnez %[width], 1b \n"
" addiu %[rgb_buf], 16 \n"
"2: \n"
".set pop \n"
:[y_buf] "+r" (y_buf),
[u_buf] "+r" (u_buf),
[v_buf] "+r" (v_buf),
[width] "+r" (width),
[rgb_buf] "+r" (rgb_buf)
:
: "t0", "t1", "t2", "t3", "t4", "t5",
"t6", "t7", "t8", "t9",
"s0", "s1", "s2", "s3",
"s4", "s5", "s6"
);
}

#endif // __mips__

#ifdef __cplusplus

@@ -1957,6 +1957,26 @@ void ScaleFilterRows_MIPS_DSPR2(unsigned char *dst_ptr,
const unsigned char* src_ptr,
ptrdiff_t src_stride,
int dst_width, int source_y_fraction);
#define HAS_SCALEROWDOWN4_MIPS_DSPR2
void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */,
uint8* dst, int dst_width);
void ScaleRowDown4Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
#define HAS_SCALEROWDOWN34_MIPS_DSPR2
void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */,
uint8* dst, int dst_width);
void ScaleRowDown34_0_Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* d, int dst_width);
void ScaleRowDown34_1_Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* d, int dst_width);
#define HAS_SCALEROWDOWN38_MIPS_DSPR2
void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */,
uint8* dst, int dst_width);
void ScaleRowDown38_2_Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown38_3_Int_MIPS_DSPR2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
#endif // defined(__mips_dsp) && (__mips_dsp_rev >= 2)

// CPU agnostic row functions
@@ -2368,6 +2388,13 @@ static void ScalePlaneDown4(int /* src_width */, int /* src_height */,
IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
ScaleRowDown4 = filtering ? ScaleRowDown4Int_SSE2 : ScaleRowDown4_SSE2;
}
#elif defined(HAS_SCALEROWDOWN4_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
ScaleRowDown4 = filtering ?
ScaleRowDown4Int_MIPS_DSPR2 : ScaleRowDown4_MIPS_DSPR2;
}
#endif

for (int y = 0; y < dst_height; ++y) {
@@ -2461,6 +2488,19 @@ static void ScalePlaneDown34(int /* src_width */, int /* src_height */,
}
}
#endif
#if defined(HAS_SCALEROWDOWN34_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 24 == 0) &&
IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
if (!filtering) {
ScaleRowDown34_0 = ScaleRowDown34_MIPS_DSPR2;
ScaleRowDown34_1 = ScaleRowDown34_MIPS_DSPR2;
} else {
ScaleRowDown34_0 = ScaleRowDown34_0_Int_MIPS_DSPR2;
ScaleRowDown34_1 = ScaleRowDown34_1_Int_MIPS_DSPR2;
}
}
#endif

for (int y = 0; y < dst_height - 2; y += 3) {
ScaleRowDown34_0(src_ptr, src_stride, dst_ptr, dst_width);
@@ -2541,6 +2581,18 @@ static void ScalePlaneDown38(int /* src_width */, int /* src_height */,
ScaleRowDown38_2 = ScaleRowDown38_2_Int_SSSE3;
}
}
#elif defined(HAS_SCALEROWDOWN38_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 12 == 0) &&
IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
if (!filtering) {
ScaleRowDown38_3 = ScaleRowDown38_MIPS_DSPR2;
ScaleRowDown38_2 = ScaleRowDown38_MIPS_DSPR2;
} else {
ScaleRowDown38_3 = ScaleRowDown38_3_Int_MIPS_DSPR2;
ScaleRowDown38_2 = ScaleRowDown38_2_Int_MIPS_DSPR2;
}
}
#endif

for (int y = 0; y < dst_height - 2; y += 3) {

@@ -173,6 +173,460 @@ void ScaleRowDown2Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
);
}

void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */,
uint8* dst, int dst_width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"

"srl $t9, %[dst_width], 3 \n"
"beqz $t9, 2f \n"
" nop \n"

"1: \n"
"lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4|
"lw $t3, 8(%[src_ptr]) \n" // |11|10|9|8|
"lw $t4, 12(%[src_ptr]) \n" // |15|14|13|12|
"lw $t5, 16(%[src_ptr]) \n" // |19|18|17|16|
"lw $t6, 20(%[src_ptr]) \n" // |23|22|21|20|
"lw $t7, 24(%[src_ptr]) \n" // |27|26|25|24|
"lw $t8, 28(%[src_ptr]) \n" // |31|30|29|28|
"precr.qb.ph $t1, $t2, $t1 \n" // |6|4|2|0|
"precr.qb.ph $t2, $t4, $t3 \n" // |14|12|10|8|
"precr.qb.ph $t5, $t6, $t5 \n" // |22|20|18|16|
"precr.qb.ph $t6, $t8, $t7 \n" // |30|28|26|24|
"precr.qb.ph $t1, $t2, $t1 \n" // |12|8|4|0|
"precr.qb.ph $t5, $t6, $t5 \n" // |28|24|20|16|
"addiu %[src_ptr], %[src_ptr], 32 \n"
"addiu $t9, $t9, -1 \n"
"sw $t1, 0(%[dst]) \n"
"sw $t5, 4(%[dst]) \n"
"bgtz $t9, 1b \n"
" addiu %[dst], %[dst], 8 \n"

"2: \n"
"andi $t9, %[dst_width], 7 \n" // residue
"beqz $t9, 3f \n"
" nop \n"

"21: \n"
"lbu $t1, 0(%[src_ptr]) \n"
"addiu %[src_ptr], %[src_ptr], 4 \n"
"addiu $t9, $t9, -1 \n"
"sb $t1, 0(%[dst]) \n"
"bgtz $t9, 21b \n"
" addiu %[dst], %[dst], 1 \n"

"3: \n"
".set pop \n"
: [src_ptr] "+r" (src_ptr),
[dst] "+r" (dst)
: [dst_width] "r" (dst_width)
: "t1", "t2", "t3", "t4", "t5",
"t6", "t7", "t8", "t9"
);
}

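Note: the three rounds of precr.qb.ph packing above boil down to keeping every fourth source byte, and the residue loop does the same one byte at a time. A portable model of the whole row function (sketch):

#include <stdint.h>

// Sketch of ScaleRowDown4: point-sample every fourth source pixel.
static void ScaleRowDown4Sketch(const uint8_t* src_ptr, uint8_t* dst,
                                int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    dst[x] = src_ptr[x * 4];
  }
}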
void ScaleRowDown4Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
intptr_t stride = src_stride;
const uint8* s1 = src_ptr + stride;
const uint8* s2 = s1 + stride;
const uint8* s3 = s2 + stride;

__asm__ __volatile__ (
".set push \n"
".set noreorder \n"

"srl $t9, %[dst_width], 1 \n"
"andi $t8, %[dst_width], 1 \n"

"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t1, 0(%[s1]) \n" // |7|6|5|4|
"lw $t2, 0(%[s2]) \n" // |11|10|9|8|
"lw $t3, 0(%[s3]) \n" // |15|14|13|12|
"lw $t4, 4(%[src_ptr]) \n" // |19|18|17|16|
"lw $t5, 4(%[s1]) \n" // |23|22|21|20|
"lw $t6, 4(%[s2]) \n" // |27|26|25|24|
"lw $t7, 4(%[s3]) \n" // |31|30|29|28|
"raddu.w.qb $t0, $t0 \n" // |3 + 2 + 1 + 0|
"raddu.w.qb $t1, $t1 \n" // |7 + 6 + 5 + 4|
"raddu.w.qb $t2, $t2 \n" // |11 + 10 + 9 + 8|
"raddu.w.qb $t3, $t3 \n" // |15 + 14 + 13 + 12|
"raddu.w.qb $t4, $t4 \n" // |19 + 18 + 17 + 16|
"raddu.w.qb $t5, $t5 \n" // |23 + 22 + 21 + 20|
"raddu.w.qb $t6, $t6 \n" // |27 + 26 + 25 + 24|
"raddu.w.qb $t7, $t7 \n" // |31 + 30 + 29 + 28|
"add $t0, $t0, $t1 \n"
"add $t1, $t2, $t3 \n"
"add $t0, $t0, $t1 \n"
"add $t4, $t4, $t5 \n"
"add $t6, $t6, $t7 \n"
"add $t4, $t4, $t6 \n"
"shra_r.w $t0, $t0, 4 \n"
"shra_r.w $t4, $t4, 4 \n"
"sb $t0, 0(%[dst]) \n"
"sb $t4, 1(%[dst]) \n"
"addiu %[src_ptr], %[src_ptr], 8 \n"
"addiu %[s1], %[s1], 8 \n"
"addiu %[s2], %[s2], 8 \n"
"addiu %[s3], %[s3], 8 \n"
"addiu $t9, $t9, -1 \n"
"bgtz $t9, 1b \n"
" addiu %[dst], %[dst], 2 \n"
"beqz $t8, 2f \n"
" nop \n"

"lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t1, 0(%[s1]) \n" // |7|6|5|4|
"lw $t2, 0(%[s2]) \n" // |11|10|9|8|
"lw $t3, 0(%[s3]) \n" // |15|14|13|12|
"raddu.w.qb $t0, $t0 \n" // |3 + 2 + 1 + 0|
"raddu.w.qb $t1, $t1 \n" // |7 + 6 + 5 + 4|
"raddu.w.qb $t2, $t2 \n" // |11 + 10 + 9 + 8|
"raddu.w.qb $t3, $t3 \n" // |15 + 14 + 13 + 12|
"add $t0, $t0, $t1 \n"
"add $t1, $t2, $t3 \n"
"add $t0, $t0, $t1 \n"
"shra_r.w $t0, $t0, 4 \n"
"sb $t0, 0(%[dst]) \n"

"2: \n"
".set pop \n"

: [src_ptr] "+r" (src_ptr),
[dst] "+r" (dst),
[s1] "+r" (s1),
[s2] "+r" (s2),
[s3] "+r" (s3)
: [dst_width] "r" (dst_width)
: "t0", "t1", "t2", "t3", "t4", "t5",
"t6","t7", "t8", "t9"
);
}

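Note: raddu.w.qb sums the four bytes of a word, so each output pixel above is the rounded average of a 4x4 block; the shra_r.w by 4 divides the 16-pixel sum by 16 with rounding. A portable model (sketch):

#include <stdint.h>
#include <stddef.h>

// Sketch of ScaleRowDown4Int: rounded 4x4 box filter per output pixel.
static void ScaleRowDown4IntSketch(const uint8_t* src_ptr, ptrdiff_t src_stride,
                                   uint8_t* dst, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    const uint8_t* s = src_ptr + x * 4;
    int sum = 0;
    for (int row = 0; row < 4; ++row) {
      for (int col = 0; col < 4; ++col) {
        sum += s[row * src_stride + col];
      }
    }
    dst[x] = (uint8_t)((sum + 8) >> 4);  // matches "shra_r.w $t0, $t0, 4"
  }
}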
void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */,
|
||||
uint8* dst, int dst_width) {
|
||||
__asm__ __volatile__ (
|
||||
".set push \n"
|
||||
".set noreorder \n"
|
||||
"1: \n"
|
||||
"lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0|
|
||||
"lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4|
|
||||
"lw $t3, 8(%[src_ptr]) \n" // |11|10|9|8|
|
||||
"lw $t4, 12(%[src_ptr]) \n" // |15|14|13|12|
|
||||
"lw $t5, 16(%[src_ptr]) \n" // |19|18|17|16|
|
||||
"lw $t6, 20(%[src_ptr]) \n" // |23|22|21|20|
|
||||
"lw $t7, 24(%[src_ptr]) \n" // |27|26|25|24|
|
||||
"lw $t8, 28(%[src_ptr]) \n" // |31|30|29|28|
|
||||
"precrq.qb.ph $t0, $t2, $t4 \n" // |7|5|15|13|
|
||||
"precrq.qb.ph $t9, $t6, $t8 \n" // |23|21|31|30|
|
||||
"addiu %[dst_width], %[dst_width], -24 \n"
|
||||
"ins $t1, $t1, 8, 16 \n" // |3|1|0|X|
|
||||
"ins $t4, $t0, 8, 16 \n" // |X|15|13|12|
|
||||
"ins $t5, $t5, 8, 16 \n" // |19|17|16|X|
|
||||
"ins $t8, $t9, 8, 16 \n" // |X|31|29|28|
|
||||
"addiu %[src_ptr], %[src_ptr], 32 \n"
|
||||
"packrl.ph $t0, $t3, $t0 \n" // |9|8|7|5|
|
||||
"packrl.ph $t9, $t7, $t9 \n" // |25|24|23|21|
|
||||
"prepend $t1, $t2, 8 \n" // |4|3|1|0|
|
||||
"prepend $t3, $t4, 24 \n" // |15|13|12|11|
|
||||
"prepend $t5, $t6, 8 \n" // |20|19|17|16|
|
||||
"prepend $t7, $t8, 24 \n" // |31|29|28|27|
|
||||
"sw $t1, 0(%[dst]) \n"
|
||||
"sw $t0, 4(%[dst]) \n"
|
||||
"sw $t3, 8(%[dst]) \n"
|
||||
"sw $t5, 12(%[dst]) \n"
|
||||
"sw $t9, 16(%[dst]) \n"
|
||||
"sw $t7, 20(%[dst]) \n"
|
||||
"bnez %[dst_width], 1b \n"
|
||||
" addiu %[dst], %[dst], 24 \n"
|
||||
".set pop \n"
|
||||
: [src_ptr] "+r" (src_ptr),
|
||||
[dst] "+r" (dst),
|
||||
[dst_width] "+r" (dst_width)
|
||||
:
|
||||
: "t0", "t1", "t2", "t3", "t4", "t5",
|
||||
"t6","t7", "t8", "t9"
|
||||
);
|
||||
}
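
// Editor's note: all of the precrq.qb.ph / ins / packrl.ph / prepend work
// above is pure byte repacking. Of every four source pixels, pixels 0, 1
// and 3 survive, 24 outputs per iteration. The selection pattern in C
// (a sketch; like the assembly, it assumes dst_width is a multiple of 3):
static void ScaleRowDown34_Sketch(const uint8* src_ptr, uint8* dst,
                                  int dst_width) {
  for (int x = 0; x < dst_width; x += 3) {
    dst[x + 0] = src_ptr[0];
    dst[x + 1] = src_ptr[1];
    dst[x + 2] = src_ptr[3];  // pixel 2 of each group of 4 is dropped
    src_ptr += 4;
  }
}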

void ScaleRowDown34_0_Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
                                     uint8* d, int dst_width) {
  __asm__ __volatile__ (
      ".set push                                     \n"
      ".set noreorder                                \n"
      "repl.ph        $t3, 3                         \n"  // 0x00030003
    "1:                                              \n"
      "lw             $t0, 0(%[src_ptr])             \n"  // |S3|S2|S1|S0|
      "lwx            $t1, %[src_stride](%[src_ptr]) \n"  // |T3|T2|T1|T0|
      "rotr           $t2, $t0, 8                    \n"  // |S0|S3|S2|S1|
      "rotr           $t6, $t1, 8                    \n"  // |T0|T3|T2|T1|
      "muleu_s.ph.qbl $t4, $t2, $t3                  \n"  // |S0*3|S3*3|
      "muleu_s.ph.qbl $t5, $t6, $t3                  \n"  // |T0*3|T3*3|
      "andi           $t0, $t2, 0xFFFF               \n"  // |0|0|S2|S1|
      "andi           $t1, $t6, 0xFFFF               \n"  // |0|0|T2|T1|
      "raddu.w.qb     $t0, $t0                       \n"
      "raddu.w.qb     $t1, $t1                       \n"
      "shra_r.w       $t0, $t0, 1                    \n"
      "shra_r.w       $t1, $t1, 1                    \n"
      "preceu.ph.qbr  $t2, $t2                       \n"  // |0|S2|0|S1|
      "preceu.ph.qbr  $t6, $t6                       \n"  // |0|T2|0|T1|
      "rotr           $t2, $t2, 16                   \n"  // |0|S1|0|S2|
      "rotr           $t6, $t6, 16                   \n"  // |0|T1|0|T2|
      "addu.ph        $t2, $t2, $t4                  \n"
      "addu.ph        $t6, $t6, $t5                  \n"
      "sll            $t5, $t0, 1                    \n"
      "add            $t0, $t5, $t0                  \n"
      "shra_r.ph      $t2, $t2, 2                    \n"
      "shra_r.ph      $t6, $t6, 2                    \n"
      "shll.ph        $t4, $t2, 1                    \n"
      "addq.ph        $t4, $t4, $t2                  \n"
      "addu           $t0, $t0, $t1                  \n"
      "addiu          %[src_ptr], %[src_ptr], 4      \n"
      "shra_r.w       $t0, $t0, 2                    \n"
      "addu.ph        $t6, $t6, $t4                  \n"
      "shra_r.ph      $t6, $t6, 2                    \n"
      "srl            $t1, $t6, 16                   \n"
      "addiu          %[dst_width], %[dst_width], -3 \n"
      "sb             $t1, 0(%[d])                   \n"
      "sb             $t0, 1(%[d])                   \n"
      "sb             $t6, 2(%[d])                   \n"
      "bgtz           %[dst_width], 1b               \n"
      " addiu         %[d], %[d], 3                  \n"
    "3:                                              \n"
      ".set pop                                      \n"
      : [src_ptr] "+r" (src_ptr),
        [src_stride] "+r" (src_stride),
        [d] "+r" (d),
        [dst_width] "+r" (dst_width)
      :
      : "t0", "t1", "t2", "t3",
        "t4", "t5", "t6"
  );
}
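
// Editor's note: each iteration reads four pixels from two rows (S, and T
// one stride below), horizontally filters them with 3:1 / 1:1 / 1:3 weights
// into three values per row, then blends the rows 3:1 in favor of S. A
// scalar sketch of the per-group math, modeled on libyuv's C 3/4 box
// filter; treat the name and layout as illustrative:
static void ScaleRowDown34_0_Int_Sketch(const uint8* s, ptrdiff_t stride,
                                        uint8* d, int dst_width) {
  const uint8* t = s + stride;
  for (int x = 0; x < dst_width; x += 3) {
    uint8 a0 = (s[0] * 3 + s[1] + 2) >> 2;  // muleu_s.ph.qbl + shra_r.ph
    uint8 a1 = (s[1] + s[2] + 1) >> 1;      // raddu.w.qb + shra_r.w
    uint8 a2 = (s[2] + s[3] * 3 + 2) >> 2;
    uint8 b0 = (t[0] * 3 + t[1] + 2) >> 2;
    uint8 b1 = (t[1] + t[2] + 1) >> 1;
    uint8 b2 = (t[2] + t[3] * 3 + 2) >> 2;
    d[x + 0] = (a0 * 3 + b0 + 2) >> 2;      // 3:1 vertical blend toward S
    d[x + 1] = (a1 * 3 + b1 + 2) >> 2;
    d[x + 2] = (a2 * 3 + b2 + 2) >> 2;
    s += 4;
    t += 4;
  }
}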

void ScaleRowDown34_1_Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
                                     uint8* d, int dst_width) {
  __asm__ __volatile__ (
      ".set push                                     \n"
      ".set noreorder                                \n"
      "repl.ph        $t2, 3                         \n"  // 0x00030003
    "1:                                              \n"
      "lw             $t0, 0(%[src_ptr])             \n"  // |S3|S2|S1|S0|
      "lwx            $t1, %[src_stride](%[src_ptr]) \n"  // |T3|T2|T1|T0|
      "rotr           $t4, $t0, 8                    \n"  // |S0|S3|S2|S1|
      "rotr           $t6, $t1, 8                    \n"  // |T0|T3|T2|T1|
      "muleu_s.ph.qbl $t3, $t4, $t2                  \n"  // |S0*3|S3*3|
      "muleu_s.ph.qbl $t5, $t6, $t2                  \n"  // |T0*3|T3*3|
      "andi           $t0, $t4, 0xFFFF               \n"  // |0|0|S2|S1|
      "andi           $t1, $t6, 0xFFFF               \n"  // |0|0|T2|T1|
      "raddu.w.qb     $t0, $t0                       \n"
      "raddu.w.qb     $t1, $t1                       \n"
      "shra_r.w       $t0, $t0, 1                    \n"
      "shra_r.w       $t1, $t1, 1                    \n"
      "preceu.ph.qbr  $t4, $t4                       \n"  // |0|S2|0|S1|
      "preceu.ph.qbr  $t6, $t6                       \n"  // |0|T2|0|T1|
      "rotr           $t4, $t4, 16                   \n"  // |0|S1|0|S2|
      "rotr           $t6, $t6, 16                   \n"  // |0|T1|0|T2|
      "addu.ph        $t4, $t4, $t3                  \n"
      "addu.ph        $t6, $t6, $t5                  \n"
      "shra_r.ph      $t6, $t6, 2                    \n"
      "shra_r.ph      $t4, $t4, 2                    \n"
      "addu.ph        $t6, $t6, $t4                  \n"
      "addiu          %[src_ptr], %[src_ptr], 4      \n"
      "shra_r.ph      $t6, $t6, 1                    \n"
      "addu           $t0, $t0, $t1                  \n"
      "addiu          %[dst_width], %[dst_width], -3 \n"
      "shra_r.w       $t0, $t0, 1                    \n"
      "srl            $t1, $t6, 16                   \n"
      "sb             $t1, 0(%[d])                   \n"
      "sb             $t0, 1(%[d])                   \n"
      "sb             $t6, 2(%[d])                   \n"
      "bgtz           %[dst_width], 1b               \n"
      " addiu         %[d], %[d], 3                  \n"
    "3:                                              \n"
      ".set pop                                      \n"
      : [src_ptr] "+r" (src_ptr),
        [src_stride] "+r" (src_stride),
        [d] "+r" (d),
        [dst_width] "+r" (dst_width)
      :
      : "t0", "t1", "t2", "t3",
        "t4", "t5", "t6"
  );
}
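
// Editor's note: the _1_ variant applies the same horizontal filter as the
// _0_ variant above; only the vertical blend changes, from 3:1 to a rounded
// 1:1 average ("shra_r.ph $t6, $t6, 1" instead of the *3-then->>2 sequence).
// The difference as a one-line sketch (helper name is illustrative):
static inline uint8 BlendRows11(uint8 a, uint8 b) {
  return static_cast<uint8>((a + b + 1) >> 1);  // rounded 1:1 row blend
}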

void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */,
                               uint8* dst, int dst_width) {
  __asm__ __volatile__ (
      ".set push                                     \n"
      ".set noreorder                                \n"
    "1:                                              \n"
      "lw             $t0, 0(%[src_ptr])             \n"  // |3|2|1|0|
      "lw             $t1, 4(%[src_ptr])             \n"  // |7|6|5|4|
      "lw             $t2, 8(%[src_ptr])             \n"  // |11|10|9|8|
      "lw             $t3, 12(%[src_ptr])            \n"  // |15|14|13|12|
      "lw             $t4, 16(%[src_ptr])            \n"  // |19|18|17|16|
      "lw             $t5, 20(%[src_ptr])            \n"  // |23|22|21|20|
      "lw             $t6, 24(%[src_ptr])            \n"  // |27|26|25|24|
      "lw             $t7, 28(%[src_ptr])            \n"  // |31|30|29|28|
      "wsbh           $t0, $t0                       \n"  // |2|3|0|1|
      "wsbh           $t6, $t6                       \n"  // |26|27|24|25|
      "srl            $t0, $t0, 8                    \n"  // |X|2|3|0|
      "srl            $t3, $t3, 16                   \n"  // |X|X|15|14|
      "srl            $t5, $t5, 16                   \n"  // |X|X|23|22|
      "srl            $t7, $t7, 16                   \n"  // |X|X|31|30|
      "ins            $t1, $t2, 24, 8                \n"  // |8|6|5|4|
      "ins            $t6, $t5, 0, 8                 \n"  // |26|27|24|22|
      "ins            $t1, $t0, 0, 16                \n"  // |8|6|3|0|
      "ins            $t6, $t7, 24, 8                \n"  // |30|27|24|22|
      "prepend        $t2, $t3, 24                   \n"  // |X|15|14|11|
      "ins            $t4, $t4, 16, 8                \n"  // |19|16|17|X|
      "ins            $t4, $t2, 0, 16                \n"  // |19|16|14|11|
      "addiu          %[src_ptr], %[src_ptr], 32     \n"
      "addiu          %[dst_width], %[dst_width], -12 \n"
      "addiu          $t8, %[dst_width], -12         \n"
      "sw             $t1, 0(%[dst])                 \n"
      "sw             $t4, 4(%[dst])                 \n"
      "sw             $t6, 8(%[dst])                 \n"
      "bgez           $t8, 1b                        \n"
      " addiu         %[dst], %[dst], 12             \n"
      ".set pop                                      \n"
      : [src_ptr] "+r" (src_ptr),
        [dst] "+r" (dst),
        [dst_width] "+r" (dst_width)
      :
      : "t0", "t1", "t2", "t3", "t4",
        "t5", "t6", "t7", "t8"
  );
}
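
// Editor's note: despite the dense shuffling, the result is simple point
// sampling: 32 source pixels in, 12 out, keeping pixels 0, 3 and 6 of every
// 8 (see the packed-byte comments above). In C (sketch; name illustrative):
static void ScaleRowDown38_Sketch(const uint8* src_ptr, uint8* dst,
                                  int dst_width) {
  for (int x = 0; x < dst_width; x += 3) {
    dst[x + 0] = src_ptr[0];
    dst[x + 1] = src_ptr[3];
    dst[x + 2] = src_ptr[6];
    src_ptr += 8;
  }
}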

void ScaleRowDown38_2_Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
                                     uint8* dst_ptr, int dst_width) {
  intptr_t stride = src_stride;
  const uint8* t = src_ptr + stride;
  const int c = 0x2AAA;

  __asm__ __volatile__ (
      ".set push                                     \n"
      ".set noreorder                                \n"
    "1:                                              \n"
      "lw             $t0, 0(%[src_ptr])             \n"  // |S3|S2|S1|S0|
      "lw             $t1, 4(%[src_ptr])             \n"  // |S7|S6|S5|S4|
      "lw             $t2, 0(%[t])                   \n"  // |T3|T2|T1|T0|
      "lw             $t3, 4(%[t])                   \n"  // |T7|T6|T5|T4|
      "rotr           $t1, $t1, 16                   \n"  // |S5|S4|S7|S6|
      "packrl.ph      $t4, $t1, $t3                  \n"  // |S7|S6|T7|T6|
      "packrl.ph      $t5, $t3, $t1                  \n"  // |T5|T4|S5|S4|
      "raddu.w.qb     $t4, $t4                       \n"  // S7+S6+T7+T6
      "raddu.w.qb     $t5, $t5                       \n"  // T5+T4+S5+S4
      "precrq.qb.ph   $t6, $t0, $t2                  \n"  // |S3|S1|T3|T1|
      "precrq.qb.ph   $t6, $t6, $t6                  \n"  // |S3|T3|S3|T3|
      "srl            $t4, $t4, 2                    \n"  // t4 / 4
      "srl            $t6, $t6, 16                   \n"  // |0|0|S3|T3|
      "raddu.w.qb     $t6, $t6                       \n"  // 0+0+S3+T3
      "addu           $t6, $t5, $t6                  \n"
      "mul            $t6, $t6, %[c]                 \n"  // t6 * 0x2AAA
      "sll            $t0, $t0, 8                    \n"  // |S2|S1|S0|0|
      "sll            $t2, $t2, 8                    \n"  // |T2|T1|T0|0|
      "raddu.w.qb     $t0, $t0                       \n"  // S2+S1+S0+0
      "raddu.w.qb     $t2, $t2                       \n"  // T2+T1+T0+0
      "addu           $t0, $t0, $t2                  \n"
      "mul            $t0, $t0, %[c]                 \n"  // t0 * 0x2AAA
      "addiu          %[src_ptr], %[src_ptr], 8      \n"
      "addiu          %[t], %[t], 8                  \n"
      "addiu          %[dst_width], %[dst_width], -3 \n"
      "addiu          %[dst_ptr], %[dst_ptr], 3      \n"
      "srl            $t6, $t6, 16                   \n"
      "srl            $t0, $t0, 16                   \n"
      "sb             $t4, -1(%[dst_ptr])            \n"
      "sb             $t6, -2(%[dst_ptr])            \n"
      "bgtz           %[dst_width], 1b               \n"
      " sb            $t0, -3(%[dst_ptr])            \n"
      ".set pop                                      \n"
      : [src_ptr] "+r" (src_ptr),
        [dst_ptr] "+r" (dst_ptr),
        [t] "+r" (t),
        [dst_width] "+r" (dst_width)
      : [c] "r" (c)
      : "t0", "t1", "t2", "t3", "t4", "t5", "t6"
  );
}
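
// Editor's note: the magic constant is fixed-point division. 0x2AAA = 10922
// ~= 65536 / 6, so "mul" followed by "srl ..., 16" divides a six-pixel sum
// by 6 without a hardware divider. Per output triple, eight source columns
// across two rows are box-filtered as 6 + 6 + 4 pixels. A scalar sketch
// (column grouping taken from the register comments above; name
// illustrative):
static void ScaleRowDown38_2_Int_Sketch(const uint8* s, ptrdiff_t stride,
                                        uint8* d, int dst_width) {
  const uint8* t = s + stride;
  for (int x = 0; x < dst_width; x += 3) {
    d[x + 0] = static_cast<uint8>(
        ((s[0] + s[1] + s[2] + t[0] + t[1] + t[2]) * 0x2AAA) >> 16);
    d[x + 1] = static_cast<uint8>(
        ((s[3] + s[4] + s[5] + t[3] + t[4] + t[5]) * 0x2AAA) >> 16);
    d[x + 2] = static_cast<uint8>((s[6] + s[7] + t[6] + t[7]) >> 2);
    s += 8;
    t += 8;
  }
}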

void ScaleRowDown38_3_Int_MIPS_DSPR2(const uint8* src_ptr,
                                     ptrdiff_t src_stride,
                                     uint8* dst_ptr, int dst_width) {
  intptr_t stride = src_stride;
  const uint8* s1 = src_ptr + stride;
  stride += stride;
  const uint8* s2 = src_ptr + stride;
  const int c1 = 0x1C71;
  const int c2 = 0x2AAA;

  __asm__ __volatile__ (
      ".set push                                     \n"
      ".set noreorder                                \n"
    "1:                                              \n"
      "lw             $t0, 0(%[src_ptr])             \n"  // |S3|S2|S1|S0|
      "lw             $t1, 4(%[src_ptr])             \n"  // |S7|S6|S5|S4|
      "lw             $t2, 0(%[s1])                  \n"  // |T3|T2|T1|T0|
      "lw             $t3, 4(%[s1])                  \n"  // |T7|T6|T5|T4|
      "lw             $t4, 0(%[s2])                  \n"  // |R3|R2|R1|R0|
      "lw             $t5, 4(%[s2])                  \n"  // |R7|R6|R5|R4|
      "rotr           $t1, $t1, 16                   \n"  // |S5|S4|S7|S6|
      "packrl.ph      $t6, $t1, $t3                  \n"  // |S7|S6|T7|T6|
      "raddu.w.qb     $t6, $t6                       \n"  // S7+S6+T7+T6
      "packrl.ph      $t7, $t3, $t1                  \n"  // |T5|T4|S5|S4|
      "raddu.w.qb     $t7, $t7                       \n"  // T5+T4+S5+S4
      "sll            $t8, $t5, 16                   \n"  // |R5|R4|0|0|
      "raddu.w.qb     $t8, $t8                       \n"  // R5+R4
      "addu           $t7, $t7, $t8                  \n"
      "srl            $t8, $t5, 16                   \n"  // |0|0|R7|R6|
      "raddu.w.qb     $t8, $t8                       \n"  // R7+R6
      "addu           $t6, $t6, $t8                  \n"
      "mul            $t6, $t6, %[c2]                \n"  // t6 * 0x2AAA
      "precrq.qb.ph   $t8, $t0, $t2                  \n"  // |S3|S1|T3|T1|
      "precrq.qb.ph   $t8, $t8, $t4                  \n"  // |S3|T3|R3|R1|
      "srl            $t8, $t8, 8                    \n"  // |0|S3|T3|R3|
      "raddu.w.qb     $t8, $t8                       \n"  // S3+T3+R3
      "addu           $t7, $t7, $t8                  \n"
      "mul            $t7, $t7, %[c1]                \n"  // t7 * 0x1C71
      "sll            $t0, $t0, 8                    \n"  // |S2|S1|S0|0|
      "sll            $t2, $t2, 8                    \n"  // |T2|T1|T0|0|
      "sll            $t4, $t4, 8                    \n"  // |R2|R1|R0|0|
      "raddu.w.qb     $t0, $t0                       \n"
      "raddu.w.qb     $t2, $t2                       \n"
      "raddu.w.qb     $t4, $t4                       \n"
      "addu           $t0, $t0, $t2                  \n"
      "addu           $t0, $t0, $t4                  \n"
      "mul            $t0, $t0, %[c1]                \n"  // t0 * 0x1C71
      "addiu          %[src_ptr], %[src_ptr], 8      \n"
      "addiu          %[s1], %[s1], 8                \n"
      "addiu          %[s2], %[s2], 8                \n"
      "addiu          %[dst_width], %[dst_width], -3 \n"
      "addiu          %[dst_ptr], %[dst_ptr], 3      \n"
      "srl            $t6, $t6, 16                   \n"
      "srl            $t7, $t7, 16                   \n"
      "srl            $t0, $t0, 16                   \n"
      "sb             $t6, -1(%[dst_ptr])            \n"
      "sb             $t7, -2(%[dst_ptr])            \n"
      "bgtz           %[dst_width], 1b               \n"
      " sb            $t0, -3(%[dst_ptr])            \n"
      ".set pop                                      \n"
      : [src_ptr] "+r" (src_ptr),
        [dst_ptr] "+r" (dst_ptr),
        [s1] "+r" (s1),
        [s2] "+r" (s2),
        [dst_width] "+r" (dst_width)
      : [c1] "r" (c1), [c2] "r" (c2)
      : "t0", "t1", "t2", "t3", "t4",
        "t5", "t6", "t7", "t8"
  );
}
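
// Editor's note: the three-row variant extends the same fixed-point trick.
// 0x1C71 = 7281 ~= 65536 / 9 divides the nine-pixel sums by 9, while the
// trailing six-pixel sum reuses 0x2AAA for /6. Sketch under the same
// assumptions as the two-row version above:
static void ScaleRowDown38_3_Int_Sketch(const uint8* s, ptrdiff_t stride,
                                        uint8* d, int dst_width) {
  const uint8* t = s + stride;
  const uint8* r = s + stride * 2;
  for (int x = 0; x < dst_width; x += 3) {
    d[x + 0] = static_cast<uint8>(
        ((s[0] + s[1] + s[2] + t[0] + t[1] + t[2] + r[0] + r[1] + r[2]) *
         0x1C71) >> 16);  // 9 pixels, ~/9
    d[x + 1] = static_cast<uint8>(
        ((s[3] + s[4] + s[5] + t[3] + t[4] + t[5] + r[3] + r[4] + r[5]) *
         0x1C71) >> 16);  // 9 pixels, ~/9
    d[x + 2] = static_cast<uint8>(
        ((s[6] + s[7] + t[6] + t[7] + r[6] + r[7]) * 0x2AAA) >> 16);  // ~/6
    s += 8;
    t += 8;
    r += 8;
  }
}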

void ScaleFilterRows_MIPS_DSPR2(unsigned char *dst_ptr,
                                const unsigned char* src_ptr,
                                ptrdiff_t src_stride,

@ -630,4 +630,70 @@ TEST_F(libyuvTest, TestAffine) {
#endif
}

TEST_F(libyuvTest, TestCopyPlane) {
  int err = 0;
  int yw = benchmark_width_;
  int yh = benchmark_height_;
  int b = 12;
  int i, j;

  int y_plane_size = (yw + b * 2) * (yh + b * 2);
  srandom(time(NULL));
  align_buffer_16(orig_y, y_plane_size)
  align_buffer_16(dst_c, y_plane_size)
  align_buffer_16(dst_opt, y_plane_size)

  memset(orig_y, 0, y_plane_size);
  memset(dst_c, 0, y_plane_size);
  memset(dst_opt, 0, y_plane_size);

  // Fill the source image buffer with random data.
  for (i = b; i < (yh + b); ++i) {
    for (j = b; j < (yw + b); ++j) {
      orig_y[i * (yw + b * 2) + j] = random() & 0xff;
    }
  }

  // Fill destination buffers with random data.
  for (i = 0; i < y_plane_size; ++i) {
    uint8 random_number = random() & 0x7f;
    dst_c[i] = random_number;
    dst_opt[i] = dst_c[i];
  }

  int y_off = b * (yw + b * 2) + b;

  int y_st = yw + b * 2;
  int stride = 8;

  // Disable all optimizations.
  MaskCpuFlags(0);
  double c_time = get_time();
  for (j = 0; j < benchmark_iterations_; j++) {
    CopyPlane(orig_y + y_off, y_st, dst_c + y_off, stride, yw, yh);
  }
  c_time = (get_time() - c_time) / benchmark_iterations_;

  // Enable optimizations.
  MaskCpuFlags(-1);
  double opt_time = get_time();
  for (j = 0; j < benchmark_iterations_; j++) {
    CopyPlane(orig_y + y_off, y_st, dst_opt + y_off, stride, yw, yh);
  }
  opt_time = (get_time() - opt_time) / benchmark_iterations_;
  printf(" %8d us C - %8d us OPT\n",
         static_cast<int>(c_time * 1e6), static_cast<int>(opt_time * 1e6));

  for (i = 0; i < y_plane_size; ++i) {
    if (dst_c[i] != dst_opt[i])
      ++err;
  }

  free_aligned_buffer_16(orig_y)
  free_aligned_buffer_16(dst_c)
  free_aligned_buffer_16(dst_opt)

  EXPECT_EQ(0, err);
}

}  // namespace libyuv