diff --git a/libyuv.gyp b/libyuv.gyp
index 305faef52..0fd33b35a 100644
--- a/libyuv.gyp
+++ b/libyuv.gyp
@@ -75,6 +75,7 @@
         'source/convert_from_argb.cc',
         'source/cpu_id.cc',
         'source/format_conversion.cc',
+        'source/memcpy_mips.S',
         'source/mjpeg_decoder.cc',
         'source/planar_functions.cc',
         'source/rotate.cc',
diff --git a/source/memcpy_mips.S b/source/memcpy_mips.S
new file mode 100644
index 000000000..83292d2f9
--- /dev/null
+++ b/source/memcpy_mips.S
@@ -0,0 +1,346 @@
+#if defined (__mips__)
+
+    .globl  memcpy_MIPS;
+    .align  2;
+    .type   memcpy_MIPS,@function;
+    .ent    memcpy_MIPS,0;
+memcpy_MIPS:                        # void *memcpy_MIPS(dst=a0, src=a1, n=a2)
+    .frame  $sp,0,$ra
+    .set    noreorder               # branch delay slots are filled by hand below
+    .set    noat
+
+    slti    $at,$a2,8
+    bne     $at,$zero,last8         # fewer than 8 bytes: plain byte copy
+    move    $v0,$a0                 # memcpy returns the dst pointer (delay slot)
+
+# Test if the src and dst are word-aligned, or can be made word-aligned
+    xor     $t8,$a1,$a0
+    andi    $t8,$t8,0x3             # t8 is a0/a1 word-displacement
+
+    bne     $t8,$zero,unaligned     # displaced: can never be mutually word-aligned
+    negu    $a3,$a0                 # a3 = -a0 (delay slot)
+
+    andi    $a3,$a3,0x3             # we need to copy a3 bytes to make a0/a1 aligned
+    beq     $a3,$zero,chk16w        # when a3=0 then the dst (a0) is word-aligned
+    subu    $a2,$a2,$a3             # now a2 is the remaining bytes count (delay slot)
+
+    lwr     $t8,0($a1)              # copy the 1..3 leading bytes
+    addu    $a1,$a1,$a3
+    swr     $t8,0($a0)
+    addu    $a0,$a0,$a3
+
+# Now the dst/src are mutually word-aligned with word-aligned addresses
+chk16w: andi    $t8,$a2,0x3f        # any whole 64-byte chunks?
+                                    # t8 is the byte count after 64-byte chunks
+
+    beq     $a2,$t8,chk8w           # if a2==t8, no 64-byte chunks
+                                    # There will be at most 1 32-byte chunk after it
+    subu    $a3,$a2,$t8             # subtract from a2 the remainder (delay slot)
+                                    # Here a3 counts bytes in 16w chunks
+    addu    $a3,$a0,$a3             # Now a3 is the final dst after 64-byte chunks
+
+    addu    $t0,$a0,$a2             # t0 is the "past the end" address
+
+# When in the loop we exercise "pref 30,x(a0)", the a0+x should not be past
+# the "t0-32" address
+# This means: for x=128 the last "safe" a0 address is "t0-160"
+# Alternatively, for x=64 the last "safe" a0 address is "t0-96"
+# In the current version we will use "pref 30,128(a0)", so "t0-160" is the limit
+    subu    $t9,$t0,160             # t9 is the "last safe pref 30,128(a0)" address
+
+    pref    0,0($a1)                # bring the first line of src, addr 0
+    pref    0,32($a1)               # bring the second line of src, addr 32
+    pref    0,64($a1)               # bring the third line of src, addr 64
+    pref    30,32($a0)              # safe, as we have at least 64 bytes ahead
+# In case the a0 > t9 don't use "pref 30" at all
+    sgtu    $v1,$a0,$t9
+    bgtz    $v1,loop16w             # skip "pref 30,64(a0)" for too short arrays
+    nop                             # delay slot
+# otherwise, start with using pref30
+    pref    30,64($a0)
+loop16w:
+    pref    0,96($a1)
+    lw      $t0,0($a1)
+    bgtz    $v1,skip_pref30_96      # skip "pref 30,96(a0)"
+    lw      $t1,4($a1)              # delay slot: executes on both paths
+    pref    30,96($a0)              # continue setting up the dest, addr 96
+skip_pref30_96:
+    lw      $t2,8($a1)
+    lw      $t3,12($a1)
+    lw      $t4,16($a1)
+    lw      $t5,20($a1)
+    lw      $t6,24($a1)
+    lw      $t7,28($a1)
+    pref    0,128($a1)              # bring the next lines of src, addr 128
+
+    sw      $t0,0($a0)
+    sw      $t1,4($a0)
+    sw      $t2,8($a0)
+    sw      $t3,12($a0)
+    sw      $t4,16($a0)
+    sw      $t5,20($a0)
+    sw      $t6,24($a0)
+    sw      $t7,28($a0)
+
+    lw      $t0,32($a1)
+    bgtz    $v1,skip_pref30_128     # skip "pref 30,128(a0)"
+    lw      $t1,36($a1)             # delay slot: executes on both paths
+    pref    30,128($a0)             # continue setting up the dest, addr 128
+skip_pref30_128:
+    lw      $t2,40($a1)
+    lw      $t3,44($a1)
+    lw      $t4,48($a1)
+    lw      $t5,52($a1)
+    lw      $t6,56($a1)
+    lw      $t7,60($a1)
+    pref    0, 160($a1)             # bring the next lines of src, addr 160
+
+    sw      $t0,32($a0)
+    sw      $t1,36($a0)
+    sw      $t2,40($a0)
+    sw      $t3,44($a0)
+    sw      $t4,48($a0)
+    sw      $t5,52($a0)
+    sw      $t6,56($a0)
+    sw      $t7,60($a0)
+
+    addiu   $a0,$a0,64              # adding 64 to dest
+    sgtu    $v1,$a0,$t9             # recompute the "pref 30 unsafe" flag
+    bne     $a0,$a3,loop16w
+    addiu   $a1,$a1,64              # adding 64 to src (delay slot)
+    move    $a2,$t8                 # a2 = remaining byte count (<64)
+
+# Here we have src and dest word-aligned but less than 64-bytes to go
+
+chk8w:
+    pref    0, 0x0($a1)
+    andi    $t8,$a2,0x1f            # is there a 32-byte chunk?
+                                    # the t8 is the remainder count past 32-bytes
+    beq     $a2,$t8,chk1w           # when a2=t8, no 32-byte chunk
+    nop                             # delay slot
+
+    lw      $t0,0($a1)
+    lw      $t1,4($a1)
+    lw      $t2,8($a1)
+    lw      $t3,12($a1)
+    lw      $t4,16($a1)
+    lw      $t5,20($a1)
+    lw      $t6,24($a1)
+    lw      $t7,28($a1)
+    addiu   $a1,$a1,32
+
+    sw      $t0,0($a0)
+    sw      $t1,4($a0)
+    sw      $t2,8($a0)
+    sw      $t3,12($a0)
+    sw      $t4,16($a0)
+    sw      $t5,20($a0)
+    sw      $t6,24($a0)
+    sw      $t7,28($a0)
+    addiu   $a0,$a0,32
+
+chk1w:
+    andi    $a2,$t8,0x3             # now a2 is the remainder past 1w chunks
+    beq     $a2,$t8,last8           # no whole words left
+    subu    $a3,$t8,$a2             # a3 is count of bytes in 1w chunks (delay slot)
+    addu    $a3,$a0,$a3             # now a3 is the dst address past the 1w chunks
+
+# copying in words (4-byte chunks)
+wordCopy_loop:
+    lw      $t3,0($a1)              # the first t3 may be equal t0 ... optimize?
+    addiu   $a1,$a1,4
+    addiu   $a0,$a0,4
+    bne     $a0,$a3,wordCopy_loop
+    sw      $t3,-4($a0)             # delay slot: store for the word just loaded
+
+# For the last (<8) bytes
+last8:
+    blez    $a2,leave               # nothing left: return
+    addu    $a3,$a0,$a2             # a3 is the last dst address (delay slot)
+last8loop:
+    lb      $v1,0($a1)
+    addiu   $a1,$a1,1
+    addiu   $a0,$a0,1
+    bne     $a0,$a3,last8loop
+    sb      $v1,-1($a0)             # delay slot: store the byte just loaded
+
+leave:  j   $ra                     # v0 still holds the original dst
+    nop                             # delay slot
+
+#
+# UNALIGNED case
+#
+
+unaligned:
+    # got here with a3="negu a0"
+    andi    $a3,$a3,0x3             # test if the a0 is word aligned
+    beqz    $a3,ua_chk16w
+    subu    $a2,$a2,$a3             # bytes left after initial a3 bytes (delay slot)
+
+    lwr     $v1,0($a1)              # unaligned load of the leading partial word
+    lwl     $v1,3($a1)
+    addu    $a1,$a1,$a3             # a3 may be here 1, 2 or 3
+    swr     $v1,0($a0)
+    addu    $a0,$a0,$a3             # below the dst will be word aligned (NOTE1)
+
+ua_chk16w: andi $t8,$a2,0x3f        # any whole 64-byte chunks?
+                                    # t8 is the byte count after 64-byte chunks
+    beq     $a2,$t8,ua_chk8w        # if a2==t8, no 64-byte chunks
+                                    # There will be at most 1 32-byte chunk after it
+    subu    $a3,$a2,$t8             # subtract from a2 the remainder (delay slot)
+                                    # Here a3 counts bytes in 16w chunks
+    addu    $a3,$a0,$a3             # Now a3 is the final dst after 64-byte chunks
+
+    addu    $t0,$a0,$a2             # t0 is the "past the end" address
+
+    subu    $t9,$t0,160             # t9 is the "last safe pref 30,128(a0)" address
+
+    pref    0,0($a1)                # bring the first line of src, addr 0
+    pref    0,32($a1)               # bring the second line of src, addr 32
+    pref    0,64($a1)               # bring the third line of src, addr 64
+    pref    30,32($a0)              # safe, as we have at least 64 bytes ahead
+# In case the a0 > t9 don't use "pref 30" at all
+    sgtu    $v1,$a0,$t9
+    bgtz    $v1,ua_loop16w          # skip "pref 30,64(a0)" for too short arrays
+    nop                             # delay slot
+# otherwise, start with using pref30
+    pref    30,64($a0)
+ua_loop16w:
+    pref    0,96($a1)
+    lwr     $t0,0($a1)              # lwr/lwl pair assembles one unaligned word
+    lwl     $t0,3($a1)
+    lwr     $t1,4($a1)
+    bgtz    $v1,ua_skip_pref30_96   # skip "pref 30,96(a0)"
+    lwl     $t1,7($a1)              # delay slot: executes on both paths
+    pref    30,96($a0)              # continue setting up the dest, addr 96
+ua_skip_pref30_96:
+    lwr     $t2,8($a1)
+    lwl     $t2,11($a1)
+    lwr     $t3,12($a1)
+    lwl     $t3,15($a1)
+    lwr     $t4,16($a1)
+    lwl     $t4,19($a1)
+    lwr     $t5,20($a1)
+    lwl     $t5,23($a1)
+    lwr     $t6,24($a1)
+    lwl     $t6,27($a1)
+    lwr     $t7,28($a1)
+    lwl     $t7,31($a1)
+    pref    0,128($a1)              # bring the next lines of src, addr 128
+
+    sw      $t0,0($a0)
+    sw      $t1,4($a0)
+    sw      $t2,8($a0)
+    sw      $t3,12($a0)
+    sw      $t4,16($a0)
+    sw      $t5,20($a0)
+    sw      $t6,24($a0)
+    sw      $t7,28($a0)
+
+    lwr     $t0,32($a1)
+    lwl     $t0,35($a1)
+    lwr     $t1,36($a1)
+    bgtz    $v1,ua_skip_pref30_128  # skip "pref 30,128(a0)"
+    lwl     $t1,39($a1)             # delay slot: executes on both paths
+    pref    30,128($a0)             # continue setting up the dest, addr 128
+ua_skip_pref30_128:
+    lwr     $t2,40($a1)
+    lwl     $t2,43($a1)
+    lwr     $t3,44($a1)
+    lwl     $t3,47($a1)
+    lwr     $t4,48($a1)
+    lwl     $t4,51($a1)
+    lwr     $t5,52($a1)
+    lwl     $t5,55($a1)
+    lwr     $t6,56($a1)
+    lwl     $t6,59($a1)
+    lwr     $t7,60($a1)
+    lwl     $t7,63($a1)
+    pref    0, 160($a1)             # bring the next lines of src, addr 160
+
+    sw      $t0,32($a0)
+    sw      $t1,36($a0)
+    sw      $t2,40($a0)
+    sw      $t3,44($a0)
+    sw      $t4,48($a0)
+    sw      $t5,52($a0)
+    sw      $t6,56($a0)
+    sw      $t7,60($a0)
+
+    addiu   $a0,$a0,64              # adding 64 to dest
+    sgtu    $v1,$a0,$t9             # recompute the "pref 30 unsafe" flag
+    bne     $a0,$a3,ua_loop16w
+    addiu   $a1,$a1,64              # adding 64 to src (delay slot)
+    move    $a2,$t8                 # a2 = remaining byte count (<64)
+
+# Here we have src and dest word-aligned but less than 64-bytes to go
+
+ua_chk8w:
+    pref    0, 0x0($a1)
+    andi    $t8,$a2,0x1f            # is there a 32-byte chunk?
+                                    # the t8 is the remainder count
+    beq     $a2,$t8,ua_chk1w        # NOTE(review): lwr below runs in the delay slot even when taken -- confirm safe
+
+    lwr     $t0,0($a1)
+    lwl     $t0,3($a1)
+    lwr     $t1,4($a1)
+    lwl     $t1,7($a1)
+    lwr     $t2,8($a1)
+    lwl     $t2,11($a1)
+    lwr     $t3,12($a1)
+    lwl     $t3,15($a1)
+    lwr     $t4,16($a1)
+    lwl     $t4,19($a1)
+    lwr     $t5,20($a1)
+    lwl     $t5,23($a1)
+    lwr     $t6,24($a1)
+    lwl     $t6,27($a1)
+    lwr     $t7,28($a1)
+    lwl     $t7,31($a1)
+    addiu   $a1,$a1,32
+
+    sw      $t0,0($a0)
+    sw      $t1,4($a0)
+    sw      $t2,8($a0)
+    sw      $t3,12($a0)
+    sw      $t4,16($a0)
+    sw      $t5,20($a0)
+    sw      $t6,24($a0)
+    sw      $t7,28($a0)
+    addiu   $a0,$a0,32
+
+ua_chk1w:
+    andi    $a2,$t8,0x3             # now a2 is the remainder past 1w chunks
+    beq     $a2,$t8,ua_smallCopy    # no whole words left
+    subu    $a3,$t8,$a2             # a3 is count of bytes in 1w chunks (delay slot)
+    addu    $a3,$a0,$a3             # now a3 is the dst address past the 1w chunks
+
+# copying in words (4-byte chunks)
+ua_wordCopy_loop:
+    lwr     $v1,0($a1)              # unaligned word load via lwr/lwl pair
+    lwl     $v1,3($a1)
+    addiu   $a1,$a1,4
+    addiu   $a0,$a0,4               # note: dst=a0 is word aligned here, see NOTE1
+    bne     $a0,$a3,ua_wordCopy_loop
+    sw      $v1,-4($a0)             # delay slot: aligned store of the word
+
+# Now less than 4 bytes (value in a2) left to copy
+ua_smallCopy:
+    beqz    $a2,leave               # nothing left: return
+    addu    $a3,$a0,$a2             # a3 is the last dst address (delay slot)
+ua_smallCopy_loop:
+    lb      $v1,0($a1)
+    addiu   $a1,$a1,1
+    addiu   $a0,$a0,1
+    bne     $a0,$a3,ua_smallCopy_loop
+    sb      $v1,-1($a0)             # delay slot: store the byte just loaded
+
+    j       $ra                     # v0 still holds the original dst
+    nop                             # delay slot
+
+    .set    at
+    .set    reorder
+    .end    memcpy_MIPS;
+    .size   memcpy_MIPS,.-memcpy_MIPS
+
+#endif // if defined (__mips__)