diff --git a/libyuv.gyp b/libyuv.gyp
index 305faef52..0fd33b35a 100644
--- a/libyuv.gyp
+++ b/libyuv.gyp
@@ -75,6 +75,7 @@
         'source/convert_from_argb.cc',
         'source/cpu_id.cc',
         'source/format_conversion.cc',
+        'source/memcpy_mips.S',
         'source/mjpeg_decoder.cc',
         'source/planar_functions.cc',
         'source/rotate.cc',
diff --git a/source/memcpy_mips.S b/source/memcpy_mips.S
new file mode 100644
index 000000000..83292d2f9
--- /dev/null
+++ b/source/memcpy_mips.S
@@ -0,0 +1,346 @@
+#if defined (__mips__)
+
+    .globl  memcpy_MIPS;
+    .align  2;
+    .type   memcpy_MIPS,@function;
+    .ent    memcpy_MIPS,0;
+memcpy_MIPS:                        # void *memcpy_MIPS(dst=a0, src=a1, n=a2)
+    .frame  $sp,0,$ra
+    .set    noreorder               # branch delay slots are filled by hand below
+    .set    noat
+
+    slti    $at,$a2,8
+    bne     $at,$zero,last8         # fewer than 8 bytes: plain byte copy
+    move    $v0,$a0                 # memcpy returns the dst pointer (delay slot)
+
+# Test if the src and dst are word-aligned, or can be made word-aligned
+    xor     $t8,$a1,$a0
+    andi    $t8,$t8,0x3             # t8 is a0/a1 word-displacement
+
+    bne     $t8,$zero,unaligned     # displaced: can never be mutually word-aligned
+    negu    $a3,$a0                 # a3 = -a0 (delay slot)
+
+    andi    $a3,$a3,0x3             # we need to copy a3 bytes to make a0/a1 aligned
+    beq     $a3,$zero,chk16w        # when a3=0 then the dst (a0) is word-aligned
+    subu    $a2,$a2,$a3             # now a2 is the remaining bytes count (delay slot)
+
+    lwr     $t8,0($a1)              # copy the 1..3 leading bytes
+    addu    $a1,$a1,$a3
+    swr     $t8,0($a0)
+    addu    $a0,$a0,$a3
+
+# Now the dst/src are mutually word-aligned with word-aligned addresses
+chk16w: andi    $t8,$a2,0x3f        # any whole 64-byte chunks?
+                                    # t8 is the byte count after 64-byte chunks
+
+    beq     $a2,$t8,chk8w           # if a2==t8, no 64-byte chunks
+                                    # There will be at most 1 32-byte chunk after it
+    subu    $a3,$a2,$t8             # subtract from a2 the remainder (delay slot)
+                                    # Here a3 counts bytes in 16w chunks
+    addu    $a3,$a0,$a3             # Now a3 is the final dst after 64-byte chunks
+
+    addu    $t0,$a0,$a2             # t0 is the "past the end" address
+
+# When in the loop we exercise "pref 30,x(a0)", the a0+x should not be past
+# the "t0-32" address
+# This means: for x=128 the last "safe" a0 address is "t0-160"
+# Alternatively, for x=64 the last "safe" a0 address is "t0-96"
+# In the current version we will use "pref 30,128(a0)", so "t0-160" is the limit
+    subu    $t9,$t0,160             # t9 is the "last safe pref 30,128(a0)" address
+
+    pref    0,0($a1)                # bring the first line of src, addr 0
+    pref    0,32($a1)               # bring the second line of src, addr 32
+    pref    0,64($a1)               # bring the third line of src, addr 64
+    pref    30,32($a0)              # safe, as we have at least 64 bytes ahead
+# In case the a0 > t9 don't use "pref 30" at all
+    sgtu    $v1,$a0,$t9
+    bgtz    $v1,loop16w             # skip "pref 30,64(a0)" for too short arrays
+    nop                             # delay slot
+# otherwise, start with using pref30
+    pref    30,64($a0)
+loop16w:
+    pref    0,96($a1)
+    lw      $t0,0($a1)
+    bgtz    $v1,skip_pref30_96      # skip "pref 30,96(a0)"
+    lw      $t1,4($a1)              # delay slot: executes on both paths
+    pref    30,96($a0)              # continue setting up the dest, addr 96
+skip_pref30_96:
+    lw      $t2,8($a1)
+    lw      $t3,12($a1)
+    lw      $t4,16($a1)
+    lw      $t5,20($a1)
+    lw      $t6,24($a1)
+    lw      $t7,28($a1)
+    pref    0,128($a1)              # bring the next lines of src, addr 128
+
+    sw      $t0,0($a0)
+    sw      $t1,4($a0)
+    sw      $t2,8($a0)
+    sw      $t3,12($a0)
+    sw      $t4,16($a0)
+    sw      $t5,20($a0)
+    sw      $t6,24($a0)
+    sw      $t7,28($a0)
+
+    lw      $t0,32($a1)
+    bgtz    $v1,skip_pref30_128     # skip "pref 30,128(a0)"
+    lw      $t1,36($a1)             # delay slot: executes on both paths
+    pref    30,128($a0)             # continue setting up the dest, addr 128
+skip_pref30_128:
+    lw      $t2,40($a1)
+    lw      $t3,44($a1)
+    lw      $t4,48($a1)
+    lw      $t5,52($a1)
+    lw      $t6,56($a1)
+    lw      $t7,60($a1)
+    pref    0, 160($a1)             # bring the next lines of src, addr 160
+
+    sw      $t0,32($a0)
+    sw      $t1,36($a0)
+    sw      $t2,40($a0)
+    sw      $t3,44($a0)
+    sw      $t4,48($a0)
+    sw      $t5,52($a0)
+    sw      $t6,56($a0)
+    sw      $t7,60($a0)
+
+    addiu   $a0,$a0,64              # adding 64 to dest
+    sgtu    $v1,$a0,$t9             # recompute the "pref 30 unsafe" flag
+    bne     $a0,$a3,loop16w
+    addiu   $a1,$a1,64              # adding 64 to src (delay slot)
+    move    $a2,$t8                 # a2 = remaining byte count (<64)
+
+# Here we have src and dest word-aligned but less than 64-bytes to go
+
+chk8w:
+    pref    0, 0x0($a1)
+    andi    $t8,$a2,0x1f            # is there a 32-byte chunk?
+                                    # the t8 is the remainder count past 32-bytes
+    beq     $a2,$t8,chk1w           # when a2=t8, no 32-byte chunk
+    nop                             # delay slot
+
+    lw      $t0,0($a1)
+    lw      $t1,4($a1)
+    lw      $t2,8($a1)
+    lw      $t3,12($a1)
+    lw      $t4,16($a1)
+    lw      $t5,20($a1)
+    lw      $t6,24($a1)
+    lw      $t7,28($a1)
+    addiu   $a1,$a1,32
+
+    sw      $t0,0($a0)
+    sw      $t1,4($a0)
+    sw      $t2,8($a0)
+    sw      $t3,12($a0)
+    sw      $t4,16($a0)
+    sw      $t5,20($a0)
+    sw      $t6,24($a0)
+    sw      $t7,28($a0)
+    addiu   $a0,$a0,32
+
+chk1w:
+    andi    $a2,$t8,0x3             # now a2 is the remainder past 1w chunks
+    beq     $a2,$t8,last8           # no whole words left
+    subu    $a3,$t8,$a2             # a3 is count of bytes in 1w chunks (delay slot)
+    addu    $a3,$a0,$a3             # now a3 is the dst address past the 1w chunks
+
+# copying in words (4-byte chunks)
+wordCopy_loop:
+    lw      $t3,0($a1)              # the first t3 may be equal t0 ... optimize?
+    addiu   $a1,$a1,4
+    addiu   $a0,$a0,4
+    bne     $a0,$a3,wordCopy_loop
+    sw      $t3,-4($a0)             # delay slot: store for the word just loaded
+
+# For the last (<8) bytes
+last8:
+    blez    $a2,leave               # nothing left: return
+    addu    $a3,$a0,$a2             # a3 is the last dst address (delay slot)
+last8loop:
+    lb      $v1,0($a1)
+    addiu   $a1,$a1,1
+    addiu   $a0,$a0,1
+    bne     $a0,$a3,last8loop
+    sb      $v1,-1($a0)             # delay slot: store the byte just loaded
+
+leave:  j   $ra                     # v0 still holds the original dst
+    nop                             # delay slot
+
+#
+# UNALIGNED case
+#
+
+unaligned:
+    # got here with a3="negu a0"
+    andi    $a3,$a3,0x3             # test if the a0 is word aligned
+    beqz    $a3,ua_chk16w
+    subu    $a2,$a2,$a3             # bytes left after initial a3 bytes (delay slot)
+
+    lwr     $v1,0($a1)              # unaligned load of the leading partial word
+    lwl     $v1,3($a1)
+    addu    $a1,$a1,$a3             # a3 may be here 1, 2 or 3
+    swr     $v1,0($a0)
+    addu    $a0,$a0,$a3             # below the dst will be word aligned (NOTE1)
+
+ua_chk16w: andi $t8,$a2,0x3f        # any whole 64-byte chunks?
+                                    # t8 is the byte count after 64-byte chunks
+    beq     $a2,$t8,ua_chk8w        # if a2==t8, no 64-byte chunks
+                                    # There will be at most 1 32-byte chunk after it
+    subu    $a3,$a2,$t8             # subtract from a2 the remainder (delay slot)
+                                    # Here a3 counts bytes in 16w chunks
+    addu    $a3,$a0,$a3             # Now a3 is the final dst after 64-byte chunks
+
+    addu    $t0,$a0,$a2             # t0 is the "past the end" address
+
+    subu    $t9,$t0,160             # t9 is the "last safe pref 30,128(a0)" address
+
+    pref    0,0($a1)                # bring the first line of src, addr 0
+    pref    0,32($a1)               # bring the second line of src, addr 32
+    pref    0,64($a1)               # bring the third line of src, addr 64
+    pref    30,32($a0)              # safe, as we have at least 64 bytes ahead
+# In case the a0 > t9 don't use "pref 30" at all
+    sgtu    $v1,$a0,$t9
+    bgtz    $v1,ua_loop16w          # skip "pref 30,64(a0)" for too short arrays
+    nop                             # delay slot
+# otherwise, start with using pref30
+    pref    30,64($a0)
+ua_loop16w:
+    pref    0,96($a1)
+    lwr     $t0,0($a1)              # lwr/lwl pair assembles one unaligned word
+    lwl     $t0,3($a1)
+    lwr     $t1,4($a1)
+    bgtz    $v1,ua_skip_pref30_96   # skip "pref 30,96(a0)"
+    lwl     $t1,7($a1)              # delay slot: executes on both paths
+    pref    30,96($a0)              # continue setting up the dest, addr 96
+ua_skip_pref30_96:
+    lwr     $t2,8($a1)
+    lwl     $t2,11($a1)
+    lwr     $t3,12($a1)
+    lwl     $t3,15($a1)
+    lwr     $t4,16($a1)
+    lwl     $t4,19($a1)
+    lwr     $t5,20($a1)
+    lwl     $t5,23($a1)
+    lwr     $t6,24($a1)
+    lwl     $t6,27($a1)
+    lwr     $t7,28($a1)
+    lwl     $t7,31($a1)
+    pref    0,128($a1)              # bring the next lines of src, addr 128
+
+    sw      $t0,0($a0)
+    sw      $t1,4($a0)
+    sw      $t2,8($a0)
+    sw      $t3,12($a0)
+    sw      $t4,16($a0)
+    sw      $t5,20($a0)
+    sw      $t6,24($a0)
+    sw      $t7,28($a0)
+
+    lwr     $t0,32($a1)
+    lwl     $t0,35($a1)
+    lwr     $t1,36($a1)
+    bgtz    $v1,ua_skip_pref30_128  # skip "pref 30,128(a0)"
+    lwl     $t1,39($a1)             # delay slot: executes on both paths
+    pref    30,128($a0)             # continue setting up the dest, addr 128
+ua_skip_pref30_128:
+    lwr     $t2,40($a1)
+    lwl     $t2,43($a1)
+    lwr     $t3,44($a1)
+    lwl     $t3,47($a1)
+    lwr     $t4,48($a1)
+    lwl     $t4,51($a1)
+    lwr     $t5,52($a1)
+    lwl     $t5,55($a1)
+    lwr     $t6,56($a1)
+    lwl     $t6,59($a1)
+    lwr     $t7,60($a1)
+    lwl     $t7,63($a1)
+    pref    0, 160($a1)             # bring the next lines of src, addr 160
+
+    sw      $t0,32($a0)
+    sw      $t1,36($a0)
+    sw      $t2,40($a0)
+    sw      $t3,44($a0)
+    sw      $t4,48($a0)
+    sw      $t5,52($a0)
+    sw      $t6,56($a0)
+    sw      $t7,60($a0)
+
+    addiu   $a0,$a0,64              # adding 64 to dest
+    sgtu    $v1,$a0,$t9             # recompute the "pref 30 unsafe" flag
+    bne     $a0,$a3,ua_loop16w
+    addiu   $a1,$a1,64              # adding 64 to src (delay slot)
+    move    $a2,$t8                 # a2 = remaining byte count (<64)
+
+# Here we have src and dest word-aligned but less than 64-bytes to go
+
+ua_chk8w:
+    pref    0, 0x0($a1)
+    andi    $t8,$a2,0x1f            # is there a 32-byte chunk?
+                                    # the t8 is the remainder count
+    beq     $a2,$t8,ua_chk1w        # NOTE(review): lwr below runs in the delay slot even when taken -- confirm safe
+
+    lwr     $t0,0($a1)
+    lwl     $t0,3($a1)
+    lwr     $t1,4($a1)
+    lwl     $t1,7($a1)
+    lwr     $t2,8($a1)
+    lwl     $t2,11($a1)
+    lwr     $t3,12($a1)
+    lwl     $t3,15($a1)
+    lwr     $t4,16($a1)
+    lwl     $t4,19($a1)
+    lwr     $t5,20($a1)
+    lwl     $t5,23($a1)
+    lwr     $t6,24($a1)
+    lwl     $t6,27($a1)
+    lwr     $t7,28($a1)
+    lwl     $t7,31($a1)
+    addiu   $a1,$a1,32
+
+    sw      $t0,0($a0)
+    sw      $t1,4($a0)
+    sw      $t2,8($a0)
+    sw      $t3,12($a0)
+    sw      $t4,16($a0)
+    sw      $t5,20($a0)
+    sw      $t6,24($a0)
+    sw      $t7,28($a0)
+    addiu   $a0,$a0,32
+
+ua_chk1w:
+    andi    $a2,$t8,0x3             # now a2 is the remainder past 1w chunks
+    beq     $a2,$t8,ua_smallCopy    # no whole words left
+    subu    $a3,$t8,$a2             # a3 is count of bytes in 1w chunks (delay slot)
+    addu    $a3,$a0,$a3             # now a3 is the dst address past the 1w chunks
+
+# copying in words (4-byte chunks)
+ua_wordCopy_loop:
+    lwr     $v1,0($a1)              # unaligned word load via lwr/lwl pair
+    lwl     $v1,3($a1)
+    addiu   $a1,$a1,4
+    addiu   $a0,$a0,4               # note: dst=a0 is word aligned here, see NOTE1
+    bne     $a0,$a3,ua_wordCopy_loop
+    sw      $v1,-4($a0)             # delay slot: aligned store of the word
+
+# Now less than 4 bytes (value in a2) left to copy
+ua_smallCopy:
+    beqz    $a2,leave               # nothing left: return
+    addu    $a3,$a0,$a2             # a3 is the last dst address (delay slot)
+ua_smallCopy_loop:
+    lb      $v1,0($a1)
+    addiu   $a1,$a1,1
+    addiu   $a0,$a0,1
+    bne     $a0,$a3,ua_smallCopy_loop
+    sb      $v1,-1($a0)             # delay slot: store the byte just loaded
+
+    j       $ra                     # v0 still holds the original dst
+    nop                             # delay slot
+
+    .set    at
+    .set    reorder
+    .end    memcpy_MIPS;
+    .size   memcpy_MIPS,.-memcpy_MIPS
+
+#endif // if defined (__mips__)