mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-06 16:56:55 +08:00
mips memcpy
BUG=126 TEST=local test by mips Review URL: https://webrtc-codereview.appspot.com/932006 git-svn-id: http://libyuv.googlecode.com/svn/trunk@448 16f28f9a-4ce2-e073-06de-1de4eb20be90
This commit is contained in:
parent
1dafd444ba
commit
1f399dfaf8
@ -75,6 +75,7 @@
|
||||
'source/convert_from_argb.cc',
|
||||
'source/cpu_id.cc',
|
||||
'source/format_conversion.cc',
|
||||
'source/memcpy_mips.S',
|
||||
'source/mjpeg_decoder.cc',
|
||||
'source/planar_functions.cc',
|
||||
'source/rotate.cc',
|
||||
|
||||
346
source/memcpy_mips.S
Normal file
346
source/memcpy_mips.S
Normal file
@ -0,0 +1,346 @@
|
||||
#if defined (__mips__)

# memcpy_MIPS: copy a2 bytes from the buffer at a1 to the buffer at a0.
#
#   Entry:     a0 = dst, a1 = src, a2 = byte count (treated as unsigned)
#   Exit:      v0 = dst as passed in
#   Clobbers:  at, v1, a0-a3, t0-t9; the stack is not touched
#
# Register names follow the o32 convention (t4-t7 are used), so this is
# presumably meant for o32 builds only - TODO confirm against the build files.
#
# Strategy
#   - Counts below 8 go straight to the byte loop.
#   - When dst and src share the same offset within a word, dst is first
#     advanced to a word boundary and the bulk is moved with lw/sw.
#   - Otherwise only dst is word aligned and every source word is
#     gathered with an unaligned load pair before a plain sw.
#   - Bulk copying runs in 64-byte blocks, then one optional 32-byte
#     block, then single words, then trailing bytes.
#
# The partial-word instructions depend on byte order. The HI forms touch
# the bytes from the given address up to the end of its aligned word; the
# LO form, given address+3, supplies the rest from the following word.

#if defined(__MIPSEB__) || defined(__MIPSEB) || defined(_MIPSEB)
#define LWHI	lwl
#define LWLO	lwr
#define SWHI	swl
#else
#define LWHI	lwr
#define LWLO	lwl
#define SWHI	swr
#endif

	.globl	memcpy_MIPS;
	.align	2;
	.type	memcpy_MIPS,@function;
	.ent	memcpy_MIPS,0;
memcpy_MIPS:
	.frame	$sp,0,$ra
	.set	noreorder		# branch delay slots below are filled by hand
	.set	noat			# at is used explicitly as a scratch register

	sltiu	$at,$a2,8		# unsigned compare: the count is a size
	bne	$at,$zero,last8		# fewer than 8 bytes: byte loop only
	move	$v0,$a0			# (delay slot) return value is the original dst

	# Can src and dst be brought to a word boundary together?
	xor	$t8,$a1,$a0
	andi	$t8,$t8,0x3		# t8 != 0 means the low two address bits differ

	bne	$t8,$zero,unaligned
	negu	$a3,$a0			# (delay slot) low bits of -dst = bytes to boundary

	andi	$a3,$a3,0x3		# a3 = bytes needed to word-align dst (and src)
	beq	$a3,$zero,chk16w	# a3 == 0: dst is already word aligned
	subu	$a2,$a2,$a3		# (delay slot) a2 = remaining byte count

	LWHI	$t8,0($a1)		# move the a3 head bytes with one partial load/store
	addu	$a1,$a1,$a3
	SWHI	$t8,0($a0)
	addu	$a0,$a0,$a3

	# Both pointers are word aligned from here on.
chk16w:	andi	$t8,$a2,0x3f		# t8 = byte count left after whole 64-byte blocks
	beq	$a2,$t8,chk8w		# a2 == t8: no whole 64-byte block
	subu	$a3,$a2,$t8		# (delay slot) a3 = bytes covered by 64-byte blocks
	addu	$a3,$a0,$a3		# a3 = dst address at which the 64-byte loop stops

	addu	$t0,$a0,$a2		# t0 = one past the end of dst

	# A pref 30 at offset x must not reach beyond t0-32. The largest
	# offset used in the loop is 128, so dst may be at most t0-160.
	# (With offset 64 the limit would be t0-96.)
	subu	$t9,$t0,160		# t9 = highest dst for which pref 30,128(a0) is allowed

	pref	0,0($a1)		# bring in the first line of src, addr 0
	pref	0,32($a1)		# bring in the second line of src, addr 32
	pref	0,64($a1)		# bring in the third line of src, addr 64
	pref	30,32($a0)		# safe: at least 64 bytes of dst lie ahead
	sgtu	$v1,$a0,$t9		# v1 = 1 when dst is past t9: no more pref 30
	bgtz	$v1,loop16w		# too short for pref 30,64(a0)
	nop
	pref	30,64($a0)
loop16w:
	pref	0,96($a1)
	lw	$t0,0($a1)
	bgtz	$v1,skip_pref30_96	# skip pref 30,96(a0) near the end of dst
	lw	$t1,4($a1)		# (delay slot)
	pref	30,96($a0)		# continue setting up the dest, addr 96
skip_pref30_96:
	lw	$t2,8($a1)
	lw	$t3,12($a1)
	lw	$t4,16($a1)
	lw	$t5,20($a1)
	lw	$t6,24($a1)
	lw	$t7,28($a1)
	pref	0,128($a1)		# bring in the next lines of src, addr 128

	sw	$t0,0($a0)
	sw	$t1,4($a0)
	sw	$t2,8($a0)
	sw	$t3,12($a0)
	sw	$t4,16($a0)
	sw	$t5,20($a0)
	sw	$t6,24($a0)
	sw	$t7,28($a0)

	lw	$t0,32($a1)
	bgtz	$v1,skip_pref30_128	# skip pref 30,128(a0) near the end of dst
	lw	$t1,36($a1)		# (delay slot)
	pref	30,128($a0)		# continue setting up the dest, addr 128
skip_pref30_128:
	lw	$t2,40($a1)
	lw	$t3,44($a1)
	lw	$t4,48($a1)
	lw	$t5,52($a1)
	lw	$t6,56($a1)
	lw	$t7,60($a1)
	pref	0,160($a1)		# bring in the next lines of src, addr 160

	sw	$t0,32($a0)
	sw	$t1,36($a0)
	sw	$t2,40($a0)
	sw	$t3,44($a0)
	sw	$t4,48($a0)
	sw	$t5,52($a0)
	sw	$t6,56($a0)
	sw	$t7,60($a0)

	addiu	$a0,$a0,64		# advance dst by one block
	sgtu	$v1,$a0,$t9		# refresh the pref 30 guard for the next pass
	bne	$a0,$a3,loop16w
	addiu	$a1,$a1,64		# (delay slot) advance src by one block
	move	$a2,$t8			# a2 = bytes left after the 64-byte blocks

	# src and dst are word aligned and fewer than 64 bytes remain.
chk8w:
	pref	0,0x0($a1)
	andi	$t8,$a2,0x1f		# t8 = remainder after an optional 32-byte block
	beq	$a2,$t8,chk1w		# a2 == t8: no 32-byte block
	nop

	lw	$t0,0($a1)
	lw	$t1,4($a1)
	lw	$t2,8($a1)
	lw	$t3,12($a1)
	lw	$t4,16($a1)
	lw	$t5,20($a1)
	lw	$t6,24($a1)
	lw	$t7,28($a1)
	addiu	$a1,$a1,32

	sw	$t0,0($a0)
	sw	$t1,4($a0)
	sw	$t2,8($a0)
	sw	$t3,12($a0)
	sw	$t4,16($a0)
	sw	$t5,20($a0)
	sw	$t6,24($a0)
	sw	$t7,28($a0)
	addiu	$a0,$a0,32

chk1w:
	andi	$a2,$t8,0x3		# a2 = remainder after whole words
	beq	$a2,$t8,last8		# no whole word left
	subu	$a3,$t8,$a2		# (delay slot) a3 = byte count held in whole words
	addu	$a3,$a0,$a3		# a3 = dst address just past the whole words

	# Copy single words.
wordCopy_loop:
	lw	$t3,0($a1)
	addiu	$a1,$a1,4
	addiu	$a0,$a0,4
	bne	$a0,$a3,wordCopy_loop
	sw	$t3,-4($a0)		# (delay slot) store to the slot just stepped over

	# Trailing bytes: a2 is below 8 here.
last8:
	beqz	$a2,leave
	addu	$a3,$a0,$a2		# (delay slot) a3 = one past the last dst byte
last8loop:
	lb	$v1,0($a1)
	addiu	$a1,$a1,1
	addiu	$a0,$a0,1
	bne	$a0,$a3,last8loop
	sb	$v1,-1($a0)		# (delay slot)

leave:	jr	$ra
	nop

#
# UNALIGNED case: src and dst have different offsets within a word.
#

unaligned:
	# Entered with a3 = -a0 (computed in the branch delay slot).
	andi	$a3,$a3,0x3		# a3 = bytes needed to word-align dst
	beqz	$a3,ua_chk16w		# dst already word aligned
	subu	$a2,$a2,$a3		# (delay slot) bytes left after the a3 head bytes

	LWHI	$v1,0($a1)		# gather one unaligned source word
	LWLO	$v1,3($a1)
	addu	$a1,$a1,$a3		# a3 is 1, 2 or 3 here
	SWHI	$v1,0($a0)		# store only the bytes up to the dst word boundary
	addu	$a0,$a0,$a3		# dst is word aligned from here on (NOTE1)

ua_chk16w:	andi	$t8,$a2,0x3f	# t8 = byte count left after whole 64-byte blocks
	beq	$a2,$t8,ua_chk8w	# a2 == t8: no whole 64-byte block
	subu	$a3,$a2,$t8		# (delay slot) a3 = bytes covered by 64-byte blocks
	addu	$a3,$a0,$a3		# a3 = dst address at which the 64-byte loop stops

	addu	$t0,$a0,$a2		# t0 = one past the end of dst

	subu	$t9,$t0,160		# t9 = highest dst for which pref 30,128(a0) is allowed

	pref	0,0($a1)		# bring in the first line of src, addr 0
	pref	0,32($a1)		# bring in the second line of src, addr 32
	pref	0,64($a1)		# bring in the third line of src, addr 64
	pref	30,32($a0)		# safe: at least 64 bytes of dst lie ahead
	sgtu	$v1,$a0,$t9		# v1 = 1 when dst is past t9: no more pref 30
	bgtz	$v1,ua_loop16w		# too short for pref 30,64(a0)
	nop
	pref	30,64($a0)
ua_loop16w:
	pref	0,96($a1)
	LWHI	$t0,0($a1)
	LWLO	$t0,3($a1)
	LWHI	$t1,4($a1)
	bgtz	$v1,ua_skip_pref30_96
	LWLO	$t1,7($a1)		# (delay slot)
	pref	30,96($a0)		# continue setting up the dest, addr 96
ua_skip_pref30_96:
	LWHI	$t2,8($a1)
	LWLO	$t2,11($a1)
	LWHI	$t3,12($a1)
	LWLO	$t3,15($a1)
	LWHI	$t4,16($a1)
	LWLO	$t4,19($a1)
	LWHI	$t5,20($a1)
	LWLO	$t5,23($a1)
	LWHI	$t6,24($a1)
	LWLO	$t6,27($a1)
	LWHI	$t7,28($a1)
	LWLO	$t7,31($a1)
	pref	0,128($a1)		# bring in the next lines of src, addr 128

	sw	$t0,0($a0)
	sw	$t1,4($a0)
	sw	$t2,8($a0)
	sw	$t3,12($a0)
	sw	$t4,16($a0)
	sw	$t5,20($a0)
	sw	$t6,24($a0)
	sw	$t7,28($a0)

	LWHI	$t0,32($a1)
	LWLO	$t0,35($a1)
	LWHI	$t1,36($a1)
	bgtz	$v1,ua_skip_pref30_128
	LWLO	$t1,39($a1)		# (delay slot)
	pref	30,128($a0)		# continue setting up the dest, addr 128
ua_skip_pref30_128:
	LWHI	$t2,40($a1)
	LWLO	$t2,43($a1)
	LWHI	$t3,44($a1)
	LWLO	$t3,47($a1)
	LWHI	$t4,48($a1)
	LWLO	$t4,51($a1)
	LWHI	$t5,52($a1)
	LWLO	$t5,55($a1)
	LWHI	$t6,56($a1)
	LWLO	$t6,59($a1)
	LWHI	$t7,60($a1)
	LWLO	$t7,63($a1)
	pref	0,160($a1)		# bring in the next lines of src, addr 160

	sw	$t0,32($a0)
	sw	$t1,36($a0)
	sw	$t2,40($a0)
	sw	$t3,44($a0)
	sw	$t4,48($a0)
	sw	$t5,52($a0)
	sw	$t6,56($a0)
	sw	$t7,60($a0)

	addiu	$a0,$a0,64		# advance dst by one block
	sgtu	$v1,$a0,$t9		# refresh the pref 30 guard for the next pass
	bne	$a0,$a3,ua_loop16w
	addiu	$a1,$a1,64		# (delay slot) advance src by one block
	move	$a2,$t8			# a2 = bytes left after the 64-byte blocks

	# dst is word aligned, src is not; fewer than 64 bytes remain.
ua_chk8w:
	pref	0,0x0($a1)
	andi	$t8,$a2,0x1f		# t8 = remainder after an optional 32-byte block
	beq	$a2,$t8,ua_chk1w	# a2 == t8: no 32-byte block
	LWHI	$t0,0($a1)		# (delay slot) always runs; t0 is unused if the branch is taken

	LWLO	$t0,3($a1)
	LWHI	$t1,4($a1)
	LWLO	$t1,7($a1)
	LWHI	$t2,8($a1)
	LWLO	$t2,11($a1)
	LWHI	$t3,12($a1)
	LWLO	$t3,15($a1)
	LWHI	$t4,16($a1)
	LWLO	$t4,19($a1)
	LWHI	$t5,20($a1)
	LWLO	$t5,23($a1)
	LWHI	$t6,24($a1)
	LWLO	$t6,27($a1)
	LWHI	$t7,28($a1)
	LWLO	$t7,31($a1)
	addiu	$a1,$a1,32

	sw	$t0,0($a0)
	sw	$t1,4($a0)
	sw	$t2,8($a0)
	sw	$t3,12($a0)
	sw	$t4,16($a0)
	sw	$t5,20($a0)
	sw	$t6,24($a0)
	sw	$t7,28($a0)
	addiu	$a0,$a0,32

ua_chk1w:
	andi	$a2,$t8,0x3		# a2 = remainder after whole words
	beq	$a2,$t8,ua_smallCopy	# no whole word left
	subu	$a3,$t8,$a2		# (delay slot) a3 = byte count held in whole words
	addu	$a3,$a0,$a3		# a3 = dst address just past the whole words

	# Copy single words.
ua_wordCopy_loop:
	LWHI	$v1,0($a1)
	LWLO	$v1,3($a1)
	addiu	$a1,$a1,4
	addiu	$a0,$a0,4		# dst is word aligned here, see NOTE1
	bne	$a0,$a3,ua_wordCopy_loop
	sw	$v1,-4($a0)		# (delay slot) store to the slot just stepped over

	# Fewer than 4 bytes (count in a2) are left to copy.
ua_smallCopy:
	beqz	$a2,leave
	addu	$a3,$a0,$a2		# (delay slot) a3 = one past the last dst byte
ua_smallCopy_loop:
	lb	$v1,0($a1)
	addiu	$a1,$a1,1
	addiu	$a0,$a0,1
	bne	$a0,$a3,ua_smallCopy_loop
	sb	$v1,-1($a0)		# (delay slot)

	jr	$ra
	nop

	.set	at
	.set	reorder
	.end	memcpy_MIPS;
	.size	memcpy_MIPS,.-memcpy_MIPS

#undef LWHI
#undef LWLO
#undef SWHI

#endif // if defined (__mips__)
|
||||
Loading…
x
Reference in New Issue
Block a user