Use the "rm" asm constraint for the stride operand in ScaleAddRows so that clang on Mac does not run out of registers

BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/440001

git-svn-id: http://libyuv.googlecode.com/svn/trunk@204 16f28f9a-4ce2-e073-06de-1de4eb20be90
This commit is contained in:
fbarchard@google.com 2012-03-06 20:18:27 +00:00
parent 965fb914ea
commit 9198f3754b
5 changed files with 16 additions and 47 deletions

View File

@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 203
Version: 204
License: BSD
License File: LICENSE

View File

@ -11,7 +11,7 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 203
#define LIBYUV_VERSION 204
#endif // INCLUDE_LIBYUV_VERSION_H_

View File

@ -749,7 +749,6 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
#endif
#endif
static void TransposeWx8_C(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int w) {

View File

@ -198,10 +198,8 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
asm volatile (
// compute where to start writing destination
"add %1, %2 \n"
// work on segments that are multiples of 16
"lsrs r3, %2, #4 \n"
// the output is written in two blocks. 8 bytes followed
// by another 8. reading is done sequentially, from left to
// right. writing is done from right to left in block sizes
@ -209,31 +207,26 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
// the first of the two blocks. need to subtract that 8 off
// along with 16 to get the next location.
"mov r3, #-24 \n"
"beq 2f \n"
// back of destination by the size of the register that is
// going to be mirrord
// going to be mirrored
"sub %1, #16 \n"
// the loop needs to run on blocks of 16. what will be left
// over is either a negative number, the residuals that need
// to be done, or 0. if this isn't subtracted off here the
// loop will run one extra time.
"sub %2, #16 \n"
// mirror the bytes in the 64 bit segments. unable to mirror
// the bytes in the entire 128 bits in one go.
// because of the inability to mirror the entire 128 bits
// mirror the writing out of the two 64 bit segments.
"1: \n"
"vld1.8 {q0}, [%0]! \n" // src += 16
// mirror the bytes in the 64 bit segments. unable to mirror
// the bytes in the entire 128 bits in one go.
"vrev64.8 q0, q0 \n"
// because of the inability to mirror the entire 128 bits
// mirror the writing out of the two 64 bit segments.
"vst1.8 {d1}, [%1]! \n"
"vst1.8 {d0}, [%1], r3 \n" // dst -= 16
"subs %2, #16 \n"
"bge 1b \n"
@ -241,13 +234,9 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
// residuals so jump past
"adds %2, #16 \n"
"beq 5f \n"
"add %1, #16 \n"
"2: \n"
"2: \n"
"mov r3, #-3 \n"
"sub %1, #2 \n"
"subs %2, #2 \n"
// check for 16*n+1 scenarios where segments_of_2 should not
@ -256,24 +245,20 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
// do this in neon registers as per
// http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/
"3: \n"
"3: \n"
"vld2.8 {d0[0], d1[0]}, [%0]! \n" // src += 2
"vst1.8 {d1[0]}, [%1]! \n"
"vst1.8 {d0[0]}, [%1], r3 \n" // dst -= 2
"subs %2, #2 \n"
"bge 3b \n"
"adds %2, #2 \n"
"beq 5f \n"
"4: \n"
"4: \n"
"add %1, #1 \n"
"vld1.8 {d0[0]}, [%0] \n"
"vst1.8 {d0[0]}, [%1] \n"
"5: \n"
"5: \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@ -289,37 +274,29 @@ void MirrorRowUV_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width) {
// compute where to start writing destination
"add %1, %3 \n" // dst_a + width
"add %2, %3 \n" // dst_b + width
// work on input segments that are multiples of 16, but
// width that has been passed is output segments, half
// the size of input.
"lsrs r12, %3, #3 \n"
"beq 2f \n"
// the output is written in to two blocks.
"mov r12, #-8 \n"
// back off destination by the size of the register that is
// going to be mirrored
"sub %1, #8 \n"
"sub %2, #8 \n"
// the loop needs to run on blocks of 8. what will be left
// over is either a negative number, the residuals that need
// to be done, or 0. if this isn't subtracted off here the
// loop will run one extra time.
"sub %3, #8 \n"
// mirror the bytes in the 64 bit segments
"1: \n"
"vld2.8 {d0, d1}, [%0]! \n" // src += 16
// mirror the bytes in the 64 bit segments
"vrev64.8 q0, q0 \n"
"vst1.8 {d0}, [%1], r12 \n" // dst_a -= 8
"vst1.8 {d1}, [%2], r12 \n" // dst_b -= 8
"subs %3, #8 \n"
"bge 1b \n"
@ -327,26 +304,19 @@ void MirrorRowUV_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width) {
// residuals so return
"adds %3, #8 \n"
"beq 4f \n"
"add %1, #8 \n"
"add %2, #8 \n"
"2: \n"
"2: \n"
"mov r12, #-1 \n"
"sub %1, #1 \n"
"sub %2, #1 \n"
"3: \n"
"3: \n"
"vld2.8 {d0[0], d1[0]}, [%0]! \n" // src += 2
"vst1.8 {d0[0]}, [%1], r12 \n" // dst_a -= 1
"vst1.8 {d1[0]}, [%2], r12 \n" // dst_b -= 1
"subs %3, %3, #1 \n"
"bgt 3b \n"
"4: \n"
"4: \n"
: "+r"(src), // %0
"+r"(dst_a), // %1
"+r"(dst_b), // %2

View File

@ -1731,7 +1731,7 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
"+r"(dst_ptr), // %1
"+r"(tmp_height), // %2
"+r"(tmp_src), // %3
"+r"(tmp_src_stride), // %4
"+rm"(tmp_src_stride), // %4
"+rm"(src_width), // %5
"+rm"(src_height) // %6
: