Use "rm" constraint for stride in ScaleAddRows_SSE2 so clang on Mac does not run out of registers

BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/440001

git-svn-id: http://libyuv.googlecode.com/svn/trunk@204 16f28f9a-4ce2-e073-06de-1de4eb20be90
fbarchard@google.com 2012-03-06 20:18:27 +00:00
parent 965fb914ea
commit 9198f3754b
5 changed files with 16 additions and 47 deletions

View File

@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 203
+Version: 204
 License: BSD
 License File: LICENSE

View File

@@ -11,7 +11,7 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 203
+#define LIBYUV_VERSION 204
 #endif // INCLUDE_LIBYUV_VERSION_H_

View File

@@ -749,7 +749,6 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
 #endif
 #endif
 static void TransposeWx8_C(const uint8* src, int src_stride,
                            uint8* dst, int dst_stride,
                            int w) {
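
For context, TransposeWx8_C is the portable fallback for the SSE2/NEON transposes referenced above. A minimal sketch of what such a C fallback does (illustrative only, not the exact libyuv body; uint8_t stands in for libyuv's uint8):

    #include <stdint.h>

    // Illustrative C fallback: transpose a w x 8 tile by reading 8
    // source rows and writing each gathered column as one contiguous
    // dst row. Signature mirrors the hunk above.
    static void TransposeWx8_C(const uint8_t* src, int src_stride,
                               uint8_t* dst, int dst_stride,
                               int w) {
      for (int i = 0; i < w; ++i) {
        for (int j = 0; j < 8; ++j) {
          dst[i * dst_stride + j] = src[j * src_stride + i];
        }
      }
    }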

View File

@@ -198,10 +198,8 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
   asm volatile (
     // compute where to start writing destination
     "add %1, %2 \n"
     // work on segments that are multiples of 16
     "lsrs r3, %2, #4 \n"
     // the output is written in two block. 8 bytes followed
     // by another 8. reading is done sequentially, from left to
     // right. writing is done from right to left in block sizes
@@ -209,31 +207,26 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
     // the first of the two blocks. need to subtract that 8 off
     // along with 16 to get the next location.
     "mov r3, #-24 \n"
     "beq 2f \n"
     // back of destination by the size of the register that is
-    // going to be mirrord
+    // going to be mirrored
     "sub %1, #16 \n"
     // the loop needs to run on blocks of 16. what will be left
     // over is either a negative number, the residuals that need
     // to be done, or 0. if this isn't subtracted off here the
     // loop will run one extra time.
     "sub %2, #16 \n"
+    // mirror the bytes in the 64 bit segments. unable to mirror
+    // the bytes in the entire 128 bits in one go.
+    // because of the inability to mirror the entire 128 bits
+    // mirror the writing out of the two 64 bit segments.
   "1: \n"
     "vld1.8 {q0}, [%0]! \n" // src += 16
-    // mirror the bytes in the 64 bit segments. unable to mirror
-    // the bytes in the entire 128 bits in one go.
     "vrev64.8 q0, q0 \n"
-    // because of the inability to mirror the entire 128 bits
-    // mirror the writing out of the two 64 bit segments.
     "vst1.8 {d1}, [%1]! \n"
     "vst1.8 {d0}, [%1], r3 \n" // dst -= 16
     "subs %2, #16 \n"
     "bge 1b \n"
@@ -241,13 +234,9 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
     // residuals so jump past
     "adds %2, #16 \n"
     "beq 5f \n"
     "add %1, #16 \n"
   "2: \n"
     "mov r3, #-3 \n"
     "sub %1, #2 \n"
     "subs %2, #2 \n"
     // check for 16*n+1 scenarios where segments_of_2 should not
@@ -256,24 +245,20 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
     // do this in neon registers as per
     // http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/
   "3: \n"
     "vld2.8 {d0[0], d1[0]}, [%0]! \n" // src += 2
     "vst1.8 {d1[0]}, [%1]! \n"
     "vst1.8 {d0[0]}, [%1], r3 \n" // dst -= 2
     "subs %2, #2 \n"
     "bge 3b \n"
     "adds %2, #2 \n"
     "beq 5f \n"
   "4: \n"
     "add %1, #1 \n"
     "vld1.8 {d0[0]}, [%0] \n"
     "vst1.8 {d0[0]}, [%1] \n"
   "5: \n"
   : "+r"(src), // %0
     "+r"(dst), // %1
     "+r"(width) // %2
@@ -289,37 +274,29 @@ void MirrorRowUV_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width) {
     // compute where to start writing destination
     "add %1, %3 \n" // dst_a + width
     "add %2, %3 \n" // dst_b + width
     // work on input segments that are multiples of 16, but
     // width that has been passed is output segments, half
     // the size of input.
     "lsrs r12, %3, #3 \n"
     "beq 2f \n"
     // the output is written in to two blocks.
     "mov r12, #-8 \n"
     // back of destination by the size of the register that is
     // going to be mirrord
     "sub %1, #8 \n"
     "sub %2, #8 \n"
     // the loop needs to run on blocks of 8. what will be left
     // over is either a negative number, the residuals that need
     // to be done, or 0. if this isn't subtracted off here the
     // loop will run one extra time.
     "sub %3, #8 \n"
+    // mirror the bytes in the 64 bit segments
   "1: \n"
     "vld2.8 {d0, d1}, [%0]! \n" // src += 16
-    // mirror the bytes in the 64 bit segments
     "vrev64.8 q0, q0 \n"
     "vst1.8 {d0}, [%1], r12 \n" // dst_a -= 8
     "vst1.8 {d1}, [%2], r12 \n" // dst_b -= 8
     "subs %3, #8 \n"
     "bge 1b \n"
@@ -327,26 +304,19 @@ void MirrorRowUV_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width) {
     // residuals so return
     "adds %3, #8 \n"
     "beq 4f \n"
     "add %1, #8 \n"
     "add %2, #8 \n"
   "2: \n"
     "mov r12, #-1 \n"
     "sub %1, #1 \n"
     "sub %2, #1 \n"
   "3: \n"
     "vld2.8 {d0[0], d1[0]}, [%0]! \n" // src += 2
     "vst1.8 {d0[0]}, [%1], r12 \n" // dst_a -= 1
     "vst1.8 {d1[0]}, [%2], r12 \n" // dst_b -= 1
     "subs %3, %3, #1 \n"
     "bgt 3b \n"
   "4: \n"
   : "+r"(src), // %0
     "+r"(dst_a), // %1
     "+r"(dst_b), // %2

View File

@@ -1731,7 +1731,7 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
     "+r"(dst_ptr), // %1
     "+r"(tmp_height), // %2
     "+r"(tmp_src), // %3
-    "+r"(tmp_src_stride), // %4
+    "+rm"(tmp_src_stride), // %4
     "+rm"(src_width), // %5
     "+rm"(src_height) // %6
   :
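
The functional change is the constraint on %4: "+r" forces tmp_src_stride into a register, while "+rm" also allows a memory operand, so 32-bit clang on Mac (where PIC and frame-pointer conventions leave fewer free general-purpose registers) does not run out of registers allocating the asm's operands. A minimal standalone illustration of the same constraint (hypothetical example, not libyuv code):

    #include <stdint.h>

    // "rm" lets the compiler pass `stride` in a register or leave it in
    // memory, easing register pressure; "r" alone would demand a register.
    static void add_stride(uintptr_t* ptr, intptr_t stride) {
      uintptr_t p = *ptr;
      __asm__ volatile (
        "add %1, %0 \n"  // AT&T operand order: p += stride
        : "+r"(p)        // %0: must be a register
        : "rm"(stride)   // %1: register or memory operand
        : "cc"
      );
      *ptr = p;
    }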