From 9198f3754b63ad447a7bfdcd429bc8680b1f0b9c Mon Sep 17 00:00:00 2001 From: "fbarchard@google.com" Date: Tue, 6 Mar 2012 20:18:27 +0000 Subject: [PATCH] rm for stride in addrows for clang on mac to not run out of registers BUG=none TEST=none Review URL: https://webrtc-codereview.appspot.com/440001 git-svn-id: http://libyuv.googlecode.com/svn/trunk@204 16f28f9a-4ce2-e073-06de-1de4eb20be90 --- README.chromium | 2 +- include/libyuv/version.h | 2 +- source/rotate.cc | 1 - source/row_neon.cc | 56 ++++++++++------------------------------ source/scale.cc | 2 +- 5 files changed, 16 insertions(+), 47 deletions(-) diff --git a/README.chromium b/README.chromium index c54a85c6c..fd7a3b071 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 203 +Version: 204 License: BSD License File: LICENSE diff --git a/include/libyuv/version.h b/include/libyuv/version.h index e069731c8..be0e1da03 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,7 +11,7 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 203 +#define LIBYUV_VERSION 204 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/rotate.cc b/source/rotate.cc index a10313614..ad1078774 100644 --- a/source/rotate.cc +++ b/source/rotate.cc @@ -749,7 +749,6 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride, #endif #endif - static void TransposeWx8_C(const uint8* src, int src_stride, uint8* dst, int dst_stride, int w) { diff --git a/source/row_neon.cc b/source/row_neon.cc index bd88eae93..00b2aa6ab 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -198,10 +198,8 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { asm volatile ( // compute where to start writing destination "add %1, %2 \n" - // work on segments that are multiples of 16 "lsrs r3, %2, #4 \n" - // the output is written in two block. 8 bytes followed // by another 8. reading is done sequentially, from left to // right. writing is done from right to left in block sizes @@ -209,31 +207,26 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { // the first of the two blocks. need to subtract that 8 off // along with 16 to get the next location. "mov r3, #-24 \n" - "beq 2f \n" // back of destination by the size of the register that is - // going to be mirrord + // going to be mirrored "sub %1, #16 \n" - // the loop needs to run on blocks of 16. what will be left // over is either a negative number, the residuals that need // to be done, or 0. if this isn't subtracted off here the // loop will run one extra time. "sub %2, #16 \n" + // mirror the bytes in the 64 bit segments. unable to mirror + // the bytes in the entire 128 bits in one go. + // because of the inability to mirror the entire 128 bits + // mirror the writing out of the two 64 bit segments. "1: \n" "vld1.8 {q0}, [%0]! \n" // src += 16 - - // mirror the bytes in the 64 bit segments. unable to mirror - // the bytes in the entire 128 bits in one go. "vrev64.8 q0, q0 \n" - - // because of the inability to mirror the entire 128 bits - // mirror the writing out of the two 64 bit segments. "vst1.8 {d1}, [%1]! \n" "vst1.8 {d0}, [%1], r3 \n" // dst -= 16 - "subs %2, #16 \n" "bge 1b \n" @@ -241,13 +234,9 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { // residuals so jump past "adds %2, #16 \n" "beq 5f \n" - "add %1, #16 \n" - - "2: \n" - + "2: \n" "mov r3, #-3 \n" - "sub %1, #2 \n" "subs %2, #2 \n" // check for 16*n+1 scenarios where segments_of_2 should not @@ -256,24 +245,20 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { // do this in neon registers as per // http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/ - "3: \n" + "3: \n" "vld2.8 {d0[0], d1[0]}, [%0]! \n" // src += 2 - "vst1.8 {d1[0]}, [%1]! \n" "vst1.8 {d0[0]}, [%1], r3 \n" // dst -= 2 - "subs %2, #2 \n" "bge 3b \n" "adds %2, #2 \n" "beq 5f \n" - - "4: \n" + "4: \n" "add %1, #1 \n" "vld1.8 {d0[0]}, [%0] \n" "vst1.8 {d0[0]}, [%1] \n" - - "5: \n" + "5: \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -289,37 +274,29 @@ void MirrorRowUV_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width) { // compute where to start writing destination "add %1, %3 \n" // dst_a + width "add %2, %3 \n" // dst_b + width - // work on input segments that are multiples of 16, but // width that has been passed is output segments, half // the size of input. "lsrs r12, %3, #3 \n" - "beq 2f \n" - // the output is written in to two blocks. "mov r12, #-8 \n" - // back of destination by the size of the register that is // going to be mirrord "sub %1, #8 \n" "sub %2, #8 \n" - // the loop needs to run on blocks of 8. what will be left // over is either a negative number, the residuals that need // to be done, or 0. if this isn't subtracted off here the // loop will run one extra time. "sub %3, #8 \n" + // mirror the bytes in the 64 bit segments "1: \n" "vld2.8 {d0, d1}, [%0]! \n" // src += 16 - - // mirror the bytes in the 64 bit segments "vrev64.8 q0, q0 \n" - "vst1.8 {d0}, [%1], r12 \n" // dst_a -= 8 "vst1.8 {d1}, [%2], r12 \n" // dst_b -= 8 - "subs %3, #8 \n" "bge 1b \n" @@ -327,26 +304,19 @@ void MirrorRowUV_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width) { // residuals so return "adds %3, #8 \n" "beq 4f \n" - "add %1, #8 \n" "add %2, #8 \n" - - "2: \n" - + "2: \n" "mov r12, #-1 \n" - "sub %1, #1 \n" "sub %2, #1 \n" - - "3: \n" + "3: \n" "vld2.8 {d0[0], d1[0]}, [%0]! \n" // src += 2 - "vst1.8 {d0[0]}, [%1], r12 \n" // dst_a -= 1 "vst1.8 {d1[0]}, [%2], r12 \n" // dst_b -= 1 - "subs %3, %3, #1 \n" "bgt 3b \n" - "4: \n" + "4: \n" : "+r"(src), // %0 "+r"(dst_a), // %1 "+r"(dst_b), // %2 diff --git a/source/scale.cc b/source/scale.cc index 44ba9378d..5f8f05a95 100644 --- a/source/scale.cc +++ b/source/scale.cc @@ -1731,7 +1731,7 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride, "+r"(dst_ptr), // %1 "+r"(tmp_height), // %2 "+r"(tmp_src), // %3 - "+r"(tmp_src_stride), // %4 + "+rm"(tmp_src_stride), // %4 "+rm"(src_width), // %5 "+rm"(src_height) // %6 :