From 9198f3754b63ad447a7bfdcd429bc8680b1f0b9c Mon Sep 17 00:00:00 2001
From: "fbarchard@google.com"
 <fbarchard@google.com@16f28f9a-4ce2-e073-06de-1de4eb20be90>
Date: Tue, 6 Mar 2012 20:18:27 +0000
Subject: [PATCH] Use "+rm" constraint for stride in ScaleAddRows_SSE2 so
 clang on mac does not run out of registers BUG=none TEST=none Review URL:
 https://webrtc-codereview.appspot.com/440001

git-svn-id: http://libyuv.googlecode.com/svn/trunk@204 16f28f9a-4ce2-e073-06de-1de4eb20be90
---
 README.chromium          |  2 +-
 include/libyuv/version.h |  2 +-
 source/rotate.cc         |  1 -
 source/row_neon.cc       | 56 ++++++++++------------------------------
 source/scale.cc          |  2 +-
 5 files changed, 16 insertions(+), 47 deletions(-)

diff --git a/README.chromium b/README.chromium
index c54a85c6c..fd7a3b071 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 203
+Version: 204
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index e069731c8..be0e1da03 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,7 +11,7 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 203
+#define LIBYUV_VERSION 204
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_
 
diff --git a/source/rotate.cc b/source/rotate.cc
index a10313614..ad1078774 100644
--- a/source/rotate.cc
+++ b/source/rotate.cc
@@ -749,7 +749,6 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
 #endif
 #endif
 
-
 static void TransposeWx8_C(const uint8* src, int src_stride,
                            uint8* dst, int dst_stride,
                            int w) {
diff --git a/source/row_neon.cc b/source/row_neon.cc
index bd88eae93..00b2aa6ab 100644
--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@@ -198,10 +198,8 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
   asm volatile (
     // compute where to start writing destination
     "add         %1, %2                        \n"
-
     // work on segments that are multiples of 16
     "lsrs        r3, %2, #4                    \n"
-
     // the output is written in two block.  8 bytes followed
     // by another 8.  reading is done sequentially, from left to
     // right.  writing is done from right to left in block sizes
@@ -209,31 +207,26 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
     // the first of the two blocks.  need to subtract that 8 off
     // along with 16 to get the next location.
     "mov         r3, #-24                      \n"
-
     "beq         2f                            \n"
 
     // back of destination by the size of the register that is
-    // going to be mirrord
+    // going to be mirrored
     "sub         %1, #16                       \n"
-
     // the loop needs to run on blocks of 16.  what will be left
     // over is either a negative number, the residuals that need
     // to be done, or 0.  if this isn't subtracted off here the
     // loop will run one extra time.
     "sub         %2, #16                       \n"
 
+    // mirror the bytes within each 64 bit half.  the entire 128
+    // bits cannot be mirrored in one go.
+    // to make up for that, the two 64 bit halves are written out
+    // in swapped order (d1 first, then d0).
     "1:                                        \n"
       "vld1.8      {q0}, [%0]!                 \n"  // src += 16
-
-        // mirror the bytes in the 64 bit segments.  unable to mirror
-        // the bytes in the entire 128 bits in one go.
       "vrev64.8    q0, q0                      \n"
-
-        // because of the inability to mirror the entire 128 bits
-        // mirror the writing out of the two 64 bit segments.
       "vst1.8      {d1}, [%1]!                 \n"
       "vst1.8      {d0}, [%1], r3              \n"  // dst -= 16
-
       "subs        %2, #16                     \n"
     "bge         1b                            \n"
 
@@ -241,13 +234,9 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
     // residuals so jump past
     "adds        %2, #16                       \n"
     "beq         5f                            \n"
-
     "add         %1, #16                       \n"
-
-    "2:                                        \n"
-
+  "2:                                          \n"
     "mov         r3, #-3                       \n"
-
     "sub         %1, #2                        \n"
     "subs        %2, #2                        \n"
     // check for 16*n+1 scenarios where segments_of_2 should not
@@ -256,24 +245,20 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
 
 // do this in neon registers as per
 // http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/
-    "3:                                        \n"
+  "3:                                          \n"
     "vld2.8      {d0[0], d1[0]}, [%0]!         \n"  // src += 2
-
     "vst1.8      {d1[0]}, [%1]!                \n"
     "vst1.8      {d0[0]}, [%1], r3             \n"  // dst -= 2
-
     "subs        %2, #2                        \n"
     "bge         3b                            \n"
 
     "adds        %2, #2                        \n"
     "beq         5f                            \n"
-
-    "4:                                        \n"
+  "4:                                          \n"
     "add         %1, #1                        \n"
     "vld1.8      {d0[0]}, [%0]                 \n"
     "vst1.8      {d0[0]}, [%1]                 \n"
-
-    "5:                                        \n"
+  "5:                                          \n"
     : "+r"(src),   // %0
       "+r"(dst),   // %1
       "+r"(width)  // %2
@@ -289,37 +274,29 @@ void MirrorRowUV_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width) {
     // compute where to start writing destination
     "add         %1, %3                        \n"  // dst_a + width
     "add         %2, %3                        \n"  // dst_b + width
-
     // work on input segments that are multiples of 16, but
     // width that has been passed is output segments, half
     // the size of input.
     "lsrs        r12, %3, #3                   \n"
-
     "beq         2f                            \n"
-
     // the output is written in to two blocks.
     "mov         r12, #-8                      \n"
-
     // back of destination by the size of the register that is
     // going to be mirrord
     "sub         %1, #8                        \n"
     "sub         %2, #8                        \n"
-
     // the loop needs to run on blocks of 8.  what will be left
     // over is either a negative number, the residuals that need
     // to be done, or 0.  if this isn't subtracted off here the
     // loop will run one extra time.
     "sub         %3, #8                        \n"
 
+    // mirror the bytes in the 64 bit segments
     "1:                                        \n"
       "vld2.8      {d0, d1}, [%0]!             \n"  // src += 16
-
-      // mirror the bytes in the 64 bit segments
       "vrev64.8    q0, q0                      \n"
-
       "vst1.8      {d0}, [%1], r12             \n"  // dst_a -= 8
       "vst1.8      {d1}, [%2], r12             \n"  // dst_b -= 8
-
       "subs        %3, #8                      \n"
       "bge         1b                          \n"
 
@@ -327,26 +304,19 @@ void MirrorRowUV_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width) {
     // residuals so return
     "adds        %3, #8                        \n"
     "beq         4f                            \n"
-
     "add         %1, #8                        \n"
     "add         %2, #8                        \n"
-
-    "2:                                        \n"
-
+  "2:                                          \n"
     "mov         r12, #-1                      \n"
-
     "sub         %1, #1                        \n"
     "sub         %2, #1                        \n"
-
-    "3:                                        \n"
+  "3:                                          \n"
       "vld2.8      {d0[0], d1[0]}, [%0]!       \n"  // src += 2
-
       "vst1.8      {d0[0]}, [%1], r12          \n"  // dst_a -= 1
       "vst1.8      {d1[0]}, [%2], r12          \n"  // dst_b -= 1
-
       "subs        %3, %3, #1                  \n"
       "bgt         3b                          \n"
-    "4:                                        \n"
+  "4:                                          \n"
     : "+r"(src),    // %0
       "+r"(dst_a),  // %1
       "+r"(dst_b),  // %2
diff --git a/source/scale.cc b/source/scale.cc
index 44ba9378d..5f8f05a95 100644
--- a/source/scale.cc
+++ b/source/scale.cc
@@ -1731,7 +1731,7 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
     "+r"(dst_ptr),     // %1
     "+r"(tmp_height),  // %2
     "+r"(tmp_src),     // %3
-    "+r"(tmp_src_stride), // %4
+    "+rm"(tmp_src_stride), // %4
     "+rm"(src_width),  // %5
     "+rm"(src_height)  // %6
   :