I420 and NV12 rotate functions.

Consolidate rotate files. Add unit tests for I420 and NV12 rotate functions. Fix remaining pitch/stride references. Review URL: http://webrtc-codereview.appspot.com/239001 git-svn-id: http://libyuv.googlecode.com/svn/trunk@32 16f28f9a-4ce2-e073-06de-1de4eb20be90
2025-12-07 01:06:46 +08:00 · 2011-10-19 17:52:15 +00:00 · 2011-10-19 17:52:15 +00:00 · 3de12ae1c6
commit 3de12ae1c6
parent a1280730c2
12 changed files with 1809 additions and 884 deletions
--- a/include/libyuv/general.h
+++ b/include/libyuv/general.h
@ -20,14 +20,6 @@
 namespace libyuv {
 // Supported rotation modes.  The value of each enumerator is the rotation
 // angle in degrees, so callers may pass the angle cast to RotationMode.
 enum RotationMode {
  kRotateNone = 0,                // leave the frame as is (plain copy)
  kRotateClockwise = 90,          // rotate by 90 degrees
  kRotateCounterClockwise = 270,  // rotate by 270 degrees
  kRotate180 = 180,               // rotate by 180 degrees
 };
 // I420 mirror
 int
 I420Mirror(const uint8* src_yplane, int src_ystride,
@ -50,17 +42,6 @@ I420Crop(uint8* frame,
         int src_width, int src_height,
         int dst_width, int dst_height);
 // Rotate an I420 frame by the angle selected with |mode|.
 //   src_yplane/src_uplane/src_vplane  source Y, U and V planes
 //   src_ystride/src_ustride/src_vstride  distance between source rows
 //                                        (presumably in bytes -- confirm)
 //   dst_*plane / dst_*stride          destination planes and their strides
 //   width, height                     dimensions of the source frame
 //   mode                              one of the RotationMode values
 // Returns 0 on success (or the result of I420Copy for kRotateNone) and
 // -1 when |mode| is not a supported RotationMode.
 int
 I420Rotate(const uint8* src_yplane, int src_ystride,
            const uint8* src_uplane, int src_ustride,
            const uint8* src_vplane, int src_vstride,
            uint8* dst_yplane, int dst_ystride,
            uint8* dst_uplane, int dst_ustride,
            uint8* dst_vplane, int dst_vstride,
            int width, int height,
            RotationMode mode);
 } // namespace libyuv
 #endif // INCLUDE_LIBYUV_GENERAL_H_
--- a/include/libyuv/rotate.h
+++ b/include/libyuv/rotate.h
@ -0,0 +1,50 @@
 /*
 *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #ifndef INCLUDE_LIBYUV_ROTATE_H_
 #define INCLUDE_LIBYUV_ROTATE_H_
 #include "libyuv/basic_types.h"
 namespace libyuv {
 // Supported rotation modes.  The value of each enumerator is the rotation
 // angle in degrees.
 enum RotationMode {
  kRotateNone = 0,                // no rotation; the frame is copied
  kRotateClockwise = 90,          // rotate by 90 degrees
  kRotateCounterClockwise = 270,  // rotate by 270 degrees
  kRotate180 = 180,               // rotate by 180 degrees
 };
 // Rotate an I420 frame (three separate Y, U and V planes).
 //   src_y/src_u/src_v and src_stride_*  source planes and row strides
 //   dst_y/dst_u/dst_v and dst_stride_*  destination planes and row strides
 //   width, height   dimensions of the source luma plane; the chroma planes
 //                   are processed at ((width + 1) / 2, (height + 1) / 2).
 //                   A negative height means the source image is inverted
 //                   (read bottom-up).
 //   mode            rotation to apply.
 // Returns 0 on success (the result of I420Copy for kRotateNone) and -1 if
 // |mode| is not one of the RotationMode values.
 int
 I420Rotate(const uint8* src_y, int src_stride_y,
            const uint8* src_u, int src_stride_u,
            const uint8* src_v, int src_stride_v,
            uint8* dst_y, int dst_stride_y,
            uint8* dst_u, int dst_stride_u,
            uint8* dst_v, int dst_stride_v,
            int width, int height,
            RotationMode mode);
 // Split a NV12 input buffer (a Y plane followed by one plane of
 // interleaved U,V byte pairs) into separate Y, U and V buffers and rotate
 // them while doing so.
 //   src_y, src_stride_y    source luma plane and stride
 //   src_uv, src_stride_uv  source interleaved chroma plane and stride
 //   dst_* / dst_stride_*   destination I420 planes and strides
 //   width, height          dimensions of the source luma plane; a negative
 //                          height means the source image is inverted
 //   mode                   rotation to apply.
 // Returns 0 on success (the result of NV12ToI420 for kRotateNone) and -1
 // if |mode| is not one of the RotationMode values.
 int
 NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
                  const uint8* src_uv, int src_stride_uv,
                  uint8* dst_y, int dst_stride_y,
                  uint8* dst_u, int dst_stride_u,
                  uint8* dst_v, int dst_stride_v,
                  int width, int height,
                  RotationMode mode);
 }  // namespace libyuv
 #endif  // INCLUDE_LIBYUV_ROTATE_H_
--- a/libyuv.gyp
+++ b/libyuv.gyp
@ -44,7 +44,6 @@
        'source/general.cc',
        'source/planar_functions.cc',
        'source/rotate.cc',
        'source/rotate_deinterleave.cc',
        'source/row_table.cc',
        'source/scale.cc',
        'source/video_common.cc',
--- a/source/general.cc
+++ b/source/general.cc
@ -13,7 +13,6 @@
 #include <string.h>     // memcpy(), memset()
 #include "libyuv/planar_functions.h"
 #include "rotate.h"
 namespace libyuv {
@ -282,66 +281,4 @@ I420CropPad(const uint8* src_frame, int src_width,
  return 0;
 }
 // Rotate an I420 frame by the angle selected with |mode|.
 //
 // |width| and |height| are the dimensions of the source luma plane.  The
 // U and V planes of an I420 frame are subsampled by two in both
 // directions, so they are rotated with the rounded-up half dimensions
 // rather than with the luma dimensions.
 //
 // Returns the result of I420Copy for kRotateNone, 0 for the three real
 // rotations and -1 when |mode| is not a supported RotationMode.
 int
 I420Rotate(const uint8* src_yplane, int src_ystride,
            const uint8* src_uplane, int src_ustride,
            const uint8* src_vplane, int src_vstride,
            uint8* dst_yplane, int dst_ystride,
            uint8* dst_uplane, int dst_ustride,
            uint8* dst_vplane, int dst_vstride,
            int width, int height,
            RotationMode mode) {
  // Chroma plane dimensions; round up so odd sized frames keep their
  // last chroma column/row.
  const int halfwidth = (width + 1) >> 1;
  const int halfheight = (height + 1) >> 1;

  switch (mode) {
    case kRotateNone:
      // No rotation requested: copy the frame.  I420Copy takes the luma
      // dimensions and derives the chroma size itself.
      return I420Copy(src_yplane, src_ystride,
                      src_uplane, src_ustride,
                      src_vplane, src_vstride,
                      dst_yplane, dst_ystride,
                      dst_uplane, dst_ustride,
                      dst_vplane, dst_vstride,
                      width, height);
    case kRotateClockwise:
      Rotate90(src_yplane, src_ystride,
               dst_yplane, dst_ystride,
               width, height);
      Rotate90(src_uplane, src_ustride,
               dst_uplane, dst_ustride,
               halfwidth, halfheight);
      Rotate90(src_vplane, src_vstride,
               dst_vplane, dst_vstride,
               halfwidth, halfheight);
      return 0;
    case kRotateCounterClockwise:
      Rotate270(src_yplane, src_ystride,
                dst_yplane, dst_ystride,
                width, height);
      Rotate270(src_uplane, src_ustride,
                dst_uplane, dst_ustride,
                halfwidth, halfheight);
      Rotate270(src_vplane, src_vstride,
                dst_vplane, dst_vstride,
                halfwidth, halfheight);
      return 0;
    case kRotate180:
      Rotate180(src_yplane, src_ystride,
                dst_yplane, dst_ystride,
                width, height);
      Rotate180(src_uplane, src_ustride,
                dst_uplane, dst_ustride,
                halfwidth, halfheight);
      Rotate180(src_vplane, src_vstride,
                dst_vplane, dst_vstride,
                halfwidth, halfheight);
      return 0;
    default:
      break;
  }
  // Unknown rotation mode.
  return -1;
 }
 } // namespace libyuv
--- a/source/rotate.cc
+++ b/source/rotate.cc
@ -8,107 +8,135 @@
 *  be found in the AUTHORS file in the root of the source tree.
 */
-#include "rotate.h"
+#include "libyuv/planar_functions.h"
 #include "libyuv/rotate.h"
 #include "rotate_priv.h"
 namespace libyuv {
 typedef void (*reverse_uv_func)(const uint8*, uint8*, uint8*, int);
 typedef void (*reverse_func)(const uint8*, uint8*, int);
-typedef void (*rotate_wx8func)(const uint8*, int, uint8*, int, int);
+typedef void (*rotate_uv_wx8_func)(const uint8*, int,
-typedef void (*rotate_wxhfunc)(const uint8*, int, uint8*, int, int, int);
+                                   uint8*, int,
                                   uint8*, int, int);
 typedef void (*rotate_uv_wxh_func)(const uint8*, int,
                                   uint8*, int,
                                   uint8*, int, int, int);
 typedef void (*rotate_wx8_func)(const uint8*, int, uint8*, int, int);
 typedef void (*rotate_wxh_func)(const uint8*, int, uint8*, int, int, int);
 #ifdef __ARM_NEON__
 extern "C" {
 void RestoreRegisters_NEON(unsigned long long *restore);
 void SaveRegisters_NEON(unsigned long long *store);
 void ReverseLine_NEON(const uint8* src, uint8* dst, int width);
-void Transpose_wx8_NEON(const uint8* src, int src_stride,
+void ReverseLineUV_NEON(const uint8* src,
-                        uint8* dst, int dst_stride, int width);
+                        uint8* dst_a, uint8* dst_b,
                        int width);
 void TransposeWx8_NEON(const uint8* src, int src_stride,
                       uint8* dst, int dst_stride, int width);
 void TransposeUVWx8_NEON(const uint8* src, int src_stride,
                         uint8* dst_a, int dst_stride_a,
                         uint8* dst_b, int dst_stride_b,
                         int width);
 }  // extern "C"
 #endif
-static void Transpose_wx8_C(const uint8* src, int src_stride,
+static void TransposeWx8_C(const uint8* src, int src_stride,
-                            uint8* dst, int dst_stride,
+                           uint8* dst, int dst_stride,
-                            int w) {
+                           int w) {
  int i, j;
  for (i = 0; i < w; ++i)
    for (j = 0; j < 8; ++j)
      dst[i * dst_stride + j] = src[j * src_stride + i];
 }
-static void Transpose_wxh_C(const uint8* src, int src_stride,
+static void TransposeWxH_C(const uint8* src, int src_stride,
-                            uint8* dst, int dst_stride,
+                           uint8* dst, int dst_stride,
-                            int width, int height) {
+                           int width, int height) {
  int i, j;
  for (i = 0; i < width; ++i)
    for (j = 0; j < height; ++j)
      dst[i * dst_stride + j] = src[j * src_stride + i];
 }
-void Transpose(const uint8* src, int src_stride,
+void TransposePlane(const uint8* src, int src_stride,
-               uint8* dst, int dst_stride,
+                    uint8* dst, int dst_stride,
-               int width, int height) {
+                    int width, int height) {
  int i = height;
-  rotate_wx8func Transpose_wx8;
+  rotate_wx8_func TransposeWx8;
-  rotate_wxhfunc Transpose_wxh;
+  rotate_wxh_func TransposeWxH;
  // do processor detection here.
 #ifdef __ARM_NEON__
-  Transpose_wx8 = Transpose_wx8_NEON;
+  TransposeWx8 = TransposeWx8_NEON;
-  Transpose_wxh = Transpose_wxh_C;
+  TransposeWxH = TransposeWxH_C;
 #else
-  Transpose_wx8 = Transpose_wx8_C;
+  TransposeWx8 = TransposeWx8_C;
-  Transpose_wxh = Transpose_wxh_C;
+  TransposeWxH = TransposeWxH_C;
 #endif
  // work across the source in 8x8 tiles
-  do {
+  while (i >= 8) {
-    Transpose_wx8(src, src_stride, dst, dst_stride, width);
+    TransposeWx8(src, src_stride, dst, dst_stride, width);
-    src += 8 * src_stride;
+    src += 8 * src_stride;    // go down 8 rows
-    dst += 8;
+    dst += 8;                 // move over 8 columns
    i   -= 8;
-  } while (i >= 8);
+  }
-// TODO(frkoenig): Have wx4 and maybe wx2
+  TransposeWxH(src, src_stride, dst, dst_stride, width, i);
  Transpose_wxh(src, src_stride, dst, dst_stride, width, i);
 }
-void Rotate90(const uint8* src, int src_stride,
+void RotatePlane90(const uint8* src, int src_stride,
-              uint8* dst, int dst_stride,
+                   uint8* dst, int dst_stride,
-              int width, int height) {
+                   int width, int height) {
-  src += src_stride*(height-1);
+  // Rotate by 90 is a transpose with the source read
  // from bottom to top.  So set the source pointer to the end
  // of the buffer and flip the sign of the source stride.
  src += src_stride * (height - 1);
  src_stride = -src_stride;
-  Transpose(src, src_stride, dst, dst_stride, width, height);
+  TransposePlane(src, src_stride, dst, dst_stride, width, height);
 }
-void Rotate270(const uint8* src, int src_stride,
+void RotatePlane270(const uint8* src, int src_stride,
-               uint8* dst, int dst_stride,
+                    uint8* dst, int dst_stride,
-               int width, int height) {
+                    int width, int height) {
-  dst += dst_stride*(width-1);
+  // Rotate by 270 is a transpose with the destination written
  // from bottom to top.  So set the destination pointer to the end
  // of the buffer and flip the sign of the destination stride.
  dst += dst_stride * (width - 1);
  dst_stride = -dst_stride;
-  Transpose(src, src_stride, dst, dst_stride, width, height);
+  TransposePlane(src, src_stride, dst, dst_stride, width, height);
 }
 void ReverseLine_C(const uint8* src, uint8* dst, int width) {
  int i;
-  for (i = 0; i < width; ++i)
+  src += width;
-    dst[width-1 - i] = src[i];
+  for (i = 0; i < width; ++i) {
    --src;
    dst[i] = src[0];
  }
 }
-void Rotate180(const uint8* src, int src_stride,
+void RotatePlane180(const uint8* src, int src_stride,
-               uint8* dst, int dst_stride,
+                    uint8* dst, int dst_stride,
-               int width, int height) {
+                    int width, int height) {
  int i;
  reverse_func ReverseLine;
-  // do processor detection here.
+  // TODO(frkoenig): do processor detection here.
 #ifdef __ARM_NEON__
  ReverseLine = ReverseLine_NEON;
 #else
  ReverseLine = ReverseLine_C;
 #endif
-  dst += dst_stride*(height-1);
+  // Rotate by 180 is a mirror with the destination
  // written in reverse.
  dst += dst_stride * (height - 1);
  for (i = 0; i < height; ++i) {
    ReverseLine(src, dst, width);
@ -118,4 +146,269 @@ void Rotate180(const uint8* src, int src_stride,
  }
 }
 // Portable transpose of an 8-row strip of interleaved byte pairs.
 // Reads |w| pairs from each of 8 source rows; the first byte of every
 // pair is written to dst_a and the second to dst_b, with source column c
 // becoming destination row c and source row r becoming destination
 // column r.
 static void TransposeUVWx8_C(const uint8* src, int src_stride,
                             uint8* dst_a, int dst_stride_a,
                             uint8* dst_b, int dst_stride_b,
                             int w) {
  for (int col = 0; col < w; ++col) {
    for (int row = 0; row < 8; ++row) {
      // Offset of the (first, second) byte pair for this row/column.
      const int pair_at = 2 * col + row * src_stride;
      dst_a[row + col * dst_stride_a] = src[pair_at];
      dst_b[row + col * dst_stride_b] = src[pair_at + 1];
    }
  }
 }
 // Portable transpose of an |h|-row strip of interleaved byte pairs.
 // Same operation as the 8-row kernel but for an arbitrary row count, so
 // it can finish the rows left over after the 8-row tiles.  The first
 // byte of every pair goes to dst_a, the second to dst_b.
 static void TransposeUVWxH_C(const uint8* src, int src_stride,
                             uint8* dst_a, int dst_stride_a,
                             uint8* dst_b, int dst_stride_b,
                             int w, int h) {
  for (int col = 0; col < w; ++col) {
    for (int row = 0; row < h; ++row) {
      // Offset of the (first, second) byte pair for this row/column.
      const int pair_at = 2 * col + row * src_stride;
      dst_a[row + col * dst_stride_a] = src[pair_at];
      dst_b[row + col * dst_stride_b] = src[pair_at + 1];
    }
  }
 }
 void TransposeUV(const uint8* src, int src_stride,
                 uint8* dst_a, int dst_stride_a,
                 uint8* dst_b, int dst_stride_b,
                 int width, int height) {
  int i = height;
  rotate_uv_wx8_func TransposeWx8;
  rotate_uv_wxh_func TransposeWxH;
  // do processor detection here.
 #ifdef __ARM_NEON__
  unsigned long long store_reg[8];
  SaveRegisters_NEON(store_reg);
  TransposeWx8 = TransposeUVWx8_NEON;
  TransposeWxH = TransposeUVWxH_C;
 #else
  TransposeWx8 = TransposeUVWx8_C;
  TransposeWxH = TransposeUVWxH_C;
 #endif
  // work through the source in 8x8 tiles
  while (i >= 8) {
    TransposeWx8(src, src_stride,
                 dst_a, dst_stride_a,
                 dst_b, dst_stride_b,
                 width);
    src   += 8 * src_stride;    // go down 8 rows
    dst_a += 8;                 // move over 8 columns
    dst_b += 8;                 // move over 8 columns
    i     -= 8;
  }
  TransposeWxH(src, src_stride,
               dst_a, dst_stride_a,
               dst_b, dst_stride_b,
               width, i);
 #ifdef __ARM_NEON__
  RestoreRegisters_NEON(store_reg);
 #endif
 }
 void RotateUV90(const uint8* src, int src_stride,
                uint8* dst_a, int dst_stride_a,
                uint8* dst_b, int dst_stride_b,
                int width, int height) {
  src += src_stride * (height - 1);
  src_stride = -src_stride;
  TransposeUV(src, src_stride,
              dst_a, dst_stride_a,
              dst_b, dst_stride_b,
              width, height);
 }
 // Rotate an interleaved plane by 270 while splitting it into two planes.
 // This is a transpose with the destination written from bottom to top,
 // so both destinations start on their last row (row width - 1) and use
 // negated strides.
 void RotateUV270(const uint8* src, int src_stride,
                  uint8* dst_a, int dst_stride_a,
                  uint8* dst_b, int dst_stride_b,
                  int width, int height) {
  const int last_row = width - 1;
  TransposeUV(src, src_stride,
              dst_a + dst_stride_a * last_row, -dst_stride_a,
              dst_b + dst_stride_b * last_row, -dst_stride_b,
              width, height);
 }
 // Reverse one row of |width| interleaved byte pairs and split it: the
 // last source pair becomes element 0 of dst_a (first byte) and dst_b
 // (second byte), the first source pair becomes element width - 1.
 static void ReverseLineUV_C(const uint8* src,
                            uint8* dst_a, uint8* dst_b,
                            int width) {
  // |in| walks the source pairs backwards while |out| walks forwards.
  for (int out = 0, in = 2 * (width - 1); out < width; ++out, in -= 2) {
    dst_a[out] = src[in];
    dst_b[out] = src[in + 1];
  }
 }
 void RotateUV180(const uint8* src, int src_stride,
                 uint8* dst_a, int dst_stride_a,
                 uint8* dst_b, int dst_stride_b,
                 int width, int height) {
  int i;
  reverse_uv_func ReverseLine;
  // TODO(frkoenig) : do processor detection here.
 #ifdef __ARM_NEON__
  ReverseLine = ReverseLineUV_NEON;
 #else
  ReverseLine = ReverseLineUV_C;
 #endif
  dst_a += dst_stride_a * (height - 1);
  dst_b += dst_stride_b * (height - 1);
  for (i = 0; i < height; ++i) {
    ReverseLine(src, dst_a, dst_b, width);
    src   += src_stride;      // down one line at a time
    dst_a -= dst_stride_a;    // nominally up one line at a time
    dst_b -= dst_stride_b;    // nominally up one line at a time
  }
 }
 // Rotate an I420 frame.
 //   width, height  dimensions of the source luma plane.  The chroma
 //                  planes are processed at the rounded-up half size.
 //                  A negative height means the source is inverted: the
 //                  planes are then read bottom-up.
 //   mode           rotation to apply.
 // Returns the result of I420Copy for kRotateNone, 0 for the three
 // rotations and -1 for any other |mode|.
 int I420Rotate(const uint8* src_y, int src_stride_y,
                const uint8* src_u, int src_stride_u,
                const uint8* src_v, int src_stride_v,
                uint8* dst_y, int dst_stride_y,
                uint8* dst_u, int dst_stride_u,
                uint8* dst_v, int dst_stride_v,
                int width, int height,
                RotationMode mode) {
  // Negative height means invert the image: point every plane at its
  // last row and walk upwards with a negated stride.
  if (height < 0) {
    height = -height;
    const int chroma_rows = (height + 1) >> 1;
    src_y += (height - 1) * src_stride_y;
    src_u += (chroma_rows - 1) * src_stride_u;
    src_v += (chroma_rows - 1) * src_stride_v;
    src_stride_y = -src_stride_y;
    src_stride_u = -src_stride_u;
    src_stride_v = -src_stride_v;
  }

  const int halfwidth = (width + 1) >> 1;
  const int halfheight = (height + 1) >> 1;

  if (mode == kRotateNone) {
    // copy frame
    return I420Copy(src_y, src_stride_y,
                    src_u, src_stride_u,
                    src_v, src_stride_v,
                    dst_y, dst_stride_y,
                    dst_u, dst_stride_u,
                    dst_v, dst_stride_v,
                    width, height);
  }

  if (mode == kRotateClockwise) {
    RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y,
                  width, height);
    RotatePlane90(src_u, src_stride_u, dst_u, dst_stride_u,
                  halfwidth, halfheight);
    RotatePlane90(src_v, src_stride_v, dst_v, dst_stride_v,
                  halfwidth, halfheight);
    return 0;
  }

  if (mode == kRotateCounterClockwise) {
    RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y,
                   width, height);
    RotatePlane270(src_u, src_stride_u, dst_u, dst_stride_u,
                   halfwidth, halfheight);
    RotatePlane270(src_v, src_stride_v, dst_v, dst_stride_v,
                   halfwidth, halfheight);
    return 0;
  }

  if (mode == kRotate180) {
    RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y,
                   width, height);
    RotatePlane180(src_u, src_stride_u, dst_u, dst_stride_u,
                   halfwidth, halfheight);
    RotatePlane180(src_v, src_stride_v, dst_v, dst_stride_v,
                   halfwidth, halfheight);
    return 0;
  }

  // Not a supported rotation.
  return -1;
 }
 // Convert an NV12 frame (Y plane plus one plane of interleaved chroma
 // byte pairs) to I420 and rotate it in the same pass.
 //   width, height  dimensions of the source luma plane.  The chroma data
 //                  is processed at the rounded-up half size.  A negative
 //                  height means the source is inverted (read bottom-up).
 //   mode           rotation to apply.
 // Returns the result of NV12ToI420 for kRotateNone, 0 for the three
 // rotations and -1 for any other |mode|.
 int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
                      const uint8* src_uv, int src_stride_uv,
                      uint8* dst_y, int dst_stride_y,
                      uint8* dst_u, int dst_stride_u,
                      uint8* dst_v, int dst_stride_v,
                      int width, int height,
                      RotationMode mode) {
  // Negative height means invert the image: start on the last row of
  // each source plane and walk upwards with a negated stride.
  if (height < 0) {
    height = -height;
    const int chroma_rows = (height + 1) >> 1;
    src_y += (height - 1) * src_stride_y;
    src_uv += (chroma_rows - 1) * src_stride_uv;
    src_stride_y = -src_stride_y;
    src_stride_uv = -src_stride_uv;
  }

  const int halfwidth = (width + 1) >> 1;
  const int halfheight = (height + 1) >> 1;

  if (mode == kRotateNone) {
    // copy frame
    // NOTE(review): only src_stride_y is forwarded here; src_stride_uv is
    // not passed on -- confirm against the NV12ToI420 signature.
    return NV12ToI420(src_y, src_uv, src_stride_y,
                      dst_y, dst_stride_y,
                      dst_u, dst_stride_u,
                      dst_v, dst_stride_v,
                      width, height);
  }

  if (mode == kRotateClockwise) {
    RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y,
                  width, height);
    RotateUV90(src_uv, src_stride_uv,
               dst_u, dst_stride_u,
               dst_v, dst_stride_v,
               halfwidth, halfheight);
    return 0;
  }

  if (mode == kRotateCounterClockwise) {
    RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y,
                   width, height);
    RotateUV270(src_uv, src_stride_uv,
                dst_u, dst_stride_u,
                dst_v, dst_stride_v,
                halfwidth, halfheight);
    return 0;
  }

  if (mode == kRotate180) {
    RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y,
                   width, height);
    RotateUV180(src_uv, src_stride_uv,
                dst_u, dst_stride_u,
                dst_v, dst_stride_v,
                halfwidth, halfheight);
    return 0;
  }

  // Not a supported rotation.
  return -1;
 }
 }  // namespace libyuv
--- a/source/rotate.h
+++ b/source/rotate.h
@ -1,46 +0,0 @@
 /*
 *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #ifndef LIBYUV_SOURCE_ROTATE_H_
 #define LIBYUV_SOURCE_ROTATE_H_
 #include "libyuv/basic_types.h"
 namespace libyuv {
 // Single plane rotations.  |width| and |height| are the dimensions of the
 // source plane; strides are the distance between consecutive rows.

 // Rotate a plane by 90: a transpose with the source read bottom to top.
 void Rotate90(const uint8* src, int src_stride,
               uint8* dst, int dst_stride,
               int width, int height);
 // Rotate a plane by 180: each row is reversed and the rows are written
 // to the destination last row first.
 void Rotate180(const uint8* src, int src_stride,
                uint8* dst, int dst_stride,
                int width, int height);
 // Rotate a plane by 270: a transpose with the destination written
 // bottom to top.
 void Rotate270(const uint8* src, int src_stride,
                uint8* dst, int dst_stride,
                int width, int height);
 // Deinterleaving rotations.  The source holds interleaved byte pairs;
 // the first byte of each pair goes to dst_a and the second to dst_b.
 // The implementations halve |width| before use, i.e. |width| counts
 // source bytes per row rather than pairs.
 void Rotate90_deinterleave(const uint8* src, int src_stride,
                            uint8* dst_a, int dst_stride_a,
                            uint8* dst_b, int dst_stride_b,
                            int width, int height);
 void Rotate180_deinterleave(const uint8* src, int src_stride,
                             uint8* dst_a, int dst_stride_a,
                             uint8* dst_b, int dst_stride_b,
                             int width, int height);
 void Rotate270_deinterleave(const uint8* src, int src_stride,
                             uint8* dst_a, int dst_stride_a,
                             uint8* dst_b, int dst_stride_b,
                             int width, int height);
 // Transpose a plane: source element (row, col) is written to destination
 // element (col, row).
 void Transpose(const uint8* src, int src_stride,
                uint8* dst, int dst_stride,
                int width, int height);
 }  // namespace libyuv
 #endif  // LIBYUV_SOURCE_ROTATE_H_
--- a/source/rotate_deinterleave.cc
+++ b/source/rotate_deinterleave.cc
@ -1,171 +0,0 @@
 /*
 *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include "rotate.h"
 namespace libyuv {
 typedef void (*reverse_func)(const uint8*, uint8*, uint8*, int);
 typedef void (*rotate_wx8func)(const uint8*, int,
                               uint8*, int,
                               uint8*, int, int);
 typedef void (*rotate_wxhfunc)(const uint8*, int,
                               uint8*, int,
                               uint8*, int, int, int);
 #ifdef __ARM_NEON__
 extern "C" {
 void RestoreRegisters_NEON(unsigned long long *restore);
 void ReverseLine_di_NEON(const uint8* src,
                         uint8* dst_a, uint8* dst_b,
                         int width);
 void SaveRegisters_NEON(unsigned long long *store);
 void Transpose_di_wx8_NEON(const uint8* src, int src_stride,
                           uint8* dst_a, int dst_stride_a,
                           uint8* dst_b, int dst_stride_b,
                           int width);
 }  // extern "C"
 #endif
 // Portable transpose of an 8-row strip of interleaved byte pairs.
 // |w| is the number of pairs per row.  The first byte of each pair is
 // written to dst_a and the second to dst_b; source column c becomes
 // destination row c and source row r becomes destination column r.
 static void Transpose_di_wx8_C(const uint8* src, int src_stride,
                                uint8* dst_a, int dst_stride_a,
                                uint8* dst_b, int dst_stride_b,
                                int w) {
  for (int col = 0; col < w; ++col) {
    for (int row = 0; row < 8; ++row) {
      // Offset of the byte pair for this row/column.
      const int pair_at = 2 * col + row * src_stride;
      dst_a[row + col * dst_stride_a] = src[pair_at];
      dst_b[row + col * dst_stride_b] = src[pair_at + 1];
    }
  }
 }
 // Portable transpose of an |h|-row strip of interleaved byte pairs.
 // Same operation as the 8-row kernel for an arbitrary number of rows;
 // used for the rows left over after the 8-row strips.
 static void Transpose_di_wxh_C(const uint8* src, int src_stride,
                                uint8* dst_a, int dst_stride_a,
                                uint8* dst_b, int dst_stride_b,
                                int w, int h) {
  for (int col = 0; col < w; ++col) {
    for (int row = 0; row < h; ++row) {
      // Offset of the byte pair for this row/column.
      const int pair_at = 2 * col + row * src_stride;
      dst_a[row + col * dst_stride_a] = src[pair_at];
      dst_b[row + col * dst_stride_b] = src[pair_at + 1];
    }
  }
 }
 // Transpose a plane of interleaved byte pairs into two separate planes.
 //   src, src_stride      source plane and row stride
 //   dst_a, dst_stride_a  receives the first byte of every pair
 //   dst_b, dst_stride_b  receives the second byte of every pair
 //   width                source bytes per row; halved below to get the
 //                        number of pairs
 //   height               number of source rows
 // The source is consumed in strips of 8 rows; the remaining 0..7 rows
 // are handled by the generic C kernel.
 void Transpose_deinterleave(const uint8* src, int src_stride,
                             uint8* dst_a, int dst_stride_a,
                             uint8* dst_b, int dst_stride_b,
                             int width, int height) {
  int i = height;
  rotate_wx8func Transpose_wx8;
  rotate_wxhfunc Transpose_wxh;
  // do processor detection here.
 #ifdef __ARM_NEON__
  // Preserve the registers saved by SaveRegisters_NEON around the calls
  // into the assembly kernel; they are restored before returning.
  unsigned long long store_reg[8];
  SaveRegisters_NEON(store_reg);
  Transpose_wx8 = Transpose_di_wx8_NEON;
  Transpose_wxh = Transpose_di_wxh_C;
 #else
  Transpose_wx8 = Transpose_di_wx8_C;
  Transpose_wxh = Transpose_di_wxh_C;
 #endif
  width >>= 1;
  // work across the source in 8x8 tiles.  The row count is tested before
  // every strip so that a plane with fewer than 8 rows never reaches the
  // 8-row kernel, which would read and write past the end of the planes.
  while (i >= 8) {
    Transpose_wx8(src, src_stride,
                  dst_a, dst_stride_a,
                  dst_b, dst_stride_b,
                  width);
    src   += 8 * src_stride;    // go down 8 rows
    dst_a += 8;                 // move over 8 columns
    dst_b += 8;                 // move over 8 columns
    i     -= 8;
  }
  // fewer than 8 rows remain (possibly none)
  Transpose_wxh(src, src_stride,
                dst_a, dst_stride_a,
                dst_b, dst_stride_b,
                width, i);
 #ifdef __ARM_NEON__
  RestoreRegisters_NEON(store_reg);
 #endif
 }
 void Rotate90_deinterleave(const uint8* src, int src_stride,
                           uint8* dst_a, int dst_stride_a,
                           uint8* dst_b, int dst_stride_b,
                            int width, int height) {
  src += src_stride*(height-1);
  src_stride = -src_stride;
  Transpose_deinterleave(src, src_stride,
                         dst_a, dst_stride_a,
                         dst_b, dst_stride_b,
                         width, height);
 }
 // Rotate an interleaved plane by 270 while splitting it into two planes:
 // a deinterleaving transpose that writes the destinations bottom to top.
 // |width| counts source bytes, so each destination has width / 2 rows
 // and the last one is row (width / 2) - 1.
 void Rotate270_deinterleave(const uint8* src, int src_stride,
                             uint8* dst_a, int dst_stride_a,
                             uint8* dst_b, int dst_stride_b,
                             int width, int height) {
  const int last_row = (width >> 1) - 1;
  Transpose_deinterleave(src, src_stride,
                         dst_a + dst_stride_a * last_row, -dst_stride_a,
                         dst_b + dst_stride_b * last_row, -dst_stride_b,
                         width, height);
 }
 // Reverse one row of |width| interleaved byte pairs and split it: source
 // pair k is written to element width - 1 - k of dst_a (first byte) and
 // dst_b (second byte).
 static void ReverseLine_di_C(const uint8* src,
                              uint8* dst_a, uint8* dst_b,
                              int width) {
  for (int pair = 0; pair < width; ++pair) {
    const int out = width - 1 - pair;
    dst_a[out] = src[2 * pair];
    dst_b[out] = src[2 * pair + 1];
  }
 }
 void Rotate180_deinterleave(const uint8* src, int src_stride,
                            uint8* dst_a, int dst_stride_a,
                            uint8* dst_b, int dst_stride_b,
                            int width, int height) {
  int i;
  reverse_func ReverseLine;
  // do processor detection here.
 #ifdef __ARM_NEON__
  ReverseLine = ReverseLine_di_NEON;
 #else
  ReverseLine = ReverseLine_di_C;
 #endif
  dst_a += dst_stride_a*(height-1);
  dst_b += dst_stride_b*(height-1);
  width >>= 1;
  for (i = 0; i < height; ++i) {
    ReverseLine(src, dst_a, dst_b, width);
    src   += src_stride;
    dst_a -= dst_stride_a;
    dst_b -= dst_stride_b;
  }
 }
 }  // namespace libyuv
--- a/source/rotate_deinterleave_neon.s
+++ b/source/rotate_deinterleave_neon.s
@ -1,310 +0,0 @@
  .global RestoreRegisters_NEON
  .global ReverseLine_di_NEON
  .global SaveRegisters_NEON
  .global Transpose_di_wx8_NEON
  .type RestoreRegisters_NEON, function
  .type ReverseLine_di_NEON, function
  .type SaveRegisters_NEON, function
  .type Transpose_di_wx8_NEON, function
@ void SaveRegisters_NEON (unsigned long long *store)
@ r0 unsigned long long *store
 SaveRegisters_NEON:
  vst1.i64    {d8, d9, d10, d11}, [r0]!
  vst1.i64    {d12, d13, d14, d15}, [r0]!
  bx          lr
@ void RestoreRegisters_NEON (unsigned long long *restore)
@ r0 unsigned long long *restore
 RestoreRegisters_NEON:
  vld1.i64    {d8, d9, d10, d11}, [r0]!
  vld1.i64    {d12, d13, d14, d15}, [r0]!
  bx          lr
@ void ReverseLine_di_NEON (const uint8* src,
@                           uint8* dst_a,
@                           uint8* dst_b,
@                           int width)
@ r0 const uint8* src
@ r1 uint8* dst_a
@ r2 uint8* dst_b
@ r3 width
 ReverseLine_di_NEON:
  @ compute where to start writing destination
  add         r1, r1, r3      @ dst_a + width
  add         r2, r2, r3      @ dst_b + width
  @ work on input segments that are multiples of 16, but
  @ width that has been passed is output segments, half
  @ the size of input.
  lsrs        r12, r3, #3
  beq         .line_residuals
  @ the output is written in to two blocks.
  mov         r12, #-8
  @ back off the destination by the size of the register that is
  @ going to be reversed
  sub         r1, r1, #8
  sub         r2, r2, #8
  @ the loop needs to run on blocks of 16.  what will be left
  @ over is either a negative number, the residuals that need
  @ to be done, or 0.  if this isn't subtracted off here the
  @ loop will run one extra time.
  sub         r3, r3, #8
 .segments_of_8:
    vld2.8      {d0, d1}, [r0]!         @ src += 16
    @ reverse the bytes in the 64 bit segments
    vrev64.8    q0, q0
    vst1.8      {d0}, [r1], r12         @ dst_a -= 8
    vst1.8      {d1}, [r2], r12         @ dst_b -= 8
    subs        r3, r3, #8
    bge         .segments_of_8
  @ add 8 back to the counter.  if the result is 0 there are no
  @ residuals so return
  adds        r3, r3, #8
  bxeq        lr
  add         r1, r1, #8
  add         r2, r2, #8
 .line_residuals:
  mov         r12, #-1
  sub         r1, r1, #1
  sub         r2, r2, #1
@ do this in neon registers as per
@ http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/
 .segments_of_2:
    vld2.8      {d0[0], d1[0]}, [r0]!     @ src += 2
    vst1.8      {d0[0]}, [r1], r12        @ dst_a -= 1
    vst1.8      {d1[0]}, [r2], r12        @ dst_b -= 1
    subs        r3, r3, #1
    bgt         .segments_of_2
  bx          lr
@ void Transpose_di_wx8_NEON (const uint8* src, int src_pitch,
@                             uint8* dst_a, int dst_pitch_a,
@                             uint8* dst_b, int dst_pitch_b,
@                             int width)
@
@ Reads 8 rows of byte-interleaved data (a0 b0 a1 b1 ...) from src,
@ splits it into its two components and writes each component
@ transposed: source column n becomes row n (8 bytes long) of dst_a
@ for the even bytes and of dst_b for the odd bytes.
@ width is counted in byte pairs: every pair consumes 2 source bytes
@ per row and produces one row in each destination.
@
@ r0 const uint8* src
@ r1 int src_pitch
@ r2 uint8* dst_a
@ r3 int dst_pitch_a
@ stack uint8* dst_b
@ stack int dst_pitch_b
@ stack int width
@
@ register use inside the routine:
@ r4  dst_b
@ r5  dst_pitch_b
@ r8  remaining width (loop counter)
@ r9  scratch row pointer
@ r12 address of the vtbl lookup table
@ d0-d15 are overwritten and are not saved here; d8-d15 have to be
@ preserved by the caller.
 Transpose_di_wx8_NEON:
  push        {r4-r9,lr}
  @ seven registers (28 bytes) were pushed, so the stack
  @ arguments start at sp + 28
  ldr         r4, [sp, #28]         @ dst_b
  ldr         r5, [sp, #32]         @ dst_pitch_b
  @ width has to live in r8: every counter update and compare
  @ below operates on r8.
  ldr         r8, [sp, #36]         @ width
  @ loops are on blocks of 8.  loop will stop when
  @ counter gets to or below 0.  starting the counter
  @ at w-8 allow for this
  sub         r8, #8
@ handle 8x8 blocks.  this should be the majority of the plane
 .loop_8x8:
    mov         r9, r0
    @ vld2.8 de-interleaves 16 bytes per row: the even bytes go
    @ to the first register, the odd bytes to the second, so
    @ each q register holds one row of a (low) and b (high).
    vld2.8      {d0,  d1},  [r9], r1
    vld2.8      {d2,  d3},  [r9], r1
    vld2.8      {d4,  d5},  [r9], r1
    vld2.8      {d6,  d7},  [r9], r1
    vld2.8      {d8,  d9},  [r9], r1
    vld2.8      {d10, d11}, [r9], r1
    vld2.8      {d12, d13}, [r9], r1
    vld2.8      {d14, d15}, [r9]
    @ 8x8 byte transpose of both halves at once, built from
    @ 2x2 swaps on 8, 16 and 32 bit elements.  the operands of
    @ the vtrn.8 step are in reversed order, which leaves the
    @ two bytes of every 16 bit pair swapped.
    vtrn.8      q1, q0
    vtrn.8      q3, q2
    vtrn.8      q5, q4
    vtrn.8      q7, q6
    vtrn.16     q1, q3
    vtrn.16     q0, q2
    vtrn.16     q5, q7
    vtrn.16     q4, q6
    vtrn.32     q1, q5
    vtrn.32     q0, q4
    vtrn.32     q3, q7
    vtrn.32     q2, q6
    @ undo the byte swap inside each 16 bit pair
    vrev16.8    q0, q0
    vrev16.8    q1, q1
    vrev16.8    q2, q2
    vrev16.8    q3, q3
    vrev16.8    q4, q4
    vrev16.8    q5, q5
    vrev16.8    q6, q6
    vrev16.8    q7, q7
    @ the transposed rows come out in the order q1, q0, q3, q2,
    @ q5, q4, q7, q6.  low halves go to dst_a ...
    mov         r9, r2
    vst1.8      {d2},  [r9], r3
    vst1.8      {d0},  [r9], r3
    vst1.8      {d6},  [r9], r3
    vst1.8      {d4},  [r9], r3
    vst1.8      {d10}, [r9], r3
    vst1.8      {d8},  [r9], r3
    vst1.8      {d14}, [r9], r3
    vst1.8      {d12}, [r9]
    @ ... and high halves go to dst_b
    mov         r9, r4
    vst1.8      {d3},  [r9], r5
    vst1.8      {d1},  [r9], r5
    vst1.8      {d7},  [r9], r5
    vst1.8      {d5},  [r9], r5
    vst1.8      {d11}, [r9], r5
    vst1.8      {d9},  [r9], r5
    vst1.8      {d15}, [r9], r5
    vst1.8      {d13}, [r9]
    add         r0, #8*2          @ src   += 8*2
    add         r2, r3, lsl #3    @ dst_a += 8 * dst_pitch_a
    add         r4, r5, lsl #3    @ dst_b += 8 * dst_pitch_b
    subs        r8,  #8           @ w     -= 8
    bge         .loop_8x8
  @ add 8 back to counter.  if the result is 0 there are
  @ no residuals.
  adds        r8, #8
  beq         .done
  @ some residual, so between 1 and 7 lines left to transpose
  cmp         r8, #2
  blt         .block_1x8
  cmp         r8, #4
  blt         .block_2x8
@ TODO(frkoenig) : clean this up
 .block_4x8:
  @ 4 byte pairs (8 bytes) from each of the 8 rows
  mov         r9, r0
  vld1.64     {d0}, [r9], r1
  vld1.64     {d1}, [r9], r1
  vld1.64     {d2}, [r9], r1
  vld1.64     {d3}, [r9], r1
  vld1.64     {d4}, [r9], r1
  vld1.64     {d5}, [r9], r1
  vld1.64     {d6}, [r9], r1
  vld1.64     {d7}, [r9]
  adr         r12, vtbl_4x4_transpose
  vld1.8      {q7}, [r12]
  @ separate the components: q0/q2 end up with the even (a)
  @ bytes of rows 0-3 / 4-7, q1/q3 with the odd (b) bytes
  vtrn.8      q0, q1
  vtrn.8      q2, q3
  @ the table lookups gather the 4 bytes of one column next to
  @ each other, two columns per d register
  vtbl.8      d8,  {d0, d1}, d14
  vtbl.8      d9,  {d0, d1}, d15
  vtbl.8      d10, {d2, d3}, d14
  vtbl.8      d11, {d2, d3}, d15
  vtbl.8      d12, {d4, d5}, d14
  vtbl.8      d13, {d4, d5}, d15
  vtbl.8      d0,  {d6, d7}, d14
  vtbl.8      d1,  {d6, d7}, d15
  @ dst_a: first 4 bytes of each of the 4 output rows
  mov         r9, r2
  vst1.32     {d8[0]},  [r9], r3
  vst1.32     {d8[1]},  [r9], r3
  vst1.32     {d9[0]},  [r9], r3
  vst1.32     {d9[1]},  [r9], r3
  @ dst_a: last 4 bytes of the same 4 output rows
  add         r9, r2, #4
  vst1.32     {d12[0]}, [r9], r3
  vst1.32     {d12[1]}, [r9], r3
  vst1.32     {d13[0]}, [r9], r3
  vst1.32     {d13[1]}, [r9]
  @ dst_b: first 4 bytes of each of the 4 output rows
  mov         r9, r4
  vst1.32     {d10[0]}, [r9], r5
  vst1.32     {d10[1]}, [r9], r5
  vst1.32     {d11[0]}, [r9], r5
  vst1.32     {d11[1]}, [r9], r5
  @ dst_b: last 4 bytes of the same 4 output rows
  add         r9, r4, #4
  vst1.32     {d0[0]},  [r9], r5
  vst1.32     {d0[1]},  [r9], r5
  vst1.32     {d1[0]},  [r9], r5
  vst1.32     {d1[1]},  [r9]
  add         r0, #4*2          @ src   += 4 * 2
  add         r2, r3, lsl #2    @ dst_a += 4 * dst_pitch_a
  add         r4, r5, lsl #2    @ dst_b += 4 * dst_pitch_b
  subs        r8,  #4           @ w     -= 4
  beq         .done
  @ some residual, check to see if it includes a 2x8 block,
  @ or less
  cmp         r8, #2
  blt         .block_1x8
 .block_2x8:
  @ two byte pairs per row: the first pair lands in d0/d1, the
  @ second in d2/d3, even rows in d0/d2 and odd rows in d1/d3
  mov         r9, r0
  vld2.16     {d0[0], d2[0]}, [r9], r1
  vld2.16     {d1[0], d3[0]}, [r9], r1
  vld2.16     {d0[1], d2[1]}, [r9], r1
  vld2.16     {d1[1], d3[1]}, [r9], r1
  vld2.16     {d0[2], d2[2]}, [r9], r1
  vld2.16     {d1[2], d3[2]}, [r9], r1
  vld2.16     {d0[3], d2[3]}, [r9], r1
  vld2.16     {d1[3], d3[3]}, [r9]
  @ after the swaps d0/d2 hold the a bytes and d1/d3 the b
  @ bytes of the two columns
  vtrn.8      d0, d1
  vtrn.8      d2, d3
  mov         r9, r2
  vst1.64     {d0}, [r9], r3
  vst1.64     {d2}, [r9]
  mov         r9, r4
  vst1.64     {d1}, [r9], r5
  vst1.64     {d3}, [r9]
  add         r0, #2*2          @ src   += 2 * 2
  add         r2, r3, lsl #1    @ dst_a += 2 * dst_pitch_a
  add         r4, r5, lsl #1    @ dst_b += 2 * dst_pitch_b
  subs        r8,  #2           @ w     -= 2
  beq         .done
 .block_1x8:
  @ last byte pair of every row: a bytes collect in d0, b
  @ bytes in d1.  r0 can be advanced directly, nothing follows.
  vld2.8      {d0[0], d1[0]}, [r0], r1
  vld2.8      {d0[1], d1[1]}, [r0], r1
  vld2.8      {d0[2], d1[2]}, [r0], r1
  vld2.8      {d0[3], d1[3]}, [r0], r1
  vld2.8      {d0[4], d1[4]}, [r0], r1
  vld2.8      {d0[5], d1[5]}, [r0], r1
  vld2.8      {d0[6], d1[6]}, [r0], r1
  vld2.8      {d0[7], d1[7]}, [r0]
  vst1.64     {d0}, [r2]
  vst1.64     {d1}, [r4]
 .done:
  pop         {r4-r9, pc}
@ byte indices for the vtbl lookups in .block_4x8
 vtbl_4x4_transpose:
  .byte  0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15
--- a/source/rotate_neon.s
+++ b/source/rotate_neon.s
@ -1,7 +1,15 @@
  .global RestoreRegisters_NEON
  .global ReverseLine_NEON
-  .global Transpose_wx8_NEON
+  .global ReverseLineUV_NEON
  .global SaveRegisters_NEON
  .global TransposeWx8_NEON
  .global TransposeUVWx8_NEON
  .type RestoreRegisters_NEON, function
  .type ReverseLine_NEON, function
-  .type Transpose_wx8_NEON, function
+  .type ReverseLineUV_NEON, function
  .type SaveRegisters_NEON, function
  .type TransposeWx8_NEON, function
  .type TransposeUVWx8_NEON, function
@ void ReverseLine_NEON (const uint8* src, uint8* dst, int width)
@ r0 const uint8* src
@ -23,7 +31,7 @@ ReverseLine_NEON:
  @ along with 16 to get the next location.
  mov         r3, #-24
-  beq         .line_residuals
+  beq         Lline_residuals
  @ back of destination by the size of the register that is
  @ going to be reversed
@ -35,7 +43,7 @@ ReverseLine_NEON:
  @ loop will run one extra time.
  sub         r2, #16
-.segments_of_16:
+Lsegments_of_16:
    vld1.8      {q0}, [r0]!               @ src += 16
    @ reverse the bytes in the 64 bit segments.  unable to reverse
@ -48,7 +56,7 @@ ReverseLine_NEON:
    vst1.8      {d0}, [r1], r3            @ dst -= 16
    subs        r2, #16
-    bge         .segments_of_16
+    bge         Lsegments_of_16
  @ add 16 back to the counter.  if the result is 0 there is no
  @ residuals so return
@ -57,7 +65,7 @@ ReverseLine_NEON:
  add         r1, #16
-.line_residuals:
+Lline_residuals:
  mov         r3, #-3
@ -65,38 +73,38 @@ ReverseLine_NEON:
  subs        r2, #2
  @ check for 16*n+1 scenarios where segments_of_2 should not
  @ be run, but there is something left over.
-  blt         .segment_of_1
+  blt         Lsegment_of_1
@ do this in neon registers as per
@ http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/
-.segments_of_2:
+Lsegments_of_2:
    vld2.8      {d0[0], d1[0]}, [r0]!     @ src += 2
    vst1.8      {d1[0]}, [r1]!
    vst1.8      {d0[0]}, [r1], r3         @ dst -= 2
    subs        r2, #2
-    bge         .segments_of_2
+    bge         Lsegments_of_2
  adds        r2, #2
  bxeq        lr
-.segment_of_1:
+Lsegment_of_1:
  add         r1, #1
  vld1.8      {d0[0]}, [r0]
  vst1.8      {d0[0]}, [r1]
  bx          lr
-@ void Transpose_wx8_NEON (const uint8* src, int src_pitch,
+@ void TransposeWx8_NEON (const uint8* src, int src_stride,
-@                          uint8* dst, int dst_pitch,
+@                         uint8* dst, int dst_stride,
-@                          int w)
+@                         int w)
@ r0 const uint8* src
-@ r1 int src_pitch
+@ r1 int src_stride
@ r2 uint8* dst
-@ r3 int dst_pitch
+@ r3 int dst_stride
@ stack int w
-Transpose_wx8_NEON:
+TransposeWx8_NEON:
  push        {r4,r8,r9,lr}
  ldr         r8, [sp, #16]        @ width
@ -107,7 +115,7 @@ Transpose_wx8_NEON:
  sub         r8, #8
@ handle 8x8 blocks.  this should be the majority of the plane
-.loop_8x8:
+Lloop_8x8:
    mov         r9, r0
    vld1.8      {d0}, [r9], r1
@ -151,23 +159,23 @@ Transpose_wx8_NEON:
    vst1.8      {d6}, [r9]
    add         r0, #8            @ src += 8
-    add         r2, r3, lsl #3    @ dst += 8 * dst_pitch
+    add         r2, r3, lsl #3    @ dst += 8 * dst_stride
    subs        r8,  #8           @ w   -= 8
-    bge         .loop_8x8
+    bge         Lloop_8x8
  @ add 8 back to counter.  if the result is 0 there are
  @ no residuals.
  adds        r8, #8
-  beq         .done
+  beq         Ldone
  @ some residual, so between 1 and 7 lines left to transpose
  cmp         r8, #2
-  blt         .block_1x8
+  blt         Lblock_1x8
  cmp         r8, #4
-  blt         .block_2x8
+  blt         Lblock_2x8
-.block_4x8:
+Lblock_4x8:
  mov         r9, r0
  vld1.32     {d0[0]}, [r9], r1
  vld1.32     {d0[1]}, [r9], r1
@ -202,16 +210,16 @@ Transpose_wx8_NEON:
  vst1.32     {d1[1]}, [r9]
  add         r0, #4            @ src += 4
-  add         r2, r3, lsl #2    @ dst += 4 * dst_pitch
+  add         r2, r3, lsl #2    @ dst += 4 * dst_stride
  subs        r8,  #4           @ w   -= 4
-  beq         .done
+  beq         Ldone
  @ some residual, check to see if it includes a 2x8 block,
  @ or less
  cmp         r8, #2
-  blt         .block_1x8
+  blt         Lblock_1x8
-.block_2x8:
+Lblock_2x8:
  mov         r9, r0
  vld1.16     {d0[0]}, [r9], r1
  vld1.16     {d1[0]}, [r9], r1
@ -230,11 +238,11 @@ Transpose_wx8_NEON:
  vst1.64     {d1}, [r9]
  add         r0, #2            @ src += 2
-  add         r2, r3, lsl #1    @ dst += 2 * dst_pitch
+  add         r2, r3, lsl #1    @ dst += 2 * dst_stride
  subs        r8,  #2           @ w   -= 2
-  beq         .done
+  beq         Ldone
-.block_1x8:
+Lblock_1x8:
  vld1.8      {d0[0]}, [r0], r1
  vld1.8      {d0[1]}, [r0], r1
  vld1.8      {d0[2]}, [r0], r1
@ -246,9 +254,310 @@ Transpose_wx8_NEON:
  vst1.64     {d0}, [r2]
-.done:
+Ldone:
  pop         {r4,r8,r9,pc}
 vtbl_4x4_transpose:
  .byte  0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15
@ void SaveRegisters_NEON (unsigned long long store)
@ r0 unsigned long long store
@
@ Writes d8-d15 (8 registers of 8 bytes, 64 bytes in total) to
@ memory starting at the address held in r0.
@ NOTE(review): the prototype above shows store as a value, but the
@ code uses r0 as a base address - confirm that the C declaration
@ passes a pointer to at least 64 bytes.
 SaveRegisters_NEON:
  vst1.i64    {d8, d9, d10, d11}, [r0]!
  vst1.i64    {d12, d13, d14, d15}, [r0]!
  bx          lr
@ void RestoreRegisters_NEON (unsigned long long store)
@ r0 unsigned long long store
@
@ Reloads d8-d15 (64 bytes in total) from memory starting at the
@ address held in r0, in the same order SaveRegisters_NEON writes
@ them.
@ NOTE(review): the prototype above shows store as a value, but the
@ code uses r0 as a base address - confirm that the C declaration
@ passes a pointer to at least 64 bytes.
 RestoreRegisters_NEON:
  vld1.i64    {d8, d9, d10, d11}, [r0]!
  vld1.i64    {d12, d13, d14, d15}, [r0]!
  bx          lr
@ void ReverseLineUV_NEON (const uint8* src,
@                          uint8* dst_a,
@                          uint8* dst_b,
@                          int width)
@
@ Splits one line of byte-interleaved data (a0 b0 a1 b1 ...) into
@ its two components and writes each of them in reverse order:
@ the even bytes to dst_a, the odd bytes to dst_b.
@ width is the number of bytes written to each destination; twice
@ as many bytes are read from src.
@ r12 is used as scratch.  No stack is used.
@
@ r0 const uint8* src
@ r1 uint8* dst_a
@ r2 uint8* dst_b
@ r3 width
 ReverseLineUV_NEON:
  @ compute where to start writing destination
  add         r1, r1, r3      @ dst_a + width
  add         r2, r2, r3      @ dst_b + width
  @ work on input segments that are multiples of 16, but
  @ width that has been passed is output segments, half
  @ the size of input.
  @ r12 only receives width / 8 to set the flags; it is
  @ overwritten right below.
  lsrs        r12, r3, #3
  beq         Lline_residuals_di
  @ the output is written in to two blocks.
  @ r12 is the post-store step: both destinations move back
  @ by 8 bytes after each store.
  mov         r12, #-8
  @ back of destination by the size of the register that is
  @ going to be reversed
  sub         r1, r1, #8
  sub         r2, r2, #8
  @ the loop needs to run on blocks of 8.  what will be left
  @ over is either a negative number, the residuals that need
  @ to be done, or 0.  if this isn't subtracted off here the
  @ loop will run one extra time.
  sub         r3, r3, #8
 Lsegments_of_8_di:
    @ de-interleave 16 source bytes: even bytes in d0, odd in d1
    vld2.8      {d0, d1}, [r0]!         @ src += 16
    @ reverse the bytes in the 64 bit segments
    vrev64.8    q0, q0
    vst1.8      {d0}, [r1], r12         @ dst_a -= 8
    vst1.8      {d1}, [r2], r12         @ dst_b -= 8
    subs        r3, r3, #8
    bge         Lsegments_of_8_di
  @ add 8 back to the counter.  if the result is 0 there is no
  @ residuals so return
  adds        r3, r3, #8
  bxeq        lr
  @ undo the final post-decrement so r1/r2 point just past the
  @ bytes that are still to be written
  add         r1, r1, #8
  add         r2, r2, #8
 Lline_residuals_di:
  @ from here on one byte per destination and iteration,
  @ stepping backwards by 1
  mov         r12, #-1
  sub         r1, r1, #1
  sub         r2, r2, #1
@ do this in neon registers as per
@ http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/
@ NOTE(review): the counter is only tested after the stores, so
@ this loop runs once even when it is entered with width == 0 -
@ confirm that callers never pass a zero width.
 Lsegments_of_1:
    vld2.8      {d0[0], d1[0]}, [r0]!     @ src += 2
    vst1.8      {d0[0]}, [r1], r12        @ dst_a -= 1
    vst1.8      {d1[0]}, [r2], r12        @ dst_b -= 1
    subs        r3, r3, #1
    bgt         Lsegments_of_1
  bx          lr
@ void TransposeUVWx8_NEON (const uint8* src, int src_stride,
@                           uint8* dst_a, int dst_stride_a,
@                           uint8* dst_b, int dst_stride_b,
@                           int width)
@
@ Reads 8 rows of byte-interleaved data (a0 b0 a1 b1 ...) from src,
@ splits it into its two components and writes each component
@ transposed: source column n becomes row n (8 bytes long) of dst_a
@ for the even bytes and of dst_b for the odd bytes.
@ width is counted in byte pairs: every pair consumes 2 source bytes
@ per row and produces one row in each destination.
@
@ r0 const uint8* src
@ r1 int src_stride
@ r2 uint8* dst_a
@ r3 int dst_stride_a
@ stack uint8* dst_b
@ stack int dst_stride_b
@ stack int width
@
@ register use inside the routine:
@ r4  dst_b
@ r5  dst_stride_b
@ r8  remaining width (loop counter)
@ r9  scratch row pointer
@ r12 address of the vtbl lookup table
@ r6 and r7 are pushed and popped but not otherwise used.
@ d0-d15 are overwritten and are not saved here; d8-d15 have to be
@ preserved by the caller (SaveRegisters_NEON and
@ RestoreRegisters_NEON in this file store and reload them).
 TransposeUVWx8_NEON:
  push        {r4-r9,lr}
  @ seven registers (28 bytes) were pushed, so the stack
  @ arguments start at sp + 28
  ldr         r4, [sp, #28]         @ dst_b
  ldr         r5, [sp, #32]         @ dst_stride_b
  ldr         r8, [sp, #36]         @ width
  @ loops are on blocks of 8.  loop will stop when
  @ counter gets to or below 0.  starting the counter
  @ at w-8 allow for this
  sub         r8, #8
@ handle 8x8 blocks.  this should be the majority of the plane
 Lloop_8x8_di:
    mov         r9, r0
    @ vld2.8 de-interleaves 16 bytes per row: the even bytes go
    @ to the first register, the odd bytes to the second, so
    @ each q register holds one row of a (low) and b (high).
    vld2.8      {d0,  d1},  [r9], r1
    vld2.8      {d2,  d3},  [r9], r1
    vld2.8      {d4,  d5},  [r9], r1
    vld2.8      {d6,  d7},  [r9], r1
    vld2.8      {d8,  d9},  [r9], r1
    vld2.8      {d10, d11}, [r9], r1
    vld2.8      {d12, d13}, [r9], r1
    vld2.8      {d14, d15}, [r9]
    @ 8x8 byte transpose of both halves at once, built from
    @ 2x2 swaps on 8, 16 and 32 bit elements.  the operands of
    @ the vtrn.8 step are in reversed order, which leaves the
    @ two bytes of every 16 bit pair swapped.
    vtrn.8      q1, q0
    vtrn.8      q3, q2
    vtrn.8      q5, q4
    vtrn.8      q7, q6
    vtrn.16     q1, q3
    vtrn.16     q0, q2
    vtrn.16     q5, q7
    vtrn.16     q4, q6
    vtrn.32     q1, q5
    vtrn.32     q0, q4
    vtrn.32     q3, q7
    vtrn.32     q2, q6
    @ undo the byte swap inside each 16 bit pair
    vrev16.8    q0, q0
    vrev16.8    q1, q1
    vrev16.8    q2, q2
    vrev16.8    q3, q3
    vrev16.8    q4, q4
    vrev16.8    q5, q5
    vrev16.8    q6, q6
    vrev16.8    q7, q7
    @ the transposed rows come out in the order q1, q0, q3, q2,
    @ q5, q4, q7, q6.  low halves go to dst_a ...
    mov         r9, r2
    vst1.8      {d2},  [r9], r3
    vst1.8      {d0},  [r9], r3
    vst1.8      {d6},  [r9], r3
    vst1.8      {d4},  [r9], r3
    vst1.8      {d10}, [r9], r3
    vst1.8      {d8},  [r9], r3
    vst1.8      {d14}, [r9], r3
    vst1.8      {d12}, [r9]
    @ ... and high halves go to dst_b
    mov         r9, r4
    vst1.8      {d3},  [r9], r5
    vst1.8      {d1},  [r9], r5
    vst1.8      {d7},  [r9], r5
    vst1.8      {d5},  [r9], r5
    vst1.8      {d11}, [r9], r5
    vst1.8      {d9},  [r9], r5
    vst1.8      {d15}, [r9], r5
    vst1.8      {d13}, [r9]
    add         r0, #8*2          @ src   += 8*2
    add         r2, r3, lsl #3    @ dst_a += 8 * dst_stride_a
    add         r4, r5, lsl #3    @ dst_b += 8 * dst_stride_b
    subs        r8,  #8           @ w     -= 8
    bge         Lloop_8x8_di
  @ add 8 back to counter.  if the result is 0 there are
  @ no residuals.
  adds        r8, #8
  beq         Ldone_di
  @ some residual, so between 1 and 7 lines left to transpose
  cmp         r8, #2
  blt         Lblock_1x8_di
  cmp         r8, #4
  blt         Lblock_2x8_di
@ TODO(frkoenig) : clean this up
 Lblock_4x8_di:
  @ 4 byte pairs (8 bytes) from each of the 8 rows
  mov         r9, r0
  vld1.64     {d0}, [r9], r1
  vld1.64     {d1}, [r9], r1
  vld1.64     {d2}, [r9], r1
  vld1.64     {d3}, [r9], r1
  vld1.64     {d4}, [r9], r1
  vld1.64     {d5}, [r9], r1
  vld1.64     {d6}, [r9], r1
  vld1.64     {d7}, [r9]
  adr         r12, vtbl_4x4_transpose_di
  vld1.8      {q7}, [r12]
  @ separate the components: q0/q2 end up with the even (a)
  @ bytes of rows 0-3 / 4-7, q1/q3 with the odd (b) bytes
  vtrn.8      q0, q1
  vtrn.8      q2, q3
  @ the table lookups gather the 4 bytes of one column next to
  @ each other, two columns per d register
  vtbl.8      d8,  {d0, d1}, d14
  vtbl.8      d9,  {d0, d1}, d15
  vtbl.8      d10, {d2, d3}, d14
  vtbl.8      d11, {d2, d3}, d15
  vtbl.8      d12, {d4, d5}, d14
  vtbl.8      d13, {d4, d5}, d15
  vtbl.8      d0,  {d6, d7}, d14
  vtbl.8      d1,  {d6, d7}, d15
  @ dst_a: first 4 bytes of each of the 4 output rows
  mov         r9, r2
  vst1.32     {d8[0]},  [r9], r3
  vst1.32     {d8[1]},  [r9], r3
  vst1.32     {d9[0]},  [r9], r3
  vst1.32     {d9[1]},  [r9], r3
  @ dst_a: last 4 bytes of the same 4 output rows
  add         r9, r2, #4
  vst1.32     {d12[0]}, [r9], r3
  vst1.32     {d12[1]}, [r9], r3
  vst1.32     {d13[0]}, [r9], r3
  vst1.32     {d13[1]}, [r9]
  @ dst_b: first 4 bytes of each of the 4 output rows
  mov         r9, r4
  vst1.32     {d10[0]}, [r9], r5
  vst1.32     {d10[1]}, [r9], r5
  vst1.32     {d11[0]}, [r9], r5
  vst1.32     {d11[1]}, [r9], r5
  @ dst_b: last 4 bytes of the same 4 output rows
  add         r9, r4, #4
  vst1.32     {d0[0]},  [r9], r5
  vst1.32     {d0[1]},  [r9], r5
  vst1.32     {d1[0]},  [r9], r5
  vst1.32     {d1[1]},  [r9]
  add         r0, #4*2          @ src   += 4 * 2
  add         r2, r3, lsl #2    @ dst_a += 4 * dst_stride_a
  add         r4, r5, lsl #2    @ dst_b += 4 * dst_stride_b
  subs        r8,  #4           @ w     -= 4
  beq         Ldone_di
  @ some residual, check to see if it includes a 2x8 block,
  @ or less
  cmp         r8, #2
  blt         Lblock_1x8_di
 Lblock_2x8_di:
  @ two byte pairs per row: the first pair lands in d0/d1, the
  @ second in d2/d3, even rows in d0/d2 and odd rows in d1/d3
  mov         r9, r0
  vld2.16     {d0[0], d2[0]}, [r9], r1
  vld2.16     {d1[0], d3[0]}, [r9], r1
  vld2.16     {d0[1], d2[1]}, [r9], r1
  vld2.16     {d1[1], d3[1]}, [r9], r1
  vld2.16     {d0[2], d2[2]}, [r9], r1
  vld2.16     {d1[2], d3[2]}, [r9], r1
  vld2.16     {d0[3], d2[3]}, [r9], r1
  vld2.16     {d1[3], d3[3]}, [r9]
  @ after the swaps d0/d2 hold the a bytes and d1/d3 the b
  @ bytes of the two columns
  vtrn.8      d0, d1
  vtrn.8      d2, d3
  mov         r9, r2
  vst1.64     {d0}, [r9], r3
  vst1.64     {d2}, [r9]
  mov         r9, r4
  vst1.64     {d1}, [r9], r5
  vst1.64     {d3}, [r9]
  add         r0, #2*2          @ src   += 2 * 2
  add         r2, r3, lsl #1    @ dst_a += 2 * dst_stride_a
  add         r4, r5, lsl #1    @ dst_b += 2 * dst_stride_b
  subs        r8,  #2           @ w     -= 2
  beq         Ldone_di
 Lblock_1x8_di:
  @ last byte pair of every row: a bytes collect in d0, b
  @ bytes in d1.  r0 can be advanced directly, nothing follows.
  vld2.8      {d0[0], d1[0]}, [r0], r1
  vld2.8      {d0[1], d1[1]}, [r0], r1
  vld2.8      {d0[2], d1[2]}, [r0], r1
  vld2.8      {d0[3], d1[3]}, [r0], r1
  vld2.8      {d0[4], d1[4]}, [r0], r1
  vld2.8      {d0[5], d1[5]}, [r0], r1
  vld2.8      {d0[6], d1[6]}, [r0], r1
  vld2.8      {d0[7], d1[7]}, [r0]
  vst1.64     {d0}, [r2]
  vst1.64     {d1}, [r4]
 Ldone_di:
  pop         {r4-r9, pc}
@ byte indices for the vtbl lookups in Lblock_4x8_di
 vtbl_4x4_transpose_di:
  .byte  0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15
--- a/source/rotate_priv.h
+++ b/source/rotate_priv.h
@ -0,0 +1,72 @@
 /*
 *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #ifndef SOURCE_ROTATE_PRIV_H_
 #define SOURCE_ROTATE_PRIV_H_
 #include "libyuv/basic_types.h"
 namespace libyuv {
 // Rotate planes by 90, 180, 270
 // Each function takes one source plane and one destination
 // plane, both described by a pointer and a stride, plus the
 // width and height of the plane.
 // NOTE(review): the declarations do not show whether width and
 // height describe the source or the destination, nor the
 // direction of rotation - confirm against the definitions.
 void
 RotatePlane90(const uint8* src, int src_stride,
               uint8* dst, int dst_stride,
               int width, int height);
 void
 RotatePlane180(const uint8* src, int src_stride,
                uint8* dst, int dst_stride,
                int width, int height);
 void
 RotatePlane270(const uint8* src, int src_stride,
                uint8* dst, int dst_stride,
                int width, int height);
 // Rotations for when U and V are interleaved.
 // These functions take one input pointer and
 // split the data into two buffers while
 // rotating them.
 // RotateUV90 has the same one-source / two-destination
 // signature as RotateUV180 and RotateUV270 below.
 void
 RotateUV90(const uint8* src, int src_stride,
            uint8* dst_a, int dst_stride_a,
            uint8* dst_b, int dst_stride_b,
            int width, int height);
 void
 RotateUV180(const uint8* src, int src_stride,
             uint8* dst_a, int dst_stride_a,
             uint8* dst_b, int dst_stride_b,
             int width, int height);
 void
 RotateUV270(const uint8* src, int src_stride,
             uint8* dst_a, int dst_stride_a,
             uint8* dst_b, int dst_stride_b,
             int width, int height);
 // The 90 and 270 functions are based on transposes.
 // Doing a transpose with reversing the read/write
 // order will result in a rotation by +- 90 degrees.
 // TransposePlane works on a single plane; TransposeUV takes
 // one source and two destinations like the RotateUV functions.
 void
 TransposePlane(const uint8* src, int src_stride,
                uint8* dst, int dst_stride,
                int width, int height);
 void
 TransposeUV(const uint8* src, int src_stride,
             uint8* dst_a, int dst_stride_a,
             uint8* dst_b, int dst_stride_b,
             int width, int height);
 }  // namespace libyuv
 #endif  // SOURCE_ROTATE_PRIV_H_
--- a/unit_test/rotate_test.cc
+++ b/unit_test/rotate_test.cc
--- a/unit_test/unit_test.h
+++ b/unit_test/unit_test.h
@ -11,7 +11,6 @@
 #ifndef UINIT_TEST_H_
 #define UINIT_TEST_H_
 #include "basic_types.h"
 #include <gtest/gtest.h>
 class libyuvTest : public ::testing::Test {
@ -20,8 +19,8 @@ class libyuvTest : public ::testing::Test {
  virtual void SetUp();
  virtual void TearDown();
-  const uint32 _rotate_max_w;
+  const int _rotate_max_w;
-  const uint32 _rotate_max_h;
+  const int _rotate_max_h;
 };