I420 and NV12 rotate functions.

Consolidate rotate files. Add unit tests for I420 and NV12 rotate functions. Fix remaining pitch/stride references. Review URL: http://webrtc-codereview.appspot.com/239001 git-svn-id: http://libyuv.googlecode.com/svn/trunk@32 16f28f9a-4ce2-e073-06de-1de4eb20be90
2025-12-06 16:56:55 +08:00 · 2011-10-19 17:52:15 +00:00 · 2011-10-19 17:52:15 +00:00 · 3de12ae1c6
commit 3de12ae1c6
parent a1280730c2
12 changed files with 1809 additions and 884 deletions
--- a/include/libyuv/general.h
+++ b/include/libyuv/general.h
@@ -20,14 +20,6 @@

 namespace libyuv {

-// Supported rotation
-enum RotationMode {
-  kRotateNone = 0,
-  kRotateClockwise = 90,
-  kRotateCounterClockwise = 270,
-  kRotate180 = 180,
-};
-
 // I420 mirror
 int
 I420Mirror(const uint8* src_yplane, int src_ystride,
@@ -50,17 +42,6 @@ I420Crop(uint8* frame,
         int src_width, int src_height,
         int dst_width, int dst_height);

-// Rotate I420 frame
-int
-I420Rotate(const uint8* src_yplane, int src_ystride,
-           const uint8* src_uplane, int src_ustride,
-           const uint8* src_vplane, int src_vstride,
-           uint8* dst_yplane, int dst_ystride,
-           uint8* dst_uplane, int dst_ustride,
-           uint8* dst_vplane, int dst_vstride,
-           int width, int height,
-           RotationMode mode);
-
 } // namespace libyuv

 #endif // INCLUDE_LIBYUV_GENERAL_H_
--- a/include/libyuv/rotate.h
+++ b/include/libyuv/rotate.h
@@ -0,0 +1,50 @@
+/*
+ *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_ROTATE_H_
+#define INCLUDE_LIBYUV_ROTATE_H_
+
+#include "libyuv/basic_types.h"
+
+namespace libyuv {
+
+// Supported rotation
+enum RotationMode {
+  kRotateNone = 0,
+  kRotateClockwise = 90,
+  kRotateCounterClockwise = 270,
+  kRotate180 = 180,
+};
+
+// Rotate I420 frame
+int
+I420Rotate(const uint8* src_y, int src_stride_y,
+           const uint8* src_u, int src_stride_u,
+           const uint8* src_v, int src_stride_v,
+           uint8* dst_y, int dst_stride_y,
+           uint8* dst_u, int dst_stride_u,
+           uint8* dst_v, int dst_stride_v,
+           int width, int height,
+           RotationMode mode);
+
+// Split an NV12 input buffer into Y, U, V buffers and
+// then rotate the buffers.
+int
+NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
+                 const uint8* src_uv, int src_stride_uv,
+                 uint8* dst_y, int dst_stride_y,
+                 uint8* dst_u, int dst_stride_u,
+                 uint8* dst_v, int dst_stride_v,
+                 int width, int height,
+                 RotationMode mode);
+
+}  // namespace libyuv
+
+#endif  // INCLUDE_LIBYUV_ROTATE_H_
--- a/libyuv.gyp
+++ b/libyuv.gyp
@@ -44,7 +44,6 @@
        'source/general.cc',
        'source/planar_functions.cc',
        'source/rotate.cc',
-        'source/rotate_deinterleave.cc',
        'source/row_table.cc',
        'source/scale.cc',
        'source/video_common.cc',
--- a/source/general.cc
+++ b/source/general.cc
@@ -13,7 +13,6 @@
 #include <string.h>     // memcpy(), memset()

 #include "libyuv/planar_functions.h"
-#include "rotate.h"

 namespace libyuv {

@@ -282,66 +281,4 @@ I420CropPad(const uint8* src_frame, int src_width,
  return 0;
 }

-int
-I420Rotate(const uint8* src_yplane, int src_ystride,
-           const uint8* src_uplane, int src_ustride,
-           const uint8* src_vplane, int src_vstride,
-           uint8* dst_yplane, int dst_ystride,
-           uint8* dst_uplane, int dst_ustride,
-           uint8* dst_vplane, int dst_vstride,
-           int width, int height,
-           RotationMode mode) {
-  switch (mode) {
-    case kRotateNone:
-      // copy frame
-      return I420Copy(src_yplane, src_ystride,
-                      src_uplane, src_ustride,
-                      src_vplane, src_vstride,
-                      dst_yplane, dst_ystride,
-                      dst_uplane, dst_ustride,
-                      dst_vplane, dst_vstride,
-                      width, height);
-      break;
-    case kRotateClockwise:
-      Rotate90(src_yplane, src_ystride,
-               dst_yplane, dst_ystride,
-               width, height);
-      Rotate90(src_uplane, src_ustride,
-               dst_uplane, dst_ustride,
-               width, height);
-      Rotate90(src_vplane, src_vstride,
-               dst_vplane, dst_vstride,
-               width, height);
-      return 0;
-      break;
-    case kRotateCounterClockwise:
-      Rotate270(src_yplane, src_ystride,
-                dst_yplane, dst_ystride,
-                width, height);
-      Rotate270(src_uplane, src_ustride,
-                dst_uplane, dst_ustride,
-                width, height);
-      Rotate270(src_vplane, src_vstride,
-                dst_vplane, dst_vstride,
-                width, height);
-      return 0;
-      break;
-    case kRotate180:
-      Rotate180(src_yplane, src_ystride,
-                dst_yplane, dst_ystride,
-                width, height);
-      Rotate180(src_uplane, src_ustride,
-                dst_uplane, dst_ustride,
-                width, height);
-      Rotate180(src_vplane, src_vstride,
-                dst_vplane, dst_vstride,
-               width, height);
-      return 0;
-    break;
-  default:
-    return -1;
-    break;
-  }
-}
-
 } // namespace libyuv
--- a/source/rotate.cc
+++ b/source/rotate.cc
@@ -8,23 +8,41 @@
 *  be found in the AUTHORS file in the root of the source tree.
 */

-#include "rotate.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/rotate.h"
+#include "rotate_priv.h"

 namespace libyuv {

+typedef void (*reverse_uv_func)(const uint8*, uint8*, uint8*, int);
 typedef void (*reverse_func)(const uint8*, uint8*, int);
-typedef void (*rotate_wx8func)(const uint8*, int, uint8*, int, int);
-typedef void (*rotate_wxhfunc)(const uint8*, int, uint8*, int, int, int);
+typedef void (*rotate_uv_wx8_func)(const uint8*, int,
+                                   uint8*, int,
+                                   uint8*, int, int);
+typedef void (*rotate_uv_wxh_func)(const uint8*, int,
+                                   uint8*, int,
+                                   uint8*, int, int, int);
+typedef void (*rotate_wx8_func)(const uint8*, int, uint8*, int, int);
+typedef void (*rotate_wxh_func)(const uint8*, int, uint8*, int, int, int);

 #ifdef __ARM_NEON__
 extern "C" {
+void RestoreRegisters_NEON(unsigned long long *restore);
+void SaveRegisters_NEON(unsigned long long *store);
 void ReverseLine_NEON(const uint8* src, uint8* dst, int width);
-void Transpose_wx8_NEON(const uint8* src, int src_stride,
+void ReverseLineUV_NEON(const uint8* src,
+                        uint8* dst_a, uint8* dst_b,
+                        int width);
+void TransposeWx8_NEON(const uint8* src, int src_stride,
                       uint8* dst, int dst_stride, int width);
+void TransposeUVWx8_NEON(const uint8* src, int src_stride,
+                         uint8* dst_a, int dst_stride_a,
+                         uint8* dst_b, int dst_stride_b,
+                         int width);
 }  // extern "C"
 #endif

-static void Transpose_wx8_C(const uint8* src, int src_stride,
+static void TransposeWx8_C(const uint8* src, int src_stride,
                           uint8* dst, int dst_stride,
                           int w) {
  int i, j;
@@ -33,7 +51,7 @@ static void Transpose_wx8_C(const uint8* src, int src_stride,
      dst[i * dst_stride + j] = src[j * src_stride + i];
 }

-static void Transpose_wxh_C(const uint8* src, int src_stride,
+static void TransposeWxH_C(const uint8* src, int src_stride,
                           uint8* dst, int dst_stride,
                           int width, int height) {
  int i, j;
@@ -42,72 +60,82 @@ static void Transpose_wxh_C(const uint8* src, int src_stride,
      dst[i * dst_stride + j] = src[j * src_stride + i];
 }

-void Transpose(const uint8* src, int src_stride,
+void TransposePlane(const uint8* src, int src_stride,
                    uint8* dst, int dst_stride,
                    int width, int height) {
  int i = height;
-  rotate_wx8func Transpose_wx8;
-  rotate_wxhfunc Transpose_wxh;
+  rotate_wx8_func TransposeWx8;
+  rotate_wxh_func TransposeWxH;

  // do processor detection here.
 #ifdef __ARM_NEON__
-  Transpose_wx8 = Transpose_wx8_NEON;
-  Transpose_wxh = Transpose_wxh_C;
+  TransposeWx8 = TransposeWx8_NEON;
+  TransposeWxH = TransposeWxH_C;
 #else
-  Transpose_wx8 = Transpose_wx8_C;
-  Transpose_wxh = Transpose_wxh_C;
+  TransposeWx8 = TransposeWx8_C;
+  TransposeWxH = TransposeWxH_C;
 #endif

  // work across the source in 8x8 tiles
-  do {
-    Transpose_wx8(src, src_stride, dst, dst_stride, width);
+  while (i >= 8) {
+    TransposeWx8(src, src_stride, dst, dst_stride, width);

-    src += 8 * src_stride;
-    dst += 8;
+    src += 8 * src_stride;    // go down 8 rows
+    dst += 8;                 // move over 8 columns
    i   -= 8;
-  } while (i >= 8);
-
-// TODO(frkoenig): Have wx4 and maybe wx2
-  Transpose_wxh(src, src_stride, dst, dst_stride, width, i);
  }

-void Rotate90(const uint8* src, int src_stride,
+  TransposeWxH(src, src_stride, dst, dst_stride, width, i);
+}
+
+void RotatePlane90(const uint8* src, int src_stride,
                   uint8* dst, int dst_stride,
                   int width, int height) {
+  // Rotate by 90 is a transpose with the source read
+  // from bottom to top.  So set the source pointer to the end
+  // of the buffer and flip the sign of the source stride.
  src += src_stride * (height - 1);
  src_stride = -src_stride;

-  Transpose(src, src_stride, dst, dst_stride, width, height);
+  TransposePlane(src, src_stride, dst, dst_stride, width, height);
 }

-void Rotate270(const uint8* src, int src_stride,
+void RotatePlane270(const uint8* src, int src_stride,
                    uint8* dst, int dst_stride,
                    int width, int height) {
+  // Rotate by 270 is a transpose with the destination written
+  // from bottom to top.  So set the destination pointer to the end
+  // of the buffer and flip the sign of the destination stride.
  dst += dst_stride * (width - 1);
  dst_stride = -dst_stride;

-  Transpose(src, src_stride, dst, dst_stride, width, height);
+  TransposePlane(src, src_stride, dst, dst_stride, width, height);
 }

 void ReverseLine_C(const uint8* src, uint8* dst, int width) {
  int i;
-  for (i = 0; i < width; ++i)
-    dst[width-1 - i] = src[i];
+  src += width;
+  for (i = 0; i < width; ++i) {
+    --src;
+    dst[i] = src[0];
+  }
 }

-void Rotate180(const uint8* src, int src_stride,
+void RotatePlane180(const uint8* src, int src_stride,
                    uint8* dst, int dst_stride,
                    int width, int height) {
  int i;
  reverse_func ReverseLine;

-  // do processor detection here.
+  // TODO(frkoenig): do processor detection here.
 #ifdef __ARM_NEON__
  ReverseLine = ReverseLine_NEON;
 #else
  ReverseLine = ReverseLine_C;
 #endif

+  // Rotate by 180 is a mirror with the destination
+  // written in reverse.
  dst += dst_stride * (height - 1);

  for (i = 0; i < height; ++i) {
@@ -118,4 +146,269 @@ void Rotate180(const uint8* src, int src_stride,
  }
 }

+static void TransposeUVWx8_C(const uint8* src, int src_stride,
+                             uint8* dst_a, int dst_stride_a,
+                             uint8* dst_b, int dst_stride_b,
+                             int w) {
+  int i, j;
+  for (i = 0; i < w * 2; i += 2)
+    for (j = 0; j < 8; ++j) {
+      dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)];
+      dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1];
+    }
+}
+
+static void TransposeUVWxH_C(const uint8* src, int src_stride,
+                             uint8* dst_a, int dst_stride_a,
+                             uint8* dst_b, int dst_stride_b,
+                             int w, int h) {
+  int i, j;
+  for (i = 0; i < w*2; i += 2)
+    for (j = 0; j < h; ++j) {
+      dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)];
+      dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1];
+    }
+}
+
+void TransposeUV(const uint8* src, int src_stride,
+                 uint8* dst_a, int dst_stride_a,
+                 uint8* dst_b, int dst_stride_b,
+                 int width, int height) {
+  int i = height;
+  rotate_uv_wx8_func TransposeWx8;
+  rotate_uv_wxh_func TransposeWxH;
+
+  // do processor detection here.
+#ifdef __ARM_NEON__
+  unsigned long long store_reg[8];
+  SaveRegisters_NEON(store_reg);
+  TransposeWx8 = TransposeUVWx8_NEON;
+  TransposeWxH = TransposeUVWxH_C;
+#else
+  TransposeWx8 = TransposeUVWx8_C;
+  TransposeWxH = TransposeUVWxH_C;
+#endif
+
+  // work through the source in 8x8 tiles
+  while (i >= 8) {
+    TransposeWx8(src, src_stride,
+                 dst_a, dst_stride_a,
+                 dst_b, dst_stride_b,
+                 width);
+
+    src   += 8 * src_stride;    // go down 8 rows
+    dst_a += 8;                 // move over 8 columns
+    dst_b += 8;                 // move over 8 columns
+    i     -= 8;
+  }
+
+  TransposeWxH(src, src_stride,
+               dst_a, dst_stride_a,
+               dst_b, dst_stride_b,
+               width, i);
+
+#ifdef __ARM_NEON__
+  RestoreRegisters_NEON(store_reg);
+#endif
+}
+
+void RotateUV90(const uint8* src, int src_stride,
+                uint8* dst_a, int dst_stride_a,
+                uint8* dst_b, int dst_stride_b,
+                int width, int height) {
+  src += src_stride * (height - 1);
+  src_stride = -src_stride;
+
+  TransposeUV(src, src_stride,
+              dst_a, dst_stride_a,
+              dst_b, dst_stride_b,
+              width, height);
+}
+
+void RotateUV270(const uint8* src, int src_stride,
+                 uint8* dst_a, int dst_stride_a,
+                 uint8* dst_b, int dst_stride_b,
+                 int width, int height) {
+  dst_a += dst_stride_a * (width - 1);
+  dst_b += dst_stride_b * (width - 1);
+  dst_stride_a = -dst_stride_a;
+  dst_stride_b = -dst_stride_b;
+
+  TransposeUV(src, src_stride,
+              dst_a, dst_stride_a,
+              dst_b, dst_stride_b,
+              width, height);
+}
+
+static void ReverseLineUV_C(const uint8* src,
+                            uint8* dst_a, uint8* dst_b,
+                            int width) {
+  int i;
+  src += width << 1;
+  for (i = 0; i < width; ++i) {
+    src -= 2;
+    dst_a[i] = src[0];
+    dst_b[i] = src[1];
+  }
+}
+
+void RotateUV180(const uint8* src, int src_stride,
+                 uint8* dst_a, int dst_stride_a,
+                 uint8* dst_b, int dst_stride_b,
+                 int width, int height) {
+  int i;
+  reverse_uv_func ReverseLine;
+
+  // TODO(frkoenig): do processor detection here.
+#ifdef __ARM_NEON__
+  ReverseLine = ReverseLineUV_NEON;
+#else
+  ReverseLine = ReverseLineUV_C;
+#endif
+
+  dst_a += dst_stride_a * (height - 1);
+  dst_b += dst_stride_b * (height - 1);
+
+  for (i = 0; i < height; ++i) {
+    ReverseLine(src, dst_a, dst_b, width);
+
+    src   += src_stride;      // down one line at a time
+    dst_a -= dst_stride_a;    // nominally up one line at a time
+    dst_b -= dst_stride_b;    // nominally up one line at a time
+  }
+}
+
+int I420Rotate(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height,
+               RotationMode mode) {
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
+
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    halfheight = (height + 1) >> 1;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (halfheight - 1) * src_stride_u;
+    src_v = src_v + (halfheight - 1) * src_stride_v;
+    src_stride_y = -src_stride_y;
+    src_stride_u = -src_stride_u;
+    src_stride_v = -src_stride_v;
+  }
+
+  switch (mode) {
+    case kRotateNone:
+      // copy frame
+      return I420Copy(src_y, src_stride_y,
+                      src_u, src_stride_u,
+                      src_v, src_stride_v,
+                      dst_y, dst_stride_y,
+                      dst_u, dst_stride_u,
+                      dst_v, dst_stride_v,
+                      width, height);
+    case kRotateClockwise:
+      RotatePlane90(src_y, src_stride_y,
+                    dst_y, dst_stride_y,
+                    width, height);
+      RotatePlane90(src_u, src_stride_u,
+                    dst_u, dst_stride_u,
+                    halfwidth, halfheight);
+      RotatePlane90(src_v, src_stride_v,
+                    dst_v, dst_stride_v,
+                    halfwidth, halfheight);
+      return 0;
+    case kRotateCounterClockwise:
+      RotatePlane270(src_y, src_stride_y,
+                     dst_y, dst_stride_y,
+                     width, height);
+      RotatePlane270(src_u, src_stride_u,
+                     dst_u, dst_stride_u,
+                     halfwidth, halfheight);
+      RotatePlane270(src_v, src_stride_v,
+                     dst_v, dst_stride_v,
+                     halfwidth, halfheight);
+      return 0;
+    case kRotate180:
+      RotatePlane180(src_y, src_stride_y,
+                     dst_y, dst_stride_y,
+                     width, height);
+      RotatePlane180(src_u, src_stride_u,
+                     dst_u, dst_stride_u,
+                     halfwidth, halfheight);
+      RotatePlane180(src_v, src_stride_v,
+                     dst_v, dst_stride_v,
+                     halfwidth, halfheight);
+      return 0;
+    default:
+      break;
+  }
+  return -1;
+}
+
+int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
+                     const uint8* src_uv, int src_stride_uv,
+                     uint8* dst_y, int dst_stride_y,
+                     uint8* dst_u, int dst_stride_u,
+                     uint8* dst_v, int dst_stride_v,
+                     int width, int height,
+                     RotationMode mode) {
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
+
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    halfheight = (height + 1) >> 1;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_uv = src_uv + (halfheight - 1) * src_stride_uv;
+    src_stride_y = -src_stride_y;
+    src_stride_uv = -src_stride_uv;
+  }
+
+  switch (mode) {
+    case kRotateNone:
+      // no rotation: just convert NV12 to I420
+      return NV12ToI420(src_y, src_uv, src_stride_y,
+                        dst_y, dst_stride_y,
+                        dst_u, dst_stride_u,
+                        dst_v, dst_stride_v,
+                        width, height);
+    case kRotateClockwise:
+      RotatePlane90(src_y, src_stride_y,
+                    dst_y, dst_stride_y,
+                    width, height);
+      RotateUV90(src_uv, src_stride_uv,
+                 dst_u, dst_stride_u,
+                 dst_v, dst_stride_v,
+                 halfwidth, halfheight);
+      return 0;
+    case kRotateCounterClockwise:
+      RotatePlane270(src_y, src_stride_y,
+                     dst_y, dst_stride_y,
+                     width, height);
+      RotateUV270(src_uv, src_stride_uv,
+                  dst_u, dst_stride_u,
+                  dst_v, dst_stride_v,
+                  halfwidth, halfheight);
+      return 0;
+    case kRotate180:
+      RotatePlane180(src_y, src_stride_y,
+                     dst_y, dst_stride_y,
+                     width, height);
+      RotateUV180(src_uv, src_stride_uv,
+                  dst_u, dst_stride_u,
+                  dst_v, dst_stride_v,
+                  halfwidth, halfheight);
+      return 0;
+    default:
+      break;
+  }
+  return -1;
+}
+
 }  // namespace libyuv
--- a/source/rotate.h
+++ b/source/rotate.h
@@ -1,46 +0,0 @@
-/*
- *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef LIBYUV_SOURCE_ROTATE_H_
-#define LIBYUV_SOURCE_ROTATE_H_
-
-#include "libyuv/basic_types.h"
-
-namespace libyuv {
-
-void Rotate90(const uint8* src, int src_stride,
-              uint8* dst, int dst_stride,
-              int width, int height);
-void Rotate180(const uint8* src, int src_stride,
-               uint8* dst, int dst_stride,
-               int width, int height);
-void Rotate270(const uint8* src, int src_stride,
-               uint8* dst, int dst_stride,
-               int width, int height);
-
-void Rotate90_deinterleave(const uint8* src, int src_stride,
-                           uint8* dst_a, int dst_stride_a,
-                           uint8* dst_b, int dst_stride_b,
-                           int width, int height);
-void Rotate180_deinterleave(const uint8* src, int src_stride,
-                            uint8* dst_a, int dst_stride_a,
-                            uint8* dst_b, int dst_stride_b,
-                            int width, int height);
-void Rotate270_deinterleave(const uint8* src, int src_stride,
-                            uint8* dst_a, int dst_stride_a,
-                            uint8* dst_b, int dst_stride_b,
-                            int width, int height);
-
-void Transpose(const uint8* src, int src_stride,
-               uint8* dst, int dst_stride,
-               int width, int height);
-}  // namespace libyuv
-
-#endif  // LIBYUV_SOURCE_ROTATE_H_
--- a/source/rotate_deinterleave.cc
+++ b/source/rotate_deinterleave.cc
@@ -1,171 +0,0 @@
-/*
- *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "rotate.h"
-
-namespace libyuv {
-
-typedef void (*reverse_func)(const uint8*, uint8*, uint8*, int);
-typedef void (*rotate_wx8func)(const uint8*, int,
-                               uint8*, int,
-                               uint8*, int, int);
-typedef void (*rotate_wxhfunc)(const uint8*, int,
-                               uint8*, int,
-                               uint8*, int, int, int);
-
-#ifdef __ARM_NEON__
-extern "C" {
-void RestoreRegisters_NEON(unsigned long long *restore);
-void ReverseLine_di_NEON(const uint8* src,
-                         uint8* dst_a, uint8* dst_b,
-                         int width);
-void SaveRegisters_NEON(unsigned long long *store);
-void Transpose_di_wx8_NEON(const uint8* src, int src_stride,
-                           uint8* dst_a, int dst_stride_a,
-                           uint8* dst_b, int dst_stride_b,
-                           int width);
-}  // extern "C"
-#endif
-
-static void Transpose_di_wx8_C(const uint8* src, int src_stride,
-                               uint8* dst_a, int dst_stride_a,
-                               uint8* dst_b, int dst_stride_b,
-                               int w) {
-  int i, j;
-  for (i = 0; i < w*2; i += 2)
-    for (j = 0; j < 8; ++j) {
-      dst_a[j + (i>>1)*dst_stride_a] = src[i + j*src_stride];
-      dst_b[j + (i>>1)*dst_stride_b] = src[i + j*src_stride + 1];
-    }
-}
-
-static void Transpose_di_wxh_C(const uint8* src, int src_stride,
-                               uint8* dst_a, int dst_stride_a,
-                               uint8* dst_b, int dst_stride_b,
-                               int w, int h) {
-  int i, j;
-  for (i = 0; i < w*2; i += 2)
-    for (j = 0; j < h; ++j) {
-      dst_a[j + (i>>1)*dst_stride_a] = src[i + j*src_stride];
-      dst_b[j + (i>>1)*dst_stride_b] = src[i + j*src_stride + 1];
-    }
-}
-
-void Transpose_deinterleave(const uint8* src, int src_stride,
-                            uint8* dst_a, int dst_stride_a,
-                            uint8* dst_b, int dst_stride_b,
-                            int width, int height) {
-  int i = height;
-  rotate_wx8func Transpose_wx8;
-  rotate_wxhfunc Transpose_wxh;
-
-  // do processor detection here.
-#ifdef __ARM_NEON__
-  unsigned long long store_reg[8];
-  SaveRegisters_NEON(store_reg);
-  Transpose_wx8 = Transpose_di_wx8_NEON;
-  Transpose_wxh = Transpose_di_wxh_C;
-#else
-  Transpose_wx8 = Transpose_di_wx8_C;
-  Transpose_wxh = Transpose_di_wxh_C;
-#endif
-
-  width >>= 1;
-
-  // work across the source in 8x8 tiles
-  do {
-    Transpose_wx8(src, src_stride,
-                  dst_a, dst_stride_a,
-                  dst_b, dst_stride_b,
-                  width);
-
-    src   += 8 * src_stride;
-    dst_a += 8;
-    dst_b += 8;
-    i     -= 8;
-  } while (i >= 8);
-
-  Transpose_wxh(src, src_stride,
-                dst_a, dst_stride_a,
-                dst_b, dst_stride_b,
-                width, i);
-
-#ifdef __ARM_NEON__
-  RestoreRegisters_NEON(store_reg);
-#endif
-}
-
-void Rotate90_deinterleave(const uint8* src, int src_stride,
-                           uint8* dst_a, int dst_stride_a,
-                           uint8* dst_b, int dst_stride_b,
-                            int width, int height) {
-  src += src_stride*(height-1);
-  src_stride = -src_stride;
-
-  Transpose_deinterleave(src, src_stride,
-                         dst_a, dst_stride_a,
-                         dst_b, dst_stride_b,
-                         width, height);
-}
-
-void Rotate270_deinterleave(const uint8* src, int src_stride,
-                            uint8* dst_a, int dst_stride_a,
-                            uint8* dst_b, int dst_stride_b,
-                            int width, int height) {
-  dst_a += dst_stride_a*((width>>1)-1);
-  dst_b += dst_stride_b*((width>>1)-1);
-  dst_stride_a = -dst_stride_a;
-  dst_stride_b = -dst_stride_b;
-
-  Transpose_deinterleave(src, src_stride,
-                         dst_a, dst_stride_a,
-                         dst_b, dst_stride_b,
-                         width, height);
-}
-
-static void ReverseLine_di_C(const uint8* src,
-                             uint8* dst_a, uint8* dst_b,
-                             int width) {
-  int i;
-  for (i = 0; i < width*2; i += 2) {
-    dst_a[width-1 - (i>>1)] = src[i];
-    dst_b[width-1 - (i>>1)] = src[i+1];
-  }
-}
-
-void Rotate180_deinterleave(const uint8* src, int src_stride,
-                            uint8* dst_a, int dst_stride_a,
-                            uint8* dst_b, int dst_stride_b,
-                            int width, int height) {
-  int i;
-  reverse_func ReverseLine;
-
-  // do processor detection here.
-#ifdef __ARM_NEON__
-  ReverseLine = ReverseLine_di_NEON;
-#else
-  ReverseLine = ReverseLine_di_C;
-#endif
-
-  dst_a += dst_stride_a*(height-1);
-  dst_b += dst_stride_b*(height-1);
-
-  width >>= 1;
-
-  for (i = 0; i < height; ++i) {
-    ReverseLine(src, dst_a, dst_b, width);
-
-    src   += src_stride;
-    dst_a -= dst_stride_a;
-    dst_b -= dst_stride_b;
-  }
-}
-
-}  // namespace libyuv
--- a/source/rotate_deinterleave_neon.s
+++ b/source/rotate_deinterleave_neon.s
@ -1,310 +0,0 @@
-  .global RestoreRegisters_NEON
-  .global ReverseLine_di_NEON
-  .global SaveRegisters_NEON
-  .global Transpose_di_wx8_NEON
-  .type RestoreRegisters_NEON, function
-  .type ReverseLine_di_NEON, function
-  .type SaveRegisters_NEON, function
-  .type Transpose_di_wx8_NEON, function
-
-@ void SaveRegisters_NEON (unsigned long long store)
-@ r0 unsigned long long store
-SaveRegisters_NEON:
-  vst1.i64    {d8, d9, d10, d11}, [r0]!
-  vst1.i64    {d12, d13, d14, d15}, [r0]!
-  bx          lr
-
-@ void RestoreRegisters_NEON (unsigned long long store)
-@ r0 unsigned long long store
-RestoreRegisters_NEON:
-  vld1.i64    {d8, d9, d10, d11}, [r0]!
-  vld1.i64    {d12, d13, d14, d15}, [r0]!
-  bx          lr
-
-
-@ void ReverseLine_NEON (const uint8* src,
-@                        uint8* dst_a,
-@                        uint8* dst_b,
-@                        int width)
-@ r0 const uint8* src
-@ r1 uint8* dst_a
-@ r2 uint8* dst_b
-@ r3 width
-ReverseLine_di_NEON:
-
-  @ compute where to start writing destination
-  add         r1, r1, r3      @ dst_a + width
-  add         r2, r2, r3      @ dst_b + width
-
-  @ work on input segments that are multiples of 16, but
-  @ width that has been passed is output segments, half
-  @ the size of input.
-  lsrs        r12, r3, #3
-
-  beq         .line_residuals
-
-  @ the output is written in to two blocks.
-  mov         r12, #-8
-
-  @ back of destination by the size of the register that is
-  @ going to be reversed
-  sub         r1, r1, #8
-  sub         r2, r2, #8
-
-  @ the loop needs to run on blocks of 16.  what will be left
-  @ over is either a negative number, the residuals that need
-  @ to be done, or 0.  if this isn't subtracted off here the
-  @ loop will run one extra time.
-  sub         r3, r3, #8
-
-.segments_of_8:
-    vld2.8      {d0, d1}, [r0]!         @ src += 16
-
-    @ reverse the bytes in the 64 bit segments
-    vrev64.8    q0, q0
-
-    vst1.8      {d0}, [r1], r12         @ dst_a -= 8
-    vst1.8      {d1}, [r2], r12         @ dst_b -= 8
-
-    subs        r3, r3, #8
-    bge         .segments_of_8
-
-  @ add 16 back to the counter.  if the result is 0 there is no
-  @ residuals so return
-  adds        r3, r3, #8
-  bxeq        lr
-
-  add         r1, r1, #8
-  add         r2, r2, #8
-
-.line_residuals:
-
-  mov         r12, #-1
-
-  sub         r1, r1, #1
-  sub         r2, r2, #1
-
-@ do this in neon registers as per
-@ http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/
-.segments_of_2:
-    vld2.8      {d0[0], d1[0]}, [r0]!     @ src += 2
-
-    vst1.8      {d0[0]}, [r1], r12        @ dst_a -= 1
-    vst1.8      {d1[0]}, [r2], r12        @ dst_b -= 1
-
-    subs        r3, r3, #1
-    bgt         .segments_of_2
-
-  bx          lr
-
-@ void Transpose_di_wx8_NEON (const uint8* src, int src_pitch,
-@                             uint8* dst_a, int dst_pitch_a,
-@                             uint8* dst_b, int dst_pitch_b,
-@                             int width)
-@ r0 const uint8* src
-@ r1 int src_pitch
-@ r2 uint8* dst_a
-@ r3 int dst_pitch_a
-@ stack uint8* dst_b
-@ stack int dst_pitch_b
-@ stack int width
-Transpose_di_wx8_NEON:
-  push        {r4-r9,lr}
-
-  ldr         r4, [sp, #28]         @ dst_b
-  ldr         r5, [sp, #32]         @ dst_pitch_b
-  ldr         r7, [sp, #36]         @ width
-  @ loops are on blocks of 8.  loop will stop when
-  @ counter gets to or below 0.  starting the counter
-  @ at w-8 allow for this
-  sub         r8, #8
-
-@ handle 8x8 blocks.  this should be the majority of the plane
-.loop_8x8:
-    mov         r9, r0
-
-    vld2.8      {d0,  d1},  [r9], r1
-    vld2.8      {d2,  d3},  [r9], r1
-    vld2.8      {d4,  d5},  [r9], r1
-    vld2.8      {d6,  d7},  [r9], r1
-    vld2.8      {d8,  d9},  [r9], r1
-    vld2.8      {d10, d11}, [r9], r1
-    vld2.8      {d12, d13}, [r9], r1
-    vld2.8      {d14, d15}, [r9]
-
-    vtrn.8      q1, q0
-    vtrn.8      q3, q2
-    vtrn.8      q5, q4
-    vtrn.8      q7, q6
-
-    vtrn.16     q1, q3
-    vtrn.16     q0, q2
-    vtrn.16     q5, q7
-    vtrn.16     q4, q6
-
-    vtrn.32     q1, q5
-    vtrn.32     q0, q4
-    vtrn.32     q3, q7
-    vtrn.32     q2, q6
-
-    vrev16.8    q0, q0
-    vrev16.8    q1, q1
-    vrev16.8    q2, q2
-    vrev16.8    q3, q3
-    vrev16.8    q4, q4
-    vrev16.8    q5, q5
-    vrev16.8    q6, q6
-    vrev16.8    q7, q7
-
-    mov         r9, r2
-
-    vst1.8      {d2},  [r9], r3
-    vst1.8      {d0},  [r9], r3
-    vst1.8      {d6},  [r9], r3
-    vst1.8      {d4},  [r9], r3
-    vst1.8      {d10}, [r9], r3
-    vst1.8      {d8},  [r9], r3
-    vst1.8      {d14}, [r9], r3
-    vst1.8      {d12}, [r9]
-
-    mov         r9, r4
-
-    vst1.8      {d3},  [r9], r5
-    vst1.8      {d1},  [r9], r5
-    vst1.8      {d7},  [r9], r5
-    vst1.8      {d5},  [r9], r5
-    vst1.8      {d11}, [r9], r5
-    vst1.8      {d9},  [r9], r5
-    vst1.8      {d15}, [r9], r5
-    vst1.8      {d13}, [r9]
-
-    add         r0, #8*2          @ src   += 8*2
-    add         r2, r3, lsl #3    @ dst_a += 8 * dst_pitch_a
-    add         r4, r5, lsl #3    @ dst_b += 8 * dst_pitch_b
-    subs        r8,  #8           @ w     -= 8
-    bge         .loop_8x8
-
-  @ add 8 back to counter.  if the result is 0 there are
-  @ no residuals.
-  adds        r8, #8
-  beq         .done
-
-  @ some residual, so between 1 and 7 lines left to transpose
-  cmp         r8, #2
-  blt         .block_1x8
-
-  cmp         r8, #4
-  blt         .block_2x8
-
-@ TODO(frkoenig) : clean this up
-.block_4x8:
-  mov         r9, r0
-  vld1.64     {d0}, [r9], r1
-  vld1.64     {d1}, [r9], r1
-  vld1.64     {d2}, [r9], r1
-  vld1.64     {d3}, [r9], r1
-  vld1.64     {d4}, [r9], r1
-  vld1.64     {d5}, [r9], r1
-  vld1.64     {d6}, [r9], r1
-  vld1.64     {d7}, [r9]
-
-  adr         r12, vtbl_4x4_transpose
-  vld1.8      {q7}, [r12]
-
-  vtrn.8      q0, q1
-  vtrn.8      q2, q3
-
-  vtbl.8      d8,  {d0, d1}, d14
-  vtbl.8      d9,  {d0, d1}, d15
-  vtbl.8      d10, {d2, d3}, d14
-  vtbl.8      d11, {d2, d3}, d15
-  vtbl.8      d12, {d4, d5}, d14
-  vtbl.8      d13, {d4, d5}, d15
-  vtbl.8      d0,  {d6, d7}, d14
-  vtbl.8      d1,  {d6, d7}, d15
-
-  mov         r9, r2
-
-  vst1.32     {d8[0]},  [r9], r3
-  vst1.32     {d8[1]},  [r9], r3
-  vst1.32     {d9[0]},  [r9], r3
-  vst1.32     {d9[1]},  [r9], r3
-
-  add         r9, r2, #4
-  vst1.32     {d12[0]}, [r9], r3
-  vst1.32     {d12[1]}, [r9], r3
-  vst1.32     {d13[0]}, [r9], r3
-  vst1.32     {d13[1]}, [r9]
-
-  mov         r9, r4
-
-  vst1.32     {d10[0]}, [r9], r5
-  vst1.32     {d10[1]}, [r9], r5
-  vst1.32     {d11[0]}, [r9], r5
-  vst1.32     {d11[1]}, [r9], r5
-
-  add         r9, r4, #4
-  vst1.32     {d0[0]},  [r9], r5
-  vst1.32     {d0[1]},  [r9], r5
-  vst1.32     {d1[0]},  [r9], r5
-  vst1.32     {d1[1]},  [r9]
-
-  add         r0, #4*2          @ src   += 4 * 2
-  add         r2, r3, lsl #2    @ dst_a += 4 * dst_pitch_a
-  add         r4, r5, lsl #2    @ dst_b += 4 * dst_pitch_b
-  subs        r8,  #4           @ w     -= 4
-  beq         .done
-
-  @ some residual, check to see if it includes a 2x8 block,
-  @ or less
-  cmp         r8, #2
-  blt         .block_1x8
-
-.block_2x8:
-  mov         r9, r0
-  vld2.16     {d0[0], d2[0]}, [r9], r1
-  vld2.16     {d1[0], d3[0]}, [r9], r1
-  vld2.16     {d0[1], d2[1]}, [r9], r1
-  vld2.16     {d1[1], d3[1]}, [r9], r1
-  vld2.16     {d0[2], d2[2]}, [r9], r1
-  vld2.16     {d1[2], d3[2]}, [r9], r1
-  vld2.16     {d0[3], d2[3]}, [r9], r1
-  vld2.16     {d1[3], d3[3]}, [r9]
-
-  vtrn.8      d0, d1
-  vtrn.8      d2, d3
-
-  mov         r9, r2
-
-  vst1.64     {d0}, [r9], r3
-  vst1.64     {d2}, [r9]
-
-  mov         r9, r4
-
-  vst1.64     {d1}, [r9], r5
-  vst1.64     {d3}, [r9]
-
-  add         r0, #2*2          @ src   += 2 * 2
-  add         r2, r3, lsl #1    @ dst_a += 2 * dst_pitch_a
-  add         r4, r5, lsl #1    @ dst_a += 2 * dst_pitch_a
-  subs        r8,  #2           @ w     -= 2
-  beq         .done
-
-.block_1x8:
-  vld2.8      {d0[0], d1[0]}, [r0], r1
-  vld2.8      {d0[1], d1[1]}, [r0], r1
-  vld2.8      {d0[2], d1[2]}, [r0], r1
-  vld2.8      {d0[3], d1[3]}, [r0], r1
-  vld2.8      {d0[4], d1[4]}, [r0], r1
-  vld2.8      {d0[5], d1[5]}, [r0], r1
-  vld2.8      {d0[6], d1[6]}, [r0], r1
-  vld2.8      {d0[7], d1[7]}, [r0]
-
-  vst1.64     {d0}, [r2]
-  vst1.64     {d1}, [r4]
-
-.done:
-  pop         {r4-r9, pc}
-
-vtbl_4x4_transpose:
-  .byte  0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15
--- a/source/rotate_neon.s
+++ b/source/rotate_neon.s
@ -1,7 +1,15 @@
+  .global RestoreRegisters_NEON
  .global ReverseLine_NEON
-  .global Transpose_wx8_NEON
+  .global ReverseLineUV_NEON
+  .global SaveRegisters_NEON
+  .global TransposeWx8_NEON
+  .global TransposeUVWx8_NEON
+  .type RestoreRegisters_NEON, function
  .type ReverseLine_NEON, function
-  .type Transpose_wx8_NEON, function
+  .type ReverseLineUV_NEON, function
+  .type SaveRegisters_NEON, function
+  .type TransposeWx8_NEON, function
+  .type TransposeUVWx8_NEON, function

@ void ReverseLine_NEON (const uint8* src, uint8* dst, int width)
@ r0 const uint8* src
@ -23,7 +31,7 @@ ReverseLine_NEON:
  @ along with 16 to get the next location.
  mov         r3, #-24

-  beq         .line_residuals
+  beq         Lline_residuals

  @ back of destination by the size of the register that is
  @ going to be reversed
@ -35,7 +43,7 @@ ReverseLine_NEON:
  @ loop will run one extra time.
  sub         r2, #16

-.segments_of_16:
+Lsegments_of_16:
    vld1.8      {q0}, [r0]!               @ src += 16

    @ reverse the bytes in the 64 bit segments.  unable to reverse
@ -48,7 +56,7 @@ ReverseLine_NEON:
    vst1.8      {d0}, [r1], r3            @ dst -= 16

    subs        r2, #16
-    bge         .segments_of_16
+    bge         Lsegments_of_16

  @ add 16 back to the counter.  if the result is 0 there is no
  @ residuals so return
@ -57,7 +65,7 @@ ReverseLine_NEON:

  add         r1, #16

-.line_residuals:
+Lline_residuals:

  mov         r3, #-3

@ -65,38 +73,38 @@ ReverseLine_NEON:
  subs        r2, #2
  @ check for 16*n+1 scenarios where segments_of_2 should not
  @ be run, but there is something left over.
-  blt         .segment_of_1
+  blt         Lsegment_of_1

@ do this in neon registers as per
@ http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/
-.segments_of_2:
+Lsegments_of_2:
    vld2.8      {d0[0], d1[0]}, [r0]!     @ src += 2

    vst1.8      {d1[0]}, [r1]!
    vst1.8      {d0[0]}, [r1], r3         @ dst -= 2

    subs        r2, #2
-    bge         .segments_of_2
+    bge         Lsegments_of_2

  adds        r2, #2
  bxeq        lr

-.segment_of_1:
+Lsegment_of_1:
  add         r1, #1
  vld1.8      {d0[0]}, [r0]
  vst1.8      {d0[0]}, [r1]

  bx          lr

-@ void Transpose_wx8_NEON (const uint8* src, int src_pitch,
-@                          uint8* dst, int dst_pitch,
+@ void TransposeWx8_NEON (const uint8* src, int src_stride,
+@                         uint8* dst, int dst_stride,
@                         int w)
@ r0 const uint8* src
-@ r1 int src_pitch
+@ r1 int src_stride
@ r2 uint8* dst
-@ r3 int dst_pitch
+@ r3 int dst_stride
@ stack int w
-Transpose_wx8_NEON:
+TransposeWx8_NEON:
  push        {r4,r8,r9,lr}

  ldr         r8, [sp, #16]        @ width
@ -107,7 +115,7 @@ Transpose_wx8_NEON:
  sub         r8, #8

@ handle 8x8 blocks.  this should be the majority of the plane
-.loop_8x8:
+Lloop_8x8:
    mov         r9, r0

    vld1.8      {d0}, [r9], r1
@ -151,23 +159,23 @@ Transpose_wx8_NEON:
    vst1.8      {d6}, [r9]

    add         r0, #8            @ src += 8
-    add         r2, r3, lsl #3    @ dst += 8 * dst_pitch
+    add         r2, r3, lsl #3    @ dst += 8 * dst_stride
    subs        r8,  #8           @ w   -= 8
-    bge         .loop_8x8
+    bge         Lloop_8x8

  @ add 8 back to counter.  if the result is 0 there are
  @ no residuals.
  adds        r8, #8
-  beq         .done
+  beq         Ldone

  @ some residual, so between 1 and 7 lines left to transpose
  cmp         r8, #2
-  blt         .block_1x8
+  blt         Lblock_1x8

  cmp         r8, #4
-  blt         .block_2x8
+  blt         Lblock_2x8

-.block_4x8:
+Lblock_4x8:
  mov         r9, r0
  vld1.32     {d0[0]}, [r9], r1
  vld1.32     {d0[1]}, [r9], r1
@ -202,16 +210,16 @@ Transpose_wx8_NEON:
  vst1.32     {d1[1]}, [r9]

  add         r0, #4            @ src += 4
-  add         r2, r3, lsl #2    @ dst += 4 * dst_pitch
+  add         r2, r3, lsl #2    @ dst += 4 * dst_stride
  subs        r8,  #4           @ w   -= 4
-  beq         .done
+  beq         Ldone

  @ some residual, check to see if it includes a 2x8 block,
  @ or less
  cmp         r8, #2
-  blt         .block_1x8
+  blt         Lblock_1x8

-.block_2x8:
+Lblock_2x8:
  mov         r9, r0
  vld1.16     {d0[0]}, [r9], r1
  vld1.16     {d1[0]}, [r9], r1
@ -230,11 +238,11 @@ Transpose_wx8_NEON:
  vst1.64     {d1}, [r9]

  add         r0, #2            @ src += 2
-  add         r2, r3, lsl #1    @ dst += 2 * dst_pitch
+  add         r2, r3, lsl #1    @ dst += 2 * dst_stride
  subs        r8,  #2           @ w   -= 2
-  beq         .done
+  beq         Ldone

-.block_1x8:
+Lblock_1x8:
  vld1.8      {d0[0]}, [r0], r1
  vld1.8      {d0[1]}, [r0], r1
  vld1.8      {d0[2]}, [r0], r1
@ -246,9 +254,310 @@ Transpose_wx8_NEON:

  vst1.64     {d0}, [r2]

-.done:
+Ldone:

  pop         {r4,r8,r9,pc}

 vtbl_4x4_transpose:
  .byte  0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15
+
+@ void SaveRegisters_NEON (unsigned long long store)
+@ r0 store: used as the address that d8-d15 (64 bytes) are written to
+SaveRegisters_NEON:
+  vst1.i64    {d8, d9, d10, d11}, [r0]!      @ first 32 bytes, r0 += 32
+  vst1.i64    {d12, d13, d14, d15}, [r0]!    @ next 32 bytes, r0 += 32
+  bx          lr
+
+@ void RestoreRegisters_NEON (unsigned long long store)
+@ r0 store: used as the address that d8-d15 (64 bytes) are reloaded from
+RestoreRegisters_NEON:
+  vld1.i64    {d8, d9, d10, d11}, [r0]!      @ first 32 bytes, r0 += 32
+  vld1.i64    {d12, d13, d14, d15}, [r0]!    @ next 32 bytes, r0 += 32
+  bx          lr
+
+@ void ReverseLineUV_NEON (const uint8* src,
+@                          uint8* dst_a,
+@                          uint8* dst_b,
+@                          int width)
+@ r0 const uint8* src    (interleaved a/b byte pairs, read forwards)
+@ r1 uint8* dst_a        (receives the first byte of each pair, reversed)
+@ r2 uint8* dst_b        (receives the second byte of each pair, reversed)
+@ r3 width               (number of bytes written to each of dst_a and dst_b)
+ReverseLineUV_NEON:
+
+  @ point each destination just past its last byte; writing goes backwards
+  add         r1, r1, r3      @ dst_a + width
+  add         r2, r2, r3      @ dst_b + width
+
+  @ each pass of the main loop reads 16 input bytes, but the
+  @ width that has been passed counts output bytes, which is
+  @ half the size of the input.
+  lsrs        r12, r3, #3     @ sets Z when width < 8
+
+  beq         Lline_residuals_di
+
+  @ post-index step for the stores: each destination moves back 8 bytes.
+  mov         r12, #-8
+
+  @ back off each destination by the size of the register that is
+  @ going to be stored (8 bytes)
+  sub         r1, r1, #8
+  sub         r2, r2, #8
+
+  @ the loop needs to run on blocks of 8.  what will be left
+  @ over is either a negative number (residuals still need
+  @ to be done) or 0.  if this isn't subtracted off here the
+  @ loop will run one extra time.
+  sub         r3, r3, #8
+
+Lsegments_of_8_di:
+    vld2.8      {d0, d1}, [r0]!         @ src += 16
+
+    @ reverse the bytes within each 64 bit register (d0 and d1)
+    vrev64.8    q0, q0
+
+    vst1.8      {d0}, [r1], r12         @ dst_a -= 8
+    vst1.8      {d1}, [r2], r12         @ dst_b -= 8
+
+    subs        r3, r3, #8
+    bge         Lsegments_of_8_di
+
+  @ add 8 back to the counter.  if the result is 0 there are no
+  @ residuals, so return
+  adds        r3, r3, #8
+  bxeq        lr
+
+  add         r1, r1, #8      @ undo the 8-byte back-off before the byte loop
+  add         r2, r2, #8
+
+Lline_residuals_di:
+
+  mov         r12, #-1        @ post-index step: one byte backwards
+
+  sub         r1, r1, #1
+  sub         r2, r2, #1
+
+@ do this in neon registers as per
+@ http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/
+Lsegments_of_1:
+    vld2.8      {d0[0], d1[0]}, [r0]!     @ src += 2
+
+    vst1.8      {d0[0]}, [r1], r12        @ dst_a -= 1
+    vst1.8      {d1[0]}, [r2], r12        @ dst_b -= 1
+
+    subs        r3, r3, #1
+    bgt         Lsegments_of_1
+
+  bx          lr
+
+@ void TransposeUVWx8_NEON (const uint8* src, int src_stride,
+@                           uint8* dst_a, int dst_stride_a,
+@                           uint8* dst_b, int dst_stride_b,
+@                           int width)
+@ r0 const uint8* src    (8 rows of interleaved a/b byte pairs)
+@ r1 int src_stride
+@ r2 uint8* dst_a
+@ r3 int dst_stride_a
+@ stack uint8* dst_b
+@ stack int dst_stride_b
+@ stack int width        (counted in a/b pairs per source row)
+TransposeUVWx8_NEON:
+  push        {r4-r9,lr}            @ 7 registers = 28 bytes pushed
+
+  ldr         r4, [sp, #28]         @ dst_b
+  ldr         r5, [sp, #32]         @ dst_stride_b
+  ldr         r8, [sp, #36]         @ width
+  @ loops are on blocks of 8.  the loop will stop when the
+  @ counter drops below 0.  starting the counter
+  @ at w-8 allows for this
+  sub         r8, #8
+
+@ handle 8x8 blocks.  this should be the majority of the plane
+Lloop_8x8_di:
+    mov         r9, r0
+
+    vld2.8      {d0,  d1},  [r9], r1
+    vld2.8      {d2,  d3},  [r9], r1
+    vld2.8      {d4,  d5},  [r9], r1
+    vld2.8      {d6,  d7},  [r9], r1
+    vld2.8      {d8,  d9},  [r9], r1    @ d8-d15 are overwritten and not restored here
+    vld2.8      {d10, d11}, [r9], r1
+    vld2.8      {d12, d13}, [r9], r1
+    vld2.8      {d14, d15}, [r9]
+
+    vtrn.8      q1, q0
+    vtrn.8      q3, q2
+    vtrn.8      q5, q4
+    vtrn.8      q7, q6
+
+    vtrn.16     q1, q3
+    vtrn.16     q0, q2
+    vtrn.16     q5, q7
+    vtrn.16     q4, q6
+
+    vtrn.32     q1, q5
+    vtrn.32     q0, q4
+    vtrn.32     q3, q7
+    vtrn.32     q2, q6
+
+    vrev16.8    q0, q0
+    vrev16.8    q1, q1
+    vrev16.8    q2, q2
+    vrev16.8    q3, q3
+    vrev16.8    q4, q4
+    vrev16.8    q5, q5
+    vrev16.8    q6, q6
+    vrev16.8    q7, q7
+
+    mov         r9, r2              @ even d registers go to dst_a
+
+    vst1.8      {d2},  [r9], r3
+    vst1.8      {d0},  [r9], r3
+    vst1.8      {d6},  [r9], r3
+    vst1.8      {d4},  [r9], r3
+    vst1.8      {d10}, [r9], r3
+    vst1.8      {d8},  [r9], r3
+    vst1.8      {d14}, [r9], r3
+    vst1.8      {d12}, [r9]
+
+    mov         r9, r4              @ odd d registers go to dst_b
+
+    vst1.8      {d3},  [r9], r5
+    vst1.8      {d1},  [r9], r5
+    vst1.8      {d7},  [r9], r5
+    vst1.8      {d5},  [r9], r5
+    vst1.8      {d11}, [r9], r5
+    vst1.8      {d9},  [r9], r5
+    vst1.8      {d15}, [r9], r5
+    vst1.8      {d13}, [r9]
+
+    add         r0, #8*2          @ src   += 8*2
+    add         r2, r3, lsl #3    @ dst_a += 8 * dst_stride_a
+    add         r4, r5, lsl #3    @ dst_b += 8 * dst_stride_b
+    subs        r8,  #8           @ w     -= 8
+    bge         Lloop_8x8_di
+
+  @ add 8 back to counter.  if the result is 0 there are
+  @ no residuals.
+  adds        r8, #8
+  beq         Ldone_di
+
+  @ some residual, so between 1 and 7 a/b pairs are left to transpose
+  cmp         r8, #2
+  blt         Lblock_1x8_di
+
+  cmp         r8, #4
+  blt         Lblock_2x8_di
+
+@ TODO(frkoenig) : clean this up
+Lblock_4x8_di:
+  mov         r9, r0
+  vld1.64     {d0}, [r9], r1
+  vld1.64     {d1}, [r9], r1
+  vld1.64     {d2}, [r9], r1
+  vld1.64     {d3}, [r9], r1
+  vld1.64     {d4}, [r9], r1
+  vld1.64     {d5}, [r9], r1
+  vld1.64     {d6}, [r9], r1
+  vld1.64     {d7}, [r9]
+
+  adr         r12, vtbl_4x4_transpose_di
+  vld1.8      {q7}, [r12]         @ table indices land in d14/d15
+
+  vtrn.8      q0, q1
+  vtrn.8      q2, q3
+
+  vtbl.8      d8,  {d0, d1}, d14
+  vtbl.8      d9,  {d0, d1}, d15
+  vtbl.8      d10, {d2, d3}, d14
+  vtbl.8      d11, {d2, d3}, d15
+  vtbl.8      d12, {d4, d5}, d14
+  vtbl.8      d13, {d4, d5}, d15
+  vtbl.8      d0,  {d6, d7}, d14
+  vtbl.8      d1,  {d6, d7}, d15
+
+  mov         r9, r2
+
+  vst1.32     {d8[0]},  [r9], r3
+  vst1.32     {d8[1]},  [r9], r3
+  vst1.32     {d9[0]},  [r9], r3
+  vst1.32     {d9[1]},  [r9], r3
+
+  add         r9, r2, #4          @ second group of 4 bytes in the same dst_a rows
+  vst1.32     {d12[0]}, [r9], r3
+  vst1.32     {d12[1]}, [r9], r3
+  vst1.32     {d13[0]}, [r9], r3
+  vst1.32     {d13[1]}, [r9]
+
+  mov         r9, r4
+
+  vst1.32     {d10[0]}, [r9], r5
+  vst1.32     {d10[1]}, [r9], r5
+  vst1.32     {d11[0]}, [r9], r5
+  vst1.32     {d11[1]}, [r9], r5
+
+  add         r9, r4, #4          @ second group of 4 bytes in the same dst_b rows
+  vst1.32     {d0[0]},  [r9], r5
+  vst1.32     {d0[1]},  [r9], r5
+  vst1.32     {d1[0]},  [r9], r5
+  vst1.32     {d1[1]},  [r9]
+
+  add         r0, #4*2          @ src   += 4 * 2
+  add         r2, r3, lsl #2    @ dst_a += 4 * dst_stride_a
+  add         r4, r5, lsl #2    @ dst_b += 4 * dst_stride_b
+  subs        r8,  #4           @ w     -= 4
+  beq         Ldone_di
+
+  @ some residual, check to see if it includes a 2x8 block,
+  @ or less
+  cmp         r8, #2
+  blt         Lblock_1x8_di
+
+Lblock_2x8_di:
+  mov         r9, r0
+  vld2.16     {d0[0], d2[0]}, [r9], r1
+  vld2.16     {d1[0], d3[0]}, [r9], r1
+  vld2.16     {d0[1], d2[1]}, [r9], r1
+  vld2.16     {d1[1], d3[1]}, [r9], r1
+  vld2.16     {d0[2], d2[2]}, [r9], r1
+  vld2.16     {d1[2], d3[2]}, [r9], r1
+  vld2.16     {d0[3], d2[3]}, [r9], r1
+  vld2.16     {d1[3], d3[3]}, [r9]
+
+  vtrn.8      d0, d1
+  vtrn.8      d2, d3
+
+  mov         r9, r2
+
+  vst1.64     {d0}, [r9], r3
+  vst1.64     {d2}, [r9]
+
+  mov         r9, r4
+
+  vst1.64     {d1}, [r9], r5
+  vst1.64     {d3}, [r9]
+
+  add         r0, #2*2          @ src   += 2 * 2
+  add         r2, r3, lsl #1    @ dst_a += 2 * dst_stride_a
+  add         r4, r5, lsl #1    @ dst_b += 2 * dst_stride_b
+  subs        r8,  #2           @ w     -= 2
+  beq         Ldone_di
+
+Lblock_1x8_di:
+  vld2.8      {d0[0], d1[0]}, [r0], r1    @ one a/b pair from each of the 8 rows
+  vld2.8      {d0[1], d1[1]}, [r0], r1
+  vld2.8      {d0[2], d1[2]}, [r0], r1
+  vld2.8      {d0[3], d1[3]}, [r0], r1
+  vld2.8      {d0[4], d1[4]}, [r0], r1
+  vld2.8      {d0[5], d1[5]}, [r0], r1
+  vld2.8      {d0[6], d1[6]}, [r0], r1
+  vld2.8      {d0[7], d1[7]}, [r0]
+
+  vst1.64     {d0}, [r2]                  @ 8 bytes to the current dst_a row
+  vst1.64     {d1}, [r4]                  @ 8 bytes to the current dst_b row
+
+Ldone_di:
+  pop         {r4-r9, pc}
+
+vtbl_4x4_transpose_di:
+  .byte  0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15
--- a/source/rotate_priv.h
+++ b/source/rotate_priv.h
@ -0,0 +1,72 @@
+/*
+ *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef SOURCE_ROTATE_PRIV_H_
+#define SOURCE_ROTATE_PRIV_H_
+
+#include "libyuv/basic_types.h"
+
+namespace libyuv {
+
+// Rotate a single plane by 90, 180 or 270 degrees.
+void
+RotatePlane90(const uint8* src, int src_stride,
+              uint8* dst, int dst_stride,
+              int width, int height);
+
+void
+RotatePlane180(const uint8* src, int src_stride,
+               uint8* dst, int dst_stride,
+               int width, int height);
+
+void
+RotatePlane270(const uint8* src, int src_stride,
+               uint8* dst, int dst_stride,
+               int width, int height);
+
+void
+RotateUV90(const uint8* src, int src_stride,
+           uint8* dst_a, int dst_stride_a,
+           uint8* dst_b, int dst_stride_b,
+           int width, int height);
+
+// RotateUV90 (above), RotateUV180 and RotateUV270 are the
+// rotations for when U and V are interleaved.  They take one
+// input pointer and split the data into two buffers
+// (dst_a and dst_b) while rotating them.
+void
+RotateUV180(const uint8* src, int src_stride,
+            uint8* dst_a, int dst_stride_a,
+            uint8* dst_b, int dst_stride_b,
+            int width, int height);
+
+void
+RotateUV270(const uint8* src, int src_stride,
+            uint8* dst_a, int dst_stride_a,
+            uint8* dst_b, int dst_stride_b,
+            int width, int height);
+
+// The 90 and 270 functions are based on transposes.
+// Doing a transpose while reversing the read or write
+// order will result in a rotation by +- 90 degrees.
+void
+TransposePlane(const uint8* src, int src_stride,
+               uint8* dst, int dst_stride,
+               int width, int height);
+
+void
+TransposeUV(const uint8* src, int src_stride,
+            uint8* dst_a, int dst_stride_a,
+            uint8* dst_b, int dst_stride_b,
+            int width, int height);
+
+}  // namespace libyuv
+
+#endif  // SOURCE_ROTATE_PRIV_H_
--- a/unit_test/rotate_test.cc
+++ b/unit_test/rotate_test.cc
--- a/unit_test/unit_test.h
+++ b/unit_test/unit_test.h
@ -11,7 +11,6 @@
 #ifndef UINIT_TEST_H_
 #define UINIT_TEST_H_

-#include "basic_types.h"
 #include <gtest/gtest.h>

 class libyuvTest : public ::testing::Test {
@ -20,8 +19,8 @@ class libyuvTest : public ::testing::Test {
  virtual void SetUp();
  virtual void TearDown();

-  const uint32 _rotate_max_w;
-  const uint32 _rotate_max_h;
+  const int _rotate_max_w;
+  const int _rotate_max_h;

 };