diff --git a/libyuv.gyp b/libyuv.gyp index ea3575eee..c6b674962 100644 --- a/libyuv.gyp +++ b/libyuv.gyp @@ -28,7 +28,6 @@ 'include/libyuv/general.h', 'include/libyuv/scale.h', 'include/libyuv/planar_functions.h', - # headers 'source/conversion_tables.h', @@ -59,6 +58,15 @@ 'source/row_posix.cc', ], }], + ['target_arch=="arm"',{ + 'conditions': [ + ['arm_neon==1', { + 'sources' : [ + 'source/rotate_neon.cc', + ], + }], + ], + }], ] }, ], # targets diff --git a/source/rotate.cc b/source/rotate.cc index 752f756dd..efd674d86 100644 --- a/source/rotate.cc +++ b/source/rotate.cc @@ -43,7 +43,6 @@ typedef void (*rotate_wx8_func)(const uint8*, int, uint8*, int, int); typedef void (*rotate_wxh_func)(const uint8*, int, uint8*, int, int, int); #ifdef __ARM_NEON__ -extern "C" { #define HAS_REVERSE_LINE_NEON void ReverseLine_NEON(const uint8* src, uint8* dst, int width); #define HAS_REVERSE_LINE_UV_NEON @@ -58,7 +57,6 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride, uint8* dst_a, int dst_stride_a, uint8* dst_b, int dst_stride_b, int width); -} // extern "C" #endif #if defined(WIN32) && !defined(COVERAGE_ENABLED) @@ -784,10 +782,7 @@ void TransposePlane(const uint8* src, int src_stride, rotate_wxh_func TransposeWxH; #if defined(HAS_TRANSPOSE_WX8_NEON) - if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) && - (width % 8 == 0) && - IS_ALIGNED(src, 8) && (src_stride % 8 == 0) && - IS_ALIGNED(dst, 8) && (dst_stride % 8 == 0)) { + if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON)) { TransposeWx8 = TransposeWx8_NEON; TransposeWxH = TransposeWxH_C; } else @@ -917,10 +912,7 @@ void RotatePlane180(const uint8* src, int src_stride, reverse_func ReverseLine; #if defined(HAS_REVERSE_LINE_NEON) - if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) && - (width % 16 == 0) && - IS_ALIGNED(src, 16) && (src_stride % 16 == 0) && - IS_ALIGNED(dst, 16) && (dst_stride % 16 == 0)) { + if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON)) { ReverseLine = ReverseLine_NEON; } else #endif @@ -1145,11 +1137,7 @@ void RotateUV180(const uint8* src, int src_stride, reverse_uv_func ReverseLine; #if defined(HAS_REVERSE_LINE_UV_NEON) - if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) && - (width % 16 == 0) && - IS_ALIGNED(src, 16) && (src_stride % 16 == 0) && - IS_ALIGNED(dst_a, 8) && (dst_stride_a % 8 == 0) && - IS_ALIGNED(dst_b, 8) && (dst_stride_b % 8 == 0) ) { + if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON)) { ReverseLine = ReverseLineUV_NEON; } else #endif diff --git a/source/rotate_neon.cc b/source/rotate_neon.cc new file mode 100644 index 000000000..a6496d33b --- /dev/null +++ b/source/rotate_neon.cc @@ -0,0 +1,557 @@ +/* + * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/basic_types.h" + +namespace libyuv { + +#if defined(__ARM_NEON__) && !defined(COVERAGE_ENABLED) + +void ReverseLine_NEON(const uint8* src, uint8* dst, int width) { + asm volatile( + // compute where to start writing destination + "add %1, %2\n" + + // work on segments that are multiples of 16 + "lsrs r3, %2, #4\n" + + // the output is written in two block. 8 bytes followed + // by another 8. reading is done sequentially, from left to + // right. 
writing is done from right to left in block sizes + // %1, the destination pointer is incremented after writing + // the first of the two blocks. need to subtract that 8 off + // along with 16 to get the next location. + "mov r3, #-24\n" + + "beq 2f\n" + + // back of destination by the size of the register that is + // going to be reversed + "sub %1, #16\n" + + // the loop needs to run on blocks of 16. what will be left + // over is either a negative number, the residuals that need + // to be done, or 0. if this isn't subtracted off here the + // loop will run one extra time. + "sub %2, #16\n" + + "1:\n" + "vld1.8 {q0}, [%0]!\n" // src += 16 + + // reverse the bytes in the 64 bit segments. unable to reverse + // the bytes in the entire 128 bits in one go. + "vrev64.8 q0, q0\n" + + // because of the inability to reverse the entire 128 bits + // reverse the writing out of the two 64 bit segments. + "vst1.8 {d1}, [%1]!\n" + "vst1.8 {d0}, [%1], r3\n" // dst -= 16 + + "subs %2, #16\n" + "bge 1b\n" + + // add 16 back to the counter. if the result is 0 there is no + // residuals so jump past + "adds %2, #16\n" + "beq 5f\n" + + "add %1, #16\n" + + "2:\n" + + "mov r3, #-3\n" + + "sub %1, #2\n" + "subs %2, #2\n" + // check for 16*n+1 scenarios where segments_of_2 should not + // be run, but there is something left over. + "blt 4f\n" + +// do this in neon registers as per +// http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/ + "3:\n" + "vld2.8 {d0[0], d1[0]}, [%0]!\n" // src += 2 + + "vst1.8 {d1[0]}, [%1]!\n" + "vst1.8 {d0[0]}, [%1], r3\n" // dst -= 2 + + "subs %2, #2\n" + "bge 3b\n" + + "adds %2, #2\n" + "beq 5f\n" + + "4:\n" + "add %1, #1\n" + "vld1.8 {d0[0]}, [%0]\n" + "vst1.8 {d0[0]}, [%1]\n" + + "5:\n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "r3", "q0" + ); +} + +static const uint8 vtbl_4x4_transpose[16] __attribute__((vector_size(16))) = + { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 }; + +void TransposeWx8_NEON(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width) { + asm volatile( + // loops are on blocks of 8. loop will stop when + // counter gets to or below 0. starting the counter + // at w-8 allow for this + "sub %4, #8\n" + + // handle 8x8 blocks. this should be the majority of the plane + "1:\n" + "mov r9, %0\n" + + "vld1.8 {d0}, [r9], %1\n" + "vld1.8 {d1}, [r9], %1\n" + "vld1.8 {d2}, [r9], %1\n" + "vld1.8 {d3}, [r9], %1\n" + "vld1.8 {d4}, [r9], %1\n" + "vld1.8 {d5}, [r9], %1\n" + "vld1.8 {d6}, [r9], %1\n" + "vld1.8 {d7}, [r9]\n" + + "vtrn.8 d1, d0\n" + "vtrn.8 d3, d2\n" + "vtrn.8 d5, d4\n" + "vtrn.8 d7, d6\n" + + "vtrn.16 d1, d3\n" + "vtrn.16 d0, d2\n" + "vtrn.16 d5, d7\n" + "vtrn.16 d4, d6\n" + + "vtrn.32 d1, d5\n" + "vtrn.32 d0, d4\n" + "vtrn.32 d3, d7\n" + "vtrn.32 d2, d6\n" + + "vrev16.8 q0, q0\n" + "vrev16.8 q1, q1\n" + "vrev16.8 q2, q2\n" + "vrev16.8 q3, q3\n" + + "mov r9, %2\n" + + "vst1.8 {d1}, [r9], %3\n" + "vst1.8 {d0}, [r9], %3\n" + "vst1.8 {d3}, [r9], %3\n" + "vst1.8 {d2}, [r9], %3\n" + "vst1.8 {d5}, [r9], %3\n" + "vst1.8 {d4}, [r9], %3\n" + "vst1.8 {d7}, [r9], %3\n" + "vst1.8 {d6}, [r9]\n" + + "add %0, #8\n" // src += 8 + "add %2, %3, lsl #3\n" // dst += 8 * dst_stride + "subs %4, #8\n" // w -= 8 + "bge 1b\n" + + // add 8 back to counter. if the result is 0 there are + // no residuals. 
+ "adds %4, #8\n" + "beq 4f\n" + + // some residual, so between 1 and 7 lines left to transpose + "cmp %4, #2\n" + "blt 3f\n" + + "cmp %4, #4\n" + "blt 2f\n" + + // 4x8 block + "mov r9, %0\n" + "vld1.32 {d0[0]}, [r9], %1\n" + "vld1.32 {d0[1]}, [r9], %1\n" + "vld1.32 {d1[0]}, [r9], %1\n" + "vld1.32 {d1[1]}, [r9], %1\n" + "vld1.32 {d2[0]}, [r9], %1\n" + "vld1.32 {d2[1]}, [r9], %1\n" + "vld1.32 {d3[0]}, [r9], %1\n" + "vld1.32 {d3[1]}, [r9]\n" + + "mov r9, %2\n" + + "vld1.8 {q3}, [%5]\n" + + "vtbl.8 d4, {d0, d1}, d6\n" + "vtbl.8 d5, {d0, d1}, d7\n" + "vtbl.8 d0, {d2, d3}, d6\n" + "vtbl.8 d1, {d2, d3}, d7\n" + + // TODO: rework shuffle above to write + // out with 4 instead of 8 writes + "vst1.32 {d4[0]}, [r9], %3\n" + "vst1.32 {d4[1]}, [r9], %3\n" + "vst1.32 {d5[0]}, [r9], %3\n" + "vst1.32 {d5[1]}, [r9]\n" + + "add r9, %2, #4\n" + "vst1.32 {d0[0]}, [r9], %3\n" + "vst1.32 {d0[1]}, [r9], %3\n" + "vst1.32 {d1[0]}, [r9], %3\n" + "vst1.32 {d1[1]}, [r9]\n" + + "add %0, #4\n" // src += 4 + "add %2, %3, lsl #2\n" // dst += 4 * dst_stride + "subs %4, #4\n" // w -= 4 + "beq 4f\n" + + // some residual, check to see if it includes a 2x8 block, + // or less + "cmp %4, #2\n" + "blt 3f\n" + + // 2x8 block + "2:\n" + "mov r9, %0\n" + "vld1.16 {d0[0]}, [r9], %1\n" + "vld1.16 {d1[0]}, [r9], %1\n" + "vld1.16 {d0[1]}, [r9], %1\n" + "vld1.16 {d1[1]}, [r9], %1\n" + "vld1.16 {d0[2]}, [r9], %1\n" + "vld1.16 {d1[2]}, [r9], %1\n" + "vld1.16 {d0[3]}, [r9], %1\n" + "vld1.16 {d1[3]}, [r9]\n" + + "vtrn.8 d0, d1\n" + + "mov r9, %2\n" + + "vst1.64 {d0}, [r9], %3\n" + "vst1.64 {d1}, [r9]\n" + + "add %0, #2\n" // src += 2 + "add %2, %3, lsl #1\n" // dst += 2 * dst_stride + "subs %4, #2\n" // w -= 2 + "beq 4f\n" + + // 1x8 block + "3:\n" + "vld1.8 {d0[0]}, [%0], %1\n" + "vld1.8 {d0[1]}, [%0], %1\n" + "vld1.8 {d0[2]}, [%0], %1\n" + "vld1.8 {d0[3]}, [%0], %1\n" + "vld1.8 {d0[4]}, [%0], %1\n" + "vld1.8 {d0[5]}, [%0], %1\n" + "vld1.8 {d0[6]}, [%0], %1\n" + "vld1.8 {d0[7]}, [%0]\n" + + "vst1.64 {d0}, [%2]\n" + + "4:\n" + + : "+r"(src), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_stride), // %3 + "+r"(width) // %4 + : "r"(vtbl_4x4_transpose) // %5 + : "memory", "cc", "r9", "q0", "q1", "q2", "q3" + ); +} + +void ReverseLineUV_NEON(const uint8* src, + uint8* dst_a, uint8* dst_b, + int width) { + asm volatile( + // compute where to start writing destination + "add %1, %3\n" // dst_a + width + "add %2, %3\n" // dst_b + width + + // work on input segments that are multiples of 16, but + // width that has been passed is output segments, half + // the size of input. + "lsrs r12, %3, #3\n" + + "beq 2f\n" + + // the output is written in to two blocks. + "mov r12, #-8\n" + + // back of destination by the size of the register that is + // going to be reversed + "sub %1, #8\n" + "sub %2, #8\n" + + // the loop needs to run on blocks of 8. what will be left + // over is either a negative number, the residuals that need + // to be done, or 0. if this isn't subtracted off here the + // loop will run one extra time. + "sub %3, #8\n" + + "1:\n" + "vld2.8 {d0, d1}, [%0]!\n" // src += 16 + + // reverse the bytes in the 64 bit segments + "vrev64.8 q0, q0\n" + + "vst1.8 {d0}, [%1], r12\n" // dst_a -= 8 + "vst1.8 {d1}, [%2], r12\n" // dst_b -= 8 + + "subs %3, #8\n" + "bge 1b\n" + + // add 8 back to the counter. 
if the result is 0 there is no + // residuals so return + "adds %3, #8\n" + "beq 4f\n" + + "add %1, #8\n" + "add %2, #8\n" + + "2:\n" + + "mov r12, #-1\n" + + "sub %1, #1\n" + "sub %2, #1\n" + + "3:\n" + "vld2.8 {d0[0], d1[0]}, [%0]!\n" // src += 2 + + "vst1.8 {d0[0]}, [%1], r12\n" // dst_a -= 1 + "vst1.8 {d1[0]}, [%2], r12\n" // dst_b -= 1 + + "subs %3, %3, #1\n" + "bgt 3b\n" + "4:\n" + : "+r"(src), // %0 + "+r"(dst_a), // %1 + "+r"(dst_b), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "r12", "q0" + ); +} + +static const uint8 vtbl_4x4_transpose_di[16] __attribute__((vector_size(16))) = + { 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 }; + +void TransposeUVWx8_NEON(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width) { + asm volatile( + // loops are on blocks of 8. loop will stop when + // counter gets to or below 0. starting the counter + // at w-8 allow for this + "sub %6, #8\n" + + // handle 8x8 blocks. this should be the majority of the plane + "1:\n" + "mov r9, %0\n" + + "vld2.8 {d0, d1}, [r9], %1\n" + "vld2.8 {d2, d3}, [r9], %1\n" + "vld2.8 {d4, d5}, [r9], %1\n" + "vld2.8 {d6, d7}, [r9], %1\n" + "vld2.8 {d16, d17}, [r9], %1\n" + "vld2.8 {d18, d19}, [r9], %1\n" + "vld2.8 {d20, d21}, [r9], %1\n" + "vld2.8 {d22, d23}, [r9]\n" + + "vtrn.8 q1, q0\n" + "vtrn.8 q3, q2\n" + "vtrn.8 q9, q8\n" + "vtrn.8 q11, q10\n" + + "vtrn.16 q1, q3\n" + "vtrn.16 q0, q2\n" + "vtrn.16 q9, q11\n" + "vtrn.16 q8, q10\n" + + "vtrn.32 q1, q9\n" + "vtrn.32 q0, q8\n" + "vtrn.32 q3, q11\n" + "vtrn.32 q2, q10\n" + + "vrev16.8 q0, q0\n" + "vrev16.8 q1, q1\n" + "vrev16.8 q2, q2\n" + "vrev16.8 q3, q3\n" + "vrev16.8 q8, q8\n" + "vrev16.8 q9, q9\n" + "vrev16.8 q10, q10\n" + "vrev16.8 q11, q11\n" + + "mov r9, %2\n" + + "vst1.8 {d2}, [r9], %3\n" + "vst1.8 {d0}, [r9], %3\n" + "vst1.8 {d6}, [r9], %3\n" + "vst1.8 {d4}, [r9], %3\n" + "vst1.8 {d18}, [r9], %3\n" + "vst1.8 {d16}, [r9], %3\n" + "vst1.8 {d22}, [r9], %3\n" + "vst1.8 {d20}, [r9]\n" + + "mov r9, %4\n" + + "vst1.8 {d3}, [r9], %5\n" + "vst1.8 {d1}, [r9], %5\n" + "vst1.8 {d7}, [r9], %5\n" + "vst1.8 {d5}, [r9], %5\n" + "vst1.8 {d19}, [r9], %5\n" + "vst1.8 {d17}, [r9], %5\n" + "vst1.8 {d23}, [r9], %5\n" + "vst1.8 {d21}, [r9]\n" + + "add %0, #8*2\n" // src += 8*2 + "add %2, %3, lsl #3\n" // dst_a += 8 * dst_stride_a + "add %4, %5, lsl #3\n" // dst_b += 8 * dst_stride_b + "subs %6, #8\n" // w -= 8 + "bge 1b\n" + + // add 8 back to counter. if the result is 0 there are + // no residuals. 
+ "adds %6, #8\n" + "beq 4f\n" + + // some residual, so between 1 and 7 lines left to transpose + "cmp %6, #2\n" + "blt 3f\n" + + "cmp %6, #4\n" + "blt 2f\n" + + //TODO(frkoenig) : clean this up + // 4x8 block + "mov r9, %0\n" + "vld1.64 {d0}, [r9], %1\n" + "vld1.64 {d1}, [r9], %1\n" + "vld1.64 {d2}, [r9], %1\n" + "vld1.64 {d3}, [r9], %1\n" + "vld1.64 {d4}, [r9], %1\n" + "vld1.64 {d5}, [r9], %1\n" + "vld1.64 {d6}, [r9], %1\n" + "vld1.64 {d7}, [r9]\n" + + "vld1.8 {q15}, [%7]\n" + + "vtrn.8 q0, q1\n" + "vtrn.8 q2, q3\n" + + "vtbl.8 d16, {d0, d1}, d30\n" + "vtbl.8 d17, {d0, d1}, d31\n" + "vtbl.8 d18, {d2, d3}, d30\n" + "vtbl.8 d19, {d2, d3}, d31\n" + "vtbl.8 d20, {d4, d5}, d30\n" + "vtbl.8 d21, {d4, d5}, d31\n" + "vtbl.8 d22, {d6, d7}, d30\n" + "vtbl.8 d23, {d6, d7}, d31\n" + + "mov r9, %2\n" + + "vst1.32 {d16[0]}, [r9], %3\n" + "vst1.32 {d16[1]}, [r9], %3\n" + "vst1.32 {d17[0]}, [r9], %3\n" + "vst1.32 {d17[1]}, [r9], %3\n" + + "add r9, %2, #4\n" + "vst1.32 {d20[0]}, [r9], %3\n" + "vst1.32 {d20[1]}, [r9], %3\n" + "vst1.32 {d21[0]}, [r9], %3\n" + "vst1.32 {d21[1]}, [r9]\n" + + "mov r9, %4\n" + + "vst1.32 {d18[0]}, [r9], %5\n" + "vst1.32 {d18[1]}, [r9], %5\n" + "vst1.32 {d19[0]}, [r9], %5\n" + "vst1.32 {d19[1]}, [r9], %5\n" + + "add r9, %4, #4\n" + "vst1.32 {d22[0]}, [r9], %5\n" + "vst1.32 {d22[1]}, [r9], %5\n" + "vst1.32 {d23[0]}, [r9], %5\n" + "vst1.32 {d23[1]}, [r9]\n" + + "add %0, #4*2\n" // src += 4 * 2 + "add %2, %3, lsl #2\n" // dst_a += 4 * dst_stride_a + "add %4, %5, lsl #2\n" // dst_b += 4 * dst_stride_b + "subs %6, #4\n" // w -= 4 + "beq 4f\n" + + // some residual, check to see if it includes a 2x8 block, + // or less + "cmp %6, #2\n" + "blt 3f\n" + + // 2x8 block + "2:\n" + "mov r9, %0\n" + "vld2.16 {d0[0], d2[0]}, [r9], %1\n" + "vld2.16 {d1[0], d3[0]}, [r9], %1\n" + "vld2.16 {d0[1], d2[1]}, [r9], %1\n" + "vld2.16 {d1[1], d3[1]}, [r9], %1\n" + "vld2.16 {d0[2], d2[2]}, [r9], %1\n" + "vld2.16 {d1[2], d3[2]}, [r9], %1\n" + "vld2.16 {d0[3], d2[3]}, [r9], %1\n" + "vld2.16 {d1[3], d3[3]}, [r9]\n" + + "vtrn.8 d0, d1\n" + "vtrn.8 d2, d3\n" + + "mov r9, %2\n" + + "vst1.64 {d0}, [r9], %3\n" + "vst1.64 {d2}, [r9]\n" + + "mov r9, %4\n" + + "vst1.64 {d1}, [r9], %5\n" + "vst1.64 {d3}, [r9]\n" + + "add %0, #2*2\n" // src += 2 * 2 + "add %2, %3, lsl #1\n" // dst_a += 2 * dst_stride_a + "add %4, %5, lsl #1\n" // dst_b += 2 * dst_stride_b + "subs %6, #2\n" // w -= 2 + "beq 4f\n" + + // 1x8 block + "3:\n" + "vld2.8 {d0[0], d1[0]}, [%0], %1\n" + "vld2.8 {d0[1], d1[1]}, [%0], %1\n" + "vld2.8 {d0[2], d1[2]}, [%0], %1\n" + "vld2.8 {d0[3], d1[3]}, [%0], %1\n" + "vld2.8 {d0[4], d1[4]}, [%0], %1\n" + "vld2.8 {d0[5], d1[5]}, [%0], %1\n" + "vld2.8 {d0[6], d1[6]}, [%0], %1\n" + "vld2.8 {d0[7], d1[7]}, [%0]\n" + + "vst1.64 {d0}, [%2]\n" + "vst1.64 {d1}, [%4]\n" + + "4:\n" + + : "+r"(src), // %0 + "+r"(src_stride), // %1 + "+r"(dst_a), // %2 + "+r"(dst_stride_a), // %3 + "+r"(dst_b), // %4 + "+r"(dst_stride_b), // %5 + "+r"(width) // %6 + : "r"(vtbl_4x4_transpose_di)// %7 + : "memory", "cc", "r9", + "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11" + ); +} +#endif +} diff --git a/source/rotate_neon.s b/source/rotate_neon.s deleted file mode 100644 index 2d71c8937..000000000 --- a/source/rotate_neon.s +++ /dev/null @@ -1,545 +0,0 @@ - .global ReverseLine_NEON - .global ReverseLineUV_NEON - .global TransposeWx8_NEON - .global TransposeUVWx8_NEON - .type ReverseLine_NEON, function - .type ReverseLineUV_NEON, function - .type TransposeWx8_NEON, function - .type TransposeUVWx8_NEON, function - -@ void ReverseLine_NEON 
(const uint8* src, uint8* dst, int width) -@ r0 const uint8* src -@ r1 uint8* dst -@ r2 width -ReverseLine_NEON: - - @ compute where to start writing destination - add r1, r2 @ dst + width - - @ work on segments that are multiples of 16 - lsrs r3, r2, #4 - - @ the output is written in two block. 8 bytes followed - @ by another 8. reading is done sequentially, from left to - @ right. writing is done from right to left in block sizes - @ r1, the destination pointer is incremented after writing - @ the first of the two blocks. need to subtract that 8 off - @ along with 16 to get the next location. - mov r3, #-24 - - beq Lline_residuals - - @ back of destination by the size of the register that is - @ going to be reversed - sub r1, #16 - - @ the loop needs to run on blocks of 16. what will be left - @ over is either a negative number, the residuals that need - @ to be done, or 0. if this isn't subtracted off here the - @ loop will run one extra time. - sub r2, #16 - -Lsegments_of_16: - vld1.8 {q0}, [r0]! @ src += 16 - - @ reverse the bytes in the 64 bit segments. unable to reverse - @ the bytes in the entire 128 bits in one go. - vrev64.8 q0, q0 - - @ because of the inability to reverse the entire 128 bits - @ reverse the writing out of the two 64 bit segments. - vst1.8 {d1}, [r1]! - vst1.8 {d0}, [r1], r3 @ dst -= 16 - - subs r2, #16 - bge Lsegments_of_16 - - @ add 16 back to the counter. if the result is 0 there is no - @ residuals so return - adds r2, #16 - bxeq lr - - add r1, #16 - -Lline_residuals: - - mov r3, #-3 - - sub r1, #2 - subs r2, #2 - @ check for 16*n+1 scenarios where segments_of_2 should not - @ be run, but there is something left over. - blt Lsegment_of_1 - -@ do this in neon registers as per -@ http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/ -Lsegments_of_2: - vld2.8 {d0[0], d1[0]}, [r0]! @ src += 2 - - vst1.8 {d1[0]}, [r1]! - vst1.8 {d0[0]}, [r1], r3 @ dst -= 2 - - subs r2, #2 - bge Lsegments_of_2 - - adds r2, #2 - bxeq lr - -Lsegment_of_1: - add r1, #1 - vld1.8 {d0[0]}, [r0] - vst1.8 {d0[0]}, [r1] - - bx lr - -@ void TransposeWx8_NEON (const uint8* src, int src_stride, -@ uint8* dst, int dst_stride, -@ int w) -@ r0 const uint8* src -@ r1 int src_stride -@ r2 uint8* dst -@ r3 int dst_stride -@ stack int w -TransposeWx8_NEON: - push {r4,r8,r9,lr} - - ldr r8, [sp, #16] @ width - - @ loops are on blocks of 8. loop will stop when - @ counter gets to or below 0. starting the counter - @ at w-8 allow for this - sub r8, #8 - -@ handle 8x8 blocks. this should be the majority of the plane -Lloop_8x8: - mov r9, r0 - - vld1.8 {d0}, [r9], r1 - vld1.8 {d1}, [r9], r1 - vld1.8 {d2}, [r9], r1 - vld1.8 {d3}, [r9], r1 - vld1.8 {d4}, [r9], r1 - vld1.8 {d5}, [r9], r1 - vld1.8 {d6}, [r9], r1 - vld1.8 {d7}, [r9] - - vtrn.8 d1, d0 - vtrn.8 d3, d2 - vtrn.8 d5, d4 - vtrn.8 d7, d6 - - vtrn.16 d1, d3 - vtrn.16 d0, d2 - vtrn.16 d5, d7 - vtrn.16 d4, d6 - - vtrn.32 d1, d5 - vtrn.32 d0, d4 - vtrn.32 d3, d7 - vtrn.32 d2, d6 - - vrev16.8 q0, q0 - vrev16.8 q1, q1 - vrev16.8 q2, q2 - vrev16.8 q3, q3 - - mov r9, r2 - - vst1.8 {d1}, [r9], r3 - vst1.8 {d0}, [r9], r3 - vst1.8 {d3}, [r9], r3 - vst1.8 {d2}, [r9], r3 - vst1.8 {d5}, [r9], r3 - vst1.8 {d4}, [r9], r3 - vst1.8 {d7}, [r9], r3 - vst1.8 {d6}, [r9] - - add r0, #8 @ src += 8 - add r2, r3, lsl #3 @ dst += 8 * dst_stride - subs r8, #8 @ w -= 8 - bge Lloop_8x8 - - @ add 8 back to counter. if the result is 0 there are - @ no residuals. 
- adds r8, #8 - beq Ldone - - @ some residual, so between 1 and 7 lines left to transpose - cmp r8, #2 - blt Lblock_1x8 - - cmp r8, #4 - blt Lblock_2x8 - -Lblock_4x8: - mov r9, r0 - vld1.32 {d0[0]}, [r9], r1 - vld1.32 {d0[1]}, [r9], r1 - vld1.32 {d1[0]}, [r9], r1 - vld1.32 {d1[1]}, [r9], r1 - vld1.32 {d2[0]}, [r9], r1 - vld1.32 {d2[1]}, [r9], r1 - vld1.32 {d3[0]}, [r9], r1 - vld1.32 {d3[1]}, [r9] - - mov r9, r2 - - adr r12, vtbl_4x4_transpose - vld1.8 {q3}, [r12] - - vtbl.8 d4, {d0, d1}, d6 - vtbl.8 d5, {d0, d1}, d7 - vtbl.8 d0, {d2, d3}, d6 - vtbl.8 d1, {d2, d3}, d7 - - @ TODO: rework shuffle above to write - @ out with 4 instead of 8 writes - vst1.32 {d4[0]}, [r9], r3 - vst1.32 {d4[1]}, [r9], r3 - vst1.32 {d5[0]}, [r9], r3 - vst1.32 {d5[1]}, [r9] - - add r9, r2, #4 - vst1.32 {d0[0]}, [r9], r3 - vst1.32 {d0[1]}, [r9], r3 - vst1.32 {d1[0]}, [r9], r3 - vst1.32 {d1[1]}, [r9] - - add r0, #4 @ src += 4 - add r2, r3, lsl #2 @ dst += 4 * dst_stride - subs r8, #4 @ w -= 4 - beq Ldone - - @ some residual, check to see if it includes a 2x8 block, - @ or less - cmp r8, #2 - blt Lblock_1x8 - -Lblock_2x8: - mov r9, r0 - vld1.16 {d0[0]}, [r9], r1 - vld1.16 {d1[0]}, [r9], r1 - vld1.16 {d0[1]}, [r9], r1 - vld1.16 {d1[1]}, [r9], r1 - vld1.16 {d0[2]}, [r9], r1 - vld1.16 {d1[2]}, [r9], r1 - vld1.16 {d0[3]}, [r9], r1 - vld1.16 {d1[3]}, [r9] - - vtrn.8 d0, d1 - - mov r9, r2 - - vst1.64 {d0}, [r9], r3 - vst1.64 {d1}, [r9] - - add r0, #2 @ src += 2 - add r2, r3, lsl #1 @ dst += 2 * dst_stride - subs r8, #2 @ w -= 2 - beq Ldone - -Lblock_1x8: - vld1.8 {d0[0]}, [r0], r1 - vld1.8 {d0[1]}, [r0], r1 - vld1.8 {d0[2]}, [r0], r1 - vld1.8 {d0[3]}, [r0], r1 - vld1.8 {d0[4]}, [r0], r1 - vld1.8 {d0[5]}, [r0], r1 - vld1.8 {d0[6]}, [r0], r1 - vld1.8 {d0[7]}, [r0] - - vst1.64 {d0}, [r2] - -Ldone: - - pop {r4,r8,r9,pc} - -vtbl_4x4_transpose: - .byte 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 - -@ void ReverseLineUV_NEON (const uint8* src, -@ uint8* dst_a, -@ uint8* dst_b, -@ int width) -@ r0 const uint8* src -@ r1 uint8* dst_a -@ r2 uint8* dst_b -@ r3 width -ReverseLineUV_NEON: - - @ compute where to start writing destination - add r1, r1, r3 @ dst_a + width - add r2, r2, r3 @ dst_b + width - - @ work on input segments that are multiples of 16, but - @ width that has been passed is output segments, half - @ the size of input. - lsrs r12, r3, #3 - - beq Lline_residuals_di - - @ the output is written in to two blocks. - mov r12, #-8 - - @ back of destination by the size of the register that is - @ going to be reversed - sub r1, r1, #8 - sub r2, r2, #8 - - @ the loop needs to run on blocks of 8. what will be left - @ over is either a negative number, the residuals that need - @ to be done, or 0. if this isn't subtracted off here the - @ loop will run one extra time. - sub r3, r3, #8 - -Lsegments_of_8_di: - vld2.8 {d0, d1}, [r0]! @ src += 16 - - @ reverse the bytes in the 64 bit segments - vrev64.8 q0, q0 - - vst1.8 {d0}, [r1], r12 @ dst_a -= 8 - vst1.8 {d1}, [r2], r12 @ dst_b -= 8 - - subs r3, r3, #8 - bge Lsegments_of_8_di - - @ add 8 back to the counter. if the result is 0 there is no - @ residuals so return - adds r3, r3, #8 - bxeq lr - - add r1, r1, #8 - add r2, r2, #8 - -Lline_residuals_di: - - mov r12, #-1 - - sub r1, r1, #1 - sub r2, r2, #1 - -@ do this in neon registers as per -@ http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/ -Lsegments_of_1: - vld2.8 {d0[0], d1[0]}, [r0]! 
@ src += 2 - - vst1.8 {d0[0]}, [r1], r12 @ dst_a -= 1 - vst1.8 {d1[0]}, [r2], r12 @ dst_b -= 1 - - subs r3, r3, #1 - bgt Lsegments_of_1 - - bx lr - -@ void TransposeUVWx8_NEON (const uint8* src, int src_stride, -@ uint8* dst_a, int dst_stride_a, -@ uint8* dst_b, int dst_stride_b, -@ int width) -@ r0 const uint8* src -@ r1 int src_stride -@ r2 uint8* dst_a -@ r3 int dst_stride_a -@ stack uint8* dst_b -@ stack int dst_stride_b -@ stack int width -TransposeUVWx8_NEON: - push {r4-r9,lr} - - ldr r4, [sp, #28] @ dst_b - ldr r5, [sp, #32] @ dst_stride_b - ldr r8, [sp, #36] @ width - @ loops are on blocks of 8. loop will stop when - @ counter gets to or below 0. starting the counter - @ at w-8 allow for this - sub r8, #8 - -@ handle 8x8 blocks. this should be the majority of the plane -Lloop_8x8_di: - mov r9, r0 - - vld2.8 {d0, d1}, [r9], r1 - vld2.8 {d2, d3}, [r9], r1 - vld2.8 {d4, d5}, [r9], r1 - vld2.8 {d6, d7}, [r9], r1 - vld2.8 {d16, d17}, [r9], r1 - vld2.8 {d18, d19}, [r9], r1 - vld2.8 {d20, d21}, [r9], r1 - vld2.8 {d22, d23}, [r9] - - vtrn.8 q1, q0 - vtrn.8 q3, q2 - vtrn.8 q9, q8 - vtrn.8 q11, q10 - - vtrn.16 q1, q3 - vtrn.16 q0, q2 - vtrn.16 q9, q11 - vtrn.16 q8, q10 - - vtrn.32 q1, q9 - vtrn.32 q0, q8 - vtrn.32 q3, q11 - vtrn.32 q2, q10 - - vrev16.8 q0, q0 - vrev16.8 q1, q1 - vrev16.8 q2, q2 - vrev16.8 q3, q3 - vrev16.8 q8, q8 - vrev16.8 q9, q9 - vrev16.8 q10, q10 - vrev16.8 q11, q11 - - mov r9, r2 - - vst1.8 {d2}, [r9], r3 - vst1.8 {d0}, [r9], r3 - vst1.8 {d6}, [r9], r3 - vst1.8 {d4}, [r9], r3 - vst1.8 {d18}, [r9], r3 - vst1.8 {d16}, [r9], r3 - vst1.8 {d22}, [r9], r3 - vst1.8 {d20}, [r9] - - mov r9, r4 - - vst1.8 {d3}, [r9], r5 - vst1.8 {d1}, [r9], r5 - vst1.8 {d7}, [r9], r5 - vst1.8 {d5}, [r9], r5 - vst1.8 {d19}, [r9], r5 - vst1.8 {d17}, [r9], r5 - vst1.8 {d23}, [r9], r5 - vst1.8 {d21}, [r9] - - add r0, #8*2 @ src += 8*2 - add r2, r3, lsl #3 @ dst_a += 8 * dst_stride_a - add r4, r5, lsl #3 @ dst_b += 8 * dst_stride_b - subs r8, #8 @ w -= 8 - bge Lloop_8x8_di - - @ add 8 back to counter. if the result is 0 there are - @ no residuals. 
- adds r8, #8 - beq Ldone_di - - @ some residual, so between 1 and 7 lines left to transpose - cmp r8, #2 - blt Lblock_1x8_di - - cmp r8, #4 - blt Lblock_2x8_di - -@ TODO(frkoenig) : clean this up -Lblock_4x8_di: - mov r9, r0 - vld1.64 {d0}, [r9], r1 - vld1.64 {d1}, [r9], r1 - vld1.64 {d2}, [r9], r1 - vld1.64 {d3}, [r9], r1 - vld1.64 {d4}, [r9], r1 - vld1.64 {d5}, [r9], r1 - vld1.64 {d6}, [r9], r1 - vld1.64 {d7}, [r9] - - adr r12, vtbl_4x4_transpose_di - vld1.8 {q15}, [r12] - - vtrn.8 q0, q1 - vtrn.8 q2, q3 - - vtbl.8 d16, {d0, d1}, d30 - vtbl.8 d17, {d0, d1}, d31 - vtbl.8 d18, {d2, d3}, d30 - vtbl.8 d19, {d2, d3}, d31 - vtbl.8 d20, {d4, d5}, d30 - vtbl.8 d21, {d4, d5}, d31 - vtbl.8 d22, {d6, d7}, d30 - vtbl.8 d23, {d6, d7}, d31 - - mov r9, r2 - - vst1.32 {d16[0]}, [r9], r3 - vst1.32 {d16[1]}, [r9], r3 - vst1.32 {d17[0]}, [r9], r3 - vst1.32 {d17[1]}, [r9], r3 - - add r9, r2, #4 - vst1.32 {d20[0]}, [r9], r3 - vst1.32 {d20[1]}, [r9], r3 - vst1.32 {d21[0]}, [r9], r3 - vst1.32 {d21[1]}, [r9] - - mov r9, r4 - - vst1.32 {d18[0]}, [r9], r5 - vst1.32 {d18[1]}, [r9], r5 - vst1.32 {d19[0]}, [r9], r5 - vst1.32 {d19[1]}, [r9], r5 - - add r9, r4, #4 - vst1.32 {d22[0]}, [r9], r5 - vst1.32 {d22[1]}, [r9], r5 - vst1.32 {d23[0]}, [r9], r5 - vst1.32 {d23[1]}, [r9] - - add r0, #4*2 @ src += 4 * 2 - add r2, r3, lsl #2 @ dst_a += 4 * dst_stride_a - add r4, r5, lsl #2 @ dst_b += 4 * dst_stride_b - subs r8, #4 @ w -= 4 - beq Ldone_di - - @ some residual, check to see if it includes a 2x8 block, - @ or less - cmp r8, #2 - blt Lblock_1x8_di - -Lblock_2x8_di: - mov r9, r0 - vld2.16 {d0[0], d2[0]}, [r9], r1 - vld2.16 {d1[0], d3[0]}, [r9], r1 - vld2.16 {d0[1], d2[1]}, [r9], r1 - vld2.16 {d1[1], d3[1]}, [r9], r1 - vld2.16 {d0[2], d2[2]}, [r9], r1 - vld2.16 {d1[2], d3[2]}, [r9], r1 - vld2.16 {d0[3], d2[3]}, [r9], r1 - vld2.16 {d1[3], d3[3]}, [r9] - - vtrn.8 d0, d1 - vtrn.8 d2, d3 - - mov r9, r2 - - vst1.64 {d0}, [r9], r3 - vst1.64 {d2}, [r9] - - mov r9, r4 - - vst1.64 {d1}, [r9], r5 - vst1.64 {d3}, [r9] - - add r0, #2*2 @ src += 2 * 2 - add r2, r3, lsl #1 @ dst_a += 2 * dst_stride_a - add r4, r5, lsl #1 @ dst_a += 2 * dst_stride_a - subs r8, #2 @ w -= 2 - beq Ldone_di - -Lblock_1x8_di: - vld2.8 {d0[0], d1[0]}, [r0], r1 - vld2.8 {d0[1], d1[1]}, [r0], r1 - vld2.8 {d0[2], d1[2]}, [r0], r1 - vld2.8 {d0[3], d1[3]}, [r0], r1 - vld2.8 {d0[4], d1[4]}, [r0], r1 - vld2.8 {d0[5], d1[5]}, [r0], r1 - vld2.8 {d0[6], d1[6]}, [r0], r1 - vld2.8 {d0[7], d1[7]}, [r0] - - vst1.64 {d0}, [r2] - vst1.64 {d1}, [r4] - -Ldone_di: - pop {r4-r9, pc} - -vtbl_4x4_transpose_di: - .byte 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 diff --git a/unit_test/rotate_test.cc b/unit_test/rotate_test.cc index 1c295b086..784b7d249 100644 --- a/unit_test/rotate_test.cc +++ b/unit_test/rotate_test.cc @@ -34,16 +34,12 @@ TEST_F(libyuvTest, Transpose) { for (iw = 8; iw < _rotate_max_w && !err; ++iw) for (ih = 8; ih < _rotate_max_h && !err; ++ih) { int i; - uint8 *input; - uint8 *output_1; - uint8 *output_2; - ow = ih; oh = iw; - input = static_cast(calloc(iw * ih, sizeof(uint8))); - output_1 = static_cast(calloc(ow * oh, sizeof(uint8))); - output_2 = static_cast(calloc(iw * ih, sizeof(uint8))); + align_buffer_16(input, iw * ih) + align_buffer_16(output_1, ow * oh) + align_buffer_16(output_2, iw * ih) for (i = 0; i < (iw * ih); ++i) input[i] = i; @@ -67,9 +63,9 @@ TEST_F(libyuvTest, Transpose) { print_array(output_2, iw, ih); } - free(input); - free(output_1); - free(output_2); + free_aligned_buffer_16(input) + free_aligned_buffer_16(output_1) + 
free_aligned_buffer_16(output_2) } EXPECT_EQ(0, err); @@ -82,18 +78,15 @@ TEST_F(libyuvTest, TransposeUV) { for (iw = 16; iw < _rotate_max_w && !err; iw += 2) for (ih = 8; ih < _rotate_max_h && !err; ++ih) { int i; - uint8 *input; - uint8 *output_a1, *output_b1; - uint8 *output_a2, *output_b2; ow = ih; oh = iw >> 1; - input = static_cast(calloc(iw * ih, sizeof(uint8))); - output_a1 = static_cast(calloc(ow * oh, sizeof(uint8))); - output_b1 = static_cast(calloc(ow * oh, sizeof(uint8))); - output_a2 = static_cast(calloc(iw * ih, sizeof(uint8))); - output_b2 = static_cast(calloc(iw * ih, sizeof(uint8))); + align_buffer_16(input, iw * ih) + align_buffer_16(output_a1, ow * oh) + align_buffer_16(output_b1, ow * oh) + align_buffer_16(output_a2, iw * ih) + align_buffer_16(output_b2, iw * ih) for (i = 0; i < (iw * ih); i += 2) { input[i] = i >> 1; @@ -125,11 +118,11 @@ TEST_F(libyuvTest, TransposeUV) { print_array(output_b2, oh, ow); } - free(input); - free(output_a1); - free(output_b1); - free(output_a2); - free(output_b2); + free_aligned_buffer_16(input) + free_aligned_buffer_16(output_a1) + free_aligned_buffer_16(output_b1) + free_aligned_buffer_16(output_a2) + free_aligned_buffer_16(output_b2) } EXPECT_EQ(0, err); @@ -142,20 +135,15 @@ TEST_F(libyuvTest, RotatePlane90) { for (iw = 8; iw < _rotate_max_w && !err; ++iw) for (ih = 8; ih < _rotate_max_h && !err; ++ih) { int i; - uint8 *input; - uint8 *output_0; - uint8 *output_90; - uint8 *output_180; - uint8 *output_270; ow = ih; oh = iw; - input = static_cast(calloc(iw * ih, sizeof(uint8))); - output_0 = static_cast(calloc(iw * ih, sizeof(uint8))); - output_90 = static_cast(calloc(ow * oh, sizeof(uint8))); - output_180 = static_cast(calloc(iw * ih, sizeof(uint8))); - output_270 = static_cast(calloc(ow * oh, sizeof(uint8))); + align_buffer_16(input, iw * ih) + align_buffer_16(output_0, iw * ih) + align_buffer_16(output_90, ow * oh) + align_buffer_16(output_180, iw * ih) + align_buffer_16(output_270, ow * oh) for (i = 0; i < (iw * ih); ++i) input[i] = i; @@ -187,11 +175,11 @@ TEST_F(libyuvTest, RotatePlane90) { print_array(output_0, iw, ih); } - free(input); - free(output_0); - free(output_90); - free(output_180); - free(output_270); + free_aligned_buffer_16(input) + free_aligned_buffer_16(output_0) + free_aligned_buffer_16(output_90) + free_aligned_buffer_16(output_180) + free_aligned_buffer_16(output_270) } EXPECT_EQ(0, err); @@ -204,24 +192,17 @@ TEST_F(libyuvTest, RotateUV90) { for (iw = 16; iw < _rotate_max_w && !err; iw += 2) for (ih = 8; ih < _rotate_max_h && !err; ++ih) { int i; - uint8 *input; - uint8 *output_0_u; - uint8 *output_0_v; - uint8 *output_90_u; - uint8 *output_90_v; - uint8 *output_180_u; - uint8 *output_180_v; ow = ih; oh = iw >> 1; - input = static_cast(calloc(iw * ih, sizeof(uint8))); - output_0_u = static_cast(calloc(ow * oh, sizeof(uint8))); - output_0_v = static_cast(calloc(ow * oh, sizeof(uint8))); - output_90_u = static_cast(calloc(ow * oh, sizeof(uint8))); - output_90_v = static_cast(calloc(ow * oh, sizeof(uint8))); - output_180_u = static_cast(calloc(ow * oh, sizeof(uint8))); - output_180_v = static_cast(calloc(ow * oh, sizeof(uint8))); + align_buffer_16(input, iw * ih) + align_buffer_16(output_0_u, ow * oh) + align_buffer_16(output_0_v, ow * oh) + align_buffer_16(output_90_u, ow * oh) + align_buffer_16(output_90_v, ow * oh) + align_buffer_16(output_180_u, ow * oh) + align_buffer_16(output_180_v, ow * oh) for (i = 0; i < (iw * ih); i += 2) { input[i] = i >> 1; @@ -266,13 +247,13 @@ TEST_F(libyuvTest, RotateUV90) { 
print_array(output_0_v, oh, ow); } - free(input); - free(output_0_u); - free(output_0_v); - free(output_90_u); - free(output_90_v); - free(output_180_u); - free(output_180_v); + free_aligned_buffer_16(input) + free_aligned_buffer_16(output_0_u) + free_aligned_buffer_16(output_0_v) + free_aligned_buffer_16(output_90_u) + free_aligned_buffer_16(output_90_v) + free_aligned_buffer_16(output_180_u) + free_aligned_buffer_16(output_180_v) } EXPECT_EQ(0, err); @@ -285,24 +266,17 @@ TEST_F(libyuvTest, RotateUV180) { for (iw = 16; iw < _rotate_max_w && !err; iw += 2) for (ih = 8; ih < _rotate_max_h && !err; ++ih) { int i; - uint8 *input; - uint8 *output_0_u; - uint8 *output_0_v; - uint8 *output_90_u; - uint8 *output_90_v; - uint8 *output_180_u; - uint8 *output_180_v; ow = iw >> 1; oh = ih; - input = static_cast(calloc(iw * ih, sizeof(uint8))); - output_0_u = static_cast(calloc(ow * oh, sizeof(uint8))); - output_0_v = static_cast(calloc(ow * oh, sizeof(uint8))); - output_90_u = static_cast(calloc(ow * oh, sizeof(uint8))); - output_90_v = static_cast(calloc(ow * oh, sizeof(uint8))); - output_180_u = static_cast(calloc(ow * oh, sizeof(uint8))); - output_180_v = static_cast(calloc(ow * oh, sizeof(uint8))); + align_buffer_16(input, iw * ih) + align_buffer_16(output_0_u, ow * oh) + align_buffer_16(output_0_v, ow * oh) + align_buffer_16(output_90_u, ow * oh) + align_buffer_16(output_90_v, ow * oh) + align_buffer_16(output_180_u, ow * oh) + align_buffer_16(output_180_v, ow * oh) for (i = 0; i < (iw * ih); i += 2) { input[i] = i >> 1; @@ -347,13 +321,13 @@ TEST_F(libyuvTest, RotateUV180) { print_array(output_0_v, ow, oh); } - free(input); - free(output_0_u); - free(output_0_v); - free(output_90_u); - free(output_90_v); - free(output_180_u); - free(output_180_v); + free_aligned_buffer_16(input) + free_aligned_buffer_16(output_0_u) + free_aligned_buffer_16(output_0_v) + free_aligned_buffer_16(output_90_u) + free_aligned_buffer_16(output_90_v) + free_aligned_buffer_16(output_180_u) + free_aligned_buffer_16(output_180_v) } EXPECT_EQ(0, err); @@ -366,24 +340,17 @@ TEST_F(libyuvTest, RotateUV270) { for (iw = 16; iw < _rotate_max_w && !err; iw += 2) for (ih = 8; ih < _rotate_max_h && !err; ++ih) { int i; - uint8 *input; - uint8 *output_0_u; - uint8 *output_0_v; - uint8 *output_270_u; - uint8 *output_270_v; - uint8 *output_180_u; - uint8 *output_180_v; ow = ih; oh = iw >> 1; - input = static_cast(calloc(iw * ih, sizeof(uint8))); - output_0_u = static_cast(calloc(ow * oh, sizeof(uint8))); - output_0_v = static_cast(calloc(ow * oh, sizeof(uint8))); - output_270_u = static_cast(calloc(ow * oh, sizeof(uint8))); - output_270_v = static_cast(calloc(ow * oh, sizeof(uint8))); - output_180_u = static_cast(calloc(ow * oh, sizeof(uint8))); - output_180_v = static_cast(calloc(ow * oh, sizeof(uint8))); + align_buffer_16(input, iw * ih) + align_buffer_16(output_0_u, ow * oh) + align_buffer_16(output_0_v, ow * oh) + align_buffer_16(output_270_u, ow * oh) + align_buffer_16(output_270_v, ow * oh) + align_buffer_16(output_180_u, ow * oh) + align_buffer_16(output_180_v, ow * oh) for (i = 0; i < (iw * ih); i += 2) { input[i] = i >> 1; @@ -429,13 +396,13 @@ TEST_F(libyuvTest, RotateUV270) { print_array(output_0_v, oh, ow); } - free(input); - free(output_0_u); - free(output_0_v); - free(output_270_u); - free(output_270_v); - free(output_180_u); - free(output_180_v); + free_aligned_buffer_16(input) + free_aligned_buffer_16(output_0_u) + free_aligned_buffer_16(output_0_v) + free_aligned_buffer_16(output_270_u) + 
free_aligned_buffer_16(output_270_v) + free_aligned_buffer_16(output_180_u) + free_aligned_buffer_16(output_180_v) } EXPECT_EQ(0, err); @@ -448,16 +415,13 @@ TEST_F(libyuvTest, RotatePlane180) { for (iw = 8; iw < _rotate_max_w && !err; ++iw) for (ih = 8; ih < _rotate_max_h && !err; ++ih) { int i; - uint8 *input; - uint8 *output_0; - uint8 *output_180; ow = iw; oh = ih; - input = static_cast(calloc(iw * ih, sizeof(uint8))); - output_0 = static_cast(calloc(iw * ih, sizeof(uint8))); - output_180 = static_cast(calloc(iw * ih, sizeof(uint8))); + align_buffer_16(input, iw * ih) + align_buffer_16(output_0, iw * ih) + align_buffer_16(output_180, iw * ih) for (i = 0; i < (iw * ih); ++i) input[i] = i; @@ -481,9 +445,9 @@ TEST_F(libyuvTest, RotatePlane180) { print_array(output_0, iw, ih); } - free(input); - free(output_0); - free(output_180); + free_aligned_buffer_16(input) + free_aligned_buffer_16(output_0) + free_aligned_buffer_16(output_180) } EXPECT_EQ(0, err); @@ -496,20 +460,15 @@ TEST_F(libyuvTest, RotatePlane270) { for (iw = 8; iw < _rotate_max_w && !err; ++iw) for (ih = 8; ih < _rotate_max_h && !err; ++ih) { int i; - uint8 *input; - uint8 *output_0; - uint8 *output_90; - uint8 *output_180; - uint8 *output_270; ow = ih; oh = iw; - input = static_cast(calloc(iw * ih, sizeof(uint8))); - output_0 = static_cast(calloc(iw * ih, sizeof(uint8))); - output_90 = static_cast(calloc(ow * oh, sizeof(uint8))); - output_180 = static_cast(calloc(iw * ih, sizeof(uint8))); - output_270 = static_cast(calloc(ow * oh, sizeof(uint8))); + align_buffer_16(input, iw * ih) + align_buffer_16(output_0, iw * ih) + align_buffer_16(output_90, ow * oh) + align_buffer_16(output_180, iw * ih) + align_buffer_16(output_270, ow * oh) for (i = 0; i < (iw * ih); ++i) input[i] = i; @@ -541,11 +500,11 @@ TEST_F(libyuvTest, RotatePlane270) { print_array(output_0, iw, ih); } - free(input); - free(output_0); - free(output_90); - free(output_180); - free(output_270); + free_aligned_buffer_16(input) + free_aligned_buffer_16(output_0) + free_aligned_buffer_16(output_90) + free_aligned_buffer_16(output_180) + free_aligned_buffer_16(output_270) } EXPECT_EQ(0, err); @@ -558,15 +517,13 @@ TEST_F(libyuvTest, RotatePlane90and270) { for (iw = 16; iw < _rotate_max_w && !err; iw += 4) for (ih = 16; ih < _rotate_max_h && !err; ih += 4) { int i; - uint8 *input; - uint8 *output_0; - uint8 *output_90; + ow = ih; oh = iw; - input = static_cast(calloc(iw * ih, sizeof(uint8))); - output_0 = static_cast(calloc(iw * ih, sizeof(uint8))); - output_90 = static_cast(calloc(ow * oh, sizeof(uint8))); + align_buffer_16(input, iw * ih) + align_buffer_16(output_0, iw * ih) + align_buffer_16(output_90, ow * oh) for (i = 0; i < (iw * ih); ++i) input[i] = i; @@ -590,9 +547,9 @@ TEST_F(libyuvTest, RotatePlane90and270) { print_array(output_0, iw, ih); } - free(input); - free(output_0); - free(output_90); + free_aligned_buffer_16(input) + free_aligned_buffer_16(output_0) + free_aligned_buffer_16(output_90) } EXPECT_EQ(0, err); @@ -605,15 +562,13 @@ TEST_F(libyuvTest, RotatePlane90Pitch) { for (iw = 16; iw < _rotate_max_w && !err; iw += 4) for (ih = 16; ih < _rotate_max_h && !err; ih += 4) { int i; - uint8 *input; - uint8 *output_0; - uint8 *output_90; + int ow = ih; int oh = iw; - input = static_cast(calloc(iw * ih, sizeof(uint8))); - output_0 = static_cast(calloc(iw * ih, sizeof(uint8))); - output_90 = static_cast(calloc(ow * oh, sizeof(uint8))); + align_buffer_16(input, iw * ih) + align_buffer_16(output_0, iw * ih) + align_buffer_16(output_90, ow * oh) for (i = 0; i < 
(iw * ih); ++i) input[i] = i; @@ -649,9 +604,9 @@ TEST_F(libyuvTest, RotatePlane90Pitch) { print_array(output_0, iw, ih); } - free(input); - free(output_0); - free(output_90); + free_aligned_buffer_16(input) + free_aligned_buffer_16(output_0) + free_aligned_buffer_16(output_90) } EXPECT_EQ(0, err); @@ -664,16 +619,13 @@ TEST_F(libyuvTest, RotatePlane270Pitch) { for (iw = 16; iw < _rotate_max_w && !err; iw += 4) for (ih = 16; ih < _rotate_max_h && !err; ih += 4) { int i; - uint8 *input; - uint8 *output_0; - uint8 *output_270; ow = ih; oh = iw; - input = static_cast(calloc(iw * ih, sizeof(uint8))); - output_0 = static_cast(calloc(iw * ih, sizeof(uint8))); - output_270 = static_cast(calloc(ow * oh, sizeof(uint8))); + align_buffer_16(input, iw * ih) + align_buffer_16(output_0, iw * ih) + align_buffer_16(output_270, ow * oh) for (i = 0; i < (iw * ih); ++i) input[i] = i; @@ -709,9 +661,9 @@ TEST_F(libyuvTest, RotatePlane270Pitch) { print_array(output_0, iw, ih); } - free(input); - free(output_0); - free(output_270); + free_aligned_buffer_16(input) + free_aligned_buffer_16(output_0) + free_aligned_buffer_16(output_270) } EXPECT_EQ(0, err); @@ -719,10 +671,6 @@ TEST_F(libyuvTest, RotatePlane270Pitch) { TEST_F(libyuvTest, I420Rotate90) { int err = 0; - uint8 *orig_y, *orig_u, *orig_v; - uint8 *ro0_y, *ro0_u, *ro0_v; - uint8 *ro90_y, *ro90_u, *ro90_v; - uint8 *ro270_y, *ro270_u, *ro270_v; int yw = 1024; int yh = 768; @@ -737,21 +685,21 @@ TEST_F(libyuvTest, I420Rotate90) { srandom(time(NULL)); - orig_y = static_cast(calloc(y_plane_size, sizeof(uint8))); - orig_u = static_cast(calloc(uv_plane_size, sizeof(uint8))); - orig_v = static_cast(calloc(uv_plane_size, sizeof(uint8))); + align_buffer_16(orig_y, y_plane_size) + align_buffer_16(orig_u, uv_plane_size) + align_buffer_16(orig_v, uv_plane_size) - ro0_y = static_cast(calloc(y_plane_size, sizeof(uint8))); - ro0_u = static_cast(calloc(uv_plane_size, sizeof(uint8))); - ro0_v = static_cast(calloc(uv_plane_size, sizeof(uint8))); + align_buffer_16(ro0_y, y_plane_size) + align_buffer_16(ro0_u, uv_plane_size) + align_buffer_16(ro0_v, uv_plane_size) - ro90_y = static_cast(calloc(y_plane_size, sizeof(uint8))); - ro90_u = static_cast(calloc(uv_plane_size, sizeof(uint8))); - ro90_v = static_cast(calloc(uv_plane_size, sizeof(uint8))); + align_buffer_16(ro90_y, y_plane_size) + align_buffer_16(ro90_u, uv_plane_size) + align_buffer_16(ro90_v, uv_plane_size) - ro270_y = static_cast(calloc(y_plane_size, sizeof(uint8))); - ro270_u = static_cast(calloc(uv_plane_size, sizeof(uint8))); - ro270_v = static_cast(calloc(uv_plane_size, sizeof(uint8))); + align_buffer_16(ro270_y, y_plane_size) + align_buffer_16(ro270_u, uv_plane_size) + align_buffer_16(ro270_v, uv_plane_size) // fill image buffers with random data for (i = b; i < (yh + b); ++i) { @@ -816,28 +764,24 @@ TEST_F(libyuvTest, I420Rotate90) { ++err; } - free(orig_y); - free(orig_u); - free(orig_v); - free(ro0_y); - free(ro0_u); - free(ro0_v); - free(ro90_y); - free(ro90_u); - free(ro90_v); - free(ro270_y); - free(ro270_u); - free(ro270_v); + free_aligned_buffer_16(orig_y) + free_aligned_buffer_16(orig_u) + free_aligned_buffer_16(orig_v) + free_aligned_buffer_16(ro0_y) + free_aligned_buffer_16(ro0_u) + free_aligned_buffer_16(ro0_v) + free_aligned_buffer_16(ro90_y) + free_aligned_buffer_16(ro90_u) + free_aligned_buffer_16(ro90_v) + free_aligned_buffer_16(ro270_y) + free_aligned_buffer_16(ro270_u) + free_aligned_buffer_16(ro270_v) EXPECT_EQ(0, err); } TEST_F(libyuvTest, I420Rotate270) { int err = 0; - uint8 *orig_y, 
*orig_u, *orig_v; - uint8 *ro0_y, *ro0_u, *ro0_v; - uint8 *ro90_y, *ro90_u, *ro90_v; - uint8 *ro270_y, *ro270_u, *ro270_v; int yw = 1024; int yh = 768; @@ -852,21 +796,21 @@ TEST_F(libyuvTest, I420Rotate270) { srandom(time(NULL)); - orig_y = static_cast(calloc(y_plane_size, sizeof(uint8))); - orig_u = static_cast(calloc(uv_plane_size, sizeof(uint8))); - orig_v = static_cast(calloc(uv_plane_size, sizeof(uint8))); + align_buffer_16(orig_y, y_plane_size) + align_buffer_16(orig_u, uv_plane_size) + align_buffer_16(orig_v, uv_plane_size) - ro0_y = static_cast(calloc(y_plane_size, sizeof(uint8))); - ro0_u = static_cast(calloc(uv_plane_size, sizeof(uint8))); - ro0_v = static_cast(calloc(uv_plane_size, sizeof(uint8))); + align_buffer_16(ro0_y, y_plane_size) + align_buffer_16(ro0_u, uv_plane_size) + align_buffer_16(ro0_v, uv_plane_size) - ro90_y = static_cast(calloc(y_plane_size, sizeof(uint8))); - ro90_u = static_cast(calloc(uv_plane_size, sizeof(uint8))); - ro90_v = static_cast(calloc(uv_plane_size, sizeof(uint8))); + align_buffer_16(ro90_y, y_plane_size) + align_buffer_16(ro90_u, uv_plane_size) + align_buffer_16(ro90_v, uv_plane_size) - ro270_y = static_cast(calloc(y_plane_size, sizeof(uint8))); - ro270_u = static_cast(calloc(uv_plane_size, sizeof(uint8))); - ro270_v = static_cast(calloc(uv_plane_size, sizeof(uint8))); + align_buffer_16(ro270_y, y_plane_size) + align_buffer_16(ro270_u, uv_plane_size) + align_buffer_16(ro270_v, uv_plane_size) // fill image buffers with random data for (i = b; i < (yh + b); ++i) { @@ -931,27 +875,24 @@ TEST_F(libyuvTest, I420Rotate270) { ++err; } - free(orig_y); - free(orig_u); - free(orig_v); - free(ro0_y); - free(ro0_u); - free(ro0_v); - free(ro90_y); - free(ro90_u); - free(ro90_v); - free(ro270_y); - free(ro270_u); - free(ro270_v); + free_aligned_buffer_16(orig_y) + free_aligned_buffer_16(orig_u) + free_aligned_buffer_16(orig_v) + free_aligned_buffer_16(ro0_y) + free_aligned_buffer_16(ro0_u) + free_aligned_buffer_16(ro0_v) + free_aligned_buffer_16(ro90_y) + free_aligned_buffer_16(ro90_u) + free_aligned_buffer_16(ro90_v) + free_aligned_buffer_16(ro270_y) + free_aligned_buffer_16(ro270_u) + free_aligned_buffer_16(ro270_v) EXPECT_EQ(0, err); } TEST_F(libyuvTest, NV12ToI420Rotate90) { int err = 0; - uint8 *orig_y, *orig_uv; - uint8 *ro0_y, *ro0_u, *ro0_v; - uint8 *ro90_y, *ro90_u, *ro90_v; int yw = 1024; int yh = 768; @@ -966,16 +907,16 @@ TEST_F(libyuvTest, NV12ToI420Rotate90) { srandom(time(NULL)); - orig_y = static_cast(calloc(y_plane_size, sizeof(uint8))); - orig_uv = static_cast(calloc(o_uv_plane_size, sizeof(uint8))); + align_buffer_16(orig_y, y_plane_size) + align_buffer_16(orig_uv, o_uv_plane_size) - ro0_y = static_cast(calloc(y_plane_size, sizeof(uint8))); - ro0_u = static_cast(calloc(uv_plane_size, sizeof(uint8))); - ro0_v = static_cast(calloc(uv_plane_size, sizeof(uint8))); + align_buffer_16(ro0_y, y_plane_size) + align_buffer_16(ro0_u, uv_plane_size) + align_buffer_16(ro0_v, uv_plane_size) - ro90_y = static_cast(calloc(y_plane_size, sizeof(uint8))); - ro90_u = static_cast(calloc(uv_plane_size, sizeof(uint8))); - ro90_v = static_cast(calloc(uv_plane_size, sizeof(uint8))); + align_buffer_16(ro90_y, y_plane_size) + align_buffer_16(ro90_u, uv_plane_size) + align_buffer_16(ro90_v, uv_plane_size) // fill image buffers with random data for (i = b; i < (yh + b); ++i) { @@ -1036,23 +977,20 @@ TEST_F(libyuvTest, NV12ToI420Rotate90) { if (!zero_cnt) ++err; - free(orig_y); - free(orig_uv); - free(ro0_y); - free(ro0_u); - free(ro0_v); - free(ro90_y); - free(ro90_u); 
- free(ro90_v); + free_aligned_buffer_16(orig_y) + free_aligned_buffer_16(orig_uv) + free_aligned_buffer_16(ro0_y) + free_aligned_buffer_16(ro0_u) + free_aligned_buffer_16(ro0_v) + free_aligned_buffer_16(ro90_y) + free_aligned_buffer_16(ro90_u) + free_aligned_buffer_16(ro90_v) EXPECT_EQ(0, err); } TEST_F(libyuvTest, NV12ToI420Rotate270) { int err = 0; - uint8 *orig_y, *orig_uv; - uint8 *ro0_y, *ro0_u, *ro0_v; - uint8 *ro270_y, *ro270_u, *ro270_v; int yw = 1024; int yh = 768; @@ -1068,16 +1006,16 @@ TEST_F(libyuvTest, NV12ToI420Rotate270) { srandom(time(NULL)); - orig_y = static_cast(calloc(y_plane_size, sizeof(uint8))); - orig_uv = static_cast(calloc(o_uv_plane_size, sizeof(uint8))); + align_buffer_16(orig_y, y_plane_size) + align_buffer_16(orig_uv, o_uv_plane_size) - ro0_y = static_cast(calloc(y_plane_size, sizeof(uint8))); - ro0_u = static_cast(calloc(uv_plane_size, sizeof(uint8))); - ro0_v = static_cast(calloc(uv_plane_size, sizeof(uint8))); + align_buffer_16(ro0_y, y_plane_size) + align_buffer_16(ro0_u, uv_plane_size) + align_buffer_16(ro0_v, uv_plane_size) - ro270_y = static_cast(calloc(y_plane_size, sizeof(uint8))); - ro270_u = static_cast(calloc(uv_plane_size, sizeof(uint8))); - ro270_v = static_cast(calloc(uv_plane_size, sizeof(uint8))); + align_buffer_16(ro270_y, y_plane_size) + align_buffer_16(ro270_u, uv_plane_size) + align_buffer_16(ro270_v, uv_plane_size) // fill image buffers with random data for (i = b; i < (yh + b); ++i) { @@ -1138,23 +1076,20 @@ TEST_F(libyuvTest, NV12ToI420Rotate270) { if (!zero_cnt) ++err; - free(orig_y); - free(orig_uv); - free(ro0_y); - free(ro0_u); - free(ro0_v); - free(ro270_y); - free(ro270_u); - free(ro270_v); + free_aligned_buffer_16(orig_y) + free_aligned_buffer_16(orig_uv) + free_aligned_buffer_16(ro0_y) + free_aligned_buffer_16(ro0_u) + free_aligned_buffer_16(ro0_v) + free_aligned_buffer_16(ro270_y) + free_aligned_buffer_16(ro270_u) + free_aligned_buffer_16(ro270_v) EXPECT_EQ(0, err); } TEST_F(libyuvTest, NV12ToI420Rotate180) { int err = 0; - uint8 *orig_y, *orig_uv; - uint8 *ro0_y, *ro0_u, *ro0_v; - uint8 *ro180_y, *ro180_u, *ro180_v; int yw = 1024; int yh = 768; @@ -1170,16 +1105,16 @@ TEST_F(libyuvTest, NV12ToI420Rotate180) { srandom(time(NULL)); - orig_y = static_cast(calloc(y_plane_size, sizeof(uint8))); - orig_uv = static_cast(calloc(o_uv_plane_size, sizeof(uint8))); + align_buffer_16(orig_y, y_plane_size) + align_buffer_16(orig_uv, o_uv_plane_size) - ro0_y = static_cast(calloc(y_plane_size, sizeof(uint8))); - ro0_u = static_cast(calloc(uv_plane_size, sizeof(uint8))); - ro0_v = static_cast(calloc(uv_plane_size, sizeof(uint8))); + align_buffer_16(ro0_y, y_plane_size) + align_buffer_16(ro0_u, uv_plane_size) + align_buffer_16(ro0_v, uv_plane_size) - ro180_y = static_cast(calloc(y_plane_size, sizeof(uint8))); - ro180_u = static_cast(calloc(uv_plane_size, sizeof(uint8))); - ro180_v = static_cast(calloc(uv_plane_size, sizeof(uint8))); + align_buffer_16(ro180_y, y_plane_size) + align_buffer_16(ro180_u, uv_plane_size) + align_buffer_16(ro180_v, uv_plane_size) // fill image buffers with random data for (i = b; i < (yh + b); ++i) { @@ -1236,24 +1171,20 @@ TEST_F(libyuvTest, NV12ToI420Rotate180) { if (!zero_cnt) ++err; - free(orig_y); - free(orig_uv); - free(ro0_y); - free(ro0_u); - free(ro0_v); - free(ro180_y); - free(ro180_u); - free(ro180_v); + free_aligned_buffer_16(orig_y) + free_aligned_buffer_16(orig_uv) + free_aligned_buffer_16(ro0_y) + free_aligned_buffer_16(ro0_u) + free_aligned_buffer_16(ro0_v) + free_aligned_buffer_16(ro180_y) + 
free_aligned_buffer_16(ro180_u) + free_aligned_buffer_16(ro180_v) EXPECT_EQ(0, err); } TEST_F(libyuvTest, NV12ToI420RotateNegHeight90) { int y_err = 0, uv_err = 0; - uint8 *orig_y, *orig_uv; - uint8 *roa_y, *roa_u, *roa_v; - uint8 *rob_y, *rob_u, *rob_v; - uint8 *roc_y, *roc_u, *roc_v; int yw = 1024; int yh = 768; @@ -1268,20 +1199,20 @@ TEST_F(libyuvTest, NV12ToI420RotateNegHeight90) { srandom(time(NULL)); - orig_y = static_cast(calloc(y_plane_size, sizeof(uint8))); - orig_uv = static_cast(calloc(o_uv_plane_size, sizeof(uint8))); + align_buffer_16(orig_y, y_plane_size) + align_buffer_16(orig_uv, o_uv_plane_size) - roa_y = static_cast(calloc(y_plane_size, sizeof(uint8))); - roa_u = static_cast(calloc(uv_plane_size, sizeof(uint8))); - roa_v = static_cast(calloc(uv_plane_size, sizeof(uint8))); + align_buffer_16(roa_y, y_plane_size) + align_buffer_16(roa_u, uv_plane_size) + align_buffer_16(roa_v, uv_plane_size) - rob_y = static_cast(calloc(y_plane_size, sizeof(uint8))); - rob_u = static_cast(calloc(uv_plane_size, sizeof(uint8))); - rob_v = static_cast(calloc(uv_plane_size, sizeof(uint8))); + align_buffer_16(rob_y, y_plane_size) + align_buffer_16(rob_u, uv_plane_size) + align_buffer_16(rob_v, uv_plane_size) - roc_y = static_cast(calloc(y_plane_size, sizeof(uint8))); - roc_u = static_cast(calloc(uv_plane_size, sizeof(uint8))); - roc_v = static_cast(calloc(uv_plane_size, sizeof(uint8))); + align_buffer_16(roc_y, y_plane_size) + align_buffer_16(roc_u, uv_plane_size) + align_buffer_16(roc_v, uv_plane_size) // fill image buffers with random data for (i = b; i < (yh + b); ++i) { @@ -1382,26 +1313,23 @@ TEST_F(libyuvTest, NV12ToI420RotateNegHeight90) { print_array(roc_v, uv_st_0, uv_st_90); } - free(orig_y); - free(orig_uv); - free(roa_y); - free(roa_u); - free(roa_v); - free(rob_y); - free(rob_u); - free(rob_v); - free(roc_y); - free(roc_u); - free(roc_v); + free_aligned_buffer_16(orig_y) + free_aligned_buffer_16(orig_uv) + free_aligned_buffer_16(roa_y) + free_aligned_buffer_16(roa_u) + free_aligned_buffer_16(roa_v) + free_aligned_buffer_16(rob_y) + free_aligned_buffer_16(rob_u) + free_aligned_buffer_16(rob_v) + free_aligned_buffer_16(roc_y) + free_aligned_buffer_16(roc_u) + free_aligned_buffer_16(roc_v) EXPECT_EQ(0, y_err + uv_err); } TEST_F(libyuvTest, NV12ToI420RotateNegHeight180) { int y_err = 0, uv_err = 0; - uint8 *orig_y, *orig_uv; - uint8 *roa_y, *roa_u, *roa_v; - uint8 *rob_y, *rob_u, *rob_v; int yw = 1024; int yh = 768; @@ -1416,16 +1344,16 @@ TEST_F(libyuvTest, NV12ToI420RotateNegHeight180) { srandom(time(NULL)); - orig_y = static_cast(calloc(y_plane_size, sizeof(uint8))); - orig_uv = static_cast(calloc(o_uv_plane_size, sizeof(uint8))); + align_buffer_16(orig_y, y_plane_size) + align_buffer_16(orig_uv, o_uv_plane_size) - roa_y = static_cast(calloc(y_plane_size, sizeof(uint8))); - roa_u = static_cast(calloc(uv_plane_size, sizeof(uint8))); - roa_v = static_cast(calloc(uv_plane_size, sizeof(uint8))); + align_buffer_16(roa_y, y_plane_size) + align_buffer_16(roa_u, uv_plane_size) + align_buffer_16(roa_v, uv_plane_size) - rob_y = static_cast(calloc(y_plane_size, sizeof(uint8))); - rob_u = static_cast(calloc(uv_plane_size, sizeof(uint8))); - rob_v = static_cast(calloc(uv_plane_size, sizeof(uint8))); + align_buffer_16(rob_y, y_plane_size) + align_buffer_16(rob_u, uv_plane_size) + align_buffer_16(rob_v, uv_plane_size) // fill image buffers with random data for (i = b; i < (yh + b); ++i) { @@ -1506,14 +1434,14 @@ TEST_F(libyuvTest, NV12ToI420RotateNegHeight180) { print_array(rob_v, uv_st, uvh + (2 
* b)); } - free(orig_y); - free(orig_uv); - free(roa_y); - free(roa_u); - free(roa_v); - free(rob_y); - free(rob_u); - free(rob_v); + free_aligned_buffer_16(orig_y) + free_aligned_buffer_16(orig_uv) + free_aligned_buffer_16(roa_y) + free_aligned_buffer_16(roa_u) + free_aligned_buffer_16(roa_v) + free_aligned_buffer_16(rob_y) + free_aligned_buffer_16(rob_u) + free_aligned_buffer_16(rob_v) EXPECT_EQ(0, y_err + uv_err); } diff --git a/unit_test/scale_test.cc b/unit_test/scale_test.cc index 6399e71b0..dc9c8bfee 100644 --- a/unit_test/scale_test.cc +++ b/unit_test/scale_test.cc @@ -16,43 +16,6 @@ using namespace libyuv; -#define align_buffer_16(var, size) \ - uint8 *var; \ - uint8 *var##_mem; \ - var##_mem = reinterpret_cast(calloc(size+15, sizeof(uint8))); \ - var = reinterpret_cast \ - ((reinterpret_cast(var##_mem) + 15) & (~0x0f)); - -#define free_aligned_buffer_16(var) \ - free(var##_mem); \ - var = 0; - -#ifdef WIN32 - -#include -static double get_time() -{ - LARGE_INTEGER t, f; - QueryPerformanceCounter(&t); - QueryPerformanceFrequency(&f); - return double(t.QuadPart)/double(f.QuadPart); -} - -#else - -#include -#include - -static double get_time() -{ - struct timeval t; - struct timezone tzp; - gettimeofday(&t, &tzp); - return t.tv_sec + t.tv_usec*1e-6; -} - -#endif - static int TestFilter(int src_width, int src_height, int dst_width, int dst_height, FilterMode f) { diff --git a/unit_test/unit_test.h b/unit_test/unit_test.h index cac30c72a..43965b779 100644 --- a/unit_test/unit_test.h +++ b/unit_test/unit_test.h @@ -13,6 +13,43 @@ #include +#define align_buffer_16(var, size) \ + uint8 *var; \ + uint8 *var##_mem; \ + var##_mem = reinterpret_cast(calloc(size+15, sizeof(uint8))); \ + var = reinterpret_cast \ + ((reinterpret_cast(var##_mem) + 15) & (~0x0f)); + +#define free_aligned_buffer_16(var) \ + free(var##_mem); \ + var = 0; + +#ifdef WIN32 + +#include +static double get_time() +{ + LARGE_INTEGER t, f; + QueryPerformanceCounter(&t); + QueryPerformanceFrequency(&f); + return double(t.QuadPart)/double(f.QuadPart); +} + +#else + +#include +#include + +static double get_time() +{ + struct timeval t; + struct timezone tzp; + gettimeofday(&t, &tzp); + return t.tv_sec + t.tv_usec*1e-6; +} + +#endif + class libyuvTest : public ::testing::Test { protected: libyuvTest();
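
Reviewer notes on the patch above (these notes and the sketches below are commentary, not part of the diff).

On the rotate.cc hunks: the old code only took the NEON path when the width was a multiple of 8 or 16 and the source/destination pointers and strides met alignment requirements; because the new rotate_neon.cc kernels handle their own leftovers, the dispatch collapses to a bare CPU-flag test. A minimal C++ sketch of that selection, assuming the typedef and kernel declarations from rotate.cc (TransposeWx8_C is the assumed name of the portable fallback below the "} else" that the hunk does not show, and the cpu_id.h header name is likewise an assumption):

#include "libyuv/basic_types.h"
#include "libyuv/cpu_id.h"  // TestCpuFlag / kCpuHasNEON (header name assumed)

// Same signature as the rotate_wx8_func typedef in rotate.cc.
typedef void (*rotate_wx8_func)(const uint8*, int, uint8*, int, int);

void TransposeWx8_C(const uint8*, int, uint8*, int, int);     // portable kernel (assumed name)
void TransposeWx8_NEON(const uint8*, int, uint8*, int, int);  // from rotate_neon.cc

// Post-patch selection logic: a runtime CPU-flag check is enough, since
// TransposeWx8_NEON now deals with leftover columns itself, so the old
// width and alignment preconditions are gone.
static rotate_wx8_func ChooseTransposeWx8() {
#if defined(HAS_TRANSPOSE_WX8_NEON)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON)) {
    return TransposeWx8_NEON;
  }
#endif
  return TransposeWx8_C;
}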
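
On ReverseLine_NEON: the comments in the new kernel describe the leftover strategy that replaces those preconditions: reverse 16-byte blocks (vrev64.8 on two d registers, stored back in swapped order), then byte pairs, then at most one trailing byte. For readers following the assembly, here is a plain C++ reference with the same three-stage block structure; it is illustrative only and not part of the patch.

#include <stdint.h>

// Reference mirror of ReverseLine_NEON's structure: bulk 16-byte chunks
// ("segments_of_16"), then 2-byte pairs ("segments_of_2"), then a possible
// single byte ("segment_of_1").  dst starts at the right edge of the output
// and walks left, exactly as the assembly does with its negative post-
// increments (r3 = -24 in the 16-byte stage, -3 in the pair stage).
static void ReverseLine_Reference(const uint8_t* src, uint8_t* dst, int width) {
  dst += width;                     // one past the last output byte
  int n = width;
  while (n >= 16) {                 // 16 bytes per iteration
    for (int i = 0; i < 16; ++i) {
      dst[-1 - i] = src[i];
    }
    src += 16;
    dst -= 16;
    n -= 16;
  }
  while (n >= 2) {                  // leftover pairs
    dst[-1] = src[0];
    dst[-2] = src[1];
    src += 2;
    dst -= 2;
    n -= 2;
  }
  if (n == 1) {                     // final odd byte
    dst[-1] = src[0];
  }
}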
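
On the unit-test changes: the align_buffer_16 / free_aligned_buffer_16 macros and the #include lines that move from scale_test.cc into unit_test.h have lost their angle-bracketed parts in this rendering of the diff (the casts read "reinterpret_cast(...)" and the includes are blank). A hedged reconstruction of what the macros plausibly expand to, inferred from the "+ 15" / "& ~0x0f" rounding and the calloc/free pairing; the cast target types and the integer type used for the pointer arithmetic are my reading, not a verbatim copy:

#include <stdint.h>              // uintptr_t (integer type assumed; original may use intptr_t)
#include <stdlib.h>              // calloc / free
#include "libyuv/basic_types.h"  // uint8

// Over-allocate by 15 bytes, then round the pointer up to the next 16-byte
// boundary.  var##_mem keeps the original allocation so it can be freed;
// var is the aligned pointer the tests actually use.
#define align_buffer_16(var, size)                                         \
  uint8* var;                                                               \
  uint8* var##_mem;                                                         \
  var##_mem = reinterpret_cast<uint8*>(calloc(size + 15, sizeof(uint8)));   \
  var = reinterpret_cast<uint8*>(                                           \
      (reinterpret_cast<uintptr_t>(var##_mem) + 15) & ~0x0f);

#define free_aligned_buffer_16(var) \
  free(var##_mem);                  \
  var = 0;

The blank #include lines guarding get_time() are presumably <windows.h> under WIN32 and <sys/time.h> plus <time.h> otherwise, and unit_test.h itself needs the gtest header for the ::testing::Test base class; those names are not recoverable from this view, so treat them as assumptions.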