diff --git a/libyuv.gyp b/libyuv.gyp index ea3575eee..c6b674962 100644 --- a/libyuv.gyp +++ b/libyuv.gyp @@ -28,7 +28,6 @@ 'include/libyuv/general.h', 'include/libyuv/scale.h', 'include/libyuv/planar_functions.h', - # headers 'source/conversion_tables.h', @@ -59,6 +58,15 @@ 'source/row_posix.cc', ], }], + ['target_arch=="arm"',{ + 'conditions': [ + ['arm_neon==1', { + 'sources' : [ + 'source/rotate_neon.cc', + ], + }], + ], + }], ] }, ], # targets diff --git a/source/rotate.cc b/source/rotate.cc index 752f756dd..efd674d86 100644 --- a/source/rotate.cc +++ b/source/rotate.cc @@ -43,7 +43,6 @@ typedef void (*rotate_wx8_func)(const uint8*, int, uint8*, int, int); typedef void (*rotate_wxh_func)(const uint8*, int, uint8*, int, int, int); #ifdef __ARM_NEON__ -extern "C" { #define HAS_REVERSE_LINE_NEON void ReverseLine_NEON(const uint8* src, uint8* dst, int width); #define HAS_REVERSE_LINE_UV_NEON @@ -58,7 +57,6 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride, uint8* dst_a, int dst_stride_a, uint8* dst_b, int dst_stride_b, int width); -} // extern "C" #endif #if defined(WIN32) && !defined(COVERAGE_ENABLED) @@ -784,10 +782,7 @@ void TransposePlane(const uint8* src, int src_stride, rotate_wxh_func TransposeWxH; #if defined(HAS_TRANSPOSE_WX8_NEON) - if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) && - (width % 8 == 0) && - IS_ALIGNED(src, 8) && (src_stride % 8 == 0) && - IS_ALIGNED(dst, 8) && (dst_stride % 8 == 0)) { + if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON)) { TransposeWx8 = TransposeWx8_NEON; TransposeWxH = TransposeWxH_C; } else @@ -917,10 +912,7 @@ void RotatePlane180(const uint8* src, int src_stride, reverse_func ReverseLine; #if defined(HAS_REVERSE_LINE_NEON) - if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) && - (width % 16 == 0) && - IS_ALIGNED(src, 16) && (src_stride % 16 == 0) && - IS_ALIGNED(dst, 16) && (dst_stride % 16 == 0)) { + if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON)) { ReverseLine = ReverseLine_NEON; } else #endif @@ -1145,11 +1137,7 @@ void RotateUV180(const uint8* src, int src_stride, reverse_uv_func ReverseLine; #if defined(HAS_REVERSE_LINE_UV_NEON) - if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) && - (width % 16 == 0) && - IS_ALIGNED(src, 16) && (src_stride % 16 == 0) && - IS_ALIGNED(dst_a, 8) && (dst_stride_a % 8 == 0) && - IS_ALIGNED(dst_b, 8) && (dst_stride_b % 8 == 0) ) { + if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON)) { ReverseLine = ReverseLineUV_NEON; } else #endif diff --git a/source/rotate_neon.cc b/source/rotate_neon.cc new file mode 100644 index 000000000..a6496d33b --- /dev/null +++ b/source/rotate_neon.cc @@ -0,0 +1,557 @@ +/* + * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/basic_types.h" + +namespace libyuv { + +#if defined(__ARM_NEON__) && !defined(COVERAGE_ENABLED) + +void ReverseLine_NEON(const uint8* src, uint8* dst, int width) { + asm volatile( + // compute where to start writing destination + "add %1, %2\n" + + // work on segments that are multiples of 16 + "lsrs r3, %2, #4\n" + + // the output is written in two block. 8 bytes followed + // by another 8. reading is done sequentially, from left to + // right. 
writing is done from right to left in block sizes + // %1, the destination pointer is incremented after writing + // the first of the two blocks. need to subtract that 8 off + // along with 16 to get the next location. + "mov r3, #-24\n" + + "beq 2f\n" + + // back of destination by the size of the register that is + // going to be reversed + "sub %1, #16\n" + + // the loop needs to run on blocks of 16. what will be left + // over is either a negative number, the residuals that need + // to be done, or 0. if this isn't subtracted off here the + // loop will run one extra time. + "sub %2, #16\n" + + "1:\n" + "vld1.8 {q0}, [%0]!\n" // src += 16 + + // reverse the bytes in the 64 bit segments. unable to reverse + // the bytes in the entire 128 bits in one go. + "vrev64.8 q0, q0\n" + + // because of the inability to reverse the entire 128 bits + // reverse the writing out of the two 64 bit segments. + "vst1.8 {d1}, [%1]!\n" + "vst1.8 {d0}, [%1], r3\n" // dst -= 16 + + "subs %2, #16\n" + "bge 1b\n" + + // add 16 back to the counter. if the result is 0 there is no + // residuals so jump past + "adds %2, #16\n" + "beq 5f\n" + + "add %1, #16\n" + + "2:\n" + + "mov r3, #-3\n" + + "sub %1, #2\n" + "subs %2, #2\n" + // check for 16*n+1 scenarios where segments_of_2 should not + // be run, but there is something left over. + "blt 4f\n" + +// do this in neon registers as per +// http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/ + "3:\n" + "vld2.8 {d0[0], d1[0]}, [%0]!\n" // src += 2 + + "vst1.8 {d1[0]}, [%1]!\n" + "vst1.8 {d0[0]}, [%1], r3\n" // dst -= 2 + + "subs %2, #2\n" + "bge 3b\n" + + "adds %2, #2\n" + "beq 5f\n" + + "4:\n" + "add %1, #1\n" + "vld1.8 {d0[0]}, [%0]\n" + "vst1.8 {d0[0]}, [%1]\n" + + "5:\n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "r3", "q0" + ); +} + +static const uint8 vtbl_4x4_transpose[16] __attribute__((vector_size(16))) = + { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 }; + +void TransposeWx8_NEON(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width) { + asm volatile( + // loops are on blocks of 8. loop will stop when + // counter gets to or below 0. starting the counter + // at w-8 allow for this + "sub %4, #8\n" + + // handle 8x8 blocks. this should be the majority of the plane + "1:\n" + "mov r9, %0\n" + + "vld1.8 {d0}, [r9], %1\n" + "vld1.8 {d1}, [r9], %1\n" + "vld1.8 {d2}, [r9], %1\n" + "vld1.8 {d3}, [r9], %1\n" + "vld1.8 {d4}, [r9], %1\n" + "vld1.8 {d5}, [r9], %1\n" + "vld1.8 {d6}, [r9], %1\n" + "vld1.8 {d7}, [r9]\n" + + "vtrn.8 d1, d0\n" + "vtrn.8 d3, d2\n" + "vtrn.8 d5, d4\n" + "vtrn.8 d7, d6\n" + + "vtrn.16 d1, d3\n" + "vtrn.16 d0, d2\n" + "vtrn.16 d5, d7\n" + "vtrn.16 d4, d6\n" + + "vtrn.32 d1, d5\n" + "vtrn.32 d0, d4\n" + "vtrn.32 d3, d7\n" + "vtrn.32 d2, d6\n" + + "vrev16.8 q0, q0\n" + "vrev16.8 q1, q1\n" + "vrev16.8 q2, q2\n" + "vrev16.8 q3, q3\n" + + "mov r9, %2\n" + + "vst1.8 {d1}, [r9], %3\n" + "vst1.8 {d0}, [r9], %3\n" + "vst1.8 {d3}, [r9], %3\n" + "vst1.8 {d2}, [r9], %3\n" + "vst1.8 {d5}, [r9], %3\n" + "vst1.8 {d4}, [r9], %3\n" + "vst1.8 {d7}, [r9], %3\n" + "vst1.8 {d6}, [r9]\n" + + "add %0, #8\n" // src += 8 + "add %2, %3, lsl #3\n" // dst += 8 * dst_stride + "subs %4, #8\n" // w -= 8 + "bge 1b\n" + + // add 8 back to counter. if the result is 0 there are + // no residuals. 
+ "adds %4, #8\n" + "beq 4f\n" + + // some residual, so between 1 and 7 lines left to transpose + "cmp %4, #2\n" + "blt 3f\n" + + "cmp %4, #4\n" + "blt 2f\n" + + // 4x8 block + "mov r9, %0\n" + "vld1.32 {d0[0]}, [r9], %1\n" + "vld1.32 {d0[1]}, [r9], %1\n" + "vld1.32 {d1[0]}, [r9], %1\n" + "vld1.32 {d1[1]}, [r9], %1\n" + "vld1.32 {d2[0]}, [r9], %1\n" + "vld1.32 {d2[1]}, [r9], %1\n" + "vld1.32 {d3[0]}, [r9], %1\n" + "vld1.32 {d3[1]}, [r9]\n" + + "mov r9, %2\n" + + "vld1.8 {q3}, [%5]\n" + + "vtbl.8 d4, {d0, d1}, d6\n" + "vtbl.8 d5, {d0, d1}, d7\n" + "vtbl.8 d0, {d2, d3}, d6\n" + "vtbl.8 d1, {d2, d3}, d7\n" + + // TODO: rework shuffle above to write + // out with 4 instead of 8 writes + "vst1.32 {d4[0]}, [r9], %3\n" + "vst1.32 {d4[1]}, [r9], %3\n" + "vst1.32 {d5[0]}, [r9], %3\n" + "vst1.32 {d5[1]}, [r9]\n" + + "add r9, %2, #4\n" + "vst1.32 {d0[0]}, [r9], %3\n" + "vst1.32 {d0[1]}, [r9], %3\n" + "vst1.32 {d1[0]}, [r9], %3\n" + "vst1.32 {d1[1]}, [r9]\n" + + "add %0, #4\n" // src += 4 + "add %2, %3, lsl #2\n" // dst += 4 * dst_stride + "subs %4, #4\n" // w -= 4 + "beq 4f\n" + + // some residual, check to see if it includes a 2x8 block, + // or less + "cmp %4, #2\n" + "blt 3f\n" + + // 2x8 block + "2:\n" + "mov r9, %0\n" + "vld1.16 {d0[0]}, [r9], %1\n" + "vld1.16 {d1[0]}, [r9], %1\n" + "vld1.16 {d0[1]}, [r9], %1\n" + "vld1.16 {d1[1]}, [r9], %1\n" + "vld1.16 {d0[2]}, [r9], %1\n" + "vld1.16 {d1[2]}, [r9], %1\n" + "vld1.16 {d0[3]}, [r9], %1\n" + "vld1.16 {d1[3]}, [r9]\n" + + "vtrn.8 d0, d1\n" + + "mov r9, %2\n" + + "vst1.64 {d0}, [r9], %3\n" + "vst1.64 {d1}, [r9]\n" + + "add %0, #2\n" // src += 2 + "add %2, %3, lsl #1\n" // dst += 2 * dst_stride + "subs %4, #2\n" // w -= 2 + "beq 4f\n" + + // 1x8 block + "3:\n" + "vld1.8 {d0[0]}, [%0], %1\n" + "vld1.8 {d0[1]}, [%0], %1\n" + "vld1.8 {d0[2]}, [%0], %1\n" + "vld1.8 {d0[3]}, [%0], %1\n" + "vld1.8 {d0[4]}, [%0], %1\n" + "vld1.8 {d0[5]}, [%0], %1\n" + "vld1.8 {d0[6]}, [%0], %1\n" + "vld1.8 {d0[7]}, [%0]\n" + + "vst1.64 {d0}, [%2]\n" + + "4:\n" + + : "+r"(src), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_stride), // %3 + "+r"(width) // %4 + : "r"(vtbl_4x4_transpose) // %5 + : "memory", "cc", "r9", "q0", "q1", "q2", "q3" + ); +} + +void ReverseLineUV_NEON(const uint8* src, + uint8* dst_a, uint8* dst_b, + int width) { + asm volatile( + // compute where to start writing destination + "add %1, %3\n" // dst_a + width + "add %2, %3\n" // dst_b + width + + // work on input segments that are multiples of 16, but + // width that has been passed is output segments, half + // the size of input. + "lsrs r12, %3, #3\n" + + "beq 2f\n" + + // the output is written in to two blocks. + "mov r12, #-8\n" + + // back of destination by the size of the register that is + // going to be reversed + "sub %1, #8\n" + "sub %2, #8\n" + + // the loop needs to run on blocks of 8. what will be left + // over is either a negative number, the residuals that need + // to be done, or 0. if this isn't subtracted off here the + // loop will run one extra time. + "sub %3, #8\n" + + "1:\n" + "vld2.8 {d0, d1}, [%0]!\n" // src += 16 + + // reverse the bytes in the 64 bit segments + "vrev64.8 q0, q0\n" + + "vst1.8 {d0}, [%1], r12\n" // dst_a -= 8 + "vst1.8 {d1}, [%2], r12\n" // dst_b -= 8 + + "subs %3, #8\n" + "bge 1b\n" + + // add 8 back to the counter. 
if the result is 0 there is no + // residuals so return + "adds %3, #8\n" + "beq 4f\n" + + "add %1, #8\n" + "add %2, #8\n" + + "2:\n" + + "mov r12, #-1\n" + + "sub %1, #1\n" + "sub %2, #1\n" + + "3:\n" + "vld2.8 {d0[0], d1[0]}, [%0]!\n" // src += 2 + + "vst1.8 {d0[0]}, [%1], r12\n" // dst_a -= 1 + "vst1.8 {d1[0]}, [%2], r12\n" // dst_b -= 1 + + "subs %3, %3, #1\n" + "bgt 3b\n" + "4:\n" + : "+r"(src), // %0 + "+r"(dst_a), // %1 + "+r"(dst_b), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "r12", "q0" + ); +} + +static const uint8 vtbl_4x4_transpose_di[16] __attribute__((vector_size(16))) = + { 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 }; + +void TransposeUVWx8_NEON(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width) { + asm volatile( + // loops are on blocks of 8. loop will stop when + // counter gets to or below 0. starting the counter + // at w-8 allow for this + "sub %6, #8\n" + + // handle 8x8 blocks. this should be the majority of the plane + "1:\n" + "mov r9, %0\n" + + "vld2.8 {d0, d1}, [r9], %1\n" + "vld2.8 {d2, d3}, [r9], %1\n" + "vld2.8 {d4, d5}, [r9], %1\n" + "vld2.8 {d6, d7}, [r9], %1\n" + "vld2.8 {d16, d17}, [r9], %1\n" + "vld2.8 {d18, d19}, [r9], %1\n" + "vld2.8 {d20, d21}, [r9], %1\n" + "vld2.8 {d22, d23}, [r9]\n" + + "vtrn.8 q1, q0\n" + "vtrn.8 q3, q2\n" + "vtrn.8 q9, q8\n" + "vtrn.8 q11, q10\n" + + "vtrn.16 q1, q3\n" + "vtrn.16 q0, q2\n" + "vtrn.16 q9, q11\n" + "vtrn.16 q8, q10\n" + + "vtrn.32 q1, q9\n" + "vtrn.32 q0, q8\n" + "vtrn.32 q3, q11\n" + "vtrn.32 q2, q10\n" + + "vrev16.8 q0, q0\n" + "vrev16.8 q1, q1\n" + "vrev16.8 q2, q2\n" + "vrev16.8 q3, q3\n" + "vrev16.8 q8, q8\n" + "vrev16.8 q9, q9\n" + "vrev16.8 q10, q10\n" + "vrev16.8 q11, q11\n" + + "mov r9, %2\n" + + "vst1.8 {d2}, [r9], %3\n" + "vst1.8 {d0}, [r9], %3\n" + "vst1.8 {d6}, [r9], %3\n" + "vst1.8 {d4}, [r9], %3\n" + "vst1.8 {d18}, [r9], %3\n" + "vst1.8 {d16}, [r9], %3\n" + "vst1.8 {d22}, [r9], %3\n" + "vst1.8 {d20}, [r9]\n" + + "mov r9, %4\n" + + "vst1.8 {d3}, [r9], %5\n" + "vst1.8 {d1}, [r9], %5\n" + "vst1.8 {d7}, [r9], %5\n" + "vst1.8 {d5}, [r9], %5\n" + "vst1.8 {d19}, [r9], %5\n" + "vst1.8 {d17}, [r9], %5\n" + "vst1.8 {d23}, [r9], %5\n" + "vst1.8 {d21}, [r9]\n" + + "add %0, #8*2\n" // src += 8*2 + "add %2, %3, lsl #3\n" // dst_a += 8 * dst_stride_a + "add %4, %5, lsl #3\n" // dst_b += 8 * dst_stride_b + "subs %6, #8\n" // w -= 8 + "bge 1b\n" + + // add 8 back to counter. if the result is 0 there are + // no residuals. 
+ "adds %6, #8\n" + "beq 4f\n" + + // some residual, so between 1 and 7 lines left to transpose + "cmp %6, #2\n" + "blt 3f\n" + + "cmp %6, #4\n" + "blt 2f\n" + + //TODO(frkoenig) : clean this up + // 4x8 block + "mov r9, %0\n" + "vld1.64 {d0}, [r9], %1\n" + "vld1.64 {d1}, [r9], %1\n" + "vld1.64 {d2}, [r9], %1\n" + "vld1.64 {d3}, [r9], %1\n" + "vld1.64 {d4}, [r9], %1\n" + "vld1.64 {d5}, [r9], %1\n" + "vld1.64 {d6}, [r9], %1\n" + "vld1.64 {d7}, [r9]\n" + + "vld1.8 {q15}, [%7]\n" + + "vtrn.8 q0, q1\n" + "vtrn.8 q2, q3\n" + + "vtbl.8 d16, {d0, d1}, d30\n" + "vtbl.8 d17, {d0, d1}, d31\n" + "vtbl.8 d18, {d2, d3}, d30\n" + "vtbl.8 d19, {d2, d3}, d31\n" + "vtbl.8 d20, {d4, d5}, d30\n" + "vtbl.8 d21, {d4, d5}, d31\n" + "vtbl.8 d22, {d6, d7}, d30\n" + "vtbl.8 d23, {d6, d7}, d31\n" + + "mov r9, %2\n" + + "vst1.32 {d16[0]}, [r9], %3\n" + "vst1.32 {d16[1]}, [r9], %3\n" + "vst1.32 {d17[0]}, [r9], %3\n" + "vst1.32 {d17[1]}, [r9], %3\n" + + "add r9, %2, #4\n" + "vst1.32 {d20[0]}, [r9], %3\n" + "vst1.32 {d20[1]}, [r9], %3\n" + "vst1.32 {d21[0]}, [r9], %3\n" + "vst1.32 {d21[1]}, [r9]\n" + + "mov r9, %4\n" + + "vst1.32 {d18[0]}, [r9], %5\n" + "vst1.32 {d18[1]}, [r9], %5\n" + "vst1.32 {d19[0]}, [r9], %5\n" + "vst1.32 {d19[1]}, [r9], %5\n" + + "add r9, %4, #4\n" + "vst1.32 {d22[0]}, [r9], %5\n" + "vst1.32 {d22[1]}, [r9], %5\n" + "vst1.32 {d23[0]}, [r9], %5\n" + "vst1.32 {d23[1]}, [r9]\n" + + "add %0, #4*2\n" // src += 4 * 2 + "add %2, %3, lsl #2\n" // dst_a += 4 * dst_stride_a + "add %4, %5, lsl #2\n" // dst_b += 4 * dst_stride_b + "subs %6, #4\n" // w -= 4 + "beq 4f\n" + + // some residual, check to see if it includes a 2x8 block, + // or less + "cmp %6, #2\n" + "blt 3f\n" + + // 2x8 block + "2:\n" + "mov r9, %0\n" + "vld2.16 {d0[0], d2[0]}, [r9], %1\n" + "vld2.16 {d1[0], d3[0]}, [r9], %1\n" + "vld2.16 {d0[1], d2[1]}, [r9], %1\n" + "vld2.16 {d1[1], d3[1]}, [r9], %1\n" + "vld2.16 {d0[2], d2[2]}, [r9], %1\n" + "vld2.16 {d1[2], d3[2]}, [r9], %1\n" + "vld2.16 {d0[3], d2[3]}, [r9], %1\n" + "vld2.16 {d1[3], d3[3]}, [r9]\n" + + "vtrn.8 d0, d1\n" + "vtrn.8 d2, d3\n" + + "mov r9, %2\n" + + "vst1.64 {d0}, [r9], %3\n" + "vst1.64 {d2}, [r9]\n" + + "mov r9, %4\n" + + "vst1.64 {d1}, [r9], %5\n" + "vst1.64 {d3}, [r9]\n" + + "add %0, #2*2\n" // src += 2 * 2 + "add %2, %3, lsl #1\n" // dst_a += 2 * dst_stride_a + "add %4, %5, lsl #1\n" // dst_b += 2 * dst_stride_b + "subs %6, #2\n" // w -= 2 + "beq 4f\n" + + // 1x8 block + "3:\n" + "vld2.8 {d0[0], d1[0]}, [%0], %1\n" + "vld2.8 {d0[1], d1[1]}, [%0], %1\n" + "vld2.8 {d0[2], d1[2]}, [%0], %1\n" + "vld2.8 {d0[3], d1[3]}, [%0], %1\n" + "vld2.8 {d0[4], d1[4]}, [%0], %1\n" + "vld2.8 {d0[5], d1[5]}, [%0], %1\n" + "vld2.8 {d0[6], d1[6]}, [%0], %1\n" + "vld2.8 {d0[7], d1[7]}, [%0]\n" + + "vst1.64 {d0}, [%2]\n" + "vst1.64 {d1}, [%4]\n" + + "4:\n" + + : "+r"(src), // %0 + "+r"(src_stride), // %1 + "+r"(dst_a), // %2 + "+r"(dst_stride_a), // %3 + "+r"(dst_b), // %4 + "+r"(dst_stride_b), // %5 + "+r"(width) // %6 + : "r"(vtbl_4x4_transpose_di)// %7 + : "memory", "cc", "r9", + "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11" + ); +} +#endif +} diff --git a/source/rotate_neon.s b/source/rotate_neon.s deleted file mode 100644 index 2d71c8937..000000000 --- a/source/rotate_neon.s +++ /dev/null @@ -1,545 +0,0 @@ - .global ReverseLine_NEON - .global ReverseLineUV_NEON - .global TransposeWx8_NEON - .global TransposeUVWx8_NEON - .type ReverseLine_NEON, function - .type ReverseLineUV_NEON, function - .type TransposeWx8_NEON, function - .type TransposeUVWx8_NEON, function - -@ void ReverseLine_NEON 
(const uint8* src, uint8* dst, int width) -@ r0 const uint8* src -@ r1 uint8* dst -@ r2 width -ReverseLine_NEON: - - @ compute where to start writing destination - add r1, r2 @ dst + width - - @ work on segments that are multiples of 16 - lsrs r3, r2, #4 - - @ the output is written in two block. 8 bytes followed - @ by another 8. reading is done sequentially, from left to - @ right. writing is done from right to left in block sizes - @ r1, the destination pointer is incremented after writing - @ the first of the two blocks. need to subtract that 8 off - @ along with 16 to get the next location. - mov r3, #-24 - - beq Lline_residuals - - @ back of destination by the size of the register that is - @ going to be reversed - sub r1, #16 - - @ the loop needs to run on blocks of 16. what will be left - @ over is either a negative number, the residuals that need - @ to be done, or 0. if this isn't subtracted off here the - @ loop will run one extra time. - sub r2, #16 - -Lsegments_of_16: - vld1.8 {q0}, [r0]! @ src += 16 - - @ reverse the bytes in the 64 bit segments. unable to reverse - @ the bytes in the entire 128 bits in one go. - vrev64.8 q0, q0 - - @ because of the inability to reverse the entire 128 bits - @ reverse the writing out of the two 64 bit segments. - vst1.8 {d1}, [r1]! - vst1.8 {d0}, [r1], r3 @ dst -= 16 - - subs r2, #16 - bge Lsegments_of_16 - - @ add 16 back to the counter. if the result is 0 there is no - @ residuals so return - adds r2, #16 - bxeq lr - - add r1, #16 - -Lline_residuals: - - mov r3, #-3 - - sub r1, #2 - subs r2, #2 - @ check for 16*n+1 scenarios where segments_of_2 should not - @ be run, but there is something left over. - blt Lsegment_of_1 - -@ do this in neon registers as per -@ http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/ -Lsegments_of_2: - vld2.8 {d0[0], d1[0]}, [r0]! @ src += 2 - - vst1.8 {d1[0]}, [r1]! - vst1.8 {d0[0]}, [r1], r3 @ dst -= 2 - - subs r2, #2 - bge Lsegments_of_2 - - adds r2, #2 - bxeq lr - -Lsegment_of_1: - add r1, #1 - vld1.8 {d0[0]}, [r0] - vst1.8 {d0[0]}, [r1] - - bx lr - -@ void TransposeWx8_NEON (const uint8* src, int src_stride, -@ uint8* dst, int dst_stride, -@ int w) -@ r0 const uint8* src -@ r1 int src_stride -@ r2 uint8* dst -@ r3 int dst_stride -@ stack int w -TransposeWx8_NEON: - push {r4,r8,r9,lr} - - ldr r8, [sp, #16] @ width - - @ loops are on blocks of 8. loop will stop when - @ counter gets to or below 0. starting the counter - @ at w-8 allow for this - sub r8, #8 - -@ handle 8x8 blocks. this should be the majority of the plane -Lloop_8x8: - mov r9, r0 - - vld1.8 {d0}, [r9], r1 - vld1.8 {d1}, [r9], r1 - vld1.8 {d2}, [r9], r1 - vld1.8 {d3}, [r9], r1 - vld1.8 {d4}, [r9], r1 - vld1.8 {d5}, [r9], r1 - vld1.8 {d6}, [r9], r1 - vld1.8 {d7}, [r9] - - vtrn.8 d1, d0 - vtrn.8 d3, d2 - vtrn.8 d5, d4 - vtrn.8 d7, d6 - - vtrn.16 d1, d3 - vtrn.16 d0, d2 - vtrn.16 d5, d7 - vtrn.16 d4, d6 - - vtrn.32 d1, d5 - vtrn.32 d0, d4 - vtrn.32 d3, d7 - vtrn.32 d2, d6 - - vrev16.8 q0, q0 - vrev16.8 q1, q1 - vrev16.8 q2, q2 - vrev16.8 q3, q3 - - mov r9, r2 - - vst1.8 {d1}, [r9], r3 - vst1.8 {d0}, [r9], r3 - vst1.8 {d3}, [r9], r3 - vst1.8 {d2}, [r9], r3 - vst1.8 {d5}, [r9], r3 - vst1.8 {d4}, [r9], r3 - vst1.8 {d7}, [r9], r3 - vst1.8 {d6}, [r9] - - add r0, #8 @ src += 8 - add r2, r3, lsl #3 @ dst += 8 * dst_stride - subs r8, #8 @ w -= 8 - bge Lloop_8x8 - - @ add 8 back to counter. if the result is 0 there are - @ no residuals. 
- adds r8, #8 - beq Ldone - - @ some residual, so between 1 and 7 lines left to transpose - cmp r8, #2 - blt Lblock_1x8 - - cmp r8, #4 - blt Lblock_2x8 - -Lblock_4x8: - mov r9, r0 - vld1.32 {d0[0]}, [r9], r1 - vld1.32 {d0[1]}, [r9], r1 - vld1.32 {d1[0]}, [r9], r1 - vld1.32 {d1[1]}, [r9], r1 - vld1.32 {d2[0]}, [r9], r1 - vld1.32 {d2[1]}, [r9], r1 - vld1.32 {d3[0]}, [r9], r1 - vld1.32 {d3[1]}, [r9] - - mov r9, r2 - - adr r12, vtbl_4x4_transpose - vld1.8 {q3}, [r12] - - vtbl.8 d4, {d0, d1}, d6 - vtbl.8 d5, {d0, d1}, d7 - vtbl.8 d0, {d2, d3}, d6 - vtbl.8 d1, {d2, d3}, d7 - - @ TODO: rework shuffle above to write - @ out with 4 instead of 8 writes - vst1.32 {d4[0]}, [r9], r3 - vst1.32 {d4[1]}, [r9], r3 - vst1.32 {d5[0]}, [r9], r3 - vst1.32 {d5[1]}, [r9] - - add r9, r2, #4 - vst1.32 {d0[0]}, [r9], r3 - vst1.32 {d0[1]}, [r9], r3 - vst1.32 {d1[0]}, [r9], r3 - vst1.32 {d1[1]}, [r9] - - add r0, #4 @ src += 4 - add r2, r3, lsl #2 @ dst += 4 * dst_stride - subs r8, #4 @ w -= 4 - beq Ldone - - @ some residual, check to see if it includes a 2x8 block, - @ or less - cmp r8, #2 - blt Lblock_1x8 - -Lblock_2x8: - mov r9, r0 - vld1.16 {d0[0]}, [r9], r1 - vld1.16 {d1[0]}, [r9], r1 - vld1.16 {d0[1]}, [r9], r1 - vld1.16 {d1[1]}, [r9], r1 - vld1.16 {d0[2]}, [r9], r1 - vld1.16 {d1[2]}, [r9], r1 - vld1.16 {d0[3]}, [r9], r1 - vld1.16 {d1[3]}, [r9] - - vtrn.8 d0, d1 - - mov r9, r2 - - vst1.64 {d0}, [r9], r3 - vst1.64 {d1}, [r9] - - add r0, #2 @ src += 2 - add r2, r3, lsl #1 @ dst += 2 * dst_stride - subs r8, #2 @ w -= 2 - beq Ldone - -Lblock_1x8: - vld1.8 {d0[0]}, [r0], r1 - vld1.8 {d0[1]}, [r0], r1 - vld1.8 {d0[2]}, [r0], r1 - vld1.8 {d0[3]}, [r0], r1 - vld1.8 {d0[4]}, [r0], r1 - vld1.8 {d0[5]}, [r0], r1 - vld1.8 {d0[6]}, [r0], r1 - vld1.8 {d0[7]}, [r0] - - vst1.64 {d0}, [r2] - -Ldone: - - pop {r4,r8,r9,pc} - -vtbl_4x4_transpose: - .byte 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 - -@ void ReverseLineUV_NEON (const uint8* src, -@ uint8* dst_a, -@ uint8* dst_b, -@ int width) -@ r0 const uint8* src -@ r1 uint8* dst_a -@ r2 uint8* dst_b -@ r3 width -ReverseLineUV_NEON: - - @ compute where to start writing destination - add r1, r1, r3 @ dst_a + width - add r2, r2, r3 @ dst_b + width - - @ work on input segments that are multiples of 16, but - @ width that has been passed is output segments, half - @ the size of input. - lsrs r12, r3, #3 - - beq Lline_residuals_di - - @ the output is written in to two blocks. - mov r12, #-8 - - @ back of destination by the size of the register that is - @ going to be reversed - sub r1, r1, #8 - sub r2, r2, #8 - - @ the loop needs to run on blocks of 8. what will be left - @ over is either a negative number, the residuals that need - @ to be done, or 0. if this isn't subtracted off here the - @ loop will run one extra time. - sub r3, r3, #8 - -Lsegments_of_8_di: - vld2.8 {d0, d1}, [r0]! @ src += 16 - - @ reverse the bytes in the 64 bit segments - vrev64.8 q0, q0 - - vst1.8 {d0}, [r1], r12 @ dst_a -= 8 - vst1.8 {d1}, [r2], r12 @ dst_b -= 8 - - subs r3, r3, #8 - bge Lsegments_of_8_di - - @ add 8 back to the counter. if the result is 0 there is no - @ residuals so return - adds r3, r3, #8 - bxeq lr - - add r1, r1, #8 - add r2, r2, #8 - -Lline_residuals_di: - - mov r12, #-1 - - sub r1, r1, #1 - sub r2, r2, #1 - -@ do this in neon registers as per -@ http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/ -Lsegments_of_1: - vld2.8 {d0[0], d1[0]}, [r0]! 
@ src += 2 - - vst1.8 {d0[0]}, [r1], r12 @ dst_a -= 1 - vst1.8 {d1[0]}, [r2], r12 @ dst_b -= 1 - - subs r3, r3, #1 - bgt Lsegments_of_1 - - bx lr - -@ void TransposeUVWx8_NEON (const uint8* src, int src_stride, -@ uint8* dst_a, int dst_stride_a, -@ uint8* dst_b, int dst_stride_b, -@ int width) -@ r0 const uint8* src -@ r1 int src_stride -@ r2 uint8* dst_a -@ r3 int dst_stride_a -@ stack uint8* dst_b -@ stack int dst_stride_b -@ stack int width -TransposeUVWx8_NEON: - push {r4-r9,lr} - - ldr r4, [sp, #28] @ dst_b - ldr r5, [sp, #32] @ dst_stride_b - ldr r8, [sp, #36] @ width - @ loops are on blocks of 8. loop will stop when - @ counter gets to or below 0. starting the counter - @ at w-8 allow for this - sub r8, #8 - -@ handle 8x8 blocks. this should be the majority of the plane -Lloop_8x8_di: - mov r9, r0 - - vld2.8 {d0, d1}, [r9], r1 - vld2.8 {d2, d3}, [r9], r1 - vld2.8 {d4, d5}, [r9], r1 - vld2.8 {d6, d7}, [r9], r1 - vld2.8 {d16, d17}, [r9], r1 - vld2.8 {d18, d19}, [r9], r1 - vld2.8 {d20, d21}, [r9], r1 - vld2.8 {d22, d23}, [r9] - - vtrn.8 q1, q0 - vtrn.8 q3, q2 - vtrn.8 q9, q8 - vtrn.8 q11, q10 - - vtrn.16 q1, q3 - vtrn.16 q0, q2 - vtrn.16 q9, q11 - vtrn.16 q8, q10 - - vtrn.32 q1, q9 - vtrn.32 q0, q8 - vtrn.32 q3, q11 - vtrn.32 q2, q10 - - vrev16.8 q0, q0 - vrev16.8 q1, q1 - vrev16.8 q2, q2 - vrev16.8 q3, q3 - vrev16.8 q8, q8 - vrev16.8 q9, q9 - vrev16.8 q10, q10 - vrev16.8 q11, q11 - - mov r9, r2 - - vst1.8 {d2}, [r9], r3 - vst1.8 {d0}, [r9], r3 - vst1.8 {d6}, [r9], r3 - vst1.8 {d4}, [r9], r3 - vst1.8 {d18}, [r9], r3 - vst1.8 {d16}, [r9], r3 - vst1.8 {d22}, [r9], r3 - vst1.8 {d20}, [r9] - - mov r9, r4 - - vst1.8 {d3}, [r9], r5 - vst1.8 {d1}, [r9], r5 - vst1.8 {d7}, [r9], r5 - vst1.8 {d5}, [r9], r5 - vst1.8 {d19}, [r9], r5 - vst1.8 {d17}, [r9], r5 - vst1.8 {d23}, [r9], r5 - vst1.8 {d21}, [r9] - - add r0, #8*2 @ src += 8*2 - add r2, r3, lsl #3 @ dst_a += 8 * dst_stride_a - add r4, r5, lsl #3 @ dst_b += 8 * dst_stride_b - subs r8, #8 @ w -= 8 - bge Lloop_8x8_di - - @ add 8 back to counter. if the result is 0 there are - @ no residuals. 
- adds r8, #8 - beq Ldone_di - - @ some residual, so between 1 and 7 lines left to transpose - cmp r8, #2 - blt Lblock_1x8_di - - cmp r8, #4 - blt Lblock_2x8_di - -@ TODO(frkoenig) : clean this up -Lblock_4x8_di: - mov r9, r0 - vld1.64 {d0}, [r9], r1 - vld1.64 {d1}, [r9], r1 - vld1.64 {d2}, [r9], r1 - vld1.64 {d3}, [r9], r1 - vld1.64 {d4}, [r9], r1 - vld1.64 {d5}, [r9], r1 - vld1.64 {d6}, [r9], r1 - vld1.64 {d7}, [r9] - - adr r12, vtbl_4x4_transpose_di - vld1.8 {q15}, [r12] - - vtrn.8 q0, q1 - vtrn.8 q2, q3 - - vtbl.8 d16, {d0, d1}, d30 - vtbl.8 d17, {d0, d1}, d31 - vtbl.8 d18, {d2, d3}, d30 - vtbl.8 d19, {d2, d3}, d31 - vtbl.8 d20, {d4, d5}, d30 - vtbl.8 d21, {d4, d5}, d31 - vtbl.8 d22, {d6, d7}, d30 - vtbl.8 d23, {d6, d7}, d31 - - mov r9, r2 - - vst1.32 {d16[0]}, [r9], r3 - vst1.32 {d16[1]}, [r9], r3 - vst1.32 {d17[0]}, [r9], r3 - vst1.32 {d17[1]}, [r9], r3 - - add r9, r2, #4 - vst1.32 {d20[0]}, [r9], r3 - vst1.32 {d20[1]}, [r9], r3 - vst1.32 {d21[0]}, [r9], r3 - vst1.32 {d21[1]}, [r9] - - mov r9, r4 - - vst1.32 {d18[0]}, [r9], r5 - vst1.32 {d18[1]}, [r9], r5 - vst1.32 {d19[0]}, [r9], r5 - vst1.32 {d19[1]}, [r9], r5 - - add r9, r4, #4 - vst1.32 {d22[0]}, [r9], r5 - vst1.32 {d22[1]}, [r9], r5 - vst1.32 {d23[0]}, [r9], r5 - vst1.32 {d23[1]}, [r9] - - add r0, #4*2 @ src += 4 * 2 - add r2, r3, lsl #2 @ dst_a += 4 * dst_stride_a - add r4, r5, lsl #2 @ dst_b += 4 * dst_stride_b - subs r8, #4 @ w -= 4 - beq Ldone_di - - @ some residual, check to see if it includes a 2x8 block, - @ or less - cmp r8, #2 - blt Lblock_1x8_di - -Lblock_2x8_di: - mov r9, r0 - vld2.16 {d0[0], d2[0]}, [r9], r1 - vld2.16 {d1[0], d3[0]}, [r9], r1 - vld2.16 {d0[1], d2[1]}, [r9], r1 - vld2.16 {d1[1], d3[1]}, [r9], r1 - vld2.16 {d0[2], d2[2]}, [r9], r1 - vld2.16 {d1[2], d3[2]}, [r9], r1 - vld2.16 {d0[3], d2[3]}, [r9], r1 - vld2.16 {d1[3], d3[3]}, [r9] - - vtrn.8 d0, d1 - vtrn.8 d2, d3 - - mov r9, r2 - - vst1.64 {d0}, [r9], r3 - vst1.64 {d2}, [r9] - - mov r9, r4 - - vst1.64 {d1}, [r9], r5 - vst1.64 {d3}, [r9] - - add r0, #2*2 @ src += 2 * 2 - add r2, r3, lsl #1 @ dst_a += 2 * dst_stride_a - add r4, r5, lsl #1 @ dst_a += 2 * dst_stride_a - subs r8, #2 @ w -= 2 - beq Ldone_di - -Lblock_1x8_di: - vld2.8 {d0[0], d1[0]}, [r0], r1 - vld2.8 {d0[1], d1[1]}, [r0], r1 - vld2.8 {d0[2], d1[2]}, [r0], r1 - vld2.8 {d0[3], d1[3]}, [r0], r1 - vld2.8 {d0[4], d1[4]}, [r0], r1 - vld2.8 {d0[5], d1[5]}, [r0], r1 - vld2.8 {d0[6], d1[6]}, [r0], r1 - vld2.8 {d0[7], d1[7]}, [r0] - - vst1.64 {d0}, [r2] - vst1.64 {d1}, [r4] - -Ldone_di: - pop {r4-r9, pc} - -vtbl_4x4_transpose_di: - .byte 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 diff --git a/unit_test/rotate_test.cc b/unit_test/rotate_test.cc index 1c295b086..784b7d249 100644 --- a/unit_test/rotate_test.cc +++ b/unit_test/rotate_test.cc @@ -34,16 +34,12 @@ TEST_F(libyuvTest, Transpose) { for (iw = 8; iw < _rotate_max_w && !err; ++iw) for (ih = 8; ih < _rotate_max_h && !err; ++ih) { int i; - uint8 *input; - uint8 *output_1; - uint8 *output_2; - ow = ih; oh = iw; - input = static_cast(calloc(iw * ih, sizeof(uint8))); - output_1 = static_cast(calloc(ow * oh, sizeof(uint8))); - output_2 = static_cast(calloc(iw * ih, sizeof(uint8))); + align_buffer_16(input, iw * ih) + align_buffer_16(output_1, ow * oh) + align_buffer_16(output_2, iw * ih) for (i = 0; i < (iw * ih); ++i) input[i] = i; @@ -67,9 +63,9 @@ TEST_F(libyuvTest, Transpose) { print_array(output_2, iw, ih); } - free(input); - free(output_1); - free(output_2); + free_aligned_buffer_16(input) + free_aligned_buffer_16(output_1) + 
free_aligned_buffer_16(output_2) } EXPECT_EQ(0, err); @@ -82,18 +78,15 @@ TEST_F(libyuvTest, TransposeUV) { for (iw = 16; iw < _rotate_max_w && !err; iw += 2) for (ih = 8; ih < _rotate_max_h && !err; ++ih) { int i; - uint8 *input; - uint8 *output_a1, *output_b1; - uint8 *output_a2, *output_b2; ow = ih; oh = iw >> 1; - input = static_cast(calloc(iw * ih, sizeof(uint8))); - output_a1 = static_cast(calloc(ow * oh, sizeof(uint8))); - output_b1 = static_cast(calloc(ow * oh, sizeof(uint8))); - output_a2 = static_cast(calloc(iw * ih, sizeof(uint8))); - output_b2 = static_cast(calloc(iw * ih, sizeof(uint8))); + align_buffer_16(input, iw * ih) + align_buffer_16(output_a1, ow * oh) + align_buffer_16(output_b1, ow * oh) + align_buffer_16(output_a2, iw * ih) + align_buffer_16(output_b2, iw * ih) for (i = 0; i < (iw * ih); i += 2) { input[i] = i >> 1; @@ -125,11 +118,11 @@ TEST_F(libyuvTest, TransposeUV) { print_array(output_b2, oh, ow); } - free(input); - free(output_a1); - free(output_b1); - free(output_a2); - free(output_b2); + free_aligned_buffer_16(input) + free_aligned_buffer_16(output_a1) + free_aligned_buffer_16(output_b1) + free_aligned_buffer_16(output_a2) + free_aligned_buffer_16(output_b2) } EXPECT_EQ(0, err); @@ -142,20 +135,15 @@ TEST_F(libyuvTest, RotatePlane90) { for (iw = 8; iw < _rotate_max_w && !err; ++iw) for (ih = 8; ih < _rotate_max_h && !err; ++ih) { int i; - uint8 *input; - uint8 *output_0; - uint8 *output_90; - uint8 *output_180; - uint8 *output_270; ow = ih; oh = iw; - input = static_cast(calloc(iw * ih, sizeof(uint8))); - output_0 = static_cast(calloc(iw * ih, sizeof(uint8))); - output_90 = static_cast(calloc(ow * oh, sizeof(uint8))); - output_180 = static_cast(calloc(iw * ih, sizeof(uint8))); - output_270 = static_cast(calloc(ow * oh, sizeof(uint8))); + align_buffer_16(input, iw * ih) + align_buffer_16(output_0, iw * ih) + align_buffer_16(output_90, ow * oh) + align_buffer_16(output_180, iw * ih) + align_buffer_16(output_270, ow * oh) for (i = 0; i < (iw * ih); ++i) input[i] = i; @@ -187,11 +175,11 @@ TEST_F(libyuvTest, RotatePlane90) { print_array(output_0, iw, ih); } - free(input); - free(output_0); - free(output_90); - free(output_180); - free(output_270); + free_aligned_buffer_16(input) + free_aligned_buffer_16(output_0) + free_aligned_buffer_16(output_90) + free_aligned_buffer_16(output_180) + free_aligned_buffer_16(output_270) } EXPECT_EQ(0, err); @@ -204,24 +192,17 @@ TEST_F(libyuvTest, RotateUV90) { for (iw = 16; iw < _rotate_max_w && !err; iw += 2) for (ih = 8; ih < _rotate_max_h && !err; ++ih) { int i; - uint8 *input; - uint8 *output_0_u; - uint8 *output_0_v; - uint8 *output_90_u; - uint8 *output_90_v; - uint8 *output_180_u; - uint8 *output_180_v; ow = ih; oh = iw >> 1; - input = static_cast(calloc(iw * ih, sizeof(uint8))); - output_0_u = static_cast(calloc(ow * oh, sizeof(uint8))); - output_0_v = static_cast(calloc(ow * oh, sizeof(uint8))); - output_90_u = static_cast(calloc(ow * oh, sizeof(uint8))); - output_90_v = static_cast(calloc(ow * oh, sizeof(uint8))); - output_180_u = static_cast(calloc(ow * oh, sizeof(uint8))); - output_180_v = static_cast(calloc(ow * oh, sizeof(uint8))); + align_buffer_16(input, iw * ih) + align_buffer_16(output_0_u, ow * oh) + align_buffer_16(output_0_v, ow * oh) + align_buffer_16(output_90_u, ow * oh) + align_buffer_16(output_90_v, ow * oh) + align_buffer_16(output_180_u, ow * oh) + align_buffer_16(output_180_v, ow * oh) for (i = 0; i < (iw * ih); i += 2) { input[i] = i >> 1; @@ -266,13 +247,13 @@ TEST_F(libyuvTest, RotateUV90) { 
print_array(output_0_v, oh, ow); } - free(input); - free(output_0_u); - free(output_0_v); - free(output_90_u); - free(output_90_v); - free(output_180_u); - free(output_180_v); + free_aligned_buffer_16(input) + free_aligned_buffer_16(output_0_u) + free_aligned_buffer_16(output_0_v) + free_aligned_buffer_16(output_90_u) + free_aligned_buffer_16(output_90_v) + free_aligned_buffer_16(output_180_u) + free_aligned_buffer_16(output_180_v) } EXPECT_EQ(0, err); @@ -285,24 +266,17 @@ TEST_F(libyuvTest, RotateUV180) { for (iw = 16; iw < _rotate_max_w && !err; iw += 2) for (ih = 8; ih < _rotate_max_h && !err; ++ih) { int i; - uint8 *input; - uint8 *output_0_u; - uint8 *output_0_v; - uint8 *output_90_u; - uint8 *output_90_v; - uint8 *output_180_u; - uint8 *output_180_v; ow = iw >> 1; oh = ih; - input = static_cast(calloc(iw * ih, sizeof(uint8))); - output_0_u = static_cast(calloc(ow * oh, sizeof(uint8))); - output_0_v = static_cast(calloc(ow * oh, sizeof(uint8))); - output_90_u = static_cast(calloc(ow * oh, sizeof(uint8))); - output_90_v = static_cast(calloc(ow * oh, sizeof(uint8))); - output_180_u = static_cast(calloc(ow * oh, sizeof(uint8))); - output_180_v = static_cast(calloc(ow * oh, sizeof(uint8))); + align_buffer_16(input, iw * ih) + align_buffer_16(output_0_u, ow * oh) + align_buffer_16(output_0_v, ow * oh) + align_buffer_16(output_90_u, ow * oh) + align_buffer_16(output_90_v, ow * oh) + align_buffer_16(output_180_u, ow * oh) + align_buffer_16(output_180_v, ow * oh) for (i = 0; i < (iw * ih); i += 2) { input[i] = i >> 1; @@ -347,13 +321,13 @@ TEST_F(libyuvTest, RotateUV180) { print_array(output_0_v, ow, oh); } - free(input); - free(output_0_u); - free(output_0_v); - free(output_90_u); - free(output_90_v); - free(output_180_u); - free(output_180_v); + free_aligned_buffer_16(input) + free_aligned_buffer_16(output_0_u) + free_aligned_buffer_16(output_0_v) + free_aligned_buffer_16(output_90_u) + free_aligned_buffer_16(output_90_v) + free_aligned_buffer_16(output_180_u) + free_aligned_buffer_16(output_180_v) } EXPECT_EQ(0, err); @@ -366,24 +340,17 @@ TEST_F(libyuvTest, RotateUV270) { for (iw = 16; iw < _rotate_max_w && !err; iw += 2) for (ih = 8; ih < _rotate_max_h && !err; ++ih) { int i; - uint8 *input; - uint8 *output_0_u; - uint8 *output_0_v; - uint8 *output_270_u; - uint8 *output_270_v; - uint8 *output_180_u; - uint8 *output_180_v; ow = ih; oh = iw >> 1; - input = static_cast(calloc(iw * ih, sizeof(uint8))); - output_0_u = static_cast(calloc(ow * oh, sizeof(uint8))); - output_0_v = static_cast(calloc(ow * oh, sizeof(uint8))); - output_270_u = static_cast(calloc(ow * oh, sizeof(uint8))); - output_270_v = static_cast(calloc(ow * oh, sizeof(uint8))); - output_180_u = static_cast(calloc(ow * oh, sizeof(uint8))); - output_180_v = static_cast(calloc(ow * oh, sizeof(uint8))); + align_buffer_16(input, iw * ih) + align_buffer_16(output_0_u, ow * oh) + align_buffer_16(output_0_v, ow * oh) + align_buffer_16(output_270_u, ow * oh) + align_buffer_16(output_270_v, ow * oh) + align_buffer_16(output_180_u, ow * oh) + align_buffer_16(output_180_v, ow * oh) for (i = 0; i < (iw * ih); i += 2) { input[i] = i >> 1; @@ -429,13 +396,13 @@ TEST_F(libyuvTest, RotateUV270) { print_array(output_0_v, oh, ow); } - free(input); - free(output_0_u); - free(output_0_v); - free(output_270_u); - free(output_270_v); - free(output_180_u); - free(output_180_v); + free_aligned_buffer_16(input) + free_aligned_buffer_16(output_0_u) + free_aligned_buffer_16(output_0_v) + free_aligned_buffer_16(output_270_u) + 
free_aligned_buffer_16(output_270_v) + free_aligned_buffer_16(output_180_u) + free_aligned_buffer_16(output_180_v) } EXPECT_EQ(0, err); @@ -448,16 +415,13 @@ TEST_F(libyuvTest, RotatePlane180) { for (iw = 8; iw < _rotate_max_w && !err; ++iw) for (ih = 8; ih < _rotate_max_h && !err; ++ih) { int i; - uint8 *input; - uint8 *output_0; - uint8 *output_180; ow = iw; oh = ih; - input = static_cast(calloc(iw * ih, sizeof(uint8))); - output_0 = static_cast(calloc(iw * ih, sizeof(uint8))); - output_180 = static_cast(calloc(iw * ih, sizeof(uint8))); + align_buffer_16(input, iw * ih) + align_buffer_16(output_0, iw * ih) + align_buffer_16(output_180, iw * ih) for (i = 0; i < (iw * ih); ++i) input[i] = i; @@ -481,9 +445,9 @@ TEST_F(libyuvTest, RotatePlane180) { print_array(output_0, iw, ih); } - free(input); - free(output_0); - free(output_180); + free_aligned_buffer_16(input) + free_aligned_buffer_16(output_0) + free_aligned_buffer_16(output_180) } EXPECT_EQ(0, err); @@ -496,20 +460,15 @@ TEST_F(libyuvTest, RotatePlane270) { for (iw = 8; iw < _rotate_max_w && !err; ++iw) for (ih = 8; ih < _rotate_max_h && !err; ++ih) { int i; - uint8 *input; - uint8 *output_0; - uint8 *output_90; - uint8 *output_180; - uint8 *output_270; ow = ih; oh = iw; - input = static_cast(calloc(iw * ih, sizeof(uint8))); - output_0 = static_cast(calloc(iw * ih, sizeof(uint8))); - output_90 = static_cast(calloc(ow * oh, sizeof(uint8))); - output_180 = static_cast(calloc(iw * ih, sizeof(uint8))); - output_270 = static_cast(calloc(ow * oh, sizeof(uint8))); + align_buffer_16(input, iw * ih) + align_buffer_16(output_0, iw * ih) + align_buffer_16(output_90, ow * oh) + align_buffer_16(output_180, iw * ih) + align_buffer_16(output_270, ow * oh) for (i = 0; i < (iw * ih); ++i) input[i] = i; @@ -541,11 +500,11 @@ TEST_F(libyuvTest, RotatePlane270) { print_array(output_0, iw, ih); } - free(input); - free(output_0); - free(output_90); - free(output_180); - free(output_270); + free_aligned_buffer_16(input) + free_aligned_buffer_16(output_0) + free_aligned_buffer_16(output_90) + free_aligned_buffer_16(output_180) + free_aligned_buffer_16(output_270) } EXPECT_EQ(0, err); @@ -558,15 +517,13 @@ TEST_F(libyuvTest, RotatePlane90and270) { for (iw = 16; iw < _rotate_max_w && !err; iw += 4) for (ih = 16; ih < _rotate_max_h && !err; ih += 4) { int i; - uint8 *input; - uint8 *output_0; - uint8 *output_90; + ow = ih; oh = iw; - input = static_cast(calloc(iw * ih, sizeof(uint8))); - output_0 = static_cast(calloc(iw * ih, sizeof(uint8))); - output_90 = static_cast(calloc(ow * oh, sizeof(uint8))); + align_buffer_16(input, iw * ih) + align_buffer_16(output_0, iw * ih) + align_buffer_16(output_90, ow * oh) for (i = 0; i < (iw * ih); ++i) input[i] = i; @@ -590,9 +547,9 @@ TEST_F(libyuvTest, RotatePlane90and270) { print_array(output_0, iw, ih); } - free(input); - free(output_0); - free(output_90); + free_aligned_buffer_16(input) + free_aligned_buffer_16(output_0) + free_aligned_buffer_16(output_90) } EXPECT_EQ(0, err); @@ -605,15 +562,13 @@ TEST_F(libyuvTest, RotatePlane90Pitch) { for (iw = 16; iw < _rotate_max_w && !err; iw += 4) for (ih = 16; ih < _rotate_max_h && !err; ih += 4) { int i; - uint8 *input; - uint8 *output_0; - uint8 *output_90; + int ow = ih; int oh = iw; - input = static_cast(calloc(iw * ih, sizeof(uint8))); - output_0 = static_cast(calloc(iw * ih, sizeof(uint8))); - output_90 = static_cast(calloc(ow * oh, sizeof(uint8))); + align_buffer_16(input, iw * ih) + align_buffer_16(output_0, iw * ih) + align_buffer_16(output_90, ow * oh) for (i = 0; i < 
(iw * ih); ++i) input[i] = i; @@ -649,9 +604,9 @@ TEST_F(libyuvTest, RotatePlane90Pitch) { print_array(output_0, iw, ih); } - free(input); - free(output_0); - free(output_90); + free_aligned_buffer_16(input) + free_aligned_buffer_16(output_0) + free_aligned_buffer_16(output_90) } EXPECT_EQ(0, err); @@ -664,16 +619,13 @@ TEST_F(libyuvTest, RotatePlane270Pitch) { for (iw = 16; iw < _rotate_max_w && !err; iw += 4) for (ih = 16; ih < _rotate_max_h && !err; ih += 4) { int i; - uint8 *input; - uint8 *output_0; - uint8 *output_270; ow = ih; oh = iw; - input = static_cast(calloc(iw * ih, sizeof(uint8))); - output_0 = static_cast(calloc(iw * ih, sizeof(uint8))); - output_270 = static_cast(calloc(ow * oh, sizeof(uint8))); + align_buffer_16(input, iw * ih) + align_buffer_16(output_0, iw * ih) + align_buffer_16(output_270, ow * oh) for (i = 0; i < (iw * ih); ++i) input[i] = i; @@ -709,9 +661,9 @@ TEST_F(libyuvTest, RotatePlane270Pitch) { print_array(output_0, iw, ih); } - free(input); - free(output_0); - free(output_270); + free_aligned_buffer_16(input) + free_aligned_buffer_16(output_0) + free_aligned_buffer_16(output_270) } EXPECT_EQ(0, err); @@ -719,10 +671,6 @@ TEST_F(libyuvTest, RotatePlane270Pitch) { TEST_F(libyuvTest, I420Rotate90) { int err = 0; - uint8 *orig_y, *orig_u, *orig_v; - uint8 *ro0_y, *ro0_u, *ro0_v; - uint8 *ro90_y, *ro90_u, *ro90_v; - uint8 *ro270_y, *ro270_u, *ro270_v; int yw = 1024; int yh = 768; @@ -737,21 +685,21 @@ TEST_F(libyuvTest, I420Rotate90) { srandom(time(NULL)); - orig_y = static_cast(calloc(y_plane_size, sizeof(uint8))); - orig_u = static_cast(calloc(uv_plane_size, sizeof(uint8))); - orig_v = static_cast(calloc(uv_plane_size, sizeof(uint8))); + align_buffer_16(orig_y, y_plane_size) + align_buffer_16(orig_u, uv_plane_size) + align_buffer_16(orig_v, uv_plane_size) - ro0_y = static_cast(calloc(y_plane_size, sizeof(uint8))); - ro0_u = static_cast(calloc(uv_plane_size, sizeof(uint8))); - ro0_v = static_cast(calloc(uv_plane_size, sizeof(uint8))); + align_buffer_16(ro0_y, y_plane_size) + align_buffer_16(ro0_u, uv_plane_size) + align_buffer_16(ro0_v, uv_plane_size) - ro90_y = static_cast(calloc(y_plane_size, sizeof(uint8))); - ro90_u = static_cast(calloc(uv_plane_size, sizeof(uint8))); - ro90_v = static_cast(calloc(uv_plane_size, sizeof(uint8))); + align_buffer_16(ro90_y, y_plane_size) + align_buffer_16(ro90_u, uv_plane_size) + align_buffer_16(ro90_v, uv_plane_size) - ro270_y = static_cast(calloc(y_plane_size, sizeof(uint8))); - ro270_u = static_cast(calloc(uv_plane_size, sizeof(uint8))); - ro270_v = static_cast(calloc(uv_plane_size, sizeof(uint8))); + align_buffer_16(ro270_y, y_plane_size) + align_buffer_16(ro270_u, uv_plane_size) + align_buffer_16(ro270_v, uv_plane_size) // fill image buffers with random data for (i = b; i < (yh + b); ++i) { @@ -816,28 +764,24 @@ TEST_F(libyuvTest, I420Rotate90) { ++err; } - free(orig_y); - free(orig_u); - free(orig_v); - free(ro0_y); - free(ro0_u); - free(ro0_v); - free(ro90_y); - free(ro90_u); - free(ro90_v); - free(ro270_y); - free(ro270_u); - free(ro270_v); + free_aligned_buffer_16(orig_y) + free_aligned_buffer_16(orig_u) + free_aligned_buffer_16(orig_v) + free_aligned_buffer_16(ro0_y) + free_aligned_buffer_16(ro0_u) + free_aligned_buffer_16(ro0_v) + free_aligned_buffer_16(ro90_y) + free_aligned_buffer_16(ro90_u) + free_aligned_buffer_16(ro90_v) + free_aligned_buffer_16(ro270_y) + free_aligned_buffer_16(ro270_u) + free_aligned_buffer_16(ro270_v) EXPECT_EQ(0, err); } TEST_F(libyuvTest, I420Rotate270) { int err = 0; - uint8 *orig_y, 
*orig_u, *orig_v; - uint8 *ro0_y, *ro0_u, *ro0_v; - uint8 *ro90_y, *ro90_u, *ro90_v; - uint8 *ro270_y, *ro270_u, *ro270_v; int yw = 1024; int yh = 768; @@ -852,21 +796,21 @@ TEST_F(libyuvTest, I420Rotate270) { srandom(time(NULL)); - orig_y = static_cast(calloc(y_plane_size, sizeof(uint8))); - orig_u = static_cast(calloc(uv_plane_size, sizeof(uint8))); - orig_v = static_cast(calloc(uv_plane_size, sizeof(uint8))); + align_buffer_16(orig_y, y_plane_size) + align_buffer_16(orig_u, uv_plane_size) + align_buffer_16(orig_v, uv_plane_size) - ro0_y = static_cast(calloc(y_plane_size, sizeof(uint8))); - ro0_u = static_cast(calloc(uv_plane_size, sizeof(uint8))); - ro0_v = static_cast(calloc(uv_plane_size, sizeof(uint8))); + align_buffer_16(ro0_y, y_plane_size) + align_buffer_16(ro0_u, uv_plane_size) + align_buffer_16(ro0_v, uv_plane_size) - ro90_y = static_cast(calloc(y_plane_size, sizeof(uint8))); - ro90_u = static_cast(calloc(uv_plane_size, sizeof(uint8))); - ro90_v = static_cast(calloc(uv_plane_size, sizeof(uint8))); + align_buffer_16(ro90_y, y_plane_size) + align_buffer_16(ro90_u, uv_plane_size) + align_buffer_16(ro90_v, uv_plane_size) - ro270_y = static_cast(calloc(y_plane_size, sizeof(uint8))); - ro270_u = static_cast(calloc(uv_plane_size, sizeof(uint8))); - ro270_v = static_cast(calloc(uv_plane_size, sizeof(uint8))); + align_buffer_16(ro270_y, y_plane_size) + align_buffer_16(ro270_u, uv_plane_size) + align_buffer_16(ro270_v, uv_plane_size) // fill image buffers with random data for (i = b; i < (yh + b); ++i) { @@ -931,27 +875,24 @@ TEST_F(libyuvTest, I420Rotate270) { ++err; } - free(orig_y); - free(orig_u); - free(orig_v); - free(ro0_y); - free(ro0_u); - free(ro0_v); - free(ro90_y); - free(ro90_u); - free(ro90_v); - free(ro270_y); - free(ro270_u); - free(ro270_v); + free_aligned_buffer_16(orig_y) + free_aligned_buffer_16(orig_u) + free_aligned_buffer_16(orig_v) + free_aligned_buffer_16(ro0_y) + free_aligned_buffer_16(ro0_u) + free_aligned_buffer_16(ro0_v) + free_aligned_buffer_16(ro90_y) + free_aligned_buffer_16(ro90_u) + free_aligned_buffer_16(ro90_v) + free_aligned_buffer_16(ro270_y) + free_aligned_buffer_16(ro270_u) + free_aligned_buffer_16(ro270_v) EXPECT_EQ(0, err); } TEST_F(libyuvTest, NV12ToI420Rotate90) { int err = 0; - uint8 *orig_y, *orig_uv; - uint8 *ro0_y, *ro0_u, *ro0_v; - uint8 *ro90_y, *ro90_u, *ro90_v; int yw = 1024; int yh = 768; @@ -966,16 +907,16 @@ TEST_F(libyuvTest, NV12ToI420Rotate90) { srandom(time(NULL)); - orig_y = static_cast(calloc(y_plane_size, sizeof(uint8))); - orig_uv = static_cast(calloc(o_uv_plane_size, sizeof(uint8))); + align_buffer_16(orig_y, y_plane_size) + align_buffer_16(orig_uv, o_uv_plane_size) - ro0_y = static_cast(calloc(y_plane_size, sizeof(uint8))); - ro0_u = static_cast(calloc(uv_plane_size, sizeof(uint8))); - ro0_v = static_cast(calloc(uv_plane_size, sizeof(uint8))); + align_buffer_16(ro0_y, y_plane_size) + align_buffer_16(ro0_u, uv_plane_size) + align_buffer_16(ro0_v, uv_plane_size) - ro90_y = static_cast(calloc(y_plane_size, sizeof(uint8))); - ro90_u = static_cast(calloc(uv_plane_size, sizeof(uint8))); - ro90_v = static_cast(calloc(uv_plane_size, sizeof(uint8))); + align_buffer_16(ro90_y, y_plane_size) + align_buffer_16(ro90_u, uv_plane_size) + align_buffer_16(ro90_v, uv_plane_size) // fill image buffers with random data for (i = b; i < (yh + b); ++i) { @@ -1036,23 +977,20 @@ TEST_F(libyuvTest, NV12ToI420Rotate90) { if (!zero_cnt) ++err; - free(orig_y); - free(orig_uv); - free(ro0_y); - free(ro0_u); - free(ro0_v); - free(ro90_y); - free(ro90_u); 
- free(ro90_v); + free_aligned_buffer_16(orig_y) + free_aligned_buffer_16(orig_uv) + free_aligned_buffer_16(ro0_y) + free_aligned_buffer_16(ro0_u) + free_aligned_buffer_16(ro0_v) + free_aligned_buffer_16(ro90_y) + free_aligned_buffer_16(ro90_u) + free_aligned_buffer_16(ro90_v) EXPECT_EQ(0, err); } TEST_F(libyuvTest, NV12ToI420Rotate270) { int err = 0; - uint8 *orig_y, *orig_uv; - uint8 *ro0_y, *ro0_u, *ro0_v; - uint8 *ro270_y, *ro270_u, *ro270_v; int yw = 1024; int yh = 768; @@ -1068,16 +1006,16 @@ TEST_F(libyuvTest, NV12ToI420Rotate270) { srandom(time(NULL)); - orig_y = static_cast(calloc(y_plane_size, sizeof(uint8))); - orig_uv = static_cast(calloc(o_uv_plane_size, sizeof(uint8))); + align_buffer_16(orig_y, y_plane_size) + align_buffer_16(orig_uv, o_uv_plane_size) - ro0_y = static_cast(calloc(y_plane_size, sizeof(uint8))); - ro0_u = static_cast(calloc(uv_plane_size, sizeof(uint8))); - ro0_v = static_cast(calloc(uv_plane_size, sizeof(uint8))); + align_buffer_16(ro0_y, y_plane_size) + align_buffer_16(ro0_u, uv_plane_size) + align_buffer_16(ro0_v, uv_plane_size) - ro270_y = static_cast(calloc(y_plane_size, sizeof(uint8))); - ro270_u = static_cast(calloc(uv_plane_size, sizeof(uint8))); - ro270_v = static_cast(calloc(uv_plane_size, sizeof(uint8))); + align_buffer_16(ro270_y, y_plane_size) + align_buffer_16(ro270_u, uv_plane_size) + align_buffer_16(ro270_v, uv_plane_size) // fill image buffers with random data for (i = b; i < (yh + b); ++i) { @@ -1138,23 +1076,20 @@ TEST_F(libyuvTest, NV12ToI420Rotate270) { if (!zero_cnt) ++err; - free(orig_y); - free(orig_uv); - free(ro0_y); - free(ro0_u); - free(ro0_v); - free(ro270_y); - free(ro270_u); - free(ro270_v); + free_aligned_buffer_16(orig_y) + free_aligned_buffer_16(orig_uv) + free_aligned_buffer_16(ro0_y) + free_aligned_buffer_16(ro0_u) + free_aligned_buffer_16(ro0_v) + free_aligned_buffer_16(ro270_y) + free_aligned_buffer_16(ro270_u) + free_aligned_buffer_16(ro270_v) EXPECT_EQ(0, err); } TEST_F(libyuvTest, NV12ToI420Rotate180) { int err = 0; - uint8 *orig_y, *orig_uv; - uint8 *ro0_y, *ro0_u, *ro0_v; - uint8 *ro180_y, *ro180_u, *ro180_v; int yw = 1024; int yh = 768; @@ -1170,16 +1105,16 @@ TEST_F(libyuvTest, NV12ToI420Rotate180) { srandom(time(NULL)); - orig_y = static_cast(calloc(y_plane_size, sizeof(uint8))); - orig_uv = static_cast(calloc(o_uv_plane_size, sizeof(uint8))); + align_buffer_16(orig_y, y_plane_size) + align_buffer_16(orig_uv, o_uv_plane_size) - ro0_y = static_cast(calloc(y_plane_size, sizeof(uint8))); - ro0_u = static_cast(calloc(uv_plane_size, sizeof(uint8))); - ro0_v = static_cast(calloc(uv_plane_size, sizeof(uint8))); + align_buffer_16(ro0_y, y_plane_size) + align_buffer_16(ro0_u, uv_plane_size) + align_buffer_16(ro0_v, uv_plane_size) - ro180_y = static_cast(calloc(y_plane_size, sizeof(uint8))); - ro180_u = static_cast(calloc(uv_plane_size, sizeof(uint8))); - ro180_v = static_cast(calloc(uv_plane_size, sizeof(uint8))); + align_buffer_16(ro180_y, y_plane_size) + align_buffer_16(ro180_u, uv_plane_size) + align_buffer_16(ro180_v, uv_plane_size) // fill image buffers with random data for (i = b; i < (yh + b); ++i) { @@ -1236,24 +1171,20 @@ TEST_F(libyuvTest, NV12ToI420Rotate180) { if (!zero_cnt) ++err; - free(orig_y); - free(orig_uv); - free(ro0_y); - free(ro0_u); - free(ro0_v); - free(ro180_y); - free(ro180_u); - free(ro180_v); + free_aligned_buffer_16(orig_y) + free_aligned_buffer_16(orig_uv) + free_aligned_buffer_16(ro0_y) + free_aligned_buffer_16(ro0_u) + free_aligned_buffer_16(ro0_v) + free_aligned_buffer_16(ro180_y) + 
free_aligned_buffer_16(ro180_u) + free_aligned_buffer_16(ro180_v) EXPECT_EQ(0, err); } TEST_F(libyuvTest, NV12ToI420RotateNegHeight90) { int y_err = 0, uv_err = 0; - uint8 *orig_y, *orig_uv; - uint8 *roa_y, *roa_u, *roa_v; - uint8 *rob_y, *rob_u, *rob_v; - uint8 *roc_y, *roc_u, *roc_v; int yw = 1024; int yh = 768; @@ -1268,20 +1199,20 @@ TEST_F(libyuvTest, NV12ToI420RotateNegHeight90) { srandom(time(NULL)); - orig_y = static_cast(calloc(y_plane_size, sizeof(uint8))); - orig_uv = static_cast(calloc(o_uv_plane_size, sizeof(uint8))); + align_buffer_16(orig_y, y_plane_size) + align_buffer_16(orig_uv, o_uv_plane_size) - roa_y = static_cast(calloc(y_plane_size, sizeof(uint8))); - roa_u = static_cast(calloc(uv_plane_size, sizeof(uint8))); - roa_v = static_cast(calloc(uv_plane_size, sizeof(uint8))); + align_buffer_16(roa_y, y_plane_size) + align_buffer_16(roa_u, uv_plane_size) + align_buffer_16(roa_v, uv_plane_size) - rob_y = static_cast(calloc(y_plane_size, sizeof(uint8))); - rob_u = static_cast(calloc(uv_plane_size, sizeof(uint8))); - rob_v = static_cast(calloc(uv_plane_size, sizeof(uint8))); + align_buffer_16(rob_y, y_plane_size) + align_buffer_16(rob_u, uv_plane_size) + align_buffer_16(rob_v, uv_plane_size) - roc_y = static_cast(calloc(y_plane_size, sizeof(uint8))); - roc_u = static_cast(calloc(uv_plane_size, sizeof(uint8))); - roc_v = static_cast(calloc(uv_plane_size, sizeof(uint8))); + align_buffer_16(roc_y, y_plane_size) + align_buffer_16(roc_u, uv_plane_size) + align_buffer_16(roc_v, uv_plane_size) // fill image buffers with random data for (i = b; i < (yh + b); ++i) { @@ -1382,26 +1313,23 @@ TEST_F(libyuvTest, NV12ToI420RotateNegHeight90) { print_array(roc_v, uv_st_0, uv_st_90); } - free(orig_y); - free(orig_uv); - free(roa_y); - free(roa_u); - free(roa_v); - free(rob_y); - free(rob_u); - free(rob_v); - free(roc_y); - free(roc_u); - free(roc_v); + free_aligned_buffer_16(orig_y) + free_aligned_buffer_16(orig_uv) + free_aligned_buffer_16(roa_y) + free_aligned_buffer_16(roa_u) + free_aligned_buffer_16(roa_v) + free_aligned_buffer_16(rob_y) + free_aligned_buffer_16(rob_u) + free_aligned_buffer_16(rob_v) + free_aligned_buffer_16(roc_y) + free_aligned_buffer_16(roc_u) + free_aligned_buffer_16(roc_v) EXPECT_EQ(0, y_err + uv_err); } TEST_F(libyuvTest, NV12ToI420RotateNegHeight180) { int y_err = 0, uv_err = 0; - uint8 *orig_y, *orig_uv; - uint8 *roa_y, *roa_u, *roa_v; - uint8 *rob_y, *rob_u, *rob_v; int yw = 1024; int yh = 768; @@ -1416,16 +1344,16 @@ TEST_F(libyuvTest, NV12ToI420RotateNegHeight180) { srandom(time(NULL)); - orig_y = static_cast(calloc(y_plane_size, sizeof(uint8))); - orig_uv = static_cast(calloc(o_uv_plane_size, sizeof(uint8))); + align_buffer_16(orig_y, y_plane_size) + align_buffer_16(orig_uv, o_uv_plane_size) - roa_y = static_cast(calloc(y_plane_size, sizeof(uint8))); - roa_u = static_cast(calloc(uv_plane_size, sizeof(uint8))); - roa_v = static_cast(calloc(uv_plane_size, sizeof(uint8))); + align_buffer_16(roa_y, y_plane_size) + align_buffer_16(roa_u, uv_plane_size) + align_buffer_16(roa_v, uv_plane_size) - rob_y = static_cast(calloc(y_plane_size, sizeof(uint8))); - rob_u = static_cast(calloc(uv_plane_size, sizeof(uint8))); - rob_v = static_cast(calloc(uv_plane_size, sizeof(uint8))); + align_buffer_16(rob_y, y_plane_size) + align_buffer_16(rob_u, uv_plane_size) + align_buffer_16(rob_v, uv_plane_size) // fill image buffers with random data for (i = b; i < (yh + b); ++i) { @@ -1506,14 +1434,14 @@ TEST_F(libyuvTest, NV12ToI420RotateNegHeight180) { print_array(rob_v, uv_st, uvh + (2 
* b)); } - free(orig_y); - free(orig_uv); - free(roa_y); - free(roa_u); - free(roa_v); - free(rob_y); - free(rob_u); - free(rob_v); + free_aligned_buffer_16(orig_y) + free_aligned_buffer_16(orig_uv) + free_aligned_buffer_16(roa_y) + free_aligned_buffer_16(roa_u) + free_aligned_buffer_16(roa_v) + free_aligned_buffer_16(rob_y) + free_aligned_buffer_16(rob_u) + free_aligned_buffer_16(rob_v) EXPECT_EQ(0, y_err + uv_err); } diff --git a/unit_test/scale_test.cc b/unit_test/scale_test.cc index 6399e71b0..dc9c8bfee 100644 --- a/unit_test/scale_test.cc +++ b/unit_test/scale_test.cc @@ -16,43 +16,6 @@ using namespace libyuv; -#define align_buffer_16(var, size) \ - uint8 *var; \ - uint8 *var##_mem; \ - var##_mem = reinterpret_cast(calloc(size+15, sizeof(uint8))); \ - var = reinterpret_cast \ - ((reinterpret_cast(var##_mem) + 15) & (~0x0f)); - -#define free_aligned_buffer_16(var) \ - free(var##_mem); \ - var = 0; - -#ifdef WIN32 - -#include -static double get_time() -{ - LARGE_INTEGER t, f; - QueryPerformanceCounter(&t); - QueryPerformanceFrequency(&f); - return double(t.QuadPart)/double(f.QuadPart); -} - -#else - -#include -#include - -static double get_time() -{ - struct timeval t; - struct timezone tzp; - gettimeofday(&t, &tzp); - return t.tv_sec + t.tv_usec*1e-6; -} - -#endif - static int TestFilter(int src_width, int src_height, int dst_width, int dst_height, FilterMode f) { diff --git a/unit_test/unit_test.h b/unit_test/unit_test.h index cac30c72a..43965b779 100644 --- a/unit_test/unit_test.h +++ b/unit_test/unit_test.h @@ -13,6 +13,43 @@ #include +#define align_buffer_16(var, size) \ + uint8 *var; \ + uint8 *var##_mem; \ + var##_mem = reinterpret_cast(calloc(size+15, sizeof(uint8))); \ + var = reinterpret_cast \ + ((reinterpret_cast(var##_mem) + 15) & (~0x0f)); + +#define free_aligned_buffer_16(var) \ + free(var##_mem); \ + var = 0; + +#ifdef WIN32 + +#include +static double get_time() +{ + LARGE_INTEGER t, f; + QueryPerformanceCounter(&t); + QueryPerformanceFrequency(&f); + return double(t.QuadPart)/double(f.QuadPart); +} + +#else + +#include +#include + +static double get_time() +{ + struct timeval t; + struct timezone tzp; + gettimeofday(&t, &tzp); + return t.tv_sec + t.tv_usec*1e-6; +} + +#endif + class libyuvTest : public ::testing::Test { protected: libyuvTest();
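
Reviewer notes on the patch above (these notes and the sketches below are commentary, not part of the diff).

On the rotate.cc hunks: the old code only took the NEON path when the width was a multiple of 8 or 16 and the source/destination pointers and strides met alignment requirements; because the new rotate_neon.cc kernels handle their own leftovers, the dispatch collapses to a bare CPU-flag test. A minimal C++ sketch of that selection, assuming the typedef and kernel declarations from rotate.cc (TransposeWx8_C is the assumed name of the portable fallback below the "} else" that the hunk does not show, and the cpu_id.h header name is likewise an assumption):

#include "libyuv/basic_types.h"
#include "libyuv/cpu_id.h"  // TestCpuFlag / kCpuHasNEON (header name assumed)

// Same signature as the rotate_wx8_func typedef in rotate.cc.
typedef void (*rotate_wx8_func)(const uint8*, int, uint8*, int, int);

void TransposeWx8_C(const uint8*, int, uint8*, int, int);     // portable kernel (assumed name)
void TransposeWx8_NEON(const uint8*, int, uint8*, int, int);  // from rotate_neon.cc

// Post-patch selection logic: a runtime CPU-flag check is enough, since
// TransposeWx8_NEON now deals with leftover columns itself, so the old
// width and alignment preconditions are gone.
static rotate_wx8_func ChooseTransposeWx8() {
#if defined(HAS_TRANSPOSE_WX8_NEON)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON)) {
    return TransposeWx8_NEON;
  }
#endif
  return TransposeWx8_C;
}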
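
On ReverseLine_NEON: the comments in the new kernel describe the leftover strategy that replaces those preconditions: reverse 16-byte blocks (vrev64.8 on two d registers, stored back in swapped order), then byte pairs, then at most one trailing byte. For readers following the assembly, here is a plain C++ reference with the same three-stage block structure; it is illustrative only and not part of the patch.

#include <stdint.h>

// Reference mirror of ReverseLine_NEON's structure: bulk 16-byte chunks
// ("segments_of_16"), then 2-byte pairs ("segments_of_2"), then a possible
// single byte ("segment_of_1").  dst starts at the right edge of the output
// and walks left, exactly as the assembly does with its negative post-
// increments (r3 = -24 in the 16-byte stage, -3 in the pair stage).
static void ReverseLine_Reference(const uint8_t* src, uint8_t* dst, int width) {
  dst += width;                     // one past the last output byte
  int n = width;
  while (n >= 16) {                 // 16 bytes per iteration
    for (int i = 0; i < 16; ++i) {
      dst[-1 - i] = src[i];
    }
    src += 16;
    dst -= 16;
    n -= 16;
  }
  while (n >= 2) {                  // leftover pairs
    dst[-1] = src[0];
    dst[-2] = src[1];
    src += 2;
    dst -= 2;
    n -= 2;
  }
  if (n == 1) {                     // final odd byte
    dst[-1] = src[0];
  }
}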
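
On the unit-test changes: the align_buffer_16 / free_aligned_buffer_16 macros and the #include lines that move from scale_test.cc into unit_test.h have lost their angle-bracketed parts in this rendering of the diff (the casts read "reinterpret_cast(...)" and the includes are blank). A hedged reconstruction of what the macros plausibly expand to, inferred from the "+ 15" / "& ~0x0f" rounding and the calloc/free pairing; the cast target types and the integer type used for the pointer arithmetic are my reading, not a verbatim copy:

#include <stdint.h>              // uintptr_t (integer type assumed; original may use intptr_t)
#include <stdlib.h>              // calloc / free
#include "libyuv/basic_types.h"  // uint8

// Over-allocate by 15 bytes, then round the pointer up to the next 16-byte
// boundary.  var##_mem keeps the original allocation so it can be freed;
// var is the aligned pointer the tests actually use.
#define align_buffer_16(var, size)                                         \
  uint8* var;                                                               \
  uint8* var##_mem;                                                         \
  var##_mem = reinterpret_cast<uint8*>(calloc(size + 15, sizeof(uint8)));   \
  var = reinterpret_cast<uint8*>(                                           \
      (reinterpret_cast<uintptr_t>(var##_mem) + 15) & ~0x0f);

#define free_aligned_buffer_16(var) \
  free(var##_mem);                  \
  var = 0;

The blank #include lines guarding get_time() are presumably <windows.h> under WIN32 and <sys/time.h> plus <time.h> otherwise, and unit_test.h itself needs the gtest header for the ::testing::Test base class; those names are not recoverable from this view, so treat them as assumptions.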