From ed6edcab8bb0ff24e774bc5aba50351b44776dde Mon Sep 17 00:00:00 2001 From: "frkoenig@google.com" Date: Wed, 12 Oct 2011 21:37:43 +0000 Subject: [PATCH] Fixed image rotators. 90, 180, 270 rotate of array with a minimum size of 8x8. Also deinterleave on rotate for NV12/NV21 formats. Review URL: http://webrtc-codereview.appspot.com/195002 git-svn-id: http://libyuv.googlecode.com/svn/trunk@23 16f28f9a-4ce2-e073-06de-1de4eb20be90 --- libyuv_test.gyp | 1 + source/rotate.cc | 121 +++++ source/rotate.h | 45 ++ source/rotate_deinterleave.cc | 171 ++++++++ source/rotate_deinterleave_neon.s | 310 +++++++++++++ source/rotate_neon.s | 254 +++++++++++ unit_test/rotate_test.cc | 707 ++++++++++++++++++++++++++++++ unit_test/unit_test.cc | 4 +- unit_test/unit_test.h | 5 + 9 files changed, 1617 insertions(+), 1 deletion(-) create mode 100644 source/rotate.cc create mode 100644 source/rotate.h create mode 100644 source/rotate_deinterleave.cc create mode 100644 source/rotate_deinterleave_neon.s create mode 100644 source/rotate_neon.s create mode 100644 unit_test/rotate_test.cc diff --git a/libyuv_test.gyp b/libyuv_test.gyp index b132217a9..f67a85269 100644 --- a/libyuv_test.gyp +++ b/libyuv_test.gyp @@ -23,6 +23,7 @@ # sources 'unit_test/unit_test.cc', + 'unit_test/rotate_test.cc', ], # source 'conditions': [ ['OS=="linux"', { diff --git a/source/rotate.cc b/source/rotate.cc new file mode 100644 index 000000000..8075d47fb --- /dev/null +++ b/source/rotate.cc @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+#include "rotate.h"
+
+namespace libyuv {
+
+typedef void (*reverse_func)(const uint8*, uint8*, int);
+typedef void (*rotate_wx8func)(const uint8*, int, uint8*, int, int);
+typedef void (*rotate_wxhfunc)(const uint8*, int, uint8*, int, int, int);
+
+#ifdef __ARM_NEON__
+extern "C" {
+void ReverseLine_NEON(const uint8* src, uint8* dst, int width);
+void Transpose_wx8_NEON(const uint8* src, int src_pitch,
+                        uint8* dst, int dst_pitch, int width);
+}  // extern "C"
+#endif
+// Transposes one band of 8 source rows: dst[i][j] = src[j][i], i < w, j < 8.
+static void Transpose_wx8_C(const uint8* src, int src_pitch,
+                            uint8* dst, int dst_pitch,
+                            int w) {
+  int i, j;
+  for (i = 0; i < w; ++i)
+    for (j = 0; j < 8; ++j)
+      dst[i * dst_pitch + j] = src[j * src_pitch + i];
+}
+// Transposes a width x height block of any size (used for the leftover rows).
+static void Transpose_wxh_C(const uint8* src, int src_pitch,
+                            uint8* dst, int dst_pitch,
+                            int width, int height) {
+  int i, j;
+  for (i = 0; i < width; ++i)
+    for (j = 0; j < height; ++j)
+      dst[i * dst_pitch + j] = src[j * src_pitch + i];
+}
+// Transposes a width x height plane: dst[x * dst_pitch + y] = src[y * src_pitch + x].
+void Transpose(const uint8* src, int src_pitch,
+               uint8* dst, int dst_pitch,
+               int width, int height) {
+  int i = height;
+  rotate_wx8func Transpose_wx8;
+  rotate_wxhfunc Transpose_wxh;
+
+  // do processor detection here.
+#ifdef __ARM_NEON__
+  Transpose_wx8 = Transpose_wx8_NEON;
+  Transpose_wxh = Transpose_wxh_C;
+#else
+  Transpose_wx8 = Transpose_wx8_C;
+  Transpose_wxh = Transpose_wxh_C;
+#endif
+
+  // work across the source in bands of 8 rows; not entered when height < 8
+  while (i >= 8) {
+    Transpose_wx8(src, src_pitch, dst, dst_pitch, width);
+
+    src += 8 * src_pitch;
+    dst += 8;
+    i -= 8;
+  }
+  // the remaining 0..7 rows go through the generic routine
+// TODO(frkoenig): Have wx4 and maybe wx2
+  Transpose_wxh(src, src_pitch, dst, dst_pitch, width, i);
+}
+// Rotates 90 degrees clockwise: read the source bottom-up, then transpose.
+void Rotate90(const uint8* src, int src_pitch,
+              uint8* dst, int dst_pitch,
+              int width, int height) {
+  src += src_pitch*(height-1);
+  src_pitch = -src_pitch;
+
+  Transpose(src, src_pitch, dst, dst_pitch, width, height);
+}
+// Rotates 270 degrees clockwise: transpose while writing dst rows bottom-up.
+void Rotate270(const uint8* src, int src_pitch,
+               uint8* dst, int dst_pitch,
+               int width, int height) {
+  dst += dst_pitch*(width-1);
+  dst_pitch = -dst_pitch;
+
+  Transpose(src, src_pitch, dst, dst_pitch, width, height);
+}
+// Copies one row of width bytes with the byte order reversed.
+void ReverseLine_C(const uint8* src, uint8* dst, int width) {
+  int i;
+  for (i = 0; i < width; ++i)
+    dst[width-1 - i] = src[i];
+}
+// Rotates 180 degrees: every row is reversed and the rows are written bottom-up.
+void Rotate180(const uint8* src, int src_pitch,
+               uint8* dst, int dst_pitch,
+               int width, int height) {
+  int i;
+  reverse_func ReverseLine;
+
+  // do processor detection here.
+#ifdef __ARM_NEON__
+  ReverseLine = ReverseLine_NEON;
+#else
+  ReverseLine = ReverseLine_C;
+#endif
+
+  dst += dst_pitch*(height-1);
+
+  for (i = 0; i < height; ++i) {
+    ReverseLine(src, dst, width);
+
+    src += src_pitch;
+    dst -= dst_pitch;
+  }
+}
+
+}  // namespace libyuv
diff --git a/source/rotate.h b/source/rotate.h
new file mode 100644
index 000000000..d15ad6709
--- /dev/null
+++ b/source/rotate.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS.
All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef LIBYUV_SOURCE_ROTATE_H_
+#define LIBYUV_SOURCE_ROTATE_H_
+
+#include "basic_types.h"
+// Plane rotation helpers; width and height are the dimensions of src.
+namespace libyuv {
+void Rotate90(const uint8* src, int src_pitch,
+              uint8* dst, int dst_pitch,
+              int width, int height);
+void Rotate180(const uint8* src, int src_pitch,
+               uint8* dst, int dst_pitch,
+               int width, int height);
+void Rotate270(const uint8* src, int src_pitch,
+               uint8* dst, int dst_pitch,
+               int width, int height);
+// Same rotations for interleaved sources: even bytes go to dst_a, odd bytes to dst_b.
+void Rotate90_deinterleave(const uint8* src, int src_pitch,
+                           uint8* dst_a, int dst_pitch_a,
+                           uint8* dst_b, int dst_pitch_b,
+                           int width, int height);
+void Rotate180_deinterleave(const uint8* src, int src_pitch,
+                            uint8* dst_a, int dst_pitch_a,
+                            uint8* dst_b, int dst_pitch_b,
+                            int width, int height);
+void Rotate270_deinterleave(const uint8* src, int src_pitch,
+                            uint8* dst_a, int dst_pitch_a,
+                            uint8* dst_b, int dst_pitch_b,
+                            int width, int height);
+// Plain transpose: dst[x * dst_pitch + y] = src[y * src_pitch + x].
+void Transpose(const uint8* src, int src_pitch,
+               uint8* dst, int dst_pitch,
+               int width, int height);
+}  // namespace libyuv
+
+#endif  // LIBYUV_SOURCE_ROTATE_H_
diff --git a/source/rotate_deinterleave.cc b/source/rotate_deinterleave.cc
new file mode 100644
index 000000000..fcbb0d42b
--- /dev/null
+++ b/source/rotate_deinterleave.cc
@@ -0,0 +1,171 @@
+/*
+ * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "rotate.h"
+
+namespace libyuv {
+
+typedef void (*reverse_func)(const uint8*, uint8*, uint8*, int);
+typedef void (*rotate_wx8func)(const uint8*, int,
+                               uint8*, int,
+                               uint8*, int, int);
+typedef void (*rotate_wxhfunc)(const uint8*, int,
+                               uint8*, int,
+                               uint8*, int, int, int);
+
+#ifdef __ARM_NEON__
+extern "C" {
+void RestoreRegisters_NEON(unsigned long long *restore);
+void ReverseLine_di_NEON(const uint8* src,
+                         uint8* dst_a, uint8* dst_b,
+                         int width);
+void SaveRegisters_NEON(unsigned long long *store);
+void Transpose_di_wx8_NEON(const uint8* src, int src_pitch,
+                           uint8* dst_a, int dst_pitch_a,
+                           uint8* dst_b, int dst_pitch_b,
+                           int width);
+}  // extern "C"
+#endif
+// Transposes a band of 8 rows of w byte pairs, splitting the pairs into dst_a / dst_b.
+static void Transpose_di_wx8_C(const uint8* src, int src_pitch,
+                               uint8* dst_a, int dst_pitch_a,
+                               uint8* dst_b, int dst_pitch_b,
+                               int w) {
+  int i, j;
+  for (i = 0; i < w*2; i += 2)
+    for (j = 0; j < 8; ++j) {
+      dst_a[j + (i>>1)*dst_pitch_a] = src[i + j*src_pitch];
+      dst_b[j + (i>>1)*dst_pitch_b] = src[i + j*src_pitch + 1];
+    }
+}
+// Transposes h rows of w byte pairs (used for the leftover rows).
+static void Transpose_di_wxh_C(const uint8* src, int src_pitch,
+                               uint8* dst_a, int dst_pitch_a,
+                               uint8* dst_b, int dst_pitch_b,
+                               int w, int h) {
+  int i, j;
+  for (i = 0; i < w*2; i += 2)
+    for (j = 0; j < h; ++j) {
+      dst_a[j + (i>>1)*dst_pitch_a] = src[i + j*src_pitch];
+      dst_b[j + (i>>1)*dst_pitch_b] = src[i + j*src_pitch + 1];
+    }
+}
+// Transposes an interleaved plane into two planes; width counts source bytes per row.
+void Transpose_deinterleave(const uint8* src, int src_pitch,
+                            uint8* dst_a, int dst_pitch_a,
+                            uint8* dst_b, int dst_pitch_b,
+                            int width, int height) {
+  int i = height;
+  rotate_wx8func Transpose_wx8;
+  rotate_wxhfunc Transpose_wxh;
+
+  // do processor detection here.
+#ifdef __ARM_NEON__
+  unsigned long long store_reg[8];
+  SaveRegisters_NEON(store_reg);  // the NEON transpose overwrites d8-d15
+  Transpose_wx8 = Transpose_di_wx8_NEON;
+  Transpose_wxh = Transpose_di_wxh_C;
+#else
+  Transpose_wx8 = Transpose_di_wx8_C;
+  Transpose_wxh = Transpose_di_wxh_C;
+#endif
+
+  width >>= 1;  // from here on width counts byte pairs
+
+  // work across the source in bands of 8 rows; not entered when height < 8
+  while (i >= 8) {
+    Transpose_wx8(src, src_pitch,
+                  dst_a, dst_pitch_a,
+                  dst_b, dst_pitch_b,
+                  width);
+
+    src += 8 * src_pitch;
+    dst_a += 8;
+    dst_b += 8;
+    i -= 8;
+  }
+  // the remaining 0..7 rows go through the generic routine
+  Transpose_wxh(src, src_pitch,
+                dst_a, dst_pitch_a,
+                dst_b, dst_pitch_b,
+                width, i);
+
+#ifdef __ARM_NEON__
+  RestoreRegisters_NEON(store_reg);
+#endif
+}
+// Rotates 90 degrees clockwise while deinterleaving: read src bottom-up, then transpose.
+void Rotate90_deinterleave(const uint8* src, int src_pitch,
+                           uint8* dst_a, int dst_pitch_a,
+                           uint8* dst_b, int dst_pitch_b,
+                           int width, int height) {
+  src += src_pitch*(height-1);
+  src_pitch = -src_pitch;
+
+  Transpose_deinterleave(src, src_pitch,
+                         dst_a, dst_pitch_a,
+                         dst_b, dst_pitch_b,
+                         width, height);
+}
+// Rotates 270 degrees clockwise while deinterleaving: both outputs are written bottom-up.
+void Rotate270_deinterleave(const uint8* src, int src_pitch,
+                            uint8* dst_a, int dst_pitch_a,
+                            uint8* dst_b, int dst_pitch_b,
+                            int width, int height) {
+  dst_a += dst_pitch_a*((width>>1)-1);
+  dst_b += dst_pitch_b*((width>>1)-1);
+  dst_pitch_a = -dst_pitch_a;
+  dst_pitch_b = -dst_pitch_b;
+
+  Transpose_deinterleave(src, src_pitch,
+                         dst_a, dst_pitch_a,
+                         dst_b, dst_pitch_b,
+                         width, height);
+}
+// Reverses one row of width byte pairs, sending even bytes to dst_a and odd bytes to dst_b.
+static void ReverseLine_di_C(const uint8* src,
+                             uint8* dst_a, uint8* dst_b,
+                             int width) {
+  int i;
+  for (i = 0; i < width*2; i += 2) {
+    dst_a[width-1 - (i>>1)] = src[i];
+    dst_b[width-1 - (i>>1)] = src[i+1];
+  }
+}
+// Rotates 180 degrees while deinterleaving; width counts source bytes per row.
+void Rotate180_deinterleave(const uint8* src, int src_pitch,
+                            uint8* dst_a, int dst_pitch_a,
+                            uint8* dst_b, int dst_pitch_b,
+                            int width, int height) {
+  int i;
+  reverse_func ReverseLine;
+
+  // do processor detection here.
+#ifdef __ARM_NEON__
+  ReverseLine = ReverseLine_di_NEON;
+#else
+  ReverseLine = ReverseLine_di_C;
+#endif
+
+  dst_a += dst_pitch_a*(height-1);
+  dst_b += dst_pitch_b*(height-1);
+
+  width >>= 1;
+
+  for (i = 0; i < height; ++i) {
+    ReverseLine(src, dst_a, dst_b, width);
+
+    src += src_pitch;
+    dst_a -= dst_pitch_a;
+    dst_b -= dst_pitch_b;
+  }
+}
+
+}  // namespace libyuv
diff --git a/source/rotate_deinterleave_neon.s b/source/rotate_deinterleave_neon.s
new file mode 100644
index 000000000..b5bb38517
--- /dev/null
+++ b/source/rotate_deinterleave_neon.s
@@ -0,0 +1,310 @@
+  .global RestoreRegisters_NEON
+  .global ReverseLine_di_NEON
+  .global SaveRegisters_NEON
+  .global Transpose_di_wx8_NEON
+  .type RestoreRegisters_NEON, function
+  .type ReverseLine_di_NEON, function
+  .type SaveRegisters_NEON, function
+  .type Transpose_di_wx8_NEON, function
+
+@ void SaveRegisters_NEON (unsigned long long *store)
+@ r0 unsigned long long *store (receives d8-d15, 64 bytes)
+SaveRegisters_NEON:
+  vst1.i64 {d8, d9, d10, d11}, [r0]!
+  vst1.i64 {d12, d13, d14, d15}, [r0]!
+  bx lr
+
+@ void RestoreRegisters_NEON (unsigned long long *store)
+@ r0 unsigned long long *store (d8-d15 are reloaded from it)
+RestoreRegisters_NEON:
+  vld1.i64 {d8, d9, d10, d11}, [r0]!
+  vld1.i64 {d12, d13, d14, d15}, [r0]!
+  bx lr
+
+
+@ void ReverseLine_di_NEON (const uint8* src,
+@                           uint8* dst_a,
+@                           uint8* dst_b,
+@                           int width)
+@ r0 const uint8* src
+@ r1 uint8* dst_a
+@ r2 uint8* dst_b
+@ r3 width
+ReverseLine_di_NEON:
+
+  @ compute where to start writing destination
+  add r1, r1, r3  @ dst_a + width
+  add r2, r2, r3  @ dst_b + width
+
+  @ work on input segments that are multiples of 16, but
+  @ width that has been passed is output segments, half
+  @ the size of input.
+  lsrs r12, r3, #3
+
+  beq .line_residuals
+
+  @ the output is written in to two blocks.
+  mov r12, #-8
+
+  @ back off destination by the size of the register that is
+  @ going to be reversed
+  sub r1, r1, #8
+  sub r2, r2, #8
+
+  @ the loop needs to run on blocks of 16.
what will be left
+  @ over is either a negative number, the residuals that need
+  @ to be done, or 0. if this isn't subtracted off here the
+  @ loop will run one extra time.
+  sub r3, r3, #8
+
+.segments_of_8:
+  vld2.8 {d0, d1}, [r0]!  @ src += 16
+
+  @ reverse the bytes in the 64 bit segments
+  vrev64.8 q0, q0
+
+  vst1.8 {d0}, [r1], r12  @ dst_a -= 8
+  vst1.8 {d1}, [r2], r12  @ dst_b -= 8
+
+  subs r3, r3, #8
+  bge .segments_of_8
+
+  @ add 8 back to the counter. if the result is 0 there are no
+  @ residuals so return
+  adds r3, r3, #8
+  bxeq lr
+
+  add r1, r1, #8
+  add r2, r2, #8
+
+.line_residuals:
+
+  mov r12, #-1
+
+  sub r1, r1, #1
+  sub r2, r2, #1
+
+@ do this in neon registers as per
+@ http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/
+.segments_of_2:
+  vld2.8 {d0[0], d1[0]}, [r0]!  @ src += 2
+
+  vst1.8 {d0[0]}, [r1], r12  @ dst_a -= 1
+  vst1.8 {d1[0]}, [r2], r12  @ dst_b -= 1
+
+  subs r3, r3, #1
+  bgt .segments_of_2
+
+  bx lr
+
+@ void Transpose_di_wx8_NEON (const uint8* src, int src_pitch,
+@                             uint8* dst_a, int dst_pitch_a,
+@                             uint8* dst_b, int dst_pitch_b,
+@                             int width)
+@ r0 const uint8* src
+@ r1 int src_pitch
+@ r2 uint8* dst_a
+@ r3 int dst_pitch_a
+@ stack uint8* dst_b
+@ stack int dst_pitch_b
+@ stack int width
+Transpose_di_wx8_NEON:
+  push {r4-r9,lr}  @ 7 registers = 28 bytes, so stack arguments start at [sp, #28]
+
+  ldr r4, [sp, #28]  @ dst_b
+  ldr r5, [sp, #32]  @ dst_pitch_b
+  ldr r8, [sp, #36]  @ width; r8 is the counter used by every loop below
+  @ loops are on blocks of 8. loop will stop when
+  @ counter gets to or below 0. starting the counter
+  @ at w-8 allow for this
+  sub r8, #8
+
+@ handle 8x8 blocks. this should be the majority of the plane
+.loop_8x8:
+  mov r9, r0
+
+  vld2.8 {d0, d1}, [r9], r1
+  vld2.8 {d2, d3}, [r9], r1
+  vld2.8 {d4, d5}, [r9], r1
+  vld2.8 {d6, d7}, [r9], r1
+  vld2.8 {d8, d9}, [r9], r1
+  vld2.8 {d10, d11}, [r9], r1
+  vld2.8 {d12, d13}, [r9], r1
+  vld2.8 {d14, d15}, [r9]
+
+  vtrn.8 q1, q0
+  vtrn.8 q3, q2
+  vtrn.8 q5, q4
+  vtrn.8 q7, q6
+
+  vtrn.16 q1, q3
+  vtrn.16 q0, q2
+  vtrn.16 q5, q7
+  vtrn.16 q4, q6
+
+  vtrn.32 q1, q5
+  vtrn.32 q0, q4
+  vtrn.32 q3, q7
+  vtrn.32 q2, q6
+
+  vrev16.8 q0, q0
+  vrev16.8 q1, q1
+  vrev16.8 q2, q2
+  vrev16.8 q3, q3
+  vrev16.8 q4, q4
+  vrev16.8 q5, q5
+  vrev16.8 q6, q6
+  vrev16.8 q7, q7
+
+  mov r9, r2
+
+  vst1.8 {d2}, [r9], r3
+  vst1.8 {d0}, [r9], r3
+  vst1.8 {d6}, [r9], r3
+  vst1.8 {d4}, [r9], r3
+  vst1.8 {d10}, [r9], r3
+  vst1.8 {d8}, [r9], r3
+  vst1.8 {d14}, [r9], r3
+  vst1.8 {d12}, [r9]
+
+  mov r9, r4
+
+  vst1.8 {d3}, [r9], r5
+  vst1.8 {d1}, [r9], r5
+  vst1.8 {d7}, [r9], r5
+  vst1.8 {d5}, [r9], r5
+  vst1.8 {d11}, [r9], r5
+  vst1.8 {d9}, [r9], r5
+  vst1.8 {d15}, [r9], r5
+  vst1.8 {d13}, [r9]
+
+  add r0, #8*2  @ src += 8*2
+  add r2, r3, lsl #3  @ dst_a += 8 * dst_pitch_a
+  add r4, r5, lsl #3  @ dst_b += 8 * dst_pitch_b
+  subs r8, #8  @ w -= 8
+  bge .loop_8x8
+
+  @ add 8 back to counter. if the result is 0 there are
+  @ no residuals.
+  adds r8, #8
+  beq .done
+
+  @ some residual, so between 1 and 7 lines left to transpose
+  cmp r8, #2
+  blt .block_1x8
+
+  cmp r8, #4
+  blt .block_2x8
+
+@ TODO(frkoenig) : clean this up
+.block_4x8:
+  mov r9, r0
+  vld1.64 {d0}, [r9], r1
+  vld1.64 {d1}, [r9], r1
+  vld1.64 {d2}, [r9], r1
+  vld1.64 {d3}, [r9], r1
+  vld1.64 {d4}, [r9], r1
+  vld1.64 {d5}, [r9], r1
+  vld1.64 {d6}, [r9], r1
+  vld1.64 {d7}, [r9]
+
+  adr r12, vtbl_4x4_transpose
+  vld1.8 {q7}, [r12]
+
+  vtrn.8 q0, q1
+  vtrn.8 q2, q3
+
+  vtbl.8 d8, {d0, d1}, d14
+  vtbl.8 d9, {d0, d1}, d15
+  vtbl.8 d10, {d2, d3}, d14
+  vtbl.8 d11, {d2, d3}, d15
+  vtbl.8 d12, {d4, d5}, d14
+  vtbl.8 d13, {d4, d5}, d15
+  vtbl.8 d0, {d6, d7}, d14
+  vtbl.8 d1, {d6, d7}, d15
+
+  mov r9, r2
+
+  vst1.32 {d8[0]}, [r9], r3
+  vst1.32 {d8[1]}, [r9], r3
+  vst1.32 {d9[0]}, [r9], r3
+  vst1.32 {d9[1]}, [r9], r3
+
+  add r9, r2, #4
+  vst1.32 {d12[0]}, [r9], r3
+  vst1.32 {d12[1]}, [r9], r3
+  vst1.32 {d13[0]}, [r9], r3
+  vst1.32 {d13[1]}, [r9]
+
+  mov r9, r4
+
+  vst1.32 {d10[0]}, [r9], r5
+  vst1.32 {d10[1]}, [r9], r5
+  vst1.32 {d11[0]}, [r9], r5
+  vst1.32 {d11[1]}, [r9], r5
+
+  add r9, r4, #4
+  vst1.32 {d0[0]}, [r9], r5
+  vst1.32 {d0[1]}, [r9], r5
+  vst1.32 {d1[0]}, [r9], r5
+  vst1.32 {d1[1]}, [r9]
+
+  add r0, #4*2  @ src += 4 * 2
+  add r2, r3, lsl #2  @ dst_a += 4 * dst_pitch_a
+  add r4, r5, lsl #2  @ dst_b += 4 * dst_pitch_b
+  subs r8, #4  @ w -= 4
+  beq .done
+
+  @ some residual, check to see if it includes a 2x8 block,
+  @ or less
+  cmp r8, #2
+  blt .block_1x8
+
+.block_2x8:
+  mov r9, r0
+  vld2.16 {d0[0], d2[0]}, [r9], r1
+  vld2.16 {d1[0], d3[0]}, [r9], r1
+  vld2.16 {d0[1], d2[1]}, [r9], r1
+  vld2.16 {d1[1], d3[1]}, [r9], r1
+  vld2.16 {d0[2], d2[2]}, [r9], r1
+  vld2.16 {d1[2], d3[2]}, [r9], r1
+  vld2.16 {d0[3], d2[3]}, [r9], r1
+  vld2.16 {d1[3], d3[3]}, [r9]
+
+  vtrn.8 d0, d1
+  vtrn.8 d2, d3
+
+  mov r9, r2
+
+  vst1.64 {d0}, [r9], r3
+  vst1.64 {d2}, [r9]
+
+  mov r9, r4
+
+  vst1.64 {d1}, [r9], r5
+  vst1.64 {d3}, [r9]
+
+  add r0, #2*2  @ src += 2 * 2
+  add r2, r3, lsl #1  @ dst_a += 2 * dst_pitch_a
+  add r4, r5, lsl #1  @ dst_b += 2 * dst_pitch_b
+  subs r8, #2  @ w -= 2
+  beq .done
+
+.block_1x8:
+  vld2.8 {d0[0], d1[0]}, [r0], r1
+  vld2.8 {d0[1], d1[1]}, [r0], r1
+  vld2.8 {d0[2], d1[2]}, [r0], r1
+  vld2.8 {d0[3], d1[3]}, [r0], r1
+  vld2.8 {d0[4], d1[4]}, [r0], r1
+  vld2.8 {d0[5], d1[5]}, [r0], r1
+  vld2.8 {d0[6], d1[6]}, [r0], r1
+  vld2.8 {d0[7], d1[7]}, [r0]
+
+  vst1.64 {d0}, [r2]
+  vst1.64 {d1}, [r4]
+
+.done:
+  pop {r4-r9, pc}
+
+vtbl_4x4_transpose:
+  .byte 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
diff --git a/source/rotate_neon.s b/source/rotate_neon.s
new file mode 100644
index 000000000..fc29f56d7
--- /dev/null
+++ b/source/rotate_neon.s
@@ -0,0 +1,254 @@
+  .global ReverseLine_NEON
+  .global Transpose_wx8_NEON
+  .type ReverseLine_NEON, function
+  .type Transpose_wx8_NEON, function
+
+@ void ReverseLine_NEON (const uint8* src, uint8* dst, int width)
+@ r0 const uint8* src
+@ r1 uint8* dst
+@ r2 width
+ReverseLine_NEON:
+
+  @ compute where to start writing destination
+  add r1, r2  @ dst + width
+
+  @ work on segments that are multiples of 16
+  lsrs r3, r2, #4
+
+  @ the output is written in two blocks. 8 bytes followed
+  @ by another 8. reading is done sequentially, from left to
+  @ right. writing is done from right to left in block sizes
+  @ r1, the destination pointer is incremented after writing
+  @ the first of the two blocks. need to subtract that 8 off
+  @ along with 16 to get the next location.
+  mov r3, #-24
+
+  beq .line_residuals  @ flags are still those of lsrs; mov leaves them alone
+
+  @ back off destination by the size of the register that is
+  @ going to be reversed
+  sub r1, #16
+
+  @ the loop needs to run on blocks of 16. what will be left
+  @ over is either a negative number, the residuals that need
+  @ to be done, or 0. if this isn't subtracted off here the
+  @ loop will run one extra time.
+  sub r2, #16
+
+.segments_of_16:
+  vld1.8 {q0}, [r0]!  @ src += 16
+
+  @ reverse the bytes in the 64 bit segments.
unable to reverse
+  @ the bytes in the entire 128 bits in one go.
+  vrev64.8 q0, q0
+
+  @ because of the inability to reverse the entire 128 bits
+  @ reverse the writing out of the two 64 bit segments.
+  vst1.8 {d1}, [r1]!
+  vst1.8 {d0}, [r1], r3  @ dst -= 16
+
+  subs r2, #16
+  bge .segments_of_16
+
+  @ add 16 back to the counter. if the result is 0 there are no
+  @ residuals so return
+  adds r2, #16
+  bxeq lr
+
+  add r1, #16
+
+.line_residuals:
+
+  mov r3, #-3
+
+  sub r1, #2
+  subs r2, #2
+  @ check for 16*n+1 scenarios where segments_of_2 should not
+  @ be run, but there is something left over.
+  blt .segment_of_1
+
+@ do this in neon registers as per
+@ http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/
+.segments_of_2:
+  vld2.8 {d0[0], d1[0]}, [r0]!  @ src += 2
+
+  vst1.8 {d1[0]}, [r1]!
+  vst1.8 {d0[0]}, [r1], r3  @ dst -= 2
+
+  subs r2, #2
+  bge .segments_of_2
+
+  adds r2, #2
+  bxeq lr
+
+.segment_of_1:  @ copies the single leftover byte
+  add r1, #1
+  vld1.8 {d0[0]}, [r0]
+  vst1.8 {d0[0]}, [r1]
+
+  bx lr
+
+@ void Transpose_wx8_NEON (const uint8* src, int src_pitch,
+@                          uint8* dst, int dst_pitch,
+@                          int w)
+@ r0 const uint8* src
+@ r1 int src_pitch
+@ r2 uint8* dst
+@ r3 int dst_pitch
+@ stack int w
+Transpose_wx8_NEON:
+  push {r4,r8,r9,lr}  @ 4 registers = 16 bytes, so w is found at [sp, #16]
+
+  ldr r8, [sp, #16]  @ width
+
+  @ loops are on blocks of 8. loop will stop when
+  @ counter gets to or below 0. starting the counter
+  @ at w-8 allow for this
+  sub r8, #8
+
+@ handle 8x8 blocks. this should be the majority of the plane
+.loop_8x8:
+  mov r9, r0
+
+  vld1.8 {d0}, [r9], r1
+  vld1.8 {d1}, [r9], r1
+  vld1.8 {d2}, [r9], r1
+  vld1.8 {d3}, [r9], r1
+  vld1.8 {d4}, [r9], r1
+  vld1.8 {d5}, [r9], r1
+  vld1.8 {d6}, [r9], r1
+  vld1.8 {d7}, [r9]
+
+  vtrn.8 d1, d0
+  vtrn.8 d3, d2
+  vtrn.8 d5, d4
+  vtrn.8 d7, d6
+
+  vtrn.16 d1, d3
+  vtrn.16 d0, d2
+  vtrn.16 d5, d7
+  vtrn.16 d4, d6
+
+  vtrn.32 d1, d5
+  vtrn.32 d0, d4
+  vtrn.32 d3, d7
+  vtrn.32 d2, d6
+
+  vrev16.8 q0, q0
+  vrev16.8 q1, q1
+  vrev16.8 q2, q2
+  vrev16.8 q3, q3
+
+  mov r9, r2
+
+  vst1.8 {d1}, [r9], r3
+  vst1.8 {d0}, [r9], r3
+  vst1.8 {d3}, [r9], r3
+  vst1.8 {d2}, [r9], r3
+  vst1.8 {d5}, [r9], r3
+  vst1.8 {d4}, [r9], r3
+  vst1.8 {d7}, [r9], r3
+  vst1.8 {d6}, [r9]
+
+  add r0, #8  @ src += 8
+  add r2, r3, lsl #3  @ dst += 8 * dst_pitch
+  subs r8, #8  @ w -= 8
+  bge .loop_8x8
+
+  @ add 8 back to counter. if the result is 0 there are
+  @ no residuals.
+  adds r8, #8
+  beq .done
+
+  @ some residual, so between 1 and 7 lines left to transpose
+  cmp r8, #2
+  blt .block_1x8
+
+  cmp r8, #4
+  blt .block_2x8
+
+.block_4x8:
+  mov r9, r0
+  vld1.32 {d0[0]}, [r9], r1
+  vld1.32 {d0[1]}, [r9], r1
+  vld1.32 {d1[0]}, [r9], r1
+  vld1.32 {d1[1]}, [r9], r1
+  vld1.32 {d2[0]}, [r9], r1
+  vld1.32 {d2[1]}, [r9], r1
+  vld1.32 {d3[0]}, [r9], r1
+  vld1.32 {d3[1]}, [r9]
+
+  mov r9, r2
+
+  adr r12, vtbl_4x4_transpose
+  vld1.8 {q3}, [r12]
+
+  vtbl.8 d4, {d0, d1}, d6
+  vtbl.8 d5, {d0, d1}, d7
+  vtbl.8 d0, {d2, d3}, d6
+  vtbl.8 d1, {d2, d3}, d7
+
+  @ TODO: rework shuffle above to write
+  @ out with 4 instead of 8 writes
+  vst1.32 {d4[0]}, [r9], r3
+  vst1.32 {d4[1]}, [r9], r3
+  vst1.32 {d5[0]}, [r9], r3
+  vst1.32 {d5[1]}, [r9]
+
+  add r9, r2, #4
+  vst1.32 {d0[0]}, [r9], r3
+  vst1.32 {d0[1]}, [r9], r3
+  vst1.32 {d1[0]}, [r9], r3
+  vst1.32 {d1[1]}, [r9]
+
+  add r0, #4  @ src += 4
+  add r2, r3, lsl #2  @ dst += 4 * dst_pitch
+  subs r8, #4  @ w -= 4
+  beq .done
+
+  @ some residual, check to see if it includes a 2x8 block,
+  @ or less
+  cmp r8, #2
+  blt .block_1x8
+
+.block_2x8:
+  mov r9, r0
+  vld1.16 {d0[0]}, [r9], r1
+  vld1.16 {d1[0]}, [r9], r1
+  vld1.16 {d0[1]}, [r9], r1
+  vld1.16 {d1[1]}, [r9], r1
+  vld1.16 {d0[2]}, [r9], r1
+  vld1.16 {d1[2]}, [r9], r1
+  vld1.16 {d0[3]}, [r9], r1
+  vld1.16 {d1[3]}, [r9]
+
+  vtrn.8 d0, d1
+
+  mov r9, r2
+
+  vst1.64 {d0}, [r9], r3
+  vst1.64 {d1}, [r9]
+
+  add r0, #2  @ src += 2
+  add r2, r3, lsl #1  @ dst += 2 * dst_pitch
+  subs r8, #2  @ w -= 2
+  beq .done
+
+.block_1x8:
+  vld1.8 {d0[0]}, [r0], r1
+  vld1.8 {d0[1]}, [r0], r1
+  vld1.8 {d0[2]}, [r0], r1
+  vld1.8 {d0[3]}, [r0], r1
+  vld1.8 {d0[4]}, [r0], r1
+  vld1.8 {d0[5]}, [r0], r1
+  vld1.8 {d0[6]}, [r0], r1
+  vld1.8 {d0[7]}, [r0]
+
+  vst1.64 {d0}, [r2]
+
+.done:
+
+  pop {r4,r8,r9,pc}
+
+vtbl_4x4_transpose:
+  .byte 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
diff --git a/unit_test/rotate_test.cc b/unit_test/rotate_test.cc
new file mode 100644
index 000000000..6245ada2d
--- /dev/null
+++ b/unit_test/rotate_test.cc
@@ -0,0 +1,707 @@
+/*
+ * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "unit_test.h"
+#include "rotate.h"
+#include <stdlib.h>
+
+using namespace libyuv;
+// Dumps a w x h byte array, one row per output line.
+void print_array(uint8 *array, int w, int h) {
+  int i, j;
+
+  for (i = 0; i < h; ++i) {
+    for (j = 0; j < w; ++j)
+      printf("%4d", array[i*w + j]);
+
+    printf("\n");
+  }
+}
+// Transposing twice must give back the input.
+TEST_F(libyuvTest, Transpose) {
+  int iw, ih, ow, oh;
+  int err = 0;
+
+  for (iw = 8; iw < _rotate_max_w && !err; ++iw)
+    for (ih = 8; ih < _rotate_max_h && !err; ++ih) {
+      int i;
+      uint8 *input;
+      uint8 *output_1;
+      uint8 *output_2;
+
+      ow = ih;
+      oh = iw;
+
+      input = static_cast<uint8*>(malloc(sizeof(uint8)*iw*ih));
+      output_1 = static_cast<uint8*>(malloc(sizeof(uint8)*ow*oh));
+      output_2 = static_cast<uint8*>(malloc(sizeof(uint8)*iw*ih));
+
+      for (i = 0; i < iw*ih; ++i) {
+        input[i] = i;
+        output_1[i] = 0;
+        output_2[i] = 0;
+      }
+
+      Transpose(input, iw, output_1, ow, iw, ih);
+      Transpose(output_1, ow, output_2, oh, ow, oh);
+
+      for (i = 0; i < iw*ih; ++i) {
+        if (input[i] != output_2[i])
+          err++;
+      }
+
+      if (err) {
+        printf("input %dx%d \n", iw, ih);
+        print_array(input, iw, ih);
+
+        printf("transpose 1\n");
+        print_array(output_1, ow, oh);
+
+        printf("transpose 2\n");
+        print_array(output_2, iw, ih);
+      }
+
+      free(input);
+      free(output_1);
+      free(output_2);
+    }
+
+  EXPECT_EQ(0, err);
+}
+// Four successive 90 degree rotations must give back the input.
+TEST_F(libyuvTest, Rotate90) {
+  int iw, ih, ow, oh;
+  int err = 0;
+
+  for (iw = 8; iw < _rotate_max_w && !err; ++iw)
+    for (ih = 8; ih < _rotate_max_h && !err; ++ih) {
+      int i;
+      uint8 *input;
+      uint8 *output_0;
+      uint8 *output_90;
+      uint8 *output_180;
+      uint8 *output_270;
+
+      ow = ih;
+      oh = iw;
+
+      input = static_cast<uint8*>(malloc(sizeof(uint8)*iw*ih));
+      output_0 = static_cast<uint8*>(malloc(sizeof(uint8)*iw*ih));
+      output_90 = static_cast<uint8*>(malloc(sizeof(uint8)*ow*oh));
+      output_180 = static_cast<uint8*>(malloc(sizeof(uint8)*iw*ih));
+      output_270 = static_cast<uint8*>(malloc(sizeof(uint8)*ow*oh));
+
+      for (i = 0; i < iw*ih; ++i) {
+        input[i] = i;
+        output_0[i] = 0;
+        output_90[i] = 0;
+        output_180[i] = 0;
+        output_270[i] = 0;
+      }
+
+      Rotate90(input, iw, output_90, ow, iw, ih);
+      Rotate90(output_90, ow, output_180, oh, ow, oh);
+      Rotate90(output_180, oh, output_270, ow, oh, ow);
+      Rotate90(output_270, ow, output_0, iw, ow, oh);
+
+      for (i = 0; i < iw*ih; ++i) {
+        if (input[i] != output_0[i])
+          err++;
+      }
+
+      if (err) {
+        printf("input %dx%d \n", iw, ih);
+        print_array(input, iw, ih);
+
+        printf("output 90\n");
+        print_array(output_90, ow, oh);
+
+        printf("output 180\n");
+        print_array(output_180, iw, ih);
+
+        printf("output 270\n");
+        print_array(output_270, ow, oh);
+
+        printf("output 0\n");
+        print_array(output_0, iw, ih);
+      }
+
+      free(input);
+      free(output_0);
+      free(output_90);
+      free(output_180);
+      free(output_270);
+    }
+
+  EXPECT_EQ(0, err);
+}
+// Deinterleaving rotate by 90, then plain rotations by 90 and 180, must restore each plane.
+TEST_F(libyuvTest, Rotate90Deinterleave) {
+  int iw, ih, ow, oh;
+  int err = 0;
+
+  for (iw = 16; iw < _rotate_max_w && !err; iw += 2)
+    for (ih = 8; ih < _rotate_max_h && !err; ++ih) {
+      int i;
+      uint8 *input;
+      uint8 *output_0_u;
+      uint8 *output_0_v;
+      uint8 *output_90_u;
+      uint8 *output_90_v;
+      uint8 *output_180_u;
+      uint8 *output_180_v;
+
+      ow = ih;
+      oh = iw>>1;
+
+      input = static_cast<uint8*>(malloc(sizeof(uint8)*iw*ih));
+      output_0_u = static_cast<uint8*>(malloc(sizeof(uint8)*ow*oh));
+      output_0_v = static_cast<uint8*>(malloc(sizeof(uint8)*ow*oh));
+      output_90_u = static_cast<uint8*>(malloc(sizeof(uint8)*ow*oh));
+      output_90_v = static_cast<uint8*>(malloc(sizeof(uint8)*ow*oh));
+      output_180_u = static_cast<uint8*>(malloc(sizeof(uint8)*ow*oh));
+      output_180_v = static_cast<uint8*>(malloc(sizeof(uint8)*ow*oh));
+
+      for (i = 0; i < iw*ih; i +=2) {
+        input[i] = i>>1;
+        input[i+1] = -(i>>1);
+      }
+
+      for (i = 0; i < ow*oh; ++i) {
+        output_0_u[i] = 0;
+        output_0_v[i] = 0;
+        output_90_u[i] = 0;
+        output_90_v[i] = 0;
+        output_180_u[i] = 0;
+        output_180_v[i] = 0;
+      }
+
+      Rotate90_deinterleave(input, iw,
+                            output_90_u, ow,
+                            output_90_v, ow,
+                            iw, ih);
+
+      Rotate90(output_90_u, ow, output_180_u, oh, ow, oh);
+      Rotate90(output_90_v, ow, output_180_v, oh, ow, oh);
+
+      Rotate180(output_180_u, ow, output_0_u, ow, ow, oh);
+      Rotate180(output_180_v, ow, output_0_v, ow, ow, oh);
+
+      for (i = 0; i < ow*oh; ++i) {
+        if (output_0_u[i] != (uint8)i)
+          err++;
+        if (output_0_v[i] != (uint8)(-i))
+          err++;
+      }
+
+      if (err) {
+        printf("input %dx%d \n", iw, ih);
+        print_array(input, iw, ih);
+
+        printf("output 90_u\n");
+        print_array(output_90_u, ow, oh);
+
+        printf("output 90_v\n");
+        print_array(output_90_v, ow, oh);
+
+        printf("output 180_u\n");
+        print_array(output_180_u, oh, ow);
+
+        printf("output 180_v\n");
+        print_array(output_180_v, oh, ow);
+
+        printf("output 0_u\n");
+        print_array(output_0_u, oh, ow);
+
+        printf("output 0_v\n");
+        print_array(output_0_v, oh, ow);
+      }
+
+      free(input);
+      free(output_0_u);
+      free(output_0_v);
+      free(output_90_u);
+      free(output_90_v);
+      free(output_180_u);
+      free(output_180_v);
+    }
+
+  EXPECT_EQ(0, err);
+}
+// Deinterleaving rotate by 180, then two plain 90 degree rotations, must restore each plane.
+TEST_F(libyuvTest, Rotate180Deinterleave) {
+  int iw, ih, ow, oh;
+  int err = 0;
+
+  for (iw = 16; iw < _rotate_max_w && !err; iw += 2)
+    for (ih = 8; ih < _rotate_max_h && !err; ++ih) {
+      int i;
+      uint8 *input;
+      uint8 *output_0_u;
+      uint8 *output_0_v;
+      uint8 *output_90_u;
+      uint8 *output_90_v;
+      uint8 *output_180_u;
+      uint8 *output_180_v;
+
+      ow = iw>>1;
+      oh = ih;
+
+      input = static_cast<uint8*>(malloc(sizeof(uint8)*iw*ih));
+      output_0_u = static_cast<uint8*>(malloc(sizeof(uint8)*ow*oh));
+      output_0_v = static_cast<uint8*>(malloc(sizeof(uint8)*ow*oh));
+      output_90_u = static_cast<uint8*>(malloc(sizeof(uint8)*ow*oh));
+      output_90_v = static_cast<uint8*>(malloc(sizeof(uint8)*ow*oh));
+      output_180_u = static_cast<uint8*>(malloc(sizeof(uint8)*ow*oh));
+      output_180_v = static_cast<uint8*>(malloc(sizeof(uint8)*ow*oh));
+
+      for (i = 0; i < iw*ih; i +=2) {
+        input[i] = i>>1;
+        input[i+1] = -(i>>1);
+      }
+
+      for (i = 0; i < ow*oh; ++i) {
+        output_0_u[i] = 0;
+        output_0_v[i] = 0;
+        output_90_u[i] = 0;
+        output_90_v[i] = 0;
+        output_180_u[i] = 0;
+        output_180_v[i] = 0;
+      }
+
+      Rotate180_deinterleave(input, iw,
+                             output_180_u, ow,
+                             output_180_v, ow,
+                             iw, ih);
+
+      Rotate90(output_180_u, ow, output_90_u, oh, ow, oh);
+      Rotate90(output_180_v, ow, output_90_v, oh, ow, oh);
+
+      Rotate90(output_90_u, oh, output_0_u, ow, oh, ow);
+      Rotate90(output_90_v, oh, output_0_v, ow, oh, ow);
+
+      for (i = 0; i < ow*oh; ++i) {
+        if (output_0_u[i] != (uint8)i)
+          err++;
+        if (output_0_v[i] != (uint8)(-i))
+          err++;
+      }
+
+      if (err) {
+        printf("input %dx%d \n", iw, ih);
+        print_array(input, iw, ih);
+
+        printf("output 180_u\n");
+        print_array(output_180_u, oh, ow);
+
+        printf("output 180_v\n");
+        print_array(output_180_v, oh, ow);
+
+        printf("output 90_u\n");
+        print_array(output_90_u, oh, ow);
+
+        printf("output 90_v\n");
+        print_array(output_90_v, oh, ow);
+
+        printf("output 0_u\n");
+        print_array(output_0_u, ow, oh);
+
+        printf("output 0_v\n");
+        print_array(output_0_v, ow, oh);
+      }
+
+      free(input);
+      free(output_0_u);
+      free(output_0_v);
+      free(output_90_u);
+      free(output_90_v);
+      free(output_180_u);
+      free(output_180_v);
+    }
+
+  EXPECT_EQ(0, err);
+}
+// Deinterleaving rotate by 270, then plain rotations by 270 and 180, must restore each plane.
+TEST_F(libyuvTest, Rotate270Deinterleave) {
+  int iw, ih, ow, oh;
+  int err = 0;
+
+  for (iw = 16; iw < _rotate_max_w && !err; iw += 2)
+    for (ih = 8; ih < _rotate_max_h && !err; ++ih) {
+      int i;
+      uint8 *input;
+      uint8 *output_0_u;
+      uint8 *output_0_v;
+      uint8 *output_270_u;
+      uint8 *output_270_v;
+      uint8 *output_180_u;
+      uint8 *output_180_v;
+
+      ow = ih;
+      oh = iw>>1;
+
+      input = static_cast<uint8*>(malloc(sizeof(uint8)*iw*ih));
+      output_0_u = static_cast<uint8*>(malloc(sizeof(uint8)*ow*oh));
+      output_0_v = static_cast<uint8*>(malloc(sizeof(uint8)*ow*oh));
+      output_270_u = static_cast<uint8*>(malloc(sizeof(uint8)*ow*oh));
+      output_270_v = static_cast<uint8*>(malloc(sizeof(uint8)*ow*oh));
+      output_180_u = static_cast<uint8*>(malloc(sizeof(uint8)*ow*oh));
+      output_180_v = static_cast<uint8*>(malloc(sizeof(uint8)*ow*oh));
+
+      for (i = 0; i < iw*ih; i +=2) {
+        input[i] = i>>1;
+        input[i+1] = -(i>>1);
+      }
+
+      for (i = 0; i < ow*oh; ++i) {
+        output_0_u[i] = 0;
+        output_0_v[i] = 0;
+        output_270_u[i] = 0;
+        output_270_v[i] = 0;
+        output_180_u[i] = 0;
+        output_180_v[i] = 0;
+      }
+
+      Rotate270_deinterleave(input, iw,
+                             output_270_u, ow,
+                             output_270_v, ow,
+                             iw, ih);
+
+      Rotate270(output_270_u, ow, output_180_u, oh, ow, oh);
+      Rotate270(output_270_v, ow, output_180_v, oh, ow, oh);
+
+      Rotate180(output_180_u, ow, output_0_u, ow, ow, oh);
+      Rotate180(output_180_v, ow, output_0_v, ow, ow, oh);
+
+      for (i = 0; i < ow*oh; ++i) {
+        if (output_0_u[i] != (uint8)i)
+          err++;
+        if (output_0_v[i] != (uint8)(-i))
+          err++;
+      }
+
+      if (err) {
+        printf("input %dx%d \n", iw, ih);
+        print_array(input, iw, ih);
+
+        printf("output 270_u\n");
+        print_array(output_270_u, ow, oh);
+
+        printf("output 270_v\n");
+        print_array(output_270_v, ow, oh);
+
+        printf("output 180_u\n");
+        print_array(output_180_u, oh, ow);
+
+        printf("output 180_v\n");
+        print_array(output_180_v, oh, ow);
+
+        printf("output 0_u\n");
+        print_array(output_0_u, oh, ow);
+
+        printf("output 0_v\n");
+        print_array(output_0_v, oh, ow);
+      }
+
+      free(input);
+      free(output_0_u);
+      free(output_0_v);
+      free(output_270_u);
+      free(output_270_v);
+      free(output_180_u);
+      free(output_180_v);
+    }
+
+  EXPECT_EQ(0, err);
+}
+// Rotating by 180 degrees twice must give back the input.
+TEST_F(libyuvTest, Rotate180) {
+  int iw, ih, ow, oh;
+  int err = 0;
+
+  for (iw = 8; iw < _rotate_max_w && !err; ++iw)
+    for (ih = 8; ih < _rotate_max_h && !err; ++ih) {
+      int i;
+      uint8 *input;
+      uint8 *output_0;
+      uint8 *output_180;
+
+      ow = iw;
+      oh = ih;
+
+      input = static_cast<uint8*>(malloc(sizeof(uint8)*iw*ih));
+      output_0 = static_cast<uint8*>(malloc(sizeof(uint8)*iw*ih));
+      output_180 = static_cast<uint8*>(malloc(sizeof(uint8)*iw*ih));
+
+      for (i = 0; i < iw*ih; ++i) {
+        input[i] = i;
+        output_0[i] = 0;
+        output_180[i] = 0;
+      }
+
+      Rotate180(input, iw, output_180, ow, iw, ih);
+      Rotate180(output_180, ow, output_0, iw, ow, oh);
+
+      for (i = 0; i < iw*ih; ++i) {
+        if (input[i] != output_0[i])
+          err++;
+      }
+
+      if (err) {
+        printf("input %dx%d \n", iw, ih);
+        print_array(input, iw, ih);
+
+        printf("output 180\n");
+        print_array(output_180, iw, ih);
+
+        printf("output 0\n");
+        print_array(output_0, iw, ih);
+      }
+
+      free(input);
+      free(output_0);
+      free(output_180);
+    }
+
+  EXPECT_EQ(0, err);
+}
+// Four successive 270 degree rotations must give back the input.
+TEST_F(libyuvTest, Rotate270) {
+  int iw, ih, ow, oh;
+  int err = 0;
+
+  for (iw = 8; iw < _rotate_max_w && !err; ++iw)
+    for (ih = 8; ih < _rotate_max_h && !err; ++ih) {
+      int i;
+      uint8 *input;
+      uint8 *output_0;
+      uint8 *output_90;
+      uint8 *output_180;
+      uint8 *output_270;
+
+      ow = ih;
+      oh = iw;
+
+      input = static_cast<uint8*>(malloc(sizeof(uint8)*iw*ih));
+      output_0 = static_cast<uint8*>(malloc(sizeof(uint8)*iw*ih));
+      output_90 = static_cast<uint8*>(malloc(sizeof(uint8)*ow*oh));
+      output_180 = static_cast<uint8*>(malloc(sizeof(uint8)*iw*ih));
+      output_270 = static_cast<uint8*>(malloc(sizeof(uint8)*ow*oh));
+
+      for (i = 0; i < iw*ih; ++i) {
+        input[i] = i;
+        output_0[i] = 0;
+        output_90[i] = 0;
+        output_180[i] = 0;
+        output_270[i] = 0;
+      }
+
+      Rotate270(input, iw, output_270, ow, iw, ih);
+      Rotate270(output_270, ow, output_180, oh, ow, oh);
+      Rotate270(output_180, oh, output_90, ow, oh, ow);
+      Rotate270(output_90, ow, output_0, iw, ow, oh);
+
+      for (i = 0; i < iw*ih; ++i) {
+        if (input[i] != output_0[i])
+          err++;
+      }
+
+      if (err) {
+        printf("input %dx%d \n", iw, ih);
+        print_array(input, iw, ih);
+
+        printf("output 270\n");
+        print_array(output_270, ow, oh);
+
+        printf("output 180\n");
+        print_array(output_180, iw, ih);
+
+        printf("output 90\n");
+        print_array(output_90, ow, oh);
+
+        printf("output 0\n");
+        print_array(output_0, iw, ih);
+      }
+
+      free(input);
+      free(output_0);
+      free(output_90);
+      free(output_180);
+      free(output_270);
+    }
+
+  EXPECT_EQ(0, err);
+}
+
+TEST_F(libyuvTest, Rotate90and270) {
+  int iw, ih, ow, oh;
+  int err = 0;
+
+  for (iw = 16; iw < _rotate_max_w && !err; iw += 4)
+    for (ih = 16; ih < _rotate_max_h && !err; ih += 4) {
+      int i;
+      uint8 *input;
+      uint8 *output_0;
+      uint8 *output_90;
+      ow = ih;
+      oh = iw;
+
+      input = static_cast<uint8*>(malloc(sizeof(uint8)*iw*ih));
+      output_0 = static_cast<uint8*>(malloc(sizeof(uint8)*iw*ih));
+      output_90 = static_cast<uint8*>(malloc(sizeof(uint8)*ow*oh));
+
+      for (i = 0; i
< iw*ih; ++i) { + input[i] = i; + output_0[i] = 0; + output_90[i] = 0; + } + + Rotate90(input, iw, output_90, ow, iw, ih); + Rotate270(output_90, ow, output_0, iw, ow, oh); + + for (i = 0; i < iw*ih; ++i) { + if (input[i] != output_0[i]) + err++; + } + + if (err) { + printf("intput %dx%d\n", iw, ih); + print_array(input, iw, ih); + + printf("output \n"); + print_array(output_90, ow, oh); + + printf("output \n"); + print_array(output_0, iw, ih); + } + + free(input); + free(output_0); + free(output_90); + } + + EXPECT_EQ(0, err); +} + +TEST_F(libyuvTest, Rotate90Pitch) { + int iw, ih, ow, oh; + int err = 0; + + for (iw = 16; iw < _rotate_max_w && !err; iw += 4) + for (ih = 16; ih < _rotate_max_h && !err; ih += 4) { + int i; + uint8 *input; + uint8 *output_0; + uint8 *output_90; + ow = ih; + oh = iw; + + input = static_cast(malloc(sizeof(uint8)*iw*ih)); + output_0 = static_cast(malloc(sizeof(uint8)*iw*ih)); + output_90 = static_cast(malloc(sizeof(uint8)*ow*oh)); + + for (i = 0; i < iw*ih; ++i) { + input[i] = i; + output_0[i] = 0; + output_90[i] = 0; + } + + Rotate90(input, iw, + output_90 + (ow>>1), ow, iw>>1, ih>>1); + Rotate90(input + (iw>>1), iw, + output_90 + (ow>>1) + ow*(oh>>1), ow, iw>>1, ih>>1); + Rotate90(input + iw*(ih>>1), iw, + output_90, ow, iw>>1, ih>>1); + Rotate90(input + (iw>>1) + iw*(ih>>1), iw, + output_90 + ow*(oh>>1), ow, iw>>1, ih>>1); + + Rotate270(output_90, ih, output_0, iw, ow, oh); + + for (i = 0; i < iw*ih; ++i) { + if (input[i] != output_0[i]) + err++; + } + + if (err) { + printf("intput %dx%d\n", iw, ih); + print_array(input, iw, ih); + + printf("output \n"); + print_array(output_90, ow, oh); + + printf("output \n"); + print_array(output_0, iw, ih); + } + + free(input); + free(output_0); + free(output_90); + } + + EXPECT_EQ(0, err); +} + +TEST_F(libyuvTest, Rotate270Pitch) { + int iw, ih, ow, oh; + int err = 0; + + for (iw = 16; iw < _rotate_max_w && !err; iw += 4) + for (ih = 16; ih < _rotate_max_h && !err; ih += 4) { + int i; + uint8 
*input; + uint8 *output_0; + uint8 *output_270; + + ow = ih; + oh = iw; + + input = static_cast(malloc(sizeof(uint8)*iw*ih)); + output_0 = static_cast(malloc(sizeof(uint8)*iw*ih)); + output_270 = static_cast(malloc(sizeof(uint8)*ow*oh)); + + for (i = 0; i < iw*ih; ++i) { + input[i] = i; + output_270[i] = 0; + } + + Rotate270(input, iw, + output_270 + ow*(oh>>1), ow, iw>>1, ih>>1); + Rotate270(input + (iw>>1), iw, + output_270, ow, iw>>1, ih>>1); + Rotate270(input + iw*(ih>>1), iw, + output_270 + (ow>>1) + ow*(oh>>1), ow, iw>>1, ih>>1); + Rotate270(input + (iw>>1) + iw*(ih>>1), iw, + output_270 + (ow>>1), ow, iw>>1, ih>>1); + + Rotate90(output_270, ih, output_0, iw, ow, oh); + + for (i = 0; i < iw*ih; ++i) { + if (input[i] != output_0[i]) + err++; + } + + if (err) { + printf("intput %dx%d\n", iw, ih); + print_array(input, iw, ih); + + printf("output \n"); + print_array(output_270, ow, oh); + + printf("output \n"); + print_array(output_0, iw, ih); + } + + free(input); + free(output_0); + free(output_270); + } + + EXPECT_EQ(0, err); +} diff --git a/unit_test/unit_test.cc b/unit_test/unit_test.cc index 2259cf60c..73bb6a8a5 100644 --- a/unit_test/unit_test.cc +++ b/unit_test/unit_test.cc @@ -20,7 +20,9 @@ class libyuvEnvironment : public ::testing::Environment { } }; -libyuvTest::libyuvTest() +libyuvTest::libyuvTest() : + _rotate_max_w(128), + _rotate_max_h(128) { } diff --git a/unit_test/unit_test.h b/unit_test/unit_test.h index 817c88fd2..5265c1656 100644 --- a/unit_test/unit_test.h +++ b/unit_test/unit_test.h @@ -11,6 +11,7 @@ #ifndef UINIT_TEST_H_ #define UINIT_TEST_H_ +#include "basic_types.h" #include class libyuvTest : public ::testing::Test { @@ -18,6 +19,10 @@ class libyuvTest : public ::testing::Test { libyuvTest(); virtual void SetUp(); virtual void TearDown(); + + const uint32 _rotate_max_w; + const uint32 _rotate_max_h; + }; #endif // UNIT_TEST_H_