From 62a961bee72e48e4fa14365bd7444c9280540b6f Mon Sep 17 00:00:00 2001 From: "fbarchard@google.com" Date: Mon, 22 Oct 2012 17:24:50 +0000 Subject: [PATCH] Neon version of I420ToNV12 and I420ToNV21. NV21ToI420 added as function. CopyRow changed to vld4.8 to allow unaligned copy. BUG=none TEST=none Review URL: https://webrtc-codereview.appspot.com/922005 git-svn-id: http://libyuv.googlecode.com/svn/trunk@435 16f28f9a-4ce2-e073-06de-1de4eb20be90 --- README.chromium | 2 +- include/libyuv/convert.h | 11 +++++- include/libyuv/convert_from.h | 17 +++++++++- include/libyuv/row.h | 6 ++++ include/libyuv/version.h | 2 +- source/convert.cc | 20 +++++++++-- source/convert_from.cc | 58 ++++++++++++++++++++++++++++++- source/planar_functions.cc | 2 +- source/rotate.cc | 2 +- source/rotate_argb.cc | 2 +- source/row_common.cc | 15 ++++++++ source/row_neon.cc | 39 ++++++++++++++++----- unit_test/convert_test.cc | 64 +++++++++++++++++++---------------- 13 files changed, 193 insertions(+), 47 deletions(-) diff --git a/README.chromium b/README.chromium index 8ff9d4d56..2db963548 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 433 +Version: 435 License: BSD License File: LICENSE diff --git a/include/libyuv/convert.h b/include/libyuv/convert.h index 2db1f14aa..e07bfd199 100644 --- a/include/libyuv/convert.h +++ b/include/libyuv/convert.h @@ -73,7 +73,7 @@ int I400ToI420(const uint8* src_y, int src_stride_y, uint8* dst_v, int dst_stride_v, int width, int height); -// Convert NV12 to I420. Also used for NV21. +// Convert NV12 to I420. LIBYUV_API int NV12ToI420(const uint8* src_y, int src_stride_y, const uint8* src_uv, int src_stride_uv, @@ -82,6 +82,15 @@ int NV12ToI420(const uint8* src_y, int src_stride_y, uint8* dst_v, int dst_stride_v, int width, int height); +// Convert NV21 to I420. 
+LIBYUV_API +int NV21ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_vu, int src_stride_vu, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + // Convert M420 to I420. LIBYUV_API int M420ToI420(const uint8* src_m420, int src_stride_m420, diff --git a/include/libyuv/convert_from.h b/include/libyuv/convert_from.h index 4eae950cc..44ff4d98f 100644 --- a/include/libyuv/convert_from.h +++ b/include/libyuv/convert_from.h @@ -56,10 +56,25 @@ int I400Copy(const uint8* src_y, int src_stride_y, uint8* dst_y, int dst_stride_y, int width, int height); -// TODO(fbarchard): I420ToNV12 // TODO(fbarchard): I420ToM420 // TODO(fbarchard): I420ToQ420 +LIBYUV_API +int I420ToNV12(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_uv, int dst_stride_uv, + int width, int height); + +LIBYUV_API +int I420ToNV21(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_vu, int dst_stride_vu, + int width, int height); + LIBYUV_API int I420ToYUY2(const uint8* src_y, int src_stride_y, const uint8* src_u, int src_stride_u, diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 94fd99720..89e35a608 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -168,6 +168,7 @@ extern "C" { #define HAS_ARGBTOARGB1555ROW_NEON #define HAS_ARGBTOARGB4444ROW_NEON #define HAS_ARGBTOYROW_NEON +#define HAS_MERGEUV_NEON #endif // The following are available on Mips platforms @@ -308,6 +309,11 @@ void SplitUV_Any_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix); void SplitUV_Any_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix); +void MergeUV_C(const uint8* src_u, const uint8* src_v, uint8* dst_uv, + int width); +void MergeUV_NEON(const uint8* src_u, 
const uint8* src_v, uint8* dst_uv, + int width); + void CopyRow_SSE2(const uint8* src, uint8* dst, int count); void CopyRow_X86(const uint8* src, uint8* dst, int count); void CopyRow_NEON(const uint8* src, uint8* dst, int count); diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 8f1c42561..3a3ad4182 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 433 +#define LIBYUV_VERSION 435 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/convert.cc b/source/convert.cc index 51198602d..57ad6139e 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -302,7 +302,7 @@ static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1, int width, int height) { void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; #if defined(HAS_COPYROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 64)) { + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) { CopyRow = CopyRow_NEON; } #elif defined(HAS_COPYROW_X86) @@ -460,6 +460,22 @@ int NV12ToI420(const uint8* src_y, int src_stride_y, width, height); } +// Convert NV21 to I420. Same as NV12 but u and v pointers swapped. +LIBYUV_API +int NV21ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_vu, int src_stride_vu, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + return X420ToI420(src_y, src_stride_y, src_stride_y, + src_vu, src_stride_vu, + dst_y, dst_stride_y, + dst_v, dst_stride_v, + dst_u, dst_stride_u, + width, height); +} + // Convert M420 to I420. LIBYUV_API int M420ToI420(const uint8* src_m420, int src_stride_m420, @@ -503,7 +519,7 @@ int Q420ToI420(const uint8* src_y, int src_stride_y, // CopyRow for rows of just Y in Q420 copied to Y plane of I420. 
void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
 #if defined(HAS_COPYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 64)) {
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
     CopyRow = CopyRow_NEON;
   }
 #endif
diff --git a/source/convert_from.cc b/source/convert_from.cc
index 443c140b7..474fea56f 100644
--- a/source/convert_from.cc
+++ b/source/convert_from.cc
@@ -50,7 +50,7 @@ int I420ToI422(const uint8* src_y, int src_stride_y,
   int halfwidth = (width + 1) >> 1;
   void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
 #if defined(HAS_COPYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(halfwidth, 64)) {
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(halfwidth, 32)) {
     CopyRow = CopyRow_NEON;
   }
 #elif defined(HAS_COPYROW_X86)
@@ -477,6 +477,62 @@ int I420ToV210(const uint8* src_y, int src_stride_y,
   return 0;
 }
 
+LIBYUV_API
+int I420ToNV12(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_uv, int dst_stride_uv,
+               int width, int height) {
+  if (!src_y || !src_u || !src_v || !dst_y || !dst_uv ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image (destination is written bottom-up).
+  if (height < 0) {
+    height = -height;
+    int halfheight = (height + 1) >> 1;
+    dst_y = dst_y + (height - 1) * dst_stride_y;
+    dst_uv = dst_uv + (halfheight - 1) * dst_stride_uv;
+    dst_stride_y = -dst_stride_y;
+    dst_stride_uv = -dst_stride_uv;
+  }
+
+  int halfwidth = (width + 1) >> 1;
+  void (*MergeUV)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                  int width) = MergeUV_C;
+#if defined(HAS_MERGEUV_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(halfwidth, 16)) {
+    MergeUV = MergeUV_NEON;
+  }
+#endif
+  CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  int halfheight = (height + 1) >> 1;
+  for (int y = 0; y < halfheight; ++y) {
+    // Interleave one row of U and V into UV using the selected row function.
+    MergeUV(src_u, src_v, dst_uv, halfwidth);
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+    dst_uv += dst_stride_uv;
+  }
+  return 0;
+}
+
+LIBYUV_API
+int I420ToNV21(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_vu, int dst_stride_vu,
+               int width, int height) {
+  return I420ToNV12(src_y, src_stride_y,
+                    src_v, src_stride_v,
+                    src_u, src_stride_u,
+                    dst_y, dst_stride_y,
+                    dst_vu, dst_stride_vu,
+                    width, height);
+}
+
 // Convert I420 to ARGB.
 LIBYUV_API
 int I420ToARGB(const uint8* src_y, int src_stride_y,
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index 2ffee68f3..2d0366fbf 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -30,7 +30,7 @@ void CopyPlane(const uint8* src_y, int src_stride_y,
                int width, int height) {
   void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
 #if defined(HAS_COPYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 64)) {
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
     CopyRow = CopyRow_NEON;
   }
 #endif
diff --git a/source/rotate.cc b/source/rotate.cc
index 15ac961ac..8f9883f47 100644
--- a/source/rotate.cc
+++ b/source/rotate.cc
@@ -859,7 +859,7 @@ void RotatePlane180(const uint8* src, int src_stride,
 #endif
   void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
 #if defined(HAS_COPYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 64)) {
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
     CopyRow = CopyRow_NEON;
   }
 #endif
diff --git a/source/rotate_argb.cc b/source/rotate_argb.cc
index 9c9944674..7dcefa385 100644
--- a/source/rotate_argb.cc
+++ b/source/rotate_argb.cc
@@ -90,7 +90,7 @@ void ARGBRotate180(const uint8* src, int src_stride,
 #endif
   void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
 #if defined(HAS_COPYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width * 4, 64)) {
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width * 4, 32)) {
     CopyRow = CopyRow_NEON;
   }
 #endif
diff --git a/source/row_common.cc b/source/row_common.cc
index 1f54a07f6..cf826ea58 100644
--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -717,6 +717,21 @@ void SplitUV_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
   }
 }
 
+void MergeUV_C(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+               int width) {
+  for (int x = 0; x < width - 1; x += 2) {
+    dst_uv[0] = src_u[x];
+    dst_uv[1] = src_v[x];
+    dst_uv[2] = src_u[x + 1];
+    dst_uv[3] = src_v[x + 1];
+    dst_uv += 4;
+  }
+  if (width & 1) {
+    dst_uv[0] = src_u[width - 1];
+    dst_uv[1] = src_v[width - 1];
+  }
+}
+
 void CopyRow_C(const uint8* src, uint8* dst, int count) {
   memcpy(dst, src, count);
 }
diff --git a/source/row_neon.cc b/source/row_neon.cc
index ca0cab5c3..2a4d6b3a1 100644
--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@@ -345,7 +345,7 @@ void SplitUV_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
     "vld2.u8 {q0, q1}, [%0:128]! \n" // load 16 pairs of UV
     "subs %3, %3, #16 \n" // 16 processed per loop
     "vst1.u8 {q0}, [%1:128]! \n" // store U
-    "vst1.u8 {q1}, [%2:128]! \n" // Store V
+    "vst1.u8 {q1}, [%2:128]! \n" // store V
     "bgt 1b \n"
   : "+r"(src_uv), // %0
     "+r"(dst_u), // %1
@@ -355,6 +355,7 @@ void SplitUV_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
   : "memory", "cc", "q0", "q1" // Clobber List
   );
 }
+
 // Reads 16 pairs of UV and write even values to dst_u and odd to dst_v
 // Alignment requirement: Multiple of 16 pixels, pointers unaligned.
 void SplitUV_Unaligned_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
@@ -365,7 +366,7 @@ void SplitUV_Unaligned_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
     "vld2.u8 {q0, q1}, [%0]! \n" // load 16 pairs of UV
     "subs %3, %3, #16 \n" // 16 processed per loop
     "vst1.u8 {q0}, [%1]! \n" // store U
-    "vst1.u8 {q1}, [%2]! \n" // Store V
+    "vst1.u8 {q1}, [%2]! \n" // store V
     "bgt 1b \n"
   : "+r"(src_uv), // %0
     "+r"(dst_u), // %1
@@ -377,21 +378,43 @@ void SplitUV_Unaligned_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
 }
 #endif // HAS_SPLITUV_NEON
 
+#ifdef HAS_MERGEUV_NEON
+// Reads 16 U's and V's and writes out 16 pairs of UV.
+void MergeUV_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                  int width) {
+  asm volatile (
+    ".p2align 2 \n"
+  "1: \n"
+    "vld1.u8 {q0}, [%0]! \n" // load U
+    "vld1.u8 {q1}, [%1]! \n" // load V
+    "subs %3, %3, #16 \n" // 16 processed per loop
+    "vst2.u8 {q0, q1}, [%2]! \n" // store 16 pairs of UV
+    "bgt 1b \n"
+  :
+    "+r"(src_u), // %0
+    "+r"(src_v), // %1
+    "+r"(dst_uv), // %2
+    "+r"(width) // %3 // Output registers
+  : // Input registers
+  : "memory", "cc", "q0", "q1" // Clobber List
+  );
+}
+#endif // HAS_MERGEUV_NEON
 #ifdef HAS_COPYROW_NEON
-// Copy multiple of 64
+// Copy multiple of 32. vld4.u8 allows unaligned access and is fastest on A15.
 void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
   asm volatile (
     ".p2align 2 \n"
   "1: \n"
-    "vldm %0!, {q0, q1, q2, q3} \n" // load 64
-    "subs %2, %2, #64 \n" // 64 processed per loop
-    "vstm %1!, {q0, q1, q2, q3} \n" // store 64
+    "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // load 32
+    "subs %2, %2, #32 \n" // 32 processed per loop
+    "vst4.u8 {d0, d1, d2, d3}, [%1]! \n" // store 32
     "bgt 1b \n"
   : "+r"(src), // %0
     "+r"(dst), // %1
     "+r"(count) // %2 // Output registers
   : // Input registers
-  : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List
+  : "memory", "cc", "q0", "q1" // Clobber List
   );
 }
 #endif // HAS_COPYROW_NEON
@@ -403,7 +426,7 @@ void SetRow8_NEON(uint8* dst, uint32 v32, int count) {
     "vdup.u32 q0, %2 \n" // duplicate 4 ints
   "1: \n"
     "subs %1, %1, #16 \n" // 16 bytes per loop
-    "vst1.u32 {q0}, [%0]! \n" // store
+    "vst1.u8 {q0}, [%0]! 
\n" // store "bgt 1b \n" : "+r"(dst), // %0 "+r"(count) // %1 diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc index deda59687..c81a7ab3b 100644 --- a/unit_test/convert_test.cc +++ b/unit_test/convert_test.cc @@ -31,13 +31,15 @@ namespace libyuv { #define TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ - FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG) \ + FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF) \ TEST_F(libyuvTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \ const int kWidth = W1280; \ const int kHeight = 720; \ - align_buffer_16(src_y, kWidth * kHeight); \ - align_buffer_16(src_u, kWidth / SRC_SUBSAMP_X * kHeight / SRC_SUBSAMP_Y); \ - align_buffer_16(src_v, kWidth / SRC_SUBSAMP_X * kHeight / SRC_SUBSAMP_Y); \ + align_buffer_16(src_y, kWidth * kHeight + OFF); \ + align_buffer_16(src_u, \ + kWidth / SRC_SUBSAMP_X * kHeight / SRC_SUBSAMP_Y + OFF); \ + align_buffer_16(src_v, \ + kWidth / SRC_SUBSAMP_X * kHeight / SRC_SUBSAMP_Y + OFF); \ align_buffer_16(dst_y_c, kWidth * kHeight); \ align_buffer_16(dst_u_c, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y); \ align_buffer_16(dst_v_c, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y); \ @@ -47,26 +49,26 @@ TEST_F(libyuvTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \ srandom(time(NULL)); \ for (int i = 0; i < kHeight; ++i) \ for (int j = 0; j < kWidth; ++j) \ - src_y[(i * kWidth) + j] = (random() & 0xff); \ + src_y[(i * kWidth) + j + OFF] = (random() & 0xff); \ for (int i = 0; i < kHeight / SRC_SUBSAMP_Y; ++i) { \ for (int j = 0; j < kWidth / SRC_SUBSAMP_X; ++j) { \ - src_u[(i * kWidth / SRC_SUBSAMP_X) + j] = (random() & 0xff); \ - src_v[(i * kWidth / SRC_SUBSAMP_X) + j] = (random() & 0xff); \ + src_u[(i * kWidth / SRC_SUBSAMP_X) + j + OFF] = (random() & 0xff); \ + src_v[(i * kWidth / SRC_SUBSAMP_X) + j + OFF] = (random() & 0xff); \ } \ } \ MaskCpuFlags(0); \ - SRC_FMT_PLANAR##To##FMT_PLANAR(src_y, kWidth, \ - src_u, kWidth / SRC_SUBSAMP_X, \ - src_v, kWidth / SRC_SUBSAMP_X, \ + 
SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth, \ + src_u + OFF, kWidth / SRC_SUBSAMP_X, \ + src_v + OFF, kWidth / SRC_SUBSAMP_X, \ dst_y_c, kWidth, \ dst_u_c, kWidth / SUBSAMP_X, \ dst_v_c, kWidth / SUBSAMP_X, \ kWidth, NEG kHeight); \ MaskCpuFlags(-1); \ for (int i = 0; i < benchmark_iterations_; ++i) { \ - SRC_FMT_PLANAR##To##FMT_PLANAR(src_y, kWidth, \ - src_u, kWidth / SRC_SUBSAMP_X, \ - src_v, kWidth / SRC_SUBSAMP_X, \ + SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth, \ + src_u + OFF, kWidth / SRC_SUBSAMP_X, \ + src_v + OFF, kWidth / SRC_SUBSAMP_X, \ dst_y_opt, kWidth, \ dst_u_opt, kWidth / SUBSAMP_X, \ dst_v_opt, kWidth / SUBSAMP_X, \ @@ -120,11 +122,13 @@ TEST_F(libyuvTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \ #define TESTPLANARTOP(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \ TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ - FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, 1280, _Opt, +) \ + FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, 1280, _Opt, +, 0) \ TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ - FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, 1280, _Invert, -) \ + FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, 1280, _Unaligned, +, 1) \ TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ - FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, 1276, _Any, +) + FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, 1280, _Invert, -, 0) \ + TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, 1276, _Any, +, 0) TESTPLANARTOP(I420, 2, 2, I420, 2, 2) TESTPLANARTOP(I422, 2, 1, I420, 2, 2) @@ -137,13 +141,13 @@ TESTPLANARTOP(I420, 2, 2, I420Mirror, 2, 2) #define TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ - FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG) \ + FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF) \ TEST_F(libyuvTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \ const int kWidth = W1280; \ const int kHeight = 720; \ - align_buffer_16(src_y, kWidth * kHeight); \ + align_buffer_16(src_y, 
kWidth * kHeight + OFF); \
   align_buffer_16(src_uv, 2 * kWidth / SRC_SUBSAMP_X * \
-                  kHeight / SRC_SUBSAMP_Y); \
+                  kHeight / SRC_SUBSAMP_Y + OFF); \
   align_buffer_16(dst_y_c, kWidth * kHeight); \
   align_buffer_16(dst_u_c, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y); \
   align_buffer_16(dst_v_c, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y); \
@@ -153,23 +157,23 @@ TEST_F(libyuvTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
   srandom(time(NULL)); \
   for (int i = 0; i < kHeight; ++i) \
     for (int j = 0; j < kWidth; ++j) \
-      src_y[(i * kWidth) + j] = (random() & 0xff); \
+      src_y[(i * kWidth) + j + OFF] = (random() & 0xff); \
   for (int i = 0; i < kHeight / SRC_SUBSAMP_Y; ++i) { \
     for (int j = 0; j < 2 * kWidth / SRC_SUBSAMP_X; ++j) { \
-      src_uv[(i * 2 * kWidth / SRC_SUBSAMP_X) + j] = (random() & 0xff); \
+      src_uv[(i * 2 * kWidth / SRC_SUBSAMP_X) + j + OFF] = (random() & 0xff); \
     } \
   } \
   MaskCpuFlags(0); \
-  SRC_FMT_PLANAR##To##FMT_PLANAR(src_y, kWidth, \
-                                 src_uv, 2 * kWidth / SRC_SUBSAMP_X, \
+  SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth, \
+                                 src_uv + OFF, 2 * kWidth / SRC_SUBSAMP_X, \
                                  dst_y_c, kWidth, \
                                  dst_u_c, kWidth / SUBSAMP_X, \
                                  dst_v_c, kWidth / SUBSAMP_X, \
                                  kWidth, NEG kHeight); \
   MaskCpuFlags(-1); \
   for (int i = 0; i < benchmark_iterations_; ++i) { \
-    SRC_FMT_PLANAR##To##FMT_PLANAR(src_y, kWidth, \
-                                   src_uv, 2 * kWidth / SRC_SUBSAMP_X, \
+    SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth, \
+                                   src_uv + OFF, 2 * kWidth / SRC_SUBSAMP_X, \
                                    dst_y_opt, kWidth, \
                                    dst_u_opt, kWidth / SUBSAMP_X, \
                                    dst_v_opt, kWidth / SUBSAMP_X, \
@@ -222,14 +226,16 @@ TEST_F(libyuvTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
 #define TESTBIPLANARTOP(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
                         FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \
   TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
-                   FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, 1280, _Opt, +) \
+                   FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, 1280, _Opt, +, 0) \
   TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
-                   FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, 
1280, _Invert, -) \ + FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, 1280, _Unaligned, +, 1) \ TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ - FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, 1276, _Any, +) + FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, 1280, _Invert, -, 0) \ + TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, 1276, _Any, +, 0) TESTBIPLANARTOP(NV12, 2, 2, I420, 2, 2) - +TESTBIPLANARTOP(NV21, 2, 2, I420, 2, 2) #define TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ W1280, DIFF, N, NEG) \