From b4ddbaf549a1bf5572bf703fd2862d1eb7380c6a Mon Sep 17 00:00:00 2001 From: Justin Green Date: Thu, 3 Feb 2022 11:46:44 -0500 Subject: [PATCH] Add support for MM21. Add support for MM21 to NV12 and I420 conversion, and add SIMD optimizations for arm, aarch64, SSE2, and SSSE3 machines. Bug: libyuv:915, b/215425056 Change-Id: Iecb0c33287f35766a6169d4adf3b7397f1ba8b5d Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/3433269 Reviewed-by: Frank Barchard Commit-Queue: Justin Green --- README.chromium | 4 +- include/libyuv/convert.h | 28 ++++ include/libyuv/planar_functions.h | 12 ++ include/libyuv/row.h | 40 +++++ source/convert.cc | 54 +++++++ source/planar_functions.cc | 73 +++++++++- source/row_any.cc | 45 ++++++ source/row_common.cc | 24 +++ source/row_gcc.cc | 58 +++++++- source/row_neon.cc | 46 ++++++ source/row_neon64.cc | 23 +++ unit_test/convert_test.cc | 233 ++++++++++++++++-------------- unit_test/planar_test.cc | 101 +++++++++++++ 13 files changed, 628 insertions(+), 113 deletions(-) mode change 100755 => 100644 unit_test/planar_test.cc diff --git a/README.chromium b/README.chromium index b6a068144..5074954c5 100644 --- a/README.chromium +++ b/README.chromium @@ -1,8 +1,8 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1810 +Version: 1811 License: BSD License File: LICENSE Description: -libyuv is an open source project that includes YUV conversion and scaling functionality. \ No newline at end of file +libyuv is an open source project that includes YUV conversion and scaling functionality. diff --git a/include/libyuv/convert.h b/include/libyuv/convert.h index 93e7550be..bcdefb5df 100644 --- a/include/libyuv/convert.h +++ b/include/libyuv/convert.h @@ -106,6 +106,34 @@ int I422ToI444(const uint8_t* src_y, int width, int height); +// Convert MM21 to NV12. +LIBYUV_API +int MM21ToNV12(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height); + +// Convert MM21 to I420. +LIBYUV_API +int MM21ToI420(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + // Convert I422 to NV21. LIBYUV_API int I422ToNV21(const uint8_t* src_y, diff --git a/include/libyuv/planar_functions.h b/include/libyuv/planar_functions.h index 47f3446ac..1037d7320 100644 --- a/include/libyuv/planar_functions.h +++ b/include/libyuv/planar_functions.h @@ -93,6 +93,18 @@ void DetilePlane(const uint8_t* src_y, int height, int tile_height); +// Convert a UV plane of tiles of 16 x H into linear U and V planes. +LIBYUV_API +void DetileSplitUVPlane(const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height, + int tile_height); + // Split interleaved UV plane into separate U and V planes. 
LIBYUV_API void SplitUVPlane(const uint8_t* src_uv, diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 12e53dbe2..8cac886ee 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -290,6 +290,8 @@ extern "C" { #define HAS_AB64TOARGBROW_SSSE3 #define HAS_CONVERT16TO8ROW_SSSE3 #define HAS_CONVERT8TO16ROW_SSE2 +#define HAS_DETILEROW_SSE2 +#define HAS_DETILESPLITUVROW_SSSE3 #define HAS_HALFMERGEUVROW_SSSE3 #define HAS_I210TOAR30ROW_SSSE3 #define HAS_I210TOARGBROW_SSSE3 @@ -537,6 +539,7 @@ extern "C" { #define HAS_GAUSSROW_F32_NEON #define HAS_GAUSSCOL_F32_NEON #define HAS_DETILEROW_NEON +#define HAS_DETILESPLITUVROW_NEON #endif #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) #define HAS_ABGRTOUVROW_MSA @@ -1839,6 +1842,43 @@ void DetileRow_NEON(const uint8_t* src, ptrdiff_t src_tile_stride, uint8_t* dst, int width); +void DetileRow_Any_NEON(const uint8_t* src, + ptrdiff_t src_tile_stride, + uint8_t* dst, + int width); +void DetileRow_SSE2(const uint8_t* src, + ptrdiff_t src_tile_stride, + uint8_t* dst, + int width); +void DetileRow_Any_SSE2(const uint8_t* src, + ptrdiff_t src_tile_stride, + uint8_t* dst, + int width); +void DetileSplitUVRow_C(const uint8_t* src_uv, + ptrdiff_t src_tile_stride, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void DetileSplitUVRow_SSSE3(const uint8_t* src_uv, + ptrdiff_t src_tile_stride, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void DetileSplitUVRow_Any_SSSE3(const uint8_t* src_uv, + ptrdiff_t src_tile_stride, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void DetileSplitUVRow_NEON(const uint8_t* src_uv, + ptrdiff_t src_tile_stride, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void DetileSplitUVRow_Any_NEON(const uint8_t* src_uv, + ptrdiff_t src_tile_stride, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void MergeUVRow_C(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, diff --git a/source/convert.cc b/source/convert.cc index b54f88b7d..9c5e8aa8f 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -564,6 +564,60 @@ int I422ToNV21(const uint8_t* src_y, return 0; } +LIBYUV_API +int MM21ToNV12(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + if (!src_uv || !dst_uv || width <= 0) { + return -1; + } + + int sign = height < 0 ? -1 : 1; + + if (dst_y) { + DetilePlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height, 32); + } + DetilePlane(src_uv, src_stride_uv, dst_uv, dst_stride_uv, (width + 1) & ~1, + (height + sign) / 2, 16); + + return 0; +} + +LIBYUV_API +int MM21ToI420(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int sign = height < 0 ? -1 : 1; + + if (!src_uv || !dst_u || !dst_v || width <= 0) { + return -1; + } + + if (dst_y) { + DetilePlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height, 32); + } + DetileSplitUVPlane(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, + dst_stride_v, (width + 1) & ~1, (height + sign) / 2, 16); + + return 0; +} + #ifdef I422TONV21_ROW_VERSION // Unittest fails for this version. 
// 422 chroma is 1/2 width, 1x height
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index d7cb8dc77..f2f2d6951 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -882,9 +882,20 @@ void DetilePlane(const uint8_t* src_y,
     dst_stride_y = -dst_stride_y;
   }
 
+#if defined(HAS_DETILEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    DetileRow = DetileRow_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      DetileRow = DetileRow_SSE2;
+    }
+  }
+#endif
 #if defined(HAS_DETILEROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
-    DetileRow = DetileRow_NEON;
+  if (TestCpuFlag(kCpuHasNEON)) {
+    DetileRow = DetileRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      DetileRow = DetileRow_NEON;
+    }
   }
 #endif
 
@@ -900,6 +911,64 @@
   }
 }
 
+LIBYUV_API
+void DetileSplitUVPlane(const uint8_t* src_uv,
+                        int src_stride_uv,
+                        uint8_t* dst_u,
+                        int dst_stride_u,
+                        uint8_t* dst_v,
+                        int dst_stride_v,
+                        int width,
+                        int height,
+                        int tile_height) {
+  const ptrdiff_t src_tile_stride = 16 * tile_height;
+  int y;
+  void (*DetileSplitUVRow)(const uint8_t* src, ptrdiff_t src_tile_stride,
+                           uint8_t* dst_u, uint8_t* dst_v, int width) =
+      DetileSplitUVRow_C;
+  assert(src_stride_uv >= 0);
+  assert(tile_height > 0);
+  assert(src_stride_uv > 0);
+
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_u = dst_u + (height - 1) * dst_stride_u;
+    dst_stride_u = -dst_stride_u;
+    dst_v = dst_v + (height - 1) * dst_stride_v;
+    dst_stride_v = -dst_stride_v;
+  }
+
+#if defined(HAS_DETILESPLITUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    DetileSplitUVRow = DetileSplitUVRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      DetileSplitUVRow = DetileSplitUVRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_DETILESPLITUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    DetileSplitUVRow = DetileSplitUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      DetileSplitUVRow = DetileSplitUVRow_NEON;
+    }
+  }
+#endif
+
+  // Detile plane
+  for (y = 0; y < height; ++y) {
+    DetileSplitUVRow(src_uv, src_tile_stride, dst_u, dst_v, width);
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+    src_uv += 16;
+    // Advance to next row of tiles.
+    if ((y & (tile_height - 1)) == (tile_height - 1)) {
+      src_uv = src_uv - src_tile_stride + src_stride_uv * tile_height;
+    }
+  }
+}
+
 // Support function for NV12 etc RGB channels.
 // Width and height are plane sizes (typically half pixel width).
LIBYUV_API diff --git a/source/row_any.cc b/source/row_any.cc index e45f85366..755851283 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -2059,6 +2059,51 @@ ANY11S(AYUVToVURow_Any_NEON, AYUVToVURow_NEON, 0, 4, 15) #endif #undef ANY11S +#define ANYDETILE(NAMEANY, ANY_SIMD, MASK) \ + void NAMEANY(const uint8_t* src, ptrdiff_t src_tile_stride, uint8_t* dst, \ + int width) { \ + SIMD_ALIGNED(uint8_t temp[16 * 2]); \ + memset(temp, 0, 16); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src, src_tile_stride, dst, n); \ + } \ + memcpy(temp, src + (n / 16) * src_tile_stride, r); \ + ANY_SIMD(temp, src_tile_stride, temp + 16, MASK + 1); \ + memcpy(dst + n, temp + 16, r); \ + } + +#ifdef HAS_DETILEROW_NEON +ANYDETILE(DetileRow_Any_NEON, DetileRow_NEON, 15) +#endif +#ifdef HAS_DETILEROW_SSE2 +ANYDETILE(DetileRow_Any_SSE2, DetileRow_SSE2, 15) +#endif + +#define ANYDETILESPLITUV(NAMEANY, ANY_SIMD, MASK) \ + void NAMEANY(const uint8_t* src_uv, ptrdiff_t src_tile_stride, \ + uint8_t* dst_u, uint8_t* dst_v, int width) { \ + SIMD_ALIGNED(uint8_t temp[16 * 2]); \ + memset(temp, 0, 16 * 2); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_uv, src_tile_stride, dst_u, dst_v, n); \ + } \ + memcpy(temp, src_uv + (n / 16) * src_tile_stride, r); \ + ANY_SIMD(temp, src_tile_stride, temp + 16, temp + 24, r); \ + memcpy(dst_u + n / 2, temp + 16, (r + 1) / 2); \ + memcpy(dst_v + n / 2, temp + 24, (r + 1) / 2); \ + } + +#ifdef HAS_DETILESPLITUVROW_NEON +ANYDETILESPLITUV(DetileSplitUVRow_Any_NEON, DetileSplitUVRow_NEON, 15) +#endif +#ifdef HAS_DETILESPLITUVROW_SSSE3 +ANYDETILESPLITUV(DetileSplitUVRow_Any_SSSE3, DetileSplitUVRow_SSSE3, 15) +#endif + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/source/row_common.cc b/source/row_common.cc index 84b395b64..3e5f419dd 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -2674,6 +2674,30 @@ void DetileRow_C(const uint8_t* src, } } +void DetileSplitUVRow_C(const uint8_t* src_uv, + ptrdiff_t src_tile_stride, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int tile; + for (tile = 0; tile < width / 16; tile++) { + for (int x = 0; x < 8; x++) { + *dst_u++ = src_uv[0]; + *dst_v++ = src_uv[1]; + src_uv += 2; + } + src_uv += src_tile_stride - 16; + } + for (int x = 0; x < (width & 0xF) / 2; ++x) { + *dst_u = *src_uv; + dst_u++; + src_uv++; + *dst_v = *src_uv; + dst_v++; + src_uv++; + } +} + void SplitUVRow_C(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, diff --git a/source/row_gcc.cc b/source/row_gcc.cc index 5f7ab0c74..85376e4b0 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -9,7 +9,6 @@ */ #include "libyuv/row.h" - #ifdef __cplusplus namespace libyuv { extern "C" { @@ -4765,6 +4764,63 @@ void SplitUVRow_SSE2(const uint8_t* src_uv, } #endif // HAS_SPLITUVROW_SSE2 +#ifdef HAS_DETILEROW_SSE2 +void DetileRow_SSE2(const uint8_t* src, + ptrdiff_t src_tile_stride, + uint8_t* dst, + int width) { + asm volatile( + "1: \n" + "movdqu (%0),%%xmm0 \n" + "sub $0x10,%2 \n" + "lea (%0,%3),%0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"(src_tile_stride) // %3 + : "cc", "memory", "xmm0"); +} +#endif // HAS_DETILEROW_SSE2 + +#ifdef HAS_DETILESPLITUVROW_SSSE3 +// TODO(greenjustin): Look into generating these constants instead of loading +// them since this can cause branch mispredicts for fPIC code on 32-bit +// machines. 
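+// kDeinterlaceUV gathers the even-indexed (U) bytes of a 16-byte UV row into
+// the low 8 bytes of the vector and the odd-indexed (V) bytes into the high
+// 8 bytes, so a single pshufb deinterleaves one row of a UV tile.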
+static const uvec8 kDeinterlaceUV = {0, 2, 4, 6, 8, 10, 12, 14, + 1, 3, 5, 7, 9, 11, 13, 15}; + +// TODO(greenjustin): Research alternatives to pshufb, since pshufb can be very +// slow on older SSE2 processors. +void DetileSplitUVRow_SSSE3(const uint8_t* src_uv, + ptrdiff_t src_tile_stride, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "movdqu %4,%%xmm1 \n" + "1: \n" + "movdqu (%0),%%xmm0 \n" + "lea (%0, %5),%0 \n" + "pshufb %%xmm1,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "movhps %%xmm0,(%2) \n" + "lea 0x8(%2),%2 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "m"(kDeinterlaceUV), // %4 + "r"(src_tile_stride) // %5 + : "cc", "memory", "xmm0", "xmm1"); +} +#endif // HAS_DETILESPLITUVROW_SSSE3 + #ifdef HAS_MERGEUVROW_AVX2 void MergeUVRow_AVX2(const uint8_t* src_u, const uint8_t* src_v, diff --git a/source/row_neon.cc b/source/row_neon.cc index 543ebec9e..4781e2f6a 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -575,6 +575,52 @@ void SplitUVRow_NEON(const uint8_t* src_uv, ); } +// Reads 16 byte Y's from tile and writes out 16 Y's. +// MM21 Y tiles are 16x32 so src_tile_stride = 512 bytes +// MM21 UV tiles are 8x16 so src_tile_stride = 256 bytes +// width measured in bytes so 8 UV = 16. +void DetileRow_NEON(const uint8_t* src, + ptrdiff_t src_tile_stride, + uint8_t* dst, + int width) { + asm volatile( + "1: \n" + "vld1.16 {q0}, [%0], %3 \n" // load 16 bytes + "subs %2, %2, #16 \n" // 16 processed per loop + "pld [%0, 1792] \n" + "vst1.16 {q0}, [%1]! \n" // store 16 bytes + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"(src_tile_stride) // %3 + : "cc", "memory", "q0" // Clobber List + ); +} + +// Read 16 bytes of UV, detile, and write 8 bytes of U and 8 bytes of V. +void DetileSplitUVRow_NEON(const uint8_t* src_uv, + ptrdiff_t src_tile_stride, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "1: \n" + "vld2.8 {d0, d1}, [%0], %4 \n" + "subs %3, %3, #16 \n" + "pld [%0, 1792] \n" + "vst1.8 {d0}, [%1]! \n" + "vst1.8 {d1}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"(src_tile_stride) // %4 + : "cc", "memory", "d0", "d1" // Clobber List + ); +} + // Reads 16 U's and V's and writes out 16 pairs of UV. void MergeUVRow_NEON(const uint8_t* src_u, const uint8_t* src_v, diff --git a/source/row_neon64.cc b/source/row_neon64.cc index 1d1f9bb10..442e60cdc 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -627,6 +627,29 @@ void DetileRow_NEON(const uint8_t* src, ); } +// Read 16 bytes of UV, detile, and write 8 bytes of U and 8 bytes of V. +void DetileSplitUVRow_NEON(const uint8_t* src_uv, + ptrdiff_t src_tile_stride, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "1: \n" + "ld2 {v0.8b,v1.8b}, [%0], %4 \n" + "subs %w3, %w3, #16 \n" + "prfm pldl1keep, [%0, 1792] \n" + "st1 {v0.8b}, [%1], #8 \n" + "st1 {v1.8b}, [%2], #8 \n" + "b.gt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"(src_tile_stride) // %4 + : "cc", "memory", "v0", "v1" // Clobber List + ); +} + #if LIBYUV_USE_ST2 // Reads 16 U's and V's and writes out 16 pairs of UV. 
void MergeUVRow_NEON(const uint8_t* src_u,
diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc
index 741812656..20ac91c83 100644
--- a/unit_test/convert_test.cc
+++ b/unit_test/convert_test.cc
@@ -419,7 +419,7 @@ TESTPLANARTOBP(I212, uint16_t, 2, 2, 1, P212, uint16_t, 2, 2, 1, 12)
 #define TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
                           SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \
                           DST_SUBSAMP_X, DST_SUBSAMP_Y, W1280, N, NEG, OFF, \
-                          DOY, SRC_DEPTH) \
+                          DOY, SRC_DEPTH, TILE_WIDTH, TILE_HEIGHT) \
   TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
     static_assert(SRC_BPC == 1 || SRC_BPC == 2, "SRC BPC unsupported"); \
     static_assert(DST_BPC == 1 || DST_BPC == 2, "DST BPC unsupported"); \
@@ -433,13 +433,18 @@ TESTPLANARTOBP(I212, uint16_t, 2, 2, 1, P212, uint16_t, 2, 2, 1, 12)
                   "DST_SUBSAMP_Y unsupported"); \
     const int kWidth = W1280; \
     const int kHeight = benchmark_height_; \
-    const int kSrcHalfWidth = SUBSAMPLE(kWidth, SRC_SUBSAMP_X); \
-    const int kSrcHalfHeight = SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); \
+    const int kSrcHalfWidth = SUBSAMPLE(kWidth, SRC_SUBSAMP_X); \
     const int kDstHalfWidth = SUBSAMPLE(kWidth, DST_SUBSAMP_X); \
     const int kDstHalfHeight = SUBSAMPLE(kHeight, DST_SUBSAMP_Y); \
-    align_buffer_page_end(src_y, kWidth* kHeight* SRC_BPC + OFF); \
-    align_buffer_page_end(src_uv, \
-                          2 * kSrcHalfWidth * kSrcHalfHeight * SRC_BPC + OFF); \
+    const int kPaddedWidth = (kWidth + (TILE_WIDTH - 1)) & ~(TILE_WIDTH - 1); \
+    const int kPaddedHeight = \
+        (kHeight + (TILE_HEIGHT - 1)) & ~(TILE_HEIGHT - 1); \
+    const int kSrcHalfPaddedWidth = SUBSAMPLE(kPaddedWidth, SRC_SUBSAMP_X); \
+    const int kSrcHalfPaddedHeight = SUBSAMPLE(kPaddedHeight, SRC_SUBSAMP_Y); \
+    align_buffer_page_end(src_y, kPaddedWidth* kPaddedHeight* SRC_BPC + OFF); \
+    align_buffer_page_end( \
+        src_uv, \
+        2 * kSrcHalfPaddedWidth * kSrcHalfPaddedHeight * SRC_BPC + OFF); \
     align_buffer_page_end(dst_y_c, kWidth* kHeight* DST_BPC); \
     align_buffer_page_end(dst_uv_c, \
                           2 * kDstHalfWidth * kDstHalfHeight * DST_BPC); \
@@ -448,11 +453,11 @@ TESTPLANARTOBP(I212, uint16_t, 2, 2, 1, P212, uint16_t, 2, 2, 1, 12)
                           2 * kDstHalfWidth * kDstHalfHeight * DST_BPC); \
     SRC_T* src_y_p = reinterpret_cast<SRC_T*>(src_y + OFF); \
     SRC_T* src_uv_p = reinterpret_cast<SRC_T*>(src_uv + OFF); \
-    for (int i = 0; i < kWidth * kHeight; ++i) { \
+    for (int i = 0; i < kPaddedWidth * kPaddedHeight; ++i) { \
       src_y_p[i] = \
           (fastrand() & (((SRC_T)(-1)) << ((8 * SRC_BPC) - SRC_DEPTH))); \
     } \
-    for (int i = 0; i < kSrcHalfWidth * kSrcHalfHeight * 2; ++i) { \
+    for (int i = 0; i < kSrcHalfPaddedWidth * kSrcHalfPaddedHeight * 2; ++i) { \
       src_uv_p[i] = \
          (fastrand() & (((SRC_T)(-1)) << ((8 * SRC_BPC) - SRC_DEPTH))); \
     } \
@@ -497,136 +502,148 @@ TESTPLANARTOBP(I212, uint16_t, 2, 2, 1, P212, uint16_t, 2, 2, 1, 12)
 #define TESTBIPLANARTOBP(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
                          SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \
-                         DST_SUBSAMP_X, DST_SUBSAMP_Y, SRC_DEPTH) \
+                         DST_SUBSAMP_X, DST_SUBSAMP_Y, SRC_DEPTH, TILE_WIDTH, \
+                         TILE_HEIGHT) \
   TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
                     SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
                     DST_SUBSAMP_Y, benchmark_width_ + 1, _Any, +, 0, 1, \
-                    SRC_DEPTH) \
+                    SRC_DEPTH, TILE_WIDTH, TILE_HEIGHT) \
   TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
                     SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
                     DST_SUBSAMP_Y, benchmark_width_, _Unaligned, +, 2, 1, \
-                    SRC_DEPTH) \
+                    SRC_DEPTH, TILE_WIDTH, TILE_HEIGHT) \
   TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
                     SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
                     DST_SUBSAMP_Y, benchmark_width_, _Invert, -, 0, 1, \
-                    SRC_DEPTH) \
+                    SRC_DEPTH, TILE_WIDTH, TILE_HEIGHT) \
   TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
                     SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
-                    DST_SUBSAMP_Y, benchmark_width_, _Opt, +, 0, 1, SRC_DEPTH) \
+                    DST_SUBSAMP_Y, benchmark_width_, _Opt, +, 0, 1, SRC_DEPTH, \
+                    TILE_WIDTH, TILE_HEIGHT) \
   TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
                     SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
                     DST_SUBSAMP_Y, benchmark_width_, _NullY, +, 0, 0, \
-                    SRC_DEPTH)
+                    SRC_DEPTH, TILE_WIDTH, TILE_HEIGHT)
 
-TESTBIPLANARTOBP(NV21, uint8_t, 1, 2, 2, NV12, uint8_t, 1, 2, 2, 8)
-TESTBIPLANARTOBP(NV12, uint8_t, 1, 2, 2, NV12Mirror, uint8_t, 1, 2, 2, 8)
-TESTBIPLANARTOBP(NV12, uint8_t, 1, 2, 2, NV24, uint8_t, 1, 1, 1, 8)
-TESTBIPLANARTOBP(NV16, uint8_t, 1, 2, 1, NV24, uint8_t, 1, 1, 1, 8)
-TESTBIPLANARTOBP(P010, uint16_t, 2, 2, 2, P410, uint16_t, 2, 1, 1, 10)
-TESTBIPLANARTOBP(P210, uint16_t, 2, 2, 1, P410, uint16_t, 2, 1, 1, 10)
-TESTBIPLANARTOBP(P012, uint16_t, 2, 2, 2, P412, uint16_t, 2, 1, 1, 10)
-TESTBIPLANARTOBP(P212, uint16_t, 2, 2, 1, P412, uint16_t, 2, 1, 1, 12)
-TESTBIPLANARTOBP(P016, uint16_t, 2, 2, 2, P416, uint16_t, 2, 1, 1, 12)
-TESTBIPLANARTOBP(P216, uint16_t, 2, 2, 1, P416, uint16_t, 2, 1, 1, 12)
+TESTBIPLANARTOBP(NV21, uint8_t, 1, 2, 2, NV12, uint8_t, 1, 2, 2, 8, 1, 1)
+TESTBIPLANARTOBP(NV12, uint8_t, 1, 2, 2, NV12Mirror, uint8_t, 1, 2, 2, 8, 1, 1)
+TESTBIPLANARTOBP(NV12, uint8_t, 1, 2, 2, NV24, uint8_t, 1, 1, 1, 8, 1, 1)
+TESTBIPLANARTOBP(NV16, uint8_t, 1, 2, 1, NV24, uint8_t, 1, 1, 1, 8, 1, 1)
+TESTBIPLANARTOBP(P010, uint16_t, 2, 2, 2, P410, uint16_t, 2, 1, 1, 10, 1, 1)
+TESTBIPLANARTOBP(P210, uint16_t, 2, 2, 1, P410, uint16_t, 2, 1, 1, 10, 1, 1)
+TESTBIPLANARTOBP(P012, uint16_t, 2, 2, 2, P412, uint16_t, 2, 1, 1, 10, 1, 1)
+TESTBIPLANARTOBP(P212, uint16_t, 2, 2, 1, P412, uint16_t, 2, 1, 1, 12, 1, 1)
+TESTBIPLANARTOBP(P016, uint16_t, 2, 2, 2, P416, uint16_t, 2, 1, 1, 12, 1, 1)
+TESTBIPLANARTOBP(P216, uint16_t, 2, 2, 1, P416, uint16_t, 2, 1, 1, 12, 1, 1)
+TESTBIPLANARTOBP(MM21, uint8_t, 1, 2, 2, NV12, uint8_t, 1, 2, 2, 8, 16, 32)
 
-#define TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
-                         SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \
-                         DST_SUBSAMP_X, DST_SUBSAMP_Y, W1280, N, NEG, OFF, \
-                         SRC_DEPTH) \
-  TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
-    static_assert(SRC_BPC == 1 || SRC_BPC == 2, "SRC BPC unsupported"); \
-    static_assert(DST_BPC == 1 || DST_BPC == 2, "DST BPC unsupported"); \
-    static_assert(SRC_SUBSAMP_X == 1 || SRC_SUBSAMP_X == 2, \
-                  "SRC_SUBSAMP_X unsupported"); \
-    static_assert(SRC_SUBSAMP_Y == 1 || SRC_SUBSAMP_Y == 2, \
-                  "SRC_SUBSAMP_Y unsupported"); \
-    static_assert(DST_SUBSAMP_X == 1 || DST_SUBSAMP_X == 2, \
-                  "DST_SUBSAMP_X unsupported"); \
-    static_assert(DST_SUBSAMP_Y == 1 || DST_SUBSAMP_Y == 2, \
-                  "DST_SUBSAMP_Y unsupported"); \
-    const int kWidth = W1280; \
-    const int kHeight = benchmark_height_; \
-    const int kSrcHalfWidth = SUBSAMPLE(kWidth, SRC_SUBSAMP_X); \
-    const int kSrcHalfHeight = SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); \
-    const int kDstHalfWidth = SUBSAMPLE(kWidth, DST_SUBSAMP_X); \
-    const int kDstHalfHeight = SUBSAMPLE(kHeight, DST_SUBSAMP_Y); \
-    align_buffer_page_end(src_y, kWidth* kHeight* SRC_BPC + OFF); \
-    align_buffer_page_end(src_uv, \
-                          kSrcHalfWidth* kSrcHalfHeight* SRC_BPC * 2 + OFF); \
-    align_buffer_page_end(dst_y_c, kWidth* kHeight* DST_BPC); \
-    align_buffer_page_end(dst_u_c, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
-    align_buffer_page_end(dst_v_c, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
-    align_buffer_page_end(dst_y_opt, kWidth* kHeight* DST_BPC); \
-    align_buffer_page_end(dst_u_opt, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
-    align_buffer_page_end(dst_v_opt, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
-    SRC_T* src_y_p = reinterpret_cast<SRC_T*>(src_y + OFF); \
-    SRC_T* src_uv_p = reinterpret_cast<SRC_T*>(src_uv + OFF); \
-    for (int i = 0; i < kWidth * kHeight; ++i) { \
-      src_y_p[i] = \
-          (fastrand() & (((SRC_T)(-1)) << ((8 * SRC_BPC) - SRC_DEPTH))); \
-    } \
-    for (int i = 0; i < kSrcHalfWidth * kSrcHalfHeight * 2; ++i) { \
-      src_uv_p[i] = \
-          (fastrand() & (((SRC_T)(-1)) << ((8 * SRC_BPC) - SRC_DEPTH))); \
-    } \
-    memset(dst_y_c, 1, kWidth* kHeight* DST_BPC); \
-    memset(dst_u_c, 2, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
-    memset(dst_v_c, 3, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
-    memset(dst_y_opt, 101, kWidth* kHeight* DST_BPC); \
-    memset(dst_u_opt, 102, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
-    memset(dst_v_opt, 103, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
-    MaskCpuFlags(disable_cpu_flags_); \
-    SRC_FMT_PLANAR##To##FMT_PLANAR( \
-        src_y_p, kWidth, src_uv_p, kSrcHalfWidth * 2, \
-        reinterpret_cast<DST_T*>(dst_y_c), kWidth, \
-        reinterpret_cast<DST_T*>(dst_u_c), kDstHalfWidth, \
-        reinterpret_cast<DST_T*>(dst_v_c), kDstHalfWidth, kWidth, \
-        NEG kHeight); \
-    MaskCpuFlags(benchmark_cpu_info_); \
-    for (int i = 0; i < benchmark_iterations_; ++i) { \
-      SRC_FMT_PLANAR##To##FMT_PLANAR( \
-          src_y_p, kWidth, src_uv_p, kSrcHalfWidth * 2, \
-          reinterpret_cast<DST_T*>(dst_y_opt), kWidth, \
-          reinterpret_cast<DST_T*>(dst_u_opt), kDstHalfWidth, \
-          reinterpret_cast<DST_T*>(dst_v_opt), kDstHalfWidth, kWidth, \
-          NEG kHeight); \
-    } \
-    for (int i = 0; i < kHeight * kWidth * DST_BPC; ++i) { \
-      EXPECT_EQ(dst_y_c[i], dst_y_opt[i]); \
-    } \
-    for (int i = 0; i < kDstHalfWidth * kDstHalfHeight * DST_BPC; ++i) { \
-      EXPECT_EQ(dst_u_c[i], dst_u_opt[i]); \
-      EXPECT_EQ(dst_v_c[i], dst_v_opt[i]); \
-    } \
-    free_aligned_buffer_page_end(dst_y_c); \
-    free_aligned_buffer_page_end(dst_u_c); \
-    free_aligned_buffer_page_end(dst_v_c); \
-    free_aligned_buffer_page_end(dst_y_opt); \
-    free_aligned_buffer_page_end(dst_u_opt); \
-    free_aligned_buffer_page_end(dst_v_opt); \
-    free_aligned_buffer_page_end(src_y); \
-    free_aligned_buffer_page_end(src_uv); \
+#define TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
+                         SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \
+                         DST_SUBSAMP_X, DST_SUBSAMP_Y, W1280, N, NEG, OFF, \
+                         SRC_DEPTH, TILE_WIDTH, TILE_HEIGHT) \
+  TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
+    static_assert(SRC_BPC == 1 || SRC_BPC == 2, "SRC BPC unsupported"); \
+    static_assert(DST_BPC == 1 || DST_BPC == 2, "DST BPC unsupported"); \
+    static_assert(SRC_SUBSAMP_X == 1 || SRC_SUBSAMP_X == 2, \
+                  "SRC_SUBSAMP_X unsupported"); \
+    static_assert(SRC_SUBSAMP_Y == 1 || SRC_SUBSAMP_Y == 2, \
+                  "SRC_SUBSAMP_Y unsupported"); \
+    static_assert(DST_SUBSAMP_X == 1 || DST_SUBSAMP_X == 2, \
+                  "DST_SUBSAMP_X unsupported"); \
+    static_assert(DST_SUBSAMP_Y == 1 || DST_SUBSAMP_Y == 2, \
+                  "DST_SUBSAMP_Y unsupported"); \
+    const int kWidth = W1280; \
+    const int kHeight = benchmark_height_; \
+    const int kSrcHalfWidth = SUBSAMPLE(kWidth, SRC_SUBSAMP_X); \
+    const int kDstHalfWidth = SUBSAMPLE(kWidth, DST_SUBSAMP_X); \
+    const int kDstHalfHeight = SUBSAMPLE(kHeight, DST_SUBSAMP_Y); \
+    const int kPaddedWidth = (kWidth + (TILE_WIDTH - 1)) & ~(TILE_WIDTH - 1); \
+    const int kPaddedHeight = \
+        (kHeight + (TILE_HEIGHT - 1)) & ~(TILE_HEIGHT - 1); \
+    const int kSrcHalfPaddedWidth = SUBSAMPLE(kPaddedWidth, SRC_SUBSAMP_X); \
+    const int kSrcHalfPaddedHeight = SUBSAMPLE(kPaddedHeight, SRC_SUBSAMP_Y); \
+    align_buffer_page_end(src_y, kPaddedWidth* kPaddedHeight* SRC_BPC + OFF); \
+    align_buffer_page_end( \
+        src_uv, kSrcHalfPaddedWidth* kSrcHalfPaddedHeight* SRC_BPC * 2 + OFF); \
+    align_buffer_page_end(dst_y_c, kWidth* kHeight* DST_BPC); \
+    align_buffer_page_end(dst_u_c, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
+    align_buffer_page_end(dst_v_c, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
+    align_buffer_page_end(dst_y_opt, kWidth* kHeight* DST_BPC); \
+    align_buffer_page_end(dst_u_opt, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
+    align_buffer_page_end(dst_v_opt, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
+    SRC_T* src_y_p = reinterpret_cast<SRC_T*>(src_y + OFF); \
+    SRC_T* src_uv_p = reinterpret_cast<SRC_T*>(src_uv + OFF); \
+    for (int i = 0; i < kPaddedWidth * kPaddedHeight; ++i) { \
+      src_y_p[i] = \
+          (fastrand() & (((SRC_T)(-1)) << ((8 * SRC_BPC) - SRC_DEPTH))); \
+    } \
+    for (int i = 0; i < kSrcHalfPaddedWidth * kSrcHalfPaddedHeight * 2; ++i) { \
+      src_uv_p[i] = \
+          (fastrand() & (((SRC_T)(-1)) << ((8 * SRC_BPC) - SRC_DEPTH))); \
+    } \
+    memset(dst_y_c, 1, kWidth* kHeight* DST_BPC); \
+    memset(dst_u_c, 2, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
+    memset(dst_v_c, 3, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
+    memset(dst_y_opt, 101, kWidth* kHeight* DST_BPC); \
+    memset(dst_u_opt, 102, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
+    memset(dst_v_opt, 103, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
+    MaskCpuFlags(disable_cpu_flags_); \
+    SRC_FMT_PLANAR##To##FMT_PLANAR( \
+        src_y_p, kWidth, src_uv_p, kSrcHalfWidth * 2, \
+        reinterpret_cast<DST_T*>(dst_y_c), kWidth, \
+        reinterpret_cast<DST_T*>(dst_u_c), kDstHalfWidth, \
+        reinterpret_cast<DST_T*>(dst_v_c), kDstHalfWidth, kWidth, \
+        NEG kHeight); \
+    MaskCpuFlags(benchmark_cpu_info_); \
+    for (int i = 0; i < benchmark_iterations_; ++i) { \
+      SRC_FMT_PLANAR##To##FMT_PLANAR( \
+          src_y_p, kWidth, src_uv_p, kSrcHalfWidth * 2, \
+          reinterpret_cast<DST_T*>(dst_y_opt), kWidth, \
+          reinterpret_cast<DST_T*>(dst_u_opt), kDstHalfWidth, \
+          reinterpret_cast<DST_T*>(dst_v_opt), kDstHalfWidth, kWidth, \
+          NEG kHeight); \
+    } \
+    for (int i = 0; i < kHeight * kWidth * DST_BPC; ++i) { \
+      EXPECT_EQ(dst_y_c[i], dst_y_opt[i]); \
+    } \
+    for (int i = 0; i < kDstHalfWidth * kDstHalfHeight * DST_BPC; ++i) { \
+      EXPECT_EQ(dst_u_c[i], dst_u_opt[i]); \
+      EXPECT_EQ(dst_v_c[i], dst_v_opt[i]); \
+    } \
+    free_aligned_buffer_page_end(dst_y_c); \
+    free_aligned_buffer_page_end(dst_u_c); \
+    free_aligned_buffer_page_end(dst_v_c); \
+    free_aligned_buffer_page_end(dst_y_opt); \
+    free_aligned_buffer_page_end(dst_u_opt); \
+    free_aligned_buffer_page_end(dst_v_opt); \
+    free_aligned_buffer_page_end(src_y); \
+    free_aligned_buffer_page_end(src_uv); \
  }
 
 #define TESTBIPLANARTOP(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
                         SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \
-                        DST_SUBSAMP_X, DST_SUBSAMP_Y, SRC_DEPTH) \
+                        DST_SUBSAMP_X, DST_SUBSAMP_Y, SRC_DEPTH, TILE_WIDTH, \
+                        TILE_HEIGHT) \
   TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
                    SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
-                   DST_SUBSAMP_Y, benchmark_width_ + 1, _Any, +, 0, SRC_DEPTH) \
+                   DST_SUBSAMP_Y, benchmark_width_ + 1, _Any, +, 0, SRC_DEPTH, \
+                   TILE_WIDTH, TILE_HEIGHT) \
   TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
                   SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
                   DST_SUBSAMP_Y, benchmark_width_, _Unaligned, +, 2, \
-                  SRC_DEPTH) \
+                  SRC_DEPTH, TILE_WIDTH, TILE_HEIGHT) \
   TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
                    SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
-                   DST_SUBSAMP_Y, benchmark_width_, _Invert, -, 0, SRC_DEPTH) \
+                   DST_SUBSAMP_Y, benchmark_width_, _Invert, -, 0, SRC_DEPTH, \
+                   TILE_WIDTH, TILE_HEIGHT) \
   TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
                    SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
-                   DST_SUBSAMP_Y, benchmark_width_, _Opt, +, 0, SRC_DEPTH)
+                   DST_SUBSAMP_Y, benchmark_width_, _Opt, +, 0, SRC_DEPTH, \
+                   TILE_WIDTH, TILE_HEIGHT)
 
-TESTBIPLANARTOP(NV12, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2, 8)
-TESTBIPLANARTOP(NV21, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2, 8)
+TESTBIPLANARTOP(NV12, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2, 8, 1, 1)
+TESTBIPLANARTOP(NV21, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2, 8, 1, 1)
+TESTBIPLANARTOP(MM21, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2, 8, 16, 32)
 
 // Provide matrix wrappers for full range bt.709
 #define F420ToABGR(a, b, c, d, e, f, g, h, i, j) \
diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc
old mode 100755
new mode 100644
index bdbdb6a45..118259876
--- a/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
@@ -1523,6 +1523,107 @@ TEST_F(LibYUVPlanarTest, TestDetilePlane) {
   free_aligned_buffer_page_end(dst_opt);
 }
 
+TEST_F(LibYUVPlanarTest, TestDetileSplitUVPlane_Benchmark) {
+  int i, j;
+
+  // orig is tiled. Allocate enough memory for tiles.
+  int orig_width = (benchmark_width_ + 15) & ~15;
+  int orig_height = (benchmark_height_ + 15) & ~15;
+  int orig_plane_size = orig_width * orig_height;
+  int u_plane_size = benchmark_width_ * benchmark_height_;
+  int v_plane_size = u_plane_size;
+  align_buffer_page_end(orig_uv, orig_plane_size);
+  align_buffer_page_end(dst_u_c, u_plane_size);
+  align_buffer_page_end(dst_u_opt, u_plane_size);
+  align_buffer_page_end(dst_v_c, v_plane_size);
+  align_buffer_page_end(dst_v_opt, v_plane_size);
+
+  MemRandomize(orig_uv, orig_plane_size);
+  memset(dst_u_c, 0, u_plane_size);
+  memset(dst_u_opt, 0, u_plane_size);
+  memset(dst_v_c, 0, v_plane_size);
+  memset(dst_v_opt, 0, v_plane_size);
+
+  // Disable all optimizations.
+  MaskCpuFlags(disable_cpu_flags_);
+  for (j = 0; j < benchmark_iterations_; j++) {
+    DetileSplitUVPlane(orig_uv, orig_width, dst_u_c, (benchmark_width_ + 1) / 2,
+                       dst_v_c, (benchmark_width_ + 1) / 2, benchmark_width_,
+                       benchmark_height_, 16);
+  }
+
+  // Enable optimizations.
+  MaskCpuFlags(benchmark_cpu_info_);
+  for (j = 0; j < benchmark_iterations_; j++) {
+    DetileSplitUVPlane(
+        orig_uv, orig_width, dst_u_opt, (benchmark_width_ + 1) / 2, dst_v_opt,
+        (benchmark_width_ + 1) / 2, benchmark_width_, benchmark_height_, 16);
+  }
+
+  for (i = 0; i < u_plane_size; ++i) {
+    EXPECT_EQ(dst_u_c[i], dst_u_opt[i]);
+  }
+  for (i = 0; i < v_plane_size; ++i) {
+    EXPECT_EQ(dst_v_c[i], dst_v_opt[i]);
+  }
+
+  free_aligned_buffer_page_end(orig_uv);
+  free_aligned_buffer_page_end(dst_u_c);
+  free_aligned_buffer_page_end(dst_u_opt);
+  free_aligned_buffer_page_end(dst_v_c);
+  free_aligned_buffer_page_end(dst_v_opt);
+}
+
+TEST_F(LibYUVPlanarTest, TestDetileSplitUVPlane_Correctness) {
+  int i, j;
+
+  // orig is tiled. Allocate enough memory for tiles.
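+  // An MM21 UV tile is 16 bytes wide (8 UV pairs) and 16 rows tall, so round
+  // both dimensions up to a multiple of 16 to cover whole tiles.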
+  int orig_width = (benchmark_width_ + 15) & ~15;
+  int orig_height = (benchmark_height_ + 15) & ~15;
+  int orig_plane_size = orig_width * orig_height;
+  int u_plane_size = benchmark_width_ * benchmark_height_;
+  int v_plane_size = u_plane_size;
+  align_buffer_page_end(orig_uv, orig_plane_size);
+  align_buffer_page_end(detiled_uv, orig_plane_size);
+  align_buffer_page_end(dst_u_two_stage, u_plane_size);
+  align_buffer_page_end(dst_u_opt, u_plane_size);
+  align_buffer_page_end(dst_v_two_stage, v_plane_size);
+  align_buffer_page_end(dst_v_opt, v_plane_size);
+
+  MemRandomize(orig_uv, orig_plane_size);
+  memset(detiled_uv, 0, orig_plane_size);
+  memset(dst_u_two_stage, 0, u_plane_size);
+  memset(dst_u_opt, 0, u_plane_size);
+  memset(dst_v_two_stage, 0, v_plane_size);
+  memset(dst_v_opt, 0, v_plane_size);
+
+  for (j = 0; j < benchmark_iterations_; j++) {
+    DetileSplitUVPlane(
+        orig_uv, orig_width, dst_u_opt, (benchmark_width_ + 1) / 2, dst_v_opt,
+        (benchmark_width_ + 1) / 2, benchmark_width_, benchmark_height_, 16);
+  }
+
+  DetilePlane(orig_uv, orig_width, detiled_uv, benchmark_width_,
+              benchmark_width_, benchmark_height_, 16);
+  SplitUVPlane(detiled_uv, benchmark_width_, dst_u_two_stage,
+               (benchmark_width_ + 1) / 2, dst_v_two_stage,
+               (benchmark_width_ + 1) / 2, (benchmark_width_ + 1) / 2,
+               benchmark_height_);
+
+  for (i = 0; i < u_plane_size; ++i) {
+    EXPECT_EQ(dst_u_two_stage[i], dst_u_opt[i]);
+  }
+  for (i = 0; i < v_plane_size; ++i) {
+    EXPECT_EQ(dst_v_two_stage[i], dst_v_opt[i]);
+  }
+
+  free_aligned_buffer_page_end(orig_uv);
+  free_aligned_buffer_page_end(detiled_uv);
+  free_aligned_buffer_page_end(dst_u_two_stage);
+  free_aligned_buffer_page_end(dst_u_opt);
+  free_aligned_buffer_page_end(dst_v_two_stage);
+  free_aligned_buffer_page_end(dst_v_opt);
+}
+
 static int TestMultiply(int width, int height, int benchmark_iterations,