From 610e0cdead3fcd3288693d18eab8c7323805ad9e Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Mon, 5 Dec 2022 16:10:38 -0800 Subject: [PATCH] MT2T Warning fixes for fuchsia Bug: b/258474032, b/257266635 Change-Id: Ic5cbbc60e2e1463361e359a2fe3e97976c1ea929 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/4081348 Commit-Queue: Frank Barchard Reviewed-by: Justin Green --- README.chromium | 2 +- include/libyuv/convert.h | 13 ++++++++ include/libyuv/row.h | 3 ++ include/libyuv/version.h | 2 +- source/compare.cc | 6 ++-- source/compare_gcc.cc | 2 +- source/convert.cc | 61 ++++++++++++++++++++++++++++++++++++++ source/mjpeg_decoder.cc | 4 +-- source/planar_functions.cc | 3 +- source/rotate_neon64.cc | 32 ++++++++++---------- source/row_common.cc | 27 +++++++++++++++++ source/row_neon.cc | 54 +++++++++++++++++++++++++++++++++ source/row_neon64.cc | 48 ++++++++++++++++++++++++++++++ unit_test/convert_test.cc | 20 +++++++++++++ 14 files changed, 252 insertions(+), 25 deletions(-) diff --git a/README.chromium b/README.chromium index 357334de0..ccd2ca06a 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1850 +Version: 1852 License: BSD License File: LICENSE diff --git a/include/libyuv/convert.h b/include/libyuv/convert.h index e1eb36b62..5c5231bb3 100644 --- a/include/libyuv/convert.h +++ b/include/libyuv/convert.h @@ -162,6 +162,19 @@ int MM21ToYUY2(const uint8_t* src_y, int width, int height); +// Convert MT2T to P010 +LIBYUV_API +int MT2TToP010(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_uv, + int dst_stride_uv, + int width, + int height); + // Convert I422 to NV21. 
LIBYUV_API int I422ToNV21(const uint8_t* src_y, diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 861c6d3e9..27f468add 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -455,6 +455,7 @@ extern "C" { #define HAS_DETILEROW_NEON #define HAS_DETILESPLITUVROW_NEON #define HAS_DETILETOYUY2_NEON +#define HAS_UNPACKMT2T_NEON #define HAS_DIVIDEROW_16_NEON #define HAS_HALFFLOATROW_NEON #define HAS_HALFMERGEUVROW_NEON @@ -2122,6 +2123,8 @@ void DetileToYUY2_Any_NEON(const uint8_t* src_y, ptrdiff_t src_uv_tile_stride, uint8_t* dst_yuy2, int width); +void UnpackMT2T_C(const uint16_t* src, uint16_t* dst, size_t size); +void UnpackMT2T_NEON(const uint16_t* src, uint16_t* dst, size_t size); void MergeUVRow_C(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, diff --git a/include/libyuv/version.h b/include/libyuv/version.h index ca1dc1373..6254d978f 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1850 +#define LIBYUV_VERSION 1852 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/compare.cc b/source/compare.cc index d4713b605..50a736bdd 100644 --- a/source/compare.cc +++ b/source/compare.cc @@ -45,7 +45,7 @@ uint32_t HashDjb2(const uint8_t* src, uint64_t count, uint32_t seed) { } #endif - while (count >= (uint64_t)(kBlockSize)) { + while (count >= (uint64_t)kBlockSize) { seed = HashDjb2_SSE(src, kBlockSize, seed); src += kBlockSize; count -= kBlockSize; @@ -359,10 +359,10 @@ static double Ssim8x8_C(const uint8_t* src_a, (sum_a_sq + sum_b_sq + c1) * (count * sum_sq_a - sum_a_sq + count * sum_sq_b - sum_b_sq + c2); - if (ssim_d == 0.0) { + if (ssim_d == 0) { return DBL_MAX; } - return ssim_n * 1.0 / ssim_d; + return (double)ssim_n / (double)ssim_d; } } diff --git a/source/compare_gcc.cc b/source/compare_gcc.cc index b834b42ac..33cbe25de 100644 --- a/source/compare_gcc.cc +++ b/source/compare_gcc.cc @@ -67,7 +67,7 
@@ uint32_t HammingDistance_SSE42(const uint8_t* src_a, : : "memory", "cc", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10"); - return static_cast<uint32_t>(diff); + return (uint32_t)(diff); } #else uint32_t HammingDistance_SSE42(const uint8_t* src_a, diff --git a/source/convert.cc b/source/convert.cc index ad0edd1f2..65d4ba16f 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -732,6 +732,67 @@ int MM21ToYUY2(const uint8_t* src_y, return 0; } +// Convert MT2T into P010. See tinyurl.com/mtk-10bit-video-format for format +// documentation. +// TODO(greenjustin): Add an MT2T to I420 conversion. +// TODO(greenjustin): Investigate if there are valid stride parameters other +// than width. +LIBYUV_API +int MT2TToP010(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + if (width <= 0 || height <= 0 || !src_y || !src_uv || !dst_y || !dst_uv) { + return -1; + } + + // TODO(greenjustin): Investigate if we can allow arbitrary sizes. This may + // not be semantically meaningful in this format, but we do not have samples + // of unaligned data to conclude that yet. This format is 16x32 tiled, so we + // must pad the width and height to reflect that. 
+ int aligned_width = (width + 15) & ~15; + int aligned_height = (height + 31) & ~31; + + { + size_t y_size = aligned_width * aligned_height * 10 / 8; + size_t uv_size = aligned_width * ((aligned_height + 1) / 2) * 10 / 8; + size_t tmp_y_size = aligned_width * aligned_height * sizeof(uint16_t); + size_t tmp_uv_size = + aligned_width * ((aligned_height + 1) / 2) * sizeof(uint16_t); + void (*UnpackMT2T)(const uint16_t* src, uint16_t* dst, size_t size) = + UnpackMT2T_C; + align_buffer_64(tmp_y, tmp_y_size); + align_buffer_64(tmp_uv, tmp_uv_size); + +#if defined(HAS_UNPACKMT2T_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + UnpackMT2T = UnpackMT2T_NEON; + } +#endif + + // TODO(greenjustin): Unpack and detile in rows rather than planes to keep + // the caches hot. + UnpackMT2T(src_y, (uint16_t*)tmp_y, y_size); + UnpackMT2T(src_uv, (uint16_t*)tmp_uv, uv_size); + + DetilePlane_16((uint16_t*)tmp_y, src_stride_y, dst_y, dst_stride_y, width, + height, 32); + DetilePlane_16((uint16_t*)tmp_uv, src_stride_uv, dst_uv, dst_stride_uv, + width, (height + 1) / 2, 16); + + free_aligned_buffer_64(tmp_y); + free_aligned_buffer_64(tmp_uv); + } + + return 0; +} + #ifdef I422TONV21_ROW_VERSION // Unittest fails for this version. 
// 422 chroma is 1/2 width, 1x height diff --git a/source/mjpeg_decoder.cc b/source/mjpeg_decoder.cc index 4ccf00a36..0141da8a1 100644 --- a/source/mjpeg_decoder.cc +++ b/source/mjpeg_decoder.cc @@ -109,7 +109,7 @@ LIBYUV_BOOL MJpegDecoder::LoadFrame(const uint8_t* src, size_t src_len) { } buf_.data = src; - buf_.len = static_cast<int>(src_len); + buf_.len = (int)src_len; buf_vec_.pos = 0; decompress_struct_->client_data = &buf_vec_; #ifdef HAVE_SETJMP @@ -428,7 +428,7 @@ boolean fill_input_buffer(j_decompress_ptr cinfo) { void skip_input_data(j_decompress_ptr cinfo, long num_bytes) { // NOLINT jpeg_source_mgr* src = cinfo->src; - size_t bytes = static_cast<size_t>(num_bytes); + size_t bytes = (size_t)num_bytes; if (bytes > src->bytes_in_buffer) { src->next_input_byte = nullptr; src->bytes_in_buffer = 0; diff --git a/source/planar_functions.cc b/source/planar_functions.cc index 96914e08d..f43525d57 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -3196,6 +3196,7 @@ int RAWToRGB24(const uint8_t* src_raw, return 0; } +// TODO(fbarchard): Consider uint8_t value LIBYUV_API void SetPlane(uint8_t* dst_y, int dst_stride_y, @@ -3256,7 +3257,7 @@ void SetPlane(uint8_t* dst_y, // Set plane for (y = 0; y < height; ++y) { - SetRow(dst_y, value, width); + SetRow(dst_y, (uint8_t)value, width); dst_y += dst_stride_y; } } diff --git a/source/rotate_neon64.cc b/source/rotate_neon64.cc index 43c158173..ea1cf82c2 100644 --- a/source/rotate_neon64.cc +++ b/source/rotate_neon64.cc @@ -201,13 +201,13 @@ void TransposeWx8_NEON(const uint8_t* src, "4: \n" - : "=&r"(src_temp), // %0 - "+r"(src), // %1 - "+r"(dst), // %2 - "+r"(width) // %3 - : "r"(&kVTbl4x4Transpose), // %4 - "r"(static_cast<ptrdiff_t>(src_stride)), // %5 - "r"(static_cast<ptrdiff_t>(dst_stride)) // %6 + : "=&r"(src_temp), // %0 + "+r"(src), // %1 + "+r"(dst), // %2 + "+r"(width) // %3 + : "r"(&kVTbl4x4Transpose), // %4 + "r"((ptrdiff_t)src_stride), // %5 + "r"((ptrdiff_t)dst_stride) // %6 : "memory", "cc", "v0", "v1", "v2", "v3", 
"v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"); } @@ -423,15 +423,15 @@ void TransposeUVWx8_NEON(const uint8_t* src, "4: \n" - : "=&r"(src_temp), // %0 - "+r"(src), // %1 - "+r"(dst_a), // %2 - "+r"(dst_b), // %3 - "+r"(width) // %4 - : "r"(static_cast<ptrdiff_t>(src_stride)), // %5 - "r"(static_cast<ptrdiff_t>(dst_stride_a)), // %6 - "r"(static_cast<ptrdiff_t>(dst_stride_b)), // %7 - "r"(&kVTbl4x4TransposeDi) // %8 + : "=&r"(src_temp), // %0 + "+r"(src), // %1 + "+r"(dst_a), // %2 + "+r"(dst_b), // %3 + "+r"(width) // %4 + : "r"((ptrdiff_t)src_stride), // %5 + "r"((ptrdiff_t)dst_stride_a), // %6 + "r"((ptrdiff_t)dst_stride_b), // %7 + "r"(&kVTbl4x4TransposeDi) // %8 : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v30", "v31"); } diff --git a/source/row_common.cc b/source/row_common.cc index 5ee5b17f0..7d084e76f 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -2801,6 +2801,33 @@ void DetileToYUY2_C(const uint8_t* src_y, } } +// Unpack MT2T into tiled P010 64 pixels at a time. MT2T's bitstream is encoded +// in 80 byte blocks representing 64 pixels each. The first 16 bytes of the +// block contain all of the lower 2 bits of each pixel packed together, and the +// next 64 bytes represent all the upper 8 bits of the pixel. 
+void UnpackMT2T_C(const uint16_t* src, uint16_t* dst, size_t size) { + for (size_t i = 0; i < size; i += 80) { + const uint8_t* src_lower_bits = (uint8_t*)src; + const uint8_t* src_upper_bits = ((uint8_t*)src) + 16; + + for (int j = 0; j < 16; j++) { + uint8_t lower_bits = src_lower_bits[j]; + *dst++ = (lower_bits & 0x03) << 6 | (uint16_t)src_upper_bits[j * 4] << 8 | + (uint16_t)src_upper_bits[j * 4] >> 2; + *dst++ = (lower_bits & 0x0C) << 4 | + (uint16_t)src_upper_bits[j * 4 + 1] << 8 | + (uint16_t)src_upper_bits[j * 4 + 1] >> 2; + *dst++ = (lower_bits & 0x30) << 2 | + (uint16_t)src_upper_bits[j * 4 + 2] << 8 | + (uint16_t)src_upper_bits[j * 4 + 2] >> 2; + *dst++ = (lower_bits & 0xC0) | (uint16_t)src_upper_bits[j * 4 + 3] << 8 | + (uint16_t)src_upper_bits[j * 4 + 3] >> 2; + } + + src += 40; + } +} + void SplitRGBRow_C(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g, diff --git a/source/row_neon.cc b/source/row_neon.cc index d2815d17b..0c6065f81 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -720,6 +720,60 @@ void DetileToYUY2_NEON(const uint8_t* src_y, } #endif +void UnpackMT2T_NEON(const uint16_t* src, uint16_t* dst, size_t size) { + const uint16_t* src_lower_bits = src; + const uint16_t* src_upper_bits = src + 8; + asm volatile( + "1: \n" + "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // Load 32 bytes of upper + // bits. + "vld1.8 {d6}, [%0]! \n" // Load 8 bytes of lower + // bits. + "vshl.u8 d4, d6, #2 \n" // Align lower bits. + "vshl.u8 d2, d6, #4 \n" + "vshl.u8 d0, d6, #6 \n" + "vzip.u8 d0, d1 \n" // Zip lower and upper + // bits together. + "vzip.u8 d2, d3 \n" + "vzip.u8 d4, d5 \n" + "vzip.u8 d6, d7 \n" + "vsri.u16 q0, q0, #10 \n" // Copy upper 6 bits into + // lower 6 bits for better + // accuracy in + // conversions. + "vsri.u16 q1, q1, #10 \n" + "vsri.u16 q2, q2, #10 \n" + "vsri.u16 q3, q3, #10 \n" + "vst4.16 {d0, d2, d4, d6}, [%2]! \n" // Store 32 pixels + "vst4.16 {d1, d3, d5, d7}, [%2]! \n" + "vld4.8 {d1, d3, d5, d7}, [%1]! 
\n" // Process last 32 pixels + // in the block + "vld1.8 {d6}, [%0]! \n" + "vshl.u8 d4, d6, #2 \n" + "vshl.u8 d2, d6, #4 \n" + "vshl.u8 d0, d6, #6 \n" + "vzip.u8 d0, d1 \n" + "vzip.u8 d2, d3 \n" + "vzip.u8 d4, d5 \n" + "vzip.u8 d6, d7 \n" + "vsri.u16 q0, q0, #10 \n" + "vsri.u16 q1, q1, #10 \n" + "vsri.u16 q2, q2, #10 \n" + "vsri.u16 q3, q3, #10 \n" + "vst4.16 {d0, d2, d4, d6}, [%2]! \n" + "vst4.16 {d1, d3, d5, d7}, [%2]! \n" + "mov %0, %1 \n" + "add %1, %0, #16 \n" + "subs %3, %3, #80 \n" + "bgt 1b \n" + : "+r"(src_lower_bits), // %0 + "+r"(src_upper_bits), // %1 + "+r"(dst), // %2 + "+r"(size) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3"); +} + // Reads 16 U's and V's and writes out 16 pairs of UV. void MergeUVRow_NEON(const uint8_t* src_u, const uint8_t* src_v, diff --git a/source/row_neon64.cc b/source/row_neon64.cc index 85d1c1b9a..e0a4ea195 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -749,6 +749,54 @@ void DetileToYUY2_NEON(const uint8_t* src_y, } #endif +// Unpack MT2T into tiled P010 64 pixels at a time. See +// tinyurl.com/mtk-10bit-video-format for format documentation. 
+void UnpackMT2T_NEON(const uint16_t* src, uint16_t* dst, size_t size) { + const uint16_t* src_lower_bits = src; + const uint16_t* src_upper_bits = src + 8; + asm volatile( + "1: \n" + "ld4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%1], #32 \n" + "ld1 {v7.8b}, [%0], #8 \n" + "shl v6.8b, v7.8b, #2 \n" + "shl v5.8b, v7.8b, #4 \n" + "shl v4.8b, v7.8b, #6 \n" + "zip1 v0.16b, v4.16b, v0.16b \n" + "zip1 v1.16b, v5.16b, v1.16b \n" + "zip1 v2.16b, v6.16b, v2.16b \n" + "zip1 v3.16b, v7.16b, v3.16b \n" + "sri v0.8h, v0.8h, #10 \n" + "sri v1.8h, v1.8h, #10 \n" + "sri v2.8h, v2.8h, #10 \n" + "sri v3.8h, v3.8h, #10 \n" + "st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%2], #64 \n" + "ld4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%1], #32 \n" + "ld1 {v7.8b}, [%0], #8 \n" + "shl v6.8b, v7.8b, #2 \n" + "shl v5.8b, v7.8b, #4 \n" + "shl v4.8b, v7.8b, #6 \n" + "zip1 v0.16b, v4.16b, v0.16b \n" + "zip1 v1.16b, v5.16b, v1.16b \n" + "zip1 v2.16b, v6.16b, v2.16b \n" + "zip1 v3.16b, v7.16b, v3.16b \n" + "sri v0.8h, v0.8h, #10 \n" + "sri v1.8h, v1.8h, #10 \n" + "sri v2.8h, v2.8h, #10 \n" + "sri v3.8h, v3.8h, #10 \n" + "st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%2], #64 \n" + "mov %0, %1 \n" + "add %1, %0, #16 \n" + "subs %3, %3, #80 \n" + "b.gt 1b \n" + : "+r"(src_lower_bits), // %0 + "+r"(src_upper_bits), // %1 + "+r"(dst), // %2 + "+r"(size) // %3 + : + : "cc", "memory", "w0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12"); +} + #if LIBYUV_USE_ST2 // Reads 16 U's and V's and writes out 16 pairs of UV. void MergeUVRow_NEON(const uint8_t* src_u, diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc index 47eff2ece..ad34cec7e 100644 --- a/unit_test/convert_test.cc +++ b/unit_test/convert_test.cc @@ -538,6 +538,26 @@ TESTBIPLANARTOBP(P016, uint16_t, 2, 2, 2, P416, uint16_t, 2, 1, 1, 12, 1, 1) TESTBIPLANARTOBP(P216, uint16_t, 2, 2, 1, P416, uint16_t, 2, 1, 1, 12, 1, 1) TESTBIPLANARTOBP(MM21, uint8_t, 1, 2, 2, NV12, uint8_t, 1, 2, 2, 8, 16, 32) +// TODO (greenjustin): Test all variants. 
+TESTBIPLANARTOBPI(MT2T, + uint16_t, + 2, + 2, + 2, + P010, + uint16_t, + 2, + 2, + 2, + benchmark_width_, + _Opt, + +, + 0, + 1, + 10, + 16, + 32) + #define TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \ DST_SUBSAMP_X, DST_SUBSAMP_Y, W1280, N, NEG, OFF, \