MT2T Warning fixes for fuchsia

Bug: b/258474032, b/257266635
Change-Id: Ic5cbbc60e2e1463361e359a2fe3e97976c1ea929
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/4081348
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Justin Green <greenjustin@google.com>
Authored by Frank Barchard on 2022-12-05 16:10:38 -08:00; committed by libyuv LUCI CQ
parent c19943b4d0
commit 610e0cdead
14 changed files with 252 additions and 25 deletions

@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1850
Version: 1852
License: BSD
License File: LICENSE

@ -162,6 +162,19 @@ int MM21ToYUY2(const uint8_t* src_y,
int width,
int height);
// Convert MT2T to P010
LIBYUV_API
int MT2TToP010(const uint16_t* src_y,
int src_stride_y,
const uint16_t* src_uv,
int src_stride_uv,
uint16_t* dst_y,
int dst_stride_y,
uint16_t* dst_uv,
int dst_stride_uv,
int width,
int height);
// Convert I422 to NV21.
LIBYUV_API
int I422ToNV21(const uint8_t* src_y,
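A minimal usage sketch for the MT2TToP010 declaration added above (illustrative only, not part of this change; the wrapper name and buffers are hypothetical, the libyuv namespace and header path are assumed from libyuv's usual layout, and passing width for every stride follows the TODO in the implementation further down):

    // Hypothetical caller: convert one MT2T frame to P010.
    // Assumes the declaration above lives in libyuv/convert.h and that the
    // source planes are already padded to the 16x32 tile size.
    #include "libyuv/convert.h"

    int ConvertMt2tFrame(const uint16_t* mt2t_y, const uint16_t* mt2t_uv,
                         uint16_t* p010_y, uint16_t* p010_uv,
                         int width, int height) {
      // Returns 0 on success, -1 on bad arguments.
      return libyuv::MT2TToP010(mt2t_y, width, mt2t_uv, width, p010_y, width,
                                p010_uv, width, width, height);
    }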

@ -455,6 +455,7 @@ extern "C" {
#define HAS_DETILEROW_NEON
#define HAS_DETILESPLITUVROW_NEON
#define HAS_DETILETOYUY2_NEON
#define HAS_UNPACKMT2T_NEON
#define HAS_DIVIDEROW_16_NEON
#define HAS_HALFFLOATROW_NEON
#define HAS_HALFMERGEUVROW_NEON
@ -2122,6 +2123,8 @@ void DetileToYUY2_Any_NEON(const uint8_t* src_y,
ptrdiff_t src_uv_tile_stride,
uint8_t* dst_yuy2,
int width);
void UnpackMT2T_C(const uint16_t* src, uint16_t* dst, size_t size);
void UnpackMT2T_NEON(const uint16_t* src, uint16_t* dst, size_t size);
void MergeUVRow_C(const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_uv,

@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1850
#define LIBYUV_VERSION 1852
#endif // INCLUDE_LIBYUV_VERSION_H_

@ -45,7 +45,7 @@ uint32_t HashDjb2(const uint8_t* src, uint64_t count, uint32_t seed) {
}
#endif
while (count >= (uint64_t)(kBlockSize)) {
while (count >= (uint64_t)kBlockSize) {
seed = HashDjb2_SSE(src, kBlockSize, seed);
src += kBlockSize;
count -= kBlockSize;
@ -359,10 +359,10 @@ static double Ssim8x8_C(const uint8_t* src_a,
(sum_a_sq + sum_b_sq + c1) *
(count * sum_sq_a - sum_a_sq + count * sum_sq_b - sum_b_sq + c2);
if (ssim_d == 0.0) {
if (ssim_d == 0) {
return DBL_MAX;
}
return ssim_n * 1.0 / ssim_d;
return (double)ssim_n / (double)ssim_d;
}
}

@ -67,7 +67,7 @@ uint32_t HammingDistance_SSE42(const uint8_t* src_a,
:
: "memory", "cc", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10");
return static_cast<uint32_t>(diff);
return (uint32_t)(diff);
}
#else
uint32_t HammingDistance_SSE42(const uint8_t* src_a,

@ -732,6 +732,67 @@ int MM21ToYUY2(const uint8_t* src_y,
return 0;
}
// Convert MT2T into P010. See tinyurl.com/mtk-10bit-video-format for format
// documentation.
// TODO(greenjustin): Add an MT2T to I420 conversion.
// TODO(greenjustin): Investigate if there are valid stride parameters other
// than width.
LIBYUV_API
int MT2TToP010(const uint16_t* src_y,
int src_stride_y,
const uint16_t* src_uv,
int src_stride_uv,
uint16_t* dst_y,
int dst_stride_y,
uint16_t* dst_uv,
int dst_stride_uv,
int width,
int height) {
if (width <= 0 || height <= 0 || !src_y || !src_uv || !dst_y || !dst_uv) {
return -1;
}
// TODO(greenjustin): Investigate if we can allow arbitrary sizes. This may
// not be semantically meaningful in this format, but we do not have samples
// of unaligned data to conclude that yet. This format is 16x32 tiled, so we
// must pad the width and height to reflect that.
int aligned_width = (width + 15) & ~15;
int aligned_height = (height + 31) & ~31;
{
size_t y_size = aligned_width * aligned_height * 10 / 8;
size_t uv_size = aligned_width * ((aligned_height + 1) / 2) * 10 / 8;
size_t tmp_y_size = aligned_width * aligned_height * sizeof(uint16_t);
size_t tmp_uv_size =
aligned_width * ((aligned_height + 1) / 2) * sizeof(uint16_t);
void (*UnpackMT2T)(const uint16_t* src, uint16_t* dst, size_t size) =
UnpackMT2T_C;
align_buffer_64(tmp_y, tmp_y_size);
align_buffer_64(tmp_uv, tmp_uv_size);
#if defined(HAS_UNPACKMT2T_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
UnpackMT2T = UnpackMT2T_NEON;
}
#endif
// TODO(greenjustin): Unpack and detile in rows rather than planes to keep
// the caches hot.
UnpackMT2T(src_y, (uint16_t*)tmp_y, y_size);
UnpackMT2T(src_uv, (uint16_t*)tmp_uv, uv_size);
DetilePlane_16((uint16_t*)tmp_y, src_stride_y, dst_y, dst_stride_y, width,
height, 32);
DetilePlane_16((uint16_t*)tmp_uv, src_stride_uv, dst_uv, dst_stride_uv,
width, (height + 1) / 2, 16);
free_aligned_buffer_64(tmp_y);
free_aligned_buffer_64(tmp_uv);
}
return 0;
}
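To make the 10/8 packing factor and the 16x32 alignment above concrete, here is a small sanity check of the intermediate buffer sizes for a hypothetical 1280x720 frame; the expressions simply repeat the ones inside MT2TToP010:

    #include <cassert>
    #include <cstddef>
    #include <cstdint>

    // Illustrative only: the size arithmetic from MT2TToP010 for 1280x720.
    void CheckMt2tBufferSizes() {
      const int width = 1280;                          // already a multiple of 16
      const int height = 720;                          // padded up to 736
      const int aligned_width = (width + 15) & ~15;    // 1280
      const int aligned_height = (height + 31) & ~31;  // 736
      // Packed MT2T input: 10 bits per sample.
      const size_t y_size = aligned_width * aligned_height * 10 / 8;
      const size_t uv_size = aligned_width * ((aligned_height + 1) / 2) * 10 / 8;
      // Unpacked intermediate plane: 16 bits per sample.
      const size_t tmp_y_size = aligned_width * aligned_height * sizeof(uint16_t);
      assert(y_size == 1177600);      // 1280 * 736 * 10 / 8
      assert(uv_size == 588800);      // 1280 * 368 * 10 / 8
      assert(tmp_y_size == 1884160);  // 1280 * 736 * 2
    }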
#ifdef I422TONV21_ROW_VERSION
// Unittest fails for this version.
// 422 chroma is 1/2 width, 1x height

@ -109,7 +109,7 @@ LIBYUV_BOOL MJpegDecoder::LoadFrame(const uint8_t* src, size_t src_len) {
}
buf_.data = src;
buf_.len = static_cast<int>(src_len);
buf_.len = (int)src_len;
buf_vec_.pos = 0;
decompress_struct_->client_data = &buf_vec_;
#ifdef HAVE_SETJMP
@ -428,7 +428,7 @@ boolean fill_input_buffer(j_decompress_ptr cinfo) {
void skip_input_data(j_decompress_ptr cinfo, long num_bytes) { // NOLINT
jpeg_source_mgr* src = cinfo->src;
size_t bytes = static_cast<size_t>(num_bytes);
size_t bytes = (size_t)num_bytes;
if (bytes > src->bytes_in_buffer) {
src->next_input_byte = nullptr;
src->bytes_in_buffer = 0;

@ -3196,6 +3196,7 @@ int RAWToRGB24(const uint8_t* src_raw,
return 0;
}
// TODO(fbarchard): Consider uint8_t value
LIBYUV_API
void SetPlane(uint8_t* dst_y,
int dst_stride_y,
@ -3256,7 +3257,7 @@ void SetPlane(uint8_t* dst_y,
// Set plane
for (y = 0; y < height; ++y) {
SetRow(dst_y, value, width);
SetRow(dst_y, (uint8_t)value, width);
dst_y += dst_stride_y;
}
}

@ -201,13 +201,13 @@ void TransposeWx8_NEON(const uint8_t* src,
"4: \n"
: "=&r"(src_temp), // %0
"+r"(src), // %1
"+r"(dst), // %2
"+r"(width) // %3
: "r"(&kVTbl4x4Transpose), // %4
"r"(static_cast<ptrdiff_t>(src_stride)), // %5
"r"(static_cast<ptrdiff_t>(dst_stride)) // %6
: "=&r"(src_temp), // %0
"+r"(src), // %1
"+r"(dst), // %2
"+r"(width) // %3
: "r"(&kVTbl4x4Transpose), // %4
"r"((ptrdiff_t)src_stride), // %5
"r"((ptrdiff_t)dst_stride) // %6
: "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23");
}
@ -423,15 +423,15 @@ void TransposeUVWx8_NEON(const uint8_t* src,
"4: \n"
: "=&r"(src_temp), // %0
"+r"(src), // %1
"+r"(dst_a), // %2
"+r"(dst_b), // %3
"+r"(width) // %4
: "r"(static_cast<ptrdiff_t>(src_stride)), // %5
"r"(static_cast<ptrdiff_t>(dst_stride_a)), // %6
"r"(static_cast<ptrdiff_t>(dst_stride_b)), // %7
"r"(&kVTbl4x4TransposeDi) // %8
: "=&r"(src_temp), // %0
"+r"(src), // %1
"+r"(dst_a), // %2
"+r"(dst_b), // %3
"+r"(width) // %4
: "r"((ptrdiff_t)src_stride), // %5
"r"((ptrdiff_t)dst_stride_a), // %6
"r"((ptrdiff_t)dst_stride_b), // %7
"r"(&kVTbl4x4TransposeDi) // %8
: "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v30", "v31");
}

@ -2801,6 +2801,33 @@ void DetileToYUY2_C(const uint8_t* src_y,
}
}
// Unpack MT2T into tiled P010 64 pixels at a time. MT2T's bitstream is encoded
// in 80 byte blocks representing 64 pixels each. The first 16 bytes of the
// block contain all of the lower 2 bits of each pixel packed together, and the
// next 64 bytes represent all the upper 8 bits of the pixel.
void UnpackMT2T_C(const uint16_t* src, uint16_t* dst, size_t size) {
for (size_t i = 0; i < size; i += 80) {
const uint8_t* src_lower_bits = (uint8_t*)src;
const uint8_t* src_upper_bits = ((uint8_t*)src) + 16;
for (int j = 0; j < 16; j++) {
uint8_t lower_bits = src_lower_bits[j];
*dst++ = (lower_bits & 0x03) << 6 | (uint16_t)src_upper_bits[j * 4] << 8 |
(uint16_t)src_upper_bits[j * 4] >> 2;
*dst++ = (lower_bits & 0x0C) << 4 |
(uint16_t)src_upper_bits[j * 4 + 1] << 8 |
(uint16_t)src_upper_bits[j * 4 + 1] >> 2;
*dst++ = (lower_bits & 0x30) << 2 |
(uint16_t)src_upper_bits[j * 4 + 2] << 8 |
(uint16_t)src_upper_bits[j * 4 + 2] >> 2;
*dst++ = (lower_bits & 0xC0) | (uint16_t)src_upper_bits[j * 4 + 3] << 8 |
(uint16_t)src_upper_bits[j * 4 + 3] >> 2;
}
src += 40;
}
}
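A worked example of the per-pixel bit manipulation above, assuming a lower-bits byte of 0x03 and an upper byte of 0xAB (values chosen only for illustration): the 10-bit sample 0x2AF lands in the top 10 bits of the output, and its top 6 bits are replicated into the low 6 bits, which is the same rounding trick the NEON versions implement with vsri/sri:

    #include <cassert>
    #include <cstdint>

    // Illustrative only: the first-pixel expression from UnpackMT2T_C.
    void CheckMt2tBitMath() {
      const uint8_t lower_bits = 0x03;  // packed 2-bit LSBs of four pixels
      const uint8_t upper = 0xAB;       // 8 MSBs of the first pixel
      const uint16_t out = (uint16_t)((lower_bits & 0x03) << 6 |
                                      (uint16_t)upper << 8 | upper >> 2);
      // (0x2AF << 6) | (0x2AF >> 4) == 0xABC0 | 0x2A == 0xABEA
      assert(out == 0xABEA);
    }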
void SplitRGBRow_C(const uint8_t* src_rgb,
uint8_t* dst_r,
uint8_t* dst_g,

@ -720,6 +720,60 @@ void DetileToYUY2_NEON(const uint8_t* src_y,
}
#endif
void UnpackMT2T_NEON(const uint16_t* src, uint16_t* dst, size_t size) {
const uint16_t* src_lower_bits = src;
const uint16_t* src_upper_bits = src + 8;
asm volatile(
"1: \n"
"vld4.8 {d1, d3, d5, d7}, [%1]! \n" // Load 32 bytes of upper
// bits.
"vld1.8 {d6}, [%0]! \n" // Load 8 bytes of lower
// bits.
"vshl.u8 d4, d6, #2 \n" // Align lower bits.
"vshl.u8 d2, d6, #4 \n"
"vshl.u8 d0, d6, #6 \n"
"vzip.u8 d0, d1 \n" // Zip lower and upper
// bits together.
"vzip.u8 d2, d3 \n"
"vzip.u8 d4, d5 \n"
"vzip.u8 d6, d7 \n"
"vsri.u16 q0, q0, #10 \n" // Copy upper 6 bits into
// lower 6 bits for better
// accuracy in
// conversions.
"vsri.u16 q1, q1, #10 \n"
"vsri.u16 q2, q2, #10 \n"
"vsri.u16 q3, q3, #10 \n"
"vst4.16 {d0, d2, d4, d6}, [%2]! \n" // Store 32 pixels
"vst4.16 {d1, d3, d5, d7}, [%2]! \n"
"vld4.8 {d1, d3, d5, d7}, [%1]! \n" // Process last 32 pixels
// in the block
"vld1.8 {d6}, [%0]! \n"
"vshl.u8 d4, d6, #2 \n"
"vshl.u8 d2, d6, #4 \n"
"vshl.u8 d0, d6, #6 \n"
"vzip.u8 d0, d1 \n"
"vzip.u8 d2, d3 \n"
"vzip.u8 d4, d5 \n"
"vzip.u8 d6, d7 \n"
"vsri.u16 q0, q0, #10 \n"
"vsri.u16 q1, q1, #10 \n"
"vsri.u16 q2, q2, #10 \n"
"vsri.u16 q3, q3, #10 \n"
"vst4.16 {d0, d2, d4, d6}, [%2]! \n"
"vst4.16 {d1, d3, d5, d7}, [%2]! \n"
"mov %0, %1 \n"
"add %1, %0, #16 \n"
"subs %3, %3, #80 \n"
"bgt 1b \n"
: "+r"(src_lower_bits), // %0
"+r"(src_upper_bits), // %1
"+r"(dst), // %2
"+r"(size) // %3
:
: "cc", "memory", "q0", "q1", "q2", "q3");
}
// Reads 16 U's and V's and writes out 16 pairs of UV.
void MergeUVRow_NEON(const uint8_t* src_u,
const uint8_t* src_v,

@ -749,6 +749,54 @@ void DetileToYUY2_NEON(const uint8_t* src_y,
}
#endif
// Unpack MT2T into tiled P010 64 pixels at a time. See
// tinyurl.com/mtk-10bit-video-format for format documentation.
void UnpackMT2T_NEON(const uint16_t* src, uint16_t* dst, size_t size) {
const uint16_t* src_lower_bits = src;
const uint16_t* src_upper_bits = src + 8;
asm volatile(
"1: \n"
"ld4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%1], #32 \n"
"ld1 {v7.8b}, [%0], #8 \n"
"shl v6.8b, v7.8b, #2 \n"
"shl v5.8b, v7.8b, #4 \n"
"shl v4.8b, v7.8b, #6 \n"
"zip1 v0.16b, v4.16b, v0.16b \n"
"zip1 v1.16b, v5.16b, v1.16b \n"
"zip1 v2.16b, v6.16b, v2.16b \n"
"zip1 v3.16b, v7.16b, v3.16b \n"
"sri v0.8h, v0.8h, #10 \n"
"sri v1.8h, v1.8h, #10 \n"
"sri v2.8h, v2.8h, #10 \n"
"sri v3.8h, v3.8h, #10 \n"
"st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%2], #64 \n"
"ld4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%1], #32 \n"
"ld1 {v7.8b}, [%0], #8 \n"
"shl v6.8b, v7.8b, #2 \n"
"shl v5.8b, v7.8b, #4 \n"
"shl v4.8b, v7.8b, #6 \n"
"zip1 v0.16b, v4.16b, v0.16b \n"
"zip1 v1.16b, v5.16b, v1.16b \n"
"zip1 v2.16b, v6.16b, v2.16b \n"
"zip1 v3.16b, v7.16b, v3.16b \n"
"sri v0.8h, v0.8h, #10 \n"
"sri v1.8h, v1.8h, #10 \n"
"sri v2.8h, v2.8h, #10 \n"
"sri v3.8h, v3.8h, #10 \n"
"st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%2], #64 \n"
"mov %0, %1 \n"
"add %1, %0, #16 \n"
"subs %3, %3, #80 \n"
"b.gt 1b \n"
: "+r"(src_lower_bits), // %0
"+r"(src_upper_bits), // %1
"+r"(dst), // %2
"+r"(size) // %3
:
: "cc", "memory", "w0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v8", "v9", "v10", "v11", "v12");
}
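Since the C and NEON paths are meant to produce identical output, one way to exercise the new row functions in isolation is a per-block cross-check such as the sketch below (the header names, the libyuv namespace wrapping, and the caller-supplied input block are assumptions, not something this change adds):

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    #include "libyuv/cpu_id.h"  // TestCpuFlag, kCpuHasNEON (assumed header)
    #include "libyuv/row.h"     // UnpackMT2T_C / UnpackMT2T_NEON

    // Illustrative only: compare one 80-byte MT2T block (64 pixels) between
    // the C and NEON unpackers.
    void CrossCheckUnpackMT2T(const uint16_t* block /* 80 bytes of MT2T */) {
      uint16_t expected[64];
      uint16_t actual[64];
      libyuv::UnpackMT2T_C(block, expected, 80);
    #if defined(HAS_UNPACKMT2T_NEON)
      if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON)) {
        libyuv::UnpackMT2T_NEON(block, actual, 80);
        assert(std::memcmp(expected, actual, sizeof(expected)) == 0);
      }
    #endif
      (void)actual;
    }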
#if LIBYUV_USE_ST2
// Reads 16 U's and V's and writes out 16 pairs of UV.
void MergeUVRow_NEON(const uint8_t* src_u,

@ -538,6 +538,26 @@ TESTBIPLANARTOBP(P016, uint16_t, 2, 2, 2, P416, uint16_t, 2, 1, 1, 12, 1, 1)
TESTBIPLANARTOBP(P216, uint16_t, 2, 2, 1, P416, uint16_t, 2, 1, 1, 12, 1, 1)
TESTBIPLANARTOBP(MM21, uint8_t, 1, 2, 2, NV12, uint8_t, 1, 2, 2, 8, 16, 32)
// TODO (greenjustin): Test all variants.
TESTBIPLANARTOBPI(MT2T,
uint16_t,
2,
2,
2,
P010,
uint16_t,
2,
2,
2,
benchmark_width_,
_Opt,
+,
0,
1,
10,
16,
32)
#define TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \
DST_SUBSAMP_X, DST_SUBSAMP_Y, W1280, N, NEG, OFF, \