From 610e0cdead3fcd3288693d18eab8c7323805ad9e Mon Sep 17 00:00:00 2001
From: Frank Barchard <fbarchard@google.com>
Date: Mon, 5 Dec 2022 16:10:38 -0800
Subject: [PATCH] MT2T Warning fixes for fuchsia

Bug: b/258474032, b/257266635
Change-Id: Ic5cbbc60e2e1463361e359a2fe3e97976c1ea929
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/4081348
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Justin Green <greenjustin@google.com>
---
 README.chromium            |  2 +-
 include/libyuv/convert.h   | 13 ++++++++
 include/libyuv/row.h       |  3 ++
 include/libyuv/version.h   |  2 +-
 source/compare.cc          |  6 ++--
 source/compare_gcc.cc      |  2 +-
 source/convert.cc          | 61 ++++++++++++++++++++++++++++++++++++++
 source/mjpeg_decoder.cc    |  4 +--
 source/planar_functions.cc |  3 +-
 source/rotate_neon64.cc    | 32 ++++++++++----------
 source/row_common.cc       | 27 +++++++++++++++++
 source/row_neon.cc         | 54 +++++++++++++++++++++++++++++++++
 source/row_neon64.cc       | 48 ++++++++++++++++++++++++++++++
 unit_test/convert_test.cc  | 20 +++++++++++++
 14 files changed, 252 insertions(+), 25 deletions(-)

diff --git a/README.chromium b/README.chromium
index 357334de0..ccd2ca06a 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1850
+Version: 1852
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/convert.h b/include/libyuv/convert.h
index e1eb36b62..5c5231bb3 100644
--- a/include/libyuv/convert.h
+++ b/include/libyuv/convert.h
@@ -162,6 +162,19 @@ int MM21ToYUY2(const uint8_t* src_y,
                int width,
                int height);
 
+// Convert MT2T (16x32 tiled, 10 bit) to P010. Returns 0 on success.
+LIBYUV_API
+int MT2TToP010(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_uv,
+               int src_stride_uv,
+               uint16_t* dst_y,
+               int dst_stride_y,
+               uint16_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height);
+
 // Convert I422 to NV21.
 LIBYUV_API
 int I422ToNV21(const uint8_t* src_y,
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 861c6d3e9..27f468add 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -455,6 +455,7 @@ extern "C" {
 #define HAS_DETILEROW_NEON
 #define HAS_DETILESPLITUVROW_NEON
 #define HAS_DETILETOYUY2_NEON
+#define HAS_UNPACKMT2T_NEON
 #define HAS_DIVIDEROW_16_NEON
 #define HAS_HALFFLOATROW_NEON
 #define HAS_HALFMERGEUVROW_NEON
@@ -2122,6 +2123,8 @@ void DetileToYUY2_Any_NEON(const uint8_t* src_y,
                            ptrdiff_t src_uv_tile_stride,
                            uint8_t* dst_yuy2,
                            int width);
+void UnpackMT2T_C(const uint16_t* src, uint16_t* dst, size_t size);
+void UnpackMT2T_NEON(const uint16_t* src, uint16_t* dst, size_t size);
 void MergeUVRow_C(const uint8_t* src_u,
                   const uint8_t* src_v,
                   uint8_t* dst_uv,
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index ca1dc1373..6254d978f 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 1850
+#define LIBYUV_VERSION 1852
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_
diff --git a/source/compare.cc b/source/compare.cc
index d4713b605..50a736bdd 100644
--- a/source/compare.cc
+++ b/source/compare.cc
@@ -45,7 +45,7 @@ uint32_t HashDjb2(const uint8_t* src, uint64_t count, uint32_t seed) {
   }
 #endif
 
-  while (count >= (uint64_t)(kBlockSize)) {
+  while (count >= (uint64_t)kBlockSize) {
     seed = HashDjb2_SSE(src, kBlockSize, seed);
     src += kBlockSize;
     count -= kBlockSize;
@@ -359,10 +359,10 @@ static double Ssim8x8_C(const uint8_t* src_a,
         (sum_a_sq + sum_b_sq + c1) *
         (count * sum_sq_a - sum_a_sq + count * sum_sq_b - sum_b_sq + c2);
 
-    if (ssim_d == 0.0) {
+    if (ssim_d == 0) {
       return DBL_MAX;
     }
-    return ssim_n * 1.0 / ssim_d;
+    return (double)ssim_n / (double)ssim_d;
   }
 }
 
diff --git a/source/compare_gcc.cc b/source/compare_gcc.cc
index b834b42ac..33cbe25de 100644
--- a/source/compare_gcc.cc
+++ b/source/compare_gcc.cc
@@ -67,7 +67,7 @@ uint32_t HammingDistance_SSE42(const uint8_t* src_a,
       :
       : "memory", "cc", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10");
 
-  return static_cast<uint32_t>(diff);
+  return (uint32_t)(diff);
 }
 #else
 uint32_t HammingDistance_SSE42(const uint8_t* src_a,
diff --git a/source/convert.cc b/source/convert.cc
index ad0edd1f2..65d4ba16f 100644
--- a/source/convert.cc
+++ b/source/convert.cc
@@ -732,6 +732,67 @@ int MM21ToYUY2(const uint8_t* src_y,
   return 0;
 }
 
+// Convert MT2T into P010. See tinyurl.com/mtk-10bit-video-format for format
+// documentation. Returns 0 on success, -1 on bad arguments, 1 if out of memory.
+// TODO(greenjustin): Add an MT2T to I420 conversion.
+// TODO(greenjustin): Investigate if there are valid stride parameters other
+// than width.
+LIBYUV_API
+int MT2TToP010(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_uv,
+               int src_stride_uv,
+               uint16_t* dst_y,
+               int dst_stride_y,
+               uint16_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height) {
+  if (width <= 0 || height <= 0 || !src_y || !src_uv || !dst_y || !dst_uv) {
+    return -1;
+  }
+
+  // TODO(greenjustin): Investigate if we can allow arbitrary sizes. This may
+  // not be semantically meaningful in this format, but we do not have samples
+  // of unaligned data to conclude that yet. This format is 16x32 tiled, so we
+  // must pad the width and height to reflect that.
+  int aligned_width = (width + 15) & ~15;
+  int aligned_height = (height + 31) & ~31;
+
+  {
+    // Widen to size_t before multiplying so large frames cannot overflow int.
+    size_t y_pixels = (size_t)aligned_width * (size_t)aligned_height;
+    size_t uv_pixels = y_pixels / 2;  // aligned_height is even: half the rows.
+    void (*UnpackMT2T)(const uint16_t* src, uint16_t* dst, size_t size) =
+        UnpackMT2T_C;
+    align_buffer_64(tmp_y, y_pixels * sizeof(uint16_t));
+    align_buffer_64(tmp_uv, uv_pixels * sizeof(uint16_t));
+    if (!tmp_y || !tmp_uv) {  // Presumably NULL on allocation failure.
+      free_aligned_buffer_64(tmp_y);
+      free_aligned_buffer_64(tmp_uv);
+      return 1;
+    }
+
+#if defined(HAS_UNPACKMT2T_NEON)
+    if (TestCpuFlag(kCpuHasNEON)) {
+      UnpackMT2T = UnpackMT2T_NEON;
+    }
+#endif
+
+    // TODO(greenjustin): Unpack and detile in rows rather than planes to keep
+    // the caches hot. Packed input is 10 bits (10 / 8 bytes) per pixel.
+    UnpackMT2T(src_y, (uint16_t*)tmp_y, y_pixels * 10 / 8);
+    UnpackMT2T(src_uv, (uint16_t*)tmp_uv, uv_pixels * 10 / 8);
+    DetilePlane_16((uint16_t*)tmp_y, src_stride_y, dst_y, dst_stride_y, width,
+                   height, 32);
+    DetilePlane_16((uint16_t*)tmp_uv, src_stride_uv, dst_uv, dst_stride_uv,
+                   width, (height + 1) / 2, 16);
+    free_aligned_buffer_64(tmp_y);
+    free_aligned_buffer_64(tmp_uv);
+  }
+  return 0;
+}
+
 #ifdef I422TONV21_ROW_VERSION
 // Unittest fails for this version.
 // 422 chroma is 1/2 width, 1x height
diff --git a/source/mjpeg_decoder.cc b/source/mjpeg_decoder.cc
index 4ccf00a36..0141da8a1 100644
--- a/source/mjpeg_decoder.cc
+++ b/source/mjpeg_decoder.cc
@@ -109,7 +109,7 @@ LIBYUV_BOOL MJpegDecoder::LoadFrame(const uint8_t* src, size_t src_len) {
   }
 
   buf_.data = src;
-  buf_.len = static_cast<int>(src_len);
+  buf_.len = (int)src_len;
   buf_vec_.pos = 0;
   decompress_struct_->client_data = &buf_vec_;
 #ifdef HAVE_SETJMP
@@ -428,7 +428,7 @@ boolean fill_input_buffer(j_decompress_ptr cinfo) {
 
 void skip_input_data(j_decompress_ptr cinfo, long num_bytes) {  // NOLINT
   jpeg_source_mgr* src = cinfo->src;
-  size_t bytes = static_cast<size_t>(num_bytes);
+  size_t bytes = (size_t)num_bytes;
   if (bytes > src->bytes_in_buffer) {
     src->next_input_byte = nullptr;
     src->bytes_in_buffer = 0;
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index 96914e08d..f43525d57 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -3196,6 +3196,7 @@ int RAWToRGB24(const uint8_t* src_raw,
   return 0;
 }
 
+// TODO(fbarchard): Consider uint8_t value
 LIBYUV_API
 void SetPlane(uint8_t* dst_y,
               int dst_stride_y,
@@ -3256,7 +3257,7 @@ void SetPlane(uint8_t* dst_y,
 
   // Set plane
   for (y = 0; y < height; ++y) {
-    SetRow(dst_y, value, width);
+    SetRow(dst_y, (uint8_t)value, width);
     dst_y += dst_stride_y;
   }
 }
diff --git a/source/rotate_neon64.cc b/source/rotate_neon64.cc
index 43c158173..ea1cf82c2 100644
--- a/source/rotate_neon64.cc
+++ b/source/rotate_neon64.cc
@@ -201,13 +201,13 @@ void TransposeWx8_NEON(const uint8_t* src,
 
       "4:                                          \n"
 
-      : "=&r"(src_temp),                          // %0
-        "+r"(src),                                // %1
-        "+r"(dst),                                // %2
-        "+r"(width)                               // %3
-      : "r"(&kVTbl4x4Transpose),                  // %4
-        "r"(static_cast<ptrdiff_t>(src_stride)),  // %5
-        "r"(static_cast<ptrdiff_t>(dst_stride))   // %6
+      : "=&r"(src_temp),             // %0
+        "+r"(src),                   // %1
+        "+r"(dst),                   // %2
+        "+r"(width)                  // %3
+      : "r"(&kVTbl4x4Transpose),     // %4
+        "r"((ptrdiff_t)src_stride),  // %5
+        "r"((ptrdiff_t)dst_stride)   // %6
       : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
         "v17", "v18", "v19", "v20", "v21", "v22", "v23");
 }
@@ -423,15 +423,15 @@ void TransposeUVWx8_NEON(const uint8_t* src,
 
       "4:                                        \n"
 
-      : "=&r"(src_temp),                            // %0
-        "+r"(src),                                  // %1
-        "+r"(dst_a),                                // %2
-        "+r"(dst_b),                                // %3
-        "+r"(width)                                 // %4
-      : "r"(static_cast<ptrdiff_t>(src_stride)),    // %5
-        "r"(static_cast<ptrdiff_t>(dst_stride_a)),  // %6
-        "r"(static_cast<ptrdiff_t>(dst_stride_b)),  // %7
-        "r"(&kVTbl4x4TransposeDi)                   // %8
+      : "=&r"(src_temp),               // %0
+        "+r"(src),                     // %1
+        "+r"(dst_a),                   // %2
+        "+r"(dst_b),                   // %3
+        "+r"(width)                    // %4
+      : "r"((ptrdiff_t)src_stride),    // %5
+        "r"((ptrdiff_t)dst_stride_a),  // %6
+        "r"((ptrdiff_t)dst_stride_b),  // %7
+        "r"(&kVTbl4x4TransposeDi)      // %8
       : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
         "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v30", "v31");
 }
diff --git a/source/row_common.cc b/source/row_common.cc
index 5ee5b17f0..7d084e76f 100644
--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -2801,6 +2801,33 @@ void DetileToYUY2_C(const uint8_t* src_y,
   }
 }
 
+// Unpack MT2T into tiled P010 64 pixels at a time. MT2T's bitstream is encoded
+// in 80 byte blocks representing 64 pixels each. The first 16 bytes of the
+// block contain all of the lower 2 bits of each pixel packed together, and the
+// next 64 bytes represent all the upper 8 bits of the pixel.
+// |size| is the packed input length in bytes; each block emits 64 uint16_t.
+void UnpackMT2T_C(const uint16_t* src, uint16_t* dst, size_t size) {
+  for (size_t i = 0; i < size; i += 80) {
+    const uint8_t* src_lower_bits = (const uint8_t*)src;
+    const uint8_t* src_upper_bits = src_lower_bits + 16;
+
+    for (int j = 0; j < 16; j++) {
+      // Each lower-bits byte carries the 2 low bits of 4 consecutive pixels,
+      // first pixel in bits 1:0.
+      const int lower_bits = src_lower_bits[j];
+      for (int k = 0; k < 4; k++) {
+        const int upper = src_upper_bits[j * 4 + k];
+        // 10-bit value in the top of the word; the top 6 bits are replicated
+        // into the otherwise empty low 6 bits.
+        *dst++ = (uint16_t)((((lower_bits >> (k * 2)) & 0x03) << 6) |
+                            (upper << 8) | (upper >> 2));
+      }
+    }
+
+    src += 40;  // 40 uint16_t == one 80 byte block.
+  }
+}
+
 void SplitRGBRow_C(const uint8_t* src_rgb,
                    uint8_t* dst_r,
                    uint8_t* dst_g,
diff --git a/source/row_neon.cc b/source/row_neon.cc
index d2815d17b..0c6065f81 100644
--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@@ -720,6 +720,60 @@ void DetileToYUY2_NEON(const uint8_t* src_y,
 }
 #endif
 
+void UnpackMT2T_NEON(const uint16_t* src, uint16_t* dst, size_t size) {
+  const uint16_t* src_lower_bits = src;
+  const uint16_t* src_upper_bits = src + 8;
+  asm volatile(
+      "1:                                        \n"
+      "vld4.8      {d1, d3, d5, d7}, [%1]!       \n"  // Load 32 bytes of upper
+                                                      // bits.
+      "vld1.8      {d6}, [%0]!                   \n"  // Load 8 bytes of lower
+                                                      // bits.
+      "vshl.u8     d4, d6, #2                    \n"  // Align lower bits.
+      "vshl.u8     d2, d6, #4                    \n"
+      "vshl.u8     d0, d6, #6                    \n"
+      "vzip.u8     d0, d1                        \n"  // Zip lower and upper
+                                                      // bits together.
+      "vzip.u8     d2, d3                        \n"
+      "vzip.u8     d4, d5                        \n"
+      "vzip.u8     d6, d7                        \n"
+      "vsri.u16    q0, q0, #10                   \n"  // Copy upper 6 bits into
+                                                      // lower 6 bits for better
+                                                      // accuracy in
+                                                      // conversions.
+      "vsri.u16    q1, q1, #10                   \n"
+      "vsri.u16    q2, q2, #10                   \n"
+      "vsri.u16    q3, q3, #10                   \n"
+      "vst4.16     {d0, d2, d4, d6}, [%2]!       \n"  // Store 32 pixels
+      "vst4.16     {d1, d3, d5, d7}, [%2]!       \n"
+      "vld4.8      {d1, d3, d5, d7}, [%1]!       \n"  // Process last 32 pixels
+                                                      // in the block
+      "vld1.8      {d6}, [%0]!                   \n"
+      "vshl.u8     d4, d6, #2                    \n"
+      "vshl.u8     d2, d6, #4                    \n"
+      "vshl.u8     d0, d6, #6                    \n"
+      "vzip.u8     d0, d1                        \n"
+      "vzip.u8     d2, d3                        \n"
+      "vzip.u8     d4, d5                        \n"
+      "vzip.u8     d6, d7                        \n"
+      "vsri.u16    q0, q0, #10                   \n"
+      "vsri.u16    q1, q1, #10                   \n"
+      "vsri.u16    q2, q2, #10                   \n"
+      "vsri.u16    q3, q3, #10                   \n"
+      "vst4.16     {d0, d2, d4, d6}, [%2]!       \n"
+      "vst4.16     {d1, d3, d5, d7}, [%2]!       \n"
+      "mov         %0, %1                        \n"  // Next 80 byte block.
+      "add         %1, %0, #16                   \n"  // Skip 16 lower bytes.
+      "subs        %3, %3, #80                   \n"  // 80 bytes per block.
+      "bgt         1b                            \n"
+      : "+r"(src_lower_bits),  // %0
+        "+r"(src_upper_bits),  // %1
+        "+r"(dst),             // %2
+        "+r"(size)             // %3
+      :
+      : "cc", "memory", "q0", "q1", "q2", "q3");
+}
+
 // Reads 16 U's and V's and writes out 16 pairs of UV.
 void MergeUVRow_NEON(const uint8_t* src_u,
                      const uint8_t* src_v,
diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index 85d1c1b9a..e0a4ea195 100644
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -749,6 +749,54 @@ void DetileToYUY2_NEON(const uint8_t* src_y,
 }
 #endif
 
+// Unpack MT2T into tiled P010 64 pixels at a time. See
+// tinyurl.com/mtk-10bit-video-format for format documentation.
+void UnpackMT2T_NEON(const uint16_t* src, uint16_t* dst, size_t size) {
+  const uint16_t* src_lower_bits = src;
+  const uint16_t* src_upper_bits = src + 8;
+  asm volatile(
+      "1:                                        \n"
+      "ld4         {v0.8b, v1.8b, v2.8b, v3.8b}, [%1], #32 \n"  // Upper 8 bits.
+      "ld1         {v7.8b}, [%0], #8             \n"  // 8 bytes of lower bits.
+      "shl         v6.8b, v7.8b, #2              \n"  // Align lower bits.
+      "shl         v5.8b, v7.8b, #4              \n"
+      "shl         v4.8b, v7.8b, #6              \n"
+      "zip1        v0.16b, v4.16b, v0.16b        \n"  // Zip lower and upper.
+      "zip1        v1.16b, v5.16b, v1.16b        \n"
+      "zip1        v2.16b, v6.16b, v2.16b        \n"
+      "zip1        v3.16b, v7.16b, v3.16b        \n"
+      "sri         v0.8h, v0.8h, #10             \n"  // Copy top 6 bits down.
+      "sri         v1.8h, v1.8h, #10             \n"
+      "sri         v2.8h, v2.8h, #10             \n"
+      "sri         v3.8h, v3.8h, #10             \n"
+      "st4         {v0.8h, v1.8h, v2.8h, v3.8h}, [%2], #64 \n"  // 32 pixels.
+      "ld4         {v0.8b, v1.8b, v2.8b, v3.8b}, [%1], #32 \n"  // Last 32.
+      "ld1         {v7.8b}, [%0], #8             \n"
+      "shl         v6.8b, v7.8b, #2              \n"
+      "shl         v5.8b, v7.8b, #4              \n"
+      "shl         v4.8b, v7.8b, #6              \n"
+      "zip1        v0.16b, v4.16b, v0.16b        \n"
+      "zip1        v1.16b, v5.16b, v1.16b        \n"
+      "zip1        v2.16b, v6.16b, v2.16b        \n"
+      "zip1        v3.16b, v7.16b, v3.16b        \n"
+      "sri         v0.8h, v0.8h, #10             \n"
+      "sri         v1.8h, v1.8h, #10             \n"
+      "sri         v2.8h, v2.8h, #10             \n"
+      "sri         v3.8h, v3.8h, #10             \n"
+      "st4         {v0.8h, v1.8h, v2.8h, v3.8h}, [%2], #64 \n"
+      "mov         %0, %1                        \n"  // Next 80 byte block.
+      "add         %1, %0, #16                   \n"  // Skip 16 lower bytes.
+      "subs        %3, %3, #80                   \n"  // 80 bytes per block.
+      "b.gt        1b                            \n"
+      : "+r"(src_lower_bits),  // %0
+        "+r"(src_upper_bits),  // %1
+        "+r"(dst),             // %2
+        "+r"(size)             // %3
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
+        "v7");  // v0-v7 are the only non-operand registers written.
+}
+
 #if LIBYUV_USE_ST2
 // Reads 16 U's and V's and writes out 16 pairs of UV.
 void MergeUVRow_NEON(const uint8_t* src_u,
diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc
index 47eff2ece..ad34cec7e 100644
--- a/unit_test/convert_test.cc
+++ b/unit_test/convert_test.cc
@@ -538,6 +538,26 @@ TESTBIPLANARTOBP(P016, uint16_t, 2, 2, 2, P416, uint16_t, 2, 1, 1, 12, 1, 1)
 TESTBIPLANARTOBP(P216, uint16_t, 2, 2, 1, P416, uint16_t, 2, 1, 1, 12, 1, 1)
 TESTBIPLANARTOBP(MM21, uint8_t, 1, 2, 2, NV12, uint8_t, 1, 2, 2, 8, 16, 32)
 
+// TODO(greenjustin): Test all variants.
+TESTBIPLANARTOBPI(MT2T,
+                  uint16_t,
+                  2,
+                  2,
+                  2,
+                  P010,
+                  uint16_t,
+                  2,
+                  2,
+                  2,
+                  benchmark_width_,
+                  _Opt,
+                  +,
+                  0,
+                  1,
+                  10,
+                  16,
+                  32)
+
 #define TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X,        \
                          SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC,            \
                          DST_SUBSAMP_X, DST_SUBSAMP_Y, W1280, N, NEG, OFF,     \