From 26173eb73ec1f52fe4b405b760b1e9f2a2d2d04a Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Wed, 22 Nov 2017 15:11:11 -0800 Subject: [PATCH] H010ToAR30 for 10 bit bt.709 YUV to 30 bit RGB This version of the H010ToAR30 provides a 3 step conversion Convert16To8Row_AVX2 H420ToARGB_AVX2 ARGBToAR30_AVX2 Low level function added to convert 16 bit to 8 bit using multiply to adjust 10 bit or other bit depths and then save the upper 16 bits. Bug: libyuv:751 Test: LibYUVPlanarTest.Convert16To8Row_Opt unittest added Change-Id: I9cc576fda8afa1003cb961d03e0e656e0b478f03 Reviewed-on: https://chromium-review.googlesource.com/783554 Commit-Queue: Frank Barchard Reviewed-by: richard winterton --- README.chromium | 2 +- include/libyuv/convert_argb.h | 13 ++++ include/libyuv/row.h | 11 ++- include/libyuv/version.h | 2 +- include/libyuv/video_common.h | 2 + source/convert_argb.cc | 130 ++++++++++++++++++++++++++++++++ source/row_common.cc | 20 +++++ source/row_gcc.cc | 133 ++++++++++++++++++++++----------- unit_test/convert_test.cc | 63 ++++++++++++++++ unit_test/planar_test.cc | 45 ++++++++++- unit_test/video_common_test.cc | 2 + 11 files changed, 372 insertions(+), 51 deletions(-) diff --git a/README.chromium b/README.chromium index 2ba21e58b..bd99afd2e 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1679 +Version: 1680 License: BSD License File: LICENSE diff --git a/include/libyuv/convert_argb.h b/include/libyuv/convert_argb.h index f43a5060b..1c89d9456 100644 --- a/include/libyuv/convert_argb.h +++ b/include/libyuv/convert_argb.h @@ -321,6 +321,19 @@ int H422ToABGR(const uint8* src_y, int width, int height); +// Convert H010 to AR30. 
+LIBYUV_API +int H010ToAR30(const uint16* src_y, + int src_stride_y, + const uint16* src_u, + int src_stride_u, + const uint16* src_v, + int src_stride_v, + uint8* dst_ar30, + int dst_stride_ar30, + int width, + int height); + // BGRA little endian (argb in memory) to ARGB. LIBYUV_API int BGRAToARGB(const uint8* src_frame, diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 30b6e4c6a..743f6b154 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -278,6 +278,7 @@ extern "C" { (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) && \ (defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)) #define HAS_ARGBTOAR30ROW_AVX2 +#define HAS_CONVERT16TO8ROW_AVX2 #define HAS_MERGEUVROW_16_AVX2 #define HAS_MULTIPLYROW_16_AVX2 #endif @@ -1540,6 +1541,12 @@ void MultiplyRow_16_AVX2(const uint16* src_y, int width); void MultiplyRow_16_C(const uint16* src_y, uint16* dst_y, int scale, int width); +void Convert16To8Row_AVX2(const uint16* src_y, + uint8* dst_y, + int scale, + int width); +void Convert16To8Row_C(const uint16* src_y, uint8* dst_y, int scale, int width); + void CopyRow_SSE2(const uint8* src, uint8* dst, int count); void CopyRow_AVX(const uint8* src, uint8* dst, int count); void CopyRow_ERMS(const uint8* src, uint8* dst, int count); @@ -2419,9 +2426,7 @@ void ARGBToARGB1555Row_Any_AVX2(const uint8* src_argb, void ARGBToARGB4444Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int width); -void ARGBToAR30Row_Any_AVX2(const uint8* src_argb, - uint8* dst_rgb, - int width); +void ARGBToAR30Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int width); void ARGBToRGB24Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int width); void ARGBToRAWRow_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int width); diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 884f3c950..b2b65d135 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define 
INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1679 +#define LIBYUV_VERSION 1680 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/include/libyuv/video_common.h b/include/libyuv/video_common.h index b09450653..ec520ef09 100644 --- a/include/libyuv/video_common.h +++ b/include/libyuv/video_common.h @@ -93,6 +93,7 @@ enum FourCC { FOURCC_J420 = FOURCC('J', '4', '2', '0'), FOURCC_J400 = FOURCC('J', '4', '0', '0'), // unofficial fourcc FOURCC_H420 = FOURCC('H', '4', '2', '0'), // unofficial fourcc + FOURCC_H010 = FOURCC('H', '0', '1', '0'), // unofficial fourcc. 10 bit lsb // 14 Auxiliary aliases. CanonicalFourCC() maps these to canonical fourcc. FOURCC_IYUV = FOURCC('I', 'Y', 'U', 'V'), // Alias for I420. @@ -154,6 +155,7 @@ enum FourCCBpp { FOURCC_BPP_J420 = 12, FOURCC_BPP_J400 = 8, FOURCC_BPP_H420 = 12, + FOURCC_BPP_H010 = 24, FOURCC_BPP_MJPG = 0, // 0 means unknown. FOURCC_BPP_H264 = 0, FOURCC_BPP_IYUV = 12, diff --git a/source/convert_argb.cc b/source/convert_argb.cc index 5007bdb97..feef641cd 100644 --- a/source/convert_argb.cc +++ b/source/convert_argb.cc @@ -428,6 +428,136 @@ int H422ToABGR(const uint8* src_y, width, height); } +// Convert 10 bit YUV to 10 bit RGB with matrix +static int H010ToAR30Matrix(const uint16* src_y, + int src_stride_y, + const uint16* src_u, + int src_stride_u, + const uint16* src_v, + int src_stride_v, + uint8* dst_ar30, + int dst_stride_ar30, + const struct YuvConstants* yuvconstants, + int scale, // 16384 for 10 bits + int width, + int height) { + int y; + int halfwidth = (width + 1) >> 1; + void (*Convert16To8Row)(const uint16* src_y, uint8* dst_y, int scale, + int width) = Convert16To8Row_C; + void (*I422ToARGBRow)(const uint8* y_buf, const uint8* u_buf, + const uint8* v_buf, uint8* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToARGBRow_C; + void (*ARGBToAR30Row)(const uint8* src_argb, uint8* dst_rgb, int width) = + ARGBToAR30Row_C; + + if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height 
== 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; + dst_stride_ar30 = -dst_stride_ar30; + } + +#if defined(HAS_CONVERT16TO8ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + Convert16To8Row = Convert16To8Row_C; // TODO(fbarchard): Any AVX2 + if (IS_ALIGNED(width, 64)) { + Convert16To8Row = Convert16To8Row_AVX2; + } + } +#endif + +#if defined(HAS_ARGBTOAR30ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToAR30Row = ARGBToAR30Row_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBToAR30Row = ARGBToAR30Row_AVX2; + } + } +#endif + +#if defined(HAS_I422TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToARGBRow = I422ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I422TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToARGBRow = I422ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToARGBRow = I422ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_I422TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToARGBRow = I422ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_I422TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToARGBRow = I422ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_MSA; + } + } +#endif + + align_buffer_64(row_y, width); + align_buffer_64(row_u, halfwidth); + align_buffer_64(row_v, halfwidth); + align_buffer_64(row_argb, width * 4); + + for (y = 0; y < height; ++y) { + Convert16To8Row(src_y, row_y, scale, width); + Convert16To8Row(src_u, row_u, scale, halfwidth); + Convert16To8Row(src_v, row_v, scale, halfwidth); + + I422ToARGBRow(row_y, row_u, row_v, row_argb, yuvconstants, width); + + ARGBToAR30Row(row_argb, dst_ar30, width); + + dst_ar30 += dst_stride_ar30; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += 
src_stride_v; + } + } + free_aligned_buffer_64(row_y); + free_aligned_buffer_64(row_u); + free_aligned_buffer_64(row_v); + free_aligned_buffer_64(row_argb); + return 0; +} + +// Convert H010 to AR30. +LIBYUV_API +int H010ToAR30(const uint16* src_y, + int src_stride_y, + const uint16* src_u, + int src_stride_u, + const uint16* src_v, + int src_stride_v, + uint8* dst_ar30, + int dst_stride_ar30, + int width, + int height) { + return H010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_ar30, dst_stride_ar30, + &kYuvH709Constants, 16384, width, height); +} + // Convert I444 to ARGB with matrix static int I444ToARGBMatrix(const uint8* src_y, int src_stride_y, diff --git a/source/row_common.cc b/source/row_common.cc index 5dfd57aed..3263142b7 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -1811,6 +1811,11 @@ void MergeRGBRow_C(const uint8* src_r, } } +// Use scale to convert lsb formats to msb, depending how many bits there are: +// 128 = 9 bits +// 64 = 10 bits +// 16 = 12 bits +// 1 = 16 bits void MergeUVRow_16_C(const uint16* src_u, const uint16* src_v, uint16* dst_uv, @@ -1840,6 +1845,21 @@ void MultiplyRow_16_C(const uint16* src_y, } } +// Use scale to convert lsb formats to msb, depending how many bits there are: +// 32768 = 9 bits +// 16384 = 10 bits +// 4096 = 12 bits +// 256 = 16 bits +void Convert16To8Row_C(const uint16* src_y, + uint8* dst_y, + int scale, + int width) { + int x; + for (x = 0; x < width; ++x) { + dst_y[x] = (src_y[x] * scale) >> 16; + } +} + void CopyRow_C(const uint8* src, uint8* dst, int count) { memcpy(dst, src, count); } diff --git a/source/row_gcc.cc b/source/row_gcc.cc index f348b7edc..bfebbb3e3 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -702,52 +702,51 @@ void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int width) { #ifdef HAS_ARGBTOAR30ROW_AVX2 void ARGBToAR30Row_AVX2(const uint8* src, uint8* dst, int width) { - asm volatile ( - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" 
// 0x000000ff mask - "vpsrld $0x18,%%ymm4,%%ymm4 \n" - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // 0xc0000000 mask - "vpslld $30,%%ymm5,%%ymm5 \n" + asm volatile( + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 0x000000ff mask + "vpsrld $0x18,%%ymm4,%%ymm4 \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // 0xc0000000 mask + "vpslld $30,%%ymm5,%%ymm5 \n" - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - // alpha - "vpand %%ymm5,%%ymm0,%%ymm3 \n" - // red - "vpsrld $0x10,%%ymm0,%%ymm1 \n" - "vpand %%ymm4,%%ymm1,%%ymm1 \n" - "vpsrld $0x6,%%ymm1,%%ymm2 \n" - "vpslld $22,%%ymm1,%%ymm1 \n" - "vpslld $20,%%ymm2,%%ymm2 \n" - "vpor %%ymm1,%%ymm3,%%ymm3 \n" - "vpor %%ymm2,%%ymm3,%%ymm3 \n" - //green - "vpsrld $0x08,%%ymm0,%%ymm1 \n" - "vpand %%ymm4,%%ymm1,%%ymm1 \n" - "vpsrld $0x6,%%ymm1,%%ymm2 \n" - "vpslld $12,%%ymm1,%%ymm1 \n" - "vpslld $10,%%ymm2,%%ymm2 \n" - "vpor %%ymm1,%%ymm3,%%ymm3 \n" - "vpor %%ymm2,%%ymm3,%%ymm3 \n" - //blue - "vpand %%ymm4,%%ymm0,%%ymm1 \n" - "vpsrld $0x6,%%ymm1,%%ymm2 \n" - "vpslld $2,%%ymm1,%%ymm1 \n" - "vpor %%ymm1,%%ymm3,%%ymm3 \n" - "vpor %%ymm2,%%ymm3,%%ymm3 \n" + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + // alpha + "vpand %%ymm5,%%ymm0,%%ymm3 \n" + // red + "vpsrld $0x10,%%ymm0,%%ymm1 \n" + "vpand %%ymm4,%%ymm1,%%ymm1 \n" + "vpsrld $0x6,%%ymm1,%%ymm2 \n" + "vpslld $22,%%ymm1,%%ymm1 \n" + "vpslld $20,%%ymm2,%%ymm2 \n" + "vpor %%ymm1,%%ymm3,%%ymm3 \n" + "vpor %%ymm2,%%ymm3,%%ymm3 \n" + // green + "vpsrld $0x08,%%ymm0,%%ymm1 \n" + "vpand %%ymm4,%%ymm1,%%ymm1 \n" + "vpsrld $0x6,%%ymm1,%%ymm2 \n" + "vpslld $12,%%ymm1,%%ymm1 \n" + "vpslld $10,%%ymm2,%%ymm2 \n" + "vpor %%ymm1,%%ymm3,%%ymm3 \n" + "vpor %%ymm2,%%ymm3,%%ymm3 \n" + // blue + "vpand %%ymm4,%%ymm0,%%ymm1 \n" + "vpsrld $0x6,%%ymm1,%%ymm2 \n" + "vpslld $2,%%ymm1,%%ymm1 \n" + "vpor %%ymm1,%%ymm3,%%ymm3 \n" + "vpor %%ymm2,%%ymm3,%%ymm3 \n" - "vmovdqu %%ymm3,(%1) \n" - "add $0x20,%0 \n" - "add $0x20,%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - 
"+r"(width) // %2 - :: "memory", "cc", - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); + "vmovdqu %%ymm3,(%1) \n" + "add $0x20,%0 \n" + "add $0x20,%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif @@ -2851,6 +2850,11 @@ void MergeUVRow_16_AVX2(const uint16* src_u, } #endif // HAS_MERGEUVROW_AVX2 +// Use scale to convert lsb formats to msb, depending how many bits there are: +// 128 = 9 bits +// 64 = 10 bits +// 16 = 12 bits +// 1 = 16 bits #ifdef HAS_MULTIPLYROW_16_AVX2 void MultiplyRow_16_AVX2(const uint16* src_y, uint16* dst_y, @@ -2885,6 +2889,47 @@ void MultiplyRow_16_AVX2(const uint16* src_y, } #endif // HAS_MULTIPLYROW_16_AVX2 +// Use scale to convert lsb formats to msb, depending how many bits there are: +// 32768 = 9 bits +// 16384 = 10 bits +// 4096 = 12 bits +// 256 = 16 bits +#ifdef HAS_CONVERT16TO8ROW_AVX2 +void Convert16To8Row_AVX2(const uint16* src_y, + uint8* dst_y, + int scale, + int width) { + // clang-format off + asm volatile ( + "vmovd %3,%%xmm3 \n" + "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n" + "vbroadcastss %%xmm3,%%ymm3 \n" + + // 32 pixels per loop. + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vpmulhuw %%ymm3,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" + + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" // mutates + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "add $0x40,%0 \n" + "add $0x20,%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_y), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(scale) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm3"); + // clang-format on +} +#endif // HAS_CONVERT16TO8ROW_AVX2 + #ifdef HAS_SPLITRGBROW_SSSE3 // Shuffle table for converting RGB to Planar. 
diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc index ead5919c3..8bcb63d3c 100644 --- a/unit_test/convert_test.cc +++ b/unit_test/convert_test.cc @@ -1963,4 +1963,67 @@ TEST_F(LibYUVConvertTest, ARGBToAR30Row_Opt) { } #endif // HAS_ARGBTOAR30ROW_AVX2 +// Alias to copy pixels as is +#define AR30ToAR30 ARGBToARGB + +#define TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \ + ALIGN, YALIGN, W1280, DIFF, N, NEG, OFF, FMT_C, \ + BPP_C) \ + TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \ + const int kWidth = ((W1280) > 0) ? (W1280) : 1; \ + const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \ + const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \ + const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \ + const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \ + const int kBpc = 2; \ + align_buffer_page_end(src_y, kWidth* kHeight* kBpc + OFF); \ + align_buffer_page_end(src_u, kSizeUV* kBpc + OFF); \ + align_buffer_page_end(src_v, kSizeUV* kBpc + OFF); \ + align_buffer_page_end(dst_argb_c, kStrideB* kHeight + OFF); \ + align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + OFF); \ + for (int i = 0; i < kWidth * kHeight; ++i) { \ + reinterpret_cast<uint16*>(src_y)[i + OFF] = (fastrand() & 0x3ff); \ + } \ + for (int i = 0; i < kSizeUV; ++i) { \ + reinterpret_cast<uint16*>(src_u)[i + OFF] = (fastrand() & 0x3ff); \ + reinterpret_cast<uint16*>(src_v)[i + OFF] = (fastrand() & 0x3ff); \ + } \ + memset(dst_argb_c + OFF, 1, kStrideB * kHeight); \ + memset(dst_argb_opt + OFF, 101, kStrideB * kHeight); \ + MaskCpuFlags(disable_cpu_flags_); \ + FMT_PLANAR##To##FMT_B(reinterpret_cast<const uint16*>(src_y) + OFF, kWidth, \ + reinterpret_cast<const uint16*>(src_u) + OFF, kStrideUV, \ + reinterpret_cast<const uint16*>(src_v) + OFF, kStrideUV, \ + dst_argb_c + OFF, kStrideB, kWidth, NEG kHeight); \ + MaskCpuFlags(benchmark_cpu_info_); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + FMT_PLANAR##To##FMT_B(reinterpret_cast<const uint16*>(src_y) + OFF, kWidth, \ + reinterpret_cast<const uint16*>(src_u) + OFF, kStrideUV, \ 
+ reinterpret_cast<const uint16*>(src_v) + OFF, kStrideUV, \ + dst_argb_opt + OFF, kStrideB, kWidth, \ + NEG kHeight); \ + } \ + int max_diff = 0; \ + for (int i = 0; i < kWidth * BPP_C * kHeight; ++i) { \ + int abs_diff = abs(static_cast<int>(dst_argb_c[i]) - \ + static_cast<int>(dst_argb_opt[i])); \ + if (abs_diff > max_diff) { \ + max_diff = abs_diff; \ + } \ + } \ + EXPECT_LE(max_diff, DIFF); \ + free_aligned_buffer_page_end(src_y); \ + free_aligned_buffer_page_end(src_u); \ + free_aligned_buffer_page_end(src_v); \ + free_aligned_buffer_page_end(dst_argb_c); \ + free_aligned_buffer_page_end(dst_argb_opt); \ + } + +#define TESTPLANAR16TOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, DIFF, FMT_C, BPP_C) \ + TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, benchmark_width_, DIFF, _Opt, +, 0, FMT_C, BPP_C) + +TESTPLANAR16TOB(H010, 2, 2, AR30, 4, 4, 1, 2, AR30, 4) + } // namespace libyuv diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc index f9e6f8abb..151bcafd1 100644 --- a/unit_test/planar_test.cc +++ b/unit_test/planar_test.cc @@ -2661,7 +2661,7 @@ TEST_F(LibYUVPlanarTest, MergeUVRow_16_Opt) { } #endif -// TODO(fbarchard): improve test for platforms and cpu detect +// TODO(fbarchard): Improve test for more platforms. #ifdef HAS_MULTIPLYROW_16_AVX2 TEST_F(LibYUVPlanarTest, MultiplyRow_16_Opt) { const int kPixels = benchmark_width_ * benchmark_height_; @@ -2697,7 +2697,48 @@ TEST_F(LibYUVPlanarTest, MultiplyRow_16_Opt) { free_aligned_buffer_page_end(dst_pixels_y_opt); free_aligned_buffer_page_end(dst_pixels_y_c); } -#endif +#endif // HAS_MULTIPLYROW_16_AVX2 + +// TODO(fbarchard): Improve test for more platforms. 
+#ifdef HAS_CONVERT16TO8ROW_AVX2 +TEST_F(LibYUVPlanarTest, Convert16To8Row_Opt) { + const int kPixels = benchmark_width_ * benchmark_height_; + align_buffer_page_end(src_pixels_y, kPixels * 2); + align_buffer_page_end(dst_pixels_y_opt, kPixels); + align_buffer_page_end(dst_pixels_y_c, kPixels); + + MemRandomize(src_pixels_y, kPixels * 2); + // C code does not clamp so limit source range to 10 bits. + for (int i = 0; i < kPixels; ++i) { + reinterpret_cast<uint16*>(src_pixels_y)[i] &= 1023; + } + + memset(dst_pixels_y_opt, 0, kPixels); + memset(dst_pixels_y_c, 1, kPixels); + + Convert16To8Row_C(reinterpret_cast<const uint16*>(src_pixels_y), + dst_pixels_y_c, 16384, kPixels); + + int has_avx2 = TestCpuFlag(kCpuHasAVX2); + for (int i = 0; i < benchmark_iterations_; ++i) { + if (has_avx2) { + Convert16To8Row_AVX2(reinterpret_cast<const uint16*>(src_pixels_y), + dst_pixels_y_opt, 16384, kPixels); + } else { + Convert16To8Row_C(reinterpret_cast<const uint16*>(src_pixels_y), + dst_pixels_y_opt, 16384, kPixels); + } + } + + for (int i = 0; i < kPixels; ++i) { + EXPECT_EQ(dst_pixels_y_opt[i], dst_pixels_y_c[i]); + } + + free_aligned_buffer_page_end(src_pixels_y); + free_aligned_buffer_page_end(dst_pixels_y_opt); + free_aligned_buffer_page_end(dst_pixels_y_c); +} +#endif // HAS_CONVERT16TO8ROW_AVX2 float TestScaleMaxSamples(int benchmark_width, int benchmark_height, diff --git a/unit_test/video_common_test.cc b/unit_test/video_common_test.cc index 424d79986..ba7b15a9d 100644 --- a/unit_test/video_common_test.cc +++ b/unit_test/video_common_test.cc @@ -80,6 +80,8 @@ TEST_F(LibYUVBaseTest, TestFourCC) { EXPECT_TRUE(TestValidFourCC(FOURCC_RGBP, FOURCC_BPP_RGBP)); EXPECT_TRUE(TestValidFourCC(FOURCC_RGBO, FOURCC_BPP_RGBO)); EXPECT_TRUE(TestValidFourCC(FOURCC_R444, FOURCC_BPP_R444)); + EXPECT_TRUE(TestValidFourCC(FOURCC_H420, FOURCC_BPP_H420)); + EXPECT_TRUE(TestValidFourCC(FOURCC_H010, FOURCC_BPP_H010)); EXPECT_TRUE(TestValidFourCC(FOURCC_MJPG, FOURCC_BPP_MJPG)); EXPECT_TRUE(TestValidFourCC(FOURCC_YV12, FOURCC_BPP_YV12)); 
EXPECT_TRUE(TestValidFourCC(FOURCC_YV16, FOURCC_BPP_YV16));