Add 10/12 bit YUV To YUV functions

The following functions (and their 12 bit variants) are added; a usage sketch follows the list:

planar, 10->10:
 I410ToI010, I210ToI010

planar, 10->8:
 I410ToI444, I210ToI422

planar<->biplanar, 10->10:
 I010ToP010, I210ToP210, I410ToP410
 P010ToI010, P210ToI210, P410ToI410
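
A minimal usage sketch for one of the new planar-to-biplanar conversions (illustrative only; the wrapper name and buffer handling are not part of this change, and strides are in uint16_t elements as in the unit tests below):

#include <cstdint>
#include <vector>
#include "libyuv/convert.h"

// Convert 10 bit 4:2:0 planar I010 (lsb samples) to biplanar P010 (msb).
bool ConvertI010ToP010Example(int width, int height) {
  const int half_w = (width + 1) / 2;
  const int half_h = (height + 1) / 2;
  std::vector<uint16_t> src_y(width * height), src_u(half_w * half_h),
      src_v(half_w * half_h);
  std::vector<uint16_t> dst_y(width * height), dst_uv(half_w * 2 * half_h);
  return libyuv::I010ToP010(src_y.data(), width, src_u.data(), half_w,
                            src_v.data(), half_w, dst_y.data(), width,
                            dst_uv.data(), half_w * 2, width, height) == 0;
}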

R=fbarchard@chromium.org

Change-Id: I9aa2bafa0d6a6e1e38ce4e20cbb437e10f9b0158
Bug: libyuv:834, libyuv:873
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2709822
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: richard winterton <rrwinterton@gmail.com>
Yuan Tong 2021-02-25 15:21:28 +08:00 committed by Frank Barchard
parent 08815a2976
commit a8c181050c
17 changed files with 1574 additions and 232 deletions

View File

@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1779
License: BSD
License File: LICENSE

View File

@ -193,6 +193,129 @@ int I010ToI420(const uint16_t* src_y,
int width,
int height);
#define H210ToH422 I210ToI422
LIBYUV_API
int I210ToI422(const uint16_t* src_y,
int src_stride_y,
const uint16_t* src_u,
int src_stride_u,
const uint16_t* src_v,
int src_stride_v,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_u,
int dst_stride_u,
uint8_t* dst_v,
int dst_stride_v,
int width,
int height);
#define H410ToH444 I410ToI444
LIBYUV_API
int I410ToI444(const uint16_t* src_y,
int src_stride_y,
const uint16_t* src_u,
int src_stride_u,
const uint16_t* src_v,
int src_stride_v,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_u,
int dst_stride_u,
uint8_t* dst_v,
int dst_stride_v,
int width,
int height);
#define H012ToH420 I012ToI420
LIBYUV_API
int I012ToI420(const uint16_t* src_y,
int src_stride_y,
const uint16_t* src_u,
int src_stride_u,
const uint16_t* src_v,
int src_stride_v,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_u,
int dst_stride_u,
uint8_t* dst_v,
int dst_stride_v,
int width,
int height);
#define H212ToH422 I212ToI422
LIBYUV_API
int I212ToI422(const uint16_t* src_y,
int src_stride_y,
const uint16_t* src_u,
int src_stride_u,
const uint16_t* src_v,
int src_stride_v,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_u,
int dst_stride_u,
uint8_t* dst_v,
int dst_stride_v,
int width,
int height);
#define H412ToH444 I412ToI444
LIBYUV_API
int I412ToI444(const uint16_t* src_y,
int src_stride_y,
const uint16_t* src_u,
int src_stride_u,
const uint16_t* src_v,
int src_stride_v,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_u,
int dst_stride_u,
uint8_t* dst_v,
int dst_stride_v,
int width,
int height);
#define I412ToI012 I410ToI010
#define H410ToH010 I410ToI010
#define H412ToH012 I410ToI010
LIBYUV_API
int I410ToI010(const uint16_t* src_y,
int src_stride_y,
const uint16_t* src_u,
int src_stride_u,
const uint16_t* src_v,
int src_stride_v,
uint16_t* dst_y,
int dst_stride_y,
uint16_t* dst_u,
int dst_stride_u,
uint16_t* dst_v,
int dst_stride_v,
int width,
int height);
#define I212ToI012 I210ToI010
#define H210ToH010 I210ToI010
#define H212ToH012 I210ToI010
LIBYUV_API
int I210ToI010(const uint16_t* src_y,
int src_stride_y,
const uint16_t* src_u,
int src_stride_u,
const uint16_t* src_v,
int src_stride_v,
uint16_t* dst_y,
int dst_stride_y,
uint16_t* dst_u,
int dst_stride_u,
uint16_t* dst_v,
int dst_stride_v,
int width,
int height);
// Convert I010 to I410
LIBYUV_API
int I010ToI410(const uint16_t* src_y,
@ -233,6 +356,66 @@ int I210ToI410(const uint16_t* src_y,
// Convert I212 to I412
#define I212ToI412 I210ToI410
// Convert I010 to P010
LIBYUV_API
int I010ToP010(const uint16_t* src_y,
int src_stride_y,
const uint16_t* src_u,
int src_stride_u,
const uint16_t* src_v,
int src_stride_v,
uint16_t* dst_y,
int dst_stride_y,
uint16_t* dst_uv,
int dst_stride_uv,
int width,
int height);
// Convert I210 to P210
LIBYUV_API
int I210ToP210(const uint16_t* src_y,
int src_stride_y,
const uint16_t* src_u,
int src_stride_u,
const uint16_t* src_v,
int src_stride_v,
uint16_t* dst_y,
int dst_stride_y,
uint16_t* dst_uv,
int dst_stride_uv,
int width,
int height);
// Convert I012 to P012
LIBYUV_API
int I012ToP012(const uint16_t* src_y,
int src_stride_y,
const uint16_t* src_u,
int src_stride_u,
const uint16_t* src_v,
int src_stride_v,
uint16_t* dst_y,
int dst_stride_y,
uint16_t* dst_uv,
int dst_stride_uv,
int width,
int height);
// Convert I212 to P212
LIBYUV_API
int I212ToP212(const uint16_t* src_y,
int src_stride_y,
const uint16_t* src_u,
int src_stride_u,
const uint16_t* src_v,
int src_stride_v,
uint16_t* dst_y,
int dst_stride_y,
uint16_t* dst_uv,
int dst_stride_uv,
int width,
int height);
// Convert I400 (grey) to I420.
LIBYUV_API
int I400ToI420(const uint8_t* src_y,

View File

@ -39,6 +39,24 @@ int I420ToI010(const uint8_t* src_y,
int width,
int height);
// Convert 8 bit YUV to 12 bit.
#define H420ToH012 I420ToI012
LIBYUV_API
int I420ToI012(const uint8_t* src_y,
int src_stride_y,
const uint8_t* src_u,
int src_stride_u,
const uint8_t* src_v,
int src_stride_v,
uint16_t* dst_y,
int dst_stride_y,
uint16_t* dst_u,
int dst_stride_u,
uint16_t* dst_v,
int dst_stride_v,
int width,
int height);
LIBYUV_API
int I420ToI422(const uint8_t* src_y,
int src_stride_y,

View File

@ -105,6 +105,50 @@ void MergeUVPlane(const uint8_t* src_u,
int width,
int height);
// Split interleaved msb UV plane into separate lsb U and V planes.
LIBYUV_API
void SplitUVPlane_16(const uint16_t* src_uv,
int src_stride_uv,
uint16_t* dst_u,
int dst_stride_u,
uint16_t* dst_v,
int dst_stride_v,
int width,
int height,
int depth);
// Merge separate lsb U and V planes into one interleaved msb UV plane.
LIBYUV_API
void MergeUVPlane_16(const uint16_t* src_u,
int src_stride_u,
const uint16_t* src_v,
int src_stride_v,
uint16_t* dst_uv,
int dst_stride_uv,
int width,
int height,
int depth);
// Convert lsb plane to msb plane
LIBYUV_API
void ConvertToMSBPlane_16(const uint16_t* src_y,
int src_stride_y,
uint16_t* dst_y,
int dst_stride_y,
int width,
int height,
int depth);
// Convert msb plane to lsb plane
LIBYUV_API
void ConvertToLSBPlane_16(const uint16_t* src_y,
int src_stride_y,
uint16_t* dst_y,
int dst_stride_y,
int width,
int height,
int depth);
// Scale U and V to half width and height and merge into interleaved UV plane.
// width and height are source size, allowing odd sizes.
// Use for converting I444 or I422 to NV12.

View File

@ -305,6 +305,7 @@ extern "C" {
#define HAS_ARGBTORGB24ROW_AVX2
#define HAS_CONVERT16TO8ROW_AVX2
#define HAS_CONVERT8TO16ROW_AVX2
#define HAS_DIVIDEROW_16_AVX2
#define HAS_HALFMERGEUVROW_AVX2
#define HAS_MERGEARGBROW_AVX2
#define HAS_I210TOAR30ROW_AVX2
@ -318,6 +319,7 @@ extern "C" {
#define HAS_MULTIPLYROW_16_AVX2
#define HAS_RGBATOYJROW_AVX2
#define HAS_SPLITARGBROW_AVX2
#define HAS_SPLITUVROW_16_AVX2
#define HAS_SWAPUVROW_AVX2
// TODO(fbarchard): Fix AVX2 version of YUV24
// #define HAS_NV21TOYUV24ROW_AVX2
@ -363,6 +365,7 @@ extern "C" {
#define HAS_BGRATOYROW_NEON
#define HAS_BYTETOFLOATROW_NEON
#define HAS_COPYROW_NEON
#define HAS_DIVIDEROW_16_NEON
#define HAS_HALFFLOATROW_NEON
#define HAS_HALFMERGEUVROW_NEON
#define HAS_I400TOARGBROW_NEON
@ -380,9 +383,11 @@ extern "C" {
#define HAS_J400TOARGBROW_NEON
#define HAS_MERGEARGBROW_NEON
#define HAS_MERGEUVROW_NEON
#define HAS_MERGEUVROW_16_NEON
#define HAS_MIRRORROW_NEON
#define HAS_MIRRORUVROW_NEON
#define HAS_MIRRORSPLITUVROW_NEON
#define HAS_MULTIPLYROW_16_NEON
#define HAS_NV12TOARGBROW_NEON
#define HAS_NV12TORGB24ROW_NEON
#define HAS_NV12TORGB565ROW_NEON
@ -409,6 +414,7 @@ extern "C" {
#define HAS_SPLITARGBROW_NEON
#define HAS_SPLITRGBROW_NEON
#define HAS_SPLITUVROW_NEON
#define HAS_SPLITUVROW_16_NEON
#define HAS_SWAPUVROW_NEON
#define HAS_UYVYTOARGBROW_NEON
#define HAS_UYVYTOUV422ROW_NEON
@ -2010,22 +2016,96 @@ void SplitXRGBRow_Any_NEON(const uint8_t* src_argb,
void MergeUVRow_16_C(const uint16_t* src_u,
const uint16_t* src_v,
uint16_t* dst_uv,
int depth,
int width);
void MergeUVRow_16_AVX2(const uint16_t* src_u,
const uint16_t* src_v,
uint16_t* dst_uv,
int depth,
int width);
void MergeUVRow_16_Any_AVX2(const uint16_t* src_u,
const uint16_t* src_v,
uint16_t* dst_uv,
int depth,
int width);
void MergeUVRow_16_NEON(const uint16_t* src_u,
const uint16_t* src_v,
uint16_t* dst_uv,
int depth,
int width);
void MergeUVRow_16_Any_NEON(const uint16_t* src_u,
const uint16_t* src_v,
uint16_t* dst_uv,
int depth,
int width);
void SplitUVRow_16_C(const uint16_t* src_uv,
uint16_t* dst_u,
uint16_t* dst_v,
int depth,
int width);
void SplitUVRow_16_AVX2(const uint16_t* src_uv,
uint16_t* dst_u,
uint16_t* dst_v,
int depth,
int width);
void SplitUVRow_16_Any_AVX2(const uint16_t* src_uv,
uint16_t* dst_u,
uint16_t* dst_v,
int depth,
int width);
void SplitUVRow_16_NEON(const uint16_t* src_uv,
uint16_t* dst_u,
uint16_t* dst_v,
int depth,
int width);
void SplitUVRow_16_Any_NEON(const uint16_t* src_uv,
uint16_t* dst_u,
uint16_t* dst_v,
int depth,
int width);
void MultiplyRow_16_C(const uint16_t* src_y,
uint16_t* dst_y,
int scale,
int width);
void MultiplyRow_16_AVX2(const uint16_t* src_y,
uint16_t* dst_y,
int scale,
int width);
void MultiplyRow_16_Any_AVX2(const uint16_t* src_y,
uint16_t* dst_y,
int scale,
int width);
void MultiplyRow_16_NEON(const uint16_t* src_y,
uint16_t* dst_y,
int scale,
int width);
void MultiplyRow_16_Any_NEON(const uint16_t* src_y,
uint16_t* dst_y,
int scale,
int width);
void DivideRow_16_C(const uint16_t* src_y,
uint16_t* dst_y,
int scale,
int width);
void DivideRow_16_AVX2(const uint16_t* src_y,
uint16_t* dst_y,
int scale,
int width);
void DivideRow_16_Any_AVX2(const uint16_t* src_y,
uint16_t* dst_y,
int scale,
int width);
void DivideRow_16_NEON(const uint16_t* src_y,
uint16_t* dst_y,
int scale,
int width);
void DivideRow_16_Any_NEON(const uint16_t* src_y,
uint16_t* dst_y,
int scale,
int width);
void Convert8To16Row_C(const uint8_t* src_y,
uint16_t* dst_y,

View File

@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1779
#endif // INCLUDE_LIBYUV_VERSION_H_

View File

@ -60,7 +60,7 @@ enum FourCC {
FOURCC_YUY2 = FOURCC('Y', 'U', 'Y', '2'),
FOURCC_UYVY = FOURCC('U', 'Y', 'V', 'Y'),
FOURCC_I010 = FOURCC('I', '0', '1', '0'), // bt.601 10 bit 420
FOURCC_I210 = FOURCC('I', '2', '1', '0'), // bt.601 10 bit 422
// 1 Secondary YUV format: row biplanar. deprecated.
FOURCC_M420 = FOURCC('M', '4', '2', '0'),
@ -109,6 +109,8 @@ enum FourCC {
FOURCC_F210 = FOURCC('F', '2', '1', '0'), // bt.709 full range 10 bit 422
FOURCC_H210 = FOURCC('H', '2', '1', '0'), // bt.709 10 bit 422
FOURCC_U210 = FOURCC('U', '2', '1', '0'), // bt.2020 10 bit 422
FOURCC_P010 = FOURCC('P', '0', '1', '0'),
FOURCC_P210 = FOURCC('P', '2', '1', '0'),
// 14 Auxiliary aliases. CanonicalFourCC() maps these to canonical fourcc.
FOURCC_IYUV = FOURCC('I', 'Y', 'U', 'V'), // Alias for I420.
@ -178,7 +180,12 @@ enum FourCCBpp {
FOURCC_BPP_J400 = 8,
FOURCC_BPP_H420 = 12,
FOURCC_BPP_H422 = 16,
FOURCC_BPP_I010 = 15,
FOURCC_BPP_I210 = 20,
FOURCC_BPP_H010 = 15,
FOURCC_BPP_H210 = 20,
FOURCC_BPP_P010 = 15,
FOURCC_BPP_P210 = 20,
FOURCC_BPP_MJPG = 0, // 0 means unknown.
FOURCC_BPP_H264 = 0,
FOURCC_BPP_IYUV = 12,

View File

@ -149,6 +149,52 @@ int I010Copy(const uint16_t* src_y,
return 0;
}
static int Planar16bitTo8bit(const uint16_t* src_y,
int src_stride_y,
const uint16_t* src_u,
int src_stride_u,
const uint16_t* src_v,
int src_stride_v,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_u,
int dst_stride_u,
uint8_t* dst_v,
int dst_stride_v,
int width,
int height,
int subsample_x,
int subsample_y,
int depth) {
int uv_width = SUBSAMPLE(width, subsample_x, subsample_x);
int uv_height = SUBSAMPLE(height, subsample_y, subsample_y);
int scale = 1 << (24 - depth);
if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
uv_height = -uv_height;
src_y = src_y + (height - 1) * src_stride_y;
src_u = src_u + (uv_height - 1) * src_stride_u;
src_v = src_v + (uv_height - 1) * src_stride_v;
src_stride_y = -src_stride_y;
src_stride_u = -src_stride_u;
src_stride_v = -src_stride_v;
}
// Convert Y plane.
Convert16To8Plane(src_y, src_stride_y, dst_y, dst_stride_y, scale, width,
height);
// Convert UV planes.
Convert16To8Plane(src_u, src_stride_u, dst_u, dst_stride_u, scale, uv_width,
uv_height);
Convert16To8Plane(src_v, src_stride_v, dst_v, dst_stride_v, scale, uv_width,
uv_height);
return 0;
}
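To make the scale factor above concrete: assuming Convert16To8Plane keeps the high 16 bits of sample * scale (the same convention the DivideRow kernels in this change use), 1 << (24 - depth) reduces to a plain right shift. Scalar sketch, not library code:

// depth 10: scale 16384, (v * 16384) >> 16 == v >> 2  (10 -> 8 bit)
// depth 12: scale 4096,  (v * 4096)  >> 16 == v >> 4  (12 -> 8 bit)
static uint8_t Convert16To8Sample(uint16_t v, int depth) {
  const uint32_t scale = 1u << (24 - depth);
  return (uint8_t)((v * scale) >> 16);
}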
// Convert 10 bit YUV to 8 bit.
LIBYUV_API
int I010ToI420(const uint16_t* src_y,
@ -165,34 +211,295 @@ int I010ToI420(const uint16_t* src_y,
int dst_stride_v,
int width,
int height) {
return Planar16bitTo8bit(src_y, src_stride_y, src_u, src_stride_u, src_v,
src_stride_v, dst_y, dst_stride_y, dst_u,
dst_stride_u, dst_v, dst_stride_v, width, height, 1,
1, 10);
}
LIBYUV_API
int I210ToI422(const uint16_t* src_y,
int src_stride_y,
const uint16_t* src_u,
int src_stride_u,
const uint16_t* src_v,
int src_stride_v,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_u,
int dst_stride_u,
uint8_t* dst_v,
int dst_stride_v,
int width,
int height) {
return Planar16bitTo8bit(src_y, src_stride_y, src_u, src_stride_u, src_v,
src_stride_v, dst_y, dst_stride_y, dst_u,
dst_stride_u, dst_v, dst_stride_v, width, height, 1,
0, 10);
}
LIBYUV_API
int I410ToI444(const uint16_t* src_y,
int src_stride_y,
const uint16_t* src_u,
int src_stride_u,
const uint16_t* src_v,
int src_stride_v,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_u,
int dst_stride_u,
uint8_t* dst_v,
int dst_stride_v,
int width,
int height) {
return Planar16bitTo8bit(src_y, src_stride_y, src_u, src_stride_u, src_v,
src_stride_v, dst_y, dst_stride_y, dst_u,
dst_stride_u, dst_v, dst_stride_v, width, height, 0,
0, 10);
}
LIBYUV_API
int I012ToI420(const uint16_t* src_y,
int src_stride_y,
const uint16_t* src_u,
int src_stride_u,
const uint16_t* src_v,
int src_stride_v,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_u,
int dst_stride_u,
uint8_t* dst_v,
int dst_stride_v,
int width,
int height) {
return Planar16bitTo8bit(src_y, src_stride_y, src_u, src_stride_u, src_v,
src_stride_v, dst_y, dst_stride_y, dst_u,
dst_stride_u, dst_v, dst_stride_v, width, height, 1,
1, 12);
}
LIBYUV_API
int I212ToI422(const uint16_t* src_y,
int src_stride_y,
const uint16_t* src_u,
int src_stride_u,
const uint16_t* src_v,
int src_stride_v,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_u,
int dst_stride_u,
uint8_t* dst_v,
int dst_stride_v,
int width,
int height) {
return Planar16bitTo8bit(src_y, src_stride_y, src_u, src_stride_u, src_v,
src_stride_v, dst_y, dst_stride_y, dst_u,
dst_stride_u, dst_v, dst_stride_v, width, height, 1,
0, 12);
}
LIBYUV_API
int I412ToI444(const uint16_t* src_y,
int src_stride_y,
const uint16_t* src_u,
int src_stride_u,
const uint16_t* src_v,
int src_stride_v,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_u,
int dst_stride_u,
uint8_t* dst_v,
int dst_stride_v,
int width,
int height) {
return Planar16bitTo8bit(src_y, src_stride_y, src_u, src_stride_u, src_v,
src_stride_v, dst_y, dst_stride_y, dst_u,
dst_stride_u, dst_v, dst_stride_v, width, height, 0,
0, 12);
}
// Any Ix10 To I010 format with mirroring.
static int Ix10ToI010(const uint16_t* src_y,
int src_stride_y,
const uint16_t* src_u,
int src_stride_u,
const uint16_t* src_v,
int src_stride_v,
uint16_t* dst_y,
int dst_stride_y,
uint16_t* dst_u,
int dst_stride_u,
uint16_t* dst_v,
int dst_stride_v,
int width,
int height,
int subsample_x,
int subsample_y) {
const int dst_y_width = Abs(width);
const int dst_y_height = Abs(height);
const int src_uv_width = SUBSAMPLE(width, subsample_x, subsample_x);
const int src_uv_height = SUBSAMPLE(height, subsample_y, subsample_y);
const int dst_uv_width = SUBSAMPLE(dst_y_width, 1, 1);
const int dst_uv_height = SUBSAMPLE(dst_y_height, 1, 1);
if (width <= 0 || height == 0) {
return -1;
}
if (dst_y) {
ScalePlane_12(src_y, src_stride_y, width, height, dst_y, dst_stride_y,
dst_y_width, dst_y_height, kFilterBilinear);
}
ScalePlane_12(src_u, src_stride_u, src_uv_width, src_uv_height, dst_u,
dst_stride_u, dst_uv_width, dst_uv_height, kFilterBilinear);
ScalePlane_12(src_v, src_stride_v, src_uv_width, src_uv_height, dst_v,
dst_stride_v, dst_uv_width, dst_uv_height, kFilterBilinear);
return 0;
}
LIBYUV_API
int I410ToI010(const uint16_t* src_y,
int src_stride_y,
const uint16_t* src_u,
int src_stride_u,
const uint16_t* src_v,
int src_stride_v,
uint16_t* dst_y,
int dst_stride_y,
uint16_t* dst_u,
int dst_stride_u,
uint16_t* dst_v,
int dst_stride_v,
int width,
int height) {
return Ix10ToI010(src_y, src_stride_y, src_u, src_stride_u, src_v,
src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
dst_v, dst_stride_v, width, height, 0, 0);
}
LIBYUV_API
int I210ToI010(const uint16_t* src_y,
int src_stride_y,
const uint16_t* src_u,
int src_stride_u,
const uint16_t* src_v,
int src_stride_v,
uint16_t* dst_y,
int dst_stride_y,
uint16_t* dst_u,
int dst_stride_u,
uint16_t* dst_v,
int dst_stride_v,
int width,
int height) {
return Ix10ToI010(src_y, src_stride_y, src_u, src_stride_u, src_v,
src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
dst_v, dst_stride_v, width, height, 1, 0);
}
// Any I[420]1[02] to P[420]1[02] format with mirroring.
static int Ix1xToPx1x(const uint16_t* src_y,
int src_stride_y,
const uint16_t* src_u,
int src_stride_u,
const uint16_t* src_v,
int src_stride_v,
uint16_t* dst_y,
int dst_stride_y,
uint16_t* dst_uv,
int dst_stride_uv,
int width,
int height,
int subsample_x,
int subsample_y,
int depth) {
const int uv_width = SUBSAMPLE(width, subsample_x, subsample_x);
const int uv_height = SUBSAMPLE(height, subsample_y, subsample_y);
if (width <= 0 || height == 0) {
return -1;
}
ConvertToMSBPlane_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height,
depth);
MergeUVPlane_16(src_u, src_stride_u, src_v, src_stride_v, dst_uv,
dst_stride_uv, uv_width, uv_height, depth);
return 0;
}
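For orientation, the msb placement produced by this helper is what P010/P210 require: each sample sits in the top depth bits of its uint16_t and chroma is interleaved U,V,U,V. Per-sample sketch (illustrative only):

// A 10 bit value v (0..1023) is stored as v << 6 in P010; MergeUVPlane_16
// applies the same shift while interleaving the two chroma planes.
static uint16_t PackP010Sample(uint16_t v10) {
  return (uint16_t)(v10 << 6);  // 16 - 10 = 6 zero low bits
}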
LIBYUV_API
int I010ToP010(const uint16_t* src_y,
int src_stride_y,
const uint16_t* src_u,
int src_stride_u,
const uint16_t* src_v,
int src_stride_v,
uint16_t* dst_y,
int dst_stride_y,
uint16_t* dst_uv,
int dst_stride_uv,
int width,
int height) {
return Ix1xToPx1x(src_y, src_stride_y, src_u, src_stride_u, src_v,
src_stride_v, dst_y, dst_stride_y, dst_uv, dst_stride_uv,
width, height, 1, 1, 10);
}
LIBYUV_API
int I210ToP210(const uint16_t* src_y,
int src_stride_y,
const uint16_t* src_u,
int src_stride_u,
const uint16_t* src_v,
int src_stride_v,
uint16_t* dst_y,
int dst_stride_y,
uint16_t* dst_uv,
int dst_stride_uv,
int width,
int height) {
return Ix1xToPx1x(src_y, src_stride_y, src_u, src_stride_u, src_v,
src_stride_v, dst_y, dst_stride_y, dst_uv, dst_stride_uv,
width, height, 1, 0, 10);
}
LIBYUV_API
int I012ToP012(const uint16_t* src_y,
int src_stride_y,
const uint16_t* src_u,
int src_stride_u,
const uint16_t* src_v,
int src_stride_v,
uint16_t* dst_y,
int dst_stride_y,
uint16_t* dst_uv,
int dst_stride_uv,
int width,
int height) {
return Ix1xToPx1x(src_y, src_stride_y, src_u, src_stride_u, src_v,
src_stride_v, dst_y, dst_stride_y, dst_uv, dst_stride_uv,
width, height, 1, 1, 12);
}
LIBYUV_API
int I212ToP212(const uint16_t* src_y,
int src_stride_y,
const uint16_t* src_u,
int src_stride_u,
const uint16_t* src_v,
int src_stride_v,
uint16_t* dst_y,
int dst_stride_y,
uint16_t* dst_uv,
int dst_stride_uv,
int width,
int height) {
return Ix1xToPx1x(src_y, src_stride_y, src_u, src_stride_u, src_v,
src_stride_v, dst_y, dst_stride_y, dst_uv, dst_stride_uv,
width, height, 1, 0, 12);
}
// 422 chroma is 1/2 width, 1x height
// 420 chroma is 1/2 width, 1/2 height
LIBYUV_API

View File

@ -111,6 +111,50 @@ int I420ToI010(const uint8_t* src_y,
return 0;
}
// Convert 8 bit YUV to 12 bit.
LIBYUV_API
int I420ToI012(const uint8_t* src_y,
int src_stride_y,
const uint8_t* src_u,
int src_stride_u,
const uint8_t* src_v,
int src_stride_v,
uint16_t* dst_y,
int dst_stride_y,
uint16_t* dst_u,
int dst_stride_u,
uint16_t* dst_v,
int dst_stride_v,
int width,
int height) {
int halfwidth = (width + 1) >> 1;
int halfheight = (height + 1) >> 1;
if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
halfheight = (height + 1) >> 1;
src_y = src_y + (height - 1) * src_stride_y;
src_u = src_u + (halfheight - 1) * src_stride_u;
src_v = src_v + (halfheight - 1) * src_stride_v;
src_stride_y = -src_stride_y;
src_stride_u = -src_stride_u;
src_stride_v = -src_stride_v;
}
// Convert Y plane.
Convert8To16Plane(src_y, src_stride_y, dst_y, dst_stride_y, 4096, width,
height);
// Convert UV planes.
Convert8To16Plane(src_u, src_stride_u, dst_u, dst_stride_u, 4096, halfwidth,
halfheight);
Convert8To16Plane(src_v, src_stride_v, dst_v, dst_stride_v, 4096, halfwidth,
halfheight);
return 0;
}
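The scale of 4096 follows the Convert8To16Plane convention; assuming the row kernel replicates the source byte before taking the high 16 bits of the product (as Convert8To16Row_C does), 8 bit 255 lands exactly on the 12 bit maximum. Sketch under that assumption:

// 255 -> ((255 * 0x0101) * 4096) >> 16 = (65535 * 4096) >> 16 = 4095.
static uint16_t Convert8To12Sample(uint8_t v) {
  return (uint16_t)(((uint32_t)v * 0x0101u * 4096u) >> 16);
}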
// 420 chroma is 1/2 width, 1/2 height
// 422 chroma is 1/2 width, 1x height
LIBYUV_API

View File

@ -550,6 +550,216 @@ void MergeUVPlane(const uint8_t* src_u,
}
}
// Support function for P010 etc UV channels.
// Width and height are plane sizes (typically half pixel width).
LIBYUV_API
void SplitUVPlane_16(const uint16_t* src_uv,
int src_stride_uv,
uint16_t* dst_u,
int dst_stride_u,
uint16_t* dst_v,
int dst_stride_v,
int width,
int height,
int depth) {
int y;
void (*SplitUVRow)(const uint16_t* src_uv, uint16_t* dst_u, uint16_t* dst_v,
int depth, int width) = SplitUVRow_16_C;
// Negative height means invert the image.
if (height < 0) {
height = -height;
dst_u = dst_u + (height - 1) * dst_stride_u;
dst_v = dst_v + (height - 1) * dst_stride_v;
dst_stride_u = -dst_stride_u;
dst_stride_v = -dst_stride_v;
}
// Coalesce rows.
if (src_stride_uv == width * 2 && dst_stride_u == width &&
dst_stride_v == width) {
width *= height;
height = 1;
src_stride_uv = dst_stride_u = dst_stride_v = 0;
}
#if defined(HAS_SPLITUVROW_16_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
SplitUVRow = SplitUVRow_16_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
SplitUVRow = SplitUVRow_16_AVX2;
}
}
#endif
#if defined(HAS_SPLITUVROW_16_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
SplitUVRow = SplitUVRow_16_Any_NEON;
if (IS_ALIGNED(width, 8)) {
SplitUVRow = SplitUVRow_16_NEON;
}
}
#endif
for (y = 0; y < height; ++y) {
// Copy a row of UV.
SplitUVRow(src_uv, dst_u, dst_v, depth, width);
dst_u += dst_stride_u;
dst_v += dst_stride_v;
src_uv += src_stride_uv;
}
}
LIBYUV_API
void MergeUVPlane_16(const uint16_t* src_u,
int src_stride_u,
const uint16_t* src_v,
int src_stride_v,
uint16_t* dst_uv,
int dst_stride_uv,
int width,
int height,
int depth) {
int y;
void (*MergeUVRow)(const uint16_t* src_u, const uint16_t* src_v,
uint16_t* dst_uv, int depth, int width) = MergeUVRow_16_C;
// Negative height means invert the image.
if (height < 0) {
height = -height;
dst_uv = dst_uv + (height - 1) * dst_stride_uv;
dst_stride_uv = -dst_stride_uv;
}
// Coalesce rows.
if (src_stride_u == width && src_stride_v == width &&
dst_stride_uv == width * 2) {
width *= height;
height = 1;
src_stride_u = src_stride_v = dst_stride_uv = 0;
}
#if defined(HAS_MERGEUVROW_16_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
MergeUVRow = MergeUVRow_16_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
MergeUVRow = MergeUVRow_16_AVX2;
}
}
#endif
#if defined(HAS_MERGEUVROW_16_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
MergeUVRow = MergeUVRow_16_Any_NEON;
if (IS_ALIGNED(width, 8)) {
MergeUVRow = MergeUVRow_16_NEON;
}
}
#endif
for (y = 0; y < height; ++y) {
// Merge a row of U and V into a row of UV.
MergeUVRow(src_u, src_v, dst_uv, depth, width);
src_u += src_stride_u;
src_v += src_stride_v;
dst_uv += dst_stride_uv;
}
}
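A round-trip sketch of the two plane helpers above for a P010-style UV plane at half width and half height (hypothetical buffers; strides are in uint16_t elements):

#include <vector>
#include "libyuv/planar_functions.h"

void RoundTripUV10(const uint16_t* uv, int uv_stride, int half_w, int half_h) {
  std::vector<uint16_t> u(half_w * half_h), v(half_w * half_h);
  std::vector<uint16_t> repacked(half_w * 2 * half_h);
  // msb interleaved UV -> lsb planar U and V (values 0..1023).
  libyuv::SplitUVPlane_16(uv, uv_stride, u.data(), half_w, v.data(), half_w,
                          half_w, half_h, 10);
  // lsb planar U and V -> msb interleaved UV again.
  libyuv::MergeUVPlane_16(u.data(), half_w, v.data(), half_w, repacked.data(),
                          half_w * 2, half_w, half_h, 10);
}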
// Convert plane from lsb to msb
LIBYUV_API
void ConvertToMSBPlane_16(const uint16_t* src_y,
int src_stride_y,
uint16_t* dst_y,
int dst_stride_y,
int width,
int height,
int depth) {
int y;
int scale = 1 << (16 - depth);
void (*MultiplyRow)(const uint16_t* src_y, uint16_t* dst_y, int scale,
int width) = MultiplyRow_16_C;
// Negative height means invert the image.
if (height < 0) {
height = -height;
dst_y = dst_y + (height - 1) * dst_stride_y;
dst_stride_y = -dst_stride_y;
}
// Coalesce rows.
if (src_stride_y == width && dst_stride_y == width) {
width *= height;
height = 1;
src_stride_y = dst_stride_y = 0;
}
#if defined(HAS_MULTIPLYROW_16_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
MultiplyRow = MultiplyRow_16_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
MultiplyRow = MultiplyRow_16_AVX2;
}
}
#endif
#if defined(HAS_MULTIPLYROW_16_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
MultiplyRow = MultiplyRow_16_Any_NEON;
if (IS_ALIGNED(width, 16)) {
MultiplyRow = MultiplyRow_16_NEON;
}
}
#endif
for (y = 0; y < height; ++y) {
MultiplyRow(src_y, dst_y, scale, width);
src_y += src_stride_y;
dst_y += dst_stride_y;
}
}
// Convert plane from msb to lsb
LIBYUV_API
void ConvertToLSBPlane_16(const uint16_t* src_y,
int src_stride_y,
uint16_t* dst_y,
int dst_stride_y,
int width,
int height,
int depth) {
int y;
int scale = 1 << depth;
void (*DivideRow)(const uint16_t* src_y, uint16_t* dst_y, int scale,
int width) = DivideRow_16_C;
// Negative height means invert the image.
if (height < 0) {
height = -height;
dst_y = dst_y + (height - 1) * dst_stride_y;
dst_stride_y = -dst_stride_y;
}
// Coalesce rows.
if (src_stride_y == width && dst_stride_y == width) {
width *= height;
height = 1;
src_stride_y = dst_stride_y = 0;
}
#if defined(HAS_DIVIDEROW_16_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
DivideRow = DivideRow_16_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
DivideRow = DivideRow_16_AVX2;
}
}
#endif
#if defined(HAS_DIVIDEROW_16_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
DivideRow = DivideRow_16_Any_NEON;
if (IS_ALIGNED(width, 16)) {
DivideRow = DivideRow_16_NEON;
}
}
#endif
for (y = 0; y < height; ++y) {
DivideRow(src_y, dst_y, scale, width);
src_y += src_stride_y;
dst_y += dst_stride_y;
}
}
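Per sample the two converters above are inverses; with the scale choices shown (1 << (16 - depth) to pack msb, 1 << depth then keep the high word to unpack) they reduce to shifts. Scalar sketch:

static uint16_t ToMsb16(uint16_t v, int depth) {  // ConvertToMSBPlane_16
  return (uint16_t)(v << (16 - depth));  // == v * (1 << (16 - depth))
}
static uint16_t ToLsb16(uint16_t v, int depth) {  // ConvertToLSBPlane_16
  return (uint16_t)(((uint32_t)v << depth) >> 16);  // == v >> (16 - depth)
}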
// Swap U and V channels in interleaved UV plane.
LIBYUV_API
void SwapUVPlane(const uint8_t* src_uv,

View File

@ -546,6 +546,32 @@ ANY21C(NV12ToRGB565Row_Any_MMI, NV12ToRGB565Row_MMI, 1, 1, 2, 2, 7)
#endif
#undef ANY21C
// Any 2 16 bit planes with parameter to 1
#define ANY21PT(NAMEANY, ANY_SIMD, T, BPP, MASK) \
void NAMEANY(const T* src_u, const T* src_v, T* dst_uv, int depth, \
int width) { \
SIMD_ALIGNED(T temp[16 * 4]); \
memset(temp, 0, 16 * 4 * BPP); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(src_u, src_v, dst_uv, depth, n); \
} \
memcpy(temp, src_u + n, r * BPP); \
memcpy(temp + 16, src_v + n, r * BPP); \
ANY_SIMD(temp, temp + 16, temp + 32, depth, MASK + 1); \
memcpy(dst_uv + n * 2, temp + 32, r * BPP * 2); \
}
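Hand-expanded for one instantiation, the wrapper runs the SIMD kernel over the multiple-of-(MASK + 1) body, then once more over a zero-padded stack buffer for the remainder. Rough expansion of the NEON case (MASK = 7, BPP = 2; sketch only):

void MergeUVRow_16_Any_NEON_sketch(const uint16_t* src_u,
                                   const uint16_t* src_v, uint16_t* dst_uv,
                                   int depth, int width) {
  SIMD_ALIGNED(uint16_t temp[16 * 4]);
  memset(temp, 0, 16 * 4 * 2); /* for msan */
  int r = width & 7;   // leftover pixels
  int n = width & ~7;  // multiple-of-8 body
  if (n > 0) {
    MergeUVRow_16_NEON(src_u, src_v, dst_uv, depth, n);
  }
  memcpy(temp, src_u + n, r * 2);
  memcpy(temp + 16, src_v + n, r * 2);
  MergeUVRow_16_NEON(temp, temp + 16, temp + 32, depth, 8);
  memcpy(dst_uv + n * 2, temp + 32, r * 2 * 2);
}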
#ifdef HAS_MERGEUVROW_16_AVX2
ANY21PT(MergeUVRow_16_Any_AVX2, MergeUVRow_16_AVX2, uint16_t, 2, 15)
#endif
#ifdef HAS_MERGEUVROW_16_NEON
ANY21PT(MergeUVRow_16_Any_NEON, MergeUVRow_16_NEON, uint16_t, 2, 7)
#endif
#undef ANY21PT
// Any 1 to 1.
#define ANY11(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \
void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \
@ -1126,6 +1152,30 @@ ANY11C(Convert8To16Row_Any_AVX2,
uint16_t,
31)
#endif
#ifdef HAS_MULTIPLYROW_16_AVX2
ANY11C(MultiplyRow_16_Any_AVX2,
MultiplyRow_16_AVX2,
2,
2,
uint16_t,
uint16_t,
31)
#endif
#ifdef HAS_MULTIPLYROW_16_NEON
ANY11C(MultiplyRow_16_Any_NEON,
MultiplyRow_16_NEON,
2,
2,
uint16_t,
uint16_t,
15)
#endif
#ifdef HAS_DIVIDEROW_16_AVX2
ANY11C(DivideRow_16_Any_AVX2, DivideRow_16_AVX2, 2, 2, uint16_t, uint16_t, 31)
#endif
#ifdef HAS_DIVIDEROW_16_NEON
ANY11C(DivideRow_16_Any_NEON, DivideRow_16_NEON, 2, 2, uint16_t, uint16_t, 15)
#endif
#undef ANY11C
// Any 1 to 1 with parameter and shorts to byte. BPP measures in shorts.
@ -1405,6 +1455,32 @@ ANY12(YUY2ToUV422Row_Any_MMI, YUY2ToUV422Row_MMI, 1, 4, 1, 15)
#endif
#undef ANY12
// Any 1 16 bit plane with parameter to 2 planes
#define ANY12PT(NAMEANY, ANY_SIMD, T, BPP, MASK) \
void NAMEANY(const T* src_uv, T* dst_u, T* dst_v, int depth, int width) { \
SIMD_ALIGNED(T temp[16 * 4]); \
memset(temp, 0, 16 * 4 * BPP); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(src_uv, dst_u, dst_v, depth, n); \
} \
memcpy(temp, src_uv + n * 2, r * BPP * 2); \
ANY_SIMD(temp, temp + 32, temp + 48, depth, MASK + 1); \
memcpy(dst_u + n, temp + 32, r * BPP); \
memcpy(dst_v + n, temp + 48, r * BPP); \
}
#ifdef HAS_SPLITUVROW_16_AVX2
ANY12PT(SplitUVRow_16_Any_AVX2, SplitUVRow_16_AVX2, uint16_t, 2, 15)
#endif
#ifdef HAS_SPLITUVROW_16_NEON
ANY12PT(SplitUVRow_16_Any_NEON, SplitUVRow_16_NEON, uint16_t, 2, 7)
#endif
#undef ANY12PT
// Any 1 to 3. Outputs RGB planes.
#define ANY13(NAMEANY, ANY_SIMD, BPP, MASK) \
void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, \

View File

@ -2521,27 +2521,33 @@ void MergeXRGBRow_C(const uint8_t* src_r,
}
}
// Convert lsb formats to msb, depending on sample depth.
void MergeUVRow_16_C(const uint16_t* src_u,
const uint16_t* src_v,
uint16_t* dst_uv,
int scale,
int depth,
int width) {
int shift = 16 - depth;
int x;
for (x = 0; x < width; ++x) {
dst_uv[0] = src_u[x] << shift;
dst_uv[1] = src_v[x] << shift;
dst_uv += 2;
}
}
// Convert msb formats to lsb, depending on sample depth.
void SplitUVRow_16_C(const uint16_t* src_uv,
uint16_t* dst_u,
uint16_t* dst_v,
int depth,
int width) {
int shift = 16 - depth;
int x;
for (x = 0; x < width; ++x) {
dst_u[x] = src_uv[0] >> shift;
dst_v[x] = src_uv[1] >> shift;
src_uv += 2;
}
}
@ -2555,6 +2561,16 @@ void MultiplyRow_16_C(const uint16_t* src_y,
}
}
void DivideRow_16_C(const uint16_t* src_y,
uint16_t* dst_y,
int scale,
int width) {
int x;
for (x = 0; x < width; ++x) {
dst_y[x] = (src_y[x] * scale) >> 16;
}
}
// Use scale to convert lsb formats to msb, depending how many bits there are:
// 32768 = 9 bits
// 16384 = 10 bits

View File

@ -3653,22 +3653,18 @@ void MergeUVRow_SSE2(const uint8_t* src_u,
}
#endif // HAS_MERGEUVROW_SSE2
#ifdef HAS_MERGEUVROW_16_AVX2
void MergeUVRow_16_AVX2(const uint16_t* src_u,
const uint16_t* src_v,
uint16_t* dst_uv,
int scale,
int depth,
int width) {
depth = 16 - depth;
// clang-format off
asm volatile (
"vmovd %4,%%xmm3 \n"
"vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
"vbroadcastss %%xmm3,%%ymm3 \n"
"vbroadcastss %%xmm3,%%xmm3 \n"
"sub %0,%1 \n"
// 16 pixels per loop.
@ -3678,8 +3674,8 @@ void MergeUVRow_16_AVX2(const uint16_t* src_u,
"vmovdqu (%0,%1,1),%%ymm1 \n"
"add $0x20,%0 \n"
"vpmullw %%ymm3,%%ymm0,%%ymm0 \n"
"vpmullw %%ymm3,%%ymm1,%%ymm1 \n"
"vpsllw %%xmm3,%%ymm0,%%ymm0 \n"
"vpsllw %%xmm3,%%ymm1,%%ymm1 \n"
"vpunpcklwd %%ymm1,%%ymm0,%%ymm2 \n" // mutates
"vpunpckhwd %%ymm1,%%ymm0,%%ymm0 \n"
"vextractf128 $0x0,%%ymm2,(%2) \n"
@ -3694,12 +3690,62 @@ void MergeUVRow_16_AVX2(const uint16_t* src_u,
"+r"(src_v), // %1
"+r"(dst_uv), // %2
"+r"(width) // %3
: "r"(scale) // %4
: "r"(depth) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
// clang-format on
}
#endif // HAS_MERGEUVROW_16_AVX2
#ifdef HAS_SPLITUVROW_16_AVX2
const uvec8 kSplitUVShuffle16 = {0, 1, 4, 5, 8, 9, 12, 13,
2, 3, 6, 7, 10, 11, 14, 15};
void SplitUVRow_16_AVX2(const uint16_t* src_uv,
uint16_t* dst_u,
uint16_t* dst_v,
int depth,
int width) {
depth = 16 - depth;
// clang-format off
asm volatile (
"vmovd %4,%%xmm3 \n"
"vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
"vbroadcastss %%xmm3,%%xmm3 \n"
"vbroadcastf128 %5,%%ymm4 \n"
"sub %1,%2 \n"
// 16 pixels per loop.
LABELALIGN
"1: \n"
"vmovdqu (%0),%%ymm0 \n"
"vmovdqu 0x20(%0),%%ymm1 \n"
"add $0x40,%0 \n"
"vpsrlw %%xmm3,%%ymm0,%%ymm0 \n"
"vpsrlw %%xmm3,%%ymm1,%%ymm1 \n"
"vpshufb %%ymm4,%%ymm0,%%ymm0 \n"
"vpshufb %%ymm4,%%ymm1,%%ymm1 \n"
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vpermq $0xd8,%%ymm1,%%ymm1 \n"
"vextractf128 $0x0,%%ymm0,(%1) \n"
"vextractf128 $0x0,%%ymm1,0x10(%1) \n"
"vextractf128 $0x1,%%ymm0,(%1,%2) \n"
"vextractf128 $0x1,%%ymm1,0x10(%1,%2) \n"
"add $0x20,%1 \n"
"sub $0x10,%3 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(width), // %3
"+r"(depth) // %4
:
"m"(kSplitUVShuffle16) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
// clang-format on
}
#endif // HAS_SPLITUVROW_16_AVX2
// Use scale to convert lsb formats to msb, depending how many bits there are:
// 128 = 9 bits
// 64 = 10 bits
@ -3717,7 +3763,7 @@ void MultiplyRow_16_AVX2(const uint16_t* src_y,
"vbroadcastss %%xmm3,%%ymm3 \n"
"sub %0,%1 \n"
// 32 pixels per loop.
LABELALIGN
"1: \n"
"vmovdqu (%0),%%ymm0 \n"
@ -3739,6 +3785,46 @@ void MultiplyRow_16_AVX2(const uint16_t* src_y,
}
#endif // HAS_MULTIPLYROW_16_AVX2
// Use scale to convert msb formats to lsb, depending how many bits there are:
// 512 = 9 bits
// 1024 = 10 bits
// 4096 = 12 bits
// 65536 = 16 bits
#ifdef HAS_DIVIDEROW_16_AVX2
void DivideRow_16_AVX2(const uint16_t* src_y,
uint16_t* dst_y,
int scale,
int width) {
// clang-format off
asm volatile (
"vmovd %3,%%xmm3 \n"
"vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
"vbroadcastss %%xmm3,%%ymm3 \n"
"sub %0,%1 \n"
// 32 pixels per loop.
LABELALIGN
"1: \n"
"vmovdqu (%0),%%ymm0 \n"
"vmovdqu 0x20(%0),%%ymm1 \n"
"vpmulhuw %%ymm3,%%ymm0,%%ymm0 \n"
"vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
"vmovdqu %%ymm0,(%0,%1) \n"
"vmovdqu %%ymm1,0x20(%0,%1) \n"
"add $0x40,%0 \n"
"sub $0x20,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_y), // %0
"+r"(dst_y), // %1
"+r"(width), // %2
"+r"(scale) // %3
:
: "memory", "cc", "xmm0", "xmm1", "xmm3");
// clang-format on
}
#endif // HAS_DIVIDEROW_16_AVX2
// Use scale to convert lsb formats to msb, depending how many bits there are:
// 32768 = 9 bits
// 16384 = 10 bits

View File

@ -3166,6 +3166,121 @@ void HalfMergeUVRow_NEON(const uint8_t* src_u,
: "cc", "memory", "q0", "q1", "q2", "q3");
}
void SplitUVRow_16_NEON(const uint16_t* src_uv,
uint16_t* dst_u,
uint16_t* dst_v,
int depth,
int width) {
int shift = depth - 16;  // Negative for right shift.
asm volatile(
"vdup.32 q0, %3 \n"
"1: \n"
"vld2.16 {q1, q2}, [%0]! \n" // load 8 UV
"vmovl.u16 q3, d2 \n"
"vmovl.u16 q4, d3 \n"
"vshl.u32 q3, q3, q0 \n"
"vshl.u32 q4, q4, q0 \n"
"vmovn.u32 d2, q3 \n"
"vmovn.u32 d3, q4 \n"
"vmovl.u16 q3, d4 \n"
"vmovl.u16 q4, d5 \n"
"vshl.u32 q3, q3, q0 \n"
"vshl.u32 q4, q4, q0 \n"
"vmovn.u32 d4, q3 \n"
"vmovn.u32 d5, q4 \n"
"subs %4, %4, #8 \n" // 8 src pixels per loop
"vst1.16 {q1}, [%1]! \n" // store 8 U pixels
"vst1.16 {q2}, [%2]! \n" // store 8 V pixels
"bgt 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(depth), // %3
"+r"(width) // %4
:
: "cc", "memory", "q0", "q1", "q2", "q3", "q4");
}
void MergeUVRow_16_NEON(const uint16_t* src_u,
const uint16_t* src_v,
uint16_t* dst_uv,
int depth,
int width) {
int shift = 16 - depth;
asm volatile(
"vdup.16 q2, %3 \n"
"1: \n"
"vld1.16 {q0}, [%0]! \n" // load 8 U
"vld1.16 {q1}, [%1]! \n" // load 8 V
"vshl.u16 q0, q0, q2 \n"
"vshl.u16 q1, q1, q2 \n"
"subs %4, %4, #8 \n" // 8 src pixels per loop
"vst2.16 {q0, q1}, [%2]! \n" // store 8 UV pixels
"bgt 1b \n"
: "+r"(src_u), // %0
"+r"(src_v), // %1
"+r"(dst_uv), // %2
"+r"(shift), // %3
"+r"(width) // %4
:
: "cc", "memory", "q0", "q1", "q2");
}
void MultiplyRow_16_NEON(const uint16_t* src_y,
uint16_t* dst_y,
int scale,
int width) {
asm volatile(
"vdup.16 q2, %2 \n"
"1: \n"
"vld1.16 {q0}, [%0]! \n"
"vld1.16 {q1}, [%0]! \n"
"vmul.u16 q0, q0, q2 \n"
"vmul.u16 q1, q1, q2 \n"
"vst1.16 {q0}, [%1]! \n"
"vst1.16 {q1}, [%1]! \n"
"subs %3, %3, #16 \n" // 16 src pixels per loop
"bgt 1b \n"
: "+r"(src_y), // %0
"+r"(dst_y), // %1
"+r"(scale), // %2
"+r"(width) // %3
:
: "cc", "memory", "q0", "q1", "q2");
}
void DivideRow_16_NEON(const uint16_t* src_y,
uint16_t* dst_y,
int scale,
int width) {
asm volatile(
"vdup.16 q0, %2 \n"
"1: \n"
"vld1.16 {q1}, [%0]! \n"
"vld1.16 {q2}, [%0]! \n"
"vmovl.u16 q3, d2 \n"
"vmovl.u16 q1, d3 \n"
"vmovl.u16 q4, d4 \n"
"vmovl.u16 q2, d5 \n"
"vshl.u32 q3, q3, q0 \n"
"vshl.u32 q4, q4, q0 \n"
"vshl.u32 q1, q1, q0 \n"
"vshl.u32 q2, q2, q0 \n"
"vmovn.u32 d2, q3 \n"
"vmovn.u32 d3, q1 \n"
"vmovn.u32 d4, q4 \n"
"vmovn.u32 d5, q2 \n"
"vst1.16 {q1}, [%1]! \n"
"vst1.16 {q2}, [%1]! \n"
"subs %3, %3, #16 \n" // 16 src pixels per loop
"bgt 1b \n"
: "+r"(src_y), // %0
"+r"(dst_y), // %1
"+r"(scale), // %2
"+r"(width) // %3
:
: "cc", "memory", "q0", "q1", "q2", "q3", "q4");
}
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)..
#ifdef __cplusplus

View File

@ -3526,6 +3526,126 @@ void HalfMergeUVRow_NEON(const uint8_t* src_u,
: "cc", "memory", "v0", "v1", "v2", "v3");
}
void SplitUVRow_16_NEON(const uint16_t* src_uv,
uint16_t* dst_u,
uint16_t* dst_v,
int depth,
int width) {
int shift = depth - 16;  // Negative for right shift.
asm volatile(
"dup v0.4s, %w3 \n"
"1: \n"
"ld2 {v1.8h, v2.8h}, [%0], #32 \n" // load 8 UV
"prfm pldl1keep, [%0, 448] \n"
"ushll v3.4s, v1.4h, #0 \n"
"ushll2 v4.4s, v1.8h, #0 \n"
"ushl v3.4s, v3.4s, v0.4s \n"
"ushl v4.4s, v4.4s, v0.4s \n"
"xtn v1.4h, v3.4s \n"
"xtn2 v1.8h, v4.4s \n"
"ushll v3.4s, v2.4h, #0 \n"
"ushll2 v4.4s, v2.8h, #0 \n"
"ushl v3.4s, v3.4s, v0.4s \n"
"ushl v4.4s, v4.4s, v0.4s \n"
"xtn v2.4h, v3.4s \n"
"xtn2 v2.8h, v4.4s \n"
"subs %w4, %w4, #8 \n" // 8 src pixels per loop
"st1 {v1.8h}, [%1], #16 \n" // store 8 U pixels
"st1 {v2.8h}, [%2], #16 \n" // store 8 V pixels
"b.gt 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(depth), // %3
"+r"(width) // %4
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4");
}
void MergeUVRow_16_NEON(const uint16_t* src_u,
const uint16_t* src_v,
uint16_t* dst_uv,
int depth,
int width) {
int shift = 16 - depth;
asm volatile(
"dup v2.8h, %w3 \n"
"1: \n"
"ld1 {v0.8h}, [%0], #16 \n" // load 8 U
"prfm pldl1keep, [%0, 448] \n"
"ld1 {v1.8h}, [%1], #16 \n" // load 8 V
"prfm pldl1keep, [%1, 448] \n"
"ushl v0.8h, v0.8h, v2.8h \n"
"ushl v1.8h, v1.8h, v2.8h \n"
"subs %w4, %w4, #8 \n" // 8 src pixels per loop
"st2 {v0.8h, v1.8h}, [%2], #32 \n" // store 8 UV pixels
"b.gt 1b \n"
: "+r"(src_u), // %0
"+r"(src_v), // %1
"+r"(dst_uv), // %2
"+r"(shift), // %3
"+r"(width) // %4
:
: "cc", "memory", "v0", "v1", "v2");
}
void MultiplyRow_16_NEON(const uint16_t* src_y,
uint16_t* dst_y,
int scale,
int width) {
asm volatile(
"dup v2.8h, %w2 \n"
"1: \n"
"ldp q0, q1, [%0] \n"
"add %0, %0, #32 \n"
"prfm pldl1keep, [%0, 448] \n"
"mul v0.8h, v0.8h, v2.8h \n"
"mul v1.8h, v1.8h, v2.8h \n"
"stp q0, q1, [%1] \n" // store 16 pixels
"add %1, %1, #32 \n"
"subs %w3, %w3, #16 \n" // 16 src pixels per loop
"b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(dst_y), // %1
"+r"(scale), // %2
"+r"(width) // %3
:
: "cc", "memory", "v0", "v1", "v2");
}
void DivideRow_16_NEON(const uint16_t* src_y,
uint16_t* dst_y,
int scale,
int width) {
asm volatile(
"dup v0.8h, %w2 \n"
"1: \n"
"ldp q1, q2, [%0] \n"
"add %0, %0, #32 \n"
"prfm pldl1keep, [%0, 448] \n"
"ushll v3.4s, v1.4h, #0 \n"
"ushll v4.4s, v2.4h, #0 \n"
"ushll2 v1.4s, v1.8h, #0 \n"
"ushll2 v2.4s, v2.8h, #0 \n"
"mul v3.4s, v0.4s, v3.4s \n"
"mul v4.4s, v0.4s, v4.4s \n"
"mul v1.4s, v0.4s, v1.4s \n"
"mul v2.4s, v0.4s, v2.4s \n"
"shrn v3.4h, v3.4s, #16 \n"
"shrn v4.4h, v4.4s, #16 \n"
"shrn2 v3.8h, v1.4s, #16 \n"
"shrn2 v4.8h, v2.4s, #16 \n"
"stp q3, q3, [%1] \n" // store 16 pixels
"add %1, %1, #32 \n"
"subs %w3, %w3, #16 \n" // 16 src pixels per loop
"b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(dst_y), // %1
"+r"(scale), // %2
"+r"(width) // %3
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4");
}
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#ifdef __cplusplus

View File

@ -158,15 +158,26 @@ TESTPLANARTOP(I422, uint8_t, 1, 2, 1, I422, uint8_t, 1, 2, 1, 8)
TESTPLANARTOP(I422, uint8_t, 1, 2, 1, I444, uint8_t, 1, 1, 1, 8)
TESTPLANARTOP(I444, uint8_t, 1, 1, 1, I444, uint8_t, 1, 1, 1, 8)
TESTPLANARTOP(I010, uint16_t, 2, 2, 2, I010, uint16_t, 2, 2, 2, 10)
TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I010, uint16_t, 2, 2, 2, 8)
TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I012, uint16_t, 2, 2, 2, 8)
TESTPLANARTOP(H010, uint16_t, 2, 2, 2, H010, uint16_t, 2, 2, 2, 10)
TESTPLANARTOP(H010, uint16_t, 2, 2, 2, H420, uint8_t, 1, 2, 2, 10)
TESTPLANARTOP(H420, uint8_t, 1, 2, 2, H010, uint16_t, 2, 2, 2, 8)
TESTPLANARTOP(H420, uint8_t, 1, 2, 2, H012, uint16_t, 2, 2, 2, 8)
TESTPLANARTOP(I010, uint16_t, 2, 2, 2, I410, uint16_t, 2, 1, 1, 10)
TESTPLANARTOP(I210, uint16_t, 2, 2, 1, I410, uint16_t, 2, 1, 1, 10)
TESTPLANARTOP(I012, uint16_t, 2, 2, 2, I412, uint16_t, 2, 1, 1, 12)
TESTPLANARTOP(I212, uint16_t, 2, 2, 1, I412, uint16_t, 2, 1, 1, 12)
TESTPLANARTOP(I410, uint16_t, 2, 1, 1, I010, uint16_t, 2, 2, 2, 10)
TESTPLANARTOP(I210, uint16_t, 2, 2, 1, I010, uint16_t, 2, 2, 2, 10)
TESTPLANARTOP(I412, uint16_t, 2, 1, 1, I012, uint16_t, 2, 2, 2, 12)
TESTPLANARTOP(I212, uint16_t, 2, 2, 1, I012, uint16_t, 2, 2, 2, 12)
TESTPLANARTOP(I010, uint16_t, 2, 2, 2, I420, uint8_t, 1, 2, 2, 10)
TESTPLANARTOP(I210, uint16_t, 2, 2, 1, I422, uint8_t, 1, 2, 1, 10)
TESTPLANARTOP(I410, uint16_t, 2, 1, 1, I444, uint8_t, 1, 1, 1, 10)
TESTPLANARTOP(I012, uint16_t, 2, 2, 2, I420, uint8_t, 1, 2, 2, 12)
TESTPLANARTOP(I212, uint16_t, 2, 2, 1, I422, uint8_t, 1, 2, 1, 12)
TESTPLANARTOP(I412, uint16_t, 2, 1, 1, I444, uint8_t, 1, 1, 1, 12)
// Test Android 420 to I420
#define TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, \
@ -292,63 +303,74 @@ int I400ToNV21(const uint8_t* src_y,
dst_stride_vu, width, height);
}
#define TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \
DST_SUBSAMP_X, DST_SUBSAMP_Y, W1280, N, NEG, OFF, \
SRC_DEPTH) \
TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
static_assert(SRC_BPC == 1 || SRC_BPC == 2, "SRC BPC unsupported"); \
static_assert(DST_BPC == 1 || DST_BPC == 2, "DST BPC unsupported"); \
static_assert(SRC_SUBSAMP_X == 1 || SRC_SUBSAMP_X == 2, \
"SRC_SUBSAMP_X unsupported"); \
static_assert(SRC_SUBSAMP_Y == 1 || SRC_SUBSAMP_Y == 2, \
"SRC_SUBSAMP_Y unsupported"); \
static_assert(DST_SUBSAMP_X == 1 || DST_SUBSAMP_X == 2, \
"DST_SUBSAMP_X unsupported"); \
static_assert(DST_SUBSAMP_Y == 1 || DST_SUBSAMP_Y == 2, \
"DST_SUBSAMP_Y unsupported"); \
const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
const int kHeight = benchmark_height_; \
const int kSrcHalfWidth = SUBSAMPLE(kWidth, SRC_SUBSAMP_X); \
const int kSrcHalfHeight = SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); \
const int kDstHalfWidth = SUBSAMPLE(kWidth, DST_SUBSAMP_X); \
const int kDstHalfHeight = SUBSAMPLE(kHeight, DST_SUBSAMP_Y); \
align_buffer_page_end(src_y, kWidth* kHeight* SRC_BPC + OFF); \
align_buffer_page_end(src_u, \
kSrcHalfWidth* kSrcHalfHeight* SRC_BPC + OFF); \
align_buffer_page_end(src_v, \
kSrcHalfWidth* kSrcHalfHeight* SRC_BPC + OFF); \
align_buffer_page_end(dst_y_c, kWidth* kHeight* DST_BPC); \
align_buffer_page_end(dst_uv_c, \
kDstHalfWidth* kDstHalfHeight* DST_BPC * 2); \
align_buffer_page_end(dst_y_opt, kWidth* kHeight* DST_BPC); \
align_buffer_page_end(dst_uv_opt, \
kDstHalfWidth* kDstHalfHeight* DST_BPC * 2); \
MemRandomize(src_y + OFF, kWidth * kHeight * SRC_BPC); \
MemRandomize(src_u + OFF, kSrcHalfWidth * kSrcHalfHeight * SRC_BPC); \
MemRandomize(src_v + OFF, kSrcHalfWidth * kSrcHalfHeight * SRC_BPC); \
SRC_T* src_y_p = reinterpret_cast<SRC_T*>(src_y + OFF); \
SRC_T* src_u_p = reinterpret_cast<SRC_T*>(src_u + OFF); \
SRC_T* src_v_p = reinterpret_cast<SRC_T*>(src_v + OFF); \
for (int i = 0; i < kWidth * kHeight; ++i) { \
src_y_p[i] = src_y_p[i] & ((1 << SRC_DEPTH) - 1); \
} \
for (int i = 0; i < kSrcHalfWidth * kSrcHalfHeight; ++i) { \
src_u_p[i] = src_u_p[i] & ((1 << SRC_DEPTH) - 1); \
src_v_p[i] = src_v_p[i] & ((1 << SRC_DEPTH) - 1); \
} \
memset(dst_y_c, 1, kWidth* kHeight* DST_BPC); \
memset(dst_uv_c, 2, kDstHalfWidth* kDstHalfHeight* DST_BPC * 2); \
memset(dst_y_opt, 101, kWidth* kHeight* DST_BPC); \
memset(dst_uv_opt, 102, kDstHalfWidth* kDstHalfHeight* DST_BPC * 2); \
MaskCpuFlags(disable_cpu_flags_); \
SRC_FMT_PLANAR##To##FMT_PLANAR(src_y_p, kWidth, src_u_p, kSrcHalfWidth, \
src_v_p, kSrcHalfWidth, \
reinterpret_cast<DST_T*>(dst_y_c), kWidth, \
reinterpret_cast<DST_T*>(dst_uv_c), \
kDstHalfWidth * 2, kWidth, NEG kHeight); \
MaskCpuFlags(benchmark_cpu_info_); \
for (int i = 0; i < benchmark_iterations_; ++i) { \
SRC_FMT_PLANAR##To##FMT_PLANAR( \
src_y_p, kWidth, src_u_p, kSrcHalfWidth, src_v_p, kSrcHalfWidth, \
reinterpret_cast<DST_T*>(dst_y_opt), kWidth, \
reinterpret_cast<DST_T*>(dst_uv_opt), kDstHalfWidth * 2, kWidth, \
NEG kHeight); \
} \
for (int i = 0; i < kHeight * kWidth * DST_BPC; ++i) { \
EXPECT_EQ(dst_y_c[i], dst_y_opt[i]); \
} \
for (int i = 0; i < kDstHalfWidth * kDstHalfHeight * DST_BPC * 2; ++i) { \
EXPECT_EQ(dst_uv_c[i], dst_uv_opt[i]); \
} \
free_aligned_buffer_page_end(dst_y_c); \
free_aligned_buffer_page_end(dst_uv_c); \
@ -359,23 +381,33 @@ int I400ToNV21(const uint8_t* src_y,
free_aligned_buffer_page_end(src_v); \
}
#define TESTPLANARTOBP(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \
DST_SUBSAMP_X, DST_SUBSAMP_Y, SRC_DEPTH) \
TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
DST_SUBSAMP_Y, benchmark_width_ - 4, _Any, +, 0, SRC_DEPTH) \
TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
DST_SUBSAMP_Y, benchmark_width_, _Unaligned, +, 1, \
SRC_DEPTH) \
TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
DST_SUBSAMP_Y, benchmark_width_, _Invert, -, 0, SRC_DEPTH) \
TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
DST_SUBSAMP_Y, benchmark_width_, _Opt, +, 0, SRC_DEPTH)
TESTPLANARTOBP(I420, uint8_t, 1, 2, 2, NV12, uint8_t, 1, 2, 2, 8)
TESTPLANARTOBP(I420, uint8_t, 1, 2, 2, NV21, uint8_t, 1, 2, 2, 8)
TESTPLANARTOBP(I422, uint8_t, 1, 2, 1, NV21, uint8_t, 1, 2, 2, 8)
TESTPLANARTOBP(I444, uint8_t, 1, 1, 1, NV12, uint8_t, 1, 2, 2, 8)
TESTPLANARTOBP(I444, uint8_t, 1, 1, 1, NV21, uint8_t, 1, 2, 2, 8)
TESTPLANARTOBP(I400, uint8_t, 1, 2, 2, NV21, uint8_t, 1, 2, 2, 8)
TESTPLANARTOBP(I010, uint16_t, 2, 2, 2, P010, uint16_t, 2, 2, 2, 10)
TESTPLANARTOBP(I210, uint16_t, 2, 2, 1, P210, uint16_t, 2, 2, 1, 10)
TESTPLANARTOBP(I012, uint16_t, 2, 2, 2, P012, uint16_t, 2, 2, 2, 12)
TESTPLANARTOBP(I212, uint16_t, 2, 2, 1, P212, uint16_t, 2, 2, 1, 12)
#define TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \
@ -385,13 +417,13 @@ TESTPLANARTOBP(I400, 2, 2, NV21, 2, 2)
static_assert(SRC_BPC == 1 || SRC_BPC == 2, "SRC BPC unsupported"); \
static_assert(DST_BPC == 1 || DST_BPC == 2, "DST BPC unsupported"); \
static_assert(SRC_SUBSAMP_X == 1 || SRC_SUBSAMP_X == 2, \
"SRC_SUBSAMP_X unsupported"); \
static_assert(SRC_SUBSAMP_Y == 1 || SRC_SUBSAMP_Y == 2, \
"SRC_SUBSAMP_Y unsupported"); \
static_assert(DST_SUBSAMP_X == 1 || DST_SUBSAMP_X == 2, \
"DST_SUBSAMP_X unsupported"); \
static_assert(DST_SUBSAMP_Y == 1 || DST_SUBSAMP_Y == 2, \
"DST_SUBSAMP_Y unsupported"); \
const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
const int kHeight = benchmark_height_; \
const int kSrcHalfWidth = SUBSAMPLE(kWidth, SRC_SUBSAMP_X); \
@ -407,15 +439,15 @@ TESTPLANARTOBP(I400, 2, 2, NV21, 2, 2)
align_buffer_page_end(dst_y_opt, kWidth* kHeight* DST_BPC); \
align_buffer_page_end(dst_uv_opt, \
2 * kDstHalfWidth * kDstHalfHeight * DST_BPC); \
SRC_T* src_y_p = reinterpret_cast<SRC_T*>(src_y + OFF); \
SRC_T* src_uv_p = reinterpret_cast<SRC_T*>(src_uv + OFF); \
for (int i = 0; i < kWidth * kHeight; ++i) { \
src_y_p[i] = \
(fastrand() & (((SRC_T)(-1)) << ((8 * SRC_BPC) - SRC_DEPTH))); \
} \
for (int i = 0; i < kSrcHalfWidth * kSrcHalfHeight * 2; ++i) { \
src_uv_p[i] = \
(fastrand() & (((SRC_T)(-1)) << ((8 * SRC_BPC) - SRC_DEPTH))); \
} \
memset(dst_y_c, 1, kWidth* kHeight* DST_BPC); \
memset(dst_uv_c, 2, 2 * kDstHalfWidth * kDstHalfHeight * DST_BPC); \
@ -483,112 +515,111 @@ TESTBIPLANARTOBP(NV21, uint8_t, 1, 2, 2, NV12, uint8_t, 1, 2, 2, 8)
TESTBIPLANARTOBP(NV12, uint8_t, 1, 2, 2, NV12Mirror, uint8_t, 1, 2, 2, 8)
TESTBIPLANARTOBP(NV12, uint8_t, 1, 2, 2, NV24, uint8_t, 1, 1, 1, 8)
TESTBIPLANARTOBP(NV16, uint8_t, 1, 2, 1, NV24, uint8_t, 1, 1, 1, 8)
TESTBIPLANARTOBP(P010, uint16_t, 2, 2, 2, P410, uint16_t, 2, 1, 1, 10)
TESTBIPLANARTOBP(P210, uint16_t, 2, 2, 1, P410, uint16_t, 2, 1, 1, 10)
TESTBIPLANARTOBP(P012, uint16_t, 2, 2, 2, P412, uint16_t, 2, 1, 1, 12)
TESTBIPLANARTOBP(P212, uint16_t, 2, 2, 1, P412, uint16_t, 2, 1, 1, 12)
TESTBIPLANARTOBP(P016, uint16_t, 2, 2, 2, P416, uint16_t, 2, 1, 1, 16)
TESTBIPLANARTOBP(P216, uint16_t, 2, 2, 1, P416, uint16_t, 2, 1, 1, 16)
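The SRC_DEPTH mask used by these tests puts random data in the top SRC_DEPTH bits, which is exactly the msb layout the P formats expect. For SRC_BPC == 2 and SRC_DEPTH == 10 (sketch, not test code):

// ((uint16_t)(-1)) << (16 - 10) == 0xFFC0: keep the top 10 bits.
static uint16_t RandomP010Sample(uint16_t r) {
  return (uint16_t)(r & 0xFFC0);
}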
#define TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \
DST_SUBSAMP_X, DST_SUBSAMP_Y, W1280, N, NEG, OFF, \
SRC_DEPTH) \
TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
static_assert(SRC_BPC == 1 || SRC_BPC == 2, "SRC BPC unsupported"); \
static_assert(DST_BPC == 1 || DST_BPC == 2, "DST BPC unsupported"); \
static_assert(SRC_SUBSAMP_X == 1 || SRC_SUBSAMP_X == 2, \
"SRC_SUBSAMP_X unsupported"); \
static_assert(SRC_SUBSAMP_Y == 1 || SRC_SUBSAMP_Y == 2, \
"SRC_SUBSAMP_Y unsupported"); \
static_assert(DST_SUBSAMP_X == 1 || DST_SUBSAMP_X == 2, \
"DST_SUBSAMP_X unsupported"); \
static_assert(DST_SUBSAMP_Y == 1 || DST_SUBSAMP_Y == 2, \
"DST_SUBSAMP_Y unsupported"); \
const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
const int kHeight = benchmark_height_; \
const int kSrcHalfWidth = SUBSAMPLE(kWidth, SRC_SUBSAMP_X); \
const int kSrcHalfHeight = SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); \
const int kDstHalfWidth = SUBSAMPLE(kWidth, DST_SUBSAMP_X); \
const int kDstHalfHeight = SUBSAMPLE(kHeight, DST_SUBSAMP_Y); \
align_buffer_page_end(src_y, kWidth* kHeight* SRC_BPC + OFF); \
align_buffer_page_end(src_uv, \
kSrcHalfWidth* kSrcHalfHeight* SRC_BPC * 2 + OFF); \
align_buffer_page_end(dst_y_c, kWidth* kHeight* DST_BPC); \
align_buffer_page_end(dst_u_c, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
align_buffer_page_end(dst_v_c, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
align_buffer_page_end(dst_y_opt, kWidth* kHeight* DST_BPC); \
align_buffer_page_end(dst_u_opt, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
align_buffer_page_end(dst_v_opt, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
SRC_T* src_y_p = reinterpret_cast<SRC_T*>(src_y + OFF); \
SRC_T* src_uv_p = reinterpret_cast<SRC_T*>(src_uv + OFF); \
for (int i = 0; i < kWidth * kHeight; ++i) { \
src_y_p[i] = \
(fastrand() & (((SRC_T)(-1)) << ((8 * SRC_BPC) - SRC_DEPTH))); \
} \
for (int i = 0; i < kSrcHalfWidth * kSrcHalfHeight * 2; ++i) { \
src_uv_p[i] = \
(fastrand() & (((SRC_T)(-1)) << ((8 * SRC_BPC) - SRC_DEPTH))); \
} \
memset(dst_y_c, 1, kWidth* kHeight* DST_BPC); \
memset(dst_u_c, 2, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
memset(dst_v_c, 3, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
memset(dst_y_opt, 101, kWidth* kHeight* DST_BPC); \
memset(dst_u_opt, 102, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
memset(dst_v_opt, 103, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
MaskCpuFlags(disable_cpu_flags_); \
SRC_FMT_PLANAR##To##FMT_PLANAR( \
src_y_p, kWidth, src_uv_p, kSrcHalfWidth * 2, \
reinterpret_cast<DST_T*>(dst_y_c), kWidth, \
reinterpret_cast<DST_T*>(dst_u_c), kDstHalfWidth, \
reinterpret_cast<DST_T*>(dst_v_c), kDstHalfWidth, kWidth, \
NEG kHeight); \
MaskCpuFlags(benchmark_cpu_info_); \
for (int i = 0; i < benchmark_iterations_; ++i) { \
SRC_FMT_PLANAR##To##FMT_PLANAR( \
src_y_p, kWidth, src_uv_p, kSrcHalfWidth * 2, \
reinterpret_cast<DST_T*>(dst_y_opt), kWidth, \
reinterpret_cast<DST_T*>(dst_u_opt), kDstHalfWidth, \
reinterpret_cast<DST_T*>(dst_v_opt), kDstHalfWidth, kWidth, \
NEG kHeight); \
} \
for (int i = 0; i < kHeight * kWidth * DST_BPC; ++i) { \
EXPECT_EQ(dst_y_c[i], dst_y_opt[i]); \
} \
for (int i = 0; i < kDstHalfWidth * kDstHalfHeight * DST_BPC; ++i) { \
EXPECT_EQ(dst_u_c[i], dst_u_opt[i]); \
EXPECT_EQ(dst_v_c[i], dst_v_opt[i]); \
} \
free_aligned_buffer_page_end(dst_y_c); \
free_aligned_buffer_page_end(dst_u_c); \
free_aligned_buffer_page_end(dst_v_c); \
free_aligned_buffer_page_end(dst_y_opt); \
free_aligned_buffer_page_end(dst_u_opt); \
free_aligned_buffer_page_end(dst_v_opt); \
free_aligned_buffer_page_end(src_y); \
free_aligned_buffer_page_end(src_uv); \
}
#define TESTBIPLANARTOP(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \
DST_SUBSAMP_X, DST_SUBSAMP_Y, SRC_DEPTH) \
TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
DST_SUBSAMP_Y, benchmark_width_ - 4, _Any, +, 0, SRC_DEPTH) \
TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
DST_SUBSAMP_Y, benchmark_width_, _Unaligned, +, 1, \
SRC_DEPTH) \
TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
DST_SUBSAMP_Y, benchmark_width_, _Invert, -, 0, SRC_DEPTH) \
TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
DST_SUBSAMP_Y, benchmark_width_, _Opt, +, 0, SRC_DEPTH)
TESTBIPLANARTOP(NV12, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2, 8)
TESTBIPLANARTOP(NV21, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2, 8)
// Provide matrix wrappers for full range bt.709
#define F420ToABGR(a, b, c, d, e, f, g, h, i, j) \

View File

@ -81,6 +81,11 @@ TEST_F(LibYUVBaseTest, TestFourCC) {
EXPECT_TRUE(TestValidFourCC(FOURCC_H420, FOURCC_BPP_H420));
EXPECT_TRUE(TestValidFourCC(FOURCC_H422, FOURCC_BPP_H422));
EXPECT_TRUE(TestValidFourCC(FOURCC_H010, FOURCC_BPP_H010));
EXPECT_TRUE(TestValidFourCC(FOURCC_H210, FOURCC_BPP_H210));
EXPECT_TRUE(TestValidFourCC(FOURCC_I010, FOURCC_BPP_I010));
EXPECT_TRUE(TestValidFourCC(FOURCC_I210, FOURCC_BPP_I210));
EXPECT_TRUE(TestValidFourCC(FOURCC_P010, FOURCC_BPP_P010));
EXPECT_TRUE(TestValidFourCC(FOURCC_P210, FOURCC_BPP_P210));
EXPECT_TRUE(TestValidFourCC(FOURCC_MJPG, FOURCC_BPP_MJPG));
EXPECT_TRUE(TestValidFourCC(FOURCC_YV12, FOURCC_BPP_YV12));
EXPECT_TRUE(TestValidFourCC(FOURCC_YV16, FOURCC_BPP_YV16));