Add P010ToP410 and P210ToP410

These are 16 bit bi-planar convert functions that scale the UV plane up
to the Y plane's size using a (bi)linear filter.

libyuv_unittest --gtest_filter=*ToP41*
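
A minimal usage sketch for reviewers (the wrapper function, the even frame
dimensions and the element-unit strides are illustrative assumptions, not
part of this change):

// Upscale a hypothetical P010 frame to P410. Strides here are assumed to be
// counted in uint16_t elements; UV planes hold interleaved U,V pairs.
#include "libyuv/convert.h"

int UpscaleP010ToP410(const uint16_t* src_y, const uint16_t* src_uv,
                      uint16_t* dst_y, uint16_t* dst_uv,
                      int width, int height) {
  // P010: Y is width x height; UV is width/2 pairs x height/2 rows,
  //       i.e. width uint16_t elements per UV row (for even width).
  // P410: Y and UV are both full resolution (2 * width elements per UV row).
  return libyuv::P010ToP410(src_y, width,       // src_stride_y
                            src_uv, width,      // src_stride_uv
                            dst_y, width,       // dst_stride_y
                            dst_uv, width * 2,  // dst_stride_uv
                            width, height);     // returns 0 on success
}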

R=fbarchard@chromium.org

Bug: libyuv:872
Change-Id: I3cb4fafe2b2c9eedd0d91cf4c619abb9ee107bc1
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2690102
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Author: Yuan Tong, 2021-02-12 10:49:25 +08:00; committed by Frank Barchard
Parent: 12a4a2372c
Commit: d4ecb70610
15 changed files with 1353 additions and 498 deletions


@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1776
Version: 1777
License: BSD
License File: LICENSE


@ -315,6 +315,44 @@ int NV16ToNV24(const uint8_t* src_y,
int width,
int height);
// Convert P010 to P410.
LIBYUV_API
int P010ToP410(const uint16_t* src_y,
int src_stride_y,
const uint16_t* src_uv,
int src_stride_uv,
uint16_t* dst_y,
int dst_stride_y,
uint16_t* dst_uv,
int dst_stride_uv,
int width,
int height);
// Convert P012 to P412.
#define P012ToP412 P010ToP410
// Convert P016 to P416.
#define P016ToP416 P010ToP410
// Convert P210 to P410.
LIBYUV_API
int P210ToP410(const uint16_t* src_y,
int src_stride_y,
const uint16_t* src_uv,
int src_stride_uv,
uint16_t* dst_y,
int dst_stride_y,
uint16_t* dst_uv,
int dst_stride_uv,
int width,
int height);
// Convert P212 to P412.
#define P212ToP412 P210ToP410
// Convert P216 to P416.
#define P216ToP416 P210ToP410
// Convert YUY2 to I420.
LIBYUV_API
int YUY2ToI420(const uint8_t* src_yuy2,


@ -81,10 +81,12 @@ extern "C" {
#define HAS_SCALEROWUP2LINEAR_SSSE3
#define HAS_SCALEROWUP2BILINEAR_SSE2
#define HAS_SCALEROWUP2BILINEAR_SSSE3
#define HAS_SCALEROWUP2LINEAR_16_SSE2
#define HAS_SCALEROWUP2BILINEAR_16_SSE2
#define HAS_SCALEROWUP2LINEAR_16_SSSE3
#define HAS_SCALEROWUP2BILINEAR_16_SSSE3
#define HAS_SCALEUVROWUP2LINEAR_SSSE3
#define HAS_SCALEUVROWUP2BILINEAR_SSSE3
#define HAS_SCALEUVROWUP2LINEAR_16_SSE2
#define HAS_SCALEUVROWUP2BILINEAR_16_SSE2
#endif
// The following are available for gcc/clang x86 platforms, but
@ -100,6 +102,8 @@ extern "C" {
#define HAS_SCALEROWUP2BILINEAR_16_AVX2
#define HAS_SCALEUVROWUP2LINEAR_AVX2
#define HAS_SCALEUVROWUP2BILINEAR_AVX2
#define HAS_SCALEUVROWUP2LINEAR_16_AVX2
#define HAS_SCALEUVROWUP2BILINEAR_16_AVX2
#endif
// The following are available on all x86 platforms, but
@ -134,6 +138,8 @@ extern "C" {
#define HAS_SCALEROWUP2BILINEAR_16_NEON
#define HAS_SCALEUVROWUP2LINEAR_NEON
#define HAS_SCALEUVROWUP2BILINEAR_NEON
#define HAS_SCALEUVROWUP2LINEAR_16_NEON
#define HAS_SCALEUVROWUP2BILINEAR_16_NEON
#endif
#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
@ -487,6 +493,22 @@ void ScaleUVRowUp2_Bilinear_Any_C(const uint8_t* src_ptr,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleUVRowUp2_Linear_16_C(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width);
void ScaleUVRowUp2_Bilinear_16_C(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleUVRowUp2_Linear_16_Any_C(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width);
void ScaleUVRowUp2_Bilinear_16_Any_C(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleUVCols_C(uint8_t* dst_uv,
const uint8_t* src_uv,
@ -589,10 +611,10 @@ void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr,
void ScaleRowUp2_Linear_16_SSSE3(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width);
void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr,
void ScaleRowUp2_Bilinear_16_SSSE3(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
@ -629,10 +651,10 @@ void ScaleRowUp2_Bilinear_Any_SSE2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleRowUp2_Linear_16_Any_SSE2(const uint16_t* src_ptr,
void ScaleRowUp2_Linear_16_Any_SSSE3(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width);
void ScaleRowUp2_Bilinear_16_Any_SSE2(const uint16_t* src_ptr,
void ScaleRowUp2_Bilinear_16_Any_SSSE3(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
@ -1235,6 +1257,54 @@ void ScaleUVRowUp2_Bilinear_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleUVRowUp2_Linear_16_SSE2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width);
void ScaleUVRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleUVRowUp2_Linear_16_Any_SSE2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width);
void ScaleUVRowUp2_Bilinear_16_Any_SSE2(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleUVRowUp2_Linear_16_AVX2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width);
void ScaleUVRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleUVRowUp2_Linear_16_Any_AVX2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width);
void ScaleUVRowUp2_Bilinear_16_Any_AVX2(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleUVRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width);
void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleUVRowUp2_Linear_16_Any_NEON(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width);
void ScaleUVRowUp2_Bilinear_16_Any_NEON(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
// ScaleRowDown2Box also used by planar functions
// NEON downscalers with interpolation.


@ -30,6 +30,19 @@ int UVScale(const uint8_t* src_uv,
int dst_height,
enum FilterMode filtering);
// Scale a 16 bit UV image.
// This function is currently incomplete; it cannot handle all cases.
LIBYUV_API
int UVScale_16(const uint16_t* src_uv,
int src_stride_uv,
int src_width,
int src_height,
uint16_t* dst_uv,
int dst_stride_uv,
int dst_width,
int dst_height,
enum FilterMode filtering);
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv


@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1776
#define LIBYUV_VERSION 1777
#endif // INCLUDE_LIBYUV_VERSION_H_


@ -663,6 +663,55 @@ int NV16ToNV24(const uint8_t* src_y,
return 0;
}
LIBYUV_API
int P010ToP410(const uint16_t* src_y,
int src_stride_y,
const uint16_t* src_uv,
int src_stride_uv,
uint16_t* dst_y,
int dst_stride_y,
uint16_t* dst_uv,
int dst_stride_uv,
int width,
int height) {
if (width == 0 || height == 0) {
return -1;
}
if (dst_y) {
ScalePlane_16(src_y, src_stride_y, width, height, dst_y, dst_stride_y,
Abs(width), Abs(height), kFilterBilinear);
}
UVScale_16(src_uv, src_stride_uv, SUBSAMPLE(width, 1, 1),
SUBSAMPLE(height, 1, 1), dst_uv, dst_stride_uv, Abs(width),
Abs(height), kFilterBilinear);
return 0;
}
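// As a worked example of the plane geometry handled by P010ToP410 above
// (assuming the usual P010 layout and a hypothetical even-sized frame):
//   For 1920x1080 input:
//   Y plane:  1920 x 1080 samples, scaled 1:1 (effectively a copy).
//   UV plane: SUBSAMPLE(1920, 1, 1) = 960 UV pairs per row and
//             SUBSAMPLE(1080, 1, 1) = 540 rows, upscaled 2x in both
//             directions by UVScale_16 to 1920 x 1080 UV pairs.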
LIBYUV_API
int P210ToP410(const uint16_t* src_y,
int src_stride_y,
const uint16_t* src_uv,
int src_stride_uv,
uint16_t* dst_y,
int dst_stride_y,
uint16_t* dst_uv,
int dst_stride_uv,
int width,
int height) {
if (width == 0 || height == 0) {
return -1;
}
if (dst_y) {
ScalePlane_16(src_y, src_stride_y, width, height, dst_y, dst_stride_y,
Abs(width), Abs(height), kFilterBilinear);
}
UVScale_16(src_uv, src_stride_uv, SUBSAMPLE(width, 1, 1), height, dst_uv,
dst_stride_uv, Abs(width), Abs(height), kFilterBilinear);
return 0;
}
// Convert YUY2 to I420.
LIBYUV_API
int YUY2ToI420(const uint8_t* src_yuy2,


@ -4190,6 +4190,7 @@ void MergeARGBRow_AVX2(const uint8_t* src_r,
"lea 64(%4),%4 \n"
"sub $0x10,%5 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
"+r"(src_b), // %2
@ -4231,6 +4232,7 @@ void MergeXRGBRow_AVX2(const uint8_t* src_r,
"lea 64(%3),%3 \n"
"sub $0x10,%4 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
"+r"(src_b), // %2
@ -4340,9 +4342,9 @@ void SplitXRGBRow_SSE2(const uint8_t* src_argb,
}
#endif
static const uvec8 kShuffleMaskARGBSplit = {0, 4, 8, 12, 1, 5, 9, 13,
2, 6, 10, 14, 3, 7, 11, 15};
#ifdef HAS_SPLITARGBROW_SSSE3
static const uvec8 kShuffleMaskARGBSplit = {0u, 4u, 8u, 12u, 1u, 5u, 9u, 13u,
2u, 6u, 10u, 14u, 3u, 7u, 11u, 15u};
void SplitARGBRow_SSSE3(const uint8_t* src_argb,
uint8_t* dst_r,
uint8_t* dst_g,
@ -4351,6 +4353,7 @@ void SplitARGBRow_SSSE3(const uint8_t* src_argb,
int width) {
asm volatile(
"movdqa %6,%%xmm3 \n"
"sub %1,%2 \n"
"sub %1,%3 \n"
"sub %1,%4 \n"
@ -4360,8 +4363,8 @@ void SplitARGBRow_SSSE3(const uint8_t* src_argb,
"movdqu (%0),%%xmm0 \n" // 00-0F
"movdqu 16(%0),%%xmm1 \n" // 10-1F
"pshufb %6,%%xmm0 \n" // 048C159D26AE37BF (lo)
"pshufb %6,%%xmm1 \n" // 048C159D26AE37BF (hi)
"pshufb %%xmm3,%%xmm0 \n" // 048C159D26AE37BF (lo)
"pshufb %%xmm3,%%xmm1 \n" // 048C159D26AE37BF (hi)
"movdqa %%xmm0,%%xmm2 \n"
"punpckldq %%xmm1,%%xmm0 \n" // 048C048C159D159D (BG)
"punpckhdq %%xmm1,%%xmm2 \n" // 26AE26AE37BF37BF (RA)
@ -4385,7 +4388,7 @@ void SplitARGBRow_SSSE3(const uint8_t* src_argb,
"+rm"(width) // %5
#endif
: "m"(kShuffleMaskARGBSplit) // %6
: "memory", "cc", "xmm0", "xmm1", "xmm2");
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
}
void SplitXRGBRow_SSSE3(const uint8_t* src_argb,
@ -4395,13 +4398,15 @@ void SplitXRGBRow_SSSE3(const uint8_t* src_argb,
int width) {
asm volatile(
"movdqa %5,%%xmm3 \n"
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n" // 00-0F
"movdqu 16(%0),%%xmm1 \n" // 10-1F
"pshufb %5,%%xmm0 \n" // 048C159D26AE37BF (lo)
"pshufb %5,%%xmm1 \n" // 048C159D26AE37BF (hi)
"pshufb %%xmm3,%%xmm0 \n" // 048C159D26AE37BF (lo)
"pshufb %%xmm3,%%xmm1 \n" // 048C159D26AE37BF (hi)
"movdqa %%xmm0,%%xmm2 \n"
"punpckldq %%xmm1,%%xmm0 \n" // 048C048C159D159D (BG)
"punpckhdq %%xmm1,%%xmm2 \n" // 26AE26AE37BF37BF (RA)
@ -4421,16 +4426,12 @@ void SplitXRGBRow_SSSE3(const uint8_t* src_argb,
"+r"(dst_b), // %3
"+r"(width) // %4
: "m"(kShuffleMaskARGBSplit) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2");
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
}
#endif
#ifdef HAS_SPLITARGBROW_AVX2
static const lvec8 kShuffleMaskARGBSplit_AVX2 = {
0u, 4u, 8u, 12u, 1u, 5u, 9u, 13u, 2u, 6u, 10u, 14u, 3u, 7u, 11u, 15u,
0u, 4u, 8u, 12u, 1u, 5u, 9u, 13u, 2u, 6u, 10u, 14u, 3u, 7u, 11u, 15u};
static const ulvec32 kShuffleMaskARGBPermute_AVX2 = {0u, 4u, 1u, 5u,
2u, 6u, 3u, 7u};
static const ulvec32 kShuffleMaskARGBPermute = {0, 4, 1, 5, 2, 6, 3, 7};
void SplitARGBRow_AVX2(const uint8_t* src_argb,
uint8_t* dst_r,
uint8_t* dst_g,
@ -4442,7 +4443,8 @@ void SplitARGBRow_AVX2(const uint8_t* src_argb,
"sub %1,%2 \n"
"sub %1,%3 \n"
"sub %1,%4 \n"
"vmovdqu %7,%%ymm3 \n"
"vmovdqa %7,%%ymm3 \n"
"vbroadcastf128 %6,%%ymm4 \n"
LABELALIGN
"1: \n"
@ -4451,8 +4453,8 @@ void SplitARGBRow_AVX2(const uint8_t* src_argb,
"vmovdqu 16(%0),%%xmm1 \n" // 10-1F
"vinserti128 $1,32(%0),%%ymm0,%%ymm0 \n" // 00-0F 20-2F
"vinserti128 $1,48(%0),%%ymm1,%%ymm1 \n" // 10-1F 30-3F
"vpshufb %6,%%ymm0,%%ymm0 \n"
"vpshufb %6,%%ymm1,%%ymm1 \n"
"vpshufb %%ymm4,%%ymm0,%%ymm0 \n"
"vpshufb %%ymm4,%%ymm1,%%ymm1 \n"
"vpermd %%ymm0,%%ymm3,%%ymm0 \n"
"vpermd %%ymm1,%%ymm3,%%ymm1 \n"
"vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n" // GA
@ -4465,6 +4467,7 @@ void SplitARGBRow_AVX2(const uint8_t* src_argb,
"lea 16(%1),%1 \n"
"subl $0x10,%5 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_argb), // %0
"+r"(dst_r), // %1
"+r"(dst_g), // %2
@ -4475,9 +4478,9 @@ void SplitARGBRow_AVX2(const uint8_t* src_argb,
#else
"+rm"(width) // %5
#endif
: "m"(kShuffleMaskARGBSplit_AVX2), // %6
"m"(kShuffleMaskARGBPermute_AVX2) // %7
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
: "m"(kShuffleMaskARGBSplit), // %6
"m"(kShuffleMaskARGBPermute) // %7
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
void SplitXRGBRow_AVX2(const uint8_t* src_argb,
@ -4487,15 +4490,18 @@ void SplitXRGBRow_AVX2(const uint8_t* src_argb,
int width) {
asm volatile(
"vmovdqu %6,%%ymm3 \n" LABELALIGN
"vmovdqa %6,%%ymm3 \n"
"vbroadcastf128 %5,%%ymm4 \n"
LABELALIGN
"1: \n"
"vmovdqu (%0),%%xmm0 \n" // 00-0F
"vmovdqu 16(%0),%%xmm1 \n" // 10-1F
"vinserti128 $1,32(%0),%%ymm0,%%ymm0 \n" // 00-0F 20-2F
"vinserti128 $1,48(%0),%%ymm1,%%ymm1 \n" // 10-1F 30-3F
"vpshufb %5,%%ymm0,%%ymm0 \n"
"vpshufb %5,%%ymm1,%%ymm1 \n"
"vpshufb %%ymm4,%%ymm0,%%ymm0 \n"
"vpshufb %%ymm4,%%ymm1,%%ymm1 \n"
"vpermd %%ymm0,%%ymm3,%%ymm0 \n"
"vpermd %%ymm1,%%ymm3,%%ymm1 \n"
"vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n" // GA
@ -4510,13 +4516,14 @@ void SplitXRGBRow_AVX2(const uint8_t* src_argb,
"lea 16(%3),%3 \n"
"sub $0x10,%4 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_r), // %1
"+r"(dst_g), // %2
"+r"(dst_b), // %3
"+r"(width) // %4
: "m"(kShuffleMaskARGBSplit_AVX2), // %5
"m"(kShuffleMaskARGBPermute_AVX2) // %6
"vzeroupper \n"
: "+r"(src_argb), // %0
"+r"(dst_r), // %1
"+r"(dst_g), // %2
"+r"(dst_b), // %3
"+r"(width) // %4
: "m"(kShuffleMaskARGBSplit), // %5
"m"(kShuffleMaskARGBPermute) // %6
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
}
#endif


@ -1441,20 +1441,16 @@ void ScalePlaneUp2_Bilinear(int src_width,
}
#endif
if (src_height == 1) {
Scale2RowUp(src_ptr, 0, dst_ptr, dst_stride, dst_width);
} else {
Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
dst_ptr += dst_stride;
for (x = 0; x < src_height - 1; ++x) {
Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width);
src_ptr += src_stride;
// TODO: Test performance of writing one row of destination at a time.
dst_ptr += 2 * dst_stride;
}
if (!(dst_height & 1)) {
Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
dst_ptr += dst_stride;
for (x = 0; x < src_height - 1; ++x) {
Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width);
src_ptr += src_stride;
// TODO: Test performance of writing one row of destination at a time.
dst_ptr += 2 * dst_stride;
}
if (!(dst_height & 1)) {
Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
}
}
}
@ -1480,9 +1476,9 @@ void ScalePlaneUp2_16_Linear(int src_width,
// This function can only scale up by 2 times horizontally.
assert(src_width == ((dst_width + 1) / 2));
#ifdef HAS_SCALEROWUP2LINEAR_16_SSE2
if (TestCpuFlag(kCpuHasSSE2)) {
ScaleRowUp = ScaleRowUp2_Linear_16_Any_SSE2;
#ifdef HAS_SCALEROWUP2LINEAR_16_SSSE3
if (TestCpuFlag(kCpuHasSSSE3)) {
ScaleRowUp = ScaleRowUp2_Linear_16_Any_SSSE3;
}
#endif
@ -1534,9 +1530,9 @@ void ScalePlaneUp2_16_Bilinear(int src_width,
assert(src_width == ((dst_width + 1) / 2));
assert(src_height == ((dst_height + 1) / 2));
#ifdef HAS_SCALEROWUP2BILINEAR_16_SSE2
if (TestCpuFlag(kCpuHasSSE2)) {
Scale2RowUp = ScaleRowUp2_Bilinear_16_Any_SSE2;
#ifdef HAS_SCALEROWUP2BILINEAR_16_SSSE3
if (TestCpuFlag(kCpuHasSSSE3)) {
Scale2RowUp = ScaleRowUp2_Bilinear_16_Any_SSSE3;
}
#endif
@ -1552,19 +1548,15 @@ void ScalePlaneUp2_16_Bilinear(int src_width,
}
#endif
if (src_height == 1) {
Scale2RowUp(src_ptr, 0, dst_ptr, dst_stride, dst_width);
} else {
Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
dst_ptr += dst_stride;
for (x = 0; x < src_height - 1; ++x) {
Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width);
src_ptr += src_stride;
dst_ptr += 2 * dst_stride;
}
if (!(dst_height & 1)) {
Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
dst_ptr += dst_stride;
for (x = 0; x < src_height - 1; ++x) {
Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width);
src_ptr += src_stride;
dst_ptr += 2 * dst_stride;
}
if (!(dst_height & 1)) {
Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
}
}
}


@ -656,9 +656,9 @@ SUH2LANY(ScaleRowUp2_Linear_Any_SSSE3,
uint8_t)
#endif
#ifdef HAS_SCALEROWUP2LINEAR_16_SSE2
SUH2LANY(ScaleRowUp2_Linear_16_Any_SSE2,
ScaleRowUp2_Linear_16_SSE2,
#ifdef HAS_SCALEROWUP2LINEAR_16_SSSE3
SUH2LANY(ScaleRowUp2_Linear_16_Any_SSSE3,
ScaleRowUp2_Linear_16_SSSE3,
ScaleRowUp2_Linear_16_C,
15,
uint16_t)
@ -676,7 +676,7 @@ SUH2LANY(ScaleRowUp2_Linear_Any_AVX2,
SUH2LANY(ScaleRowUp2_Linear_16_Any_AVX2,
ScaleRowUp2_Linear_16_AVX2,
ScaleRowUp2_Linear_16_C,
15,
31,
uint16_t)
#endif
@ -744,9 +744,9 @@ SU2BLANY(ScaleRowUp2_Bilinear_Any_SSE2,
uint8_t)
#endif
#ifdef HAS_SCALEROWUP2BILINEAR_16_SSE2
SU2BLANY(ScaleRowUp2_Bilinear_16_Any_SSE2,
ScaleRowUp2_Bilinear_16_SSE2,
#ifdef HAS_SCALEROWUP2BILINEAR_16_SSSE3
SU2BLANY(ScaleRowUp2_Bilinear_16_Any_SSSE3,
ScaleRowUp2_Bilinear_16_SSSE3,
ScaleRowUp2_Bilinear_16_C,
15,
uint16_t)
@ -818,6 +818,12 @@ SBUH2LANY(ScaleUVRowUp2_Linear_Any_C,
0,
uint8_t)
SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_C,
ScaleUVRowUp2_Linear_16_C,
ScaleUVRowUp2_Linear_16_C,
0,
uint16_t)
#ifdef HAS_SCALEUVROWUP2LINEAR_SSSE3
SBUH2LANY(ScaleUVRowUp2_Linear_Any_SSSE3,
ScaleUVRowUp2_Linear_SSSE3,
@ -834,6 +840,22 @@ SBUH2LANY(ScaleUVRowUp2_Linear_Any_AVX2,
uint8_t)
#endif
#ifdef HAS_SCALEUVROWUP2LINEAR_16_SSE2
SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_SSE2,
ScaleUVRowUp2_Linear_16_SSE2,
ScaleUVRowUp2_Linear_16_C,
3,
uint16_t)
#endif
#ifdef HAS_SCALEUVROWUP2LINEAR_16_AVX2
SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_AVX2,
ScaleUVRowUp2_Linear_16_AVX2,
ScaleUVRowUp2_Linear_16_C,
7,
uint16_t)
#endif
#ifdef HAS_SCALEUVROWUP2LINEAR_NEON
SBUH2LANY(ScaleUVRowUp2_Linear_Any_NEON,
ScaleUVRowUp2_Linear_NEON,
@ -842,6 +864,14 @@ SBUH2LANY(ScaleUVRowUp2_Linear_Any_NEON,
uint8_t)
#endif
#ifdef HAS_SCALEUVROWUP2LINEAR_16_NEON
SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_NEON,
ScaleUVRowUp2_Linear_16_NEON,
ScaleUVRowUp2_Linear_16_C,
7,
uint16_t)
#endif
#undef SBUH2LANY
// Scale bi-planar plane up 2 times using bilinear filter.
@ -886,6 +916,12 @@ SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_C,
0,
uint8_t)
SBU2BLANY(ScaleUVRowUp2_Bilinear_16_Any_C,
ScaleUVRowUp2_Bilinear_16_C,
ScaleUVRowUp2_Bilinear_16_C,
0,
uint16_t)
#ifdef HAS_SCALEUVROWUP2BILINEAR_SSSE3
SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_SSSE3,
ScaleUVRowUp2_Bilinear_SSSE3,
@ -902,6 +938,22 @@ SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_AVX2,
uint8_t)
#endif
#ifdef HAS_SCALEUVROWUP2BILINEAR_16_SSE2
SBU2BLANY(ScaleUVRowUp2_Bilinear_16_Any_SSE2,
ScaleUVRowUp2_Bilinear_16_SSE2,
ScaleUVRowUp2_Bilinear_16_C,
7,
uint16_t)
#endif
#ifdef HAS_SCALEUVROWUP2BILINEAR_16_AVX2
SBU2BLANY(ScaleUVRowUp2_Bilinear_16_Any_AVX2,
ScaleUVRowUp2_Bilinear_16_AVX2,
ScaleUVRowUp2_Bilinear_16_C,
7,
uint16_t)
#endif
#ifdef HAS_SCALEUVROWUP2BILINEAR_NEON
SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_NEON,
ScaleUVRowUp2_Bilinear_NEON,
@ -910,6 +962,14 @@ SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_NEON,
uint8_t)
#endif
#ifdef HAS_SCALEUVROWUP2BILINEAR_16_NEON
SBU2BLANY(ScaleUVRowUp2_Bilinear_16_Any_NEON,
ScaleUVRowUp2_Bilinear_16_NEON,
ScaleUVRowUp2_Bilinear_16_C,
3,
uint16_t)
#endif
#undef SBU2BLANY
#ifdef __cplusplus


@ -1258,6 +1258,64 @@ void ScaleUVRowUp2_Bilinear_C(const uint8_t* src_ptr,
}
}
void ScaleUVRowUp2_Linear_16_C(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
int src_width = dst_width >> 1;
int x;
assert((dst_width % 2 == 0) && (dst_width >= 0));
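// Each output pair of UV samples blends the two nearest source pairs with
// weights 3/4 (near) and 1/4 (far); the +2 rounds before the >> 2.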
for (x = 0; x < src_width; ++x) {
dst_ptr[4 * x + 0] =
(src_ptr[2 * x + 0] * 3 + src_ptr[2 * x + 2] * 1 + 2) >> 2;
dst_ptr[4 * x + 1] =
(src_ptr[2 * x + 1] * 3 + src_ptr[2 * x + 3] * 1 + 2) >> 2;
dst_ptr[4 * x + 2] =
(src_ptr[2 * x + 0] * 1 + src_ptr[2 * x + 2] * 3 + 2) >> 2;
dst_ptr[4 * x + 3] =
(src_ptr[2 * x + 1] * 1 + src_ptr[2 * x + 3] * 3 + 2) >> 2;
}
}
void ScaleUVRowUp2_Bilinear_16_C(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
const uint16_t* s = src_ptr;
const uint16_t* t = src_ptr + src_stride;
uint16_t* d = dst_ptr;
uint16_t* e = dst_ptr + dst_stride;
int src_width = dst_width >> 1;
int x;
assert((dst_width % 2 == 0) && (dst_width >= 0));
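// 2x2 bilinear upsample: each output sample blends the four nearest source
// samples with weights 9/16, 3/16, 3/16 and 1/16 (the separable product of
// the 3/4 and 1/4 linear weights); the +8 rounds before the >> 4.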
for (x = 0; x < src_width; ++x) {
d[4 * x + 0] = (s[2 * x + 0] * 9 + s[2 * x + 2] * 3 + t[2 * x + 0] * 3 +
t[2 * x + 2] * 1 + 8) >>
4;
d[4 * x + 1] = (s[2 * x + 1] * 9 + s[2 * x + 3] * 3 + t[2 * x + 1] * 3 +
t[2 * x + 3] * 1 + 8) >>
4;
d[4 * x + 2] = (s[2 * x + 0] * 3 + s[2 * x + 2] * 9 + t[2 * x + 0] * 1 +
t[2 * x + 2] * 3 + 8) >>
4;
d[4 * x + 3] = (s[2 * x + 1] * 3 + s[2 * x + 3] * 9 + t[2 * x + 1] * 1 +
t[2 * x + 3] * 3 + 8) >>
4;
e[4 * x + 0] = (s[2 * x + 0] * 3 + s[2 * x + 2] * 1 + t[2 * x + 0] * 9 +
t[2 * x + 2] * 3 + 8) >>
4;
e[4 * x + 1] = (s[2 * x + 1] * 3 + s[2 * x + 3] * 1 + t[2 * x + 1] * 9 +
t[2 * x + 3] * 3 + 8) >>
4;
e[4 * x + 2] = (s[2 * x + 0] * 1 + s[2 * x + 2] * 3 + t[2 * x + 0] * 3 +
t[2 * x + 2] * 9 + 8) >>
4;
e[4 * x + 3] = (s[2 * x + 1] * 1 + s[2 * x + 3] * 3 + t[2 * x + 1] * 3 +
t[2 * x + 3] * 9 + 8) >>
4;
}
}
// Scales a single row of pixels using point sampling.
void ScaleUVCols_C(uint8_t* dst_uv,
const uint8_t* src_uv,

File diff suppressed because it is too large.


@ -791,6 +791,102 @@ void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
);
}
void ScaleUVRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
const uint16_t* src_temp = src_ptr + 2;
asm volatile(
"vmov.u16 d30, #3 \n"
"1: \n"
"vld1.16 {q0}, [%0]! \n" // 00112233 (1u1v, 16)
"vld1.16 {q1}, [%3]! \n" // 11223344 (1u1v, 16)
"vmovl.u16 q2, d0 \n" // 0011 (1u1v, 32b)
"vmovl.u16 q3, d2 \n" // 1122 (1u1v, 32b)
"vmovl.u16 q4, d1 \n" // 2233 (1u1v, 32b)
"vmovl.u16 q5, d3 \n" // 3344 (1u1v, 32b)
"vmlal.u16 q2, d2, d30 \n" // 3*near+far (odd)
"vmlal.u16 q3, d0, d30 \n" // 3*near+far (even)
"vmlal.u16 q4, d3, d30 \n" // 3*near+far (odd)
"vmlal.u16 q5, d1, d30 \n" // 3*near+far (even)
"vrshrn.u32 d1, q2, #2 \n" // 3/4*near+1/4*far (odd)
"vrshrn.u32 d0, q3, #2 \n" // 3/4*near+1/4*far (even)
"vrshrn.u32 d3, q4, #2 \n" // 3/4*near+1/4*far (odd)
"vrshrn.u32 d2, q5, #2 \n" // 3/4*near+1/4*far (even)
"vst2.32 {d0, d1}, [%1]! \n" // store
"vst2.32 {d2, d3}, [%1]! \n" // store
"subs %2, %2, #8 \n" // 4 uv -> 8 uv
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
"+r"(src_temp) // %3
:
: "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "d30" // Clobber List
);
}
void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
const uint16_t* src_ptr1 = src_ptr + src_stride;
uint16_t* dst_ptr1 = dst_ptr + dst_stride;
const uint16_t* src_temp = src_ptr + 2;
const uint16_t* src_temp1 = src_ptr1 + 2;
asm volatile(
"vmov.u16 d30, #3 \n"
"vmov.u32 q14, #3 \n"
"1: \n"
"vld1.8 {d0}, [%0]! \n" // 0011 (1u1v)
"vld1.8 {d1}, [%5]! \n" // 1122 (1u1v)
"vmovl.u16 q2, d0 \n" // 0011 (1u1v, 32b)
"vmovl.u16 q3, d1 \n" // 1122 (1u1v, 32b)
"vmlal.u16 q2, d1, d30 \n" // 3*near+far (1, odd)
"vmlal.u16 q3, d0, d30 \n" // 3*near+far (1, even)
"vld1.8 {d0}, [%1]! \n" // 0011 (1u1v)
"vld1.8 {d1}, [%6]! \n" // 1122 (1u1v)
"vmovl.u16 q4, d0 \n" // 0011 (1u1v, 32b)
"vmovl.u16 q5, d1 \n" // 1122 (1u1v, 32b)
"vmlal.u16 q4, d1, d30 \n" // 3*near+far (2, odd)
"vmlal.u16 q5, d0, d30 \n" // 3*near+far (2, even)
"vmovq q0, q4 \n"
"vmovq q1, q5 \n"
"vmla.u32 q4, q2, q14 \n" // 9 3 3 1 (1, odd)
"vmla.u32 q5, q3, q14 \n" // 9 3 3 1 (1, even)
"vmla.u32 q2, q0, q14 \n" // 9 3 3 1 (2, odd)
"vmla.u32 q3, q1, q14 \n" // 9 3 3 1 (2, even)
"vrshrn.u32 d1, q4, #4 \n" // 1, odd
"vrshrn.u32 d0, q5, #4 \n" // 1, even
"vrshrn.u32 d3, q2, #4 \n" // 2, odd
"vrshrn.u32 d2, q3, #4 \n" // 2, even
"vst2.32 {d0, d1}, [%2]! \n" // store
"vst2.32 {d2, d3}, [%3]! \n" // store
"subs %4, %4, #4 \n" // 2 uv -> 4 uv
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_ptr1), // %1
"+r"(dst_ptr), // %2
"+r"(dst_ptr1), // %3
"+r"(dst_width), // %4
"+r"(src_temp), // %5
"+r"(src_temp1) // %6
:
: "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q14",
"d30" // Clobber List
);
}
// Add a row of bytes to a row of shorts. Used for box filter.
// Reads 16 bytes and accumulates to 16 shorts at a time.
void ScaleAddRow_NEON(const uint8_t* src_ptr,


@ -799,8 +799,8 @@ void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
"rshrn v4.8b, v4.8h, #4 \n" // 1, odd
"rshrn v3.8b, v5.8h, #4 \n" // 1, even
"st2 {v1.4h, v2.4h}, [%5], #16 \n" // store 1
"st2 {v3.4h, v4.4h}, [%4], #16 \n" // store 2
"st2 {v1.4h, v2.4h}, [%5], #16 \n" // store 2
"st2 {v3.4h, v4.4h}, [%4], #16 \n" // store 1
"subs %w6, %w6, #8 \n" // 4 uv -> 8 uv
"b.gt 1b \n"
: "+r"(src_ptr), // %0
@ -816,6 +816,106 @@ void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
);
}
void ScaleUVRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
const uint16_t* src_temp = src_ptr + 2;
asm volatile(
"movi v31.8h, #3 \n"
"1: \n"
"ld1 {v0.8h}, [%0], #16 \n" // 01234567 (16b)
"ld1 {v1.8h}, [%1], #16 \n" // 12345678 (16b)
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"ushll v2.4s, v0.4h, #0 \n" // 0011 (1u1v, 32b)
"ushll v3.4s, v1.4h, #0 \n" // 1122 (1u1v, 32b)
"ushll2 v4.4s, v0.8h, #0 \n" // 2233 (1u1v, 32b)
"ushll2 v5.4s, v1.8h, #0 \n" // 3344 (1u1v, 32b)
"umlal v2.4s, v1.4h, v31.4h \n" // 3*near+far (odd)
"umlal v3.4s, v0.4h, v31.4h \n" // 3*near+far (even)
"umlal2 v4.4s, v1.8h, v31.8h \n" // 3*near+far (odd)
"umlal2 v5.4s, v0.8h, v31.8h \n" // 3*near+far (even)
"rshrn v2.4h, v2.4s, #2 \n" // 3/4*near+1/4*far (odd)
"rshrn v1.4h, v3.4s, #2 \n" // 3/4*near+1/4*far (even)
"rshrn v4.4h, v4.4s, #2 \n" // 3/4*near+1/4*far (odd)
"rshrn v3.4h, v5.4s, #2 \n" // 3/4*near+1/4*far (even)
"st2 {v1.2s, v2.2s}, [%2], #16 \n" // store
"st2 {v3.2s, v4.2s}, [%2], #16 \n" // store
"subs %w3, %w3, #8 \n" // 4 uv -> 8 uv
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_temp), // %1
"+r"(dst_ptr), // %2
"+r"(dst_width) // %3
:
: "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v31" // Clobber List
);
}
void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
const uint16_t* src_ptr1 = src_ptr + src_stride;
uint16_t* dst_ptr1 = dst_ptr + dst_stride;
const uint16_t* src_temp = src_ptr + 2;
const uint16_t* src_temp1 = src_ptr1 + 2;
asm volatile(
"movi v31.4h, #3 \n"
"movi v30.4s, #3 \n"
"1: \n"
"ldr d0, [%0], #8 \n"
"ldr d1, [%2], #8 \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"ushll v2.4s, v0.4h, #0 \n" // 0011 (1u1v, 32b)
"ushll v3.4s, v1.4h, #0 \n" // 1122 (1u1v, 32b)
"umlal v2.4s, v1.4h, v31.4h \n" // 3*near+far (1, odd)
"umlal v3.4s, v0.4h, v31.4h \n" // 3*near+far (1, even)
"ldr d0, [%1], #8 \n"
"ldr d1, [%3], #8 \n"
"prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
"ushll v4.4s, v0.4h, #0 \n" // 0011 (1u1v, 32b)
"ushll v5.4s, v1.4h, #0 \n" // 1122 (1u1v, 32b)
"umlal v4.4s, v1.4h, v31.4h \n" // 3*near+far (2, odd)
"umlal v5.4s, v0.4h, v31.4h \n" // 3*near+far (2, even)
"mov v0.4s, v4.4s \n"
"mov v1.4s, v5.4s \n"
"mla v4.4s, v2.4s, v30.4s \n" // 9 3 3 1 (1, odd)
"mla v5.4s, v3.4s, v30.4s \n" // 9 3 3 1 (1, even)
"mla v2.4s, v0.4s, v30.4s \n" // 9 3 3 1 (2, odd)
"mla v3.4s, v1.4s, v30.4s \n" // 9 3 3 1 (2, even)
"rshrn v1.4h, v2.4s, #4 \n" // 2, odd
"rshrn v0.4h, v3.4s, #4 \n" // 2, even
"rshrn v3.4h, v4.4s, #4 \n" // 1, odd
"rshrn v2.4h, v5.4s, #4 \n" // 1, even
"st2 {v0.2s, v1.2s}, [%5], #16 \n" // store 2
"st2 {v2.2s, v3.2s}, [%4], #16 \n" // store 1
"subs %w6, %w6, #4 \n" // 2 uv -> 4 uv
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_ptr1), // %1
"+r"(src_temp), // %2
"+r"(src_temp1), // %3
"+r"(dst_ptr), // %4
"+r"(dst_ptr1), // %5
"+r"(dst_width) // %6
:
: "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v30",
"v31" // Clobber List
);
}
// Add a row of bytes to a row of shorts. Used for box filter.
// Reads 16 bytes and accumulates to 16 shorts at a time.
void ScaleAddRow_NEON(const uint8_t* src_ptr,


@ -741,23 +741,124 @@ void ScaleUVBilinearUp2(int src_width,
}
#endif
if (src_height == 1) {
Scale2RowUp(src_ptr, 0, dst_ptr, dst_stride, dst_width);
} else {
Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
dst_ptr += dst_stride;
for (x = 0; x < src_height - 1; ++x) {
Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width);
src_ptr += src_stride;
// TODO: Test performance of writing one row of destination at a time.
dst_ptr += 2 * dst_stride;
}
if (!(dst_height & 1)) {
Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
dst_ptr += dst_stride;
for (x = 0; x < src_height - 1; ++x) {
Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width);
src_ptr += src_stride;
// TODO: Test performance of writing one row of destination at a time.
dst_ptr += 2 * dst_stride;
}
if (!(dst_height & 1)) {
Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
}
}
// Scale 16 bit UV, horizontally up by 2 times.
// Uses linear filter horizontally, nearest vertically.
// This is an optimized version for scaling up a plane to 2 times its
// original width, using linear interpolation.
// This is used to scale U and V planes of P210 to P410.
void ScaleUVLinearUp2_16(int src_width,
int src_height,
int dst_width,
int dst_height,
int src_stride,
int dst_stride,
const uint16_t* src_uv,
uint16_t* dst_uv) {
void (*ScaleRowUp)(const uint16_t* src_uv, uint16_t* dst_uv, int dst_width) =
ScaleUVRowUp2_Linear_16_Any_C;
int i;
int y;
int dy;
// This function can only scale up by 2 times horizontally.
assert(src_width == ((dst_width + 1) / 2));
#ifdef HAS_SCALEUVROWUP2LINEAR_16_SSE2
if (TestCpuFlag(kCpuHasSSE2)) {
ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_SSE2;
}
#endif
#ifdef HAS_SCALEUVROWUP2LINEAR_16_AVX2
if (TestCpuFlag(kCpuHasAVX2)) {
ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_AVX2;
}
#endif
#ifdef HAS_SCALEUVROWUP2LINEAR_16_NEON
if (TestCpuFlag(kCpuHasNEON)) {
ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_NEON;
}
#endif
if (dst_height == 1) {
ScaleRowUp(src_uv + ((src_height - 1) / 2) * src_stride, dst_uv, dst_width);
} else {
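// Step through the source rows in 16.16 fixed point, picking the nearest
// source row for each destination row (the linear filter is horizontal only).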
dy = FixedDiv(src_height - 1, dst_height - 1);
y = (1 << 15) - 1;
for (i = 0; i < dst_height; ++i) {
ScaleRowUp(src_uv + (y >> 16) * src_stride, dst_uv, dst_width);
dst_uv += dst_stride;
y += dy;
}
}
}
// Scale 16 bit UV, up by 2 times.
// This is an optimized version for scaling up a plane to 2 times its
// original size, using bilinear interpolation.
// This is used to scale U and V planes of P010 to P410.
void ScaleUVBilinearUp2_16(int src_width,
int src_height,
int dst_width,
int dst_height,
int src_stride,
int dst_stride,
const uint16_t* src_ptr,
uint16_t* dst_ptr) {
void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride,
uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) =
ScaleUVRowUp2_Bilinear_16_Any_C;
int x;
// This function can only scale up by 2 times.
assert(src_width == ((dst_width + 1) / 2));
assert(src_height == ((dst_height + 1) / 2));
#ifdef HAS_SCALEUVROWUP2BILINEAR_16_SSE2
if (TestCpuFlag(kCpuHasSSE2)) {
Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_SSE2;
}
#endif
#ifdef HAS_SCALEUVROWUP2BILINEAR_16_AVX2
if (TestCpuFlag(kCpuHasAVX2)) {
Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_AVX2;
}
#endif
#ifdef HAS_SCALEUVROWUP2BILINEAR_16_NEON
if (TestCpuFlag(kCpuHasNEON)) {
Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_NEON;
}
#endif
Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
dst_ptr += dst_stride;
for (x = 0; x < src_height - 1; ++x) {
Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width);
src_ptr += src_stride;
// TODO: Test performance of writing one row of destination at a time.
dst_ptr += 2 * dst_stride;
}
if (!(dst_height & 1)) {
Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
}
}
// Scale UV to/from any dimensions, without interpolation.
// Fixed point math is used for performance: The upper 16 bits
// of x and dx is the integer part of the source position and
@ -851,6 +952,26 @@ static int UVCopy(const uint8_t* src_UV,
CopyPlane(src_UV, src_stride_UV, dst_UV, dst_stride_UV, width * 2, height);
return 0;
}
static int UVCopy_16(const uint16_t* src_UV,
int src_stride_UV,
uint16_t* dst_UV,
int dst_stride_UV,
int width,
int height) {
if (!src_UV || !dst_UV || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
src_UV = src_UV + (height - 1) * src_stride_UV;
src_stride_UV = -src_stride_UV;
}
CopyPlane_16(src_UV, src_stride_UV, dst_UV, dst_stride_UV, width * 2, height);
return 0;
}
#endif // HAS_UVCOPY
// Scale a UV plane (from NV12)
@ -953,7 +1074,7 @@ static void ScaleUV(const uint8_t* src,
dst_stride, src, dst, x, y, dy, 4, filtering);
return;
}
if (filtering && src_height == dst_height) {
if (filtering && (dst_width + 1) / 2 == src_width) {
ScaleUVLinearUp2(src_width, src_height, clip_width, clip_height, src_stride,
dst_stride, src, dst);
return;
@ -1005,6 +1126,69 @@ int UVScale(const uint8_t* src_uv,
return 0;
}
// Scale a 16 bit UV image.
// This function is currently incomplete; it cannot handle all cases.
LIBYUV_API
int UVScale_16(const uint16_t* src_uv,
int src_stride_uv,
int src_width,
int src_height,
uint16_t* dst_uv,
int dst_stride_uv,
int dst_width,
int dst_height,
enum FilterMode filtering) {
int dy = 0;
if (!src_uv || src_width == 0 || src_height == 0 || src_width > 32768 ||
src_height > 32768 || !dst_uv || dst_width <= 0 || dst_height <= 0) {
return -1;
}
// UV does not support box filter yet, but allow the user to pass it.
// Simplify filtering when possible.
filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height,
filtering);
// Negative src_height means invert the image.
if (src_height < 0) {
src_height = -src_height;
src_uv = src_uv + (src_height - 1) * src_stride_uv;
src_stride_uv = -src_stride_uv;
}
src_width = Abs(src_width);
#ifdef HAS_UVCOPY
if (!filtering && src_width == dst_width && (src_height % dst_height == 0)) {
if (dst_height == 1) {
UVCopy_16(src_uv + ((src_height - 1) / 2) * src_stride_uv, src_stride_uv,
dst_uv, dst_stride_uv, dst_width, dst_height);
} else {
dy = src_height / dst_height;
UVCopy_16(src_uv + src_stride_uv * ((dy - 1) / 2), src_stride_uv * dy,
dst_uv, dst_stride_uv, dst_width, dst_height);
}
return 0;
}
#endif
if (filtering && (dst_width + 1) / 2 == src_width) {
ScaleUVLinearUp2_16(src_width, src_height, dst_width, dst_height,
src_stride_uv, dst_stride_uv, src_uv, dst_uv);
return 0;
}
if ((dst_height + 1) / 2 == src_height && (dst_width + 1) / 2 == src_width &&
(filtering == kFilterBilinear || filtering == kFilterBox)) {
ScaleUVBilinearUp2_16(src_width, src_height, dst_width, dst_height,
src_stride_uv, dst_stride_uv, src_uv, dst_uv);
return 0;
}
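// Other scale factors are not implemented yet.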
return -1;
}
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv


@ -377,89 +377,119 @@ TESTPLANARTOBP(I444, 1, 1, NV12, 2, 2)
TESTPLANARTOBP(I444, 1, 1, NV21, 2, 2)
TESTPLANARTOBP(I400, 2, 2, NV21, 2, 2)
#define TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, \
OFF, DOY) \
TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
const int kHeight = benchmark_height_; \
align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
align_buffer_page_end(src_uv, 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \
SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \
OFF); \
align_buffer_page_end(dst_y_c, kWidth* kHeight); \
align_buffer_page_end(dst_uv_c, 2 * SUBSAMPLE(kWidth, SUBSAMP_X) * \
SUBSAMPLE(kHeight, SUBSAMP_Y)); \
align_buffer_page_end(dst_y_opt, kWidth* kHeight); \
align_buffer_page_end(dst_uv_opt, 2 * SUBSAMPLE(kWidth, SUBSAMP_X) * \
SUBSAMPLE(kHeight, SUBSAMP_Y)); \
for (int i = 0; i < kHeight; ++i) \
for (int j = 0; j < kWidth; ++j) \
src_y[i * kWidth + j + OFF] = (fastrand() & 0xff); \
for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \
for (int j = 0; j < 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \
src_uv[(i * 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \
(fastrand() & 0xff); \
} \
} \
memset(dst_y_c, 1, kWidth* kHeight); \
memset(dst_uv_c, 2, \
2 * SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
memset(dst_y_opt, 101, kWidth* kHeight); \
memset(dst_uv_opt, 102, \
2 * SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
MaskCpuFlags(disable_cpu_flags_); \
SRC_FMT_PLANAR##To##FMT_PLANAR( \
src_y + OFF, kWidth, src_uv + OFF, \
2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X), DOY ? dst_y_c : NULL, kWidth, \
dst_uv_c, 2 * SUBSAMPLE(kWidth, SUBSAMP_X), kWidth, NEG kHeight); \
MaskCpuFlags(benchmark_cpu_info_); \
for (int i = 0; i < benchmark_iterations_; ++i) { \
SRC_FMT_PLANAR##To##FMT_PLANAR( \
src_y + OFF, kWidth, src_uv + OFF, \
2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X), DOY ? dst_y_opt : NULL, \
kWidth, dst_uv_opt, 2 * SUBSAMPLE(kWidth, SUBSAMP_X), kWidth, \
NEG kHeight); \
} \
if (DOY) { \
for (int i = 0; i < kHeight; ++i) { \
for (int j = 0; j < kWidth; ++j) { \
EXPECT_EQ(dst_y_c[i * kWidth + j], dst_y_opt[i * kWidth + j]); \
} \
} \
} \
for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
for (int j = 0; j < 2 * SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
EXPECT_EQ(dst_uv_c[i * 2 * SUBSAMPLE(kWidth, SUBSAMP_X) + j], \
dst_uv_opt[i * 2 * SUBSAMPLE(kWidth, SUBSAMP_X) + j]); \
} \
} \
free_aligned_buffer_page_end(dst_y_c); \
free_aligned_buffer_page_end(dst_uv_c); \
free_aligned_buffer_page_end(dst_y_opt); \
free_aligned_buffer_page_end(dst_uv_opt); \
free_aligned_buffer_page_end(src_y); \
free_aligned_buffer_page_end(src_uv); \
#define TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \
DST_SUBSAMP_X, DST_SUBSAMP_Y, W1280, N, NEG, OFF, \
DOY, SRC_DEPTH) \
TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
static_assert(SRC_BPC == 1 || SRC_BPC == 2, "SRC BPC unsupported"); \
static_assert(DST_BPC == 1 || DST_BPC == 2, "DST BPC unsupported"); \
static_assert(SRC_SUBSAMP_X == 1 || SRC_SUBSAMP_X == 2, \
"DST SRC_SUBSAMP_X unsupported"); \
static_assert(SRC_SUBSAMP_Y == 1 || SRC_SUBSAMP_Y == 2, \
"DST SRC_SUBSAMP_Y unsupported"); \
static_assert(DST_SUBSAMP_X == 1 || DST_SUBSAMP_X == 2, \
"DST DST_SUBSAMP_X unsupported"); \
static_assert(DST_SUBSAMP_Y == 1 || DST_SUBSAMP_Y == 2, \
"DST DST_SUBSAMP_Y unsupported"); \
const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
const int kHeight = benchmark_height_; \
const int kSrcHalfWidth = SUBSAMPLE(kWidth, SRC_SUBSAMP_X); \
const int kSrcHalfHeight = SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); \
const int kDstHalfWidth = SUBSAMPLE(kWidth, DST_SUBSAMP_X); \
const int kDstHalfHeight = SUBSAMPLE(kHeight, DST_SUBSAMP_Y); \
align_buffer_page_end(src_y, kWidth* kHeight* SRC_BPC + OFF); \
align_buffer_page_end(src_uv, \
2 * kSrcHalfWidth * kSrcHalfHeight * SRC_BPC + OFF); \
align_buffer_page_end(dst_y_c, kWidth* kHeight* DST_BPC); \
align_buffer_page_end(dst_uv_c, \
2 * kDstHalfWidth * kDstHalfHeight * DST_BPC); \
align_buffer_page_end(dst_y_opt, kWidth* kHeight* DST_BPC); \
align_buffer_page_end(dst_uv_opt, \
2 * kDstHalfWidth * kDstHalfHeight * DST_BPC); \
MemRandomize(src_y + OFF, kWidth * kHeight * SRC_BPC); \
MemRandomize(src_uv + OFF, 2 * kSrcHalfWidth * kSrcHalfHeight * SRC_BPC); \
SRC_T* src_y_p = reinterpret_cast<SRC_T*>(src_y + OFF); \
SRC_T* src_uv_p = reinterpret_cast<SRC_T*>(src_uv + OFF); \
for (int i = 0; i < kWidth * kHeight; ++i) { \
src_y_p[i] = src_y_p[i] & ((1 << SRC_DEPTH) - 1); \
} \
for (int i = 0; i < 2 * kSrcHalfWidth * kSrcHalfHeight; ++i) { \
src_uv_p[i] = src_uv_p[i] & ((1 << SRC_DEPTH) - 1); \
} \
memset(dst_y_c, 1, kWidth* kHeight* DST_BPC); \
memset(dst_uv_c, 2, 2 * kDstHalfWidth * kDstHalfHeight * DST_BPC); \
memset(dst_y_opt, 101, kWidth* kHeight* DST_BPC); \
memset(dst_uv_opt, 102, 2 * kDstHalfWidth * kDstHalfHeight * DST_BPC); \
MaskCpuFlags(disable_cpu_flags_); \
SRC_FMT_PLANAR##To##FMT_PLANAR( \
src_y_p, kWidth, src_uv_p, 2 * kSrcHalfWidth, \
DOY ? reinterpret_cast<DST_T*>(dst_y_c) : NULL, kWidth, \
reinterpret_cast<DST_T*>(dst_uv_c), 2 * kDstHalfWidth, \
kWidth, NEG kHeight); \
MaskCpuFlags(benchmark_cpu_info_); \
for (int i = 0; i < benchmark_iterations_; ++i) { \
SRC_FMT_PLANAR##To##FMT_PLANAR( \
src_y_p, kWidth, src_uv_p, 2 * kSrcHalfWidth, \
DOY ? reinterpret_cast<DST_T*>(dst_y_opt) : NULL, kWidth, \
reinterpret_cast<DST_T*>(dst_uv_opt), 2 * kDstHalfWidth, \
kWidth, NEG kHeight); \
} \
if (DOY) { \
for (int i = 0; i < kHeight; ++i) { \
for (int j = 0; j < kWidth; ++j) { \
EXPECT_EQ(dst_y_c[i * kWidth + j], dst_y_opt[i * kWidth + j]); \
} \
} \
} \
for (int i = 0; i < kDstHalfHeight; ++i) { \
for (int j = 0; j < 2 * kDstHalfWidth; ++j) { \
EXPECT_EQ(dst_uv_c[i * 2 * kDstHalfWidth + j], \
dst_uv_opt[i * 2 * kDstHalfWidth + j]); \
} \
} \
free_aligned_buffer_page_end(dst_y_c); \
free_aligned_buffer_page_end(dst_uv_c); \
free_aligned_buffer_page_end(dst_y_opt); \
free_aligned_buffer_page_end(dst_uv_opt); \
free_aligned_buffer_page_end(src_y); \
free_aligned_buffer_page_end(src_uv); \
}
#define TESTBIPLANARTOBP(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \
TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
SUBSAMP_X, SUBSAMP_Y, benchmark_width_ - 4, _Any, +, 0, 1) \
TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Unaligned, +, 1, \
1) \
TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Invert, -, 0, 1) \
TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Opt, +, 0, 1) \
TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _NullY, +, 0, 0)
#define TESTBIPLANARTOBP(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \
DST_SUBSAMP_X, DST_SUBSAMP_Y, SRC_DEPTH) \
TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
DST_SUBSAMP_Y, benchmark_width_ - 4, _Any, +, 0, 1, \
SRC_DEPTH) \
TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
DST_SUBSAMP_Y, benchmark_width_, _Unaligned, +, 1, 1, \
SRC_DEPTH) \
TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
DST_SUBSAMP_Y, benchmark_width_, _Invert, -, 0, 1, \
SRC_DEPTH) \
TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
DST_SUBSAMP_Y, benchmark_width_, _Opt, +, 0, 1, SRC_DEPTH) \
TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
DST_SUBSAMP_Y, benchmark_width_, _NullY, +, 0, 0, \
SRC_DEPTH)
TESTBIPLANARTOBP(NV21, 2, 2, NV12, 2, 2)
TESTBIPLANARTOBP(NV12, 2, 2, NV12Mirror, 2, 2)
TESTBIPLANARTOBP(NV12, 2, 2, NV24, 1, 1)
TESTBIPLANARTOBP(NV16, 2, 1, NV24, 1, 1)
TESTBIPLANARTOBP(NV21, uint8_t, 1, 2, 2, NV12, uint8_t, 1, 2, 2, 8)
TESTBIPLANARTOBP(NV12, uint8_t, 1, 2, 2, NV12Mirror, uint8_t, 1, 2, 2, 8)
TESTBIPLANARTOBP(NV12, uint8_t, 1, 2, 2, NV24, uint8_t, 1, 1, 1, 8)
TESTBIPLANARTOBP(NV16, uint8_t, 1, 2, 1, NV24, uint8_t, 1, 1, 1, 8)
// These formats put the data in the high bits, so test the full 16 bit range.
TESTBIPLANARTOBP(P010, uint16_t, 2, 2, 2, P410, uint16_t, 2, 1, 1, 16)
TESTBIPLANARTOBP(P210, uint16_t, 2, 2, 1, P410, uint16_t, 2, 1, 1, 16)
TESTBIPLANARTOBP(P012, uint16_t, 2, 2, 2, P412, uint16_t, 2, 1, 1, 16)
TESTBIPLANARTOBP(P212, uint16_t, 2, 2, 1, P412, uint16_t, 2, 1, 1, 16)
TESTBIPLANARTOBP(P016, uint16_t, 2, 2, 2, P416, uint16_t, 2, 1, 1, 16)
TESTBIPLANARTOBP(P216, uint16_t, 2, 2, 1, P416, uint16_t, 2, 1, 1, 16)
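// For reference, each TESTBIPLANARTOBP instantiation above expands into five
// gtest cases; e.g. the P010 line defines P010ToP410_Any, _Unaligned,
// _Invert, _Opt and _NullY, all of which match the
// --gtest_filter=*ToP41* command given in the commit message.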
#define TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF, \