Add special optimization for I420ToI444 and I422ToI444

These functions use a (bi)linear filter to scale the U and V planes up to the size of the Y plane.
This helps improve the quality of YUV to RGB conversion.

Also added 10-bit and 12-bit versions:
I010ToI410
I210ToI410
I012ToI412
I212ToI412

libyuv_unittest --gtest_filter=LibYUVConvertTest.I42*ToI444*:LibYUVConvertTest.I*1*ToI41*
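
A minimal usage sketch (illustrative only, not part of this change; the wrapper name
and tightly packed strides are assumptions) for one of the new 10-bit entry points
declared in libyuv/convert.h. For the 16-bit APIs, strides are in counts of uint16_t,
so a tightly packed plane uses a stride equal to its width:

// Hypothetical caller: converts an I010 frame to I410 (4:4:4, 10-bit),
// upsampling the half-resolution chroma planes with the bilinear path.
#include "libyuv/convert.h"

int UpsampleI010ToI410(const uint16_t* src_y, int src_stride_y,
                       const uint16_t* src_u, int src_stride_u,
                       const uint16_t* src_v, int src_stride_v,
                       uint16_t* dst_y, uint16_t* dst_u, uint16_t* dst_v,
                       int width, int height) {
  // All three I410 destination planes are width x height.
  return I010ToI410(src_y, src_stride_y, src_u, src_stride_u, src_v,
                    src_stride_v, dst_y, width, dst_u, width, dst_v, width,
                    width, height);
}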

R=fbarchard@chromium.org

Change-Id: Ie4a711a5ba28f2ff1f44c021f7a5c149022264c5
Bug: libyuv:872
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2658097
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Yuan Tong 2021-02-03 14:21:07 +08:00 committed by Frank Barchard
parent c28d404936
commit fc61dde1eb
13 changed files with 2155 additions and 30 deletions

View File

@@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1772
Version: 1774
License: BSD
License File: LICENSE

View File

@@ -89,6 +89,23 @@ int I422ToI420(const uint8_t* src_y,
int width,
int height);
// Convert I422 to I444.
LIBYUV_API
int I422ToI444(const uint8_t* src_y,
int src_stride_y,
const uint8_t* src_u,
int src_stride_u,
const uint8_t* src_v,
int src_stride_v,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_u,
int dst_stride_u,
uint8_t* dst_v,
int dst_stride_v,
int width,
int height);
// Convert I422 to NV21.
LIBYUV_API
int I422ToNV21(const uint8_t* src_y,
@@ -122,6 +139,23 @@ int I420Copy(const uint8_t* src_y,
int width,
int height);
// Convert I420 to I444.
LIBYUV_API
int I420ToI444(const uint8_t* src_y,
int src_stride_y,
const uint8_t* src_u,
int src_stride_u,
const uint8_t* src_v,
int src_stride_v,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_u,
int dst_stride_u,
uint8_t* dst_v,
int dst_stride_v,
int width,
int height);
// Copy I010 to I010
#define I010ToI010 I010Copy
#define H010ToH010 I010Copy
@@ -159,6 +193,46 @@ int I010ToI420(const uint16_t* src_y,
int width,
int height);
// Convert I010 to I410
LIBYUV_API
int I010ToI410(const uint16_t* src_y,
int src_stride_y,
const uint16_t* src_u,
int src_stride_u,
const uint16_t* src_v,
int src_stride_v,
uint16_t* dst_y,
int dst_stride_y,
uint16_t* dst_u,
int dst_stride_u,
uint16_t* dst_v,
int dst_stride_v,
int width,
int height);
// Convert I012 to I412
#define I012ToI412 I010ToI410
// Convert I210 to I410
LIBYUV_API
int I210ToI410(const uint16_t* src_y,
int src_stride_y,
const uint16_t* src_u,
int src_stride_u,
const uint16_t* src_v,
int src_stride_v,
uint16_t* dst_y,
int dst_stride_y,
uint16_t* dst_u,
int dst_stride_u,
uint16_t* dst_v,
int dst_stride_v,
int width,
int height);
// Convert I212 to I412
#define I212ToI412 I210ToI410
// Convert I400 (grey) to I420.
LIBYUV_API
int I400ToI420(const uint8_t* src_y,

View File

@@ -49,6 +49,18 @@ void ScalePlane_16(const uint16_t* src,
int dst_height,
enum FilterMode filtering);
// Sample is expected to be in the low 12 bits.
LIBYUV_API
void ScalePlane_12(const uint16_t* src,
int src_stride,
int src_width,
int src_height,
uint16_t* dst,
int dst_stride,
int dst_width,
int dst_height,
enum FilterMode filtering);
// Scales a YUV 4:2:0 image from the src width and height to the
// dst width and height.
// If filtering is kFilterNone, a simple nearest-neighbor algorithm is

View File

@@ -77,6 +77,12 @@ extern "C" {
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
#define HAS_SCALEUVROWDOWN2BOX_SSSE3
#define HAS_SCALECOLUP2LINEAR_SSE2
#define HAS_SCALECOLUP2LINEAR_SSSE3
#define HAS_SCALEROWUP2LINEAR_SSE2
#define HAS_SCALEROWUP2LINEAR_SSSE3
#define HAS_SCALECOLUP2LINEAR_16_SSE2
#define HAS_SCALEROWUP2LINEAR_16_SSE2
#endif
// The following are available for gcc/clang x86 platforms, but
@@ -86,6 +92,10 @@ extern "C" {
(defined(__x86_64__) || defined(__i386__)) && !defined(_MSC_VER) && \
(defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
#define HAS_SCALEUVROWDOWN2BOX_AVX2
#define HAS_SCALECOLUP2LINEAR_AVX2
#define HAS_SCALEROWUP2LINEAR_AVX2
#define HAS_SCALECOLUP2LINEAR_16_AVX2
#define HAS_SCALEROWUP2LINEAR_16_AVX2
#endif
// The following are available on all x86 platforms, but
@@ -114,6 +124,10 @@ extern "C" {
#define HAS_SCALEROWDOWN4_NEON
#define HAS_SCALEUVROWDOWN2BOX_NEON
#define HAS_SCALEUVROWDOWNEVEN_NEON
#define HAS_SCALECOLUP2LINEAR_NEON
#define HAS_SCALEROWUP2LINEAR_NEON
#define HAS_SCALECOLUP2LINEAR_16_NEON
#define HAS_SCALEROWUP2LINEAR_16_NEON
#endif
#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
@@ -279,6 +293,40 @@ void ScaleRowDown34_1_Box_16_C(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* d,
int dst_width);
void ScaleRowUp2_Linear_C(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width);
void ScaleRowUp2_Bilinear_C(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleRowUp2_Linear_16_C(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width);
void ScaleRowUp2_Bilinear_16_C(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleRowUp2_Linear_Any_C(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width);
void ScaleRowUp2_Bilinear_Any_C(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleRowUp2_Linear_16_Any_C(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width);
void ScaleRowUp2_Bilinear_16_Any_C(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleCols_C(uint8_t* dst_ptr,
const uint8_t* src_ptr,
int dst_width,
@@ -508,6 +556,88 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width);
void ScaleRowUp2_Linear_SSE2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width);
void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width);
void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width);
void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleRowUp2_Linear_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width);
void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleRowUp2_Linear_16_AVX2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width);
void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleRowUp2_Linear_Any_SSE2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width);
void ScaleRowUp2_Bilinear_Any_SSE2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleRowUp2_Linear_16_Any_SSE2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width);
void ScaleRowUp2_Bilinear_16_Any_SSE2(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleRowUp2_Linear_Any_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width);
void ScaleRowUp2_Bilinear_Any_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleRowUp2_Linear_Any_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width);
void ScaleRowUp2_Bilinear_Any_AVX2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleRowUp2_Linear_16_Any_AVX2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width);
void ScaleRowUp2_Bilinear_16_Any_AVX2(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleRowDown2_Any_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
@@ -1143,6 +1273,39 @@ void ScaleRowDown38_2_Box_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width);
void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width);
void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width);
void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleRowUp2_Linear_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width);
void ScaleRowUp2_Bilinear_Any_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleRowUp2_Linear_16_Any_NEON(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width);
void ScaleRowUp2_Bilinear_16_Any_NEON(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleAddRow_NEON(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width);
void ScaleAddRow_Any_NEON(const uint8_t* src_ptr,
uint16_t* dst_ptr,

View File

@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1773
#define LIBYUV_VERSION 1774
#endif // INCLUDE_LIBYUV_VERSION_H_

View File

@@ -159,6 +159,102 @@ int I420ToI444(const uint8_t* src_y,
dst_uv_height);
}
// 420 chroma to 444 chroma, 10/12 bit version
LIBYUV_API
int I010ToI410(const uint16_t* src_y,
int src_stride_y,
const uint16_t* src_u,
int src_stride_u,
const uint16_t* src_v,
int src_stride_v,
uint16_t* dst_y,
int dst_stride_y,
uint16_t* dst_u,
int dst_stride_u,
uint16_t* dst_v,
int dst_stride_v,
int width,
int height) {
if (width == 0 || height == 0) {
return -1;
}
if (dst_y) {
ScalePlane_12(src_y, src_stride_y, width, height, dst_y, dst_stride_y,
Abs(width), Abs(height), kFilterBilinear);
}
ScalePlane_12(src_u, src_stride_u, SUBSAMPLE(width, 1, 1),
SUBSAMPLE(height, 1, 1), dst_u, dst_stride_u, Abs(width),
Abs(height), kFilterBilinear);
ScalePlane_12(src_v, src_stride_v, SUBSAMPLE(width, 1, 1),
SUBSAMPLE(height, 1, 1), dst_v, dst_stride_v, Abs(width),
Abs(height), kFilterBilinear);
return 0;
}
// 422 chroma to 444 chroma, 10/12 bit version
LIBYUV_API
int I210ToI410(const uint16_t* src_y,
int src_stride_y,
const uint16_t* src_u,
int src_stride_u,
const uint16_t* src_v,
int src_stride_v,
uint16_t* dst_y,
int dst_stride_y,
uint16_t* dst_u,
int dst_stride_u,
uint16_t* dst_v,
int dst_stride_v,
int width,
int height) {
if (width == 0 || height == 0) {
return -1;
}
if (dst_y) {
ScalePlane_12(src_y, src_stride_y, width, height, dst_y, dst_stride_y,
Abs(width), Abs(height), kFilterBilinear);
}
ScalePlane_12(src_u, src_stride_u, SUBSAMPLE(width, 1, 1), height, dst_u,
dst_stride_u, Abs(width), Abs(height), kFilterBilinear);
ScalePlane_12(src_v, src_stride_v, SUBSAMPLE(width, 1, 1), height, dst_v,
dst_stride_v, Abs(width), Abs(height), kFilterBilinear);
return 0;
}
// 422 chroma is 1/2 width, 1x height
// 444 chroma is 1x width, 1x height
LIBYUV_API
int I422ToI444(const uint8_t* src_y,
int src_stride_y,
const uint8_t* src_u,
int src_stride_u,
const uint8_t* src_v,
int src_stride_v,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_u,
int dst_stride_u,
uint8_t* dst_v,
int dst_stride_v,
int width,
int height) {
if (width == 0 || height == 0) {
return -1;
}
if (dst_y) {
ScalePlane(src_y, src_stride_y, width, height, dst_y, dst_stride_y,
Abs(width), Abs(height), kFilterBilinear);
}
ScalePlane(src_u, src_stride_u, SUBSAMPLE(width, 1, 1), height, dst_u,
dst_stride_u, Abs(width), Abs(height), kFilterBilinear);
ScalePlane(src_v, src_stride_v, SUBSAMPLE(width, 1, 1), height, dst_v,
dst_stride_v, Abs(width), Abs(height), kFilterBilinear);
return 0;
}
// Copy to I400. Source can be I420,422,444,400,NV12,NV21
LIBYUV_API
int I400Copy(const uint8_t* src_y,

View File

@@ -1336,6 +1336,238 @@ void ScalePlaneBilinearUp(int src_width,
}
}
// Scale plane, horizontally up by 2 times, vertically by any ratio.
// Uses a linear filter horizontally and nearest-neighbor vertically.
// This is an optimized version for scaling a plane up to 2 times its
// original width, using linear interpolation.
// This is used to scale the U and V planes of I422 to I444.
void ScalePlaneUp2_Linear(int src_width,
int src_height,
int dst_width,
int dst_height,
int src_stride,
int dst_stride,
const uint8_t* src_ptr,
uint8_t* dst_ptr) {
void (*ScaleRowUp)(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) =
ScaleRowUp2_Linear_Any_C;
int i;
int y;
int dy;
// This function can only scale up by 2 times horizontally.
assert(src_width * 2 == dst_width || src_width * 2 == dst_width + 1);
#ifdef HAS_SCALEROWUP2LINEAR_SSE2
if (TestCpuFlag(kCpuHasSSE2)) {
ScaleRowUp = ScaleRowUp2_Linear_Any_SSE2;
}
#endif
#ifdef HAS_SCALEROWUP2LINEAR_SSSE3
if (TestCpuFlag(kCpuHasSSSE3)) {
ScaleRowUp = ScaleRowUp2_Linear_Any_SSSE3;
}
#endif
#ifdef HAS_SCALEROWUP2LINEAR_AVX2
if (TestCpuFlag(kCpuHasAVX2)) {
ScaleRowUp = ScaleRowUp2_Linear_Any_AVX2;
}
#endif
#ifdef HAS_SCALEROWUP2LINEAR_NEON
if (TestCpuFlag(kCpuHasNEON)) {
ScaleRowUp = ScaleRowUp2_Linear_Any_NEON;
}
#endif
if (dst_height == 1) {
ScaleRowUp(src_ptr + ((src_height - 1) / 2) * src_stride, dst_ptr,
dst_width);
} else {
dy = FixedDiv(src_height - 1, dst_height - 1);
y = (1 << 15) - 1;
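// y starts just below 0.5 in 16.16 fixed point, so (y >> 16) selects the
// nearest source row for each destination row.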
for (i = 0; i < dst_height; ++i) {
ScaleRowUp(src_ptr + (y >> 16) * src_stride, dst_ptr, dst_width);
dst_ptr += dst_stride;
y += dy;
}
}
}
// Scale plane, up by 2 times in both dimensions.
// This is an optimized version for scaling a plane up to 2 times its
// original size, using bilinear interpolation.
// This is used to scale the U and V planes of I420 to I444.
void ScalePlaneUp2_Bilinear(int src_width,
int src_height,
int dst_width,
int dst_height,
int src_stride,
int dst_stride,
const uint8_t* src_ptr,
uint8_t* dst_ptr) {
void (*Scale2RowUp)(const uint8_t* src_ptr, ptrdiff_t src_stride,
uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) =
ScaleRowUp2_Bilinear_Any_C;
int x;
// This function can only scale up by 2 times.
assert(src_width * 2 == dst_width || src_width * 2 == dst_width + 1);
assert(src_height * 2 == dst_height || src_height * 2 == dst_height + 1);
#ifdef HAS_SCALEROWUP2LINEAR_SSE2
if (TestCpuFlag(kCpuHasSSE2)) {
Scale2RowUp = ScaleRowUp2_Bilinear_Any_SSE2;
}
#endif
#ifdef HAS_SCALEROWUP2LINEAR_SSSE3
if (TestCpuFlag(kCpuHasSSSE3)) {
Scale2RowUp = ScaleRowUp2_Bilinear_Any_SSSE3;
}
#endif
#ifdef HAS_SCALEROWUP2LINEAR_AVX2
if (TestCpuFlag(kCpuHasAVX2)) {
Scale2RowUp = ScaleRowUp2_Bilinear_Any_AVX2;
}
#endif
#ifdef HAS_SCALEROWUP2LINEAR_NEON
if (TestCpuFlag(kCpuHasNEON)) {
Scale2RowUp = ScaleRowUp2_Bilinear_Any_NEON;
}
#endif
if (src_height == 1) {
Scale2RowUp(src_ptr, 0, dst_ptr, dst_stride, dst_width);
} else {
Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
dst_ptr += dst_stride;
for (x = 0; x < src_height - 1; ++x) {
Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width);
src_ptr += src_stride;
// TODO test performance of writing one row of destination at a time
dst_ptr += 2 * dst_stride;
}
if (!(dst_height & 1)) {
Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
}
}
}
// Scale an at most 14-bit plane, horizontally up by 2 times.
// This is an optimized version for scaling a plane up to 2 times its
// original width, using linear interpolation.
// Strides are in counts of uint16_t.
// This is used to scale the U and V planes of I210 to I410 and I212 to I412.
void ScalePlaneUp2_16_Linear(int src_width,
int src_height,
int dst_width,
int dst_height,
int src_stride,
int dst_stride,
const uint16_t* src_ptr,
uint16_t* dst_ptr) {
void (*ScaleRowUp)(const uint16_t* src_ptr, uint16_t* dst_ptr,
int dst_width) = ScaleRowUp2_Linear_16_Any_C;
int i;
int y;
int dy;
// This function can only scale up by 2 times horizontally.
assert(src_width * 2 == dst_width || src_width * 2 == dst_width + 1);
#ifdef HAS_SCALEROWUP2LINEAR_SSE2
if (TestCpuFlag(kCpuHasSSE2)) {
ScaleRowUp = ScaleRowUp2_Linear_16_Any_SSE2;
}
#endif
#ifdef HAS_SCALEROWUP2LINEAR_AVX2
if (TestCpuFlag(kCpuHasAVX2)) {
ScaleRowUp = ScaleRowUp2_Linear_16_Any_AVX2;
}
#endif
#ifdef HAS_SCALEROWUP2LINEAR_NEON
if (TestCpuFlag(kCpuHasNEON)) {
ScaleRowUp = ScaleRowUp2_Linear_16_Any_NEON;
}
#endif
if (dst_height == 1) {
ScaleRowUp(src_ptr + ((src_height - 1) / 2) * src_stride, dst_ptr,
dst_width);
} else {
dy = FixedDiv(src_height - 1, dst_height - 1);
y = (1 << 15) - 1;
for (i = 0; i < dst_height; ++i) {
ScaleRowUp(src_ptr + (y >> 16) * src_stride, dst_ptr, dst_width);
dst_ptr += dst_stride;
y += dy;
}
}
}
// Scale an at most 12-bit plane, up by 2 times in both dimensions.
// This is an optimized version for scaling a plane up to 2 times its
// original size, using bilinear interpolation.
// Strides are in counts of uint16_t.
// This is used to scale the U and V planes of I010 to I410 and I012 to I412.
void ScalePlaneUp2_16_Bilinear(int src_width,
int src_height,
int dst_width,
int dst_height,
int src_stride,
int dst_stride,
const uint16_t* src_ptr,
uint16_t* dst_ptr) {
void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride,
uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) =
ScaleRowUp2_Bilinear_16_Any_C;
int x;
// This function can only scale up by 2 times.
assert(src_width * 2 == dst_width || src_width * 2 == dst_width + 1);
assert(src_height * 2 == dst_height || src_height * 2 == dst_height + 1);
#ifdef HAS_SCALEROWUP2LINEAR_SSE2
if (TestCpuFlag(kCpuHasSSE2)) {
Scale2RowUp = ScaleRowUp2_Bilinear_16_Any_SSE2;
}
#endif
#ifdef HAS_SCALEROWUP2LINEAR_AVX2
if (TestCpuFlag(kCpuHasAVX2)) {
Scale2RowUp = ScaleRowUp2_Bilinear_16_Any_AVX2;
}
#endif
#ifdef HAS_SCALEROWUP2LINEAR_NEON
if (TestCpuFlag(kCpuHasNEON)) {
Scale2RowUp = ScaleRowUp2_Bilinear_16_Any_NEON;
}
#endif
if (src_height == 1) {
Scale2RowUp(src_ptr, 0, dst_ptr, dst_stride, dst_width);
} else {
Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
dst_ptr += dst_stride;
for (x = 0; x < src_height - 1; ++x) {
Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width);
src_ptr += src_stride;
dst_ptr += 2 * dst_stride;
}
if (!(dst_height & 1)) {
Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
}
}
}
void ScalePlaneBilinearUp_16(int src_width,
int src_height,
int dst_width,
@@ -1627,6 +1859,17 @@ void ScalePlane(const uint8_t* src,
dst_stride, src, dst);
return;
}
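// Fast path: the destination width is exactly 2x (or 2x - 1) the source
// width and only a horizontal linear filter is requested.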
if ((dst_width + 1) / 2 == src_width && filtering == kFilterLinear) {
ScalePlaneUp2_Linear(src_width, src_height, dst_width, dst_height,
src_stride, dst_stride, src, dst);
return;
}
if ((dst_height + 1) / 2 == src_height && (dst_width + 1) / 2 == src_width &&
(filtering == kFilterBilinear || filtering == kFilterBox)) {
ScalePlaneUp2_Bilinear(src_width, src_height, dst_width, dst_height,
src_stride, dst_stride, src, dst);
return;
}
if (filtering && dst_height > src_height) {
ScalePlaneBilinearUp(src_width, src_height, dst_width, dst_height,
src_stride, dst_stride, src, dst, filtering);
@@ -1724,6 +1967,43 @@ void ScalePlane_16(const uint16_t* src,
dst_stride, src, dst);
}
LIBYUV_API
void ScalePlane_12(const uint16_t* src,
int src_stride,
int src_width,
int src_height,
uint16_t* dst,
int dst_stride,
int dst_width,
int dst_height,
enum FilterMode filtering) {
// Simplify filtering when possible.
filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height,
filtering);
// Negative height means invert the image.
if (src_height < 0) {
src_height = -src_height;
src = src + (src_height - 1) * src_stride;
src_stride = -src_stride;
}
if ((dst_width + 1) / 2 == src_width && filtering == kFilterLinear) {
ScalePlaneUp2_16_Linear(src_width, src_height, dst_width, dst_height,
src_stride, dst_stride, src, dst);
return;
}
if ((dst_height + 1) / 2 == src_height && (dst_width + 1) / 2 == src_width &&
(filtering == kFilterBilinear || filtering == kFilterBox)) {
ScalePlaneUp2_16_Bilinear(src_width, src_height, dst_width, dst_height,
src_stride, dst_stride, src, dst);
return;
}
ScalePlane_16(src, src_stride, src_width, src_height, dst, dst_stride,
dst_width, dst_height, filtering);
}
// Scale an I420 image.
// This function in turn calls a scaling function for each plane.

View File

@@ -609,6 +609,191 @@ CANY(ScaleARGBFilterCols_Any_MSA,
#endif
#undef CANY
// Scale up horizontally 2 times using linear filter.
#define SUH2LANY(NAME, SIMD, C, MASK, PTYPE) \
void NAME(const PTYPE* src_ptr, PTYPE* dst_ptr, int dst_width) { \
int work_width = (dst_width - 1) & ~1; \
int r = work_width & MASK; \
int n = work_width & ~MASK; \
dst_ptr[0] = src_ptr[0]; \
if (work_width > 0) { \
if (n != 0) { \
SIMD(src_ptr, dst_ptr + 1, n); \
} \
C(src_ptr + (n / 2), dst_ptr + n + 1, r); \
} \
dst_ptr[dst_width - 1] = src_ptr[(dst_width / 2) - 1]; \
}
// Even the C version needs to be wrapped, because the boundary pixels have to
// be handled differently.
SUH2LANY(ScaleRowUp2_Linear_Any_C,
ScaleRowUp2_Linear_C,
ScaleRowUp2_Linear_C,
0,
uint8_t)
SUH2LANY(ScaleRowUp2_Linear_16_Any_C,
ScaleRowUp2_Linear_16_C,
ScaleRowUp2_Linear_16_C,
0,
uint16_t)
#ifdef HAS_SCALECOLUP2LINEAR_SSE2
SUH2LANY(ScaleRowUp2_Linear_Any_SSE2,
ScaleRowUp2_Linear_SSE2,
ScaleRowUp2_Linear_C,
15,
uint8_t)
#endif
#ifdef HAS_SCALECOLUP2LINEAR_SSSE3
SUH2LANY(ScaleRowUp2_Linear_Any_SSSE3,
ScaleRowUp2_Linear_SSSE3,
ScaleRowUp2_Linear_C,
15,
uint8_t)
#endif
#ifdef HAS_SCALECOLUP2LINEAR_16_SSE2
SUH2LANY(ScaleRowUp2_Linear_16_Any_SSE2,
ScaleRowUp2_Linear_16_SSE2,
ScaleRowUp2_Linear_16_C,
15,
uint16_t)
#endif
#ifdef HAS_SCALECOLUP2LINEAR_AVX2
SUH2LANY(ScaleRowUp2_Linear_Any_AVX2,
ScaleRowUp2_Linear_AVX2,
ScaleRowUp2_Linear_C,
31,
uint8_t)
#endif
#ifdef HAS_SCALECOLUP2LINEAR_16_AVX2
SUH2LANY(ScaleRowUp2_Linear_16_Any_AVX2,
ScaleRowUp2_Linear_16_AVX2,
ScaleRowUp2_Linear_16_C,
15,
uint16_t)
#endif
#ifdef HAS_SCALECOLUP2LINEAR_NEON
SUH2LANY(ScaleRowUp2_Linear_Any_NEON,
ScaleRowUp2_Linear_NEON,
ScaleRowUp2_Linear_C,
15,
uint8_t)
#endif
#ifdef HAS_SCALECOLUP2LINEAR_16_NEON
SUH2LANY(ScaleRowUp2_Linear_16_Any_NEON,
ScaleRowUp2_Linear_16_NEON,
ScaleRowUp2_Linear_16_C,
15,
uint16_t)
#endif
#undef SUH2LANY
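(Side note, not part of the diff: in the wrapper above, the first and last
destination samples coincide with source samples and are copied directly, while
the 3:1 kernel fills only the interior. A hypothetical standalone check of that
edge handling for dst_width = 8, assuming nothing beyond the math above:)

#include <assert.h>
#include <stdint.h>

static void up2_linear_ref(const uint8_t* src, uint8_t* dst, int dst_width) {
  int work_width = (dst_width - 1) & ~1;          // interior samples: 6 of 8
  dst[0] = src[0];                                // left edge, copied
  for (int x = 0; x < work_width / 2; ++x) {      // 3:1 linear pairs
    dst[2 * x + 1] = (uint8_t)((src[x] * 3 + src[x + 1] + 2) >> 2);
    dst[2 * x + 2] = (uint8_t)((src[x] + src[x + 1] * 3 + 2) >> 2);
  }
  dst[dst_width - 1] = src[(dst_width / 2) - 1];  // right edge, copied
}

int main(void) {
  const uint8_t src[4] = {0, 40, 80, 120};
  uint8_t dst[8];
  up2_linear_ref(src, dst, 8);
  assert(dst[0] == 0 && dst[7] == 120);  // edges are source samples
  assert(dst[1] == 10 && dst[2] == 30);  // (0*3+40+2)>>2 and (0+40*3+2)>>2
  return 0;
}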
// Scale up 2 times using bilinear filter.
// This function produces 2 rows at a time.
#define SU2BLANY(NAME, SIMD, C, MASK, PTYPE) \
void NAME(const PTYPE* src_ptr, ptrdiff_t src_stride, PTYPE* dst_ptr, \
ptrdiff_t dst_stride, int dst_width) { \
int work_width = (dst_width - 1) & ~1; \
int r = work_width & MASK; \
int n = work_width & ~MASK; \
const PTYPE* sa = src_ptr; \
const PTYPE* sb = src_ptr + src_stride; \
PTYPE* da = dst_ptr; \
PTYPE* db = dst_ptr + dst_stride; \
da[0] = (3 * sa[0] + sb[0]) >> 2; \
db[0] = (sa[0] + 3 * sb[0]) >> 2; \
if (work_width > 0) { \
if (n != 0) { \
SIMD(sa, sb - sa, da + 1, db - da, n); \
} \
C(sa + (n / 2), sb - sa, da + n + 1, db - da, r); \
} \
da[dst_width - 1] = \
(3 * sa[(dst_width - 1) / 2] + sb[(dst_width - 1) / 2]) >> 2; \
db[dst_width - 1] = \
(sa[(dst_width - 1) / 2] + 3 * sb[(dst_width - 1) / 2]) >> 2; \
}
SU2BLANY(ScaleRowUp2_Bilinear_Any_C,
ScaleRowUp2_Bilinear_C,
ScaleRowUp2_Bilinear_C,
0,
uint8_t)
SU2BLANY(ScaleRowUp2_Bilinear_16_Any_C,
ScaleRowUp2_Bilinear_16_C,
ScaleRowUp2_Bilinear_16_C,
0,
uint16_t)
#ifdef HAS_SCALEROWUP2LINEAR_SSE2
SU2BLANY(ScaleRowUp2_Bilinear_Any_SSE2,
ScaleRowUp2_Bilinear_SSE2,
ScaleRowUp2_Bilinear_C,
15,
uint8_t)
#endif
#ifdef HAS_SCALECOLUP2LINEAR_16_SSE2
SU2BLANY(ScaleRowUp2_Bilinear_16_Any_SSE2,
ScaleRowUp2_Bilinear_16_SSE2,
ScaleRowUp2_Bilinear_16_C,
15,
uint16_t)
#endif
#ifdef HAS_SCALEROWUP2LINEAR_SSSE3
SU2BLANY(ScaleRowUp2_Bilinear_Any_SSSE3,
ScaleRowUp2_Bilinear_SSSE3,
ScaleRowUp2_Bilinear_C,
15,
uint8_t)
#endif
#ifdef HAS_SCALEROWUP2LINEAR_AVX2
SU2BLANY(ScaleRowUp2_Bilinear_Any_AVX2,
ScaleRowUp2_Bilinear_AVX2,
ScaleRowUp2_Bilinear_C,
31,
uint8_t)
#endif
#ifdef HAS_SCALEROWUP2LINEAR_16_AVX2
SU2BLANY(ScaleRowUp2_Bilinear_16_Any_AVX2,
ScaleRowUp2_Bilinear_16_AVX2,
ScaleRowUp2_Bilinear_16_C,
15,
uint16_t)
#endif
#ifdef HAS_SCALEROWUP2LINEAR_NEON
SU2BLANY(ScaleRowUp2_Bilinear_Any_NEON,
ScaleRowUp2_Bilinear_NEON,
ScaleRowUp2_Bilinear_C,
15,
uint8_t)
#endif
#ifdef HAS_SCALEROWUP2LINEAR_16_NEON
SU2BLANY(ScaleRowUp2_Bilinear_16_Any_NEON,
ScaleRowUp2_Bilinear_16_NEON,
ScaleRowUp2_Bilinear_16_C,
15,
uint16_t)
#endif
#undef SU2BLANY
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv

View File

@@ -400,6 +400,95 @@ void ScaleRowDown34_1_Box_16_C(const uint16_t* src_ptr,
}
}
// sample position: (O is src sample position, X is dst sample position)
//
// v dst_ptr at here v stop at here
// X O X X O X X O X X O X X O X
// ^ src_ptr at here
void ScaleRowUp2_Linear_C(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
int src_width = dst_width >> 1;
int x;
assert((dst_width % 2 == 0) && (dst_width >= 0));
for (x = 0; x < src_width; ++x) {
dst_ptr[2 * x + 0] = (src_ptr[x + 0] * 3 + src_ptr[x + 1] * 1 + 2) >> 2;
dst_ptr[2 * x + 1] = (src_ptr[x + 0] * 1 + src_ptr[x + 1] * 3 + 2) >> 2;
}
}
// sample position: (O is src sample position, X is dst sample position)
//
// src_ptr at here
// X v X X X X X X X X X
// O O O O O
// X X X X X X X X X X
// ^ dst_ptr at here ^ stop at here
// X X X X X X X X X X
// O O O O O
// X X X X X X X X X X
void ScaleRowUp2_Bilinear_C(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
const uint8_t* s = src_ptr;
const uint8_t* t = src_ptr + src_stride;
uint8_t* d = dst_ptr;
uint8_t* e = dst_ptr + dst_stride;
int src_width = dst_width >> 1;
int x;
assert((dst_width % 2 == 0) && (dst_width >= 0));
for (x = 0; x < src_width; ++x) {
d[2 * x + 0] =
(s[x + 0] * 9 + s[x + 1] * 3 + t[x + 0] * 3 + t[x + 1] * 1 + 8) >> 4;
d[2 * x + 1] =
(s[x + 0] * 3 + s[x + 1] * 9 + t[x + 0] * 1 + t[x + 1] * 3 + 8) >> 4;
e[2 * x + 0] =
(s[x + 0] * 3 + s[x + 1] * 1 + t[x + 0] * 9 + t[x + 1] * 3 + 8) >> 4;
e[2 * x + 1] =
(s[x + 0] * 1 + s[x + 1] * 3 + t[x + 0] * 3 + t[x + 1] * 9 + 8) >> 4;
}
}
// Only suitable for at most 14-bit range.
void ScaleRowUp2_Linear_16_C(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
int src_width = dst_width >> 1;
int x;
assert((dst_width % 2 == 0) && (dst_width >= 0));
for (x = 0; x < src_width; ++x) {
dst_ptr[2 * x + 0] = (src_ptr[x + 0] * 3 + src_ptr[x + 1] * 1 + 2) >> 2;
dst_ptr[2 * x + 1] = (src_ptr[x + 0] * 1 + src_ptr[x + 1] * 3 + 2) >> 2;
}
}
// Only suitable for at most 12-bit range.
void ScaleRowUp2_Bilinear_16_C(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
const uint16_t* s = src_ptr;
const uint16_t* t = src_ptr + src_stride;
uint16_t* d = dst_ptr;
uint16_t* e = dst_ptr + dst_stride;
int src_width = dst_width >> 1;
int x;
assert((dst_width % 2 == 0) && (dst_width >= 0));
for (x = 0; x < src_width; ++x) {
d[2 * x + 0] =
(s[x + 0] * 9 + s[x + 1] * 3 + t[x + 0] * 3 + t[x + 1] * 1 + 8) >> 4;
d[2 * x + 1] =
(s[x + 0] * 3 + s[x + 1] * 9 + t[x + 0] * 1 + t[x + 1] * 3 + 8) >> 4;
e[2 * x + 0] =
(s[x + 0] * 3 + s[x + 1] * 1 + t[x + 0] * 9 + t[x + 1] * 3 + 8) >> 4;
e[2 * x + 1] =
(s[x + 0] * 1 + s[x + 1] * 3 + t[x + 0] * 3 + t[x + 1] * 9 + 8) >> 4;
}
}
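(Side note, not part of the diff: the 9/3/3/1 weights used above are the outer
product of the 1D 3:1 linear weights, evaluated at the quarter-pixel destination
positions shown in the diagrams. A small hypothetical check of that identity:)

#include <assert.h>
#include <stdint.h>

int main(void) {
  uint16_t s0 = 40, s1 = 80, t0 = 120, t1 = 200;  // arbitrary 2x2 neighborhood
  // Two separable 3:1 blends kept at full precision (each scaled by 4)...
  int h_near = 3 * s0 + 1 * s1;  // horizontal blend on the nearer row
  int h_far = 3 * t0 + 1 * t1;   // horizontal blend on the farther row
  // ...followed by a vertical 3:1 blend with a single rounding step...
  int two_pass = (3 * h_near + 1 * h_far + 8) >> 4;
  // ...equals the one-pass 9/3/3/1 expression used by the row functions.
  int one_pass = (9 * s0 + 3 * s1 + 3 * t0 + 1 * t1 + 8) >> 4;
  assert(two_pass == one_pass);
  return 0;
}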
// Scales a single row of pixels using point sampling.
void ScaleCols_C(uint8_t* dst_ptr,
const uint8_t* src_ptr,

View File

@@ -785,6 +785,836 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
"xmm7");
}
#ifdef HAS_SCALECOLUP2LINEAR_SSE2
void ScaleRowUp2_Linear_SSE2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
"pxor %%xmm0,%%xmm0 \n" // 0
"pcmpeqw %%xmm6,%%xmm6 \n"
"psrlw $15,%%xmm6 \n"
"psllw $1,%%xmm6 \n" // all 2
LABELALIGN
"1: \n"
"movq (%0),%%xmm1 \n" // 01234567
"movq 1(%0),%%xmm2 \n" // 12345678
"movdqa %%xmm1,%%xmm3 \n"
"punpcklbw %%xmm2,%%xmm3 \n" // 0112233445566778
"punpcklbw %%xmm1,%%xmm1 \n" // 0011223344556677
"punpcklbw %%xmm2,%%xmm2 \n" // 1122334455667788
"movdqa %%xmm1,%%xmm4 \n"
"punpcklbw %%xmm0,%%xmm4 \n" // 00112233 (16)
"movdqa %%xmm2,%%xmm5 \n"
"punpcklbw %%xmm0,%%xmm5 \n" // 11223344 (16)
"paddw %%xmm5,%%xmm4 \n"
"movdqa %%xmm3,%%xmm5 \n"
"paddw %%xmm6,%%xmm4 \n"
"punpcklbw %%xmm0,%%xmm5 \n" // 01122334 (16)
"paddw %%xmm5,%%xmm5 \n"
"paddw %%xmm4,%%xmm5 \n" // 3*near+far+2 (lo)
"psrlw $2,%%xmm5 \n" // 3/4*near+1/4*far (lo)
"punpckhbw %%xmm0,%%xmm1 \n" // 44556677 (16)
"punpckhbw %%xmm0,%%xmm2 \n" // 55667788 (16)
"paddw %%xmm2,%%xmm1 \n"
"punpckhbw %%xmm0,%%xmm3 \n" // 45566778 (16)
"paddw %%xmm6,%%xmm1 \n"
"paddw %%xmm3,%%xmm3 \n"
"paddw %%xmm3,%%xmm1 \n" // 3*near+far+2 (hi)
"psrlw $2,%%xmm1 \n" // 3/4*near+1/4*far (hi)
"packuswb %%xmm1,%%xmm5 \n"
"movdqu %%xmm5,(%1) \n"
"lea 0x8(%0),%0 \n"
"lea 0x10(%1),%1 \n" // 8 sample to 16 sample
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
:
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif
#ifdef HAS_SCALEROWUP2LINEAR_SSE2
void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile(
LABELALIGN
"1: \n"
"pxor %%xmm0,%%xmm0 \n" // 0
// above line
"movq (%0),%%xmm1 \n" // 01234567
"movq 1(%0),%%xmm2 \n" // 12345678
"movdqa %%xmm1,%%xmm3 \n"
"punpcklbw %%xmm2,%%xmm3 \n" // 0112233445566778
"punpcklbw %%xmm1,%%xmm1 \n" // 0011223344556677
"punpcklbw %%xmm2,%%xmm2 \n" // 1122334455667788
"movdqa %%xmm1,%%xmm4 \n"
"punpcklbw %%xmm0,%%xmm4 \n" // 00112233 (16)
"movdqa %%xmm2,%%xmm5 \n"
"punpcklbw %%xmm0,%%xmm5 \n" // 11223344 (16)
"paddw %%xmm5,%%xmm4 \n" // near+far
"movdqa %%xmm3,%%xmm5 \n"
"punpcklbw %%xmm0,%%xmm5 \n" // 01122334 (16)
"paddw %%xmm5,%%xmm5 \n" // 2*near
"paddw %%xmm5,%%xmm4 \n" // 3*near+far (1, lo)
"punpckhbw %%xmm0,%%xmm1 \n" // 44556677 (16)
"punpckhbw %%xmm0,%%xmm2 \n" // 55667788 (16)
"paddw %%xmm2,%%xmm1 \n"
"punpckhbw %%xmm0,%%xmm3 \n" // 45566778 (16)
"paddw %%xmm3,%%xmm3 \n" // 2*near
"paddw %%xmm3,%%xmm1 \n" // 3*near+far (1, hi)
// below line
"movq (%0,%3),%%xmm6 \n" // 01234567
"movq 1(%0,%3),%%xmm2 \n" // 12345678
"movdqa %%xmm6,%%xmm3 \n"
"punpcklbw %%xmm2,%%xmm3 \n" // 0112233445566778
"punpcklbw %%xmm6,%%xmm6 \n" // 0011223344556677
"punpcklbw %%xmm2,%%xmm2 \n" // 1122334455667788
"movdqa %%xmm6,%%xmm5 \n"
"punpcklbw %%xmm0,%%xmm5 \n" // 00112233 (16)
"movdqa %%xmm2,%%xmm7 \n"
"punpcklbw %%xmm0,%%xmm7 \n" // 11223344 (16)
"paddw %%xmm7,%%xmm5 \n" // near+far
"movdqa %%xmm3,%%xmm7 \n"
"punpcklbw %%xmm0,%%xmm7 \n" // 01122334 (16)
"paddw %%xmm7,%%xmm7 \n" // 2*near
"paddw %%xmm7,%%xmm5 \n" // 3*near+far (2, lo)
"punpckhbw %%xmm0,%%xmm6 \n" // 44556677 (16)
"punpckhbw %%xmm0,%%xmm2 \n" // 55667788 (16)
"paddw %%xmm6,%%xmm2 \n" // near+far
"punpckhbw %%xmm0,%%xmm3 \n" // 45566778 (16)
"paddw %%xmm3,%%xmm3 \n" // 2*near
"paddw %%xmm3,%%xmm2 \n" // 3*near+far (2, hi)
// xmm4 xmm1
// xmm5 xmm2
"pcmpeqw %%xmm0,%%xmm0 \n"
"psrlw $15,%%xmm0 \n"
"psllw $3,%%xmm0 \n" // all 8
"movdqa %%xmm4,%%xmm3 \n"
"movdqa %%xmm5,%%xmm6 \n"
"psllw $1,%%xmm3 \n" // 6*near+2*far (1, lo)
"paddw %%xmm0,%%xmm6 \n" // 3*near+far+8 (2, lo)
"paddw %%xmm4,%%xmm3 \n" // 9*near+3*far (1, lo)
"paddw %%xmm6,%%xmm3 \n" // 9 3 3 1 + 8 (1, lo)
"psrlw $4,%%xmm3 \n" // ^ div by 16
"movdqa %%xmm1,%%xmm7 \n"
"movdqa %%xmm2,%%xmm6 \n"
"psllw $1,%%xmm7 \n" // 6*near+2*far (1, hi)
"paddw %%xmm0,%%xmm6 \n" // 3*near+far+8 (2, hi)
"paddw %%xmm1,%%xmm7 \n" // 9*near+3*far (1, hi)
"paddw %%xmm6,%%xmm7 \n" // 9 3 3 1 + 8 (1, hi)
"psrlw $4,%%xmm7 \n" // ^ div by 16
"packuswb %%xmm7,%%xmm3 \n"
"movdqu %%xmm3,(%1) \n" // save above line
"movdqa %%xmm5,%%xmm3 \n"
"paddw %%xmm0,%%xmm4 \n" // 3*near+far+8 (1, lo)
"psllw $1,%%xmm3 \n" // 6*near+2*far (2, lo)
"paddw %%xmm3,%%xmm5 \n" // 9*near+3*far (2, lo)
"paddw %%xmm4,%%xmm5 \n" // 9 3 3 1 + 8 (lo)
"psrlw $4,%%xmm5 \n" // ^ div by 16
"movdqa %%xmm2,%%xmm3 \n"
"paddw %%xmm0,%%xmm1 \n" // 3*near+far+8 (1, hi)
"psllw $1,%%xmm3 \n" // 6*near+2*far (2, hi)
"paddw %%xmm3,%%xmm2 \n" // 9*near+3*far (2, hi)
"paddw %%xmm1,%%xmm2 \n" // 9 3 3 1 + 8 (hi)
"psrlw $4,%%xmm2 \n" // ^ div by 16
"packuswb %%xmm2,%%xmm5 \n"
"movdqu %%xmm5,(%1,%4) \n" // save below line
"lea 0x8(%0),%0 \n"
"lea 0x10(%1),%1 \n" // 8 sample to 16 sample
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)), // %3
"r"((intptr_t)(dst_stride)) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
#endif
#ifdef HAS_SCALECOLUP2LINEAR_16_SSE2
void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
asm volatile(
"pxor %%xmm0,%%xmm0 \n" // 0
"pcmpeqw %%xmm6,%%xmm6 \n"
"psrlw $15,%%xmm6 \n"
"psllw $1,%%xmm6 \n" // all 2
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm1 \n" // 01234567 (16)
"movdqu 2(%0),%%xmm2 \n" // 12345678 (16)
"movdqa %%xmm1,%%xmm4 \n"
"punpcklwd %%xmm4,%%xmm4 \n" // 00112233 (16)
"movdqa %%xmm2,%%xmm5 \n"
"punpcklwd %%xmm5,%%xmm5 \n" // 11223344 (16)
"paddw %%xmm5,%%xmm4 \n"
"movdqa %%xmm1,%%xmm5 \n"
"paddw %%xmm6,%%xmm4 \n"
"punpcklwd %%xmm2,%%xmm5 \n" // 01122334 (16)
"psllw $1,%%xmm5 \n"
"paddw %%xmm4,%%xmm5 \n" // 3*near+far+2 (lo)
"psrlw $2,%%xmm5 \n" // 3/4*near+1/4*far (lo)
"movdqu %%xmm5,(%1) \n"
"movdqa %%xmm1,%%xmm3 \n"
"punpckhwd %%xmm2,%%xmm3 \n" // 45566778 (16)
"punpckhwd %%xmm1,%%xmm1 \n" // 44556677 (16)
"punpckhwd %%xmm2,%%xmm2 \n" // 55667788 (16)
"paddw %%xmm2,%%xmm1 \n"
"paddw %%xmm6,%%xmm1 \n"
"psllw $1,%%xmm3 \n"
"paddw %%xmm3,%%xmm1 \n" // 3*near+far+2 (hi)
"psrlw $2,%%xmm1 \n" // 3/4*near+1/4*far (hi)
"movdqu %%xmm1,0x10(%1) \n"
"lea 0x10(%0),%0 \n"
"lea 0x20(%1),%1 \n" // 8 sample to 16 sample
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
:
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif
#ifdef HAS_SCALEROWUP2LINEAR_16_SSE2
void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile(
"pxor %%xmm0,%%xmm0 \n" // 0
"pcmpeqw %%xmm7,%%xmm7 \n"
"psrlw $15,%%xmm7 \n"
"psllw $3,%%xmm7 \n" // all 8
LABELALIGN
"1: \n"
// above line
"movdqu (%0),%%xmm1 \n" // 01234567 (16)
"movdqu 2(%0),%%xmm2 \n" // 12345678 (16)
"movdqa %%xmm1,%%xmm4 \n"
"punpcklwd %%xmm4,%%xmm4 \n" // 00112233 (16)
"movdqa %%xmm2,%%xmm5 \n"
"punpcklwd %%xmm5,%%xmm5 \n" // 11223344 (16)
"paddw %%xmm5,%%xmm4 \n"
"movdqa %%xmm1,%%xmm5 \n"
"punpcklwd %%xmm2,%%xmm5 \n" // 01122334 (16)
"paddw %%xmm5,%%xmm5 \n"
"paddw %%xmm5,%%xmm4 \n" // 3*near+far (1, lo)
"movdqa %%xmm1,%%xmm3 \n"
"punpckhwd %%xmm2,%%xmm3 \n" // 45566778 (16)
"punpckhwd %%xmm1,%%xmm1 \n" // 44556677 (16)
"punpckhwd %%xmm2,%%xmm2 \n" // 55667788 (16)
"paddw %%xmm2,%%xmm1 \n"
"paddw %%xmm3,%%xmm3 \n"
"paddw %%xmm3,%%xmm1 \n" // 3*near+far (1, hi)
// below line
"movdqu (%0,%3,2),%%xmm6 \n" // 01234567 (16)
"movdqu 2(%0,%3,2),%%xmm2 \n" // 12345678 (16)
"movdqa %%xmm6,%%xmm3 \n"
"punpcklwd %%xmm3,%%xmm3 \n" // 00112233 (16)
"movdqa %%xmm2,%%xmm5 \n"
"punpcklwd %%xmm5,%%xmm5 \n" // 11223344 (16)
"paddw %%xmm5,%%xmm3 \n"
"movdqa %%xmm6,%%xmm5 \n"
"punpcklwd %%xmm2,%%xmm5 \n" // 01122334 (16)
"paddw %%xmm5,%%xmm5 \n"
"paddw %%xmm3,%%xmm5 \n" // 3*near+far (2, lo)
"movdqa %%xmm6,%%xmm3 \n"
"punpckhwd %%xmm2,%%xmm3 \n" // 45566778 (16)
"punpckhwd %%xmm6,%%xmm6 \n" // 44556677 (16)
"punpckhwd %%xmm2,%%xmm2 \n" // 55667788 (16)
"paddw %%xmm6,%%xmm2 \n"
"paddw %%xmm3,%%xmm3 \n"
"paddw %%xmm3,%%xmm2 \n" // 3*near+far (2, hi)
// xmm4 xmm1
// xmm5 xmm2
"movdqa %%xmm4,%%xmm3 \n"
"movdqa %%xmm5,%%xmm6 \n"
"psllw $1,%%xmm3 \n" // 6*near+2*far (1, lo)
"paddw %%xmm7,%%xmm6 \n" // 3*near+far+8 (2, lo)
"paddw %%xmm4,%%xmm3 \n" // 9*near+3*far (1, lo)
"paddw %%xmm6,%%xmm3 \n" // 9 3 3 1 + 8 (1, lo)
"psrlw $4,%%xmm3 \n" // ^ div by 16
"movdqu %%xmm3,(%1) \n"
"movdqa %%xmm1,%%xmm3 \n"
"movdqa %%xmm2,%%xmm6 \n"
"psllw $1,%%xmm3 \n" // 6*near+2*far (1, hi)
"paddw %%xmm7,%%xmm6 \n" // 3*near+far+8 (2, hi)
"paddw %%xmm1,%%xmm3 \n" // 9*near+3*far (1, hi)
"paddw %%xmm6,%%xmm3 \n" // 9 3 3 1 + 8 (1, hi)
"psrlw $4,%%xmm3 \n" // ^ div by 16
"movdqu %%xmm3,0x10(%1) \n"
"movdqa %%xmm5,%%xmm3 \n"
"paddw %%xmm7,%%xmm4 \n" // 3*near+far+8 (1, lo)
"psllw $1,%%xmm3 \n" // 6*near+2*far (2, lo)
"paddw %%xmm3,%%xmm5 \n" // 9*near+3*far (2, lo)
"paddw %%xmm4,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo)
"psrlw $4,%%xmm5 \n" // ^ div by 16
"movdqu %%xmm5,(%1,%4,2) \n"
"movdqa %%xmm2,%%xmm3 \n"
"paddw %%xmm7,%%xmm1 \n" // 3*near+far+8 (1, hi)
"psllw $1,%%xmm3 \n" // 6*near+2*far (2, hi)
"paddw %%xmm3,%%xmm2 \n" // 9*near+3*far (2, hi)
"paddw %%xmm1,%%xmm2 \n" // 9 3 3 1 + 8 (2, hi)
"psrlw $4,%%xmm2 \n" // ^ div by 16
"movdqu %%xmm2,0x10(%1,%4,2) \n"
"lea 0x10(%0),%0 \n"
"lea 0x20(%1),%1 \n" // 8 sample to 16 sample
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)), // %3
"r"((intptr_t)(dst_stride)) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
#endif
#ifdef HAS_SCALECOLUP2LINEAR_SSSE3
static const uvec8 kLinearMadd31_SSSE3 = {3, 1, 1, 3, 3, 1, 1, 3,
3, 1, 1, 3, 3, 1, 1, 3};
void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
"pcmpeqw %%xmm4,%%xmm4 \n"
"psrlw $15,%%xmm4 \n"
"psllw $1,%%xmm4 \n" // all 2
"movdqu %3,%%xmm3 \n"
LABELALIGN
"1: \n"
"movq (%0),%%xmm0 \n" // 01234567
"movq 1(%0),%%xmm1 \n" // 12345678
"punpcklwd %%xmm0,%%xmm0 \n" // 0101232345456767
"punpcklwd %%xmm1,%%xmm1 \n" // 1212343456567878
"movdqa %%xmm0,%%xmm2 \n"
"punpckhdq %%xmm1,%%xmm2 \n" // 4545565667677878
"punpckldq %%xmm1,%%xmm0 \n" // 0101121223233434
"pmaddubsw %%xmm3,%%xmm2 \n" // 3*near+far (hi)
"pmaddubsw %%xmm3,%%xmm0 \n" // 3*near+far (lo)
"paddw %%xmm4,%%xmm0 \n" // 3*near+far+2 (lo)
"paddw %%xmm4,%%xmm2 \n" // 3*near+far+2 (hi)
"psrlw $2,%%xmm0 \n" // 3/4*near+1/4*far (lo)
"psrlw $2,%%xmm2 \n" // 3/4*near+1/4*far (hi)
"vpackuswb %%xmm2,%%xmm0,%%xmm0 \n"
"vmovdqu %%xmm0,(%1) \n"
"lea 0x8(%0),%0 \n"
"lea 0x10(%1),%1 \n" // 8 sample to 16 sample
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "m"(kLinearMadd31_SSSE3) // %3
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif
#ifdef HAS_SCALEROWUP2LINEAR_SSSE3
void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile(
"pcmpeqw %%xmm6,%%xmm6 \n"
"psrlw $15,%%xmm6 \n"
"psllw $3,%%xmm6 \n" // all 8
"movdqu %5,%%xmm7 \n"
LABELALIGN
"1: \n"
"movq (%0),%%xmm0 \n" // 01234567
"movq 1(%0),%%xmm1 \n" // 12345678
"punpcklwd %%xmm0,%%xmm0 \n" // 0101232345456767
"punpcklwd %%xmm1,%%xmm1 \n" // 1212343456567878
"movdqa %%xmm0,%%xmm2 \n"
"punpckhdq %%xmm1,%%xmm2 \n" // 4545565667677878
"punpckldq %%xmm1,%%xmm0 \n" // 0101121223233434
"pmaddubsw %%xmm7,%%xmm2 \n" // 3*near+far (1, hi)
"pmaddubsw %%xmm7,%%xmm0 \n" // 3*near+far (1, lo)
"movq (%0,%3),%%xmm1 \n"
"movq 1(%0,%3),%%xmm4 \n"
"punpcklwd %%xmm1,%%xmm1 \n"
"punpcklwd %%xmm4,%%xmm4 \n"
"movdqa %%xmm1,%%xmm3 \n"
"punpckhdq %%xmm4,%%xmm3 \n"
"punpckldq %%xmm4,%%xmm1 \n"
"pmaddubsw %%xmm7,%%xmm3 \n" // 3*near+far (2, hi)
"pmaddubsw %%xmm7,%%xmm1 \n" // 3*near+far (2, lo)
// xmm0 xmm2
// xmm1 xmm3
"movdqa %%xmm0,%%xmm4 \n"
"movdqa %%xmm1,%%xmm5 \n"
"paddw %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo)
"paddw %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo)
"paddw %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo)
"paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo)
"psrlw $4,%%xmm4 \n" // ^ div by 16 (1, lo)
"movdqa %%xmm1,%%xmm5 \n"
"paddw %%xmm1,%%xmm5 \n" // 6*near+2*far (2, lo)
"paddw %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo)
"paddw %%xmm1,%%xmm5 \n" // 9*near+3*far (2, lo)
"paddw %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo)
"psrlw $4,%%xmm5 \n" // ^ div by 16 (2, lo)
"movdqa %%xmm2,%%xmm0 \n"
"movdqa %%xmm3,%%xmm1 \n"
"paddw %%xmm2,%%xmm0 \n" // 6*near+2*far (1, hi)
"paddw %%xmm6,%%xmm1 \n" // 3*near+far+8 (2, hi)
"paddw %%xmm2,%%xmm0 \n" // 9*near+3*far (1, hi)
"paddw %%xmm1,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi)
"psrlw $4,%%xmm0 \n" // ^ div by 16 (1, hi)
"movdqa %%xmm3,%%xmm1 \n"
"paddw %%xmm3,%%xmm1 \n" // 6*near+2*far (2, hi)
"paddw %%xmm6,%%xmm2 \n" // 3*near+far+8 (1, hi)
"paddw %%xmm3,%%xmm1 \n" // 9*near+3*far (2, hi)
"paddw %%xmm2,%%xmm1 \n" // 9 3 3 1 + 8 (2, hi)
"psrlw $4,%%xmm1 \n" // ^ div by 16 (2, hi)
"packuswb %%xmm0,%%xmm4 \n"
"movdqu %%xmm4,(%1) \n" // store above
"packuswb %%xmm1,%%xmm5 \n"
"movdqu %%xmm5,(%1,%4) \n" // store below
"lea 0x8(%0),%0 \n"
"lea 0x10(%1),%1 \n" // 8 sample to 16 sample
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)), // %3
"r"((intptr_t)(dst_stride)), // %4
"m"(kLinearMadd31_SSSE3) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
#endif
#ifdef HAS_SCALECOLUP2LINEAR_AVX2
static const lvec8 kLinearMadd31_AVX2 = {3, 1, 1, 3, 3, 1, 1, 3, 3, 1, 1,
3, 3, 1, 1, 3, 3, 1, 1, 3, 3, 1,
1, 3, 3, 1, 1, 3, 3, 1, 1, 3};
void ScaleRowUp2_Linear_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
"vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $15,%%ymm4,%%ymm4 \n"
"vpsllw $1,%%ymm4,%%ymm4 \n" // all 2
"vmovdqu %3,%%ymm3 \n"
LABELALIGN
"1: \n"
"vmovdqu (%0),%%xmm0 \n" // 0123456789ABCDEF
"vmovdqu 1(%0),%%xmm1 \n" // 123456789ABCDEF0
"vpermq $0b11011000,%%ymm0,%%ymm0 \n"
"vpermq $0b11011000,%%ymm1,%%ymm1 \n"
"vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n"
"vpunpcklwd %%ymm1,%%ymm1,%%ymm1 \n"
"vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n"
"vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n"
"vpmaddubsw %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi)
"vpmaddubsw %%ymm3,%%ymm0,%%ymm0 \n" // 3*near+far (lo)
"vpaddw %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2 (lo)
"vpaddw %%ymm4,%%ymm1,%%ymm1 \n" // 3*near+far+2 (hi)
"vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo)
"vpsrlw $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi)
"vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
"vmovdqu %%ymm0,(%1) \n"
"lea 0x10(%0),%0 \n"
"lea 0x20(%1),%1 \n" // 16 sample to 32 sample
"sub $0x20,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "m"(kLinearMadd31_AVX2) // %3
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
#endif
#ifdef HAS_SCALEROWUP2LINEAR_AVX2
void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile(
"vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n"
"vpsrlw $15,%%ymm6,%%ymm6 \n"
"vpsllw $3,%%ymm6,%%ymm6 \n" // all 8
"vmovdqu %5,%%ymm7 \n"
LABELALIGN
"1: \n"
"vmovdqu (%0),%%xmm0 \n" // 0123456789ABCDEF
"vmovdqu 1(%0),%%xmm1 \n" // 123456789ABCDEF0
"vpermq $0b11011000,%%ymm0,%%ymm0 \n"
"vpermq $0b11011000,%%ymm1,%%ymm1 \n"
"vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n"
"vpunpcklwd %%ymm1,%%ymm1,%%ymm1 \n"
"vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n"
"vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n"
"vpmaddubsw %%ymm7,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi)
"vpmaddubsw %%ymm7,%%ymm0,%%ymm0 \n" // 3*near+far (1, lo)
"vmovdqu (%0,%3),%%xmm2 \n" // 0123456789ABCDEF
"vmovdqu 1(%0,%3),%%xmm3 \n" // 123456789ABCDEF0
"vpermq $0b11011000,%%ymm2,%%ymm2 \n"
"vpermq $0b11011000,%%ymm3,%%ymm3 \n"
"vpunpcklwd %%ymm2,%%ymm2,%%ymm2 \n"
"vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n"
"vpunpckhdq %%ymm3,%%ymm2,%%ymm4 \n"
"vpunpckldq %%ymm3,%%ymm2,%%ymm2 \n"
"vpmaddubsw %%ymm7,%%ymm4,%%ymm3 \n" // 3*near+far (2, hi)
"vpmaddubsw %%ymm7,%%ymm2,%%ymm2 \n" // 3*near+far (2, lo)
// ymm0 ymm1
// ymm2 ymm3
"vpaddw %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo)
"vpaddw %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo)
"vpaddw %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo)
"vpaddw %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo)
"vpsrlw $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo)
"vpaddw %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo)
"vpaddw %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo)
"vpaddw %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo)
"vpaddw %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo)
"vpsrlw $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo)
"vpaddw %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi)
"vpaddw %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi)
"vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi)
"vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi)
"vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi)
"vpaddw %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi)
"vpaddw %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi)
"vpaddw %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi)
"vpaddw %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi)
"vpsrlw $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi)
"vpackuswb %%ymm0,%%ymm4,%%ymm4 \n"
"vmovdqu %%ymm4,(%1) \n" // store above
"vpackuswb %%ymm2,%%ymm5,%%ymm5 \n"
"vmovdqu %%ymm5,(%1,%4) \n" // store below
"lea 0x10(%0),%0 \n"
"lea 0x20(%1),%1 \n" // 16 sample to 32 sample
"sub $0x20,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)), // %3
"r"((intptr_t)(dst_stride)), // %4
"m"(kLinearMadd31_AVX2) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
#endif
#ifdef HAS_SCALECOLUP2LINEAR_16_AVX2
static const lvec16 kLinearMadd31_16_AVX2 = {3, 1, 1, 3, 3, 1, 1, 3,
3, 1, 1, 3, 3, 1, 1, 3};
void ScaleRowUp2_Linear_16_AVX2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
asm volatile(
"vmovdqu %3,%%ymm3 \n"
"vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $15,%%ymm4,%%ymm4 \n"
"vpsllw $1,%%ymm4,%%ymm4 \n" // all 2
LABELALIGN
"1: \n"
"vmovdqu (%0),%%xmm0 \n" // 01234567 (16b)
"vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b)
"vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000
"vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000
"vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" // 0101232345456767
"vpunpckldq %%ymm1,%%ymm1,%%ymm1 \n" // 1212343456567878
"vpunpckhqdq %%ymm1,%%ymm0,%%ymm2 \n" // 2323343467677878
"vpunpcklqdq %%ymm1,%%ymm0,%%ymm1 \n" // 0101121245455656
"vpmaddwd %%ymm3,%%ymm1,%%ymm0 \n" // 3*near+far (lo)
"vpmaddwd %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi)
"vpackssdw %%ymm1,%%ymm0,%%ymm0 \n" // 3*near+far
"vpaddw %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2
"vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far
"vmovdqu %%ymm0,(%1) \n"
"lea 0x10(%0),%0 \n"
"lea 0x20(%1),%1 \n" // 8 sample to 16 sample
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "m"(kLinearMadd31_16_AVX2) // %3
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
// This version can handle the full 16-bit range but is slower.
void ScaleRowUp2_Linear_16_AVX2_Full(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
asm volatile(
"vmovdqu %3,%%ymm3 \n"
"vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrld $31,%%ymm4,%%ymm4 \n"
"vpslld $1,%%ymm4,%%ymm4 \n" // all 2
LABELALIGN
"1: \n"
"vmovdqu (%0),%%xmm0 \n" // 01234567 (16b)
"vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b)
"vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000
"vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000
"vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" // 0101232345456767
"vpunpckldq %%ymm1,%%ymm1,%%ymm1 \n" // 1212343456567878
"vpunpckhqdq %%ymm1,%%ymm0,%%ymm2 \n" // 2323343467677878
"vpunpcklqdq %%ymm1,%%ymm0,%%ymm1 \n" // 0101121245455656
"vpmaddwd %%ymm3,%%ymm1,%%ymm0 \n" // 3*near+far (lo)
"vpmaddwd %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi)
"vpaddd %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2 (lo)
"vpaddd %%ymm4,%%ymm1,%%ymm1 \n" // 3*near+far+2 (hi)
"vpsrad $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo)
"vpsrad $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi)
"vpackssdw %%ymm1,%%ymm0,%%ymm0 \n"
"vmovdqu %%ymm0,(%1) \n"
"lea 0x10(%0),%0 \n"
"lea 0x20(%1),%1 \n" // 8 sample to 16 sample
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "m"(kLinearMadd31_16_AVX2) // %3
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
#endif
#ifdef HAS_SCALEROWUP2LINEAR_16_AVX2
void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile(
"vmovdqu %5,%%ymm5 \n"
"vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $15,%%ymm4,%%ymm4 \n"
"vpsllw $3,%%ymm4,%%ymm4 \n" // all 8
LABELALIGN
"1: \n"
"vmovdqu (%0),%%xmm0 \n" // 01234567 (16b)
"vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b)
"vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000
"vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000
"vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" // 0101232345456767
"vpunpckldq %%ymm1,%%ymm1,%%ymm1 \n" // 1212343456567878
"vpunpckhqdq %%ymm1,%%ymm0,%%ymm2 \n" // 2323343467677878
"vpunpcklqdq %%ymm1,%%ymm0,%%ymm1 \n" // 0101121245455656
"vpmaddwd %%ymm5,%%ymm1,%%ymm0 \n" // 3*near+far (1, lo)
"vpmaddwd %%ymm5,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi)
"vpackssdw %%ymm1,%%ymm0,%%ymm2 \n" // 3*near+far (1)
"vmovdqu (%0,%3,2),%%xmm0 \n" // 01234567 (16b)
"vmovdqu 2(%0,%3,2),%%xmm1 \n" // 12345678 (16b)
"vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000
"vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000
"vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" // 0101232345456767
"vpunpckldq %%ymm1,%%ymm1,%%ymm1 \n" // 1212343456567878
"vpunpckhqdq %%ymm1,%%ymm0,%%ymm3 \n" // 2323343467677878
"vpunpcklqdq %%ymm1,%%ymm0,%%ymm1 \n" // 0101121245455656
"vpmaddwd %%ymm5,%%ymm1,%%ymm0 \n" // 3*near+far (2, lo)
"vpmaddwd %%ymm5,%%ymm3,%%ymm1 \n" // 3*near+far (2, hi)
"vpackssdw %%ymm1,%%ymm0,%%ymm3 \n" // 3*near+far (2)
"vpaddw %%ymm2,%%ymm2,%%ymm0 \n" // 6*near+2*far (1)
"vpaddw %%ymm4,%%ymm3,%%ymm1 \n" // 3*near+far+8 (2)
"vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9*near+3*far (1)
"vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9 3 3 1 + 8 (1)
"vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16
"vmovdqu %%ymm0,(%1) \n" // store above
"vpaddw %%ymm3,%%ymm3,%%ymm0 \n" // 6*near+2*far (2)
"vpaddw %%ymm4,%%ymm2,%%ymm1 \n" // 3*near+far+8 (1)
"vpaddw %%ymm0,%%ymm3,%%ymm0 \n" // 9*near+3*far (2)
"vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9 3 3 1 + 8 (2)
"vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16
"vmovdqu %%ymm0,(%1,%4,2) \n" // store below
"lea 0x10(%0),%0 \n"
"lea 0x20(%1),%1 \n" // 8 sample to 16 sample
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)), // %3
"r"((intptr_t)(dst_stride)), // %4
"m"(kLinearMadd31_16_AVX2) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
// This version can handle the full 16-bit range but is slower.
void ScaleRowUp2_Bilinear_16_AVX2_Full(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile(
"vmovdqu %5,%%ymm7 \n"
"vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n"
"vpsrld $31,%%ymm6,%%ymm6 \n"
"vpslld $3,%%ymm6,%%ymm6 \n" // all 8
LABELALIGN
"1: \n"
"vmovdqu (%0),%%xmm0 \n" // 01234567 (16b)
"vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b)
"vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000
"vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000
"vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" // 0101232345456767
"vpunpckldq %%ymm1,%%ymm1,%%ymm1 \n" // 1212343456567878
"vpunpckhqdq %%ymm1,%%ymm0,%%ymm2 \n" // 2323343467677878
"vpunpcklqdq %%ymm1,%%ymm0,%%ymm1 \n" // 0101121245455656
"vpmaddwd %%ymm7,%%ymm1,%%ymm0 \n" // 3*near+far (1, lo)
"vpmaddwd %%ymm7,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi)
"vmovdqu (%0,%3,2),%%xmm2 \n" // 01234567 (16b)
"vmovdqu 2(%0,%3,2),%%xmm3 \n" // 12345678 (16b)
"vpermq $0b11011000,%%ymm2,%%ymm2 \n" // 0123000045670000
"vpermq $0b11011000,%%ymm3,%%ymm3 \n" // 1234000056780000
"vpunpckldq %%ymm2,%%ymm2,%%ymm2 \n" // 0101232345456767
"vpunpckldq %%ymm3,%%ymm3,%%ymm3 \n" // 1212343456567878
"vpunpckhqdq %%ymm3,%%ymm2,%%ymm4 \n" // 2323343467677878
"vpunpcklqdq %%ymm3,%%ymm2,%%ymm3 \n" // 0101121245455656
"vpmaddwd %%ymm7,%%ymm3,%%ymm2 \n" // 3*near+far (2, lo)
"vpmaddwd %%ymm7,%%ymm4,%%ymm3 \n" // 3*near+far (2, hi)
"vpaddd %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo)
"vpaddd %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo)
"vpaddd %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo)
"vpaddd %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo)
"vpsrad $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo)
"vpaddd %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo)
"vpaddd %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo)
"vpaddd %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo)
"vpaddd %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo)
"vpsrad $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo)
"vpaddd %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi)
"vpaddd %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi)
"vpaddd %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi)
"vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi)
"vpsrad $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi)
"vpaddd %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi)
"vpaddd %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi)
"vpaddd %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi)
"vpaddd %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi)
"vpsrad $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi)
"vpackssdw %%ymm0,%%ymm4,%%ymm4 \n"
"vmovdqu %%ymm4,(%1) \n" // store above
"vpackssdw %%ymm2,%%ymm5,%%ymm5 \n"
"vmovdqu %%ymm5,(%1,%4,2) \n" // store below
"lea 0x10(%0),%0 \n"
"lea 0x20(%1),%1 \n" // 8 sample to 16 sample
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)), // %3
"r"((intptr_t)(dst_stride)), // %4
"m"(kLinearMadd31_16_AVX2) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
#endif
// Reads 16xN bytes and produces 16 shorts at a time.
void ScaleAddRow_SSE2(const uint8_t* src_ptr,
uint16_t* dst_ptr,
@@ -946,8 +1776,8 @@ void ScaleFilterCols_SSSE3(uint8_t* dst_ptr,
"x"(kFsub80), // %8
"x"(kFadd40) // %9
#else
"m"(kFsub80), // %8
"m"(kFadd40) // %9
"m"(kFsub80), // %8
"m"(kFadd40) // %9
#endif
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");

View File

@@ -504,6 +504,200 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
: "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc");
}
void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
const uint8_t* src_temp = src_ptr + 1;
asm volatile(
"vmov.u16 q15, #3 \n"
"1: \n"
"vld1.8 {d0}, [%0]! \n" // 01234567
"vld1.8 {d2}, [%3]! \n" // 12345678
"vmovl.u8 q0, d0 \n" // 01234567 (16b)
"vmovl.u8 q1, d2 \n" // 12345678 (16b)
"vmovq q2, q0 \n"
"vmla.u16 q2, q1, q15 \n" // 3*near+far (odd)
"vmla.u16 q1, q0, q15 \n" // 3*near+far (even)
"vrshrn.u16 d0, q1, #2 \n" // 3/4*near+1/4*far (odd)
"vrshrn.u16 d1, q2, #2 \n" // 3/4*near+1/4*far (even)
"vst2.8 {d0, d1}, [%1]! \n" // store
"subs %2, %2, #16 \n" // 8 sample -> 16 sample
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
"+r"(src_temp) // %3
:
: "memory", "cc", "q0", "q1", "q2", "q15" // Clobber List
);
}
void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
const uint8_t* src_ptr1 = src_ptr + src_stride;
uint8_t* dst_ptr1 = dst_ptr + dst_stride;
const uint8_t* src_temp = src_ptr + 1;
const uint8_t* src_temp1 = src_ptr1 + 1;
asm volatile(
"vmov.u16 q15, #3 \n"
"1: \n"
"vld1.8 {d0}, [%0]! \n" // 01234567
"vld1.8 {d2}, [%5]! \n" // 12345678
"vmovl.u8 q0, d0 \n" // 01234567 (16b)
"vmovl.u8 q1, d2 \n" // 12345678 (16b)
"vmovq q2, q0 \n"
"vmla.u16 q0, q1, q15 \n" // 3*near+far (1, odd)
"vmla.u16 q1, q2, q15 \n" // 3*near+far (1, even)
"vld1.8 {d4}, [%1]! \n" // 01234567
"vld1.8 {d6}, [%6]! \n" // 12345678
"vmovl.u8 q2, d4 \n" // 01234567 (16b)
"vmovl.u8 q3, d6 \n" // 12345678 (16b)
"vmovq q4, q2 \n"
"vmla.u16 q2, q3, q15 \n" // 3*near+far (2, odd)
"vmla.u16 q3, q4, q15 \n" // 3*near+far (2, even)
// e o
// q1 q0
// q3 q2
"vmovq q4, q2 \n"
"vmovq q5, q3 \n"
"vmla.u16 q4, q0, q15 \n" // 9 3 3 1 (1, odd)
"vmla.u16 q5, q1, q15 \n" // 9 3 3 1 (1, even)
"vmla.u16 q0, q2, q15 \n" // 9 3 3 1 (2, odd)
"vmla.u16 q1, q3, q15 \n" // 9 3 3 1 (2, even)
// e o
// q5 q4
// q1 q0
"vrshrn.u16 d2, q1, #4 \n" // 2, even
"vrshrn.u16 d3, q0, #4 \n" // 2, odd
"vrshrn.u16 d0, q5, #4 \n" // 1, even
"vrshrn.u16 d1, q4, #4 \n" // 1, odd
"vst2.8 {d0, d1}, [%2]! \n" // store
"vst2.8 {d2, d3}, [%3]! \n" // store
"subs %4, %4, #16 \n" // 8 sample -> 16 sample
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_ptr1), // %1
"+r"(dst_ptr), // %2
"+r"(dst_ptr1), // %3
"+r"(dst_width), // %4
"+r"(src_temp), // %5
"+r"(src_temp1) // %6
:
: "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5",
"q15" // Clobber List
);
}
void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
const uint16_t* src_temp = src_ptr + 1;
asm volatile(
"vmov.u16 q15, #3 \n"
"1: \n"
"vld1.16 {q1}, [%0]! \n" // 01234567 (16b)
"vld1.16 {q0}, [%3]! \n" // 12345678 (16b)
"vmovq q2, q0 \n"
"vmla.u16 q0, q1, q15 \n" // 3*near+far (odd)
"vmla.u16 q1, q2, q15 \n" // 3*near+far (even)
"vrshr.u16 q0, q0, #2 \n" // 3/4*near+1/4*far (odd)
"vrshr.u16 q1, q1, #2 \n" // 3/4*near+1/4*far (even)
"vst2.16 {d0, d1, d2, d3}, [%1]! \n" // store
"subs %2, %2, #16 \n" // 8 sample -> 16 sample
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
"+r"(src_temp) // %3
:
: "memory", "cc", "q0", "q1", "q2", "q15" // Clobber List
);
}
void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
const uint16_t* src_ptr1 = src_ptr + src_stride;
uint16_t* dst_ptr1 = dst_ptr + dst_stride;
const uint16_t* src_temp = src_ptr + 1;
const uint16_t* src_temp1 = src_ptr1 + 1;
asm volatile(
"vmov.u16 q15, #3 \n"
"1: \n"
"add %5, %0, #2 \n"
"vld1.16 {q0}, [%0]! \n" // 01234567 (16b)
"vld1.16 {q1}, [%5]! \n" // 12345678 (16b)
"vmovq q2, q0 \n"
"vmla.u16 q0, q1, q15 \n" // 3*near+far (odd)
"vmla.u16 q1, q2, q15 \n" // 3*near+far (even)
"add %5, %1, #2 \n"
"vld1.16 {q2}, [%1]! \n" // 01234567 (16b)
"vld1.16 {q3}, [%6]! \n" // 12345678 (16b)
"vmovq q4, q2 \n"
"vmla.u16 q2, q3, q15 \n" // 3*near+far (odd)
"vmla.u16 q3, q4, q15 \n" // 3*near+far (even)
"vmovq q4, q2 \n"
"vmovq q5, q3 \n"
"vmla.u16 q4, q0, q15 \n" // 9 3 3 1 (1, odd)
"vmla.u16 q5, q1, q15 \n" // 9 3 3 1 (1, even)
"vmla.u16 q0, q2, q15 \n" // 9 3 3 1 (2, odd)
"vmla.u16 q1, q3, q15 \n" // 9 3 3 1 (2, even)
"vrshr.u16 q2, q1, #4 \n" // 2, even
"vrshr.u16 q3, q0, #4 \n" // 2, odd
"vrshr.u16 q0, q5, #4 \n" // 1, even
"vrshr.u16 q1, q4, #4 \n" // 1, odd
"vst2.16 {d0, d1, d2, d3}, [%2]! \n" // store
"vst2.16 {d4, d5, d6, d7}, [%3]! \n" // store
"subs %4, %4, #16 \n" // 8 sample -> 16 sample
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_ptr1), // %1
"+r"(dst_ptr), // %2
"+r"(dst_ptr1), // %3
"+r"(dst_width), // %4
"+r"(src_temp), // %5
"+r"(src_temp1) // %6
:
: "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5",
"q15" // Clobber List
);
}
// Add a row of bytes to a row of shorts. Used for box filter.
// Reads 16 bytes and accumulates to 16 shorts at a time.
void ScaleAddRow_NEON(const uint8_t* src_ptr,


@ -535,6 +535,196 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
"v19", "v30", "v31", "memory", "cc");
}
void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
const uint8_t* src_temp = src_ptr + 1;
asm volatile(
"movi v31.8b, #3 \n"
"1: \n"
"ldr d0, [%0], #8 \n" // 01234567
"ldr d1, [%1], #8 \n" // 12345678
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"ushll v2.8h, v0.8b, #0 \n" // 01234567 (16b)
"ushll v3.8h, v1.8b, #0 \n" // 12345678 (16b)
"umlal v2.8h, v1.8b, v31.8b \n" // 3*near+far (odd)
"umlal v3.8h, v0.8b, v31.8b \n" // 3*near+far (even)
"rshrn v2.8b, v2.8h, #2 \n" // 3/4*near+1/4*far (odd)
"rshrn v1.8b, v3.8h, #2 \n" // 3/4*near+1/4*far (even)
"st2 {v1.8b, v2.8b}, [%2], #16 \n" // store
"subs %w3, %w3, #16 \n" // 8 sample -> 16 sample
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_temp), // %1
"+r"(dst_ptr), // %2
"+r"(dst_width) // %3
:
: "memory", "cc", "v0", "v1", "v2", "v3", "v31" // Clobber List
);
}
void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
const uint8_t* src_ptr1 = src_ptr + src_stride;
uint8_t* dst_ptr1 = dst_ptr + dst_stride;
const uint8_t* src_temp = src_ptr + 1;
const uint8_t* src_temp1 = src_ptr1 + 1;
asm volatile(
"movi v31.8b, #3 \n"
"movi v30.8h, #3 \n"
"1: \n"
"ldr d0, [%0], #8 \n" // 01234567
"ldr d1, [%2], #8 \n" // 12345678
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"ushll v2.8h, v0.8b, #0 \n" // 01234567 (16b)
"ushll v3.8h, v1.8b, #0 \n" // 12345678 (16b)
"umlal v2.8h, v1.8b, v31.8b \n" // 3*near+far (1, odd)
"umlal v3.8h, v0.8b, v31.8b \n" // 3*near+far (1, even)
"ldr d0, [%1], #8 \n"
"ldr d1, [%3], #8 \n"
"prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
"ushll v4.8h, v0.8b, #0 \n" // 01234567 (16b)
"ushll v5.8h, v1.8b, #0 \n" // 12345678 (16b)
"umlal v4.8h, v1.8b, v31.8b \n" // 3*near+far (2, odd)
"umlal v5.8h, v0.8b, v31.8b \n" // 3*near+far (2, even)
"mov v0.8h, v4.8h \n"
"mov v1.8h, v5.8h \n"
"mla v4.8h, v2.8h, v30.8h \n" // 9 3 3 1 (1, odd)
"mla v5.8h, v3.8h, v30.8h \n" // 9 3 3 1 (1, even)
"mla v2.8h, v0.8h, v30.8h \n" // 9 3 3 1 (2, odd)
"mla v3.8h, v1.8h, v30.8h \n" // 9 3 3 1 (2, even)
"rshrn v2.8b, v2.8h, #4 \n" // 2, odd
"rshrn v1.8b, v3.8h, #4 \n" // 2, even
"rshrn v4.8b, v4.8h, #4 \n" // 1, odd
"rshrn v3.8b, v5.8h, #4 \n" // 1, even
"st2 {v1.8b, v2.8b}, [%5], #16 \n" // store 1
"st2 {v3.8b, v4.8b}, [%4], #16 \n" // store 2
"subs %w6, %w6, #16 \n" // 8 sample -> 16 sample
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_ptr1), // %1
"+r"(src_temp), // %2
"+r"(src_temp1), // %3
"+r"(dst_ptr), // %4
"+r"(dst_ptr1), // %5
"+r"(dst_width) // %6
:
: "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v30",
"v31" // Clobber List
);
}
void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
const uint16_t* src_temp = src_ptr + 1;
asm volatile(
"movi v31.8h, #3 \n"
"1: \n"
"ld1 {v0.8h}, [%0], #16 \n" // 01234567 (16b)
"ld1 {v1.8h}, [%1], #16 \n" // 12345678 (16b)
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"mov v2.8h, v0.8h \n"
"mla v0.8h, v1.8h, v31.8h \n" // 3*near+far (odd)
"mla v1.8h, v2.8h, v31.8h \n" // 3*near+far (even)
"urshr v2.8h, v0.8h, #2 \n" // 3/4*near+1/4*far (odd)
"urshr v1.8h, v1.8h, #2 \n" // 3/4*near+1/4*far (even)
"st2 {v1.8h, v2.8h}, [%2], #32 \n" // store
"subs %w3, %w3, #16 \n" // 8 sample -> 16 sample
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_temp), // %1
"+r"(dst_ptr), // %2
"+r"(dst_width) // %3
:
: "memory", "cc", "v0", "v1", "v2", "v31" // Clobber List
);
}
void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
const uint16_t* src_ptr1 = src_ptr + src_stride;
uint16_t* dst_ptr1 = dst_ptr + dst_stride;
const uint16_t* src_temp = src_ptr + 1;
const uint16_t* src_temp1 = src_ptr1 + 1;
asm volatile(
"movi v31.8h, #3 \n"
"1: \n"
"ld1 {v2.8h}, [%0], #16 \n" // 01234567 (16b)
"ld1 {v3.8h}, [%2], #16 \n" // 12345678 (16b)
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"mov v0.8h, v2.8h \n"
"mla v2.8h, v3.8h, v31.8h \n" // 3*near+far (odd)
"mla v3.8h, v0.8h, v31.8h \n" // 3*near+far (even)
"ld1 {v4.8h}, [%1], #16 \n" // 01234567 (16b)
"ld1 {v5.8h}, [%3], #16 \n" // 12345678 (16b)
"prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
"mov v0.8h, v4.8h \n"
"mla v4.8h, v5.8h, v31.8h \n" // 3*near+far (odd)
"mla v5.8h, v0.8h, v31.8h \n" // 3*near+far (even)
"mov v0.8h, v4.8h \n"
"mov v1.8h, v5.8h \n"
"mla v4.8h, v2.8h, v31.8h \n" // 9 3 3 1 (1, odd)
"mla v5.8h, v3.8h, v31.8h \n" // 9 3 3 1 (1, even)
"mla v2.8h, v0.8h, v31.8h \n" // 9 3 3 1 (2, odd)
"mla v3.8h, v1.8h, v31.8h \n" // 9 3 3 1 (2, even)
"urshr v2.8h, v2.8h, #4 \n" // 2, odd
"urshr v1.8h, v3.8h, #4 \n" // 2, even
"urshr v4.8h, v4.8h, #4 \n" // 1, odd
"urshr v3.8h, v5.8h, #4 \n" // 1, even
"st2 {v3.8h, v4.8h}, [%4], #32 \n" // store 1
"st2 {v1.8h, v2.8h}, [%5], #32 \n" // store 2
"subs %w6, %w6, #16 \n" // 8 sample -> 16 sample
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_ptr1), // %1
"+r"(src_temp), // %2
"+r"(src_temp1), // %3
"+r"(dst_ptr), // %4
"+r"(dst_ptr1), // %5
"+r"(dst_width) // %6
:
: "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5",
"v31" // Clobber List
);
}
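// A note on headroom in the 16-bit kernels above: both passes accumulate in
// 16-bit lanes, so the worst case after the horizontal pass is 4 * max_sample
// and after the vertical pass 16 * max_sample, which is presumably why these
// paths serve 10- and 12-bit content but nothing deeper. The arithmetic:
static_assert(16 * ((1 << 12) - 1) <= 65535,
              "16x a 12-bit sample still fits a uint16 accumulator");
static_assert(16 * ((1 << 13) - 1) > 65535,
              "16x a 13-bit sample would overflow it");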
// Add a row of bytes to a row of shorts. Used for box filter.
// Reads 16 bytes and accumulates to 16 shorts at a time.
void ScaleAddRow_NEON(const uint8_t* src_ptr,


@ -49,7 +49,8 @@ namespace libyuv {
#define TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \
DST_SUBSAMP_X, DST_SUBSAMP_Y, W1280, N, NEG, OFF) \
DST_SUBSAMP_X, DST_SUBSAMP_Y, W1280, N, NEG, OFF, \
SRC_DEPTH) \
TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
static_assert(SRC_BPC == 1 || SRC_BPC == 2, "SRC BPC unsupported"); \
static_assert(DST_BPC == 1 || DST_BPC == 2, "DST BPC unsupported"); \
@ -81,6 +82,16 @@ namespace libyuv {
MemRandomize(src_y + OFF, kWidth * kHeight * SRC_BPC); \
MemRandomize(src_u + OFF, kSrcHalfWidth * kSrcHalfHeight * SRC_BPC); \
MemRandomize(src_v + OFF, kSrcHalfWidth * kSrcHalfHeight * SRC_BPC); \
SRC_T* src_y_p = reinterpret_cast<SRC_T*>(src_y + OFF); \
SRC_T* src_u_p = reinterpret_cast<SRC_T*>(src_u + OFF); \
SRC_T* src_v_p = reinterpret_cast<SRC_T*>(src_v + OFF); \
for (int i = 0; i < kWidth * kHeight; ++i) { \
src_y_p[i] = src_y_p[i] & ((1 << SRC_DEPTH) - 1); \
} \
for (int i = 0; i < kSrcHalfWidth * kSrcHalfHeight; ++i) { \
src_u_p[i] = src_u_p[i] & ((1 << SRC_DEPTH) - 1); \
src_v_p[i] = src_v_p[i] & ((1 << SRC_DEPTH) - 1); \
} \
memset(dst_y_c, 1, kWidth* kHeight* DST_BPC); \
memset(dst_u_c, 2, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
memset(dst_v_c, 3, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
@ -89,9 +100,7 @@ namespace libyuv {
memset(dst_v_opt, 103, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
MaskCpuFlags(disable_cpu_flags_); \
SRC_FMT_PLANAR##To##FMT_PLANAR( \
reinterpret_cast<SRC_T*>(src_y + OFF), kWidth, \
reinterpret_cast<SRC_T*>(src_u + OFF), kSrcHalfWidth, \
reinterpret_cast<SRC_T*>(src_v + OFF), kSrcHalfWidth, \
src_y_p, kWidth, src_u_p, kSrcHalfWidth, src_v_p, kSrcHalfWidth, \
reinterpret_cast<DST_T*>(dst_y_c), kWidth, \
reinterpret_cast<DST_T*>(dst_u_c), kDstHalfWidth, \
reinterpret_cast<DST_T*>(dst_v_c), kDstHalfWidth, kWidth, \
@ -99,9 +108,7 @@ namespace libyuv {
MaskCpuFlags(benchmark_cpu_info_); \
for (int i = 0; i < benchmark_iterations_; ++i) { \
SRC_FMT_PLANAR##To##FMT_PLANAR( \
reinterpret_cast<SRC_T*>(src_y + OFF), kWidth, \
reinterpret_cast<SRC_T*>(src_u + OFF), kSrcHalfWidth, \
reinterpret_cast<SRC_T*>(src_v + OFF), kSrcHalfWidth, \
src_y_p, kWidth, src_u_p, kSrcHalfWidth, src_v_p, kSrcHalfWidth, \
reinterpret_cast<DST_T*>(dst_y_opt), kWidth, \
reinterpret_cast<DST_T*>(dst_u_opt), kDstHalfWidth, \
reinterpret_cast<DST_T*>(dst_v_opt), kDstHalfWidth, kWidth, \
@ -127,34 +134,39 @@ namespace libyuv {
#define TESTPLANARTOP(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \
DST_SUBSAMP_X, DST_SUBSAMP_Y) \
DST_SUBSAMP_X, DST_SUBSAMP_Y, SRC_DEPTH) \
TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \
benchmark_width_ - 4, _Any, +, 0) \
benchmark_width_ - 4, _Any, +, 0, SRC_DEPTH) \
TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \
benchmark_width_, _Unaligned, +, 1) \
benchmark_width_, _Unaligned, +, 1, SRC_DEPTH) \
TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \
benchmark_width_, _Invert, -, 0) \
benchmark_width_, _Invert, -, 0, SRC_DEPTH) \
TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \
benchmark_width_, _Opt, +, 0)
benchmark_width_, _Opt, +, 0, SRC_DEPTH)
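// As a concrete example of what the macro generates, the instantiation
//   TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I444, uint8_t, 1, 1, 1, 8)
// below expands into four gtest cases that exercise I420ToI444:
//   LibYUVConvertTest.I420ToI444_Any        (width = benchmark_width_ - 4)
//   LibYUVConvertTest.I420ToI444_Unaligned  (OFF = 1 on the source buffers)
//   LibYUVConvertTest.I420ToI444_Invert     (inverted height via NEG = -)
//   LibYUVConvertTest.I420ToI444_Opt        (aligned, default width)
// which the --gtest_filter in the commit message picks up.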
TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2)
TESTPLANARTOP(I422, uint8_t, 1, 2, 1, I420, uint8_t, 1, 2, 2)
TESTPLANARTOP(I444, uint8_t, 1, 1, 1, I420, uint8_t, 1, 2, 2)
TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I422, uint8_t, 1, 2, 1)
TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I444, uint8_t, 1, 1, 1)
TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I420Mirror, uint8_t, 1, 2, 2)
TESTPLANARTOP(I422, uint8_t, 1, 2, 1, I422, uint8_t, 1, 2, 1)
TESTPLANARTOP(I444, uint8_t, 1, 1, 1, I444, uint8_t, 1, 1, 1)
TESTPLANARTOP(I010, uint16_t, 2, 2, 2, I010, uint16_t, 2, 2, 2)
TESTPLANARTOP(I010, uint16_t, 2, 2, 2, I420, uint8_t, 1, 2, 2)
TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I010, uint16_t, 2, 2, 2)
TESTPLANARTOP(H010, uint16_t, 2, 2, 2, H010, uint16_t, 2, 2, 2)
TESTPLANARTOP(H010, uint16_t, 2, 2, 2, H420, uint8_t, 1, 2, 2)
TESTPLANARTOP(H420, uint8_t, 1, 2, 2, H010, uint16_t, 2, 2, 2)
TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2, 8)
TESTPLANARTOP(I422, uint8_t, 1, 2, 1, I420, uint8_t, 1, 2, 2, 8)
TESTPLANARTOP(I444, uint8_t, 1, 1, 1, I420, uint8_t, 1, 2, 2, 8)
TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I422, uint8_t, 1, 2, 1, 8)
TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I444, uint8_t, 1, 1, 1, 8)
TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I420Mirror, uint8_t, 1, 2, 2, 8)
TESTPLANARTOP(I422, uint8_t, 1, 2, 1, I422, uint8_t, 1, 2, 1, 8)
TESTPLANARTOP(I422, uint8_t, 1, 2, 1, I444, uint8_t, 1, 1, 1, 8)
TESTPLANARTOP(I444, uint8_t, 1, 1, 1, I444, uint8_t, 1, 1, 1, 8)
TESTPLANARTOP(I010, uint16_t, 2, 2, 2, I010, uint16_t, 2, 2, 2, 10)
TESTPLANARTOP(I010, uint16_t, 2, 2, 2, I420, uint8_t, 1, 2, 2, 10)
TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I010, uint16_t, 2, 2, 2, 8)
TESTPLANARTOP(H010, uint16_t, 2, 2, 2, H010, uint16_t, 2, 2, 2, 10)
TESTPLANARTOP(H010, uint16_t, 2, 2, 2, H420, uint8_t, 1, 2, 2, 10)
TESTPLANARTOP(H420, uint8_t, 1, 2, 2, H010, uint16_t, 2, 2, 2, 8)
TESTPLANARTOP(I010, uint16_t, 2, 2, 2, I410, uint16_t, 2, 1, 1, 10)
TESTPLANARTOP(I210, uint16_t, 2, 2, 1, I410, uint16_t, 2, 1, 1, 10)
TESTPLANARTOP(I012, uint16_t, 2, 2, 2, I412, uint16_t, 2, 1, 1, 12)
TESTPLANARTOP(I212, uint16_t, 2, 2, 1, I412, uint16_t, 2, 1, 1, 12)
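// For reference, a minimal call of one of the conversions exercised above,
// upsampling I010 chroma planes to full resolution I410. Buffer management and
// stride choices are illustrative assumptions; strides are in uint16_t units,
// as in the test macro above.
#include "libyuv/convert.h"

int UpsampleI010ToI410(const uint16_t* src_y, const uint16_t* src_u,
                       const uint16_t* src_v, uint16_t* dst_y, uint16_t* dst_u,
                       uint16_t* dst_v, int width, int height) {
  int half_width = (width + 1) / 2;
  // I010: Y full size, U/V subsampled 2x2, 10 bits in a 16-bit container.
  // I410: all three planes at full size.
  return libyuv::I010ToI410(src_y, width, src_u, half_width, src_v, half_width,
                            dst_y, width, dst_u, width, dst_v, width, width,
                            height);
}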
// Test Android 420 to I420
#define TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, \