Add NV12ToNV24 and NV16ToNV24

These are bi-planar conversion functions that scale the UV plane up to the Y plane's size using a (bi)linear filter: linear (horizontal only) for NV16, bilinear for NV12.

libyuv_unittest --gtest_filter=*ToNV24*
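
For reference, a minimal usage sketch (dimensions are illustrative; allocation
checks and error handling are omitted; assumes <stdlib.h>, <stdint.h> and
libyuv/convert.h):

  // Upsample a half-resolution NV12 UV plane to the full-resolution
  // NV24 layout. Y is passed through ScalePlane at the same size.
  int w = 1280, h = 720;
  int half_w = (w + 1) / 2, half_h = (h + 1) / 2;
  uint8_t* src_y = (uint8_t*)malloc((size_t)w * h);
  uint8_t* src_uv = (uint8_t*)malloc((size_t)2 * half_w * half_h);  // interleaved U/V
  uint8_t* dst_y = (uint8_t*)malloc((size_t)w * h);
  uint8_t* dst_uv = (uint8_t*)malloc((size_t)2 * w * h);
  NV12ToNV24(src_y, w, src_uv, 2 * half_w,
             dst_y, w, dst_uv, 2 * w, w, h);  // returns 0 on success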

R=fbarchard@chromium.org

Change-Id: I3d98f833feeef00af3c903ac9ad0e41bdcbcb51f
Bug: libyuv:872
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2682152
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Yuan Tong 2021-02-09 13:09:59 +08:00 committed by Frank Barchard
parent 942c508448
commit f7fc83f46d
13 changed files with 951 additions and 71 deletions

View File

@@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1775
Version: 1776
License: BSD
License File: LICENSE

View File

@@ -289,6 +289,32 @@ int NV21ToI420(const uint8_t* src_y,
int width,
int height);
// Convert NV12 to NV24.
LIBYUV_API
int NV12ToNV24(const uint8_t* src_y,
int src_stride_y,
const uint8_t* src_uv,
int src_stride_uv,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_uv,
int dst_stride_uv,
int width,
int height);
// Convert NV16 to NV24.
LIBYUV_API
int NV16ToNV24(const uint8_t* src_y,
int src_stride_y,
const uint8_t* src_uv,
int src_stride_uv,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_uv,
int dst_stride_uv,
int width,
int height);
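// For both functions above, the UV planes are interleaved (U0 V0 U1 V1 ...).
// The NV12 source UV plane is ((width + 1) / 2) UV pairs wide by
// ((height + 1) / 2) rows; the NV16 source UV plane is the same width but
// full height. The NV24 destination UV plane is width UV pairs by height
// rows, i.e. the same dimensions as the Y plane.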
// Convert YUY2 to I420.
LIBYUV_API
int YUY2ToI420(const uint8_t* src_yuy2,

View File

@@ -77,12 +77,14 @@ extern "C" {
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
#define HAS_SCALEUVROWDOWN2BOX_SSSE3
#define HAS_SCALECOLUP2LINEAR_SSE2
#define HAS_SCALECOLUP2LINEAR_SSSE3
#define HAS_SCALEROWUP2LINEAR_SSE2
#define HAS_SCALEROWUP2LINEAR_SSSE3
#define HAS_SCALECOLUP2LINEAR_16_SSE2
#define HAS_SCALEROWUP2BILINEAR_SSE2
#define HAS_SCALEROWUP2BILINEAR_SSSE3
#define HAS_SCALEROWUP2LINEAR_16_SSE2
#define HAS_SCALEROWUP2BILINEAR_16_SSE2
#define HAS_SCALEUVROWUP2LINEAR_SSSE3
#define HAS_SCALEUVROWUP2BILINEAR_SSSE3
#endif
// The following are available for gcc/clang x86 platforms, but
@@ -92,10 +94,12 @@ extern "C" {
(defined(__x86_64__) || defined(__i386__)) && !defined(_MSC_VER) && \
(defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
#define HAS_SCALEUVROWDOWN2BOX_AVX2
#define HAS_SCALECOLUP2LINEAR_AVX2
#define HAS_SCALEROWUP2LINEAR_AVX2
#define HAS_SCALECOLUP2LINEAR_16_AVX2
#define HAS_SCALEROWUP2BILINEAR_AVX2
#define HAS_SCALEROWUP2LINEAR_16_AVX2
#define HAS_SCALEROWUP2BILINEAR_16_AVX2
#define HAS_SCALEUVROWUP2LINEAR_AVX2
#define HAS_SCALEUVROWUP2BILINEAR_AVX2
#endif
// The following are available on all x86 platforms, but
@@ -124,10 +128,12 @@ extern "C" {
#define HAS_SCALEROWDOWN4_NEON
#define HAS_SCALEUVROWDOWN2BOX_NEON
#define HAS_SCALEUVROWDOWNEVEN_NEON
#define HAS_SCALECOLUP2LINEAR_NEON
#define HAS_SCALEROWUP2LINEAR_NEON
#define HAS_SCALECOLUP2LINEAR_16_NEON
#define HAS_SCALEROWUP2BILINEAR_NEON
#define HAS_SCALEROWUP2LINEAR_16_NEON
#define HAS_SCALEROWUP2BILINEAR_16_NEON
#define HAS_SCALEUVROWUP2LINEAR_NEON
#define HAS_SCALEUVROWUP2BILINEAR_NEON
#endif
#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
@@ -464,6 +470,24 @@ void ScaleUVRowDownEvenBox_C(const uint8_t* src_uv,
int src_stepx,
uint8_t* dst_uv,
int dst_width);
void ScaleUVRowUp2_Linear_C(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width);
void ScaleUVRowUp2_Bilinear_C(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleUVRowUp2_Linear_Any_C(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width);
void ScaleUVRowUp2_Bilinear_Any_C(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleUVCols_C(uint8_t* dst_uv,
const uint8_t* src_uv,
int dst_width,
@@ -1163,6 +1187,55 @@ void ScaleUVRowDownEvenBox_Any_MMI(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width);
void ScaleUVRowUp2_Linear_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width);
void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleUVRowUp2_Linear_Any_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width);
void ScaleUVRowUp2_Bilinear_Any_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleUVRowUp2_Linear_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width);
void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleUVRowUp2_Linear_Any_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width);
void ScaleUVRowUp2_Bilinear_Any_AVX2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleUVRowUp2_Linear_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width);
void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleUVRowUp2_Linear_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width);
void ScaleUVRowUp2_Bilinear_Any_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
// ScaleRowDown2Box also used by planar functions
// NEON downscalers with interpolation.

View File

@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1775
#define LIBYUV_VERSION 1776
#endif // INCLUDE_LIBYUV_VERSION_H_

View File

@@ -16,6 +16,7 @@
#include "libyuv/rotate.h"
#include "libyuv/row.h"
#include "libyuv/scale.h" // For ScalePlane()
#include "libyuv/scale_uv.h" // For UVScale()
#ifdef __cplusplus
namespace libyuv {
@@ -613,6 +614,55 @@ int NV21ToI420(const uint8_t* src_y,
width, height);
}
LIBYUV_API
int NV12ToNV24(const uint8_t* src_y,
int src_stride_y,
const uint8_t* src_uv,
int src_stride_uv,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_uv,
int dst_stride_uv,
int width,
int height) {
if (width == 0 || height == 0) {
return -1;
}
if (dst_y) {
ScalePlane(src_y, src_stride_y, width, height, dst_y, dst_stride_y,
Abs(width), Abs(height), kFilterBilinear);
}
UVScale(src_uv, src_stride_uv, SUBSAMPLE(width, 1, 1),
SUBSAMPLE(height, 1, 1), dst_uv, dst_stride_uv, Abs(width),
Abs(height), kFilterBilinear);
return 0;
}
LIBYUV_API
int NV16ToNV24(const uint8_t* src_y,
int src_stride_y,
const uint8_t* src_uv,
int src_stride_uv,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_uv,
int dst_stride_uv,
int width,
int height) {
if (width == 0 || height == 0) {
return -1;
}
if (dst_y) {
ScalePlane(src_y, src_stride_y, width, height, dst_y, dst_stride_y,
Abs(width), Abs(height), kFilterBilinear);
}
UVScale(src_uv, src_stride_uv, SUBSAMPLE(width, 1, 1), height, dst_uv,
dst_stride_uv, Abs(width), Abs(height), kFilterBilinear);
return 0;
}
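// Note on the source geometry above: SUBSAMPLE(v, a, s) rounds up before
// shifting, so SUBSAMPLE(width, 1, 1) == (width + 1) >> 1 for non-negative
// widths. E.g. a 641-pixel-wide NV12 frame carries 321 UV pairs per row,
// while NV16 passes the full height since its chroma is only subsampled
// horizontally.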
// Convert YUY2 to I420.
LIBYUV_API
int YUY2ToI420(const uint8_t* src_yuy2,

View File

@@ -1415,27 +1415,27 @@ void ScalePlaneUp2_Bilinear(int src_width,
// This function can only scale up by 2 times.
assert(src_width == ((dst_width + 1) / 2));
assert(src_height * 2 == dst_height || src_height * 2 == dst_height + 1);
assert(src_height == ((dst_height + 1) / 2));
#ifdef HAS_SCALEROWUP2LINEAR_SSE2
#ifdef HAS_SCALEROWUP2BILINEAR_SSE2
if (TestCpuFlag(kCpuHasSSE2)) {
Scale2RowUp = ScaleRowUp2_Bilinear_Any_SSE2;
}
#endif
#ifdef HAS_SCALEROWUP2LINEAR_SSSE3
#ifdef HAS_SCALEROWUP2BILINEAR_SSSE3
if (TestCpuFlag(kCpuHasSSSE3)) {
Scale2RowUp = ScaleRowUp2_Bilinear_Any_SSSE3;
}
#endif
#ifdef HAS_SCALEROWUP2LINEAR_AVX2
#ifdef HAS_SCALEROWUP2BILINEAR_AVX2
if (TestCpuFlag(kCpuHasAVX2)) {
Scale2RowUp = ScaleRowUp2_Bilinear_Any_AVX2;
}
#endif
#ifdef HAS_SCALEROWUP2LINEAR_NEON
#ifdef HAS_SCALEROWUP2BILINEAR_NEON
if (TestCpuFlag(kCpuHasNEON)) {
Scale2RowUp = ScaleRowUp2_Bilinear_Any_NEON;
}
@@ -1480,19 +1480,19 @@ void ScalePlaneUp2_16_Linear(int src_width,
// This function can only scale up by 2 times horizontally.
assert(src_width == ((dst_width + 1) / 2));
#ifdef HAS_SCALEROWUP2LINEAR_SSE2
#ifdef HAS_SCALEROWUP2LINEAR_16_SSE2
if (TestCpuFlag(kCpuHasSSE2)) {
ScaleRowUp = ScaleRowUp2_Linear_16_Any_SSE2;
}
#endif
#ifdef HAS_SCALEROWUP2LINEAR_AVX2
#ifdef HAS_SCALEROWUP2LINEAR_16_AVX2
if (TestCpuFlag(kCpuHasAVX2)) {
ScaleRowUp = ScaleRowUp2_Linear_16_Any_AVX2;
}
#endif
#ifdef HAS_SCALEROWUP2LINEAR_NEON
#ifdef HAS_SCALEROWUP2LINEAR_16_NEON
if (TestCpuFlag(kCpuHasNEON)) {
ScaleRowUp = ScaleRowUp2_Linear_16_Any_NEON;
}
@@ -1532,21 +1532,21 @@ void ScalePlaneUp2_16_Bilinear(int src_width,
// This function can only scale up by 2 times.
assert(src_width == ((dst_width + 1) / 2));
assert(src_height * 2 == dst_height || src_height * 2 == dst_height + 1);
assert(src_height == ((dst_height + 1) / 2));
#ifdef HAS_SCALEROWUP2LINEAR_SSE2
#ifdef HAS_SCALEROWUP2BILINEAR_16_SSE2
if (TestCpuFlag(kCpuHasSSE2)) {
Scale2RowUp = ScaleRowUp2_Bilinear_16_Any_SSE2;
}
#endif
#ifdef HAS_SCALEROWUP2LINEAR_AVX2
#ifdef HAS_SCALEROWUP2BILINEAR_16_AVX2
if (TestCpuFlag(kCpuHasAVX2)) {
Scale2RowUp = ScaleRowUp2_Bilinear_16_Any_AVX2;
}
#endif
#ifdef HAS_SCALEROWUP2LINEAR_NEON
#ifdef HAS_SCALEROWUP2BILINEAR_16_NEON
if (TestCpuFlag(kCpuHasNEON)) {
Scale2RowUp = ScaleRowUp2_Bilinear_16_Any_NEON;
}

View File

@@ -640,7 +640,7 @@ SUH2LANY(ScaleRowUp2_Linear_16_Any_C,
0,
uint16_t)
#ifdef HAS_SCALECOLUP2LINEAR_SSE2
#ifdef HAS_SCALEROWUP2LINEAR_SSE2
SUH2LANY(ScaleRowUp2_Linear_Any_SSE2,
ScaleRowUp2_Linear_SSE2,
ScaleRowUp2_Linear_C,
@@ -648,7 +648,7 @@ SUH2LANY(ScaleRowUp2_Linear_Any_SSE2,
uint8_t)
#endif
#ifdef HAS_SCALECOLUP2LINEAR_SSSE3
#ifdef HAS_SCALEROWUP2LINEAR_SSSE3
SUH2LANY(ScaleRowUp2_Linear_Any_SSSE3,
ScaleRowUp2_Linear_SSSE3,
ScaleRowUp2_Linear_C,
@@ -656,7 +656,7 @@ SUH2LANY(ScaleRowUp2_Linear_Any_SSSE3,
uint8_t)
#endif
#ifdef HAS_SCALECOLUP2LINEAR_16_SSE2
#ifdef HAS_SCALEROWUP2LINEAR_16_SSE2
SUH2LANY(ScaleRowUp2_Linear_16_Any_SSE2,
ScaleRowUp2_Linear_16_SSE2,
ScaleRowUp2_Linear_16_C,
@@ -664,7 +664,7 @@ SUH2LANY(ScaleRowUp2_Linear_16_Any_SSE2,
uint16_t)
#endif
#ifdef HAS_SCALECOLUP2LINEAR_AVX2
#ifdef HAS_SCALEROWUP2LINEAR_AVX2
SUH2LANY(ScaleRowUp2_Linear_Any_AVX2,
ScaleRowUp2_Linear_AVX2,
ScaleRowUp2_Linear_C,
@@ -672,7 +672,7 @@ SUH2LANY(ScaleRowUp2_Linear_Any_AVX2,
uint8_t)
#endif
#ifdef HAS_SCALECOLUP2LINEAR_16_AVX2
#ifdef HAS_SCALEROWUP2LINEAR_16_AVX2
SUH2LANY(ScaleRowUp2_Linear_16_Any_AVX2,
ScaleRowUp2_Linear_16_AVX2,
ScaleRowUp2_Linear_16_C,
@@ -680,7 +680,7 @@ SUH2LANY(ScaleRowUp2_Linear_16_Any_AVX2,
uint16_t)
#endif
#ifdef HAS_SCALECOLUP2LINEAR_NEON
#ifdef HAS_SCALEROWUP2LINEAR_NEON
SUH2LANY(ScaleRowUp2_Linear_Any_NEON,
ScaleRowUp2_Linear_NEON,
ScaleRowUp2_Linear_C,
@@ -688,7 +688,7 @@ SUH2LANY(ScaleRowUp2_Linear_Any_NEON,
uint8_t)
#endif
#ifdef HAS_SCALECOLUP2LINEAR_16_NEON
#ifdef HAS_SCALEROWUP2LINEAR_16_NEON
SUH2LANY(ScaleRowUp2_Linear_16_Any_NEON,
ScaleRowUp2_Linear_16_NEON,
ScaleRowUp2_Linear_16_C,
@@ -699,7 +699,7 @@ SUH2LANY(ScaleRowUp2_Linear_16_Any_NEON,
#undef SUH2LANY
// Scale up 2 times using bilinear filter.
// This function produces 2 rows at a time
// This function produces 2 rows at a time.
#define SU2BLANY(NAME, SIMD, C, MASK, PTYPE) \
void NAME(const PTYPE* src_ptr, ptrdiff_t src_stride, PTYPE* dst_ptr, \
ptrdiff_t dst_stride, int dst_width) { \
@@ -736,7 +736,7 @@ SU2BLANY(ScaleRowUp2_Bilinear_16_Any_C,
0,
uint16_t)
#ifdef HAS_SCALEROWUP2LINEAR_SSE2
#ifdef HAS_SCALEROWUP2BILINEAR_SSE2
SU2BLANY(ScaleRowUp2_Bilinear_Any_SSE2,
ScaleRowUp2_Bilinear_SSE2,
ScaleRowUp2_Bilinear_C,
@@ -744,7 +744,7 @@ SU2BLANY(ScaleRowUp2_Bilinear_Any_SSE2,
uint8_t)
#endif
#ifdef HAS_SCALECOLUP2LINEAR_16_SSE2
#ifdef HAS_SCALEROWUP2BILINEAR_16_SSE2
SU2BLANY(ScaleRowUp2_Bilinear_16_Any_SSE2,
ScaleRowUp2_Bilinear_16_SSE2,
ScaleRowUp2_Bilinear_16_C,
@@ -752,7 +752,7 @@ SU2BLANY(ScaleRowUp2_Bilinear_16_Any_SSE2,
uint16_t)
#endif
#ifdef HAS_SCALEROWUP2LINEAR_SSSE3
#ifdef HAS_SCALEROWUP2BILINEAR_SSSE3
SU2BLANY(ScaleRowUp2_Bilinear_Any_SSSE3,
ScaleRowUp2_Bilinear_SSSE3,
ScaleRowUp2_Bilinear_C,
@@ -760,7 +760,7 @@ SU2BLANY(ScaleRowUp2_Bilinear_Any_SSSE3,
uint8_t)
#endif
#ifdef HAS_SCALEROWUP2LINEAR_AVX2
#ifdef HAS_SCALEROWUP2BILINEAR_AVX2
SU2BLANY(ScaleRowUp2_Bilinear_Any_AVX2,
ScaleRowUp2_Bilinear_AVX2,
ScaleRowUp2_Bilinear_C,
@@ -768,7 +768,7 @@ SU2BLANY(ScaleRowUp2_Bilinear_Any_AVX2,
uint8_t)
#endif
#ifdef HAS_SCALEROWUP2LINEAR_16_AVX2
#ifdef HAS_SCALEROWUP2BILINEAR_16_AVX2
SU2BLANY(ScaleRowUp2_Bilinear_16_Any_AVX2,
ScaleRowUp2_Bilinear_16_AVX2,
ScaleRowUp2_Bilinear_16_C,
@@ -776,7 +776,7 @@ SU2BLANY(ScaleRowUp2_Bilinear_16_Any_AVX2,
uint16_t)
#endif
#ifdef HAS_SCALEROWUP2LINEAR_NEON
#ifdef HAS_SCALEROWUP2BILINEAR_NEON
SU2BLANY(ScaleRowUp2_Bilinear_Any_NEON,
ScaleRowUp2_Bilinear_NEON,
ScaleRowUp2_Bilinear_C,
@@ -784,7 +784,7 @@ SU2BLANY(ScaleRowUp2_Bilinear_Any_NEON,
uint8_t)
#endif
#ifdef HAS_SCALEROWUP2LINEAR_16_NEON
#ifdef HAS_SCALEROWUP2BILINEAR_16_NEON
SU2BLANY(ScaleRowUp2_Bilinear_16_Any_NEON,
ScaleRowUp2_Bilinear_16_NEON,
ScaleRowUp2_Bilinear_16_C,
@@ -794,6 +794,120 @@ SU2BLANY(ScaleRowUp2_Bilinear_16_Any_NEON,
#undef SU2BLANY
// Scale a bi-planar UV plane up horizontally by 2 times using a linear filter.
#define SBUH2LANY(NAME, SIMD, C, MASK, PTYPE) \
void NAME(const PTYPE* src_ptr, PTYPE* dst_ptr, int dst_width) { \
int work_width = (dst_width - 1) & ~1; \
int r = work_width & MASK; \
int n = work_width & ~MASK; \
dst_ptr[0] = src_ptr[0]; \
dst_ptr[1] = src_ptr[1]; \
if (work_width > 0) { \
if (n != 0) { \
SIMD(src_ptr, dst_ptr + 2, n); \
} \
C(src_ptr + n, dst_ptr + 2 * n + 2, r); \
} \
dst_ptr[2 * dst_width - 2] = src_ptr[((dst_width + 1) & ~1) - 2]; \
dst_ptr[2 * dst_width - 1] = src_ptr[((dst_width + 1) & ~1) - 1]; \
}
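// Worked example of the split: for the NEON variant (MASK == 7) with
// dst_width == 23 UV pairs, work_width = (23 - 1) & ~1 = 22, so the SIMD
// kernel produces n = 22 & ~7 = 16 pairs, the C kernel finishes r = 22 & 7
// = 6 pairs, and the first and last output pairs are written directly above.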
SBUH2LANY(ScaleUVRowUp2_Linear_Any_C,
ScaleUVRowUp2_Linear_C,
ScaleUVRowUp2_Linear_C,
0,
uint8_t)
#ifdef HAS_SCALEUVROWUP2LINEAR_SSSE3
SBUH2LANY(ScaleUVRowUp2_Linear_Any_SSSE3,
ScaleUVRowUp2_Linear_SSSE3,
ScaleUVRowUp2_Linear_C,
7,
uint8_t)
#endif
#ifdef HAS_SCALEUVROWUP2LINEAR_AVX2
SBUH2LANY(ScaleUVRowUp2_Linear_Any_AVX2,
ScaleUVRowUp2_Linear_AVX2,
ScaleUVRowUp2_Linear_C,
15,
uint8_t)
#endif
#ifdef HAS_SCALEUVROWUP2LINEAR_NEON
SBUH2LANY(ScaleUVRowUp2_Linear_Any_NEON,
ScaleUVRowUp2_Linear_NEON,
ScaleUVRowUp2_Linear_C,
7,
uint8_t)
#endif
#undef SBUH2LANY
// Scale a bi-planar UV plane up by 2 times using a bilinear filter.
// This function produces 2 rows at a time.
#define SBU2BLANY(NAME, SIMD, C, MASK, PTYPE) \
void NAME(const PTYPE* src_ptr, ptrdiff_t src_stride, PTYPE* dst_ptr, \
ptrdiff_t dst_stride, int dst_width) { \
int work_width = (dst_width - 1) & ~1; \
int r = work_width & MASK; \
int n = work_width & ~MASK; \
const PTYPE* sa = src_ptr; \
const PTYPE* sb = src_ptr + src_stride; \
PTYPE* da = dst_ptr; \
PTYPE* db = dst_ptr + dst_stride; \
da[0] = (3 * sa[0] + sb[0] + 2) >> 2; \
db[0] = (sa[0] + 3 * sb[0] + 2) >> 2; \
da[1] = (3 * sa[1] + sb[1] + 2) >> 2; \
db[1] = (sa[1] + 3 * sb[1] + 2) >> 2; \
if (work_width > 0) { \
if (n != 0) { \
SIMD(sa, sb - sa, da + 2, db - da, n); \
} \
C(sa + n, sb - sa, da + 2 * n + 2, db - da, r); \
} \
da[2 * dst_width - 2] = (3 * sa[((dst_width + 1) & ~1) - 2] + \
sb[((dst_width + 1) & ~1) - 2] + 2) >> 2; \
db[2 * dst_width - 2] = (sa[((dst_width + 1) & ~1) - 2] + \
3 * sb[((dst_width + 1) & ~1) - 2] + 2) >> 2; \
da[2 * dst_width - 1] = (3 * sa[((dst_width + 1) & ~1) - 1] + \
sb[((dst_width + 1) & ~1) - 1] + 2) >> 2; \
db[2 * dst_width - 1] = (sa[((dst_width + 1) & ~1) - 1] + \
3 * sb[((dst_width + 1) & ~1) - 1] + 2) >> 2; \
}
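// Note: the SIMD and C kernels are handed (sb - sa) and (db - da) as the
// source and destination strides, so a single two-row kernel invocation
// reads both source rows and writes both output rows; only the edge
// columns are blended in scalar code above.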
SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_C,
ScaleUVRowUp2_Bilinear_C,
ScaleUVRowUp2_Bilinear_C,
0,
uint8_t)
#ifdef HAS_SCALEUVROWUP2BILINEAR_SSSE3
SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_SSSE3,
ScaleUVRowUp2_Bilinear_SSSE3,
ScaleUVRowUp2_Bilinear_C,
7,
uint8_t)
#endif
#ifdef HAS_SCALEUVROWUP2BILINEAR_AVX2
SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_AVX2,
ScaleUVRowUp2_Bilinear_AVX2,
ScaleUVRowUp2_Bilinear_C,
15,
uint8_t)
#endif
#ifdef HAS_SCALEUVROWUP2BILINEAR_NEON
SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_NEON,
ScaleUVRowUp2_Bilinear_NEON,
ScaleUVRowUp2_Bilinear_C,
7,
uint8_t)
#endif
#undef SBU2BLANY
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv

View File

@@ -1200,6 +1200,56 @@ void ScaleUVRowDownEvenBox_C(const uint8_t* src_uv,
}
}
void ScaleUVRowUp2_Linear_C(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
int src_width = dst_width >> 1;
int x;
assert((dst_width % 2 == 0) && (dst_width >= 0));
for (x = 0; x < src_width; ++x) {
dst_ptr[4 * x + 0] =
(src_ptr[2 * x + 0] * 3 + src_ptr[2 * x + 2] * 1 + 2) >> 2;
dst_ptr[4 * x + 1] =
(src_ptr[2 * x + 1] * 3 + src_ptr[2 * x + 3] * 1 + 2) >> 2;
dst_ptr[4 * x + 2] =
(src_ptr[2 * x + 0] * 1 + src_ptr[2 * x + 2] * 3 + 2) >> 2;
dst_ptr[4 * x + 3] =
(src_ptr[2 * x + 1] * 1 + src_ptr[2 * x + 3] * 3 + 2) >> 2;
}
}
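// The weights implement (3 * near + far + 2) >> 2, i.e. output samples sit
// 1/4 and 3/4 of the way between source samples, with round-to-nearest.
// E.g. near = 100, far = 40: (300 + 40 + 2) >> 2 = 85 = 0.75*100 + 0.25*40.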
void ScaleUVRowUp2_Bilinear_C(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
const uint8_t* s = src_ptr;
const uint8_t* t = src_ptr + src_stride;
uint8_t* d = dst_ptr;
uint8_t* e = dst_ptr + dst_stride;
int src_width = dst_width >> 1;
int x;
assert((dst_width % 2 == 0) && (dst_width >= 0));
for (x = 0; x < src_width; ++x) {
d[4 * x + 0] = (s[2 * x + 0] * 9 + s[2 * x + 2] * 3 + t[2 * x + 0] * 3 +
t[2 * x + 2] * 1 + 8) >> 4;
d[4 * x + 1] = (s[2 * x + 1] * 9 + s[2 * x + 3] * 3 + t[2 * x + 1] * 3 +
t[2 * x + 3] * 1 + 8) >> 4;
d[4 * x + 2] = (s[2 * x + 0] * 3 + s[2 * x + 2] * 9 + t[2 * x + 0] * 1 +
t[2 * x + 2] * 3 + 8) >> 4;
d[4 * x + 3] = (s[2 * x + 1] * 3 + s[2 * x + 3] * 9 + t[2 * x + 1] * 1 +
t[2 * x + 3] * 3 + 8) >> 4;
e[4 * x + 0] = (s[2 * x + 0] * 3 + s[2 * x + 2] * 1 + t[2 * x + 0] * 9 +
t[2 * x + 2] * 3 + 8) >> 4;
e[4 * x + 1] = (s[2 * x + 1] * 3 + s[2 * x + 3] * 1 + t[2 * x + 1] * 9 +
t[2 * x + 3] * 3 + 8) >> 4;
e[4 * x + 2] = (s[2 * x + 0] * 1 + s[2 * x + 2] * 3 + t[2 * x + 0] * 3 +
t[2 * x + 2] * 9 + 8) >> 4;
e[4 * x + 3] = (s[2 * x + 1] * 1 + s[2 * x + 3] * 3 + t[2 * x + 1] * 3 +
t[2 * x + 3] * 9 + 8) >> 4;
}
}
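// The 2D weights are the outer product of the 1D (3, 1)/4 kernel with
// itself: (9, 3, 3, 1)/16, rounded via the +8 term. E.g. s0 = 100, s1 = 40,
// t0 = 60, t1 = 20 gives (900 + 120 + 180 + 20 + 8) >> 4 = 76 for d[0].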
// Scales a single row of pixels using point sampling.
void ScaleUVCols_C(uint8_t* dst_uv,
const uint8_t* src_uv,

View File

@@ -779,7 +779,7 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
"xmm7");
}
#ifdef HAS_SCALECOLUP2LINEAR_SSE2
#ifdef HAS_SCALEROWUP2LINEAR_SSE2
void ScaleRowUp2_Linear_SSE2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
@@ -833,7 +833,7 @@ void ScaleRowUp2_Linear_SSE2(const uint8_t* src_ptr,
}
#endif
#ifdef HAS_SCALEROWUP2LINEAR_SSE2
#ifdef HAS_SCALEROWUP2BILINEAR_SSE2
void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
@@ -949,7 +949,7 @@ void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr,
}
#endif
#ifdef HAS_SCALECOLUP2LINEAR_16_SSE2
#ifdef HAS_SCALEROWUP2LINEAR_16_SSE2
void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
@@ -999,7 +999,7 @@ void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr,
}
#endif
#ifdef HAS_SCALEROWUP2LINEAR_16_SSE2
#ifdef HAS_SCALEROWUP2BILINEAR_16_SSE2
void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
@@ -1106,7 +1106,7 @@ void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr,
}
#endif
#ifdef HAS_SCALECOLUP2LINEAR_SSSE3
#ifdef HAS_SCALEROWUP2LINEAR_SSSE3
static const uvec8 kLinearMadd31_SSSE3 = {3, 1, 1, 3, 3, 1, 1, 3,
3, 1, 1, 3, 3, 1, 1, 3};
@@ -1149,7 +1149,7 @@ void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr,
}
#endif
#ifdef HAS_SCALEROWUP2LINEAR_SSSE3
#ifdef HAS_SCALEROWUP2BILINEAR_SSSE3
void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
@@ -1236,7 +1236,7 @@ void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
}
#endif
#ifdef HAS_SCALECOLUP2LINEAR_AVX2
#ifdef HAS_SCALEROWUP2LINEAR_AVX2
static const lvec8 kLinearMadd31_AVX2 = {3, 1, 1, 3, 3, 1, 1, 3, 3, 1, 1,
3, 3, 1, 1, 3, 3, 1, 1, 3, 3, 1,
1, 3, 3, 1, 1, 3, 3, 1, 1, 3};
@@ -1281,7 +1281,7 @@ void ScaleRowUp2_Linear_AVX2(const uint8_t* src_ptr,
}
#endif
#ifdef HAS_SCALEROWUP2LINEAR_AVX2
#ifdef HAS_SCALEROWUP2BILINEAR_AVX2
void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
@@ -1364,7 +1364,7 @@ void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
}
#endif
#ifdef HAS_SCALECOLUP2LINEAR_16_AVX2
#ifdef HAS_SCALEROWUP2LINEAR_16_AVX2
static const lvec16 kLinearMadd31_16_AVX2 = {3, 1, 1, 3, 3, 1, 1, 3,
3, 1, 1, 3, 3, 1, 1, 3};
@@ -1450,7 +1450,7 @@ void ScaleRowUp2_Linear_16_AVX2_Full(const uint16_t* src_ptr,
}
#endif
#ifdef HAS_SCALEROWUP2LINEAR_16_AVX2
#ifdef HAS_SCALEROWUP2BILINEAR_16_AVX2
void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
@@ -2261,6 +2261,257 @@ void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr,
}
#endif // HAS_SCALEUVROWDOWN2BOX_AVX2
#ifdef HAS_SCALEUVROWUP2LINEAR_SSSE3
static const uvec8 kUVLinearMadd31_SSSE3 = {3, 1, 3, 1, 1, 3, 1, 3,
3, 1, 3, 1, 1, 3, 1, 3};
void ScaleUVRowUp2_Linear_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
"pcmpeqw %%xmm4,%%xmm4 \n"
"psrlw $15,%%xmm4 \n"
"psllw $1,%%xmm4 \n" // all 2
"movdqu %3,%%xmm3 \n"
LABELALIGN
"1: \n"
"movq (%0),%%xmm0 \n" // 00112233 (1u1v)
"movq 2(%0),%%xmm1 \n" // 11223344 (1u1v)
"punpcklbw %%xmm1,%%xmm0 \n" // 0101121223233434 (2u2v)
"movdqa %%xmm0,%%xmm2 \n"
"punpckhdq %%xmm0,%%xmm2 \n" // 2323232334343434 (2u2v)
"punpckldq %%xmm0,%%xmm0 \n" // 0101010112121212 (2u2v)
"pmaddubsw %%xmm3,%%xmm2 \n" // 3*near+far (1u1v16, hi)
"pmaddubsw %%xmm3,%%xmm0 \n" // 3*near+far (1u1v16, lo)
"paddw %%xmm4,%%xmm0 \n" // 3*near+far+2 (lo)
"paddw %%xmm4,%%xmm2 \n" // 3*near+far+2 (hi)
"psrlw $2,%%xmm0 \n" // 3/4*near+1/4*far (lo)
"psrlw $2,%%xmm2 \n" // 3/4*near+1/4*far (hi)
"vpackuswb %%xmm2,%%xmm0,%%xmm0 \n"
"vmovdqu %%xmm0,(%1) \n"
"lea 0x8(%0),%0 \n"
"lea 0x10(%1),%1 \n" // 4 uv to 8 uv
"sub $0x8,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "m"(kUVLinearMadd31_SSSE3) // %3
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif
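// A note on kUVLinearMadd31_SSSE3 above: after the punpcklbw/punpckldq
// shuffles, each 16-bit lane of xmm0/xmm2 holds a (near, far) byte pair for
// one channel, so pmaddubsw with the alternating 3,1 / 1,3 pattern computes
// 3*near + far for the even output pairs and near + 3*far for the odd ones
// in a single instruction.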
#ifdef HAS_SCALEUVROWUP2BILINEAR_SSSE3
void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile(
"pcmpeqw %%xmm6,%%xmm6 \n"
"psrlw $15,%%xmm6 \n"
"psllw $3,%%xmm6 \n" // all 8
"movdqu %5,%%xmm7 \n"
LABELALIGN
"1: \n"
"movq (%0),%%xmm0 \n" // 00112233 (1u1v)
"movq 2(%0),%%xmm1 \n" // 11223344 (1u1v)
"punpcklbw %%xmm1,%%xmm0 \n" // 0101121223233434 (2u2v)
"movdqa %%xmm0,%%xmm2 \n"
"punpckhdq %%xmm0,%%xmm2 \n" // 2323232334343434 (2u2v)
"punpckldq %%xmm0,%%xmm0 \n" // 0101010112121212 (2u2v)
"pmaddubsw %%xmm7,%%xmm2 \n" // 3*near+far (1u1v16, hi)
"pmaddubsw %%xmm7,%%xmm0 \n" // 3*near+far (1u1v16, lo)
"movq (%0,%3),%%xmm1 \n"
"movq 2(%0,%3),%%xmm4 \n"
"punpcklbw %%xmm4,%%xmm1 \n"
"movdqa %%xmm1,%%xmm3 \n"
"punpckhdq %%xmm1,%%xmm3 \n"
"punpckldq %%xmm1,%%xmm1 \n"
"pmaddubsw %%xmm7,%%xmm3 \n" // 3*near+far (2, hi)
"pmaddubsw %%xmm7,%%xmm1 \n" // 3*near+far (2, lo)
// xmm0 xmm2
// xmm1 xmm3
"movdqa %%xmm0,%%xmm4 \n"
"movdqa %%xmm1,%%xmm5 \n"
"paddw %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo)
"paddw %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo)
"paddw %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo)
"paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo)
"psrlw $4,%%xmm4 \n" // ^ div by 16 (1, lo)
"movdqa %%xmm1,%%xmm5 \n"
"paddw %%xmm1,%%xmm5 \n" // 6*near+2*far (2, lo)
"paddw %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo)
"paddw %%xmm1,%%xmm5 \n" // 9*near+3*far (2, lo)
"paddw %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo)
"psrlw $4,%%xmm5 \n" // ^ div by 16 (2, lo)
"movdqa %%xmm2,%%xmm0 \n"
"movdqa %%xmm3,%%xmm1 \n"
"paddw %%xmm2,%%xmm0 \n" // 6*near+2*far (1, hi)
"paddw %%xmm6,%%xmm1 \n" // 3*near+far+8 (2, hi)
"paddw %%xmm2,%%xmm0 \n" // 9*near+3*far (1, hi)
"paddw %%xmm1,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi)
"psrlw $4,%%xmm0 \n" // ^ div by 16 (1, hi)
"movdqa %%xmm3,%%xmm1 \n"
"paddw %%xmm3,%%xmm1 \n" // 6*near+2*far (2, hi)
"paddw %%xmm6,%%xmm2 \n" // 3*near+far+8 (1, hi)
"paddw %%xmm3,%%xmm1 \n" // 9*near+3*far (2, hi)
"paddw %%xmm2,%%xmm1 \n" // 9 3 3 1 + 8 (2, hi)
"psrlw $4,%%xmm1 \n" // ^ div by 16 (2, hi)
"packuswb %%xmm0,%%xmm4 \n"
"movdqu %%xmm4,(%1) \n" // store above
"packuswb %%xmm1,%%xmm5 \n"
"movdqu %%xmm5,(%1,%4) \n" // store below
"lea 0x8(%0),%0 \n"
"lea 0x10(%1),%1 \n" // 4 uv to 8 uv
"sub $0x8,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)), // %3
"r"((intptr_t)(dst_stride)), // %4
"m"(kUVLinearMadd31_SSSE3) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
#endif
#ifdef HAS_SCALEUVROWUP2LINEAR_AVX2
static const lvec8 kUVLinearMadd31_AVX2 = {3, 1, 3, 1, 1, 3, 1, 3, 3, 1, 3,
1, 1, 3, 1, 3, 3, 1, 3, 1, 1, 3,
1, 3, 3, 1, 3, 1, 1, 3, 1, 3};
void ScaleUVRowUp2_Linear_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
"vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $15,%%ymm4,%%ymm4 \n"
"vpsllw $1,%%ymm4,%%ymm4 \n" // all 2
"vmovdqu %3,%%ymm3 \n"
LABELALIGN
"1: \n"
"vmovdqu (%0),%%xmm0 \n"
"vmovdqu 2(%0),%%xmm1 \n"
"vpermq $0b11011000,%%ymm0,%%ymm0 \n"
"vpermq $0b11011000,%%ymm1,%%ymm1 \n"
"vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n"
"vpunpckhdq %%ymm0,%%ymm0,%%ymm2 \n"
"vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n"
"vpmaddubsw %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi)
"vpmaddubsw %%ymm3,%%ymm0,%%ymm0 \n" // 3*near+far (lo)
"vpaddw %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2 (lo)
"vpaddw %%ymm4,%%ymm1,%%ymm1 \n" // 3*near+far+2 (hi)
"vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo)
"vpsrlw $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi)
"vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
"vmovdqu %%ymm0,(%1) \n"
"lea 0x10(%0),%0 \n"
"lea 0x20(%1),%1 \n" // 8 uv to 16 uv
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "m"(kUVLinearMadd31_AVX2) // %3
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
#endif
#ifdef HAS_SCALEUVROWUP2BILINEAR_AVX2
void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile(
"vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n"
"vpsrlw $15,%%ymm6,%%ymm6 \n"
"vpsllw $3,%%ymm6,%%ymm6 \n" // all 8
"vmovdqu %5,%%ymm7 \n"
LABELALIGN
"1: \n"
"vmovdqu (%0),%%xmm0 \n"
"vmovdqu 2(%0),%%xmm1 \n"
"vpermq $0b11011000,%%ymm0,%%ymm0 \n"
"vpermq $0b11011000,%%ymm1,%%ymm1 \n"
"vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n"
"vpunpckhdq %%ymm0,%%ymm0,%%ymm2 \n"
"vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n"
"vpmaddubsw %%ymm7,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi)
"vpmaddubsw %%ymm7,%%ymm0,%%ymm0 \n" // 3*near+far (1, lo)
"vmovdqu (%0,%3),%%xmm2 \n" // 0123456789ABCDEF
"vmovdqu 2(%0,%3),%%xmm3 \n" // 123456789ABCDEF0
"vpermq $0b11011000,%%ymm2,%%ymm2 \n"
"vpermq $0b11011000,%%ymm3,%%ymm3 \n"
"vpunpcklbw %%ymm3,%%ymm2,%%ymm2 \n"
"vpunpckhdq %%ymm2,%%ymm2,%%ymm4 \n"
"vpunpckldq %%ymm2,%%ymm2,%%ymm2 \n"
"vpmaddubsw %%ymm7,%%ymm4,%%ymm3 \n" // 3*near+far (2, hi)
"vpmaddubsw %%ymm7,%%ymm2,%%ymm2 \n" // 3*near+far (2, lo)
// ymm0 ymm1
// ymm2 ymm3
"vpaddw %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo)
"vpaddw %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo)
"vpaddw %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo)
"vpaddw %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo)
"vpsrlw $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo)
"vpaddw %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo)
"vpaddw %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo)
"vpaddw %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo)
"vpaddw %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo)
"vpsrlw $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo)
"vpaddw %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi)
"vpaddw %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi)
"vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi)
"vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi)
"vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi)
"vpaddw %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi)
"vpaddw %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi)
"vpaddw %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi)
"vpaddw %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi)
"vpsrlw $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi)
"vpackuswb %%ymm0,%%ymm4,%%ymm4 \n"
"vmovdqu %%ymm4,(%1) \n" // store above
"vpackuswb %%ymm2,%%ymm5,%%ymm5 \n"
"vmovdqu %%ymm5,(%1,%4) \n" // store below
"lea 0x10(%0),%0 \n"
"lea 0x20(%1),%1 \n" // 8 uv to 16 uv
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)), // %3
"r"((intptr_t)(dst_stride)), // %4
"m"(kUVLinearMadd31_AVX2) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
#endif
#endif // defined(__x86_64__) || defined(__i386__)
#ifdef __cplusplus

View File

@@ -509,20 +509,19 @@ void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr,
int dst_width) {
const uint8_t* src_temp = src_ptr + 1;
asm volatile(
"vmov.u16 q15, #3 \n"
"vmov.u8 d30, #3 \n"
"1: \n"
"vld1.8 {d0}, [%0]! \n" // 01234567
"vld1.8 {d2}, [%3]! \n" // 12345678
"vld1.8 {d4}, [%0]! \n" // 01234567
"vld1.8 {d5}, [%3]! \n" // 12345678
"vmovl.u8 q0, d0 \n" // 01234567 (16b)
"vmovl.u8 q1, d2 \n" // 12345678 (16b)
"vmovq q2, q0 \n"
"vmla.u16 q2, q1, q15 \n" // 3*near+far (odd)
"vmla.u16 q1, q0, q15 \n" // 3*near+far (even)
"vmovl.u8 q0, d4 \n" // 01234567 (16b)
"vmovl.u8 q1, d5 \n" // 12345678 (16b)
"vmlal.u8 q0, d5, d30 \n" // 3*near+far (odd)
"vmlal.u8 q1, d4, d30 \n" // 3*near+far (even)
"vrshrn.u16 d0, q1, #2 \n" // 3/4*near+1/4*far (odd)
"vrshrn.u16 d1, q2, #2 \n" // 3/4*near+1/4*far (even)
"vrshrn.u16 d1, q0, #2 \n" // 3/4*near+1/4*far (odd)
"vrshrn.u16 d0, q1, #2 \n" // 3/4*near+1/4*far (even)
"vst2.8 {d0, d1}, [%1]! \n" // store
"subs %2, %2, #16 \n" // 8 sample -> 16 sample
@@ -548,25 +547,24 @@ void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
asm volatile(
"vmov.u16 q15, #3 \n"
"vmov.u8 d28, #3 \n"
"1: \n"
"vld1.8 {d0}, [%0]! \n" // 01234567
"vld1.8 {d2}, [%5]! \n" // 12345678
"vld1.8 {d4}, [%0]! \n" // 01234567
"vld1.8 {d5}, [%5]! \n" // 12345678
"vmovl.u8 q0, d0 \n" // 01234567 (16b)
"vmovl.u8 q1, d2 \n" // 12345678 (16b)
"vmovq q2, q0 \n"
"vmla.u16 q0, q1, q15 \n" // 3*near+far (1, odd)
"vmla.u16 q1, q2, q15 \n" // 3*near+far (1, even)
"vmovl.u8 q0, d4 \n" // 01234567 (16b)
"vmovl.u8 q1, d5 \n" // 12345678 (16b)
"vmlal.u8 q0, d5, d28 \n" // 3*near+far (1, odd)
"vmlal.u8 q1, d4, d28 \n" // 3*near+far (1, even)
"vld1.8 {d4}, [%1]! \n" // 01234567
"vld1.8 {d6}, [%6]! \n" // 12345678
"vld1.8 {d8}, [%1]! \n"
"vld1.8 {d9}, [%6]! \n"
"vmovl.u8 q2, d4 \n" // 01234567 (16b)
"vmovl.u8 q3, d6 \n" // 12345678 (16b)
"vmovq q4, q2 \n"
"vmla.u16 q2, q3, q15 \n" // 3*near+far (2, odd)
"vmla.u16 q3, q4, q15 \n" // 3*near+far (2, even)
"vmovl.u8 q2, d8 \n"
"vmovl.u8 q3, d9 \n"
"vmlal.u8 q2, d9, d28 \n" // 3*near+far (2, odd)
"vmlal.u8 q3, d8, d28 \n" // 3*near+far (2, even)
// e o
// q1 q0
@@ -600,7 +598,7 @@ void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
"+r"(src_temp), // %5
"+r"(src_temp1) // %6
:
: "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5",
: "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "d28",
"q15" // Clobber List
);
}
@@ -694,6 +692,105 @@ void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
);
}
void ScaleUVRowUp2_Linear_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
const uint8_t* src_temp = src_ptr + 2;
asm volatile(
"vmov.u8 d30, #3 \n"
"1: \n"
"vld1.8 {d4}, [%0]! \n" // 00112233 (1u1v)
"vld1.8 {d5}, [%3]! \n" // 11223344 (1u1v)
"vmovl.u8 q0, d4 \n" // 00112233 (1u1v, 16b)
"vmovl.u8 q1, d5 \n" // 11223344 (1u1v, 16b)
"vmlal.u8 q0, d5, d30 \n" // 3*near+far (odd)
"vmlal.u8 q1, d4, d30 \n" // 3*near+far (even)
"vrshrn.u16 d1, q0, #2 \n" // 3/4*near+1/4*far (odd)
"vrshrn.u16 d0, q1, #2 \n" // 3/4*near+1/4*far (even)
"vst2.16 {d0, d1}, [%1]! \n" // store
"subs %2, %2, #8 \n" // 4 uv -> 8 uv
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
"+r"(src_temp) // %3
:
: "memory", "cc", "q0", "q1", "q2", "d30" // Clobber List
);
}
void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
const uint8_t* src_ptr1 = src_ptr + src_stride;
uint8_t* dst_ptr1 = dst_ptr + dst_stride;
const uint8_t* src_temp = src_ptr + 2;
const uint8_t* src_temp1 = src_ptr1 + 2;
asm volatile(
"vmov.u16 q15, #3 \n"
"vmov.u8 d28, #3 \n"
"1: \n"
"vld1.8 {d4}, [%0]! \n" // 00112233 (1u1v)
"vld1.8 {d5}, [%5]! \n" // 11223344 (1u1v)
"vmovl.u8 q0, d4 \n" // 00112233 (1u1v, 16b)
"vmovl.u8 q1, d5 \n" // 11223344 (1u1v, 16b)
"vmlal.u8 q0, d5, d28 \n" // 3*near+far (1, odd)
"vmlal.u8 q1, d4, d28 \n" // 3*near+far (1, even)
"vld1.8 {d8}, [%1]! \n" // 00112233 (1u1v)
"vld1.8 {d9}, [%6]! \n" // 11223344 (1u1v)
"vmovl.u8 q2, d8 \n" // 00112233 (1u1v, 16b)
"vmovl.u8 q3, d9 \n" // 11223344 (1u1v, 16b)
"vmlal.u8 q2, d9, d28 \n" // 3*near+far (2, odd)
"vmlal.u8 q3, d8, d28 \n" // 3*near+far (2, even)
// e o
// q1 q0
// q3 q2
"vmovq q4, q2 \n"
"vmovq q5, q3 \n"
"vmla.u16 q4, q0, q15 \n" // 9 3 3 1 (1, odd)
"vmla.u16 q5, q1, q15 \n" // 9 3 3 1 (1, even)
"vmla.u16 q0, q2, q15 \n" // 9 3 3 1 (2, odd)
"vmla.u16 q1, q3, q15 \n" // 9 3 3 1 (2, even)
// e o
// q5 q4
// q1 q0
"vrshrn.u16 d2, q1, #4 \n" // 2, even
"vrshrn.u16 d3, q0, #4 \n" // 2, odd
"vrshrn.u16 d0, q5, #4 \n" // 1, even
"vrshrn.u16 d1, q4, #4 \n" // 1, odd
"vst2.16 {d0, d1}, [%2]! \n" // store
"vst2.16 {d2, d3}, [%3]! \n" // store
"subs %4, %4, #8 \n" // 4 uv -> 8 uv
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_ptr1), // %1
"+r"(dst_ptr), // %2
"+r"(dst_ptr1), // %3
"+r"(dst_width), // %4
"+r"(src_temp), // %5
"+r"(src_temp1) // %6
:
: "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "d28",
"q15" // Clobber List
);
}
// Add a row of bytes to a row of shorts. Used for box filter.
// Reads 16 bytes and accumulates to 16 shorts at a time.
void ScaleAddRow_NEON(const uint8_t* src_ptr,

View File

@@ -721,6 +721,101 @@ void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
);
}
void ScaleUVRowUp2_Linear_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
const uint8_t* src_temp = src_ptr + 2;
asm volatile(
"movi v31.8b, #3 \n"
"1: \n"
"ldr d0, [%0], #8 \n" // 00112233 (1u1v)
"ldr d1, [%1], #8 \n" // 11223344 (1u1v)
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"ushll v2.8h, v0.8b, #0 \n" // 00112233 (1u1v, 16b)
"ushll v3.8h, v1.8b, #0 \n" // 11223344 (1u1v, 16b)
"umlal v2.8h, v1.8b, v31.8b \n" // 3*near+far (odd)
"umlal v3.8h, v0.8b, v31.8b \n" // 3*near+far (even)
"rshrn v2.8b, v2.8h, #2 \n" // 3/4*near+1/4*far (odd)
"rshrn v1.8b, v3.8h, #2 \n" // 3/4*near+1/4*far (even)
"st2 {v1.4h, v2.4h}, [%2], #16 \n" // store
"subs %w3, %w3, #8 \n" // 4 uv -> 8 uv
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_temp), // %1
"+r"(dst_ptr), // %2
"+r"(dst_width) // %3
:
: "memory", "cc", "v0", "v1", "v2", "v3", "v31" // Clobber List
);
}
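// The even and odd results above are kept in separate vectors, and
// "st2 {v1.4h, v2.4h}" re-interleaves them on store with 16-bit elements,
// so each element carries one whole UV pair and the U/V bytes never cross
// channels.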
void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
const uint8_t* src_ptr1 = src_ptr + src_stride;
uint8_t* dst_ptr1 = dst_ptr + dst_stride;
const uint8_t* src_temp = src_ptr + 2;
const uint8_t* src_temp1 = src_ptr1 + 2;
asm volatile(
"movi v31.8b, #3 \n"
"movi v30.8h, #3 \n"
"1: \n"
"ldr d0, [%0], #8 \n"
"ldr d1, [%2], #8 \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"ushll v2.8h, v0.8b, #0 \n"
"ushll v3.8h, v1.8b, #0 \n"
"umlal v2.8h, v1.8b, v31.8b \n" // 3*near+far (1, odd)
"umlal v3.8h, v0.8b, v31.8b \n" // 3*near+far (1, even)
"ldr d0, [%1], #8 \n"
"ldr d1, [%3], #8 \n"
"prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
"ushll v4.8h, v0.8b, #0 \n"
"ushll v5.8h, v1.8b, #0 \n"
"umlal v4.8h, v1.8b, v31.8b \n" // 3*near+far (2, odd)
"umlal v5.8h, v0.8b, v31.8b \n" // 3*near+far (2, even)
"mov v0.8h, v4.8h \n"
"mov v1.8h, v5.8h \n"
"mla v4.8h, v2.8h, v30.8h \n" // 9 3 3 1 (1, odd)
"mla v5.8h, v3.8h, v30.8h \n" // 9 3 3 1 (1, even)
"mla v2.8h, v0.8h, v30.8h \n" // 9 3 3 1 (2, odd)
"mla v3.8h, v1.8h, v30.8h \n" // 9 3 3 1 (2, even)
"rshrn v2.8b, v2.8h, #4 \n" // 2, odd
"rshrn v1.8b, v3.8h, #4 \n" // 2, even
"rshrn v4.8b, v4.8h, #4 \n" // 1, odd
"rshrn v3.8b, v5.8h, #4 \n" // 1, even
"st2 {v1.4h, v2.4h}, [%5], #16 \n" // store 1
"st2 {v3.4h, v4.4h}, [%4], #16 \n" // store 2
"subs %w6, %w6, #8 \n" // 4 uv -> 8 uv
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_ptr1), // %1
"+r"(src_temp), // %2
"+r"(src_temp1), // %3
"+r"(dst_ptr), // %4
"+r"(dst_ptr1), // %5
"+r"(dst_width) // %6
:
: "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v30",
"v31" // Clobber List
);
}
// Add a row of bytes to a row of shorts. Used for box filter.
// Reads 16 bytes and accumulates to 16 shorts at a time.
void ScaleAddRow_NEON(const uint8_t* src_ptr,

View File

@@ -649,6 +649,116 @@ static void ScaleUVBilinearUp(int src_width,
}
#endif // HAS_SCALEUVBILINEARUP
// Scale UV, horizontally up by 2 times.
// Uses linear filter horizontally, nearest vertically.
// This is an optimized version for scaling up a plane to twice its
// original width, using linear interpolation.
// This is used to scale the U and V planes of NV16 to NV24.
void ScaleUVLinearUp2(int src_width,
int src_height,
int dst_width,
int dst_height,
int src_stride,
int dst_stride,
const uint8_t* src_uv,
uint8_t* dst_uv) {
void (*ScaleRowUp)(const uint8_t* src_uv, uint8_t* dst_uv, int dst_width) =
ScaleUVRowUp2_Linear_Any_C;
int i;
int y;
int dy;
// This function can only scale up by 2 times horizontally.
assert(src_width == ((dst_width + 1) / 2));
#ifdef HAS_SCALEUVROWUP2LINEAR_SSSE3
if (TestCpuFlag(kCpuHasSSSE3)) {
ScaleRowUp = ScaleUVRowUp2_Linear_Any_SSSE3;
}
#endif
#ifdef HAS_SCALEUVROWUP2LINEAR_AVX2
if (TestCpuFlag(kCpuHasAVX2)) {
ScaleRowUp = ScaleUVRowUp2_Linear_Any_AVX2;
}
#endif
#ifdef HAS_SCALEUVROWUP2LINEAR_NEON
if (TestCpuFlag(kCpuHasNEON)) {
ScaleRowUp = ScaleUVRowUp2_Linear_Any_NEON;
}
#endif
if (dst_height == 1) {
ScaleRowUp(src_uv + ((src_height - 1) / 2) * src_stride, dst_uv,
dst_width);
} else {
dy = FixedDiv(src_height - 1, dst_height - 1);
y = (1 << 15) - 1;
for (i = 0; i < dst_height; ++i) {
ScaleRowUp(src_uv + (y >> 16) * src_stride, dst_uv, dst_width);
dst_uv += dst_stride;
y += dy;
}
}
}
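// Vertical stepping above is 16.16 fixed point: FixedDiv(src_height - 1,
// dst_height - 1) maps the last source row to the last destination row, and
// starting y at (1 << 15) - 1 (just under 0.5) rounds row selection to the
// nearest source row. E.g. 4 rows -> 7 rows: dy = (3 << 16) / 6 = 0x8000,
// so y >> 16 visits source rows 0, 0, 1, 1, 2, 2, 3.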
// Scale UV plane up by 2 times.
// This is an optimized version for scaling up a plane to twice its
// original size, using bilinear interpolation.
// This is used to scale the U and V planes of NV12 to NV24.
void ScaleUVBilinearUp2(int src_width,
int src_height,
int dst_width,
int dst_height,
int src_stride,
int dst_stride,
const uint8_t* src_ptr,
uint8_t* dst_ptr) {
void (*Scale2RowUp)(const uint8_t* src_ptr, ptrdiff_t src_stride,
uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) =
ScaleUVRowUp2_Bilinear_Any_C;
int x;
// This function can only scale up by 2 times.
assert(src_width == ((dst_width + 1) / 2));
assert(src_height == ((dst_height + 1) / 2));
#ifdef HAS_SCALEUVROWUP2BILINEAR_SSSE3
if (TestCpuFlag(kCpuHasSSSE3)) {
Scale2RowUp = ScaleUVRowUp2_Bilinear_Any_SSSE3;
}
#endif
#ifdef HAS_SCALEUVROWUP2BILINEAR_AVX2
if (TestCpuFlag(kCpuHasAVX2)) {
Scale2RowUp = ScaleUVRowUp2_Bilinear_Any_AVX2;
}
#endif
#ifdef HAS_SCALEUVROWUP2BILINEAR_NEON
if (TestCpuFlag(kCpuHasNEON)) {
Scale2RowUp = ScaleUVRowUp2_Bilinear_Any_NEON;
}
#endif
if (src_height == 1) {
Scale2RowUp(src_ptr, 0, dst_ptr, dst_stride, dst_width);
} else {
Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
dst_ptr += dst_stride;
for (x = 0; x < src_height - 1; ++x) {
Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width);
src_ptr += src_stride;
// TODO: Test performance of writing one row of destination at a time.
dst_ptr += 2 * dst_stride;
}
if (!(dst_height & 1)) {
Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
}
}
}
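// Row bookkeeping above: the first call passes stride 0 so both filter taps
// read source row 0 and both outputs land on destination row 0 (a seed row);
// each loop iteration blends one source row pair into two destination rows;
// and when dst_height is even, a final stride-0 call replicates the last
// source row. Total: 1 + 2*(src_height - 1) (+1 if even) = dst_height rows.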
// Scale UV to/from any dimensions, without interpolation.
// Fixed point math is used for performance: The upper 16 bits
// of x and dx is the integer part of the source position and
@@ -844,6 +954,18 @@ static void ScaleUV(const uint8_t* src,
dst_stride, src, dst, x, y, dy, 4, filtering);
return;
}
if (filtering && (clip_width + 1) / 2 == src_width &&
src_height == clip_height) {
ScaleUVLinearUp2(src_width, src_height, clip_width, clip_height, src_stride,
dst_stride, src, dst);
return;
}
if ((clip_height + 1) / 2 == src_height &&
(clip_width + 1) / 2 == src_width &&
(filtering == kFilterBilinear || filtering == kFilterBox)) {
ScaleUVBilinearUp2(src_width, src_height, clip_width, clip_height,
src_stride, dst_stride, src, dst);
return;
}
#if HAS_SCALEUVBILINEARUP
if (filtering && dy < 65536) {
ScaleUVBilinearUp(src_width, src_height, clip_width, clip_height,

View File

@@ -458,6 +458,8 @@ TESTPLANARTOBP(I400, 2, 2, NV21, 2, 2)
TESTBIPLANARTOBP(NV21, 2, 2, NV12, 2, 2)
TESTBIPLANARTOBP(NV12, 2, 2, NV12Mirror, 2, 2)
TESTBIPLANARTOBP(NV12, 2, 2, NV24, 1, 1)
TESTBIPLANARTOBP(NV16, 2, 1, NV24, 1, 1)
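// In TESTBIPLANARTOBP the numeric arguments are the source and destination
// X/Y chroma subsampling factors, so (NV12, 2, 2, NV24, 1, 1) exercises the
// 2x2 UV upsample and (NV16, 2, 1, NV24, 1, 1) the horizontal-only case.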
#define TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF, \