diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 96973da67..507509519 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -277,7 +277,7 @@ extern "C" {
 #if !defined(LIBYUV_DISABLE_X86) && \
     (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) && \
     (defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
-#define HAS_MERGEUV10ROW_AVX2
+#define HAS_MERGEUVROW_16_AVX2
 #endif
 
 // The following are available on Neon platforms:
@@ -1521,14 +1521,16 @@ void MergeRGBRow_Any_NEON(const uint8* src_r,
                           uint8* dst_rgb,
                           int width);
 
-void MergeUV10Row_C(const uint16* src_u,
-                    const uint16* src_v,
-                    uint16* dst_uv,
-                    int width);
-void MergeUV10Row_AVX2(const uint16* src_u,
-                       const uint16* src_v,
-                       uint16* dst_uv,
-                       int width);
+void MergeUVRow_16_C(const uint16* src_u,
+                     const uint16* src_v,
+                     uint16* dst_uv,
+                     int scale, /* 64 for 10 bit */
+                     int width);
+void MergeUVRow_16_AVX2(const uint16* src_u,
+                        const uint16* src_v,
+                        uint16* dst_uv,
+                        int scale,
+                        int width);
 
 void CopyRow_SSE2(const uint8* src, uint8* dst, int count);
 void CopyRow_AVX(const uint8* src, uint8* dst, int count);
diff --git a/source/row_common.cc b/source/row_common.cc
index c3294ece5..8612665e5 100644
--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -1798,21 +1798,22 @@ void MergeRGBRow_C(const uint8* src_r,
   }
 }
 
-void MergeUV10Row_C(const uint16* src_u,
-                    const uint16* src_v,
-                    uint16* dst_uv,
-                    int width) {
+void MergeUVRow_16_C(const uint16* src_u,
+                     const uint16* src_v,
+                     uint16* dst_uv,
+                     int scale,
+                     int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
-    dst_uv[0] = src_u[x] << 6;
-    dst_uv[1] = src_v[x] << 6;
-    dst_uv[2] = src_u[x + 1] << 6;
-    dst_uv[3] = src_v[x + 1] << 6;
+    dst_uv[0] = src_u[x] * scale;
+    dst_uv[1] = src_v[x] * scale;
+    dst_uv[2] = src_u[x + 1] * scale;
+    dst_uv[3] = src_v[x + 1] * scale;
     dst_uv += 4;
   }
   if (width & 1) {
-    dst_uv[0] = src_u[width - 1] << 6;
-    dst_uv[1] = src_v[width - 1] << 6;
+    dst_uv[0] = src_u[width - 1] * scale;
+    dst_uv[1] = src_v[width - 1] * scale;
   }
 }
 
diff --git a/source/row_gcc.cc b/source/row_gcc.cc
index ff2e8a378..ecb77983e 100644
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -2753,13 +2753,23 @@ void MergeUVRow_SSE2(const uint8* src_u,
 }
 #endif  // HAS_MERGEUVROW_SSE2
 
-#ifdef HAS_MERGEUV10ROW_AVX2
-void MergeUV10Row_AVX2(const uint16* src_u,
-                       const uint16* src_v,
-                       uint16* dst_uv,
-                       int width) {
+// Use scale to convert lsb formats to msb, depending how many bits there are:
+// 128 = 9 bits
+// 64 = 10 bits
+// 16 = 12 bits
+// 1 = 16 bits
+
+#ifdef HAS_MERGEUVROW_16_AVX2
+void MergeUVRow_16_AVX2(const uint16* src_u,
+                        const uint16* src_v,
+                        uint16* dst_uv,
+                        int scale,
+                        int width) {
   // clang-format off
   asm volatile (
+    "vmovd      %4,%%xmm3                      \n"
+    "vpunpcklwd %%xmm3,%%xmm3,%%xmm3           \n"
+    "vbroadcastss %%xmm3,%%ymm3                \n"
     "sub        %0,%1                          \n"
 
     // 16 pixels per loop.
@@ -2768,8 +2778,9 @@ void MergeUV10Row_AVX2(const uint16* src_u,
     "vmovdqu    (%0),%%ymm0                    \n"
     "vmovdqu    (%0,%1,1),%%ymm1               \n"
     "add        $0x20,%0                       \n"
-    "vpsllw     $0x6,%%ymm0,%%ymm0             \n"
-    "vpsllw     $0x6,%%ymm1,%%ymm1             \n"
+
+    "vpmullw    %%ymm3,%%ymm0,%%ymm0           \n"
+    "vpmullw    %%ymm3,%%ymm1,%%ymm1           \n"
     "vpunpcklwd %%ymm1,%%ymm0,%%ymm2           \n"  // mutates
     "vpunpckhwd %%ymm1,%%ymm0,%%ymm0           \n"
     "vextractf128 $0x0,%%ymm2,(%2)             \n"
@@ -2784,8 +2795,8 @@ void MergeUV10Row_AVX2(const uint16* src_u,
       "+r"(src_v),   // %1
       "+r"(dst_uv),  // %2
       "+r"(width)    // %3
-  :
-  : "memory", "cc", "xmm0", "xmm1", "xmm2");
+  : "r"(scale)       // %4
+  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
   // clang-format on
 }
 #endif  // HAS_MERGEUVROW_AVX2
diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc
index 1cbd13f8b..34414c9fc 100644
--- a/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
@@ -2618,8 +2618,8 @@ TEST_F(LibYUVPlanarTest, SplitRGBPlane_Opt) {
 }
 
 // TODO(fbarchard): improve test for platforms and cpu detect
-#ifdef HAS_MERGEUV10ROW_AVX2
-TEST_F(LibYUVPlanarTest, MergeUV10Row_Opt) {
+#ifdef HAS_MERGEUVROW_16_AVX2
+TEST_F(LibYUVPlanarTest, MergeUVRow_16_Opt) {
   const int kPixels = benchmark_width_ * benchmark_height_;
   align_buffer_page_end(src_pixels_u, kPixels * 2);
   align_buffer_page_end(src_pixels_v, kPixels * 2);
@@ -2631,20 +2631,22 @@ TEST_F(LibYUVPlanarTest, MergeUV10Row_Opt) {
   memset(dst_pixels_uv_opt, 0, kPixels * 2 * 2);
   memset(dst_pixels_uv_c, 1, kPixels * 2 * 2);
 
-  MergeUV10Row_C(reinterpret_cast<const uint16*>(src_pixels_u),
-                 reinterpret_cast<const uint16*>(src_pixels_v),
-                 reinterpret_cast<uint16*>(dst_pixels_uv_c), kPixels);
+  MergeUVRow_16_C(reinterpret_cast<const uint16*>(src_pixels_u),
+                  reinterpret_cast<const uint16*>(src_pixels_v),
+                  reinterpret_cast<uint16*>(dst_pixels_uv_c), 64, kPixels);
 
   int has_avx2 = TestCpuFlag(kCpuHasAVX2);
   for (int i = 0; i < benchmark_iterations_; ++i) {
     if (has_avx2) {
-      MergeUV10Row_AVX2(reinterpret_cast<const uint16*>(src_pixels_u),
-                        reinterpret_cast<const uint16*>(src_pixels_v),
-                        reinterpret_cast<uint16*>(dst_pixels_uv_opt), kPixels);
+      MergeUVRow_16_AVX2(reinterpret_cast<const uint16*>(src_pixels_u),
+                         reinterpret_cast<const uint16*>(src_pixels_v),
+                         reinterpret_cast<uint16*>(dst_pixels_uv_opt), 64,
+                         kPixels);
     } else {
-      MergeUV10Row_C(reinterpret_cast<const uint16*>(src_pixels_u),
-                     reinterpret_cast<const uint16*>(src_pixels_v),
-                     reinterpret_cast<uint16*>(dst_pixels_uv_opt), kPixels);
+      MergeUVRow_16_C(reinterpret_cast<const uint16*>(src_pixels_u),
+                      reinterpret_cast<const uint16*>(src_pixels_v),
+                      reinterpret_cast<uint16*>(dst_pixels_uv_opt), 64,
+                      kPixels);
     }
   }
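
Note on the new scale argument (an illustration, not part of the patch): the
old code shifted 10-bit lsb-aligned samples into msb position with a fixed
"<< 6"; the new code multiplies by a caller-supplied scale, so
scale = 65536 >> N covers any N-bit lsb-aligned format (128 for 9 bits, 64
for 10 bits, 16 for 12 bits, 1 for 16 bits, per the comment added in
row_gcc.cc). A minimal standalone sketch of the arithmetic, using standard
uint16_t in place of libyuv's uint16 typedef:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  const uint16_t sample = 0x03FF;   // max 10-bit value, lsb-aligned
  const int bits = 10;
  const int scale = 65536 >> bits;  // 64, matching "64 for 10 bit"
  const uint16_t msb = (uint16_t)(sample * scale);
  // Every supported scale is a power of two, so the multiply produces the
  // same result as the old fixed shift; the AVX2 path's vpmullw relies on
  // this same identity.
  assert(msb == (uint16_t)(sample << 6));
  printf("lsb 0x%04X -> msb 0x%04X (scale %d)\n", sample, msb, scale);
  return 0;
}

Because the scale is broadcast into ymm3 once before the loop (vmovd,
vpunpcklwd, vbroadcastss), the AVX2 path trades the immediate vpsllw for one
vpmullw per register with no extra per-pixel work, which is what lets a
single routine serve 9-, 10-, 12- and 16-bit inputs.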