From b4ddbaf549a1bf5572bf703fd2862d1eb7380c6a Mon Sep 17 00:00:00 2001 From: Justin Green Date: Thu, 3 Feb 2022 11:46:44 -0500 Subject: [PATCH] Add support for MM21. Add support for MM21 to NV12 and I420 conversion, and add SIMD optimizations for arm, aarch64, SSE2, and SSSE3 machines. Bug: libyuv:915, b/215425056 Change-Id: Iecb0c33287f35766a6169d4adf3b7397f1ba8b5d Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/3433269 Reviewed-by: Frank Barchard Commit-Queue: Justin Green --- README.chromium | 4 +- include/libyuv/convert.h | 28 ++++ include/libyuv/planar_functions.h | 12 ++ include/libyuv/row.h | 40 +++++ source/convert.cc | 54 +++++++ source/planar_functions.cc | 73 +++++++++- source/row_any.cc | 45 ++++++ source/row_common.cc | 24 +++ source/row_gcc.cc | 58 +++++++- source/row_neon.cc | 46 ++++++ source/row_neon64.cc | 23 +++ unit_test/convert_test.cc | 233 ++++++++++++++++-------------- unit_test/planar_test.cc | 101 +++++++++++++ 13 files changed, 628 insertions(+), 113 deletions(-) mode change 100755 => 100644 unit_test/planar_test.cc diff --git a/README.chromium b/README.chromium index b6a068144..5074954c5 100644 --- a/README.chromium +++ b/README.chromium @@ -1,8 +1,8 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1810 +Version: 1811 License: BSD License File: LICENSE Description: -libyuv is an open source project that includes YUV conversion and scaling functionality. \ No newline at end of file +libyuv is an open source project that includes YUV conversion and scaling functionality. diff --git a/include/libyuv/convert.h b/include/libyuv/convert.h index 93e7550be..bcdefb5df 100644 --- a/include/libyuv/convert.h +++ b/include/libyuv/convert.h @@ -106,6 +106,34 @@ int I422ToI444(const uint8_t* src_y, int width, int height); +// Convert MM21 to NV12. +LIBYUV_API +int MM21ToNV12(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height); + +// Convert MM21 to I420. +LIBYUV_API +int MM21ToI420(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + // Convert I422 to NV21. LIBYUV_API int I422ToNV21(const uint8_t* src_y, diff --git a/include/libyuv/planar_functions.h b/include/libyuv/planar_functions.h index 47f3446ac..1037d7320 100644 --- a/include/libyuv/planar_functions.h +++ b/include/libyuv/planar_functions.h @@ -93,6 +93,18 @@ void DetilePlane(const uint8_t* src_y, int height, int tile_height); +// Convert a UV plane of tiles of 16 x H into linear U and V planes. +LIBYUV_API +void DetileSplitUVPlane(const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height, + int tile_height); + // Split interleaved UV plane into separate U and V planes. 
LIBYUV_API void SplitUVPlane(const uint8_t* src_uv, diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 12e53dbe2..8cac886ee 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -290,6 +290,8 @@ extern "C" { #define HAS_AB64TOARGBROW_SSSE3 #define HAS_CONVERT16TO8ROW_SSSE3 #define HAS_CONVERT8TO16ROW_SSE2 +#define HAS_DETILEROW_SSE2 +#define HAS_DETILESPLITUVROW_SSSE3 #define HAS_HALFMERGEUVROW_SSSE3 #define HAS_I210TOAR30ROW_SSSE3 #define HAS_I210TOARGBROW_SSSE3 @@ -537,6 +539,7 @@ extern "C" { #define HAS_GAUSSROW_F32_NEON #define HAS_GAUSSCOL_F32_NEON #define HAS_DETILEROW_NEON +#define HAS_DETILESPLITUVROW_NEON #endif #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) #define HAS_ABGRTOUVROW_MSA @@ -1839,6 +1842,43 @@ void DetileRow_NEON(const uint8_t* src, ptrdiff_t src_tile_stride, uint8_t* dst, int width); +void DetileRow_Any_NEON(const uint8_t* src, + ptrdiff_t src_tile_stride, + uint8_t* dst, + int width); +void DetileRow_SSE2(const uint8_t* src, + ptrdiff_t src_tile_stride, + uint8_t* dst, + int width); +void DetileRow_Any_SSE2(const uint8_t* src, + ptrdiff_t src_tile_stride, + uint8_t* dst, + int width); +void DetileSplitUVRow_C(const uint8_t* src_uv, + ptrdiff_t src_tile_stride, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void DetileSplitUVRow_SSSE3(const uint8_t* src_uv, + ptrdiff_t src_tile_stride, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void DetileSplitUVRow_Any_SSSE3(const uint8_t* src_uv, + ptrdiff_t src_tile_stride, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void DetileSplitUVRow_NEON(const uint8_t* src_uv, + ptrdiff_t src_tile_stride, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void DetileSplitUVRow_Any_NEON(const uint8_t* src_uv, + ptrdiff_t src_tile_stride, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void MergeUVRow_C(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, diff --git a/source/convert.cc b/source/convert.cc index b54f88b7d..9c5e8aa8f 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -564,6 +564,60 @@ int I422ToNV21(const uint8_t* src_y, return 0; } +LIBYUV_API +int MM21ToNV12(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + if (!src_uv || !dst_uv || width <= 0) { + return -1; + } + + int sign = height < 0 ? -1 : 1; + + if (dst_y) { + DetilePlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height, 32); + } + DetilePlane(src_uv, src_stride_uv, dst_uv, dst_stride_uv, (width + 1) & ~1, + (height + sign) / 2, 16); + + return 0; +} + +LIBYUV_API +int MM21ToI420(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int sign = height < 0 ? -1 : 1; + + if (!src_uv || !dst_u || !dst_v || width <= 0) { + return -1; + } + + if (dst_y) { + DetilePlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height, 32); + } + DetileSplitUVPlane(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, + dst_stride_v, (width + 1) & ~1, (height + sign) / 2, 16); + + return 0; +} + #ifdef I422TONV21_ROW_VERSION // Unittest fails for this version. 
// 422 chroma is 1/2 width, 1x height
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index d7cb8dc77..f2f2d6951 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -882,9 +882,20 @@ void DetilePlane(const uint8_t* src_y,
     dst_stride_y = -dst_stride_y;
   }
 
+#if defined(HAS_DETILEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    DetileRow = DetileRow_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      DetileRow = DetileRow_SSE2;
+    }
+  }
+#endif
 #if defined(HAS_DETILEROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
-    DetileRow = DetileRow_NEON;
+  if (TestCpuFlag(kCpuHasNEON)) {
+    DetileRow = DetileRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      DetileRow = DetileRow_NEON;
+    }
   }
 #endif
 
@@ -900,6 +911,64 @@
   }
 }
 
+LIBYUV_API
+void DetileSplitUVPlane(const uint8_t* src_uv,
+                        int src_stride_uv,
+                        uint8_t* dst_u,
+                        int dst_stride_u,
+                        uint8_t* dst_v,
+                        int dst_stride_v,
+                        int width,
+                        int height,
+                        int tile_height) {
+  const ptrdiff_t src_tile_stride = 16 * tile_height;
+  int y;
+  void (*DetileSplitUVRow)(const uint8_t* src, ptrdiff_t src_tile_stride,
+                           uint8_t* dst_u, uint8_t* dst_v, int width) =
+      DetileSplitUVRow_C;
+  assert(src_stride_uv >= 0);
+  assert(tile_height > 0);
+  assert(src_stride_uv > 0);
+
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_u = dst_u + (height - 1) * dst_stride_u;
+    dst_stride_u = -dst_stride_u;
+    dst_v = dst_v + (height - 1) * dst_stride_v;
+    dst_stride_v = -dst_stride_v;
+  }
+
+#if defined(HAS_DETILESPLITUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    DetileSplitUVRow = DetileSplitUVRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      DetileSplitUVRow = DetileSplitUVRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_DETILESPLITUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    DetileSplitUVRow = DetileSplitUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      DetileSplitUVRow = DetileSplitUVRow_NEON;
+    }
+  }
+#endif
+
+  // Detile plane
+  for (y = 0; y < height; ++y) {
+    DetileSplitUVRow(src_uv, src_tile_stride, dst_u, dst_v, width);
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+    src_uv += 16;
+    // Advance to next row of tiles.
+    if ((y & (tile_height - 1)) == (tile_height - 1)) {
+      src_uv = src_uv - src_tile_stride + src_stride_uv * tile_height;
+    }
+  }
+}
+
 // Support function for NV12 etc RGB channels.
 // Width and height are plane sizes (typically half pixel width).
LIBYUV_API diff --git a/source/row_any.cc b/source/row_any.cc index e45f85366..755851283 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -2059,6 +2059,51 @@ ANY11S(AYUVToVURow_Any_NEON, AYUVToVURow_NEON, 0, 4, 15) #endif #undef ANY11S +#define ANYDETILE(NAMEANY, ANY_SIMD, MASK) \ + void NAMEANY(const uint8_t* src, ptrdiff_t src_tile_stride, uint8_t* dst, \ + int width) { \ + SIMD_ALIGNED(uint8_t temp[16 * 2]); \ + memset(temp, 0, 16); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src, src_tile_stride, dst, n); \ + } \ + memcpy(temp, src + (n / 16) * src_tile_stride, r); \ + ANY_SIMD(temp, src_tile_stride, temp + 16, MASK + 1); \ + memcpy(dst + n, temp + 16, r); \ + } + +#ifdef HAS_DETILEROW_NEON +ANYDETILE(DetileRow_Any_NEON, DetileRow_NEON, 15) +#endif +#ifdef HAS_DETILEROW_SSE2 +ANYDETILE(DetileRow_Any_SSE2, DetileRow_SSE2, 15) +#endif + +#define ANYDETILESPLITUV(NAMEANY, ANY_SIMD, MASK) \ + void NAMEANY(const uint8_t* src_uv, ptrdiff_t src_tile_stride, \ + uint8_t* dst_u, uint8_t* dst_v, int width) { \ + SIMD_ALIGNED(uint8_t temp[16 * 2]); \ + memset(temp, 0, 16 * 2); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_uv, src_tile_stride, dst_u, dst_v, n); \ + } \ + memcpy(temp, src_uv + (n / 16) * src_tile_stride, r); \ + ANY_SIMD(temp, src_tile_stride, temp + 16, temp + 24, r); \ + memcpy(dst_u + n / 2, temp + 16, (r + 1) / 2); \ + memcpy(dst_v + n / 2, temp + 24, (r + 1) / 2); \ + } + +#ifdef HAS_DETILESPLITUVROW_NEON +ANYDETILESPLITUV(DetileSplitUVRow_Any_NEON, DetileSplitUVRow_NEON, 15) +#endif +#ifdef HAS_DETILESPLITUVROW_SSSE3 +ANYDETILESPLITUV(DetileSplitUVRow_Any_SSSE3, DetileSplitUVRow_SSSE3, 15) +#endif + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/source/row_common.cc b/source/row_common.cc index 84b395b64..3e5f419dd 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -2674,6 +2674,30 @@ void DetileRow_C(const uint8_t* src, } } +void DetileSplitUVRow_C(const uint8_t* src_uv, + ptrdiff_t src_tile_stride, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int tile; + for (tile = 0; tile < width / 16; tile++) { + for (int x = 0; x < 8; x++) { + *dst_u++ = src_uv[0]; + *dst_v++ = src_uv[1]; + src_uv += 2; + } + src_uv += src_tile_stride - 16; + } + for (int x = 0; x < (width & 0xF) / 2; ++x) { + *dst_u = *src_uv; + dst_u++; + src_uv++; + *dst_v = *src_uv; + dst_v++; + src_uv++; + } +} + void SplitUVRow_C(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, diff --git a/source/row_gcc.cc b/source/row_gcc.cc index 5f7ab0c74..85376e4b0 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -9,7 +9,6 @@ */ #include "libyuv/row.h" - #ifdef __cplusplus namespace libyuv { extern "C" { @@ -4765,6 +4764,63 @@ void SplitUVRow_SSE2(const uint8_t* src_uv, } #endif // HAS_SPLITUVROW_SSE2 +#ifdef HAS_DETILEROW_SSE2 +void DetileRow_SSE2(const uint8_t* src, + ptrdiff_t src_tile_stride, + uint8_t* dst, + int width) { + asm volatile( + "1: \n" + "movdqu (%0),%%xmm0 \n" + "sub $0x10,%2 \n" + "lea (%0,%3),%0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"(src_tile_stride) // %3 + : "cc", "memory", "xmm0"); +} +#endif // HAS_DETILEROW_SSE2 + +#ifdef HAS_DETILESPLITUVROW_SSSE3 +// TODO(greenjustin): Look into generating these constants instead of loading +// them since this can cause branch mispredicts for fPIC code on 32-bit +// machines. 
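+// kDeinterlaceUV gathers the even-indexed (U) bytes of a 16-byte UV row into
+// the low 8 bytes of the vector and the odd-indexed (V) bytes into the high
+// 8 bytes, so a single pshufb deinterleaves one row of a UV tile.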
+static const uvec8 kDeinterlaceUV = {0, 2, 4, 6, 8, 10, 12, 14, + 1, 3, 5, 7, 9, 11, 13, 15}; + +// TODO(greenjustin): Research alternatives to pshufb, since pshufb can be very +// slow on older SSE2 processors. +void DetileSplitUVRow_SSSE3(const uint8_t* src_uv, + ptrdiff_t src_tile_stride, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "movdqu %4,%%xmm1 \n" + "1: \n" + "movdqu (%0),%%xmm0 \n" + "lea (%0, %5),%0 \n" + "pshufb %%xmm1,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "movhps %%xmm0,(%2) \n" + "lea 0x8(%2),%2 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "m"(kDeinterlaceUV), // %4 + "r"(src_tile_stride) // %5 + : "cc", "memory", "xmm0", "xmm1"); +} +#endif // HAS_DETILESPLITUVROW_SSSE3 + #ifdef HAS_MERGEUVROW_AVX2 void MergeUVRow_AVX2(const uint8_t* src_u, const uint8_t* src_v, diff --git a/source/row_neon.cc b/source/row_neon.cc index 543ebec9e..4781e2f6a 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -575,6 +575,52 @@ void SplitUVRow_NEON(const uint8_t* src_uv, ); } +// Reads 16 byte Y's from tile and writes out 16 Y's. +// MM21 Y tiles are 16x32 so src_tile_stride = 512 bytes +// MM21 UV tiles are 8x16 so src_tile_stride = 256 bytes +// width measured in bytes so 8 UV = 16. +void DetileRow_NEON(const uint8_t* src, + ptrdiff_t src_tile_stride, + uint8_t* dst, + int width) { + asm volatile( + "1: \n" + "vld1.16 {q0}, [%0], %3 \n" // load 16 bytes + "subs %2, %2, #16 \n" // 16 processed per loop + "pld [%0, 1792] \n" + "vst1.16 {q0}, [%1]! \n" // store 16 bytes + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"(src_tile_stride) // %3 + : "cc", "memory", "q0" // Clobber List + ); +} + +// Read 16 bytes of UV, detile, and write 8 bytes of U and 8 bytes of V. +void DetileSplitUVRow_NEON(const uint8_t* src_uv, + ptrdiff_t src_tile_stride, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "1: \n" + "vld2.8 {d0, d1}, [%0], %4 \n" + "subs %3, %3, #16 \n" + "pld [%0, 1792] \n" + "vst1.8 {d0}, [%1]! \n" + "vst1.8 {d1}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"(src_tile_stride) // %4 + : "cc", "memory", "d0", "d1" // Clobber List + ); +} + // Reads 16 U's and V's and writes out 16 pairs of UV. void MergeUVRow_NEON(const uint8_t* src_u, const uint8_t* src_v, diff --git a/source/row_neon64.cc b/source/row_neon64.cc index 1d1f9bb10..442e60cdc 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -627,6 +627,29 @@ void DetileRow_NEON(const uint8_t* src, ); } +// Read 16 bytes of UV, detile, and write 8 bytes of U and 8 bytes of V. +void DetileSplitUVRow_NEON(const uint8_t* src_uv, + ptrdiff_t src_tile_stride, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "1: \n" + "ld2 {v0.8b,v1.8b}, [%0], %4 \n" + "subs %w3, %w3, #16 \n" + "prfm pldl1keep, [%0, 1792] \n" + "st1 {v0.8b}, [%1], #8 \n" + "st1 {v1.8b}, [%2], #8 \n" + "b.gt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"(src_tile_stride) // %4 + : "cc", "memory", "v0", "v1" // Clobber List + ); +} + #if LIBYUV_USE_ST2 // Reads 16 U's and V's and writes out 16 pairs of UV. 
void MergeUVRow_NEON(const uint8_t* src_u,
diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc
index 741812656..20ac91c83 100644
--- a/unit_test/convert_test.cc
+++ b/unit_test/convert_test.cc
@@ -419,7 +419,7 @@ TESTPLANARTOBP(I212, uint16_t, 2, 2, 1, P212, uint16_t, 2, 2, 1, 12)
 #define TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
                           SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \
                           DST_SUBSAMP_X, DST_SUBSAMP_Y, W1280, N, NEG, OFF, \
-                          DOY, SRC_DEPTH) \
+                          DOY, SRC_DEPTH, TILE_WIDTH, TILE_HEIGHT) \
   TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
     static_assert(SRC_BPC == 1 || SRC_BPC == 2, "SRC BPC unsupported"); \
     static_assert(DST_BPC == 1 || DST_BPC == 2, "DST BPC unsupported"); \
@@ -433,13 +433,18 @@ TESTPLANARTOBP(I212, uint16_t, 2, 2, 1, P212, uint16_t, 2, 2, 1, 12)
                   "DST_SUBSAMP_Y unsupported"); \
     const int kWidth = W1280; \
     const int kHeight = benchmark_height_; \
-    const int kSrcHalfWidth = SUBSAMPLE(kWidth, SRC_SUBSAMP_X); \
-    const int kSrcHalfHeight = SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); \
+    const int kSrcHalfWidth = SUBSAMPLE(kWidth, SRC_SUBSAMP_X); \
     const int kDstHalfWidth = SUBSAMPLE(kWidth, DST_SUBSAMP_X); \
     const int kDstHalfHeight = SUBSAMPLE(kHeight, DST_SUBSAMP_Y); \
-    align_buffer_page_end(src_y, kWidth* kHeight* SRC_BPC + OFF); \
-    align_buffer_page_end(src_uv, \
-                          2 * kSrcHalfWidth * kSrcHalfHeight * SRC_BPC + OFF); \
+    const int kPaddedWidth = (kWidth + (TILE_WIDTH - 1)) & ~(TILE_WIDTH - 1); \
+    const int kPaddedHeight = \
+        (kHeight + (TILE_HEIGHT - 1)) & ~(TILE_HEIGHT - 1); \
+    const int kSrcHalfPaddedWidth = SUBSAMPLE(kPaddedWidth, SRC_SUBSAMP_X); \
+    const int kSrcHalfPaddedHeight = SUBSAMPLE(kPaddedHeight, SRC_SUBSAMP_Y); \
+    align_buffer_page_end(src_y, kPaddedWidth* kPaddedHeight* SRC_BPC + OFF); \
+    align_buffer_page_end( \
+        src_uv, \
+        2 * kSrcHalfPaddedWidth * kSrcHalfPaddedHeight * SRC_BPC + OFF); \
     align_buffer_page_end(dst_y_c, kWidth* kHeight* DST_BPC); \
     align_buffer_page_end(dst_uv_c, \
                           2 * kDstHalfWidth * kDstHalfHeight * DST_BPC); \
@@ -448,11 +453,11 @@ TESTPLANARTOBP(I212, uint16_t, 2, 2, 1, P212, uint16_t, 2, 2, 1, 12)
                           2 * kDstHalfWidth * kDstHalfHeight * DST_BPC); \
     SRC_T* src_y_p = reinterpret_cast<SRC_T*>(src_y + OFF); \
     SRC_T* src_uv_p = reinterpret_cast<SRC_T*>(src_uv + OFF); \
-    for (int i = 0; i < kWidth * kHeight; ++i) { \
+    for (int i = 0; i < kPaddedWidth * kPaddedHeight; ++i) { \
       src_y_p[i] = \
           (fastrand() & (((SRC_T)(-1)) << ((8 * SRC_BPC) - SRC_DEPTH))); \
     } \
-    for (int i = 0; i < kSrcHalfWidth * kSrcHalfHeight * 2; ++i) { \
+    for (int i = 0; i < kSrcHalfPaddedWidth * kSrcHalfPaddedHeight * 2; ++i) { \
       src_uv_p[i] = \
          (fastrand() & (((SRC_T)(-1)) << ((8 * SRC_BPC) - SRC_DEPTH))); \
     } \
@@ -497,136 +502,148 @@ TESTPLANARTOBP(I212, uint16_t, 2, 2, 1, P212, uint16_t, 2, 2, 1, 12)
 #define TESTBIPLANARTOBP(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
                          SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \
-                         DST_SUBSAMP_X, DST_SUBSAMP_Y, SRC_DEPTH) \
+                         DST_SUBSAMP_X, DST_SUBSAMP_Y, SRC_DEPTH, TILE_WIDTH, \
+                         TILE_HEIGHT) \
   TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
                     SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
                     DST_SUBSAMP_Y, benchmark_width_ + 1, _Any, +, 0, 1, \
-                    SRC_DEPTH) \
+                    SRC_DEPTH, TILE_WIDTH, TILE_HEIGHT) \
   TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
                     SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
                     DST_SUBSAMP_Y, benchmark_width_, _Unaligned, +, 2, 1, \
-                    SRC_DEPTH) \
+                    SRC_DEPTH, TILE_WIDTH, TILE_HEIGHT) \
   TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
                     SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
                     DST_SUBSAMP_Y, benchmark_width_, _Invert, -, 0, 1, \
-                    SRC_DEPTH) \
+                    SRC_DEPTH, TILE_WIDTH, TILE_HEIGHT) \
   TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
                     SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
-                    DST_SUBSAMP_Y, benchmark_width_, _Opt, +, 0, 1, SRC_DEPTH) \
+                    DST_SUBSAMP_Y, benchmark_width_, _Opt, +, 0, 1, SRC_DEPTH, \
+                    TILE_WIDTH, TILE_HEIGHT) \
   TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
                     SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
                     DST_SUBSAMP_Y, benchmark_width_, _NullY, +, 0, 0, \
-                    SRC_DEPTH)
+                    SRC_DEPTH, TILE_WIDTH, TILE_HEIGHT)
 
-TESTBIPLANARTOBP(NV21, uint8_t, 1, 2, 2, NV12, uint8_t, 1, 2, 2, 8)
-TESTBIPLANARTOBP(NV12, uint8_t, 1, 2, 2, NV12Mirror, uint8_t, 1, 2, 2, 8)
-TESTBIPLANARTOBP(NV12, uint8_t, 1, 2, 2, NV24, uint8_t, 1, 1, 1, 8)
-TESTBIPLANARTOBP(NV16, uint8_t, 1, 2, 1, NV24, uint8_t, 1, 1, 1, 8)
-TESTBIPLANARTOBP(P010, uint16_t, 2, 2, 2, P410, uint16_t, 2, 1, 1, 10)
-TESTBIPLANARTOBP(P210, uint16_t, 2, 2, 1, P410, uint16_t, 2, 1, 1, 10)
-TESTBIPLANARTOBP(P012, uint16_t, 2, 2, 2, P412, uint16_t, 2, 1, 1, 10)
-TESTBIPLANARTOBP(P212, uint16_t, 2, 2, 1, P412, uint16_t, 2, 1, 1, 12)
-TESTBIPLANARTOBP(P016, uint16_t, 2, 2, 2, P416, uint16_t, 2, 1, 1, 12)
-TESTBIPLANARTOBP(P216, uint16_t, 2, 2, 1, P416, uint16_t, 2, 1, 1, 12)
+TESTBIPLANARTOBP(NV21, uint8_t, 1, 2, 2, NV12, uint8_t, 1, 2, 2, 8, 1, 1)
+TESTBIPLANARTOBP(NV12, uint8_t, 1, 2, 2, NV12Mirror, uint8_t, 1, 2, 2, 8, 1, 1)
+TESTBIPLANARTOBP(NV12, uint8_t, 1, 2, 2, NV24, uint8_t, 1, 1, 1, 8, 1, 1)
+TESTBIPLANARTOBP(NV16, uint8_t, 1, 2, 1, NV24, uint8_t, 1, 1, 1, 8, 1, 1)
+TESTBIPLANARTOBP(P010, uint16_t, 2, 2, 2, P410, uint16_t, 2, 1, 1, 10, 1, 1)
+TESTBIPLANARTOBP(P210, uint16_t, 2, 2, 1, P410, uint16_t, 2, 1, 1, 10, 1, 1)
+TESTBIPLANARTOBP(P012, uint16_t, 2, 2, 2, P412, uint16_t, 2, 1, 1, 10, 1, 1)
+TESTBIPLANARTOBP(P212, uint16_t, 2, 2, 1, P412, uint16_t, 2, 1, 1, 12, 1, 1)
+TESTBIPLANARTOBP(P016, uint16_t, 2, 2, 2, P416, uint16_t, 2, 1, 1, 12, 1, 1)
+TESTBIPLANARTOBP(P216, uint16_t, 2, 2, 1, P416, uint16_t, 2, 1, 1, 12, 1, 1)
+TESTBIPLANARTOBP(MM21, uint8_t, 1, 2, 2, NV12, uint8_t, 1, 2, 2, 8, 16, 32)
 
-#define TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
-                         SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \
-                         DST_SUBSAMP_X, DST_SUBSAMP_Y, W1280, N, NEG, OFF, \
-                         SRC_DEPTH) \
-  TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
-    static_assert(SRC_BPC == 1 || SRC_BPC == 2, "SRC BPC unsupported"); \
-    static_assert(DST_BPC == 1 || DST_BPC == 2, "DST BPC unsupported"); \
-    static_assert(SRC_SUBSAMP_X == 1 || SRC_SUBSAMP_X == 2, \
-                  "SRC_SUBSAMP_X unsupported"); \
-    static_assert(SRC_SUBSAMP_Y == 1 || SRC_SUBSAMP_Y == 2, \
-                  "SRC_SUBSAMP_Y unsupported"); \
-    static_assert(DST_SUBSAMP_X == 1 || DST_SUBSAMP_X == 2, \
-                  "DST_SUBSAMP_X unsupported"); \
-    static_assert(DST_SUBSAMP_Y == 1 || DST_SUBSAMP_Y == 2, \
-                  "DST_SUBSAMP_Y unsupported"); \
-    const int kWidth = W1280; \
-    const int kHeight = benchmark_height_; \
-    const int kSrcHalfWidth = SUBSAMPLE(kWidth, SRC_SUBSAMP_X); \
-    const int kSrcHalfHeight = SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); \
-    const int kDstHalfWidth = SUBSAMPLE(kWidth, DST_SUBSAMP_X); \
-    const int kDstHalfHeight = SUBSAMPLE(kHeight, DST_SUBSAMP_Y); \
-    align_buffer_page_end(src_y, kWidth* kHeight* SRC_BPC + OFF); \
-    align_buffer_page_end(src_uv, \
-                          kSrcHalfWidth* kSrcHalfHeight* SRC_BPC * 2 + OFF); \
-    align_buffer_page_end(dst_y_c, kWidth* kHeight* DST_BPC); \
-    align_buffer_page_end(dst_u_c, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
-    align_buffer_page_end(dst_v_c, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
-    align_buffer_page_end(dst_y_opt, kWidth* kHeight* DST_BPC); \
-    align_buffer_page_end(dst_u_opt, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
-    align_buffer_page_end(dst_v_opt, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
-    SRC_T* src_y_p = reinterpret_cast<SRC_T*>(src_y + OFF); \
-    SRC_T* src_uv_p = reinterpret_cast<SRC_T*>(src_uv + OFF); \
-    for (int i = 0; i < kWidth * kHeight; ++i) { \
-      src_y_p[i] = \
-          (fastrand() & (((SRC_T)(-1)) << ((8 * SRC_BPC) - SRC_DEPTH))); \
-    } \
-    for (int i = 0; i < kSrcHalfWidth * kSrcHalfHeight * 2; ++i) { \
-      src_uv_p[i] = \
-          (fastrand() & (((SRC_T)(-1)) << ((8 * SRC_BPC) - SRC_DEPTH))); \
-    } \
-    memset(dst_y_c, 1, kWidth* kHeight* DST_BPC); \
-    memset(dst_u_c, 2, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
-    memset(dst_v_c, 3, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
-    memset(dst_y_opt, 101, kWidth* kHeight* DST_BPC); \
-    memset(dst_u_opt, 102, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
-    memset(dst_v_opt, 103, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
-    MaskCpuFlags(disable_cpu_flags_); \
-    SRC_FMT_PLANAR##To##FMT_PLANAR( \
-        src_y_p, kWidth, src_uv_p, kSrcHalfWidth * 2, \
-        reinterpret_cast<DST_T*>(dst_y_c), kWidth, \
-        reinterpret_cast<DST_T*>(dst_u_c), kDstHalfWidth, \
-        reinterpret_cast<DST_T*>(dst_v_c), kDstHalfWidth, kWidth, \
-        NEG kHeight); \
-    MaskCpuFlags(benchmark_cpu_info_); \
-    for (int i = 0; i < benchmark_iterations_; ++i) { \
-      SRC_FMT_PLANAR##To##FMT_PLANAR( \
-          src_y_p, kWidth, src_uv_p, kSrcHalfWidth * 2, \
-          reinterpret_cast<DST_T*>(dst_y_opt), kWidth, \
-          reinterpret_cast<DST_T*>(dst_u_opt), kDstHalfWidth, \
-          reinterpret_cast<DST_T*>(dst_v_opt), kDstHalfWidth, kWidth, \
-          NEG kHeight); \
-    } \
-    for (int i = 0; i < kHeight * kWidth * DST_BPC; ++i) { \
-      EXPECT_EQ(dst_y_c[i], dst_y_opt[i]); \
-    } \
-    for (int i = 0; i < kDstHalfWidth * kDstHalfHeight * DST_BPC; ++i) { \
-      EXPECT_EQ(dst_u_c[i], dst_u_opt[i]); \
-      EXPECT_EQ(dst_v_c[i], dst_v_opt[i]); \
-    } \
-    free_aligned_buffer_page_end(dst_y_c); \
-    free_aligned_buffer_page_end(dst_u_c); \
-    free_aligned_buffer_page_end(dst_v_c); \
-    free_aligned_buffer_page_end(dst_y_opt); \
-    free_aligned_buffer_page_end(dst_u_opt); \
-    free_aligned_buffer_page_end(dst_v_opt); \
-    free_aligned_buffer_page_end(src_y); \
-    free_aligned_buffer_page_end(src_uv); \
+#define TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
+                         SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \
+                         DST_SUBSAMP_X, DST_SUBSAMP_Y, W1280, N, NEG, OFF, \
+                         SRC_DEPTH, TILE_WIDTH, TILE_HEIGHT) \
+  TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
+    static_assert(SRC_BPC == 1 || SRC_BPC == 2, "SRC BPC unsupported"); \
+    static_assert(DST_BPC == 1 || DST_BPC == 2, "DST BPC unsupported"); \
+    static_assert(SRC_SUBSAMP_X == 1 || SRC_SUBSAMP_X == 2, \
+                  "SRC_SUBSAMP_X unsupported"); \
+    static_assert(SRC_SUBSAMP_Y == 1 || SRC_SUBSAMP_Y == 2, \
+                  "SRC_SUBSAMP_Y unsupported"); \
+    static_assert(DST_SUBSAMP_X == 1 || DST_SUBSAMP_X == 2, \
+                  "DST_SUBSAMP_X unsupported"); \
+    static_assert(DST_SUBSAMP_Y == 1 || DST_SUBSAMP_Y == 2, \
+                  "DST_SUBSAMP_Y unsupported"); \
+    const int kWidth = W1280; \
+    const int kHeight = benchmark_height_; \
+    const int kSrcHalfWidth = SUBSAMPLE(kWidth, SRC_SUBSAMP_X); \
+    const int kDstHalfWidth = SUBSAMPLE(kWidth, DST_SUBSAMP_X); \
+    const int kDstHalfHeight = SUBSAMPLE(kHeight, DST_SUBSAMP_Y); \
+    const int kPaddedWidth = (kWidth + (TILE_WIDTH - 1)) & ~(TILE_WIDTH - 1); \
+    const int kPaddedHeight = \
+        (kHeight + (TILE_HEIGHT - 1)) & ~(TILE_HEIGHT - 1); \
+    const int kSrcHalfPaddedWidth = SUBSAMPLE(kPaddedWidth, SRC_SUBSAMP_X); \
+    const int kSrcHalfPaddedHeight = SUBSAMPLE(kPaddedHeight, SRC_SUBSAMP_Y); \
+    align_buffer_page_end(src_y, kPaddedWidth* kPaddedHeight* SRC_BPC + OFF); \
+    align_buffer_page_end( \
+        src_uv, kSrcHalfPaddedWidth* kSrcHalfPaddedHeight* SRC_BPC * 2 + OFF); \
+    align_buffer_page_end(dst_y_c, kWidth* kHeight* DST_BPC); \
+    align_buffer_page_end(dst_u_c, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
+    align_buffer_page_end(dst_v_c, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
+    align_buffer_page_end(dst_y_opt, kWidth* kHeight* DST_BPC); \
+    align_buffer_page_end(dst_u_opt, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
+    align_buffer_page_end(dst_v_opt, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
+    SRC_T* src_y_p = reinterpret_cast<SRC_T*>(src_y + OFF); \
+    SRC_T* src_uv_p = reinterpret_cast<SRC_T*>(src_uv + OFF); \
+    for (int i = 0; i < kPaddedWidth * kPaddedHeight; ++i) { \
+      src_y_p[i] = \
+          (fastrand() & (((SRC_T)(-1)) << ((8 * SRC_BPC) - SRC_DEPTH))); \
+    } \
+    for (int i = 0; i < kSrcHalfPaddedWidth * kSrcHalfPaddedHeight * 2; ++i) { \
+      src_uv_p[i] = \
+          (fastrand() & (((SRC_T)(-1)) << ((8 * SRC_BPC) - SRC_DEPTH))); \
+    } \
+    memset(dst_y_c, 1, kWidth* kHeight* DST_BPC); \
+    memset(dst_u_c, 2, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
+    memset(dst_v_c, 3, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
+    memset(dst_y_opt, 101, kWidth* kHeight* DST_BPC); \
+    memset(dst_u_opt, 102, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
+    memset(dst_v_opt, 103, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
+    MaskCpuFlags(disable_cpu_flags_); \
+    SRC_FMT_PLANAR##To##FMT_PLANAR( \
+        src_y_p, kWidth, src_uv_p, kSrcHalfWidth * 2, \
+        reinterpret_cast<DST_T*>(dst_y_c), kWidth, \
+        reinterpret_cast<DST_T*>(dst_u_c), kDstHalfWidth, \
+        reinterpret_cast<DST_T*>(dst_v_c), kDstHalfWidth, kWidth, \
+        NEG kHeight); \
+    MaskCpuFlags(benchmark_cpu_info_); \
+    for (int i = 0; i < benchmark_iterations_; ++i) { \
+      SRC_FMT_PLANAR##To##FMT_PLANAR( \
+          src_y_p, kWidth, src_uv_p, kSrcHalfWidth * 2, \
+          reinterpret_cast<DST_T*>(dst_y_opt), kWidth, \
+          reinterpret_cast<DST_T*>(dst_u_opt), kDstHalfWidth, \
+          reinterpret_cast<DST_T*>(dst_v_opt), kDstHalfWidth, kWidth, \
+          NEG kHeight); \
+    } \
+    for (int i = 0; i < kHeight * kWidth * DST_BPC; ++i) { \
+      EXPECT_EQ(dst_y_c[i], dst_y_opt[i]); \
+    } \
+    for (int i = 0; i < kDstHalfWidth * kDstHalfHeight * DST_BPC; ++i) { \
+      EXPECT_EQ(dst_u_c[i], dst_u_opt[i]); \
+      EXPECT_EQ(dst_v_c[i], dst_v_opt[i]); \
+    } \
+    free_aligned_buffer_page_end(dst_y_c); \
+    free_aligned_buffer_page_end(dst_u_c); \
+    free_aligned_buffer_page_end(dst_v_c); \
+    free_aligned_buffer_page_end(dst_y_opt); \
+    free_aligned_buffer_page_end(dst_u_opt); \
+    free_aligned_buffer_page_end(dst_v_opt); \
+    free_aligned_buffer_page_end(src_y); \
+    free_aligned_buffer_page_end(src_uv); \
  }
 
 #define TESTBIPLANARTOP(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
                         SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \
-                        DST_SUBSAMP_X, DST_SUBSAMP_Y, SRC_DEPTH) \
+                        DST_SUBSAMP_X, DST_SUBSAMP_Y, SRC_DEPTH, TILE_WIDTH, \
+                        TILE_HEIGHT) \
   TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
                    SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
-                   DST_SUBSAMP_Y, benchmark_width_ + 1, _Any, +, 0, SRC_DEPTH) \
+                   DST_SUBSAMP_Y, benchmark_width_ + 1, _Any, +, 0, SRC_DEPTH, \
+                   TILE_WIDTH, TILE_HEIGHT) \
   TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
                   SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
                   DST_SUBSAMP_Y, benchmark_width_, _Unaligned, +, 2, \
-                  SRC_DEPTH) \
+                  SRC_DEPTH, TILE_WIDTH, TILE_HEIGHT) \
   TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
                    SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
-                   DST_SUBSAMP_Y, benchmark_width_, _Invert, -, 0, SRC_DEPTH) \
+                   DST_SUBSAMP_Y, benchmark_width_, _Invert, -, 0, SRC_DEPTH, \
+                   TILE_WIDTH, TILE_HEIGHT) \
   TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
                    SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
-                   DST_SUBSAMP_Y, benchmark_width_, _Opt, +, 0, SRC_DEPTH)
+                   DST_SUBSAMP_Y, benchmark_width_, _Opt, +, 0, SRC_DEPTH, \
+                   TILE_WIDTH, TILE_HEIGHT)
 
-TESTBIPLANARTOP(NV12, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2, 8)
-TESTBIPLANARTOP(NV21, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2, 8)
+TESTBIPLANARTOP(NV12, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2, 8, 1, 1)
+TESTBIPLANARTOP(NV21, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2, 8, 1, 1)
+TESTBIPLANARTOP(MM21, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2, 8, 16, 32)
 
 // Provide matrix wrappers for full range bt.709
 #define F420ToABGR(a, b, c, d, e, f, g, h, i, j) \
diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc
old mode 100755
new mode 100644
index bdbdb6a45..118259876
--- a/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
@@ -1523,6 +1523,107 @@ TEST_F(LibYUVPlanarTest, TestDetilePlane) {
   free_aligned_buffer_page_end(dst_opt);
 }
 
+TEST_F(LibYUVPlanarTest, TestDetileSplitUVPlane_Benchmark) {
+  int i, j;
+
+  // orig is tiled. Allocate enough memory for tiles.
+  int orig_width = (benchmark_width_ + 15) & ~15;
+  int orig_height = (benchmark_height_ + 15) & ~15;
+  int orig_plane_size = orig_width * orig_height;
+  int u_plane_size = benchmark_width_ * benchmark_height_;
+  int v_plane_size = u_plane_size;
+  align_buffer_page_end(orig_uv, orig_plane_size);
+  align_buffer_page_end(dst_u_c, u_plane_size);
+  align_buffer_page_end(dst_u_opt, u_plane_size);
+  align_buffer_page_end(dst_v_c, v_plane_size);
+  align_buffer_page_end(dst_v_opt, v_plane_size);
+
+  MemRandomize(orig_uv, orig_plane_size);
+  memset(dst_u_c, 0, u_plane_size);
+  memset(dst_u_opt, 0, u_plane_size);
+  memset(dst_v_c, 0, v_plane_size);
+  memset(dst_v_opt, 0, v_plane_size);
+
+  // Disable all optimizations.
+  MaskCpuFlags(disable_cpu_flags_);
+  for (j = 0; j < benchmark_iterations_; j++) {
+    DetileSplitUVPlane(orig_uv, orig_width, dst_u_c, (benchmark_width_ + 1) / 2,
+                       dst_v_c, (benchmark_width_ + 1) / 2, benchmark_width_,
+                       benchmark_height_, 16);
+  }
+
+  // Enable optimizations.
+  MaskCpuFlags(benchmark_cpu_info_);
+  for (j = 0; j < benchmark_iterations_; j++) {
+    DetileSplitUVPlane(
+        orig_uv, orig_width, dst_u_opt, (benchmark_width_ + 1) / 2, dst_v_opt,
+        (benchmark_width_ + 1) / 2, benchmark_width_, benchmark_height_, 16);
+  }
+
+  for (i = 0; i < u_plane_size; ++i) {
+    EXPECT_EQ(dst_u_c[i], dst_u_opt[i]);
+  }
+  for (i = 0; i < v_plane_size; ++i) {
+    EXPECT_EQ(dst_v_c[i], dst_v_opt[i]);
+  }
+
+  free_aligned_buffer_page_end(orig_uv);
+  free_aligned_buffer_page_end(dst_u_c);
+  free_aligned_buffer_page_end(dst_u_opt);
+  free_aligned_buffer_page_end(dst_v_c);
+  free_aligned_buffer_page_end(dst_v_opt);
+}
+
+TEST_F(LibYUVPlanarTest, TestDetileSplitUVPlane_Correctness) {
+  int i, j;
+
+  // orig is tiled. Allocate enough memory for tiles.
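+  // An MM21 UV tile is 16 bytes wide (8 UV pairs) and 16 rows tall, so round
+  // both dimensions up to a multiple of 16 to cover whole tiles.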
+  int orig_width = (benchmark_width_ + 15) & ~15;
+  int orig_height = (benchmark_height_ + 15) & ~15;
+  int orig_plane_size = orig_width * orig_height;
+  int u_plane_size = benchmark_width_ * benchmark_height_;
+  int v_plane_size = u_plane_size;
+  align_buffer_page_end(orig_uv, orig_plane_size);
+  align_buffer_page_end(detiled_uv, orig_plane_size);
+  align_buffer_page_end(dst_u_two_stage, u_plane_size);
+  align_buffer_page_end(dst_u_opt, u_plane_size);
+  align_buffer_page_end(dst_v_two_stage, v_plane_size);
+  align_buffer_page_end(dst_v_opt, v_plane_size);
+
+  MemRandomize(orig_uv, orig_plane_size);
+  memset(detiled_uv, 0, orig_plane_size);
+  memset(dst_u_two_stage, 0, u_plane_size);
+  memset(dst_u_opt, 0, u_plane_size);
+  memset(dst_v_two_stage, 0, v_plane_size);
+  memset(dst_v_opt, 0, v_plane_size);
+
+  for (j = 0; j < benchmark_iterations_; j++) {
+    DetileSplitUVPlane(
+        orig_uv, orig_width, dst_u_opt, (benchmark_width_ + 1) / 2, dst_v_opt,
+        (benchmark_width_ + 1) / 2, benchmark_width_, benchmark_height_, 16);
+  }
+
+  DetilePlane(orig_uv, orig_width, detiled_uv, benchmark_width_,
+              benchmark_width_, benchmark_height_, 16);
+  SplitUVPlane(detiled_uv, benchmark_width_, dst_u_two_stage,
+               (benchmark_width_ + 1) / 2, dst_v_two_stage,
+               (benchmark_width_ + 1) / 2, (benchmark_width_ + 1) / 2,
+               benchmark_height_);
+
+  for (i = 0; i < u_plane_size; ++i) {
+    EXPECT_EQ(dst_u_two_stage[i], dst_u_opt[i]);
+  }
+  for (i = 0; i < v_plane_size; ++i) {
+    EXPECT_EQ(dst_v_two_stage[i], dst_v_opt[i]);
+  }
+
+  free_aligned_buffer_page_end(orig_uv);
+  free_aligned_buffer_page_end(detiled_uv);
+  free_aligned_buffer_page_end(dst_u_two_stage);
+  free_aligned_buffer_page_end(dst_u_opt);
+  free_aligned_buffer_page_end(dst_v_two_stage);
+  free_aligned_buffer_page_end(dst_v_opt);
+}
+
 static int TestMultiply(int width, int height, int benchmark_iterations,