From 62a961bee72e48e4fa14365bd7444c9280540b6f Mon Sep 17 00:00:00 2001
From: "fbarchard@google.com"
 <fbarchard@google.com@16f28f9a-4ce2-e073-06de-1de4eb20be90>
Date: Mon, 22 Oct 2012 17:24:50 +0000
Subject: [PATCH] Neon version of I420ToNV12 and I420ToNV21.  NV21ToI420 added
 as function.  CopyRow changed to vld4.8 to allow unaligned copy. BUG=none
 TEST=none Review URL: https://webrtc-codereview.appspot.com/922005

git-svn-id: http://libyuv.googlecode.com/svn/trunk@435 16f28f9a-4ce2-e073-06de-1de4eb20be90
---
 README.chromium               |  2 +-
 include/libyuv/convert.h      | 11 +++++-
 include/libyuv/convert_from.h | 17 +++++++++-
 include/libyuv/row.h          |  6 ++++
 include/libyuv/version.h      |  2 +-
 source/convert.cc             | 20 +++++++++--
 source/convert_from.cc        | 58 ++++++++++++++++++++++++++++++-
 source/planar_functions.cc    |  2 +-
 source/rotate.cc              |  2 +-
 source/rotate_argb.cc         |  2 +-
 source/row_common.cc          | 15 ++++++++
 source/row_neon.cc            | 39 ++++++++++++++++-----
 unit_test/convert_test.cc     | 64 +++++++++++++++++++----------------
 13 files changed, 193 insertions(+), 47 deletions(-)

diff --git a/README.chromium b/README.chromium
index 8ff9d4d56..2db963548 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 433
+Version: 435
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/convert.h b/include/libyuv/convert.h
index 2db1f14aa..e07bfd199 100644
--- a/include/libyuv/convert.h
+++ b/include/libyuv/convert.h
@@ -73,7 +73,7 @@ int I400ToI420(const uint8* src_y, int src_stride_y,
                uint8* dst_v, int dst_stride_v,
                int width, int height);
 
-// Convert NV12 to I420. Also used for NV21.
+// Convert NV12 to I420.
 LIBYUV_API
 int NV12ToI420(const uint8* src_y, int src_stride_y,
                const uint8* src_uv, int src_stride_uv,
@@ -82,6 +82,15 @@ int NV12ToI420(const uint8* src_y, int src_stride_y,
                uint8* dst_v, int dst_stride_v,
                int width, int height);
 
+// Convert NV21 to I420.
+LIBYUV_API
+int NV21ToI420(const uint8* src_y, int src_stride_y,
+               const uint8* src_vu, int src_stride_vu,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
 // Convert M420 to I420.
 LIBYUV_API
 int M420ToI420(const uint8* src_m420, int src_stride_m420,
diff --git a/include/libyuv/convert_from.h b/include/libyuv/convert_from.h
index 4eae950cc..44ff4d98f 100644
--- a/include/libyuv/convert_from.h
+++ b/include/libyuv/convert_from.h
@@ -56,10 +56,25 @@ int I400Copy(const uint8* src_y, int src_stride_y,
              uint8* dst_y, int dst_stride_y,
              int width, int height);
 
-// TODO(fbarchard): I420ToNV12
 // TODO(fbarchard): I420ToM420
 // TODO(fbarchard): I420ToQ420
 
+LIBYUV_API
+int I420ToNV12(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_uv, int dst_stride_uv,
+               int width, int height);
+
+LIBYUV_API
+int I420ToNV21(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_vu, int dst_stride_vu,
+               int width, int height);
+
 LIBYUV_API
 int I420ToYUY2(const uint8* src_y, int src_stride_y,
                const uint8* src_u, int src_stride_u,
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 94fd99720..89e35a608 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -168,6 +168,7 @@ extern "C" {
 #define HAS_ARGBTOARGB1555ROW_NEON
 #define HAS_ARGBTOARGB4444ROW_NEON
 #define HAS_ARGBTOYROW_NEON
+#define HAS_MERGEUV_NEON
 #endif
 
 // The following are available on Mips platforms
@@ -308,6 +309,11 @@ void SplitUV_Any_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
 void SplitUV_Any_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                             int pix);
 
+void MergeUV_C(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+               int width);
+void MergeUV_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                  int width);
+
 void CopyRow_SSE2(const uint8* src, uint8* dst, int count);
 void CopyRow_X86(const uint8* src, uint8* dst, int count);
 void CopyRow_NEON(const uint8* src, uint8* dst, int count);
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 8f1c42561..3a3ad4182 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 433
+#define LIBYUV_VERSION 435
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
diff --git a/source/convert.cc b/source/convert.cc
index 51198602d..57ad6139e 100644
--- a/source/convert.cc
+++ b/source/convert.cc
@@ -302,7 +302,7 @@ static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1,
                        int width, int height) {
   void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
 #if defined(HAS_COPYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 64)) {
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
     CopyRow = CopyRow_NEON;
   }
 #elif defined(HAS_COPYROW_X86)
@@ -460,6 +460,22 @@ int NV12ToI420(const uint8* src_y, int src_stride_y,
                     width, height);
 }
 
+// Convert NV21 to I420.  Same as NV12 but u and v pointers swapped.
+LIBYUV_API
+int NV21ToI420(const uint8* src_y, int src_stride_y,
+               const uint8* src_vu, int src_stride_vu,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  return X420ToI420(src_y, src_stride_y, src_stride_y,  // Y stride given twice
+                    src_vu, src_stride_vu,
+                    dst_y, dst_stride_y,
+                    dst_v, dst_stride_v,  // dst_v ahead of dst_u: the swap
+                    dst_u, dst_stride_u,
+                    width, height);
+}
+
 // Convert M420 to I420.
 LIBYUV_API
 int M420ToI420(const uint8* src_m420, int src_stride_m420,
@@ -503,7 +519,7 @@ int Q420ToI420(const uint8* src_y, int src_stride_y,
   // CopyRow for rows of just Y in Q420 copied to Y plane of I420.
   void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
 #if defined(HAS_COPYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 64)) {
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
     CopyRow = CopyRow_NEON;
   }
 #endif
diff --git a/source/convert_from.cc b/source/convert_from.cc
index 443c140b7..474fea56f 100644
--- a/source/convert_from.cc
+++ b/source/convert_from.cc
@@ -50,7 +50,7 @@ int I420ToI422(const uint8* src_y, int src_stride_y,
   int halfwidth = (width + 1) >> 1;
   void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
 #if defined(HAS_COPYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(halfwidth, 64)) {
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(halfwidth, 32)) {
     CopyRow = CopyRow_NEON;
   }
 #elif defined(HAS_COPYROW_X86)
@@ -477,6 +477,62 @@ int I420ToV210(const uint8* src_y, int src_stride_y,
   return 0;
 }
 
+LIBYUV_API
+int I420ToNV12(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_uv, int dst_stride_uv,
+               int width, int height) {
+  if (!src_y || !src_u || !src_v || !dst_y || !dst_uv ||
+      width <= 0 || height == 0) {
+    return -1;  // Reject NULL planes and empty dimensions.
+  }
+  // Negative height means invert the image: walk the destination bottom-up.
+  if (height < 0) {
+    height = -height;
+    int halfheight = (height + 1) >> 1;
+    dst_y = dst_y + (height - 1) * dst_stride_y;
+    dst_uv = dst_uv + (halfheight - 1) * dst_stride_uv;
+    dst_stride_y = -dst_stride_y;
+    dst_stride_uv = -dst_stride_uv;
+  }
+
+  int halfwidth = (width + 1) >> 1;  // Chroma width, rounded up.
+  void (*MergeUV)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                  int width) = MergeUV_C;
+#if defined(HAS_MERGEUV_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(halfwidth, 16)) {
+    MergeUV = MergeUV_NEON;
+  }
+#endif
+  CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  int halfheight = (height + 1) >> 1;  // Chroma height, rounded up.
+  for (int y = 0; y < halfheight; ++y) {
+    // Interleave a row of U and V using the selected row function.
+    MergeUV(src_u, src_v, dst_uv, halfwidth);
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+    dst_uv += dst_stride_uv;
+  }
+  return 0;
+}
+
+LIBYUV_API
+int I420ToNV21(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_vu, int dst_stride_vu,
+               int width, int height) {
+  return I420ToNV12(src_y, src_stride_y,
+                    src_v, src_stride_v,  // V fed as "U" so output is VU.
+                    src_u, src_stride_u,
+                    dst_y, dst_stride_y,
+                    dst_vu, dst_stride_vu,
+                    width, height);
+}
+
 // Convert I420 to ARGB.
 LIBYUV_API
 int I420ToARGB(const uint8* src_y, int src_stride_y,
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index 2ffee68f3..2d0366fbf 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -30,7 +30,7 @@ void CopyPlane(const uint8* src_y, int src_stride_y,
                int width, int height) {
   void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
 #if defined(HAS_COPYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 64)) {
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
     CopyRow = CopyRow_NEON;
   }
 #endif
diff --git a/source/rotate.cc b/source/rotate.cc
index 15ac961ac..8f9883f47 100644
--- a/source/rotate.cc
+++ b/source/rotate.cc
@@ -859,7 +859,7 @@ void RotatePlane180(const uint8* src, int src_stride,
 #endif
   void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
 #if defined(HAS_COPYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 64)) {
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
     CopyRow = CopyRow_NEON;
   }
 #endif
diff --git a/source/rotate_argb.cc b/source/rotate_argb.cc
index 9c9944674..7dcefa385 100644
--- a/source/rotate_argb.cc
+++ b/source/rotate_argb.cc
@@ -90,7 +90,7 @@ void ARGBRotate180(const uint8* src, int src_stride,
 #endif
   void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
 #if defined(HAS_COPYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width * 4, 64)) {
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width * 4, 32)) {
     CopyRow = CopyRow_NEON;
   }
 #endif
diff --git a/source/row_common.cc b/source/row_common.cc
index 1f54a07f6..cf826ea58 100644
--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -717,6 +717,21 @@ void SplitUV_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
   }
 }
 
+void MergeUV_C(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+               int width) {  // Interleaves width U and V bytes into dst_uv.
+  for (int x = 0; x < width - 1; x += 2) {  // Two UV pairs per iteration.
+    dst_uv[0] = src_u[x];
+    dst_uv[1] = src_v[x];
+    dst_uv[2] = src_u[x + 1];
+    dst_uv[3] = src_v[x + 1];
+    dst_uv += 4;
+  }
+  if (width & 1) {  // Odd width: emit the final UV pair.
+    dst_uv[0] = src_u[width - 1];
+    dst_uv[1] = src_v[width - 1];
+  }
+}
+
 void CopyRow_C(const uint8* src, uint8* dst, int count) {
   memcpy(dst, src, count);
 }
diff --git a/source/row_neon.cc b/source/row_neon.cc
index ca0cab5c3..2a4d6b3a1 100644
--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@@ -345,7 +345,7 @@ void SplitUV_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
     "vld2.u8    {q0, q1}, [%0:128]!            \n"  // load 16 pairs of UV
     "subs       %3, %3, #16                    \n"  // 16 processed per loop
     "vst1.u8    {q0}, [%1:128]!                \n"  // store U
-    "vst1.u8    {q1}, [%2:128]!                \n"  // Store V
+    "vst1.u8    {q1}, [%2:128]!                \n"  // store V
     "bgt        1b                             \n"
     : "+r"(src_uv),  // %0
       "+r"(dst_u),   // %1
@@ -355,6 +355,7 @@ void SplitUV_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
     : "memory", "cc", "q0", "q1"  // Clobber List
   );
 }
+
 // Reads 16 pairs of UV and write even values to dst_u and odd to dst_v
 // Alignment requirement: Multiple of 16 pixels, pointers unaligned.
 void SplitUV_Unaligned_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
@@ -365,7 +366,7 @@ void SplitUV_Unaligned_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
     "vld2.u8    {q0, q1}, [%0]!                \n"  // load 16 pairs of UV
     "subs       %3, %3, #16                    \n"  // 16 processed per loop
     "vst1.u8    {q0}, [%1]!                    \n"  // store U
-    "vst1.u8    {q1}, [%2]!                    \n"  // Store V
+    "vst1.u8    {q1}, [%2]!                    \n"  // store V
     "bgt        1b                             \n"
     : "+r"(src_uv),  // %0
       "+r"(dst_u),   // %1
@@ -377,21 +378,43 @@ void SplitUV_Unaligned_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
 }
 #endif  // HAS_SPLITUV_NEON
 
+#ifdef HAS_MERGEUV_NEON
+// Reads 16 U's and V's and writes out 16 pairs of UV.
+// Alignment requirement: Multiple of 16 pixels, pointers unaligned.
+void MergeUV_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                  int width) {
+  asm volatile (
+    ".p2align  2                               \n"
+  "1:                                          \n"
+    "vld1.u8    {q0}, [%0]!                    \n"  // load U
+    "vld1.u8    {q1}, [%1]!                    \n"  // load V
+    "subs       %3, %3, #16                    \n"  // 16 processed per loop
+    "vst2.u8    {q0, q1}, [%2]!                \n"  // store 16 pairs of UV
+    "bgt        1b                             \n"
+    : "+r"(src_u),   // %0
+      "+r"(src_v),   // %1
+      "+r"(dst_uv),  // %2
+      "+r"(width)    // %3  // Output registers
+    :                       // Input registers
+    : "memory", "cc", "q0", "q1"  // Clobber List
+  );
+}
+#endif  // HAS_MERGEUV_NEON
 #ifdef HAS_COPYROW_NEON
-// Copy multiple of 64
+// Copy multiple of 32.  vld4.u8 allow unaligned and is fastest on a15.
 void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
   asm volatile (
     ".p2align  2                               \n"
   "1:                                          \n"
-    "vldm       %0!, {q0, q1, q2, q3}          \n"  // load 64
-    "subs       %2, %2, #64                    \n"  // 64 processed per loop
-    "vstm       %1!, {q0, q1, q2, q3}          \n"  // store 64
+    "vld4.u8    {d0, d1, d2, d3}, [%0]!        \n"  // load 32
+    "subs       %2, %2, #32                    \n"  // 32 processed per loop
+    "vst4.u8    {d0, d1, d2, d3}, [%1]!        \n"  // store 32
     "bgt        1b                             \n"
     : "+r"(src),   // %0
       "+r"(dst),   // %1
       "+r"(count)  // %2  // Output registers
     :                     // Input registers
-    : "memory", "cc", "q0", "q1", "q2", "q3"  // Clobber List
+    : "memory", "cc", "q0", "q1"  // Clobber List
   );
 }
 #endif  // HAS_COPYROW_NEON
@@ -403,7 +426,7 @@ void SetRow8_NEON(uint8* dst, uint32 v32, int count) {
     "vdup.u32  q0, %2                          \n"  // duplicate 4 ints
     "1:                                        \n"
     "subs      %1, %1, #16                     \n"  // 16 bytes per loop
-    "vst1.u32  {q0}, [%0]!                     \n"  // store
+    "vst1.u8   {q0}, [%0]!                     \n"  // store
     "bgt       1b                              \n"
     : "+r"(dst),   // %0
       "+r"(count)  // %1
diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc
index deda59687..c81a7ab3b 100644
--- a/unit_test/convert_test.cc
+++ b/unit_test/convert_test.cc
@@ -31,13 +31,15 @@
 namespace libyuv {
 
 #define TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y,           \
-                       FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG)        \
+                       FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF)   \
 TEST_F(libyuvTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) {                        \
   const int kWidth = W1280;                                                    \
   const int kHeight = 720;                                                     \
-  align_buffer_16(src_y, kWidth * kHeight);                                    \
-  align_buffer_16(src_u, kWidth / SRC_SUBSAMP_X * kHeight / SRC_SUBSAMP_Y);    \
-  align_buffer_16(src_v, kWidth / SRC_SUBSAMP_X * kHeight / SRC_SUBSAMP_Y);    \
+  align_buffer_16(src_y, kWidth * kHeight + OFF);                              \
+  align_buffer_16(src_u,                                                       \
+                  kWidth / SRC_SUBSAMP_X * kHeight / SRC_SUBSAMP_Y + OFF);     \
+  align_buffer_16(src_v,                                                       \
+                  kWidth / SRC_SUBSAMP_X * kHeight / SRC_SUBSAMP_Y + OFF);     \
   align_buffer_16(dst_y_c, kWidth * kHeight);                                  \
   align_buffer_16(dst_u_c, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y);          \
   align_buffer_16(dst_v_c, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y);          \
@@ -47,26 +49,26 @@ TEST_F(libyuvTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) {                        \
   srandom(time(NULL));                                                         \
   for (int i = 0; i < kHeight; ++i)                                            \
     for (int j = 0; j < kWidth; ++j)                                           \
-      src_y[(i * kWidth) + j] = (random() & 0xff);                             \
+      src_y[(i * kWidth) + j + OFF] = (random() & 0xff);                       \
   for (int i = 0; i < kHeight / SRC_SUBSAMP_Y; ++i) {                          \
     for (int j = 0; j < kWidth / SRC_SUBSAMP_X; ++j) {                         \
-      src_u[(i * kWidth / SRC_SUBSAMP_X) + j] = (random() & 0xff);             \
-      src_v[(i * kWidth / SRC_SUBSAMP_X) + j] = (random() & 0xff);             \
+      src_u[(i * kWidth / SRC_SUBSAMP_X) + j + OFF] = (random() & 0xff);       \
+      src_v[(i * kWidth / SRC_SUBSAMP_X) + j + OFF] = (random() & 0xff);       \
     }                                                                          \
   }                                                                            \
   MaskCpuFlags(0);                                                             \
-  SRC_FMT_PLANAR##To##FMT_PLANAR(src_y, kWidth,                                \
-                                 src_u, kWidth / SRC_SUBSAMP_X,                \
-                                 src_v, kWidth / SRC_SUBSAMP_X,                \
+  SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth,                          \
+                                 src_u + OFF, kWidth / SRC_SUBSAMP_X,          \
+                                 src_v + OFF, kWidth / SRC_SUBSAMP_X,          \
                                  dst_y_c, kWidth,                              \
                                  dst_u_c, kWidth / SUBSAMP_X,                  \
                                  dst_v_c, kWidth / SUBSAMP_X,                  \
                                  kWidth, NEG kHeight);                         \
   MaskCpuFlags(-1);                                                            \
   for (int i = 0; i < benchmark_iterations_; ++i) {                            \
-    SRC_FMT_PLANAR##To##FMT_PLANAR(src_y, kWidth,                              \
-                                   src_u, kWidth / SRC_SUBSAMP_X,              \
-                                   src_v, kWidth / SRC_SUBSAMP_X,              \
+    SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth,                        \
+                                   src_u + OFF, kWidth / SRC_SUBSAMP_X,        \
+                                   src_v + OFF, kWidth / SRC_SUBSAMP_X,        \
                                    dst_y_opt, kWidth,                          \
                                    dst_u_opt, kWidth / SUBSAMP_X,              \
                                    dst_v_opt, kWidth / SUBSAMP_X,              \
@@ -120,11 +122,13 @@ TEST_F(libyuvTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) {                        \
 #define TESTPLANARTOP(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y,            \
                       FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y)                        \
     TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y,               \
-                   FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, 1280, _Opt, +)            \
+                   FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, 1280, _Opt, +, 0)         \
     TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y,               \
-                   FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, 1280, _Invert, -)         \
+                   FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, 1280, _Unaligned, +, 1)   \
     TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y,               \
-                   FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, 1276, _Any, +)
+                   FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, 1280, _Invert, -, 0)      \
+    TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y,               \
+                   FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, 1276, _Any, +, 0)
 
 TESTPLANARTOP(I420, 2, 2, I420, 2, 2)
 TESTPLANARTOP(I422, 2, 1, I420, 2, 2)
@@ -137,13 +141,13 @@ TESTPLANARTOP(I420, 2, 2, I420Mirror, 2, 2)
 
 
 #define TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y,         \
-                         FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG)      \
+                         FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF) \
 TEST_F(libyuvTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) {                        \
   const int kWidth = W1280;                                                    \
   const int kHeight = 720;                                                     \
-  align_buffer_16(src_y, kWidth * kHeight);                                    \
+  align_buffer_16(src_y, kWidth * kHeight + OFF);                              \
   align_buffer_16(src_uv, 2 * kWidth / SRC_SUBSAMP_X *                         \
-                  kHeight / SRC_SUBSAMP_Y);                                    \
+                  kHeight / SRC_SUBSAMP_Y + OFF);                              \
   align_buffer_16(dst_y_c, kWidth * kHeight);                                  \
   align_buffer_16(dst_u_c, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y);          \
   align_buffer_16(dst_v_c, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y);          \
@@ -153,23 +157,23 @@ TEST_F(libyuvTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) {                        \
   srandom(time(NULL));                                                         \
   for (int i = 0; i < kHeight; ++i)                                            \
     for (int j = 0; j < kWidth; ++j)                                           \
-      src_y[(i * kWidth) + j] = (random() & 0xff);                             \
+      src_y[(i * kWidth) + j + OFF] = (random() & 0xff);                       \
   for (int i = 0; i < kHeight / SRC_SUBSAMP_Y; ++i) {                          \
     for (int j = 0; j < 2 * kWidth / SRC_SUBSAMP_X; ++j) {                     \
-      src_uv[(i * 2 * kWidth / SRC_SUBSAMP_X) + j] = (random() & 0xff);        \
+      src_uv[(i * 2 * kWidth / SRC_SUBSAMP_X) + j + OFF] = (random() & 0xff);  \
     }                                                                          \
   }                                                                            \
   MaskCpuFlags(0);                                                             \
-  SRC_FMT_PLANAR##To##FMT_PLANAR(src_y, kWidth,                                \
-                                 src_uv, 2 * kWidth / SRC_SUBSAMP_X,           \
+  SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth,                          \
+                                 src_uv + OFF, 2 * kWidth / SRC_SUBSAMP_X,     \
                                  dst_y_c, kWidth,                              \
                                  dst_u_c, kWidth / SUBSAMP_X,                  \
                                  dst_v_c, kWidth / SUBSAMP_X,                  \
                                  kWidth, NEG kHeight);                         \
   MaskCpuFlags(-1);                                                            \
   for (int i = 0; i < benchmark_iterations_; ++i) {                            \
-    SRC_FMT_PLANAR##To##FMT_PLANAR(src_y, kWidth,                              \
-                                   src_uv, 2 * kWidth / SRC_SUBSAMP_X,         \
+    SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth,                        \
+                                   src_uv + OFF, 2 * kWidth / SRC_SUBSAMP_X,   \
                                    dst_y_opt, kWidth,                          \
                                    dst_u_opt, kWidth / SUBSAMP_X,              \
                                    dst_v_opt, kWidth / SUBSAMP_X,              \
@@ -222,14 +226,16 @@ TEST_F(libyuvTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) {                        \
 #define TESTBIPLANARTOP(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y,          \
                         FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y)                      \
     TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y,             \
-                     FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, 1280, _Opt, +)          \
+                     FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, 1280, _Opt, +, 0)       \
     TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y,             \
-                     FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, 1280, _Invert, -)       \
+                     FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, 1280, _Unaligned, +, 1) \
     TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y,             \
-                     FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, 1276, _Any, +)
+                     FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, 1280, _Invert, -, 0)    \
+    TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y,             \
+                     FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, 1276, _Any, +, 0)
 
 TESTBIPLANARTOP(NV12, 2, 2, I420, 2, 2)
-
+TESTBIPLANARTOP(NV21, 2, 2, I420, 2, 2)
 
 #define TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN,  \
                        W1280, DIFF, N, NEG)                                    \