From e91bdaca3674830570cbb2aaab6d5c939f56dee4 Mon Sep 17 00:00:00 2001
From: "fbarchard@google.com" <fbarchard@google.com@16f28f9a-4ce2-e073-06de-1de4eb20be90>
Date: Tue, 9 Oct 2012 21:09:33 +0000
Subject: [PATCH] Move HalfRow to row_win and port to row_neon

BUG=118
TEST=libyuvTest.I420ToI422_OptVsC (247 ms)
Review URL: https://webrtc-codereview.appspot.com/855012

git-svn-id: http://libyuv.googlecode.com/svn/trunk@400 16f28f9a-4ce2-e073-06de-1de4eb20be90
---
 include/libyuv/row.h      |  12 ++++-
 source/convert.cc         |  79 +++++------------------------
 source/row_common.cc      |   7 +++
 source/row_neon.cc        |  21 ++++++++
 source/row_posix.cc       |  97 ++++++++++++++++++++++++++++++++++++
 source/row_win.cc         |  24 +++++++++
 unit_test/compare_test.cc |  28 ++++++++---
 unit_test/planar_test.cc  | 101 ++++++++++++++++++++++++++++++++++++++
 8 files changed, 292 insertions(+), 77 deletions(-)

diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index ebbc4572c..4c16269bb 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -53,11 +53,13 @@ extern "C" {
 #define HAS_BGRATOYROW_SSSE3
 #define HAS_COPYROW_SSE2
 #define HAS_COPYROW_X86
+#define HAS_HALFROW_SSE2
 #define HAS_I400TOARGBROW_SSE2
 #define HAS_I411TOARGBROW_SSSE3
 #define HAS_I422TOABGRROW_SSSE3
 #define HAS_I422TOARGBROW_SSSE3
 #define HAS_I422TOBGRAROW_SSSE3
+#define HAS_I422TORGBAROW_SSSE3
 #define HAS_I444TOARGBROW_SSSE3
 #define HAS_MIRRORROW_SSSE3
 #define HAS_MIRRORROWUV_SSSE3
@@ -96,7 +98,6 @@ extern "C" {
 #if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
 #define HAS_ABGRTOARGBROW_SSSE3
 #define HAS_ARGBCOLORTABLEROW_X86
-#define HAS_I422TORGBAROW_SSSE3
 #define HAS_RGBATOARGBROW_SSSE3
 #define HAS_RGBATOUVROW_SSSE3
 #define HAS_RGBATOYROW_SSSE3
@@ -116,6 +117,7 @@ extern "C" {
 // The following are available on Neon platforms
 #if !defined(YUV_DISABLE_ASM) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
 #define HAS_COPYROW_NEON
+#define HAS_HALFROW_NEON
 #define HAS_I422TOABGRROW_NEON
 #define HAS_I422TOARGBROW_NEON
 #define HAS_I422TOBGRAROW_NEON
@@ -750,6 +752,14 @@ void ARGBInterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                               ptrdiff_t src_stride, int dst_width,
                               int source_y_fraction);
 
+void HalfRow_C(const uint8* src_uv, int src_uv_stride,
+               uint8* dst_uv, int pix);
+void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
+                  uint8* dst_uv, int pix);
+void HalfRow_NEON(const uint8* src_uv, int src_uv_stride,
+                  uint8* dst_uv, int pix);
+
+
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
diff --git a/source/convert.cc b/source/convert.cc
index 0882c92ba..0f21b03fa 100644
--- a/source/convert.cc
+++ b/source/convert.cc
@@ -62,66 +62,6 @@ int I420Copy(const uint8* src_y, int src_stride_y,
   return 0;
 }
 
-// Move to row_win etc.
-#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
-#define HAS_HALFROW_SSE2
-__declspec(naked) __declspec(align(16))
-static void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
-                         uint8* dst_uv, int pix) {
-  __asm {
-    push       edi
-    mov        eax, [esp + 4 + 4]    // src_uv
-    mov        edx, [esp + 4 + 8]    // src_uv_stride
-    mov        edi, [esp + 4 + 12]   // dst_v
-    mov        ecx, [esp + 4 + 16]   // pix
-    sub        edi, eax
-
-    align      16
-  convertloop:
-    movdqa     xmm0, [eax]
-    pavgb      xmm0, [eax + edx]
-    sub        ecx, 16
-    movdqa     [eax + edi], xmm0
-    lea        eax, [eax + 16]
-    jg         convertloop
-    pop        edi
-    ret
-  }
-}
-
-#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
-#define HAS_HALFROW_SSE2
-static void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
-                         uint8* dst_uv, int pix) {
-  asm volatile (
-    "sub        %0,%1                          \n"
-    ".p2align  4                               \n"
-"1:                                            \n"
-    "movdqa     (%0),%%xmm0                    \n"
-    "pavgb      (%0,%3),%%xmm0                 \n"
-    "sub        $0x10,%2                       \n"
-    "movdqa     %%xmm0,(%0,%1)                 \n"
-    "lea        0x10(%0),%0                    \n"
-    "jg         1b                             \n"
-  : "+r"(src_uv),  // %0
-    "+r"(dst_uv),  // %1
-    "+r"(pix)      // %2
-  : "r"(static_cast<intptr_t>(src_uv_stride))  // %3
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0"
-#endif
-);
-}
-#endif
-
-static void HalfRow_C(const uint8* src_uv, int src_uv_stride,
-                      uint8* dst_uv, int pix) {
-  for (int x = 0; x < pix; ++x) {
-    dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1;
-  }
-}
-
 LIBYUV_API
 int I422ToI420(const uint8* src_y, int src_stride_y,
                const uint8* src_u, int src_stride_u,
@@ -149,14 +89,17 @@ int I422ToI420(const uint8* src_y, int src_stride_y,
   void (*HalfRow)(const uint8* src_uv, int src_uv_stride,
                   uint8* dst_uv, int pix) = HalfRow_C;
 #if defined(HAS_HALFROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      IS_ALIGNED(halfwidth, 16) &&
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(halfwidth, 16) &&
       IS_ALIGNED(src_u, 16) && IS_ALIGNED(src_stride_u, 16) &&
       IS_ALIGNED(src_v, 16) && IS_ALIGNED(src_stride_v, 16) &&
       IS_ALIGNED(dst_u, 16) && IS_ALIGNED(dst_stride_u, 16) &&
       IS_ALIGNED(dst_v, 16) && IS_ALIGNED(dst_stride_v, 16)) {
     HalfRow = HalfRow_SSE2;
   }
+#elif defined(HAS_HALFROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(halfwidth, 16)) {
+    HalfRow = HalfRow_NEON;
+  }
 #endif
 
   // Copy Y plane
@@ -296,12 +239,12 @@ int I411ToI420(const uint8* src_y, int src_stride_y,
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_y = dst_y + (height - 1) * dst_stride_y;
-    dst_u = dst_u + (height - 1) * dst_stride_u;
-    dst_v = dst_v + (height - 1) * dst_stride_v;
-    dst_stride_y = -dst_stride_y;
-    dst_stride_u = -dst_stride_u;
-    dst_stride_v = -dst_stride_v;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (height - 1) * src_stride_u;
+    src_v = src_v + (height - 1) * src_stride_v;
+    src_stride_y = -src_stride_y;
+    src_stride_u = -src_stride_u;
+    src_stride_v = -src_stride_v;
   }
 
   // Copy Y plane
diff --git a/source/row_common.cc b/source/row_common.cc
index e0e426cd8..83c0d697b 100644
--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -1240,6 +1240,13 @@ void ARGBInterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr,
   } while (dst_ptr < end);
 }
 
+void HalfRow_C(const uint8* src_uv, int src_uv_stride,
+               uint8* dst_uv, int pix) {
+  for (int x = 0; x < pix; ++x) {
+    dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1;
+  }
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
diff --git a/source/row_neon.cc b/source/row_neon.cc
index 19a783305..200538de5 100644
--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@@ -821,6 +821,27 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
 }
 #endif  // HAS_UYVYTOYROW_NEON
 
+void HalfRow_NEON(const uint8* src_uv, int src_uv_stride,
+                  uint8* dst_uv, int pix) {
+  asm volatile (
+    // change the stride to row 2 pointer
+    "add        %1, %0                         \n"
+  "1:                                          \n"
+    "vld1.u8    {q0}, [%0]!                    \n"  // load row 1 16 pixels.
+    "subs       %3, %3, #16                    \n"  // 16 processed per loop
+    "vld1.u8    {q1}, [%1]!                    \n"  // load row 2 16 pixels.
+    "vrhadd.u8  q0, q1                         \n"  // average row 1 and 2
+    "vst1.u8    {q0}, [%2]!                    \n"
+    "bgt        1b                             \n"
+    : "+r"(src_uv),         // %0
+      "+r"(src_uv_stride),  // %1
+      "+r"(dst_uv),         // %2
+      "+r"(pix)             // %3
+    :
+    : "memory", "cc", "q0", "q1"  // Clobber List
+  );
+}
+
 #endif  // __ARM_NEON__
 
 #ifdef __cplusplus
diff --git a/source/row_posix.cc b/source/row_posix.cc
index 33149dada..74783d370 100644
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -1816,6 +1816,43 @@ void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
   );
 }
 
+void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
+                                const uint8* u_buf,
+                                const uint8* v_buf,
+                                uint8* rgba_buf,
+                                int width) {
+  asm volatile (
+    "sub        %[u_buf],%[v_buf]              \n"
+    "pcmpeqb    %%xmm5,%%xmm5                  \n"
+    "pxor       %%xmm4,%%xmm4                  \n"
+    ".p2align  4                               \n"
+  "1:                                          \n"
+    READYUV422
+    YUVTORGB
+    "pcmpeqb    %%xmm5,%%xmm5                  \n"
+    "punpcklbw  %%xmm2,%%xmm1                  \n"
+    "punpcklbw  %%xmm0,%%xmm5                  \n"
+    "movdqa     %%xmm5,%%xmm0                  \n"
+    "punpcklwd  %%xmm1,%%xmm5                  \n"
+    "punpckhwd  %%xmm1,%%xmm0                  \n"
+    "movdqa     %%xmm5,(%[argb_buf])           \n"
+    "movdqa     %%xmm0,0x10(%[argb_buf])       \n"
+    "lea        0x20(%[argb_buf]),%[argb_buf]  \n"
+    "sub        $0x8,%[width]                  \n"
+    "jg         1b                             \n"
+  : [y_buf]"+r"(y_buf),        // %[y_buf]
+    [u_buf]"+r"(u_buf),        // %[u_buf]
+    [v_buf]"+r"(v_buf),        // %[v_buf]
+    [argb_buf]"+r"(rgba_buf),  // %[argb_buf]
+    [width]"+rm"(width)        // %[width]
+  : [kYuvConstants]"r"(&kYuvConstants.kUVToB)  // %[kYuvConstants]
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+  );
+}
+
 void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
                                           const uint8* u_buf,
                                           const uint8* v_buf,
@@ -1888,6 +1925,44 @@ void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
 #endif
   );
 }
+
+void OMITFP I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
+                                          const uint8* u_buf,
+                                          const uint8* v_buf,
+                                          uint8* rgba_buf,
+                                          int width) {
+  asm volatile (
+    "sub        %[u_buf],%[v_buf]              \n"
+    "pcmpeqb    %%xmm5,%%xmm5                  \n"
+    "pxor       %%xmm4,%%xmm4                  \n"
+    ".p2align  4                               \n"
+  "1:                                          \n"
+    READYUV422
+    YUVTORGB
+    "pcmpeqb    %%xmm5,%%xmm5                  \n"
+    "punpcklbw  %%xmm2,%%xmm1                  \n"
+    "punpcklbw  %%xmm0,%%xmm5                  \n"
+    "movdqa     %%xmm5,%%xmm0                  \n"
+    "punpcklwd  %%xmm1,%%xmm5                  \n"
+    "punpckhwd  %%xmm1,%%xmm0                  \n"
+    "movdqa     %%xmm5,(%[argb_buf])           \n"
+    "movdqa     %%xmm0,0x10(%[argb_buf])       \n"
+    "lea        0x20(%[argb_buf]),%[argb_buf]  \n"
+    "sub        $0x8,%[width]                  \n"
+    "jg         1b                             \n"
+  : [y_buf]"+r"(y_buf),        // %[y_buf]
+    [u_buf]"+r"(u_buf),        // %[u_buf]
+    [v_buf]"+r"(v_buf),        // %[v_buf]
+    [argb_buf]"+r"(rgba_buf),  // %[argb_buf]
+    [width]"+rm"(width)        // %[width]
+  : [kYuvConstants]"r"(&kYuvConstants.kUVToB)  // %[kYuvConstants]
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+  );
+}
+
 #endif  // HAS_I422TOARGBROW_SSSE3
 
 #ifdef HAS_YTOARGBROW_SSE2
@@ -3654,6 +3729,28 @@ void ARGBInterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
   );
 }
 
+void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
+                  uint8* dst_uv, int pix) {
+  asm volatile (
+    "sub        %0,%1                          \n"
+    ".p2align  4                               \n"
+  "1:                                          \n"
+    "movdqa     (%0),%%xmm0                    \n"
+    "pavgb      (%0,%3),%%xmm0                 \n"
+    "sub        $0x10,%2                       \n"
+    "movdqa     %%xmm0,(%0,%1)                 \n"
+    "lea        0x10(%0),%0                    \n"
+    "jg         1b                             \n"
+  : "+r"(src_uv),  // %0
+    "+r"(dst_uv),  // %1
+    "+r"(pix)      // %2
+  : "r"(static_cast<intptr_t>(src_uv_stride))  // %3
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0"
+#endif
+  );
+}
 #endif  // defined(__x86_64__) || defined(__i386__)
 
 #ifdef __cplusplus
diff --git a/source/row_win.cc b/source/row_win.cc
index de70b9435..8a29f24bb 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -4193,6 +4193,30 @@ void ARGBInterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
   }
 }
 
+__declspec(naked) __declspec(align(16))
+void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
+                  uint8* dst_uv, int pix) {
+  __asm {
+    push       edi
+    mov        eax, [esp + 4 + 4]    // src_uv
+    mov        edx, [esp + 4 + 8]    // src_uv_stride
+    mov        edi, [esp + 4 + 12]   // dst_v
+    mov        ecx, [esp + 4 + 16]   // pix
+    sub        edi, eax
+
+    align      16
+  convertloop:
+    movdqa     xmm0, [eax]
+    pavgb      xmm0, [eax + edx]
+    sub        ecx, 16
+    movdqa     [eax + edi], xmm0
+    lea        eax, [eax + 16]
+    jg         convertloop
+    pop        edi
+    ret
+  }
+}
+
 #endif  // _M_IX86
 
 #ifdef __cplusplus
diff --git a/unit_test/compare_test.cc b/unit_test/compare_test.cc
index 8a49a612f..f6086f03d 100644
--- a/unit_test/compare_test.cc
+++ b/unit_test/compare_test.cc
@@ -108,19 +108,25 @@ TEST_F(libyuvTest, BenchmarkSumSquareError_C) {
   align_buffer_16(src_a, kMaxWidth)
   align_buffer_16(src_b, kMaxWidth)
 
+  MaskCpuFlags(kCpuInitialized);
+
+  memcpy(src_a, "test0123test4567", 16);
+  memcpy(src_b, "tick0123tock4567", 16);
+  uint64 h1 = ComputeSumSquareError(src_a, src_b, 16);
+  EXPECT_EQ(790u, h1);
+
   for (int i = 0; i < kMaxWidth; ++i) {
     src_a[i] = i;
     src_b[i] = i;
   }
 
-  MaskCpuFlags(kCpuInitialized);
-  for (int i = 0; i < benchmark_iterations_; ++i) {
-    ComputeSumSquareError(src_a, src_b, kMaxWidth);
+  int count = benchmark_iterations_ * 1280 * 720 / kMaxWidth;
+  for (int i = 0; i < count; ++i) {
+    h1 = ComputeSumSquareError(src_a, src_b, kMaxWidth);
   }
 
   MaskCpuFlags(-1);
-
-  EXPECT_EQ(0, 0);
+  EXPECT_EQ(h1, 0);
 
   free_aligned_buffer_16(src_a)
   free_aligned_buffer_16(src_b)
@@ -131,16 +137,22 @@ TEST_F(libyuvTest, BenchmarkSumSquareError_OPT) {
   align_buffer_16(src_a, kMaxWidth)
   align_buffer_16(src_b, kMaxWidth)
 
+  memcpy(src_a, "test0123test4567", 16);
+  memcpy(src_b, "tick0123tock4567", 16);
+  uint64 h1 = ComputeSumSquareError(src_a, src_b, 16);
+  EXPECT_EQ(790u, h1);
+
   for (int i = 0; i < kMaxWidth; ++i) {
     src_a[i] = i;
     src_b[i] = i;
   }
 
-  for (int i = 0; i < benchmark_iterations_; ++i) {
-    ComputeSumSquareError(src_a, src_b, kMaxWidth);
+  int count = benchmark_iterations_ * 1280 * 720 / kMaxWidth;
+  for (int i = 0; i < count; ++i) {
+    h1 = ComputeSumSquareError(src_a, src_b, kMaxWidth);
   }
 
-  EXPECT_EQ(0, 0);
+  EXPECT_EQ(h1, 0);
 
   free_aligned_buffer_16(src_a)
   free_aligned_buffer_16(src_b)
diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc
index e9053a359..8af0bf6c2 100644
--- a/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
@@ -28,6 +28,107 @@
 
 namespace libyuv {
 
+#define TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+                       FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, N, NEG) \
+TEST_F(libyuvTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N##_OptVsC) { \
+  const int kWidth = 1280; \
+  const int kHeight = 720; \
+  align_buffer_16(src_y, kWidth * kHeight); \
+  align_buffer_16(src_u, kWidth / SRC_SUBSAMP_X * kHeight / SRC_SUBSAMP_Y); \
+  align_buffer_16(src_v, kWidth / SRC_SUBSAMP_X * kHeight / SRC_SUBSAMP_Y); \
+  align_buffer_16(dst_y_c, kWidth * kHeight); \
+  align_buffer_16(dst_u_c, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y); \
+  align_buffer_16(dst_v_c, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y); \
+  align_buffer_16(dst_y_opt, kWidth * kHeight); \
+  align_buffer_16(dst_u_opt, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y); \
+  align_buffer_16(dst_v_opt, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y); \
+  srandom(time(NULL)); \
+  for (int i = 0; i < kHeight; ++i) \
+    for (int j = 0; j < kWidth; ++j) \
+      src_y[(i * kWidth) + j] = (random() & 0xff); \
+  for (int i = 0; i < kHeight / SRC_SUBSAMP_Y; ++i) \
+    for (int j = 0; j < kWidth / SRC_SUBSAMP_X; ++j) { \
+      src_u[(i * kWidth / SRC_SUBSAMP_X) + j] = (random() & 0xff); \
+      src_v[(i * kWidth / SRC_SUBSAMP_X) + j] = (random() & 0xff); \
+    } \
+  MaskCpuFlags(kCpuInitialized); \
+  SRC_FMT_PLANAR##To##FMT_PLANAR(src_y, kWidth, \
+                                 src_u, kWidth / SRC_SUBSAMP_X, \
+                                 src_v, kWidth / SRC_SUBSAMP_X, \
+                                 dst_y_c, kWidth, \
+                                 dst_u_c, kWidth / SUBSAMP_X, \
+                                 dst_v_c, kWidth / SUBSAMP_X, \
+                                 kWidth, NEG kHeight); \
+  MaskCpuFlags(-1); \
+  for (int i = 0; i < benchmark_iterations_; ++i) { \
+    SRC_FMT_PLANAR##To##FMT_PLANAR(src_y, kWidth, \
+                                   src_u, kWidth / SRC_SUBSAMP_X, \
+                                   src_v, kWidth / SRC_SUBSAMP_X, \
+                                   dst_y_opt, kWidth, \
+                                   dst_u_opt, kWidth / SUBSAMP_X, \
+                                   dst_v_opt, kWidth / SUBSAMP_X, \
+                                   kWidth, NEG kHeight); \
+  } \
+  int max_diff = 0; \
+  for (int i = 0; i < kHeight; ++i) { \
+    for (int j = 0; j < kWidth; ++j) { \
+      int abs_diff = \
+          abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \
+              static_cast<int>(dst_y_opt[i * kWidth + j])); \
+      if (abs_diff > max_diff) { \
+        max_diff = abs_diff; \
+      } \
+    } \
+  } \
+  EXPECT_LE(max_diff, 2); \
+  for (int i = 0; i < kHeight / SUBSAMP_Y; ++i) { \
+    for (int j = 0; j < kWidth / SUBSAMP_X; ++j) { \
+      int abs_diff = \
+          abs(static_cast<int>(dst_u_c[i * kWidth / SUBSAMP_X + j]) - \
+              static_cast<int>(dst_u_opt[i * kWidth / SUBSAMP_X + j])); \
+      if (abs_diff > max_diff) { \
+        max_diff = abs_diff; \
+      } \
+    } \
+  } \
+  EXPECT_LE(max_diff, 2); \
+  for (int i = 0; i < kHeight / SUBSAMP_Y; ++i) { \
+    for (int j = 0; j < kWidth / SUBSAMP_X; ++j) { \
+      int abs_diff = \
+          abs(static_cast<int>(dst_v_c[i * kWidth / SUBSAMP_X + j]) - \
+              static_cast<int>(dst_v_opt[i * kWidth / SUBSAMP_X + j])); \
+      if (abs_diff > max_diff) { \
+        max_diff = abs_diff; \
+      } \
+    } \
+  } \
+  EXPECT_LE(max_diff, 2); \
+  free_aligned_buffer_16(dst_y_c) \
+  free_aligned_buffer_16(dst_u_c) \
+  free_aligned_buffer_16(dst_v_c) \
+  free_aligned_buffer_16(dst_y_opt) \
+  free_aligned_buffer_16(dst_u_opt) \
+  free_aligned_buffer_16(dst_v_opt) \
+  free_aligned_buffer_16(src_y) \
+  free_aligned_buffer_16(src_u) \
+  free_aligned_buffer_16(src_v) \
+}
+
+#define TESTPLANARTOP(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+                      FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \
+    TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+                   FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, , +) \
+    TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+                   FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, Invert, -) \
+
+TESTPLANARTOP(I420, 2, 2, I420, 2, 2)
+TESTPLANARTOP(I422, 2, 1, I420, 2, 2)
+TESTPLANARTOP(I444, 1, 1, I420, 2, 2)
+TESTPLANARTOP(I411, 4, 1, I420, 2, 2)
+TESTPLANARTOP(I420, 2, 2, I422, 2, 1)
+TESTPLANARTOP(I420, 2, 2, I444, 1, 1)
+TESTPLANARTOP(I420, 2, 2, I411, 4, 1)
+
 #define TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, N, NEG) \
 TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N##_OptVsC) { \
   const int kWidth = 1280; \
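
Note on the primitive this patch moves: HalfRow computes a rounding vertical average, dst[x] = (row0[x] + row1[x] + 1) >> 1. SSE2 pavgb and NEON vrhadd.u8 both implement exactly this per-byte rounding average, which is why the C, SSE2, and NEON paths can be expected to agree bit-for-bit. Below is a minimal standalone sketch of the same arithmetic (plain C++; the HalfRowRef name and the main() harness are illustrative only, not part of the patch or of libyuv):

// halfrow_ref.cc: standalone check of the HalfRow rounding average.
// Build (illustrative): g++ halfrow_ref.cc -o halfrow_ref
#include <cassert>
#include <cstdint>
#include <cstdio>

// Same arithmetic as HalfRow_C above: each output byte is the rounded
// average of vertically adjacent bytes from two input rows.
static void HalfRowRef(const uint8_t* src_uv, int src_uv_stride,
                       uint8_t* dst_uv, int pix) {
  for (int x = 0; x < pix; ++x) {
    // uint8_t operands promote to int, so the +1 rounding bias cannot
    // overflow before the shift.
    dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1;
  }
}

int main() {
  // Two 8-byte rows stored back to back; stride == 8 selects row 2.
  const uint8_t rows[16] = {0, 1, 2, 3, 254, 255, 10, 20,
                            0, 2, 3, 3, 255, 255, 11, 21};
  uint8_t out[8];
  HalfRowRef(rows, 8, out, 8);
  assert(out[0] == 0);    // (0 + 0 + 1) >> 1
  assert(out[1] == 2);    // (1 + 2 + 1) >> 1 rounds 1.5 up
  assert(out[4] == 255);  // (254 + 255 + 1) >> 1 stays in range
  for (int x = 0; x < 8; ++x) {
    printf("%u ", out[x]);
  }
  printf("\n");
  return 0;
}

Because I422 carries full-height chroma planes while I420 carries half-height ones, the I422ToI420 path in this patch reduces to a Y-plane copy plus one HalfRow call per pair of source U and V rows, with the CPU-flag dispatch above picking the SSE2, NEON, or C implementation of that single primitive.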