From 0e9b515f5348a453cd97578b4668b3c330973298 Mon Sep 17 00:00:00 2001
From: "frkoenig@google.com"
Date: Sat, 29 Oct 2011 00:26:17 +0000
Subject: [PATCH] Neon 38 downscaler.

Fixed up the unit tests for the filters to use the same image
generation and comparison code.
Added timing output for the scale runs.

Review URL: http://webrtc-codereview.appspot.com/244016

git-svn-id: http://libyuv.googlecode.com/svn/trunk@48 16f28f9a-4ce2-e073-06de-1de4eb20be90
---
 source/scale.cc         | 250 +++++++++++++++++++++++++++++
 unit_test/scale_test.cc | 339 ++++++++++++++++++----------------------
 2 files changed, 402 insertions(+), 187 deletions(-)

diff --git a/source/scale.cc b/source/scale.cc
index 6be7f4ed8..15544a2a7 100644
--- a/source/scale.cc
+++ b/source/scale.cc
@@ -286,6 +286,244 @@ static void ScaleRowDown34_1_Int_NEON(const uint8* src_ptr, int src_stride,
   );
 }
 
+#define HAS_SCALEROWDOWN38_NEON
+const uint8 shuf38[16] __attribute__ ((aligned(16))) =
+  { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
+const uint8 shuf38_2[16] __attribute__ ((aligned(16))) =
+  { 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 };
+const unsigned short mult38_div6[8] __attribute__ ((aligned(16))) =
+  { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
+    65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
+const unsigned short mult38_div9[8] __attribute__ ((aligned(16))) =
+  { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
+    65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
+
+// 32 -> 12
+static void ScaleRowDown38_NEON(const uint8* src_ptr, int,
+                                uint8* dst_ptr, int dst_width) {
+  __asm__ volatile
+  (
+    "vld1.u8    {q3}, [%3]                   \n"
+    "1:                                      \n"
+    "vld1.u8    {d0, d1, d2, d3}, [%0]!      \n"
+    "vtbl.u8    d4, {d0, d1, d2, d3}, d6     \n"
+    "vtbl.u8    d5, {d0, d1, d2, d3}, d7     \n"
+    "vst1.u8    {d4}, [%1]!                  \n"
+    "vst1.u32   {d5[0]}, [%1]!               \n"
+    "subs       %2, #12                      \n"
+    "bhi        1b                           \n"
+    : "+r"(src_ptr),   // %0
+      "+r"(dst_ptr),   // %1
+      "+r"(dst_width)  // %2
+    : "r"(shuf38)      // %3
+    : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc"
+  );
+}
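The shuf38 table above encodes the 3/8 point-sampling pattern that the two vtbl lookups apply: each output pixel i samples source pixel (8 * i + 2) / 3, so every group of 3 outputs consumes 8 inputs, stepping 3, 3, 2. A minimal C sketch of the same step, illustrative only and not part of the patch (the helper name and the uint8_t typing are assumptions; like the asm loop, it assumes dst_width is a multiple of 12):

    #include <stdint.h>

    // C model of ScaleRowDown38_NEON's table lookup: point-sample 32
    // source pixels down to 12 per iteration using the shuf38 indices.
    static void ScaleRowDown38_C_Sketch(const uint8_t* src_ptr,
                                        uint8_t* dst_ptr, int dst_width) {
      static const uint8_t kShuf38[12] =
          { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30 };
      for (int x = 0; x < dst_width; x += 12) {
        for (int i = 0; i < 12; ++i)
          dst_ptr[x + i] = src_ptr[kShuf38[i]];  // same indices as the vtbl table
        src_ptr += 32;  // 32 input pixels consumed per 12 outputs
      }
    }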
+
+// 32x3 -> 12x1
+static void ScaleRowDown38_3_Int_NEON(const uint8* src_ptr, int src_stride,
+                                      uint8* dst_ptr, int dst_width) {
+  __asm__ volatile
+  (
+    "vld1.u16   {q4}, [%4]                   \n"
+    "vld1.u8    {q5}, [%5]                   \n"
+    "vld1.u8    {q8}, [%6]                   \n"
+    "add        r4, %0, %3, lsl #1           \n"
+    "add        %3, %0                       \n"
+    "1:                                      \n"
+
+    // d0 = 00 40 01 41 02 42 03 43
+    // d1 = 10 50 11 51 12 52 13 53
+    // d2 = 20 60 21 61 22 62 23 63
+    // d3 = 30 70 31 71 32 72 33 73
+    "vld4.u8    {d0, d1, d2, d3}, [%0]!      \n"
+    "vld4.u8    {d4, d5, d6, d7}, [%3]!      \n"
+    "vld4.u8    {d12, d13, d14, d15}, [r4]!  \n"
+
+    // Shuffle the input data around to align it so that
+    //  adjacent data can be added: 0,1 - 2,3 - 4,5 - 6,7.
+    // d0 = 00 10 01 11 02 12 03 13
+    // d1 = 40 50 41 51 42 52 43 53
+    "vtrn.u8    d0, d1                       \n"
+    "vtrn.u8    d4, d5                       \n"
+    "vtrn.u8    d12, d13                     \n"
+
+    // d2 = 20 30 21 31 22 32 23 33
+    // d3 = 60 70 61 71 62 72 63 73
+    "vtrn.u8    d2, d3                       \n"
+    "vtrn.u8    d6, d7                       \n"
+    "vtrn.u8    d14, d15                     \n"
+
+    // d0 = 00+10 01+11 02+12 03+13
+    // d2 = 40+50 41+51 42+52 43+53
+    "vpaddl.u8  q0, q0                       \n"
+    "vpaddl.u8  q2, q2                       \n"
+    "vpaddl.u8  q6, q6                       \n"
+
+    // d3 = 60+70 61+71 62+72 63+73
+    "vpaddl.u8  d3, d3                       \n"
+    "vpaddl.u8  d7, d7                       \n"
+    "vpaddl.u8  d15, d15                     \n"
+
+    // combine source lines
+    "vadd.u16   q0, q2                       \n"
+    "vadd.u16   q0, q6                       \n"
+    "vadd.u16   d4, d3, d7                   \n"
+    "vadd.u16   d4, d15                      \n"
+
+    // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
+    //             + s[6 + st * 1] + s[7 + st * 1]
+    //             + s[6 + st * 2] + s[7 + st * 2]) / 6
+    "vqrdmulh.s16 q2, q4                     \n"
+    "vmovn.u16  d4, q2                       \n"
+
+    // Shuffle the 2,3 registers around so that 2 can be added to
+    //  the 0,1 register and 3 can be added to the 4,5 register.
+    //  This requires expanding from u8 to u16 as the 0,1 and 4,5
+    //  registers are already expanded.  Then do transposes to get
+    //  the data aligned.
+    // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+    "vmovl.u8   q1, d2                       \n"
+    "vmovl.u8   q3, d6                       \n"
+    "vmovl.u8   q7, d14                      \n"
+
+    // combine source lines
+    "vadd.u16   q1, q3                       \n"
+    "vadd.u16   q1, q7                       \n"
+
+    // d4 = xx 20 xx 30 xx 22 xx 32
+    // d5 = xx 21 xx 31 xx 23 xx 33
+    "vtrn.u32   d2, d3                       \n"
+
+    // d4 = xx 20 xx 21 xx 22 xx 23
+    // d5 = xx 30 xx 31 xx 32 xx 33
+    "vtrn.u16   d2, d3                       \n"
+
+    // 0+1+2, 3+4+5
+    "vadd.u16   q0, q1                       \n"
+
+    // Need to divide, but can't downshift as the value isn't a
+    //  power of 2.  So multiply by 65536 / n and take the upper
+    //  16 bits.
+    "vqrdmulh.s16 q0, q8                     \n"
+
+    // Align for table lookup; vtbl requires registers to be
+    //  adjacent.
+    "vmov.u8    d2, d4                       \n"
+
+    "vtbl.u8    d3, {d0, d1, d2}, d10        \n"
+    "vtbl.u8    d4, {d0, d1, d2}, d11        \n"
+
+    "vst1.u8    {d3}, [%1]!                  \n"
+    "vst1.u32   {d4[0]}, [%1]!               \n"
+    "subs       %2, #12                      \n"
+    "bhi        1b                           \n"
+    : "+r"(src_ptr),       // %0
+      "+r"(dst_ptr),       // %1
+      "+r"(dst_width),     // %2
+      "+r"(src_stride)     // %3
+    : "r"(mult38_div6),    // %4
+      "r"(shuf38_2),       // %5
+      "r"(mult38_div9)     // %6
+    : "r4", "q0", "q1", "q2", "q3", "q4",
+      "q5", "q6", "q7", "q8", "memory", "cc"
+  );
+}
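The vqrdmulh.s16 instructions above are the division step: the accumulated sums must be divided by 9 (3x3 boxes) or 6 (2x3 boxes), neither of which is a power of two, so the code multiplies by a reciprocal instead. Because vqrdmulh returns the rounded, doubled high half of the product, the constants are 65536/18 and 65536/12 rather than 65536/9 and 65536/6; the doubling supplies the missing factor of two. A C model of the operation (illustrative only; saturation omitted and the helper name is an assumption):

    #include <stdint.h>

    // Model of vqrdmulh.s16: rounding, doubling multiply that keeps the
    // high 16 bits of the product (saturation omitted for brevity).
    static int16_t QRDMulH(int16_t a, int16_t b) {
      return (int16_t)((2 * (int32_t)a * (int32_t)b + (1 << 15)) >> 16);
    }

    // Example: nine pixels of value 90 sum to 810, and
    // QRDMulH(810, 65536 / 18) == 810 / 9 == 90.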
+
+// 32x2 -> 12x1
+static void ScaleRowDown38_2_Int_NEON(const uint8* src_ptr, int src_stride,
+                                      uint8* dst_ptr, int dst_width) {
+  __asm__ volatile
+  (
+    "vld1.u16   {q4}, [%4]                   \n"
+    "vld1.u8    {q5}, [%5]                   \n"
+    "add        %3, %0                       \n"
+    "1:                                      \n"
+
+    // d0 = 00 40 01 41 02 42 03 43
+    // d1 = 10 50 11 51 12 52 13 53
+    // d2 = 20 60 21 61 22 62 23 63
+    // d3 = 30 70 31 71 32 72 33 73
+    "vld4.u8    {d0, d1, d2, d3}, [%0]!      \n"
+    "vld4.u8    {d4, d5, d6, d7}, [%3]!      \n"
+
+    // Shuffle the input data around to align it so that
+    //  adjacent data can be added: 0,1 - 2,3 - 4,5 - 6,7.
+    // d0 = 00 10 01 11 02 12 03 13
+    // d1 = 40 50 41 51 42 52 43 53
+    "vtrn.u8    d0, d1                       \n"
+    "vtrn.u8    d4, d5                       \n"
+
+    // d2 = 20 30 21 31 22 32 23 33
+    // d3 = 60 70 61 71 62 72 63 73
+    "vtrn.u8    d2, d3                       \n"
+    "vtrn.u8    d6, d7                       \n"
+
+    // d0 = 00+10 01+11 02+12 03+13
+    // d2 = 40+50 41+51 42+52 43+53
+    "vpaddl.u8  q0, q0                       \n"
+    "vpaddl.u8  q2, q2                       \n"
+
+    // d3 = 60+70 61+71 62+72 63+73
+    "vpaddl.u8  d3, d3                       \n"
+    "vpaddl.u8  d7, d7                       \n"
+
+    // combine source lines
+    "vadd.u16   q0, q2                       \n"
+    "vadd.u16   d4, d3, d7                   \n"
+
+    // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
+    "vqrshrn.u16 d4, q2, #2                  \n"
+
+    // Shuffle the 2,3 registers around so that 2 can be added to
+    //  the 0,1 register and 3 can be added to the 4,5 register.
+    //  This requires expanding from u8 to u16 as the 0,1 and 4,5
+    //  registers are already expanded.  Then do transposes to get
+    //  the data aligned.
+    // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+    "vmovl.u8   q1, d2                       \n"
+    "vmovl.u8   q3, d6                       \n"
+
+    // combine source lines
+    "vadd.u16   q1, q3                       \n"
+
+    // d4 = xx 20 xx 30 xx 22 xx 32
+    // d5 = xx 21 xx 31 xx 23 xx 33
+    "vtrn.u32   d2, d3                       \n"
+
+    // d4 = xx 20 xx 21 xx 22 xx 23
+    // d5 = xx 30 xx 31 xx 32 xx 33
+    "vtrn.u16   d2, d3                       \n"
+
+    // 0+1+2, 3+4+5
+    "vadd.u16   q0, q1                       \n"
+
+    // Need to divide, but can't downshift as the value isn't a
+    //  power of 2.  So multiply by 65536 / n and take the upper
+    //  16 bits.
+    "vqrdmulh.s16 q0, q4                     \n"
+
+    // Align for table lookup; vtbl requires registers to be
+    //  adjacent.
+    "vmov.u8    d2, d4                       \n"
+
+    "vtbl.u8    d3, {d0, d1, d2}, d10        \n"
+    "vtbl.u8    d4, {d0, d1, d2}, d11        \n"
+
+    "vst1.u8    {d3}, [%1]!                  \n"
+    "vst1.u32   {d4[0]}, [%1]!               \n"
+    "subs       %2, #12                      \n"
+    "bhi        1b                           \n"
+    : "+r"(src_ptr),       // %0
+      "+r"(dst_ptr),       // %1
+      "+r"(dst_width),     // %2
+      "+r"(src_stride)     // %3
+    : "r"(mult38_div6),    // %4
+      "r"(shuf38_2)        // %5
+    : "q0", "q1", "q2", "q3", "q4", "q5", "memory", "cc"
+  );
+}
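Unwinding the transposes, each iteration of ScaleRowDown38_2_Int_NEON reduces 8 source columns across 2 rows to 3 output pixels: two 3x2 box averages, then one 2x2 box average (hence mult38_div6 for the first two and the #2 right-shift for the third). A rough scalar equivalent, ignoring the exact rounding of the NEON instructions (helper name ours, not the library's C fallback):

    #include <stdint.h>

    // Scalar model of ScaleRowDown38_2_Int_NEON: two rows in, one row
    // out, 3 output pixels per 8 input columns.
    static void ScaleRowDown38_2_Sketch(const uint8_t* s,  // row 0
                                        const uint8_t* t,  // row 1 (s + stride)
                                        uint8_t* dst_ptr, int dst_width) {
      for (int x = 0; x < dst_width; x += 3) {
        dst_ptr[x + 0] = (s[0] + s[1] + s[2] + t[0] + t[1] + t[2]) / 6;
        dst_ptr[x + 1] = (s[3] + s[4] + s[5] + t[3] + t[4] + t[5]) / 6;
        dst_ptr[x + 2] = (s[6] + s[7] + t[6] + t[7]) / 4;
        s += 8;
        t += 8;
      }
    }

The three-row variant has the same structure with 3x3 and 2x3 boxes, dividing by 9 and 6 via mult38_div9 and mult38_div6. These rounding differences between the scalar and NEON paths are exactly why the unit test below tolerates a max difference of 2.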
+
 /**
  * SSE2 downscalers with interpolation.
  *
@@ -3064,6 +3302,18 @@ static void ScalePlaneDown38(int src_width, int src_height,
                              uint8* dst_ptr, int dst_width);
   void (*ScaleRowDown38_2)(const uint8* src_ptr, int src_stride,
                            uint8* dst_ptr, int dst_width);
+#if defined(HAS_SCALEROWDOWN38_NEON)
+  if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) &&
+      (dst_width % 24 == 0)) {
+    if (!filtering) {
+      ScaleRowDown38_3 = ScaleRowDown38_NEON;
+      ScaleRowDown38_2 = ScaleRowDown38_NEON;
+    } else {
+      ScaleRowDown38_3 = ScaleRowDown38_3_Int_NEON;
+      ScaleRowDown38_2 = ScaleRowDown38_2_Int_NEON;
+    }
+  } else
+#endif
 #if defined(HAS_SCALEROWDOWN38_SSSE3)
   if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
       (dst_width % 24 == 0) && (src_stride % 16 == 0) &&
diff --git a/unit_test/scale_test.cc b/unit_test/scale_test.cc
index 1d41668cf..44774a54f 100644
--- a/unit_test/scale_test.cc
+++ b/unit_test/scale_test.cc
@@ -27,142 +27,37 @@ using namespace libyuv;
     free(var##_mem); \
     var = 0;
 
-TEST_F(libyuvTest, ScaleDownBy4) {
-  int b = 128;
-  int src_width = 1280;
-  int src_height = 720;
-  int src_width_uv = (src_width + 1) >> 1;
-  int src_height_uv = (src_height + 1) >> 1;
+#ifdef WIN32
 
-  int src_y_plane_size = (src_width + (2 * b)) * (src_height + (2 * b));
-  int src_uv_plane_size = (src_width_uv + (2 * b)) * (src_height_uv + (2 * b));
-
-  int src_stride_y = 2 * b + src_width;
-  int src_stride_uv = 2 * b + src_width_uv;
-
-  align_buffer_16(src_y, src_y_plane_size)
-  align_buffer_16(src_u, src_uv_plane_size)
-  align_buffer_16(src_v, src_uv_plane_size)
-
-  int dst_width = src_width >> 2;
-  int dst_height = src_height >> 2;
-
-  int dst_width_uv = (dst_width + 1) >> 1;
-  int dst_height_uv = (dst_height + 1) >> 1;
-
-  int dst_y_plane_size = (dst_width + (2 * b)) * (dst_height + (2 * b));
-  int dst_uv_plane_size = (dst_width_uv + (2 * b)) * (dst_height_uv + (2 * b));
-
-  int dst_stride_y = 2 * b + dst_width;
-  int dst_stride_uv = 2 * b + dst_width_uv;
-
-  align_buffer_16(dst_y, dst_y_plane_size)
-  align_buffer_16(dst_u, dst_uv_plane_size)
-  align_buffer_16(dst_v, dst_uv_plane_size)
-
-  // create an image with random data reoccurring in 4x4 grid.  When the image
-  // is filtered all the values should be the same.
-  srandom(time(NULL));
-
-  uint8 block_data[16];
-
-  int i, j;
-
-  // Pulling 16 random numbers there is an infinitesimally small
-  // chance that they are all 0.  Then the output will be all 0.
-  // Output buffer is filled with 0, want to make sure that after the
-  // filtering something went into the output buffer.
-  // Avoid this by setting one of the values to 128.  Also set the
-  // random data to at least 1 for when point sampling to prevent
-  // output all being 0.
-  block_data[0] = 128;
-
-  for (i = 1; i < 16; i++)
-    block_data[i] = (random() & 0xfe) + 1;
-
-  for (i = b; i < (src_height + b); i += 4) {
-    for (j = b; j < (src_width + b); j += 4) {
-      uint8 *ptr = src_y + (i * src_stride_y) + j;
-      int k, l;
-      for (k = 0; k < 4; ++k)
-        for (l = 0; l < 4; ++l)
-          ptr[k + src_stride_y * l] = block_data[k + 4 * l];
-    }
-  }
-
-  for (i = 1; i < 16; i++)
-    block_data[i] = (random() & 0xfe) + 1;
-
-  for (i = b; i < (src_height_uv + b); i += 4) {
-    for (j = b; j < (src_width_uv + b); j += 4) {
-      uint8 *ptru = src_u + (i * src_stride_uv) + j;
-      uint8 *ptrv = src_v + (i * src_stride_uv) + j;
-      int k, l;
-      for (k = 0; k < 4; ++k)
-        for (l = 0; l < 4; ++l) {
-          ptru[k + src_stride_uv * l] = block_data[k + 4 * l];
-          ptrv[k + src_stride_uv * l] = block_data[k + 4 * l];
-        }
-    }
-  }
-
-  int f;
-  int err = 0;
-
-  // currently three filter modes, defined as FilterMode in scale.h
-  for (f = 0; f < 3; ++f) {
-    I420Scale(src_y + (src_stride_y * b) + b, src_stride_y,
-              src_u + (src_stride_uv * b) + b, src_stride_uv,
-              src_v + (src_stride_uv * b) + b, src_stride_uv,
-              src_width, src_height,
-              dst_y + (dst_stride_y * b) + b, dst_stride_y,
-              dst_u + (dst_stride_uv * b) + b, dst_stride_uv,
-              dst_v + (dst_stride_uv * b) + b, dst_stride_uv,
-              dst_width, dst_height,
-              static_cast<FilterMode>(f));
-
-    int value = dst_y[(dst_stride_y * b) + b];
-
-    // catch the case that the output buffer is all 0
-    if (value == 0)
-      ++err;
-
-    for (i = b; i < (dst_height + b); ++i) {
-      for (j = b; j < (dst_width + b); ++j) {
-        if (value != dst_y[(i * dst_stride_y) + j])
-          ++err;
-      }
-    }
-
-    value = dst_u[(dst_stride_uv * b) + b];
-
-    if (value == 0)
-      ++err;
-
-    for (i = b; i < (dst_height_uv + b); ++i) {
-      for (j = b; j < (dst_width_uv + b); ++j) {
-        if (value != dst_u[(i * dst_stride_uv) + j])
-          ++err;
-        if (value != dst_v[(i * dst_stride_uv) + j])
-          ++err;
-      }
-    }
-  }
-
-  free_aligned_buffer_16(src_y)
-  free_aligned_buffer_16(src_u)
-  free_aligned_buffer_16(src_v)
-  free_aligned_buffer_16(dst_y)
-  free_aligned_buffer_16(dst_u)
-  free_aligned_buffer_16(dst_v)
-
-  EXPECT_EQ(0, err);
+#include <windows.h>
+static double get_time()
+{
+  LARGE_INTEGER t, f;
+  QueryPerformanceCounter(&t);
+  QueryPerformanceFrequency(&f);
+  return double(t.QuadPart)/double(f.QuadPart);
 }
 
-TEST_F(libyuvTest, ScaleDownBy34) {
+#else
+
+#include <sys/time.h>
+#include <time.h>
+
+static double get_time()
+{
+  struct timeval t;
+  struct timezone tzp;
+  gettimeofday(&t, &tzp);
+  return t.tv_sec + t.tv_usec*1e-6;
+}
+
+#endif
+
+static int TestFilter(int src_width, int src_height,
+                      int dst_width, int dst_height,
+                      FilterMode f) {
+
   int b = 128;
-  int src_width = 1280;
-  int src_height = 720;
   int src_width_uv = (src_width + 1) >> 1;
   int src_height_uv = (src_height + 1) >> 1;
@@ -176,9 +71,6 @@ TEST_F(libyuvTest, ScaleDownBy34) {
   align_buffer_16(src_u, src_uv_plane_size)
   align_buffer_16(src_v, src_uv_plane_size)
 
-  int dst_width = (src_width*3) >> 2;
-  int dst_height = (src_height*3) >> 2;
-
   int dst_width_uv = (dst_width + 1) >> 1;
   int dst_height_uv = (dst_height + 1) >> 1;
@@ -205,20 +97,18 @@ TEST_F(libyuvTest, ScaleDownBy34) {
     }
   }
 
-  int f;
-  int err = 0;
+  const int runs = 128;
+  align_buffer_16(dst_y_c, dst_y_plane_size)
+  align_buffer_16(dst_u_c, dst_uv_plane_size)
+  align_buffer_16(dst_v_c, dst_uv_plane_size)
+  align_buffer_16(dst_y_opt, dst_y_plane_size)
+  align_buffer_16(dst_u_opt, dst_uv_plane_size)
+  align_buffer_16(dst_v_opt, dst_uv_plane_size)
 
-  // currently three filter modes, defined as FilterMode in scale.h
-  for (f = 0; f < 3; ++f) {
-    int max_diff = 0;
-    align_buffer_16(dst_y_c, dst_y_plane_size)
-    align_buffer_16(dst_u_c, dst_uv_plane_size)
-    align_buffer_16(dst_v_c, dst_uv_plane_size)
-    align_buffer_16(dst_y_opt, dst_y_plane_size)
-    align_buffer_16(dst_u_opt, dst_uv_plane_size)
-    align_buffer_16(dst_v_opt, dst_uv_plane_size)
+  libyuv::MaskCpuFlags(0);
+  double c_time = get_time();
 
-    libyuv::MaskCpuFlagsForTest(0);
+  for (i = 0; i < runs; ++i)
     I420Scale(src_y + (src_stride_y * b) + b, src_stride_y,
               src_u + (src_stride_uv * b) + b, src_stride_uv,
               src_v + (src_stride_uv * b) + b, src_stride_uv,
               src_width, src_height,
               dst_y_c + (dst_stride_y * b) + b, dst_stride_y,
               dst_u_c + (dst_stride_uv * b) + b, dst_stride_uv,
               dst_v_c + (dst_stride_uv * b) + b, dst_stride_uv,
-              dst_width, dst_height,
-              static_cast<FilterMode>(f));
+              dst_width, dst_height, f);
 
-    libyuv::MaskCpuFlagsForTest(-1);
+  c_time = (get_time() - c_time) / runs;
+
+  libyuv::MaskCpuFlags(-1);
+  double opt_time = get_time();
+
+  for (i = 0; i < runs; ++i)
     I420Scale(src_y + (src_stride_y * b) + b, src_stride_y,
               src_u + (src_stride_uv * b) + b, src_stride_uv,
               src_v + (src_stride_uv * b) + b, src_stride_uv,
               src_width, src_height,
               dst_y_opt + (dst_stride_y * b) + b, dst_stride_y,
               dst_u_opt + (dst_stride_uv * b) + b, dst_stride_uv,
               dst_v_opt + (dst_stride_uv * b) + b, dst_stride_uv,
-              dst_width, dst_height,
-              static_cast<FilterMode>(f));
+              dst_width, dst_height, f);
 
-    // C version may be a little off from the optimized. Order of
-    // operations may introduce rounding somewhere. So do a difference
-    // of the buffers and look to see that the max difference isn't
-    // over 2.
-    for (i = b; i < (dst_height + b); ++i) {
-      for (j = b; j < (dst_width + b); ++j) {
-        int abs_diff = abs(dst_y_c[(i * dst_stride_y) + j] -
-                           dst_y_opt[(i * dst_stride_y) + j]);
-        if (abs_diff > max_diff)
-          max_diff = abs_diff;
-      }
+  opt_time = (get_time() - opt_time) / runs;
+
+  printf ("filter %d - %8d us c - %8d us opt\n",
+          f, (int)(c_time*1e6), (int)(opt_time*1e6));
+  ::testing::Test::RecordProperty("C", (int)c_time);
+  ::testing::Test::RecordProperty("Opt", (int)opt_time);
+
+  // C version may be a little off from the optimized.  Order of
+  // operations may introduce rounding somewhere.  So do a difference
+  // of the buffers and look to see that the max difference isn't
+  // over 2.
+  int err = 0;
+  int max_diff = 0;
+  for (i = b; i < (dst_height + b); ++i) {
+    for (j = b; j < (dst_width + b); ++j) {
+      int abs_diff = abs(dst_y_c[(i * dst_stride_y) + j] -
+                         dst_y_opt[(i * dst_stride_y) + j]);
+      if (abs_diff > max_diff)
+        max_diff = abs_diff;
     }
+  }
 
-    for (i = b; i < (dst_height_uv + b); ++i) {
-      for (j = b; j < (dst_width_uv + b); ++j) {
-        int abs_diff = abs(dst_u_c[(i * dst_stride_uv) + j] -
-                           dst_u_opt[(i * dst_stride_uv) + j]);
-        if (abs_diff > max_diff)
-          max_diff = abs_diff;
-        abs_diff = abs(dst_v_c[(i * dst_stride_uv) + j] -
-                       dst_v_opt[(i * dst_stride_uv) + j]);
-        if (abs_diff > max_diff)
-          max_diff = abs_diff;
-
-      }
-    }
-
-    if (max_diff > 2)
-      err++;
-
-    free_aligned_buffer_16(dst_y_c)
-    free_aligned_buffer_16(dst_u_c)
-    free_aligned_buffer_16(dst_v_c)
-    free_aligned_buffer_16(dst_y_opt)
-    free_aligned_buffer_16(dst_u_opt)
-    free_aligned_buffer_16(dst_v_opt)
   }
+
+  for (i = b; i < (dst_height_uv + b); ++i) {
+    for (j = b; j < (dst_width_uv + b); ++j) {
+      int abs_diff = abs(dst_u_c[(i * dst_stride_uv) + j] -
+                         dst_u_opt[(i * dst_stride_uv) + j]);
+      if (abs_diff > max_diff)
+        max_diff = abs_diff;
+      abs_diff = abs(dst_v_c[(i * dst_stride_uv) + j] -
+                     dst_v_opt[(i * dst_stride_uv) + j]);
+      if (abs_diff > max_diff)
+        max_diff = abs_diff;
+
+    }
+  }
+
+  if (max_diff > 2)
+    err++;
+
+  free_aligned_buffer_16(dst_y_c)
+  free_aligned_buffer_16(dst_u_c)
+  free_aligned_buffer_16(dst_v_c)
+  free_aligned_buffer_16(dst_y_opt)
+  free_aligned_buffer_16(dst_u_opt)
+  free_aligned_buffer_16(dst_v_opt)
+
   free_aligned_buffer_16(src_y)
   free_aligned_buffer_16(src_u)
   free_aligned_buffer_16(src_v)
 
+  return err;
+}
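TestFilter's I420Scale calls are somewhat obscured by the border offsets it adds for overrun detection; stripped of those, the call is just plane pointers and strides. A minimal sketch (the wrapper is hypothetical and not part of the patch; it assumes tightly packed planes whose strides equal their widths):

    // Hypothetical helper: scale a tightly packed I420 frame with the
    // plane strides implied by the widths.
    static void ScaleI420(const uint8* src_y, const uint8* src_u,
                          const uint8* src_v, int src_width, int src_height,
                          uint8* dst_y, uint8* dst_u, uint8* dst_v,
                          int dst_width, int dst_height, FilterMode f) {
      I420Scale(src_y, src_width,             // Y plane and stride
                src_u, (src_width + 1) >> 1,  // U plane and stride
                src_v, (src_width + 1) >> 1,  // V plane and stride
                src_width, src_height,
                dst_y, dst_width,
                dst_u, (dst_width + 1) >> 1,
                dst_v, (dst_width + 1) >> 1,
                dst_width, dst_height, f);
    }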
+
+TEST_F(libyuvTest, ScaleDownBy2) {
+
+  const int src_width = 1280;
+  const int src_height = 720;
+  const int dst_width = src_width >> 1;
+  const int dst_height = src_height >> 1;
+  int err = 0;
+
+  for (int f = 0; f < 3; ++f)
+    err += TestFilter (src_width, src_height,
+                       dst_width, dst_height,
+                       static_cast<FilterMode>(f));
+
+  EXPECT_EQ(0, err);
+}
+
+TEST_F(libyuvTest, ScaleDownBy4) {
+
+  const int src_width = 1280;
+  const int src_height = 720;
+  const int dst_width = src_width >> 2;
+  const int dst_height = src_height >> 2;
+  int err = 0;
+
+  for (int f = 0; f < 3; ++f)
+    err += TestFilter (src_width, src_height,
+                       dst_width, dst_height,
+                       static_cast<FilterMode>(f));
+
+  EXPECT_EQ(0, err);
+}
+
+TEST_F(libyuvTest, ScaleDownBy34) {
+
+  const int src_width = 1280;
+  const int src_height = 720;
+  const int dst_width = (src_width*3) >> 2;
+  const int dst_height = (src_height*3) >> 2;
+  int err = 0;
+
+  for (int f = 0; f < 3; ++f)
+    err += TestFilter (src_width, src_height,
+                       dst_width, dst_height,
+                       static_cast<FilterMode>(f));
+
+  EXPECT_EQ(0, err);
+}
+
+TEST_F(libyuvTest, ScaleDownBy38) {
+  int src_width = 1280;
+  int src_height = 720;
+  int dst_width = (src_width*3) >> 3;
+  int dst_height = (src_height*3) >> 3;
+
+  int err = 0;
+
+  for (int f = 0; f < 3; ++f)
+    err += TestFilter (src_width, src_height,
+                       dst_width, dst_height,
+                       static_cast<FilterMode>(f));
 
   EXPECT_EQ(0, err);
 }
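With the shared TestFilter helper in place, covering a further ratio is only a few lines. For instance, a hypothetical 1/8 test, not part of this patch, would follow the same pattern:

    // Hypothetical example: additional scale factors reuse the shared
    // image generation, timing and comparison code in TestFilter.
    TEST_F(libyuvTest, ScaleDownBy8) {
      const int src_width = 1280;
      const int src_height = 720;
      const int dst_width = src_width >> 3;
      const int dst_height = src_height >> 3;
      int err = 0;

      for (int f = 0; f < 3; ++f)
        err += TestFilter(src_width, src_height,
                          dst_width, dst_height,
                          static_cast<FilterMode>(f));

      EXPECT_EQ(0, err);
    }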