diff --git a/include/libyuv/scale_argb.h b/include/libyuv/scale_argb.h
index 575119c9c..225f77258 100644
--- a/include/libyuv/scale_argb.h
+++ b/include/libyuv/scale_argb.h
@@ -22,7 +22,7 @@ extern "C" {
 enum FilterMode {
   kFilterNone = 0,      // Point sample; Fastest
   kFilterBilinear = 1,  // Faster than box, but lower quality scaling down.
-  kFilterBox = 2        // Highest quality
+  kFilterBox = 2        // Highest quality (not supported for ARGB)
 };
 
 int ARGBScale(const uint8* src_argb, int src_stride_argb,
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index fcbb506b0..8efe68508 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -775,13 +775,14 @@ __declspec(naked) __declspec(align(16))
 static void SetRows32_X86(uint8* dst, uint32 v32, int width,
                           int dst_stride, int height) {
   __asm {
+    push       esi
     push       edi
     push       ebp
-    mov        edi, [esp + 8 + 4]    // dst
-    mov        eax, [esp + 8 + 8]    // v32
-    mov        ebp, [esp + 8 + 12]   // width
-    mov        edx, [esp + 8 + 16]   // dst_stride
-    mov        ebx, [esp + 8 + 20]   // height
+    mov        edi, [esp + 12 + 4]   // dst
+    mov        eax, [esp + 12 + 8]   // v32
+    mov        ebp, [esp + 12 + 12]  // width
+    mov        edx, [esp + 12 + 16]  // dst_stride
+    mov        esi, [esp + 12 + 20]  // height
     lea        ecx, [ebp * 4]
     sub        edx, ecx              // stride - width * 4
 
@@ -790,11 +791,12 @@ static void SetRows32_X86(uint8* dst, uint32 v32, int width,
     mov        ecx, ebp
     rep stosd
     add        edi, edx
-    sub        ebx, 1
+    sub        esi, 1
     jg         convertloop
 
     pop        ebp
     pop        edi
+    pop        esi
     ret
   }
 }
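The SetRows32_X86 change fixes a register clobber: ebx is callee-saved in the
32-bit x86 calling convention, but the height counter lived in ebx without
being preserved. The counter now lives in esi, which gets a matching
push/pop, and every argument offset grows from [esp + 8 + n] to
[esp + 12 + n] to account for the third saved register.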
diff --git a/source/scale.cc b/source/scale.cc
index c34a5a272..235bcf8f2 100644
--- a/source/scale.cc
+++ b/source/scale.cc
@@ -55,7 +55,7 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, int /* src_stride */,
   asm volatile (
     "1:                                        \n"
     // load even pixels into q0, odd into q1
-    "vld2.u8    {q0,q1}, [%0]!                \n"
+    "vld2.u8    {q0,q1}, [%0]!                 \n"
     "vst1.u8    {q0}, [%1]!                    \n"  // store even pixels
     "subs       %2, %2, #16                    \n"  // 16 processed per loop
     "bgt        1b                             \n"
@@ -71,14 +71,14 @@ void ScaleRowDown2Int_NEON(const uint8* src_ptr, int src_stride,
                            uint8* dst, int dst_width) {
   asm volatile (
     // change the stride to row 2 pointer
-    "add        %1, %0                        \n"
+    "add        %1, %0                         \n"
     "1:                                        \n"
     "vld1.u8    {q0,q1}, [%0]!                 \n"  // load row 1 and post inc
     "vld1.u8    {q2,q3}, [%1]!                 \n"  // load row 2 and post inc
     "vpaddl.u8  q0, q0                         \n"  // row 1 add adjacent
     "vpaddl.u8  q1, q1                         \n"  // row 2 add adjacent, add row 1 to row 2
-    "vpadal.u8  q0, q2                        \n"
+    "vpadal.u8  q0, q2                         \n"
     "vpadal.u8  q1, q3                         \n"
     "vrshrn.u16 d0, q0, #2                     \n"  // downshift, round and pack
     "vrshrn.u16 d1, q1, #2                     \n"
@@ -1399,6 +1399,10 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
 }
 
 // Bilinear row filtering combines 16x2 -> 16x1. SSE2 version.
+// The normal formula for bilinear interpolation is:
+//   source_y_fraction * row1 + (1 - source_y_fraction) * row0
+// The SSE2 version uses a single multiply of the difference:
+//   source_y_fraction * (row1 - row0) + row0
 #define HAS_SCALEFILTERROWS_SSE2
 __declspec(naked) __declspec(align(16))
 static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
@@ -1424,8 +1428,6 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
     pshufd     xmm5, xmm5, 0
     pxor       xmm4, xmm4
 
-    // f * row1 + (1 - frac) row0
-    // frac * (row1 - row0) + row0
     align 16
   xloop:
     movdqa     xmm0, [esi]  // row0
@@ -3677,11 +3679,13 @@ void ScalePlane(const uint8* src, int src_stride,
     // optimized, 3/8
     ScalePlaneDown38(src_width, src_height, dst_width, dst_height,
                      src_stride, dst_stride, src, dst, filtering);
-  } else if (4 * dst_width == src_width && 4 * dst_height == src_height) {
+  } else if (4 * dst_width == src_width && 4 * dst_height == src_height &&
+             filtering != kFilterBilinear) {
     // optimized, 1/4
     ScalePlaneDown4(src_width, src_height, dst_width, dst_height,
                     src_stride, dst_stride, src, dst, filtering);
-  } else if (8 * dst_width == src_width && 8 * dst_height == src_height) {
+  } else if (8 * dst_width == src_width && 8 * dst_height == src_height &&
+             filtering != kFilterBilinear) {
     // optimized, 1/8
     ScalePlaneDown8(src_width, src_height, dst_width, dst_height,
                     src_stride, dst_stride, src, dst, filtering);
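The rewritten comment above is worth unpacking, since the same identity is
used by the ARGB filter-rows code below: with an 8-bit fraction,
source_y_fraction * row1 + (1 - source_y_fraction) * row0 rearranges to
source_y_fraction * (row1 - row0) + row0, which needs one multiply per pixel
instead of two. A scalar sketch of the idea (a hypothetical helper, not part
of this patch; assumes an arithmetic right shift and a fraction scaled to
0..256):

// Scalar model of the fused bilinear row filter.
// frac == 0 returns row0; frac == 256 returns row1.
static void ScaleFilterRowSketch_C(uint8* dst, const uint8* row0,
                                   const uint8* row1, int width, int frac) {
  for (int x = 0; x < width; ++x) {
    // frac * (row1 - row0) + row0, in 8.8 fixed point.
    dst[x] = static_cast<uint8>(row0[x] + ((frac * (row1[x] - row0[x])) >> 8));
  }
}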
diff --git a/source/scale_argb.cc b/source/scale_argb.cc
index a7c1fe383..50ba89bce 100644
--- a/source/scale_argb.cc
+++ b/source/scale_argb.cc
@@ -36,7 +36,7 @@ extern "C" {
 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
 __declspec(naked) __declspec(align(16))
 static void ScaleARGBRowDown2_SSE2(const uint8* src_ptr, int src_stride,
-                            uint8* dst_ptr, int dst_width) {
+                                   uint8* dst_ptr, int dst_width) {
   __asm {
     mov        eax, [esp + 4]        // src_ptr
                                      // src_stride ignored
@@ -61,8 +61,8 @@ static void ScaleARGBRowDown2_SSE2(const uint8* src_ptr, int src_stride,
 // Blends 8x2 rectangle to 4x1.
 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
 __declspec(naked) __declspec(align(16))
-void ScaleARGBRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
-                               uint8* dst_ptr, int dst_width) {
+static void ScaleARGBRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
+                                      uint8* dst_ptr, int dst_width) {
   __asm {
     push       esi
     mov        eax, [esp + 4 + 4]    // src_ptr
@@ -79,8 +79,7 @@ void ScaleARGBRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
     lea        eax,  [eax + 32]
     pavgb      xmm0, xmm2            // average rows
     pavgb      xmm1, xmm3
-
-    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
+    movdqa     xmm2, xmm0            // average columns (8 to 4 pixels)
     shufps     xmm0, xmm1, 0x88      // even pixels
     shufps     xmm2, xmm1, 0xdd      // odd pixels
     pavgb      xmm0, xmm2
@@ -94,6 +93,94 @@ void ScaleARGBRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
   }
 }
 
+#define HAS_SCALEARGBROWDOWNEVEN_SSE2
+// Reads 4 pixels at a time.
+// Alignment requirement: dst_ptr 16 byte aligned.
+__declspec(naked) __declspec(align(16))
+static void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr, int src_stride,
+                                      int src_stepx,
+                                      uint8* dst_ptr, int dst_width) {
+  __asm {
+    push       ebx
+    push       edi
+    mov        eax, [esp + 8 + 4]    // src_ptr
+                                     // src_stride ignored
+    mov        ebx, [esp + 8 + 12]   // src_stepx
+    mov        edx, [esp + 8 + 16]   // dst_ptr
+    mov        ecx, [esp + 8 + 20]   // dst_width
+    lea        ebx, [ebx * 4]
+    lea        edi, [ebx + ebx * 2]
+
+    align 16
+  wloop:
+    movd       xmm0, [eax]
+    movd       xmm1, [eax + ebx]
+    punpckldq  xmm0, xmm1
+    movd       xmm2, [eax + ebx * 2]
+    movd       xmm3, [eax + edi]
+    lea        eax,  [eax + ebx * 4]
+    punpckldq  xmm2, xmm3
+    punpcklqdq xmm0, xmm2
+    sub        ecx, 4
+    movdqa     [edx], xmm0
+    lea        edx, [edx + 16]
+    jg         wloop
+
+    pop        edi
+    pop        ebx
+    ret
+  }
+}
+
+// Blends four 2x2 blocks to 4x1.
+// Alignment requirement: dst_ptr 16 byte aligned.
+__declspec(naked) __declspec(align(16))
+static void ScaleARGBRowDownEvenInt_SSE2(const uint8* src_ptr, int src_stride,
+                                         int src_stepx,
+                                         uint8* dst_ptr, int dst_width) {
+  __asm {
+    push       ebx
+    push       esi
+    push       edi
+    mov        eax, [esp + 12 + 4]    // src_ptr
+    mov        esi, [esp + 12 + 8]    // src_stride
+    mov        ebx, [esp + 12 + 12]   // src_stepx
+    mov        edx, [esp + 12 + 16]   // dst_ptr
+    mov        ecx, [esp + 12 + 20]   // dst_width
+    lea        esi, [eax + esi]       // row1 pointer
+    lea        ebx, [ebx * 4]
+    lea        edi, [ebx + ebx * 2]
+
+    align 16
+  wloop:
+    movq       xmm0, qword ptr [eax]  // row0 4 pairs
+    movhps     xmm0, qword ptr [eax + ebx]
+    movq       xmm1, qword ptr [eax + ebx * 2]
+    movhps     xmm1, qword ptr [eax + edi]
+    lea        eax,  [eax + ebx * 4]
+    movq       xmm2, qword ptr [esi]  // row1 4 pairs
+    movhps     xmm2, qword ptr [esi + ebx]
+    movq       xmm3, qword ptr [esi + ebx * 2]
+    movhps     xmm3, qword ptr [esi + edi]
+    lea        esi,  [esi + ebx * 4]
+    pavgb      xmm0, xmm2             // average rows
+    pavgb      xmm1, xmm3
+    movdqa     xmm2, xmm0             // average columns (8 to 4 pixels)
+    shufps     xmm0, xmm1, 0x88       // even pixels
+    shufps     xmm2, xmm1, 0xdd       // odd pixels
+    pavgb      xmm0, xmm2
+    sub        ecx, 4
+    movdqa     [edx], xmm0
+    lea        edx, [edx + 16]
+    jg         wloop
+
+    pop        edi
+    pop        esi
+    pop        ebx
+    ret
+  }
+}
+
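About the addressing in ScaleARGBRowDownEven_SSE2: ebx holds src_stepx * 4
(the pixel step in bytes) and edi holds three times that, so [eax],
[eax + ebx], [eax + ebx * 2] and [eax + edi] address four source pixels
spaced src_stepx apart, and punpckldq/punpcklqdq pack the four 32-bit pixels
into a single aligned store. In scalar terms, one wloop iteration is
(illustrative only; the matching C reference, ScaleARGBRowDownEven_C, is
added further down in this patch):

// src and dst viewed as uint32 ARGB pixels:
dst[0] = src[0 * src_stepx];   // movd xmm0
dst[1] = src[1 * src_stepx];   // movd xmm1; punpckldq -> low qword
dst[2] = src[2 * src_stepx];   // movd xmm2
dst[3] = src[3 * src_stepx];   // movd xmm3; punpckldq + punpcklqdq
src += 4 * src_stepx;          // lea eax, [eax + ebx * 4]
dst += 4;                      // movdqa [edx]; lea edx, [edx + 16]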
 // Bilinear row filtering combines 4x2 -> 4x1. SSE2 version.
 #define HAS_SCALEARGBFILTERROWS_SSE2
 __declspec(naked) __declspec(align(16))
@@ -472,16 +559,16 @@ static void ScaleARGBRowDown2_C(const uint8* src_ptr, int,
   for (int x = 0; x < dst_width - 1; x += 2) {
     dst[0] = src[0];
     dst[1] = src[2];
-    dst += 2;
     src += 4;
+    dst += 2;
   }
   if (dst_width & 1) {
     dst[0] = src[0];
   }
 }
 
-void ScaleARGBRowDown2Int_C(const uint8* src_ptr, int src_stride,
-                            uint8* dst_ptr, int dst_width) {
+static void ScaleARGBRowDown2Int_C(const uint8* src_ptr, int src_stride,
+                                   uint8* dst_ptr, int dst_width) {
   for (int x = 0; x < dst_width; ++x) {
     dst_ptr[0] = (src_ptr[0] + src_ptr[4] +
                   src_ptr[src_stride] + src_ptr[src_stride + 4] + 2) >> 2;
@@ -491,8 +578,42 @@ void ScaleARGBRowDown2Int_C(const uint8* src_ptr, int src_stride,
                   src_ptr[src_stride + 2] + src_ptr[src_stride + 6] + 2) >> 2;
     dst_ptr[3] = (src_ptr[3] + src_ptr[7] +
                   src_ptr[src_stride + 3] + src_ptr[src_stride + 7] + 2) >> 2;
-    dst_ptr += 4;
     src_ptr += 8;
+    dst_ptr += 4;
+  }
+}
+
+static void ScaleARGBRowDownEven_C(const uint8* src_ptr, int,
+                                   int src_stepx,
+                                   uint8* dst_ptr, int dst_width) {
+  const uint32* src = reinterpret_cast<const uint32*>(src_ptr);
+  uint32* dst = reinterpret_cast<uint32*>(dst_ptr);
+
+  for (int x = 0; x < dst_width - 1; x += 2) {
+    dst[0] = src[0];
+    dst[1] = src[src_stepx];
+    src += src_stepx * 2;
+    dst += 2;
+  }
+  if (dst_width & 1) {
+    dst[0] = src[0];
+  }
+}
+
+static void ScaleARGBRowDownEvenInt_C(const uint8* src_ptr, int src_stride,
+                                      int src_stepx,
+                                      uint8* dst_ptr, int dst_width) {
+  for (int x = 0; x < dst_width; ++x) {
+    dst_ptr[0] = (src_ptr[0] + src_ptr[4] +
+                  src_ptr[src_stride] + src_ptr[src_stride + 4] + 2) >> 2;
+    dst_ptr[1] = (src_ptr[1] + src_ptr[5] +
+                  src_ptr[src_stride + 1] + src_ptr[src_stride + 5] + 2) >> 2;
+    dst_ptr[2] = (src_ptr[2] + src_ptr[6] +
+                  src_ptr[src_stride + 2] + src_ptr[src_stride + 6] + 2) >> 2;
+    dst_ptr[3] = (src_ptr[3] + src_ptr[7] +
+                  src_ptr[src_stride + 3] + src_ptr[src_stride + 7] + 2) >> 2;
+    src_ptr += src_stepx * 4;
+    dst_ptr += 4;
   }
 }
 
@@ -583,7 +704,7 @@ static void ScaleARGBDown2(int src_width, int src_height,
       filtering ? ScaleARGBRowDown2Int_C : ScaleARGBRowDown2_C;
 #if defined(HAS_SCALEARGBROWDOWN2_SSE2)
   if (TestCpuFlag(kCpuHasSSE2) &&
-      IS_ALIGNED(dst_width, 16) &&
+      IS_ALIGNED(dst_width, 4) &&
       IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
       IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
     ScaleARGBRowDown2 = filtering ? ScaleARGBRowDown2Int_SSE2 :
@@ -599,15 +720,52 @@ static void ScaleARGBDown2(int src_width, int src_height,
   }
 }
 
+
+/**
+ * ScaleARGB ARGB Even
+ *
+ * This is an optimized version for scaling down ARGB by an even
+ * factor of its original size.
+ *
+ */
+static void ScaleARGBDownEven(int src_width, int src_height,
+                              int dst_width, int dst_height,
+                              int src_stride, int dst_stride,
+                              const uint8* src_ptr, uint8* dst_ptr,
+                              FilterMode filtering) {
+  assert(IS_ALIGNED(src_width, 2));
+  assert(IS_ALIGNED(src_height, 2));
+  void (*ScaleARGBRowDownEven)(const uint8* src_ptr, int src_stride,
+                               int src_step, uint8* dst_ptr, int dst_width) =
+      filtering ? ScaleARGBRowDownEvenInt_C : ScaleARGBRowDownEven_C;
+#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) &&
+      IS_ALIGNED(dst_width, 4) &&
+      IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
+    ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenInt_SSE2 :
+        ScaleARGBRowDownEven_SSE2;
+  }
+#endif
+  int src_step = src_width / dst_width;
+  // Adjust to point to center of box.
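+  // For example, a 6x reduction gives src_step = 6 and row_step = 6; the
+  // offset below is then 2 rows and 2 pixels into each 6x6 cell, so the
+  // 2x2 blend (or the single point sample) lands on the center of the cell.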
+  int row_step = src_height / dst_height;
+  int row_stride = row_step * src_stride;
+  src_ptr += ((row_step >> 1) - 1) * src_stride + ((src_step >> 1) - 1) * 4;
+  for (int y = 0; y < dst_height; ++y) {
+    ScaleARGBRowDownEven(src_ptr, src_stride, src_step, dst_ptr, dst_width);
+    src_ptr += row_stride;
+    dst_ptr += dst_stride;
+  }
+}
 /**
  * ScaleARGB ARGB to/from any dimensions, with bilinear
  * interpolation.
  */
-void ScaleARGBBilinear(int src_width, int src_height,
-                       int dst_width, int dst_height,
-                       int src_stride, int dst_stride,
-                       const uint8* src_ptr, uint8* dst_ptr) {
+static void ScaleARGBBilinear(int src_width, int src_height,
+                              int dst_width, int dst_height,
+                              int src_stride, int dst_stride,
+                              const uint8* src_ptr, uint8* dst_ptr) {
   assert(dst_width > 0);
   assert(dst_height > 0);
   assert(src_width <= kMaxInputWidth);
@@ -728,11 +886,25 @@ static void ScaleARGB(const uint8* src, int src_stride,
     return;
   }
   if (2 * dst_width == src_width && 2 * dst_height == src_height) {
-    // optimized 1/2.
+    // Optimized 1/2.
     ScaleARGBDown2(src_width, src_height, dst_width, dst_height,
                    src_stride, dst_stride, src, dst, filtering);
     return;
   }
+  int scale_down_x = src_width / dst_width;
+  int scale_down_y = src_height / dst_height;
+  if (dst_width * scale_down_x == src_width &&
+      dst_height * scale_down_y == src_height) {
+    if (!(scale_down_x & 1) && !(scale_down_y & 1)) {
+      // Optimized even scale down, i.e. 4x, 6x, 8x, 10x.
+      ScaleARGBDownEven(src_width, src_height, dst_width, dst_height,
+                        src_stride, dst_stride, src, dst, filtering);
+      return;
+    }
+    if ((scale_down_x & 1) && (scale_down_y & 1)) {
+      filtering = kFilterNone;
+    }
+  }
   // Arbitrary scale up and/or down.
   ScaleARGBAnySize(src_width, src_height, dst_width, dst_height,
                    src_stride, dst_stride, src, dst, filtering);
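Callers do not need to opt in to the new path: ScaleARGB picks
ScaleARGBDownEven whenever both reduction factors are integral and even,
forces point sampling when both are odd, and otherwise uses the
arbitrary-size path. A usage sketch, inside some caller function (buffer
sizes are illustrative; ARGBScale and kFilterBilinear come from
scale_argb.h above):

static uint8 src[1280 * 720 * 4];  // source ARGB frame
static uint8 dst[320 * 180 * 4];   // 4x reduction on both axes

// 1280 / 320 == 720 / 180 == 4, an even integral factor, so this call
// takes the ScaleARGBDownEven path, using the 2x2 center blend because
// bilinear filtering was requested.
ARGBScale(src, 1280 * 4, 1280, 720,
          dst, 320 * 4, 320, 180,
          kFilterBilinear);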
diff --git a/unit_test/scale_argb_test.cc b/unit_test/scale_argb_test.cc
index 489d91323..08103ca87 100644
--- a/unit_test/scale_argb_test.cc
+++ b/unit_test/scale_argb_test.cc
@@ -20,129 +20,169 @@ namespace libyuv {
 static int ARGBTestFilter(int src_width, int src_height,
                           int dst_width, int dst_height, FilterMode f) {
-  int b = 128;
-
-  int src_y_plane_size = (src_width + (2 * b)) * (src_height + (2 * b)) * 4;
-  int src_stride_y = (2 * b + src_width) * 4;
-
-  align_buffer_16(src_y, src_y_plane_size)
-
-  int dst_y_plane_size = (dst_width + (2 * b)) * (dst_height + (2 * b)) * 4;
-  int dst_stride_y = (2 * b + dst_width) * 4;
+  const int b = 128;
+  int src_argb_plane_size = (src_width + (2 * b)) * (src_height + (2 * b)) * 4;
+  int src_stride_argb = (2 * b + src_width) * 4;
+
+  align_buffer_16(src_argb, src_argb_plane_size)
+  memset(src_argb, 1, src_argb_plane_size);
+
+  int dst_argb_plane_size = (dst_width + (2 * b)) * (dst_height + (2 * b)) * 4;
+  int dst_stride_argb = (2 * b + dst_width) * 4;
 
   srandom(time(NULL));
   int i, j;
-
   for (i = b; i < (src_height + b); ++i) {
     for (j = b; j < (src_width + b) * 4; ++j) {
-      src_y[(i * src_stride_y) + j] = (random() & 0xff);
+      src_argb[(i * src_stride_argb) + j] = (random() & 0xff);
     }
   }
 
   const int runs = 1000;
-  align_buffer_16(dst_y_c, dst_y_plane_size)
-  align_buffer_16(dst_y_opt, dst_y_plane_size)
+  align_buffer_16(dst_argb_c, dst_argb_plane_size)
+  align_buffer_16(dst_argb_opt, dst_argb_plane_size)
+  memset(dst_argb_c, 2, dst_argb_plane_size);
+  memset(dst_argb_opt, 3, dst_argb_plane_size);
 
-  MaskCpuFlags(kCpuInitialized);
+  // Warm up both versions for consistent benchmarks.
+  MaskCpuFlags(0);  // Disable all CPU optimization.
+  ARGBScale(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb,
+            src_width, src_height,
+            dst_argb_c + (dst_stride_argb * b) + b * 4, dst_stride_argb,
+            dst_width, dst_height, f);
+  MaskCpuFlags(-1);  // Enable all CPU optimization.
+  ARGBScale(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb,
+            src_width, src_height,
+            dst_argb_opt + (dst_stride_argb * b) + b * 4, dst_stride_argb,
+            dst_width, dst_height, f);
+
+  MaskCpuFlags(0);  // Disable all CPU optimization.
   double c_time = get_time();
-
-  for (i = 0; i < runs; ++i)
-    ARGBScale(src_y + (src_stride_y * b) + b * 4, src_stride_y,
+  for (i = 0; i < runs; ++i) {
+    ARGBScale(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb,
               src_width, src_height,
-              dst_y_c + (dst_stride_y * b) + b * 4, dst_stride_y,
+              dst_argb_c + (dst_stride_argb * b) + b * 4, dst_stride_argb,
               dst_width, dst_height, f);
-
+  }
   c_time = (get_time() - c_time) / runs;
 
-  MaskCpuFlags(-1);
+  MaskCpuFlags(-1);  // Enable all CPU optimization.
   double opt_time = get_time();
-
-  for (i = 0; i < runs; ++i)
-    ARGBScale(src_y + (src_stride_y * b) + b * 4, src_stride_y,
+  for (i = 0; i < runs; ++i) {
+    ARGBScale(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb,
               src_width, src_height,
-              dst_y_opt + (dst_stride_y * b) + b * 4, dst_stride_y,
+              dst_argb_opt + (dst_stride_argb * b) + b * 4, dst_stride_argb,
               dst_width, dst_height, f);
-
+  }
   opt_time = (get_time() - opt_time) / runs;
 
-  printf ("filter %d - %8d us c - %8d us opt\n",
-          f, (int)(c_time*1e6), (int)(opt_time*1e6));
+  // Report performance of C vs OPT.
+  printf("filter %d - %8d us C - %8d us OPT\n",
+         f, static_cast<int>(c_time * 1e6), static_cast<int>(opt_time * 1e6));
 
   // C version may be a little off from the optimized. Order of
   // operations may introduce rounding somewhere. So do a difference
   // of the buffers and look to see that the max difference isn't
   // over 2.
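+  // For example, averaging the 2x2 block {1, 1, 1, 2}: the C code computes
+  // (1 + 1 + 1 + 2 + 2) >> 2 = 1, while pavgb rounds up at each of its two
+  // pairwise passes and yields 2.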
-  int err = 0;
   int max_diff = 0;
   for (i = b; i < (dst_height + b); ++i) {
     for (j = b * 4; j < (dst_width + b) * 4; ++j) {
-      int abs_diff = abs(dst_y_c[(i * dst_stride_y) + j] -
-                         dst_y_opt[(i * dst_stride_y) + j]);
+      int abs_diff = abs(dst_argb_c[(i * dst_stride_argb) + j] -
+                         dst_argb_opt[(i * dst_stride_argb) + j]);
       if (abs_diff > max_diff)
         max_diff = abs_diff;
     }
   }
 
-  if (max_diff > 2)
-    err++;
-
-  free_aligned_buffer_16(dst_y_c)
-  free_aligned_buffer_16(dst_y_opt)
-  free_aligned_buffer_16(src_y)
-  return err;
+  free_aligned_buffer_16(dst_argb_c)
+  free_aligned_buffer_16(dst_argb_opt)
+  free_aligned_buffer_16(src_argb)
+  return max_diff;
 }
 
 TEST_F(libyuvTest, ARGBScaleDownBy2) {
-
   const int src_width = 1280;
   const int src_height = 720;
   const int dst_width = src_width / 2;
   const int dst_height = src_height / 2;
-  int err = 0;
 
   for (int f = 0; f < 2; ++f) {
-    err += ARGBTestFilter(src_width, src_height,
-                          dst_width, dst_height,
-                          static_cast<FilterMode>(f));
+    int err = ARGBTestFilter(src_width, src_height,
+                             dst_width, dst_height,
+                             static_cast<FilterMode>(f));
+    EXPECT_GE(1, err);
   }
-
-  EXPECT_EQ(0, err);
 }
 
 TEST_F(libyuvTest, ARGBScaleDownBy4) {
-
   const int src_width = 1280;
   const int src_height = 720;
   const int dst_width = src_width / 4;
   const int dst_height = src_height / 4;
-  int err = 0;
 
   for (int f = 0; f < 2; ++f) {
-    err += ARGBTestFilter(src_width, src_height,
-                          dst_width, dst_height,
-                          static_cast<FilterMode>(f));
+    int err = ARGBTestFilter(src_width, src_height,
+                             dst_width, dst_height,
+                             static_cast<FilterMode>(f));
+    EXPECT_GE(1, err);
   }
+}
 
-  EXPECT_EQ(0, err);
+TEST_F(libyuvTest, ARGBScaleDownBy5) {
+  const int src_width = 1280;
+  const int src_height = 720;
+  const int dst_width = src_width / 5;
+  const int dst_height = src_height / 5;
+
+  for (int f = 0; f < 2; ++f) {
+    int err = ARGBTestFilter(src_width, src_height,
+                             dst_width, dst_height,
+                             static_cast<FilterMode>(f));
+    EXPECT_GE(1, err);
+  }
+}
+
+TEST_F(libyuvTest, ARGBScaleDownBy8) {
+  const int src_width = 1280;
+  const int src_height = 720;
+  const int dst_width = src_width / 8;
+  const int dst_height = src_height / 8;
+
+  for (int f = 0; f < 2; ++f) {
+    int err = ARGBTestFilter(src_width, src_height,
+                             dst_width, dst_height,
+                             static_cast<FilterMode>(f));
+    EXPECT_GE(1, err);
+  }
+}
+
+TEST_F(libyuvTest, ARGBScaleDownBy16) {
+  const int src_width = 1280;
+  const int src_height = 720;
+  const int dst_width = src_width / 16;
+  const int dst_height = src_height / 16;
+
+  for (int f = 0; f < 2; ++f) {
+    int err = ARGBTestFilter(src_width, src_height,
+                             dst_width, dst_height,
+                             static_cast<FilterMode>(f));
+    EXPECT_GE(1, err);
+  }
 }
 
 TEST_F(libyuvTest, ARGBScaleDownBy34) {
-
   const int src_width = 1280;
   const int src_height = 720;
   const int dst_width = src_width * 3 / 4;
   const int dst_height = src_height * 3 / 4;
-  int err = 0;
 
   for (int f = 0; f < 2; ++f) {
-    err += ARGBTestFilter(src_width, src_height,
-                          dst_width, dst_height,
-                          static_cast<FilterMode>(f));
+    int err = ARGBTestFilter(src_width, src_height,
+                             dst_width, dst_height,
+                             static_cast<FilterMode>(f));
+    EXPECT_GE(1, err);
   }
-
-  EXPECT_EQ(0, err);
 }
 
@@ -150,31 +190,27 @@ TEST_F(libyuvTest, ARGBScaleDownBy38) {
   int src_height = 720;
   int dst_width = src_width * 3 / 8;
   int dst_height = src_height * 3 / 8;
-  int err = 0;
 
   for (int f = 0; f < 2; ++f) {
-    err += ARGBTestFilter(src_width, src_height,
-                          dst_width, dst_height,
-                          static_cast<FilterMode>(f));
+    int err = ARGBTestFilter(src_width, src_height,
+                             dst_width, dst_height,
+                             static_cast<FilterMode>(f));
+    EXPECT_GE(1, err);
   }
-
-  EXPECT_EQ(0, err);
 }
 
-TEST_F(libyuvTest, ARGBScalePlaneBilinear) {
+TEST_F(libyuvTest, ARGBScaleTo1366) {
   int src_width = 1280;
   int src_height = 720;
   int dst_width = 1366;
   int dst_height = 768;
-  int err = 0;
 
   for (int f = 0; f < 2; ++f) {
-    err += ARGBTestFilter(src_width, src_height,
-                          dst_width, dst_height,
-                          static_cast<FilterMode>(f));
+    int err = ARGBTestFilter(src_width, src_height,
+                             dst_width, dst_height,
+                             static_cast<FilterMode>(f));
+    EXPECT_GE(1, err);
   }
-
-  EXPECT_EQ(0, err);
 }
 
 }  // namespace libyuv
diff --git a/unit_test/scale_test.cc b/unit_test/scale_test.cc
index 215aadfb6..4701c1e1a 100644
--- a/unit_test/scale_test.cc
+++ b/unit_test/scale_test.cc
@@ -20,8 +20,7 @@ namespace libyuv {
 static int TestFilter(int src_width, int src_height,
                       int dst_width, int dst_height, FilterMode f) {
-
-  int b = 128;
+  const int b = 128;
 
   int src_width_uv = (src_width + 1) >> 1;
   int src_height_uv = (src_height + 1) >> 1;
@@ -47,7 +46,6 @@ static int TestFilter(int src_width, int src_height,
   srandom(time(NULL));
   int i, j;
-
   for (i = b; i < (src_height + b); ++i) {
     for (j = b; j < (src_width + b); ++j) {
       src_y[(i * src_stride_y) + j] = (random() & 0xff);
@@ -69,10 +67,29 @@ static int TestFilter(int src_width, int src_height,
   align_buffer_16(dst_u_opt, dst_uv_plane_size)
   align_buffer_16(dst_v_opt, dst_uv_plane_size)
 
-  MaskCpuFlags(kCpuInitialized);
-  double c_time = get_time();
+  // Warm up both versions for consistent benchmarks.
+  MaskCpuFlags(0);  // Disable all CPU optimization.
+  I420Scale(src_y + (src_stride_y * b) + b, src_stride_y,
+            src_u + (src_stride_uv * b) + b, src_stride_uv,
+            src_v + (src_stride_uv * b) + b, src_stride_uv,
+            src_width, src_height,
+            dst_y_c + (dst_stride_y * b) + b, dst_stride_y,
+            dst_u_c + (dst_stride_uv * b) + b, dst_stride_uv,
+            dst_v_c + (dst_stride_uv * b) + b, dst_stride_uv,
+            dst_width, dst_height, f);
+  MaskCpuFlags(-1);  // Enable all CPU optimization.
+  I420Scale(src_y + (src_stride_y * b) + b, src_stride_y,
+            src_u + (src_stride_uv * b) + b, src_stride_uv,
+            src_v + (src_stride_uv * b) + b, src_stride_uv,
+            src_width, src_height,
+            dst_y_opt + (dst_stride_y * b) + b, dst_stride_y,
+            dst_u_opt + (dst_stride_uv * b) + b, dst_stride_uv,
+            dst_v_opt + (dst_stride_uv * b) + b, dst_stride_uv,
+            dst_width, dst_height, f);
 
-  for (i = 0; i < runs; ++i)
+  MaskCpuFlags(0);  // Disable all CPU optimization.
+  double c_time = get_time();
+  for (i = 0; i < runs; ++i) {
     I420Scale(src_y + (src_stride_y * b) + b, src_stride_y,
               src_u + (src_stride_uv * b) + b, src_stride_uv,
               src_v + (src_stride_uv * b) + b, src_stride_uv,
@@ -81,13 +98,12 @@ static int TestFilter(int src_width, int src_height,
               dst_u_c + (dst_stride_uv * b) + b, dst_stride_uv,
               dst_v_c + (dst_stride_uv * b) + b, dst_stride_uv,
               dst_width, dst_height, f);
-
+  }
   c_time = (get_time() - c_time) / runs;
 
-  MaskCpuFlags(-1);
+  MaskCpuFlags(-1);  // Enable all CPU optimization.
   double opt_time = get_time();
-
-  for (i = 0; i < runs; ++i)
+  for (i = 0; i < runs; ++i) {
     I420Scale(src_y + (src_stride_y * b) + b, src_stride_y,
               src_u + (src_stride_uv * b) + b, src_stride_uv,
               src_v + (src_stride_uv * b) + b, src_stride_uv,
@@ -96,24 +112,25 @@ static int TestFilter(int src_width, int src_height,
               dst_u_opt + (dst_stride_uv * b) + b, dst_stride_uv,
               dst_v_opt + (dst_stride_uv * b) + b, dst_stride_uv,
               dst_width, dst_height, f);
-
+  }
   opt_time = (get_time() - opt_time) / runs;
 
-  printf ("filter %d - %8d us c - %8d us opt\n",
-          f, (int)(c_time*1e6), (int)(opt_time*1e6));
+  // Report performance of C vs OPT.
+  printf("filter %d - %8d us C - %8d us OPT\n",
+         f, static_cast<int>(c_time * 1e6), static_cast<int>(opt_time * 1e6));
 
   // C version may be a little off from the optimized. Order of
   // operations may introduce rounding somewhere. So do a difference
   // of the buffers and look to see that the max difference isn't
   // over 2.
-  int err = 0;
   int max_diff = 0;
   for (i = b; i < (dst_height + b); ++i) {
     for (j = b; j < (dst_width + b); ++j) {
       int abs_diff = abs(dst_y_c[(i * dst_stride_y) + j] -
                          dst_y_opt[(i * dst_stride_y) + j]);
-      if (abs_diff > max_diff)
+      if (abs_diff > max_diff) {
         max_diff = abs_diff;
+      }
     }
   }
 
@@ -121,19 +138,17 @@ static int TestFilter(int src_width, int src_height,
     for (j = b; j < (dst_width_uv + b); ++j) {
       int abs_diff = abs(dst_u_c[(i * dst_stride_uv) + j] -
                          dst_u_opt[(i * dst_stride_uv) + j]);
-      if (abs_diff > max_diff)
+      if (abs_diff > max_diff) {
         max_diff = abs_diff;
+      }
       abs_diff = abs(dst_v_c[(i * dst_stride_uv) + j] -
                      dst_v_opt[(i * dst_stride_uv) + j]);
-      if (abs_diff > max_diff)
+      if (abs_diff > max_diff) {
         max_diff = abs_diff;
-
+      }
     }
   }
 
-  if (max_diff > 2)
-    err++;
-
   free_aligned_buffer_16(dst_y_c)
   free_aligned_buffer_16(dst_u_c)
   free_aligned_buffer_16(dst_v_c)
@@ -145,55 +160,91 @@ static int TestFilter(int src_width, int src_height,
   free_aligned_buffer_16(src_u)
   free_aligned_buffer_16(src_v)
 
-  return err;
+  return max_diff;
 }
 
 TEST_F(libyuvTest, ScaleDownBy2) {
-
   const int src_width = 1280;
   const int src_height = 720;
   const int dst_width = src_width / 2;
   const int dst_height = src_height / 2;
-  int err = 0;
 
-  for (int f = 0; f < 3; ++f)
-    err += TestFilter(src_width, src_height,
-                      dst_width, dst_height,
-                      static_cast<FilterMode>(f));
-
-  EXPECT_EQ(0, err);
+  for (int f = 0; f < 3; ++f) {
+    int err = TestFilter(src_width, src_height,
+                         dst_width, dst_height,
+                         static_cast<FilterMode>(f));
+    EXPECT_GE(1, err);
+  }
 }
 
 TEST_F(libyuvTest, ScaleDownBy4) {
-
   const int src_width = 1280;
   const int src_height = 720;
   const int dst_width = src_width / 4;
   const int dst_height = src_height / 4;
-  int err = 0;
 
-  for (int f = 0; f < 3; ++f)
-    err += TestFilter(src_width, src_height,
-                      dst_width, dst_height,
-                      static_cast<FilterMode>(f));
+  for (int f = 0; f < 3; ++f) {
+    int err = TestFilter(src_width, src_height,
+                         dst_width, dst_height,
+                         static_cast<FilterMode>(f));
+    EXPECT_GE(2, err);  // This is the only scale factor with error of 2.
+  }
+}
 
-  EXPECT_EQ(0, err);
+TEST_F(libyuvTest, ScaleDownBy5) {
+  const int src_width = 1280;
+  const int src_height = 720;
+  const int dst_width = src_width / 5;
+  const int dst_height = src_height / 5;
+
+  for (int f = 0; f < 3; ++f) {
+    int err = TestFilter(src_width, src_height,
+                         dst_width, dst_height,
+                         static_cast<FilterMode>(f));
+    EXPECT_GE(1, err);
+  }
+}
+
+TEST_F(libyuvTest, ScaleDownBy8) {
+  const int src_width = 1280;
+  const int src_height = 720;
+  const int dst_width = src_width / 8;
+  const int dst_height = src_height / 8;
+
+  for (int f = 0; f < 3; ++f) {
+    int err = TestFilter(src_width, src_height,
+                         dst_width, dst_height,
+                         static_cast<FilterMode>(f));
+    EXPECT_GE(1, err);
+  }
+}
+
+TEST_F(libyuvTest, ScaleDownBy16) {
+  const int src_width = 1280;
+  const int src_height = 720;
+  const int dst_width = src_width / 16;
+  const int dst_height = src_height / 16;
+
+  for (int f = 0; f < 3; ++f) {
+    int err = TestFilter(src_width, src_height,
+                         dst_width, dst_height,
+                         static_cast<FilterMode>(f));
+    EXPECT_GE(1, err);
+  }
 }
 
 TEST_F(libyuvTest, ScaleDownBy34) {
-
   const int src_width = 1280;
   const int src_height = 720;
   const int dst_width = src_width * 3 / 4;
   const int dst_height = src_height * 3 / 4;
-  int err = 0;
 
-  for (int f = 0; f < 3; ++f)
-    err += TestFilter(src_width, src_height,
-                      dst_width, dst_height,
-                      static_cast<FilterMode>(f));
-
-  EXPECT_EQ(0, err);
+  for (int f = 0; f < 3; ++f) {
+    int err = TestFilter(src_width, src_height,
+                         dst_width, dst_height,
+                         static_cast<FilterMode>(f));
+    EXPECT_GE(1, err);
+  }
 }
 
@@ -201,29 +252,27 @@ TEST_F(libyuvTest, ScaleDownBy38) {
   int src_height = 720;
   int dst_width = src_width * 3 / 8;
   int dst_height = src_height * 3 / 8;
-  int err = 0;
 
-  for (int f = 0; f < 3; ++f)
-    err += TestFilter(src_width, src_height,
-                      dst_width, dst_height,
-                      static_cast<FilterMode>(f));
-
-  EXPECT_EQ(0, err);
+  for (int f = 0; f < 3; ++f) {
+    int err = TestFilter(src_width, src_height,
+                         dst_width, dst_height,
+                         static_cast<FilterMode>(f));
+    EXPECT_GE(1, err);
+  }
 }
 
-TEST_F(libyuvTest, ScalePlaneBilinear) {
+TEST_F(libyuvTest, ScaleTo1366) {
   int src_width = 1280;
   int src_height = 720;
   int dst_width = 1366;
   int dst_height = 768;
-  int err = 0;
 
-  for (int f = 0; f < 3; ++f)
-    err += TestFilter(src_width, src_height,
-                      dst_width, dst_height,
-                      static_cast<FilterMode>(f));
-
-  EXPECT_EQ(0, err);
+  for (int f = 0; f < 3; ++f) {
+    int err = TestFilter(src_width, src_height,
+                         dst_width, dst_height,
+                         static_cast<FilterMode>(f));
+    EXPECT_GE(1, err);
+  }
 }
 
 }  // namespace libyuv
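A note on the assertion style used throughout: gtest's EXPECT_GE(a, b)
checks a >= b, so EXPECT_GE(1, err) bounds the maximum per-channel
difference between the C and SIMD outputs at 1 (2 for the 1/4 Y-plane
case). The pattern generalizes to any libyuv entry point; a condensed
sketch of the harness shape, inside a TEST_F body (sizes are illustrative,
and the border/timing machinery of the real tests is omitted):

  uint8 src[64 * 64 * 4];
  uint8 dst_c[32 * 32 * 4];
  uint8 dst_opt[32 * 32 * 4];
  memset(src, 1, sizeof(src));

  MaskCpuFlags(0);   // force the portable C path
  ARGBScale(src, 64 * 4, 64, 64, dst_c, 32 * 4, 32, 32, kFilterBilinear);
  MaskCpuFlags(-1);  // restore all SIMD paths
  ARGBScale(src, 64 * 4, 64, 64, dst_opt, 32 * 4, 32, 32, kFilterBilinear);

  int max_diff = 0;  // bound the rounding disagreement between the paths
  for (int i = 0; i < 32 * 32 * 4; ++i) {
    int diff = abs(dst_c[i] - dst_opt[i]);
    if (diff > max_diff) max_diff = diff;
  }
  EXPECT_GE(1, max_diff);  // asserts 1 >= max_diff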