From b911428afd3994f47e5780a80c876d05d1d4c590 Mon Sep 17 00:00:00 2001 From: "fbarchard@google.com" Date: Thu, 30 May 2013 23:42:27 +0000 Subject: [PATCH] Adapt row interpolator to do YUV as well as ARGB without extrude so it can be used in I420Scale. BUG=237 TEST=Scale* R=ryanpetrie@google.com Review URL: https://webrtc-codereview.appspot.com/1587004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@710 16f28f9a-4ce2-e073-06de-1de4eb20be90 --- README.chromium | 2 +- include/libyuv/row.h | 65 +-- include/libyuv/version.h | 2 +- source/planar_functions.cc | 43 +- source/row_any.cc | 38 +- source/row_common.cc | 21 +- source/row_mips.cc | 57 +++ source/row_neon.cc | 19 +- source/row_posix.cc | 94 ++-- source/row_win.cc | 92 ++-- source/scale.cc | 940 +++-------------------------------- source/scale_argb.cc | 111 +++-- source/scale_mips.cc | 58 --- unit_test/scale_argb_test.cc | 78 ++- unit_test/scale_test.cc | 50 +- 15 files changed, 475 insertions(+), 1195 deletions(-) diff --git a/README.chromium b/README.chromium index 4f9044897..e4a82a09d 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 709 +Version: 710 License: BSD License File: LICENSE diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 2c17a3585..eccad6582 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -109,8 +109,8 @@ extern "C" { #define HAS_ARGBBLENDROW_SSSE3 #define HAS_ARGBCOLORMATRIXROW_SSSE3 #define HAS_ARGBGRAYROW_SSSE3 -#define HAS_ARGBINTERPOLATEROW_SSE2 -#define HAS_ARGBINTERPOLATEROW_SSSE3 +#define HAS_INTERPOLATEROW_SSE2 +#define HAS_INTERPOLATEROW_SSSE3 #define HAS_ARGBMIRRORROW_SSSE3 #define HAS_ARGBMULTIPLYROW_SSE2 #define HAS_ARGBQUANTIZEROW_SSE2 @@ -261,7 +261,6 @@ extern "C" { #define HAS_ARGBBLENDROW_NEON #define HAS_ARGBCOLORMATRIXROW_NEON #define HAS_ARGBGRAYROW_NEON -#define HAS_ARGBINTERPOLATEROW_NEON #define HAS_ARGBMIRRORROW_NEON #define HAS_ARGBMULTIPLYROW_NEON #define HAS_ARGBQUANTIZEROW_NEON @@ -272,6 +271,7 @@ extern "C" { #define HAS_SOBELXYROW_NEON #define HAS_SOBELXROW_NEON #define HAS_SOBELYROW_NEON +#define HAS_INTERPOLATEROW_NEON #endif // The following are available on Mips platforms @@ -281,6 +281,7 @@ extern "C" { #define HAS_I422TOABGRROW_MIPS_DSPR2 #define HAS_I422TOARGBROW_MIPS_DSPR2 #define HAS_I422TOBGRAROW_MIPS_DSPR2 +#define HAS_INTERPOLATEROWS_MIPS_DSPR2 #define HAS_MIRRORROW_MIPS_DSPR2 #define HAS_MIRRORUVROW_MIPS_DSPR2 #define HAS_SPLITUVROW_MIPS_DSPR2 @@ -1455,34 +1456,40 @@ LIBYUV_API void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, uint8* dst_argb, const float* uv_dudv, int width); -// Used for ARGBScale and ARGBInterpolate. -void ARGBInterpolateRow_C(uint8* dst_argb, const uint8* src_argb, - ptrdiff_t src_stride_argb, - int width, int source_y_fraction); -void ARGBInterpolateRow_SSE2(uint8* dst_argb, const uint8* src_argb, - ptrdiff_t src_stride_argb, int width, +// Used for I420Scale, ARGBScale, and ARGBInterpolate. 
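Note that the width argument of the renamed InterpolateRow functions counts bytes, not pixels, which is what lets one row interpolator serve planar YUV (1 byte per pixel) and ARGB (4 bytes per pixel) alike. A sketch of the two kinds of call sites (the variable names here are illustrative, not part of the patch):

  // Y plane: 1 byte per pixel, so the byte count equals the pixel count.
  InterpolateRow_C(dst_y, src_y, src_stride_y, width, source_y_fraction);
  // ARGB: 4 bytes per pixel, so pass width * 4, matching the
  // ARGBInterpolate loop later in this patch.
  InterpolateRow_C(dst_argb, src_argb, src_stride_argb, width * 4,
                   source_y_fraction);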
+void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, + int width, int source_y_fraction); +void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, int width, + int source_y_fraction); +void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, int width, + int source_y_fraction); +void InterpolateRow_NEON(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, int width, + int source_y_fraction); +void InterpolateRows_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, int width, + int source_y_fraction); +void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, int width, + int source_y_fraction); +void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, int width, + int source_y_fraction); +void InterpolateRow_Any_NEON(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, int width, int source_y_fraction); -void ARGBInterpolateRow_SSSE3(uint8* dst_argb, const uint8* src_argb, - ptrdiff_t src_stride_argb, int width, +void InterpolateRow_Any_SSE2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, int width, + int source_y_fraction); +void InterpolateRow_Any_SSSE3(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, int width, int source_y_fraction); -void ARGBInterpolateRow_NEON(uint8* dst_argb, const uint8* src_argb, - ptrdiff_t src_stride_argb, int width, - int source_y_fraction); -void ARGBInterpolateRow_Unaligned_SSE2(uint8* dst_argb, const uint8* src_argb, - ptrdiff_t src_stride_argb, int width, - int source_y_fraction); -void ARGBInterpolateRow_Unaligned_SSSE3(uint8* dst_argb, const uint8* src_argb, - ptrdiff_t src_stride_argb, int width, - int source_y_fraction); -void ARGBInterpolateRow_Any_NEON(uint8* dst_argb, const uint8* src_argb, - ptrdiff_t src_stride_argb, int width, - int source_y_fraction); -void ARGBInterpolateRow_Any_SSE2(uint8* dst_argb, const uint8* src_argb, - ptrdiff_t src_stride_argb, int width, - int source_y_fraction); -void ARGBInterpolateRow_Any_SSSE3(uint8* dst_argb, const uint8* src_argb, - ptrdiff_t src_stride_argb, int width, - int source_y_fraction); +void InterpolateRows_Any_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, int width, + int source_y_fraction); // Sobel images. void SobelXRow_C(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2, diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 779b8f00d..4a1170669 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 709 +#define LIBYUV_VERSION 710 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/planar_functions.cc b/source/planar_functions.cc index 2133f4e2b..09e2f5368 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -1617,7 +1617,7 @@ int ARGBShade(const uint8* src_argb, int src_stride_argb, // Interpolate 2 ARGB images by specified amount (0 to 255). // TODO(fbarchard): Consider selecting a specialization for interpolation so -// row function doesn't need to check interpolation on each row. +// row function doesn't need to check interpolation on each row. 
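The TODO refers to the dispatch each SIMD kernel performs on every call: source_y_fraction is compared against 0, 64, 128 and 192 before the general blend loop runs. A sketch of the intended specialization, hoisting that check out of the per-row loop (CopyRow100 and BlendRow50 are hypothetical names, not functions in this patch):

  // Select a specialized kernel once per image, so rows skip the dispatch.
  void (*Row)(uint8* dst, const uint8* src, ptrdiff_t stride,
              int width, int fraction) = InterpolateRow;
  if (interpolation == 0) {
    Row = CopyRow100;   // 100/0 blend degenerates to a straight row copy.
  } else if (interpolation == 128) {
    Row = BlendRow50;   // 50/50 blend is a single rounding-average pass.
  }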
LIBYUV_API int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0, const uint8* src_argb1, int src_stride_argb1, @@ -1642,46 +1642,55 @@ int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0, width * height, 1, interpolation); } - void (*ARGBInterpolateRow)(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) = ARGBInterpolateRow_C; -#if defined(HAS_ARGBINTERPOLATEROW_SSE2) + void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_C; +#if defined(HAS_INTERPOLATEROW_SSE2) if (TestCpuFlag(kCpuHasSSE2) && width >= 4) { - ARGBInterpolateRow = ARGBInterpolateRow_Any_SSE2; + InterpolateRow = InterpolateRow_Any_SSE2; if (IS_ALIGNED(width, 4)) { - ARGBInterpolateRow = ARGBInterpolateRow_Unaligned_SSE2; + InterpolateRow = InterpolateRow_Unaligned_SSE2; if (IS_ALIGNED(src_argb0, 16) && IS_ALIGNED(src_stride_argb0, 16) && IS_ALIGNED(src_argb1, 16) && IS_ALIGNED(src_stride_argb1, 16) && IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { - ARGBInterpolateRow = ARGBInterpolateRow_SSE2; + InterpolateRow = InterpolateRow_SSE2; } } } #endif -#if defined(HAS_ARGBINTERPOLATEROW_SSSE3) +#if defined(HAS_INTERPOLATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && width >= 4) { - ARGBInterpolateRow = ARGBInterpolateRow_Any_SSSE3; + InterpolateRow = InterpolateRow_Any_SSSE3; if (IS_ALIGNED(width, 4)) { - ARGBInterpolateRow = ARGBInterpolateRow_Unaligned_SSSE3; + InterpolateRow = InterpolateRow_Unaligned_SSSE3; if (IS_ALIGNED(src_argb0, 16) && IS_ALIGNED(src_stride_argb0, 16) && IS_ALIGNED(src_argb1, 16) && IS_ALIGNED(src_stride_argb1, 16) && IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { - ARGBInterpolateRow = ARGBInterpolateRow_SSSE3; + InterpolateRow = InterpolateRow_SSSE3; } } } #endif -#if defined(HAS_ARGBINTERPOLATEROW_NEON) +#if defined(HAS_INTERPOLATEROW_NEON) if (TestCpuFlag(kCpuHasNEON) && width >= 4) { - ARGBInterpolateRow = ARGBInterpolateRow_Any_NEON; + InterpolateRow = InterpolateRow_Any_NEON; if (IS_ALIGNED(width, 4)) { - ARGBInterpolateRow = ARGBInterpolateRow_NEON; + InterpolateRow = InterpolateRow_NEON; } } #endif +#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && width >= 1 && + IS_ALIGNED(src_argb0, 4) && IS_ALIGNED(src_stride_argb0, 4) && + IS_ALIGNED(src_argb1, 4) && IS_ALIGNED(src_stride_argb1, 4) && + IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { + InterpolateRow = InterpolateRows_MIPS_DSPR2; + } +#endif + for (int y = 0; y < height; ++y) { - ARGBInterpolateRow(dst_argb, src_argb0, src_argb1 - src_argb0, - width, interpolation); + InterpolateRow(dst_argb, src_argb0, src_argb1 - src_argb0, + width * 4, interpolation); src_argb0 += src_stride_argb0; src_argb1 += src_stride_argb1; dst_argb += dst_stride_argb; diff --git a/source/row_any.cc b/source/row_any.cc index 83afb420e..72100d90e 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -483,29 +483,33 @@ YANY(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, #undef YANY // Interpolate may want to work in place, so last16 method can not be used.
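For reference, the NANY macro that follows expands, for the SSE2 case (SBPP = BPP = 1, MASK = 15), to roughly the following, written out here to show why the in-place constraint matters: the C tail begins exactly where the SIMD portion stopped, so when dst_ptr aliases src_ptr no byte is read after it has been overwritten, which re-running SIMD over the last 16 bytes would violate:

  void InterpolateRow_Any_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                               ptrdiff_t src_stride_ptr, int width,
                               int source_y_fraction) {
    int n = width & ~15;  // largest multiple of 16 bytes
    InterpolateRow_Unaligned_SSE2(dst_ptr, src_ptr, src_stride_ptr,
                                  n, source_y_fraction);
    InterpolateRow_C(dst_ptr + n, src_ptr + n, src_stride_ptr,
                     width & 15, source_y_fraction);  // 0..15 leftover bytes
  }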
-#define NANY(NAMEANY, ARGBTERP_SIMD, ARGBTERP_C, SBPP, BPP, MASK) \ - void NAMEANY(uint8* dst_argb, const uint8* src_argb, \ - ptrdiff_t src_stride_argb, int width, \ +#define NANY(NAMEANY, TERP_SIMD, TERP_C, SBPP, BPP, MASK) \ + void NAMEANY(uint8* dst_ptr, const uint8* src_ptr, \ + ptrdiff_t src_stride_ptr, int width, \ int source_y_fraction) { \ int n = width & ~MASK; \ - ARGBTERP_SIMD(dst_argb, src_argb, src_stride_argb, \ - n, source_y_fraction); \ - ARGBTERP_C(dst_argb + n * BPP, \ - src_argb + n * SBPP, src_stride_argb, \ - width & MASK, source_y_fraction); \ + TERP_SIMD(dst_ptr, src_ptr, src_stride_ptr, \ + n, source_y_fraction); \ + TERP_C(dst_ptr + n * BPP, \ + src_ptr + n * SBPP, src_stride_ptr, \ + width & MASK, source_y_fraction); \ } -#ifdef HAS_ARGBINTERPOLATEROW_SSSE3 -NANY(ARGBInterpolateRow_Any_SSSE3, ARGBInterpolateRow_Unaligned_SSSE3, - ARGBInterpolateRow_C, 4, 4, 3) +#ifdef HAS_INTERPOLATEROW_SSSE3 +NANY(InterpolateRow_Any_SSSE3, InterpolateRow_Unaligned_SSSE3, + InterpolateRow_C, 1, 1, 15) #endif -#ifdef HAS_ARGBINTERPOLATEROW_SSE2 -NANY(ARGBInterpolateRow_Any_SSE2, ARGBInterpolateRow_Unaligned_SSE2, - ARGBInterpolateRow_C, 4, 4, 3) +#ifdef HAS_INTERPOLATEROW_SSE2 +NANY(InterpolateRow_Any_SSE2, InterpolateRow_Unaligned_SSE2, + InterpolateRow_C, 1, 1, 15) #endif -#ifdef HAS_ARGBINTERPOLATEROW_NEON -NANY(ARGBInterpolateRow_Any_NEON, ARGBInterpolateRow_NEON, - ARGBInterpolateRow_C, 4, 4, 3) +#ifdef HAS_INTERPOLATEROW_NEON +NANY(InterpolateRow_Any_NEON, InterpolateRow_NEON, + InterpolateRow_C, 1, 1, 15) +#endif +#ifdef HAS_INTERPOLATEROWS_MIPS_DSPR2 +NANY(InterpolateRows_Any_MIPS_DSPR2, InterpolateRows_MIPS_DSPR2, + InterpolateRow_C, 1, 1, 3) #endif #undef NANY diff --git a/source/row_common.cc b/source/row_common.cc index 43f79a329..badea4405 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -1775,9 +1775,9 @@ void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride, } // C version 2x2 -> 2x1.
-void ARGBInterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride, - int width, int source_y_fraction) { +void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, + int width, int source_y_fraction) { int y1_fraction = source_y_fraction; int y0_fraction = 256 - y1_fraction; const uint8* src_ptr1 = src_ptr + src_stride; @@ -1785,21 +1785,12 @@ void ARGBInterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr, for (int x = 0; x < width - 1; x += 2) { dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8; dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8; - dst_ptr[2] = (src_ptr[2] * y0_fraction + src_ptr1[2] * y1_fraction) >> 8; - dst_ptr[3] = (src_ptr[3] * y0_fraction + src_ptr1[3] * y1_fraction) >> 8; - dst_ptr[4] = (src_ptr[4] * y0_fraction + src_ptr1[4] * y1_fraction) >> 8; - dst_ptr[5] = (src_ptr[5] * y0_fraction + src_ptr1[5] * y1_fraction) >> 8; - dst_ptr[6] = (src_ptr[6] * y0_fraction + src_ptr1[6] * y1_fraction) >> 8; - dst_ptr[7] = (src_ptr[7] * y0_fraction + src_ptr1[7] * y1_fraction) >> 8; - src_ptr += 8; - src_ptr1 += 8; - dst_ptr += 8; + src_ptr += 2; + src_ptr1 += 2; + dst_ptr += 2; } if (width & 1) { dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8; - dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8; - dst_ptr[2] = (src_ptr[2] * y0_fraction + src_ptr1[2] * y1_fraction) >> 8; - dst_ptr[3] = (src_ptr[3] * y0_fraction + src_ptr1[3] * y1_fraction) >> 8; } } diff --git a/source/row_mips.cc b/source/row_mips.cc index c4f1e773b..69677aa2d 100644 --- a/source/row_mips.cc +++ b/source/row_mips.cc @@ -909,6 +909,63 @@ void I422ToBGRARow_MIPS_DSPR2(const uint8* y_buf, "s4", "s5", "s6" ); } + +// Bilinear filter 8x2 -> 8x1 +void InterpolateRows_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) { + int y0_fraction = 256 - source_y_fraction; + const uint8* src_ptr1 = src_ptr + src_stride; + + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + + "replv.ph $t0, %[y0_fraction] \n" + "replv.ph $t1, %[source_y_fraction] \n" + "1: \n" + "lw $t2, 0(%[src_ptr]) \n" + "lw $t3, 0(%[src_ptr1]) \n" + "lw $t4, 4(%[src_ptr]) \n" + "lw $t5, 4(%[src_ptr1]) \n" + "muleu_s.ph.qbl $t6, $t2, $t0 \n" + "muleu_s.ph.qbr $t7, $t2, $t0 \n" + "muleu_s.ph.qbl $t8, $t3, $t1 \n" + "muleu_s.ph.qbr $t9, $t3, $t1 \n" + "muleu_s.ph.qbl $t2, $t4, $t0 \n" + "muleu_s.ph.qbr $t3, $t4, $t0 \n" + "muleu_s.ph.qbl $t4, $t5, $t1 \n" + "muleu_s.ph.qbr $t5, $t5, $t1 \n" + "addq.ph $t6, $t6, $t8 \n" + "addq.ph $t7, $t7, $t9 \n" + "addq.ph $t2, $t2, $t4 \n" + "addq.ph $t3, $t3, $t5 \n" + "shra.ph $t6, $t6, 8 \n" + "shra.ph $t7, $t7, 8 \n" + "shra.ph $t2, $t2, 8 \n" + "shra.ph $t3, $t3, 8 \n" + "precr.qb.ph $t6, $t6, $t7 \n" + "precr.qb.ph $t2, $t2, $t3 \n" + "addiu %[src_ptr], %[src_ptr], 8 \n" + "addiu %[src_ptr1], %[src_ptr1], 8 \n" + "addiu %[dst_width], %[dst_width], -8 \n" + "sw $t6, 0(%[dst_ptr]) \n" + "sw $t2, 4(%[dst_ptr]) \n" + "bgtz %[dst_width], 1b \n" + " addiu %[dst_ptr], %[dst_ptr], 8 \n" + + ".set pop \n" + : [dst_ptr] "+r" (dst_ptr), + [src_ptr1] "+r" (src_ptr1), + [src_ptr] "+r" (src_ptr), + [dst_width] "+r" (dst_width) + : [source_y_fraction] "r" (source_y_fraction), + [y0_fraction] "r" (y0_fraction), + [src_stride] "r" (src_stride) + : "t0", "t1", "t2", "t3", "t4", "t5", + "t6", "t7", "t8", "t9" + ); +} #endif // __mips_dsp_rev >= 2 #ifdef __cplusplus diff --git a/source/row_neon.cc b/source/row_neon.cc 
index 82587a334..53da16afa 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -2161,11 +2161,10 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) { ); } -// 4x2 -> 4x1 -// Same as ScaleARGBFilterRows_NEON but with last pixel not duplicated. -void ARGBInterpolateRow_NEON(uint8* dst_ptr, - const uint8* src_ptr, ptrdiff_t src_stride, - int dst_width, int source_y_fraction) { +// Bilinear filter 16x2 -> 16x1 +void InterpolateRow_NEON(uint8* dst_ptr, + const uint8* src_ptr, ptrdiff_t src_stride, + int dst_width, int source_y_fraction) { asm volatile ( "cmp %4, #0 \n" "beq 100f \n" @@ -2184,7 +2183,7 @@ void ARGBInterpolateRow_NEON(uint8* dst_ptr, "1: \n" "vld1.u8 {q0}, [%1]! \n" "vld1.u8 {q1}, [%2]! \n" - "subs %3, #4 \n" + "subs %3, %3, #16 \n" "vmull.u8 q13, d0, d4 \n" "vmull.u8 q14, d1, d4 \n" "vmlal.u8 q13, d2, d5 \n" @@ -2199,7 +2198,7 @@ void ARGBInterpolateRow_NEON(uint8* dst_ptr, "25: \n" "vld1.u8 {q0}, [%1]! \n" "vld1.u8 {q1}, [%2]! \n" - "subs %3, #4 \n" + "subs %3, %3, #16 \n" "vrhadd.u8 q0, q1 \n" "vrhadd.u8 q0, q1 \n" "vst1.u8 {q0}, [%0]! \n" @@ -2210,7 +2209,7 @@ void ARGBInterpolateRow_NEON(uint8* dst_ptr, "50: \n" "vld1.u8 {q0}, [%1]! \n" "vld1.u8 {q1}, [%2]! \n" - "subs %3, #4 \n" + "subs %3, %3, #16 \n" "vrhadd.u8 q0, q1 \n" "vst1.u8 {q0}, [%0]! \n" "bgt 50b \n" @@ -2220,7 +2219,7 @@ void ARGBInterpolateRow_NEON(uint8* dst_ptr, "75: \n" "vld1.u8 {q1}, [%1]! \n" "vld1.u8 {q0}, [%2]! \n" - "subs %3, #4 \n" + "subs %3, %3, #16 \n" "vrhadd.u8 q0, q1 \n" "vrhadd.u8 q0, q1 \n" "vst1.u8 {q0}, [%0]! \n" @@ -2230,7 +2229,7 @@ void ARGBInterpolateRow_NEON(uint8* dst_ptr, // Blend 100 / 0 - Copy row unchanged. "100: \n" "vld1.u8 {q0}, [%1]! \n" - "subs %3, #4 \n" + "subs %3, %3, #16 \n" "vst1.u8 {q0}, [%0]! \n" "bgt 100b \n" diff --git a/source/row_posix.cc b/source/row_posix.cc index 6b6c88605..760b9a984 100644 --- a/source/row_posix.cc +++ b/source/row_posix.cc @@ -4781,7 +4781,7 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, : "+r"(src_argb), // %0 "+r"(src_argb_stride_temp), // %1 "+r"(dst_argb), // %2 - "+r"(src_dudv), // %3 + "+r"(src_dudv), // %3 "+rm"(width), // %4 "+r"(temp) // %5 : @@ -4793,11 +4793,10 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, } #endif // HAS_ARGBAFFINEROW_SSE2 -// Bilinear image filtering. -// Same as ScaleARGBFilterRows_SSSE3 but without last pixel duplicated. 
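The 25/50/75 fast paths in the NEON kernel above (and the x86 kernels below) are built from rounding-average instructions, vrhadd.u8 on NEON and pavgb on SSE2/SSSE3, where avg(a, b) = (a + b + 1) >> 1. One average is the 50/50 blend; a second average against the same row folds in another copy of it. A scalar check of the 75/25 branch, as a sketch (fraction 64/256 keeps 75% of row 0):

  static uint8 AvgRound(uint8 a, uint8 b) {
    return (uint8)((a + b + 1) >> 1);
  }
  // xloop75 loads row1 into the destination register, then averages
  // against row0 twice: avg(avg(row1, row0), row0), which is roughly
  // (3 * row0 + row1 + 2) / 4, a 75/25 blend with rounding.
  uint8 blend75 = AvgRound(AvgRound(row1_px, row0_px), row0_px);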
-void ARGBInterpolateRow_SSSE3(uint8* dst_argb, const uint8* src_argb, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) { +// Bilinear filter 16x2 -> 16x1 +void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) { asm volatile ( "sub %1,%0 \n" "shr %3 \n" @@ -4831,7 +4830,7 @@ void ARGBInterpolateRow_SSSE3(uint8* dst_argb, const uint8* src_argb, "psrlw $0x7,%%xmm0 \n" "psrlw $0x7,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" - "sub $0x4,%2 \n" + "sub $0x10,%2 \n" "movdqa %%xmm0,(%1,%0,1) \n" "lea 0x10(%1),%1 \n" "jg 1b \n" @@ -4844,7 +4843,7 @@ void ARGBInterpolateRow_SSSE3(uint8* dst_argb, const uint8* src_argb, "movdqa (%1,%4,1),%%xmm1 \n" "pavgb %%xmm1,%%xmm0 \n" "pavgb %%xmm1,%%xmm0 \n" - "sub $0x4,%2 \n" + "sub $0x10,%2 \n" "movdqa %%xmm0,(%1,%0,1) \n" "lea 0x10(%1),%1 \n" "jg 25b \n" @@ -4856,7 +4855,7 @@ void ARGBInterpolateRow_SSSE3(uint8* dst_argb, const uint8* src_argb, "movdqa (%1),%%xmm0 \n" "movdqa (%1,%4,1),%%xmm1 \n" "pavgb %%xmm1,%%xmm0 \n" - "sub $0x4,%2 \n" + "sub $0x10,%2 \n" "movdqa %%xmm0,(%1,%0,1) \n" "lea 0x10(%1),%1 \n" "jg 50b \n" @@ -4869,7 +4868,7 @@ void ARGBInterpolateRow_SSSE3(uint8* dst_argb, const uint8* src_argb, "movdqa (%1,%4,1),%%xmm0 \n" "pavgb %%xmm1,%%xmm0 \n" "pavgb %%xmm1,%%xmm0 \n" - "sub $0x4,%2 \n" + "sub $0x10,%2 \n" "movdqa %%xmm0,(%1,%0,1) \n" "lea 0x10(%1),%1 \n" "jg 75b \n" @@ -4879,14 +4878,14 @@ void ARGBInterpolateRow_SSSE3(uint8* dst_argb, const uint8* src_argb, ".p2align 4 \n" "100: \n" "movdqa (%1),%%xmm0 \n" - "sub $0x4,%2 \n" + "sub $0x10,%2 \n" "movdqa %%xmm0,(%1,%0,1) \n" "lea 0x10(%1),%1 \n" "jg 100b \n" "99: \n" - : "+r"(dst_argb), // %0 - "+r"(src_argb), // %1 + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 "+r"(dst_width), // %2 "+r"(source_y_fraction) // %3 : "r"(static_cast(src_stride)) // %4 @@ -4897,11 +4896,10 @@ void ARGBInterpolateRow_SSSE3(uint8* dst_argb, const uint8* src_argb, ); } -// Bilinear image filtering. -// Same as ScaleARGBFilterRows_SSSE3 but without last pixel duplicated. 
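In the SSSE3 general-purpose loop above, the fraction is halved up front (shr %3), xmm5 is filled with interleaved byte pairs (128 - f/2, f/2), and pmaddubsw multiplies interleaved row0/row1 bytes by that pair and sums each pair in one instruction; psrlw by 7 then divides by 128. The per-byte arithmetic, as a scalar sketch:

  int hi = source_y_fraction >> 1;  // weight of row 1, 0..127
  int lo = 128 - hi;                // weight of row 0; lo + hi == 128
  dst[i] = (uint8)((row0[i] * lo + row1[i] * hi) >> 7);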
-void ARGBInterpolateRow_SSE2(uint8* dst_argb, const uint8* src_argb, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) { +// Bilinear filter 16x2 -> 16x1 +void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) { asm volatile ( "sub %1,%0 \n" "shr %3 \n" @@ -4943,7 +4941,7 @@ void ARGBInterpolateRow_SSE2(uint8* dst_argb, const uint8* src_argb, "paddw %%xmm2,%%xmm0 \n" "paddw %%xmm3,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" - "sub $0x4,%2 \n" + "sub $0x10,%2 \n" "movdqa %%xmm0,(%1,%0,1) \n" "lea 0x10(%1),%1 \n" "jg 1b \n" @@ -4956,7 +4954,7 @@ void ARGBInterpolateRow_SSE2(uint8* dst_argb, const uint8* src_argb, "movdqa (%1,%4,1),%%xmm1 \n" "pavgb %%xmm1,%%xmm0 \n" "pavgb %%xmm1,%%xmm0 \n" - "sub $0x4,%2 \n" + "sub $0x10,%2 \n" "movdqa %%xmm0,(%1,%0,1) \n" "lea 0x10(%1),%1 \n" "jg 25b \n" @@ -4968,7 +4966,7 @@ void ARGBInterpolateRow_SSE2(uint8* dst_argb, const uint8* src_argb, "movdqa (%1),%%xmm0 \n" "movdqa (%1,%4,1),%%xmm1 \n" "pavgb %%xmm1,%%xmm0 \n" - "sub $0x4,%2 \n" + "sub $0x10,%2 \n" "movdqa %%xmm0,(%1,%0,1) \n" "lea 0x10(%1),%1 \n" "jg 50b \n" @@ -4981,7 +4979,7 @@ void ARGBInterpolateRow_SSE2(uint8* dst_argb, const uint8* src_argb, "movdqa (%1,%4,1),%%xmm0 \n" "pavgb %%xmm1,%%xmm0 \n" "pavgb %%xmm1,%%xmm0 \n" - "sub $0x4,%2 \n" + "sub $0x10,%2 \n" "movdqa %%xmm0,(%1,%0,1) \n" "lea 0x10(%1),%1 \n" "jg 75b \n" @@ -4991,14 +4989,14 @@ void ARGBInterpolateRow_SSE2(uint8* dst_argb, const uint8* src_argb, ".p2align 4 \n" "100: \n" "movdqa (%1),%%xmm0 \n" - "sub $0x4,%2 \n" + "sub $0x10,%2 \n" "movdqa %%xmm0,(%1,%0,1) \n" "lea 0x10(%1),%1 \n" "jg 100b \n" "99: \n" - : "+r"(dst_argb), // %0 - "+r"(src_argb), // %1 + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 "+r"(dst_width), // %2 "+r"(source_y_fraction) // %3 : "r"(static_cast(src_stride)) // %4 @@ -5009,11 +5007,10 @@ void ARGBInterpolateRow_SSE2(uint8* dst_argb, const uint8* src_argb, ); } -// Bilinear image filtering. -// Same as ScaleARGBFilterRows_SSSE3 but without last pixel duplicated. 
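The SSE2 loop above cannot use pmaddubsw, so it keeps the full 0..255 fraction and rewrites the blend to need one multiply per lane: f * row1 + (256 - f) * row0 == 256 * row0 + f * (row1 - row0). The 9-bit signed difference is doubled (paddw) and multiplied with pmulhw against a fraction scaled to approximately f * 128, so the >> 16 built into pmulhw leaves (diff * f) >> 8, up to rounding. The same arithmetic in scalar form, as a sketch:

  int diff = row1[i] - row0[i];        // -255..255 fits 9 bits signed
  int f = source_y_fraction;           // 0..255
  dst[i] = (uint8)(row0[i] + ((diff * f) >> 8));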
-void ARGBInterpolateRow_Unaligned_SSSE3(uint8* dst_argb, const uint8* src_argb, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) { +// Bilinear filter 16x2 -> 16x1 +void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) { asm volatile ( "sub %1,%0 \n" "shr %3 \n" @@ -5047,7 +5044,7 @@ void ARGBInterpolateRow_Unaligned_SSSE3(uint8* dst_argb, const uint8* src_argb, "psrlw $0x7,%%xmm0 \n" "psrlw $0x7,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" - "sub $0x4,%2 \n" + "sub $0x10,%2 \n" "movdqu %%xmm0,(%1,%0,1) \n" "lea 0x10(%1),%1 \n" "jg 1b \n" @@ -5060,7 +5057,7 @@ void ARGBInterpolateRow_Unaligned_SSSE3(uint8* dst_argb, const uint8* src_argb, "movdqu (%1,%4,1),%%xmm1 \n" "pavgb %%xmm1,%%xmm0 \n" "pavgb %%xmm1,%%xmm0 \n" - "sub $0x4,%2 \n" + "sub $0x10,%2 \n" "movdqu %%xmm0,(%1,%0,1) \n" "lea 0x10(%1),%1 \n" "jg 25b \n" @@ -5072,7 +5069,7 @@ void ARGBInterpolateRow_Unaligned_SSSE3(uint8* dst_argb, const uint8* src_argb, "movdqu (%1),%%xmm0 \n" "movdqu (%1,%4,1),%%xmm1 \n" "pavgb %%xmm1,%%xmm0 \n" - "sub $0x4,%2 \n" + "sub $0x10,%2 \n" "movdqu %%xmm0,(%1,%0,1) \n" "lea 0x10(%1),%1 \n" "jg 50b \n" @@ -5085,7 +5082,7 @@ void ARGBInterpolateRow_Unaligned_SSSE3(uint8* dst_argb, const uint8* src_argb, "movdqu (%1,%4,1),%%xmm0 \n" "pavgb %%xmm1,%%xmm0 \n" "pavgb %%xmm1,%%xmm0 \n" - "sub $0x4,%2 \n" + "sub $0x10,%2 \n" "movdqu %%xmm0,(%1,%0,1) \n" "lea 0x10(%1),%1 \n" "jg 75b \n" @@ -5095,14 +5092,14 @@ void ARGBInterpolateRow_Unaligned_SSSE3(uint8* dst_argb, const uint8* src_argb, ".p2align 4 \n" "100: \n" "movdqu (%1),%%xmm0 \n" - "sub $0x4,%2 \n" + "sub $0x10,%2 \n" "movdqu %%xmm0,(%1,%0,1) \n" "lea 0x10(%1),%1 \n" "jg 100b \n" "99: \n" - : "+r"(dst_argb), // %0 - "+r"(src_argb), // %1 + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 "+r"(dst_width), // %2 "+r"(source_y_fraction) // %3 : "r"(static_cast(src_stride)) // %4 @@ -5113,11 +5110,10 @@ void ARGBInterpolateRow_Unaligned_SSSE3(uint8* dst_argb, const uint8* src_argb, ); } -// Bilinear image filtering. -// Same as ScaleARGBFilterRows_SSSE3 but without last pixel duplicated. 
-void ARGBInterpolateRow_Unaligned_SSE2(uint8* dst_argb, const uint8* src_argb, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) { +// Bilinear filter 16x2 -> 16x1 +void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) { asm volatile ( "sub %1,%0 \n" "shr %3 \n" @@ -5159,7 +5155,7 @@ void ARGBInterpolateRow_Unaligned_SSE2(uint8* dst_argb, const uint8* src_argb, "paddw %%xmm2,%%xmm0 \n" "paddw %%xmm3,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" - "sub $0x4,%2 \n" + "sub $0x10,%2 \n" "movdqu %%xmm0,(%1,%0,1) \n" "lea 0x10(%1),%1 \n" "jg 1b \n" @@ -5172,7 +5168,7 @@ void ARGBInterpolateRow_Unaligned_SSE2(uint8* dst_argb, const uint8* src_argb, "movdqu (%1,%4,1),%%xmm1 \n" "pavgb %%xmm1,%%xmm0 \n" "pavgb %%xmm1,%%xmm0 \n" - "sub $0x4,%2 \n" + "sub $0x10,%2 \n" "movdqu %%xmm0,(%1,%0,1) \n" "lea 0x10(%1),%1 \n" "jg 25b \n" @@ -5184,7 +5180,7 @@ void ARGBInterpolateRow_Unaligned_SSE2(uint8* dst_argb, const uint8* src_argb, "movdqu (%1),%%xmm0 \n" "movdqu (%1,%4,1),%%xmm1 \n" "pavgb %%xmm1,%%xmm0 \n" - "sub $0x4,%2 \n" + "sub $0x10,%2 \n" "movdqu %%xmm0,(%1,%0,1) \n" "lea 0x10(%1),%1 \n" "jg 50b \n" @@ -5197,7 +5193,7 @@ void ARGBInterpolateRow_Unaligned_SSE2(uint8* dst_argb, const uint8* src_argb, "movdqu (%1,%4,1),%%xmm0 \n" "pavgb %%xmm1,%%xmm0 \n" "pavgb %%xmm1,%%xmm0 \n" - "sub $0x4,%2 \n" + "sub $0x10,%2 \n" "movdqu %%xmm0,(%1,%0,1) \n" "lea 0x10(%1),%1 \n" "jg 75b \n" @@ -5207,14 +5203,14 @@ void ARGBInterpolateRow_Unaligned_SSE2(uint8* dst_argb, const uint8* src_argb, ".p2align 4 \n" "100: \n" "movdqu (%1),%%xmm0 \n" - "sub $0x4,%2 \n" + "sub $0x10,%2 \n" "movdqu %%xmm0,(%1,%0,1) \n" "lea 0x10(%1),%1 \n" "jg 100b \n" "99: \n" - : "+r"(dst_argb), // %0 - "+r"(src_argb), // %1 + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 "+r"(dst_width), // %2 "+r"(source_y_fraction) // %3 : "r"(static_cast(src_stride)) // %4 diff --git a/source/row_win.cc b/source/row_win.cc index 59a58d726..0ecd6cf49 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -5923,17 +5923,16 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, } #endif // HAS_ARGBAFFINEROW_SSE2 -// Bilinear image filtering. -// Same as ScaleARGBFilterRows_SSSE3 but without last pixel duplicated. 
+// Bilinear filter 16x2 -> 16x1 __declspec(naked) __declspec(align(16)) -void ARGBInterpolateRow_SSSE3(uint8* dst_argb, const uint8* src_argb, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) { +void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) { __asm { push esi push edi - mov edi, [esp + 8 + 4] // dst_argb - mov esi, [esp + 8 + 8] // src_argb + mov edi, [esp + 8 + 4] // dst_ptr + mov esi, [esp + 8 + 8] // src_ptr mov edx, [esp + 8 + 12] // src_stride mov ecx, [esp + 8 + 16] // dst_width mov eax, [esp + 8 + 20] // source_y_fraction (0..255) @@ -5969,7 +5968,7 @@ void ARGBInterpolateRow_SSSE3(uint8* dst_argb, const uint8* src_argb, psrlw xmm0, 7 psrlw xmm1, 7 packuswb xmm0, xmm1 - sub ecx, 4 + sub ecx, 16 movdqa [esi + edi], xmm0 lea esi, [esi + 16] jg xloop @@ -5982,7 +5981,7 @@ void ARGBInterpolateRow_SSSE3(uint8* dst_argb, const uint8* src_argb, movdqa xmm1, [esi + edx] pavgb xmm0, xmm1 pavgb xmm0, xmm1 - sub ecx, 4 + sub ecx, 16 movdqa [esi + edi], xmm0 lea esi, [esi + 16] jg xloop25 @@ -5994,7 +5993,7 @@ void ARGBInterpolateRow_SSSE3(uint8* dst_argb, const uint8* src_argb, movdqa xmm0, [esi] movdqa xmm1, [esi + edx] pavgb xmm0, xmm1 - sub ecx, 4 + sub ecx, 16 movdqa [esi + edi], xmm0 lea esi, [esi + 16] jg xloop50 @@ -6007,7 +6006,7 @@ void ARGBInterpolateRow_SSSE3(uint8* dst_argb, const uint8* src_argb, movdqa xmm0, [esi + edx] pavgb xmm0, xmm1 pavgb xmm0, xmm1 - sub ecx, 4 + sub ecx, 16 movdqa [esi + edi], xmm0 lea esi, [esi + 16] jg xloop75 @@ -6017,7 +6016,7 @@ void ARGBInterpolateRow_SSSE3(uint8* dst_argb, const uint8* src_argb, align 16 xloop100: movdqa xmm0, [esi] - sub ecx, 4 + sub ecx, 16 movdqa [esi + edi], xmm0 lea esi, [esi + 16] jg xloop100 @@ -6029,17 +6028,16 @@ void ARGBInterpolateRow_SSSE3(uint8* dst_argb, const uint8* src_argb, } } -// Bilinear image filtering. -// Same as ScaleARGBFilterRows_SSE2 but without last pixel duplicated. 
+// Bilinear filter 16x2 -> 16x1 __declspec(naked) __declspec(align(16)) -void ARGBInterpolateRow_SSE2(uint8* dst_argb, const uint8* src_argb, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) { +void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) { __asm { push esi push edi - mov edi, [esp + 8 + 4] // dst_argb - mov esi, [esp + 8 + 8] // src_argb + mov edi, [esp + 8 + 4] // dst_ptr + mov esi, [esp + 8 + 8] // src_ptr mov edx, [esp + 8 + 12] // src_stride mov ecx, [esp + 8 + 16] // dst_width mov eax, [esp + 8 + 20] // source_y_fraction (0..255) @@ -6081,7 +6079,7 @@ void ARGBInterpolateRow_SSE2(uint8* dst_argb, const uint8* src_argb, paddw xmm0, xmm2 // sum rows paddw xmm1, xmm3 packuswb xmm0, xmm1 - sub ecx, 4 + sub ecx, 16 movdqa [esi + edi], xmm0 lea esi, [esi + 16] jg xloop @@ -6094,7 +6092,7 @@ void ARGBInterpolateRow_SSE2(uint8* dst_argb, const uint8* src_argb, movdqa xmm1, [esi + edx] pavgb xmm0, xmm1 pavgb xmm0, xmm1 - sub ecx, 4 + sub ecx, 16 movdqa [esi + edi], xmm0 lea esi, [esi + 16] jg xloop25 @@ -6106,7 +6104,7 @@ void ARGBInterpolateRow_SSE2(uint8* dst_argb, const uint8* src_argb, movdqa xmm0, [esi] movdqa xmm1, [esi + edx] pavgb xmm0, xmm1 - sub ecx, 4 + sub ecx, 16 movdqa [esi + edi], xmm0 lea esi, [esi + 16] jg xloop50 @@ -6119,7 +6117,7 @@ void ARGBInterpolateRow_SSE2(uint8* dst_argb, const uint8* src_argb, movdqa xmm0, [esi + edx] pavgb xmm0, xmm1 pavgb xmm0, xmm1 - sub ecx, 4 + sub ecx, 16 movdqa [esi + edi], xmm0 lea esi, [esi + 16] jg xloop75 @@ -6129,7 +6127,7 @@ void ARGBInterpolateRow_SSE2(uint8* dst_argb, const uint8* src_argb, align 16 xloop100: movdqa xmm0, [esi] - sub ecx, 4 + sub ecx, 16 movdqa [esi + edi], xmm0 lea esi, [esi + 16] jg xloop100 @@ -6141,17 +6139,16 @@ void ARGBInterpolateRow_SSE2(uint8* dst_argb, const uint8* src_argb, } } -// Bilinear image filtering. -// Same as ScaleARGBFilterRows_SSSE3 but without last pixel duplicated. 
+// Bilinear filter 16x2 -> 16x1 __declspec(naked) __declspec(align(16)) -void ARGBInterpolateRow_Unaligned_SSSE3(uint8* dst_argb, const uint8* src_argb, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) { +void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) { __asm { push esi push edi - mov edi, [esp + 8 + 4] // dst_argb - mov esi, [esp + 8 + 8] // src_argb + mov edi, [esp + 8 + 4] // dst_ptr + mov esi, [esp + 8 + 8] // src_ptr mov edx, [esp + 8 + 12] // src_stride mov ecx, [esp + 8 + 16] // dst_width mov eax, [esp + 8 + 20] // source_y_fraction (0..255) @@ -6187,7 +6184,7 @@ void ARGBInterpolateRow_Unaligned_SSSE3(uint8* dst_argb, const uint8* src_argb, psrlw xmm0, 7 psrlw xmm1, 7 packuswb xmm0, xmm1 - sub ecx, 4 + sub ecx, 16 movdqu [esi + edi], xmm0 lea esi, [esi + 16] jg xloop @@ -6200,7 +6197,7 @@ void ARGBInterpolateRow_Unaligned_SSSE3(uint8* dst_argb, const uint8* src_argb, movdqu xmm1, [esi + edx] pavgb xmm0, xmm1 pavgb xmm0, xmm1 - sub ecx, 4 + sub ecx, 16 movdqu [esi + edi], xmm0 lea esi, [esi + 16] jg xloop25 @@ -6212,7 +6209,7 @@ void ARGBInterpolateRow_Unaligned_SSSE3(uint8* dst_argb, const uint8* src_argb, movdqu xmm0, [esi] movdqu xmm1, [esi + edx] pavgb xmm0, xmm1 - sub ecx, 4 + sub ecx, 16 movdqu [esi + edi], xmm0 lea esi, [esi + 16] jg xloop50 @@ -6225,7 +6222,7 @@ void ARGBInterpolateRow_Unaligned_SSSE3(uint8* dst_argb, const uint8* src_argb, movdqu xmm0, [esi + edx] pavgb xmm0, xmm1 pavgb xmm0, xmm1 - sub ecx, 4 + sub ecx, 16 movdqu [esi + edi], xmm0 lea esi, [esi + 16] jg xloop75 @@ -6235,7 +6232,7 @@ void ARGBInterpolateRow_Unaligned_SSSE3(uint8* dst_argb, const uint8* src_argb, align 16 xloop100: movdqu xmm0, [esi] - sub ecx, 4 + sub ecx, 16 movdqu [esi + edi], xmm0 lea esi, [esi + 16] jg xloop100 @@ -6247,17 +6244,16 @@ void ARGBInterpolateRow_Unaligned_SSSE3(uint8* dst_argb, const uint8* src_argb, } } -// Bilinear image filtering. -// Same as ScaleARGBFilterRows_SSE2 but without last pixel duplicated. 
+// Bilinear filter 16x2 -> 16x1 __declspec(naked) __declspec(align(16)) -void ARGBInterpolateRow_Unaligned_SSE2(uint8* dst_argb, const uint8* src_argb, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) { +void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) { __asm { push esi push edi - mov edi, [esp + 8 + 4] // dst_argb - mov esi, [esp + 8 + 8] // src_argb + mov edi, [esp + 8 + 4] // dst_ptr + mov esi, [esp + 8 + 8] // src_ptr mov edx, [esp + 8 + 12] // src_stride mov ecx, [esp + 8 + 16] // dst_width mov eax, [esp + 8 + 20] // source_y_fraction (0..255) @@ -6299,7 +6295,7 @@ void ARGBInterpolateRow_Unaligned_SSE2(uint8* dst_argb, const uint8* src_argb, paddw xmm0, xmm2 // sum rows paddw xmm1, xmm3 packuswb xmm0, xmm1 - sub ecx, 4 + sub ecx, 16 movdqu [esi + edi], xmm0 lea esi, [esi + 16] jg xloop @@ -6312,7 +6308,7 @@ void ARGBInterpolateRow_Unaligned_SSE2(uint8* dst_argb, const uint8* src_argb, movdqu xmm1, [esi + edx] pavgb xmm0, xmm1 pavgb xmm0, xmm1 - sub ecx, 4 + sub ecx, 16 movdqu [esi + edi], xmm0 lea esi, [esi + 16] jg xloop25 @@ -6324,7 +6320,7 @@ void ARGBInterpolateRow_Unaligned_SSE2(uint8* dst_argb, const uint8* src_argb, movdqu xmm0, [esi] movdqu xmm1, [esi + edx] pavgb xmm0, xmm1 - sub ecx, 4 + sub ecx, 16 movdqu [esi + edi], xmm0 lea esi, [esi + 16] jg xloop50 @@ -6337,7 +6333,7 @@ void ARGBInterpolateRow_Unaligned_SSE2(uint8* dst_argb, const uint8* src_argb, movdqu xmm0, [esi + edx] pavgb xmm0, xmm1 pavgb xmm0, xmm1 - sub ecx, 4 + sub ecx, 16 movdqu [esi + edi], xmm0 lea esi, [esi + 16] jg xloop75 @@ -6347,7 +6343,7 @@ void ARGBInterpolateRow_Unaligned_SSE2(uint8* dst_argb, const uint8* src_argb, align 16 xloop100: movdqu xmm0, [esi] - sub ecx, 4 + sub ecx, 16 movdqu [esi + edi], xmm0 lea esi, [esi + 16] jg xloop100 diff --git a/source/scale.cc b/source/scale.cc index 4189d3dcd..7641b07df 100644 --- a/source/scale.cc +++ b/source/scale.cc @@ -91,11 +91,6 @@ void ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width); -// 16x2 -> 16x1 -#define HAS_SCALEFILTERROWS_NEON -void ScaleFilterRows_NEON(uint8* dst_ptr, - const uint8* src_ptr, ptrdiff_t src_stride, - int dst_width, int source_y_fraction); // SSE2 downscalers with interpolation. // Constants for SSSE3 code @@ -809,350 +804,6 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, } } -// Bilinear row filtering combines 16x2 -> 16x1. SSE2 version. -// Normal formula for bilinear interpolation is: -// source_y_fraction * row1 + (1 - source_y_fraction) row0 -// SSE2 version using the a single multiply of difference: -// source_y_fraction * (row1 - row0) + row0 -// TODO(fbarchard): Specialize same as SSSE3. - -#define HAS_SCALEFILTERROWS_SSE2 -__declspec(naked) __declspec(align(16)) -static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) { - __asm { - push esi - push edi - mov edi, [esp + 8 + 4] // dst_ptr - mov esi, [esp + 8 + 8] // src_ptr - mov edx, [esp + 8 + 12] // src_stride - mov ecx, [esp + 8 + 16] // dst_width - mov eax, [esp + 8 + 20] // source_y_fraction (0..255) - sub edi, esi - // Dispatch to specialized filters if applicable. - cmp eax, 0 - je xloop100 // 0 / 256. Blend 100 / 0. - cmp eax, 64 - je xloop75 // 64 / 256 is 0.25. Blend 75 / 25. - cmp eax, 128 - je xloop50 // 128 / 256 is 0.50. Blend 50 / 50. 
- cmp eax, 192 - je xloop25 // 192 / 256 is 0.75. Blend 25 / 75. - - movd xmm5, eax // xmm5 = y fraction - punpcklbw xmm5, xmm5 - psrlw xmm5, 1 - punpcklwd xmm5, xmm5 - punpckldq xmm5, xmm5 - punpcklqdq xmm5, xmm5 - pxor xmm4, xmm4 - - align 16 - xloop: - movdqa xmm0, [esi] // row0 - movdqa xmm2, [esi + edx] // row1 - movdqa xmm1, xmm0 - movdqa xmm3, xmm2 - punpcklbw xmm2, xmm4 - punpckhbw xmm3, xmm4 - punpcklbw xmm0, xmm4 - punpckhbw xmm1, xmm4 - psubw xmm2, xmm0 // row1 - row0 - psubw xmm3, xmm1 - paddw xmm2, xmm2 // 9 bits * 15 bits = 8.16 - paddw xmm3, xmm3 - pmulhw xmm2, xmm5 // scale diff - pmulhw xmm3, xmm5 - paddw xmm0, xmm2 // sum rows - paddw xmm1, xmm3 - packuswb xmm0, xmm1 - sub ecx, 16 - movdqa [esi + edi], xmm0 - lea esi, [esi + 16] - jg xloop - jmp xloop99 - - // Blend 25 / 75. - align 16 - xloop25: - movdqa xmm0, [esi] - movdqa xmm1, [esi + edx] - pavgb xmm0, xmm1 - pavgb xmm0, xmm1 - sub ecx, 16 - movdqa [esi + edi], xmm0 - lea esi, [esi + 16] - jg xloop25 - jmp xloop99 - - // Blend 50 / 50. - align 16 - xloop50: - movdqa xmm0, [esi] - movdqa xmm1, [esi + edx] - pavgb xmm0, xmm1 - sub ecx, 16 - movdqa [esi + edi], xmm0 - lea esi, [esi + 16] - jg xloop50 - jmp xloop99 - - // Blend 75 / 25. - align 16 - xloop75: - movdqa xmm1, [esi] - movdqa xmm0, [esi + edx] - pavgb xmm0, xmm1 - pavgb xmm0, xmm1 - sub ecx, 16 - movdqa [esi + edi], xmm0 - lea esi, [esi + 16] - jg xloop75 - jmp xloop99 - - // Blend 100 / 0 - Copy row unchanged. - align 16 - xloop100: - movdqa xmm0, [esi] - sub ecx, 16 - movdqa [esi + edi], xmm0 - lea esi, [esi + 16] - jg xloop100 - - // Extrude last pixel. - xloop99: - punpckhbw xmm0, xmm0 - pshufhw xmm0, xmm0, 0xff - punpckhqdq xmm0, xmm0 - movdqa [esi + edi], xmm0 - pop edi - pop esi - ret - } -} - -// Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version. -#define HAS_SCALEFILTERROWS_SSSE3 -__declspec(naked) __declspec(align(16)) -static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) { - __asm { - push esi - push edi - mov edi, [esp + 8 + 4] // dst_ptr - mov esi, [esp + 8 + 8] // src_ptr - mov edx, [esp + 8 + 12] // src_stride - mov ecx, [esp + 8 + 16] // dst_width - mov eax, [esp + 8 + 20] // source_y_fraction (0..255) - sub edi, esi - shr eax, 1 - // Dispatch to specialized filters if applicable. - cmp eax, 0 - je xloop100 // 0 / 128. Blend 100 / 0. - cmp eax, 32 - je xloop75 // 32 / 128 is 0.25. Blend 75 / 25. - cmp eax, 64 - je xloop50 // 64 / 128 is 0.50. Blend 50 / 50. - cmp eax, 96 - je xloop25 // 96 / 128 is 0.75. Blend 25 / 75. - - movd xmm0, eax // high fraction 1..127. - neg eax - add eax, 128 - movd xmm5, eax // low fraction 127..1. - punpcklbw xmm5, xmm0 - punpcklwd xmm5, xmm5 - pshufd xmm5, xmm5, 0 - - // General purpose row blend. - align 16 - xloop: - movdqa xmm0, [esi] - movdqa xmm2, [esi + edx] - movdqa xmm1, xmm0 - punpcklbw xmm0, xmm2 - punpckhbw xmm1, xmm2 - pmaddubsw xmm0, xmm5 - pmaddubsw xmm1, xmm5 - psrlw xmm0, 7 - psrlw xmm1, 7 - packuswb xmm0, xmm1 - sub ecx, 16 - movdqa [esi + edi], xmm0 - lea esi, [esi + 16] - jg xloop - jmp xloop99 - - // Blend 25 / 75. - align 16 - xloop25: - movdqa xmm0, [esi] - movdqa xmm1, [esi + edx] - pavgb xmm0, xmm1 - pavgb xmm0, xmm1 - sub ecx, 16 - movdqa [esi + edi], xmm0 - lea esi, [esi + 16] - jg xloop25 - jmp xloop99 - - // Blend 50 / 50. 
- align 16 - xloop50: - movdqa xmm0, [esi] - movdqa xmm1, [esi + edx] - pavgb xmm0, xmm1 - sub ecx, 16 - movdqa [esi + edi], xmm0 - lea esi, [esi + 16] - jg xloop50 - jmp xloop99 - - // Blend 75 / 25. - align 16 - xloop75: - movdqa xmm1, [esi] - movdqa xmm0, [esi + edx] - pavgb xmm0, xmm1 - pavgb xmm0, xmm1 - sub ecx, 16 - movdqa [esi + edi], xmm0 - lea esi, [esi + 16] - jg xloop75 - jmp xloop99 - - // Blend 100 / 0 - Copy row unchanged. - align 16 - xloop100: - movdqa xmm0, [esi] - sub ecx, 16 - movdqa [esi + edi], xmm0 - lea esi, [esi + 16] - jg xloop100 - - // Extrude last pixel. - xloop99: - punpckhbw xmm0, xmm0 - pshufhw xmm0, xmm0, 0xff - punpckhqdq xmm0, xmm0 - movdqa [esi + edi], xmm0 - pop edi - pop esi - ret - } -} - -__declspec(naked) __declspec(align(16)) -static void ScaleFilterRows_Unaligned_SSSE3(uint8* dst_ptr, - const uint8* src_ptr, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) { - __asm { - push esi - push edi - mov edi, [esp + 8 + 4] // dst_ptr - mov esi, [esp + 8 + 8] // src_ptr - mov edx, [esp + 8 + 12] // src_stride - mov ecx, [esp + 8 + 16] // dst_width - mov eax, [esp + 8 + 20] // source_y_fraction (0..255) - sub edi, esi - shr eax, 1 - cmp eax, 0 // dispatch to specialized filters if applicable. - je xloop100 - cmp eax, 32 - je xloop75 - cmp eax, 64 - je xloop50 - cmp eax, 96 - je xloop25 - - movd xmm0, eax // high fraction 1..127. - neg eax - add eax, 128 - movd xmm5, eax // low fraction 127..1. - punpcklbw xmm5, xmm0 - punpcklwd xmm5, xmm5 - pshufd xmm5, xmm5, 0 - - // General purpose row blend. - align 16 - xloop: - movdqu xmm0, [esi] - movdqu xmm2, [esi + edx] - movdqa xmm1, xmm0 - punpcklbw xmm0, xmm2 - punpckhbw xmm1, xmm2 - pmaddubsw xmm0, xmm5 - pmaddubsw xmm1, xmm5 - psrlw xmm0, 7 - psrlw xmm1, 7 - packuswb xmm0, xmm1 - sub ecx, 16 - movdqu [esi + edi], xmm0 - lea esi, [esi + 16] - jg xloop - jmp xloop99 - - // Blend 25 / 75. - align 16 - xloop25: - movdqu xmm0, [esi] - movdqu xmm1, [esi + edx] - pavgb xmm0, xmm1 - pavgb xmm0, xmm1 - sub ecx, 16 - movdqu [esi + edi], xmm0 - lea esi, [esi + 16] - jg xloop25 - jmp xloop99 - - // Blend 50 / 50. - align 16 - xloop50: - movdqu xmm0, [esi] - movdqu xmm1, [esi + edx] - pavgb xmm0, xmm1 - sub ecx, 16 - movdqu [esi + edi], xmm0 - lea esi, [esi + 16] - jg xloop50 - jmp xloop99 - - // Blend 75 / 25. - align 16 - xloop75: - movdqu xmm1, [esi] - movdqu xmm0, [esi + edx] - pavgb xmm0, xmm1 - pavgb xmm0, xmm1 - sub ecx, 16 - movdqu [esi + edi], xmm0 - lea esi, [esi + 16] - jg xloop75 - jmp xloop99 - - // Blend 100 / 0 - Copy row unchanged. - align 16 - xloop100: - movdqu xmm0, [esi] - sub ecx, 16 - movdqu [esi + edi], xmm0 - lea esi, [esi + 16] - jg xloop100 - - // Extrude last pixel. - xloop99: - punpckhbw xmm0, xmm0 - pshufhw xmm0, xmm0, 0xff - punpckhqdq xmm0, xmm0 - movdqu [esi + edi], xmm0 - pop edi - pop esi - ret - } -} #elif !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) // GCC versions of row functions are verbatim conversions from Visual C. // Generated using gcc disassembly on Visual C object file: @@ -1745,337 +1396,6 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, ); } -// Bilinear row filtering combines 16x2 -> 16x1. 
SSE2 version -// For more info see comment above ScaleFilterRows_SSE2 for MSVC++ -#define HAS_SCALEFILTERROWS_SSE2 -static void ScaleFilterRows_SSE2(uint8* dst_ptr, - const uint8* src_ptr, ptrdiff_t src_stride, - int dst_width, int source_y_fraction) { - asm volatile ( - "sub %1,%0 \n" - "shr %3 \n" - "cmp $0x0,%3 \n" - "je 100f \n" - "cmp $0x20,%3 \n" - "je 75f \n" - "cmp $0x40,%3 \n" - "je 50f \n" - "cmp $0x60,%3 \n" - "je 25f \n" - - "movd %3,%%xmm0 \n" - "neg %3 \n" - "add $0x80,%3 \n" - "movd %3,%%xmm5 \n" - "punpcklbw %%xmm0,%%xmm5 \n" - "punpcklwd %%xmm5,%%xmm5 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "pxor %%xmm4,%%xmm4 \n" - - // General purpose row blend. - ".p2align 4 \n" - "1: \n" - "movdqa (%1),%%xmm0 \n" - "movdqa (%1,%4,1),%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm3 \n" - "punpcklbw %%xmm4,%%xmm2 \n" - "punpckhbw %%xmm4,%%xmm3 \n" - "punpcklbw %%xmm4,%%xmm0 \n" - "punpckhbw %%xmm4,%%xmm1 \n" - "psubw %%xmm0,%%xmm2 \n" - "psubw %%xmm1,%%xmm3 \n" - "paddw %%xmm2,%%xmm2 \n" - "paddw %%xmm3,%%xmm3 \n" - "pmulhw %%xmm5,%%xmm2 \n" - "pmulhw %%xmm5,%%xmm3 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm3,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "sub $0x10,%2 \n" - "movdqa %%xmm0,(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" - "jg 1b \n" - "jmp 99f \n" - - // Blend 25 / 75. - ".p2align 4 \n" - "25: \n" - "movdqa (%1),%%xmm0 \n" - "movdqa (%1,%4,1),%%xmm1 \n" - "pavgb %%xmm1,%%xmm0 \n" - "pavgb %%xmm1,%%xmm0 \n" - "sub $0x10,%2 \n" - "movdqa %%xmm0,(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" - "jg 25b \n" - "jmp 99f \n" - - // Blend 50 / 50. - ".p2align 4 \n" - "50: \n" - "movdqa (%1),%%xmm0 \n" - "movdqa (%1,%4,1),%%xmm1 \n" - "pavgb %%xmm1,%%xmm0 \n" - "sub $0x10,%2 \n" - "movdqa %%xmm0,(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" - "jg 50b \n" - "jmp 99f \n" - - // Blend 75 / 25. - ".p2align 4 \n" - "75: \n" - "movdqa (%1),%%xmm1 \n" - "movdqa (%1,%4,1),%%xmm0 \n" - "pavgb %%xmm1,%%xmm0 \n" - "pavgb %%xmm1,%%xmm0 \n" - "sub $0x10,%2 \n" - "movdqa %%xmm0,(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" - "jg 75b \n" - "jmp 99f \n" - - // Blend 100 / 0 - Copy row unchanged. - ".p2align 4 \n" - "100: \n" - "movdqa (%1),%%xmm0 \n" - "sub $0x10,%2 \n" - "movdqa %%xmm0,(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" - "jg 100b \n" - - "99: \n" - "punpckhbw %%xmm0,%%xmm0 \n" - "pshufhw $0xff,%%xmm0,%%xmm0 \n" - "punpckhqdq %%xmm0,%%xmm0 \n" - "movdqa %%xmm0,(%1,%0,1) \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(source_y_fraction), // %3 - "r"(static_cast(src_stride)) // %4 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif - ); -} - -// Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version -#define HAS_SCALEFILTERROWS_SSSE3 -static void ScaleFilterRows_SSSE3(uint8* dst_ptr, - const uint8* src_ptr, ptrdiff_t src_stride, - int dst_width, int source_y_fraction) { - asm volatile ( - "sub %1,%0 \n" - "shr %3 \n" - "cmp $0x0,%3 \n" - "je 100f \n" - "cmp $0x20,%3 \n" - "je 75f \n" - "cmp $0x40,%3 \n" - "je 50f \n" - "cmp $0x60,%3 \n" - "je 25f \n" - "movd %3,%%xmm0 \n" - "neg %3 \n" - "add $0x80,%3 \n" - "movd %3,%%xmm5 \n" - "punpcklbw %%xmm0,%%xmm5 \n" - "punpcklwd %%xmm5,%%xmm5 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - - // General purpose row blend. 
- ".p2align 4 \n" - "1: \n" - "movdqa (%1),%%xmm0 \n" - "movdqa (%1,%4,1),%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm2,%%xmm0 \n" - "punpckhbw %%xmm2,%%xmm1 \n" - "pmaddubsw %%xmm5,%%xmm0 \n" - "pmaddubsw %%xmm5,%%xmm1 \n" - "psrlw $0x7,%%xmm0 \n" - "psrlw $0x7,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "sub $0x10,%2 \n" - "movdqa %%xmm0,(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" - "jg 1b \n" - "jmp 99f \n" - - // Blend 25 / 75. - ".p2align 4 \n" - "25: \n" - "movdqa (%1),%%xmm0 \n" - "movdqa (%1,%4,1),%%xmm1 \n" - "pavgb %%xmm1,%%xmm0 \n" - "pavgb %%xmm1,%%xmm0 \n" - "sub $0x10,%2 \n" - "movdqa %%xmm0,(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" - "jg 25b \n" - "jmp 99f \n" - - // Blend 50 / 50. - ".p2align 4 \n" - "50: \n" - "movdqa (%1),%%xmm0 \n" - "movdqa (%1,%4,1),%%xmm1 \n" - "pavgb %%xmm1,%%xmm0 \n" - "sub $0x10,%2 \n" - "movdqa %%xmm0,(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" - "jg 50b \n" - "jmp 99f \n" - - // Blend 75 / 25. - ".p2align 4 \n" - "75: \n" - "movdqa (%1),%%xmm1 \n" - "movdqa (%1,%4,1),%%xmm0 \n" - "pavgb %%xmm1,%%xmm0 \n" - "pavgb %%xmm1,%%xmm0 \n" - "sub $0x10,%2 \n" - "movdqa %%xmm0,(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" - "jg 75b \n" - "jmp 99f \n" - - // Blend 100 / 0 - Copy row unchanged. - ".p2align 4 \n" - "100: \n" - "movdqa (%1),%%xmm0 \n" - "sub $0x10,%2 \n" - "movdqa %%xmm0,(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" - "jg 100b \n" - - // Extrude last pixel. - "99: \n" - "punpckhbw %%xmm0,%%xmm0 \n" - "pshufhw $0xff,%%xmm0,%%xmm0 \n" - "punpckhqdq %%xmm0,%%xmm0 \n" - "movdqa %%xmm0,(%1,%0,1) \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(source_y_fraction) // %3 - : "r"(static_cast(src_stride)) // %4 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm5" -#endif - ); -} - -static void ScaleFilterRows_Unaligned_SSSE3(uint8* dst_ptr, - const uint8* src_ptr, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) { - asm volatile ( - "sub %1,%0 \n" - "shr %3 \n" - "cmp $0x0,%3 \n" - "je 100f \n" - "cmp $0x20,%3 \n" - "je 75f \n" - "cmp $0x40,%3 \n" - "je 50f \n" - "cmp $0x60,%3 \n" - "je 25f \n" - "movd %3,%%xmm0 \n" - "neg %3 \n" - "add $0x80,%3 \n" - "movd %3,%%xmm5 \n" - "punpcklbw %%xmm0,%%xmm5 \n" - "punpcklwd %%xmm5,%%xmm5 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - - // General purpose row blend. - ".p2align 4 \n" - "1: \n" - "movdqu (%1),%%xmm0 \n" - "movdqu (%1,%4,1),%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm2,%%xmm0 \n" - "punpckhbw %%xmm2,%%xmm1 \n" - "pmaddubsw %%xmm5,%%xmm0 \n" - "pmaddubsw %%xmm5,%%xmm1 \n" - "psrlw $0x7,%%xmm0 \n" - "psrlw $0x7,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "sub $0x10,%2 \n" - "movdqu %%xmm0,(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" - "jg 1b \n" - "jmp 99f \n" - - // Blend 25 / 75. - ".p2align 4 \n" - "25: \n" - "movdqu (%1),%%xmm0 \n" - "movdqu (%1,%4,1),%%xmm1 \n" - "pavgb %%xmm1,%%xmm0 \n" - "pavgb %%xmm1,%%xmm0 \n" - "sub $0x10,%2 \n" - "movdqu %%xmm0,(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" - "jg 25b \n" - "jmp 99f \n" - - // Blend 50 / 50. - ".p2align 4 \n" - "50: \n" - "movdqu (%1),%%xmm0 \n" - "movdqu (%1,%4,1),%%xmm1 \n" - "pavgb %%xmm1,%%xmm0 \n" - "sub $0x10,%2 \n" - "movdqu %%xmm0,(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" - "jg 50b \n" - "jmp 99f \n" - - // Blend 75 / 25. - ".p2align 4 \n" - "75: \n" - "movdqu (%1),%%xmm1 \n" - "movdqu (%1,%4,1),%%xmm0 \n" - "pavgb %%xmm1,%%xmm0 \n" - "pavgb %%xmm1,%%xmm0 \n" - "sub $0x10,%2 \n" - "movdqu %%xmm0,(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" - "jg 75b \n" - "jmp 99f \n" - - // Blend 100 / 0 - Copy row unchanged. 
- ".p2align 4 \n" - "100: \n" - "movdqu (%1),%%xmm0 \n" - "sub $0x10,%2 \n" - "movdqu %%xmm0,(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" - "jg 100b \n" - - // Extrude last pixel. - "99: \n" - "punpckhbw %%xmm0,%%xmm0 \n" - "pshufhw $0xff,%%xmm0,%%xmm0 \n" - "punpckhqdq %%xmm0,%%xmm0 \n" - "movdqu %%xmm0,(%1,%0,1) \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(source_y_fraction) // %3 - : "r"(static_cast(src_stride)) // %4 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm5" -#endif - ); -} #endif // defined(__x86_64__) || defined(__i386__) #if !defined(LIBYUV_DISABLE_MIPS) && \ @@ -2085,11 +1405,6 @@ void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */, uint8* dst, int dst_width); void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst, int dst_width); -#define HAS_SCALEFILTERROWS_MIPS_DSPR2 -void ScaleFilterRows_MIPS_DSPR2(unsigned char *dst_ptr, - const unsigned char* src_ptr, - ptrdiff_t src_stride, - int dst_width, int source_y_fraction); #define HAS_SCALEROWDOWN4_MIPS_DSPR2 void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */, uint8* dst, int dst_width); @@ -2280,44 +1595,6 @@ static void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr, } } -#if defined(HAS_SCALEFILTERROWS_SSE2) -// Filter row to 3/4 -static void ScaleFilterCols34_C(uint8* dst_ptr, const uint8* src_ptr, - int dst_width) { - assert((dst_width % 3 == 0) && (dst_width > 0)); - const uint8* s = src_ptr; - uint8* dend = dst_ptr + dst_width; - do { - dst_ptr[0] = (s[0] * 3 + s[1] * 1 + 2) >> 2; - dst_ptr[1] = (s[1] * 1 + s[2] * 1 + 1) >> 1; - dst_ptr[2] = (s[2] * 1 + s[3] * 3 + 2) >> 2; - dst_ptr += 3; - s += 4; - } while (dst_ptr < dend); -} - -#define HAS_SCALEROWDOWN34_SSE2 -// Filter rows 0 and 1 together, 3 : 1 -static void ScaleRowDown34_0_Box_SSE2(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - assert((dst_width % 3 == 0) && (dst_width > 0)); - SIMD_ALIGNED(uint8 row[kMaxStride]); - ScaleFilterRows_SSE2(row, src_ptr, src_stride, dst_width * 4 / 3, 256 / 4); - ScaleFilterCols34_C(dst_ptr, row, dst_width); -} - -// Filter rows 1 and 2 together, 1 : 1 -static void ScaleRowDown34_1_Box_SSE2(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - assert((dst_width % 3 == 0) && (dst_width > 0)); - SIMD_ALIGNED(uint8 row[kMaxStride]); - ScaleFilterRows_SSE2(row, src_ptr, src_stride, dst_width * 4 / 3, 256 / 2); - ScaleFilterCols34_C(dst_ptr, row, dst_width); -} -#endif - static void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t /* src_stride */, uint8* dst, int dst_width) { assert(dst_width % 3 == 0); @@ -2376,35 +1653,6 @@ static void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, } } -// Blend 2 rows into 1 with filtering. N x 2 to N x 1 -static void ScaleFilterRows_C(uint8* dst_ptr, - const uint8* src_ptr, ptrdiff_t src_stride, - int dst_width, int source_y_fraction) { - assert(dst_width > 0); - // Specialized case for 100% first row. Helps avoid reading beyond last row. 
- if (source_y_fraction == 0) { - memcpy(dst_ptr, src_ptr, dst_width); - dst_ptr[dst_width] = dst_ptr[dst_width - 1]; - return; - } - int y1_fraction = source_y_fraction; - int y0_fraction = 256 - y1_fraction; - const uint8* src_ptr1 = src_ptr + src_stride; - - for (int x = 0; x < dst_width - 1; x += 2) { - dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8; - dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8; - src_ptr += 2; - src_ptr1 += 2; - dst_ptr += 2; - } - if (dst_width & 1) { - dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8; - dst_ptr += 1; - } - dst_ptr[0] = dst_ptr[-1]; -} - void ScaleAddRows_C(const uint8* src_ptr, ptrdiff_t src_stride, uint16* dst_ptr, int src_width, int src_height) { assert(src_width > 0); @@ -2542,13 +1790,6 @@ static void ScalePlaneDown34(int /* src_width */, int /* src_height */, } } #endif -#if defined(HAS_SCALEROWDOWN34_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && (dst_width % 24 == 0) && - IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && filtering) { - ScaleRowDown34_0 = ScaleRowDown34_0_Box_SSE2; - ScaleRowDown34_1 = ScaleRowDown34_1_Box_SSE2; - } -#endif #if defined(HAS_SCALEROWDOWN34_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) && IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { @@ -2825,12 +2066,59 @@ static void ScalePlaneBox(int src_width, int src_height, } } -// Scale plane to/from any dimensions, with interpolation. +// Scale plane to/from any dimensions, with bilinear interpolation. -static void ScalePlaneBilinearSimple(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_ptr, uint8* dst_ptr) { +void ScalePlaneBilinear(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr) { + assert(dst_width > 0); + assert(dst_height > 0); + assert(Abs(src_width) <= kMaxStride); + + SIMD_ALIGNED(uint8 row[kMaxStride + 16]); + + void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, int source_y_fraction) = + InterpolateRow_C; +#if defined(HAS_INTERPOLATEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && src_width >= 16) { + InterpolateRow = InterpolateRow_Any_SSE2; + if (IS_ALIGNED(src_width, 16)) { + InterpolateRow = InterpolateRow_Unaligned_SSE2; + if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { + InterpolateRow = InterpolateRow_SSE2; + } + } + } +#endif +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && src_width >= 16) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(src_width, 16)) { + InterpolateRow = InterpolateRow_Unaligned_SSSE3; + if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } + } +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && src_width >= 16) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(src_width, 16)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && src_width >= 4) { + InterpolateRow = InterpolateRows_Any_MIPS_DSPR2; + if (IS_ALIGNED(src_width, 4)) { + InterpolateRow = InterpolateRows_MIPS_DSPR2; + } + } +#endif int dx = 0; int dy = 0; int x = 0; int y = 0; if (dst_width <= Abs(src_width)) { dx = (Abs(src_width) << 16) / dst_width; x = (dx >> 1) - 32768; } else if (dst_width > 1) { dx = ((Abs(src_width) - 1) << 16) / (dst_width - 1); } // Negative src_width means horizontally mirror. if (src_width < 0) { x += (dst_width - 1) * dx; dx = -dx; src_width = -src_width; } if (dst_height <= src_height) { dy = (src_height << 16) / dst_height; y = (dy >> 1) - 32768; } else if (dst_height > 1) { dy = ((src_height - 1) << 16) / (dst_height - 1); }
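x, y, dx and dy above are 16.16 fixed point. A worked example of the down-scale branch, assuming src_width = 640 and dst_width = 320:

  dx = (640 << 16) / 320;   // 2.0 source pixels per output pixel
  x  = (dx >> 1) - 32768;   // 1.0 - 0.5 = 0.5 in 16.16, i.e. 32768
  // Output pixels sample source x = 0.5, 2.5, 4.5, ... : each reads
  // from the center of the 2-pixel source span it represents.

In the up-scale branch, dx instead spreads the (src_width - 1) interval over (dst_width - 1) steps, so the first and last output pixels line up with the first and last source pixels (up to integer truncation).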
- int maxx = (Abs(src_width) > 1) ? ((Abs(src_width) - 1) << 16) - 1 : 0; int maxy = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0; - if (y > maxy) { - y = maxy; - } - for (int i = 0; i < dst_height; ++i) { - int xs = x; - int yi = y >> 16; - int yf = y & 0xffff; - const uint8* src0 = src_ptr + yi * src_stride; - const uint8* src1 = (yi < src_height - 1) ? src0 + src_stride : src0; - uint8* dst = dst_ptr; - for (int j = 0; j < dst_width; ++j) { - int xi = xs >> 16; - int xf = xs & 0xffff; - int x1 = (xi < (src_width - 1)) ? xi + 1 : xi; - int a = src0[xi]; - int b = src0[x1]; - int r0 = BLENDER(a, b, xf); - a = src1[xi]; - b = src1[x1]; - int r1 = BLENDER(a, b, xf); - *dst++ = BLENDER(r0, r1, yf); - xs += dx; - if (xs > maxx) - xs = maxx; + for (int j = 0; j < dst_height; ++j) { + if (y > maxy) { + y = maxy; } + int yi = y >> 16; + int yf = (y >> 8) & 255; + const uint8* src = src_ptr + yi * src_stride; + InterpolateRow(row, src, src_stride, src_width, yf); + ScaleFilterCols_C(dst_ptr, row, dst_width, x, dx); dst_ptr += dst_stride; y += dy; - if (y > maxy) - y = maxy; - } -} - - -// Scale plane to/from any dimensions, with bilinear interpolation. - -void ScalePlaneBilinear(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_ptr, uint8* dst_ptr) { - assert(dst_width > 0); - assert(dst_height > 0); - if (Abs(src_width) > kMaxStride) { - ScalePlaneBilinearSimple(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src_ptr, dst_ptr); - - } else { - SIMD_ALIGNED(uint8 row[kMaxStride + 16]); - void (*ScaleFilterRows)(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride, - int dst_width, int source_y_fraction) = - ScaleFilterRows_C; -#if defined(HAS_SCALEFILTERROWS_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(src_width, 16)) { - ScaleFilterRows = ScaleFilterRows_NEON; - } -#endif -#if defined(HAS_SCALEFILTERROWS_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(src_width, 16) && - IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { - ScaleFilterRows = ScaleFilterRows_SSE2; - } -#endif -#if defined(HAS_SCALEFILTERROWS_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(src_width, 16)) { - ScaleFilterRows = ScaleFilterRows_Unaligned_SSSE3; - if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { - ScaleFilterRows = ScaleFilterRows_SSSE3; - } - } -#endif -#if defined(HAS_SCALEFILTERROWS_MIPS_DSPR2) - if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_width, 4) && - IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4)) { - ScaleFilterRows = ScaleFilterRows_MIPS_DSPR2; - } -#endif - int dx = 0; - int dy = 0; - int x = 0; - int y = 0; - if (dst_width <= Abs(src_width)) { - dx = (Abs(src_width) << 16) / dst_width; - x = (dx >> 1) - 32768; - } else if (dst_width > 1) { - dx = ((Abs(src_width) - 1) << 16) / (dst_width - 1); - } - // Negative src_width means horizontally mirror. - if (src_width < 0) { - x += (dst_width - 1) * dx; - dx = -dx; - src_width = -src_width; - } - if (dst_height <= src_height) { - dy = (src_height << 16) / dst_height; - y = (dy >> 1) - 32768; - } else if (dst_height > 1) { - dy = ((src_height - 1) << 16) / (dst_height - 1); - } - int maxy = (src_height > 1) ? 
-    for (int j = 0; j < dst_height; ++j) {
-      if (y > maxy) {
-        y = maxy;
-      }
-      int yi = y >> 16;
-      int yf = (y >> 8) & 255;
-      const uint8* src = src_ptr + yi * src_stride;
-      ScaleFilterRows(row, src, src_stride, src_width, yf);
-      ScaleFilterCols_C(dst_ptr, row, dst_width, x, dx);
-      dst_ptr += dst_stride;
-      y += dy;
-    }
   }
 }
 
@@ -3010,11 +2197,10 @@ static void ScalePlaneAnySize(int src_width, int src_height,
                               int src_stride, int dst_stride,
                               const uint8* src_ptr, uint8* dst_ptr,
                               FilterMode filtering) {
-  if (!filtering) {
+  if (!filtering || src_width > kMaxStride) {
     ScalePlaneSimple(src_width, src_height, dst_width, dst_height,
                      src_stride, dst_stride, src_ptr, dst_ptr);
   } else {
-    // fall back to non-optimized version
     ScalePlaneBilinear(src_width, src_height, dst_width, dst_height,
                        src_stride, dst_stride, src_ptr, dst_ptr);
   }
@@ -3031,7 +2217,7 @@ static void ScalePlaneDown(int src_width, int src_height,
                            int src_stride, int dst_stride,
                            const uint8* src_ptr, uint8* dst_ptr,
                            FilterMode filtering) {
-  if (!filtering) {
+  if (!filtering || src_width > kMaxStride) {
     ScalePlaneSimple(src_width, src_height, dst_width, dst_height,
                      src_stride, dst_stride, src_ptr, dst_ptr);
   } else if (filtering == kFilterBilinear || dst_height * 2 > src_height) {
@@ -3099,7 +2285,7 @@ void ScalePlane(const uint8* src, int src_stride,
 
 // Scale an I420 image.
 // This function in turn calls a scaling function for each plane.
-
+// TODO(fbarchard): Disable UNDER_ALLOCATED_HACK
 #define UNDER_ALLOCATED_HACK 1
 
 LIBYUV_API
diff --git a/source/scale_argb.cc b/source/scale_argb.cc
index 989df55a6..3162f9e07 100644
--- a/source/scale_argb.cc
+++ b/source/scale_argb.cc
@@ -824,41 +824,51 @@ static void ScaleARGBBilinearDown(int src_height,
   int xr = (dx >= 0) ? xlast : x;
   xl = (xl >> 16) & ~3;  // Left edge aligned.
   xr = (xr >> 16) + 1;  // Right most pixel used.
-  int clip_src_width = ((xr - xl) + 1 + 3) & ~3;  // Width aligned to 4.
+  int clip_src_width = (((xr - xl) + 1 + 3) & ~3) * 4;  // Bytes, aligned to 4 pixels.
   src_argb += xl * 4;
   x -= (xl << 16);
-  assert(clip_src_width * 4 <= kMaxStride);
+  assert(clip_src_width <= kMaxStride);
+  // TODO(fbarchard): Remove clip_src_width alignment checks.
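Editor's note: the clip-width arithmetic in the hunk above is easy to misread, since the result is now in bytes rather than pixels. A small standalone check, with made-up edge values, is given below; nothing here comes from the patch itself:

// Standalone check of the ARGB clip-width math; xl and xr are made up.
#include <cassert>

int main() {
  int xl = 2;    // left-most source pixel used (already masked with & ~3)
  int xr = 14;   // right-most source pixel used
  // Pixel count, rounded up to a multiple of 4 pixels, then scaled by
  // 4 bytes per ARGB pixel so it can be compared against kMaxStride.
  int clip_src_width = (((xr - xl) + 1 + 3) & ~3) * 4;
  assert(clip_src_width == 64);  // 13 pixels -> 16 pixels -> 64 bytes
  return 0;
}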
   SIMD_ALIGNED(uint8 row[kMaxStride + 16]);
-  void (*ScaleARGBFilterRows)(uint8* dst_argb, const uint8* src_argb,
+  void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
       ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
-      ARGBInterpolateRow_C;
-#if defined(HAS_ARGBINTERPOLATEROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && clip_src_width >= 4) {
-    ScaleARGBFilterRows = ARGBInterpolateRow_Any_SSE2;
-    if (IS_ALIGNED(clip_src_width, 4)) {
-      ScaleARGBFilterRows = ARGBInterpolateRow_Unaligned_SSE2;
+      InterpolateRow_C;
+#if defined(HAS_INTERPOLATEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && clip_src_width >= 16) {
+    InterpolateRow = InterpolateRow_Any_SSE2;
+    if (IS_ALIGNED(clip_src_width, 16)) {
+      InterpolateRow = InterpolateRow_Unaligned_SSE2;
       if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16)) {
-        ScaleARGBFilterRows = ARGBInterpolateRow_SSE2;
+        InterpolateRow = InterpolateRow_SSE2;
       }
     }
   }
 #endif
-#if defined(HAS_ARGBINTERPOLATEROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && clip_src_width >= 4) {
-    ScaleARGBFilterRows = ARGBInterpolateRow_Any_SSSE3;
-    if (IS_ALIGNED(clip_src_width, 4)) {
-      ScaleARGBFilterRows = ARGBInterpolateRow_Unaligned_SSSE3;
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && clip_src_width >= 16) {
+    InterpolateRow = InterpolateRow_Any_SSSE3;
+    if (IS_ALIGNED(clip_src_width, 16)) {
+      InterpolateRow = InterpolateRow_Unaligned_SSSE3;
       if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16)) {
-        ScaleARGBFilterRows = ARGBInterpolateRow_SSSE3;
+        InterpolateRow = InterpolateRow_SSSE3;
       }
     }
   }
 #endif
-#if defined(HAS_ARGBINTERPOLATEROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && clip_src_width >= 4) {
-    ScaleARGBFilterRows = ARGBInterpolateRow_Any_NEON;
+#if defined(HAS_INTERPOLATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && clip_src_width >= 16) {
+    InterpolateRow = InterpolateRow_Any_NEON;
+    if (IS_ALIGNED(clip_src_width, 16)) {
+      InterpolateRow = InterpolateRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && clip_src_width >= 4 &&
+      IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4)) {
+    InterpolateRow = InterpolateRows_Any_MIPS_DSPR2;
     if (IS_ALIGNED(clip_src_width, 4)) {
-      ScaleARGBFilterRows = ARGBInterpolateRow_NEON;
+      InterpolateRow = InterpolateRows_MIPS_DSPR2;
     }
   }
 #endif
@@ -877,7 +887,7 @@ static void ScaleARGBBilinearDown(int src_height,
     int yi = y >> 16;
     int yf = (y >> 8) & 255;
     const uint8* src = src_argb + yi * src_stride;
-    ScaleARGBFilterRows(row, src, src_stride, clip_src_width, yf);
+    InterpolateRow(row, src, src_stride, clip_src_width, yf);
     ScaleARGBFilterCols(dst_argb, row, dst_width, x, dx);
     dst_argb += dst_stride;
     y += dy;
@@ -895,38 +905,44 @@ static void ScaleARGBBilinearUp(int src_width, int src_height,
   assert(dst_width > 0);
   assert(dst_height > 0);
   assert(dst_width * 4 <= kMaxStride);
-  void (*ScaleARGBFilterRows)(uint8* dst_argb, const uint8* src_argb,
+  void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
       ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
-      ARGBInterpolateRow_C;
-#if defined(HAS_ARGBINTERPOLATEROW_SSE2)
+      InterpolateRow_C;
+#if defined(HAS_INTERPOLATEROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 4) {
-    ScaleARGBFilterRows = ARGBInterpolateRow_Any_SSE2;
+    InterpolateRow = InterpolateRow_Any_SSE2;
     if (IS_ALIGNED(dst_width, 4)) {
-      ScaleARGBFilterRows = ARGBInterpolateRow_Unaligned_SSE2;
+      InterpolateRow = InterpolateRow_Unaligned_SSE2;
       if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
-        ScaleARGBFilterRows = ARGBInterpolateRow_SSE2;
+        InterpolateRow = InterpolateRow_SSE2;
       }
     }
   }
 #endif
-#if defined(HAS_ARGBINTERPOLATEROW_SSSE3)
+#if defined(HAS_INTERPOLATEROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 4) {
-    ScaleARGBFilterRows = ARGBInterpolateRow_Any_SSSE3;
+    InterpolateRow = InterpolateRow_Any_SSSE3;
     if (IS_ALIGNED(dst_width, 4)) {
-      ScaleARGBFilterRows = ARGBInterpolateRow_Unaligned_SSSE3;
+      InterpolateRow = InterpolateRow_Unaligned_SSSE3;
       if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
-        ScaleARGBFilterRows = ARGBInterpolateRow_SSSE3;
+        InterpolateRow = InterpolateRow_SSSE3;
       }
     }
   }
 #endif
-#if defined(HAS_ARGBINTERPOLATEROW_NEON)
+#if defined(HAS_INTERPOLATEROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && dst_width >= 4) {
-    ScaleARGBFilterRows = ARGBInterpolateRow_Any_NEON;
+    InterpolateRow = InterpolateRow_Any_NEON;
     if (IS_ALIGNED(dst_width, 4)) {
-      ScaleARGBFilterRows = ARGBInterpolateRow_NEON;
+      InterpolateRow = InterpolateRow_NEON;
     }
   }
+#endif
+#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width >= 1 &&
+      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
+    InterpolateRow = InterpolateRows_MIPS_DSPR2;
+  }
 #endif
   void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
       int dst_width, int x, int dx) = ScaleARGBFilterCols_C;
@@ -965,7 +981,7 @@ static void ScaleARGBBilinearUp(int src_width, int src_height,
       }
     }
     int yf = (y >> 8) & 255;
-    ScaleARGBFilterRows(dst_argb, rowptr, rowstride, dst_width, yf);
+    InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf);
     dst_argb += dst_stride;
     y += dy;
   }
@@ -1024,24 +1040,23 @@ static void ScaleARGBAnySize(int src_width, int src_height,
                              const uint8* src_argb, uint8* dst_argb,
                              int x, int dx, int y, int dy,
                              FilterMode filtering) {
-  if (!filtering ||
-      (src_width * 4 > kMaxStride && dst_width * 4 > kMaxStride)) {
-    ScaleARGBSimple(src_width, src_height, clip_width, clip_height,
-                    src_stride, dst_stride, src_argb, dst_argb,
-                    x, dx, y, dy);
-    return;
-  }
-  if (dy >= 65536 || dst_width * 4 > kMaxStride) {
-    ScaleARGBBilinearDown(src_height,
-                          clip_width, clip_height,
-                          src_stride, dst_stride, src_argb, dst_argb,
-                          x, dx, y, dy);
-  } else {
+  if (filtering && dy < 65536 && dst_width * 4 <= kMaxStride) {
     ScaleARGBBilinearUp(src_width, src_height,
                         clip_width, clip_height,
                         src_stride, dst_stride, src_argb, dst_argb,
                         x, dx, y, dy);
+    return;
   }
+  if (filtering && src_width * 4 < kMaxStride) {
+    ScaleARGBBilinearDown(src_height,
+                          clip_width, clip_height,
+                          src_stride, dst_stride, src_argb, dst_argb,
+                          x, dx, y, dy);
+    return;
+  }
+  ScaleARGBSimple(src_width, src_height, clip_width, clip_height,
+                  src_stride, dst_stride, src_argb, dst_argb,
+                  x, dx, y, dy);
 }
 
 // Scale an ARGB image.
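Editor's note: every InterpolateRow variant selected above computes the same row blend. As a reference, here is a simplified plain-C sketch of that contract (width in bytes, fraction in 0..255); it mirrors the removed ScaleFilterRows_C logic minus the extrude of the trailing pixel, and omits the fast paths for fractions 0, 128, and 256 that the real kernels have:

// Simplified reference for the InterpolateRow contract: blend two source
// rows into dst, weighting the second row by source_y_fraction/256.
// Width is in bytes, so the same routine serves Y planes and ARGB rows.
#include <cstddef>

typedef unsigned char uint8;

void InterpolateRowSketch(uint8* dst_ptr, const uint8* src_ptr,
                          ptrdiff_t src_stride, int width,
                          int source_y_fraction) {
  int y1_fraction = source_y_fraction;
  int y0_fraction = 256 - y1_fraction;
  const uint8* src_ptr1 = src_ptr + src_stride;  // row below
  for (int x = 0; x < width; ++x) {
    dst_ptr[x] = (src_ptr[x] * y0_fraction + src_ptr1[x] * y1_fraction) >> 8;
  }
}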
diff --git a/source/scale_mips.cc b/source/scale_mips.cc
index 66f2571a1..cfd48b5b0 100644
--- a/source/scale_mips.cc
+++ b/source/scale_mips.cc
@@ -629,64 +629,6 @@ void ScaleRowDown38_3_Box_MIPS_DSPR2(const uint8* src_ptr,
   );
 }
 
-void ScaleFilterRows_MIPS_DSPR2(unsigned char *dst_ptr,
-                                const unsigned char* src_ptr,
-                                ptrdiff_t src_stride,
-                                int dst_width, int source_y_fraction) {
-  int y0_fraction = 256 - source_y_fraction;
-  const unsigned char* src_ptr1 = src_ptr + src_stride;
-
-  __asm__ __volatile__ (
-    ".set push                                        \n"
-    ".set noreorder                                   \n"
-
-    "replv.ph        $t0, %[y0_fraction]              \n"
-    "replv.ph        $t1, %[source_y_fraction]        \n"
-
-   "1:                                                \n"
-    "lw              $t2, 0(%[src_ptr])               \n"
-    "lw              $t3, 0(%[src_ptr1])              \n"
-    "lw              $t4, 4(%[src_ptr])               \n"
-    "lw              $t5, 4(%[src_ptr1])              \n"
-    "muleu_s.ph.qbl  $t6, $t2, $t0                    \n"
-    "muleu_s.ph.qbr  $t7, $t2, $t0                    \n"
-    "muleu_s.ph.qbl  $t8, $t3, $t1                    \n"
-    "muleu_s.ph.qbr  $t9, $t3, $t1                    \n"
-    "muleu_s.ph.qbl  $t2, $t4, $t0                    \n"
-    "muleu_s.ph.qbr  $t3, $t4, $t0                    \n"
-    "muleu_s.ph.qbl  $t4, $t5, $t1                    \n"
-    "muleu_s.ph.qbr  $t5, $t5, $t1                    \n"
-    "addq.ph         $t6, $t6, $t8                    \n"
-    "addq.ph         $t7, $t7, $t9                    \n"
-    "addq.ph         $t2, $t2, $t4                    \n"
-    "addq.ph         $t3, $t3, $t5                    \n"
-    "shra.ph         $t6, $t6, 8                      \n"
-    "shra.ph         $t7, $t7, 8                      \n"
-    "shra.ph         $t2, $t2, 8                      \n"
-    "shra.ph         $t3, $t3, 8                      \n"
-    "precr.qb.ph     $t6, $t6, $t7                    \n"
-    "precr.qb.ph     $t2, $t2, $t3                    \n"
-    "addiu           %[src_ptr], %[src_ptr], 8        \n"
-    "addiu           %[src_ptr1], %[src_ptr1], 8      \n"
-    "addiu           %[dst_width], %[dst_width], -8   \n"
-    "sw              $t6, 0(%[dst_ptr])               \n"
-    "sw              $t2, 4(%[dst_ptr])               \n"
-    "bgtz            %[dst_width], 1b                 \n"
-    " addiu          %[dst_ptr], %[dst_ptr], 8        \n"
-
-    "lbu             $t0, -1(%[dst_ptr])              \n"
-    "sb              $t0, 0(%[dst_ptr])               \n"
-    ".set pop                                         \n"
-    : [dst_ptr] "+r" (dst_ptr),
-      [src_ptr1] "+r" (src_ptr1),
-      [src_ptr] "+r" (src_ptr),
-      [dst_width] "+r" (dst_width)
-    : [source_y_fraction] "r" (source_y_fraction),
-      [y0_fraction] "r" (y0_fraction),
-      [src_stride] "r" (src_stride)
-    : "t0", "t1", "t2", "t3", "t4", "t5",
-      "t6", "t7", "t8", "t9"
-  );
-}
 #endif  // defined(__mips_dsp) && (__mips_dsp_rev >= 2)
 
 #ifdef __cplusplus
diff --git a/unit_test/scale_argb_test.cc b/unit_test/scale_argb_test.cc
index 947531f04..4195bfa58 100644
--- a/unit_test/scale_argb_test.cc
+++ b/unit_test/scale_argb_test.cc
@@ -117,7 +117,7 @@ TEST_F(libyuvTest, ARGBScaleDownBy2_None) {
                                 dst_width, dst_height,
                                 kFilterNone,
                                 benchmark_iterations_);
-  EXPECT_LE(max_diff, 1);
+  EXPECT_EQ(0, max_diff);
 }
 
 TEST_F(libyuvTest, ARGBScaleDownBy2_Bilinear) {
@@ -143,7 +143,7 @@ TEST_F(libyuvTest, ARGBScaleDownBy1_None) {
                                 dst_width, dst_height,
                                 kFilterNone,
                                 benchmark_iterations_);
-  EXPECT_LE(max_diff, 0);
+  EXPECT_EQ(0, max_diff);
 }
 
 TEST_F(libyuvTest, ARGBScaleDownBy1_Bilinear) {
@@ -156,7 +156,7 @@ TEST_F(libyuvTest, ARGBScaleDownBy1_Bilinear) {
                                 dst_width, dst_height,
                                 kFilterBilinear,
                                 benchmark_iterations_);
-  EXPECT_LE(max_diff, 0);
+  EXPECT_EQ(0, max_diff);
 }
 
 TEST_F(libyuvTest, ARGBScaleDownBy4_None) {
@@ -169,7 +169,7 @@ TEST_F(libyuvTest, ARGBScaleDownBy4_None) {
                                 dst_width, dst_height,
                                 kFilterNone,
                                 benchmark_iterations_);
-  EXPECT_LE(max_diff, 1);
+  EXPECT_EQ(0, max_diff);
 }
 
 TEST_F(libyuvTest, ARGBScaleDownBy4_Bilinear) {
@@ -195,7 +195,7 @@ TEST_F(libyuvTest, ARGBScaleDownBy5_None) {
                                 dst_width, dst_height,
                                 kFilterNone,
                                 benchmark_iterations_);
-  EXPECT_LE(max_diff, 1);
+  EXPECT_EQ(0, max_diff);
 }
 
 TEST_F(libyuvTest, ARGBScaleDownBy5_Bilinear) {
@@ -221,7 +221,7 @@ TEST_F(libyuvTest, ARGBScaleDownBy8_None) {
                                 dst_width, dst_height,
                                 kFilterNone,
                                 benchmark_iterations_);
-  EXPECT_LE(max_diff, 1);
+  EXPECT_EQ(0, max_diff);
 }
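Editor's note: the _None tests above and below tighten EXPECT_LE(max_diff, 1) to EXPECT_EQ(0, max_diff) because point sampling never does arithmetic on pixel values, so no rounding error is possible. A minimal sketch under made-up values, illustrating why exact equality is a safe expectation:

// Why kFilterNone can demand an exact match: point sampling only ever
// copies source pixels. Values and the centered start are illustrative.
#include <cassert>

int main() {
  const unsigned char src[8] = {10, 20, 30, 40, 50, 60, 70, 80};
  unsigned char dst[4];
  int dx = (8 << 16) / 4;     // 16.16 step for a 2x downscale
  int x = (dx >> 1) - 32768;  // a common centered starting offset
  for (int i = 0; i < 4; ++i) {
    dst[i] = src[x >> 16];    // pure copy, no blending
    x += dx;
  }
  assert(dst[0] == src[0] && dst[3] == src[6]);
  return 0;
}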
 TEST_F(libyuvTest, ARGBScaleDownBy8_Bilinear) {
@@ -247,7 +247,7 @@ TEST_F(libyuvTest, ARGBScaleDownBy16_None) {
                                 dst_width, dst_height,
                                 kFilterNone,
                                 benchmark_iterations_);
-  EXPECT_LE(max_diff, 1);
+  EXPECT_EQ(0, max_diff);
 }
 
 TEST_F(libyuvTest, ARGBScaleDownBy16_Bilinear) {
@@ -263,6 +263,32 @@ TEST_F(libyuvTest, ARGBScaleDownBy16_Bilinear) {
   EXPECT_LE(max_diff, 2);
 }
 
+TEST_F(libyuvTest, ARGBScaleDownBy23_None) {
+  const int src_width = benchmark_width_;
+  const int src_height = benchmark_height_;
+  const int dst_width = Abs(src_width) * 2 / 3;
+  const int dst_height = Abs(src_height) * 2 / 3;
+
+  int max_diff = ARGBTestFilter(src_width, src_height,
+                                dst_width, dst_height,
+                                kFilterNone,
+                                benchmark_iterations_);
+  EXPECT_EQ(0, max_diff);
+}
+
+TEST_F(libyuvTest, ARGBScaleDownBy23_Bilinear) {
+  const int src_width = benchmark_width_;
+  const int src_height = benchmark_height_;
+  const int dst_width = Abs(src_width) * 2 / 3;
+  const int dst_height = Abs(src_height) * 2 / 3;
+
+  int max_diff = ARGBTestFilter(src_width, src_height,
+                                dst_width, dst_height,
+                                kFilterBilinear,
+                                benchmark_iterations_);
+  EXPECT_LE(max_diff, 2);
+}
+
 TEST_F(libyuvTest, ARGBScaleDownBy34_None) {
   const int src_width = benchmark_width_;
   const int src_height = benchmark_height_;
@@ -273,7 +299,7 @@ TEST_F(libyuvTest, ARGBScaleDownBy34_None) {
                                 dst_width, dst_height,
                                 kFilterNone,
                                 benchmark_iterations_);
-  EXPECT_LE(max_diff, 1);
+  EXPECT_EQ(0, max_diff);
 }
 
 TEST_F(libyuvTest, ARGBScaleDownBy34_Bilinear) {
@@ -299,7 +325,7 @@ TEST_F(libyuvTest, ARGBScaleDownBy38_None) {
                                 dst_width, dst_height,
                                 kFilterNone,
                                 benchmark_iterations_);
-  EXPECT_LE(max_diff, 1);
+  EXPECT_EQ(0, max_diff);
 }
 
 TEST_F(libyuvTest, ARGBScaleDownBy38_Bilinear) {
@@ -325,7 +351,7 @@ TEST_F(libyuvTest, ARGBScaleTo1366x768_None) {
                                 dst_width, dst_height,
                                 kFilterNone,
                                 benchmark_iterations_);
-  EXPECT_LE(max_diff, 1);
+  EXPECT_EQ(0, max_diff);
 }
 
 TEST_F(libyuvTest, ARGBScaleTo1366x768_Bilinear) {
@@ -352,7 +378,7 @@ TEST_F(libyuvTest, ARGBScaleTo1280x720_None) {
                                 dst_width, dst_height,
                                 kFilterNone,
                                 benchmark_iterations_);
-  EXPECT_LE(max_diff, 1);
+  EXPECT_EQ(0, max_diff);
 }
 
 TEST_F(libyuvTest, ARGBScaleTo1280x720_Bilinear) {
@@ -378,7 +404,7 @@ TEST_F(libyuvTest, ARGBScaleTo853x480_None) {
                                 dst_width, dst_height,
                                 kFilterNone,
                                 benchmark_iterations_);
-  EXPECT_LE(max_diff, 1);
+  EXPECT_EQ(0, max_diff);
 }
 
 TEST_F(libyuvTest, ARGBScaleTo853x480_Bilinear) {
@@ -404,7 +430,7 @@ TEST_F(libyuvTest, ARGBScaleFrom640x360_None) {
                                 dst_width, dst_height,
                                 kFilterNone,
                                 benchmark_iterations_);
-  EXPECT_LE(max_diff, 1);
+  EXPECT_EQ(0, max_diff);
 }
 
 TEST_F(libyuvTest, ARGBScaleFrom640x360_Bilinear) {
@@ -675,6 +701,32 @@ TEST_F(libyuvTest, ARGBScaleClipDownBy16_Bilinear) {
   EXPECT_EQ(0, max_diff);
 }
 
+TEST_F(libyuvTest, ARGBScaleClipDownBy23_None) {
+  const int src_width = benchmark_width_;
+  const int src_height = benchmark_height_;
+  const int dst_width = Abs(src_width) * 2 / 3;
+  const int dst_height = Abs(src_height) * 2 / 3;
+
+  int max_diff = ARGBClipTestFilter(src_width, src_height,
+                                    dst_width, dst_height,
+                                    kFilterNone,
+                                    benchmark_iterations_);
+  EXPECT_EQ(0, max_diff);
+}
+
+TEST_F(libyuvTest, ARGBScaleClipDownBy23_Bilinear) {
+  const int src_width = benchmark_width_;
+  const int src_height = benchmark_height_;
+  const int dst_width = Abs(src_width) * 2 / 3;
+  const int dst_height = Abs(src_height) * 2 / 3;
+
+  int max_diff = ARGBClipTestFilter(src_width, src_height,
+                                    dst_width, dst_height,
+                                    kFilterBilinear,
+                                    benchmark_iterations_);
+  EXPECT_EQ(0, max_diff);
+}
+
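Editor's note: the new DownBy23 tests added above scale by 2/3, a ratio with no dedicated row kernel, so they exercise the generic any-size path. A small sketch of how the tests derive their target size, with placeholder benchmark dimensions:

// How the DownBy23 tests compute their destination size; 1280x720 stands
// in for benchmark_width_ x benchmark_height_, which vary per run.
#include <cstdlib>

int main() {
  const int src_width = 1280;   // placeholder for benchmark_width_
  const int src_height = 720;   // placeholder for benchmark_height_
  const int dst_width = abs(src_width) * 2 / 3;    // 853 (truncating divide)
  const int dst_height = abs(src_height) * 2 / 3;  // 480
  return (dst_width == 853 && dst_height == 480) ? 0 : 1;
}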
 TEST_F(libyuvTest, ARGBScaleClipDownBy34_None) {
   const int src_width = benchmark_width_;
   const int src_height = benchmark_height_;
diff --git a/unit_test/scale_test.cc b/unit_test/scale_test.cc
index ff718b12c..5facf7d51 100644
--- a/unit_test/scale_test.cc
+++ b/unit_test/scale_test.cc
@@ -175,7 +175,7 @@ TEST_F(libyuvTest, ScaleDownBy2_None) {
                             dst_width, dst_height,
                             kFilterNone,
                             benchmark_iterations_);
-  EXPECT_LE(max_diff, 1);
+  EXPECT_EQ(0, max_diff);
 }
 
 TEST_F(libyuvTest, ScaleDownBy2_Bilinear) {
@@ -214,7 +214,7 @@ TEST_F(libyuvTest, ScaleDownBy4_None) {
                             dst_width, dst_height,
                             kFilterNone,
                             benchmark_iterations_);
-  EXPECT_LE(max_diff, 2);  // This is the only scale factor with error of 2.
+  EXPECT_EQ(0, max_diff);
 }
 
 TEST_F(libyuvTest, ScaleDownBy4_Bilinear) {
@@ -253,7 +253,7 @@ TEST_F(libyuvTest, ScaleDownBy5_None) {
                             dst_width, dst_height,
                             kFilterNone,
                             benchmark_iterations_);
-  EXPECT_LE(max_diff, 1);
+  EXPECT_EQ(0, max_diff);
 }
 
 TEST_F(libyuvTest, ScaleDownBy5_Bilinear) {
@@ -292,7 +292,7 @@ TEST_F(libyuvTest, ScaleDownBy8_None) {
                             dst_width, dst_height,
                             kFilterNone,
                             benchmark_iterations_);
-  EXPECT_LE(max_diff, 1);
+  EXPECT_EQ(0, max_diff);
 }
 
 TEST_F(libyuvTest, ScaleDownBy8_Bilinear) {
@@ -331,7 +331,7 @@ TEST_F(libyuvTest, ScaleDownBy16_None) {
                             dst_width, dst_height,
                             kFilterNone,
                             benchmark_iterations_);
-  EXPECT_LE(max_diff, 1);
+  EXPECT_EQ(0, max_diff);
 }
 
 TEST_F(libyuvTest, ScaleDownBy16_Bilinear) {
@@ -344,7 +344,7 @@ TEST_F(libyuvTest, ScaleDownBy16_Bilinear) {
                             dst_width, dst_height,
                             kFilterBilinear,
                             benchmark_iterations_);
-  EXPECT_LE(max_diff, 1);
+  EXPECT_LE(max_diff, 2);
 }
 
 TEST_F(libyuvTest, ScaleDownBy16_Box) {
@@ -360,6 +360,32 @@ TEST_F(libyuvTest, ScaleDownBy16_Box) {
   EXPECT_LE(max_diff, 1);
 }
 
+TEST_F(libyuvTest, ScaleDownBy23_None) {
+  const int src_width = benchmark_width_;
+  const int src_height = benchmark_height_;
+  const int dst_width = Abs(src_width) * 2 / 3;
+  const int dst_height = Abs(src_height) * 2 / 3;
+
+  int max_diff = TestFilter(src_width, src_height,
+                            dst_width, dst_height,
+                            kFilterNone,
+                            benchmark_iterations_);
+  EXPECT_EQ(0, max_diff);
+}
+
+TEST_F(libyuvTest, ScaleDownBy23_Bilinear) {
+  const int src_width = benchmark_width_;
+  const int src_height = benchmark_height_;
+  const int dst_width = Abs(src_width) * 2 / 3;
+  const int dst_height = Abs(src_height) * 2 / 3;
+
+  int max_diff = TestFilter(src_width, src_height,
+                            dst_width, dst_height,
+                            kFilterBilinear,
+                            benchmark_iterations_);
+  EXPECT_LE(max_diff, 2);
+}
+
 TEST_F(libyuvTest, ScaleDownBy34_None) {
   const int src_width = benchmark_width_;
   const int src_height = benchmark_height_;
@@ -370,7 +396,7 @@ TEST_F(libyuvTest, ScaleDownBy34_None) {
                             dst_width, dst_height,
                             kFilterNone,
                             benchmark_iterations_);
-  EXPECT_LE(max_diff, 1);
+  EXPECT_EQ(0, max_diff);
 }
 
 TEST_F(libyuvTest, ScaleDownBy34_Bilinear) {
@@ -409,7 +435,7 @@ TEST_F(libyuvTest, ScaleDownBy38_None) {
                             dst_width, dst_height,
                             kFilterNone,
                             benchmark_iterations_);
-  EXPECT_LE(max_diff, 1);
+  EXPECT_EQ(0, max_diff);
 }
 
 TEST_F(libyuvTest, ScaleDownBy38_Bilinear) {
@@ -448,7 +474,7 @@ TEST_F(libyuvTest, ScaleTo1366x768_None) {
                             dst_width, dst_height,
                             kFilterNone,
                             benchmark_iterations_);
-  EXPECT_LE(max_diff, 1);
+  EXPECT_EQ(0, max_diff);
 }
 
 TEST_F(libyuvTest, ScaleTo1366x768_Bilinear) {
@@ -487,7 +513,7 @@ TEST_F(libyuvTest, ScaleTo1280x720_None) {
                             dst_width, dst_height,
                             kFilterNone,
                             benchmark_iterations_);
-  EXPECT_LE(max_diff, 1);
+  EXPECT_EQ(0, max_diff);
 }
 
 TEST_F(libyuvTest, ScaleTo1280x720_Bilinear) {
@@ -526,7 +552,7 @@ TEST_F(libyuvTest, ScaleTo853x480_None) {
                             dst_width, dst_height,
                             kFilterNone,
                             benchmark_iterations_);
-  EXPECT_LE(max_diff, 1);
+  EXPECT_EQ(0, max_diff);
 }
 
 TEST_F(libyuvTest, ScaleTo853x480_Bilinear) {
@@ -565,7 +591,7 @@ TEST_F(libyuvTest, ScaleFrom640x360_None) {
                             dst_width, dst_height,
                             kFilterNone,
                             benchmark_iterations_);
-  EXPECT_LE(max_diff, 2);
+  EXPECT_EQ(0, max_diff);
 }
 
 TEST_F(libyuvTest, ScaleFrom640x360_Bilinear) {