From 788f757016c118f5d095b4cd4f0b157af0931169 Mon Sep 17 00:00:00 2001
From: "fbarchard@google.com"
 <fbarchard@google.com@16f28f9a-4ce2-e073-06de-1de4eb20be90>
Date: Mon, 11 Nov 2013 18:53:19 +0000
Subject: [PATCH] Linear interpolation. BUG=none TEST=*Linear*
 R=tpsiaki@google.com

Review URL: https://webrtc-codereview.appspot.com/3689004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@848 16f28f9a-4ce2-e073-06de-1de4eb20be90
---
 README.chromium              |   2 +-
 include/libyuv/scale.h       |   3 +-
 include/libyuv/version.h     |   2 +-
 source/scale.cc              | 242 +++++++++++++++++++++++++++++------
 source/scale_argb.cc         | 127 +++++++++++++++---
 unit_test/scale_argb_test.cc |  25 ++--
 unit_test/scale_test.cc      |  29 +++--
 7 files changed, 345 insertions(+), 85 deletions(-)

diff --git a/README.chromium b/README.chromium
index 2ce0a625a..186cdcb3e 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 847
+Version: 848
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/scale.h b/include/libyuv/scale.h
index b1efc95d2..03a4f50ce 100644
--- a/include/libyuv/scale.h
+++ b/include/libyuv/scale.h
@@ -22,7 +22,8 @@ extern "C" {
 enum FilterMode {
   kFilterNone = 0,  // Point sample; Fastest.
   kFilterBilinear = 1,  // Faster than box, but lower quality scaling down.
-  kFilterBox = 2  // Highest quality.
+  kFilterBox = 2,  // Highest quality.
+  kFilterLinear = 3  // Faster than bilinear, slower than None.
 };
 
 // Scale a YUV plane.
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 19fe8dbd3..674592788 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 847
+#define LIBYUV_VERSION 848
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
diff --git a/source/scale.cc b/source/scale.cc
index 3271013d5..6c708c795 100644
--- a/source/scale.cc
+++ b/source/scale.cc
@@ -37,19 +37,7 @@ static __inline int Half(int v) {
 // Note: Some SSE2 reference manuals
 // cpuvol1.pdf agner_instruction_tables.pdf 253666.pdf 253667.pdf
 
-// Set the following flag to true to revert to only
-// using the reference implementation ScalePlaneBox(), and
-// NOT the optimized versions. Useful for debugging and
-// when comparing the quality of the resulting YUV planes
-// as produced by the optimized and non-optimized versions.
-static bool use_reference_impl_ = false;
-
-LIBYUV_API
-void SetUseReferenceImpl(bool use) {
-  use_reference_impl_ = use;
-}
-
-// ScaleRowDown2Int also used by planar functions
+// ScaleRowDown2Box also used by planar functions
 // NEON downscalers with interpolation.
 
 #if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
@@ -208,6 +196,44 @@ static void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
   }
 }
 
+// Blends 32x1 rectangle to 16x1.
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
+__declspec(naked) __declspec(align(16))
+void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+                              uint8* dst_ptr, int dst_width) {
+  __asm {
+    mov        eax, [esp + 4]        // src_ptr
+                                     // src_stride is never loaded (one row)
+    mov        edx, [esp + 12]       // dst_ptr
+    mov        ecx, [esp + 16]       // dst_width
+    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
+    psrlw      xmm5, 8

+    align      16
+  wloop:
+    movdqa     xmm0, [eax]           // aligned load of 32 source pixels
+    movdqa     xmm1, [eax + 16]
+    lea        eax,  [eax + 32]

+    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
+    psrlw      xmm0, 8               // odd bytes moved to the low byte
+    movdqa     xmm3, xmm1
+    psrlw      xmm1, 8
+    pand       xmm2, xmm5            // even bytes kept by the mask
+    pand       xmm3, xmm5
+    pavgw      xmm0, xmm2            // rounded average of each byte pair
+    pavgw      xmm1, xmm3
+    packuswb   xmm0, xmm1            // pack 16 words back to 16 bytes

+    sub        ecx, 16               // 16 output pixels per iteration
+    movdqa     [edx], xmm0
+    lea        edx, [edx + 16]
+    jg         wloop                 // tests the flags set by sub above

+    ret
+  }
+}
+
 // Blends 32x2 rectangle to 16x1.
 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
 __declspec(naked) __declspec(align(16))
@@ -281,6 +307,44 @@ static void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
   }
 }
 
+// Blends 32x1 rectangle to 16x1.
+// Alignment requirement: none; loads and stores use movdqu.
+__declspec(naked) __declspec(align(16))
+void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr, ptrdiff_t,
+                                        uint8* dst_ptr, int dst_width) {
+  __asm {
+    mov        eax, [esp + 4]        // src_ptr
+                                     // src_stride is never loaded (one row)
+    mov        edx, [esp + 12]       // dst_ptr
+    mov        ecx, [esp + 16]       // dst_width
+    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
+    psrlw      xmm5, 8

+    align      16
+  wloop:
+    movdqu     xmm0, [eax]           // unaligned load of 32 source pixels
+    movdqu     xmm1, [eax + 16]
+    lea        eax,  [eax + 32]

+    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
+    psrlw      xmm0, 8               // odd bytes moved to the low byte
+    movdqa     xmm3, xmm1
+    psrlw      xmm1, 8
+    pand       xmm2, xmm5            // even bytes kept by the mask
+    pand       xmm3, xmm5
+    pavgw      xmm0, xmm2            // rounded average of each byte pair
+    pavgw      xmm1, xmm3
+    packuswb   xmm0, xmm1            // pack 16 words back to 16 bytes

+    sub        ecx, 16               // 16 output pixels per iteration
+    movdqu     [edx], xmm0           // unaligned store
+    lea        edx, [edx + 16]
+    jg         wloop                 // tests the flags set by sub above

+    ret
+  }
+}
+
 // Blends 32x2 rectangle to 16x1.
 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
 __declspec(naked) __declspec(align(16))
@@ -838,6 +902,40 @@ static void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
   );
 }
 
+void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t,
+                              uint8* dst_ptr, int dst_width) {
+  asm volatile (
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // mask 0x00ff00ff
+    "psrlw     $0x8,%%xmm5                     \n"
+    ".p2align  4                               \n"
+  "1:                                          \n"
+    "movdqa    (%0),%%xmm0                     \n"  // aligned 32 pixel load
+    "movdqa    0x10(%0),%%xmm1                 \n"
+    "lea       0x20(%0),%0                     \n"
+    "movdqa    %%xmm0,%%xmm2                   \n"
+    "psrlw     $0x8,%%xmm0                     \n"  // odd bytes
+    "movdqa    %%xmm1,%%xmm3                   \n"
+    "psrlw     $0x8,%%xmm1                     \n"
+    "pand      %%xmm5,%%xmm2                   \n"  // even bytes
+    "pand      %%xmm5,%%xmm3                   \n"
+    "pavgw     %%xmm2,%%xmm0                   \n"  // rounded pair average
+    "pavgw     %%xmm3,%%xmm1                   \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"  // 16 words to 16 bytes
+    "movdqa    %%xmm0,(%1)                     \n"
+    "lea       0x10(%1),%1                     \n"
+    "sub       $0x10,%2                        \n"  // 16 pixels per pass
+    "jg        1b                              \n"
+  : "+r"(src_ptr),    // %0
+    "+r"(dst_ptr),    // %1
+    "+r"(dst_width)   // %2
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"  // all xmm regs written above
+#endif
+  );
+}
+
 void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst_ptr, int dst_width) {
   asm volatile (
@@ -903,6 +1001,40 @@ static void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
   );
 }
 
+static void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr, ptrdiff_t,
+                                               uint8* dst_ptr, int dst_width) {
+  asm volatile (
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // mask 0x00ff00ff
+    "psrlw     $0x8,%%xmm5                     \n"
+    ".p2align  4                               \n"
+  "1:                                          \n"
+    "movdqu    (%0),%%xmm0                     \n"  // unaligned 32 pixel load
+    "movdqu    0x10(%0),%%xmm1                 \n"
+    "lea       0x20(%0),%0                     \n"
+    "movdqa    %%xmm0,%%xmm2                   \n"
+    "psrlw     $0x8,%%xmm0                     \n"  // odd bytes
+    "movdqa    %%xmm1,%%xmm3                   \n"
+    "psrlw     $0x8,%%xmm1                     \n"
+    "pand      %%xmm5,%%xmm2                   \n"  // even bytes
+    "pand      %%xmm5,%%xmm3                   \n"
+    "pavgw     %%xmm2,%%xmm0                   \n"  // rounded pair average
+    "pavgw     %%xmm3,%%xmm1                   \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"  // 16 words to 16 bytes
+    "movdqu    %%xmm0,(%1)                     \n"
+    "lea       0x10(%1),%1                     \n"
+    "sub       $0x10,%2                        \n"  // 16 pixels per pass
+    "jg        1b                              \n"
+  : "+r"(src_ptr),    // %0
+    "+r"(dst_ptr),    // %1
+    "+r"(dst_width)   // %2
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"  // all xmm regs written above
+#endif
+  );
+}
+
 static void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
                                             ptrdiff_t src_stride,
                                             uint8* dst_ptr, int dst_width) {
@@ -1447,6 +1579,21 @@ static void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t /* src_stride */,
   }
 }
 
+// Averages each horizontal pair of pixels of one row (2x1 -> 1x1).
+void ScaleRowDown2Linear_C(const uint8* src_ptr, ptrdiff_t /* src_stride */,
+                           uint8* dst, int dst_width) {
+  // Pre-tested loop: dst[1] is never written when dst_width is 0 or 1.
+  for (int x = 0; x < dst_width - 1; x += 2) {
+    dst[0] = (src_ptr[0] + src_ptr[1] + 1) >> 1;
+    dst[1] = (src_ptr[2] + src_ptr[3] + 1) >> 1;
+    dst += 2;
+    src_ptr += 4;
+  }
+  if (dst_width & 1) {
+    dst[0] = (src_ptr[0] + src_ptr[1] + 1) >> 1;
+  }
+}
+
 void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
                         uint8* dst, int dst_width) {
   const uint8* s = src_ptr;
@@ -1685,7 +1832,9 @@ static void ScalePlaneDown2(int /* src_width */, int /* src_height */,
                             FilterMode filtering) {
   void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width) =
-      filtering ? ScaleRowDown2Box_C : ScaleRowDown2_C;
+    filtering == kFilterNone ? ScaleRowDown2_C :
+        (filtering == kFilterLinear ? ScaleRowDown2Linear_C :
+        ScaleRowDown2Box_C);
   int row_stride = src_stride << 1;
   if (!filtering) {
     src_ptr += src_stride;  // Point to odd rows.
@@ -1698,12 +1847,15 @@ static void ScalePlaneDown2(int /* src_width */, int /* src_height */,
   }
 #elif defined(HAS_SCALEROWDOWN2_SSE2)
   if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) {
-    ScaleRowDown2 = filtering ? ScaleRowDown2Box_Unaligned_SSE2 :
-        ScaleRowDown2_Unaligned_SSE2;
+    ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Unaligned_SSE2 :
+        (filtering == kFilterLinear ? ScaleRowDown2Linear_Unaligned_SSE2 :
+        ScaleRowDown2Box_Unaligned_SSE2);
     if (IS_ALIGNED(src_ptr, 16) &&
         IS_ALIGNED(src_stride, 16) && IS_ALIGNED(row_stride, 16) &&
         IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
-      ScaleRowDown2 = filtering ? ScaleRowDown2Box_SSE2 : ScaleRowDown2_SSE2;
+      ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSE2 :
+          (filtering == kFilterLinear ? ScaleRowDown2Linear_SSE2 :
+          ScaleRowDown2Box_SSE2);
     }
   }
 #elif defined(HAS_SCALEROWDOWN2_MIPS_DSPR2)
@@ -1715,6 +1867,9 @@ static void ScalePlaneDown2(int /* src_width */, int /* src_height */,
   }
 #endif
 
+  if (filtering == kFilterLinear) {
+    src_stride = 0;
+  }
   // TODO(fbarchard): Loop through source height to allow odd height.
   for (int y = 0; y < dst_height; ++y) {
     ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width);
@@ -1759,6 +1914,9 @@ static void ScalePlaneDown4(int /* src_width */, int /* src_height */,
   }
 #endif
 
+  if (filtering == kFilterLinear) {
+    src_stride = 0;
+  }
   for (int y = 0; y < dst_height; ++y) {
     ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width);
     src_ptr += row_stride;
@@ -1822,14 +1980,15 @@ static void ScalePlaneDown34(int /* src_width */, int /* src_height */,
   }
 #endif
 
+  const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
   for (int y = 0; y < dst_height - 2; y += 3) {
-    ScaleRowDown34_0(src_ptr, src_stride, dst_ptr, dst_width);
+    ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width);
     src_ptr += src_stride;
     dst_ptr += dst_stride;
-    ScaleRowDown34_1(src_ptr, src_stride, dst_ptr, dst_width);
+    ScaleRowDown34_1(src_ptr, filter_stride, dst_ptr, dst_width);
     src_ptr += src_stride;
     dst_ptr += dst_stride;
-    ScaleRowDown34_0(src_ptr + src_stride, -src_stride,
+    ScaleRowDown34_0(src_ptr + src_stride, -filter_stride,
                      dst_ptr, dst_width);
     src_ptr += src_stride * 2;
     dst_ptr += dst_stride;
@@ -1837,7 +1996,7 @@ static void ScalePlaneDown34(int /* src_width */, int /* src_height */,
 
   // Remainder 1 or 2 rows with last row vertically unfiltered
   if ((dst_height % 3) == 2) {
-    ScaleRowDown34_0(src_ptr, src_stride, dst_ptr, dst_width);
+    ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width);
     src_ptr += src_stride;
     dst_ptr += dst_stride;
     ScaleRowDown34_1(src_ptr, 0, dst_ptr, dst_width);
@@ -1914,21 +2073,22 @@ static void ScalePlaneDown38(int /* src_width */, int /* src_height */,
   }
 #endif
 
+  const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
   for (int y = 0; y < dst_height - 2; y += 3) {
-    ScaleRowDown38_3(src_ptr, src_stride, dst_ptr, dst_width);
+    ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
     src_ptr += src_stride * 3;
     dst_ptr += dst_stride;
-    ScaleRowDown38_3(src_ptr, src_stride, dst_ptr, dst_width);
+    ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
     src_ptr += src_stride * 3;
     dst_ptr += dst_stride;
-    ScaleRowDown38_2(src_ptr, src_stride, dst_ptr, dst_width);
+    ScaleRowDown38_2(src_ptr, filter_stride, dst_ptr, dst_width);
     src_ptr += src_stride * 2;
     dst_ptr += dst_stride;
   }
 
   // Remainder 1 or 2 rows with last row vertically unfiltered
   if ((dst_height % 3) == 2) {
-    ScaleRowDown38_3(src_ptr, src_stride, dst_ptr, dst_width);
+    ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
     src_ptr += src_stride * 3;
     dst_ptr += dst_stride;
     ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width);
@@ -2080,7 +2240,8 @@ SAFEBUFFERS
 void ScalePlaneBilinear(int src_width, int src_height,
                         int dst_width, int dst_height,
                         int src_stride, int dst_stride,
-                        const uint8* src_ptr, uint8* dst_ptr) {
+                        const uint8* src_ptr, uint8* dst_ptr,
+                        FilterMode filtering) {
   assert(dst_width > 0);
   assert(dst_height > 0);
   assert(Abs(src_width) <= kMaxStride);
@@ -2164,10 +2325,14 @@ void ScalePlaneBilinear(int src_width, int src_height,
       y = max_y;
     }
     int yi = y >> 16;
-    int yf = (y >> 8) & 255;
     const uint8* src = src_ptr + yi * src_stride;
-    InterpolateRow(row, src, src_stride, src_width, yf);
-    ScaleFilterCols_C(dst_ptr, row, dst_width, x, dx);
+    if (filtering == kFilterLinear) {
+      ScaleFilterCols_C(dst_ptr, src, dst_width, x, dx);
+    } else {
+      int yf = (y >> 8) & 255;
+      InterpolateRow(row, src, src_stride, src_width, yf);
+      ScaleFilterCols_C(dst_ptr, row, dst_width, x, dx);
+    }
     dst_ptr += dst_stride;
     y += dy;
   }
@@ -2219,15 +2384,11 @@ static void ScalePlaneAnySize(int src_width, int src_height,
                      src_stride, dst_stride, src_ptr, dst_ptr);
   } else {
     ScalePlaneBilinear(src_width, src_height, dst_width, dst_height,
-                       src_stride, dst_stride, src_ptr, dst_ptr);
+                       src_stride, dst_stride, src_ptr, dst_ptr, filtering);
   }
 }
 
 // Scale plane down, any size
-//
-// This is an optimized version for scaling down a plane to any size.
-// The current implementation is ~10 times faster compared to the
-// reference implementation for e.g. XGA->LowResPAL
 
 static void ScalePlaneDown(int src_width, int src_height,
                            int dst_width, int dst_height,
@@ -2237,10 +2398,11 @@ static void ScalePlaneDown(int src_width, int src_height,
   if (!filtering || src_width > kMaxStride) {
     ScalePlaneSimple(src_width, src_height, dst_width, dst_height,
                      src_stride, dst_stride, src_ptr, dst_ptr);
-  } else if (filtering == kFilterBilinear || dst_height * 2 > src_height) {
+  } else if (filtering == kFilterBilinear || filtering == kFilterLinear ||
+             dst_height * 2 > src_height) {
     // between 1/2x and 1x use bilinear
     ScalePlaneBilinear(src_width, src_height, dst_width, dst_height,
-                       src_stride, dst_stride, src_ptr, dst_ptr);
+                       src_stride, dst_stride, src_ptr, dst_ptr, filtering);
   } else {
     ScalePlaneBox(src_width, src_height, dst_width, dst_height,
                   src_stride, dst_stride, src_ptr, dst_ptr);
@@ -2271,12 +2433,8 @@ void ScalePlane(const uint8* src, int src_stride,
                        0, 0, dy, 1, filtering);
   } else if (dst_width <= Abs(src_width) && dst_height <= src_height) {
     // Scale down.
-    if (use_reference_impl_) {
-      // For testing, allow the optimized versions to be disabled.
-      ScalePlaneDown(src_width, src_height, dst_width, dst_height,
-                     src_stride, dst_stride, src, dst, filtering);
-    } else if (4 * dst_width == 3 * src_width &&
-               4 * dst_height == 3 * src_height) {
+    if (4 * dst_width == 3 * src_width &&
+        4 * dst_height == 3 * src_height) {
       // optimized, 3/4
       ScalePlaneDown34(src_width, src_height, dst_width, dst_height,
                        src_stride, dst_stride, src, dst, filtering);
diff --git a/source/scale_argb.cc b/source/scale_argb.cc
index 21ed8bcb9..f00dde26e 100644
--- a/source/scale_argb.cc
+++ b/source/scale_argb.cc
@@ -74,6 +74,36 @@ static void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
   }
 }
 
+// Blends 8x1 rectangle to 4x1.
+// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
+__declspec(naked) __declspec(align(16))
+static void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
+                                         ptrdiff_t /* src_stride */,
+                                         uint8* dst_argb, int dst_width) {
+  __asm {
+    mov        eax, [esp + 4]        // src_argb
+                                     // src_stride ignored
+    mov        edx, [esp + 12]       // dst_argb
+    mov        ecx, [esp + 16]       // dst_width

+    align      16
+  wloop:
+    movdqa     xmm0, [eax]           // aligned load of 8 source pixels
+    movdqa     xmm1, [eax + 16]
+    lea        eax,  [eax + 32]
+    movdqa     xmm2, xmm0            // copy so both shuffles see all pixels
+    shufps     xmm0, xmm1, 0x88      // even pixels
+    shufps     xmm2, xmm1, 0xdd      // odd pixels
+    pavgb      xmm0, xmm2            // rounded per-byte average of the pairs
+    sub        ecx, 4                // 4 output pixels per iteration
+    movdqa     [edx], xmm0
+    lea        edx, [edx + 16]
+    jg         wloop                 // tests the flags set by sub above

+    ret
+  }
+}
+
 // Blends 8x2 rectangle to 4x1.
 // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
 __declspec(naked) __declspec(align(16))
@@ -466,6 +496,35 @@ static void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
   );
 }
 
+static void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
+                                         ptrdiff_t /* src_stride */,
+                                         uint8* dst_argb, int dst_width) {
+  asm volatile (
+    ".p2align  4                               \n"
+    BUNDLEALIGN
+  "1:                                          \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"  // aligned 8 pixel load
+    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "movdqa    %%xmm0,%%xmm2                   \n"
+    "shufps    $0x88,%%xmm1,%%xmm0             \n"  // even pixels
+    "shufps    $0xdd,%%xmm1,%%xmm2             \n"  // odd pixels
+    "pavgb     %%xmm2,%%xmm0                   \n"  // rounded pair average
+    "sub       $0x4,%2                         \n"  // 4 pixels per pass
+    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "jg        1b                              \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_argb),  // %1
+    "+r"(dst_width)  // %2
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2"  // all xmm regs written above
+#endif
+  );
+}
+
 static void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
                                       ptrdiff_t src_stride,
                                       uint8* dst_argb, int dst_width) {
@@ -822,6 +881,19 @@ static void ScaleARGBRowDown2_C(const uint8* src_argb,
   }
 }
 
+static void ScaleARGBRowDown2Linear_C(const uint8* src_argb,
+                                      ptrdiff_t /* src_stride */,
+                                      uint8* dst_argb, int dst_width) {
+  // Rounded average of each horizontal ARGB pixel pair, channel by channel.
+  for (int x = 0; x < dst_width; ++x) {
+    for (int c = 0; c < 4; ++c) {
+      dst_argb[c] = (src_argb[c] + src_argb[c + 4] + 1) >> 1;
+    }
+    src_argb += 8;
+    dst_argb += 4;
+  }
+}
+
 static void ScaleARGBRowDown2Box_C(const uint8* src_argb, ptrdiff_t src_stride,
                                    uint8* dst_argb, int dst_width) {
   for (int x = 0; x < dst_width; ++x) {
@@ -930,13 +1002,16 @@ static void ScaleARGBDown2(int /* src_width */, int /* src_height */,
   int row_stride = src_stride * (dy >> 16);
   void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride,
                             uint8* dst_argb, int dst_width) =
-      filtering ? ScaleARGBRowDown2Box_C : ScaleARGBRowDown2_C;
+    filtering == kFilterNone ? ScaleARGBRowDown2_C :
+        (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_C :
+        ScaleARGBRowDown2Box_C);
 #if defined(HAS_SCALEARGBROWDOWN2_SSE2)
   if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) &&
       IS_ALIGNED(src_argb, 16) && IS_ALIGNED(row_stride, 16) &&
       IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
-    ScaleARGBRowDown2 = filtering ? ScaleARGBRowDown2Box_SSE2 :
-        ScaleARGBRowDown2_SSE2;
+    ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_SSE2 :
+        (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2 :
+        ScaleARGBRowDown2Box_SSE2);
   }
 #elif defined(HAS_SCALEARGBROWDOWN2_NEON)
   if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8) &&
@@ -946,7 +1021,9 @@ static void ScaleARGBDown2(int /* src_width */, int /* src_height */,
   }
 #endif
 
-  // TODO(fbarchard): Loop through source height to allow odd height.
+  if (filtering == kFilterLinear) {
+    src_stride = 0;
+  }
   for (int y = 0; y < dst_height; ++y) {
     ScaleARGBRowDown2(src_argb, src_stride, dst_argb, dst_width);
     src_argb += row_stride;
@@ -985,6 +1062,9 @@ static void ScaleARGBDownEven(int src_width, int src_height,
   }
 #endif
 
+  if (filtering == kFilterLinear) {
+    src_stride = 0;
+  }
   for (int y = 0; y < dst_height; ++y) {
     ScaleARGBRowDownEven(src_argb, src_stride, col_step, dst_argb, dst_width);
     src_argb += row_stride;
@@ -998,7 +1078,8 @@ static void ScaleARGBBilinearDown(int src_height,
                                   int dst_width, int dst_height,
                                   int src_stride, int dst_stride,
                                   const uint8* src_argb, uint8* dst_argb,
-                                  int x, int dx, int y, int dy) {
+                                  int x, int dx, int y, int dy,
+                                  FilterMode filtering) {
   assert(src_height > 0);
   assert(dst_width > 0);
   assert(dst_height > 0);
@@ -1076,10 +1157,14 @@ static void ScaleARGBBilinearDown(int src_height,
       y = max_y;
     }
     int yi = y >> 16;
-    int yf = (y >> 8) & 255;
     const uint8* src = src_argb + yi * src_stride;
-    InterpolateRow(row, src, src_stride, clip_src_width, yf);
-    ScaleARGBFilterCols(dst_argb, row, dst_width, x, dx);
+    if (filtering == kFilterLinear) {
+      ScaleARGBFilterCols(dst_argb, src, dst_width, x, dx);
+    } else {
+      int yf = (y >> 8) & 255;
+      InterpolateRow(row, src, src_stride, clip_src_width, yf);
+      ScaleARGBFilterCols(dst_argb, row, dst_width, x, dx);
+    }
     dst_argb += dst_stride;
     y += dy;
   }
@@ -1091,7 +1176,8 @@ static void ScaleARGBBilinearUp(int src_width, int src_height,
                                 int dst_width, int dst_height,
                                 int src_stride, int dst_stride,
                                 const uint8* src_argb, uint8* dst_argb,
-                                int x, int dx, int y, int dy) {
+                                int x, int dx, int y, int dy,
+                                FilterMode filtering) {
   assert(src_width > 0);
   assert(src_height > 0);
   assert(dst_width > 0);
@@ -1180,8 +1266,12 @@ static void ScaleARGBBilinearUp(int src_width, int src_height,
         src += src_stride;
       }
     }
-    int yf = (y >> 8) & 255;
-    InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf);
+    if (filtering == kFilterLinear) {
+      InterpolateRow(dst_argb, rowptr, 0, dst_width * 4, 0);
+    } else {
+      int yf = (y >> 8) & 255;
+      InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf);
+    }
     dst_argb += dst_stride;
     y += dy;
   }
@@ -1200,7 +1290,8 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
                                      const uint8* src_u,
                                      const uint8* src_v,
                                      uint8* dst_argb,
-                                     int x, int dx, int y, int dy) {
+                                     int x, int dx, int y, int dy,
+                                     FilterMode filtering) {
   assert(src_width > 0);
   assert(src_height > 0);
   assert(dst_width > 0);
@@ -1353,8 +1444,12 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
         }
       }
     }
-    int yf = (y >> 8) & 255;
-    InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf);
+    if (filtering == kFilterLinear) {
+      InterpolateRow(dst_argb, rowptr, 0, dst_width * 4, 0);
+    } else {
+      int yf = (y >> 8) & 255;
+      InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf);
+    }
     dst_argb += dst_stride_argb;
     y += dy;
   }
@@ -1424,14 +1519,14 @@ static void ScaleARGBAnySize(int src_width, int src_height,
     ScaleARGBBilinearUp(src_width, src_height,
                         clip_width, clip_height,
                         src_stride, dst_stride, src_argb, dst_argb,
-                        x, dx, y, dy);
+                        x, dx, y, dy, filtering);
     return;
   }
   if (filtering && src_width * 4 < kMaxStride) {
     ScaleARGBBilinearDown(src_height,
                           clip_width, clip_height,
                           src_stride, dst_stride, src_argb, dst_argb,
-                          x, dx, y, dy);
+                          x, dx, y, dy, filtering);
     return;
   }
   ScaleARGBSimple(src_width, src_height, clip_width, clip_height,
diff --git a/unit_test/scale_argb_test.cc b/unit_test/scale_argb_test.cc
index a75b8f31b..f04088b17 100644
--- a/unit_test/scale_argb_test.cc
+++ b/unit_test/scale_argb_test.cc
@@ -213,18 +213,20 @@ static int ARGBClipTestFilter(int src_width, int src_height,
 // Test a scale factor with 2 filters.  Expect unfiltered to be exact, but
 // filtering is different fixed point implementations for SSSE3, Neon and C.
 #define TEST_FACTOR(name, hfactor, vfactor)                                    \
+    TEST_FACTOR1(name, None, hfactor, vfactor, 2)                              \
+    TEST_FACTOR1(name, Linear, hfactor, vfactor, 2)                            \
     TEST_FACTOR1(name, Bilinear, hfactor, vfactor, 2)
 
 // TODO(fbarchard): ScaleDownBy1 should be lossless, but Box has error of 2.
-// TEST_FACTOR(1, 1 / 1, 1 / 1)
+TEST_FACTOR(1, 1 / 1, 1 / 1)
 TEST_FACTOR(2, 1 / 2, 1 / 2)
 TEST_FACTOR(4, 1 / 4, 1 / 4)
-// TEST_FACTOR(8, 1 / 8, 1 / 8)
-// TEST_FACTOR(16, 1 / 16, 1 / 16)
-// TEST_FACTOR(2by3, 2 / 3, 2 / 3)
+TEST_FACTOR(8, 1 / 8, 1 / 8)
+TEST_FACTOR(16, 1 / 16, 1 / 16)
+TEST_FACTOR(2by3, 2 / 3, 2 / 3)
 TEST_FACTOR(3by4, 3 / 4, 3 / 4)
-// TEST_FACTOR(3by8, 3 / 8, 3 / 8)
-// TEST_FACTOR(Vertical2by3, 1, 2 / 3)
+TEST_FACTOR(3by8, 3 / 8, 3 / 8)
+TEST_FACTOR(Vertical2by3, 1, 2 / 3)
 #undef TEST_FACTOR1
 #undef TEST_FACTOR
 
@@ -257,14 +259,15 @@ TEST_FACTOR(3by4, 3 / 4, 3 / 4)
 // Test scale to a specified size with all 3 filters.
 #define TEST_SCALETO(name, width, height)                                      \
     TEST_SCALETO1(name, width, height, None, 0)                                \
+    TEST_SCALETO1(name, width, height, Linear, 2)                              \
     TEST_SCALETO1(name, width, height, Bilinear, 2)
 
 TEST_SCALETO(ARGBScale, 640, 360)
-TEST_SCALETO(DISABLED_ARGBScale, 853, 480)
-TEST_SCALETO(DISABLED_ARGBScale, 1280, 720)
-TEST_SCALETO(DISABLED_ARGBScale, 1280, 800)
-TEST_SCALETO(DISABLED_ARGBScale, 1366, 768)
-TEST_SCALETO(DISABLED_ARGBScale, 1920, 1080)
+TEST_SCALETO(ARGBScale, 853, 480)
+TEST_SCALETO(ARGBScale, 1280, 720)
+TEST_SCALETO(ARGBScale, 1280, 800)
+TEST_SCALETO(ARGBScale, 1366, 768)
+TEST_SCALETO(ARGBScale, 1920, 1080)
 #undef TEST_SCALETO1
 #undef TEST_SCALETO
 
diff --git a/unit_test/scale_test.cc b/unit_test/scale_test.cc
index d21615dda..2fa904bf8 100644
--- a/unit_test/scale_test.cc
+++ b/unit_test/scale_test.cc
@@ -141,23 +141,24 @@ static int TestFilter(int src_width, int src_height,
       EXPECT_LE(diff, max_diff);                                               \
     }
 
-// Test a scale factor with all 3 filters.  Expect unfiltered to be exact, but
+// Test a scale factor with all 4 filters.  Expect unfiltered to be exact, but
 // filtering is different fixed point implementations for SSSE3, Neon and C.
 #define TEST_FACTOR(name, hfactor, vfactor)                                    \
     TEST_FACTOR1(name, None, hfactor, vfactor, 0)                              \
+    TEST_FACTOR1(name, Linear, hfactor, vfactor, 2)                            \
     TEST_FACTOR1(name, Bilinear, hfactor, vfactor, 2)                          \
     TEST_FACTOR1(name, Box, hfactor, vfactor, 2)                               \
 
 // TODO(fbarchard): ScaleDownBy1 should be lossless, but Box has error of 2.
-// TEST_FACTOR(1, 1 / 1, 1 / 1)
+TEST_FACTOR(1, 1 / 1, 1 / 1)
 TEST_FACTOR(2, 1 / 2, 1 / 2)
 TEST_FACTOR(4, 1 / 4, 1 / 4)
-// TEST_FACTOR(8, 1 / 8, 1 / 8)
-// TEST_FACTOR(16, 1 / 16, 1 / 16)
-// TEST_FACTOR(2by3, 2 / 3, 2 / 3)
+TEST_FACTOR(8, 1 / 8, 1 / 8)
+TEST_FACTOR(16, 1 / 16, 1 / 16)
+TEST_FACTOR(2by3, 2 / 3, 2 / 3)
 TEST_FACTOR(3by4, 3 / 4, 3 / 4)
-// TEST_FACTOR(3by8, 3 / 8, 3 / 8)
-// TEST_FACTOR(Vertical2by3, 1, 2 / 3)
+TEST_FACTOR(3by8, 3 / 8, 3 / 8)
+TEST_FACTOR(Vertical2by3, 1, 2 / 3)
 #undef TEST_FACTOR1
 #undef TEST_FACTOR
 
@@ -175,17 +176,19 @@ TEST_FACTOR(3by4, 3 / 4, 3 / 4)
       EXPECT_LE(diff, max_diff);                                               \
     }
 
-// Test scale to a specified size with all 3 filters.
+// Test scale to a specified size with all 4 filters.
 #define TEST_SCALETO(name, width, height)                                      \
     TEST_SCALETO1(name, width, height, None, 0)                                \
+    TEST_SCALETO1(name, width, height, Linear, 0)                              \
+    TEST_SCALETO1(name, width, height, Bilinear, 2)                            \
     TEST_SCALETO1(name, width, height, Box, 2)
 
 TEST_SCALETO(Scale, 640, 360)
-TEST_SCALETO(DISABLED_Scale, 853, 480)
-TEST_SCALETO(DISABLED_Scale, 1280, 720)
-TEST_SCALETO(DISABLED_Scale, 1280, 800)
-TEST_SCALETO(DISABLED_Scale, 1366, 768)
-TEST_SCALETO(DISABLED_Scale, 1920, 1080)
+TEST_SCALETO(Scale, 853, 480)
+TEST_SCALETO(Scale, 1280, 720)
+TEST_SCALETO(Scale, 1280, 800)
+TEST_SCALETO(Scale, 1366, 768)
+TEST_SCALETO(Scale, 1920, 1080)
 #undef TEST_SCALETO1
 #undef TEST_SCALETO