From e8c74b61d3e90d0dbc8d19032d0713054fcf4538 Mon Sep 17 00:00:00 2001 From: "fbarchard@google.com" Date: Thu, 14 Nov 2013 02:03:32 +0000 Subject: [PATCH] Faster point samplers using row functions and specialized 2x upsampler. BUG=none TEST=none R=tpsiaki@google.com Review URL: https://webrtc-codereview.appspot.com/3859004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@854 16f28f9a-4ce2-e073-06de-1de4eb20be90 --- README.chromium | 2 +- include/libyuv/version.h | 2 +- source/scale.cc | 268 +++++++++++++++++++++++++++++---------- source/scale_argb.cc | 155 ++++++++++++---------- 4 files changed, 296 insertions(+), 131 deletions(-) diff --git a/README.chromium b/README.chromium index 29e4fa539..bc0b4ba6b 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 854 +Version: 855 License: BSD License File: LICENSE diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 437063884..e4c7afbd2 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 854 +#define LIBYUV_VERSION 855 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/scale.cc b/source/scale.cc index 779e53a86..c148032d9 100644 --- a/source/scale.cc +++ b/source/scale.cc @@ -953,11 +953,76 @@ static void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, } } +#define HAS_SCALECOLSUP2_SSE2 +// Reads 16 pixels, duplicates them and writes 32 pixels. +// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. 
+__declspec(naked) __declspec(align(16)) +void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int /* x */, int /* dx */) { + __asm { + mov edx, [esp + 4] // dst_ptr + mov eax, [esp + 8] // src_ptr + mov ecx, [esp + 12] // dst_width + + align 16 + wloop: + movdqa xmm0, [eax] + lea eax, [eax + 16] + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm0 + punpckhbw xmm1, xmm1 + sub ecx, 32 + movdqa [edx], xmm0 + movdqa [edx + 16], xmm1 + lea edx, [edx + 32] + jg wloop + + ret + } +} + #elif !defined(LIBYUV_DISABLE_X86) && \ ((defined(__x86_64__) && !defined(__native_client__)) || defined(__i386__)) + +// TODO(nfullagar): For Native Client: When new toolchain becomes available, +// take advantage of bundle lock / unlock feature. This will reduce the amount +// of manual bundle alignment done below, and bundle alignment could even be +// moved into each macro that doesn't use %%nacl: such as MEMOPREG. + +#if defined(__native_client__) && defined(__x86_64__) +#define MEMACCESS(base) "%%nacl:(%%r15,%q" #base ")" +#define MEMACCESS2(offset, base) "%%nacl:" #offset "(%%r15,%q" #base ")" +#define MEMLEA(offset, base) #offset "(%q" #base ")" +#define MEMLEA3(offset, index, scale) \ + #offset "(,%q" #index "," #scale ")" +#define MEMLEA4(offset, base, index, scale) \ + #offset "(%q" #base ",%q" #index "," #scale ")" +#define MEMOPREG(opcode, offset, base, index, scale, reg) \ + "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \ + #opcode " (%%r15,%%r14),%%" #reg "\n" +#define MEMOPMEM(opcode, reg, offset, base, index, scale) \ + "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \ + #opcode " %%" #reg ",(%%r15,%%r14)\n" +#define BUNDLEALIGN ".p2align 5 \n" +#else +#define MEMACCESS(base) "(%" #base ")" +#define MEMACCESS2(offset, base) #offset "(%" #base ")" +#define MEMLEA(offset, base) #offset "(%" #base ")" +#define MEMLEA3(offset, index, scale) \ + #offset "(,%" #index "," #scale ")" +#define MEMLEA4(offset, base, index, scale) \ + 
#offset "(%" #base ",%" #index "," #scale ")" +#define MEMOPREG(opcode, offset, base, index, scale, reg) \ + #opcode " " #offset "(%" #base ",%" #index "," #scale "),%%" #reg "\n" +#define MEMOPMEM(opcode, reg, offset, base, index, scale) \ + #opcode " %%" #reg ","#offset "(%" #base ",%" #index "," #scale ")\n" +#define BUNDLEALIGN +#endif + // GCC versions of row functions are verbatim conversions from Visual C. // Generated using gcc disassembly on Visual C object file: // objdump -D yuvscaler.obj >yuvscaler.txt + #define HAS_SCALEROWDOWN2_SSE2 static void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { @@ -1689,6 +1754,40 @@ static void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, ); } +// Reads 4 pixels, duplicates them and writes 8 pixels. +// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. +#define HAS_SCALECOLSUP2_SSE2 +void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int /* x */, int /* dx */) { + asm volatile ( + ".p2align 4 \n" + BUNDLEALIGN + "1: \n" + "movdqa " MEMACCESS(1) ",%%xmm0 \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "sub $0x20,%2 \n" + "movdqa %%xmm0," MEMACCESS(0) " \n" + "movdqa %%xmm1," MEMACCESS2(0x10,0) " \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "jg 1b \n" + + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1" +#endif + ); +} + #endif // defined(__x86_64__) || defined(__i386__) #if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \ @@ -1876,6 +1975,34 @@ static void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, } while (d < dend); } +// Scales a single row of pixels using point sampling. 
+void ScaleCols_C(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx) { + for (int j = 0; j < dst_width - 1; j += 2) { + dst_ptr[0] = src_ptr[x >> 16]; + x += dx; + dst_ptr[1] = src_ptr[x >> 16]; + x += dx; + dst_ptr += 2; + } + if (dst_width & 1) { + dst_ptr[0] = src_ptr[x >> 16]; + } +} + +// Scales a single row of pixels up by 2x using point sampling. +void ScaleColsUp2_C(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int, int) { + for (int j = 0; j < dst_width - 1; j += 2) { + dst_ptr[1] = dst_ptr[0] = src_ptr[0]; + src_ptr += 1; + dst_ptr += 2; + } + if (dst_width & 1) { + dst_ptr[0] = src_ptr[0]; + } +} + // (1-f)a + fb can be replaced with a + f(b-a) #define BLENDER(a, b, f) (static_cast(a) + \ ((f) * (static_cast(b) - static_cast(a)) >> 16)) @@ -2484,7 +2611,7 @@ void ScalePlaneBilinearDown(int src_width, int src_height, } else if (dst_height > 1) { dy = FixedDiv(src_height - 1, dst_height - 1); } - const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0; + const int max_y = (src_height - 1) << 16; for (int j = 0; j < dst_height; ++j) { if (y > max_y) { y = max_y; @@ -2515,6 +2642,29 @@ void ScalePlaneBilinearUp(int src_width, int src_height, assert(dst_width > 0); assert(dst_height > 0); assert(Abs(dst_width) <= kMaxStride); + int dx = 0; + int dy = 0; + int x = 0; + int y = 0; + if (dst_width <= Abs(src_width)) { + dx = FixedDiv(Abs(src_width), dst_width); + x = (dx >> 1) - 32768; + } else if (dst_width > 1) { + dx = FixedDiv(Abs(src_width) - 1, dst_width - 1); + } + // Negative src_width means horizontally mirror. 
+ if (src_width < 0) { + x += (dst_width - 1) * dx; + dx = -dx; + src_width = -src_width; + } + if (dst_height <= src_height) { + dy = FixedDiv(src_height, dst_height); + y = (dy >> 1) - 32768; + } else if (dst_height > 1) { + dy = FixedDiv(src_height - 1, dst_height - 1); + } + void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; @@ -2566,36 +2716,25 @@ void ScalePlaneBilinearUp(int src_width, int src_height, #endif void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx) = ScaleFilterCols_C; + int dst_width, int x, int dx) = + filtering ? ScaleFilterCols_C : ScaleCols_C; #if defined(HAS_SCALEFILTERCOLS_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { + if (filtering && TestCpuFlag(kCpuHasSSSE3)) { ScaleFilterCols = ScaleFilterCols_SSSE3; } #endif - int dx = 0; - int dy = 0; - int x = 0; - int y = 0; - if (dst_width <= Abs(src_width)) { - dx = FixedDiv(Abs(src_width), dst_width); - x = (dx >> 1) - 32768; - } else if (dst_width > 1) { - dx = FixedDiv(Abs(src_width) - 1, dst_width - 1); - } - // Negative src_width means horizontally mirror. - if (src_width < 0) { - x += (dst_width - 1) * dx; - dx = -dx; - src_width = -src_width; - } - if (dst_height <= src_height) { - dy = FixedDiv(src_height, dst_height); - y = (dy >> 1) - 32768; - } else if (dst_height > 1) { - dy = FixedDiv(src_height - 1, dst_height - 1); + if (!filtering && src_width * 2 == dst_width && x < 0x8000) { + ScaleFilterCols = ScaleColsUp2_C; +#if defined(HAS_SCALECOLSUP2_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) && + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) { + ScaleFilterCols = ScaleColsUp2_SSE2; + } +#endif } - const int max_y = (src_height > 1) ? 
 ((src_height - 1) << 16) - 1 : 0; + const int max_y = (src_height - 1) << 16; if (y > max_y) { y = max_y; } @@ -2616,7 +2755,11 @@ void ScalePlaneBilinearUp(int src_width, int src_height, for (int j = 0; j < dst_height; ++j) { yi = y >> 16; if (yi != lasty) { - if (y <= max_y) { + if (y > max_y) { + y = max_y; + yi = y >> 16; + } + if (yi != lasty) { ScaleFilterCols(rowptr, src, dst_width, x, dx); rowptr += rowstride; rowstride = -rowstride; @@ -2635,7 +2778,7 @@ void ScalePlaneBilinearUp(int src_width, int src_height, } } -// Scale plane to/from any dimensions, without interpolation. +// Scale Plane to/from any dimensions, without interpolation. // Fixed point math is used for performance: The upper 16 bits // of x and dx is the integer part of the source position and // the lower 16 bits are the fixed decimal part. @@ -2654,47 +2797,27 @@ static void ScalePlaneSimple(int src_width, int src_height, dx = -dx; src_width = -src_width; } - - for (int j = 0; j < dst_height; ++j) { - int xs = x; - int yi = y >> 16; - const uint8* src = src_ptr + yi * src_stride; - uint8* dst = dst_ptr; - for (int i = 0; i < dst_width; ++i) { - *dst++ = src[xs >> 16]; - xs += dx; + void (*ScaleCols)(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx) = ScaleCols_C; + if (src_width * 2 == dst_width && x < 0x8000) { + ScaleCols = ScaleColsUp2_C; +#if defined(HAS_SCALECOLSUP2_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) && + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) { + ScaleCols = ScaleColsUp2_SSE2; } +#endif + } + + for (int i = 0; i < dst_height; ++i) { + ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride, + dst_width, x, dx); dst_ptr += dst_stride; y += dy; } } -// Scale plane to/from any dimensions. 
-static void ScalePlaneAnySize(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_ptr, uint8* dst_ptr, - FilterMode filtering) { - if (filtering == kFilterBox && src_width <= kMaxStride && - dst_height * 2 < src_height ) { - ScalePlaneBox(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src_ptr, dst_ptr); - return; - } - if (filtering && dst_height > src_height && dst_width <= kMaxStride) { - ScalePlaneBilinearUp(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src_ptr, dst_ptr, filtering); - return; - } - if (filtering && src_width <= kMaxStride) { - ScalePlaneBilinearDown(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src_ptr, dst_ptr, filtering); - return; - } - ScalePlaneSimple(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src_ptr, dst_ptr); -} - // Scale a plane. // This function in turn calls a scaling function suitable for handling // the desired resolutions. @@ -2752,9 +2875,24 @@ void ScalePlane(const uint8* src, int src_stride, return; } } - // Arbitrary scale up and/or down. - ScalePlaneAnySize(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst, filtering); + if (filtering == kFilterBox && src_width <= kMaxStride && + dst_height * 2 < src_height ) { + ScalePlaneBox(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst); + return; + } + if (filtering && dst_height > src_height && dst_width <= kMaxStride) { + ScalePlaneBilinearUp(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + return; + } + if (filtering && src_width <= kMaxStride) { + ScalePlaneBilinearDown(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + return; + } + ScalePlaneSimple(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst); } // Scale an I420 image. 
diff --git a/source/scale_argb.cc b/source/scale_argb.cc index 2c9fb615e..6d102c7b4 100644 --- a/source/scale_argb.cc +++ b/source/scale_argb.cc @@ -401,6 +401,7 @@ static void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, // Reads 4 pixels, duplicates them and writes 8 pixels. // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. +#define HAS_SCALEARGBCOLSUP2_SSE2 __declspec(naked) __declspec(align(16)) void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb, int dst_width, int /* x */, int /* dx */) { @@ -735,6 +736,7 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, // Reads 4 pixels, duplicates them and writes 8 pixels. // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. +#define HAS_SCALEARGBCOLSUP2_SSE2 void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb, int dst_width, int /* x */, int /* dx */) { asm volatile ( @@ -945,6 +947,38 @@ static void ScaleARGBRowDownEvenBox_C(const uint8* src_argb, } } +// Scales a single row of pixels using point sampling. +void ScaleARGBCols_C(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) { + const uint32* src = reinterpret_cast(src_argb); + uint32* dst = reinterpret_cast(dst_argb); + for (int j = 0; j < dst_width - 1; j += 2) { + dst[0] = src[x >> 16]; + x += dx; + dst[1] = src[x >> 16]; + x += dx; + dst += 2; + } + if (dst_width & 1) { + dst[0] = src[x >> 16]; + } +} + +// Scales a single row of pixels up by 2x using point sampling. 
+void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb, + int dst_width, int, int) { + const uint32* src = reinterpret_cast(src_argb); + uint32* dst = reinterpret_cast(dst_argb); + for (int j = 0; j < dst_width - 1; j += 2) { + dst[1] = dst[0] = src[0]; + src += 1; + dst += 2; + } + if (dst_width & 1) { + dst[0] = src[0]; + } +} + // Mimics SSSE3 blender #define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b) * f) >> 7 #define BLENDERC(a, b, f, s) static_cast( \ @@ -1151,7 +1185,7 @@ static void ScaleARGBBilinearDown(int src_height, ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3; } #endif - const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0; + const int max_y = (src_height - 1) << 16; for (int j = 0; j < dst_height; ++j) { if (y > max_y) { y = max_y; @@ -1231,13 +1265,30 @@ static void ScaleARGBBilinearUp(int src_width, int src_height, } #endif void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) = ScaleARGBFilterCols_C; + int dst_width, int x, int dx) = + filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C; #if defined(HAS_SCALEARGBFILTERCOLS_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { + if (filtering && TestCpuFlag(kCpuHasSSSE3)) { ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3; } #endif - const int max_y = (src_height > 1) ? 
((src_height - 1) << 16) - 1 : 0; +#if defined(HAS_SCALEARGBCOLS_SSE2) + if (!filtering && TestCpuFlag(kCpuHasSSE2)) { + ScaleARGBFilterCols = ScaleARGBCols_SSE2; + } +#endif + if (!filtering && src_width * 2 == dst_width && x < 0x8000) { + ScaleARGBFilterCols = ScaleARGBColsUp2_C; +#if defined(HAS_SCALEARGBCOLSUP2_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) { + ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2; + } +#endif + } + + const int max_y = (src_height - 1) << 16; if (y > max_y) { y = max_y; } @@ -1258,7 +1309,11 @@ static void ScaleARGBBilinearUp(int src_width, int src_height, for (int j = 0; j < dst_height; ++j) { yi = y >> 16; if (yi != lasty) { - if (y <= max_y) { + if (y > max_y) { + y = max_y; + yi = y >> 16; + } + if (yi != lasty) { ScaleARGBFilterCols(rowptr, src, dst_width, x, dx); rowptr += rowstride; rowstride = -rowstride; @@ -1394,7 +1449,7 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height, ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3; } #endif - const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0; + const int max_y = (src_height - 1) << 16; if (y > max_y) { y = max_y; } @@ -1430,7 +1485,11 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height, for (int j = 0; j < dst_height; ++j) { yi = y >> 16; if (yi != lasty) { - if (y <= max_y) { + if (y > max_y) { + y = max_y; + yi = y >> 16; + } + if (yi != lasty) { // TODO(fbarchard): Convert the clipped region of row. I422ToARGBRow(src_row_y, src_row_u, src_row_v, argb_row, src_width); ScaleARGBFilterCols(rowptr, argb_row, dst_width, x, dx); @@ -1456,26 +1515,7 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height, } #endif -// Scales a single row of pixels using point sampling. 
-// Code is adapted from libyuv bilinear yuv scaling, but with bilinear -// interpolation off, and argb pixels instead of yuv. -void ScaleARGBCols_C(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) { - const uint32* src = reinterpret_cast(src_argb); - uint32* dst = reinterpret_cast(dst_argb); - for (int j = 0; j < dst_width - 1; j += 2) { - dst[0] = src[x >> 16]; - x += dx; - dst[1] = src[x >> 16]; - x += dx; - dst += 2; - } - if (dst_width & 1) { - dst[0] = src[x >> 16]; - } -} - -// ScaleARGB ARGB to/from any dimensions, without interpolation. +// Scale ARGB to/from any dimensions, without interpolation. // Fixed point math is used for performance: The upper 16 bits // of x and dx is the integer part of the source position and // the lower 16 bits are the fixed decimal part. @@ -1490,14 +1530,18 @@ static void ScaleARGBSimple(int src_width, int src_height, #if defined(HAS_SCALEARGBCOLS_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { ScaleARGBCols = ScaleARGBCols_SSE2; - if (src_width * 2 == dst_width && IS_ALIGNED(dst_width, 8) && - (x >> 16) == 0 && + } +#endif + if (src_width * 2 == dst_width && x < 0x8000) { + ScaleARGBCols = ScaleARGBColsUp2_C; +#if defined(HAS_SCALEARGBCOLSUP2_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) && IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) && IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) { ScaleARGBCols = ScaleARGBColsUp2_SSE2; } - } #endif + } for (int i = 0; i < dst_height; ++i) { ScaleARGBCols(dst_argb, src_argb + (y >> 16) * src_stride, @@ -1507,33 +1551,6 @@ static void ScaleARGBSimple(int src_width, int src_height, } } -// ScaleARGB ARGB to/from any dimensions. 
-static void ScaleARGBAnySize(int src_width, int src_height, - int dst_width, int dst_height, - int clip_width, int clip_height, - int src_stride, int dst_stride, - const uint8* src_argb, uint8* dst_argb, - int x, int dx, int y, int dy, - FilterMode filtering) { - if (filtering && dy < 65536 && dst_width * 4 <= kMaxStride) { - ScaleARGBBilinearUp(src_width, src_height, - clip_width, clip_height, - src_stride, dst_stride, src_argb, dst_argb, - x, dx, y, dy, filtering); - return; - } - if (filtering && src_width * 4 < kMaxStride) { - ScaleARGBBilinearDown(src_height, - clip_width, clip_height, - src_stride, dst_stride, src_argb, dst_argb, - x, dx, y, dy, filtering); - return; - } - ScaleARGBSimple(src_width, src_height, clip_width, clip_height, - src_stride, dst_stride, src_argb, dst_argb, - x, dx, y, dy); -} - // ScaleARGB a ARGB. // This function in turn calls a scaling function // suitable for handling the desired resolutions. @@ -1631,13 +1648,23 @@ static void ScaleARGB(const uint8* src, int src_stride, x, y, dy, 4, filtering); return; } - - // Arbitrary scale up and/or down. - ScaleARGBAnySize(src_width, src_height, - dst_width, dst_height, - clip_width, clip_height, - src_stride, dst_stride, src, dst, - x, dx, y, dy, filtering); + if (filtering && dy < 65536 && dst_width * 4 <= kMaxStride) { + ScaleARGBBilinearUp(src_width, src_height, + clip_width, clip_height, + src_stride, dst_stride, src, dst, + x, dx, y, dy, filtering); + return; + } + if (filtering && src_width * 4 < kMaxStride) { + ScaleARGBBilinearDown(src_height, + clip_width, clip_height, + src_stride, dst_stride, src, dst, + x, dx, y, dy, filtering); + return; + } + ScaleARGBSimple(src_width, src_height, clip_width, clip_height, + src_stride, dst_stride, src, dst, + x, dx, y, dy); } LIBYUV_API