From 4339f09d60c55c58af24ff4afd53fe08fde0a9d1 Mon Sep 17 00:00:00 2001
From: "fbarchard@google.com"
 <fbarchard@google.com@16f28f9a-4ce2-e073-06de-1de4eb20be90>
Date: Thu, 23 Feb 2012 10:52:55 +0000
Subject: [PATCH] bilinear scaling improvements

-subpixel accurate for scaling down by passing in x and dx.
-blend for bilinear use single multiply instead of 2 multiplies.
-filter col do 2 pixels at a time
-assembly do address munging
-assembly avoid wait on store for core2

BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/397012

git-svn-id: http://libyuv.googlecode.com/svn/trunk@187 16f28f9a-4ce2-e073-06de-1de4eb20be90
---
 README.chromium          |   2 +-
 include/libyuv/version.h |   2 +-
 source/scale.cc          | 283 ++++++++++++++++++++-------------------
 3 files changed, 144 insertions(+), 143 deletions(-)

diff --git a/README.chromium b/README.chromium
index 4f30242dd..074c8391c 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 185
+Version: 186
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 2213df408..254519b97 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,7 +11,7 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 185
+#define LIBYUV_VERSION 186
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_
 
diff --git a/source/scale.cc b/source/scale.cc
index b12238580..b31e0b694 100644
--- a/source/scale.cc
+++ b/source/scale.cc
@@ -1316,6 +1316,7 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
     mov        edx, [esp + 8 + 12]  // src_stride
     mov        ecx, [esp + 8 + 16]  // dst_width
     mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
+    sub        edi, esi
     cmp        eax, 0
     je         xloop1
     cmp        eax, 128
@@ -1334,7 +1335,6 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
   xloop:
     movdqa     xmm0, [esi]
     movdqa     xmm2, [esi + edx]
-    lea        esi, [esi + 16]
     movdqa     xmm1, xmm0
     movdqa     xmm3, xmm2
     punpcklbw  xmm0, xmm7
@@ -1350,43 +1350,40 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
     psrlw      xmm0, 8
     psrlw      xmm1, 8
     packuswb   xmm0, xmm1
-    movdqa     [edi], xmm0
-    lea        edi, [edi + 16]
     sub        ecx, 16
+    movdqa     [esi + edi], xmm0
+    lea        esi, [esi + 16]
     ja         xloop
 
-    mov        al, [edi - 1]
-    mov        [edi], al
+    mov        al, [esi + edi - 1]
+    mov        [esi + edi], al
     pop        edi
     pop        esi
     ret
 
   xloop1:
     movdqa     xmm0, [esi]
-    lea        esi, [esi + 16]
-    movdqa     [edi], xmm0
-    lea        edi, [edi + 16]
     sub        ecx, 16
+    movdqa     [esi + edi], xmm0
+    lea        esi, [esi + 16]
     ja         xloop1
 
-    mov        al, [edi - 1]
-    mov        [edi], al
+    mov        al, [esi + edi - 1]
+    mov        [esi + edi], al
     pop        edi
     pop        esi
     ret
 
   xloop2:
     movdqa     xmm0, [esi]
-    movdqa     xmm2, [esi + edx]
-    lea        esi, [esi + 16]
-    pavgb      xmm0, xmm2
-    movdqa     [edi], xmm0
-    lea        edi, [edi + 16]
+    pavgb      xmm0, [esi + edx]
     sub        ecx, 16
+    movdqa     [esi + edi], xmm0
+    lea        esi, [esi + 16]
     ja         xloop2
 
-    mov        al, [edi - 1]
-    mov        [edi], al
+    mov        al, [esi + edi - 1]
+    mov        [esi + edi], al
     pop        edi
     pop        esi
     ret
@@ -1407,6 +1404,7 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
     mov        edx, [esp + 8 + 12]  // src_stride
     mov        ecx, [esp + 8 + 16]  // dst_width
     mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
+    sub        edi, esi
     cmp        eax, 0
     je         xloop1
     cmp        eax, 128
@@ -1423,7 +1421,6 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
   xloop:
     movdqa     xmm0, [esi]
     movdqa     xmm2, [esi + edx]
-    lea        esi, [esi + 16]
     movdqa     xmm1, xmm0
     punpcklbw  xmm0, xmm2
     punpckhbw  xmm1, xmm2
@@ -1432,47 +1429,43 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
     psrlw      xmm0, 7
     psrlw      xmm1, 7
     packuswb   xmm0, xmm1
-    movdqa     [edi], xmm0
-    lea        edi, [edi + 16]
     sub        ecx, 16
+    movdqa     [esi + edi], xmm0
+    lea        esi, [esi + 16]
     ja         xloop
 
-    mov        al, [edi - 1]
-    mov        [edi], al
+    mov        al, [esi + edi - 1]
+    mov        [esi + edi], al
     pop        edi
     pop        esi
     ret
 
   xloop1:
     movdqa     xmm0, [esi]
-    lea        esi, [esi + 16]
-    movdqa     [edi], xmm0
-    lea        edi, [edi + 16]
     sub        ecx, 16
+    movdqa     [esi + edi], xmm0
+    lea        esi, [esi + 16]
     ja         xloop1
 
-    mov        al, [edi - 1]
-    mov        [edi], al
+    mov        al, [esi + edi - 1]
+    mov        [esi + edi], al
     pop        edi
     pop        esi
     ret
 
   xloop2:
     movdqa     xmm0, [esi]
-    movdqa     xmm2, [esi + edx]
-    lea        esi, [esi + 16]
-    pavgb      xmm0, xmm2
-    movdqa     [edi], xmm0
-    lea        edi, [edi + 16]
+    pavgb      xmm0, [esi + edx]
     sub        ecx, 16
+    movdqa     [esi + edi], xmm0
+    lea        esi, [esi + 16]
     ja         xloop2
 
-    mov        al, [edi - 1]
-    mov        [edi], al
+    mov        al, [esi + edi - 1]
+    mov        [esi + edi], al
     pop        edi
     pop        esi
     ret
-
   }
 }
 
@@ -2081,6 +2074,7 @@ extern "C" void ScaleFilterRows_SSE2(uint8* dst_ptr,
     "mov    0x14(%esp),%edx                    \n"
     "mov    0x18(%esp),%ecx                    \n"
     "mov    0x1c(%esp),%eax                    \n"
+    "sub    %esi, %edi                         \n"
     "cmp    $0x0,%eax                          \n"
     "je     2f                                 \n"
     "cmp    $0x80,%eax                         \n"
@@ -2098,7 +2092,6 @@ extern "C" void ScaleFilterRows_SSE2(uint8* dst_ptr,
 "1:"
     "movdqa (%esi),%xmm0                       \n"
     "movdqa (%esi,%edx,1),%xmm2                \n"
-    "lea    0x10(%esi),%esi                    \n"
     "movdqa %xmm0,%xmm1                        \n"
     "movdqa %xmm2,%xmm3                        \n"
     "punpcklbw %xmm7,%xmm0                     \n"
@@ -2114,42 +2107,40 @@ extern "C" void ScaleFilterRows_SSE2(uint8* dst_ptr,
     "psrlw  $0x8,%xmm0                         \n"
     "psrlw  $0x8,%xmm1                         \n"
     "packuswb %xmm1,%xmm0                      \n"
-    "movdqa %xmm0,(%edi)                       \n"
-    "lea    0x10(%edi),%edi                    \n"
     "sub    $0x10,%ecx                         \n"
+    "movdqa %xmm0,(%esi,%edi,1)                \n"
+    "lea    0x10(%esi),%esi                    \n"
     "ja     1b                                 \n"
-    "mov    -0x1(%edi),%al                     \n"
-    "mov    %al,(%edi)                         \n"
+
+    "mov    -0x1(%esi,%edi,1),%al              \n"
+    "mov    %al,(%esi,%edi,1)                  \n"
     "pop    %edi                               \n"
     "pop    %esi                               \n"
     "ret                                       \n"
 
 "2:"
     "movdqa (%esi),%xmm0                       \n"
-    "lea    0x10(%esi),%esi                    \n"
-    "movdqa %xmm0,(%edi)                       \n"
-    "lea    0x10(%edi),%edi                    \n"
     "sub    $0x10,%ecx                         \n"
+    "movdqa %xmm0,(%esi,%edi,1)                \n"
+    "lea    0x10(%esi),%esi                    \n"
     "ja     2b                                 \n"
 
-    "mov    -0x1(%edi),%al                     \n"
-    "mov    %al,(%edi)                         \n"
+    "mov    -0x1(%esi,%edi,1),%al              \n"
+    "mov    %al,(%esi,%edi,1)                  \n"
     "pop    %edi                               \n"
     "pop    %esi                               \n"
     "ret                                       \n"
 
 "3:"
     "movdqa (%esi),%xmm0                       \n"
-    "movdqa (%esi,%edx,1),%xmm2                \n"
-    "lea    0x10(%esi),%esi                    \n"
-    "pavgb  %xmm2,%xmm0                        \n"
-    "movdqa %xmm0,(%edi)                       \n"
-    "lea    0x10(%edi),%edi                    \n"
+    "pavgb  (%esi,%edx,1),%xmm0                \n"
     "sub    $0x10,%ecx                         \n"
+    "movdqa %xmm0,(%esi,%edi,1)                \n"
+    "lea    0x10(%esi),%esi                    \n"
     "ja     3b                                 \n"
 
-    "mov    -0x1(%edi),%al                     \n"
-    "mov    %al,(%edi)                         \n"
+    "mov    -0x1(%esi,%edi,1),%al              \n"
+    "mov    %al,(%esi,%edi,1)                  \n"
     "pop    %edi                               \n"
     "pop    %esi                               \n"
     "ret                                       \n"
@@ -2169,6 +2160,7 @@ extern "C" void ScaleFilterRows_SSSE3(uint8* dst_ptr,
     "mov    0x14(%esp),%edx                    \n"
     "mov    0x18(%esp),%ecx                    \n"
     "mov    0x1c(%esp),%eax                    \n"
+    "sub    %esi, %edi                         \n"
     "cmp    $0x0,%eax                          \n"
     "je     2f                                 \n"
     "cmp    $0x80,%eax                         \n"
@@ -2184,7 +2176,6 @@ extern "C" void ScaleFilterRows_SSSE3(uint8* dst_ptr,
 "1:"
     "movdqa (%esi),%xmm0                       \n"
     "movdqa (%esi,%edx,1),%xmm2                \n"
-    "lea    0x10(%esi),%esi                    \n"
     "movdqa %xmm0,%xmm1                        \n"
     "punpcklbw %xmm2,%xmm0                     \n"
     "punpckhbw %xmm2,%xmm1                     \n"
@@ -2193,40 +2184,40 @@ extern "C" void ScaleFilterRows_SSSE3(uint8* dst_ptr,
     "psrlw  $0x7,%xmm0                         \n"
     "psrlw  $0x7,%xmm1                         \n"
     "packuswb %xmm1,%xmm0                      \n"
-    "movdqa %xmm0,(%edi)                       \n"
-    "lea    0x10(%edi),%edi                    \n"
     "sub    $0x10,%ecx                         \n"
+    "movdqa %xmm0,(%esi,%edi,1)                \n"
+    "lea    0x10(%esi),%esi                    \n"
     "ja     1b                                 \n"
-    "mov    -0x1(%edi),%al                     \n"
-    "mov    %al,(%edi)                         \n"
+
+    "mov    -0x1(%esi,%edi,1),%al              \n"
+    "mov    %al,(%esi,%edi,1)                  \n"
     "pop    %edi                               \n"
     "pop    %esi                               \n"
     "ret                                       \n"
 
 "2:"
     "movdqa (%esi),%xmm0                       \n"
-    "lea    0x10(%esi),%esi                    \n"
-    "movdqa %xmm0,(%edi)                       \n"
-    "lea    0x10(%edi),%edi                    \n"
     "sub    $0x10,%ecx                         \n"
+    "movdqa %xmm0,(%esi,%edi,1)                \n"
+    "lea    0x10(%esi),%esi                    \n"
     "ja     2b                                 \n"
-    "mov    -0x1(%edi),%al                     \n"
-    "mov    %al,(%edi)                         \n"
+
+    "mov    -0x1(%esi,%edi,1),%al              \n"
+    "mov    %al,(%esi,%edi,1)                  \n"
     "pop    %edi                               \n"
     "pop    %esi                               \n"
     "ret                                       \n"
 
 "3:"
     "movdqa (%esi),%xmm0                       \n"
-    "movdqa (%esi,%edx,1),%xmm2                \n"
-    "lea    0x10(%esi),%esi                    \n"
-    "pavgb  %xmm2,%xmm0                        \n"
-    "movdqa %xmm0,(%edi)                       \n"
-    "lea    0x10(%edi),%edi                    \n"
+    "pavgb  (%esi,%edx,1),%xmm0                \n"
     "sub    $0x10,%ecx                         \n"
+    "movdqa %xmm0,(%esi,%edi,1)                \n"
+    "lea    0x10(%esi),%esi                    \n"
     "ja     3b                                 \n"
-    "mov    -0x1(%edi),%al                     \n"
-    "mov    %al,(%edi)                         \n"
+
+    "mov    -0x1(%esi,%edi,1),%al              \n"
+    "mov    %al,(%esi,%edi,1)                  \n"
     "pop    %edi                               \n"
     "pop    %esi                               \n"
     "ret                                       \n"
@@ -2921,16 +2912,30 @@ static void ScaleFilterCols34_C(uint8* dst_ptr, const uint8* src_ptr,
 }
 #endif
 
-static void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
-                              int dst_width, int dx) {
-  int x = 0;
-  for (int j = 0; j < dst_width; ++j) {
-    int xi = x >> 16;
-    int xf1 = x & 0xffff;
-    int xf0 = 65536 - xf1;
+// (1-f)a + fb can be replaced with a + f(b-a)
+#define BLENDER(a, b, f) ((int)(a) + ((f) * ((int)(b) - (int)(a)) >> 16))
 
-    *dst_ptr++ = (src_ptr[xi] * xf0 + src_ptr[xi + 1] * xf1) >> 16;
+// TODO(fbarchard): consider +0x8000 for rounding if it can be done for free.
+static void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
+                              int dst_width, int x, int dx) {
+  for (int j = 0; j < dst_width - 1; j += 2) {
+    int xi = x >> 16;
+    int a = src_ptr[xi];
+    int b = src_ptr[xi + 1];
+    dst_ptr[0] = BLENDER(a, b, x & 0xffff);
     x += dx;
+    xi = x >> 16;
+    a = src_ptr[xi];
+    b = src_ptr[xi + 1];
+    dst_ptr[1] = BLENDER(a, b, x & 0xffff);
+    x += dx;
+    dst_ptr += 2;
+  }
+  if (dst_width & 1) {
+    int xi = x >> 16;
+    int a = src_ptr[xi];
+    int b = src_ptr[xi + 1];
+    dst_ptr[0] = BLENDER(a, b, x & 0xffff);
   }
 }
 
@@ -3340,10 +3345,9 @@ static __inline uint32 SumBox(int iboxwidth, int iboxheight,
   return sum;
 }
 
-static void ScalePlaneBoxRow(int dst_width, int boxheight,
-                             int dx, int src_stride,
-                             const uint8* src_ptr, uint8* dst_ptr) {
-  int x = 0;
+static void ScalePlaneBoxRow_C(int dst_width, int boxheight,
+                               int x, int dx, int src_stride,
+                               const uint8* src_ptr, uint8* dst_ptr) {
   for (int i = 0; i < dst_width; ++i) {
     int ix = x >> 16;
     x += dx;
@@ -3362,14 +3366,13 @@ static __inline uint32 SumPixels(int iboxwidth, const uint16* src_ptr) {
   return sum;
 }
 
-static void ScaleAddCols2_C(int dst_width, int boxheight, int dx,
+static void ScaleAddCols2_C(int dst_width, int boxheight, int x, int dx,
                             const uint16* src_ptr, uint8* dst_ptr) {
   int scaletbl[2];
   int minboxwidth = (dx >> 16);
   scaletbl[0] = 65536 / (minboxwidth * boxheight);
   scaletbl[1] = 65536 / ((minboxwidth + 1) * boxheight);
   int *scaleptr = scaletbl - minboxwidth;
-  int x = 0;
   for (int i = 0; i < dst_width; ++i) {
     int ix = x >> 16;
     x += dx;
@@ -3378,11 +3381,10 @@ static void ScaleAddCols2_C(int dst_width, int boxheight, int dx,
   }
 }
 
-static void ScaleAddCols1_C(int dst_width, int boxheight, int dx,
+static void ScaleAddCols1_C(int dst_width, int boxheight, int x, int dx,
                             const uint16* src_ptr, uint8* dst_ptr) {
   int boxwidth = (dx >> 16);
   int scaleval = 65536 / (boxwidth * boxheight);
-  int x = 0;
   for (int i = 0; i < dst_width; ++i) {
     *dst_ptr++ = SumPixels(boxwidth, src_ptr + x) * scaleval >> 16;
     x += boxwidth;
@@ -3404,33 +3406,32 @@ static void ScalePlaneBox(int src_width, int src_height,
                           const uint8* src_ptr, uint8* dst_ptr) {
   assert(dst_width > 0);
   assert(dst_height > 0);
-  int dy = (src_height << 16) / dst_height;
   int dx = (src_width << 16) / dst_width;
+  int dy = (src_height << 16) / dst_height;
+  int x = (dx >= 65536) ? ((dx >> 1) - 32768) : (dx >> 1);
+  int y = (dy >= 65536) ? ((dy >> 1) - 32768) : (dy >> 1);
+  int maxy = (src_height << 16);
   if (!IS_ALIGNED(src_width, 16) || (src_width > kMaxInputWidth) ||
       dst_height * 2 > src_height) {
     uint8* dst = dst_ptr;
-    int dy = (src_height << 16) / dst_height;
-    int dx = (src_width << 16) / dst_width;
-    int y = 0;
     for (int j = 0; j < dst_height; ++j) {
       int iy = y >> 16;
-      const uint8* const src = src_ptr + iy * src_stride;
+      const uint8* src = src_ptr + iy * src_stride;
       y += dy;
-      if (y > (src_height << 16)) {
-        y = (src_height << 16);
+      if (y > maxy) {
+        y = maxy;
       }
       int boxheight = (y >> 16) - iy;
-      ScalePlaneBoxRow(dst_width, boxheight,
-                       dx, src_stride,
-                       src, dst);
-
+      ScalePlaneBoxRow_C(dst_width, boxheight,
+                         x, dx, src_stride,
+                         src, dst);
       dst += dst_stride;
     }
   } else {
     ALIGN16(uint16 row[kMaxInputWidth]);
     void (*ScaleAddRows)(const uint8* src_ptr, int src_stride,
                          uint16* dst_ptr, int src_width, int src_height);
-    void (*ScaleAddCols)(int dst_width, int boxheight, int dx,
+    void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx,
                          const uint16* src_ptr, uint8* dst_ptr);
 #if defined(HAS_SCALEADDROWS_SSE2)
     if (TestCpuFlag(kCpuHasSSE2) &&
@@ -3447,17 +3448,16 @@ static void ScalePlaneBox(int src_width, int src_height,
       ScaleAddCols = ScaleAddCols1_C;
     }
 
-    int y = 0;
     for (int j = 0; j < dst_height; ++j) {
       int iy = y >> 16;
-      const uint8* const src = src_ptr + iy * src_stride;
+      const uint8* src = src_ptr + iy * src_stride;
       y += dy;
       if (y > (src_height << 16)) {
         y = (src_height << 16);
       }
       int boxheight = (y >> 16) - iy;
       ScaleAddRows(src, src_stride, row, src_width, boxheight);
-      ScaleAddCols(dst_width, boxheight, dx, row, dst_ptr);
+      ScaleAddCols(dst_width, boxheight, x, dx, row, dst_ptr);
       dst_ptr += dst_stride;
     }
   }
@@ -3470,33 +3470,34 @@ static void ScalePlaneBilinearSimple(int src_width, int src_height,
                                      int dst_width, int dst_height,
                                      int src_stride, int dst_stride,
                                      const uint8* src_ptr, uint8* dst_ptr) {
-  uint8* dst = dst_ptr;
   int dx = (src_width << 16) / dst_width;
   int dy = (src_height << 16) / dst_height;
-  int maxx = ((src_width - 1) << 16) - 1;
-  int maxy = ((src_height - 1) << 16) - 1;
-  int y = (dst_height < src_height) ? 32768 :
-      (src_height << 16) / dst_height - 32768;
+  int y = (dy >= 65536) ? ((dy >> 1) - 32768) : (dy >> 1);
+  int maxx = (src_width > 1) ? ((src_width - 1) << 16) - 1 : 0;
+  int maxy = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
   for (int i = 0; i < dst_height; ++i) {
-    int cy = (y < 0) ? 0 : y;
-    int yi = cy >> 16;
-    int yf = cy & 0xffff;
-    const uint8* const src = src_ptr + yi * src_stride;
-    int x = (dst_width < src_width) ? 32768 :
-        (src_width << 16) / dst_width - 32768;
+    int x = (dx >= 65536) ? ((dx >> 1) - 32768) : (dx >> 1);
+    int yi = y >> 16;
+    int yf = y & 0xffff;
+    const uint8* src0 = src_ptr + yi * src_stride;
+    const uint8* src1 = (yi < src_height - 1) ? src0 + src_stride : src0;
+    uint8* dst = dst_ptr;
     for (int j = 0; j < dst_width; ++j) {
-      int cx = (x < 0) ? 0 : x;
-      int xi = cx >> 16;
-      int xf = cx & 0xffff;
-      int r0 = (src[xi] * (65536 - xf) + src[xi + 1] * xf) >> 16;
-      int r1 = (src[xi + src_stride] * (65536 - xf) +
-          src[xi + src_stride + 1] * xf) >> 16;
-      *dst++ = (r0 * (65536 - yf) + r1 * yf) >> 16;
+      int xi = x >> 16;
+      int xf = x & 0xffff;
+      int x1 = (xi < src_width - 1) ? xi + 1 : xi;
+      int a = src0[xi];
+      int b = src0[x1];
+      int r0 = BLENDER(a, b, xf);
+      a = src1[xi];
+      b = src1[x1];
+      int r1 = BLENDER(a, b, xf);
+      *dst++ = BLENDER(r0, r1, yf);
       x += dx;
       if (x > maxx)
         x = maxx;
     }
-    dst += dst_stride - dst_width;
+    dst_ptr += dst_stride;
     y += dy;
     if (y > maxy)
       y = maxy;
@@ -3513,8 +3514,6 @@ void ScalePlaneBilinear(int src_width, int src_height,
                         const uint8* src_ptr, uint8* dst_ptr) {
   assert(dst_width > 0);
   assert(dst_height > 0);
-  int dy = (src_height << 16) / dst_height;
-  int dx = (src_width << 16) / dst_width;
   if (!IS_ALIGNED(src_width, 8) || (src_width > kMaxInputWidth)) {
     ScalePlaneBilinearSimple(src_width, src_height, dst_width, dst_height,
                              src_stride, dst_stride, src_ptr, dst_ptr);
@@ -3524,8 +3523,6 @@ void ScalePlaneBilinear(int src_width, int src_height,
     void (*ScaleFilterRows)(uint8* dst_ptr, const uint8* src_ptr,
                             int src_stride,
                             int dst_width, int source_y_fraction);
-    void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr,
-                            int dst_width, int dx);
 #if defined(HAS_SCALEFILTERROWS_NEON)
     if (TestCpuFlag(kCpuHasNEON)) {
       ScaleFilterRows = ScaleFilterRows_NEON;
@@ -3546,16 +3543,18 @@ void ScalePlaneBilinear(int src_width, int src_height,
     {
       ScaleFilterRows = ScaleFilterRows_C;
     }
-    ScaleFilterCols = ScaleFilterCols_C;
 
-    int y = 0;
-    int maxy = ((src_height - 1) << 16) - 1; // max is filter of last 2 rows.
+    int dx = (src_width << 16) / dst_width;
+    int dy = (src_height << 16) / dst_height;
+    int x = (dx >= 65536) ? ((dx >> 1) - 32768) : (dx >> 1);
+    int y = (dy >= 65536) ? ((dy >> 1) - 32768) : (dy >> 1);
+    int maxy = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
     for (int j = 0; j < dst_height; ++j) {
-      int iy = y >> 16;
-      int fy = (y >> 8) & 255;
-      const uint8* const src = src_ptr + iy * src_stride;
-      ScaleFilterRows(row, src, src_stride, src_width, fy);
-      ScaleFilterCols(dst_ptr, row, dst_width, dx);
+      int yi = y >> 16;
+      int yf = (y >> 8) & 255;
+      const uint8* src = src_ptr + yi * src_stride;
+      ScaleFilterRows(row, src, src_stride, src_width, yf);
+      ScaleFilterCols_C(dst_ptr, row, dst_width, x, dx);
       dst_ptr += dst_stride;
       y += dy;
       if (y > maxy) {
@@ -3575,18 +3574,20 @@ static void ScalePlaneSimple(int src_width, int src_height,
                              int dst_width, int dst_height,
                              int src_stride, int dst_stride,
                              const uint8* src_ptr, uint8* dst_ptr) {
-  uint8* dst = dst_ptr;
   int dx = (src_width << 16) / dst_width;
-  for (int y = 0; y < dst_height; ++y) {
-    const uint8* const src = src_ptr + (y * src_height / dst_height) *
-        src_stride;
-    // TODO(fbarchard): Round X coordinate by setting x=0x8000.
-    int x = 0;
+  int dy = (src_height << 16) / dst_height;
+  int y = (dy >= 65536) ? ((dy >> 1) - 32768) : (dy >> 1);
+  for (int j = 0; j < dst_height; ++j) {
+    int x = (dx >= 65536) ? ((dx >> 1) - 32768) : (dx >> 1);
+    int yi = y >> 16;
+    const uint8* src = src_ptr + yi * src_stride;
+    uint8* dst = dst_ptr;
     for (int i = 0; i < dst_width; ++i) {
       *dst++ = src[x >> 16];
       x += dx;
     }
-    dst += dst_stride - dst_width;
+    dst_ptr += dst_stride;
+    y += dy;
   }
 }
 
@@ -3790,9 +3791,9 @@ int ScaleOffset(const uint8* src, int src_width, int src_height,
   int dst_halfwidth = (dst_width + 1) >> 1;
   int dst_halfheight = (dst_height + 1) >> 1;
   int aheight = dst_height - dst_yoffset * 2;  // actual output height
-  const uint8* const src_y = src;
-  const uint8* const src_u = src + src_width * src_height;
-  const uint8* const src_v = src + src_width * src_height +
+  const uint8* src_y = src;
+  const uint8* src_u = src + src_width * src_height;
+  const uint8* src_v = src + src_width * src_height +
                              src_halfwidth * src_halfheight;
   uint8* dst_y = dst + dst_yoffset * dst_width;
   uint8* dst_u = dst + dst_width * dst_height +