diff --git a/common/common.h b/common/common.h
deleted file mode 100644
index 79e9af616..000000000
--- a/common/common.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef LIBYUV_SOURCE_COMMON_H_
-#define LIBYUV_SOURCE_COMMON_H_
-
-#if defined(_MSC_VER)
-// warning C4355: 'this' : used in base member initializer list
-#pragma warning(disable:4355)
-#endif
-
-#ifndef ENABLE_DEBUG
-#define ENABLE_DEBUG _DEBUG
-#endif  // !defined(ENABLE_DEBUG)
-
-#if ENABLE_DEBUG
-
-#if defined(_MSC_VER) && _MSC_VER < 1300
-#define __FUNCTION__ ""
-#endif
-#else // !ENABLE_DEBUG
-
-#endif // !ENABLE_DEBUG
-
-// Forces compiler to inline, even against its better judgement. Use wisely.
-#if defined(__GNUC__)
-#define FORCE_INLINE __attribute__((always_inline))
-#elif defined(WIN32)
-#define FORCE_INLINE __forceinline
-#else
-#define FORCE_INLINE
-#endif
-
-#endif // LIBYUV_SOURCE_COMMON_H_
diff --git a/include/format_conversion.h b/include/format_conversion.h
deleted file mode 100644
index e73d4e50a..000000000
--- a/include/format_conversion.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef LIBYUV_INCLUDE_FORMATCONVERSION_H_
-#define LIBYUV_INCLUDE_FORMATCONVERSION_H_
-
-#include "basic_types.h"
-
-namespace libyuv {
-
-// Converts any Bayer RGB format to I420.
-void BayerRGBToI420(const uint8* src_bayer, int src_pitch_bayer,
-                    uint32 src_fourcc_bayer,
-                    uint8* dst_y, int dst_pitch_y,
-                    uint8* dst_u, int dst_pitch_u,
-                    uint8* dst_v, int dst_pitch_v,
-                    int width, int height);
-
-// Converts any Bayer RGB format to ARGB.
-void BayerRGBToARGB(const uint8* src_bayer, int src_pitch_bayer,
-                    uint32 src_fourcc_bayer,
-                    uint8* dst_rgb, int dst_pitch_rgb,
-                    int width, int height);
-
-// Converts ARGB to any Bayer RGB format.
-void ARGBToBayerRGB(const uint8* src_rgb, int src_pitch_rgb,
-                    uint8* dst_bayer, int dst_pitch_bayer,
-                    uint32 dst_fourcc_bayer,
-                    int width, int height);
-
-}  // namespace libyuv
-
-#endif  // LIBYUV_INCLUDE_FORMATCONVERSION_H_
diff --git a/include/libyuv.h b/include/libyuv.h
new file mode 100644
index 000000000..81af8c427
--- /dev/null
+++ b/include/libyuv.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef LIBYUV_INCLUDE_LIBYUV_H_
+#define LIBYUV_INCLUDE_LIBYUV_H_
+
+#include "libyuv/basic_types.h"
+#include "libyuv/convert.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/format_conversion.h"
+#include "libyuv/general.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/scale.h"
+
+#endif  // LIBYUV_INCLUDE_LIBYUV_H_
diff --git a/common/basic_types.h b/include/libyuv/basic_types.h
similarity index 51%
rename from common/basic_types.h
rename to include/libyuv/basic_types.h
index a553a3961..5adc2bfdb 100644
--- a/common/basic_types.h
+++ b/include/libyuv/basic_types.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef LIBYUV_COMMON_BASIC_TYPES_H_
-#define LIBYUV_COMMON_BASIC_TYPES_H_
+#ifndef INCLUDE_LIBYUV_BASIC_TYPES_H_
+#define INCLUDE_LIBYUV_BASIC_TYPES_H_
 
 #include <stddef.h>  // for NULL, size_t
 
@@ -17,11 +17,6 @@
 #include <stdint.h>  // for uintptr_t
 #endif
 
-#ifdef HAVE_CONFIG_H
-#include "config.h"
-#endif
-
-
 #ifndef INT_TYPES_DEFINED
 #define INT_TYPES_DEFINED
 #ifdef COMPILER_MSVC
@@ -59,48 +54,15 @@ typedef unsigned short uint16;
 typedef unsigned char uint8;
 #endif  // INT_TYPES_DEFINED
 
-#ifdef WIN32
-typedef int socklen_t;
-#endif
-
-namespace libyuv {
-  template<class T> inline T _min(T a, T b) { return (a > b) ? b : a; }
-  template<class T> inline T _max(T a, T b) { return (a < b) ? b : a; }
-
-  // For wait functions that take a number of milliseconds, kForever indicates
-  // unlimited time.
-  const int kForever = -1;
-}
-
 // Detect compiler is for x86 or x64.
 #if defined(__x86_64__) || defined(_M_X64) || \
     defined(__i386__) || defined(_M_IX86)
 #define CPU_X86 1
 #endif
 
-#ifdef WIN32
-#define alignof(t) __alignof(t)
-#else  // !WIN32
-#define alignof(t) __alignof__(t)
-#endif  // !WIN32
 #define IS_ALIGNED(p, a) (0==(reinterpret_cast<uintptr_t>(p) & ((a)-1)))
 #define ALIGNP(p, t) \
   (reinterpret_cast<uint8*>(((reinterpret_cast<uintptr_t>(p) + \
   ((t)-1)) & ~((t)-1))))
 
-#ifndef UNUSED
-#define UNUSED(x) Unused(static_cast<const void *>(&x))
-#define UNUSED2(x,y) Unused(static_cast<const void *>(&x)); Unused(static_cast<const void *>(&y))
-#define UNUSED3(x,y,z) Unused(static_cast<const void *>(&x)); Unused(static_cast<const void *>(&y)); Unused(static_cast<const void *>(&z))
-#define UNUSED4(x,y,z,a) Unused(static_cast<const void *>(&x)); Unused(static_cast<const void *>(&y)); Unused(static_cast<const void *>(&z)); Unused(static_cast<const void *>(&a))
-#define UNUSED5(x,y,z,a,b) Unused(static_cast<const void *>(&x)); Unused(static_cast<const void *>(&y)); Unused(static_cast<const void *>(&z)); Unused(static_cast<const void *>(&a)); Unused(static_cast<const void *>(&b))
-inline void Unused(const void *) { }
-#endif // UNUSED
-
-#if defined(__GNUC__)
-#define GCC_ATTR(x) __attribute__ ((x))
-#else  // !__GNUC__
-#define GCC_ATTR(x)
-#endif  // !__GNUC__
-
-#endif // LIBYUV_COMMON_BASIC_TYPES_H_
+#endif // INCLUDE_LIBYUV_BASIC_TYPES_H_
diff --git a/include/convert.h b/include/libyuv/convert.h
similarity index 96%
rename from include/convert.h
rename to include/libyuv/convert.h
index 731f624cd..c08011ef5 100644
--- a/include/convert.h
+++ b/include/libyuv/convert.h
@@ -9,10 +9,10 @@
  */
 
 
-#ifndef LIBYUV_INCLUDE_CONVERT_H_
-#define LIBYUV_INCLUDE_CONVERT_H_
+#ifndef INCLUDE_LIBYUV_CONVERT_H_
+#define INCLUDE_LIBYUV_CONVERT_H_
 
-#include "basic_types.h"
+#include "libyuv/basic_types.h"
 
 namespace libyuv {
 
@@ -106,4 +106,4 @@ NV12ToRGB565(const uint8* src_yplane, int src_ystride,
 
 } //  namespace libyuv
 
-#endif // LIBYUV_INCLUDE_CONVERT_H_
+#endif // INCLUDE_LIBYUV_CONVERT_H_
diff --git a/source/cpu_id.h b/include/libyuv/cpu_id.h
similarity index 88%
rename from source/cpu_id.h
rename to include/libyuv/cpu_id.h
index ae33238ba..efe17e23e 100644
--- a/source/cpu_id.h
+++ b/include/libyuv/cpu_id.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef LIBYUV_SOURCE_CPU_ID_H_
-#define LIBYUV_SOURCE_CPU_ID_H_
+#ifndef INCLUDE_LIBYUV_CPU_ID_H_
+#define INCLUDE_LIBYUV_CPU_ID_H_
 
 namespace libyuv {
 
@@ -28,4 +28,4 @@ void MaskCpuFlagsForTest(int enable_flags);
 
 }  // namespace libyuv
 
-#endif  // LIBYUV_SOURCE_CPU_ID_H_
+#endif  // INCLUDE_LIBYUV_CPU_ID_H_
diff --git a/include/libyuv/format_conversion.h b/include/libyuv/format_conversion.h
new file mode 100644
index 000000000..d3d36f388
--- /dev/null
+++ b/include/libyuv/format_conversion.h
@@ -0,0 +1,41 @@
+/*
+ *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef INCLUDE_LIBYUV_FORMATCONVERSION_H_
+#define INCLUDE_LIBYUV_FORMATCONVERSION_H_
+
+#include "libyuv/basic_types.h"
+
+namespace libyuv {
+
+// Converts any Bayer RGB format to I420.
+int BayerRGBToI420(const uint8* src_bayer, int src_stride_bayer,
+                   uint32 src_fourcc_bayer,
+                   uint8* dst_y, int dst_stride_y,
+                   uint8* dst_u, int dst_stride_u,
+                   uint8* dst_v, int dst_stride_v,
+                   int width, int height);
+
+// Converts any Bayer RGB format to ARGB.
+int BayerRGBToARGB(const uint8* src_bayer, int src_stride_bayer,
+                   uint32 src_fourcc_bayer,
+                   uint8* dst_rgb, int dst_stride_rgb,
+                   int width, int height);
+
+// Converts ARGB to any Bayer RGB format.
+int ARGBToBayerRGB(const uint8* src_rgb, int src_stride_rgb,
+                   uint8* dst_bayer, int dst_stride_bayer,
+                   uint32 dst_fourcc_bayer,
+                   int width, int height);
+
+}  // namespace libyuv
+
+#endif  // INCLUDE_LIBYUV_FORMATCONVERSION_H_
diff --git a/include/general.h b/include/libyuv/general.h
similarity index 88%
rename from include/general.h
rename to include/libyuv/general.h
index 9450e3782..3cd9d3234 100644
--- a/include/general.h
+++ b/include/libyuv/general.h
@@ -13,23 +13,22 @@
  * General operations on YUV images.
  */
 
-#ifndef LIBYUV_INCLUDE_GENERAL_H_
-#define LIBYUV_INCLUDE_GENERAL_H_
+#ifndef INCLUDE_LIBYUV_GENERAL_H_
+#define INCLUDE_LIBYUV_GENERAL_H_
 
-#include "basic_types.h"
+#include "libyuv/basic_types.h"
 
 namespace libyuv {
 
 // Supported rotation
-enum RotationMode
-{
+enum RotationMode {
   kRotateNone = 0,
   kRotateClockwise = 90,
   kRotateCounterClockwise = 270,
   kRotate180 = 180,
 };
 
-// I420  mirror
+// I420 mirror
 int
 I420Mirror(const uint8* src_yplane, int src_ystride,
            const uint8* src_uplane, int src_ustride,
@@ -45,7 +44,7 @@ I420CropPad(const uint8* src_frame, int src_width,
            int src_height, uint8* dst_frame,
            int dst_width, int dst_height);
 
-// I420 Crop - make a center cut
+// I420 Crop - crop a rectangle from image
 int
 I420Crop(uint8* frame,
          int src_width, int src_height,
@@ -62,8 +61,6 @@ I420Rotate(const uint8* src_yplane, int src_ystride,
            int width, int height,
            RotationMode mode);
 
-
 } // namespace libyuv
 
-
-#endif // LIBYUV_INCLUDE_GENERAL_H_
+#endif // INCLUDE_LIBYUV_GENERAL_H_
diff --git a/include/libyuv/planar_functions.h b/include/libyuv/planar_functions.h
new file mode 100644
index 000000000..8b8b25240
--- /dev/null
+++ b/include/libyuv/planar_functions.h
@@ -0,0 +1,141 @@
+/*
+ *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_
+#define INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_
+
+#include "libyuv/basic_types.h"
+
+namespace libyuv {
+
+// Copy I420 to I420.
+int I420Copy(const uint8* src_y, int src_stride_y,
+          const uint8* src_u, int src_stride_u,
+          const uint8* src_v, int src_stride_v,
+          uint8* dst_y, int dst_stride_y,
+          uint8* dst_u, int dst_stride_u,
+          uint8* dst_v, int dst_stride_v,
+          int width, int height);
+
+// Convert I422 to I420.  Used by MJPG.
+int I422ToI420(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// Convert M420 to I420.
+int M420ToI420(const uint8* src_m420, int src_stride_m420,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// Convert Q420 to I420.
+int Q420ToI420(const uint8* src_y, int src_stride_y,
+               const uint8* src_yuy2, int src_stride_yuy2,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// Convert NV12 to I420.  Also used for NV21.
+int NV12ToI420(const uint8* src_y,
+               const uint8* src_uv, int src_stride,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// Convert YUY2 to I420.
+int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// Convert UYVY to I420.
+int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// Convert I420 to ARGB.
+int I420ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// Convert I420 to BGRA.
+int I420ToBGRA(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// Convert I420 to ABGR.
+int I420ToABGR(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// Convert I422 to ARGB.
+int I422ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// Convert I444 to ARGB.
+int I444ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// Convert I400 to ARGB.
+int I400ToARGB(const uint8* src_y, int src_stride_y,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// Convert I400 to ARGB (reference version).
+int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
+                         uint8* dst_argb, int dst_stride_argb,
+                         int width, int height);
+
+// Convert RAW to ARGB.
+int RAWToARGB(const uint8* src_raw, int src_stride_raw,
+              uint8* dst_argb, int dst_stride_argb,
+              int width, int height);
+
+// Convert BG24 to ARGB.
+int BG24ToARGB(const uint8* src_bg24, int src_stride_bg24,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// Convert ABGR to ARGB.
+int ABGRToARGB(const uint8* src_abgr, int src_stride_abgr,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// Convert BGRA to ARGB.
+int BGRAToARGB(const uint8* src_bgra, int src_stride_bgra,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+}  // namespace libyuv
+
+#endif  // INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_
diff --git a/include/libyuv/scale.h b/include/libyuv/scale.h
new file mode 100644
index 000000000..8433908b9
--- /dev/null
+++ b/include/libyuv/scale.h
@@ -0,0 +1,67 @@
+/*
+ *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_SCALE_H_
+#define INCLUDE_LIBYUV_SCALE_H_
+
+#include "libyuv/basic_types.h"
+
+namespace libyuv {
+
+// Supported filtering
+enum FilterMode {
+  kFilterNone = 0,  // Point sample; Fastest
+  kFilterBilinear = 1,  // Faster than box, but lower quality scaling down.
+  kFilterBox = 2  // Highest quality
+};
+
+// Scales a YUV 4:2:0 image from the src width and height to the
+// dst width and height.
+// If filtering is kFilterNone, a simple nearest-neighbor algorithm is
+// used. This produces basic (blocky) quality at the fastest speed.
+// If filtering is kFilterBilinear, interpolation is used to produce a better
+// quality image, at the expense of speed.
+// If filtering is kFilterBox, averaging is used to produce even better
+// quality image, at further expense of speed.
+// Returns 0 if successful.
+
+int I420Scale(const uint8* src_y, int src_stride_y,
+              const uint8* src_u, int src_stride_u,
+              const uint8* src_v, int src_stride_v,
+              int src_width, int src_height,
+              uint8* dst_y, int dst_stride_y,
+              uint8* dst_u, int dst_stride_u,
+              uint8* dst_v, int dst_stride_v,
+              int dst_width, int dst_height,
+              FilterMode filtering);
+
+// Legacy API
+// If dst_height_offset is non-zero, the image is offset by that many pixels
+// and stretched to (dst_height - dst_height_offset * 2) pixels high,
+// instead of dst_height.
+int Scale(const uint8* src, int src_width, int src_height,
+          uint8* dst, int dst_width, int dst_height, int dst_height_offset,
+          bool interpolate);
+
+// Same, but specified in terms of each plane location and stride.
+int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
+          int src_stride_y, int src_stride_u, int src_stride_v,
+          int src_width, int src_height,
+          uint8* dst_y, uint8* dst_u, uint8* dst_v,
+          int dst_stride_y, int dst_stride_u, int dst_stride_v,
+          int dst_width, int dst_height,
+          bool interpolate);
+
+// For testing, allow disabling of optimizations.
+void SetUseReferenceImpl(bool use);
+
+} // namespace libyuv
+
+#endif // INCLUDE_LIBYUV_SCALE_H_
diff --git a/include/planar_functions.h b/include/planar_functions.h
deleted file mode 100644
index 1a5b48380..000000000
--- a/include/planar_functions.h
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef LIBYUV_INCLUDE_PLANAR_FUNCTIONS_H_
-#define LIBYUV_INCLUDE_PLANAR_FUNCTIONS_H_
-
-#include "basic_types.h"
-
-namespace libyuv {
-
-// Copy I420 to I420.
-void I420Copy(const uint8* src_y, int src_pitch_y,
-              const uint8* src_u, int src_pitch_u,
-              const uint8* src_v, int src_pitch_v,
-              uint8* dst_y, int dst_pitch_y,
-              uint8* dst_u, int dst_pitch_u,
-              uint8* dst_v, int dst_pitch_v,
-              int width, int height);
-
-// Convert I422 to I420.  Used by MJPG.
-void I422ToI420(const uint8* src_y, int src_pitch_y,
-                const uint8* src_u, int src_pitch_u,
-                const uint8* src_v, int src_pitch_v,
-                uint8* dst_y, int dst_pitch_y,
-                uint8* dst_u, int dst_pitch_u,
-                uint8* dst_v, int dst_pitch_v,
-                int width, int height);
-
-// Convert M420 to I420.
-void M420ToI420(const uint8* src_m420, int src_pitch_m420,
-                uint8* dst_y, int dst_pitch_y,
-                uint8* dst_u, int dst_pitch_u,
-                uint8* dst_v, int dst_pitch_v,
-                int width, int height);
-
-// Convert Q420 to I420.
-void Q420ToI420(const uint8* src_y, int src_pitch_y,
-                const uint8* src_yuy2, int src_pitch_yuy2,
-                uint8* dst_y, int dst_pitch_y,
-                uint8* dst_u, int dst_pitch_u,
-                uint8* dst_v, int dst_pitch_v,
-                int width, int height);
-
-// Convert NV12 to I420.  Also used for NV21.
-void NV12ToI420(const uint8* src_y,
-                const uint8* src_uv, int src_pitch,
-                uint8* dst_y, int dst_pitch_y,
-                uint8* dst_u, int dst_pitch_u,
-                uint8* dst_v, int dst_pitch_v,
-                int width, int height);
-
-// Convert YUY2 to I420.
-void YUY2ToI420(const uint8* src_yuy2, int src_pitch_yuy2,
-                uint8* dst_y, int dst_pitch_y,
-                uint8* dst_u, int dst_pitch_u,
-                uint8* dst_v, int dst_pitch_v,
-                int width, int height);
-
-// Convert UYVY to I420.
-void UYVYToI420(const uint8* src_uyvy, int src_pitch_uyvy,
-                uint8* dst_y, int dst_pitch_y,
-                uint8* dst_u, int dst_pitch_u,
-                uint8* dst_v, int dst_pitch_v,
-                int width, int height);
-
-// Convert I420 to ARGB.
-void I420ToARGB(const uint8* src_y, int src_pitch_y,
-                const uint8* src_u, int src_pitch_u,
-                const uint8* src_v, int src_pitch_v,
-                uint8* dst_argb, int dst_pitch_argb,
-                int width, int height);
-
-// Convert I420 to BGRA.
-void I420ToBGRA(const uint8* src_y, int src_pitch_y,
-                const uint8* src_u, int src_pitch_u,
-                const uint8* src_v, int src_pitch_v,
-                uint8* dst_argb, int dst_pitch_argb,
-                int width, int height);
-
-// Convert I420 to ABGR.
-void I420ToABGR(const uint8* src_y, int src_pitch_y,
-                const uint8* src_u, int src_pitch_u,
-                const uint8* src_v, int src_pitch_v,
-                uint8* dst_argb, int dst_pitch_argb,
-                int width, int height);
-
-// Convert I422 to ARGB.
-void I422ToARGB(const uint8* src_y, int src_pitch_y,
-                const uint8* src_u, int src_pitch_u,
-                const uint8* src_v, int src_pitch_v,
-                uint8* dst_argb, int dst_pitch_argb,
-                int width, int height);
-
-// Convert I444 to ARGB.
-void I444ToARGB(const uint8* src_y, int src_pitch_y,
-                const uint8* src_u, int src_pitch_u,
-                const uint8* src_v, int src_pitch_v,
-                uint8* dst_argb, int dst_pitch_argb,
-                int width, int height);
-
-// Convert I400 to ARGB.
-void I400ToARGB(const uint8* src_y, int src_pitch_y,
-                uint8* dst_argb, int dst_pitch_argb,
-                int width, int height);
-
-// Convert I400 to ARGB.
-void I400ToARGB_Reference(const uint8* src_y, int src_pitch_y,
-                          uint8* dst_argb, int dst_pitch_argb,
-                          int width, int height);
-
-// Convert RAW to ARGB.
-void RAWToARGB(const uint8* src_raw, int src_pitch_raw,
-               uint8* dst_argb, int dst_pitch_argb,
-               int width, int height);
-
-// Convert BG24 to ARGB.
-void BG24ToARGB(const uint8* src_bg24, int src_pitch_bg24,
-                uint8* dst_argb, int dst_pitch_argb,
-                int width, int height);
-
-// Convert ABGR to ARGB.
-void ABGRToARGB(const uint8* src_abgr, int src_pitch_abgr,
-                uint8* dst_argb, int dst_pitch_argb,
-                int width, int height);
-
-}  // namespace libyuv
-
-#endif  // LIBYUV_INCLUDE_PLANAR_FUNCTIONS_H_
diff --git a/include/scale.h b/include/scale.h
deleted file mode 100644
index 9cef9bce8..000000000
--- a/include/scale.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef LIBYUV_INCLUDE_SCALE_H_
-#define LIBYUV_INCLUDE_SCALE_H_
-
-#include "basic_types.h"
-
-#if defined(_MSC_VER)
-#define ALIGN16(var) __declspec(align(16)) var
-#else
-#define ALIGN16(var) var __attribute__((aligned(16)))
-#endif
-
-namespace libyuv {
-
-// Scales a YUV 4:2:0 image from the input width and height to the
-// output width and height. If outh_offset is nonzero, the image is
-// offset by that many pixels and stretched to (outh - outh_offset * 2)
-// pixels high, instead of outh.
-// If interpolate is not set, a simple nearest-neighbor algorithm is
-// used. This produces basic (blocky) quality at the fastest speed.
-// If interpolate is set, interpolation is used to produce a better
-// quality image, at the expense of speed.
-// Returns true if successful.
-bool Scale(const uint8 *in, int32 inw, int32 inh,
-           uint8 *out, int32 outw, int32 outh, int32 outh_offset,
-           bool interpolate);
-
-// Same, but specified in terms of each plane location and stride.
-bool Scale(const uint8 *inY, const uint8 *inU, const uint8 *inV,
-           int32 istrideY, int32 istrideU, int32 istrideV,
-           int32 iwidth, int32 iheight,
-           uint8 *outY, uint8 *outU, uint8 *outV,
-           int32 ostrideY, int32 ostrideU, int32 ostrideV,
-           int32 owidth, int32 oheight,
-           bool interpolate);
-
-// For testing, allow disabling of optimizations.
-void SetUseReferenceImpl(bool use);
-
-} // namespace libyuv
-
-#endif // LIBYUV_INCLUDE_SCALE_H_
diff --git a/source/conversion_tables.h b/source/conversion_tables.h
index e778fa2d0..9a328649c 100644
--- a/source/conversion_tables.h
+++ b/source/conversion_tables.h
@@ -15,11 +15,11 @@
 *
 ***************************************************************/
 
-#ifndef WEBRTC_COMMON_VIDEO_VPLIB_CONVERSION_TABLES
-#define WEBRTC_COMMON_VIDEO_VPLIB_CONVERSION_TABLES
+#ifndef LIBYUV_SOURCE_CONVERSION_TABLES_H_
+#define LIBYUV_SOURCE_CONVERSION_TABLES_H_
+
+namespace libyuv {
 
-namespace libyuv
-{
 /******************************************************************************
 * YUV TO RGB approximation
 *
@@ -97,7 +97,6 @@ namespace libyuv
         Ucg(244),Ucg(245),Ucg(246),Ucg(247),Ucg(248),Ucg(249),Ucg(250),Ucg(251),
         Ucg(252),Ucg(253),Ucg(254),Ucg(255)};
 
-
    static const int mapUcb[256] = {
         Ucb(0),Ucb(1),Ucb(2),Ucb(3),Ucb(4),Ucb(5),Ucb(6),Ucb(7),Ucb(8),Ucb(9),
         Ucb(10),Ucb(11),Ucb(12),Ucb(13),Ucb(14),Ucb(15),Ucb(16),Ucb(17),Ucb(18),
@@ -199,5 +198,6 @@ namespace libyuv
         Vcg(252),Vcg(253),Vcg(254),Vcg(255)};
 
 } // namespace libyuv
+
 #endif
 
diff --git a/source/convert.cc b/source/convert.cc
index ddaa51f22..e555c5440 100644
--- a/source/convert.cc
+++ b/source/convert.cc
@@ -8,13 +8,10 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include "libyuv/convert.h"
 
-#include "convert.h"
-#include "basic_types.h"
-
-#include <string.h>     // memcpy(), memset()
-#include <assert.h>
-#include <stdlib.h>     // abs
+#include "libyuv/basic_types.h"
+#include "conversion_tables.h"
 
 //#define SCALEOPT //Currently for windows only. June 2010
 
@@ -22,20 +19,16 @@
 #include <emmintrin.h>
 #endif
 
-#include "conversion_tables.h"
-
-namespace libyuv
-{
-
-
-// Clip value to [0,255]
-inline uint8 Clip(int32 val);
-
-#ifdef SCALEOPT
-void *memcpy_16(void * dest, const void * src, size_t n);
-void *memcpy_8(void * dest, const void * src, size_t n);
-#endif
+namespace libyuv {
 
+static inline uint8 Clip(int32 val) {
+  if (val < 0) {
+    return (uint8) 0;
+  } else if (val > 255){
+    return (uint8) 255;
+  }
+  return (uint8) val;
+}
 
 int
 I420ToRGB24(const uint8* src_yplane, int src_ystride,
@@ -344,8 +337,8 @@ I420ToYUY2(const uint8* src_yplane, int src_ystride,
 
   const uint8* in1 = src_yplane;
   const uint8* in2 = src_yplane + src_ystride ;
-  const uint8* inU = src_uplane;
-  const uint8* inV = src_vplane;
+  const uint8* src_u = src_uplane;
+  const uint8* src_v = src_vplane;
 
   uint8* out1 = dst_frame;
   uint8* out2 = dst_frame + 2 * dst_stride;
@@ -356,25 +349,25 @@ I420ToYUY2(const uint8* src_yplane, int src_ystride,
   for (int i = 0; i < ((src_height + 1) >> 1); i++){
     for (int j = 0; j < ((src_width + 1) >> 1); j++){
       out1[0] = in1[0];
-      out1[1] = *inU;
+      out1[1] = *src_u;
       out1[2] = in1[1];
-      out1[3] = *inV;
+      out1[3] = *src_v;
 
       out2[0] = in2[0];
-      out2[1] = *inU;
+      out2[1] = *src_u;
       out2[2] = in2[1];
-      out2[3] = *inV;
+      out2[3] = *src_v;
       out1 += 4;
       out2 += 4;
-      inU++;
-      inV++;
+      src_u++;
+      src_v++;
       in1 += 2;
       in2 += 2;
     }
     in1 += 2 * src_ystride - src_width;
     in2 += 2 * src_ystride - src_width;
-    inU += src_ustride - ((src_width + 1) >> 1);
-    inV += src_vstride - ((src_width + 1) >> 1);
+    src_u += src_ustride - ((src_width + 1) >> 1);
+    src_v += src_vstride - ((src_width + 1) >> 1);
     out1 += 2 * dst_stride + 2 * (dst_stride - src_width);
     out2 += 2 * dst_stride + 2 * (dst_stride - src_width);
   }
@@ -387,34 +380,34 @@ I420ToYUY2(const uint8* src_yplane, int src_ystride,
       ;pusha
       mov       eax, DWORD PTR [in1]                       ;1939.33
       mov       ecx, DWORD PTR [in2]                       ;1939.33
-      mov       ebx, DWORD PTR [inU]                       ;1939.33
-      mov       edx, DWORD PTR [inV]                       ;1939.33
+      mov       ebx, DWORD PTR [src_u]                       ;1939.33
+      mov       edx, DWORD PTR [src_v]                       ;1939.33
       loop0:
-      movq      xmm6, QWORD PTR [ebx]          ;inU
-      movq      xmm0, QWORD PTR [edx]          ;inV
-      punpcklbw xmm6, xmm0                     ;inU, inV mix
+      movq      xmm6, QWORD PTR [ebx]          ;src_u
+      movq      xmm0, QWORD PTR [edx]          ;src_v
+      punpcklbw xmm6, xmm0                     ;src_u, src_v mix
       ;movdqa    xmm1, xmm6
       ;movdqa    xmm2, xmm6
       ;movdqa    xmm4, xmm6
 
       movdqu    xmm3, XMMWORD PTR [eax]        ;in1
       movdqa    xmm1, xmm3
-      punpcklbw xmm1, xmm6                     ;in1, inU, in1, inV
+      punpcklbw xmm1, xmm6                     ;in1, src_u, in1, src_v
       mov       esi, DWORD PTR [out1]
       movdqu    XMMWORD PTR [esi], xmm1        ;write to out1
 
       movdqu    xmm5, XMMWORD PTR [ecx]        ;in2
       movdqa    xmm2, xmm5
-      punpcklbw xmm2, xmm6                     ;in2, inU, in2, inV
+      punpcklbw xmm2, xmm6                     ;in2, src_u, in2, src_v
       mov       edi, DWORD PTR [out2]
       movdqu    XMMWORD PTR [edi], xmm2        ;write to out2
 
-      punpckhbw xmm3, xmm6                     ;in1, inU, in1, inV again
+      punpckhbw xmm3, xmm6                     ;in1, src_u, in1, src_v again
       movdqu    XMMWORD PTR [esi+16], xmm3     ;write to out1 again
       add       esi, 32
       mov       DWORD PTR [out1], esi
 
-      punpckhbw xmm5, xmm6                     ;inU, in2, inV again
+      punpckhbw xmm5, xmm6                     ;src_u, in2, src_v again
       movdqu    XMMWORD PTR [edi+16], xmm5     ;write to out2 again
       add       edi, 32
       mov       DWORD PTR [out2], edi
@@ -431,8 +424,8 @@ I420ToYUY2(const uint8* src_yplane, int src_ystride,
 
       mov       DWORD PTR [in1], eax                       ;1939.33
       mov       DWORD PTR [in2], ecx                       ;1939.33
-      mov       DWORD PTR [inU], ebx                       ;1939.33
-      mov       DWORD PTR [inV], edx                       ;1939.33
+      mov       DWORD PTR [src_u], ebx                       ;1939.33
+      mov       DWORD PTR [src_v], edx                       ;1939.33
 
       ;popa
       emms
@@ -504,32 +497,32 @@ I420ToUYVY(const uint8* src_yplane, int src_ystride,
       ;pusha
       mov       eax, DWORD PTR [in1]                       ;1939.33
       mov       ecx, DWORD PTR [in2]                       ;1939.33
-      mov       ebx, DWORD PTR [inU]                       ;1939.33
-      mov       edx, DWORD PTR [inV]                       ;1939.33
+      mov       ebx, DWORD PTR [src_u]                       ;1939.33
+      mov       edx, DWORD PTR [src_v]                       ;1939.33
 loop0:
-      movq      xmm6, QWORD PTR [ebx]          ;inU
-      movq      xmm0, QWORD PTR [edx]          ;inV
-      punpcklbw xmm6, xmm0                     ;inU, inV mix
+      movq      xmm6, QWORD PTR [ebx]          ;src_u
+      movq      xmm0, QWORD PTR [edx]          ;src_v
+      punpcklbw xmm6, xmm0                     ;src_u, src_v mix
       movdqa    xmm1, xmm6
       movdqa    xmm2, xmm6
       movdqa    xmm4, xmm6
 
       movdqu    xmm3, XMMWORD PTR [eax]        ;in1
-      punpcklbw xmm1, xmm3                     ;inU, in1, inV
+      punpcklbw xmm1, xmm3                     ;src_u, in1, src_v
       mov       esi, DWORD PTR [out1]
       movdqu    XMMWORD PTR [esi], xmm1        ;write to out1
 
       movdqu    xmm5, XMMWORD PTR [ecx]        ;in2
-      punpcklbw xmm2, xmm5                     ;inU, in2, inV
+      punpcklbw xmm2, xmm5                     ;src_u, in2, src_v
       mov       edi, DWORD PTR [out2]
       movdqu    XMMWORD PTR [edi], xmm2        ;write to out2
 
-      punpckhbw xmm4, xmm3                     ;inU, in1, inV again
+      punpckhbw xmm4, xmm3                     ;src_u, in1, src_v again
       movdqu    XMMWORD PTR [esi+16], xmm4     ;write to out1 again
       add       esi, 32
       mov       DWORD PTR [out1], esi
 
-      punpckhbw xmm6, xmm5                     ;inU, in2, inV again
+      punpckhbw xmm6, xmm5                     ;src_u, in2, src_v again
       movdqu    XMMWORD PTR [edi+16], xmm6     ;write to out2 again
       add       edi, 32
       mov       DWORD PTR [out2], edi
@@ -546,8 +539,8 @@ loop0:
 
       mov       DWORD PTR [in1], eax                       ;1939.33
       mov       DWORD PTR [in2], ecx                       ;1939.33
-      mov       DWORD PTR [inU], ebx                       ;1939.33
-      mov       DWORD PTR [inV], edx                       ;1939.33
+      mov       DWORD PTR [src_u], ebx                       ;1939.33
+      mov       DWORD PTR [src_v], edx                       ;1939.33
 
       ;popa
       emms
@@ -848,62 +841,4 @@ RAWToI420(const uint8* src_frame, int src_stride,
                    src_width, src_height, RAWToI420Row_C);
 }
 
-inline
-uint8 Clip(int32 val)
-{
-  if (val < 0){
-    return (uint8)0;
-  } else if (val > 255){
-    return (uint8)255;
-  }
-  return (uint8)val;
-}
-
-#ifdef SCALEOPT
-//memcpy_16 assumes that width is an integer multiple of 16!
-void
-*memcpy_16(void * dest, const void * src, size_t n)
-{
-  _asm
-  {
-    mov eax, dword ptr [src]
-    mov ebx, dword ptr [dest]
-    mov ecx, dword ptr [n]
-
-  loop0:
-
-    movdqu    xmm0, XMMWORD PTR [eax]
-    movdqu    XMMWORD PTR [ebx], xmm0
-    add       eax, 16
-    add       ebx, 16
-    sub       ecx, 16
-    jg        loop0
-  }
-}
-
-// memcpy_8 assumes that width is an integer multiple of 8!
-void
-*memcpy_8(void * dest, const void * src, size_t n)
-{
-  _asm
-  {
-    mov eax, dword ptr [src]
-    mov ebx, dword ptr [dest]
-    mov ecx, dword ptr [n]
-
-  loop0:
-
-    movq    mm0, QWORD PTR [eax]
-    movq    QWORD PTR [ebx], mm0
-    add       eax, 8
-    add       ebx, 8
-    sub       ecx, 8
-    jg        loop0
-  emms
-  }
-
-}
-
-#endif
-
 } // namespace libyuv
diff --git a/source/cpu_id.cc b/source/cpu_id.cc
index 6d8655c12..e986015ea 100644
--- a/source/cpu_id.cc
+++ b/source/cpu_id.cc
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "cpu_id.h"
-#include "basic_types.h"  // for CPU_X86
+#include "libyuv/cpu_id.h"
+#include "libyuv/basic_types.h"  // for CPU_X86
 
 #ifdef _MSC_VER
 #include <intrin.h>
diff --git a/source/format_conversion.cc b/source/format_conversion.cc
index 0db57ae4c..a058d6a9e 100644
--- a/source/format_conversion.cc
+++ b/source/format_conversion.cc
@@ -10,8 +10,7 @@
 
 #include <assert.h>
 
-#include "common.h"
-#include "cpu_id.h"
+#include "libyuv/cpu_id.h"
 #include "video_common.h"
 
 namespace libyuv {
@@ -19,6 +18,15 @@ namespace libyuv {
 // Most code in here is inspired by the material at
 // http://www.siliconimaging.com/RGB%20Bayer.htm
 
+// Forces compiler to inline, even against its better judgement. Use wisely.
+#if defined(__GNUC__)
+#define FORCE_INLINE __attribute__((always_inline))
+#elif defined(WIN32)
+#define FORCE_INLINE __forceinline
+#else
+#define FORCE_INLINE
+#endif
+
 enum {
   RED = 0,
   BLUE = 1,
@@ -98,7 +106,7 @@ static FORCE_INLINE void InterpolateBayerRGBCorner(uint8* r,
                                                    uint8* g,
                                                    uint8* b,
                                                    const uint8* src,
-                                                   int src_pitch,
+                                                   int src_stride,
                                                    Position pos,
                                                    uint8 colour) {
 
@@ -108,20 +116,20 @@ static FORCE_INLINE void InterpolateBayerRGBCorner(uint8* r,
   int adjacent_column;
   switch (pos) {
     case TOP_LEFT:
-      adjacent_row = src_pitch;
+      adjacent_row = src_stride;
       adjacent_column = 1;
       break;
     case TOP_RIGHT:
-      adjacent_row = src_pitch;
+      adjacent_row = src_stride;
       adjacent_column = -1;
       break;
     case BOTTOM_LEFT:
-      adjacent_row = -src_pitch;
+      adjacent_row = -src_stride;
       adjacent_column = 1;
       break;
     case BOTTOM_RIGHT:
     default:
-      adjacent_row = -src_pitch;
+      adjacent_row = -src_stride;
       adjacent_column = -1;
       break;
   }
@@ -161,7 +169,7 @@ static FORCE_INLINE void InterpolateBayerRGBEdge(uint8* r,
                                                  uint8* g,
                                                  uint8* b,
                                                  const uint8* src,
-                                                 int src_pitch,
+                                                 int src_stride,
                                                  Position pos,
                                                  uint8 colour) {
 
@@ -176,21 +184,21 @@ static FORCE_INLINE void InterpolateBayerRGBEdge(uint8* r,
 
   switch (pos) {
     case TOP_EDGE:
-      inner = src_pitch;
+      inner = src_stride;
       side = 1;
       break;
     case RIGHT_EDGE:
       inner = -1;
-      side = src_pitch;
+      side = src_stride;
       break;
     case BOTTOM_EDGE:
-      inner = -src_pitch;
+      inner = -src_stride;
       side = 1;
       break;
     case LEFT_EDGE:
     default:
       inner = 1;
-      side = src_pitch;
+      side = src_stride;
       break;
   }
 
@@ -234,7 +242,7 @@ static FORCE_INLINE void InterpolateBayerRGBCenter(uint8* r,
                                                    uint8* g,
                                                    uint8* b,
                                                    const uint8* src,
-                                                   int src_pitch,
+                                                   int src_stride,
                                                    uint8 colour) {
 
   if (IsRedBlue(colour)) {
@@ -245,12 +253,12 @@ static FORCE_INLINE void InterpolateBayerRGBCenter(uint8* r,
     // quality here by using only two of the green pixels based on the
     // correlation to the nearby red/blue pixels, but that is slower and would
     // result in more edge cases.
-    *g = (src[1] + src[-1] + src[src_pitch] + src[-src_pitch]) / 4;
+    *g = (src[1] + src[-1] + src[src_stride] + src[-src_stride]) / 4;
     // Average of the oppositely-coloured corner pixels (there's four).
-    uint8 corner_average = (src[src_pitch + 1] +
-                            src[src_pitch - 1] +
-                            src[-src_pitch + 1] +
-                            src[-src_pitch - 1]) / 4;
+    uint8 corner_average = (src[src_stride + 1] +
+                            src[src_stride - 1] +
+                            src[-src_stride + 1] +
+                            src[-src_stride - 1]) / 4;
     if (colour == RED) {
       *r = current_pixel;
       *b = corner_average;
@@ -263,7 +271,7 @@ static FORCE_INLINE void InterpolateBayerRGBCenter(uint8* r,
     // Average of the adjacent same-row pixels (there's two).
     uint8 row_adjacent = (src[1] + src[-1]) / 2;
     // Average of the adjacent same-column pixels (there's two).
-    uint8 column_adjacent = (src[src_pitch] + src[-src_pitch]) / 2;
+    uint8 column_adjacent = (src[src_stride] + src[-src_stride]) / 2;
     if (colour == GREEN_BETWEEN_RED) {
       *r = row_adjacent;
       *b = column_adjacent;
@@ -275,15 +283,15 @@ static FORCE_INLINE void InterpolateBayerRGBCenter(uint8* r,
 }
 
 // Converts any Bayer RGB format to ARGB.
-void BayerRGBToARGB(const uint8* src, int src_pitch, uint32 src_fourcc,
-                    uint8* dst, int dst_pitch,
-                    int width, int height) {
+int BayerRGBToARGB(const uint8* src, int src_stride, uint32 src_fourcc,
+                   uint8* dst, int dst_stride,
+                   int width, int height) {
   assert(width % 2 == 0);
   assert(height % 2 == 0);
 
   uint32 colour_map = FourCcToBayerPixelColourMap(src_fourcc);
-  int src_row_inc = src_pitch * 2 - width;
-  int dst_row_inc = dst_pitch * 2 - width * 4;
+  int src_row_inc = src_stride * 2 - width;
+  int dst_row_inc = dst_stride * 2 - width * 4;
 
   // Iterate over the 2x2 grids.
   for (int y1 = 0; y1 < height; y1 += 2) {
@@ -297,24 +305,24 @@ void BayerRGBToARGB(const uint8* src, int src_pitch, uint32 src_fourcc,
           uint8 current_colour = static_cast<uint8>(colours);
           colours >>= 8;
           Position pos = GetPosition(x1 + x2, y1 + y2, width, height);
-          const uint8* src_pixel = &src[y2 * src_pitch + x2];
-          uint8* dst_pixel = &dst[y2 * dst_pitch + x2 * 4];
+          const uint8* src_pixel = &src[y2 * src_stride + x2];
+          uint8* dst_pixel = &dst[y2 * dst_stride + x2 * 4];
 
           // Convert from Bayer RGB to regular RGB.
           if (pos == MIDDLE) {
             // 99% of the image is the middle.
             InterpolateBayerRGBCenter(&r, &g, &b,
-                                      src_pixel, src_pitch,
+                                      src_pixel, src_stride,
                                       current_colour);
           } else if (pos >= LEFT_EDGE) {
             // Next most frequent is edges.
             InterpolateBayerRGBEdge(&r, &g, &b,
-                                    src_pixel, src_pitch, pos,
+                                    src_pixel, src_stride, pos,
                                     current_colour);
           } else {
             // Last is the corners. There are only 4.
             InterpolateBayerRGBCorner(&r, &g, &b,
-                                      src_pixel, src_pitch, pos,
+                                      src_pixel, src_stride, pos,
                                       current_colour);
           }
 
@@ -331,23 +339,24 @@ void BayerRGBToARGB(const uint8* src, int src_pitch, uint32 src_fourcc,
     src += src_row_inc;
     dst += dst_row_inc;
   }
+  return 0;
 }
 
 // Converts any Bayer RGB format to I420.
-void BayerRGBToI420(const uint8* src, int src_pitch, uint32 src_fourcc,
-                    uint8* y, int y_pitch,
-                    uint8* u, int u_pitch,
-                    uint8* v, int v_pitch,
-                    int width, int height) {
+int BayerRGBToI420(const uint8* src, int src_stride, uint32 src_fourcc,
+                   uint8* y, int y_stride,
+                   uint8* u, int u_stride,
+                   uint8* v, int v_stride,
+                   int width, int height) {
   assert(width % 2 == 0);
   assert(height % 2 == 0);
 
   uint32 colour_map = FourCcToBayerPixelColourMap(src_fourcc);
 
-  int src_row_inc = src_pitch * 2 - width;
-  int y_row_inc = y_pitch * 2 - width;
-  int u_row_inc = u_pitch - width / 2;
-  int v_row_inc = v_pitch - width / 2;
+  int src_row_inc = src_stride * 2 - width;
+  int y_row_inc = y_stride * 2 - width;
+  int u_row_inc = u_stride - width / 2;
+  int v_row_inc = v_stride - width / 2;
 
   // Iterate over the 2x2 grids.
   for (int y1 = 0; y1 < height; y1 += 2) {
@@ -363,25 +372,25 @@ void BayerRGBToI420(const uint8* src, int src_pitch, uint32 src_fourcc,
           uint8 current_colour = static_cast<uint8>(colours);
           colours >>= 8;
           Position pos = GetPosition(x1 + x2, y1 + y2, width, height);
-          const uint8* src_pixel = &src[y2 * src_pitch + x2];
-          uint8* y_pixel = &y[y2 * y_pitch + x2];
+          const uint8* src_pixel = &src[y2 * src_stride + x2];
+          uint8* y_pixel = &y[y2 * y_stride + x2];
 
           // Convert from Bayer RGB to regular RGB.
 
           if (pos == MIDDLE) {
             // 99% of the image is the middle.
             InterpolateBayerRGBCenter(&r, &g, &b,
-                                      src_pixel, src_pitch,
+                                      src_pixel, src_stride,
                                       current_colour);
           } else if (pos >= LEFT_EDGE) {
             // Next most frequent is edges.
             InterpolateBayerRGBEdge(&r, &g, &b,
-                                    src_pixel, src_pitch, pos,
+                                    src_pixel, src_stride, pos,
                                     current_colour);
           } else {
             // Last is the corners. There are only 4.
             InterpolateBayerRGBCorner(&r, &g, &b,
-                                      src_pixel, src_pitch, pos,
+                                      src_pixel, src_stride, pos,
                                       current_colour);
           }
 
@@ -405,6 +414,7 @@ void BayerRGBToI420(const uint8* src, int src_pitch, uint32 src_fourcc,
     u += u_row_inc;
     v += v_row_inc;
   }
+  return 0;
 }
 
 // Note: to do this with Neon vld4.8 would load ARGB values into 4 registers
@@ -490,18 +500,18 @@ static uint32 GenerateSelector(int select0, int select1) {
 }
 
 // Converts 32 bit ARGB to any Bayer RGB format.
-void ARGBToBayerRGB(const uint8* src_rgb, int src_pitch_rgb,
-                    uint8* dst_bayer, int dst_pitch_bayer,
-                    uint32 dst_fourcc_bayer,
-                    int width, int height) {
+int ARGBToBayerRGB(const uint8* src_rgb, int src_stride_rgb,
+                   uint8* dst_bayer, int dst_stride_bayer,
+                   uint32 dst_fourcc_bayer,
+                   int width, int height) {
   assert(width % 2 == 0);
   void (*ARGBToBayerRow)(const uint8* src_argb,
                          uint8* dst_bayer, uint32 selector, int pix);
 #if defined(HAS_ARGBTOBAYERROW_SSSE3)
   if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
       (width % 4 == 0) &&
-      IS_ALIGNED(src_rgb, 16) && (src_pitch_rgb % 16 == 0) &&
-      IS_ALIGNED(dst_bayer, 4) && (dst_pitch_bayer % 4 == 0)) {
+      IS_ALIGNED(src_rgb, 16) && (src_stride_rgb % 16 == 0) &&
+      IS_ALIGNED(dst_bayer, 4) && (dst_stride_bayer % 4 == 0)) {
     ARGBToBayerRow = ARGBToBayerRow_SSSE3;
   } else
 #endif
@@ -540,9 +550,10 @@ void ARGBToBayerRGB(const uint8* src_rgb, int src_pitch_rgb,
   // Now convert.
   for (int y = 0; y < height; ++y) {
     ARGBToBayerRow(src_rgb, dst_bayer, index_map[y & 1], width);
-    src_rgb += src_pitch_rgb;
-    dst_bayer += dst_pitch_bayer;
+    src_rgb += src_stride_rgb;
+    dst_bayer += dst_stride_bayer;
   }
+  return 0;
 }
 
 }  // namespace libyuv
diff --git a/source/general.cc b/source/general.cc
index 0759db854..27f97bdc4 100644
--- a/source/general.cc
+++ b/source/general.cc
@@ -8,14 +8,13 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "general.h"
+#include "libyuv/general.h"
 
 #include <string.h>     // memcpy(), memset()
 
-#include "planar_functions.h"
+#include "libyuv/planar_functions.h"
 #include "rotate.h"
 
-
 namespace libyuv {
 
 int
@@ -25,11 +24,11 @@ I420Mirror(const uint8* src_yplane, int src_ystride,
            uint8* dst_yplane, int dst_ystride,
            uint8* dst_uplane, int dst_ustride,
            uint8* dst_vplane, int dst_vstride,
-           int width, int height)
-{
+           int width, int height) {
   if (src_yplane == NULL || src_uplane == NULL || src_vplane == NULL ||
-      dst_yplane == NULL || dst_uplane == NULL || dst_vplane == NULL)
+      dst_yplane == NULL || dst_uplane == NULL || dst_vplane == NULL) {
     return -1;
+  }
 
   int indO = 0;
   int indS  = 0;
@@ -39,8 +38,8 @@ I420Mirror(const uint8* src_yplane, int src_ystride,
   const int halfWidth = (width + 1) >> 1;
 
   // Y
-  for (wind = 0; wind < halfWidth; wind++){
-   for (hind = 0; hind < height; hind++){
+  for (wind = 0; wind < halfWidth; wind++) {
+   for (hind = 0; hind < height; hind++) {
      indO = hind * src_ystride + wind;
      indS = hind * dst_ystride + (width - wind - 1);
      tmpVal = src_yplane[indO];
@@ -53,8 +52,8 @@ I420Mirror(const uint8* src_yplane, int src_ystride,
   const int halfSrcuvStride = (height + 1) >> 1;
   const int halfuvWidth = (width + 1) >> 2;
 
-  for (wind = 0; wind < halfuvWidth; wind++){
-   for (hind = 0; hind < halfHeight; hind++){
+  for (wind = 0; wind < halfuvWidth; wind++) {
+   for (hind = 0; hind < halfHeight; hind++) {
      indO = hind * halfSrcuvStride + wind;
      indS = hind * halfSrcuvStride + (halfuvWidth - wind - 1);
      // U
@@ -79,11 +78,11 @@ I420Crop(uint8* frame,
   if (frame == NULL)
     return -1;
 
-  if (src_width == dst_width && src_height == dst_height){
+  if (src_width == dst_width && src_height == dst_height) {
       // Nothing to do
     return 3 * dst_height * dst_width / 2;
   }
-  if (dst_width > src_width || dst_height > src_height){
+  if (dst_width > src_width || dst_height > src_height) {
       // error
       return -1;
   }
@@ -98,21 +97,21 @@ I420Crop(uint8* frame,
   int crop_width = ( src_width - dst_width ) / 2;
 
   for (i = src_width * crop_height + crop_width; loop < dst_height ;
-      loop++, i += src_width){
+      loop++, i += src_width) {
     memcpy(&frame[m],&frame[i],dst_width);
     m += dst_width;
   }
   i = src_width * src_height; // ilum
   loop = 0;
   for ( i += (halfsrc_width * crop_height / 2 + crop_width / 2);
-        loop < halfdst_height; loop++,i += halfsrc_width){
+        loop < halfdst_height; loop++,i += halfsrc_width) {
     memcpy(&frame[m],&frame[i],half_dst_width);
     m += half_dst_width;
   }
   loop = 0;
   i = src_width * src_height + half_dst_height * halfsrc_width; // ilum + Cr
   for ( i += (halfsrc_width * crop_height / 2 + crop_width / 2);
-        loop < halfdst_height; loop++, i += halfsrc_width){
+        loop < halfdst_height; loop++, i += halfsrc_width) {
     memcpy(&frame[m],&frame[i],half_dst_width);
     m += half_dst_width;
   }
@@ -122,66 +121,59 @@ I420Crop(uint8* frame,
 
 int
 I420CropPad(const uint8* src_frame, int src_width,
-           int src_height, uint8* dst_frame,
-           int dst_width, int dst_height)
+            int src_height, uint8* dst_frame,
+            int dst_width, int dst_height)
 {
-  if (src_width < 1 || dst_width < 1 || src_height < 1 || dst_height < 1 )
+  if (src_width < 1 || dst_width < 1 || src_height < 1 || dst_height < 1) {
     return -1;
-  if (src_width == dst_width && src_height == dst_height)
+  }
+  if (src_width == dst_width && src_height == dst_height) {
     memcpy(dst_frame, src_frame, 3 * dst_width * (dst_height >> 1));
-  else
-  {
-    if ( src_height < dst_height){
+  } else {
+    if (src_height < dst_height) {
       // pad height
       int pad_height = dst_height - src_height;
       int i = 0;
       int pad_width = 0;
       int crop_width = 0;
       int width = src_width;
-      if (src_width < dst_width){
+      if (src_width < dst_width) {
         // pad width
         pad_width = dst_width - src_width;
-      } else{
-      // cut width
-      crop_width = src_width - dst_width;
-      width = dst_width;
+      } else {
+        // cut width
+        crop_width = src_width - dst_width;
+        width = dst_width;
       }
-      if (pad_height){
+      if (pad_height) {
         memset(dst_frame, 0, dst_width * (pad_height >> 1));
         dst_frame +=  dst_width * (pad_height >> 1);
       }
-      for (i = 0; i < src_height;i++)
-      {
-          if (pad_width)
-          {
-              memset(dst_frame, 0, pad_width / 2);
-              dst_frame +=  pad_width / 2;
-          }
-          src_frame += crop_width >> 1; // in case we have a cut
-          memcpy(dst_frame,src_frame ,width);
-          src_frame += crop_width >> 1;
-          dst_frame += width;
-          src_frame += width;
-          if (pad_width)
-          {
+      for (i = 0; i < src_height;i++) {
+        if (pad_width) {
             memset(dst_frame, 0, pad_width / 2);
             dst_frame +=  pad_width / 2;
-          }
+        }
+        src_frame += crop_width >> 1; // in case we have a cut
+        memcpy(dst_frame,src_frame ,width);
+        src_frame += crop_width >> 1;
+        dst_frame += width;
+        src_frame += width;
+        if (pad_width) {
+          memset(dst_frame, 0, pad_width / 2);
+          dst_frame +=  pad_width / 2;
+        }
       }
-      if (pad_height)
-      {
-          memset(dst_frame, 0, dst_width * (pad_height >> 1));
-          dst_frame +=  dst_width * (pad_height >> 1);
+      if (pad_height) {
+        memset(dst_frame, 0, dst_width * (pad_height >> 1));
+        dst_frame +=  dst_width * (pad_height >> 1);
       }
-      if (pad_height)
-      {
+      if (pad_height) {
         memset(dst_frame, 127, (dst_width >> 2) * (pad_height >> 1));
         dst_frame +=  (dst_width >> 2) * (pad_height >> 1);
       }
-      for (i = 0; i < (src_height >> 1); i++)
-      {
-        if (pad_width)
-        {
+      for (i = 0; i < (src_height >> 1); i++) {
+        if (pad_width) {
           memset(dst_frame, 127, pad_width >> 2);
           dst_frame +=  pad_width >> 2;
         }
@@ -190,21 +182,17 @@ I420CropPad(const uint8* src_frame, int src_width,
         src_frame += crop_width >> 2;
         dst_frame += width >> 1;
         src_frame += width >> 1;
-        if (pad_width)
-        {
+        if (pad_width) {
           memset(dst_frame, 127, pad_width >> 2);
           dst_frame +=  pad_width >> 2;
         }
       }
-      if (pad_height)
-      {
+      if (pad_height) {
         memset(dst_frame, 127, (dst_width >> 1) * (pad_height >> 1));
         dst_frame +=  (dst_width >> 1) * (pad_height >> 1);
       }
-      for (i = 0; i < (src_height >> 1); i++)
-      {
-        if (pad_width)
-        {
+      for (i = 0; i < (src_height >> 1); i++) {
+        if (pad_width) {
           memset(dst_frame, 127, pad_width >> 2);
           dst_frame +=  pad_width >> 2;
         }
@@ -213,32 +201,26 @@ I420CropPad(const uint8* src_frame, int src_width,
         src_frame += crop_width >> 2;
         dst_frame += width >> 1;
         src_frame += width >> 1;
-        if (pad_width)
-        {
+        if (pad_width) {
           memset(dst_frame, 127, pad_width >> 2);
           dst_frame += pad_width >> 2;
         }
       }
-      if (pad_height)
-      {
+      if (pad_height) {
         memset(dst_frame, 127, (dst_width >> 2) * (pad_height >> 1));
         dst_frame +=  (dst_width >> 2) * (pad_height >> 1);
       }
-    }
-    else
-    {
+    } else {
       // cut height
       int i = 0;
       int pad_width = 0;
       int crop_width = 0;
       int width = src_width;
 
-      if (src_width < dst_width)
-      {
+      if (src_width < dst_width) {
         // pad width
         pad_width = dst_width - src_width;
-      } else
-      {
+      } else {
         // cut width
         crop_width = src_width - dst_width;
         width = dst_width;
@@ -246,10 +228,8 @@ I420CropPad(const uint8* src_frame, int src_width,
       int diff_height = src_height - dst_height;
       src_frame += src_width * (diff_height >> 1);  // skip top I
 
-      for (i = 0; i < dst_height; i++)
-      {
-        if (pad_width)
-        {
+      for (i = 0; i < dst_height; i++) {
+        if (pad_width) {
           memset(dst_frame, 0, pad_width / 2);
           dst_frame +=  pad_width / 2;
         }
@@ -258,18 +238,15 @@ I420CropPad(const uint8* src_frame, int src_width,
         src_frame += crop_width >> 1;
         dst_frame += width;
         src_frame += width;
-        if (pad_width)
-        {
+        if (pad_width) {
           memset(dst_frame, 0, pad_width / 2);
           dst_frame +=  pad_width / 2;
         }
       }
       src_frame += src_width * (diff_height >> 1);  // skip end I
       src_frame += (src_width >> 2) * (diff_height >> 1); // skip top of Cr
-      for (i = 0; i < (dst_height >> 1); i++)
-      {
-        if (pad_width)
-        {
+      for (i = 0; i < (dst_height >> 1); i++) {
+        if (pad_width) {
           memset(dst_frame, 127, pad_width >> 2);
           dst_frame +=  pad_width >> 2;
         }
@@ -278,18 +255,15 @@ I420CropPad(const uint8* src_frame, int src_width,
         src_frame += crop_width >> 2;
         dst_frame += width >> 1;
         src_frame += width >> 1;
-        if (pad_width)
-        {
+        if (pad_width) {
           memset(dst_frame, 127, pad_width >> 2);
           dst_frame +=  pad_width >> 2;
         }
       }
       src_frame += (src_width >> 2) * (diff_height >> 1); // skip end of Cr
       src_frame += (src_width >> 2) * (diff_height >> 1); // skip top of Cb
-      for (i = 0; i < (dst_height >> 1); i++)
-      {
-        if (pad_width)
-        {
+      for (i = 0; i < (dst_height >> 1); i++) {
+        if (pad_width) {
           memset(dst_frame, 127, pad_width >> 2);
           dst_frame +=  pad_width >> 2;
         }
@@ -298,8 +272,7 @@ I420CropPad(const uint8* src_frame, int src_width,
         src_frame += crop_width >> 2;
         dst_frame += width >> 1;
         src_frame += width >> 1;
-        if (pad_width)
-        {
+        if (pad_width) {
           memset(dst_frame, 127, pad_width >> 2);
           dst_frame +=  pad_width >> 2;
         }
@@ -317,20 +290,17 @@ I420Rotate(const uint8* src_yplane, int src_ystride,
            uint8* dst_uplane, int dst_ustride,
            uint8* dst_vplane, int dst_vstride,
            int width, int height,
-           RotationMode mode)
-{
-  switch (mode){
-    // TODO: should return int
+           RotationMode mode) {
+  switch (mode) {
     case kRotateNone:
       // copy frame
-      I420Copy(src_yplane, src_ystride,
-               src_uplane, src_ustride,
-               src_vplane, src_vstride,
-               dst_yplane, dst_ystride,
-               dst_uplane, dst_ustride,
-               dst_vplane, dst_vstride,
-               width, height);
-      return 0;
+      return I420Copy(src_yplane, src_ystride,
+                      src_uplane, src_ustride,
+                      src_vplane, src_vstride,
+                      dst_yplane, dst_ystride,
+                      dst_uplane, dst_ustride,
+                      dst_vplane, dst_vstride,
+                      width, height);
       break;
     case kRotateClockwise:
       Rotate90(src_yplane, src_ystride,
@@ -374,4 +344,4 @@ I420Rotate(const uint8* src_yplane, int src_ystride,
   }
 }
 
-} // nmaespace libyuv
+} // namespace libyuv
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index 56683f7ce..0b590c8f1 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -9,11 +9,11 @@
  */
 
 
-#include "planar_functions.h"
+#include "libyuv/planar_functions.h"
 
 #include <string.h>
 
-#include "cpu_id.h"
+#include "libyuv/cpu_id.h"
 #include "row.h"
 
 namespace libyuv {
@@ -38,19 +38,29 @@ static void SplitUV_NEON(const uint8* src_uv,
   );
 }
 
-#elif (defined(WIN32) || defined(__i386__)) && !defined(COVERAGE_ENABLED) && \
-    !defined(__PIC__) && !TARGET_IPHONE_SIMULATOR
+#elif (defined(WIN32) || defined(__x86_64__) || defined(__i386__)) \
+    && !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
 #if defined(_MSC_VER)
 #define TALIGN16(t, var) static __declspec(align(16)) t _ ## var
-#elif defined(OSX)
-#define TALIGN16(t, var) t var __attribute__((aligned(16)))
 #else
-#define TALIGN16(t, var) t _ ## var __attribute__((aligned(16)))
+#define TALIGN16(t, var) t var __attribute__((aligned(16)))
 #endif
 
-// shuffle constant to put even bytes in low 8 and odd bytes in high 8 bytes
-extern "C" TALIGN16(const uint8, shufevenodd[16]) =
-  { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
+// Shuffle table for converting ABGR to ARGB.
+extern "C" TALIGN16(const uint8, kShuffleMaskABGRToARGB[16]) =
+  { 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u };
+
+// Shuffle table for converting BGRA to ARGB.
+extern "C" TALIGN16(const uint8, kShuffleMaskBGRAToARGB[16]) =
+  { 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u };
+
+// Shuffle table for converting BG24 to ARGB.
+extern "C" TALIGN16(const uint8, kShuffleMaskBG24ToARGB[16]) =
+  { 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u };
+
+// Shuffle table for converting RAW to ARGB.
+extern "C" TALIGN16(const uint8, kShuffleMaskRAWToARGB[16]) =
+  { 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u };
 
 #if defined(WIN32) && !defined(COVERAGE_ENABLED)
 #define HAS_SPLITUV_SSE2
@@ -89,118 +99,40 @@ static void SplitUV_SSE2(const uint8* src_uv,
   }
 }
 
-#define HAS_SPLITUV_SSSE3
-__declspec(naked)
-static void SplitUV_SSSE3(const uint8* src_uv,
-                          uint8* dst_u, uint8* dst_v, int pix) {
-  __asm {
-    push       edi
-    mov        eax, [esp + 4 + 4]       // src_uv
-    mov        edx, [esp + 4 + 8]       // dst_u
-    mov        edi, [esp + 4 + 12]      // dst_v
-    mov        ecx, [esp + 4 + 16]      // pix
-    movdqa     xmm7, _shufevenodd
-
-  wloop:
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
-    lea        eax, [eax + 32]
-    pshufb     xmm0, xmm7               // 8 u's and 8 v's
-    pshufb     xmm1, xmm7               // 8 u's and 8 v's
-    movdqa     xmm2, xmm0
-    punpcklqdq xmm0, xmm1               // 16 u's
-    punpckhqdq xmm2, xmm1               // 16 v's
-    movdqa     [edx], xmm0
-    lea        edx, [edx + 16]
-    movdqa     [edi], xmm2
-    lea        edi, [edi + 16]
-    sub        ecx, 16
-    ja         wloop
-    pop        edi
-    ret
-  }
-}
-#elif defined(__i386__) && !defined(COVERAGE_ENABLED) && \
-    !TARGET_IPHONE_SIMULATOR
+#elif (defined(__x86_64__) || defined(__i386__)) && \
+    !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
 #define HAS_SPLITUV_SSE2
-extern "C" void SplitUV_SSE2(const uint8* src_uv,
-                             uint8* dst_u, uint8* dst_v, int pix);
+static void SplitUV_SSE2(const uint8* src_uv,
+                         uint8* dst_u, uint8* dst_v, int pix) {
   asm(
-    ".text\n"
-#if defined(OSX)
-    ".globl _SplitUV_SSE2\n"
-"_SplitUV_SSE2:\n"
-#else
-    ".global SplitUV_SSE2\n"
-"SplitUV_SSE2:\n"
-#endif
-    "push   %edi\n"
-    "mov    0x8(%esp),%eax\n"
-    "mov    0xc(%esp),%edx\n"
-    "mov    0x10(%esp),%edi\n"
-    "mov    0x14(%esp),%ecx\n"
-    "pcmpeqb %xmm7,%xmm7\n"
-    "psrlw  $0x8,%xmm7\n"
-
+  "pcmpeqb    %%xmm7,%%xmm7\n"
+  "psrlw      $0x8,%%xmm7\n"
 "1:"
-    "movdqa (%eax),%xmm0\n"
-    "movdqa 0x10(%eax),%xmm1\n"
-    "lea    0x20(%eax),%eax\n"
-    "movdqa %xmm0,%xmm2\n"
-    "movdqa %xmm1,%xmm3\n"
-    "pand   %xmm7,%xmm0\n"
-    "pand   %xmm7,%xmm1\n"
-    "packuswb %xmm1,%xmm0\n"
-    "movdqa %xmm0,(%edx)\n"
-    "lea    0x10(%edx),%edx\n"
-    "psrlw  $0x8,%xmm2\n"
-    "psrlw  $0x8,%xmm3\n"
-    "packuswb %xmm3,%xmm2\n"
-    "movdqa %xmm2,(%edi)\n"
-    "lea    0x10(%edi),%edi\n"
-    "sub    $0x10,%ecx\n"
-    "ja     1b\n"
-    "pop    %edi\n"
-    "ret\n"
-);
-
-#define HAS_SPLITUV_SSSE3
-extern "C" void SplitUV_SSSE3(const uint8* src_uv,
-                             uint8* dst_u, uint8* dst_v, int pix);
-  asm(
-    ".text\n"
-#if defined(OSX)
-    ".globl _SplitUV_SSSE3\n"
-"_SplitUV_SSSE3:\n"
-#else
-    ".global SplitUV_SSSE3\n"
-"SplitUV_SSSE3:\n"
-#endif
-    "push   %edi\n"
-    "mov    0x8(%esp),%eax\n"
-    "mov    0xc(%esp),%edx\n"
-    "mov    0x10(%esp),%edi\n"
-    "mov    0x14(%esp),%ecx\n"
-    "movdqa _shufevenodd,%xmm7\n"
-
-"1:"
-    "movdqa (%eax),%xmm0\n"
-    "movdqa 0x10(%eax),%xmm1\n"
-    "lea    0x20(%eax),%eax\n"
-    "pshufb %xmm7,%xmm0\n"
-    "pshufb %xmm7,%xmm1\n"
-    "movdqa %xmm0,%xmm2\n"
-    "punpcklqdq %xmm1,%xmm0\n"
-    "punpckhqdq %xmm1,%xmm2\n"
-    "movdqa %xmm0,(%edx)\n"
-    "lea    0x10(%edx),%edx\n"
-    "movdqa %xmm2,(%edi)\n"
-    "lea    0x10(%edi),%edi\n"
-    "sub    $0x10,%ecx\n"
-    "ja     1b\n"
-    "pop    %edi\n"
-    "ret\n"
+  "movdqa     (%0),%%xmm0\n"
+  "movdqa     0x10(%0),%%xmm1\n"
+  "lea        0x20(%0),%0\n"
+  "movdqa     %%xmm0,%%xmm2\n"
+  "movdqa     %%xmm1,%%xmm3\n"
+  "pand       %%xmm7,%%xmm0\n"
+  "pand       %%xmm7,%%xmm1\n"
+  "packuswb   %%xmm1,%%xmm0\n"
+  "movdqa     %%xmm0,(%1)\n"
+  "lea        0x10(%1),%1\n"
+  "psrlw      $0x8,%%xmm2\n"
+  "psrlw      $0x8,%%xmm3\n"
+  "packuswb   %%xmm3,%%xmm2\n"
+  "movdqa     %%xmm2,(%2)\n"
+  "lea        0x10(%2),%2\n"
+  "sub        $0x10,%3\n"
+  "ja         1b\n"
+  : "+r"(src_uv),    // %0: advanced by lea, so it must be read-write
+    "+r"(dst_u),     // %1: advanced by lea, so it must be read-write
+    "+r"(dst_v),     // %2: advanced by lea, so it must be read-write
+    "+r"(pix)        // %3: decremented by sub, so it must be read-write
+  :                  // no input-only operands
+  : "memory", "cc"
 );
+}
 #endif
 #endif
 
@@ -216,28 +148,28 @@ static void SplitUV_C(const uint8* src_uv,
   }
 }
 
-static void I420CopyPlane(const uint8* src_y, int src_pitch_y,
-                          uint8* dst_y, int dst_pitch_y,
+static void I420CopyPlane(const uint8* src_y, int src_stride_y,
+                          uint8* dst_y, int dst_stride_y,
                           int width, int height) {
   // Copy plane
   for (int y = 0; y < height; ++y) {
     memcpy(dst_y, src_y, width);
-    src_y += src_pitch_y;
-    dst_y += dst_pitch_y;
+    src_y += src_stride_y;
+    dst_y += dst_stride_y;
   }
 }
 
-static void I420CopyPlane2(const uint8* src, int src_pitch_0, int src_pitch_1,
-                           uint8* dst, int dst_pitch,
+static void I420CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1,
+                           uint8* dst, int dst_stride,
                            int width, int height) {
   // Copy plane
   for (int y = 0; y < height; y += 2) {
     memcpy(dst, src, width);
-    src += src_pitch_0;
-    dst += dst_pitch;
+    src += src_stride_0;
+    dst += dst_stride;
     memcpy(dst, src, width);
-    src += src_pitch_1;
-    dst += dst_pitch;
+    src += src_stride_1;
+    dst += dst_stride;
   }
 }
 
@@ -249,81 +181,83 @@ static void I420CopyPlane2(const uint8* src, int src_pitch_0, int src_pitch_1,
 
 // Helper function to copy yuv data without scaling.  Used
 // by our jpeg conversion callbacks to incrementally fill a yuv image.
-void I420Copy(const uint8* src_y, int src_pitch_y,
-              const uint8* src_u, int src_pitch_u,
-              const uint8* src_v, int src_pitch_v,
-              uint8* dst_y, int dst_pitch_y,
-              uint8* dst_u, int dst_pitch_u,
-              uint8* dst_v, int dst_pitch_v,
-              int width, int height) {
+int I420Copy(const uint8* src_y, int src_stride_y,
+             const uint8* src_u, int src_stride_u,
+             const uint8* src_v, int src_stride_v,
+             uint8* dst_y, int dst_stride_y,
+             uint8* dst_u, int dst_stride_u,
+             uint8* dst_v, int dst_stride_v,
+             int width, int height) {
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_y = src_y + (height - 1) * src_pitch_y;
-    src_u = src_u + (height - 1) * src_pitch_u;
-    src_v = src_v + (height - 1) * src_pitch_v;
-    src_pitch_y = -src_pitch_y;
-    src_pitch_u = -src_pitch_u;
-    src_pitch_v = -src_pitch_v;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (height - 1) * src_stride_u;
+    src_v = src_v + (height - 1) * src_stride_v;
+    src_stride_y = -src_stride_y;
+    src_stride_u = -src_stride_u;
+    src_stride_v = -src_stride_v;
   }
 
   int halfwidth = (width + 1) >> 1;
   int halfheight = (height + 1) >> 1;
-  I420CopyPlane(src_y, src_pitch_y, dst_y, dst_pitch_y, width, height);
-  I420CopyPlane(src_u, src_pitch_u, dst_u, dst_pitch_u, halfwidth, halfheight);
-  I420CopyPlane(src_v, src_pitch_v, dst_v, dst_pitch_v, halfwidth, halfheight);
+  I420CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  I420CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight);
+  I420CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight);
+  return 0;
 }
 
 // Helper function to copy yuv data without scaling.  Used
 // by our jpeg conversion callbacks to incrementally fill a yuv image.
-void I422ToI420(const uint8* src_y, int src_pitch_y,
-                const uint8* src_u, int src_pitch_u,
-                const uint8* src_v, int src_pitch_v,
-                uint8* dst_y, int dst_pitch_y,
-                uint8* dst_u, int dst_pitch_u,
-                uint8* dst_v, int dst_pitch_v,
-                int width, int height) {
+int I422ToI420(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_y = src_y + (height - 1) * src_pitch_y;
-    src_u = src_u + (height - 1) * src_pitch_u;
-    src_v = src_v + (height - 1) * src_pitch_v;
-    src_pitch_y = -src_pitch_y;
-    src_pitch_u = -src_pitch_u;
-    src_pitch_v = -src_pitch_v;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (height - 1) * src_stride_u;
+    src_v = src_v + (height - 1) * src_stride_v;
+    src_stride_y = -src_stride_y;
+    src_stride_u = -src_stride_u;
+    src_stride_v = -src_stride_v;
   }
 
   // Copy Y plane
-  I420CopyPlane(src_y, src_pitch_y, dst_y, dst_pitch_y, width, height);
+  I420CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
 
   // SubSample UV planes.
   int x, y;
   int halfwidth = (width + 1) >> 1;
   for (y = 0; y < height; y += 2) {
     const uint8* u0 = src_u;
-    const uint8* u1 = src_u + src_pitch_u;
+    const uint8* u1 = src_u + src_stride_u;
     if ((y + 1) >= height) {
       u1 = u0;
     }
     for (x = 0; x < halfwidth; ++x) {
       dst_u[x] = (u0[x] + u1[x] + 1) >> 1;
     }
-    src_u += src_pitch_u * 2;
-    dst_u += dst_pitch_u;
+    src_u += src_stride_u * 2;
+    dst_u += dst_stride_u;
   }
   for (y = 0; y < height; y += 2) {
     const uint8* v0 = src_v;
-    const uint8* v1 = src_v + src_pitch_v;
+    const uint8* v1 = src_v + src_stride_v;
     if ((y + 1) >= height) {
       v1 = v0;
     }
     for (x = 0; x < halfwidth; ++x) {
       dst_v[x] = (v0[x] + v1[x] + 1) >> 1;
     }
-    src_v += src_pitch_v * 2;
-    dst_v += dst_pitch_v;
+    src_v += src_stride_v * 2;
+    dst_v += dst_stride_v;
   }
+  return 0;
 }
 
 // Support converting from FOURCC_M420
@@ -332,26 +266,26 @@ void I422ToI420(const uint8* src_y, int src_pitch_y,
 // M420 format description:
 // M420 is row biplanar 420: 2 rows of Y and 1 row of VU.
 // Chroma is half width / half height. (420)
-// src_pitch_m420 is row planar.  Normally this will be the width in pixels.
-//   The UV plane is half width, but 2 values, so src_pitch_m420 applies to this
+// src_stride_m420 is row planar.  Normally this will be the width in pixels.
+//   The UV plane is half width, but 2 values, so src_stride_m420 applies to this
 //   as well as the two Y planes.
 // TODO(fbarchard): Do NV21/NV12 formats with this function
-static void X420ToI420(const uint8* src_y,
-                       int src_pitch_y0, int src_pitch_y1,
-                       const uint8* src_uv, int src_pitch_uv,
-                       uint8* dst_y, int dst_pitch_y,
-                       uint8* dst_u, int dst_pitch_u,
-                       uint8* dst_v, int dst_pitch_v,
-                       int width, int height) {
+static int X420ToI420(const uint8* src_y,
+                      int src_stride_y0, int src_stride_y1,
+                      const uint8* src_uv, int src_stride_uv,
+                      uint8* dst_y, int dst_stride_y,
+                      uint8* dst_u, int dst_stride_u,
+                      uint8* dst_v, int dst_stride_v,
+                      int width, int height) {
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_y = dst_y + (height - 1) * dst_pitch_y;
-    dst_u = dst_u + (height - 1) * dst_pitch_u;
-    dst_v = dst_v + (height - 1) * dst_pitch_v;
-    dst_pitch_y = -dst_pitch_y;
-    dst_pitch_u = -dst_pitch_u;
-    dst_pitch_v = -dst_pitch_v;
+    dst_y = dst_y + (height - 1) * dst_stride_y;
+    dst_u = dst_u + (height - 1) * dst_stride_u;
+    dst_v = dst_v + (height - 1) * dst_stride_v;
+    dst_stride_y = -dst_stride_y;
+    dst_stride_u = -dst_stride_u;
+    dst_stride_v = -dst_stride_v;
   }
 
   int halfwidth = (width + 1) >> 1;
@@ -359,25 +293,17 @@ static void X420ToI420(const uint8* src_y,
 #if defined(HAS_SPLITUV_NEON)
   if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) &&
       (halfwidth % 16 == 0) &&
-      IS_ALIGNED(src_uv, 16) && (src_pitch_uv % 16 == 0) &&
-      IS_ALIGNED(dst_u, 16) && (dst_pitch_u % 16 == 0) &&
-      IS_ALIGNED(dst_v, 16) && (dst_pitch_v % 16 == 0)) {
+      IS_ALIGNED(src_uv, 16) && (src_stride_uv % 16 == 0) &&
+      IS_ALIGNED(dst_u, 16) && (dst_stride_u % 16 == 0) &&
+      IS_ALIGNED(dst_v, 16) && (dst_stride_v % 16 == 0)) {
     SplitUV = SplitUV_NEON;
   } else
-#elif defined(HAS_SPLITUV_SSSE3)
-  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
-      (halfwidth % 16 == 0) &&
-      IS_ALIGNED(src_uv, 16) && (src_pitch_uv % 16 == 0) &&
-      IS_ALIGNED(dst_u, 16) && (dst_pitch_u % 16 == 0) &&
-      IS_ALIGNED(dst_v, 16) && (dst_pitch_v % 16 == 0)) {
-    SplitUV = SplitUV_SSSE3;
-  } else
 #elif defined(HAS_SPLITUV_SSE2)
   if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
       (halfwidth % 16 == 0) &&
-      IS_ALIGNED(src_uv, 16) && (src_pitch_uv % 16 == 0) &&
-      IS_ALIGNED(dst_u, 16) && (dst_pitch_u % 16 == 0) &&
-      IS_ALIGNED(dst_v, 16) && (dst_pitch_v % 16 == 0)) {
+      IS_ALIGNED(src_uv, 16) && (src_stride_uv % 16 == 0) &&
+      IS_ALIGNED(dst_u, 16) && (dst_stride_u % 16 == 0) &&
+      IS_ALIGNED(dst_v, 16) && (dst_stride_v % 16 == 0)) {
     SplitUV = SplitUV_SSE2;
   } else
 #endif
@@ -385,43 +311,48 @@ static void X420ToI420(const uint8* src_y,
     SplitUV = SplitUV_C;
   }
 
-  I420CopyPlane2(src_y, src_pitch_y0, src_pitch_y1, dst_y, dst_pitch_y,
+  I420CopyPlane2(src_y, src_stride_y0, src_stride_y1, dst_y, dst_stride_y,
                  width, height);
 
   int halfheight = (height + 1) >> 1;
   for (int y = 0; y < halfheight; ++y) {
     // Copy a row of UV.
     SplitUV(src_uv, dst_u, dst_v, halfwidth);
-    dst_u += dst_pitch_u;
-    dst_v += dst_pitch_v;
-    src_uv += src_pitch_uv;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+    src_uv += src_stride_uv;
   }
+  return 0;
 }
 
 // Convert M420 to I420.
-void M420ToI420(const uint8* src_m420, int src_pitch_m420,
-                uint8* dst_y, int dst_pitch_y,
-                uint8* dst_u, int dst_pitch_u,
-                uint8* dst_v, int dst_pitch_v,
-                int width, int height) {
-  X420ToI420(src_m420, src_pitch_m420, src_pitch_m420 * 2,
-             src_m420 + src_pitch_m420 * 2, src_pitch_m420 * 3,
-             dst_y, dst_pitch_y, dst_u, dst_pitch_u, dst_v, dst_pitch_v,
-             width, height);
+int M420ToI420(const uint8* src_m420, int src_stride_m420,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  return X420ToI420(src_m420, src_stride_m420, src_stride_m420 * 2,
+                    src_m420 + src_stride_m420 * 2, src_stride_m420 * 3,
+                    dst_y, dst_stride_y,
+                    dst_u, dst_stride_u,
+                    dst_v, dst_stride_v,
+                    width, height);
 }
 
 // Convert NV12 to I420.
-void NV12ToI420(const uint8* src_y,
-                const uint8* src_uv,
-                int src_pitch,
-                uint8* dst_y, int dst_pitch_y,
-                uint8* dst_u, int dst_pitch_u,
-                uint8* dst_v, int dst_pitch_v,
-                int width, int height) {
-  X420ToI420(src_y, src_pitch, src_pitch,
-             src_uv, src_pitch,
-             dst_y, dst_pitch_y, dst_u, dst_pitch_u, dst_v, dst_pitch_v,
-             width, height);
+int NV12ToI420(const uint8* src_y,
+               const uint8* src_uv,
+               int src_stride,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  return X420ToI420(src_y, src_stride, src_stride,
+                    src_uv, src_stride,
+                    dst_y, dst_stride_y,
+                    dst_u, dst_stride_u,
+                    dst_v, dst_stride_v,
+                    width, height);
 }
 
 #if defined(WIN32) && !defined(COVERAGE_ENABLED)
@@ -471,59 +402,48 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2,
     ret
   }
 }
-#elif defined(__i386__) && !defined(COVERAGE_ENABLED) && \
-    !TARGET_IPHONE_SIMULATOR
+#elif (defined(__x86_64__) || defined(__i386__)) && \
+    !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
 #define HAS_SPLITYUY2_SSE2
-extern "C" void SplitYUY2_SSE2(const uint8* src_yuy2, uint8* dst_y,
-                               uint8* dst_u, uint8* dst_v, int pix);
+static void SplitYUY2_SSE2(const uint8* src_yuy2, uint8* dst_y,
+                           uint8* dst_u, uint8* dst_v, int pix) {
   asm(
-    ".text\n"
-#if defined(OSX)
-    ".globl _SplitYUY2_SSE2\n"
-"_SplitYUY2_SSE2:\n"
-#else
-    ".global SplitYUY2_SSE2\n"
-"SplitYUY2_SSE2:\n"
-#endif
-    "push   %esi\n"
-    "push   %edi\n"
-    "mov    0xc(%esp),%eax\n"
-    "mov    0x10(%esp),%edx\n"
-    "mov    0x14(%esp),%esi\n"
-    "mov    0x18(%esp),%edi\n"
-    "mov    0x1c(%esp),%ecx\n"
-    "pcmpeqb %xmm7,%xmm7\n"
-    "psrlw  $0x8,%xmm7\n"
-
+  "pcmpeqb    %%xmm7,%%xmm7\n"
+  "psrlw      $0x8,%%xmm7\n"
 "1:"
-    "movdqa (%eax),%xmm0\n"
-    "movdqa 0x10(%eax),%xmm1\n"
-    "lea    0x20(%eax),%eax\n"
-    "movdqa %xmm0,%xmm2\n"
-    "movdqa %xmm1,%xmm3\n"
-    "pand   %xmm7,%xmm2\n"
-    "pand   %xmm7,%xmm3\n"
-    "packuswb %xmm3,%xmm2\n"
-    "movdqa %xmm2,(%edx)\n"
-    "lea    0x10(%edx),%edx\n"
-    "psrlw  $0x8,%xmm0\n"
-    "psrlw  $0x8,%xmm1\n"
-    "packuswb %xmm1,%xmm0\n"
-    "movdqa %xmm0,%xmm1\n"
-    "pand   %xmm7,%xmm0\n"
-    "packuswb %xmm0,%xmm0\n"
-    "movq   %xmm0,(%esi)\n"
-    "lea    0x8(%esi),%esi\n"
-    "psrlw  $0x8,%xmm1\n"
-    "packuswb %xmm1,%xmm1\n"
-    "movq   %xmm1,(%edi)\n"
-    "lea    0x8(%edi),%edi\n"
-    "sub    $0x10,%ecx\n"
-    "ja     1b\n"
-    "pop    %edi\n"
-    "pop    %esi\n"
-    "ret\n"
+  "movdqa     (%0),%%xmm0\n"
+  "movdqa     0x10(%0),%%xmm1\n"
+  "lea        0x20(%0),%0\n"
+  "movdqa     %%xmm0,%%xmm2\n"
+  "movdqa     %%xmm1,%%xmm3\n"
+  "pand       %%xmm7,%%xmm2\n"
+  "pand       %%xmm7,%%xmm3\n"
+  "packuswb   %%xmm3,%%xmm2\n"
+  "movdqa     %%xmm2,(%1)\n"
+  "lea        0x10(%1),%1\n"
+  "psrlw      $0x8,%%xmm0\n"
+  "psrlw      $0x8,%%xmm1\n"
+  "packuswb   %%xmm1,%%xmm0\n"
+  "movdqa     %%xmm0,%%xmm1\n"
+  "pand       %%xmm7,%%xmm0\n"
+  "packuswb   %%xmm0,%%xmm0\n"
+  "movq       %%xmm0,(%2)\n"
+  "lea        0x8(%2),%2\n"
+  "psrlw      $0x8,%%xmm1\n"
+  "packuswb   %%xmm1,%%xmm1\n"
+  "movq       %%xmm1,(%3)\n"
+  "lea        0x8(%3),%3\n"
+  "sub        $0x10,%4\n"
+  "ja         1b\n"
+  : "+r"(src_yuy2),   // %0: advanced by lea, so it must be read-write
+    "+r"(dst_y),      // %1: advanced by lea, so it must be read-write
+    "+r"(dst_u),      // %2: advanced by lea, so it must be read-write
+    "+r"(dst_v),      // %3: advanced by lea, so it must be read-write
+    "+r"(pix)         // %4: decremented by sub, so it must be read-write
+  :                   // no input-only operands
+  : "memory", "cc"
 );
+}
 #endif
 
 static void SplitYUY2_C(const uint8* src_yuy2,
@@ -543,21 +463,21 @@ static void SplitYUY2_C(const uint8* src_yuy2,
 
 // Convert Q420 to I420.
 // Format is rows of YY/YUYV
-void Q420ToI420(const uint8* src_y, int src_pitch_y,
-                const uint8* src_yuy2, int src_pitch_yuy2,
-                uint8* dst_y, int dst_pitch_y,
-                uint8* dst_u, int dst_pitch_u,
-                uint8* dst_v, int dst_pitch_v,
-                int width, int height) {
+int Q420ToI420(const uint8* src_y, int src_stride_y,
+               const uint8* src_yuy2, int src_stride_yuy2,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
   void (*SplitYUY2)(const uint8* src_yuy2,
                     uint8* dst_y, uint8* dst_u, uint8* dst_v, int pix);
 #if defined(HAS_SPLITYUY2_SSE2)
   if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
       (width % 16 == 0) &&
-      IS_ALIGNED(src_yuy2, 16) && (src_pitch_yuy2 % 16 == 0) &&
-      IS_ALIGNED(dst_y, 16) && (dst_pitch_y % 16 == 0) &&
-      IS_ALIGNED(dst_u, 8) && (dst_pitch_u % 8 == 0) &&
-      IS_ALIGNED(dst_v, 8) && (dst_pitch_v % 8 == 0)) {
+      IS_ALIGNED(src_yuy2, 16) && (src_stride_yuy2 % 16 == 0) &&
+      IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0) &&
+      IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
+      IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) {
     SplitYUY2 = SplitYUY2_SSE2;
   } else
 #endif
@@ -566,16 +486,17 @@ void Q420ToI420(const uint8* src_y, int src_pitch_y,
   }
   for (int y = 0; y < height; y += 2) {
     memcpy(dst_y, src_y, width);
-    dst_y += dst_pitch_y;
-    src_y += src_pitch_y;
+    dst_y += dst_stride_y;
+    src_y += src_stride_y;
 
     // Copy a row of YUY2.
     SplitYUY2(src_yuy2, dst_y, dst_u, dst_v, width);
-    dst_y += dst_pitch_y;
-    dst_u += dst_pitch_u;
-    dst_v += dst_pitch_v;
-    src_yuy2 += src_pitch_yuy2;
+    dst_y += dst_stride_y;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+    src_yuy2 += src_stride_yuy2;
   }
+  return 0;
 }
 
 #if defined(WIN32) && !defined(COVERAGE_ENABLED)
@@ -606,13 +527,13 @@ void YUY2ToI420RowY_SSE2(const uint8* src_yuy2,
 }
 
 __declspec(naked)
-void YUY2ToI420RowUV_SSE2(const uint8* src_yuy2, int pitch_yuy2,
+void YUY2ToI420RowUV_SSE2(const uint8* src_yuy2, int stride_yuy2,
                           uint8* dst_u, uint8* dst_y, int pix) {
   __asm {
     push       esi
     push       edi
     mov        eax, [esp + 8 + 4]    // src_yuy2
-    mov        esi, [esp + 8 + 8]    // pitch_yuy2
+    mov        esi, [esp + 8 + 8]    // stride_yuy2
     mov        edx, [esp + 8 + 12]   // dst_u
     mov        edi, [esp + 8 + 16]   // dst_v
     mov        ecx, [esp + 8 + 20]   // pix
@@ -673,13 +594,13 @@ void UYVYToI420RowY_SSE2(const uint8* src_uyvy,
 }
 
 __declspec(naked)
-void UYVYToI420RowUV_SSE2(const uint8* src_uyvy, int pitch_uyvy,
+void UYVYToI420RowUV_SSE2(const uint8* src_uyvy, int stride_uyvy,
                           uint8* dst_u, uint8* dst_y, int pix) {
   __asm {
     push       esi
     push       edi
     mov        eax, [esp + 8 + 4]    // src_yuy2
-    mov        esi, [esp + 8 + 8]    // pitch_yuy2
+    mov        esi, [esp + 8 + 8]    // stride_uyvy
     mov        edx, [esp + 8 + 12]   // dst_u
     mov        edi, [esp + 8 + 16]   // dst_v
     mov        ecx, [esp + 8 + 20]   // pix
@@ -714,174 +635,138 @@ void UYVYToI420RowUV_SSE2(const uint8* src_uyvy, int pitch_uyvy,
     ret
   }
 }
-#elif defined(__i386__) && !defined(COVERAGE_ENABLED) && \
-    !TARGET_IPHONE_SIMULATOR
+
+#elif (defined(__x86_64__) || defined(__i386__)) && \
+    !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
 
 #define HAS_YUY2TOI420ROW_SSE2
-extern "C" void YUY2ToI420RowY_SSE2(const uint8* src_yuy2,
-                                    uint8* dst_y, int pix);
+static void YUY2ToI420RowY_SSE2(const uint8* src_yuy2,
+                                uint8* dst_y, int pix) {
   asm(
-    ".text\n"
-#if defined(OSX)
-    ".globl _YUY2ToI420RowY_SSE2\n"
-"_YUY2ToI420RowY_SSE2:\n"
-#else
-    ".global YUY2ToI420RowY_SSE2\n"
-"YUY2ToI420RowY_SSE2:\n"
-#endif
-    "mov    0x4(%esp),%eax\n"
-    "mov    0x8(%esp),%edx\n"
-    "mov    0xc(%esp),%ecx\n"
-    "pcmpeqb %xmm7,%xmm7\n"
-    "psrlw  $0x8,%xmm7\n"
-
+  "pcmpeqb    %%xmm7,%%xmm7\n"
+  "psrlw      $0x8,%%xmm7\n"
 "1:"
-    "movdqa (%eax),%xmm0\n"
-    "movdqa 0x10(%eax),%xmm1\n"
-    "lea    0x20(%eax),%eax\n"
-    "pand   %xmm7,%xmm0\n"
-    "pand   %xmm7,%xmm1\n"
-    "packuswb %xmm1,%xmm0\n"
-    "movdqa %xmm0,(%edx)\n"
-    "lea    0x10(%edx),%edx\n"
-    "sub    $0x10,%ecx\n"
-    "ja     1b\n"
-    "ret\n"
+  "movdqa     (%0),%%xmm0\n"
+  "movdqa     0x10(%0),%%xmm1\n"
+  "lea        0x20(%0),%0\n"
+  "pand       %%xmm7,%%xmm0\n"
+  "pand       %%xmm7,%%xmm1\n"
+  "packuswb   %%xmm1,%%xmm0\n"
+  "movdqa     %%xmm0,(%1)\n"
+  "lea        0x10(%1),%1\n"
+  "sub        $0x10,%2\n"
+  "ja         1b\n"
+  : "+r"(src_yuy2), // %0: advanced by lea, so it must be read-write
+    "+r"(dst_y),    // %1: advanced by lea, so it must be read-write
+    "+r"(pix)       // %2: decremented by sub, so it must be read-write
+  :                 // no input-only operands
+  : "memory", "cc"
 );
+}
 
-extern "C" void YUY2ToI420RowUV_SSE2(const uint8* src_yuy2, int pitch_yuy2,
-                                     uint8* dst_u, uint8* dst_y, int pix);
+static void YUY2ToI420RowUV_SSE2(const uint8* src_yuy2, int stride_yuy2,
+                                 uint8* dst_u, uint8* dst_y, int pix) {
   asm(
-    ".text\n"
-#if defined(OSX)
-    ".globl _YUY2ToI420RowUV_SSE2\n"
-"_YUY2ToI420RowUV_SSE2:\n"
-#else
-    ".global YUY2ToI420RowUV_SSE2\n"
-"YUY2ToI420RowUV_SSE2:\n"
-#endif
-    "push   %esi\n"
-    "push   %edi\n"
-    "mov    0xc(%esp),%eax\n"
-    "mov    0x10(%esp),%esi\n"
-    "mov    0x14(%esp),%edx\n"
-    "mov    0x18(%esp),%edi\n"
-    "mov    0x1c(%esp),%ecx\n"
-    "pcmpeqb %xmm7,%xmm7\n"
-    "psrlw  $0x8,%xmm7\n"
-
+  "pcmpeqb    %%xmm7,%%xmm7\n"
+  "psrlw      $0x8,%%xmm7\n"
 "1:"
-    "movdqa (%eax),%xmm0\n"
-    "movdqa 0x10(%eax),%xmm1\n"
-    "movdqa (%eax,%esi,1),%xmm2\n"
-    "movdqa 0x10(%eax,%esi,1),%xmm3\n"
-    "lea    0x20(%eax),%eax\n"
-    "pavgb  %xmm2,%xmm0\n"
-    "pavgb  %xmm3,%xmm1\n"
-    "psrlw  $0x8,%xmm0\n"
-    "psrlw  $0x8,%xmm1\n"
-    "packuswb %xmm1,%xmm0\n"
-    "movdqa %xmm0,%xmm1\n"
-    "pand   %xmm7,%xmm0\n"
-    "packuswb %xmm0,%xmm0\n"
-    "movq   %xmm0,(%edx)\n"
-    "lea    0x8(%edx),%edx\n"
-    "psrlw  $0x8,%xmm1\n"
-    "packuswb %xmm1,%xmm1\n"
-    "movq   %xmm1,(%edi)\n"
-    "lea    0x8(%edi),%edi\n"
-    "sub    $0x10,%ecx\n"
-    "ja     1b\n"
-    "pop    %edi\n"
-    "pop    %esi\n"
-    "ret\n"
+  "movdqa     (%0),%%xmm0\n"
+  "movdqa     0x10(%0),%%xmm1\n"
+  "movdqa     (%0,%4,1),%%xmm2\n"
+  "movdqa     0x10(%0,%4,1),%%xmm3\n"
+  "lea        0x20(%0),%0\n"
+  "pavgb      %%xmm2,%%xmm0\n"
+  "pavgb      %%xmm3,%%xmm1\n"
+  "psrlw      $0x8,%%xmm0\n"
+  "psrlw      $0x8,%%xmm1\n"
+  "packuswb   %%xmm1,%%xmm0\n"
+  "movdqa     %%xmm0,%%xmm1\n"
+  "pand       %%xmm7,%%xmm0\n"
+  "packuswb   %%xmm0,%%xmm0\n"
+  "movq       %%xmm0,(%1)\n"
+  "lea        0x8(%1),%1\n"
+  "psrlw      $0x8,%%xmm1\n"
+  "packuswb   %%xmm1,%%xmm1\n"
+  "movq       %%xmm1,(%2)\n"
+  "lea        0x8(%2),%2\n"
+  "sub        $0x10,%3\n"
+  "ja         1b\n"
+  // The loop updates the pointers and the count, so they are read-write.
+  : "+r"(src_yuy2),    // %0
+    "+r"(dst_u),       // %1
+    "+r"(dst_y),       // %2 (receives the V samples)
+    "+r"(pix)          // %3
+  : "r"((intptr_t)stride_yuy2)  // %4: byte offset to the second source row
+  : "memory", "cc"
 );
-
+}
 #define HAS_UYVYTOI420ROW_SSE2
-extern "C" void UYVYToI420RowY_SSE2(const uint8* src_uyvy,
-                                    uint8* dst_y, int pix);
+static void UYVYToI420RowY_SSE2(const uint8* src_uyvy,
+                                uint8* dst_y, int pix) {
   asm(
-    ".text\n"
-#if defined(OSX)
-    ".globl _UYVYToI420RowY_SSE2\n"
-"_UYVYToI420RowY_SSE2:\n"
-#else
-    ".global UYVYToI420RowY_SSE2\n"
-"UYVYToI420RowY_SSE2:\n"
-#endif
-    "mov    0x4(%esp),%eax\n"
-    "mov    0x8(%esp),%edx\n"
-    "mov    0xc(%esp),%ecx\n"
-
 "1:"
-    "movdqa (%eax),%xmm0\n"
-    "movdqa 0x10(%eax),%xmm1\n"
-    "lea    0x20(%eax),%eax\n"
-    "psrlw  $0x8,%xmm0\n"
-    "psrlw  $0x8,%xmm1\n"
-    "packuswb %xmm1,%xmm0\n"
-    "movdqa %xmm0,(%edx)\n"
-    "lea    0x10(%edx),%edx\n"
-    "sub    $0x10,%ecx\n"
-    "ja     1b\n"
-    "ret\n"
+  "movdqa     (%0),%%xmm0\n"
+  "movdqa     0x10(%0),%%xmm1\n"
+  "lea        0x20(%0),%0\n"
+  "psrlw      $0x8,%%xmm0\n"
+  "psrlw      $0x8,%%xmm1\n"
+  "packuswb   %%xmm1,%%xmm0\n"
+  "movdqa     %%xmm0,(%1)\n"
+  "lea        0x10(%1),%1\n"
+  "sub        $0x10,%2\n"
+  "ja         1b\n"
+  : "+r"(src_uyvy), // %0: advanced by lea, so it must be read-write
+    "+r"(dst_y),    // %1: advanced by lea, so it must be read-write
+    "+r"(pix)       // %2: decremented by sub, so it must be read-write
+  :                 // no input-only operands
+  : "memory", "cc"
 );
+}
 
-extern "C" void UYVYToI420RowUV_SSE2(const uint8* src_uyvy, int pitch_uyvy,
-                                     uint8* dst_u, uint8* dst_y, int pix);
+static void UYVYToI420RowUV_SSE2(const uint8* src_uyvy, int stride_uyvy,
+                                 uint8* dst_u, uint8* dst_y, int pix) {
   asm(
-    ".text\n"
-#if defined(OSX)
-    ".globl _UYVYToI420RowUV_SSE2\n"
-"_UYVYToI420RowUV_SSE2:\n"
-#else
-    ".global UYVYToI420RowUV_SSE2\n"
-"UYVYToI420RowUV_SSE2:\n"
-#endif
-    "push   %esi\n"
-    "push   %edi\n"
-    "mov    0xc(%esp),%eax\n"
-    "mov    0x10(%esp),%esi\n"
-    "mov    0x14(%esp),%edx\n"
-    "mov    0x18(%esp),%edi\n"
-    "mov    0x1c(%esp),%ecx\n"
-    "pcmpeqb %xmm7,%xmm7\n"
-    "psrlw  $0x8,%xmm7\n"
-
+  "pcmpeqb    %%xmm7,%%xmm7\n"
+  "psrlw      $0x8,%%xmm7\n"
 "1:"
-    "movdqa (%eax),%xmm0\n"
-    "movdqa 0x10(%eax),%xmm1\n"
-    "movdqa (%eax,%esi,1),%xmm2\n"
-    "movdqa 0x10(%eax,%esi,1),%xmm3\n"
-    "lea    0x20(%eax),%eax\n"
-    "pavgb  %xmm2,%xmm0\n"
-    "pavgb  %xmm3,%xmm1\n"
-    "pand   %xmm7,%xmm0\n"
-    "pand   %xmm7,%xmm1\n"
-    "packuswb %xmm1,%xmm0\n"
-    "movdqa %xmm0,%xmm1\n"
-    "pand   %xmm7,%xmm0\n"
-    "packuswb %xmm0,%xmm0\n"
-    "movq   %xmm0,(%edx)\n"
-    "lea    0x8(%edx),%edx\n"
-    "psrlw  $0x8,%xmm1\n"
-    "packuswb %xmm1,%xmm1\n"
-    "movq   %xmm1,(%edi)\n"
-    "lea    0x8(%edi),%edi\n"
-    "sub    $0x10,%ecx\n"
-    "ja     1b\n"
-    "pop    %edi\n"
-    "pop    %esi\n"
-    "ret\n"
+  "movdqa     (%0),%%xmm0\n"
+  "movdqa     0x10(%0),%%xmm1\n"
+  "movdqa     (%0,%4,1),%%xmm2\n"
+  "movdqa     0x10(%0,%4,1),%%xmm3\n"
+  "lea        0x20(%0),%0\n"
+  "pavgb      %%xmm2,%%xmm0\n"
+  "pavgb      %%xmm3,%%xmm1\n"
+  "pand       %%xmm7,%%xmm0\n"
+  "pand       %%xmm7,%%xmm1\n"
+  "packuswb   %%xmm1,%%xmm0\n"
+  "movdqa     %%xmm0,%%xmm1\n"
+  "pand       %%xmm7,%%xmm0\n"
+  "packuswb   %%xmm0,%%xmm0\n"
+  "movq       %%xmm0,(%1)\n"
+  "lea        0x8(%1),%1\n"
+  "psrlw      $0x8,%%xmm1\n"
+  "packuswb   %%xmm1,%%xmm1\n"
+  "movq       %%xmm1,(%2)\n"
+  "lea        0x8(%2),%2\n"
+  "sub        $0x10,%3\n"
+  "ja         1b\n"
+  // The loop updates the pointers and the count, so they are read-write.
+  : "+r"(src_uyvy),    // %0
+    "+r"(dst_u),       // %1
+    "+r"(dst_y),       // %2 (receives the V samples)
+    "+r"(pix)          // %3
+  : "r"((intptr_t)stride_uyvy)  // %4: byte offset to the second source row
+  : "memory", "cc"
 );
+}
 #endif
 
-void YUY2ToI420RowUV_C(const uint8* src_yuy2, int src_pitch_yuy2,
+void YUY2ToI420RowUV_C(const uint8* src_yuy2, int src_stride_yuy2,
                        uint8* dst_u, uint8* dst_v, int pix) {
   // Copy a row of yuy2 UV values
   for (int x = 0; x < pix; x += 2) {
-    dst_u[0] = (src_yuy2[1] + src_yuy2[src_pitch_yuy2 + 1] + 1) >> 1;
-    dst_v[0] = (src_yuy2[3] + src_yuy2[src_pitch_yuy2 + 3] + 1) >> 1;
+    dst_u[0] = (src_yuy2[1] + src_yuy2[src_stride_yuy2 + 1] + 1) >> 1;
+    dst_v[0] = (src_yuy2[3] + src_yuy2[src_stride_yuy2 + 3] + 1) >> 1;
     src_yuy2 += 4;
     dst_u += 1;
     dst_v += 1;
@@ -898,12 +783,12 @@ void YUY2ToI420RowY_C(const uint8* src_yuy2,
   }
 }
 
-void UYVYToI420RowUV_C(const uint8* src_uyvy, int src_pitch_uyvy,
+void UYVYToI420RowUV_C(const uint8* src_uyvy, int src_stride_uyvy,
                        uint8* dst_u, uint8* dst_v, int pix) {
   // Copy a row of uyvy UV values
   for (int x = 0; x < pix; x += 2) {
-    dst_u[0] = (src_uyvy[0] + src_uyvy[src_pitch_uyvy + 0] + 1) >> 1;
-    dst_v[0] = (src_uyvy[2] + src_uyvy[src_pitch_uyvy + 2] + 1) >> 1;
+    dst_u[0] = (src_uyvy[0] + src_uyvy[src_stride_uyvy + 0] + 1) >> 1;
+    dst_v[0] = (src_uyvy[2] + src_uyvy[src_stride_uyvy + 2] + 1) >> 1;
     src_uyvy += 4;
     dst_u += 1;
     dst_v += 1;
@@ -921,22 +806,22 @@ void UYVYToI420RowY_C(const uint8* src_uyvy,
 }
 
 // Convert YUY2 to I420.
-void YUY2ToI420(const uint8* src_yuy2, int src_pitch_yuy2,
-                uint8* dst_y, int dst_pitch_y,
-                uint8* dst_u, int dst_pitch_u,
-                uint8* dst_v, int dst_pitch_v,
-                int width, int height) {
-  void (*YUY2ToI420RowUV)(const uint8* src_yuy2, int src_pitch_yuy2,
+int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  void (*YUY2ToI420RowUV)(const uint8* src_yuy2, int src_stride_yuy2,
                           uint8* dst_u, uint8* dst_v, int pix);
   void (*YUY2ToI420RowY)(const uint8* src_yuy2,
                          uint8* dst_y, int pix);
 #if defined(HAS_YUY2TOI420ROW_SSE2)
   if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
       (width % 16 == 0) &&
-      IS_ALIGNED(src_yuy2, 16) && (src_pitch_yuy2 % 16 == 0) &&
-      IS_ALIGNED(dst_y, 16) && (dst_pitch_y % 16 == 0) &&
-      IS_ALIGNED(dst_u, 8) && (dst_pitch_u % 8 == 0) &&
-      IS_ALIGNED(dst_v, 8) && (dst_pitch_v % 8 == 0)) {
+      IS_ALIGNED(src_yuy2, 16) && (src_stride_yuy2 % 16 == 0) &&
+      IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0) &&
+      IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
+      IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) {
     YUY2ToI420RowY = YUY2ToI420RowY_SSE2;
     YUY2ToI420RowUV = YUY2ToI420RowUV_SSE2;
   } else
@@ -948,35 +833,36 @@ void YUY2ToI420(const uint8* src_yuy2, int src_pitch_yuy2,
   for (int y = 0; y < height; ++y) {
     if ((y & 1) == 0) {
       if (y >= (height - 1) ) {  // last chroma on odd height clamp height
-        src_pitch_yuy2 = 0;
+        src_stride_yuy2 = 0;
       }
-      YUY2ToI420RowUV(src_yuy2, src_pitch_yuy2, dst_u, dst_v, width);
-      dst_u += dst_pitch_u;
-      dst_v += dst_pitch_v;
+      YUY2ToI420RowUV(src_yuy2, src_stride_yuy2, dst_u, dst_v, width);
+      dst_u += dst_stride_u;
+      dst_v += dst_stride_v;
     }
     YUY2ToI420RowY(src_yuy2, dst_y, width);
-    dst_y += dst_pitch_y;
-    src_yuy2 += src_pitch_yuy2;
+    dst_y += dst_stride_y;
+    src_yuy2 += src_stride_yuy2;
   }
+  return 0;
 }
 
 // Convert UYVY to I420.
-void UYVYToI420(const uint8* src_uyvy, int src_pitch_uyvy,
-                uint8* dst_y, int dst_pitch_y,
-                uint8* dst_u, int dst_pitch_u,
-                uint8* dst_v, int dst_pitch_v,
-                int width, int height) {
-  void (*UYVYToI420RowUV)(const uint8* src_uyvy, int src_pitch_uyvy,
+int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  void (*UYVYToI420RowUV)(const uint8* src_uyvy, int src_stride_uyvy,
                           uint8* dst_u, uint8* dst_v, int pix);
   void (*UYVYToI420RowY)(const uint8* src_uyvy,
                          uint8* dst_y, int pix);
 #if defined(HAS_UYVYTOI420ROW_SSE2)
   if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
       (width % 16 == 0) &&
-      IS_ALIGNED(src_uyvy, 16) && (src_pitch_uyvy % 16 == 0) &&
-      IS_ALIGNED(dst_y, 16) && (dst_pitch_y % 16 == 0) &&
-      IS_ALIGNED(dst_u, 8) && (dst_pitch_u % 8 == 0) &&
-      IS_ALIGNED(dst_v, 8) && (dst_pitch_v % 8 == 0)) {
+      IS_ALIGNED(src_uyvy, 16) && (src_stride_uyvy % 16 == 0) &&
+      IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0) &&
+      IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
+      IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) {
     UYVYToI420RowY = UYVYToI420RowY_SSE2;
     UYVYToI420RowUV = UYVYToI420RowUV_SSE2;
   } else
@@ -988,119 +874,126 @@ void UYVYToI420(const uint8* src_uyvy, int src_pitch_uyvy,
   for (int y = 0; y < height; ++y) {
     if ((y & 1) == 0) {
       if (y >= (height - 1) ) {  // last chroma on odd height clamp height
-        src_pitch_uyvy = 0;
+        src_stride_uyvy = 0;
       }
-      UYVYToI420RowUV(src_uyvy, src_pitch_uyvy, dst_u, dst_v, width);
-      dst_u += dst_pitch_u;
-      dst_v += dst_pitch_v;
+      UYVYToI420RowUV(src_uyvy, src_stride_uyvy, dst_u, dst_v, width);
+      dst_u += dst_stride_u;
+      dst_v += dst_stride_v;
     }
     UYVYToI420RowY(src_uyvy, dst_y, width);
-    dst_y += dst_pitch_y;
-    src_uyvy += src_pitch_uyvy;
+    dst_y += dst_stride_y;
+    src_uyvy += src_stride_uyvy;
   }
+  return 0;
 }
 
 // Convert I420 to ARGB.
-// TODO(fbarchard): Add SSSE3 version and supply C version for fallback.
-void I420ToARGB(const uint8* src_y, int src_pitch_y,
-                const uint8* src_u, int src_pitch_u,
-                const uint8* src_v, int src_pitch_v,
-                uint8* dst_argb, int dst_pitch_argb,
-                int width, int height) {
+// TODO(fbarchard): Add SSE2 version and supply C version for fallback.
+int I420ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
   for (int y = 0; y < height; ++y) {
     FastConvertYUVToRGB32Row(src_y, src_u, src_v, dst_argb, width);
-    dst_argb += dst_pitch_argb;
-    src_y += src_pitch_y;
+    dst_argb += dst_stride_argb;
+    src_y += src_stride_y;
     if (y & 1) {
-      src_u += src_pitch_u;
-      src_v += src_pitch_v;
+      src_u += src_stride_u;
+      src_v += src_stride_v;
     }
   }
   // MMX used for FastConvertYUVToRGB32Row requires an emms instruction.
   EMMS();
+  return 0;
 }
 
 // Convert I420 to BGRA.
-void I420ToBGRA(const uint8* src_y, int src_pitch_y,
-                const uint8* src_u, int src_pitch_u,
-                const uint8* src_v, int src_pitch_v,
-                uint8* dst_argb, int dst_pitch_argb,
-                int width, int height) {
+int I420ToBGRA(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
   for (int y = 0; y < height; ++y) {
     FastConvertYUVToBGRARow(src_y, src_u, src_v, dst_argb, width);
-    dst_argb += dst_pitch_argb;
-    src_y += src_pitch_y;
+    dst_argb += dst_stride_argb;
+    src_y += src_stride_y;
     if (y & 1) {
-      src_u += src_pitch_u;
-      src_v += src_pitch_v;
+      src_u += src_stride_u;
+      src_v += src_stride_v;
     }
   }
   EMMS();
+  return 0;
 }
 
-// Convert I420 to BGRA.
+// Convert I420 to ABGR.
-void I420ToABGR(const uint8* src_y, int src_pitch_y,
-                const uint8* src_u, int src_pitch_u,
-                const uint8* src_v, int src_pitch_v,
-                uint8* dst_argb, int dst_pitch_argb,
-                int width, int height) {
+int I420ToABGR(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
   for (int y = 0; y < height; ++y) {
     FastConvertYUVToABGRRow(src_y, src_u, src_v, dst_argb, width);
-    dst_argb += dst_pitch_argb;
-    src_y += src_pitch_y;
+    dst_argb += dst_stride_argb;
+    src_y += src_stride_y;
     if (y & 1) {
-      src_u += src_pitch_u;
-      src_v += src_pitch_v;
+      src_u += src_stride_u;
+      src_v += src_stride_v;
     }
   }
   EMMS();
+  return 0;
 }
 
 // Convert I422 to ARGB.
-void I422ToARGB(const uint8* src_y, int src_pitch_y,
-                const uint8* src_u, int src_pitch_u,
-                const uint8* src_v, int src_pitch_v,
-                uint8* dst_argb, int dst_pitch_argb,
-                int width, int height) {
+int I422ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
   for (int y = 0; y < height; ++y) {
     FastConvertYUVToRGB32Row(src_y, src_u, src_v, dst_argb, width);
-    dst_argb += dst_pitch_argb;
-    src_y += src_pitch_y;
-    src_u += src_pitch_u;
-    src_v += src_pitch_v;
+    dst_argb += dst_stride_argb;
+    src_y += src_stride_y;
+    src_u += src_stride_u;
+    src_v += src_stride_v;
   }
   // MMX used for FastConvertYUVToRGB32Row requires an emms instruction.
   EMMS();
+  return 0;
 }
 
 // Convert I444 to ARGB.
-void I444ToARGB(const uint8* src_y, int src_pitch_y,
-                const uint8* src_u, int src_pitch_u,
-                const uint8* src_v, int src_pitch_v,
-                uint8* dst_argb, int dst_pitch_argb,
-                int width, int height) {
+int I444ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
   for (int y = 0; y < height; ++y) {
     FastConvertYUV444ToRGB32Row(src_y, src_u, src_v, dst_argb, width);
-    dst_argb += dst_pitch_argb;
-    src_y += src_pitch_y;
-    src_u += src_pitch_u;
-    src_v += src_pitch_v;
+    dst_argb += dst_stride_argb;
+    src_y += src_stride_y;
+    src_u += src_stride_u;
+    src_v += src_stride_v;
   }
   // MMX used for FastConvertYUVToRGB32Row requires an emms instruction.
   EMMS();
+  return 0;
 }
 
 // Convert I400 to ARGB.
-void I400ToARGB_Reference(const uint8* src_y, int src_pitch_y,
-                          uint8* dst_argb, int dst_pitch_argb,
-                          int width, int height) {
+int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
+                         uint8* dst_argb, int dst_stride_argb,
+                         int width, int height) {
   for (int y = 0; y < height; ++y) {
     FastConvertYToRGB32Row(src_y, dst_argb, width);
-    dst_argb += dst_pitch_argb;
-    src_y += src_pitch_y;
+    dst_argb += dst_stride_argb;
+    src_y += src_stride_y;
   }
   // MMX used for FastConvertYUVToRGB32Row requires an emms instruction.
   EMMS();
+  return 0;
 }
 
 // TODO(fbarchard): 64 bit version
@@ -1134,64 +1027,312 @@ static void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
   }
 }
 
-#elif defined(__i386__) && !defined(COVERAGE_ENABLED) && \
-    !TARGET_IPHONE_SIMULATOR
+#define HAS_ABGRTOARGBROW_SSSE3
+__declspec(naked)  // naked: arguments are fetched from the stack by hand below
+static void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb,
+                                int pix) {
+__asm {
+    mov       eax, [esp + 4]   // src_abgr
+    mov       edx, [esp + 8]   // dst_argb
+    mov       ecx, [esp + 12]  // pix
+    movdqa    xmm7, byte ptr [_kShuffleMaskABGRToARGB]  // pshufb control mask
 
+ convertloop :
+    movdqa    xmm0, qword ptr [eax]  // aligned load of 16 source bytes
+    lea       eax, [eax + 16]        // advance source
+    pshufb    xmm0, xmm7             // reorder the bytes as xmm7 dictates
+    movdqa    [edx], xmm0            // aligned store of 16 destination bytes
+    lea       edx, [edx + 16]        // advance destination
+    sub       ecx, 4                 // 16 bytes == 4 pixels per iteration
+    ja        convertloop            // repeat while pixels remain (unsigned)
+    ret
+  }
+}
+
+#define HAS_BGRATOARGBROW_SSSE3
+__declspec(naked)  // naked: arguments are fetched from the stack by hand below
+static void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb,
+                                int pix) {
+__asm {
+    mov       eax, [esp + 4]   // src_bgra
+    mov       edx, [esp + 8]   // dst_argb
+    mov       ecx, [esp + 12]  // pix
+    movdqa    xmm7, byte ptr [_kShuffleMaskBGRAToARGB]  // pshufb control mask
+
+ convertloop :
+    movdqa    xmm0, qword ptr [eax]  // aligned load of 16 source bytes
+    lea       eax, [eax + 16]        // advance source
+    pshufb    xmm0, xmm7             // reorder the bytes as xmm7 dictates
+    movdqa    [edx], xmm0            // aligned store of 16 destination bytes
+    lea       edx, [edx + 16]        // advance destination
+    sub       ecx, 4                 // 16 bytes == 4 pixels per iteration
+    ja        convertloop            // repeat while pixels remain (unsigned)
+    ret
+  }
+}
+
+#define HAS_BG24TOARGBROW_SSSE3
+__declspec(naked)  // naked: arguments are fetched from the stack by hand below
+static void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb,
+                                int pix) {
+__asm {
+    mov       eax, [esp + 4]   // src_bg24
+    mov       edx, [esp + 8]   // dst_argb
+    mov       ecx, [esp + 12]  // pix
+    pcmpeqb   xmm7, xmm7       // generate mask 0xff000000
+    pslld     xmm7, 24
+    movdqa    xmm6, byte ptr [_kShuffleMaskBG24ToARGB]  // pshufb control mask
+
+ convertloop :
+    movdqa    xmm0, qword ptr [eax]       // 48 source bytes are read per pass
+    movdqa    xmm1, qword ptr [eax + 16]
+    movdqa    xmm3, qword ptr [eax + 32]
+    lea       eax, [eax + 48]
+    movdqa    xmm2, xmm3
+    palignr   xmm2, xmm1, 8    // xmm2 = { xmm3[0:3] xmm1[8:15]}
+    pshufb    xmm2, xmm6
+    por       xmm2, xmm7       // OR in 0xff000000 on every dword
+    palignr   xmm1, xmm0, 12   // xmm1 = { xmm1[0:7] xmm0[12:15]}
+    pshufb    xmm0, xmm6
+    movdqa    [edx + 32], xmm2
+    por       xmm0, xmm7
+    pshufb    xmm1, xmm6
+    movdqa    [edx], xmm0
+    por       xmm1, xmm7
+    palignr   xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]}
+    pshufb    xmm3, xmm6
+    movdqa    [edx + 16], xmm1
+    por       xmm3, xmm7
+    movdqa    [edx + 48], xmm3
+    lea       edx, [edx + 64]  // 64 destination bytes are written per pass
+    sub       ecx, 16          // 16 pixels per pass
+    ja        convertloop      // repeat while pixels remain (unsigned)
+    ret
+  }
+}
+
+#define HAS_RAWTOARGBROW_SSSE3
+__declspec(naked)  // naked: arguments are fetched from the stack by hand below
+static void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
+                               int pix) {
+__asm {
+    mov       eax, [esp + 4]   // src_raw
+    mov       edx, [esp + 8]   // dst_argb
+    mov       ecx, [esp + 12]  // pix
+    pcmpeqb   xmm7, xmm7       // generate mask 0xff000000
+    pslld     xmm7, 24
+    movdqa    xmm6, byte ptr [_kShuffleMaskRAWToARGB]  // pshufb control mask
+
+ convertloop :
+    movdqa    xmm0, qword ptr [eax]       // 48 source bytes are read per pass
+    movdqa    xmm1, qword ptr [eax + 16]
+    movdqa    xmm3, qword ptr [eax + 32]
+    lea       eax, [eax + 48]
+    movdqa    xmm2, xmm3
+    palignr   xmm2, xmm1, 8    // xmm2 = { xmm3[0:3] xmm1[8:15]}
+    pshufb    xmm2, xmm6
+    por       xmm2, xmm7       // OR in 0xff000000 on every dword
+    palignr   xmm1, xmm0, 12   // xmm1 = { xmm1[0:7] xmm0[12:15]}
+    pshufb    xmm0, xmm6
+    movdqa    [edx + 32], xmm2
+    por       xmm0, xmm7
+    pshufb    xmm1, xmm6
+    movdqa    [edx], xmm0
+    por       xmm1, xmm7
+    palignr   xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]}
+    pshufb    xmm3, xmm6
+    movdqa    [edx + 16], xmm1
+    por       xmm3, xmm7
+    movdqa    [edx + 48], xmm3
+    lea       edx, [edx + 64]  // 64 destination bytes are written per pass
+    sub       ecx, 16          // 16 pixels per pass
+    ja        convertloop      // repeat while pixels remain (unsigned)
+    ret
+  }
+}
+
+#elif (defined(__x86_64__) || defined(__i386__)) && \
+    !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
+
+// TODO(yuche): consider moving ARGB related codes to a separate file.
 #define HAS_I400TOARGBROW_SSE2
-extern "C" void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb,
-                                   int pix);
+static void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
-  asm(
+  asm volatile(  // volatile: outputs are never read, so keep the asm alive
-    ".text\n"
-#if defined(OSX)
-    ".globl _I400ToARGBRow_SSE2\n"
-"_I400ToARGBRow_SSE2:\n"
-#else
-    ".global I400ToARGBRow_SSE2\n"
-"I400ToARGBRow_SSE2:\n"
-#endif
-    "mov    0x4(%esp),%eax\n"
-    "mov    0x8(%esp),%edx\n"
-    "mov    0xc(%esp),%ecx\n"
-    "pcmpeqb %xmm7,%xmm7\n"
-    "pslld  $0x18,%xmm7\n"
+  "pcmpeqb    %%xmm7,%%xmm7\n"  // xmm7 = 0xff000000 in every dword
+  "pslld      $0x18,%%xmm7\n"
 "1:"
-    "movq   (%eax),%xmm0\n"
-    "lea    0x8(%eax),%eax\n"
-    "punpcklbw %xmm0,%xmm0\n"
-    "movdqa %xmm0,%xmm1\n"
-    "punpcklwd %xmm0,%xmm0\n"
-    "punpckhwd %xmm1,%xmm1\n"
-    "por    %xmm7,%xmm0\n"
-    "por    %xmm7,%xmm1\n"
-    "movdqa %xmm0,(%edx)\n"
-    "movdqa %xmm1,0x10(%edx)\n"
-    "lea    0x20(%edx),%edx\n"
-    "sub    $0x8,%ecx\n"
-    "ja     1b\n"
-    "ret\n"
+  "movq       (%0),%%xmm0\n"        // load 8 source bytes
+  "lea        0x8(%0),%0\n"
+  "punpcklbw  %%xmm0,%%xmm0\n"      // each byte duplicated into a word
+  "movdqa     %%xmm0,%%xmm1\n"
+  "punpcklwd  %%xmm0,%%xmm0\n"      // low words duplicated into dwords
+  "punpckhwd  %%xmm1,%%xmm1\n"      // high words duplicated into dwords
+  "por        %%xmm7,%%xmm0\n"      // force the top byte of each dword to 0xff
+  "por        %%xmm7,%%xmm1\n"
+  "movdqa     %%xmm0,(%1)\n"        // aligned store, 32 bytes per pass
+  "movdqa     %%xmm1,0x10(%1)\n"
+  "lea        0x20(%1),%1\n"
+  "sub        $0x8,%2\n"            // 8 pixels per pass
+  "ja         1b\n"
+  : "+r"(src_y),     // %0, read-write: advanced by the loop
+    "+r"(dst_argb),  // %1, read-write: advanced by the loop
+    "+r"(pix)        // %2, read-write: counted down by the loop
+  :                  // no input-only operands
+  : "memory", "cc"  // NOTE(review): xmm0/xmm1/xmm7 also overwritten, unlisted
 );
+}
+
+#define HAS_ABGRTOARGBROW_SSSE3
+static void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb,
+                                int pix) {
+  asm volatile(  // volatile: outputs are never read, so keep the asm alive
+  "movdqa     (%3),%%xmm7\n"        // xmm7 = pshufb control mask
+"1:"
+  "movdqa     (%0),%%xmm0\n"        // aligned load of 16 source bytes
+  "lea        0x10(%0),%0\n"
+  "pshufb     %%xmm7,%%xmm0\n"      // reorder the bytes as xmm7 dictates
+  "movdqa     %%xmm0,(%1)\n"        // aligned store of 16 destination bytes
+  "lea        0x10(%1),%1\n"
+  "sub        $0x4,%2\n"            // 16 bytes == 4 pixels per pass
+  "ja         1b\n"
+  : "+r"(src_abgr),  // %0, read-write: advanced by the loop
+    "+r"(dst_argb),  // %1, read-write: advanced by the loop
+    "+r"(pix)        // %2, read-write: counted down by the loop
+  : "r"(kShuffleMaskABGRToARGB)  // %3, only read
+  : "memory", "cc"
+  // NOTE(review): xmm0/xmm7 are also overwritten but not listed as clobbers.
+);
+}
+
+#define HAS_BGRATOARGBROW_SSSE3
+static void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb,
+                                int pix) {
+  asm volatile(  // volatile: outputs are never read, so keep the asm alive
+  "movdqa     (%3),%%xmm7\n"        // xmm7 = pshufb control mask
+"1:"
+  "movdqa     (%0),%%xmm0\n"        // aligned load of 16 source bytes
+  "lea        0x10(%0),%0\n"
+  "pshufb     %%xmm7,%%xmm0\n"      // reorder the bytes as xmm7 dictates
+  "movdqa     %%xmm0,(%1)\n"        // aligned store of 16 destination bytes
+  "lea        0x10(%1),%1\n"
+  "sub        $0x4,%2\n"            // 16 bytes == 4 pixels per pass
+  "ja         1b\n"
+  : "+r"(src_bgra),  // %0, read-write: advanced by the loop
+    "+r"(dst_argb),  // %1, read-write: advanced by the loop
+    "+r"(pix)        // %2, read-write: counted down by the loop
+  : "r"(kShuffleMaskBGRAToARGB)  // %3, only read
+  : "memory", "cc"
+  // NOTE(review): xmm0/xmm7 are also overwritten but not listed as clobbers.
+);
+}
+
+#define HAS_BG24TOARGBROW_SSSE3
+static void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb,
+                                int pix) {
+  asm volatile(  // volatile: outputs are never read, so keep the asm alive
+  "pcmpeqb    %%xmm7,%%xmm7\n"  // generate mask 0xff000000
+  "pslld      $0x18,%%xmm7\n"
+  "movdqa     (%3),%%xmm6\n"    // xmm6 = pshufb control mask
+"1:"
+  "movdqa     (%0),%%xmm0\n"    // 48 source bytes are read per pass
+  "movdqa     0x10(%0),%%xmm1\n"
+  "movdqa     0x20(%0),%%xmm3\n"
+  "lea        0x30(%0),%0\n"
+  "movdqa     %%xmm3,%%xmm2\n"
+  "palignr    $0x8,%%xmm1,%%xmm2\n"  // xmm2 = { xmm3[0:3] xmm1[8:15] }
+  "pshufb     %%xmm6,%%xmm2\n"
+  "por        %%xmm7,%%xmm2\n"       // OR in 0xff000000 on every dword
+  "palignr    $0xc,%%xmm0,%%xmm1\n"  // xmm1 = { xmm1[0:7] xmm0[12:15] }
+  "pshufb     %%xmm6,%%xmm0\n"
+  "movdqa     %%xmm2,0x20(%1)\n"
+  "por        %%xmm7,%%xmm0\n"
+  "pshufb     %%xmm6,%%xmm1\n"
+  "movdqa     %%xmm0,(%1)\n"
+  "por        %%xmm7,%%xmm1\n"
+  "palignr    $0x4,%%xmm3,%%xmm3\n"  // xmm3 = { xmm3[4:15] }
+  "pshufb     %%xmm6,%%xmm3\n"
+  "movdqa     %%xmm1,0x10(%1)\n"
+  "por        %%xmm7,%%xmm3\n"
+  "movdqa     %%xmm3,0x30(%1)\n"
+  "lea        0x40(%1),%1\n"    // 64 destination bytes are written per pass
+  "sub        $0x10,%2\n"       // 16 pixels per pass
+  "ja         1b\n"
+  : "+r"(src_bg24),  // %0, read-write: advanced by the loop
+    "+r"(dst_argb),  // %1, read-write: advanced by the loop
+    "+r"(pix)        // %2, read-write: counted down by the loop
+  : "r"(kShuffleMaskBG24ToARGB)  // %3, only read
+  : "memory", "cc"
+  // NOTE(review): xmm0-xmm3, xmm6, xmm7 are overwritten but not listed here.
+);
+}
+
+#define HAS_RAWTOARGBROW_SSSE3
+static void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
+                               int pix) {
+  asm volatile(  // volatile: outputs are never read, so keep the asm alive
+  "pcmpeqb    %%xmm7,%%xmm7\n"  // generate mask 0xff000000
+  "pslld      $0x18,%%xmm7\n"
+  "movdqa     (%3),%%xmm6\n"    // xmm6 = pshufb control mask
+"1:"
+  "movdqa     (%0),%%xmm0\n"    // 48 source bytes are read per pass
+  "movdqa     0x10(%0),%%xmm1\n"
+  "movdqa     0x20(%0),%%xmm3\n"
+  "lea        0x30(%0),%0\n"
+  "movdqa     %%xmm3,%%xmm2\n"
+  "palignr    $0x8,%%xmm1,%%xmm2\n"  // xmm2 = { xmm3[0:3] xmm1[8:15] }
+  "pshufb     %%xmm6,%%xmm2\n"
+  "por        %%xmm7,%%xmm2\n"       // OR in 0xff000000 on every dword
+  "palignr    $0xc,%%xmm0,%%xmm1\n"  // xmm1 = { xmm1[0:7] xmm0[12:15] }
+  "pshufb     %%xmm6,%%xmm0\n"
+  "movdqa     %%xmm2,0x20(%1)\n"
+  "por        %%xmm7,%%xmm0\n"
+  "pshufb     %%xmm6,%%xmm1\n"
+  "movdqa     %%xmm0,(%1)\n"
+  "por        %%xmm7,%%xmm1\n"
+  "palignr    $0x4,%%xmm3,%%xmm3\n"  // xmm3 = { xmm3[4:15] }
+  "pshufb     %%xmm6,%%xmm3\n"
+  "movdqa     %%xmm1,0x10(%1)\n"
+  "por        %%xmm7,%%xmm3\n"
+  "movdqa     %%xmm3,0x30(%1)\n"
+  "lea        0x40(%1),%1\n"    // 64 destination bytes are written per pass
+  "sub        $0x10,%2\n"       // 16 pixels per pass
+  "ja         1b\n"
+  : "+r"(src_raw),   // %0, read-write: advanced by the loop
+    "+r"(dst_argb),  // %1, read-write: advanced by the loop
+    "+r"(pix)        // %2, read-write: counted down by the loop
+  : "r"(kShuffleMaskRAWToARGB)  // %3, only read
+  : "memory", "cc"
+  // NOTE(review): xmm0-xmm3, xmm6, xmm7 are overwritten but not listed here.
+);
+}
 #endif
 
 static void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix) {
   // Copy a Y to RGB.
   for (int x = 0; x < pix; ++x) {
-    dst_argb[2] = dst_argb[1] = dst_argb[0] = src_y[0];
+    uint8 y = src_y[0];
+    dst_argb[2] = dst_argb[1] = dst_argb[0] = y;
     dst_argb[3] = 255u;
     dst_argb += 4;
-    src_y += 1;
+    ++src_y;
   }
 }
 
 // Convert I400 to ARGB.
-void I400ToARGB(const uint8* src_y, int src_pitch_y,
-                uint8* dst_argb, int dst_pitch_argb,
-                int width, int height) {
+int I400ToARGB(const uint8* src_y, int src_stride_y,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  if (height < 0) {
+    height = -height;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_stride_y = -src_stride_y;
+  }
   void (*I400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int pix);
 #if defined(HAS_I400TOARGBROW_SSE2)
   if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
       (width % 8 == 0) &&
-      IS_ALIGNED(src_y, 8) && (src_pitch_y % 8 == 0) &&
-      IS_ALIGNED(dst_argb, 16) && (dst_pitch_argb % 16 == 0)) {
+      IS_ALIGNED(src_y, 8) && (src_stride_y % 8 == 0) &&
+      IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
     I400ToARGBRow = I400ToARGBRow_SSE2;
   } else
 #endif
@@ -1201,16 +1342,21 @@ void I400ToARGB(const uint8* src_y, int src_pitch_y,
 
   for (int y = 0; y < height; ++y) {
     I400ToARGBRow(src_y, dst_argb, width);
-    src_y += src_pitch_y;
-    dst_argb += dst_pitch_argb;
+    src_y += src_stride_y;
+    dst_argb += dst_stride_argb;
   }
+  return 0;
 }
 
+
 static void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int pix) {
   for (int x = 0; x < pix; ++x) {
-    dst_argb[0] = src_raw[2];
-    dst_argb[1] = src_raw[1];
-    dst_argb[2] = src_raw[0];
+    uint8 r = src_raw[0];
+    uint8 g = src_raw[1];
+    uint8 b = src_raw[2];
+    dst_argb[0] = b;
+    dst_argb[1] = g;
+    dst_argb[2] = r;
     dst_argb[3] = 255u;
     dst_argb += 4;
     src_raw += 3;
@@ -1218,21 +1364,44 @@ static void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int pix) {
 }
 
 // Convert RAW to ARGB.
-void RAWToARGB(const uint8* src_raw, int src_pitch_raw,
-               uint8* dst_argb, int dst_pitch_argb,
-               int width, int height) {
-  for (int y = 0; y < height; ++y) {
-    RAWToARGBRow_C(src_raw, dst_argb, width);
-    src_raw += src_pitch_raw;
-    dst_argb += dst_pitch_argb;
+int RAWToARGB(const uint8* src_raw, int src_stride_raw,
+              uint8* dst_argb, int dst_stride_argb,
+              int width, int height) {  // always returns 0
+  if (height < 0) {  // negative height: read the source rows bottom-up
+    height = -height;
+    src_raw = src_raw + (height - 1) * src_stride_raw;  // last source row
+    src_stride_raw = -src_stride_raw;  // then walk the source backwards
   }
+  void (*RAWToARGBRow)(const uint8* src_raw, uint8* dst_argb, int pix);
+#if defined(HAS_RAWTOARGBROW_SSSE3)
+  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
+      (width % 16 == 0) &&  // SSSE3 row consumes 16 pixels per pass
+      IS_ALIGNED(src_raw, 16) && (src_stride_raw % 16 == 0) &&
+      IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
+    RAWToARGBRow = RAWToARGBRow_SSSE3;
+  } else
+#endif
+  {
+    RAWToARGBRow = RAWToARGBRow_C;  // portable fallback
+  }
+
+  for (int y = 0; y < height; ++y) {  // one row per iteration
+    RAWToARGBRow(src_raw, dst_argb, width);
+    src_raw += src_stride_raw;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
 }
 
 static void BG24ToARGBRow_C(const uint8* src_bg24, uint8* dst_argb, int pix) {
   for (int x = 0; x < pix; ++x) {
-    dst_argb[0] = src_bg24[0];
-    dst_argb[1] = src_bg24[1];
-    dst_argb[2] = src_bg24[2];
+    uint8 b = src_bg24[0];  // source bytes are taken in B, G, R order
+    uint8 g = src_bg24[1];
+    uint8 r = src_bg24[2];
+    dst_argb[0] = b;
+    dst_argb[1] = g;
+    dst_argb[2] = r;
+    // The 3-byte source pixel has no 4th byte; byte 3 is set to 255 below.
     dst_argb[3] = 255u;
     dst_argb += 4;
     src_bg24 += 3;
@@ -1240,36 +1409,127 @@ static void BG24ToARGBRow_C(const uint8* src_bg24, uint8* dst_argb, int pix) {
 }
 
 // Convert BG24 to ARGB.
-void BG24ToARGB(const uint8* src_bg24, int src_pitch_bg24,
-                uint8* dst_argb, int dst_pitch_argb,
-                int width, int height) {
-  for (int y = 0; y < height; ++y) {
-    BG24ToARGBRow_C(src_bg24, dst_argb, width);
-    src_bg24 += src_pitch_bg24;
-    dst_argb += dst_pitch_argb;
+int BG24ToARGB(const uint8* src_bg24, int src_stride_bg24,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {  // always returns 0
+  if (height < 0) {  // negative height: read the source rows bottom-up
+    height = -height;
+    src_bg24 = src_bg24 + (height - 1) * src_stride_bg24;  // last source row
+    src_stride_bg24 = -src_stride_bg24;  // then walk the source backwards
   }
+  void (*BG24ToARGBRow)(const uint8* src_bg24, uint8* dst_argb, int pix);
+#if defined(HAS_BG24TOARGBROW_SSSE3)
+  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
+      (width % 16 == 0) &&  // SSSE3 row consumes 16 pixels per pass
+      IS_ALIGNED(src_bg24, 16) && (src_stride_bg24 % 16 == 0) &&
+      IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
+    BG24ToARGBRow = BG24ToARGBRow_SSSE3;
+  } else
+#endif
+  {
+    BG24ToARGBRow = BG24ToARGBRow_C;  // portable fallback
+  }
+
+  for (int y = 0; y < height; ++y) {  // one row per iteration
+    BG24ToARGBRow(src_bg24, dst_argb, width);
+    src_bg24 += src_stride_bg24;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
 }
 
+
 static void ABGRToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int pix) {
   for (int x = 0; x < pix; ++x) {
-    dst_argb[0] = src_abgr[2];
-    dst_argb[1] = src_abgr[1];
-    dst_argb[2] = src_abgr[0];
-    dst_argb[3] = src_abgr[3];
+    // Read all four bytes before writing any, to support in-place conversion.
+    uint8 r = src_abgr[0];
+    uint8 g = src_abgr[1];
+    uint8 b = src_abgr[2];
+    uint8 a = src_abgr[3];
+    dst_argb[0] = b;  // bytes 0 and 2 trade places; bytes 1 and 3 stay put
+    dst_argb[1] = g;
+    dst_argb[2] = r;
+    dst_argb[3] = a;
     dst_argb += 4;
     src_abgr += 4;
   }
 }
 
-// Convert ABGR to ARGB.
-void ABGRToARGB(const uint8* src_abgr, int src_pitch_abgr,
-                uint8* dst_argb, int dst_pitch_argb,
-                int width, int height) {
+int ABGRToARGB(const uint8* src_abgr, int src_stride_abgr,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {  // Convert ABGR to ARGB; returns 0.
+  if (height < 0) {  // negative height: read the source rows bottom-up
+    height = -height;
+    src_abgr = src_abgr + (height - 1) * src_stride_abgr;  // last source row
+    src_stride_abgr = -src_stride_abgr;  // then walk the source backwards
+  }
+  void (*ABGRToARGBRow)(const uint8* src_abgr, uint8* dst_argb, int pix);
+#if defined(HAS_ABGRTOARGBROW_SSSE3)
+  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
+      (width % 4 == 0) &&  // SSSE3 row consumes 4 pixels per pass
+      IS_ALIGNED(src_abgr, 16) && (src_stride_abgr % 16 == 0) &&
+      IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
+    ABGRToARGBRow = ABGRToARGBRow_SSSE3;
+  } else
+#endif
+  {
+    ABGRToARGBRow = ABGRToARGBRow_C;  // portable fallback
+  }
+
   for (int y = 0; y < height; ++y) {
-    ABGRToARGBRow_C(src_abgr, dst_argb, width);
-    src_abgr += src_pitch_abgr;
-    dst_argb += dst_pitch_argb;
+    ABGRToARGBRow(src_abgr, dst_argb, width);
+    src_abgr += src_stride_abgr;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+static void BGRAToARGBRow_C(const uint8* src_bgra, uint8* dst_argb, int pix) {
+  for (int x = 0; x < pix; ++x) {
+    // Read all four bytes before writing any, to support in-place conversion.
+    uint8 a = src_bgra[0];
+    uint8 r = src_bgra[1];
+    uint8 g = src_bgra[2];
+    uint8 b = src_bgra[3];
+    dst_argb[0] = b;  // the four bytes of each pixel are written in reverse
+    dst_argb[1] = g;
+    dst_argb[2] = r;
+    dst_argb[3] = a;
+    dst_argb += 4;
+    src_bgra += 4;
   }
 }
 
+// Convert BGRA to ARGB.
+int BGRAToARGB(const uint8* src_bgra, int src_stride_bgra,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {  // always returns 0
+  if (height < 0) {  // negative height: read the source rows bottom-up
+    height = -height;
+    src_bgra = src_bgra + (height - 1) * src_stride_bgra;  // last source row
+    src_stride_bgra = -src_stride_bgra;  // then walk the source backwards
+  }
+  void (*BGRAToARGBRow)(const uint8* src_bgra, uint8* dst_argb, int pix);
+#if defined(HAS_BGRATOARGBROW_SSSE3)
+  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
+      (width % 4 == 0) &&  // SSSE3 row consumes 4 pixels per pass
+      IS_ALIGNED(src_bgra, 16) && (src_stride_bgra % 16 == 0) &&
+      IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
+    BGRAToARGBRow = BGRAToARGBRow_SSSE3;
+  } else
+#endif
+  {
+    BGRAToARGBRow = BGRAToARGBRow_C;  // portable fallback
+  }
+
+  for (int y = 0; y < height; ++y) {  // one row per iteration
+    BGRAToARGBRow(src_bgra, dst_argb, width);
+    src_bgra += src_stride_bgra;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
 }  // namespace libyuv
+
+
diff --git a/source/rotate.cc b/source/rotate.cc
index 8075d47fb..7d2c512c8 100644
--- a/source/rotate.cc
+++ b/source/rotate.cc
@@ -19,31 +19,31 @@ typedef void (*rotate_wxhfunc)(const uint8*, int, uint8*, int, int, int);
 #ifdef __ARM_NEON__
 extern "C" {
 void ReverseLine_NEON(const uint8* src, uint8* dst, int width);
-void Transpose_wx8_NEON(const uint8* src, int src_pitch,
-                        uint8* dst, int dst_pitch, int width);
+void Transpose_wx8_NEON(const uint8* src, int src_stride,
+                        uint8* dst, int dst_stride, int width);
 }  // extern "C"
 #endif
 
-static void Transpose_wx8_C(const uint8* src, int src_pitch,
-                            uint8* dst, int dst_pitch,
+static void Transpose_wx8_C(const uint8* src, int src_stride,
+                            uint8* dst, int dst_stride,
                             int w) {
   int i, j;
   for (i = 0; i < w; ++i)
     for (j = 0; j < 8; ++j)
-      dst[i * dst_pitch + j] = src[j * src_pitch + i];
+      dst[i * dst_stride + j] = src[j * src_stride + i];
 }
 
-static void Transpose_wxh_C(const uint8* src, int src_pitch,
-                            uint8* dst, int dst_pitch,
+static void Transpose_wxh_C(const uint8* src, int src_stride,
+                            uint8* dst, int dst_stride,
                             int width, int height) {
   int i, j;
   for (i = 0; i < width; ++i)
     for (j = 0; j < height; ++j)
-      dst[i * dst_pitch + j] = src[j * src_pitch + i];
+      dst[i * dst_stride + j] = src[j * src_stride + i];
 }
 
-void Transpose(const uint8* src, int src_pitch,
-               uint8* dst, int dst_pitch,
+void Transpose(const uint8* src, int src_stride,
+               uint8* dst, int dst_stride,
                int width, int height) {
   int i = height;
   rotate_wx8func Transpose_wx8;
@@ -60,33 +60,33 @@ void Transpose(const uint8* src, int src_pitch,
 
   // work across the source in 8x8 tiles
   do {
-    Transpose_wx8(src, src_pitch, dst, dst_pitch, width);
+    Transpose_wx8(src, src_stride, dst, dst_stride, width);
 
-    src += 8 * src_pitch;
+    src += 8 * src_stride;
     dst += 8;
     i   -= 8;
   } while (i >= 8);
 
 // TODO(frkoenig): Have wx4 and maybe wx2
-  Transpose_wxh(src, src_pitch, dst, dst_pitch, width, i);
+  Transpose_wxh(src, src_stride, dst, dst_stride, width, i);
 }
 
-void Rotate90(const uint8* src, int src_pitch,
-              uint8* dst, int dst_pitch,
+void Rotate90(const uint8* src, int src_stride,
+              uint8* dst, int dst_stride,
               int width, int height) {
-  src += src_pitch*(height-1);
-  src_pitch = -src_pitch;
+  src += src_stride*(height-1);
+  src_stride = -src_stride;
 
-  Transpose(src, src_pitch, dst, dst_pitch, width, height);
+  Transpose(src, src_stride, dst, dst_stride, width, height);
 }
 
-void Rotate270(const uint8* src, int src_pitch,
-               uint8* dst, int dst_pitch,
+void Rotate270(const uint8* src, int src_stride,
+               uint8* dst, int dst_stride,
                int width, int height) {
-  dst += dst_pitch*(width-1);
-  dst_pitch = -dst_pitch;
+  dst += dst_stride*(width-1);
+  dst_stride = -dst_stride;
 
-  Transpose(src, src_pitch, dst, dst_pitch, width, height);
+  Transpose(src, src_stride, dst, dst_stride, width, height);
 }
 
 void ReverseLine_C(const uint8* src, uint8* dst, int width) {
@@ -95,8 +95,8 @@ void ReverseLine_C(const uint8* src, uint8* dst, int width) {
     dst[width-1 - i] = src[i];
 }
 
-void Rotate180(const uint8* src, int src_pitch,
-               uint8* dst, int dst_pitch,
+void Rotate180(const uint8* src, int src_stride,
+               uint8* dst, int dst_stride,
                int width, int height) {
   int i;
   reverse_func ReverseLine;
@@ -108,13 +108,13 @@ void Rotate180(const uint8* src, int src_pitch,
   ReverseLine = ReverseLine_C;
 #endif
 
-  dst += dst_pitch*(height-1);
+  dst += dst_stride*(height-1);
 
   for (i = 0; i < height; ++i) {
     ReverseLine(src, dst, width);
 
-    src += src_pitch;
-    dst -= dst_pitch;
+    src += src_stride;
+    dst -= dst_stride;
   }
 }
 
diff --git a/source/rotate.h b/source/rotate.h
index d15ad6709..f6a90ffe3 100644
--- a/source/rotate.h
+++ b/source/rotate.h
@@ -11,34 +11,35 @@
 #ifndef LIBYUV_SOURCE_ROTATE_H_
 #define LIBYUV_SOURCE_ROTATE_H_
 
-#include "basic_types.h"
+#include "libyuv/basic_types.h"
 
 namespace libyuv {
-void Rotate90(const uint8* src, int src_pitch,
-              uint8* dst, int dst_pitch,
+
+void Rotate90(const uint8* src, int src_stride,
+              uint8* dst, int dst_stride,
               int width, int height);
-void Rotate180(const uint8* src, int src_pitch,
-               uint8* dst, int dst_pitch,
+void Rotate180(const uint8* src, int src_stride,
+               uint8* dst, int dst_stride,
                int width, int height);
-void Rotate270(const uint8* src, int src_pitch,
-               uint8* dst, int dst_pitch,
+void Rotate270(const uint8* src, int src_stride,
+               uint8* dst, int dst_stride,
                int width, int height);
 
-void Rotate90_deinterleave(const uint8* src, int src_pitch,
-                           uint8* dst_a, int dst_pitch_a,
-                           uint8* dst_b, int dst_pitch_b,
+void Rotate90_deinterleave(const uint8* src, int src_stride,
+                           uint8* dst_a, int dst_stride_a,
+                           uint8* dst_b, int dst_stride_b,
                            int width, int height);
-void Rotate180_deinterleave(const uint8* src, int src_pitch,
-                            uint8* dst_a, int dst_pitch_a,
-                            uint8* dst_b, int dst_pitch_b,
+void Rotate180_deinterleave(const uint8* src, int src_stride,
+                            uint8* dst_a, int dst_stride_a,
+                            uint8* dst_b, int dst_stride_b,
                             int width, int height);
-void Rotate270_deinterleave(const uint8* src, int src_pitch,
-                            uint8* dst_a, int dst_pitch_a,
-                            uint8* dst_b, int dst_pitch_b,
+void Rotate270_deinterleave(const uint8* src, int src_stride,
+                            uint8* dst_a, int dst_stride_a,
+                            uint8* dst_b, int dst_stride_b,
                             int width, int height);
 
-void Transpose(const uint8* src, int src_pitch,
-               uint8* dst, int dst_pitch,
+void Transpose(const uint8* src, int src_stride,
+               uint8* dst, int dst_stride,
                int width, int height);
 }  // namespace libyuv
 
diff --git a/source/rotate_deinterleave.cc b/source/rotate_deinterleave.cc
index fcbb0d42b..071335d73 100644
--- a/source/rotate_deinterleave.cc
+++ b/source/rotate_deinterleave.cc
@@ -27,40 +27,40 @@ void ReverseLine_di_NEON(const uint8* src,
                          uint8* dst_a, uint8* dst_b,
                          int width);
 void SaveRegisters_NEON(unsigned long long *store);
-void Transpose_di_wx8_NEON(const uint8* src, int src_pitch,
-                           uint8* dst_a, int dst_pitch_a,
-                           uint8* dst_b, int dst_pitch_b,
+void Transpose_di_wx8_NEON(const uint8* src, int src_stride,
+                           uint8* dst_a, int dst_stride_a,
+                           uint8* dst_b, int dst_stride_b,
                            int width);
 }  // extern "C"
 #endif
 
-static void Transpose_di_wx8_C(const uint8* src, int src_pitch,
-                               uint8* dst_a, int dst_pitch_a,
-                               uint8* dst_b, int dst_pitch_b,
+static void Transpose_di_wx8_C(const uint8* src, int src_stride,
+                               uint8* dst_a, int dst_stride_a,
+                               uint8* dst_b, int dst_stride_b,
                                int w) {
   int i, j;
   for (i = 0; i < w*2; i += 2)
     for (j = 0; j < 8; ++j) {
-      dst_a[j + (i>>1)*dst_pitch_a] = src[i + j*src_pitch];
-      dst_b[j + (i>>1)*dst_pitch_b] = src[i + j*src_pitch + 1];
+      dst_a[j + (i>>1)*dst_stride_a] = src[i + j*src_stride];
+      dst_b[j + (i>>1)*dst_stride_b] = src[i + j*src_stride + 1];
     }
 }
 
-static void Transpose_di_wxh_C(const uint8* src, int src_pitch,
-                               uint8* dst_a, int dst_pitch_a,
-                               uint8* dst_b, int dst_pitch_b,
+static void Transpose_di_wxh_C(const uint8* src, int src_stride,
+                               uint8* dst_a, int dst_stride_a,
+                               uint8* dst_b, int dst_stride_b,
                                int w, int h) {
   int i, j;
   for (i = 0; i < w*2; i += 2)
     for (j = 0; j < h; ++j) {
-      dst_a[j + (i>>1)*dst_pitch_a] = src[i + j*src_pitch];
-      dst_b[j + (i>>1)*dst_pitch_b] = src[i + j*src_pitch + 1];
+      dst_a[j + (i>>1)*dst_stride_a] = src[i + j*src_stride];
+      dst_b[j + (i>>1)*dst_stride_b] = src[i + j*src_stride + 1];
     }
 }
 
-void Transpose_deinterleave(const uint8* src, int src_pitch,
-                            uint8* dst_a, int dst_pitch_a,
-                            uint8* dst_b, int dst_pitch_b,
+void Transpose_deinterleave(const uint8* src, int src_stride,
+                            uint8* dst_a, int dst_stride_a,
+                            uint8* dst_b, int dst_stride_b,
                             int width, int height) {
   int i = height;
   rotate_wx8func Transpose_wx8;
@@ -81,20 +81,20 @@ void Transpose_deinterleave(const uint8* src, int src_pitch,
 
   // work across the source in 8x8 tiles
   do {
-    Transpose_wx8(src, src_pitch,
-                  dst_a, dst_pitch_a,
-                  dst_b, dst_pitch_b,
+    Transpose_wx8(src, src_stride,
+                  dst_a, dst_stride_a,
+                  dst_b, dst_stride_b,
                   width);
 
-    src   += 8 * src_pitch;
+    src   += 8 * src_stride;
     dst_a += 8;
     dst_b += 8;
     i     -= 8;
   } while (i >= 8);
 
-  Transpose_wxh(src, src_pitch,
-                dst_a, dst_pitch_a,
-                dst_b, dst_pitch_b,
+  Transpose_wxh(src, src_stride,
+                dst_a, dst_stride_a,
+                dst_b, dst_stride_b,
                 width, i);
 
 #ifdef __ARM_NEON__
@@ -102,31 +102,31 @@ void Transpose_deinterleave(const uint8* src, int src_pitch,
 #endif
 }
 
-void Rotate90_deinterleave(const uint8* src, int src_pitch,
-                           uint8* dst_a, int dst_pitch_a,
-                           uint8* dst_b, int dst_pitch_b,
+void Rotate90_deinterleave(const uint8* src, int src_stride,
+                           uint8* dst_a, int dst_stride_a,
+                           uint8* dst_b, int dst_stride_b,
                             int width, int height) {
-  src += src_pitch*(height-1);
-  src_pitch = -src_pitch;
+  src += src_stride*(height-1);
+  src_stride = -src_stride;
 
-  Transpose_deinterleave(src, src_pitch,
-                         dst_a, dst_pitch_a,
-                         dst_b, dst_pitch_b,
+  Transpose_deinterleave(src, src_stride,
+                         dst_a, dst_stride_a,
+                         dst_b, dst_stride_b,
                          width, height);
 }
 
-void Rotate270_deinterleave(const uint8* src, int src_pitch,
-                            uint8* dst_a, int dst_pitch_a,
-                            uint8* dst_b, int dst_pitch_b,
+void Rotate270_deinterleave(const uint8* src, int src_stride,
+                            uint8* dst_a, int dst_stride_a,
+                            uint8* dst_b, int dst_stride_b,
                             int width, int height) {
-  dst_a += dst_pitch_a*((width>>1)-1);
-  dst_b += dst_pitch_b*((width>>1)-1);
-  dst_pitch_a = -dst_pitch_a;
-  dst_pitch_b = -dst_pitch_b;
+  dst_a += dst_stride_a*((width>>1)-1);
+  dst_b += dst_stride_b*((width>>1)-1);
+  dst_stride_a = -dst_stride_a;
+  dst_stride_b = -dst_stride_b;
 
-  Transpose_deinterleave(src, src_pitch,
-                         dst_a, dst_pitch_a,
-                         dst_b, dst_pitch_b,
+  Transpose_deinterleave(src, src_stride,
+                         dst_a, dst_stride_a,
+                         dst_b, dst_stride_b,
                          width, height);
 }
 
@@ -140,9 +140,9 @@ static void ReverseLine_di_C(const uint8* src,
   }
 }
 
-void Rotate180_deinterleave(const uint8* src, int src_pitch,
-                            uint8* dst_a, int dst_pitch_a,
-                            uint8* dst_b, int dst_pitch_b,
+void Rotate180_deinterleave(const uint8* src, int src_stride,
+                            uint8* dst_a, int dst_stride_a,
+                            uint8* dst_b, int dst_stride_b,
                             int width, int height) {
   int i;
   reverse_func ReverseLine;
@@ -154,17 +154,17 @@ void Rotate180_deinterleave(const uint8* src, int src_pitch,
   ReverseLine = ReverseLine_di_C;
 #endif
 
-  dst_a += dst_pitch_a*(height-1);
-  dst_b += dst_pitch_b*(height-1);
+  dst_a += dst_stride_a*(height-1);
+  dst_b += dst_stride_b*(height-1);
 
   width >>= 1;
 
   for (i = 0; i < height; ++i) {
     ReverseLine(src, dst_a, dst_b, width);
 
-    src   += src_pitch;
-    dst_a -= dst_pitch_a;
-    dst_b -= dst_pitch_b;
+    src   += src_stride;
+    dst_a -= dst_stride_a;
+    dst_b -= dst_stride_b;
   }
 }
 
diff --git a/source/row.h b/source/row.h
index 67119b553..a11d80251 100644
--- a/source/row.h
+++ b/source/row.h
@@ -11,7 +11,7 @@
 #ifndef LIBYUV_SOURCE_ROW_H_
 #define LIBYUV_SOURCE_ROW_H_
 
-#include "basic_types.h"
+#include "libyuv/basic_types.h"
 
 extern "C" {
 void FastConvertYUVToRGB32Row(const uint8* y_buf,
diff --git a/source/scale.cc b/source/scale.cc
index 2f802c93e..c87621294 100644
--- a/source/scale.cc
+++ b/source/scale.cc
@@ -8,37 +8,24 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "scale.h"
+#include "libyuv/scale.h"
 
 #include <assert.h>
 #include <string.h>
 
-#include "cpu_id.h"
+#include "libyuv/cpu_id.h"
+
+#if defined(_MSC_VER)
+#define ALIGN16(var) __declspec(align(16)) var
+#else
+#define ALIGN16(var) var __attribute__((aligned(16)))
+#endif
 
 // Note: A Neon reference manual
 // http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0204j/CJAJIIGG.html
 // Note: Some SSE2 reference manuals
 // cpuvol1.pdf agner_instruction_tables.pdf 253666.pdf 253667.pdf
 
-// TODO(fbarchard): Remove once performance is known
-//#define TEST_RSTSC
-
-#if defined(TEST_RSTSC)
-#include <iomanip>
-#include <iostream>
-#ifdef _MSC_VER
-#include <emmintrin.h>
-#endif
-
-#if defined(__GNUC__) && defined(__i386__)
-static inline uint64 __rdtsc(void) {
-  uint32_t a, d;
-  __asm__ volatile("rdtsc" : "=a" (a), "=d" (d));
-  return ((uint64)d << 32) + a;
-}
-#endif
-#endif
-
 namespace libyuv {
 
 // Set the following flag to true to revert to only
@@ -47,7 +34,7 @@ namespace libyuv {
 // when comparing the quality of the resulting YUV planes
 // as produced by the optimized and non-optimized versions.
 
-bool use_reference_impl_ = false;
+static bool use_reference_impl_ = false;
 
 void SetUseReferenceImpl(bool use) {
   use_reference_impl_ = use;
@@ -62,8 +49,8 @@ void SetUseReferenceImpl(bool use) {
 
 #if defined(__ARM_NEON__) && !defined(COVERAGE_ENABLED)
 #define HAS_SCALEROWDOWN2_NEON
-void ScaleRowDown2_NEON(const uint8* iptr, int32 /* istride */,
-                        uint8* dst, int32 owidth) {
+void ScaleRowDown2_NEON(const uint8* src_ptr, int /* src_stride */,
+                        uint8* dst, int dst_width) {
   __asm__ volatile
   (
     "1:\n"
@@ -72,13 +59,13 @@ void ScaleRowDown2_NEON(const uint8* iptr, int32 /* istride */,
     "subs       %2, %2, #16       \n"  // 16 processed per loop
     "bhi        1b                \n"
     :                                    // Output registers
-    : "r"(iptr), "r"(dst), "r"(owidth)   // Input registers
+    : "r"(src_ptr), "r"(dst), "r"(dst_width)   // Input registers
     : "r4", "q0", "q1"                   // Clobber List
   );
 }
 
-void ScaleRowDown2Int_NEON(const uint8* iptr, int32 istride,
-                           uint8* dst, int32 owidth) {
+void ScaleRowDown2Int_NEON(const uint8* src_ptr, int src_stride,
+                           uint8* dst, int dst_width) {
   __asm__ volatile
   (
     "mov        r4, #2            \n"  // rounding constant
@@ -99,7 +86,7 @@ void ScaleRowDown2Int_NEON(const uint8* iptr, int32 istride,
     "subs       %3, %3, #16       \n"  // 16 processed per loop
     "bhi        1b                \n"
     :                                                 // Output registers
-    : "r"(iptr), "r"(istride), "r"(dst), "r"(owidth)  // Input registers
+    : "r"(src_ptr), "r"(src_stride), "r"(dst), "r"(dst_width)  // Input registers
     : "r4", "q0", "q1", "q2", "q3", "q4"              // Clobber List
    );
 }
@@ -201,15 +188,15 @@ extern "C" TALIGN16(const uint16, scaleab2[8]) =
 
 #define HAS_SCALEROWDOWN2_SSE2
 // Reads 32 pixels, throws half away and writes 16 pixels.
-// Alignment requirement: iptr 16 byte aligned, optr 16 byte aligned.
+// Alignment requirement: src_ptr 16 byte aligned, optr 16 byte aligned.
 __declspec(naked)
-static void ScaleRowDown2_SSE2(const uint8* iptr, int32 istride,
-                               uint8* optr, int32 owidth) {
+static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride,
+                               uint8* optr, int dst_width) {
   __asm {
-    mov        eax, [esp + 4]        // iptr
-                                     // istride ignored
+    mov        eax, [esp + 4]        // src_ptr
+                                     // src_stride ignored
     mov        edx, [esp + 12]       // optr
-    mov        ecx, [esp + 16]       // owidth
+    mov        ecx, [esp + 16]       // dst_width
     pcmpeqb    xmm7, xmm7            // generate mask 0x00ff00ff
     psrlw      xmm7, 8
 
@@ -229,16 +216,16 @@ static void ScaleRowDown2_SSE2(const uint8* iptr, int32 istride,
   }
 }
 // Blends 32x2 rectangle to 16x1.
-// Alignment requirement: iptr 16 byte aligned, optr 16 byte aligned.
+// Alignment requirement: src_ptr 16 byte aligned, optr 16 byte aligned.
 __declspec(naked)
-static void ScaleRowDown2Int_SSE2(const uint8* iptr, int32 istride,
-                                  uint8* optr, int32 owidth) {
+static void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
+                                  uint8* optr, int dst_width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]    // iptr
-    mov        esi, [esp + 4 + 8]    // istride
+    mov        eax, [esp + 4 + 4]    // src_ptr
+    mov        esi, [esp + 4 + 8]    // src_stride
     mov        edx, [esp + 4 + 12]   // optr
-    mov        ecx, [esp + 4 + 16]   // owidth
+    mov        ecx, [esp + 4 + 16]   // dst_width
     pcmpeqb    xmm7, xmm7            // generate mask 0x00ff00ff
     psrlw      xmm7, 8
 
@@ -273,16 +260,16 @@ static void ScaleRowDown2Int_SSE2(const uint8* iptr, int32 istride,
 
 #define HAS_SCALEROWDOWN4_SSE2
 // Point samples 32 pixels to 8 pixels.
-// Alignment requirement: iptr 16 byte aligned, optr 8 byte aligned.
+// Alignment requirement: src_ptr 16 byte aligned, optr 8 byte aligned.
 __declspec(naked)
-static void ScaleRowDown4_SSE2(const uint8* iptr, int32 istride,
-                               uint8* orow, int32 owidth) {
+static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride,
+                               uint8* orow, int dst_width) {
   __asm {
     pushad
-    mov        esi, [esp + 32 + 4]   // iptr
-                                     // istride ignored
+    mov        esi, [esp + 32 + 4]   // src_ptr
+                                     // src_stride ignored
     mov        edi, [esp + 32 + 12]  // orow
-    mov        ecx, [esp + 32 + 16]  // owidth
+    mov        ecx, [esp + 32 + 16]  // dst_width
     pcmpeqb    xmm7, xmm7            // generate mask 0x000000ff
     psrld      xmm7, 24
 
@@ -305,19 +292,19 @@ static void ScaleRowDown4_SSE2(const uint8* iptr, int32 istride,
 }
 
 // Blends 32x4 rectangle to 8x1.
-// Alignment requirement: iptr 16 byte aligned, optr 8 byte aligned.
+// Alignment requirement: src_ptr 16 byte aligned, optr 8 byte aligned.
 __declspec(naked)
-static void ScaleRowDown4Int_SSE2(const uint8* iptr, int32 istride,
-                                  uint8* orow, int32 owidth) {
+static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride,
+                                  uint8* orow, int dst_width) {
   __asm {
     pushad
-    mov        esi, [esp + 32 + 4]   // iptr
-    mov        ebx, [esp + 32 + 8]   // istride
+    mov        esi, [esp + 32 + 4]   // src_ptr
+    mov        ebx, [esp + 32 + 8]   // src_stride
     mov        edi, [esp + 32 + 12]  // orow
-    mov        ecx, [esp + 32 + 16]  // owidth
+    mov        ecx, [esp + 32 + 16]  // dst_width
     pcmpeqb    xmm7, xmm7            // generate mask 0x00ff00ff
     psrlw      xmm7, 8
-    lea        edx, [ebx + ebx * 2]  // istride * 3
+    lea        edx, [ebx + ebx * 2]  // src_stride * 3
 
   wloop:
     movdqa     xmm0, [esi]
@@ -364,17 +351,17 @@ static void ScaleRowDown4Int_SSE2(const uint8* iptr, int32 istride,
 
 #define HAS_SCALEROWDOWN8_SSE2
 // Point samples 32 pixels to 4 pixels.
-// Alignment requirement: iptr 16 byte aligned, optr 4 byte aligned.
+// Alignment requirement: src_ptr 16 byte aligned, optr 4 byte aligned.
 __declspec(naked)
-static void ScaleRowDown8_SSE2(const uint8* iptr, int32 istride,
-                               uint8* orow, int32 owidth) {
+static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride,
+                               uint8* orow, int dst_width) {
   __asm {
     pushad
-    mov        esi, [esp + 32 + 4]   // iptr
-                                     // istride ignored
+    mov        esi, [esp + 32 + 4]   // src_ptr
+                                     // src_stride ignored
     mov        edi, [esp + 32 + 12]  // orow
-    mov        ecx, [esp + 32 + 16]  // owidth
-    pcmpeqb    xmm7, xmm7            // generate mask isolating 1 in 8 bytes
+    mov        ecx, [esp + 32 + 16]  // dst_width
+    pcmpeqb    xmm7, xmm7            // generate mask isolating 1 in 8 bytes
     psrlq      xmm7, 56
 
   wloop:
@@ -397,17 +384,17 @@ static void ScaleRowDown8_SSE2(const uint8* iptr, int32 istride,
 }
 
 // Blends 32x8 rectangle to 4x1.
-// Alignment requirement: iptr 16 byte aligned, optr 4 byte aligned.
+// Alignment requirement: src_ptr 16 byte aligned, optr 4 byte aligned.
 __declspec(naked)
-static void ScaleRowDown8Int_SSE2(const uint8* iptr, int32 istride,
-                                  uint8* orow, int32 owidth) {
+static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
+                                  uint8* orow, int dst_width) {
   __asm {
     pushad
-    mov        esi, [esp + 32 + 4]   // iptr
-    mov        ebx, [esp + 32 + 8]   // istride
+    mov        esi, [esp + 32 + 4]   // src_ptr
+    mov        ebx, [esp + 32 + 8]   // src_stride
     mov        edi, [esp + 32 + 12]  // orow
-    mov        ecx, [esp + 32 + 16]  // owidth
-    lea        edx, [ebx + ebx * 2]  // istride * 3
+    mov        ecx, [esp + 32 + 16]  // dst_width
+    lea        edx, [ebx + ebx * 2]  // src_stride * 3
     pxor       xmm7, xmm7
 
   wloop:
@@ -470,16 +457,16 @@ static void ScaleRowDown8Int_SSE2(const uint8* iptr, int32 istride,
 // Then shuffled to do the scaling.
 
 // Note that movdqa+palign may be better than movdqu.
-// Alignment requirement: iptr 16 byte aligned, optr 8 byte aligned.
+// Alignment requirement: src_ptr 16 byte aligned, optr 8 byte aligned.
 __declspec(naked)
-static void ScaleRowDown34_SSSE3(const uint8* iptr, int32 istride,
-                                 uint8* orow, int32 owidth) {
+static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
+                                 uint8* orow, int dst_width) {
   __asm {
     pushad
-    mov        esi, [esp + 32 + 4]   // iptr
-                                     // istride ignored
+    mov        esi, [esp + 32 + 4]   // src_ptr
+                                     // src_stride ignored
     mov        edi, [esp + 32 + 12]  // orow
-    mov        ecx, [esp + 32 + 16]  // owidth
+    mov        ecx, [esp + 32 + 16]  // dst_width
     movdqa     xmm3, _shuf0
     movdqa     xmm4, _shuf1
     movdqa     xmm5, _shuf2
@@ -520,16 +507,16 @@ static void ScaleRowDown34_SSSE3(const uint8* iptr, int32 istride,
 // xmm7 round34
 
 // Note that movdqa+palign may be better than movdqu.
-// Alignment requirement: iptr 16 byte aligned, optr 8 byte aligned.
+// Alignment requirement: src_ptr 16 byte aligned, optr 8 byte aligned.
 __declspec(naked)
-static void ScaleRowDown34_1_Int_SSSE3(const uint8* iptr, int32 istride,
-                                       uint8* orow, int32 owidth) {
+static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
+                                       uint8* orow, int dst_width) {
   __asm {
     pushad
-    mov        esi, [esp + 32 + 4]   // iptr
-    mov        ebx, [esp + 32 + 8]   // istride
+    mov        esi, [esp + 32 + 4]   // src_ptr
+    mov        ebx, [esp + 32 + 8]   // src_stride
     mov        edi, [esp + 32 + 12]  // orow
-    mov        ecx, [esp + 32 + 16]  // owidth
+    mov        ecx, [esp + 32 + 16]  // dst_width
     movdqa     xmm2, _shuf01
     movdqa     xmm3, _shuf11
     movdqa     xmm4, _shuf21
@@ -577,16 +564,16 @@ static void ScaleRowDown34_1_Int_SSSE3(const uint8* iptr, int32 istride,
 }
 
 // Note that movdqa+palign may be better than movdqu.
-// Alignment requirement: iptr 16 byte aligned, optr 8 byte aligned.
+// Alignment requirement: src_ptr 16 byte aligned, optr 8 byte aligned.
 __declspec(naked)
-static void ScaleRowDown34_0_Int_SSSE3(const uint8* iptr, int32 istride,
-                                       uint8* orow, int32 owidth) {
+static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
+                                       uint8* orow, int dst_width) {
   __asm {
     pushad
-    mov        esi, [esp + 32 + 4]   // iptr
-    mov        ebx, [esp + 32 + 8]   // istride
+    mov        esi, [esp + 32 + 4]   // src_ptr
+    mov        ebx, [esp + 32 + 8]   // src_stride
     mov        edi, [esp + 32 + 12]  // orow
-    mov        ecx, [esp + 32 + 16]  // owidth
+    mov        ecx, [esp + 32 + 16]  // dst_width
     movdqa     xmm2, _shuf01
     movdqa     xmm3, _shuf11
     movdqa     xmm4, _shuf21
@@ -641,14 +628,14 @@ static void ScaleRowDown34_0_Int_SSSE3(const uint8* iptr, int32 istride,
 
 // Scale 32 pixels to 12
 __declspec(naked)
-static void ScaleRowDown38_SSSE3(const uint8* iptr, int32 istride,
-                                 uint8* optr, int32 owidth) {
+static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
+                                 uint8* optr, int dst_width) {
   __asm {
     pushad
-    mov        esi, [esp + 32 + 4]   // iptr
-    mov        edx, [esp + 32 + 8]   // istride
+    mov        esi, [esp + 32 + 4]   // src_ptr
+    mov        edx, [esp + 32 + 8]   // src_stride
     mov        edi, [esp + 32 + 12]  // optr
-    mov        ecx, [esp + 32 + 16]  // owidth
+    mov        ecx, [esp + 32 + 16]  // dst_width
     movdqa     xmm5, _shuf38a
     movdqa     xmm6, _shuf38b
     pxor       xmm7, xmm7
@@ -675,14 +662,14 @@ static void ScaleRowDown38_SSSE3(const uint8* iptr, int32 istride,
 
 // Scale 16x3 pixels to 6x1 with interpolation
 __declspec(naked)
-static void ScaleRowDown38_3_Int_SSSE3(const uint8* iptr, int32 istride,
-                                       uint8* optr, int32 owidth) {
+static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
+                                       uint8* optr, int dst_width) {
   __asm {
     pushad
-    mov        esi, [esp + 32 + 4]   // iptr
-    mov        edx, [esp + 32 + 8]   // istride
+    mov        esi, [esp + 32 + 4]   // src_ptr
+    mov        edx, [esp + 32 + 8]   // src_stride
     mov        edi, [esp + 32 + 12]  // optr
-    mov        ecx, [esp + 32 + 16]  // owidth
+    mov        ecx, [esp + 32 + 16]  // dst_width
     movdqa     xmm4, _shufac0
     movdqa     xmm5, _shufac3
     movdqa     xmm6, _scaleac3
@@ -739,14 +726,14 @@ static void ScaleRowDown38_3_Int_SSSE3(const uint8* iptr, int32 istride,
 
 // Scale 16x2 pixels to 6x1 with interpolation
 __declspec(naked)
-static void ScaleRowDown38_2_Int_SSSE3(const uint8* iptr, int32 istride,
-                                       uint8* optr, int32 owidth) {
+static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
+                                       uint8* optr, int dst_width) {
   __asm {
     pushad
-    mov        esi, [esp + 32 + 4]   // iptr
-    mov        edx, [esp + 32 + 8]   // istride
+    mov        esi, [esp + 32 + 4]   // src_ptr
+    mov        edx, [esp + 32 + 8]   // src_stride
     mov        edi, [esp + 32 + 12]  // optr
-    mov        ecx, [esp + 32 + 16]  // owidth
+    mov        ecx, [esp + 32 + 16]  // dst_width
     movdqa     xmm4, _shufab0
     movdqa     xmm5, _shufab1
     movdqa     xmm6, _shufab2
@@ -784,14 +771,14 @@ static void ScaleRowDown38_2_Int_SSSE3(const uint8* iptr, int32 istride,
 
 // Reads 8xN bytes and produces 16 shorts at a time.
 __declspec(naked)
-static void ScaleAddRows_SSE2(const uint8* iptr, int32 istride,
-                              uint16* orow, int32 iwidth, int32 iheight) {
+static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
+                              uint16* orow, int src_width, int src_height) {
   __asm {
     pushad
-    mov        esi, [esp + 32 + 4]   // iptr
-    mov        edx, [esp + 32 + 8]   // istride
+    mov        esi, [esp + 32 + 4]   // src_ptr
+    mov        edx, [esp + 32 + 8]   // src_stride
     mov        edi, [esp + 32 + 12]  // orow
-    mov        ecx, [esp + 32 + 16]  // owidth
+    mov        ecx, [esp + 32 + 16]  // src_width
     mov        ebx, [esp + 32 + 20]  // height
     pxor       xmm7, xmm7
     dec        ebx
@@ -833,15 +820,15 @@ static void ScaleAddRows_SSE2(const uint8* iptr, int32 istride,
 // Bilinear row filtering combines 16x2 -> 16x1. SSE2 version.
 #define HAS_SCALEFILTERROWS_SSE2
 __declspec(naked)
-static void ScaleFilterRows_SSE2(uint8* optr, const uint8* iptr0, int32 istride,
-                                 int owidth, int source_y_fraction) {
+static void ScaleFilterRows_SSE2(uint8* optr, const uint8* iptr0, int src_stride,
+                                 int dst_width, int source_y_fraction) {
   __asm {
     push       esi
     push       edi
     mov        edi, [esp + 8 + 4]   // optr
     mov        esi, [esp + 8 + 8]   // iptr0
-    mov        edx, [esp + 8 + 12]  // istride
-    mov        ecx, [esp + 8 + 16]  // owidth
+    mov        edx, [esp + 8 + 12]  // src_stride
+    mov        ecx, [esp + 8 + 16]  // dst_width
     mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
     cmp        eax, 0
     je         xloop1
@@ -923,15 +910,15 @@ static void ScaleFilterRows_SSE2(uint8* optr, const uint8* iptr0, int32 istride,
 // Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version.
 #define HAS_SCALEFILTERROWS_SSSE3
 __declspec(naked)
-static void ScaleFilterRows_SSSE3(uint8* optr, const uint8* iptr0, int32 istride,
-                                  int owidth, int source_y_fraction) {
+static void ScaleFilterRows_SSSE3(uint8* optr, const uint8* iptr0, int src_stride,
+                                  int dst_width, int source_y_fraction) {
   __asm {
     push       esi
     push       edi
     mov        edi, [esp + 8 + 4]   // optr
     mov        esi, [esp + 8 + 8]   // iptr0
-    mov        edx, [esp + 8 + 12]  // istride
-    mov        ecx, [esp + 8 + 16]  // owidth
+    mov        edx, [esp + 8 + 12]  // src_stride
+    mov        ecx, [esp + 8 + 16]  // dst_width
     mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
     cmp        eax, 0
     je         xloop1
@@ -1003,14 +990,14 @@ static void ScaleFilterRows_SSSE3(uint8* optr, const uint8* iptr0, int32 istride
 }
 
 // Note that movdqa+palign may be better than movdqu.
-// Alignment requirement: iptr 16 byte aligned, optr 8 byte aligned.
+// Alignment requirement: src_ptr 16 byte aligned, optr 8 byte aligned.
 __declspec(naked)
-static void ScaleFilterCols34_SSSE3(uint8* optr, const uint8* iptr,
-                                    int owidth) {
+static void ScaleFilterCols34_SSSE3(uint8* optr, const uint8* src_ptr,
+                                    int dst_width) {
   __asm {
     mov        edx, [esp + 4]    // optr
-    mov        eax, [esp + 8]    // iptr
-    mov        ecx, [esp + 12]   // owidth
+    mov        eax, [esp + 8]    // src_ptr
+    mov        ecx, [esp + 12]   // dst_width
     movdqa     xmm1, _round34
     movdqa     xmm2, _shuf01
     movdqa     xmm3, _shuf11
@@ -1056,8 +1043,8 @@ static void ScaleFilterCols34_SSSE3(uint8* optr, const uint8* iptr,
 // Generated using gcc disassembly on Visual C object file:
 // objdump -D yuvscaler.obj >yuvscaler.txt
 #define HAS_SCALEROWDOWN2_SSE2
-extern "C" void ScaleRowDown2_SSE2(const uint8* iptr, int32 istride,
-                                   uint8* orow, int32 owidth);
+extern "C" void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride,
+                                   uint8* orow, int dst_width);
   asm(
     ".text\n"
 #if defined(OSX)
@@ -1087,8 +1074,8 @@ extern "C" void ScaleRowDown2_SSE2(const uint8* iptr, int32 istride,
     "ret\n"
 );
 
-extern "C" void ScaleRowDown2Int_SSE2(const uint8* iptr, int32 istride,
-                                      uint8* orow, int32 owidth);
+extern "C" void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
+                                      uint8* orow, int dst_width);
   asm(
     ".text\n"
 #if defined(OSX)
@@ -1132,8 +1119,8 @@ extern "C" void ScaleRowDown2Int_SSE2(const uint8* iptr, int32 istride,
 );
 
 #define HAS_SCALEROWDOWN4_SSE2
-extern "C" void ScaleRowDown4_SSE2(const uint8* iptr, int32 istride,
-                                   uint8* orow, int32 owidth);
+extern "C" void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride,
+                                   uint8* orow, int dst_width);
   asm(
     ".text\n"
 #if defined(OSX)
@@ -1166,8 +1153,8 @@ extern "C" void ScaleRowDown4_SSE2(const uint8* iptr, int32 istride,
     "ret\n"
 );
 
-extern "C" void ScaleRowDown4Int_SSE2(const uint8* iptr, int32 istride,
-                                      uint8* orow, int32 owidth);
+extern "C" void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride,
+                                      uint8* orow, int dst_width);
   asm(
     ".text\n"
 #if defined(OSX)
@@ -1225,8 +1212,8 @@ extern "C" void ScaleRowDown4Int_SSE2(const uint8* iptr, int32 istride,
 );
 
 #define HAS_SCALEROWDOWN8_SSE2
-extern "C" void ScaleRowDown8_SSE2(const uint8* iptr, int32 istride,
-                                   uint8* orow, int32 owidth);
+extern "C" void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride,
+                                   uint8* orow, int dst_width);
   asm(
     ".text\n"
 #if defined(OSX)
@@ -1260,8 +1247,8 @@ extern "C" void ScaleRowDown8_SSE2(const uint8* iptr, int32 istride,
     "ret\n"
 );
 
-extern "C" void ScaleRowDown8Int_SSE2(const uint8* iptr, int32 istride,
-                                      uint8* orow, int32 owidth);
+extern "C" void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
+                                      uint8* orow, int dst_width);
   asm(
     ".text\n"
 #if defined(OSX)
@@ -1331,8 +1318,8 @@ extern "C" void ScaleRowDown8Int_SSE2(const uint8* iptr, int32 istride,
 // fpic is used for magiccam plugin
 #if !defined(__PIC__)
 #define HAS_SCALEROWDOWN34_SSSE3
-extern "C" void ScaleRowDown34_SSSE3(const uint8* iptr, int32 istride,
-                                     uint8* orow, int32 owidth);
+extern "C" void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
+                                     uint8* orow, int dst_width);
   asm(
     ".text\n"
 #if defined(OSX)
@@ -1369,8 +1356,8 @@ extern "C" void ScaleRowDown34_SSSE3(const uint8* iptr, int32 istride,
     "ret\n"
 );
 
-extern "C" void ScaleRowDown34_1_Int_SSSE3(const uint8* iptr, int32 istride,
-                                           uint8* orow, int32 owidth);
+extern "C" void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
+                                           uint8* orow, int dst_width);
   asm(
     ".text\n"
 #if defined(OSX)
@@ -1430,8 +1417,8 @@ extern "C" void ScaleRowDown34_1_Int_SSSE3(const uint8* iptr, int32 istride,
     "ret\n"
 );
 
-extern "C" void ScaleRowDown34_0_Int_SSSE3(const uint8* iptr, int32 istride,
-                                           uint8* orow, int32 owidth);
+extern "C" void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
+                                           uint8* orow, int dst_width);
   asm(
     ".text\n"
 #if defined(OSX)
@@ -1494,8 +1481,8 @@ extern "C" void ScaleRowDown34_0_Int_SSSE3(const uint8* iptr, int32 istride,
 );
 
 #define HAS_SCALEROWDOWN38_SSSE3
-extern "C" void ScaleRowDown38_SSSE3(const uint8* iptr, int32 istride,
-                                     uint8* optr, int32 owidth);
+extern "C" void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
+                                     uint8* optr, int dst_width);
   asm(
     ".text\n"
 #if defined(OSX)
@@ -1531,8 +1518,8 @@ extern "C" void ScaleRowDown38_SSSE3(const uint8* iptr, int32 istride,
     "ret\n"
 );
 
-extern "C" void ScaleRowDown38_3_Int_SSSE3(const uint8* iptr, int32 istride,
-                                           uint8* optr, int32 owidth);
+extern "C" void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
+                                           uint8* optr, int dst_width);
   asm(
     ".text\n"
 #if defined(OSX)
@@ -1595,8 +1582,8 @@ extern "C" void ScaleRowDown38_3_Int_SSSE3(const uint8* iptr, int32 istride,
     "ret\n"
 );
 
-extern "C" void ScaleRowDown38_2_Int_SSSE3(const uint8* iptr, int32 istride,
-                                           uint8* optr, int32 owidth);
+extern "C" void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
+                                           uint8* optr, int dst_width);
   asm(
     ".text\n"
 #if defined(OSX)
@@ -1641,8 +1628,8 @@ extern "C" void ScaleRowDown38_2_Int_SSSE3(const uint8* iptr, int32 istride,
 #endif // __PIC__
 
 #define HAS_SCALEADDROWS_SSE2
-extern "C" void ScaleAddRows_SSE2(const uint8* iptr, int32 istride,
-                                  uint16* orow, int32 iwidth, int32 iheight);
+extern "C" void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
+                                  uint16* orow, int src_width, int src_height);
   asm(
     ".text\n"
 #if defined(OSX)
@@ -1692,8 +1679,8 @@ extern "C" void ScaleAddRows_SSE2(const uint8* iptr, int32 istride,
 // Bilinear row filtering combines 16x2 -> 16x1. SSE2 version
 #define HAS_SCALEFILTERROWS_SSE2
 extern "C" void ScaleFilterRows_SSE2(uint8* optr,
-                                     const uint8* iptr0, int32 istride,
-                                     int owidth, int source_y_fraction);
+                                     const uint8* iptr0, int src_stride,
+                                     int dst_width, int source_y_fraction);
   asm(
     ".text\n"
 #if defined(OSX)
@@ -1787,8 +1774,8 @@ extern "C" void ScaleFilterRows_SSE2(uint8* optr,
 // Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version
 #define HAS_SCALEFILTERROWS_SSSE3
 extern "C" void ScaleFilterRows_SSSE3(uint8* optr,
-                                      const uint8* iptr0, int32 istride,
-                                      int owidth, int source_y_fraction);
+                                      const uint8* iptr0, int src_stride,
+                                      int dst_width, int source_y_fraction);
   asm(
     ".text\n"
 #if defined(OSX)
@@ -1870,42 +1857,42 @@ extern "C" void ScaleFilterRows_SSSE3(uint8* optr,
 #endif
 
 // CPU agnostic row functions
-static void ScaleRowDown2_C(const uint8* iptr, int32,
-                            uint8* dst, int32 owidth) {
-  for (int x = 0; x < owidth; ++x) {
-    *dst++ = *iptr;
-    iptr += 2;
+static void ScaleRowDown2_C(const uint8* src_ptr, int /* src_stride */,
+                            uint8* dst, int dst_width) {
+  // Unfiltered 1/2 scale: copy every second pixel of the source row.
+  for (int i = 0; i < dst_width; ++i) {
+    dst[i] = src_ptr[i * 2];
   }
 }
 
-static void ScaleRowDown2Int_C(const uint8* iptr, int32 istride,
-                               uint8* dst, int32 owidth) {
-  for (int x = 0; x < owidth; ++x) {
-    *dst++ = (iptr[0] + iptr[1] +
-              iptr[istride] + iptr[istride + 1] + 2) >> 2;
-    iptr += 2;
+static void ScaleRowDown2Int_C(const uint8* src_ptr, int src_stride,
+                               uint8* dst, int dst_width) {
+  const uint8* below = src_ptr + src_stride;  // second row of each 2x2 box
+  for (int i = 0; i < dst_width; ++i, src_ptr += 2, below += 2) {
+    // Rounded mean of the 2x2 box (+2 before >> 2).
+    dst[i] = (src_ptr[0] + src_ptr[1] + below[0] + below[1] + 2) >> 2;
   }
 }
 
-static void ScaleRowDown4_C(const uint8* iptr, int32,
-                            uint8* dst, int32 owidth) {
-  for (int x = 0; x < owidth; ++x) {
-    *dst++ = *iptr;
-    iptr += 4;
+static void ScaleRowDown4_C(const uint8* src_ptr, int /* src_stride */,
+                            uint8* dst, int dst_width) {
+  // Unfiltered 1/4 scale: copy every fourth pixel of the source row.
+  for (int i = 0; i < dst_width; ++i) {
+    dst[i] = src_ptr[i * 4];
   }
 }
 
-static void ScaleRowDown4Int_C(const uint8* iptr, int32 istride,
-                               uint8* dst, int32 owidth) {
-  for (int x = 0; x < owidth; ++x) {
-    *dst++ = (iptr[0] + iptr[1] + iptr[2] + iptr[3] +
-              iptr[istride + 0] + iptr[istride + 1] +
-              iptr[istride + 2] + iptr[istride + 3] +
-              iptr[istride * 2 + 0] + iptr[istride * 2 + 1] +
-              iptr[istride * 2 + 2] + iptr[istride * 2 + 3] +
-              iptr[istride * 3 + 0] + iptr[istride * 3 + 1] +
-              iptr[istride * 3 + 2] + iptr[istride * 3 + 3] + 8) >> 4;
-    iptr += 4;
+static void ScaleRowDown4Int_C(const uint8* src_ptr, int src_stride,
+                               uint8* dst, int dst_width) {
+  // Box filter: every output pixel is the rounded mean of a 4x4 source block.
+  for (int i = 0; i < dst_width; ++i) {
+    int total = 8;  // rounding bias for the final >> 4
+    for (int row = 0; row < 4; ++row) {
+      // Four horizontally adjacent pixels of this block on source row `row`.
+      const uint8* p = src_ptr + src_stride * row + i * 4;
+      total += p[0] + p[1] + p[2] + p[3];
+    }
+    dst[i] = total >> 4;
   }
 }
 
@@ -1914,46 +1901,46 @@ static void ScaleRowDown4Int_C(const uint8* iptr, int32 istride,
 static const int kMaxOutputWidth = 640;
 static const int kMaxRow12 = kMaxOutputWidth * 2;
 
-static void ScaleRowDown8_C(const uint8* iptr, int32,
-                            uint8* dst, int32 owidth) {
-  for (int x = 0; x < owidth; ++x) {
-    *dst++ = *iptr;
-    iptr += 8;
+static void ScaleRowDown8_C(const uint8* src_ptr, int /* src_stride */,
+                            uint8* dst, int dst_width) {
+  // Unfiltered 1/8 scale: copy every eighth pixel of the source row.
+  for (int i = 0; i < dst_width; ++i) {
+    dst[i] = src_ptr[i * 8];
   }
 }
 
 // Note calling code checks width is less than max and if not
 // uses ScaleRowDown8_C instead.
-static void ScaleRowDown8Int_C(const uint8* iptr, int32 istride,
-                               uint8* dst, int32 owidth) {
+static void ScaleRowDown8Int_C(const uint8* src_ptr, int src_stride,
+                               uint8* dst, int dst_width) {
   ALIGN16(uint8 irow[kMaxRow12 * 2]);
-  assert(owidth <= kMaxOutputWidth);
-  ScaleRowDown4Int_C(iptr, istride, irow, owidth * 2);
-  ScaleRowDown4Int_C(iptr + istride * 4, istride, irow + kMaxOutputWidth,
-                     owidth * 2);
-  ScaleRowDown2Int_C(irow, kMaxOutputWidth, dst, owidth);
+  assert(dst_width <= kMaxOutputWidth);  // hence dst_width * 2 <= kMaxRow12
+  ScaleRowDown4Int_C(src_ptr, src_stride, irow, dst_width * 2);  // rows 0-3
+  ScaleRowDown4Int_C(src_ptr + src_stride * 4, src_stride,  // rows 4-7, kept
+                     irow + kMaxRow12, dst_width * 2);      // clear of rows 0-3
+  ScaleRowDown2Int_C(irow, kMaxRow12, dst, dst_width);  // 2x2 mean of both rows
 }
 
-static void ScaleRowDown34_C(const uint8* iptr, int32,
-                             uint8* dst, int32 owidth) {
-  assert((owidth % 3 == 0) && (owidth > 0));
-  uint8* dend = dst + owidth;
+static void ScaleRowDown34_C(const uint8* src_ptr, int,  // stride is unused
+                             uint8* dst, int dst_width) {
+  assert((dst_width % 3 == 0) && (dst_width > 0));  // loop below runs >= once
+  uint8* dend = dst + dst_width;  // one past the last output pixel
   do {
-    dst[0] = iptr[0];
-    dst[1] = iptr[1];
-    dst[2] = iptr[3];
+    dst[0] = src_ptr[0];
+    dst[1] = src_ptr[1];
+    dst[2] = src_ptr[3];  // src_ptr[2] is dropped
     dst += 3;
-    iptr += 4;
+    src_ptr += 4;  // 4 source pixels are consumed per 3 output pixels
   } while (dst < dend);
 }
 
 // Filter rows 0 and 1 together, 3 : 1
-static void ScaleRowDown34_0_Int_C(const uint8* iptr, int32 istride,
-                                   uint8* d, int32 owidth) {
-  assert((owidth % 3 == 0) && (owidth > 0));
-  uint8* dend = d + owidth;
-  const uint8* s = iptr;
-  const uint8* t = iptr + istride;
+static void ScaleRowDown34_0_Int_C(const uint8* src_ptr, int src_stride,
+                                   uint8* d, int dst_width) {
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+  uint8* dend = d + dst_width;
+  const uint8* s = src_ptr;
+  const uint8* t = src_ptr + src_stride;
   do {
     uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
     uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
@@ -1971,12 +1958,12 @@ static void ScaleRowDown34_0_Int_C(const uint8* iptr, int32 istride,
 }
 
 // Filter rows 1 and 2 together, 1 : 1
-static void ScaleRowDown34_1_Int_C(const uint8* iptr, int32 istride,
-                                   uint8* d, int32 owidth) {
-  assert((owidth % 3 == 0) && (owidth > 0));
-  uint8* dend = d + owidth;
-  const uint8* s = iptr;
-  const uint8* t = iptr + istride;
+static void ScaleRowDown34_1_Int_C(const uint8* src_ptr, int src_stride,
+                                   uint8* d, int dst_width) {
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+  uint8* dend = d + dst_width;
+  const uint8* s = src_ptr;
+  const uint8* t = src_ptr + src_stride;
   do {
     uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
     uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
@@ -1995,10 +1982,10 @@ static void ScaleRowDown34_1_Int_C(const uint8* iptr, int32 istride,
 
 #if defined(HAS_SCALEFILTERROWS_SSE2)
 // Filter row to 3/4
-static void ScaleFilterCols34_C(uint8* optr, const uint8* iptr, int owidth) {
-  assert((owidth % 3 == 0) && (owidth > 0));
-  uint8* dend = optr + owidth;
-  const uint8* s = iptr;
+static void ScaleFilterCols34_C(uint8* optr, const uint8* src_ptr, int dst_width) {
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+  uint8* dend = optr + dst_width;
+  const uint8* s = src_ptr;
   do {
     optr[0] = (s[0] * 3 + s[1] * 1 + 2) >> 2;
     optr[1] = (s[1] * 1 + s[2] * 1 + 1) >> 1;
@@ -2009,130 +1996,103 @@ static void ScaleFilterCols34_C(uint8* optr, const uint8* iptr, int owidth) {
 }
 #endif
 
-static void ScaleFilterCols_C(uint8* optr, const uint8* iptr,
-                              int owidth, int dx) {
+static void ScaleFilterCols_C(uint8* optr, const uint8* src_ptr,
+                              int dst_width, int dx) {  // dx: 16.16 source step
   int x = 0;
-  for (int j = 0; j < owidth; ++j) {
+  for (int j = 0; j < dst_width; ++j) {  // x is the 16.16 source position
     int xi = x >> 16;
     int xf1 = x & 0xffff;
     int xf0 = 65536 - xf1;
 
-    *optr++ = (iptr[xi] * xf0 + iptr[xi + 1] * xf1) >> 16;
+    *optr++ = (src_ptr[xi] * xf0 + src_ptr[xi + 1] * xf1) >> 16;  // lerp
     x += dx;
   }
 }
 
-#ifdef TEST_RSTSC
-uint64 timers34[4] = { 0, };
-#endif
-
 static const int kMaxInputWidth = 2560;
 #if defined(HAS_SCALEFILTERROWS_SSE2)
 #define HAS_SCALEROWDOWN34_SSE2
 // Filter rows 0 and 1 together, 3 : 1
-static void ScaleRowDown34_0_Int_SSE2(const uint8* iptr, int32 istride,
-                                      uint8* d, int32 owidth) {
-  assert((owidth % 3 == 0) && (owidth > 0));
+static void ScaleRowDown34_0_Int_SSE2(const uint8* src_ptr, int src_stride,
+                                      uint8* d, int dst_width) {
+  assert((dst_width % 3 == 0) && (dst_width > 0));  // NOTE(review): see below
   ALIGN16(uint8 row[kMaxInputWidth]);
-#ifdef TEST_RSTSC
-  uint64 t1 = __rdtsc();
-#endif
-  ScaleFilterRows_SSE2(row, iptr, istride, owidth * 4 / 3, 256 / 4);
-#ifdef TEST_RSTSC
-  uint64 t2 = __rdtsc();
-#endif
-  ScaleFilterCols34_C(d, row, owidth);
-
-#ifdef TEST_RSTSC
-  uint64 t3 = __rdtsc();
-  timers34[0] += t2 - t1;
-  timers34[1] += t3 - t2;
-#endif
+  ScaleFilterRows_SSE2(row, src_ptr, src_stride, dst_width * 4 / 3, 256 / 4);
+  ScaleFilterCols34_C(d, row, dst_width);  // dst_width*4/3 vs row[] unchecked
 }
 
 // Filter rows 1 and 2 together, 1 : 1
-static void ScaleRowDown34_1_Int_SSE2(const uint8* iptr, int32 istride,
-                                      uint8* d, int32 owidth) {
-  assert((owidth % 3 == 0) && (owidth > 0));
+static void ScaleRowDown34_1_Int_SSE2(const uint8* src_ptr, int src_stride,
+                                      uint8* d, int dst_width) {
+  assert((dst_width % 3 == 0) && (dst_width > 0));  // NOTE(review): see below
   ALIGN16(uint8 row[kMaxInputWidth]);
-#ifdef TEST_RSTSC
-  uint64 t1 = __rdtsc();
-#endif
-  ScaleFilterRows_SSE2(row, iptr, istride, owidth * 4 / 3, 256 / 2);
-#ifdef TEST_RSTSC
-  uint64 t2 = __rdtsc();
-#endif
-  ScaleFilterCols34_C(d, row, owidth);
-#ifdef TEST_RSTSC
-  uint64 t3 = __rdtsc();
-  timers34[2] += t2 - t1;
-  timers34[3] += t3 - t2;
-#endif
+  ScaleFilterRows_SSE2(row, src_ptr, src_stride, dst_width * 4 / 3, 256 / 2);
+  ScaleFilterCols34_C(d, row, dst_width);  // dst_width*4/3 vs row[] unchecked
 }
 #endif
 
-static void ScaleRowDown38_C(const uint8* iptr, int32,
-                             uint8* dst, int32 owidth) {
-  assert(owidth % 3 == 0);
-  for (int x = 0; x < owidth; x += 3) {
-    dst[0] = iptr[0];
-    dst[1] = iptr[3];
-    dst[2] = iptr[6];
+static void ScaleRowDown38_C(const uint8* src_ptr, int,  // stride is unused
+                             uint8* dst, int dst_width) {
+  assert(dst_width % 3 == 0);
+  for (int x = 0; x < dst_width; x += 3) {  // 3 output pixels per iteration
+    dst[0] = src_ptr[0];
+    dst[1] = src_ptr[3];
+    dst[2] = src_ptr[6];  // keeps source pixels 0, 3 and 6 of each 8
     dst += 3;
-    iptr += 8;
+    src_ptr += 8;  // 8 source pixels are consumed per 3 output pixels
   }
 }
 
 // 8x3 -> 3x1
-static void ScaleRowDown38_3_Int_C(const uint8* iptr, int32 istride,
-                                   uint8* optr, int32 owidth) {
-  assert((owidth % 3 == 0) && (owidth > 0));
-  for (int i = 0; i < owidth; i+=3) {
-    optr[0] = (iptr[0] + iptr[1] + iptr[2] +
-        iptr[istride + 0] + iptr[istride + 1] + iptr[istride + 2] +
-        iptr[istride * 2 + 0] + iptr[istride * 2 + 1] + iptr[istride * 2 + 2]) *
+static void ScaleRowDown38_3_Int_C(const uint8* src_ptr, int src_stride,
+                                   uint8* optr, int dst_width) {
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+  for (int i = 0; i < dst_width; i+=3) {  // 8 source columns -> 3 outputs
+    optr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +  // 3x3 box, columns 0-2
+        src_ptr[src_stride + 0] + src_ptr[src_stride + 1] + src_ptr[src_stride + 2] +
+        src_ptr[src_stride * 2 + 0] + src_ptr[src_stride * 2 + 1] + src_ptr[src_stride * 2 + 2]) *
         (65536 / 9) >> 16;
-    optr[1] = (iptr[3] + iptr[4] + iptr[5] +
-        iptr[istride + 3] + iptr[istride + 4] + iptr[istride + 5] +
-        iptr[istride * 2 + 3] + iptr[istride * 2 + 4] + iptr[istride * 2 + 5]) *
+    optr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +  // 3x3 box, columns 3-5
+        src_ptr[src_stride + 3] + src_ptr[src_stride + 4] + src_ptr[src_stride + 5] +
+        src_ptr[src_stride * 2 + 3] + src_ptr[src_stride * 2 + 4] + src_ptr[src_stride * 2 + 5]) *
         (65536 / 9) >> 16;
-    optr[2] = (iptr[6] + iptr[7] +
-        iptr[istride + 6] + iptr[istride + 7] +
-        iptr[istride * 2 + 6] + iptr[istride * 2 + 7]) *
+    optr[2] = (src_ptr[6] + src_ptr[7] +  // 2x3 box, columns 6-7
+        src_ptr[src_stride + 6] + src_ptr[src_stride + 7] +
+        src_ptr[src_stride * 2 + 6] + src_ptr[src_stride * 2 + 7]) *
         (65536 / 6) >> 16;
-    iptr += 8;
+    src_ptr += 8;  // advance to the next group of 8 source columns
     optr += 3;
   }
 }
 
 // 8x2 -> 3x1
-static void ScaleRowDown38_2_Int_C(const uint8* iptr, int32 istride,
-                                   uint8* optr, int32 owidth) {
-  assert((owidth % 3 == 0) && (owidth > 0));
-  for (int i = 0; i < owidth; i+=3) {
-    optr[0] = (iptr[0] + iptr[1] + iptr[2] +
-        iptr[istride + 0] + iptr[istride + 1] + iptr[istride + 2]) *
+static void ScaleRowDown38_2_Int_C(const uint8* src_ptr, int src_stride,
+                                   uint8* optr, int dst_width) {
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+  for (int i = 0; i < dst_width; i+=3) {  // 8 source columns -> 3 outputs
+    optr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +  // 3x2 box, columns 0-2
+        src_ptr[src_stride + 0] + src_ptr[src_stride + 1] + src_ptr[src_stride + 2]) *
         (65536 / 6) >> 16;
-    optr[1] = (iptr[3] + iptr[4] + iptr[5] +
-        iptr[istride + 3] + iptr[istride + 4] + iptr[istride + 5]) *
+    optr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +  // 3x2 box, columns 3-5
+        src_ptr[src_stride + 3] + src_ptr[src_stride + 4] + src_ptr[src_stride + 5]) *
         (65536 / 6) >> 16;
-    optr[2] = (iptr[6] + iptr[7] +
-        iptr[istride + 6] + iptr[istride + 7]) *
+    optr[2] = (src_ptr[6] + src_ptr[7] +  // 2x2 box, columns 6-7
+        src_ptr[src_stride + 6] + src_ptr[src_stride + 7]) *
         (65536 / 4) >> 16;
-    iptr += 8;
+    src_ptr += 8;  // advance to the next group of 8 source columns
     optr += 3;
   }
 }
 
 // C version 8x2 -> 8x1
 static void ScaleFilterRows_C(uint8* optr,
-                              const uint8* iptr0, int32 istride,
-                              int owidth, int source_y_fraction) {
-  assert(owidth > 0);
+                              const uint8* iptr0, int src_stride,
+                              int dst_width, int source_y_fraction) {
+  assert(dst_width > 0);
   int y1_fraction = source_y_fraction;
   int y0_fraction = 256 - y1_fraction;
-  const uint8* iptr1 = iptr0 + istride;
-  uint8* end = optr + owidth;
+  const uint8* iptr1 = iptr0 + src_stride;
+  uint8* end = optr + dst_width;
   do {
     optr[0] = (iptr0[0] * y0_fraction + iptr1[0] * y1_fraction) >> 8;
     optr[1] = (iptr0[1] * y0_fraction + iptr1[1] * y1_fraction) >> 8;
@@ -2149,16 +2109,16 @@ static void ScaleFilterRows_C(uint8* optr,
   optr[0] = optr[-1];
 }
 
-void ScaleAddRows_C(const uint8* iptr, int32 istride,
-                    uint16* orow, int32 iwidth, int32 iheight) {
-  assert(iwidth > 0);
-  assert(iheight > 0);
-  for (int x = 0; x < iwidth; ++x) {
-    const uint8* s = iptr + x;
+void ScaleAddRows_C(const uint8* src_ptr, int src_stride,
+                    uint16* orow, int src_width, int src_height) {
+  assert(src_width > 0);
+  assert(src_height > 0);
+  for (int x = 0; x < src_width; ++x) {
+    const uint8* s = src_ptr + x;
     int sum = 0;
-    for (int y = 0; y < iheight; ++y) {
+    for (int y = 0; y < src_height; ++y) {
       sum += s[0];
-      s += istride;
+      s += src_stride;
     }
     orow[x] = sum;
   }
@@ -2171,36 +2131,36 @@ void ScaleAddRows_C(const uint8* iptr, int32 istride,
  * its original size.
  *
  */
-static void ScalePlaneDown2(int32 iwidth, int32 iheight,
-                            int32 owidth, int32 oheight,
-                            int32 istride, int32 ostride,
-                            const uint8 *iptr, uint8 *optr,
-                            bool interpolate) {
-  assert(iwidth % 2 == 0);
-  assert(iheight % 2 == 0);
-  void (*ScaleRowDown2)(const uint8* iptr, int32 istride,
-                        uint8* orow, int32 owidth);
+static void ScalePlaneDown2(int src_width, int src_height,
+                            int dst_width, int dst_height,
+                            int src_stride, int ostride,
+                            const uint8* src_ptr, uint8* optr,
+                            FilterMode filtering) {  // nonzero: use "Int" rows
+  assert(src_width % 2 == 0);
+  assert(src_height % 2 == 0);
+  void (*ScaleRowDown2)(const uint8* src_ptr, int src_stride,
+                        uint8* orow, int dst_width);  // row kernel picked below
 
 #if defined(HAS_SCALEROWDOWN2_NEON)
   if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) &&
-      (owidth % 16 == 0) && (istride % 16 == 0) && (ostride % 16 == 0) &&
-      IS_ALIGNED(iptr, 16) && IS_ALIGNED(optr, 16)) {
-    ScaleRowDown2 = interpolate ? ScaleRowDown2Int_NEON : ScaleRowDown2_NEON;
+      (dst_width % 16 == 0) && (src_stride % 16 == 0) && (ostride % 16 == 0) &&
+      IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(optr, 16)) {
+    ScaleRowDown2 = filtering ? ScaleRowDown2Int_NEON : ScaleRowDown2_NEON;
   } else
 #endif
 #if defined(HAS_SCALEROWDOWN2_SSE2)
   if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
-      (owidth % 16 == 0) && IS_ALIGNED(iptr, 16) && IS_ALIGNED(optr, 16)) {
-    ScaleRowDown2 = interpolate ? ScaleRowDown2Int_SSE2 : ScaleRowDown2_SSE2;
+      (dst_width % 16 == 0) && IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(optr, 16)) {
+    ScaleRowDown2 = filtering ? ScaleRowDown2Int_SSE2 : ScaleRowDown2_SSE2;
   } else
 #endif
   {
-    ScaleRowDown2 = interpolate ? ScaleRowDown2Int_C : ScaleRowDown2_C;
+    ScaleRowDown2 = filtering ? ScaleRowDown2Int_C : ScaleRowDown2_C;  // C fallback
   }
 
-  for (int y = 0; y < oheight; ++y) {
-    ScaleRowDown2(iptr, istride, optr, owidth);
-    iptr += (istride << 1);
+  for (int y = 0; y < dst_height; ++y) {
+    ScaleRowDown2(src_ptr, src_stride, optr, dst_width);
+    src_ptr += (src_stride << 1);  // each output row consumes 2 source rows
     optr += ostride;
   }
 }
@@ -2211,30 +2171,30 @@ static void ScalePlaneDown2(int32 iwidth, int32 iheight,
  * This is an optimized version for scaling down a plane to 1/4 of
  * its original size.
  */
-static void ScalePlaneDown4(int32 iwidth, int32 iheight,
-                            int32 owidth, int32 oheight,
-                            int32 istride, int32 ostride,
-                            const uint8 *iptr, uint8 *optr,
-                            bool interpolate) {
-  assert(iwidth % 4 == 0);
-  assert(iheight % 4 == 0);
-  void (*ScaleRowDown4)(const uint8* iptr, int32 istride,
-                        uint8* orow, int32 owidth);
+static void ScalePlaneDown4(int src_width, int src_height,
+                            int dst_width, int dst_height,
+                            int src_stride, int ostride,
+                            const uint8* src_ptr, uint8* optr,
+                            FilterMode filtering) {  // nonzero: use "Int" rows
+  assert(src_width % 4 == 0);
+  assert(src_height % 4 == 0);
+  void (*ScaleRowDown4)(const uint8* src_ptr, int src_stride,
+                        uint8* orow, int dst_width);  // row kernel picked below
 
 #if defined(HAS_SCALEROWDOWN4_SSE2)
   if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
-      (owidth % 8 == 0) && (istride % 16 == 0) && (ostride % 8 == 0) &&
-      IS_ALIGNED(iptr, 16) && IS_ALIGNED(optr, 8)) {
-    ScaleRowDown4 = interpolate ? ScaleRowDown4Int_SSE2 : ScaleRowDown4_SSE2;
+      (dst_width % 8 == 0) && (src_stride % 16 == 0) && (ostride % 8 == 0) &&
+      IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(optr, 8)) {
+    ScaleRowDown4 = filtering ? ScaleRowDown4Int_SSE2 : ScaleRowDown4_SSE2;
   } else
 #endif
   {
-    ScaleRowDown4 = interpolate ? ScaleRowDown4Int_C : ScaleRowDown4_C;
+    ScaleRowDown4 = filtering ? ScaleRowDown4Int_C : ScaleRowDown4_C;  // C fallback
   }
 
-  for (int y = 0; y < oheight; ++y) {
-    ScaleRowDown4(iptr, istride, optr, owidth);
-    iptr += (istride << 2);
+  for (int y = 0; y < dst_height; ++y) {
+    ScaleRowDown4(src_ptr, src_stride, optr, dst_width);
+    src_ptr += (src_stride << 2);  // each output row consumes 4 source rows
     optr += ostride;
   }
 }
@@ -2246,30 +2206,30 @@ static void ScalePlaneDown4(int32 iwidth, int32 iheight,
  * of its original size.
  *
  */
-static void ScalePlaneDown8(int32 iwidth, int32 iheight,
-                            int32 owidth, int32 oheight,
-                            int32 istride, int32 ostride,
-                            const uint8 *iptr, uint8 *optr,
-                            bool interpolate) {
-  assert(iwidth % 8 == 0);
-  assert(iheight % 8 == 0);
-  void (*ScaleRowDown8)(const uint8* iptr, int32 istride,
-                        uint8* orow, int32 owidth);
+static void ScalePlaneDown8(int src_width, int src_height,
+                            int dst_width, int dst_height,
+                            int src_stride, int ostride,
+                            const uint8* src_ptr, uint8* optr,
+                            FilterMode filtering) {  // nonzero: use "Int" rows
+  assert(src_width % 8 == 0);
+  assert(src_height % 8 == 0);
+  void (*ScaleRowDown8)(const uint8* src_ptr, int src_stride,
+                        uint8* orow, int dst_width);  // row kernel picked below
 #if defined(HAS_SCALEROWDOWN8_SSE2)
   if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
-      (owidth % 16 == 0) && owidth <= kMaxOutputWidth &&
-      (istride % 16 == 0) && (ostride % 16 == 0) &&
-      IS_ALIGNED(iptr, 16) && IS_ALIGNED(optr, 16)) {
-    ScaleRowDown8 = interpolate ? ScaleRowDown8Int_SSE2 : ScaleRowDown8_SSE2;
+      (dst_width % 16 == 0) && dst_width <= kMaxOutputWidth &&
+      (src_stride % 16 == 0) && (ostride % 16 == 0) &&
+      IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(optr, 16)) {
+    ScaleRowDown8 = filtering ? ScaleRowDown8Int_SSE2 : ScaleRowDown8_SSE2;
   } else
 #endif
   {
-    ScaleRowDown8 = interpolate && (owidth <= kMaxOutputWidth) ?
+    ScaleRowDown8 = filtering && (dst_width <= kMaxOutputWidth) ?  // else unfiltered
         ScaleRowDown8Int_C : ScaleRowDown8_C;
   }
-  for (int y = 0; y < oheight; ++y) {
-    ScaleRowDown8(iptr, istride, optr, owidth);
-    iptr += (istride << 3);
+  for (int y = 0; y < dst_height; ++y) {
+    ScaleRowDown8(src_ptr, src_stride, optr, dst_width);
+    src_ptr += (src_stride << 3);  // each output row consumes 8 source rows
     optr += ostride;
   }
 }
@@ -2280,21 +2240,21 @@ static void ScalePlaneDown8(int32 iwidth, int32 iheight,
  * Provided by Frank Barchard (fbarchard@google.com)
  *
  */
-static void ScalePlaneDown34(int32 iwidth, int32 iheight,
-                             int32 owidth, int32 oheight,
-                             int32 istride, int32 ostride,
-                             const uint8* iptr, uint8* optr,
-                             bool interpolate) {
-  assert(owidth % 3 == 0);
-  void (*ScaleRowDown34_0)(const uint8* iptr, int32 istride,
-                           uint8* orow, int32 owidth);
-  void (*ScaleRowDown34_1)(const uint8* iptr, int32 istride,
-                           uint8* orow, int32 owidth);
+static void ScalePlaneDown34(int src_width, int src_height,
+                             int dst_width, int dst_height,
+                             int src_stride, int ostride,
+                             const uint8* src_ptr, uint8* optr,
+                             FilterMode filtering) {
+  assert(dst_width % 3 == 0);
+  void (*ScaleRowDown34_0)(const uint8* src_ptr, int src_stride,
+                           uint8* orow, int dst_width);
+  void (*ScaleRowDown34_1)(const uint8* src_ptr, int src_stride,
+                           uint8* orow, int dst_width);
 #if defined(HAS_SCALEROWDOWN34_SSSE3)
   if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
-      (owidth % 24 == 0) && (istride % 16 == 0) && (ostride % 8 == 0) &&
-      IS_ALIGNED(iptr, 16) && IS_ALIGNED(optr, 8)) {
-    if (!interpolate) {
+      (dst_width % 24 == 0) && (src_stride % 16 == 0) && (ostride % 8 == 0) &&
+      IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(optr, 8)) {
+    if (!filtering) {
       ScaleRowDown34_0 = ScaleRowDown34_SSSE3;
       ScaleRowDown34_1 = ScaleRowDown34_SSSE3;
     } else {
@@ -2305,15 +2265,15 @@ static void ScalePlaneDown34(int32 iwidth, int32 iheight,
 #endif
 #if defined(HAS_SCALEROWDOWN34_SSE2)
   if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
-      (owidth % 24 == 0) && (istride % 16 == 0) && (ostride % 8 == 0) &&
-      IS_ALIGNED(iptr, 16) && IS_ALIGNED(optr, 8) &&
-      interpolate) {
+      (dst_width % 24 == 0) && (src_stride % 16 == 0) && (ostride % 8 == 0) &&
+      IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(optr, 8) &&
+      filtering) {
     ScaleRowDown34_0 = ScaleRowDown34_0_Int_SSE2;
     ScaleRowDown34_1 = ScaleRowDown34_1_Int_SSE2;
   } else
 #endif
   {
-    if (!interpolate) {
+    if (!filtering) {
       ScaleRowDown34_0 = ScaleRowDown34_C;
       ScaleRowDown34_1 = ScaleRowDown34_C;
     } else {
@@ -2322,35 +2282,28 @@ static void ScalePlaneDown34(int32 iwidth, int32 iheight,
     }
   }
   int irow = 0;
-  for (int y = 0; y < oheight; ++y) {
+  for (int y = 0; y < dst_height; ++y) {
     switch (irow) {
       case 0:
-        ScaleRowDown34_0(iptr, istride, optr, owidth);
+        ScaleRowDown34_0(src_ptr, src_stride, optr, dst_width);
         break;
 
       case 1:
-        ScaleRowDown34_1(iptr, istride, optr, owidth);
+        ScaleRowDown34_1(src_ptr, src_stride, optr, dst_width);
         break;
 
       case 2:
-        ScaleRowDown34_0(iptr + istride, -istride, optr, owidth);
+        ScaleRowDown34_0(src_ptr + src_stride, -src_stride, optr, dst_width);
         break;
     }
     ++irow;
-    iptr += istride;
+    src_ptr += src_stride;
     optr += ostride;
     if (irow >= 3) {
-      iptr += istride;
+      src_ptr += src_stride;
       irow = 0;
     }
   }
-
-#ifdef TEST_RSTSC
-  std::cout << "Timer34_0 Row " << std::setw(9) << timers34[0]
-            << " Column " << std::setw(9) << timers34[1]
-            << " Timer34_1 Row " << std::setw(9) << timers34[2]
-            << " Column " << std::setw(9) << timers34[3] << std::endl;
-#endif
 }
 
 /**
@@ -2361,21 +2314,21 @@ static void ScalePlaneDown34(int32 iwidth, int32 iheight,
  *
  * Reduces 16x3 to 6x1
  */
-static void ScalePlaneDown38(int32 iwidth, int32 iheight,
-                             int32 owidth, int32 oheight,
-                             int32 istride, int32 ostride,
-                             const uint8* iptr, uint8* optr,
-                             bool interpolate) {
-  assert(owidth % 3 == 0);
-  void (*ScaleRowDown38_3)(const uint8* iptr, int32 istride,
-                           uint8* orow, int32 owidth);
-  void (*ScaleRowDown38_2)(const uint8* iptr, int32 istride,
-                           uint8* orow, int32 owidth);
+static void ScalePlaneDown38(int src_width, int src_height,
+                             int dst_width, int dst_height,
+                             int src_stride, int ostride,
+                             const uint8* src_ptr, uint8* optr,
+                             FilterMode filtering) {
+  assert(dst_width % 3 == 0);
+  void (*ScaleRowDown38_3)(const uint8* src_ptr, int src_stride,
+                           uint8* orow, int dst_width);
+  void (*ScaleRowDown38_2)(const uint8* src_ptr, int src_stride,
+                           uint8* orow, int dst_width);
 #if defined(HAS_SCALEROWDOWN38_SSSE3)
   if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
-      (owidth % 24 == 0) && (istride % 16 == 0) && (ostride % 8 == 0) &&
-      IS_ALIGNED(iptr, 16) && IS_ALIGNED(optr, 8)) {
-    if (!interpolate) {
+      (dst_width % 24 == 0) && (src_stride % 16 == 0) && (ostride % 8 == 0) &&
+      IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(optr, 8)) {
+    if (!filtering) {
       ScaleRowDown38_3 = ScaleRowDown38_SSSE3;
       ScaleRowDown38_2 = ScaleRowDown38_SSSE3;
     } else {
@@ -2385,7 +2338,7 @@ static void ScalePlaneDown38(int32 iwidth, int32 iheight,
   } else
 #endif
   {
-    if (!interpolate) {
+    if (!filtering) {
       ScaleRowDown38_3 = ScaleRowDown38_C;
       ScaleRowDown38_2 = ScaleRowDown38_C;
     } else {
@@ -2394,18 +2347,18 @@ static void ScalePlaneDown38(int32 iwidth, int32 iheight,
     }
   }
   int irow = 0;
-  for (int y = 0; y < oheight; ++y) {
+  for (int y = 0; y < dst_height; ++y) {
     switch (irow) {
       case 0:
       case 1:
-        ScaleRowDown38_3(iptr, istride, optr, owidth);
-        iptr += istride * 3;
+        ScaleRowDown38_3(src_ptr, src_stride, optr, dst_width);
+        src_ptr += src_stride * 3;
         ++irow;
         break;
 
       case 2:
-        ScaleRowDown38_2(iptr, istride, optr, owidth);
-        iptr += istride * 2;
+        ScaleRowDown38_2(src_ptr, src_stride, optr, dst_width);
+        src_ptr += src_stride * 2;
         irow = 0;
         break;
     }
@@ -2413,65 +2366,65 @@ static void ScalePlaneDown38(int32 iwidth, int32 iheight,
   }
 }
 
-inline static uint32 SumBox(int32 iboxwidth, int32 iboxheight,
-                            int32 istride, const uint8 *iptr) {
+inline static uint32 SumBox(int iboxwidth, int iboxheight,
+                            int src_stride, const uint8* src_ptr) {
   assert(iboxwidth > 0);
   assert(iboxheight > 0);
   uint32 sum = 0u;
   for (int y = 0; y < iboxheight; ++y) {
     for (int x = 0; x < iboxwidth; ++x) {
-      sum += iptr[x];
+      sum += src_ptr[x];
     }
-    iptr += istride;
+    src_ptr += src_stride;
   }
   return sum;
 }
 
-static void ScalePlaneBoxRow(int32 owidth, int32 boxheight,
-                             int dx, int32 istride,
-                             const uint8 *iptr, uint8 *optr) {
+static void ScalePlaneBoxRow(int dst_width, int boxheight,
+                             int dx, int src_stride,
+                             const uint8* src_ptr, uint8* optr) {
   int x = 0;
-  for (int i = 0; i < owidth; ++i) {
+  for (int i = 0; i < dst_width; ++i) {
     int ix = x >> 16;
     x += dx;
     int boxwidth = (x >> 16) - ix;
-    *optr++ = SumBox(boxwidth, boxheight, istride, iptr + ix) /
+    *optr++ = SumBox(boxwidth, boxheight, src_stride, src_ptr + ix) /
         (boxwidth * boxheight);
   }
 }
 
-inline static uint32 SumPixels(int32 iboxwidth, const uint16 *iptr) {
+inline static uint32 SumPixels(int iboxwidth, const uint16* src_ptr) {
   assert(iboxwidth > 0);
   uint32 sum = 0u;
   for (int x = 0; x < iboxwidth; ++x) {
-    sum += iptr[x];
+    sum += src_ptr[x];
   }
   return sum;
 }
 
-static void ScaleAddCols2_C(int32 owidth, int32 boxheight, int dx,
-                            const uint16 *iptr, uint8 *optr) {
+static void ScaleAddCols2_C(int dst_width, int boxheight, int dx,
+                            const uint16* src_ptr, uint8* optr) {
   int scaletbl[2];
   int minboxwidth = (dx >> 16);
   scaletbl[0] = 65536 / (minboxwidth * boxheight);
   scaletbl[1] = 65536 / ((minboxwidth + 1) * boxheight);
   int *scaleptr = scaletbl - minboxwidth;
   int x = 0;
-  for (int i = 0; i < owidth; ++i) {
+  for (int i = 0; i < dst_width; ++i) {
     int ix = x >> 16;
     x += dx;
     int boxwidth = (x >> 16) - ix;
-    *optr++ = SumPixels(boxwidth, iptr + ix) * scaleptr[boxwidth] >> 16;
+    *optr++ = SumPixels(boxwidth, src_ptr + ix) * scaleptr[boxwidth] >> 16;
   }
 }
 
-static void ScaleAddCols1_C(int32 owidth, int32 boxheight, int dx,
-                            const uint16 *iptr, uint8 *optr) {
+static void ScaleAddCols1_C(int dst_width, int boxheight, int dx,
+                            const uint16* src_ptr, uint8* optr) {
   int boxwidth = (dx >> 16);
   int scaleval = 65536 / (boxwidth * boxheight);
   int x = 0;
-  for (int i = 0; i < owidth; ++i) {
-    *optr++ = SumPixels(boxwidth, iptr + x) * scaleval >> 16;
+  for (int i = 0; i < dst_width; ++i) {
+    *optr++ = SumPixels(boxwidth, src_ptr + x) * scaleval >> 16;
     x += boxwidth;
   }
 }
@@ -2485,43 +2438,43 @@ static void ScaleAddCols1_C(int32 owidth, int32 boxheight, int dx,
  * through source, sampling a box of pixel with simple
  * averaging.
  */
-static void ScalePlaneBox(int32 iwidth, int32 iheight,
-                          int32 owidth, int32 oheight,
-                          int32 istride, int32 ostride,
-                          const uint8 *iptr, uint8 *optr) {
-  assert(owidth > 0);
-  assert(oheight > 0);
-  int dy = (iheight << 16) / oheight;
-  int dx = (iwidth << 16) / owidth;
-  if ((iwidth % 16 != 0) || (iwidth > kMaxInputWidth) ||
-      oheight * 2 > iheight) {
-    uint8 *dst = optr;
-    int dy = (iheight << 16) / oheight;
-    int dx = (iwidth << 16) / owidth;
+static void ScalePlaneBox(int src_width, int src_height,
+                          int dst_width, int dst_height,
+                          int src_stride, int ostride,
+                          const uint8* src_ptr, uint8* optr) {
+  assert(dst_width > 0);
+  assert(dst_height > 0);
+  int dy = (src_height << 16) / dst_height;
+  int dx = (src_width << 16) / dst_width;
+  if ((src_width % 16 != 0) || (src_width > kMaxInputWidth) ||
+      dst_height * 2 > src_height) {
+    uint8* dst = optr;
+    int dy = (src_height << 16) / dst_height;
+    int dx = (src_width << 16) / dst_width;
     int y = 0;
-    for (int j = 0; j < oheight; ++j) {
+    for (int j = 0; j < dst_height; ++j) {
       int iy = y >> 16;
-      const uint8 *const src = iptr + iy * istride;
+      const uint8* const src = src_ptr + iy * src_stride;
       y += dy;
-      if (y > (iheight << 16)) {
-        y = (iheight << 16);
+      if (y > (src_height << 16)) {
+        y = (src_height << 16);
       }
       int boxheight = (y >> 16) - iy;
-      ScalePlaneBoxRow(owidth, boxheight,
-                       dx, istride,
+      ScalePlaneBoxRow(dst_width, boxheight,
+                       dx, src_stride,
                        src, dst);
 
       dst += ostride;
     }
   } else {
     ALIGN16(uint16 row[kMaxInputWidth]);
-    void (*ScaleAddRows)(const uint8* iptr, int32 istride,
-                         uint16* orow, int32 iwidth, int32 iheight);
-    void (*ScaleAddCols)(int32 owidth, int32 boxheight, int dx,
-                         const uint16 *iptr, uint8 *optr);
+    void (*ScaleAddRows)(const uint8* src_ptr, int src_stride,
+                         uint16* orow, int src_width, int src_height);
+    void (*ScaleAddCols)(int dst_width, int boxheight, int dx,
+                         const uint16* src_ptr, uint8* optr);
 #if defined(HAS_SCALEADDROWS_SSE2)
     if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
-        (istride % 16 == 0) && IS_ALIGNED(iptr, 16) && (iwidth % 16) == 0) {
+        (src_stride % 16 == 0) && IS_ALIGNED(src_ptr, 16) && (src_width % 16) == 0) {
       ScaleAddRows = ScaleAddRows_SSE2;
     } else
 #endif
@@ -2535,16 +2488,16 @@ static void ScalePlaneBox(int32 iwidth, int32 iheight,
     }
 
     int y = 0;
-    for (int j = 0; j < oheight; ++j) {
+    for (int j = 0; j < dst_height; ++j) {
       int iy = y >> 16;
-      const uint8 *const src = iptr + iy * istride;
+      const uint8* const src = src_ptr + iy * src_stride;
       y += dy;
-      if (y > (iheight << 16)) {
-        y = (iheight << 16);
+      if (y > (src_height << 16)) {
+        y = (src_height << 16);
       }
       int boxheight = (y >> 16) - iy;
-      ScaleAddRows(src, istride, row, iwidth, boxheight);
-      ScaleAddCols(owidth, boxheight, dx, row, optr);
+      ScaleAddRows(src, src_stride, row, src_width, boxheight);
+      ScaleAddCols(dst_width, boxheight, dx, row, optr);
       optr += ostride;
     }
   }
@@ -2553,35 +2506,35 @@ static void ScalePlaneBox(int32 iwidth, int32 iheight,
 /**
  * Scale plane to/from any dimensions, with interpolation.
  */
-static void ScalePlaneBilinearSimple(int32 iwidth, int32 iheight,
-                                     int32 owidth, int32 oheight,
-                                     int32 istride, int32 ostride,
-                                     const uint8 *iptr, uint8 *optr) {
-  uint8 *dst = optr;
-  int dx = (iwidth << 16) / owidth;
-  int dy = (iheight << 16) / oheight;
-  int maxx = ((iwidth - 1) << 16) - 1;
-  int maxy = ((iheight - 1) << 16) - 1;
-  int y = (oheight < iheight) ? 32768 : (iheight << 16) / oheight - 32768;
-  for (int i = 0; i < oheight; ++i) {
+static void ScalePlaneBilinearSimple(int src_width, int src_height,
+                                     int dst_width, int dst_height,
+                                     int src_stride, int ostride,
+                                     const uint8* src_ptr, uint8* optr) {
+  uint8* dst = optr;
+  int dx = (src_width << 16) / dst_width;
+  int dy = (src_height << 16) / dst_height;
+  int maxx = ((src_width - 1) << 16) - 1;
+  int maxy = ((src_height - 1) << 16) - 1;
+  int y = (dst_height < src_height) ? 32768 : (src_height << 16) / dst_height - 32768;
+  for (int i = 0; i < dst_height; ++i) {
     int cy = (y < 0) ? 0 : y;
     int yi = cy >> 16;
     int yf = cy & 0xffff;
-    const uint8 *const src = iptr + yi * istride;
-    int x = (owidth < iwidth) ? 32768 : (iwidth << 16) / owidth - 32768;
-    for (int j = 0; j < owidth; ++j) {
+    const uint8* const src = src_ptr + yi * src_stride;
+    int x = (dst_width < src_width) ? 32768 : (src_width << 16) / dst_width - 32768;
+    for (int j = 0; j < dst_width; ++j) {
       int cx = (x < 0) ? 0 : x;
       int xi = cx >> 16;
       int xf = cx & 0xffff;
       int r0 = (src[xi] * (65536 - xf) + src[xi + 1] * xf) >> 16;
-      int r1 = (src[xi + istride] * (65536 - xf) + src[xi + istride + 1] * xf)
+      int r1 = (src[xi + src_stride] * (65536 - xf) + src[xi + src_stride + 1] * xf)
                 >> 16;
       *dst++ = (r0 * (65536 - yf) + r1 * yf) >> 16;
       x += dx;
       if (x > maxx)
         x = maxx;
     }
-    dst += ostride - owidth;
+    dst += ostride - dst_width;
     y += dy;
     if (y > maxy)
       y = maxy;
@@ -2592,33 +2545,33 @@ static void ScalePlaneBilinearSimple(int32 iwidth, int32 iheight,
  * Scale plane to/from any dimensions, with bilinear
  * interpolation.
  */
-static void ScalePlaneBilinear(int32 iwidth, int32 iheight,
-                               int32 owidth, int32 oheight,
-                               int32 istride, int32 ostride,
-                               const uint8 *iptr, uint8 *optr) {
-  assert(owidth > 0);
-  assert(oheight > 0);
-  int dy = (iheight << 16) / oheight;
-  int dx = (iwidth << 16) / owidth;
-  if ((iwidth % 8 != 0) || (iwidth > kMaxInputWidth)) {
-    ScalePlaneBilinearSimple(iwidth, iheight, owidth, oheight, istride, ostride,
-                             iptr, optr);
+static void ScalePlaneBilinear(int src_width, int src_height,
+                               int dst_width, int dst_height,
+                               int src_stride, int ostride,
+                               const uint8* src_ptr, uint8* optr) {
+  assert(dst_width > 0);
+  assert(dst_height > 0);
+  int dy = (src_height << 16) / dst_height;
+  int dx = (src_width << 16) / dst_width;
+  if ((src_width % 8 != 0) || (src_width > kMaxInputWidth)) {
+    ScalePlaneBilinearSimple(src_width, src_height, dst_width, dst_height, src_stride, ostride,
+                             src_ptr, optr);
 
   } else {
     ALIGN16(uint8 row[kMaxInputWidth + 1]);
-    void (*ScaleFilterRows)(uint8* optr, const uint8* iptr0, int32 istride,
-                            int owidth, int source_y_fraction);
-    void (*ScaleFilterCols)(uint8* optr, const uint8* iptr,
-                            int owidth, int dx);
+    void (*ScaleFilterRows)(uint8* optr, const uint8* iptr0, int src_stride,
+                            int dst_width, int source_y_fraction);
+    void (*ScaleFilterCols)(uint8* optr, const uint8* src_ptr,
+                            int dst_width, int dx);
 #if defined(HAS_SCALEFILTERROWS_SSSE3)
     if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
-        (istride % 16 == 0) && IS_ALIGNED(iptr, 16) && (iwidth % 16) == 0) {
+        (src_stride % 16 == 0) && IS_ALIGNED(src_ptr, 16) && (src_width % 16) == 0) {
       ScaleFilterRows = ScaleFilterRows_SSSE3;
     } else
 #endif
 #if defined(HAS_SCALEFILTERROWS_SSE2)
     if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
-        (istride % 16 == 0) && IS_ALIGNED(iptr, 16) && (iwidth % 16) == 0) {
+        (src_stride % 16 == 0) && IS_ALIGNED(src_ptr, 16) && (src_width % 16) == 0) {
       ScaleFilterRows = ScaleFilterRows_SSE2;
     } else
 #endif
@@ -2628,13 +2581,13 @@ static void ScalePlaneBilinear(int32 iwidth, int32 iheight,
     ScaleFilterCols = ScaleFilterCols_C;
 
     int y = 0;
-    int maxy = ((iheight - 1) << 16) - 1; // max is filter of last 2 rows.
-    for (int j = 0; j < oheight; ++j) {
+    int maxy = ((src_height - 1) << 16) - 1; // max is filter of last 2 rows.
+    for (int j = 0; j < dst_height; ++j) {
       int iy = y >> 16;
       int fy = (y >> 8) & 255;
-      const uint8 *const src = iptr + iy * istride;
-      ScaleFilterRows(row, src, istride, iwidth, fy);
-      ScaleFilterCols(optr, row, owidth, dx);
+      const uint8* const src = src_ptr + iy * src_stride;
+      ScaleFilterRows(row, src, src_stride, src_width, fy);
+      ScaleFilterCols(optr, row, dst_width, dx);
       optr += ostride;
       y += dy;
       if (y > maxy) {
@@ -2650,39 +2603,39 @@ static void ScalePlaneBilinear(int32 iwidth, int32 iheight,
  * of x and dx is the integer part of the source position and
  * the lower 16 bits are the fixed decimal part.
  */
-static void ScalePlaneSimple(int32 iwidth, int32 iheight,
-                             int32 owidth, int32 oheight,
-                             int32 istride, int32 ostride,
-                             const uint8 *iptr, uint8 *optr) {
-  uint8 *dst = optr;
-  int dx = (iwidth << 16) / owidth;
-  for (int y = 0; y < oheight; ++y) {
-    const uint8 *const src = iptr + (y * iheight / oheight) * istride;
+static void ScalePlaneSimple(int src_width, int src_height,
+                             int dst_width, int dst_height,
+                             int src_stride, int ostride,
+                             const uint8* src_ptr, uint8* optr) {
+  uint8* dst = optr;
+  int dx = (src_width << 16) / dst_width;
+  for (int y = 0; y < dst_height; ++y) {
+    const uint8* const src = src_ptr + (y * src_height / dst_height) * src_stride;
     // TODO(fbarchard): Round X coordinate by setting x=0x8000.
     int x = 0;
-    for (int i = 0; i < owidth; ++i) {
+    for (int i = 0; i < dst_width; ++i) {
       *dst++ = src[x >> 16];
       x += dx;
     }
-    dst += ostride - owidth;
+    dst += ostride - dst_width;
   }
 }
 
 /**
  * Scale plane to/from any dimensions.
  */
-static void ScalePlaneAnySize(int32 iwidth, int32 iheight,
-                              int32 owidth, int32 oheight,
-                              int32 istride, int32 ostride,
-                              const uint8 *iptr, uint8 *optr,
-                              bool interpolate) {
-  if (!interpolate) {
-    ScalePlaneSimple(iwidth, iheight, owidth, oheight, istride, ostride,
-                     iptr, optr);
+static void ScalePlaneAnySize(int src_width, int src_height,
+                              int dst_width, int dst_height,
+                              int src_stride, int ostride,
+                              const uint8* src_ptr, uint8* optr,
+                              FilterMode filtering) {
+  if (!filtering) {
+    ScalePlaneSimple(src_width, src_height, dst_width, dst_height, src_stride, ostride,
+                     src_ptr, optr);
   } else {
     // fall back to non-optimized version
-    ScalePlaneBilinear(iwidth, iheight, owidth, oheight, istride, ostride,
-                       iptr, optr);
+    ScalePlaneBilinear(src_width, src_height, dst_width, dst_height, src_stride, ostride,
+                       src_ptr, optr);
   }
 }
 
@@ -2694,20 +2647,21 @@ static void ScalePlaneAnySize(int32 iwidth, int32 iheight,
  * reference implementation for e.g. XGA->LowResPAL
  *
  */
-static void ScalePlaneDown(int32 iwidth, int32 iheight,
-                           int32 owidth, int32 oheight,
-                           int32 istride, int32 ostride,
-                           const uint8 *iptr, uint8 *optr,
-                           bool interpolate) {
-  if (!interpolate) {
-    ScalePlaneSimple(iwidth, iheight, owidth, oheight, istride, ostride,
-                     iptr, optr);
-  } else if (iheight * 2 > oheight) {  // between 1/2x and 1x use bilinear
-    ScalePlaneBilinear(iwidth, iheight, owidth, oheight, istride, ostride,
-                       iptr, optr);
+static void ScalePlaneDown(int src_width, int src_height,
+                           int dst_width, int dst_height,
+                           int src_stride, int ostride,
+                           const uint8* src_ptr, uint8* optr,
+                           FilterMode filtering) {
+  if (!filtering) {
+    ScalePlaneSimple(src_width, src_height, dst_width, dst_height, src_stride, ostride,
+                     src_ptr, optr);
+  } else if (filtering == kFilterBilinear || dst_height * 2 > src_height) {
+    // dst is more than half of src (between 1/2x and 1x): bilinear; else box.
+    ScalePlaneBilinear(src_width, src_height, dst_width, dst_height, src_stride, ostride,
+                       src_ptr, optr);
   } else {
-    ScalePlaneBox(iwidth, iheight, owidth, oheight, istride, ostride,
-                  iptr, optr);
+    ScalePlaneBox(src_width, src_height, dst_width, dst_height, src_stride, ostride,
+                  src_ptr, optr);
   }
 }
 
@@ -2719,71 +2673,71 @@ static void ScalePlaneDown(int32 iwidth, int32 iheight,
  * compared to the reference implementation.
  *
  */
-static void CopyPlane(int32 iwidth, int32 iheight,
-                      int32 owidth, int32 oheight,
-                      int32 istride, int32 ostride,
-                      const uint8 *iptr, uint8 *optr) {
-  if (istride == iwidth && ostride == owidth) {
+static void CopyPlane(int src_width, int src_height,
+                      int dst_width, int dst_height,
+                      int src_stride, int ostride,
+                      const uint8* src_ptr, uint8* optr) {
+  if (src_stride == src_width && ostride == dst_width) {
     // All contiguous, so can use REALLY fast path.
-    memcpy(optr, iptr, iwidth * iheight);
+    memcpy(optr, src_ptr, src_width * src_height);
   } else {
     // Not all contiguous; must copy scanlines individually
-    const uint8 *src = iptr;
-    uint8 *dst = optr;
-    for (int i = 0; i < iheight; ++i) {
-      memcpy(dst, src, iwidth);
+    const uint8* src = src_ptr;
+    uint8* dst = optr;
+    for (int i = 0; i < src_height; ++i) {
+      memcpy(dst, src, src_width);
       dst += ostride;
-      src += istride;
+      src += src_stride;
     }
   }
 }
 
-static void ScalePlane(const uint8 *in, int32 istride,
-                       int32 iwidth, int32 iheight,
-                       uint8 *out, int32 ostride,
-                       int32 owidth, int32 oheight,
-                       bool interpolate, bool use_ref) {
+static void ScalePlane(const uint8* src, int src_stride,
+                       int src_width, int src_height,
+                       uint8* dst, int ostride,
+                       int dst_width, int dst_height,
+                       FilterMode filtering, bool use_ref) {
   // Use specialized scales to improve performance for common resolutions.
   // For example, all the 1/2 scalings will use ScalePlaneDown2()
-  if (owidth == iwidth && oheight == iheight) {
+  if (dst_width == src_width && dst_height == src_height) {
     // Straight copy.
-    CopyPlane(iwidth, iheight, owidth, oheight, istride, ostride, in, out);
-  } else if (owidth <= iwidth && oheight <= iheight) {
+    CopyPlane(src_width, src_height, dst_width, dst_height, src_stride, ostride, src, dst);
+  } else if (dst_width <= src_width && dst_height <= src_height) {
     // Scale down.
     if (use_ref) {
       // For testing, allow the optimized versions to be disabled.
-      ScalePlaneDown(iwidth, iheight, owidth, oheight, istride, ostride,
-                     in, out, interpolate);
-    } else if (4 * owidth == 3 * iwidth && 4 * oheight == 3 * iheight) {
+      ScalePlaneDown(src_width, src_height, dst_width, dst_height, src_stride, ostride,
+                     src, dst, filtering);
+    } else if (4 * dst_width == 3 * src_width && 4 * dst_height == 3 * src_height) {
       // optimized, 3/4
-      ScalePlaneDown34(iwidth, iheight, owidth, oheight, istride, ostride,
-                       in, out, interpolate);
-    } else if (2 * owidth == iwidth && 2 * oheight == iheight) {
+      ScalePlaneDown34(src_width, src_height, dst_width, dst_height, src_stride, ostride,
+                       src, dst, filtering);
+    } else if (2 * dst_width == src_width && 2 * dst_height == src_height) {
       // optimized, 1/2
-      ScalePlaneDown2(iwidth, iheight, owidth, oheight, istride, ostride,
-                      in, out, interpolate);
+      ScalePlaneDown2(src_width, src_height, dst_width, dst_height, src_stride, ostride,
+                      src, dst, filtering);
     // 3/8 rounded up for odd sized chroma height.
-    } else if (8 * owidth == 3 * iwidth && oheight == ((iheight * 3 + 7) / 8)) {
+    } else if (8 * dst_width == 3 * src_width && dst_height == ((src_height * 3 + 7) / 8)) {
       // optimized, 3/8
-      ScalePlaneDown38(iwidth, iheight, owidth, oheight, istride, ostride,
-                       in, out, interpolate);
-    } else if (4 * owidth == iwidth && 4 * oheight == iheight) {
+      ScalePlaneDown38(src_width, src_height, dst_width, dst_height, src_stride, ostride,
+                       src, dst, filtering);
+    } else if (4 * dst_width == src_width && 4 * dst_height == src_height) {
       // optimized, 1/4
-      ScalePlaneDown4(iwidth, iheight, owidth, oheight, istride, ostride,
-                      in, out, interpolate);
-    } else if (8 * owidth == iwidth && 8 * oheight == iheight) {
+      ScalePlaneDown4(src_width, src_height, dst_width, dst_height, src_stride, ostride,
+                      src, dst, filtering);
+    } else if (8 * dst_width == src_width && 8 * dst_height == src_height) {
       // optimized, 1/8
-      ScalePlaneDown8(iwidth, iheight, owidth, oheight, istride, ostride,
-                      in, out, interpolate);
+      ScalePlaneDown8(src_width, src_height, dst_width, dst_height, src_stride, ostride,
+                      src, dst, filtering);
     } else {
       // Arbitrary downsample
-      ScalePlaneDown(iwidth, iheight, owidth, oheight, istride, ostride,
-                     in, out, interpolate);
+      ScalePlaneDown(src_width, src_height, dst_width, dst_height, src_stride, ostride,
+                     src, dst, filtering);
     }
   } else {
     // Arbitrary scale up and/or down.
-    ScalePlaneAnySize(iwidth, iheight, owidth, oheight, istride, ostride,
-                      in, out, interpolate);
+    ScalePlaneAnySize(src_width, src_height, dst_width, dst_height, src_stride, ostride,
+                      src, dst, filtering);
   }
 }
 
@@ -2794,59 +2748,91 @@ static void ScalePlane(const uint8 *in, int32 istride,
  * suitable for handling the desired resolutions.
  *
  */
-bool Scale(const uint8 *inY, const uint8 *inU, const uint8 *inV,
-           int32 istrideY, int32 istrideU, int32 istrideV,
-           int32 iwidth, int32 iheight,
-           uint8 *outY, uint8 *outU, uint8 *outV,
-           int32 ostrideY, int32 ostrideU, int32 ostrideV,
-           int32 owidth, int32 oheight,
-           bool interpolate) {
-  if (!inY || !inU || !inV || iwidth <= 0 || iheight <= 0 ||
-      !outY || !outU || !outV || owidth <= 0 || oheight <= 0) {
-    return false;
-  }
-  int32 halfiwidth = (iwidth + 1) >> 1;
-  int32 halfiheight = (iheight + 1) >> 1;
-  int32 halfowidth = (owidth + 1) >> 1;
-  int32 halfoheight = (oheight + 1) >> 1;
 
-  ScalePlane(inY, istrideY, iwidth, iheight,
-             outY, ostrideY, owidth, oheight,
-             interpolate, use_reference_impl_);
-  ScalePlane(inU, istrideU, halfiwidth, halfiheight,
-             outU, ostrideU, halfowidth, halfoheight,
-             interpolate, use_reference_impl_);
-  ScalePlane(inV, istrideV, halfiwidth, halfiheight,
-             outV, ostrideV, halfowidth, halfoheight,
-             interpolate, use_reference_impl_);
-  return true;
+int I420Scale(const uint8* src_y, int src_stride_y,
+              const uint8* src_u, int src_stride_u,
+              const uint8* src_v, int src_stride_v,
+              int src_width, int src_height,
+              uint8* dst_y, int dst_stride_y,
+              uint8* dst_u, int dst_stride_u,
+              uint8* dst_v, int dst_stride_v,
+              int dst_width, int dst_height,
+              FilterMode filtering) {
+  if (!src_y || !src_u || !src_v || src_width <= 0 || src_height <= 0 ||
+      !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
+    return -1;
+  }
+  int halfiwidth = (src_width + 1) >> 1;
+  int halfiheight = (src_height + 1) >> 1;
+  int halfowidth = (dst_width + 1) >> 1;
+  int halfoheight = (dst_height + 1) >> 1;
+
+  ScalePlane(src_y, src_stride_y, src_width, src_height,
+             dst_y, dst_stride_y, dst_width, dst_height,
+             filtering, use_reference_impl_);
+  ScalePlane(src_u, src_stride_u, halfiwidth, halfiheight,
+             dst_u, dst_stride_u, halfowidth, halfoheight,
+             filtering, use_reference_impl_);
+  ScalePlane(src_v, src_stride_v, halfiwidth, halfiheight,
+             dst_v, dst_stride_v, halfowidth, halfoheight,
+             filtering, use_reference_impl_);
+  return 0;
 }
 
-bool Scale(const uint8 *in, int32 iwidth, int32 iheight,
-           uint8 *out, int32 owidth, int32 oheight, int32 ooffset,
-           bool interpolate) {
-  if (!in || iwidth <= 0 || iheight <= 0 ||
-      !out || owidth <= 0 || oheight <= 0 || ooffset < 0 ||
-      ooffset >= oheight) {
-    return false;
+int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
+          int src_stride_y, int src_stride_u, int src_stride_v,
+          int src_width, int src_height,
+          uint8* dst_y, uint8* dst_u, uint8* dst_v,
+          int dst_stride_y, int dst_stride_u, int dst_stride_v,
+          int dst_width, int dst_height,
+          bool interpolate) {
+  if (!src_y || !src_u || !src_v || src_width <= 0 || src_height <= 0 ||
+      !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
+    return -1;
+  }
+  int halfiwidth = (src_width + 1) >> 1;
+  int halfiheight = (src_height + 1) >> 1;
+  int halfowidth = (dst_width + 1) >> 1;
+  int halfoheight = (dst_height + 1) >> 1;
+  FilterMode filtering = interpolate ? kFilterBox : kFilterNone;
+
+  ScalePlane(src_y, src_stride_y, src_width, src_height,
+             dst_y, dst_stride_y, dst_width, dst_height,
+             filtering, use_reference_impl_);
+  ScalePlane(src_u, src_stride_u, halfiwidth, halfiheight,
+             dst_u, dst_stride_u, halfowidth, halfoheight,
+             filtering, use_reference_impl_);
+  ScalePlane(src_v, src_stride_v, halfiwidth, halfiheight,
+             dst_v, dst_stride_v, halfowidth, halfoheight,
+             filtering, use_reference_impl_);
+  return 0;
+}
+
+int Scale(const uint8* src, int src_width, int src_height,
+          uint8* dst, int dst_width, int dst_height, int ooffset,
+          bool interpolate) {
+  if (!src || src_width <= 0 || src_height <= 0 ||
+      !dst || dst_width <= 0 || dst_height <= 0 || ooffset < 0 ||
+      ooffset >= dst_height) {
+    return -1;
   }
   ooffset = ooffset & ~1;  // chroma requires offset to multiple of 2.
-  int32 halfiwidth = (iwidth + 1) >> 1;
-  int32 halfiheight = (iheight + 1) >> 1;
-  int32 halfowidth = (owidth + 1) >> 1;
-  int32 halfoheight = (oheight + 1) >> 1;
-  int32 aheight = oheight - ooffset * 2;  // actual output height
-  const uint8 *const iyptr = in;
-  uint8 *oyptr = out + ooffset * owidth;
-  const uint8 *const iuptr = in + iwidth * iheight;
-  uint8 *ouptr = out + owidth * oheight + (ooffset >> 1) * halfowidth;
-  const uint8 *const ivptr = in + iwidth * iheight +
+  int halfiwidth = (src_width + 1) >> 1;
+  int halfiheight = (src_height + 1) >> 1;
+  int halfowidth = (dst_width + 1) >> 1;
+  int halfoheight = (dst_height + 1) >> 1;
+  int aheight = dst_height - ooffset * 2;  // actual output height
+  const uint8* const iyptr = src;
+  uint8* oyptr = dst + ooffset * dst_width;
+  const uint8* const iuptr = src + src_width * src_height;
+  uint8* ouptr = dst + dst_width * dst_height + (ooffset >> 1) * halfowidth;
+  const uint8* const ivptr = src + src_width * src_height +
                              halfiwidth * halfiheight;
-  uint8 *ovptr = out + owidth * oheight + halfowidth * halfoheight +
+  uint8* ovptr = dst + dst_width * dst_height + halfowidth * halfoheight +
                  (ooffset >> 1) * halfowidth;
-  return Scale(iyptr, iuptr, ivptr, iwidth, halfiwidth, halfiwidth,
-               iwidth, iheight, oyptr, ouptr, ovptr, owidth,
-               halfowidth, halfowidth, owidth, aheight, interpolate);
+  return Scale(iyptr, iuptr, ivptr, src_width, halfiwidth, halfiwidth,
+               src_width, src_height, oyptr, ouptr, ovptr, dst_width,
+               halfowidth, halfowidth, dst_width, aheight, interpolate);
 }
 
 }  // namespace libyuv
diff --git a/source/video_common.cc b/source/video_common.cc
index 8242316df..8b8ee622d 100644
--- a/source/video_common.cc
+++ b/source/video_common.cc
@@ -13,8 +13,6 @@
 
 #include <sstream>
 
-#include "common.h"
-
 namespace libyuv {
 
 #define ARRAY_SIZE(x) (static_cast<int>((sizeof(x)/sizeof(x[0]))))
@@ -47,20 +45,4 @@ uint32 CanonicalFourCC(uint32 fourcc) {
   return fourcc;
 }
 
-std::string VideoFormat::ToString() const {
-  std::string fourcc_name = GetFourccName(fourcc) + " ";
-  for (std::string::const_iterator i = fourcc_name.begin();
-      i < fourcc_name.end(); ++i) {
-    // Test character is printable; Avoid isprint() which asserts on negatives
-    if (*i < 32 || *i >= 127) {
-      fourcc_name = "";
-      break;
-    }
-  }
-
-  std::ostringstream ss;
-  ss << fourcc_name << width << "x" << height << "x" << IntervalToFps(interval);
-  return ss.str();
-}
-
 }  // namespace libyuv
diff --git a/source/video_common.h b/source/video_common.h
index c936c4cfc..9fe08a03a 100644
--- a/source/video_common.h
+++ b/source/video_common.h
@@ -18,7 +18,7 @@
 
 #include <string>
 
-#include "basic_types.h"
+#include "libyuv/basic_types.h"
 
 namespace libyuv {
 
@@ -32,16 +32,6 @@ namespace libyuv {
     (static_cast<uint32>(a)) | (static_cast<uint32>(b) << 8) | \
     (static_cast<uint32>(c) << 16) | (static_cast<uint32>(d) << 24))
 
-// Get the name, that is, string with four characters, of a fourcc code.
-inline std::string GetFourccName(uint32 fourcc) {
-  std::string name;
-  name.push_back(static_cast<char>(fourcc & 0xFF));
-  name.push_back(static_cast<char>((fourcc >> 8) & 0xFF));
-  name.push_back(static_cast<char>((fourcc >> 16) & 0xFF));
-  name.push_back(static_cast<char>((fourcc >> 24) & 0xFF));
-  return name;
-}
-
 // Some good pages discussing FourCC codes:
 //   http://developer.apple.com/quicktime/icefloe/dispatch020.html
 //   http://www.fourcc.org/yuv.php
@@ -87,88 +77,6 @@ enum FourCC {
 // Converts fourcc aliases into canonical ones.
 uint32 CanonicalFourCC(uint32 fourcc);
 
-//////////////////////////////////////////////////////////////////////////////
-// Definition of VideoFormat.
-//////////////////////////////////////////////////////////////////////////////
-
-static const int64 kNumNanosecsPerSec = 1000000000;
-
-struct VideoFormat {
-  static const int64 kMinimumInterval = kNumNanosecsPerSec / 10000;  // 10k fps
-
-  VideoFormat() : width(0), height(0), interval(0), fourcc(0) {}
-
-  VideoFormat(int w, int h, int64 interval_ns, uint32 cc)
-      : width(w),
-        height(h),
-        interval(interval_ns),
-        fourcc(cc) {
-  }
-
-  VideoFormat(const VideoFormat& format)
-      : width(format.width),
-        height(format.height),
-        interval(format.interval),
-        fourcc(format.fourcc) {
-  }
-
-  static int64 FpsToInterval(int fps) {
-    return fps ? kNumNanosecsPerSec / fps : kMinimumInterval;
-  }
-
-  static int IntervalToFps(int64 interval) {
-    // Normalize the interval first.
-    interval = libyuv::_max(interval, kMinimumInterval);
-    return static_cast<int>(kNumNanosecsPerSec / interval);
-  }
-
-  bool operator==(const VideoFormat& format) const {
-    return width == format.width && height == format.height &&
-        interval == format.interval && fourcc == format.fourcc;
-  }
-
-  bool operator!=(const VideoFormat& format) const {
-    return !(*this == format);
-  }
-
-  bool operator<(const VideoFormat& format) const {
-    return (fourcc < format.fourcc) ||
-        (fourcc == format.fourcc && width < format.width) ||
-        (fourcc == format.fourcc && width == format.width &&
-            height < format.height) ||
-        (fourcc == format.fourcc && width == format.width &&
-            height == format.height && interval > format.interval);
-  }
-
-  int framerate() const { return IntervalToFps(interval); }
-
-  // Check if both width and height are 0.
-  bool IsSize0x0() const { return 0 == width && 0 == height; }
-
-  // Check if this format is less than another one by comparing the resolution
-  // and frame rate.
-  bool IsPixelRateLess(const VideoFormat& format) const {
-    return width * height * framerate() <
-        format.width * format.height * format.framerate();
-  }
-
-  // Get a string presentation in the form of "fourcc width x height x fps"
-  std::string ToString() const;
-
-  int    width;     // in number of pixels
-  int    height;    // in number of pixels
-  int64  interval;  // in nanoseconds
-  uint32 fourcc;    // color space. FOURCC_ANY means that any color space is OK.
-};
-
-// Result of video capturer start.
-enum CaptureResult {
-  CR_SUCCESS,    // The capturer starts successfully.
-  CR_PENDING,    // The capturer is pending to start the capture device.
-  CR_FAILURE,    // The capturer fails to start.
-  CR_NO_DEVICE,  // The capturer has no device and fails to start.
-};
-
 }  // namespace libyuv
 
 #endif  // LIBYUV_SOURCE_VIDEO_COMMON_H_
diff --git a/unit_test/unit_test.cc b/unit_test/unit_test.cc
index 73bb6a8a5..1996adf11 100644
--- a/unit_test/unit_test.cc
+++ b/unit_test/unit_test.cc
@@ -22,8 +22,7 @@ class libyuvEnvironment : public ::testing::Environment {
 
 libyuvTest::libyuvTest() :
   _rotate_max_w(128),
-  _rotate_max_h(128)
-{
+  _rotate_max_h(128) {
 }
 
 void libyuvTest::SetUp() {