From d2f4413d29d15b94d971630ba555dd0cd8fcc8c2 Mon Sep 17 00:00:00 2001 From: "fbarchard@google.com" Date: Wed, 4 Apr 2012 21:53:27 +0000 Subject: [PATCH] Remove old alpha blend, expose GetARGBBlend, fix ComputeSumSquareErrorPlane on SSE2 BUG=29 TEST=none Review URL: https://webrtc-codereview.appspot.com/469005 git-svn-id: http://libyuv.googlecode.com/svn/trunk@234 16f28f9a-4ce2-e073-06de-1de4eb20be90 --- README.chromium | 2 +- include/libyuv/planar_functions.h | 21 +- include/libyuv/rotate.h | 2 +- include/libyuv/version.h | 2 +- source/compare.cc | 53 ++-- source/convert.cc | 19 +- source/convert_from.cc | 8 +- source/format_conversion.cc | 7 +- source/planar_functions.cc | 124 +++------- source/rotate.cc | 16 +- source/rotate_neon.cc | 2 +- source/row.h | 37 ++- source/row_common.cc | 65 +---- source/row_neon.cc | 2 +- source/row_posix.cc | 179 +------------- source/row_win.cc | 390 +++++------------------------- source/scale.cc | 51 ++-- 17 files changed, 221 insertions(+), 759 deletions(-) diff --git a/README.chromium b/README.chromium index b83e57530..3af24f5a3 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 233 +Version: 234 License: BSD License File: LICENSE diff --git a/include/libyuv/planar_functions.h b/include/libyuv/planar_functions.h index 87de9b6b7..d7fd3e10e 100644 --- a/include/libyuv/planar_functions.h +++ b/include/libyuv/planar_functions.h @@ -133,24 +133,19 @@ int ARGBCopy(const uint8* src_argb, int src_stride_argb, uint8* dst_argb, int dst_stride_argb, int width, int height); -// Alpha Blend ARGB row of pixels. -void ARGBBlendRow(const uint8* src_argb, uint8* dst_argb, int width); +typedef void (*ARGBBlendRow)(const uint8* src_argb0, + const uint8* src_argb1, + uint8* dst_argb, int width); -// Alpha Blend 2 rows of ARGB pixels and store to destination. -void ARGBBlend2Row(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width); +// Get function to Alpha Blend ARGB pixels and store to destination. +ARGBBlendRow GetARGBBlend(uint8* dst_argb, int dst_stride_argb, int width); -// Alpha Blend ARGB. -int ARGBBlend(const uint8* src_argb, int src_stride_argb, +// Alpha Blend ARGB images and store to destination. +int ARGBBlend(const uint8* src_argb0, int src_stride_argb0, + const uint8* src_argb1, int src_stride_argb1, uint8* dst_argb, int dst_stride_argb, int width, int height); -// Alpha Blend 2 ARGB images and store to destination. -int ARGB2Blend(const uint8* src_argb0, int src_stride_argb0, - const uint8* src_argb1, int src_stride_argb1, - uint8* dst_argb, int dst_stride_argb, - int width, int height); - // Convert I422 to YUY2. 
int I422ToYUY2(const uint8* src_y, int src_stride_y, const uint8* src_u, int src_stride_u, diff --git a/include/libyuv/rotate.h b/include/libyuv/rotate.h index f8d2f57db..773290701 100644 --- a/include/libyuv/rotate.h +++ b/include/libyuv/rotate.h @@ -20,7 +20,7 @@ extern "C" { // Supported rotation enum RotationMode { - kRotate0 = 0, // No rotation + kRotate0 = 0, // No rotation kRotate90 = 90, // Rotate 90 degrees clockwise kRotate180 = 180, // Rotate 180 degrees kRotate270 = 270, // Rotate 270 degrees clockwise diff --git a/include/libyuv/version.h b/include/libyuv/version.h index aced5e45c..cdae68054 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,7 +11,7 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 233 +#define LIBYUV_VERSION 234 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/compare.cc b/source/compare.cc index b1b88769f..5fccd3930 100644 --- a/source/compare.cc +++ b/source/compare.cc @@ -25,18 +25,37 @@ namespace libyuv { extern "C" { #endif -// hash seed of 5381 recommended. -uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) { +// Internal C version of HashDjb2 with int sized count for efficiency. +static uint32 HashDjb2_C(const uint8* src, int count, uint32 seed) { uint32 hash = seed; - if (count > 0) { - do { - hash = hash * 33 + *src++; - } while (--count); + for (int i = 0; i < count; ++i) { + hash += (hash << 5) + src[i]; } return hash; } -#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM) +// hash seed of 5381 recommended. +uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) { + const int kBlockSize = 1 << 15; // 32768; + while (count >= static_cast<uint64>(kBlockSize)) { + seed = HashDjb2_C(src, kBlockSize, seed); + src += kBlockSize; + count -= kBlockSize; + } + int remainder = static_cast<int>(count) & ~15; + if (remainder) { + seed = HashDjb2_C(src, remainder, seed); + src += remainder; + count -= remainder; + } + remainder = static_cast<int>(count) & 15; + if (remainder) { + seed = HashDjb2_C(src, remainder, seed); + } + return seed; +} + +#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__) #define HAS_SUMSQUAREERROR_NEON static uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, @@ -75,9 +94,9 @@ static uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, return sse; } -#elif defined(_M_IX86) && !defined(YUV_DISABLE_ASM) +#elif !defined(YUV_DISABLE_ASM) && defined(_M_IX86) #define HAS_SUMSQUAREERROR_SSE2 -__declspec(naked) +__declspec(naked) __declspec(align(16)) static uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) { __asm { @@ -94,7 +113,7 @@ static uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, movdqa xmm2, [eax + edx] lea eax, [eax + 16] sub ecx, 16 - movdqa xmm3, xmm1 + movdqa xmm3, xmm1 // abs trick psubusb xmm1, xmm2 psubusb xmm2, xmm3 por xmm1, xmm2 @@ -116,7 +135,7 @@ static uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, } } -#elif defined(__x86_64__) || defined(__i386__) && !defined(YUV_DISABLE_ASM) +#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__)) #define HAS_SUMSQUAREERROR_SSE2 static uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) { @@ -167,11 +186,9 @@ static uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, static uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count) { uint32 sse = 0u; - for (int x = 0; x < count; ++x) { - int diff = src_a[0] - src_b[0]; + 
for (int i = 0; i < count; ++i) { + int diff = src_a[i] - src_b[i]; sse += static_cast<uint32>(diff * diff); - src_a += 1; - src_b += 1; } return sse; } @@ -187,6 +204,7 @@ uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b, #elif defined(HAS_SUMSQUAREERROR_SSE2) if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(src_a, 16) && IS_ALIGNED(src_b, 16)) { + // Note only used for multiples of 16 so count is not checked. SumSquareError = SumSquareError_SSE2; } #endif @@ -225,8 +243,9 @@ uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a, SumSquareError = SumSquareError_NEON; } #elif defined(HAS_SUMSQUAREERROR_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && - IS_ALIGNED(src_a, 16) && IS_ALIGNED(src_b, 16)) { + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) && + IS_ALIGNED(src_a, 16) && IS_ALIGNED(stride_a, 16) && + IS_ALIGNED(src_b, 16) && IS_ALIGNED(stride_b, 16)) { SumSquareError = SumSquareError_SSE2; } #endif diff --git a/source/convert.cc b/source/convert.cc index e07970d80..0b1f03c74 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -61,9 +61,9 @@ int I420Copy(const uint8* src_y, int src_stride_y, return 0; } -#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM) +#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86) #define HAS_HALFROW_SSE2 -__declspec(naked) +__declspec(naked) __declspec(align(16)) static void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride, uint8* dst_uv, int pix) { __asm { @@ -86,7 +86,7 @@ static void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride, } } -#elif defined(__x86_64__) || defined(__i386__) && !defined(YUV_DISABLE_ASM) +#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__)) #define HAS_HALFROW_SSE2 static void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride, uint8* dst_uv, int pix) { @@ -179,12 +179,13 @@ int I422ToI420(const uint8* src_y, int src_stride_y, // Blends 32x2 pixels to 16x1 // source in scale.cc -#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM) +#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__) #define HAS_SCALEROWDOWN2_NEON void ScaleRowDown2Int_NEON(const uint8* src_ptr, int src_stride, uint8* dst, int dst_width); -#elif defined(_M_IX86) || defined(__x86_64__) || defined(__i386__) && \ - !defined(YUV_DISABLE_ASM) +#elif !defined(YUV_DISABLE_ASM) && \ + (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) + void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride, uint8* dst_ptr, int dst_width); #endif @@ -450,9 +451,9 @@ int M420ToI420(const uint8* src_m420, int src_stride_m420, width, height); } -#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM) +#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86) #define HAS_SPLITYUY2_SSE2 -__declspec(naked) +__declspec(naked) __declspec(align(16)) static void SplitYUY2_SSE2(const uint8* src_yuy2, uint8* dst_y, uint8* dst_u, uint8* dst_v, int pix) { __asm { @@ -498,7 +499,7 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2, } } -#elif defined(__x86_64__) || defined(__i386__) && !defined(YUV_DISABLE_ASM) +#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__)) #define HAS_SPLITYUY2_SSE2 static void SplitYUY2_SSE2(const uint8* src_yuy2, uint8* dst_y, uint8* dst_u, uint8* dst_v, int pix) { diff --git a/source/convert_from.cc b/source/convert_from.cc index 9a0d32ad9..efe58dd82 100644 --- a/source/convert_from.cc +++ b/source/convert_from.cc @@ -205,9 +205,9 @@ int I400Copy(const uint8* src_y, int src_stride_y, // UYVY - Macro-pixel = 2 image pixels // U0Y0V0Y1 -#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM) 
+#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86) #define HAS_I42XTOYUY2ROW_SSE2 -__declspec(naked) +__declspec(naked) __declspec(align(16)) static void I42xToYUY2Row_SSE2(const uint8* src_y, const uint8* src_u, const uint8* src_v, @@ -246,7 +246,7 @@ static void I42xToYUY2Row_SSE2(const uint8* src_y, } #define HAS_I42XTOUYVYROW_SSE2 -__declspec(naked) +__declspec(naked) __declspec(align(16)) static void I42xToUYVYRow_SSE2(const uint8* src_y, const uint8* src_u, const uint8* src_v, @@ -283,7 +283,7 @@ static void I42xToUYVYRow_SSE2(const uint8* src_y, ret } } -#elif defined(__x86_64__) || defined(__i386__) && !defined(YUV_DISABLE_ASM) +#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__)) #define HAS_I42XTOYUY2ROW_SSE2 static void I42xToYUY2Row_SSE2(const uint8* src_y, const uint8* src_u, diff --git a/source/format_conversion.cc b/source/format_conversion.cc index 46d7e7e23..1cdf709e4 100644 --- a/source/format_conversion.cc +++ b/source/format_conversion.cc @@ -24,9 +24,9 @@ extern "C" { // and vst would select which 2 components to write. The low level would need // to be ARGBToBG, ARGBToGB, ARGBToRG, ARGBToGR -#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM) +#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86) #define HAS_ARGBTOBAYERROW_SSSE3 -__declspec(naked) +__declspec(naked) __declspec(align(16)) static void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer, uint32 selector, int pix) { __asm { @@ -36,6 +36,7 @@ static void ARGBToBayerRow_SSSE3(const uint8* src_argb, mov ecx, [esp + 16] // pix pshufd xmm5, xmm5, 0 + align 16 wloop: movdqa xmm0, [eax] lea eax, [eax + 16] @@ -48,7 +49,7 @@ static void ARGBToBayerRow_SSSE3(const uint8* src_argb, } } -#elif defined(__x86_64__) || defined(__i386__) && !defined(YUV_DISABLE_ASM) +#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__)) #define HAS_ARGBTOBAYERROW_SSSE3 static void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer, diff --git a/source/planar_functions.cc b/source/planar_functions.cc index 866fcb4fd..2bc3e3fe2 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -137,87 +137,38 @@ int ARGBCopy(const uint8* src_argb, int src_stride_argb, return 0; } -// Alpha Blend ARGB -void ARGBBlendRow(const uint8* src_argb, uint8* dst_argb, int width) { -#if defined(HAS_ARGBBLENDROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBBlendRow_SSSE3(src_argb, dst_argb, width); - return; - } -#endif +// Get a blender that optimized for the CPU, alignment and pixel count. +// As there are 6 blenders to choose from, the caller should try to use +// the same blend function for all pixels if possible. +ARGBBlendRow GetARGBBlend(uint8* dst_argb, int dst_stride_argb, int width) { + void (*ARGBBlendRow)(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width) = ARGBBlendRow_C; #if defined(HAS_ARGBBLENDROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { - ARGBBlendRow_SSE2(src_argb, dst_argb, width); - return; - } -#endif - ARGBBlendRow_C(src_argb, dst_argb, width); -} - -// Alpha Blend 2 rows of ARGB pixels and store to destination. 
-void ARGBBlend2Row(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { -#if defined(HAS_ARGBBLENDROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBBlend2Row_SSSE3(src_argb0, src_argb1, dst_argb, width); - return; - } -#endif -#if defined(HAS_ARGBBLENDROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ARGBBlend2Row_SSE2(src_argb0, src_argb1, dst_argb, width); - return; - } -#endif - ARGBBlend2Row_C(src_argb0, src_argb1, dst_argb, width); -} - -// Alpha Blend ARGB -// TODO(fbarchard): Call 3 pointer low levels to reduce code size. -int ARGBBlend(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { - if (!src_argb || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } - - void (*ARGBBlendRow)(const uint8* src_argb, uint8* dst_argb, int width) = - ARGBBlendRow_C; -#if defined(HAS_ARGBBLENDROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ARGBBlendRow = ARGBBlendRow_SSE2; - if (IS_ALIGNED(width, 4) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { - ARGBBlendRow = ARGBBlendRow_Aligned_SSE2; + ARGBBlendRow = ARGBBlendRow1_SSE2; + if (width >= 4) { + ARGBBlendRow = ARGBBlendRow_Any_SSE2; + if (IS_ALIGNED(width, 4) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + ARGBBlendRow = ARGBBlendRow_Aligned_SSE2; + } } } #endif #if defined(HAS_ARGBBLENDROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBBlendRow = ARGBBlendRow_SSSE3; + if (TestCpuFlag(kCpuHasSSSE3) && width >= 4) { + ARGBBlendRow = ARGBBlendRow_Any_SSSE3; if (IS_ALIGNED(width, 4) && IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { ARGBBlendRow = ARGBBlendRow_Aligned_SSSE3; } } #endif - - for (int y = 0; y < height; ++y) { - ARGBBlendRow(src_argb, dst_argb, width); - src_argb += src_stride_argb; - dst_argb += dst_stride_argb; - } - return 0; + return ARGBBlendRow; } // Alpha Blend 2 ARGB images and store to destination. 
-int ARGB2Blend(const uint8* src_argb0, int src_stride_argb0, +int ARGBBlend(const uint8* src_argb0, int src_stride_argb0, const uint8* src_argb1, int src_stride_argb1, uint8* dst_argb, int dst_stride_argb, int width, int height) { @@ -230,30 +181,12 @@ int ARGB2Blend(const uint8* src_argb0, int src_stride_argb0, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } - - void (*ARGBBlend2Row)(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width) = ARGBBlend2Row_C; -#if defined(HAS_ARGBBLENDROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ARGBBlend2Row = ARGBBlend2Row_SSE2; - if (IS_ALIGNED(width, 4) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { - ARGBBlend2Row = ARGBBlend2Row_Aligned_SSE2; - } - } -#endif -#if defined(HAS_ARGBBLENDROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBBlend2Row = ARGBBlend2Row_SSSE3; - if (IS_ALIGNED(width, 4) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { - ARGBBlend2Row = ARGBBlend2Row_Aligned_SSSE3; - } - } -#endif + void (*ARGBBlendRow)(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width) = + GetARGBBlend(dst_argb, dst_stride_argb, width); for (int y = 0; y < height; ++y) { - ARGBBlend2Row(src_argb0, src_argb1, dst_argb, width); + ARGBBlendRow(src_argb0, src_argb1, dst_argb, width); src_argb0 += src_stride_argb0; src_argb1 += src_stride_argb1; dst_argb += dst_stride_argb; @@ -725,7 +658,7 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y, // SetRow8 writes 'count' bytes using a 32 bit value repeated // SetRow32 writes 'count' words using a 32 bit value repeated -#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM) +#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__) #define HAS_SETROW_NEON static void SetRow8_NEON(uint8* dst, uint32 v32, int count) { asm volatile ( @@ -749,9 +682,9 @@ static void SetRows32_NEON(uint8* dst, uint32 v32, int width, } } -#elif defined(_M_IX86) && !defined(YUV_DISABLE_ASM) +#elif !defined(YUV_DISABLE_ASM) && defined(_M_IX86) #define HAS_SETROW_X86 -__declspec(naked) +__declspec(naked) __declspec(align(16)) static void SetRow8_X86(uint8* dst, uint32 v32, int count) { __asm { mov edx, edi @@ -765,7 +698,7 @@ static void SetRow8_X86(uint8* dst, uint32 v32, int count) { } } -__declspec(naked) +__declspec(naked) __declspec(align(16)) static void SetRows32_X86(uint8* dst, uint32 v32, int width, int dst_stride, int height) { __asm { @@ -793,7 +726,7 @@ static void SetRows32_X86(uint8* dst, uint32 v32, int width, } } -#elif defined(__x86_64__) || defined(__i386__) && !defined(YUV_DISABLE_ASM) +#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__)) #define HAS_SETROW_X86 static void SetRow8_X86(uint8* dst, uint32 v32, int width) { size_t width_tmp = static_cast<size_t>(width); @@ -903,6 +836,7 @@ int I420Rect(uint8* dst_y, int dst_stride_y, return 0; } +// TODO(fbarchard): Add TestCpuFlag(kCpuHasX86) to allow C code to be tested. 
// Draw a rectangle into ARGB int ARGBRect(uint8* dst_argb, int dst_stride_argb, int dst_x, int dst_y, @@ -916,12 +850,14 @@ int ARGBRect(uint8* dst_argb, int dst_stride_argb, uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; #if defined(HAS_SETROW_X86) SetRows32_X86(dst, value, width, dst_stride_argb, height); -#elif defined(HAS_SETROW_NEON) +#else +#if defined(HAS_SETROW_NEON) if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16) && IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride_argb, 16)) { SetRows32_NEON(dst, value, width, dst_stride_argb, height); return 0; } +#endif SetRows32_C(dst, value, width, dst_stride_argb, height); #endif return 0; diff --git a/source/rotate.cc b/source/rotate.cc index f5f9075c3..a029a17bc 100644 --- a/source/rotate.cc +++ b/source/rotate.cc @@ -21,8 +21,8 @@ namespace libyuv { extern "C" { #endif -#if (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \ - !defined(YUV_DISABLE_ASM) +#if !defined(YUV_DISABLE_ASM) && \ + (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) #if defined(__APPLE__) && defined(__i386__) #define DECLARE_FUNCTION(name) \ ".text \n" \ @@ -59,9 +59,9 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride, int width); #endif -#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM) +#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86) #define HAS_TRANSPOSE_WX8_SSSE3 -__declspec(naked) +__declspec(naked) __declspec(align(16)) static void TransposeWx8_SSSE3(const uint8* src, int src_stride, uint8* dst, int dst_stride, int width) { __asm { @@ -153,7 +153,7 @@ static void TransposeWx8_SSSE3(const uint8* src, int src_stride, } #define HAS_TRANSPOSE_UVWX8_SSE2 -__declspec(naked) +__declspec(naked) __declspec(align(16)) static void TransposeUVWx8_SSE2(const uint8* src, int src_stride, uint8* dst_a, int dst_stride_a, uint8* dst_b, int dst_stride_b, @@ -281,7 +281,7 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride, ret } } -#elif defined(__i386__) || defined(__x86_64__) && !defined(YUV_DISABLE_ASM) +#elif !defined(YUV_DISABLE_ASM) && (defined(__i386__) || defined(__x86_64__)) #define HAS_TRANSPOSE_WX8_SSSE3 static void TransposeWx8_SSSE3(const uint8* src, int src_stride, uint8* dst, int dst_stride, int width) { @@ -369,7 +369,7 @@ static void TransposeWx8_SSSE3(const uint8* src, int src_stride, ); } -#if defined (__i386__) +#if !defined(YUV_DISABLE_ASM) && defined (__i386__) #define HAS_TRANSPOSE_UVWX8_SSE2 extern "C" void TransposeUVWx8_SSE2(const uint8* src, int src_stride, uint8* dst_a, int dst_stride_a, @@ -491,7 +491,7 @@ extern "C" void TransposeUVWx8_SSE2(const uint8* src, int src_stride, "pop %ebx \n" "ret \n" ); -#elif defined(__x86_64__) +#elif !defined(YUV_DISABLE_ASM) && defined(__x86_64__) // 64 bit version has enough registers to do 16x8 to 8x16 at a time. 
#define HAS_TRANSPOSE_WX8_FAST_SSSE3 static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride, diff --git a/source/rotate_neon.cc b/source/rotate_neon.cc index 0240fe12f..7ff993617 100644 --- a/source/rotate_neon.cc +++ b/source/rotate_neon.cc @@ -17,7 +17,7 @@ namespace libyuv { extern "C" { #endif -#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM) +#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__) static const uvec8 vtbl_4x4_transpose = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 }; diff --git a/source/row.h b/source/row.h index c70160025..4ed17a096 100644 --- a/source/row.h +++ b/source/row.h @@ -18,6 +18,7 @@ namespace libyuv { extern "C" { #endif +// TODO(fbarchard): Remove kMaxStride #define kMaxStride (2560 * 4) #define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a) - 1))) @@ -26,8 +27,9 @@ extern "C" { #endif // The following are available on all x86 platforms -#if (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \ - !defined(YUV_DISABLE_ASM) +#if !defined(YUV_DISABLE_ASM) && \ + (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) + #define HAS_ABGRTOARGBROW_SSSE3 #define HAS_BGRATOARGBROW_SSSE3 #define HAS_RGB24TOARGBROW_SSSE3 @@ -66,7 +68,7 @@ extern "C" { #endif // The following are available on Neon platforms -#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM) +#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__) #define HAS_MIRRORROW_NEON #define HAS_MIRRORROWUV_NEON #define HAS_SPLITUV_NEON @@ -78,7 +80,7 @@ extern "C" { // The following are only available on Win32 // TODO(fbarchard): Port to GCC -#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM) +#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86) #define HAS_ARGBBLENDROW_SSSE3 #endif @@ -265,25 +267,18 @@ void YToARGBRow_SSE2(const uint8* y_buf, int width); // ARGB preattenuated alpha blend. -void ARGBBlendRow_Aligned_SSSE3(const uint8* src_argb, uint8* dst_argb, - int width); -void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb, uint8* dst_argb, - int width); -void ARGBBlendRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width); -void ARGBBlendRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width); -void ARGBBlendRow_C(const uint8* src_argb, uint8* dst_argb, int width); - -// ARGB preattenuated alpha blend with 2 sources and a destination. -void ARGBBlend2Row_Aligned_SSSE3(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width); -void ARGBBlend2Row_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1, +void ARGBBlendRow_Aligned_SSSE3(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width); -void ARGBBlend2Row_SSSE3(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width); -void ARGBBlend2Row_SSE2(const uint8* src_argb0, const uint8* src_argb1, +void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBBlendRow1_SSE2(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width); -void ARGBBlend2Row_C(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width); +void ARGBBlendRow_Any_SSSE3(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBBlendRow_Any_SSE2(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width); // 'Any' functions handle any size and alignment. 
void I420ToARGBRow_Any_SSSE3(const uint8* y_buf, diff --git a/source/row_common.cc b/source/row_common.cc index d2f17ef30..32e2db95a 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -454,73 +454,10 @@ void UYVYToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width) { } #define BLENDER(f, b, a) (((256 - a) * b) >> 8) + f -void ARGBBlendRow_C(const uint8* src_argb, uint8* dst_argb, int width) { - for (int x = 0; x < width - 1; x += 2) { - uint32 a = src_argb[3]; - if (a) { - if (a < 255) { - const uint32 fb = src_argb[0]; - const uint32 fg = src_argb[1]; - const uint32 fr = src_argb[2]; - const uint32 bb = dst_argb[0]; - const uint32 bg = dst_argb[1]; - const uint32 br = dst_argb[2]; - dst_argb[0] = BLENDER(fb, bb, a); - dst_argb[1] = BLENDER(fg, bg, a); - dst_argb[2] = BLENDER(fr, br, a); - dst_argb[3] = 255u; - } else { - *reinterpret_cast<uint32*>(dst_argb) = - *reinterpret_cast<const uint32*>(src_argb); - } - } - a = src_argb[4 + 3]; - if (a) { - if (a < 255) { - const uint32 fb = src_argb[4 + 0]; - const uint32 fg = src_argb[4 + 1]; - const uint32 fr = src_argb[4 + 2]; - const uint32 bb = dst_argb[4 + 0]; - const uint32 bg = dst_argb[4 + 1]; - const uint32 br = dst_argb[4 + 2]; - dst_argb[4 + 0] = BLENDER(fb, bb, a); - dst_argb[4 + 1] = BLENDER(fg, bg, a); - dst_argb[4 + 2] = BLENDER(fr, br, a); - dst_argb[4 + 3] = 255u; - } else { - *reinterpret_cast<uint32*>(dst_argb + 4) = - *reinterpret_cast<const uint32*>(src_argb + 4); - } - } - src_argb += 8; - dst_argb += 8; - } - - if (width & 1) { - const uint32 a = src_argb[3]; - if (a) { - if (a < 255) { - const uint32 fb = src_argb[0]; - const uint32 fg = src_argb[1]; - const uint32 fr = src_argb[2]; - const uint32 bb = dst_argb[0]; - const uint32 bg = dst_argb[1]; - const uint32 br = dst_argb[2]; - dst_argb[0] = BLENDER(fb, bb, a); - dst_argb[1] = BLENDER(fg, bg, a); - dst_argb[2] = BLENDER(fr, br, a); - dst_argb[3] = 255u; - } else { - *reinterpret_cast<uint32*>(dst_argb) = - *reinterpret_cast<const uint32*>(src_argb); - } - } - } -} // Blend src_argb0 over src_argb1 and store to dst_argb. // dst_argb may be src_argb0 or src_argb1. -void ARGBBlend2Row_C(const uint8* src_argb0, const uint8* src_argb1, +void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { for (int x = 0; x < width - 1; x += 2) { uint32 a = src_argb0[3]; diff --git a/source/row_neon.cc b/source/row_neon.cc index 2c68492e3..ba22c8073 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -16,7 +16,7 @@ extern "C" { #endif // This module is for GCC Neon -#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM) +#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__) #define YUVTORGB \ "vld1.u8 {d0}, [%0]! \n" \ diff --git a/source/row_posix.cc b/source/row_posix.cc index e7cfb011a..f8979ace0 100644 --- a/source/row_posix.cc +++ b/source/row_posix.cc @@ -18,7 +18,7 @@ extern "C" { #endif // This module is for GCC x86 and x64 -#if (defined(__x86_64__) || defined(__i386__)) && !defined(YUV_DISABLE_ASM) +#if !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__)) // GCC 4.2 on OSX has link error when passing static or const to inline. // TODO(fbarchard): Use static const when gcc 4.2 support is dropped. 
@@ -2031,162 +2031,7 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy, #ifdef HAS_ARGBBLENDROW_SSE2 // Blend 8 pixels at a time // Destination aligned to 16 bytes, multiple of 4 pixels -void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb, uint8* dst_argb, - int width) { - asm volatile ( - "pcmpeqb %%xmm7,%%xmm7 \n" - "psrlw $0xf,%%xmm7 \n" - "pcmpeqb %%xmm6,%%xmm6 \n" - "psrlw $0x8,%%xmm6 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "psllw $0x8,%%xmm5 \n" - "pcmpeqb %%xmm4,%%xmm4 \n" - "pslld $0x18,%%xmm4 \n" - - // 8 pixel loop - "1: \n" - "movdqu (%0),%%xmm3 \n" // first 4 pixels - "movdqa %%xmm3,%%xmm0 \n" - "pxor %%xmm4,%%xmm3 \n" - "movdqa (%1),%%xmm2 \n" - "psrlw $0x8,%%xmm3 \n" - "pshufhw $0xf5,%%xmm3,%%xmm3 \n" - "pshuflw $0xf5,%%xmm3,%%xmm3 \n" - "pand %%xmm6,%%xmm2 \n" - "paddw %%xmm7,%%xmm3 \n" - "pmullw %%xmm3,%%xmm2 \n" - "movdqa (%1),%%xmm1 \n" - "psrlw $0x8,%%xmm1 \n" - "por %%xmm4,%%xmm0 \n" - "pmullw %%xmm3,%%xmm1 \n" - "movdqu 0x10(%0),%%xmm3 \n" - "lea 0x20(%0),%0 \n" - "psrlw $0x8,%%xmm2 \n" - "paddusb %%xmm2,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "paddusb %%xmm1,%%xmm0 \n" - "sub $0x4,%2 \n" - "movdqa %%xmm0,(%1) \n" - "jle 9f \n" - "movdqa %%xmm3,%%xmm0 \n" // next 4 pixels - "pxor %%xmm4,%%xmm3 \n" - "movdqa 0x10(%1),%%xmm2 \n" - "psrlw $0x8,%%xmm3 \n" - "pshufhw $0xf5,%%xmm3,%%xmm3 \n" - "pshuflw $0xf5,%%xmm3,%%xmm3 \n" - "pand %%xmm6,%%xmm2 \n" - "paddw %%xmm7,%%xmm3 \n" - "pmullw %%xmm3,%%xmm2 \n" - "movdqa 0x10(%1),%%xmm1 \n" - "psrlw $0x8,%%xmm1 \n" - "por %%xmm4,%%xmm0 \n" - "pmullw %%xmm3,%%xmm1 \n" - "psrlw $0x8,%%xmm2 \n" - "paddusb %%xmm2,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "paddusb %%xmm1,%%xmm0 \n" - "sub $0x4,%2 \n" - "movdqa %%xmm0,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "jg 1b \n" - "9: \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" -#endif - ); -} - -// Blend 1 pixel at a time, unaligned -void ARGBBlendRow1_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { - asm volatile ( - "pcmpeqb %%xmm7,%%xmm7 \n" - "psrlw $0xf,%%xmm7 \n" - "pcmpeqb %%xmm6,%%xmm6 \n" - "psrlw $0x8,%%xmm6 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "psllw $0x8,%%xmm5 \n" - "pcmpeqb %%xmm4,%%xmm4 \n" - "pslld $0x18,%%xmm4 \n" - - // 1 pixel loop - "1: \n" - "movd (%0),%%xmm3 \n" - "lea 0x4(%0),%0 \n" - "movdqa %%xmm3,%%xmm0 \n" - "pxor %%xmm4,%%xmm3 \n" - "movd (%1),%%xmm2 \n" - "psrlw $0x8,%%xmm3 \n" - "pshufhw $0xf5,%%xmm3,%%xmm3 \n" - "pshuflw $0xf5,%%xmm3,%%xmm3 \n" - "pand %%xmm6,%%xmm2 \n" - "paddw %%xmm7,%%xmm3 \n" - "pmullw %%xmm3,%%xmm2 \n" - "movd (%1),%%xmm1 \n" - "psrlw $0x8,%%xmm1 \n" - "por %%xmm4,%%xmm0 \n" - "pmullw %%xmm3,%%xmm1 \n" - "psrlw $0x8,%%xmm2 \n" - "paddusb %%xmm2,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "paddusb %%xmm1,%%xmm0 \n" - "sub $0x1,%2 \n" - "movd %%xmm0,(%1) \n" - "lea 0x4(%1),%1 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" -#endif - ); -} - -void ARGBBlendRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { - // Do 1 to 3 pixels to get destination aligned. 
- if ((uintptr_t)(dst_argb) & 15) { - int count = width; - if (count > 4 && ((intptr_t)(dst_argb) & 3) == 0) { - count = (-(intptr_t)(dst_argb) >> 2) & 3; - } - ARGBBlendRow1_SSE2(src_argb, dst_argb, count); - src_argb += count * 4; - dst_argb += count * 4; - width -= count; - } - // Do multiple of 4 pixels - if (width & ~3) { - ARGBBlendRow_Aligned_SSE2(src_argb, dst_argb, width & ~3); - } - // Do remaining 1 to 3 pixels - if (width & 3) { - src_argb += (width & ~3) * 4; - dst_argb += (width & ~3) * 4; - width &= 3; - ARGBBlendRow1_SSE2(src_argb, dst_argb, width); - } -} - -#endif // HAS_ARGBBLENDROW_SSE2 - - - - - - - - -#ifdef HAS_ARGBBLENDROW_SSE2 -// Blend 8 pixels at a time -// Destination aligned to 16 bytes, multiple of 4 pixels -void ARGBBlend2Row_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1, +void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { asm volatile ( "pcmpeqb %%xmm7,%%xmm7 \n" @@ -2259,7 +2104,7 @@ void ARGBBlend2Row_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1, } // Blend 1 pixel at a time, unaligned -void ARGBBlend2Row1_SSE2(const uint8* src_argb0, const uint8* src_argb1, +void ARGBBlendRow1_SSE2(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { asm volatile ( "pcmpeqb %%xmm7,%%xmm7 \n" @@ -2309,15 +2154,15 @@ void ARGBBlend2Row1_SSE2(const uint8* src_argb0, const uint8* src_argb1, ); } -void ARGBBlend2Row_SSE2(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { +void ARGBBlendRow_Any_SSE2(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { // Do 1 to 3 pixels to get destination aligned. if ((uintptr_t)(dst_argb) & 15) { int count = width; if (count > 4 && ((intptr_t)(dst_argb) & 3) == 0) { count = (-(intptr_t)(dst_argb) >> 2) & 3; } - ARGBBlend2Row1_SSE2(src_argb0, src_argb1, dst_argb, count); + ARGBBlendRow1_SSE2(src_argb0, src_argb1, dst_argb, count); src_argb0 += count * 4; src_argb1 += count * 4; dst_argb += count * 4; @@ -2325,7 +2170,7 @@ void ARGBBlend2Row_SSE2(const uint8* src_argb0, const uint8* src_argb1, } // Do multiple of 4 pixels if (width & ~3) { - ARGBBlend2Row_Aligned_SSE2(src_argb0, src_argb1, dst_argb, width & ~3); + ARGBBlendRow_Aligned_SSE2(src_argb0, src_argb1, dst_argb, width & ~3); } // Do remaining 1 to 3 pixels if (width & 3) { @@ -2333,19 +2178,11 @@ void ARGBBlend2Row_SSE2(const uint8* src_argb0, const uint8* src_argb1, src_argb1 += (width & ~3) * 4; dst_argb += (width & ~3) * 4; width &= 3; - ARGBBlend2Row1_SSE2(src_argb0, src_argb1, dst_argb, width); + ARGBBlendRow1_SSE2(src_argb0, src_argb1, dst_argb, width); } } #endif // HAS_ARGBBLENDROW_SSE2 - - - - - - - - #endif // defined(__x86_64__) || defined(__i386__) #ifdef __cplusplus diff --git a/source/row_win.cc b/source/row_win.cc index ada7788c7..c7c553774 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -16,7 +16,7 @@ extern "C" { #endif // This module is for Visual C x86 -#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM) +#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86) #ifdef HAS_ARGBTOYROW_SSSE3 @@ -99,7 +99,7 @@ static const uvec8 kShuffleMaskARGBToRAW = { 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u }; -__declspec(naked) +__declspec(naked) __declspec(align(16)) void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { __asm { mov eax, [esp + 4] // src_y @@ -127,7 +127,7 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { } } 
-__declspec(naked) +__declspec(naked) __declspec(align(16)) void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) { __asm { mov eax, [esp + 4] // src_abgr @@ -148,7 +148,7 @@ __asm { } } -__declspec(naked) +__declspec(naked) __declspec(align(16)) void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) { __asm { mov eax, [esp + 4] // src_bgra @@ -169,7 +169,7 @@ __asm { } } -__declspec(naked) +__declspec(naked) __declspec(align(16)) void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { __asm { mov eax, [esp + 4] // src_rgb24 @@ -208,7 +208,7 @@ __asm { } } -__declspec(naked) +__declspec(naked) __declspec(align(16)) void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) { __asm { @@ -255,7 +255,7 @@ __asm { // v * (256 + 8) // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3 // 20 instructions -__declspec(naked) +__declspec(naked) __declspec(align(16)) void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, int pix) { __asm { @@ -306,7 +306,7 @@ __asm { } // 24 instructions -__declspec(naked) +__declspec(naked) __declspec(align(16)) void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb, int pix) { __asm { @@ -360,7 +360,7 @@ __asm { } // 18 instructions -__declspec(naked) +__declspec(naked) __declspec(align(16)) void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb, int pix) { __asm { @@ -399,7 +399,7 @@ __asm { } } -__declspec(naked) +__declspec(naked) __declspec(align(16)) void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { __asm { mov eax, [esp + 4] // src_argb @@ -438,7 +438,7 @@ __asm { } } -__declspec(naked) +__declspec(naked) __declspec(align(16)) void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { __asm { mov eax, [esp + 4] // src_argb @@ -477,7 +477,7 @@ __asm { } } -__declspec(naked) +__declspec(naked) __declspec(align(16)) void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { __asm { mov eax, [esp + 4] // src_argb @@ -516,7 +516,7 @@ __asm { } // TODO(fbarchard): Improve sign extension/packing -__declspec(naked) +__declspec(naked) __declspec(align(16)) void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { __asm { mov eax, [esp + 4] // src_argb @@ -558,7 +558,7 @@ __asm { } } -__declspec(naked) +__declspec(naked) __declspec(align(16)) void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { __asm { mov eax, [esp + 4] // src_argb @@ -589,7 +589,7 @@ __asm { } // Convert 16 ARGB pixels (64 bytes) to 16 Y values -__declspec(naked) +__declspec(naked) __declspec(align(16)) void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { __asm { mov eax, [esp + 4] /* src_argb */ @@ -623,7 +623,7 @@ __asm { } } -__declspec(naked) +__declspec(naked) __declspec(align(16)) void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { __asm { mov eax, [esp + 4] /* src_argb */ @@ -657,7 +657,7 @@ __asm { } } -__declspec(naked) +__declspec(naked) __declspec(align(16)) void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { __asm { mov eax, [esp + 4] /* src_argb */ @@ -691,7 +691,7 @@ __asm { } } -__declspec(naked) +__declspec(naked) __declspec(align(16)) void BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { __asm { mov eax, [esp + 4] /* src_argb */ @@ -725,7 +725,7 @@ __asm { } } -__declspec(naked) +__declspec(naked) __declspec(align(16)) void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* 
dst_y, int pix) { __asm { mov eax, [esp + 4] /* src_argb */ @@ -759,7 +759,7 @@ __asm { } } -__declspec(naked) +__declspec(naked) __declspec(align(16)) void ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { __asm { mov eax, [esp + 4] /* src_argb */ @@ -793,7 +793,7 @@ __asm { } } -__declspec(naked) +__declspec(naked) __declspec(align(16)) void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width) { __asm { @@ -859,7 +859,7 @@ __asm { } } -__declspec(naked) +__declspec(naked) __declspec(align(16)) void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width) { __asm { @@ -929,7 +929,7 @@ __asm { } } -__declspec(naked) +__declspec(naked) __declspec(align(16)) void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width) { __asm { @@ -995,7 +995,7 @@ __asm { } } -__declspec(naked) +__declspec(naked) __declspec(align(16)) void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width) { __asm { @@ -1065,7 +1065,7 @@ __asm { } } -__declspec(naked) +__declspec(naked) __declspec(align(16)) void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width) { __asm { @@ -1131,7 +1131,7 @@ __asm { } } -__declspec(naked) +__declspec(naked) __declspec(align(16)) void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width) { __asm { @@ -1268,7 +1268,7 @@ static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR }; __asm packuswb xmm2, xmm2 /* R */ \ } -__declspec(naked) +__declspec(naked) __declspec(align(16)) void I420ToARGBRow_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -1308,7 +1308,7 @@ void I420ToARGBRow_SSSE3(const uint8* y_buf, } } -__declspec(naked) +__declspec(naked) __declspec(align(16)) void I420ToBGRARow_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -1348,7 +1348,7 @@ void I420ToBGRARow_SSSE3(const uint8* y_buf, } } -__declspec(naked) +__declspec(naked) __declspec(align(16)) void I420ToABGRRow_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -1388,7 +1388,7 @@ void I420ToABGRRow_SSSE3(const uint8* y_buf, } } -__declspec(naked) +__declspec(naked) __declspec(align(16)) void I420ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -1428,7 +1428,7 @@ void I420ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, } } -__declspec(naked) +__declspec(naked) __declspec(align(16)) void I420ToBGRARow_Unaligned_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -1468,7 +1468,7 @@ void I420ToBGRARow_Unaligned_SSSE3(const uint8* y_buf, } } -__declspec(naked) +__declspec(naked) __declspec(align(16)) void I420ToABGRRow_Unaligned_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -1508,7 +1508,7 @@ void I420ToABGRRow_Unaligned_SSSE3(const uint8* y_buf, } } -__declspec(naked) +__declspec(naked) __declspec(align(16)) void I444ToARGBRow_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -1575,7 +1575,7 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf, #endif #ifdef HAS_YTOARGBROW_SSE2 -__declspec(naked) +__declspec(naked) __declspec(align(16)) void YToARGBRow_SSE2(const uint8* y_buf, uint8* rgb_buf, int width) { @@ -1628,7 +1628,7 @@ static const uvec8 kShuffleMirror = { 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u }; 
-__declspec(naked) +__declspec(naked) __declspec(align(16)) void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { __asm { mov eax, [esp + 4] // src @@ -1653,7 +1653,7 @@ __asm { #ifdef HAS_MIRRORROW_SSE2 // SSE2 version has movdqu so it can be used on unaligned buffers when SSSE3 // version can not. -__declspec(naked) +__declspec(naked) __declspec(align(16)) void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) { __asm { mov eax, [esp + 4] // src @@ -1686,7 +1686,7 @@ static const uvec8 kShuffleMirrorUV = { 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u }; -__declspec(naked) +__declspec(naked) __declspec(align(16)) void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, int width) { __asm { @@ -1717,7 +1717,7 @@ void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, #endif #ifdef HAS_SPLITUV_SSE2 -__declspec(naked) +__declspec(naked) __declspec(align(16)) void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { __asm { push edi @@ -1756,7 +1756,7 @@ void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { #ifdef HAS_COPYROW_SSE2 // CopyRow copys 'count' bytes using a 16 byte load/store, 32 bytes at time -__declspec(naked) +__declspec(naked) __declspec(align(16)) void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { __asm { mov eax, [esp + 4] // src @@ -1779,7 +1779,7 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { #endif // HAS_COPYROW_SSE2 #ifdef HAS_COPYROW_X86 -__declspec(naked) +__declspec(naked) __declspec(align(16)) void CopyRow_X86(const uint8* src, uint8* dst, int count) { __asm { mov eax, esi @@ -1797,7 +1797,7 @@ void CopyRow_X86(const uint8* src, uint8* dst, int count) { #endif #ifdef HAS_YUY2TOYROW_SSE2 -__declspec(naked) +__declspec(naked) __declspec(align(16)) void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) { __asm { @@ -1823,7 +1823,7 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2, } } -__declspec(naked) +__declspec(naked) __declspec(align(16)) void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, uint8* dst_u, uint8* dst_y, int pix) { __asm { @@ -1867,7 +1867,7 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, } } -__declspec(naked) +__declspec(naked) __declspec(align(16)) void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) { __asm { @@ -1893,7 +1893,7 @@ void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2, } } -__declspec(naked) +__declspec(naked) __declspec(align(16)) void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2, uint8* dst_u, uint8* dst_y, int pix) { __asm { @@ -1937,7 +1937,7 @@ void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2, } } -__declspec(naked) +__declspec(naked) __declspec(align(16)) void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) { __asm { @@ -1961,7 +1961,7 @@ void UYVYToYRow_SSE2(const uint8* src_uyvy, } } -__declspec(naked) +__declspec(naked) __declspec(align(16)) void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, uint8* dst_u, uint8* dst_y, int pix) { __asm { @@ -2005,7 +2005,7 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, } } -__declspec(naked) +__declspec(naked) __declspec(align(16)) void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) { __asm { @@ -2029,7 +2029,7 @@ void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy, } } -__declspec(naked) +__declspec(naked) __declspec(align(16)) void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int 
stride_uyvy, uint8* dst_u, uint8* dst_y, int pix) { __asm { @@ -2078,273 +2078,7 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy, // Blend 8 pixels at a time // Destination aligned to 16 bytes, multiple of 4 pixels __declspec(naked) __declspec(align(16)) -void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb, uint8* dst_argb, - int width) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width - pcmpeqb xmm7, xmm7 // generate constant 1 - psrlw xmm7, 15 - pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff - psrlw xmm6, 8 - pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 - psllw xmm5, 8 - pcmpeqb xmm4, xmm4 // generate mask 0xff000000 - pslld xmm4, 24 - - align 16 - convertloop: - movdqu xmm3, [eax] - movdqa xmm0, xmm3 // src argb - pxor xmm3, xmm4 // ~alpha - movdqa xmm2, [edx] // _r_b - psrlw xmm3, 8 // alpha - pshufhw xmm3, xmm3,0F5h // 8 alpha words - pshuflw xmm3, xmm3,0F5h - pand xmm2, xmm6 // _r_b - paddw xmm3, xmm7 // 256 - alpha - pmullw xmm2, xmm3 // _r_b * alpha - movdqa xmm1, [edx] // _a_g - psrlw xmm1, 8 // _a_g - por xmm0, xmm4 // set alpha to 255 - pmullw xmm1, xmm3 // _a_g * alpha - movdqu xmm3, [eax + 16] - lea eax, [eax + 32] - psrlw xmm2, 8 // _r_b convert to 8 bits again - paddusb xmm0, xmm2 // + src argb - pand xmm1, xmm5 // a_g_ convert to 8 bits again - paddusb xmm0, xmm1 // + src argb - sub ecx, 4 - movdqa [edx], xmm0 - jle done - - movdqa xmm0, xmm3 // src argb - pxor xmm3, xmm4 // ~alpha - movdqa xmm2, [edx + 16] // _r_b - psrlw xmm3, 8 // alpha - pshufhw xmm3, xmm3,0F5h // 8 alpha words - pshuflw xmm3, xmm3,0F5h - pand xmm2, xmm6 // _r_b - paddw xmm3, xmm7 // 256 - alpha - pmullw xmm2, xmm3 // _r_b * alpha - movdqa xmm1, [edx + 16] // _a_g - psrlw xmm1, 8 // _a_g - por xmm0, xmm4 // set alpha to 255 - pmullw xmm1, xmm3 // _a_g * alpha - psrlw xmm2, 8 // _r_b convert to 8 bits again - paddusb xmm0, xmm2 // + src argb - pand xmm1, xmm5 // a_g_ convert to 8 bits again - paddusb xmm0, xmm1 // + src argb - sub ecx, 4 - movdqa [edx + 16], xmm0 - lea edx, [edx + 32] - jg convertloop - - done: - ret - } -} - -// Blend 1 pixel at a time, unaligned -__declspec(naked) __declspec(align(16)) -void ARGBBlendRow1_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width - pcmpeqb xmm7, xmm7 // generate constant 1 - psrlw xmm7, 15 - pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff - psrlw xmm6, 8 - pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 - psllw xmm5, 8 - pcmpeqb xmm4, xmm4 // generate mask 0xff000000 - pslld xmm4, 24 - - align 16 - convertloop: - movd xmm3, [eax] - lea eax, [eax + 4] - movdqa xmm0, xmm3 // src argb - pxor xmm3, xmm4 // ~alpha - movd xmm2, [edx] // _r_b - psrlw xmm3, 8 // alpha - pshufhw xmm3, xmm3,0F5h // 8 alpha words - pshuflw xmm3, xmm3,0F5h - pand xmm2, xmm6 // _r_b - paddw xmm3, xmm7 // 256 - alpha - pmullw xmm2, xmm3 // _r_b * alpha - movd xmm1, [edx] // _a_g - psrlw xmm1, 8 // _a_g - por xmm0, xmm4 // set alpha to 255 - pmullw xmm1, xmm3 // _a_g * alpha - psrlw xmm2, 8 // _r_b convert to 8 bits again - paddusb xmm0, xmm2 // + src argb - pand xmm1, xmm5 // a_g_ convert to 8 bits again - paddusb xmm0, xmm1 // + src argb - sub ecx, 1 - movd [edx], xmm0 - lea edx, [edx + 4] - jg convertloop - ret - } -} - -void ARGBBlendRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { - // Do 1 to 3 pixels to get destination aligned. 
- if ((uintptr_t)(dst_argb) & 15) { - int count = width; - if (count > 4 && ((intptr_t)(dst_argb) & 3) == 0) { - count = (-(intptr_t)(dst_argb) >> 2) & 3; - } - ARGBBlendRow1_SSE2(src_argb, dst_argb, count); - src_argb += count * 4; - dst_argb += count * 4; - width -= count; - } - // Do multiple of 4 pixels - if (width & ~3) { - ARGBBlendRow_Aligned_SSE2(src_argb, dst_argb, width & ~3); - } - // Do remaining 1 to 3 pixels - if (width & 3) { - src_argb += (width & ~3) * 4; - dst_argb += (width & ~3) * 4; - width &= 3; - ARGBBlendRow1_SSE2(src_argb, dst_argb, width); - } -} -#endif // HAS_ARGBBLENDROW_SSE2 - -#ifdef HAS_ARGBBLENDROW_SSSE3 -// Blend 8 pixels at a time -// Shuffle table for reversing the bytes. -static const uvec8 kShuffleAlpha = { - 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, - 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80 -}; - -// Same as SSE2, but replaces -// psrlw xmm3, 8 // alpha -// pshufhw xmm3, xmm3,0F5h // 8 alpha words -// pshuflw xmm3, xmm3,0F5h -// with.. -// pshufb xmm3, kShuffleAlpha // alpha - -// Destination aligned to 16 bytes, multiple of 4 pixels -__declspec(naked) __declspec(align(16)) -void ARGBBlendRow_Aligned_SSSE3(const uint8* src_argb, uint8* dst_argb, - int width) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width - pcmpeqb xmm7, xmm7 // generate constant 1 - psrlw xmm7, 15 - pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff - psrlw xmm6, 8 - pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 - psllw xmm5, 8 - pcmpeqb xmm4, xmm4 // generate mask 0xff000000 - pslld xmm4, 24 - - align 16 - convertloop: - movdqu xmm3, [eax] - movdqa xmm0, xmm3 // src argb - pxor xmm3, xmm4 // ~alpha - pshufb xmm3, kShuffleAlpha // alpha - movdqa xmm2, [edx] // _r_b - pand xmm2, xmm6 // _r_b - paddw xmm3, xmm7 // 256 - alpha - pmullw xmm2, xmm3 // _r_b * alpha - movdqa xmm1, [edx] // _a_g - psrlw xmm1, 8 // _a_g - por xmm0, xmm4 // set alpha to 255 - pmullw xmm1, xmm3 // _a_g * alpha - movdqu xmm3, [eax + 16] - lea eax, [eax + 32] - psrlw xmm2, 8 // _r_b convert to 8 bits again - paddusb xmm0, xmm2 // + src argb - pand xmm1, xmm5 // a_g_ convert to 8 bits again - paddusb xmm0, xmm1 // + src argb - sub ecx, 4 - movdqa [edx], xmm0 - jle done - - movdqa xmm0, xmm3 // src argb - pxor xmm3, xmm4 // ~alpha - movdqa xmm2, [edx + 16] // _r_b - pshufb xmm3, kShuffleAlpha // alpha - pand xmm2, xmm6 // _r_b - paddw xmm3, xmm7 // 256 - alpha - pmullw xmm2, xmm3 // _r_b * alpha - movdqa xmm1, [edx + 16] // _a_g - psrlw xmm1, 8 // _a_g - por xmm0, xmm4 // set alpha to 255 - pmullw xmm1, xmm3 // _a_g * alpha - psrlw xmm2, 8 // _r_b convert to 8 bits again - paddusb xmm0, xmm2 // + src argb - pand xmm1, xmm5 // a_g_ convert to 8 bits again - paddusb xmm0, xmm1 // + src argb - sub ecx, 4 - movdqa [edx + 16], xmm0 - lea edx, [edx + 32] - jg convertloop - - done: - ret - } -} - -void ARGBBlendRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { - // Do 1 to 3 pixels to get destination aligned. 
- if ((uintptr_t)(dst_argb) & 15) { - int count = width; - if (count > 4 && ((intptr_t)(dst_argb) & 3) == 0) { - count = (-(intptr_t)(dst_argb) >> 2) & 3; - } - ARGBBlendRow1_SSE2(src_argb, dst_argb, count); - src_argb += count * 4; - dst_argb += count * 4; - width -= count; - } - // Do multiple of 4 pixels - if (width & ~3) { - ARGBBlendRow_Aligned_SSSE3(src_argb, dst_argb, width & ~3); - } - // Do remaining 1 to 3 pixels - if (width & 3) { - src_argb += (width & ~3) * 4; - dst_argb += (width & ~3) * 4; - width &= 3; - ARGBBlendRow1_SSE2(src_argb, dst_argb, width); - } -} -#endif // HAS_ARGBBLENDROW_SSSE3 - - - - - - - -/////////////////////////////////////// -///////////////////// 2 source versions -/////////////////////////////////////// - - - - - - - -#ifdef HAS_ARGBBLENDROW_SSE2 -// Blend 8 pixels at a time -// Destination aligned to 16 bytes, multiple of 4 pixels -__declspec(naked) __declspec(align(16)) -void ARGBBlend2Row_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1, +void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { __asm { push esi @@ -2418,7 +2152,7 @@ void ARGBBlend2Row_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1, // Blend 1 pixel at a time, unaligned __declspec(naked) __declspec(align(16)) -void ARGBBlend2Row1_SSE2(const uint8* src_argb0, const uint8* src_argb1, +void ARGBBlendRow1_SSE2(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { __asm { push esi @@ -2467,7 +2201,7 @@ void ARGBBlend2Row1_SSE2(const uint8* src_argb0, const uint8* src_argb1, } } -void ARGBBlend2Row_SSE2(const uint8* src_argb0, const uint8* src_argb1, +void ARGBBlendRow_Any_SSE2(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { // Do 1 to 3 pixels to get destination aligned. if ((uintptr_t)(dst_argb) & 15) { @@ -2475,7 +2209,7 @@ void ARGBBlend2Row_SSE2(const uint8* src_argb0, const uint8* src_argb1, if (count > 4 && ((intptr_t)(dst_argb) & 3) == 0) { count = (-(intptr_t)(dst_argb) >> 2) & 3; } - ARGBBlend2Row1_SSE2(src_argb0, src_argb1, dst_argb, count); + ARGBBlendRow1_SSE2(src_argb0, src_argb1, dst_argb, count); src_argb0 += count * 4; src_argb1 += count * 4; dst_argb += count * 4; @@ -2483,7 +2217,7 @@ void ARGBBlend2Row_SSE2(const uint8* src_argb0, const uint8* src_argb1, } // Do multiple of 4 pixels if (width & ~3) { - ARGBBlend2Row_Aligned_SSE2(src_argb0, src_argb1, dst_argb, width & ~3); + ARGBBlendRow_Aligned_SSE2(src_argb0, src_argb1, dst_argb, width & ~3); } // Do remaining 1 to 3 pixels if (width & 3) { @@ -2491,12 +2225,18 @@ void ARGBBlend2Row_SSE2(const uint8* src_argb0, const uint8* src_argb1, src_argb1 += (width & ~3) * 4; dst_argb += (width & ~3) * 4; width &= 3; - ARGBBlend2Row1_SSE2(src_argb0, src_argb1, dst_argb, width); + ARGBBlendRow1_SSE2(src_argb0, src_argb1, dst_argb, width); } } #endif // HAS_ARGBBLENDROW_SSE2 #ifdef HAS_ARGBBLENDROW_SSSE3 +// Shuffle table for reversing the bytes. +static const uvec8 kShuffleAlpha = { + 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, + 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80 +}; + // Blend 8 pixels at a time // Shuffle table for reversing the bytes. 
@@ -2509,7 +2249,7 @@ void ARGBBlend2Row_SSE2(const uint8* src_argb0, const uint8* src_argb1, // Destination aligned to 16 bytes, multiple of 4 pixels __declspec(naked) __declspec(align(16)) -void ARGBBlend2Row_Aligned_SSSE3(const uint8* src_argb0, const uint8* src_argb1, +void ARGBBlendRow_Aligned_SSSE3(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { __asm { push esi @@ -2577,7 +2317,7 @@ void ARGBBlend2Row_Aligned_SSSE3(const uint8* src_argb0, const uint8* src_argb1, } } -void ARGBBlend2Row_SSSE3(const uint8* src_argb0, const uint8* src_argb1, +void ARGBBlendRow_Any_SSSE3(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { // Do 1 to 3 pixels to get destination aligned. if ((uintptr_t)(dst_argb) & 15) { @@ -2585,7 +2325,7 @@ void ARGBBlend2Row_SSSE3(const uint8* src_argb0, const uint8* src_argb1, if (count > 4 && ((intptr_t)(dst_argb) & 3) == 0) { count = (-(intptr_t)(dst_argb) >> 2) & 3; } - ARGBBlend2Row1_SSE2(src_argb0, src_argb1, dst_argb, count); + ARGBBlendRow1_SSE2(src_argb0, src_argb1, dst_argb, count); src_argb0 += count * 4; src_argb1 += count * 4; dst_argb += count * 4; @@ -2593,7 +2333,7 @@ void ARGBBlend2Row_SSSE3(const uint8* src_argb0, const uint8* src_argb1, } // Do multiple of 4 pixels if (width & ~3) { - ARGBBlend2Row_Aligned_SSSE3(src_argb0, src_argb1, dst_argb, width & ~3); + ARGBBlendRow_Aligned_SSSE3(src_argb0, src_argb1, dst_argb, width & ~3); } // Do remaining 1 to 3 pixels if (width & 3) { @@ -2601,7 +2341,7 @@ void ARGBBlend2Row_SSSE3(const uint8* src_argb0, const uint8* src_argb1, src_argb1 += (width & ~3) * 4; dst_argb += (width & ~3) * 4; width &= 3; - ARGBBlend2Row1_SSE2(src_argb0, src_argb1, dst_argb, width); + ARGBBlendRow1_SSE2(src_argb0, src_argb1, dst_argb, width); } } #endif // HAS_ARGBBLENDROW_SSSE3 diff --git a/source/scale.cc b/source/scale.cc index bd9127095..60b39a519 100644 --- a/source/scale.cc +++ b/source/scale.cc @@ -55,7 +55,7 @@ void SetUseReferenceImpl(bool use) { * */ -#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM) +#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__) #define HAS_SCALEROWDOWN2_NEON void ScaleRowDown2_NEON(const uint8* src_ptr, int /* src_stride */, uint8* dst, int dst_width) { @@ -566,12 +566,13 @@ static void ScaleFilterRows_NEON(uint8* dst_ptr, */ // Constants for SSE2 code -#elif defined(_M_IX86) || defined(__i386__) || defined(__x86_64__) && \ - !defined(YUV_DISABLE_ASM) +#elif !defined(YUV_DISABLE_ASM) && \ + (defined(_M_IX86) || defined(__i386__) || defined(__x86_64__)) + #if defined(_MSC_VER) #define TALIGN16(t, var) __declspec(align(16)) t _ ## var -#elif defined(__APPLE__) || defined(__MINGW32__) || defined(__CYGWIN__) && \ - defined(__i386__) +#elif defined(__i386__) && \ + (defined(__APPLE__) || defined(__MINGW32__) || defined(__CYGWIN__)) #define TALIGN16(t, var) t var __attribute__((aligned(16))) #else #define TALIGN16(t, var) t _ ## var __attribute__((aligned(16))) @@ -670,12 +671,12 @@ extern "C" TALIGN16(const uint16, scaleab2[8]) = { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 }; #endif -#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM) +#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86) #define HAS_SCALEROWDOWN2_SSE2 // Reads 32 pixels, throws half away and writes 16 pixels. // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. 
-__declspec(naked) +__declspec(naked) __declspec(align(16)) static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride, uint8* dst_ptr, int dst_width) { __asm { @@ -704,7 +705,7 @@ static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride, } // Blends 32x2 rectangle to 16x1. // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. -__declspec(naked) +__declspec(naked) __declspec(align(16)) void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride, uint8* dst_ptr, int dst_width) { __asm { @@ -749,7 +750,7 @@ void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride, #define HAS_SCALEROWDOWN4_SSE2 // Point samples 32 pixels to 8 pixels. // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. -__declspec(naked) +__declspec(naked) __declspec(align(16)) static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride, uint8* dst_ptr, int dst_width) { __asm { @@ -780,7 +781,7 @@ static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride, // Blends 32x4 rectangle to 8x1. // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. -__declspec(naked) +__declspec(naked) __declspec(align(16)) static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride, uint8* dst_ptr, int dst_width) { __asm { @@ -842,7 +843,7 @@ static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride, #define HAS_SCALEROWDOWN8_SSE2 // Point samples 32 pixels to 4 pixels. // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned. -__declspec(naked) +__declspec(naked) __declspec(align(16)) static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride, uint8* dst_ptr, int dst_width) { __asm { @@ -874,7 +875,7 @@ static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride, // Blends 32x8 rectangle to 4x1. // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned. -__declspec(naked) +__declspec(naked) __declspec(align(16)) static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride, uint8* dst_ptr, int dst_width) { __asm { @@ -952,7 +953,7 @@ static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride, // Note that movdqa+palign may be better than movdqu. // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. -__declspec(naked) +__declspec(naked) __declspec(align(16)) static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride, uint8* dst_ptr, int dst_width) { __asm { @@ -1001,7 +1002,7 @@ static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride, // Note that movdqa+palign may be better than movdqu. // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. -__declspec(naked) +__declspec(naked) __declspec(align(16)) static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride, uint8* dst_ptr, int dst_width) { __asm { @@ -1059,7 +1060,7 @@ static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride, // Note that movdqa+palign may be better than movdqu. // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. 
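The hunks above only add __declspec(align(16)) next to __declspec(naked) on the ScaleRowDown2/4/8 kernels, but the surrounding comments ("reads 32 pixels, throws half away", "blends 32x2 rectangle to 16x1") describe the two downscale strategies the assembly implements: point sampling and box filtering. A rough scalar sketch of the /2 pair, written as an illustration rather than copied from libyuv's own C fallbacks (in particular, which pixel of each source pair the point sampler keeps is an assumption here):

#include <stdint.h>

// Point-sample /2: keep one pixel of every horizontal pair and ignore the
// second source row. The stride parameter is kept only to mirror the row
// function signature.
static void ScaleRowDown2_Sketch(const uint8_t* src_ptr, int src_stride,
                                 uint8_t* dst_ptr, int dst_width) {
  (void)src_stride;
  for (int x = 0; x < dst_width; ++x) {
    dst_ptr[x] = src_ptr[x * 2];
  }
}

// Box-filter /2: average each 2x2 block with rounding, which is what
// "blends 32x2 rectangle to 16x1" amounts to per output pixel.
static void ScaleRowDown2Int_Sketch(const uint8_t* src_ptr, int src_stride,
                                    uint8_t* dst_ptr, int dst_width) {
  const uint8_t* next = src_ptr + src_stride;  // second row of the block
  for (int x = 0; x < dst_width; ++x) {
    dst_ptr[x] = (uint8_t)((src_ptr[x * 2] + src_ptr[x * 2 + 1] +
                            next[x * 2] + next[x * 2 + 1] + 2) >> 2);
  }
}

The /4 and /8 kernels that follow are the same two ideas applied to 4x4 and 8x8 blocks; the alignment requirements noted in the comments presumably come from the aligned movdqa loads in the assembly.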
-__declspec(naked) +__declspec(naked) __declspec(align(16)) static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride, uint8* dst_ptr, int dst_width) { __asm { @@ -1122,7 +1123,7 @@ static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride, // 3/8 point sampler // Scale 32 pixels to 12 -__declspec(naked) +__declspec(naked) __declspec(align(16)) static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride, uint8* dst_ptr, int dst_width) { __asm { @@ -1154,7 +1155,7 @@ static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride, } // Scale 16x3 pixels to 6x1 with interpolation -__declspec(naked) +__declspec(naked) __declspec(align(16)) static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride, uint8* dst_ptr, int dst_width) { __asm { @@ -1221,7 +1222,7 @@ static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride, } // Scale 16x2 pixels to 6x1 with interpolation -__declspec(naked) +__declspec(naked) __declspec(align(16)) static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride, uint8* dst_ptr, int dst_width) { __asm { @@ -1269,7 +1270,7 @@ static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride, #define HAS_SCALEADDROWS_SSE2 // Reads 16xN bytes and produces 16 shorts at a time. -__declspec(naked) +__declspec(naked) __declspec(align(16)) static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride, uint16* dst_ptr, int src_width, int src_height) { @@ -1329,7 +1330,7 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride, // Bilinear row filtering combines 16x2 -> 16x1. SSE2 version. #define HAS_SCALEFILTERROWS_SSE2 -__declspec(naked) +__declspec(naked) __declspec(align(16)) static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr, int src_stride, int dst_width, int source_y_fraction) { @@ -1420,7 +1421,7 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr, // Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version. #define HAS_SCALEFILTERROWS_SSSE3 -__declspec(naked) +__declspec(naked) __declspec(align(16)) static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr, int src_stride, int dst_width, int source_y_fraction) { @@ -1501,7 +1502,7 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr, // Note that movdqa+palign may be better than movdqu. // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. -__declspec(naked) +__declspec(naked) __declspec(align(16)) static void ScaleFilterCols34_SSSE3(uint8* dst_ptr, const uint8* src_ptr, int dst_width) { __asm { @@ -1547,7 +1548,7 @@ static void ScaleFilterCols34_SSSE3(uint8* dst_ptr, const uint8* src_ptr, } } -#elif defined(__x86_64__) || defined(__i386__) && !defined(YUV_DISABLE_ASM) +#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__)) // GCC versions of row functions are verbatim conversions from Visual C. 
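ScaleFilterRows_SSE2 and ScaleFilterRows_SSSE3 in this chunk (again only gaining the align(16) attribute) combine two source rows into one, 16 pixels at a time, weighted by source_y_fraction. A hedged scalar sketch of that vertical blend, an illustration of mine rather than libyuv's C path (the exact rounding and the handling of fraction 0 and of trailing pixels are assumptions):

#include <stdint.h>

// Bilinear row filter: blend a row with the row src_stride bytes below it.
// source_y_fraction is an 8-bit weight, 0 meaning "all upper row" and
// values toward 256 weighting the lower row more heavily.
static void ScaleFilterRows_Sketch(uint8_t* dst_ptr, const uint8_t* src_ptr,
                                   int src_stride, int dst_width,
                                   int source_y_fraction) {
  int y1 = source_y_fraction;  // weight of the lower row
  int y0 = 256 - y1;           // weight of the upper row
  for (int x = 0; x < dst_width; ++x) {
    dst_ptr[x] =
        (uint8_t)((src_ptr[x] * y0 + src_ptr[x + src_stride] * y1) >> 8);
  }
}

The SSSE3 variant presumably exists because pmaddubsw can apply both weights in one packed multiply-add, while the SSE2 version has to unpack to 16 bits first; the patch itself only touches the declspec line on each.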
// Generated using gcc disassembly on Visual C object file: @@ -1766,7 +1767,7 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride, ); } -#if defined(__i386__) +#if !defined(YUV_DISABLE_ASM) && defined(__i386__) extern "C" void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride, uint8* dst_ptr, int dst_width); asm( @@ -2260,7 +2261,7 @@ extern "C" void ScaleFilterRows_SSSE3(uint8* dst_ptr, "ret \n" ); -#elif defined(__x86_64__) +#elif !defined(YUV_DISABLE_ASM) && defined(__x86_64__) static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride, uint8* dst_ptr, int dst_width) { asm volatile (
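This last chunk applies the same guard tightening to the GCC builds: the out-of-line asm() blocks for 32-bit x86 and the inline-asm x86_64 versions of ScaleRowDown8Int_SSE2 and ScaleFilterRows_SSSE3 are now also skipped when YUV_DISABLE_ASM is defined, matching the MSVC section above. For orientation only, here is a behavioral sketch of what an 8x box-filter row such as ScaleRowDown8Int computes ("blends 32x8 rectangle to 4x1" in the earlier comment); the function name, loop structure and rounding are mine, and the real code does this with packed adds in assembly or via the project's own C fallback rather than a naive 8x8 sum.

#include <stdint.h>

// Average each 8x8 block of source bytes down to one destination byte,
// with rounding. dst_width is the number of output pixels.
static void ScaleRowDown8Box_Sketch(const uint8_t* src_ptr, int src_stride,
                                    uint8_t* dst_ptr, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    int sum = 0;
    for (int row = 0; row < 8; ++row) {
      for (int col = 0; col < 8; ++col) {
        sum += src_ptr[row * src_stride + x * 8 + col];
      }
    }
    dst_ptr[x] = (uint8_t)((sum + 32) >> 6);  // divide by 64 with rounding
  }
}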