Mirror of https://chromium.googlesource.com/libyuv/libyuv
Synced 2025-12-06 08:46:47 +08:00

Remove old alpha blend, expose GetARGBBlend, fix ComputeSumSquareErrorPlane on SSE2

BUG=29
TEST=none
Review URL: https://webrtc-codereview.appspot.com/469005
git-svn-id: http://libyuv.googlecode.com/svn/trunk@234 16f28f9a-4ce2-e073-06de-1de4eb20be90

parent c757f308ea
commit d2f4413d29
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 233
+Version: 234
 License: BSD
 License File: LICENSE
@@ -133,24 +133,19 @@ int ARGBCopy(const uint8* src_argb, int src_stride_argb,
               uint8* dst_argb, int dst_stride_argb,
               int width, int height);
 
-// Alpha Blend ARGB row of pixels.
-void ARGBBlendRow(const uint8* src_argb, uint8* dst_argb, int width);
+typedef void (*ARGBBlendRow)(const uint8* src_argb0,
+                             const uint8* src_argb1,
+                             uint8* dst_argb, int width);
 
-// Alpha Blend 2 rows of ARGB pixels and store to destination.
-void ARGBBlend2Row(const uint8* src_argb0, const uint8* src_argb1,
-                   uint8* dst_argb, int width);
+// Get function to Alpha Blend ARGB pixels and store to destination.
+ARGBBlendRow GetARGBBlend(uint8* dst_argb, int dst_stride_argb, int width);
 
-// Alpha Blend ARGB.
-int ARGBBlend(const uint8* src_argb, int src_stride_argb,
+// Alpha Blend ARGB images and store to destination.
+int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
+              const uint8* src_argb1, int src_stride_argb1,
               uint8* dst_argb, int dst_stride_argb,
               int width, int height);
 
-// Alpha Blend 2 ARGB images and store to destination.
-int ARGB2Blend(const uint8* src_argb0, int src_stride_argb0,
-               const uint8* src_argb1, int src_stride_argb1,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
-
 // Convert I422 to YUY2.
 int I422ToYUY2(const uint8* src_y, int src_stride_y,
                const uint8* src_u, int src_stride_u,
@@ -20,7 +20,7 @@ extern "C" {
 
 // Supported rotation
 enum RotationMode {
-  kRotate0 = 0, // No rotation
+  kRotate0 = 0,  // No rotation
   kRotate90 = 90,  // Rotate 90 degrees clockwise
   kRotate180 = 180,  // Rotate 180 degrees
   kRotate270 = 270,  // Rotate 270 degrees clockwise
@@ -11,7 +11,7 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define INCLUDE_LIBYUV_VERSION 233
+#define LIBYUV_VERSION 234
 
 #endif // INCLUDE_LIBYUV_VERSION_H_
@@ -25,18 +25,37 @@ namespace libyuv {
 extern "C" {
 #endif
 
-// hash seed of 5381 recommended.
-uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) {
+// Internal C version of HashDjb2 with int sized count for efficiency.
+static uint32 HashDjb2_C(const uint8* src, int count, uint32 seed) {
   uint32 hash = seed;
-  if (count > 0) {
-    do {
-      hash = hash * 33 + *src++;
-    } while (--count);
+  for (int i = 0; i < count; ++i) {
+    hash += (hash << 5) + src[i];
   }
   return hash;
 }
 
+// hash seed of 5381 recommended.
+uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) {
+  const int kBlockSize = 1 << 15;  // 32768;
+  while (count >= static_cast<uint64>(kBlockSize)) {
+    seed = HashDjb2_C(src, kBlockSize, seed);
+    src += kBlockSize;
+    count -= kBlockSize;
+  }
+  int remainder = static_cast<int>(count) & ~15;
+  if (remainder) {
+    seed = HashDjb2_C(src, remainder, seed);
+    src += remainder;
+    count -= remainder;
+  }
+  remainder = static_cast<int>(count) & 15;
+  if (remainder) {
+    seed = HashDjb2_C(src, remainder, seed);
+  }
+  return seed;
+}
+
-#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM)
+#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
 #define HAS_SUMSQUAREERROR_NEON
 
 static uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b,
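A note on the rewrite above: the new loop body is the same djb2 recurrence as the old do/while, since hash += (hash << 5) + src[i] equals hash * 33 + src[i]. Below is a minimal standalone sketch of the block-splitting technique the new HashDjb2 wrapper uses to keep the hot loop's counter in a plain int while accepting a 64-bit count; the names here are hypothetical, not libyuv's:

    #include <stdint.h>

    // Same recurrence as HashDjb2_C: hash = hash * 33 + byte.
    static uint32_t Djb2C(const uint8_t* src, int count, uint32_t seed) {
      uint32_t hash = seed;
      for (int i = 0; i < count; ++i) {
        hash += (hash << 5) + src[i];
      }
      return hash;
    }

    // Walk a possibly 64-bit count in 32 KB blocks so the inner loop
    // indexes with a plain int.
    uint32_t Djb2Blocked(const uint8_t* src, uint64_t count, uint32_t seed) {
      const int kBlockSize = 1 << 15;  // 32768 bytes per block.
      while (count >= (uint64_t)kBlockSize) {
        seed = Djb2C(src, kBlockSize, seed);
        src += kBlockSize;
        count -= kBlockSize;
      }
      if (count) {
        seed = Djb2C(src, (int)count, seed);
      }
      return seed;
    }

libyuv's own wrapper additionally splits the tail into a multiple-of-16 chunk plus a remainder, so an aligned SIMD kernel can later be slotted in for the large pieces.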
@@ -75,9 +94,9 @@ static uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b,
   return sse;
 }
 
-#elif defined(_M_IX86) && !defined(YUV_DISABLE_ASM)
+#elif !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
 #define HAS_SUMSQUAREERROR_SSE2
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 static uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b,
                                   int count) {
   __asm {
@@ -94,7 +113,7 @@ static uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b,
     movdqa xmm2, [eax + edx]
     lea eax, [eax + 16]
     sub ecx, 16
-    movdqa xmm3, xmm1
+    movdqa xmm3, xmm1  // abs trick
    psubusb xmm1, xmm2
    psubusb xmm2, xmm3
    por xmm1, xmm2
@@ -116,7 +135,7 @@ static uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b,
   }
 }
 
-#elif defined(__x86_64__) || defined(__i386__) && !defined(YUV_DISABLE_ASM)
+#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
 #define HAS_SUMSQUAREERROR_SSE2
 static uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b,
                                   int count) {
@@ -167,11 +186,9 @@ static uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b,
 static uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b,
                                int count) {
   uint32 sse = 0u;
-  for (int x = 0; x < count; ++x) {
-    int diff = src_a[0] - src_b[0];
+  for (int i = 0; i < count; ++i) {
+    int diff = src_a[i] - src_b[i];
     sse += static_cast<uint32>(diff * diff);
-    src_a += 1;
-    src_b += 1;
   }
   return sse;
 }
@@ -187,6 +204,7 @@ uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
 #elif defined(HAS_SUMSQUAREERROR_SSE2)
   if (TestCpuFlag(kCpuHasSSE2) &&
       IS_ALIGNED(src_a, 16) && IS_ALIGNED(src_b, 16)) {
+    // Note only used for multiples of 16 so count is not checked.
     SumSquareError = SumSquareError_SSE2;
   }
 #endif
@@ -225,8 +243,9 @@ uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
     SumSquareError = SumSquareError_NEON;
   }
 #elif defined(HAS_SUMSQUAREERROR_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      IS_ALIGNED(src_a, 16) && IS_ALIGNED(src_b, 16)) {
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
+      IS_ALIGNED(src_a, 16) && IS_ALIGNED(stride_a, 16) &&
+      IS_ALIGNED(src_b, 16) && IS_ALIGNED(stride_b, 16)) {
     SumSquareError = SumSquareError_SSE2;
   }
 #endif
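The stricter condition in the last hunk is the ComputeSumSquareErrorPlane fix from the commit title: the aligned SSE2 kernel is now chosen only when the width is a multiple of 16 and the row strides, not just the base pointers, are 16-byte aligned. A hedged usage sketch of the public entry point (PlaneMse is a hypothetical helper; the ComputeSumSquareErrorPlane signature is taken from the context lines above):

    #include "libyuv/basic_types.h"
    #include "libyuv/compare.h"

    // Mean squared error between two equally sized planes.
    double PlaneMse(const uint8* a, int stride_a,
                    const uint8* b, int stride_b,
                    int width, int height) {
      uint64 sse = libyuv::ComputeSumSquareErrorPlane(a, stride_a,
                                                      b, stride_b,
                                                      width, height);
      return static_cast<double>(sse) /
             (static_cast<double>(width) * static_cast<double>(height));
    }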
@@ -61,9 +61,9 @@ int I420Copy(const uint8* src_y, int src_stride_y,
   return 0;
 }
 
-#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM)
+#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
 #define HAS_HALFROW_SSE2
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 static void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
                          uint8* dst_uv, int pix) {
   __asm {
@@ -86,7 +86,7 @@ static void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
   }
 }
 
-#elif defined(__x86_64__) || defined(__i386__) && !defined(YUV_DISABLE_ASM)
+#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
 #define HAS_HALFROW_SSE2
 static void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
                          uint8* dst_uv, int pix) {
@@ -179,12 +179,13 @@ int I422ToI420(const uint8* src_y, int src_stride_y,
 
 // Blends 32x2 pixels to 16x1
 // source in scale.cc
-#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM)
+#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
 #define HAS_SCALEROWDOWN2_NEON
 void ScaleRowDown2Int_NEON(const uint8* src_ptr, int src_stride,
                            uint8* dst, int dst_width);
-#elif defined(_M_IX86) || defined(__x86_64__) || defined(__i386__) && \
-    !defined(YUV_DISABLE_ASM)
+#elif !defined(YUV_DISABLE_ASM) && \
+    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
 
 void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
                            uint8* dst_ptr, int dst_width);
 #endif
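For reference, "blends 32x2 pixels to 16x1" describes a 2x2 box filter: each output byte is the rounded average of two horizontally adjacent bytes taken from two adjacent source rows. A plain C sketch of that contract (a hypothetical standalone version, not the scale.cc implementation itself):

    #include <stdint.h>

    static void ScaleRowDown2IntC(const uint8_t* src_ptr, int src_stride,
                                  uint8_t* dst, int dst_width) {
      const uint8_t* s = src_ptr;               // top row
      const uint8_t* t = src_ptr + src_stride;  // bottom row
      for (int x = 0; x < dst_width; ++x) {
        // Rounded average of a 2x2 block.
        dst[x] = (uint8_t)((s[0] + s[1] + t[0] + t[1] + 2) >> 2);
        s += 2;
        t += 2;
      }
    }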
@@ -450,9 +451,9 @@ int M420ToI420(const uint8* src_m420, int src_stride_m420,
                     width, height);
 }
 
-#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM)
+#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
 #define HAS_SPLITYUY2_SSE2
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 static void SplitYUY2_SSE2(const uint8* src_yuy2,
                            uint8* dst_y, uint8* dst_u, uint8* dst_v, int pix) {
   __asm {
@@ -498,7 +499,7 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2,
   }
 }
 
-#elif defined(__x86_64__) || defined(__i386__) && !defined(YUV_DISABLE_ASM)
+#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
 #define HAS_SPLITYUY2_SSE2
 static void SplitYUY2_SSE2(const uint8* src_yuy2, uint8* dst_y,
                            uint8* dst_u, uint8* dst_v, int pix) {
@@ -205,9 +205,9 @@ int I400Copy(const uint8* src_y, int src_stride_y,
 // UYVY - Macro-pixel = 2 image pixels
 // U0Y0V0Y1
 
-#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM)
+#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
 #define HAS_I42XTOYUY2ROW_SSE2
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 static void I42xToYUY2Row_SSE2(const uint8* src_y,
                                const uint8* src_u,
                                const uint8* src_v,
@@ -246,7 +246,7 @@ static void I42xToYUY2Row_SSE2(const uint8* src_y,
 }
 
 #define HAS_I42XTOUYVYROW_SSE2
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 static void I42xToUYVYRow_SSE2(const uint8* src_y,
                                const uint8* src_u,
                                const uint8* src_v,
@@ -283,7 +283,7 @@ static void I42xToUYVYRow_SSE2(const uint8* src_y,
     ret
   }
 }
-#elif defined(__x86_64__) || defined(__i386__) && !defined(YUV_DISABLE_ASM)
+#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
 #define HAS_I42XTOYUY2ROW_SSE2
 static void I42xToYUY2Row_SSE2(const uint8* src_y,
                                const uint8* src_u,
@@ -24,9 +24,9 @@ extern "C" {
 // and vst would select which 2 components to write. The low level would need
 // to be ARGBToBG, ARGBToGB, ARGBToRG, ARGBToGR
 
-#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM)
+#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
 #define HAS_ARGBTOBAYERROW_SSSE3
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 static void ARGBToBayerRow_SSSE3(const uint8* src_argb,
                                  uint8* dst_bayer, uint32 selector, int pix) {
   __asm {
@@ -36,6 +36,7 @@ static void ARGBToBayerRow_SSSE3(const uint8* src_argb,
     mov ecx, [esp + 16]  // pix
     pshufd xmm5, xmm5, 0
 
+    align 16
  wloop:
     movdqa xmm0, [eax]
    lea eax, [eax + 16]
@@ -48,7 +49,7 @@ static void ARGBToBayerRow_SSSE3(const uint8* src_argb,
   }
 }
 
-#elif defined(__x86_64__) || defined(__i386__) && !defined(YUV_DISABLE_ASM)
+#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
 
 #define HAS_ARGBTOBAYERROW_SSSE3
 static void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
@@ -137,87 +137,38 @@ int ARGBCopy(const uint8* src_argb, int src_stride_argb,
   return 0;
 }
 
-// Alpha Blend ARGB
-void ARGBBlendRow(const uint8* src_argb, uint8* dst_argb, int width) {
-#if defined(HAS_ARGBBLENDROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    ARGBBlendRow_SSSE3(src_argb, dst_argb, width);
-    return;
-  }
-#endif
-#if defined(HAS_ARGBBLENDROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    ARGBBlendRow_SSE2(src_argb, dst_argb, width);
-    return;
-  }
-#endif
-  ARGBBlendRow_C(src_argb, dst_argb, width);
-}
-
-// Alpha Blend 2 rows of ARGB pixels and store to destination.
-void ARGBBlend2Row(const uint8* src_argb0, const uint8* src_argb1,
-                   uint8* dst_argb, int width) {
-#if defined(HAS_ARGBBLENDROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    ARGBBlend2Row_SSSE3(src_argb0, src_argb1, dst_argb, width);
-    return;
-  }
-#endif
-#if defined(HAS_ARGBBLENDROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    ARGBBlend2Row_SSE2(src_argb0, src_argb1, dst_argb, width);
-    return;
-  }
-#endif
-  ARGBBlend2Row_C(src_argb0, src_argb1, dst_argb, width);
-}
-
-// Alpha Blend ARGB
-// TODO(fbarchard): Call 3 pointer low levels to reduce code size.
-int ARGBBlend(const uint8* src_argb, int src_stride_argb,
-              uint8* dst_argb, int dst_stride_argb,
-              int width, int height) {
-  if (!src_argb || !dst_argb || width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    src_argb = src_argb + (height - 1) * src_stride_argb;
-    src_stride_argb = -src_stride_argb;
-  }
-
-  void (*ARGBBlendRow)(const uint8* src_argb, uint8* dst_argb, int width) =
-      ARGBBlendRow_C;
-#if defined(HAS_ARGBBLENDROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    ARGBBlendRow = ARGBBlendRow_SSE2;
-    if (IS_ALIGNED(width, 4) &&
-        IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-      ARGBBlendRow = ARGBBlendRow_Aligned_SSE2;
-    }
-  }
-#endif
-#if defined(HAS_ARGBBLENDROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    ARGBBlendRow = ARGBBlendRow_SSSE3;
-    if (IS_ALIGNED(width, 4) &&
-        IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-      ARGBBlendRow = ARGBBlendRow_Aligned_SSSE3;
-    }
-  }
-#endif
-
-  for (int y = 0; y < height; ++y) {
-    ARGBBlendRow(src_argb, dst_argb, width);
-    src_argb += src_stride_argb;
-    dst_argb += dst_stride_argb;
-  }
-  return 0;
-}
+// Get a blender that optimized for the CPU, alignment and pixel count.
+// As there are 6 blenders to choose from, the caller should try to use
+// the same blend function for all pixels if possible.
+ARGBBlendRow GetARGBBlend(uint8* dst_argb, int dst_stride_argb, int width) {
+  void (*ARGBBlendRow)(const uint8* src_argb, const uint8* src_argb1,
+                       uint8* dst_argb, int width) = ARGBBlendRow_C;
+#if defined(HAS_ARGBBLENDROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGBBlendRow = ARGBBlendRow1_SSE2;
+    if (width >= 4) {
+      ARGBBlendRow = ARGBBlendRow_Any_SSE2;
+      if (IS_ALIGNED(width, 4) &&
+          IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+        ARGBBlendRow = ARGBBlendRow_Aligned_SSE2;
+      }
+    }
+  }
+#endif
+#if defined(HAS_ARGBBLENDROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 4) {
+    ARGBBlendRow = ARGBBlendRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 4) &&
+        IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+      ARGBBlendRow = ARGBBlendRow_Aligned_SSSE3;
+    }
+  }
+#endif
+  return ARGBBlendRow;
+}
 
 // Alpha Blend 2 ARGB images and store to destination.
-int ARGB2Blend(const uint8* src_argb0, int src_stride_argb0,
+int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
               const uint8* src_argb1, int src_stride_argb1,
               uint8* dst_argb, int dst_stride_argb,
               int width, int height) {
@@ -230,30 +181,12 @@ int ARGB2Blend(const uint8* src_argb0, int src_stride_argb0,
     dst_argb = dst_argb + (height - 1) * dst_stride_argb;
     dst_stride_argb = -dst_stride_argb;
   }
 
-  void (*ARGBBlend2Row)(const uint8* src_argb, const uint8* src_argb1,
-                        uint8* dst_argb, int width) = ARGBBlend2Row_C;
-#if defined(HAS_ARGBBLENDROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    ARGBBlend2Row = ARGBBlend2Row_SSE2;
-    if (IS_ALIGNED(width, 4) &&
-        IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-      ARGBBlend2Row = ARGBBlend2Row_Aligned_SSE2;
-    }
-  }
-#endif
-#if defined(HAS_ARGBBLENDROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    ARGBBlend2Row = ARGBBlend2Row_SSSE3;
-    if (IS_ALIGNED(width, 4) &&
-        IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-      ARGBBlend2Row = ARGBBlend2Row_Aligned_SSSE3;
-    }
-  }
-#endif
+  void (*ARGBBlendRow)(const uint8* src_argb, const uint8* src_argb1,
+                       uint8* dst_argb, int width) =
+      GetARGBBlend(dst_argb, dst_stride_argb, width);
 
   for (int y = 0; y < height; ++y) {
-    ARGBBlend2Row(src_argb0, src_argb1, dst_argb, width);
+    ARGBBlendRow(src_argb0, src_argb1, dst_argb, width);
     src_argb0 += src_stride_argb0;
     src_argb1 += src_stride_argb1;
     dst_argb += dst_stride_argb;
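The comment on GetARGBBlend states the intended contract: resolve one of the six row blenders up front, then reuse it for every row, exactly as the rewritten ARGBBlend above does. A hedged sketch of that call pattern (BlendImage is a hypothetical helper, not part of libyuv):

    // Blend src0 over src1 into dst, resolving the row function once.
    void BlendImage(const uint8* src0, int stride0,
                    const uint8* src1, int stride1,
                    uint8* dst, int dst_stride, int width, int height) {
      ARGBBlendRow blend_row = GetARGBBlend(dst, dst_stride, width);
      for (int y = 0; y < height; ++y) {
        blend_row(src0, src1, dst, width);
        src0 += stride0;
        src1 += stride1;
        dst += dst_stride;
      }
    }

Fetching the blender once matters because the choice depends on the destination pointer's alignment, which is constant per image but would otherwise be re-tested per row.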
@@ -725,7 +658,7 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y,
 // SetRow8 writes 'count' bytes using a 32 bit value repeated
 // SetRow32 writes 'count' words using a 32 bit value repeated
 
-#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM)
+#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
 #define HAS_SETROW_NEON
 static void SetRow8_NEON(uint8* dst, uint32 v32, int count) {
   asm volatile (
@@ -749,9 +682,9 @@ static void SetRows32_NEON(uint8* dst, uint32 v32, int width,
   }
 }
 
-#elif defined(_M_IX86) && !defined(YUV_DISABLE_ASM)
+#elif !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
 #define HAS_SETROW_X86
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 static void SetRow8_X86(uint8* dst, uint32 v32, int count) {
   __asm {
     mov edx, edi
@@ -765,7 +698,7 @@ static void SetRow8_X86(uint8* dst, uint32 v32, int count) {
   }
 }
 
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 static void SetRows32_X86(uint8* dst, uint32 v32, int width,
                           int dst_stride, int height) {
   __asm {
@@ -793,7 +726,7 @@ static void SetRows32_X86(uint8* dst, uint32 v32, int width,
   }
 }
 
-#elif defined(__x86_64__) || defined(__i386__) && !defined(YUV_DISABLE_ASM)
+#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
 #define HAS_SETROW_X86
 static void SetRow8_X86(uint8* dst, uint32 v32, int width) {
   size_t width_tmp = static_cast<size_t>(width);
@@ -903,6 +836,7 @@ int I420Rect(uint8* dst_y, int dst_stride_y,
   return 0;
 }
 
+// TODO(fbarchard): Add TestCpuFlag(kCpuHasX86) to allow C code to be tested.
 // Draw a rectangle into ARGB
 int ARGBRect(uint8* dst_argb, int dst_stride_argb,
              int dst_x, int dst_y,
@@ -916,12 +850,14 @@ int ARGBRect(uint8* dst_argb, int dst_stride_argb,
   uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
 #if defined(HAS_SETROW_X86)
   SetRows32_X86(dst, value, width, dst_stride_argb, height);
-#elif defined(HAS_SETROW_NEON)
+#else
+#if defined(HAS_SETROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16) &&
       IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
     SetRows32_NEON(dst, value, width, dst_stride_argb, height);
+    return 0;
   }
+#endif
   SetRows32_C(dst, value, width, dst_stride_argb, height);
 #endif
   return 0;
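The SetRow8/SetRows32 helpers are memset-style fills; the ARGBRect path above stores one full 32-bit ARGB word per pixel. A plain C sketch of the SetRows32 contract (a hypothetical standalone version; the real code dispatches to the X86/NEON variants shown, and rows of pixel data are assumed 4-byte aligned):

    #include <stdint.h>

    static void SetRows32C(uint8_t* dst, uint32_t v32, int width,
                           int dst_stride, int height) {
      for (int y = 0; y < height; ++y) {
        uint32_t* d = (uint32_t*)(dst + y * dst_stride);
        for (int x = 0; x < width; ++x) {
          d[x] = v32;  // one ARGB word per pixel
        }
      }
    }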
@@ -21,8 +21,8 @@ namespace libyuv {
 extern "C" {
 #endif
 
-#if (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \
-    !defined(YUV_DISABLE_ASM)
+#if !defined(YUV_DISABLE_ASM) && \
+    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
 #if defined(__APPLE__) && defined(__i386__)
 #define DECLARE_FUNCTION(name) \
     ".text \n" \
@@ -59,9 +59,9 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
                          int width);
 #endif
 
-#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM)
+#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
 #define HAS_TRANSPOSE_WX8_SSSE3
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
                                uint8* dst, int dst_stride, int width) {
   __asm {
@@ -153,7 +153,7 @@ static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
 }
 
 #define HAS_TRANSPOSE_UVWX8_SSE2
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
                                 uint8* dst_a, int dst_stride_a,
                                 uint8* dst_b, int dst_stride_b,
@@ -281,7 +281,7 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
     ret
   }
 }
-#elif defined(__i386__) || defined(__x86_64__) && !defined(YUV_DISABLE_ASM)
+#elif !defined(YUV_DISABLE_ASM) && (defined(__i386__) || defined(__x86_64__))
 #define HAS_TRANSPOSE_WX8_SSSE3
 static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
                                uint8* dst, int dst_stride, int width) {
@@ -369,7 +369,7 @@ static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
   );
 }
 
-#if defined (__i386__)
+#if !defined(YUV_DISABLE_ASM) && defined (__i386__)
 #define HAS_TRANSPOSE_UVWX8_SSE2
 extern "C" void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
                                     uint8* dst_a, int dst_stride_a,
@@ -491,7 +491,7 @@ extern "C" void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
     "pop %ebx \n"
     "ret \n"
 );
-#elif defined(__x86_64__)
+#elif !defined(YUV_DISABLE_ASM) && defined(__x86_64__)
 // 64 bit version has enough registers to do 16x8 to 8x16 at a time.
 #define HAS_TRANSPOSE_WX8_FAST_SSSE3
 static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride,
 
@@ -17,7 +17,7 @@ namespace libyuv {
 extern "C" {
 #endif
 
-#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM)
+#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
 
 static const uvec8 vtbl_4x4_transpose =
   { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
 
source/row.h (37 changes)
@@ -18,6 +18,7 @@ namespace libyuv {
 extern "C" {
 #endif
 
+// TODO(fbarchard): Remove kMaxStride
 #define kMaxStride (2560 * 4)
 #define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a) - 1)))
 
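IS_ALIGNED relies on the alignment being a power of two: p is a multiple of a exactly when the low bits of p selected by (a - 1) are all zero. A self-contained check of the arithmetic (illustrative only, not part of the library):

    #include <assert.h>
    #include <stdint.h>

    int main() {
      // (a - 1) masks the remainder of p modulo a when a is a power of two.
      assert(!((uintptr_t)0x1000 & (16 - 1)));      // 0x1000: 16-byte aligned.
      assert(((uintptr_t)0x1004 & (16 - 1)) == 4);  // 0x1004: 4 bytes past.
      return 0;
    }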
@@ -26,8 +27,9 @@ extern "C" {
 #endif
 
 // The following are available on all x86 platforms
-#if (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \
-    !defined(YUV_DISABLE_ASM)
+#if !defined(YUV_DISABLE_ASM) && \
+    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
 
 #define HAS_ABGRTOARGBROW_SSSE3
 #define HAS_BGRATOARGBROW_SSSE3
 #define HAS_RGB24TOARGBROW_SSSE3
@@ -66,7 +68,7 @@ extern "C" {
 #endif
 
 // The following are available on Neon platforms
-#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM)
+#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
 #define HAS_MIRRORROW_NEON
 #define HAS_MIRRORROWUV_NEON
 #define HAS_SPLITUV_NEON
@@ -78,7 +80,7 @@ extern "C" {
 
 // The following are only available on Win32
 // TODO(fbarchard): Port to GCC
-#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM)
+#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
 #define HAS_ARGBBLENDROW_SSSE3
 #endif
 
@@ -265,25 +267,18 @@ void YToARGBRow_SSE2(const uint8* y_buf,
                      int width);
 
 // ARGB preattenuated alpha blend.
-void ARGBBlendRow_Aligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
-                                int width);
-void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb, uint8* dst_argb,
-                               int width);
-void ARGBBlendRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBBlendRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBBlendRow_C(const uint8* src_argb, uint8* dst_argb, int width);
-
-// ARGB preattenuated alpha blend with 2 sources and a destination.
-void ARGBBlend2Row_Aligned_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
-                                 uint8* dst_argb, int width);
-void ARGBBlend2Row_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1,
-                                uint8* dst_argb, int width);
-void ARGBBlend2Row_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
-                         uint8* dst_argb, int width);
-void ARGBBlend2Row_SSE2(const uint8* src_argb0, const uint8* src_argb1,
-                        uint8* dst_argb, int width);
-void ARGBBlend2Row_C(const uint8* src_argb0, const uint8* src_argb1,
-                     uint8* dst_argb, int width);
+void ARGBBlendRow_Aligned_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
+                                uint8* dst_argb, int width);
+void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+                               uint8* dst_argb, int width);
+void ARGBBlendRow1_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+                        uint8* dst_argb, int width);
+void ARGBBlendRow_Any_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
+                            uint8* dst_argb, int width);
+void ARGBBlendRow_Any_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+                           uint8* dst_argb, int width);
+void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1,
+                    uint8* dst_argb, int width);
 
 // 'Any' functions handle any size and alignment.
 void I420ToARGBRow_Any_SSSE3(const uint8* y_buf,
@@ -454,73 +454,10 @@ void UYVYToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width) {
 }
 
 #define BLENDER(f, b, a) (((256 - a) * b) >> 8) + f
-void ARGBBlendRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
-  for (int x = 0; x < width - 1; x += 2) {
-    uint32 a = src_argb[3];
-    if (a) {
-      if (a < 255) {
-        const uint32 fb = src_argb[0];
-        const uint32 fg = src_argb[1];
-        const uint32 fr = src_argb[2];
-        const uint32 bb = dst_argb[0];
-        const uint32 bg = dst_argb[1];
-        const uint32 br = dst_argb[2];
-        dst_argb[0] = BLENDER(fb, bb, a);
-        dst_argb[1] = BLENDER(fg, bg, a);
-        dst_argb[2] = BLENDER(fr, br, a);
-        dst_argb[3] = 255u;
-      } else {
-        *reinterpret_cast<uint32*>(dst_argb) =
-            *reinterpret_cast<const uint32*>(src_argb);
-      }
-    }
-    a = src_argb[4 + 3];
-    if (a) {
-      if (a < 255) {
-        const uint32 fb = src_argb[4 + 0];
-        const uint32 fg = src_argb[4 + 1];
-        const uint32 fr = src_argb[4 + 2];
-        const uint32 bb = dst_argb[4 + 0];
-        const uint32 bg = dst_argb[4 + 1];
-        const uint32 br = dst_argb[4 + 2];
-        dst_argb[4 + 0] = BLENDER(fb, bb, a);
-        dst_argb[4 + 1] = BLENDER(fg, bg, a);
-        dst_argb[4 + 2] = BLENDER(fr, br, a);
-        dst_argb[4 + 3] = 255u;
-      } else {
-        *reinterpret_cast<uint32*>(dst_argb + 4) =
-            *reinterpret_cast<const uint32*>(src_argb + 4);
-      }
-    }
-    src_argb += 8;
-    dst_argb += 8;
-  }
-
-  if (width & 1) {
-    const uint32 a = src_argb[3];
-    if (a) {
-      if (a < 255) {
-        const uint32 fb = src_argb[0];
-        const uint32 fg = src_argb[1];
-        const uint32 fr = src_argb[2];
-        const uint32 bb = dst_argb[0];
-        const uint32 bg = dst_argb[1];
-        const uint32 br = dst_argb[2];
-        dst_argb[0] = BLENDER(fb, bb, a);
-        dst_argb[1] = BLENDER(fg, bg, a);
-        dst_argb[2] = BLENDER(fr, br, a);
-        dst_argb[3] = 255u;
-      } else {
-        *reinterpret_cast<uint32*>(dst_argb) =
-            *reinterpret_cast<const uint32*>(src_argb);
-      }
-    }
-  }
-}
 
 // Blend src_argb0 over src_argb1 and store to dst_argb.
 // dst_argb may be src_argb0 or src_argb1.
-void ARGBBlend2Row_C(const uint8* src_argb0, const uint8* src_argb1,
+void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1,
                     uint8* dst_argb, int width) {
   for (int x = 0; x < width - 1; x += 2) {
     uint32 a = src_argb0[3];
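BLENDER assumes the foreground channel f is already attenuated (premultiplied) by its alpha — the "preattenuated" wording in row.h — so the result is f plus only the (256 - a)/256 share of the background. A tiny standalone check of the arithmetic (hypothetical helper that mirrors the macro above):

    #include <stdint.h>
    #include <stdio.h>

    // dst = src + dst * (256 - alpha) / 256, per channel.
    static uint8_t Blender(uint32_t f, uint32_t b, uint32_t a) {
      return (uint8_t)((((256 - a) * b) >> 8) + f);
    }

    int main() {
      // Half-transparent premultiplied white (f = 128, a = 128) over black:
      printf("%u\n", Blender(128, 0, 128));    // prints 128
      // ...and over white (b = 255): 128 + ((128 * 255) >> 8) = 128 + 127.
      printf("%u\n", Blender(128, 255, 128));  // prints 255
      return 0;
    }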
@@ -16,7 +16,7 @@ extern "C" {
 #endif
 
 // This module is for GCC Neon
-#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM)
+#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
 
 #define YUVTORGB \
     "vld1.u8 {d0}, [%0]! \n" \
 
@@ -18,7 +18,7 @@ extern "C" {
 #endif
 
 // This module is for GCC x86 and x64
-#if (defined(__x86_64__) || defined(__i386__)) && !defined(YUV_DISABLE_ASM)
+#if !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
 
 // GCC 4.2 on OSX has link error when passing static or const to inline.
 // TODO(fbarchard): Use static const when gcc 4.2 support is dropped.
@@ -2031,162 +2031,7 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
-#ifdef HAS_ARGBBLENDROW_SSE2
-// Blend 8 pixels at a time
-// Destination aligned to 16 bytes, multiple of 4 pixels
-void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb, uint8* dst_argb,
-                               int width) {
-  asm volatile (
-    "pcmpeqb %%xmm7,%%xmm7 \n"
-    "psrlw $0xf,%%xmm7 \n"
-    "pcmpeqb %%xmm6,%%xmm6 \n"
-    "psrlw $0x8,%%xmm6 \n"
-    "pcmpeqb %%xmm5,%%xmm5 \n"
-    "psllw $0x8,%%xmm5 \n"
-    "pcmpeqb %%xmm4,%%xmm4 \n"
-    "pslld $0x18,%%xmm4 \n"
-
-  // 8 pixel loop
-  "1: \n"
-    "movdqu (%0),%%xmm3 \n"  // first 4 pixels
-    "movdqa %%xmm3,%%xmm0 \n"
-    "pxor %%xmm4,%%xmm3 \n"
-    "movdqa (%1),%%xmm2 \n"
-    "psrlw $0x8,%%xmm3 \n"
-    "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
-    "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
-    "pand %%xmm6,%%xmm2 \n"
-    "paddw %%xmm7,%%xmm3 \n"
-    "pmullw %%xmm3,%%xmm2 \n"
-    "movdqa (%1),%%xmm1 \n"
-    "psrlw $0x8,%%xmm1 \n"
-    "por %%xmm4,%%xmm0 \n"
-    "pmullw %%xmm3,%%xmm1 \n"
-    "movdqu 0x10(%0),%%xmm3 \n"
-    "lea 0x20(%0),%0 \n"
-    "psrlw $0x8,%%xmm2 \n"
-    "paddusb %%xmm2,%%xmm0 \n"
-    "pand %%xmm5,%%xmm1 \n"
-    "paddusb %%xmm1,%%xmm0 \n"
-    "sub $0x4,%2 \n"
-    "movdqa %%xmm0,(%1) \n"
-    "jle 9f \n"
-    "movdqa %%xmm3,%%xmm0 \n"  // next 4 pixels
-    "pxor %%xmm4,%%xmm3 \n"
-    "movdqa 0x10(%1),%%xmm2 \n"
-    "psrlw $0x8,%%xmm3 \n"
-    "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
-    "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
-    "pand %%xmm6,%%xmm2 \n"
-    "paddw %%xmm7,%%xmm3 \n"
-    "pmullw %%xmm3,%%xmm2 \n"
-    "movdqa 0x10(%1),%%xmm1 \n"
-    "psrlw $0x8,%%xmm1 \n"
-    "por %%xmm4,%%xmm0 \n"
-    "pmullw %%xmm3,%%xmm1 \n"
-    "psrlw $0x8,%%xmm2 \n"
-    "paddusb %%xmm2,%%xmm0 \n"
-    "pand %%xmm5,%%xmm1 \n"
-    "paddusb %%xmm1,%%xmm0 \n"
-    "sub $0x4,%2 \n"
-    "movdqa %%xmm0,0x10(%1) \n"
-    "lea 0x20(%1),%1 \n"
-    "jg 1b \n"
-  "9: \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_argb),  // %1
-    "+r"(width)      // %2
-  :
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
-  );
-}
-
-// Blend 1 pixel at a time, unaligned
-void ARGBBlendRow1_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
-  asm volatile (
-    "pcmpeqb %%xmm7,%%xmm7 \n"
-    "psrlw $0xf,%%xmm7 \n"
-    "pcmpeqb %%xmm6,%%xmm6 \n"
-    "psrlw $0x8,%%xmm6 \n"
-    "pcmpeqb %%xmm5,%%xmm5 \n"
-    "psllw $0x8,%%xmm5 \n"
-    "pcmpeqb %%xmm4,%%xmm4 \n"
-    "pslld $0x18,%%xmm4 \n"
-
-  // 1 pixel loop
-  "1: \n"
-    "movd (%0),%%xmm3 \n"
-    "lea 0x4(%0),%0 \n"
-    "movdqa %%xmm3,%%xmm0 \n"
-    "pxor %%xmm4,%%xmm3 \n"
-    "movd (%1),%%xmm2 \n"
-    "psrlw $0x8,%%xmm3 \n"
-    "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
-    "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
-    "pand %%xmm6,%%xmm2 \n"
-    "paddw %%xmm7,%%xmm3 \n"
-    "pmullw %%xmm3,%%xmm2 \n"
-    "movd (%1),%%xmm1 \n"
-    "psrlw $0x8,%%xmm1 \n"
-    "por %%xmm4,%%xmm0 \n"
-    "pmullw %%xmm3,%%xmm1 \n"
-    "psrlw $0x8,%%xmm2 \n"
-    "paddusb %%xmm2,%%xmm0 \n"
-    "pand %%xmm5,%%xmm1 \n"
-    "paddusb %%xmm1,%%xmm0 \n"
-    "sub $0x1,%2 \n"
-    "movd %%xmm0,(%1) \n"
-    "lea 0x4(%1),%1 \n"
-    "jg 1b \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_argb),  // %1
-    "+r"(width)      // %2
-  :
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
-  );
-}
-
-void ARGBBlendRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
-  // Do 1 to 3 pixels to get destination aligned.
-  if ((uintptr_t)(dst_argb) & 15) {
-    int count = width;
-    if (count > 4 && ((intptr_t)(dst_argb) & 3) == 0) {
-      count = (-(intptr_t)(dst_argb) >> 2) & 3;
-    }
-    ARGBBlendRow1_SSE2(src_argb, dst_argb, count);
-    src_argb += count * 4;
-    dst_argb += count * 4;
-    width -= count;
-  }
-  // Do multiple of 4 pixels
-  if (width & ~3) {
-    ARGBBlendRow_Aligned_SSE2(src_argb, dst_argb, width & ~3);
-  }
-  // Do remaining 1 to 3 pixels
-  if (width & 3) {
-    src_argb += (width & ~3) * 4;
-    dst_argb += (width & ~3) * 4;
-    width &= 3;
-    ARGBBlendRow1_SSE2(src_argb, dst_argb, width);
-  }
-}
-#endif  // HAS_ARGBBLENDROW_SSE2
-
 #ifdef HAS_ARGBBLENDROW_SSE2
 // Blend 8 pixels at a time
 // Destination aligned to 16 bytes, multiple of 4 pixels
-void ARGBBlend2Row_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                                uint8* dst_argb, int width) {
   asm volatile (
     "pcmpeqb %%xmm7,%%xmm7 \n"
@@ -2259,7 +2104,7 @@ void ARGBBlend2Row_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1,
   }
 
 // Blend 1 pixel at a time, unaligned
-void ARGBBlend2Row1_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+void ARGBBlendRow1_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                         uint8* dst_argb, int width) {
   asm volatile (
     "pcmpeqb %%xmm7,%%xmm7 \n"
@@ -2309,15 +2154,15 @@ void ARGBBlend2Row1_SSE2(const uint8* src_argb0, const uint8* src_argb1,
   );
 }
 
-void ARGBBlend2Row_SSE2(const uint8* src_argb0, const uint8* src_argb1,
-                        uint8* dst_argb, int width) {
+void ARGBBlendRow_Any_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+                           uint8* dst_argb, int width) {
   // Do 1 to 3 pixels to get destination aligned.
   if ((uintptr_t)(dst_argb) & 15) {
     int count = width;
     if (count > 4 && ((intptr_t)(dst_argb) & 3) == 0) {
       count = (-(intptr_t)(dst_argb) >> 2) & 3;
     }
-    ARGBBlend2Row1_SSE2(src_argb0, src_argb1, dst_argb, count);
+    ARGBBlendRow1_SSE2(src_argb0, src_argb1, dst_argb, count);
     src_argb0 += count * 4;
     src_argb1 += count * 4;
     dst_argb += count * 4;
@@ -2325,7 +2170,7 @@ void ARGBBlend2Row_SSE2(const uint8* src_argb0, const uint8* src_argb1,
   }
   // Do multiple of 4 pixels
   if (width & ~3) {
-    ARGBBlend2Row_Aligned_SSE2(src_argb0, src_argb1, dst_argb, width & ~3);
+    ARGBBlendRow_Aligned_SSE2(src_argb0, src_argb1, dst_argb, width & ~3);
   }
   // Do remaining 1 to 3 pixels
   if (width & 3) {
@@ -2333,19 +2178,11 @@ void ARGBBlend2Row_SSE2(const uint8* src_argb0, const uint8* src_argb1,
     src_argb1 += (width & ~3) * 4;
     dst_argb += (width & ~3) * 4;
     width &= 3;
-    ARGBBlend2Row1_SSE2(src_argb0, src_argb1, dst_argb, width);
+    ARGBBlendRow1_SSE2(src_argb0, src_argb1, dst_argb, width);
   }
 }
 #endif  // HAS_ARGBBLENDROW_SSE2
-
-
-
 
 #endif  // defined(__x86_64__) || defined(__i386__)
 
 #ifdef __cplusplus
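The _Any_ wrappers above split a row into an unaligned head, an aligned multiple-of-4-pixel body, and a 1-3 pixel tail. The head length comes from the (-(intptr_t)dst >> 2) & 3 expression: with 4-byte ARGB pixels it counts how many single pixels to blend before dst reaches a 16-byte boundary. A standalone sketch of that arithmetic (hypothetical name; assumes the usual arithmetic right shift of negative values, as the library's own code does):

    #include <stdint.h>

    // Pixels to process one at a time before a 4-byte-per-pixel destination
    // pointer becomes 16-byte aligned.
    static int PixelsUntilAligned16(const uint8_t* dst) {
      return (int)((-(intptr_t)dst >> 2) & 3);
    }
    // dst % 16 == 0  -> 0 pixels of head.
    // dst % 16 == 4  -> 3 pixels (12 bytes) of head.
    // dst % 16 == 12 -> 1 pixel (4 bytes) of head.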
@@ -16,7 +16,7 @@ extern "C" {
 #endif
 
 // This module is for Visual C x86
-#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM)
+#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
 
 #ifdef HAS_ARGBTOYROW_SSSE3
 
@@ -99,7 +99,7 @@ static const uvec8 kShuffleMaskARGBToRAW = {
   2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
 };
 
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
   __asm {
     mov eax, [esp + 4]  // src_y
@@ -127,7 +127,7 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
   }
 }
 
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
   __asm {
     mov eax, [esp + 4]  // src_abgr
@@ -148,7 +148,7 @@ __asm {
   }
 }
 
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
   __asm {
     mov eax, [esp + 4]  // src_bgra
@@ -169,7 +169,7 @@ __asm {
   }
 }
 
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
   __asm {
     mov eax, [esp + 4]  // src_rgb24
@@ -208,7 +208,7 @@ __asm {
   }
 }
 
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
                         int pix) {
   __asm {
@@ -255,7 +255,7 @@ __asm {
 // v * (256 + 8)
 // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
 // 20 instructions
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
                           int pix) {
   __asm {
@@ -306,7 +306,7 @@ __asm {
 }
 
 // 24 instructions
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
                             int pix) {
   __asm {
@@ -360,7 +360,7 @@ __asm {
 }
 
 // 18 instructions
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
                             int pix) {
   __asm {
@@ -399,7 +399,7 @@ __asm {
   }
 }
 
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
   __asm {
     mov eax, [esp + 4]  // src_argb
@@ -438,7 +438,7 @@ __asm {
   }
 }
 
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
   __asm {
     mov eax, [esp + 4]  // src_argb
@@ -477,7 +477,7 @@ __asm {
   }
 }
 
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
   __asm {
     mov eax, [esp + 4]  // src_argb
@@ -516,7 +516,7 @@ __asm {
 }
 
 // TODO(fbarchard): Improve sign extension/packing
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
   __asm {
     mov eax, [esp + 4]  // src_argb
@@ -558,7 +558,7 @@ __asm {
   }
 }
 
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
   __asm {
     mov eax, [esp + 4]  // src_argb
@@ -589,7 +589,7 @@ __asm {
 }
 
 // Convert 16 ARGB pixels (64 bytes) to 16 Y values
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   __asm {
     mov eax, [esp + 4]  /* src_argb */
@@ -623,7 +623,7 @@ __asm {
   }
 }
 
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   __asm {
     mov eax, [esp + 4]  /* src_argb */
@@ -657,7 +657,7 @@ __asm {
   }
 }
 
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   __asm {
     mov eax, [esp + 4]  /* src_argb */
@@ -691,7 +691,7 @@ __asm {
   }
 }
 
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   __asm {
     mov eax, [esp + 4]  /* src_argb */
@@ -725,7 +725,7 @@ __asm {
   }
 }
 
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   __asm {
     mov eax, [esp + 4]  /* src_argb */
@@ -759,7 +759,7 @@ __asm {
   }
 }
 
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   __asm {
     mov eax, [esp + 4]  /* src_argb */
@@ -793,7 +793,7 @@ __asm {
   }
 }
 
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                        uint8* dst_u, uint8* dst_v, int width) {
   __asm {
@@ -859,7 +859,7 @@ __asm {
   }
 }
 
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                  uint8* dst_u, uint8* dst_v, int width) {
   __asm {
@@ -929,7 +929,7 @@ __asm {
   }
 }
 
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                        uint8* dst_u, uint8* dst_v, int width) {
   __asm {
@@ -995,7 +995,7 @@ __asm {
   }
 }
 
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                  uint8* dst_u, uint8* dst_v, int width) {
   __asm {
@@ -1065,7 +1065,7 @@ __asm {
   }
 }
 
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                        uint8* dst_u, uint8* dst_v, int width) {
   __asm {
@@ -1131,7 +1131,7 @@ __asm {
   }
 }
 
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                  uint8* dst_u, uint8* dst_v, int width) {
   __asm {
@@ -1268,7 +1268,7 @@ static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
     __asm packuswb xmm2, xmm2 /* R */ \
   }
 
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void I420ToARGBRow_SSSE3(const uint8* y_buf,
                          const uint8* u_buf,
                          const uint8* v_buf,
@@ -1308,7 +1308,7 @@ void I420ToARGBRow_SSSE3(const uint8* y_buf,
   }
 }
 
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void I420ToBGRARow_SSSE3(const uint8* y_buf,
                          const uint8* u_buf,
                          const uint8* v_buf,
@@ -1348,7 +1348,7 @@ void I420ToBGRARow_SSSE3(const uint8* y_buf,
   }
 }
 
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void I420ToABGRRow_SSSE3(const uint8* y_buf,
                          const uint8* u_buf,
                          const uint8* v_buf,
@@ -1388,7 +1388,7 @@ void I420ToABGRRow_SSSE3(const uint8* y_buf,
   }
 }
 
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void I420ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                    const uint8* u_buf,
                                    const uint8* v_buf,
@@ -1428,7 +1428,7 @@ void I420ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
   }
 }
 
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void I420ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
                                    const uint8* u_buf,
                                    const uint8* v_buf,
@@ -1468,7 +1468,7 @@ void I420ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
   }
 }
 
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void I420ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
                                    const uint8* u_buf,
                                    const uint8* v_buf,
@@ -1508,7 +1508,7 @@ void I420ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
   }
 }
 
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void I444ToARGBRow_SSSE3(const uint8* y_buf,
                          const uint8* u_buf,
                          const uint8* v_buf,
@@ -1575,7 +1575,7 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf,
 #endif
 
 #ifdef HAS_YTOARGBROW_SSE2
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void YToARGBRow_SSE2(const uint8* y_buf,
                      uint8* rgb_buf,
                      int width) {
@@ -1628,7 +1628,7 @@ static const uvec8 kShuffleMirror = {
   15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
 };
 
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
   __asm {
     mov eax, [esp + 4]  // src
@@ -1653,7 +1653,7 @@ __asm {
 #ifdef HAS_MIRRORROW_SSE2
 // SSE2 version has movdqu so it can be used on unaligned buffers when SSSE3
 // version can not.
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
   __asm {
     mov eax, [esp + 4]  // src
@@ -1686,7 +1686,7 @@ static const uvec8 kShuffleMirrorUV = {
   14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
 };
 
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
                        int width) {
   __asm {
@@ -1717,7 +1717,7 @@ void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
 #endif
 
 #ifdef HAS_SPLITUV_SSE2
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
   __asm {
     push edi
@@ -1756,7 +1756,7 @@ void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
 
 #ifdef HAS_COPYROW_SSE2
 // CopyRow copys 'count' bytes using a 16 byte load/store, 32 bytes at time
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
   __asm {
     mov eax, [esp + 4]  // src
@@ -1779,7 +1779,7 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
 #endif // HAS_COPYROW_SSE2
 
 #ifdef HAS_COPYROW_X86
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void CopyRow_X86(const uint8* src, uint8* dst, int count) {
   __asm {
     mov eax, esi
@@ -1797,7 +1797,7 @@ void CopyRow_X86(const uint8* src, uint8* dst, int count) {
 #endif
 
 #ifdef HAS_YUY2TOYROW_SSE2
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void YUY2ToYRow_SSE2(const uint8* src_yuy2,
                      uint8* dst_y, int pix) {
   __asm {
@@ -1823,7 +1823,7 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2,
   }
 }
 
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
                       uint8* dst_u, uint8* dst_y, int pix) {
   __asm {
@@ -1867,7 +1867,7 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
   }
 }
 
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
                                uint8* dst_y, int pix) {
   __asm {
@@ -1893,7 +1893,7 @@ void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
   }
 }
 
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2,
                                 uint8* dst_u, uint8* dst_y, int pix) {
   __asm {
@@ -1937,7 +1937,7 @@ void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2,
   }
 }
 
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void UYVYToYRow_SSE2(const uint8* src_uyvy,
                      uint8* dst_y, int pix) {
   __asm {
@@ -1961,7 +1961,7 @@ void UYVYToYRow_SSE2(const uint8* src_uyvy,
   }
 }
 
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
                       uint8* dst_u, uint8* dst_y, int pix) {
   __asm {
@@ -2005,7 +2005,7 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
   }
 }
 
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
                                uint8* dst_y, int pix) {
   __asm {
@@ -2029,7 +2029,7 @@ void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
   }
 }
 
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
                                 uint8* dst_u, uint8* dst_y, int pix) {
   __asm {
@ -2078,273 +2078,7 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
|
||||
// Blend 8 pixels at a time
|
||||
// Destination aligned to 16 bytes, multiple of 4 pixels
|
||||
__declspec(naked) __declspec(align(16))
|
||||
void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb, uint8* dst_argb,
|
||||
int width) {
|
||||
__asm {
|
||||
mov eax, [esp + 4] // src_argb
|
||||
mov edx, [esp + 8] // dst_argb
|
||||
mov ecx, [esp + 12] // width
|
||||
pcmpeqb xmm7, xmm7 // generate constant 1
|
||||
psrlw xmm7, 15
|
||||
pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff
|
||||
psrlw xmm6, 8
|
||||
pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00
|
||||
psllw xmm5, 8
|
||||
pcmpeqb xmm4, xmm4 // generate mask 0xff000000
|
||||
pslld xmm4, 24
|
||||
|
||||
align 16
|
||||
convertloop:
|
||||
movdqu xmm3, [eax]
|
||||
movdqa xmm0, xmm3 // src argb
|
||||
pxor xmm3, xmm4 // ~alpha
|
||||
movdqa xmm2, [edx] // _r_b
|
||||
psrlw xmm3, 8 // alpha
|
||||
pshufhw xmm3, xmm3,0F5h // 8 alpha words
|
||||
pshuflw xmm3, xmm3,0F5h
|
||||
pand xmm2, xmm6 // _r_b
|
||||
paddw xmm3, xmm7 // 256 - alpha
|
||||
pmullw xmm2, xmm3 // _r_b * alpha
|
||||
movdqa xmm1, [edx] // _a_g
|
||||
psrlw xmm1, 8 // _a_g
|
||||
por xmm0, xmm4 // set alpha to 255
|
||||
pmullw xmm1, xmm3 // _a_g * alpha
|
||||
movdqu xmm3, [eax + 16]
|
||||
lea eax, [eax + 32]
|
||||
psrlw xmm2, 8 // _r_b convert to 8 bits again
|
||||
paddusb xmm0, xmm2 // + src argb
|
||||
pand xmm1, xmm5 // a_g_ convert to 8 bits again
|
||||
paddusb xmm0, xmm1 // + src argb
|
||||
sub ecx, 4
|
||||
movdqa [edx], xmm0
|
||||
jle done
|
||||
|
||||
movdqa xmm0, xmm3 // src argb
|
||||
pxor xmm3, xmm4 // ~alpha
|
||||
movdqa xmm2, [edx + 16] // _r_b
|
||||
psrlw xmm3, 8 // alpha
|
||||
pshufhw xmm3, xmm3,0F5h // 8 alpha words
|
||||
pshuflw xmm3, xmm3,0F5h
|
||||
pand xmm2, xmm6 // _r_b
|
||||
paddw xmm3, xmm7 // 256 - alpha
|
||||
pmullw xmm2, xmm3 // _r_b * alpha
|
||||
movdqa xmm1, [edx + 16] // _a_g
|
||||
psrlw xmm1, 8 // _a_g
|
||||
por xmm0, xmm4 // set alpha to 255
|
||||
pmullw xmm1, xmm3 // _a_g * alpha
|
||||
psrlw xmm2, 8 // _r_b convert to 8 bits again
|
||||
paddusb xmm0, xmm2 // + src argb
|
||||
pand xmm1, xmm5 // a_g_ convert to 8 bits again
|
||||
paddusb xmm0, xmm1 // + src argb
|
||||
sub ecx, 4
|
||||
movdqa [edx + 16], xmm0
|
||||
lea edx, [edx + 32]
|
||||
jg convertloop
|
||||
|
||||
done:
|
||||
ret
|
||||
}
|
||||
}
|
||||
|
||||
// Blend 1 pixel at a time, unaligned
|
||||
__declspec(naked) __declspec(align(16))
|
||||
void ARGBBlendRow1_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
|
||||
__asm {
|
||||
mov eax, [esp + 4] // src_argb
|
||||
mov edx, [esp + 8] // dst_argb
|
||||
mov ecx, [esp + 12] // width
|
||||
pcmpeqb xmm7, xmm7 // generate constant 1
|
||||
psrlw xmm7, 15
|
||||
pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff
|
||||
psrlw xmm6, 8
|
||||
pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00
|
||||
psllw xmm5, 8
|
||||
pcmpeqb xmm4, xmm4 // generate mask 0xff000000
|
||||
pslld xmm4, 24
|
||||
|
||||
align 16
|
||||
convertloop:
|
||||
movd xmm3, [eax]
|
||||
lea eax, [eax + 4]
|
||||
movdqa xmm0, xmm3 // src argb
|
||||
pxor xmm3, xmm4 // ~alpha
|
||||
movd xmm2, [edx] // _r_b
|
||||
psrlw xmm3, 8 // alpha
|
||||
pshufhw xmm3, xmm3,0F5h // 8 alpha words
|
||||
pshuflw xmm3, xmm3,0F5h
|
||||
pand xmm2, xmm6 // _r_b
|
||||
paddw xmm3, xmm7 // 256 - alpha
|
||||
pmullw xmm2, xmm3 // _r_b * alpha
|
||||
movd xmm1, [edx] // _a_g
|
||||
psrlw xmm1, 8 // _a_g
|
||||
por xmm0, xmm4 // set alpha to 255
|
||||
pmullw xmm1, xmm3 // _a_g * alpha
|
||||
psrlw xmm2, 8 // _r_b convert to 8 bits again
|
||||
paddusb xmm0, xmm2 // + src argb
|
||||
pand xmm1, xmm5 // a_g_ convert to 8 bits again
|
||||
paddusb xmm0, xmm1 // + src argb
|
||||
sub ecx, 1
|
||||
movd [edx], xmm0
|
||||
lea edx, [edx + 4]
|
||||
jg convertloop
|
||||
ret
|
||||
}
|
||||
}
|
||||
|
||||
void ARGBBlendRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
|
||||
// Do 1 to 3 pixels to get destination aligned.
|
||||
if ((uintptr_t)(dst_argb) & 15) {
|
||||
int count = width;
|
||||
if (count > 4 && ((intptr_t)(dst_argb) & 3) == 0) {
|
||||
count = (-(intptr_t)(dst_argb) >> 2) & 3;
|
||||
}
|
||||
ARGBBlendRow1_SSE2(src_argb, dst_argb, count);
|
||||
src_argb += count * 4;
|
||||
dst_argb += count * 4;
|
||||
width -= count;
|
||||
}
|
||||
// Do multiple of 4 pixels
|
||||
if (width & ~3) {
|
||||
ARGBBlendRow_Aligned_SSE2(src_argb, dst_argb, width & ~3);
|
||||
}
|
||||
// Do remaining 1 to 3 pixels
|
||||
if (width & 3) {
|
||||
src_argb += (width & ~3) * 4;
|
||||
dst_argb += (width & ~3) * 4;
|
||||
width &= 3;
|
||||
ARGBBlendRow1_SSE2(src_argb, dst_argb, width);
|
||||
}
|
||||
}
|
||||
#endif // HAS_ARGBBLENDROW_SSE2
|
||||
|
||||
#ifdef HAS_ARGBBLENDROW_SSSE3
|
||||
// Blend 8 pixels at a time
|
||||
// Shuffle table for reversing the bytes.
|
||||
static const uvec8 kShuffleAlpha = {
|
||||
3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
|
||||
11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
|
||||
};
|
||||
|
||||
// Same as SSE2, but replaces
|
||||
// psrlw xmm3, 8 // alpha
|
||||
// pshufhw xmm3, xmm3,0F5h // 8 alpha words
|
||||
// pshuflw xmm3, xmm3,0F5h
|
||||
// with..
|
||||
// pshufb xmm3, kShuffleAlpha // alpha
|
||||
|
||||
// Destination aligned to 16 bytes, multiple of 4 pixels.
__declspec(naked) __declspec(align(16))
void ARGBBlendRow_Aligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
                                int width) {
  __asm {
    mov        eax, [esp + 4]   // src_argb
    mov        edx, [esp + 8]   // dst_argb
    mov        ecx, [esp + 12]  // width
    pcmpeqb    xmm7, xmm7       // generate constant 1
    psrlw      xmm7, 15
    pcmpeqb    xmm6, xmm6       // generate mask 0x00ff00ff
    psrlw      xmm6, 8
    pcmpeqb    xmm5, xmm5       // generate mask 0xff00ff00
    psllw      xmm5, 8
    pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
    pslld      xmm4, 24

    align      16
 convertloop:
    movdqu     xmm3, [eax]
    movdqa     xmm0, xmm3       // src argb
    pxor       xmm3, xmm4       // ~alpha
    pshufb     xmm3, kShuffleAlpha // alpha
    movdqa     xmm2, [edx]      // _r_b
    pand       xmm2, xmm6       // _r_b
    paddw      xmm3, xmm7       // 256 - alpha
    pmullw     xmm2, xmm3       // _r_b * alpha
    movdqa     xmm1, [edx]      // _a_g
    psrlw      xmm1, 8          // _a_g
    por        xmm0, xmm4       // set alpha to 255
    pmullw     xmm1, xmm3       // _a_g * alpha
    movdqu     xmm3, [eax + 16]
    lea        eax, [eax + 32]
    psrlw      xmm2, 8          // _r_b convert to 8 bits again
    paddusb    xmm0, xmm2       // + src argb
    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
    paddusb    xmm0, xmm1       // + src argb
    sub        ecx, 4
    movdqa     [edx], xmm0
    jle        done

    movdqa     xmm0, xmm3       // src argb
    pxor       xmm3, xmm4       // ~alpha
    movdqa     xmm2, [edx + 16] // _r_b
    pshufb     xmm3, kShuffleAlpha // alpha
    pand       xmm2, xmm6       // _r_b
    paddw      xmm3, xmm7       // 256 - alpha
    pmullw     xmm2, xmm3       // _r_b * alpha
    movdqa     xmm1, [edx + 16] // _a_g
    psrlw      xmm1, 8          // _a_g
    por        xmm0, xmm4       // set alpha to 255
    pmullw     xmm1, xmm3       // _a_g * alpha
    psrlw      xmm2, 8          // _r_b convert to 8 bits again
    paddusb    xmm0, xmm2       // + src argb
    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
    paddusb    xmm0, xmm1       // + src argb
    sub        ecx, 4
    movdqa     [edx + 16], xmm0
    lea        edx, [edx + 32]
    jg         convertloop

 done:
    ret
  }
}

void ARGBBlendRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
  // Do 1 to 3 pixels to get destination aligned.
  if ((uintptr_t)(dst_argb) & 15) {
    int count = width;
    if (count > 4 && ((intptr_t)(dst_argb) & 3) == 0) {
      count = (-(intptr_t)(dst_argb) >> 2) & 3;
    }
    ARGBBlendRow1_SSE2(src_argb, dst_argb, count);
    src_argb += count * 4;
    dst_argb += count * 4;
    width -= count;
  }
  // Do multiple of 4 pixels.
  if (width & ~3) {
    ARGBBlendRow_Aligned_SSSE3(src_argb, dst_argb, width & ~3);
  }
  // Do remaining 1 to 3 pixels.
  if (width & 3) {
    src_argb += (width & ~3) * 4;
    dst_argb += (width & ~3) * 4;
    width &= 3;
    ARGBBlendRow1_SSE2(src_argb, dst_argb, width);
  }
}
#endif  // HAS_ARGBBLENDROW_SSSE3

///////////////////////////////////////
///////////////////// 2 source versions
///////////////////////////////////////

#ifdef HAS_ARGBBLENDROW_SSE2
// Blend 8 pixels at a time.
// Destination aligned to 16 bytes, multiple of 4 pixels.
__declspec(naked) __declspec(align(16))
void ARGBBlend2Row_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1,
void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                               uint8* dst_argb, int width) {
  __asm {
    push       esi
@ -2418,7 +2152,7 @@ void ARGBBlend2Row_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1,

// Blend 1 pixel at a time, unaligned.
__declspec(naked) __declspec(align(16))
void ARGBBlend2Row1_SSE2(const uint8* src_argb0, const uint8* src_argb1,
void ARGBBlendRow1_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                        uint8* dst_argb, int width) {
  __asm {
    push       esi
@ -2467,7 +2201,7 @@ void ARGBBlend2Row1_SSE2(const uint8* src_argb0, const uint8* src_argb1,
  }
}

void ARGBBlend2Row_SSE2(const uint8* src_argb0, const uint8* src_argb1,
void ARGBBlendRow_Any_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                           uint8* dst_argb, int width) {
  // Do 1 to 3 pixels to get destination aligned.
  if ((uintptr_t)(dst_argb) & 15) {
@ -2475,7 +2209,7 @@ void ARGBBlend2Row_SSE2(const uint8* src_argb0, const uint8* src_argb1,
    if (count > 4 && ((intptr_t)(dst_argb) & 3) == 0) {
      count = (-(intptr_t)(dst_argb) >> 2) & 3;
    }
    ARGBBlend2Row1_SSE2(src_argb0, src_argb1, dst_argb, count);
    ARGBBlendRow1_SSE2(src_argb0, src_argb1, dst_argb, count);
    src_argb0 += count * 4;
    src_argb1 += count * 4;
    dst_argb += count * 4;
@ -2483,7 +2217,7 @@ void ARGBBlend2Row_SSE2(const uint8* src_argb0, const uint8* src_argb1,
  }
  // Do multiple of 4 pixels.
  if (width & ~3) {
    ARGBBlend2Row_Aligned_SSE2(src_argb0, src_argb1, dst_argb, width & ~3);
    ARGBBlendRow_Aligned_SSE2(src_argb0, src_argb1, dst_argb, width & ~3);
  }
  // Do remaining 1 to 3 pixels.
  if (width & 3) {
@ -2491,12 +2225,18 @@ void ARGBBlend2Row_SSE2(const uint8* src_argb0, const uint8* src_argb1,
    src_argb1 += (width & ~3) * 4;
    dst_argb += (width & ~3) * 4;
    width &= 3;
    ARGBBlend2Row1_SSE2(src_argb0, src_argb1, dst_argb, width);
    ARGBBlendRow1_SSE2(src_argb0, src_argb1, dst_argb, width);
  }
}
#endif  // HAS_ARGBBLENDROW_SSE2
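// Scalar sketch of the two-source blend (not library code; the name and the
// premultiplied-src0 reading are assumptions mirroring the single-source math
// above): dst = src0 + src1 * (256 - alpha(src0)) >> 8, alpha set to 255.
static void ARGBBlend2Row1_Reference(const uint8* src_argb0,
                                     const uint8* src_argb1,
                                     uint8* dst_argb, int width) {
  for (int i = 0; i < width; ++i) {
    const int f = 256 - src_argb0[3];                   // 256 - alpha
    for (int c = 0; c < 3; ++c) {                       // B, G, R channels
      int v = src_argb0[c] + ((src_argb1[c] * f) >> 8);
      dst_argb[c] = v > 255 ? 255 : (uint8)v;           // saturating add
    }
    dst_argb[3] = 255;
    src_argb0 += 4;
    src_argb1 += 4;
    dst_argb += 4;
  }
}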

#ifdef HAS_ARGBBLENDROW_SSSE3
// Shuffle table for duplicating each pixel's alpha into both of its words.
static const uvec8 kShuffleAlpha = {
  3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
  11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
};

// Blend 8 pixels at a time.

@ -2509,7 +2249,7 @@ void ARGBBlend2Row_SSE2(const uint8* src_argb0, const uint8* src_argb1,

// Destination aligned to 16 bytes, multiple of 4 pixels.
__declspec(naked) __declspec(align(16))
void ARGBBlend2Row_Aligned_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
void ARGBBlendRow_Aligned_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
                                uint8* dst_argb, int width) {
  __asm {
    push       esi
@ -2577,7 +2317,7 @@ void ARGBBlend2Row_Aligned_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
  }
}

void ARGBBlend2Row_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
void ARGBBlendRow_Any_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
                            uint8* dst_argb, int width) {
  // Do 1 to 3 pixels to get destination aligned.
  if ((uintptr_t)(dst_argb) & 15) {
@ -2585,7 +2325,7 @@ void ARGBBlend2Row_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
    if (count > 4 && ((intptr_t)(dst_argb) & 3) == 0) {
      count = (-(intptr_t)(dst_argb) >> 2) & 3;
    }
    ARGBBlend2Row1_SSE2(src_argb0, src_argb1, dst_argb, count);
    ARGBBlendRow1_SSE2(src_argb0, src_argb1, dst_argb, count);
    src_argb0 += count * 4;
    src_argb1 += count * 4;
    dst_argb += count * 4;
@ -2593,7 +2333,7 @@ void ARGBBlend2Row_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
  }
  // Do multiple of 4 pixels.
  if (width & ~3) {
    ARGBBlend2Row_Aligned_SSSE3(src_argb0, src_argb1, dst_argb, width & ~3);
    ARGBBlendRow_Aligned_SSSE3(src_argb0, src_argb1, dst_argb, width & ~3);
  }
  // Do remaining 1 to 3 pixels.
  if (width & 3) {
@ -2601,7 +2341,7 @@ void ARGBBlend2Row_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
    src_argb1 += (width & ~3) * 4;
    dst_argb += (width & ~3) * 4;
    width &= 3;
    ARGBBlend2Row1_SSE2(src_argb0, src_argb1, dst_argb, width);
    ARGBBlendRow1_SSE2(src_argb0, src_argb1, dst_argb, width);
  }
}
#endif  // HAS_ARGBBLENDROW_SSSE3

@ -55,7 +55,7 @@ void SetUseReferenceImpl(bool use) {
 *
 */

#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM)
#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
#define HAS_SCALEROWDOWN2_NEON
void ScaleRowDown2_NEON(const uint8* src_ptr, int /* src_stride */,
                        uint8* dst, int dst_width) {
@ -566,12 +566,13 @@ static void ScaleFilterRows_NEON(uint8* dst_ptr,
 */

// Constants for SSE2 code
#elif defined(_M_IX86) || defined(__i386__) || defined(__x86_64__) && \
    !defined(YUV_DISABLE_ASM)
#elif !defined(YUV_DISABLE_ASM) && \
    (defined(_M_IX86) || defined(__i386__) || defined(__x86_64__))

#if defined(_MSC_VER)
#define TALIGN16(t, var) __declspec(align(16)) t _ ## var
#elif defined(__APPLE__) || defined(__MINGW32__) || defined(__CYGWIN__) && \
    defined(__i386__)
#elif defined(__i386__) && \
    (defined(__APPLE__) || defined(__MINGW32__) || defined(__CYGWIN__))
#define TALIGN16(t, var) t var __attribute__((aligned(16)))
#else
#define TALIGN16(t, var) t _ ## var __attribute__((aligned(16)))
@ -670,12 +671,12 @@ extern "C" TALIGN16(const uint16, scaleab2[8]) =
  { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
#endif
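// Usage sketch for TALIGN16 (kTable is hypothetical; scaleab2 above shows the
// real pattern). The macro yields a 16-byte aligned definition on every
// toolchain, and the _ ## var spelling appears intended to keep the
// assembly-visible name (_kTable) consistent across toolchains that do or do
// not prepend an underscore to C symbols:
//
//   extern "C" TALIGN16(const uint16, kTable[8]) =
//     { 1, 2, 3, 4, 5, 6, 7, 8 };  // movdqa-safe; referenced as _kTable in asm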

#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM)
#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)

#define HAS_SCALEROWDOWN2_SSE2
// Reads 32 pixels, throws half away and writes 16 pixels.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride,
                               uint8* dst_ptr, int dst_width) {
  __asm {
@ -704,7 +705,7 @@ static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride,
}
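// Scalar sketch of the point-sampling row above (not the tree's C fallback,
// just the shape of the operation): keep every other byte, drop the rest.
static void ScaleRowDown2_Reference(const uint8* src_ptr, int /* src_stride */,
                                    uint8* dst_ptr, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    dst_ptr[x] = src_ptr[x * 2];
  }
}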
// Blends 32x2 rectangle to 16x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked)
__declspec(naked) __declspec(align(16))
void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
                           uint8* dst_ptr, int dst_width) {
  __asm {
@ -749,7 +750,7 @@ void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
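// Scalar sketch of the 32x2 -> 16x1 blend (a sketch; the +2 rounding term
// models the averaging the SIMD path performs):
static void ScaleRowDown2Int_Reference(const uint8* src_ptr, int src_stride,
                                       uint8* dst_ptr, int dst_width) {
  const uint8* s = src_ptr;
  const uint8* t = src_ptr + src_stride;  // second source row
  for (int x = 0; x < dst_width; ++x) {
    dst_ptr[x] = (uint8)((s[0] + s[1] + t[0] + t[1] + 2) >> 2);  // 2x2 average
    s += 2;
    t += 2;
  }
}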
#define HAS_SCALEROWDOWN4_SSE2
// Point samples 32 pixels to 8 pixels.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride,
                               uint8* dst_ptr, int dst_width) {
  __asm {
@ -780,7 +781,7 @@ static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride,

// Blends 32x4 rectangle to 8x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride,
                                  uint8* dst_ptr, int dst_width) {
  __asm {
@ -842,7 +843,7 @@ static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride,
#define HAS_SCALEROWDOWN8_SSE2
// Point samples 32 pixels to 4 pixels.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned.
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride,
                               uint8* dst_ptr, int dst_width) {
  __asm {
@ -874,7 +875,7 @@ static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride,

// Blends 32x8 rectangle to 4x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned.
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
                                  uint8* dst_ptr, int dst_width) {
  __asm {
@ -952,7 +953,7 @@ static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,

// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
                                 uint8* dst_ptr, int dst_width) {
  __asm {
@ -1001,7 +1002,7 @@ static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,

// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
                                       uint8* dst_ptr, int dst_width) {
  __asm {
@ -1059,7 +1060,7 @@ static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,

// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
                                       uint8* dst_ptr, int dst_width) {
  __asm {
@ -1122,7 +1123,7 @@ static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
// 3/8 point sampler

// Scale 32 pixels to 12
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
                                 uint8* dst_ptr, int dst_width) {
  __asm {
@ -1154,7 +1155,7 @@ static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
}
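// Scalar sketch of 3/8 point sampling (a sketch, not the tree's fallback;
// the exact sample positions are an assumption): every 8 source pixels map
// to 3 destination pixels, so 32 in becomes 12 out.
static void ScaleRowDown38_Reference(const uint8* src_ptr,
                                     int /* src_stride */,
                                     uint8* dst_ptr, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    dst_ptr[x] = src_ptr[x * 8 / 3];  // step forward 8/3 source pixels on average
  }
}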

// Scale 16x3 pixels to 6x1 with interpolation
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
                                       uint8* dst_ptr, int dst_width) {
  __asm {
@ -1221,7 +1222,7 @@ static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
}

// Scale 16x2 pixels to 6x1 with interpolation
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
                                       uint8* dst_ptr, int dst_width) {
  __asm {
@ -1269,7 +1270,7 @@ static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
#define HAS_SCALEADDROWS_SSE2

// Reads 16xN bytes and produces 16 shorts at a time.
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
                              uint16* dst_ptr, int src_width,
                              int src_height) {
@ -1329,7 +1330,7 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
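// Scalar sketch of the vertical accumulation (not library code): sum
// src_height rows into 16-bit totals, one column at a time. 8-bit samples
// fit a uint16 accumulator for up to 257 rows without overflow.
static void ScaleAddRows_Reference(const uint8* src_ptr, int src_stride,
                                   uint16* dst_ptr, int src_width,
                                   int src_height) {
  for (int x = 0; x < src_width; ++x) {
    const uint8* s = src_ptr + x;
    uint16 sum = 0;
    for (int y = 0; y < src_height; ++y) {
      sum += *s;
      s += src_stride;
    }
    dst_ptr[x] = sum;
  }
}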

// Bilinear row filtering combines 16x2 -> 16x1. SSE2 version.
#define HAS_SCALEFILTERROWS_SSE2
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                                 int src_stride, int dst_width,
                                 int source_y_fraction) {
@ -1420,7 +1421,7 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
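// Scalar sketch of the bilinear row filter (a sketch; reading
// source_y_fraction as a fixed-point weight in [0, 256), 0 meaning all of
// the first row, is an assumption):
static void ScaleFilterRows_Reference(uint8* dst_ptr, const uint8* src_ptr,
                                      int src_stride, int dst_width,
                                      int source_y_fraction) {
  const int y1 = source_y_fraction;        // weight of the second row
  const int y0 = 256 - y1;                 // weight of the first row
  const uint8* src1 = src_ptr + src_stride;
  for (int x = 0; x < dst_width; ++x) {
    dst_ptr[x] = (uint8)((src_ptr[x] * y0 + src1[x] * y1) >> 8);
  }
}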

// Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version.
#define HAS_SCALEFILTERROWS_SSSE3
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                                  int src_stride, int dst_width,
                                  int source_y_fraction) {
@ -1501,7 +1502,7 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,

// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void ScaleFilterCols34_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                                    int dst_width) {
  __asm {
@ -1547,7 +1548,7 @@ static void ScaleFilterCols34_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
  }
}

#elif defined(__x86_64__) || defined(__i386__) && !defined(YUV_DISABLE_ASM)
#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))

// GCC versions of row functions are verbatim conversions from Visual C.
// Generated using gcc disassembly on Visual C object file:
@ -1766,7 +1767,7 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
  );
}

#if defined(__i386__)
#if !defined(YUV_DISABLE_ASM) && defined(__i386__)
extern "C" void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
                                      uint8* dst_ptr, int dst_width);
asm(
@ -2260,7 +2261,7 @@ extern "C" void ScaleFilterRows_SSSE3(uint8* dst_ptr,
    "ret \n"
);

#elif defined(__x86_64__)
#elif !defined(YUV_DISABLE_ASM) && defined(__x86_64__)
static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
                                  uint8* dst_ptr, int dst_width) {
  asm volatile (