diff --git a/README.chromium b/README.chromium
index 52f29a6c3..0fe897610 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 254
+Version: 255
 License: BSD
 License File: LICENSE
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index b6797c893..facc89116 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,7 +11,7 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_

-#define LIBYUV_VERSION 254
+#define LIBYUV_VERSION 255

 #endif  // INCLUDE_LIBYUV_VERSION_H_
diff --git a/source/compare.cc b/source/compare.cc
index c82b3918f..68a42ba82 100644
--- a/source/compare.cc
+++ b/source/compare.cc
@@ -163,6 +163,7 @@ static uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
     "movd %2,%%xmm0 \n"
     "pxor %%xmm7,%%xmm7 \n"
     "movdqa %4,%%xmm6 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqu (%0),%%xmm1 \n"
     "lea 0x10(%0),%0 \n"
@@ -331,7 +332,7 @@ static uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b,
     "pxor %%xmm0,%%xmm0 \n"
     "pxor %%xmm5,%%xmm5 \n"
     "sub %0,%1 \n"
-
+    ".p2align 4 \n"
   "1: \n"
     "movdqa (%0),%%xmm1 \n"
     "movdqa (%0,%1,1),%%xmm2 \n"
diff --git a/source/convert.cc b/source/convert.cc
index 0b1f03c74..1cfb4c4ac 100644
--- a/source/convert.cc
+++ b/source/convert.cc
@@ -74,6 +74,7 @@ static void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
     mov ecx, [esp + 4 + 16]  // pix
     sub edi, eax
+    align 16
   convertloop:
     movdqa xmm0, [eax]
     pavgb xmm0, [eax + edx]
@@ -92,6 +93,7 @@ static void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
                          uint8* dst_uv, int pix) {
   asm volatile (
     "sub %0,%1 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqa (%0),%%xmm0 \n"
     "pavgb (%0,%3),%%xmm0 \n"
@@ -467,6 +469,7 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2,
     pcmpeqb xmm5, xmm5  // generate mask 0x00ff00ff
     psrlw xmm5, 8
+    align 16
   convertloop:
     movdqa xmm0, [eax]
     movdqa xmm1, [eax + 16]
@@ -506,6 +509,7 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2, uint8* dst_y,
   asm volatile (
     "pcmpeqb %%xmm5,%%xmm5 \n"
     "psrlw $0x8,%%xmm5 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqa (%0),%%xmm0 \n"
     "movdqa 0x10(%0),%%xmm1 \n"
diff --git a/source/convert_from.cc b/source/convert_from.cc
index efe58dd82..55ff8f5c4 100644
--- a/source/convert_from.cc
+++ b/source/convert_from.cc
@@ -291,6 +291,7 @@ static void I42xToYUY2Row_SSE2(const uint8* src_y,
                                uint8* dst_frame, int width) {
   asm volatile (
     "sub %1,%2 \n"
+    ".p2align 4 \n"
  "1: \n"
    "movq (%1),%%xmm2 \n"
    "movq (%1,%2,1),%%xmm3 \n"
@@ -326,6 +327,7 @@ static void I42xToUYVYRow_SSE2(const uint8* src_y,
                                uint8* dst_frame, int width) {
   asm volatile (
     "sub %1,%2 \n"
+    ".p2align 4 \n"
  "1: \n"
    "movq (%1),%%xmm2 \n"
    "movq (%1,%2,1),%%xmm3 \n"
diff --git a/source/format_conversion.cc b/source/format_conversion.cc
index 1cdf709e4..dea1491bd 100644
--- a/source/format_conversion.cc
+++ b/source/format_conversion.cc
@@ -57,6 +57,7 @@ static void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
   asm volatile (
     "movd %3,%%xmm5 \n"
     "pshufd $0x0,%%xmm5,%%xmm5 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqa (%0),%%xmm0 \n"
     "lea 0x10(%0),%0 \n"
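Reviewer note on the recurring change in this roll: `.p2align 4` asks GAS to pad with no-ops until the next instruction starts on a 2^4 = 16 byte boundary, so the hot loop label that follows begins at a cache-line- and decoder-friendly address; `align 16` is the MASM equivalent used in the win32 `__asm` blocks. A minimal self-contained sketch of the pattern, assuming x86-64 with GCC/Clang extended asm (the function itself is illustrative, not part of libyuv):

  #include <stdint.h>

  // Align the head of a hot copy loop to 16 bytes, as the hunks above do.
  void CopyRow_sketch(const uint8_t* src, uint8_t* dst, int count) {
    asm volatile (
      ".p2align 4 \n"           // pad with nops to a 2^4 = 16 byte boundary
    "1: \n"
      "movdqu (%0),%%xmm0 \n"   // 16 bytes per iteration
      "lea 0x10(%0),%0 \n"
      "movdqu %%xmm0,(%1) \n"
      "lea 0x10(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"                // count must be a positive multiple of 16
      : "+r"(src), "+r"(dst), "+r"(count)
      :
      : "memory", "cc", "xmm0"
    );
  }

The padding is paid once at loop entry, not per iteration, which is why it is worth sprinkling over every row loop in the library.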
+ ".p2align 4 \n" "1: \n" "movq (%0),%%xmm0 \n" "movq (%0,%3),%%xmm1 \n" @@ -499,6 +500,7 @@ static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride, asm volatile ( // Read in the data from the source pointer. // First round of bit swap. + ".p2align 4 \n" "1: \n" "movdqa (%0),%%xmm0 \n" "movdqa (%0,%3),%%xmm1 \n" @@ -639,6 +641,7 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride, asm volatile ( // Read in the data from the source pointer. // First round of bit swap. + ".p2align 4 \n" "1: \n" "movdqa (%0),%%xmm0 \n" "movdqa (%0,%4),%%xmm1 \n" diff --git a/source/rotate_neon.cc b/source/rotate_neon.cc index 70dd4201e..af790aeee 100644 --- a/source/rotate_neon.cc +++ b/source/rotate_neon.cc @@ -32,6 +32,7 @@ void TransposeWx8_NEON(const uint8* src, int src_stride, "sub %4, #8 \n" // handle 8x8 blocks. this should be the majority of the plane + ".p2align 4 \n" "1: \n" "mov r9, %0 \n" @@ -198,6 +199,7 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride, "sub %6, #8 \n" // handle 8x8 blocks. this should be the majority of the plane + ".p2align 4 \n" "1: \n" "mov r9, %0 \n" diff --git a/source/row_posix.cc b/source/row_posix.cc index 1a8f4fb8f..122b30933 100644 --- a/source/row_posix.cc +++ b/source/row_posix.cc @@ -112,6 +112,7 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { asm volatile ( "pcmpeqb %%xmm5,%%xmm5 \n" "pslld $0x18,%%xmm5 \n" + ".p2align 4 \n" "1: \n" "movq (%0),%%xmm0 \n" "lea 0x8(%0),%0 \n" @@ -141,6 +142,7 @@ void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) { asm volatile ( "movdqa %3,%%xmm5 \n" "sub %0,%1 \n" + ".p2align 4 \n" "1: \n" "movdqa (%0),%%xmm0 \n" "pshufb %%xmm5,%%xmm0 \n" @@ -164,6 +166,7 @@ void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) { asm volatile ( "movdqa %3,%%xmm5 \n" "sub %0,%1 \n" + ".p2align 4 \n" "1: \n" "movdqa (%0),%%xmm0 \n" "pshufb %%xmm5,%%xmm0 \n" @@ -187,6 +190,7 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000 "pslld $0x18,%%xmm5 \n" "movdqa %3,%%xmm4 \n" + ".p2align 4 \n" "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -227,6 +231,7 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) { "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000 "pslld $0x18,%%xmm5 \n" "movdqa %3,%%xmm4 \n" + ".p2align 4 \n" "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -279,6 +284,7 @@ void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { "psllw $0x8,%%xmm7 \n" "sub %0,%1 \n" "sub %0,%1 \n" + ".p2align 4 \n" "1: \n" "movdqu (%0),%%xmm0 \n" "movdqa %%xmm0,%%xmm1 \n" @@ -327,6 +333,7 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { "psllw $0x8,%%xmm7 \n" "sub %0,%1 \n" "sub %0,%1 \n" + ".p2align 4 \n" "1: \n" "movdqu (%0),%%xmm0 \n" "movdqa %%xmm0,%%xmm1 \n" @@ -372,6 +379,7 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { "pslld $0x4,%%xmm5 \n" "sub %0,%1 \n" "sub %0,%1 \n" + ".p2align 4 \n" "1: \n" "movdqu (%0),%%xmm0 \n" "movdqa %%xmm0,%%xmm2 \n" @@ -405,6 +413,7 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) { asm volatile ( "movdqa %3,%%xmm6 \n" + ".p2align 4 \n" "1: \n" "movdqa (%0),%%xmm0 \n" "movdqa 0x10(%0),%%xmm1 \n" @@ -445,6 +454,7 @@ void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) { void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, 
diff --git a/source/row_posix.cc b/source/row_posix.cc
index 1a8f4fb8f..122b30933 100644
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -112,6 +112,7 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
   asm volatile (
     "pcmpeqb %%xmm5,%%xmm5 \n"
     "pslld $0x18,%%xmm5 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movq (%0),%%xmm0 \n"
     "lea 0x8(%0),%0 \n"
@@ -141,6 +142,7 @@ void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
   asm volatile (
     "movdqa %3,%%xmm5 \n"
     "sub %0,%1 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqa (%0),%%xmm0 \n"
     "pshufb %%xmm5,%%xmm0 \n"
@@ -164,6 +166,7 @@ void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
   asm volatile (
     "movdqa %3,%%xmm5 \n"
     "sub %0,%1 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqa (%0),%%xmm0 \n"
     "pshufb %%xmm5,%%xmm0 \n"
@@ -187,6 +190,7 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
     "pcmpeqb %%xmm5,%%xmm5 \n"  // generate mask 0xff000000
     "pslld $0x18,%%xmm5 \n"
     "movdqa %3,%%xmm4 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqu (%0),%%xmm0 \n"
     "movdqu 0x10(%0),%%xmm1 \n"
@@ -227,6 +231,7 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
     "pcmpeqb %%xmm5,%%xmm5 \n"  // generate mask 0xff000000
     "pslld $0x18,%%xmm5 \n"
     "movdqa %3,%%xmm4 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqu (%0),%%xmm0 \n"
     "movdqu 0x10(%0),%%xmm1 \n"
@@ -279,6 +284,7 @@ void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
     "psllw $0x8,%%xmm7 \n"
     "sub %0,%1 \n"
     "sub %0,%1 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqu (%0),%%xmm0 \n"
     "movdqa %%xmm0,%%xmm1 \n"
@@ -327,6 +333,7 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
     "psllw $0x8,%%xmm7 \n"
     "sub %0,%1 \n"
     "sub %0,%1 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqu (%0),%%xmm0 \n"
     "movdqa %%xmm0,%%xmm1 \n"
@@ -372,6 +379,7 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
     "pslld $0x4,%%xmm5 \n"
     "sub %0,%1 \n"
     "sub %0,%1 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqu (%0),%%xmm0 \n"
     "movdqa %%xmm0,%%xmm2 \n"
@@ -405,6 +413,7 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
 void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
   asm volatile (
     "movdqa %3,%%xmm6 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqa (%0),%%xmm0 \n"
     "movdqa 0x10(%0),%%xmm1 \n"
@@ -445,6 +454,7 @@ void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
 void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst,
                         int pix) {
   asm volatile (
     "movdqa %3,%%xmm6 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqa (%0),%%xmm0 \n"
     "movdqa 0x10(%0),%%xmm1 \n"
@@ -491,6 +501,7 @@ void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
     "pslld $0x5,%%xmm4 \n"
     "pcmpeqb %%xmm5,%%xmm5 \n"
     "pslld $0xb,%%xmm5 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqa (%0),%%xmm0 \n"
     "movdqa %%xmm0,%%xmm1 \n"
@@ -531,6 +542,7 @@ void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
     "pslld $0xa,%%xmm6 \n"
     "pcmpeqb %%xmm7,%%xmm7 \n"
     "pslld $0xf,%%xmm7 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqa (%0),%%xmm0 \n"
     "movdqa %%xmm0,%%xmm1 \n"
@@ -570,6 +582,7 @@ void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
     "psllw $0xc,%%xmm4 \n"
     "movdqa %%xmm4,%%xmm3 \n"
     "psrlw $0x8,%%xmm3 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqa (%0),%%xmm0 \n"
     "movdqa %%xmm0,%%xmm1 \n"
@@ -599,6 +612,7 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   asm volatile (
     "movdqa %4,%%xmm5 \n"
     "movdqa %3,%%xmm4 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqa (%0),%%xmm0 \n"
     "movdqa 0x10(%0),%%xmm1 \n"
@@ -635,6 +649,7 @@ void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   asm volatile (
     "movdqa %4,%%xmm5 \n"
     "movdqa %3,%%xmm4 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqu (%0),%%xmm0 \n"
     "movdqu 0x10(%0),%%xmm1 \n"
@@ -689,6 +704,7 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
   );
   asm volatile (
     "sub %1,%2 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqa (%0),%%xmm0 \n"
     "movdqa 0x10(%0),%%xmm1 \n"
@@ -753,6 +769,7 @@ void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
   );
   asm volatile (
     "sub %1,%2 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqu (%0),%%xmm0 \n"
     "movdqu 0x10(%0),%%xmm1 \n"
@@ -808,6 +825,7 @@ void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
   asm volatile (
     "movdqa %4,%%xmm5 \n"
     "movdqa %3,%%xmm4 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqa (%0),%%xmm0 \n"
     "movdqa 0x10(%0),%%xmm1 \n"
@@ -844,6 +862,7 @@ void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
   asm volatile (
     "movdqa %4,%%xmm5 \n"
     "movdqa %3,%%xmm4 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqu (%0),%%xmm0 \n"
     "movdqu 0x10(%0),%%xmm1 \n"
@@ -893,6 +912,7 @@ void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
   );
   asm volatile (
     "sub %1,%2 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqa (%0),%%xmm0 \n"
     "movdqa 0x10(%0),%%xmm1 \n"
@@ -957,6 +977,7 @@ void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
   );
   asm volatile (
     "sub %1,%2 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqu (%0),%%xmm0 \n"
     "movdqu 0x10(%0),%%xmm1 \n"
@@ -1012,6 +1033,7 @@ void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
   asm volatile (
     "movdqa %4,%%xmm5 \n"
     "movdqa %3,%%xmm4 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqa (%0),%%xmm0 \n"
     "movdqa 0x10(%0),%%xmm1 \n"
@@ -1048,6 +1070,7 @@ void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
   asm volatile (
     "movdqa %4,%%xmm5 \n"
     "movdqa %3,%%xmm4 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqu (%0),%%xmm0 \n"
     "movdqu 0x10(%0),%%xmm1 \n"
@@ -1097,6 +1120,7 @@ void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
   );
   asm volatile (
     "sub %1,%2 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqa (%0),%%xmm0 \n"
     "movdqa 0x10(%0),%%xmm1 \n"
@@ -1161,6 +1185,7 @@ void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
   );
   asm volatile (
     "sub %1,%2 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqu (%0),%%xmm0 \n"
     "movdqu 0x10(%0),%%xmm1 \n"
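Note the pattern running through row_posix.cc: almost every row function comes in a pair where the plain version loads with movdqa (and requires 16-byte-aligned rows) while the _Unaligned twin differs only in using movdqu. In intrinsics terms the distinction is just this (a sketch, assuming SSE2; the helper is illustrative):

  #include <emmintrin.h>  // SSE2
  #include <stdint.h>

  // movdqa (_mm_load_si128) faults unless p is 16-byte aligned;
  // movdqu (_mm_loadu_si128) accepts any address, historically at some cost.
  static inline __m128i LoadRow16(const uint8_t* p, bool aligned) {
    return aligned ? _mm_load_si128(reinterpret_cast<const __m128i*>(p))
                   : _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
  }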
"pcmpeqb %%xmm5,%%xmm5 \n" "pxor %%xmm4,%%xmm4 \n" + ".p2align 4 \n" "1: \n" YUVTORGB "punpcklbw %%xmm1,%%xmm0 \n" @@ -1329,6 +1355,7 @@ void OMITFP I420ToBGRARow_SSSE3(const uint8* y_buf, "sub %1,%2 \n" "pcmpeqb %%xmm5,%%xmm5 \n" "pxor %%xmm4,%%xmm4 \n" + ".p2align 4 \n" "1: \n" YUVTORGB "pcmpeqb %%xmm5,%%xmm5 \n" @@ -1364,6 +1391,7 @@ void OMITFP I420ToABGRRow_SSSE3(const uint8* y_buf, "sub %1,%2 \n" "pcmpeqb %%xmm5,%%xmm5 \n" "pxor %%xmm4,%%xmm4 \n" + ".p2align 4 \n" "1: \n" YUVTORGB "punpcklbw %%xmm1,%%xmm2 \n" @@ -1398,6 +1426,7 @@ void OMITFP I420ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, "sub %1,%2 \n" "pcmpeqb %%xmm5,%%xmm5 \n" "pxor %%xmm4,%%xmm4 \n" + ".p2align 4 \n" "1: \n" YUVTORGB "punpcklbw %%xmm1,%%xmm0 \n" @@ -1432,6 +1461,7 @@ void OMITFP I420ToBGRARow_Unaligned_SSSE3(const uint8* y_buf, "sub %1,%2 \n" "pcmpeqb %%xmm5,%%xmm5 \n" "pxor %%xmm4,%%xmm4 \n" + ".p2align 4 \n" "1: \n" YUVTORGB "pcmpeqb %%xmm5,%%xmm5 \n" @@ -1467,6 +1497,7 @@ void OMITFP I420ToABGRRow_Unaligned_SSSE3(const uint8* y_buf, "sub %1,%2 \n" "pcmpeqb %%xmm5,%%xmm5 \n" "pxor %%xmm4,%%xmm4 \n" + ".p2align 4 \n" "1: \n" YUVTORGB "punpcklbw %%xmm1,%%xmm2 \n" @@ -1501,6 +1532,7 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, "sub %1,%2 \n" "pcmpeqb %%xmm5,%%xmm5 \n" "pxor %%xmm4,%%xmm4 \n" + ".p2align 4 \n" "1: \n" "movd (%1),%%xmm0 \n" "movd (%1,%2,1),%%xmm1 \n" @@ -1562,6 +1594,7 @@ void YToARGBRow_SSE2(const uint8* y_buf, "mov $0x012a012a,%%eax \n" "movd %%eax,%%xmm2 \n" "pshufd $0x0,%%xmm2,%%xmm2 \n" + ".p2align 4 \n" "1: \n" // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164 "movq (%0),%%xmm0 \n" @@ -1607,6 +1640,7 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { asm volatile ( "movdqa %3,%%xmm5 \n" "lea -0x10(%0),%0 \n" + ".p2align 4 \n" "1: \n" "movdqa (%0,%2),%%xmm0 \n" "pshufb %%xmm5,%%xmm0 \n" @@ -1631,6 +1665,7 @@ void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) { intptr_t temp_width = static_cast(width); asm volatile ( "lea -0x10(%0),%0 \n" + ".p2align 4 \n" "1: \n" "movdqu (%0,%2),%%xmm0 \n" "movdqa %%xmm0,%%xmm1 \n" @@ -1668,6 +1703,7 @@ void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, "movdqa %4,%%xmm1 \n" "lea -16(%0,%3,2),%0 \n" "sub %1,%2 \n" + ".p2align 4 \n" "1: \n" "movdqa (%0),%%xmm0 \n" "lea -16(%0),%0 \n" @@ -1695,6 +1731,7 @@ void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, void AddRow_SSE2(const uint8* src, uint16* dst, int width) { asm volatile ( "pxor %%xmm4,%%xmm4 \n" + ".p2align 4 \n" "1: \n" "movdqu (%0),%%xmm2 \n" "lea 0x10(%0),%0 \n" @@ -1725,6 +1762,7 @@ void AddRow_SSE2(const uint8* src, uint16* dst, int width) { void SubRow_SSE2(const uint8* src, uint16* dst, int width) { asm volatile ( "pxor %%xmm4,%%xmm4 \n" + ".p2align 4 \n" "1: \n" "movdqu (%0),%%xmm2 \n" "lea 0x10(%0),%0 \n" @@ -1758,6 +1796,7 @@ void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" "sub %1,%2 \n" + ".p2align 4 \n" "1: \n" "movdqa (%0),%%xmm0 \n" "movdqa 0x10(%0),%%xmm1 \n" @@ -1833,6 +1872,7 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) { asm volatile ( "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" + ".p2align 4 \n" "1: \n" "movdqa (%0),%%xmm0 \n" "movdqa 0x10(%0),%%xmm1 \n" @@ -1861,6 +1901,7 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" "sub %1,%2 \n" + ".p2align 4 \n" "1: \n" "movdqa (%0),%%xmm0 \n" "movdqa 0x10(%0),%%xmm1 \n" @@ 
@@ -1900,6 +1941,7 @@ void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
   asm volatile (
     "pcmpeqb %%xmm5,%%xmm5 \n"
     "psrlw $0x8,%%xmm5 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqu (%0),%%xmm0 \n"
     "movdqu 0x10(%0),%%xmm1 \n"
@@ -1930,6 +1972,7 @@ void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
     "pcmpeqb %%xmm5,%%xmm5 \n"
     "psrlw $0x8,%%xmm5 \n"
     "sub %1,%2 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqu (%0),%%xmm0 \n"
     "movdqu 0x10(%0),%%xmm1 \n"
@@ -1965,6 +2008,7 @@ void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
 void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
   asm volatile (
+    ".p2align 4 \n"
   "1: \n"
     "movdqa (%0),%%xmm0 \n"
     "movdqa 0x10(%0),%%xmm1 \n"
@@ -1993,6 +2037,7 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
     "pcmpeqb %%xmm5,%%xmm5 \n"
     "psrlw $0x8,%%xmm5 \n"
     "sub %1,%2 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqa (%0),%%xmm0 \n"
     "movdqa 0x10(%0),%%xmm1 \n"
@@ -2029,6 +2074,7 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
 void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
   asm volatile (
+    ".p2align 4 \n"
   "1: \n"
     "movdqu (%0),%%xmm0 \n"
     "movdqu 0x10(%0),%%xmm1 \n"
@@ -2057,6 +2103,7 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
     "pcmpeqb %%xmm5,%%xmm5 \n"
     "psrlw $0x8,%%xmm5 \n"
     "sub %1,%2 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqu (%0),%%xmm0 \n"
     "movdqu 0x10(%0),%%xmm1 \n"
@@ -2109,6 +2156,7 @@ void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1,
     "pslld $0x18,%%xmm4 \n"

     // 8 pixel loop
+    ".p2align 4 \n"
   "1: \n"
     "movdqu (%0),%%xmm3 \n"
     "movdqa %%xmm3,%%xmm0 \n"
@@ -2184,6 +2232,7 @@ void ARGBBlendRow1_SSE2(const uint8* src_argb0, const uint8* src_argb1,
     "pslld $0x18,%%xmm4 \n"

     // 1 pixel loop
+    ".p2align 4 \n"
   "1: \n"
     "movd (%0),%%xmm3 \n"
     "lea 0x4(%0),%0 \n"
@@ -2241,6 +2290,7 @@ void ARGBBlendRow_Aligned_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
     "pslld $0x18,%%xmm4 \n"

     // 8 pixel loop
+    ".p2align 4 \n"
   "1: \n"
     "movdqu (%0),%%xmm3 \n"
     "movdqa %%xmm3,%%xmm0 \n"
@@ -2313,6 +2363,7 @@ void ARGBBlendRow1_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
     "pslld $0x18,%%xmm4 \n"

     // 1 pixel loop
+    ".p2align 4 \n"
   "1: \n"
     "movd (%0),%%xmm3 \n"
     "lea 0x4(%0),%0 \n"
@@ -2361,6 +2412,7 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
     "psrld $0x8,%%xmm5 \n"

     // 4 pixel loop
+    ".p2align 4 \n"
   "1: \n"
     "movdqa (%0),%%xmm0 \n"
     "punpcklbw %%xmm0,%%xmm0 \n"
@@ -2415,6 +2467,7 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
     "movdqa %4,%%xmm5 \n"

     // 4 pixel loop
+    ".p2align 4 \n"
   "1: \n"
     "movdqa (%0),%%xmm0 \n"
     "pshufb %%xmm4,%%xmm0 \n"
@@ -2503,6 +2556,7 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
     "pslld $0x18,%%xmm4 \n"

     // 4 pixel loop
+    ".p2align 4 \n"
   "1: \n"
     "movdqa (%0),%%xmm0 \n"
     "movzb 0x3(%0),%3 \n"
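For context on the attenuate kernels just touched: ARGBAttenuateRow premultiplies each pixel's B, G and R channels by its alpha. A scalar sketch of the operation (illustrative only; the exact rounding in libyuv's SIMD paths may differ slightly from this reference):

  #include <stdint.h>

  // Per-pixel alpha premultiply, the operation the ARGBAttenuateRow
  // kernels above vectorize four pixels at a time.
  void ARGBAttenuateRow_sketch(const uint8_t* src_argb, uint8_t* dst_argb,
                               int width) {
    for (int i = 0; i < width; ++i) {
      uint32_t a = src_argb[3];
      dst_argb[0] = static_cast<uint8_t>((src_argb[0] * a) >> 8);  // B
      dst_argb[1] = static_cast<uint8_t>((src_argb[1] * a) >> 8);  // G
      dst_argb[2] = static_cast<uint8_t>((src_argb[2] * a) >> 8);  // R
      dst_argb[3] = static_cast<uint8_t>(a);                       // A kept
      src_argb += 4;
      dst_argb += 4;
    }
  }

ARGBUnattenuateRow, whose loop is also aligned above, is the inverse (divide by alpha), which is why its inner loop loads the alpha byte with movzb before scaling.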
diff --git a/source/scale.cc b/source/scale.cc
index 13ca2288f..044d2ebe8 100644
--- a/source/scale.cc
+++ b/source/scale.cc
@@ -1457,8 +1457,10 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
     lea esi, [esi + 16]
     jg xloop

-    mov al, [esi + edi - 1]
-    mov [esi + edi], al
+    punpckhbw xmm0, xmm0  // duplicate last pixel to allow horizontal filtering
+    pshufhw xmm0, xmm0, 0xff
+    punpckhqdq xmm0, xmm0
+    movdqa [esi + edi], xmm0
     pop edi
     pop esi
     ret
@@ -1471,8 +1473,10 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
     lea esi, [esi + 16]
     jg xloop1

-    mov al, [esi + edi - 1]
-    mov [esi + edi], al
+    punpckhbw xmm0, xmm0  // duplicate last pixel to allow horizontal filtering
+    pshufhw xmm0, xmm0, 0xff
+    punpckhqdq xmm0, xmm0
+    movdqa [esi + edi], xmm0
     pop edi
     pop esi
     ret
@@ -1486,8 +1490,10 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
     lea esi, [esi + 16]
     jg xloop2

-    mov al, [esi + edi - 1]
-    mov [esi + edi], al
+    punpckhbw xmm0, xmm0  // duplicate last pixel to allow horizontal filtering
+    pshufhw xmm0, xmm0, 0xff
+    punpckhqdq xmm0, xmm0
+    movdqa [esi + edi], xmm0
     pop edi
     pop esi
     ret
@@ -1538,8 +1544,11 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
     lea esi, [esi + 16]
     jg xloop

-    mov al, [esi + edi - 1]
-    mov [esi + edi], al
+    punpckhbw xmm0, xmm0  // duplicate last pixel to allow horizontal filtering
+    pshufhw xmm0, xmm0, 0xff
+    punpckhqdq xmm0, xmm0
+    movdqa [esi + edi], xmm0
+
     pop edi
     pop esi
     ret
@@ -1552,8 +1561,10 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
     lea esi, [esi + 16]
     jg xloop1

-    mov al, [esi + edi - 1]
-    mov [esi + edi], al
+    punpckhbw xmm0, xmm0
+    pshufhw xmm0, xmm0, 0xff
+    punpckhqdq xmm0, xmm0
+    movdqa [esi + edi], xmm0
     pop edi
     pop esi
     ret
@@ -1567,8 +1578,10 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
     lea esi, [esi + 16]
     jg xloop2

-    mov al, [esi + edi - 1]
-    mov [esi + edi], al
+    punpckhbw xmm0, xmm0
+    pshufhw xmm0, xmm0, 0xff
+    punpckhqdq xmm0, xmm0
+    movdqa [esi + edi], xmm0
     pop edi
     pop esi
     ret
@@ -1634,6 +1647,7 @@ static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride,
   asm volatile (
     "pcmpeqb %%xmm5,%%xmm5 \n"
     "psrlw $0x8,%%xmm5 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqa (%0),%%xmm0 \n"
     "movdqa 0x10(%0),%%xmm1 \n"
@@ -1658,6 +1672,7 @@ void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
   asm volatile (
     "pcmpeqb %%xmm5,%%xmm5 \n"
     "psrlw $0x8,%%xmm5 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqa (%0),%%xmm0 \n"
     "movdqa 0x10(%0),%%xmm1 \n"
@@ -1692,6 +1707,7 @@ static void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr, int src_stride,
   asm volatile (
     "pcmpeqb %%xmm5,%%xmm5 \n"
     "psrlw $0x8,%%xmm5 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqu (%0),%%xmm0 \n"
     "movdqu 0x10(%0),%%xmm1 \n"
@@ -1717,6 +1733,7 @@ static void ScaleRowDown2Int_Unaligned_SSE2(const uint8* src_ptr,
   asm volatile (
     "pcmpeqb %%xmm5,%%xmm5 \n"
     "psrlw $0x8,%%xmm5 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqu (%0),%%xmm0 \n"
     "movdqu 0x10(%0),%%xmm1 \n"
@@ -1752,6 +1769,7 @@ static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride,
   asm volatile (
     "pcmpeqb %%xmm5,%%xmm5 \n"
     "psrld $0x18,%%xmm5 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqa (%0),%%xmm0 \n"
     "movdqa 0x10(%0),%%xmm1 \n"
@@ -1779,6 +1797,7 @@ static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride,
     "pcmpeqb %%xmm7,%%xmm7 \n"
     "psrlw $0x8,%%xmm7 \n"
     "lea (%4,%4,2),%3 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqa (%0),%%xmm0 \n"
     "movdqa 0x10(%0),%%xmm1 \n"
@@ -1831,6 +1850,7 @@ static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride,
   asm volatile (
     "pcmpeqb %%xmm5,%%xmm5 \n"
     "psrlq $0x38,%%xmm5 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqa (%0),%%xmm0 \n"
     "movdqa 0x10(%0),%%xmm1 \n"
@@ -1860,6 +1880,7 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
   asm volatile (
     "pxor %%xmm4,%%xmm4 \n"
     "sub $0x1,%5 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqa (%0),%%xmm0 \n"
     "mov %0,%3 \n"
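ScaleRowDown2 keeps every second pixel; the pcmpeqb/psrlw mask plus packuswb in the loops above implements that 2:1 decimation 16 output bytes at a time. An intrinsics sketch of the same inner loop, assuming dst_width is a multiple of 16 (illustrative only; libyuv handles remainders elsewhere):

  #include <emmintrin.h>
  #include <stdint.h>

  // Keep every other byte of 32 input bytes, producing 16 output bytes.
  static void ScaleRowDown2_sketch(const uint8_t* src_ptr, uint8_t* dst,
                                   int dst_width) {
    const __m128i mask = _mm_set1_epi16(0x00ff);  // pcmpeqb + psrlw $8
    for (int x = 0; x < dst_width; x += 16) {
      __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src_ptr));
      __m128i b =
          _mm_loadu_si128(reinterpret_cast<const __m128i*>(src_ptr + 16));
      a = _mm_and_si128(a, mask);  // keep even-indexed source pixels
      b = _mm_and_si128(b, mask);
      _mm_storeu_si128(reinterpret_cast<__m128i*>(dst),
                       _mm_packus_epi16(a, b));
      src_ptr += 32;
      dst += 16;
    }
  }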
"movdqa %xmm0,(%esi,%edi,1) \n" "pop %edi \n" "pop %esi \n" "ret \n" @@ -2297,8 +2320,10 @@ extern "C" void ScaleFilterRows_SSE2(uint8* dst_ptr, "lea 0x10(%esi),%esi \n" "jg 2b \n" - "mov -0x1(%esi,%edi,1),%al \n" - "mov %al,(%esi,%edi,1) \n" + "punpckhbw %xmm0,%xmm0 \n" + "pshufhw $0xff,%xmm0,%xmm0 \n" + "punpckhqdq %xmm0,%xmm0 \n" + "movdqa %xmm0,(%esi,%edi,1) \n" "pop %edi \n" "pop %esi \n" "ret \n" @@ -2311,8 +2336,10 @@ extern "C" void ScaleFilterRows_SSE2(uint8* dst_ptr, "lea 0x10(%esi),%esi \n" "jg 3b \n" - "mov -0x1(%esi,%edi,1),%al \n" - "mov %al,(%esi,%edi,1) \n" + "punpckhbw %xmm0,%xmm0 \n" + "pshufhw $0xff,%xmm0,%xmm0 \n" + "punpckhqdq %xmm0,%xmm0 \n" + "movdqa %xmm0,(%esi,%edi,1) \n" "pop %edi \n" "pop %esi \n" "ret \n" @@ -2361,8 +2388,10 @@ extern "C" void ScaleFilterRows_SSSE3(uint8* dst_ptr, "lea 0x10(%esi),%esi \n" "jg 1b \n" - "mov -0x1(%esi,%edi,1),%al \n" - "mov %al,(%esi,%edi,1) \n" + "punpckhbw %xmm0,%xmm0 \n" + "pshufhw $0xff,%xmm0,%xmm0 \n" + "punpckhqdq %xmm0,%xmm0 \n" + "movdqa %xmm0,(%esi,%edi,1) \n" "pop %edi \n" "pop %esi \n" "ret \n" @@ -2374,8 +2403,10 @@ extern "C" void ScaleFilterRows_SSSE3(uint8* dst_ptr, "lea 0x10(%esi),%esi \n" "jg 2b \n" - "mov -0x1(%esi,%edi,1),%al \n" - "mov %al,(%esi,%edi,1) \n" + "punpckhbw %xmm0,%xmm0 \n" + "pshufhw $0xff,%xmm0,%xmm0 \n" + "punpckhqdq %xmm0,%xmm0 \n" + "movdqa %xmm0,(%esi,%edi,1) \n" "pop %edi \n" "pop %esi \n" "ret \n" @@ -2388,8 +2419,10 @@ extern "C" void ScaleFilterRows_SSSE3(uint8* dst_ptr, "lea 0x10(%esi),%esi \n" "jg 3b \n" - "mov -0x1(%esi,%edi,1),%al \n" - "mov %al,(%esi,%edi,1) \n" + "punpckhbw %xmm0,%xmm0 \n" + "pshufhw $0xff,%xmm0,%xmm0 \n" + "punpckhqdq %xmm0,%xmm0 \n" + "movdqa %xmm0,(%esi,%edi,1) \n" "pop %edi \n" "pop %esi \n" "ret \n" @@ -2401,6 +2434,7 @@ static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride, asm volatile ( "lea (%3,%3,2),%%r10 \n" "pxor %%xmm7,%%xmm7 \n" + ".p2align 4 \n" "1:" "movdqa (%0),%%xmm0 \n" "movdqa 0x10(%0),%%xmm1 \n" @@ -2461,6 +2495,7 @@ static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride, "movdqa (%3),%%xmm3 \n" "movdqa (%4),%%xmm4 \n" "movdqa (%5),%%xmm5 \n" + ".p2align 4 \n" "1:" "movdqa (%0),%%xmm0 \n" "movdqa 0x10(%0),%%xmm2 \n" @@ -2496,6 +2531,7 @@ static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride, "movdqa (%8),%%xmm6 \n" // _madd11 "movdqa (%9),%%xmm7 \n" // _round34 "movdqa (%10),%%xmm8 \n" // _madd21 + ".p2align 4 \n" "1:" "movdqa (%0),%%xmm0 \n" "movdqa (%0,%3),%%xmm1 \n" @@ -2553,6 +2589,7 @@ static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride, "movdqa (%8),%%xmm6 \n" // _madd11 "movdqa (%9),%%xmm7 \n" // _round34 "movdqa (%10),%%xmm8 \n" // _madd21 + ".p2align 4 \n" "1:" "movdqa (%0),%%xmm0 \n" "movdqa (%0,%3,1),%%xmm1 \n" @@ -2609,6 +2646,7 @@ static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride, asm volatile ( "movdqa (%3),%%xmm4 \n" "movdqa (%4),%%xmm5 \n" + ".p2align 4 \n" "1:" "movdqa (%0),%%xmm0 \n" "movdqa 0x10(%0),%%xmm1 \n" @@ -2638,6 +2676,7 @@ static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride, "movdqa (%5),%%xmm5 \n" "movdqa (%6),%%xmm6 \n" "pxor %%xmm7,%%xmm7 \n" + ".p2align 4 \n" "1:" "movdqa (%0),%%xmm0 \n" "movdqa (%0,%3,1),%%xmm2 \n" @@ -2695,6 +2734,7 @@ static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride, "movdqa (%5),%%xmm5 \n" "movdqa (%6),%%xmm6 \n" "movdqa (%7),%%xmm7 \n" + ".p2align 4 \n" "1:" "movdqa (%0),%%xmm2 \n" "pavgb (%0,%3,1),%%xmm2 \n" @@ -2733,6 +2773,7 @@ static void 
@@ -2733,6 +2773,7 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr,
                                  int dst_width, int source_y_fraction) {
   if (source_y_fraction == 0) {
     asm volatile (
+      ".p2align 4 \n"
     "1:"
       "movdqa (%1),%%xmm0 \n"
       "lea 0x10(%1),%1 \n"
@@ -2751,6 +2792,7 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr,
     return;
   } else if (source_y_fraction == 128) {
     asm volatile (
+      ".p2align 4 \n"
     "1:"
       "movdqa (%1),%%xmm0 \n"
       "movdqa (%1,%3,1),%%xmm2 \n"
@@ -2781,6 +2823,7 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr,
       "punpcklwd %%xmm5,%%xmm5 \n"
       "pshufd $0x0,%%xmm5,%%xmm5 \n"
       "pxor %%xmm7,%%xmm7 \n"
+      ".p2align 4 \n"
     "1:"
       "movdqa (%1),%%xmm0 \n"
       "movdqa (%1,%4,1),%%xmm2 \n"
@@ -2824,6 +2867,7 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
                                   int dst_width, int source_y_fraction) {
   if (source_y_fraction <= 1) {
     asm volatile (
+      ".p2align 4 \n"
     "1:"
       "movdqa (%1),%%xmm0 \n"
       "lea 0x10(%1),%1 \n"
@@ -2842,6 +2886,7 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
     return;
   } else if (source_y_fraction == 128) {
     asm volatile (
+      ".p2align 4 \n"
     "1:"
       "movdqa (%1),%%xmm0 \n"
       "movdqa (%1,%3,1),%%xmm2 \n"
@@ -2870,6 +2915,7 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
       "movd %%eax,%%xmm5 \n"
       "punpcklwd %%xmm5,%%xmm5 \n"
       "pshufd $0x0,%%xmm5,%%xmm5 \n"
+      ".p2align 4 \n"
     "1:"
       "movdqa (%1),%%xmm0 \n"
       "movdqa (%1,%4,1),%%xmm2 \n"
@@ -2904,12 +2950,13 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
 // CPU agnostic row functions
 static void ScaleRowDown2_C(const uint8* src_ptr, int,
                             uint8* dst, int dst_width) {
-  for (int x = 0; x < dst_width - 1; x += 2) {
+  uint8* dend = dst + dst_width - 1;
+  do {
     dst[0] = src_ptr[0];
     dst[1] = src_ptr[2];
     dst += 2;
     src_ptr += 4;
-  }
+  } while (dst < dend);
   if (dst_width & 1) {
     dst[0] = src_ptr[0];
   }
@@ -2917,28 +2964,30 @@ static void ScaleRowDown2_C(const uint8* src_ptr, int,
 void ScaleRowDown2Int_C(const uint8* src_ptr, int src_stride,
                         uint8* dst, int dst_width) {
-  for (int x = 0; x < dst_width - 1; x += 2) {
-    dst[0] = (src_ptr[0] + src_ptr[1] +
-              src_ptr[src_stride] + src_ptr[src_stride + 1] + 2) >> 2;
-    dst[1] = (src_ptr[2] + src_ptr[3] +
-              src_ptr[src_stride + 2] + src_ptr[src_stride + 3] + 2) >> 2;
+  const uint8* s = src_ptr;
+  const uint8* t = src_ptr + src_stride;
+  uint8* dend = dst + dst_width - 1;
+  do {
+    dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
+    dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2;
     dst += 2;
-    src_ptr += 4;
-  }
+    s += 4;
+    t += 4;
+  } while (dst < dend);
   if (dst_width & 1) {
-    dst[0] = (src_ptr[0] + src_ptr[1] +
-              src_ptr[src_stride] + src_ptr[src_stride + 1] + 2) >> 2;
+    dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
   }
 }

 static void ScaleRowDown4_C(const uint8* src_ptr, int,
                             uint8* dst, int dst_width) {
-  for (int x = 0; x < dst_width - 1; x += 2) {
+  uint8* dend = dst + dst_width - 1;
+  do {
     dst[0] = src_ptr[0];
     dst[1] = src_ptr[4];
     dst += 2;
     src_ptr += 8;
-  }
+  } while (dst < dend);
   if (dst_width & 1) {
     dst[0] = src_ptr[0];
   }
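The C scaler rewrites in these hunks swap an indexed for loop for a do/while that marches dst toward a precomputed end pointer, dropping the separate induction variable x; note the do/while relies on callers guaranteeing dst_width >= 2, since the body runs at least once. The idiom in isolation (this mirrors ScaleRowDown2_C above and is only an illustration):

  #include <stdint.h>

  // March a pointer to a precomputed end instead of counting an index.
  // Assumes dst_width >= 2, as the do/while always executes one pass.
  static void HalveRow_sketch(const uint8_t* src, uint8_t* dst,
                              int dst_width) {
    uint8_t* dend = dst + dst_width - 1;  // bound for the pairwise loop
    do {
      dst[0] = src[0];
      dst[1] = src[2];
      dst += 2;
      src += 4;
    } while (dst < dend);
    if (dst_width & 1) {                  // odd trailing pixel
      dst[0] = src[0];
    }
  }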
@@ -2946,34 +2995,36 @@ static void ScaleRowDown4Int_C(const uint8* src_ptr, int src_stride,
                                uint8* dst, int dst_width) {
-  for (int x = 0; x < dst_width - 1; x += 2) {
+  intptr_t stride = src_stride;
+  uint8* dend = dst + dst_width - 1;
+  do {
     dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
-              src_ptr[src_stride + 0] + src_ptr[src_stride + 1] +
-              src_ptr[src_stride + 2] + src_ptr[src_stride + 3] +
-              src_ptr[src_stride * 2 + 0] + src_ptr[src_stride * 2 + 1] +
-              src_ptr[src_stride * 2 + 2] + src_ptr[src_stride * 2 + 3] +
-              src_ptr[src_stride * 3 + 0] + src_ptr[src_stride * 3 + 1] +
-              src_ptr[src_stride * 3 + 2] + src_ptr[src_stride * 3 + 3] +
+              src_ptr[stride + 0] + src_ptr[stride + 1] +
+              src_ptr[stride + 2] + src_ptr[stride + 3] +
+              src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
+              src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
+              src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
+              src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
               8) >> 4;
     dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] +
-              src_ptr[src_stride + 4] + src_ptr[src_stride + 5] +
-              src_ptr[src_stride + 6] + src_ptr[src_stride + 7] +
-              src_ptr[src_stride * 2 + 4] + src_ptr[src_stride * 2 + 5] +
-              src_ptr[src_stride * 2 + 6] + src_ptr[src_stride * 2 + 7] +
-              src_ptr[src_stride * 3 + 4] + src_ptr[src_stride * 3 + 5] +
-              src_ptr[src_stride * 3 + 6] + src_ptr[src_stride * 3 + 7] +
+              src_ptr[stride + 4] + src_ptr[stride + 5] +
+              src_ptr[stride + 6] + src_ptr[stride + 7] +
+              src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5] +
+              src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7] +
+              src_ptr[stride * 3 + 4] + src_ptr[stride * 3 + 5] +
+              src_ptr[stride * 3 + 6] + src_ptr[stride * 3 + 7] +
               8) >> 4;
     dst += 2;
     src_ptr += 8;
-  }
+  } while (dst < dend);
   if (dst_width & 1) {
     dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
-              src_ptr[src_stride + 0] + src_ptr[src_stride + 1] +
-              src_ptr[src_stride + 2] + src_ptr[src_stride + 3] +
-              src_ptr[src_stride * 2 + 0] + src_ptr[src_stride * 2 + 1] +
-              src_ptr[src_stride * 2 + 2] + src_ptr[src_stride * 2 + 3] +
-              src_ptr[src_stride * 3 + 0] + src_ptr[src_stride * 3 + 1] +
-              src_ptr[src_stride * 3 + 2] + src_ptr[src_stride * 3 + 3] +
+              src_ptr[stride + 0] + src_ptr[stride + 1] +
+              src_ptr[stride + 2] + src_ptr[stride + 3] +
+              src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
+              src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
+              src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
+              src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
               8) >> 4;
   }
 }
@@ -2985,12 +3036,13 @@ static const int kMaxRow12 = kMaxOutputWidth * 2;

 static void ScaleRowDown8_C(const uint8* src_ptr, int,
                             uint8* dst, int dst_width) {
-  for (int x = 0; x < dst_width - 1; x += 2) {
+  uint8* dend = dst + dst_width - 1;
+  do {
     dst[0] = src_ptr[0];
     dst[1] = src_ptr[8];
     dst += 2;
     src_ptr += 16;
-  }
+  } while (dst < dend);
   if (dst_width & 1) {
     dst[0] = src_ptr[0];
   }
@@ -3026,9 +3078,9 @@ static void ScaleRowDown34_C(const uint8* src_ptr, int,
 static void ScaleRowDown34_0_Int_C(const uint8* src_ptr, int src_stride,
                                    uint8* d, int dst_width) {
   assert((dst_width % 3 == 0) && (dst_width > 0));
-  uint8* dend = d + dst_width;
   const uint8* s = src_ptr;
   const uint8* t = src_ptr + src_stride;
+  uint8* dend = d + dst_width;
   do {
     uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
     uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
@@ -3049,9 +3101,9 @@ static void ScaleRowDown34_0_Int_C(const uint8* src_ptr, int src_stride,
 static void ScaleRowDown34_1_Int_C(const uint8* src_ptr, int src_stride,
                                    uint8* d, int dst_width) {
   assert((dst_width % 3 == 0) && (dst_width > 0));
-  uint8* dend = d + dst_width;
   const uint8* s = src_ptr;
   const uint8* t = src_ptr + src_stride;
+  uint8* dend = d + dst_width;
   do {
     uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
     uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
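Hoisting `int src_stride` into a local `intptr_t stride`, as ScaleRowDown4Int_C now does, is presumably about 64-bit code generation: the sign extension happens once, and every `src_ptr[stride * 2 + 1]`-style subscript is then pure pointer-width arithmetic rather than a sign-extend at each use. A minimal sketch of the idea (hypothetical helper, not libyuv code):

  #include <stdint.h>

  // One sign extension up front; indexing below stays pointer-width.
  static uint8_t SampleAt_sketch(const uint8_t* src_ptr, int src_stride,
                                 int row, int col) {
    intptr_t stride = src_stride;        // hoisted widening conversion
    return src_ptr[stride * row + col];
  }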
@@ -3073,8 +3125,8 @@ static void ScaleRowDown34_1_Int_C(const uint8* src_ptr, int src_stride,
 static void ScaleFilterCols34_C(uint8* dst_ptr, const uint8* src_ptr,
                                 int dst_width) {
   assert((dst_width % 3 == 0) && (dst_width > 0));
-  uint8* dend = dst_ptr + dst_width;
   const uint8* s = src_ptr;
+  uint8* dend = dst_ptr + dst_width;
   do {
     dst_ptr[0] = (s[0] * 3 + s[1] * 1 + 2) >> 2;
     dst_ptr[1] = (s[1] * 1 + s[2] * 1 + 1) >> 1;
@@ -3150,20 +3202,21 @@ static void ScaleRowDown38_C(const uint8* src_ptr, int,
 static void ScaleRowDown38_3_Int_C(const uint8* src_ptr, int src_stride,
                                    uint8* dst_ptr, int dst_width) {
   assert((dst_width % 3 == 0) && (dst_width > 0));
+  intptr_t stride = src_stride;
   for (int i = 0; i < dst_width; i += 3) {
     dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
-                  src_ptr[src_stride + 0] + src_ptr[src_stride + 1] +
-                  src_ptr[src_stride + 2] + src_ptr[src_stride * 2 + 0] +
-                  src_ptr[src_stride * 2 + 1] + src_ptr[src_stride * 2 + 2]) *
+                  src_ptr[stride + 0] + src_ptr[stride + 1] +
+                  src_ptr[stride + 2] + src_ptr[stride * 2 + 0] +
+                  src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) *
         (65536 / 9) >> 16;
     dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
-                  src_ptr[src_stride + 3] + src_ptr[src_stride + 4] +
-                  src_ptr[src_stride + 5] + src_ptr[src_stride * 2 + 3] +
-                  src_ptr[src_stride * 2 + 4] + src_ptr[src_stride * 2 + 5]) *
+                  src_ptr[stride + 3] + src_ptr[stride + 4] +
+                  src_ptr[stride + 5] + src_ptr[stride * 2 + 3] +
+                  src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) *
         (65536 / 9) >> 16;
     dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
-                  src_ptr[src_stride + 6] + src_ptr[src_stride + 7] +
-                  src_ptr[src_stride * 2 + 6] + src_ptr[src_stride * 2 + 7]) *
+                  src_ptr[stride + 6] + src_ptr[stride + 7] +
+                  src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) *
         (65536 / 6) >> 16;
     src_ptr += 8;
     dst_ptr += 3;
@@ -3174,15 +3227,16 @@ static void ScaleRowDown38_3_Int_C(const uint8* src_ptr, int src_stride,
 static void ScaleRowDown38_2_Int_C(const uint8* src_ptr, int src_stride,
                                    uint8* dst_ptr, int dst_width) {
   assert((dst_width % 3 == 0) && (dst_width > 0));
+  intptr_t stride = src_stride;
   for (int i = 0; i < dst_width; i += 3) {
     dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
-                  src_ptr[src_stride + 0] + src_ptr[src_stride + 1] +
-                  src_ptr[src_stride + 2]) * (65536 / 6) >> 16;
+                  src_ptr[stride + 0] + src_ptr[stride + 1] +
+                  src_ptr[stride + 2]) * (65536 / 6) >> 16;
     dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
-                  src_ptr[src_stride + 3] + src_ptr[src_stride + 4] +
-                  src_ptr[src_stride + 5]) * (65536 / 6) >> 16;
+                  src_ptr[stride + 3] + src_ptr[stride + 4] +
+                  src_ptr[stride + 5]) * (65536 / 6) >> 16;
     dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
-                  src_ptr[src_stride + 6] + src_ptr[src_stride + 7]) *
+                  src_ptr[stride + 6] + src_ptr[stride + 7]) *
         (65536 / 4) >> 16;
     src_ptr += 8;
     dst_ptr += 3;
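A closing note on the `* (65536 / 9) >> 16` pattern in the 3/8 scalers: it divides by a constant via a 16.16 fixed-point reciprocal. 65536 / 9 is 7281 at compile time, and for the sums these kernels can produce (at most 9 * 255) the multiply-and-shift lands on floor(sum / 9) or at worst one below it, which is acceptable for 8-bit output. A sketch:

  #include <stdint.h>

  // Fixed-point divide-by-9 as used in ScaleRowDown38_3_Int_C above.
  // 65536 / 9 == 7281; e.g. sum == 1800 yields 199 rather than the exact
  // 200, a worst-case error of one for sums up to 9 * 255.
  static uint8_t DivBy9_sketch(uint32_t sum) {
    return static_cast<uint8_t>((sum * (65536 / 9)) >> 16);
  }

The same trick with 65536 / 6 and 65536 / 4 handles the partial 6- and 4-sample sums at the right edge of each group.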