Add .p2align 4 to all loops, copy stride to a local for scale, and copy the last byte in bilinear filtering more efficiently

BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/547007

git-svn-id: http://libyuv.googlecode.com/svn/trunk@255 16f28f9a-4ce2-e073-06de-1de4eb20be90
fbarchard@google.com 2012-05-02 00:10:16 +00:00
parent f906ae1360
commit 5bf29b59db
10 changed files with 199 additions and 78 deletions
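
The loop-alignment change is mechanical: every hot loop in the inline assembly now emits ".p2align 4" immediately before its "1:" label (or "align 16" in the MSVC blocks) so the loop entry starts on a 16-byte boundary. A minimal standalone sketch of the pattern, assuming GCC/Clang x86/x86-64 inline assembly with SSE2 available, a 16-byte-aligned dst, a count that is a positive multiple of 16, and a hypothetical ZeroFill16_SSE2 helper that is not part of libyuv:

#include <stdint.h>

// Hypothetical helper, for illustration only: fill 'count' bytes with zero,
// 16 bytes per iteration, with the loop head aligned to a 16-byte boundary.
static void ZeroFill16_SSE2(uint8_t* dst, int count) {
  asm volatile (
    "pxor      %%xmm0,%%xmm0                   \n"
    ".p2align  4                               \n"  // pad with nops so label 1 starts 16-byte aligned
  "1:                                          \n"
    "movdqa    %%xmm0,(%0)                     \n"  // aligned 16-byte store
    "lea       0x10(%0),%0                     \n"
    "sub       $0x10,%1                        \n"
    "jg        1b                              \n"
  : "+r"(dst),    // %0
    "+r"(count)   // %1
  :
  : "memory", "cc", "xmm0"
  );
}

The directive only inserts padding ahead of the label; the loop body is untouched, which is why most of the hunks below are single added lines.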

View File

@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 254
Version: 255
License: BSD
License File: LICENSE

View File

@ -11,7 +11,7 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 254
#define LIBYUV_VERSION 255
#endif // INCLUDE_LIBYUV_VERSION_H_

View File

@ -163,6 +163,7 @@ static uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
"movd %2,%%xmm0 \n"
"pxor %%xmm7,%%xmm7 \n"
"movdqa %4,%%xmm6 \n"
".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm1 \n"
"lea 0x10(%0),%0 \n"
@ -331,7 +332,7 @@ static uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b,
"pxor %%xmm0,%%xmm0 \n"
"pxor %%xmm5,%%xmm5 \n"
"sub %0,%1 \n"
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm1 \n"
"movdqa (%0,%1,1),%%xmm2 \n"

View File

@ -74,6 +74,7 @@ static void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
mov ecx, [esp + 4 + 16] // pix
sub edi, eax
align 16
convertloop:
movdqa xmm0, [eax]
pavgb xmm0, [eax + edx]
@ -92,6 +93,7 @@ static void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
uint8* dst_uv, int pix) {
asm volatile (
"sub %0,%1 \n"
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"pavgb (%0,%3),%%xmm0 \n"
@ -467,6 +469,7 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2,
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8
align 16
convertloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
@ -506,6 +509,7 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2, uint8* dst_y,
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"

View File

@ -291,6 +291,7 @@ static void I42xToYUY2Row_SSE2(const uint8* src_y,
uint8* dst_frame, int width) {
asm volatile (
"sub %1,%2 \n"
".p2align 4 \n"
"1: \n"
"movq (%1),%%xmm2 \n"
"movq (%1,%2,1),%%xmm3 \n"
@ -326,6 +327,7 @@ static void I42xToUYVYRow_SSE2(const uint8* src_y,
uint8* dst_frame, int width) {
asm volatile (
"sub %1,%2 \n"
".p2align 4 \n"
"1: \n"
"movq (%1),%%xmm2 \n"
"movq (%1,%2,1),%%xmm3 \n"

View File

@ -57,6 +57,7 @@ static void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
asm volatile (
"movd %3,%%xmm5 \n"
"pshufd $0x0,%%xmm5,%%xmm5 \n"
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"lea 0x10(%0),%0 \n"

View File

@ -288,6 +288,7 @@ static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
asm volatile (
// Read in the data from the source pointer.
// First round of bit swap.
".p2align 4 \n"
"1: \n"
"movq (%0),%%xmm0 \n"
"movq (%0,%3),%%xmm1 \n"
@ -499,6 +500,7 @@ static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride,
asm volatile (
// Read in the data from the source pointer.
// First round of bit swap.
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa (%0,%3),%%xmm1 \n"
@ -639,6 +641,7 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
asm volatile (
// Read in the data from the source pointer.
// First round of bit swap.
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa (%0,%4),%%xmm1 \n"

View File

@ -32,6 +32,7 @@ void TransposeWx8_NEON(const uint8* src, int src_stride,
"sub %4, #8 \n"
// handle 8x8 blocks. this should be the majority of the plane
".p2align 4 \n"
"1: \n"
"mov r9, %0 \n"
@ -198,6 +199,7 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
"sub %6, #8 \n"
// handle 8x8 blocks. this should be the majority of the plane
".p2align 4 \n"
"1: \n"
"mov r9, %0 \n"

View File

@ -112,6 +112,7 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"pslld $0x18,%%xmm5 \n"
".p2align 4 \n"
"1: \n"
"movq (%0),%%xmm0 \n"
"lea 0x8(%0),%0 \n"
@ -141,6 +142,7 @@ void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
asm volatile (
"movdqa %3,%%xmm5 \n"
"sub %0,%1 \n"
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"pshufb %%xmm5,%%xmm0 \n"
@ -164,6 +166,7 @@ void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
asm volatile (
"movdqa %3,%%xmm5 \n"
"sub %0,%1 \n"
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"pshufb %%xmm5,%%xmm0 \n"
@ -187,6 +190,7 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
"pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
"pslld $0x18,%%xmm5 \n"
"movdqa %3,%%xmm4 \n"
".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
@ -227,6 +231,7 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
"pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
"pslld $0x18,%%xmm5 \n"
"movdqa %3,%%xmm4 \n"
".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
@ -279,6 +284,7 @@ void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
"psllw $0x8,%%xmm7 \n"
"sub %0,%1 \n"
"sub %0,%1 \n"
".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
@ -327,6 +333,7 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
"psllw $0x8,%%xmm7 \n"
"sub %0,%1 \n"
"sub %0,%1 \n"
".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
@ -372,6 +379,7 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
"pslld $0x4,%%xmm5 \n"
"sub %0,%1 \n"
"sub %0,%1 \n"
".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqa %%xmm0,%%xmm2 \n"
@ -405,6 +413,7 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
asm volatile (
"movdqa %3,%%xmm6 \n"
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
@ -445,6 +454,7 @@ void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
asm volatile (
"movdqa %3,%%xmm6 \n"
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
@ -491,6 +501,7 @@ void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
"pslld $0x5,%%xmm4 \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"pslld $0xb,%%xmm5 \n"
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
@ -531,6 +542,7 @@ void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
"pslld $0xa,%%xmm6 \n"
"pcmpeqb %%xmm7,%%xmm7 \n"
"pslld $0xf,%%xmm7 \n"
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
@ -570,6 +582,7 @@ void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
"psllw $0xc,%%xmm4 \n"
"movdqa %%xmm4,%%xmm3 \n"
"psrlw $0x8,%%xmm3 \n"
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
@ -599,6 +612,7 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
asm volatile (
"movdqa %4,%%xmm5 \n"
"movdqa %3,%%xmm4 \n"
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
@ -635,6 +649,7 @@ void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
asm volatile (
"movdqa %4,%%xmm5 \n"
"movdqa %3,%%xmm4 \n"
".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
@ -689,6 +704,7 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
);
asm volatile (
"sub %1,%2 \n"
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
@ -753,6 +769,7 @@ void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
);
asm volatile (
"sub %1,%2 \n"
".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
@ -808,6 +825,7 @@ void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
asm volatile (
"movdqa %4,%%xmm5 \n"
"movdqa %3,%%xmm4 \n"
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
@ -844,6 +862,7 @@ void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
asm volatile (
"movdqa %4,%%xmm5 \n"
"movdqa %3,%%xmm4 \n"
".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
@ -893,6 +912,7 @@ void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
);
asm volatile (
"sub %1,%2 \n"
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
@ -957,6 +977,7 @@ void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
);
asm volatile (
"sub %1,%2 \n"
".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
@ -1012,6 +1033,7 @@ void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
asm volatile (
"movdqa %4,%%xmm5 \n"
"movdqa %3,%%xmm4 \n"
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
@ -1048,6 +1070,7 @@ void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
asm volatile (
"movdqa %4,%%xmm5 \n"
"movdqa %3,%%xmm4 \n"
".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
@ -1097,6 +1120,7 @@ void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
);
asm volatile (
"sub %1,%2 \n"
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
@ -1161,6 +1185,7 @@ void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
);
asm volatile (
"sub %1,%2 \n"
".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
@ -1295,6 +1320,7 @@ void OMITFP I420ToARGBRow_SSSE3(const uint8* y_buf,
"sub %1,%2 \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"pxor %%xmm4,%%xmm4 \n"
".p2align 4 \n"
"1: \n"
YUVTORGB
"punpcklbw %%xmm1,%%xmm0 \n"
@ -1329,6 +1355,7 @@ void OMITFP I420ToBGRARow_SSSE3(const uint8* y_buf,
"sub %1,%2 \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"pxor %%xmm4,%%xmm4 \n"
".p2align 4 \n"
"1: \n"
YUVTORGB
"pcmpeqb %%xmm5,%%xmm5 \n"
@ -1364,6 +1391,7 @@ void OMITFP I420ToABGRRow_SSSE3(const uint8* y_buf,
"sub %1,%2 \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"pxor %%xmm4,%%xmm4 \n"
".p2align 4 \n"
"1: \n"
YUVTORGB
"punpcklbw %%xmm1,%%xmm2 \n"
@ -1398,6 +1426,7 @@ void OMITFP I420ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
"sub %1,%2 \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"pxor %%xmm4,%%xmm4 \n"
".p2align 4 \n"
"1: \n"
YUVTORGB
"punpcklbw %%xmm1,%%xmm0 \n"
@ -1432,6 +1461,7 @@ void OMITFP I420ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
"sub %1,%2 \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"pxor %%xmm4,%%xmm4 \n"
".p2align 4 \n"
"1: \n"
YUVTORGB
"pcmpeqb %%xmm5,%%xmm5 \n"
@ -1467,6 +1497,7 @@ void OMITFP I420ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
"sub %1,%2 \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"pxor %%xmm4,%%xmm4 \n"
".p2align 4 \n"
"1: \n"
YUVTORGB
"punpcklbw %%xmm1,%%xmm2 \n"
@ -1501,6 +1532,7 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
"sub %1,%2 \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"pxor %%xmm4,%%xmm4 \n"
".p2align 4 \n"
"1: \n"
"movd (%1),%%xmm0 \n"
"movd (%1,%2,1),%%xmm1 \n"
@ -1562,6 +1594,7 @@ void YToARGBRow_SSE2(const uint8* y_buf,
"mov $0x012a012a,%%eax \n"
"movd %%eax,%%xmm2 \n"
"pshufd $0x0,%%xmm2,%%xmm2 \n"
".p2align 4 \n"
"1: \n"
// Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
"movq (%0),%%xmm0 \n"
@ -1607,6 +1640,7 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
asm volatile (
"movdqa %3,%%xmm5 \n"
"lea -0x10(%0),%0 \n"
".p2align 4 \n"
"1: \n"
"movdqa (%0,%2),%%xmm0 \n"
"pshufb %%xmm5,%%xmm0 \n"
@ -1631,6 +1665,7 @@ void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
intptr_t temp_width = static_cast<intptr_t>(width);
asm volatile (
"lea -0x10(%0),%0 \n"
".p2align 4 \n"
"1: \n"
"movdqu (%0,%2),%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
@ -1668,6 +1703,7 @@ void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
"movdqa %4,%%xmm1 \n"
"lea -16(%0,%3,2),%0 \n"
"sub %1,%2 \n"
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"lea -16(%0),%0 \n"
@ -1695,6 +1731,7 @@ void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
void AddRow_SSE2(const uint8* src, uint16* dst, int width) {
asm volatile (
"pxor %%xmm4,%%xmm4 \n"
".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm2 \n"
"lea 0x10(%0),%0 \n"
@ -1725,6 +1762,7 @@ void AddRow_SSE2(const uint8* src, uint16* dst, int width) {
void SubRow_SSE2(const uint8* src, uint16* dst, int width) {
asm volatile (
"pxor %%xmm4,%%xmm4 \n"
".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm2 \n"
"lea 0x10(%0),%0 \n"
@ -1758,6 +1796,7 @@ void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
"sub %1,%2 \n"
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
@ -1833,6 +1872,7 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
@ -1861,6 +1901,7 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
"sub %1,%2 \n"
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
@ -1900,6 +1941,7 @@ void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
@ -1930,6 +1972,7 @@ void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
"sub %1,%2 \n"
".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
@ -1965,6 +2008,7 @@ void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
asm volatile (
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
@ -1993,6 +2037,7 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
"sub %1,%2 \n"
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
@ -2029,6 +2074,7 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
uint8* dst_y, int pix) {
asm volatile (
".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
@ -2057,6 +2103,7 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
"sub %1,%2 \n"
".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
@ -2109,6 +2156,7 @@ void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1,
"pslld $0x18,%%xmm4 \n"
// 8 pixel loop
".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm3 \n"
"movdqa %%xmm3,%%xmm0 \n"
@ -2184,6 +2232,7 @@ void ARGBBlendRow1_SSE2(const uint8* src_argb0, const uint8* src_argb1,
"pslld $0x18,%%xmm4 \n"
// 1 pixel loop
".p2align 4 \n"
"1: \n"
"movd (%0),%%xmm3 \n"
"lea 0x4(%0),%0 \n"
@ -2241,6 +2290,7 @@ void ARGBBlendRow_Aligned_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
"pslld $0x18,%%xmm4 \n"
// 8 pixel loop
".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm3 \n"
"movdqa %%xmm3,%%xmm0 \n"
@ -2313,6 +2363,7 @@ void ARGBBlendRow1_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
"pslld $0x18,%%xmm4 \n"
// 1 pixel loop
".p2align 4 \n"
"1: \n"
"movd (%0),%%xmm3 \n"
"lea 0x4(%0),%0 \n"
@ -2361,6 +2412,7 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
"psrld $0x8,%%xmm5 \n"
// 4 pixel loop
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"punpcklbw %%xmm0,%%xmm0 \n"
@ -2415,6 +2467,7 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
"movdqa %4,%%xmm5 \n"
// 4 pixel loop
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"pshufb %%xmm4,%%xmm0 \n"
@ -2503,6 +2556,7 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
"pslld $0x18,%%xmm4 \n"
// 4 pixel loop
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movzb 0x3(%0),%3 \n"

View File

@ -1457,8 +1457,10 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
lea esi, [esi + 16]
jg xloop
mov al, [esi + edi - 1]
mov [esi + edi], al
punpckhbw xmm0, xmm0 // duplicate last pixel to allow horizontal filtering
pshufhw xmm0, xmm0, 0xff
punpckhqdq xmm0, xmm0
movdqa [esi + edi], xmm0
pop edi
pop esi
ret
@ -1471,8 +1473,10 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
lea esi, [esi + 16]
jg xloop1
mov al, [esi + edi - 1]
mov [esi + edi], al
punpckhbw xmm0, xmm0 // duplicate last pixel to allow horizontal filtering
pshufhw xmm0, xmm0, 0xff
punpckhqdq xmm0, xmm0
movdqa [esi + edi], xmm0
pop edi
pop esi
ret
@ -1486,8 +1490,10 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
lea esi, [esi + 16]
jg xloop2
mov al, [esi + edi - 1]
mov [esi + edi], al
punpckhbw xmm0, xmm0 // duplicate last pixel to allow horizontal filtering
pshufhw xmm0, xmm0, 0xff
punpckhqdq xmm0, xmm0
movdqa [esi + edi], xmm0
pop edi
pop esi
ret
@ -1538,8 +1544,11 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
lea esi, [esi + 16]
jg xloop
mov al, [esi + edi - 1]
mov [esi + edi], al
punpckhbw xmm0, xmm0 // duplicate last pixel to allow horizontal filtering
pshufhw xmm0, xmm0, 0xff
punpckhqdq xmm0, xmm0
movdqa [esi + edi], xmm0
pop edi
pop esi
ret
@ -1552,8 +1561,10 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
lea esi, [esi + 16]
jg xloop1
mov al, [esi + edi - 1]
mov [esi + edi], al
punpckhbw xmm0, xmm0
pshufhw xmm0, xmm0, 0xff
punpckhqdq xmm0, xmm0
movdqa [esi + edi], xmm0
pop edi
pop esi
ret
@ -1567,8 +1578,10 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
lea esi, [esi + 16]
jg xloop2
mov al, [esi + edi - 1]
mov [esi + edi], al
punpckhbw xmm0, xmm0
pshufhw xmm0, xmm0, 0xff
punpckhqdq xmm0, xmm0
movdqa [esi + edi], xmm0
pop edi
pop esi
ret
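
A rough C++ rendering of the new tail handling above, using SSE2 intrinsics rather than the MASM/inline assembly in the source: instead of copying the single last output byte one position past the end of the row (the old mov al / mov [esi + edi], al pair), the rewritten code broadcasts the last byte of the final 16 results, still sitting in xmm0, across the 16 bytes following the row, so later horizontal filtering can read a full vector past the end. BroadcastTail16 is a hypothetical name, and the aligned store assumes dst + width is 16-byte aligned, as it is on this SIMD path:

#include <emmintrin.h>  // SSE2 intrinsics
#include <stdint.h>

// Hypothetical helper mirroring the punpckhbw / pshufhw / punpckhqdq / movdqa
// sequence added above.  'last16' holds the final 16 filtered output bytes.
static void BroadcastTail16(uint8_t* dst, int width, __m128i last16) {
  __m128i x = _mm_unpackhi_epi8(last16, last16);  // b8 b8 b9 b9 ... b15 b15
  x = _mm_shufflehi_epi16(x, 0xff);               // upper 4 words all become (b15, b15)
  x = _mm_unpackhi_epi64(x, x);                   // 16 copies of the last byte b15
  _mm_store_si128(reinterpret_cast<__m128i*>(dst + width), x);  // one aligned store past the row
}

One 16-byte store replaces a byte-sized load and store through al and reuses data already in a register, which is the "more efficiently" of the commit message.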
@ -1634,6 +1647,7 @@ static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride,
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
@ -1658,6 +1672,7 @@ void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
@ -1692,6 +1707,7 @@ static void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr, int src_stride,
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
@ -1717,6 +1733,7 @@ static void ScaleRowDown2Int_Unaligned_SSE2(const uint8* src_ptr,
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
@ -1752,6 +1769,7 @@ static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride,
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrld $0x18,%%xmm5 \n"
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
@ -1779,6 +1797,7 @@ static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride,
"pcmpeqb %%xmm7,%%xmm7 \n"
"psrlw $0x8,%%xmm7 \n"
"lea (%4,%4,2),%3 \n"
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
@ -1831,6 +1850,7 @@ static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride,
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlq $0x38,%%xmm5 \n"
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
@ -1860,6 +1880,7 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
asm volatile (
"pxor %%xmm4,%%xmm4 \n"
"sub $0x1,%5 \n"
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"mov %0,%3 \n"
@ -2284,8 +2305,10 @@ extern "C" void ScaleFilterRows_SSE2(uint8* dst_ptr,
"lea 0x10(%esi),%esi \n"
"jg 1b \n"
"mov -0x1(%esi,%edi,1),%al \n"
"mov %al,(%esi,%edi,1) \n"
"punpckhbw %xmm0,%xmm0 \n"
"pshufhw $0xff,%xmm0,%xmm0 \n"
"punpckhqdq %xmm0,%xmm0 \n"
"movdqa %xmm0,(%esi,%edi,1) \n"
"pop %edi \n"
"pop %esi \n"
"ret \n"
@ -2297,8 +2320,10 @@ extern "C" void ScaleFilterRows_SSE2(uint8* dst_ptr,
"lea 0x10(%esi),%esi \n"
"jg 2b \n"
"mov -0x1(%esi,%edi,1),%al \n"
"mov %al,(%esi,%edi,1) \n"
"punpckhbw %xmm0,%xmm0 \n"
"pshufhw $0xff,%xmm0,%xmm0 \n"
"punpckhqdq %xmm0,%xmm0 \n"
"movdqa %xmm0,(%esi,%edi,1) \n"
"pop %edi \n"
"pop %esi \n"
"ret \n"
@ -2311,8 +2336,10 @@ extern "C" void ScaleFilterRows_SSE2(uint8* dst_ptr,
"lea 0x10(%esi),%esi \n"
"jg 3b \n"
"mov -0x1(%esi,%edi,1),%al \n"
"mov %al,(%esi,%edi,1) \n"
"punpckhbw %xmm0,%xmm0 \n"
"pshufhw $0xff,%xmm0,%xmm0 \n"
"punpckhqdq %xmm0,%xmm0 \n"
"movdqa %xmm0,(%esi,%edi,1) \n"
"pop %edi \n"
"pop %esi \n"
"ret \n"
@ -2361,8 +2388,10 @@ extern "C" void ScaleFilterRows_SSSE3(uint8* dst_ptr,
"lea 0x10(%esi),%esi \n"
"jg 1b \n"
"mov -0x1(%esi,%edi,1),%al \n"
"mov %al,(%esi,%edi,1) \n"
"punpckhbw %xmm0,%xmm0 \n"
"pshufhw $0xff,%xmm0,%xmm0 \n"
"punpckhqdq %xmm0,%xmm0 \n"
"movdqa %xmm0,(%esi,%edi,1) \n"
"pop %edi \n"
"pop %esi \n"
"ret \n"
@ -2374,8 +2403,10 @@ extern "C" void ScaleFilterRows_SSSE3(uint8* dst_ptr,
"lea 0x10(%esi),%esi \n"
"jg 2b \n"
"mov -0x1(%esi,%edi,1),%al \n"
"mov %al,(%esi,%edi,1) \n"
"punpckhbw %xmm0,%xmm0 \n"
"pshufhw $0xff,%xmm0,%xmm0 \n"
"punpckhqdq %xmm0,%xmm0 \n"
"movdqa %xmm0,(%esi,%edi,1) \n"
"pop %edi \n"
"pop %esi \n"
"ret \n"
@ -2388,8 +2419,10 @@ extern "C" void ScaleFilterRows_SSSE3(uint8* dst_ptr,
"lea 0x10(%esi),%esi \n"
"jg 3b \n"
"mov -0x1(%esi,%edi,1),%al \n"
"mov %al,(%esi,%edi,1) \n"
"punpckhbw %xmm0,%xmm0 \n"
"pshufhw $0xff,%xmm0,%xmm0 \n"
"punpckhqdq %xmm0,%xmm0 \n"
"movdqa %xmm0,(%esi,%edi,1) \n"
"pop %edi \n"
"pop %esi \n"
"ret \n"
@ -2401,6 +2434,7 @@ static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
asm volatile (
"lea (%3,%3,2),%%r10 \n"
"pxor %%xmm7,%%xmm7 \n"
".p2align 4 \n"
"1:"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
@ -2461,6 +2495,7 @@ static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
"movdqa (%3),%%xmm3 \n"
"movdqa (%4),%%xmm4 \n"
"movdqa (%5),%%xmm5 \n"
".p2align 4 \n"
"1:"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm2 \n"
@ -2496,6 +2531,7 @@ static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
"movdqa (%8),%%xmm6 \n" // _madd11
"movdqa (%9),%%xmm7 \n" // _round34
"movdqa (%10),%%xmm8 \n" // _madd21
".p2align 4 \n"
"1:"
"movdqa (%0),%%xmm0 \n"
"movdqa (%0,%3),%%xmm1 \n"
@ -2553,6 +2589,7 @@ static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
"movdqa (%8),%%xmm6 \n" // _madd11
"movdqa (%9),%%xmm7 \n" // _round34
"movdqa (%10),%%xmm8 \n" // _madd21
".p2align 4 \n"
"1:"
"movdqa (%0),%%xmm0 \n"
"movdqa (%0,%3,1),%%xmm1 \n"
@ -2609,6 +2646,7 @@ static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
asm volatile (
"movdqa (%3),%%xmm4 \n"
"movdqa (%4),%%xmm5 \n"
".p2align 4 \n"
"1:"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
@ -2638,6 +2676,7 @@ static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
"movdqa (%5),%%xmm5 \n"
"movdqa (%6),%%xmm6 \n"
"pxor %%xmm7,%%xmm7 \n"
".p2align 4 \n"
"1:"
"movdqa (%0),%%xmm0 \n"
"movdqa (%0,%3,1),%%xmm2 \n"
@ -2695,6 +2734,7 @@ static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
"movdqa (%5),%%xmm5 \n"
"movdqa (%6),%%xmm6 \n"
"movdqa (%7),%%xmm7 \n"
".p2align 4 \n"
"1:"
"movdqa (%0),%%xmm2 \n"
"pavgb (%0,%3,1),%%xmm2 \n"
@ -2733,6 +2773,7 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr,
int dst_width, int source_y_fraction) {
if (source_y_fraction == 0) {
asm volatile (
".p2align 4 \n"
"1:"
"movdqa (%1),%%xmm0 \n"
"lea 0x10(%1),%1 \n"
@ -2751,6 +2792,7 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr,
return;
} else if (source_y_fraction == 128) {
asm volatile (
".p2align 4 \n"
"1:"
"movdqa (%1),%%xmm0 \n"
"movdqa (%1,%3,1),%%xmm2 \n"
@ -2781,6 +2823,7 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr,
"punpcklwd %%xmm5,%%xmm5 \n"
"pshufd $0x0,%%xmm5,%%xmm5 \n"
"pxor %%xmm7,%%xmm7 \n"
".p2align 4 \n"
"1:"
"movdqa (%1),%%xmm0 \n"
"movdqa (%1,%4,1),%%xmm2 \n"
@ -2824,6 +2867,7 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
int dst_width, int source_y_fraction) {
if (source_y_fraction <= 1) {
asm volatile (
".p2align 4 \n"
"1:"
"movdqa (%1),%%xmm0 \n"
"lea 0x10(%1),%1 \n"
@ -2842,6 +2886,7 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
return;
} else if (source_y_fraction == 128) {
asm volatile (
".p2align 4 \n"
"1:"
"movdqa (%1),%%xmm0 \n"
"movdqa (%1,%3,1),%%xmm2 \n"
@ -2870,6 +2915,7 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
"movd %%eax,%%xmm5 \n"
"punpcklwd %%xmm5,%%xmm5 \n"
"pshufd $0x0,%%xmm5,%%xmm5 \n"
".p2align 4 \n"
"1:"
"movdqa (%1),%%xmm0 \n"
"movdqa (%1,%4,1),%%xmm2 \n"
@ -2904,12 +2950,13 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
// CPU agnostic row functions
static void ScaleRowDown2_C(const uint8* src_ptr, int,
uint8* dst, int dst_width) {
for (int x = 0; x < dst_width - 1; x += 2) {
uint8* dend = dst + dst_width - 1;
do {
dst[0] = src_ptr[0];
dst[1] = src_ptr[2];
dst += 2;
src_ptr += 4;
}
} while (dst < dend);
if (dst_width & 1) {
dst[0] = src_ptr[0];
}
@ -2917,28 +2964,30 @@ static void ScaleRowDown2_C(const uint8* src_ptr, int,
void ScaleRowDown2Int_C(const uint8* src_ptr, int src_stride,
uint8* dst, int dst_width) {
for (int x = 0; x < dst_width - 1; x += 2) {
dst[0] = (src_ptr[0] + src_ptr[1] +
src_ptr[src_stride] + src_ptr[src_stride + 1] + 2) >> 2;
dst[1] = (src_ptr[2] + src_ptr[3] +
src_ptr[src_stride + 2] + src_ptr[src_stride + 3] + 2) >> 2;
const uint8* s = src_ptr;
const uint8* t = src_ptr + src_stride;
uint8* dend = dst + dst_width - 1;
do {
dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2;
dst += 2;
src_ptr += 4;
}
s += 4;
t += 4;
} while (dst < dend);
if (dst_width & 1) {
dst[0] = (src_ptr[0] + src_ptr[1] +
src_ptr[src_stride] + src_ptr[src_stride + 1] + 2) >> 2;
dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
}
}
static void ScaleRowDown4_C(const uint8* src_ptr, int,
uint8* dst, int dst_width) {
for (int x = 0; x < dst_width - 1; x += 2) {
uint8* dend = dst + dst_width - 1;
do {
dst[0] = src_ptr[0];
dst[1] = src_ptr[4];
dst += 2;
src_ptr += 8;
}
} while (dst < dend);
if (dst_width & 1) {
dst[0] = src_ptr[0];
}
@ -2946,34 +2995,36 @@ static void ScaleRowDown4_C(const uint8* src_ptr, int,
static void ScaleRowDown4Int_C(const uint8* src_ptr, int src_stride,
uint8* dst, int dst_width) {
for (int x = 0; x < dst_width - 1; x += 2) {
intptr_t stride = src_stride;
uint8* dend = dst + dst_width - 1;
do {
dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
src_ptr[src_stride + 0] + src_ptr[src_stride + 1] +
src_ptr[src_stride + 2] + src_ptr[src_stride + 3] +
src_ptr[src_stride * 2 + 0] + src_ptr[src_stride * 2 + 1] +
src_ptr[src_stride * 2 + 2] + src_ptr[src_stride * 2 + 3] +
src_ptr[src_stride * 3 + 0] + src_ptr[src_stride * 3 + 1] +
src_ptr[src_stride * 3 + 2] + src_ptr[src_stride * 3 + 3] +
src_ptr[stride + 0] + src_ptr[stride + 1] +
src_ptr[stride + 2] + src_ptr[stride + 3] +
src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
8) >> 4;
dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] +
src_ptr[src_stride + 4] + src_ptr[src_stride + 5] +
src_ptr[src_stride + 6] + src_ptr[src_stride + 7] +
src_ptr[src_stride * 2 + 4] + src_ptr[src_stride * 2 + 5] +
src_ptr[src_stride * 2 + 6] + src_ptr[src_stride * 2 + 7] +
src_ptr[src_stride * 3 + 4] + src_ptr[src_stride * 3 + 5] +
src_ptr[src_stride * 3 + 6] + src_ptr[src_stride * 3 + 7] +
src_ptr[stride + 4] + src_ptr[stride + 5] +
src_ptr[stride + 6] + src_ptr[stride + 7] +
src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5] +
src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7] +
src_ptr[stride * 3 + 4] + src_ptr[stride * 3 + 5] +
src_ptr[stride * 3 + 6] + src_ptr[stride * 3 + 7] +
8) >> 4;
dst += 2;
src_ptr += 8;
}
} while (dst < dend);
if (dst_width & 1) {
dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
src_ptr[src_stride + 0] + src_ptr[src_stride + 1] +
src_ptr[src_stride + 2] + src_ptr[src_stride + 3] +
src_ptr[src_stride * 2 + 0] + src_ptr[src_stride * 2 + 1] +
src_ptr[src_stride * 2 + 2] + src_ptr[src_stride * 2 + 3] +
src_ptr[src_stride * 3 + 0] + src_ptr[src_stride * 3 + 1] +
src_ptr[src_stride * 3 + 2] + src_ptr[src_stride * 3 + 3] +
src_ptr[stride + 0] + src_ptr[stride + 1] +
src_ptr[stride + 2] + src_ptr[stride + 3] +
src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
8) >> 4;
}
}
@ -2985,12 +3036,13 @@ static const int kMaxRow12 = kMaxOutputWidth * 2;
static void ScaleRowDown8_C(const uint8* src_ptr, int,
uint8* dst, int dst_width) {
for (int x = 0; x < dst_width - 1; x += 2) {
uint8* dend = dst + dst_width - 1;
do {
dst[0] = src_ptr[0];
dst[1] = src_ptr[8];
dst += 2;
src_ptr += 16;
}
} while (dst < dend);
if (dst_width & 1) {
dst[0] = src_ptr[0];
}
@ -3026,9 +3078,9 @@ static void ScaleRowDown34_C(const uint8* src_ptr, int,
static void ScaleRowDown34_0_Int_C(const uint8* src_ptr, int src_stride,
uint8* d, int dst_width) {
assert((dst_width % 3 == 0) && (dst_width > 0));
uint8* dend = d + dst_width;
const uint8* s = src_ptr;
const uint8* t = src_ptr + src_stride;
uint8* dend = d + dst_width;
do {
uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
@ -3049,9 +3101,9 @@ static void ScaleRowDown34_0_Int_C(const uint8* src_ptr, int src_stride,
static void ScaleRowDown34_1_Int_C(const uint8* src_ptr, int src_stride,
uint8* d, int dst_width) {
assert((dst_width % 3 == 0) && (dst_width > 0));
uint8* dend = d + dst_width;
const uint8* s = src_ptr;
const uint8* t = src_ptr + src_stride;
uint8* dend = d + dst_width;
do {
uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
@ -3073,8 +3125,8 @@ static void ScaleRowDown34_1_Int_C(const uint8* src_ptr, int src_stride,
static void ScaleFilterCols34_C(uint8* dst_ptr, const uint8* src_ptr,
int dst_width) {
assert((dst_width % 3 == 0) && (dst_width > 0));
uint8* dend = dst_ptr + dst_width;
const uint8* s = src_ptr;
uint8* dend = dst_ptr + dst_width;
do {
dst_ptr[0] = (s[0] * 3 + s[1] * 1 + 2) >> 2;
dst_ptr[1] = (s[1] * 1 + s[2] * 1 + 1) >> 1;
@ -3150,20 +3202,21 @@ static void ScaleRowDown38_C(const uint8* src_ptr, int,
static void ScaleRowDown38_3_Int_C(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width) {
assert((dst_width % 3 == 0) && (dst_width > 0));
intptr_t stride = src_stride;
for (int i = 0; i < dst_width; i += 3) {
dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
src_ptr[src_stride + 0] + src_ptr[src_stride + 1] +
src_ptr[src_stride + 2] + src_ptr[src_stride * 2 + 0] +
src_ptr[src_stride * 2 + 1] + src_ptr[src_stride * 2 + 2]) *
src_ptr[stride + 0] + src_ptr[stride + 1] +
src_ptr[stride + 2] + src_ptr[stride * 2 + 0] +
src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) *
(65536 / 9) >> 16;
dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
src_ptr[src_stride + 3] + src_ptr[src_stride + 4] +
src_ptr[src_stride + 5] + src_ptr[src_stride * 2 + 3] +
src_ptr[src_stride * 2 + 4] + src_ptr[src_stride * 2 + 5]) *
src_ptr[stride + 3] + src_ptr[stride + 4] +
src_ptr[stride + 5] + src_ptr[stride * 2 + 3] +
src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) *
(65536 / 9) >> 16;
dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
src_ptr[src_stride + 6] + src_ptr[src_stride + 7] +
src_ptr[src_stride * 2 + 6] + src_ptr[src_stride * 2 + 7]) *
src_ptr[stride + 6] + src_ptr[stride + 7] +
src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) *
(65536 / 6) >> 16;
src_ptr += 8;
dst_ptr += 3;
@ -3174,15 +3227,16 @@ static void ScaleRowDown38_3_Int_C(const uint8* src_ptr, int src_stride,
static void ScaleRowDown38_2_Int_C(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width) {
assert((dst_width % 3 == 0) && (dst_width > 0));
intptr_t stride = src_stride;
for (int i = 0; i < dst_width; i += 3) {
dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
src_ptr[src_stride + 0] + src_ptr[src_stride + 1] +
src_ptr[src_stride + 2]) * (65536 / 6) >> 16;
src_ptr[stride + 0] + src_ptr[stride + 1] +
src_ptr[stride + 2]) * (65536 / 6) >> 16;
dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
src_ptr[src_stride + 3] + src_ptr[src_stride + 4] +
src_ptr[src_stride + 5]) * (65536 / 6) >> 16;
src_ptr[stride + 3] + src_ptr[stride + 4] +
src_ptr[stride + 5]) * (65536 / 6) >> 16;
dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
src_ptr[src_stride + 6] + src_ptr[src_stride + 7]) *
src_ptr[stride + 6] + src_ptr[stride + 7]) *
(65536 / 4) >> 16;
src_ptr += 8;
dst_ptr += 3;
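
For the plain-C scale rows above, the rewrite follows one pattern: hoist the row addressing out of the loop and walk pointers against a precomputed end pointer instead of keeping an index. ScaleRowDown2Int_C now advances two source pointers, while the 4x and 3/8 variants copy src_stride into a local intptr_t, presumably so the compiler keeps a pointer-sized stride in a register rather than re-reading the int argument in every subscript. A self-contained sketch of the same pattern, with a hypothetical name and uint8_t standing in for libyuv's uint8; it assumes dst_width >= 2, since the do/while executes at least once:

#include <stdint.h>

// Hypothetical 2x2 box-filter row mirroring the rewritten ScaleRowDown2Int_C.
static void HalveRowBox_C(const uint8_t* src_ptr, int src_stride,
                          uint8_t* dst, int dst_width) {
  const uint8_t* s = src_ptr;               // current row
  const uint8_t* t = src_ptr + src_stride;  // row below
  uint8_t* dend = dst + dst_width - 1;      // stop before the odd tail pixel
  do {
    dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;  // rounded 2x2 average
    dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2;
    dst += 2;
    s += 4;
    t += 4;
  } while (dst < dend);
  if (dst_width & 1) {                      // odd width: one last pixel
    dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
  }
}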