diff --git a/README.chromium b/README.chromium
index 5b350e5d4..d87a8190a 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 207
+Version: 208
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index c4e31c673..579f72071 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,7 +11,7 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 207
+#define LIBYUV_VERSION 208
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_
 
diff --git a/source/compare.cc b/source/compare.cc
index 44c08661c..7d188d082 100644
--- a/source/compare.cc
+++ b/source/compare.cc
@@ -89,6 +89,7 @@ static uint32 SumSquareError_SSE2(const uint8* src_a,
     pxor       xmm5, xmm5
     sub        edx, eax
 
+    align      16
   wloop:
     movdqa     xmm1, [eax]
     movdqa     xmm2, [eax + edx]
diff --git a/source/convert.cc b/source/convert.cc
index aefa6d0e2..02e0a06f7 100644
--- a/source/convert.cc
+++ b/source/convert.cc
@@ -1746,8 +1746,8 @@ int ConvertToI420(const uint8* sample, size_t sample_size,
       r = NV12ToI420Rotate(src, src_width,
                            src_uv, aligned_src_width,
                            y, y_stride,
-                           u, u_stride,
                            v, v_stride,
+                           u, u_stride,
                            dst_width, inv_dst_height, rotation);
       break;
     case FOURCC_M420:
diff --git a/source/convert_from.cc b/source/convert_from.cc
index 5af8eea41..0893eed71 100644
--- a/source/convert_from.cc
+++ b/source/convert_from.cc
@@ -222,6 +222,7 @@ static void I42xToYUY2Row_SSE2(const uint8* src_y,
     mov        ecx, [esp + 8 + 20]   // width
     sub        edx, esi
 
+    align      16
   convertloop:
     movq       xmm2, qword ptr [esi] // U
     movq       xmm3, qword ptr [esi + edx] // V
@@ -260,6 +261,7 @@ static void I42xToUYVYRow_SSE2(const uint8* src_y,
     mov        ecx, [esp + 8 + 20]   // width
     sub        edx, esi
 
+    align      16
   convertloop:
     movq       xmm2, qword ptr [esi] // U
     movq       xmm3, qword ptr [esi + edx] // V
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index 1e77ae723..1ed557b94 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -722,6 +722,7 @@ static void SetRows32_X86(uint8* dst, uint32 v32, int width,
     lea        ecx, [ebp * 4]
     sub        edx, ecx             // stride - width * 4
 
+    align      16
   convertloop:
     mov        ecx, ebp
     rep stosd
diff --git a/source/rotate.cc b/source/rotate.cc
index 310ff4935..670114800 100644
--- a/source/rotate.cc
+++ b/source/rotate.cc
@@ -83,9 +83,11 @@ static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
     mov       edx, [esp + 12 + 12]  // dst
     mov       esi, [esp + 12 + 16]  // dst_stride
     mov       ecx, [esp + 12 + 20]  // width
- convertloop:
+
     // Read in the data from the source pointer.
     // First round of bit swap.
+    align      16
+ convertloop:
     movq      xmm0, qword ptr [eax]
     lea       ebp, [eax + 8]
     movq      xmm1, qword ptr [eax + edi]
@@ -182,6 +184,8 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
     and       esp, ~15
     mov       [esp + 16], ecx
     mov       ecx, [ecx + 16 + 28]  // w
+
+    align      16
  convertloop:
     // Read in the data from the source pointer.
     // First round of bit swap.
diff --git a/source/scale.cc b/source/scale.cc
index 0870ce79f..d0ebb075b 100644
--- a/source/scale.cc
+++ b/source/scale.cc
@@ -685,6 +685,7 @@ static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride,
     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
     psrlw      xmm5, 8
 
+    align      16
   wloop:
     movdqa     xmm0, [eax]
     movdqa     xmm1, [eax + 16]
@@ -714,6 +715,7 @@ void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
     psrlw      xmm5, 8
 
+    align      16
   wloop:
     movdqa     xmm0, [eax]
     movdqa     xmm1, [eax + 16]
@@ -757,6 +759,7 @@ static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride,
     pcmpeqb    xmm5, xmm5            // generate mask 0x000000ff
     psrld      xmm5, 24
 
+    align      16
   wloop:
     movdqa     xmm0, [eax]
     movdqa     xmm1, [eax + 16]
@@ -790,6 +793,7 @@ static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride,
     pcmpeqb    xmm7, xmm7            // generate mask 0x00ff00ff
     psrlw      xmm7, 8
 
+    align      16
   wloop:
     movdqa     xmm0, [eax]
     movdqa     xmm1, [eax + 16]
@@ -848,6 +852,7 @@ static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride,
     pcmpeqb    xmm5, xmm5            // generate mask isolating 1 src 8 bytes
     psrlq      xmm5, 56
 
+    align      16
   wloop:
     movdqa     xmm0, [eax]
     movdqa     xmm1, [eax + 16]
@@ -882,6 +887,7 @@ static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
     lea        edi, [esi + esi * 2]  // src_stride * 3
     pxor       xmm7, xmm7
 
+    align      16
   wloop:
     movdqa     xmm0, [eax]           // average 8 rows to 1
     movdqa     xmm1, [eax + 16]
@@ -957,6 +963,7 @@ static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
     movdqa     xmm4, _shuf1
     movdqa     xmm5, _shuf2
 
+    align      16
   wloop:
     movdqa     xmm0, [eax]
     movdqa     xmm1, [eax + 16]
@@ -1009,6 +1016,7 @@ static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
     movdqa     xmm6, _madd11
     movdqa     xmm7, _round34
 
+    align      16
   wloop:
     movdqa     xmm0, [eax]           // pixels 0..7
     movdqa     xmm1, [eax + esi]
@@ -1066,6 +1074,7 @@ static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
     movdqa     xmm6, _madd11
     movdqa     xmm7, _round34
 
+    align      16
   wloop:
     movdqa     xmm0, [eax]           // pixels 0..7
     movdqa     xmm1, [eax + esi]
@@ -1123,6 +1132,7 @@ static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
     movdqa     xmm4, _shuf38a
     movdqa     xmm5, _shuf38b
 
+    align      16
   xloop:
     movdqa     xmm0, [eax]           // 16 pixels -> 0,1,2,3,4,5
     movdqa     xmm1, [eax + 16]      // 16 pixels -> 6,7,8,9,10,11
@@ -1158,6 +1168,7 @@ static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
     movdqa     xmm6, _scaleac3
     pxor       xmm7, xmm7
 
+    align      16
   xloop:
     movdqa     xmm0, [eax]           // sum up 3 rows into xmm0/1
     movdqa     xmm2, [eax + esi]
@@ -1224,6 +1235,7 @@ static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
     movdqa     xmm6, _shufab2
     movdqa     xmm7, _scaleab2
 
+    align      16
   xloop:
     movdqa     xmm2, [eax]           // average 2 rows into xmm2
     pavgb      xmm2, [eax + esi]
@@ -1256,8 +1268,6 @@ static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
 #define HAS_SCALEADDROWS_SSE2
 
 // Reads 16xN bytes and produces 16 shorts at a time.
-// TODO(fbarchard): support 1 rows
-// TODO(fbarchard): align loops
 __declspec(naked)
 static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
                               uint16* dst_ptr, int src_width,
@@ -1275,6 +1285,7 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
     pxor       xmm4, xmm4
     dec        ebx
 
+    align      16
   xloop:
     // first row
     movdqa     xmm0, [esi]
@@ -1284,8 +1295,11 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
     punpckhbw  xmm1, xmm4
     lea        esi, [esi + 16]
     mov        ebp, ebx
+    test       ebp, ebp
+    je         ydone
 
     // sum remaining rows
+    align      16
   yloop:
     movdqa     xmm2, [eax]       // read 16 pixels
     lea        eax, [eax + edx]  // advance to next row
@@ -1296,7 +1310,7 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
     paddusw    xmm1, xmm3
     sub        ebp, 1
     ja         yloop
-
+  ydone:
     movdqa     [edi], xmm0
     movdqa     [edi + 16], xmm1
     lea        edi, [edi + 32]
@@ -1342,6 +1356,7 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
     pshufd     xmm5, xmm5, 0
     pxor       xmm7, xmm7
 
+    align      16
   xloop:
     movdqa     xmm0, [esi]
     movdqa     xmm2, [esi + edx]
@@ -1371,6 +1386,7 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
     pop        esi
     ret
 
+    align      16
   xloop1:
     movdqa     xmm0, [esi]
     sub        ecx, 16
@@ -1384,6 +1400,7 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
     pop        esi
     ret
 
+    align      16
   xloop2:
     movdqa     xmm0, [esi]
     pavgb      xmm0, [esi + edx]
@@ -1428,6 +1445,7 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
     punpcklwd  xmm5, xmm5
     pshufd     xmm5, xmm5, 0
 
+    align      16
   xloop:
     movdqa     xmm0, [esi]
     movdqa     xmm2, [esi + edx]
@@ -1450,6 +1468,7 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
     pop        esi
     ret
 
+    align      16
   xloop1:
     movdqa     xmm0, [esi]
     sub        ecx, 16
@@ -1463,6 +1482,7 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
     pop        esi
     ret
 
+    align      16
   xloop2:
     movdqa     xmm0, [esi]
     pavgb      xmm0, [esi + edx]
@@ -1496,6 +1516,7 @@ static void ScaleFilterCols34_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
     movdqa     xmm6, _madd11
     movdqa     xmm7, _madd21
 
+    align      16
   wloop:
     movdqa     xmm0, [eax]           // pixels 0..7
     pshufb     xmm0, xmm2
@@ -1712,6 +1733,8 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
     "punpcklbw %%xmm4,%%xmm0                   \n"
     "punpckhbw %%xmm4,%%xmm1                   \n"
     "mov       %5,%2                           \n"
+    "test      %2,%2                           \n"
+    "je        3f                              \n"
   "2:                                          \n"
     "movdqa    (%0),%%xmm2                     \n"
     "add       %6,%0                           \n"
@@ -1722,6 +1745,7 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
     "paddusw   %%xmm3,%%xmm1                   \n"
     "sub       $0x1,%2                         \n"
     "ja        2b                              \n"
+  "3:                                          \n"
     "movdqa    %%xmm0,(%1)                     \n"
     "movdqa    %%xmm1,0x10(%1)                 \n"
     "lea       0x10(%3),%0                     \n"