diff --git a/README.chromium b/README.chromium
index 8864a09a8..d76316cdc 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 820
+Version: 821
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index e56684bd2..889e398e7 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 820
+#define LIBYUV_VERSION 821
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index e7c4bcebf..03c504e15 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -1134,9 +1134,7 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
   }
 #endif
 #if defined(HAS_ARGBATTENUATEROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 4 &&
-      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 4) {
     ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
     if (IS_ALIGNED(width, 4)) {
       ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
@@ -1191,9 +1189,7 @@ int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
   void (*ARGBUnattenuateRow)(const uint8* src_argb, uint8* dst_argb,
                              int width) = ARGBUnattenuateRow_C;
 #if defined(HAS_ARGBUNATTENUATEROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && width >= 4 &&
-      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
     ARGBUnattenuateRow = ARGBUnattenuateRow_Any_SSE2;
     if (IS_ALIGNED(width, 4)) {
       ARGBUnattenuateRow = ARGBUnattenuateRow_SSE2;
diff --git a/source/row_posix.cc b/source/row_posix.cc
index 01bb29e44..1ed20f41b 100644
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -4117,17 +4117,17 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
     // 4 pixel loop.
     ".p2align  4                               \n"
   "1:                                          \n"
-    "movdqa    "MEMACCESS(0)",%%xmm0           \n"
+    "movdqu    "MEMACCESS(0)",%%xmm0           \n"
     "pshufb    %%xmm4,%%xmm0                   \n"
-    "movdqa    "MEMACCESS(0)",%%xmm1           \n"
+    "movdqu    "MEMACCESS(0)",%%xmm1           \n"
     "punpcklbw %%xmm1,%%xmm1                   \n"
     "pmulhuw   %%xmm1,%%xmm0                   \n"
-    "movdqa    "MEMACCESS(0)",%%xmm1           \n"
+    "movdqu    "MEMACCESS(0)",%%xmm1           \n"
     "pshufb    %%xmm5,%%xmm1                   \n"
-    "movdqa    "MEMACCESS(0)",%%xmm2           \n"
+    "movdqu    "MEMACCESS(0)",%%xmm2           \n"
     "punpckhbw %%xmm2,%%xmm2                   \n"
     "pmulhuw   %%xmm2,%%xmm1                   \n"
-    "movdqa    "MEMACCESS(0)",%%xmm2           \n"
+    "movdqu    "MEMACCESS(0)",%%xmm2           \n"
     "lea       "MEMLEA(0x10,0)",%0             \n"
     "pand      %%xmm3,%%xmm2                   \n"
     "psrlw     $0x8,%%xmm0                     \n"
@@ -4135,7 +4135,7 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
     "packuswb  %%xmm1,%%xmm0                   \n"
     "por       %%xmm2,%%xmm0                   \n"
     "sub       $0x4,%2                         \n"
-    "movdqa    %%xmm0,"MEMACCESS(1)"           \n"
+    "movdqu    %%xmm0,"MEMACCESS(1)"           \n"
     "lea       "MEMLEA(0x10,1)",%1             \n"
     "jg        1b                              \n"
   : "+r"(src_argb),    // %0
@@ -4161,7 +4161,7 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
     // 4 pixel loop.
     ".p2align  4                               \n"
   "1:                                          \n"
-    "movdqa    "MEMACCESS(0)",%%xmm0           \n"
+    "movdqu    "MEMACCESS(0)",%%xmm0           \n"
     "movzb     "MEMACCESS2(0x03,0)",%3         \n"
     "punpcklbw %%xmm0,%%xmm0                   \n"
     MEMOPREG(movd,0x00,4,3,4,xmm2)             //  movd      0x0(%4,%3,4),%%xmm2
@@ -4171,7 +4171,7 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
     "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
     "movlhps   %%xmm3,%%xmm2                   \n"
     "pmulhuw   %%xmm2,%%xmm0                   \n"
-    "movdqa    "MEMACCESS(0)",%%xmm1           \n"
+    "movdqu    "MEMACCESS(0)",%%xmm1           \n"
     "movzb     "MEMACCESS2(0x0b,0)",%3         \n"
     "punpckhbw %%xmm1,%%xmm1                   \n"
     MEMOPREG(movd,0x00,4,3,4,xmm2)             //  movd      0x0(%4,%3,4),%%xmm2
@@ -4184,7 +4184,7 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
     "lea       "MEMLEA(0x10,0)",%0             \n"
     "packuswb  %%xmm1,%%xmm0                   \n"
     "sub       $0x4,%2                         \n"
-    "movdqa    %%xmm0,"MEMACCESS(1)"           \n"
+    "movdqu    %%xmm0,"MEMACCESS(1)"           \n"
     "lea       "MEMLEA(0x10,1)",%1             \n"
     "jg        1b                              \n"
   : "+r"(src_argb),    // %0
diff --git a/source/row_win.cc b/source/row_win.cc
index 86d91262f..1f232a83c 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -4586,7 +4586,7 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
     mov        esi, [esp + 4 + 8]   // src_argb1
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
-    pcmpeqb    xmm7, xmm7       // generate constant 1
+    pcmpeqb    xmm7, xmm7       // generate constant 0x0001
     psrlw      xmm7, 15
     pcmpeqb    xmm6, xmm6       // generate mask 0x00ff00ff
     psrlw      xmm6, 8
@@ -4788,17 +4788,17 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
 
     align      16
  convertloop:
-    movdqa     xmm0, [eax]      // read 4 pixels
+    movdqu     xmm0, [eax]      // read 4 pixels
     pshufb     xmm0, xmm4       // isolate first 2 alphas
-    movdqa     xmm1, [eax]      // read 4 pixels
+    movdqu     xmm1, [eax]      // read 4 pixels
     punpcklbw  xmm1, xmm1       // first 2 pixel rgbs
     pmulhuw    xmm0, xmm1       // rgb * a
-    movdqa     xmm1, [eax]      // read 4 pixels
+    movdqu     xmm1, [eax]      // read 4 pixels
     pshufb     xmm1, xmm5       // isolate next 2 alphas
-    movdqa     xmm2, [eax]      // read 4 pixels
+    movdqu     xmm2, [eax]      // read 4 pixels
     punpckhbw  xmm2, xmm2       // next 2 pixel rgbs
     pmulhuw    xmm1, xmm2       // rgb * a
-    movdqa     xmm2, [eax]      // mask original alpha
+    movdqu     xmm2, [eax]      // mask original alpha
     lea        eax, [eax + 16]
     pand       xmm2, xmm3
     psrlw      xmm0, 8
@@ -4806,7 +4806,7 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
     packuswb   xmm0, xmm1
     por        xmm0, xmm2       // copy original alpha
     sub        ecx, 4
-    movdqa     [edx], xmm0
+    movdqu     [edx], xmm0
     lea        edx, [edx + 16]
     jg         convertloop
 
@@ -4874,7 +4874,7 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
 
     align      16
  convertloop:
-    movdqa     xmm0, [eax]      // read 4 pixels
+    movdqu     xmm0, [eax]      // read 4 pixels
     movzx      esi, byte ptr [eax + 3]  // first alpha
     movzx      edi, byte ptr [eax + 7]  // second alpha
     punpcklbw  xmm0, xmm0       // first 2
@@ -4885,7 +4885,7 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
     movlhps    xmm2, xmm3
     pmulhuw    xmm0, xmm2       // rgb * a
 
-    movdqa     xmm1, [eax]      // read 4 pixels
+    movdqu     xmm1, [eax]      // read 4 pixels
     movzx      esi, byte ptr [eax + 11]  // third alpha
     movzx      edi, byte ptr [eax + 15]  // forth alpha
     punpckhbw  xmm1, xmm1       // next 2
@@ -4899,7 +4899,7 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
 
     packuswb   xmm0, xmm1
     sub        ecx, 4
-    movdqa     [edx], xmm0
+    movdqu     [edx], xmm0
     lea        edx, [edx + 16]
     jg         convertloop
     pop        edi