From f8e90176855a21248ef5213b34dadd46118e76fc Mon Sep 17 00:00:00 2001
From: "fbarchard@google.com"
 <fbarchard@google.com@16f28f9a-4ce2-e073-06de-1de4eb20be90>
Date: Tue, 2 Apr 2013 21:18:12 +0000
Subject: [PATCH] switch toyuy2 from aligned to unaligned BUG=211
 TESTED=ToYUY2* Review URL: https://webrtc-codereview.appspot.com/1274005

git-svn-id: http://libyuv.googlecode.com/svn/trunk@633 16f28f9a-4ce2-e073-06de-1de4eb20be90
---
 README.chromium             |  2 +-
 include/libyuv/version.h    |  2 +-
 source/convert_from.cc      | 16 ++++------------
 source/convert_from_argb.cc |  6 ++----
 source/row_posix.cc         | 12 ++++++------
 source/row_win.cc           | 14 +++++++-------
 6 files changed, 21 insertions(+), 31 deletions(-)

diff --git a/README.chromium b/README.chromium
index a7ababfa3..a3ce58217 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 632
+Version: 633
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 6724efcbf..e4e6266c5 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 632
+#define LIBYUV_VERSION 633
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
diff --git a/source/convert_from.cc b/source/convert_from.cc
index f33702370..b0de08549 100644
--- a/source/convert_from.cc
+++ b/source/convert_from.cc
@@ -252,9 +252,7 @@ int I422ToYUY2(const uint8* src_y, int src_stride_y,
                         const uint8* src_v, uint8* dst_yuy2, int width) =
       I422ToYUY2Row_C;
 #if defined(HAS_I422TOYUY2ROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && width >= 16 &&
-      IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
-      IS_ALIGNED(dst_yuy2, 16) && IS_ALIGNED(dst_stride_yuy2, 16)) {
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
     I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
     if (IS_ALIGNED(width, 16)) {
       I422ToYUY2Row = I422ToYUY2Row_SSE2;
@@ -299,9 +297,7 @@ int I420ToYUY2(const uint8* src_y, int src_stride_y,
                         const uint8* src_v, uint8* dst_yuy2, int width) =
       I422ToYUY2Row_C;
 #if defined(HAS_I422TOYUY2ROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && width >= 16 &&
-      IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
-      IS_ALIGNED(dst_yuy2, 16) && IS_ALIGNED(dst_stride_yuy2, 16)) {
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
     I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
     if (IS_ALIGNED(width, 16)) {
       I422ToYUY2Row = I422ToYUY2Row_SSE2;
@@ -362,9 +358,7 @@ int I422ToUYVY(const uint8* src_y, int src_stride_y,
                         const uint8* src_v, uint8* dst_uyvy, int width) =
       I422ToUYVYRow_C;
 #if defined(HAS_I422TOUYVYROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && width >= 16 &&
-      IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
-      IS_ALIGNED(dst_uyvy, 16) && IS_ALIGNED(dst_stride_uyvy, 16)) {
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
     I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
     if (IS_ALIGNED(width, 16)) {
       I422ToUYVYRow = I422ToUYVYRow_SSE2;
@@ -409,9 +403,7 @@ int I420ToUYVY(const uint8* src_y, int src_stride_y,
                         const uint8* src_v, uint8* dst_uyvy, int width) =
       I422ToUYVYRow_C;
 #if defined(HAS_I422TOUYVYROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && width >= 16 &&
-      IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
-      IS_ALIGNED(dst_uyvy, 16) && IS_ALIGNED(dst_stride_uyvy, 16)) {
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
     I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
     if (IS_ALIGNED(width, 16)) {
       I422ToUYVYRow = I422ToUYVYRow_SSE2;
diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc
index 7e03bd9f1..cc5171ff3 100644
--- a/source/convert_from_argb.cc
+++ b/source/convert_from_argb.cc
@@ -531,8 +531,7 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
                         const uint8* src_v, uint8* dst_yuy2, int width) =
       I422ToYUY2Row_C;
 #if defined(HAS_I422TOYUY2ROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && width >= 16 &&
-      IS_ALIGNED(dst_yuy2, 16) && IS_ALIGNED(dst_stride_yuy2, 16)) {
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
     I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
     if (IS_ALIGNED(width, 16)) {
       I422ToYUY2Row = I422ToYUY2Row_SSE2;
@@ -628,8 +627,7 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
                         const uint8* src_v, uint8* dst_uyvy, int width) =
       I422ToUYVYRow_C;
 #if defined(HAS_I422TOUYVYROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && width >= 16 &&
-      IS_ALIGNED(dst_uyvy, 16) && IS_ALIGNED(dst_stride_uyvy, 16)) {
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
     I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
     if (IS_ALIGNED(width, 16)) {
       I422ToUYVYRow = I422ToUYVYRow_SSE2;
diff --git a/source/row_posix.cc b/source/row_posix.cc
index 74ae032b4..83a440787 100644
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -5119,11 +5119,11 @@ void I422ToYUY2Row_SSE2(const uint8* src_y,
     "punpcklbw %%xmm3,%%xmm2                     \n"
-    "movdqa    (%0),%%xmm0                       \n"
+    "movdqu    (%0),%%xmm0                       \n"
     "lea       0x10(%0),%0                       \n"
     "movdqa    %%xmm0,%%xmm1                     \n"
     "punpcklbw %%xmm2,%%xmm0                     \n"
     "punpckhbw %%xmm2,%%xmm1                     \n"
-    "movdqa    %%xmm0,(%3)                       \n"
-    "movdqa    %%xmm1,0x10(%3)                   \n"
+    "movdqu    %%xmm0,(%3)                       \n"
+    "movdqu    %%xmm1,0x10(%3)                   \n"
     "lea       0x20(%3),%3                       \n"
     "sub       $0x10,%4                          \n"
     "jg         1b                               \n"
@@ -5152,13 +5152,13 @@ void I422ToUYVYRow_SSE2(const uint8* src_y,
     "movq      (%1,%2,1),%%xmm3                  \n"
     "lea       0x8(%1),%1                        \n"
     "punpcklbw %%xmm3,%%xmm2                     \n"
-    "movdqa    (%0),%%xmm0                       \n"
+    "movdqu    (%0),%%xmm0                       \n"
     "movdqa    %%xmm2,%%xmm1                     \n"
     "lea       0x10(%0),%0                       \n"
     "punpcklbw %%xmm0,%%xmm1                     \n"
     "punpckhbw %%xmm0,%%xmm2                     \n"
-    "movdqa    %%xmm1,(%3)                       \n"
-    "movdqa    %%xmm2,0x10(%3)                   \n"
+    "movdqu    %%xmm1,(%3)                       \n"
+    "movdqu    %%xmm2,0x10(%3)                   \n"
     "lea       0x20(%3),%3                       \n"
     "sub       $0x10,%4                          \n"
     "jg         1b                               \n"
diff --git a/source/row_win.cc b/source/row_win.cc
index 2b7c4df11..44af821b3 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -6144,13 +6144,13 @@ void I422ToYUY2Row_SSE2(const uint8* src_y,
     movq       xmm3, qword ptr [esi + edx] // V
     lea        esi, [esi + 8]
     punpcklbw  xmm2, xmm3 // UV
-    movdqa     xmm0, [eax] // Y
+    movdqu     xmm0, [eax] // Y
     lea        eax, [eax + 16]
-    movdqa     xmm1, xmm0
+    movdqu     xmm1, xmm0
     punpcklbw  xmm0, xmm2 // YUYV
     punpckhbw  xmm1, xmm2
-    movdqa     [edi], xmm0
-    movdqa     [edi + 16], xmm1
+    movdqu     [edi], xmm0
+    movdqu     [edi + 16], xmm1
     lea        edi, [edi + 32]
     sub        ecx, 16
     jg         convertloop
@@ -6182,13 +6182,13 @@ void I422ToUYVYRow_SSE2(const uint8* src_y,
     movq       xmm3, qword ptr [esi + edx] // V
     lea        esi, [esi + 8]
     punpcklbw  xmm2, xmm3 // UV
-    movdqa     xmm0, [eax] // Y
+    movdqu     xmm0, [eax] // Y
     movdqa     xmm1, xmm2
     lea        eax, [eax + 16]
     punpcklbw  xmm1, xmm0 // UYVY
     punpckhbw  xmm2, xmm0
-    movdqa     [edi], xmm1
-    movdqa     [edi + 16], xmm2
+    movdqu     [edi], xmm1
+    movdqu     [edi + 16], xmm2
     lea        edi, [edi + 32]
     sub        ecx, 16
     jg         convertloop