Mirror of https://chromium.googlesource.com/libyuv/libyuv (synced 2025-12-08 01:36:47 +08:00)

SplitUV_Any variations for all CPUS.
BUG=126
TEST=convert tests NV12 with alignments
Review URL: https://webrtc-codereview.appspot.com/896007
git-svn-id: http://libyuv.googlecode.com/svn/trunk@426 16f28f9a-4ce2-e073-06de-1de4eb20be90
parent 1c396a3d7d
commit db694edfc2
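
For context: SplitUV is the row helper that deinterleaves a packed UV row (as in NV12/NV21 chroma) into separate U and V rows; this commit adds _Any (arbitrary width) and _Unaligned wrappers around the SIMD kernels for every CPU path. A minimal scalar sketch of what every SplitUV_* variant computes, written here purely for illustration (the project's own reference path is the SplitUV_C declared in the row.h hunk below):

#include <stdint.h>

// Illustrative reference only: split an interleaved UVUVUV... row into a U
// row and a V row. Parameter names mirror the declarations in the diff.
void SplitUV_reference(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v,
                       int pix) {
  for (int i = 0; i < pix; ++i) {
    dst_u[i] = src_uv[2 * i + 0];  // even bytes are U
    dst_v[i] = src_uv[2 * i + 1];  // odd bytes are V
  }
}
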
@@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 425
Version: 426
License: BSD
License File: LICENSE

@@ -103,11 +103,25 @@ extern "C" {
#define HAS_CUMULATIVESUMTOAVERAGE_SSE2
#endif

// The following are Windows only. TODO(fbarchard): Port to gcc.
// The following are Windows only.
// TODO(fbarchard): Port to gcc.
#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
#define HAS_ARGBCOLORTABLEROW_X86
#endif

// The following are Yasm x86 only.
// TODO(fbarchard): Port AVX2 to inline.
#if !defined(YUV_DISABLE_ASM) && defined(HAVE_YASM) && \
    (defined(_M_IX86) || defined(_M_X64) || \
    defined(__x86_64__) || defined(__i386__))
#define HAS_SPLITUV_AVX2
#define HAS_SPLITUV_MMX
#define HAS_YUY2TOYROW_AVX2
#define HAS_UYVYTOYROW_AVX2
#define HAS_YUY2TOYROW_MMX
#define HAS_UYVYTOYROW_MMX
#endif

// The following are disabled when SSSE3 is available:
#if !defined(YUV_DISABLE_ASM) && \
    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \

@@ -274,11 +288,25 @@ void MirrorRowUV_C(const uint8* src, uint8* dst_u, uint8* dst_v, int width);
void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width);
void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width);

void SplitUV_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
void SplitUV_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
void SplitUV_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
void SplitUV_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                        int pix);
void SplitUV_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
void SplitUV_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                            int pix);
void SplitUV_Unaligned_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                            int pix);
void SplitUV_Unaligned_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                            int pix);
void SplitUV_Unaligned_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u,
                                  uint8* dst_v, int pix);
void SplitUV_Any_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
void SplitUV_Any_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
void SplitUV_Any_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
void SplitUV_Any_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                            int pix);

void CopyRow_SSE2(const uint8* src, uint8* dst, int count);
void CopyRow_X86(const uint8* src, uint8* dst, int count);

@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_

#define LIBYUV_VERSION 425
#define LIBYUV_VERSION 426

#endif  // INCLUDE_LIBYUV_VERSION_H_ NOLINT

@@ -367,21 +367,64 @@ static int X420ToI420(const uint8* src_y,
  int halfwidth = (width + 1) >> 1;
  void (*SplitUV)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) =
      SplitUV_C;
#if defined(HAS_SPLITUV_SSE2)
  if (TestCpuFlag(kCpuHasSSE2)) {
    if (halfwidth >= 16) {
      SplitUV = SplitUV_Any_SSE2;
      if (IS_ALIGNED(halfwidth, 16)) {
        SplitUV = SplitUV_Unaligned_SSE2;
        if (IS_ALIGNED(src_uv, 16) && IS_ALIGNED(src_stride_uv, 16) &&
            IS_ALIGNED(dst_u, 16) && IS_ALIGNED(dst_stride_u, 16) &&
            IS_ALIGNED(dst_v, 16) && IS_ALIGNED(dst_stride_v, 16)) {
          SplitUV = SplitUV_SSE2;
        }
      }
    }
  }
#endif
#if defined(HAS_SPLITUV_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
    if (halfwidth >= 32) {
      SplitUV = SplitUV_Any_AVX2;
      if (IS_ALIGNED(halfwidth, 32)) {
        SplitUV = SplitUV_Unaligned_AVX2;
        if (IS_ALIGNED(src_uv, 32) && IS_ALIGNED(src_stride_uv, 32) &&
            IS_ALIGNED(dst_u, 32) && IS_ALIGNED(dst_stride_u, 32) &&
            IS_ALIGNED(dst_v, 32) && IS_ALIGNED(dst_stride_v, 32)) {
          SplitUV = SplitUV_AVX2;
        }
      }
    }
  }
#endif
#if defined(HAS_SPLITUV_NEON)
  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(halfwidth, 16)) {
    SplitUV = SplitUV_NEON;
  if (TestCpuFlag(kCpuHasNEON)) {
    if (halfwidth >= 16) {
      SplitUV = SplitUV_Any_NEON;
      if (IS_ALIGNED(halfwidth, 16)) {
        SplitUV = SplitUV_Unaligned_NEON;
        if (IS_ALIGNED(src_uv, 16) && IS_ALIGNED(src_stride_uv, 16) &&
            IS_ALIGNED(dst_u, 16) && IS_ALIGNED(dst_stride_u, 16) &&
            IS_ALIGNED(dst_v, 16) && IS_ALIGNED(dst_stride_v, 16)) {
          SplitUV = SplitUV_NEON;
        }
      }
    }
  }
#elif defined(HAS_SPLITUV_SSE2)
  if (TestCpuFlag(kCpuHasSSE2) &&
      IS_ALIGNED(halfwidth, 16) &&
      IS_ALIGNED(src_uv, 16) && IS_ALIGNED(src_stride_uv, 16) &&
      IS_ALIGNED(dst_u, 16) && IS_ALIGNED(dst_stride_u, 16) &&
      IS_ALIGNED(dst_v, 16) && IS_ALIGNED(dst_stride_v, 16)) {
    SplitUV = SplitUV_SSE2;
  }
#elif defined(HAS_SPLITUV_MIPS_DSPR2)
  if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {
    SplitUV = SplitUV_MIPS_DSPR2;
#endif
#if defined(HAS_SPLITUV_MIPS_DSPR2)
  if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {
    if (halfwidth >= 16) {
      SplitUV = SplitUV_Any_MIPS_DSPR2;
      if (IS_ALIGNED(halfwidth, 16)) {
        SplitUV = SplitUV_Unaligned_MIPS_DSPR2;
        if (IS_ALIGNED(src_uv, 4) && IS_ALIGNED(src_stride_uv, 4) &&
            IS_ALIGNED(dst_u, 4) && IS_ALIGNED(dst_stride_u, 4) &&
            IS_ALIGNED(dst_v, 4) && IS_ALIGNED(dst_stride_v, 4)) {
          SplitUV = SplitUV_MIPS_DSPR2;
        }
      }
    }
  }
#endif
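
Note on the selection logic above: SplitUV is deliberately assigned up to three times per ISA. The _Any wrapper becomes the baseline as soon as the row is long enough, the _Unaligned kernel overrides it when halfwidth is an exact multiple of the vector width, and the plain (aligned) kernel overrides both only when the pointers and strides are aligned as well. A condensed sketch of that precedence for the SSE2 case, using the names from this diff (ChooseSplitUV_SSE2 itself is a hypothetical helper, not project code):

// Assumes libyuv's uint8, TestCpuFlag, kCpuHasSSE2, IS_ALIGNED and the
// SplitUV_* kernels declared in the row.h hunk above.
typedef void (*SplitUVFn)(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                          int pix);

static SplitUVFn ChooseSplitUV_SSE2(const uint8* src_uv, int src_stride_uv,
                                    uint8* dst_u, int dst_stride_u,
                                    uint8* dst_v, int dst_stride_v,
                                    int halfwidth) {
  SplitUVFn fn = SplitUV_C;             // always-correct fallback
#if defined(HAS_SPLITUV_SSE2)
  if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) {
    fn = SplitUV_Any_SSE2;              // SIMD bulk + C tail, no other constraints
    if (IS_ALIGNED(halfwidth, 16)) {
      fn = SplitUV_Unaligned_SSE2;      // exact multiple of 16, unaligned loads/stores
      if (IS_ALIGNED(src_uv, 16) && IS_ALIGNED(src_stride_uv, 16) &&
          IS_ALIGNED(dst_u, 16) && IS_ALIGNED(dst_stride_u, 16) &&
          IS_ALIGNED(dst_v, 16) && IS_ALIGNED(dst_stride_v, 16)) {
        fn = SplitUV_SSE2;              // fully aligned fast path
      }
    }
  }
#endif
  return fn;
}
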
@@ -1141,11 +1141,11 @@ UVANY(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, UYVYToUVRow_C, 2)
#undef UVANY

#define UV422ANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, BPP) \
    void NAMEANY(const uint8* src_argb, \
    void NAMEANY(const uint8* src_uv, \
                 uint8* dst_u, uint8* dst_v, int width) { \
      int n = width & ~15; \
      ANYTOUV_SIMD(src_argb, dst_u, dst_v, n); \
      ANYTOUV_C(src_argb + n * BPP, \
      ANYTOUV_SIMD(src_uv, dst_u, dst_v, n); \
      ANYTOUV_C(src_uv + n * BPP, \
                dst_u + (n >> 1), \
                dst_v + (n >> 1), \
                width & 15); \

@@ -1165,6 +1165,32 @@ UV422ANY(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON,
#endif
#undef UV422ANY

#define SPLITUVANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, BPP, MASK) \
    void NAMEANY(const uint8* src_uv, \
                 uint8* dst_u, uint8* dst_v, int width) { \
      int n = width & ~MASK; \
      ANYTOUV_SIMD(src_uv, dst_u, dst_v, n); \
      ANYTOUV_C(src_uv + n * BPP, \
                dst_u + n, \
                dst_v + n, \
                width & MASK); \
    }

#ifdef HAS_SPLITUV_SSE2
SPLITUVANY(SplitUV_Any_SSE2, SplitUV_Unaligned_SSE2, SplitUV_C, 2, 15)
#endif
#ifdef HAS_SPLITUV_AVX2
SPLITUVANY(SplitUV_Any_AVX2, SplitUV_Unaligned_AVX2, SplitUV_C, 2, 31)
#endif
#ifdef HAS_SPLITUV_NEON
SPLITUVANY(SplitUV_Any_NEON, SplitUV_Unaligned_NEON, SplitUV_C, 2, 15)
#endif
#ifdef HAS_SPLITUV_MIPS_DSPR2
SPLITUVANY(SplitUV_Any_MIPS_DSPR2, SplitUV_Unaligned_MIPS_DSPR2, SplitUV_C,
           2, 15)
#endif
#undef SPLITUVANY
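
For reference, expanding the macro by hand for the SSE2 instantiation shows what each _Any wrapper does: the unaligned SIMD kernel handles the largest multiple of the vector width and the C kernel finishes the remainder. Hand expansion of SPLITUVANY(SplitUV_Any_SSE2, SplitUV_Unaligned_SSE2, SplitUV_C, 2, 15) (illustrative; the macro above is the real definition):

void SplitUV_Any_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                      int width) {
  int n = width & ~15;                               // largest multiple of 16 <= width
  SplitUV_Unaligned_SSE2(src_uv, dst_u, dst_v, n);   // SIMD bulk
  SplitUV_C(src_uv + n * 2,                          // BPP is 2: interleaved U and V bytes
            dst_u + n,
            dst_v + n,
            width & 15);                             // 0..15 leftover pixels in C
}
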
void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum,
                               const int32* previous_cumsum, int width) {
  int32 row_sum[4] = {0, 0, 0, 0};

@@ -23,21 +23,77 @@ void SplitUV_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
  __asm__ __volatile__ (
    ".set push \n"
    ".set noreorder \n"

    "srl $t4, %[width], 4 \n"  // multiplies of 16
    "blez $t4, 2f \n"
    " andi %[width], %[width], 0xf \n"  // residual
    "andi $t0, %[src_uv], 0x3 \n"
    "andi $t1, %[dst_u], 0x3 \n"
    "andi $t2, %[dst_v], 0x3 \n"
    "or $t0, $t0, $t1 \n"
    "or $t0, $t0, $t2 \n"

    "beqz $t0, 12f \n"  // test if aligned
    "1: \n"
    "addiu $t4, $t4, -1 \n"
    "lw $t0, 0(%[src_uv]) \n"  // V1 | U1 | V0 | U0
    "lw $t1, 4(%[src_uv]) \n"  // V3 | U3 | V2 | U2
    "lw $t2, 8(%[src_uv]) \n"  // V5 | U5 | V4 | U4
    "lw $t3, 12(%[src_uv]) \n"  // V7 | U7 | V6 | U6
    "lw $t5, 16(%[src_uv]) \n"  // V9 | U9 | V8 | U8
    "lw $t6, 20(%[src_uv]) \n"  // V11 | U11 | V10 | U10
    "lw $t7, 24(%[src_uv]) \n"  // V13 | U13 | V12 | U12
    "lw $t8, 28(%[src_uv]) \n"  // V15 | U15 | V14 | U14
    "addiu %[src_uv], %[src_uv], 32 \n"
    "precrq.qb.ph $t9, $t1, $t0 \n"  // V3 | V2 | V1 | V0
    "precr.qb.ph $t0, $t1, $t0 \n"  // U3 | U2 | U1 | U0
    "precrq.qb.ph $t1, $t3, $t2 \n"  // V7 | V6 | V5 | V4
    "precr.qb.ph $t2, $t3, $t2 \n"  // U7 | U6 | U5 | U4
    "precrq.qb.ph $t3, $t6, $t5 \n"  // V11 | V10 | V9 | V8
    "precr.qb.ph $t5, $t6, $t5 \n"  // U11 | U10 | U9 | U8
    "precrq.qb.ph $t6, $t8, $t7 \n"  // V15 | V14 | V13 | V12
    "precr.qb.ph $t7, $t8, $t7 \n"  // U15 | U14 | U13 | U12
    "sw $t9, 0(%[dst_v]) \n"
    "sw $t0, 0(%[dst_u]) \n"
    "sw $t1, 4(%[dst_v]) \n"
    "sw $t2, 4(%[dst_u]) \n"
    "sw $t3, 8(%[dst_v]) \n"
    "sw $t5, 8(%[dst_u]) \n"
    "sw $t6, 12(%[dst_v]) \n"
    "sw $t7, 12(%[dst_u]) \n"
    "addiu %[dst_v], %[dst_v], 16 \n"
    "bgtz $t4, 1b \n"
    " addiu %[dst_u], %[dst_u], 16 \n"

    "beqz %[width], 3f \n"
    " nop \n"

    // src and dst are unaligned
    "1: \n"
    "2: \n"
    "lbu $t0, 0(%[src_uv]) \n"
    "lbu $t1, 1(%[src_uv]) \n"
    "addiu %[src_uv], %[src_uv], 2 \n"
    "addiu %[width], %[width], -1 \n"
    "sb $t0, 0(%[dst_u]) \n"
    "sb $t1, 0(%[dst_v]) \n"
    "addiu %[dst_u], %[dst_u], 1 \n"
    "bgtz %[width], 2b \n"
    " addiu %[dst_v], %[dst_v], 1 \n"

    "3: \n"
    ".set pop \n"
    : [src_uv] "+r" (src_uv),
      [width] "+r" (width),
      [dst_u] "+r" (dst_u),
      [dst_v] "+r" (dst_v)
    :
    : "t0", "t1", "t2", "t3",
      "t4", "t5", "t6", "t7", "t8", "t9"
  );
}

void SplitUV_Unaligned_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u,
                                  uint8* dst_v, int width) {
  __asm__ __volatile__ (
    ".set push \n"
    ".set noreorder \n"
    "srl $t4, %[width], 4 \n"  // multiplies of 16
    "blez $t4, 2f \n"
    " andi %[width], %[width], 0xf \n"  // residual

    "1: \n"
    "addiu $t4, $t4, -1 \n"
    "lwr $t0, 0(%[src_uv]) \n"
    "lwl $t0, 3(%[src_uv]) \n"  // V1 | U1 | V0 | U0

@@ -55,7 +111,6 @@ void SplitUV_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
    "lwl $t7, 27(%[src_uv]) \n"  // V13 | U13 | V12 | U12
    "lwr $t8, 28(%[src_uv]) \n"
    "lwl $t8, 31(%[src_uv]) \n"  // V15 | U15 | V14 | U14

    "precrq.qb.ph $t9, $t1, $t0 \n"  // V3 | V2 | V1 | V0
    "precr.qb.ph $t0, $t1, $t0 \n"  // U3 | U2 | U1 | U0
    "precrq.qb.ph $t1, $t3, $t2 \n"  // V7 | V6 | V5 | V4

@@ -65,7 +120,6 @@ void SplitUV_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
    "precrq.qb.ph $t6, $t8, $t7 \n"  // V15 | V14 | V13 | V12
    "precr.qb.ph $t7, $t8, $t7 \n"  // U15 | U14 | U13 | U12
    "addiu %[src_uv], %[src_uv], 32 \n"

    "swr $t9, 0(%[dst_v]) \n"
    "swl $t9, 3(%[dst_v]) \n"
    "swr $t0, 0(%[dst_u]) \n"

@@ -88,47 +142,8 @@ void SplitUV_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,

    "beqz %[width], 3f \n"
    " nop \n"
    "b 2f \n"
    " nop \n"

    // src and dst are aligned
    "12: \n"
    "addiu $t4, $t4, -1 \n"
    "lw $t0, 0(%[src_uv]) \n"  // V1 | U1 | V0 | U0
    "lw $t1, 4(%[src_uv]) \n"  // V3 | U3 | V2 | U2
    "lw $t2, 8(%[src_uv]) \n"  // V5 | U5 | V4 | U4
    "lw $t3, 12(%[src_uv]) \n"  // V7 | U7 | V6 | U6
    "lw $t5, 16(%[src_uv]) \n"  // V9 | U9 | V8 | U8
    "lw $t6, 20(%[src_uv]) \n"  // V11 | U11 | V10 | U10
    "lw $t7, 24(%[src_uv]) \n"  // V13 | U13 | V12 | U12
    "lw $t8, 28(%[src_uv]) \n"  // V15 | U15 | V14 | U14

    "addiu %[src_uv], %[src_uv], 32 \n"
    "precrq.qb.ph $t9, $t1, $t0 \n"  // V3 | V2 | V1 | V0
    "precr.qb.ph $t0, $t1, $t0 \n"  // U3 | U2 | U1 | U0
    "precrq.qb.ph $t1, $t3, $t2 \n"  // V7 | V6 | V5 | V4
    "precr.qb.ph $t2, $t3, $t2 \n"  // U7 | U6 | U5 | U4
    "precrq.qb.ph $t3, $t6, $t5 \n"  // V11 | V10 | V9 | V8
    "precr.qb.ph $t5, $t6, $t5 \n"  // U11 | U10 | U9 | U8
    "precrq.qb.ph $t6, $t8, $t7 \n"  // V15 | V14 | V13 | V12
    "precr.qb.ph $t7, $t8, $t7 \n"  // U15 | U14 | U13 | U12

    "sw $t9, 0(%[dst_v]) \n"
    "sw $t0, 0(%[dst_u]) \n"
    "sw $t1, 4(%[dst_v]) \n"
    "sw $t2, 4(%[dst_u]) \n"
    "sw $t3, 8(%[dst_v]) \n"
    "sw $t5, 8(%[dst_u]) \n"
    "sw $t6, 12(%[dst_v]) \n"
    "sw $t7, 12(%[dst_u]) \n"
    "addiu %[dst_v], %[dst_v], 16 \n"
    "bgtz $t4, 12b \n"
    " addiu %[dst_u], %[dst_u], 16 \n"

    "beqz %[width], 3f \n"
    " nop \n"

    "2: \n"
    "2: \n"
    "lbu $t0, 0(%[src_uv]) \n"
    "lbu $t1, 1(%[src_uv]) \n"
    "addiu %[src_uv], %[src_uv], 2 \n"

@@ -139,7 +154,7 @@ void SplitUV_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
    "bgtz %[width], 2b \n"
    " addiu %[dst_v], %[dst_v], 1 \n"

    "3: \n"
    "3: \n"
    ".set pop \n"
    : [src_uv] "+r" (src_uv),
      [width] "+r" (width),

@@ -339,6 +339,26 @@ void NV21ToARGBRow_NEON(const uint8* src_y,
// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v
// Alignment requirement: 16 bytes for pointers, and multiple of 16 pixels.
void SplitUV_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    ".p2align 2 \n"
    "1: \n"
    "vld2.u8 {q0, q1}, [%0:128]! \n"  // load 16 pairs of UV
    "subs %3, %3, #16 \n"  // 16 processed per loop
    "vst1.u8 {q0}, [%1:128]! \n"  // store U
    "vst1.u8 {q1}, [%2:128]! \n"  // Store V
    "bgt 1b \n"
    : "+r"(src_uv),  // %0
      "+r"(dst_u),   // %1
      "+r"(dst_v),   // %2
      "+r"(width)    // %3  // Output registers
    :                       // Input registers
    : "memory", "cc", "q0", "q1"  // Clobber List
  );
}
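
For readers less familiar with NEON: vld2.u8 performs the deinterleave by itself, loading the even bytes (U) into q0 and the odd bytes (V) into q1. A rough intrinsics equivalent of one loop iteration, shown only as an annotation (libyuv uses the inline asm above, not intrinsics):

#include <arm_neon.h>
#include <stdint.h>

// One iteration of the loop above: 16 UV pairs in, 16 U bytes and 16 V bytes out.
static void SplitUV16_NEON_sketch(const uint8_t* src_uv,
                                  uint8_t* dst_u, uint8_t* dst_v) {
  uint8x16x2_t uv = vld2q_u8(src_uv);  // vld2.u8 {q0, q1}: deinterleaves U/V
  vst1q_u8(dst_u, uv.val[0]);          // vst1.u8 {q0}: store U
  vst1q_u8(dst_v, uv.val[1]);          // vst1.u8 {q1}: store V
}
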
// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v
// Alignment requirement: Multiple of 16 pixels, pointers unaligned.
void SplitUV_Unaligned_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                            int width) {
  asm volatile (
    ".p2align 2 \n"
    "1: \n"

@@ -2480,7 +2480,7 @@ void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psrlw $0x8,%%xmm5 \n"
    "sub %1,%2 \n"
    ".p2align 4 \n"
    ".p2align 4 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"

@@ -2509,6 +2509,42 @@ void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
#endif
  );
}

void SplitUV_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                            int pix) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psrlw $0x8,%%xmm5 \n"
    "sub %1,%2 \n"
    ".p2align 4 \n"
    "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqu 0x10(%0),%%xmm1 \n"
    "lea 0x20(%0),%0 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "movdqa %%xmm1,%%xmm3 \n"
    "pand %%xmm5,%%xmm0 \n"
    "pand %%xmm5,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "psrlw $0x8,%%xmm2 \n"
    "psrlw $0x8,%%xmm3 \n"
    "packuswb %%xmm3,%%xmm2 \n"
    "movdqu %%xmm0,(%1) \n"
    "movdqu %%xmm2,(%1,%2) \n"
    "lea 0x10(%1),%1 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
    : "+r"(src_uv),  // %0
      "+r"(dst_u),   // %1
      "+r"(dst_v),   // %2
      "+r"(pix)      // %3
    :
    : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
#endif  // HAS_SPLITUV_SSE2
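
All of the SSE2 SplitUV kernels rely on the same even/odd byte trick: each UV pair sits in one 16-bit lane, pand with the 0x00ff00ff mask keeps the low (U) byte, psrlw by 8 drops the high (V) byte into the low position, and packuswb narrows the 16-bit lanes back to bytes. A scalar model of that lane arithmetic, written only to annotate the asm (not a drop-in replacement for any libyuv function):

#include <stdint.h>

// Scalar model of the pand / psrlw / packuswb deinterleave used above.
void SplitUV_lane_model(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v,
                        int pix) {
  for (int i = 0; i < pix; ++i) {
    // One little-endian 16-bit lane holds a U,V pair (U in the low byte).
    uint16_t lane = (uint16_t)(src_uv[2 * i] | (src_uv[2 * i + 1] << 8));
    dst_u[i] = (uint8_t)(lane & 0x00ff);  // pand with 0x00ff00ff keeps U
    dst_v[i] = (uint8_t)(lane >> 8);      // psrlw $8 moves V down
    // packuswb then packs eight such 16-bit results back into bytes.
  }
}
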
#ifdef HAS_COPYROW_SSE2

@@ -2581,6 +2581,43 @@ void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
    ret
  }
}

__declspec(naked) __declspec(align(16))
void SplitUV_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                            int pix) {
  __asm {
    push edi
    mov eax, [esp + 4 + 4]    // src_uv
    mov edx, [esp + 4 + 8]    // dst_u
    mov edi, [esp + 4 + 12]   // dst_v
    mov ecx, [esp + 4 + 16]   // pix
    pcmpeqb xmm5, xmm5        // generate mask 0x00ff00ff
    psrlw xmm5, 8
    sub edi, edx

    align 16
  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    movdqa xmm2, xmm0
    movdqa xmm3, xmm1
    pand xmm0, xmm5           // even bytes
    pand xmm1, xmm5
    packuswb xmm0, xmm1
    psrlw xmm2, 8             // odd bytes
    psrlw xmm3, 8
    packuswb xmm2, xmm3
    movdqu [edx], xmm0
    movdqu [edx + edi], xmm2
    lea edx, [edx + 16]
    sub ecx, 16
    jg convertloop

    pop edi
    ret
  }
}
#endif  // HAS_SPLITUV_SSE2

#ifdef HAS_COPYROW_SSE2

@@ -1,25 +1,29 @@
;*
;* Copyright 2012 The LibYuv Project Authors. All rights reserved.
;*
;* Use of this source code is governed by a BSD-style license
;* that can be found in the LICENSE file in the root of the source
;* tree. An additional intellectual property rights grant can be found
;* in the file PATENTS. All contributing project authors may
;* be found in the AUTHORS file in the root of the source tree.
;*
;
; Copyright 2012 The LibYuv Project Authors. All rights reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;

%ifdef __YASM_VERSION_ID__
%if __YASM_VERSION_ID__ < 01020000h
%error AVX2 is supported only by yasm 1.2.0 or later.
%endif
%endif
%include "x86inc.asm"

SECTION .text

; void YUY2ToYRow_SSE2(const uint8* src_yuy2,
;                      uint8* dst_y, int pix);
; void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix);

%macro YUY2TOYROW 2-3
cglobal %1ToYRow%3, 3, 3, 3, src_yuy2, dst_y, pix
%ifidn %1,YUY2
    pcmpeqb m2, m2 ; generate mask 0x00ff00ff
    psrlw m2, 8
    pcmpeqb m2, m2, m2 ; generate mask 0x00ff00ff
    psrlw m2, m2, 8
%endif

    ALIGN 16

@@ -28,21 +32,21 @@ cglobal %1ToYRow%3, 3, 3, 3, src_yuy2, dst_y, pix
    mov%2 m1, [src_yuy2q + mmsize]
    lea src_yuy2q, [src_yuy2q + mmsize * 2]
%ifidn %1,YUY2
    pand m0, m2 ; YUY2 even bytes are Y
    pand m1, m2
    pand m0, m0, m2 ; YUY2 even bytes are Y
    pand m1, m1, m2
%else
    psrlw m0, 8 ; UYVY odd bytes are Y
    psrlw m1, 8
    psrlw m0, m0, 8 ; UYVY odd bytes are Y
    psrlw m1, m1, 8
%endif
    packuswb m0, m1
    packuswb m0, m0, m1
    sub pixd, mmsize
    mov%2 [dst_yq], m0
    lea dst_yq, [dst_yq + mmsize]
    jg .convertloop
    RET
    REP_RET
%endmacro

; TODO(fbarchard): Remove MMX when SSE2 is required.
; TODO(fbarchard): Remove MMX. Add SSSE3 pshufb version.
INIT_MMX MMX
YUY2TOYROW YUY2,a,
YUY2TOYROW YUY2,u,_Unaligned

@@ -59,3 +63,43 @@ YUY2TOYROW YUY2,u,_Unaligned
YUY2TOYROW UYVY,a,
YUY2TOYROW UYVY,u,_Unaligned

; void SplitUV_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
;                             int pix) {

%macro SPLITUV 1-2
cglobal SplitUV%2, 4, 4, 5, src_uv, dst_u, dst_v, pix
    pcmpeqb m4, m4, m4 ; generate mask 0x00ff00ff
    psrlw m4, m4, 8
    sub dst_vq, dst_uq

    ALIGN 16
.convertloop:
    mov%1 m0, [src_uvq]
    mov%1 m1, [src_uvq + mmsize]
    lea src_uvq, [src_uvq + mmsize * 2]
    mova m2, m0
    mova m3, m1
    pand m0, m0, m4 ; even bytes
    pand m1, m1, m4
    packuswb m0, m0, m1
    psrlw m2, m2, 8 ; odd bytes
    psrlw m3, m3, 8
    packuswb m2, m2, m3
    mov%1 [dst_uq], m0
    mov%1 [dst_uq + dst_vq], m2
    lea dst_uq, [dst_uq + mmsize]
    sub pixd, mmsize
    jg .convertloop
    REP_RET
%endmacro

INIT_MMX MMX
SPLITUV a,
SPLITUV u,_Unaligned
INIT_XMM SSE2
SPLITUV a,
SPLITUV u,_Unaligned
INIT_YMM AVX2
SPLITUV a,
SPLITUV u,_Unaligned