From 16a96645b4987fddbcf726dea2fcf5dc87ca10e1 Mon Sep 17 00:00:00 2001
From: "fbarchard@google.com" <fbarchard@google.com@16f28f9a-4ce2-e073-06de-1de4eb20be90>
Date: Fri, 2 Mar 2012 22:38:09 +0000
Subject: [PATCH] splituv and mirroruv in row use 2 pixels at a time in C

BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/432006

git-svn-id: http://libyuv.googlecode.com/svn/trunk@201 16f28f9a-4ce2-e073-06de-1de4eb20be90
---
 README.chromium          |   2 +-
 include/libyuv/version.h |   2 +-
 source/rotate.cc         |  87 +-----------------
 source/rotate_neon.cc    | 162 ---------------------------------
 source/row.h             |   6 ++
 source/row_common.cc     | 132 ++++++++++++++++-----------
 source/row_neon.cc       | 191 +++++++++++++++++++++++++++++++++++----
 source/row_posix.cc      |  36 +++++++-
 source/row_win.cc        |  36 +++++++-
 9 files changed, 330 insertions(+), 324 deletions(-)

diff --git a/README.chromium b/README.chromium
index 6280fad0d..d046e76c4 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 200
+Version: 201
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index dcf55aab9..1c2305867 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,7 +11,7 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 200
+#define LIBYUV_VERSION 201
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_
 
diff --git a/source/rotate.cc b/source/rotate.cc
index 4d186c06e..a10313614 100644
--- a/source/rotate.cc
+++ b/source/rotate.cc
@@ -23,12 +23,6 @@ extern "C" {
 
 #if (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \
     !defined(YUV_DISABLE_ASM)
-// Note static const preferred, but gives internal compiler error on gcc 4.2
-// Shuffle table for reversing the bytes of UV channels.
-uvec8 kShuffleMirrorUV = {
-  14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
-};
-
 #if defined(__APPLE__) && defined(__i386__)
 #define DECLARE_FUNCTION(name) \
     ".text \n" \
@@ -759,8 +753,7 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
 
 static void TransposeWx8_C(const uint8* src, int src_stride,
                            uint8* dst, int dst_stride,
                            int w) {
-  int i;
-  for (i = 0; i < w; ++i) {
+  for (int i = 0; i < w; ++i) {
     dst[0] = src[0 * src_stride];
     dst[1] = src[1 * src_stride];
     dst[2] = src[2 * src_stride];
@@ -777,9 +770,8 @@ static void TransposeWx8_C(const uint8* src, int src_stride,
 static void TransposeWxH_C(const uint8* src, int src_stride,
                            uint8* dst, int dst_stride,
                            int width, int height) {
-  int i, j;
-  for (i = 0; i < width; ++i)
-    for (j = 0; j < height; ++j)
+  for (int i = 0; i < width; ++i)
+    for (int j = 0; j < height; ++j)
       dst[i * dst_stride + j] = src[j * src_stride + i];
 }
 
@@ -1005,79 +997,6 @@ void RotateUV270(const uint8* src, int src_stride,
                  width, height);
 }
 
-#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM)
-#define HAS_MIRRORROW_UV_SSSE3
-__declspec(naked)
-void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_a, uint8* dst_b,
-                       int width) {
-  __asm {
-    push      edi
-    mov       eax, [esp + 4 + 4]   // src
-    mov       edx, [esp + 4 + 8]   // dst_a
-    mov       edi, [esp + 4 + 12]  // dst_b
-    mov       ecx, [esp + 4 + 16]  // width
-    movdqa    xmm1, kShuffleMirrorUV
-    lea       eax, [eax + ecx * 2 - 16]
-    sub       edi, edx
-
- convertloop:
-    movdqa    xmm0, [eax]
-    lea       eax, [eax - 16]
-    pshufb    xmm0, xmm1
-    sub       ecx, 8
-    movlpd    qword ptr [edx], xmm0
-    movhpd    qword ptr [edx + edi], xmm0
-    lea       edx, [edx + 8]
-    ja        convertloop
-
-    pop       edi
-    ret
-  }
-}
-
-#elif (defined(__i386__) || defined(__x86_64__)) && \
-    !defined(YUV_DISABLE_ASM)
-#define HAS_MIRRORROW_UV_SSSE3
-void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_a, uint8* dst_b,
-                       int width) {
-  intptr_t temp_width = static_cast<intptr_t>(width);
-  asm volatile (
-    "movdqa     %4,%%xmm1                    \n"
-    "lea        -16(%0,%3,2),%0              \n"
-    "sub        %1,%2                        \n"
-  "1:                                        \n"
-    "movdqa     (%0),%%xmm0                  \n"
-    "lea        -16(%0),%0                   \n"
-    "pshufb     %%xmm1,%%xmm0                \n"
-    "sub        $8,%3                        \n"
-    "movlpd     %%xmm0,(%1)                  \n"
-    "movhpd     %%xmm0,(%1,%2)               \n"
-    "lea        8(%1),%1                     \n"
-    "ja         1b                           \n"
-  : "+r"(src),        // %0
-    "+r"(dst_a),      // %1
-    "+r"(dst_b),      // %2
-    "+r"(temp_width)  // %3
-  : "m"(kShuffleMirrorUV)  // %4
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1"
-#endif
-  );
-}
-#endif
-
-static void MirrorRowUV_C(const uint8* src,
-                          uint8* dst_a, uint8* dst_b,
-                          int width) {
-  src += (width << 1) - 2;
-  for (int i = 0; i < width; ++i) {
-    dst_a[i] = src[0];
-    dst_b[i] = src[1];
-    src -= 2;
-  }
-}
-
 void RotateUV180(const uint8* src, int src_stride,
                  uint8* dst_a, int dst_stride_a,
                  uint8* dst_b, int dst_stride_b,
diff --git a/source/rotate_neon.cc b/source/rotate_neon.cc
index 264e81e7e..0f01f02b2 100644
--- a/source/rotate_neon.cc
+++ b/source/rotate_neon.cc
@@ -19,94 +19,6 @@ extern "C" {
 
 #if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM)
 
-void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
-  asm volatile (
-    // compute where to start writing destination
-    "add        %1, %2                       \n"
-
-    // work on segments that are multiples of 16
-    "lsrs       r3, %2, #4                   \n"
-
-    // the output is written in two block.  8 bytes followed
-    // by another 8.  reading is done sequentially, from left to
-    // right.  writing is done from right to left in block sizes
-    // %1, the destination pointer is incremented after writing
-    // the first of the two blocks.  need to subtract that 8 off
-    // along with 16 to get the next location.
- "mov r3, #-24 \n" - - "beq 2f \n" - - // back of destination by the size of the register that is - // going to be mirrord - "sub %1, #16 \n" - - // the loop needs to run on blocks of 16. what will be left - // over is either a negative number, the residuals that need - // to be done, or 0. if this isn't subtracted off here the - // loop will run one extra time. - "sub %2, #16 \n" - - "1: \n" - "vld1.8 {q0}, [%0]! \n" // src += 16 - - // mirror the bytes in the 64 bit segments. unable to mirror - // the bytes in the entire 128 bits in one go. - "vrev64.8 q0, q0 \n" - - // because of the inability to mirror the entire 128 bits - // mirror the writing out of the two 64 bit segments. - "vst1.8 {d1}, [%1]! \n" - "vst1.8 {d0}, [%1], r3 \n" // dst -= 16 - - "subs %2, #16 \n" - "bge 1b \n" - - // add 16 back to the counter. if the result is 0 there is no - // residuals so jump past - "adds %2, #16 \n" - "beq 5f \n" - - "add %1, #16 \n" - - "2: \n" - - "mov r3, #-3 \n" - - "sub %1, #2 \n" - "subs %2, #2 \n" - // check for 16*n+1 scenarios where segments_of_2 should not - // be run, but there is something left over. - "blt 4f \n" - -// do this in neon registers as per -// http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/ - "3: \n" - "vld2.8 {d0[0], d1[0]}, [%0]! \n" // src += 2 - - "vst1.8 {d1[0]}, [%1]! \n" - "vst1.8 {d0[0]}, [%1], r3 \n" // dst -= 2 - - "subs %2, #2 \n" - "bge 3b \n" - - "adds %2, #2 \n" - "beq 5f \n" - - "4: \n" - "add %1, #1 \n" - "vld1.8 {d0[0]}, [%0] \n" - "vst1.8 {d0[0]}, [%1] \n" - - "5: \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "r3", "q0" - ); -} - static const uvec8 vtbl_4x4_transpose = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 }; @@ -272,80 +184,6 @@ void TransposeWx8_NEON(const uint8* src, int src_stride, ); } -void MirrorRowUV_NEON(const uint8* src, - uint8* dst_a, uint8* dst_b, - int width) { - asm volatile ( - // compute where to start writing destination - "add %1, %3 \n" // dst_a + width - "add %2, %3 \n" // dst_b + width - - // work on input segments that are multiples of 16, but - // width that has been passed is output segments, half - // the size of input. - "lsrs r12, %3, #3 \n" - - "beq 2f \n" - - // the output is written in to two blocks. - "mov r12, #-8 \n" - - // back of destination by the size of the register that is - // going to be mirrord - "sub %1, #8 \n" - "sub %2, #8 \n" - - // the loop needs to run on blocks of 8. what will be left - // over is either a negative number, the residuals that need - // to be done, or 0. if this isn't subtracted off here the - // loop will run one extra time. - "sub %3, #8 \n" - - "1: \n" - "vld2.8 {d0, d1}, [%0]! \n" // src += 16 - - // mirror the bytes in the 64 bit segments - "vrev64.8 q0, q0 \n" - - "vst1.8 {d0}, [%1], r12 \n" // dst_a -= 8 - "vst1.8 {d1}, [%2], r12 \n" // dst_b -= 8 - - "subs %3, #8 \n" - "bge 1b \n" - - // add 8 back to the counter. if the result is 0 there is no - // residuals so return - "adds %3, #8 \n" - "beq 4f \n" - - "add %1, #8 \n" - "add %2, #8 \n" - - "2: \n" - - "mov r12, #-1 \n" - - "sub %1, #1 \n" - "sub %2, #1 \n" - - "3: \n" - "vld2.8 {d0[0], d1[0]}, [%0]! 
\n" // src += 2 - - "vst1.8 {d0[0]}, [%1], r12 \n" // dst_a -= 1 - "vst1.8 {d1[0]}, [%2], r12 \n" // dst_b -= 1 - - "subs %3, %3, #1 \n" - "bgt 3b \n" - "4: \n" - : "+r"(src), // %0 - "+r"(dst_a), // %1 - "+r"(dst_b), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "r12", "q0" - ); -} - static const uvec8 vtbl_4x4_transpose_di = { 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 }; diff --git a/source/row.h b/source/row.h index f90eb2473..f1da41e50 100644 --- a/source/row.h +++ b/source/row.h @@ -54,6 +54,7 @@ extern "C" { #define HAS_I444TOARGBROW_SSSE3 #define HAS_MIRRORROW_SSSE3 #define HAS_MIRRORROW_SSE2 +#define HAS_MIRRORROWUV_SSSE3 #define HAS_SPLITUV_SSE2 #define HAS_COPYROW_SSE2 #define HAS_COPYROW_X86 @@ -66,6 +67,7 @@ extern "C" { // The following are available on Neon platforms #if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM) #define HAS_MIRRORROW_NEON +#define HAS_MIRRORROWUV_NEON #define HAS_SPLITUV_NEON #define HAS_COPYROW_NEON #define HAS_I420TOARGBROW_NEON @@ -126,6 +128,10 @@ void MirrorRow_SSE2(const uint8* src, uint8* dst, int width); void MirrorRow_NEON(const uint8* src, uint8* dst, int width); void MirrorRow_C(const uint8* src, uint8* dst, int width); +void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, int width); +void MirrorRowUV_NEON(const uint8* src, uint8* dst_u, uint8* dst_v, int width); +void MirrorRowUV_C(const uint8* src, uint8* dst_u, uint8* dst_v, int width); + void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix); void SplitUV_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix); void SplitUV_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix); diff --git a/source/row_common.cc b/source/row_common.cc index 23352c8b1..30b1da6fd 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -18,8 +18,8 @@ namespace libyuv { extern "C" { #endif -void ABGRToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int pix) { - for (int x = 0; x < pix; ++x) { +void ABGRToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int width) { + for (int x = 0; x < width; ++x) { // To support in-place conversion. uint8 r = src_abgr[0]; uint8 g = src_abgr[1]; @@ -34,8 +34,8 @@ void ABGRToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int pix) { } } -void BGRAToARGBRow_C(const uint8* src_bgra, uint8* dst_argb, int pix) { - for (int x = 0; x < pix; ++x) { +void BGRAToARGBRow_C(const uint8* src_bgra, uint8* dst_argb, int width) { + for (int x = 0; x < width; ++x) { // To support in-place conversion. 
     uint8 a = src_bgra[0];
     uint8 r = src_bgra[1];
@@ -50,8 +50,8 @@ void BGRAToARGBRow_C(const uint8* src_bgra, uint8* dst_argb, int pix) {
   }
 }
 
-void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int pix) {
-  for (int x = 0; x < pix; ++x) {
+void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int width) {
+  for (int x = 0; x < width; ++x) {
     uint8 b = src_rgb24[0];
     uint8 g = src_rgb24[1];
     uint8 r = src_rgb24[2];
@@ -64,8 +64,8 @@ void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int pix) {
   }
 }
 
-void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int pix) {
-  for (int x = 0; x < pix; ++x) {
+void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int width) {
+  for (int x = 0; x < width; ++x) {
     uint8 r = src_raw[0];
     uint8 g = src_raw[1];
     uint8 b = src_raw[2];
@@ -78,8 +78,8 @@ void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int pix) {
   }
 }
 
-void RGB565ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int pix) {
-  for (int x = 0; x < pix; ++x) {
+void RGB565ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int width) {
+  for (int x = 0; x < width; ++x) {
     uint8 b = src_rgb[0] & 0x1f;
     uint8 g = (src_rgb[0] >> 5) | ((src_rgb[1] & 0x07) << 3);
     uint8 r = src_rgb[1] >> 3;
@@ -92,8 +92,8 @@ void RGB565ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int pix) {
   }
 }
 
-void ARGB1555ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int pix) {
-  for (int x = 0; x < pix; ++x) {
+void ARGB1555ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int width) {
+  for (int x = 0; x < width; ++x) {
     uint8 b = src_rgb[0] & 0x1f;
     uint8 g = (src_rgb[0] >> 5) | ((src_rgb[1] & 0x03) << 3);
     uint8 r = (src_rgb[1] & 0x7c) >> 2;
@@ -107,8 +107,8 @@ void ARGB1555ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int pix) {
   }
 }
 
-void ARGB4444ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int pix) {
-  for (int x = 0; x < pix; ++x) {
+void ARGB4444ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int width) {
+  for (int x = 0; x < width; ++x) {
     uint8 a = src_rgb[1] >> 4;
     uint8 r = src_rgb[1] & 0x0f;
     uint8 g = src_rgb[0] >> 4;
@@ -122,8 +122,8 @@ void ARGB4444ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int pix) {
   }
 }
 
-void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int pix) {
-  for (int x = 0; x < pix; ++x) {
+void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+  for (int x = 0; x < width; ++x) {
     uint8 b = src_argb[0];
     uint8 g = src_argb[1];
     uint8 r = src_argb[2];
@@ -135,8 +135,8 @@ void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int pix) {
   }
 }
 
-void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int pix) {
-  for (int x = 0; x < pix; ++x) {
+void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+  for (int x = 0; x < width; ++x) {
     uint8 b = src_argb[0];
     uint8 g = src_argb[1];
     uint8 r = src_argb[2];
@@ -149,8 +149,8 @@ void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int pix) {
 }
 
 // TODO(fbarchard): support big endian CPU
-void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int pix) {
-  for (int x = 0; x < pix; ++x) {
+void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+  for (int x = 0; x < width; ++x) {
     uint8 b = src_argb[0] >> 3;
     uint8 g = src_argb[1] >> 2;
     uint8 r = src_argb[2] >> 3;
@@ -160,8 +160,8 @@ void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int pix) {
   }
 }
 
-void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int pix) {
-  for (int x = 0; x < pix; ++x) {
+void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+  for (int x = 0; x < width; ++x) {
     uint8 b = src_argb[0] >> 3;
     uint8 g = src_argb[1] >> 3;
     uint8 r = src_argb[2] >> 3;
@@ -172,8 +172,8 @@ void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int pix) {
   }
 }
 
-void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int pix) {
-  for (int x = 0; x < pix; ++x) {
+void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+  for (int x = 0; x < width; ++x) {
     uint8 b = src_argb[0] >> 4;
     uint8 g = src_argb[1] >> 4;
     uint8 r = src_argb[2] >> 4;
@@ -233,9 +233,9 @@ MAKEROWY(ARGB,2,1,0)
 MAKEROWY(BGRA,1,2,3)
 MAKEROWY(ABGR,0,1,2)
 
-void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix) {
+void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) {
   // Copy a Y to RGB.
-  for (int x = 0; x < pix; ++x) {
+  for (int x = 0; x < width; ++x) {
     uint8 y = src_y[0];
     dst_argb[2] = dst_argb[1] = dst_argb[0] = y;
     dst_argb[3] = 255u;
@@ -360,20 +360,42 @@ void YToARGBRow_C(const uint8* y_buf, uint8* rgb_buf, int width) {
 
 void MirrorRow_C(const uint8* src, uint8* dst, int width) {
   src += width - 1;
-  for (int i = 0; i < width; ++i) {
-    dst[i] = src[0];
-    --src;
+  for (int x = 0; x < width - 1; x += 2) {
+    dst[x] = src[0];
+    dst[x + 1] = src[-1];
+    src -= 2;
+  }
+  if (width & 1) {
+    dst[width - 1] = src[0];
   }
 }
 
-void SplitUV_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
-  // Copy a row of UV.
-  for (int x = 0; x < pix; ++x) {
-    dst_u[0] = src_uv[0];
-    dst_v[0] = src_uv[1];
-    src_uv += 2;
-    dst_u += 1;
-    dst_v += 1;
+void MirrorRowUV_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
+  src_uv += (width - 1) << 1;
+  for (int x = 0; x < width - 1; x += 2) {
+    dst_u[x] = src_uv[0];
+    dst_u[x + 1] = src_uv[-2];
+    dst_v[x] = src_uv[1];
+    dst_v[x + 1] = src_uv[-2 + 1];
+    src_uv -= 4;
+  }
+  if (width & 1) {
+    dst_u[width - 1] = src_uv[0];
+    dst_v[width - 1] = src_uv[1];
+  }
+}
+
+void SplitUV_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
+  for (int x = 0; x < width - 1; x += 2) {
+    dst_u[x] = src_uv[0];
+    dst_u[x + 1] = src_uv[2];
+    dst_v[x] = src_uv[1];
+    dst_v[x + 1] = src_uv[3];
+    src_uv += 4;
+  }
+  if (width & 1) {
+    dst_u[width - 1] = src_uv[0];
+    dst_v[width - 1] = src_uv[1];
   }
 }
 
@@ -383,9 +405,9 @@ void CopyRow_C(const uint8* src, uint8* dst, int count) {
 
 // Filter 2 rows of YUY2 UV's (422) into U and V (420)
 void YUY2ToUVRow_C(const uint8* src_yuy2, int src_stride_yuy2,
-                   uint8* dst_u, uint8* dst_v, int pix) {
+                   uint8* dst_u, uint8* dst_v, int width) {
   // Output a row of UV values, filtering 2 rows of YUY2
-  for (int x = 0; x < pix; x += 2) {
+  for (int x = 0; x < width; x += 2) {
     dst_u[0] = (src_yuy2[1] + src_yuy2[src_stride_yuy2 + 1] + 1) >> 1;
     dst_v[0] = (src_yuy2[3] + src_yuy2[src_stride_yuy2 + 3] + 1) >> 1;
     src_yuy2 += 4;
@@ -394,20 +416,22 @@ void YUY2ToUVRow_C(const uint8* src_yuy2, int src_stride_yuy2,
   }
 }
 
-void YUY2ToYRow_C(const uint8* src_yuy2,
-                  uint8* dst_y, int pix) {
+void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width) {
   // Copy a row of yuy2 Y values
-  for (int x = 0; x < pix; ++x) {
-    dst_y[0] = src_yuy2[0];
-    src_yuy2 += 2;
-    dst_y += 1;
+  for (int x = 0; x < width - 1; x += 2) {
+    dst_y[x] = src_yuy2[0];
+    dst_y[x + 1] = src_yuy2[2];
+    src_yuy2 += 4;
+  }
+  if (width & 1) {
+    dst_y[width - 1] = src_yuy2[0];
   }
 }
 
 void UYVYToUVRow_C(const uint8* src_uyvy, int src_stride_uyvy,
-                   uint8* dst_u, uint8* dst_v, int pix) {
+                   uint8* dst_u, uint8* dst_v, int width) {
   // Copy a row of uyvy UV values
-  for (int x = 0; x < pix; x += 2) {
+  for (int x = 0; x < width; x += 2) {
     dst_u[0] = (src_uyvy[0] + src_uyvy[src_stride_uyvy + 0] + 1) >> 1;
     dst_v[0] = (src_uyvy[2] + src_uyvy[src_stride_uyvy + 2] + 1) >> 1;
     src_uyvy += 4;
@@ -416,13 +440,15 @@ void UYVYToUVRow_C(const uint8* src_uyvy, int src_stride_uyvy,
   }
 }
 
-void UYVYToYRow_C(const uint8* src_uyvy,
-                  uint8* dst_y, int pix) {
+void UYVYToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width) {
   // Copy a row of uyvy Y values
-  for (int x = 0; x < pix; ++x) {
-    dst_y[0] = src_uyvy[1];
-    src_uyvy += 2;
-    dst_y += 1;
+  for (int x = 0; x < width - 1; x += 2) {
+    dst_y[x] = src_yuy2[1];
+    dst_y[x + 1] = src_yuy2[3];
+    src_yuy2 += 4;
+  }
+  if (width & 1) {
+    dst_y[width - 1] = src_yuy2[1];
   }
 }
 
diff --git a/source/row_neon.cc b/source/row_neon.cc
index afa98bdbf..bd88eae93 100644
--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@@ -22,38 +22,26 @@ extern "C" {
     "vld1.u8    {d0}, [%0]!                  \n"                               \
     "vld1.u32   {d2[0]}, [%1]!               \n"                               \
     "vld1.u32   {d2[1]}, [%2]!               \n"                               \
-                                                                               \
     "veor.u8    d2, d26                      \n"/*subtract 128 from u and v*/ \
-                                                                               \
     "vmull.s8   q8, d2, d24                  \n"/* u/v B/R component */       \
-                                                                               \
     "vmull.s8   q9, d2, d25                  \n"/* u/v G component */         \
-                                                                               \
    "vmov.u8    d1, #0                       \n"/* split odd/even y apart */  \
     "vtrn.u8    d0, d1                       \n"                               \
-                                                                               \
     "vsub.s16   q0, q0, q15                  \n"/* offset y */                \
     "vmul.s16   q0, q0, q14                  \n"                               \
-                                                                               \
     "vadd.s16   d18, d19                     \n"                               \
-                                                                               \
     "vqadd.s16  d20, d0, d16                 \n"                               \
     "vqadd.s16  d21, d1, d16                 \n"                               \
-                                                                               \
     "vqadd.s16  d22, d0, d17                 \n"                               \
     "vqadd.s16  d23, d1, d17                 \n"                               \
-                                                                               \
     "vqadd.s16  d16, d0, d18                 \n"                               \
     "vqadd.s16  d17, d1, d18                 \n"                               \
-                                                                               \
     "vqrshrun.s16 d0, q10, #6                \n"                               \
     "vqrshrun.s16 d1, q11, #6                \n"                               \
     "vqrshrun.s16 d2, q8, #6                 \n"                               \
-                                                                               \
     "vmovl.u8   q10, d0                      \n"/* set up for reinterleave*/  \
     "vmovl.u8   q11, d1                      \n"                               \
     "vmovl.u8   q8, d2                       \n"                               \
-                                                                               \
     "vtrn.u8    d20, d21                     \n"                               \
     "vtrn.u8    d22, d23                     \n"                               \
     "vtrn.u8    d16, d17                     \n"                               \
@@ -67,7 +55,7 @@ static const vec8 kUVToG = { -25, -25, -25, -25, -52, -52, -52, -52,
                              0, 0, 0, 0, 0, 0, 0, 0 };
 #endif
 
-#if defined(HAS_I420TOARGBROW_NEON)
+#ifdef HAS_I420TOARGBROW_NEON
 void I420ToARGBRow_NEON(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
@@ -99,7 +87,7 @@ YUVTORGB
 }
 #endif
 
-#if defined(HAS_I420TOBGRAROW_NEON)
+#ifdef HAS_I420TOBGRAROW_NEON
 void I420ToBGRARow_NEON(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
@@ -132,7 +120,7 @@ YUVTORGB
 }
 #endif
 
-#if defined(HAS_I420TOABGRROW_NEON)
+#ifdef HAS_I420TOABGRROW_NEON
 void I420ToABGRRow_NEON(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
@@ -165,10 +153,10 @@ YUVTORGB
 }
 #endif
 
-#if defined(HAS_SPLITUV_NEON)
+#ifdef HAS_SPLITUV_NEON
 // Reads 16 pairs of UV and write even values to dst_u and odd to dst_v
 // Alignment requirement: 16 bytes for pointers, and multiple of 16 pixels.
-void SplitUV_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
+void SplitUV_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
   asm volatile (
   "1:                                        \n"
     "vld2.u8    {q0,q1}, [%0]!               \n"  // load 16 pairs of UV
\n" // load 16 pairs of UV @@ -179,15 +167,14 @@ void SplitUV_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { : "+r"(src_uv), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 - "+r"(pix) // %3 // Output registers + "+r"(width) // %3 // Output registers : // Input registers : "memory", "cc", "q0", "q1" // Clobber List ); } #endif -#if defined(HAS_COPYROW_NEON) -// TODO(fbarchard): Test without pld on NexusS +#ifdef HAS_COPYROW_NEON // Copy multiple of 64 void CopyRow_NEON(const uint8* src, uint8* dst, int count) { asm volatile ( @@ -206,6 +193,170 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) { } #endif // HAS_COPYROW_NEON +#ifdef HAS_MIRRORROW_NEON +void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { + asm volatile ( + // compute where to start writing destination + "add %1, %2 \n" + + // work on segments that are multiples of 16 + "lsrs r3, %2, #4 \n" + + // the output is written in two block. 8 bytes followed + // by another 8. reading is done sequentially, from left to + // right. writing is done from right to left in block sizes + // %1, the destination pointer is incremented after writing + // the first of the two blocks. need to subtract that 8 off + // along with 16 to get the next location. + "mov r3, #-24 \n" + + "beq 2f \n" + + // back of destination by the size of the register that is + // going to be mirrord + "sub %1, #16 \n" + + // the loop needs to run on blocks of 16. what will be left + // over is either a negative number, the residuals that need + // to be done, or 0. if this isn't subtracted off here the + // loop will run one extra time. + "sub %2, #16 \n" + + "1: \n" + "vld1.8 {q0}, [%0]! \n" // src += 16 + + // mirror the bytes in the 64 bit segments. unable to mirror + // the bytes in the entire 128 bits in one go. + "vrev64.8 q0, q0 \n" + + // because of the inability to mirror the entire 128 bits + // mirror the writing out of the two 64 bit segments. + "vst1.8 {d1}, [%1]! \n" + "vst1.8 {d0}, [%1], r3 \n" // dst -= 16 + + "subs %2, #16 \n" + "bge 1b \n" + + // add 16 back to the counter. if the result is 0 there is no + // residuals so jump past + "adds %2, #16 \n" + "beq 5f \n" + + "add %1, #16 \n" + + "2: \n" + + "mov r3, #-3 \n" + + "sub %1, #2 \n" + "subs %2, #2 \n" + // check for 16*n+1 scenarios where segments_of_2 should not + // be run, but there is something left over. + "blt 4f \n" + +// do this in neon registers as per +// http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/ + "3: \n" + "vld2.8 {d0[0], d1[0]}, [%0]! \n" // src += 2 + + "vst1.8 {d1[0]}, [%1]! \n" + "vst1.8 {d0[0]}, [%1], r3 \n" // dst -= 2 + + "subs %2, #2 \n" + "bge 3b \n" + + "adds %2, #2 \n" + "beq 5f \n" + + "4: \n" + "add %1, #1 \n" + "vld1.8 {d0[0]}, [%0] \n" + "vst1.8 {d0[0]}, [%1] \n" + + "5: \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "r3", "q0" + ); +} +#endif + +#ifdef HAS_MIRRORROWUV_NEON +void MirrorRowUV_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width) { + asm volatile ( + // compute where to start writing destination + "add %1, %3 \n" // dst_a + width + "add %2, %3 \n" // dst_b + width + + // work on input segments that are multiples of 16, but + // width that has been passed is output segments, half + // the size of input. + "lsrs r12, %3, #3 \n" + + "beq 2f \n" + + // the output is written in to two blocks. 
+ "mov r12, #-8 \n" + + // back of destination by the size of the register that is + // going to be mirrord + "sub %1, #8 \n" + "sub %2, #8 \n" + + // the loop needs to run on blocks of 8. what will be left + // over is either a negative number, the residuals that need + // to be done, or 0. if this isn't subtracted off here the + // loop will run one extra time. + "sub %3, #8 \n" + + "1: \n" + "vld2.8 {d0, d1}, [%0]! \n" // src += 16 + + // mirror the bytes in the 64 bit segments + "vrev64.8 q0, q0 \n" + + "vst1.8 {d0}, [%1], r12 \n" // dst_a -= 8 + "vst1.8 {d1}, [%2], r12 \n" // dst_b -= 8 + + "subs %3, #8 \n" + "bge 1b \n" + + // add 8 back to the counter. if the result is 0 there is no + // residuals so return + "adds %3, #8 \n" + "beq 4f \n" + + "add %1, #8 \n" + "add %2, #8 \n" + + "2: \n" + + "mov r12, #-1 \n" + + "sub %1, #1 \n" + "sub %2, #1 \n" + + "3: \n" + "vld2.8 {d0[0], d1[0]}, [%0]! \n" // src += 2 + + "vst1.8 {d0[0]}, [%1], r12 \n" // dst_a -= 1 + "vst1.8 {d1[0]}, [%2], r12 \n" // dst_b -= 1 + + "subs %3, %3, #1 \n" + "bgt 3b \n" + "4: \n" + : "+r"(src), // %0 + "+r"(dst_a), // %1 + "+r"(dst_b), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "r12", "q0" + ); +} +#endif + #endif // __ARM_NEON__ #ifdef __cplusplus diff --git a/source/row_posix.cc b/source/row_posix.cc index de9a954d7..ee2e77968 100644 --- a/source/row_posix.cc +++ b/source/row_posix.cc @@ -1493,7 +1493,6 @@ void YToARGBRow_SSE2(const uint8* y_buf, #endif #ifdef HAS_MIRRORROW_SSSE3 - // Shuffle table for reversing the bytes. CONST uvec8 kShuffleMirror = { 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u @@ -1524,7 +1523,6 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { #endif #ifdef HAS_MIRRORROW_SSE2 - void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) { intptr_t temp_width = static_cast(width); asm volatile ( @@ -1554,6 +1552,40 @@ void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) { } #endif +#ifdef HAS_MIRRORROW_UV_SSSE3 +// Shuffle table for reversing the bytes of UV channels. +CONST uvec8 kShuffleMirrorUV = { + 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u +}; +void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, + int width) { + intptr_t temp_width = static_cast(width); + asm volatile ( + "movdqa %4,%%xmm1 \n" + "lea -16(%0,%3,2),%0 \n" + "sub %1,%2 \n" + "1: \n" + "movdqa (%0),%%xmm0 \n" + "lea -16(%0),%0 \n" + "pshufb %%xmm1,%%xmm0 \n" + "sub $8,%3 \n" + "movlpd %%xmm0,(%1) \n" + "movhpd %%xmm0,(%1,%2) \n" + "lea 8(%1),%1 \n" + "ja 1b \n" + : "+r"(src), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(temp_width) // %3 + : "m"(kShuffleMirrorUV) // %4 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1" +#endif + ); +} +#endif + #ifdef HAS_SPLITUV_SSE2 void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { asm volatile ( diff --git a/source/row_win.cc b/source/row_win.cc index d6169a306..8b008e830 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -1501,7 +1501,6 @@ __asm { #endif #ifdef HAS_MIRRORROW_SSE2 - // SSE2 version has movdqu so it can be used on unaligned buffers when SSSE3 // version can not. __declspec(naked) @@ -1529,6 +1528,41 @@ __asm { } #endif +#ifdef HAS_MIRRORROW_UV_SSSE3 +// Shuffle table for reversing the bytes of UV channels. 
+static const uvec8 kShuffleMirrorUV = {
+  14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
+};
+
+__declspec(naked)
+void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
+                       int width) {
+  __asm {
+    push      edi
+    mov       eax, [esp + 4 + 4]   // src
+    mov       edx, [esp + 4 + 8]   // dst_u
+    mov       edi, [esp + 4 + 12]  // dst_v
+    mov       ecx, [esp + 4 + 16]  // width
+    movdqa    xmm1, kShuffleMirrorUV
+    lea       eax, [eax + ecx * 2 - 16]
+    sub       edi, edx
+
+ convertloop:
+    movdqa    xmm0, [eax]
+    lea       eax, [eax - 16]
+    pshufb    xmm0, xmm1
+    sub       ecx, 8
+    movlpd    qword ptr [edx], xmm0
+    movhpd    qword ptr [edx + edi], xmm0
+    lea       edx, [edx + 8]
+    ja        convertloop
+
+    pop       edi
+    ret
+  }
+}
+#endif
+
 #ifdef HAS_SPLITUV_SSE2
 __declspec(naked)
 void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
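
Note: every C rewrite in this change follows the same two-pixels-per-iteration shape. The loop body consumes pixel pairs (for UV rows, one 4-byte block holding two interleaved pairs), and an "if (width & 1)" tail copies the final pixel when the width is odd, so the fast path never reads past the end of the row. The sketch below is a minimal standalone illustration of that pattern; split_uv_pair and the test harness are hypothetical names for this example, not code from libyuv.

    #include <stdio.h>

    typedef unsigned char uint8;

    // Two pixels per iteration, in the style of the new SplitUV_C: each pass
    // consumes 4 bytes (two interleaved UV pairs), then the tail handles the
    // leftover pixel of an odd-width row.
    static void split_uv_pair(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                              int width) {
      int x;
      for (x = 0; x < width - 1; x += 2) {
        dst_u[x] = src_uv[0];
        dst_u[x + 1] = src_uv[2];
        dst_v[x] = src_uv[1];
        dst_v[x + 1] = src_uv[3];
        src_uv += 4;
      }
      if (width & 1) {  // odd width: copy the last UV pair outside the loop
        dst_u[width - 1] = src_uv[0];
        dst_v[width - 1] = src_uv[1];
      }
    }

    int main(void) {
      const uint8 uv[10] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};  // 5 UV pairs
      uint8 u[5], v[5];
      int i;
      split_uv_pair(uv, u, v, 5);  // odd width exercises the tail path
      for (i = 0; i < 5; ++i)
        printf("%u/%u ", u[i], v[i]);
      printf("\n");  // prints: 1/2 3/4 5/6 7/8 9/10
      return 0;
    }

The same structure appears in MirrorRow_C, MirrorRowUV_C, YUY2ToYRow_C, and UYVYToYRow_C above; only the source stride and direction differ.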