mirror of https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-06 16:56:55 +08:00
splituv and mirroruv in row use 2 pixels at a time in C
BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/432006
git-svn-id: http://libyuv.googlecode.com/svn/trunk@201 16f28f9a-4ce2-e073-06de-1de4eb20be90
This commit is contained in:
parent f69e90a19e
commit 16a96645b4
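The change unrolls the C fallback row functions (SplitUV_C, MirrorRow_C, MirrorRowUV_C, YUY2ToYRow_C, UYVYToYRow_C) so that each loop iteration handles two pixels, with a final check for an odd width. As a minimal sketch of that pattern, here is the two-pixel SplitUV_C as it appears in the diff below, lightly commented; the uint8 typedef stands in for libyuv's own type so the snippet is self-contained:

typedef unsigned char uint8;  // stand-in for libyuv's uint8 typedef

// De-interleave a UV row into separate U and V planes, two pixels per pass.
void SplitUV_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
  for (int x = 0; x < width - 1; x += 2) {
    dst_u[x] = src_uv[0];        // even bytes go to the U plane
    dst_u[x + 1] = src_uv[2];
    dst_v[x] = src_uv[1];        // odd bytes go to the V plane
    dst_v[x + 1] = src_uv[3];
    src_uv += 4;                 // two UV pairs consumed per iteration
  }
  if (width & 1) {               // odd width: copy the remaining UV pair
    dst_u[width - 1] = src_uv[0];
    dst_v[width - 1] = src_uv[1];
  }
}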
@@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 200
Version: 201
License: BSD
License File: LICENSE


@@ -11,7 +11,7 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_

#define LIBYUV_VERSION 200
#define LIBYUV_VERSION 201

#endif // INCLUDE_LIBYUV_VERSION_H_


@@ -23,12 +23,6 @@ extern "C" {

#if (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \
!defined(YUV_DISABLE_ASM)
// Note static const preferred, but gives internal compiler error on gcc 4.2
// Shuffle table for reversing the bytes of UV channels.
uvec8 kShuffleMirrorUV = {
14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
};

#if defined(__APPLE__) && defined(__i386__)
#define DECLARE_FUNCTION(name) \
".text \n" \
@@ -759,8 +753,7 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
static void TransposeWx8_C(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int w) {
int i;
for (i = 0; i < w; ++i) {
for (int i = 0; i < w; ++i) {
dst[0] = src[0 * src_stride];
dst[1] = src[1 * src_stride];
dst[2] = src[2 * src_stride];
@@ -777,9 +770,8 @@ static void TransposeWx8_C(const uint8* src, int src_stride,
static void TransposeWxH_C(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width, int height) {
int i, j;
for (i = 0; i < width; ++i)
for (j = 0; j < height; ++j)
for (int i = 0; i < width; ++i)
for (int j = 0; j < height; ++j)
dst[i * dst_stride + j] = src[j * src_stride + i];
}

@@ -1005,79 +997,6 @@ void RotateUV270(const uint8* src, int src_stride,
width, height);
}

#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM)
#define HAS_MIRRORROW_UV_SSSE3
__declspec(naked)
void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_a, uint8* dst_b,
int width) {
__asm {
push edi
mov eax, [esp + 4 + 4] // src
mov edx, [esp + 4 + 8] // dst_a
mov edi, [esp + 4 + 12] // dst_b
mov ecx, [esp + 4 + 16] // width
movdqa xmm1, kShuffleMirrorUV
lea eax, [eax + ecx * 2 - 16]
sub edi, edx

convertloop:
movdqa xmm0, [eax]
lea eax, [eax - 16]
pshufb xmm0, xmm1
sub ecx, 8
movlpd qword ptr [edx], xmm0
movhpd qword ptr [edx + edi], xmm0
lea edx, [edx + 8]
ja convertloop

pop edi
ret
}
}

#elif (defined(__i386__) || defined(__x86_64__)) && \
!defined(YUV_DISABLE_ASM)
#define HAS_MIRRORROW_UV_SSSE3
void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_a, uint8* dst_b,
int width) {
intptr_t temp_width = static_cast<intptr_t>(width);
asm volatile (
"movdqa %4,%%xmm1 \n"
"lea -16(%0,%3,2),%0 \n"
"sub %1,%2 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"lea -16(%0),%0 \n"
"pshufb %%xmm1,%%xmm0 \n"
"sub $8,%3 \n"
"movlpd %%xmm0,(%1) \n"
"movhpd %%xmm0,(%1,%2) \n"
"lea 8(%1),%1 \n"
"ja 1b \n"
: "+r"(src), // %0
"+r"(dst_a), // %1
"+r"(dst_b), // %2
"+r"(temp_width) // %3
: "m"(kShuffleMirrorUV) // %4
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1"
#endif
);
}
#endif

static void MirrorRowUV_C(const uint8* src,
uint8* dst_a, uint8* dst_b,
int width) {
src += (width << 1) - 2;
for (int i = 0; i < width; ++i) {
dst_a[i] = src[0];
dst_b[i] = src[1];
src -= 2;
}
}

void RotateUV180(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,

@@ -19,94 +19,6 @@ extern "C" {

#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM)

void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
asm volatile (
// compute where to start writing destination
"add %1, %2 \n"

// work on segments that are multiples of 16
"lsrs r3, %2, #4 \n"

// the output is written in two block. 8 bytes followed
// by another 8. reading is done sequentially, from left to
// right. writing is done from right to left in block sizes
// %1, the destination pointer is incremented after writing
// the first of the two blocks. need to subtract that 8 off
// along with 16 to get the next location.
"mov r3, #-24 \n"

"beq 2f \n"

// back of destination by the size of the register that is
// going to be mirrord
"sub %1, #16 \n"

// the loop needs to run on blocks of 16. what will be left
// over is either a negative number, the residuals that need
// to be done, or 0. if this isn't subtracted off here the
// loop will run one extra time.
"sub %2, #16 \n"

"1: \n"
"vld1.8 {q0}, [%0]! \n" // src += 16

// mirror the bytes in the 64 bit segments. unable to mirror
// the bytes in the entire 128 bits in one go.
"vrev64.8 q0, q0 \n"

// because of the inability to mirror the entire 128 bits
// mirror the writing out of the two 64 bit segments.
"vst1.8 {d1}, [%1]! \n"
"vst1.8 {d0}, [%1], r3 \n" // dst -= 16

"subs %2, #16 \n"
"bge 1b \n"

// add 16 back to the counter. if the result is 0 there is no
// residuals so jump past
"adds %2, #16 \n"
"beq 5f \n"

"add %1, #16 \n"

"2: \n"

"mov r3, #-3 \n"

"sub %1, #2 \n"
"subs %2, #2 \n"
// check for 16*n+1 scenarios where segments_of_2 should not
// be run, but there is something left over.
"blt 4f \n"

// do this in neon registers as per
// http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/
"3: \n"
"vld2.8 {d0[0], d1[0]}, [%0]! \n" // src += 2

"vst1.8 {d1[0]}, [%1]! \n"
"vst1.8 {d0[0]}, [%1], r3 \n" // dst -= 2

"subs %2, #2 \n"
"bge 3b \n"

"adds %2, #2 \n"
"beq 5f \n"

"4: \n"
"add %1, #1 \n"
"vld1.8 {d0[0]}, [%0] \n"
"vst1.8 {d0[0]}, [%1] \n"

"5: \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
:
: "memory", "cc", "r3", "q0"
);
}

static const uvec8 vtbl_4x4_transpose =
{ 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };

@@ -272,80 +184,6 @@ void TransposeWx8_NEON(const uint8* src, int src_stride,
);
}

void MirrorRowUV_NEON(const uint8* src,
uint8* dst_a, uint8* dst_b,
int width) {
asm volatile (
// compute where to start writing destination
"add %1, %3 \n" // dst_a + width
"add %2, %3 \n" // dst_b + width

// work on input segments that are multiples of 16, but
// width that has been passed is output segments, half
// the size of input.
"lsrs r12, %3, #3 \n"

"beq 2f \n"

// the output is written in to two blocks.
"mov r12, #-8 \n"

// back of destination by the size of the register that is
// going to be mirrord
"sub %1, #8 \n"
"sub %2, #8 \n"

// the loop needs to run on blocks of 8. what will be left
// over is either a negative number, the residuals that need
// to be done, or 0. if this isn't subtracted off here the
// loop will run one extra time.
"sub %3, #8 \n"

"1: \n"
"vld2.8 {d0, d1}, [%0]! \n" // src += 16

// mirror the bytes in the 64 bit segments
"vrev64.8 q0, q0 \n"

"vst1.8 {d0}, [%1], r12 \n" // dst_a -= 8
"vst1.8 {d1}, [%2], r12 \n" // dst_b -= 8

"subs %3, #8 \n"
"bge 1b \n"

// add 8 back to the counter. if the result is 0 there is no
// residuals so return
"adds %3, #8 \n"
"beq 4f \n"

"add %1, #8 \n"
"add %2, #8 \n"

"2: \n"

"mov r12, #-1 \n"

"sub %1, #1 \n"
"sub %2, #1 \n"

"3: \n"
"vld2.8 {d0[0], d1[0]}, [%0]! \n" // src += 2

"vst1.8 {d0[0]}, [%1], r12 \n" // dst_a -= 1
"vst1.8 {d1[0]}, [%2], r12 \n" // dst_b -= 1

"subs %3, %3, #1 \n"
"bgt 3b \n"
"4: \n"
: "+r"(src), // %0
"+r"(dst_a), // %1
"+r"(dst_b), // %2
"+r"(width) // %3
:
: "memory", "cc", "r12", "q0"
);
}

static const uvec8 vtbl_4x4_transpose_di =
{ 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 };


@@ -54,6 +54,7 @@ extern "C" {
#define HAS_I444TOARGBROW_SSSE3
#define HAS_MIRRORROW_SSSE3
#define HAS_MIRRORROW_SSE2
#define HAS_MIRRORROWUV_SSSE3
#define HAS_SPLITUV_SSE2
#define HAS_COPYROW_SSE2
#define HAS_COPYROW_X86
@@ -66,6 +67,7 @@ extern "C" {
// The following are available on Neon platforms
#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM)
#define HAS_MIRRORROW_NEON
#define HAS_MIRRORROWUV_NEON
#define HAS_SPLITUV_NEON
#define HAS_COPYROW_NEON
#define HAS_I420TOARGBROW_NEON
@@ -126,6 +128,10 @@ void MirrorRow_SSE2(const uint8* src, uint8* dst, int width);
void MirrorRow_NEON(const uint8* src, uint8* dst, int width);
void MirrorRow_C(const uint8* src, uint8* dst, int width);

void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, int width);
void MirrorRowUV_NEON(const uint8* src, uint8* dst_u, uint8* dst_v, int width);
void MirrorRowUV_C(const uint8* src, uint8* dst_u, uint8* dst_v, int width);

void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
void SplitUV_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
void SplitUV_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);

@@ -18,8 +18,8 @@ namespace libyuv {
extern "C" {
#endif

void ABGRToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int pix) {
for (int x = 0; x < pix; ++x) {
void ABGRToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int width) {
for (int x = 0; x < width; ++x) {
// To support in-place conversion.
uint8 r = src_abgr[0];
uint8 g = src_abgr[1];
@@ -34,8 +34,8 @@ void ABGRToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int pix) {
}
}

void BGRAToARGBRow_C(const uint8* src_bgra, uint8* dst_argb, int pix) {
for (int x = 0; x < pix; ++x) {
void BGRAToARGBRow_C(const uint8* src_bgra, uint8* dst_argb, int width) {
for (int x = 0; x < width; ++x) {
// To support in-place conversion.
uint8 a = src_bgra[0];
uint8 r = src_bgra[1];
@@ -50,8 +50,8 @@ void BGRAToARGBRow_C(const uint8* src_bgra, uint8* dst_argb, int pix) {
}
}

void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int pix) {
for (int x = 0; x < pix; ++x) {
void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int width) {
for (int x = 0; x < width; ++x) {
uint8 b = src_rgb24[0];
uint8 g = src_rgb24[1];
uint8 r = src_rgb24[2];
@@ -64,8 +64,8 @@ void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int pix) {
}
}

void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int pix) {
for (int x = 0; x < pix; ++x) {
void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int width) {
for (int x = 0; x < width; ++x) {
uint8 r = src_raw[0];
uint8 g = src_raw[1];
uint8 b = src_raw[2];
@@ -78,8 +78,8 @@ void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int pix) {
}
}

void RGB565ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int pix) {
for (int x = 0; x < pix; ++x) {
void RGB565ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int width) {
for (int x = 0; x < width; ++x) {
uint8 b = src_rgb[0] & 0x1f;
uint8 g = (src_rgb[0] >> 5) | ((src_rgb[1] & 0x07) << 3);
uint8 r = src_rgb[1] >> 3;
@@ -92,8 +92,8 @@ void RGB565ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int pix) {
}
}

void ARGB1555ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int pix) {
for (int x = 0; x < pix; ++x) {
void ARGB1555ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int width) {
for (int x = 0; x < width; ++x) {
uint8 b = src_rgb[0] & 0x1f;
uint8 g = (src_rgb[0] >> 5) | ((src_rgb[1] & 0x03) << 3);
uint8 r = (src_rgb[1] & 0x7c) >> 2;
@@ -107,8 +107,8 @@ void ARGB1555ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int pix) {
}
}

void ARGB4444ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int pix) {
for (int x = 0; x < pix; ++x) {
void ARGB4444ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int width) {
for (int x = 0; x < width; ++x) {
uint8 a = src_rgb[1] >> 4;
uint8 r = src_rgb[1] & 0x0f;
uint8 g = src_rgb[0] >> 4;
@@ -122,8 +122,8 @@ void ARGB4444ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int pix) {
}
}

void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int pix) {
for (int x = 0; x < pix; ++x) {
void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
for (int x = 0; x < width; ++x) {
uint8 b = src_argb[0];
uint8 g = src_argb[1];
uint8 r = src_argb[2];
@@ -135,8 +135,8 @@ void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int pix) {
}
}

void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int pix) {
for (int x = 0; x < pix; ++x) {
void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int width) {
for (int x = 0; x < width; ++x) {
uint8 b = src_argb[0];
uint8 g = src_argb[1];
uint8 r = src_argb[2];
@@ -149,8 +149,8 @@ void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int pix) {
}

// TODO(fbarchard): support big endian CPU
void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int pix) {
for (int x = 0; x < pix; ++x) {
void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
for (int x = 0; x < width; ++x) {
uint8 b = src_argb[0] >> 3;
uint8 g = src_argb[1] >> 2;
uint8 r = src_argb[2] >> 3;
@@ -160,8 +160,8 @@ void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int pix) {
}
}

void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int pix) {
for (int x = 0; x < pix; ++x) {
void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
for (int x = 0; x < width; ++x) {
uint8 b = src_argb[0] >> 3;
uint8 g = src_argb[1] >> 3;
uint8 r = src_argb[2] >> 3;
@@ -172,8 +172,8 @@ void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int pix) {
}
}

void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int pix) {
for (int x = 0; x < pix; ++x) {
void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
for (int x = 0; x < width; ++x) {
uint8 b = src_argb[0] >> 4;
uint8 g = src_argb[1] >> 4;
uint8 r = src_argb[2] >> 4;
@@ -233,9 +233,9 @@ MAKEROWY(ARGB,2,1,0)
MAKEROWY(BGRA,1,2,3)
MAKEROWY(ABGR,0,1,2)

void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix) {
void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) {
// Copy a Y to RGB.
for (int x = 0; x < pix; ++x) {
for (int x = 0; x < width; ++x) {
uint8 y = src_y[0];
dst_argb[2] = dst_argb[1] = dst_argb[0] = y;
dst_argb[3] = 255u;
@@ -360,20 +360,42 @@ void YToARGBRow_C(const uint8* y_buf, uint8* rgb_buf, int width) {

void MirrorRow_C(const uint8* src, uint8* dst, int width) {
src += width - 1;
for (int i = 0; i < width; ++i) {
dst[i] = src[0];
--src;
for (int x = 0; x < width - 1; x += 2) {
dst[x] = src[0];
dst[x + 1] = src[-1];
src -= 2;
}
if (width & 1) {
dst[width - 1] = src[0];
}
}

void SplitUV_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
// Copy a row of UV.
for (int x = 0; x < pix; ++x) {
dst_u[0] = src_uv[0];
dst_v[0] = src_uv[1];
src_uv += 2;
dst_u += 1;
dst_v += 1;
void MirrorRowUV_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
src_uv += (width - 1) << 1;
for (int x = 0; x < width - 1; x += 2) {
dst_u[x] = src_uv[0];
dst_u[x + 1] = src_uv[-2];
dst_v[x] = src_uv[1];
dst_v[x + 1] = src_uv[-2 + 1];
src_uv -= 4;
}
if (width & 1) {
dst_u[width - 1] = src_uv[0];
dst_v[width - 1] = src_uv[1];
}
}

void SplitUV_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
for (int x = 0; x < width - 1; x += 2) {
dst_u[x] = src_uv[0];
dst_u[x + 1] = src_uv[2];
dst_v[x] = src_uv[1];
dst_v[x + 1] = src_uv[3];
src_uv += 4;
}
if (width & 1) {
dst_u[width - 1] = src_uv[0];
dst_v[width - 1] = src_uv[1];
}
}

@@ -383,9 +405,9 @@ void CopyRow_C(const uint8* src, uint8* dst, int count) {

// Filter 2 rows of YUY2 UV's (422) into U and V (420)
void YUY2ToUVRow_C(const uint8* src_yuy2, int src_stride_yuy2,
uint8* dst_u, uint8* dst_v, int pix) {
uint8* dst_u, uint8* dst_v, int width) {
// Output a row of UV values, filtering 2 rows of YUY2
for (int x = 0; x < pix; x += 2) {
for (int x = 0; x < width; x += 2) {
dst_u[0] = (src_yuy2[1] + src_yuy2[src_stride_yuy2 + 1] + 1) >> 1;
dst_v[0] = (src_yuy2[3] + src_yuy2[src_stride_yuy2 + 3] + 1) >> 1;
src_yuy2 += 4;
@@ -394,20 +416,22 @@ void YUY2ToUVRow_C(const uint8* src_yuy2, int src_stride_yuy2,
}
}

void YUY2ToYRow_C(const uint8* src_yuy2,
uint8* dst_y, int pix) {
void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width) {
// Copy a row of yuy2 Y values
for (int x = 0; x < pix; ++x) {
dst_y[0] = src_yuy2[0];
src_yuy2 += 2;
dst_y += 1;
for (int x = 0; x < width - 1; x += 2) {
dst_y[x] = src_yuy2[0];
dst_y[x + 1] = src_yuy2[2];
src_yuy2 += 4;
}
if (width & 1) {
dst_y[width - 1] = src_yuy2[0];
}
}

void UYVYToUVRow_C(const uint8* src_uyvy, int src_stride_uyvy,
uint8* dst_u, uint8* dst_v, int pix) {
uint8* dst_u, uint8* dst_v, int width) {
// Copy a row of uyvy UV values
for (int x = 0; x < pix; x += 2) {
for (int x = 0; x < width; x += 2) {
dst_u[0] = (src_uyvy[0] + src_uyvy[src_stride_uyvy + 0] + 1) >> 1;
dst_v[0] = (src_uyvy[2] + src_uyvy[src_stride_uyvy + 2] + 1) >> 1;
src_uyvy += 4;
@@ -416,13 +440,15 @@ void UYVYToUVRow_C(const uint8* src_uyvy, int src_stride_uyvy,
}
}

void UYVYToYRow_C(const uint8* src_uyvy,
uint8* dst_y, int pix) {
void UYVYToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width) {
// Copy a row of uyvy Y values
for (int x = 0; x < pix; ++x) {
dst_y[0] = src_uyvy[1];
src_uyvy += 2;
dst_y += 1;
for (int x = 0; x < width - 1; x += 2) {
dst_y[x] = src_yuy2[1];
dst_y[x + 1] = src_yuy2[3];
src_yuy2 += 4;
}
if (width & 1) {
dst_y[width - 1] = src_yuy2[1];
}
}


@@ -22,38 +22,26 @@ extern "C" {
"vld1.u8 {d0}, [%0]! \n" \
"vld1.u32 {d2[0]}, [%1]! \n" \
"vld1.u32 {d2[1]}, [%2]! \n" \
\
"veor.u8 d2, d26 \n"/*subtract 128 from u and v*/\
\
"vmull.s8 q8, d2, d24 \n"/* u/v B/R component */\
\
"vmull.s8 q9, d2, d25 \n"/* u/v G component */\
\
"vmov.u8 d1, #0 \n"/* split odd/even y apart */\
"vtrn.u8 d0, d1 \n" \
\
"vsub.s16 q0, q0, q15 \n"/* offset y */\
"vmul.s16 q0, q0, q14 \n" \
\
"vadd.s16 d18, d19 \n" \
\
"vqadd.s16 d20, d0, d16 \n" \
"vqadd.s16 d21, d1, d16 \n" \
\
"vqadd.s16 d22, d0, d17 \n" \
"vqadd.s16 d23, d1, d17 \n" \
\
"vqadd.s16 d16, d0, d18 \n" \
"vqadd.s16 d17, d1, d18 \n" \
\
"vqrshrun.s16 d0, q10, #6 \n" \
"vqrshrun.s16 d1, q11, #6 \n" \
"vqrshrun.s16 d2, q8, #6 \n" \
\
"vmovl.u8 q10, d0 \n"/* set up for reinterleave*/\
"vmovl.u8 q11, d1 \n" \
"vmovl.u8 q8, d2 \n" \
\
"vtrn.u8 d20, d21 \n" \
"vtrn.u8 d22, d23 \n" \
"vtrn.u8 d16, d17 \n" \
@@ -67,7 +55,7 @@ static const vec8 kUVToG = { -25, -25, -25, -25, -52, -52, -52, -52,
0, 0, 0, 0, 0, 0, 0, 0 };
#endif

#if defined(HAS_I420TOARGBROW_NEON)
#ifdef HAS_I420TOARGBROW_NEON
void I420ToARGBRow_NEON(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
@@ -99,7 +87,7 @@ YUVTORGB
}
#endif

#if defined(HAS_I420TOBGRAROW_NEON)
#ifdef HAS_I420TOBGRAROW_NEON
void I420ToBGRARow_NEON(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
@@ -132,7 +120,7 @@ YUVTORGB
}
#endif

#if defined(HAS_I420TOABGRROW_NEON)
#ifdef HAS_I420TOABGRROW_NEON
void I420ToABGRRow_NEON(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
@@ -165,10 +153,10 @@ YUVTORGB
}
#endif

#if defined(HAS_SPLITUV_NEON)
#ifdef HAS_SPLITUV_NEON
// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v
// Alignment requirement: 16 bytes for pointers, and multiple of 16 pixels.
void SplitUV_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
void SplitUV_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
asm volatile (
"1: \n"
"vld2.u8 {q0,q1}, [%0]! \n" // load 16 pairs of UV
@@ -179,15 +167,14 @@ void SplitUV_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(pix) // %3 // Output registers
"+r"(width) // %3 // Output registers
: // Input registers
: "memory", "cc", "q0", "q1" // Clobber List
);
}
#endif

#if defined(HAS_COPYROW_NEON)
// TODO(fbarchard): Test without pld on NexusS
#ifdef HAS_COPYROW_NEON
// Copy multiple of 64
void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
asm volatile (
@@ -206,6 +193,170 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
}
#endif // HAS_COPYROW_NEON

#ifdef HAS_MIRRORROW_NEON
void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
asm volatile (
// compute where to start writing destination
"add %1, %2 \n"

// work on segments that are multiples of 16
"lsrs r3, %2, #4 \n"

// the output is written in two block. 8 bytes followed
// by another 8. reading is done sequentially, from left to
// right. writing is done from right to left in block sizes
// %1, the destination pointer is incremented after writing
// the first of the two blocks. need to subtract that 8 off
// along with 16 to get the next location.
"mov r3, #-24 \n"

"beq 2f \n"

// back of destination by the size of the register that is
// going to be mirrord
"sub %1, #16 \n"

// the loop needs to run on blocks of 16. what will be left
// over is either a negative number, the residuals that need
// to be done, or 0. if this isn't subtracted off here the
// loop will run one extra time.
"sub %2, #16 \n"

"1: \n"
"vld1.8 {q0}, [%0]! \n" // src += 16

// mirror the bytes in the 64 bit segments. unable to mirror
// the bytes in the entire 128 bits in one go.
"vrev64.8 q0, q0 \n"

// because of the inability to mirror the entire 128 bits
// mirror the writing out of the two 64 bit segments.
"vst1.8 {d1}, [%1]! \n"
"vst1.8 {d0}, [%1], r3 \n" // dst -= 16

"subs %2, #16 \n"
"bge 1b \n"

// add 16 back to the counter. if the result is 0 there is no
// residuals so jump past
"adds %2, #16 \n"
"beq 5f \n"

"add %1, #16 \n"

"2: \n"

"mov r3, #-3 \n"

"sub %1, #2 \n"
"subs %2, #2 \n"
// check for 16*n+1 scenarios where segments_of_2 should not
// be run, but there is something left over.
"blt 4f \n"

// do this in neon registers as per
// http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/
"3: \n"
"vld2.8 {d0[0], d1[0]}, [%0]! \n" // src += 2

"vst1.8 {d1[0]}, [%1]! \n"
"vst1.8 {d0[0]}, [%1], r3 \n" // dst -= 2

"subs %2, #2 \n"
"bge 3b \n"

"adds %2, #2 \n"
"beq 5f \n"

"4: \n"
"add %1, #1 \n"
"vld1.8 {d0[0]}, [%0] \n"
"vst1.8 {d0[0]}, [%1] \n"

"5: \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
:
: "memory", "cc", "r3", "q0"
);
}
#endif

#ifdef HAS_MIRRORROWUV_NEON
void MirrorRowUV_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width) {
asm volatile (
// compute where to start writing destination
"add %1, %3 \n" // dst_a + width
"add %2, %3 \n" // dst_b + width

// work on input segments that are multiples of 16, but
// width that has been passed is output segments, half
// the size of input.
"lsrs r12, %3, #3 \n"

"beq 2f \n"

// the output is written in to two blocks.
"mov r12, #-8 \n"

// back of destination by the size of the register that is
// going to be mirrord
"sub %1, #8 \n"
"sub %2, #8 \n"

// the loop needs to run on blocks of 8. what will be left
// over is either a negative number, the residuals that need
// to be done, or 0. if this isn't subtracted off here the
// loop will run one extra time.
"sub %3, #8 \n"

"1: \n"
"vld2.8 {d0, d1}, [%0]! \n" // src += 16

// mirror the bytes in the 64 bit segments
"vrev64.8 q0, q0 \n"

"vst1.8 {d0}, [%1], r12 \n" // dst_a -= 8
"vst1.8 {d1}, [%2], r12 \n" // dst_b -= 8

"subs %3, #8 \n"
"bge 1b \n"

// add 8 back to the counter. if the result is 0 there is no
// residuals so return
"adds %3, #8 \n"
"beq 4f \n"

"add %1, #8 \n"
"add %2, #8 \n"

"2: \n"

"mov r12, #-1 \n"

"sub %1, #1 \n"
"sub %2, #1 \n"

"3: \n"
"vld2.8 {d0[0], d1[0]}, [%0]! \n" // src += 2

"vst1.8 {d0[0]}, [%1], r12 \n" // dst_a -= 1
"vst1.8 {d1[0]}, [%2], r12 \n" // dst_b -= 1

"subs %3, %3, #1 \n"
"bgt 3b \n"
"4: \n"
: "+r"(src), // %0
"+r"(dst_a), // %1
"+r"(dst_b), // %2
"+r"(width) // %3
:
: "memory", "cc", "r12", "q0"
);
}
#endif

#endif // __ARM_NEON__

#ifdef __cplusplus

@@ -1493,7 +1493,6 @@ void YToARGBRow_SSE2(const uint8* y_buf,
#endif

#ifdef HAS_MIRRORROW_SSSE3

// Shuffle table for reversing the bytes.
CONST uvec8 kShuffleMirror = {
15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
@@ -1524,7 +1523,6 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
#endif

#ifdef HAS_MIRRORROW_SSE2

void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
intptr_t temp_width = static_cast<intptr_t>(width);
asm volatile (
@@ -1554,6 +1552,40 @@ void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
}
#endif

#ifdef HAS_MIRRORROW_UV_SSSE3
// Shuffle table for reversing the bytes of UV channels.
CONST uvec8 kShuffleMirrorUV = {
14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
};
void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
int width) {
intptr_t temp_width = static_cast<intptr_t>(width);
asm volatile (
"movdqa %4,%%xmm1 \n"
"lea -16(%0,%3,2),%0 \n"
"sub %1,%2 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"lea -16(%0),%0 \n"
"pshufb %%xmm1,%%xmm0 \n"
"sub $8,%3 \n"
"movlpd %%xmm0,(%1) \n"
"movhpd %%xmm0,(%1,%2) \n"
"lea 8(%1),%1 \n"
"ja 1b \n"
: "+r"(src), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(temp_width) // %3
: "m"(kShuffleMirrorUV) // %4
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1"
#endif
);
}
#endif

#ifdef HAS_SPLITUV_SSE2
void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (

@@ -1501,7 +1501,6 @@ __asm {
#endif

#ifdef HAS_MIRRORROW_SSE2

// SSE2 version has movdqu so it can be used on unaligned buffers when SSSE3
// version can not.
__declspec(naked)
@@ -1529,6 +1528,41 @@ __asm {
}
#endif

#ifdef HAS_MIRRORROW_UV_SSSE3
// Shuffle table for reversing the bytes of UV channels.
static const uvec8 kShuffleMirrorUV = {
14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
};

__declspec(naked)
void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
int width) {
__asm {
push edi
mov eax, [esp + 4 + 4] // src
mov edx, [esp + 4 + 8] // dst_u
mov edi, [esp + 4 + 12] // dst_v
mov ecx, [esp + 4 + 16] // width
movdqa xmm1, kShuffleMirrorUV
lea eax, [eax + ecx * 2 - 16]
sub edi, edx

convertloop:
movdqa xmm0, [eax]
lea eax, [eax - 16]
pshufb xmm0, xmm1
sub ecx, 8
movlpd qword ptr [edx], xmm0
movhpd qword ptr [edx + edi], xmm0
lea edx, [edx + 8]
ja convertloop

pop edi
ret
}
}
#endif

#ifdef HAS_SPLITUV_SSE2
__declspec(naked)
void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
