rotate for x86 and bayer refactored - 3x faster.

BUG=1 TEST=tested with talk unittests. Review URL: http://webrtc-codereview.appspot.com/250004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@42 16f28f9a-4ce2-e073-06de-1de4eb20be90
2026-02-08 10:46:50 +08:00 · 2011-10-27 20:52:52 +00:00 · 2011-10-27 20:52:52 +00:00 · 780203897c
commit 780203897c
parent 3f4c056b1e
5 changed files with 807 additions and 525 deletions
--- a/include/libyuv/planar_functions.h
+++ b/include/libyuv/planar_functions.h
@ -25,6 +25,14 @@ int I420Copy(const uint8* src_y, int src_stride_y,
             uint8* dst_v, int dst_stride_v,
             int width, int height);
 // Draw a rectangle into I420
 int I420Rect(uint8* dst_y, int dst_stride_y,
             uint8* dst_u, int dst_stride_u,
             uint8* dst_v, int dst_stride_v,
             int x, int y,
             int width, int height,
             int value_y, int value_u, int value_v);
 // Convert I422 to I420.  Used by MJPG.
 int I422ToI420(const uint8* src_y, int src_stride_y,
               const uint8* src_u, int src_stride_u,
@ -146,7 +154,7 @@ int BGRAToARGB(const uint8* src_bgra, int src_stride_bgra,
 // Convert ARGB to I400.
 int ARGBToI400(const uint8* src_argb, int src_stride_argb,
-               const uint8* dst_y, int dst_stride_y,
+               uint8* dst_y, int dst_stride_y,
               int width, int height);
 }  // namespace libyuv
--- a/include/libyuv/rotate.h
+++ b/include/libyuv/rotate.h
@ -11,39 +11,41 @@
 #ifndef INCLUDE_LIBYUV_ROTATE_H_
 #define INCLUDE_LIBYUV_ROTATE_H_
-#include "basic_types.h"
+#include "libyuv/basic_types.h"
 namespace libyuv {
 // Supported rotation
 enum RotationMode {
  kRotate0 = 0, // No rotation
  kRotate90 = 90,  // Rotate 90 degrees clockwise
  kRotate180 = 180,  // Rotate 180 degrees
  kRotate270 = 270,  // Rotate 270 degrees clockwise
  // Deprecated
  kRotateNone = 0,
  kRotateClockwise = 90,
  kRotateCounterClockwise = 270,
  kRotate180 = 180,
 };
 // Rotate I420 frame
-int
+int I420Rotate(const uint8* src_y, int src_stride_y,
-I420Rotate(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
-           const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
-           const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
-           uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
-           uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
-           uint8* dst_v, int dst_stride_v,
+               int width, int height,
-           int width, int height,
+               RotationMode mode);
           RotationMode mode);
-// Split a NV12 input buffer into Y, U, V buffers and
+// Rotate NV12 input and store in I420
-// then rotate the buffers.
+int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
-int
+                     const uint8* src_uv, int src_stride_uv,
-NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
+                     uint8* dst_y, int dst_stride_y,
-                 const uint8* src_uv, int src_stride_uv,
+                     uint8* dst_u, int dst_stride_u,
-                 uint8* dst_y, int dst_stride_y,
+                     uint8* dst_v, int dst_stride_v,
-                 uint8* dst_u, int dst_stride_u,
+                     int width, int height,
-                 uint8* dst_v, int dst_stride_v,
+                     RotationMode mode);
                 int width, int height,
                 RotationMode mode);
 }  // namespace libyuv
--- a/source/format_conversion.cc
+++ b/source/format_conversion.cc
@ -27,396 +27,6 @@ namespace libyuv {
 #define FORCE_INLINE
 #endif
 // Colour of a single Bayer sample. The two green variants record which colour
 // shares the sample's row. RED and BLUE must remain the two smallest values:
 // IsRedBlue() tests "colour <= BLUE".
 enum {
  RED = 0,
  BLUE = 1,
  GREEN_BETWEEN_RED = 2,
  GREEN_BETWEEN_BLUE = 3,
 };
 // Where a pixel sits relative to the image border. GetPosition() builds a
 // value by adding one horizontal component (LEFT, RIGHT or CENTER) to one
 // vertical component (TOP, BOTTOM or CENTER).
 enum Position {
  LEFT = 0,
  RIGHT = 1,
  TOP = 2,
  BOTTOM = 4,
  CENTER = 6,
  // Due to the choice of the above values, every sum below is distinct: the
  // corners occupy 2..5, the edges 6..10 (9 is unused) and MIDDLE is 12. This
  // allows us to figure out the position type of a pixel with a single
  // addition and to separate corners from edges with one comparison against
  // LEFT_EDGE, rather than having to use a 3x3 nested switch statement.
  TOP_LEFT = TOP + LEFT,          // 2
  TOP_RIGHT = TOP + RIGHT,        // 3
  BOTTOM_LEFT = BOTTOM + LEFT,    // 4
  BOTTOM_RIGHT = BOTTOM + RIGHT,  // 5
  LEFT_EDGE = CENTER + LEFT,      // 6
  RIGHT_EDGE = CENTER + RIGHT,    // 7
  TOP_EDGE = TOP + CENTER,        // 8
  BOTTOM_EDGE = BOTTOM + CENTER,  // 10
  MIDDLE = CENTER + CENTER,       // 12
 };
 // Classifies pixel (x, y) of a width x height image as a corner, an edge or
 // the middle by summing its horizontal and vertical Position components.
 // Column/row 0 takes precedence over the last column/row when they coincide.
 static FORCE_INLINE Position GetPosition(int x, int y, int width, int height) {
  const int horizontal =
      (x == 0) ? LEFT : ((x == width - 1) ? RIGHT : CENTER);
  const int vertical =
      (y == 0) ? TOP : ((y == height - 1) ? BOTTOM : CENTER);
  return static_cast<Position>(horizontal + vertical);
 }
 // True when the sample colour is RED or BLUE, false for anything else
 // (i.e. the green variants).
 static FORCE_INLINE bool IsRedBlue(uint8 colour) {
  return colour == RED || colour == BLUE;
 }
 // Maps a Bayer FOURCC to the colour layout of one 2x2 cell, packed with the
 // FOURCC() macro. Callers in this file consume the result one byte at a time
 // starting from the low-order byte.
 // NOTE(review): this presumes FOURCC() places its first argument in the
 // low-order byte - confirm against the macro definition.
 static FORCE_INLINE uint32 FourCcToBayerPixelColourMap(uint32 fourcc) {
  // The colour map is a 4-byte array-as-uint32 containing the colours for the
  // four pixels in each 2x2 grid, in left-to-right and top-to-bottom order.
  switch (fourcc) {
    default:
      // Unknown format: trap in debug builds. There is no break, so when the
      // assert is compiled out control falls through to the RGGB layout.
      assert(false);
    case FOURCC_RGGB:
      return FOURCC(RED, GREEN_BETWEEN_RED, GREEN_BETWEEN_BLUE, BLUE);
    case FOURCC_BGGR:
      return FOURCC(BLUE, GREEN_BETWEEN_BLUE, GREEN_BETWEEN_RED, RED);
    case FOURCC_GRBG:
      return FOURCC(GREEN_BETWEEN_RED, RED, BLUE, GREEN_BETWEEN_BLUE);
    case FOURCC_GBRG:
      return FOURCC(GREEN_BETWEEN_BLUE, BLUE, RED, GREEN_BETWEEN_RED);
  }
 }
 // Converts one RGB triple to Y, U and V using 8-bit fixed-point weights with
 // rounding, then adds the offsets 16 (Y) and 128 (U, V).
 // Taken from http://en.wikipedia.org/wiki/YUV
 static FORCE_INLINE void RGBToYUV(uint8 r, uint8 g, uint8 b,
                                   uint8* y, uint8* u, uint8* v) {
  const int red = r;
  const int green = g;
  const int blue = b;
  const int luma_sum = 66 * red + 129 * green + 25 * blue + 128;
  const int u_sum = -38 * red - 74 * green + 112 * blue + 128;
  const int v_sum = 112 * red - 94 * green - 18 * blue + 128;
  *y = static_cast<uint8>((luma_sum >> 8) + 16);
  *u = static_cast<uint8>((u_sum >> 8) + 128);
  *v = static_cast<uint8>((v_sum >> 8) + 128);
 }
 // Reconstructs R, G and B for a pixel in one of the four image corners, where
 // only one horizontal, one vertical and one diagonal neighbour exist.
 //   r, g, b    - outputs.
 //   src        - points at the corner sample.
 //   src_stride - byte distance between consecutive source rows.
 //   pos        - which corner; any value other than the three named corners
 //                below is handled as BOTTOM_RIGHT.
 //   colour     - Bayer colour of the sample at src.
 static FORCE_INLINE void InterpolateBayerRGBCorner(uint8* r,
                                                    uint8* g,
                                                    uint8* b,
                                                    const uint8* src,
                                                    int src_stride,
                                                    Position pos,
                                                    uint8 colour) {
  // Offsets that step one pixel towards the inside of the image.
  const bool on_top_row = (pos == TOP_LEFT || pos == TOP_RIGHT);
  const bool on_left_column = (pos == TOP_LEFT || pos == BOTTOM_LEFT);
  const int row_step = on_top_row ? src_stride : -src_stride;
  const int column_step = on_left_column ? 1 : -1;

  if (!IsRedBlue(colour)) {
    // Green sample: green is known; the single row neighbour and the single
    // column neighbour supply red and blue.
    *g = src[0];
    const uint8 row_neighbour = src[column_step];
    const uint8 column_neighbour = src[row_step];
    const bool red_shares_row = (colour == GREEN_BETWEEN_RED);
    *r = red_shares_row ? row_neighbour : column_neighbour;
    *b = red_shares_row ? column_neighbour : row_neighbour;
    return;
  }

  // Red or blue sample: average the two adjacent greens and take the one
  // diagonal neighbour as the opposite colour.
  const uint8 own_value = src[0];
  *g = (src[column_step] + src[row_step]) / 2;
  const uint8 diagonal = src[row_step + column_step];
  if (colour == RED) {
    *r = own_value;
    *b = diagonal;
  } else {  // BLUE
    *b = own_value;
    *r = diagonal;
  }
 }
 // Reconstructs R, G and B for a pixel that lies on an image edge but not in a
 // corner, using the one neighbour towards the interior and the two
 // neighbours along the edge.
 //   r, g, b    - outputs.
 //   src        - points at the edge sample.
 //   src_stride - byte distance between consecutive source rows.
 //   pos        - which edge; any value other than TOP_EDGE, RIGHT_EDGE or
 //                BOTTOM_EDGE is handled as LEFT_EDGE.
 //   colour     - Bayer colour of the sample at src.
 static FORCE_INLINE void InterpolateBayerRGBEdge(uint8* r,
                                                  uint8* g,
                                                  uint8* b,
                                                  const uint8* src,
                                                  int src_stride,
                                                  Position pos,
                                                  uint8 colour) {
  // Compute the offsets to use for fetching the adjacent pixels.
  // Goes one pixel "in" to the image (i.e. towards the center)
  int inner;
  // Goes one pixel to the side (i.e. along the edge) in either the clockwise or
  // counter-clockwise direction, and its negative value goes in the other
  // direction.
  int side;
  switch (pos) {
    case TOP_EDGE:
      inner = src_stride;
      side = 1;
      break;
    case RIGHT_EDGE:
      inner = -1;
      side = src_stride;
      break;
    case BOTTOM_EDGE:
      inner = -src_stride;
      side = 1;
      break;
    case LEFT_EDGE:
    default:
      inner = 1;
      side = src_stride;
      break;
  }
  // Now interpolate.
  if (IsRedBlue(colour)) {
    uint8 current_pixel = src[0];
    // Average of the adjacent green pixels (there's only three).
    *g = (src[inner] + src[side] + src[-side]) / 3;
    // Average of the oppositely-coloured corner pixels (there's only two).
    uint8 corner_average = (src[inner + side] + src[inner - side]) / 2;
    if (colour == RED) {
      *r = current_pixel;
      *b = corner_average;
    } else {  // i.e., BLUE
      *b = current_pixel;
      *r = corner_average;
    }
  } else {  // i.e., GREEN_BETWEEN_*
    *g = src[0];
    // Average of the adjacent side-ways pixels (there's only two).
    uint8 side_average = (src[side] + src[-side]) / 2;
    // Average of the adjacent inner-ways pixels (there's only one).
    uint8 inner_pixel = src[inner];
    // On the top/bottom edges the side-ways neighbours share this pixel's row;
    // on the left/right edges they share its column (the "T" shape formed by
    // the pixels is transposed). A GREEN_BETWEEN_RED sample has red in its row
    // and blue in its column, GREEN_BETWEEN_BLUE the reverse. Red therefore
    // runs along the edge exactly when "red is in the row" and "the edge is
    // horizontal" are both true or both false. The previous test
    // (colour == GREEN_BETWEEN_RED && side == 1) swapped red and blue for
    // GREEN_BETWEEN_BLUE samples on the left/right edges.
    const bool horizontal_edge = (pos == TOP_EDGE || pos == BOTTOM_EDGE);
    const bool red_along_edge =
        (colour == GREEN_BETWEEN_RED) == horizontal_edge;
    if (red_along_edge) {
      *r = side_average;
      *b = inner_pixel;
    } else {
      *b = side_average;
      *r = inner_pixel;
    }
  }
 }
 // Reconstructs R, G and B for an interior pixel, where all eight neighbours
 // exist. Forced inline because this is the path taken for nearly every pixel.
 //   r, g, b    - outputs.
 //   src        - points at the sample; src[-1], src[1] and the rows at
 //                +/- src_stride are read.
 //   colour     - Bayer colour of the sample at src.
 static FORCE_INLINE void InterpolateBayerRGBCenter(uint8* r,
                                                    uint8* g,
                                                    uint8* b,
                                                    const uint8* src,
                                                    int src_stride,
                                                    uint8 colour) {
  const uint8* above = src - src_stride;
  const uint8* below = src + src_stride;

  if (!IsRedBlue(colour)) {
    // Green sample: green is known; red and blue come from the two row
    // neighbours and the two column neighbours.
    *g = src[0];
    const uint8 row_pair = (src[1] + src[-1]) / 2;
    const uint8 column_pair = (below[0] + above[0]) / 2;
    const bool red_shares_row = (colour == GREEN_BETWEEN_RED);
    *r = red_shares_row ? row_pair : column_pair;
    *b = red_shares_row ? column_pair : row_pair;
    return;
  }

  // Red or blue sample.
  const uint8 own_value = src[0];
  // Green is the mean of the four edge-adjacent greens.
  // NOTE(tschmelcher): The material at
  // http://www.siliconimaging.com/RGB%20Bayer.htm discusses a way to improve
  // quality here by using only two of the green pixels based on the
  // correlation to the nearby red/blue pixels, but that is slower and would
  // result in more edge cases.
  *g = (src[1] + src[-1] + below[0] + above[0]) / 4;
  // The opposite colour is the mean of the four diagonal neighbours.
  const uint8 diagonal_mean =
      (below[1] + below[-1] + above[1] + above[-1]) / 4;
  if (colour == RED) {
    *r = own_value;
    *b = diagonal_mean;
  } else {  // BLUE
    *b = own_value;
    *r = diagonal_mean;
  }
 }
 // Converts any Bayer RGB format to ARGB.
 //   src, src_stride - Bayer samples, one byte per pixel, and the row pitch.
 //   src_fourcc      - Bayer layout, see FourCcToBayerPixelColourMap().
 //   dst, dst_stride - output, four bytes per pixel stored B, G, R, 255.
 //   width, height   - image size in pixels; both are asserted to be even.
 // Always returns 0.
 int BayerRGBToARGB(const uint8* src, int src_stride, uint32 src_fourcc,
                    uint8* dst, int dst_stride,
                    int width, int height) {
  assert(width % 2 == 0);
  assert(height % 2 == 0);
  uint32 colour_map = FourCcToBayerPixelColourMap(src_fourcc);
  // The inner loops advance src by width bytes and dst by width * 4 bytes per
  // pair of rows; these increments then move both to the next pair of rows.
  int src_row_inc = src_stride * 2 - width;
  int dst_row_inc = dst_stride * 2 - width * 4;
  // Iterate over the 2x2 grids.
  for (int y1 = 0; y1 < height; y1 += 2) {
    for (int x1 = 0; x1 < width; x1 += 2) {
      uint32 colours = colour_map;
      // Iterate over the four pixels within them.
      for (int y2 = 0; y2 < 2; ++y2) {
        for (int x2 = 0; x2 < 2; ++x2) {
          uint8 r, g, b;
          // The low-order byte of the colour map is the current colour.
          uint8 current_colour = static_cast<uint8>(colours);
          colours >>= 8;
          Position pos = GetPosition(x1 + x2, y1 + y2, width, height);
          // src and dst point at the top-left pixel of the current 2x2 grid.
          const uint8* src_pixel = &src[y2 * src_stride + x2];
          uint8* dst_pixel = &dst[y2 * dst_stride + x2 * 4];
          // Convert from Bayer RGB to regular RGB.
          if (pos == MIDDLE) {
            // 99% of the image is the middle.
            InterpolateBayerRGBCenter(&r, &g, &b,
                                      src_pixel, src_stride,
                                      current_colour);
          } else if (pos >= LEFT_EDGE) {
            // Next most frequent is edges.
            InterpolateBayerRGBEdge(&r, &g, &b,
                                    src_pixel, src_stride, pos,
                                    current_colour);
          } else {
            // Last is the corners. There are only 4.
            InterpolateBayerRGBCorner(&r, &g, &b,
                                      src_pixel, src_stride, pos,
                                      current_colour);
          }
          // Store ARGB
          dst_pixel[0] = b;
          dst_pixel[1] = g;
          dst_pixel[2] = r;
          dst_pixel[3] = 255u;
        }
      }
      // Step to the next 2x2 grid in this pair of rows.
      src += 2;
      dst += 2 * 4;
    }
    src += src_row_inc;
    dst += dst_row_inc;
  }
  return 0;
 }
 // Converts any Bayer RGB format to I420.
 //   src, src_stride - Bayer samples, one byte per pixel, and the row pitch.
 //   src_fourcc      - Bayer layout, see FourCcToBayerPixelColourMap().
 //   y, u, v         - destination planes with their row pitches; one U and
 //                     one V sample is written per 2x2 block of pixels.
 //   width, height   - image size in pixels; both are asserted to be even.
 // Always returns 0.
 int BayerRGBToI420(const uint8* src, int src_stride, uint32 src_fourcc,
                    uint8* y, int y_stride,
                    uint8* u, int u_stride,
                    uint8* v, int v_stride,
                    int width, int height) {
  assert(width % 2 == 0);
  assert(height % 2 == 0);
  uint32 colour_map = FourCcToBayerPixelColourMap(src_fourcc);
  // Per pair of rows the inner loops advance src and y by width bytes and
  // u and v by width / 2 bytes; these increments finish the move to the next
  // pair of luma rows and the next chroma row.
  int src_row_inc = src_stride * 2 - width;
  int y_row_inc = y_stride * 2 - width;
  int u_row_inc = u_stride - width / 2;
  int v_row_inc = v_stride - width / 2;
  // Iterate over the 2x2 grids.
  for (int y1 = 0; y1 < height; y1 += 2) {
    for (int x1 = 0; x1 < width; x1 += 2) {
      uint32 colours = colour_map;
      // Sums of the four per-pixel U and V values in this grid.
      int total_u = 0;
      int total_v = 0;
      // Iterate over the four pixels within them.
      for (int y2 = 0; y2 < 2; ++y2) {
        for (int x2 = 0; x2 < 2; ++x2) {
          uint8 r, g, b;
          // The low-order byte of the colour map is the current colour.
          uint8 current_colour = static_cast<uint8>(colours);
          colours >>= 8;
          Position pos = GetPosition(x1 + x2, y1 + y2, width, height);
          const uint8* src_pixel = &src[y2 * src_stride + x2];
          uint8* y_pixel = &y[y2 * y_stride + x2];
          // Convert from Bayer RGB to regular RGB.
          if (pos == MIDDLE) {
            // 99% of the image is the middle.
            InterpolateBayerRGBCenter(&r, &g, &b,
                                      src_pixel, src_stride,
                                      current_colour);
          } else if (pos >= LEFT_EDGE) {
            // Next most frequent is edges.
            InterpolateBayerRGBEdge(&r, &g, &b,
                                    src_pixel, src_stride, pos,
                                    current_colour);
          } else {
            // Last is the corners. There are only 4.
            InterpolateBayerRGBCorner(&r, &g, &b,
                                      src_pixel, src_stride, pos,
                                      current_colour);
          }
          // Convert from RGB to YUV.
          // Y is stored directly; U and V are accumulated for averaging.
          uint8 tmp_u, tmp_v;
          RGBToYUV(r, g, b, y_pixel, &tmp_u, &tmp_v);
          total_u += tmp_u;
          total_v += tmp_v;
        }
      }
      src += 2;
      y += 2;
      // Chroma for the grid is the mean of its four pixels.
      *u = total_u / 4;
      *v = total_v / 4;
      ++u;
      ++v;
    }
    src += src_row_inc;
    y += y_row_inc;
    u += u_row_inc;
    v += v_row_inc;
  }
  return 0;
 }
 // Note: to do this with Neon vld4.8 would load ARGB values into 4 registers
 // and vst would select which 2 components to write.  The low level would need
 // to be ARGBToBG, ARGBToGB, ARGBToRG, ARGBToGR
@ -429,15 +39,15 @@ static void ARGBToBayerRow_SSSE3(const uint8* src_argb,
  __asm {
    mov        eax, [esp + 4]    // src_argb
    mov        edx, [esp + 8]    // dst_bayer
-    movd       xmm0, [esp + 12]  // selector
+    movd       xmm7, [esp + 12]  // selector
    mov        ecx, [esp + 16]   // pix
-    pshufd     xmm0, xmm0, 0
+    pshufd     xmm7, xmm7, 0
  wloop:
-    movdqa     xmm1, [eax]
+    movdqa     xmm0, [eax]
    lea        eax, [eax + 16]
-    pshufb     xmm1, xmm0
+    pshufb     xmm0, xmm7
-    movd       [edx], xmm1
+    movd       [edx], xmm0
    lea        edx, [edx + 4]
    sub        ecx, 4
    ja         wloop
@ -445,37 +55,30 @@ static void ARGBToBayerRow_SSSE3(const uint8* src_argb,
  }
 }
-#elif defined(__i386__) && !defined(COVERAGE_ENABLED) && \
+#elif (defined(__x86_64__) || defined(__i386__)) && \
-    !TARGET_IPHONE_SIMULATOR
+    !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
 #define HAS_ARGBTOBAYERROW_SSSE3
-extern "C" void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
+static void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
-                                     uint32 selector, int pix);
+                                 uint32 selector, int pix) {
-  asm(
+  asm volatile(
-    ".text\n"
+    "movd   %3,%%xmm7\n"
-#if defined(OSX)
+    "pshufd $0x0,%%xmm7,%%xmm7\n"
    ".globl _ARGBToBayerRow_SSSE3\n"
 "_ARGBToBayerRow_SSSE3:\n"
 #else
    ".global ARGBToBayerRow_SSSE3\n"
 "ARGBToBayerRow_SSSE3:\n"
 #endif
    "mov    0x4(%esp),%eax\n"
    "mov    0x8(%esp),%edx\n"
    "movd   0xc(%esp),%xmm0\n"
    "mov    0x10(%esp),%ecx\n"
    "pshufd $0x0,%xmm0,%xmm0\n"
 "1:"
-    "movdqa (%eax),%xmm1\n"
+    "movdqa (%0),%%xmm0\n"
-    "lea    0x10(%eax),%eax\n"
+    "lea    0x10(%0),%0\n"
-    "pshufb %xmm0,%xmm1\n"
+    "pshufb %%xmm7,%%xmm0\n"
-    "movd   %xmm1,(%edx)\n"
+    "movd   %%xmm0,(%1)\n"
-    "lea    0x4(%edx),%edx\n"
+    "lea    0x4(%1),%1\n"
-    "sub    $0x4,%ecx\n"
+    "sub    $0x4,%2\n"
    "ja     1b\n"
-    "ret\n"
+  : "+r"(src_argb),  // %0
    "+r"(dst_bayer), // %1
    "+r"(pix)        // %2
  : "r"(selector)    // %3
  : "memory"
 );
 }
 #endif
 static void ARGBToBayerRow_C(const uint8* src_argb,
@ -483,12 +86,15 @@ static void ARGBToBayerRow_C(const uint8* src_argb,
  int index0 = selector & 0xff;
  int index1 = (selector >> 8) & 0xff;
  // Copy a row of Bayer.
-  for (int x = 0; x < pix; x += 2) {
+  for (int x = 0; x < (pix - 1); x += 2) {
    dst_bayer[0] = src_argb[index0];
    dst_bayer[1] = src_argb[index1];
    src_argb += 8;
    dst_bayer += 2;
  }
  if (pix & 1) {
    dst_bayer[0] = src_argb[index0];
  }
 }
 // generate a selector mask useful for pshufb
@ -504,7 +110,11 @@ int ARGBToBayerRGB(const uint8* src_rgb, int src_stride_rgb,
                   uint8* dst_bayer, int dst_stride_bayer,
                   uint32 dst_fourcc_bayer,
                   int width, int height) {
-  assert(width % 2 == 0);
+  if (height < 0) {
    height = -height;
    src_rgb = src_rgb + (height - 1) * src_stride_rgb;
    src_stride_rgb = -src_stride_rgb;
  }
  void (*ARGBToBayerRow)(const uint8* src_argb,
                         uint8* dst_bayer, uint32 selector, int pix);
 #if defined(HAS_ARGBTOBAYERROW_SSSE3)
@ -556,4 +166,277 @@ int ARGBToBayerRGB(const uint8* src_rgb, int src_stride_rgb,
  return 0;
 }
 #define AVG(a,b) (((a) + (b)) >> 1)
 // Expands one Bayer row that starts B, G into ARGB (bytes stored B, G, R, A
 // with A = 255). Red is taken from the neighbouring row located at
 // src_bayer0 + src_stride_bayer, whose odd samples are read as red.
 //   src_bayer0       - start of the row being converted.
 //   src_stride_bayer - signed byte offset to the neighbouring row.
 //   dst_rgb          - receives exactly pix ARGB pixels.
 //   pix              - number of pixels in the row.
 // Missing colours are averaged with the previously seen sample of that
 // colour; the last pixel pair has no right-hand neighbour and reuses its own
 // samples. An odd pix emits only the first pixel of the final pair; the
 // unconditional second write used to run one pixel past the destination row.
 // NOTE(review): index 1 of both source rows is still read for the final
 // pair, which for odd pix is one sample past the row - confirm callers
 // provide readable memory there.
 static void BayerRowBG(const uint8* src_bayer0, int src_stride_bayer,
                        uint8* dst_rgb, int pix) {
  const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
  uint8 g = src_bayer0[1];
  uint8 r = src_bayer1[1];
  for (int x = 0; x < (pix - 2); x += 2) {
    dst_rgb[0] = src_bayer0[0];
    dst_rgb[1] = AVG(g, src_bayer0[1]);
    dst_rgb[2] = AVG(r, src_bayer1[1]);
    dst_rgb[3] = 255U;
    dst_rgb[4] = AVG(src_bayer0[0], src_bayer0[2]);
    dst_rgb[5] = src_bayer0[1];
    dst_rgb[6] = src_bayer1[1];
    dst_rgb[7] = 255U;
    g = src_bayer0[1];
    r = src_bayer1[1];
    src_bayer0 += 2;
    src_bayer1 += 2;
    dst_rgb += 8;
  }
  dst_rgb[0] = src_bayer0[0];
  dst_rgb[1] = AVG(g, src_bayer0[1]);
  dst_rgb[2] = AVG(r, src_bayer1[1]);
  dst_rgb[3] = 255U;
  if (!(pix & 1)) {
    dst_rgb[4] = src_bayer0[0];
    dst_rgb[5] = src_bayer0[1];
    dst_rgb[6] = src_bayer1[1];
    dst_rgb[7] = 255U;
  }
 }
 // Expands one Bayer row that starts R, G into ARGB (bytes stored B, G, R, A
 // with A = 255). Blue is taken from the neighbouring row located at
 // src_bayer0 + src_stride_bayer, whose odd samples are read as blue.
 //   src_bayer0       - start of the row being converted.
 //   src_stride_bayer - signed byte offset to the neighbouring row.
 //   dst_rgb          - receives exactly pix ARGB pixels.
 //   pix              - number of pixels in the row.
 // Missing colours are averaged with the previously seen sample of that
 // colour; the last pixel pair has no right-hand neighbour and reuses its own
 // samples. An odd pix emits only the first pixel of the final pair; the
 // unconditional second write used to run one pixel past the destination row.
 // NOTE(review): index 1 of both source rows is still read for the final
 // pair, which for odd pix is one sample past the row - confirm callers
 // provide readable memory there.
 static void BayerRowRG(const uint8* src_bayer0, int src_stride_bayer,
                        uint8* dst_rgb, int pix) {
  const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
  uint8 g = src_bayer0[1];
  uint8 b = src_bayer1[1];
  for (int x = 0; x < (pix - 2); x += 2) {
    dst_rgb[0] = AVG(b, src_bayer1[1]);
    dst_rgb[1] = AVG(g, src_bayer0[1]);
    dst_rgb[2] = src_bayer0[0];
    dst_rgb[3] = 255U;
    dst_rgb[4] = src_bayer1[1];
    dst_rgb[5] = src_bayer0[1];
    dst_rgb[6] = AVG(src_bayer0[0], src_bayer0[2]);
    dst_rgb[7] = 255U;
    g = src_bayer0[1];
    b = src_bayer1[1];
    src_bayer0 += 2;
    src_bayer1 += 2;
    dst_rgb += 8;
  }
  dst_rgb[0] = AVG(b, src_bayer1[1]);
  dst_rgb[1] = AVG(g, src_bayer0[1]);
  dst_rgb[2] = src_bayer0[0];
  dst_rgb[3] = 255U;
  if (!(pix & 1)) {
    dst_rgb[4] = src_bayer1[1];
    dst_rgb[5] = src_bayer0[1];
    dst_rgb[6] = src_bayer0[0];
    dst_rgb[7] = 255U;
  }
 }
 // Expands one Bayer row that starts G, B into ARGB (bytes stored B, G, R, A
 // with A = 255). Red is taken from the neighbouring row located at
 // src_bayer0 + src_stride_bayer, whose even samples are read as red.
 //   src_bayer0       - start of the row being converted.
 //   src_stride_bayer - signed byte offset to the neighbouring row.
 //   dst_rgb          - receives exactly pix ARGB pixels.
 //   pix              - number of pixels in the row.
 // Missing colours are averaged with the neighbouring sample of that colour;
 // the last pixel pair has no right-hand neighbour and reuses its own samples.
 // An odd pix emits only the first pixel of the final pair; the unconditional
 // second write used to run one pixel past the destination row.
 // NOTE(review): src_bayer0[1] is still read for the final pair, which for odd
 // pix is one sample past the row - confirm callers provide readable memory
 // there.
 static void BayerRowGB(const uint8* src_bayer0, int src_stride_bayer,
                        uint8* dst_rgb, int pix) {
  const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
  uint8 b = src_bayer0[1];
  for (int x = 0; x < (pix - 2); x += 2) {
    dst_rgb[0] = AVG(b, src_bayer0[1]);
    dst_rgb[1] = src_bayer0[0];
    dst_rgb[2] = src_bayer1[0];
    dst_rgb[3] = 255U;
    dst_rgb[4] = src_bayer0[1];
    dst_rgb[5] = AVG(src_bayer0[0], src_bayer0[2]);
    dst_rgb[6] = AVG(src_bayer1[0], src_bayer1[2]);
    dst_rgb[7] = 255U;
    b = src_bayer0[1];
    src_bayer0 += 2;
    src_bayer1 += 2;
    dst_rgb += 8;
  }
  dst_rgb[0] = AVG(b, src_bayer0[1]);
  dst_rgb[1] = src_bayer0[0];
  dst_rgb[2] = src_bayer1[0];
  dst_rgb[3] = 255U;
  if (!(pix & 1)) {
    dst_rgb[4] = src_bayer0[1];
    dst_rgb[5] = src_bayer0[0];
    dst_rgb[6] = src_bayer1[0];
    dst_rgb[7] = 255U;
  }
 }
 // Expands one Bayer row that starts G, R into ARGB (bytes stored B, G, R, A
 // with A = 255). Blue is taken from the neighbouring row located at
 // src_bayer0 + src_stride_bayer, whose even samples are read as blue.
 //   src_bayer0       - start of the row being converted.
 //   src_stride_bayer - signed byte offset to the neighbouring row.
 //   dst_rgb          - receives exactly pix ARGB pixels.
 //   pix              - number of pixels in the row.
 // Missing colours are averaged with the neighbouring sample of that colour;
 // the last pixel pair has no right-hand neighbour and reuses its own samples.
 // An odd pix emits only the first pixel of the final pair; the unconditional
 // second write used to run one pixel past the destination row.
 // NOTE(review): src_bayer0[1] is still read for the final pair, which for odd
 // pix is one sample past the row - confirm callers provide readable memory
 // there.
 static void BayerRowGR(const uint8* src_bayer0, int src_stride_bayer,
                        uint8* dst_rgb, int pix) {
  const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
  uint8 r = src_bayer0[1];
  for (int x = 0; x < (pix - 2); x += 2) {
    dst_rgb[0] = src_bayer1[0];
    dst_rgb[1] = src_bayer0[0];
    dst_rgb[2] = AVG(r, src_bayer0[1]);
    dst_rgb[3] = 255U;
    dst_rgb[4] = AVG(src_bayer1[0], src_bayer1[2]);
    dst_rgb[5] = AVG(src_bayer0[0], src_bayer0[2]);
    dst_rgb[6] = src_bayer0[1];
    dst_rgb[7] = 255U;
    r = src_bayer0[1];
    src_bayer0 += 2;
    src_bayer1 += 2;
    dst_rgb += 8;
  }
  dst_rgb[0] = src_bayer1[0];
  dst_rgb[1] = src_bayer0[0];
  dst_rgb[2] = AVG(r, src_bayer0[1]);
  dst_rgb[3] = 255U;
  if (!(pix & 1)) {
    dst_rgb[4] = src_bayer1[0];
    dst_rgb[5] = src_bayer0[0];
    dst_rgb[6] = src_bayer0[1];
    dst_rgb[7] = 255U;
  }
 }
 // Converts any Bayer RGB format to ARGB.
 int BayerRGBToARGB(const uint8* src_bayer, int src_stride_bayer,
                   uint32 src_fourcc_bayer,
                   uint8* dst_rgb, int dst_stride_rgb,
                   int width, int height) {
  if (height < 0) {
    height = -height;
    dst_rgb = dst_rgb + (height - 1) * dst_stride_rgb;
    dst_stride_rgb = -dst_stride_rgb;
  }
  void (*BayerRow0)(const uint8* src_bayer, int src_stride_bayer,
                    uint8* dst_rgb, int pix);
  void (*BayerRow1)(const uint8* src_bayer, int src_stride_bayer,
                    uint8* dst_rgb, int pix);
  switch (src_fourcc_bayer) {
    default:
      assert(false);
    case FOURCC_RGGB:
      BayerRow0 = BayerRowRG;
      BayerRow1 = BayerRowGB;
      break;
    case FOURCC_BGGR:
      BayerRow0 = BayerRowBG;
      BayerRow1 = BayerRowGR;
      break;
    case FOURCC_GRBG:
      BayerRow0 = BayerRowGR;
      BayerRow1 = BayerRowBG;
      break;
    case FOURCC_GBRG:
      BayerRow0 = BayerRowGB;
      BayerRow1 = BayerRowRG;
      break;
  }
  for (int y = 0; y < (height - 1); y += 2) {
    BayerRow0(src_bayer, src_stride_bayer, dst_rgb, width);
    BayerRow1(src_bayer + src_stride_bayer, -src_stride_bayer,
        dst_rgb + dst_stride_rgb, width);
    src_bayer += src_stride_bayer * 2;
    dst_rgb += dst_stride_rgb * 2;
  }
  if (height & 1) {
    BayerRow0(src_bayer, -src_stride_bayer, dst_rgb, width);
  }
  return 0;
 }
 // RGB to Y, U and V conversions using 8-bit fixed-point weights with rounding
 // (+128 before the shift) followed by the 16 / 128 offsets.
 // Taken from http://en.wikipedia.org/wiki/YUV
 static FORCE_INLINE int RGBToY(uint8 r, uint8 g, uint8 b) {
  const int weighted = 66 * r + 129 * g + 25 * b;
  return ((weighted + 128) >> 8) + 16;
 }
 static FORCE_INLINE int RGBToU(uint8 r, uint8 g, uint8 b) {
  const int weighted = -38 * r - 74 * g + 112 * b;
  return ((weighted + 128) >> 8) + 128;
 }
 static FORCE_INLINE int RGBToV(uint8 r, uint8 g, uint8 b) {
  const int weighted = 112 * r - 94 * g - 18 * b;
  return ((weighted + 128) >> 8) + 128;
 }
 static void ARGBtoYRow(const uint8* src_argb0,
                       uint8* dst_y, int width) {
  for (int x = 0; x < width; ++x) {
    dst_y[0] = RGBToY(src_argb0[2], src_argb0[1], src_argb0[0]);
    src_argb0 += 4;
    dst_y += 1;
  }
 }
 static void ARGBtoUVRow(const uint8* src_argb0, int src_stride_argb,
                        uint8* dst_u,
                        uint8* dst_v,
                        int width) {
  const uint8* src_argb1 = src_argb0 + src_stride_argb;
  for (int x = 0; x < width - 1; x += 2) {
    uint8 ab = (src_argb0[0] + src_argb0[4] + src_argb1[0] + src_argb1[4]) >> 2;
    uint8 ag = (src_argb0[1] + src_argb0[5] + src_argb1[1] + src_argb1[5]) >> 2;
    uint8 ar = (src_argb0[2] + src_argb0[6] + src_argb1[2] + src_argb1[6]) >> 2;
    dst_u[0] = RGBToU(ar, ag, ab);
    dst_v[0] = RGBToV(ar, ag, ab);
    src_argb0 += 8;
    src_argb1 += 8;
    dst_u += 1;
    dst_v += 1;
  }
 }
 // Converts any Bayer RGB format to I420.
 // Each pair of Bayer rows is first expanded to ARGB in a fixed-size stack
 // buffer and then converted to Y, U and V.
 //   src_bayer, src_stride_bayer - Bayer samples and their row pitch.
 //   src_fourcc_bayer            - Bayer layout; selects the row converters.
 //   dst_y, dst_u, dst_v         - destination planes with their row pitches.
 //   width, height               - size in pixels; a negative height writes
 //                                 the destination bottom-up.
 // Returns 0 on success, or -1 when width is not positive or is too large for
 // the intermediate ARGB row buffer (more than kMaxStride / 4 pixels).
 int BayerRGBToI420(const uint8* src_bayer, int src_stride_bayer,
                    uint32 src_fourcc_bayer,
                    uint8* dst_y, int dst_stride_y,
                    uint8* dst_u, int dst_stride_u,
                    uint8* dst_v, int dst_stride_v,
                    int width, int height) {
  // Bytes reserved per intermediate ARGB row. Parenthesised so the macro
  // expands safely inside larger expressions.
 #define kMaxStride (2048 * 4)
  // Wider rows would overflow the stack buffer below.
  if (width <= 0 || width > kMaxStride / 4) {
    return -1;
  }
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    int halfheight = (height + 1) >> 1;
    dst_y = dst_y + (height - 1) * dst_stride_y;
    dst_u = dst_u + (halfheight - 1) * dst_stride_u;
    dst_v = dst_v + (halfheight - 1) * dst_stride_v;
    dst_stride_y = -dst_stride_y;
    dst_stride_u = -dst_stride_u;
    dst_stride_v = -dst_stride_v;
  }
  void (*BayerRow0)(const uint8* src_bayer, int src_stride_bayer,
                    uint8* dst_rgb, int pix);
  void (*BayerRow1)(const uint8* src_bayer, int src_stride_bayer,
                    uint8* dst_rgb, int pix);
  switch (src_fourcc_bayer) {
    default:
      // Unknown layout: trap in debug builds, otherwise fall through to RGGB.
      assert(false);
    case FOURCC_RGGB:
      BayerRow0 = BayerRowRG;
      BayerRow1 = BayerRowGB;
      break;
    case FOURCC_BGGR:
      BayerRow0 = BayerRowBG;
      BayerRow1 = BayerRowGR;
      break;
    case FOURCC_GRBG:
      BayerRow0 = BayerRowGR;
      BayerRow1 = BayerRowBG;
      break;
    case FOURCC_GBRG:
      BayerRow0 = BayerRowGB;
      BayerRow1 = BayerRowRG;
      break;
  }
  // Two ARGB rows, kMaxStride bytes apart.
  uint8 row[kMaxStride * 2];
  for (int y = 0; y < (height - 1); y += 2) {
    BayerRow0(src_bayer, src_stride_bayer, row, width);
    BayerRow1(src_bayer + src_stride_bayer, -src_stride_bayer,
              row + kMaxStride, width);
    ARGBtoYRow(row, dst_y, width);
    ARGBtoYRow(row + kMaxStride, dst_y + dst_stride_y, width);
    ARGBtoUVRow(row, kMaxStride, dst_u, dst_v, width);
    src_bayer += src_stride_bayer * 2;
    dst_y += dst_stride_y * 2;
    dst_u += dst_stride_u;
    dst_v += dst_stride_v;
  }
  if (height & 1) {
    // The unpaired last row has no row below it, so borrow the missing colour
    // from the row above, as BayerRGBToARGB does. The previous positive
    // stride read one row past the end of the source image.
    BayerRow0(src_bayer, -src_stride_bayer, row, width);
    ARGBtoYRow(row, dst_y, width);
    // Stride 0: chroma is derived from this single row.
    ARGBtoUVRow(row, 0, dst_u, dst_v, width);
  }
  return 0;
 }
 }  // namespace libyuv
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@ -49,28 +49,33 @@ static void SplitUV_NEON(const uint8* src_uv,
 #endif
 // Shuffle table for converting ABGR to ARGB.
-extern "C" TALIGN16(const uint8, kShuffleMaskABGRToARGB[16]) =
+extern "C" TALIGN16(const uint8, kShuffleMaskABGRToARGB[16]) = {
-  { 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u };
+  2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
 };
 // Shuffle table for converting BGRA to ARGB.
-extern "C" TALIGN16(const uint8, kShuffleMaskBGRAToARGB[16]) =
+extern "C" TALIGN16(const uint8, kShuffleMaskBGRAToARGB[16]) = {
-  { 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u };
+  3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
 };
 // Shuffle table for converting BG24 to ARGB.
-extern "C" TALIGN16(const uint8, kShuffleMaskBG24ToARGB[16]) =
+extern "C" TALIGN16(const uint8, kShuffleMaskBG24ToARGB[16]) = {
-  { 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u };
+  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
 };
 // Shuffle table for converting RAW to ARGB.
-extern "C" TALIGN16(const uint8, kShuffleMaskRAWToARGB[16]) =
+extern "C" TALIGN16(const uint8, kShuffleMaskRAWToARGB[16]) = {
-  { 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u };
+  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
 };
 // Constant multiplication table for converting ARGB to I400.
-extern "C" TALIGN16(const uint8, kMultiplyMaskARGBToI400[16]) =
+extern "C" TALIGN16(const uint8, kMultiplyMaskARGBToI400[16]) = {
-  { 13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u,
+  13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u
-    13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u };
+};
-extern "C" TALIGN16(const uint8, kMultiplyMaskARGBToI400_2[16]) =
+extern "C" TALIGN16(const uint8, kMultiplyMaskARGBToI400_2[16]) = {
-  { 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u };
+  1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u
 };
 #if defined(WIN32) && !defined(COVERAGE_ENABLED)
 #define HAS_SPLITUV_SSE2
@ -169,28 +174,7 @@ static void I420CopyPlane(const uint8* src_y, int src_stride_y,
  }
 }
-static void I420CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1,
+// Copy I420 with optional flipping
                           uint8* dst, int dst_stride,
                           int width, int height) {
  // Copy plane
  for (int y = 0; y < height; y += 2) {
    memcpy(dst, src, width);
    src += src_stride_0;
    dst += dst_stride;
    memcpy(dst, src, width);
    src += src_stride_1;
    dst += dst_stride;
  }
 }
 // TODO(fbarchard): For biplanar formats (ie NV21), the Y plane is the same
 // as I420, and only the chroma plane varies. Copy the Y plane by reference,
 // and just convert the UV.  This method can be used for NV21, NV12, I420,
 // I422, M422.  8 of the 12 bits is Y, so this would copy 3 times less data,
 // which is approximately how much faster it would be.
 // Helper function to copy yuv data without scaling.  Used
 // by our jpeg conversion callbacks to incrementally fill a yuv image.
 int I420Copy(const uint8* src_y, int src_stride_y,
             const uint8* src_u, int src_stride_u,
             const uint8* src_v, int src_stride_v,
@ -198,6 +182,12 @@ int I420Copy(const uint8* src_y, int src_stride_y,
             uint8* dst_u, int dst_stride_u,
             uint8* dst_v, int dst_stride_v,
             int width, int height) {
  if (!src_y || !src_u || !src_v ||
      !dst_y || !dst_u || !dst_v ||
      width <= 0 || height == 0) {
    return -1;
  }
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
@ -218,6 +208,137 @@ int I420Copy(const uint8* src_y, int src_stride_y,
  return 0;
 }
 // SetRow32 writes 'count' bytes using a 32 bit value repeated.
 #if defined(__ARM_NEON__) && !defined(COVERAGE_ENABLED)
 #define HAS_SETROW_NEON
 // Fills memory at dst with the 32-bit pattern v32 using NEON.
 //   dst   - first byte to write; advanced by 16 after every store.
 //   v32   - 32-bit value duplicated into all four lanes of q0.
 //   count - number of bytes to fill.  One 16-byte store is issued before
 //           count is tested and count drops by 16 per iteration, so it
 //           should be a positive multiple of 16.
 static void SetRow32_NEON(uint8* dst, uint32 v32, int count) {
  __asm__ volatile
  (
    // VDUP takes a plain destination register, not a {register list}.
    "vdup.u32   q0, %2            \n"  // duplicate 4 ints
    "1:\n"
    "vst1.u32   {q0}, [%0]!       \n"  // store
    "subs       %1, %1, #16       \n"  // 16 processed per loop
    "bhi        1b                \n"
  : "+r"(dst),  // %0
    "+r"(count) // %1
  : "r"(v32)    // %2
  : "q0", "memory", "cc"  // subs updates the condition flags
  );
 }
 #elif defined(WIN32) && !defined(COVERAGE_ENABLED)
 #define HAS_SETROW_SSE2
 // Fills memory at dst with the 32-bit pattern v32 using SSE2 (MSVC inline
 // assembly, naked function: arguments are read straight from the stack).
 //   dst   - first byte to write; stored with movdqa, so it must be 16-byte
 //           aligned.
 //   v32   - 32-bit value broadcast to all four dwords of xmm7.
 //   count - number of bytes to fill.  One 16-byte store is issued before
 //           count is tested and count drops by 16 per iteration, so it
 //           should be a positive multiple of 16.
 __declspec(naked)
 static void SetRow32_SSE2(uint8* dst, uint32 v32, int count) {
  __asm {
    mov        eax, [esp + 4]    // dst
    movd       xmm7, [esp + 8]   // v32
    mov        ecx, [esp + 12]   // count
    // Broadcast the low dword of xmm7 to all four lanes.
    pshufd     xmm7, xmm7, 0
  wloop:
    movdqa     [eax], xmm7
    lea        eax, [eax + 16]
    sub        ecx, 16
    // Loop while the remaining count is still above zero (unsigned compare).
    ja         wloop
    ret
  }
 }
 #elif (defined(__x86_64__) || defined(__i386__)) && \
    !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
 #define HAS_SETROW_SSE2
 // Fills memory at dst with the 32-bit pattern v32 using SSE2 (GCC inline
 // assembly).
 //   dst   - first byte to write; stored with movdqa, so it must be 16-byte
 //           aligned.
 //   v32   - 32-bit value broadcast to all four dwords of xmm7.
 //   count - number of bytes to fill.  One 16-byte store is issued before
 //           count is tested and count drops by 16 per iteration, so it
 //           should be a positive multiple of 16.
 static void SetRow32_SSE2(uint8* dst, uint32 v32, int count) {
  asm volatile(
  "movd       %2, %%xmm7\n"
  "pshufd     $0x0,%%xmm7,%%xmm7\n"
 "1:"
  "movdqa     %%xmm7,(%0)\n"
  "lea        0x10(%0),%0\n"
  "sub        $0x10,%1\n"
  "ja         1b\n"
  : "+r"(dst),  // %0
    "+r"(count) // %1
  : "r"(v32)    // %2
  // xmm7 and the flags are overwritten; tell the compiler so it does not keep
  // live values in them across this statement.  The register name is only
  // accepted when SSE2 code generation is enabled.
  : "memory", "cc"
 #if defined(__SSE2__)
    , "xmm7"
 #endif
 );
 }
 #endif
 // Portable row fill: writes 'count' bytes at dst with memset.
 // I420SetPlane passes the same byte-replicated 32-bit value it would give to
 // the SIMD versions; memset only uses the low byte of v8.
 static void SetRow8_C(uint8* dst, uint32 v8, int count) {
  memset(dst, v8, count);
 }
 // Fills a width x height region of one plane with a single byte value.
 //   dst_y        - first byte of the region.
 //   dst_stride_y - distance in bytes between the starts of consecutive rows.
 //   width/height - size of the region in bytes and rows.
 //   value        - byte to write; expected to be in 0..255.
 // A SIMD row writer is used only when the CPU supports it and width, the
 // start pointer and the stride are all multiples of 16; otherwise the memset
 // based writer is used.
 static void I420SetPlane(uint8* dst_y, int dst_stride_y,
                         int width, int height,
                         int value) {
  void (*SetRow)(uint8* dst, uint32 value, int pix);
 #if defined(HAS_SETROW_NEON)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) &&
      (width % 16 == 0) &&
      IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
    SetRow = SetRow32_NEON;
  } else
 #elif defined(HAS_SETROW_SSE2)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
      (width % 16 == 0) &&
      IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
    SetRow = SetRow32_SSE2;
  } else
 #endif
  {
    SetRow = SetRow8_C;
  }

  // Replicate the byte into all four bytes of a 32-bit word.  The shifts are
  // done on an unsigned value: shifting a signed int such as 255 left by 24
  // overflows.
  const uint32 v8 = static_cast<uint32>(value);
  const uint32 v32 = v8 | (v8 << 8) | (v8 << 16) | (v8 << 24);
  // Set plane
  for (int y = 0; y < height; ++y) {
    SetRow(dst_y, v32, width);
    dst_y += dst_stride_y;
  }
 }
 // Draw a rectangle into I420.
 //   dst_y/u/v and strides - the three destination planes.
 //   x, y                  - top-left corner of the rectangle in luma pixels;
 //                           must be >= 0.
 //   width, height         - size in luma pixels.  width must be > 0; height
 //                           may be negative (treated as its magnitude) but
 //                           not 0.
 //   value_y/u/v           - fill values, each 0..255.
 // Returns 0 on success, -1 if any argument is rejected.
 // The chroma rectangle starts at (x / 2, y / 2) and is
 // ((width + 1) / 2) x ((height + 1) / 2) samples.
 // NOTE(review): for odd x or y that chroma size may stop one sample short of
 // the last chroma sample shared with the luma rectangle - confirm intent.
 int I420Rect(uint8* dst_y, int dst_stride_y,
             uint8* dst_u, int dst_stride_u,
             uint8* dst_v, int dst_stride_v,
             int x, int y,
             int width, int height,
             int value_y, int value_u, int value_v) {
  if (!dst_y || !dst_u || !dst_v ||
      width <= 0 || height == 0 ||
      x < 0 || y < 0 ||
      value_y < 0 || value_y > 255 ||
      value_u < 0 || value_u > 255 ||
      value_v < 0 || value_v > 255) {
    return -1;
  }
  // A negative height asks for a bottom-up fill.  A constant fill touches the
  // same rows in either direction, so only the magnitude is needed.  Flipping
  // the plane pointers by the rectangle height and then applying the y offset
  // with negated strides would address rows above the start of the planes
  // whenever y > 0.
  if (height < 0) {
    height = -height;
  }

  int halfwidth = (width + 1) >> 1;
  int halfheight = (height + 1) >> 1;
  uint8* start_y = dst_y + y * dst_stride_y + x;
  uint8* start_u = dst_u + (y / 2) * dst_stride_u + (x / 2);
  uint8* start_v = dst_v + (y / 2) * dst_stride_v + (x / 2);

  I420SetPlane(start_y, dst_stride_y, width, height, value_y);
  I420SetPlane(start_u, dst_stride_u, halfwidth, halfheight, value_u);
  I420SetPlane(start_v, dst_stride_v, halfwidth, halfheight, value_v);
  return 0;
 }
 // Helper function to copy yuv data without scaling.  Used
 // by our jpeg conversion callbacks to incrementally fill a yuv image.
 int I422ToI420(const uint8* src_y, int src_stride_y,
@ -271,6 +392,20 @@ int I422ToI420(const uint8* src_y, int src_stride_y,
  return 0;
 }
 // Copies 'height' rows of 'width' bytes from src to dst when the source row
 // pitch alternates between two values.
 //   src_stride_0 - step applied to src after each even-numbered row.
 //   src_stride_1 - step applied to src after each odd-numbered row.
 //   dst_stride   - step applied to dst after every row.
 static void I420CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1,
                           uint8* dst, int dst_stride,
                           int width, int height) {
  // Copy complete pairs of rows.
  int y = 0;
  for (; y + 1 < height; y += 2) {
    memcpy(dst, src, width);
    src += src_stride_0;
    dst += dst_stride;
    memcpy(dst, src, width);
    src += src_stride_1;
    dst += dst_stride;
  }
  // An odd height leaves one unpaired row.  Copy it on its own; treating it as
  // a pair would read and write one row beyond the requested height.
  if (y < height) {
    memcpy(dst, src, width);
  }
 }
 // Support converting from FOURCC_M420
 // Useful for bandwidth constrained transports like USB 1.0 and 2.0 and for
 // easy conversion to I420.
@ -1238,8 +1373,7 @@ __asm {
 #define HAS_ARGBTOI400ROW_SSSE3
 __declspec(naked)
-static void ARGBToI400Row_SSSE3(const uint8* src_argb, uint8* dst_y,
+static void ARGBToI400Row_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
                                int pix) {
 __asm {
    mov       eax, [esp + 4]   // src_argb
    mov       edx, [esp + 8]   // dst_y
--- a/source/rotate.cc
+++ b/source/rotate.cc
@ -119,7 +119,7 @@ __asm {
    palignr   xmm6, xmm6, 8
    palignr   xmm7, xmm7, 8
    // Third round of bit swap.
-    // Write to the destination pointer. 
+    // Write to the destination pointer.
    punpckldq xmm0, xmm4
    movq      qword ptr [edx], xmm0
    movdqa    xmm4, xmm0
@ -146,7 +146,7 @@ __asm {
    lea       edx, [edx + 2 * esi]
    sub       ecx, 8
    ja        convertloop
-    
+
    pop       ebp
    pop       esi
    pop       edi
@ -154,6 +154,133 @@ __asm {
  }
 }
 #define HAS_TRANSPOSE_UVWX8_SSE2
 // Transposes 8 rows of byte-interleaved two-channel data into two separate
 // planes using SSE2 (MSVC inline assembly, naked function).
 //   src, src_stride            - source rows; each iteration loads 16 bytes
 //                                from each of 8 consecutive rows with movdqa,
 //                                so rows must be 16-byte aligned.
 //   dst_a, dst_stride_a        - receives the low 8 bytes of each transposed
 //                                vector (movlpd).
 //   dst_b, dst_stride_b        - receives the high 8 bytes of each transposed
 //                                vector (movhpd).
 //   w                          - loop counter; reduced by 8 per iteration,
 //                                each of which writes 8 rows of 8 bytes to
 //                                both destinations.
 // All eight xmm registers are in use, so a 16-byte aligned stack slot is
 // carved out to spill one of them during the shuffle.
 __declspec(naked)
 static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
                                uint8* dst_a, int dst_stride_a,
                                uint8* dst_b, int dst_stride_b,
                                int w) {
 __asm {
    push      ebx
    push      esi
    push      edi
    push      ebp
    mov       eax, [esp + 16 + 4]   // src
    mov       edi, [esp + 16 + 8]   // src_stride
    mov       edx, [esp + 16 + 12]  // dst_a
    mov       esi, [esp + 16 + 16]  // dst_stride_a
    mov       ebx, [esp + 16 + 20]  // dst_b
    mov       ebp, [esp + 16 + 24]  // dst_stride_b
    // Reserve a 16-byte aligned spill slot at [esp] and keep the original
    // stack pointer at [esp + 16] so it can be restored before returning.
    mov       ecx, esp
    sub       esp, 4 + 16
    and       esp, ~15
    mov       [esp + 16], ecx
    mov       ecx, [ecx + 16 + 28]  // w
 convertloop :
    // Read in the data from the source pointer.
    // First round of bit swap.
    movdqa    xmm0, [eax]
    movdqa    xmm1, [eax + edi]
    lea       eax, [eax + 2 * edi]
    movdqa    xmm7, xmm0  // use xmm7 as temp register.
    punpcklbw xmm0, xmm1
    punpckhbw xmm7, xmm1
    movdqa    xmm1, xmm7
    movdqa    xmm2, [eax]
    movdqa    xmm3, [eax + edi]
    lea       eax, [eax + 2 * edi]
    movdqa    xmm7, xmm2
    punpcklbw xmm2, xmm3
    punpckhbw xmm7, xmm3
    movdqa    xmm3, xmm7
    movdqa    xmm4, [eax]
    movdqa    xmm5, [eax + edi]
    lea       eax, [eax + 2 * edi]
    movdqa    xmm7, xmm4
    punpcklbw xmm4, xmm5
    punpckhbw xmm7, xmm5
    movdqa    xmm5, xmm7
    movdqa    xmm6, [eax]
    movdqa    xmm7, [eax + edi]
    lea       eax, [eax + 2 * edi]
    movdqa    [esp], xmm5  // backup xmm5
    neg       edi
    movdqa    xmm5, xmm6   // use xmm5 as temp register.
    punpcklbw xmm6, xmm7
    punpckhbw xmm5, xmm7
    movdqa    xmm7, xmm5
    // With edi negated: step back over the 8 rows just read and move 16
    // bytes to the right for the next iteration.
    lea       eax, [eax + 8 * edi + 16]
    neg       edi
    // Second round of bit swap.
    movdqa    xmm5, xmm0
    punpcklwd xmm0, xmm2
    punpckhwd xmm5, xmm2
    movdqa    xmm2, xmm5
    movdqa    xmm5, xmm1
    punpcklwd xmm1, xmm3
    punpckhwd xmm5, xmm3
    movdqa    xmm3, xmm5
    movdqa    xmm5, xmm4
    punpcklwd xmm4, xmm6
    punpckhwd xmm5, xmm6
    movdqa    xmm6, xmm5
    movdqa    xmm5, [esp]  // restore xmm5
    movdqa    [esp], xmm6  // backup xmm6
    movdqa    xmm6, xmm5    // use xmm6 as temp register.
    punpcklwd xmm5, xmm7
    punpckhwd xmm6, xmm7
    movdqa    xmm7, xmm6
    // Third round of bit swap.
    // Write to the destination pointer.
    movdqa    xmm6, xmm0
    punpckldq xmm0, xmm4
    punpckhdq xmm6, xmm4
    movdqa    xmm4, xmm6
    movdqa    xmm6, [esp]  // restore xmm6
    movlpd    qword ptr [edx], xmm0
    movhpd    qword ptr [ebx], xmm0
    movlpd    qword ptr [edx + esi], xmm4
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm4
    lea       ebx, [ebx + 2 * ebp]
    movdqa    xmm0, xmm2   // use xmm0 as the temp register.
    punpckldq xmm2, xmm6
    movlpd    qword ptr [edx], xmm2
    movhpd    qword ptr [ebx], xmm2
    punpckhdq xmm0, xmm6
    movlpd    qword ptr [edx + esi], xmm0
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm0
    lea       ebx, [ebx + 2 * ebp]
    movdqa    xmm0, xmm1   // use xmm0 as the temp register.
    punpckldq xmm1, xmm5
    movlpd    qword ptr [edx], xmm1
    movhpd    qword ptr [ebx], xmm1
    punpckhdq xmm0, xmm5
    movlpd    qword ptr [edx + esi], xmm0
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm0
    lea       ebx, [ebx + 2 * ebp]
    movdqa    xmm0, xmm3   // use xmm0 as the temp register.
    punpckldq xmm3, xmm7
    movlpd    qword ptr [edx], xmm3
    movhpd    qword ptr [ebx], xmm3
    punpckhdq xmm0, xmm7
    movlpd    qword ptr [edx + esi], xmm0
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm0
    lea       ebx, [ebx + 2 * ebp]
    sub       ecx, 8
    ja        convertloop
    // Restore the caller's stack pointer saved in the prologue.
    mov       esp, [esp + 16]
    pop       ebp
    pop       edi
    pop       esi
    pop       ebx
    ret
  }
 }
 #elif (defined(__i386__) || defined(__x86_64__)) && \
    !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
 #define HAS_TRANSPOSE_WX8_SSSE3
@ -204,7 +331,7 @@ static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
  "palignr    $0x8,%%xmm6,%%xmm6\n"
  "palignr    $0x8,%%xmm7,%%xmm7\n"
  // Third round of bit swap.
-  // Write to the destination pointer. 
+  // Write to the destination pointer.
  "punpckldq  %%xmm4,%%xmm0\n"
  "movq       %%xmm0,(%1)\n"
  "movdqa     %%xmm0,%%xmm4\n"
@ -240,15 +367,134 @@ static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
 );
 }
 // TODO(fbarchard): Port to 32 bit
 #if defined (__x86_64__)
 #define HAS_TRANSPOSE_UVWX8_SSE2
 // Transposes 8 rows of byte-interleaved UV data into separate U and V planes
 // using SSE2 (GCC inline assembly, x86_64 only because xmm8/xmm9 are used).
 //   src, src_stride     - source rows; each iteration loads 16 bytes from
 //                         each of 8 consecutive rows with movdqa, so rows
 //                         must be 16-byte aligned.
 //   dst_a, dst_stride_a - receives the low 8 bytes of each transposed vector
 //                         (the U channel).
 //   dst_b, dst_stride_b - receives the high 8 bytes of each transposed vector
 //                         (the V channel).
 //   w                   - loop counter; reduced by 8 per iteration, each of
 //                         which writes 8 rows of 8 bytes to both planes.
 static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
                                uint8* dst_a, int dst_stride_a,
                                uint8* dst_b, int dst_stride_b,
                                int w) {
  asm volatile(
 "1:"
  // Read in the data from the source pointer.
  // First round of bit swap.
  "movdqa     (%0),%%xmm0\n"
  "movdqa     (%0,%4),%%xmm1\n"
  "lea        (%0,%4,2),%0\n"
  "movdqa     %%xmm0,%%xmm8\n"
  "punpcklbw  %%xmm1,%%xmm0\n"
  "punpckhbw  %%xmm1,%%xmm8\n"
  "movdqa     %%xmm8,%%xmm1\n"
  "movdqa     (%0),%%xmm2\n"
  "movdqa     (%0,%4),%%xmm3\n"
  "lea        (%0,%4,2),%0\n"
  "movdqa     %%xmm2,%%xmm8\n"
  "punpcklbw  %%xmm3,%%xmm2\n"
  "punpckhbw  %%xmm3,%%xmm8\n"
  "movdqa     %%xmm8,%%xmm3\n"
  "movdqa     (%0),%%xmm4\n"
  "movdqa     (%0,%4),%%xmm5\n"
  "lea        (%0,%4,2),%0\n"
  "movdqa     %%xmm4,%%xmm8\n"
  "punpcklbw  %%xmm5,%%xmm4\n"
  "punpckhbw  %%xmm5,%%xmm8\n"
  "movdqa     %%xmm8,%%xmm5\n"
  "movdqa     (%0),%%xmm6\n"
  "movdqa     (%0,%4),%%xmm7\n"
  "lea        (%0,%4,2),%0\n"
  "movdqa     %%xmm6,%%xmm8\n"
  "punpcklbw  %%xmm7,%%xmm6\n"
  // The stride register is negated to step back over the 8 rows just read
  // (plus 16 bytes to the right) and negated again to restore it.
  "neg        %4\n"
  "lea        0x10(%0,%4,8),%0\n"
  "punpckhbw  %%xmm7,%%xmm8\n"
  "movdqa     %%xmm8,%%xmm7\n"
  "neg        %4\n"
   // Second round of bit swap.
  "movdqa     %%xmm0,%%xmm8\n"
  "movdqa     %%xmm1,%%xmm9\n"
  "punpckhwd  %%xmm2,%%xmm8\n"
  "punpckhwd  %%xmm3,%%xmm9\n"
  "punpcklwd  %%xmm2,%%xmm0\n"
  "punpcklwd  %%xmm3,%%xmm1\n"
  "movdqa     %%xmm8,%%xmm2\n"
  "movdqa     %%xmm9,%%xmm3\n"
  "movdqa     %%xmm4,%%xmm8\n"
  "movdqa     %%xmm5,%%xmm9\n"
  "punpckhwd  %%xmm6,%%xmm8\n"
  "punpckhwd  %%xmm7,%%xmm9\n"
  "punpcklwd  %%xmm6,%%xmm4\n"
  "punpcklwd  %%xmm7,%%xmm5\n"
  "movdqa     %%xmm8,%%xmm6\n"
  "movdqa     %%xmm9,%%xmm7\n"
  // Third round of bit swap.
  // Write to the destination pointer.
  "movdqa     %%xmm0,%%xmm8\n"
  "punpckldq  %%xmm4,%%xmm0\n"
  "movlpd     %%xmm0,(%1)\n"  // Write back U channel
  "movhpd     %%xmm0,(%2)\n"  // Write back V channel
  "punpckhdq  %%xmm4,%%xmm8\n"
  "movlpd     %%xmm8,(%1,%5)\n"
  "lea        (%1,%5,2),%1\n"
  "movhpd     %%xmm8,(%2,%6)\n"
  "lea        (%2,%6,2),%2\n"
  "movdqa     %%xmm2,%%xmm8\n"
  "punpckldq  %%xmm6,%%xmm2\n"
  "movlpd     %%xmm2,(%1)\n"
  "movhpd     %%xmm2,(%2)\n"
  "punpckhdq  %%xmm6,%%xmm8\n"
  "movlpd     %%xmm8,(%1,%5)\n"
  "lea        (%1,%5,2),%1\n"
  "movhpd     %%xmm8,(%2,%6)\n"
  "lea        (%2,%6,2),%2\n"
  "movdqa     %%xmm1,%%xmm8\n"
  "punpckldq  %%xmm5,%%xmm1\n"
  "movlpd     %%xmm1,(%1)\n"
  "movhpd     %%xmm1,(%2)\n"
  "punpckhdq  %%xmm5,%%xmm8\n"
  "movlpd     %%xmm8,(%1,%5)\n"
  "lea        (%1,%5,2),%1\n"
  "movhpd     %%xmm8,(%2,%6)\n"
  "lea        (%2,%6,2),%2\n"
  "movdqa     %%xmm3,%%xmm8\n"
  "punpckldq  %%xmm7,%%xmm3\n"
  "movlpd     %%xmm3,(%1)\n"
  "movhpd     %%xmm3,(%2)\n"
  "punpckhdq  %%xmm7,%%xmm8\n"
  "movlpd     %%xmm8,(%1,%5)\n"
  "lea        (%1,%5,2),%1\n"
  "movhpd     %%xmm8,(%2,%6)\n"
  "lea        (%2,%6,2),%2\n"
  "sub        $0x8,%3\n"
  "ja         1b\n"
  : "+r"(src),    // %0
    "+r"(dst_a),  // %1
    "+r"(dst_b),  // %2
    "+r"(w)   // %3
  : "r"(static_cast<intptr_t>(src_stride)),    // %4
    "r"(static_cast<intptr_t>(dst_stride_a)),  // %5
    "r"(static_cast<intptr_t>(dst_stride_b))   // %6
  // Every vector register written above and the flags must be declared, or
  // the compiler may keep live values in them across this statement.
  : "memory", "cc",
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4",
    "xmm5", "xmm6", "xmm7", "xmm8", "xmm9"
 );
 }
 #endif
 #endif
 static void TransposeWx8_C(const uint8* src, int src_stride,
                           uint8* dst, int dst_stride,
                           int w) {
-  int i, j;
+  int i;
-  for (i = 0; i < w; ++i)
+  for (i = 0; i < w; ++i) {
-    for (j = 0; j < 8; ++j)
+    dst[0] = src[0 * src_stride];
-      dst[i * dst_stride + j] = src[j * src_stride + i];
+    dst[1] = src[1 * src_stride];
    dst[2] = src[2 * src_stride];
    dst[3] = src[3 * src_stride];
    dst[4] = src[4 * src_stride];
    dst[5] = src[5 * src_stride];
    dst[6] = src[6 * src_stride];
    dst[7] = src[7 * src_stride];
    ++src;
    dst += dst_stride;
  }
 }
 static void TransposeWxH_C(const uint8* src, int src_stride,
@ -328,10 +574,10 @@ void RotatePlane270(const uint8* src, int src_stride,
 static void ReverseLine_C(const uint8* src, uint8* dst, int width) {
  int i;
-  src += width;
+  src += width - 1;
  for (i = 0; i < width; ++i) {
    --src;
    dst[i] = src[0];
    --src;
  }
 }
@ -407,15 +653,13 @@ void RotatePlane180(const uint8* src, int src_stride,
  {
    ReverseLine = ReverseLine_C;
  }
-  // Rotate by 180 is a mirror with the destination
+  // Rotate by 180 is a mirror and vertical flip
-  // written in reverse.
+  src += src_stride * (height - 1);
  dst += dst_stride * (height - 1);
  for (i = 0; i < height; ++i) {
    ReverseLine(src, dst, width);
-
+    src -= src_stride;
-    src += src_stride;
+    dst += dst_stride;
    dst -= dst_stride;
  }
 }
@ -423,12 +667,28 @@ static void TransposeUVWx8_C(const uint8* src, int src_stride,
                             uint8* dst_a, int dst_stride_a,
                             uint8* dst_b, int dst_stride_b,
                             int w) {
-  int i, j;
+  int i;
-  for (i = 0; i < w * 2; i += 2)
+  for (i = 0; i < w; ++i) {
-    for (j = 0; j < 8; ++j) {
+    dst_a[0] = src[0 * src_stride + 0];
-      dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)];
+    dst_b[0] = src[0 * src_stride + 1];
-      dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1];
+    dst_a[1] = src[1 * src_stride + 0];
-    }
+    dst_b[1] = src[1 * src_stride + 1];
    dst_a[2] = src[2 * src_stride + 0];
    dst_b[2] = src[2 * src_stride + 1];
    dst_a[3] = src[3 * src_stride + 0];
    dst_b[3] = src[3 * src_stride + 1];
    dst_a[4] = src[4 * src_stride + 0];
    dst_b[4] = src[4 * src_stride + 1];
    dst_a[5] = src[5 * src_stride + 0];
    dst_b[5] = src[5 * src_stride + 1];
    dst_a[6] = src[6 * src_stride + 0];
    dst_b[6] = src[6 * src_stride + 1];
    dst_a[7] = src[7 * src_stride + 0];
    dst_b[7] = src[7 * src_stride + 1];
    src += 2;
    dst_a += dst_stride_a;
    dst_b += dst_stride_b;
  }
 }
 static void TransposeUVWxH_C(const uint8* src, int src_stride,
@ -436,7 +696,7 @@ static void TransposeUVWxH_C(const uint8* src, int src_stride,
                             uint8* dst_b, int dst_stride_b,
                             int w, int h) {
  int i, j;
-  for (i = 0; i < w*2; i += 2)
+  for (i = 0; i < w * 2; i += 2)
    for (j = 0; j < h; ++j) {
      dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)];
      dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1];
@ -452,12 +712,8 @@ void TransposeUV(const uint8* src, int src_stride,
  rotate_uv_wxh_func TransposeWxH;
 #if defined(HAS_TRANSPOSE_UVWX8_NEON)
-  if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) &&
+  unsigned long long store_reg[8];
-      (width % 8 == 0) &&
+  if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON)) {
      IS_ALIGNED(src, 16) && (src_stride % 8 == 0) &&
      IS_ALIGNED(dst_a, 16) && (dst_stride_a % 8 == 0) &&
      IS_ALIGNED(dst_b, 16) && (dst_stride_b % 8 == 0)) {
    unsigned long long store_reg[8];
    SaveRegisters_NEON(store_reg);
    TransposeWx8 = TransposeUVWx8_NEON;
    TransposeWxH = TransposeUVWxH_C;
@ -466,9 +722,9 @@ void TransposeUV(const uint8* src, int src_stride,
 #if defined(HAS_TRANSPOSE_UVWX8_SSE2)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
      (width % 8 == 0) &&
-      IS_ALIGNED(src, 16) && (src_stride % 8 == 0) &&
+      IS_ALIGNED(src, 16) && (src_stride % 16 == 0) &&
-      IS_ALIGNED(dst_a, 16) && (dst_stride_a % 8 == 0) &&
+      IS_ALIGNED(dst_a, 8) && (dst_stride_a % 8 == 0) &&
-      IS_ALIGNED(dst_b, 16) && (dst_stride_b % 8 == 0)) {
+      IS_ALIGNED(dst_b, 8) && (dst_stride_b % 8 == 0)) {
    TransposeWx8 = TransposeUVWx8_SSE2;
    TransposeWxH = TransposeUVWxH_C;
  } else
@ -544,7 +800,7 @@ __asm {
    mov       edi, [esp + 4 + 12]  // dst_b
    mov       ecx, [esp + 4 + 16]  // width
    movdqa    xmm7, _kShuffleReverseUV
-    lea       eax, [eax + 2 * ecx - 16]
+    lea       eax, [eax + ecx * 2 - 16]
 convertloop :
    movdqa    xmm0, [eax]
@ -610,13 +866,12 @@ void RotateUV180(const uint8* src, int src_stride,
  int i;
  reverse_uv_func ReverseLine;
  // TODO(frkoenig) : do processor detection here.
 #if defined(HAS_REVERSE_LINE_UV_NEON)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) &&
      (width % 16 == 0) &&
      IS_ALIGNED(src, 16) && (src_stride % 16 == 0) &&
-      IS_ALIGNED(dst_a, 16) && (dst_stride_a % 8 == 0) &&
+      IS_ALIGNED(dst_a, 8) && (dst_stride_a % 8 == 0) &&
-      IS_ALIGNED(dst_b, 16) && (dst_stride_b % 8 == 0) ) {
+      IS_ALIGNED(dst_b, 8) && (dst_stride_b % 8 == 0) ) {
    ReverseLine = ReverseLineUV_NEON;
  } else
 #endif
@ -624,8 +879,8 @@ void RotateUV180(const uint8* src, int src_stride,
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
      (width % 16 == 0) &&
      IS_ALIGNED(src, 16) && (src_stride % 16 == 0) &&
-      IS_ALIGNED(dst_a, 16) && (dst_stride_a % 8 == 0) &&
+      IS_ALIGNED(dst_a, 8) && (dst_stride_a % 8 == 0) &&
-      IS_ALIGNED(dst_b, 16) && (dst_stride_b % 8 == 0) ) {
+      IS_ALIGNED(dst_b, 8) && (dst_stride_b % 8 == 0) ) {
    ReverseLine = ReverseLineUV_SSSE3;
  } else
 #endif
@ -669,7 +924,7 @@ int I420Rotate(const uint8* src_y, int src_stride_y,
  }
  switch (mode) {
-    case kRotateNone:
+    case kRotate0:
      // copy frame
      return I420Copy(src_y, src_stride_y,
                      src_u, src_stride_u,
@ -678,7 +933,7 @@ int I420Rotate(const uint8* src_y, int src_stride_y,
                      dst_u, dst_stride_u,
                      dst_v, dst_stride_v,
                      width, height);
-    case kRotateClockwise:
+    case kRotate90:
      RotatePlane90(src_y, src_stride_y,
                    dst_y, dst_stride_y,
                    width, height);
@ -689,7 +944,7 @@ int I420Rotate(const uint8* src_y, int src_stride_y,
                    dst_v, dst_stride_v,
                    halfwidth, halfheight);
      return 0;
-    case kRotateCounterClockwise:
+    case kRotate270:
      RotatePlane270(src_y, src_stride_y,
                     dst_y, dst_stride_y,
                     width, height);
@ -738,14 +993,14 @@ int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
  }
  switch (mode) {
-    case kRotateNone:
+    case kRotate0:
      // copy frame
      return NV12ToI420(src_y, src_uv, src_stride_y,
                        dst_y, dst_stride_y,
                        dst_u, dst_stride_u,
                        dst_v, dst_stride_v,
                        width, height);
-    case kRotateClockwise:
+    case kRotate90:
      RotatePlane90(src_y, src_stride_y,
                    dst_y, dst_stride_y,
                    width, height);
@ -754,7 +1009,7 @@ int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
                 dst_v, dst_stride_v,
                 halfwidth, halfheight);
      return 0;
-    case kRotateCounterClockwise:
+    case kRotate270:
      RotatePlane270(src_y, src_stride_y,
                     dst_y, dst_stride_y,
                     width, height);