RGB formats converted to YUV with Neon

BUG=none TEST=convert_test Review URL: https://webrtc-codereview.appspot.com/936013 git-svn-id: http://libyuv.googlecode.com/svn/trunk@471 16f28f9a-4ce2-e073-06de-1de4eb20be90
2026-02-06 17:59:50 +08:00 · 2012-11-05 23:40:11 +00:00 · 2012-11-05 23:40:11 +00:00 · bdf7cb5914
commit bdf7cb5914
parent d8427fd50a
20 changed files with 2528 additions and 1279 deletions
--- a/README.chromium
+++ b/README.chromium
@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 470
+Version: 471
 License: BSD
 License File: LICENSE

--- a/include/libyuv/convert.h
+++ b/include/libyuv/convert.h
@ -22,22 +22,9 @@ namespace libyuv {
 extern "C" {
 #endif

-// Alias.
-#define I420ToI420 I420Copy
-
-// Copy I420 to I420.
+// Convert I444 to I420.
 LIBYUV_API
-int I420Copy(const uint8* src_y, int src_stride_y,
-             const uint8* src_u, int src_stride_u,
-             const uint8* src_v, int src_stride_v,
-             uint8* dst_y, int dst_stride_y,
-             uint8* dst_u, int dst_stride_u,
-             uint8* dst_v, int dst_stride_v,
-             int width, int height);
-
-// Convert I422 to I420.
-LIBYUV_API
-int I422ToI420(const uint8* src_y, int src_stride_y,
+int I444ToI420(const uint8* src_y, int src_stride_y,
               const uint8* src_u, int src_stride_u,
               const uint8* src_v, int src_stride_v,
               uint8* dst_y, int dst_stride_y,
@ -45,9 +32,9 @@ int I422ToI420(const uint8* src_y, int src_stride_y,
               uint8* dst_v, int dst_stride_v,
               int width, int height);

-// Convert I444 to I420.
+// Convert I422 to I420.
 LIBYUV_API
-int I444ToI420(const uint8* src_y, int src_stride_y,
+int I422ToI420(const uint8* src_y, int src_stride_y,
               const uint8* src_u, int src_stride_u,
               const uint8* src_v, int src_stride_v,
               uint8* dst_y, int dst_stride_y,
@ -65,6 +52,17 @@ int I411ToI420(const uint8* src_y, int src_stride_y,
               uint8* dst_v, int dst_stride_v,
               int width, int height);

+// Copy I420 to I420.
+#define I420ToI420 I420Copy
+LIBYUV_API
+int I420Copy(const uint8* src_y, int src_stride_y,
+             const uint8* src_u, int src_stride_u,
+             const uint8* src_v, int src_stride_v,
+             uint8* dst_y, int dst_stride_y,
+             uint8* dst_u, int dst_stride_u,
+             uint8* dst_v, int dst_stride_v,
+             int width, int height);
+
 // Convert I400 (grey) to I420.
 LIBYUV_API
 int I400ToI420(const uint8* src_y, int src_stride_y,
@ -91,6 +89,22 @@ int NV21ToI420(const uint8* src_y, int src_stride_y,
               uint8* dst_v, int dst_stride_v,
               int width, int height);

+// Convert YUY2 to I420.
+LIBYUV_API
+int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// Convert UYVY to I420.
+LIBYUV_API
+int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
 // Convert M420 to I420.
 LIBYUV_API
 int M420ToI420(const uint8* src_m420, int src_stride_m420,
@ -108,22 +122,6 @@ int Q420ToI420(const uint8* src_y, int src_stride_y,
               uint8* dst_v, int dst_stride_v,
               int width, int height);

-// Convert YUY2 to I420.
-LIBYUV_API
-int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
-
-// Convert UYVY to I420.
-LIBYUV_API
-int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
-
 // Convert V210 to I420.
 LIBYUV_API
 int V210ToI420(const uint8* src_uyvy, int src_stride_uyvy,
--- a/include/libyuv/convert_from_argb.h
+++ b/include/libyuv/convert_from_argb.h
@ -18,15 +18,27 @@ namespace libyuv {
 extern "C" {
 #endif

-// Alias.
-#define ARGBToARGB ARGBCopy
-
 // Copy ARGB to ARGB.
+#define ARGBToARGB ARGBCopy
 LIBYUV_API
 int ARGBCopy(const uint8* src_argb, int src_stride_argb,
             uint8* dst_argb, int dst_stride_argb,
             int width, int height);

+// Convert ARGB To BGRA. (alias)
+#define ARGBToBGRA BGRAToARGB
+LIBYUV_API
+int BGRAToARGB(const uint8* src_frame, int src_stride_frame,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// Convert ARGB To ABGR. (alias)
+#define ARGBToABGR ABGRToARGB
+LIBYUV_API
+int ABGRToARGB(const uint8* src_frame, int src_stride_frame,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
 // Convert ARGB To RGBA.
 LIBYUV_API
 int ARGBToRGBA(const uint8* src_frame, int src_stride_frame,
@ -63,34 +75,75 @@ int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb,
                   uint8* dst_argb4444, int dst_stride_argb4444,
                   int width, int height);

+// Convert ARGB To I444.
+LIBYUV_API
+int ARGBToI444(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// Convert ARGB To I422.
+LIBYUV_API
+int ARGBToI422(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// Convert ARGB To I420. (also in convert.h)
+LIBYUV_API
+int ARGBToI420(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// Convert ARGB To I411.
+LIBYUV_API
+int ARGBToI411(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
 // Convert ARGB to I400.
 LIBYUV_API
 int ARGBToI400(const uint8* src_argb, int src_stride_argb,
               uint8* dst_y, int dst_stride_y,
               int width, int height);

-// ARGB little endian (bgra in memory) to I422.
+// Convert ARGB To NV12.
 LIBYUV_API
-int ARGBToI422(const uint8* src_frame, int src_stride_frame,
+int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
+               uint8* dst_uv, int dst_stride_uv,
               int width, int height);

-// Aliases.
-#define ARGBToBGRA BGRAToARGB
-#define ARGBToABGR ABGRToARGB
-
-// BGRA little endian (argb in memory) to ARGB.
+// Convert ARGB To NV21.
 LIBYUV_API
-int BGRAToARGB(const uint8* src_frame, int src_stride_frame,
-               uint8* dst_argb, int dst_stride_argb,
+int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_vu, int dst_stride_vu,
               int width, int height);

-// ABGR little endian (rgba in memory) to ARGB.
+// Convert ARGB To NV21.
 LIBYUV_API
-int ABGRToARGB(const uint8* src_frame, int src_stride_frame,
-               uint8* dst_argb, int dst_stride_argb,
+int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_vu, int dst_stride_vu,
+               int width, int height);
+
+// Convert ARGB To YUY2.
+LIBYUV_API
+int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_yuy2, int dst_stride_yuy2,
+               int width, int height);
+
+// Convert ARGB To UYVY.
+LIBYUV_API
+int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_uyvy, int dst_stride_uyvy,
               int width, int height);

 #ifdef __cplusplus
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
--- a/include/libyuv/scale.h
+++ b/include/libyuv/scale.h
@ -66,8 +66,8 @@ int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,

 // Legacy API.  Deprecated.
 LIBYUV_API
-int ScaleOffset(const uint8* src, int src_width, int src_height,
-                uint8* dst, int dst_width, int dst_height, int dst_yoffset,
+int ScaleOffset(const uint8* src_i420, int src_width, int src_height,
+                uint8* dst_i420, int dst_width, int dst_height, int dst_yoffset,
                bool interpolate);

 // For testing, allow disabling of specialized scalers.
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_

-#define LIBYUV_VERSION 470
+#define LIBYUV_VERSION 471

 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
--- a/source/convert.cc
+++ b/source/convert.cc
@ -533,11 +533,9 @@ int Q420ToI420(const uint8* src_y, int src_stride_y,
  void (*YUY2ToYRow)(const uint8* src_yuy2, uint8* dst_y, int pix) =
      YUY2ToYRow_C;
 #if defined(HAS_YUY2TOYROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    if (width > 16) {
-      YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2;
-      YUY2ToYRow = YUY2ToYRow_Any_SSE2;
-    }
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+    YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2;
+    YUY2ToYRow = YUY2ToYRow_Any_SSE2;
    if (IS_ALIGNED(width, 16)) {
      YUY2ToUV422Row = YUY2ToUV422Row_Unaligned_SSE2;
      YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2;
@ -550,12 +548,10 @@ int Q420ToI420(const uint8* src_y, int src_stride_y,
    }
  }
 #elif defined(HAS_YUY2TOYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    if (width > 8) {
-      YUY2ToYRow = YUY2ToYRow_Any_NEON;
-      if (width > 16) {
-        YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON;
-      }
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    YUY2ToYRow = YUY2ToYRow_Any_NEON;
+    if (width >= 16) {
+      YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON;
    }
    if (IS_ALIGNED(width, 16)) {
      YUY2ToYRow = YUY2ToYRow_NEON;
@ -656,11 +652,9 @@ int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
  YUY2ToYRow = YUY2ToYRow_C;
  YUY2ToUVRow = YUY2ToUVRow_C;
 #if defined(HAS_YUY2TOYROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    if (width > 16) {
-      YUY2ToUVRow = YUY2ToUVRow_Any_SSE2;
-      YUY2ToYRow = YUY2ToYRow_Any_SSE2;
-    }
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+    YUY2ToUVRow = YUY2ToUVRow_Any_SSE2;
+    YUY2ToYRow = YUY2ToYRow_Any_SSE2;
    if (IS_ALIGNED(width, 16)) {
      YUY2ToUVRow = YUY2ToUVRow_Unaligned_SSE2;
      YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2;
@ -673,12 +667,10 @@ int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
    }
  }
 #elif defined(HAS_YUY2TOYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    if (width > 8) {
-      YUY2ToYRow = YUY2ToYRow_Any_NEON;
-      if (width > 16) {
-        YUY2ToUVRow = YUY2ToUVRow_Any_NEON;
-      }
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    YUY2ToYRow = YUY2ToYRow_Any_NEON;
+    if (width >= 16) {
+      YUY2ToUVRow = YUY2ToUVRow_Any_NEON;
    }
    if (IS_ALIGNED(width, 16)) {
      YUY2ToYRow = YUY2ToYRow_NEON;
@ -723,11 +715,9 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
  UYVYToYRow = UYVYToYRow_C;
  UYVYToUVRow = UYVYToUVRow_C;
 #if defined(HAS_UYVYTOYROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    if (width > 16) {
-      UYVYToUVRow = UYVYToUVRow_Any_SSE2;
-      UYVYToYRow = UYVYToYRow_Any_SSE2;
-    }
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+    UYVYToUVRow = UYVYToUVRow_Any_SSE2;
+    UYVYToYRow = UYVYToYRow_Any_SSE2;
    if (IS_ALIGNED(width, 16)) {
      UYVYToUVRow = UYVYToUVRow_Unaligned_SSE2;
      UYVYToYRow = UYVYToYRow_Unaligned_SSE2;
@ -740,12 +730,10 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
    }
  }
 #elif defined(HAS_UYVYTOYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    if (width > 8) {
-      UYVYToYRow = UYVYToYRow_Any_NEON;
-      if (width > 16) {
-        UYVYToUVRow = UYVYToUVRow_Any_NEON;
-      }
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    UYVYToYRow = UYVYToYRow_Any_NEON;
+    if (width >= 16) {
+      UYVYToUVRow = UYVYToUVRow_Any_NEON;
    }
    if (IS_ALIGNED(width, 16)) {
      UYVYToYRow = UYVYToYRow_NEON;
@ -827,10 +815,9 @@ int V210ToI420(const uint8* src_v210, int src_stride_v210,
               uint8* dst_u, int dst_stride_u,
               uint8* dst_v, int dst_stride_v,
               int width, int height) {
-  if (width * 2 * 2 > kMaxStride) {  // 2 rows of UYVY are required.
-    return -1;
-  } else if (!src_v210 || !dst_y || !dst_u || !dst_v ||
-             width <= 0 || height == 0) {
+  if (!src_v210 || !dst_y || !dst_u || !dst_v ||
+      width <= 0 || height == 0 ||
+      width * 2 * 2 > kMaxStride) {
    return -1;
  }
  // Negative height means invert the image.
@ -858,12 +845,10 @@ int V210ToI420(const uint8* src_v210, int src_stride_v210,
    }
  }
 #elif defined(HAS_UYVYTOYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    if (width > 8) {
-      UYVYToYRow = UYVYToYRow_Any_NEON;
-      if (width > 16) {
-        UYVYToUVRow = UYVYToUVRow_Any_NEON;
-      }
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    UYVYToYRow = UYVYToYRow_Any_NEON;
+    if (width >= 16) {
+      UYVYToUVRow = UYVYToUVRow_Any_NEON;
    }
    if (IS_ALIGNED(width, 16)) {
      UYVYToYRow = UYVYToYRow_NEON;
@ -873,11 +858,9 @@ int V210ToI420(const uint8* src_v210, int src_stride_v210,
 #endif

 #if defined(HAS_UYVYTOYROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    if (width > 16) {
-      UYVYToUVRow = UYVYToUVRow_Any_SSE2;
-      UYVYToYRow = UYVYToYRow_Any_SSE2;
-    }
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+    UYVYToUVRow = UYVYToUVRow_Any_SSE2;
+    UYVYToYRow = UYVYToYRow_Any_SSE2;
    if (IS_ALIGNED(width, 16)) {
      UYVYToYRow = UYVYToYRow_Unaligned_SSE2;
      UYVYToUVRow = UYVYToUVRow_SSE2;
@ -887,12 +870,10 @@ int V210ToI420(const uint8* src_v210, int src_stride_v210,
    }
  }
 #elif defined(HAS_UYVYTOYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    if (width > 8) {
-      UYVYToYRow = UYVYToYRow_Any_NEON;
-      if (width > 16) {
-        UYVYToUVRow = UYVYToUVRow_Any_NEON;
-      }
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    UYVYToYRow = UYVYToYRow_Any_NEON;
+    if (width >= 16) {
+      UYVYToUVRow = UYVYToUVRow_Any_NEON;
    }
    if (IS_ALIGNED(width, 16)) {
      UYVYToYRow = UYVYToYRow_NEON;
@ -920,6 +901,7 @@ int V210ToI420(const uint8* src_v210, int src_stride_v210,
  return 0;
 }

+// Convert ARGB to I420.
 LIBYUV_API
 int ARGBToI420(const uint8* src_argb, int src_stride_argb,
               uint8* dst_y, int dst_stride_y,
@ -942,11 +924,9 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb,
  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
      ARGBToYRow_C;
 #if defined(HAS_ARGBTOYROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    if (width > 16) {
-      ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
-      ARGBToYRow = ARGBToYRow_Any_SSSE3;
-    }
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+    ARGBToYRow = ARGBToYRow_Any_SSSE3;
    if (IS_ALIGNED(width, 16)) {
      ARGBToUVRow = ARGBToUVRow_Unaligned_SSSE3;
      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
@ -959,10 +939,8 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb,
    }
  }
 #elif defined(HAS_ARGBTOYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    if (width > 8) {
-      ARGBToYRow = ARGBToYRow_Any_NEON;
-    }
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    ARGBToYRow = ARGBToYRow_Any_NEON;
    if (IS_ALIGNED(width, 8)) {
      ARGBToYRow = ARGBToYRow_NEON;
    }
@ -985,6 +963,7 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb,
  return 0;
 }

+// Convert BGRA to I420.
 LIBYUV_API
 int BGRAToI420(const uint8* src_bgra, int src_stride_bgra,
               uint8* dst_y, int dst_stride_y,
@ -1002,18 +981,14 @@ int BGRAToI420(const uint8* src_bgra, int src_stride_bgra,
    src_bgra = src_bgra + (height - 1) * src_stride_bgra;
    src_stride_bgra = -src_stride_bgra;
  }
-  void (*BGRAToYRow)(const uint8* src_bgra, uint8* dst_y, int pix);
  void (*BGRAToUVRow)(const uint8* src_bgra0, int src_stride_bgra,
-                      uint8* dst_u, uint8* dst_v, int width);
-
-  BGRAToYRow = BGRAToYRow_C;
-  BGRAToUVRow = BGRAToUVRow_C;
+                      uint8* dst_u, uint8* dst_v, int width) = BGRAToUVRow_C;
+  void (*BGRAToYRow)(const uint8* src_bgra, uint8* dst_y, int pix) =
+      BGRAToYRow_C;
 #if defined(HAS_BGRATOYROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    if (width > 16) {
-      BGRAToUVRow = BGRAToUVRow_Any_SSSE3;
-      BGRAToYRow = BGRAToYRow_Any_SSSE3;
-    }
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+    BGRAToUVRow = BGRAToUVRow_Any_SSSE3;
+    BGRAToYRow = BGRAToYRow_Any_SSSE3;
    if (IS_ALIGNED(width, 16)) {
      BGRAToUVRow = BGRAToUVRow_Unaligned_SSSE3;
      BGRAToYRow = BGRAToYRow_Unaligned_SSSE3;
@ -1025,6 +1000,13 @@ int BGRAToI420(const uint8* src_bgra, int src_stride_bgra,
      }
    }
  }
+#elif defined(HAS_BGRATOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    BGRAToYRow = BGRAToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      BGRAToYRow = BGRAToYRow_NEON;
+    }
+  }
 #endif

  for (int y = 0; y < height - 1; y += 2) {
@ -1043,6 +1025,7 @@ int BGRAToI420(const uint8* src_bgra, int src_stride_bgra,
  return 0;
 }

+// Convert ABGR to I420.
 LIBYUV_API
 int ABGRToI420(const uint8* src_abgr, int src_stride_abgr,
               uint8* dst_y, int dst_stride_y,
@ -1060,18 +1043,14 @@ int ABGRToI420(const uint8* src_abgr, int src_stride_abgr,
    src_abgr = src_abgr + (height - 1) * src_stride_abgr;
    src_stride_abgr = -src_stride_abgr;
  }
-  void (*ABGRToYRow)(const uint8* src_abgr, uint8* dst_y, int pix);
  void (*ABGRToUVRow)(const uint8* src_abgr0, int src_stride_abgr,
-                      uint8* dst_u, uint8* dst_v, int width);
-
-  ABGRToYRow = ABGRToYRow_C;
-  ABGRToUVRow = ABGRToUVRow_C;
+                      uint8* dst_u, uint8* dst_v, int width) = ABGRToUVRow_C;
+  void (*ABGRToYRow)(const uint8* src_abgr, uint8* dst_y, int pix) =
+      ABGRToYRow_C;
 #if defined(HAS_ABGRTOYROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    if (width > 16) {
-      ABGRToUVRow = ABGRToUVRow_Any_SSSE3;
-      ABGRToYRow = ABGRToYRow_Any_SSSE3;
-    }
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+    ABGRToUVRow = ABGRToUVRow_Any_SSSE3;
+    ABGRToYRow = ABGRToYRow_Any_SSSE3;
    if (IS_ALIGNED(width, 16)) {
      ABGRToUVRow = ABGRToUVRow_Unaligned_SSSE3;
      ABGRToYRow = ABGRToYRow_Unaligned_SSSE3;
@ -1083,6 +1062,13 @@ int ABGRToI420(const uint8* src_abgr, int src_stride_abgr,
      }
    }
  }
+#elif defined(HAS_ABGRTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    ABGRToYRow = ABGRToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ABGRToYRow = ABGRToYRow_NEON;
+    }
+  }
 #endif

  for (int y = 0; y < height - 1; y += 2) {
@ -1101,6 +1087,7 @@ int ABGRToI420(const uint8* src_abgr, int src_stride_abgr,
  return 0;
 }

+// Convert RGBA to I420.
 LIBYUV_API
 int RGBAToI420(const uint8* src_rgba, int src_stride_rgba,
               uint8* dst_y, int dst_stride_y,
@ -1118,18 +1105,14 @@ int RGBAToI420(const uint8* src_rgba, int src_stride_rgba,
    src_rgba = src_rgba + (height - 1) * src_stride_rgba;
    src_stride_rgba = -src_stride_rgba;
  }
-  void (*RGBAToYRow)(const uint8* src_rgba, uint8* dst_y, int pix);
  void (*RGBAToUVRow)(const uint8* src_rgba0, int src_stride_rgba,
-                      uint8* dst_u, uint8* dst_v, int width);
-
-  RGBAToYRow = RGBAToYRow_C;
-  RGBAToUVRow = RGBAToUVRow_C;
+                      uint8* dst_u, uint8* dst_v, int width) = RGBAToUVRow_C;
+  void (*RGBAToYRow)(const uint8* src_rgba, uint8* dst_y, int pix) =
+      RGBAToYRow_C;
 #if defined(HAS_RGBATOYROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    if (width > 16) {
-      RGBAToUVRow = RGBAToUVRow_Any_SSSE3;
-      RGBAToYRow = RGBAToYRow_Any_SSSE3;
-    }
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+    RGBAToUVRow = RGBAToUVRow_Any_SSSE3;
+    RGBAToYRow = RGBAToYRow_Any_SSSE3;
    if (IS_ALIGNED(width, 16)) {
      RGBAToUVRow = RGBAToUVRow_Unaligned_SSSE3;
      RGBAToYRow = RGBAToYRow_Unaligned_SSSE3;
@ -1141,6 +1124,13 @@ int RGBAToI420(const uint8* src_rgba, int src_stride_rgba,
      }
    }
  }
+#elif defined(HAS_RGBATOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    RGBAToYRow = RGBAToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      RGBAToYRow = RGBAToYRow_NEON;
+    }
+  }
 #endif

  for (int y = 0; y < height - 1; y += 2) {
@ -1159,18 +1149,17 @@ int RGBAToI420(const uint8* src_rgba, int src_stride_rgba,
  return 0;
 }

+// Convert RGB24 to I420.
 LIBYUV_API
 int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
                uint8* dst_y, int dst_stride_y,
                uint8* dst_u, int dst_stride_u,
                uint8* dst_v, int dst_stride_v,
                int width, int height) {
-  if (width * 4 > kMaxStride) {  // Row buffer is required.
+  if (!src_rgb24 || !dst_y || !dst_u || !dst_v ||
+      width <= 0 || height == 0 ||
+      width * 4 > kMaxStride) {
    return -1;
-  } else if (!src_rgb24 ||
-             !dst_y || !dst_u || !dst_v ||
-             width <= 0 || height == 0) {
-      return -1;
  }
  // Negative height means invert the image.
  if (height < 0) {
@ -1179,44 +1168,71 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
    src_stride_rgb24 = -src_stride_rgb24;
  }
  SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
-  void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix);
-
-  RGB24ToARGBRow = RGB24ToARGBRow_C;
+  void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
+      RGB24ToARGBRow_C;
 #if defined(HAS_RGB24TOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) &&
-      TestReadSafe(src_rgb24, src_stride_rgb24, width, height, 3, 48)) {
-    RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+    RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
+    }
+  }
+#elif defined(HAS_RGB24TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    RGB24ToARGBRow = RGB24ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      RGB24ToARGBRow = RGB24ToARGBRow_NEON;
+    }
  }
 #endif

-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
-                      uint8* dst_u, uint8* dst_v, int width);
-
-  ARGBToYRow = ARGBToYRow_C;
-  ARGBToUVRow = ARGBToUVRow_C;
-#if defined(HAS_ARGBTOYROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    if (width > 16) {
-      ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
-    }
-    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+                      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+#if defined(HAS_ARGBTOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
    if (IS_ALIGNED(width, 16)) {
      ARGBToUVRow = ARGBToUVRow_SSSE3;
+    }
+  }
+#endif
+
+#if defined(HAS_RGB24TOYROW_NEON)
+  void (*RGB24ToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+      RGB24ToYRow_C;
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    RGB24ToYRow = RGB24ToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      RGB24ToYRow = RGB24ToYRow_NEON;
+    }
+  }
+#else
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+      ARGBToYRow_C;
+#if defined(HAS_ARGBTOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
      if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
        ARGBToYRow = ARGBToYRow_SSSE3;
      }
    }
  }
-#endif
+#endif  // HAS_ARGBTOUVROW_SSSE3
+#endif  // HAS_RGB24TOYROW_NEON

  for (int y = 0; y < height - 1; y += 2) {
    RGB24ToARGBRow(src_rgb24, row, width);
    RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kMaxStride, width);
    ARGBToUVRow(row, kMaxStride, dst_u, dst_v, width);
+#if defined(HAS_RGB24TOYROW_NEON)
+    RGB24ToYRow(src_rgb24, dst_y, width);
+    RGB24ToYRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width);
+#else
    ARGBToYRow(row, dst_y, width);
    ARGBToYRow(row + kMaxStride, dst_y + dst_stride_y, width);
+#endif
    src_rgb24 += src_stride_rgb24 * 2;
    dst_y += dst_stride_y * 2;
    dst_u += dst_stride_u;
@ -1225,23 +1241,27 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
  if (height & 1) {
    RGB24ToARGBRow_C(src_rgb24, row, width);
    ARGBToUVRow(row, 0, dst_u, dst_v, width);
+#if defined(HAS_RGB24TOYROW_NEON)
+    RGB24ToYRow(src_rgb24, dst_y, width);
+#else
    ARGBToYRow(row, dst_y, width);
+#endif
  }
  return 0;
 }

+// Convert RAW to I420.
+// Same as RGB24 but RGB vs BGR
 LIBYUV_API
 int RAWToI420(const uint8* src_raw, int src_stride_raw,
              uint8* dst_y, int dst_stride_y,
              uint8* dst_u, int dst_stride_u,
              uint8* dst_v, int dst_stride_v,
              int width, int height) {
-  if (width * 4 > kMaxStride) {  // Row buffer is required.
+  if (!src_raw || !dst_y || !dst_u || !dst_v ||
+      width <= 0 || height == 0 ||
+      width * 4 > kMaxStride) {
    return -1;
-  } else if (!src_raw ||
-             !dst_y || !dst_u || !dst_v ||
-             width <= 0 || height == 0) {
-      return -1;
  }
  // Negative height means invert the image.
  if (height < 0) {
@ -1250,44 +1270,71 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw,
    src_stride_raw = -src_stride_raw;
  }
  SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
-  void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix);
-
-  RAWToARGBRow = RAWToARGBRow_C;
+  void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
+      RAWToARGBRow_C;
 #if defined(HAS_RAWTOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) &&
-      TestReadSafe(src_raw, src_stride_raw, width, height, 3, 48)) {
-    RAWToARGBRow = RAWToARGBRow_SSSE3;
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+    RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      RAWToARGBRow = RAWToARGBRow_SSSE3;
+    }
+  }
+#elif defined(HAS_RAWTOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    RAWToARGBRow = RAWToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      RAWToARGBRow = RAWToARGBRow_NEON;
+    }
  }
 #endif

-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
-                      uint8* dst_u, uint8* dst_v, int width);
-
-  ARGBToYRow = ARGBToYRow_C;
-  ARGBToUVRow = ARGBToUVRow_C;
-#if defined(HAS_ARGBTOYROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    if (width > 16) {
-      ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
-    }
-    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+                      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+#if defined(HAS_ARGBTOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
    if (IS_ALIGNED(width, 16)) {
      ARGBToUVRow = ARGBToUVRow_SSSE3;
+    }
+  }
+#endif
+
+#if defined(HAS_RAWTOYROW_NEON)
+  void (*RAWToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+      RAWToYRow_C;
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    RAWToYRow = RAWToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      RAWToYRow = RAWToYRow_NEON;
+    }
+  }
+#else
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+      ARGBToYRow_C;
+#if defined(HAS_ARGBTOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
      if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
        ARGBToYRow = ARGBToYRow_SSSE3;
      }
    }
  }
-#endif
+#endif  // HAS_ARGBTOUVROW_SSSE3
+#endif  // HAS_RAWTOYROW_NEON

  for (int y = 0; y < height - 1; y += 2) {
    RAWToARGBRow(src_raw, row, width);
    RAWToARGBRow(src_raw + src_stride_raw, row + kMaxStride, width);
    ARGBToUVRow(row, kMaxStride, dst_u, dst_v, width);
+#if defined(HAS_RAWTOYROW_NEON)
+    RAWToYRow(src_raw, dst_y, width);
+    RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width);
+#else
    ARGBToYRow(row, dst_y, width);
    ARGBToYRow(row + kMaxStride, dst_y + dst_stride_y, width);
+#endif
    src_raw += src_stride_raw * 2;
    dst_y += dst_stride_y * 2;
    dst_u += dst_stride_u;
@ -1296,22 +1343,25 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw,
  if (height & 1) {
    RAWToARGBRow_C(src_raw, row, width);
    ARGBToUVRow(row, 0, dst_u, dst_v, width);
+#if defined(HAS_RAWTOYROW_NEON)
+    RAWToYRow(src_raw, dst_y, width);
+#else
    ARGBToYRow(row, dst_y, width);
+#endif
  }
  return 0;
 }

+// Convert RGB565 to I420.
 LIBYUV_API
 int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565,
-                 uint8* dst_y, int dst_stride_y,
-                 uint8* dst_u, int dst_stride_u,
-                 uint8* dst_v, int dst_stride_v,
-                 int width, int height) {
-  if (width * 4 > kMaxStride) {  // Row buffer is required.
-    return -1;
-  } else if (!src_rgb565 ||
-             !dst_y || !dst_u || !dst_v ||
-             width <= 0 || height == 0) {
+                uint8* dst_y, int dst_stride_y,
+                uint8* dst_u, int dst_stride_u,
+                uint8* dst_v, int dst_stride_v,
+                int width, int height) {
+  if (!src_rgb565 || !dst_y || !dst_u || !dst_v ||
+      width <= 0 || height == 0 ||
+      width * 4 > kMaxStride) {
    return -1;
  }
  // Negative height means invert the image.
@ -1321,44 +1371,71 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565,
    src_stride_rgb565 = -src_stride_rgb565;
  }
  SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
-  void (*RGB565ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix);
-
-  RGB565ToARGBRow = RGB565ToARGBRow_C;
+  void (*RGB565ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
+      RGB565ToARGBRow_C;
 #if defined(HAS_RGB565TOARGBROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      TestReadSafe(src_rgb565, src_stride_rgb565, width, height, 2, 16)) {
-    RGB565ToARGBRow = RGB565ToARGBRow_SSE2;
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 8) {
+    RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2;
+    if (IS_ALIGNED(width, 8)) {
+      RGB565ToARGBRow = RGB565ToARGBRow_SSE2;
+    }
+  }
+#elif defined(HAS_RGB565TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    RGB565ToARGBRow = RGB565ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      RGB565ToARGBRow = RGB565ToARGBRow_NEON;
+    }
  }
 #endif

-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
-                      uint8* dst_u, uint8* dst_v, int width);
-
-  ARGBToYRow = ARGBToYRow_C;
-  ARGBToUVRow = ARGBToUVRow_C;
-#if defined(HAS_ARGBTOYROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    if (width > 16) {
-      ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
-    }
-    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+                      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+#if defined(HAS_ARGBTOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
    if (IS_ALIGNED(width, 16)) {
      ARGBToUVRow = ARGBToUVRow_SSSE3;
+    }
+  }
+#endif
+
+#if defined(HAS_RGB565TOYROW_NEON)
+  void (*RGB565ToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+      RGB565ToYRow_C;
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    RGB565ToYRow = RGB565ToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      RGB565ToYRow = RGB565ToYRow_NEON;
+    }
+  }
+#else
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+      ARGBToYRow_C;
+#if defined(HAS_ARGBTOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
      if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
        ARGBToYRow = ARGBToYRow_SSSE3;
      }
    }
  }
-#endif
+#endif  // HAS_ARGBTOUVROW_SSSE3
+#endif  // HAS_RGB565TOYROW_NEON

  for (int y = 0; y < height - 1; y += 2) {
    RGB565ToARGBRow(src_rgb565, row, width);
    RGB565ToARGBRow(src_rgb565 + src_stride_rgb565, row + kMaxStride, width);
    ARGBToUVRow(row, kMaxStride, dst_u, dst_v, width);
+#if defined(HAS_RGB565TOYROW_NEON)
+    RGB565ToYRow(src_rgb565, dst_y, width);
+    RGB565ToYRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y, width);
+#else
    ARGBToYRow(row, dst_y, width);
    ARGBToYRow(row + kMaxStride, dst_y + dst_stride_y, width);
+#endif
    src_rgb565 += src_stride_rgb565 * 2;
    dst_y += dst_stride_y * 2;
    dst_u += dst_stride_u;
@ -1367,7 +1444,11 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565,
  if (height & 1) {
    RGB565ToARGBRow_C(src_rgb565, row, width);
    ARGBToUVRow(row, 0, dst_u, dst_v, width);
+#if defined(HAS_RGB565TOYROW_NEON)
+    RGB565ToYRow(src_rgb565, dst_y, width);
+#else
    ARGBToYRow(row, dst_y, width);
+#endif
  }
  return 0;
 }
@ -1378,12 +1459,10 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555,
                 uint8* dst_u, int dst_stride_u,
                 uint8* dst_v, int dst_stride_v,
                 int width, int height) {
-  if (width * 4 > kMaxStride) {  // Row buffer is required.
+  if (!src_argb1555 || !dst_y || !dst_u || !dst_v ||
+      width <= 0 || height == 0 ||
+      width * 4 > kMaxStride) {
    return -1;
-  } else if (!src_argb1555 ||
-             !dst_y || !dst_u || !dst_v ||
-             width <= 0 || height == 0) {
-      return -1;
  }
  // Negative height means invert the image.
  if (height < 0) {
@ -1409,10 +1488,8 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555,
  ARGBToYRow = ARGBToYRow_C;
  ARGBToUVRow = ARGBToUVRow_C;
 #if defined(HAS_ARGBTOYROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    if (width > 16) {
-      ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
-    }
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
    ARGBToYRow = ARGBToYRow_Any_SSSE3;
    if (IS_ALIGNED(width, 16)) {
      ARGBToUVRow = ARGBToUVRow_SSSE3;
@ -1450,12 +1527,10 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444,
                   uint8* dst_u, int dst_stride_u,
                   uint8* dst_v, int dst_stride_v,
                   int width, int height) {
-  if (width * 4 > kMaxStride) {  // Row buffer is required.
+  if (!src_argb4444 || !dst_y || !dst_u || !dst_v ||
+      width <= 0 || height == 0 ||
+      width * 4 > kMaxStride) {
    return -1;
-  } else if (!src_argb4444 ||
-             !dst_y || !dst_u || !dst_v ||
-             width <= 0 || height == 0) {
-      return -1;
  }
  // Negative height means invert the image.
  if (height < 0) {
@ -1481,10 +1556,8 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444,
  ARGBToYRow = ARGBToYRow_C;
  ARGBToUVRow = ARGBToUVRow_C;
 #if defined(HAS_ARGBTOYROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    if (width > 16) {
-      ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
-    }
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
    ARGBToYRow = ARGBToYRow_Any_SSSE3;
    if (IS_ALIGNED(width, 16)) {
      ARGBToUVRow = ARGBToUVRow_SSSE3;
--- a/source/convert_argb.cc
+++ b/source/convert_argb.cc
@ -405,8 +405,8 @@ int RGBAToARGB(const uint8* src_rgba, int src_stride_rgba,
 // Convert RGB24 to ARGB.
 LIBYUV_API
 int RGB24ToARGB(const uint8* src_rgb24, int src_stride_rgb24,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
+                uint8* dst_argb, int dst_stride_argb,
+                int width, int height) {
  if (!src_rgb24 || !dst_argb ||
      width <= 0 || height == 0) {
    return -1;
@ -417,16 +417,22 @@ int RGB24ToARGB(const uint8* src_rgb24, int src_stride_rgb24,
    src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24;
    src_stride_rgb24 = -src_stride_rgb24;
  }
-  void (*RGB24ToARGBRow)(const uint8* src_rgb24, uint8* dst_argb, int pix) =
+  void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
      RGB24ToARGBRow_C;
 #if defined(HAS_RGB24TOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) &&
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16 &&
      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-    RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
+    RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
+    }
  }
 #elif defined(HAS_RGB24TOARGBROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
-    RGB24ToARGBRow = RGB24ToARGBRow_NEON;
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    RGB24ToARGBRow = RGB24ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      RGB24ToARGBRow = RGB24ToARGBRow_NEON;
+    }
  }
 #endif

@ -441,8 +447,8 @@ int RGB24ToARGB(const uint8* src_rgb24, int src_stride_rgb24,
 // Convert RAW to ARGB.
 LIBYUV_API
 int RAWToARGB(const uint8* src_raw, int src_stride_raw,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
+              uint8* dst_argb, int dst_stride_argb,
+              int width, int height) {
  if (!src_raw || !dst_argb ||
      width <= 0 || height == 0) {
    return -1;
@ -453,16 +459,22 @@ int RAWToARGB(const uint8* src_raw, int src_stride_raw,
    src_raw = src_raw + (height - 1) * src_stride_raw;
    src_stride_raw = -src_stride_raw;
  }
-  void (*RAWToARGBRow)(const uint8* src_raw, uint8* dst_argb, int pix) =
+  void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
      RAWToARGBRow_C;
 #if defined(HAS_RAWTOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) &&
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16 &&
      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-    RAWToARGBRow = RAWToARGBRow_SSSE3;
+    RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      RAWToARGBRow = RAWToARGBRow_SSSE3;
+    }
  }
 #elif defined(HAS_RAWTOARGBROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
-    RAWToARGBRow = RAWToARGBRow_NEON;
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    RAWToARGBRow = RAWToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      RAWToARGBRow = RAWToARGBRow_NEON;
+    }
  }
 #endif

@ -492,10 +504,19 @@ int RGB565ToARGB(const uint8* src_rgb565, int src_stride_rgb565,
  void (*RGB565ToARGBRow)(const uint8* src_rgb565, uint8* dst_argb, int pix) =
      RGB565ToARGBRow_C;
 #if defined(HAS_RGB565TOARGBROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      IS_ALIGNED(width, 8) &&
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 8 &&
      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-    RGB565ToARGBRow = RGB565ToARGBRow_SSE2;
+    RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2;
+    if (IS_ALIGNED(width, 8)) {
+      RGB565ToARGBRow = RGB565ToARGBRow_SSE2;
+    }
+  }
+#elif defined(HAS_RGB565TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    RGB565ToARGBRow = RGB565ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      RGB565ToARGBRow = RGB565ToARGBRow_NEON;
+    }
  }
 #endif

--- a/source/convert_from.cc
+++ b/source/convert_from.cc
@ -265,25 +265,25 @@ LIBYUV_API
 int I422ToYUY2(const uint8* src_y, int src_stride_y,
               const uint8* src_u, int src_stride_u,
               const uint8* src_v, int src_stride_v,
-               uint8* dst_frame, int dst_stride_frame,
+               uint8* dst_yuy2, int dst_stride_yuy2,
               int width, int height) {
-  if (!src_y || !src_u || !src_v || !dst_frame ||
+  if (!src_y || !src_u || !src_v || !dst_yuy2 ||
      width <= 0 || height == 0) {
    return -1;
  }
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
-    dst_frame = dst_frame + (height - 1) * dst_stride_frame;
-    dst_stride_frame = -dst_stride_frame;
+    dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2;
+    dst_stride_yuy2 = -dst_stride_yuy2;
  }
  void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u,
-                        const uint8* src_v, uint8* dst_frame, int width) =
+                        const uint8* src_v, uint8* dst_yuy2, int width) =
      I422ToYUY2Row_C;
 #if defined(HAS_I422TOYUY2ROW_SSE2)
  if (TestCpuFlag(kCpuHasSSE2) && width >= 16 &&
      IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
-      IS_ALIGNED(dst_frame, 16) && IS_ALIGNED(dst_stride_frame, 16)) {
+      IS_ALIGNED(dst_yuy2, 16) && IS_ALIGNED(dst_stride_yuy2, 16)) {
    I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
    if (IS_ALIGNED(width, 16)) {
      I422ToYUY2Row = I422ToYUY2Row_SSE2;
@ -299,11 +299,11 @@ int I422ToYUY2(const uint8* src_y, int src_stride_y,
 #endif

  for (int y = 0; y < height; ++y) {
-    I422ToYUY2Row(src_y, src_u, src_y, dst_frame, width);
+    I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width);
    src_y += src_stride_y;
    src_u += src_stride_u;
    src_v += src_stride_v;
-    dst_frame += dst_stride_frame;
+    dst_yuy2 += dst_stride_yuy2;
  }
  return 0;
 }
@ -312,25 +312,25 @@ LIBYUV_API
 int I420ToYUY2(const uint8* src_y, int src_stride_y,
               const uint8* src_u, int src_stride_u,
               const uint8* src_v, int src_stride_v,
-               uint8* dst_frame, int dst_stride_frame,
+               uint8* dst_yuy2, int dst_stride_yuy2,
               int width, int height) {
-  if (!src_y || !src_u || !src_v || !dst_frame ||
+  if (!src_y || !src_u || !src_v || !dst_yuy2 ||
      width <= 0 || height == 0) {
    return -1;
  }
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
-    dst_frame = dst_frame + (height - 1) * dst_stride_frame;
-    dst_stride_frame = -dst_stride_frame;
+    dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2;
+    dst_stride_yuy2 = -dst_stride_yuy2;
  }
  void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u,
-                        const uint8* src_v, uint8* dst_frame, int width) =
+                        const uint8* src_v, uint8* dst_yuy2, int width) =
      I422ToYUY2Row_C;
 #if defined(HAS_I422TOYUY2ROW_SSE2)
  if (TestCpuFlag(kCpuHasSSE2) && width >= 16 &&
      IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
-      IS_ALIGNED(dst_frame, 16) && IS_ALIGNED(dst_stride_frame, 16)) {
+      IS_ALIGNED(dst_yuy2, 16) && IS_ALIGNED(dst_stride_yuy2, 16)) {
    I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
    if (IS_ALIGNED(width, 16)) {
      I422ToYUY2Row = I422ToYUY2Row_SSE2;
@ -346,16 +346,16 @@ int I420ToYUY2(const uint8* src_y, int src_stride_y,
 #endif

  for (int y = 0; y < height - 1; y += 2) {
-    I422ToYUY2Row(src_y, src_u, src_v, dst_frame, width);
+    I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width);
    I422ToYUY2Row(src_y + src_stride_y, src_u, src_v,
-                  dst_frame + dst_stride_frame, width);
+                  dst_yuy2 + dst_stride_yuy2, width);
    src_y += src_stride_y * 2;
    src_u += src_stride_u;
    src_v += src_stride_v;
-    dst_frame += dst_stride_frame * 2;
+    dst_yuy2 += dst_stride_yuy2 * 2;
  }
  if (height & 1) {
-    I422ToYUY2Row(src_y, src_u, src_v, dst_frame, width);
+    I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width);
  }
  return 0;
 }
@ -365,25 +365,25 @@ LIBYUV_API
 int I422ToUYVY(const uint8* src_y, int src_stride_y,
               const uint8* src_u, int src_stride_u,
               const uint8* src_v, int src_stride_v,
-               uint8* dst_frame, int dst_stride_frame,
+               uint8* dst_uyvy, int dst_stride_uyvy,
               int width, int height) {
-  if (!src_y || !src_u || !src_v || !dst_frame ||
+  if (!src_y || !src_u || !src_v || !dst_uyvy ||
      width <= 0 || height == 0) {
    return -1;
  }
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
-    dst_frame = dst_frame + (height - 1) * dst_stride_frame;
-    dst_stride_frame = -dst_stride_frame;
+    dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy;
+    dst_stride_uyvy = -dst_stride_uyvy;
  }
  void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u,
-                        const uint8* src_v, uint8* dst_frame, int width) =
+                        const uint8* src_v, uint8* dst_uyvy, int width) =
      I422ToUYVYRow_C;
 #if defined(HAS_I422TOUYVYROW_SSE2)
  if (TestCpuFlag(kCpuHasSSE2) && width >= 16 &&
      IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
-      IS_ALIGNED(dst_frame, 16) && IS_ALIGNED(dst_stride_frame, 16)) {
+      IS_ALIGNED(dst_uyvy, 16) && IS_ALIGNED(dst_stride_uyvy, 16)) {
    I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
    if (IS_ALIGNED(width, 16)) {
      I422ToUYVYRow = I422ToUYVYRow_SSE2;
@ -399,11 +399,11 @@ int I422ToUYVY(const uint8* src_y, int src_stride_y,
 #endif

  for (int y = 0; y < height; ++y) {
-    I422ToUYVYRow(src_y, src_u, src_y, dst_frame, width);
+    I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
    src_y += src_stride_y;
    src_u += src_stride_u;
    src_v += src_stride_v;
-    dst_frame += dst_stride_frame;
+    dst_uyvy += dst_stride_uyvy;
  }
  return 0;
 }
@ -412,25 +412,25 @@ LIBYUV_API
 int I420ToUYVY(const uint8* src_y, int src_stride_y,
               const uint8* src_u, int src_stride_u,
               const uint8* src_v, int src_stride_v,
-               uint8* dst_frame, int dst_stride_frame,
+               uint8* dst_uyvy, int dst_stride_uyvy,
               int width, int height) {
-  if (!src_y || !src_u || !src_v || !dst_frame ||
+  if (!src_y || !src_u || !src_v || !dst_uyvy ||
      width <= 0 || height == 0) {
    return -1;
  }
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
-    dst_frame = dst_frame + (height - 1) * dst_stride_frame;
-    dst_stride_frame = -dst_stride_frame;
+    dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy;
+    dst_stride_uyvy = -dst_stride_uyvy;
  }
  void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u,
-                        const uint8* src_v, uint8* dst_frame, int width) =
+                        const uint8* src_v, uint8* dst_uyvy, int width) =
      I422ToUYVYRow_C;
 #if defined(HAS_I422TOUYVYROW_SSE2)
  if (TestCpuFlag(kCpuHasSSE2) && width >= 16 &&
      IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
-      IS_ALIGNED(dst_frame, 16) && IS_ALIGNED(dst_stride_frame, 16)) {
+      IS_ALIGNED(dst_uyvy, 16) && IS_ALIGNED(dst_stride_uyvy, 16)) {
    I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
    if (IS_ALIGNED(width, 16)) {
      I422ToUYVYRow = I422ToUYVYRow_SSE2;
@ -446,16 +446,16 @@ int I420ToUYVY(const uint8* src_y, int src_stride_y,
 #endif

  for (int y = 0; y < height - 1; y += 2) {
-    I422ToUYVYRow(src_y, src_u, src_v, dst_frame, width);
+    I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
    I422ToUYVYRow(src_y + src_stride_y, src_u, src_v,
-                  dst_frame + dst_stride_frame, width);
+                  dst_uyvy + dst_stride_uyvy, width);
    src_y += src_stride_y * 2;
    src_u += src_stride_u;
    src_v += src_stride_v;
-    dst_frame += dst_stride_frame * 2;
+    dst_uyvy += dst_stride_uyvy * 2;
  }
  if (height & 1) {
-    I422ToUYVYRow(src_y, src_u, src_v, dst_frame, width);
+    I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
  }
  return 0;
 }
@ -464,35 +464,35 @@ LIBYUV_API
 int I420ToV210(const uint8* src_y, int src_stride_y,
               const uint8* src_u, int src_stride_u,
               const uint8* src_v, int src_stride_v,
-               uint8* dst_frame, int dst_stride_frame,
+               uint8* dst_v210, int dst_stride_v210,
               int width, int height) {
  if (width * 16 / 6 > kMaxStride ||
-      !src_y || !src_u || !src_v || !dst_frame ||
+      !src_y || !src_u || !src_v || !dst_v210 ||
      width <= 0 || height == 0) {
    return -1;
  }
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
-    dst_frame = dst_frame + (height - 1) * dst_stride_frame;
-    dst_stride_frame = -dst_stride_frame;
+    dst_v210 = dst_v210 + (height - 1) * dst_stride_v210;
+    dst_stride_v210 = -dst_stride_v210;
  }

  SIMD_ALIGNED(uint8 row[kMaxStride]);

  for (int y = 0; y < height - 1; y += 2) {
    I422ToUYVYRow_C(src_y, src_u, src_v, row, width);
-    UYVYToV210Row_C(row, dst_frame, width);
+    UYVYToV210Row_C(row, dst_v210, width);
    I422ToUYVYRow_C(src_y + src_stride_y, src_u, src_v, row, width);
-    UYVYToV210Row_C(row, dst_frame + dst_stride_frame, width);
+    UYVYToV210Row_C(row, dst_v210 + dst_stride_v210, width);
    src_y += src_stride_y * 2;
    src_u += src_stride_u;
    src_v += src_stride_v;
-    dst_frame += dst_stride_frame * 2;
+    dst_v210 += dst_stride_v210 * 2;
  }
  if (height & 1) {
    I422ToUYVYRow_C(src_y, src_u, src_v, row, width);
-    UYVYToV210Row_C(row, dst_frame, width);
+    UYVYToV210Row_C(row, dst_v210, width);
  }
  return 0;
 }
@ -521,7 +521,7 @@ int I420ToNV12(const uint8* src_y, int src_stride_y,
  int halfwidth = (width + 1) >> 1;
  void (*MergeUV)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                  int width) = MergeUV_C;
-#if defined(HAS_SPLITUV_SSE2)
+#if defined(HAS_MERGEUV_SSE2)
  if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) {
    MergeUV = MergeUV_Any_SSE2;
    if (IS_ALIGNED(halfwidth, 16)) {
@ -534,7 +534,7 @@ int I420ToNV12(const uint8* src_y, int src_stride_y,
    }
  }
 #endif
-#if defined(HAS_SPLITUV_AVX2)
+#if defined(HAS_MERGEUV_AVX2)
  if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) {
    MergeUV = MergeUV_Any_AVX2;
    if (IS_ALIGNED(halfwidth, 32)) {
@ -547,7 +547,7 @@ int I420ToNV12(const uint8* src_y, int src_stride_y,
    }
  }
 #endif
-#if defined(HAS_SPLITUV_NEON)
+#if defined(HAS_MERGEUV_NEON)
  if (TestCpuFlag(kCpuHasNEON) && halfwidth >= 16) {
    MergeUV = MergeUV_Any_NEON;
    if (IS_ALIGNED(halfwidth, 16)) {
--- a/source/convert_from_argb.cc
+++ b/source/convert_from_argb.cc
@ -21,6 +21,522 @@ namespace libyuv {
 extern "C" {
 #endif

+// ARGB little endian (bgra in memory) to I444
+LIBYUV_API
+int ARGBToI444(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+      ARGBToYRow_C;
+#if defined(HAS_ARGBTOYROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
+          IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+        ARGBToYRow = ARGBToYRow_SSSE3;
+      }
+    }
+  }
+#elif defined(HAS_ARGBTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    ARGBToYRow = ARGBToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYRow = ARGBToYRow_NEON;
+    }
+  }
+#endif
+
+  for (int y = 0; y < height; ++y) {
+    ARGBToUV444Row_C(src_argb, dst_u, dst_v, width);
+    ARGBToYRow(src_argb, dst_y, width);
+    src_argb += src_stride_argb;
+    dst_y += dst_stride_y;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  return 0;
+}
+
+// ARGB little endian (bgra in memory) to I422
+LIBYUV_API
+int ARGBToI422(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  void (*ARGBToUV422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+                         int pix) = ARGBToUV422Row_C;
+#if defined(HAS_ARGBTOUV422ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+    ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUV422Row = ARGBToUV422Row_Unaligned_SSSE3;
+      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
+          IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+        ARGBToUV422Row = ARGBToUV422Row_SSSE3;
+      }
+    }
+  }
+#endif
+
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+      ARGBToYRow_C;
+#if defined(HAS_ARGBTOYROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
+          IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+        ARGBToYRow = ARGBToYRow_SSSE3;
+      }
+    }
+  }
+#elif defined(HAS_ARGBTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    ARGBToYRow = ARGBToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYRow = ARGBToYRow_NEON;
+    }
+  }
+#endif
+
+  for (int y = 0; y < height; ++y) {
+    ARGBToUV422Row(src_argb, dst_u, dst_v, width);
+    ARGBToYRow(src_argb, dst_y, width);
+    src_argb += src_stride_argb;
+    dst_y += dst_stride_y;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  return 0;
+}
+
+// ARGB little endian (bgra in memory) to I411
+LIBYUV_API
+int ARGBToI411(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+      ARGBToYRow_C;
+#if defined(HAS_ARGBTOYROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
+          IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+        ARGBToYRow = ARGBToYRow_SSSE3;
+      }
+    }
+  }
+#elif defined(HAS_ARGBTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    ARGBToYRow = ARGBToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYRow = ARGBToYRow_NEON;
+    }
+  }
+#endif
+
+  for (int y = 0; y < height; ++y) {
+    ARGBToUV411Row_C(src_argb, dst_u, dst_v, width);
+    ARGBToYRow(src_argb, dst_y, width);
+    src_argb += src_stride_argb;
+    dst_y += dst_stride_y;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  return 0;
+}
+
+LIBYUV_API
+int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_uv, int dst_stride_uv,
+               int width, int height) {
+  if (!src_argb ||
+      !dst_y || !dst_uv ||
+      width <= 0 || height == 0 ||
+      width > kMaxStride) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+                      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+      ARGBToYRow_C;
+#if defined(HAS_ARGBTOYROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_Unaligned_SSSE3;
+      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+        ARGBToUVRow = ARGBToUVRow_SSSE3;
+        if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+          ARGBToYRow = ARGBToYRow_SSSE3;
+        }
+      }
+    }
+  }
+#elif defined(HAS_ARGBTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    ARGBToYRow = ARGBToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYRow = ARGBToYRow_NEON;
+    }
+  }
+#endif
+  int halfwidth = (width + 1) >> 1;
+  void (*MergeUV)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                  int width) = MergeUV_C;
+#if defined(HAS_MERGEUV_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) {
+    MergeUV = MergeUV_Any_SSE2;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUV = MergeUV_Unaligned_SSE2;
+      if (IS_ALIGNED(dst_uv, 16) && IS_ALIGNED(dst_stride_uv, 16)) {
+        MergeUV = MergeUV_SSE2;
+      }
+    }
+  }
+#endif
+#if defined(HAS_MERGEUV_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) {
+    MergeUV = MergeUV_Any_AVX2;
+    if (IS_ALIGNED(halfwidth, 32)) {
+      MergeUV = MergeUV_Unaligned_AVX2;
+      if (IS_ALIGNED(dst_uv, 32) && IS_ALIGNED(dst_stride_uv, 32)) {
+        MergeUV = MergeUV_AVX2;
+      }
+    }
+  }
+#endif
+#if defined(HAS_MERGEUV_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && halfwidth >= 16) {
+    MergeUV = MergeUV_Any_NEON;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUV = MergeUV_Unaligned_NEON;
+      if (IS_ALIGNED(dst_uv, 16) && IS_ALIGNED(dst_stride_uv, 16)) {
+        MergeUV = MergeUV_NEON;
+      }
+    }
+  }
+#endif
+
+  SIMD_ALIGNED(uint8 row_u[kMaxStride / 2]);
+  SIMD_ALIGNED(uint8 row_v[kMaxStride / 2]);
+
+  for (int y = 0; y < height - 1; y += 2) {
+    ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
+    MergeUV(row_u, row_v, dst_uv, halfwidth);
+    ARGBToYRow(src_argb, dst_y, width);
+    ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
+    src_argb += src_stride_argb * 2;
+    dst_y += dst_stride_y * 2;
+    dst_uv += dst_stride_uv;
+  }
+  if (height & 1) {
+    ARGBToUVRow(src_argb, 0, row_u, row_v, width);
+    MergeUV(row_u, row_v, dst_uv, halfwidth);
+    ARGBToYRow(src_argb, dst_y, width);
+    ARGBToYRow(src_argb + 0, dst_y + dst_stride_y, width);
+  }
+  return 0;
+}
+
+// Same as NV12 but U and V swapped.
+LIBYUV_API
+int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_uv, int dst_stride_uv,
+               int width, int height) {
+  if (!src_argb ||
+      !dst_y || !dst_uv ||
+      width <= 0 || height == 0 ||
+      width > kMaxStride) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+                      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+      ARGBToYRow_C;
+#if defined(HAS_ARGBTOYROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_Unaligned_SSSE3;
+      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+        ARGBToUVRow = ARGBToUVRow_SSSE3;
+        if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+          ARGBToYRow = ARGBToYRow_SSSE3;
+        }
+      }
+    }
+  }
+#elif defined(HAS_ARGBTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    ARGBToYRow = ARGBToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYRow = ARGBToYRow_NEON;
+    }
+  }
+#endif
+  int halfwidth = (width + 1) >> 1;
+  void (*MergeUV)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                  int width) = MergeUV_C;
+#if defined(HAS_MERGEUV_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) {
+    MergeUV = MergeUV_Any_SSE2;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUV = MergeUV_Unaligned_SSE2;
+      if (IS_ALIGNED(dst_uv, 16) && IS_ALIGNED(dst_stride_uv, 16)) {
+        MergeUV = MergeUV_SSE2;
+      }
+    }
+  }
+#endif
+#if defined(HAS_MERGEUV_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) {
+    MergeUV = MergeUV_Any_AVX2;
+    if (IS_ALIGNED(halfwidth, 32)) {
+      MergeUV = MergeUV_Unaligned_AVX2;
+      if (IS_ALIGNED(dst_uv, 32) && IS_ALIGNED(dst_stride_uv, 32)) {
+        MergeUV = MergeUV_AVX2;
+      }
+    }
+  }
+#endif
+#if defined(HAS_MERGEUV_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && halfwidth >= 16) {
+    MergeUV = MergeUV_Any_NEON;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUV = MergeUV_Unaligned_NEON;
+      if (IS_ALIGNED(dst_uv, 16) && IS_ALIGNED(dst_stride_uv, 16)) {
+        MergeUV = MergeUV_NEON;
+      }
+    }
+  }
+#endif
+
+  SIMD_ALIGNED(uint8 row_u[kMaxStride / 2]);
+  SIMD_ALIGNED(uint8 row_v[kMaxStride / 2]);
+
+  for (int y = 0; y < height - 1; y += 2) {
+    ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
+    MergeUV(row_v, row_u, dst_uv, halfwidth);
+    ARGBToYRow(src_argb, dst_y, width);
+    ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
+    src_argb += src_stride_argb * 2;
+    dst_y += dst_stride_y * 2;
+    dst_uv += dst_stride_uv;
+  }
+  if (height & 1) {
+    ARGBToUVRow(src_argb, 0, row_u, row_v, width);
+    MergeUV(row_v, row_u, dst_uv, halfwidth);
+    ARGBToYRow(src_argb, dst_y, width);
+    ARGBToYRow(src_argb + 0, dst_y + dst_stride_y, width);
+  }
+  return 0;
+}
+
+// Convert ARGB to YUY2.
+LIBYUV_API
+int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_yuy2, int dst_stride_yuy2,
+               int width, int height) {
+  if (!src_argb || !dst_yuy2 ||
+      width <= 0 || height == 0 ||
+      width > kMaxStride) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2;
+    dst_stride_yuy2 = -dst_stride_yuy2;
+  }
+
+  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+                      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+      ARGBToYRow_C;
+#if defined(HAS_ARGBTOYROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_Unaligned_SSSE3;
+      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+        ARGBToUVRow = ARGBToUVRow_SSSE3;
+        ARGBToYRow = ARGBToYRow_SSSE3;
+      }
+    }
+  }
+#elif defined(HAS_ARGBTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    ARGBToYRow = ARGBToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYRow = ARGBToYRow_NEON;
+    }
+  }
+#endif
+
+  void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u,
+                        const uint8* src_v, uint8* dst_yuy2, int width) =
+      I422ToYUY2Row_C;
+#if defined(HAS_I422TOYUY2ROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 16 &&
+      IS_ALIGNED(dst_yuy2, 16) && IS_ALIGNED(dst_stride_yuy2, 16)) {
+    I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToYUY2Row = I422ToYUY2Row_SSE2;
+    }
+  }
+#elif defined(HAS_I422TOYUY2ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+    I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToYUY2Row = I422ToYUY2Row_NEON;
+    }
+  }
+#endif
+  SIMD_ALIGNED(uint8 row_y[kMaxStride]);
+  SIMD_ALIGNED(uint8 row_u[kMaxStride / 2]);
+  SIMD_ALIGNED(uint8 row_v[kMaxStride / 2]);
+
+  for (int y = 0; y < height; ++y) {
+    ARGBToUVRow(src_argb, 0, row_u, row_v, width);
+    ARGBToYRow(src_argb, row_y, width);
+    I422ToYUY2Row(row_y, row_u, row_v, dst_yuy2, width);
+    src_argb += src_stride_argb;
+    dst_yuy2 += dst_stride_yuy2;
+  }
+  return 0;
+}
+
+// Convert ARGB to UYVY.
+LIBYUV_API
+int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_uyvy, int dst_stride_uyvy,
+               int width, int height) {
+  if (!src_argb || !dst_uyvy ||
+      width <= 0 || height == 0 ||
+      width > kMaxStride) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy;
+    dst_stride_uyvy = -dst_stride_uyvy;
+  }
+
+  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+                      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+      ARGBToYRow_C;
+#if defined(HAS_ARGBTOYROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_Unaligned_SSSE3;
+      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+        ARGBToUVRow = ARGBToUVRow_SSSE3;
+        ARGBToYRow = ARGBToYRow_SSSE3;
+      }
+    }
+  }
+#elif defined(HAS_ARGBTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    ARGBToYRow = ARGBToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYRow = ARGBToYRow_NEON;
+    }
+  }
+#endif
+
+  void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u,
+                        const uint8* src_v, uint8* dst_uyvy, int width) =
+      I422ToUYVYRow_C;
+#if defined(HAS_I422TOUYVYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 16 &&
+      IS_ALIGNED(dst_uyvy, 16) && IS_ALIGNED(dst_stride_uyvy, 16)) {
+    I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToUYVYRow = I422ToUYVYRow_SSE2;
+    }
+  }
+#elif defined(HAS_I422TOUYVYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+    I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToUYVYRow = I422ToUYVYRow_NEON;
+    }
+  }
+#endif
+  SIMD_ALIGNED(uint8 row_y[kMaxStride]);
+  SIMD_ALIGNED(uint8 row_u[kMaxStride / 2]);
+  SIMD_ALIGNED(uint8 row_v[kMaxStride / 2]);
+
+  for (int y = 0; y < height; ++y) {
+    ARGBToUVRow(src_argb, 0, row_u, row_v, width);
+    ARGBToYRow(src_argb, row_y, width);
+    I422ToUYVYRow(row_y, row_u, row_v, dst_uyvy, width);
+    src_argb += src_stride_argb;
+    dst_uyvy += dst_stride_uyvy;
+  }
+  return 0;
+}
+
 // Convert ARGB to I400.
 LIBYUV_API
 int ARGBToI400(const uint8* src_argb, int src_stride_argb,
@ -37,10 +553,8 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb,
  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
      ARGBToYRow_C;
 #if defined(HAS_ARGBTOYROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    if (width > 16) {
-      ARGBToYRow = ARGBToYRow_Any_SSSE3;
-    }
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+    ARGBToYRow = ARGBToYRow_Any_SSSE3;
    if (IS_ALIGNED(width, 16)) {
      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
@ -50,10 +564,8 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb,
    }
  }
 #elif defined(HAS_ARGBTOYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    if (width > 8) {
-      ARGBToYRow = ARGBToYRow_Any_NEON;
-    }
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    ARGBToYRow = ARGBToYRow_Any_NEON;
    if (IS_ALIGNED(width, 8)) {
      ARGBToYRow = ARGBToYRow_NEON;
    }
@ -68,64 +580,6 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb,
  return 0;
 }

-// ARGB little endian (bgra in memory) to I422
-// same as I420 except UV plane is full height
-LIBYUV_API
-int ARGBToI422(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
-  if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
-    return -1;
-  }
-  if (height < 0) {
-    height = -height;
-    src_argb = src_argb + (height - 1) * src_stride_argb;
-    src_stride_argb = -src_stride_argb;
-  }
-  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
-                      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
-      ARGBToYRow_C;
-#if defined(HAS_ARGBTOYROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    if (width > 16) {
-      ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
-      ARGBToYRow = ARGBToYRow_Any_SSSE3;
-    }
-    if (IS_ALIGNED(width, 16)) {
-      ARGBToUVRow = ARGBToUVRow_Unaligned_SSSE3;
-      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
-        ARGBToUVRow = ARGBToUVRow_SSSE3;
-        if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-          ARGBToYRow = ARGBToYRow_SSSE3;
-        }
-      }
-    }
-  }
-#elif defined(HAS_ARGBTOYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    if (width > 8) {
-      ARGBToYRow = ARGBToYRow_Any_NEON;
-    }
-    if (IS_ALIGNED(width, 8)) {
-      ARGBToYRow = ARGBToYRow_NEON;
-    }
-  }
-#endif
-
-  for (int y = 0; y < height; ++y) {
-    ARGBToUVRow(src_argb, 0, dst_u, dst_v, width);
-    ARGBToYRow(src_argb, dst_y, width);
-    src_argb += src_stride_argb;
-    dst_y += dst_stride_y;
-    dst_u += dst_stride_u;
-    dst_v += dst_stride_v;
-  }
-  return 0;
-}
 // Convert ARGB to RGBA.
 LIBYUV_API
 int ARGBToRGBA(const uint8* src_argb, int src_stride_argb,
--- a/source/format_conversion.cc
+++ b/source/format_conversion.cc
@ -268,7 +268,7 @@ int BayerToARGB(const uint8* src_bayer, int src_stride_bayer,
    dst_argb += dst_stride_argb * 2;
  }
  if (height & 1) {
-    BayerRow0(src_bayer, -src_stride_bayer, dst_argb, width);
+    BayerRow0(src_bayer, src_stride_bayer, dst_argb, width);
  }
  return 0;
 }
@ -305,11 +305,9 @@ int BayerToI420(const uint8* src_bayer, int src_stride_bayer,
  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
      ARGBToYRow_C;
 #if defined(HAS_ARGBTOYROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    if (width > 16) {
-      ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
-      ARGBToYRow = ARGBToYRow_Any_SSSE3;
-    }
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+    ARGBToYRow = ARGBToYRow_Any_SSSE3;
    if (IS_ALIGNED(width, 16)) {
      ARGBToUVRow = ARGBToUVRow_SSSE3;
      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
@ -319,10 +317,8 @@ int BayerToI420(const uint8* src_bayer, int src_stride_bayer,
    }
  }
 #elif defined(HAS_ARGBTOYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    if (width > 8) {
-      ARGBToYRow = ARGBToYRow_Any_NEON;
-    }
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    ARGBToYRow = ARGBToYRow_Any_NEON;
    if (IS_ALIGNED(width, 8)) {
      ARGBToYRow = ARGBToYRow_NEON;
    }
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@ -617,7 +617,7 @@ int NV21ToRGB565(const uint8* src_y, int src_stride_y,
    dst_stride_rgb565 = -dst_stride_rgb565;
  }
  void (*NV21ToRGB565Row)(const uint8* y_buf,
-                          const uint8* vu_buf,
+                          const uint8* src_vu,
                          uint8* rgb_buf,
                          int width) = NV21ToRGB565Row_C;
 #if defined(HAS_NV21TORGB565ROW_SSSE3)
--- a/source/rotate.cc
+++ b/source/rotate.cc
@ -45,7 +45,7 @@ extern "C" {
 #define HAS_MIRRORROW_NEON
 void MirrorRow_NEON(const uint8* src, uint8* dst, int width);
 #define HAS_MIRRORROW_UV_NEON
-void MirrorRowUV_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width);
+void MirrorUVRow_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width);
 #define HAS_TRANSPOSE_WX8_NEON
 void TransposeWx8_NEON(const uint8* src, int src_stride,
                       uint8* dst, int dst_stride, int width);
@ -1049,21 +1049,21 @@ void RotateUV180(const uint8* src, int src_stride,
                 uint8* dst_b, int dst_stride_b,
                 int width, int height) {
  void (*MirrorRowUV)(const uint8* src, uint8* dst_u, uint8* dst_v, int width) =
-      MirrorRowUV_C;
+      MirrorUVRow_C;
 #if defined(HAS_MIRRORROW_UV_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
-    MirrorRowUV = MirrorRowUV_NEON;
+    MirrorRowUV = MirrorUVRow_NEON;
  }
 #elif defined(HAS_MIRRORROW_UV_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3) &&
      IS_ALIGNED(width, 16) &&
      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
-    MirrorRowUV = MirrorRowUV_SSSE3;
+    MirrorRowUV = MirrorUVRow_SSSE3;
  }
-#elif defined(HAS_MIRRORROWUV_MIPS_DSPR2)
+#elif defined(HAS_MirrorUVRow_MIPS_DSPR2)
  if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
      IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
-    MirrorRowUV = MirrorRowUV_MIPS_DSPR2;
+    MirrorRowUV = MirrorUVRow_MIPS_DSPR2;
  }
 #endif

--- a/source/row_any.cc
+++ b/source/row_any.cc
@ -121,12 +121,12 @@ NV2NY(NV21ToRGB565Row_Any_NEON, NV21ToRGB565Row_NEON, NV21ToRGB565Row_C, 0, 2)
 // NEON RGB24 is multiple of 8 pixels, unaligned source and destination.
 // I400 To ARGB does multiple of 8 pixels with SIMD and remainder with C.
 #define RGBANY(NAMEANY, ARGBTORGB_SIMD, ARGBTORGB_C, MASK, SBPP, BPP)          \
-    void NAMEANY(const uint8* argb_buf,                                        \
-                 uint8* rgb_buf,                                               \
+    void NAMEANY(const uint8* src,                                             \
+                 uint8* dst,                                                   \
                 int width) {                                                  \
      int n = width & ~MASK;                                                   \
-      ARGBTORGB_SIMD(argb_buf, rgb_buf, n);                                    \
-      ARGBTORGB_C(argb_buf + n * SBPP, rgb_buf + n * BPP, width & MASK);       \
+      ARGBTORGB_SIMD(src, dst, n);                                             \
+      ARGBTORGB_C(src + n * SBPP, dst + n * BPP, width & MASK);                \
    }

 #if defined(HAS_ARGBTORGB24ROW_SSSE3)
@ -167,30 +167,37 @@ RGBANY(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, UYVYToARGBRow_C,

 // RGB/YUV to Y does multiple of 16 with SIMD and last 16 with SIMD.
 // TODO(fbarchard): Use last 16 method for all unsubsampled conversions.
-#define YANY(NAMEANY, ARGBTOY_SIMD, BPP, NUM)                                  \
+#define YANY(NAMEANY, ARGBTOY_SIMD, SBPP, BPP, NUM)                            \
    void NAMEANY(const uint8* src_argb, uint8* dst_y, int width) {             \
      ARGBTOY_SIMD(src_argb, dst_y, width - NUM);                              \
-      ARGBTOY_SIMD(src_argb + (width - NUM) * BPP, dst_y + (width - NUM), NUM);\
+      ARGBTOY_SIMD(src_argb + (width - NUM) * SBPP,                            \
+                   dst_y + (width - NUM) * BPP, NUM);                          \
    }

 #ifdef HAS_ARGBTOYROW_SSSE3
-YANY(ARGBToYRow_Any_SSSE3, ARGBToYRow_Unaligned_SSSE3, 4, 16)
-YANY(BGRAToYRow_Any_SSSE3, BGRAToYRow_Unaligned_SSSE3, 4, 16)
-YANY(ABGRToYRow_Any_SSSE3, ABGRToYRow_Unaligned_SSSE3, 4, 16)
-#endif
-#ifdef HAS_RGBATOYROW_SSSE3
-YANY(RGBAToYRow_Any_SSSE3, RGBAToYRow_Unaligned_SSSE3, 4, 16)
+YANY(ARGBToYRow_Any_SSSE3, ARGBToYRow_Unaligned_SSSE3, 4, 1, 16)
+YANY(BGRAToYRow_Any_SSSE3, BGRAToYRow_Unaligned_SSSE3, 4, 1, 16)
+YANY(ABGRToYRow_Any_SSSE3, ABGRToYRow_Unaligned_SSSE3, 4, 1, 16)
+YANY(RGBAToYRow_Any_SSSE3, RGBAToYRow_Unaligned_SSSE3, 4, 1, 16)
+YANY(YUY2ToYRow_Any_SSE2, YUY2ToYRow_Unaligned_SSE2, 2, 1, 16)
+YANY(UYVYToYRow_Any_SSE2, UYVYToYRow_Unaligned_SSE2, 2, 1, 16)
+YANY(RGB24ToARGBRow_Any_SSSE3, RGB24ToARGBRow_SSSE3, 3, 4, 16)
+YANY(RAWToARGBRow_Any_SSSE3, RAWToARGBRow_SSSE3, 3, 4, 16)
+YANY(RGB565ToARGBRow_Any_SSE2, RGB565ToARGBRow_SSE2, 2, 4, 8)
 #endif
 #ifdef HAS_ARGBTOYROW_NEON
-YANY(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 4, 8)
-#endif
-#ifdef HAS_YUY2TOYROW_SSE2
-YANY(YUY2ToYRow_Any_SSE2, YUY2ToYRow_Unaligned_SSE2, 2, 16)
-YANY(UYVYToYRow_Any_SSE2, UYVYToYRow_Unaligned_SSE2, 2, 16)
-#endif
-#ifdef HAS_YUY2TOYROW_NEON
-YANY(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 2, 16)
-YANY(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 2, 16)
+YANY(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 4, 1, 8)
+YANY(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, 4, 1, 8)
+YANY(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, 4, 1, 8)
+YANY(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, 4, 1, 8)
+YANY(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 3, 1, 8)
+YANY(RAWToYRow_Any_NEON, RAWToYRow_NEON, 3, 1, 8)
+YANY(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 2, 1, 8)
+YANY(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 2, 1, 16)
+YANY(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 2, 1, 16)
+YANY(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 3, 4, 8)
+YANY(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, 3, 4, 8)
+YANY(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 2, 4, 8)
 #endif
 #undef YANY

@ -201,17 +208,15 @@ YANY(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 2, 16)
      int n = width & ~15;                                                     \
      ANYTOUV_SIMD(src_argb, src_stride_argb, dst_u, dst_v, n);                \
      ANYTOUV_C(src_argb  + n * BPP, src_stride_argb,                          \
-                 dst_u + (n >> 1),                                             \
-                 dst_v + (n >> 1),                                             \
-                 width & 15);                                                  \
+                dst_u + (n >> 1),                                              \
+                dst_v + (n >> 1),                                              \
+                width & 15);                                                   \
    }

 #ifdef HAS_ARGBTOUVROW_SSSE3
 UVANY(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_Unaligned_SSSE3, ARGBToUVRow_C, 4)
 UVANY(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_Unaligned_SSSE3, BGRAToUVRow_C, 4)
 UVANY(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_Unaligned_SSSE3, ABGRToUVRow_C, 4)
-#endif
-#ifdef HAS_RGBATOYROW_SSSE3
 UVANY(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_Unaligned_SSSE3, RGBAToUVRow_C, 4)
 #endif
 #ifdef HAS_YUY2TOUVROW_SSE2
@ -230,11 +235,15 @@ UVANY(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, UYVYToUVRow_C, 2)
      int n = width & ~15;                                                     \
      ANYTOUV_SIMD(src_uv, dst_u, dst_v, n);                                   \
      ANYTOUV_C(src_uv  + n * BPP,                                             \
-                 dst_u + (n >> 1),                                             \
-                 dst_v + (n >> 1),                                             \
-                 width & 15);                                                  \
+                dst_u + (n >> 1),                                              \
+                dst_v + (n >> 1),                                              \
+                width & 15);                                                   \
    }

+#ifdef HAS_ARGBTOUVROW_SSSE3
+UV422ANY(ARGBToUV422Row_Any_SSSE3, ARGBToUV422Row_Unaligned_SSSE3,
+         ARGBToUV422Row_C, 4)
+#endif
 #ifdef HAS_YUY2TOUV422ROW_SSE2
 UV422ANY(YUY2ToUV422Row_Any_SSE2, YUY2ToUV422Row_Unaligned_SSE2,
         YUY2ToUV422Row_C, 2)
--- a/source/row_common.cc
+++ b/source/row_common.cc
@ -95,47 +95,47 @@ void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int width) {
  }
 }

-void RGB565ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int width) {
+void RGB565ToARGBRow_C(const uint8* src_rgb565, uint8* dst_argb, int width) {
  for (int x = 0; x < width; ++x) {
-    uint8 b = src_rgb[0] & 0x1f;
-    uint8 g = (src_rgb[0] >> 5) | ((src_rgb[1] & 0x07) << 3);
-    uint8 r = src_rgb[1] >> 3;
+    uint8 b = src_rgb565[0] & 0x1f;
+    uint8 g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
+    uint8 r = src_rgb565[1] >> 3;
    dst_argb[0] = (b << 3) | (b >> 2);
    dst_argb[1] = (g << 2) | (g >> 4);
    dst_argb[2] = (r << 3) | (r >> 2);
    dst_argb[3] = 255u;
    dst_argb += 4;
-    src_rgb += 2;
+    src_rgb565 += 2;
  }
 }

-void ARGB1555ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int width) {
+void ARGB1555ToARGBRow_C(const uint8* src_argb1555, uint8* dst_argb, int width) {
  for (int x = 0; x < width; ++x) {
-    uint8 b = src_rgb[0] & 0x1f;
-    uint8 g = (src_rgb[0] >> 5) | ((src_rgb[1] & 0x03) << 3);
-    uint8 r = (src_rgb[1] & 0x7c) >> 2;
-    uint8 a = src_rgb[1] >> 7;
+    uint8 b = src_argb1555[0] & 0x1f;
+    uint8 g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
+    uint8 r = (src_argb1555[1] & 0x7c) >> 2;
+    uint8 a = src_argb1555[1] >> 7;
    dst_argb[0] = (b << 3) | (b >> 2);
    dst_argb[1] = (g << 3) | (g >> 2);
    dst_argb[2] = (r << 3) | (r >> 2);
    dst_argb[3] = -a;
    dst_argb += 4;
-    src_rgb += 2;
+    src_argb1555 += 2;
  }
 }

-void ARGB4444ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int width) {
+void ARGB4444ToARGBRow_C(const uint8* src_argb4444, uint8* dst_argb, int width) {
  for (int x = 0; x < width; ++x) {
-    uint8 b = src_rgb[0] & 0x0f;
-    uint8 g = src_rgb[0] >> 4;
-    uint8 r = src_rgb[1] & 0x0f;
-    uint8 a = src_rgb[1] >> 4;
+    uint8 b = src_argb4444[0] & 0x0f;
+    uint8 g = src_argb4444[0] >> 4;
+    uint8 r = src_argb4444[1] & 0x0f;
+    uint8 a = src_argb4444[1] >> 4;
    dst_argb[0] = (b << 4) | b;
    dst_argb[1] = (g << 4) | g;
    dst_argb[2] = (r << 4) | r;
    dst_argb[3] = (a << 4) | a;
    dst_argb += 4;
-    src_rgb += 2;
+    src_argb4444 += 2;
  }
 }

@ -265,11 +265,11 @@ static __inline int RGBToV(uint8 r, uint8 g, uint8 b) {
  return ((112 * r -  94 * g -  18 * b + 128) >> 8) + 128;
 }

-#define MAKEROWY(NAME, R, G, B) \
+#define MAKEROWY(NAME, R, G, B, BPP) \
 void NAME ## ToYRow_C(const uint8* src_argb0, uint8* dst_y, int width) {       \
  for (int x = 0; x < width; ++x) {                                            \
    dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]);               \
-    src_argb0 += 4;                                                            \
+    src_argb0 += BPP;                                                          \
    dst_y += 1;                                                                \
  }                                                                            \
 }                                                                              \
@ -277,16 +277,16 @@ void NAME ## ToUVRow_C(const uint8* src_rgb0, int src_stride_rgb,              \
                       uint8* dst_u, uint8* dst_v, int width) {                \
  const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;                           \
  for (int x = 0; x < width - 1; x += 2) {                                     \
-    uint8 ab = (src_rgb0[B] + src_rgb0[B + 4] +                                \
-               src_rgb1[B] + src_rgb1[B + 4]) >> 2;                            \
-    uint8 ag = (src_rgb0[G] + src_rgb0[G + 4] +                                \
-               src_rgb1[G] + src_rgb1[G + 4]) >> 2;                            \
-    uint8 ar = (src_rgb0[R] + src_rgb0[R + 4] +                                \
-               src_rgb1[R] + src_rgb1[R + 4]) >> 2;                            \
+    uint8 ab = (src_rgb0[B] + src_rgb0[B + BPP] +                              \
+               src_rgb1[B] + src_rgb1[B + BPP]) >> 2;                          \
+    uint8 ag = (src_rgb0[G] + src_rgb0[G + BPP] +                              \
+               src_rgb1[G] + src_rgb1[G + BPP]) >> 2;                          \
+    uint8 ar = (src_rgb0[R] + src_rgb0[R + BPP] +                              \
+               src_rgb1[R] + src_rgb1[R + BPP]) >> 2;                          \
    dst_u[0] = RGBToU(ar, ag, ab);                                             \
    dst_v[0] = RGBToV(ar, ag, ab);                                             \
-    src_rgb0 += 8;                                                             \
-    src_rgb1 += 8;                                                             \
+    src_rgb0 += BPP * 2;                                                       \
+    src_rgb1 += BPP * 2;                                                       \
    dst_u += 1;                                                                \
    dst_v += 1;                                                                \
  }                                                                            \
@ -299,10 +299,95 @@ void NAME ## ToUVRow_C(const uint8* src_rgb0, int src_stride_rgb,              \
  }                                                                            \
 }

-MAKEROWY(ARGB, 2, 1, 0)
-MAKEROWY(BGRA, 1, 2, 3)
-MAKEROWY(ABGR, 0, 1, 2)
-MAKEROWY(RGBA, 3, 2, 1)
+MAKEROWY(ARGB, 2, 1, 0, 4)
+MAKEROWY(BGRA, 1, 2, 3, 4)
+MAKEROWY(ABGR, 0, 1, 2, 4)
+MAKEROWY(RGBA, 3, 2, 1, 4)
+MAKEROWY(RGB24, 2, 1, 0, 3)
+MAKEROWY(RAW, 0, 1, 2, 3)
+#undef MAKEROWY
+
+void RGB565ToYRow_C(const uint8* src_rgb565, uint8* dst_y, int width) {
+  for (int x = 0; x < width; ++x) {
+    uint8 b = src_rgb565[0] & 0x1f;
+    uint8 g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
+    uint8 r = src_rgb565[1] >> 3;
+    b = (b << 3) | (b >> 2);
+    g = (g << 2) | (g >> 4);
+    r = (r << 3) | (r >> 2);
+    dst_y[0] = RGBToY(r, g, b);
+    src_rgb565 += 2;
+    dst_y += 1;
+  }
+}
+
+void ARGBToUV444Row_C(const uint8* src_argb,
+                      uint8* dst_u, uint8* dst_v, int width) {
+  for (int x = 0; x < width; ++x) {
+    uint8 ab = src_argb[0];
+    uint8 ag = src_argb[1];
+    uint8 ar = src_argb[2];
+    dst_u[0] = RGBToU(ar, ag, ab);
+    dst_v[0] = RGBToV(ar, ag, ab);
+    src_argb += 4;
+    dst_u += 1;
+    dst_v += 1;
+  }
+}
+
+void ARGBToUV422Row_C(const uint8* src_argb,
+                      uint8* dst_u, uint8* dst_v, int width) {
+  for (int x = 0; x < width - 1; x += 2) {
+    uint8 ab = (src_argb[0] + src_argb[4]) >> 1;
+    uint8 ag = (src_argb[1] + src_argb[5]) >> 1;
+    uint8 ar = (src_argb[2] + src_argb[6]) >> 1;
+    dst_u[0] = RGBToU(ar, ag, ab);
+    dst_v[0] = RGBToV(ar, ag, ab);
+    src_argb += 8;
+    dst_u += 1;
+    dst_v += 1;
+  }
+  if ((width & 3) == 1) {
+    uint8 ab = src_argb[0];
+    uint8 ag = src_argb[1];
+    uint8 ar = src_argb[2];
+    dst_u[0] = RGBToU(ar, ag, ab);
+    dst_v[0] = RGBToV(ar, ag, ab);
+  }
+}
+
+void ARGBToUV411Row_C(const uint8* src_argb,
+                      uint8* dst_u, uint8* dst_v, int width) {
+  for (int x = 0; x < width - 3; x += 4) {
+    uint8 ab = (src_argb[0] + src_argb[4] + src_argb[8] + src_argb[12]) >> 2;
+    uint8 ag = (src_argb[1] + src_argb[5] + src_argb[9] + src_argb[13]) >> 2;
+    uint8 ar = (src_argb[2] + src_argb[6] + src_argb[10] + src_argb[14]) >> 2;
+    dst_u[0] = RGBToU(ar, ag, ab);
+    dst_v[0] = RGBToV(ar, ag, ab);
+    src_argb += 16;
+    dst_u += 1;
+    dst_v += 1;
+  }
+  if ((width & 3) == 3) {
+    uint8 ab = (src_argb[0] + src_argb[4] + src_argb[8]) / 3;
+    uint8 ag = (src_argb[1] + src_argb[5] + src_argb[9]) / 3;
+    uint8 ar = (src_argb[2] + src_argb[6] + src_argb[10]) / 3;
+    dst_u[0] = RGBToU(ar, ag, ab);
+    dst_v[0] = RGBToV(ar, ag, ab);
+  } else if ((width & 3) == 2) {
+    uint8 ab = (src_argb[0] + src_argb[4]) >> 1;
+    uint8 ag = (src_argb[1] + src_argb[5]) >> 1;
+    uint8 ar = (src_argb[2] + src_argb[6]) >> 1;
+    dst_u[0] = RGBToU(ar, ag, ab);
+    dst_v[0] = RGBToV(ar, ag, ab);
+  } else if ((width & 3) == 1) {
+    uint8 ab = src_argb[0];
+    uint8 ag = src_argb[1];
+    uint8 ar = src_argb[2];
+    dst_u[0] = RGBToU(ar, ag, ab);
+    dst_v[0] = RGBToV(ar, ag, ab);
+  }
+}

 // http://en.wikipedia.org/wiki/Grayscale.
 // 0.11 * B + 0.59 * G + 0.30 * R
@ -470,104 +555,104 @@ static __inline void YuvPixel2(uint8 y, uint8 u, uint8 v,
 #if defined(__ARM_NEON__)
 // C mimic assembly.
 // TODO(fbarchard): Remove subsampling from Neon.
-void I444ToARGBRow_C(const uint8* y_buf,
-                     const uint8* u_buf,
-                     const uint8* v_buf,
+void I444ToARGBRow_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
                     uint8* rgb_buf,
                     int width) {
  for (int x = 0; x < width - 1; x += 2) {
-    uint8 u = (u_buf[0] + u_buf[1] + 1) >> 1;
-    uint8 v = (v_buf[0] + v_buf[1] + 1) >> 1;
-    YuvPixel(y_buf[0], u, v, rgb_buf + 0, 24, 16, 8, 0);
-    YuvPixel(y_buf[1], u, v, rgb_buf + 4, 24, 16, 8, 0);
-    y_buf += 2;
-    u_buf += 2;
-    v_buf += 2;
+    uint8 u = (src_u[0] + src_u[1] + 1) >> 1;
+    uint8 v = (src_v[0] + src_v[1] + 1) >> 1;
+    YuvPixel(src_y[0], u, v, rgb_buf + 0, 24, 16, 8, 0);
+    YuvPixel(src_y[1], u, v, rgb_buf + 4, 24, 16, 8, 0);
+    src_y += 2;
+    src_u += 2;
+    src_v += 2;
    rgb_buf += 8;  // Advance 2 pixels.
  }
  if (width & 1) {
-    YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 16, 8, 0);
+    YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, 24, 16, 8, 0);
  }
 }
 #else
-void I444ToARGBRow_C(const uint8* y_buf,
-                     const uint8* u_buf,
-                     const uint8* v_buf,
+void I444ToARGBRow_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
                     uint8* rgb_buf,
                     int width) {
  for (int x = 0; x < width; ++x) {
-    YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf, 24, 16, 8, 0);
-    y_buf += 1;
-    u_buf += 1;
-    v_buf += 1;
+    YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf, 24, 16, 8, 0);
+    src_y += 1;
+    src_u += 1;
+    src_v += 1;
    rgb_buf += 4;  // Advance 1 pixel.
  }
 }
 #endif
 // Also used for 420
-void I422ToARGBRow_C(const uint8* y_buf,
-                     const uint8* u_buf,
-                     const uint8* v_buf,
+void I422ToARGBRow_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
                     uint8* rgb_buf,
                     int width) {
  for (int x = 0; x < width - 1; x += 2) {
-    YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 16, 8, 0);
-    YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 24, 16, 8, 0);
-    y_buf += 2;
-    u_buf += 1;
-    v_buf += 1;
+    YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, 24, 16, 8, 0);
+    YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 4, 24, 16, 8, 0);
+    src_y += 2;
+    src_u += 1;
+    src_v += 1;
    rgb_buf += 8;  // Advance 2 pixels.
  }
  if (width & 1) {
-    YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 16, 8, 0);
+    YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, 24, 16, 8, 0);
  }
 }

-void I422ToRGB24Row_C(const uint8* y_buf,
-                      const uint8* u_buf,
-                      const uint8* v_buf,
+void I422ToRGB24Row_C(const uint8* src_y,
+                      const uint8* src_u,
+                      const uint8* src_v,
                      uint8* rgb_buf,
                      int width) {
  for (int x = 0; x < width - 1; x += 2) {
-    YuvPixel2(y_buf[0], u_buf[0], v_buf[0],
+    YuvPixel2(src_y[0], src_u[0], src_v[0],
              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
-    YuvPixel2(y_buf[1], u_buf[0], v_buf[0],
+    YuvPixel2(src_y[1], src_u[0], src_v[0],
              rgb_buf + 3, rgb_buf + 4, rgb_buf + 5);
-    y_buf += 2;
-    u_buf += 1;
-    v_buf += 1;
+    src_y += 2;
+    src_u += 1;
+    src_v += 1;
    rgb_buf += 6;  // Advance 2 pixels.
  }
  if (width & 1) {
-    YuvPixel2(y_buf[0], u_buf[0], v_buf[0],
+    YuvPixel2(src_y[0], src_u[0], src_v[0],
              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
  }
 }

-void I422ToRAWRow_C(const uint8* y_buf,
-                    const uint8* u_buf,
-                    const uint8* v_buf,
+void I422ToRAWRow_C(const uint8* src_y,
+                    const uint8* src_u,
+                    const uint8* src_v,
                    uint8* rgb_buf,
                    int width) {
  for (int x = 0; x < width - 1; x += 2) {
-    YuvPixel2(y_buf[0], u_buf[0], v_buf[0],
+    YuvPixel2(src_y[0], src_u[0], src_v[0],
              rgb_buf + 2, rgb_buf + 1, rgb_buf + 0);
-    YuvPixel2(y_buf[1], u_buf[0], v_buf[0],
+    YuvPixel2(src_y[1], src_u[0], src_v[0],
              rgb_buf + 5, rgb_buf + 4, rgb_buf + 3);
-    y_buf += 2;
-    u_buf += 1;
-    v_buf += 1;
+    src_y += 2;
+    src_u += 1;
+    src_v += 1;
    rgb_buf += 6;  // Advance 2 pixels.
  }
  if (width & 1) {
-    YuvPixel2(y_buf[0], u_buf[0], v_buf[0],
+    YuvPixel2(src_y[0], src_u[0], src_v[0],
              rgb_buf + 2, rgb_buf + 1, rgb_buf + 0);
  }
 }

-void I422ToARGB4444Row_C(const uint8* y_buf,
-                         const uint8* u_buf,
-                         const uint8* v_buf,
+void I422ToARGB4444Row_C(const uint8* src_y,
+                         const uint8* src_u,
+                         const uint8* src_v,
                         uint8* dst_argb4444,
                         int width) {
  uint8 b0;
@ -577,8 +662,8 @@ void I422ToARGB4444Row_C(const uint8* y_buf,
  uint8 g1;
  uint8 r1;
  for (int x = 0; x < width - 1; x += 2) {
-    YuvPixel2(y_buf[0], u_buf[0], v_buf[0], &b0, &g0, &r0);
-    YuvPixel2(y_buf[1], u_buf[0], v_buf[0], &b1, &g1, &r1);
+    YuvPixel2(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
+    YuvPixel2(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1);
    b0 = b0 >> 4;
    g0 = g0 >> 4;
    r0 = r0 >> 4;
@ -587,13 +672,13 @@ void I422ToARGB4444Row_C(const uint8* y_buf,
    r1 = r1 >> 4;
    *reinterpret_cast<uint32*>(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) |
        (b1 << 16) | (g1 << 20) | (r1 << 24) | 0xf000f000;
-    y_buf += 2;
-    u_buf += 1;
-    v_buf += 1;
+    src_y += 2;
+    src_u += 1;
+    src_v += 1;
    dst_argb4444 += 4;  // Advance 2 pixels.
  }
  if (width & 1) {
-    YuvPixel2(y_buf[0], u_buf[0], v_buf[0], &b0, &g0, &r0);
+    YuvPixel2(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
    b0 = b0 >> 4;
    g0 = g0 >> 4;
    r0 = r0 >> 4;
@ -602,9 +687,9 @@ void I422ToARGB4444Row_C(const uint8* y_buf,
  }
 }

-void I422ToARGB1555Row_C(const uint8* y_buf,
-                         const uint8* u_buf,
-                         const uint8* v_buf,
+void I422ToARGB1555Row_C(const uint8* src_y,
+                         const uint8* src_u,
+                         const uint8* src_v,
                         uint8* dst_argb1555,
                         int width) {
  uint8 b0;
@ -614,8 +699,8 @@ void I422ToARGB1555Row_C(const uint8* y_buf,
  uint8 g1;
  uint8 r1;
  for (int x = 0; x < width - 1; x += 2) {
-    YuvPixel2(y_buf[0], u_buf[0], v_buf[0], &b0, &g0, &r0);
-    YuvPixel2(y_buf[1], u_buf[0], v_buf[0], &b1, &g1, &r1);
+    YuvPixel2(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
+    YuvPixel2(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1);
    b0 = b0 >> 3;
    g0 = g0 >> 3;
    r0 = r0 >> 3;
@ -624,13 +709,13 @@ void I422ToARGB1555Row_C(const uint8* y_buf,
    r1 = r1 >> 3;
    *reinterpret_cast<uint32*>(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) |
        (b1 << 16) | (g1 << 21) | (r1 << 26) | 0x80008000;
-    y_buf += 2;
-    u_buf += 1;
-    v_buf += 1;
+    src_y += 2;
+    src_u += 1;
+    src_v += 1;
    dst_argb1555 += 4;  // Advance 2 pixels.
  }
  if (width & 1) {
-    YuvPixel2(y_buf[0], u_buf[0], v_buf[0], &b0, &g0, &r0);
+    YuvPixel2(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
    b0 = b0 >> 3;
    g0 = g0 >> 3;
    r0 = r0 >> 3;
@ -639,9 +724,9 @@ void I422ToARGB1555Row_C(const uint8* y_buf,
  }
 }

-void I422ToRGB565Row_C(const uint8* y_buf,
-                      const uint8* u_buf,
-                      const uint8* v_buf,
+void I422ToRGB565Row_C(const uint8* src_y,
+                      const uint8* src_u,
+                      const uint8* src_v,
                      uint8* dst_rgb565,
                      int width) {
  uint8 b0;
@ -651,8 +736,8 @@ void I422ToRGB565Row_C(const uint8* y_buf,
  uint8 g1;
  uint8 r1;
  for (int x = 0; x < width - 1; x += 2) {
-    YuvPixel2(y_buf[0], u_buf[0], v_buf[0], &b0, &g0, &r0);
-    YuvPixel2(y_buf[1], u_buf[0], v_buf[0], &b1, &g1, &r1);
+    YuvPixel2(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
+    YuvPixel2(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1);
    b0 = b0 >> 3;
    g0 = g0 >> 2;
    r0 = r0 >> 3;
@ -661,13 +746,13 @@ void I422ToRGB565Row_C(const uint8* y_buf,
    r1 = r1 >> 3;
    *reinterpret_cast<uint32*>(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) |
        (b1 << 16) | (g1 << 21) | (r1 << 27);
-    y_buf += 2;
-    u_buf += 1;
-    v_buf += 1;
+    src_y += 2;
+    src_u += 1;
+    src_v += 1;
    dst_rgb565 += 4;  // Advance 2 pixels.
  }
  if (width & 1) {
-    YuvPixel2(y_buf[0], u_buf[0], v_buf[0], &b0, &g0, &r0);
+    YuvPixel2(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
    b0 = b0 >> 3;
    g0 = g0 >> 2;
    r0 = r0 >> 3;
@ -675,66 +760,66 @@ void I422ToRGB565Row_C(const uint8* y_buf,
  }
 }

-void I411ToARGBRow_C(const uint8* y_buf,
-                     const uint8* u_buf,
-                     const uint8* v_buf,
+void I411ToARGBRow_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
                     uint8* rgb_buf,
                     int width) {
  for (int x = 0; x < width - 3; x += 4) {
-    YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 16, 8, 0);
-    YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 24, 16, 8, 0);
-    YuvPixel(y_buf[2], u_buf[0], v_buf[0], rgb_buf + 8, 24, 16, 8, 0);
-    YuvPixel(y_buf[3], u_buf[0], v_buf[0], rgb_buf + 12, 24, 16, 8, 0);
-    y_buf += 4;
-    u_buf += 1;
-    v_buf += 1;
+    YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, 24, 16, 8, 0);
+    YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 4, 24, 16, 8, 0);
+    YuvPixel(src_y[2], src_u[0], src_v[0], rgb_buf + 8, 24, 16, 8, 0);
+    YuvPixel(src_y[3], src_u[0], src_v[0], rgb_buf + 12, 24, 16, 8, 0);
+    src_y += 4;
+    src_u += 1;
+    src_v += 1;
    rgb_buf += 16;  // Advance 4 pixels.
  }
  if (width & 2) {
-    YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 16, 8, 0);
-    YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 24, 16, 8, 0);
-    y_buf += 2;
+    YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, 24, 16, 8, 0);
+    YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 4, 24, 16, 8, 0);
+    src_y += 2;
    rgb_buf += 8;  // Advance 2 pixels.
  }
  if (width & 1) {
-    YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 16, 8, 0);
+    YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, 24, 16, 8, 0);
  }
 }

-void NV12ToARGBRow_C(const uint8* y_buf,
-                     const uint8* uv_buf,
+void NV12ToARGBRow_C(const uint8* src_y,
+                     const uint8* usrc_v,
                     uint8* rgb_buf,
                     int width) {
  for (int x = 0; x < width - 1; x += 2) {
-    YuvPixel(y_buf[0], uv_buf[0], uv_buf[1], rgb_buf + 0, 24, 16, 8, 0);
-    YuvPixel(y_buf[1], uv_buf[0], uv_buf[1], rgb_buf + 4, 24, 16, 8, 0);
-    y_buf += 2;
-    uv_buf += 2;
+    YuvPixel(src_y[0], usrc_v[0], usrc_v[1], rgb_buf + 0, 24, 16, 8, 0);
+    YuvPixel(src_y[1], usrc_v[0], usrc_v[1], rgb_buf + 4, 24, 16, 8, 0);
+    src_y += 2;
+    usrc_v += 2;
    rgb_buf += 8;  // Advance 2 pixels.
  }
  if (width & 1) {
-    YuvPixel(y_buf[0], uv_buf[0], uv_buf[1], rgb_buf + 0, 24, 16, 8, 0);
+    YuvPixel(src_y[0], usrc_v[0], usrc_v[1], rgb_buf + 0, 24, 16, 8, 0);
  }
 }

-void NV21ToARGBRow_C(const uint8* y_buf,
-                     const uint8* vu_buf,
+void NV21ToARGBRow_C(const uint8* src_y,
+                     const uint8* src_vu,
                     uint8* rgb_buf,
                     int width) {
  for (int x = 0; x < width - 1; x += 2) {
-    YuvPixel(y_buf[0], vu_buf[1], vu_buf[0], rgb_buf + 0, 24, 16, 8, 0);
-    YuvPixel(y_buf[1], vu_buf[1], vu_buf[0], rgb_buf + 4, 24, 16, 8, 0);
-    y_buf += 2;
-    vu_buf += 2;
+    YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, 24, 16, 8, 0);
+    YuvPixel(src_y[1], src_vu[1], src_vu[0], rgb_buf + 4, 24, 16, 8, 0);
+    src_y += 2;
+    src_vu += 2;
    rgb_buf += 8;  // Advance 2 pixels.
  }
  if (width & 1) {
-    YuvPixel(y_buf[0], vu_buf[1], vu_buf[0], rgb_buf + 0, 24, 16, 8, 0);
+    YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, 24, 16, 8, 0);
  }
 }

-void NV12ToRGB565Row_C(const uint8* y_buf,
-                       const uint8* uv_buf,
+void NV12ToRGB565Row_C(const uint8* src_y,
+                       const uint8* usrc_v,
                       uint8* dst_rgb565,
                       int width) {
  uint8 b0;
@ -744,8 +829,8 @@ void NV12ToRGB565Row_C(const uint8* y_buf,
  uint8 g1;
  uint8 r1;
  for (int x = 0; x < width - 1; x += 2) {
-    YuvPixel2(y_buf[0], uv_buf[0], uv_buf[1], &b0, &g0, &r0);
-    YuvPixel2(y_buf[1], uv_buf[0], uv_buf[1], &b1, &g1, &r1);
+    YuvPixel2(src_y[0], usrc_v[0], usrc_v[1], &b0, &g0, &r0);
+    YuvPixel2(src_y[1], usrc_v[0], usrc_v[1], &b1, &g1, &r1);
    b0 = b0 >> 3;
    g0 = g0 >> 2;
    r0 = r0 >> 3;
@ -754,12 +839,12 @@ void NV12ToRGB565Row_C(const uint8* y_buf,
    r1 = r1 >> 3;
    *reinterpret_cast<uint32*>(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) |
        (b1 << 16) | (g1 << 21) | (r1 << 27);
-    y_buf += 2;
-    uv_buf += 2;
+    src_y += 2;
+    usrc_v += 2;
    dst_rgb565 += 4;  // Advance 2 pixels.
  }
  if (width & 1) {
-    YuvPixel2(y_buf[0], uv_buf[0], uv_buf[1], &b0, &g0, &r0);
+    YuvPixel2(src_y[0], usrc_v[0], usrc_v[1], &b0, &g0, &r0);
    b0 = b0 >> 3;
    g0 = g0 >> 2;
    r0 = r0 >> 3;
@ -767,8 +852,8 @@ void NV12ToRGB565Row_C(const uint8* y_buf,
  }
 }

-void NV21ToRGB565Row_C(const uint8* y_buf,
-                       const uint8* vu_buf,
+void NV21ToRGB565Row_C(const uint8* src_y,
+                       const uint8* vsrc_u,
                       uint8* dst_rgb565,
                       int width) {
  uint8 b0;
@ -778,8 +863,8 @@ void NV21ToRGB565Row_C(const uint8* y_buf,
  uint8 g1;
  uint8 r1;
  for (int x = 0; x < width - 1; x += 2) {
-    YuvPixel2(y_buf[0], vu_buf[1], vu_buf[0], &b0, &g0, &r0);
-    YuvPixel2(y_buf[1], vu_buf[1], vu_buf[0], &b1, &g1, &r1);
+    YuvPixel2(src_y[0], vsrc_u[1], vsrc_u[0], &b0, &g0, &r0);
+    YuvPixel2(src_y[1], vsrc_u[1], vsrc_u[0], &b1, &g1, &r1);
    b0 = b0 >> 3;
    g0 = g0 >> 2;
    r0 = r0 >> 3;
@ -788,12 +873,12 @@ void NV21ToRGB565Row_C(const uint8* y_buf,
    r1 = r1 >> 3;
    *reinterpret_cast<uint32*>(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) |
        (b1 << 16) | (g1 << 21) | (r1 << 27);
-    y_buf += 2;
-    vu_buf += 2;
+    src_y += 2;
+    vsrc_u += 2;
    dst_rgb565 += 4;  // Advance 2 pixels.
  }
  if (width & 1) {
-    YuvPixel2(y_buf[0], vu_buf[1], vu_buf[0], &b0, &g0, &r0);
+    YuvPixel2(src_y[0], vsrc_u[1], vsrc_u[0], &b0, &g0, &r0);
    b0 = b0 >> 3;
    g0 = g0 >> 2;
    r0 = r0 >> 3;
@ -801,92 +886,92 @@ void NV21ToRGB565Row_C(const uint8* y_buf,
  }
 }

-void YUY2ToARGBRow_C(const uint8* yuy2_buf,
+void YUY2ToARGBRow_C(const uint8* src_yuy2,
                     uint8* rgb_buf,
                     int width) {
  for (int x = 0; x < width - 1; x += 2) {
-    YuvPixel(yuy2_buf[0], yuy2_buf[1], yuy2_buf[3], rgb_buf + 0, 24, 16, 8, 0);
-    YuvPixel(yuy2_buf[2], yuy2_buf[1], yuy2_buf[3], rgb_buf + 4, 24, 16, 8, 0);
-    yuy2_buf += 4;
+    YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3], rgb_buf + 0, 24, 16, 8, 0);
+    YuvPixel(src_yuy2[2], src_yuy2[1], src_yuy2[3], rgb_buf + 4, 24, 16, 8, 0);
+    src_yuy2 += 4;
    rgb_buf += 8;  // Advance 2 pixels.
  }
  if (width & 1) {
-    YuvPixel(yuy2_buf[0], yuy2_buf[1], yuy2_buf[3], rgb_buf + 0, 24, 16, 8, 0);
+    YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3], rgb_buf + 0, 24, 16, 8, 0);
  }
 }

-void UYVYToARGBRow_C(const uint8* uyvy_buf,
+void UYVYToARGBRow_C(const uint8* src_uyvy,
                     uint8* rgb_buf,
                     int width) {
  for (int x = 0; x < width - 1; x += 2) {
-    YuvPixel(uyvy_buf[1], uyvy_buf[0], uyvy_buf[2], rgb_buf + 0, 24, 16, 8, 0);
-    YuvPixel(uyvy_buf[3], uyvy_buf[0], uyvy_buf[2], rgb_buf + 4, 24, 16, 8, 0);
-    uyvy_buf += 4;
+    YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2], rgb_buf + 0, 24, 16, 8, 0);
+    YuvPixel(src_uyvy[3], src_uyvy[0], src_uyvy[2], rgb_buf + 4, 24, 16, 8, 0);
+    src_uyvy += 4;
    rgb_buf += 8;  // Advance 2 pixels.
  }
  if (width & 1) {
-    YuvPixel(uyvy_buf[1], uyvy_buf[0], uyvy_buf[2], rgb_buf + 0, 24, 16, 8, 0);
+    YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2], rgb_buf + 0, 24, 16, 8, 0);
  }
 }

-void I422ToBGRARow_C(const uint8* y_buf,
-                     const uint8* u_buf,
-                     const uint8* v_buf,
+void I422ToBGRARow_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
                     uint8* rgb_buf,
                     int width) {
  for (int x = 0; x < width - 1; x += 2) {
-    YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 0, 8, 16, 24);
-    YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 0, 8, 16, 24);
-    y_buf += 2;
-    u_buf += 1;
-    v_buf += 1;
+    YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, 0, 8, 16, 24);
+    YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 4, 0, 8, 16, 24);
+    src_y += 2;
+    src_u += 1;
+    src_v += 1;
    rgb_buf += 8;  // Advance 2 pixels.
  }
  if (width & 1) {
-    YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf, 0, 8, 16, 24);
+    YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf, 0, 8, 16, 24);
  }
 }

-void I422ToABGRRow_C(const uint8* y_buf,
-                     const uint8* u_buf,
-                     const uint8* v_buf,
+void I422ToABGRRow_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
                     uint8* rgb_buf,
                     int width) {
  for (int x = 0; x < width - 1; x += 2) {
-    YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 0, 8, 16);
-    YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 24, 0, 8, 16);
-    y_buf += 2;
-    u_buf += 1;
-    v_buf += 1;
+    YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, 24, 0, 8, 16);
+    YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 4, 24, 0, 8, 16);
+    src_y += 2;
+    src_u += 1;
+    src_v += 1;
    rgb_buf += 8;  // Advance 2 pixels.
  }
  if (width & 1) {
-    YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 0, 8, 16);
+    YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, 24, 0, 8, 16);
  }
 }

-void I422ToRGBARow_C(const uint8* y_buf,
-                     const uint8* u_buf,
-                     const uint8* v_buf,
+void I422ToRGBARow_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
                     uint8* rgb_buf,
                     int width) {
  for (int x = 0; x < width - 1; x += 2) {
-    YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 0, 24, 16, 8);
-    YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 0, 24, 16, 8);
-    y_buf += 2;
-    u_buf += 1;
-    v_buf += 1;
+    YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, 0, 24, 16, 8);
+    YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 4, 0, 24, 16, 8);
+    src_y += 2;
+    src_u += 1;
+    src_v += 1;
    rgb_buf += 8;  // Advance 2 pixels.
  }
  if (width & 1) {
-    YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 0, 24, 16, 8);
+    YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, 0, 24, 16, 8);
  }
 }

-void YToARGBRow_C(const uint8* y_buf, uint8* rgb_buf, int width) {
+void YToARGBRow_C(const uint8* src_y, uint8* rgb_buf, int width) {
  for (int x = 0; x < width; ++x) {
-    YuvPixel(y_buf[0], 128, 128, rgb_buf, 24, 16, 8, 0);
-    y_buf += 1;
+    YuvPixel(src_y[0], 128, 128, rgb_buf, 24, 16, 8, 0);
+    src_y += 1;
    rgb_buf += 4;  // Advance 1 pixel.
  }
 }
@ -903,7 +988,7 @@ void MirrorRow_C(const uint8* src, uint8* dst, int width) {
  }
 }

-void MirrorRowUV_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
+void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
  src_uv += (width - 1) << 1;
  for (int x = 0; x < width - 1; x += 2) {
    dst_u[x] = src_uv[0];
@ -1399,35 +1484,35 @@ void I422ToUYVYRow_C(const uint8* src_y,
 // row_win.cc has asm version, but GCC uses 2 step wrapper.  5% slower.
 // TODO(fbarchard): Handle width > kMaxStride here instead of calling code.
 #if defined(__x86_64__) || defined(__i386__)
-void I422ToRGB565Row_SSSE3(const uint8* y_buf,
-                           const uint8* u_buf,
-                           const uint8* v_buf,
+void I422ToRGB565Row_SSSE3(const uint8* src_y,
+                           const uint8* src_u,
+                           const uint8* src_v,
                           uint8* rgb_buf,
                           int width) {
  SIMD_ALIGNED(uint8 row[kMaxStride]);
-  I422ToARGBRow_SSSE3(y_buf, u_buf, v_buf, row, width);
+  I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, width);
  ARGBToRGB565Row_SSE2(row, rgb_buf, width);
 }
 #endif  // defined(__x86_64__) || defined(__i386__)

 #if defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)
-void I422ToARGB1555Row_SSSE3(const uint8* y_buf,
-                             const uint8* u_buf,
-                             const uint8* v_buf,
+void I422ToARGB1555Row_SSSE3(const uint8* src_y,
+                             const uint8* src_u,
+                             const uint8* src_v,
                             uint8* rgb_buf,
                             int width) {
  SIMD_ALIGNED(uint8 row[kMaxStride]);
-  I422ToARGBRow_SSSE3(y_buf, u_buf, v_buf, row, width);
+  I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, width);
  ARGBToARGB1555Row_SSE2(row, rgb_buf, width);
 }

-void I422ToARGB4444Row_SSSE3(const uint8* y_buf,
-                             const uint8* u_buf,
-                             const uint8* v_buf,
+void I422ToARGB4444Row_SSSE3(const uint8* src_y,
+                             const uint8* src_u,
+                             const uint8* src_v,
                             uint8* rgb_buf,
                             int width) {
  SIMD_ALIGNED(uint8 row[kMaxStride]);
-  I422ToARGBRow_SSSE3(y_buf, u_buf, v_buf, row, width);
+  I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, width);
  ARGBToARGB4444Row_SSE2(row, rgb_buf, width);
 }

@ -1452,45 +1537,45 @@ void NV21ToRGB565Row_SSSE3(const uint8* src_y,
 void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2,
                         uint8* dst_argb,
                         int width) {
-  SIMD_ALIGNED(uint8 rowy[kMaxStride]);
-  SIMD_ALIGNED(uint8 rowu[kMaxStride]);
-  SIMD_ALIGNED(uint8 rowv[kMaxStride]);
-  YUY2ToUV422Row_SSE2(src_yuy2, rowu, rowv, width);
-  YUY2ToYRow_SSE2(src_yuy2, rowy, width);
-  I422ToARGBRow_SSSE3(rowy, rowu, rowv, dst_argb, width);
+  SIMD_ALIGNED(uint8 row_y[kMaxStride]);
+  SIMD_ALIGNED(uint8 row_u[kMaxStride / 2]);
+  SIMD_ALIGNED(uint8 row_v[kMaxStride / 2]);
+  YUY2ToUV422Row_SSE2(src_yuy2, row_u, row_v, width);
+  YUY2ToYRow_SSE2(src_yuy2, row_y, width);
+  I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, width);
 }

 void YUY2ToARGBRow_Unaligned_SSSE3(const uint8* src_yuy2,
                                   uint8* dst_argb,
                                   int width) {
-  SIMD_ALIGNED(uint8 rowy[kMaxStride]);
-  SIMD_ALIGNED(uint8 rowu[kMaxStride]);
-  SIMD_ALIGNED(uint8 rowv[kMaxStride]);
-  YUY2ToUV422Row_Unaligned_SSE2(src_yuy2, rowu, rowv, width);
-  YUY2ToYRow_Unaligned_SSE2(src_yuy2, rowy, width);
-  I422ToARGBRow_Unaligned_SSSE3(rowy, rowu, rowv, dst_argb, width);
+  SIMD_ALIGNED(uint8 row_y[kMaxStride]);
+  SIMD_ALIGNED(uint8 row_u[kMaxStride / 2]);
+  SIMD_ALIGNED(uint8 row_v[kMaxStride / 2]);
+  YUY2ToUV422Row_Unaligned_SSE2(src_yuy2, row_u, row_v, width);
+  YUY2ToYRow_Unaligned_SSE2(src_yuy2, row_y, width);
+  I422ToARGBRow_Unaligned_SSSE3(row_y, row_u, row_v, dst_argb, width);
 }

 void UYVYToARGBRow_SSSE3(const uint8* src_uyvy,
                         uint8* dst_argb,
                         int width) {
-  SIMD_ALIGNED(uint8 rowy[kMaxStride]);
-  SIMD_ALIGNED(uint8 rowu[kMaxStride]);
-  SIMD_ALIGNED(uint8 rowv[kMaxStride]);
-  UYVYToUV422Row_SSE2(src_uyvy, rowu, rowv, width);
-  UYVYToYRow_SSE2(src_uyvy, rowy, width);
-  I422ToARGBRow_SSSE3(rowy, rowu, rowv, dst_argb, width);
+  SIMD_ALIGNED(uint8 row_y[kMaxStride]);
+  SIMD_ALIGNED(uint8 row_u[kMaxStride / 2]);
+  SIMD_ALIGNED(uint8 row_v[kMaxStride / 2]);
+  UYVYToUV422Row_SSE2(src_uyvy, row_u, row_v, width);
+  UYVYToYRow_SSE2(src_uyvy, row_y, width);
+  I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, width);
 }

 void UYVYToARGBRow_Unaligned_SSSE3(const uint8* src_uyvy,
                                   uint8* dst_argb,
                                   int width) {
-  SIMD_ALIGNED(uint8 rowy[kMaxStride]);
-  SIMD_ALIGNED(uint8 rowu[kMaxStride]);
-  SIMD_ALIGNED(uint8 rowv[kMaxStride]);
-  UYVYToUV422Row_Unaligned_SSE2(src_uyvy, rowu, rowv, width);
-  UYVYToYRow_Unaligned_SSE2(src_uyvy, rowy, width);
-  I422ToARGBRow_Unaligned_SSSE3(rowy, rowu, rowv, dst_argb, width);
+  SIMD_ALIGNED(uint8 row_y[kMaxStride]);
+  SIMD_ALIGNED(uint8 row_u[kMaxStride / 2]);
+  SIMD_ALIGNED(uint8 row_v[kMaxStride / 2]);
+  UYVYToUV422Row_Unaligned_SSE2(src_uyvy, row_u, row_v, width);
+  UYVYToYRow_Unaligned_SSE2(src_uyvy, row_y, width);
+  I422ToARGBRow_Unaligned_SSSE3(row_y, row_u, row_v, dst_argb, width);
 }

 #endif  // defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)
--- a/source/row_mips.cc
+++ b/source/row_mips.cc
@ -225,8 +225,8 @@ void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width) {
 }
 #endif  // HAS_MIRRORROW_MIPS_DSPR2

-#ifdef HAS_MIRRORROWUV_MIPS_DSPR2
-void MirrorRowUV_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+#ifdef HAS_MirrorUVRow_MIPS_DSPR2
+void MirrorUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                            int width) {
  int x = 0;
  int y = 0;
@ -315,7 +315,7 @@ void MirrorRowUV_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
        "t5", "t7", "t8", "t9"
  );
 }
-#endif  // HAS_MIRRORROWUV_MIPS_DSPR2
+#endif  // HAS_MirrorUVRow_MIPS_DSPR2



--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@ -629,9 +629,9 @@ void NV21ToARGBRow_NEON(const uint8* src_y,

 #ifdef HAS_NV12TORGB565ROW_NEON
 void NV12ToRGB565Row_NEON(const uint8* src_y,
-                        const uint8* src_uv,
-                        uint8* dst_rgb565,
-                        int width) {
+                          const uint8* src_uv,
+                          uint8* dst_rgb565,
+                          int width) {
  asm volatile (
    "vld1.u8    {d24}, [%4]                    \n"
    "vld1.u8    {d25}, [%5]                    \n"
@ -660,9 +660,9 @@ void NV12ToRGB565Row_NEON(const uint8* src_y,

 #ifdef HAS_NV21TORGB565ROW_NEON
 void NV21ToRGB565Row_NEON(const uint8* src_y,
-                        const uint8* src_uv,
-                        uint8* dst_rgb565,
-                        int width) {
+                          const uint8* src_uv,
+                          uint8* dst_rgb565,
+                          int width) {
  asm volatile (
    "vld1.u8    {d24}, [%4]                    \n"
    "vld1.u8    {d25}, [%5]                    \n"
@ -955,8 +955,8 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
 }
 #endif  // HAS_MIRRORROW_NEON

-#ifdef HAS_MIRRORROWUV_NEON
-void MirrorRowUV_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width) {
+#ifdef HAS_MirrorUVRow_NEON
+void MirrorUVRow_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width) {
  asm volatile (
    // compute where to start writing destination
    "add         %1, %3                        \n"  // dst_a + width
@ -1013,7 +1013,7 @@ void MirrorRowUV_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width) {
    : "memory", "cc", "r12", "q0"
  );
 }
-#endif  // HAS_MIRRORROWUV_NEON
+#endif  // HAS_MirrorUVRow_NEON

 #ifdef HAS_BGRATOARGBROW_NEON
 void BGRAToARGBRow_NEON(const uint8* src_bgra, uint8* dst_argb, int pix) {
@ -1112,6 +1112,41 @@ void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {
 }
 #endif  // HAS_RAWTOARGBROW_NEON

+#ifdef HAS_RGB565TOARGBROW_NEON
+
+#define RGB565TOARGB                                                           \
+    "vmovn.u16  d4, q0                         \n"  /* B xxxBBBBB           */ \
+    "vshrn.u16  d5, q0, #5                     \n"  /* G xxGGGGGG           */ \
+    "vshrn.u16  d6, q0, #8                     \n"  /* R RRRRRxxx           */ \
+    "vshl.u8    d0, d4, #3                     \n"  /* B BBBBB000 upper 5   */ \
+    "vshl.u8    d1, d5, #2                     \n"  /* G GGGGGG00 upper 6   */ \
+    "vbic.u8    d2, d6, d7                     \n"  /* R RRRRR000 upper 5   */ \
+    "vshr.u8    d4, d0, #5                     \n"  /* B 00000BBB lower 3   */ \
+    "vshr.u8    d5, d1, #6                     \n"  /* G 000000GG lower 2   */ \
+    "vshr.u8    d6, d2, #5                     \n"  /* R 00000RRR lower 3   */ \
+    "vorr.u8    q0, q0, q2                     \n"  /* B,G                  */ \
+    "vorr.u8    d2, d2, d6                     \n"  /* R                    */
+
+void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) {
+  asm volatile (
+    "vmov.u8    d3, #255                       \n"  // Alpha
+    "vmov.u8    d7, #7                         \n"  // 5 bit mask
+    ".p2align  2                               \n"
+  "1:                                          \n"
+    "vld1.8     {q0}, [%0]!                    \n"  // load 8 pixels of RGB565.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    RGB565TOARGB
+    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
+    "bgt        1b                             \n"
+  : "+r"(src_rgb565),  // %0
+    "+r"(dst_argb),    // %1
+    "+r"(pix)          // %2
+  :
+  : "memory", "cc", "q0", "q1", "q2", "q3"  // Clobber List
+  );
+}
+#endif  // HAS_RGB565TOARGBROW_NEON
+
 #ifdef HAS_ARGBTORGBAROW_NEON
 void ARGBToRGBARow_NEON(const uint8* src_argb, uint8* dst_rgba, int pix) {
  asm volatile (
@ -1436,9 +1471,9 @@ void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
    ARGBTOARGB4444
    "vst1.8     {q0}, [%1]!                    \n"  // store 8 pixels ARGB4444.
    "bgt        1b                             \n"
-  : "+r"(src_argb),  // %0
+  : "+r"(src_argb),      // %0
    "+r"(dst_argb4444),  // %1
-    "+r"(pix)        // %2
+    "+r"(pix)            // %2
  :
  : "memory", "cc", "q0", "q8", "q9", "q10", "q11"
  );
@ -1447,6 +1482,117 @@ void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,

 #ifdef HAS_ARGBTOYROW_NEON
 void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
+  asm volatile (
+    "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
+    "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
+    "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
+    "vmov.u8    d27, #16                       \n"  // Add 16 constant
+    ".p2align  2                               \n"
+  "1:                                          \n"
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ARGB.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vmull.u8   q2, d0, d24                    \n"  // B
+    "vmlal.u8   q2, d1, d25                    \n"  // G
+    "vmlal.u8   q2, d2, d26                    \n"  // R
+    "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
+    "vqadd.u8   d0, d27                        \n"
+    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+    "bgt        1b                             \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(pix)        // %2
+  :
+  : "memory", "cc", "q0", "q1", "q2", "q12", "q13"
+  );
+}
+#endif  // HAS_ARGBTOYROW_NEON
+
+#ifdef HAS_RGB565TOYROW_NEON
+void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) {
+  asm volatile (
+    "vmov.u8    d7, #7                         \n"  // 5 bit mask
+    "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
+    "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
+    "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
+    "vmov.u8    d27, #16                       \n"  // Add 16 constant
+    ".p2align  2                               \n"
+  "1:                                          \n"
+    "vld1.8     {q0}, [%0]!                    \n"  // load 8 pixels of RGB565.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    RGB565TOARGB
+    "vmull.u8   q2, d0, d24                    \n"  // B
+    "vmlal.u8   q2, d1, d25                    \n"  // G
+    "vmlal.u8   q2, d2, d26                    \n"  // R
+    "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
+    "vqadd.u8   d0, d27                        \n"
+    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+    "bgt        1b                             \n"
+  : "+r"(src_rgb565),  // %0
+    "+r"(dst_y),       // %1
+    "+r"(pix)          // %2
+  :
+  : "memory", "cc", "q0", "q1", "q2", "q3", "q12", "q13"
+  );
+}
+#endif  // HAS_RGB565TOYROW_NEON
+
+
+#ifdef HAS_BGRATOYROW_NEON
+void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) {
+  asm volatile (
+    "vmov.u8    d4, #33                        \n"  // R * 0.2578 coefficient
+    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
+    "vmov.u8    d6, #13                        \n"  // B * 0.1016 coefficient
+    "vmov.u8    d7, #16                        \n"  // Add 16 constant
+    ".p2align  2                               \n"
+  "1:                                          \n"
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of BGRA.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vmull.u8   q8, d1, d4                     \n"  // R
+    "vmlal.u8   q8, d2, d5                     \n"  // G
+    "vmlal.u8   q8, d3, d6                     \n"  // B
+    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
+    "vqadd.u8   d0, d7                         \n"
+    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+    "bgt        1b                             \n"
+  : "+r"(src_bgra),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(pix)        // %2
+  :
+  : "memory", "cc", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
+  );
+}
+#endif  // HAS_BGRATOYROW_NEON
+
+#ifdef HAS_ABGRTOYROW_NEON
+void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) {
+  asm volatile (
+    "vmov.u8    d4, #33                        \n"  // R * 0.2578 coefficient
+    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
+    "vmov.u8    d6, #13                        \n"  // B * 0.1016 coefficient
+    "vmov.u8    d7, #16                        \n"  // Add 16 constant
+    ".p2align  2                               \n"
+  "1:                                          \n"
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ABGR.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vmull.u8   q8, d0, d4                     \n"  // R
+    "vmlal.u8   q8, d1, d5                     \n"  // G
+    "vmlal.u8   q8, d2, d6                     \n"  // B
+    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
+    "vqadd.u8   d0, d7                         \n"
+    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+    "bgt        1b                             \n"
+  : "+r"(src_abgr),  // %0
+    "+r"(dst_y),  // %1
+    "+r"(pix)        // %2
+  :
+  : "memory", "cc", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
+  );
+}
+#endif  // HAS_ABGRTOYROW_NEON
+
+#ifdef HAS_RGBATOYROW_NEON
+void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) {
  asm volatile (
    "vmov.u8    d4, #13                        \n"  // B * 0.1016 coefficient
    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
@ -1454,7 +1600,34 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
    "vmov.u8    d7, #16                        \n"  // Add 16 constant
    ".p2align  2                               \n"
  "1:                                          \n"
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ARGB.
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of RGBA.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vmull.u8   q8, d1, d4                     \n"  // B
+    "vmlal.u8   q8, d2, d5                     \n"  // G
+    "vmlal.u8   q8, d3, d6                     \n"  // R
+    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
+    "vqadd.u8   d0, d7                         \n"
+    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+    "bgt        1b                             \n"
+  : "+r"(src_rgba),  // %0
+    "+r"(dst_y),  // %1
+    "+r"(pix)        // %2
+  :
+  : "memory", "cc", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
+  );
+}
+#endif  // HAS_RGBATOYROW_NEON
+
+#ifdef HAS_RGB24TOYROW_NEON
+void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) {
+  asm volatile (
+    "vmov.u8    d4, #13                        \n"  // B * 0.1016 coefficient
+    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
+    "vmov.u8    d6, #33                        \n"  // R * 0.2578 coefficient
+    "vmov.u8    d7, #16                        \n"  // Add 16 constant
+    ".p2align  2                               \n"
+  "1:                                          \n"
+    "vld3.8     {d0, d1, d2}, [%0]!            \n"  // load 8 pixels of RGB24.
    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
    "vmull.u8   q8, d0, d4                     \n"  // B
    "vmlal.u8   q8, d1, d5                     \n"  // G
@ -1463,14 +1636,41 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
    "vqadd.u8   d0, d7                         \n"
    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
    "bgt        1b                             \n"
-  : "+r"(src_argb),  // %0
+  : "+r"(src_rgb24),  // %0
    "+r"(dst_y),  // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
  );
 }
-#endif  // HAS_ARGBTOYROW_NEON
+#endif  // HAS_RGB24TOYROW_NEON
+
+#ifdef HAS_RAWTOYROW_NEON
+void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) {
+  asm volatile (
+    "vmov.u8    d4, #33                        \n"  // R * 0.2578 coefficient
+    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
+    "vmov.u8    d6, #13                        \n"  // B * 0.1016 coefficient
+    "vmov.u8    d7, #16                        \n"  // Add 16 constant
+    ".p2align  2                               \n"
+  "1:                                          \n"
+    "vld3.8     {d0, d1, d2}, [%0]!            \n"  // load 8 pixels of RAW.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vmull.u8   q8, d0, d4                     \n"  // B
+    "vmlal.u8   q8, d1, d5                     \n"  // G
+    "vmlal.u8   q8, d2, d6                     \n"  // R
+    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
+    "vqadd.u8   d0, d7                         \n"
+    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+    "bgt        1b                             \n"
+  : "+r"(src_raw),  // %0
+    "+r"(dst_y),  // %1
+    "+r"(pix)        // %2
+  :
+  : "memory", "cc", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
+  );
+}
+#endif  // HAS_RAWTOYROW_NEON

 #endif  // __ARM_NEON__

--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@ -925,6 +925,120 @@ void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
  );
 }

+void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
+                          uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (
+    "movdqa    %0,%%xmm4                       \n"
+    "movdqa    %1,%%xmm3                       \n"
+    "movdqa    %2,%%xmm5                       \n"
+  :
+  : "m"(kARGBToU),  // %0
+    "m"(kARGBToV),  // %1
+    "m"(kAddUV128)  // %2
+  );
+  asm volatile (
+    "sub       %1,%2                           \n"
+    ".p2align  4                               \n"
+  "1:                                          \n"
+    "movdqa    (%0),%%xmm0                     \n"
+    "movdqa    0x10(%0),%%xmm1                 \n"
+    "movdqa    0x20(%0),%%xmm2                 \n"
+    "movdqa    0x30(%0),%%xmm6                 \n"
+    "lea       0x40(%0),%0                     \n"
+    "movdqa    %%xmm0,%%xmm7                   \n"
+    "shufps    $0x88,%%xmm1,%%xmm0             \n"
+    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
+    "pavgb     %%xmm7,%%xmm0                   \n"
+    "movdqa    %%xmm2,%%xmm7                   \n"
+    "shufps    $0x88,%%xmm6,%%xmm2             \n"
+    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
+    "pavgb     %%xmm7,%%xmm2                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "movdqa    %%xmm2,%%xmm6                   \n"
+    "pmaddubsw %%xmm4,%%xmm0                   \n"
+    "pmaddubsw %%xmm4,%%xmm2                   \n"
+    "pmaddubsw %%xmm3,%%xmm1                   \n"
+    "pmaddubsw %%xmm3,%%xmm6                   \n"
+    "phaddw    %%xmm2,%%xmm0                   \n"
+    "phaddw    %%xmm6,%%xmm1                   \n"
+    "psraw     $0x8,%%xmm0                     \n"
+    "psraw     $0x8,%%xmm1                     \n"
+    "packsswb  %%xmm1,%%xmm0                   \n"
+    "paddb     %%xmm5,%%xmm0                   \n"
+    "sub       $0x10,%3                        \n"
+    "movlps    %%xmm0,(%1)                     \n"
+    "movhps    %%xmm0,(%1,%2,1)                \n"
+    "lea       0x8(%1),%1                      \n"
+    "jg        1b                              \n"
+  : "+r"(src_argb0),       // %0
+    "+r"(dst_u),           // %1
+    "+r"(dst_v),           // %2
+    "+rm"(width)           // %3
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+#endif
+  );
+}
+
+void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0,
+                                    uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (
+    "movdqa    %0,%%xmm4                       \n"
+    "movdqa    %1,%%xmm3                       \n"
+    "movdqa    %2,%%xmm5                       \n"
+  :
+  : "m"(kARGBToU),  // %0
+    "m"(kARGBToV),  // %1
+    "m"(kAddUV128)  // %2
+  );
+  asm volatile (
+    "sub       %1,%2                           \n"
+    ".p2align  4                               \n"
+  "1:                                          \n"
+    "movdqu    (%0),%%xmm0                     \n"
+    "movdqu    0x10(%0),%%xmm1                 \n"
+    "movdqu    0x20(%0),%%xmm2                 \n"
+    "movdqu    0x30(%0),%%xmm6                 \n"
+    "lea       0x40(%0),%0                     \n"
+    "movdqa    %%xmm0,%%xmm7                   \n"
+    "shufps    $0x88,%%xmm1,%%xmm0             \n"
+    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
+    "pavgb     %%xmm7,%%xmm0                   \n"
+    "movdqa    %%xmm2,%%xmm7                   \n"
+    "shufps    $0x88,%%xmm6,%%xmm2             \n"
+    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
+    "pavgb     %%xmm7,%%xmm2                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "movdqa    %%xmm2,%%xmm6                   \n"
+    "pmaddubsw %%xmm4,%%xmm0                   \n"
+    "pmaddubsw %%xmm4,%%xmm2                   \n"
+    "pmaddubsw %%xmm3,%%xmm1                   \n"
+    "pmaddubsw %%xmm3,%%xmm6                   \n"
+    "phaddw    %%xmm2,%%xmm0                   \n"
+    "phaddw    %%xmm6,%%xmm1                   \n"
+    "psraw     $0x8,%%xmm0                     \n"
+    "psraw     $0x8,%%xmm1                     \n"
+    "packsswb  %%xmm1,%%xmm0                   \n"
+    "paddb     %%xmm5,%%xmm0                   \n"
+    "sub       $0x10,%3                        \n"
+    "movlps    %%xmm0,(%1)                     \n"
+    "movhps    %%xmm0,(%1,%2,1)                \n"
+    "lea       0x8(%1),%1                      \n"
+    "jg        1b                              \n"
+  : "+r"(src_argb0),       // %0
+    "+r"(dst_u),           // %1
+    "+r"(dst_v),           // %2
+    "+rm"(width)           // %3
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+#endif
+  );
+}
+
 void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa    %4,%%xmm5                       \n"
@ -1652,7 +1766,7 @@ struct {
 void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
-                                uint8* argb_buf,
+                                uint8* dst_argb,
                                int width) {
  asm volatile (
    "sub       %[u_buf],%[v_buf]               \n"
@ -1688,7 +1802,7 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
 void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
                                 const uint8* u_buf,
                                 const uint8* v_buf,
-                                 uint8* rgb24_buf,
+                                 uint8* dst_rgb24,
                                 int width) {
 // fpic 32 bit gcc 4.2 on OSX runs out of GPR regs.
 #ifdef __APPLE__
@ -1743,7 +1857,7 @@ void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
 void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf,
                               const uint8* u_buf,
                               const uint8* v_buf,
-                               uint8* raw_buf,
+                               uint8* dst_raw,
                               int width) {
 // fpic 32 bit gcc 4.2 on OSX runs out of GPR regs.
 #ifdef __APPLE__
@ -1798,7 +1912,7 @@ void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf,
 void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
-                                uint8* argb_buf,
+                                uint8* dst_argb,
                                int width) {
  asm volatile (
    "sub       %[u_buf],%[v_buf]               \n"
@ -1834,7 +1948,7 @@ void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
 void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
-                                uint8* argb_buf,
+                                uint8* dst_argb,
                                int width) {
  asm volatile (
    "sub       %[u_buf],%[v_buf]               \n"
@ -1869,7 +1983,7 @@ void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,

 void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* uv_buf,
-                                uint8* argb_buf,
+                                uint8* dst_argb,
                                int width) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
@ -1901,8 +2015,8 @@ void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
 }

 void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
-                                const uint8* vu_buf,
-                                uint8* argb_buf,
+                                const uint8* src_vu,
+                                uint8* dst_argb,
                                int width) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
@ -1936,7 +2050,7 @@ void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
 void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                          const uint8* u_buf,
                                          const uint8* v_buf,
-                                          uint8* argb_buf,
+                                          uint8* dst_argb,
                                          int width) {
  asm volatile (
    "sub       %[u_buf],%[v_buf]               \n"
@ -1972,7 +2086,7 @@ void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
 void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                          const uint8* u_buf,
                                          const uint8* v_buf,
-                                          uint8* argb_buf,
+                                          uint8* dst_argb,
                                          int width) {
  asm volatile (
    "sub       %[u_buf],%[v_buf]               \n"
@ -2008,7 +2122,7 @@ void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
 void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                          const uint8* u_buf,
                                          const uint8* v_buf,
-                                          uint8* argb_buf,
+                                          uint8* dst_argb,
                                          int width) {
  asm volatile (
    "sub       %[u_buf],%[v_buf]               \n"
@ -2043,7 +2157,7 @@ void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,

 void OMITFP NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                          const uint8* uv_buf,
-                                          uint8* argb_buf,
+                                          uint8* dst_argb,
                                          int width) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
@ -2075,8 +2189,8 @@ void OMITFP NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
 }

 void OMITFP NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
-                                          const uint8* vu_buf,
-                                          uint8* argb_buf,
+                                          const uint8* src_vu,
+                                          uint8* dst_argb,
                                          int width) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
@ -2110,7 +2224,7 @@ void OMITFP NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
 void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
-                                uint8* bgra_buf,
+                                uint8* dst_bgra,
                                int width) {
  asm volatile (
    "sub       %[u_buf],%[v_buf]               \n"
@ -2147,7 +2261,7 @@ void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,
 void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
-                                uint8* abgr_buf,
+                                uint8* dst_abgr,
                                int width) {
  asm volatile (
    "sub       %[u_buf],%[v_buf]               \n"
@ -2183,7 +2297,7 @@ void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
 void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
-                                uint8* rgba_buf,
+                                uint8* dst_rgba,
                                int width) {
  asm volatile (
    "sub       %[u_buf],%[v_buf]               \n"
@ -2220,7 +2334,7 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
 void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
                                          const uint8* u_buf,
                                          const uint8* v_buf,
-                                          uint8* bgra_buf,
+                                          uint8* dst_bgra,
                                          int width) {
  asm volatile (
    "sub       %[u_buf],%[v_buf]               \n"
@ -2257,7 +2371,7 @@ void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
 void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
                                          const uint8* u_buf,
                                          const uint8* v_buf,
-                                          uint8* abgr_buf,
+                                          uint8* dst_abgr,
                                          int width) {
  asm volatile (
    "sub       %[u_buf],%[v_buf]               \n"
@ -2293,7 +2407,7 @@ void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
 void OMITFP I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
                                          const uint8* u_buf,
                                          const uint8* v_buf,
-                                          uint8* rgba_buf,
+                                          uint8* dst_rgba,
                                          int width) {
  asm volatile (
    "sub       %[u_buf],%[v_buf]               \n"
@ -2446,7 +2560,7 @@ void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
 CONST uvec8 kShuffleMirrorUV = {
  14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
 };
-void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
+void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
                       int width) {
  intptr_t temp_width = static_cast<intptr_t>(width);
  asm volatile (
--- a/source/row_win.cc
+++ b/source/row_win.cc
@ -1101,6 +1101,124 @@ __asm {
  }
 }

+__declspec(naked) __declspec(align(16))
+void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
+                          uint8* dst_u, uint8* dst_v, int width) {
+__asm {
+    push       edi
+    mov        eax, [esp + 4 + 4]   // src_argb
+    mov        edx, [esp + 4 + 8]   // dst_u
+    mov        edi, [esp + 4 + 12]  // dst_v
+    mov        ecx, [esp + 4 + 16]  // pix
+    movdqa     xmm7, kARGBToU
+    movdqa     xmm6, kARGBToV
+    movdqa     xmm5, kAddUV128
+    sub        edi, edx             // stride from u to v
+
+    align      16
+ convertloop:
+    /* step 1 - subsample 16x2 argb pixels to 8x1 */
+    movdqa     xmm0, [eax]
+    movdqa     xmm1, [eax + 16]
+    movdqa     xmm2, [eax + 32]
+    movdqa     xmm3, [eax + 48]
+    lea        eax,  [eax + 64]
+    movdqa     xmm4, xmm0
+    shufps     xmm0, xmm1, 0x88
+    shufps     xmm4, xmm1, 0xdd
+    pavgb      xmm0, xmm4
+    movdqa     xmm4, xmm2
+    shufps     xmm2, xmm3, 0x88
+    shufps     xmm4, xmm3, 0xdd
+    pavgb      xmm2, xmm4
+
+    // step 2 - convert to U and V
+    // from here down is very similar to Y code except
+    // instead of 16 different pixels, its 8 pixels of U and 8 of V
+    movdqa     xmm1, xmm0
+    movdqa     xmm3, xmm2
+    pmaddubsw  xmm0, xmm7  // U
+    pmaddubsw  xmm2, xmm7
+    pmaddubsw  xmm1, xmm6  // V
+    pmaddubsw  xmm3, xmm6
+    phaddw     xmm0, xmm2
+    phaddw     xmm1, xmm3
+    psraw      xmm0, 8
+    psraw      xmm1, 8
+    packsswb   xmm0, xmm1
+    paddb      xmm0, xmm5            // -> unsigned
+
+    // step 3 - store 8 U and 8 V values
+    sub        ecx, 16
+    movlps     qword ptr [edx], xmm0 // U
+    movhps     qword ptr [edx + edi], xmm0 // V
+    lea        edx, [edx + 8]
+    jg         convertloop
+
+    pop        edi
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0,
+                                    uint8* dst_u, uint8* dst_v, int width) {
+__asm {
+    push       edi
+    mov        eax, [esp + 4 + 4]   // src_argb
+    mov        edx, [esp + 4 + 8]   // dst_u
+    mov        edi, [esp + 4 + 12]  // dst_v
+    mov        ecx, [esp + 4 + 16]  // pix
+    movdqa     xmm7, kARGBToU
+    movdqa     xmm6, kARGBToV
+    movdqa     xmm5, kAddUV128
+    sub        edi, edx             // stride from u to v
+
+    align      16
+ convertloop:
+    /* step 1 - subsample 16x2 argb pixels to 8x1 */
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm3, [eax + 48]
+    lea        eax,  [eax + 64]
+    movdqa     xmm4, xmm0
+    shufps     xmm0, xmm1, 0x88
+    shufps     xmm4, xmm1, 0xdd
+    pavgb      xmm0, xmm4
+    movdqa     xmm4, xmm2
+    shufps     xmm2, xmm3, 0x88
+    shufps     xmm4, xmm3, 0xdd
+    pavgb      xmm2, xmm4
+
+    // step 2 - convert to U and V
+    // from here down is very similar to Y code except
+    // instead of 16 different pixels, its 8 pixels of U and 8 of V
+    movdqa     xmm1, xmm0
+    movdqa     xmm3, xmm2
+    pmaddubsw  xmm0, xmm7  // U
+    pmaddubsw  xmm2, xmm7
+    pmaddubsw  xmm1, xmm6  // V
+    pmaddubsw  xmm3, xmm6
+    phaddw     xmm0, xmm2
+    phaddw     xmm1, xmm3
+    psraw      xmm0, 8
+    psraw      xmm1, 8
+    packsswb   xmm0, xmm1
+    paddb      xmm0, xmm5            // -> unsigned
+
+    // step 3 - store 8 U and 8 V values
+    sub        ecx, 16
+    movlps     qword ptr [edx], xmm0 // U
+    movhps     qword ptr [edx + edi], xmm0 // V
+    lea        edx, [edx + 8]
+    jg         convertloop
+
+    pop        edi
+    ret
+  }
+}
+
 __declspec(naked) __declspec(align(16))
 void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
@ -1656,7 +1774,7 @@ __declspec(naked) __declspec(align(16))
 void I444ToARGBRow_SSSE3(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
-                         uint8* argb_buf,
+                         uint8* dst_argb,
                         int width) {
  __asm {
    push       esi
@ -1699,7 +1817,7 @@ __declspec(naked) __declspec(align(16))
 void I422ToRGB24Row_SSSE3(const uint8* y_buf,
                          const uint8* u_buf,
                          const uint8* v_buf,
-                          uint8* rgb24_buf,
+                          uint8* dst_rgb24,
                          int width) {
  __asm {
    push       esi
@ -1746,7 +1864,7 @@ __declspec(naked) __declspec(align(16))
 void I422ToRAWRow_SSSE3(const uint8* y_buf,
                        const uint8* u_buf,
                        const uint8* v_buf,
-                        uint8* raw_buf,
+                        uint8* dst_raw,
                        int width) {
  __asm {
    push       esi
@ -1866,7 +1984,7 @@ __declspec(naked) __declspec(align(16))
 void I422ToARGBRow_SSSE3(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
-                         uint8* argb_buf,
+                         uint8* dst_argb,
                         int width) {
  __asm {
    push       esi
@ -1910,7 +2028,7 @@ __declspec(naked) __declspec(align(16))
 void I411ToARGBRow_SSSE3(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
-                         uint8* argb_buf,
+                         uint8* dst_argb,
                         int width) {
  __asm {
    push       esi
@ -1952,7 +2070,7 @@ void I411ToARGBRow_SSSE3(const uint8* y_buf,
 __declspec(naked) __declspec(align(16))
 void NV12ToARGBRow_SSSE3(const uint8* y_buf,
                         const uint8* uv_buf,
-                         uint8* argb_buf,
+                         uint8* dst_argb,
                         int width) {
  __asm {
    push       esi
@ -1990,7 +2108,7 @@ void NV12ToARGBRow_SSSE3(const uint8* y_buf,
 __declspec(naked) __declspec(align(16))
 void NV21ToARGBRow_SSSE3(const uint8* y_buf,
                         const uint8* uv_buf,
-                         uint8* argb_buf,
+                         uint8* dst_argb,
                         int width) {
  __asm {
    push       esi
@ -2029,7 +2147,7 @@ __declspec(naked) __declspec(align(16))
 void I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                   const uint8* u_buf,
                                   const uint8* v_buf,
-                                   uint8* argb_buf,
+                                   uint8* dst_argb,
                                   int width) {
  __asm {
    push       esi
@ -2072,7 +2190,7 @@ __declspec(naked) __declspec(align(16))
 void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                   const uint8* u_buf,
                                   const uint8* v_buf,
-                                   uint8* argb_buf,
+                                   uint8* dst_argb,
                                   int width) {
  __asm {
    push       esi
@ -2116,7 +2234,7 @@ __declspec(naked) __declspec(align(16))
 void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                   const uint8* u_buf,
                                   const uint8* v_buf,
-                                   uint8* argb_buf,
+                                   uint8* dst_argb,
                                   int width) {
  __asm {
    push       esi
@ -2158,7 +2276,7 @@ void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
 __declspec(naked) __declspec(align(16))
 void NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                   const uint8* uv_buf,
-                                   uint8* argb_buf,
+                                   uint8* dst_argb,
                                   int width) {
  __asm {
    push       esi
@ -2196,7 +2314,7 @@ void NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
 __declspec(naked) __declspec(align(16))
 void NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                   const uint8* uv_buf,
-                                   uint8* argb_buf,
+                                   uint8* dst_argb,
                                   int width) {
  __asm {
    push       esi
@ -2233,7 +2351,7 @@ __declspec(naked) __declspec(align(16))
 void I422ToBGRARow_SSSE3(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
-                         uint8* bgra_buf,
+                         uint8* dst_bgra,
                         int width) {
  __asm {
    push       esi
@ -2274,7 +2392,7 @@ __declspec(naked) __declspec(align(16))
 void I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
                                   const uint8* u_buf,
                                   const uint8* v_buf,
-                                   uint8* bgra_buf,
+                                   uint8* dst_bgra,
                                   int width) {
  __asm {
    push       esi
@ -2315,7 +2433,7 @@ __declspec(naked) __declspec(align(16))
 void I422ToABGRRow_SSSE3(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
-                         uint8* abgr_buf,
+                         uint8* dst_abgr,
                         int width) {
  __asm {
    push       esi
@ -2356,7 +2474,7 @@ __declspec(naked) __declspec(align(16))
 void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
                                   const uint8* u_buf,
                                   const uint8* v_buf,
-                                   uint8* abgr_buf,
+                                   uint8* dst_abgr,
                                   int width) {
  __asm {
    push       esi
@ -2397,7 +2515,7 @@ __declspec(naked) __declspec(align(16))
 void I422ToRGBARow_SSSE3(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
-                         uint8* rgba_buf,
+                         uint8* dst_rgba,
                         int width) {
  __asm {
    push       esi
@ -2438,7 +2556,7 @@ __declspec(naked) __declspec(align(16))
 void I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
                                   const uint8* u_buf,
                                   const uint8* v_buf,
-                                   uint8* rgba_buf,
+                                   uint8* dst_rgba,
                                   int width) {
  __asm {
    push       esi
@ -2591,7 +2709,7 @@ static const uvec8 kShuffleMirrorUV = {
 };

 __declspec(naked) __declspec(align(16))
-void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
+void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
                       int width) {
  __asm {
    push      edi
--- a/unit_test/convert_test.cc
+++ b/unit_test/convert_test.cc
@ -220,7 +220,7 @@ TEST_F(libyuvTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) {                        \
                       FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y)                       \
    TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y,              \
                    FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y,                          \
-                    benchmark_width_ - 4, _Any, +, 0)                         \
+                    benchmark_width_ - 4, _Any, +, 0)                          \
    TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y,              \
                    FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y,                          \
                    benchmark_width_, _Unaligned, +, 1)                        \
@ -614,10 +614,9 @@ TESTATOPLANAR(RGB24, 3, I420, 2, 2)
 TESTATOPLANAR(RGB565, 2, I420, 2, 2)
 TESTATOPLANAR(ARGB1555, 2, I420, 2, 2)
 TESTATOPLANAR(ARGB4444, 2, I420, 2, 2)
-// TESTATOPLANAR(ARGB, 4, I411, 4, 1)
+TESTATOPLANAR(ARGB, 4, I411, 4, 1)
 TESTATOPLANAR(ARGB, 4, I422, 2, 1)
-// TESTATOPLANAR(ARGB, 4, I444, 1, 1)
-// TODO(fbarchard): Implement and test 411 and 444
+TESTATOPLANAR(ARGB, 4, I444, 1, 1)
 TESTATOPLANAR(V210, 16 / 6, I420, 2, 2)
 TESTATOPLANAR(YUY2, 2, I420, 2, 2)
 TESTATOPLANAR(UYVY, 2, I420, 2, 2)
@ -629,30 +628,103 @@ TESTATOPLANAR(BayerRGGB, 1, I420, 2, 2)
 TESTATOPLANAR(BayerGBRG, 1, I420, 2, 2)
 TESTATOPLANAR(BayerGRBG, 1, I420, 2, 2)

-#define TESTATOBI(FMT_A, BPP_A, STRIDE_A, FMT_B, BPP_B, W1280, DIFF,           \
-                  N, NEG, OFF)                                                 \
+#define TESTATOBIPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y,       \
+                       W1280, N, NEG, OFF)                                     \
+TEST_F(libyuvTest, FMT_A##To##FMT_PLANAR##N) {                                 \
+  const int kWidth = W1280;                                                    \
+  const int kHeight = benchmark_height_;                                       \
+  const int kStride = (kWidth * 8 * BPP_A + 7) / 8;                            \
+  align_buffer_16(src_argb, kStride * kHeight + OFF);                          \
+  align_buffer_16(dst_y_c, kWidth * kHeight);                                  \
+  align_buffer_16(dst_uv_c, kWidth / SUBSAMP_X * 2 * kHeight / SUBSAMP_Y);     \
+  align_buffer_16(dst_y_opt, kWidth * kHeight);                                \
+  align_buffer_16(dst_uv_opt, kWidth / SUBSAMP_X * 2 * kHeight / SUBSAMP_Y);   \
+  srandom(time(NULL));                                                         \
+  for (int i = 0; i < kHeight; ++i)                                            \
+    for (int j = 0; j < kStride; ++j)                                          \
+      src_argb[(i * kStride) + j + OFF] = (random() & 0xff);                   \
+  MaskCpuFlags(0);                                                             \
+  FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride,                               \
+                        dst_y_c, kWidth,                                       \
+                        dst_uv_c, kWidth / SUBSAMP_X * 2,                      \
+                        kWidth, NEG kHeight);                                  \
+  MaskCpuFlags(-1);                                                            \
+  for (int i = 0; i < benchmark_iterations_; ++i) {                            \
+    FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride,                             \
+                          dst_y_opt, kWidth,                                   \
+                          dst_uv_opt, kWidth / SUBSAMP_X * 2,                  \
+                          kWidth, NEG kHeight);                                \
+  }                                                                            \
+  int max_diff = 0;                                                            \
+  for (int i = 0; i < kHeight; ++i) {                                          \
+    for (int j = 0; j < kWidth; ++j) {                                         \
+      int abs_diff =                                                           \
+          abs(static_cast<int>(dst_y_c[i * kWidth + j]) -                      \
+              static_cast<int>(dst_y_opt[i * kWidth + j]));                    \
+      if (abs_diff > max_diff) {                                               \
+        max_diff = abs_diff;                                                   \
+      }                                                                        \
+    }                                                                          \
+  }                                                                            \
+  EXPECT_LE(max_diff, 2);                                                      \
+  for (int i = 0; i < kHeight / SUBSAMP_Y; ++i) {                              \
+    for (int j = 0; j < kWidth / SUBSAMP_X * 2; ++j) {                         \
+      int abs_diff =                                                           \
+          abs(static_cast<int>(dst_uv_c[i * kWidth / SUBSAMP_X * 2 + j]) -      \
+              static_cast<int>(dst_uv_opt[i * kWidth / SUBSAMP_X * 2 + j]));    \
+      if (abs_diff > max_diff) {                                               \
+        max_diff = abs_diff;                                                   \
+      }                                                                        \
+    }                                                                          \
+  }                                                                            \
+  EXPECT_LE(max_diff, 2);                                                      \
+  free_aligned_buffer_16(dst_y_c)                                              \
+  free_aligned_buffer_16(dst_uv_c)                                             \
+  free_aligned_buffer_16(dst_y_opt)                                            \
+  free_aligned_buffer_16(dst_uv_opt)                                           \
+  free_aligned_buffer_16(src_argb)                                             \
+}
+
+#define TESTATOBIPLANAR(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y)        \
+    TESTATOBIPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y,           \
+                   benchmark_width_ - 4, _Any, +, 0)                           \
+    TESTATOBIPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y,           \
+                   benchmark_width_, _Unaligned, +, 1)                         \
+    TESTATOBIPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y,           \
+                   benchmark_width_, _Invert, -, 0)                            \
+    TESTATOBIPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y,           \
+                   benchmark_width_, _Opt, +, 0)
+
+TESTATOBIPLANAR(ARGB, 4, NV12, 2, 2)
+TESTATOBIPLANAR(ARGB, 4, NV21, 2, 2)
+
+#define TESTATOBI(FMT_A, BPP_A, STRIDE_A,                                      \
+                  FMT_B, BPP_B, STRIDE_B,                                      \
+                  W1280, DIFF, N, NEG, OFF)                                    \
 TEST_F(libyuvTest, FMT_A##To##FMT_B##N) {                                      \
  const int kWidth = W1280;                                                    \
  const int kHeight = benchmark_height_;                                       \
-  align_buffer_16(src_argb, (kWidth * BPP_A) * kHeight + OFF);                 \
-  align_buffer_16(dst_argb_c, (kWidth * BPP_B) * kHeight);                     \
-  align_buffer_16(dst_argb_opt, (kWidth * BPP_B) * kHeight);                   \
+  const int kStrideA = (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A;  \
+  const int kStrideB = (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B;  \
+  align_buffer_16(src_argb, kStrideA * kHeight + OFF);                         \
+  align_buffer_16(dst_argb_c, kStrideB * kHeight);                             \
+  align_buffer_16(dst_argb_opt, kStrideB * kHeight);                           \
  srandom(time(NULL));                                                         \
-  for (int i = 0; i < kHeight * kWidth * BPP_A; ++i) {                         \
+  for (int i = 0; i < kStrideA * kHeight; ++i) {                               \
    src_argb[i + OFF] = (random() & 0xff);                                     \
  }                                                                            \
  MaskCpuFlags(0);                                                             \
-  FMT_A##To##FMT_B(src_argb + OFF, kWidth * STRIDE_A,                          \
-                   dst_argb_c, kWidth * BPP_B,                                 \
+  FMT_A##To##FMT_B(src_argb + OFF, kStrideA,                                   \
+                   dst_argb_c, kStrideB,                                       \
                   kWidth, NEG kHeight);                                       \
  MaskCpuFlags(-1);                                                            \
  for (int i = 0; i < benchmark_iterations_; ++i) {                            \
-    FMT_A##To##FMT_B(src_argb + OFF, kWidth * STRIDE_A,                        \
-                     dst_argb_opt, kWidth * BPP_B,                             \
+    FMT_A##To##FMT_B(src_argb + OFF, kStrideA,                                 \
+                     dst_argb_opt, kStrideB,                                   \
                     kWidth, NEG kHeight);                                     \
  }                                                                            \
  int max_diff = 0;                                                            \
-  for (int i = 0; i < kHeight * kWidth * BPP_B; ++i) {                         \
+  for (int i = 0; i < kStrideB * kHeight; ++i) {                               \
    int abs_diff =                                                             \
        abs(static_cast<int>(dst_argb_c[i]) -                                  \
            static_cast<int>(dst_argb_opt[i]));                                \
@ -665,65 +737,26 @@ TEST_F(libyuvTest, FMT_A##To##FMT_B##N) {                                      \
  free_aligned_buffer_16(dst_argb_c)                                           \
  free_aligned_buffer_16(dst_argb_opt)                                         \
 }
-#define TESTATOB(FMT_A, BPP_A, STRIDE_A, FMT_B, BPP_B, DIFF)                   \
-    TESTATOBI(FMT_A, BPP_A, STRIDE_A, FMT_B, BPP_B, benchmark_width_, DIFF,    \
-              _Any, +, 0)                                                      \
-    TESTATOBI(FMT_A, BPP_A, STRIDE_A, FMT_B, BPP_B, benchmark_width_, DIFF,    \
-              _Unaligned, +, 1)                                                \
-    TESTATOBI(FMT_A, BPP_A, STRIDE_A, FMT_B, BPP_B, benchmark_width_, DIFF,    \
-              _Invert, -, 0)                                                   \
-    TESTATOBI(FMT_A, BPP_A, STRIDE_A, FMT_B, BPP_B, benchmark_width_, DIFF,    \
-              _Opt, +, 0)

-TESTATOB(ARGB, 4, 4, ARGB, 4, 0)
-TESTATOB(ARGB, 4, 4, BGRA, 4, 0)
-TESTATOB(ARGB, 4, 4, ABGR, 4, 0)
-TESTATOB(ARGB, 4, 4, RGBA, 4, 0)
-TESTATOB(ARGB, 4, 4, RAW, 3, 0)
-TESTATOB(ARGB, 4, 4, RGB24, 3, 0)
-TESTATOB(ARGB, 4, 4, RGB565, 2, 0)
-TESTATOB(ARGB, 4, 4, ARGB1555, 2, 0)
-TESTATOB(ARGB, 4, 4, ARGB4444, 2, 0)
-TESTATOB(ARGB, 4, 4, BayerBGGR, 1, 0)
-TESTATOB(ARGB, 4, 4, BayerRGGB, 1, 0)
-TESTATOB(ARGB, 4, 4, BayerGBRG, 1, 0)
-TESTATOB(ARGB, 4, 4, BayerGRBG, 1, 0)
-TESTATOB(ARGB, 4, 4, I400, 1, 2)
-TESTATOB(BGRA, 4, 4, ARGB, 4, 0)
-TESTATOB(ABGR, 4, 4, ARGB, 4, 0)
-TESTATOB(RGBA, 4, 4, ARGB, 4, 0)
-TESTATOB(RAW, 3, 3, ARGB, 4, 0)
-TESTATOB(RGB24, 3, 3, ARGB, 4, 0)
-TESTATOB(RGB565, 2, 2, ARGB, 4, 0)
-TESTATOB(ARGB1555, 2, 2, ARGB, 4, 0)
-TESTATOB(ARGB4444, 2, 2, ARGB, 4, 0)
-TESTATOB(YUY2, 2, 2, ARGB, 4, 0)
-TESTATOB(UYVY, 2, 2, ARGB, 4, 0)
-TESTATOB(M420, 3 / 2, 1, ARGB, 4, 0)
-TESTATOB(BayerBGGR, 1, 1, ARGB, 4, 0)
-TESTATOB(BayerRGGB, 1, 1, ARGB, 4, 0)
-TESTATOB(BayerGBRG, 1, 1, ARGB, 4, 0)
-TESTATOB(BayerGRBG, 1, 1, ARGB, 4, 0)
-TESTATOB(I400, 1, 1, ARGB, 4, 0)
-TESTATOB(I400, 1, 1, I400, 1, 0)
-TESTATOB(I400, 1, 1, I400Mirror, 1, 0)
-TESTATOB(Y, 1, 1, ARGB, 4, 0)
-TESTATOB(ARGB, 4, 4, ARGBMirror, 4, 0)
-
-#define TESTATOBRANDOM(FMT_A, BPP_A, STRIDE_A, FMT_B, BPP_B, STRIDE_B, DIFF)   \
+#define TESTATOBRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A,                       \
+                       FMT_B, BPP_B, STRIDE_B, HEIGHT_B, DIFF)                 \
 TEST_F(libyuvTest, FMT_A##To##FMT_B##_Random) {                                \
  srandom(time(NULL));                                                         \
  for (int times = 0; times < benchmark_iterations_; ++times) {                \
    const int kWidth = (random() & 63) + 1;                                    \
    const int kHeight = (random() & 31) + 1;                                   \
+    const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A;       \
+    const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B;       \
    const int kStrideA = (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A;\
    const int kStrideB = (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B;\
-    align_buffer_page_end(src_argb, kStrideA * kHeight);                       \
-    align_buffer_page_end(dst_argb_c, kStrideB * kHeight);                     \
-    align_buffer_page_end(dst_argb_opt, kStrideB * kHeight);                   \
-    for (int i = 0; i < kStrideA * kHeight; ++i) {                             \
+    align_buffer_page_end(src_argb, kStrideA * kHeightA);                      \
+    align_buffer_page_end(dst_argb_c, kStrideB * kHeightB);                    \
+    align_buffer_page_end(dst_argb_opt, kStrideB * kHeightB);                  \
+    for (int i = 0; i < kStrideA * kHeightA; ++i) {                            \
      src_argb[i] = (random() & 0xff);                                         \
    }                                                                          \
+    memset(dst_argb_c, 0, kStrideB * kHeightB);                                \
+    memset(dst_argb_opt, 0, kStrideB * kHeightB);                              \
    MaskCpuFlags(0);                                                           \
    FMT_A##To##FMT_B(src_argb, kStrideA,                                       \
                     dst_argb_c, kStrideB,                                     \
@ -733,7 +766,7 @@ TEST_F(libyuvTest, FMT_A##To##FMT_B##_Random) {                                \
                     dst_argb_opt, kStrideB,                                   \
                     kWidth, kHeight);                                         \
    int max_diff = 0;                                                          \
-    for (int i = 0; i < kStrideB * kHeight; ++i) {                             \
+    for (int i = 0; i < kStrideB * kHeightB; ++i) {                            \
      int abs_diff =                                                           \
          abs(static_cast<int>(dst_argb_c[i]) -                                \
              static_cast<int>(dst_argb_opt[i]));                              \
@ -748,33 +781,58 @@ TEST_F(libyuvTest, FMT_A##To##FMT_B##_Random) {                                \
  }                                                                            \
 }

-TESTATOBRANDOM(ARGB, 4, 4, ARGB, 4, 4, 0)
-TESTATOBRANDOM(ARGB, 4, 4, BGRA, 4, 4, 0)
-TESTATOBRANDOM(ARGB, 4, 4, ABGR, 4, 4, 0)
-TESTATOBRANDOM(ARGB, 4, 4, RGBA, 4, 4, 0)
-TESTATOBRANDOM(ARGB, 4, 4, RAW, 3, 3, 0)
-TESTATOBRANDOM(ARGB, 4, 4, RGB24, 3, 3, 0)
-TESTATOBRANDOM(ARGB, 4, 4, RGB565, 2, 2, 0)
-TESTATOBRANDOM(ARGB, 4, 4, ARGB1555, 2, 2, 0)
-TESTATOBRANDOM(ARGB, 4, 4, ARGB4444, 2, 2, 0)
-TESTATOBRANDOM(ARGB, 4, 4, I400, 1, 1, 2)
-// TODO(fbarchard): Implement YUY2
-// TESTATOBRANDOM(ARGB, 4, 4, YUY2, 4, 2, 0)
-// TESTATOBRANDOM(ARGB, 4, 4, UYVY, 4, 2, 0)
-TESTATOBRANDOM(BGRA, 4, 4, ARGB, 4, 4, 0)
-TESTATOBRANDOM(ABGR, 4, 4, ARGB, 4, 4, 0)
-TESTATOBRANDOM(RGBA, 4, 4, ARGB, 4, 4, 0)
-TESTATOBRANDOM(RAW, 3, 3, ARGB, 4, 4, 0)
-TESTATOBRANDOM(RGB24, 3, 3, ARGB, 4, 4, 0)
-TESTATOBRANDOM(RGB565, 2, 2, ARGB, 4, 4, 0)
-TESTATOBRANDOM(ARGB1555, 2, 2, ARGB, 4, 4, 0)
-TESTATOBRANDOM(ARGB4444, 2, 2, ARGB, 4, 4, 0)
-TESTATOBRANDOM(I400, 1, 1, ARGB, 4, 4, 0)
-TESTATOBRANDOM(YUY2, 4, 2, ARGB, 4, 4, 0)
-TESTATOBRANDOM(UYVY, 4, 2, ARGB, 4, 4, 0)
-TESTATOBRANDOM(I400, 1, 1, I400, 1, 1, 0)
-TESTATOBRANDOM(I400, 1, 1, I400Mirror, 1, 1, 0)
-TESTATOBRANDOM(ARGB, 4, 4, ARGBMirror, 4, 4, 0)
+#define TESTATOB(FMT_A, BPP_A, STRIDE_A, HEIGHT_A,                             \
+                 FMT_B, BPP_B, STRIDE_B, HEIGHT_B, DIFF)                       \
+    TESTATOBI(FMT_A, BPP_A, STRIDE_A,                                          \
+              FMT_B, BPP_B, STRIDE_B,                                          \
+              benchmark_width_, DIFF, _Any, +, 0)                              \
+    TESTATOBI(FMT_A, BPP_A, STRIDE_A,                                          \
+              FMT_B, BPP_B, STRIDE_B,                                          \
+              benchmark_width_, DIFF, _Unaligned, +, 1)                        \
+    TESTATOBI(FMT_A, BPP_A, STRIDE_A,                                          \
+              FMT_B, BPP_B, STRIDE_B,                                          \
+              benchmark_width_, DIFF, _Invert, -, 0)                           \
+    TESTATOBI(FMT_A, BPP_A, STRIDE_A,                                          \
+              FMT_B, BPP_B, STRIDE_B,                                          \
+              benchmark_width_, DIFF, _Opt, +, 0)                              \
+    TESTATOBRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A,                           \
+                   FMT_B, BPP_B, STRIDE_B, HEIGHT_B, DIFF)
+
+TESTATOB(ARGB, 4, 4, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(ARGB, 4, 4, 1, BGRA, 4, 4, 1, 0)
+TESTATOB(ARGB, 4, 4, 1, ABGR, 4, 4, 1, 0)
+TESTATOB(ARGB, 4, 4, 1, RGBA, 4, 4, 1, 0)
+TESTATOB(ARGB, 4, 4, 1, RAW, 3, 3, 1, 0)
+TESTATOB(ARGB, 4, 4, 1, RGB24, 3, 3, 1, 0)
+TESTATOB(ARGB, 4, 4, 1, RGB565, 2, 2, 1, 0)
+TESTATOB(ARGB, 4, 4, 1, ARGB1555, 2, 2, 1, 0)
+TESTATOB(ARGB, 4, 4, 1, ARGB4444, 2, 2, 1, 0)
+TESTATOB(ARGB, 4, 4, 1, BayerBGGR, 1, 2, 2, 0)
+TESTATOB(ARGB, 4, 4, 1, BayerRGGB, 1, 2, 2, 0)
+TESTATOB(ARGB, 4, 4, 1, BayerGBRG, 1, 2, 2, 0)
+TESTATOB(ARGB, 4, 4, 1, BayerGRBG, 1, 2, 2, 0)
+TESTATOB(ARGB, 4, 4, 1, YUY2, 2, 4, 1, 2)
+TESTATOB(ARGB, 4, 4, 1, UYVY, 2, 4, 1, 2)
+TESTATOB(ARGB, 4, 4, 1, I400, 1, 1, 1, 2)
+TESTATOB(BGRA, 4, 4, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(ABGR, 4, 4, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(RGBA, 4, 4, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(RAW, 3, 3, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(RGB24, 3, 3, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(RGB565, 2, 2, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(ARGB1555, 2, 2, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(ARGB4444, 2, 2, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(YUY2, 2, 4, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(UYVY, 2, 4, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(BayerBGGR, 1, 2, 2, ARGB, 4, 4, 1, 0)
+TESTATOB(BayerRGGB, 1, 2, 2, ARGB, 4, 4, 1, 0)
+TESTATOB(BayerGBRG, 1, 2, 2, ARGB, 4, 4, 1, 0)
+TESTATOB(BayerGRBG, 1, 2, 2, ARGB, 4, 4, 1, 0)
+TESTATOB(I400, 1, 1, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(I400, 1, 1, 1, I400, 1, 1, 1, 0)
+TESTATOB(I400, 1, 1, 1, I400Mirror, 1, 1, 1, 0)
+TESTATOB(Y, 1, 1, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(ARGB, 4, 4, 1, ARGBMirror, 4, 4, 1, 0)

 TEST_F(libyuvTest, Test565) {
  SIMD_ALIGNED(uint8 orig_pixels[256][4]);