From 9eefb2e8dd2c40a8b6bd0f02d794fe78332fc08f Mon Sep 17 00:00:00 2001
From: "fbarchard@google.com"
 <fbarchard@google.com@16f28f9a-4ce2-e073-06de-1de4eb20be90>
Date: Wed, 18 Jan 2012 23:56:30 +0000
Subject: [PATCH] ARGBToRGB functions optimized BUG=none
 TEST=media_unittest.exe --gunit_catch_exceptions=0 --yuvconverter_repeat=1000
 --gunit_filter=LmiVideoFrameTest.ConvertTo*R* Review URL:
 https://webrtc-codereview.appspot.com/355002

git-svn-id: http://libyuv.googlecode.com/svn/trunk@138 16f28f9a-4ce2-e073-06de-1de4eb20be90
---
 include/libyuv/planar_functions.h |  10 +
 source/planar_functions.cc        | 318 +++++++++++++++++++++---------
 source/row.h                      |  20 ++
 source/row_win.cc                 | 204 +++++++++++++++++++
 4 files changed, 462 insertions(+), 90 deletions(-)

diff --git a/include/libyuv/planar_functions.h b/include/libyuv/planar_functions.h
index 26e9eb275..7a01c5129 100644
--- a/include/libyuv/planar_functions.h
+++ b/include/libyuv/planar_functions.h
@@ -192,6 +192,16 @@ int BGRAToARGB(const uint8* src_bgra, int src_stride_bgra,
                uint8* dst_argb, int dst_stride_argb,
                int width, int height);
 
+// Convert ARGB To RGB24.
+int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
+                uint8* dst_rgb24, int dst_stride_rgb24,
+                int width, int height);
+
+// Convert ARGB To RAW.
+int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
+              uint8* dst_raw, int dst_stride_raw,
+              int width, int height);
+
 // Convert ARGB to I400.
 int ARGBToI400(const uint8* src_argb, int src_stride_argb,
                uint8* dst_y, int dst_stride_y,
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index 20ae26a6c..018b9c11a 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -1633,27 +1633,40 @@ int I420ToRGB24(const uint8* src_y, int src_stride_y,
     dst_argb = dst_argb + (height - 1) * dst_stride_argb;
     dst_stride_argb = -dst_stride_argb;
   }
-  void (*FastConvertYUVToRGB24Row)(const uint8* y_buf,
-                                    const uint8* u_buf,
-                                    const uint8* v_buf,
-                                    uint8* rgb_buf,
-                                    int width);
-#if defined(HAS_FASTCONVERTYUVTORGB24ROW_NEON)
+  void (*FastConvertYUVToARGBRow)(const uint8* y_buf,
+                                  const uint8* u_buf,
+                                  const uint8* v_buf,
+                                  uint8* rgb_buf,
+                                  int width);
+#if defined(HAS_FASTCONVERTYUVTOARGBROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
-    FastConvertYUVToRGB24Row = FastConvertYUVToRGB24Row_NEON;
+    FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_NEON;
   } else
-#elif defined(HAS_FASTCONVERTYUVTORGB24ROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) &&
-      IS_ALIGNED(width, 8) &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-    FastConvertYUVToRGB24Row = FastConvertYUVToRGB24Row_SSSE3;
+#elif defined(HAS_FASTCONVERTYUVTOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
+    FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_SSSE3;
   } else
 #endif
   {
-    FastConvertYUVToRGB24Row = FastConvertYUVToRGB24Row_C;
+    FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_C;
   }
+
+  SIMD_ALIGNED(uint8 row[kMaxStride]);
+  void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int pix);
+#if defined(HAS_ARGBTORGB24ROW_SSSE3_DISABLED)
+  if (TestCpuFlag(kCpuHasSSSE3) &&
+      IS_ALIGNED(width, 16) &&
+      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+    ARGBToRGB24Row = ARGBToRGB24Row_SSSE3;
+  } else
+#endif
+  {
+    ARGBToRGB24Row = ARGBToRGB24Row_C;
+  }
+
   for (int y = 0; y < height; ++y) {
-    FastConvertYUVToRGB24Row(src_y, src_u, src_v, dst_argb, width);
+    FastConvertYUVToARGBRow(src_y, src_u, src_v, row, width);
+    ARGBToRGB24Row(row, dst_argb, width);
     dst_argb += dst_stride_argb;
     src_y += src_stride_y;
     if (y & 1) {
@@ -1666,37 +1679,50 @@ int I420ToRGB24(const uint8* src_y, int src_stride_y,
 
 // Convert I420 to RAW.
 int I420ToRAW(const uint8* src_y, int src_stride_y,
-                 const uint8* src_u, int src_stride_u,
-                 const uint8* src_v, int src_stride_v,
-                 uint8* dst_argb, int dst_stride_argb,
-                 int width, int height) {
+              const uint8* src_u, int src_stride_u,
+              const uint8* src_v, int src_stride_v,
+              uint8* dst_argb, int dst_stride_argb,
+              int width, int height) {
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
     dst_argb = dst_argb + (height - 1) * dst_stride_argb;
     dst_stride_argb = -dst_stride_argb;
   }
-  void (*FastConvertYUVToRAWRow)(const uint8* y_buf,
-                                    const uint8* u_buf,
-                                    const uint8* v_buf,
-                                    uint8* rgb_buf,
-                                    int width);
-#if defined(HAS_FASTCONVERTYUVTORAWROW_NEON)
+  void (*FastConvertYUVToARGBRow)(const uint8* y_buf,
+                                  const uint8* u_buf,
+                                  const uint8* v_buf,
+                                  uint8* rgb_buf,
+                                  int width);
+#if defined(HAS_FASTCONVERTYUVTOARGBROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
-    FastConvertYUVToRAWRow = FastConvertYUVToRAWRow_NEON;
+    FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_NEON;
   } else
-#elif defined(HAS_FASTCONVERTYUVTORAWROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) &&
-      IS_ALIGNED(width, 8) &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-    FastConvertYUVToRAWRow = FastConvertYUVToRAWRow_SSSE3;
+#elif defined(HAS_FASTCONVERTYUVTOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
+    FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_SSSE3;
   } else
 #endif
   {
-    FastConvertYUVToRAWRow = FastConvertYUVToRAWRow_C;
+    FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_C;
   }
+
+  SIMD_ALIGNED(uint8 row[kMaxStride]);
+  void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int pix);
+#if defined(HAS_ARGBTORAWROW_SSSE3_DISABLED)
+  if (TestCpuFlag(kCpuHasSSSE3) &&
+      IS_ALIGNED(width, 16) &&
+      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+    ARGBToRAWRow = ARGBToRAWRow_SSSE3;
+  } else
+#endif
+  {
+    ARGBToRAWRow = ARGBToRAWRow_C;
+  }
+
   for (int y = 0; y < height; ++y) {
-    FastConvertYUVToRAWRow(src_y, src_u, src_v, dst_argb, width);
+    FastConvertYUVToARGBRow(src_y, src_u, src_v, row, width);
+    ARGBToRAWRow(row, dst_argb, width);
     dst_argb += dst_stride_argb;
     src_y += src_stride_y;
     if (y & 1) {
@@ -1719,27 +1745,40 @@ int I420ToRGB565(const uint8* src_y, int src_stride_y,
     dst_argb = dst_argb + (height - 1) * dst_stride_argb;
     dst_stride_argb = -dst_stride_argb;
   }
-  void (*FastConvertYUVToRGB565Row)(const uint8* y_buf,
-                                    const uint8* u_buf,
-                                    const uint8* v_buf,
-                                    uint8* rgb_buf,
-                                    int width);
-#if defined(HAS_FASTCONVERTYUVTORGB565ROW_NEON)
+  void (*FastConvertYUVToARGBRow)(const uint8* y_buf,
+                                  const uint8* u_buf,
+                                  const uint8* v_buf,
+                                  uint8* rgb_buf,
+                                  int width);
+#if defined(HAS_FASTCONVERTYUVTOARGBROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
-    FastConvertYUVToRGB565Row = FastConvertYUVToRGB565Row_NEON;
+    FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_NEON;
   } else
-#elif defined(HAS_FASTCONVERTYUVTORGB565ROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) &&
-      IS_ALIGNED(width, 8) &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-    FastConvertYUVToRGB565Row = FastConvertYUVToRGB565Row_SSSE3;
+#elif defined(HAS_FASTCONVERTYUVTOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
+    FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_SSSE3;
   } else
 #endif
   {
-    FastConvertYUVToRGB565Row = FastConvertYUVToRGB565Row_C;
+    FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_C;
   }
+
+  SIMD_ALIGNED(uint8 row[kMaxStride]);
+  void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix);
+#if defined(HAS_ARGBTORGB565ROW_SSE2_DISABLED)
+  if (TestCpuFlag(kCpuHasSSE2) &&
+      IS_ALIGNED(width, 16) &&
+      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+    ARGBToRGB565Row = ARGBToRGB565Row_SSE2;
+  } else
+#endif
+  {
+    ARGBToRGB565Row = ARGBToRGB565Row_C;
+  }
+
   for (int y = 0; y < height; ++y) {
-    FastConvertYUVToRGB565Row(src_y, src_u, src_v, dst_argb, width);
+    FastConvertYUVToARGBRow(src_y, src_u, src_v, row, width);
+    ARGBToRGB565Row(row, dst_argb, width);
     dst_argb += dst_stride_argb;
     src_y += src_stride_y;
     if (y & 1) {
@@ -1762,27 +1801,40 @@ int I420ToARGB1555(const uint8* src_y, int src_stride_y,
     dst_argb = dst_argb + (height - 1) * dst_stride_argb;
     dst_stride_argb = -dst_stride_argb;
   }
-  void (*FastConvertYUVToARGB1555Row)(const uint8* y_buf,
-                                    const uint8* u_buf,
-                                    const uint8* v_buf,
-                                    uint8* rgb_buf,
-                                    int width);
-#if defined(HAS_FASTCONVERTYUVTOARGB1555ROW_NEON)
+  void (*FastConvertYUVToARGBRow)(const uint8* y_buf,
+                                  const uint8* u_buf,
+                                  const uint8* v_buf,
+                                  uint8* rgb_buf,
+                                  int width);
+#if defined(HAS_FASTCONVERTYUVTOARGBROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
-    FastConvertYUVToARGB1555Row = FastConvertYUVToARGB1555Row_NEON;
+    FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_NEON;
   } else
-#elif defined(HAS_FASTCONVERTYUVTOARGB1555ROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) &&
-      IS_ALIGNED(width, 8) &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-    FastConvertYUVToARGB1555Row = FastConvertYUVToARGB1555Row_SSSE3;
+#elif defined(HAS_FASTCONVERTYUVTOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
+    FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_SSSE3;
   } else
 #endif
   {
-    FastConvertYUVToARGB1555Row = FastConvertYUVToARGB1555Row_C;
+    FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_C;
   }
+
+  SIMD_ALIGNED(uint8 row[kMaxStride]);
+  void (*ARGBToARGB1555Row)(const uint8* src_argb, uint8* dst_rgb, int pix);
+#if defined(HAS_ARGBTOARGB1555ROW_SSE2_DISABLED)
+  if (TestCpuFlag(kCpuHasSSE2) &&
+      IS_ALIGNED(width, 16) &&
+      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+    ARGBToARGB1555Row = ARGBToARGB1555Row_SSE2;
+  } else
+#endif
+  {
+    ARGBToARGB1555Row = ARGBToARGB1555Row_C;
+  }
+
   for (int y = 0; y < height; ++y) {
-    FastConvertYUVToARGB1555Row(src_y, src_u, src_v, dst_argb, width);
+    FastConvertYUVToARGBRow(src_y, src_u, src_v, row, width);
+    ARGBToARGB1555Row(row, dst_argb, width);
     dst_argb += dst_stride_argb;
     src_y += src_stride_y;
     if (y & 1) {
@@ -1792,6 +1844,7 @@ int I420ToARGB1555(const uint8* src_y, int src_stride_y,
   }
   return 0;
 }
+
 // Convert I420 to ARGB4444.
 int I420ToARGB4444(const uint8* src_y, int src_stride_y,
                  const uint8* src_u, int src_stride_u,
@@ -1804,27 +1857,40 @@ int I420ToARGB4444(const uint8* src_y, int src_stride_y,
     dst_argb = dst_argb + (height - 1) * dst_stride_argb;
     dst_stride_argb = -dst_stride_argb;
   }
-  void (*FastConvertYUVToARGB4444Row)(const uint8* y_buf,
-                                    const uint8* u_buf,
-                                    const uint8* v_buf,
-                                    uint8* rgb_buf,
-                                    int width);
-#if defined(HAS_FASTCONVERTYUVTOARGB4444ROW_NEON)
+  void (*FastConvertYUVToARGBRow)(const uint8* y_buf,
+                                  const uint8* u_buf,
+                                  const uint8* v_buf,
+                                  uint8* rgb_buf,
+                                  int width);
+#if defined(HAS_FASTCONVERTYUVTOARGBROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
-    FastConvertYUVToARGB4444Row = FastConvertYUVToARGB4444Row_NEON;
+    FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_NEON;
   } else
-#elif defined(HAS_FASTCONVERTYUVTOARGB4444ROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) &&
-      IS_ALIGNED(width, 8) &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-    FastConvertYUVToARGB4444Row = FastConvertYUVToARGB4444Row_SSSE3;
+#elif defined(HAS_FASTCONVERTYUVTOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
+    FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_SSSE3;
   } else
 #endif
   {
-    FastConvertYUVToARGB4444Row = FastConvertYUVToARGB4444Row_C;
+    FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_C;
   }
+
+  SIMD_ALIGNED(uint8 row[kMaxStride]);
+  void (*ARGBToARGB4444Row)(const uint8* src_argb, uint8* dst_rgb, int pix);
+#if defined(HAS_ARGBTOARGB4444ROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) &&
+      IS_ALIGNED(width, 16) &&
+      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+    ARGBToARGB4444Row = ARGBToARGB4444Row_SSE2;
+  } else
+#endif
+  {
+    ARGBToARGB4444Row = ARGBToARGB4444Row_C;
+  }
+
   for (int y = 0; y < height; ++y) {
-    FastConvertYUVToARGB4444Row(src_y, src_u, src_v, dst_argb, width);
+    FastConvertYUVToARGBRow(src_y, src_u, src_v, row, width);
+    ARGBToARGB4444Row(row, dst_argb, width);
     dst_argb += dst_stride_argb;
     src_y += src_stride_y;
     if (y & 1) {
@@ -2119,6 +2185,65 @@ int BG24ToARGB(const uint8* src_rgb24, int src_stride_rgb24,
   return 0;
 }
 
+// Convert ARGB To RGB24, one row at a time. Always returns 0.
+int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
+                uint8* dst_rgb24, int dst_stride_rgb24,
+                int width, int height) {
+  // Negative height means invert the image: walk the source bottom-up.
+  if (height < 0) {
+    height = -height;
+    src_argb += (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Start from the portable row function; the SSSE3 one is opt-in below.
+  void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
+      ARGBToRGB24Row_C;
+#if defined(HAS_ARGBTORGB24ROW_SSSE3_DISABLED)
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) &&
+      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
+      IS_ALIGNED(dst_rgb24, 16) && IS_ALIGNED(dst_stride_rgb24, 16)) {
+    ARGBToRGB24Row = ARGBToRGB24Row_SSSE3;
+  }
+#endif
+
+  // Convert a row, then advance both pointers by their strides.
+  for (int rows_left = height; rows_left > 0; --rows_left) {
+    ARGBToRGB24Row(src_argb, dst_rgb24, width);
+    src_argb += src_stride_argb;
+    dst_rgb24 += dst_stride_rgb24;
+  }
+  return 0;
+}
+
+// Convert ARGB To RAW, one row at a time. Always returns 0.
+int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
+              uint8* dst_raw, int dst_stride_raw,
+              int width, int height) {
+  // Negative height means invert the image: walk the source bottom-up.
+  if (height < 0) {
+    height = -height;
+    src_argb += (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Start from the portable row function; the SSSE3 one is opt-in below.
+  void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int pix) =
+      ARGBToRAWRow_C;
+#if defined(HAS_ARGBTORAWROW_SSSE3_DISABLED)
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) &&
+      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
+      IS_ALIGNED(dst_raw, 16) && IS_ALIGNED(dst_stride_raw, 16)) {
+    ARGBToRAWRow = ARGBToRAWRow_SSSE3;
+  }
+#endif
+
+  // Convert a row, then advance both pointers by their strides.
+  for (int rows_left = height; rows_left > 0; --rows_left) {
+    ARGBToRAWRow(src_argb, dst_raw, width);
+    src_argb += src_stride_argb;
+    dst_raw += dst_stride_raw;
+  }
+  return 0;
+}
 
 // Convert NV12 to RGB565.
 int NV12ToRGB565(const uint8* src_y, int src_stride_y,
@@ -2131,25 +2256,37 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y,
     dst_rgb = dst_rgb + (height - 1) * dst_stride_rgb;
     dst_stride_rgb = -dst_stride_rgb;
   }
-  void (*FastConvertYUVToRGB565Row)(const uint8* y_buf,
-                                    const uint8* u_buf,
-                                    const uint8* v_buf,
-                                    uint8* rgb_buf,
-                                    int width);
-#if defined(HAS_FASTCONVERTYUVTORGB565ROW_NEON)
+  void (*FastConvertYUVToARGBRow)(const uint8* y_buf,
+                                  const uint8* u_buf,
+                                  const uint8* v_buf,
+                                  uint8* rgb_buf,
+                                  int width);
+#if defined(HAS_FASTCONVERTYUVTOARGBROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
-    FastConvertYUVToRGB565Row = FastConvertYUVToRGB565Row_NEON;
+    FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_NEON;
   } else
-#elif defined(HAS_FASTCONVERTYUVTORGB565ROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) &&
-      IS_ALIGNED(width, 8) &&
-      IS_ALIGNED(dst_rgb, 16) && IS_ALIGNED(dst_stride_rgb, 16)) {
-    FastConvertYUVToRGB565Row = FastConvertYUVToRGB565Row_SSSE3;
+#elif defined(HAS_FASTCONVERTYUVTOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
+    FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_SSSE3;
   } else
 #endif
   {
-    FastConvertYUVToRGB565Row = FastConvertYUVToRGB565Row_C;
+    FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_C;
   }
+
+  // Scratch ARGB row: YUV is converted to ARGB here, then packed to RGB565.
+  SIMD_ALIGNED(uint8 row[kMaxStride]);
+  void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix);
+#if defined(HAS_ARGBTORGB565ROW_SSE2_DISABLED)
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
+      IS_ALIGNED(dst_rgb, 16) && IS_ALIGNED(dst_stride_rgb, 16)) {
+    ARGBToRGB565Row = ARGBToRGB565Row_SSE2;
+  } else
+#endif
+  {
+    ARGBToRGB565Row = ARGBToRGB565Row_C;
+  }
+
   int halfwidth = (width + 1) >> 1;
   void (*SplitUV)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
 #if defined(HAS_SPLITUV_NEON)
@@ -2166,15 +2303,16 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y,
   {
     SplitUV = SplitUV_C;
   }
-  SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
+  SIMD_ALIGNED(uint8 rowuv[kMaxStride * 2]);
 
   for (int y = 0; y < height; ++y) {
     if ((y & 1) == 0) {
       // Copy a row of UV.
-      SplitUV(src_uv, row, row + kMaxStride, halfwidth);
+      SplitUV(src_uv, rowuv, rowuv + kMaxStride, halfwidth);
       src_uv += src_stride_uv;
     }
-    FastConvertYUVToRGB565Row(src_y, row, row + kMaxStride, dst_rgb, width);
+    FastConvertYUVToARGBRow(src_y, rowuv, rowuv + kMaxStride, row, width);
+    ARGBToRGB565Row(row, dst_rgb, width);
     dst_rgb += dst_stride_rgb;
     src_y += src_stride_y;
   }
diff --git a/source/row.h b/source/row.h
index 0cbd7f0a7..7bca7c28f 100644
--- a/source/row.h
+++ b/source/row.h
@@ -63,6 +63,12 @@
 #define HAS_RGB565TOARGBROW_SSE2
 #define HAS_ARGB1555TOARGBROW_SSE2
 #define HAS_ARGB4444TOARGBROW_SSE2
+
+#define HAS_ARGBTORGB24ROW_SSSE3
+#define HAS_ARGBTORAWROW_SSSE3
+#define HAS_ARGBTORGB565ROW_SSE2
+#define HAS_ARGBTOARGB1555ROW_SSE2
+#define HAS_ARGBTOARGB4444ROW_SSE2
 #endif
 
 // The following are available on Neon platforms
@@ -210,6 +216,20 @@ void RGB565ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int pix);
 void ARGB1555ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int pix);
 void ARGB4444ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int pix);
 
+void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
+
+// ARGBToABGRRow_C is same as ABGRToARGB
+// ARGBToBGRARow_C is same as BGRAToARGB
+void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
+
 void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
 void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix);
 
diff --git a/source/row_win.cc b/source/row_win.cc
index 6fd398593..ecd9a82e9 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -85,6 +85,15 @@ static const uvec8 kShuffleMaskBGRAToARGB = {
   3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
 };
 
+// ARGB to RGB24 shuffle: keep bytes 0..2 of each pixel; 128 zeroes the byte.
+static const uvec8 kShuffleMaskARGBToRGB24 = {
+  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u };
+
+
+// ARGB to RAW shuffle: each pixel's bytes 2,1,0; 128 zeroes the byte.
+static const uvec8 kShuffleMaskARGBToRAW = {
+  2u, 1u,0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u };
+
 __declspec(naked)
 void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
   __asm {
@@ -494,6 +503,201 @@ __asm {
   }
 }
 
+// ARGB to RGB24, 16 pixels per loop. TODO(fbarchard): Port to gcc
+__declspec(naked)
+void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
+__asm {
+    mov       eax, [esp + 4]   // src_argb
+    mov       edx, [esp + 8]   // dst_rgb
+    mov       ecx, [esp + 12]  // pix
+    movdqa    xmm5, kShuffleMaskARGBToRGB24
+
+ convertloop:
+    movdqa    xmm0, [eax]   // fetch 16 pixels of argb (aligned loads)
+    movdqa    xmm1, [eax + 16]
+    movdqa    xmm2, [eax + 32]
+    movdqa    xmm3, [eax + 48]
+    lea       eax, [eax + 64]
+    pshufb    xmm0, xmm5    // pack 16 bytes of ARGB to 12 bytes of RGB
+    pshufb    xmm1, xmm5
+    pshufb    xmm2, xmm5
+    pshufb    xmm3, xmm5
+    movdqa    xmm4, xmm1
+    pslldq    xmm4, 12      // byte shift: first 4 bytes of 1 follow 0's 12
+    por       xmm4, xmm0
+    movdqa    [edx], xmm4   // first 16 bytes
+    movdqa    xmm4, xmm2
+    psrldq    xmm1, 4       // remaining 8 bytes of 1
+    pslldq    xmm4, 8       // first 8 bytes of 2
+    por       xmm1, xmm4
+    movdqa    [edx + 16], xmm1   // middle 16 bytes
+    psrldq    xmm2, 8       // remaining 4 bytes of 2
+    pslldq    xmm3, 4       // all 12 bytes of 3
+    por       xmm2, xmm3
+    movdqa    [edx + 32], xmm2   // last 16 bytes
+    lea       edx, [edx + 48]
+    sub       ecx, 16
+    ja        convertloop
+    ret
+  }
+}
+
+// ARGB to RAW, 16 pixels per loop. TODO(fbarchard): Port to gcc
+__declspec(naked)
+void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
+__asm {
+    mov       eax, [esp + 4]   // src_argb
+    mov       edx, [esp + 8]   // dst_rgb
+    mov       ecx, [esp + 12]  // pix
+    movdqa    xmm5, kShuffleMaskARGBToRAW
+
+ convertloop:
+    movdqa    xmm0, [eax]   // fetch 16 pixels of argb (aligned loads)
+    movdqa    xmm1, [eax + 16]
+    movdqa    xmm2, [eax + 32]
+    movdqa    xmm3, [eax + 48]
+    lea       eax, [eax + 64]
+    pshufb    xmm0, xmm5    // pack 16 bytes of ARGB to 12 bytes of RGB
+    pshufb    xmm1, xmm5
+    pshufb    xmm2, xmm5
+    pshufb    xmm3, xmm5
+    movdqa    xmm4, xmm1
+    pslldq    xmm4, 12      // byte shift: first 4 bytes of 1 follow 0's 12
+    por       xmm4, xmm0
+    movdqa    [edx], xmm4   // first 16 bytes
+    movdqa    xmm4, xmm2
+    psrldq    xmm1, 4       // remaining 8 bytes of 1
+    pslldq    xmm4, 8       // first 8 bytes of 2
+    por       xmm1, xmm4
+    movdqa    [edx + 16], xmm1   // middle 16 bytes
+    psrldq    xmm2, 8       // remaining 4 bytes of 2
+    pslldq    xmm3, 4       // all 12 bytes of 3
+    por       xmm2, xmm3
+    movdqa    [edx + 32], xmm2   // last 16 bytes
+    lea       edx, [edx + 48]
+    sub       ecx, 16
+    ja        convertloop
+    ret
+  }
+}
+
+// ARGB to RGB565, 4 pixels per loop. TODO(fbarchard): Port to gcc
+__declspec(naked)
+void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
+__asm {
+    pcmpeqb   xmm3, xmm3       // generate mask 0x0000001f
+    psrld     xmm3, 27
+    pcmpeqb   xmm4, xmm4       // generate mask 0x000007e0
+    psrld     xmm4, 26
+    pslld     xmm4, 5
+    pcmpeqb   xmm5, xmm5       // generate mask 0xfffff800
+    pslld     xmm5, 11
+
+    mov       eax, [esp + 4]   // src_argb
+    mov       edx, [esp + 8]   // dst_rgb
+    mov       ecx, [esp + 12]  // pix
+
+ convertloop:
+    movdqa    xmm0, [eax]   // fetch 4 pixels of argb
+    lea       eax, [eax + 16]
+    movdqa    xmm1, xmm0    // B
+    psrld     xmm1, 3
+    pand      xmm1, xmm3
+    movdqa    xmm2, xmm0    // G
+    psrld     xmm2, 5
+    pand      xmm2, xmm4
+    por       xmm1, xmm2
+    pslld     xmm0, 8       // R: move byte 2 to the top of the dword
+    psrad     xmm0, 16      // arithmetic shift so bits 16..31 copy bit 15
+    pand      xmm0, xmm5
+    por       xmm0, xmm1
+    // Each dword is now a sign-extended 16-bit value, so the pack is lossless.
+    packssdw  xmm0, xmm0
+    movq      qword ptr [edx], xmm0  // store 4 pixels of RGB565
+    lea       edx, [edx + 8]
+    sub       ecx, 4
+    ja        convertloop
+    ret
+  }
+}
+
+// ARGB to ARGB1555, 4 pixels per loop. TODO(fbarchard): Port to gcc
+__declspec(naked)
+void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
+__asm {
+    pcmpeqb   xmm3, xmm3       // generate mask 0x0000001f
+    psrld     xmm3, 27
+    movdqa    xmm4, xmm3       // generate mask 0x000003e0
+    pslld     xmm4, 5
+    movdqa    xmm5, xmm3       // generate mask 0x00007c00
+    pslld     xmm5, 10
+    pcmpeqb   xmm6, xmm6       // generate mask 0xffff8000
+    pslld     xmm6, 15
+
+    mov       eax, [esp + 4]   // src_argb
+    mov       edx, [esp + 8]   // dst_rgb
+    mov       ecx, [esp + 12]  // pix
+
+ convertloop:
+    movdqa    xmm0, [eax]   // fetch 4 pixels of argb
+    lea       eax, [eax + 16]
+    movdqa    xmm1, xmm0    // B
+    psrld     xmm1, 3
+    pand      xmm1, xmm3
+    movdqa    xmm2, xmm0    // G
+    psrld     xmm2, 6
+    pand      xmm2, xmm4
+    por       xmm1, xmm2
+    movdqa    xmm2, xmm0    // R
+    psrld     xmm2, 9
+    pand      xmm2, xmm5
+    por       xmm1, xmm2
+    movdqa    xmm2, xmm0    // A
+    psrad     xmm2, 16      // arithmetic shift copies bit 31 into bits 15..31
+    pand      xmm2, xmm6
+    por       xmm1, xmm2
+    // Each dword is now a sign-extended 16-bit value, so the signed
+    // saturating pack below keeps the low 16 bits unchanged.
+    packssdw  xmm1, xmm1
+    movq      qword ptr [edx], xmm1  // store 4 pixels of ARGB1555
+    lea       edx, [edx + 8]
+    sub       ecx, 4
+    ja        convertloop
+    ret
+  }
+}
+
+// ARGB to ARGB4444, 4 pixels per loop. TODO(fbarchard): Port to gcc
+__declspec(naked)
+void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
+__asm {
+    pcmpeqb   xmm4, xmm4       // generate mask 0xf000f000
+    psllw     xmm4, 12
+    movdqa    xmm3, xmm4       // generate mask 0x00f000f0
+    psrlw     xmm3, 8
+
+    mov       eax, [esp + 4]   // src_argb
+    mov       edx, [esp + 8]   // dst_rgb
+    mov       ecx, [esp + 12]  // pix
+
+ convertloop:
+    movdqa    xmm0, [eax]   // fetch 4 pixels of argb
+    lea       eax, [eax + 16]
+    movdqa    xmm1, xmm0
+    pand      xmm0, xmm3    // top nibble of each word's low byte
+    pand      xmm1, xmm4    // top nibble of each word's high byte
+    psrlw     xmm0, 4       // move to bits 0..3
+    psrlw     xmm1, 8       // move to bits 4..7
+    por       xmm0, xmm1
+    packuswb  xmm0, xmm0    // each word is <= 0xff, so no saturation
+    movq      qword ptr [edx], xmm0  // store 4 pixels of ARGB4444
+    lea       edx, [edx + 8]
+    sub       ecx, 4
+    ja        convertloop
+    ret
+  }
+}
+
 // Convert 16 ARGB pixels (64 bytes) to 16 Y values
 __declspec(naked)
 void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {