From 228bdc24e44264baf3402124aaa6d4d81c8896f5 Mon Sep 17 00:00:00 2001
From: "fbarchard@google.com"
Date: Tue, 15 Nov 2011 21:58:26 +0000
Subject: [PATCH] port yuv to rgb ssse3 to gcc

BUG=none
TEST=media_unittest
Review URL: http://webrtc-codereview.appspot.com/269015

git-svn-id: http://libyuv.googlecode.com/svn/trunk@80 16f28f9a-4ce2-e073-06de-1de4eb20be90
---
 README.chromium            |   2 +-
 source/planar_functions.cc |  38 +--
 source/row.h               |  17 +-
 source/row_posix.cc        | 475 ++++++++++++++++++++++---------------
 source/row_win.cc          |  38 +--
 5 files changed, 299 insertions(+), 271 deletions(-)

diff --git a/README.chromium b/README.chromium
index 3c60491ee..40059043c 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 79
+Version: 80
 License: BSD
 License File: LICENSE
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index f84b4cacb..b63b8a7a4 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -1136,19 +1136,6 @@ int I420ToARGB(const uint8* src_y, int src_stride_y,
       IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
     FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_SSSE3;
   } else
-#endif
-#if defined(HAS_FASTCONVERTYUVTOARGBROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      (width % 4 == 0) &&
-      IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
-    FastConvertYUVToARGBRow = FastConvertYUVToARGBRow4_SSE2;
-  } else
-#endif
-#if defined(HAS_FASTCONVERTYUVTOARGBROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      (width % 2 == 0)) {
-    FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_SSE2;
-  } else
 #endif
   {
     FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_C;
@@ -1188,12 +1175,6 @@ int I420ToBGRA(const uint8* src_y, int src_stride_y,
       IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
     FastConvertYUVToBGRARow = FastConvertYUVToBGRARow_SSSE3;
   } else
-#endif
-#if defined(HAS_FASTCONVERTYUVTOBGRAROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      (width % 2 == 0)) {
-    FastConvertYUVToBGRARow = FastConvertYUVToBGRARow_SSE2;
-  } else
 #endif
   {
     FastConvertYUVToBGRARow = FastConvertYUVToBGRARow_C;
@@ -1233,12 +1214,6 @@ int I420ToABGR(const uint8* src_y, int src_stride_y,
       IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
     FastConvertYUVToABGRRow = FastConvertYUVToABGRRow_SSSE3;
   } else
-#endif
-#if defined(HAS_FASTCONVERTYUVTOABGRROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      (width % 2 == 0)) {
-    FastConvertYUVToABGRRow = FastConvertYUVToABGRRow_SSE2;
-  } else
 #endif
   {
     FastConvertYUVToABGRRow = FastConvertYUVToABGRRow_C;
@@ -1278,12 +1253,6 @@ int I422ToARGB(const uint8* src_y, int src_stride_y,
       IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
     FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_SSSE3;
   } else
-#endif
-#if defined(HAS_FASTCONVERTYUVTOARGBROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      (width % 2 == 0)) {
-    FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_SSE2;
-  } else
 #endif
   {
     FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_C;
@@ -1321,11 +1290,6 @@ int I444ToARGB(const uint8* src_y, int src_stride_y,
       IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
     FastConvertYUV444ToARGBRow = FastConvertYUV444ToARGBRow_SSSE3;
   } else
-#endif
-#if defined(HAS_FASTCONVERTYUVTOARGBROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    FastConvertYUV444ToARGBRow = FastConvertYUV444ToARGBRow_SSE2;
-  } else
 #endif
   {
     FastConvertYUV444ToARGBRow = FastConvertYUV444ToARGBRow_C;
@@ -1354,7 +1318,7 @@ int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
                                 uint8* rgb_buf,
                                 int width);
 #if defined(HAS_FASTCONVERTYTOARGBROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSSE3) &&
+  if (TestCpuFlag(kCpuHasSSE2) &&
       (width % 8 == 0) &&
       IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
     FastConvertYToARGBRow = FastConvertYToARGBRow_SSE2;
diff --git a/source/row.h b/source/row.h
index 92ed6a805..53cf8a67a 100644
--- a/source/row.h
+++ b/source/row.h
@@ -37,28 +37,17 @@
 #define HAS_BGRATOUVROW_SSSE3
 #define HAS_ABGRTOUVROW_SSSE3
 #define HAS_I400TOARGBROW_SSE2
-#endif
-
-// The following are available on Linux (32/64 bit)
-// TODO(fbarchard): enable for fpic on linux
-#if (defined(__x86_64__) || \
-    (defined(__i386__) && !defined(__pic__))) && \
-    !defined(LIBYUV_DISABLE_ASM)
-#define HAS_FASTCONVERTYUVTOARGBROW_SSE2
-#define HAS_FASTCONVERTYUVTOBGRAROW_SSE2
-#define HAS_FASTCONVERTYUVTOABGRROW_SSE2
-#define HAS_FASTCONVERTYUV444TOARGBROW_SSE2
+#define HAS_FASTCONVERTYTOARGBROW_SSE2
 #endif

-// The following are available on Windows
-#if defined(WIN32) && \
+// The following are available on all x86 platforms except 32 bit OSX
+#if (defined(WIN32) || defined(__x86_64__) || \
+    (defined(__i386__) && !defined(__APPLE__))) && \
     !defined(LIBYUV_DISABLE_ASM)
 #define HAS_FASTCONVERTYUVTOARGBROW_SSSE3
 #define HAS_FASTCONVERTYUVTOBGRAROW_SSSE3
 #define HAS_FASTCONVERTYUVTOABGRROW_SSSE3
 #define HAS_FASTCONVERTYUV444TOARGBROW_SSSE3
-#define HAS_FASTCONVERTYTOARGBROW_SSE2
 #endif

 extern "C" {
diff --git a/source/row_posix.cc b/source/row_posix.cc
index 8db175548..2eb5fc3af 100644
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -14,49 +14,49 @@
 extern "C" {

-#ifdef HAS_ARGBTOYROW_SSSE3
-
-// Constant multiplication table for converting ARGB to I400.
-static const vec8 kARGBToY = {
-  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
-};
-
-static const uvec8 kAddY16 = {
-  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
-};
-
 #ifdef HAS_ARGBTOUVROW_SSSE3
-static const vec8 kARGBToU = {
+vec8 kARGBToU = {
   112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
 };

-static const uvec8 kARGBToV = {
+uvec8 kARGBToV = {
   -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0
 };

-static const uvec8 kAddUV128 = {
+uvec8 kAddUV128 = {
   128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
 };
 #endif

+#ifdef HAS_ARGBTOYROW_SSSE3
+
+// Constant multiplication table for converting ARGB to I400.
+vec8 kARGBToY = {
+  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
+};
+
+uvec8 kAddY16 = {
+  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
+};
+
 // Shuffle table for converting BG24 to ARGB.
-static const uvec8 kShuffleMaskBG24ToARGB = {
+uvec8 kShuffleMaskBG24ToARGB = {
   0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
 };

 // Shuffle table for converting RAW to ARGB.
-static const uvec8 kShuffleMaskRAWToARGB = {
+uvec8 kShuffleMaskRAWToARGB = {
   2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
 };

 // Shuffle table for converting ABGR to ARGB.
-static const uvec8 kShuffleMaskABGRToARGB = {
+uvec8 kShuffleMaskABGRToARGB = {
   2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
 };

 // Shuffle table for converting BGRA to ARGB.
-static const uvec8 kShuffleMaskBGRAToARGB = {
+uvec8 kShuffleMaskBGRAToARGB = {
   3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
 };

@@ -145,17 +145,17 @@ void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix) {
   "movdqa 0x20(%0),%%xmm3 \n"
   "lea    0x30(%0),%0 \n"
   "movdqa %%xmm3,%%xmm2 \n"
-  "palignr $0x8,%%xmm1,%%xmm2 \n"  // xmm2 = { xmm3[0:3] xmm1[8:15] }
+  "palignr $0x8,%%xmm1,%%xmm2 \n"
   "pshufb %%xmm4,%%xmm2 \n"
   "por    %%xmm5,%%xmm2 \n"
-  "palignr $0xc,%%xmm0,%%xmm1 \n"  // xmm1 = { xmm3[0:7] xmm0[12:15] }
+  "palignr $0xc,%%xmm0,%%xmm1 \n"
   "pshufb %%xmm4,%%xmm0 \n"
   "movdqa %%xmm2,0x20(%1) \n"
   "por    %%xmm5,%%xmm0 \n"
   "pshufb %%xmm4,%%xmm1 \n"
   "movdqa %%xmm0,(%1) \n"
   "por    %%xmm5,%%xmm1 \n"
-  "palignr $0x4,%%xmm3,%%xmm3 \n"  // xmm3 = { xmm3[4:15] }
+  "palignr $0x4,%%xmm3,%%xmm3 \n"
   "pshufb %%xmm4,%%xmm3 \n"
   "movdqa %%xmm1,0x10(%1) \n"
   "por    %%xmm5,%%xmm3 \n"
@@ -185,17 +185,17 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
   "movdqa 0x20(%0),%%xmm3 \n"
   "lea    0x30(%0),%0 \n"
   "movdqa %%xmm3,%%xmm2 \n"
-  "palignr $0x8,%%xmm1,%%xmm2 \n"  // xmm2 = { xmm3[0:3] xmm1[8:15] }
+  "palignr $0x8,%%xmm1,%%xmm2 \n"
   "pshufb %%xmm4,%%xmm2 \n"
   "por    %%xmm5,%%xmm2 \n"
-  "palignr $0xc,%%xmm0,%%xmm1 \n"  // xmm1 = { xmm3[0:7] xmm0[12:15] }
+  "palignr $0xc,%%xmm0,%%xmm1 \n"
   "pshufb %%xmm4,%%xmm0 \n"
   "movdqa %%xmm2,0x20(%1) \n"
   "por    %%xmm5,%%xmm0 \n"
   "pshufb %%xmm4,%%xmm1 \n"
   "movdqa %%xmm0,(%1) \n"
   "por    %%xmm5,%%xmm1 \n"
-  "palignr $0x4,%%xmm3,%%xmm3 \n"  // xmm3 = { xmm3[4:15] }
+  "palignr $0x4,%%xmm3,%%xmm3 \n"
   "pshufb %%xmm4,%%xmm3 \n"
   "movdqa %%xmm1,0x10(%1) \n"
   "por    %%xmm5,%%xmm3 \n"
@@ -318,229 +318,320 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
 }
 #endif

-// The following code requires 6 registers and prefers 7 registers.
-// 7 registers requires -fpic to be off, and -fomit-frame-pointer
-#ifdef HAS_FASTCONVERTYUVTOARGBROW_SSE2
-#if defined(__x86_64__)
-#define REG_a "rax"
-#define REG_d "rdx"
-#else
-#define REG_a "eax"
-#define REG_d "edx"
+
+#ifdef HAS_FASTCONVERTYTOARGBROW_SSE2
+#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */
+
+vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG };
+vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 };
 #endif
+
+#ifdef HAS_FASTCONVERTYUVTOARGBROW_SSSE3
+#define UB 127 /* min(63,static_cast<int8>(2.018 * 64)) */
+#define UG -25 /* static_cast<int8>(-0.391 * 64 - 0.5) */
+#define UR 0
+
+#define VB 0
+#define VG -52 /* static_cast<int8>(-0.813 * 64 - 0.5) */
+#define VR 102 /* static_cast<int8>(1.596 * 64 + 0.5) */
+
+// Bias
+#define BB UB * 128 + VB * 128
+#define BG UG * 128 + VG * 128
+#define BR UR * 128 + VR * 128
+
+vec8 kUVToB = {
+  UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
+};
+
+vec8 kUVToR = {
+  UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
+};
+
+vec8 kUVToG = {
+  UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
+};
+
+
+vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB };
+vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG };
+vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
+
 #if defined(__APPLE__) || defined(__x86_64__)
 #define OMITFP
 #else
 #define OMITFP __attribute__((optimize("omit-frame-pointer")))
 #endif
-#define CLOBBER "%"REG_a, "%"REG_d

-// This version produces 2 pixels
+// This version produces 8 pixels
 #define YUVTORGB \
-"1: \n" \
-  "movzb (%1),%%"REG_a" \n" \
-  "lea   1(%1),%1 \n" \
-  "movzb (%2),%%"REG_d" \n" \
-  "lea   1(%2),%2 \n" \
-  "movq  2048(%5,%%"REG_a",8),%%xmm0 \n" \
-  "movzb 0(%0),%%"REG_a" \n" \
-  "movq  4096(%5,%%"REG_d",8),%%xmm1 \n" \
-  "paddsw %%xmm1,%%xmm0 \n" \
-  "movzb 1(%0),%%"REG_d" \n" \
-  "punpcklqdq %%xmm0,%%xmm0 \n" \
-  "lea   2(%0),%0 \n" \
-  "movq  0(%5,%%"REG_a",8),%%xmm1 \n" \
-  "movhps 0(%5,%%"REG_d",8),%%xmm1 \n" \
-  "paddsw %%xmm0,%%xmm1 \n" \
-  "psraw $6,%%xmm1 \n" \
-  "packuswb %%xmm1,%%xmm1 \n" \
-  "movq  %%xmm1,0(%3) \n" \
-  "lea   8(%3),%3 \n" \
-  "sub   $0x2,%4 \n" \
-  "ja    1b \n"
-// This version produces 4 pixels
-#define YUVTORGB4 \
-"1: \n" \
-  "movzb 0(%1),%%"REG_a" \n" \
-  "movzb 0(%2),%%"REG_d" \n" \
-  "movq  2048(%5,%%"REG_a",8),%%xmm0 \n" \
-  "movzb 0(%0),%%"REG_a" \n" \
-  "movq  4096(%5,%%"REG_d",8),%%xmm1 \n" \
-  "paddsw %%xmm1,%%xmm0 \n" \
-  "movzb 1(%0),%%"REG_d" \n" \
-  "punpcklqdq %%xmm0,%%xmm0 \n" \
-  "movq  0(%5,%%"REG_a",8),%%xmm2 \n" \
-  "movhps 0(%5,%%"REG_d",8),%%xmm2 \n" \
-  "paddsw %%xmm0,%%xmm2 \n" \
-  "psraw $6,%%xmm2 \n" \
-  "movzb 1(%1),%%"REG_a" \n" \
-  "movzb 1(%2),%%"REG_d" \n" \
-  "movq  2048(%5,%%"REG_a",8),%%xmm0 \n" \
-  "movzb 2(%0),%%"REG_a" \n" \
-  "movq  4096(%5,%%"REG_d",8),%%xmm1 \n" \
-  "paddsw %%xmm1,%%xmm0 \n" \
-  "movzb 3(%0),%%"REG_d" \n" \
-  "punpcklqdq %%xmm0,%%xmm0 \n" \
-  "movq  0(%5,%%"REG_a",8),%%xmm3 \n" \
-  "movhps 0(%5,%%"REG_d",8),%%xmm3 \n" \
-  "paddsw %%xmm0,%%xmm3 \n" \
-  "psraw $6,%%xmm3 \n" \
-  "lea   2(%1),%1 \n" \
-  "lea   2(%2),%2 \n" \
-  "lea   4(%0),%0 \n" \
-  "packuswb %%xmm3,%%xmm2 \n" \
-  "movdqa %%xmm2,0(%3) \n" \
-  "lea   16(%3),%3 \n" \
-  "sub   $0x4,%4 \n" \
-  "ja    1b \n" \
+  "movd (%1),%%xmm0 \n" \
+  "movd (%1,%2,1),%%xmm1 \n" \
+  "lea  0x4(%1),%1 \n" \
+  "punpcklbw %%xmm1,%%xmm0 \n" \
+  "punpcklwd %%xmm0,%%xmm0 \n" \
+  "movdqa %%xmm0,%%xmm1 \n" \
+  "movdqa %%xmm0,%%xmm2 \n" \
+  "pmaddubsw %5,%%xmm0 \n" \
+  "pmaddubsw %6,%%xmm1 \n" \
+  "pmaddubsw %7,%%xmm2 \n" \
+  "psubw %8,%%xmm0 \n" \
+  "psubw %9,%%xmm1 \n" \
+  "psubw %10,%%xmm2 \n" \
+  "movq (%0),%%xmm3 \n" \
+  "lea  0x8(%0),%0 \n" \
+  "punpcklbw %%xmm4,%%xmm3 \n" \
+  "psubsw %11,%%xmm3 \n" \
+  "pmullw %12,%%xmm3 \n" \
+  "paddw %%xmm3,%%xmm0 \n" \
+  "paddw %%xmm3,%%xmm1 \n" \
+  "paddw %%xmm3,%%xmm2 \n" \
+  "psraw $0x6,%%xmm0 \n" \
+  "psraw $0x6,%%xmm1 \n" \
+  "psraw $0x6,%%xmm2 \n" \
+  "packuswb %%xmm0,%%xmm0 \n" \
+  "packuswb %%xmm1,%%xmm1 \n" \
+  "packuswb %%xmm2,%%xmm2 \n"

-// 6 or 7 registers
-void OMITFP FastConvertYUVToARGBRow_SSE2(const uint8* y_buf,  // rdi
-                                         const uint8* u_buf,  // rsi
-                                         const uint8* v_buf,  // rdx
-                                         uint8* rgb_buf,      // rcx
-                                         int width) {         // r8
-  asm volatile (
-    YUVTORGB
-  : "+r"(y_buf),    // %0
-    "+r"(u_buf),    // %1
-    "+r"(v_buf),    // %2
-    "+r"(rgb_buf),  // %3
-    "+rm"(width)    // %4
-  : "r" (kCoefficientsRgbY)  // %5
-  : "memory", "cc", CLOBBER
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3"
-#endif
-);
-}
-
-// 6 or 7 registers
-void OMITFP FastConvertYUVToARGBRow4_SSE2(const uint8* y_buf,  // rdi
+void OMITFP FastConvertYUVToARGBRow_SSSE3(const uint8* y_buf,  // rdi
                                           const uint8* u_buf,  // rsi
                                           const uint8* v_buf,  // rdx
                                           uint8* rgb_buf,      // rcx
                                           int width) {         // r8
   asm volatile (
-    YUVTORGB4
-  : "+r"(y_buf),    // %0
-    "+r"(u_buf),    // %1
-    "+r"(v_buf),    // %2
-    "+r"(rgb_buf),  // %3
-    "+rm"(width)    // %4
-  : "r" (kCoefficientsRgbY)  // %5
-  : "memory", "cc", CLOBBER
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3"
-#endif
-);
-}
+    "sub %1,%2 \n"
+    "pcmpeqb %%xmm5,%%xmm5 \n"
+    "pxor %%xmm4,%%xmm4 \n"

-void OMITFP FastConvertYUVToBGRARow_SSE2(const uint8* y_buf,  // rdi
-                                         const uint8* u_buf,  // rsi
-                                         const uint8* v_buf,  // rdx
-                                         uint8* rgb_buf,      // rcx
-                                         int width) {         // r8
-  asm volatile (
+  "1: \n"
     YUVTORGB
+    "punpcklbw %%xmm1,%%xmm0 \n"
+    "punpcklbw %%xmm5,%%xmm2 \n"
+    "movdqa %%xmm0,%%xmm1 \n"
+    "punpcklwd %%xmm2,%%xmm0 \n"
+    "movdqa %%xmm0,(%3) \n"
+    "punpckhwd %%xmm2,%%xmm1 \n"
+    "movdqa %%xmm1,0x10(%3) \n"
+    "lea  0x20(%3),%3 \n"
+    "sub  $0x8,%4 \n"
+    "ja   1b \n"
   : "+r"(y_buf),    // %0
     "+r"(u_buf),    // %1
     "+r"(v_buf),    // %2
     "+r"(rgb_buf),  // %3
     "+rm"(width)    // %4
-  : "r" (kCoefficientsBgraY)  // %5
-  : "memory", "cc", CLOBBER
+  : "m" (kUVToB),   // %5
+    "m" (kUVToG),   // %6
+    "m" (kUVToR),   // %7
+    "m" (kUVBiasB), // %8
+    "m" (kUVBiasG), // %9
+    "m" (kUVBiasR), // %10
+    "m" (kYSub16),  // %11
+    "m" (kYToRgb)   // %12
+  : "memory", "cc"
 #if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3"
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
 #endif
-);
+  );
 }

-void OMITFP FastConvertYUVToABGRRow_SSE2(const uint8* y_buf,  // rdi
-                                         const uint8* u_buf,  // rsi
-                                         const uint8* v_buf,  // rdx
-                                         uint8* rgb_buf,      // rcx
-                                         int width) {         // r8
+void OMITFP FastConvertYUVToBGRARow_SSSE3(const uint8* y_buf,  // rdi
+                                          const uint8* u_buf,  // rsi
+                                          const uint8* v_buf,  // rdx
+                                          uint8* rgb_buf,      // rcx
+                                          int width) {         // r8
   asm volatile (
+    "sub %1,%2 \n"
+    "pcmpeqb %%xmm5,%%xmm5 \n"
+    "pxor %%xmm4,%%xmm4 \n"
+
+  "1: \n"
     YUVTORGB
+    "pcmpeqb %%xmm5,%%xmm5 \n"
+    "punpcklbw %%xmm0,%%xmm1 \n"
+    "punpcklbw %%xmm2,%%xmm5 \n"
+    "movdqa %%xmm5,%%xmm0 \n"
+    "punpcklwd %%xmm1,%%xmm5 \n"
+    "movdqa %%xmm5,(%3) \n"
+    "punpckhwd %%xmm1,%%xmm0 \n"
+    "movdqa %%xmm0,0x10(%3) \n"
+    "lea  0x20(%3),%3 \n"
+    "sub  $0x8,%4 \n"
+    "ja   1b \n"
   : "+r"(y_buf),    // %0
     "+r"(u_buf),    // %1
     "+r"(v_buf),    // %2
     "+r"(rgb_buf),  // %3
     "+rm"(width)    // %4
-  : "r" (kCoefficientsAbgrY)  // %5
-  : "memory", "cc", CLOBBER
+  : "m" (kUVToB),   // %5
+    "m" (kUVToG),   // %6
+    "m" (kUVToR),   // %7
+    "m" (kUVBiasB), // %8
+    "m" (kUVBiasG), // %9
+    "m" (kUVBiasR), // %10
+    "m" (kYSub16),  // %11
+    "m" (kYToRgb)   // %12
+  : "memory", "cc"
 #if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3"
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
 #endif
-);
+  );
 }

-// 6 registers
-void OMITFP FastConvertYUV444ToARGBRow_SSE2(const uint8* y_buf,  // rdi
-                                            const uint8* u_buf,  // rsi
-                                            const uint8* v_buf,  // rdx
-                                            uint8* rgb_buf,      // rcx
-                                            int width) {         // r8
+void OMITFP FastConvertYUVToABGRRow_SSSE3(const uint8* y_buf,  // rdi
+                                          const uint8* u_buf,  // rsi
+                                          const uint8* v_buf,  // rdx
+                                          uint8* rgb_buf,      // rcx
+                                          int width) {         // r8
   asm volatile (
-"1: \n"
-  "movzb (%1),%%"REG_a" \n"
-  "lea   1(%1),%1 \n"
-  "movq  2048(%5,%%"REG_a",8),%%xmm0 \n"
-  "movzb (%2),%%"REG_a" \n"
-  "lea   1(%2),%2 \n"
-  "movq  4096(%5,%%"REG_a",8),%%xmm1 \n"
-  "paddsw %%xmm1,%%xmm0 \n"
-  "movzb (%0),%%"REG_a" \n"
-  "lea   1(%0),%0 \n"
-  "movq  0(%5,%%"REG_a",8),%%xmm2 \n"
-  "paddsw %%xmm0,%%xmm2 \n"
-  "shufps $0x44,%%xmm2,%%xmm2 \n"
-  "psraw $0x6,%%xmm2 \n"
-  "packuswb %%xmm2,%%xmm2 \n"
-  "movd  %%xmm2,0x0(%3) \n"
-  "lea   4(%3),%3 \n"
-  "sub   $0x1,%4 \n"
-  "ja    1b \n"
+    "sub %1,%2 \n"
+    "pcmpeqb %%xmm5,%%xmm5 \n"
+    "pxor %%xmm4,%%xmm4 \n"
+
+  "1: \n"
+    YUVTORGB
+    "packuswb %%xmm0,%%xmm0 \n"
+    "packuswb %%xmm1,%%xmm1 \n"
+    "packuswb %%xmm2,%%xmm2 \n"
+    "punpcklbw %%xmm1,%%xmm2 \n"
+    "punpcklbw %%xmm5,%%xmm0 \n"
+    "movdqa %%xmm2,%%xmm1 \n"
+    "punpcklwd %%xmm0,%%xmm2 \n"
+    "movdqa %%xmm2,(%3) \n"
+    "punpckhwd %%xmm0,%%xmm1 \n"
+    "movdqa %%xmm1,0x10(%3) \n"
+    "lea  0x20(%3),%3 \n"
+    "sub  $0x8,%4 \n"
+    "ja   1b \n"
   : "+r"(y_buf),    // %0
     "+r"(u_buf),    // %1
     "+r"(v_buf),    // %2
     "+r"(rgb_buf),  // %3
     "+rm"(width)    // %4
-  : "r" (kCoefficientsRgbY)  // %5
-  : "memory", "cc", "%"REG_a
+  : "m" (kUVToB),   // %5
+    "m" (kUVToG),   // %6
+    "m" (kUVToR),   // %7
+    "m" (kUVBiasB), // %8
+    "m" (kUVBiasG), // %9
+    "m" (kUVBiasR), // %10
+    "m" (kYSub16),  // %11
+    "m" (kYToRgb)   // %12
+  : "memory", "cc"
 #if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2"
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
 #endif
-);
+  );
 }

-// 5 registers
+void OMITFP FastConvertYUV444ToARGBRow_SSSE3(const uint8* y_buf,  // rdi
+                                             const uint8* u_buf,  // rsi
+                                             const uint8* v_buf,  // rdx
+                                             uint8* rgb_buf,      // rcx
+                                             int width) {         // r8
+  asm volatile (
+    "sub %1,%2 \n"
+    "pcmpeqb %%xmm5,%%xmm5 \n"
+    "pxor %%xmm4,%%xmm4 \n"
+
+  "1: \n"
+    "movd (%1),%%xmm0 \n"
+    "movd (%1,%2,1),%%xmm1 \n"
+    "lea  0x4(%1),%1 \n"
+    "punpcklbw %%xmm1,%%xmm0 \n"
+    "movdqa %%xmm0,%%xmm1 \n"
+    "movdqa %%xmm0,%%xmm2 \n"
+    "pmaddubsw %5,%%xmm0 \n"
+    "pmaddubsw %6,%%xmm1 \n"
+    "pmaddubsw %7,%%xmm2 \n"
+    "psubw %8,%%xmm0 \n"
+    "psubw %9,%%xmm1 \n"
+    "psubw %10,%%xmm2 \n"
+    "movd (%0),%%xmm3 \n"
+    "lea  0x4(%0),%0 \n"
+    "punpcklbw %%xmm4,%%xmm3 \n"
+    "psubsw %11,%%xmm3 \n"
+    "pmullw %12,%%xmm3 \n"
+    "paddw %%xmm3,%%xmm0 \n"
+    "paddw %%xmm3,%%xmm1 \n"
+    "paddw %%xmm3,%%xmm2 \n"
+    "psraw $0x6,%%xmm0 \n"
+    "psraw $0x6,%%xmm1 \n"
+    "psraw $0x6,%%xmm2 \n"
+    "packuswb %%xmm0,%%xmm0 \n"
+    "packuswb %%xmm1,%%xmm1 \n"
+    "packuswb %%xmm2,%%xmm2 \n"
+    "punpcklbw %%xmm1,%%xmm0 \n"
+    "punpcklbw %%xmm5,%%xmm2 \n"
+    "punpcklwd %%xmm2,%%xmm0 \n"
+    "movdqa %%xmm0,(%3) \n"
+    "lea  0x10(%3),%3 \n"
+    "sub  $0x4,%4 \n"
+    "ja   1b \n"
+  : "+r"(y_buf),    // %0
+    "+r"(u_buf),    // %1
+    "+r"(v_buf),    // %2
+    "+r"(rgb_buf),  // %3
+    "+rm"(width)    // %4
+  : "m" (kUVToB),   // %5
+    "m" (kUVToG),   // %6
+    "m" (kUVToR),   // %7
+    "m" (kUVBiasB), // %8
+    "m" (kUVBiasG), // %9
+    "m" (kUVBiasR), // %10
+    "m" (kYSub16),  // %11
+    "m" (kYToRgb)   // %12
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+  );
+}
+#endif
+
+#ifdef HAS_FASTCONVERTYTOARGBROW_SSE2
 void FastConvertYToARGBRow_SSE2(const uint8* y_buf,  // rdi
                                 uint8* rgb_buf,      // rcx
                                 int width) {         // r8
   asm volatile (
-"1: \n"
-  "movzb (%0),%%"REG_a" \n"
-  "movzb 0x1(%0),%%"REG_d" \n"
-  "movq  (%3,%%"REG_a",8),%%xmm2 \n"
-  "lea   2(%0),%0 \n"
-  "movhps (%3,%%"REG_d",8),%%xmm2 \n"
-  "psraw $0x6,%%xmm2 \n"
-  "packuswb %%xmm2,%%xmm2 \n"
-  "movq  %%xmm2,0x0(%1) \n"
-  "lea   8(%1),%1 \n"
-  "sub   $0x2,%2 \n"
-  "ja    1b \n"
+    "pcmpeqb %%xmm5,%%xmm5 \n"
+    "pslld $0x18,%%xmm5 \n"
+    "pxor %%xmm4,%%xmm4 \n"
+    "movdqa %3,%%xmm3 \n"
+    "movdqa %4,%%xmm2 \n"
+
+  "1: \n"
+    // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
+    "movq (%0),%%xmm0 \n"
+    "lea  0x8(%0),%0 \n"
+    "punpcklbw %%xmm4,%%xmm0 \n"
+    "psubsw %%xmm3,%%xmm0 \n"
+    "pmullw %%xmm2,%%xmm0 \n"
+    "psraw $0x6,%%xmm0 \n"
+    "packuswb %%xmm0,%%xmm0 \n"
+
+    // Step 2: Weave into ARGB
+    "punpcklbw %%xmm0,%%xmm0 \n"
+    "movdqa %%xmm0,%%xmm1 \n"
+    "punpcklwd %%xmm0,%%xmm0 \n"
+    "por  %%xmm5,%%xmm0 \n"
+    "movdqa %%xmm0,(%1) \n"
+    "punpckhwd %%xmm1,%%xmm1 \n"
+    "por  %%xmm5,%%xmm1 \n"
+    "movdqa %%xmm1,16(%1) \n"
+    "lea  32(%1),%1 \n"
+
+    "sub  $0x8,%2 \n"
+    "ja   1b \n"
   : "+r"(y_buf),    // %0
     "+r"(rgb_buf),  // %1
     "+rm"(width)    // %2
-  : "r" (kCoefficientsRgbY)  // %3
-  : "memory", "cc", "%"REG_a, "%"REG_d
+  : "m" (kYSub16),  // %3
+    "m" (kYToRgb)   // %4
+  : "memory", "cc"
 #if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2"
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
 #endif
-);
+  );
 }
 #endif
diff --git a/source/row_win.cc b/source/row_win.cc
index bd8c33cce..12716fb03 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -54,8 +54,7 @@ static const vec8 kABGRToV = {
 };

 static const uvec8 kAddY16 = {
-  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
-  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
+  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
 };

 static const uvec8 kAddUV128 = {
@@ -548,27 +547,13 @@ static const vec8 kUVToG = {
   UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
 };

-static const vec16 kYToRgb = {
-  YG, YG, YG, YG, YG, YG, YG, YG
-};
+static const vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG };
+static const vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 };
+static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB };
+static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG };
+static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };

-static const vec16 kYSub16 = {
-  16, 16, 16, 16, 16, 16, 16, 16
-};
-
-static const vec16 kUVBiasB = {
-  BB, BB, BB, BB, BB, BB, BB, BB
-};
-
-static const vec16 kUVBiasG = {
-  BG, BG, BG, BG, BG, BG, BG, BG
-};
-
-static const vec16 kUVBiasR = {
-  BR, BR, BR, BR, BR, BR, BR, BR
-};
-
-#define YUVTORGB_SSSE3 __asm { \
+#define YUVTORGB __asm { \
     /* Step 1: Find 4 UV contributions to 8 R,G,B values */ \
     __asm movd       xmm0, [esi]          /* U */ \
     __asm movd       xmm1, [esi + edi]    /* V */ \
@@ -619,7 +604,7 @@ void FastConvertYUVToARGBRow_SSSE3(const uint8* y_buf,
     pxor       xmm4, xmm4

  convertloop:
-    YUVTORGB_SSSE3
+    YUVTORGB

     // Step 3: Weave into ARGB
     punpcklbw  xmm0, xmm1           // BG
@@ -658,7 +643,7 @@ void FastConvertYUVToBGRARow_SSSE3(const uint8* y_buf,
     pxor       xmm4, xmm4

  convertloop:
-    YUVTORGB_SSSE3
+    YUVTORGB

     // Step 3: Weave into BGRA
     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
@@ -699,7 +684,7 @@ void FastConvertYUVToABGRRow_SSSE3(const uint8* y_buf,
     pxor       xmm4, xmm4

  convertloop:
-    YUVTORGB_SSSE3
+    YUVTORGB

     // Step 3: Weave into ARGB
     punpcklbw  xmm2, xmm1           // RG
@@ -787,7 +772,6 @@ void FastConvertYUV444ToARGBRow_SSSE3(const uint8* y_buf,
 #endif

 #ifdef HAS_FASTCONVERTYTOARGBROW_SSE2
-
 __declspec(naked)
 void FastConvertYToARGBRow_SSE2(const uint8* y_buf,
                                 uint8* rgb_buf,
@@ -829,8 +813,8 @@ void FastConvertYToARGBRow_SSE2(const uint8* y_buf,
     ret
   }
 }
-
 #endif
+
 #endif

 }  // extern "C"
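For reference, the SSSE3 and SSE2 row functions ported above implement fixed-point BT.601 YUV-to-RGB math with coefficients scaled by 64 (YG = 74, UB = 127, UG = -25, VG = -52, VR = 102, plus the kUVBias* terms). The scalar sketch below is illustrative only and is not part of the patch; the helper names are made up, but the arithmetic mirrors what the pmaddubsw/pmullw/psraw/packuswb sequence computes for each pixel.

    #include <algorithm>
    #include <cstdint>

    // Illustrative scalar equivalent of the fixed-point math in the SSSE3 row
    // functions (coefficients scaled by 64). packuswb's unsigned saturation is
    // modeled by Clamp8.
    static inline uint8_t Clamp8(int v) {
      return static_cast<uint8_t>(std::min(std::max(v, 0), 255));
    }

    // Hypothetical helper, not a libyuv API: converts one YUV pixel to ARGB.
    static void YuvPixelToArgb(uint8_t y, uint8_t u, uint8_t v, uint8_t argb[4]) {
      const int y1 = (static_cast<int>(y) - 16) * 74;             // kYSub16, kYToRgb
      const int b = (y1 + 127 * (u - 128)) >> 6;                   // kUVToB - kUVBiasB
      const int g = (y1 - 25 * (u - 128) - 52 * (v - 128)) >> 6;   // kUVToG - kUVBiasG
      const int r = (y1 + 102 * (v - 128)) >> 6;                   // kUVToR - kUVBiasR
      argb[0] = Clamp8(b);  // B
      argb[1] = Clamp8(g);  // G
      argb[2] = Clamp8(r);  // R
      argb[3] = 255;        // A
    }

The Y-only path (FastConvertYToARGBRow_SSE2) is the same Y term with the U and V contributions dropped, R = G = B = clamp(((y - 16) * 74) >> 6), with alpha forced to 0xFF by the por against the 0xFF000000 mask.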