Port the SSSE3 YUV-to-RGB row converters to GCC inline assembly

BUG=none TEST=media_unittest Review URL: http://webrtc-codereview.appspot.com/269015 git-svn-id: http://libyuv.googlecode.com/svn/trunk@80 16f28f9a-4ce2-e073-06de-1de4eb20be90
2025-12-06 16:56:55 +08:00 · 2011-11-15 21:58:26 +00:00 · 2011-11-15 21:58:26 +00:00 · 228bdc24e4
commit 228bdc24e4
parent 4cf70bd6db
5 changed files with 299 additions and 271 deletions
--- a/README.chromium
+++ b/README.chromium
@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 79
+Version: 80
 License: BSD
 License File: LICENSE
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@ -1136,19 +1136,6 @@ int I420ToARGB(const uint8* src_y, int src_stride_y,
      IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
    FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_SSSE3;
  } else
 #endif
 #if defined(HAS_FASTCONVERTYUVTOARGBROW_SSE2)
  if (TestCpuFlag(kCpuHasSSE2) &&
      (width % 4 == 0) &&
      IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
    FastConvertYUVToARGBRow = FastConvertYUVToARGBRow4_SSE2;
  } else
 #endif
 #if defined(HAS_FASTCONVERTYUVTOARGBROW_SSE2)
  if (TestCpuFlag(kCpuHasSSE2) &&
      (width % 2 == 0)) {
    FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_SSE2;
  } else
 #endif
  {
    FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_C;
@ -1188,12 +1175,6 @@ int I420ToBGRA(const uint8* src_y, int src_stride_y,
      IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
    FastConvertYUVToBGRARow = FastConvertYUVToBGRARow_SSSE3;
  } else
 #endif
 #if defined(HAS_FASTCONVERTYUVTOBGRAROW_SSE2)
  if (TestCpuFlag(kCpuHasSSE2) &&
      (width % 2 == 0)) {
    FastConvertYUVToBGRARow = FastConvertYUVToBGRARow_SSE2;
  } else
 #endif
  {
    FastConvertYUVToBGRARow = FastConvertYUVToBGRARow_C;
@ -1233,12 +1214,6 @@ int I420ToABGR(const uint8* src_y, int src_stride_y,
      IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
    FastConvertYUVToABGRRow = FastConvertYUVToABGRRow_SSSE3;
  } else
 #endif
 #if defined(HAS_FASTCONVERTYUVTOABGRROW_SSE2)
  if (TestCpuFlag(kCpuHasSSE2) &&
      (width % 2 == 0)) {
    FastConvertYUVToABGRRow = FastConvertYUVToABGRRow_SSE2;
  } else
 #endif
  {
    FastConvertYUVToABGRRow = FastConvertYUVToABGRRow_C;
@ -1278,12 +1253,6 @@ int I422ToARGB(const uint8* src_y, int src_stride_y,
      IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
    FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_SSSE3;
  } else
 #endif
 #if defined(HAS_FASTCONVERTYUVTOARGBROW_SSE2)
  if (TestCpuFlag(kCpuHasSSE2) &&
      (width % 2 == 0)) {
    FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_SSE2;
  } else
 #endif
  {
    FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_C;
@ -1321,11 +1290,6 @@ int I444ToARGB(const uint8* src_y, int src_stride_y,
      IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
    FastConvertYUV444ToARGBRow = FastConvertYUV444ToARGBRow_SSSE3;
  } else
 #endif
 #if defined(HAS_FASTCONVERTYUVTOARGBROW_SSE2)
  if (TestCpuFlag(kCpuHasSSE2)) {
    FastConvertYUV444ToARGBRow = FastConvertYUV444ToARGBRow_SSE2;
  } else
 #endif
  {
    FastConvertYUV444ToARGBRow = FastConvertYUV444ToARGBRow_C;
@ -1354,7 +1318,7 @@ int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
                                 uint8* rgb_buf,
                                 int width);
 #if defined(HAS_FASTCONVERTYTOARGBROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSSE3) &&
+  if (TestCpuFlag(kCpuHasSSE2) &&
      (width % 8 == 0) &&
      IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
    FastConvertYToARGBRow = FastConvertYToARGBRow_SSE2;
--- a/source/row.h
+++ b/source/row.h
@ -37,28 +37,17 @@
 #define HAS_BGRATOUVROW_SSSE3
 #define HAS_ABGRTOUVROW_SSSE3
 #define HAS_I400TOARGBROW_SSE2
 #endif
 // The following are available on Linux (32/64 bit)
 // TODO(fbarchard): enable for fpic on linux
 #if (defined(__x86_64__) || \
    (defined(__i386__) && !defined(__pic__))) && \
    !defined(LIBYUV_DISABLE_ASM)
 #define HAS_FASTCONVERTYUVTOARGBROW_SSE2
 #define HAS_FASTCONVERTYUVTOBGRAROW_SSE2
 #define HAS_FASTCONVERTYUVTOABGRROW_SSE2
 #define HAS_FASTCONVERTYUV444TOARGBROW_SSE2
 #define HAS_FASTCONVERTYTOARGBROW_SSE2
 #endif
-// The following are available on Windows
+// The following are available on all x86 platforms except 32 bit OSX
-#if defined(WIN32) && \
+#if (defined(WIN32) || defined(__x86_64__) || \
    (defined(__i386__) && !defined(__APPLE__))) && \
    !defined(LIBYUV_DISABLE_ASM)
 #define HAS_FASTCONVERTYUVTOARGBROW_SSSE3
 #define HAS_FASTCONVERTYUVTOBGRAROW_SSSE3
 #define HAS_FASTCONVERTYUVTOABGRROW_SSSE3
 #define HAS_FASTCONVERTYUV444TOARGBROW_SSSE3
 #define HAS_FASTCONVERTYTOARGBROW_SSE2
 #endif
 extern "C" {
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@ -14,49 +14,49 @@
 extern "C" {
 #ifdef HAS_ARGBTOYROW_SSSE3
 // Constant multiplication table for converting ARGB to I400.
 static const vec8 kARGBToY = {
  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
 };
 static const uvec8 kAddY16 = {
  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
 };
 #ifdef HAS_ARGBTOUVROW_SSSE3
-static const vec8 kARGBToU = {
+vec8 kARGBToU = {
  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
 };
-static const uvec8 kARGBToV = {
+uvec8 kARGBToV = {
  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0
 };
-static const uvec8 kAddUV128 = {
+uvec8 kAddUV128 = {
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
 };
 #endif
 #ifdef HAS_ARGBTOYROW_SSSE3
 // Constant multiplication table for converting ARGB to I400.
 vec8 kARGBToY = {
  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
 };
 uvec8 kAddY16 = {
  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
 };
 // Shuffle table for converting BG24 to ARGB.
-static const uvec8 kShuffleMaskBG24ToARGB = {
+uvec8 kShuffleMaskBG24ToARGB = {
  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
 };
 // Shuffle table for converting RAW to ARGB.
-static const uvec8 kShuffleMaskRAWToARGB = {
+uvec8 kShuffleMaskRAWToARGB = {
  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
 };
 // Shuffle table for converting ABGR to ARGB.
-static const uvec8 kShuffleMaskABGRToARGB = {
+uvec8 kShuffleMaskABGRToARGB = {
  2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
 };
 // Shuffle table for converting BGRA to ARGB.
-static const uvec8 kShuffleMaskBGRAToARGB = {
+uvec8 kShuffleMaskBGRAToARGB = {
  3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
 };
@ -145,17 +145,17 @@ void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix) {
  "movdqa     0x20(%0),%%xmm3                  \n"
  "lea        0x30(%0),%0                      \n"
  "movdqa     %%xmm3,%%xmm2                    \n"
-  "palignr    $0x8,%%xmm1,%%xmm2               \n"  // xmm2 = { xmm3[0:3] xmm1[8:15] }
+  "palignr    $0x8,%%xmm1,%%xmm2               \n"
  "pshufb     %%xmm4,%%xmm2                    \n"
  "por        %%xmm5,%%xmm2                    \n"
-  "palignr    $0xc,%%xmm0,%%xmm1               \n"  // xmm1 = { xmm3[0:7] xmm0[12:15] }
+  "palignr    $0xc,%%xmm0,%%xmm1               \n"
  "pshufb     %%xmm4,%%xmm0                    \n"
  "movdqa     %%xmm2,0x20(%1)                  \n"
  "por        %%xmm5,%%xmm0                    \n"
  "pshufb     %%xmm4,%%xmm1                    \n"
  "movdqa     %%xmm0,(%1)                      \n"
  "por        %%xmm5,%%xmm1                    \n"
-  "palignr    $0x4,%%xmm3,%%xmm3               \n"  // xmm3 = { xmm3[4:15] }
+  "palignr    $0x4,%%xmm3,%%xmm3               \n"
  "pshufb     %%xmm4,%%xmm3                    \n"
  "movdqa     %%xmm1,0x10(%1)                  \n"
  "por        %%xmm5,%%xmm3                    \n"
@ -185,17 +185,17 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
  "movdqa     0x20(%0),%%xmm3                  \n"
  "lea        0x30(%0),%0                      \n"
  "movdqa     %%xmm3,%%xmm2                    \n"
-  "palignr    $0x8,%%xmm1,%%xmm2               \n"  // xmm2 = { xmm3[0:3] xmm1[8:15] }
+  "palignr    $0x8,%%xmm1,%%xmm2               \n"
  "pshufb     %%xmm4,%%xmm2                    \n"
  "por        %%xmm5,%%xmm2                    \n"
-  "palignr    $0xc,%%xmm0,%%xmm1               \n"  // xmm1 = { xmm3[0:7] xmm0[12:15] }
+  "palignr    $0xc,%%xmm0,%%xmm1               \n"
  "pshufb     %%xmm4,%%xmm0                    \n"
  "movdqa     %%xmm2,0x20(%1)                  \n"
  "por        %%xmm5,%%xmm0                    \n"
  "pshufb     %%xmm4,%%xmm1                    \n"
  "movdqa     %%xmm0,(%1)                      \n"
  "por        %%xmm5,%%xmm1                    \n"
-  "palignr    $0x4,%%xmm3,%%xmm3               \n"  // xmm3 = { xmm3[4:15] }
+  "palignr    $0x4,%%xmm3,%%xmm3               \n"
  "pshufb     %%xmm4,%%xmm3                    \n"
  "movdqa     %%xmm1,0x10(%1)                  \n"
  "por        %%xmm5,%%xmm3                    \n"
@ -318,227 +318,318 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
 }
 #endif
-// The following code requires 6 registers and prefers 7 registers.
+
-// 7 registers requires -fpic to be off, and -fomit-frame-pointer
+#ifdef HAS_FASTCONVERTYTOARGBROW_SSE2
-#ifdef HAS_FASTCONVERTYUVTOARGBROW_SSE2
+#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */
-#if defined(__x86_64__)
+
-#define REG_a "rax"
+vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG };
-#define REG_d "rdx"
+vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 };
 #else
 #define REG_a "eax"
 #define REG_d "edx"
 #endif
 #ifdef HAS_FASTCONVERTYUVTOARGBROW_SSSE3
 // U/V-to-RGB coefficients in fixed point with 6 fractional bits (scaled by
 // 64). They are stored as signed bytes in the kUVTo* tables, so each value
 // must fit in the int8 range.
 #define UB 127 /* 2.018 * 64 = 129, clamped to the int8 maximum of 127 */
 #define UG -25 /* static_cast<int8>(-0.391 * 64 - 0.5) */
 #define UR 0
 #define VB 0
 #define VG -52 /* static_cast<int8>(-0.813 * 64 - 0.5) */
 #define VR 102 /* static_cast<int8>(1.596 * 64 + 0.5) */
 // Bias: coefficient * 128 for each channel. The row functions multiply the
 // raw (unsigned) U/V bytes by the coefficients and then subtract this bias
 // with psubw, which accounts for the 128 midpoint of U and V.
 // NOTE(review): these expand without parentheses; they are only used as
 // whole initializer elements in this file - confirm before reusing elsewhere.
 #define BB UB * 128 + VB * 128
 #define BG UG * 128 + VG * 128
 #define BR UR * 128 + VR * 128
 vec8 kUVToB = {
  UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
 };
 vec8 kUVToR = {
  UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
 };
 vec8 kUVToG = {
  UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
 };
 vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB };
 vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG };
 vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
 #if defined(__APPLE__) || defined(__x86_64__)
 #define OMITFP
 #else
 #define OMITFP __attribute__((optimize("omit-frame-pointer")))
 #endif
-#define CLOBBER "%"REG_a, "%"REG_d
+// This version produces 8 pixels
 // This version produces 2 pixels
 #define YUVTORGB                                                               \
-"1:                                            \n"                             \
+  "movd        (%1),%%xmm0                     \n"                             \
-  "movzb      (%1),%%"REG_a"                   \n"                             \
+  "movd        (%1,%2,1),%%xmm1                \n"                             \
-  "lea        1(%1),%1                         \n"                             \
+  "lea         0x4(%1),%1                      \n"                             \
-  "movzb      (%2),%%"REG_d"                   \n"                             \
+  "punpcklbw   %%xmm1,%%xmm0                   \n"                             \
-  "lea        1(%2),%2                         \n"                             \
+  "punpcklwd   %%xmm0,%%xmm0                   \n"                             \
-  "movq       2048(%5,%%"REG_a",8),%%xmm0      \n"                             \
+  "movdqa      %%xmm0,%%xmm1                   \n"                             \
-  "movzb      0(%0),%%"REG_a"                  \n"                             \
+  "movdqa      %%xmm0,%%xmm2                   \n"                             \
-  "movq       4096(%5,%%"REG_d",8),%%xmm1      \n"                             \
+  "pmaddubsw   %5,%%xmm0                       \n"                             \
-  "paddsw     %%xmm1,%%xmm0                    \n"                             \
+  "pmaddubsw   %6,%%xmm1                       \n"                             \
-  "movzb      1(%0),%%"REG_d"                  \n"                             \
+  "pmaddubsw   %7,%%xmm2                       \n"                             \
-  "punpcklqdq %%xmm0,%%xmm0                    \n"                             \
+  "psubw       %8,%%xmm0                       \n"                             \
-  "lea        2(%0),%0                         \n"                             \
+  "psubw       %9,%%xmm1                       \n"                             \
-  "movq       0(%5,%%"REG_a",8),%%xmm1         \n"                             \
+  "psubw       %10,%%xmm2                      \n"                             \
-  "movhps     0(%5,%%"REG_d",8),%%xmm1         \n"                             \
+  "movq        (%0),%%xmm3                     \n"                             \
-  "paddsw     %%xmm0,%%xmm1                    \n"                             \
+  "lea         0x8(%0),%0                      \n"                             \
-  "psraw      $6,%%xmm1                        \n"                             \
+  "punpcklbw   %%xmm4,%%xmm3                   \n"                             \
  "psubsw      %11,%%xmm3                      \n"                             \
  "pmullw      %12,%%xmm3                      \n"                             \
  "paddw       %%xmm3,%%xmm0                   \n"                             \
  "paddw       %%xmm3,%%xmm1                   \n"                             \
  "paddw       %%xmm3,%%xmm2                   \n"                             \
  "psraw       $0x6,%%xmm0                     \n"                             \
  "psraw       $0x6,%%xmm1                     \n"                             \
  "psraw       $0x6,%%xmm2                     \n"                             \
  "packuswb    %%xmm0,%%xmm0                   \n"                             \
  "packuswb    %%xmm1,%%xmm1                   \n"                             \
  "movq       %%xmm1,0(%3)                     \n"                             \
  "lea        8(%3),%3                         \n"                             \
  "sub        $0x2,%4                          \n"                             \
  "ja         1b                               \n"
 // This version produces 4 pixels
 #define YUVTORGB4                                                              \
 "1:                                            \n"                             \
  "movzb      0(%1),%%"REG_a"                  \n"                             \
  "movzb      0(%2),%%"REG_d"                  \n"                             \
  "movq       2048(%5,%%"REG_a",8),%%xmm0      \n"                             \
  "movzb      0(%0),%%"REG_a"                  \n"                             \
  "movq       4096(%5,%%"REG_d",8),%%xmm1      \n"                             \
  "paddsw     %%xmm1,%%xmm0                    \n"                             \
  "movzb      1(%0),%%"REG_d"                  \n"                             \
  "punpcklqdq %%xmm0,%%xmm0                    \n"                             \
  "movq       0(%5,%%"REG_a",8),%%xmm2         \n"                             \
  "movhps     0(%5,%%"REG_d",8),%%xmm2         \n"                             \
  "paddsw     %%xmm0,%%xmm2                    \n"                             \
  "psraw      $6,%%xmm2                        \n"                             \
  "movzb      1(%1),%%"REG_a"                  \n"                             \
  "movzb      1(%2),%%"REG_d"                  \n"                             \
  "movq       2048(%5,%%"REG_a",8),%%xmm0      \n"                             \
  "movzb      2(%0),%%"REG_a"                  \n"                             \
  "movq       4096(%5,%%"REG_d",8),%%xmm1      \n"                             \
  "paddsw     %%xmm1,%%xmm0                    \n"                             \
  "movzb      3(%0),%%"REG_d"                  \n"                             \
  "punpcklqdq %%xmm0,%%xmm0                    \n"                             \
  "movq       0(%5,%%"REG_a",8),%%xmm3         \n"                             \
  "movhps     0(%5,%%"REG_d",8),%%xmm3         \n"                             \
  "paddsw     %%xmm0,%%xmm3                    \n"                             \
  "psraw      $6,%%xmm3                        \n"                             \
  "lea        2(%1),%1                         \n"                             \
  "lea        2(%2),%2                         \n"                             \
  "lea        4(%0),%0                         \n"                             \
  "packuswb   %%xmm3,%%xmm2                    \n"                             \
  "movdqa     %%xmm2,0(%3)                     \n"                             \
  "lea        16(%3),%3                        \n"                             \
  "sub        $0x4,%4                          \n"                             \
  "ja         1b                               \n"                             \
 // 6 or 7 registers
 void OMITFP FastConvertYUVToARGBRow_SSE2(const uint8* y_buf,  // rdi
                                         const uint8* u_buf,  // rsi
                                         const uint8* v_buf,  // rdx
                                         uint8* rgb_buf,      // rcx
                                         int width) {         // r8
  asm volatile (
    YUVTORGB
  : "+r"(y_buf),    // %0
    "+r"(u_buf),    // %1
    "+r"(v_buf),    // %2
    "+r"(rgb_buf),  // %3
    "+rm"(width)    // %4
  : "r" (kCoefficientsRgbY)  // %5
  : "memory", "cc", CLOBBER
 #if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3"
 #endif
 );
 }
 // 6 or 7 registers
 void OMITFP FastConvertYUVToARGBRow4_SSE2(const uint8* y_buf,  // rdi
                                          const uint8* u_buf,  // rsi
                                          const uint8* v_buf,  // rdx
                                          uint8* rgb_buf,      // rcx
                                          int width) {         // r8
  asm volatile (
    YUVTORGB4
  : "+r"(y_buf),    // %0
    "+r"(u_buf),    // %1
    "+r"(v_buf),    // %2
    "+r"(rgb_buf),  // %3
    "+rm"(width)    // %4
  : "r" (kCoefficientsRgbY)  // %5
  : "memory", "cc", CLOBBER
 #if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3"
 #endif
 );
 }
 void OMITFP FastConvertYUVToBGRARow_SSE2(const uint8* y_buf,  // rdi
                                         const uint8* u_buf,  // rsi
                                         const uint8* v_buf,  // rdx
                                         uint8* rgb_buf,      // rcx
                                         int width) {         // r8
  asm volatile (
    YUVTORGB
  : "+r"(y_buf),    // %0
    "+r"(u_buf),    // %1
    "+r"(v_buf),    // %2
    "+r"(rgb_buf),  // %3
    "+rm"(width)    // %4
  : "r" (kCoefficientsBgraY)  // %5
  : "memory", "cc", CLOBBER
 #if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3"
 #endif
 );
 }
 void OMITFP FastConvertYUVToABGRRow_SSE2(const uint8* y_buf,  // rdi
                                         const uint8* u_buf,  // rsi
                                         const uint8* v_buf,  // rdx
                                         uint8* rgb_buf,      // rcx
                                         int width) {         // r8
  asm volatile (
    YUVTORGB
  : "+r"(y_buf),    // %0
    "+r"(u_buf),    // %1
    "+r"(v_buf),    // %2
    "+r"(rgb_buf),  // %3
    "+rm"(width)    // %4
  : "r" (kCoefficientsAbgrY)  // %5
  : "memory", "cc", CLOBBER
 #if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3"
 #endif
 );
 }
 // 6 registers
 void OMITFP FastConvertYUV444ToARGBRow_SSE2(const uint8* y_buf,  // rdi
                                            const uint8* u_buf,  // rsi
                                            const uint8* v_buf,  // rdx
                                            uint8* rgb_buf,      // rcx
                                            int width) {         // r8
  asm volatile (
 "1:                                            \n"
  "movzb  (%1),%%"REG_a"                       \n"
  "lea    1(%1),%1                             \n"
  "movq   2048(%5,%%"REG_a",8),%%xmm0          \n"
  "movzb  (%2),%%"REG_a"                       \n"
  "lea    1(%2),%2                             \n"
  "movq   4096(%5,%%"REG_a",8),%%xmm1          \n"
  "paddsw %%xmm1,%%xmm0                        \n"
  "movzb  (%0),%%"REG_a"                       \n"
  "lea    1(%0),%0                             \n"
  "movq   0(%5,%%"REG_a",8),%%xmm2             \n"
  "paddsw %%xmm0,%%xmm2                        \n"
  "shufps $0x44,%%xmm2,%%xmm2                  \n"
  "psraw  $0x6,%%xmm2                          \n"
  "packuswb    %%xmm2,%%xmm2                   \n"
-  "movd   %%xmm2,0x0(%3)                       \n"
+
-  "lea    4(%3),%3                             \n"
+void OMITFP FastConvertYUVToARGBRow_SSSE3(const uint8* y_buf,  // rdi
-  "sub    $0x1,%4                              \n"
+                                          const uint8* u_buf,  // rsi
                                          const uint8* v_buf,  // rdx
                                          uint8* rgb_buf,      // rcx
                                          int width) {         // r8
  asm volatile (
    "sub         %1,%2                         \n"
    "pcmpeqb     %%xmm5,%%xmm5                 \n"
    "pxor        %%xmm4,%%xmm4                 \n"
  "1:                                          \n"
    YUVTORGB
    "punpcklbw   %%xmm1,%%xmm0                 \n"
    "punpcklbw   %%xmm5,%%xmm2                 \n"
    "movdqa      %%xmm0,%%xmm1                 \n"
    "punpcklwd   %%xmm2,%%xmm0                 \n"
    "movdqa      %%xmm0,(%3)                   \n"
    "punpckhwd   %%xmm2,%%xmm1                 \n"
    "movdqa      %%xmm1,0x10(%3)               \n"
    "lea         0x20(%3),%3                   \n"
    "sub         $0x8,%4                       \n"
    "ja          1b                            \n"
  : "+r"(y_buf),    // %0
    "+r"(u_buf),    // %1
    "+r"(v_buf),    // %2
    "+r"(rgb_buf),  // %3
    "+rm"(width)    // %4
-  : "r" (kCoefficientsRgbY)  // %5
+  : "m" (kUVToB),   // %5
-  : "memory", "cc", "%"REG_a
+    "m" (kUVToG),   // %6
    "m" (kUVToR),   // %7
    "m" (kUVBiasB), // %8
    "m" (kUVBiasG), // %9
    "m" (kUVBiasR), // %10
    "m" (kYSub16),  // %11
    "m" (kYToRgb)   // %12
  : "memory", "cc"
 #if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2"
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
 #endif
  );
 }
-// 5 registers
+void OMITFP FastConvertYUVToBGRARow_SSSE3(const uint8* y_buf,  // rdi
                                           const uint8* u_buf,  // rsi
                                           const uint8* v_buf,  // rdx
                                           uint8* rgb_buf,      // rcx
                                           int width) {         // r8
   // Converts a row of Y, U, V samples to 32-bit pixels written in memory as
   // A, R, G, B bytes with A = 0xff. Each loop iteration consumes 8 Y bytes
   // and stores 32 bytes (8 pixels) with movdqa, so rgb_buf must be 16-byte
   // aligned.
   // NOTE(review): the loop subtracts 8 from width and repeats while the
   // result is above zero; presumably callers pass a multiple of 8 - confirm.
   asm volatile (
     // Turn v_buf into an offset from u_buf so one pointer indexes both planes.
     "sub         %1,%2                         \n"
     "pcmpeqb     %%xmm5,%%xmm5                 \n"
     // xmm4 = 0, used by YUVTORGB to zero-extend the Y bytes to words.
     "pxor        %%xmm4,%%xmm4                 \n"
   "1:                                          \n"
     // Leaves B in xmm0, G in xmm1 and R in xmm2 (operands %5, %6, %7).
     YUVTORGB
     // xmm5 is overwritten by the weave below, so the all-ones alpha source
     // is regenerated on every iteration.
     "pcmpeqb     %%xmm5,%%xmm5                 \n"
     // xmm1 = G,B byte pairs.
     "punpcklbw   %%xmm0,%%xmm1                 \n"
     // xmm5 = A,R byte pairs.
     "punpcklbw   %%xmm2,%%xmm5                 \n"
     "movdqa      %%xmm5,%%xmm0                 \n"
     // Interleave words to form A,R,G,B: low half gives the first 4 pixels.
     "punpcklwd   %%xmm1,%%xmm5                 \n"
     "movdqa      %%xmm5,(%3)                   \n"
     // High half gives the next 4 pixels.
     "punpckhwd   %%xmm1,%%xmm0                 \n"
     "movdqa      %%xmm0,0x10(%3)               \n"
     "lea         0x20(%3),%3                   \n"
     "sub         $0x8,%4                       \n"
     "ja          1b                            \n"
   : "+r"(y_buf),    // %0
     "+r"(u_buf),    // %1
     "+r"(v_buf),    // %2
     "+r"(rgb_buf),  // %3
     "+rm"(width)    // %4
   : "m" (kUVToB),   // %5
     "m" (kUVToG),   // %6
     "m" (kUVToR),   // %7
     "m" (kUVBiasB), // %8
     "m" (kUVBiasG), // %9
     "m" (kUVBiasR), // %10
     "m" (kYSub16),  // %11
     "m" (kYToRgb)   // %12
   : "memory", "cc"
  #if defined(__SSE2__)
     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  #endif
   );
  }
 // Converts a row of Y, U, V samples to 32-bit pixels written in memory as
 // R, G, B, A bytes with A = 0xff. Each loop iteration consumes 8 Y bytes and
 // stores 32 bytes (8 pixels) with movdqa, so rgb_buf must be 16-byte aligned.
 // NOTE(review): the loop subtracts 8 from width and repeats while the result
 // is above zero; presumably callers pass a multiple of 8 - confirm.
 void OMITFP FastConvertYUVToABGRRow_SSSE3(const uint8* y_buf,  // rdi
                                           const uint8* u_buf,  // rsi
                                           const uint8* v_buf,  // rdx
                                           uint8* rgb_buf,      // rcx
                                           int width) {         // r8
   asm volatile (
     // Turn v_buf into an offset from u_buf so one pointer indexes both planes.
     "sub         %1,%2                         \n"
     // xmm5 = all ones, the source of the 0xff alpha bytes.
     "pcmpeqb     %%xmm5,%%xmm5                 \n"
     // xmm4 = 0, used by YUVTORGB to zero-extend the Y bytes to words.
     "pxor        %%xmm4,%%xmm4                 \n"
   "1:                                          \n"
     // Leaves B in xmm0, G in xmm1 and R in xmm2 (operands %5, %6, %7).
     YUVTORGB
     // NOTE(review): YUVTORGB as shown in this change already ends with
     // packuswb on its results. If xmm0-xmm2 hold packed bytes at this point,
     // packing them again saturates byte pairs as words and corrupts the
     // values. The ARGB and BGRA variants do not repack - confirm whether
     // these three instructions should be here.
     "packuswb    %%xmm0,%%xmm0                 \n"
     "packuswb    %%xmm1,%%xmm1                 \n"
     "packuswb    %%xmm2,%%xmm2                 \n"
     // xmm2 = R,G byte pairs.
     "punpcklbw   %%xmm1,%%xmm2                 \n"
     // xmm0 = B,A byte pairs.
     "punpcklbw   %%xmm5,%%xmm0                 \n"
     "movdqa      %%xmm2,%%xmm1                 \n"
     // Interleave words to form R,G,B,A: low half gives the first 4 pixels.
     "punpcklwd   %%xmm0,%%xmm2                 \n"
     "movdqa      %%xmm2,(%3)                   \n"
     // High half gives the next 4 pixels.
     "punpckhwd   %%xmm0,%%xmm1                 \n"
     "movdqa      %%xmm1,0x10(%3)               \n"
     "lea         0x20(%3),%3                   \n"
     "sub         $0x8,%4                       \n"
     "ja          1b                            \n"
   : "+r"(y_buf),    // %0
     "+r"(u_buf),    // %1
     "+r"(v_buf),    // %2
     "+r"(rgb_buf),  // %3
     "+rm"(width)    // %4
   : "m" (kUVToB),   // %5
     "m" (kUVToG),   // %6
     "m" (kUVToR),   // %7
     "m" (kUVBiasB), // %8
     "m" (kUVBiasG), // %9
     "m" (kUVBiasR), // %10
     "m" (kYSub16),  // %11
     "m" (kYToRgb)   // %12
   : "memory", "cc"
  #if defined(__SSE2__)
     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  #endif
   );
  }
 // Converts a row with one U and one V sample per Y sample to 32-bit pixels
 // written in memory as B, G, R, A bytes with A = 0xff. Each loop iteration
 // consumes 4 bytes from each plane and stores 16 bytes (4 pixels) with
 // movdqa, so rgb_buf must be 16-byte aligned.
 // NOTE(review): the loop subtracts 4 from width and repeats while the result
 // is above zero; presumably callers pass a multiple of 4 - confirm.
 void OMITFP FastConvertYUV444ToARGBRow_SSSE3(const uint8* y_buf,  // rdi
                                              const uint8* u_buf,  // rsi
                                              const uint8* v_buf,  // rdx
                                              uint8* rgb_buf,      // rcx
                                              int width) {         // r8
   asm volatile (
     // Turn v_buf into an offset from u_buf so one pointer indexes both planes.
     "sub         %1,%2                         \n"
     // xmm5 = all ones, the source of the 0xff alpha bytes.
     "pcmpeqb     %%xmm5,%%xmm5                 \n"
     // xmm4 = 0, used to zero-extend the Y bytes to words.
     "pxor        %%xmm4,%%xmm4                 \n"
   "1:                                          \n"
     // Load 4 U and 4 V bytes and interleave them into U,V byte pairs.
     "movd        (%1),%%xmm0                   \n"
     "movd        (%1,%2,1),%%xmm1              \n"
     "lea         0x4(%1),%1                    \n"
     "punpcklbw   %%xmm1,%%xmm0                 \n"
     "movdqa      %%xmm0,%%xmm1                 \n"
     "movdqa      %%xmm0,%%xmm2                 \n"
     // Multiply-add each U,V pair by the per-channel coefficients:
     // xmm0 = B, xmm1 = G, xmm2 = R contributions as 16-bit words.
     "pmaddubsw   %5,%%xmm0                     \n"
     "pmaddubsw   %6,%%xmm1                     \n"
     "pmaddubsw   %7,%%xmm2                     \n"
     // Subtract the per-channel bias tables.
     "psubw       %8,%%xmm0                     \n"
     "psubw       %9,%%xmm1                     \n"
     "psubw       %10,%%xmm2                    \n"
     // Load 4 Y bytes, widen to words, subtract kYSub16, scale by kYToRgb.
     "movd        (%0),%%xmm3                   \n"
     "lea         0x4(%0),%0                    \n"
     "punpcklbw   %%xmm4,%%xmm3                 \n"
     "psubsw      %11,%%xmm3                    \n"
     "pmullw      %12,%%xmm3                    \n"
     // Add the scaled Y term to every channel.
     "paddw       %%xmm3,%%xmm0                 \n"
     "paddw       %%xmm3,%%xmm1                 \n"
     "paddw       %%xmm3,%%xmm2                 \n"
     // Drop the 6 fractional bits, then saturate to unsigned bytes.
     "psraw       $0x6,%%xmm0                   \n"
     "psraw       $0x6,%%xmm1                   \n"
     "psraw       $0x6,%%xmm2                   \n"
     "packuswb    %%xmm0,%%xmm0                 \n"
     "packuswb    %%xmm1,%%xmm1                 \n"
     "packuswb    %%xmm2,%%xmm2                 \n"
     // Weave: xmm0 = B,G pairs, xmm2 = R,A pairs, then words into B,G,R,A.
     "punpcklbw   %%xmm1,%%xmm0                 \n"
     "punpcklbw   %%xmm5,%%xmm2                 \n"
     "punpcklwd   %%xmm2,%%xmm0                 \n"
     "movdqa      %%xmm0,(%3)                   \n"
     "lea         0x10(%3),%3                   \n"
     "sub         $0x4,%4                       \n"
     "ja          1b                            \n"
   : "+r"(y_buf),    // %0
     "+r"(u_buf),    // %1
     "+r"(v_buf),    // %2
     "+r"(rgb_buf),  // %3
     "+rm"(width)    // %4
   : "m" (kUVToB),   // %5
     "m" (kUVToG),   // %6
     "m" (kUVToR),   // %7
     "m" (kUVBiasB), // %8
     "m" (kUVBiasG), // %9
     "m" (kUVBiasR), // %10
     "m" (kYSub16),  // %11
     "m" (kYToRgb)   // %12
   : "memory", "cc"
  #if defined(__SSE2__)
     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  #endif
   );
  }
 #endif
 #ifdef HAS_FASTCONVERTYTOARGBROW_SSE2
 void FastConvertYToARGBRow_SSE2(const uint8* y_buf,  // rdi
                                uint8* rgb_buf,      // rcx
                                int width) {         // r8
  asm volatile (
  "pcmpeqb     %%xmm5,%%xmm5                   \n"
  "pslld       $0x18,%%xmm5                    \n"
  "pxor        %%xmm4,%%xmm4                   \n"
  "movdqa      %3,%%xmm3                       \n"
  "movdqa      %4,%%xmm2                       \n"
  "1:                                          \n"
-  "movzb  (%0),%%"REG_a"                       \n"
+  // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
-  "movzb  0x1(%0),%%"REG_d"                    \n"
+  "movq        (%0),%%xmm0                     \n"
-  "movq   (%3,%%"REG_a",8),%%xmm2              \n"
+  "lea         0x8(%0),%0                      \n"
-  "lea    2(%0),%0                             \n"
+  "punpcklbw   %%xmm4,%%xmm0                   \n"
-  "movhps (%3,%%"REG_d",8),%%xmm2              \n"
+  "psubsw      %%xmm3,%%xmm0                   \n"
-  "psraw  $0x6,%%xmm2                          \n"
+  "pmullw      %%xmm2,%%xmm0                   \n"
-  "packuswb %%xmm2,%%xmm2                      \n"
+  "psraw       $0x6,%%xmm0                     \n"
-  "movq   %%xmm2,0x0(%1)                       \n"
+  "packuswb    %%xmm0,%%xmm0                   \n"
-  "lea    8(%1),%1                             \n"
+
-  "sub    $0x2,%2                              \n"
+  // Step 2: Weave into ARGB
  "punpcklbw   %%xmm0,%%xmm0                   \n"
  "movdqa      %%xmm0,%%xmm1                   \n"
  "punpcklwd   %%xmm0,%%xmm0                   \n"
  "por         %%xmm5,%%xmm0                   \n"
  "movdqa      %%xmm0,(%1)                     \n"
  "punpckhwd   %%xmm1,%%xmm1                   \n"
  "por         %%xmm5,%%xmm1                   \n"
  "movdqa      %%xmm1,16(%1)                   \n"
  "lea         32(%1),%1                       \n"
  "sub         $0x8,%2                         \n"
  "ja          1b                              \n"
  : "+r"(y_buf),    // %0
    "+r"(rgb_buf),  // %1
    "+rm"(width)    // %2
-  : "r" (kCoefficientsRgbY)  // %3
+  : "m" (kYSub16),  // %3
-  : "memory", "cc", "%"REG_a, "%"REG_d
+    "m" (kYToRgb)   // %4
  : "memory", "cc"
 #if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2"
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
 #endif
  );
 }
--- a/source/row_win.cc
+++ b/source/row_win.cc
@ -54,8 +54,7 @@ static const vec8 kABGRToV = {
 };
 static const uvec8 kAddY16 = {
-  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
+  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
 };
 static const uvec8 kAddUV128 = {
@ -548,27 +547,13 @@ static const vec8 kUVToG = {
  UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
 };
-static const vec16 kYToRgb = {
+static const vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG };
-  YG, YG, YG, YG, YG, YG, YG, YG
+static const vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 };
-};
+static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB };
 static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG };
 static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
-static const vec16 kYSub16 = {
+#define YUVTORGB __asm {                                                 \
  16, 16, 16, 16, 16, 16, 16, 16
 };
 static const vec16 kUVBiasB = {
  BB, BB, BB, BB, BB, BB, BB, BB
 };
 static const vec16 kUVBiasG = {
  BG, BG, BG, BG, BG, BG, BG, BG
 };
 static const vec16 kUVBiasR = {
  BR, BR, BR, BR, BR, BR, BR, BR
 };
 #define YUVTORGB_SSSE3 __asm {                                                 \
    /* Step 1: Find 4 UV contributions to 8 R,G,B values */                    \
    __asm movd       xmm0, [esi]          /* U */                              \
    __asm movd       xmm1, [esi + edi]    /* V */                              \
@ -619,7 +604,7 @@ void FastConvertYUVToARGBRow_SSSE3(const uint8* y_buf,
    pxor       xmm4, xmm4
 convertloop:
-    YUVTORGB_SSSE3
+    YUVTORGB
    // Step 3: Weave into ARGB
    punpcklbw  xmm0, xmm1           // BG
@ -658,7 +643,7 @@ void FastConvertYUVToBGRARow_SSSE3(const uint8* y_buf,
    pxor       xmm4, xmm4
 convertloop:
-    YUVTORGB_SSSE3
+    YUVTORGB
    // Step 3: Weave into BGRA
    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
@ -699,7 +684,7 @@ void FastConvertYUVToABGRRow_SSSE3(const uint8* y_buf,
    pxor       xmm4, xmm4
 convertloop:
-    YUVTORGB_SSSE3
+    YUVTORGB
    // Step 3: Weave into ARGB
    punpcklbw  xmm2, xmm1           // RG
@ -787,7 +772,6 @@ void FastConvertYUV444ToARGBRow_SSSE3(const uint8* y_buf,
 #endif
 #ifdef HAS_FASTCONVERTYTOARGBROW_SSE2
 __declspec(naked)
 void FastConvertYToARGBRow_SSE2(const uint8* y_buf,
                                uint8* rgb_buf,
@ -829,8 +813,8 @@ void FastConvertYToARGBRow_SSE2(const uint8* y_buf,
    ret
  }
 }
 #endif
 #endif
 }  // extern "C"