remove mmx functions

BUG=none TEST=builds Review URL: http://webrtc-codereview.appspot.com/269010 git-svn-id: http://libyuv.googlecode.com/svn/trunk@77 16f28f9a-4ce2-e073-06de-1de4eb20be90
2025-12-06 16:56:55 +08:00 · 2011-11-11 18:41:47 +00:00 · 2011-11-11 18:41:47 +00:00 · eaedc1d727
commit eaedc1d727
parent c82af4a59c
7 changed files with 77 additions and 582 deletions
--- a/README.chromium
+++ b/README.chromium
@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 69
+Version: 77
 License: BSD
 License File: LICENSE
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@ -1149,11 +1149,6 @@ int I420ToARGB(const uint8* src_y, int src_stride_y,
      (width % 2 == 0)) {
    FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_SSE2;
  } else
 #endif
 #if defined(HAS_FASTCONVERTYUVTOARGBROW_MMX)
  if (width % 2 == 0) {
    FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_MMX;
  } else
 #endif
  {
    FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_C;
@ -1167,8 +1162,6 @@ int I420ToARGB(const uint8* src_y, int src_stride_y,
      src_v += src_stride_v;
    }
  }
  // MMX used for FastConvertYUVToARGBRow requires an emms instruction.
  EMMS();
  return 0;
 }
@ -1201,11 +1194,6 @@ int I420ToBGRA(const uint8* src_y, int src_stride_y,
      (width % 2 == 0)) {
    FastConvertYUVToBGRARow = FastConvertYUVToBGRARow_SSE2;
  } else
 #endif
 #if defined(HAS_FASTCONVERTYUVTOBGRAROW_MMX)
  if (width % 2 == 0) {
    FastConvertYUVToBGRARow = FastConvertYUVToBGRARow_MMX;
  } else
 #endif
  {
    FastConvertYUVToBGRARow = FastConvertYUVToBGRARow_C;
@ -1219,7 +1207,6 @@ int I420ToBGRA(const uint8* src_y, int src_stride_y,
      src_v += src_stride_v;
    }
  }
  EMMS();
  return 0;
 }
@ -1252,11 +1239,6 @@ int I420ToABGR(const uint8* src_y, int src_stride_y,
      (width % 2 == 0)) {
    FastConvertYUVToABGRRow = FastConvertYUVToABGRRow_SSE2;
  } else
 #endif
 #if defined(HAS_FASTCONVERTYUVTOABGRROW_MMX)
  if (width % 2 == 0) {
    FastConvertYUVToABGRRow = FastConvertYUVToABGRRow_MMX;
  } else
 #endif
  {
    FastConvertYUVToABGRRow = FastConvertYUVToABGRRow_C;
@ -1270,7 +1252,6 @@ int I420ToABGR(const uint8* src_y, int src_stride_y,
      src_v += src_stride_v;
    }
  }
  EMMS();
  return 0;
 }
@ -1303,11 +1284,6 @@ int I422ToARGB(const uint8* src_y, int src_stride_y,
      (width % 2 == 0)) {
    FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_SSE2;
  } else
 #endif
 #if defined(HAS_FASTCONVERTYUVTOARGBROW_MMX)
  if (width % 2 == 0) {
    FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_MMX;
  } else
 #endif
  {
    FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_C;
@ -1319,8 +1295,6 @@ int I422ToARGB(const uint8* src_y, int src_stride_y,
    src_u += src_stride_u;
    src_v += src_stride_v;
  }
  // MMX used for FastConvertYUVToARGBRow requires an emms instruction.
  EMMS();
  return 0;
 }
@ -1353,13 +1327,9 @@ int I444ToARGB(const uint8* src_y, int src_stride_y,
    FastConvertYUV444ToARGBRow = FastConvertYUV444ToARGBRow_SSE2;
  } else
 #endif
 #if defined(HAS_FASTCONVERTYUVTOARGBROW_MMX)
    FastConvertYUV444ToARGBRow = FastConvertYUV444ToARGBRow_MMX;
 #else
  {
    FastConvertYUV444ToARGBRow = FastConvertYUV444ToARGBRow_C;
  }
 #endif
  for (int y = 0; y < height; ++y) {
    FastConvertYUV444ToARGBRow(src_y, src_u, src_v, dst_argb, width);
    dst_argb += dst_stride_argb;
@ -1367,8 +1337,6 @@ int I444ToARGB(const uint8* src_y, int src_stride_y,
    src_u += src_stride_u;
    src_v += src_stride_v;
  }
  // MMX used for FastConvertYUVToARGBRow requires an emms instruction.
  EMMS();
  return 0;
 }
@ -1391,11 +1359,6 @@ int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
      IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
    FastConvertYToARGBRow = FastConvertYToARGBRow_SSE2;
  } else
 #endif
 #if defined(HAS_FASTCONVERTYUVTOARGBROW_MMX)
  if (width % 2 == 0) {
    FastConvertYToARGBRow = FastConvertYToARGBRow_MMX;
  } else
 #endif
  {
    FastConvertYToARGBRow = FastConvertYToARGBRow_C;
@ -1405,8 +1368,6 @@ int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
    dst_argb += dst_stride_argb;
    src_y += src_stride_y;
  }
  // MMX used for FastConvertYUVToARGBRow requires an emms instruction.
  EMMS();
  return 0;
 }
--- a/source/rotate.cc
+++ b/source/rotate.cc
@ -13,21 +13,19 @@
 #include "libyuv/cpu_id.h"
 #include "libyuv/planar_functions.h"
 #include "rotate_priv.h"
 #include "row.h"
 namespace libyuv {
-#if (defined(WIN32) || defined(__x86_64__) || defined(__i386__)) \
+#if (defined(WIN32) || defined(__x86_64__) || defined(__i386__)) && \
-    && !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
+    !defined(__APPLE__) && \
-#if defined(_MSC_VER)
+    !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
-#define TALIGN16(t, var) static __declspec(align(16)) t _ ## var
+
 #else
 #define TALIGN16(t, var) t var __attribute__((aligned(16)))
 #endif
 // Shuffle table for reversing the bytes.
-extern "C" TALIGN16(const uint8, kShuffleReverse[16]) =
+static const uvec8 kShuffleReverse =
  { 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u };
 // Shuffle table for reversing the bytes of UV channels.
-extern "C" TALIGN16(const uint8, kShuffleReverseUV[16]) =
+static const uvec8 kShuffleReverseUV =
  { 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u };
 #endif
@ -73,7 +71,7 @@ __asm {
    mov       edx, [esp + 12 + 12]  // dst
    mov       esi, [esp + 12 + 16]  // dst_stride
    mov       ecx, [esp + 12 + 20]  // width
- convertloop :
+ convertloop:
    // Read in the data from the source pointer.
    // First round of bit swap.
    movq      xmm0, qword ptr [eax]
@ -172,7 +170,7 @@ __asm {
    and       esp, ~15
    mov       [esp + 16], ecx
    mov       ecx, [ecx + 16 + 28]  // w
- convertloop :
+ convertloop:
    // Read in the data from the source pointer.
    // First round of bit swap.
    movdqa    xmm0, [eax]
@ -863,9 +861,9 @@ __asm {
    mov       eax, [esp + 4]   // src
    mov       edx, [esp + 8]   // dst
    mov       ecx, [esp + 12]  // width
-    movdqa    xmm5, _kShuffleReverse
+    movdqa    xmm5, kShuffleReverse
    lea       eax, [eax + ecx - 16]
- convertloop :
+ convertloop:
    movdqa    xmm0, [eax]
    lea       eax, [eax - 16]
    pshufb    xmm0, xmm5
@ -878,12 +876,16 @@ __asm {
 }
 #elif (defined(__i386__) || defined(__x86_64__)) && \
    !defined(__APPLE__) && \
    !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
 #define HAS_REVERSE_LINE_SSSE3
 static void ReverseLine_SSSE3(const uint8* src, uint8* dst, int width) {
  intptr_t temp_width = static_cast<intptr_t>(width);
  asm volatile (
-  "movdqa     (%3),%%xmm5                      \n"
+  "movdqa     %0,%%xmm5                        \n"
  :: "m"(kShuffleReverse)
  );
  asm volatile (
  "lea        -0x10(%0,%2,1),%0                \n"
 "1:                                            \n"
  "movdqa     (%0),%%xmm0                      \n"
@ -896,12 +898,12 @@ static void ReverseLine_SSSE3(const uint8* src, uint8* dst, int width) {
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(temp_width)  // %2
-  : "r"(kShuffleReverse)  // %3
+  :
  : "memory", "cc"
 #if defined(__SSE2__)
    , "xmm0", "xmm5"
 #endif
-);
+  );
 }
 #endif
@ -1066,10 +1068,10 @@ __asm {
    mov       edx, [esp + 4 + 8]   // dst_a
    mov       edi, [esp + 4 + 12]  // dst_b
    mov       ecx, [esp + 4 + 16]  // width
-    movdqa    xmm5, _kShuffleReverseUV
+    movdqa    xmm5, kShuffleReverseUV
    lea       eax, [eax + ecx * 2 - 16]
- convertloop :
+ convertloop:
    movdqa    xmm0, [eax]
    lea       eax, [eax - 16]
    pshufb    xmm0, xmm5
@ -1085,6 +1087,7 @@ __asm {
 }
 #elif (defined(__i386__) || defined(__x86_64__)) && \
    !defined(__APPLE__) && \
    !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
 #define HAS_REVERSE_LINE_UV_SSSE3
 void ReverseLineUV_SSSE3(const uint8* src,
@ -1092,28 +1095,31 @@ void ReverseLineUV_SSSE3(const uint8* src,
                         int width) {
  intptr_t temp_width = static_cast<intptr_t>(width);
  asm volatile (
-  "movdqa     (%4),%%xmm5                      \n"
+  "movdqa     %0,%%xmm5                        \n"
-  "lea        -0x10(%0,%3,2),%0                \n"
+  :: "m"(kShuffleReverseUV)
  );
  asm volatile (
  "lea        -16(%0,%3,2),%0                  \n"
 "1:                                            \n"
  "movdqa     (%0),%%xmm0                      \n"
-  "lea        -0x10(%0),%0                     \n"
+  "lea        -16(%0),%0                       \n"
  "pshufb     %%xmm5,%%xmm0                    \n"
  "movlpd     %%xmm0,(%1)                      \n"
-  "lea        0x8(%1),%1                       \n"
+  "lea        8(%1),%1                         \n"
  "movhpd     %%xmm0,(%2)                      \n"
-  "lea        0x8(%2),%2                       \n"
+  "lea        8(%2),%2                         \n"
-  "sub        $0x8,%3                          \n"
+  "sub        $8,%3                            \n"
  "ja         1b                               \n"
  : "+r"(src),      // %0
    "+r"(dst_a),    // %1
    "+r"(dst_b),    // %2
    "+r"(temp_width)  // %3
-  : "r"(kShuffleReverseUV)  // %4
+  :
  : "memory", "cc"
 #if defined(__SSE2__)
    , "xmm0", "xmm5"
 #endif
-);
+  );
 }
 #endif
--- a/source/row.h
+++ b/source/row.h
@ -51,15 +51,6 @@
 #define HAS_FASTCONVERTYTOARGBROW_SSE2
 #endif
 // The following are available on Windows and GCC 32 bit
 #if (defined(WIN32) || \
    defined(__i386__)) && \
    !defined(LIBYUV_DISABLE_ASM)
 #define HAS_FASTCONVERTYUVTOARGBROW_MMX
 #define HAS_FASTCONVERTYUVTOBGRAROW_MMX
 #define HAS_FASTCONVERTYUVTOABGRROW_MMX
 #endif
 // The following are available on Windows
 #if defined(WIN32) && \
    !defined(LIBYUV_DISABLE_ASM)
@ -128,12 +119,14 @@ void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix);
 #if defined(_MSC_VER)
 #define SIMD_ALIGNED(var) __declspec(align(16)) var
-#define TALIGN16(t, var) static __declspec(align(16)) t _ ## var
+typedef __declspec(align(16)) signed char vec8[16];
 typedef __declspec(align(16)) unsigned char uvec8[16];
 typedef __declspec(align(16)) signed short vec16[8];
 #else // __GNUC__
 #define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
 #define TALIGN16(t, var) t var __attribute__((aligned(16)))
 typedef signed char __attribute__((vector_size(16))) vec8;
 typedef unsigned char __attribute__((vector_size(16))) uvec8;
 typedef signed short __attribute__((vector_size(16))) vec16;
 #endif
 extern "C" SIMD_ALIGNED(const int16 kCoefficientsRgbY[768][4]);
@ -204,36 +197,6 @@ void FastConvertYToARGBRow_SSE2(const uint8* y_buf,
                                int width);
 #endif
 // MMX row converters. All five prototypes are declared only when
 // HAS_FASTCONVERTYUVTOARGBROW_MMX is defined.
 // The implementations use MMX registers and do not execute emms themselves;
 // the calling function is expected to invoke EMMS() once per image.
 #ifdef HAS_FASTCONVERTYUVTOARGBROW_MMX
 // Converts one row of Y plus subsampled U/V (one U/V pair per two Y samples)
 // to ARGB using the kCoefficientsRgbY table.
 void FastConvertYUVToARGBRow_MMX(const uint8* y_buf,
                                 const uint8* u_buf,
                                 const uint8* v_buf,
                                 uint8* rgb_buf,
                                 int width);
 // Same as above, using the BGRA coefficient table.
 void FastConvertYUVToBGRARow_MMX(const uint8* y_buf,
                                 const uint8* u_buf,
                                 const uint8* v_buf,
                                 uint8* rgb_buf,
                                 int width);
 // Same as above, using the ABGR coefficient table.
 void FastConvertYUVToABGRRow_MMX(const uint8* y_buf,
                                 const uint8* u_buf,
                                 const uint8* v_buf,
                                 uint8* rgb_buf,
                                 int width);
 // Converts one row with full-resolution chroma (one U/V pair per Y sample)
 // to ARGB.
 void FastConvertYUV444ToARGBRow_MMX(const uint8* y_buf,
                                    const uint8* u_buf,
                                    const uint8* v_buf,
                                    uint8* rgb_buf,
                                    int width);
 // Converts one row of Y only (no chroma inputs) to ARGB.
 void FastConvertYToARGBRow_MMX(const uint8* y_buf,
                               uint8* rgb_buf,
                               int width);
 #endif
 #ifdef HAS_FASTCONVERTYUVTOARGBROW_SSSE3
 void FastConvertYUVToARGBRow_SSSE3(const uint8* y_buf,
                                   const uint8* u_buf,
@ -268,42 +231,6 @@ void FastConvertYToARGBRow_SSE2(const uint8* y_buf,
 #endif
 // Method to force C version: predefine USE_MMX and/or USE_SSE2 as 0 (for
 // example by uncommenting the lines below) to override the auto-detection.
 //#define USE_MMX 0
 //#define USE_SSE2 0
 #if !defined(USE_MMX)
 // Windows, Mac and Linux use MMX
 // (enabled for 32-bit GCC targets and for any MSVC build).
 #if defined(__i386__) || defined(_MSC_VER)
 #define USE_MMX 1
 #else
 #define USE_MMX 0
 #endif
 #endif
 // USE_SSE2 defaults to 1 when the compiler targets SSE2, when building for
 // x86-64, or when MSVC is invoked with /arch:SSE2 (_M_IX86_FP == 2).
 #if !defined(USE_SSE2)
 #if defined(__SSE2__) || defined(ARCH_CPU_X86_64) || _M_IX86_FP==2
 #define USE_SSE2 1
 #else
 #define USE_SSE2 0
 #endif
 #endif
 // x64 uses MMX2 (SSE) so emms is not required.
 // Warning C4799: function has no EMMS instruction.
 // EMMS() is slow and should be called by the calling function once per image.
 // EMMS() expands to the emms instruction only for MMX-enabled, non-x64
 // builds; everywhere else it expands to nothing, so callers may use it
 // unconditionally.
 #if USE_MMX && !defined(ARCH_CPU_X86_64)
 #if defined(_MSC_VER)
 #define EMMS() __asm emms
 // The row functions intentionally omit emms, so silence C4799 for MSVC.
 #pragma warning(disable: 4799)
 #else
 #define EMMS() asm("emms")
 #endif
 #else
 #define EMMS()
 #endif
 }  // extern "C"
 #endif  // LIBYUV_SOURCE_ROW_H_
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@ -542,231 +542,6 @@ void FastConvertYToARGBRow_SSE2(const uint8* y_buf,  // rdi
 #endif
 );
 }
 #endif
 #ifdef HAS_FASTCONVERTYUVTOARGBROW_MMX
 // 32 bit mmx gcc version
 #ifdef OSX
 #define UNDERSCORE "_"
 #else
 #define UNDERSCORE ""
 #endif
 // 32-bit x86 MMX row converter (YUV with 2:1 horizontally subsampled chroma
 // to ARGB). The prototype only declares the symbol; the top-level asm block
 // below defines it by hand (the label gets a leading '_' on OSX/IOS).
 //
 // Each loop pass reads one U byte, one V byte and two Y bytes, adds the
 // matching 8-byte rows of kCoefficientsRgbY (Y rows at offset 0, U rows at
 // +2048, V rows at +4096) with signed saturation, shifts right by 6, packs
 // to unsigned bytes and stores 8 bytes (two pixels) to rgb_buf.
 // width is decremented by 2 per pass, so an even width is expected; callers
 // in planar_functions.cc check width % 2 == 0 before selecting this path.
 // No emms is executed here; the caller issues EMMS() after the last row.
 void FastConvertYUVToARGBRow_MMX(const uint8* y_buf,
                                 const uint8* u_buf,
                                 const uint8* v_buf,
                                 uint8* rgb_buf,
                                 int width);
  asm(
  ".text                                       \n"
 #if defined(OSX) || defined(IOS)
  ".globl _FastConvertYUVToARGBRow_MMX         \n"
 "_FastConvertYUVToARGBRow_MMX:                 \n"
 #else
  ".global FastConvertYUVToARGBRow_MMX         \n"
 "FastConvertYUVToARGBRow_MMX:                  \n"
 #endif
  // pusha saves all 8 general registers (0x20 bytes), so with the return
  // address on top the stack arguments start at 0x24(%esp).
  "pusha                                       \n"
  "mov    0x24(%esp),%edx                      \n"  // y_buf
  "mov    0x28(%esp),%edi                      \n"  // u_buf
  "mov    0x2c(%esp),%esi                      \n"  // v_buf
  "mov    0x30(%esp),%ebp                      \n"  // rgb_buf
  "mov    0x34(%esp),%ecx                      \n"  // width
 "1:                                            \n"
  // Fetch U and V and combine their table rows into mm0.
  "movzbl (%edi),%eax                          \n"
  "lea    1(%edi),%edi                         \n"
  "movzbl (%esi),%ebx                          \n"
  "lea    1(%esi),%esi                         \n"
  "movq   " UNDERSCORE "kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
  "movzbl (%edx),%eax                          \n"
  "paddsw " UNDERSCORE "kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"
  // Fetch two Y samples; both share the chroma contribution in mm0.
  "movzbl 0x1(%edx),%ebx                       \n"
  "movq   " UNDERSCORE "kCoefficientsRgbY(,%eax,8),%mm1\n"
  "lea    2(%edx),%edx                         \n"
  "movq   " UNDERSCORE "kCoefficientsRgbY(,%ebx,8),%mm2\n"
  "paddsw %mm0,%mm1                            \n"
  "paddsw %mm0,%mm2                            \n"
  // Drop the 6 fractional bits, then pack both pixels into 8 bytes.
  "psraw  $0x6,%mm1                            \n"
  "psraw  $0x6,%mm2                            \n"
  "packuswb %mm2,%mm1                          \n"
  "movq   %mm1,0x0(%ebp)                       \n"
  "lea    8(%ebp),%ebp                         \n"
  "sub    $0x2,%ecx                            \n"
  // Loop while the remaining width is still above zero (unsigned compare).
  "ja     1b                                   \n"
  "popa                                        \n"
  "ret                                         \n"
 );
 // 32-bit x86 MMX row converter producing BGRA. The prototype only declares
 // the symbol; the top-level asm block below defines it by hand (the label
 // gets a leading '_' on OSX/IOS).
 //
 // The instruction sequence matches the ARGB variant except that it indexes
 // kCoefficientsBgraY: per pass it reads one U, one V and two Y bytes, sums
 // the table rows (Y at offset 0, U at +2048, V at +4096) with signed
 // saturation, shifts right by 6, packs and stores 8 bytes (two pixels).
 // width is decremented by 2 per pass, so an even width is expected.
 // No emms is executed here; the caller issues EMMS() after the last row.
 void FastConvertYUVToBGRARow_MMX(const uint8* y_buf,
                                 const uint8* u_buf,
                                 const uint8* v_buf,
                                 uint8* rgb_buf,
                                 int width);
  asm(
  ".text                                       \n"
 #if defined(OSX) || defined(IOS)
  ".globl _FastConvertYUVToBGRARow_MMX         \n"
 "_FastConvertYUVToBGRARow_MMX:                 \n"
 #else
  ".global FastConvertYUVToBGRARow_MMX         \n"
 "FastConvertYUVToBGRARow_MMX:                  \n"
 #endif
  // pusha saves 0x20 bytes of registers; arguments start at 0x24(%esp).
  "pusha                                       \n"
  "mov    0x24(%esp),%edx                      \n"  // y_buf
  "mov    0x28(%esp),%edi                      \n"  // u_buf
  "mov    0x2c(%esp),%esi                      \n"  // v_buf
  "mov    0x30(%esp),%ebp                      \n"  // rgb_buf
  "mov    0x34(%esp),%ecx                      \n"  // width
 "1:                                            \n"
  // Chroma contribution (U row + V row) accumulates in mm0.
  "movzbl (%edi),%eax                          \n"
  "lea    1(%edi),%edi                         \n"
  "movzbl (%esi),%ebx                          \n"
  "lea    1(%esi),%esi                         \n"
  "movq   " UNDERSCORE "kCoefficientsBgraY+2048(,%eax,8),%mm0\n"
  "movzbl (%edx),%eax                          \n"
  "paddsw " UNDERSCORE "kCoefficientsBgraY+4096(,%ebx,8),%mm0\n"
  // Two Y samples share that chroma contribution.
  "movzbl 0x1(%edx),%ebx                       \n"
  "movq   " UNDERSCORE "kCoefficientsBgraY(,%eax,8),%mm1\n"
  "lea    2(%edx),%edx                         \n"
  "movq   " UNDERSCORE "kCoefficientsBgraY(,%ebx,8),%mm2\n"
  "paddsw %mm0,%mm1                            \n"
  "paddsw %mm0,%mm2                            \n"
  "psraw  $0x6,%mm1                            \n"
  "psraw  $0x6,%mm2                            \n"
  "packuswb %mm2,%mm1                          \n"
  "movq   %mm1,0x0(%ebp)                       \n"
  "lea    8(%ebp),%ebp                         \n"
  "sub    $0x2,%ecx                            \n"
  "ja     1b                                   \n"
  "popa                                        \n"
  "ret                                         \n"
 );
 // 32-bit x86 MMX row converter producing ABGR. The prototype only declares
 // the symbol; the top-level asm block below defines it by hand (the label
 // gets a leading '_' on OSX/IOS).
 //
 // The instruction sequence matches the ARGB variant except that it indexes
 // kCoefficientsAbgrY: per pass it reads one U, one V and two Y bytes, sums
 // the table rows (Y at offset 0, U at +2048, V at +4096) with signed
 // saturation, shifts right by 6, packs and stores 8 bytes (two pixels).
 // width is decremented by 2 per pass, so an even width is expected.
 // No emms is executed here; the caller issues EMMS() after the last row.
 void FastConvertYUVToABGRRow_MMX(const uint8* y_buf,
                                 const uint8* u_buf,
                                 const uint8* v_buf,
                                 uint8* rgb_buf,
                                 int width);
  asm(
  ".text                                       \n"
 #if defined(OSX) || defined(IOS)
  ".globl _FastConvertYUVToABGRRow_MMX         \n"
 "_FastConvertYUVToABGRRow_MMX:                 \n"
 #else
  ".global FastConvertYUVToABGRRow_MMX         \n"
 "FastConvertYUVToABGRRow_MMX:                  \n"
 #endif
  // pusha saves 0x20 bytes of registers; arguments start at 0x24(%esp).
  "pusha                                       \n"
  "mov    0x24(%esp),%edx                      \n"  // y_buf
  "mov    0x28(%esp),%edi                      \n"  // u_buf
  "mov    0x2c(%esp),%esi                      \n"  // v_buf
  "mov    0x30(%esp),%ebp                      \n"  // rgb_buf
  "mov    0x34(%esp),%ecx                      \n"  // width
 "1:                                            \n"
  // Chroma contribution (U row + V row) accumulates in mm0.
  "movzbl (%edi),%eax                          \n"
  "lea    1(%edi),%edi                         \n"
  "movzbl (%esi),%ebx                          \n"
  "lea    1(%esi),%esi                         \n"
  "movq   " UNDERSCORE "kCoefficientsAbgrY+2048(,%eax,8),%mm0\n"
  "movzbl (%edx),%eax                          \n"
  "paddsw " UNDERSCORE "kCoefficientsAbgrY+4096(,%ebx,8),%mm0\n"
  // Two Y samples share that chroma contribution.
  "movzbl 0x1(%edx),%ebx                       \n"
  "movq   " UNDERSCORE "kCoefficientsAbgrY(,%eax,8),%mm1\n"
  "lea    2(%edx),%edx                         \n"
  "movq   " UNDERSCORE "kCoefficientsAbgrY(,%ebx,8),%mm2\n"
  "paddsw %mm0,%mm1                            \n"
  "paddsw %mm0,%mm2                            \n"
  "psraw  $0x6,%mm1                            \n"
  "psraw  $0x6,%mm2                            \n"
  "packuswb %mm2,%mm1                          \n"
  "movq   %mm1,0x0(%ebp)                       \n"
  "lea    8(%ebp),%ebp                         \n"
  "sub    $0x2,%ecx                            \n"
  "ja     1b                                   \n"
  "popa                                        \n"
  "ret                                         \n"
 );
 // 32-bit x86 MMX row converter for full-resolution chroma (one U and one V
 // byte per Y byte) to ARGB. The prototype only declares the symbol; the
 // top-level asm block below defines it by hand (the label gets a leading
 // '_' on OSX/IOS).
 //
 // Each loop pass handles a single pixel: the kCoefficientsRgbY rows for U
 // (+2048), V (+4096) and Y (offset 0) are summed with signed saturation,
 // shifted right by 6, packed to unsigned bytes, and the low 4 bytes are
 // stored to rgb_buf. width is decremented by 1 per pass.
 // No emms is executed here; the caller issues EMMS() after the last row.
 void FastConvertYUV444ToARGBRow_MMX(const uint8* y_buf,
                                    const uint8* u_buf,
                                    const uint8* v_buf,
                                    uint8* rgb_buf,
                                    int width);
  asm(
  ".text                                       \n"
 #if defined(OSX) || defined(IOS)
  ".globl _FastConvertYUV444ToARGBRow_MMX      \n"
 "_FastConvertYUV444ToARGBRow_MMX:              \n"
 #else
  ".global FastConvertYUV444ToARGBRow_MMX      \n"
 "FastConvertYUV444ToARGBRow_MMX:               \n"
 #endif
  // pusha saves 0x20 bytes of registers; arguments start at 0x24(%esp).
  "pusha                                       \n"
  "mov    0x24(%esp),%edx                      \n"  // y_buf
  "mov    0x28(%esp),%edi                      \n"  // u_buf
  "mov    0x2c(%esp),%esi                      \n"  // v_buf
  "mov    0x30(%esp),%ebp                      \n"  // rgb_buf
  "mov    0x34(%esp),%ecx                      \n"  // width
 "1:                                            \n"
  "movzbl (%edi),%eax                          \n"
  "lea    1(%edi),%edi                         \n"
  "movzbl (%esi),%ebx                          \n"
  "lea    1(%esi),%esi                         \n"
  "movq   " UNDERSCORE "kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
  "movzbl (%edx),%eax                          \n"
  "paddsw " UNDERSCORE "kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"
  "lea    1(%edx),%edx                         \n"
  "paddsw " UNDERSCORE "kCoefficientsRgbY(,%eax,8),%mm0\n"
  "psraw  $0x6,%mm0                            \n"
  "packuswb %mm0,%mm0                          \n"
  // movd writes only the low 32 bits: one 4-byte pixel.
  "movd   %mm0,0x0(%ebp)                       \n"
  "lea    4(%ebp),%ebp                         \n"
  "sub    $0x1,%ecx                            \n"
  "ja     1b                                   \n"
  "popa                                        \n"
  "ret                                         \n"
 );
 // 32-bit x86 MMX row converter for luma-only input to ARGB. The prototype
 // only declares the symbol; the top-level asm block below defines it by hand
 // (the label gets a leading '_' on OSX/IOS).
 //
 // Each loop pass reads two Y bytes, loads their 8-byte kCoefficientsRgbY
 // rows, shifts each right by 6, packs both to unsigned bytes and stores
 // 8 bytes (two pixels) to rgb_buf. width is decremented by 2 per pass, so an
 // even width is expected.
 // No emms is executed here; the caller issues EMMS() after the last row.
 void FastConvertYToARGBRow_MMX(const uint8* y_buf,
                               uint8* rgb_buf,
                               int width);
  asm(
  ".text                                       \n"
 #if defined(OSX) || defined(IOS)
  ".globl _FastConvertYToARGBRow_MMX           \n"
 "_FastConvertYToARGBRow_MMX:                   \n"
 #else
  ".global FastConvertYToARGBRow_MMX           \n"
 "FastConvertYToARGBRow_MMX:                    \n"
 #endif
  // Only ebx is saved (4 bytes), so the arguments start at 0x8(%esp).
  "push   %ebx                                 \n"
  "mov    0x8(%esp),%eax                       \n"  // y_buf
  "mov    0xc(%esp),%edx                       \n"  // rgb_buf
  "mov    0x10(%esp),%ecx                      \n"  // width
 "1:                                            \n"
  "movzbl (%eax),%ebx                          \n"
  "movq   " UNDERSCORE "kCoefficientsRgbY(,%ebx,8),%mm0\n"
  "psraw  $0x6,%mm0                            \n"
  "movzbl 0x1(%eax),%ebx                       \n"
  "movq   " UNDERSCORE "kCoefficientsRgbY(,%ebx,8),%mm1\n"
  "psraw  $0x6,%mm1                            \n"
  "packuswb %mm1,%mm0                          \n"
  "lea    0x2(%eax),%eax                       \n"
  "movq   %mm0,(%edx)                          \n"
  "lea    0x8(%edx),%edx                       \n"
  "sub    $0x2,%ecx                            \n"
  "ja     1b                                   \n"
  "pop    %ebx                                 \n"
  "ret                                         \n"
 );
 #endif
 #ifdef HAS_ARGBTOYROW_SSSE3
--- a/source/row_win.cc
+++ b/source/row_win.cc
@ -15,71 +15,71 @@ extern "C" {
 #ifdef HAS_ARGBTOYROW_SSSE3
 // Constant multiplication table for converting ARGB to I400.
-SIMD_ALIGNED(const int8 kARGBToY[16]) = {
+static const vec8 kARGBToY = {
  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
 };
-SIMD_ALIGNED(const int8 kARGBToU[16]) = {
+static const vec8 kARGBToU = {
  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
 };
-SIMD_ALIGNED(const int8 kARGBToV[16]) = {
+static const vec8 kARGBToV = {
  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
 };
 // Constants for BGRA
-SIMD_ALIGNED(const int8 kBGRAToY[16]) = {
+static const vec8 kBGRAToY = {
  0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
 };
-SIMD_ALIGNED(const int8 kBGRAToU[16]) = {
+static const vec8 kBGRAToU = {
  0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
 };
-SIMD_ALIGNED(const int8 kBGRAToV[16]) = {
+static const vec8 kBGRAToV = {
  0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
 };
 // Constants for ABGR
-SIMD_ALIGNED(const int8 kABGRToY[16]) = {
+static const vec8 kABGRToY = {
  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
 };
-SIMD_ALIGNED(const int8 kABGRToU[16]) = {
+static const vec8 kABGRToU = {
  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
 };
-SIMD_ALIGNED(const int8 kABGRToV[16]) = {
+static const vec8 kABGRToV = {
  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
 };
-SIMD_ALIGNED(const uint8 kAddY16[16]) = {
+static const uvec8 kAddY16 = {
  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
 };
-SIMD_ALIGNED(const uint8 kAddUV128[16]) = {
+static const uvec8 kAddUV128 = {
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
 };
 // Shuffle table for converting BG24 to ARGB.
-SIMD_ALIGNED(const uint8 kShuffleMaskBG24ToARGB[16]) = {
+static const uvec8 kShuffleMaskBG24ToARGB = {
  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
 };
 // Shuffle table for converting RAW to ARGB.
-SIMD_ALIGNED(const uint8 kShuffleMaskRAWToARGB[16]) = {
+static const uvec8 kShuffleMaskRAWToARGB = {
  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
 };
 // Shuffle table for converting ABGR to ARGB.
-SIMD_ALIGNED(const uint8 kShuffleMaskABGRToARGB[16]) = {
+static const uvec8 kShuffleMaskABGRToARGB = {
  2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
 };
 // Shuffle table for converting BGRA to ARGB.
-SIMD_ALIGNED(const uint8 kShuffleMaskBGRAToARGB[16]) = {
+static const uvec8 kShuffleMaskBGRAToARGB = {
  3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
 };
@ -118,7 +118,7 @@ __asm {
    mov       ecx, [esp + 12]  // pix
    movdqa    xmm5, kShuffleMaskABGRToARGB
- convertloop :
+ convertloop:
    movdqa    xmm0, [eax]
    lea       eax, [eax + 16]
    pshufb    xmm0, xmm5
@ -138,7 +138,7 @@ __asm {
    mov       ecx, [esp + 12]  // pix
    movdqa    xmm5, kShuffleMaskBGRAToARGB
- convertloop :
+ convertloop:
    movdqa    xmm0, [eax]
    lea       eax, [eax + 16]
    pshufb    xmm0, xmm5
@ -160,7 +160,7 @@ __asm {
    pslld     xmm5, 24
    movdqa    xmm4, kShuffleMaskBG24ToARGB
- convertloop :
+ convertloop:
    movdqa    xmm0, [eax]
    movdqa    xmm1, [eax + 16]
    movdqa    xmm3, [eax + 32]
@ -199,7 +199,7 @@ __asm {
    pslld     xmm5, 24
    movdqa    xmm4, kShuffleMaskRAWToARGB
- convertloop :
+ convertloop:
    movdqa    xmm0, [eax]
    movdqa    xmm1, [eax + 16]
    movdqa    xmm3, [eax + 32]
@ -237,7 +237,7 @@ __asm {
    movdqa     xmm5, kAddY16
    movdqa     xmm4, kARGBToY
- convertloop :
+ convertloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
@ -270,7 +270,7 @@ __asm {
    movdqa     xmm5, kAddY16
    movdqa     xmm4, kBGRAToY
- convertloop :
+ convertloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
@ -303,7 +303,7 @@ __asm {
    movdqa     xmm5, kAddY16
    movdqa     xmm4, kABGRToY
- convertloop :
+ convertloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
@ -343,7 +343,7 @@ __asm {
    movdqa     xmm5, kAddUV128
    sub        edi, edx             // stride from u to v
- convertloop :
+ convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
@ -407,7 +407,7 @@ __asm {
    movdqa     xmm5, kAddUV128
    sub        edi, edx             // stride from u to v
- convertloop :
+ convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
@ -471,7 +471,7 @@ __asm {
    movdqa     xmm5, kAddUV128
    sub        edi, edx             // stride from u to v
- convertloop :
+ convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
@ -519,182 +519,6 @@ __asm {
  }
 }
 // Inner loop shared by the MSVC MMX row converters. TABLE names the
 // coefficient table to index (Y rows at offset 0, U rows at +2048, V rows at
 // +4096, 8 bytes per row).
 // Register contract on entry, as set up by the functions that expand it:
 //   edx = y_buf, edi = u_buf, esi = v_buf, ebp = rgb_buf, ecx = width.
 // eax and ebx are used as scratch index registers.
 // Each pass consumes one U byte, one V byte and two Y bytes, and stores
 // 8 bytes (two pixels); ecx is decremented by 2 until it is no longer above
 // zero. The macro defines the label `convertloop`, so it can be expanded at
 // most once per function. No emms is executed.
 #define YUVTORGB_MMX(TABLE) __asm {                                            \
    __asm convertloop :                                                        \
    __asm movzx     eax, byte ptr [edi]                                        \
    __asm lea       edi, [edi + 1]                                             \
    __asm movzx     ebx, byte ptr [esi]                                        \
    __asm lea       esi, [esi + 1]                                             \
    __asm movq      mm0, [TABLE + 2048 + 8 * eax]                              \
    __asm movzx     eax, byte ptr [edx]                                        \
    __asm paddsw    mm0, [TABLE + 4096 + 8 * ebx]                              \
    __asm movzx     ebx, byte ptr [edx + 1]                                    \
    __asm movq      mm1, [TABLE + 8 * eax]                                     \
    __asm lea       edx, [edx + 2]                                             \
    __asm movq      mm2, [TABLE + 8 * ebx]                                     \
    __asm paddsw    mm1, mm0                                                   \
    __asm paddsw    mm2, mm0                                                   \
    __asm psraw     mm1, 6                                                     \
    __asm psraw     mm2, 6                                                     \
    __asm packuswb  mm1, mm2                                                   \
    __asm movq      [ebp], mm1                                                 \
    __asm lea       ebp, [ebp + 8]                                             \
    __asm sub       ecx, 2                                                     \
    __asm ja        convertloop                                                \
  }
 // MSVC MMX row converter: YUV (one U/V pair per two Y samples) to ARGB via
 // the kCoefficientsRgbY table. Declared naked, so the body manages the stack
 // itself: four registers are pushed (16 bytes) and the arguments are then
 // read relative to esp. The conversion loop is the YUVTORGB_MMX macro, which
 // handles two pixels per pass. No emms is executed here; the caller is
 // expected to issue EMMS() after the last row.
 __declspec(naked)
 void FastConvertYUVToARGBRow_MMX(const uint8* y_buf,
                                 const uint8* u_buf,
                                 const uint8* v_buf,
                                 uint8* rgb_buf,
                                 int width) {
  __asm {
    push      ebx
    push      esi
    push      edi
    push      ebp
    mov       edx, [esp + 16 + 4]  // y_buf
    mov       edi, [esp + 16 + 8]  // u_buf
    mov       esi, [esp + 16 + 12]  // v_buf
    mov       ebp, [esp + 16 + 16]  // rgb_buf
    mov       ecx, [esp + 16 + 20]  // width
    YUVTORGB_MMX(kCoefficientsRgbY)
    // Restore the saved registers in reverse push order.
    pop       ebp
    pop       edi
    pop       esi
    pop       ebx
    ret
  }
 }
 // MSVC MMX row converter: YUV (one U/V pair per two Y samples) to BGRA via
 // the kCoefficientsBgraY table. Declared naked, so the body manages the
 // stack itself: four registers are pushed (16 bytes) and the arguments are
 // then read relative to esp. The conversion loop is the YUVTORGB_MMX macro,
 // which handles two pixels per pass. No emms is executed here; the caller is
 // expected to issue EMMS() after the last row.
 __declspec(naked)
 void FastConvertYUVToBGRARow_MMX(const uint8* y_buf,
                                 const uint8* u_buf,
                                 const uint8* v_buf,
                                 uint8* rgb_buf,
                                 int width) {
  __asm {
    push      ebx
    push      esi
    push      edi
    push      ebp
    mov       edx, [esp + 16 + 4]  // y_buf
    mov       edi, [esp + 16 + 8]  // u_buf
    mov       esi, [esp + 16 + 12]  // v_buf
    mov       ebp, [esp + 16 + 16]  // rgb_buf
    mov       ecx, [esp + 16 + 20]  // width
    YUVTORGB_MMX(kCoefficientsBgraY)
    // Restore the saved registers in reverse push order.
    pop       ebp
    pop       edi
    pop       esi
    pop       ebx
    ret
  }
 }
 // MSVC MMX row converter: YUV (one U/V pair per two Y samples) to ABGR via
 // the kCoefficientsAbgrY table. Declared naked, so the body manages the
 // stack itself: four registers are pushed (16 bytes) and the arguments are
 // then read relative to esp. The conversion loop is the YUVTORGB_MMX macro,
 // which handles two pixels per pass. No emms is executed here; the caller is
 // expected to issue EMMS() after the last row.
 __declspec(naked)
 void FastConvertYUVToABGRRow_MMX(const uint8* y_buf,
                                 const uint8* u_buf,
                                 const uint8* v_buf,
                                 uint8* rgb_buf,
                                 int width) {
  __asm {
    push      ebx
    push      esi
    push      edi
    push      ebp
    mov       edx, [esp + 16 + 4]  // y_buf
    mov       edi, [esp + 16 + 8]  // u_buf
    mov       esi, [esp + 16 + 12]  // v_buf
    mov       ebp, [esp + 16 + 16]  // rgb_buf
    mov       ecx, [esp + 16 + 20]  // width
    YUVTORGB_MMX(kCoefficientsAbgrY)
    // Restore the saved registers in reverse push order.
    pop       ebp
    pop       edi
    pop       esi
    pop       ebx
    ret
  }
 }
 // MSVC MMX row converter for full-resolution chroma (one U and one V byte
 // per Y byte) to ARGB. Declared naked, so the body manages the stack itself:
 // four registers are pushed (16 bytes) and the arguments are then read
 // relative to esp.
 // Each loop pass handles a single pixel: the kCoefficientsRgbY rows for U
 // (+2048), V (+4096) and Y (offset 0) are summed with signed saturation,
 // shifted right by 6, packed to unsigned bytes, and 4 bytes are stored.
 // No emms is executed here; the caller is expected to issue EMMS() after the
 // last row.
 __declspec(naked)
 void FastConvertYUV444ToARGBRow_MMX(const uint8* y_buf,
                                    const uint8* u_buf,
                                    const uint8* v_buf,
                                    uint8* rgb_buf,
                                    int width) {
  __asm {
    push      ebx
    push      esi
    push      edi
    push      ebp
    mov       edx, [esp + 16 + 4]  // y_buf
    mov       edi, [esp + 16 + 8]  // u_buf
    mov       esi, [esp + 16 + 12]  // v_buf
    mov       ebp, [esp + 16 + 16]  // rgb_buf
    mov       ecx, [esp + 16 + 20]  // width
 convertloop :
    movzx     eax, byte ptr [edi]
    lea       edi, [edi + 1]
    movzx     ebx, byte ptr [esi]
    lea       esi, [esi + 1]
    movq      mm0, [kCoefficientsRgbY + 2048 + 8 * eax]
    movzx     eax, byte ptr [edx]
    paddsw    mm0, [kCoefficientsRgbY + 4096 + 8 * ebx]
    lea       edx, [edx + 1]
    paddsw    mm0, [kCoefficientsRgbY + 8 * eax]
    psraw     mm0, 6
    packuswb  mm0, mm0
    // movd writes only the low 32 bits: one 4-byte pixel.
    movd      [ebp], mm0
    lea       ebp, [ebp + 4]
    sub       ecx, 1
    ja        convertloop
    // Restore the saved registers in reverse push order.
    pop       ebp
    pop       edi
    pop       esi
    pop       ebx
    ret
  }
 }
 // MSVC MMX row converter for luma-only input to ARGB. Declared naked, so the
 // body manages the stack itself: only ebx is pushed (4 bytes) and the
 // arguments are then read relative to esp.
 // Each loop pass reads two Y bytes, loads their 8-byte kCoefficientsRgbY
 // rows, shifts each right by 6, packs both to unsigned bytes and stores
 // 8 bytes (two pixels). width is decremented by 2 per pass, so an even width
 // is expected.
 // No emms is executed here; the caller is expected to issue EMMS() after the
 // last row.
 __declspec(naked)
 void FastConvertYToARGBRow_MMX(const uint8* y_buf,
                               uint8* rgb_buf,
                               int width) {
  __asm {
    push      ebx
    mov       eax, [esp + 4 + 4]   // Y
    mov       edx, [esp + 4 + 8]   // rgb
    mov       ecx, [esp + 4 + 12]  // width
 convertloop :
    // First Y sample -> mm0.
    movzx     ebx, byte ptr [eax]
    movq      mm0, [kCoefficientsRgbY + 8 * ebx]
    psraw     mm0, 6
    // Second Y sample -> mm1.
    movzx     ebx, byte ptr [eax + 1]
    movq      mm1, [kCoefficientsRgbY + 8 * ebx]
    psraw     mm1, 6
    packuswb  mm0, mm1
    lea       eax, [eax + 2]
    movq      [edx], mm0
    lea       edx, [edx + 8]
    sub       ecx, 2
    ja        convertloop
    pop       ebx
    ret
  }
 }
 #ifdef HAS_FASTCONVERTYUVTOARGBROW_SSSE3
 #define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */
@ -712,35 +536,35 @@ void FastConvertYToARGBRow_MMX(const uint8* y_buf,
 #define BG UG * 128 + VG * 128
 #define BR UR * 128 + VR * 128
-SIMD_ALIGNED(const int8 kUVToB[16]) = {
+static const vec8 kUVToB = {
  UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
 };
-SIMD_ALIGNED(const int8 kUVToR[16]) = {
+static const vec8 kUVToR = {
  UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
 };
-SIMD_ALIGNED(const int8 kUVToG[16]) = {
+static const vec8 kUVToG = {
  UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
 };
-SIMD_ALIGNED(const int16 kYToRgb[8]) = {
+static const vec16 kYToRgb = {
  YG, YG, YG, YG, YG, YG, YG, YG
 };
-SIMD_ALIGNED(const int16 kYSub16[8]) = {
+static const vec16 kYSub16 = {
  16, 16, 16, 16, 16, 16, 16, 16
 };
-SIMD_ALIGNED(const int16 kUVBiasB[8]) = {
+static const vec16 kUVBiasB = {
  BB, BB, BB, BB, BB, BB, BB, BB
 };
-SIMD_ALIGNED(const int16 kUVBiasG[8]) = {
+static const vec16 kUVBiasG = {
  BG, BG, BG, BG, BG, BG, BG, BG
 };
-SIMD_ALIGNED(const int16 kUVBiasR[8]) = {
+static const vec16 kUVBiasR = {
  BR, BR, BR, BR, BR, BR, BR, BR
 };
@ -794,7 +618,7 @@ void FastConvertYUVToARGBRow_SSSE3(const uint8* y_buf,
    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
    pxor       xmm4, xmm4
- convertloop :
+ convertloop:
    YUVTORGB_SSSE3
    // Step 3: Weave into ARGB
@ -833,7 +657,7 @@ void FastConvertYUVToBGRARow_SSSE3(const uint8* y_buf,
    sub        edi, esi
    pxor       xmm4, xmm4
- convertloop :
+ convertloop:
    YUVTORGB_SSSE3
    // Step 3: Weave into BGRA
@ -874,7 +698,7 @@ void FastConvertYUVToABGRRow_SSSE3(const uint8* y_buf,
    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
    pxor       xmm4, xmm4
- convertloop :
+ convertloop:
    YUVTORGB_SSSE3
    // Step 3: Weave into ARGB
@ -914,7 +738,7 @@ void FastConvertYUV444ToARGBRow_SSSE3(const uint8* y_buf,
    pcmpeqb    xmm5, xmm5            // generate 0xffffffff for alpha
    pxor       xmm4, xmm4
- convertloop :
+ convertloop:
    // Step 1: Find 4 UV contributions to 4 R,G,B values
    movd       xmm0, [esi]          // U
    movd       xmm1, [esi + edi]    // V
@ -978,7 +802,7 @@ void FastConvertYToARGBRow_SSE2(const uint8* y_buf,
    movdqa     xmm3, kYSub16
    movdqa     xmm2, kYToRgb
- convertloop :
+ convertloop:
    // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
    movq       xmm0, qword ptr [eax]
    lea        eax, [eax + 8]
--- a/source/scale.cc
+++ b/source/scale.cc
@ -14,6 +14,7 @@
 #include <string.h>
 #include "libyuv/cpu_id.h"
 #include "row.h"
 #if defined(_MSC_VER)
 #define ALIGN16(var) __declspec(align(16)) var
@ -21,6 +22,7 @@
 #define ALIGN16(var) var __attribute__((aligned(16)))
 #endif
 // Note: A Neon reference manual
 // http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0204j/CJAJIIGG.html
 // Note: Some SSE2 reference manuals