diff --git a/README.chromium b/README.chromium
index 980a40c78..a78bd81f9 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 745
+Version: 746
 License: BSD
 License File: LICENSE
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 92c828846..ad6c50761 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -38,8 +38,17 @@ extern "C" {
 // The following are available on all x86 platforms, including NaCL:
 #if !defined(LIBYUV_DISABLE_X86) && \
     (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
-#define HAS_ARGBBLENDROW_SSSE3
+// Effects:
+#define HAS_ARGBADDROW_SSE2
 #define HAS_ARGBATTENUATEROW_SSSE3
+#define HAS_ARGBBLENDROW_SSSE3
+#define HAS_ARGBMULTIPLYROW_SSE2
+#define HAS_ARGBSHADEROW_SSE2
+#define HAS_ARGBSUBTRACTROW_SSE2
+
+// Conversions:
+#define HAS_FIXEDDIV_X86
+
 #endif

 // The following are available on all x86 platforms except NaCL x64:
@@ -47,7 +56,7 @@ extern "C" {
     (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \
     !(defined(__native_client__) && defined(__x86_64__))

-// Conversions.
+// Conversions:
 #define HAS_ABGRTOUVROW_SSSE3
 #define HAS_ABGRTOYROW_SSSE3
 #define HAS_ARGB1555TOARGBROW_SSE2
@@ -110,19 +119,14 @@ extern "C" {
 #define HAS_YUY2TOUV422ROW_SSE2
 #define HAS_YUY2TOUVROW_SSE2
 #define HAS_YUY2TOYROW_SSE2
-#define HAS_FIXEDDIV

-// Effects
-#define HAS_ARGBADDROW_SSE2
+// Effects:
 #define HAS_ARGBAFFINEROW_SSE2
 #define HAS_ARGBCOLORMATRIXROW_SSSE3
 #define HAS_ARGBGRAYROW_SSSE3
 #define HAS_ARGBMIRRORROW_SSSE3
-#define HAS_ARGBMULTIPLYROW_SSE2
 #define HAS_ARGBQUANTIZEROW_SSE2
 #define HAS_ARGBSEPIAROW_SSSE3
-#define HAS_ARGBSHADEROW_SSE2
-#define HAS_ARGBSUBTRACTROW_SSE2
 #define HAS_ARGBUNATTENUATEROW_SSE2
 #define HAS_COMPUTECUMULATIVESUMROW_SSE2
 #define HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
@@ -134,12 +138,12 @@ extern "C" {
 #define HAS_SOBELYROW_SSSE3
 #endif

-// The following are Windows only.
+// The following are Windows only:
 // TODO(fbarchard): Port to gcc.
 #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
 #define HAS_ARGBCOLORTABLEROW_X86
 #define HAS_RGBCOLORTABLEROW_X86
-// Visual C 2012 required for AVX2.
+// Caveat: Visual C 2012 required for AVX2.
 #if _MSC_VER >= 1700
 #define HAS_ARGBSHUFFLEROW_AVX2
 #define HAS_ARGBTOUVROW_AVX2
@@ -157,7 +161,7 @@ extern "C" {
 #define HAS_YUY2TOUVROW_AVX2
 #define HAS_YUY2TOYROW_AVX2

-// Effects
+// Effects:
 #define HAS_ARGBADDROW_AVX2
 #define HAS_ARGBATTENUATEROW_AVX2
 #define HAS_ARGBMIRRORROW_AVX2
@@ -167,7 +171,7 @@ extern "C" {
 #endif
 #endif

-// The following are Yasm x86 only.
+// The following are Yasm x86 only:
 // TODO(fbarchard): Port AVX2 to inline.
 #if !defined(LIBYUV_DISABLE_X86) && defined(HAVE_YASM)
     (defined(_M_IX86) || defined(_M_X64) || \
@@ -194,7 +198,7 @@ extern "C" {
 #endif
 #endif

-// The following are available on Neon platforms
+// The following are available on Neon platforms:
 #if !defined(LIBYUV_DISABLE_NEON) && \
     (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
 #define HAS_ABGRTOUVROW_NEON
@@ -267,7 +271,7 @@ extern "C" {
 #define HAS_YUY2TOUVROW_NEON
 #define HAS_YUY2TOYROW_NEON

-// Effects
+// Effects:
 #define HAS_ARGBADDROW_NEON
 #define HAS_ARGBATTENUATEROW_NEON
 #define HAS_ARGBBLENDROW_NEON
@@ -286,7 +290,7 @@ extern "C" {
 #define HAS_INTERPOLATEROW_NEON
 #endif

-// The following are available on Mips platforms
+// The following are available on Mips platforms:
 #if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__)
 #define HAS_COPYROW_MIPS
 #if defined(__mips_dsp) && (__mips_dsp_rev >= 2)
@@ -1534,8 +1538,9 @@ void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
 // Divide num by div and return as 16.16 fixed point result.
 int FixedDiv_C(int num, int div);

-#ifdef HAS_FIXEDDIV
-int FixedDiv(int num, int div);
+int FixedDiv_X86(int num, int div);
+#ifdef HAS_FIXEDDIV_X86
+#define FixedDiv FixedDiv_X86
 #else
 #define FixedDiv FixedDiv_C
 #endif
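Note on the FixedDiv change above: row.h now declares both implementations and
maps the unsuffixed name onto whichever one the platform provides, so callers
keep writing FixedDiv(). For orientation, a minimal sketch of the portable
16.16 fixed-point fallback, assumed equivalent to the FixedDiv_C defined in
row_common.cc rather than copied from it:

    // Divide num by div and return as 16.16 fixed point result.
    // Widen to 64 bits and pre-shift the numerator by 16 before dividing.
    int FixedDiv_C(int num, int div) {
      return static_cast<int>((static_cast<int64>(num) << 16) / div);
    }

The x86 versions further down (row_posix.cc and row_win.cc) compute the same
quotient by building num << 16 in edx:eax and issuing a single idiv.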
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 5e853e981..3a78b51c6 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_

-#define LIBYUV_VERSION 745
+#define LIBYUV_VERSION 746

 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
diff --git a/source/compare.cc b/source/compare.cc
index f8b358309..93935b1f3 100644
--- a/source/compare.cc
+++ b/source/compare.cc
@@ -30,7 +30,9 @@ extern "C" {
 uint32 HashDjb2_C(const uint8* src, int count, uint32 seed);

 // This module is for Visual C x86
-#if !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || \
+#if !defined(LIBYUV_DISABLE_X86) && \
+    !(defined(__native_client__) && defined(__x86_64__)) && \
+    (defined(_M_IX86) || \
     (defined(__x86_64__) || (defined(__i386__) && !defined(__pic__))))
 #define HAS_HASHDJB2_SSE41

@@ -73,8 +75,9 @@ uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count);
 #define HAS_SUMSQUAREERROR_NEON
 uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count);
 #endif
-#if !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || \
-    defined(__x86_64__) || defined(__i386__))
+#if !defined(LIBYUV_DISABLE_X86) && \
+    !(defined(__native_client__) && defined(__x86_64__)) && \
+    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
 #define HAS_SUMSQUAREERROR_SSE2
 uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count);
 #endif
diff --git a/source/compare_posix.cc b/source/compare_posix.cc
index 61b012364..b97a6eaa5 100644
--- a/source/compare_posix.cc
+++ b/source/compare_posix.cc
@@ -16,7 +16,9 @@ namespace libyuv {
 extern "C" {
 #endif

-#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
+#if !defined(LIBYUV_DISABLE_X86) && \
+    !(defined(__native_client__) && defined(__x86_64__)) && \
+    (defined(__x86_64__) || defined(__i386__))

 uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
   uint32 sse;
@@ -65,6 +67,7 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
 #endif  // defined(__x86_64__) || defined(__i386__)

 #if !defined(LIBYUV_DISABLE_X86) && \
+    !(defined(__native_client__) && defined(__x86_64__)) && \
     (defined(__x86_64__) || (defined(__i386__) && !defined(__pic__)))
 #define HAS_HASHDJB2_SSE41
 static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 };  // 33 ^ 16
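The new !(defined(__native_client__) && defined(__x86_64__)) clauses keep these
inline-assembly paths out of Native Client x86-64 builds, which instead fall
back to the portable code. For reference, the scalar path selected in that case
is a plain sum of squared byte differences; a minimal sketch, assumed
equivalent to the SumSquareError_C referenced above rather than copied from it:

    uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count) {
      uint32 sse = 0u;
      for (int i = 0; i < count; ++i) {
        int diff = src_a[i] - src_b[i];           // per-byte difference
        sse += static_cast<uint32>(diff * diff);  // accumulate squared error
      }
      return sse;
    }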
diff --git a/source/row_common.cc b/source/row_common.cc
index 60af38608..67ffc96b6 100644
--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -1904,7 +1904,7 @@ void I422ToUYVYRow_C(const uint8* src_y,
   }
 }

-#if !defined(LIBYUV_DISABLE_X86)
+#if !defined(LIBYUV_DISABLE_X86) && defined(HAS_I422TOARGBROW_SSSE3)
 // row_win.cc has asm version, but GCC uses 2 step wrapper.  5% slower.
 // TODO(fbarchard): Handle width > kMaxStride here instead of calling code.
 #if defined(__x86_64__) || defined(__i386__)
@@ -2001,7 +2001,6 @@ void UYVYToARGBRow_Unaligned_SSSE3(const uint8* src_uyvy,
   UYVYToYRow_Unaligned_SSE2(src_uyvy, row_y, width);
   I422ToARGBRow_Unaligned_SSSE3(row_y, row_u, row_v, dst_argb, width);
 }
-
 #endif  // defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)
 #endif  // !defined(LIBYUV_DISABLE_X86)

 #undef clamp0
diff --git a/source/row_posix.cc b/source/row_posix.cc
index 239731a17..642a0a71e 100644
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -3027,6 +3027,7 @@ void CopyRow_X86(const uint8* src, uint8* dst, int width) {
 }
 #endif  // HAS_COPYROW_X86

+#ifdef HAS_COPYROW_ERMS
 // Unaligned Multiple of 1.
 void CopyRow_ERMS(const uint8* src, uint8* dst, int width) {
   size_t width_tmp = static_cast<size_t>(width);
@@ -3039,6 +3040,7 @@ void CopyRow_ERMS(const uint8* src, uint8* dst, int width) {
   : "memory", "cc"
   );
 }
+#endif  // HAS_COPYROW_ERMS

 #ifdef HAS_SETROW_X86
 void SetRow_X86(uint8* dst, uint32 v32, int width) {
@@ -4167,14 +4169,14 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
                        uint32 value) {
   asm volatile (
     "movd      %3,%%xmm2                       \n"
-    "sub       %0,%1                           \n"
     "punpcklbw %%xmm2,%%xmm2                   \n"
     "punpcklqdq %%xmm2,%%xmm2                  \n"

     // 4 pixel loop.
     ".p2align  2                               \n"
   "1:                                          \n"
-    "movdqa    (%0),%%xmm0                     \n"
+    "movdqa    "MEMACCESS(0)",%%xmm0           \n"
+    "lea       "MEMLEA(0x10,0)",%0             \n"
     "movdqa    %%xmm0,%%xmm1                   \n"
     "punpcklbw %%xmm0,%%xmm0                   \n"
     "punpckhbw %%xmm1,%%xmm1                   \n"
@@ -4184,8 +4186,8 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
     "psrlw     $0x8,%%xmm1                     \n"
     "packuswb  %%xmm1,%%xmm0                   \n"
     "sub       $0x4,%2                         \n"
-    "movdqa    %%xmm0,(%0,%1,1)                \n"
-    "lea       0x10(%0),%0                     \n"
+    "movdqa    %%xmm0,"MEMACCESS(1)"           \n"
+    "lea       "MEMLEA(0x10,1)",%1             \n"
     "jg        1b                              \n"
   : "+r"(src_argb),  // %0
     "+r"(dst_argb),  // %1
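The pattern behind the ARGBShadeRow_SSE2 rewrite above, repeated in the hunks
that follow: the old code subtracted the source pointer from the destination
once ("sub %0,%1") and stored through (%0,%1,1), so a single lea advanced every
pointer at once. Native Client's x86-64 sandbox does not allow that addressing
trick, so each pointer now advances independently and every memory operand goes
through the MEMACCESS/MEMLEA wrappers. Roughly what those macros expand to,
recalled from the NaCl port convention; the definitions near the top of row.h
are authoritative:

    #if defined(__native_client__) && defined(__x86_64__)
    // NaCl x86-64: every access is relative to the r15 sandbox base register,
    // with the 32-bit pointer supplied as the index.
    #define MEMACCESS(base) "%%nacl:(%%r15,%q" #base ")"
    #define MEMLEA(offset, base) #offset "(%q" #base ")"
    #else
    // Elsewhere the wrappers reduce to plain (%reg) and disp(%reg) operands.
    #define MEMACCESS(base) "(%" #base ")"
    #define MEMLEA(offset, base) #offset "(%" #base ")"
    #endif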
".p2align 4 \n" "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu (%0,%1),%%xmm1 \n" + "movdqu "MEMACCESS(0)",%%xmm0 \n" + "lea "MEMLEA(0x10,0)",%0 \n" + "movdqu "MEMACCESS(1)",%%xmm1 \n" + "lea "MEMLEA(0x10,1)",%1 \n" "paddusb %%xmm1,%%xmm0 \n" "sub $0x4,%3 \n" - "movdqu %%xmm0,(%0,%2,1) \n" - "lea 0x10(%0),%0 \n" + "movdqu %%xmm0,"MEMACCESS(2)" \n" + "lea "MEMLEA(0x10,2)",%2 \n" "jg 1b \n" : "+r"(src_argb0), // %0 "+r"(src_argb1), // %1 @@ -4275,18 +4276,17 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { asm volatile ( - "sub %0,%1 \n" - "sub %0,%2 \n" - // 4 pixel loop. ".p2align 4 \n" "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu (%0,%1),%%xmm1 \n" + "movdqu "MEMACCESS(0)",%%xmm0 \n" + "lea "MEMLEA(0x10,0)",%0 \n" + "movdqu "MEMACCESS(1)",%%xmm1 \n" + "lea "MEMLEA(0x10,1)",%1 \n" "psubusb %%xmm1,%%xmm0 \n" "sub $0x4,%3 \n" - "movdqu %%xmm0,(%0,%2,1) \n" - "lea 0x10(%0),%0 \n" + "movdqu %%xmm0,"MEMACCESS(2)" \n" + "lea "MEMLEA(0x10,2)",%2 \n" "jg 1b \n" : "+r"(src_argb0), // %0 "+r"(src_argb1), // %1 @@ -4793,6 +4793,7 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, } #endif // HAS_ARGBAFFINEROW_SSE2 +#ifdef HAS_INTERPOLATEROW_SSSE3 // Bilinear filter 16x2 -> 16x1 void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, ptrdiff_t src_stride, int dst_width, @@ -4895,6 +4896,7 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, #endif ); } +#endif // HAS_INTERPOLATEROW_SSSE3 #ifdef HAS_INTERPOLATEROW_SSE2 // Bilinear filter 16x2 -> 16x1 @@ -5009,6 +5011,7 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, } #endif // HAS_INTERPOLATEROW_SSE2 +#ifdef HAS_INTERPOLATEROW_SSSE3 // Bilinear filter 16x2 -> 16x1 void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr, ptrdiff_t src_stride, int dst_width, @@ -5111,6 +5114,7 @@ void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr, #endif ); } +#endif // HAS_INTERPOLATEROW_SSSE3 #ifdef HAS_INTERPOLATEROW_SSE2 // Bilinear filter 16x2 -> 16x1 @@ -5225,6 +5229,7 @@ void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr, } #endif // HAS_INTERPOLATEROW_SSE2 +#ifdef HAS_HALFROW_SSE2 void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride, uint8* dst_uv, int pix) { asm volatile ( @@ -5247,7 +5252,9 @@ void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride, #endif ); } +#endif // HAS_HALFROW_SSE2 +#ifdef HAS_ARGBTOBAYERROW_SSSE3 void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer, uint32 selector, int pix) { asm volatile ( @@ -5275,7 +5282,9 @@ void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer, #endif ); } +#endif // HAS_ARGBTOBAYERROW_SSSE3 +#ifdef HAS_ARGBSHUFFLEROW_SSSE3 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. 
@@ -4793,6 +4793,7 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
 }
 #endif  // HAS_ARGBAFFINEROW_SSE2

+#ifdef HAS_INTERPOLATEROW_SSSE3
 // Bilinear filter 16x2 -> 16x1
 void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                           ptrdiff_t src_stride, int dst_width,
@@ -4895,6 +4896,7 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
 #endif
   );
 }
+#endif  // HAS_INTERPOLATEROW_SSSE3

 #ifdef HAS_INTERPOLATEROW_SSE2
 // Bilinear filter 16x2 -> 16x1
@@ -5009,6 +5011,7 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
 }
 #endif  // HAS_INTERPOLATEROW_SSE2

+#ifdef HAS_INTERPOLATEROW_SSSE3
 // Bilinear filter 16x2 -> 16x1
 void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                                     ptrdiff_t src_stride, int dst_width,
@@ -5111,6 +5114,7 @@ void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
 #endif
   );
 }
+#endif  // HAS_INTERPOLATEROW_SSSE3

 #ifdef HAS_INTERPOLATEROW_SSE2
 // Bilinear filter 16x2 -> 16x1
@@ -5225,6 +5229,7 @@ void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
 }
 #endif  // HAS_INTERPOLATEROW_SSE2

+#ifdef HAS_HALFROW_SSE2
 void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
                   uint8* dst_uv, int pix) {
   asm volatile (
@@ -5247,7 +5252,9 @@ void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
 #endif
   );
 }
+#endif  // HAS_HALFROW_SSE2

+#ifdef HAS_ARGBTOBAYERROW_SSSE3
 void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
                           uint32 selector, int pix) {
   asm volatile (
@@ -5275,7 +5282,9 @@ void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
 #endif
   );
 }
+#endif  // HAS_ARGBTOBAYERROW_SSSE3

+#ifdef HAS_ARGBSHUFFLEROW_SSSE3
 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
 void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
                           const uint8* shuffler, int pix) {
@@ -5330,7 +5339,9 @@ void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
 #endif
   );
 }
+#endif  // HAS_ARGBSHUFFLEROW_SSSE3

+#ifdef HAS_I422TOYUY2ROW_SSE2
 void I422ToYUY2Row_SSE2(const uint8* src_y,
                         const uint8* src_u,
                         const uint8* src_v,
@@ -5365,7 +5376,9 @@ void I422ToYUY2Row_SSE2(const uint8* src_y,
 #endif
   );
 }
+#endif  // HAS_I422TOYUY2ROW_SSE2

+#ifdef HAS_I422TOUYVYROW_SSE2
 void I422ToUYVYRow_SSE2(const uint8* src_y,
                         const uint8* src_u,
                         const uint8* src_v,
@@ -5400,9 +5413,11 @@ void I422ToUYVYRow_SSE2(const uint8* src_y,
 #endif
   );
 }
+#endif  // HAS_I422TOUYVYROW_SSE2

+#ifdef HAS_FIXEDDIV_X86
 // Divide num by div and return as 16.16 fixed point result.
-int FixedDiv(int num, int div) {
+int FixedDiv_X86(int num, int div) {
   asm volatile (
     "cdq                                       \n"
     "shld      $0x10,%%eax,%%edx               \n"
@@ -5415,6 +5430,7 @@ int FixedDiv(int num, int div) {
   );
   return num;
 }
+#endif  // HAS_FIXEDDIV_X86

 #endif  // defined(__x86_64__) || defined(__i386__)

 #ifdef __cplusplus
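Both FixedDiv_X86 bodies do the same thing: cdq sign-extends num into edx,
shld/shl shift the 64-bit pair edx:eax left by 16, and idiv then produces
(num << 16) / div in one instruction. A small worked check of the 16.16
results, in plain C for illustration:

    // 16.16 fixed point: integer part in the high 16 bits, fraction below.
    int third = FixedDiv(1, 3);      // 65536 / 3 = 21845, ~0.3333
    int ratio = FixedDiv(640, 480);  // 41943040 / 480 = 87381, ~1.3333
    double check = ratio / 65536.0;  // back to floating point: 1.3333...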
diff --git a/source/row_win.cc b/source/row_win.cc
index a255293b0..25c07b3f8 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -5239,13 +5239,13 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
     mov        edx, [esp + 8]   // dst_argb
     mov        ecx, [esp + 12]  // width
     movd       xmm2, [esp + 16]  // value
-    sub        edx, eax
     punpcklbw  xmm2, xmm2
     punpcklqdq xmm2, xmm2

     align      16
  convertloop:
     movdqa     xmm0, [eax]      // read 4 pixels
+    lea        eax, [eax + 16]
     movdqa     xmm1, xmm0
     punpcklbw  xmm0, xmm0       // first 2
     punpckhbw  xmm1, xmm1       // next 2
@@ -5255,8 +5255,8 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
     psrlw      xmm1, 8
     packuswb   xmm0, xmm1
     sub        ecx, 4
-    movdqa     [eax + edx], xmm0
-    lea        eax, [eax + 16]
+    movdqa     [edx], xmm0
+    lea        edx, [edx + 16]
     jg         convertloop

     ret
@@ -5276,25 +5276,25 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
     pxor       xmm5, xmm5  // constant 0
-    sub        esi, eax
-    sub        edx, eax

     align      16
  convertloop:
     movdqu     xmm0, [eax]        // read 4 pixels from src_argb0
-    movdqu     xmm2, [eax + esi]  // read 4 pixels from src_argb1
+    movdqu     xmm2, [esi]        // read 4 pixels from src_argb1
     movdqu     xmm1, xmm0
     movdqu     xmm3, xmm2
-    punpcklbw  xmm0, xmm0   // first 2
-    punpckhbw  xmm1, xmm1   // next 2
-    punpcklbw  xmm2, xmm5   // first 2
-    punpckhbw  xmm3, xmm5   // next 2
-    pmulhuw    xmm0, xmm2   // src_argb0 * src_argb1 first 2
-    pmulhuw    xmm1, xmm3   // src_argb0 * src_argb1 next 2
+    punpcklbw  xmm0, xmm0         // first 2
+    punpckhbw  xmm1, xmm1         // next 2
+    punpcklbw  xmm2, xmm5         // first 2
+    punpckhbw  xmm3, xmm5         // next 2
+    pmulhuw    xmm0, xmm2         // src_argb0 * src_argb1 first 2
+    pmulhuw    xmm1, xmm3         // src_argb0 * src_argb1 next 2
+    lea        eax, [eax + 16]
+    lea        esi, [esi + 16]
     packuswb   xmm0, xmm1
     sub        ecx, 4
-    movdqu     [eax + edx], xmm0
-    lea        eax, [eax + 16]
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
     jg         convertloop

     pop        esi
@@ -5315,8 +5315,6 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
     mov        esi, [esp + 4 + 8]   // src_argb1
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
-    sub        esi, eax
-    sub        edx, eax

     sub        ecx, 4
     jl         convertloop49
@@ -5324,11 +5322,13 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
     align      16
  convertloop4:
     movdqu     xmm0, [eax]      // read 4 pixels from src_argb0
-    movdqu     xmm1, [eax + esi]  // read 4 pixels from src_argb1
+    lea        eax, [eax + 16]
+    movdqu     xmm1, [esi]      // read 4 pixels from src_argb1
+    lea        esi, [esi + 16]
     paddusb    xmm0, xmm1       // src_argb0 + src_argb1
     sub        ecx, 4
-    movdqu     [eax + edx], xmm0
-    lea        eax, [eax + 16]
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
     jge        convertloop4

  convertloop49:
@@ -5337,11 +5337,13 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,

  convertloop1:
     movd       xmm0, [eax]      // read 1 pixels from src_argb0
-    movd       xmm1, [eax + esi]  // read 1 pixels from src_argb1
+    lea        eax, [eax + 4]
+    movd       xmm1, [esi]      // read 1 pixels from src_argb1
+    lea        esi, [esi + 4]
     paddusb    xmm0, xmm1       // src_argb0 + src_argb1
     sub        ecx, 1
-    movd       [eax + edx], xmm0
-    lea        eax, [eax + 4]
+    movd       [edx], xmm0
+    lea        edx, [edx + 4]
     jge        convertloop1

  convertloop19:
@@ -5362,17 +5364,17 @@ void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
     mov        esi, [esp + 4 + 8]   // src_argb1
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
-    sub        esi, eax
-    sub        edx, eax

     align      16
  convertloop:
     movdqu     xmm0, [eax]      // read 4 pixels from src_argb0
-    movdqu     xmm1, [eax + esi]  // read 4 pixels from src_argb1
+    lea        eax, [eax + 16]
+    movdqu     xmm1, [esi]      // read 4 pixels from src_argb1
+    lea        esi, [esi + 16]
     psubusb    xmm0, xmm1       // src_argb0 - src_argb1
     sub        ecx, 4
-    movdqu     [eax + edx], xmm0
-    lea        eax, [eax + 16]
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
     jg         convertloop

     pop        esi
@@ -5392,14 +5394,14 @@ void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
     mov        esi, [esp + 4 + 8]   // src_argb1
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
-    vpxor      ymm5, ymm5, ymm5     // constant 0
-    sub        esi, eax
-    sub        edx, eax
+    vpxor      ymm5, ymm5, ymm5  // constant 0

     align      16
  convertloop:
     vmovdqu    ymm1, [eax]        // read 8 pixels from src_argb0
-    vmovdqu    ymm3, [eax + esi]  // read 8 pixels from src_argb1
+    lea        eax, [eax + 32]
+    vmovdqu    ymm3, [esi]        // read 8 pixels from src_argb1
+    lea        esi, [esi + 32]
     vpunpcklbw ymm0, ymm1, ymm1   // low 4
     vpunpckhbw ymm1, ymm1, ymm1   // high 4
     vpunpcklbw ymm2, ymm3, ymm5   // low 4
@@ -5407,8 +5409,8 @@ void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
     vpmulhuw   ymm0, ymm0, ymm2   // src_argb0 * src_argb1 low 4
     vpmulhuw   ymm1, ymm1, ymm3   // src_argb0 * src_argb1 high 4
     vpackuswb  ymm0, ymm0, ymm1
-    vmovdqu    [eax + edx], ymm0
-    lea        eax, [eax + 32]
+    vmovdqu    [edx], ymm0
+    lea        edx, [edx + 32]
     sub        ecx, 8
     jg         convertloop

@@ -5430,15 +5432,15 @@ void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
     mov        esi, [esp + 4 + 8]   // src_argb1
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
-    sub        esi, eax
-    sub        edx, eax

     align      16
  convertloop:
     vmovdqu    ymm0, [eax]              // read 8 pixels from src_argb0
-    vpaddusb   ymm0, ymm0, [eax + esi]  // add 8 pixels from src_argb1
-    vmovdqu    [eax + edx], ymm0
     lea        eax, [eax + 32]
+    vpaddusb   ymm0, ymm0, [esi]        // add 8 pixels from src_argb1
+    lea        esi, [esi + 32]
+    vmovdqu    [edx], ymm0
+    lea        edx, [edx + 32]
     sub        ecx, 8
     jg         convertloop

@@ -5460,15 +5462,15 @@ void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
     mov        esi, [esp + 4 + 8]   // src_argb1
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
-    sub        esi, eax
-    sub        edx, eax

     align      16
  convertloop:
     vmovdqu    ymm0, [eax]              // read 8 pixels from src_argb0
-    vpsubusb   ymm0, ymm0, [eax + esi]  // src_argb0 - src_argb1
-    vmovdqu    [eax + edx], ymm0
     lea        eax, [eax + 32]
+    vpsubusb   ymm0, ymm0, [esi]        // src_argb0 - src_argb1
+    lea        esi, [esi + 32]
+    vmovdqu    [edx], ymm0
+    lea        edx, [edx + 32]
     sub        ecx, 8
     jg         convertloop
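Callers never name these suffixed kernels directly; a row-function pointer is
chosen at runtime from the HAS_ macros plus a CPU-feature check. A condensed
sketch of the usual libyuv dispatch shape; the real selection (in
planar_functions.cc) also checks pointer and stride alignment:

    void (*ARGBMultiplyRow)(const uint8* src0, const uint8* src1,
                            uint8* dst, int width) = ARGBMultiplyRow_C;
    #if defined(HAS_ARGBMULTIPLYROW_SSE2)
      if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) {
        ARGBMultiplyRow = ARGBMultiplyRow_SSE2;
      }
    #endif
    #if defined(HAS_ARGBMULTIPLYROW_AVX2)
      if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 8)) {
        ARGBMultiplyRow = ARGBMultiplyRow_AVX2;
      }
    #endif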
@@ -6646,9 +6648,10 @@ void I422ToUYVYRow_SSE2(const uint8* src_y,
   }
 }

+#ifdef HAS_FIXEDDIV_X86
 // Divide num by div and return as 16.16 fixed point result.
 __declspec(naked) __declspec(align(16))
-int FixedDiv(int num, int div) {
+int FixedDiv_X86(int num, int div) {
   __asm {
     mov        eax, [esp + 4]    // num
     cdq                          // extend num to 64 bits
@@ -6658,6 +6661,7 @@ int FixedDiv(int num, int div) {
     ret
   }
 }
+#endif  // HAS_FIXEDDIV_X86

 #endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)

 #ifdef __cplusplus
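The main consumer of FixedDiv is the scaler, which steps across the source in
16.16 increments. A hedged illustration of that usage pattern; the variable
names here are assumed for the example, not taken from this patch:

    // Walk the source row at a fixed-point ratio of src/dst per output pixel.
    int dx = FixedDiv(src_width, dst_width);  // e.g. 640/480 -> 87381 (~1.333)
    int x = 0;
    for (int j = 0; j < dst_width; ++j) {
      dst[j] = src[x >> 16];                  // nearest-neighbour sample
      x += dx;
    }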