diff --git a/README.chromium b/README.chromium
index ec57a1d6b..7447b9fc0 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 541
+Version: 542
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/planar_functions.h b/include/libyuv/planar_functions.h
index 76b6517f3..7247f880e 100644
--- a/include/libyuv/planar_functions.h
+++ b/include/libyuv/planar_functions.h
@@ -209,6 +209,13 @@ int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
               uint8* dst_argb, int dst_stride_argb,
               int width, int height);
 
+// Multiply 2 ARGB images together and store to destination.
+LIBYUV_API
+int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
+                 const uint8* src_argb1, int src_stride_argb1,
+                 uint8* dst_argb, int dst_stride_argb,
+                 int width, int height);
+
 // Convert I422 to YUY2.
 LIBYUV_API
 int I422ToYUY2(const uint8* src_y, int src_stride_y,
@@ -265,11 +272,6 @@ int ARGBShade(const uint8* src_argb, int src_stride_argb,
               uint8* dst_argb, int dst_stride_argb,
               int width, int height, uint32 value);
 
-// Multiply ARGB image by ARGB image.
-int ARGBMultiply(const uint8* src_argb, int src_stride_argb,
-                 uint8* dst_argb, int dst_stride_argb,
-                 int width, int height);
-
 // Interpolate between two ARGB images using specified amount of interpolation
 // (0 to 255) and store to destination.
 // 'interpolation' is specified as 8 bit fraction where 0 means 100% src_argb0
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 22eccce76..ca594af74 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -967,6 +967,15 @@ void ARGBBlendRow_NEON(const uint8* src_argb, const uint8* src_argb1,
 void ARGBBlendRow_C(const uint8* src_argb, const uint8* src_argb1,
                     uint8* dst_argb, int width);
 
+// ARGB multiply rows (src_argb * src_argb1 -> dst_argb).  Same API as Blend,
+// but SSE2 requires 16 byte aligned pointers and a width multiple of 4.
+void ARGBMultiplyRow_C(const uint8* src_argb, const uint8* src_argb1,
+                       uint8* dst_argb, int width);
+void ARGBMultiplyRow_SSE2(const uint8* src_argb, const uint8* src_argb1,
+                          uint8* dst_argb, int width);
+void ARGBMultiplyRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1,
+                              uint8* dst_argb, int width);
+
 void ARGBToRGB24Row_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToRAWRow_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToRGB565Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
@@ -1270,7 +1279,6 @@ void CumulativeSumToAverageRow_C(const int32* topleft, const int32* botleft,
 void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum,
                                const int32* previous_cumsum, int width);
 
-
 LIBYUV_API
 void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
                      uint8* dst_argb, const float* uv_dudv, int width);
@@ -1287,10 +1295,6 @@ void ARGBInterpolateRow_SSSE3(uint8* dst_argb, const uint8* src_argb,
 void ARGBInterpolateRow_NEON(uint8* dst_argb, const uint8* src_argb,
                              ptrdiff_t src_stride_argb, int dst_width,
                              int source_y_fraction);
-void ARGBMultiplyRow_C(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBMultiplyRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBMultiplyRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb,
-                              int width);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 73eb3fad9..69e5ff069 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 541
+#define LIBYUV_VERSION 542
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index 750670ce3..9aca6555d 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -404,6 +404,50 @@ int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
   return 0;
 }
 
+// Multiply 2 ARGB images together and store to destination.
+LIBYUV_API
+int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
+                 const uint8* src_argb1, int src_stride_argb1,
+                 uint8* dst_argb, int dst_stride_argb,
+                 int width, int height) {
+  if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+
+  void (*ARGBMultiplyRow)(const uint8* src0, const uint8* src1, uint8* dst,
+                          int width) = ARGBMultiplyRow_C;
+#if defined(HAS_ARGBMULTIPLYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 4 &&
+      IS_ALIGNED(src_argb0, 16) && IS_ALIGNED(src_stride_argb0, 16) &&
+      IS_ALIGNED(src_argb1, 16) && IS_ALIGNED(src_stride_argb1, 16) &&
+      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+    ARGBMultiplyRow = ARGBMultiplyRow_Any_SSE2;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBMultiplyRow = ARGBMultiplyRow_SSE2;
+    }
+  }
+#elif defined(HAS_ARGBMULTIPLYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 4)) {
+    ARGBMultiplyRow = ARGBMultiplyRow_NEON;
+  }
+#endif
+
+  // Multiply plane
+  for (int y = 0; y < height; ++y) {
+    ARGBMultiplyRow(src_argb0, src_argb1, dst_argb, width);
+    src_argb0 += src_stride_argb0;
+    src_argb1 += src_stride_argb1;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
 // Convert I422 to BGRA.
 LIBYUV_API
 int I422ToBGRA(const uint8* src_y, int src_stride_y,
@@ -1170,47 +1214,6 @@ int ARGBShade(const uint8* src_argb, int src_stride_argb,
   return 0;
 }
 
-// ARGB multiply 2 images together.
-LIBYUV_API
-int ARGBMultiply(const uint8* src_argb, int src_stride_argb,
-                 uint8* dst_argb, int dst_stride_argb,
-                 int width, int height) {
-  if (!src_argb || !dst_argb || width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    src_argb = src_argb + (height - 1) * src_stride_argb;
-    src_stride_argb = -src_stride_argb;
-  }
-
-  void (*ARGBMultiplyRow)(const uint8* src, uint8* dst, int width) =
-      ARGBMultiplyRow_C;
-#if defined(HAS_ARGBMULTIPLYROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && width >= 4 &&
-      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-    ARGBMultiplyRow = ARGBMultiplyRow_Any_SSE2;
-    if (IS_ALIGNED(width, 4)) {
-      ARGBMultiplyRow = ARGBMultiplyRow_SSE2;
-    }
-  }
-#elif defined(HAS_ARGBMULTIPLYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 4)) {
-    ARGBMultiplyRow = ARGBMultiplyRow_NEON;
-  }
-#endif
-
-  // Multiply plane
-  for (int y = 0; y < height; ++y) {
-    ARGBMultiplyRow(src_argb, dst_argb, width);
-    src_argb += src_stride_argb;
-    dst_argb += dst_stride_argb;
-  }
-  return 0;
-}
-
 // Interpolate 2 ARGB images by specified amount (0 to 255).
 // TODO(fbarchard): Consider selecting a specialization for interpolation so
 //     row function doesn't need to check interpolation on each row.
diff --git a/source/row_any.cc b/source/row_any.cc
index 1d928a3fd..8c6a8122a 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -373,10 +373,12 @@ MergeUVRow_ANY(MergeUVRow_Any_NEON, MergeUVRow_NEON, MergeUVRow_C, 15)
 #undef MergeUVRow_ANY
 
 #define MultiplyRow_ANY(NAMEANY, ARGBMULT_SIMD, ARGBMULT_C, MASK)              \
-    void NAMEANY(const uint8* src_argb, uint8* dst_argb, int width) {          \
+    void NAMEANY(const uint8* src_argb0, const uint8* src_argb1,               \
+                 uint8* dst_argb, int width) {                                 \
       int n = width & ~MASK;                                                   \
-      ARGBMULT_SIMD(src_argb, dst_argb, n);                                    \
-      ARGBMULT_C(src_argb + n * 4,                                             \
+      ARGBMULT_SIMD(src_argb0, src_argb1, dst_argb, n);                        \
+      ARGBMULT_C(src_argb0 + n * 4,                                            \
+                 src_argb1 + n * 4,                                            \
                  dst_argb + n * 4,                                             \
                  width & MASK);                                                \
     }
diff --git a/source/row_common.cc b/source/row_common.cc
index f4c55b6d2..b9f02d0e3 100644
--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -704,21 +704,23 @@ void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width,
 #define REPEAT8(v) (v) | ((v) << 8)
 #define SHADE(f, v) v * f >> 16
 
-void ARGBMultiplyRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
+void ARGBMultiplyRow_C(const uint8* src_argb0, const uint8* src_argb1,
+                       uint8* dst_argb, int width) {
   for (int i = 0; i < width; ++i) {
-    const uint32 b = REPEAT8(src_argb[0]);
-    const uint32 g = REPEAT8(src_argb[1]);
-    const uint32 r = REPEAT8(src_argb[2]);
-    const uint32 a = REPEAT8(src_argb[3]);
-    const uint32 b_scale = dst_argb[0];
-    const uint32 g_scale = dst_argb[1];
-    const uint32 r_scale = dst_argb[2];
-    const uint32 a_scale = dst_argb[3];
+    const uint32 b = REPEAT8(src_argb0[0]);
+    const uint32 g = REPEAT8(src_argb0[1]);
+    const uint32 r = REPEAT8(src_argb0[2]);
+    const uint32 a = REPEAT8(src_argb0[3]);
+    const uint32 b_scale = src_argb1[0];
+    const uint32 g_scale = src_argb1[1];
+    const uint32 r_scale = src_argb1[2];
+    const uint32 a_scale = src_argb1[3];
     dst_argb[0] = SHADE(b, b_scale);
     dst_argb[1] = SHADE(g, g_scale);
     dst_argb[2] = SHADE(r, r_scale);
     dst_argb[3] = SHADE(a, a_scale);
-    src_argb += 4;
+    src_argb0 += 4;
+    src_argb1 += 4;
     dst_argb += 4;
   }
 }
diff --git a/source/row_posix.cc b/source/row_posix.cc
index d62be2383..c710241ff 100644
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -3963,10 +3963,12 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
 #ifdef HAS_ARGBMULTIPLYROW_SSE2
 // Multiple 2 rows of ARGB pixels together, 4 pixels at a time.
 // Aligned to 16 bytes.
-void ARGBMultiplyRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
+void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+                          uint8* dst_argb, int width) {
   asm volatile (
     "pxor      %%xmm5,%%xmm5                   \n"
     "sub       %0,%1                           \n"
+    "sub       %0,%2                           \n"
 
     // 4 pixel loop.
     ".p2align  4                               \n"
@@ -3982,13 +3984,14 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
     "pmulhuw   %%xmm2,%%xmm0                   \n"
     "pmulhuw   %%xmm3,%%xmm1                   \n"
     "packuswb  %%xmm1,%%xmm0                   \n"
-    "sub       $0x4,%2                         \n"
-    "movdqa    %%xmm0,(%0,%1,1)                \n"
+    "sub       $0x4,%3                         \n"
+    "movdqa    %%xmm0,(%0,%2,1)                \n"
     "lea       0x10(%0),%0                     \n"
     "jg        1b                              \n"
-  : "+r"(src_argb),   // %0
-    "+r"(dst_argb),   // %1
-    "+r"(width)       // %2
+  : "+r"(src_argb0),   // %0
+    "+r"(src_argb1),   // %1
+    "+r"(dst_argb),   // %2
+    "+r"(width)       // %3
   :
   : "memory", "cc"
 #if defined(__SSE2__)
diff --git a/source/row_win.cc b/source/row_win.cc
index f988312ed..5b6b5448b 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -4280,18 +4280,22 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
 // Multiple 2 rows of ARGB pixels together, 4 pixels at a time.
 // Aligned to 16 bytes.
 __declspec(naked) __declspec(align(16))
-void ARGBMultiplyRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
+void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+                          uint8* dst_argb, int width) {
   __asm {
-    mov        eax, [esp + 4]   // src_argb
-    mov        edx, [esp + 8]   // dst_argb
-    mov        ecx, [esp + 12]  // width
+    push       esi
+    mov        eax, [esp + 4 + 4]   // src_argb0
+    mov        esi, [esp + 4 + 8]   // src_argb1
+    mov        edx, [esp + 4 + 12]  // dst_argb
+    mov        ecx, [esp + 4 + 16]  // width
     pxor       xmm5, xmm5  // constant 0
+    sub        esi, eax
     sub        edx, eax
 
     align      16
  convertloop:
-    movdqa     xmm0, [eax]      // read 4 pixels
-    movdqa     xmm2, [eax + edx]  // read 4 dest pixels
+    movdqa     xmm0, [eax]      // read 4 pixels from src_argb0
+    movdqa     xmm2, [eax + esi]  // read 4 pixels from src_argb1
     movdqa     xmm1, xmm0
     movdqa     xmm3, xmm2
     punpcklbw  xmm0, xmm0       // first 2
@@ -4306,6 +4310,7 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
     lea        eax, [eax + 16]
     jg         convertloop
 
+    pop        esi
     ret
   }
 }
diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc
index 6e987f5c9..bfbf1bead 100644
--- a/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
@@ -913,17 +913,21 @@ static int TestMultiply(int width, int height, int benchmark_iterations,
     src_argb_a[i + off] = (random() & 0xff);
     src_argb_b[i + off] = (random() & 0xff);
   }
-  memcpy(dst_argb_c, src_argb_b + off, kStride * height);
-  memcpy(dst_argb_opt, src_argb_b + off, kStride * height);
+  memset(dst_argb_c, 0, kStride * height);
+  memset(dst_argb_opt, 0, kStride * height);
 
   MaskCpuFlags(0);
   ARGBMultiply(src_argb_a + off, kStride,
+               src_argb_b + off, kStride,
                dst_argb_c, kStride,
                width, invert * height);
   MaskCpuFlags(-1);
-  ARGBMultiply(src_argb_a + off, kStride,
-               dst_argb_opt, kStride,
-               width, invert * height);
+  for (int i = 0; i < benchmark_iterations; ++i) {
+    ARGBMultiply(src_argb_a + off, kStride,
+                 src_argb_b + off, kStride,
+                 dst_argb_opt, kStride,
+                 width, invert * height);
+  }
   int max_diff = 0;
   for (int i = 0; i < kStride * height; ++i) {
     int abs_diff =
@@ -933,12 +937,6 @@ static int TestMultiply(int width, int height, int benchmark_iterations,
       max_diff = abs_diff;
     }
   }
-  // Benchmark.
-  for (int i = 0; i < benchmark_iterations - 1; ++i) {
-    ARGBMultiply(src_argb_a + off, kStride,
-                dst_argb_opt, kStride,
-                width, invert * height);
-  }
   free_aligned_buffer_64(src_argb_a)
   free_aligned_buffer_64(src_argb_b)
   free_aligned_buffer_64(dst_argb_c)