diff --git a/README.chromium b/README.chromium
index 64d025562..dddf0ae85 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 587
+Version: 588
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index f051b3def..b7cbd6e29 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -141,6 +141,9 @@ extern "C" {
 // Effects
 #define HAS_ARGBATTENUATEROW_AVX2
 #define HAS_ARGBUNATTENUATEROW_AVX2
+#define HAS_ARGBMULTIPLYROW_AVX2
+#define HAS_ARGBADDROW_AVX2
+#define HAS_ARGBSUBTRACTROW_AVX2
 #endif
 #endif
 
@@ -1011,6 +1014,10 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb, const uint8* src_argb1,
                           uint8* dst_argb, int width);
 void ARGBMultiplyRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1,
                               uint8* dst_argb, int width);
+void ARGBMultiplyRow_AVX2(const uint8* src_argb, const uint8* src_argb1,
+                          uint8* dst_argb, int width);
+void ARGBMultiplyRow_Any_AVX2(const uint8* src_argb, const uint8* src_argb1,
+                              uint8* dst_argb, int width);
 void ARGBMultiplyRow_NEON(const uint8* src_argb, const uint8* src_argb1,
                           uint8* dst_argb, int width);
 void ARGBMultiplyRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1,
@@ -1023,6 +1030,10 @@ void ARGBAddRow_SSE2(const uint8* src_argb, const uint8* src_argb1,
                      uint8* dst_argb, int width);
 void ARGBAddRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1,
                          uint8* dst_argb, int width);
+void ARGBAddRow_AVX2(const uint8* src_argb, const uint8* src_argb1,
+                     uint8* dst_argb, int width);
+void ARGBAddRow_Any_AVX2(const uint8* src_argb, const uint8* src_argb1,
+                         uint8* dst_argb, int width);
 void ARGBAddRow_NEON(const uint8* src_argb, const uint8* src_argb1,
                      uint8* dst_argb, int width);
 void ARGBAddRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1,
@@ -1036,6 +1047,10 @@ void ARGBSubtractRow_SSE2(const uint8* src_argb, const uint8* src_argb1,
                           uint8* dst_argb, int width);
 void ARGBSubtractRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1,
                               uint8* dst_argb, int width);
+void ARGBSubtractRow_AVX2(const uint8* src_argb, const uint8* src_argb1,
+                          uint8* dst_argb, int width);
+void ARGBSubtractRow_Any_AVX2(const uint8* src_argb, const uint8* src_argb1,
+                              uint8* dst_argb, int width);
 void ARGBSubtractRow_NEON(const uint8* src_argb, const uint8* src_argb1,
                           uint8* dst_argb, int width);
 void ARGBSubtractRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1,
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index d4755a768..9907e217e 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 587
+#define LIBYUV_VERSION 588
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index 357b081a5..db8ad43b5 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -28,6 +28,7 @@ LIBYUV_API
 void CopyPlane(const uint8* src_y, int src_stride_y,
                uint8* dst_y, int dst_stride_y, int width, int height) {
+  // Coalesce contiguous rows.
   if (src_stride_y == width && dst_stride_y == width) {
     CopyPlane(src_y, 0, dst_y, 0, width * height, 1);
     return;
   }
@@ -503,7 +504,7 @@ int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
   return 0;
 }
 
-// Multiply 2 ARGB images together and store to destination.
+// Multiply 2 ARGB images and store to destination.
 LIBYUV_API
 int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
                  const uint8* src_argb1, int src_stride_argb1,
@@ -518,6 +519,15 @@ int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
     dst_argb = dst_argb + (height - 1) * dst_stride_argb;
     dst_stride_argb = -dst_stride_argb;
   }
+  // Coalesce contiguous rows.
+  if (src_stride_argb0 == width * 4 &&
+      src_stride_argb1 == width * 4 &&
+      dst_stride_argb == width * 4) {
+    return ARGBMultiply(src_argb0, 0,
+                        src_argb1, 0,
+                        dst_argb, 0,
+                        width * height, 1);
+  }
   void (*ARGBMultiplyRow)(const uint8* src0, const uint8* src1,
                           uint8* dst, int width) = ARGBMultiplyRow_C;
 
@@ -531,7 +541,18 @@ int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
       ARGBMultiplyRow = ARGBMultiplyRow_SSE2;
     }
   }
-#elif defined(HAS_ARGBMULTIPLYROW_NEON)
+#endif
+#if defined(HAS_ARGBMULTIPLYROW_AVX2)
+  bool clear = false;
+  if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
+    clear = true;
+    ARGBMultiplyRow = ARGBMultiplyRow_Any_AVX2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBMultiplyRow = ARGBMultiplyRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBMULTIPLYROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
     ARGBMultiplyRow = ARGBMultiplyRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
@@ -547,10 +568,16 @@ int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
     src_argb1 += src_stride_argb1;
     dst_argb += dst_stride_argb;
   }
+
+#if defined(HAS_ARGBMULTIPLYROW_AVX2)
+  if (clear) {
+    __asm vzeroupper;
+  }
+#endif
   return 0;
 }
 
-// Add 2 ARGB images together and store to destination.
+// Add 2 ARGB images and store to destination.
 LIBYUV_API
 int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
             const uint8* src_argb1, int src_stride_argb1,
@@ -565,6 +592,15 @@ int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
     dst_argb = dst_argb + (height - 1) * dst_stride_argb;
     dst_stride_argb = -dst_stride_argb;
   }
+  // Coalesce contiguous rows.
+  if (src_stride_argb0 == width * 4 &&
+      src_stride_argb1 == width * 4 &&
+      dst_stride_argb == width * 4) {
+    return ARGBAdd(src_argb0, 0,
+                   src_argb1, 0,
+                   dst_argb, 0,
+                   width * height, 1);
+  }
   void (*ARGBAddRow)(const uint8* src0, const uint8* src1,
                      uint8* dst, int width) = ARGBAddRow_C;
 
@@ -578,7 +614,18 @@ int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
       ARGBAddRow = ARGBAddRow_SSE2;
     }
   }
-#elif defined(HAS_ARGBADDROW_NEON)
+#endif
+#if defined(HAS_ARGBADDROW_AVX2)
+  bool clear = false;
+  if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
+    clear = true;
+    ARGBAddRow = ARGBAddRow_Any_AVX2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBAddRow = ARGBAddRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBADDROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
     ARGBAddRow = ARGBAddRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
@@ -594,6 +641,12 @@ int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
     src_argb1 += src_stride_argb1;
     dst_argb += dst_stride_argb;
   }
+
+#if defined(HAS_ARGBADDROW_AVX2)
+  if (clear) {
+    __asm vzeroupper;
+  }
+#endif
   return 0;
 }
 
@@ -612,6 +665,15 @@
     dst_argb = dst_argb + (height - 1) * dst_stride_argb;
     dst_stride_argb = -dst_stride_argb;
   }
+  // Coalesce contiguous rows.
+  if (src_stride_argb0 == width * 4 &&
+      src_stride_argb1 == width * 4 &&
+      dst_stride_argb == width * 4) {
+    return ARGBSubtract(src_argb0, 0,
+                        src_argb1, 0,
+                        dst_argb, 0,
+                        width * height, 1);
+  }
   void (*ARGBSubtractRow)(const uint8* src0, const uint8* src1,
                           uint8* dst, int width) = ARGBSubtractRow_C;
 
@@ -625,7 +687,18 @@ int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0,
       ARGBSubtractRow = ARGBSubtractRow_SSE2;
     }
   }
-#elif defined(HAS_ARGBSUBTRACTROW_NEON)
+#endif
+#if defined(HAS_ARGBSUBTRACTROW_AVX2)
+  bool clear = false;
+  if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
+    clear = true;
+    ARGBSubtractRow = ARGBSubtractRow_Any_AVX2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBSubtractRow = ARGBSubtractRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBSUBTRACTROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
     ARGBSubtractRow = ARGBSubtractRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
@@ -641,6 +714,12 @@ int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0,
     src_argb1 += src_stride_argb1;
     dst_argb += dst_stride_argb;
   }
+
+#if defined(HAS_ARGBSUBTRACTROW_AVX2)
+  if (clear) {
+    __asm vzeroupper;
+  }
+#endif
   return 0;
 }
diff --git a/source/row_any.cc b/source/row_any.cc
index 6b434f384..723a56652 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -420,6 +420,17 @@ MATHROW_ANY(ARGBAddRow_Any_SSE2, ARGBAddRow_SSE2, ARGBAddRow_C, 3)
 MATHROW_ANY(ARGBSubtractRow_Any_SSE2, ARGBSubtractRow_SSE2, ARGBSubtractRow_C,
             3)
 #endif
+#ifdef HAS_ARGBMULTIPLYROW_AVX2
+MATHROW_ANY(ARGBMultiplyRow_Any_AVX2, ARGBMultiplyRow_AVX2, ARGBMultiplyRow_C,
+            7)
+#endif
+#ifdef HAS_ARGBADDROW_AVX2
+MATHROW_ANY(ARGBAddRow_Any_AVX2, ARGBAddRow_AVX2, ARGBAddRow_C, 7)
+#endif
+#ifdef HAS_ARGBSUBTRACTROW_AVX2
+MATHROW_ANY(ARGBSubtractRow_Any_AVX2, ARGBSubtractRow_AVX2, ARGBSubtractRow_C,
+            7)
+#endif
 #ifdef HAS_ARGBMULTIPLYROW_NEON
 MATHROW_ANY(ARGBMultiplyRow_Any_NEON, ARGBMultiplyRow_NEON, ARGBMultiplyRow_C,
             7)
diff --git a/source/row_common.cc b/source/row_common.cc
index 61bf5e269..ee5c4b9e7 100644
--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -751,7 +751,7 @@ void ARGBAddRow_C(const uint8* src_argb0, const uint8* src_argb1,
 }
 #undef SHADE
 
-#define SHADE(f, v) (v >= f) ? 0 : (f - v)
+#define SHADE(f, v) ((f - v) > f) ? 0 : (f - v)
 
 void ARGBSubtractRow_C(const uint8* src_argb0, const uint8* src_argb1,
                        uint8* dst_argb, int width) {
diff --git a/source/row_win.cc b/source/row_win.cc
index bb070d06d..a05828c5e 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -4915,6 +4915,102 @@ void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
 }
 #endif  // HAS_ARGBSUBTRACTROW_SSE2
 
+#ifdef HAS_ARGBMULTIPLYROW_AVX2
+// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
+__declspec(naked) __declspec(align(16))
+void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
+                          uint8* dst_argb, int width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]   // src_argb0
+    mov        esi, [esp + 4 + 8]   // src_argb1
+    mov        edx, [esp + 4 + 12]  // dst_argb
+    mov        ecx, [esp + 4 + 16]  // width
+    vpxor      ymm5, ymm5, ymm5     // constant 0
+    sub        esi, eax
+    sub        edx, eax
+
+    align      16
+ convertloop:
+    vmovdqu    ymm1, [eax]        // read 8 pixels from src_argb0
+    vmovdqu    ymm3, [eax + esi]  // read 8 pixels from src_argb1
+    vpunpcklbw ymm0, ymm1, ymm1   // low 4
+    vpunpckhbw ymm1, ymm1, ymm1   // high 4
+    vpunpcklbw ymm2, ymm3, ymm5   // low 4
+    vpunpckhbw ymm3, ymm3, ymm5   // high 4
+    vpmulhuw   ymm0, ymm0, ymm2   // src_argb0 * src_argb1 low 4
+    vpmulhuw   ymm1, ymm1, ymm3   // src_argb0 * src_argb1 high 4
+    vpackuswb  ymm0, ymm0, ymm1
+    sub        ecx, 8
+    vmovdqu    [eax + edx], ymm0
+    lea        eax, [eax + 32]
+    jg         convertloop
+
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_ARGBMULTIPLYROW_AVX2
+
+#ifdef HAS_ARGBADDROW_AVX2
+// Add 2 rows of ARGB pixels together, 8 pixels at a time.
+__declspec(naked) __declspec(align(16))
+void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
+                     uint8* dst_argb, int width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]   // src_argb0
+    mov        esi, [esp + 4 + 8]   // src_argb1
+    mov        edx, [esp + 4 + 12]  // dst_argb
+    mov        ecx, [esp + 4 + 16]  // width
+    vpxor      ymm5, ymm5, ymm5     // constant 0
+    sub        esi, eax
+    sub        edx, eax
+
+    align      16
+ convertloop:
+    vmovdqu    ymm0, [eax]        // read 8 pixels from src_argb0
+    vpaddusb   ymm0, ymm0, [eax + esi]  // add 8 pixels from src_argb1
+    sub        ecx, 8
+    vmovdqu    [eax + edx], ymm0
+    lea        eax, [eax + 32]
+    jg         convertloop
+
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_ARGBADDROW_AVX2
+
+#ifdef HAS_ARGBSUBTRACTROW_AVX2
+// Subtract 2 rows of ARGB pixels together, 8 pixels at a time.
+__declspec(naked) __declspec(align(16))
+void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
+                          uint8* dst_argb, int width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]   // src_argb0
+    mov        esi, [esp + 4 + 8]   // src_argb1
+    mov        edx, [esp + 4 + 12]  // dst_argb
+    mov        ecx, [esp + 4 + 16]  // width
+    sub        esi, eax
+    sub        edx, eax
+
+    align      16
+ convertloop:
+    vmovdqu    ymm0, [eax]        // read 8 pixels from src_argb0
+    vpsubusb   ymm0, ymm0, [eax + esi]  // src_argb0 - src_argb1
+    sub        ecx, 8
+    vmovdqu    [eax + edx], ymm0
+    lea        eax, [eax + 32]
+    jg         convertloop
+
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_ARGBSUBTRACTROW_AVX2
+
 #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
 // Consider float CumulativeSum.
 // Consider calling CumulativeSum one row at time as needed.