From bea690b3e03d24f77fea45c9a8592ea480a4acd8 Mon Sep 17 00:00:00 2001
From: Frank Barchard
Date: Sat, 5 Dec 2015 22:23:29 -0800
Subject: [PATCH] AVX2 YUV alpha blender and improved unittests

AVX2 version can process 16 pixels at a time for improved memory
bandwidth and fewer instructions. Unit tests improved to test unaligned
memory and to verify exactness when alpha is 0 or 255.

R=dhrosa@google.com, harryjin@google.com
BUG=libyuv:527

Review URL: https://codereview.chromium.org/1505433002 .
---
 README.chromium                   |   2 +-
 include/libyuv/planar_functions.h |  26 +++
 include/libyuv/row.h              |  13 +-
 include/libyuv/version.h          |   2 +-
 source/planar_functions.cc        | 162 ++++++++++++++++++
 source/row_gcc.cc                 |  51 +++++-
 source/row_win.cc                 |  62 ++++++-
 unit_test/planar_test.cc          | 267 ++++++++++++++++++++++++++----
 8 files changed, 539 insertions(+), 46 deletions(-)

diff --git a/README.chromium b/README.chromium
index 3b3aed2b2..b0bc90214 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1547
+Version: 1548
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/planar_functions.h b/include/libyuv/planar_functions.h
index 6d5dd082f..9d30225d4 100644
--- a/include/libyuv/planar_functions.h
+++ b/include/libyuv/planar_functions.h
@@ -302,6 +302,7 @@ LIBYUV_API
 ARGBBlendRow GetARGBBlend();
 
 // Alpha Blend ARGB images and store to destination.
+// Source is pre-multiplied by alpha using ARGBAttenuate.
 // Alpha of destination is set to 255.
 LIBYUV_API
 int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
@@ -309,6 +310,31 @@ int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
               uint8* dst_argb, int dst_stride_argb,
               int width, int height);
 
+// Alpha Blend plane and store to destination.
+// Source is not pre-multiplied by alpha.
+LIBYUV_API
+int BlendPlane(const uint8* src_y0, int src_stride_y0,
+               const uint8* src_y1, int src_stride_y1,
+               const uint8* alpha, int alpha_stride,
+               uint8* dst_y, int dst_stride_y,
+               int width, int height);
+
+// Alpha Blend YUV images and store to destination.
+// Source is not pre-multiplied by alpha.
+// Alpha is full width x height and subsampled to half size to apply to UV.
+LIBYUV_API
+int I420Blend(const uint8* src_y0, int src_stride_y0,
+              const uint8* src_u0, int src_stride_u0,
+              const uint8* src_v0, int src_stride_v0,
+              const uint8* src_y1, int src_stride_y1,
+              const uint8* src_u1, int src_stride_u1,
+              const uint8* src_v1, int src_stride_v1,
+              const uint8* alpha, int alpha_stride,
+              uint8* dst_y, int dst_stride_y,
+              uint8* dst_u, int dst_stride_u,
+              uint8* dst_v, int dst_stride_v,
+              int width, int height);
+
 // Multiply ARGB image by ARGB image. Shifted down by 8. Saturates to 255.
 LIBYUV_API
 int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index deed8a422..cf96c0516 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -233,6 +233,7 @@ extern "C" {
 #define HAS_ARGBMULTIPLYROW_AVX2
 #define HAS_ARGBSUBTRACTROW_AVX2
 #define HAS_ARGBUNATTENUATEROW_AVX2
+#define HAS_BLENDPLANEROW_AVX2
 #endif
 
 // The following are available for AVX2 Visual C and clangcl 32 bit:
@@ -253,12 +254,6 @@ extern "C" {
 #define HAS_RGB565TOARGBROW_AVX2
 #endif
 
-// The following are available for 32 bit Visual C and clangcl 32 bit:
-// TODO(fbarchard): Port to gcc.
-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
-#define HAS_BLENDPLANEROW_SSSE3
-#endif
-
 // The following are also available on x64 Visual C.
#if !defined(LIBYUV_DISABLE_X86) && defined (_M_X64) && \ (!defined(__clang__) || defined(__SSSE3__)) @@ -1464,6 +1459,12 @@ void ARGBBlendRow_C(const uint8* src_argb, const uint8* src_argb1, // Unattenuated planar alpha blend. void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1, const uint8* alpha, uint8* dst, int width); +void BlendPlaneRow_Any_SSSE3(const uint8* src0, const uint8* src1, + const uint8* alpha, uint8* dst, int width); +void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1, + const uint8* alpha, uint8* dst, int width); +void BlendPlaneRow_Any_AVX2(const uint8* src0, const uint8* src1, + const uint8* alpha, uint8* dst, int width); void BlendPlaneRow_C(const uint8* src0, const uint8* src1, const uint8* alpha, uint8* dst, int width); diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 10754081b..2c47a4c9f 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1547 +#define LIBYUV_VERSION 1548 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/planar_functions.cc b/source/planar_functions.cc index b15b6e523..85425feaf 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -17,6 +17,7 @@ #include "libyuv/mjpeg_decoder.h" #endif #include "libyuv/row.h" +#include "libyuv/scale_row.h" // for ScaleRowDown2 #ifdef __cplusplus namespace libyuv { @@ -577,6 +578,167 @@ int ARGBBlend(const uint8* src_argb0, int src_stride_argb0, return 0; } +// Alpha Blend plane and store to destination. +LIBYUV_API +int BlendPlane(const uint8* src_y0, int src_stride_y0, + const uint8* src_y1, int src_stride_y1, + const uint8* alpha, int alpha_stride, + uint8* dst_y, int dst_stride_y, + int width, int height) { + int y; + void (*BlendPlaneRow)(const uint8* src0, const uint8* src1, + const uint8* alpha, uint8* dst, int width) = BlendPlaneRow_C; + if (!src_y0 || !src_y1 || !alpha || !dst_y || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_y = dst_y + (height - 1) * dst_stride_y; + dst_stride_y = -dst_stride_y; + } + + // Coalesce rows for Y plane. + if (src_stride_y0 == width && + src_stride_y1 == width && + alpha_stride == width && + dst_stride_y == width) { + width *= height; + height = 1; + src_stride_y0 = src_stride_y1 = alpha_stride = dst_stride_y = 0; + } + +#if defined(HAS_BLENDPLANEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { +// TODO(fbarchard): Implement any versions for odd width. +// BlendPlaneRow = BlendPlaneRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + BlendPlaneRow = BlendPlaneRow_SSSE3; + } + } +#endif +#if defined(HAS_BLENDPLANEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { +// BlendPlaneRow = BlendPlaneRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + BlendPlaneRow = BlendPlaneRow_AVX2; + } + } +#endif + + for (y = 0; y < height; ++y) { + BlendPlaneRow(src_y0, src_y1, alpha, dst_y, width); + src_y0 += src_stride_y0; + src_y1 += src_stride_y1; + alpha += alpha_stride; + dst_y += dst_stride_y; + } + return 0; +} + +#define MAXTWIDTH 2048 +// Alpha Blend YUV images and store to destination. 
+LIBYUV_API
+int I420Blend(const uint8* src_y0, int src_stride_y0,
+              const uint8* src_u0, int src_stride_u0,
+              const uint8* src_v0, int src_stride_v0,
+              const uint8* src_y1, int src_stride_y1,
+              const uint8* src_u1, int src_stride_u1,
+              const uint8* src_v1, int src_stride_v1,
+              const uint8* alpha, int alpha_stride,
+              uint8* dst_y, int dst_stride_y,
+              uint8* dst_u, int dst_stride_u,
+              uint8* dst_v, int dst_stride_v,
+              int width, int height) {
+  int y;
+  void (*BlendPlaneRow)(const uint8* src0, const uint8* src1,
+      const uint8* alpha, uint8* dst, int width) = BlendPlaneRow_C;
+  void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride,
+      uint8* dst_ptr, int dst_width) = ScaleRowDown2Box_C;
+  if (!src_y0 || !src_u0 || !src_v0 || !src_y1 || !src_u1 || !src_v1 ||
+      !alpha || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+    return -1;
+  }
+
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_y = dst_y + (height - 1) * dst_stride_y;
+    dst_stride_y = -dst_stride_y;
+  }
+
+  // Blend Y plane.
+  BlendPlane(src_y0, src_stride_y0,
+             src_y1, src_stride_y1,
+             alpha, alpha_stride,
+             dst_y, dst_stride_y,
+             width, height);
+
+  // Half width/height for UV.
+  width = (width + 1) >> 1;
+  height = (height + 1) >> 1;
+
+#if defined(HAS_BLENDPLANEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+// TODO(fbarchard): Implement any versions for odd width.
+//    BlendPlaneRow = BlendPlaneRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      BlendPlaneRow = BlendPlaneRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_BLENDPLANEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+//    BlendPlaneRow = BlendPlaneRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      BlendPlaneRow = BlendPlaneRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_SCALEROWDOWN2_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ScaleRowDown2 = ScaleRowDown2Box_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ScaleRowDown2 = ScaleRowDown2Box_NEON;
+    }
+  }
+#endif
+#if defined(HAS_SCALEROWDOWN2_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ScaleRowDown2 = ScaleRowDown2Box_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      ScaleRowDown2 = ScaleRowDown2Box_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_SCALEROWDOWN2_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ScaleRowDown2 = ScaleRowDown2Box_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ScaleRowDown2 = ScaleRowDown2Box_AVX2;
+    }
+  }
+#endif
+
+  // Row buffer for intermediate alpha pixels.
+  align_buffer_64(halfalpha, width);
+  for (y = 0; y < height; ++y) {
+    // Subsample 2 rows of alpha to half width and half height.
+    ScaleRowDown2(alpha, alpha_stride, halfalpha, width);
+    alpha += alpha_stride * 2;
+    BlendPlaneRow(src_u0, src_u1, halfalpha, dst_u, width);
+    BlendPlaneRow(src_v0, src_v1, halfalpha, dst_v, width);
+    src_u0 += src_stride_u0;
+    src_u1 += src_stride_u1;
+    dst_u += dst_stride_u;
+    src_v0 += src_stride_v0;
+    src_v1 += src_stride_v1;
+    dst_v += dst_stride_v;
+  }
+  free_aligned_buffer_64(halfalpha);
+  return 0;
+}
+
 // Multiply 2 ARGB images and store to destination.
 LIBYUV_API
 int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
diff --git a/source/row_gcc.cc b/source/row_gcc.cc
index c3ff96282..12c7dd884 100644
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -3467,7 +3467,6 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
 }
 #endif  // HAS_ARGBBLENDROW_SSSE3
 
-
 #ifdef HAS_BLENDPLANEROW_SSSE3
 // Blend 8 pixels at a time.
// =((G2*C2)+(H2*(D2))+32768+127)/256 @@ -3514,6 +3513,56 @@ void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1, } #endif // HAS_BLENDPLANEROW_SSSE3 +#ifdef HAS_BLENDPLANEROW_AVX2 +// Blend 16 pixels at a time. +// =((G2*C2)+(H2*(D2))+32768+127)/256 +void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1, + const uint8* alpha, uint8* dst, int width) { + asm volatile ( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsllw $0x8,%%ymm5,%%ymm5 \n" + "mov $0x80808080,%%eax \n" + "vmovd %%eax,%%xmm6 \n" + "vbroadcastss %%xmm6,%%ymm6 \n" + "mov $0x807f807f,%%eax \n" + "vmovd %%eax,%%xmm7 \n" + "vbroadcastss %%xmm7,%%ymm7 \n" + "sub %2,%0 \n" + "sub %2,%1 \n" + "sub %2,%3 \n" + + // 16 pixel loop. + LABELALIGN + "1: \n" + "vmovdqu (%2),%%xmm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" + "vpxor %%ymm5,%%ymm0,%%ymm0 \n" + "vmovdqu (%0,%2,1),%%xmm1 \n" + "vmovdqu (%1,%2,1),%%xmm2 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm2,%%ymm2 \n" + "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n" + "vpsubb %%ymm6,%%ymm1,%%ymm1 \n" + "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm7,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%xmm0,(%3,%2,1) \n" + "lea 0x10(%2),%2 \n" + "sub $0x10,%4 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src0), // %0 + "+r"(src1), // %1 + "+r"(alpha), // %2 + "+r"(dst), // %3 + "+r"(width) // %4 + :: "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7" + ); +} +#endif // HAS_BLENDPLANEROW_AVX2 #ifdef HAS_ARGBATTENUATEROW_SSSE3 // Shuffle table duplicating alpha diff --git a/source/row_win.cc b/source/row_win.cc index e3353cabf..13076ce60 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -525,7 +525,7 @@ void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb, vmovd xmm5, eax vbroadcastss ymm5, xmm5 mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits - movd xmm6, eax + vmovd xmm6, eax vbroadcastss ymm6, xmm6 vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red vpsllw ymm3, ymm3, 11 @@ -576,7 +576,7 @@ void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb, vmovd xmm5, eax vbroadcastss ymm5, xmm5 mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits - movd xmm6, eax + vmovd xmm6, eax vbroadcastss ymm6, xmm6 vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red vpsllw ymm3, ymm3, 11 @@ -4106,7 +4106,7 @@ void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1, movq qword ptr [edi + esi], xmm0 lea esi, [esi + 8] sub ecx, 8 - jge convertloop8 + jg convertloop8 pop edi pop esi @@ -4115,6 +4115,62 @@ void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1, } #endif // HAS_BLENDPLANEROW_SSSE3 +#ifdef HAS_BLENDPLANEROW_AVX2 +// Blend 16 pixels at a time. +// =((G2*C2)+(H2*(D2))+32768+127)/256 +__declspec(naked) +void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1, + const uint8* alpha, uint8* dst, int width) { + __asm { + push esi + push edi + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff00ff00 + vpsllw ymm5, ymm5, 8 + mov eax, 0x80808080 // 128 for biasing image to signed. + vmovd xmm6, eax + vbroadcastss ymm6, xmm6 + mov eax, 0x807f807f // 32768 + 127 for unbias and round. 
+ vmovd xmm7, eax + vbroadcastss ymm7, xmm7 + mov eax, [esp + 8 + 4] // src0 + mov edx, [esp + 8 + 8] // src1 + mov esi, [esp + 8 + 12] // alpha + mov edi, [esp + 8 + 16] // dst + mov ecx, [esp + 8 + 20] // width + sub eax, esi + sub edx, esi + sub edi, esi + + // 16 pixel loop. + convertloop16: + vmovdqu xmm0, [esi] // alpha + vpermq ymm0, ymm0, 0xd8 + vpunpcklbw ymm0, ymm0, ymm0 + vpxor ymm0, ymm0, ymm5 // a, 255-a + vmovdqu xmm1, [eax + esi] // src0 + vmovdqu xmm2, [edx + esi] // src1 + vpermq ymm1, ymm1, 0xd8 + vpermq ymm2, ymm2, 0xd8 + vpunpcklbw ymm1, ymm1, ymm2 + vpsubb ymm1, ymm1, ymm6 // bias src0/1 - 128 + vpmaddubsw ymm0, ymm0, ymm1 + vpaddw ymm0, ymm0, ymm7 // unbias result - 32768 and round. + vpsrlw ymm0, ymm0, 8 + vpackuswb ymm0, ymm0, ymm0 + vpermq ymm0, ymm0, 0xd8 + vmovdqu [edi + esi], xmm0 + lea esi, [esi + 16] + sub ecx, 16 + jg convertloop16 + + pop edi + pop esi + vzeroupper + ret + } +} +#endif // HAS_BLENDPLANEROW_AVX2 + #ifdef HAS_ARGBBLENDROW_SSSE3 // Shuffle table for isolating alpha. static const uvec8 kShuffleAlpha = { diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc index fc22fe139..f5a8b2129 100644 --- a/unit_test/planar_test.cc +++ b/unit_test/planar_test.cc @@ -1163,16 +1163,14 @@ TEST_F(LibYUVPlanarTest, ARGBBlend_Opt) { EXPECT_LE(max_diff, 1); } -#ifdef HAS_BLENDPLANEROW_SSSE3 +#ifdef HAS_BLENDPLANEROW_AVX2 // TODO(fbarchard): Switch to I420Blend. -static void TestBlendPlane(int width, int height, int benchmark_iterations, - int invert, int off) { +static void TestBlendPlaneRow(int width, int height, int benchmark_iterations, + int invert, int off) { int has_ssse3 = TestCpuFlag(kCpuHasSSSE3); + int has_avx2 = TestCpuFlag(kCpuHasAVX2); width = width * height; height = 1; - if (width < 1) { - width = 1; - } if (width < 256) { width = 256; } @@ -1181,23 +1179,39 @@ static void TestBlendPlane(int width, int height, int benchmark_iterations, align_buffer_64(src_argb_a, kStride * height + off); align_buffer_64(src_argb_b, kStride * height + off); align_buffer_64(src_argb_alpha, kStride * height + off); - align_buffer_64(dst_argb_c, kStride * height); - align_buffer_64(dst_argb_opt, kStride * height); + align_buffer_64(dst_argb_c, kStride * height + off); + align_buffer_64(dst_argb_opt, kStride * height + off); + memset(dst_argb_c, 255, kStride * height + off); + memset(dst_argb_opt, 255, kStride * height + off); if (has_ssse3) { - for (int i = 0; i < 255; ++i) { - src_argb_a[i] = i; - src_argb_b[i] = 255 - i; - src_argb_alpha[i] = 255; + // Test source is maintained exactly if alpha is 255. + for (int i = 0; i < 256; ++i) { + src_argb_a[i + off] = i; + src_argb_b[i + off] = 255 - i; + src_argb_alpha[i + off] = 255; } - memset(dst_argb_opt, 0xfb, kStride * height); BlendPlaneRow_SSSE3(src_argb_a + off, src_argb_b + off, src_argb_alpha + off, - dst_argb_opt, - width * height); - for (int i = 0; i < kStride * height; ++i) { - EXPECT_EQ(src_argb_a[i], dst_argb_opt[i]); + dst_argb_opt + off, + 256); + for (int i = 0; i < 256; ++i) { + EXPECT_EQ(src_argb_a[i + off], dst_argb_opt[i + off]); + } + // Test destination is maintained exactly if alpha is 0. 
+ for (int i = 0; i < 256; ++i) { + src_argb_a[i + off] = i; + src_argb_b[i + off] = 255 - i; + src_argb_alpha[i + off] = 0; + } + BlendPlaneRow_SSSE3(src_argb_a + off, + src_argb_b + off, + src_argb_alpha + off, + dst_argb_opt + off, + 256); + for (int i = 0; i < 256; ++i) { + EXPECT_EQ(src_argb_b[i + off], dst_argb_opt[i + off]); } } for (int i = 0; i < kStride * height; ++i) { @@ -1205,34 +1219,122 @@ static void TestBlendPlane(int width, int height, int benchmark_iterations, src_argb_b[i + off] = (fastrand() & 0xff); src_argb_alpha[i + off] = (fastrand() & 0xff); } - memset(dst_argb_c, 255, kStride * height); - memset(dst_argb_opt, 255, kStride * height); BlendPlaneRow_C(src_argb_a + off, src_argb_b + off, src_argb_alpha + off, - dst_argb_c, + dst_argb_c + off, width * height); for (int i = 0; i < benchmark_iterations; ++i) { - if (has_ssse3) { - BlendPlaneRow_SSSE3(src_argb_a + off, - src_argb_b + off, - src_argb_alpha + off, - dst_argb_opt, - width * height); + if (has_avx2) { + BlendPlaneRow_AVX2(src_argb_a + off, + src_argb_b + off, + src_argb_alpha + off, + dst_argb_opt + off, + width * height); } else { - BlendPlaneRow_C(src_argb_a + off, - src_argb_b + off, - src_argb_alpha + off, - dst_argb_opt, - width * height); + if (has_ssse3) { + BlendPlaneRow_SSSE3(src_argb_a + off, + src_argb_b + off, + src_argb_alpha + off, + dst_argb_opt + off, + width * height); + } else { + BlendPlaneRow_C(src_argb_a + off, + src_argb_b + off, + src_argb_alpha + off, + dst_argb_opt + off, + width * height); + } } } for (int i = 0; i < kStride * height; ++i) { - EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]); + EXPECT_EQ(dst_argb_c[i + off], dst_argb_opt[i + off]); } free_aligned_buffer_64(src_argb_a); free_aligned_buffer_64(src_argb_b); + free_aligned_buffer_64(src_argb_alpha); + free_aligned_buffer_64(dst_argb_c); + free_aligned_buffer_64(dst_argb_opt); + return; +} + +TEST_F(LibYUVPlanarTest, BlendPlaneRow_Opt) { + TestBlendPlaneRow(benchmark_width_, benchmark_height_, benchmark_iterations_, + +1, 0); +} +TEST_F(LibYUVPlanarTest, BlendPlaneRow_Unaligned) { + TestBlendPlaneRow(benchmark_width_, benchmark_height_, benchmark_iterations_, + +1, 1); +} +#endif + +static void TestBlendPlane(int width, int height, int benchmark_iterations, + int disable_cpu_flags, int benchmark_cpu_info, + int invert, int off) { + if (width < 1) { + width = 1; + } + const int kBpp = 1; + const int kStride = width * kBpp; + align_buffer_64(src_argb_a, kStride * height + off); + align_buffer_64(src_argb_b, kStride * height + off); + align_buffer_64(src_argb_alpha, kStride * height + off); + align_buffer_64(dst_argb_c, kStride * height + off); + align_buffer_64(dst_argb_opt, kStride * height + off); + memset(dst_argb_c, 255, kStride * height + off); + memset(dst_argb_opt, 255, kStride * height + off); + + // Test source is maintained exactly if alpha is 255. + for (int i = 0; i < width; ++i) { + src_argb_a[i + off] = i & 255; + src_argb_b[i + off] = 255 - (i & 255); + } + memset(src_argb_alpha + off, 255, width); + BlendPlane(src_argb_a + off, width, + src_argb_b + off, width, + src_argb_alpha + off, width, + dst_argb_opt + off, width, + width, 1); + for (int i = 0; i < width; ++i) { + EXPECT_EQ(src_argb_a[i + off], dst_argb_opt[i + off]); + } + // Test destination is maintained exactly if alpha is 0. 
+ memset(src_argb_alpha + off, 0, width); + BlendPlane(src_argb_a + off, width, + src_argb_b + off, width, + src_argb_alpha + off, width, + dst_argb_opt + off, width, + width, 1); + for (int i = 0; i < width; ++i) { + EXPECT_EQ(src_argb_b[i + off], dst_argb_opt[i + off]); + } + for (int i = 0; i < kStride * height; ++i) { + src_argb_a[i + off] = (fastrand() & 0xff); + src_argb_b[i + off] = (fastrand() & 0xff); + src_argb_alpha[i + off] = (fastrand() & 0xff); + } + + MaskCpuFlags(disable_cpu_flags); + BlendPlane(src_argb_a + off, width, + src_argb_b + off, width, + src_argb_alpha + off, width, + dst_argb_c + off, width, + width, height); + MaskCpuFlags(benchmark_cpu_info); + for (int i = 0; i < benchmark_iterations; ++i) { + BlendPlane(src_argb_a + off, width, + src_argb_b + off, width, + src_argb_alpha + off, width, + dst_argb_opt + off, width, + width, height); + } + for (int i = 0; i < kStride * height; ++i) { + EXPECT_EQ(dst_argb_c[i + off], dst_argb_opt[i + off]); + } + free_aligned_buffer_64(src_argb_a); + free_aligned_buffer_64(src_argb_b); + free_aligned_buffer_64(src_argb_alpha); free_aligned_buffer_64(dst_argb_c); free_aligned_buffer_64(dst_argb_opt); return; @@ -1240,9 +1342,106 @@ static void TestBlendPlane(int width, int height, int benchmark_iterations, TEST_F(LibYUVPlanarTest, BlendPlane_Opt) { TestBlendPlane(benchmark_width_, benchmark_height_, benchmark_iterations_, - +1, 0); + disable_cpu_flags_, benchmark_cpu_info_, +1, 0); +} +TEST_F(LibYUVPlanarTest, BlendPlane_Unaligned) { + TestBlendPlane(benchmark_width_, benchmark_height_, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_, +1, 1); +} + +#define SUBSAMPLE(v, a) ((((v) + (a) - 1)) / (a)) + +static void TestI420Blend(int width, int height, int benchmark_iterations, + int disable_cpu_flags, int benchmark_cpu_info, + int invert, int off) { + width = ((width) > 0) ? 
(width) : 1; + const int kStrideUV = SUBSAMPLE(width, 2); + const int kSizeUV = kStrideUV * SUBSAMPLE(height, 2); + align_buffer_64(src_y0, width * height + off); + align_buffer_64(src_u0, kSizeUV + off); + align_buffer_64(src_v0, kSizeUV + off); + align_buffer_64(src_y1, width * height + off); + align_buffer_64(src_u1, kSizeUV + off); + align_buffer_64(src_v1, kSizeUV + off); + align_buffer_64(src_a, width * height + off); + align_buffer_64(dst_y_c, width * height + off); + align_buffer_64(dst_u_c, kSizeUV + off); + align_buffer_64(dst_v_c, kSizeUV + off); + align_buffer_64(dst_y_opt, width * height + off); + align_buffer_64(dst_u_opt, kSizeUV + off); + align_buffer_64(dst_v_opt, kSizeUV + off); + + MemRandomize(src_y0, width * height + off); + MemRandomize(src_u0, kSizeUV + off); + MemRandomize(src_v0, kSizeUV + off); + MemRandomize(src_y1, width * height + off); + MemRandomize(src_u1, kSizeUV + off); + MemRandomize(src_v1, kSizeUV + off); + MemRandomize(src_a, width * height + off); + memset(dst_y_c, 255, width * height + off); + memset(dst_u_c, 255, kSizeUV + off); + memset(dst_v_c, 255, kSizeUV + off); + memset(dst_y_opt, 255, width * height + off); + memset(dst_u_opt, 255, kSizeUV + off); + memset(dst_v_opt, 255, kSizeUV + off); + + MaskCpuFlags(disable_cpu_flags); + I420Blend(src_y0 + off, width, + src_u0 + off, kStrideUV, + src_v0 + off, kStrideUV, + src_y1 + off, width, + src_u1 + off, kStrideUV, + src_v1 + off, kStrideUV, + src_a + off, width, + dst_y_c + off, width, + dst_u_c + off, kStrideUV, + dst_v_c + off, kStrideUV, + width, height); + MaskCpuFlags(benchmark_cpu_info); + for (int i = 0; i < benchmark_iterations; ++i) { + I420Blend(src_y0 + off, width, + src_u0 + off, kStrideUV, + src_v0 + off, kStrideUV, + src_y1 + off, width, + src_u1 + off, kStrideUV, + src_v1 + off, kStrideUV, + src_a + off, width, + dst_y_opt + off, width, + dst_u_opt + off, kStrideUV, + dst_v_opt + off, kStrideUV, + width, height); + } + for (int i = 0; i < width * height; ++i) { + EXPECT_EQ(dst_y_c[i + off], dst_y_opt[i + off]); + } + for (int i = 0; i < kSizeUV; ++i) { + EXPECT_NEAR(dst_u_c[i + off], dst_u_opt[i + off], 1); // Subsample off by 1 + EXPECT_NEAR(dst_v_c[i + off], dst_v_opt[i + off], 1); + } + free_aligned_buffer_64(src_y0); + free_aligned_buffer_64(src_u0); + free_aligned_buffer_64(src_v0); + free_aligned_buffer_64(src_y1); + free_aligned_buffer_64(src_u1); + free_aligned_buffer_64(src_v1); + free_aligned_buffer_64(src_a); + free_aligned_buffer_64(dst_y_c); + free_aligned_buffer_64(dst_u_c); + free_aligned_buffer_64(dst_v_c); + free_aligned_buffer_64(dst_y_opt); + free_aligned_buffer_64(dst_u_opt); + free_aligned_buffer_64(dst_v_opt); + return; +} + +TEST_F(LibYUVPlanarTest, I420Blend_Opt) { + TestI420Blend(benchmark_width_, benchmark_height_, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_, +1, 0); +} +TEST_F(LibYUVPlanarTest, I420Blend_Unaligned) { + TestI420Blend(benchmark_width_, benchmark_height_, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_, +1, 1); } -#endif TEST_F(LibYUVPlanarTest, TestAffine) { SIMD_ALIGNED(uint8 orig_pixels_0[1280][4]);
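
A note on the row math above: per pixel, BlendPlaneRow_C and the SSSE3/AVX2
rows compute dst = (src0 * a + src1 * (255 - a) + 255) >> 8. In the vector
code, the 0x80808080 constant biases the interleaved source bytes into signed
range so vpmaddubsw can form a * (src0 - 128) + (255 - a) * (src1 - 128), and
the 0x807f807f constant adds 128 * 255 + 255 = 32768 + 127 per word to undo
the bias and round before the shift. Below is a minimal scalar sketch of that
formula, checking the exactness property the new unittests rely on; BlendPixel
is a hypothetical helper for illustration, not a libyuv function.

/* Scalar model of the planar blend used by the BlendPlaneRow functions.
 * Sketch for illustration only. */
#include <assert.h>
#include <stdint.h>

static uint8_t BlendPixel(uint8_t src0, uint8_t src1, uint8_t a) {
  /* dst = (src0 * a + src1 * (255 - a) + 255) >> 8.
   * The +255 rounding term makes the endpoints exact, which is what the
   * alpha == 0 and alpha == 255 unittests verify. */
  return (uint8_t)((src0 * a + src1 * (255 - a) + 255) >> 8);
}

int main(void) {
  int v;
  for (v = 0; v < 256; ++v) {
    assert(BlendPixel((uint8_t)v, (uint8_t)(255 - v), 255) == v);      /* alpha 255 keeps src0 */
    assert(BlendPixel((uint8_t)v, (uint8_t)(255 - v), 0) == 255 - v);  /* alpha 0 keeps src1 */
  }
  return 0;
}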
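
For callers of the new public entry point, a minimal usage sketch of
I420Blend, assuming contiguous I420 planes whose strides equal their widths
(half width for U and V); the wrapper name and buffers are hypothetical:

#include "libyuv/planar_functions.h"

// Hypothetical wrapper: blends foreground over background according to the
// full-resolution alpha plane. I420Blend box-filters alpha 2x2 internally
// before blending the half-size U and V planes.
void BlendI420Frames(const uint8* fg_y, const uint8* fg_u, const uint8* fg_v,
                     const uint8* bg_y, const uint8* bg_u, const uint8* bg_v,
                     const uint8* alpha,  // width x height, not premultiplied
                     uint8* dst_y, uint8* dst_u, uint8* dst_v,
                     int width, int height) {
  int half_width = (width + 1) / 2;
  I420Blend(fg_y, width, fg_u, half_width, fg_v, half_width,
            bg_y, width, bg_u, half_width, bg_v, half_width,
            alpha, width,
            dst_y, width, dst_u, half_width, dst_v, half_width,
            width, height);
}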