NEON-optimized ARGB filter row for bilinear scale and Effects Interpolate.

BUG=none TEST=./libyuv_unittest --gtest_filter=*ARGBScale* Review URL: https://webrtc-codereview.appspot.com/964017 git-svn-id: http://libyuv.googlecode.com/svn/trunk@497 16f28f9a-4ce2-e073-06de-1de4eb20be90
2025-12-07 01:06:46 +08:00 · 2012-11-20 09:44:46 +00:00 · 2012-11-20 09:44:46 +00:00 · b5491759b4
commit b5491759b4
parent 958a0b0c19
8 changed files with 451 additions and 55 deletions
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@ -217,6 +217,9 @@ extern "C" {
 #define HAS_YUY2TOUVROW_NEON
 #define HAS_YUY2TOYROW_NEON
 #define HAS_ARGBMIRRORROW_NEON
 // Effects
 #define HAS_ARGBINTERPOLATEROW_NEON
 #endif
 // The following are available on Mips platforms
@ -1241,6 +1244,9 @@ void ARGBInterpolateRow_C(uint8* dst_argb, const uint8* src_argb,
 void ARGBInterpolateRow_SSSE3(uint8* dst_argb, const uint8* src_argb,
                              ptrdiff_t src_stride_argb, int dst_width,
                              int source_y_fraction);
 void ARGBInterpolateRow_NEON(uint8* dst_argb, const uint8* src_argb,
                             ptrdiff_t src_stride_argb, int dst_width,
                             int source_y_fraction);
 #ifdef __cplusplus
 }  // extern "C"
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@ -1121,6 +1121,9 @@ int ARGBShade(const uint8* src_argb, int src_stride_argb,
 }
 // Interpolate 2 ARGB images by specified amount (0 to 255).
 // TODO(fbarchard): Check width is multiple of 16.  Do Any version.
 // TODO(fbarchard): Consider selecting a specialized interpolator so
 //     interpolation doesn't need to be checked on each row.
 LIBYUV_API
 int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
                    const uint8* src_argb1, int src_stride_argb1,
@ -1145,6 +1148,11 @@ int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
    ARGBInterpolateRow = ARGBInterpolateRow_SSSE3;
  }
 #elif defined(HAS_ARGBINTERPOLATEROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) &&
      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
    ARGBInterpolateRow = ARGBInterpolateRow_NEON;
  }
 #endif
  for (int y = 0; y < height; ++y) {
    ARGBInterpolateRow(dst_argb, src_argb0, src_argb1 - src_argb0,
--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@ -2275,6 +2275,89 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) {
 }
 #endif  // HAS_RAWTOYROW_NEON
 // 4x2 -> 4x1
 // Same as ScaleARGBFilterRows_NEON but with last pixel not duplicated.
 //
 // Blends two ARGB rows into dst_ptr, 4 pixels (16 bytes) per loop iteration.
 //   dst_ptr           - destination row.
 //   src_ptr           - first source row.
 //   src_stride        - byte offset from src_ptr to the second source row.
 //   dst_width         - pixel count; every loop subtracts 4 and repeats while
 //                       the result is > 0, so whole groups of 4 are written.
 //   source_y_fraction - weight of the second row in 1/256 units. 0 copies the
 //                       first row; 64, 128 and 192 use vrhadd fast paths; any
 //                       other value uses the multiply path. Only the low 8
 //                       bits reach vdup.8, so 0..255 is presumably expected.
 void ARGBInterpolateRow_NEON(uint8* dst_ptr,
                             const uint8* src_ptr, ptrdiff_t src_stride,
                             int dst_width, int source_y_fraction) {
  asm volatile (
    // Fraction 0: only the first row is needed.
    "cmp          %4, #0                       \n"
    "beq          100f                         \n"
    // %2 becomes the address of the second row (src_ptr + src_stride).
    "add          %2, %1                       \n"
    "cmp          %4, #64                      \n"
    "beq          75f                          \n"
    "cmp          %4, #128                     \n"
    "beq          50f                          \n"
    "cmp          %4, #192                     \n"
    "beq          25f                          \n"
    // d5 = weight of second row (fraction), d4 = weight of first row
    // (256 - fraction), each replicated across 8 byte lanes.
    "vdup.8       d5, %4                       \n"
    "rsb          %4, #256                     \n"
    "vdup.8       d4, %4                       \n"
    // General purpose row blend.
  "1:                                          \n"
    "vld1.u8      {q0}, [%1]!                  \n"
    "vld1.u8      {q1}, [%2]!                  \n"
    "subs         %3, #4                       \n"
    // 16-bit accumulators: row0 * d4 + row1 * d5.
    "vmull.u8     q13, d0, d4                  \n"
    "vmull.u8     q14, d1, d4                  \n"
    "vmlal.u8     q13, d2, d5                  \n"
    "vmlal.u8     q14, d3, d5                  \n"
    // Rounding shift right by 8 narrows the sums back to bytes.
    "vrshrn.u16   d0, q13, #8                  \n"
    "vrshrn.u16   d1, q14, #8                  \n"
    "vst1.u8      {q0}, [%0]!                  \n"
    "bgt          1b                           \n"
    "b            99f                          \n"
    // Blend 25 / 75.
    // Taken for fraction 192: avg(avg(row0, row1), row1).
  "25:                                         \n"
    "vld1.u8      {q0}, [%1]!                  \n"
    "vld1.u8      {q1}, [%2]!                  \n"
    "subs         %3, #4                       \n"
    "vrhadd.u8    q0, q1                       \n"
    "vrhadd.u8    q0, q1                       \n"
    "vst1.u8      {q0}, [%0]!                  \n"
    "bgt          25b                          \n"
    "b            99f                          \n"
    // Blend 50 / 50.
    // Taken for fraction 128: a single rounding average of the two rows.
  "50:                                         \n"
    "vld1.u8      {q0}, [%1]!                  \n"
    "vld1.u8      {q1}, [%2]!                  \n"
    "subs         %3, #4                       \n"
    "vrhadd.u8    q0, q1                       \n"
    "vst1.u8      {q0}, [%0]!                  \n"
    "bgt          50b                          \n"
    "b            99f                          \n"
    // Blend 75 / 25.
    // Taken for fraction 64. The rows are loaded into swapped registers
    // (row0 -> q1, row1 -> q0), giving avg(avg(row1, row0), row0).
  "75:                                         \n"
    "vld1.u8      {q1}, [%1]!                  \n"
    "vld1.u8      {q0}, [%2]!                  \n"
    "subs         %3, #4                       \n"
    "vrhadd.u8    q0, q1                       \n"
    "vrhadd.u8    q0, q1                       \n"
    "vst1.u8      {q0}, [%0]!                  \n"
    "bgt          75b                          \n"
    "b            99f                          \n"
    // Blend 100 / 0 - Copy row unchanged.
  "100:                                        \n"
    "vld1.u8      {q0}, [%1]!                  \n"
    "subs         %3, #4                       \n"
    "vst1.u8      {q0}, [%0]!                  \n"
    "bgt          100b                         \n"
  "99:                                         \n"
  // All operands are read-write: the asm advances the pointers, counts
  // dst_width down and overwrites source_y_fraction.
  : "+r"(dst_ptr),          // %0
    "+r"(src_ptr),          // %1
    "+r"(src_stride),       // %2
    "+r"(dst_width),        // %3
    "+r"(source_y_fraction) // %4
  :
  : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc"
  );
 }
 #endif  // __ARM_NEON__
 #ifdef __cplusplus
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@ -4241,17 +4241,23 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
 }
 #endif  // HAS_ARGBAFFINEROW_SSE2
-// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
+// Bilinear image filtering.
-void ARGBInterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+// Same as ScaleARGBFilterRows_SSSE3 but without last pixel duplicated.
 void ARGBInterpolateRow_SSSE3(uint8* dst_argb, const uint8* src_argb,
                              ptrdiff_t src_stride, int dst_width,
                              int source_y_fraction) {
  asm volatile (
    "sub       %1,%0                           \n"
    "shr       %3                              \n"
    "cmp       $0x0,%3                         \n"
-    "je        2f                              \n"
+    "je        100f                            \n"
    "cmp       $0x20,%3                        \n"
    "je        75f                             \n"
    "cmp       $0x40,%3                        \n"
-    "je        3f                              \n"
+    "je        50f                             \n"
    "cmp       $0x60,%3                        \n"
    "je        25f                             \n"
    "movd      %3,%%xmm0                       \n"
    "neg       %3                              \n"
    "add       $0x80,%3                        \n"
@ -4259,6 +4265,8 @@ void ARGBInterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
    "punpcklbw %%xmm0,%%xmm5                   \n"
    "punpcklwd %%xmm5,%%xmm5                   \n"
    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
    // General purpose row blend.
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqa    (%1),%%xmm0                     \n"
@ -4275,27 +4283,59 @@ void ARGBInterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
    "movdqa    %%xmm0,(%1,%0,1)                \n"
    "lea       0x10(%1),%1                     \n"
    "jg        1b                              \n"
-    "jmp       4f                              \n"
+    "jmp       99f                             \n"
    // Blend 25 / 75.
    ".p2align  4                               \n"
-  "2:                                          \n"
+  "25:                                         \n"
    "movdqa    (%1),%%xmm0                     \n"
    "movdqa    (%1,%4,1),%%xmm1                \n"
    "pavgb     %%xmm1,%%xmm0                   \n"
    "pavgb     %%xmm1,%%xmm0                   \n"
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0,(%1,%0,1)                \n"
    "lea       0x10(%1),%1                     \n"
    "jg        25b                             \n"
    "jmp       99f                             \n"
    // Blend 50 / 50.
    ".p2align  4                               \n"
  "50:                                         \n"
    "movdqa    (%1),%%xmm0                     \n"
    "movdqa    (%1,%4,1),%%xmm1                \n"
    "pavgb     %%xmm1,%%xmm0                   \n"
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0,(%1,%0,1)                \n"
    "lea       0x10(%1),%1                     \n"
    "jg        50b                             \n"
    "jmp       99f                             \n"
    // Blend 75 / 25.
    ".p2align  4                               \n"
  "75:                                         \n"
    "movdqa    (%1),%%xmm1                     \n"
    "movdqa    (%1,%4,1),%%xmm0                \n"
    "pavgb     %%xmm1,%%xmm0                   \n"
    "pavgb     %%xmm1,%%xmm0                   \n"
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0,(%1,%0,1)                \n"
    "lea       0x10(%1),%1                     \n"
    "jg        75b                             \n"
    "jmp       99f                             \n"
    // Blend 100 / 0 - Copy row unchanged.
    ".p2align  4                               \n"
  "100:                                        \n"
    "movdqa    (%1),%%xmm0                     \n"
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0,(%1,%0,1)                \n"
    "lea       0x10(%1),%1                     \n"
-    "jg        2b                              \n"
+    "jg        100b                            \n"
-    "jmp       4f                              \n"
+
-    ".p2align  4                               \n"
+    // Extrude last pixel.
-  "3:                                          \n"
+  "99:                                         \n"
-    "movdqa    (%1),%%xmm0                     \n"
+  : "+r"(dst_argb),    // %0
-    "pavgb     (%1,%4,1),%%xmm0                \n"
+    "+r"(src_argb),    // %1
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0,(%1,%0,1)                \n"
    "lea       0x10(%1),%1                     \n"
    "jg        3b                              \n"
  "4:                                          \n"
    ".p2align  4                               \n"
  : "+r"(dst_ptr),     // %0
    "+r"(src_ptr),     // %1
    "+r"(dst_width),  // %2
    "+r"(source_y_fraction)  // %3
  : "r"(static_cast<intptr_t>(src_stride))  // %4
@ -4306,6 +4346,7 @@ void ARGBInterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
  );
 }
 void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
                  uint8* dst_uv, int pix) {
  asm volatile (
--- a/source/row_win.cc
+++ b/source/row_win.cc
@ -3580,8 +3580,8 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
    // 1 pixel loop until destination pointer is aligned.
  alignloop1:
-    test       edx, 15          // aligned?
+//    test       edx, 15          // aligned?
-    je         alignloop1b
+//    je         alignloop1b
    movd       xmm3, [eax]
    lea        eax, [eax + 4]
    movdqa     xmm0, xmm3       // src argb
@ -4439,25 +4439,31 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
 }
 #endif  // HAS_ARGBAFFINEROW_SSE2
-// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version.
+// Bilinear image filtering.
 // Same as ScaleARGBFilterRows_SSSE3 but without last pixel duplicated.
 __declspec(naked) __declspec(align(16))
-void ARGBInterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+void ARGBInterpolateRow_SSSE3(uint8* dst_argb, const uint8* src_argb,
                              ptrdiff_t src_stride, int dst_width,
                              int source_y_fraction) {
  __asm {
    push       esi
    push       edi
-    mov        edi, [esp + 8 + 4]   // dst_ptr
+    mov        edi, [esp + 8 + 4]   // dst_argb
-    mov        esi, [esp + 8 + 8]   // src_ptr
+    mov        esi, [esp + 8 + 8]   // src_argb
    mov        edx, [esp + 8 + 12]  // src_stride
    mov        ecx, [esp + 8 + 16]  // dst_width
    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
    sub        edi, esi
    shr        eax, 1
-    cmp        eax, 0
+    cmp        eax, 0  // dispatch to specialized filters if applicable.
-    je         xloop1
+    je         xloop100
    cmp        eax, 32
    je         xloop75
    cmp        eax, 64
-    je         xloop2
+    je         xloop50
    cmp        eax, 96
    je         xloop25
    movd       xmm0, eax  // high fraction 0..127
    neg        eax
    add        eax, 128
@ -4482,32 +4488,57 @@ void ARGBInterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
    movdqa     [esi + edi], xmm0
    lea        esi, [esi + 16]
    jg         xloop
    jmp        xloop99
-    pop        edi
+    // Blend 25 / 75.
    pop        esi
    ret
    align      16
-  xloop1:
+  xloop25:
    movdqa     xmm0, [esi]
    movdqa     xmm1, [esi + edx]
    pavgb      xmm0, xmm1
    pavgb      xmm0, xmm1
    sub        ecx, 4
    movdqa     [esi + edi], xmm0
    lea        esi, [esi + 16]
    jg         xloop25
    jmp        xloop99
    // Blend 50 / 50.
    align      16
  xloop50:
    movdqa     xmm0, [esi]
    movdqa     xmm1, [esi + edx]
    pavgb      xmm0, xmm1
    sub        ecx, 4
    movdqa     [esi + edi], xmm0
    lea        esi, [esi + 16]
    jg         xloop50
    jmp        xloop99
    // Blend 75 / 25.
    align      16
  xloop75:
    movdqa     xmm1, [esi]
    movdqa     xmm0, [esi + edx]
    pavgb      xmm0, xmm1
    pavgb      xmm0, xmm1
    sub        ecx, 4
    movdqa     [esi + edi], xmm0
    lea        esi, [esi + 16]
    jg         xloop75
    jmp        xloop99
    // Blend 100 / 0 - Copy row unchanged.
    align      16
  xloop100:
    movdqa     xmm0, [esi]
    sub        ecx, 4
    movdqa     [esi + edi], xmm0
    lea        esi, [esi + 16]
-    jg         xloop1
+    jg         xloop100
    pop        edi
    pop        esi
    ret
    align      16
  xloop2:
    movdqa     xmm0, [esi]
    pavgb      xmm0, [esi + edx]
    sub        ecx, 4
    movdqa     [esi + edi], xmm0
    lea        esi, [esi + 16]
    jg         xloop2
    // Extrude last pixel.
  xloop99:
    pop        edi
    pop        esi
    ret
--- a/source/scale_argb.cc
+++ b/source/scale_argb.cc
@ -30,20 +30,21 @@ extern "C" {
 #if !defined(YUV_DISABLE_ASM) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
 #define HAS_SCALEARGBROWDOWNEVEN_NEON
 #define HAS_SCALEARGBROWDOWN2_NEON
 #define HAS_SCALEARGBFILTERROWS_NEON
 void ScaleARGBRowDownEven_NEON(const uint8* src_argb, int src_stride,
                               int src_stepx,
                               uint8* dst_argb, int dst_width);
 void ScaleARGBRowDownEvenInt_NEON(const uint8* src_argb, int src_stride,
                                  int src_stepx,
                                  uint8* dst_argb, int dst_width);
 #define HAS_SCALEARGBROWDOWN2_NEON
 void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
                            uint8* dst, int dst_width);
 void ScaleARGBRowDown2Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                               uint8* dst, int dst_width);
-
+void ScaleARGBFilterRows_NEON(uint8* dst_ptr,
                              const uint8* src_ptr, ptrdiff_t src_stride,
                              int dst_width, int source_y_fraction);
 #endif
 /**
@ -964,6 +965,7 @@ static void ScaleARGBBilinear(int src_width, int src_height,
                              ptrdiff_t src_stride,
                              int dst_width, int source_y_fraction) =
      ScaleARGBFilterRows_C;
 // TODO(fbarchard): Check aligned width.
 #if defined(HAS_SCALEARGBFILTERROWS_SSE2)
  if (TestCpuFlag(kCpuHasSSE2) &&
      IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_argb, 16)) {
@ -975,6 +977,11 @@ static void ScaleARGBBilinear(int src_width, int src_height,
      IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_argb, 16)) {
    ScaleARGBFilterRows = ScaleARGBFilterRows_SSSE3;
  }
 #endif
 #if defined(HAS_SCALEARGBFILTERROWS_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    ScaleARGBFilterRows = ScaleARGBFilterRows_NEON;
  }
 #endif
  int dx = (src_width << 16) / dst_width;
  int dy = (src_height << 16) / dst_height;
--- a/source/scale_argb_neon.cc
+++ b/source/scale_argb_neon.cc
@ -136,6 +136,90 @@ void ScaleARGBRowDownEvenInt_NEON(const uint8* src_argb, ptrdiff_t src_stride,
  : "memory", "cc", "r12", "q0", "q1", "q2", "q3"
  );
 }
 // 4x2 -> 4x1
 //
 // Blends two ARGB rows into dst_ptr, 4 pixels (16 bytes) per loop iteration,
 // then stores the last blended pixel once more directly after the row.
 //   dst_ptr           - destination row; 4 extra bytes are written past the
 //                       pixels produced by the loop.
 //   src_ptr           - first source row.
 //   src_stride        - byte offset from src_ptr to the second source row.
 //   dst_width         - pixel count; every loop subtracts 4 and repeats while
 //                       the result is > 0, so whole groups of 4 are written.
 //   source_y_fraction - weight of the second row in 1/256 units. 0 copies the
 //                       first row; 64, 128 and 192 use vrhadd fast paths; any
 //                       other value uses the multiply path. Only the low 8
 //                       bits reach vdup.8, so 0..255 is presumably expected.
 void ScaleARGBFilterRows_NEON(uint8* dst_ptr,
                              const uint8* src_ptr, ptrdiff_t src_stride,
                              int dst_width, int source_y_fraction) {
  asm volatile (
    // Fraction 0: only the first row is needed.
    "cmp          %4, #0                       \n"
    "beq          100f                         \n"
    // %2 becomes the address of the second row (src_ptr + src_stride).
    "add          %2, %1                       \n"
    "cmp          %4, #64                      \n"
    "beq          75f                          \n"
    "cmp          %4, #128                     \n"
    "beq          50f                          \n"
    "cmp          %4, #192                     \n"
    "beq          25f                          \n"
    // d5 = weight of second row (fraction), d4 = weight of first row
    // (256 - fraction), each replicated across 8 byte lanes.
    "vdup.8       d5, %4                       \n"
    "rsb          %4, #256                     \n"
    "vdup.8       d4, %4                       \n"
    // General purpose row blend.
  "1:                                          \n"
    "vld1.u8      {q0}, [%1]!                  \n"
    "vld1.u8      {q1}, [%2]!                  \n"
    "subs         %3, #4                       \n"
    // 16-bit accumulators: row0 * d4 + row1 * d5.
    "vmull.u8     q13, d0, d4                  \n"
    "vmull.u8     q14, d1, d4                  \n"
    "vmlal.u8     q13, d2, d5                  \n"
    "vmlal.u8     q14, d3, d5                  \n"
    // Rounding shift right by 8 narrows the sums back to bytes.
    "vrshrn.u16   d0, q13, #8                  \n"
    "vrshrn.u16   d1, q14, #8                  \n"
    "vst1.u8      {q0}, [%0]!                  \n"
    "bgt          1b                           \n"
    "b            99f                          \n"
    // Blend 25 / 75.
    // Taken for fraction 192: avg(avg(row0, row1), row1).
  "25:                                         \n"
    "vld1.u8      {q0}, [%1]!                  \n"
    "vld1.u8      {q1}, [%2]!                  \n"
    "subs         %3, #4                       \n"
    "vrhadd.u8    q0, q1                       \n"
    "vrhadd.u8    q0, q1                       \n"
    "vst1.u8      {q0}, [%0]!                  \n"
    "bgt          25b                          \n"
    "b            99f                          \n"
    // Blend 50 / 50.
    // Taken for fraction 128: a single rounding average of the two rows.
  "50:                                         \n"
    "vld1.u8      {q0}, [%1]!                  \n"
    "vld1.u8      {q1}, [%2]!                  \n"
    "subs         %3, #4                       \n"
    "vrhadd.u8    q0, q1                       \n"
    "vst1.u8      {q0}, [%0]!                  \n"
    "bgt          50b                          \n"
    "b            99f                          \n"
    // Blend 75 / 25.
    // Taken for fraction 64. The rows are loaded into swapped registers
    // (row0 -> q1, row1 -> q0), giving avg(avg(row1, row0), row0).
  "75:                                         \n"
    "vld1.u8      {q1}, [%1]!                  \n"
    "vld1.u8      {q0}, [%2]!                  \n"
    "subs         %3, #4                       \n"
    "vrhadd.u8    q0, q1                       \n"
    "vrhadd.u8    q0, q1                       \n"
    "vst1.u8      {q0}, [%0]!                  \n"
    "bgt          75b                          \n"
    "b            99f                          \n"
    // Blend 100 / 0 - Copy row unchanged.
  "100:                                        \n"
    "vld1.u8      {q0}, [%1]!                  \n"
    "subs         %3, #4                       \n"
    "vst1.u8      {q0}, [%0]!                  \n"
    "bgt          100b                         \n"
  "99:                                         \n"
    // Every path leaves its final 4 output pixels in q0. Store the top
    // 32-bit lane (the last pixel) again at the already advanced dst pointer.
    "vst1.u32     {d1[1]}, [%0]                \n"
  // All operands are read-write: the asm advances the pointers, counts
  // dst_width down and overwrites source_y_fraction.
  : "+r"(dst_ptr),          // %0
    "+r"(src_ptr),          // %1
    "+r"(src_stride),       // %2
    "+r"(dst_width),        // %3
    "+r"(source_y_fraction) // %4
  :
  : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc"
  );
 }
 #endif  // __ARM_NEON__
 #ifdef __cplusplus
--- a/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
@ -585,6 +585,142 @@ TEST_F(libyuvTest, TestInterpolate) {
  }
 }
 // Defines the test ARGBInterpolate<TERP><N>. It fills two source images with
 // random bytes, runs ARGBInterpolate once with CPU flags masked to 0
 // (presumably the C reference path) and benchmark_iterations_ times with the
 // flags re-enabled, and expects the outputs to differ by at most DIFF per
 // byte.
 //   BPP_A, STRIDE_A / BPP_B, STRIDE_B - bytes per pixel and stride rounding
 //       used to compute the source / destination strides.
 //   W1280 - test width in pixels.
 //   TERP  - interpolation value passed to ARGBInterpolate.
 //   N     - suffix appended to the test name.
 //   NEG   - sign token placed in front of kHeight (+ or -).
 //   OFF   - byte offset added to both source pointers.
 // FMT_A and FMT_B are not referenced in the macro body.
 #define TESTTERP(FMT_A, BPP_A, STRIDE_A,                                       \
                 FMT_B, BPP_B, STRIDE_B,                                       \
                 W1280, TERP, DIFF, N, NEG, OFF)                               \
 TEST_F(libyuvTest, ARGBInterpolate##TERP##N) {                                 \
  const int kWidth = W1280;                                                    \
  const int kHeight = benchmark_height_;                                       \
  const int kStrideA = (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A;  \
  const int kStrideB = (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B;  \
  align_buffer_64(src_argb_a, kStrideA * kHeight + OFF);                       \
  align_buffer_64(src_argb_b, kStrideA * kHeight + OFF);                       \
  align_buffer_64(dst_argb_c, kStrideB * kHeight);                             \
  align_buffer_64(dst_argb_opt, kStrideB * kHeight);                           \
  srandom(time(NULL));                                                         \
  for (int i = 0; i < kStrideA * kHeight; ++i) {                               \
    src_argb_a[i + OFF] = (random() & 0xff);                                   \
    src_argb_b[i + OFF] = (random() & 0xff);                                   \
  }                                                                            \
  MaskCpuFlags(0);                                                             \
  ARGBInterpolate(src_argb_a + OFF, kStrideA,                                  \
                  src_argb_b + OFF, kStrideA,                                  \
                  dst_argb_c, kStrideB,                                        \
                  kWidth, NEG kHeight, TERP);                                  \
  MaskCpuFlags(-1);                                                            \
  for (int i = 0; i < benchmark_iterations_; ++i) {                            \
    ARGBInterpolate(src_argb_a + OFF, kStrideA,                                \
                    src_argb_b + OFF, kStrideA,                                \
                    dst_argb_opt, kStrideB,                                    \
                    kWidth, NEG kHeight, TERP);                                \
  }                                                                            \
  int max_diff = 0;                                                            \
  for (int i = 0; i < kStrideB * kHeight; ++i) {                               \
    int abs_diff =                                                             \
        abs(static_cast<int>(dst_argb_c[i]) -                                  \
            static_cast<int>(dst_argb_opt[i]));                                \
    if (abs_diff > max_diff) {                                                 \
      max_diff = abs_diff;                                                     \
    }                                                                          \
  }                                                                            \
  EXPECT_LE(max_diff, DIFF);                                                   \
  free_aligned_buffer_64(src_argb_a)                                           \
  free_aligned_buffer_64(src_argb_b)                                           \
  free_aligned_buffer_64(dst_argb_c)                                           \
  free_aligned_buffer_64(dst_argb_opt)                                         \
 }
 // Instantiates four TESTTERP variants for one interpolation value, each with
 // a tolerance of 1:
 //   _Any       - width reduced by 4 pixels.
 //   _Unaligned - source pointers offset by 1 byte.
 //   _Invert    - negative height.
 //   _Opt       - full width, positive height, no offset.
 #define TESTINTERPOLATE(TERP)                                                  \
    TESTTERP(ARGB, 4, 1, ARGB, 4, 1,                                           \
             benchmark_width_ - 4, TERP, 1, _Any, +, 0)                        \
    TESTTERP(ARGB, 4, 1, ARGB, 4, 1,                                           \
             benchmark_width_, TERP, 1, _Unaligned, +, 1)                      \
    TESTTERP(ARGB, 4, 1, ARGB, 4, 1,                                           \
             benchmark_width_, TERP, 1, _Invert, -, 0)                         \
    TESTTERP(ARGB, 4, 1, ARGB, 4, 1,                                           \
             benchmark_width_, TERP, 1, _Opt, +, 0)
 // Interpolation values 0, 64, 128 and 192 are the ones the NEON row function
 // compares against before falling back to its general path; 255 is covered
 // as well.
 TESTINTERPOLATE(0)
 TESTINTERPOLATE(64)
 TESTINTERPOLATE(128)
 TESTINTERPOLATE(192)
 TESTINTERPOLATE(255)
 // Runs ARGBBlend on two random, attenuated ARGB images: once with CPU flags
 // masked to 0 and then benchmark_iterations times with the flags re-enabled.
 // Returns the largest per-byte absolute difference between the two outputs.
 //   kWidth, kHeight      - image size in pixels.
 //   benchmark_iterations - number of calls made with CPU flags enabled.
 //   NEG                  - multiplied into the height passed to ARGBBlend
 //                          (callers pass +1 or -1).
 //   OFF                  - byte offset applied to both source images.
 static int TestBlend(int kWidth, int kHeight, int benchmark_iterations,
                     int NEG, int OFF) {
  const int BPP_A = 4;
  const int STRIDE_A = 1;
  const int BPP_B = 4;
  const int STRIDE_B = 1;
  const int kStrideA = (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A;
  const int kStrideB = (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B;
  align_buffer_64(src_argb_a, kStrideA * kHeight + OFF);
  align_buffer_64(src_argb_b, kStrideA * kHeight + OFF);
  align_buffer_64(dst_argb_c, kStrideB * kHeight);
  align_buffer_64(dst_argb_opt, kStrideB * kHeight);
  srandom(time(NULL));
  for (int i = 0; i < kStrideA * kHeight; ++i) {
    src_argb_a[i + OFF] = (random() & 0xff);
    src_argb_b[i + OFF] = (random() & 0xff);
  }
  // Attenuate exactly the pixels ARGBBlend reads (starting at OFF). Starting
  // at the buffer base would misalign the pixels and leave the last OFF bytes
  // untouched whenever OFF != 0.
  ARGBAttenuate(src_argb_a + OFF, kStrideA, src_argb_a + OFF, kStrideA,
                kWidth, kHeight);
  ARGBAttenuate(src_argb_b + OFF, kStrideA, src_argb_b + OFF, kStrideA,
                kWidth, kHeight);
  // Pre-fill both destinations identically so the comparison below does not
  // depend on uninitialized bytes.
  memset(dst_argb_c, 255, kStrideB * kHeight);
  memset(dst_argb_opt, 255, kStrideB * kHeight);
  // Reference run with all CPU feature flags masked off.
  MaskCpuFlags(0);
  ARGBBlend(src_argb_a + OFF, kStrideA,
            src_argb_b + OFF, kStrideA,
            dst_argb_c, kStrideB,
            kWidth, NEG * kHeight);
  // Re-enable CPU feature flags for the repeated run.
  MaskCpuFlags(-1);
  for (int i = 0; i < benchmark_iterations; ++i) {
    ARGBBlend(src_argb_a + OFF, kStrideA,
              src_argb_b + OFF, kStrideA,
              dst_argb_opt, kStrideB,
              kWidth, NEG * kHeight);
  }
  int max_diff = 0;
  for (int i = 0; i < kStrideB * kHeight; ++i) {
    int abs_diff =
        abs(static_cast<int>(dst_argb_c[i]) -
            static_cast<int>(dst_argb_opt[i]));
    if (abs_diff > max_diff) {
      max_diff = abs_diff;
    }
  }
  free_aligned_buffer_64(src_argb_a)
  free_aligned_buffer_64(src_argb_b)
  free_aligned_buffer_64(dst_argb_c)
  free_aligned_buffer_64(dst_argb_opt)
  return max_diff;
 }
 // Blend with the width reduced by 4 pixels, positive height and no source
 // offset; the two TestBlend outputs may differ by at most 1 per byte.
 TEST_F(libyuvTest, ARGBBlend_Any) {
  int max_diff = TestBlend(benchmark_width_ - 4, benchmark_height_,
                           benchmark_iterations_, +1, 0);
  EXPECT_LE(max_diff, 1);
 }
 // TODO(fbarchard): Enable unaligned blend test.
 // TEST_F(libyuvTest, ARGBBlend_Unaligned) {
 //   int max_diff = TestBlend(benchmark_width_, benchmark_height_,
 //                            benchmark_iterations_, +1, 1);
 //   EXPECT_LE(max_diff, 1);
 // }
 // Blend at full width with NEG = -1, so ARGBBlend receives a negative
 // height; the two TestBlend outputs may differ by at most 1 per byte.
 TEST_F(libyuvTest, ARGBBlend_Invert) {
  int max_diff = TestBlend(benchmark_width_, benchmark_height_,
                           benchmark_iterations_, -1, 0);
  EXPECT_LE(max_diff, 1);
 }
 // Blend at full width, positive height and no source offset; the two
 // TestBlend outputs may differ by at most 1 per byte.
 TEST_F(libyuvTest, ARGBBlend_Opt) {
  int max_diff = TestBlend(benchmark_width_, benchmark_height_,
                           benchmark_iterations_, +1, 0);
  EXPECT_LE(max_diff, 1);
 }
 TEST_F(libyuvTest, TestAffine) {
  SIMD_ALIGNED(uint8 orig_pixels_0[256][4]);
  SIMD_ALIGNED(uint8 interpolate_pixels_C[256][4]);