From 933bd40c3c894583f2e0243f5409a8e17d868ba0 Mon Sep 17 00:00:00 2001
From: "fbarchard@google.com" <fbarchard@google.com>
Date: Fri, 27 Feb 2015 21:15:28 +0000
Subject: [PATCH] Port ARGBToRGB565 and ARGBToARGB1555 to AVX2.  Enable functions
 that use ARGBToRGB565 AVX2 code.  Add ARGBToRGB565Dither function. BUG=403
 TESTED=local windows build R=harryjin@google.com

Review URL: https://webrtc-codereview.appspot.com/42109004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1302 16f28f9a-4ce2-e073-06de-1de4eb20be90
---
 include/libyuv/convert_from_argb.h |   7 ++
 include/libyuv/row.h               |  20 ++++--
 source/convert_from_argb.cc        |  56 +++++++++++++++
 source/row_any.cc                  |   4 ++
 source/row_common.cc               |  29 +++++++-
 source/row_win.cc                  |  83 +++++++++++++++++++++-
 unit_test/convert_test.cc          | 106 +++++++++++++++++++++++++++++
 7 files changed, 295 insertions(+), 10 deletions(-)

diff --git a/include/libyuv/convert_from_argb.h b/include/libyuv/convert_from_argb.h
index 75747fe7d..c592fc235 100644
--- a/include/libyuv/convert_from_argb.h
+++ b/include/libyuv/convert_from_argb.h
@@ -61,6 +61,13 @@ int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
                  uint8* dst_rgb565, int dst_stride_rgb565,
                  int width, int height);
 
+// Convert ARGB to RGB565 using an 8x8 dither matrix (64 bytes, row-major).
+// Matrix values range 0..255; 128 adds no offset.  NULL selects a default.
+LIBYUV_API
+int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb,
+                       uint8* dst_rgb565, int dst_stride_rgb565,
+                       const uint8* dither8x8, int width, int height);
+
 // Convert ARGB To ARGB1555.
 LIBYUV_API
 int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 4592c16a1..80e844bae 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -190,7 +190,14 @@ extern "C" {
 #define HAS_I422TORGBAROW_AVX2
 #define HAS_NV12TOARGBROW_AVX2
 #define HAS_NV21TOARGBROW_AVX2
+#define HAS_ARGBTORGB565ROW_AVX2
+#define HAS_ARGBTOARGB1555ROW_AVX2
 #define HAS_ARGBTOARGB4444ROW_AVX2
+#define HAS_NV12TORGB565ROW_AVX2
+#define HAS_NV21TORGB565ROW_AVX2
+#define HAS_I422TORGB565ROW_AVX2
+#define HAS_I422TOARGB1555ROW_AVX2
+#define HAS_I422TOARGB4444ROW_AVX2
 #endif
 
 // The following are available on all x86 platforms, but
@@ -223,12 +230,6 @@ extern "C" {
 #if defined(HAS_I422TOARGBROW_AVX2)
 #define HAS_YUY2TOARGBROW_AVX2
 #define HAS_UYVYTOARGBROW_AVX2
-// TODO(fbarchard): Enable once low levels are ported to AVX2
-// #define HAS_NV12TORGB565ROW_AVX2
-// #define HAS_NV21TORGB565ROW_AVX2
-// #define HAS_I422TORGB565ROW_AVX2
-// #define HAS_I422TOARGB1555ROW_AVX2
-#define HAS_I422TOARGB4444ROW_AVX2
 #endif
 
 // Effects:
@@ -904,6 +905,8 @@ void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
 
+void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
 
 void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
@@ -919,6 +922,9 @@ void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
 
+void ARGBToRGB565DitherRow_C(const uint8* src_argb, uint8* dst_rgb,
+                             const uint8* dither8x8, int pix);
+
 void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
 void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int pix);
 void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix);
@@ -1369,6 +1375,8 @@ void ARGBToRGB565Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToARGB1555Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToARGB4444Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
 
+void ARGBToRGB565Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB1555Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToARGB4444Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
 
 void ARGBToRGB24Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc
index 7ce430dac..dc2186a6a 100644
--- a/source/convert_from_argb.cc
+++ b/source/convert_from_argb.cc
@@ -804,6 +804,46 @@ int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
   return 0;
 }
 
+static const uint8 kDither8x8[64] = {  // Used when dither8x8 is NULL.
+  0, 128, 32, 160,  8, 136, 40, 168,
+  192, 64, 224, 96, 200, 72, 232, 104,
+  48, 176, 16, 144, 56, 184, 24, 152,
+  240, 112, 208, 80, 248, 120, 216, 88,
+  12, 140, 44, 172,  4, 132, 36, 164,
+  204, 76, 236, 108, 196, 68, 228, 100,
+  60, 188, 28, 156, 52, 180, 20, 148,
+  252, 124, 220, 92, 244, 116, 212, 84,
+};  // 8 rows of 8 bytes; ARGBToRGB565Dither picks row (y & 7).
+
+// Convert ARGB to RGB565 with an 8x8 dither matrix (64 bytes).
+// A NULL dither8x8 selects kDither8x8; negative height inverts the image.
+LIBYUV_API
+int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb,
+                       uint8* dst_rgb565, int dst_stride_rgb565,
+                       const uint8* dither8x8, int width, int height) {
+  void (*ARGBToRGB565DitherRow)(const uint8* src_argb, uint8* dst_rgb,
+      const uint8* dither8x8, int pix) = ARGBToRGB565DitherRow_C;
+  const uint8* matrix = dither8x8 ? dither8x8 : kDither8x8;
+  int row;
+  if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height: start at the last source row and walk upwards.
+  if (height < 0) {
+    height = -height;
+    src_argb += (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  for (row = 0; row < height; ++row) {
+    // Each output row uses one 8 byte matrix row, repeating every 8 rows.
+    const uint8* dither_row = matrix + (row & 7) * 8;
+    ARGBToRGB565DitherRow(src_argb, dst_rgb565, dither_row, width);
+    src_argb += src_stride_argb;
+    dst_rgb565 += dst_stride_rgb565;
+  }
+  return 0;
+}
+
 // Convert ARGB To RGB565.
 LIBYUV_API
 int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
@@ -835,6 +875,14 @@ int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
     }
   }
 #endif
+#if defined(HAS_ARGBTORGB565ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToRGB565Row = ARGBToRGB565Row_Any_AVX2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToRGB565Row = ARGBToRGB565Row_AVX2;
+    }
+  }
+#endif
 #if defined(HAS_ARGBTORGB565ROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     ARGBToRGB565Row = ARGBToRGB565Row_Any_NEON;
@@ -883,6 +931,14 @@ int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
     }
   }
 #endif
+#if defined(HAS_ARGBTOARGB1555ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToARGB1555Row = ARGBToARGB1555Row_Any_AVX2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToARGB1555Row = ARGBToARGB1555Row_AVX2;
+    }
+  }
+#endif
 #if defined(HAS_ARGBTOARGB1555ROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     ARGBToARGB1555Row = ARGBToARGB1555Row_Any_NEON;
diff --git a/source/row_any.cc b/source/row_any.cc
index 8a678f2fb..19340b3b7 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -175,6 +175,15 @@ RGBANY(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, ARGBToARGB4444Row_C,
        4, 2, 3)
 #endif
+// Each AVX2 wrapper is guarded by the same macro as its row function.
+#if defined(HAS_ARGBTORGB565ROW_AVX2)
+RGBANY(ARGBToRGB565Row_Any_AVX2, ARGBToRGB565Row_AVX2, ARGBToRGB565Row_C,
+       4, 2, 7)
+#endif
+#if defined(HAS_ARGBTOARGB1555ROW_AVX2)
+RGBANY(ARGBToARGB1555Row_Any_AVX2, ARGBToARGB1555Row_AVX2, ARGBToARGB1555Row_C,
+       4, 2, 7)
+#endif
 #if defined(HAS_ARGBTOARGB4444ROW_AVX2)
 RGBANY(ARGBToARGB4444Row_Any_AVX2, ARGBToARGB4444Row_AVX2, ARGBToARGB4444Row_C,
        4, 2, 7)
 #endif
diff --git a/source/row_common.cc b/source/row_common.cc
index 49efd67da..e0e2bf426 100644
--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -199,6 +199,32 @@ void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
   }
 }
 
+// Adds (dither8x8[x & 7] - 128) to B, G and R, clamps, then packs to RGB565.
+void ARGBToRGB565DitherRow_C(const uint8* src_argb, uint8* dst_rgb,
+                             const uint8* dither8x8, int width) {
+  int x = 0;
+  // Two pixels per pass, packed into one 32 bit value for WRITEWORD.
+  for (; x < width - 1; x += 2, src_argb += 8, dst_rgb += 4) {
+    const int d0 = dither8x8[x & 7] - 128;
+    const int d1 = dither8x8[(x & 7) + 1] - 128;
+    const uint8 b0 = Clamp(src_argb[0] + d0) >> 3;
+    const uint8 g0 = Clamp(src_argb[1] + d0) >> 2;
+    const uint8 r0 = Clamp(src_argb[2] + d0) >> 3;
+    const uint8 b1 = Clamp(src_argb[4] + d1) >> 3;
+    const uint8 g1 = Clamp(src_argb[5] + d1) >> 2;
+    const uint8 r1 = Clamp(src_argb[6] + d1) >> 3;
+    WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) |
+              (b1 << 16) | (g1 << 21) | (r1 << 27));
+  }
+  if (width & 1) {  // Odd width: one trailing pixel, stored as 16 bits.
+    const int d = dither8x8[(width - 1) & 7] - 128;
+    const uint8 b = Clamp(src_argb[0] + d) >> 3;
+    const uint8 g = Clamp(src_argb[1] + d) >> 2;
+    const uint8 r = Clamp(src_argb[2] + d) >> 3;
+    *(uint16*)(dst_rgb) = b | (g << 5) | (r << 11);
+  }
+}
+
 void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
@@ -2258,8 +2284,7 @@ void UYVYToARGBRow_SSSE3(const uint8* src_uyvy, uint8* dst_argb, int width) {
 }
 #endif  // !defined(LIBYUV_DISABLE_X86)
 
-#if defined(HAS_I422TORGB565ROW_AVX2) && !defined(_MSC_VER)
-// row_win.cc has asm version, but GCC uses 2 step wrapper.
+#if defined(HAS_I422TORGB565ROW_AVX2)
 void I422ToRGB565Row_AVX2(const uint8* src_y,
                           const uint8* src_u,
                           const uint8* src_v,
diff --git a/source/row_win.cc b/source/row_win.cc
index 68c50cd50..5c06b6078 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -693,6 +693,85 @@ void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
   }
 }
 
+#ifdef HAS_ARGBTORGB565ROW_AVX2  // ARGB to RGB565, 8 pixels per loop pass.
+__declspec(naked) __declspec(align(16))
+void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
+  __asm {
+    mov        eax, [esp + 4]      // src_argb
+    mov        edx, [esp + 8]      // dst_rgb
+    mov        ecx, [esp + 12]     // pix
+    vpcmpeqb   ymm3, ymm3, ymm3    // generate mask 0x0000001f
+    vpsrld     ymm3, ymm3, 27      // (B field: bits 0-4)
+    vpcmpeqb   ymm4, ymm4, ymm4    // generate mask 0x000007e0
+    vpsrld     ymm4, ymm4, 26
+    vpslld     ymm4, ymm4, 5       // (G field: bits 5-10)
+    vpcmpeqb   ymm5, ymm5, ymm5    // generate mask 0xfffff800
+    vpslld     ymm5, ymm5, 11      // (R field: bits 11-15 plus sign extension)
+
+ convertloop:
+    vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb
+    vpsrld     ymm2, ymm0, 5       // G: top 6 bits moved to bits 5-10
+    vpsrld     ymm1, ymm0, 3       // B: top 5 bits moved to bits 0-4
+    vpslld     ymm0, ymm0, 8       // R: moved to the top byte of each dword
+    vpand      ymm2, ymm2, ymm4    // G
+    vpand      ymm1, ymm1, ymm3    // B
+    vpsrad     ymm0, ymm0, 16      // R: arithmetic shift, top 5 bits at 11-15
+    vpand      ymm0, ymm0, ymm5    // R
+    vpor       ymm1, ymm1, ymm2    // BG
+    vpor       ymm0, ymm0, ymm1    // BGR
+    vpackssdw  ymm0, ymm0, ymm0    // dwords to words, per 128 bit lane
+    vpermq     ymm0, ymm0, 0xd8    // qwords 0,2,1,3: all 8 pixels in xmm0
+    lea        eax, [eax + 32]     // next 8 source pixels
+    vmovdqu    [edx], xmm0         // store 8 pixels of RGB565
+    lea        edx, [edx + 16]     // next 8 destination pixels
+    sub        ecx, 8
+    jg         convertloop         // loop while pixels remain
+    vzeroupper                     // zero upper halves of ymm registers
+    ret
+  }
+}
+#endif  // HAS_ARGBTORGB565ROW_AVX2
+
+#ifdef HAS_ARGBTOARGB1555ROW_AVX2  // ARGB to ARGB1555, 8 pixels per loop pass.
+__declspec(naked) __declspec(align(16))
+void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
+  __asm {
+    mov        eax, [esp + 4]      // src_argb
+    mov        edx, [esp + 8]      // dst_rgb
+    mov        ecx, [esp + 12]     // pix
+    vpcmpeqb   ymm4, ymm4, ymm4    // all ones
+    vpsrld     ymm4, ymm4, 27      // generate mask 0x0000001f
+    vpslld     ymm5, ymm4, 5       // generate mask 0x000003e0
+    vpslld     ymm6, ymm4, 10      // generate mask 0x00007c00
+    vpcmpeqb   ymm7, ymm7, ymm7    // generate mask 0xffff8000
+    vpslld     ymm7, ymm7, 15
+
+ convertloop:
+    vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb
+    vpsrld     ymm3, ymm0, 9       // R: top 5 bits moved to bits 10-14
+    vpsrld     ymm2, ymm0, 6       // G: top 5 bits moved to bits 5-9
+    vpsrld     ymm1, ymm0, 3       // B: top 5 bits moved to bits 0-4
+    vpsrad     ymm0, ymm0, 16      // A: arithmetic shift, top bit at bit 15
+    vpand      ymm3, ymm3, ymm6    // R
+    vpand      ymm2, ymm2, ymm5    // G
+    vpand      ymm1, ymm1, ymm4    // B
+    vpand      ymm0, ymm0, ymm7    // A
+    vpor       ymm0, ymm0, ymm1    // BA
+    vpor       ymm2, ymm2, ymm3    // GR
+    vpor       ymm0, ymm0, ymm2    // BGRA
+    vpackssdw  ymm0, ymm0, ymm0    // dwords to words, per 128 bit lane
+    vpermq     ymm0, ymm0, 0xd8    // qwords 0,2,1,3: all 8 pixels in xmm0
+    lea        eax, [eax + 32]     // next 8 source pixels
+    vmovdqu    [edx], xmm0         // store 8 pixels of ARGB1555
+    lea        edx, [edx + 16]     // next 8 destination pixels
+    sub        ecx, 8
+    jg         convertloop         // loop while pixels remain
+    vzeroupper                     // zero upper halves of ymm registers
+    ret
+  }
+}
+#endif  // HAS_ARGBTOARGB1555ROW_AVX2
+
 #ifdef HAS_ARGBTOARGB4444ROW_AVX2
 __declspec(naked) __declspec(align(16))
 void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
@@ -700,9 +779,9 @@ void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
     mov        eax, [esp + 4]   // src_argb
     mov        edx, [esp + 8]   // dst_rgb
     mov        ecx, [esp + 12]  // pix
-    vpcmpeqb   ymm4, ymm4, ymm4       // generate mask 0xf000f000
+    vpcmpeqb   ymm4, ymm4, ymm4   // generate mask 0xf000f000
     vpsllw     ymm4, ymm4, 12
-    vpsrlw     ymm3, ymm4, 8          // generate mask 0x00f000f0
+    vpsrlw     ymm3, ymm4, 8      // generate mask 0x00f000f0
 
  convertloop:
     vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb
diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc
index d9a6d450d..d186a9e8d 100644
--- a/unit_test/convert_test.cc
+++ b/unit_test/convert_test.cc
@@ -934,6 +934,112 @@ TESTATOB(I400, 1, 1, 1, I400Mirror, 1, 1, 1, 0)
 TESTATOB(Y, 1, 1, 1, ARGB, 4, 4, 1, 0)
 TESTATOB(ARGB, 4, 4, 1, ARGBMirror, 4, 4, 1, 0)
 
+#define TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A,                           \
+                   FMT_B, BPP_B, STRIDE_B, HEIGHT_B,                           \
+                   W1280, DIFF, N, NEG, OFF)                                   \
+TEST_F(libyuvTest, FMT_A##To##FMT_B##Dither##N) {  /* Both runs must agree. */ \
+  const int kWidth = ((W1280) > 0) ? (W1280) : 1;                              \
+  const int kHeight = benchmark_height_;                                       \
+  const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A;         \
+  const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B;         \
+  const int kStrideA = (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A;  \
+  const int kStrideB = (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B;  \
+  align_buffer_64(src_argb, kStrideA * kHeightA + OFF);                        \
+  align_buffer_64(dst_argb_c, kStrideB * kHeightB);                            \
+  align_buffer_64(dst_argb_opt, kStrideB * kHeightB);                          \
+  srandom(time(NULL));                                                         \
+  for (int i = 0; i < kStrideA * kHeightA; ++i) {                              \
+    src_argb[i + OFF] = (random() & 0xff);                                     \
+  }                                                                            \
+  memset(dst_argb_c, 1, kStrideB * kHeightB);                                  \
+  memset(dst_argb_opt, 101, kStrideB * kHeightB);                              \
+  MaskCpuFlags(0);  /* dst_argb_c run: CPU flags masked to 0. */               \
+  FMT_A##To##FMT_B##Dither(src_argb + OFF, kStrideA,                           \
+                           dst_argb_c, kStrideB,                               \
+                           NULL, kWidth, NEG kHeight);                         \
+  MaskCpuFlags(-1);  /* dst_argb_opt runs: mask set to -1. */                  \
+  for (int i = 0; i < benchmark_iterations_; ++i) {                            \
+    FMT_A##To##FMT_B##Dither(src_argb + OFF, kStrideA,                         \
+                             dst_argb_opt, kStrideB,                           \
+                             NULL, kWidth, NEG kHeight);                       \
+  }                                                                            \
+  int max_diff = 0;                                                            \
+  for (int i = 0; i < kStrideB * kHeightB; ++i) {                              \
+    int abs_diff =                                                             \
+        abs(static_cast<int>(dst_argb_c[i]) -                                  \
+            static_cast<int>(dst_argb_opt[i]));                                \
+    if (abs_diff > max_diff) {                                                 \
+      max_diff = abs_diff;                                                     \
+    }                                                                          \
+  }                                                                            \
+  EXPECT_LE(max_diff, DIFF);  /* DIFF = largest byte delta allowed. */         \
+  free_aligned_buffer_64(src_argb);                                            \
+  free_aligned_buffer_64(dst_argb_c);                                          \
+  free_aligned_buffer_64(dst_argb_opt);                                        \
+}
+// Random sizes (width 1..64, height 1..32) with align_buffer_page_end buffers.
+#define TESTATOBDRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A,                      \
+                       FMT_B, BPP_B, STRIDE_B, HEIGHT_B, DIFF)                 \
+TEST_F(libyuvTest, FMT_A##To##FMT_B##Dither_Random) {                          \
+  srandom(time(NULL));                                                         \
+  for (int times = 0; times < benchmark_iterations_; ++times) {                \
+    const int kWidth = (random() & 63) + 1;                                    \
+    const int kHeight = (random() & 31) + 1;                                   \
+    const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A;       \
+    const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B;       \
+    const int kStrideA = (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A;\
+    const int kStrideB = (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B;\
+    align_buffer_page_end(src_argb, kStrideA * kHeightA);                      \
+    align_buffer_page_end(dst_argb_c, kStrideB * kHeightB);                    \
+    align_buffer_page_end(dst_argb_opt, kStrideB * kHeightB);                  \
+    for (int i = 0; i < kStrideA * kHeightA; ++i) {                            \
+      src_argb[i] = (random() & 0xff);                                         \
+    }                                                                          \
+    memset(dst_argb_c, 123, kStrideB * kHeightB);                              \
+    memset(dst_argb_opt, 123, kStrideB * kHeightB);                            \
+    MaskCpuFlags(0);                                                           \
+    FMT_A##To##FMT_B##Dither(src_argb, kStrideA,                               \
+                             dst_argb_c, kStrideB,                             \
+                             NULL, kWidth, kHeight);                           \
+    MaskCpuFlags(-1);                                                          \
+    FMT_A##To##FMT_B##Dither(src_argb, kStrideA,                               \
+                             dst_argb_opt, kStrideB,                           \
+                             NULL, kWidth, kHeight);                           \
+    int max_diff = 0;                                                          \
+    for (int i = 0; i < kStrideB * kHeightB; ++i) {                            \
+      int abs_diff =                                                           \
+          abs(static_cast<int>(dst_argb_c[i]) -                                \
+              static_cast<int>(dst_argb_opt[i]));                              \
+      if (abs_diff > max_diff) {                                               \
+        max_diff = abs_diff;                                                   \
+      }                                                                        \
+    }                                                                          \
+    EXPECT_LE(max_diff, DIFF);                                                 \
+    free_aligned_buffer_page_end(src_argb);                                    \
+    free_aligned_buffer_page_end(dst_argb_c);                                  \
+    free_aligned_buffer_page_end(dst_argb_opt);                                \
+  }                                                                            \
+}
+// Instantiates the _Any, _Unaligned, _Invert, _Opt and _Random dither tests.
+#define TESTATOBD(FMT_A, BPP_A, STRIDE_A, HEIGHT_A,                            \
+                  FMT_B, BPP_B, STRIDE_B, HEIGHT_B, DIFF)                      \
+    TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A,                               \
+               FMT_B, BPP_B, STRIDE_B, HEIGHT_B,                               \
+               benchmark_width_ - 4, DIFF, _Any, +, 0)                         \
+    TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A,                               \
+               FMT_B, BPP_B, STRIDE_B, HEIGHT_B,                               \
+               benchmark_width_, DIFF, _Unaligned, +, 1)                       \
+    TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A,                               \
+               FMT_B, BPP_B, STRIDE_B, HEIGHT_B,                               \
+               benchmark_width_, DIFF, _Invert, -, 0)                          \
+    TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A,                               \
+               FMT_B, BPP_B, STRIDE_B, HEIGHT_B,                               \
+               benchmark_width_, DIFF, _Opt, +, 0)                             \
+    TESTATOBDRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A,                          \
+                    FMT_B, BPP_B, STRIDE_B, HEIGHT_B, DIFF)
+
+TESTATOBD(ARGB, 4, 4, 1, RGB565, 2, 2, 1, 0)
+
 #define TESTSYMI(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A,                          \
                  W1280, N, NEG, OFF)                                           \
 TEST_F(libyuvTest, FMT_ATOB##_Symetric##N) {                                   \