Add MergeARGBPlane and SplitARGBPlane

These functions convert between planar and interleaved ARGB.
When no alpha plane is given (NULL), merge fills alpha with 255
and split discards the alpha channel.

This helps handle YUV(A) with the Identity matrix, which is
essentially planar ARGB.
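
For illustration, a minimal round-trip sketch (the function name and
64x48 dimensions are hypothetical; passing NULL/0 for the alpha plane
selects the XRGB variants instead):

  #include "libyuv/planar_functions.h"

  void RoundTripExample(void) {
    enum { kW = 64, kH = 48 };
    static uint8_t argb[kW * kH * 4];  // interleaved, 4 bytes per pixel
    static uint8_t r[kW * kH], g[kW * kH], b[kW * kH], a[kW * kH];

    // Interleaved -> planar, then back again.
    SplitARGBPlane(argb, kW * 4, r, kW, g, kW, b, kW, a, kW, kW, kH);
    MergeARGBPlane(r, kW, g, kW, b, kW, a, kW, argb, kW * 4, kW, kH);
  }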

libyuv_unittest --gtest_filter=LibYUVPlanarTest.*ARGBPlane*:LibYUVPlanarTest.*XRGBPlane*

R=fbarchard@google.com

Change-Id: I522a189b434f490ba1723ce51317727e7c5eb112
Bug: libyuv:877
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2649887
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Yuan Tong 2021-01-27 08:55:58 +08:00 committed by Frank Barchard
parent f7c0a73a3e
commit a85cc26fde
10 changed files with 1497 additions and 2 deletions


@@ -153,6 +153,38 @@ void MergeRGBPlane(const uint8_t* src_r,
int width,
int height);
// Split interleaved ARGB plane into separate R, G, B and A planes.
// dst_a can be NULL to discard alpha plane.
LIBYUV_API
void SplitARGBPlane(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_r,
int dst_stride_r,
uint8_t* dst_g,
int dst_stride_g,
uint8_t* dst_b,
int dst_stride_b,
uint8_t* dst_a,
int dst_stride_a,
int width,
int height);
// Merge separate R, G, B and A planes into one interleaved ARGB plane.
// src_a can be NULL to fill opaque value to alpha.
LIBYUV_API
void MergeARGBPlane(const uint8_t* src_r,
int src_stride_r,
const uint8_t* src_g,
int src_stride_g,
const uint8_t* src_b,
int src_stride_b,
const uint8_t* src_a,
int src_stride_a,
uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height);
// Copy I400. Supports inverting.
LIBYUV_API
int I400ToI400(const uint8_t* src_y,


@@ -280,12 +280,14 @@ extern "C" {
#define HAS_I210TOARGBROW_SSSE3
#define HAS_I400TOARGBROW_SSE2
#define HAS_I422TOAR30ROW_SSSE3
#define HAS_MERGEARGBROW_SSE2
#define HAS_MERGERGBROW_SSSE3
#define HAS_MIRRORUVROW_AVX2
#define HAS_MIRRORUVROW_SSSE3
#define HAS_RAWTORGBAROW_SSSE3
#define HAS_RGB24MIRRORROW_SSSE3
#define HAS_RGBATOYJROW_SSSE3
#define HAS_SPLITARGBROW_SSE2
#define HAS_SPLITARGBROW_SSSE3
#define HAS_SPLITRGBROW_SSSE3
#define HAS_SWAPUVROW_SSSE3
#endif
@@ -304,6 +306,7 @@ extern "C" {
#define HAS_CONVERT16TO8ROW_AVX2
#define HAS_CONVERT8TO16ROW_AVX2
#define HAS_HALFMERGEUVROW_AVX2
#define HAS_MERGEARGBROW_AVX2
#define HAS_I210TOAR30ROW_AVX2
#define HAS_I210TOARGBROW_AVX2
#define HAS_I400TOARGBROW_AVX2
@@ -311,8 +314,10 @@ extern "C" {
#define HAS_I422TOUYVYROW_AVX2
#define HAS_I422TOYUY2ROW_AVX2
#define HAS_MERGEUVROW_16_AVX2
#define HAS_MIRRORUVROW_AVX2
#define HAS_MULTIPLYROW_16_AVX2
#define HAS_RGBATOYJROW_AVX2
#define HAS_SPLITARGBROW_AVX2
#define HAS_SWAPUVROW_AVX2
// TODO(fbarchard): Fix AVX2 version of YUV24
// #define HAS_NV21TOYUV24ROW_AVX2
@@ -373,6 +378,7 @@ extern "C" {
#define HAS_I422TOYUY2ROW_NEON
#define HAS_I444TOARGBROW_NEON
#define HAS_J400TOARGBROW_NEON
#define HAS_MERGEARGBROW_NEON
#define HAS_MERGEUVROW_NEON
#define HAS_MIRRORROW_NEON
#define HAS_MIRRORUVROW_NEON
@@ -400,6 +406,7 @@ extern "C" {
#define HAS_RGBATOYJROW_NEON
#define HAS_RGBATOYROW_NEON
#define HAS_SETROW_NEON
#define HAS_SPLITARGBROW_NEON
#define HAS_SPLITRGBROW_NEON
#define HAS_SPLITUVROW_NEON
#define HAS_SWAPUVROW_NEON
@@ -1823,6 +1830,182 @@ void MergeRGBRow_Any_MMI(const uint8_t* src_r,
const uint8_t* src_b,
uint8_t* dst_rgb,
int width);
void MergeARGBRow_C(const uint8_t* src_r,
const uint8_t* src_g,
const uint8_t* src_b,
const uint8_t* src_a,
uint8_t* dst_argb,
int width);
void MergeARGBRow_SSE2(const uint8_t* src_r,
const uint8_t* src_g,
const uint8_t* src_b,
const uint8_t* src_a,
uint8_t* dst_argb,
int width);
void MergeARGBRow_AVX2(const uint8_t* src_r,
const uint8_t* src_g,
const uint8_t* src_b,
const uint8_t* src_a,
uint8_t* dst_argb,
int width);
void MergeARGBRow_NEON(const uint8_t* src_r,
const uint8_t* src_g,
const uint8_t* src_b,
const uint8_t* src_a,
uint8_t* dst_argb,
int width);
void MergeARGBRow_Any_SSE2(const uint8_t* src_r,
const uint8_t* src_g,
const uint8_t* src_b,
const uint8_t* src_a,
uint8_t* dst_argb,
int width);
void MergeARGBRow_Any_AVX2(const uint8_t* src_r,
const uint8_t* src_g,
const uint8_t* src_b,
const uint8_t* src_a,
uint8_t* dst_argb,
int width);
void MergeARGBRow_Any_NEON(const uint8_t* src_r,
const uint8_t* src_g,
const uint8_t* src_b,
const uint8_t* src_a,
uint8_t* dst_argb,
int width);
void SplitARGBRow_C(const uint8_t* src_argb,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
uint8_t* dst_a,
int width);
void SplitARGBRow_SSE2(const uint8_t* src_argb,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
uint8_t* dst_a,
int width);
void SplitARGBRow_SSSE3(const uint8_t* src_argb,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
uint8_t* dst_a,
int width);
void SplitARGBRow_AVX2(const uint8_t* src_argb,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
uint8_t* dst_a,
int width);
void SplitARGBRow_NEON(const uint8_t* src_argb,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
uint8_t* dst_a,
int width);
void SplitARGBRow_Any_SSE2(const uint8_t* src_argb,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
uint8_t* dst_a,
int width);
void SplitARGBRow_Any_SSSE3(const uint8_t* src_argb,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
uint8_t* dst_a,
int width);
void SplitARGBRow_Any_AVX2(const uint8_t* src_argb,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
uint8_t* dst_a,
int width);
void SplitARGBRow_Any_NEON(const uint8_t* src_argb,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
uint8_t* dst_a,
int width);
void MergeXRGBRow_C(const uint8_t* src_r,
const uint8_t* src_g,
const uint8_t* src_b,
uint8_t* dst_argb,
int width);
void MergeXRGBRow_SSE2(const uint8_t* src_r,
const uint8_t* src_g,
const uint8_t* src_b,
uint8_t* dst_argb,
int width);
void MergeXRGBRow_AVX2(const uint8_t* src_r,
const uint8_t* src_g,
const uint8_t* src_b,
uint8_t* dst_argb,
int width);
void MergeXRGBRow_NEON(const uint8_t* src_r,
const uint8_t* src_g,
const uint8_t* src_b,
uint8_t* dst_argb,
int width);
void MergeXRGBRow_Any_SSE2(const uint8_t* src_r,
const uint8_t* src_g,
const uint8_t* src_b,
uint8_t* dst_argb,
int width);
void MergeXRGBRow_Any_AVX2(const uint8_t* src_r,
const uint8_t* src_g,
const uint8_t* src_b,
uint8_t* dst_argb,
int width);
void MergeXRGBRow_Any_NEON(const uint8_t* src_r,
const uint8_t* src_g,
const uint8_t* src_b,
uint8_t* dst_argb,
int width);
void SplitXRGBRow_C(const uint8_t* src_argb,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
int width);
void SplitXRGBRow_SSE2(const uint8_t* src_argb,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
int width);
void SplitXRGBRow_SSSE3(const uint8_t* src_argb,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
int width);
void SplitXRGBRow_AVX2(const uint8_t* src_argb,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
int width);
void SplitXRGBRow_NEON(const uint8_t* src_argb,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
int width);
void SplitXRGBRow_Any_SSE2(const uint8_t* src_argb,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
int width);
void SplitXRGBRow_Any_SSSE3(const uint8_t* src_argb,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
int width);
void SplitXRGBRow_Any_AVX2(const uint8_t* src_argb,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
int width);
void SplitXRGBRow_Any_NEON(const uint8_t* src_argb,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
int width);
void MergeUVRow_16_C(const uint16_t* src_u,
const uint16_t* src_v,


@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1773
#endif // INCLUDE_LIBYUV_VERSION_H_


@@ -772,6 +772,270 @@ void MergeRGBPlane(const uint8_t* src_r,
}
}
LIBYUV_API
void SplitARGBPlane(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_r,
int dst_stride_r,
uint8_t* dst_g,
int dst_stride_g,
uint8_t* dst_b,
int dst_stride_b,
uint8_t* dst_a,
int dst_stride_a,
int width,
int height) {
int y;
void (*SplitARGBRow)(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g,
uint8_t* dst_b, uint8_t* dst_a, int width) =
SplitARGBRow_C;
void (*SplitXRGBRow)(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g,
uint8_t* dst_b, int width) = SplitXRGBRow_C;
if (dst_a == NULL) {
// Negative height means invert the image.
if (height < 0) {
height = -height;
dst_r = dst_r + (height - 1) * dst_stride_r;
dst_g = dst_g + (height - 1) * dst_stride_g;
dst_b = dst_b + (height - 1) * dst_stride_b;
dst_stride_r = -dst_stride_r;
dst_stride_g = -dst_stride_g;
dst_stride_b = -dst_stride_b;
}
// Coalesce rows.
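// If all planes are stored contiguously (stride == row width), the
// whole image can be processed as one long row.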
if (src_stride_argb == width * 4 && dst_stride_r == width &&
dst_stride_g == width && dst_stride_b == width) {
width *= height;
height = 1;
src_stride_argb = dst_stride_r = dst_stride_g = dst_stride_b =
dst_stride_a = 0;
}
#if defined(HAS_SPLITARGBROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
SplitXRGBRow = SplitXRGBRow_Any_SSE2;
if (IS_ALIGNED(width, 8)) {
SplitXRGBRow = SplitXRGBRow_SSE2;
}
}
#endif
#if defined(HAS_SPLITARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
SplitXRGBRow = SplitXRGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
SplitXRGBRow = SplitXRGBRow_SSSE3;
}
}
#endif
#if defined(HAS_SPLITARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
SplitXRGBRow = SplitXRGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
SplitXRGBRow = SplitXRGBRow_AVX2;
}
}
#endif
#if defined(HAS_SPLITARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
SplitXRGBRow = SplitXRGBRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
SplitXRGBRow = SplitXRGBRow_NEON;
}
}
#endif
for (y = 0; y < height; ++y) {
SplitXRGBRow(src_argb, dst_r, dst_g, dst_b, width);
dst_r += dst_stride_r;
dst_g += dst_stride_g;
dst_b += dst_stride_b;
src_argb += src_stride_argb;
}
} else {
if (height < 0) {
height = -height;
dst_r = dst_r + (height - 1) * dst_stride_r;
dst_g = dst_g + (height - 1) * dst_stride_g;
dst_b = dst_b + (height - 1) * dst_stride_b;
dst_a = dst_a + (height - 1) * dst_stride_a;
dst_stride_r = -dst_stride_r;
dst_stride_g = -dst_stride_g;
dst_stride_b = -dst_stride_b;
dst_stride_a = -dst_stride_a;
}
if (src_stride_argb == width * 4 && dst_stride_r == width &&
dst_stride_g == width && dst_stride_b == width &&
dst_stride_a == width) {
width *= height;
height = 1;
src_stride_argb = dst_stride_r = dst_stride_g = dst_stride_b =
dst_stride_a = 0;
}
#if defined(HAS_SPLITARGBROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
SplitARGBRow = SplitARGBRow_Any_SSE2;
if (IS_ALIGNED(width, 8)) {
SplitARGBRow = SplitARGBRow_SSE2;
}
}
#endif
#if defined(HAS_SPLITARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
SplitARGBRow = SplitARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
SplitARGBRow = SplitARGBRow_SSSE3;
}
}
#endif
#if defined(HAS_SPLITARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
SplitARGBRow = SplitARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
SplitARGBRow = SplitARGBRow_AVX2;
}
}
#endif
#if defined(HAS_SPLITARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
SplitARGBRow = SplitARGBRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
SplitARGBRow = SplitARGBRow_NEON;
}
}
#endif
for (y = 0; y < height; ++y) {
SplitARGBRow(src_argb, dst_r, dst_g, dst_b, dst_a, width);
dst_r += dst_stride_r;
dst_g += dst_stride_g;
dst_b += dst_stride_b;
dst_a += dst_stride_a;
src_argb += src_stride_argb;
}
}
}
LIBYUV_API
void MergeARGBPlane(const uint8_t* src_r,
int src_stride_r,
const uint8_t* src_g,
int src_stride_g,
const uint8_t* src_b,
int src_stride_b,
const uint8_t* src_a,
int src_stride_a,
uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height) {
int y;
void (*MergeARGBRow)(const uint8_t* src_r, const uint8_t* src_g,
const uint8_t* src_b, const uint8_t* src_a,
uint8_t* dst_argb, int width) = MergeARGBRow_C;
void (*MergeXRGBRow)(const uint8_t* src_r, const uint8_t* src_g,
const uint8_t* src_b, uint8_t* dst_argb, int width) =
MergeXRGBRow_C;
if (src_a == NULL) {
// Negative height means invert the image.
if (height < 0) {
height = -height;
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
}
// Coalesce rows.
if (src_stride_r == width && src_stride_g == width &&
src_stride_b == width && dst_stride_argb == width * 4) {
width *= height;
height = 1;
src_stride_r = src_stride_g = src_stride_b = dst_stride_argb = 0;
}
#if defined(HAS_MERGEARGBROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
MergeXRGBRow = MergeXRGBRow_Any_SSE2;
if (IS_ALIGNED(width, 8)) {
MergeXRGBRow = MergeXRGBRow_SSE2;
}
}
#endif
#if defined(HAS_MERGEARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
MergeXRGBRow = MergeXRGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
MergeXRGBRow = MergeXRGBRow_AVX2;
}
}
#endif
#if defined(HAS_MERGEARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
MergeXRGBRow = MergeXRGBRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
MergeXRGBRow = MergeXRGBRow_NEON;
}
}
#endif
for (y = 0; y < height; ++y) {
MergeXRGBRow(src_r, src_g, src_b, dst_argb, width);
src_r += src_stride_r;
src_g += src_stride_g;
src_b += src_stride_b;
dst_argb += dst_stride_argb;
}
} else {
if (height < 0) {
height = -height;
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
}
if (src_stride_r == width && src_stride_g == width &&
src_stride_b == width && src_stride_a == width &&
dst_stride_argb == width * 4) {
width *= height;
height = 1;
src_stride_r = src_stride_g = src_stride_b = src_stride_a =
dst_stride_argb = 0;
}
#if defined(HAS_MERGEARGBROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
MergeARGBRow = MergeARGBRow_Any_SSE2;
if (IS_ALIGNED(width, 8)) {
MergeARGBRow = MergeARGBRow_SSE2;
}
}
#endif
#if defined(HAS_MERGEARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
MergeARGBRow = MergeARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
MergeARGBRow = MergeARGBRow_AVX2;
}
}
#endif
#if defined(HAS_MERGEARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
MergeARGBRow = MergeARGBRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
MergeARGBRow = MergeARGBRow_NEON;
}
}
#endif
for (y = 0; y < height; ++y) {
MergeARGBRow(src_r, src_g, src_b, src_a, dst_argb, width);
src_r += src_stride_r;
src_g += src_stride_g;
src_b += src_stride_b;
dst_argb += dst_stride_argb;
}
}
}
// Convert YUY2 to I422.
LIBYUV_API
int YUY2ToI422(const uint8_t* src_yuy2,


@@ -30,6 +30,37 @@ extern "C" {
// Subsampled source needs to be increased by 1 if not even.
#define SS(width, shift) (((width) + (1 << (shift)) - 1) >> (shift))
// Any 4 planes to 1
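// The SIMD kernel handles the largest multiple of (MASK + 1) pixels;
// the remaining 1..MASK pixels are copied into 64-byte-spaced temp
// planes, run through the kernel once at width MASK + 1, and the
// result copied back out.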
#define ANY41(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \
void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \
const uint8_t* v_buf, const uint8_t* a_buf, uint8_t* dst_ptr, \
int width) { \
SIMD_ALIGNED(uint8_t temp[64 * 5]); \
memset(temp, 0, 64 * 4); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, n); \
} \
memcpy(temp, y_buf + n, r); \
memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
memcpy(temp + 192, a_buf + n, r); \
ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, temp + 256, MASK + 1); \
memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 256, \
SS(r, DUVSHIFT) * BPP); \
}
#ifdef HAS_MERGEARGBROW_SSE2
ANY41(MergeARGBRow_Any_SSE2, MergeARGBRow_SSE2, 0, 0, 4, 7)
#endif
#ifdef HAS_MERGEARGBROW_AVX2
ANY41(MergeARGBRow_Any_AVX2, MergeARGBRow_AVX2, 0, 0, 4, 15)
#endif
#ifdef HAS_MERGEARGBROW_NEON
ANY41(MergeARGBRow_Any_NEON, MergeARGBRow_NEON, 0, 0, 4, 15)
#endif
// Any 4 planes to 1 with yuvconstants
#define ANY41C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \
void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \
@@ -113,6 +144,15 @@ ANY31(MergeRGBRow_Any_NEON, MergeRGBRow_NEON, 0, 0, 3, 15)
#ifdef HAS_MERGERGBROW_MMI
ANY31(MergeRGBRow_Any_MMI, MergeRGBRow_MMI, 0, 0, 3, 7)
#endif
#endif #endif
#ifdef HAS_MERGEARGBROW_SSE2
ANY31(MergeXRGBRow_Any_SSE2, MergeXRGBRow_SSE2, 0, 0, 4, 7)
#endif
#ifdef HAS_MERGEARGBROW_AVX2
ANY31(MergeXRGBRow_Any_AVX2, MergeXRGBRow_AVX2, 0, 0, 4, 15)
#endif
#ifdef HAS_MERGEARGBROW_NEON
ANY31(MergeXRGBRow_Any_NEON, MergeXRGBRow_NEON, 0, 0, 4, 15)
#endif
#ifdef HAS_I422TOYUY2ROW_SSE2
ANY31(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, 1, 1, 4, 15)
ANY31(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, 1, 1, 4, 15)
@@ -1382,6 +1422,51 @@ ANY13(SplitRGBRow_Any_NEON, SplitRGBRow_NEON, 3, 15)
#ifdef HAS_SPLITRGBROW_MMI
ANY13(SplitRGBRow_Any_MMI, SplitRGBRow_MMI, 3, 3)
#endif
#endif #endif
#ifdef HAS_SPLITARGBROW_SSE2
ANY13(SplitXRGBRow_Any_SSE2, SplitXRGBRow_SSE2, 4, 7)
#endif
#ifdef HAS_SPLITARGBROW_SSSE3
ANY13(SplitXRGBRow_Any_SSSE3, SplitXRGBRow_SSSE3, 4, 7)
#endif
#ifdef HAS_SPLITARGBROW_AVX2
ANY13(SplitXRGBRow_Any_AVX2, SplitXRGBRow_AVX2, 4, 15)
#endif
#ifdef HAS_SPLITARGBROW_NEON
ANY13(SplitXRGBRow_Any_NEON, SplitXRGBRow_NEON, 4, 15)
#endif
// Any 1 to 4. Outputs ARGB planes.
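// Remainder handling mirrors ANY41 above: leftover pixels go through
// the kernel once via 16-byte-spaced temp planes before being copied
// to the four destination planes.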
#define ANY14(NAMEANY, ANY_SIMD, BPP, MASK) \
void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, \
uint8_t* dst_b, uint8_t* dst_a, int width) { \
SIMD_ALIGNED(uint8_t temp[16 * 8]); \
memset(temp, 0, 16 * 4); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(src_ptr, dst_r, dst_g, dst_b, dst_a, n); \
} \
memcpy(temp, src_ptr + n * BPP, r * BPP); \
ANY_SIMD(temp, temp + 16 * 4, temp + 16 * 5, temp + 16 * 6, temp + 16 * 7, \
MASK + 1); \
memcpy(dst_r + n, temp + 16 * 4, r); \
memcpy(dst_g + n, temp + 16 * 5, r); \
memcpy(dst_b + n, temp + 16 * 6, r); \
memcpy(dst_a + n, temp + 16 * 7, r); \
}
#ifdef HAS_SPLITARGBROW_SSE2
ANY14(SplitARGBRow_Any_SSE2, SplitARGBRow_SSE2, 4, 7)
#endif
#ifdef HAS_SPLITARGBROW_SSSE3
ANY14(SplitARGBRow_Any_SSSE3, SplitARGBRow_SSSE3, 4, 7)
#endif
#ifdef HAS_SPLITARGBROW_AVX2
ANY14(SplitARGBRow_Any_AVX2, SplitARGBRow_AVX2, 4, 15)
#endif
#ifdef HAS_SPLITARGBROW_NEON
ANY14(SplitARGBRow_Any_NEON, SplitARGBRow_NEON, 4, 15)
#endif
// Any 1 to 2 with source stride (2 rows of source). Outputs UV planes.
// 128 byte row allows for 32 avx ARGB pixels.


@@ -2476,6 +2476,67 @@ void MergeRGBRow_C(const uint8_t* src_r,
}
}
void SplitARGBRow_C(const uint8_t* src_argb,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
uint8_t* dst_a,
int width) {
int x;
for (x = 0; x < width; ++x) {
dst_b[x] = src_argb[0];
dst_g[x] = src_argb[1];
dst_r[x] = src_argb[2];
dst_a[x] = src_argb[3];
src_argb += 4;
}
}
void MergeARGBRow_C(const uint8_t* src_r,
const uint8_t* src_g,
const uint8_t* src_b,
const uint8_t* src_a,
uint8_t* dst_argb,
int width) {
int x;
for (x = 0; x < width; ++x) {
dst_argb[0] = src_b[x];
dst_argb[1] = src_g[x];
dst_argb[2] = src_r[x];
dst_argb[3] = src_a[x];
dst_argb += 4;
}
}
void SplitXRGBRow_C(const uint8_t* src_argb,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
int width) {
int x;
for (x = 0; x < width; ++x) {
dst_b[x] = src_argb[0];
dst_g[x] = src_argb[1];
dst_r[x] = src_argb[2];
src_argb += 4;
}
}
void MergeXRGBRow_C(const uint8_t* src_r,
const uint8_t* src_g,
const uint8_t* src_b,
uint8_t* dst_argb,
int width) {
int x;
for (x = 0; x < width; ++x) {
dst_argb[0] = src_b[x];
dst_argb[1] = src_g[x];
dst_argb[2] = src_r[x];
dst_argb[3] = 255;
dst_argb += 4;
}
}
// Use scale to convert lsb formats to msb, depending on how many bits there are:
// 128 = 9 bits
// 64 = 10 bits


@@ -4075,6 +4075,446 @@ void MergeRGBRow_SSSE3(const uint8_t* src_r,
}
#endif // HAS_MERGERGBROW_SSSE3
#ifdef HAS_MERGEARGBROW_SSE2
void MergeARGBRow_SSE2(const uint8_t* src_r,
const uint8_t* src_g,
const uint8_t* src_b,
const uint8_t* src_a,
uint8_t* dst_argb,
int width) {
asm volatile(
"sub %0,%1 \n"
"sub %0,%2 \n"
"sub %0,%3 \n"
LABELALIGN
"1: \n"
"movq (%0,%2),%%xmm0 \n" // B
"movq (%0),%%xmm1 \n" // R
"movq (%0,%1),%%xmm2 \n" // G
"punpcklbw %%xmm1,%%xmm0 \n" // BR
"movq (%0,%3),%%xmm1 \n" // A
"punpcklbw %%xmm1,%%xmm2 \n" // GA
"movdqa %%xmm0,%%xmm1 \n" // BR
"punpckhbw %%xmm2,%%xmm1 \n" // BGRA (hi)
"punpcklbw %%xmm2,%%xmm0 \n" // BGRA (lo)
"movdqu %%xmm0,(%4) \n"
"movdqu %%xmm1,16(%4) \n"
"lea 8(%0),%0 \n"
"lea 32(%4),%4 \n"
"sub $0x8,%5 \n"
"jg 1b \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
"+r"(src_b), // %2
"+r"(src_a), // %3
"+r"(dst_argb), // %4
"+r"(width) // %5
:
: "memory", "cc", "xmm0", "xmm1", "xmm2");
}
void MergeXRGBRow_SSE2(const uint8_t* src_r,
const uint8_t* src_g,
const uint8_t* src_b,
uint8_t* dst_argb,
int width) {
asm volatile(
LABELALIGN
"1: \n"
"movq (%2),%%xmm0 \n" // B
"movq (%0),%%xmm1 \n" // R
"movq (%1),%%xmm2 \n" // G
"punpcklbw %%xmm1,%%xmm0 \n" // BR
"pcmpeqd %%xmm1,%%xmm1 \n" // A(255)
"punpcklbw %%xmm1,%%xmm2 \n" // GA
"movdqa %%xmm0,%%xmm1 \n" // BR
"punpckhbw %%xmm2,%%xmm1 \n" // BGRA (hi)
"punpcklbw %%xmm2,%%xmm0 \n" // BGRA (lo)
"movdqu %%xmm0,(%3) \n"
"movdqu %%xmm1,16(%3) \n"
"lea 8(%0),%0 \n"
"lea 8(%1),%1 \n"
"lea 8(%2),%2 \n"
"lea 32(%3),%3 \n"
"sub $0x8,%4 \n"
"jg 1b \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
"+r"(src_b), // %2
"+r"(dst_argb), // %3
"+r"(width) // %4
:
: "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif // HAS_MERGEARGBROW_SSE2
#ifdef HAS_MERGEARGBROW_AVX2
void MergeARGBRow_AVX2(const uint8_t* src_r,
const uint8_t* src_g,
const uint8_t* src_b,
const uint8_t* src_a,
uint8_t* dst_argb,
int width) {
asm volatile(
"sub %0,%1 \n"
"sub %0,%2 \n"
"sub %0,%3 \n"
LABELALIGN
"1: \n"
"vmovdqu (%0,%2),%%xmm0 \n" // B
"vmovdqu (%0,%1),%%xmm1 \n" // R
"vinserti128 $1,(%0),%%ymm0,%%ymm0 \n" // G
"vinserti128 $1,(%0,%3),%%ymm1,%%ymm1 \n" // A
"vpunpckhbw %%ymm1,%%ymm0,%%ymm2 \n"
"vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n"
"vperm2i128 $0x31,%%ymm2,%%ymm0,%%ymm1 \n"
"vperm2i128 $0x20,%%ymm2,%%ymm0,%%ymm0 \n"
"vpunpckhwd %%ymm1,%%ymm0,%%ymm2 \n"
"vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n"
"vperm2i128 $0x31,%%ymm2,%%ymm0,%%ymm1 \n"
"vperm2i128 $0x20,%%ymm2,%%ymm0,%%ymm0 \n"
"vmovdqu %%ymm0,(%4) \n" // First 8
"vmovdqu %%ymm1,32(%4) \n" // Next 8
"lea 16(%0),%0 \n"
"lea 64(%4),%4 \n"
"sub $0x10,%5 \n"
"jg 1b \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
"+r"(src_b), // %2
"+r"(src_a), // %3
"+r"(dst_argb), // %4
"+r"(width) // %5
:
: "memory", "cc", "xmm0", "xmm1", "xmm2");
}
void MergeXRGBRow_AVX2(const uint8_t* src_r,
const uint8_t* src_g,
const uint8_t* src_b,
uint8_t* dst_argb,
int width) {
asm volatile(
LABELALIGN
"1: \n"
"vmovdqu (%2),%%xmm0 \n" // B
"vpcmpeqd %%ymm1,%%ymm1,%%ymm1 \n" // A(255)
"vinserti128 $0,(%1),%%ymm1,%%ymm1 \n" // R
"vinserti128 $1,(%0),%%ymm0,%%ymm0 \n" // G
"vpunpckhbw %%ymm1,%%ymm0,%%ymm2 \n"
"vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n"
"vperm2i128 $0x31,%%ymm2,%%ymm0,%%ymm1 \n"
"vperm2i128 $0x20,%%ymm2,%%ymm0,%%ymm0 \n"
"vpunpckhwd %%ymm1,%%ymm0,%%ymm2 \n"
"vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n"
"vperm2i128 $0x31,%%ymm2,%%ymm0,%%ymm1 \n"
"vperm2i128 $0x20,%%ymm2,%%ymm0,%%ymm0 \n"
"vmovdqu %%ymm0,(%3) \n" // First 8
"vmovdqu %%ymm1,32(%3) \n" // Next 8
"lea 16(%0),%0 \n"
"lea 16(%1),%1 \n"
"lea 16(%2),%2 \n"
"lea 64(%3),%3 \n"
"sub $0x10,%4 \n"
"jg 1b \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
"+r"(src_b), // %2
"+r"(dst_argb), // %3
"+r"(width) // %4
:
: "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif // HAS_MERGEARGBROW_AVX2
#ifdef HAS_SPLITARGBROW_SSE2
void SplitARGBRow_SSE2(const uint8_t* src_argb,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
uint8_t* dst_a,
int width) {
asm volatile(
"subq %1,%2 \n"
"subq %1,%3 \n"
"subq %1,%4 \n"
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n" // 00-0F
"movdqu 16(%0),%%xmm1 \n" // 10-1F
"movdqa %%xmm0,%%xmm2 \n"
"punpcklqdq %%xmm1,%%xmm0 \n" // 00-07 10-17
"punpckhqdq %%xmm1,%%xmm2 \n" // 08-0F 18-1F
"movdqa %%xmm0,%%xmm1 \n"
"punpcklbw %%xmm2,%%xmm0 \n" // 08192A3B4C5D6E7F (lo)
"punpckhbw %%xmm2,%%xmm1 \n" // 08192A3B4C5D6E7F (hi)
"movdqa %%xmm0,%%xmm2 \n"
"punpcklqdq %%xmm1,%%xmm0 \n" // 08192A3B08192A3B
"punpckhqdq %%xmm1,%%xmm2 \n" // 4C5D6E7F4C5D6E7F
"movdqa %%xmm0,%%xmm1 \n"
"punpcklbw %%xmm2,%%xmm0 \n" // 048C159D26AE37BF (lo)
"punpckhbw %%xmm2,%%xmm1 \n" // 048C159D26AE37BF (hi)
"movdqa %%xmm0,%%xmm2 \n"
"punpckldq %%xmm1,%%xmm0 \n" // 048C048C159D159D (BG)
"punpckhdq %%xmm1,%%xmm2 \n" // 26AE26AE37BF37BF (RA)
"movlps %%xmm0,(%1,%3) \n" // B
"movhps %%xmm0,(%1,%2) \n" // G
"movlps %%xmm2,(%1) \n" // R
"movhps %%xmm2,(%1,%4) \n" // A
"lea 32(%0),%0 \n"
"lea 8(%1),%1 \n"
"sub $0x8,%5 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_r), // %1
"+r"(dst_g), // %2
"+r"(dst_b), // %3
"+r"(dst_a), // %4
"+r"(width) // %5
:
: "memory", "cc", "xmm0", "xmm1", "xmm2");
}
void SplitXRGBRow_SSE2(const uint8_t* src_argb,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
int width) {
asm volatile(
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n" // 00-0F
"movdqu 16(%0),%%xmm1 \n" // 10-1F
"movdqa %%xmm0,%%xmm2 \n"
"punpcklqdq %%xmm1,%%xmm0 \n" // 00-07 10-17
"punpckhqdq %%xmm1,%%xmm2 \n" // 08-0F 18-1F
"movdqa %%xmm0,%%xmm1 \n"
"punpcklbw %%xmm2,%%xmm0 \n" // 08192A3B4C5D6E7F (lo)
"punpckhbw %%xmm2,%%xmm1 \n" // 08192A3B4C5D6E7F (hi)
"movdqa %%xmm0,%%xmm2 \n"
"punpcklqdq %%xmm1,%%xmm0 \n" // 08192A3B08192A3B
"punpckhqdq %%xmm1,%%xmm2 \n" // 4C5D6E7F4C5D6E7F
"movdqa %%xmm0,%%xmm1 \n"
"punpcklbw %%xmm2,%%xmm0 \n" // 048C159D26AE37BF (lo)
"punpckhbw %%xmm2,%%xmm1 \n" // 048C159D26AE37BF (hi)
"movdqa %%xmm0,%%xmm2 \n"
"punpckldq %%xmm1,%%xmm0 \n" // 048C048C159D159D (BG)
"punpckhdq %%xmm1,%%xmm2 \n" // 26AE26AE37BF37BF (RA)
"movlps %%xmm0,(%3) \n" // B
"movhps %%xmm0,(%2) \n" // G
"movlps %%xmm2,(%1) \n" // R
"lea 32(%0),%0 \n"
"lea 8(%1),%1 \n"
"lea 8(%2),%2 \n"
"lea 8(%3),%3 \n"
"sub $0x8,%4 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_r), // %1
"+r"(dst_g), // %2
"+r"(dst_b), // %3
"+r"(width) // %4
:
: "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif
#ifdef HAS_SPLITARGBROW_SSSE3
static const uvec8 kShuffleMaskARGBSplit = {0u, 4u, 8u, 12u, 1u, 5u, 9u, 13u,
2u, 6u, 10u, 14u, 3u, 7u, 11u, 15u};
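// pshufb with this mask gathers the four bytes of each channel from a
// block of 4 BGRA pixels into one 32-bit lane: BBBBGGGGRRRRAAAA.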
void SplitARGBRow_SSSE3(const uint8_t* src_argb,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
uint8_t* dst_a,
int width) {
asm volatile(
"subq %1,%2 \n"
"subq %1,%3 \n"
"subq %1,%4 \n"
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n" // 00-0F
"movdqu 16(%0),%%xmm1 \n" // 10-1F
"pshufb %6,%%xmm0 \n" // 048C159D26AE37BF (lo)
"pshufb %6,%%xmm1 \n" // 048C159D26AE37BF (hi)
"movdqa %%xmm0,%%xmm2 \n"
"punpckldq %%xmm1,%%xmm0 \n" // 048C048C159D159D (BG)
"punpckhdq %%xmm1,%%xmm2 \n" // 26AE26AE37BF37BF (RA)
"movlps %%xmm0,(%1,%3) \n" // B
"movhps %%xmm0,(%1,%2) \n" // G
"movlps %%xmm2,(%1) \n" // R
"movhps %%xmm2,(%1,%4) \n" // A
"lea 32(%0),%0 \n"
"lea 8(%1),%1 \n"
"sub $0x8,%5 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_r), // %1
"+r"(dst_g), // %2
"+r"(dst_b), // %3
"+r"(dst_a), // %4
"+r"(width) // %5
: "m"(kShuffleMaskARGBSplit) // %6
: "memory", "cc", "xmm0", "xmm1", "xmm2");
}
void SplitXRGBRow_SSSE3(const uint8_t* src_argb,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
int width) {
asm volatile(
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n" // 00-0F
"movdqu 16(%0),%%xmm1 \n" // 10-1F
"pshufb %5,%%xmm0 \n" // 048C159D26AE37BF (lo)
"pshufb %5,%%xmm1 \n" // 048C159D26AE37BF (hi)
"movdqa %%xmm0,%%xmm2 \n"
"punpckldq %%xmm1,%%xmm0 \n" // 048C048C159D159D (BG)
"punpckhdq %%xmm1,%%xmm2 \n" // 26AE26AE37BF37BF (RA)
"movlps %%xmm0,(%3) \n" // B
"movhps %%xmm0,(%2) \n" // G
"movlps %%xmm2,(%1) \n" // R
"lea 32(%0),%0 \n"
"lea 8(%1),%1 \n"
"lea 8(%2),%2 \n"
"lea 8(%3),%3 \n"
"sub $0x8,%4 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_r), // %1
"+r"(dst_g), // %2
"+r"(dst_b), // %3
"+r"(width) // %4
: "m"(kShuffleMaskARGBSplit) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif
#ifdef HAS_SPLITARGBROW_AVX2
static const lvec8 kShuffleMaskARGBSplit_AVX2 = {
0u, 4u, 8u, 12u, 1u, 5u, 9u, 13u, 2u, 6u, 10u, 14u, 3u, 7u, 11u, 15u,
0u, 4u, 8u, 12u, 1u, 5u, 9u, 13u, 2u, 6u, 10u, 14u, 3u, 7u, 11u, 15u};
static const ulvec32 kShuffleMaskARGBPermute_AVX2 = {0u, 4u, 1u, 5u,
2u, 6u, 3u, 7u};
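// After vpshufb groups channel bytes within each 128-bit lane, vpermd
// with this pattern pairs matching channel dwords across lanes, so a
// single dword unpack yields complete 16-pixel B/G/R/A rows.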
void SplitARGBRow_AVX2(const uint8_t* src_argb,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
uint8_t* dst_a,
int width) {
asm volatile(
"subq %1,%2 \n"
"subq %1,%3 \n"
"subq %1,%4 \n"
"vmovdqu %7,%%ymm3 \n"
LABELALIGN
"1: \n"
"vmovdqu (%0),%%xmm0 \n" // 00-0F
"vmovdqu 16(%0),%%xmm1 \n" // 10-1F
"vinserti128 $1,32(%0),%%ymm0,%%ymm0 \n" // 00-0F 20-2F
"vinserti128 $1,48(%0),%%ymm1,%%ymm1 \n" // 10-1F 30-3F
"vpshufb %6,%%ymm0,%%ymm0 \n"
"vpshufb %6,%%ymm1,%%ymm1 \n"
"vpermd %%ymm0,%%ymm3,%%ymm0 \n"
"vpermd %%ymm1,%%ymm3,%%ymm1 \n"
"vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n" // GA
"vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n" // BR
"vmovdqu %%xmm0,(%1,%3) \n" // B
"vextracti128 $1,%%ymm0,(%1) \n" // R
"vmovdqu %%xmm2,(%1,%2) \n" // G
"vextracti128 $1,%%ymm2,(%1,%4) \n" // A
"lea 64(%0),%0 \n"
"lea 16(%1),%1 \n"
"sub $0x10,%5 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_r), // %1
"+r"(dst_g), // %2
"+r"(dst_b), // %3
"+r"(dst_a), // %4
"+r"(width) // %5
: "m"(kShuffleMaskARGBSplit_AVX2), // %6
"m"(kShuffleMaskARGBPermute_AVX2) // %7
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
}
void SplitXRGBRow_AVX2(const uint8_t* src_argb,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
int width) {
asm volatile(
"vmovdqu %6,%%ymm3 \n" LABELALIGN
"1: \n"
"vmovdqu (%0),%%xmm0 \n" // 00-0F
"vmovdqu 16(%0),%%xmm1 \n" // 10-1F
"vinserti128 $1,32(%0),%%ymm0,%%ymm0 \n" // 00-0F 20-2F
"vinserti128 $1,48(%0),%%ymm1,%%ymm1 \n" // 10-1F 30-3F
"vpshufb %5,%%ymm0,%%ymm0 \n"
"vpshufb %5,%%ymm1,%%ymm1 \n"
"vpermd %%ymm0,%%ymm3,%%ymm0 \n"
"vpermd %%ymm1,%%ymm3,%%ymm1 \n"
"vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n" // GA
"vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n" // BR
"vmovdqu %%xmm0,(%3) \n" // B
"vextracti128 $1,%%ymm0,(%1) \n" // R
"vmovdqu %%xmm2,(%2) \n" // G
"lea 64(%0),%0 \n"
"lea 16(%1),%1 \n"
"lea 16(%2),%2 \n"
"lea 16(%3),%3 \n"
"sub $0x10,%4 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_r), // %1
"+r"(dst_g), // %2
"+r"(dst_b), // %3
"+r"(width) // %4
: "m"(kShuffleMaskARGBSplit_AVX2), // %5
"m"(kShuffleMaskARGBPermute_AVX2) // %6
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
}
#endif
#ifdef HAS_COPYROW_SSE2
void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(


@@ -666,6 +666,113 @@ void MergeRGBRow_NEON(const uint8_t* src_r,
);
}
// Reads 16 packed ARGB and writes to planar dst_r, dst_g, dst_b, dst_a.
void SplitARGBRow_NEON(const uint8_t* src_argb,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
uint8_t* dst_a,
int width) {
asm volatile(
"1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // next 8 ARGB
"subs %5, %5, #16 \n" // 16 processed per loop
"vst1.8 {q0}, [%3]! \n" // store B
"vst1.8 {q1}, [%2]! \n" // store G
"vst1.8 {q2}, [%1]! \n" // store R
"vst1.8 {q3}, [%4]! \n" // store A
"bgt 1b \n"
: "+r"(src_rgba), // %0
"+r"(dst_r), // %1
"+r"(dst_g), // %2
"+r"(dst_b), // %3
"+r"(dst_a), // %4
"+r"(width) // %5
: // Input registers
: "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
);
}
// Reads 16 planar R's, G's, B's and A's and writes out 16 packed ARGB at a time
void MergeARGBRow_NEON(const uint8_t* src_r,
const uint8_t* src_g,
const uint8_t* src_b,
const uint8_t* src_a,
uint8_t* dst_argb,
int width) {
asm volatile(
"1: \n"
"vld1.8 {q2}, [%0]! \n" // load R
"vld1.8 {q1}, [%1]! \n" // load G
"vld1.8 {q0}, [%2]! \n" // load B
"vld1.8 {q3}, [%3]! \n" // load A
"subs %5, %5, #16 \n" // 16 processed per loop
"vst4.8 {d0, d2, d4, d6}, [%4]! \n" // store 8 ARGB
"vst4.8 {d1, d3, d5, d7}, [%4]! \n" // next 8 ARGB
"bgt 1b \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
"+r"(src_b), // %2
"+r"(src_a), // %3
"+r"(dst_argb), // %4
"+r"(width) // %5
: // Input registers
: "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
);
}
// Reads 16 packed ARGB and writes to planar dst_r, dst_g, dst_b.
void SplitXRGBRow_NEON(const uint8_t* src_argb,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
int width) {
asm volatile(
"1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // next 8 ARGB
"subs %4, %4, #16 \n" // 16 processed per loop
"vst1.8 {q0}, [%3]! \n" // store B
"vst1.8 {q1}, [%2]! \n" // store G
"vst1.8 {q2}, [%1]! \n" // store R
"bgt 1b \n"
: "+r"(src_rgba), // %0
"+r"(dst_r), // %1
"+r"(dst_g), // %2
"+r"(dst_b), // %3
"+r"(width) // %4
: // Input registers
: "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
);
}
// Reads 16 planar R's, G's and B's and writes out 16 packed ARGB at a time
void MergeXRGBRow_NEON(const uint8_t* src_r,
const uint8_t* src_g,
const uint8_t* src_b,
uint8_t* dst_argb,
int width) {
asm volatile(
"vmov.u8 q3, #255 \n" // load A(255)
"1: \n"
"vld1.8 {q2}, [%0]! \n" // load R
"vld1.8 {q1}, [%1]! \n" // load G
"vld1.8 {q0}, [%2]! \n" // load B
"subs %4, %4, #16 \n" // 16 processed per loop
"vst4.8 {d0, d2, d4, d6}, [%4]! \n" // store 8 ARGB
"vst4.8 {d1, d3, d5, d7}, [%4]! \n" // next 8 ARGB
"bgt 1b \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
"+r"(src_b), // %2
"+r"(dst_argb), // %3
"+r"(width) // %4
: // Input registers
: "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
);
}
// Copy multiple of 32. vld4.8 allows unaligned and is fastest on a15.
void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(


@@ -763,6 +763,118 @@ void MergeRGBRow_NEON(const uint8_t* src_r,
);
}
// Reads 16 packed ARGB and writes to planar dst_r, dst_g, dst_b, dst_a.
void SplitARGBRow_NEON(const uint8_t* src_rgba,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
uint8_t* dst_a,
int width) {
asm volatile(
"1: \n"
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB
"prfm pldl1keep, [%0, 448] \n"
"subs %w5, %w5, #16 \n" // 16 processed per loop
"st1 {v0.16b}, [%3], #16 \n" // store B
"st1 {v1.16b}, [%2], #16 \n" // store G
"st1 {v2.16b}, [%1], #16 \n" // store R
"st1 {v3.16b}, [%4], #16 \n" // store A
"b.gt 1b \n"
: "+r"(src_rgba), // %0
"+r"(dst_r), // %1
"+r"(dst_g), // %2
"+r"(dst_b), // %3
"+r"(dst_a), // %4
"+r"(width) // %5
: // Input registers
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}
// Reads 16 planar R's, G's, B's and A's and writes out 16 packed ARGB at a time
void MergeARGBRow_NEON(const uint8_t* src_r,
const uint8_t* src_g,
const uint8_t* src_b,
const uint8_t* src_a,
uint8_t* dst_argb,
int width) {
asm volatile(
"1: \n"
"ld1 {v2.16b}, [%0], #16 \n" // load R
"ld1 {v1.16b}, [%1], #16 \n" // load G
"ld1 {v0.16b}, [%2], #16 \n" // load B
"ld1 {v3.16b}, [%3], #16 \n" // load A
"prfm pldl1keep, [%0, 448] \n"
"prfm pldl1keep, [%1, 448] \n"
"prfm pldl1keep, [%2, 448] \n"
"prfm pldl1keep, [%3, 448] \n"
"subs %w5, %w5, #16 \n" // 16 processed per loop
"st4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%4], #64 \n" // store 16ARGB
"b.gt 1b \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
"+r"(src_b), // %2
"+r"(src_a), // %3
"+r"(dst_argb), // %4
"+r"(width) // %5
: // Input registers
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}
// Reads 16 packed ARGB and writes to planar dst_r, dst_g, dst_b.
void SplitXRGBRow_NEON(const uint8_t* src_rgba,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
int width) {
asm volatile(
"1: \n"
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB
"prfm pldl1keep, [%0, 448] \n"
"subs %w4, %w4, #16 \n" // 16 processed per loop
"st1 {v0.16b}, [%3], #16 \n" // store B
"st1 {v1.16b}, [%2], #16 \n" // store G
"st1 {v2.16b}, [%1], #16 \n" // store R
"b.gt 1b \n"
: "+r"(src_rgba), // %0
"+r"(dst_r), // %1
"+r"(dst_g), // %2
"+r"(dst_b), // %3
"+r"(width) // %4
: // Input registers
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}
// Reads 16 planar R's, G's and B's and writes out 16 packed ARGB at a time
void MergeXRGBRow_NEON(const uint8_t* src_r,
const uint8_t* src_g,
const uint8_t* src_b,
uint8_t* dst_argb,
int width) {
asm volatile(
"movi v3.16b, #255 \n" // load A(255)
"1: \n"
"ld1 {v2.16b}, [%0], #16 \n" // load R
"ld1 {v1.16b}, [%1], #16 \n" // load G
"ld1 {v0.16b}, [%2], #16 \n" // load B
"prfm pldl1keep, [%0, 448] \n"
"prfm pldl1keep, [%1, 448] \n"
"prfm pldl1keep, [%2, 448] \n"
"subs %w4, %w4, #16 \n" // 16 processed per loop
"st4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%3], #64 \n" // store 16ARGB
"b.gt 1b \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
"+r"(src_b), // %2
"+r"(dst_argb), // %3
"+r"(width) // %4
: // Input registers
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}
// Copy multiple of 32.
void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(


@@ -2776,6 +2776,217 @@ TEST_F(LibYUVPlanarTest, SplitRGBPlane_Opt) {
free_aligned_buffer_page_end(dst_pixels_c);
}
TEST_F(LibYUVPlanarTest, MergeARGBPlane_Opt) {
// Round count up to multiple of 16
const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15;
align_buffer_page_end(src_pixels, kPixels * 4);
align_buffer_page_end(tmp_pixels_r, kPixels);
align_buffer_page_end(tmp_pixels_g, kPixels);
align_buffer_page_end(tmp_pixels_b, kPixels);
align_buffer_page_end(tmp_pixels_a, kPixels);
align_buffer_page_end(dst_pixels_opt, kPixels * 4);
align_buffer_page_end(dst_pixels_c, kPixels * 4);
MemRandomize(src_pixels, kPixels * 4);
MemRandomize(tmp_pixels_r, kPixels);
MemRandomize(tmp_pixels_g, kPixels);
MemRandomize(tmp_pixels_b, kPixels);
MemRandomize(tmp_pixels_a, kPixels);
MemRandomize(dst_pixels_opt, kPixels * 4);
MemRandomize(dst_pixels_c, kPixels * 4);
MaskCpuFlags(disable_cpu_flags_);
SplitARGBPlane(src_pixels, benchmark_width_ * 4, tmp_pixels_r,
benchmark_width_, tmp_pixels_g, benchmark_width_, tmp_pixels_b,
benchmark_width_, tmp_pixels_a, benchmark_width_,
benchmark_width_, benchmark_height_);
MergeARGBPlane(tmp_pixels_r, benchmark_width_, tmp_pixels_g, benchmark_width_,
tmp_pixels_b, benchmark_width_, tmp_pixels_a, benchmark_width_,
dst_pixels_c, benchmark_width_ * 4, benchmark_width_,
benchmark_height_);
MaskCpuFlags(benchmark_cpu_info_);
SplitARGBPlane(src_pixels, benchmark_width_ * 4, tmp_pixels_r,
benchmark_width_, tmp_pixels_g, benchmark_width_, tmp_pixels_b,
benchmark_width_, tmp_pixels_a, benchmark_width_,
benchmark_width_, benchmark_height_);
for (int i = 0; i < benchmark_iterations_; ++i) {
MergeARGBPlane(tmp_pixels_r, benchmark_width_, tmp_pixels_g,
benchmark_width_, tmp_pixels_b, benchmark_width_,
tmp_pixels_a, benchmark_width_, dst_pixels_opt,
benchmark_width_ * 4, benchmark_width_, benchmark_height_);
}
for (int i = 0; i < kPixels * 4; ++i) {
EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
}
free_aligned_buffer_page_end(src_pixels);
free_aligned_buffer_page_end(tmp_pixels_r);
free_aligned_buffer_page_end(tmp_pixels_g);
free_aligned_buffer_page_end(tmp_pixels_b);
free_aligned_buffer_page_end(tmp_pixels_a);
free_aligned_buffer_page_end(dst_pixels_opt);
free_aligned_buffer_page_end(dst_pixels_c);
}
TEST_F(LibYUVPlanarTest, SplitARGBPlane_Opt) {
// Round count up to multiple of 16
const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15;
align_buffer_page_end(src_pixels, kPixels * 4);
align_buffer_page_end(tmp_pixels_r, kPixels);
align_buffer_page_end(tmp_pixels_g, kPixels);
align_buffer_page_end(tmp_pixels_b, kPixels);
align_buffer_page_end(tmp_pixels_a, kPixels);
align_buffer_page_end(dst_pixels_opt, kPixels * 4);
align_buffer_page_end(dst_pixels_c, kPixels * 4);
MemRandomize(src_pixels, kPixels * 4);
MemRandomize(tmp_pixels_r, kPixels);
MemRandomize(tmp_pixels_g, kPixels);
MemRandomize(tmp_pixels_b, kPixels);
MemRandomize(tmp_pixels_a, kPixels);
MemRandomize(dst_pixels_opt, kPixels * 4);
MemRandomize(dst_pixels_c, kPixels * 4);
MaskCpuFlags(disable_cpu_flags_);
SplitARGBPlane(src_pixels, benchmark_width_ * 4, tmp_pixels_r,
benchmark_width_, tmp_pixels_g, benchmark_width_, tmp_pixels_b,
benchmark_width_, tmp_pixels_a, benchmark_width_,
benchmark_width_, benchmark_height_);
MergeARGBPlane(tmp_pixels_r, benchmark_width_, tmp_pixels_g, benchmark_width_,
tmp_pixels_b, benchmark_width_, tmp_pixels_a, benchmark_width_,
dst_pixels_c, benchmark_width_ * 4, benchmark_width_,
benchmark_height_);
MaskCpuFlags(benchmark_cpu_info_);
for (int i = 0; i < benchmark_iterations_; ++i) {
SplitARGBPlane(src_pixels, benchmark_width_ * 4, tmp_pixels_r,
benchmark_width_, tmp_pixels_g, benchmark_width_,
tmp_pixels_b, benchmark_width_, tmp_pixels_a,
benchmark_width_, benchmark_width_, benchmark_height_);
}
MergeARGBPlane(tmp_pixels_r, benchmark_width_, tmp_pixels_g, benchmark_width_,
tmp_pixels_b, benchmark_width_, tmp_pixels_a, benchmark_width_,
dst_pixels_opt, benchmark_width_ * 4, benchmark_width_,
benchmark_height_);
for (int i = 0; i < kPixels * 4; ++i) {
EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
}
free_aligned_buffer_page_end(src_pixels);
free_aligned_buffer_page_end(tmp_pixels_r);
free_aligned_buffer_page_end(tmp_pixels_g);
free_aligned_buffer_page_end(tmp_pixels_b);
free_aligned_buffer_page_end(tmp_pixels_a);
free_aligned_buffer_page_end(dst_pixels_opt);
free_aligned_buffer_page_end(dst_pixels_c);
}
TEST_F(LibYUVPlanarTest, MergeXRGBPlane_Opt) {
// Round count up to multiple of 16
const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15;
align_buffer_page_end(src_pixels, kPixels * 4);
align_buffer_page_end(tmp_pixels_r, kPixels);
align_buffer_page_end(tmp_pixels_g, kPixels);
align_buffer_page_end(tmp_pixels_b, kPixels);
align_buffer_page_end(dst_pixels_opt, kPixels * 4);
align_buffer_page_end(dst_pixels_c, kPixels * 4);
MemRandomize(src_pixels, kPixels * 4);
MemRandomize(tmp_pixels_r, kPixels);
MemRandomize(tmp_pixels_g, kPixels);
MemRandomize(tmp_pixels_b, kPixels);
MemRandomize(dst_pixels_opt, kPixels * 4);
MemRandomize(dst_pixels_c, kPixels * 4);
MaskCpuFlags(disable_cpu_flags_);
SplitARGBPlane(src_pixels, benchmark_width_ * 4, tmp_pixels_r,
benchmark_width_, tmp_pixels_g, benchmark_width_, tmp_pixels_b,
benchmark_width_, NULL, 0, benchmark_width_,
benchmark_height_);
MergeARGBPlane(tmp_pixels_r, benchmark_width_, tmp_pixels_g, benchmark_width_,
tmp_pixels_b, benchmark_width_, NULL, 0, dst_pixels_c,
benchmark_width_ * 4, benchmark_width_, benchmark_height_);
MaskCpuFlags(benchmark_cpu_info_);
SplitARGBPlane(src_pixels, benchmark_width_ * 4, tmp_pixels_r,
benchmark_width_, tmp_pixels_g, benchmark_width_, tmp_pixels_b,
benchmark_width_, NULL, 0, benchmark_width_,
benchmark_height_);
for (int i = 0; i < benchmark_iterations_; ++i) {
MergeARGBPlane(tmp_pixels_r, benchmark_width_, tmp_pixels_g,
benchmark_width_, tmp_pixels_b, benchmark_width_, NULL, 0,
dst_pixels_opt, benchmark_width_ * 4, benchmark_width_,
benchmark_height_);
}
for (int i = 0; i < kPixels * 4; ++i) {
EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
}
free_aligned_buffer_page_end(src_pixels);
free_aligned_buffer_page_end(tmp_pixels_r);
free_aligned_buffer_page_end(tmp_pixels_g);
free_aligned_buffer_page_end(tmp_pixels_b);
free_aligned_buffer_page_end(dst_pixels_opt);
free_aligned_buffer_page_end(dst_pixels_c);
}
TEST_F(LibYUVPlanarTest, SplitXRGBPlane_Opt) {
// Round count up to multiple of 16
const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15;
align_buffer_page_end(src_pixels, kPixels * 4);
align_buffer_page_end(tmp_pixels_r, kPixels);
align_buffer_page_end(tmp_pixels_g, kPixels);
align_buffer_page_end(tmp_pixels_b, kPixels);
align_buffer_page_end(dst_pixels_opt, kPixels * 4);
align_buffer_page_end(dst_pixels_c, kPixels * 4);
MemRandomize(src_pixels, kPixels * 4);
MemRandomize(tmp_pixels_r, kPixels);
MemRandomize(tmp_pixels_g, kPixels);
MemRandomize(tmp_pixels_b, kPixels);
MemRandomize(dst_pixels_opt, kPixels * 4);
MemRandomize(dst_pixels_c, kPixels * 4);
MaskCpuFlags(disable_cpu_flags_);
SplitARGBPlane(src_pixels, benchmark_width_ * 4, tmp_pixels_r,
benchmark_width_, tmp_pixels_g, benchmark_width_, tmp_pixels_b,
benchmark_width_, NULL, 0, benchmark_width_,
benchmark_height_);
MergeARGBPlane(tmp_pixels_r, benchmark_width_, tmp_pixels_g, benchmark_width_,
tmp_pixels_b, benchmark_width_, NULL, 0, dst_pixels_c,
benchmark_width_ * 4, benchmark_width_, benchmark_height_);
MaskCpuFlags(benchmark_cpu_info_);
for (int i = 0; i < benchmark_iterations_; ++i) {
SplitARGBPlane(src_pixels, benchmark_width_ * 4, tmp_pixels_r,
benchmark_width_, tmp_pixels_g, benchmark_width_,
tmp_pixels_b, benchmark_width_, NULL, 0, benchmark_width_,
benchmark_height_);
}
MergeARGBPlane(tmp_pixels_r, benchmark_width_, tmp_pixels_g, benchmark_width_,
tmp_pixels_b, benchmark_width_, NULL, 0, dst_pixels_opt,
benchmark_width_ * 4, benchmark_width_, benchmark_height_);
for (int i = 0; i < kPixels * 4; ++i) {
EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
}
free_aligned_buffer_page_end(src_pixels);
free_aligned_buffer_page_end(tmp_pixels_r);
free_aligned_buffer_page_end(tmp_pixels_g);
free_aligned_buffer_page_end(tmp_pixels_b);
free_aligned_buffer_page_end(dst_pixels_opt);
free_aligned_buffer_page_end(dst_pixels_c);
}
// TODO(fbarchard): improve test for platforms and cpu detect
#ifdef HAS_MERGEUVROW_16_AVX2
TEST_F(LibYUVPlanarTest, MergeUVRow_16_Opt) {